Exemplo n.º 1
0
    def GetTermIDFs(self, terms):
        """Look up IDF data for ``terms`` from the remote IDF service.

        Args:
            terms: Sequence of term strings. They are comma-joined and POSTed
                as the ``terms`` form field; ``#`` is escaped as ``%23``.

        Returns:
            The decoded JSON response from the service. When ``terms`` is
            ``None`` or empty, ``{"idfs": []}`` is returned without making any
            network request.
        """
        if not terms:
            # Nothing to look up. Return the empty payload directly;
            # json.loads() only accepts str/bytes, not a dict.
            return {"idfs": []}

        url = 'http://50.56.221.228/cgi-bin/idf.php?'
        # TODO: HTML entity encoding (?)
        # TODO: Enhanced encoding detection - first term's encoding may not be always appropriate.
        data = ('terms=' + ','.join(terms).replace("#", "%23")).encode("utf-8")
        debuglog.msg(data)

        # Close the HTTP response deterministically once the body is read.
        with urllib.request.urlopen(url, data) as response:
            txt_unicode = UnicodeDammit(response.read())
        txt = txt_unicode.unicode_markup
        # Workaround: the service emits an unquoted ``null`` key, which is
        # not valid JSON.
        txt = txt.replace(",null:", ',"null":')
        # ``txt`` is already decoded text, so no encoding hint is needed (the
        # ``encoding`` keyword of json.loads was removed in Python 3.9).
        return json.loads(txt)
Exemplo n.º 2
0
    def getUserTweetsData(self, user):
        """Fetch tweets authored by ``user`` from the Twitter Search API.

        This is a best-effort call: any failure while building the request,
        fetching, or decoding is logged and an empty result set is returned
        instead of raising.

        Args:
            user: Twitter screen name (without the leading ``@``).

        Returns:
            The decoded JSON response (expected to carry a ``'results'``
            list), or ``{'results': []}`` on failure.
        """
        debuglog.msg("=====\n\nGetting data for @%s from Search API..." % user)
        # Bound before the try block so the failure log can always reference it.
        query_url = None
        try:
            twitter_query = urllib.parse.quote("from:%s" % user)
            query_url = "http://search.twitter.com/search.json?lang=en&rpp=100&q=%s" % twitter_query

            # Close the HTTP response deterministically once the body is read.
            with urllib.request.urlopen(query_url) as response:
                response_unicode = UnicodeDammit(response.read())
            # ``unicode_markup`` is already decoded text, so json.loads needs
            # no encoding hint (that keyword was removed in Python 3.9).
            data = json.loads(response_unicode.unicode_markup)
            debuglog.msg("\tGot %s tweets for @%s from Search API." %
                         (str(len(data['results'])), user))
            return data
        except Exception:
            # Deliberate best-effort: swallow request/decoding errors, but let
            # KeyboardInterrupt and SystemExit propagate.
            debuglog.msg("\tFailed to get data from Search API :(")
            debuglog.msg("\t\tURL:\t%s" % query_url)
            return {'results': []}
Exemplo n.º 3
0
    def fetchUserTimeline(self,
                          user,
                          format="default",
                          use_cache=True,
                          write_cache=True,
                          use_filesystem_cache=False):
        """Fetch a user's timeline, consulting and populating caches as asked.

        Args:
            user: Twitter screen name to fetch.
            format: ``"searchapi"`` treats the tweets as non-celebrity tweets
                and returns them; any other value (``"default"``) adds them
                as celebrity timeline tweets.
            use_cache: Whether to look for cached data before calling the API.
            write_cache: Whether to store freshly fetched data.
            use_filesystem_cache: Use JSON files under ``./timelines`` as the
                cache instead of the ``tweets_non_celeb`` table.

        Returns:
            One of:
            * a list holding the first column of each cached
              ``tweets_non_celeb`` row, when the database cache has rows for
              ``user``;
            * ``{'status': 'wait'}`` when ``self.checkRateLimit()`` is positive;
            * ``{'status': '404'}``, ``{'status': 'retry'}`` (HTTP 502) or
              ``{'status': 'error'}`` when the timeline request fails with an
              HTTP error;
            * ``{'results': data}`` when ``format == "searchapi"``;
            * ``{'status': 'success'}`` otherwise.
        """
        # TODO: Clean this function up, format parameter does magic it shouldn't be doing.
        # Currently, format="default" means that we're adding celebrity timeline tweets, we never call this.
        # If we do call with format="default" we want to add the timeline tweets to the celebrity tweets table.
        # This is called from DataGrabber to get user timelines with format="searchapi". In this case we want to check
        # if we have matching non-celebrity tweets, and if so return them (in future: possibly add new tweets from
        # search api as well). If not, get tweets from the timeline api, store them in the tweets_non_celeb table,
        # and return an object with those tweets.
        # Also, if a user is cached and called with default, we will just get back the cached data and not insert anything.

        debuglog.msg("Fetching timeline for @%s..." % user)
        got_cache_data = False
        json_txt = "{}"

        if use_cache and not use_filesystem_cache:
            # Database cache: a hit short-circuits everything else.
            q = "SELECT * FROM tweets_non_celeb WHERE from_user=%(user)s;"
            vals = {'user': user}
            cached_tweets = self.sql.q(q, vals)
            if len(cached_tweets) > 0:
                return [tweet[0] for tweet in cached_tweets]
        elif use_cache and use_filesystem_cache:
            debuglog.msg("\tchecking cache...")
            cached_list = os.listdir('./timelines')
            userjsonfilename = user.lower() + '.json'
            if userjsonfilename in cached_list:
                debuglog.msg("\t\tgot cache data.")
                # Context manager closes the handle; explicit UTF-8 matches
                # the encoding used when the cache file is written below.
                with open('./timelines/' + userjsonfilename, 'r',
                          encoding='utf-8') as cache_file:
                    json_txt = cache_file.read()
                got_cache_data = True

        if not got_cache_data:
            debuglog.msg("\tNo cache data, calling timeline api...")
            if self.checkRateLimit() > 0:
                debuglog.msg("\t\tHave to wait.")
                return {'status': 'wait'}
            url = "https://api.twitter.com/1/statuses/user_timeline.json?&screen_name=%s&count=150" % user
            debuglog.msg(url)
            try:
                # Close the HTTP response deterministically once it is read.
                with urllib.request.urlopen(url) as response:
                    debuglog.msg(response.info())
                    raw_body = response.read()
            except urllib.error.HTTPError as e:
                # Compare the numeric status code rather than searching the
                # error's string form, which also contains the reason text.
                if e.code == 404:
                    return {'status': '404'}
                elif e.code == 502:
                    return {'status': 'retry'}
                else:
                    return {'status': 'error'}

            json_txt = UnicodeDammit(raw_body).unicode_markup

            if write_cache and use_filesystem_cache:
                fname = './timelines/' + user.lower() + '.json'
                with open(fname, 'wt', encoding='utf-8') as f:
                    # NOTE(review): 0o777 makes the cache file world-writable;
                    # kept for compatibility, confirm this is still required.
                    os.chmod(fname, 0o777)
                    f.write(json_txt)

        # ``json_txt`` is already decoded text, so no encoding hint is needed
        # (the ``encoding`` keyword of json.loads was removed in Python 3.9).
        data = json.loads(json_txt)
        debuglog.msg("\tdata is...", str(data)[:100])

        if format == "searchapi":
            # For now, format="searchapi" indicates we are getting non-celebrity tweets.
            debuglog.msg("\tGot %d results for %s from user timeline API." %
                         (len(data), user))

            if write_cache and not use_filesystem_cache:
                for non_celeb_timeline_tweet in data:
                    self.tweet_adder.addNonCelebTimelineTweet(
                        non_celeb_timeline_tweet)

            return {'results': data}

        # For now, format="default" (only way to reach here) means we are adding celebrity tweets.
        for timeline_tweet in data:
            self.tweet_adder.addTimelineTweet(timeline_tweet)

        return {'status': 'success'}
Exemplo n.º 4
0
 def toUtf(self, html):
     """Return ``html`` converted to text by ``UnicodeDammit``.

     The value returned is the ``unicode_markup`` attribute of the
     ``UnicodeDammit`` object built from ``html``.
     """
     converted = UnicodeDammit(html)
     return converted.unicode_markup
Exemplo n.º 5
0
 def detectCharset(self, html):
     """Return the character encoding ``UnicodeDammit`` detected for ``html``.

     The value is ``UnicodeDammit(html).original_encoding``, which may be
     ``None`` when no encoding was detected (other code in this file guards
     against a falsy value).
     """
     # The attribute is ``original_encoding``; the previous spelling
     # ``original_enconding`` raised AttributeError on every call.
     return UnicodeDammit(html).original_encoding