Example #1
def start_stream(self, track=None, follow=False, credentials=False):
    """Start the Twitter filter stream and cache the stream task's id."""
    logger.info('***Starting twitter filter stream***')
    # TODO: one of track or follow is required; return an error if neither is present
    # start the stream
    stream_task = stream_filter.delay(credentials=credentials, track=track)
    cache.set("stream_id_" + self.request.id, stream_task.id.encode('utf-8'))
Example #2
def pushTwitterConnections(self, twits, user, friends=True, cacheKey=False):
    """Push the Twitter connections of a given user to Neo4J.
    
    Positional arguments:
    twits -- a list of Twitter users as returned by Twython
    user -- The screen_name of the user

    Keyword arguments:
    friends -- "twits" are the user's friends if True, (default) else they're followers 
    cacheKey -- a Redis key that identifies an on-going task to grab a user's friends or followers
    
    """

    if friends:
        job = 'FRIENDS'
    else:
        job = 'FOLLOWERS'
    
    if twits:
        rendered_twits = [renderTwitterUser(twit) for twit in twits]
        pushRenderedConnections2Neo.delay(user, rendered_twits, friends=friends)

    if cacheKey:  # These are the last connections, tell the scraper we're done.
        cache.set(cacheKey, 'done')
        logger.info('*** %s: DONE WITH %s ***' % (user, job))
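A hedged usage sketch, assuming a Twython-style client whose friends pages carry a 'users' list; the screen name is illustrative:

page = twitter.get_friends_list(screen_name='example_user', count=200)
pushTwitterConnections.delay(page['users'], 'example_user', friends=True)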
Example #3
    def method_call(self, method_name, *args, **kwargs):
        """Make a Twitter API call via the underlying Twython object.
    
        Returns a tuple: (True, <API call return value>) or (False, <reason for failure>)

        Positional arguments:
        method_name -- the name of the Twython method to call

        """

        # Does Twython even know how to do that?
        try:
            method = getattr(self.twitter, method_name)
        except AttributeError:
            logging.error('*** NO SUCH TWITTER METHOD: ' + method_name +
                          ' ***')
            return (False, 'no_such_method')

        # Call the method of the Twython object.
        try:
            result = (True, method(*args, **kwargs))
        except TwythonAuthError:
            logging.error('*** TWITTER METHOD 401: ' + method_name + ' ***')
            result = (False, 'forbidden')
        except TwythonRateLimitError:
            logging.error('*** TWITTER METHOD LIMITED: ' + method_name +
                          ' ***')
            result = (False, 'limited')
        except TwythonError as e:
            if str(e.error_code) == '404':
                logging.error('*** TWITTER METHOD 404: ' + method_name +
                              ' ***')
                result = (False, '404')
            else:
                logging.error('*** TWITTER METHOD FAILED: ' + method_name +
                              ' ***')
                result = (False, 'unknown')
            logging.error(args)

        # Have we been told how many calls remain in the current window?
        try:
            xLimit = self.twitter.get_lastfunction_header(
                'x-rate-limit-remaining')
            xReset = self.twitter.get_lastfunction_header('x-rate-limit-reset')
        except Exception:
            xLimit = xReset = None

        if xLimit and xReset:
            # Store the remaining number of calls and the time when the window resets.
            cache.set(self.handle + method_name,
                      json.dumps({
                          'limit': int(xLimit),
                          'reset': datetime.utcfromtimestamp(int(xReset)).isoformat()
                      }))

        return result
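A short usage sketch: 'show_user' is a real Twython method, but the wrapper instance and screen name are assumptions:

okay, result = api.method_call('show_user', screen_name='example_user')
if okay:
    print(result['followers_count'])
else:
    print('call failed:', result)  # 'no_such_method', 'forbidden', 'limited', '404' or 'unknown'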
Example #4
def doUserScrape(self, credentials=False):
    """Retrieve the next timelines, friends and followers for the next accounts in the user scrape. """
    keep_going = cache.get('user_scrape_' + self.request.root_id)
    if (not keep_going) or keep_going.decode('utf-8') != 'true':
        logger.info('*** STOPPED USER SCRAPE ***')
        # mark crawl as stopped on crawl node
        db = get_neo_driver()
        update_crawl(db, crawl_task=self.request.root_id, status='done')
        db.close()
        return False

    user = cache.get('scrape_user_' + self.request.root_id).decode('utf-8')
    logger.info('*** SCRAPING USER: %s... ***' % (user,))

    this_friend = cache.get('scrape_friends_' + self.request.root_id)
    if (not this_friend) or this_friend.decode('utf-8') == 'done':
        db = get_neo_driver()
        next_friends = nextNearest(db, user, 'friends', self.request.root_id)
        db.close()
        if next_friends:
            cache.set('scrape_friends_' + self.request.root_id, 'running')
            getTwitterConnections.delay(next_friends, cacheKey='scrape_friends_' + self.request.root_id)
    else:
        logger.info('*** FRIENDS BUSY ***')

    this_follower = cache.get('scrape_followers_' + self.request.root_id)
    if (not this_follower) or this_follower.decode('utf-8') == 'done':
        db = get_neo_driver()
        next_followers = nextNearest(db, user, 'followers', self.request.root_id)
        db.close()
        if next_followers:
            cache.set('scrape_followers_' + self.request.root_id, 'running')
            getTwitterConnections.delay(next_followers, friends=False, cacheKey='scrape_followers_' + self.request.root_id)
    else:
        logger.info('*** FOLLOWERS BUSY ***')

    this_tweet = cache.get('scrape_tweets_' + self.request.root_id)
    if (not this_tweet) or this_tweet.decode('utf-8') == 'done':
        db = get_neo_driver()
        next_tweets = nextNearest(db, user, 'tweets', self.request.root_id)
        db.close()
        if next_tweets:
            cache.set('scrape_tweets_' + self.request.root_id, 'running')
            getTweets.delay(next_tweets, maxTweets=1000, credentials=credentials, cacheKey='scrape_tweets_' + self.request.root_id)
    else:
        logger.info('*** TWEETS BUSY ***')

    if 'running' in [cache.get(k).decode('utf-8') for k in
                     ['scrape_friends_' + self.request.root_id, 'scrape_followers_' + self.request.root_id,
                      'scrape_tweets_' + self.request.root_id]]:
        doUserScrape.apply_async(kwargs={'credentials': credentials}, countdown=30)
    else:
        cache.set('user_scrape_' + self.request.root_id, 'false')
        cache.set('scrape_mode_' + self.request.root_id, '')
        logger.info('*** FINISHED SCRAPING USER: %s ***' % (user,))
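The tasks coordinate through a handful of Redis keys, all namespaced by the Celery root task id; a summary of the convention as inferred from the code above, not a documented API:

# user_scrape_<root_id>      -> 'true' | 'false'          master on/off switch
# scrape_mode_<root_id>      -> 'user' | 'default' | ''   which scrape is running
# scrape_user_<root_id>      -> screen_name at the centre of the scrape
# scrape_friends_<root_id>   -> ''        idle, nothing queued
#                               'running' while getTwitterConnections works
#                               'done'    once the last page is pushed
# scrape_followers_<root_id> -> same life cycle as scrape_friends
# scrape_tweets_<root_id>    -> same life cycle, driven by getTweets/pushTweets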
Example #5
def getTweets(self, user, maxTweets=3000, count=0, tweetId=0, cacheKey=False, credentials=False):
    """Get tweets from the timeline of the given user, push them to Neo4J.

    Positional arguments:
    user -- The screen_name of the user

    Keyword arguments:
    maxTweets -- The maximum number of tweets to retrieve
    cacheKey -- a Redis key that identifies an on-going task to grab a user's timeline
    count -- The number of tweets already retrieved, set when the task calls itself
    tweetId -- The maximum tweet ID to retrieve, set when the task calls itself

    """
    logger.info('Executing getTweets task id {0.id}, args: {0.args!r} kwargs: {0.kwargs!r}'.format(self.request))
    logger.info('task parent id {0.parent_id}, root id {0.root_id}'.format(self.request))
    api = RatedTwitter(credentials=credentials)
    limit = api.get_user_timeline_wait()
    if limit:
        logger.info('*** TWITTER RATE-LIMITED: statuses.user_timeline: %s:%d ***' % (user, count))
        raise getTweets.retry(countdown=limit)
    else:
        args = {'screen_name': user, 'exclude_replies': False, 'include_rts': True, 'trim_user': False, 'count': 200}
        if tweetId:
            args['max_id'] = tweetId

        okay, result = api.get_user_timeline(**args)

        if okay:
            logger.info('*** TWITTER USER_TIMELINE: %s:%s ***' % (user, str(tweetId)))
            if result:
                newCount = count + len(result)
                if maxTweets and newCount > maxTweets:  # No need for the task to call itself again.
                    pushTweets.delay(result, user, cacheKey=cacheKey)  # Give pushTweets the cache-key to end the job.
                    return
                pushTweets.delay(result, user)

                newTweetId = min([t['id'] for t in result]) - 1
                # Not done yet, the task calls itself with an updated count and tweetId.
                getTweets.delay(user, maxTweets=maxTweets, count=newCount, tweetId=newTweetId, cacheKey=cacheKey, credentials=credentials)
            else:
                pushTweets.delay([], user, cacheKey=cacheKey)  # Nothing more found, so tell pushTweets the job is done.
        else:
            if result == '404':
                db = get_neo_driver()
                setUserDefunct(db, user)
                db.close()
            cache.set('scrape_tweets_' + self.request.root_id, 'done')
            if result == 'limited':
                raise getTweets.retry(countdown=api.get_user_timeline_wait())
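The timeline walk above relies on Twitter's max_id pagination; a worked sketch of one step, with illustrative tweet ids:

page = [{'id': 1005}, {'id': 999}, {'id': 990}]  # one batch from user_timeline
newTweetId = min(t['id'] for t in page) - 1      # 989: step just below the oldest tweet seen
# next call: get_user_timeline(screen_name=user, max_id=989, count=200)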
Example #6
def getTwitterConnections(self, user, friends=True, cursor=-1, credentials=False, cacheKey=False):
    """Get the connections of the given user, push them to Neo4J.

    Positional arguments:
    user -- The screen_name of the user

    Keyword arguments:
    friends -- "twits" are the user's friends if True, (default) else they're followers 
    cacheKey -- a Redis key that identifies an on-going task to grab a user's friends or followers
    cursor -- Id of the next block of connections to retrieve, set when the task calls itself
    """
    api = RatedTwitter(credentials=credentials)
    if friends:
        method = api.get_friends_list
        limit = api.get_friends_list_wait()
        method_name = 'get_friends_list'
    else:
        method = api.get_followers_list
        limit = api.get_followers_list_wait()
        method_name = 'get_followers_list'
    
    if limit:
        logger.info('*** TWITTER RATE-LIMITED: %s:%s ***' % (method_name, str(cursor)))
        raise getTwitterConnections.retry(countdown=limit)
    else:
        okay, result = method(screen_name=user, cursor=cursor, count=200)  # We can get a maximum of 200 connections at once.
        if okay:
            logger.info('*** TWITTER CURSOR: %s:%s:%s ***' % (method_name, user, str(cursor)))
            twits = result['users']
            next_cursor = result.get('next_cursor', False)
            if next_cursor: # Unless the next cursor is 0, we're not done yet.
                getTwitterConnections.delay(user, friends=friends, cursor=next_cursor, cacheKey=cacheKey, credentials=credentials)
                pushTwitterConnections.delay(twits, user, friends=friends)
            else:
                pushTwitterConnections.delay(twits, user, friends=friends, cacheKey=cacheKey) # All done, send the cacheKey.
                    
        else:
            if result == 'limited':
                raise getTwitterConnections.retry(exc=Exception('Twitter rate-limited', method_name), countdown=API_TIMEOUT)
            if result == '404':
                db = get_neo_driver()
                setUserDefunct(db, user)
                db.close()
                if friends:
                    cache.set('scrape_friends_' + self.request.root_id, 'done')
                else:
                    cache.set('scrape_followers_' + self.request.root_id, 'done')
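Connection paging follows Twitter's cursor convention; a sketch of the contract the task relies on, with field names as Twython returns them and values illustrative:

okay, result = api.get_followers_list(screen_name='example_user', cursor=-1, count=200)
# result['users']       -> up to 200 user objects for this page
# result['next_cursor'] -> pass back as cursor= on the next call;
#                          0 (falsy) on the last page, which is what lets
#                          the task above stop re-queueing itself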
Example #7
def startUserScrape(self, user, credentials=False):
    """Start scraping around the given user."""
    logger.info('*** STARTED SCRAPING: USER: %s ***' % (user,))
    cache.set('user_scrape_' + self.request.root_id, 'true')
    cache.set('scrape_mode_' + self.request.root_id, 'user')
    cache.set('scrape_user_' + self.request.root_id, user)

    # add crawl node for this user as centre of scrape
    db = get_neo_driver()
    start_user_crawl(db, user, crawl_task=self.request.root_id, status='initiated')
    db.close()

    for key in ['scrape_friends', 'scrape_followers', 'scrape_tweets']:
        cache.set(key + '_' + self.request.root_id, '')
        
    for job in ['friends', 'followers', 'tweets']:
        cache_key = '_'.join(['nextnearest', job, user, self.request.root_id])
        cache.set(cache_key, '')
        
    doUserScrape.delay(credentials=credentials)
Example #8
def doDefaultScrape(self, latest=False, credentials=False):
    """Retrieve the tweets, friends or followers of the next users in the default scrape."""
    keep_going = cache.get('default_scrape_' + self.request.root_id)
    if (not keep_going) or keep_going.decode('utf-8') != 'true':
        logger.info('*** STOPPED DEFAULT SCRAPE ***') 
        return False
    
    logger.info('*** SCRAPING... ***')

    this_friend = cache.get('scrape_friends_' + self.request.root_id)
    if (not this_friend) or this_friend.decode('utf-8') == 'done':
        cache.set('scrape_friends_' + self.request.root_id, 'running')
        getTwitterConnections.delay(whoNext('friends', latest=latest), credentials=credentials, cacheKey='scrape_friends_' + self.request.root_id)
    else:
        logger.info('*** FRIENDS BUSY ***')

    this_follower = cache.get('scrape_followers_' + self.request.root_id)
    if (not this_follower) or this_follower.decode('utf-8') == 'done':
        cache.set('scrape_followers_' + self.request.root_id, 'running')
        getTwitterConnections.delay(whoNext('followers', latest=latest), credentials=credentials, friends=False, cacheKey='scrape_followers_' + self.request.root_id)
    else:
        logger.info('*** FOLLOWERS BUSY ***')

    this_tweet = cache.get('scrape_tweets_' + self.request.root_id)
    if (not this_tweet) or this_tweet.decode('utf-8') == 'done':
        cache.set('scrape_tweets_' + self.request.root_id, 'running')
        getTweets.delay(whoNext('tweets', latest=latest), maxTweets=1000, credentials=credentials, cacheKey='scrape_tweets_' + self.request.root_id)
    else:
        logger.info('*** TWEETS BUSY ***')
                    
    doDefaultScrape.apply_async(kwargs={'latest': latest, 'credentials': credentials}, countdown=30)
Example #9
def pushTweets(self, tweets, user, cacheKey=False):
    """ Dump a set of tweets from a given user's timeline to Neo4J/Solr.

    Positional arguments:
    tweets -- a list of tweets as returned by Twython.
    user -- screen_name of the user
    
    Keyword arguments:
    cacheKey -- a Redis key that identifies an on-going task to grab a user's timeline
    
    """
    logger.info('Executing pushTweets task id {0.id}, task parent id {0.parent_id}, root id {0.root_id}'.format(self.request))

    tweetDump = decomposeTweets(tweets)  # Extract mentions, URLs, replies hashtags etc...

    pushRenderedTweets2Neo.delay(user, tweetDump)
        
    for label in ['tweet', 'retweet', 'quotetweet']:
        pushRenderedTweets2Solr.delay([t[0] for t in tweetDump[label]])

    if cacheKey: # These are the last Tweets, tell the scraper we're done.
        cache.set(cacheKey, 'done')
        logger.info('*** %s: DONE WITH TWEETS ***' % user) 
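The shape of tweetDump is inferred from the loop above rather than from decomposeTweets itself; a hedged sketch:

# tweetDump = {
#     'tweet':      [(rendered_tweet, ...), ...],
#     'retweet':    [(rendered_retweet, ...), ...],
#     'quotetweet': [(rendered_quotetweet, ...), ...],
#     ...  # plus whatever pushRenderedTweets2Neo consumes (mentions, URLs, hashtags)
# }
# Only element [0] of each tuple is sent to Solr.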
Example #10
def startScrape(self, latest=False, credentials=False):
    """Start the default scrape, retrieving the users that need timelines, friends or followers updated,
    in the order that they were first added. """
    logger.info('*** STARTED SCRAPING: DEFAULT: ***') 
    cache.set('default_scrape_' + self.request.root_id, 'true')
    cache.set('scrape_mode_' + self.request.root_id, 'default')
    
    for key in ['scrape_friends', 'scrape_followers', 'scrape_tweets']:
        cache.set(key + '_' + self.request.root_id, '')
    
    doDefaultScrape.delay(latest=latest, credentials=credentials)
Example #11
def nextNearest(db,
                user,
                job,
                root_task,
                max_friends=2000,
                max_followers=2000,
                limit=20,
                max_tweets=2000,
                test=False):
    """Find the next user to retrieve friends, followers or tweets, closest to a given user."""
    cacheKey = '_'.join(['nextnearest', job, user, root_task])
    nextUserDump = cache.get(cacheKey)
    next_users = False
    if nextUserDump:
        try:
            next_users = json.loads(nextUserDump.decode('utf-8'))
        except ValueError:
            next_users = []
    if next_users:
        logging.info('*** NEXT ' + job + ': ' + ', '.join(next_users) +
                     ' from ' + user + ' ***')
        next_user = next_users.pop(0)
        cache.set(cacheKey, json.dumps(next_users))
        return next_user

    query_str = "MATCH (a:twitter_user {{screen_name: '{}'}})-[:FOLLOWS]-(d:twitter_user)".format(
        user)
    query_str += ' MATCH (b:twitter_user)-[:FOLLOWS]-(d) WITH DISTINCT b '
    if job == 'friends':
        query_str += 'MATCH (b)-[:FOLLOWS]->(c:twitter_user) '
    if job == 'followers':
        query_str += 'MATCH (b)<-[:FOLLOWS]-(c:twitter_user) '
    if job == 'tweets':
        query_str += 'MATCH (b)-[:TWEETED]->(c:tweet) '
    query_str += 'WITH b, COUNT(c) AS n '
    query_str += 'WHERE b.friends_count < {} AND b.followers_count < {} ' \
                 'AND NOT EXISTS (b.protected) AND NOT EXISTS (b.defunct) '.format(max_friends, max_followers)
    if job == 'friends':
        query_str += 'AND n < b.friends_count/2 '
    if job == 'followers':
        query_str += 'AND n < b.followers_count/2 '
    if job == 'tweets':
        query_str += 'AND b.statuses_count > 0 AND n < b.statuses_count/2 AND n<{} '.format(
            max_tweets)
    query_str += 'RETURN b.screen_name ORDER BY b.{}_last_scraped LIMIT {}'.format(
        job, limit)

    logging.info('*** Looking for ' + job + ' for ' + user + ' ***')

    if test:
        return query_str

    try:
        with db.session() as session:
            with session.begin_transaction() as tx:
                result = tx.run(query_str)
                next_users = [record.values()[0] for record in result]
    except Exception:
        next_users = []

    if next_users:
        logging.info('*** NEXT ' + job + ': ' + ', '.join(next_users) +
                     ' from ' + user + ' ***')
        next_user = next_users.pop(0)
        cache.set(cacheKey, json.dumps(next_users))
        return next_user
    else:
        logging.info('No more ' + job + ' for ' + user)

    return False
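Because the Cypher above is assembled with str.format, the screen name lands inside the query text; a minimal sketch of the same first hop with a bound parameter instead, using the standard neo4j Python driver and a deliberately truncated query:

query_str = (
    "MATCH (a:twitter_user {screen_name: $screen_name})-[:FOLLOWS]-(d:twitter_user) "
    "MATCH (b:twitter_user)-[:FOLLOWS]-(d) WITH DISTINCT b "
    "RETURN b.screen_name LIMIT $limit"
)
with db.session() as session:
    result = session.run(query_str, screen_name=user, limit=limit)
    next_users = [record[0] for record in result]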
Example #12
def stop_scrape(self, task_id):
    """Stop an executing scrape on the next loop."""
    scrape_mode = cache.get('scrape_mode_' + task_id)
    if scrape_mode:
        scrape_mode = scrape_mode.decode('utf-8')
        cache.set(scrape_mode + '_scrape_' + task_id, 'false')
Example #13
def connected(self, val):
    """Record the stream's connected state in the cache."""
    key = 'stream_' + self.stream_id + '_connected'
    cache.set(key, val)