def start_stream(self, track=None, follow=False, credentials=False):
    """Start the Twitter filter stream as a background task."""
    logger.info('***Starting twitter filter stream***')
    # TODO one of filter_terms or follow is required, return error if neither present

    # Start the stream and remember which task is running it.
    stream_task = stream_filter.delay(credentials=credentials, track=track)
    cache.set("stream_id_" + self.request.id, stream_task.id.encode('utf-8'))

def pushTwitterConnections(self, twits, user, friends=True, cacheKey=False):
    """Push the Twitter connections of a given user to Neo4J.

    Positional arguments:
    twits -- a list of Twitter users as returned by Twython
    user -- the screen_name of the user

    Keyword arguments:
    friends -- "twits" are the user's friends if True (default), else they're followers
    cacheKey -- a Redis key that identifies an on-going task to grab a user's friends or followers

    """
    if friends:
        job = 'FRIENDS'
    else:
        job = 'FOLLOWERS'

    if twits:
        rendered_twits = [renderTwitterUser(twit) for twit in twits]
        pushRenderedConnections2Neo.delay(user, rendered_twits, friends=friends)

    if cacheKey:  # These are the last connections; tell the scraper we're done.
        cache.set(cacheKey, 'done')
        logger.info('*** %s: DONE WITH %s ***' % (user, job))

def method_call(self, method_name, *args, **kwargs):
    """Make a Twitter API call via the underlying Twython object.

    Returns a tuple: (True, <API call return value>) | (False, <reason for failure>)

    Positional arguments:
    method_name -- the name of the Twython method to call

    """
    # Does Twython even know how to do that?
    try:
        method = getattr(self.twitter, method_name)
    except AttributeError:
        logging.error('*** NO SUCH TWITTER METHOD: ' + method_name + ' ***')
        return (False, 'no_such_method')

    # Call the method of the Twython object.
    try:
        result = (True, method(*args, **kwargs))
    except TwythonAuthError:
        logging.error('*** TWITTER METHOD 401: ' + method_name + ' ***')
        result = (False, 'forbidden')
    except TwythonRateLimitError:
        logging.error('*** TWITTER METHOD LIMITED: ' + method_name + ' ***')
        result = (False, 'limited')
    except TwythonError as e:
        if str(e.error_code) == '404':
            logging.error('*** TWITTER METHOD 404: ' + method_name + ' ***')
            result = (False, '404')
        else:
            logging.error('*** TWITTER METHOD FAILED: ' + method_name + ' ***')
            result = (False, 'unknown')
            logging.error(args)

    # Have we been told how many calls remain in the current window?
    try:
        xLimit = self.twitter.get_lastfunction_header('x-rate-limit-remaining')
        xReset = self.twitter.get_lastfunction_header('x-rate-limit-reset')
    except Exception:
        xLimit = xReset = False

    if xLimit:
        limit = int(xLimit)
    if xReset:
        reset = datetime.utcfromtimestamp(int(xReset)).isoformat()
    if xLimit and xReset:
        # Store the current number of remaining calls and the time when the window resets.
        cache.set(self.handle + method_name, json.dumps({'limit': limit, 'reset': reset}))

    return result

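# Illustrative sketch (not part of the pipeline): how a caller might drive
# method_call and branch on its (ok, payload) tuple. It assumes method_call is
# exposed on the RatedTwitter wrapper used by the tasks below, and that
# `credentials` is the same dict of Twitter keys those tasks receive; the
# screen name is hypothetical. `show_user` is a standard Twython method.
def example_method_call(credentials):
    api = RatedTwitter(credentials=credentials)
    ok, payload = api.method_call('show_user', screen_name='some_account')
    if ok:
        return payload  # the raw Twython return value, here a user dict
    # On failure, payload is one of: 'no_such_method', 'forbidden', 'limited', '404', 'unknown'.
    if payload == 'limited':
        logging.warning('rate-limited; the remaining-call count and reset time are cached')
    return False
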
def doUserScrape(self, credentials=False):
    """Retrieve the next timelines, friends and followers for the next accounts in the user scrape."""
    keep_going = cache.get('user_scrape_' + self.request.root_id)
    if (not keep_going) or keep_going.decode('utf-8') != 'true':
        logger.info('*** STOPPED USER SCRAPE ***')
        # Mark the crawl as stopped on the crawl node.
        db = get_neo_driver()
        update_crawl(db, crawl_task=self.request.root_id, status='done')
        db.close()
        return False

    user = cache.get('scrape_user_' + self.request.root_id).decode('utf-8')
    logger.info('*** SCRAPING USER: %s... ***' % (user,))

    this_friend = cache.get('scrape_friends_' + self.request.root_id).decode('utf-8')
    if (not this_friend) or this_friend == 'done':
        db = get_neo_driver()
        next_friends = nextNearest(db, user, 'friends', self.request.root_id)
        db.close()
        if next_friends:
            cache.set('scrape_friends_' + self.request.root_id, 'running')
            getTwitterConnections.delay(next_friends, cacheKey='scrape_friends_' + self.request.root_id)
    else:
        logger.info('*** FRIENDS BUSY ***')

    this_follower = cache.get('scrape_followers_' + self.request.root_id).decode('utf-8')
    if (not this_follower) or this_follower == 'done':
        db = get_neo_driver()
        next_followers = nextNearest(db, user, 'followers', self.request.root_id)
        db.close()
        if next_followers:
            cache.set('scrape_followers_' + self.request.root_id, 'running')
            getTwitterConnections.delay(next_followers, friends=False,
                                        cacheKey='scrape_followers_' + self.request.root_id)
    else:
        logger.info('*** FOLLOWERS BUSY ***')

    this_tweet = cache.get('scrape_tweets_' + self.request.root_id).decode('utf-8')
    if (not this_tweet) or this_tweet == 'done':
        db = get_neo_driver()
        next_tweets = nextNearest(db, user, 'tweets', self.request.root_id)
        db.close()
        if next_tweets:
            cache.set('scrape_tweets_' + self.request.root_id, 'running')
            getTweets.delay(next_tweets, maxTweets=1000, credentials=credentials,
                            cacheKey='scrape_tweets_' + self.request.root_id)
    else:
        logger.info('*** TWEETS BUSY ***')

    if 'running' in [cache.get(k).decode('utf-8') for k in
                     ['scrape_friends_' + self.request.root_id,
                      'scrape_followers_' + self.request.root_id,
                      'scrape_tweets_' + self.request.root_id]]:
        # Check again in 30 seconds; credentials must travel as a task kwarg, not a Celery option.
        doUserScrape.apply_async(kwargs={'credentials': credentials}, countdown=30)
    else:
        cache.set('user_scrape_' + self.request.root_id, 'false')
        cache.set('scrape_mode_' + self.request.root_id, '')
        logger.info('*** FINISHED SCRAPING USER: %s ***' % (user,))

def getTweets(self, user, maxTweets=3000, count=0, tweetId=0, cacheKey=False, credentials=False):
    """Get tweets from the timeline of the given user, push them to Neo4J.

    Positional arguments:
    user -- the screen_name of the user

    Keyword arguments:
    maxTweets -- the maximum number of tweets to retrieve
    cacheKey -- a Redis key that identifies an on-going task to grab a user's timeline
    count -- the number of tweets already retrieved, set when the task calls itself
    tweetId -- the maximum tweet ID to retrieve, set when the task calls itself

    """
    logger.info('Executing getTweets task id {0.id}, args: {0.args!r} kwargs: {0.kwargs!r}'.format(self.request))
    logger.info('task parent id {0.parent_id}, root id {0.root_id}'.format(self.request))

    api = RatedTwitter(credentials=credentials)
    limit = api.get_user_timeline_wait()
    if limit:
        logger.info('*** TWITTER RATE-LIMITED: statuses.user_timeline: %s:%s ***' % (user, str(count)))
        raise getTweets.retry(countdown=limit)
    else:
        args = {'screen_name': user, 'exclude_replies': False, 'include_rts': True, 'trim_user': False,
                'count': 200}
        if tweetId:
            args['max_id'] = tweetId

        okay, result = api.get_user_timeline(**args)

        if okay:
            logger.info('*** TWITTER USER_TIMELINE: %s:%s ***' % (user, str(tweetId)))
            if result:
                newCount = count + len(result)
                if maxTweets and newCount > maxTweets:  # No need for the task to call itself again.
                    pushTweets.delay(result, user, cacheKey=cacheKey)  # Give pushTweets the cache-key to end the job.
                    return
                else:
                    pushTweets.delay(result, user)
                    newTweetId = min([t['id'] for t in result]) - 1
                    # Not done yet, the task calls itself with an updated count and tweetId.
                    getTweets.delay(user, maxTweets=maxTweets, count=newCount, tweetId=newTweetId,
                                    cacheKey=cacheKey, credentials=credentials)
            else:
                pushTweets.delay([], user, cacheKey=cacheKey)  # Nothing more found, so tell pushTweets the job is done.
        else:
            if result == '404':
                db = get_neo_driver()
                setUserDefunct(db, user)
                db.close()
                cache.set('scrape_tweets_' + self.request.root_id, 'done')
            if result == 'limited':
                raise getTweets.retry(countdown=api.get_user_timeline_wait())

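# Illustrative sketch (not executed): queueing a one-off timeline grab. The
# screen name and credentials dict are hypothetical; cacheKey is only needed
# when the call belongs to a wider scrape that must be told the timeline is
# exhausted. Each recursion above passes min(tweet ids) - 1 as max_id, so
# successive pages of up to 200 tweets never overlap.
def example_get_tweets(credentials):
    getTweets.delay('some_account', maxTweets=500, credentials=credentials)
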
def getTwitterConnections(self, user, friends=True, cursor=-1, credentials=False, cacheKey=False):
    """Get the connections of the given user, push them to Neo4J.

    Positional arguments:
    user -- the screen_name of the user

    Keyword arguments:
    friends -- "twits" are the user's friends if True (default), else they're followers
    cacheKey -- a Redis key that identifies an on-going task to grab a user's friends or followers
    cursor -- id of the next block of connections to retrieve, set when the task calls itself

    """
    api = RatedTwitter(credentials=credentials)
    if friends:
        method = api.get_friends_list
        limit = api.get_friends_list_wait()
        method_name = 'get_friends_list'
    else:
        method = api.get_followers_list
        limit = api.get_followers_list_wait()
        method_name = 'get_followers_list'

    if limit:
        logger.info('*** TWITTER RATE-LIMITED: %s:%s ***' % (method_name, str(cursor)))
        raise getTwitterConnections.retry(countdown=limit)
    else:
        okay, result = method(screen_name=user, cursor=cursor, count=200)  # We can get a maximum of 200 connections at once.
        if okay:
            logger.info('*** TWITTER CURSOR: %s:%s:%s ***' % (method_name, user, str(cursor)))
            twits = result['users']
            next_cursor = result.get('next_cursor', False)
            if next_cursor:  # Unless the next cursor is 0, we're not done yet.
                getTwitterConnections.delay(user, friends=friends, cursor=next_cursor, cacheKey=cacheKey,
                                            credentials=credentials)
                pushTwitterConnections.delay(twits, user, friends=friends)
            else:
                pushTwitterConnections.delay(twits, user, friends=friends, cacheKey=cacheKey)  # All done, send the cacheKey.
        else:
            if result == 'limited':
                raise getTwitterConnections.retry(exc=Exception('Twitter rate-limited', method_name),
                                                  countdown=API_TIMEOUT)
            if result == '404':
                db = get_neo_driver()
                setUserDefunct(db, user)
                db.close()
                if friends:
                    cache.set('scrape_friends_' + self.request.root_id, 'done')
                else:
                    cache.set('scrape_followers_' + self.request.root_id, 'done')

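# Illustrative sketch (not executed): fetching both sides of a user's graph.
# The screen name and credentials dict are hypothetical; each call pages
# itself through Twitter's cursoring until next_cursor comes back as 0.
def example_get_connections(credentials):
    getTwitterConnections.delay('some_account', credentials=credentials)                 # friends
    getTwitterConnections.delay('some_account', friends=False, credentials=credentials)  # followers
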
def startUserScrape(self, user, credentials=False):
    """Start scraping around the given user."""
    logger.info('*** STARTED SCRAPING: USER: %s ***' % (user,))
    cache.set('user_scrape_' + self.request.root_id, 'true')
    cache.set('scrape_mode_' + self.request.root_id, 'user')
    cache.set('scrape_user_' + self.request.root_id, user)

    # Add a crawl node for this user as the centre of the scrape.
    db = get_neo_driver()
    start_user_crawl(db, user, crawl_task=self.request.root_id, status='initiated')
    db.close()

    for key in ['scrape_friends', 'scrape_followers', 'scrape_tweets']:
        cache.set(key + '_' + self.request.root_id, '')

    for job in ['friends', 'followers', 'tweets']:
        cache_key = '_'.join(['nextnearest', job, user, self.request.root_id])
        cache.set(cache_key, '')

    doUserScrape.delay(credentials=credentials)

def doDefaultScrape(self, latest=False, credentials=False):
    """Retrieve the tweets, friends or followers of the next users in the default scrape."""
    keep_going = cache.get('default_scrape_' + self.request.root_id)
    if (not keep_going) or keep_going.decode('utf-8') != 'true':
        logger.info('*** STOPPED DEFAULT SCRAPE ***')
        return False

    logger.info('*** SCRAPING... ***')

    this_friend = cache.get('scrape_friends_' + self.request.root_id)
    if (not this_friend) or this_friend.decode('utf-8') == 'done':
        cache.set('scrape_friends_' + self.request.root_id, 'running')
        getTwitterConnections.delay(whoNext('friends', latest=latest), credentials=credentials,
                                    cacheKey='scrape_friends_' + self.request.root_id)
    else:
        logger.info('*** FRIENDS BUSY ***')

    this_follower = cache.get('scrape_followers_' + self.request.root_id)
    if (not this_follower) or this_follower.decode('utf-8') == 'done':
        cache.set('scrape_followers_' + self.request.root_id, 'running')
        getTwitterConnections.delay(whoNext('friends', latest=latest), credentials=credentials, friends=False,
                                    cacheKey='scrape_followers_' + self.request.root_id)
    else:
        logger.info('*** FOLLOWERS BUSY ***')

    this_tweet = cache.get('scrape_tweets_' + self.request.root_id)
    if (not this_tweet) or this_tweet.decode('utf-8') == 'done':
        cache.set('scrape_tweets_' + self.request.root_id, 'running')
        getTweets.delay(whoNext('tweets', latest=latest), maxTweets=1000, credentials=credentials,
                        cacheKey='scrape_tweets_' + self.request.root_id)
    else:
        logger.info('*** TWEETS BUSY ***')

    # Check again in 30 seconds; credentials must travel as a task kwarg, not a Celery option.
    doDefaultScrape.apply_async(kwargs={'latest': latest, 'credentials': credentials}, countdown=30)

def pushTweets(self, tweets, user, cacheKey=False):
    """Dump a set of tweets from a given user's timeline to Neo4J/Solr.

    Positional arguments:
    tweets -- a list of tweets as returned by Twython
    user -- screen_name of the user

    Keyword arguments:
    cacheKey -- a Redis key that identifies an on-going task to grab a user's timeline

    """
    logger.info('Executing pushTweets task id {0.id}, task parent id {0.parent_id}, root id {0.root_id}'.format(self.request))

    tweetDump = decomposeTweets(tweets)  # Extract mentions, URLs, replies, hashtags etc...

    pushRenderedTweets2Neo.delay(user, tweetDump)

    for label in ['tweet', 'retweet', 'quotetweet']:
        pushRenderedTweets2Solr.delay([t[0] for t in tweetDump[label]])

    if cacheKey:  # These are the last Tweets; tell the scraper we're done.
        cache.set(cacheKey, 'done')
        logger.info('*** %s: DONE WITH TWEETS ***' % user)

def startScrape(self, latest=False, credentials=False):
    """Start the default scrape, retrieving the users that need timelines, friends or followers
    updated, in the order that they were first added.
    """
    logger.info('*** STARTED SCRAPING: DEFAULT: ***')
    cache.set('default_scrape_' + self.request.root_id, 'true')
    cache.set('scrape_mode_' + self.request.root_id, 'default')

    for key in ['scrape_friends', 'scrape_followers', 'scrape_tweets']:
        cache.set(key + '_' + self.request.root_id, '')

    doDefaultScrape.delay(latest=latest, credentials=credentials)

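# Illustrative sketch (not executed): starting a default scrape and later
# asking it to stop on its next 30-second loop. The credentials dict is
# hypothetical; the id returned by delay() is the root id under which the
# scrape's cache flags are keyed, so it is what stop_scrape expects.
def example_default_scrape(credentials):
    job = startScrape.delay(latest=True, credentials=credentials)
    # ... some time later:
    stop_scrape.delay(job.id)
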
def nextNearest(db, user, job, root_task, max_friends=2000, max_followers=2000, limit=20, max_tweets=2000,
                test=False):
    """Find the next user to retrieve friends, followers or tweets, closest to a given user."""
    cacheKey = '_'.join(['nextnearest', job, user, root_task])
    nextUserDump = cache.get(cacheKey)
    next_users = False
    if nextUserDump:
        try:
            next_users = json.loads(nextUserDump.decode('utf-8'))
        except ValueError:
            next_users = []
    if next_users:
        logging.info('*** NEXT ' + job + ': ' + ', '.join(next_users) + ' from ' + user + ' ***')
        next_user = next_users.pop(0)
        cache.set(cacheKey, json.dumps(next_users))
        return next_user

    # No queued candidates; build a Cypher query for users within two hops of the seed user.
    query_str = "MATCH (a:twitter_user {{screen_name: '{}'}})-[:FOLLOWS]-(d:twitter_user)".format(user)
    query_str += ' MATCH (b:twitter_user)-[:FOLLOWS]-(d) WITH DISTINCT b '
    if job == 'friends':
        query_str += 'MATCH (b)-[:FOLLOWS]->(c:twitter_user) '
    if job == 'followers':
        query_str += 'MATCH (b)<-[:FOLLOWS]-(c:twitter_user) '
    if job == 'tweets':
        query_str += 'MATCH (b)-[:TWEETED]->(c:tweet) '
    query_str += 'WITH b, COUNT(c) AS n '
    query_str += 'WHERE b.friends_count < {} AND b.followers_count < {} ' \
                 'AND NOT EXISTS (b.protected) AND NOT EXISTS (b.defunct) '.format(max_friends, max_followers)
    if job == 'friends':
        query_str += 'AND n < b.friends_count/2 '
    if job == 'followers':
        query_str += 'AND n < b.followers_count/2 '
    if job == 'tweets':
        query_str += 'AND b.statuses_count > 0 AND n < b.statuses_count/2 AND n<{} '.format(max_tweets)
    query_str += 'RETURN b.screen_name ORDER BY b.{}_last_scraped LIMIT {}'.format(job, limit)

    logging.info('*** Looking for ' + job + ' for ' + user + ' ***')

    if test:
        return query_str

    try:
        with db.session() as session:
            with session.begin_transaction() as tx:
                result = tx.run(query_str)
                next_users = [record.values()[0] for record in result]
    except Exception:
        next_users = []

    if next_users:
        logging.info('*** NEXT ' + job + ': ' + ', '.join(next_users) + ' from ' + user + ' ***')
        next_user = next_users.pop(0)
        cache.set(cacheKey, json.dumps(next_users))
        return next_user
    else:
        logging.info('No more ' + job + ' for ' + user)
        return False

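# Illustrative sketch (not executed): test=True returns the generated Cypher
# instead of running it, which is handy for eyeballing the query. With
# job='friends' and the defaults, the string comes out roughly as below
# (the screen name and root task id are hypothetical):
#
#   MATCH (a:twitter_user {screen_name: 'some_account'})-[:FOLLOWS]-(d:twitter_user)
#   MATCH (b:twitter_user)-[:FOLLOWS]-(d) WITH DISTINCT b
#   MATCH (b)-[:FOLLOWS]->(c:twitter_user)
#   WITH b, COUNT(c) AS n
#   WHERE b.friends_count < 2000 AND b.followers_count < 2000
#     AND NOT EXISTS (b.protected) AND NOT EXISTS (b.defunct)
#     AND n < b.friends_count/2
#   RETURN b.screen_name ORDER BY b.friends_last_scraped LIMIT 20
def example_next_nearest_query():
    db = get_neo_driver()
    cypher = nextNearest(db, 'some_account', 'friends', 'some-root-task-id', test=True)
    db.close()
    return cypher
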
def stop_scrape(self, task_id):
    """Stop an executing scrape on the next loop."""
    scrape_mode = cache.get('scrape_mode_' + task_id)
    if scrape_mode:
        scrape_mode = scrape_mode.decode('utf-8')
        cache.set(scrape_mode + '_scrape_' + task_id, 'false')

def connected(self, val):
    """Record the connected state of the stream identified by self.stream_id."""
    key = 'stream_' + self.stream_id + '_connected'
    cache.set(key, val)