def crawl_followers(user_id, cursor=-1, must_include=lambda x: True):
    """
    Fetch followers of `user_id` through fetch_followers(), starting at
    `cursor`, and add them to the user's FollowerFile.

    The file is committed only when the resulting response status is not
    STATUS_ERROR and at least one follower was returned.

    `must_include` is accepted but not used by this function.

    @return a TwitterResponse built from the fetch message status, the
            user id, the new cursor and the sleep time; its
            'follower.total_fetched' entry holds the number of followers
            returned by the fetch
    """
    log.msg("Retrieving followers of user_id %d" % user_id)

    # TODO: in case of duplication errors is better to open it as rw and load
    # all the users in a set thus removing duplicates
    follower_file = FollowerFile(user_id)

    fetch_result = fetch_followers(user_id=user_id, cursor=cursor)
    msg, followers, sleep_time, new_cursor = fetch_result
    follower_file.add_followers(followers)

    status = TwitterResponse.msg_to_status(msg)
    response = TwitterResponse(status, user_id, new_cursor, sleep_time)
    response['follower.total_fetched'] = len(followers)

    # Persist only successful, non-empty fetches.
    if response.status != STATUS_ERROR and len(followers) > 0:
        follower_file.commit()

    return response
def crawl_followers(user_id, cursor=-1, must_include=lambda x: True):
    """
    Retrieve followers of `user_id` via fetch_followers() from the given
    `cursor` and record them in the FollowerFile of that user.

    Nothing is committed to the file when the response status is
    STATUS_ERROR or when the fetch returned no followers.

    `must_include` is part of the signature but is not used here.

    @return a TwitterResponse (status from the fetch message, user id, new
            cursor, sleep time) with "follower.total_fetched" set to the
            number of followers returned by the fetch
    """
    log.msg("Retrieving followers of user_id %d" % user_id)

    # TODO: in case of duplication errors is better to open it as rw and load
    # all the users in a set thus removing duplicates
    storage = FollowerFile(user_id)

    msg, followers, sleep_time, new_cursor = fetch_followers(
        user_id=user_id,
        cursor=cursor,
    )
    storage.add_followers(followers)

    response = TwitterResponse(
        TwitterResponse.msg_to_status(msg),
        user_id,
        new_cursor,
        sleep_time,
    )
    response["follower.total_fetched"] = len(followers)

    # Commit only when the fetch succeeded and actually produced followers.
    if response.status != STATUS_ERROR and len(followers) > 0:
        storage.commit()

    return response
def analyze_followers_of(user_id, start_cursor=0,
                         already_processed=lambda x: False,
                         must_follow=lambda x: True):
    """
    Run analyze_followers() over the FollowerFile of `user_id` and collect
    the 'id_str' of every looked-up user accepted by `must_follow`.

    When the follower file is empty a TwitterResponse with
    STATUS_UNAUTHORIZED is returned immediately, carrying `start_cursor`
    unchanged and a sleep time of 0.

    @return a TwitterResponse whose cursor is the line reached by the
            analysis, with these entries set:
            'analyzer.total_included' - number of users accepted
            'analyzer.total_fetched'  - number of users looked up
            'analyzer.target_users'   - list of accepted 'id_str' values
    """
    log.msg("Analyzing followers of user_id %d" % user_id)

    follower_file = FollowerFile(user_id)

    if len(follower_file) == 0:
        log.msg("Follower file for user_id %d is not present. Bogus data?" % user_id)
        # Let's treat this as not found user
        return TwitterResponse(STATUS_UNAUTHORIZED, user_id, start_cursor, 0)

    def log_progress(lookup_infos, current, total):
        # Progress callback handed to analyze_followers().
        percent = 100 * (current / float(total))
        log.msg("user_id %d Follower file: analyzed %d of %d [%02d%%]" %
                (user_id, current, total, percent))

    analysis = analyze_followers(
        follower_file,
        start_cursor=start_cursor,
        already_processed=already_processed,
        progress_cb=log_progress,
    )
    msg, lookup_infos, sleep_time, current_line = analysis

    target_users = [info['id_str'] for info in lookup_infos if must_follow(info)]

    status = TwitterResponse.msg_to_status(msg)
    response = TwitterResponse(status, user_id, current_line, sleep_time)
    response['analyzer.total_included'] = len(target_users)
    response['analyzer.total_fetched'] = len(lookup_infos)
    response['analyzer.target_users'] = target_users

    return response
msg, timeline, sleep_time = fetch_timeline(user_id=user_id, since_id=since_id) if len(timeline) > 0 and msg == MSG_OK: # Here we need to create a new file containing the delta timeline # and also include the previous tweets total_included = 0 total_fetched = len(timeline) for tweet in timeline: if must_include(tweet): writer.add_tweet(tweet) total_included += 1 response = TwitterResponse(TwitterResponse.msg_to_status(msg), user_id, 0, sleep_time) response['update.total_included'] = total_included response['update.total_fetched'] = total_fetched writer.commit() return response # Well this is an unfortunate yet interesting situation which we decided # not to handle at all since. In this case the delta update is not # completely downloaded because may be you hit the rate limit. Therefore # if we write the information we collected in the file we will end up # having a hole in the timeline. # The official REST API documentation specify that you can at most download
msg, timeline, sleep_time = fetch_timeline(user_id=user_id, since_id=since_id) if len(timeline) > 0 and msg == MSG_OK: # Here we need to create a new file containing the delta timeline # and also include the previous tweets total_included = 0 total_fetched = len(timeline) for tweet in timeline: if must_include(tweet): writer.add_tweet(tweet) total_included += 1 response = TwitterResponse(TwitterResponse.msg_to_status(msg), user_id, 0, sleep_time ) response['update.total_included'] = total_included response['update.total_fetched'] = total_fetched writer.commit() return response # Well this is an unfortunate yet interesting situation which we decided # not to handle at all since. In this case the delta update is not # completely downloaded because may be you hit the rate limit. Therefore
def crawl_timeline(user_id, must_include=lambda x: True):
    """
    Fetch tweets of `user_id` through fetch_timeline() and add the ones
    accepted by `must_include` to the user's TimelineFile.

    If the timeline file already has a last tweet, its id minus one is
    passed to fetch_timeline() as `last_tweet_id`; otherwise -1 is passed.
    After the fetched tweets have been filtered, `must_include(None)` is
    called once to signal completion to the callback.

    The file is committed only when the response status is not
    STATUS_ERROR and at least one tweet was fetched.

    @return a TwitterResponse with 'timeline.total_included' (tweets kept
            after filtering) and 'timeline.total_fetched' (tweets returned
            by the fetch) set
    """
    log.msg("Fetching timeline of user_id %d" % user_id)

    writer = TimelineFile(user_id)
    total_tweets = writer.get_total()

    try:
        last_tweet_id = int(writer.get_last()['id_str']) - 1
    except Exception:
        # Best effort: any failure reading the last stored tweet is treated
        # as "no previous tweets". Unlike a bare except, this lets
        # KeyboardInterrupt / SystemExit propagate.
        log.msg("This seems to be a new timeline file")
        last_tweet_id = -1

    msg, timeline, sleep_time = fetch_timeline(user_id=user_id,
                                               last_tweet_id=last_tweet_id)
    total_fetched = len(timeline)

    # Build a real list so len() and indexing below work regardless of
    # whether filter() returns a list or an iterator.
    timeline = [tweet for tweet in timeline if must_include(tweet)]
    total_included = len(timeline)

    # Running total of stored tweets; updated only after total_included is
    # known (previously this added 0). Not reported anywhere yet.
    total_tweets += total_included

    writer.add_tweets(timeline)

    # Signal completion
    must_include(None)

    response = TwitterResponse(TwitterResponse.msg_to_status(msg),
                               user_id, 0, sleep_time)

    # The summary indexes the *filtered* timeline, so it must be non-empty;
    # must_include may have rejected every fetched tweet.
    if total_fetched >= 2 and timeline:
        def one_line(text):
            # Strip line breaks and tabs so the tweet fits on one log line.
            stripped = text.replace('\n', '').replace('\r', '').replace('\t', '')
            return stripped.encode('utf8')

        screen_name = timeline[0]['user']['screen_name']
        first_tweet = one_line(timeline[0]['text'])
        last_tweet = one_line(timeline[-1]['text'])

        # TODO: We could add some statics like the number of hashtags and so on.
        # but may be we could exploits the pub/sub architecture. Other option is
        # to use directly the must_follow callback to collect statistics
        log.msg("Got %d tweets for user_id %d screen_name %s" %
                (total_fetched, user_id, screen_name))
        log.msg(" First tweet: '%s'" % first_tweet)
        log.msg(" Last tweet: '%s'" % last_tweet)

    response['timeline.total_included'] = total_included
    response['timeline.total_fetched'] = total_fetched

    if response.status != STATUS_ERROR and total_fetched > 0:
        writer.commit()

    return response