def populate_uid(name, force=False, api=None):
    """
    Store the Twitter uid for the active TwitterUser(s) whose stored
    screen name is ``name``.

    A lookup is made only when the stored uid is still 0 (the default
    value, meaning it has not been set yet), or when ``force`` is True.
    Inactive users are never touched.

    name  -- screen name used both to select the TwitterUser rows and to
             query Twitter
    force -- pass True to refresh the uid even when one is already stored
    api   -- API object to use; when None, one is created with
             authenticated_api() for settings.TWITTER_DEFAULT_USERNAME

    Lookup failures (tweepy.error.TweepError) are printed, not raised.

    see https://dev.twitter.com/docs/api/1.1/get/users/lookup
        for explanation of get_user call
    see https://dev.twitter.com/docs/working-with-timelines
        for explanation of max_id, since_id usage
    see also:
        https://dev.twitter.com/docs/error-codes-responses
        https://dev.twitter.com/docs/rate-limiting
    """
    if api is None:
        api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
    matching_users = TwitterUser.objects.filter(is_active=True, name=name)
    for twitter_user in matching_users:
        needs_lookup = twitter_user.uid == 0 or force is True
        if not needs_lookup:
            continue
        try:
            profile = api.get_user(screen_name=name)
            twitter_user.uid = profile['id']
            twitter_user.save()
            print('updated user \'%s\' uid to %d' % (name, twitter_user.uid))
        except tweepy.error.TweepError as err:
            print('Failed to find user \'%s\'. Error: %s' % (name, err))
        finally:
            # Pause after every lookup, successful or not; the delay is
            # derived from the last API response by set_wait_time().
            time.sleep(set_wait_time(api.last_response))
def handle(self, *args, **options):
    """
    Check every active TwitterUser's screen name against Twitter and
    record any rename.

    options['user'], when set, restricts the run to the TwitterUser with
    that name.  Users whose uid is still 0 are skipped.

    When Twitter reports a different screen name for a stored uid, the old
    name is added to the JSON object kept in ``former_names`` (keyed by
    the time the rename was noticed), ``name`` is updated, and the user is
    saved.  Lookup failures (tweepy.error.TweepError) are printed and the
    run moves on to the next user.
    """
    api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
    qs_tweeps = TwitterUser.objects.filter(is_active=True)
    if options.get('user', None):
        qs_tweeps = qs_tweeps.filter(name=options.get('user'))
    for tweep in qs_tweeps:
        print('user: %s' % tweep.name)
        # check user status, update twitter user name if it has changed
        if tweep.uid == 0:
            print('uid has not been set yet - skipping.')
            continue
        try:
            user_status = api.get_user(id=tweep.uid)
            if user_status['screen_name'] != tweep.name:
                print(' -- updating screen name to %s' %
                      user_status['screen_name'])
                # former_names holds a JSON object; empty means no history
                oldnames = json.loads(tweep.former_names or '{}')
                # Key the entry with a UTC ISO-8601 timestamp rather than
                # local-time '%c', whose output depends on the process
                # locale and carries no timezone.
                renamed_at = datetime.datetime.utcnow().strftime(
                    '%Y-%m-%dT%H:%M:%SZ')
                oldnames[renamed_at] = tweep.name
                tweep.former_names = json.dumps(oldnames)
                tweep.name = user_status['screen_name']
                # this is the only place the rename gets persisted
                tweep.save()
        except tweepy.error.TweepError as e:
            # report and fall through to the next tweep
            print('Error: %s' % e)
        finally:
            # pause after every API call, successful or not
            time.sleep(set_wait_time(api.last_response))
def handle(self, *args, **options):
    """
    Compare each active TwitterUser's stored screen name with the one
    Twitter currently reports for its uid, and record renames.

    options['user'], when set, limits the run to the TwitterUser with that
    name.  Users with uid == 0 are skipped.  On a rename the previous name
    is stored in the ``former_names`` JSON object under a UTC timestamp
    key and the user is saved with the new name.  tweepy.error.TweepError
    is printed, not raised.
    """
    api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
    active_users = TwitterUser.objects.filter(is_active=True)
    if options.get('user', None):
        active_users = active_users.filter(name=options.get('user'))
    for twitter_user in active_users:
        print('user: %s' % twitter_user.name)
        if twitter_user.uid == 0:
            # nothing to look up by until a uid has been stored
            print('uid has not been set yet - skipping.')
            continue
        try:
            profile = api.get_user(id=twitter_user.uid)
            if profile['screen_name'] != twitter_user.name:
                print(' -- updating screen name to %s' %
                      profile['screen_name'])
                if twitter_user.former_names:
                    history_json = twitter_user.former_names
                else:
                    # no history recorded yet: start from an empty object
                    history_json = '{}'
                history = json.loads(history_json)
                stamp = datetime.datetime.utcnow().strftime(
                    '%Y-%m-%dT%H:%M:%SZ')
                history[stamp] = twitter_user.name
                twitter_user.former_names = json.dumps(history)
                twitter_user.name = profile['screen_name']
                # no other save follows in this method
                twitter_user.save()
        except tweepy.error.TweepError as err:
            print('Error: %s' % err)
            # move on to the next user
            continue
        finally:
            # runs on success, on error and on the continue above
            time.sleep(set_wait_time(api.last_response))
def handle(self, *args, **options):
    """
    Check active TwitterUsers' screen names against Twitter in batches and
    record any renames.

    Users are processed in pages of 100 with one api.lookup_users() call
    per page (presumably sized to the users/lookup per-request limit --
    confirm against the Twitter API docs).  options['user'], when set,
    restricts the run to the TwitterUser with that name.  Users whose uid
    is still 0 are reported and left out of the lookup.

    On a rename the old name is added to the ``former_names`` JSON object
    under a UTC timestamp key, ``name`` is updated and the user is saved.
    A tweepy.error.TweepError is printed and the run continues with the
    next page.
    """
    api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
    qs_tweeps = TwitterUser.objects.filter(is_active=True)
    if options.get('user', None):
        qs_tweeps = qs_tweeps.filter(name=options.get('user'))
    # Page over an explicit ordering on a column this command never
    # changes.  Without one, each page is a separate query whose row
    # order is not guaranteed, so users could be skipped or repeated
    # while names are being updated between pages.
    qs_tweeps = qs_tweeps.order_by('pk')
    paginator = Paginator(qs_tweeps, 100)
    page_count = paginator.num_pages
    for page_counter in range(1, page_count + 1):
        print("Page %s of %s" % (page_counter, page_count))
        qs_page = paginator.page(page_counter)
        # uid -> TwitterUser for everyone on this page that can be looked up
        tweep_map = {}
        for tweep in qs_page:
            # check user status, update twitter user name if it has changed
            if tweep.uid == 0:
                print('user: %s' % tweep.name)
                print(' -- uid has not been set yet - skipping.')
                continue
            tweep_map[tweep.uid] = tweep
        if not tweep_map:
            # nothing to look up on this page, so no API call and no wait
            continue
        try:
            user_statuses = api.lookup_users(user_ids=list(tweep_map))
            for user_status in user_statuses:
                tweep = tweep_map[user_status['id']]
                print('user: %s' % tweep.name)
                if user_status['screen_name'] != tweep.name:
                    print(' -- updating screen name to %s' %
                          user_status['screen_name'])
                    # empty former_names means no history yet
                    oldnames = json.loads(tweep.former_names or '{}')
                    renamed_at = datetime.datetime.utcnow().strftime(
                        '%Y-%m-%dT%H:%M:%SZ')
                    oldnames[renamed_at] = tweep.name
                    tweep.former_names = json.dumps(oldnames)
                    tweep.name = user_status['screen_name']
                    # this is the only place the rename gets persisted
                    tweep.save()
        except tweepy.error.TweepError as e:
            # report and carry on with the next page
            print('Error: %s' % e)
        finally:
            # pause after every API call, successful or not
            time.sleep(set_wait_time(api.last_response))
def handle(self, *args, **options):
    """
    Fetch new timeline statuses for active TwitterUsers and store them as
    TwitterUserItems, tracking the run in a TwitterUserTimelineJob.

    options['user'], when set, restricts the run to the TwitterUser with
    that name; otherwise users are processed in random order.  Users whose
    uid is still 0 are skipped and a TwitterUserTimelineError is recorded.

    For each user, statuses newer than the highest stored twitter_id are
    requested up to 200 at a time, walking backwards with max_id until a
    stop condition is met.  API errors, integrity errors and HTTP error
    statuses are printed and recorded as TwitterUserTimelineError rows;
    job.num_added accumulates the number of items created.
    """
    api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
    job = TwitterUserTimelineJob()
    job.save()
    qs_tweeps = TwitterUser.objects.filter(is_active=True)
    if options.get('user', None):
        qs_tweeps = qs_tweeps.filter(name=options.get('user'))
    else:
        # NOTE: randomizing here might be healthier when considering
        # possibility of multiple parallel jobs running and competing
        # for api calls but this is an instinctual call, not data-driven
        qs_tweeps = qs_tweeps.order_by('?')
    for tweep in qs_tweeps:
        print('user: %s' % tweep.name)
        # can't do this unless we have a twitter user_id stored
        if tweep.uid == 0:
            skipmsg = ('uid has not been set yet - skipping this '
                       'user. May need to run populate_uids if this '
                       'is an old database.')
            print(skipmsg)
            error = TwitterUserTimelineError(job=job, user=tweep,
                                             error=skipmsg)
            error.save()
            continue
        # now move on to determining first tweet id to get
        since_id = 1
        # set since_id if they have any statuses recorded
        if tweep.items.count() > 0:
            max_dict = tweep.items.all().aggregate(Max('twitter_id'))
            since_id = max_dict['twitter_id__max']
        max_id = 0
        # update their record (auto_now) as we're checking it now
        tweep.save()
        while True:
            stop = False
            # exception raised by this iteration's API call, if any
            fetch_error = None
            try:
                print('since: %s' % (since_id))
                if max_id:
                    print('max: %s' % max_id)
                    timeline = api.user_timeline(id=tweep.uid,
                                                 since_id=since_id,
                                                 max_id=max_id, count=200)
                else:
                    timeline = api.user_timeline(id=tweep.uid,
                                                 since_id=since_id,
                                                 count=200)
            except tweepy.error.TweepError as e:
                print('ERROR: %s' % e)
                fetch_error = e
                error = TwitterUserTimelineError(job=job, user=tweep,
                                                 error=e)
                error.save()
                timeline = []
            if len(timeline) == 0:
                # Nothing new; stop for this user
                stop = True
            new_status_count = 0
            for status in timeline:
                # eg 'Mon Oct 15 20:15:12 +0000 2012'
                dt_aware = dt_aware_from_created_at(status['created_at'])
                try:
                    item, created = TwitterUserItem.objects.get_or_create(
                        twitter_user=tweep,
                        twitter_id=status['id'],
                        date_published=dt_aware,
                        item_text=status['text'],
                        item_json=json.dumps(status),
                        place=status['place'] or '',
                        source=status['source'])
                    if created:
                        # next request should end just below this status
                        max_id = item.twitter_id - 1
                        new_status_count += 1
                    else:
                        print('skip: id %s' % item.id)
                except IntegrityError as ie:
                    print('ERROR: %s' % ie)
                    error = TwitterUserTimelineError(job=job, user=tweep,
                                                     error=ie)
                    error.save()
            print('saved: %s item(s)' % new_status_count)
            job.num_added += new_status_count
            # max new statuses per call is 200, so check for less than
            # a reasonable fraction of that to see if we should stop
            if new_status_count < 150:
                print('stop: < 150 new statuses')
                stop = True
            if max_id < since_id:
                # Got 'em all, stop for this user
                print('stop: max_id < since_id')
                stop = True
            # Check response codes for issues
            response_status = api.last_response.status
            if response_status >= 400:
                status_header = api.last_response.getheader('status')
                print('error: %s' % status_header)
                # Record what went wrong in THIS iteration: the exception
                # just caught, or else the HTTP status itself.  The
                # exception variable from the try block must not be used
                # here, as it may be unbound or left over from an earlier
                # failure.
                if fetch_error is not None:
                    failure = fetch_error
                else:
                    failure = 'HTTP status %s: %s' % (response_status,
                                                      status_header)
                error = TwitterUserTimelineError(job=job, user=tweep,
                                                 error=failure)
                error.save()
                stop = True
            job.save()
            # wait before next call no matter what
            time.sleep(set_wait_time(api.last_response))
            if stop:
                break
def handle(self, *args, **options):
    """
    Fetch new timeline statuses for active TwitterUsers and store them as
    TwitterUserItems.

    options['user'], when set, limits the run to the TwitterUser with that
    name.  Users are visited in ``date_last_checked`` order; users whose
    uid is still 0 are skipped with a message.

    For each user, statuses newer than the highest stored twitter_id are
    requested up to 200 at a time, walking backwards with max_id until a
    stop condition is met.  API and integrity errors are printed, not
    raised.
    """
    api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
    users = TwitterUser.objects.filter(is_active=True)
    if options.get('user', None):
        users = users.filter(name=options.get('user'))
    users = users.order_by('date_last_checked')
    for user in users:
        print('user: %s' % user.name)
        if user.uid == 0:
            # timelines are requested by uid, so one must be stored first
            print('uid has not been set yet - skipping this user. '
                  'May need to run populate_uids if this is an old '
                  'database.')
            continue
        # start after the newest status already stored, if there is one
        since_id = 1
        if user.items.count() > 0:
            newest = user.items.all().aggregate(Max('twitter_id'))
            since_id = newest['twitter_id__max']
        max_id = 0
        # saving refreshes the user's auto_now timestamp for this check
        user.save()
        finished = False
        while not finished:
            request = {'id': user.uid, 'since_id': since_id, 'count': 200}
            try:
                print('since: %s' % (since_id))
                if max_id:
                    print('max: %s' % max_id)
                    request['max_id'] = max_id
                statuses = api.user_timeline(**request)
            except tweepy.error.TweepError as err:
                print('ERROR: %s' % err)
                statuses = []
            # an empty page means there is nothing new for this user
            finished = len(statuses) == 0
            saved = 0
            for status in statuses:
                # created_at looks like 'Mon Oct 15 20:15:12 +0000 2012'
                published = dt_aware_from_created_at(status['created_at'])
                try:
                    item, created = TwitterUserItem.objects.get_or_create(
                        twitter_user=user,
                        twitter_id=status['id'],
                        date_published=published,
                        item_text=status['text'],
                        item_json=json.dumps(status),
                        place=status['place'] or '',
                        source=status['source'])
                    if not created:
                        print('skip: id %s' % item.id)
                    else:
                        # the next request ends just below this status
                        max_id = item.twitter_id - 1
                        saved += 1
                except IntegrityError as integrity_err:
                    print('ERROR: %s' % integrity_err)
            print('saved: %s item(s)' % saved)
            # A call returns at most 200 statuses; well under that is
            # taken to mean the backlog is exhausted.
            if saved < 150:
                print('stop: < 150 new statuses')
                finished = True
            if max_id < since_id:
                # everything newer than since_id has been fetched
                print('stop: max_id < since_id')
                finished = True
            # an HTTP error status also ends work on this user
            if api.last_response.status >= 400:
                print('%s %s' % ('error:',
                                 api.last_response.getheader('status')))
                finished = True
            # always wait before the next call
            time.sleep(set_wait_time(api.last_response))
def handle(self, *args, **options):
    """
    Fetch new timeline statuses for active TwitterUsers and store them as
    TwitterUserItems, tracking the run in a TwitterUserTimelineJob.

    options['user'], when set, restricts the run to the TwitterUser with
    that name; otherwise users are processed in random order.  Users whose
    uid is still 0 are skipped and a TwitterUserTimelineError is recorded.

    For each user, statuses newer than the highest stored twitter_id are
    requested up to 200 at a time, walking backwards with max_id until a
    stop condition is met.  The wait between calls happens before each
    request.  A tweepy.error.TweepError is printed and recorded, and ends
    work on that user.  Integrity errors and HTTP error statuses are
    printed and recorded too.  job.num_added accumulates the number of
    items created and is saved after every batch.
    """
    api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
    job = TwitterUserTimelineJob()
    job.save()
    qs_tweeps = TwitterUser.objects.filter(is_active=True)
    if options.get('user', None):
        qs_tweeps = qs_tweeps.filter(name=options.get('user'))
    else:
        # NOTE: randomizing here might be healthier when considering
        # possibility of multiple parallel jobs running and competing
        # for api calls but this is an instinctual call, not data-driven
        qs_tweeps = qs_tweeps.order_by('?')
    for tweep in qs_tweeps:
        print('user: %s' % tweep.name)
        # can't do this unless we have a twitter user_id stored
        if tweep.uid == 0:
            skipmsg = ('uid has not been set yet - skipping this '
                       'user. May need to run populate_uids if this '
                       'is an old database.')
            print(skipmsg)
            error = TwitterUserTimelineError(job=job, user=tweep,
                                             error=skipmsg)
            error.save()
            continue
        # now move on to determining first tweet id to get
        since_id = 1
        # set since_id if they have any statuses recorded
        if tweep.items.count() > 0:
            max_dict = tweep.items.all().aggregate(Max('twitter_id'))
            since_id = max_dict['twitter_id__max']
        max_id = 0
        # update their record (auto_now) as we're checking it now
        tweep.save()
        while True:
            # wait before next call no matter what;
            # use getattr() because last_response might not be available
            # the first time or after errors
            time.sleep(set_wait_time(getattr(api, 'last_response', None)))
            stop = False
            try:
                print('since: %s' % (since_id))
                if max_id:
                    print('max: %s' % max_id)
                    timeline = api.user_timeline(id=tweep.uid,
                                                 since_id=since_id,
                                                 max_id=max_id, count=200)
                else:
                    timeline = api.user_timeline(id=tweep.uid,
                                                 since_id=since_id,
                                                 count=200)
            except tweepy.error.TweepError as e:
                print('ERROR: %s' % e)
                error = TwitterUserTimelineError(job=job, user=tweep,
                                                 error=e)
                error.save()
                # give up on this user; nothing below applies
                break
            if len(timeline) == 0:
                # Nothing new; stop for this user
                stop = True
            new_status_count = 0
            for status in timeline:
                # eg 'Mon Oct 15 20:15:12 +0000 2012'
                dt_aware = dt_aware_from_created_at(status['created_at'])
                try:
                    item, created = TwitterUserItem.objects.get_or_create(
                        twitter_user=tweep,
                        twitter_id=status['id'],
                        date_published=dt_aware,
                        item_text=status['text'],
                        item_json=json.dumps(status),
                        place=status['place'] or '',
                        source=status['source'])
                    if created:
                        # next request should end just below this status
                        max_id = item.twitter_id - 1
                        new_status_count += 1
                    else:
                        print('skip: id %s' % item.id)
                except IntegrityError as ie:
                    print('ERROR: %s' % ie)
                    error = TwitterUserTimelineError(job=job, user=tweep,
                                                     error=ie)
                    error.save()
            print('saved: %s item(s)' % new_status_count)
            job.num_added += new_status_count
            # Persist the running total right away.  Saving only at the
            # top of the loop would lose the final batch's count, because
            # the loop exits before reaching that save again.
            job.save()
            # max new statuses per call is 200, so check for less than
            # a reasonable fraction of that to see if we should stop
            if new_status_count < 150:
                print('stop: < 150 new statuses')
                stop = True
            if max_id < since_id:
                # Got 'em all, stop for this user
                print('stop: max_id < since_id')
                stop = True
            # Check response codes for issues
            response_status = api.last_response.status_code
            if response_status >= 400:
                # last_response is read via .status_code, i.e. a
                # requests-style response, which exposes headers through
                # .headers and has no getheader()
                status_header = api.last_response.headers.get('status')
                print('error: %s' % status_header)
                # No exception from this iteration can reach this point
                # (the except branch breaks out), so describe the HTTP
                # status instead of reusing the exception variable.
                failure = 'HTTP status %s: %s' % (response_status,
                                                  status_header)
                error = TwitterUserTimelineError(job=job, user=tweep,
                                                 error=failure)
                error.save()
                stop = True
            if stop:
                break