예제 #1
0
 def handle(self, *args, **options):
     """Fetch new timeline statuses for active twitter users and store them.

     Iterates over every active ``TwitterUser`` (or only the one whose
     ``name`` matches the ``user`` option), ordered by
     ``date_last_checked``.  For each user the highest stored
     ``twitter_id`` is passed to ``api.user_timeline`` as ``since_id`` and
     each returned status is saved as a ``TwitterUserItem``.

     Paging for a user stops when the timeline comes back empty (which
     includes the case where a ``TweepError`` was raised), when fewer than
     150 new items were saved, when ``max_id`` drops below ``since_id``,
     or when the last response status is >= 400.
     """
     api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
     qs_tweeps = TwitterUser.objects.filter(is_active=True)
     if options.get('user', None):
         qs_tweeps = qs_tweeps.filter(name=options.get('user'))
     qs_tweeps = qs_tweeps.order_by('date_last_checked')
     for tweep in qs_tweeps:
         print('user: %s' % tweep.name)
         # can't do this unless we have a twitter user_id stored
         if tweep.uid == 0:
             print('uid has not been set yet - skipping this user.  '
                   'May need to run populate_uids if this is an old '
                   'database.')
             continue
         # first tweet id to get: the highest one already recorded, or 1
         # when nothing is stored yet (the aggregate is None on an empty
         # set, so a single query covers both cases)
         max_dict = tweep.items.all().aggregate(Max('twitter_id'))
         since_id = max_dict['twitter_id__max'] or 1
         max_id = 0
         # update their record (auto_now) as we're checking it now
         tweep.save()
         while True:
             stop = False
             print('since: %s' % since_id)
             call_kwargs = {'id': tweep.uid,
                            'since_id': since_id,
                            'count': 200}
             # max_id is only sent once a previous page has set it
             if max_id:
                 print('max: %s' % max_id)
                 call_kwargs['max_id'] = max_id
             try:
                 timeline = api.user_timeline(**call_kwargs)
             except tweepy.error.TweepError as e:
                 print('ERROR: %s' % e)
                 timeline = []
             if not timeline:
                 # Nothing new; stop for this user
                 stop = True
             new_status_count = 0
             for status in timeline:
                 # eg 'Mon Oct 15 20:15:12 +0000 2012'
                 dt_aware = dt_aware_from_created_at(status['created_at'])
                 try:
                     item, created = TwitterUserItem.objects.get_or_create(
                         twitter_user=tweep,
                         twitter_id=status['id'],
                         date_published=dt_aware,
                         item_text=status['text'],
                         item_json=json.dumps(status),
                         place=status['place'] or '',
                         source=status['source'])
                     if created:
                         # next page should end just below this item
                         max_id = item.twitter_id - 1
                         new_status_count += 1
                     else:
                         print('skip: id %s' % item.id)
                 except IntegrityError as ie:
                     print('ERROR: %s' % ie)
             print('saved: %s item(s)' % new_status_count)
             # max new statuses per call is 200, so check for less than
             # a reasonable fraction of that to see if we should stop
             if new_status_count < 150:
                 print('stop: < 150 new statuses')
                 stop = True
             if max_id < since_id:
                 # Got 'em all, stop for this user
                 print('stop: max_id < since_id')
                 stop = True
             # The api may not have a response to inspect (e.g. the very
             # first call raised before anything came back); dereferencing
             # it blindly would abort the whole run with AttributeError.
             last_response = getattr(api, 'last_response', None)
             if last_response is not None:
                 # Check response codes for issues
                 if last_response.status >= 400:
                     print('error: %s' % last_response.getheader('status'))
                     stop = True
                 # wait before next call
                 time.sleep(set_wait_time(last_response))
             if stop:
                 break
예제 #2
0
 def handle(self, *args, **options):
     """Fetch new timeline statuses for active twitter users and store them.

     Creates a ``TwitterUserTimelineJob`` and then iterates over every
     active ``TwitterUser`` (or only the one whose ``name`` matches the
     ``user`` option; otherwise in random order).  For each user the
     highest stored ``twitter_id`` is passed to ``api.user_timeline`` as
     ``since_id`` and each returned status is saved as a
     ``TwitterUserItem``.  The number of items created is accumulated in
     ``job.num_added`` and problems are recorded as
     ``TwitterUserTimelineError`` rows attached to the job.

     Paging for a user stops when the timeline comes back empty (which
     includes the case where a ``TweepError`` was raised), when fewer than
     150 new items were saved, when ``max_id`` drops below ``since_id``,
     or when the last response status is >= 400.
     """
     api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
     job = TwitterUserTimelineJob()
     job.save()
     qs_tweeps = TwitterUser.objects.filter(is_active=True)
     if options.get('user', None):
         qs_tweeps = qs_tweeps.filter(name=options.get('user'))
     else:
         # NOTE: randomizing here might be healthier when considering
         # possibility of multiple parallel jobs running and competing
         # for api calls but this is an instinctual call, not data-driven
         qs_tweeps = qs_tweeps.order_by('?')
     for tweep in qs_tweeps:
         print('user: %s' % tweep.name)
         # can't do this unless we have a twitter user_id stored
         if tweep.uid == 0:
             skipmsg = ('uid has not been set yet - skipping this '
                        'user.  May need to run populate_uids if this '
                        'is an old database.')
             print(skipmsg)
             error = TwitterUserTimelineError(job=job, user=tweep,
                                              error=skipmsg)
             error.save()
             continue
         # first tweet id to get: the highest one already recorded, or 1
         # when nothing is stored yet (the aggregate is None on an empty
         # set, so a single query covers both cases)
         max_dict = tweep.items.all().aggregate(Max('twitter_id'))
         since_id = max_dict['twitter_id__max'] or 1
         max_id = 0
         # update their record (auto_now) as we're checking it now
         tweep.save()
         while True:
             stop = False
             print('since: %s' % since_id)
             call_kwargs = {'id': tweep.uid,
                            'since_id': since_id,
                            'count': 200}
             # max_id is only sent once a previous page has set it
             if max_id:
                 print('max: %s' % max_id)
                 call_kwargs['max_id'] = max_id
             try:
                 timeline = api.user_timeline(**call_kwargs)
             except tweepy.error.TweepError as e:
                 print('ERROR: %s' % e)
                 error = TwitterUserTimelineError(job=job, user=tweep,
                                                  error=e)
                 error.save()
                 timeline = []
             if not timeline:
                 # Nothing new; stop for this user
                 stop = True
             new_status_count = 0
             for status in timeline:
                 # eg 'Mon Oct 15 20:15:12 +0000 2012'
                 dt_aware = dt_aware_from_created_at(status['created_at'])
                 try:
                     item, created = TwitterUserItem.objects.get_or_create(
                         twitter_user=tweep,
                         twitter_id=status['id'],
                         date_published=dt_aware,
                         item_text=status['text'],
                         item_json=json.dumps(status),
                         place=status['place'] or '',
                         source=status['source'])
                     if created:
                         # next page should end just below this item
                         max_id = item.twitter_id - 1
                         new_status_count += 1
                     else:
                         print('skip: id %s' % item.id)
                 except IntegrityError as ie:
                     print('ERROR: %s' % ie)
                     error = TwitterUserTimelineError(job=job, user=tweep,
                                                      error=ie)
                     error.save()
             print('saved: %s item(s)' % new_status_count)
             job.num_added += new_status_count
             # max new statuses per call is 200, so check for less than
             # a reasonable fraction of that to see if we should stop
             if new_status_count < 150:
                 print('stop: < 150 new statuses')
                 stop = True
             if max_id < since_id:
                 # Got 'em all, stop for this user
                 print('stop: max_id < since_id')
                 stop = True
             # The api may not have a response to inspect (e.g. the very
             # first call raised before anything came back); dereferencing
             # it blindly would abort the whole run with AttributeError.
             last_response = getattr(api, 'last_response', None)
             if last_response is not None and last_response.status >= 400:
                 # Check response codes for issues.  The message is built
                 # from the response itself: the exception name bound by
                 # the except clause above is not guaranteed to exist (or
                 # to belong to this call) at this point.
                 status_header = last_response.getheader('status')
                 print('error: %s' % status_header)
                 errmsg = 'HTTP %s: %s' % (last_response.status,
                                           status_header)
                 error = TwitterUserTimelineError(job=job, user=tweep,
                                                  error=errmsg)
                 error.save()
                 stop = True
             job.save()
             if last_response is not None:
                 # wait before next call
                 time.sleep(set_wait_time(last_response))
             if stop:
                 break
예제 #3
0
 def handle(self, *args, **options):
     """Fetch new timeline statuses for active twitter users and store them.

     Creates a ``TwitterUserTimelineJob`` and then iterates over every
     active ``TwitterUser`` (or only the one whose ``name`` matches the
     ``user`` option; otherwise in random order).  For each user the
     highest stored ``twitter_id`` is passed to ``api.user_timeline`` as
     ``since_id`` and each returned status is saved as a
     ``TwitterUserItem``.  The number of items created is accumulated in
     ``job.num_added`` and problems are recorded as
     ``TwitterUserTimelineError`` rows attached to the job.

     Paging for a user ends immediately on a ``TweepError``; otherwise it
     stops when the timeline comes back empty, when fewer than 150 new
     items were saved, when ``max_id`` drops below ``since_id``, or when
     the last response status is >= 400.
     """
     api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
     job = TwitterUserTimelineJob()
     job.save()
     qs_tweeps = TwitterUser.objects.filter(is_active=True)
     if options.get('user', None):
         qs_tweeps = qs_tweeps.filter(name=options.get('user'))
     else:
         # NOTE: randomizing here might be healthier when considering
         # possibility of multiple parallel jobs running and competing
         # for api calls but this is an instinctual call, not data-driven
         qs_tweeps = qs_tweeps.order_by('?')
     for tweep in qs_tweeps:
         print('user: %s' % tweep.name)
         # can't do this unless we have a twitter user_id stored
         if tweep.uid == 0:
             skipmsg = ('uid has not been set yet - skipping this '
                        'user.  May need to run populate_uids if this '
                        'is an old database.')
             print(skipmsg)
             error = TwitterUserTimelineError(job=job,
                                              user=tweep,
                                              error=skipmsg)
             error.save()
             continue
         # first tweet id to get: the highest one already recorded, or 1
         # when nothing is stored yet (the aggregate is None on an empty
         # set, so a single query covers both cases)
         max_dict = tweep.items.all().aggregate(Max('twitter_id'))
         since_id = max_dict['twitter_id__max'] or 1
         max_id = 0
         # update their record (auto_now) as we're checking it now
         tweep.save()
         while True:
             # wait before next call no matter what;
             # use getattr() because the api may have no last_response
             # the first time or after errors
             time.sleep(set_wait_time(getattr(api, 'last_response', None)))
             job.save()
             stop = False
             print('since: %s' % since_id)
             call_kwargs = {'id': tweep.uid,
                            'since_id': since_id,
                            'count': 200}
             # max_id is only sent once a previous page has set it
             if max_id:
                 print('max: %s' % max_id)
                 call_kwargs['max_id'] = max_id
             try:
                 timeline = api.user_timeline(**call_kwargs)
             except tweepy.error.TweepError as e:
                 print('ERROR: %s' % e)
                 error = TwitterUserTimelineError(job=job,
                                                  user=tweep,
                                                  error=e)
                 error.save()
                 # give up on this user and move on to the next one
                 break
             if not timeline:
                 # Nothing new; stop for this user
                 stop = True
             new_status_count = 0
             for status in timeline:
                 # eg 'Mon Oct 15 20:15:12 +0000 2012'
                 dt_aware = dt_aware_from_created_at(status['created_at'])
                 try:
                     item, created = TwitterUserItem.objects.get_or_create(
                         twitter_user=tweep,
                         twitter_id=status['id'],
                         date_published=dt_aware,
                         item_text=status['text'],
                         item_json=json.dumps(status),
                         place=status['place'] or '',
                         source=status['source'])
                     if created:
                         # next page should end just below this item
                         max_id = item.twitter_id - 1
                         new_status_count += 1
                     else:
                         print('skip: id %s' % item.id)
                 except IntegrityError as ie:
                     print('ERROR: %s' % ie)
                     error = TwitterUserTimelineError(job=job,
                                                      user=tweep,
                                                      error=ie)
                     error.save()
             print('saved: %s item(s)' % new_status_count)
             job.num_added += new_status_count
             # Persist the running total right away: the save at the top
             # of the loop is skipped once we break out below, which would
             # otherwise leave the last page's count unsaved.
             job.save()
             # max new statuses per call is 200, so check for less than
             # a reasonable fraction of that to see if we should stop
             if new_status_count < 150:
                 print('stop: < 150 new statuses')
                 stop = True
             if max_id < since_id:
                 # Got 'em all, stop for this user
                 print('stop: max_id < since_id')
                 stop = True
             # Check response codes for issues
             last_response = getattr(api, 'last_response', None)
             if (last_response is not None and
                     last_response.status_code >= 400):
                 # NOTE(review): status_code implies a requests-style
                 # response, whose headers are read through .headers
                 # rather than getheader() - confirm against the tweepy
                 # version in use.
                 status_header = last_response.headers.get('status')
                 print('error: %s' % status_header)
                 # Build the message from the response itself: no
                 # exception is bound on this path (a TweepError breaks
                 # out of the loop above).
                 errmsg = 'HTTP %s: %s' % (last_response.status_code,
                                           status_header)
                 error = TwitterUserTimelineError(job=job,
                                                  user=tweep,
                                                  error=errmsg)
                 error.save()
                 stop = True
             if stop:
                 break