Exemplo n.º 1
0
def categorize(stdout):
    """Assign a category and score to every tweet that has none yet.

    Repeatedly fetches up to ``_limit`` tweets whose category is null,
    classifies each one with ``classifier.classify_tweet`` and stores the
    first result's label and score on the tweet. Returns once a fetch finds
    no uncategorized tweets.

    ``stdout`` is accepted but not used by this function; progress is
    reported with ``print``.

    Raises:
        KeyError: if the classifier returns a label with no matching
            ``Category.name``.
    """
    # Index categories by name so a classifier label maps straight to a row.
    categories = dict((c.name, c) for c in Category.objects.all())

    while True:
        tweets = Tweet.objects.filter(category__isnull=True)[:_limit]
        n = len(tweets)
        print('Categorizing %d tweets' % n)

        if not n:
            return

        start = datetime.now()

        for tweet in tweets:
            # A falsy classifier result falls back to 'other' with score 0.
            data = classifier.classify_tweet({
                'text': tweet.text,
                'entities': json.loads(tweet.entities),
            }) or [['other', 0]]

            label, score = data[0][0], data[0][1]
            tweet.category = categories[label]
            tweet.score = score
            tweet.save()

        elapsed = datetime.now() - start
        # timedelta.seconds excludes whole days, so fold days in to report
        # the true elapsed time.
        print("Categorized %d tweets in %d seconds" %
              (n, elapsed.days * 86400 + elapsed.seconds))
Exemplo n.º 2
0
 def on_status(self, status):
     """Categorize and save tweets with coords in a state and community.

     A status is ignored when it has no coordinates, when a Tweet with the
     same ``id_str`` is already stored, or when its point is not contained
     in at least one State and one Community geometry.

     Otherwise the status is classified, saved as a Tweet attached to the
     first matching state and community, and the Aggregate row for its
     date, city, community and category is created or incremented.
     """
     if not status.coordinates:
         return
     # exists() asks the database for a yes/no answer instead of loading
     # the matching rows just to test truthiness.
     if Tweet.objects.filter(id_str=status.id_str).exists():
         return

     coords = Point(
         status.coordinates['coordinates'][0],
         status.coordinates['coordinates'][1]
     )
     states = State.objects.filter(geom__contains=coords)
     communities = Community.objects.filter(geom__contains=coords)

     if not (states and communities):
         return

     # Naive timestamps are stamped as UTC before being stored.
     created_at = status.created_at
     if not created_at.tzinfo:
         created_at = created_at.replace(tzinfo=pytz.utc)

     # Statuses without a 'lang' attribute default to English.
     lang = getattr(status, 'lang', 'en')

     # A falsy classifier result falls back to 'other' with score 0.
     data = classifier.classify_tweet({
         'text': status.text,
         'entities': status.entities
     }) or [['other', 0]]

     tweet = Tweet(
         id_str=status.id_str,
         created_at=created_at,
         lang=lang,
         text=status.text,
         entities=json.dumps(status.entities),
         coords=coords,
         user_id_str=status.user.id_str,
         user_name=status.user.name,
         user_screen_name=status.user.screen_name,
         user_profile_image_url=status.user.profile_image_url,
         state=states[0],
         community=communities[0],
         category_id=self.category_ids[data[0][0]],
         score=data[0][1]
     )
     tweet.save()

     # Update aggregate stats: a tweet counts toward its own category only
     # when its score reaches that category's threshold, else 'other'.
     category_id = self.category_ids['other']
     if tweet.score >= self.category_thresholds[tweet.category_id]:
         category_id = tweet.category_id

     r, created = Aggregate.objects.get_or_create(
         date=tweet.created_at.date(),
         city=tweet.community.city,
         community=tweet.community,
         category_id=category_id,
         defaults={'count': 1})
     if not created:
         # F() makes the increment happen in the database rather than on
         # a value read earlier in Python.
         r.count = F('count') + 1
         r.save()