def backfill_tweets(self):
    """ Get all tweets from the author of the source tweet since the last
    time this method was called, adding them to the database. """
    name = self.source.user.screen_name
    new_tweets = search(['@' + name, 'from:' + name], self.last_tweet_id)
    if not new_tweets:
        return
    logger.info("Backfilling %s tweets for %s, last id %s"
                % (len(new_tweets), name, self.last_tweet_id))
    new_tweets = [db.merge(tweet_to_Tweet(t)) for t in new_tweets]
    new_ids = [t.id for t in new_tweets]
    stray = [db.merge(t) for t in self.stray_tweets if t.id not in new_ids]
    logger.info("%s other stray tweets" % len(stray))
    # doing the db.merge already added the tweets
    self.tweets.extend(stray)
    self.tweets.extend(new_tweets)
    self.update_last_tweet_id()
    self.stray_tweets = []
    logger.info("%s total tweets in %s" % (len(self.tweets), self))
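
# search() is defined elsewhere in the project. A minimal sketch of what
# backfill_tweets assumes it does, written against the Twitter v1.1 search
# endpoint; the OR-joined query, the count of 100, and the bearer-token
# plumbing are assumptions, not the project's actual implementation (and
# this sketch would shadow the real helper if pasted into the module).
import requests

SEARCH_URL = "https://api.twitter.com/1.1/search/tweets.json"

def search(terms, since_id=0, bearer_token=None):
    """Return raw tweet dicts matching any of `terms`, newer than since_id.

    The endpoint returns results newest-first, which is why callers can
    take the highest id from the front of the list.
    """
    params = {"q": " OR ".join(terms), "count": 100}
    if since_id:
        params["since_id"] = since_id  # only ids strictly greater than this
    resp = requests.get(SEARCH_URL, params=params,
                        headers={"Authorization": "Bearer %s" % bearer_token})
    resp.raise_for_status()
    return resp.json().get("statuses", [])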
def receive_tweet(incidents, search_queue, tweet):
    """ Take a tweet and process it, possibly adding it to an incident or
    creating a new one out of it """
    global last_search_time, search_interval
    logger.info("Received %s" % tweet)
    # disregard retweets
    if tweet.retweet_of is not None:
        return
    tweet = db.merge(tweet)

    offered = False
    first_inactive_inc = None
    for inc in reversed(incidents):
        # stop when we reach the inactive incidents. if a new incident has
        # become inactive since we last added a tweet, refresh the tracked
        # users. this depends on the incidents list being sorted by
        # inactive time
        if not inc.active():
            if inc is not first_inactive_inc:
                first_inactive_inc = inc
            break
        # try to add the tweet to an incident
        if inc.offer_tweet(tweet):
            logger.info("Found %s for %s" % (inc, tweet))
            # TODO: check if it's okay to just discard this tweet because
            # incidents will find the tweet themselves when they backfill
            offered = True
            newlen = len(inc.tweets) + len(inc.stray_tweets)
            update_histo(newlen - 1, newlen)

    # make incidents for any tweets unrelated to current incidents
    if not offered and incident_tweet(tweet):
        inc = Incident(tweet)
        incidents.append(inc)
        update_histo(0, 1)
        search_queue.put(inc)
        logger.info("Created incident for %s" % tweet)

    # every search-interval seconds, backfill the oldest-updated incident
    if time() - last_search_time >= search_interval:
        next_incident = search_queue.get()
        logger.info("Doing backfill on %s" % next_incident)
        oldlen = len(next_incident.tweets) + len(next_incident.stray_tweets)
        next_incident.backfill_tweets()
        db.commit()
        newlen = len(next_incident.tweets) + len(next_incident.stray_tweets)
        update_histo(oldlen, newlen)
        # active is a method, so it must be called; a bare `.active` is
        # always truthy and would requeue finished incidents forever
        if next_incident.active():
            search_queue.put(next_incident)
        last_search_time = time()

    logger.info("%s incidents: %s" % (len(incidents), get_histo()))
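
# update_histo() and get_histo() are defined elsewhere; judging from the call
# sites above, they appear to maintain a histogram of incident sizes (how many
# incidents currently hold a given number of tweets). A minimal sketch under
# that assumption -- the real implementation may differ:
from collections import Counter

_histo = Counter()  # incident size -> count of incidents that size

def update_histo(oldlen, newlen):
    """Move one incident from the `oldlen` size bucket to `newlen`.

    update_histo(0, 1) registers a brand-new one-tweet incident.
    """
    if oldlen > 0:
        _histo[oldlen] -= 1
        if _histo[oldlen] <= 0:
            del _histo[oldlen]  # drop empty buckets to keep the log readable
    _histo[newlen] += 1

def get_histo():
    """Return the current size histogram, smallest bucket first."""
    return dict(sorted(_histo.items()))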
def query_twitter(how_long=0, interval=5):
    """ Interface function: poll Twitter for search_terms every `interval`
    seconds for `how_long` seconds, storing new tweets. """
    reset_location_cache()
    # can send 180 requests per 15 min = one every 5 sec
    start = time()
    # track the newest id we've seen, to make sure we don't create
    # duplicates. keeping track of this ourselves saves many db hits
    last_tweet_id = 0
    # if we don't specify how_long, go indefinitely
    while how_long == 0 or time() - start < how_long:
        tweets = search(search_terms, last_tweet_id)
        if not tweets:
            # if we don't get anything back, sleep and try again
            sleep(interval)
            continue
        # if a retrieved tweet has a loc/user with a matching ID already in
        # the db, that loc/user is updated instead of a new one added,
        # because of merge
        try:
            db.add_all([db.merge(tweet_to_Tweet(t)) for t in tweets])
            db.commit()
            # results come back newest-first, so index 0 holds the highest id
            last_tweet_id = tweets[0]['id_str']
        except OperationalError:
            pass
        sleep(interval)
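
# The dedup behavior the comment above relies on is standard SQLAlchemy:
# Session.merge() reconciles an object against the row with the same primary
# key instead of inserting a duplicate. A self-contained illustration; the
# User model, in-memory engine, and _merge_demo name are assumptions for the
# demo, not the project's actual schema.
def _merge_demo():
    from sqlalchemy import create_engine, Column, Integer, String
    from sqlalchemy.orm import declarative_base, Session

    Base = declarative_base()

    class User(Base):
        __tablename__ = "users"
        id = Column(Integer, primary_key=True)
        screen_name = Column(String)

    engine = create_engine("sqlite://")
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        session.add(User(id=1, screen_name="old_name"))
        session.commit()

        # id=1 already exists, so merge() updates that row in place
        # instead of inserting a duplicate
        session.merge(User(id=1, screen_name="new_name"))
        session.commit()

        assert session.get(User, 1).screen_name == "new_name"

# merge() is also why backfill_tweets can re-merge stray tweets safely:
# merging the same id twice is an update, never a duplicate insert.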