Пример #1
0
def test_stream():
    """Check that the first 50 tweets streamed for "obama" have an id and text.

    Iterates ``Twarc().stream("obama")`` and stops after the 50th item; the
    final assertion fails if the stream ends before 50 items were seen.
    """
    target = 50
    seen = 0
    client = Twarc()
    for seen, status in enumerate(client.stream("obama"), start=1):
        assert status['id_str']
        assert status['text']
        if seen == target:
            break
    assert seen == target
Пример #2
0
class TwitterStreamKafka(object):
    """Pipe tweets from a twarc stream into ``producer.send_messages``.

    NOTE(review): ``localConfig`` and ``producer`` are module-level names
    defined outside this class -- confirm they are set up before use.
    """

    def __init__(self, search_terms):
        """Create the Twarc client from ``localConfig`` and store the terms.

        search_terms: iterable of strings, comma-joined when streaming.
        """
        logging.info("initializing TwitterStream Kafka")

        # Per-instance state; the four credentials are read from localConfig.
        credentials = (
            localConfig.client_key,
            localConfig.client_secret,
            localConfig.access_token,
            localConfig.access_token_secret,
        )
        self.t = Twarc(*credentials)
        self.search_terms = search_terms

    def captureStream(self):
        """Send each streamed tweet, JSON-encoded, under the name "betweezered"."""
        track = ",".join(self.search_terms)
        for tweet in self.t.stream(track):
            producer.send_messages("betweezered", json.dumps(tweet))
Пример #3
0
class TwitterStreamKafka(object):
    """Relay a filtered twarc stream to ``producer`` as JSON messages.

    NOTE(review): relies on the module-level ``localConfig`` and ``producer``
    objects, neither of which is defined in this class -- confirm.
    """

    def __init__(self, search_terms):
        """Log start-up, build the Twarc client and keep ``search_terms``.

        search_terms: iterable of strings; ``captureStream`` joins them with
            commas to form the stream query.
        """
        logging.info("initializing TwitterStream Kafka")

        # Instance-wide client, authenticated with the localConfig credentials.
        cfg = localConfig
        self.t = Twarc(
            cfg.client_key,
            cfg.client_secret,
            cfg.access_token,
            cfg.access_token_secret,
        )
        self.search_terms = search_terms

    def captureStream(self):
        """Forward every tweet from the stream to "betweezered" as JSON."""
        query = ",".join(self.search_terms)
        stream = self.t.stream(query)
        for status in stream:
            producer.send_messages("betweezered", json.dumps(status))
class TwitterHarvester(BaseHarvester):
    """Harvester that collects tweets through twarc's search and stream calls.

    NOTE(review): ``self.message``, ``self.state_store``,
    ``self.harvest_result``, ``self.harvest_result_lock`` and
    ``self.stop_event`` are not defined in this class; presumably
    BaseHarvester provides them -- confirm.
    """

    def __init__(self, process_interval_secs=1200, mq_config=None, debug=False):
        """Pass all arguments through to BaseHarvester; no twarc client yet."""
        BaseHarvester.__init__(self, mq_config=mq_config, process_interval_secs=process_interval_secs, debug=debug)
        # Built by _create_twarc() from the credentials in the current message.
        self.twarc = None

    def harvest_seeds(self):
        """Create the twarc client and run the harvest named by the message.

        Raises:
            KeyError: if the message "type" is neither "twitter_search" nor
                "twitter_filter".
        """
        # Create a twarc
        self._create_twarc()

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        else:
            # Same exception type as before, now naming the offending value.
            raise KeyError("Unknown harvest type: {}".format(harvest_type))

    def _create_twarc(self):
        """Build ``self.twarc`` from the "credentials" entry of the message."""
        credentials = self.message["credentials"]
        self.twarc = Twarc(credentials["consumer_key"],
                           credentials["consumer_secret"],
                           credentials["access_token"],
                           credentials["access_token_secret"])

    def search(self):
        """Run a twarc search for every seed token in the message.

        When the "incremental" option is set, the previous maximum tweet id
        is read from the state store and passed as ``since_id``, and the new
        maximum is written back after the search.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            query = seed.get("token")
            state_key = "{}.since_id".format(query)
            # Get since_id from state_store
            since_id = self.state_store.get_state(__name__, state_key) if incremental else None

            max_tweet_id = self._process_tweets(self.twarc.search(query, since_id=since_id))
            log.debug("Searching on %s since %s returned %s tweets.", query,
                      since_id, self.harvest_result.summary.get("tweet"))

            # Update state store
            if incremental and max_tweet_id:
                self.state_store.set_state(__name__, state_key, max_tweet_id)

    def filter(self):
        """Stream tweets matching the token of the message's single seed.

        Raises:
            AssertionError: if the message does not contain exactly one seed.
        """
        # NOTE(review): assert is stripped under -O; kept so that callers see
        # the same exception type as before.
        assert len(self.message.get("seeds", [])) == 1, \
            "twitter_filter requires exactly one seed"

        track = self.message["seeds"][0]["token"]

        self._process_tweets(self.twarc.stream(track))

    def _process_tweets(self, tweets):
        """Count tweets and collect their URLs into the harvest result.

        Items without a "text" key are skipped. Iteration stops early once
        ``self.stop_event`` is set.

        Args:
            tweets: iterable of dict-like tweet objects.

        Returns:
            The largest "id" among the processed tweets, or None if no
            processed tweet carried an id.
        """
        max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if count % 100 == 0:
                log.debug("Processed %s tweets", count)
            if self.stop_event.is_set():
                log.debug("Stopping since stop event set.")
                break
            if "text" not in tweet:
                continue
            with self.harvest_result_lock:
                # max(None, int) raises TypeError on Python 3, so compare
                # explicitly; None ids never replace a real maximum.
                tweet_id = tweet.get("id")
                if tweet_id is not None and (max_tweet_id is None or tweet_id > max_tweet_id):
                    max_tweet_id = tweet_id
                self.harvest_result.increment_summary("tweet")
                # Tolerate a tweet that has no "entities" entry at all.
                entities = tweet.get("entities") or {}
                for url in entities.get("urls", []):
                    self.harvest_result.urls.append(url["expanded_url"])
                for media in entities.get("media", []):
                    self.harvest_result.urls.append(media["media_url"])
        return max_tweet_id