def stream_city(cf, city, keywords=None):
    """Stream tweets posted from one of four Australian metro areas.

    Opens a Twitter streaming connection restricted to the bounding box of
    *city* and hands every received tweet to ``send_to_db``.

    :param cf: config dict with an ``'account'`` mapping of Twarc credentials
        and a ``'search_words'`` list (used as the default for *keywords*).
    :param city: one of ``"great_syd"``, ``"great_mel"``, ``"great_brisbane"``,
        ``"great_ald"`` — a key of the bounding-box table below.
    :param keywords: optional keyword list; defaults to ``cf["search_words"]``.
        NOTE(review): currently unused — the stream is location-only, per the
        linked Stack Overflow discussion. Kept for interface compatibility.
    """
    # Bounding boxes as [west_lon, south_lat, east_lon, north_lat].
    bbox = {
        "great_syd": [149.971885992, -34.33117400499998, 151.63054702400007, -32.99606922499993],
        "great_mel": [144.33363404800002, -38.50298801599996, 145.8784120140001, -37.17509899299995],
        "great_brisbane": [152.07339276400012, -28.363962911999977, 153.54670756200005, -26.452339004999942],
        "great_ald": [138.435645001, -35.350296029999974, 139.04403010400003, -34.50022530299998],
    }
    if keywords is None:  # FIX: identity comparison instead of `== None`
        keywords = cf["search_words"]
    t = Twarc(**cf['account'])
    # no keyword restriction but from a specific city
    # reason see this https://stackoverflow.com/questions/22889122/how-to-add-a-location-filter-to-tweepy-module
    # FIX: exist_ok=True avoids the isdir()/makedirs() race of the original.
    os.makedirs(city, exist_ok=True)
    # Daily output file path; currently unused until send_to_db persists data.
    path = city + "/" + str(datetime.date.today()) + ".jsonl"
    locations = ",".join(str(i) for i in bbox[city])
    for tweet in t.filter(locations=locations):
        print("get one tweet")
        # TODO
        send_to_db(tweet)
def main(get_method=None, input_hashtags=None, storage_location=None):
    """Collect tweets matching *input_hashtags* and store one JSON file each.

    :param get_method: ``"populate"`` (REST search) or ``"track"`` (streaming
        filter); anything else prints a message and does nothing.
    :param input_hashtags: comma-separated hashtag list; commas become
        ``+OR+`` in the query.
    :param storage_location: directory that receives ``tweet<id>.json`` files.

    Relies on module-level ``consumer_key``/``consumer_secret``/
    ``access_token``/``access_token_secret`` and ``language`` — defined
    elsewhere in this file. Always exits the process via ``sys.exit(0)``.
    """
    if not os.path.exists(storage_location):
        os.makedirs(storage_location, exist_ok=True)
    hashtag_query = input_hashtags.strip().replace(",", "+OR+")
    try:
        tweets = 0
        t = Twarc(
            consumer_key,
            consumer_secret,
            access_token,
            access_token_secret,
            tweet_mode="extended",
        )
        print(
            "Started storing tweets related to " + input_hashtags + " at "
            + storage_location + " since " + str(datetime.datetime.now())
        )
        if get_method == "populate":
            for tweet in t.search(hashtag_query, lang=language):
                # FIX: the original passed a single pre-concatenated string to
                # os.path.join (a no-op), silently dropping the directory
                # separator; join directory and file name as components.
                with open(
                    os.path.join(storage_location,
                                 "tweet" + str(tweet["id"]) + ".json"),
                    "w",
                    encoding="utf8",
                ) as file:
                    json.dump(tweet, file)
                tweets += 1
        elif get_method == "track":
            for tweet in t.filter(hashtag_query):
                # Consistent with the "populate" branch (original hard-coded "/").
                with open(
                    os.path.join(storage_location,
                                 "tweet" + str(tweet["id"]) + ".json"),
                    "w",
                    encoding="utf8",
                ) as file:
                    json.dump(tweet, file)
                tweets += 1
        else:
            print("No method defined, exiting...")
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop the "track" stream.
        print("Shutdown requested...successfully stored " + str(tweets) + " tweets")
    except BaseException:
        traceback.print_exc(file=sys.stdout)
    sys.exit(0)
class TwitterHarvester(BaseHarvester):
    """Harvests Twitter data with twarc, driven by a harvest message.

    NOTE(review): appears to be a Social Feed Manager-style harvester;
    ``BaseHarvester``, ``log``, ``Msg``, ``status_re``, the WARC iterators
    and the ``CODE_*`` constants are defined elsewhere in this file/project.
    """

    def __init__(self, working_path, stream_restart_interval_secs=30 * 60,
                 mq_config=None, debug=False, connection_errors=5,
                 http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(
            self,
            working_path,
            mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug,
            debug_warcprox=debug_warcprox,
            tries=tries)
        self.twarc = None  # created lazily per-harvest in _create_twarc()
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        # Extract options; refreshed from the harvest message in harvest_seeds().
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        """Read extract options from the message and dispatch on harvest type."""
        # Create a twarc
        self._create_twarc()
        # Get harvest extract options.
        self.extract_media = self.message.get("options", {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get(
            "web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get(
            "user_images", False)
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        """Build the twarc client from credentials carried in the message."""
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        """Harvest a REST search for the single seed's query token."""
        assert len(self.message.get("seeds", [])) == 1
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        # Incremental harvests resume from the last recorded tweet id.
        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None
        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        """Harvest the streaming filter API for the single seed's token dict."""
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        self._harvest_tweets(
            self.twarc.filter(track=track, follow=follow, locations=locations,
                              event=self.stop_harvest_seeds_event))

    def sample(self):
        """Harvest the streaming sample API until the stop event fires."""
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        """Harvest each seed user's timeline, reconciling screen name / user id.

        Records warnings for missing/suspended accounts and token updates when
        a screen name has changed.
        """
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug(
                "Processing seed (%s) with screen name %s and user id %s",
                seed_id, screen_name, user_id)
            assert screen_name or user_id
            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {} because account is not found or suspended".format(
                        screen_name)
                    log.exception(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # if can't find the screen_name, ignore get timeline
                if not new_screen_name:
                    msg = "Screen name not found for user id {} because account is not found or suspended".format(
                        user_id)
                    log.exception(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
                    # reset the user_id, ignore the get timeline
                    user_id = None
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name
            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(
                        __name__, "timeline.{}.since_id".format(
                            user_id)) if incremental else None
                    self._harvest_tweets(
                        self.twarc.timeline(user_id=user_id,
                                            since_id=since_id))
                except HTTPError as e:
                    if e.response.status_code == 401:
                        # 401 on a timeline means the account is protected or
                        # suspended — warn and continue with remaining seeds.
                        account = "user {} (User ID: {})".format(
                            screen_name, user_id
                        ) if screen_name else "user ID: {}".format(user_id)
                        msg = "Unauthorized for {} because account is suspended or protected".format(
                            account)
                        log.exception(msg)
                        self.result.warnings.append(
                            Msg(CODE_TOKEN_UNAUTHORIZED, msg, seed_id=seed_id))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """ Lookup a screen name given a user id.

        Returns None if the account is gone (404) or simply not returned.
        """
        try:
            users = list(self.twarc.user_lookup(user_ids=(user_id, )))
            assert len(users) in (0, 1)
            if users:
                return users[0]["screen_name"]
        except HTTPError as e:
            # 404 means the user no longer exists; anything else is an error.
            if e.response.status_code != 404:
                raise e
        return None

    def _lookup_user_id(self, screen_name):
        """ Lookup a user id given a screen name.

        Returns None if the account is gone (404) or simply not returned.
        """
        try:
            users = list(self.twarc.user_lookup(screen_names=(screen_name, )))
            assert len(users) in (0, 1)
            if users:
                return users[0]["id_str"]
        except HTTPError as e:
            # 404 means the user no longer exists; anything else is an error.
            if e.response.status_code != 404:
                raise e
        return None

    def _harvest_tweets(self, tweets):
        """Drain a tweet iterator, counting tweets and honoring the stop event.

        Tweets are recorded by warcprox as a side effect of twarc's HTTP
        traffic — presumably; this method itself only counts. TODO confirm.
        """
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        """Collect URLs from a tweet's entities, per the extract options."""
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(
                        url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        """Post-process a harvested WARC file, dispatching on harvest type."""
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        """Process a search WARC and advance the incremental since_id state."""
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None
        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))
        # Update state store
        # NOTE(review): max_tweet_id/since_id may be None here; comparison
        # relies on Python 2 None ordering — verify under Python 3.
        if incremental and max_tweet_id > since_id:
            self.state_store.set_state(__name__, u"{}.since_id".format(query),
                                       max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process a timeline WARC, tracking per-user since_id state."""
        incremental = self.message.get("options", {}).get("incremental", False)
        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            # "text" distinguishes real tweets from other JSON records.
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key),
                            tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Process every tweet in a WARC iterator; return the highest tweet id."""
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        """Count one tweet and extract entity/profile URLs per the options."""
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(
                tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])
class TwitterHarvester(BaseHarvester):
    """Harvests Twitter data with twarc (extended tweet mode), message-driven.

    Newer variant: supports geocoded search, filter language, and classifies
    account-lookup failures (not found / suspended / protected).
    NOTE(review): ``BaseHarvester``, ``log``, ``Msg``, ``requests``, ``json``
    and the WARC iterators are defined elsewhere in this file/project.
    """

    def __init__(self, working_path, stream_restart_interval_secs=30 * 60,
                 mq_config=None, debug=False, connection_errors=5,
                 http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(
            self,
            working_path,
            mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug,
            debug_warcprox=debug_warcprox,
            tries=tries)
        self.twarc = None  # created lazily per-harvest in _create_twarc()
        self.connection_errors = connection_errors
        self.http_errors = http_errors

    def harvest_seeds(self):
        """Dispatch the harvest based on the message's type field."""
        # Create a twarc
        self._create_twarc()
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        """Build the twarc client (extended tweet mode) from message credentials."""
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors,
                           tweet_mode="extended")

    def search(self):
        """Harvest a REST search for the single seed (query and/or geocode)."""
        assert len(self.message.get("seeds", [])) == 1
        incremental = self.message.get("options", {}).get("incremental", False)
        # Incremental harvests resume from the last recorded tweet id.
        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(
                self._search_id())) if incremental else None
        query, geocode = self._search_parameters()
        self._harvest_tweets(
            self.twarc.search(query, geocode=geocode, since_id=since_id))

    def _search_parameters(self):
        """Return (query, geocode) from the seed token (dict or plain string)."""
        if type(self.message["seeds"][0]["token"]) is dict:
            query = self.message["seeds"][0]["token"].get("query")
            geocode = self.message["seeds"][0]["token"].get("geocode")
        else:
            # Legacy form: the token is the query string itself.
            query = self.message["seeds"][0]["token"]
            geocode = None
        return query, geocode

    def _search_id(self):
        """Return a stable state-store key derived from query and/or geocode."""
        query, geocode = self._search_parameters()
        if query and not geocode:
            return query
        if geocode and not query:
            return geocode
        return ":".join([query, geocode])

    def filter(self):
        """Harvest the streaming filter API for the single seed's token dict."""
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        language = self.message["seeds"][0]["token"].get("language")
        self._harvest_tweets(
            self.twarc.filter(track=track, follow=follow, locations=locations,
                              lang=language,
                              event=self.stop_harvest_seeds_event))

    def sample(self):
        """Harvest the streaming sample API until the stop event fires."""
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        """Harvest each seed user's timeline, reconciling screen name / user id.

        Lookup failures are classified by _lookup_user and reported as
        warnings; screen-name changes are reported as token updates.
        """
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug(
                "Processing seed (%s) with screen name %s and user id %s",
                seed_id, screen_name, user_id)
            assert screen_name or user_id
            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                result, user = self._lookup_user(screen_name, "screen_name")
                if result == "OK":
                    user_id = user["id_str"]
                    self.result.uids[seed_id] = user_id
                else:
                    msg = u"User id not found for {} because account is {}".format(
                        screen_name, self._result_to_reason(result))
                    log.exception(msg)
                    self.result.warnings.append(
                        Msg("token_{}".format(result), msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                result, user = self._lookup_user(user_id, "user_id")
                if result == "OK":
                    new_screen_name = user["screen_name"]
                    if new_screen_name and new_screen_name != screen_name:
                        self.result.token_updates[seed_id] = new_screen_name
                else:
                    msg = u"User {} (User ID: {}) not found because account is {}".format(
                        screen_name, user_id, self._result_to_reason(result))
                    log.exception(msg)
                    self.result.warnings.append(
                        Msg("uid_{}".format(result), msg, seed_id=seed_id))
                    # Skip the timeline fetch for this seed.
                    user_id = None
            if user_id:
                # Get since_id from state_store
                since_id = self.state_store.get_state(
                    __name__, "timeline.{}.since_id".format(
                        user_id)) if incremental else None
                self._harvest_tweets(
                    self.twarc.timeline(user_id=user_id, since_id=since_id))

    def _lookup_user(self, id, id_type):
        """Look up a user via users/show; classify the outcome.

        :param id: screen name or user id value.
        :param id_type: request parameter name, "screen_name" or "user_id".
        :returns: (result, user) where result is "OK", "not_found",
            "suspended" or "unauthorized" (protected), and user is the parsed
            JSON user object or None.
        """
        url = "https://api.twitter.com/1.1/users/show.json"
        params = {id_type: id}
        # USER_DELETED: 404 and {"errors": [{"code": 50, "message": "User not found."}]}
        # USER_PROTECTED: 200 and user object with "protected": true
        # USER_SUSPENDED: 403 and {"errors":[{"code":63,"message":"User has been suspended."}]}
        result = "OK"
        user = None
        try:
            resp = self.twarc.get(url, params=params, allow_404=True)
            user = resp.json()
            if user['protected']:
                result = "unauthorized"
        except requests.exceptions.HTTPError as e:
            try:
                resp_json = e.response.json()
            except json.decoder.JSONDecodeError:
                # Not a JSON error body — re-raise the original HTTP error.
                raise e
            if e.response.status_code == 404 and self._has_error_code(
                    resp_json, 50):
                result = "not_found"
            elif e.response.status_code == 403 and self._has_error_code(
                    resp_json, 63):
                result = "suspended"
            else:
                raise e
        return result, user

    @staticmethod
    def _has_error_code(resp, code):
        """Return True if the Twitter error payload contains any of *code*."""
        if isinstance(code, int):
            code = (code, )
        for error in resp['errors']:
            if error['code'] in code:
                return True
        return False

    @staticmethod
    def _result_to_reason(result):
        """Map a _lookup_user result code to human-readable warning text."""
        if result == "unauthorized":
            return "protected"
        elif result == "suspended":
            return "suspended"
        return "not found or deleted"

    def _harvest_tweets(self, tweets):
        """Drain a tweet iterator, counting tweets and honoring the stop event."""
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def process_warc(self, warc_filepath):
        """Post-process a harvested WARC file, dispatching on harvest type."""
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        """Process a search WARC and advance the incremental since_id state."""
        incremental = self.message.get("options", {}).get("incremental", False)
        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(
                self._search_id())) if incremental else None
        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))
        # Update state store
        # `or 0` guards the None case (empty WARC / no prior state).
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(
                __name__, u"{}.since_id".format(self._search_id()),
                max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process a timeline WARC, tracking per-user since_id state."""
        incremental = self.message.get("options", {}).get("incremental", False)
        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            # "full_text" covers extended tweet mode.
            if "text" in tweet or "full_text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(
                            self.state_store.get_state(__name__, key) or 0,
                            tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Process every tweet in a WARC iterator; return the highest tweet id."""
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet or "full_text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, _):
        """Count one tweet; this variant extracts nothing from the tweet body."""
        self.result.increment_stats("tweets")
# Standing stream collector: prints geo-tagged tweets from a New England
# bounding box until the collection end date, writing hourly JSON files.
import json
import datetime
from twarc import Twarc

# Collection end
period_end = datetime.datetime(2017, 12, 31, 23, 59, 59, 999999)

# Twitter API keys - geobgu2
# NOTE(security): real credentials are committed here; they should be revoked
# and moved out of source control.
t = Twarc('JA5KZiEuU8HDIFDtLXwkHCpdx',
          'NdGoBYXuYHbHOAInNHHumjz0xeCp8zEYfbm0RW0dzpvcRY8Ovc',
          '2782755278-ARD36i5dPBU6fxRdgvomZoxuCOI3ewVVGPizZCf',
          'ceN8O8yIVV2C7o6CJyLYYo3CNIm48Tnojpxj69pqqv36u')

# Twitter stream request
# NOTE(review): the leading "\-" looks like a stray escape; the API expects a
# plain "-72.21437,..." bounding box — confirm upstream before changing.
t = t.filter(locations="\-72.21437,41.19034,-69.64939,43.30924")

# Collect tweets
while datetime.datetime.now(
) < period_end:  # Loop until collection period ends
    day_start = datetime.datetime.now()
    # day_end marks the top of the current hour (unused in the visible chunk).
    day_end = datetime.datetime(day_start.year, day_start.month, day_start.day,
                                day_start.hour, 59, 59, 999999)
    # NOTE(review): ":" in the file name is invalid on Windows — confirm the
    # target platform.
    fh = open(
        "boston_geobgu2_" + day_start.strftime("%Y-%m-%d_%H:%M:%S") + ".json",
        "w")
    #fh.write("[")
    for tweet in t:  # Loop until hour ends
        x = tweet
        try:
            # Only geo-tagged tweets are of interest.
            if x["geo"] != None:
                print(x["text"])
            # NOTE(review): this snippet is truncated here — the except
            # handler and the file-write logic are not visible in this chunk.
# NOTE(review): fragment starts mid-script — `existing_files`, `config`,
# `log`, `save`, and `file_count` initialization live above this chunk.
# Resume file numbering from the newest existing output file, if any.
if existing_files:
    last_file = sorted(existing_files)[-1]
    last_file_number = re.search(r'%s(\d+)\.json' % config.file_prefix,
                                 last_file)
    if last_file_number:
        file_count = int(last_file_number.groups()[0]) + 1

tweet_count = 0
queue = []  # buffer of tweets awaiting the next file flush
twarc = Twarc(config.consumer_key, config.consumer_secret,
              config.access_token, config.access_token_secret)
start_time = time.time()
log("Running... Press Ctrl-C to stop.")
# Outer loop restarts the stream after the except below (handler truncated).
while True:
    try:
        for tweet in twarc.filter(track=",".join(config.word_filter),
                                  locations=",".join(config.location_filter),
                                  lang=config.language_filter):
            queue.append(tweet)
            tweet_count += 1
            if config.print_tweets:
                # Prefer the extended (full) text when present.
                log("Tweet: " + tweet.get("extended_tweet", {}).get(
                    "full_text", tweet.get("text", "")))
            # Print progress every 1000 tweets
            if tweet_count % 1000 == 0:
                log("%d tweets collected" % tweet_count)
            # Flush the buffer to disk once it reaches the configured size.
            if len(queue) >= config.tweets_per_file:
                save(queue, file_number=file_count)
                file_count += 1
                queue = []
    except Exception as e:
        # NOTE(review): snippet is truncated here — handler body not visible.
from datetime import datetime, timedelta

# Twitter API credentials (fill in before running).
consumer_key = ""
consumer_secret = ""
access_token = ""
access_token_secret = ""

# User ids already examined. FIX: a set gives O(1) membership tests; the
# original list scanned linearly and degraded as the stream ran.
already_checked = set()
pbar = tqdm.tqdm(desc='tweet analysed', unit='tweets')


def isSuspicious(user):
    """Print a warning for accounts created within the last 90 days.

    Twitter's created_at carries a fixed '+0000' (UTC) offset that strptime
    matches literally, yielding a naive UTC datetime. FIX: compare against
    utcnow() — the original compared against local now(), skewing the 90-day
    window by the machine's UTC offset.
    """
    created = datetime.strptime(user['created_at'],
                                '%a %b %d %H:%M:%S +0000 %Y')
    now = datetime.utcnow() - timedelta(days=90)
    if created > now:
        tqdm.tqdm.write("[!] Suspicious: "
                        "@{} ({}), created : {}".format(
                            user["screen_name"], user["id"],
                            created.strftime("%d/%m/%Y")))


if __name__ == "__main__":
    t = Twarc(consumer_key, consumer_secret, access_token,
              access_token_secret)
    # Stream tweets mentioning "Strasbourg" and vet each author once.
    for tweet in t.filter(track="Strasbourg"):
        user = tweet["user"]
        if user["id"] not in already_checked:
            isSuspicious(user)
            already_checked.add(user["id"])
            pbar.update(1)
# **Uniqueness Constraints:** session = driver.session() # Add uniqueness constraints. session.run( "CREATE CONSTRAINT ON (t:Tweet) ASSERT t.id IS UNIQUE;") session.run( "CREATE CONSTRAINT ON (u:User) ASSERT u.screen_name IS UNIQUE;") session.run( "CREATE CONSTRAINT ON (h:Hashtag) ASSERT h.name IS UNIQUE;") session.run( "CREATE CONSTRAINT ON (l:Link) ASSERT l.url IS UNIQUE;") session.run( "CREATE CONSTRAINT ON (s:Source) ASSERT s.name IS UNIQUE;") # **Twitter Follow List:** list = '16796735,820954235357593602' for tweets in t.filter(follow=list): try: query = ''' UNWIND {tweets} AS t WITH t, t.entities AS e, t.user AS u, t.retweeted_status AS retweet WHERE t.id is not null MERGE (tweet:Tweet {id:t.id_str}) SET tweet.text = t.text, tweet.created = t.created_at, tweet.favorites = t.favorite_count MERGE (user:User {screen_name:u.screen_name}) SET user.name = u.name,
class TwitterHarvester(BaseHarvester):
    """Harvests Twitter data with twarc, driven by a harvest message.

    Oldest variant: no stop-event wiring on filter/sample and no HTTP-error
    handling in the user lookups. NOTE(review): ``BaseHarvester``, ``log``,
    ``Msg``, ``status_re``, the WARC iterators and the ``CODE_*`` constants
    are defined elsewhere in this file/project.
    """

    def __init__(self, working_path, stream_restart_interval_secs=30 * 60,
                 mq_config=None, debug=False, connection_errors=5,
                 http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox,
                               tries=tries)
        self.twarc = None  # created lazily per-harvest in _create_twarc()
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        # Extract options; refreshed from the harvest message in harvest_seeds().
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        """Read extract options from the message and dispatch on harvest type."""
        # Create a twarc
        self._create_twarc()
        # Get harvest extract options.
        self.extract_media = self.message.get("options", {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get("web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get("user_images", False)
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        """Build the twarc client from credentials carried in the message."""
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        """Harvest a REST search for the single seed's query token."""
        assert len(self.message.get("seeds", [])) == 1
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        # Incremental harvests resume from the last recorded tweet id.
        since_id = self.state_store.get_state(__name__,
                                              u"{}.since_id".format(query)) if incremental else None
        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        """Harvest the streaming filter API for the single seed's token dict."""
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        self._harvest_tweets(self.twarc.filter(track=track, follow=follow,
                                               locations=locations))

    def sample(self):
        """Harvest the streaming sample API."""
        self._harvest_tweets(self.twarc.sample())

    def user_timeline(self):
        """Harvest each seed user's timeline, reconciling screen name / user id.

        Records warnings for missing accounts and token updates when a screen
        name has changed.
        """
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s",
                      seed_id, screen_name, user_id)
            assert screen_name or user_id
            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {}".format(screen_name)
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # NOTE(review): no None-guard here — a failed lookup would
                # record a None token update; later variants add the guard.
                if new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name
            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(__name__,
                                                          "timeline.{}.since_id".format(
                                                              user_id)) if incremental else None
                    self._harvest_tweets(self.twarc.timeline(user_id=user_id,
                                                             since_id=since_id))
                except HTTPError as e:
                    if e.response.status_code == 401:
                        # 401 on a timeline: account suspended or private —
                        # warn and continue with remaining seeds.
                        msg = "Unauthorized for user {} (User ID: {}) because account is suspended or private".format(screen_name, user_id)
                        log.exception(msg)
                        self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """ Lookup a screen name given a user id. """
        users = list(self.twarc.user_lookup(user_ids=(user_id,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["screen_name"]
        return None

    def _lookup_user_id(self, screen_name):
        """ Lookup a user id given a screen name. """
        users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["id_str"]
        return None

    def _harvest_tweets(self, tweets):
        """Drain a tweet iterator, counting tweets and honoring the stop event."""
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        """Collect URLs from a tweet's entities, per the extract options."""
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        """Post-process a harvested WARC file, dispatching on harvest type."""
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        """Process a search WARC and advance the incremental since_id state."""
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(__name__,
                                              u"{}.since_id".format(query)) if incremental else None
        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))
        # Update state store
        # NOTE(review): max_tweet_id/since_id may be None — comparison relies
        # on Python 2 None ordering; verify under Python 3.
        if incremental and max_tweet_id > since_id:
            self.state_store.set_state(__name__, u"{}.since_id".format(query),
                                       max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process a timeline WARC, tracking per-user since_id state."""
        incremental = self.message.get("options", {}).get("incremental", False)
        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            # "text" distinguishes real tweets from other JSON records.
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(__name__, key,
                                               max(self.state_store.get_state(__name__, key),
                                                   tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Process every tweet in a WARC iterator; return the highest tweet id."""
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        """Count one tweet and extract entity/profile URLs per the options."""
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])
# Standing stream collector (variant of the snippet above): dumps geo-tagged
# tweets from a New England bounding box to hourly JSON files.
# NOTE(review): json.dumps is used below but `json` is not imported in this
# visible chunk — confirm an import exists upstream.
import datetime
from twarc import Twarc

# Collection end
period_end = datetime.datetime(2017, 12, 31, 23, 59, 59, 999999)

# Twitter API keys - geobgu2
# NOTE(security): real credentials are committed here; revoke and externalize.
t = Twarc(
    'JA5KZiEuU8HDIFDtLXwkHCpdx',
    'NdGoBYXuYHbHOAInNHHumjz0xeCp8zEYfbm0RW0dzpvcRY8Ovc',
    '2782755278-ARD36i5dPBU6fxRdgvomZoxuCOI3ewVVGPizZCf',
    'ceN8O8yIVV2C7o6CJyLYYo3CNIm48Tnojpxj69pqqv36u'
)

# Twitter stream request
# NOTE(review): the leading "\-" looks like a stray escape — confirm upstream.
t = t.filter(locations = "\-72.21437,41.19034,-69.64939,43.30924")

# Collect tweets
while datetime.datetime.now() < period_end:  # Loop until collection period ends
    day_start = datetime.datetime.now()
    # day_end marks the top of the current hour (unused in the visible chunk).
    day_end = datetime.datetime(day_start.year, day_start.month, day_start.day,
                                day_start.hour, 59, 59, 999999)
    fh = open("boston_geobgu2_" + day_start.strftime("%Y-%m-%d_%H:%M:%S") + ".json", "w")
    #fh.write("[")
    for tweet in t:  # Loop until hour ends
        x = tweet
        try:
            # Only geo-tagged tweets are written.
            if x["geo"] != None:
                print(x["text"])
                fh.write(json.dumps(x))
                fh.write("\n")
        except:
            # NOTE(review): snippet is truncated here — the bare-except body
            # is not visible in this chunk.
# Dump tweets matching terms from tweet_terms.txt into data_dump.json.
# FIX: converted the remaining Python 2 print statements to print() calls so
# the whole script runs under Python 3, and added the missing json/Twarc
# imports used below.
import json
import sys

from twarc import Twarc

print(" # Loading keys")
consumer_key = 'INSERT YOUR CONSUMER KEY HERE'
consumer_secret = 'INSERT YOUR CONSUMER SECRET HERE'
access_token = 'INSERT YOUR TOKEN HERE'
access_token_secret = 'INSERT YOUR TOKEN SECRET HERE'
twarc_auth = Twarc(consumer_key, consumer_secret, access_token,
                   access_token_secret)

print(" # Reading search terms")
# One search term per line; blank padding stripped.
with open('tweet_terms.txt', 'r') as tweet_terms_file_content:
    my_tweet_terms = [line.strip() for line in tweet_terms_file_content]
print(" # Search terms loaded")

if len(my_tweet_terms) > 0:
    twitter_query = ",".join(my_tweet_terms)
    print(" # Search terms: " + twitter_query)
    for tweet in twarc_auth.filter(track=twitter_query):
        with open('data_dump.json', 'a') as json_output_file:
            json.dump(tweet, json_output_file, indent=4, sort_keys=True)
else:
    print("No search terms provided, printing generic stream")
    # FIX: the original called twarc.sample() on the module name, which is not
    # an authenticated client; use the twarc_auth instance created above.
    for tweet in twarc_auth.sample():
        print(tweet)

print(" # Authentication successful, dumping results")
# NOTE: the explicit close() calls from the original were dropped — both files
# are already closed by their `with` blocks.
import os.path

from twarc import Twarc
from File_manager import File_manager

# usage: takes in a file called "tags" with one name or hashtag to search by
# per line, and streams matching tweets to a download file.

# NOTE(security): real API credentials are hard-coded here; they should be
# revoked and moved to configuration.
t = Twarc('4lJGm5YUrXgtwfMUmlo9L4KgH',
          'YIYeIiZGCJpolIASa56eqLJsEa54vGKFt07CkTai3SWKoCPx3w',
          '276679861-dgKpojdDSWRutxEG7NH2A2ZgD7xHtzcuwOisMo1T',
          '4ckJoGnRm5defgDegfV4opPETmarGuNQr9U6pAVEwR9sT')

out_file = File_manager.open_for_download("track")
# FIX: context-manage the input file; the original left it to a trailing
# close() with no protection against exceptions.
with open("tags", "r", 1) as in_file:
    for tag in in_file:
        # FIX: file iteration yields lines with their trailing newline —
        # strip it so the track term sent to the API is clean. Also converted
        # the Python 2 `print tag` statement to a function call.
        tag = tag.strip()
        if not tag:
            continue  # skip blank lines
        print(tag)
        # NOTE(review): t.filter() streams indefinitely, so only the first tag
        # is ever processed — consider one filter(track=",".join(all_tags)).
        for tweet in t.filter(track=tag):
            out_file.write(str(tweet) + "\n")
out_file.close()