def auth_ua(creds_path):
    # App auth can supposedly fetch 4x as many replies; see:
    # https://github.com/DocNow/twarc/issues/323
    if creds_path == "":
        t_user = Twarc(app_auth=False)
        t_app = Twarc(app_auth=True)
    else:
        creds = tutils.retrieve_creds(creds_path)
        t_user = Twarc(creds[0], creds[1], creds[2], creds[3], app_auth=False)
        t_app = Twarc(creds[0], creds[1], creds[2], creds[3], app_auth=True)
    return t_user, t_app
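# Minimal usage sketch for auth_ua (assumptions: a ~/.twarc config exists for
# the empty-path branch, or tutils.retrieve_creds returns the four keys in
# Twarc's positional order; the path "creds.txt" is hypothetical).
t_user, t_app = auth_ua("creds.txt")
for tweet in t_app.search("twarc"):  # app auth for the higher search quota
    print(tweet["id_str"])
    break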
def __init__(self, creds=[], neo4j_creds=None, TWEETS_PER_PROCESS=100,
             TWEETS_PER_ROWGROUP=5000, save_to_neo=False,
             PARQUET_SAMPLE_RATE_TIME_S=None, debug=False, BATCH_LEN=100,
             writers={'snappy': None}):
    self.queue = deque()
    self.writers = writers
    self.last_write_epoch = ''
    self.current_table = None
    self.schema = pa.schema([(name, t) for (i, name, t) in KNOWN_FIELDS])
    self.timer = Timer()
    self.debug = debug
    self.twarc_pool = TwarcPool([
        Twarc(o['consumer_key'], o['consumer_secret'],
              o['access_token'], o['access_token_secret'])
        for o in creds
    ])
    self.save_to_neo = save_to_neo
    self.TWEETS_PER_PROCESS = TWEETS_PER_PROCESS
    # ~1 KB per tweet, so the 5000-tweet default is roughly 5 MB of
    # uncompressed parquet per row group
    self.TWEETS_PER_ROWGROUP = TWEETS_PER_ROWGROUP
    self.PARQUET_SAMPLE_RATE_TIME_S = PARQUET_SAMPLE_RATE_TIME_S
    self.last_df = None
    self.last_arr = None
    self.last_write_arr = None
    self.last_writes_arr = []
    self.neo4j_creds = neo4j_creds
    self.BATCH_LEN = BATCH_LEN
    self.needs_to_flush = False
    self.__file_names = []
def add_users_by_screen_names(self, screen_names):
    if 'keys' not in self:
        raise CollectionConfigException(
            'Keys are required to add users by screen name.')
    keys = self['keys']
    twarc = Twarc(keys['consumer_key'], keys['consumer_secret'],
                  keys['access_token'], keys['access_token_secret'])
    # Map lower-cased screen names back to their original case.
    screen_name_case_map = {}
    for screen_name in screen_names:
        clean_screen_name = screen_name.lstrip('@')
        if clean_screen_name:
            screen_name_case_map[clean_screen_name.lower()] = clean_screen_name
    if 'users' not in self:
        self['users'] = {}
    delete_users = []
    for user in twarc.user_lookup(screen_name_case_map.keys(),
                                  id_type='screen_name'):
        if user['id_str'] not in self['users']:
            self['users'][user['id_str']] = {
                'screen_name': user['screen_name']
            }
        delete_users.append(user['screen_name'].lower())
    for screen_name in delete_users:
        del screen_name_case_map[screen_name]
    # Whatever remains could not be looked up.
    return screen_name_case_map.values()
def dehydrate(self, tweet_ids: List[str]):
    # Note: despite its name, this hydrates tweet IDs back into full tweets
    # and yields the ones that match the configured sampling filters.
    t = Twarc(self.configuration["twitter"]["consumer_key"],
              self.configuration["twitter"]["consumer_secret"],
              self.configuration["twitter"]["access_token"],
              self.configuration["twitter"]["access_token_secret"],
              tweet_mode="extended")
    count: int = 0
    print("Reading tweets from Twitter")
    with tqdm(total=self.configuration["sampling"]["size"],
              unit="tweet") as written_progress_bar:
        with tqdm(total=len(tweet_ids), unit="tweet") as hydrate_progress_bar:
            for tweet in t.hydrate(tweet_ids):
                hydrate_progress_bar.update(1)
                if any(keyword in tweet["full_text"].lower()
                       for keyword in self.configuration["sampling"]["keywords"]):
                    append: bool = True
                    if "only_media" in self.configuration["sampling"]:
                        if self.configuration["sampling"]["only_media"]:
                            if not self.contains_media(tweet):
                                append = False
                    if len(self.configuration["sampling"]["languages"]) > 0:
                        if tweet["lang"] not in \
                                self.configuration["sampling"]["languages"]:
                            append = False
                    if append:
                        written_progress_bar.update(1)
                        count += 1
                        yield tweet
                        if count == self.configuration["sampling"]["size"]:
                            return
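# Usage sketch for the generator above (assumptions, not from the source:
# `Sampler` is the enclosing class and takes the parsed configuration dict
# `config`; "ids.txt" holds one tweet ID per line).
sampler = Sampler(config)
with open("ids.txt") as f:
    ids = [line.strip() for line in f]
for tweet in sampler.dehydrate(ids):
    print(tweet["id_str"], tweet["full_text"][:80])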
def __init__(self, consumer_key, consumer_secret, access_token_key="",
             access_token_secret=""):
    """
    This method authenticates and creates a twitterapi object. If the
    system is unable to authenticate, an ApplicationError is raised.
    """
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    try:
        auth.get_authorization_url()
    except TweepError as e:
        print("Unable to authenticate", str(e))
        raise ApplicationError(*error_list["AUTH_ERROR"])
    auth.set_access_token(access_token_key, access_token_secret)
    self._api = tweepy.API(auth_handler=auth)
    # Note: the twarc client is built from the module-level `cnst` constants
    # rather than the arguments passed in above.
    try:
        self._api_twarc = Twarc(cnst.CONSUMER_KEY, cnst.CONSUMER_SECRET,
                                cnst.ACCESS_TOKEN_KEY,
                                cnst.ACCESS_TOKEN_SECRET)
    except Exception as e:
        print("Unable to authenticate", str(e))
        raise ApplicationError(*error_list["AUTH_ERROR"])
def get_account(item):
    """
    Uses the Twarc library to surface all the tweets twarc can see for a
    Twitter username. Searches for media in all tweets; if it finds any,
    it also tries to download each media item.
    """
    item.agent_name = agent_name + "_1_get_account"
    if not os.path.exists(item.storage_folder):
        os.makedirs(item.storage_folder)
    t = Twarc(twitter_consumer_key, twitter_consumer_secret,
              twitter_access_token, twitter_access_token_secret)
    name = item.url.strip().replace("https://twitter.com/", "").replace("?", "")
    file_path = os.path.join(
        item.storage_folder,
        "{}_{}.json".format(time.strftime("%d-%m-%Y_%H-%M-%S"), name))
    tweets = []
    for tweet in t.timeline(screen_name=name):
        tweets.append(tweet)
    tweets = filter_tweets_by_start_date(tweets, item.date_range)
    for tweet in tweets:
        get_assets(tweet, item.storage_folder)
    with open(file_path, "w") as outfile:
        json.dump(tweets, outfile)
    item.completed = True
    return item
def get_tweet(item):
    """
    Takes a tweet id and uses the twarc lib to harvest it. Searches for
    media in the tweet; if it finds any, it also tries to download each
    media item.
    """
    item.agent_name = agent_name + "_1_get_tweet"
    if not os.path.exists(item.storage_folder):
        os.makedirs(item.storage_folder)
    url = item.url
    if url.endswith("/"):
        url = url[:-1]
    __, __id = url.rsplit("/", 1)
    t = Twarc(twitter_consumer_key, twitter_consumer_secret,
              twitter_access_token, twitter_access_token_secret)
    for tweet in t.hydrate([__id]):
        get_assets(tweet, item.storage_folder)
        file_path = os.path.join(
            item.storage_folder,
            "{}_{}.json".format(time.strftime("%d-%m-%Y_%H-%M-%S"),
                                tweet['id']))
        with open(file_path, "w") as outfile:
            json.dump(tweet, outfile)
    item.completed = True
    return item
def collect_timelines(input_file, output_file, credentials_file):
    with open(credentials_file) as fp:
        credentials = tuple(map(str.strip, fp.readlines()))
    twarc_obj = Twarc(*credentials)
    df = pd.read_csv(input_file, sep="\t")
    with open(output_file, "w+") as fp:
        total = 0
        found_users = 0
        pbar = tqdm.tqdm(df.values)
        for uid, tid, u_statuses in pbar:
            found = 0
            pbar.set_description("User {}".format(uid))
            try:
                for tweet_json in twarc_obj.timeline(user_id="{}".format(uid)):
                    found += 1
                    if found > 190:
                        break
                    total += 1
                    print(json.dumps(tweet_json), file=fp)
                    pbar.set_postfix(found=found_users + 1, total=total)
            except requests.exceptions.HTTPError as e:
                pbar.write("Error for uid={}. {}".format(uid, e))
            else:
                found_users += 1
        pbar.close()
    print("Collected {} tweets.".format(total))
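# Usage sketch (assumptions: "users.tsv" is a tab-separated file whose rows
# hold user id, tweet id, and statuses count, matching the unpacking above;
# "credentials.txt" holds the four keys one per line in Twarc's positional
# order; all three file names are hypothetical).
collect_timelines("users.tsv", "timelines.jsonl", "credentials.txt")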
def _create_twarc(self):
    self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                       self.message["credentials"]["consumer_secret"],
                       self.message["credentials"]["access_token"],
                       self.message["credentials"]["access_token_secret"],
                       http_errors=self.http_errors,
                       connection_errors=self.connection_errors)
def stream_city(cf, city, keywords=None):
    bbox = {
        "great_syd": [149.971885992, -34.33117400499998,
                      151.63054702400007, -32.99606922499993],
        "great_mel": [144.33363404800002, -38.50298801599996,
                      145.8784120140001, -37.17509899299995],
        "great_brisbane": [152.07339276400012, -28.363962911999977,
                           153.54670756200005, -26.452339004999942],
        "great_ald": [138.435645001, -35.350296029999974,
                      139.04403010400003, -34.50022530299998]
    }
    if keywords is None:
        keywords = cf["search_words"]
    t = Twarc(**cf['account'])
    # No keyword restriction, but only tweets from a specific city; for the
    # reasoning see:
    # https://stackoverflow.com/questions/22889122/how-to-add-a-location-filter-to-tweepy-module
    if not os.path.isdir(city + "/"):
        os.makedirs(city)
    path = city + "/" + str(datetime.date.today()) + ".jsonl"
    locations = ",".join([str(i) for i in bbox[city]])
    for tweet in t.filter(locations=locations):
        print("got one tweet")
        # TODO: send_to_db(tweet)
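# Usage sketch (assumptions: the config is a YAML file whose 'account' key
# maps to Twarc keyword arguments and whose 'search_words' key lists
# keywords, as the function body implies; "config.yaml" is hypothetical).
import yaml
with open("config.yaml") as f:
    cf = yaml.safe_load(f)
stream_city(cf, "great_mel")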
def collect_replies(self):
    """
    Collect replies for all tweets from query using twarc.
    :return:
    """
    twarc = Twarc(self.ak, self.aks, self.at, self.ats)
    reply_count = 0
    # Loop through all parent tweets from the query.
    for tweet in self.tweets:
        replies = []
        # Get an iterator over replies from twarc.
        reps = twarc.replies(self.tweepy_to_twarc(tweet), recursive=False)
        # The first "rep" is the parent tweet itself, so we don't use it.
        rep = next(reps)
        i = 0
        # Collect at most 30 replies per tweet.
        while i < 30:
            try:
                rep = next(reps)  # get next reply and add it to the list
                replies.append(rep)
                i = i + 1
            except StopIteration:
                break
            except Exception as e:
                print('error: ', e)
        # Add the tweet to the dict {id: replies}.
        self.dict[tweet.id] = replies
        reply_count += len(replies)
    print(reply_count, ' replies were collected')
def pull_tweet(input_file_name):
    # Credentials redacted: the original hard-coded live API keys here,
    # which is unsafe. Supply your own.
    CONSUMER_KEY = "<CONSUMER_KEY>"
    CONSUMER_KEY_SECRET = "<CONSUMER_KEY_SECRET>"
    ACCESS_TOKEN = "<ACCESS_TOKEN>"
    ACCESS_TOKEN_SECRET = "<ACCESS_TOKEN_SECRET>"
    t = Twarc(CONSUMER_KEY, CONSUMER_KEY_SECRET,
              ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    inputF = open(input_file_name, "r")
    line = inputF.readline()
    data = []
    i = 0
    while line != "" and i < 10:
        try:
            tweet = t.tweet(line.strip())
            if tweet["lang"] == "en":
                if 'retweeted_status' in tweet:
                    data.append(
                        tweet['retweeted_status']['full_text'].replace('\n', ' '))
                else:
                    # Bug fix: list.append takes a single argument; the
                    # original passed `data` as an extra argument.
                    data.append(tweet['full_text'].replace('\n', ' '))
                i += 1
            line = inputF.readline()
        except Exception as e:
            line = inputF.readline()
    inputF.close()
    return data
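# Usage sketch (assumption: "tweet_ids.txt", one tweet ID per line, is
# hypothetical).
texts = pull_tweet("tweet_ids.txt")
print("collected {} English tweet texts".format(len(texts)))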
def authorize():
    """
    Return authorized Twarc handler with the credentials stored in config
    file.
    """
    config = configuration()['twitter']
    twarc_auth = Twarc(config['consumer_key'], config['consumer_secret'],
                       config['access_token'], config['access_token_secret'])
    return twarc_auth
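# Usage sketch (assumption: `configuration()` returns the parsed config with
# a 'twitter' section, as the function above implies).
t = authorize()
for tweet in t.search("twarc"):
    print(tweet["id_str"])
    break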
def get_twitter_keys(profile=None, twarc_config=None):
    # Let Twarc resolve keys from its own config file (and optional profile),
    # then hand them back as a plain dict.
    twarc = Twarc(config=twarc_config, profile=profile)
    return {
        'consumer_key': twarc.consumer_key,
        'consumer_secret': twarc.consumer_secret,
        'access_token': twarc.access_token,
        'access_token_secret': twarc.access_token_secret,
    }
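# Usage sketch: resolve keys from twarc's own config file (by default
# ~/.twarc in twarc v1) and reuse them with any of the other helpers here.
keys = get_twitter_keys()
t = Twarc(keys['consumer_key'], keys['consumer_secret'],
          keys['access_token'], keys['access_token_secret'])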
def _create_twarc(self):
    return Twarc(self.config["keys"]["consumer_key"],
                 self.config["keys"]["consumer_secret"],
                 self.config["keys"]["access_token"],
                 self.config["keys"]["access_token_secret"],
                 http_errors=self.http_errors,
                 connection_errors=self.connection_errors,
                 tweet_mode="extended")
def main():
    """ Main program """
    # argument check
    if len(sys.argv) > 1:
        # if argument file exists
        if os.access(sys.argv[1], os.R_OK):
            input_file = sys.argv[1]
        else:
            sys.stderr.write("ERROR, NEED VALID FILE\n")
            sys.exit(1)
    else:
        sys.stderr.write("ERROR, NEED FILE\n")
        sys.exit(1)

    # check if data folder exists or create it
    if not os.path.isdir("data"):
        os.makedirs("data")

    # keep running stream function (every hour)
    while True:
        print("Starting")
        # string of streaming words
        keys = ""
        projects = []
        # read the project definitions as JSON
        with open(input_file, "r") as fr:
            projects = json.loads(fr.read())
        for project in projects:
            keys += ",".join(project["synonyms"]) + ","
        print("Projects %s" % str(projects))
        keys = keys.rstrip(",")
        # create Twarc client
        t = Twarc(client_key, client_secret, access_token, access_token_secret)
        # call stream function every hour
        if stream(keys, projects, t) is not True:
            sys.stderr.write("ERROR, STREAM QUITS\n")
            sys.exit(1)
def test_search():
    count = 0
    t = Twarc()
    for tweet in t.search('obama'):
        assert tweet['id_str']
        count += 1
        if count == 10:
            break
    assert count == 10
def test_since_id():
    t = Twarc()
    for tweet in t.search('obama'):
        id = tweet['id_str']
        break
    assert id
    time.sleep(5)
    for tweet in t.search('obama', since_id=id):
        assert tweet['id_str'] > id
def __init__(self, secretsfile='/Users/sara/twittersecrets.txt'):
    # Use a context manager so the file handle is closed (the original
    # left it open).
    with open(secretsfile, 'r') as fsecret:
        secrets = fsecret.readline()
    access_token, access_token_secret, consumer_key, consumer_secret = \
        [x.strip() for x in secrets.split(',')]
    self.twarc = Twarc(consumer_key, consumer_secret,
                       access_token, access_token_secret)
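# The secrets file this expects is a single comma-separated line, in the
# order implied by the unpacking above:
#
#   <access_token>,<access_token_secret>,<consumer_key>,<consumer_secret>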
def get_interactions(consumer_key, consumer_secret, access_token,
                     access_token_secret):
    """
    Arguments are Twitter API credentials; to get them you can go here:
    http://apps.twitter.com/. Saves pickled lists of tweet authors and the
    users they mention, and a list of users considered.
    """
    from twarc import Twarc
    from tqdm import tqdm
    import pickle

    t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
    list_ids = ["1335885096063295488", "1288082572195639296",
                "1287444819015618561", "1283739792702713856",
                "1081734288368898048", "910757441855459328",
                "193445218", "90205656", "85315110"]
    users = set([m['screen_name'] for lid in list_ids
                 for m in t.list_members(lid)])
    users_to_exclude = ['premierleague', 'SpursOfficial', 'Arsenal',
                        'ManCity', 'sterling7', 'kylewalker2', 'HKane',
                        'benmendy23', 'dele_official', 'RobHolding95',
                        'm8arteta']
    # Plain loop instead of a list comprehension used for its side effects.
    for u in users_to_exclude:
        users.remove(u)
    authors = []
    mentions = []
    for user in tqdm(users):
        tl = t.timeline(screen_name=user)
        tweets = [tt for tt in tl]
        m = [u['screen_name'] for tw in tweets
             for u in tw['entities']['user_mentions']]
        a = [user] * len(m)
        mentions.append(m)
        authors.append(a)
    flat_a = [item for sublist in authors for item in sublist]
    flat_m = [item for sublist in mentions for item in sublist]
    pickle.dump(flat_a, open('authors.p', 'wb'))
    pickle.dump(flat_m, open('mentions.p', 'wb'))
    pickle.dump(users, open('users.p', 'wb'))
def __init__(self, search_terms):
    logging.info("initializing TwitterStream Kafka")
    # shared setup for all instances
    self.t = Twarc(localConfig.client_key, localConfig.client_secret,
                   localConfig.access_token, localConfig.access_token_secret)
    self.search_terms = search_terms
def test_paging():
    # pages are 100 tweets big, so if we can get 500 then paging is working
    t = Twarc()
    count = 0
    for tweet in t.search('obama'):
        count += 1
        if count == 500:
            break
    assert count == 500
def main():
    if len(sys.argv) > 2:
        screen_name = sys.argv[1]
        keyword = sys.argv[2]
        t = Twarc(consumer_key, consumer_secret, access_token, access_secret)
        follower_ids = get_followers_id(t, screen_name)
        get_screen_name(t, screen_name, follower_ids, keyword)
    else:
        print('Usage: python poc.py fs0c131y chowkidar')
def test_stream():
    t = Twarc()
    count = 0
    for tweet in t.stream("obama"):
        assert tweet['id_str']
        assert tweet['text']
        count += 1
        if count == 50:
            break
    assert count == 50
def main(get_method=None, input_hashtags=None, storage_location=None):
    if not os.path.exists(storage_location):
        os.makedirs(storage_location, exist_ok=True)
    hashtag_query = input_hashtags.strip().replace(",", "+OR+")
    try:
        tweets = 0
        t = Twarc(
            consumer_key,
            consumer_secret,
            access_token,
            access_token_secret,
            tweet_mode="extended",
        )
        print(
            "Started storing tweets related to " + input_hashtags
            + " at " + storage_location
            + " since " + str(datetime.datetime.now())
        )
        if get_method == "populate":
            for tweet in t.search(hashtag_query, lang=language):
                # Bug fix: pass directory and file name to os.path.join as
                # separate arguments instead of concatenating them first.
                with open(
                    os.path.join(storage_location,
                                 "tweet" + str(tweet["id"]) + ".json"),
                    "w",
                    encoding="utf8",
                ) as file:
                    json.dump(tweet, file)
                tweets += 1
        elif get_method == "track":
            for tweet in t.filter(hashtag_query):
                with open(
                    os.path.join(storage_location,
                                 "tweet" + str(tweet["id"]) + ".json"),
                    "w",
                    encoding="utf8",
                ) as file:
                    json.dump(tweet, file)
                tweets += 1
        else:
            print("No method defined, exiting...")
    except KeyboardInterrupt:
        print("Shutdown requested...successfully stored "
              + str(tweets) + " tweets")
    except BaseException:
        traceback.print_exc(file=sys.stdout)
    sys.exit(0)
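# Usage sketch (assumptions: the module-level credentials and `language` are
# defined, as the function body implies; the hashtags and path below are
# hypothetical).
main(get_method="populate", input_hashtags="#flood,#storm",
     storage_location="data/tweets")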
def get_traing_data(self):
    '''
    :return: combined data (tweet info and TREC-IS data) as a dictionary
        {tweet_id: Tweet}
    '''
    # Load tweets retrieved by the TREC tweets downloader.
    # retrieved_tweets, f_name = self.load_Tweets()
    # retrieved_tweets, f_name = self.load_event_tweets()
    with open('data/all_tweets.pkl', 'rb') as file:
        retrieved_tweets = pickle.load(file)
    missed_tweets = []
    training_data = {}  # dict {'tweet id': Tweet}
    # Load TREC data: tweet IDs, tweet priority, tweet categories,
    # indicator terms.
    events = json.load(open(self.trec_path))
    events = pd.DataFrame.from_dict(events['events'], orient='columns')
    for _, event in events.iterrows():
        for trec_tweet in event['tweets']:
            # Check whether the full tweet was retrieved.
            if trec_tweet['postID'] in retrieved_tweets:
                retriev_tweet = retrieved_tweets[trec_tweet['postID']]
                training_data[trec_tweet['postID']] = Tweet(
                    id=retriev_tweet.id,
                    text=retriev_tweet.text,
                    metadata=retriev_tweet.metadata,
                    priority=trec_tweet['priority'],
                    indicatorTerms=trec_tweet['indicatorTerms'],
                    categories=trec_tweet['categories'],
                    event_type=trec_tweet['event_type'])
            else:
                # Record the missed tweet with the TREC annotations only.
                training_data[trec_tweet['postID']] = Tweet(
                    id=trec_tweet['postID'],
                    priority=trec_tweet['priority'],
                    indicatorTerms=trec_tweet['indicatorTerms'],
                    categories=trec_tweet['categories'],
                    event_type=trec_tweet['event_type'])
                missed_tweets.append(trec_tweet['postID'])
    # Retrieve the missed tweets with twarc and merge them into the
    # training data.
    t = Twarc(self.consumer_key, self.consumer_secret,
              self.access_token, self.access_token_secret)
    tweets_twarc = t.hydrate(iter(missed_tweets))  # retrieve all tweets by ID
    for twtt in tweets_twarc:
        training_data[str(twtt['id'])].add_tweets_data(
            twtt['full_text'], {'created_at': twtt['created_at']})
    return training_data
def main():
    """ Main program """
    # argument check
    if len(sys.argv) > 1:
        # if argument file exists
        if os.access(sys.argv[1], os.R_OK):
            input_file = sys.argv[1]
        else:
            sys.stderr.write("ERROR, NEED VALID FILE\n")
            sys.exit(1)
    else:
        sys.stderr.write("ERROR, NEED FILE\n")
        sys.exit(1)

    # string of streaming words
    keys = ""
    # open file for read
    with open(input_file, "r") as fr:
        for line in fr:
            # skip empty lines
            if line != '\n':
                # remove whitespace at the start and end of the line
                line = line.rstrip('\n\t ')
                line = line.strip('\t ')
                # append line to the keyword string
                keys = keys + line + ","
    keys = keys.rstrip(",")

    # check if data folder exists or create it
    if not os.path.isdir("data"):
        os.makedirs("data")

    # keep running stream function (every hour)
    while True:
        # create Twarc client
        t = Twarc(client_key, client_secret, access_token, access_token_secret)
        # call stream function every hour
        if stream(keys, t) is not True:
            sys.stderr.write("ERROR, STREAM QUITS\n")
            sys.exit(1)

    # write statistics of user tweets (note: only reached if the loop above
    # ever exits); ported from Python 2 iteritems/tuple-unpacking lambda
    with open("data/statistics.txt", "w") as fs:
        # write each user's id and tweet count, most active first
        for key, value in sorted(friends.items(),
                                 key=lambda kv: (kv[1], kv[0]), reverse=True):
            fs.write(str(key) + " : " + str(value) + "\n")
def __init__(self, target_list=[], data_dir='',
             secretfile='/Users/sara/twittersecrets.txt', getimages=True):
    # Set up link to Twitter.
    with open(secretfile, 'r') as fsecret:
        secrets = fsecret.readline()
    access_token, access_token_secret, consumer_key, consumer_secret = \
        [x.strip() for x in secrets.split(',')]
    # Bug fix: bind the client to the instance; the original assigned it to
    # a local variable, which discarded it.
    self.twarc = Twarc(consumer_key, consumer_secret,
                       access_token, access_token_secret)
    self.target_list = target_list
    self.data_dir = data_dir
    self.getimages = getimages
def test_max_id():
    t = Twarc()
    for tweet in t.search('obama'):
        id = tweet['id_str']
        break
    assert id
    time.sleep(5)
    count = 0
    for tweet in t.search('obama', max_id=id):
        count += 1
        assert tweet['id_str'] <= id
        if count > 100:
            break
def test_hydrate():
    ids = [
        "501064188211765249", "501064196642340864", "501064197632167936",
        "501064196931330049", "501064198005481472", "501064198009655296",
        "501064198059597824", "501064198513000450", "501064180468682752",
        "501064199142117378", "501064171707170816", "501064200186118145",
        "501064200035516416", "501064201041743872", "501064201251880961",
        "501064198973960192", "501064201256071168", "501064202027798529",
        "501064202245521409", "501064201503113216", "501064202363359232",
        "501064202295848960", "501064202380115971", "501064202904403970",
        "501064203135102977", "501064203508412416", "501064203516407810",
        "501064203546148864", "501064203697156096", "501064204191690752",
        "501064204288540672", "501064197396914176", "501064194309906436",
        "501064204989001728", "501064204980592642", "501064204661850113",
        "501064205400039424", "501064205089665024", "501064206666702848",
        "501064207274868736", "501064197686296576", "501064207623000064",
        "501064207824351232", "501064208083980290", "501064208277319680",
        "501064208398573568", "501064202794971136", "501064208789045248",
        "501064209535614976", "501064209551994881", "501064141332029440",
        "501064207387742210", "501064210177331200", "501064210395037696",
        "501064210693230592", "501064210840035329", "501064211855069185",
        "501064192024006657", "501064200316125184", "501064205642903552",
        "501064212547137536", "501064205382848512", "501064213843169280",
        "501064208562135042", "501064214211870720", "501064214467731457",
        "501064215160172545", "501064209648848896", "501064215990648832",
        "501064216241897472", "501064215759568897", "501064211858870273",
        "501064216522932227", "501064216930160640", "501064217667960832",
        "501064211997274114", "501064212303446016", "501064213675012096",
        "501064218343661568", "501064213951823873", "501064219467341824",
        "501064219677044738", "501064210080473088", "501064220415229953",
        "501064220847656960", "501064222340423681", "501064222772445187",
        "501064222923440130", "501064220121632768", "501064222948593664",
        "501064224936714240", "501064225096499201", "501064225142624256",
        "501064225314185216", "501064225926561794", "501064226451259392",
        "501064226816143361", "501064227302674433", "501064227344646144",
        "501064227688558592", "501064228288364546", "501064228627705857",
        "501064229764751360", "501064229915729921", "501064231304065026",
        "501064231366983681", "501064231387947008", "501064231488200704",
        "501064231941570561", "501064232188665856", "501064232449114112",
        "501064232570724352", "501064232700350464", "501064233186893824",
        "501064233438568450", "501064233774510081", "501064235107897344",
        "501064235175399425", "501064235456401410",
    ]
    t = Twarc()
    count = 0
    for tweet in t.hydrate(iter(ids)):
        assert tweet['id_str']
        count += 1
    assert count > 100  # may need to adjust as these might get deleted
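# Note: twarc's hydrate() sends IDs to the statuses/lookup endpoint in
# batches of up to 100, so a list of 100+ IDs like the one above exercises
# more than one request. A minimal standalone check:
#
#   t = Twarc()
#   tweets = list(t.hydrate(["501064188211765249"]))
#   assert len(tweets) <= 1  # zero if the tweet has since been deleted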