import pandas as pd
from searchtweets import gen_request_parameters


def get_query(query_string, jjmin, jjmax, tweet_fields, nb_tweets=10):
    """Generate a formatted query for the Twitter v2 API.

    Args:
        query_string (string): string used to build the query
        jjmin (int): min day offset from the current day
        jjmax (int): max day offset from the current day
        tweet_fields (string): fields requested from the Twitter API
        nb_tweets (int, optional): max number of tweets to return. Defaults to 10.

    Returns:
        dict: formatted query
    """
    current_date = pd.to_datetime('today')
    start_time = (current_date + pd.Timedelta(jjmin, "D")).strftime("%Y-%m-%d")
    end_time = (current_date + pd.Timedelta(jjmax, "D")).strftime("%Y-%m-%d")
    if jjmax == 1:
        # Return end time as now minus one minute
        end_time = (current_date - pd.Timedelta(1, "m")).strftime("%Y-%m-%dT%H:%M")
    query = gen_request_parameters(query_string,
                                   tweet_fields=tweet_fields,
                                   start_time=start_time,
                                   end_time=end_time,
                                   results_per_call=nb_tweets)
    return query
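# A minimal usage sketch (not part of the original snippet): the query returned by
# get_query() can be passed to searchtweets.collect_results together with credentials
# loaded via load_credentials. The YAML path, yaml_key, and argument values below are
# assumptions for illustration only.
from searchtweets import collect_results, load_credentials

search_args = load_credentials(filename="~/.twitter_keys.yaml",
                               yaml_key="search_tweets_v2",
                               env_overwrite=False)
query = get_query("python lang:en", jjmin=-7, jjmax=1,
                  tweet_fields="id,text,created_at", nb_tweets=100)
tweets = collect_results(query, max_tweets=100, result_stream_args=search_args)
print(len(tweets))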
import datetime
from functools import reduce

from searchtweets import collect_results, gen_request_parameters, load_credentials


def load():
    # twitter_conifg() is a project helper (defined elsewhere) returning the query config
    config = twitter_conifg()
    base_date = datetime.datetime.today()
    date_list = [base_date - datetime.timedelta(days=x) for x in range(5)]
    date_list.reverse()
    all_tweets = []
    for idx, date in enumerate(date_list):
        if idx != 4:
            final_date = date + datetime.timedelta(days=1)
            search_args = load_credentials(filename="./configs/twitter_api.yaml",
                                           yaml_key="search_tweets_v2",
                                           env_overwrite=False)
            query = gen_request_parameters(config['query'],
                                           results_per_call=100,
                                           place_fields='country',
                                           start_time=date.strftime('%Y-%m-%d'),
                                           end_time=final_date.strftime('%Y-%m-%d'))
            tweets = collect_results(query, max_tweets=1000,
                                     result_stream_args=search_args)

            def add_date(x):
                x['fecha'] = date.strftime('%Y-%m-%d')
                return x

            tweets = list(map(add_date, tweets))
            all_tweets.append(tweets)
    all_tweets = reduce(lambda x, y: x + y, all_tweets)
    return all_tweets
import csv
import sys

from searchtweets import ResultStream, gen_request_parameters, load_credentials


def search(queryString, outputpath, api_key_yaml,
           startTime="2016-01-01", endTime="2021-03-15", lang="en"):
    search_args = load_credentials(api_key_yaml,
                                   yaml_key="search_tweets_v2",
                                   env_overwrite=False)
    # The full query must stay within the API's 1024-character limit
    print("Query length (should not exceed 1024):")
    print(len(queryString + " -is:nullcast -is:retweet -is:verified -is:quote " + "lang:" + lang))
    query = gen_request_parameters(
        query=queryString.strip() + " -is:nullcast -is:retweet -is:verified -is:quote " + "lang:" + lang,
        media_fields="media_key,type",
        user_fields="id,description,location,name,entities,url,username,public_metrics,verified,withheld,protected",
        tweet_fields="id,text,created_at,geo,in_reply_to_user_id,lang,author_id,conversation_id,public_metrics,entities,context_annotations,attachments",
        start_time=startTime,
        end_time=endTime,
        stringify=False,
        expansions="author_id,attachments.media_keys",
        results_per_call=500)
    rs = ResultStream(request_parameters=query,
                      max_tweets=sys.maxsize,
                      max_requests=sys.maxsize,
                      **search_args)
    i = 0
    with open(outputpath, 'w') as outputcsv:
        writer = csv.writer(outputcsv)
        # `headers`, `createRow` and `dump_users_info` are defined elsewhere in the project
        writer.writerow(headers)
        for tweet in rs.stream():
            # print(tweet)
            if "id" in tweet:
                writer.writerow(createRow(headers, tweet))
            if "users" in tweet:
                print("parsing users")
                dump_users_info(tweet, outputpath.replace(".csv", str(i) + "-users.csv"))
                i += 1
from searchtweets import gen_request_parameters


def create_query_obj(query_str: str, start_date: str, end_date: str):
    return gen_request_parameters(
        query=query_str,
        results_per_call=500,
        start_time=start_date,
        end_time=end_date,
        tweet_fields="created_at",
        expansions="author_id,referenced_tweets.id,referenced_tweets.id.author_id",
    )
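# A minimal usage sketch (an assumption, not part of the original snippet): the request
# parameters built by create_query_obj() can be fed to a ResultStream together with
# credentials loaded from a YAML file. The path, yaml_key, and query values are
# illustrative only.
from searchtweets import ResultStream, load_credentials

search_args = load_credentials("~/.twitter_keys.yaml",
                               yaml_key="search_tweets_v2",
                               env_overwrite=False)
query = create_query_obj("climate lang:en", "2021-01-01", "2021-01-31")
rs = ResultStream(request_parameters=query, max_tweets=1000, **search_args)
for tweet in rs.stream():
    print(tweet.get("id"))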
import logging
from datetime import datetime, timedelta, timezone
from json import dumps

import azure.functions as func
from searchtweets import collect_results, gen_request_parameters


def main(mytimer: func.TimerRequest, fetchedTweetsQue: func.Out[func.QueueMessage]) -> None:
    time = datetime.utcnow().replace(tzinfo=timezone.utc)
    hashtags = get_hashtags()                 # project helper: hashtags to track
    credentials = load_twitter_credentials()  # project helper: searchtweets credential dict
    start_time = time - timedelta(minutes=5)
    tweet_fields = ['id', 'text', 'created_at', 'lang']
    for hashtag in hashtags:
        query = hashtag
        logging.info(f'Fetching tweets with query: {query}')
        request_params = gen_request_parameters(
            query,
            start_time=start_time.strftime("%Y-%m-%d %H:%M"),
            tweet_fields=','.join(tweet_fields),
            # since_id=  # TODO: Use last fetched tweet id in request
        )
        response = collect_results(request_params,
                                   max_tweets=100,
                                   result_stream_args=credentials)
        if response:
            tweets = response[:-1]
            response_metadata = response[-1]
            # TODO: Store 'newest_id'
            # TODO: Support pagination
            logging.info(f'Unfiltered tweets count: {len(tweets)}')
            messages = []
            for t in filter_tweets(tweets):  # project helper: drop unwanted tweets
                t['hashtag'] = hashtag
                messages.append(dumps(t))
            logging.info(f'Filtered tweets count: {len(messages)}')
            logging.info(messages)
            fetchedTweetsQue.set(messages)
    logging.info('Python timer trigger function ran at %s', time.isoformat())
# loop through dates
for single_date in daterange(start_date, end_date):
    # set start timestamp
    start_ts = single_date
    # set end timestamp
    end_ts = single_date + timedelta(days=1)
    # payload rules for v2 api
    rule = gen_request_parameters(
        query=config['query'],
        results_per_call=config['results_per_call'],
        start_time=start_ts.isoformat(),
        end_time=end_ts.isoformat(),
        tweet_fields=tweetfields,
        user_fields=userfields,
        media_fields=mediafields,
        place_fields=placefields,
        expansions=expansions,
        stringify=False)
    # result stream from twitter v2 api
    rs = ResultStream(request_parameters=rule,
                      max_results=100000,
                      max_pages=1,
                      max_tweets=config['max_tweets'],
                      **search_creds)
    # number of reconnection tries
    tries = 10
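# The snippet above cuts off right after `tries = 10`; the original retry logic is not
# shown. A minimal sketch of how the stream might be consumed with reconnection
# attempts (an assumption, not the original author's code):
import time

import requests

tweets = []
while tries > 0:
    try:
        for tweet in rs.stream():
            tweets.append(tweet)
        break                      # stream finished without errors
    except requests.exceptions.ConnectionError:
        tries -= 1                 # retry after a dropped connection
        time.sleep(5)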
def lookup(self, config: TwitterSourceConfig, **kwargs) -> List[AnalyzerRequest]:
    if not config.query and not config.keywords and not config.hashtags and not config.usernames:
        raise AttributeError(
            "At least one non empty parameter required (query, keywords, hashtags, and usernames)"
        )

    place_fields = ",".join(config.place_fields) if config.place_fields is not None else None
    user_fields = ",".join(config.user_fields) if config.user_fields is not None else None
    expansions = ",".join(config.expansions) if config.expansions is not None else None
    tweet_fields = ",".join(config.tweet_fields) if config.tweet_fields is not None else None

    # Get data from state
    id: str = kwargs.get("id", None)
    state: Dict[str, Any] = None if id is None else self.store.get_source_state(id)
    since_id: Optional[int] = config.since_id or (
        None if state is None else state.get("since_id", None))
    until_id: Optional[int] = config.until_id or (
        None if state is None else state.get("until_id", None))
    update_state: bool = True if id else False
    state = state or dict()

    max_tweet_id = since_id
    min_tweet_id = until_id
    lookup_period = config.lookup_period
    start_time = None if lookup_period is None else datetime.strptime(
        convert_utc_time(lookup_period), "%Y-%m-%dT%H:%M:%S%z")

    if since_id or until_id:
        lookup_period = None

    query = self._generate_query_string(query=config.query,
                                        keywords=config.keywords,
                                        hashtags=config.hashtags,
                                        usernames=config.usernames,
                                        operators=config.operators)

    source_responses: List[AnalyzerRequest] = []
    need_more_lookup = True
    while need_more_lookup:
        search_query = gen_request_parameters(
            query=query,
            results_per_call=config.max_tweets,
            place_fields=place_fields,
            expansions=expansions,
            user_fields=user_fields,
            tweet_fields=tweet_fields,
            since_id=since_id,
            until_id=until_id,
            start_time=lookup_period)
        logger.info(search_query)

        tweets_output = collect_results(
            query=search_query,
            max_tweets=config.max_tweets,
            result_stream_args=config.credential.get_twitter_credentials())

        if not tweets_output:
            logger.info("No Tweets found")
            need_more_lookup = False
            break

        tweets = []
        users = []
        meta_info = None
        for raw_output in tweets_output:
            if "text" in raw_output:
                tweets.append(raw_output)
            elif "users" in raw_output:
                users = raw_output["users"]
            elif "meta" in raw_output:
                meta_info = raw_output["meta"]

        # Extract user info and create user map
        user_map: Dict[str, Dict[str, Any]] = {}
        if len(users) > 0 and "id" in users[0]:
            for user in users:
                user_map[user["id"]] = user

        # TODO use it later
        logger.info(f"Twitter API meta_info='{meta_info}'")

        for tweet in tweets:
            if "author_id" in tweet and tweet["author_id"] in user_map:
                tweet["author_info"] = user_map.get(tweet["author_id"])
            source_responses.append(self._get_source_output(tweet))

            # Get latest tweet id
            current_tweet_id = int(tweet["id"])
            logger.info(f'{tweet["created_at"]}:{current_tweet_id}:{since_id}:{until_id}')

            if start_time:
                created_date = datetime.strptime(tweet["created_at"],
                                                 "%Y-%m-%dT%H:%M:%S.%f%z")
                if start_time > created_date:
                    need_more_lookup = False
                    break

            if max_tweet_id is None:
                max_tweet_id = current_tweet_id
            if min_tweet_id is None:
                min_tweet_id = current_tweet_id
            if max_tweet_id < current_tweet_id:
                max_tweet_id = current_tweet_id
            if min_tweet_id > current_tweet_id:
                min_tweet_id = current_tweet_id

        logger.info(f'{max_tweet_id}:{min_tweet_id}')
        until_id = min_tweet_id
        lookup_period = None

    if update_state:
        state["since_id"] = max_tweet_id
        self.store.update_source_state(workflow_id=id, state=state)

    return source_responses
with open(value, "r") as credfile:
    os.environ[key] = credfile.read()

stream_args = load_credentials(filename="config.yaml",
                               yaml_key="search_tweets_pgdinamica",
                               env_overwrite=True)

LIMIT = 100
search_term = "python"

if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

for day in range(14, 21):
    query = gen_request_parameters(f"{search_term} lang:pt",
                                   start_time=f"2021-03-{day} 09:00",
                                   results_per_call=LIMIT)
    tweets = collect_results(query, max_tweets=LIMIT, result_stream_args=stream_args)
    print(f"{len(tweets)} resultados no dia {day}")
    with open(os.path.join(OUT_DIR, f"tweets_{search_term}.txt"), "a") as tweetsfile:
        lines = [tweet['text'] for tweet in tweets if 'text' in tweet]
        tweetsfile.writelines(lines)

print("FIM")
def lookup(self, config: TwitterSourceConfig, **kwargs) -> List[TextPayload]:  # type: ignore[override]
    if (not config.query and not config.keywords
            and not config.hashtags and not config.usernames):
        raise AttributeError(
            "At least one non empty parameter required (query, keywords, hashtags, and usernames)"
        )

    place_fields = (",".join(config.place_fields)
                    if config.place_fields is not None else None)
    user_fields = (",".join(config.user_fields)
                   if config.user_fields is not None else None)
    expansions = (",".join(config.expansions)
                  if config.expansions is not None else None)
    tweet_fields = (",".join(config.tweet_fields)
                    if config.tweet_fields is not None else None)

    # Get data from state
    identifier: str = kwargs.get("id", None)
    state: Optional[Dict[str, Any]] = (
        None if identifier is None or self.store is None
        else self.store.get_source_state(identifier))
    since_id: Optional[int] = config.since_id or (
        None if state is None else state.get("since_id", None))
    until_id: Optional[int] = config.until_id or (
        None if state is None else state.get("until_id", None))
    update_state: bool = True if identifier else False
    state = state or dict()

    max_tweet_id = since_id
    lookup_period = config.lookup_period
    if lookup_period is None:
        start_time = None
    elif len(lookup_period) <= 5:
        start_time = convert_utc_time(lookup_period).replace(tzinfo=pytz.UTC)
    else:
        start_time = datetime.strptime(lookup_period, "%Y-%m-%dT%H:%M:%S%z")

    if since_id or until_id:
        lookup_period = None

    query = self._generate_query_string(
        query=config.query,
        keywords=config.keywords,
        hashtags=config.hashtags,
        usernames=config.usernames,
        operators=config.operators,
    )

    source_responses: List[TextPayload] = []
    search_query = gen_request_parameters(
        granularity=None,
        query=query,
        results_per_call=config.max_tweets,
        place_fields=place_fields,
        expansions=expansions,
        user_fields=user_fields,
        tweet_fields=tweet_fields,
        since_id=since_id,
        until_id=until_id,
        start_time=lookup_period,
        stringify=False,
    )
    logger.info(search_query)

    tweets_output = collect_results(
        query=search_query,
        max_tweets=config.max_tweets,
        result_stream_args=config.get_twitter_credentials(),
    )

    tweets: List[Dict[str, Any]] = []
    users: List[Dict[str, Any]] = []
    meta_info: Dict[str, Any] = {}
    if not tweets_output:
        logger.info("No Tweets found")
    else:
        tweets = tweets_output[0]["data"] if "data" in tweets_output[0] else tweets
        if "includes" in tweets_output[0] and "users" in tweets_output[0]["includes"]:
            users = tweets_output[0]["includes"]["users"]
        meta_info = tweets_output[0]["meta"] if "meta" in tweets_output[0] else meta_info

    # Extract user info and create user map
    user_map: Dict[str, Dict[str, Any]] = {}
    if len(users) > 0 and "id" in users[0]:
        for user in users:
            if "username" in user:
                user["user_url"] = f'https://twitter.com/{user["username"]}'
            user_map[user["id"]] = user

    logger.info(f"Twitter API meta_info='{meta_info}'")
    for tweet in tweets:
        if "author_id" in tweet and tweet["author_id"] in user_map:
            tweet["author_info"] = user_map.get(tweet["author_id"])
        source_responses.append(self._get_source_output(tweet))

        if start_time:
            created_date = datetime.strptime(tweet["created_at"],
                                             "%Y-%m-%dT%H:%M:%S.%f%z")
            if start_time > created_date:
                break

    max_tweet_id = meta_info["newest_id"] if "newest_id" in meta_info else max_tweet_id
    # min_tweet_id = meta_info["oldest_id"] if "oldest_id" in meta_info else min_tweet_id

    if update_state and self.store is not None:
        state["since_id"] = max_tweet_id
        self.store.update_source_state(workflow_id=identifier, state=state)

    return source_responses
import searchtweets
from searchtweets import load_credentials

# Where to save our results
saving_path = r'/Volumes/My Passport for Mac/tweets/tweets_metadata_08052021.csv'

"""
Credentials file for a developer account, mandatory for access to the API!
"""
credentials = load_credentials(filename="credentials.yaml",
                               yaml_key="credentials",
                               env_overwrite=False)  # change if needed

"""
The query is defined here. There always has to be a search keyword; I put 'a' here
because of its wide reach (it might be possible to exclude it, but that needs further
investigation). results_per_call can be redefined via a .yaml file.
"""
# request params for this query
query = searchtweets.gen_request_parameters("a lang:de",
                                            start_time="2021-05-08T00:00",
                                            end_time="2021-05-08T23:59",
                                            results_per_call=100)

"""
List of tweet dicts, including the ids and the tweet text.
Can be printed directly or stored in a file.
"""
# tweets = collect_results(query,
#                          max_tweets=100,
#                          result_stream_args=credentials)
#
# with open('./tweets.txt', 'w') as tweet_file:
#     for x in tweets:
#         for y in x:
#             if y == 'text':
#                 tweet_file.write(x[y] + '\n')


def check_files():
print("Empty txt can't generate a word cloud") else: wordcloud = WordCloud().generate(text) image = wordcloud.to_image() image.save(os.path.join("output", f"{OUT_FILE}.png")) print("Image Loaded") stream_args = load_credentials(filename="config.yalm", yaml_key="search_tweets_v2", env_overwrite=False) tweeterUser = input("Inform the Tweet user: "******"from:{tweeterUser} -has:links", results_per_call=TWITTER_QUANTITY) try: tweets = collect_results(query, max_tweets=TWITTER_QUANTITY, result_stream_args=stream_args) except requests.exceptions.HTTPError as exception: print(colored("There's an error in your api request, Error: ", 'red')) sys.exit() if not os.path.exists(OUT_DIR): os.makedirs(OUT_DIR) with io.open(os.path.join(OUT_DIR, f"{OUT_FILE}.txt"), "w", encoding="utf-8") as tweetsfile: for tweet in tweets:
def lookup(self, config: TwitterSourceConfig) -> List[AnalyzerRequest]:
    if not config.query and not config.keywords and not config.hashtags and not config.usernames:
        raise AttributeError(
            "At least one non empty parameter required (query, keywords, hashtags, and usernames)"
        )

    place_fields = ",".join(config.place_fields) if config.place_fields is not None else None
    user_fields = ",".join(config.user_fields) if config.user_fields is not None else None
    expansions = ",".join(config.expansions) if config.expansions is not None else None
    tweet_fields = ",".join(config.tweet_fields) if config.tweet_fields is not None else None

    query = self._generate_query_string(query=config.query,
                                        keywords=config.keywords,
                                        hashtags=config.hashtags,
                                        usernames=config.usernames,
                                        operators=config.operators)

    search_query = gen_request_parameters(
        query=query,
        results_per_call=config.max_tweets,
        place_fields=place_fields,
        expansions=expansions,
        user_fields=user_fields,
        tweet_fields=tweet_fields,
        since_id=config.since_id,
        until_id=config.until_id,
        start_time=config.lookup_period)

    tweets_output = collect_results(
        query=search_query,
        max_tweets=config.max_tweets,
        result_stream_args=config.credential.get_twitter_credentials())

    if not tweets_output:
        logger.info("No Tweets found")
        return []

    tweets = []
    users = []
    meta_info = None
    for raw_output in tweets_output:
        if "text" in raw_output:
            tweets.append(raw_output)
        elif "users" in raw_output:
            users = raw_output["users"]
        elif "meta" in raw_output:
            meta_info = raw_output["meta"]

    # Extract user info and create user map
    user_map: Dict[str, Dict[str, Any]] = {}
    if len(users) > 0 and "id" in users[0]:
        for user in users:
            user_map[user["id"]] = user

    # TODO use it later
    logger.info(f"Twitter API meta_info='{meta_info}'")

    source_responses: List[AnalyzerRequest] = []
    for tweet in tweets:
        if "author_id" in tweet and tweet["author_id"] in user_map:
            tweet["author_info"] = user_map.get(tweet["author_id"])
        source_responses.append(self._get_source_output(tweet))

    return source_responses
from searchtweets import ResultStream, gen_request_parameters, load_credentials

search_args = load_credentials("~/.twitter_keys.yaml",
                               yaml_key="search_tweets_v2",
                               env_overwrite=False)

query = gen_request_parameters("Electric Vehicle", results_per_call=100)

rs = ResultStream(request_parameters=query,
                  max_results=500,
                  max_pages=1,
                  **search_args)

tweets = list(rs.stream())
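# A minimal follow-up sketch (not part of the original snippet): depending on the
# searchtweets version, rs.stream() can also yield expansion/metadata objects, so only
# items that actually carry a "text" key are treated as tweets here.
for item in tweets:
    if isinstance(item, dict) and "text" in item:
        print(item["text"])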