def SN_db_operations(hashtag: str, since: str, until: str) -> tuple:
    """
    1. Creates staging table for new hashtag
    2. Calls tweet ID downloader
    3. Saves tweet IDs to DB
    Downloads tweet IDs of hashtags via snscrape
    :param hashtag: hashtag to be downloaded
    :param since: start date for tweet download (string)
    :param until: end date for tweet download (string)
    :return: table_name, hashtag, dataframe length
    """
    table_name = db_functions.get_staging_table_name(hashtag)  # adds prefix (s_h_) and suffix (date and time) to table name
    db_functions.drop_table(table_name)
    db_functions.create_empty_staging_table(table_name)
    tweet_ids = SN_get_tweet_ids_for_hashtag(since, until, hashtag)  # downloads tweet IDs
    if len(tweet_ids) == 0:
        print("####################################################")
        print("####Warning! No Tweets fetched! Process aborted!####")
        print("####################################################")
        sys.exit()
    df = pd.DataFrame(tweet_ids)
    write_to_table = "update_temp"
    db_functions.df_to_sql(df, write_to_table, drop="replace")  # write tweet IDs to temp table
    db_functions.update_table(
        f'insert into {table_name} (id) select cast ("0" as bigint) from {write_to_table}'
    )  # insert temp table content into staging table
    print(f"Table {table_name} created with {len(df)} tweets.")
    db_functions.drop_table(write_to_table)
    return table_name, hashtag, df.shape[0]
def hashtag_download_launcher(hashtag, since: str, until: str, download_parent_tweets: bool):
    """
    Manages the whole hashtag download process.
    Step 1: Calls procedures for SQL table creation and tweet ID download.
    Step 2: Calls tweet details download launcher to add tweet details. Does this in iterations to minimize
    data loss risk.
    Result is stored in staging table s_h_HASHTAG_TIMESTAMP
    :param hashtag: hashtag to be downloaded
    :param since: start date for tweet download (string)
    :param until: end date for tweet download (string)
    :param download_parent_tweets: if True, parent tweets of replies are downloaded as well
    :return: table_name
    """
    table_name, hashtag, len_df = SN_db_operations(hashtag, since, until)
    print("Hashtag Twitter ID download complete. Starting detail download.")
    bulk_size = 1000
    new_tweets_fetched = 1
    loop_counter = 1
    while new_tweets_fetched != 0:
        start_time = time.time()
        print(f"Iteration {loop_counter} running. Estimated iterations: {int(len_df / bulk_size) + 1 + 5}")
        # +1 to avoid 0 iterations, +5 is a good estimate for the number of iterations needed to get all parent tweets
        new_tweets_fetched = TwitterAPI.tweet_details_download_launcher(
            table_name, hashtag, bulk_size, download_parent_tweets)
        print(f"Iteration {loop_counter} runtime: {time.time() - start_time} ")
        loop_counter += 1
    print("Hashtag downloaded successfully.")

    # insert new users into table n_users
    new_users = f"""insert into n_users (id)
    select f.user_id from {table_name} f
    left join n_users u on f.user_id = u.id
    where u.id is null and f.user_id is not null"""
    db_functions.update_table(new_users)
    return table_name
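# Illustrative usage sketch (not part of the original module; the hashtag and dates below are assumptions):
# table_name = hashtag_download_launcher(hashtag="klimastreik", since="2021-01-01",
#                                        until="2021-01-31", download_parent_tweets=True)
# The returned staging table (s_h_<hashtag>_<timestamp>) can afterwards be copied into facts_hashtags.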
def inference_bert_friends(classifier, column_list: list, sql: str, min_matches: int):
    """
    Performs inference based on the users an account follows. Stores the result to n_users.
    :param classifier: classifier
    :param column_list: column list to be used. Any user's friends are matched against this column list
    :param sql: SQL returning the users to be inferenced, their label (combined_rating) and their friend IDs
    :param min_matches: minimum number of friends that must be found in column_list for the user to get a
    prediction. More connections = more accurate prediction result
    :return: number of rows processed
    """
    start = time.time()
    friends = db_functions.select_from_db(sql)
    input_dataset_length = len(friends)
    print(f"SQL fetching time: {time.time() - start}")
    if len(friends) == 0:
        return 0

    friend_set = set(friends['follows_ids'].values.tolist())
    friend_list = friends['follows_ids'].values.tolist()
    user_list = friends['user_id'].values.tolist()
    rating_list = friends['combined_rating'].values.tolist()
    del friends

    # Transforms DataFrame into defaultdict
    relationship_dict = defaultdict(lambda: defaultdict(list))
    for i, element in enumerate(friend_list):
        relationship_dict[element][0].append(user_list[i])
        relationship_dict[element][1].append(rating_list[i])

    conditions_not_met_list = []  # IDs in this list still get a last_seen date in the DB so they are ignored during the next loop
    result_dict = {}

    # Iteration through friend_set and prediction of faction
    for element in tqdm(friend_set):
        common_friends = set(relationship_dict[element][0]) & set(column_list)
        number_of_common_friends = len(common_friends)
        if number_of_common_friends >= min_matches:
            df = pd.DataFrame(0, index=[0], columns=column_list)  # single all-zero row, one column per reference friend
            for friend in relationship_dict[element][0]:
                df.loc[:, friend] = 1
            df = df.iloc[:, :len(column_list)]
            prediction_proba = classifier.predict_proba(df.values.tolist())  # pure predict
            text, conf = helper_functions.conf_value("LR", prediction_proba, min_boundary=0.5, max_boundary=1)
            result_dict[element] = [text, conf, number_of_common_friends]
        else:
            conditions_not_met_list.append(element)

    del friend_set
    del friend_list
    del user_list
    del rating_list

    timestamp = db_functions.staging_timestamp()
    result_df = pd.DataFrame(result_dict).T
    result_df['last_seen'] = timestamp
    rows_processed = len(result_df)
    if rows_processed > 0:  # checks if data has been written to the DF
        db_functions.df_to_sql(result_df, "temp_table", "replace")
        update_sql = """update n_users set bert_friends_ml_result = "0",
        bert_friends_ml_conf = cast("1" as numeric),
        bert_friends_ml_count = cast ("2" as integer),
        bert_friends_ml_last_seen = temp_table.last_seen
        from temp_table where cast (id as text) = temp_table."index" """
        start = time.time()
        db_functions.update_table(update_sql)
        db_functions.drop_table("temp_table")
        print(f"Update time: {time.time() - start}")
    else:
        print(f"WARNING: 0 new ratings generated despite an input dataset of {input_dataset_length} rows.")

    if len(conditions_not_met_list) > 0:
        stamps = [timestamp for elm in conditions_not_met_list]
        zipped = list(zip(conditions_not_met_list, stamps))
        db_functions.df_to_sql(pd.DataFrame(zipped), "temp_table", drop='replace')
        sql = 'update n_users set bert_friends_ml_last_seen = temp_table."1" from temp_table where n_users.id::text = temp_table."0"'
        db_functions.update_table(sql)
        #db_functions.drop_table('temp_table')
    return rows_processed
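# Illustrative call (a sketch, not from the original code; the pickle names and SQL are assumptions):
# clf = db_functions.load_pickle("friend_classifier.pkl")        # hypothetical pickled sklearn classifier
# cols = db_functions.load_pickle("friend_column_list.pkl")      # hypothetical reference friend column list
# sql = "select user_id, combined_rating, follows_ids from ..."  # must return the three columns used above
# rows = inference_bert_friends(clf, cols, sql, min_matches=5)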
        lr_pol_last_analysed text COLLATE pg_catalog."default",
        result_bert_friends text COLLATE pg_catalog."default",
        bert_friends_conf numeric,
        bert_friends_last_seen text COLLATE pg_catalog."default",
        bf_left_number integer,
        bf_right_number integer,
        combined_rating text COLLATE pg_catalog."default",
        combined_conf numeric,
        private_profile boolean,
        batch integer,
        bert_friends_ml_result text COLLATE pg_catalog."default",
        bert_friends_ml_conf numeric,
        bert_friends_ml_count integer,
        bert_friends_ml_last_seen text COLLATE pg_catalog."default"
        )"""
    db_functions.update_table(sql_table)

    # Create table n_followers
    table_name = "n_followers"
    sql_table = f"""
    CREATE TABLE public.{table_name}
    (
        index integer NOT NULL DEFAULT nextval('n_likes_index_seq'::regclass),
        username text COLLATE pg_catalog."default",
        user_id bigint,
        follows_users text COLLATE pg_catalog."default",
        follows_ids text COLLATE pg_catalog."default",
        retrieve_date text COLLATE pg_catalog."default"
    )
    """
    db_functions.update_table(sql_table)
def prediction_launcher(table_name: str, BERT_model, sql: str, write_to_db: bool = True,
                        TFIDF_pol_unpol_conv=0, Algo_pol_unpol=0):
    """
    Loads 200 tweets (one page call) per user and sends them to BERT for inference.
    :param table_name: name of temp table used to store results
    :param BERT_model: BERT model
    :param sql: statement providing the users to be inferenced
    :param write_to_db: True or False
    :param TFIDF_pol_unpol_conv: TFIDF converter (optional)
    :param Algo_pol_unpol: Random Forest classifier (optional)
    :return:
    """
    start_time_overal = time.time()
    cur_date = str(date.today())  # date for last_seen columns
    #methods = ['pol', 'LR']
    methods = ['LR']
    data = []
    update_to_invalid_list = []

    # This DF will store all prediction results
    df_pred_data = pd.DataFrame(data, columns=[
        'user_id', 'screen_name', 'pol', 'unpol', 'pol_text', 'pol_conf', 'pol_time',
        'left', 'right', 'lr_text', 'lr_conf', 'lr_time', 'analyse_date'
    ])

    sql_time_start = time.time()
    df = db_functions.select_from_db(sql)
    print(f"############## --- SQL Select time : {time.time() - sql_time_start} --- #############")
    if df.shape[1] != 2:
        print("ERROR: DF must ONLY have columns user_id and username")
        gc.collect()
        sys.exit()
    gc.collect()

    for index, element in tqdm(df.iterrows(), total=df.shape[0]):
        start_time = time.time()
        user_id = element[0]
        screen_name = element[1]

        def tweet_download_and_lang_detect(df_tweets, user_id, update_to_invalid_list):
            """
            Calls language detection and checks if enough German tweets remain.
            If it found almost enough German tweets, it loads more. If it found almost none, it aborts.
            :param df_tweets: 0 during first run, dataframe with tweets during later runs
            :param user_id: Twitter user_id for tweet download and language check
            :param update_to_invalid_list: list of users that cannot be downloaded from. Will be appended to if applicable.
            :return: df_tweets, update_to_invalid_list, abort_loop, len_df
            """
            if isinstance(df_tweets, int):
                df_tweets = TwitterAPI.API_tweet_multitool(
                    user_id, 'temp', pages=1, method='user_timeline',
                    append=False, write_to_db=False)  # fills DF with 200 tweets of 1 page
                df_tweets = helper_functions.lang_detect(df_tweets)
            else:
                df_tweets_additions = TwitterAPI.API_tweet_multitool(
                    user_id, 'temp', pages=1, method='user_timeline',
                    append=False, write_to_db=False)  # fills DF with 200 tweets of 1 page
                df_tweets_additions = helper_functions.lang_detect(df_tweets_additions)
                if isinstance(df_tweets_additions, pd.DataFrame):
                    df_tweets = pd.concat([df_tweets, df_tweets_additions])
                    df_tweets.reset_index(inplace=True)
                    del df_tweets['index']
            # if df_tweets is None:  # no tweets found or all tweets deleted (non-German)
            #     abort_loop = True
            #     return df_tweets, update_to_invalid_list, abort_loop
            len_df = helper_functions.dataframe_length(df_tweets)
            if len_df <= 50:
                # if almost no tweets are German, don't try to get more German tweets from this user.
                # It would take too many page loads.
                update_to_invalid_list.append(user_id)
                abort_loop = True
            elif len_df >= 200:
                abort_loop = True
            else:  # if too few tweets are German, load more tweets to get a better result
                abort_loop = False
            gc.collect()
            return df_tweets, update_to_invalid_list, abort_loop, len_df

        df_tweets = 0
        # tries two times to get at least 200 German tweets, if the first attempt returns fewer than 200 German tweets
        for i in range(2):
            df_tweets, update_to_invalid_list, abort_loop, len_df = tweet_download_and_lang_detect(
                df_tweets, user_id, update_to_invalid_list)
            if abort_loop:
                break

        if len_df > 0:
            for method in methods:
                prediction_result = []
                if method == 'pol':
                    if TFIDF_pol_unpol_conv == 0 or Algo_pol_unpol == 0:
                        print("Warning: No Political/Unpolitical classifier given. Check function parameters.")
                    else:
                        prediction_result.append(TFIDF_inference.TFIDF_inference(
                            df_tweets['tweet'], TFIDF_pol_unpol_conv, Algo_pol_unpol))
                if method == 'LR':
                    #prediction_result.append(inference_political_bert.bert_predictions(df_tweets['tweet'], BERT_model))
                    prediction_result.append(bert_predictions(df_tweets['tweet'], BERT_model))
                runtime = int(time.time() - start_time)

                # returns text interpretation of inference
                text, conf = helper_functions.conf_value(
                    method, prediction_result, max_boundary=len(df_tweets))  # result and confidence score

                df_pred_data.at[index, 'user_id'] = user_id
                df_pred_data.at[index, 'screen_name'] = screen_name
                df_pred_data.at[index, 'analyse_date'] = cur_date
                # TODO: If you store the column names in variables that update depending on the method, you only need
                # one block
                pred_result_zero = 'left' if method == "LR" else 'pol'
                pred_result_one = 'right' if method == "LR" else 'unpol'
                df_pred_data.at[index, pred_result_zero] = prediction_result[0][0]
                df_pred_data.at[index, pred_result_one] = prediction_result[0][1]
                if method == "LR":
                    df_pred_data.at[index, 'left'] = prediction_result[0][0]
                    df_pred_data.at[index, 'right'] = prediction_result[0][1]
                    df_pred_data.at[index, 'lr_text'] = text
                    df_pred_data.at[index, 'lr_conf'] = conf
                    df_pred_data.at[index, 'lr_time'] = runtime
                else:
                    df_pred_data.at[index, 'pol'] = prediction_result[0][0]
                    df_pred_data.at[index, 'unpol'] = prediction_result[0][1]
                    df_pred_data.at[index, 'pol_text'] = text
                    df_pred_data.at[index, 'pol_conf'] = conf
                    df_pred_data.at[index, 'pol_time'] = runtime

        # print("screen_name,Pol,Unpol,Pol_Time,Left,Right,LR_Time")
        # for index, element in df_pred_data.iterrows():
        #     print(
        #         f"{element['user_id']},{element['screen_name']},{element['pol']}"
        #         f",{element['unpol']},{element['pol_time']},{element['left']},{element['right']},{element['lr_time']}")
        # print("\n")

        if (write_to_db is True and index != 0 and index % batch_size == 0) or (
                write_to_db is True and (df.shape[0]) == (index + 1)):
            # saves data every batch_size iterations OR when df has no further rows
            if len(update_to_invalid_list) > 0:
                invalids = pd.DataFrame(update_to_invalid_list)
                invalids['cur_date'] = cur_date
                db_functions.df_to_sql(invalids, "temp_invalids", drop='replace')
                update_sql = """update n_users set lr = 'invalid', pol = 'invalid',
                lr_pol_last_analysed = temp_invalids.cur_date
                from temp_invalids where id = temp_invalids."0" """
                db_functions.update_table(update_sql)
                db_functions.drop_table("temp_invalids")
            if helper_functions.dataframe_length(df_pred_data) > 0:
                db_functions.df_to_sql(df_pred_data, table_name, drop='replace')
                update_sql = f"""update n_users set lr = lr_text, lr_conf = cast (a.lr_conf as numeric),
                pol = pol_text, pol_conf = cast (a.pol_conf as numeric),
                lr_pol_last_analysed = analyse_date
                from {table_name} a where id = cast(user_id as bigint)"""
                db_functions.update_table(update_sql)  # update n_users table with new results
                print(f"Data written to table: {table_name}.")
            gc.collect()
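# Illustrative usage sketch (not from the original module; the table name, SQL and model loading are assumptions):
# bert_model = inference_political_bert.load_model()   # hypothetical loader for the fine-tuned BERT model
# sql = "select id, screen_name from n_users where lr is null limit 1000"
# prediction_launcher(table_name="temp_pred_results", BERT_model=bert_model, sql=sql, write_to_db=True)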
def combined_scores_calc_launcher(sql: str, bert_friends_high_confidence_capp_off,
                                  self_conf_high_conf_capp_off, min_required_bert_friend_opinions):
    """
    Calculates the combined score from a user's self-LR score and the user's bert_friend score.
    :return:
    """
    # limit = 1000
    # sql = f"select id, screen_name, lr, lr_conf, result_bert_friends, bert_friends_conf, bf_left_number,
    # bf_right_number from n_users where lr is not null limit {limit}"
    # sql = f"select id, screen_name, lr, lr_conf, result_bert_friends, bert_friends_conf, bf_left_number,
    # bf_right_number from n_users where lr is not null or result_bert_friends is not null"
    df = db_functions.select_from_db(sql)
    df.fillna(0, inplace=True)

    count_rated_accounts = 0
    count_uncategorized_accounts = 0
    count_rating_less_accounts = 0
    count_to_few_bert_friends_to_rate_and_LRself_is_invalid = 0
    count_bert_friends_result_is_mediocre = 0

    id_list = df['id'].to_list()
    id_dict = {i: 0 for i in id_list}

    # ToDo: Runtime 30 minutes. Change to dict-based processing.
    for index, element in tqdm(df.iterrows(), total=df.shape[0]):
        result, rated_accounts, rating_less_accounts, to_few_bert_friends_to_rate_and_LRself_is_invalid, \
        bert_friends_result_is_mediocre, uncategorized_accounts = calculate_combined_score(
            bert_friends_high_confidence_cap_off=bert_friends_high_confidence_capp_off,
            self_conf_high_conf_cap_off=self_conf_high_conf_capp_off,
            min_required_bert_friend_opinions=min_required_bert_friend_opinions,
            user_id=element[0],
            self_lr=element[2],
            self_lr_conf=element[3],
            bert_friends_lr=element[4],
            bert_friends_lr_conf=element[5],
            number_of_bert_friends_L=element[6],
            number_of_bert_friends_R=element[7],
            BERT_ML_rating=element[8],
            BERT_ML_conf=element[9])
        id_dict[element[0]] = result
        count_rated_accounts += rated_accounts
        count_rating_less_accounts += rating_less_accounts
        count_to_few_bert_friends_to_rate_and_LRself_is_invalid += to_few_bert_friends_to_rate_and_LRself_is_invalid
        count_bert_friends_result_is_mediocre += bert_friends_result_is_mediocre
        count_uncategorized_accounts += uncategorized_accounts

    print("\n\n")
    print(f"ratingless_accounts: {count_rating_less_accounts}")
    print(f"to_few_bert_friends_to_rate_and_LRself_is_invalid_or_unknown_or_of_low_conf: "
          f"{count_to_few_bert_friends_to_rate_and_LRself_is_invalid}")
    print(f"bert_friends_result_is_mediocre: {count_bert_friends_result_is_mediocre}")
    print(f"uncategorized_accounts: {count_uncategorized_accounts}")
    total_rating_less_accounts = count_rating_less_accounts + \
                                 count_to_few_bert_friends_to_rate_and_LRself_is_invalid + \
                                 count_bert_friends_result_is_mediocre + \
                                 count_uncategorized_accounts
    print(f"\nAccounts without rating: {total_rating_less_accounts}")
    print(f"Rated accounts: {count_rated_accounts}")
    print("\n\n")
    print("Calculation done. Writing results to DB.")

    del df
    id_dict = {k: v for k, v in id_dict.items() if v != 0}
    df_result = pd.DataFrame(id_dict).transpose()
    df_result = df_result.replace('null', np.NaN)
    if len(df_result) == 0:
        print("No new data.")
    else:
        print(f"{len(df_result)} new results found.")
        db_functions.df_to_sql(df_result, "temp_table", drop='replace')
        update_sql = 'update n_users set combined_rating = t."0", combined_conf = cast(t."1" as numeric) ' \
                     'from temp_table t where n_users.id = t.index'
        db_functions.update_table(update_sql)  # runtime 8 minutes
        db_functions.drop_table("temp_table")
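# Illustrative call (a sketch; the SQL and threshold values below are assumptions, not taken from the project config):
# sql = ("select id, screen_name, lr, lr_conf, result_bert_friends, bert_friends_conf, bf_left_number, "
#        "bf_right_number, bert_friends_ml_result, bert_friends_ml_conf from n_users")
# combined_scores_calc_launcher(sql, bert_friends_high_confidence_capp_off=0.7,
#                               self_conf_high_conf_capp_off=0.7, min_required_bert_friend_opinions=10)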
def friend_rating_launcher(sql: str, get_data_from_DB: bool) -> None:
    # def bert_friends_score(get_data_from_DB):
    """Refreshes the score for all users in the DB who...
    1) have a BERT LR rating and
    2) follow someone in n_followers
    Writes the result to table n_users (total runtime 87 min)
    """
    timestamp = db_functions.staging_timestamp()
    start_time = time.time()
    if get_data_from_DB is True:
        # Runtime 18 min
        # Bert_Friends: users to be rated and the scores of their friends
        df = db_functions.select_from_db(sql)
        db_functions.save_pickle(df, "bert_friends.pkl")
    else:
        df = db_functions.load_pickle("bert_friends.pkl")
    # df = df.iloc[:50000,:]

    df_sub0 = df.groupby(['follows_ids', 'bert_self']).size().unstack(fill_value=0)
    df_sub1 = df.groupby(['follows_ids', 'bert_friends']).size().unstack(fill_value=0)
    del df
    result = df_sub1.join(df_sub0, lsuffix='_friend_Bert', rsuffix='_self_Bert')
    del df_sub0
    del df_sub1

    user_list = result.index.to_list()
    try:
        left_friend_Bert_list = result['links_friend_Bert'].to_list()
    except KeyError:
        print("Warning: 0 Results. Staging results possibly not copied to facts_hashtags.")
        sys.exit()
    right_friend_Bert_list = result['rechts_friend_Bert'].to_list()
    del result

    user_dict = {}
    for i, user in enumerate(tqdm(user_list)):
        if user not in user_dict:
            user_dict[user] = {}
        right = right_friend_Bert_list[i]
        left = left_friend_Bert_list[i]
        text, conf = helper_functions.conf_value(
            method='LR', prediction_result=[[left, right]], min_boundary=0, max_boundary=left + right)
        user_dict[user]["text"] = text
        user_dict[user]["confidence"] = conf
        user_dict[user]["last_seen"] = timestamp
        user_dict[user]["bf_left_number"] = left
        user_dict[user]["bf_right_number"] = right
    print("User dict created.")
    print(len(user_dict))

    result = pd.DataFrame(user_dict).T
    print("DF transposed.")
    db_functions.df_to_sql(result, "temp_result", drop='replace')
    print("Insert into temp done.")
    sql = "update n_users set result_bert_friends = text, bert_friends_conf = cast(confidence as numeric), " \
          "bert_friends_last_seen = temp_result.last_seen, bf_left_number = temp_result.bf_left_number, " \
          "bf_right_number = temp_result.bf_right_number from temp_result where id = cast (temp_result.index as bigint)"
    db_functions.update_table(sql)
    db_functions.drop_table("temp_result")
    print(f"Runtime in min: {(time.time() - start_time) / 60} ")
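# Illustrative call (a sketch; the exact SQL is an assumption — it must return one row per user/friend pair
# with the columns follows_ids, bert_self and bert_friends expected by the groupby calls above):
# friend_rating_launcher(sql=my_bert_friends_sql, get_data_from_DB=True)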
def get_followers(sql, sql_insert_new_followers, download_limit=12500000, time_limit=False) -> None:
    """
    Takes user IDs in the form of an SQL statement OR a list and retrieves the followers of these users from Twitter.
    :param sql: SQL statement returning the users OR a list of user IDs
    :param sql_insert_new_followers: SQL statement used to write results from temp_table to n_followers
    :param download_limit: maximum number of followers downloaded per account
    :param time_limit: currently unused; the timeout below is hardcoded to one hour
    :return: none
    """
    # Block 1: Check Twitter limits
    startime = time.time()
    #timeout = 86400
    timeout = 3600  # run one iteration or one hour
    limit = TwitterAPI.api_limit()
    ts = limit['resources']['followers']['/followers/ids']['reset']
    limit = limit['resources']['followers']['/followers/ids']['remaining']  # remaining limit for follower retrieval
    print("Reset Time: " + str(datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')))
    if limit == 0:
        print('Twitter API limit used up. Aborting query.')
        print("Reset Time: " + str(datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')))
        sys.exit()
    else:
        print("Current API Follower Limit: " + str(limit))

    # Block 2: Get users whose followers we want from DB or list
    if isinstance(sql, str):
        df = db_functions.select_from_db(sql)
    elif isinstance(sql, list):
        df = pd.DataFrame(sql, columns=['id'])
    else:
        print("Error: Must either use SQL statement or list")
        sys.exit()

    # Block 3: Get followers for each of the users retrieved in Block 2
    for index, element in tqdm(df.iterrows(), total=df.shape[0]):
        try:  # this option works if we load followers for cores
            id = element['user_id']
            screen_name = element['screen_name']
        except KeyError:  # this setting is used if we load followers for anything but cores
            id = element['id']
            screen_name = 0

        # Query Twitter API to find out how many followers a user has
        try:
            scr_name, followers_count = TwitterAPI.API_get_single_user_object(id)
        #except TypeError:
        #    print("STOPP!")  # Did happen once. Did not happen again when trying to investigate
        except ValueError:
            print(f"User {id} not found. Skipping.")
            continue
        if followers_count >= download_limit:
            continue
        print("Getting Followers of " + str(id) + " | Element " + str(index + 1) + " of " + str(len(df)))
        TwitterAPI.API_Followers(screen_name, id, download_limit=download_limit)  # <== API follower retrieval
        db_functions.update_table(sql_insert_new_followers)

        # Block 4: Write follower retrieve date back to n_cores
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        if screen_name != 0:
            sql = "update n_cores set followers_retrieved ='" + str(timestamp) + "' where user_id = " + str(id)
            db_functions.update_table(sql)
        if time.time() - startime > timeout:
            print("###Timeout reached!###")
            return id
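# Illustrative call (a sketch; both SQL statements are assumptions shaped after the columns used above):
# users_sql = "select user_id, screen_name from n_cores where followers_retrieved is null"
# insert_sql = ("insert into n_followers (username, user_id, follows_users, follows_ids, retrieve_date) "
#               "select username, user_id, follows_users, follows_ids, retrieve_date from temp_table")
# get_followers(users_sql, insert_sql, download_limit=500000)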
if config['TASKS'].getboolean('download_hashtag'):
    since = config['DOWNLOAD_HASHTAG'].get('since')
    until = config['DOWNLOAD_HASHTAG'].get('until')
    download_parent_tweets = config['DOWNLOAD_HASHTAG'].getboolean('download_parent_tweets')
    table_name = sn_scrape.hashtag_download_launcher(
        hashtag=hashtag, since=since, until=until, download_parent_tweets=download_parent_tweets)
    copy_sql = config['DOWNLOAD_HASHTAG'].get('copy_sql')
    copy_sql = copy_sql.replace("INSERT_HASHTAG", "s_h" + hashtag)
    copy_sql = copy_sql.replace("INSERT_TABLE", table_name)
    db_functions.update_table(copy_sql)

if config['TASKS'].getboolean('download_friends'):
    sql = config['DOWNLOAD_FRIENDS'].get('sql')
    sql = sql.replace("INSERT_HASHTAG", "%" + hashtag + "%")
    get_friends(sql)

    # Copy friends to n_followers table.
    # This process makes an update of the downloaded followers (many millions) unnecessary, since it gradually adds
    # followers to big accounts using the followership lists of small accounts.
    # Insert all friends of a hashtag into n_followers:
    # same effect as downloading all the followers of many big accounts, but much, much faster.
    sql = """INSERT INTO n_followers (username, user_id, follows_users, follows_ids, retrieve_date)
    select follows_users, follows_ids::bigint, follows_users, f.user_id, retrieve_date
    FROM n_friends f, facts_hashtags fh
def tweet_details_download_launcher(table_name: str, hashtag: str, bulk_size: int = 1000,
                                    download_parent_tweets=True):
    """
    1. Calls tweet downloader
    2. Adds details to tweet IDs in staging table via update
    3. If tweets are a reply to another tweet, those parent tweets can be downloaded via option download_parent_tweets
    :param table_name: staging table name which will be updated
    :param hashtag: hashtag scraped
    :param bulk_size: number of tweets to be processed in one function call
    :param download_parent_tweets: if True: loops through all tweets until no further parent elements can be found
    :return: number of new tweets fetched
    """
    parent_sql = f"""select * from {table_name} where tweet is null or (retweet is not null and retweet not in
    (select id from {table_name})) limit {bulk_size}"""
    df_parent = db_functions.select_from_db(parent_sql)
    df_parent['id'] = df_parent['retweet']  # replaces tweet ID with parent ID. Otherwise previously downloaded tweets would be downloaded again

    sql = f"select * from {table_name} where tweet is null limit {bulk_size}"
    df = db_functions.select_from_db(sql)

    download_parents = helper_functions.dataframe_length(df) == 0 and download_parent_tweets is True
    if download_parents:
        df = df_parent

    for index, element in df.iterrows():
        error = False
        # downloads details for tweets from staging table
        result = API_get_tweet_details(element[1], sleep=True)
        if result == 'Error: Not authorized':  # tweet is set to private
            error = True
        if result == 'Error: No status found with that ID.' or result == "Undefined Error":
            #sql = f"update {table_name} set tweet = 'deleted' where id = {element[1]}"
            #db_functions.update_table(sql)
            df.loc[df.id == element[1], 'tweet'] = 'deleted'
            error = True
        if error is False:
            df.iloc[index:index + 1, 4:5] = result[1]  # date
            df.iloc[index:index + 1, 5:6] = result[2]  # tweet
            df.iloc[index:index + 1, 6:7] = hashtag  # hashtag
            df.iloc[index:index + 1, 7:8] = result[0]  # user_id
            df.iloc[index:index + 1, 8:9] = result[4]  # screen_name
            df.iloc[index:index + 1, 9:10] = result[3]  # name
            df.iloc[index:index + 1, 11:12] = result[5]  # in reply to tweet id
            df.iloc[index:index + 1, 18:19] = table_name
            #print("Fetched Tweet: {}".format(element[1]))

    if len(df) == 0:
        return 0
    db_functions.df_to_sql(df, 'temp_df', 'replace')
    new_tweets_fetched = helper_functions.dataframe_length(df)

    # update staging table with values from temp_df
    if download_parents is False:
        sql = f"""update {table_name} a set date = b.date, tweet = b.tweet, hashtags = b.hashtags,
        user_id = cast (b.user_id as bigint), username = b.username, name = b.name,
        retweet = cast (b.retweet as bigint), staging_name = b.staging_name
        from (select * from temp_df) b where a.id = b.id"""
    else:
        sql = f"""INSERT INTO {table_name}
        SELECT index::bigint, id, conversation_id::bigint, created_at, date, tweet, hashtags, user_id, username,
        name, link::bigint, retweet::bigint, nlikes::bigint, nreplies::bigint, nretweets::bigint,
        quote_url::bigint, user_rt_id::bigint, user_rt::bigint, staging_name
        FROM temp_df"""
        print(f"{new_tweets_fetched} parent tweets added.")
    db_functions.update_table(sql)
    db_functions.drop_table('temp_df')
    return new_tweets_fetched