def main():
    """Clean scraped Instagram hashtag posts and append them to a newline-delimited JSON file."""
    args = get_args()
    with open('/mnt/instagram/' + args.file_name + '.json', 'r') as f:
        for line in tqdm(f):
            try:
                # In case there are records that are not well-formatted JSON,
                # fall back to ast.literal_eval.
                try:
                    data = json.loads(line)
                except (ValueError, SyntaxError):
                    data = ast.literal_eval(line)
                caption_list = data['data']['hashtag']['edge_hashtag_to_media']['edges']
                dict_output = {}
                for i in range(len(caption_list)):
                    date = datetime.datetime.fromtimestamp(
                        caption_list[i]['node']['taken_at_timestamp']).date()
                    # Keep new data only.
                    if datetime.datetime(date.year, date.month, date.day) >= \
                            datetime.datetime(2020, args.start_month, args.start_date):
                        dict_output['created_at'] = '{}/{}/{}'.format(date.month, date.day, date.year)
                        dict_output['user_id'] = caption_list[i]['node']['owner']['id']
                        text = caption_list[i]['node']['edge_media_to_caption']['edges'][0]['node']['text']
                        dict_output['text'] = sentences_cleaner(text)
                        dict_output['hashtags'] = re.findall(r'#[A-Za-z0-9_]+', text)
                        dict_output['comments_count'] = caption_list[i]['node']['edge_media_to_comment']['count']
                        # Count of likes, including both organic and sponsored likes.
                        dict_output['likes_count'] = caption_list[i]['node']['edge_media_preview_like']['count']
                        with open(DATA_PATH + "Cleaned/" + "cleaned_insta_1.txt", 'a+') as f_out:
                            json.dump(dict_output, f_out)
                            f_out.write('\n')
            except Exception as e:
                print(e)
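# NOTE: `sentences_cleaner` is used throughout these scripts but defined elsewhere in the
# repository. Below is a minimal sketch of what such a cleaner might look like, assuming it
# lowercases text and strips URLs, mentions, and punctuation. This is a hypothetical stand-in,
# not the repository's actual implementation.
import re


def sentences_cleaner_sketch(text):
    text = text.lower()
    text = re.sub(r'https?://\S+', ' ', text)   # drop URLs
    text = re.sub(r'[@#]\w+', ' ', text)        # drop mentions and hashtags
    text = re.sub(r'[^a-z0-9\s-]', ' ', text)   # drop punctuation and emoji
    return re.sub(r'\s+', ' ', text).strip()    # collapse whitespace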
def main():
    """Extract similar keywords per topic from the GDELT Word2Vec models and count daily keyword occurrences."""
    # Load the model trained on the whole GDELT corpus.
    model_all = gensim.models.Word2Vec.load(
        '/mnt/louis/Issue Monitoring/model/w2v_gdelt_all_300.model')
    SOURCE_general = '/mnt/louis/Issue Monitoring/topic/general/'
    SOURCE_new_normal = '/mnt/louis/Issue Monitoring/topic/new_normal/'
    _, _, filenames_general = next(os.walk(SOURCE_general))
    _, _, filenames_new_normal = next(os.walk(SOURCE_new_normal))

    # Import the GDELT corpus.
    df_gdelt = pd.read_csv('/mnt/louis/Dataset/GDELT/preprocessed_gdelt.csv')
    df_gdelt['created_at'] = pd.to_datetime(df_gdelt['date'])
    df_gdelt['created_week'] = df_gdelt['created_at'].apply(lambda x: x.weekofyear)
    df_gdelt = df_gdelt[(df_gdelt.created_week >= 13)].reset_index(drop=True)
    df_gdelt['created_date'] = df_gdelt['created_at'].apply(lambda x: x.date())

    # Create the similar-keyword dataframe.
    df = pd.DataFrame(columns=[
        'topic', 'keyword', 'similar_keyword', 'cosine_similarity',
        'keyword_count', 'similar_keyword_count'
    ])

    # Iterate through every topic corpus.
    print("-------------------------- Extracting Similar General Keyword for each Topic")
    for topic_file in tqdm(filenames_general):
        topic = ' '.join(re.sub('_', ' ', topic_file.split('.')[0]).split()[1:])
        keyword_list = []
        with open(SOURCE_general + topic_file, "r") as f:
            for line in f:
                keyword_list.append(sentences_cleaner(line))
        keyword_list = [kw for kw in keyword_list if kw not in forbidden_keywords_list]
        for word in keyword_list:
            try:
                similar_word_list = model_all.most_similar(word, topn=10)
                for similar_word in similar_word_list:
                    if (similar_word[0] not in forbidden_keywords_list) and (not similar_word[0].isdigit()):
                        df = df.append(
                            {
                                'topic': topic,
                                'keyword': word,
                                'keyword_count': model_all.wv.vocab[word].count,
                                'similar_keyword': similar_word[0],
                                'similar_keyword_count': model_all.wv.vocab[similar_word[0]].count,
                                'cosine_similarity': similar_word[1]
                            },
                            ignore_index=True)
            except Exception as e:
                print(e)
    df.to_csv('/mnt/louis/Issue Monitoring/gdelt_TOP_10_Similar_Issue_Keyword.csv', index=False)

    # Extract the count of all aggregated keywords for each topic, daily.
    print("-------------------------- Extracting Keyword Count for each Topic Daily")
    # Create the daily keyword-count dataframe.
    df_topic_daily = pd.DataFrame(columns=['topic', 'date', 'count', 'tweets_volume'])
    # Iterate through every topic and the articles of each day.
    for topic in tqdm(list(df['topic'].unique())):
        df_topic_keyword_temp = df[df.topic == topic].reset_index(drop=True)
        keyword_list = list(df_topic_keyword_temp['similar_keyword'].unique())
        for keyword in list(df_topic_keyword_temp['keyword'].unique()):
            keyword_list.append(keyword)
        keyword_list = [kw for kw in keyword_list if kw not in forbidden_keywords_list]
        for date in list(df_gdelt['created_date'].unique()):
            cnt = 0
            df_filter_date = df_gdelt[df_gdelt['created_date'] == date]
            corpus = df_filter_date['cleaned_text'].to_list()
            for tweets in corpus:
                for keyword in keyword_list:
                    if (keyword in tweets) and (keyword not in forbidden_keywords_list) and (not keyword.isdigit()):
                        cnt += 1
            df_topic_daily = df_topic_daily.append(
                {
                    'topic': topic,
                    'date': date,
                    'count': cnt,
                    'tweets_volume': len(df_filter_date)
                },
                ignore_index=True)
    df_topic_daily.to_csv('/mnt/louis/Issue Monitoring/gdelt_daily_keyword_count.csv', index=False)

    # Extract the weekly evolution of similar topic keywords.
    print("-------------------------- Extracting Evolution of Similar Keyword for each Topic Weekly")
    # Create the similar-keyword evolution dataframe.
    df_keyword_evolution = pd.DataFrame(columns=[
        'topic', 'week_of_the_year', 'keyword', 'similar_keyword',
        'cosine_similarity', 'keyword_count', 'similar_keyword_count'
    ])
    # Iterate through every topic and its similar keywords in each week.
    for topic in tqdm(list(df['topic'].unique())):
        keyword_list = []
        df_topic_keyword_temp = df[df.topic == topic].reset_index(drop=True)
        for keyword in list(df_topic_keyword_temp['keyword'].unique()):
            keyword_list.append(keyword)
        keyword_list = [kw for kw in keyword_list if kw not in forbidden_keywords_list]
        for week in list(df_gdelt['created_week'].unique()):
            # Load the model trained on that week's corpus.
            model = gensim.models.Word2Vec.load(
                '/mnt/louis/Issue Monitoring/model/w2v_gdelt_week_{}_300.model'.format(week))
            for word in keyword_list:
                try:
                    similar_word_list = model.most_similar(word, topn=10)
                    for similar_word in similar_word_list:
                        if (similar_word[0] not in forbidden_keywords_list) and (not similar_word[0].isdigit()):
                            df_keyword_evolution = df_keyword_evolution.append(
                                {
                                    'topic': topic,
                                    'week_of_the_year': week,
                                    'keyword': word,
                                    'similar_keyword': similar_word[0],
                                    'cosine_similarity': similar_word[1],
                                    'keyword_count': model.wv.vocab[word].count,
                                    'similar_keyword_count': model.wv.vocab[similar_word[0]].count
                                },
                                ignore_index=True)
                except Exception as e:
                    print(e)
    df_keyword_evolution.to_csv(
        '/mnt/louis/Issue Monitoring/gdelt_TOP_10_Evolution_Weekly_Similar_Issue_Keyword.csv',
        index=False)

    ########################################## New Normal Keyword ##########################################
    # Iterate through every new-normal topic corpus.
    new_normal_keyword_list = []
    for topic_file in filenames_new_normal:
        with open(SOURCE_new_normal + topic_file, "r") as f:
            for line in f:
                new_normal_keyword_list.append(sentences_cleaner(line))
    new_normal_keyword_list = [
        kw for kw in new_normal_keyword_list if kw not in forbidden_keywords_list
    ]
    # Create the new-normal similar-keyword dataframe.
    df_new_normal = df[df['keyword'].isin(new_normal_keyword_list)].reset_index(drop=True)

    # Extract the count of all aggregated new-normal keywords for each topic, daily.
    print("-------------------------- Extracting New Normal Keyword Count for each Topic Daily")
    # Create the daily keyword-count dataframe.
    df_topic_daily_new_normal = pd.DataFrame(columns=['topic', 'date', 'count', 'tweets_volume'])
    # Iterate through every topic and the articles of each day.
    for topic in tqdm(list(df_new_normal['topic'].unique())):
        df_topic_keyword_temp = df_new_normal[df_new_normal.topic == topic].reset_index(drop=True)
        keyword_list = list(df_topic_keyword_temp['similar_keyword'].unique())
        for keyword in list(df_topic_keyword_temp['keyword'].unique()):
            keyword_list.append(keyword)
        keyword_list = [kw for kw in keyword_list if kw not in forbidden_keywords_list]
        for date in list(df_gdelt['created_date'].unique()):
            cnt = 0
            df_filter_date = df_gdelt[df_gdelt['created_date'] == date]
            corpus = df_filter_date['cleaned_text'].to_list()
            for tweets in corpus:
                for keyword in keyword_list:
                    if (keyword in tweets) and (keyword not in forbidden_keywords_list) and (not keyword.isdigit()):
                        cnt += 1
            df_topic_daily_new_normal = df_topic_daily_new_normal.append(
                {
                    'topic': topic,
                    'date': date,
                    'count': cnt,
                    'tweets_volume': len(df_filter_date)
                },
                ignore_index=True)
    df_topic_daily_new_normal.to_csv(
        '/mnt/louis/Issue Monitoring/gdelt_daily_keyword_count_new_normal.csv', index=False)
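# NOTE: the per-week models loaded above ('w2v_gdelt_week_{}_300.model') are not built in this
# script. Below is a hypothetical sketch of how such a 300-dimensional model could be trained
# with gensim 3.x (where the dimension argument is `size`; gensim 4.x renamed it to
# `vector_size`). `weekly_sentences`, the hyperparameters, and the output path are placeholders,
# not the repository's actual training setup.
import gensim

weekly_sentences = [['contoh', 'kalimat', 'pertama'], ['contoh', 'kalimat', 'kedua']]
model_sketch = gensim.models.Word2Vec(weekly_sentences, size=300, window=5, min_count=1, workers=4)
model_sketch.save('w2v_gdelt_week_13_300_sketch.model')  # illustrative path only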
def main():
    """Flag each unique tweet with weekly topic indicators based on Word2Vec-expanded keyword lists (user level)."""
    args = get_args()
    # Import the tweets corpus.
    df_tweets = pd.read_csv('/mnt/louis/Dataset/Final/agg_final_data.csv',
                            usecols=[
                                'created_at', 'text', 'user_id', 'favorite_count',
                                'retweet_count', 'reply_count', 'verified', 'sentiment_score'
                            ])
    df_tweets['created_at'] = pd.to_datetime(df_tweets['created_at'])
    df_tweets['created_week'] = df_tweets['created_at'].apply(lambda x: x.weekofyear)
    df_tweets = df_tweets[(df_tweets.created_week >= args.start_week)].reset_index(drop=True)
    df_tweets['created_date'] = df_tweets['created_at'].apply(lambda x: x.date())

    # Create the list of unique weeks.
    unique_week_list = list(df_tweets['created_week'].unique())

    # Import the existing similar-keyword dataframes, or create empty ones.
    if args.start_week > 13:
        df_weekly_general = pd.read_csv(
            '/mnt/louis/Issue Monitoring/TOP_10_Evolution_Weekly_Issue_Keyword_user_level.csv')
        df_weekly_general = df_weekly_general[
            df_weekly_general['week_of_the_year'] < args.start_week].reset_index(drop=True)
        df_weekly_new_normal = pd.read_csv(
            '/mnt/louis/Issue Monitoring/TOP_10_Evolution_Weekly_Issue_Keyword_user_level_new_normal.csv')
        df_weekly_new_normal = df_weekly_new_normal[
            df_weekly_new_normal['week_of_the_year'] < args.start_week].reset_index(drop=True)
    else:
        df_weekly_general = pd.DataFrame(columns=[
            'user_id', 'tweets_count', 'week_of_the_year', 'favorite_count',
            'retweet_count', 'reply_count', 'verified', 'sentiment_score'
        ])
        df_weekly_new_normal = pd.DataFrame(columns=[
            'user_id', 'tweets_count', 'week_of_the_year', 'favorite_count',
            'retweet_count', 'reply_count', 'verified', 'sentiment_score'
        ])

    SOURCE_general = '/mnt/louis/Issue Monitoring/topic/general/'
    SOURCE_new_normal = '/mnt/louis/Issue Monitoring/topic/new_normal/'
    _, _, filenames_general = next(os.walk(SOURCE_general))
    _, _, filenames_new_normal = next(os.walk(SOURCE_new_normal))

    # Iterate through every topic corpus, week by week.
    topic_keyword_dict_weekly_general = {}
    topic_keyword_dict_weekly_new_normal = {}
    for week in unique_week_list:
        # Load the model trained on that week's tweets.
        model = gensim.models.Word2Vec.load(
            '/mnt/louis/Issue Monitoring/model/w2v_week_{}_300.model'.format(week))
        for topic_file in tqdm(filenames_general):
            topic = ' '.join(re.sub('_', ' ', topic_file.split('.')[0]).split()[1:])
            keyword_list_temp = []
            with open(SOURCE_general + topic_file, "r") as f:
                for line in f:
                    keyword_list_temp.append(sentences_cleaner(line))
            keyword_list_temp = [
                kw for kw in keyword_list_temp if kw not in forbidden_keywords_list
            ]
            keyword_list = keyword_list_temp.copy()
            for word in keyword_list_temp:
                try:
                    similar_word_list = model.most_similar(word, topn=10)
                    for similar_word in similar_word_list:
                        if (similar_word[0] not in keyword_list) and (
                                similar_word[0] not in forbidden_keywords_list) and (
                                    not similar_word[0].isdigit()):
                            keyword_list.append(similar_word[0])
                except Exception as e:
                    print(e)
            topic_keyword_dict_weekly_general[str(week) + '_' + topic] = keyword_list
        for topic_file in tqdm(filenames_new_normal):
            topic = ' '.join(re.sub('_', ' ', topic_file.split('.')[0]).split()[1:])
            keyword_list_temp = []
            with open(SOURCE_new_normal + topic_file, "r") as f:
                for line in f:
                    keyword_list_temp.append(sentences_cleaner(line))
            keyword_list_temp = [
                kw for kw in keyword_list_temp if kw not in forbidden_keywords_list
            ]
            keyword_list = keyword_list_temp.copy()
            for word in keyword_list_temp:
                try:
                    similar_word_list = model.most_similar(word, topn=10)
                    for similar_word in similar_word_list:
                        if (similar_word[0] not in keyword_list) and (
                                similar_word[0] not in forbidden_keywords_list) and (
                                    not similar_word[0].isdigit()):
                            keyword_list.append(similar_word[0])
                except Exception as e:
                    print(e)
            topic_keyword_dict_weekly_new_normal[str(week) + '_' + topic] = keyword_list

    # Optimize the loop so that only unique tweets are processed.
    unique_tweets_dict = {}
    df_tweets_index = [i for i in range(len(df_tweets))]
    tweets_list = df_tweets['text'].to_list()
    for dup in list_duplicates(tweets_list):
        unique_tweets_dict[dup[0]] = dup[1]
    unique_df_tweets_index = [
        df_tweets_index[unique_tweets_dict[x][-1]] for x in unique_tweets_dict.keys()
    ]
    length_duplicate_df_tweets_index = [
        len(unique_tweets_dict[x]) for x in unique_tweets_dict.keys()
    ]

    j = 0
    weekly_general_dict = {}
    weekly_new_normal_dict = {}
    for i in tqdm(unique_df_tweets_index):
        dict_temp_weekly_general = {
            'user_id': df_tweets.loc[i, 'user_id'],
            'tweets_count': length_duplicate_df_tweets_index[j],
            'week_of_the_year': df_tweets.loc[i, 'created_week'],
            'favorite_count': df_tweets.loc[i, 'favorite_count'],
            'retweet_count': df_tweets.loc[i, 'retweet_count'],
            'reply_count': df_tweets.loc[i, 'reply_count'],
            'verified': df_tweets.loc[i, 'verified'],
            'sentiment_score': df_tweets.loc[i, 'sentiment_score']
        }
        dict_temp_weekly_new_normal = dict_temp_weekly_general.copy()
        for topic in [
                'confidence in government', 'economic policy', 'employment', 'food access',
                'health care', 'health protocol', 'mobility', 'stigma'
        ]:
            if any(keyword in df_tweets.loc[i, 'text']
                   for keyword in topic_keyword_dict_weekly_general[
                       str(dict_temp_weekly_general['week_of_the_year']) + '_' + topic]):
                dict_temp_weekly_general[topic] = 1
            else:
                dict_temp_weekly_general[topic] = 0
        for topic in [
                'confidence in government new normal', 'economic policy new normal',
                'employment new normal', 'health care new normal',
                'health protocol new normal', 'mobility new normal'
        ]:
            if any(keyword in df_tweets.loc[i, 'text']
                   for keyword in topic_keyword_dict_weekly_new_normal[
                       str(dict_temp_weekly_new_normal['week_of_the_year']) + '_' + topic]):
                dict_temp_weekly_new_normal[topic] = 1
            else:
                dict_temp_weekly_new_normal[topic] = 0
        weekly_general_dict[i] = dict_temp_weekly_general
        weekly_new_normal_dict[i] = dict_temp_weekly_new_normal
        j += 1

    df_weekly_general = df_weekly_general.append(
        pd.DataFrame.from_dict(weekly_general_dict, orient='index'), ignore_index=True)
    df_weekly_new_normal = df_weekly_new_normal.append(
        pd.DataFrame.from_dict(weekly_new_normal_dict, orient='index'), ignore_index=True)
    df_weekly_general.to_csv(
        '/mnt/louis/Issue Monitoring/TOP_10_Evolution_Weekly_Issue_Keyword_user_level.csv',
        index=False)
    df_weekly_new_normal.to_csv(
        '/mnt/louis/Issue Monitoring/TOP_10_Evolution_Weekly_Issue_Keyword_user_level_new_normal.csv',
        index=False)
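# NOTE: `list_duplicates` is imported from a helper module that is not shown here. From the way
# it is used above and in the functions below (dup[0] -> the text, dup[1] -> list of row indices,
# len(dup[1]) -> how often the text occurs), it plausibly looks like the sketch below. This is a
# hypothetical reconstruction, not the repository's actual helper.
from collections import defaultdict


def list_duplicates_sketch(seq):
    positions = defaultdict(list)
    for index, item in enumerate(seq):
        positions[item].append(index)   # group every occurrence of each text
    return positions.items()            # iterable of (text, [indices])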
def main():
    """Predict POS tags for each news item and save the verbs to a file."""
    # Uncomment this block of code if you have not trained the POS-tagging model yet.
    # # 1. Get the corpus.
    # corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_INDONESIAN)
    # # 2. Which tag do we want to predict?
    # tag_type = 'upos'
    # # 3. Make the tag dictionary from the corpus.
    # tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    # print(tag_dictionary.idx2item)
    # # 4. Initialize embeddings.
    # embedding_types: List[TokenEmbeddings] = [
    #     WordEmbeddings('id-crawl'),
    #     WordEmbeddings('id'),
    #     # WordEmbeddings('glove'),
    #     # BertEmbeddings('bert-base-multilingual-cased')
    # ]
    # embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    # # 5. Initialize the sequence tagger.
    # tagger: SequenceTagger = SequenceTagger(hidden_size=256,
    #                                         embeddings=embeddings,
    #                                         tag_dictionary=tag_dictionary,
    #                                         tag_type=tag_type,
    #                                         use_crf=True)
    # # 6. Start training.
    # trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # trainer.train('resources/taggers/example-universal-pos',
    #               learning_rate=0.1,
    #               mini_batch_size=32,
    #               max_epochs=10)

    # 7. Import the news list.
    news_list = []
    with open("/mnt/louis/Dataset/news_corpus.txt", "r", encoding='utf-8') as f:
        for line in f:
            news_list.append(sentences_cleaner(line))

    # 8. Predict the POS tag for each news item and keep the tokens tagged as verbs.
    tag_pos = SequenceTagger.load('resources/taggers/example-universal-pos/best-model.pt')
    for news in news_list:
        sentence = Sentence(news)
        tag_pos.predict(sentence)
        sentence_list = sentence.to_tagged_string().split()
        verb_word_list = []
        for i, token in enumerate(sentence_list):
            # In the tagged string each token is followed by its tag, so the word
            # preceding a '<VERB>' marker is a verb.
            if token == '<VERB>':
                verb_word_list.append(sentence_list[i - 1])
        verbs = ','.join(verb_word_list)
        with open("/mnt/louis/Dataset/news_corpus_verb.txt", "a+") as f_out:
            f_out.write(verbs + '\n')
def main():
    """Extract the URLs shared in tweets, together with the creation date and cleaned text."""
    args = get_args()
    if args.type == 'geo':
        root_path = '/mnt/louis/Dataset/Raw/'
    else:
        root_path = '/mnt/twitter_hashtag_data/'
    with open(root_path + args.file_name + '.txt', 'r') as f:
        for line in tqdm(f):
            try:
                # In case there are tweets that are not well-formatted and make ast fail to
                # compile, fall back to json.loads.
                try:
                    tweet = ast.literal_eval(line)
                except (ValueError, SyntaxError):
                    tweet = json.loads(line)
                try:
                    # Get the full version of the tweet if it is an extended retweet.
                    urls = tweet['retweeted_status']['extended_tweet']['entities']['urls']
                    if urls:
                        created_at = tweet['created_at']
                        text = sentences_cleaner(tweet['retweeted_status']['extended_tweet']['full_text'])
                        line = created_at + ',' + text
                        for i in range(len(urls)):
                            url = urls[i]['expanded_url']
                            if 'twitter' not in url:
                                line += ',' + url
                        with open("/mnt/louis/Dataset/tweet_URL_list.txt", "a+") as f_out:
                            f_out.write(line + '\n')
                except Exception:
                    try:
                        # Plain retweet.
                        urls = tweet['retweeted_status']['entities']['urls']
                        if urls:
                            created_at = tweet['created_at']
                            text = sentences_cleaner(tweet['retweeted_status']['text'])
                            line = created_at + ',' + text
                            for i in range(len(urls)):
                                url = urls[i]['expanded_url']
                                if 'twitter' not in url:
                                    line += ',' + url
                            with open("/mnt/louis/Dataset/tweet_URL_list.txt", "a+") as f_out:
                                f_out.write(line + '\n')
                    except Exception:
                        try:
                            # Extended (non-retweeted) tweet.
                            urls = tweet['extended_tweet']['entities']['urls']
                            if urls:
                                created_at = tweet['created_at']
                                text = sentences_cleaner(tweet['extended_tweet']['full_text'])
                                line = created_at + ',' + text
                                for i in range(len(urls)):
                                    url = urls[i]['expanded_url']
                                    if 'twitter' not in url:
                                        line += ',' + url
                                with open("/mnt/louis/Dataset/tweet_URL_list.txt", "a+") as f_out:
                                    f_out.write(line + '\n')
                        except Exception:
                            # Plain tweet.
                            urls = tweet['entities']['urls']
                            if urls:
                                created_at = tweet['created_at']
                                text = sentences_cleaner(tweet['text'])
                                line = created_at + ',' + text
                                for i in range(len(urls)):
                                    url = urls[i]['expanded_url']
                                    if 'twitter' not in url:
                                        line += ',' + url
                                with open("/mnt/louis/Dataset/tweet_URL_list.txt", "a+") as f_out:
                                    f_out.write(line + '\n')
            except Exception as e:
                print(e)
def main():
    """Clean raw tweets into newline-delimited JSON, resuming after any lines already written."""
    args = get_args()
    i = 1
    if args.type == 'geo':
        root_path = SCRIPT_PATH + '/Dataset/Raw/'
    else:
        root_path = '/mnt/twitter_hashtag_data/'

    # Count the lines already written so that the job can resume where it stopped.
    thres = 0
    if os.path.exists(DATA_PATH + 'Cleaned/' + "cleaned_tweets_{}.txt".format(args.iter)):
        with open(DATA_PATH + 'Cleaned/' + "cleaned_tweets_{}.txt".format(args.iter), "r") as f:
            for line in f:
                thres += 1

    with open(root_path + args.file_name + '.txt', 'r') as f:
        for line in tqdm(f):
            if i > thres:
                try:
                    # In case there are tweets that are not well-formatted and make ast fail
                    # to compile, fall back to json.loads.
                    try:
                        tweet = ast.literal_eval(line)
                    except (ValueError, SyntaxError):
                        tweet = json.loads(line)
                    json_output = {}
                    try:
                        # Get the full version of the tweet if it is a retweet.
                        text = tweet['retweeted_status']['extended_tweet']['full_text']
                        json_output['hashtags'] = tweet['retweeted_status']['extended_tweet']['entities']['hashtags']
                    except Exception:
                        try:
                            text = tweet['retweeted_status']['text']
                            json_output['hashtags'] = tweet['retweeted_status']['entities']['hashtags']
                        except Exception:
                            try:
                                # Get the full version of a non-retweeted extended tweet.
                                text = tweet['extended_tweet']['full_text']
                                json_output['hashtags'] = tweet['extended_tweet']['entities']['hashtags']
                            except Exception:
                                text = tweet['text']
                                json_output['hashtags'] = tweet['entities']['hashtags']
                    if args.type == 'hashtag':
                        json_output['text'] = sentences_cleaner(text)
                    else:
                        json_output['text'] = text
                    if json_output['text'] != '':
                        try:
                            json_output['created_at'] = tweet['retweeted_status']['created_at']
                        except Exception:
                            json_output['created_at'] = tweet['created_at']
                        try:
                            json_output['location'] = tweet['retweeted_status']['place']['full_name'].split(',')[0]
                        except Exception:
                            try:
                                json_output['location'] = tweet['retweeted_status']['place']
                            except Exception:
                                try:
                                    json_output['location'] = tweet['place']['full_name'].split(',')[0]
                                except Exception:
                                    json_output['location'] = tweet['place']
                        try:
                            json_output['tweet_id'] = tweet['retweeted_status']['id_str']
                            json_output['user_id'] = tweet['retweeted_status']['user']['id_str']
                            json_output['verified'] = tweet['retweeted_status']['user']['verified']
                            json_output['reply_count'] = tweet['retweeted_status']['reply_count']
                            json_output['retweet_count'] = tweet['retweeted_status']['retweet_count']
                            json_output['favorite_count'] = tweet['retweeted_status']['favorite_count']
                        except Exception:
                            json_output['tweet_id'] = tweet['id_str']
                            json_output['user_id'] = tweet['user']['id_str']
                            json_output['verified'] = tweet['user']['verified']
                            json_output['reply_count'] = tweet['reply_count']
                            json_output['retweet_count'] = tweet['retweet_count']
                            json_output['favorite_count'] = tweet['favorite_count']
                        with open(DATA_PATH + 'Cleaned/' + "cleaned_tweets_{}.txt".format(args.iter), 'a+') as f_out:
                            json.dump(json_output, f_out)
                            f_out.write('\n')
                except Exception as e:
                    print(e)
            i += 1
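# NOTE: `get_args` is defined elsewhere in the repository. Below is a hypothetical argparse sketch
# covering only the flags this cleaning script reads (file_name, type, iter); the option names,
# defaults, and help strings are assumptions, not the actual CLI.
import argparse


def get_args_sketch():
    parser = argparse.ArgumentParser(description='Clean raw tweet dumps into newline-delimited JSON.')
    parser.add_argument('--file_name', type=str, required=True, help='input file name without extension')
    parser.add_argument('--type', type=str, default='hashtag', choices=['geo', 'hashtag'])
    parser.add_argument('--iter', type=int, default=1, help='suffix of the output file')
    return parser.parse_args()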
def extract_closest_news_id_by_keyword(start_at, type, keyword_type='manual'):
    '''
    Extract the id of the closest news item for each unique tweet by keyword matching.
    '''
    # Import the tweets list.
    unique_tweets_dict = {}
    df = pd.read_csv(SCRIPT_PATH + '/Dataset/Final/agg_final_data.csv')
    tweets_list = df.loc[start_at:, 'text'].to_list()
    for dup in list_duplicates(tweets_list):
        unique_tweets_dict[dup[0]] = dup[1]
    tweets_list = [
        tweets_list[unique_tweets_dict[x][0]] for x in unique_tweets_dict.keys()
    ]

    # Import the news list.
    news_list = []
    with open(SCRIPT_PATH + "/Dataset/news_corpus.txt", "r", encoding='utf-8') as f:
        for line in f:
            news_list.append(sentences_cleaner(line))

    # Import the news keyword list.
    news_keyword_list = []
    if keyword_type == 'manual':
        news_keyword_list_path = "/mnt/louis/Dataset/news_corpus_keyword.txt"
    elif keyword_type == 'W2VxKeyword':
        news_keyword_list_path = "/mnt/louis/W2VxKeyword/data/news_corpus_similar_keyword.txt"
    elif keyword_type == 'tfidf':
        news_keyword_list_path = "/mnt/louis/Dataset/news_corpus_tfidf_keyword.txt"
    elif keyword_type == 'expanded':
        news_keyword_list_path = "/mnt/louis/Dataset/news_corpus_keyword_expanded.txt"

    if keyword_type != 'expanded':
        with open(news_keyword_list_path, "r", encoding='utf-8') as f:
            for line in f:
                line = line.lower()
                line = re.sub('\n', '', line)
                keyword_list = line.replace(' ', ',').split(',')
                keyword_list = [sentences_cleaner(x) for x in keyword_list]
                news_keyword_list.append(keyword_list)
    else:  # keyword_type == 'expanded': each line is 'non-verb keywords;verb group;verb group;...'
        news_verb_keyword_list = []
        with open(news_keyword_list_path, "r", encoding='utf-8') as f:
            for line in f:
                line = line.lower()
                line = re.sub('\n', '', line)
                verb_non_verb_list = line.split(';')
                non_verb_list = [sentences_cleaner(x) for x in verb_non_verb_list[0].split()]
                verb_list = [[sentences_cleaner(x) for x in part.split()]
                             for part in verb_non_verb_list[1:]]
                news_keyword_list.append(non_verb_list)
                news_verb_keyword_list.append(verb_list)

    if type == "W2VxBoW_key":
        # Import the news similar-word list.
        news_similar_list = []
        with open(SCRIPT_PATH + "/Word EmbeddingxBoW/data/corpus_similar.txt", "r",
                  encoding='utf-8') as f:
            for line in f:
                news_similar_list.append(line)
    elif type == 'keyword':
        news_similar_list = news_list.copy()

    # Match keywords between each tweet and each news item.
    unique_closest_news_id_list = []
    for tweet in tqdm(tweets_list):
        word_list = tweet.split()
        rule_passed = (len(word_list) >= 3) and ('pap' not in word_list) and (
            'vcs' not in word_list) and ('vc' not in word_list) and ('wa' not in word_list)
        if not rule_passed:
            unique_closest_news_id_list.append(-1)
        else:
            count_list = []
            i = 0
            for news_similar in news_similar_list:
                # Count how many tweet tokens appear in the news text.
                cnt = 0
                for word in word_list:
                    news_match_list = [
                        x for x in news_similar.split()
                        if ((x != 'video') and (x != 'link') and (x != 'foto'))
                    ]
                    if word in news_match_list:
                        cnt += 1
                keyword_list = news_keyword_list[i]
                abbreviation_keyword_list = [
                    keyword for keyword in keyword_list if len(keyword) <= 3
                ]
                if keyword_type == 'W2VxKeyword':
                    if 'corona' in keyword_list:
                        # If 'corona' is a keyword, also require a corona-related term in the tweet.
                        if any(keyword in tweet for keyword in keyword_list) and (
                                ('corona' in tweet) or ('covid' in tweet) or
                                ('covid19' in tweet) or ('covid-19' in tweet)):
                            # Abbreviation keywords (<= 3 characters) must appear as whole tokens.
                            if len(abbreviation_keyword_list) > 0:
                                if all(keyword in word_list
                                       for keyword in abbreviation_keyword_list):
                                    count_list.append(cnt)
                                else:
                                    count_list.append(-1)
                            else:
                                count_list.append(cnt)
                        else:
                            count_list.append(-1)
                    else:
                        # Keep the overlap count if at least one topic keyword appears in the tweet.
                        if any(keyword in tweet for keyword in keyword_list):
                            if len(abbreviation_keyword_list) > 0:
                                if all(keyword in word_list
                                       for keyword in abbreviation_keyword_list):
                                    count_list.append(cnt)
                                else:
                                    count_list.append(-1)
                            else:
                                count_list.append(cnt)
                        else:
                            count_list.append(-1)
                else:  # keyword_type == 'manual', 'tfidf' or 'expanded'
                    extra_rules_passed = True
                    if keyword_type == 'expanded':
                        # Every verb group must contribute at least one verb to the tweet.
                        verb_keyword_lists = news_verb_keyword_list[i]
                        for verb_keyword_list in verb_keyword_lists:
                            if all(verb not in tweet for verb in verb_keyword_list):
                                extra_rules_passed = False
                    if extra_rules_passed:
                        if 'corona' in keyword_list:
                            # If every keyword (plus a corona-related term) appears in the tweet,
                            # double the overlap count.
                            if (all(keyword in tweet for keyword in keyword_list)) and (
                                    ('corona' in tweet) or ('covid' in tweet) or
                                    ('covid19' in tweet) or ('covid-19' in tweet)):
                                if len(abbreviation_keyword_list) > 0:
                                    if all(keyword in word_list
                                           for keyword in abbreviation_keyword_list):
                                        count_list.append(cnt * 2)
                                    else:
                                        count_list.append(-1)
                                else:
                                    count_list.append(cnt * 2)
                            else:
                                count_list.append(-1)
                        else:
                            # If every keyword appears in the tweet, double the overlap count.
                            if all(keyword in tweet for keyword in keyword_list):
                                if len(abbreviation_keyword_list) > 0:
                                    if all(keyword in word_list
                                           for keyword in abbreviation_keyword_list):
                                        count_list.append(cnt * 2)
                                    else:
                                        count_list.append(-1)
                                else:
                                    count_list.append(cnt * 2)
                            else:
                                count_list.append(-1)
                    else:
                        count_list.append(-1)
                i += 1  # move to the next keyword / verb-keyword list
            if (keyword_type == 'W2VxKeyword') and (
                    not any(c > (len(word_list) // 3) for c in count_list)):
                unique_closest_news_id_list.append(-1)
            else:
                # Check whether the tweet contains the strict words (region, organization,
                # position) that the proposed news item (not news_similar) contains.
                check_tweet_news_strict_word = find_words_in_strict_word_list(
                    news_list[np.argmax(count_list)].split())
                if len(check_tweet_news_strict_word) > 0:
                    if all(word in word_list for word in check_tweet_news_strict_word):
                        unique_closest_news_id_list.append(np.argmax(count_list))
                    else:
                        unique_closest_news_id_list.append(-1)
                else:
                    unique_closest_news_id_list.append(np.argmax(count_list))
    return unique_closest_news_id_list, unique_tweets_dict
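# NOTE: `find_words_in_strict_word_list` comes from a helper module that is not shown here.
# Judging from the comment above (strict words are regions, organizations, positions), it
# plausibly intersects a token list with a curated vocabulary, roughly as sketched below. Both
# the function body and the contents of STRICT_WORD_LIST_SKETCH are assumptions.
STRICT_WORD_LIST_SKETCH = {'jakarta', 'jabar', 'kemenkes', 'gubernur', 'menteri'}  # illustrative only


def find_words_in_strict_word_list_sketch(word_list):
    # Return the strict words that actually occur in the given token list.
    return [word for word in word_list if word in STRICT_WORD_LIST_SKETCH]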
def extract_closest_news_id_by_ngram(start_at):
    '''
    Extract the id of the closest news item for each unique tweet by n-gram matching.
    '''
    # Import the tweets list.
    unique_tweets_dict = {}
    df = pd.read_csv(SCRIPT_PATH + '/Dataset/Final/agg_final_data.csv')
    tweets_list = df.loc[start_at:, 'text'].to_list()
    for dup in list_duplicates(tweets_list):
        unique_tweets_dict[dup[0]] = dup[1]
    tweets_list = [
        tweets_list[unique_tweets_dict[x][0]] for x in unique_tweets_dict.keys()
    ]

    # Import the news list.
    news_list = []
    with open(SCRIPT_PATH + "/Dataset/news_corpus.txt", "r", encoding='utf-8') as f:
        for line in f:
            news_list.append(sentences_cleaner(line))

    # Import the news keyword lists.
    news_keyword_lists = []
    with open("/mnt/louis/Dataset/news_corpus_keyword.txt", "r", encoding='utf-8') as f:
        for line in f:
            line = line.lower()
            line = re.sub('\n', '', line)
            keyword_list = line.replace(' ', ',').split(',')
            news_keyword_lists.append(keyword_list)

    # Extract the bigrams and trigrams of each news item.
    news_bigram_lists = []
    news_trigram_lists = []
    for news in news_list:
        news_word_list = [
            x for x in news.split() if ((x != 'video') and (x != 'link') and (x != 'foto'))
        ]
        news_bigram_lists.append(extract_bigrams(news_word_list))
        news_trigram_lists.append(extract_trigrams(news_word_list))

    # Match bigrams & trigrams between each tweet and each news item.
    unique_closest_news_id_list = []
    for tweet in tqdm(tweets_list):
        word_list = tweet.split()
        rule_passed = (len(word_list) >= 3) and ('pap' not in word_list) and (
            'vcs' not in word_list) and ('vc' not in word_list) and ('wa' not in word_list)
        if not rule_passed:
            unique_closest_news_id_list.append(-1)
        else:
            count_list = []
            bigram_list = extract_bigrams(word_list)
            trigram_list = extract_trigrams(word_list)
            for i in range(len(news_list)):
                cnt_word = 0
                cnt_bigram = 0
                cnt_trigram = 0
                news_keyword_list = news_keyword_lists[i]
                news_bigram_list = news_bigram_lists[i]
                news_trigram_list = news_trigram_lists[i]
                for word in news_keyword_list:
                    if word == 'corona':
                        if ('corona' in tweet) or ('covid' in tweet) or (
                                'covid19' in tweet) or ('covid-19' in tweet):
                            cnt_word += 1
                    else:
                        if len(word) > 3:
                            if (word in tweet) and (not word.isdigit()):
                                cnt_word += 1
                        else:  # Abbreviation word: must match a whole token.
                            if (word in word_list) and (not word.isdigit()):
                                cnt_word += 1
                for bigram in news_bigram_list:
                    if (bigram in bigram_list) and (
                            not any(word.isdigit() for word in bigram.split())
                    ) and ('corona' not in bigram) and ('covid' not in bigram) and (
                            'covid19' not in bigram) and ('covid-19' not in bigram):
                        cnt_bigram += 1
                for trigram in news_trigram_list:
                    if (trigram in trigram_list) and (
                            not any(word.isdigit() for word in trigram.split())
                    ) and ('corona' not in trigram) and ('covid' not in trigram) and (
                            'covid19' not in trigram) and ('covid-19' not in trigram):
                        cnt_trigram += 1
                if (cnt_word > len(news_keyword_list) * 7 // 8) and (
                        cnt_bigram >= 2) and (cnt_trigram >= 1):
                    count_list.append(cnt_word + cnt_bigram + cnt_trigram)
                else:
                    count_list.append(0)
            if any(cnt > 0 for cnt in count_list):
                unique_closest_news_id_list.append(np.argmax(count_list))
            else:
                unique_closest_news_id_list.append(-1)
    return unique_closest_news_id_list, unique_tweets_dict
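# NOTE: `extract_bigrams` and `extract_trigrams` are helpers defined elsewhere in the repository.
# The matching code above treats each n-gram as a space-joined string (it calls `bigram.split()`
# and checks membership with `in`), so a plausible sketch is the following hypothetical
# reconstruction, not the repository's actual implementation.
def extract_bigrams_sketch(word_list):
    return [' '.join(pair) for pair in zip(word_list, word_list[1:])]


def extract_trigrams_sketch(word_list):
    return [' '.join(triple) for triple in zip(word_list, word_list[1:], word_list[2:])]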