def main():
    args = get_args()

    with open('/mnt/instagram/' + args.file_name + '.json', 'r') as f:
        for line in tqdm(f):
            try:  #in case there are records which are not well-formatted
                try:
                    data = json.loads(line)
                except:
                    data = ast.literal_eval(line)

                caption_list = data['data']['hashtag'][
                    'edge_hashtag_to_media']['edges']

                dict_output = {}

                for i in range(len(caption_list)):
                    date = datetime.datetime.fromtimestamp(
                        caption_list[i]['node']['taken_at_timestamp']).date()
                    if datetime.datetime(
                            date.year, date.month,
                            date.day) >= datetime.datetime(
                                2020, args.start_month,
                                args.start_date):  #Get new data only
                        dict_output['created_at'] = str(
                            date.month) + '/' + str(date.day) + '/' + str(
                                date.year)
                        dict_output['user_id'] = caption_list[i]['node'][
                            'owner']['id']
                        text = caption_list[i]['node'][
                            'edge_media_to_caption']['edges'][0]['node'][
                                'text']
                        dict_output['text'] = sentences_cleaner(text)
                        dict_output['hashtags'] = re.findall(
                            r'#[A-Za-z0-9_]+', text)
                        dict_output['comments_count'] = caption_list[i][
                            'node']['edge_media_to_comment']['count']
                        dict_output['likes_count'] = caption_list[i]['node'][
                            'edge_media_preview_like'][
                                'count']  #count of likes including organic likes & sponsored likes

                        with open(
                                DATA_PATH + "Cleaned/" + "cleaned_insta_1.txt",
                                'a+') as f_out:
                            json.dump(dict_output, f_out)
                            f_out.write('\n')

            except Exception as e:
                print(e)
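
# The scripts in this file share a set of imports and project-level helpers
# (get_args, sentences_cleaner, DATA_PATH, SCRIPT_PATH, forbidden_keywords_list,
# list_duplicates, ...), whose real definitions live elsewhere in the project.
# The snippet above also needs: import ast, datetime, json, re and
# from tqdm import tqdm. A minimal, hypothetical sketch of the argument parser it
# seems to assume is given below; flag names and defaults are inferred from usage
# (args.file_name, args.start_month, args.start_date), not confirmed.
import argparse


def get_args_sketch():
    """Hypothetical stand-in for the project's get_args() used above."""
    parser = argparse.ArgumentParser(
        description='Clean a raw Instagram hashtag dump into JSON lines.')
    parser.add_argument('--file_name', type=str, required=True,
                        help='Name of the dump under /mnt/instagram/ (without .json).')
    parser.add_argument('--start_month', type=int, default=3,
                        help='Earliest month of 2020 to keep posts from.')
    parser.add_argument('--start_date', type=int, default=1,
                        help='Earliest day of start_month to keep posts from.')
    return parser.parse_args()
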
def main():

    #Load Model
    model_all = gensim.models.Word2Vec.load(
        '/mnt/louis/Issue Monitoring/model/w2v_gdelt_all_300.model')

    SOURCE_general = '/mnt/louis/Issue Monitoring/topic/general/'
    SOURCE_new_normal = '/mnt/louis/Issue Monitoring/topic/new_normal/'

    _, _, filenames_general = next(os.walk(SOURCE_general))
    _, _, filenames_new_normal = next(os.walk(SOURCE_new_normal))

    #Import gdelt corpus
    df_gdelt = pd.read_csv('/mnt/louis/Dataset/GDELT/preprocessed_gdelt.csv')
    df_gdelt['created_at'] = pd.to_datetime(df_gdelt['date'])
    df_gdelt['created_week'] = df_gdelt['created_at'].apply(
        lambda x: x.weekofyear)
    df_gdelt = df_gdelt[(df_gdelt.created_week >= 13)].reset_index(drop=True)
    df_gdelt['created_date'] = df_gdelt['created_at'].apply(lambda x: x.date())

    #Create Similar Keyword Dataframe
    df = pd.DataFrame(columns=[
        'topic', 'keyword', 'similar_keyword', 'cosine_similarity',
        'keyword_count', 'similar_keyword_count'
    ])

    #Iterate through all topic keyword files
    print(
        "-------------------------- Extracting Similar General Keyword for each Topic"
    )
    for topic_file in tqdm(filenames_general):
        topic = ' '.join(
            re.sub('_', ' ',
                   topic_file.split('.')[0]).split()[1:])
        keyword_list = []
        with open(SOURCE_general + topic_file, "r") as f:
            for line in f:
                keyword_list.append(sentences_cleaner(line))

        keyword_list = [
            kw for kw in keyword_list if kw not in forbidden_keywords_list
        ]

        for word in keyword_list:
            try:
                similar_word_list = model_all.most_similar(word, topn=10)
                for similar_word in similar_word_list:
                    if (similar_word[0] not in forbidden_keywords_list) and (
                            not similar_word[0].isdigit()):
                        df = df.append(
                            {
                                'topic':
                                topic,
                                'keyword':
                                word,
                                'keyword_count':
                                model_all.wv.vocab[word].count,
                                'similar_keyword':
                                similar_word[0],
                                'similar_keyword_count':
                                model_all.wv.vocab[similar_word[0]].count,
                                'cosine_similarity':
                                similar_word[1]
                            },
                            ignore_index=True)
            except Exception as e:
                print(e)

    df.to_csv(
        '/mnt/louis/Issue Monitoring/gdelt_TOP_10_Similar_Issue_Keyword.csv',
        index=False)

    #Extract count of all aggregated keywords for each topic daily
    print(
        "-------------------------- Extracting Keyword Count for each Topic Daily"
    )

    #Create Daily Keyword Count Dataframe
    df_topic_daily = pd.DataFrame(
        columns=['topic', 'date', 'count', 'tweets_volume'])

    #Iterate through all topics and the tweets on each day
    for topic in tqdm(list(df['topic'].unique())):
        df_topic_keyword_temp = df[df.topic == topic].reset_index(drop=True)
        keyword_list = list(df_topic_keyword_temp['similar_keyword'].unique())
        for keyword in list(df_topic_keyword_temp['keyword'].unique()):
            keyword_list.append(keyword)

        keyword_list = [
            kw for kw in keyword_list if kw not in forbidden_keywords_list
        ]

        for date in list(df_gdelt['created_date'].unique()):
            cnt = 0
            df_filter_date = df_gdelt[df_gdelt['created_date'] == date]
            corpus = df_filter_date['cleaned_text'].to_list()
            for tweets in corpus:
                for keyword in keyword_list:
                    if (keyword in tweets) and (keyword
                                                not in forbidden_keywords_list
                                                ) and (not keyword.isdigit()):
                        cnt += 1

            df_topic_daily = df_topic_daily.append(
                {
                    'topic': topic,
                    'date': date,
                    'count': cnt,
                    'tweets_volume': len(df_filter_date)
                },
                ignore_index=True)

    df_topic_daily.to_csv(
        '/mnt/louis/Issue Monitoring/gdelt_daily_keyword_count.csv',
        index=False)

    #Extract the evolution of similar topic keywords weekly
    print(
        "-------------------------- Extracting Evolution of Similar Keyword for each Topic Weekly"
    )

    #Create Similar Keyword Evolution Dataframe
    df_keyword_evolution = pd.DataFrame(columns=[
        'topic', 'week_of_the_year', 'keyword', 'similar_keyword',
        'cosine_similarity', 'keyword_count', 'similar_keyword_count'
    ])

    #Iterate through all topics and their similar keywords in each week
    for topic in tqdm(list(df['topic'].unique())):
        keyword_list = []
        df_topic_keyword_temp = df[df.topic == topic].reset_index(drop=True)
        for keyword in list(df_topic_keyword_temp['keyword'].unique()):
            keyword_list.append(keyword)

        keyword_list = [
            kw for kw in keyword_list if kw not in forbidden_keywords_list
        ]

        for week in list(df_gdelt['created_week'].unique()):
            #Load Model
            model = gensim.models.Word2Vec.load(
                '/mnt/louis/Issue Monitoring/model/w2v_gdelt_week_{}_300.model'
                .format(week))

            for word in keyword_list:
                try:
                    similar_word_list = model.most_similar(word, topn=10)

                    for similar_word in similar_word_list:

                        if (similar_word[0] not in forbidden_keywords_list
                            ) and (not similar_word[0].isdigit()):
                            df_keyword_evolution = df_keyword_evolution.append(
                                {
                                    'topic':
                                    topic,
                                    'week_of_the_year':
                                    week,
                                    'keyword':
                                    word,
                                    'similar_keyword':
                                    similar_word[0],
                                    'cosine_similarity':
                                    similar_word[1],
                                    'keyword_count':
                                    model.wv.vocab[word].count,
                                    'similar_keyword_count':
                                    model.wv.vocab[similar_word[0]].count
                                },
                                ignore_index=True)
                except Exception as e:
                    print(e)

    df_keyword_evolution.to_csv(
        '/mnt/louis/Issue Monitoring/gdelt_TOP_10_Evolution_Weekly_Similar_Issue_Keyword.csv',
        index=False)

    ########################################## New Normal Keyword ##########################################

    #Iterate through all new-normal topic keyword files
    new_normal_keyword_list = []

    for topic_file in filenames_new_normal:
        with open(SOURCE_new_normal + topic_file, "r") as f:
            for line in f:
                new_normal_keyword_list.append(sentences_cleaner(line))

    new_normal_keyword_list = [
        kw for kw in new_normal_keyword_list
        if kw not in forbidden_keywords_list
    ]

    #Filter the Similar Keyword Dataframe down to new-normal keywords
    df_new_normal = df[df['keyword'].isin(
        new_normal_keyword_list)].reset_index(drop=True)

    #Extract count of all aggregated keywords for each topic daily
    print(
        "-------------------------- Extracting New Normal Keyword Count for each Topic Daily"
    )

    #Create Daily Keyword Count Dataframe
    df_topic_daily_new_normal = pd.DataFrame(
        columns=['topic', 'date', 'count', 'tweets_volume'])

    #Iterate through all topics and the tweets on each day
    for topic in tqdm(list(df_new_normal['topic'].unique())):
        df_topic_keyword_temp = df_new_normal[df_new_normal.topic ==
                                              topic].reset_index(drop=True)
        keyword_list = list(df_topic_keyword_temp['similar_keyword'].unique())
        for keyword in list(df_topic_keyword_temp['keyword'].unique()):
            keyword_list.append(keyword)

        keyword_list = [
            kw for kw in keyword_list if kw not in forbidden_keywords_list
        ]

        for date in list(df_gdelt['created_date'].unique()):
            cnt = 0
            df_filter_date = df_gdelt[df_gdelt['created_date'] == date]
            corpus = df_filter_date['cleaned_text'].to_list()
            for tweets in corpus:
                for keyword in keyword_list:
                    if (keyword in tweets) and (keyword
                                                not in forbidden_keywords_list
                                                ) and (not keyword.isdigit()):
                        cnt += 1

            df_topic_daily_new_normal = df_topic_daily_new_normal.append(
                {
                    'topic': topic,
                    'date': date,
                    'count': cnt,
                    'tweets_volume': len(df_filter_date)
                },
                ignore_index=True)

    df_topic_daily_new_normal.to_csv(
        '/mnt/louis/Issue Monitoring/gdelt_daily_keyword_count_new_normal.csv',
        index=False)
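
# Compatibility note: model_all.most_similar(...) and model_all.wv.vocab[word].count
# above are the gensim 3.x API, and DataFrame.append was removed in pandas 2.0.
# If the script is run against newer library versions, the same lookups translate
# roughly as in the sketch below (a sketch of the equivalent calls, not a drop-in
# patch for the function above).


def most_similar_with_counts_sketch(model, word, topn=10):
    """Return (similar_word, similarity, word_count, similar_word_count) tuples
    using the gensim >= 4.0 API."""
    rows = []
    for similar_word, similarity in model.wv.most_similar(word, topn=topn):
        rows.append((
            similar_word,
            similarity,
            model.wv.get_vecattr(word, 'count'),          # was model.wv.vocab[word].count
            model.wv.get_vecattr(similar_word, 'count'),  # was model.wv.vocab[similar_word].count
        ))
    return rows


# For the DataFrame.append pattern, collecting dicts in a list and building the
# frame once is the usual replacement, e.g.
#     records = []
#     records.append({'topic': topic, 'keyword': word, ...})
#     df = pd.DataFrame.from_records(records)
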
def main():
    args = get_args()

    #Import tweets corpus
    df_tweets = pd.read_csv('/mnt/louis/Dataset/Final/agg_final_data.csv',
                            usecols=[
                                'created_at', 'text', 'user_id',
                                'favorite_count', 'retweet_count',
                                'reply_count', 'verified', 'sentiment_score'
                            ])
    df_tweets['created_at'] = pd.to_datetime(df_tweets['created_at'])
    df_tweets['created_week'] = df_tweets['created_at'].apply(
        lambda x: x.weekofyear)
    df_tweets = df_tweets[(df_tweets.created_week >=
                           args.start_week)].reset_index(drop=True)
    df_tweets['created_date'] = df_tweets['created_at'].apply(
        lambda x: x.date())

    #Create list of unique weeks
    unique_week_list = list(df_tweets['created_week'].unique())

    #Import the Similar Keyword Dataframes, or create them when starting from the first week
    if args.start_week > 13:
        df_weekly_general = pd.read_csv(
            '/mnt/louis/Issue Monitoring/TOP_10_Evolution_Weekly_Issue_Keyword_user_level.csv'
        )
        df_weekly_general = df_weekly_general[
            df_weekly_general['week_of_the_year'] <
            args.start_week].reset_index(drop=True)

        df_weekly_new_normal = pd.read_csv(
            '/mnt/louis/Issue Monitoring/TOP_10_Evolution_Weekly_Issue_Keyword_user_level_new_normal.csv'
        )
        df_weekly_new_normal = df_weekly_new_normal[
            df_weekly_new_normal['week_of_the_year'] <
            args.start_week].reset_index(drop=True)
    else:
        df_weekly_general = pd.DataFrame(columns=[
            'user_id', 'tweets_count', 'week_of_the_year', 'favorite_count',
            'retweet_count', 'reply_count', 'verified', 'sentiment_score'
        ])
        df_weekly_new_normal = pd.DataFrame(columns=[
            'user_id', 'tweets_count', 'week_of_the_year', 'favorite_count',
            'retweet_count', 'reply_count', 'verified', 'sentiment_score'
        ])

    SOURCE_general = '/mnt/louis/Issue Monitoring/topic/general/'
    SOURCE_new_normal = '/mnt/louis/Issue Monitoring/topic/new_normal/'

    _, _, filenames_general = next(os.walk(SOURCE_general))
    _, _, filenames_new_normal = next(os.walk(SOURCE_new_normal))

    #Iterate through all topic keyword files week by week
    topic_keyword_dict_weekly_general = {}
    topic_keyword_dict_weekly_new_normal = {}
    for week in unique_week_list:
        #Load Model
        model = gensim.models.Word2Vec.load(
            '/mnt/louis/Issue Monitoring/model/w2v_week_{}_300.model'.format(
                week))

        for topic_file in tqdm(filenames_general):
            topic = ' '.join(
                re.sub('_', ' ',
                       topic_file.split('.')[0]).split()[1:])
            keyword_list_temp = []
            with open(SOURCE_general + topic_file, "r") as f:
                for line in f:
                    keyword_list_temp.append(sentences_cleaner(line))

            keyword_list_temp = [
                kw for kw in keyword_list_temp
                if kw not in forbidden_keywords_list
            ]

            keyword_list = keyword_list_temp.copy()

            for word in keyword_list_temp:
                try:
                    similar_word_list = model.most_similar(word, topn=10)
                    for similar_word in similar_word_list:
                        if (similar_word[0] not in keyword_list) and (
                                similar_word[0] not in forbidden_keywords_list
                        ) and (not similar_word[0].isdigit()):
                            keyword_list.append(similar_word[0])
                except Exception as e:
                    print(e)

            topic_keyword_dict_weekly_general[str(week) + '_' +
                                              topic] = keyword_list

        for topic_file in tqdm(filenames_new_normal):
            topic = ' '.join(
                re.sub('_', ' ',
                       topic_file.split('.')[0]).split()[1:])
            keyword_list_temp = []
            with open(SOURCE_new_normal + topic_file, "r") as f:
                for line in f:
                    keyword_list_temp.append(sentences_cleaner(line))

            keyword_list_temp = [
                kw for kw in keyword_list_temp
                if kw not in forbidden_keywords_list
            ]

            keyword_list = keyword_list_temp.copy()

            for word in keyword_list_temp:
                try:
                    similar_word_list = model.most_similar(word, topn=10)
                    for similar_word in similar_word_list:
                        if (similar_word[0] not in keyword_list) and (
                                similar_word[0] not in forbidden_keywords_list
                        ) and (not similar_word[0].isdigit()):
                            keyword_list.append(similar_word[0])
                except Exception as e:
                    print(e)

            topic_keyword_dict_weekly_new_normal[str(week) + '_' +
                                                 topic] = keyword_list

    #Optimize the loop so that it only works on unique tweets
    unique_tweets_dict = {}
    df_tweets_index = list(range(len(df_tweets)))

    tweets_list = df_tweets['text'].to_list()

    for dup in list_duplicates(tweets_list):
        unique_tweets_dict[dup[0]] = dup[1]

    unique_df_tweets_index = [
        df_tweets_index[unique_tweets_dict[x][-1]]
        for x in unique_tweets_dict.keys()
    ]
    length_duplicate_df_tweets_index = [
        len(unique_tweets_dict[x]) for x in unique_tweets_dict.keys()
    ]

    j = 0

    weekly_general_dict = {}
    weekly_new_normal_dict = {}

    for i in tqdm(unique_df_tweets_index):
        dict_temp_weekly_general = {
            'user_id': df_tweets.loc[i, 'user_id'],
            'tweets_count': length_duplicate_df_tweets_index[j],
            'week_of_the_year': df_tweets.loc[i, 'created_week'],
            'favorite_count': df_tweets.loc[i, 'favorite_count'],
            'retweet_count': df_tweets.loc[i, 'retweet_count'],
            'reply_count': df_tweets.loc[i, 'reply_count'],
            'verified': df_tweets.loc[i, 'verified'],
            'sentiment_score': df_tweets.loc[i, 'sentiment_score']
        }

        dict_temp_weekly_new_normal = dict_temp_weekly_general.copy()

        for topic in [
                'confidence in government', 'economic policy', 'employment',
                'food access', 'health care', 'health protocol', 'mobility',
                'stigma'
        ]:
            if any(keyword in df_tweets.loc[i, 'text']
                   for keyword in topic_keyword_dict_weekly_general[
                       str(dict_temp_weekly_general['week_of_the_year']) +
                       '_' + topic]):
                dict_temp_weekly_general[topic] = 1
            else:
                dict_temp_weekly_general[topic] = 0

        for topic in [
                'confidence in government new normal',
                'economic policy new normal', 'employment new normal',
                'health care new normal', 'health protocol new normal',
                'mobility new normal'
        ]:
            if any(keyword in df_tweets.loc[i, 'text']
                   for keyword in topic_keyword_dict_weekly_new_normal[
                       str(dict_temp_weekly_new_normal['week_of_the_year']) +
                       '_' + topic]):
                dict_temp_weekly_new_normal[topic] = 1
            else:
                dict_temp_weekly_new_normal[topic] = 0

        weekly_general_dict[i] = dict_temp_weekly_general

        weekly_new_normal_dict[i] = dict_temp_weekly_new_normal

        j += 1

    df_weekly_general = df_weekly_general.append(pd.DataFrame.from_dict(
        weekly_general_dict, orient='index'),
                                                 ignore_index=True)
    df_weekly_new_normal = df_weekly_new_normal.append(pd.DataFrame.from_dict(
        weekly_new_normal_dict, orient='index'),
                                                       ignore_index=True)

    df_weekly_general.to_csv(
        '/mnt/louis/Issue Monitoring/TOP_10_Evolution_Weekly_Issue_Keyword_user_level.csv',
        index=False)
    df_weekly_new_normal.to_csv(
        '/mnt/louis/Issue Monitoring/TOP_10_Evolution_Weekly_Issue_Keyword_user_level_new_normal.csv',
        index=False)
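
# list_duplicates() above is a project helper. From the way its output is consumed
# (dup[0] is the tweet text, dup[1] a list of row positions, and every distinct tweet
# appears to be represented), it most likely behaves like the sketch below. This is an
# inference from usage, not the project's actual implementation.
from collections import defaultdict


def list_duplicates_sketch(seq):
    """Yield (value, [positions]) for each distinct value in seq, in first-seen order."""
    tally = defaultdict(list)
    for position, value in enumerate(seq):
        tally[value].append(position)
    return list(tally.items())
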
def main():

    #Uncomment this block of code if you haven't trained the POS-tagging model yet

    # # 1. get the corpus
    # corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_INDONESIAN)

    # # 2. what tag do we want to predict?
    # tag_type = 'upos'

    # # 3. make the tag dictionary from the corpus
    # tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    # print(tag_dictionary.idx2item)

    # # 4. initialize embeddings
    # embedding_types: List[TokenEmbeddings] = [
    #     WordEmbeddings('id-crawl'),
    #     WordEmbeddings('id'),
    #     #WordEmbeddings('glove'),
    #     #BertEmbeddings('bert-base-multilingual-cased')
    # ]

    # embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # # 5. initialize sequence tagger
    # tagger: SequenceTagger = SequenceTagger(hidden_size=256,
    #                                         embeddings=embeddings,
    #                                         tag_dictionary=tag_dictionary,
    #                                         tag_type=tag_type,
    #                                         use_crf=True)

    # # 6. start training
    # trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # trainer.train('resources/taggers/example-universal-pos',
    #               learning_rate=0.1,
    #               mini_batch_size=32,
    #               max_epochs=10)

    #7. Import News List
    news_list = []
    with open("/mnt/louis/Dataset/news_corpus.txt", "r",
              encoding='utf-8') as f:
        for line in f:
            news_list.append(sentences_cleaner(line))

    #8. Predict POS tags for each news item
    tag_pos = SequenceTagger.load(
        'resources/taggers/example-universal-pos/best-model.pt')
    for news in news_list:
        sentence = Sentence(news)
        tag_pos.predict(sentence)
        sentence_list = sentence.to_tagged_string().split()

        verb_word_list = []
        for i, token in enumerate(sentence_list):
            if token == '<VERB>':
                verb_word_list.append(sentence_list[i - 1])

        verbs = ','.join(verb_word_list)

        with open("/mnt/louis/Dataset/news_corpus_verb.txt", "a+") as f_out:
            f_out.write(verbs + '\n')
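
# The POS-tagging script above uses the flair library: the active part needs
# `from flair.data import Sentence` and `from flair.models import SequenceTagger`,
# while the commented-out training block targets flair's older (~0.4.x) data-fetcher
# API. As a version-dependent sketch (not the author's code), the verb extraction
# could also go through flair's token objects instead of parsing to_tagged_string();
# older flair versions use token.get_tag('upos') instead of token.get_label('upos').
from flair.data import Sentence
from flair.models import SequenceTagger


def extract_verbs_sketch(tagger, text):
    """Return the comma-joined tokens tagged VERB, mirroring the loop above."""
    sentence = Sentence(text)
    tagger.predict(sentence)
    return ','.join(token.text for token in sentence
                    if token.get_label('upos').value == 'VERB')
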
def main():
    args = get_args()

    if args.type == 'geo':
        root_path = '/mnt/louis/Dataset/Raw/'
    else:
        root_path = '/mnt/twitter_hashtag_data/'

    with open(root_path + args.file_name + '.txt', 'r') as f:
        for line in tqdm(f):
            try:  #in case there are tweets which are not well-formatted and make ast fail to parse
                try:
                    tweet = ast.literal_eval(line)
                except:
                    tweet = json.loads(line)

                try:  #to get the full version of the tweet if it is a retweet
                    urls = tweet['retweeted_status']['extended_tweet'][
                        'entities']['urls']
                    if urls:
                        created_at = tweet['created_at']
                        text = sentences_cleaner(
                            tweet['retweeted_status']['extended_tweet']
                            ['full_text'])
                        line = created_at + ',' + text
                        for i in range(len(urls)):
                            url = urls[i]['expanded_url']
                            if 'twitter' not in url:
                                line += ',' + url

                        with open("/mnt/louis/Dataset/tweet_URL_list.txt",
                                  "a+") as f_out:
                            f_out.write(line + '\n')
                except:
                    try:
                        urls = tweet['retweeted_status']['entities']['urls']
                        if urls:
                            created_at = tweet['created_at']
                            text = sentences_cleaner(
                                tweet['retweeted_status']['text'])
                            line = created_at + ',' + text
                            for i in range(len(urls)):
                                url = urls[i]['expanded_url']
                                if 'twitter' not in url:
                                    line += ',' + url

                            with open("/mnt/louis/Dataset/tweet_URL_list.txt",
                                      "a+") as f_out:
                                f_out.write(line + '\n')
                    except:
                        try:
                            urls = tweet['extended_tweet']['entities']['urls']
                            if urls:
                                created_at = tweet['created_at']
                                text = sentences_cleaner(
                                    tweet['extended_tweet']['full_text'])
                                line = created_at + ',' + text
                                for i in range(len(urls)):
                                    url = urls[i]['expanded_url']
                                    if 'twitter' not in url:
                                        line += ',' + url

                                with open(
                                        "/mnt/louis/Dataset/tweet_URL_list.txt",
                                        "a+") as f_out:
                                    f_out.write(line + '\n')
                        except:
                            urls = tweet['entities']['urls']
                            if urls:
                                created_at = tweet['created_at']
                                text = sentences_cleaner(tweet['text'])
                                line = created_at + ',' + text
                                for i in range(len(urls)):
                                    url = urls[i]['expanded_url']
                                    if 'twitter' not in url:
                                        line += ',' + url

                                with open(
                                        "/mnt/louis/Dataset/tweet_URL_list.txt",
                                        "a+") as f_out:
                                    f_out.write(line + '\n')

            except Exception as e:
                print(e)
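
# The nested try/except blocks above repeat the same extraction at four different
# places in the tweet payload. A hedged sketch of an equivalent lookup that walks the
# same candidate paths in the same priority order (the helper name is made up here):


def find_urls_and_text_sketch(tweet):
    """Return (urls, text) from the first candidate location present in the tweet,
    mirroring the fallback order of the code above; raise KeyError if none match."""
    candidates = [
        (('retweeted_status', 'extended_tweet', 'entities', 'urls'),
         ('retweeted_status', 'extended_tweet', 'full_text')),
        (('retweeted_status', 'entities', 'urls'),
         ('retweeted_status', 'text')),
        (('extended_tweet', 'entities', 'urls'),
         ('extended_tweet', 'full_text')),
        (('entities', 'urls'), ('text',)),
    ]
    for url_path, text_path in candidates:
        try:
            urls = tweet
            for key in url_path:
                urls = urls[key]
            text = tweet
            for key in text_path:
                text = text[key]
            return urls, text
        except (KeyError, TypeError):
            continue
    raise KeyError('no url/text fields found in this tweet')
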
def main():
    args = get_args()

    i = 1

    if args.type == 'geo':
        root_path = SCRIPT_PATH + '/Dataset/Raw/'
    else:
        root_path = '/mnt/twitter_hashtag_data/'

    thres = 0
    if os.path.exists(DATA_PATH + 'Cleaned/' +
                      "cleaned_tweets_{}.txt".format(args.iter)):
        with open(
                DATA_PATH + 'Cleaned/' +
                "cleaned_tweets_{}.txt".format(args.iter), "r") as f:
            for line in f:
                thres += 1

    with open(root_path + args.file_name + '.txt', 'r') as f:
        for line in tqdm(f):
            if i > thres:
                try:  #in case there are tweets which are not well-formatted and make ast fail to parse
                    try:
                        tweet = ast.literal_eval(line)
                    except:
                        tweet = json.loads(line)

                    json_output = {}

                    try:  #to get the full version of the tweet if it is a retweet
                        text = tweet['retweeted_status']['extended_tweet'][
                            'full_text']
                        json_output['hashtags'] = tweet['retweeted_status'][
                            'extended_tweet']['entities']['hashtags']
                    except:
                        try:
                            text = tweet['retweeted_status']['text']
                            json_output['hashtags'] = tweet[
                                'retweeted_status']['entities']['hashtags']
                        except:
                            try:  #to get the full version of the tweet
                                text = tweet['extended_tweet']['full_text']
                                json_output['hashtags'] = tweet[
                                    'extended_tweet']['entities']['hashtags']
                            except:
                                text = tweet['text']
                                json_output['hashtags'] = tweet['entities'][
                                    'hashtags']

                    if args.type == 'hashtag':
                        json_output['text'] = sentences_cleaner(text)
                    else:
                        json_output['text'] = text

                    if json_output['text'] != '':
                        try:
                            json_output['created_at'] = tweet[
                                'retweeted_status']['created_at']
                        except:
                            json_output['created_at'] = tweet['created_at']

                        try:
                            json_output['location'] = tweet[
                                'retweeted_status']['place'][
                                    'full_name'].split(',')[0]
                        except:
                            try:
                                json_output['location'] = tweet[
                                    'retweeted_status']['place']
                            except:
                                try:
                                    json_output['location'] = tweet['place'][
                                        'full_name'].split(',')[0]
                                except:
                                    json_output['location'] = tweet['place']

                        try:
                            json_output['tweet_id'] = tweet[
                                'retweeted_status']['id_str']
                            json_output['user_id'] = tweet['retweeted_status'][
                                'user']['id_str']
                            json_output['verified'] = tweet[
                                'retweeted_status']['user']['verified']
                            json_output['reply_count'] = tweet[
                                'retweeted_status']['reply_count']
                            json_output['retweet_count'] = tweet[
                                'retweeted_status']['retweet_count']
                            json_output['favorite_count'] = tweet[
                                'retweeted_status']['favorite_count']
                        except:
                            json_output['tweet_id'] = tweet['id_str']
                            json_output['user_id'] = tweet['user']['id_str']
                            json_output['verified'] = tweet['user']['verified']
                            json_output['reply_count'] = tweet['reply_count']
                            json_output['retweet_count'] = tweet[
                                'retweet_count']
                            json_output['favorite_count'] = tweet[
                                'favorite_count']

                        with open(
                                DATA_PATH + 'Cleaned/' +
                                "cleaned_tweets_{}.txt".format(args.iter),
                                'a+') as f_out:
                            json.dump(json_output, f_out)
                            f_out.write('\n')

                except Exception as e:
                    print(e)
            i += 1
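
# The thres counter above implements a simple resume: it counts how many lines the
# output file already holds so that re-runs skip that many input lines. A compact
# sketch of the same counting step (DATA_PATH and args.iter come from the project,
# as above):
import os


def count_existing_lines_sketch(path):
    """Return the number of lines already written to path, or 0 if it does not exist."""
    if not os.path.exists(path):
        return 0
    with open(path, 'r') as f:
        return sum(1 for _ in f)


# e.g. thres = count_existing_lines_sketch(
#     DATA_PATH + 'Cleaned/' + "cleaned_tweets_{}.txt".format(args.iter))
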
def extract_closest_news_id_by_keyword(start_at, type, keyword_type='manual'):
    '''
    Function to extract closest news id by keyword
    '''
    #Import Tweets List
    unique_tweets_dict = {}

    df = pd.read_csv(SCRIPT_PATH + '/Dataset/Final/agg_final_data.csv')
    tweets_list = df.loc[start_at:, 'text'].to_list()

    for dup in list_duplicates(tweets_list):
        unique_tweets_dict[dup[0]] = dup[1]

    tweets_list = [
        tweets_list[unique_tweets_dict[x][0]]
        for x in unique_tweets_dict.keys()
    ]

    #Import News List
    news_list = []
    with open(SCRIPT_PATH + "/Dataset/news_corpus.txt", "r",
              encoding='utf-8') as f:
        for line in f:
            news_list.append(sentences_cleaner(line))

    #Import News Keyword List
    news_keyword_list = []
    if keyword_type == 'manual':
        news_keyword_list_path = "/mnt/louis/Dataset/news_corpus_keyword.txt"
    elif keyword_type == 'W2VxKeyword':
        news_keyword_list_path = "/mnt/louis/W2VxKeyword/data/news_corpus_similar_keyword.txt"
    elif keyword_type == 'tfidf':
        news_keyword_list_path = "/mnt/louis/Dataset/news_corpus_tfidf_keyword.txt"
    elif keyword_type == 'expanded':
        news_keyword_list_path = "/mnt/louis/Dataset/news_corpus_keyword_expanded.txt"

    if keyword_type != 'expanded':
        with open(news_keyword_list_path, "r", encoding='utf-8') as f:
            for line in f:
                line = line.lower()
                line = re.sub('\n', '', line)
                keyword_list = line.replace(' ', ',').split(',')
                keyword_list = [sentences_cleaner(x) for x in keyword_list]
                news_keyword_list.append(keyword_list)

    else:  #keyword_type == 'expanded'
        news_verb_keyword_list = []
        with open(news_keyword_list_path, "r", encoding='utf-8') as f:
            for line in f:
                line = line.lower()
                line = re.sub('\n', '', line)
                verb_non_verb_list = line.split(';')
                non_verb_list = verb_non_verb_list[0].split()
                non_verb_list = [sentences_cleaner(x) for x in non_verb_list]

                verb_non_verb_list.pop(0)

                verb_list = []
                while True:
                    if verb_non_verb_list:
                        list_tmp = verb_non_verb_list[0].split()
                        list_tmp = [sentences_cleaner(x) for x in list_tmp]
                        verb_list.append(list_tmp)
                        verb_non_verb_list.pop(0)
                    else:
                        break

                news_keyword_list.append(non_verb_list)
                news_verb_keyword_list.append(verb_list)

    if type == "W2VxBoW_key":
        #Import News Similar List
        news_similar_list = []
        with open(SCRIPT_PATH + "/Word EmbeddingxBoW/data/corpus_similar.txt",
                  "r",
                  encoding='utf-8') as f:
            for line in f:
                news_similar_list.append(line)
    elif type == 'keyword':
        news_similar_list = news_list.copy()

    #Match keywords between each tweet and each news item
    unique_closest_news_id_list = []
    for tweet in tqdm(tweets_list):
        word_list = tweet.split()

        rule_passed = (len(word_list) >= 3) and ('pap' not in word_list) and (
            'vcs' not in word_list) and ('vc' not in word_list) and (
                'wa' not in word_list)

        if not rule_passed:
            unique_closest_news_id_list.append(-1)
        else:
            count_list = []
            i = 0
            for news_similar in news_similar_list:
                cnt = 0
                news_match_list = [
                    x for x in news_similar.split()
                    if ((x != 'video') and (x != 'link') and (x != 'foto'))
                ]
                for word in word_list:
                    if word in news_match_list:
                        cnt += 1

                keyword_list = news_keyword_list[i]

                abbreviation_keyword_list = [
                    keyword for keyword in keyword_list if len(keyword) <= 3
                ]

                if keyword_type == 'W2VxKeyword':

                    if 'corona' in keyword_list:  #if 'corona' is among the keywords, also check the related 'corona' variants
                        if any(
                                keyword in tweet for keyword in keyword_list
                        ) and (
                            ('corona' in tweet) or ('covid' in tweet) or
                            ('covid19' in tweet) or ('covid-19' in tweet)
                        ):  #if any keyword from the keyword list is in this tweet, keep the count as-is
                            if abbreviation_keyword_list:  #check if there are abbreviation keywords
                                if all(keyword in word_list for keyword in
                                       abbreviation_keyword_list):
                                    count_list.append(cnt)
                                else:
                                    count_list.append(-1)
                            else:
                                count_list.append(cnt)
                        else:
                            count_list.append(-1)
                    else:
                        if any(
                                keyword in tweet for keyword in keyword_list
                        ):  #if any keyword from the keyword list is in this tweet, keep the count as-is
                            if abbreviation_keyword_list:  #check if there are abbreviation keywords
                                if all(keyword in word_list for keyword in
                                       abbreviation_keyword_list):
                                    count_list.append(cnt)
                                else:
                                    count_list.append(-1)
                            else:
                                count_list.append(cnt)
                        else:
                            count_list.append(-1)

                else:  #keyword_type == 'manual' or keyword_type == 'tfidf' or keyword_type == 'expanded'

                    extra_rules_passed = True

                    if keyword_type == 'expanded':
                        verb_keyword_lists = news_verb_keyword_list[i]
                        for verb_keyword_list in verb_keyword_lists:
                            if all(verb not in tweet
                                   for verb in verb_keyword_list):
                                extra_rules_passed = False

                    if extra_rules_passed:
                        if 'corona' in keyword_list:  #if 'corona' is among the keywords, also check the related 'corona' variants
                            if (all(
                                    keyword in tweet
                                    for keyword in keyword_list
                            )) and (
                                ('corona' in tweet) or ('covid' in tweet) or
                                ('covid19' in tweet) or ('covid-19' in tweet)
                            ):  #if all keywords from the keyword list are in this tweet, multiply the count by 2
                                if abbreviation_keyword_list:  #check if there are abbreviation keywords
                                    if all(keyword in word_list for keyword in
                                           abbreviation_keyword_list):
                                        count_list.append(cnt * 2)
                                    else:
                                        count_list.append(-1)
                                else:
                                    count_list.append(cnt * 2)
                            else:
                                count_list.append(-1)
                        else:
                            if (
                                    all(keyword in tweet
                                        for keyword in keyword_list)
                            ):  #if all keywords from the keyword list are in this tweet, multiply the count by 2
                                if abbreviation_keyword_list:  #check if there are abbreviation keywords
                                    if all(keyword in word_list for keyword in
                                           abbreviation_keyword_list):
                                        count_list.append(cnt * 2)
                                    else:
                                        count_list.append(-1)
                                else:
                                    count_list.append(cnt * 2)
                            else:
                                count_list.append(-1)

                    else:
                        count_list.append(-1)

                i += 1  #advance to the next news item's keyword / verb keyword lists

            if (keyword_type == 'W2VxKeyword') and not any(
                    c > (len(word_list) // 3) for c in count_list):
                unique_closest_news_id_list.append(-1)
            else:
                #check whether the proposed tweet contains the same strict words (region, organization, position) as the news itself (not news_similar)
                check_tweet_news_strict_word = find_words_in_strict_word_list(
                    news_list[np.argmax(count_list)].split())
                if len(check_tweet_news_strict_word) > 0:
                    if all(word in word_list
                           for word in check_tweet_news_strict_word):
                        unique_closest_news_id_list.append(
                            np.argmax(count_list))
                    else:
                        unique_closest_news_id_list.append(-1)
                else:
                    unique_closest_news_id_list.append(np.argmax(count_list))

    return unique_closest_news_id_list, unique_tweets_dict
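
# extract_closest_news_id_by_keyword() returns one news id per *unique* tweet plus the
# duplicate map, so a caller presumably fans the ids back out to every row. A hedged
# sketch of that expansion, assuming (as the function body suggests) that the i-th id
# corresponds to the i-th key of unique_tweets_dict and that positions in dup[1] are
# offsets from start_at in the original dataframe:


def expand_to_all_rows_sketch(unique_closest_news_id_list, unique_tweets_dict,
                              start_at=0):
    """Return {dataframe_index: closest_news_id}, covering duplicate tweets as well."""
    row_to_news_id = {}
    for news_id, positions in zip(unique_closest_news_id_list,
                                  unique_tweets_dict.values()):
        for position in positions:
            row_to_news_id[start_at + position] = news_id
    return row_to_news_id
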
def extract_closest_news_id_by_ngram(start_at):
    '''
    Function to extract closest news id by n-gram matching
    '''
    #Import Tweets List
    unique_tweets_dict = {}

    df = pd.read_csv(SCRIPT_PATH + '/Dataset/Final/agg_final_data.csv')
    tweets_list = df.loc[start_at:, 'text'].to_list()

    for dup in list_duplicates(tweets_list):
        unique_tweets_dict[dup[0]] = dup[1]

    tweets_list = [
        tweets_list[unique_tweets_dict[x][0]]
        for x in unique_tweets_dict.keys()
    ]

    #Import News List
    news_list = []
    with open(SCRIPT_PATH + "/Dataset/news_corpus.txt", "r",
              encoding='utf-8') as f:
        for line in f:
            news_list.append(sentences_cleaner(line))

    #Import News Keyword List
    news_keyword_lists = []
    with open("/mnt/louis/Dataset/news_corpus_keyword.txt",
              "r",
              encoding='utf-8') as f:
        for line in f:
            line = line.lower()
            line = re.sub('\n', '', line)
            keyword_list = line.replace(' ', ',').split(',')
            news_keyword_lists.append(keyword_list)

    #Extract bigrams and trigrams for each news item
    news_bigram_lists = []
    news_trigram_lists = []
    for news in news_list:
        news_word_list = [
            x for x in news.split()
            if ((x != 'video') and (x != 'link') and (x != 'foto'))
        ]
        news_bigram_lists.append(extract_bigrams(news_word_list))
        news_trigram_lists.append(extract_trigrams(news_word_list))

    #Match bigrams & trigrams between each tweet and each news item
    unique_closest_news_id_list = []

    for tweet in tqdm(tweets_list):
        word_list = tweet.split()

        rule_passed = (len(word_list) >= 3) and ('pap' not in word_list) and (
            'vcs' not in word_list) and ('vc' not in word_list) and (
                'wa' not in word_list)

        if not rule_passed:
            unique_closest_news_id_list.append(-1)
        else:
            count_list = []
            bigram_list = extract_bigrams(word_list)
            trigram_list = extract_trigrams(word_list)
            for i in range(len(news_list)):
                cnt_word = 0
                cnt_bigram = 0
                cnt_trigram = 0
                news_keyword_list = news_keyword_lists[i]
                news_bigram_list = news_bigram_lists[i]
                news_trigram_list = news_trigram_lists[i]

                for word in news_keyword_list:
                    if word == 'corona':
                        if ('corona' in tweet) or ('covid' in tweet) or (
                                'covid19' in tweet) or ('covid-19' in tweet):
                            cnt_word += 1
                    else:
                        if len(word) > 3:
                            if (word in tweet) and (not word.isdigit()):
                                cnt_word += 1
                        else:  #Abbreviation Word
                            if (word in word_list) and (not word.isdigit()):
                                cnt_word += 1

                for bigram in news_bigram_list:
                    if (bigram in bigram_list) and (
                            not any(word.isdigit() for word in bigram.split())
                    ) and ('corona'
                           not in bigram) and ('covid' not in bigram) and (
                               'covid19' not in bigram) and ('covid-19'
                                                             not in bigram):
                        cnt_bigram += 1

                for trigram in news_trigram_list:
                    if (trigram in trigram_list) and (
                            not any(word.isdigit() for word in trigram.split())
                    ) and ('corona'
                           not in trigram) and ('covid' not in trigram) and (
                               'covid19' not in trigram) and ('covid-19'
                                                              not in trigram):
                        cnt_trigram += 1

                if (cnt_word > len(news_keyword_list) * 7 // 8) and (
                        cnt_bigram >= 2) and (cnt_trigram >= 1):
                    count_list.append(cnt_word + cnt_bigram + cnt_trigram)
                else:
                    count_list.append(0)

            if any(cnt > 0 for cnt in count_list):
                unique_closest_news_id_list.append(np.argmax(count_list))
            else:
                unique_closest_news_id_list.append(-1)

    return unique_closest_news_id_list, unique_tweets_dict
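
# extract_bigrams() / extract_trigrams() are project helpers. From the way their
# output is used above (membership tests like `bigram in bigram_list` and
# `word.isdigit() for word in bigram.split()`), they most likely return lists of
# space-joined n-gram strings; a minimal sketch under that assumption:


def extract_bigrams_sketch(word_list):
    """Return consecutive word pairs as 'w1 w2' strings."""
    return [' '.join(pair) for pair in zip(word_list, word_list[1:])]


def extract_trigrams_sketch(word_list):
    """Return consecutive word triples as 'w1 w2 w3' strings."""
    return [' '.join(triple)
            for triple in zip(word_list, word_list[1:], word_list[2:])]
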