def articles_by_category(input_path, output_path):
    """Count total and COVID-related articles per category.

    Input: CSV file from path
    Output: CSV file from path
    """
    delete_if_exists(output_path)

    # Load the articles and keep only the columns needed for the summary.
    source_frame = pd.read_csv(input_path, sep=',', encoding='utf-8')
    sections = pd.DataFrame(source_frame, columns=['Section', 'COVID'])

    # Per-section aggregates: row count, and count of rows with COVID > 0.
    # pivot_table emits the aggregated columns in alphabetical order
    # (COVID before Section), which the positional rename below relies on.
    summary = sections.pivot_table(
        index=['Section'],
        aggfunc={'Section': len, 'COVID': lambda x: (x > 0).sum()})
    summary.columns = ['COVID_articles', 'Total_articles']
    summary.index.name = 'Section'

    # A mistakenly added category row is removed and folded into 'Kolumne'.
    summary = summary.drop('TIHOMIR BRALIĆ')
    summary.loc[['Kolumne'], ['Total_articles']] += 1

    # Drop irrelevant sub-categories with too few articles.
    summary = summary[summary['Total_articles'] > 51]
    summary.to_csv(output_path, sep=',', mode='a')

    print('\n************************************************************************')
    print('Articles by category')
    print('************************************************************************')
    print(summary)
def lemmatize_wordlist(input_path, output_path):
    """Lemmatize every word of a given weighted word list.

    Args:
        input_path (string): input path to a word list
        output_path (string): output path to a word list
    """
    delete_if_exists(output_path)
    print('Lemmatizing ' + str(input_path) + ' in Croatian language.')

    with open(input_path, 'r', encoding='utf-8') as src, \
            open(output_path, 'a', encoding='utf-8') as dst:
        reader = csv.reader(src, delimiter=' ')
        writer = csv.writer(dst, delimiter=' ', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

        # Croatian classla pipeline — built once, reused for every row.
        nlp = classla.Pipeline(lang='hr', processors='lemma, tokenize, pos',
                               use_gpu=False)

        print('Lemmatization of ' + input_path + ' started...')
        for entry in reader:
            word, weight = entry[0], entry[1]
            doc = nlp(word)
            # Replace the word by the concatenation of its lemmas.
            lemma = ''.join(token.lemma
                            for sentence in doc.sentences
                            for token in sentence.words)
            writer.writerow([lemma, weight])

    print('Lemmatized file saved at: ' + output_path)
def covid_identifier(input_path, output_path, word_list):
    """Filter COVID-19 related articles with three or more emoji reactions.

    Matches the lower-cased title, subtitle and article text against a word
    list; matching rows whose emoji-reaction total is >= 3 are written to
    the output file (columns 0-16, original casing preserved).

    Args:
        input_path (string): .csv input file
        output_path (string): .csv output file
        word_list (string): .txt file with COVID related words
    """
    delete_if_exists(output_path)
    covid_dict = file_to_list(word_list)
    article_counter = 0
    covid_counter = 0

    with open(input_path, 'r', encoding='utf-8') as csv_read, \
            open(output_path, 'a', encoding='utf-8') as csv_write:
        csv_reader = csv.reader(csv_read, delimiter=',')
        csv_writer = csv.writer(csv_write, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

        headers = [
            'ID', 'Title', 'Subtitle', 'URL', 'Section', 'Article_text',
            'Published_time', 'Modified_time', 'Author', 'Comments',
            'Reaction_love', 'Reaction_laugh', 'Reaction_hug',
            'Reaction_ponder', 'Reaction_sad', 'Reaction_mad',
            'Reaction_mind_blown'
        ]
        # Skip old header, add a new one
        next(csv_reader, None)
        csv_writer.writerow(headers)

        print('Modifying portal_articles.csv...')
        print('Calculating the number of COVID articles...')
        for row in csv_reader:
            # Lower-cased copies are used for matching only; the row itself
            # is written out unchanged.
            title = row[1].lower()
            subtitle = row[2].lower()
            article_text = row[5].lower()

            # Emoji value total (columns 10-16) used for article filtering.
            emoji_sum = sum(int(value) for value in row[10:17])

            # Identifies covid articles, based on a list of words
            # in covid_dictionary.txt
            if any(word in title or word in subtitle or word in article_text
                   for word in covid_dict):
                if emoji_sum >= 3:
                    csv_writer.writerow(row[:17])
                    covid_counter += 1
            article_counter += 1

    print('Total articles:', article_counter)
    print('COVID-19 articles:', covid_counter)
def covid_identifier(input_path, output_path, covid_wordlist):
    """Identify COVID-19 related articles and flag them in a 'COVID' column.

    Lower-cases the title, subtitle and article text in place, matches them
    against a dictionary of common COVID expressions, and appends a binary
    column (1 = match, 0 = no match) to every row written out.

    Args:
        input_path (string): path to input .csv file
        output_path (string): path to output .csv file
        covid_wordlist (string): path to .txt covid_wordlist

    NOTE(review): this file defines ``covid_identifier`` twice with different
    signatures; this later definition shadows the earlier one at import
    time — confirm which is intended and rename one of them.
    """
    delete_if_exists(output_path)
    covid_dict = file_to_list(covid_wordlist)
    article_counter = 0
    covid_counter = 0

    with open(input_path, 'r', encoding='utf-8') as csv_read, \
            open(output_path, 'a', encoding='utf-8') as csv_write:
        csv_reader = csv.reader(csv_read, delimiter=',')
        csv_writer = csv.writer(csv_write, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

        headers = ['ID', 'Title', 'Subtitle', 'URL', 'Section', 'Article_text',
                   'Published_time', 'Modified_time', 'Author', 'Comments',
                   'Reaction_love', 'Reaction_laugh', 'Reaction_hug',
                   'Reaction_ponder', 'Reaction_sad', 'Reaction_mad',
                   'Reaction_mind_blown', 'COVID']
        # Skip old header, add a new one
        next(csv_reader, None)
        csv_writer.writerow(headers)

        print('Modifying portal_articles.csv...')
        print('Calculating the number of COVID articles...')
        for row in csv_reader:
            # Text columns are lower-cased in place, so the output keeps
            # them lower-cased too.
            row[1] = row[1].lower()
            row[2] = row[2].lower()
            row[5] = row[5].lower()

            # 1 when any dictionary word occurs in title/subtitle/text.
            is_covid = int(any(word in row[1] or word in row[2]
                               or word in row[5] for word in covid_dict))
            covid_counter += is_covid
            row.append(is_covid)
            csv_writer.writerow(row)

            # Sum of articles
            article_counter += 1

    print('Total articles:', article_counter)
    print('COVID-19 articles:', covid_counter)
def lemmatize_articles(input_path, output_path):
    """Lemmatize the textual columns (title, subtitle, text) of given articles.

    Args:
        input_path (string): input path to a csv file
        output_path (string): output path to a csv file
    """
    delete_if_exists(output_path)
    print('Lemmatizing ' + str(input_path) + ' in Croatian language.')

    with open(input_path, 'r', encoding='utf-8') as csv_read, \
            open(output_path, 'a', encoding='utf-8') as csv_write:
        csv_reader = csv.reader(csv_read, delimiter=',')
        csv_writer = csv.writer(csv_write, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

        headers = ['ID', 'Title', 'Subtitle', 'URL', 'Section', 'Article_text',
                   'Published_time', 'Modified_time', 'Author', 'Comments',
                   'Reaction_love', 'Reaction_laugh', 'Reaction_hug',
                   'Reaction_ponder', 'Reaction_sad', 'Reaction_mad',
                   'Reaction_mind_blown']
        csv_writer.writerow(headers)
        # Skip old header, add a new one
        next(csv_reader, None)

        # Classla processor — built once, reused for every row.
        nlp = classla.Pipeline(lang='hr', processors='lemma, tokenize, pos',
                               use_gpu=False)

        def _lemmatize(text):
            # Return the space-joined, lower-cased lemmas of `text`, or a
            # fixed fallback when the pipeline cannot process it (e.g. None).
            # BUGFIX: the original handlers all reassigned `title` (copy-paste
            # error) and later joined the fallback string 'N/A' character by
            # character, producing 'n / a' in the output.
            try:
                doc = nlp(text)
                lemmas = [word.lemma for sent in doc.sentences
                          for word in sent.words]
            except Exception:
                return 'n/a'
            return ' '.join(lemmas).lower()

        print('Lemmatization started...')
        row_id = 1  # renamed from `id`, which shadowed the builtin
        for row in csv_reader:
            lem_title = _lemmatize(row[1])
            lem_subtitle = _lemmatize(row[2])
            lem_article_text = _lemmatize(row[5])
            csv_writer.writerow([row_id, lem_title, lem_subtitle, row[3],
                                 row[4], lem_article_text] + row[6:17])
            row_id += 1

    print('Lemmatized file saved at: ' + output_path)
def clear_stop_words(input_path, output_path, stop_word_input, mode):
    """Clear given stopwords from a .csv file.

    Args:
        input_path (string): path to input .csv file
        output_path (string): path to output .csv. file
        stop_word_input (string): path to stopwords .txt file
        mode (int): 0 - full cleaning mode, 1 - interpunction cleaning mode

    Raises:
        ValueError: if ``mode`` is neither 0 nor 1.
    """
    delete_if_exists(output_path)
    stop_words = set(file_to_list(stop_word_input))

    # BUGFIX: an unknown mode previously fell through both branches and
    # raised a confusing NameError inside the loop; fail fast instead.
    # Hoisting the dispatch also removes a loop-invariant check per row.
    if mode == 0:
        word_filter = word_filter_full
    elif mode == 1:
        word_filter = word_filter_interpunction
    else:
        raise ValueError(
            'mode must be 0 (full) or 1 (interpunction), got %r' % (mode,))

    print('Cleaning file ' + str(input_path) + ' of stop words...')
    with open(input_path, 'r', encoding='utf-8') as csv_read, \
            open(output_path, 'a', encoding='utf-8') as csv_write:
        csv_reader = csv.reader(csv_read, delimiter=',')
        csv_writer = csv.writer(csv_write, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

        headers = [
            'ID', 'Title', 'Subtitle', 'URL', 'Section', 'Article_text',
            'Published_time', 'Modified_time', 'Author', 'Comments',
            'Reaction_love', 'Reaction_laugh', 'Reaction_hug',
            'Reaction_ponder', 'Reaction_sad', 'Reaction_mad',
            'Reaction_mind_blown'
        ]
        csv_writer.writerow(headers)
        # Skip old header, add a new one
        next(csv_reader, None)

        row_id = 0
        for row in csv_reader:
            # Strip a stray leading quote left over in scraped titles.
            title = row[1].lstrip('"')
            subtitle = row[2].lstrip('"')
            article_text = row[5]

            filtered_title = word_filter(word_tokenize(title), stop_words)
            filtered_subtitle = word_filter(word_tokenize(subtitle),
                                            stop_words)
            filtered_text = word_filter(word_tokenize(article_text),
                                        stop_words)

            row_id += 1
            csv_writer.writerow([row_id, filtered_title, filtered_subtitle,
                                 row[3], row[4], filtered_text] + row[6:17])

    print('Clean file saved at: ' + output_path)