Example #1
def articles_by_category(input_path, output_path):
    """Counts total and COVID related articles by categories. 

    Input:
        CSV File from path
    
    Output:
        CSV File from path
    """

    delete_if_exists(output_path)

    csv_input = input_path
    csv_output = output_path

    # Use the pandas 'read_csv' method to open the .csv file
    csv_reader = pd.read_csv(csv_input, sep=',', encoding='utf-8')

    # Keep only the two columns needed for counting
    dataframe = pd.DataFrame(csv_reader, columns=['Section', 'COVID'])

    article_category = dataframe.pivot_table(index=['Section'],
                                             aggfunc={'Section': len,
                                                      'COVID': lambda x: (x > 0).sum()})
    
    # Rename columns (pivot_table returns the aggregated columns
    # in alphabetical order: COVID, Section)
    article_category.columns = ['COVID_articles', 'Total_articles']
    article_category.index.name = 'Section'

    # Drop the mistakenly scraped category (an author name) and
    # credit its article to the 'Kolumne' section instead
    article_category = article_category.drop('TIHOMIR BRALIĆ')
    article_category.loc[['Kolumne'], ['Total_articles']] += 1

    # Keep only categories with more than 51 articles; smaller ones
    # are irrelevant sub-categories
    article_category = article_category[article_category['Total_articles'] > 51]
    
    article_category.to_csv(csv_output, sep=',', mode='a')

    print('\n************************************************************************')
    print('Articles by category')
    print('************************************************************************')
    print(article_category)
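
The pivot_table call above doubles as a counter: len over 'Section' yields each group's size, while the lambda over 'COVID' counts the positive flags. A minimal self-contained sketch of the same aggregation in named-aggregation groupby form (toy data, invented values):

import pandas as pd

# Toy data with the same column names as above; the values are invented
df = pd.DataFrame({'Section': ['Sport', 'Sport', 'Kolumne'],
                   'COVID': [1, 0, 1]})

# groupby equivalent of the pivot_table aggregation above
counts = df.groupby('Section').agg(
    COVID_articles=('COVID', lambda x: (x > 0).sum()),
    Total_articles=('COVID', 'size'),
)
print(counts)  # Kolumne: 1 COVID of 1 total; Sport: 1 COVID of 2 total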
Example #2
def lemmatize_wordlist(input_path, output_path):
    """
    Lemmatization of a given word list

    Args:
        input_path (string): input path to a word list
        output_path (string): output path to a word list
    """
    delete_if_exists(output_path)

    print('Lemmatizing ' + str(input_path) + ' in the Croatian language.')

    with open(input_path, 'r', encoding='utf-8') as csv_read, \
         open(output_path, 'a', encoding='utf-8') as csv_write:

        csv_reader = csv.reader(csv_read, delimiter=' ')
        csv_writer = csv.writer(csv_write,
                                delimiter=' ',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL,
                                lineterminator='\n')

        # Classla pipeline (processors listed in dependency order)
        nlp = classla.Pipeline(lang='hr', processors='tokenize,pos,lemma', use_gpu=False)

        print('Lemmatization of ' + input_path + ' started...')

        for row in csv_reader:

            row_word = row[0]
            word_weight = row[1]

            expression = nlp(row_word)

            # Replace the word with its lemmatized form
            lem_word = [word.lemma for sent in expression.sentences for word in sent.words]
            lem_word = ''.join(lem_word)

            csv_writer.writerow([lem_word, word_weight])
        
    print('Lemmatized file saved at: ' + output_path)
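
Every function on this page first calls a delete_if_exists helper that is not shown here. A minimal sketch consistent with how it is used, since all writers open their output in 'a' (append) mode (an assumption, not the original implementation):

import os

def delete_if_exists(path):
    # Remove a stale output file so the append-mode writers start clean
    if os.path.exists(path):
        os.remove(path)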
Example #3
def covid_identifier(input_path, output_path, word_list):
    """
    Filters COVID-19 related articles with at least three emoji reactions.

    Args:
        input_path (string): .csv input file
        output_path (string): .csv output file
        word_list (string): .txt file with COVID related words
    """

    delete_if_exists(output_path)

    csv_input = input_path
    csv_output = output_path
    covid_dict = file_to_list(word_list)

    article_counter = 0
    covid_counter = 0

    with open(csv_input, 'r', encoding='utf-8') as csv_read, \
         open(csv_output, 'a', encoding='utf-8') as csv_write:

        csv_reader = csv.reader(csv_read, delimiter=',')
        csv_writer = csv.writer(csv_write,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL,
                                lineterminator='\n')

        # Output file header
        headers = [
            'ID', 'Title', 'Subtitle', 'URL', 'Section', 'Article_text',
            'Published_time', 'Modified_time', 'Author', 'Comments',
            'Reaction_love', 'Reaction_laugh', 'Reaction_hug',
            'Reaction_ponder', 'Reaction_sad', 'Reaction_mad',
            'Reaction_mind_blown'
        ]

        # Skip the old header and write the new one
        next(csv_reader, None)
        csv_writer.writerow(headers)

        print('Modifying portal_articles.csv...')
        print('Calculating the number of COVID articles...')

        for row in csv_reader:

            # Lower-case the text columns for case-insensitive matching
            title = row[1].lower()
            subtitle = row[2].lower()
            article_text = row[5].lower()

            reaction_love = int(row[10])
            reaction_laugh = int(row[11])
            reaction_hug = int(row[12])
            reaction_ponder = int(row[13])
            reaction_sad = int(row[14])
            reaction_mad = int(row[15])
            reaction_mind_blown = int(row[16])

            # Total reaction count, used below to filter articles
            emoji_sum = (reaction_love + reaction_laugh + reaction_hug
                         + reaction_ponder + reaction_sad + reaction_mad
                         + reaction_mind_blown)

            # Identifies covid articles, based on a list of words
            # in covid_dictionary.txt
            if (any(map(title.__contains__, covid_dict))
                    or any(map(subtitle.__contains__, covid_dict))
                    or any(map(article_text.__contains__, covid_dict))):

                if emoji_sum >= 3:

                    csv_writer.writerow(row[:17])

                    covid_counter += 1

            article_counter += 1

    print('Total articles:', article_counter)
    print('COVID-19 articles:', covid_counter)
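
The any(map(text.__contains__, covid_dict)) test used above is plain substring matching, so a stem such as 'covid' also matches inflected forms like 'covidom'. A small sketch of the same idiom (the word-list entries are invented):

covid_dict = ['covid', 'koron', 'pandemij']  # invented entries

def mentions_covid(text):
    # Equivalent to any(map(text.__contains__, covid_dict))
    return any(word in text for word in covid_dict)

print(mentions_covid('nove mjere zbog koronavirusa'))  # True
print(mentions_covid('rezultati nogometne utakmice'))  # False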
Example #4
def covid_identifier(input_path, output_path, covid_wordlist):
    """
    Identifies COVID-19 related articles by matching them against
    a dictionary of expressions commonly used in such articles.

    Args:
        input_path (string): path to input .csv file
        output_path (string): path to output .csv file
        covid_wordlist (string): path to .txt covid_wordlist
    """
    delete_if_exists(output_path)
    covid_dict = file_to_list(covid_wordlist)

    article_counter = 0
    covid_counter = 0

    with open(input_path, 'r', encoding='utf-8') as csv_read, \
         open(output_path, 'a', encoding='utf-8') as csv_write:

        csv_reader = csv.reader(csv_read, delimiter=',')
        csv_writer = csv.writer(csv_write,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL,
                                lineterminator='\n')

        # Output header, including the new COVID flag column
        headers = [
            'ID', 'Title', 'Subtitle', 'URL', 'Section', 'Article_text',
            'Published_time', 'Modified_time', 'Author', 'Comments',
            'Reaction_love', 'Reaction_laugh', 'Reaction_hug',
            'Reaction_ponder', 'Reaction_sad', 'Reaction_mad',
            'Reaction_mind_blown', 'COVID'
        ]
        
        # Skip the old header and write the new one
        next(csv_reader, None)
        csv_writer.writerow(headers)

        print('Modifying portal_articles.csv...')
        print('Calculating the number of COVID articles...')

        for row in csv_reader:

            # Lower-case the text columns in place for case-insensitive
            # matching (the lower-cased text is also what gets written out)
            row[1] = row[1].lower()
            row[2] = row[2].lower()
            row[5] = row[5].lower()

            # Identifies covid articles, based on a list of words
            # in covid_dictionary.txt
            if (any(map(row[1].__contains__, covid_dict))
                    or any(map(row[2].__contains__, covid_dict))
                    or any(map(row[5].__contains__, covid_dict))):

                covid_counter += 1
                row.append(1)    # Flag the article as COVID-related
                csv_writer.writerow(row)
            else:
                row.append(0)
                csv_writer.writerow(row)

            # Count every article
            article_counter += 1

    print('Total articles:', article_counter)
    print('COVID-19 articles:', covid_counter)
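
Both identifier variants load the word list through a file_to_list helper that is not shown on this page. Because the matching runs on lower-cased text, one plausible minimal implementation (an assumption) is:

def file_to_list(path):
    # Hypothetical helper: one entry per line, lower-cased so it matches
    # the lower-cased article text it is compared against
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip().lower() for line in f if line.strip()]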
Example #5
def lemmatize_articles(input_path, output_path):
    """
    Lemmatizatizes textual columns of given articles.

    Args:
        input_path (string):  input path to a csv file
        output_path (string): output path to a csv file
    """
    
    delete_if_exists(output_path)

    print('Lemmatizing ' + str(input_path) + ' in the Croatian language.')

    with open(input_path, 'r', encoding='utf-8') as csv_read, \
         open(output_path, 'a', encoding='utf-8') as csv_write:

        csv_reader = csv.reader(csv_read, delimiter=',')
        csv_writer = csv.writer(csv_write,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL,
                                lineterminator='\n')

        headers = [
            'ID', 'Title', 'Subtitle', 'URL', 'Section', 'Article_text',
            'Published_time', 'Modified_time', 'Author', 'Comments',
            'Reaction_love', 'Reaction_laugh', 'Reaction_hug',
            'Reaction_ponder', 'Reaction_sad', 'Reaction_mad',
            'Reaction_mind_blown'
        ]

        csv_writer.writerow(headers)

        # Skip the old header (the new one was written above)
        next(csv_reader, None)

        # Classla pipeline (processors listed in dependency order)
        nlp = classla.Pipeline(lang='hr', processors='tokenize,pos,lemma', use_gpu=False)

        print('Lemmatization started...')

        article_id = 1

        for row in csv_reader:

            title = row[1]
            subtitle = row[2]
            article_text = row[5]

            # Handle None/empty values that make the pipeline fail
            try:
                doc_title = nlp(title)
                lem_title = [word.lemma for sent in doc_title.sentences for word in sent.words]
            except Exception:
                lem_title = ['N/A']

            try:
                doc_subtitle = nlp(subtitle)
                lem_subtitle = [word.lemma for sent in doc_subtitle.sentences for word in sent.words]
            except Exception:
                lem_subtitle = ['N/A']

            try:
                doc_article_text = nlp(article_text)
                lem_article_text = [word.lemma for sent in doc_article_text.sentences for word in sent.words]
            except Exception:
                lem_article_text = ['N/A']

            lem_title = ' '.join(lem_title).lower()
            lem_subtitle = ' '.join(lem_subtitle).lower()
            lem_article_text = ' '.join(lem_article_text).lower()

            csv_writer.writerow([article_id, lem_title, lem_subtitle, row[3], row[4],
                                 lem_article_text, row[6], row[7], row[8], row[9],
                                 row[10], row[11], row[12], row[13], row[14],
                                 row[15], row[16]])

            article_id += 1
        
    print('Lemmatized file saved at: ' + output_path)
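
The classla pipeline used in both lemmatization snippets downloads its Croatian models on first use. A minimal standalone sketch of the same lemmatization pattern (the sample sentence and its expected lemmas are illustrative):

import classla

# classla.download('hr')  # one-time model download, if not yet cached

nlp = classla.Pipeline(lang='hr', processors='tokenize,pos,lemma',
                       use_gpu=False)
doc = nlp('Novinari pišu članke o pandemiji.')
print([word.lemma for sent in doc.sentences for word in sent.words])
# e.g. ['novinar', 'pisati', 'članak', 'o', 'pandemija', '.']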
Example #6
def clear_stop_words(input_path, output_path, stop_word_input, mode):
    """
    Clears given stopwords from a .csv file.

    Args:
        input_path (string): path to input .csv file
        output_path (string): path to output .csv file
        stop_word_input (string): path to stopwords .txt file
        mode (int): 0 - full cleaning mode, 1 - interpunction (punctuation-only) cleaning mode
    """

    delete_if_exists(output_path)

    stop_words = set(file_to_list(stop_word_input))

    ID = 0

    print('Cleaning file ' + str(input_path) + ' of stop words...')

    with open(input_path, 'r', encoding='utf-8') as csv_read, \
         open(output_path, 'a', encoding='utf-8') as csv_write:

        csv_reader = csv.reader(csv_read, delimiter=',')
        csv_writer = csv.writer(csv_write,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL,
                                lineterminator='\n')

        headers = [
            'ID', 'Title', 'Subtitle', 'URL', 'Section', 'Article_text',
            'Published_time', 'Modified_time', 'Author', 'Comments',
            'Reaction_love', 'Reaction_laugh', 'Reaction_hug',
            'Reaction_ponder', 'Reaction_sad', 'Reaction_mad',
            'Reaction_mind_blown'
        ]

        csv_writer.writerow(headers)

        # Skip the old header (the new one was written above)
        next(csv_reader, None)

        for row in csv_reader:

            title = row[1]
            subtitle = row[2]
            article_text = row[5]

            # Strip stray leading quote characters
            title = title.lstrip('"')
            subtitle = subtitle.lstrip('"')

            title_tokens = word_tokenize(title)
            subtitle_tokens = word_tokenize(subtitle)
            text_tokens = word_tokenize(article_text)

            if mode == 0:
                filtered_title = word_filter_full(title_tokens, stop_words)
                filtered_subtitle = word_filter_full(subtitle_tokens,
                                                     stop_words)
                filtered_text = word_filter_full(text_tokens, stop_words)
            elif mode == 1:
                filtered_title = word_filter_interpunction(
                    title_tokens, stop_words)
                filtered_subtitle = word_filter_interpunction(
                    subtitle_tokens, stop_words)
                filtered_text = word_filter_interpunction(
                    text_tokens, stop_words)
            else:
                # Guard against an invalid mode, which would otherwise
                # raise a NameError on the filtered_* variables below
                raise ValueError('mode must be 0 or 1')

            ID += 1

            csv_writer.writerow([
                ID, filtered_title, filtered_subtitle, row[3], row[4],
                filtered_text, row[6], row[7], row[8], row[9], row[10],
                row[11], row[12], row[13], row[14], row[15], row[16]
            ])

    print('Clean file saved at: ' + output_path)
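
The word_filter_full and word_filter_interpunction helpers are not shown on this page; since their results are written straight into CSV cells, they presumably return joined strings. A plausible sketch of both, inferred from their names and the mode docstring (assumptions throughout), with the reminder that NLTK's word_tokenize needs the 'punkt' models:

import string
from nltk.tokenize import word_tokenize

# import nltk; nltk.download('punkt')  # one-time download for word_tokenize

def word_filter_full(tokens, stop_words):
    # Hypothetical: drop stop words and punctuation (mode 0)
    return ' '.join(t for t in tokens
                    if t.lower() not in stop_words
                    and t not in string.punctuation)

def word_filter_interpunction(tokens, stop_words):
    # Hypothetical: drop punctuation only (mode 1); stop_words is kept
    # so both helpers share a signature
    return ' '.join(t for t in tokens if t not in string.punctuation)

tokens = word_tokenize('Ovo je, na primjer, rečenica!')
print(word_filter_full(tokens, {'ovo', 'je', 'na'}))  # 'primjer rečenica'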