Example #1
def main():
    # Replace with your path (obvs)
    parc_directory = "./../Data/parc30-conll/train-conll-foreval/"
    polnear_directory = "./../Data/polnear-conll/train-conll-foreval/"  # remember the folder structure should be ./../Data/corpus/corpus_subset/corpus_file1.xml

    one_sentence_total = 0
    multiple_sentences_total = 0

    i = 1
    # Choose which corpus directory to scan (i.e. which corpus from above);
    # use the same directory in the import_attribution_doc call below.
    for filename in os.listdir(polnear_directory):
        if i % 50 == 0:
            # Progress indicator: prints the current state every 50 files
            print(filename)
            print('one sentence:', one_sentence_total, 'multiple sentence:',
                  multiple_sentences_total)
        i += 1
        df = import_attribution_doc(polnear_directory + filename)
        if df["attribution"][0] != 0:
            atts = extract_attributions(df)
            att_spans = extract_attribution_spans(atts)
            one_sentence, multiple_sentences = count_span_sentence_overlaps(
                df, att_spans)
            one_sentence_total += one_sentence
            multiple_sentences_total += multiple_sentences
    print()
    print('one sentence:', one_sentence_total)
    print('multiple sentence:', multiple_sentences_total)
Example #2
def main():
    # Replace with your path (obvs)
    parc_directory = "./../Data/parc30-conll/train-conll-foreval/"
    polnear_directory = "../Data/polnear-conll/polnear-conll/train-conll-foreval/"

    one_sentence_total = 0
    multiple_sentences_total = 0

    i = 1
    for filename in os.listdir(polnear_directory):
        if i % 50 == 0:
            # Progress indicator: prints the current state every 50 files
            print(filename)
            print('one sentence:', one_sentence_total, 'multiple sentence:',
                  multiple_sentences_total)
        i += 1
        df = import_attribution_doc(polnear_directory + filename)
        if df["attribution"][0] != 0:
            atts = extract_attributions(df)
            att_spans = extract_attribution_spans(atts)
            one_sentence, multiple_sentences = count_span_sentence_overlaps(
                df, att_spans)
            one_sentence_total += one_sentence
            multiple_sentences_total += multiple_sentences
    print()
    print('one sentence:', one_sentence_total)
    print('multiple sentence:', multiple_sentences_total)
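
Both mains tally the two counts returned by count_span_sentence_overlaps, whose implementation is not shown on this page. A minimal sketch of what such a helper might look like, assuming (as in the other examples here) that df carries a 'sentence_number' column and each span is a (start, end) token range:

def count_span_sentence_overlaps(df, att_spans):
    # Hypothetical sketch; the real helper lives elsewhere in the project.
    one_sentence = 0
    multiple_sentences = 0
    for start, end in att_spans:
        # A span stays within one sentence iff all its rows share a sentence number.
        if df['sentence_number'].iloc[start:end].nunique() <= 1:
            one_sentence += 1
        else:
            multiple_sentences += 1
    return one_sentence, multiple_sentences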
Example #3
File: ner.py  Project: SorenKF/cueties
def get_ne_info(corpus_directory):
    """
    Adds two columns to the existing DataFrame with
    (1) IOB tags that indicate whether the word is the B(eginning), I(nside) or O(utside) of Named Entity (NE).
    (2) information about the type of NE (e.g., person, organisation, date, etc)
    # TODO: Should I remove the irrelevant types such as dates, and only keep person, organisation, etc.
    """
    for file in os.listdir(corpus_directory):
        list_of_words = []
        df = import_attribution_doc(corpus_directory + file)
        for index, entry in df.iterrows():
            list_of_words.append(entry["word"])

        paragraph = "".join([
            " " +
            i if not i.startswith("'") and i not in string.punctuation else i
            for i in list_of_words
        ]).strip()
        doc = nlp(paragraph)

        iob_tags = []
        ent_types = []
        for word in doc:
            iob_tags.append(word.ent_iob_)
            # TODO: should I replace the empty values with O or another value?
            ent_types.append(word.ent_type_)

        df["iob"] = iob_tags
        df["ent"] = ent_types

        # TODO: Where to write the DataFrame to?
        # NOTE: returning inside the loop means only the first file is ever
        # processed; move this out (or write each df to disk) once the TODO
        # above is resolved.
        return df
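
nlp (and the os/string imports) are module-level in the source file and do not appear in this excerpt; nlp is presumably a spaCy pipeline, since ent_iob_ and ent_type_ are spaCy token attributes. A usage sketch under that assumption (the model name is a placeholder):

import spacy

# Assumption: any English spaCy model with an NER component will do.
nlp = spacy.load("en_core_web_sm")

df = get_ne_info("./../Data/polnear-conll/train-conll-foreval/")
print(df[["word", "iob", "ent"]].head())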
Example #4
def get_cue_frequencies():
    """
    This function extracts all attributions in a corpus and returns 
    - a dictionary with for every entry a string of lemmas in cue spans as key and frequency as value
    - a dictionary with for every entry a string of POS in cue spans as key and frequency as value
    """

    lemma_list_cue = []
    freq_dict_lemma_cue = {}
    pos_list_cue = []
    freq_dict_pos_cue = {}

    # `directory` is a module-level corpus path (not defined in this excerpt).
    for filename in os.listdir(directory):
        df = import_attribution_doc(directory + filename)
        if df["attribution"][0] != 0:
            atts = extract_attributions(df)

            for attribution in atts:
                cue_span = attribution["CUE"]

                for span in cue_span:
                    if span is None:
                        continue
                    # Join the lemmas / POS tags inside the span into
                    # space-separated strings.
                    lemma_list_cue.append(" ".join(df["lemma"][span[0]:span[1]]))
                    pos_list_cue.append(" ".join(df['POS'][span[0]:span[1]]))

    for item in lemma_list_cue:
        if item in freq_dict_lemma_cue:
            freq_dict_lemma_cue[item] += 1
        else:
            freq_dict_lemma_cue[item] = 1

    for item in pos_list_cue:
        if item in freq_dict_pos_cue:
            freq_dict_pos_cue[item] += 1
        else:
            freq_dict_pos_cue[item] = 1

    return (freq_dict_lemma_cue, freq_dict_pos_cue)
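
The two counting loops at the end duplicate what collections.Counter already does; since Counter is a dict subclass, the following is a drop-in equivalent:

from collections import Counter

freq_dict_lemma_cue = Counter(lemma_list_cue)
freq_dict_pos_cue = Counter(pos_list_cue)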
Example #5
def preprocessing_main(corpus_directory, new_directory):
    '''
    Main function for preprocessing: takes a corpus directory and converts each file to the
    new format (.tsv files from pandas DataFrames) with updated info (all columns predicted
    by stanza and a new cue_label column).

    :param corpus_directory: the path to a folder containing all of the desired corpus files
    :param new_directory: the path to the folder where the new files will be written
    '''
    start_time = time.time()

    nlp = stanza.Pipeline('en', tokenize_pretokenized=True)

    finished = 0

    for file in os.listdir(corpus_directory):
        if finished % 100 == 0:
            print(f"Finished {finished}. On to file {file}")
            print(f"Time elapsed: {time.time()-start_time}")
        outfile = file.split(".")[0] + ".tsv"

        df = main_utils.import_attribution_doc(corpus_directory + file)
        cue_label_df = extract_cue_labels(df)

        sents = file_to_sents(cue_label_df)
        doc = nlp(sents)

        stanza_df = stanza_to_df(doc)
        assert len(stanza_df) == len(cue_label_df["cue_label"]), f"Fatal error; file {file}"
        stanza_df["cue_label"] = cue_label_df["cue_label"]
        assert len(stanza_df) == len(cue_label_df["attribution"]), f"Fatal error; file {file}"
        stanza_df["attribution"] = cue_label_df["attribution"]

        stanza_df.to_csv(new_directory + outfile, sep="\t")

        finished += 1
    end_time = time.time()
    print(f"Total time elapsed: {end_time-start_time}")
Example #6
def get_corpus_stats(corpus_directory):
    """
    Function to find the general statistics on each file in a corpus and in the corpus overall.

    takes one argument: corpus directory- the relative path to the directory containing the corpus.

    returns 2 items: main_list- a list of dictionaries with each dictionary containing stats for each file in the corpus
                    stats_dict- a dictionary containing the stats for the overall corpus
    :param corpus_directory:
    """

    main_list = list()
    for filename in os.listdir(corpus_directory):

        df = import_attribution_doc(corpus_directory + filename)

        doc_dict = dict()

        # These stats do not depend on any one column, so compute them once per
        # file (the original looped over df.columns, recomputing them per column).
        file_id = df['filename'][0]
        sens = df['sentence_number'].max()

        # sent_token_number restarts at 1 on each new sentence, so hitting a 1
        # closes off the previous sentence; the first appended value is a dummy
        # (just the first token), which mean(sent_lens[1:]) skips below.
        sent_lens = list()
        count = 0
        for row in df['sent_token_number']:
            count += 1
            if row == 1:
                sent_lens.append(count)
                count = 0
        # The last sentence never hits another boundary, so take its length
        # from the final token's within-sentence number.
        last_item = df['sent_token_number'].iloc[-1]
        sent_lens.append(last_item)
        av_len = mean(sent_lens[1:])

        token_count = df['doc_token_number'].iloc[-1]

        doc_dict['filename'] = file_id
        doc_dict['number of sentences'] = sens
        doc_dict['average sentence length'] = av_len
        doc_dict['number of tokens'] = token_count

        main_list.append(doc_dict)

    # print(main_list)

    stats_dict = dict()

    sent_count = list()
    token_count = list()
    sent_len_count = list()

    for doc_dict in main_list:
        sent_count.append(doc_dict['number of sentences'])
        token_count.append(doc_dict['number of tokens'])
        sent_len_count.append(doc_dict['average sentence length'])

    av_sent_count = mean(sent_count)
    av_token_count = mean(token_count)
    av_sent_len = mean(sent_len_count)

    stats_dict['number of docs'] = len(main_list)
    stats_dict['average number of sentences'] = av_sent_count
    stats_dict['average number of tokens'] = av_token_count
    stats_dict['average sentence length'] = av_sent_len
    # print(stats_dict)

    return main_list, stats_dict
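
The manual sentence-boundary counting above can also be expressed directly in pandas; this sketch computes the same per-file numbers, assuming (as above) that sentence_number labels every row and doc_token_number is a 1-based running index:

# Each group is one sentence, so the group sizes are the sentence lengths.
sent_lens = df.groupby('sentence_number').size()
sens = df['sentence_number'].max()
av_len = sent_lens.mean()
token_count = len(df)  # equals df['doc_token_number'].iloc[-1] under the assumption above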