def main():
    """Tally the counts from count_span_sentence_overlaps over one corpus.

    Every file in the selected corpus directory is loaded with
    ``import_attribution_doc``. Files whose first ``"attribution"`` value is
    ``0`` are skipped (presumably documents without attributions -- confirm
    against the loader). For every other file the two numbers returned by
    ``count_span_sentence_overlaps`` are added to running totals, which are
    printed as a progress report on every 50th file and once more at the end.
    """
    # Replace with your path (obvs)
    parc_directory = "./../Data/parc30-conll/train-conll-foreval/"
    polnear_directory = "../Data/polnear-conll/polnear-conll/train-conll-foreval/"

    # Select the corpus in this one place; assign parc_directory to switch.
    corpus_directory = polnear_directory

    one_sentence_total = 0
    multiple_sentences_total = 0

    for file_number, filename in enumerate(os.listdir(corpus_directory),
                                           start=1):
        if file_number % 50 == 0:
            # Progress report: on every 50th file, print its name and the
            # totals gathered from the files processed before it.
            print(filename)
            print('one sentence:', one_sentence_total, 'multiple sentence:',
                  multiple_sentences_total)
        # os.path.join also works when the directory has no trailing slash.
        df = import_attribution_doc(os.path.join(corpus_directory, filename))
        if df["attribution"][0] != 0:
            atts = extract_attributions(df)
            att_spans = extract_attribution_spans(atts)
            one_sentence, multiple_sentences = count_span_sentence_overlaps(
                df, att_spans)
            one_sentence_total += one_sentence
            multiple_sentences_total += multiple_sentences
    print()
    print('one sentence:', one_sentence_total)
    print('multiple sentence:', multiple_sentences_total)
示例#2
0
def main():
    """Tally the counts from count_span_sentence_overlaps over one corpus.

    Each file found in the chosen corpus directory is read through
    ``import_attribution_doc``. A file is only analysed when the first value
    of its ``"attribution"`` column is not ``0`` (presumably meaning the
    document contains attributions -- confirm against the loader). The pair
    returned by ``count_span_sentence_overlaps`` is summed into two totals;
    the totals are printed on every 50th file and again after the last file.
    """
    # Replace with your path (obvs)
    # remember the folder structure should be
    # ./../Data/corpus/corpus_subset/corpus_file1.xml
    parc_directory = "./../Data/parc30-conll/train-conll-foreval/"
    polnear_directory = "./../Data/polnear-conll/train-conll-foreval/"

    # Specify which corpus to run on here (one of the directories above).
    # Both the listing and the file loading below use this single variable.
    corpus_directory = polnear_directory

    one_sentence_total = 0
    multiple_sentences_total = 0

    for file_number, filename in enumerate(os.listdir(corpus_directory),
                                           start=1):
        if file_number % 50 == 0:
            # This bit just lets you know where you are (prints the current
            # file name and the totals so far on every 50th file).
            print(filename)
            print('one sentence:', one_sentence_total, 'multiple sentence:',
                  multiple_sentences_total)
        # Joining the parts avoids a broken path if the trailing slash is
        # dropped from the directory string.
        df = import_attribution_doc(os.path.join(corpus_directory, filename))
        if df["attribution"][0] != 0:
            atts = extract_attributions(df)
            att_spans = extract_attribution_spans(atts)
            one_sentence, multiple_sentences = count_span_sentence_overlaps(
                df, att_spans)
            one_sentence_total += one_sentence
            multiple_sentences_total += multiple_sentences
    print()
    print('one sentence:', one_sentence_total)
    print('multiple sentence:', multiple_sentences_total)
示例#3
0
def get_cue_frequencies():
    """
    Count how often each cue span occurs in a corpus, by lemmas and by POS.

    Every file in the module-level ``directory`` is loaded with
    ``import_attribution_doc``; files whose first ``"attribution"`` value is 0
    are skipped. For each remaining file, every span stored under the ``"CUE"``
    key of each extracted attribution (``None`` entries are ignored) is turned
    into two space-separated strings, built from ``df["lemma"]`` and
    ``df["POS"]`` sliced from ``span[0]`` to ``span[1]``.

    This function returns a tuple of
    - a dictionary with for every entry a string of lemmas in cue spans as key and frequency as value
    - a dictionary with for every entry a string of POS in cue spans as key and frequency as value
    """

    freq_dict_lemma_cue = {}
    freq_dict_pos_cue = {}

    for filename in os.listdir(directory):
        # os.path.join also works when `directory` has no trailing slash.
        df = import_attribution_doc(os.path.join(directory, filename))
        if df["attribution"][0] != 0:
            atts = extract_attributions(df)

            # Access by position is kept on purpose: the container type
            # returned by extract_attributions is not visible from here.
            for i in range(len(atts)):
                for span in atts[i]["CUE"]:
                    if span is None:
                        continue

                    start, end = span[0], span[1]

                    # The outer strip(" ") mirrors the previous behaviour of
                    # trimming spaces at both ends of the assembled string.
                    cue = " ".join(df["lemma"][start:end]).strip(" ")
                    pos = " ".join(df['POS'][start:end]).strip(" ")

                    # Count directly; keys appear in first-seen order.
                    freq_dict_lemma_cue[cue] = freq_dict_lemma_cue.get(cue, 0) + 1
                    freq_dict_pos_cue[pos] = freq_dict_pos_cue.get(pos, 0) + 1

    return (freq_dict_lemma_cue, freq_dict_pos_cue)