def main():
    """
    Accumulate and print sentence-overlap counts for attribution spans.

    Every file in the selected corpus directory is loaded with
    ``import_attribution_doc``. For each document whose first "attribution"
    value is not 0, the attributions and their spans are extracted and passed
    to ``count_span_sentence_overlaps``; the two counts it returns are added
    to running totals, which are printed at the end under the labels
    'one sentence' and 'multiple sentence'.

    NOTE(review): this file contains more than one ``def main()``; the
    definition that appears last is the one Python keeps.
    """
    # Replace with your path (obvs)
    parc_directory = "./../Data/parc30-conll/train-conll-foreval/"
    polnear_directory = "../Data/polnear-conll/polnear-conll/train-conll-foreval/"

    # Choose the corpus here (e.g. parc_directory instead); this is the only
    # place that needs to change.
    corpus_directory = polnear_directory

    one_sentence_total = 0
    multiple_sentences_total = 0

    for i, filename in enumerate(os.listdir(corpus_directory), start=1):
        if i % 50 == 0:
            # Progress report: prints the running totals every 50 files.
            print(filename)
            print('one sentence:', one_sentence_total, 'multiple sentence:',
                  multiple_sentences_total)

        # os.path.join keeps working if the directory string is edited and
        # loses its trailing slash.
        df = import_attribution_doc(os.path.join(corpus_directory, filename))

        # Documents whose first "attribution" value is 0 are skipped
        # (presumably they contain no attributions -- confirm against
        # import_attribution_doc).
        if df["attribution"][0] != 0:
            atts = extract_attributions(df)
            att_spans = extract_attribution_spans(atts)
            one_sentence, multiple_sentences = count_span_sentence_overlaps(
                df, att_spans)
            one_sentence_total += one_sentence
            multiple_sentences_total += multiple_sentences

    print()
    print('one sentence:', one_sentence_total)
    print('multiple sentence:', multiple_sentences_total)
def main():
    """
    Accumulate and print sentence-overlap counts for attribution spans.

    Every file in the selected corpus directory is loaded with
    ``import_attribution_doc``. For each document whose first "attribution"
    value is not 0, the attributions and their spans are extracted and passed
    to ``count_span_sentence_overlaps``; the two counts it returns are added
    to running totals, which are printed at the end under the labels
    'one sentence' and 'multiple sentence'.

    NOTE(review): this file contains more than one ``def main()``; the
    definition that appears last is the one Python keeps.
    """
    # Replace with your path (obvs)
    # Remember the folder structure should be
    # ./../Data/corpus/corpus_subset/corpus_file1.xml
    parc_directory = "./../Data/parc30-conll/train-conll-foreval/"
    polnear_directory = "./../Data/polnear-conll/train-conll-foreval/"

    # Specify which corpus to run on (one of the directories above). This is
    # the only place that needs to change: both the file listing and the file
    # loading below use corpus_directory.
    corpus_directory = polnear_directory

    one_sentence_total = 0
    multiple_sentences_total = 0

    for i, filename in enumerate(os.listdir(corpus_directory), start=1):
        if i % 50 == 0:
            # Progress report: prints the running totals every 50 files.
            print(filename)
            print('one sentence:', one_sentence_total, 'multiple sentence:',
                  multiple_sentences_total)

        # os.path.join keeps working if the directory string is edited and
        # loses its trailing slash.
        df = import_attribution_doc(os.path.join(corpus_directory, filename))

        # Documents whose first "attribution" value is 0 are skipped
        # (presumably they contain no attributions -- confirm against
        # import_attribution_doc).
        if df["attribution"][0] != 0:
            atts = extract_attributions(df)
            att_spans = extract_attribution_spans(atts)
            one_sentence, multiple_sentences = count_span_sentence_overlaps(
                df, att_spans)
            one_sentence_total += one_sentence
            multiple_sentences_total += multiple_sentences

    print()
    print('one sentence:', one_sentence_total)
    print('multiple sentence:', multiple_sentences_total)
def get_cue_frequencies():
    """
    Count how often each cue span occurs in a corpus, by lemma and by POS.

    Every file in the module-level ``directory`` is loaded with
    ``import_attribution_doc``. For each document whose first "attribution"
    value is not 0, the attributions are extracted and every non-None span
    under an attribution's "CUE" key is turned into two strings: the
    space-joined values of ``df["lemma"][start:end]`` and the space-joined
    values of ``df["POS"][start:end]``.

    Returns:
        A tuple ``(freq_dict_lemma_cue, freq_dict_pos_cue)`` of plain dicts:
        - the first maps each cue-span lemma string to its frequency
        - the second maps each cue-span POS string to its frequency
        Keys appear in first-seen order.
    """
    from collections import Counter

    lemma_counts = Counter()
    pos_counts = Counter()

    for filename in os.listdir(directory):
        df = import_attribution_doc(directory + filename)
        if df["attribution"][0] == 0:
            # First "attribution" value of 0: nothing is extracted from
            # this document.
            continue

        atts = extract_attributions(df)
        # Index-based access is kept on purpose: the container type returned
        # by extract_attributions is not visible from this function, and
        # atts[i] is the only access pattern known to be valid.
        for i in range(len(atts)):
            for span in atts[i]["CUE"]:
                if span is None:
                    continue
                start, end = span[0], span[1]
                # strip(" ") mirrors the original normalisation, which
                # removed leading/trailing spaces from the joined string.
                lemma_key = " ".join(df["lemma"][start:end]).strip(" ")
                pos_key = " ".join(df["POS"][start:end]).strip(" ")
                lemma_counts[lemma_key] += 1
                pos_counts[pos_key] += 1

    # Convert back to plain dicts so callers get the same type as before.
    return (dict(lemma_counts), dict(pos_counts))