def main():
    """
    Count, over every file in the polnear training split, how many attribution
    spans fall within a single sentence vs. span multiple sentences, and print
    running and final totals.

    Relies on module-level helpers: import_attribution_doc, extract_attributions,
    extract_attribution_spans, count_span_sentence_overlaps.
    """
    # Replace with your path (obvs).
    # Folder structure should be ./../Data/corpus/corpus_subset/corpus_file1.xml
    # NOTE(review): a parc30 path used to be assigned here too but was never
    # used; switch `corpus_directory` below to run on a different corpus.
    polnear_directory = "./../Data/polnear-conll/train-conll-foreval/"
    corpus_directory = polnear_directory

    one_sentence_total = 0
    multiple_sentences_total = 0

    for i, filename in enumerate(os.listdir(corpus_directory), start=1):
        # Progress report every 50 files (the old comment said 100 — the code
        # has always checked i % 50).
        if i % 50 == 0:
            print(filename)
            print('one sentence:', one_sentence_total,
                  'multiple sentence:', multiple_sentences_total)
        df = import_attribution_doc(corpus_directory + filename)
        # Files with no attributions have a 0 sentinel in the first row; skip them.
        if df["attribution"][0] != 0:
            atts = extract_attributions(df)
            att_spans = extract_attribution_spans(atts)
            one_sentence, multiple_sentences = count_span_sentence_overlaps(
                df, att_spans)
            one_sentence_total += one_sentence
            multiple_sentences_total += multiple_sentences

    print()
    print('one sentence:', one_sentence_total)
    print('multiple sentence:', multiple_sentences_total)
def main():
    """
    Count, over every file in the polnear training split, how many attribution
    spans fall within a single sentence vs. span multiple sentences, and print
    running and final totals.

    NOTE(review): this is a second definition of `main` in the same file — it
    shadows the earlier one (only the path literal differs). Consider merging
    them into one function taking the directory as a parameter.
    """
    # NOTE(review): a parc30 path used to be assigned here too but was never used.
    polnear_directory = "../Data/polnear-conll/polnear-conll/train-conll-foreval/"

    one_sentence_total = 0
    multiple_sentences_total = 0

    for i, filename in enumerate(os.listdir(polnear_directory), start=1):
        # Progress report every 50 files (the old comment said 100 — the code
        # has always checked i % 50).
        if i % 50 == 0:
            print(filename)
            print('one sentence:', one_sentence_total,
                  'multiple sentence:', multiple_sentences_total)
        df = import_attribution_doc(polnear_directory + filename)
        # Files with no attributions have a 0 sentinel in the first row; skip them.
        if df["attribution"][0] != 0:
            atts = extract_attributions(df)
            att_spans = extract_attribution_spans(atts)
            one_sentence, multiple_sentences = count_span_sentence_overlaps(
                df, att_spans)
            one_sentence_total += one_sentence
            multiple_sentences_total += multiple_sentences

    print()
    print('one sentence:', one_sentence_total)
    print('multiple sentence:', multiple_sentences_total)
def get_ne_info(corpus_directory):
    """
    Adds two columns to the existing DataFrame with
    (1) IOB tags that indicate whether the word is the B(eginning), I(nside)
        or O(utside) of Named Entity (NE).
    (2) information about the type of NE (e.g., person, organisation, date, etc)

    :param corpus_directory: path (with trailing slash) to a folder of corpus
        files readable by ``import_attribution_doc``.
    :return: the annotated DataFrame. NOTE(review): only the DataFrame of the
        LAST file processed is returned; the per-file results are otherwise
        discarded (see the TODO below about where to write them).

    # TODO: Should I remove the irrelevant types such as dates, and only keep
    # person, organisation, etc.
    """
    for file in os.listdir(corpus_directory):
        list_of_words = []
        df = import_attribution_doc(corpus_directory + file)
        # Collect the token column in document order.
        for index, entry in df.iterrows():
            list_of_words.append(entry["word"])
        # Rebuild running text: prepend a space before each token unless it is
        # punctuation or starts with an apostrophe (e.g. "n't", "'s"), so the
        # NER model sees natural-looking sentences.
        paragraph = "".join([
            " " + i
            if not i.startswith("'") and i not in string.punctuation else i
            for i in list_of_words
        ]).strip()
        # `nlp` is a module-level pipeline (presumably spaCy, given the
        # ent_iob_/ent_type_ attributes) — TODO confirm.
        doc = nlp(paragraph)
        iob_tags = []
        ent_types = []
        for word in doc:
            iob_tags.append(word.ent_iob_)
            # Non-entity tokens get an empty string here.
            # TODO: should I replace the empty values with O or another value
            ent_types.append(word.ent_type_)
        # NOTE(review): this assumes the NER tokenization yields exactly one
        # token per DataFrame row; a length mismatch would raise on assignment.
        df["iob"] = iob_tags
        df["ent"] = ent_types
    # TODO: Where to write the DataFrame to?
    return df
def get_cue_frequencies():
    """
    This function extracts all attributions in a corpus and returns
    - a dictionary with for every entry a string of lemmas in cue spans as key
      and frequency as value
    - a dictionary with for every entry a string of POS in cue spans as key
      and frequency as value

    NOTE(review): reads the corpus path from a module-level global
    ``directory`` — the signature is kept for backward compatibility.
    """
    from collections import Counter

    lemma_strings = []
    pos_strings = []
    for filename in os.listdir(directory):
        df = import_attribution_doc(directory + filename)
        # Files with no attributions carry a 0 sentinel in the first row.
        if df["attribution"][0] == 0:
            continue
        atts = extract_attributions(df)
        # Index access kept deliberately: atts may be a 0..n-1 keyed mapping
        # rather than a list.
        for i in range(len(atts)):
            attribution = atts[i]
            for span in attribution["CUE"]:
                if span is None:  # was `== None`; identity test is correct here
                    continue
                # span is a (start, end) pair of row indices into df.
                lemma_strings.append(" ".join(df["lemma"][span[0]:span[1]]))
                pos_strings.append(" ".join(df['POS'][span[0]:span[1]]))

    # Counter replaces the hand-rolled "if key in dict" frequency loops.
    freq_dict_lemma_cue = dict(Counter(lemma_strings))
    freq_dict_pos_cue = dict(Counter(pos_strings))
    return (freq_dict_lemma_cue, freq_dict_pos_cue)
def preprocessing_main(corpus_directory, new_directory):
    '''
    Main function for preprocessing: takes a corpus directory and converts each
    file to the new format (.tsv files from pandas dfs) with updated info (all
    columns predicted by stanza and new cue_label column)

    :param corpus_directory: the path to a folder containing all of the desires corpus files
    :param new_directory: the path to the folder where new files will be written
    '''
    start_time = time.time()
    # Pretokenized mode: stanza must not re-tokenize, so its output rows stay
    # aligned 1:1 with the original tokens.
    nlp = stanza.Pipeline('en', tokenize_pretokenized=True)
    finished = 0
    for file in os.listdir(corpus_directory):
        # Progress report every 100 files.
        if finished % 100 == 0:
            print(f"Finished {finished}. On to file {file}")
            print(f"Time elapsed: {time.time()-start_time}")
        # Same basename, .tsv extension.
        outfile = file.split(".")[0] + ".tsv"
        df = main_utils.import_attribution_doc(corpus_directory + file)
        cue_label_df = extract_cue_labels(df)
        sents = file_to_sents(cue_label_df)
        doc = nlp(sents)
        stanza_df = stanza_to_df(doc)
        # Alignment checks: stanza output must match the source token count.
        # NOTE(review): `assert` is stripped under `python -O`; consider
        # raising ValueError if these checks must always run.
        assert len(stanza_df) == len(
            cue_label_df["cue_label"]), f"Fatal error; file {file}"
        stanza_df["cue_label"] = cue_label_df["cue_label"]
        assert len(stanza_df) == len(
            cue_label_df["attribution"]), f"Fatal error; file {file}"
        stanza_df["attribution"] = cue_label_df["attribution"]
        stanza_df.to_csv(new_directory + outfile, sep="\t")
        finished += 1
    end_time = time.time()
    print(f"Total time elapsed: {end_time-start_time}")
def get_corpus_stats(corpus_directory):
    """
    Function to find the general statistics on each file in a corpus and in the
    corpus overall.

    :param corpus_directory: the relative path to the directory containing the
        corpus (with trailing slash).
    :return: a 2-tuple:
        main_list  - a list of dictionaries, one per file, with per-file stats
                     (filename, number of sentences, average sentence length,
                     number of tokens)
        stats_dict - a dictionary with the stats for the overall corpus
    """
    main_list = list()
    for filename in os.listdir(corpus_directory):
        df = import_attribution_doc(corpus_directory + filename)
        doc_dict = dict()
        # Bug fix: the original wrapped everything below in
        # `for col in df.columns:` — the loop variable was never used and the
        # body is idempotent, so identical values were recomputed once per
        # column. Removing the loop preserves the result exactly.
        filename = df['filename'][0]
        sens = df['sentence_number'].max()
        # Sentence lengths: sent_token_number restarts at 1 on each new
        # sentence, so `count` at that moment is the previous sentence's
        # length. The very first append is a spurious 1 (the first token of
        # the first sentence); it is discarded by the [1:] slice below.
        sent_lens = list()
        count = 0
        for row in df['sent_token_number']:
            count += 1
            if row == 1:
                sent_lens.append(count)
                count = 0
        # The last sentence never triggers the reset, so take its length from
        # the final token's in-sentence index.
        last_item = df['sent_token_number'].iloc[-1]
        sent_lens.append(last_item)
        av_len = mean(sent_lens[1:])
        token_count = df['doc_token_number'].iloc[-1]
        doc_dict['filename'] = filename
        doc_dict['number of sentences'] = sens
        doc_dict['average sentence length'] = av_len
        doc_dict['number of tokens'] = token_count
        main_list.append(doc_dict)

    # Corpus-level aggregates over the per-file stats.
    stats_dict = dict()
    sent_count = list()
    token_count = list()
    sent_len_count = list()
    for doc_dict in main_list:
        sent_count.append(doc_dict['number of sentences'])
        token_count.append(doc_dict['number of tokens'])
        sent_len_count.append(doc_dict['average sentence length'])
    stats_dict['number of docs'] = len(main_list)
    stats_dict['average number of sentences'] = mean(sent_count)
    stats_dict['average number of tokens'] = mean(token_count)
    stats_dict['average sentence length'] = mean(sent_len_count)
    return main_list, stats_dict