def read_in_existing_csv_files(train_fname, dev_fname, test_fname):
    train_df, dev_df, test_df, _ = read_in_presplit_data(train_fname, dev_fname, test_fname, None,
                                                         shuffle_data=False)
    train_dict = {}
    dev_dict = {}
    test_dict = {}
    non_text_columns, has_filename_col_already = get_other_colnames(train_df)
    # sanity-check that dev and test have the same column layout as train
    for df in (dev_df, test_df):
        for i, col_name in enumerate(df.columns):
            col_name = str(col_name)
            assert col_name in ('text', 'filename', 'contextbefore') or \
                col_name == non_text_columns[i - 1]

    def populate_dict_with_data(df, dict_to_populate):
        # map each sentence's text to every row where it occurs, since the
        # same sentence can show up on more than one page
        for i, row in df.iterrows():
            text_from_row = row['text']
            all_other_parts_of_row = tuple([row[colname] for colname in non_text_columns])
            occurrence = [i, all_other_parts_of_row]
            if has_filename_col_already:
                occurrence.append(row['filename'])
            dict_to_populate.setdefault(text_from_row, []).append(occurrence)

    populate_dict_with_data(train_df, train_dict)
    populate_dict_with_data(dev_df, dev_dict)
    populate_dict_with_data(test_df, test_dict)
    return train_dict, dev_dict, test_dict, non_text_columns, has_filename_col_already
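# Illustrative sketch (not part of the original pipeline; the csv names below
# are hypothetical): the dicts returned by read_in_existing_csv_files map each
# sentence's text to a list of occurrences, one per row in which it appeared.
def _demo_split_dict_shape():
    train_dict, _, _, non_text_columns, has_filename_col = \
        read_in_existing_csv_files('binary_train.csv', 'binary_dev.csv', 'binary_test.csv')
    some_sentence = next(iter(train_dict))
    for occurrence in train_dict[some_sentence]:
        # occurrence is [row_index, tuple_of_non_text_column_values], plus the
        # source filename when the csvs already carry a filename column
        row_index, other_columns = occurrence[0], occurrence[1]
        print(row_index, dict(zip(non_text_columns, other_columns)))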
import sys
sys.path.append('..')
from prep_data import read_in_presplit_data
from math import isnan, inf

base_data_filename = '../data/binary_'
percentile_of_perplexities_to_keep = 85
percentile_of_perplexities_to_keep = percentile_of_perplexities_to_keep / 100  # as a fraction

train_file = base_data_filename + 'train-withperplexities.csv'
dev_file = base_data_filename + 'dev-withperplexities.csv'
test_file = base_data_filename + 'test-withperplexities.csv'
label_key_filename = base_data_filename + 'classes.txt'

train_df, dev_df, test_df, num_labels = read_in_presplit_data(
    train_file, dev_file, test_file, label_key_filename)

# pool the perplexities from all three splits and sort them, keying NaNs to
# inf so they land at the end of the list, above any finite cutoff
full_list_of_perplexities = train_df['perplexity'].tolist() + \
                            dev_df['perplexity'].tolist() + \
                            test_df['perplexity'].tolist()
full_list_of_perplexities = [float(val) for val in full_list_of_perplexities]
full_list_of_perplexities = sorted(full_list_of_perplexities,
                                   key=lambda x: inf if isnan(x) else x)
num_nans = sum(1 for val in full_list_of_perplexities if isnan(val))
print('Num NaNs: ' + str(num_nans))
print('Quick check that sorting worked:')
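# Hedged sketch of the check announced by the print above (the original code
# for it is not shown): finite values must be non-decreasing, and NaNs, keyed
# to inf, can only appear at the tail of the sorted list.
for earlier, later in zip(full_list_of_perplexities, full_list_of_perplexities[1:]):
    if not isnan(later):
        assert earlier <= later, str(earlier) + ' > ' + str(later)
print(full_list_of_perplexities[:3], '...', full_list_of_perplexities[-3:])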
        self.cuda_device = cuda_device

    def __len__(self):
        # a single-example dataset: iterating it yields exactly one item
        return 1

    def __getitem__(self, item):
        return torch.tensor(self.single_example[0], dtype=torch.long)


if __name__ == '__main__':
    # run on GPU 0 when available, otherwise fall back to CPU
    if torch.cuda.is_available():
        cuda_device = 0
    else:
        cuda_device = -1
    train_df, dev_df, test_df, num_labels = \
        read_in_presplit_data(train_filename, dev_filename, test_filename, label_key_filename)

    dev_perplexities = \
        get_gpt2_perplexity_for_every_sentence(dev_df, 'dev_sentence_perplexities.tsv',
                                               cuda_device=cuda_device)
    dev_df['perplexity'] = dev_perplexities
    new_dev_filename = dev_filename[:dev_filename.rfind('.')] + '-withperplexities' + \
        dev_filename[dev_filename.rfind('.'):]
    dev_df.to_csv(new_dev_filename, index=False)

    test_perplexities = \
        get_gpt2_perplexity_for_every_sentence(test_df, 'test_sentence_perplexities.tsv',
                                               cuda_device=cuda_device)
    test_df['perplexity'] = test_perplexities
    new_test_filename = test_filename[:test_filename.rfind('.')] + '-withperplexities' + \
        test_filename[test_filename.rfind('.'):]
    test_df.to_csv(new_test_filename, index=False)

    # the training split mirrors the dev/test steps above (the tsv filename
    # here is assumed to follow the same dev_/test_ naming pattern)
    training_perplexities = \
        get_gpt2_perplexity_for_every_sentence(train_df, 'train_sentence_perplexities.tsv',
                                               cuda_device=cuda_device)
    train_df['perplexity'] = training_perplexities
    new_train_filename = train_filename[:train_filename.rfind('.')] + '-withperplexities' + \
        train_filename[train_filename.rfind('.'):]
    train_df.to_csv(new_train_filename, index=False)
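# A minimal, self-contained sketch of the single-example Dataset pattern whose
# tail appears above; the class name and token ids here are hypothetical, not
# from the source.
from torch.utils.data import DataLoader, Dataset

class _SingleExampleDataset(Dataset):
    def __init__(self, token_ids):
        self.single_example = [token_ids]

    def __len__(self):
        return 1

    def __getitem__(self, item):
        return torch.tensor(self.single_example[0], dtype=torch.long)

# a DataLoader over it yields exactly one batch of shape (1, num_tokens):
# next(iter(DataLoader(_SingleExampleDataset([464, 3290, 318]), batch_size=1)))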
####################################################
train_fname = filename_stub + 'train.csv'
dev_fname = filename_stub + 'dev.csv'
test_fname = filename_stub + 'test.csv'
classkey_fname = filename_stub + 'classes.txt'
new_train_fname = new_filename_stub + 'train.csv'
new_dev_fname = new_filename_stub + 'dev.csv'
new_test_fname = new_filename_stub + 'test.csv'
new_classkey_fname = new_filename_stub + 'classes.txt'

train_df, dev_df, test_df, num_labels = \
    read_in_presplit_data(train_fname, dev_fname, test_fname, classkey_fname)


def change_label_of_df_to_new_labels(df):
    list_of_examples = []
    for i, row in df.iterrows():
        oldstrlabel = str(row.loc['strlabel'])
        label = oldlabel_to_newlabel_dict[oldstrlabel]
        text = str(row.loc['text'])
        list_of_examples.append((text, label))
    return list_of_examples


train_list = change_label_of_df_to_new_labels(train_df)
dev_list = change_label_of_df_to_new_labels(dev_df)
test_list = change_label_of_df_to_new_labels(test_df)
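# A self-contained sketch of the relabeling contract (the mapping and data
# below are hypothetical): every 'strlabel' in the csvs must be a key of
# oldlabel_to_newlabel_dict, otherwise the lookup above raises a KeyError.
def _demo_relabel():
    import pandas as pd
    demo_map = {'old_label_a': 'new_label_1', 'old_label_b': 'new_label_1'}
    demo_df = pd.DataFrame({'text': ['some sentence'], 'strlabel': ['old_label_a']})
    return [(str(row['text']), demo_map[str(row['strlabel'])])
            for _, row in demo_df.iterrows()]  # -> [('some sentence', 'new_label_1')]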
def main():
    previously_extracted_header = None
    # currently unused below; retained from an earlier filename sanity check
    dataframe = pd.read_csv('../justifications_clean_text_ohe.csv')

    # read every (tag, document) pair out of the full-document file
    tags_to_documents = {}
    all_fnames_currently_in_data = set()
    with open(full_doc_fname, 'r', encoding='utf-8-sig') as f:
        keep_going = True
        while keep_going:
            document, tag, previously_extracted_header = \
                extract_and_tag_next_document(f, previously_extracted_header=previously_extracted_header)
            if document is None:
                keep_going = False
            else:
                tags_to_documents[tag] = document
                all_fnames_currently_in_data.add(tag)

    if adding_context_to_data_for_binary_task:
        # pull in any OCR'd documents that weren't in the full-document file
        # (plus one special-cased page that is reloaded even if present)
        for fname in glob('../../OCRdata/NI_docs/NI_docs_all/*.txt'):
            with open(fname, 'r') as f:
                relevant_tag_part = fname[fname.rfind('/') + 1:fname.rfind('.')]
                document = f.read().strip()
                tag = extract_file_image_tag_from_relevant_part_of_header_string(relevant_tag_part)
                if tag not in all_fnames_currently_in_data or tag == ('CJ_4_458', 'IMG_1922'):
                    tags_to_documents[tag] = document

    if 'multiway' in source_train_filename:
        # the multiway data already carries a filename column, so it can be
        # augmented with context directly
        train_df, dev_df, test_df, _ = read_in_presplit_data(
            source_train_filename, source_dev_filename, source_test_filename, None,
            shuffle_data=False)
        non_text_columns, has_filename_col_already = get_other_colnames(train_df)
        assert has_filename_col_already
        augment_multiway_data(train_df, dev_df, test_df, tags_to_documents, non_text_columns,
                              new_train_filename, new_dev_filename, new_test_filename)
        # Observed counts from a previous run:
        #   Couldn't find context for 169 training sentences out of 1647
        #   215 / 1647 training sentences were at document start.
        #   Couldn't find context for 20 dev sentences out of 208
        #   27 / 208 dev sentences were at document start.
        #   Couldn't find context for 25 test sentences out of 208
        #   31 / 208 test sentences were at document start.
""" else: # automatically determines whether filename column exists already train_dict, dev_dict, test_dict, non_text_columns, has_filename_col_already = \ read_in_existing_csv_files(source_train_filename, source_dev_filename, source_test_filename) list_of_all_datasplit_dicts = [train_dict, dev_dict, test_dict] document_text_filename_tuples = [ (doc, tag[0] + '/' + tag[1]) for tag, doc in tags_to_documents.items() ] # for each document: # split its sentences # figure out which data split a document (page) is in # add this document's sentences to a file set_of_fnames_done_so_far = set() for document_tuple in tqdm(document_text_filename_tuples): document_text = document_tuple[0] document_filename = document_tuple[1] assert document_filename not in set_of_fnames_done_so_far set_of_fnames_done_so_far.add(document_filename) sentence_split_inds = get_sentence_split_inds(document_text) list_of_sentences = [] start_ind = 0 for split_ind in sentence_split_inds: list_of_sentences.append( document_text[start_ind:split_ind].strip()) start_ind = split_ind for i in range(len(list_of_sentences) - 1, -1, -1): if len(list_of_sentences[i]) == 0: del list_of_sentences[i] (dict_corresponding_to_document, ind_of_start_sent_in_original_splitfile, ind_of_end_sent_in_original_splitfile) = \ find_which_split_the_document_got_sorted_into(list_of_sentences, list_of_all_datasplit_dicts, len(non_text_columns), has_filename_col_already, document_filename=document_filename) add_contexts_for_document(list_of_sentences, dict_corresponding_to_document, document_filename, ind_of_start_sent_in_original_splitfile, ind_of_end_sent_in_original_splitfile, len(non_text_columns), has_filename_col_already) write_new_files(train_dict, dev_dict, test_dict, non_text_columns, list_of_all_datasplit_dicts)