def get_context_for_positive_sents_in_doc(
        full_doc_text, list_of_positive_sents_purportedly_in_doc, doctag):
    """Collect the text preceding each positive sentence within its document.

    The document is cut at the offsets returned by ``get_sentence_split_inds``;
    blank pieces are ignored. Each positive sentence is located with
    ``get_indices_of_sentencematch_in_document``. Its context runs from the
    start of an earlier sentence (chosen via the module-level
    ``num_preceding_sents_to_use_as_context``) up to the start of the match.

    :param full_doc_text: text of the whole document
    :param list_of_positive_sents_purportedly_in_doc: sentences to look up
    :param doctag: passed through unchanged to the sentence matcher
    :return: ``(contexts, num_not_found, num_at_doc_start)``; ``contexts`` has
        one entry per positive sentence, ``' '`` when no match was found
    """
    # Character spans of every non-blank sentence, in document order.
    sentence_spans = []
    span_start = 0
    for span_end in get_sentence_split_inds(full_doc_text):
        if full_doc_text[span_start:span_end].strip():
            sentence_spans.append((span_start, span_end))
        span_start = span_end

    contexts = []
    missing_count = 0
    doc_start_count = 0
    for positive_sentence in list_of_positive_sents_purportedly_in_doc:
        match_span = get_indices_of_sentencematch_in_document(
            full_doc_text, positive_sentence, doctag, False, False, False,
            dont_print_at_all=True)

        if match_span is None:
            # No matching sentence in the document: placeholder context.
            contexts.append(' ')
            missing_count += 1
            continue

        match_start = match_span[0]

        # Latest sentence whose end is at or before the start of the match.
        neighbor = None
        for position in reversed(range(len(sentence_spans))):
            if sentence_spans[position][1] <= match_start:
                neighbor = position
                break

        if neighbor is None:
            # Nothing ends before the match: use whatever precedes it.
            doc_start_count += 1
            contexts.append(full_doc_text[:match_start])
            continue

        # The neighbor sentence appears in full; slicing up to match_start
        # also keeps any partial text between it and the positive sentence.
        first_context_sentence = max(
            0, neighbor - num_preceding_sents_to_use_as_context + 1)
        context_start = sentence_spans[first_context_sentence][0]
        contexts.append(full_doc_text[context_start:match_start])

    return contexts, missing_count, doc_start_count
def get_list_of_sents_in_text(text):
    """Return the non-blank, whitespace-stripped sentences of ``text``.

    Sentence boundaries are the offsets produced by
    ``get_sentence_split_inds``; each piece runs from the previous boundary
    (initially 0) to the next one.
    """
    sentences = []
    previous_boundary = 0
    for boundary in get_sentence_split_inds(text):
        candidate = text[previous_boundary:boundary].strip()
        previous_boundary = boundary
        if candidate:
            sentences.append(candidate)
    return sentences
def main():
    """Load every document, then add context to the pre-split data files.

    Documents are read from ``full_doc_fname`` and, when
    ``adding_context_to_data_for_binary_task`` is set, from the NI_docs text
    files as well. Depending on whether ``source_train_filename`` contains
    ``'multiway'``, the work is handed to ``augment_multiway_data`` or done
    per document with ``add_contexts_for_document`` and written out by
    ``write_new_files``.
    """
    previously_extracted_header = None
    dataframe = pd.read_csv('../justifications_clean_text_ohe.csv')
    """all_fnames_currently_in_data = set(dataframe['img_file_orig'])
    with open('../../OCRdata/NI_docs/negative_filenames_also_in_current_data.txt', 'r') as f:
        for line in f:
            line = line.strip()
            if line != '':
                all_fnames_currently_in_data.add(line)
    for fname in all_fnames_currently_in_data:
        assert '/' not in fname"""

    tags_to_documents = {}
    all_fnames_currently_in_data = set()
    with open(full_doc_fname, 'r', encoding='utf-8-sig') as f:
        # Pull documents until the extractor reports that none are left.
        while True:
            document, tag, previously_extracted_header = \
                extract_and_tag_next_document(
                    f, previously_extracted_header=previously_extracted_header)
            if document is None:
                break
            tags_to_documents[tag] = document
            all_fnames_currently_in_data.add(tag)

    if adding_context_to_data_for_binary_task:
        for fname in glob('../../OCRdata/NI_docs/NI_docs_all/*.txt'):
            with open(fname, 'r') as f:
                # File name without its directory and extension.
                relevant_tag_part = fname[fname.rfind('/') + 1:fname.rfind('.')]
                #if relevant_tag_part not in all_fnames_currently_in_data:
                document = f.read().strip()
                tag = extract_file_image_tag_from_relevant_part_of_header_string(
                    relevant_tag_part)
                is_new_tag = tag not in all_fnames_currently_in_data
                if is_new_tag or tag == ('CJ_4_458', 'IMG_1922'):
                    tags_to_documents[tag] = document

    # insert redirect to multiway here
    if 'multiway' in source_train_filename:
        # load in dataframes
        train_df, dev_df, test_df, _ = read_in_presplit_data(
            source_train_filename, source_dev_filename, source_test_filename,
            None, shuffle_data=False)
        non_text_columns, has_filename_col_already = get_other_colnames(
            train_df)
        assert has_filename_col_already
        augment_multiway_data(train_df, dev_df, test_df, tags_to_documents,
                              non_text_columns, new_train_filename,
                              new_dev_filename, new_test_filename)
        """
        Couldn't find context for 169 training sentences out of 1647
        215 / 1647 training sentences were at document start.
        Couldn't find context for 20 dev sentences out of 208
        27 / 208 dev sentences were at document start.
        Couldn't find context for 25 test sentences out of 208
        31 / 208 test sentences were at document start.
        """
    else:
        # automatically determines whether filename column exists already
        (train_dict, dev_dict, test_dict, non_text_columns,
         has_filename_col_already) = read_in_existing_csv_files(
            source_train_filename, source_dev_filename, source_test_filename)
        list_of_all_datasplit_dicts = [train_dict, dev_dict, test_dict]
        document_text_filename_tuples = [
            (doc, tag[0] + '/' + tag[1])
            for tag, doc in tags_to_documents.items()
        ]

        # for each document:
        #     split its sentences
        #     figure out which data split a document (page) is in
        #     add this document's sentences to a file
        set_of_fnames_done_so_far = set()
        for document_text, document_filename in tqdm(
                document_text_filename_tuples):
            assert document_filename not in set_of_fnames_done_so_far
            set_of_fnames_done_so_far.add(document_filename)

            # Non-blank stripped sentences of this document, in order.
            list_of_sentences = []
            piece_start = 0
            for piece_end in get_sentence_split_inds(document_text):
                piece = document_text[piece_start:piece_end].strip()
                piece_start = piece_end
                if piece:
                    list_of_sentences.append(piece)

            num_non_text_columns = len(non_text_columns)
            (dict_corresponding_to_document,
             ind_of_start_sent_in_original_splitfile,
             ind_of_end_sent_in_original_splitfile) = \
                find_which_split_the_document_got_sorted_into(
                    list_of_sentences, list_of_all_datasplit_dicts,
                    num_non_text_columns, has_filename_col_already,
                    document_filename=document_filename)
            add_contexts_for_document(
                list_of_sentences, dict_corresponding_to_document,
                document_filename, ind_of_start_sent_in_original_splitfile,
                ind_of_end_sent_in_original_splitfile, num_non_text_columns,
                has_filename_col_already)

        write_new_files(train_dict, dev_dict, test_dict, non_text_columns,
                        list_of_all_datasplit_dicts)
def _alignment_failure_message(sentence, positives, pos_cursor, negatives,
                               neg_cursor, position):
    """Build the diagnostic text for a sentence that matched neither list."""
    next_positive = ('END' if pos_cursor >= len(positives)
                     else positives[pos_cursor][0])
    next_negative = ('END' if neg_cursor >= len(negatives)
                     else negatives[neg_cursor])
    headline = '\n'.join(['This should never happen. Next sentences:',
                          sentence, next_positive, next_negative])
    return headline + '\n=======================\n' + \
        str([ps[0] for ps in positives]) + '\n=====================\n' + \
        str(negatives) + '\n=====================\n' + \
        str(position)


def main():
    """Label every sentence of every wanted document and export the result.

    Sentences are split per document, the positive sentences loaded from
    ``positive_sentence_filename`` are located in their documents, and each
    document sentence is paired with its labels (an empty list when it is a
    negative sentence). The mapping is handed to ``make_csv_file`` and
    ``make_excel_file``.
    """
    # get tags to list of all distinct sentence inds
    tags_we_want = get_list_of_tags_we_want()
    tags_to_docs = get_dict_of_tags_we_want_to_docs(tags_we_want)
    tags_to_doc_sentence_inds = {}
    for tag, doc in tags_to_docs.items():
        splitter = (get_sentence_split_inds_spacy
                    if use_spacy_to_split_sents else get_sentence_split_inds)
        tags_to_doc_sentence_inds[tag] = splitter(doc)

    # get tags to list of (positive_sentence_inds, all_labels_for_sentence)
    sentence_rawtag_isproblemfiller_labels = \
        load_in_positive_sentences_with_multilabels(positive_sentence_filename)
    # Drop, in place, every positive sentence whose document we do not have.
    sentence_rawtag_isproblemfiller_labels[:] = [
        entry for entry in sentence_rawtag_isproblemfiller_labels
        if entry[1] in tags_to_doc_sentence_inds
    ]

    log_suffix = ('_spacy' if use_spacy_to_split_sents else '') + '.txt'
    positivesentences_tags, corresponding_indices_in_document = \
        get_corresponding_indices_in_document(
            sentence_rawtag_isproblemfiller_labels, tags_to_docs,
            'problems_writing_excel_sheet' + log_suffix,
            'successes_writing_excel_sheet' + log_suffix,
            skip_positive_sents_we_have_no_doc_for=True)

    tags_to_list_of_positive_sentence_inds_and_labels = {}
    for position, sentence_record in enumerate(positivesentences_tags):
        corr_inds = corresponding_indices_in_document[position]
        if corr_inds is None:
            continue
        tags_to_list_of_positive_sentence_inds_and_labels.setdefault(
            sentence_record[1], []).append((corr_inds, sentence_record[3]))

    tags_to_list_of_foundindoc_positive_sentences_and_labels = {}
    tags_to_list_of_foundindoc_negative_sentences = {}
    for tag, document in tags_to_docs.items():
        positive_inds_in_doc = \
            tags_to_list_of_positive_sentence_inds_and_labels.get(tag, [])
        positive_sentences_and_labels, negative_sentences, _ = \
            get_lists_of_positive_negative_sentences_from_doc_with_all_pos_labels_for_sentence(
                document, positive_inds_in_doc)
        tags_to_list_of_foundindoc_positive_sentences_and_labels[tag] = \
            positive_sentences_and_labels
        tags_to_list_of_foundindoc_negative_sentences[tag] = negative_sentences

    # now get an ordered list of all sentences in doc with all of their
    # corresponding labels (if any)
    tags_to_sentslabels = {}
    for tag, document in tags_to_docs.items():
        split_inds = tags_to_doc_sentence_inds[tag]
        positives = tags_to_list_of_foundindoc_positive_sentences_and_labels[tag]
        negatives = tags_to_list_of_foundindoc_negative_sentences[tag]
        labelled_sentences = []
        pos_cursor = 0
        neg_cursor = 0
        piece_start = 0

        for split_number, piece_end in enumerate(split_inds):
            sentence = document[piece_start:piece_end].strip()
            if sentence == '':
                # piece_start is deliberately left where it is for blanks.
                continue
            if pos_cursor < len(positives) and \
                    sentence == positives[pos_cursor][0].strip():
                labelled_sentences.append(
                    (sentence, positives[pos_cursor][1]))
                pos_cursor += 1
            elif neg_cursor < len(negatives) and \
                    sentence == negatives[neg_cursor].strip():
                labelled_sentences.append((negatives[neg_cursor], []))
                neg_cursor += 1
            else:
                assert False, _alignment_failure_message(
                    sentence, positives, pos_cursor, negatives, neg_cursor,
                    split_number)
            piece_start = piece_end

        # Whatever text remains after the last processed split index.
        sentence = document[piece_start:len(document)].strip()
        if sentence != '':
            if pos_cursor < len(positives) and \
                    sentence == positives[pos_cursor][0].strip():
                labelled_sentences.append(positives[pos_cursor])
                pos_cursor += 1
            elif neg_cursor < len(negatives) and \
                    sentence == negatives[neg_cursor].strip():
                labelled_sentences.append((negatives[neg_cursor], []))
                neg_cursor += 1
            else:
                assert False, _alignment_failure_message(
                    sentence, positives, pos_cursor, negatives, neg_cursor,
                    len(split_inds))

        tags_to_sentslabels[tag] = labelled_sentences

    # now assemble CSV file
    make_csv_file(csv_filename, tags_to_sentslabels)
    make_excel_file(csv_filename[:csv_filename.rfind('.')] + '.xls',
                    tags_to_sentslabels)
def _span_overlaps_positive(span_start, split_ind, positive_span):
    """Tell whether the auto-split span [span_start, split_ind) overlaps a positive span.

    True when the positive span starts inside the auto-split span, ends
    inside it, or strictly contains it.
    """
    pos_start = positive_span[0]
    pos_end = positive_span[1]
    return (span_start <= pos_start < split_ind
            or span_start < pos_end <= split_ind
            # Positive sentence begins before and ends after this span.
            # Without this case the middle pieces of a positive sentence
            # spanning three or more auto-split spans were filed as negative.
            or (pos_start < span_start and pos_end > split_ind))


def get_lists_of_positive_negative_sentences_from_doc_with_all_pos_labels_for_sentence(
        document, list_of_positive_sentence_inds_in_doc):
    """Split ``document`` into sentences and sort them into positive/negative.

    The document is cut at the offsets returned by
    ``get_sentence_split_inds_spacy`` or ``get_sentence_split_inds``
    (selected by the module-level ``use_spacy_to_split_sents``). An
    auto-split span is positive when it overlaps at least one known positive
    sentence, and it collects the labels of every positive sentence it
    overlaps.

    :param document: full document text
    :param list_of_positive_sentence_inds_in_doc: items of the form
        ``((start, end), labels)``; they are sorted by ``start`` here, the
        caller's list is not modified
    :return: a 3-tuple of
        * ``[(sentence, label_set), ...]`` for non-blank positive spans,
        * ``[sentence, ...]`` for non-blank negative spans,
        * for each returned positive span, the document text running from
          the start of the first to the end of the last positive sentence it
          overlaps
    :raises AssertionError: if some positive sentence was never consumed
        while walking the split indices
    """
    if use_spacy_to_split_sents:
        sentence_split_inds = get_sentence_split_inds_spacy(document)
    else:
        sentence_split_inds = get_sentence_split_inds(document)

    positives = sorted(list_of_positive_sentence_inds_in_doc,
                       key=lambda x: x[0][0])

    negative_spans = []
    positive_spans = []
    corresponding_source_positive_sentences = []
    span_start = 0
    cursor = 0  # index of the first positive sentence not yet fully passed
    for split_ind in sentence_split_inds:
        labels_for_span = set()
        first_overlapping = None
        # A while loop because one auto-split span may overlap several
        # positive sentences.
        while cursor < len(positives) and _span_overlaps_positive(
                span_start, split_ind, positives[cursor][0]):
            labels_for_span.update(positives[cursor][1])
            if first_overlapping is None:
                first_overlapping = cursor
            if span_start < positives[cursor][0][1] <= split_ind:
                # This positive sentence ends inside the span: done with it.
                cursor += 1
            else:
                # It continues past this span; later spans must see it too.
                break

        if first_overlapping is not None:
            positive_spans.append(((span_start, split_ind), labels_for_span))
            source_inds = list(range(first_overlapping, cursor))
            # The sentence under the cursor is included as well when it
            # overlaps this span without ending in it.
            if cursor < len(positives) and _span_overlaps_positive(
                    span_start, split_ind, positives[cursor][0]):
                source_inds.append(cursor)
            source_start = positives[source_inds[0]][0][0]
            source_end = positives[source_inds[-1]][0][1]
            corresponding_source_positive_sentences.append(
                document[source_start:source_end])
        else:
            negative_spans.append((span_start, split_ind))
        span_start = split_ind

    assert cursor == len(positives)

    # Drop spans that are blank after stripping, keeping the positive
    # sentences and their source texts aligned.
    positive_sentences_and_labels = []
    positive_sources = []
    for (span, labels), source in zip(positive_spans,
                                      corresponding_source_positive_sentences):
        sentence = document[span[0]:span[1]].strip()
        if sentence:
            positive_sentences_and_labels.append((sentence, labels))
            positive_sources.append(source)

    negative_sentences = []
    for span in negative_spans:
        sentence = document[span[0]:span[1]].strip()
        if sentence:
            negative_sentences.append(sentence)

    return positive_sentences_and_labels, negative_sentences, positive_sources