def get_context_for_positive_sents_in_doc(
        full_doc_text, list_of_positive_sents_purportedly_in_doc, doctag):
    # get the exclusive end index of every sentence in the full doc text
    sentence_split_inds = get_sentence_split_inds(full_doc_text)
    list_of_sentence_ind_tups = []
    list_of_sentences = []
    start_ind = 0
    for split_ind in sentence_split_inds:
        list_of_sentence_ind_tups.append((start_ind, split_ind))
        list_of_sentences.append(full_doc_text[start_ind:split_ind].strip())
        start_ind = split_ind
    # drop empty sentences, iterating backward so deletions don't shift indices
    for i in range(len(list_of_sentence_ind_tups) - 1, -1, -1):
        if len(list_of_sentences[i]) == 0:
            del list_of_sentences[i]
            del list_of_sentence_ind_tups[i]

    list_of_corresponding_contexts = []

    num_we_couldnt_find_context_for = 0
    num_at_start_of_doc = 0
    for positive_sentence in list_of_positive_sents_purportedly_in_doc:
        index_tuple = get_indices_of_sentencematch_in_document(
            full_doc_text,
            positive_sentence,
            doctag,
            False,
            False,
            False,
            dont_print_at_all=True)
        if index_tuple is None:
            # we couldn't find a matching sentence; record a placeholder context
            list_of_corresponding_contexts.append(' ')
            num_we_couldnt_find_context_for += 1
        else:
            # we found a matching sentence, so figure out what its preceding
            # context should be: what is the latest sentence ending that is <=
            # the start index of our positive sentence? (see
            # _demo_context_slicing below for a worked example)

            neighbor_sentence_ind = None
            for tupind in range(len(list_of_sentence_ind_tups) - 1, -1, -1):
                ind_tup = list_of_sentence_ind_tups[tupind]
                if ind_tup[1] <= index_tuple[0]:
                    neighbor_sentence_ind = tupind
                    break
            if neighbor_sentence_ind is None:
                # no full sentence ends before the match, so the positive
                # sentence sits at the start of the document; use everything
                # preceding it as context
                num_at_start_of_doc += 1
                list_of_corresponding_contexts.append(
                    full_doc_text[:index_tuple[0]])
            else:
                # this neighbor sentence is guaranteed to appear in full; include
                # any extra text that's cut off by the start of our positive sentence
                list_of_corresponding_contexts.append(
                    full_doc_text[list_of_sentence_ind_tups[max(
                        0, neighbor_sentence_ind -
                        num_preceding_sents_to_use_as_context +
                        1)][0]:index_tuple[0]])

    return list_of_corresponding_contexts, num_we_couldnt_find_context_for, num_at_start_of_doc
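

# A minimal, self-contained sketch of the context-slicing logic above, with
# hard-coded split indices standing in for get_sentence_split_inds (assumed to
# return the exclusive end offset of each sentence, in document order) and a
# hypothetical window of one preceding sentence. Illustrative only.
def _demo_context_slicing():
    doc = 'First sentence. Second sentence. Third sentence.'
    split_inds = [15, 32, 49]  # hypothetical end offsets, one per sentence
    ind_tups, start = [], 0
    for end in split_inds:
        ind_tups.append((start, end))
        start = end
    positive_start = 33  # start offset of 'Third sentence.'
    num_preceding = 1  # stands in for num_preceding_sents_to_use_as_context
    # latest sentence whose end offset is <= the positive sentence's start
    neighbor = max(i for i, tup in enumerate(ind_tups)
                   if tup[1] <= positive_start)
    context = doc[ind_tups[max(0, neighbor - num_preceding + 1)][0]:
                  positive_start]
    assert context.strip() == 'Second sentence.'
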
def get_list_of_sents_in_text(text):
    sent_split_inds = get_sentence_split_inds(text)
    list_of_sents = []
    start_ind = 0
    for end_ind in sent_split_inds:
        sent_to_add = text[start_ind:end_ind].strip()
        if sent_to_add != '':
            list_of_sents.append(sent_to_add)
        start_ind = end_ind
    return list_of_sents
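

# A hedged usage sketch of the split-strip-filter pattern in
# get_list_of_sents_in_text, with the split indices passed in explicitly so it
# runs without the real get_sentence_split_inds (whose offsets depend on its
# heuristics). Illustrative only.
def _demo_list_of_sents(text, split_inds):
    sents, start = [], 0
    for end in split_inds:
        piece = text[start:end].strip()
        if piece:  # drop whitespace-only segments, as the real function does
            sents.append(piece)
        start = end
    return sents


# e.g. _demo_list_of_sents('First. Second.', [6, 14]) == ['First.', 'Second.']
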
def main():
    previously_extracted_header = None
    dataframe = pd.read_csv('../justifications_clean_text_ohe.csv')
    """all_fnames_currently_in_data = set(dataframe['img_file_orig'])
    with open('../../OCRdata/NI_docs/negative_filenames_also_in_current_data.txt', 'r') as f:
        for line in f:
            line = line.strip()
            if line != '':
                all_fnames_currently_in_data.add(line)
    for fname in all_fnames_currently_in_data:
        assert '/' not in fname"""

    tags_to_documents = {}
    all_fnames_currently_in_data = set()
    with open(full_doc_fname, 'r', encoding='utf-8-sig') as f:
        keep_going = True
        while keep_going:
            document, tag, previously_extracted_header = \
                extract_and_tag_next_document(f, previously_extracted_header=previously_extracted_header)
            if document is None:
                keep_going = False
            else:
                tags_to_documents[tag] = document
                all_fnames_currently_in_data.add(tag)
    if adding_context_to_data_for_binary_task:
        for fname in glob('../../OCRdata/NI_docs/NI_docs_all/*.txt'):
            with open(fname, 'r') as f:
                relevant_tag_part = fname[fname.rfind('/') +
                                          1:fname.rfind('.')]
                #if relevant_tag_part not in all_fnames_currently_in_data:
                # read the whole file as a single document string
                document = f.read().strip()
                tag = extract_file_image_tag_from_relevant_part_of_header_string(
                    relevant_tag_part)
                # keep documents not already in the data, plus one special-cased tag
                if tag not in all_fnames_currently_in_data or tag == (
                        'CJ_4_458', 'IMG_1922'):
                    tags_to_documents[tag] = document

    # redirect to the multiway pipeline if the source data is multiway
    if 'multiway' in source_train_filename:
        # load in dataframes
        train_df, dev_df, test_df, _ = read_in_presplit_data(
            source_train_filename,
            source_dev_filename,
            source_test_filename,
            None,
            shuffle_data=False)

        non_text_columns, has_filename_col_already = get_other_colnames(
            train_df)
        assert has_filename_col_already
        augment_multiway_data(train_df, dev_df, test_df, tags_to_documents,
                              non_text_columns, new_train_filename,
                              new_dev_filename, new_test_filename)
        """
        Couldn't find context for 169 training sentences out of 1647
        215 / 1647 training sentences were at document start.
        Couldn't find context for 20 dev sentences out of 208
        27 / 208 dev sentences were at document start.
        Couldn't find context for 25 test sentences out of 208
        31 / 208 test sentences were at document start.
        """
    else:
        # automatically determines whether filename column exists already
        train_dict, dev_dict, test_dict, non_text_columns, has_filename_col_already = \
            read_in_existing_csv_files(source_train_filename, source_dev_filename, source_test_filename)
        list_of_all_datasplit_dicts = [train_dict, dev_dict, test_dict]

        document_text_filename_tuples = [
            (doc, tag[0] + '/' + tag[1])
            for tag, doc in tags_to_documents.items()
        ]

        # for each document:
        #     split it into sentences
        #     figure out which data split the document (page) landed in
        #     attach context to this document's sentences in that split's dict
        set_of_fnames_done_so_far = set()
        for document_tuple in tqdm(document_text_filename_tuples):
            document_text = document_tuple[0]
            document_filename = document_tuple[1]
            assert document_filename not in set_of_fnames_done_so_far
            set_of_fnames_done_so_far.add(document_filename)
            sentence_split_inds = get_sentence_split_inds(document_text)
            list_of_sentences = []
            start_ind = 0
            for split_ind in sentence_split_inds:
                list_of_sentences.append(
                    document_text[start_ind:split_ind].strip())
                start_ind = split_ind
            # drop empty sentences, iterating backward so deletions don't
            # shift indices
            for i in range(len(list_of_sentences) - 1, -1, -1):
                if len(list_of_sentences[i]) == 0:
                    del list_of_sentences[i]

            (dict_corresponding_to_document, ind_of_start_sent_in_original_splitfile,
             ind_of_end_sent_in_original_splitfile) = \
                find_which_split_the_document_got_sorted_into(list_of_sentences, list_of_all_datasplit_dicts,
                                                              len(non_text_columns), has_filename_col_already,
                                                              document_filename=document_filename)
            add_contexts_for_document(list_of_sentences,
                                      dict_corresponding_to_document,
                                      document_filename,
                                      ind_of_start_sent_in_original_splitfile,
                                      ind_of_end_sent_in_original_splitfile,
                                      len(non_text_columns),
                                      has_filename_col_already)

        write_new_files(train_dict, dev_dict, test_dict, non_text_columns,
                        list_of_all_datasplit_dicts)
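

# Tags are (document_id, image_id) tuples such as ('CJ_4_458', 'IMG_1922');
# joining the two parts with '/' yields the filename key used to match rows in
# the data splits. A quick sketch of that mapping, with hypothetical document
# text:
def _demo_tag_to_filename_key():
    tags_to_documents = {('CJ_4_458', 'IMG_1922'): 'ocr text ...'}
    pairs = [(doc, tag[0] + '/' + tag[1])
             for tag, doc in tags_to_documents.items()]
    assert pairs == [('ocr text ...', 'CJ_4_458/IMG_1922')]
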
def main():
    # map each tag to its document's sentence split indices
    tags_we_want = get_list_of_tags_we_want()
    tags_to_docs = get_dict_of_tags_we_want_to_docs(tags_we_want)
    tags_to_doc_sentence_inds = {}
    for tag, doc in tags_to_docs.items():
        if use_spacy_to_split_sents:
            tags_to_doc_sentence_inds[tag] = get_sentence_split_inds_spacy(doc)
        else:
            tags_to_doc_sentence_inds[tag] = get_sentence_split_inds(doc)

    # get tags to list of (positive_sentence_inds, all_labels_for_sentence)
    sentence_rawtag_isproblemfiller_labels = load_in_positive_sentences_with_multilabels(
        positive_sentence_filename)
    # drop labeled sentences whose document we don't have, iterating backward
    # so deletions don't shift indices
    for i in range(len(sentence_rawtag_isproblemfiller_labels) - 1, -1, -1):
        if sentence_rawtag_isproblemfiller_labels[i][
                1] not in tags_to_doc_sentence_inds:
            del sentence_rawtag_isproblemfiller_labels[i]
    positivesentences_tags, corresponding_indices_in_document = \
        get_corresponding_indices_in_document(sentence_rawtag_isproblemfiller_labels, tags_to_docs,
                                              'problems_writing_excel_sheet' +
                                              ('_spacy' if use_spacy_to_split_sents else '') + '.txt',
                                              'successes_writing_excel_sheet' +
                                              ('_spacy' if use_spacy_to_split_sents else '') + '.txt',
                                              skip_positive_sents_we_have_no_doc_for=True)
    tags_to_list_of_positive_sentence_inds_and_labels = {}
    for i in range(len(positivesentences_tags)):
        tag = positivesentences_tags[i][1]
        corr_labels = positivesentences_tags[i][3]
        corr_inds = corresponding_indices_in_document[i]
        if corr_inds is not None:
            if tag in tags_to_list_of_positive_sentence_inds_and_labels:
                tags_to_list_of_positive_sentence_inds_and_labels[tag].append(
                    (corr_inds, corr_labels))
            else:
                tags_to_list_of_positive_sentence_inds_and_labels[tag] = [
                    (corr_inds, corr_labels)
                ]

    tags_to_list_of_foundindoc_positive_sentences_and_labels = {}
    tags_to_list_of_foundindoc_negative_sentences = {}
    for tag in tags_to_docs.keys():
        document = tags_to_docs[tag]
        if tag in tags_to_list_of_positive_sentence_inds_and_labels:
            list_of_positive_sentence_inds_in_doc = tags_to_list_of_positive_sentence_inds_and_labels[
                tag]
        else:
            list_of_positive_sentence_inds_in_doc = []
        positive_sentences_and_labels, negative_sentences, _ = \
            get_lists_of_positive_negative_sentences_from_doc_with_all_pos_labels_for_sentence(
                document, list_of_positive_sentence_inds_in_doc)
        tags_to_list_of_foundindoc_positive_sentences_and_labels[
            tag] = positive_sentences_and_labels
        tags_to_list_of_foundindoc_negative_sentences[tag] = negative_sentences

    # now get an ordered list of all sentences in doc with all of their corresponding labels (if any)
    tags_to_sentslabels = {}
    for tag in tags_to_docs:
        list_of_sentencelabels_tuples = []
        document = tags_to_docs[tag]
        ordered_sentence_inds = tags_to_doc_sentence_inds[tag]
        ordered_positive_sents_and_labels = tags_to_list_of_foundindoc_positive_sentences_and_labels[
            tag]
        ordered_negative_sents = tags_to_list_of_foundindoc_negative_sentences[
            tag]

        cur_pos_ind = 0
        cur_neg_ind = 0
        sent_start_ind = 0
        for ind_ind, ind in enumerate(ordered_sentence_inds):
            sent_end_ind = ind
            cur_sentence = document[sent_start_ind:sent_end_ind].strip()
            if cur_sentence == '':
                # whitespace-only span; it gets stripped from the start of the
                # next span anyway, so sent_start_ind needn't advance here
                continue
            if cur_pos_ind < len(ordered_positive_sents_and_labels) and \
                    cur_sentence == ordered_positive_sents_and_labels[cur_pos_ind][0].strip():
                list_of_sentencelabels_tuples.append(
                    (cur_sentence,
                     ordered_positive_sents_and_labels[cur_pos_ind][1]))
                cur_pos_ind += 1
            elif cur_neg_ind < len(ordered_negative_sents) and \
                    cur_sentence == ordered_negative_sents[cur_neg_ind].strip():
                list_of_sentencelabels_tuples.append(
                    (ordered_negative_sents[cur_neg_ind], []))
                cur_neg_ind += 1
            else:
                assert False, '\n'.join(['This should never happen. Next sentences:',
                                         cur_sentence,
                                         ('END' if cur_pos_ind >= len(ordered_positive_sents_and_labels) else
                                          ordered_positive_sents_and_labels[cur_pos_ind][0]),
                                         ('END' if cur_neg_ind >= len(ordered_negative_sents) else
                                          ordered_negative_sents[cur_neg_ind])
                                         ]) + '\n=======================\n' + \
                    str([ps[0] for ps in ordered_positive_sents_and_labels]) + '\n=====================\n' + \
                    str(ordered_negative_sents) + '\n=====================\n' + str(ind_ind)
            sent_start_ind = sent_end_ind
        sent_end_ind = len(document)
        cur_sentence = document[sent_start_ind:sent_end_ind].strip()
        if cur_sentence != '':
            if cur_pos_ind < len(ordered_positive_sents_and_labels) and cur_sentence == \
                    ordered_positive_sents_and_labels[cur_pos_ind][0].strip():
                # append the stripped sentence with its labels, matching the loop above
                list_of_sentencelabels_tuples.append(
                    (cur_sentence,
                     ordered_positive_sents_and_labels[cur_pos_ind][1]))
                cur_pos_ind += 1
            elif cur_neg_ind < len(ordered_negative_sents) and \
                    cur_sentence == ordered_negative_sents[cur_neg_ind].strip():
                list_of_sentencelabels_tuples.append(
                    (ordered_negative_sents[cur_neg_ind], []))
                cur_neg_ind += 1
            else:
                assert False, '\n'.join(['This should never happen. Next sentences:',
                                         cur_sentence,
                                         ('END' if cur_pos_ind >= len(ordered_positive_sents_and_labels) else
                                          ordered_positive_sents_and_labels[cur_pos_ind][0]),
                                         ('END' if cur_neg_ind >= len(ordered_negative_sents) else
                                          ordered_negative_sents[cur_neg_ind])
                                         ]) + '\n=======================\n' + \
                              str([ps[0] for ps in ordered_positive_sents_and_labels]) + '\n=====================\n' + \
                              str(ordered_negative_sents) + '\n=====================\n' + \
                              str(len(ordered_sentence_inds))

        tags_to_sentslabels[tag] = list_of_sentencelabels_tuples

    # now assemble the CSV and Excel output files
    make_csv_file(csv_filename, tags_to_sentslabels)
    make_excel_file(csv_filename[:csv_filename.rfind('.')] + '.xls',
                    tags_to_sentslabels)
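

# The structure handed to make_csv_file / make_excel_file above maps each tag
# to the document's sentences in order, each paired with its label list (empty
# for negative sentences). A hypothetical entry, for illustration only; the
# real labels come from load_in_positive_sentences_with_multilabels:
def _demo_sentslabels_shape():
    tags_to_sentslabels = {
        ('CJ_4_458', 'IMG_1922'): [
            ('A positive sentence.', ['some_label']),
            ('A negative sentence.', []),  # negatives carry an empty list
        ]
    }
    for sents in tags_to_sentslabels.values():
        assert all(isinstance(labels, list) for _, labels in sents)
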
def get_lists_of_positive_negative_sentences_from_doc_with_all_pos_labels_for_sentence(
        document, list_of_positive_sentence_inds_in_doc):
    if use_spacy_to_split_sents:
        sentence_split_inds = get_sentence_split_inds_spacy(document)
    else:
        sentence_split_inds = get_sentence_split_inds(document)
    list_of_positive_sentence_inds_in_doc = sorted(
        list_of_positive_sentence_inds_in_doc, key=lambda x: x[0][0])

    negative_spans = []
    positive_spans = []
    corresponding_source_positive_sentences = []
    span_start = 0
    cur_positive_sentence_ind = 0
    for split_ind in sentence_split_inds:
        overlaps_with_positive_sentence = False
        all_relevant_positive_labels = set()
        positive_sentence_overlap_start_ind = None
        while cur_positive_sentence_ind < len(list_of_positive_sentence_inds_in_doc) and \
                ((span_start <= list_of_positive_sentence_inds_in_doc[cur_positive_sentence_ind][0][0] < split_ind) or
                 (span_start < list_of_positive_sentence_inds_in_doc[cur_positive_sentence_ind][0][1] <= split_ind)):
            # this auto-split "sentence" overlaps with a positive one, so it's
            # positive. this is a while loop because it might overlap with
            # multiple positive sentences. (the overlap test is illustrated in
            # _demo_overlap_predicate below)
            overlaps_with_positive_sentence = True
            for label in list_of_positive_sentence_inds_in_doc[
                    cur_positive_sentence_ind][1]:
                all_relevant_positive_labels.add(label)
            if positive_sentence_overlap_start_ind is None:
                positive_sentence_overlap_start_ind = cur_positive_sentence_ind
            if span_start < list_of_positive_sentence_inds_in_doc[
                    cur_positive_sentence_ind][0][1] <= split_ind:
                cur_positive_sentence_ind += 1
            else:
                break
        if overlaps_with_positive_sentence:
            positive_spans.append(
                ((span_start, split_ind), all_relevant_positive_labels))
            source_positive_sentences_to_log = list(
                range(positive_sentence_overlap_start_ind,
                      cur_positive_sentence_ind))
            # now decide whether to add cur_positive_sentence_ind to that list as an overlapping sentence
            if cur_positive_sentence_ind < len(list_of_positive_sentence_inds_in_doc) and \
                    ((span_start <= list_of_positive_sentence_inds_in_doc[cur_positive_sentence_ind][0][0] < split_ind) or
                     (span_start < list_of_positive_sentence_inds_in_doc[cur_positive_sentence_ind][0][1] <= split_ind)):
                source_positive_sentences_to_log.append(
                    cur_positive_sentence_ind)
            corresponding_source_positive_sentences.append(
                document[list_of_positive_sentence_inds_in_doc[
                    source_positive_sentences_to_log[0]][0][0]:
                         list_of_positive_sentence_inds_in_doc[
                             source_positive_sentences_to_log[-1]][0][1]])
        else:
            negative_spans.append((span_start, split_ind))
        span_start = split_ind
    # by now every positive sentence should have been matched to some span
    assert cur_positive_sentence_ind == len(
        list_of_positive_sentence_inds_in_doc)
    positive_sentences = list(
        zip(
            [
                document[span[0][0]:span[0][1]].strip()
                for span in positive_spans
            ],
            [span[1] for span in positive_spans],  # these are the label lists
            corresponding_source_positive_sentences))
    negative_sentences = [
        document[span[0]:span[1]].strip() for span in negative_spans
    ]
    for i in range(len(positive_sentences) - 1, -1, -1):
        if len(positive_sentences[i][0]) == 0:
            del positive_sentences[i]
    for i in range(len(negative_sentences) - 1, -1, -1):
        if len(negative_sentences[i]) == 0:
            del negative_sentences[i]
    return [(positive_sentence[0], positive_sentence[1]) for positive_sentence in positive_sentences], \
           negative_sentences, \
           [positive_sentence[2] for positive_sentence in positive_sentences]
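

# A worked sketch of the overlap test used twice above: an auto-split span
# [span_start, split_ind) is treated as positive when a positive sentence's
# start falls inside it, or when the positive sentence's end falls inside
# (span_start, split_ind]. Hypothetical offsets; illustrative only.
def _demo_overlap_predicate():
    def overlaps(span_start, split_ind, pos_start, pos_end):
        return (span_start <= pos_start < split_ind) or \
               (span_start < pos_end <= split_ind)

    assert overlaps(0, 10, 5, 15)        # positive sentence starts in span
    assert overlaps(10, 20, 5, 15)       # positive sentence ends in span
    assert not overlaps(0, 10, 10, 20)   # merely adjacent spans don't overlap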