def read_in_existing_csv_files(train_fname, dev_fname, test_fname):
    train_df, dev_df, test_df, _ = read_in_presplit_data(train_fname,
                                                         dev_fname,
                                                         test_fname,
                                                         None,
                                                         shuffle_data=False)
    """print(train_df.columns)
    for i, row in train_df.iterrows():
        print(str(row['contextbefore']) + '\t' + str(row['text']))
        print('\n')
        if i == 20:
            quit()"""
    train_dict = {}
    dev_dict = {}
    test_dict = {}
    non_text_columns, has_filename_col_already = get_other_colnames(train_df)
    # Sanity check: dev and test must share train's column layout
    # ('text'/'filename'/'contextbefore', with the remaining columns matching
    # non_text_columns in order).
    for df_to_check in (dev_df, test_df):
        for i, col_name in enumerate(df_to_check.columns):
            col_name = str(col_name)
            assert col_name in ('text', 'filename', 'contextbefore') or \
                   col_name == non_text_columns[i - 1]

    def populate_dict_with_data(df,
                                dict_to_populate,
                                is_train_for_debugging=False):
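        # Maps each sentence's text to a list of entries, one per occurrence:
        # [row_index, tuple_of_other_column_values], plus the filename when a
        # 'filename' column is present, so duplicate sentences are preserved.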
        if is_train_for_debugging:
            print('\n\n')

        for i, row in df.iterrows():
            text_from_row = row['text']
            all_other_parts_of_row = tuple(
                row[colname] for colname in non_text_columns)
            entry = [i, all_other_parts_of_row]
            if has_filename_col_already:
                entry.append(row['filename'])
            dict_to_populate.setdefault(text_from_row, []).append(entry)

    populate_dict_with_data(train_df, train_dict, is_train_for_debugging=False)
    populate_dict_with_data(dev_df, dev_dict)
    populate_dict_with_data(test_df, test_dict)

    return train_dict, dev_dict, test_dict, non_text_columns, has_filename_col_already
import sys
sys.path.append('..')
from prep_data import read_in_presplit_data
from math import isnan, inf

base_data_filename = '../data/binary_'
percentile_of_perplexities_to_keep = 85

percentile_of_perplexities_to_keep = percentile_of_perplexities_to_keep / 100
train_file = base_data_filename + 'train-withperplexities.csv'
dev_file = base_data_filename + 'dev-withperplexities.csv'
test_file = base_data_filename + 'test-withperplexities.csv'
label_key_filename = base_data_filename + 'classes.txt'

train_df, dev_df, test_df, num_labels = read_in_presplit_data(
    train_file, dev_file, test_file, label_key_filename)


full_list_of_perplexities = train_df['perplexity'].tolist() + \
                            dev_df['perplexity'].tolist() + \
                            test_df['perplexity'].tolist()
full_list_of_perplexities = [float(val) for val in full_list_of_perplexities]
full_list_of_perplexities = sorted(full_list_of_perplexities,
                                   key=lambda x: inf if isnan(x) else x)
num_nans = 0
for val in full_list_of_perplexities:
    if isnan(val):
        num_nans += 1
print('Num NaNs: ' + str(num_nans))

print('Quick check that sorting worked:')
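# The original "quick check" is cut off at this point in the example. As a
# hedged illustration only (cutoff_ind and perplexity_cutoff are hypothetical
# names, not necessarily those used in the original script), a sortedness
# check and the percentile cutoff could look like:
num_non_nan = len(full_list_of_perplexities) - num_nans
assert all(full_list_of_perplexities[j] <= full_list_of_perplexities[j + 1]
           for j in range(num_non_nan - 1)), 'perplexities are not sorted'
cutoff_ind = int(percentile_of_perplexities_to_keep *
                 len(full_list_of_perplexities))
perplexity_cutoff = full_list_of_perplexities[cutoff_ind]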
Example #3
        self.cuda_device = cuda_device

    def __len__(self):
        return 1

    def __getitem__(self, item):
        return torch.tensor(self.single_example[0], dtype=torch.long)


if __name__ == '__main__':
    if torch.cuda.is_available():
        cuda_device = 0
    else:
        cuda_device = -1
    train_df, dev_df, test_df, num_labels = \
        read_in_presplit_data(train_filename, dev_filename, test_filename,
                              label_key_filename)
    dev_perplexities = \
        get_gpt2_perplexity_for_every_sentence(dev_df, 'dev_sentence_perplexities.tsv', cuda_device=cuda_device)
    dev_df['perplexity'] = dev_perplexities
    new_dev_filename = dev_filename[:dev_filename.rfind('.')] + '-withperplexities' + \
                       dev_filename[dev_filename.rfind('.'):]
    dev_df.to_csv(new_dev_filename, index=False)

    test_perplexities = \
        get_gpt2_perplexity_for_every_sentence(test_df, 'test_sentence_perplexities.tsv', cuda_device=cuda_device)
    test_df['perplexity'] = test_perplexities
    new_test_filename = test_filename[:test_filename.rfind('.')] + '-withperplexities' + \
                        test_filename[test_filename.rfind('.'):]
    test_df.to_csv(new_test_filename, index=False)

    training_perplexities = \
Example #4
####################################################

train_fname = filename_stub + 'train.csv'
dev_fname = filename_stub + 'dev.csv'
test_fname = filename_stub + 'test.csv'
classkey_fname = filename_stub + 'classes.txt'

new_train_fname = new_filename_stub + 'train.csv'
new_dev_fname = new_filename_stub + 'dev.csv'
new_test_fname = new_filename_stub + 'test.csv'
new_classkey_fname = new_filename_stub + 'classes.txt'


train_df, dev_df, test_df, num_labels = \
    read_in_presplit_data(train_fname, dev_fname, test_fname, classkey_fname)


def change_label_of_df_to_new_labels(df):
    list_of_examples = []
    for i, row in df.iterrows():
        oldstrlabel = str(row.loc['strlabel'])
        label = oldlabel_to_newlabel_dict[oldstrlabel]
        text = str(row.loc['text'])
        list_of_examples.append((text, label))
    return list_of_examples


train_list = change_label_of_df_to_new_labels(train_df)
dev_list = change_label_of_df_to_new_labels(dev_df)
test_list = change_label_of_df_to_new_labels(test_df)
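
# Note: oldlabel_to_newlabel_dict is defined earlier in the original script and
# maps each old string label onto its replacement. A purely hypothetical sketch
# of its shape (the real label names are not shown in this excerpt):
#     oldlabel_to_newlabel_dict = {'old_label_a': 'new_label_a'}
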
def main():
    previously_extracted_header = None
    dataframe = pd.read_csv('../justifications_clean_text_ohe.csv')
    """all_fnames_currently_in_data = set(dataframe['img_file_orig'])
    with open('../../OCRdata/NI_docs/negative_filenames_also_in_current_data.txt', 'r') as f:
        for line in f:
            line = line.strip()
            if line != '':
                all_fnames_currently_in_data.add(line)
    for fname in all_fnames_currently_in_data:
        assert '/' not in fname"""

    tags_to_documents = {}
    all_fnames_currently_in_data = set()
    with open(full_doc_fname, 'r', encoding='utf-8-sig') as f:
        keep_going = True
        while keep_going:
            document, tag, previously_extracted_header = \
                extract_and_tag_next_document(f, previously_extracted_header=previously_extracted_header)
            if document is None:
                keep_going = False
            else:
                tags_to_documents[tag] = document
                all_fnames_currently_in_data.add(tag)
    if adding_context_to_data_for_binary_task:
        for fname in glob('../../OCRdata/NI_docs/NI_docs_all/*.txt'):
            with open(fname, 'r') as f:
                relevant_tag_part = fname[fname.rfind('/') +
                                          1:fname.rfind('.')]
                #if relevant_tag_part not in all_fnames_currently_in_data:
                document = f.read().strip()
                tag = extract_file_image_tag_from_relevant_part_of_header_string(
                    relevant_tag_part)
                if tag not in all_fnames_currently_in_data or tag == (
                        'CJ_4_458', 'IMG_1922'):
                    tags_to_documents[tag] = document

    # redirect to the multiway-data handling path when applicable
    if 'multiway' in source_train_filename:
        # load in dataframes
        train_df, dev_df, test_df, _ = read_in_presplit_data(
            source_train_filename,
            source_dev_filename,
            source_test_filename,
            None,
            shuffle_data=False)

        non_text_columns, has_filename_col_already = get_other_colnames(
            train_df)
        assert has_filename_col_already
        augment_multiway_data(train_df, dev_df, test_df, tags_to_documents,
                              non_text_columns, new_train_filename,
                              new_dev_filename, new_test_filename)
        """
        Couldn't find context for 169 training sentences out of 1647
        215 / 1647 training sentences were at document start.
        Couldn't find context for 20 dev sentences out of 208
        27 / 208 dev sentences were at document start.
        Couldn't find context for 25 test sentences out of 208
        31 / 208 test sentences were at document start.
        """
    else:
        # automatically determines whether filename column exists already
        train_dict, dev_dict, test_dict, non_text_columns, has_filename_col_already = \
            read_in_existing_csv_files(source_train_filename, source_dev_filename, source_test_filename)
        list_of_all_datasplit_dicts = [train_dict, dev_dict, test_dict]

        document_text_filename_tuples = [
            (doc, tag[0] + '/' + tag[1])
            for tag, doc in tags_to_documents.items()
        ]

        # for each document:
        #     split its sentences
        #     figure out which data split a document (page) is in
        #     add this document's sentences to a file
        set_of_fnames_done_so_far = set()
        for document_tuple in tqdm(document_text_filename_tuples):
            document_text = document_tuple[0]
            document_filename = document_tuple[1]
            assert document_filename not in set_of_fnames_done_so_far
            set_of_fnames_done_so_far.add(document_filename)
            sentence_split_inds = get_sentence_split_inds(document_text)
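            # get_sentence_split_inds returns the exclusive end offset of each
            # sentence, so slicing document_text between consecutive offsets
            # (starting from 0) recovers the sentences in order.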
            list_of_sentences = []
            start_ind = 0
            for split_ind in sentence_split_inds:
                list_of_sentences.append(
                    document_text[start_ind:split_ind].strip())
                start_ind = split_ind
            # drop any empty strings left by consecutive split indices
            list_of_sentences = [
                sent for sent in list_of_sentences if len(sent) > 0
            ]

            (dict_corresponding_to_document, ind_of_start_sent_in_original_splitfile,
            ind_of_end_sent_in_original_splitfile) = \
                find_which_split_the_document_got_sorted_into(list_of_sentences, list_of_all_datasplit_dicts,
                                                              len(non_text_columns), has_filename_col_already,
                                                              document_filename=document_filename)
            add_contexts_for_document(list_of_sentences,
                                      dict_corresponding_to_document,
                                      document_filename,
                                      ind_of_start_sent_in_original_splitfile,
                                      ind_of_end_sent_in_original_splitfile,
                                      len(non_text_columns),
                                      has_filename_col_already)

        write_new_files(train_dict, dev_dict, test_dict, non_text_columns,
                        list_of_all_datasplit_dicts)