def get_ood_data(templates, train_dev_partition, test_partition, fo_test_ovit,
                 fo_test_ovot, output_header):
    set_vocab_by_type('ood')
    print('ood')
    num_examples = 300  # examples generated per template
    output_rows = []
    guid = 0
    for t in templates:
        existing_templates = []
        for i in range(num_examples):
            generated_template = t.generate_one_example()

            # re-sample until the generated example does not duplicate an
            # existing one for this template
            while generated_template in existing_templates:
                generated_template = t.generate_one_example()

            # add guid
            output_rows.append([guid] + generated_template)
            guid += 1
            existing_templates.append(generated_template)

    # filter rows by template partition
    output_rows_test_ovit = filter_by_template_partition(
        train_dev_partition, output_rows)
    output_rows_test_ovot = filter_by_template_partition(
        test_partition, output_rows)
    write_csv(fo_test_ovit, output_rows_test_ovit, output_header)
    write_csv(fo_test_ovot, output_rows_test_ovot, output_header)
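

# The helper functions below are used throughout these snippets but never
# defined in them. These are minimal sketches inferred from the call sites
# (write_csv writes a header row followed by data rows; rows passed to
# filter_by_template_partition carry the template id in column 1, right after
# the prepended guid). Treat them as assumptions, not the original helpers.
import csv

def write_csv(path, rows, header):
    # write a header row (if given) followed by the data rows
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        if header is not None:
            writer.writerow(header)
        writer.writerows(rows)

def filter_by_template_partition(partition, rows):
    # keep only rows whose template id belongs to the given partition
    return [row for row in rows if row[1] in partition]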


def main():
    hans_found_support_case = './esnli_train_subseq.csv'
    output_file = './esnli_train_subseq_with_templates.csv'
    output_header = None
    output_rows = []

    hans_templates = get_hans_templates()

    with open(hans_found_support_case) as f:
        reader = csv.reader(f)
        for (i, line) in enumerate(reader):
            if i == 0:
                output_header = line
                continue
            label = line[1]
            premise_pos = get_pos_tags(line[2])
            hypothesis_pos = get_pos_tags(line[3])
            template_name = match_templates(label, premise_pos, hypothesis_pos,
                                            hans_templates)
            if template_name is not None:
                line[-1] = template_name
                print('template_name: ', template_name)
                output_rows.append(line)

    write_csv(output_file, output_rows, output_header)
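

# get_pos_tags and match_templates are used above but are not defined in these
# snippets. Below is a minimal sketch of get_pos_tags assuming NLTK-style
# tagging (it needs the 'punkt' and 'averaged_perceptron_tagger' NLTK data
# packages); match_templates is assumed to return the name of the first HANS
# template whose POS pattern matches the sentence pair, or None when no
# template matches.
import nltk

def get_pos_tags(sentence):
    # one Penn Treebank POS tag per token, e.g. ['DT', 'NN', 'VBZ', ...]
    return [tag for _, tag in nltk.pos_tag(nltk.word_tokenize(sentence))]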
Example #3
def main():
    fi = './templates.csv'
    fo = './templates_new.csv'

    rows = []

    with open(fi) as f:
        reader = csv.reader(f)
        for (i, line) in enumerate(reader):
            if i == 0:
                header = line
            else:
                # strip leading/trailing spaces from every cell; the original
                # character-by-character trimming crashes on empty cells
                rows.append([item.strip(' ') for item in line])

    write_csv(fo, rows, header)
Example #4
def main():
    data_dir_name = 'split_abundant_words_subcases'
    num_seeds = 1
    fi_name_list = ['dev_1', 'dev_2', 'dev_4', 'dev_7', 'dev_13', 'dev_32',
                    'train_1', 'train_2', 'train_4', 'train_8', 'train_16',
                    'train_32', 'train_64',
                    'test_ivit_300', 'test_ivot_300', 'test_ovit_300',
                    'test_ovot_300']
    for seed in range(num_seeds):
        for partition in range(5):
            path = './%s/seed%d/partition%d/' % (data_dir_name, seed, partition)
            for fi in fi_name_list:
                fi_path = path + fi + '.csv'
                fo_nl = path + fi + '_nl.csv'                  # natural-language explanations
                fo_pt = path + fi + '_pt.csv'                  # pointer-only explanations
                fo_empty_expl = path + fi + '_empty_expl.csv'  # no explanation

                nl_rows = []
                pt_rows = []
                empty_expl_rows = []

                with open(fi_path) as f:
                    reader = csv.reader(f)
                    for (i, line) in enumerate(reader):
                        if i == 0:
                            continue  # skip the header row
                        guid = line[0]
                        label = line[5]
                        p = line[-4]   # premise
                        h = line[-3]   # hypothesis
                        nl = line[-2]  # natural-language explanation
                        pt = line[-1]  # pointer explanation

                        # e-SNLI-format rows have 19 columns; only the first
                        # five (guid, label, premise, hypothesis, explanation)
                        # are filled, the rest are left empty
                        for rows_out, expl in ((nl_rows, nl), (pt_rows, pt),
                                               (empty_expl_rows, "")):
                            row = [""] * 19
                            row[:5] = [guid, label, p, h, expl]
                            rows_out.append(row)

                write_csv(fo_nl, nl_rows, esnli_format_header)
                write_csv(fo_pt, pt_rows, esnli_format_header)
                write_csv(fo_empty_expl, empty_expl_rows, esnli_format_header)


def get_ind_data(templates, train_dev_partition, test_partition, fo_dir,
                 fo_train, fo_dev, fo_test_ivit, fo_test_ivot, output_header):
    set_vocab_by_type('ind')
    print('ind')
    num_examples = 492  # per template: 160 train + 32 dev + 300 test
    output_rows_train = []
    output_rows_dev = []
    output_rows_test = []
    guid_train = 0
    guid_dev = 0
    guid_test = 0
    output_rows_train_by_template = {}
    output_rows_dev_by_template = {}
    for t in templates:
        existing_templates = []
        for i in range(num_examples):
            generated_template = t.generate_one_example()

            # re-sample until the generated example does not duplicate an
            # existing one for this template
            while generated_template in existing_templates:
                generated_template = t.generate_one_example()

            # prepend a guid and route the example to train, dev or test
            if i < 160:
                line = [guid_train] + generated_template
                template_id = generated_template[0]
                if template_id in output_rows_train_by_template:
                    output_rows_train_by_template[template_id].append(line)
                else:
                    output_rows_train_by_template[template_id] = [line]
                output_rows_train.append(line)
                guid_train += 1
            elif i < 192:
                line = [guid_dev] + generated_template
                template_id = generated_template[0]
                if template_id in output_rows_dev_by_template:
                    output_rows_dev_by_template[template_id].append(line)
                else:
                    output_rows_dev_by_template[template_id] = [line]
                output_rows_dev.append(line)
                guid_dev += 1
            else:
                output_rows_test.append([guid_test] + generated_template)
                guid_test += 1

            existing_templates.append(generated_template)

    # filter rows by template partition
    output_rows_train_it = filter_by_template_partition(
        train_dev_partition, output_rows_train)
    output_rows_dev_it = filter_by_template_partition(train_dev_partition,
                                                      output_rows_dev)
    output_rows_test_ivit = filter_by_template_partition(
        train_dev_partition, output_rows_test)
    output_rows_test_ivot = filter_by_template_partition(
        test_partition, output_rows_test)
    write_csv(fo_train, output_rows_train_it, output_header)
    write_csv(fo_dev, output_rows_dev_it, output_header)
    write_csv(fo_test_ivit, output_rows_test_ivit, output_header)
    write_csv(fo_test_ivot, output_rows_test_ivot, output_header)

    # write small few-sample subsets of the train and dev sets
    train_sample_sizes = [1, 2, 4, 8, 16, 32, 64]
    dev_sample_sizes = list(set([int(0.2 * k) + 1
                                 for k in train_sample_sizes]))
    for train_size in train_sample_sizes:
        fo_train = '%strain_%d.csv' % (fo_dir, train_size)
        output_rows = []
        for k, v in output_rows_train_by_template.items():
            output_rows.extend(v[:train_size])
        output_rows = filter_by_template_partition(train_dev_partition,
                                                   output_rows)
        write_csv(fo_train, output_rows, output_header)
    for dev_size in dev_sample_sizes:
        fo_dev = '%sdev_%d.csv' % (fo_dir, dev_size)
        output_rows = []
        for k, v in output_rows_dev_by_template.items():
            output_rows.extend(v[:dev_size])
        output_rows = filter_by_template_partition(train_dev_partition,
                                                   output_rows)
        write_csv(fo_dev, output_rows, output_header)
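

# A quick sanity check of the per-template split used in get_ind_data: the 492
# generated examples per template break down into 160 train, 32 dev and 300
# test rows, which matches num_examples = 300 in get_ood_data and the
# test_*_300 file names above.
num_examples_per_template = 492
split_sizes = {'train': 160, 'dev': 192 - 160, 'test': num_examples_per_template - 192}
assert split_sizes == {'train': 160, 'dev': 32, 'test': 300}
assert sum(split_sizes.values()) == num_examples_per_template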
Example #6
# The setup below (paths, header, counters and the e-SNLI column layout) is an
# assumed reconstruction of this script's truncated preamble; the file names
# and header are guesses, not the original code. The loop keeps e-SNLI examples
# whose hypothesis appears inside the premise (the 'subsequence' cases).
import csv

fi_path = './esnli_train.csv'         # assumed e-SNLI input file
fo_path = './esnli_train_subseq.csv'  # assumed output file (read by the script above)
fo_header = ['guid', 'label', 'premise', 'hypothesis', 'expl1', 'template']  # assumed

count_entailment = 0
count_neutral = 0
count_contradiction = 0
fo_row_data = []

reader = csv.reader(open(fi_path))
next(reader)  # skip the e-SNLI header row

for line in reader:
    guid, label = line[0], line[1]
    premise, hypothesis, expl1 = line[2], line[3], line[4]

    prem_words = []
    hyp_words = []

    # lowercase both sentences and drop sentence-final punctuation tokens
    for word in premise.split():
        if word not in [".", "?", "!"]:
            prem_words.append(word.lower())

    for word in hypothesis.split():
        if word not in [".", "?", "!"]:
            hyp_words.append(word.lower())

    prem_filtered = " ".join(prem_words)
    hyp_filtered = " ".join(hyp_words)

    hypo_len = len(hypothesis.strip().split(" "))
    expl1_len = len(expl1.strip().split(" "))
    # keep examples whose hypothesis and explanation have at least 3 tokens and
    # whose filtered hypothesis appears verbatim inside the filtered premise
    if hypo_len >= 3 and expl1_len >= 3:
        if hyp_filtered in prem_filtered:
            if label == "entailment":
                count_entailment += 1
            if label == "neutral":
                count_neutral += 1
            if label == "contradiction":
                count_contradiction += 1
            fo_row_data.append([guid, label, premise, hypothesis, expl1, ""])

print("Entailment:", count_entailment)
print("Contradiction:", count_contradiction)
print("Neutral:", count_neutral)

write_csv(fo_path, fo_row_data, fo_header)