# NOTE: besides tqdm, the helpers used below (fever_db, common, c_scorer,
# check_sentences, easy_tokenize, load_data,
# sample_additional_data_for_item_v1_1, convert_evidence2scoring_format) are
# project-local FEVER-pipeline modules/functions and are assumed to already
# be in scope; their exact import paths depend on the repository layout.
from tqdm import tqdm


def convert_to_formatted_sent(zipped_s_id_list,
                              evidence_set,
                              contain_head=True,
                              id_tokenized=True):
    sent_list = []
    for sent, sid in zipped_s_id_list:
        sent_item = dict()

        cur_sent = sent
        # Upstream sids arrive as '<doc_id>(-.-)<line_number>'.
        doc_id, ln_str = sid.split('(-.-)')
        ln = int(ln_str)
        if contain_head:
            if not id_tokenized:
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace(
                    '_', ' ')
                t_doc_id_natural_format = ' '.join(
                    easy_tokenize(doc_id_natural_format))
            else:
                t_doc_id_natural_format = common.doc_id_to_tokenized_text(
                    doc_id)

            # Prefix the page title unless this is the lead line (line 0) or
            # the title already appears in the sentence.
            if ln != 0 and t_doc_id_natural_format.lower() not in sent.lower():
                cur_sent = f"{t_doc_id_natural_format} <t> " + sent

        sent_item['text'] = cur_sent
        # sid is '[doc_id]<SENT_LINE>[line_number]'
        sent_item['sid'] = doc_id + c_scorer.SENT_LINE + str(ln)
        if evidence_set is not None:
            if (doc_id, ln) in evidence_set:
                sent_item['selection_label'] = "true"
            else:
                sent_item['selection_label'] = "false"
        else:
            sent_item['selection_label'] = "hidden"

        sent_list.append(sent_item)

    return sent_list
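

# Hedged usage sketch (not from the original repo): shows the expected input
# and output shapes of convert_to_formatted_sent. The sentence, sid and
# evidence values below are invented, and running this assumes the project's
# common.doc_id_to_tokenized_text helper is available.
def _demo_convert_to_formatted_sent():
    zipped = [("Obama was the 44th president .", "Barack_Obama(-.-)7")]
    evidence = {("Barack_Obama", 7)}
    for sent_item in convert_to_formatted_sent(zipped, evidence,
                                               contain_head=True,
                                               id_tokenized=True):
        # Expect 'text' = title-prefixed sentence, 'sid' =
        # 'Barack_Obama' + c_scorer.SENT_LINE + '7', and
        # 'selection_label' = "true", since (doc_id, 7) is in the evidence set.
        print(sent_item['text'], sent_item['sid'],
              sent_item['selection_label'])

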
def evidence_list_to_text_list(cursor,
                               evidences,
                               contain_head=True,
                               id_tokenized=False):
    # One text per evidence: len(current_evidence_text_list) == len(evidences).
    current_evidence_text_list = []
    evidences = sorted(evidences, key=lambda x: (x[0], x[1]))

    # Sentinel that never equals a real doc_id, so the first document always
    # triggers the head logic below.
    cur_head = 'DO NOT INCLUDE THIS FLAG'

    for doc_id, line_num in evidences:

        _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

        cur_text = ""

        if contain_head and cur_head != doc_id:
            cur_head = doc_id

            if not id_tokenized:
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace(
                    '_', ' ')
                t_doc_id_natural_format = ' '.join(
                    easy_tokenize(doc_id_natural_format))
            else:
                t_doc_id_natural_format = common.doc_id_to_tokenized_text(
                    doc_id)

            # Prefix the page title, except for the lead line (line 0).
            if line_num != 0:
                cur_text = f"{t_doc_id_natural_format} <t> "

        cur_text = cur_text + e_text

        current_evidence_text_list.append(cur_text)

    assert len(evidences) == len(current_evidence_text_list)
    return current_evidence_text_list
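

# Hedged usage sketch (not from the original repo): assumes a local FEVER
# sqlite database reachable through fever_db.get_cursor(); the doc ids and
# line numbers are invented.
def _demo_evidence_list_to_text_list():
    cursor = fever_db.get_cursor()
    texts = evidence_list_to_text_list(
        cursor,
        [("Barack_Obama", 7), ("Loretta_Sanchez", 0)],
        contain_head=True,
        id_tokenized=False)
    # One text per evidence pair. The '<title> <t> ' prefix is added only for
    # the first selected line of each document, and never for the lead line
    # (line 0), so here only the Barack_Obama text gets it.
    for text in texts:
        print(text[:80])
    cursor.close()

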
def select_sent_with_prob_for_eval_list(input_file,
                                        additional_file,
                                        prob_dict_file,
                                        tokenized=False,
                                        pipeline=False,
                                        is_demo=False):
    """
    This method select sentences with upstream sentence retrieval.

    :param input_file: This should be the file with 5 sentences selected.
    :return:
    """
    cursor = fever_db.get_cursor()

    if isinstance(additional_file, list):
        additional_d_list = additional_file
    else:
        additional_d_list = load_data(additional_file)
    additional_data_dict = dict()

    for add_item in additional_d_list:
        additional_data_dict[add_item['id']] = add_item

    d_list = input_file  # input_file is already a loaded list of items

    for item in tqdm(d_list):
        e_list = additional_data_dict[item['id']]['predicted_sentids']
        if not pipeline:
            assert additional_data_dict[item['id']]['label'] == item['label']
            assert additional_data_dict[
                item['id']]['verifiable'] == item['verifiable']
        assert additional_data_dict[item['id']]['id'] == item['id']

        pred_evidence_list = []
        for cur_e in e_list:
            # Each predicted sentid is '[doc_id]<SENT_LINE>[line_number]'.
            doc_id, ln_str = cur_e.split(c_scorer.SENT_LINE)
            ln = int(ln_str)
            pred_evidence_list.append((doc_id, ln))

        pred_evidence = check_sentences.Evidences(pred_evidence_list)

        evidence_text_list = evidence_list_to_text_list(cursor,
                                                        pred_evidence,
                                                        contain_head=True,
                                                        id_tokenized=tokenized)

        evidences = sorted(pred_evidence, key=lambda x: (x[0], x[1]))
        item_id = int(item['id'])

        evidence_text_list_with_prob = []
        for text, (doc_id, ln) in zip(evidence_text_list, evidences):
            ssid = (item_id, doc_id, int(ln))
            if ssid not in prob_dict_file:
                print(f"No 'prob' for sentence pair {ssid}; defaulting to 0.5.")
                prob = 0.5
            else:
                prob = prob_dict_file[ssid]['prob']
                assert item['claim'] == prob_dict_file[ssid]['claim']

            evidence_text_list_with_prob.append((text, prob))

        if not tokenized:
            item['claim'] = ' '.join(easy_tokenize(item['claim']))

        item['evid'] = evidence_text_list_with_prob
        item['predicted_evidence'] = convert_evidence2scoring_format(e_list)
        item['predicted_sentids'] = e_list

    cursor.close()
    return d_list
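

# Hedged sketch (not from the original repo) of the prob_dict_file layout the
# functions above and below rely on: keys are (item_id, doc_id, line_number)
# tuples; values carry the upstream retrieval probability and the claim used
# in the sanity-check assert. All concrete values are invented.
_example_prob_dict = {
    (137334, 'Barack_Obama', 7): {
        'prob': 0.93,
        'claim': 'Barack Obama was the 44th president .',
    },
}

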
def adv_simi_sample_with_prob_v1_1(input_file,
                                   additional_file,
                                   prob_dict_file,
                                   tokenized=False):
    cursor = fever_db.get_cursor()
    d_list = load_data(input_file)

    if isinstance(additional_file, list):
        additional_d_list = additional_file
    else:
        additional_d_list = load_data(additional_file)
    additional_data_dict = dict()

    for add_item in additional_d_list:
        additional_data_dict[add_item['id']] = add_item

    sampled_data_list = []
    count = 0

    for item in tqdm(d_list):
        sampled_e_list, flags = sample_additional_data_for_item_v1_1(
            item, additional_data_dict)
        for i, (sampled_evidence,
                flag) in enumerate(zip(sampled_e_list, flags)):
            # Build a fresh dict instead of deep-copying `item`; this might
            # change in the future for error analysis.
            new_item = dict()
            evidence_text_list = evidence_list_to_text_list(
                cursor,
                sampled_evidence,
                contain_head=True,
                id_tokenized=tokenized)

            evidences = sorted(sampled_evidence, key=lambda x: (x[0], x[1]))
            item_id = int(item['id'])

            evidence_text_list_with_prob = []
            for text, (doc_id, ln) in zip(evidence_text_list, evidences):
                ssid = (item_id, doc_id, int(ln))
                if ssid not in prob_dict_file:
                    count += 1
                    print(f"No 'prob' for sentence pair {ssid}; "
                          f"defaulting to 0.5.")
                    prob = 0.5
                else:
                    prob = prob_dict_file[ssid]['prob']
                    assert item['claim'] == prob_dict_file[ssid]['claim']

                evidence_text_list_with_prob.append((text, prob))

            new_item['id'] = str(item['id']) + '#' + str(i)

            if tokenized:
                new_item['claim'] = item['claim']
            else:
                new_item['claim'] = ' '.join(easy_tokenize(item['claim']))

            new_item['evid'] = evidence_text_list_with_prob

            new_item['verifiable'] = item['verifiable']
            new_item['label'] = item['label']

            # print("C:", new_item['claim'])
            # print("E:", new_item['evid'])
            # print("L:", new_item['label'])
            # print()
            sampled_data_list.append(new_item)

    cursor.close()

    print(f"Sentence pairs without 'prob': {count}")
    return sampled_data_list
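

# Hedged end-to-end sketch (not from the original repo): the paths are
# placeholders, and a real run needs the project's FEVER database plus actual
# upstream sentence-retrieval output whose ids match the prob dict.
def _demo_adv_simi_sample():
    sampled = adv_simi_sample_with_prob_v1_1(
        'data/fever/train.jsonl',               # hypothetical input path
        'results/sent_retri/train_top5.jsonl',  # hypothetical retrieval file
        _example_prob_dict,
        tokenized=True)
    print(len(sampled))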