Example #1
def score_converter_scaled(org_data_file,
                           full_sent_list,
                           scale_prob=0.5,
                           delete_prob=True):
    """
    :param org_data_file:
    :param full_sent_list: append full_sent_score list to evidence of original data file
    :param delete_prob: delete the probability for sanity check
    :param scale_prob:  0.5
    :return:
    """
    d_list = common.load_jsonl(org_data_file)
    augmented_dict = dict()
    print("Build selected sentences file:", len(full_sent_list))
    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id']  # the id for the current selection
        org_id = int(selection_id.split('<##>')[0])
        if org_id in augmented_dict:
            augmented_dict[org_id].append(sent_item)
        else:
            augmented_dict[org_id] = [sent_item]

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            cur_adv_predicted_sentids = []
        else:
            cur_adv_predicted_sentids = []  # format: doc_id + c_scorer.SENT_LINE + line_number
            sents = augmented_dict[int(item['id'])]
            # Select sentences whose probability clears the threshold.
            for sent_i in sents:
                if sent_i['prob'] >= scale_prob:
                    cur_adv_predicted_sentids.append(
                        (sent_i['sid'], sent_i['score'], sent_i['prob']))

            # Sort by score, descending.
            cur_adv_predicted_sentids = sorted(cur_adv_predicted_sentids,
                                               key=lambda x: -x[1])

        item['scored_sentids'] = cur_adv_predicted_sentids[:5]  # important sentences for scaled training (Jul 21)
        item['predicted_sentids'] = [sid for sid, _, _ in item['scored_sentids']][:5]
        item['predicted_evidence'] = convert_evidence2scoring_format(
            item['predicted_sentids'])
        item['predicted_label'] = item['label']  # assign the ground-truth label

    # Removing all score and prob
    if delete_prob:
        for sent_item in full_sent_list:
            if 'score' in sent_item.keys():
                del sent_item['score']
                del sent_item['prob']

    return d_list
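
A minimal usage sketch for the converter above. The '<##>' separator comes from the code itself; the file name, ids, and scores are hypothetical, and c_scorer.SENT_LINE is assumed to be the '<SENT_LINE>' marker used elsewhere in this module.

# Hypothetical input shape, inferred from the accesses above.
full_sent_list = [
    {'selection_id': '137334<##>Some_Page<SENT_LINE>3',  # "<claim_id><##><sid>"
     'sid': 'Some_Page<SENT_LINE>3',
     'score': 4.2,    # raw selector logit, used for ranking
     'prob': 0.91},   # selector probability, compared against scale_prob
]

# 'dev.jsonl' stands in for the original FEVER-style data file.
results = score_converter_scaled('dev.jsonl', full_sent_list,
                                 scale_prob=0.5, delete_prob=False)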
Example #2
def threshold_sampler_insure_unique_list(org_data_file,
                                         full_sent_list,
                                         prob_threshold=0.5,
                                         logist_threshold=None,  # accepted for interface compatibility but unused
                                         top_n=5):
    """
    Provide samples to the training set by applying a probability threshold to the
    upstream selected sentences, keeping each (claim, sentence) pair only once.
    """
    d_list = org_data_file  # already a loaded list here, not a file path
    augmented_dict: Dict[int, Dict[str, Dict]] = dict()
    print("Build selected sentences file:", len(full_sent_list))
    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id']  # the id for the current selection
        org_id = int(selection_id.split('<##>')[0])
        remain_str = selection_id.split('<##>')[1]  # doc_id + c_scorer.SENT_LINE + line_number
        if org_id in augmented_dict:
            if remain_str not in augmented_dict[org_id]:
                augmented_dict[org_id][remain_str] = sent_item
            else:
                print("Duplicate selection_id; keeping the first occurrence:", selection_id)
        else:
            augmented_dict[org_id] = {remain_str: sent_item}

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            # print("Potential error?")
            cur_predicted_sentids = []
        else:
            cur_predicted_sentids = [
            ]  # formating doc_id + c_score.SENTLINT + line_number
            sents = augmented_dict[int(item['id'])].values()
            # Modify some mechaism here to selection sentence whether by some score or label
            for sent_i in sents:
                if sent_i['prob'] >= prob_threshold:
                    cur_predicted_sentids.append(
                        (sent_i['sid'], sent_i['score'], sent_i['prob']))

            # Sort by score, descending.
            cur_predicted_sentids = sorted(cur_predicted_sentids,
                                           key=lambda x: -x[1])

        item['scored_sentids'] = cur_predicted_sentids[:top_n]  # important sentences for scaled training (Jul 21)
        item['predicted_sentids'] = [sid for sid, _, _ in item['scored_sentids']][:top_n]
        item['predicted_evidence'] = convert_evidence2scoring_format(
            item['predicted_sentids'])
        # item['predicted_label'] = item['label']  # uncomment to assign the ground-truth label

    return d_list
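
Unlike threshold_sampler below, this variant expects the original data already loaded in memory (note d_list = org_data_file above). A minimal call, with a hypothetical file name:

claims = common.load_jsonl('train.jsonl')  # load once, reuse across calls
sampled = threshold_sampler_insure_unique_list(claims, full_sent_list,
                                               prob_threshold=0.35, top_n=5)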
Example #3
def threshold_sampler(org_data_file,
                      full_sent_list,
                      prob_threshold=0.5,
                      logist_threshold=None,  # accepted for interface compatibility but unused
                      top_n=5):
    """
    Provide samples to the training set by applying a probability threshold to the
    upstream selected sentences.
    """
    d_list = common.load_jsonl(org_data_file)
    augmented_dict = dict()
    print("Build selected sentences file:", len(full_sent_list))
    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id']  # the id for the current selection
        org_id = int(selection_id.split('<##>')[0])
        if org_id in augmented_dict:
            # NOTE: duplicates are not removed here; see threshold_sampler_insure_unique_list.
            augmented_dict[org_id].append(sent_item)
        else:
            augmented_dict[org_id] = [sent_item]

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            cur_predicted_sentids = []
        else:
            cur_predicted_sentids = []  # format: doc_id + c_scorer.SENT_LINE + line_number
            sents = augmented_dict[int(item['id'])]
            # Select sentences whose probability clears the threshold.
            for sent_i in sents:
                if sent_i['prob'] >= prob_threshold:
                    cur_predicted_sentids.append(
                        (sent_i['sid'], sent_i['score'], sent_i['prob']))

            # Sort by score, descending.
            cur_predicted_sentids = sorted(cur_predicted_sentids,
                                           key=lambda x: -x[1])

        item['scored_sentids'] = cur_predicted_sentids[:top_n]  # important sentences for scaled training (Jul 21)
        item['predicted_sentids'] = [sid for sid, _, _ in item['scored_sentids']][:top_n]
        item['predicted_evidence'] = convert_evidence2scoring_format(
            item['predicted_sentids'])
        # item['predicted_label'] = item['label']  # uncomment to assign the ground-truth label

    return d_list
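
The behavioral difference between the two samplers shows up when the upstream list repeats a sentence. A hedged sketch with made-up ids and a hypothetical file name:

dup = {'selection_id': '42<##>Page_X<SENT_LINE>1',
       'sid': 'Page_X<SENT_LINE>1', 'score': 3.0, 'prob': 0.9}
claims = [{'id': 42}]

a = threshold_sampler_insure_unique_list(claims, [dup, dict(dup)])
# a[0]['scored_sentids'] has one entry: the duplicate was dropped.
b = threshold_sampler('claims.jsonl', [dup, dict(dup)])  # same claims, loaded from disk
# b[0]['scored_sentids'] has two identical entries: duplicates pass through.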
Example #4
def score_converter(org_data_file, full_sent_list, top_k=5, prob_thr=0.5):
    """
        Combines sentences of same claim 
        :param org_data_file:
        :param full_sent_list: append full_sent_score list to evidence of original data file
        :param top_k: top k sentences to be retrieved
        :param prob_thr: probability threshold for retrieved sentences
        :return:
        """
    d_list = common.load_jsonl(org_data_file)
    augmented_dict = dict()
    print("Build selected sentences file:", len(full_sent_list))
    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id']
        org_id = int(selection_id.split('<##>')[0])
        if org_id in augmented_dict:
            augmented_dict[org_id].append(sent_item)
        else:
            augmented_dict[org_id] = [sent_item]

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            cur_predicted_sentids = []
        else:
            cur_predicted_sentids = []
            sents = augmented_dict[int(item['id'])]

            for sent_i in sents:
                if sent_i['prob'] >= prob_thr:
                    cur_predicted_sentids.append(
                        (sent_i['sid'], sent_i['score']))

            cur_predicted_sentids = sorted(cur_predicted_sentids,
                                           key=lambda x: -x[1])

        item['scored_sentids'] = cur_predicted_sentids
        item['predicted_sentids'] = [sid for sid, _ in item['scored_sentids']][:top_k]
        item['predicted_evidence'] = convert_evidence2scoring_format(
            item['predicted_sentids'])
        item['predicted_label'] = item['label']  # ground-truth label (for OFEVER calculation)

    # Removing all score and prob
    for sent_item in full_sent_list:
        if 'score' in sent_item.keys():
            del sent_item['score']
            del sent_item['prob']

    return d_list
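
Note that this variant stores 2-tuples (sid, score) in scored_sentids and keeps the whole sorted list, truncating only predicted_sentids to top_k. A minimal call, with a hypothetical file name:

results = score_converter('dev.jsonl', full_sent_list, top_k=5, prob_thr=0.2)
# Each item now carries: scored_sentids (all kept, sorted by score),
# predicted_sentids (top_k sids), predicted_evidence, and predicted_label.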
Example #5
def navie_results_builder_for_sanity_check(org_data_file, full_sent_list):
    """
    :param org_data_file:
    :param full_sent_list: append full_sent_score list to evidence of original data file
    :return:
    """
    d_list = common.load_jsonl(org_data_file)
    augmented_dict = dict()
    print("Build selected sentences file")
    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id']  # the id for the current selection
        org_id = int(selection_id.split('<##>')[0])
        if org_id in augmented_dict:
            augmented_dict[org_id].append(sent_item)
        else:
            augmented_dict[org_id] = [sent_item]

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            cur_predicted_sentids = []
        else:
            cur_predicted_sentids = []  # format: doc_id + c_scorer.SENT_LINE + line_number
            sents = augmented_dict[int(item['id'])]
            # Select sentences by their gold selection label.
            for sent_i in sents:
                if sent_i['selection_label'] == "true":
                    cur_predicted_sentids.append(sent_i['sid'])

        item['predicted_sentids'] = cur_predicted_sentids
        item['predicted_evidence'] = convert_evidence2scoring_format(
            item['predicted_sentids'])
        item['predicted_label'] = item['label']

    return d_list
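
Because the builder copies both the gold selection labels and the gold claim label, its output scores 100% label accuracy by construction; scoring it therefore measures only evidence recall. A hypothetical call:

# Each sentence item needs a gold 'selection_label' field ("true"/"false").
sanity = navie_results_builder_for_sanity_check('dev.jsonl', full_sent_list)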
Example #6
def select_sent_with_prob_for_eval_list(input_file,
                                        additional_file,
                                        prob_dict_file,
                                        tokenized=False,
                                        pipeline=False,
                                        is_demo=False):  # is_demo is unused here
    """
    Select sentences with upstream sentence retrieval and attach their probabilities.

    :param input_file: the data, already loaded as a list, with 5 sentences selected per claim.
    :return: the augmented list of items
    """
    cursor = fever_db.get_cursor()

    if isinstance(additional_file, list):
        additional_d_list = additional_file
    else:
        additional_d_list = load_data(additional_file)
    additional_data_dict = dict()

    for add_item in additional_d_list:
        additional_data_dict[add_item['id']] = add_item

    d_list = input_file

    for item in tqdm(d_list):
        e_list = additional_data_dict[item['id']]['predicted_sentids']
        if not pipeline:
            assert additional_data_dict[item['id']]['label'] == item['label']
            assert additional_data_dict[item['id']]['verifiable'] == item['verifiable']
        assert additional_data_dict[item['id']]['id'] == item['id']

        pred_evidence_list = []
        for i, cur_e in enumerate(e_list):
            doc_id = cur_e.split(c_scorer.SENT_LINE)[0]
            ln = int(cur_e.split(c_scorer.SENT_LINE)[1])  # important bug fix: July 21
            pred_evidence_list.append((doc_id, ln))

        pred_evidence = check_sentences.Evidences(pred_evidence_list)

        evidence_text_list = evidence_list_to_text_list(cursor,
                                                        pred_evidence,
                                                        contain_head=True,
                                                        id_tokenized=tokenized)

        evidences = sorted(pred_evidence, key=lambda x: (x[0], x[1]))
        item_id = int(item['id'])

        evidence_text_list_with_prob = []
        for text, (doc_id, ln) in zip(evidence_text_list, evidences):
            ssid = (item_id, doc_id, int(ln))
            if ssid not in prob_dict_file:
                print("Some sentence pair don't have 'prob'.")
                prob = 0.5
            else:
                prob = prob_dict_file[ssid]['prob']
                assert item['claim'] == prob_dict_file[ssid]['claim']

            evidence_text_list_with_prob.append((text, prob))

        if not tokenized:
            item['claim'] = ' '.join(easy_tokenize(item['claim']))

        item['evid'] = evidence_text_list_with_prob
        item['predicted_evidence'] = convert_evidence2scoring_format(e_list)
        item['predicted_sentids'] = e_list
        # This change needs to be saved.
        # item['predicted_label'] = additional_data_dict[item['id']]['label']

    return d_list
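
A sketch of the prob_dict_file structure implied above: it maps (claim_id, doc_id, line_number) tuples to dicts carrying at least 'prob' and 'claim'. All ids, texts, and file names below are hypothetical, and c_scorer.SENT_LINE is assumed to be the '<SENT_LINE>' marker inside each predicted sentid:

prob_dict = {
    (137334, 'Some_Page', 3): {'prob': 0.91,
                               'claim': 'Some claim text goes here.'},
}
d_list = common.load_jsonl('dev.jsonl')                # items with 'id', 'claim', 'label', 'verifiable'
upstream = common.load_jsonl('dev_sent_scores.jsonl')  # items with 'predicted_sentids'
out = select_sent_with_prob_for_eval_list(d_list, upstream, prob_dict,
                                          tokenized=True)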
Example #7
def score_converter(org_data_file, full_sent_list, upstream_file, top_k=5, prob_thr=0.5):
    """
        Combines sentences of same claim and retrieves only top k sentences with
        probability greater than threshold
        :param org_data_file:
        :param full_sent_list: append full_sent_score list to evidence of original data file
        :param upstream_file: sentences missed during hyperlink sentences generation are gathered from this file
        :param top_k: top k sentences to be retrieved
        :param prob_thr: probability threshold for retrieved sentences
        :return:
        """
    d_list = common.load_jsonl(org_data_file)
    ans_list = common.load_jsonl(upstream_file)
    augmented_dict = dict()
    print("Build selected sentences file:", len(full_sent_list))

    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id'][0]  # 'selection_id' is a list here; every entry shares the claim-id prefix
        org_id = int(selection_id.split('<##>')[0])
        if org_id in augmented_dict:
            augmented_dict[org_id].append(sent_item)
        else:
            augmented_dict[org_id] = [sent_item]

    for item, ans in zip(d_list, ans_list):  # assumes both files are aligned by position
        if int(item['id']) not in augmented_dict:
            cur_predicted_sentids = []
        else:
            cur_predicted_sentids_dict = dict() 
            cur_predicted_sentids = []
            sents = augmented_dict[int(item['id'])]
            
            for sent_i in sents:
                if (sent_i['fsid'], sent_i['fscore']) not in cur_predicted_sentids_dict:
                    cur_predicted_sentids_dict[(sent_i['fsid'], sent_i['fscore'])] = []

                assert len(sent_i['selection_id']) == len(sent_i['score'])
                assert len(sent_i['selection_id']) == len(sent_i['prob'])

                for sid, score, prob in zip(sent_i['selection_id'],
                                            sent_i['score'], sent_i['prob']):
                    if prob >= prob_thr:
                        cur_predicted_sentids_dict[(sent_i['fsid'], sent_i['fscore'])].append(
                            (sid.split('<##>')[1], score))

            # If a first-iteration sentence contains no hyperlink, it was missed earlier; add it back here.
            for sid1, score1, _ in ans['scored_sentids']:
                isthere = False
                for sid2, score2 in cur_predicted_sentids_dict:
                    if sid1 == sid2:
                        isthere = True
                        if score1 != score2:
                            print("Score mismatch between upstream and current run for:", sid1)
                        break
                if not isthere:
                    cur_predicted_sentids_dict[(sid1, score1)] = []

            sorted_keys = sorted(cur_predicted_sentids_dict, key=lambda x: -x[1])
            for k in sorted_keys:
                cps_tmp = sorted(cur_predicted_sentids_dict[k], key=lambda x: -x[1])
                cur_predicted_sentids.append(k)
                # Keep at most one hyperlinked sentence per first-iteration sentence.
                cur_predicted_sentids.extend(cps_tmp[:1])

        item['scored_sentids'] = cur_predicted_sentids

        # If none of the first-iteration sentences contained hyperlinks, they were all missed above; add them here.
        for sid1, score1, _ in ans['scored_sentids']:
            isthere = False
            for sid2, _ in item['scored_sentids']:
                if sid1 == sid2:
                    isthere = True
                    break
            if not isthere:
                item['scored_sentids'].append((sid1, score1))

        item['predicted_sentids'] = [sid for sid, _ in item['scored_sentids']][:top_k]
        item['predicted_evidence'] = convert_evidence2scoring_format(item['predicted_sentids'])
        item['predicted_label'] = item['label']  # give ground truth label

    # Removing all score and prob
    for sent_item in full_sent_list:
        if 'score' in sent_item.keys():
            del sent_item['score']
            del sent_item['prob']

    return d_list
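
A sketch of the nested item shape this variant expects: each entry describes one first-iteration sentence ('fsid', 'fscore') plus parallel lists for its hyperlinked second-iteration candidates. All ids, scores, and file names are hypothetical:

full_sent_list = [{
    'fsid': 'Page_A<SENT_LINE>2',                  # first-iteration sentence id
    'fscore': 5.1,                                 # its selection score
    'selection_id': ['88<##>Page_B<SENT_LINE>0'],  # hyperlinked candidates
    'score': [2.7],
    'prob': [0.64],
}]
results = score_converter('dev.jsonl', full_sent_list,
                          upstream_file='dev_first_iter.jsonl',
                          top_k=5, prob_thr=0.5)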