Example #1
def get_sentences(tag, is_training, debug=False):
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug:
        # d_list = d_list[:10]
        d_list = d_list[:50]
        # d_list = d_list[:200]

    doc_results = common.load_jsonl(
        config.RESULT_PATH /
        f"doc_retri_results/fever_results/merged_doc_results/m_doc_{tag}.jsonl"
    )
    doc_results_dict = list_dict_data_tool.list_to_dict(doc_results, 'id')
    fever_db_cursor = fever_db.get_cursor(config.FEVER_DB)
    forward_items = build_full_wiki_document_forward_item(
        doc_results_dict,
        d_list,
        is_training=is_training,
        db_cursor=fever_db_cursor)
    return forward_items
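
A minimal usage sketch for the function above (hedged: it assumes the surrounding config paths and the FEVER database exist, and is not part of the original module):

if __name__ == '__main__':
    # Load a small debug slice of the dev split without training labels.
    forward_items = get_sentences('dev', is_training=False, debug=True)
    print("Number of forward items:", len(forward_items))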
Example #2
def inspect_upstream_eval():
    is_training = True
    debug_mode = True
    d_list = common.load_jsonl(config.OPEN_SQUAD_DEV_GT)
    in_file_name = config.PRO_ROOT / 'saved_models/05-12-08:44:38_mtr_open_qa_p_level_(num_train_epochs:3)/i(2000)|e(2)|squad|top10(0.6909176915799432)|top20(0.7103122043519394)|seed(12)_eval_results.jsonl'
    cur_eval_results_list = common.load_jsonl(in_file_name)
    top_k = 10
    filter_value = 0.1
    t_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_RAW_TEXT)
    match_type = 'string'

    if debug_mode:
        d_list = d_list[:100]
        id_set = set([item['question'] for item in d_list])
        cur_eval_results_list = [
            item for item in cur_eval_results_list if item['qid'] in id_set
        ]

    d_o_dict = list_dict_data_tool.list_to_dict(d_list, 'question')
    copied_d_o_dict = copy.deepcopy(d_o_dict)

    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_d_o_dict, 'qid', 'fid', check=True)

    cur_results_dict_top10 = od_sample_utils.select_top_k_and_to_results_dict(
        copied_d_o_dict,
        score_field_name='prob',
        top_k=top_k,
        filter_value=filter_value)

    forward_example_items = build_open_qa_forword_item(cur_results_dict_top10,
                                                       d_list, is_training,
                                                       t_cursor, match_type)

    print(forward_example_items)
Example #3
def p_eval():
    dev_list = common.load_jsonl(config.FEVER_DEV)
    # common.save_jsonl(cur_eval_results_list, f"fever_p_level_{tag}_results.jsonl")
    cur_eval_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_dev_results.jsonl"
    )

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    copied_dev_d_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)

    cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_dev_o_dict,
                                                              score_field_name='prob',
                                                              top_k=5, filter_value=0.005)

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_dev_d_list,
                                                                   cur_results_dict_th0_5,
                                                                   'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list, dev_list,
                                                            max_evidence=5)

    score_05 = {
        'ss': strict_score,
        'pr': pr, 'rec': rec, 'f1': f1,
    }

    print(score_05)
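
The helper select_top_k_and_to_results_dict is called throughout these examples but never shown, and its signature varies slightly between call sites (threshold= vs. filter_value=). The sketch below is a hypothetical reconstruction of the behavior implied by those call sites: rank each query's appended sub-items by a score field, keep at most top_k that pass the filter threshold, and emit a results dict keyed by query id. The 'selected_items' subfield name and the output fields are assumptions, not the repository's actual implementation:

def select_top_k_and_to_results_dict(d_dict, score_field_name='prob',
                                     top_k=5, filter_value=0.005,
                                     result_field='predicted_docids'):
    # Hypothetical sketch; field names are assumptions.
    results_dict = {}
    for qid, entry in d_dict.items():
        candidates = entry.get('selected_items', [])  # appended sub-items (assumed key)
        ranked = sorted(candidates, key=lambda x: x[score_field_name], reverse=True)
        kept = [c for c in ranked[:top_k] if c[score_field_name] >= filter_value]
        results_dict[qid] = {
            result_field: [c['fid'] for c in kept],
            'selected_scored_results': [(c[score_field_name], c['fid']) for c in kept],
        }
    return results_dict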
Example #4
def post_process():
    from pathlib import Path
    input_file = '/home/easonnie/projects/FunEver/results/pipeline_r/2018_07_24_11:07:41_r(new_model_v1_2_for_realtest)_scaled_0.05_withlb/balance.jsonl'
    nli_results = common.load_jsonl(input_file)
    print("Post Processing enhancement")
    delete_unused_evidence(nli_results)
    print("Deleting Useless Evidence")

    current_pipeline_dir = Path(
        '/home/easonnie/projects/FunEver/results/pipeline_r/2018_07_24_11:07:41_r(new_model_v1_2_for_realtest)_scaled_0.05_withlb'
    )
    dev_sent_file_1 = current_pipeline_dir / "dev_sent_score_1_shared_task_test.jsonl"
    dev_sent_file_2 = current_pipeline_dir / "dev_sent_score_2_shared_task_test.jsonl"

    dev_sent_list_1 = common.load_jsonl(dev_sent_file_1)
    dev_sent_list_2 = common.load_jsonl(dev_sent_file_2)

    print("Appending 1 of second Evidence")
    nli_results = simi_sampler.threshold_sampler_insure_unique_merge(
        nli_results, dev_sent_list_2, 0.9, top_n=5, add_n=1)
    delete_unused_evidence(nli_results)

    # High-tolerance enhancement
    print("Final high-tolerance enhancement")
    print("Appending all evidence from the first sentence file")
    nli_results = simi_sampler.threshold_sampler_insure_unique_merge(
        nli_results, dev_sent_list_1, -1, top_n=5, add_n=100)
    delete_unused_evidence(nli_results)

    # if build_submission:
    output_file = current_pipeline_dir / "predictions.jsonl"
    build_submission_file(nli_results, output_file)
Example #5
def inspect_upstream_eval_v1():
    bert_model_name = "bert-base-uncased"
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    do_lower_case = True

    max_pre_context_length = 315
    max_query_length = 64
    doc_stride = 128

    is_training = True
    debug_mode = True

    d_list = common.load_jsonl(config.OPEN_SQUAD_DEV_GT)
    in_file_name = config.PRO_ROOT / 'saved_models/05-12-08:44:38_mtr_open_qa_p_level_(num_train_epochs:3)/i(2000)|e(2)|squad|top10(0.6909176915799432)|top20(0.7103122043519394)|seed(12)_eval_results.jsonl'
    cur_eval_results_list = common.load_jsonl(in_file_name)
    top_k = 10
    filter_value = 0.1
    match_type = 'string'
    tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                              do_lower_case=do_lower_case,
                                              cache_dir=bert_pretrain_path)

    fitems_dict, read_fitems_list, _ = get_open_qa_item_with_upstream_paragraphs(
        d_list, cur_eval_results_list, is_training, tokenizer,
        max_pre_context_length, max_query_length, doc_stride, debug_mode,
        top_k, filter_value, match_type)
    print(len(read_fitems_list))
    print(len(fitems_dict))
Example #6
def show_nli_binned_plot(y_axis_value):
    dataset_name = 'Natural Language Inference'
    task_name = 'uncertainty_nli'
    snli_data_file = config.CHAOSNLI_SNLI
    mnli_data_file = config.CHAOSNLI_MNLI

    model_pred_file = config.MODEL_PRED_NLI

    d_list_snli = common.load_jsonl(snli_data_file)
    d_list_mnli = common.load_jsonl(mnli_data_file)

    collected_data_dict = {}
    collected_data_dict_snli = list_dict_data_tool.list_to_dict(d_list_snli, key_fields='uid')
    collected_data_dict_mnli = list_dict_data_tool.list_to_dict(d_list_mnli, key_fields='uid')
    collected_data_dict.update(collected_data_dict_snli)
    collected_data_dict.update(collected_data_dict_mnli)

    model_prediction_dict = common.load_json(model_pred_file)

    bin_num = 5
    split_type = 'quantile'
    column_name = 'ChaosNLI-(S+M)'

    binned_item = build_entropy_bins(collected_data_dict, bin_num, type=split_type)
    binned_item_results = calculate_per_bin_results_simplify(binned_item, model_prediction_dict,
                                                             task_name=task_name)

    plot_histogram(binned_item_results, y_axis_value, column_name)
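
build_entropy_bins and calculate_per_bin_results_simplify come from the ChaosNLI analysis code and are not reproduced here. A minimal sketch of what quantile entropy binning might look like follows; it assumes each item carries a precomputed 'entropy' field, which is an assumption about the data format:

import numpy as np

def build_entropy_bins(collected_data_dict, bin_num, type='quantile'):
    # Hypothetical sketch: bucket examples by label-distribution entropy.
    # 'quantile' yields equal-count bins; anything else yields equal-width bins.
    entropies = np.array([item['entropy'] for item in collected_data_dict.values()])
    if type == 'quantile':
        edges = np.quantile(entropies, np.linspace(0.0, 1.0, bin_num + 1))
    else:
        edges = np.linspace(entropies.min(), entropies.max(), bin_num + 1)
    bins = {(edges[i], edges[i + 1]): {} for i in range(bin_num)}
    for uid, item in collected_data_dict.items():
        idx = int(np.searchsorted(edges, item['entropy'], side='right')) - 1
        idx = min(max(idx, 0), bin_num - 1)  # clamp boundary values into the end bins
        bins[(edges[idx], edges[idx + 1])][uid] = item
    return bins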
Example #7
def get_paragraph_forward_pair(tag,
                               ruleterm_doc_results,
                               is_training,
                               debug=False,
                               ignore_non_verifiable=False):
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug:
        d_list = d_list[:100]
        ruleterm_doc_results = ruleterm_doc_results[:100]

    ruleterm_doc_results_dict = list_dict_data_tool.list_to_dict(
        ruleterm_doc_results, 'id')
    db_cursor = fever_db.get_cursor()
    fitems = build_full_wiki_document_forward_item(ruleterm_doc_results_dict,
                                                   d_list, is_training,
                                                   db_cursor,
                                                   ignore_non_verifiable)

    return fitems
Example #8
def build_anli(path: Path, round=1, version='1.0'):
    data_root_path = (path / "anli")
    if not data_root_path.exists():
        data_root_path.mkdir()

    round_tag = str(round)

    o_train = common.load_jsonl(config.PRO_ROOT / f"data/anli_v{version}/R{round_tag}/train.jsonl")
    o_dev = common.load_jsonl(config.PRO_ROOT / f"data/anli_v{version}/R{round_tag}/dev.jsonl")
    o_test = common.load_jsonl(config.PRO_ROOT / f"data/anli_v{version}/R{round_tag}/test.jsonl")

    d_train = a_nli2std_format(o_train)
    d_dev = a_nli2std_format(o_dev)
    d_test = a_nli2std_format(o_test)

    print(f"ANLI (R{round_tag}) Train size:", len(d_trian))
    print(f"ANLI (R{round_tag}) Dev size:", len(d_dev))
    print(f"ANLI (R{round_tag}) Test size:", len(d_test))

    if not (data_root_path / f"r{round_tag}").exists():
        (data_root_path / f"r{round_tag}").mkdir()

    common.save_jsonl(d_train, data_root_path / f"r{round_tag}" / 'train.jsonl')
    common.save_jsonl(d_dev, data_root_path / f"r{round_tag}" / 'dev.jsonl')
    common.save_jsonl(d_test, data_root_path / f"r{round_tag}" / 'test.jsonl')
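
a_nli2std_format is the ANLI-specific converter into the unified NLI format used by the other build_* functions. A hypothetical sketch under assumed raw field names ('uid', 'context', 'hypothesis', 'label'; the actual release format may differ):

def a_nli2std_format(d_list):
    # Hypothetical sketch; the raw ANLI field names are assumptions.
    formatted = []
    for item in d_list:
        formatted.append({
            'uid': item['uid'],
            'premise': item['context'],
            'hypothesis': item['hypothesis'],
            'label': item['label'],  # e.g. 'e' / 'n' / 'c'
        })
    return formatted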
Example #9
def single_process_fever_with_dict(start=0, end=None, tag='dev'):
    task_name = 'fever'
    debug = False
    top_k = 20

    query_fieldname = 'claim'
    id_fieldname = 'id'
    debug_name = 'debug' if debug else ""

    g_score_dict = dict()
    g_score_dict = load_from_file(g_score_dict,
                                  config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    # Important: set the start/end slice here if needed.
    print("Total length:", len(d_list))
    # start, end = 0, len(d_list)

    print(f"Task:{task_name}, Tag:{tag}, TopK:{top_k}, Start/End:{start}/{end}")
    d_list = d_list[start:end]

    print("Data length:", len(d_list))
    if debug:
        d_list = d_list[:10]
        start, end = 0, 10
    print("Data length (Pos-filtering):", len(d_list))

    r_item_list = []

    incr_file = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_incr_({start},{end})_{debug_name}.jsonl"
    if incr_file.is_file():
        print("Warning save file exists.")

    save_path: Path = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_({start},{end})_{debug_name}.jsonl"
    if save_path.is_file():
        print("Warning save file exists.")

    with open(incr_file, mode='w', encoding='utf-8') as out_f:
        process_func = partial(process_fever_item_with_score_dict,
                               top_k=top_k, query_field=query_fieldname, id_field=id_fieldname,
                               global_score_dict=g_score_dict)

        for item in tqdm(d_list, total=len(d_list)):
            r_item = process_func(item)
            r_item_list.append(r_item)
            out_f.write(json.dumps(r_item) + '\n')
            out_f.flush()

    print(len(r_item_list))
    common.save_jsonl(r_item_list, save_path)
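
Because start/end slice the loaded list, a long split can be processed in shards and merged later. A minimal usage sketch (the config paths are assumed to exist):

# Shard the FEVER train split across two runs.
single_process_fever_with_dict(start=0, end=50000, tag='train')
single_process_fever_with_dict(start=50000, end=None, tag='train')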
Example #10
def multi_process(start=0, end=None, tag='dev'):
    task_name = 'fever'
    debug = False
    top_k = 20
    num_process = 3
    query_fieldname = 'claim'
    id_fieldname = 'id'
    debug_name = 'debug' if debug else ""

    # print(multiprocessing.cpu_count())
    print("CPU Count:", multiprocessing.cpu_count())

    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    print("Total length:", len(d_list))
    # Important: set the start/end slice here if needed.
    # start, end = 0, None

    print(f"Task:{task_name}, Tag:{tag}, TopK:{top_k}, Start/End:{start}/{end}")
    d_list = d_list[start:end]

    print("Data length:", len(d_list))
    if debug:
        d_list = d_list[:10]
        start, end = 0, 10
    print("Data length (Pos-filtering):", len(d_list))

    r_list = []

    incr_file = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_incr_({start},{end})_{debug_name}.jsonl"
    if incr_file.is_file():
        print("Warning save file exists.")

    save_path: Path = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_({start},{end})_{debug_name}.jsonl"
    if save_path.is_file():
        print("Warning save file exists.")

    with open(incr_file, mode='w', encoding='utf-8') as out_f:
        with Pool(processes=num_process, maxtasksperchild=1000) as pool:

            process_func = partial(process_fever_item_multiprocessing,
                                   top_k=top_k, query_field=query_fieldname, id_field=id_fieldname)

            p_item_list = pool.imap_unordered(process_func, d_list)
            for item in tqdm(p_item_list, total=len(d_list)):
                r_list.append(item)
                out_f.write(json.dumps(item) + '\n')
                out_f.flush()

    print(len(r_list))
    common.save_jsonl(r_list, save_path)
Example #11
def get_inference_pair(tag,
                       is_training,
                       sent_result_path,
                       debug_num=None,
                       evidence_filtering_threshold=0.01):
    # sent_result_path = ""

    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug_num is not None:
        # d_list = d_list[:10]
        d_list = d_list[:50]
        # d_list = d_list[:200]

    d_dict = list_dict_data_tool.list_to_dict(d_list, 'id')

    threshold_value = evidence_filtering_threshold
    # sent_list = common.load_jsonl(
    #     config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/train_sent_results.jsonl")
    # sent_list = common.load_jsonl(
    #     config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/i(5000)|e(0)|s01(0.9170917091709171)|s05(0.8842384238423843)|seed(12)_dev_sent_results.json")

    # debug_num = None if not debug else 2971
    # debug_num = None

    if isinstance(sent_result_path, Path):
        sent_list = common.load_jsonl(sent_result_path, debug_num)
    elif isinstance(sent_result_path, list):
        sent_list = sent_result_path
    else:
        raise ValueError(
            f"{sent_result_path} is not a valid argument type; expected list or Path."
        )

    list_dict_data_tool.append_subfield_from_list_to_dict(sent_list,
                                                          d_dict,
                                                          'oid',
                                                          'fid',
                                                          check=True)

    filtered_sent_dict = select_top_k_and_to_results_dict(
        d_dict, top_k=5, threshold=threshold_value)

    list_dict_data_tool.append_item_from_dict_to_list(
        d_list, filtered_sent_dict, 'id',
        ['predicted_evidence', 'predicted_scored_evidence'])
    fever_db_cursor = fever_db.get_cursor(config.FEVER_DB)
    forward_items = build_nli_forward_item(d_list,
                                           is_training=is_training,
                                           db_cursor=fever_db_cursor)

    return forward_items, d_list
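
A minimal usage sketch for get_inference_pair (hedged: dev_sent_result_path is a hypothetical placeholder for one of the sentence-result files in the commented paths above):

dev_sent_results = common.load_jsonl(dev_sent_result_path)  # hypothetical path
forward_items, d_list = get_inference_pair(
    'dev', is_training=False,
    sent_result_path=dev_sent_results,  # a list is also accepted
    debug_num=None, evidence_filtering_threshold=0.01)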
Example #12
def prepare_forward_data(dataset_name, tag, is_training, upstream_top_k=20, distant_gt_top_k=2, down_sample_ratio=None,
                         debug=False):
    if dataset_name == 'webq' and tag == 'test':
        gt_d_list_path = config.OPEN_WEBQ_TEST_GT
    elif dataset_name == 'webq' and tag == 'train':
        gt_d_list_path = config.OPEN_WEBQ_TRAIN_GT
    elif dataset_name == 'curatedtrec' and tag == 'test':
        gt_d_list_path = config.OPEN_CURATEDTERC_TEST_GT
    elif dataset_name == 'curatedtrec' and tag == 'train':
        gt_d_list_path = config.OPEN_CURATEDTERC_TRAIN_GT
    elif dataset_name == 'squad' and tag == 'dev':
        gt_d_list_path = config.OPEN_SQUAD_DEV_GT
    elif dataset_name == 'squad' and tag == 'train':
        gt_d_list_path = config.OPEN_SQUAD_TRAIN_GT
    elif dataset_name == 'wikimovie' and tag == 'test':
        gt_d_list_path = config.OPEN_WIKIM_TEST_GT
    elif dataset_name == 'wikimovie' and tag == 'train':
        gt_d_list_path = config.OPEN_WIKIM_TRAIN_GT
    else:
        raise NotImplementedError()

    t_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_RAW_TEXT)
    # debug = False
    # upstream_top_k = 20
    # distant_gt_top_k = 2
    # down_sample_ratio = None

    if dataset_name != 'wikimovie':
        upstream_d_list_before_filter = common.load_jsonl(
            config.PRO_ROOT / f"data/p_{dataset_name}/tf_idf_p_level/{dataset_name}_{tag}_para_tfidf.jsonl")
    else:
        upstream_d_list_before_filter = common.load_jsonl(
            config.PRO_ROOT / f"data/p_{dataset_name}/kwm_p_level/{dataset_name}_{tag}_kwm_tfidf.jsonl")

    if debug:
        upstream_d_list_before_filter = upstream_d_list_before_filter[:50]
    upstream_d_list = top_k_filter_score_list(upstream_d_list_before_filter, top_k=upstream_top_k)

    upstream_d_dict = list_dict_data_tool.list_to_dict(upstream_d_list, 'question')

    gt_d_list = common.load_jsonl(gt_d_list_path)
    gt_d_dict = list_dict_data_tool.list_to_dict(gt_d_list, 'question')
    distant_gt_item_list = get_distant_top_k_ground_truth(gt_d_dict, upstream_d_list_before_filter,
                                                          top_k=distant_gt_top_k)
    distant_gt_item_dict = list_dict_data_tool.list_to_dict(distant_gt_item_list, 'qid')

    fitems_list = build_p_level_forward_item(upstream_d_dict, distant_gt_item_dict, upstream_d_list, is_training,
                                             t_cursor)
    if is_training:
        return down_sample_neg(fitems_list, down_sample_ratio)
    else:
        return down_sample_neg(fitems_list, None)
Example #13
def get_nli_pair(tag,
                 is_training,
                 sent_level_results_list,
                 debug=None,
                 sent_top_k=5,
                 sent_filter_value=0.05):
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug:
        d_list = d_list[:100]
        # sent_dict = list_dict_data_tool.list_to_dict(sent_level_results_list):

    d_dict = list_dict_data_tool.list_to_dict(d_list, 'id')

    if debug:
        id_set = set([item['id'] for item in d_list])
        new_sent_list = []
        for item in sent_level_results_list:
            if item["qid"] in id_set:
                new_sent_list.append(item)
        sent_level_results_list = new_sent_list

    list_dict_data_tool.append_subfield_from_list_to_dict(
        sent_level_results_list, d_dict, 'qid', 'fid', check=True)

    filtered_sent_dict = select_top_k_and_to_results_dict(
        d_dict,
        score_field_name='prob',
        top_k=sent_top_k,
        filter_value=sent_filter_value,
        result_field='predicted_evidence')

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        d_list, filtered_sent_dict, 'id',
        ['predicted_evidence', 'selected_scored_results'])

    fever_db_cursor = fever_db.get_cursor(config.FEVER_DB)
    forward_items = build_nli_forward_item(d_list,
                                           is_training=is_training,
                                           db_cursor=fever_db_cursor)

    return forward_items, d_list
Example #14
def eval_ensemble():
    sent_file = config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl"
    dev_sent_filtering_prob = 0.01
    tag = 'dev'
    top_k = 5

    # dev_list = common.load_jsonl(config.FEVER_DEV)
    dev_sent_results_list = common.load_jsonl(sent_file)

    dev_fitems, dev_list = get_nli_pair(
        tag,
        is_training=False,
        sent_level_results_list=dev_sent_results_list,
        debug=False,
        sent_top_k=top_k,
        sent_filter_value=dev_sent_filtering_prob)

    pred_file_list = [
        config.PRO_ROOT /
        "data/p_fever/fever_nli/04-25-22:02:53_fever_v2_nli_th0.2/ema_i(20000)|e(3)|ss(0.7002700270027002)|ac(0.746024602460246)|pr(0.6141389138913633)|rec(0.8627362736273627)|f1(0.7175148212089147)|seed(12)/nli_dev_label_results_th0.2.jsonl",
        config.PRO_ROOT /
        "data/p_fever/fever_nli/04-26-10:15:39_fever_v2_nli_th0.2/ema_i(14000)|e(2)|ss(0.6991199119911992)|ac(0.7492249224922493)|pr(0.7129412941294097)|rec(0.8338583858385838)|f1(0.7686736484619933)|seed(12)/nli_dev_label_results_th0.2.jsonl",
        config.PRO_ROOT /
        "data/p_fever/fever_nli/04-27-10:03:27_fever_v2_nli_th0.2/ema_i(26000)|e(3)|ss(0.6958695869586958)|ac(0.7447744774477447)|pr(0.7129412941294097)|rec(0.8338583858385838)|f1(0.7686736484619933)|seed(12)/nli_dev_label_results_th0.2.jsonl",
    ]
    pred_d_list = [common.load_jsonl(file) for file in pred_file_list]
    final_list = ensemble_nli_results(pred_d_list)
    pred_list = final_list

    ema_results_dict = list_dict_data_tool.list_to_dict(pred_list, 'oid')
    copied_dev_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list,
                                                      ema_results_dict, 'id',
                                                      'predicted_label')

    dev_list = common.load_jsonl(config.FEVER_DEV)
    mode = {'standard': True}
    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
        copied_dev_list, dev_list, mode=mode, max_evidence=5)
    logging_item = {
        'ss': strict_score,
        'ac': acc_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    print(logging_item)
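
ensemble_nli_results is not shown in these examples. One plausible reading of its use here is a per-example majority vote over the three models' predicted labels; the sketch below is a hypothetical reconstruction under that assumption (the 'oid' and 'predicted_label' field names follow the surrounding code):

from collections import Counter

def ensemble_nli_results(pred_d_list):
    # Hypothetical sketch: majority-vote 'predicted_label' across aligned
    # prediction lists; ties fall back to the first list's label.
    labels_by_oid = {}
    for pred_list in pred_d_list:
        for item in pred_list:
            labels_by_oid.setdefault(item['oid'], []).append(item['predicted_label'])
    final_list = []
    for item in pred_d_list[0]:
        voted_label, _ = Counter(labels_by_oid[item['oid']]).most_common(1)[0]
        final_item = dict(item)
        final_item['predicted_label'] = voted_label
        final_list.append(final_item)
    return final_list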
Example #15
def threshold_sampler_insure_unique_merge(org_data_file,
                                          full_sent_list,
                                          prob_threshold=0.5,
                                          top_n=5,
                                          add_n=1):
    """
    Providing samples to the Training set by a probability threshold on the upstream selected sentences.
    """
    if not isinstance(org_data_file, list):
        d_list = common.load_jsonl(org_data_file)
    else:
        d_list = org_data_file
    augmented_dict = dict()
    for sent_item in full_sent_list:
        selection_id = sent_item['selection_id']  # the id for the current selection
        org_id = int(selection_id.split('<##>')[0])
        remain_str = selection_id.split('<##>')[1]
        if org_id in augmented_dict:
            if remain_str not in augmented_dict[org_id]:
                augmented_dict[org_id][remain_str] = sent_item
        else:
            augmented_dict[org_id] = {remain_str: sent_item}

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            # print("Potential error?")
            cur_predicted_sentids = []
        else:
            # format: doc_id + c_scorer.SENT_LINE + line_number
            cur_predicted_sentids = []
            sents = augmented_dict[int(item['id'])].values()
            # TODO: modify the mechanism here to select sentences by score or label
            for sent_i in sents:
                if sent_i['prob'] >= prob_threshold:
                    # important sentences for scaling training (Jul 21)
                    cur_predicted_sentids.append(
                        (sent_i['sid'], sent_i['score'], sent_i['prob']))
                # del sent_i['prob']

            cur_predicted_sentids = sorted(cur_predicted_sentids,
                                           key=lambda x: -x[1])

        cur_predicted_sentids = cur_predicted_sentids[:add_n]

        # if item['scored_sentids']
        if len(item['predicted_sentids']) >= 5:
            continue
        else:
            item['predicted_sentids'].extend([
                sid for sid, _, _ in cur_predicted_sentids
                if sid not in item['predicted_sentids']
            ])
            item['predicted_sentids'] = item['predicted_sentids'][:top_n]
            item['predicted_evidence'] = convert_evidence2scoring_format(
                item['predicted_sentids'])

        # item['predicted_label'] = item['label']  # give ground truth label

    return d_list
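
convert_evidence2scoring_format turns the internal sentence ids into the evidence format the FEVER scorer expects. A hypothetical sketch, assuming sentence ids look like '<doc_id><SENT_LINE><line_number>' as the comments above suggest (the exact separator constant lives in c_scorer and is an assumption here):

def convert_evidence2scoring_format(predicted_sentids):
    # Hypothetical sketch; '<SENT_LINE>' is an assumed separator.
    evidence = []
    for sid in predicted_sentids:
        doc_id, line_num = sid.split('<SENT_LINE>')
        evidence.append([doc_id, int(line_num)])
    return evidence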
Example #16
def model_perf_binned(dataset_name, task_name, data_file, model_prediction_file, split_type='quantile', bin_num=5,
                      verbose=True):

    d_list = common.load_jsonl(data_file)
    collected_data_dict = list_dict_data_tool.list_to_dict(d_list, key_fields='uid')
    model_prediction_dict = common.load_json(model_prediction_file)

    binned_item = build_entropy_bins(collected_data_dict, bin_num, type=split_type)
    binned_item_results = calculate_per_bin_results_simplify(binned_item, model_prediction_dict,
                                                             task_name=task_name)

    if verbose:
        print('-' * 60)
        print('Data:', dataset_name)
        for model_name, range_items in binned_item_results.items():
            print('Model: {:20s}'.format(model_name))
            print('\t'.join(['{:18s}'.format('Entropy Range'), '{:15s}'.format('# of Example'),
                             '{:10s}'.format('JSD'), '{:10s}'.format('KL'),
                             '{:10s}'.format('Old Acc.'), '{:10s}'.format('New Acc.')]))
            for range_value, model_item in range_items['bin_results'].items():
                print('\t'.join(['{:5f}-{:5f}'.format(range_value[0], range_value[1]),
                                 '{:15s}'.format(format_number(model_item['total_count'])),
                                 '{:10s}'.format(format_number(model_item['average JS div'])),
                                 '{:10s}'.format(format_number(model_item['average KL div'])),
                                 '{:10s}'.format(format_number(model_item['o_acc'])),
                                 '{:10s}'.format(format_number(model_item['m_acc'])),
                                 ]))
        print('-' * 60)
    return binned_item_results
Example #17
def experiment_test_full_wiki():
    multihop_retrieval_top_k = 3
    match_filtering_k = 3
    term_retrieval_top_k = 5

    data_list = common.load_json(config.TEST_FULLWIKI_FILE)
    terms_based_results_list = common.load_jsonl(
        config.RESULT_PATH /
        "doc_retri_results/term_based_methods_results/hotpot_tf_idf_test.jsonl"
    )
    g_score_dict = dict()
    load_from_file(
        g_score_dict, config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    # We need to pass None for the ground-truth data.
    doc_retri_pred_dict = init_results_v8(
        data_list,
        None,
        terms_based_results_list,
        g_score_dict,
        match_filtering_k=match_filtering_k,
        term_retrieval_top_k=term_retrieval_top_k)

    len_list = []
    for rset in doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results without filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    common.save_json(
        doc_retri_pred_dict,
        "hotpot_test_doc_retrieval_v8_before_multihop_filtering.json")

    # Filtering
    new_doc_retri_pred_dict = results_multihop_filtering(
        doc_retri_pred_dict, multihop_retrieval_top_k=multihop_retrieval_top_k)

    len_list = []
    for rset in new_doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    # ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list)
    common.save_json(new_doc_retri_pred_dict,
                     "hotpot_test_doc_retrieval_v8.json")
Example #18
def spectrum_eval_manual_check():
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-17-12:10:35_mesim_elmo/i(34800)_epoch(5)_dev(0.5563056305630563)_loss(1.6648460462434564)_seed(12)"

    # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_15:52:19_r/dev_sent.jsonl"
    IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_16:34:19_r/dev_sent.jsonl"
    # IN_FILE = config.RESULT_PATH / "sent_retri_nn/2018_07_17_16-34-19_r/dev_sent.jsonl"
    dev_sent_result_list = common.load_jsonl(IN_FILE)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)

    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    for sc_prob in [0.5, 0.7, 0.8, 0.9, 0.95, 0.98]:
        upstream_dev_list = score_converter_scaled(config.T_FEVER_DEV_JSONL, dev_sent_result_list, scale_prob=sc_prob,
                                                   delete_prob=False)
        dev_fever_data_reader = BasicReader(token_indexers=token_indexers, lazy=lazy)
        complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, upstream_dev_list)
        dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

        eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
        builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

        print("------------------------------------")
        print("Scaling_prob:", sc_prob)
        eval_mode = {'check_sent_id_correct': True, 'standard': True}
        print(c_scorer.fever_score(builded_dev_data, config.T_FEVER_DEV_JSONL, mode=eval_mode))
        # del upstream_dev_list
        # del complete_upstream_dev_data
        del dev_fever_data_reader
        del dev_instances
        print("------------------------------------")
Example #19
def score_converter_scaled(org_data_file,
                           full_sent_list,
                           scale_prob=0.5,
                           delete_prob=True):
    """
    :param org_data_file:
    :param full_sent_list: append full_sent_score list to evidence of original data file
    :param delete_prob: delete the probability for sanity check
    :param scale_prob:  0.5
    :return:
    """
    d_list = common.load_jsonl(org_data_file)
    augmented_dict = dict()
    print("Build selected sentences file:", len(full_sent_list))
    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id']  # the id for the current selection
        org_id = int(selection_id.split('<##>')[0])
        if org_id in augmented_dict:
            augmented_dict[org_id].append(sent_item)
        else:
            augmented_dict[org_id] = [sent_item]

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            # cur_predicted_sentids = []
            cur_adv_predicted_sentids = []
        else:
            # cur_predicted_sentids = []  # format: doc_id + c_scorer.SENT_LINE + line_number
            cur_adv_predicted_sentids = []
            sents = augmented_dict[int(item['id'])]
            # TODO: modify the mechanism here to select sentences by score or label
            for sent_i in sents:
                if sent_i['prob'] >= scale_prob:
                    cur_adv_predicted_sentids.append(
                        (sent_i['sid'], sent_i['score'], sent_i['prob']))
                # del sent_i['prob']

            cur_adv_predicted_sentids = sorted(cur_adv_predicted_sentids,
                                               key=lambda x: -x[1])

        # important sentences for scaling training (Jul 21)
        item['scored_sentids'] = cur_adv_predicted_sentids[:5]
        item['predicted_sentids'] = [sid for sid, _, _ in item['scored_sentids']][:5]
        item['predicted_evidence'] = convert_evidence2scoring_format(
            item['predicted_sentids'])
        item['predicted_label'] = item['label']  # give ground truth label

    # Removing all score and prob
    if delete_prob:
        for sent_item in full_sent_list:
            if 'score' in sent_item.keys():
                del sent_item['score']
                del sent_item['prob']

    return d_list
Example #20
def threshold_sampler_insure_unique(org_data_file,
                                    full_sent_list,
                                    prob_threshold=0.5,
                                    logist_threshold=None,
                                    top_n=5):
    """
    Providing samples to the Training set by a probability threshold on the upstream selected sentences.
    """
    d_list = common.load_jsonl(org_data_file)
    augmented_dict: Dict[int, Dict[str, Dict]] = dict()
    print("Build selected sentences file:", len(full_sent_list))
    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id']  # the id for the current selection
        org_id = int(selection_id.split('<##>')[0])
        remain_str = selection_id.split('<##>')[1]
        # doc_id = remain_str.split(c_scorer.SENT_LINE)[0]
        # ln = int(remain_str.split(c_scorer.SENT_LINE)[1])
        if org_id in augmented_dict:
            if remain_str not in augmented_dict[org_id]:
                augmented_dict[org_id][remain_str] = sent_item
            else:
                print("Exist")
        else:
            augmented_dict[org_id] = {remain_str: sent_item}

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            # print("Potential error?")
            cur_predicted_sentids = []
        else:
            # format: doc_id + c_scorer.SENT_LINE + line_number
            cur_predicted_sentids = []
            sents = augmented_dict[int(item['id'])].values()
            # TODO: modify the mechanism here to select sentences by score or label
            for sent_i in sents:
                if sent_i['prob'] >= prob_threshold:
                    # important sentences for scaling training (Jul 21)
                    cur_predicted_sentids.append(
                        (sent_i['sid'], sent_i['score'], sent_i['prob']))
                # del sent_i['prob']

            cur_predicted_sentids = sorted(cur_predicted_sentids,
                                           key=lambda x: -x[1])

        # important sentences for scaling training (Jul 21)
        item['scored_sentids'] = cur_predicted_sentids[:top_n]
        item['predicted_sentids'] = [sid for sid, _, _ in item['scored_sentids']][:top_n]
        item['predicted_evidence'] = convert_evidence2scoring_format(
            item['predicted_sentids'])
        # item['predicted_label'] = item['label']  # give ground truth label

    return d_list
Example #21
def hidden_eval_fever():
    batch_size = 64
    lazy = True

    SAVE_PATH = "/home/easonnie/projects/FunEver/saved_models/07-18-21:07:28_m_esim_wn_elmo_sample_fixed/i(57000)_epoch(8)_dev(0.5755075507550755)_loss(1.7175163737963839)_seed(12)"

    dev_upstream_file = config.RESULT_PATH / "sent_retri/2018_07_05_17:17:50_r/dev.jsonl"

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    p_dict = wn_persistent_api.persistence_load()

    dev_fever_data_reader = WNReader(token_indexers=token_indexers, lazy=lazy, wn_p_dict=p_dict, max_l=360)

    complete_upstream_dev_data = get_actual_data(config.T_FEVER_DEV_JSONL, dev_upstream_file)
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)
    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    # dev_biterator = BasicIterator(batch_size=batch_size * 2)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    vocab.change_token_with_index_to_namespace('hidden', -2, namespace='labels')

    print(vocab.get_token_to_index_vocabulary('labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                               1024 + 300),
                  weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=300)

    print("Model Max length:", model.max_l)
    model.load_state_dict(torch.load(SAVE_PATH))
    model.display()
    model.to(device)

    eval_iter = biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
    builded_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

    eval_mode = {'check_sent_id_correct': True, 'standard': True}

    for item in builded_dev_data:
        del item['label']

    print(c_scorer.fever_score(builded_dev_data, common.load_jsonl(config.T_FEVER_DEV_JSONL), mode=eval_mode))
Example #22
def eval_nli():
    dev_list = common.load_jsonl(config.FEVER_DEV)
    # prediction_file = config.PRO_ROOT / "data/p_fever/fever_nli/04-25-22:02:53_fever_v2_nli_th0.2/ema_i(20000)|e(3)|ss(0.7002700270027002)|ac(0.746024602460246)|pr(0.6141389138913633)|rec(0.8627362736273627)|f1(0.7175148212089147)|seed(12)/nli_dev_cp_results_th0.2.jsonl"
    # prediction_file = config.PRO_ROOT / "saved_models/04-15-00:15:59_fever_v1_nli/i(18000)|e(2)|ss(0.6154615461546155)|ac(0.6701170117011701)|pr(0.26657540754071885)|rec(0.8852385238523852)|f1(0.40975857963668794)|seed(12)_dev_nli_results.json"
    prediction_file = config.PRO_ROOT / "data/p_fever/non_sent_level/ema_i(32000)|e(4)|ss(0.5592059205920592)|ac(0.6104110411041104)|pr(0.2638851385138135)|rec(0.8928142814281428)|f1(0.4073667130110584)|seed(12)_dev_nli_results.json"
    pred_list = common.load_jsonl(prediction_file)
    mode = {'standard': True}
    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
        pred_list, dev_list, mode=mode, max_evidence=5)
    logging_item = {
        'ss': strict_score,
        'ac': acc_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    print(logging_item)
    fever_scorer.fever_confusion_matrix(pred_list, dev_list)
Example #23
def get_sentence_forward_pair(tag,
                              ruleterm_doc_results,
                              is_training,
                              debug=False,
                              ignore_non_verifiable=False,
                              top_k=5,
                              filter_value=0.005):
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug:
        d_list = d_list[:100]
        ruleterm_doc_results = ruleterm_doc_results[:100]

    # ruleterm_doc_results_dict = list_dict_data_tool.list_to_dict(ruleterm_doc_results, 'id')
    d_o_dict = list_dict_data_tool.list_to_dict(d_list, 'id')
    copied_d_o_dict = copy.deepcopy(d_o_dict)
    # copied_d_list = copy.deepcopy(d_list)
    list_dict_data_tool.append_subfield_from_list_to_dict(ruleterm_doc_results,
                                                          copied_d_o_dict,
                                                          'qid',
                                                          'fid',
                                                          check=True)

    cur_results_dict_filtered = select_top_k_and_to_results_dict(
        copied_d_o_dict,
        score_field_name='prob',
        top_k=top_k,
        filter_value=filter_value)

    db_cursor = fever_db.get_cursor()
    fitems = build_full_wiki_sentence_forward_item(cur_results_dict_filtered,
                                                   d_list, is_training,
                                                   db_cursor,
                                                   ignore_non_verifiable)

    return fitems
Example #24
def build_snli(path: Path):
    snli_data_root_path = (path / "snli")
    if not snli_data_root_path.exists():
        snli_data_root_path.mkdir()
    o_train = common.load_jsonl(config.PRO_ROOT / "data/snli_1.0/snli_1.0_train.jsonl")
    o_dev = common.load_jsonl(config.PRO_ROOT / "data/snli_1.0/snli_1.0_dev.jsonl")
    o_test = common.load_jsonl(config.PRO_ROOT / "data/snli_1.0/snli_1.0_test.jsonl")

    d_train = sm_nli2std_format(o_train)
    d_dev = sm_nli2std_format(o_dev)
    d_test = sm_nli2std_format(o_test)

    print("SNLI examples without gold label have been filtered.")
    print("SNLI Train size:", len(d_trian))
    print("SNLI Dev size:", len(d_dev))
    print("SNLI Test size:", len(d_test))

    common.save_jsonl(d_train, snli_data_root_path / 'train.jsonl')
    common.save_jsonl(d_dev, snli_data_root_path / 'dev.jsonl')
    common.save_jsonl(d_test, snli_data_root_path / 'test.jsonl')
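
sm_nli2std_format converts SNLI/MNLI records into the same unified format. The print statements above confirm that examples without a gold label are filtered; the rest of this sketch (output field names, label mapping) is an assumption:

def sm_nli2std_format(d_list):
    # Hypothetical sketch; only the gold-label filtering is confirmed above.
    label_map = {'entailment': 'e', 'neutral': 'n', 'contradiction': 'c'}
    formatted = []
    for item in d_list:
        if item['gold_label'] == '-':  # no annotator consensus
            continue
        formatted.append({
            'uid': item['pairID'],
            'premise': item['sentence1'],
            'hypothesis': item['sentence2'],
            'label': label_map[item['gold_label']],
        })
    return formatted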
Example #25
def build_mnli(path: Path):
    data_root_path = (path / "mnli")
    if not data_root_path.exists():
        data_root_path.mkdir()
    o_train = common.load_jsonl(config.PRO_ROOT / "data/multinli_1.0/multinli_1.0_train.jsonl")
    o_mm_dev = common.load_jsonl(config.PRO_ROOT / "data/multinli_1.0/multinli_1.0_dev_mismatched.jsonl")
    o_m_dev = common.load_jsonl(config.PRO_ROOT / "data/multinli_1.0/multinli_1.0_dev_matched.jsonl")

    d_train = sm_nli2std_format(o_train)
    d_mm_dev = sm_nli2std_format(o_mm_dev)
    d_m_dev = sm_nli2std_format(o_m_dev)

    print("MNLI examples without gold label have been filtered.")
    print("MNLI Train size:", len(d_trian))
    print("MNLI MisMatched Dev size:", len(d_mm_dev))
    print("MNLI Matched dev size:", len(d_m_test))

    common.save_jsonl(d_train, data_root_path / 'train.jsonl')
    common.save_jsonl(d_mm_dev, data_root_path / 'mm_dev.jsonl')
    common.save_jsonl(d_m_dev, data_root_path / 'm_dev.jsonl')
Example #26
def build_fever_nli(path: Path):
    data_root_path = (path / "fever_nli")
    if not data_root_path.exists():
        data_root_path.mkdir()

    o_train = common.load_jsonl(config.PRO_ROOT / "data/nli_fever/train_fitems.jsonl")
    o_dev = common.load_jsonl(config.PRO_ROOT / "data/nli_fever/dev_fitems.jsonl")
    o_test = common.load_jsonl(config.PRO_ROOT / "data/nli_fever/test_fitems.jsonl")

    d_train = fever_nli2std_format(o_train)
    d_dev = fever_nli2std_format(o_dev)
    d_test = fever_nli2std_format(o_test)

    print("FEVER-NLI Train size:", len(d_trian))
    print("FEVER-NLI Dev size:", len(d_dev))
    print("FEVER-NLI Test size:", len(d_test))

    common.save_jsonl(d_train, data_root_path / 'train.jsonl')
    common.save_jsonl(d_dev, data_root_path / 'dev.jsonl')
    common.save_jsonl(d_test, data_root_path / 'test.jsonl')
Example #27
def get_train_sentence_pair(top_k, is_training, debug=False, cur_train_eval_results_list=None):
    train_list = common.load_json(config.TRAIN_FILE)

    if cur_train_eval_results_list is None:
        cur_train_eval_results_list = common.load_jsonl(
            config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
                              "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl")

    if debug:
        train_list = train_list[:100]
        id_set = set([item['_id'] for item in train_list])
        cur_train_eval_results_list = [item for item in cur_train_eval_results_list if item['qid'] in id_set]

    return get_sentence_pair(top_k, train_list, cur_train_eval_results_list, is_training)
Example #28
def threshold_sampler(org_data_file,
                      full_sent_list,
                      prob_threshold=0.5,
                      logist_threshold=None,
                      top_n=5):
    """
    Providing samples to the Training set by a probability threshold on the upstream selected sentences.
    """
    d_list = common.load_jsonl(org_data_file)
    augmented_dict = dict()
    print("Build selected sentences file:", len(full_sent_list))
    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id']  # the id for the current selection
        org_id = int(selection_id.split('<##>')[0])
        if org_id in augmented_dict:
            # TODO: change the logic here to remove duplicates.
            augmented_dict[org_id].append(sent_item)
        else:
            augmented_dict[org_id] = [sent_item]

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            cur_predicted_sentids = []
        else:
            # format: doc_id + c_scorer.SENT_LINE + line_number
            cur_predicted_sentids = []
            sents = augmented_dict[int(item['id'])]
            # TODO: modify the mechanism here to select sentences by score or label
            for sent_i in sents:
                if sent_i['prob'] >= prob_threshold:
                    # important sentences for scaling training (Jul 21)
                    cur_predicted_sentids.append(
                        (sent_i['sid'], sent_i['score'], sent_i['prob']))
                # del sent_i['prob']

            cur_predicted_sentids = sorted(cur_predicted_sentids,
                                           key=lambda x: -x[1])

        # important sentences for scaling training (Jul 21)
        item['scored_sentids'] = cur_predicted_sentids[:top_n]
        item['predicted_sentids'] = [sid for sid, _, _ in item['scored_sentids']][:top_n]
        item['predicted_evidence'] = convert_evidence2scoring_format(
            item['predicted_sentids'])
        # item['predicted_label'] = item['label']  # give ground truth label

    return d_list
Example #29
def score_converter(org_data_file, full_sent_list, top_k=5, prob_thr=0.5):
    """
        Combines sentences of same claim 
        :param org_data_file:
        :param full_sent_list: append full_sent_score list to evidence of original data file
        :param top_k: top k sentences to be retrieved
        :param prob_thr: probability threshold for retrieved sentences
        :return:
        """
    d_list = common.load_jsonl(org_data_file)
    augmented_dict = dict()
    print("Build selected sentences file:", len(full_sent_list))
    for sent_item in tqdm(full_sent_list):
        selection_id = sent_item['selection_id']
        org_id = int(selection_id.split('<##>')[0])
        if org_id in augmented_dict:
            augmented_dict[org_id].append(sent_item)
        else:
            augmented_dict[org_id] = [sent_item]

    for item in d_list:
        if int(item['id']) not in augmented_dict:
            cur_predicted_sentids = []
        else:
            cur_predicted_sentids = []
            sents = augmented_dict[int(item['id'])]

            for sent_i in sents:
                if sent_i['prob'] >= prob_thr:
                    cur_predicted_sentids.append(
                        (sent_i['sid'], sent_i['score']))

            cur_predicted_sentids = sorted(cur_predicted_sentids,
                                           key=lambda x: -x[1])

        item['scored_sentids'] = cur_predicted_sentids
        item['predicted_sentids'] = [sid for sid, _ in item['scored_sentids']][:top_k]
        item['predicted_evidence'] = convert_evidence2scoring_format(
            item['predicted_sentids'])
        item['predicted_label'] = item['label']  # give ground truth label (for OFEVER calculation)

    # Removing all score and prob
    for sent_item in full_sent_list:
        if 'score' in sent_item.keys():
            del sent_item['score']
            del sent_item['prob']

    return d_list
Example #30
def prepare_data_only_page_view(tokenized_file, eval_file,
                                doc_retrieval_output_file):
    """
    This method prepare document retrieval data using only page view.
    :return:
    """
    doc_retrieval_method = 'pageview'
    print("Method:", doc_retrieval_method)

    haonan_docretri_object = HAONAN_DOCRETRI_OBJECT()

    doc_retrieval_result_list = first_doc_retrieval(
        haonan_docretri_object,
        tokenized_file,
        method=doc_retrieval_method,
        top_k=100)
    eval_list = common.load_jsonl(eval_file)

    disamb.item_resorting(doc_retrieval_result_list)

    print("Evaluating 1st Doc Retrieval")
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    print(
        c_scorer.fever_score(doc_retrieval_result_list,
                             eval_list,
                             mode=eval_mode,
                             verbose=False))
    print(
        "Max_doc_num_5:",
        c_scorer.fever_doc_only(doc_retrieval_result_list,
                                eval_list,
                                max_evidence=5))
    print(
        "Max_doc_num_10:",
        c_scorer.fever_doc_only(doc_retrieval_result_list,
                                eval_list,
                                max_evidence=10))
    print(
        "Max_doc_num_15:",
        c_scorer.fever_doc_only(doc_retrieval_result_list,
                                eval_list,
                                max_evidence=15))
    print(
        "Max_doc_num_20:",
        c_scorer.fever_doc_only(doc_retrieval_result_list,
                                eval_list,
                                max_evidence=20))
    # First Document retrieval End.
    common.save_jsonl(doc_retrieval_result_list, doc_retrieval_output_file)