def get_sentence_pair(top_k, d_list, p_level_results_list, is_training, debug_mode=False):
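    """Build HotpotQA sentence-level forward items from paragraph-level
    retrieval results: keep the top_k scored paragraphs per question, then
    expand them with build_sentence_forward_item using the wiki DB cursor
    (WHOLE_PROCESS_FOR_RINDEX_DB)."""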
    #
    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)
    #
    # dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_list = d_list

    # cur_dev_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_document_level/2019_4_17/dev_p_level_bert_v1_results.jsonl")
    cur_dev_eval_results_list = p_level_results_list

    if debug_mode:
        dev_list = dev_list[:100]
        id_set = set([item['_id'] for item in dev_list])
        cur_dev_eval_results_list = [item for item in p_level_results_list if item['qid'] in id_set]

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    cur_results_dict_top2 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=top_k, filter_value=None)
    # print(cur_results_dict_top2)
    fitems = build_sentence_forward_item(cur_results_dict_top2, dev_list, is_training=is_training,
                                         db_cursor=t_db_cursor)

    return fitems
Example #2
def get_paragraph_forward_pair(tag,
                               ruleterm_doc_results,
                               is_training,
                               debug=False,
                               ignore_non_verifiable=False):
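    """Load the FEVER split named by `tag` and build paragraph-level forward
    items from the rule/term-based document retrieval results, optionally
    ignoring non-verifiable claims."""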
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug:
        d_list = d_list[:100]
        ruleterm_doc_results = ruleterm_doc_results[:100]

    ruleterm_doc_results_dict = list_dict_data_tool.list_to_dict(
        ruleterm_doc_results, 'id')
    db_cursor = fever_db.get_cursor()
    fitems = build_full_wiki_document_forward_item(ruleterm_doc_results_dict,
                                                   d_list, is_training,
                                                   db_cursor,
                                                   ignore_non_verifiable)

    return fitems
Example #3
def get_sentences(tag, is_training, debug=False):
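    """Load the FEVER split named by `tag` and its merged document-retrieval
    results, then build full-wiki document forward items (sentence candidates
    for the sentence-selection stage)."""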
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug:
        # d_list = d_list[:10]
        d_list = d_list[:50]
        # d_list = d_list[:200]

    doc_results = common.load_jsonl(
        config.RESULT_PATH /
        f"doc_retri_results/fever_results/merged_doc_results/m_doc_{tag}.jsonl"
    )
    doc_results_dict = list_dict_data_tool.list_to_dict(doc_results, 'id')
    fever_db_cursor = fever_db.get_cursor(config.FEVER_DB)
    forward_items = build_full_wiki_document_forward_item(
        doc_results_dict,
        d_list,
        is_training=is_training,
        db_cursor=fever_db_cursor)
    return forward_items
Example #4
def get_inference_pair(tag,
                       is_training,
                       sent_result_path,
                       debug_num=None,
                       evidence_filtering_threshold=0.01):
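    """Load the FEVER split named by `tag`, attach sentence-level results
    (from a jsonl Path or an in-memory list), keep the top-5 sentences above
    the evidence filtering threshold as predicted evidence, and build NLI
    forward items. Returns (forward_items, d_list)."""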
    # sent_result_path = ""

    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug_num is not None:
        # d_list = d_list[:10]
        d_list = d_list[:50]
        # d_list = d_list[:200]

    d_dict = list_dict_data_tool.list_to_dict(d_list, 'id')

    threshold_value = evidence_filtering_threshold
    # sent_list = common.load_jsonl(
    #     config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/train_sent_results.jsonl")
    # sent_list = common.load_jsonl(
    #     config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/i(5000)|e(0)|s01(0.9170917091709171)|s05(0.8842384238423843)|seed(12)_dev_sent_results.json")

    # debug_num = None if not debug else 2971
    # debug_num = None

    if isinstance(sent_result_path, Path):
        sent_list = common.load_jsonl(sent_result_path, debug_num)
    elif isinstance(sent_result_path, list):
        sent_list = sent_result_path
    else:
        raise ValueError(
            f"{sent_result_path} is not of a valid argument type which should be [list, Path]."
        )

    list_dict_data_tool.append_subfield_from_list_to_dict(sent_list,
                                                          d_dict,
                                                          'oid',
                                                          'fid',
                                                          check=True)

    filtered_sent_dict = select_top_k_and_to_results_dict(
        d_dict, top_k=5, threshold=threshold_value)

    list_dict_data_tool.append_item_from_dict_to_list(
        d_list, filtered_sent_dict, 'id',
        ['predicted_evidence', 'predicted_scored_evidence'])
    fever_db_cursor = fever_db.get_cursor(config.FEVER_DB)
    forward_items = build_nli_forward_item(d_list,
                                           is_training=is_training,
                                           db_cursor=fever_db_cursor)

    return forward_items, d_list
Example #5
def get_nli_pair(tag,
                 is_training,
                 sent_level_results_list,
                 debug=None,
                 sent_top_k=5,
                 sent_filter_value=0.05):
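    """Build FEVER NLI forward items from an in-memory sentence-level result
    list: keep the top `sent_top_k` sentences scored above `sent_filter_value`
    as predicted evidence, then build NLI forward items.
    Returns (forward_items, d_list)."""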
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug:
        d_list = d_list[:100]
        # sent_dict = list_dict_data_tool.list_to_dict(sent_level_results_list):

    d_dict = list_dict_data_tool.list_to_dict(d_list, 'id')

    if debug:
        id_set = set([item['id'] for item in d_list])
        new_sent_list = []
        for item in sent_level_results_list:
            if item["qid"] in id_set:
                new_sent_list.append(item)
        sent_level_results_list = new_sent_list

    list_dict_data_tool.append_subfield_from_list_to_dict(
        sent_level_results_list, d_dict, 'qid', 'fid', check=True)

    filtered_sent_dict = select_top_k_and_to_results_dict(
        d_dict,
        score_field_name='prob',
        top_k=sent_top_k,
        filter_value=sent_filter_value,
        result_field='predicted_evidence')

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        d_list, filtered_sent_dict, 'id',
        ['predicted_evidence', 'selected_scored_results'])

    fever_db_cursor = fever_db.get_cursor(config.FEVER_DB)
    forward_items = build_nli_forward_item(d_list,
                                           is_training=is_training,
                                           db_cursor=fever_db_cursor)

    return forward_items, d_list
def eval_ensemble():
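    """Ensemble several NLI label-prediction files on FEVER dev (evidence from
    the upstream sentence-level results) and print the FEVER scores."""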
    sent_file = config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl"
    dev_sent_filtering_prob = 0.01
    tag = 'dev'
    top_k = 5

    # dev_list = common.load_jsonl(config.FEVER_DEV)
    dev_sent_results_list = common.load_jsonl(sent_file)

    dev_fitems, dev_list = get_nli_pair(
        tag,
        is_training=False,
        sent_level_results_list=dev_sent_results_list,
        debug=False,
        sent_top_k=top_k,
        sent_filter_value=dev_sent_filtering_prob)

    pred_file_list = [
        config.PRO_ROOT /
        "data/p_fever/fever_nli/04-25-22:02:53_fever_v2_nli_th0.2/ema_i(20000)|e(3)|ss(0.7002700270027002)|ac(0.746024602460246)|pr(0.6141389138913633)|rec(0.8627362736273627)|f1(0.7175148212089147)|seed(12)/nli_dev_label_results_th0.2.jsonl",
        config.PRO_ROOT /
        "data/p_fever/fever_nli/04-26-10:15:39_fever_v2_nli_th0.2/ema_i(14000)|e(2)|ss(0.6991199119911992)|ac(0.7492249224922493)|pr(0.7129412941294097)|rec(0.8338583858385838)|f1(0.7686736484619933)|seed(12)/nli_dev_label_results_th0.2.jsonl",
        config.PRO_ROOT /
        "data/p_fever/fever_nli/04-27-10:03:27_fever_v2_nli_th0.2/ema_i(26000)|e(3)|ss(0.6958695869586958)|ac(0.7447744774477447)|pr(0.7129412941294097)|rec(0.8338583858385838)|f1(0.7686736484619933)|seed(12)/nli_dev_label_results_th0.2.jsonl",
    ]
    pred_d_list = [common.load_jsonl(file) for file in pred_file_list]
    final_list = ensemble_nli_results(pred_d_list)
    pred_list = final_list

    ema_results_dict = list_dict_data_tool.list_to_dict(pred_list, 'oid')
    copied_dev_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list,
                                                      ema_results_dict, 'id',
                                                      'predicted_label')

    dev_list = common.load_jsonl(config.FEVER_DEV)
    mode = {'standard': True}
    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
        copied_dev_list, dev_list, mode=mode, max_evidence=5)
    logging_item = {
        'ss': strict_score,
        'ac': acc_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    print(logging_item)
def inspect_upstream_eval():
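    """Score upstream sentence-level retrieval results on HotpotQA dev at the
    0.2 probability threshold and print the supporting-fact metrics."""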
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/dev_s_level_bert_v1_results.jsonl"
    )
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        dev_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    # 0.5
    # cur_results_dict_v05 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
    #                                                         score_field_name='prob',
    #                                                         filter_value=0.5,
    #                                                         result_field='sp')

    cur_results_dict_v02 = select_top_k_and_to_results_dict(
        copied_dev_o_dict,
        top_k=5,
        score_field_name='prob',
        filter_value=0.2,
        result_field='sp')

    # _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05, dev_list, verbose=False)

    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02,
                                         dev_list,
                                         verbose=False)

    v02_sp_f1 = metrics_v2['sp_f1']
    v02_sp_recall = metrics_v2['sp_recall']
    v02_sp_prec = metrics_v2['sp_prec']

    # The 0.5-threshold evaluation above is commented out, so metrics_v5 is
    # undefined here; keep these lines disabled as well.
    # v05_sp_f1 = metrics_v5['sp_f1']
    # v05_sp_recall = metrics_v5['sp_recall']
    # v05_sp_prec = metrics_v5['sp_prec']

    logging_item = {
        'label': 'ema',
        'v02': metrics_v2,
        # 'v05': metrics_v5,
    }

    print(logging_item)
Example #8
def eval_hotpot_s():
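    """Score HotpotQA dev sentence-level results (the top_k_doc_100 run) at
    the 0.5 probability threshold and print the supporting-fact metrics."""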
    cur_dev_eval_results_list_out = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpot_p_level_effects/hotpot_s_level_dev_results_top_k_doc_100.jsonl"
    )
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_dev_eval_results_list_out,
        copied_dev_o_dict,
        'qid',
        'fid',
        check=True)
    # 0.5
    cur_results_dict_v05 = select_top_k_and_to_results_dict(
        copied_dev_o_dict,
        top_k=5,
        score_field_name='prob',
        filter_value=0.5,
        result_field='sp')

    # cur_results_dict_v02 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
    #                                                         score_field_name='prob',
    #                                                         filter_value=0.2,
    #                                                         result_field='sp')

    _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05,
                                         dev_list,
                                         verbose=False)

    # _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02, dev_list, verbose=False)

    logging_item = {
        # 'v02': metrics_v2,
        'v05': metrics_v5,
    }

    print(logging_item)
    f1 = metrics_v5['sp_f1']
    em = metrics_v5['sp_em']
    pr = metrics_v5['sp_prec']
    rec = metrics_v5['sp_recall']

    print(em, pr, rec, f1)
Example #9
def get_sentence_forward_pair(tag,
                              ruleterm_doc_results,
                              is_training,
                              debug=False,
                              ignore_non_verifiable=False,
                              top_k=5,
                              filter_value=0.005):
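    """Load the FEVER split named by `tag`, attach the upstream document-level
    results, keep the top_k entries scored above `filter_value`, and build
    full-wiki sentence-level forward items."""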
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    if debug:
        d_list = d_list[:100]
        ruleterm_doc_results = ruleterm_doc_results[:100]

    # ruleterm_doc_results_dict = list_dict_data_tool.list_to_dict(ruleterm_doc_results, 'id')
    d_o_dict = list_dict_data_tool.list_to_dict(d_list, 'id')
    copied_d_o_dict = copy.deepcopy(d_o_dict)
    # copied_d_list = copy.deepcopy(d_list)
    list_dict_data_tool.append_subfield_from_list_to_dict(ruleterm_doc_results,
                                                          copied_d_o_dict,
                                                          'qid',
                                                          'fid',
                                                          check=True)

    cur_results_dict_filtered = select_top_k_and_to_results_dict(
        copied_d_o_dict,
        score_field_name='prob',
        top_k=top_k,
        filter_value=filter_value)

    db_cursor = fever_db.get_cursor()
    fitems = build_full_wiki_sentence_forward_item(cur_results_dict_filtered,
                                                   d_list, is_training,
                                                   db_cursor,
                                                   ignore_non_verifiable)

    return fitems
Example #10
def eitems_to_fitems(eitem_list,
                     tokenizer,
                     is_training,
                     max_tokens_for_doc=320,
                     max_query_length=64,
                     doc_stride=128,
                     debug=False):
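    """Convert evaluation items into span-prediction forward items using the
    given tokenizer and length limits. Returns (fitem_dict keyed by 'fid',
    fitem_list)."""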
    fitem_list = []  # The output of all fitems
    if debug:
        eitem_list = eitem_list[:100]

    for item in tqdm(eitem_list):
        f_items = preprocssing_span_prediction_item_paired(
            item, tokenizer, is_training, max_tokens_for_doc, max_query_length,
            doc_stride)
        fitem_list.extend(f_items)

    fitem_dict = list_dict_data_tool.list_to_dict(fitem_list, 'fid')

    return fitem_dict, fitem_list
Example #11
def get_open_qa_item_with_upstream_paragraphs(d_list,
                                              cur_eval_results_list,
                                              is_training,
                                              tokenizer: BertTokenizer,
                                              max_context_length,
                                              max_query_length,
                                              doc_stride=128,
                                              debug_mode=False,
                                              top_k=10,
                                              filter_value=0.1,
                                              match_type='string'):
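    """Build open-domain QA span-prediction items from upstream paragraph-level
    results: keep the top_k paragraphs scored above `filter_value`, build QA
    forward items from the raw wiki text DB, and tokenize them into fitems.
    Returns (fitems_dict, fitems_list, predicted paragraph list)."""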
    t_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_RAW_TEXT)

    if debug_mode:
        d_list = d_list[:100]
        id_set = set([item['question'] for item in d_list])
        cur_eval_results_list = [
            item for item in cur_eval_results_list if item['qid'] in id_set
        ]

    d_o_dict = list_dict_data_tool.list_to_dict(d_list, 'question')
    copied_d_o_dict = copy.deepcopy(d_o_dict)

    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_d_o_dict, 'qid', 'fid', check=True)

    cur_results_dict_top10 = od_sample_utils.select_top_k_and_to_results_dict(
        copied_d_o_dict,
        score_field_name='prob',
        top_k=top_k,
        filter_value=filter_value)

    forward_example_items = build_open_qa_forword_item(cur_results_dict_top10,
                                                       d_list, is_training,
                                                       t_cursor, match_type)
    forward_example_items = format_convert(forward_example_items, is_training)
    fitems_dict, read_fitems_list = span_preprocess_tool.eitems_to_fitems(
        forward_example_items, tokenizer, is_training, max_context_length,
        max_query_length, doc_stride, False)

    return fitems_dict, read_fitems_list, cur_results_dict_top10['pred_p_list']
Example #12
def eval_p_level():
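    """Score HotpotQA dev paragraph-level retrieval results with top-5
    selection and print the metrics."""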
    cur_eval_results_list = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl"
    )

    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)
    # Top_5
    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict,
                                                             top_k=5)

    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5,
                                           dev_list,
                                           verbose=False)

    print(metrics_top5)
def get_qa_item_with_upstream_sentence(d_list,
                                       sentence_level_results,
                                       is_training,
                                       tokenizer: BertTokenizer,
                                       max_context_length,
                                       max_query_length,
                                       doc_stride=128,
                                       debug_mode=False,
                                       top_k=5,
                                       filter_value=0.2):
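    """Build HotpotQA QA span-prediction items from upstream sentence-level
    results: keep the top_k sentences scored above `filter_value` as supporting
    facts, build QA forward items, and tokenize them into fitems.
    Returns (fitems_dict, fitems_list, supporting-fact dict)."""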
    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)

    if debug_mode:
        d_list = d_list[:100]
        id_set = set([item['_id'] for item in d_list])
        sentence_level_results = [
            item for item in sentence_level_results if item['qid'] in id_set
        ]

    d_o_dict = list_dict_data_tool.list_to_dict(d_list, '_id')
    copied_d_o_dict = copy.deepcopy(d_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        sentence_level_results, copied_d_o_dict, 'qid', 'fid', check=True)

    cur_results_dict = select_top_k_and_to_results_dict(
        copied_d_o_dict,
        top_k=top_k,
        score_field_name='prob',
        filter_value=filter_value,
        result_field='sp')

    forward_example_items = build_qa_forword_item(cur_results_dict, d_list,
                                                  is_training, t_db_cursor)
    forward_example_items = format_convert(forward_example_items, is_training)
    fitems_dict, read_fitems_list = span_preprocess_tool.eitems_to_fitems(
        forward_example_items, tokenizer, is_training, max_context_length,
        max_query_length, doc_stride, False)

    return fitems_dict, read_fitems_list, cur_results_dict['sp']
Example #14
def evidence_adjustment(tag, sent_file, label_file, filter_prob=0.2, top_k=5):
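    """Re-score FEVER using predicted labels from `label_file` together with
    evidence re-selected from `sent_file` at the given probability threshold
    and top-k, and print the scores."""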
    dev_sent_filtering_prob = filter_prob

    # dev_list = common.load_jsonl(config.FEVER_DEV)
    dev_sent_results_list = common.load_jsonl(sent_file)

    dev_fitems, dev_list = get_nli_pair(
        tag,
        is_training=False,
        sent_level_results_list=dev_sent_results_list,
        debug=False,
        sent_top_k=top_k,
        sent_filter_value=dev_sent_filtering_prob)

    cur_eval_results_list = common.load_jsonl(label_file)

    ema_results_dict = list_dict_data_tool.list_to_dict(
        cur_eval_results_list, 'oid')
    copied_dev_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list,
                                                      ema_results_dict, 'id',
                                                      'predicted_label')

    mode = {'standard': True}
    # delete_unused_evidence(copied_dev_list)
    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
        copied_dev_list, dev_list, mode=mode, max_evidence=5)
    logging_item = {
        'ss': strict_score,
        'ac': acc_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    print(logging_item)
Example #15
def model_perf(dataset_name, task_name, data_file, model_prediction_file):
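    """Print per-model JS/KL divergence against the human label distributions
    and the old/new accuracy for the given dataset and task."""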
    d_list = common.load_jsonl(data_file)
    collected_data_dict = list_dict_data_tool.list_to_dict(d_list,
                                                           key_fields='uid')
    model_prediction_dict = common.load_json(model_prediction_file)
    results_dict, all_correct_set = calculate_divergence_bwt_model_human_simplify(
        collected_data_dict, model_prediction_dict, task_name)
    print('-' * 60)
    print('Data:', dataset_name)
    print("All Correct Count:", len(all_correct_set))
    print('\t'.join([
        '{:20s}'.format('Model Name'), '{:10s}'.format('JSD'),
        '{:10s}'.format('KL'), '{:10s}'.format('Old Acc.'),
        '{:10s}'.format('New Acc.')
    ]))
    for model_name, model_item in results_dict.items():
        print('\t'.join([
            '{:20s}'.format(model_name),
            '{:10s}'.format(format_number(model_item['average JS div'])),
            '{:10s}'.format(format_number(model_item['average KL div'])),
            '{:10s}'.format(format_number(model_item['o_acc'])),
            '{:10s}'.format(format_number(model_item['m_acc'])),
        ]))
    print('-' * 60)
def eval_model_for_downstream(model_saved_path):
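    """Run a saved BERT paragraph-level (content selection) model over the
    FEVER split chosen by the hard-coded `tag`, save the raw results, and (for
    dev/train) report document-retrieval scores at several probability
    thresholds."""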
    bert_model_name = 'bert-base-uncased'
    lazy = True
    # lazy = True
    forward_size = 64
    # batch_size = 64
    batch_size = 128
    do_lower_case = True

    debug_mode = False
    max_l = 264
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps
    tag = 'test'

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    # train_ruleterm_doc_results = common.load_jsonl(
    #     config.PRO_ROOT / "results/doc_retri_results/fever_results/merged_doc_results/m_doc_train.jsonl")
    # dev_ruleterm_doc_results = train_ruleterm_doc_results
    if tag == 'dev':
        dev_ruleterm_doc_results = common.load_jsonl(
            config.PRO_ROOT /
            "results/doc_retri_results/fever_results/merged_doc_results/m_doc_dev.jsonl"
        )

        dev_list = common.load_jsonl(config.FEVER_DEV)

        dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
            'dev',
            dev_ruleterm_doc_results,
            is_training=False,
            debug=debug_mode,
            ignore_non_verifiable=False)
    elif tag == 'train':
        dev_ruleterm_doc_results = common.load_jsonl(
            config.PRO_ROOT /
            "results/doc_retri_results/fever_results/merged_doc_results/m_doc_train.jsonl"
        )

        dev_list = common.load_jsonl(config.FEVER_TRAIN)

        dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
            'train',
            dev_ruleterm_doc_results,
            is_training=True,
            debug=debug_mode,
            ignore_non_verifiable=False)
    elif tag == 'test':
        dev_ruleterm_doc_results = common.load_jsonl(
            config.PRO_ROOT /
            "results/doc_retri_results/fever_results/merged_doc_results/m_doc_test.jsonl"
        )

        dev_list = common.load_jsonl(config.FEVER_TEST)

        dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
            'test',
            dev_ruleterm_doc_results,
            is_training=False,
            debug=debug_mode,
            ignore_non_verifiable=False)
    else:
        raise NotImplementedError(f"Tag:{tag} not supported.")

    # dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair('train', dev_ruleterm_doc_results,
    #                                                               is_training=True, debug=debug_mode,
    #                                                               ignore_non_verifiable=False)

    # Just to show the information
    fever_p_level_sampler.down_sample_neg(dev_fitems, None)

    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=max_l,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #

    if debug_mode:
        num_train_optimization_steps = 100

    dev_instances = bert_cs_reader.read(dev_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

    cur_eval_results_list = eval_model(model,
                                       dev_iter,
                                       device_num,
                                       make_int=True,
                                       with_probs=True,
                                       show_progress=True)

    common.save_jsonl(cur_eval_results_list,
                      f"fever_p_level_{tag}_results.jsonl")

    if tag == 'test':
        exit(0)
    # common.save_jsonl(cur_eval_results_list, "fever_p_level_train_results_1.jsonl")

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    copied_dev_d_list = copy.deepcopy(dev_list)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
        copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.5)

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        copied_dev_d_list, cur_results_dict_th0_5, 'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list,
                                                            dev_list,
                                                            max_evidence=5)
    score_05 = {
        'ss': strict_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    cur_results_dict_th0_2 = select_top_k_and_to_results_dict(
        copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.2)

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        copied_dev_d_list, cur_results_dict_th0_2, 'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list,
                                                            dev_list,
                                                            max_evidence=5)
    score_02 = {
        'ss': strict_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    cur_results_dict_th0_1 = select_top_k_and_to_results_dict(
        copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.1)

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        copied_dev_d_list, cur_results_dict_th0_1, 'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list,
                                                            dev_list,
                                                            max_evidence=5)
    score_01 = {
        'ss': strict_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    cur_results_dict_th00_1 = select_top_k_and_to_results_dict(
        copied_dev_o_dict, score_field_name='prob', top_k=5, filter_value=0.01)

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        copied_dev_d_list, cur_results_dict_th00_1, 'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list,
                                                            dev_list,
                                                            max_evidence=5)
    score_001 = {
        'ss': strict_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    cur_results_dict_th000_5 = select_top_k_and_to_results_dict(
        copied_dev_o_dict,
        score_field_name='prob',
        top_k=5,
        filter_value=0.005)

    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
        copied_dev_d_list, cur_results_dict_th000_5, 'id', 'predicted_docids')
    # mode = {'standard': False, 'check_doc_id_correct': True}
    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(copied_dev_d_list,
                                                            dev_list,
                                                            max_evidence=5)
    score_0005 = {
        'ss': strict_score,
        'pr': pr,
        'rec': rec,
        'f1': f1,
    }

    logging_item = {
        'score_0005': score_0005,
        'score_001': score_001,
        'score_01': score_01,
        'score_02': score_02,
        'score_05': score_05,
    }

    print(json.dumps(logging_item, indent=2))
Example #17
def eval_model_for_downstream(model_saved_path):
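    """Run a saved BERT paragraph-level model over the precomputed HotpotQA
    forward files; as written it only evaluates and saves the test-split
    results before exiting, leaving the dev/train evaluation below unreached."""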
    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    # lazy = False
    lazy = True
    forward_size = 32
    # batch_size = 64
    batch_size = 128
    do_lower_case = True

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    dev_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_dev_p_level_unlabeled.jsonl")
    train_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_train_p_level_labeled.jsonl")
    test_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_test_p_level_unlabeled.jsonl")

    if debug_mode:
        dev_list = dev_list[:10]
        dev_fitems_list = dev_fitems_list[:296]
        train_fitems_list = train_fitems_list[:300]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, '_id')

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0, max_l=286)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    dev_instances = bert_cs_reader.read(dev_fitems_list)
    train_instance = bert_cs_reader.read(train_fitems_list)
    test_instances = bert_cs_reader.read(test_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    # train_iter = biterator(train_instance, num_epochs=1, shuffle=False)
    # dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
    test_iter = biterator(test_instances, num_epochs=1, shuffle=False)

    print(len(dev_fitems_list))
    print(len(test_fitems_list))
    print(len(train_fitems_list))

    # cur_dev_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, show_progress=True)
    # cur_train_eval_results_list = eval_model(model, train_iter, device_num, with_probs=True, show_progress=True)

    cur_test_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True, show_progress=True)
    common.save_jsonl(cur_test_eval_results_list, "test_p_level_bert_v1_results.jsonl")

    print("Test write finished.")
    exit(0)
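    # NOTE: the code below is unreachable as written; it relies on
    # cur_dev_eval_results_list / cur_train_eval_results_list from the
    # commented-out dev/train eval calls above.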

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)

    list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    # Top_3
    cur_results_dict_top3 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=3)
    upperbound_results_dict_top3 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top3,
        dev_list)

    # Top_5
    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
    upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top5,
        dev_list)

    cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10)
    upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top10,
        dev_list)

    _, metrics_top3 = ext_hotpot_eval.eval(cur_results_dict_top3, dev_list, verbose=False)
    _, metrics_top3_UB = ext_hotpot_eval.eval(upperbound_results_dict_top3, dev_list, verbose=False)

    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
    _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list, verbose=False)

    _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False)
    _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list, verbose=False)

    logging_item = {
        'top3': metrics_top3,
        'top3_UB': metrics_top3_UB,
        'top5': metrics_top5,
        'top5_UB': metrics_top5_UB,
        'top10': metrics_top10,
        'top10_UB': metrics_top10_UB,
    }

    print(logging_item)

    common.save_jsonl(cur_train_eval_results_list, "train_p_level_bert_v1_results.jsonl")
    common.save_jsonl(cur_dev_eval_results_list, "dev_p_level_bert_v1_results.jsonl")
Example #18
def model_go():
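    """Train the BERT paragraph-level content-selection model on HotpotQA,
    re-sampling negatives each epoch and periodically evaluating top-5/top-10
    document retrieval (with ground-truth upper bounds) before checkpointing."""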
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    experiment_name = 'hotpot_v0_cs'
    lazy = False
    # lazy = True
    forward_size = 16
    # batch_size = 64
    batch_size = 128
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 5000
    pos_ratio = 0.2
    do_lower_case = True

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    dev_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_dev_p_level_unlabeled.jsonl")
    train_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_train_p_level_labeled.jsonl")

    if debug_mode:
        dev_list = dev_list[:10]
        dev_fitems_list = dev_fitems_list[:296]
        train_fitems_list = train_fitems_list[:300]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(sampled_train_list)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0, max_l=286)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    # # # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        train_instance = bert_cs_reader.read(sampled_train_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                         mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                         labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                    cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True)
                    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_dev_o_dict,
                                                                          'qid', 'fid', check=True)
                    # Top_5
                    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
                    upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri(
                        cur_results_dict_top5,
                        dev_list)

                    cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10)
                    upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri(
                        cur_results_dict_top10,
                        dev_list)

                    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
                    _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list, verbose=False)

                    _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False)
                    _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list, verbose=False)

                    # top5_doc_f1, top5_UB_sp_f1, top10_doc_f1, top10_Ub_sp_f1
                    # top5_doc_f1 = metrics_top5['doc_f1']
                    # top5_UB_sp_f1 = metrics_top5_UB['sp_f1']
                    # top10_doc_f1 = metrics_top10['doc_f1']
                    # top10_Ub_sp_f1 = metrics_top10_UB['sp_f1']

                    top5_doc_recall = metrics_top5['doc_recall']
                    top5_UB_sp_recall = metrics_top5_UB['sp_recall']
                    top10_doc_recall = metrics_top10['doc_recall']
                    top10_Ub_sp_recall = metrics_top10_UB['sp_recall']

                    logging_item = {
                        'top5': metrics_top5,
                        'top5_UB': metrics_top5_UB,
                        'top10': metrics_top10,
                        'top10_UB': metrics_top10_UB,
                    }

                    # print(logging_item)
                    save_file_name = f'i({update_step})|e({epoch_i})' \
                        f'|t5_doc_recall({top5_doc_recall})|t5_sp_recall({top5_UB_sp_recall})' \
                        f'|t10_doc_recall({top10_doc_recall})|t10_sp_recall({top10_Ub_sp_recall})|seed({seed})'

                    # print(save_file_name)
                    logging_agent.incorporate_results({}, save_file_name, logging_item)
                    logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = Path(file_path_prefix) / save_file_name
                    torch.save(model_to_save.state_dict(), str(output_model_file))
Example #19
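    # NOTE: this snippet starts mid-way through down_sample_neg(fitems, ratio);
    # pos_items, neg_items and pos_count are assumed to be built from the
    # positive/negative items of `fitems` earlier in the function.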
    random.shuffle(pos_items)
    random.shuffle(neg_items)
    neg_sample_count = int(pos_count / ratio)

    sampled_neg = neg_items[:neg_sample_count]

    print(f"After Sampling, we have {pos_count}/{len(sampled_neg)} (pos/neg).")

    sampled_list = sampled_neg + pos_items
    random.shuffle(sampled_list)

    return sampled_list


if __name__ == '__main__':
    d_list = common.load_jsonl(config.FEVER_DEV)
    doc_results = common.load_jsonl(
        config.RESULT_PATH /
        "doc_retri_results/fever_results/merged_doc_results/m_doc_dev.jsonl")
    doc_results_dict = list_dict_data_tool.list_to_dict(doc_results, 'id')
    fever_db_cursor = fever_db.get_cursor(config.FEVER_DB)
    forward_items = build_full_wiki_document_forward_item(
        doc_results_dict, d_list, is_training=False, db_cursor=fever_db_cursor)
    # print(forward_items)

    # for item in forward_items:
    # if item['s_labels'] == 'true':
    # print(item['query'], item['context'], item['sid'], item['cid'], item['fid'], item['s_labels'])

    print(len(forward_items))
    # down_sample_neg(forward_items, ratio=0.2)
Example #20
def model_go(sent_filter_value, sent_top_k=5):
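    """Train the BERT span-prediction QA model on HotpotQA using upstream
    sentence-level results (top `sent_top_k` scored above `sent_filter_value`),
    re-sampling the training items each epoch and evaluating/checkpointing the
    EMA model."""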
    seed = 12
    torch.manual_seed(seed)

    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    bert_model_name = "bert-base-uncased"
    lazy = False
    forward_size = 32
    batch_size = 32
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_rate = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 1000

    do_lower_case = True

    debug = False

    max_pre_context_length = 320
    max_query_length = 64
    doc_stride = 128
    qa_num_of_layer = 2
    do_ema = True
    ema_device_num = 1
    # s_filter_value = 0.5
    s_filter_value = sent_filter_value
    # s_top_k = 5
    s_top_k = sent_top_k

    experiment_name = f'hotpot_v0_qa_(s_top_k:{s_top_k},s_fv:{s_filter_value},qa_layer:{qa_num_of_layer})'

    print("Potential total length:",
          max_pre_context_length + max_query_length + 3)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                              do_lower_case=do_lower_case,
                                              cache_dir=bert_pretrain_path)

    # Load Dataset.
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    train_list = common.load_json(config.TRAIN_FILE)

    dev_sentence_level_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/dev_s_level_bert_v1_results.jsonl"
    )
    train_sentence_level_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/train_s_level_bert_v1_results.jsonl"
    )

    dev_fitem_dict, dev_fitem_list, dev_sp_results_dict = get_qa_item_with_upstream_sentence(
        dev_list,
        dev_sentence_level_results,
        is_training=False,
        tokenizer=tokenizer,
        max_context_length=max_pre_context_length,
        max_query_length=max_query_length,
        filter_value=s_filter_value,
        doc_stride=doc_stride,
        top_k=s_top_k,
        debug_mode=debug)

    train_fitem_dict, train_fitem_list, _ = get_qa_item_with_upstream_sentence(
        train_list,
        train_sentence_level_results,
        is_training=True,
        tokenizer=tokenizer,
        max_context_length=max_pre_context_length,
        max_query_length=max_query_length,
        filter_value=s_filter_value,
        doc_stride=doc_stride,
        top_k=s_top_k,
        debug_mode=debug)

    # print(len(dev_fitem_list))
    # print(len(dev_fitem_dict))

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')

    if debug:
        dev_list = dev_list[:100]
        eval_frequency = 2

    est_datasize = len(train_fitem_list)

    span_pred_reader = BertPairedSpanPredReader(bert_tokenizer=tokenizer,
                                                lazy=lazy,
                                                example_filter=None)
    bert_encoder = BertModel.from_pretrained(bert_model_name,
                                             cache_dir=bert_pretrain_path)
    model = BertSpan(bert_encoder, qa_num_of_layer)

    ema = None
    if do_ema:
        ema = EMA(model, model.named_parameters(), device_num=ema_device_num)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    iterator = BasicIterator(batch_size=batch_size)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    print("Total train instances:", len(train_fitem_list))

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    if debug:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_rate,
                         t_total=num_train_optimization_steps)

    dev_instances = span_pred_reader.read(dev_fitem_list)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    # # # Create Log File
    file_path_prefix = None
    if not debug:
        file_path_prefix, date = save_tool.gen_file_prefix(
            f"{experiment_name}")
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name),
                  'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
    # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)

        print("Resampling:")
        train_fitem_dict, train_fitem_list, _ = get_qa_item_with_upstream_sentence(
            train_list,
            train_sentence_level_results,
            is_training=True,
            tokenizer=tokenizer,
            max_context_length=max_pre_context_length,
            max_query_length=max_query_length,
            filter_value=s_filter_value,
            doc_stride=doc_stride,
            top_k=s_top_k,
            debug_mode=debug)

        random.shuffle(train_fitem_list)
        train_instances = span_pred_reader.read(train_fitem_list)
        train_iter = iterator(train_instances, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter, desc="Batch Loop"):
            model.train()
            batch = allen_util.move_to_device(batch, device_num)
            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            gt_span = batch['gt_span']

            loss = model(mode=BertSpan.ForwardMode.TRAIN,
                         input_ids=paired_sequence,
                         token_type_ids=paired_segments_ids,
                         attention_mask=att_mask,
                         gt_span=gt_span)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                if ema is not None and do_ema:
                    updated_model = model.module if hasattr(
                        model, 'module') else model
                    ema(updated_model.named_parameters())
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    # print("Non-EMA EVAL:")
                    # eval_iter = iterator(dev_instances, num_epochs=1, shuffle=False)
                    # cur_eitem_list, cur_eval_dict = span_eval(model, eval_iter, do_lower_case, dev_fitem_dict,
                    #                                           device_num)
                    # cur_results_dict = dict()
                    # cur_results_dict['p_answer'] = cur_eval_dict
                    # cur_results_dict['sp'] = dev_sp_results_dict
                    #
                    # _, metrics = ext_hotpot_eval.eval(cur_results_dict, dev_list, verbose=False)
                    # # print(metrics)
                    #
                    # logging_item = {
                    #     'score': metrics,
                    # }
                    #
                    # joint_f1 = metrics['joint_f1']
                    # joint_em = metrics['joint_em']
                    #
                    # print(logging_item)
                    #
                    # if not debug:
                    #     save_file_name = f'i({update_step})|e({epoch_i})' \
                    #         f'|j_f1({joint_f1})|j_em({joint_em})|seed({seed})'
                    #
                    #     # print(save_file_name)
                    #     logging_agent.incorporate_results({}, save_file_name, logging_item)
                    #     logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")
                    #
                    #     model_to_save = model.module if hasattr(model, 'module') else model
                    #     output_model_file = Path(file_path_prefix) / save_file_name
                    #     torch.save(model_to_save.state_dict(), str(output_model_file))

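                    # Evaluate with the EMA copy of the weights: move it to the EMA device and wrap it in DataParallel for inference.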
                    if do_ema and ema is not None:
                        print("EMA EVAL")
                        ema_model = ema.get_inference_model()
                        ema_inference_device_ids = get_ema_gpu_id_list(
                            master_device_num=ema_device_num)
                        ema_model = ema_model.to(ema_device_num)
                        ema_model = torch.nn.DataParallel(
                            ema_model, device_ids=ema_inference_device_ids)
                        dev_iter = iterator(dev_instances,
                                            num_epochs=1,
                                            shuffle=False)
                        cur_eitem_list, cur_eval_dict = span_eval(
                            ema_model,
                            dev_iter,
                            do_lower_case,
                            dev_fitem_dict,
                            ema_device_num,
                            show_progress=False)
                        cur_results_dict = dict()
                        cur_results_dict['p_answer'] = cur_eval_dict
                        cur_results_dict['sp'] = dev_sp_results_dict

                        _, metrics = ext_hotpot_eval.eval(cur_results_dict,
                                                          dev_list,
                                                          verbose=False)
                        print(metrics)
                        print("---------------" * 3)

                        logging_item = {
                            'label': 'ema',
                            'score': metrics,
                        }

                        joint_f1 = metrics['joint_f1']
                        joint_em = metrics['joint_em']

                        print(logging_item)

                        if not debug:
                            save_file_name = f'ema_i({update_step})|e({epoch_i})' \
                                f'|j_f1({joint_f1})|j_em({joint_em})|seed({seed})'
                            # print(save_file_name)
                            logging_agent.incorporate_results({},
                                                              save_file_name,
                                                              logging_item)
                            logging_agent.logging_to_file(
                                Path(file_path_prefix) / "log.json")

                            model_to_save = ema_model.module if hasattr(
                                ema_model, 'module') else ema_model
                            output_model_file = Path(
                                file_path_prefix) / save_file_name
                            torch.save(model_to_save.state_dict(),
                                       str(output_model_file))
Example #21
def fever_retrieval_v0(term_retrieval_top_k=3, match_filtering_k=2, tag='dev'):
    # term_retrieval_top_k = 20

    # term_retrieval_top_k = 3
    # match_filtering_k = 2

    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    d_tf_idf = common.load_jsonl(
        config.RESULT_PATH /
        f"doc_retri_results/term_based_methods_results/fever_tf_idf_{tag}.jsonl"
    )

    tf_idf_dict = list_dict_data_tool.list_to_dict(d_tf_idf, 'id')

    r_list = []

    ner_set = get_title_entity_set()

    g_score_dict = dict()
    load_from_file(
        g_score_dict, config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

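    # Two keyword processors: one for exact title matches and one for titles that only occur in disambiguation groups (matched at lower priority).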
    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip keywords removed by the filters above or that are stopwords
        else:
            # matched_key_word is the original matched span. We need to save it for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw,
                                         matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)

    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip keywords removed by the filters above or that are stopwords
        else:
            if kw in keyword_processor:
                # if the kw already exists in keyword_processor, update its dict with additional disambiguation items
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(
                    kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[
                            disamb_kw] = 'kwm_disamb'
            else:  # otherwise add it to keyword_processor_disamb, which is given lower priority
                # new_dict = dict()
                matched_obj = _MatchedObject(matched_key_word=kw,
                                             matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                    # new_dict[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    for item in tqdm(d_list):
        cur_id = str(item['id'])
        query = item['claim']

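        # Keep only the query n-grams that have an entry in the global tf-idf score table.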
        query_terms = get_query_ngrams(query)
        valid_query_terms = [
            term for term in query_terms if term in g_score_dict
        ]

        retrieved_set = RetrievedSet()
        # print(tf_idf_doc_list)
        get_kw_matching_results(query, valid_query_terms, retrieved_set,
                                match_filtering_k, g_score_dict,
                                keyword_processor, keyword_processor_disamb)

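        # Append tf-idf-retrieved titles (highest score first) until term_retrieval_top_k titles pass the filters.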
        tf_idf_doc_list = tf_idf_dict[cur_id]['retrieved_list']
        added_count = 0
        for score, title in sorted(tf_idf_doc_list,
                                   key=lambda x: x[0],
                                   reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(
                    title) and not title.startswith('List of '):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        predicted_docids = retrieved_set.to_id_list()
        # print(retrieved_set)
        # print(item['claim'], predicted_docids)

        r_item = dict()
        r_item['id'] = int(cur_id)
        r_item['claim'] = item['claim']
        r_item['predicted_docids'] = predicted_docids
        if tag != 'test':
            r_item['label'] = item['label']
        r_list.append(r_item)

    # r_list = common.load_jsonl('dev-debug.jsonl')

    # Normalize the retrieved document ids for naming consistency
    for i, item in enumerate(r_list):
        predicted_docids = item['predicted_docids']
        modified_docids = []
        for docid in predicted_docids:
            docid = docid.replace(' ', '_')
            docid = reverse_convert_brc(docid)
            modified_docids.append(docid)
        item['predicted_docids'] = modified_docids
    # Normalization finished

    # print(r_list[0:10])
    len_list = []
    for rset in r_list:
        len_list.append(len(rset['predicted_docids']))

    print(collections.Counter(len_list).most_common(10000))

    print(np.mean(len_list))
    print(np.std(len_list))
    print(np.max(len_list))
    print(np.min(len_list))

    common.save_jsonl(
        r_list, f'fever_term_based_retri_results_'
        f'{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl'
    )

    mode = {'standard': False, 'check_doc_id_correct': True}
    # fever_scorer.fever_score_analysis(r_list, d_list, mode=mode, max_evidence=None)
    fever_scorer.fever_score(r_list, d_list, mode=mode, max_evidence=None)
Example #22
def merge_results_with_haonao_module(term_retrieval_top_k=3,
                                     match_filtering_k=2,
                                     haonan_topk=10,
                                     tag='dev',
                                     save=False):
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
        task_name = 'shared_task_dev'
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
        task_name = 'train'
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
        task_name = 'shared_task_test'
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    # r_list = common.load_jsonl(config.RESULT_PATH / f'doc_retri_results/fever_results/standard_term_based_results/'
    # f'fever_term_based_retri_results_{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl')

    r_list = common.load_jsonl(
        config.RESULT_PATH /
        f'doc_retri_results/fever_results/standard_term_based_results/'
        f'fever_term_based_retri_results_{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl'
    )

    old_result_list = common.load_jsonl(
        config.RESULT_PATH /
        f"doc_retri_results/fever_results/haonans_results/dr_{tag}.jsonl")
    item_resorting(old_result_list, top_k=haonan_topk)

    old_result_dict = list_dict_data_tool.list_to_dict(old_result_list, 'id')

    for i, item in enumerate(r_list):
        predicted_docids = item['predicted_docids']
        modified_docids = []
        for docid in predicted_docids:
            docid = docid.replace(' ', '_')
            docid = reverse_convert_brc(docid)
            modified_docids.append(docid)
        item['predicted_docids'] = modified_docids
        # item['predicted_docids'] = []

    merged_result_list = []
    for item in tqdm(r_list):
        cur_id = int(item['id'])
        old_retrieval_doc = old_result_dict[cur_id]['predicted_docids']
        new_retrieval_doc = item['predicted_docids']
        m_predicted_docids = set.union(set(old_retrieval_doc),
                                       set(new_retrieval_doc))
        # print(m_predicted_docids)
        m_predicted_docids = [
            docid for docid in m_predicted_docids
            if not docid.startswith('List_of_')
        ]
        item['predicted_docids'] = list(m_predicted_docids)
        # print(item['predicted_docids'])

    mode = {'standard': False, 'check_doc_id_correct': True}
    if tag != 'test':
        fever_scorer.fever_score_analysis(r_list,
                                          d_list,
                                          mode=mode,
                                          max_evidence=None)

    if save:
        print("Saved to:")
        common.save_jsonl(
            r_list, config.RESULT_PATH /
            f"doc_retri_results/fever_results/merged_doc_results/m_doc_{tag}.jsonl"
        )

    # Statistics on the number of retrieved documents per claim.
    len_list = []
    for rset in r_list:
        len_list.append(len(rset['predicted_docids']))

    print(collections.Counter(len_list).most_common(10000))

    print(np.mean(len_list))
    print(np.std(len_list))
    print(np.max(len_list))
    print(np.min(len_list))
Example #23
def eval_trainset_for_train_nli(model_path):
    tag = 'test'
    is_training = False

    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    lazy = False
    # lazy = True
    forward_size = 128
    # batch_size = 64
    # batch_size = 192
    batch_size = 128

    do_lower_case = True

    debug_mode = False
    # debug_mode = True

    num_class = 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset

    train_fitems_list = get_sentences(tag,
                                      is_training=is_training,
                                      debug=debug_mode)
    est_datasize = len(train_fitems_list)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=128)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.load_state_dict(torch.load(model_path))

    print("Estimated training size", est_datasize)
    print("Estimated forward steps:", est_datasize / forward_size)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_instance = bert_cs_reader.read(train_fitems_list)
    train_iter = biterator(train_instance, num_epochs=1, shuffle=False)

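    # Run the trained sentence-selection model over the whole set, keeping per-item probabilities for downstream filtering.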
    cur_eval_results_list = eval_model(model,
                                       train_iter,
                                       device_num,
                                       with_probs=True,
                                       make_int=True,
                                       show_progress=True)

    if debug_mode:
        train_list = common.load_jsonl(config.FEVER_TRAIN)
        train_list = train_list[:50]
        set_gt_nli_label(train_list)
        train_o_dict = list_dict_data_tool.list_to_dict(train_list, 'id')

        copied_dev_o_dict = copy.deepcopy(train_o_dict)
        copied_dev_d_list = copy.deepcopy(train_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list, copied_dev_o_dict, 'oid', 'fid', check=True)

        print("Threshold 0.1:")
        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
            copied_dev_o_dict, top_k=5, threshold=0.1)
        list_dict_data_tool.append_item_from_dict_to_list(
            copied_dev_d_list, cur_results_dict_th0_5, 'id',
            'predicted_evidence')
        mode = {'standard': True, 'check_sent_id_correct': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
            copied_dev_d_list, train_list, mode=mode, max_evidence=5)
        print(strict_score, acc_score, pr, rec, f1)

    common.save_jsonl(cur_eval_results_list,
                      f'{tag}_sent_results_labeled:{is_training}.jsonl')
Example #24
def eval_model_for_downstream_ablation(model_saved_path, top_k_doc):
    bert_model_name = 'bert-base-uncased'
    lazy = True
    # lazy = True
    forward_size = 128
    # batch_size = 64
    # batch_size = 128
    do_lower_case = True

    debug_mode = False
    max_l = 128
    # est_datasize = 900_000
    tag = 'dev'

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    train_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_train_results.jsonl"
    )

    dev_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_dev_results.jsonl"
    )

    test_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_test_results.jsonl"
    )

    train_list = common.load_jsonl(config.FEVER_TRAIN)
    dev_list = common.load_jsonl(config.FEVER_DEV)
    test_list = common.load_jsonl(config.FEVER_TEST)
    # dev_list = common.load_jsonl(config.FEVER_DEV)

    if tag == 'dev':
        dev_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'dev',
            dev_upstream_doc_results,
            is_training=False,
            debug=debug_mode,
            ignore_non_verifiable=False,
            top_k=top_k_doc,
            filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(dev_fitems, None)
    elif tag == 'train':
        train_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'train',
            train_upstream_doc_results,
            is_training=True,
            debug=debug_mode,
            ignore_non_verifiable=False,
            top_k=top_k_doc,
            filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(train_fitems, None)
    elif tag == 'test':
        test_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'test',
            test_upstream_doc_results,
            is_training=False,
            debug=debug_mode,
            ignore_non_verifiable=False,
            top_k=top_k_doc,
            filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(test_fitems, None)

    # The down_sample_neg calls above (ratio=None) are only used to print label statistics.

    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    test_o_dict = list_dict_data_tool.list_to_dict(test_list, 'id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=max_l,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    if tag == 'dev':
        dev_instances = bert_cs_reader.read(dev_fitems)

        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model,
                                           dev_iter,
                                           device_num,
                                           make_int=True,
                                           with_probs=True,
                                           show_progress=True)

        common.save_jsonl(
            cur_eval_results_list,
            f"fever_s_level_{tag}_results_top_k_doc_{top_k_doc}.jsonl")

        copied_dev_o_dict = copy.deepcopy(dev_o_dict)
        copied_dev_d_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

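        # Keep at most 5 sentences per claim with probability >= 0.2 as the predicted evidence.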
        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
            copied_dev_o_dict,
            score_field_name='prob',
            top_k=5,
            filter_value=0.2,
            result_field='predicted_evidence')

        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
            copied_dev_d_list, cur_results_dict_th0_5, 'id',
            'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}

        strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(
            copied_dev_d_list, dev_list, max_evidence=5)
        score_05 = {
            'top_k_doc': top_k_doc,
            'ss': strict_score,
            'pr': pr,
            'rec': rec,
            'f1': f1,
        }

        print("Top_k doc:", top_k_doc)
        print(score_05)
        common.save_json(
            score_05,
            f"top_k_doc:{top_k_doc}_ss:{strict_score}_pr:{pr}_rec:{rec}_f1:{f1}"
        )

    elif tag == 'test':
        test_instances = bert_cs_reader.read(test_fitems)

        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model,
                                           test_iter,
                                           device_num,
                                           make_int=True,
                                           with_probs=True,
                                           show_progress=True)

        common.save_jsonl(cur_eval_results_list,
                          f"fever_s_level_{tag}_results.jsonl")

        # copied_test_o_dict = copy.deepcopy(test_o_dict)
        # copied_test_d_list = copy.deepcopy(test_list)
        # list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_test_o_dict,
        #                                                       'qid', 'fid', check=True)
        #
        # cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_test_o_dict,
        #                                                           score_field_name='prob',
        #                                                           top_k=5, filter_value=0.5,
        #                                                           result_field='predicted_evidence')
        #
        # list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_test_d_list,
        #                                                                cur_results_dict_th0_5,
        #                                                                'id', 'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}

        # copied_train_o_dict = copy.deepcopy(train_o_dict)
        # copied_train_d_list = copy.deepcopy(train_list)
        # list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_train_o_dict,
        #                                                       'qid', 'fid', check=True)
        #
        # cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_train_o_dict,
        #                                                           score_field_name='prob',
        #                                                           top_k=5, filter_value=0.5,
        #                                                           result_field='predicted_evidence')
        #
        # list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_train_d_list,
        #                                                                cur_results_dict_th0_5,
        #                                                                'id', 'predicted_evidence')
        # # mode = {'standard': False, 'check_doc_id_correct': True}
        # strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(copied_train_d_list, train_list,
        #                                                          max_evidence=5)
        # score_05 = {
        #     'ss': strict_score,
        #     'pr': pr, 'rec': rec, 'f1': f1,
        # }
        #
        # print(score_05)
    elif tag == 'train':
        train_instances = bert_cs_reader.read(train_fitems)

        train_iter = biterator(train_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model,
                                           train_iter,
                                           device_num,
                                           make_int=True,
                                           with_probs=True,
                                           show_progress=True)

        common.save_jsonl(cur_eval_results_list,
                          f"fever_s_level_{tag}_results.jsonl")

        copied_train_o_dict = copy.deepcopy(train_o_dict)
        copied_train_d_list = copy.deepcopy(train_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list,
            copied_train_o_dict,
            'qid',
            'fid',
            check=True)

        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
            copied_train_o_dict,
            score_field_name='prob',
            top_k=5,
            filter_value=0.5,
            result_field='predicted_evidence')

        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
            copied_train_d_list, cur_results_dict_th0_5, 'id',
            'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}
        strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(
            copied_train_d_list, train_list, max_evidence=5)
        score_05 = {
            'ss': strict_score,
            'pr': pr,
            'rec': rec,
            'f1': f1,
        }

        print(score_05)
Example #25
def multitask_model_go():
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    bert_model_name = 'bert-base-uncased'
    lazy = False
    # lazy = True
    forward_size = 64
    # batch_size = 64
    batch_size = 128
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 1
    eval_frequency = 5000
    hotpot_pos_ratio = 0.2
    do_lower_case = True
    max_l = 264

    experiment_name = f'mtr_p_level_(num_train_epochs:{num_train_epochs})'

    debug_mode = False
    do_ema = True
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Hotpot Dataset
    hotpot_train_list = common.load_json(config.TRAIN_FILE)
    hotpot_dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    hotpot_dev_o_dict = list_dict_data_tool.list_to_dict(
        hotpot_dev_list, '_id')

    # Load Hotpot upstream paragraph forward item
    hotpot_dev_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" /
        "hotpot_dev_p_level_unlabeled.jsonl")
    hotpot_train_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" /
        "hotpot_train_p_level_labeled.jsonl")

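    # Rename the 'doc_t' field to 'element' so the HotpotQA items use the same field name as the FEVER items in the shared reader.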
    hotpot_train_fitems_list = hotpot_sampler_utils.field_name_convert(
        hotpot_train_fitems_list, 'doc_t', 'element')
    hotpot_dev_fitems_list = hotpot_sampler_utils.field_name_convert(
        hotpot_dev_fitems_list, 'doc_t', 'element')

    # Load FEVER Dataset
    # fever_train_list = common.load_json(config.FEVER_TRAIN)
    fever_dev_list = common.load_jsonl(config.FEVER_DEV)
    fever_dev_o_dict = list_dict_data_tool.list_to_dict(fever_dev_list, 'id')

    train_ruleterm_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "results/doc_retri_results/fever_results/merged_doc_results/m_doc_train.jsonl"
    )
    dev_ruleterm_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "results/doc_retri_results/fever_results/merged_doc_results/m_doc_dev.jsonl"
    )

    fever_train_fitems_list = fever_p_level_sampler.get_paragraph_forward_pair(
        'train',
        train_ruleterm_doc_results,
        is_training=True,
        debug=debug_mode,
        ignore_non_verifiable=True)
    fever_dev_fitems_list = fever_p_level_sampler.get_paragraph_forward_pair(
        'dev',
        dev_ruleterm_doc_results,
        is_training=False,
        debug=debug_mode,
        ignore_non_verifiable=False)
    if debug_mode:
        hotpot_dev_list = hotpot_dev_list[:10]
        hotpot_dev_fitems_list = hotpot_dev_fitems_list[:296]
        hotpot_train_fitems_list = hotpot_train_fitems_list[:300]

        fever_dev_list = fever_dev_list[:100]
        eval_frequency = 2

    # Down_sample for hotpot.
    hotpot_sampled_train_list = down_sample_neg(hotpot_train_fitems_list,
                                                ratio=hotpot_pos_ratio)
    hotpot_est_datasize = len(hotpot_sampled_train_list)
    fever_est_datasize = len(fever_train_fitems_list)

    print("Hotpot Train Size:", hotpot_est_datasize)
    print("Fever Train Size:", fever_est_datasize)

    est_datasize = hotpot_est_datasize + fever_est_datasize

    bert_tokenizer = BertTokenizer.from_pretrained(
        bert_model_name,
        do_lower_case=do_lower_case,
        cache_dir=bert_pretrain_path)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=max_l,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name,
                                             cache_dir=bert_pretrain_path)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    ema = None
    if do_ema:
        ema = EMA(model, model.named_parameters(), device_num=1)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

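    # Total optimizer updates = (training items / forward batch size / gradient accumulation steps) * epochs.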
    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    hotpot_dev_instances = bert_cs_reader.read(hotpot_dev_fitems_list)
    fever_dev_instances = bert_cs_reader.read(fever_dev_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    file_path_prefix = '.'
    if not debug_mode:
        # # # Create Log File
        file_path_prefix, date = save_tool.gen_file_prefix(
            f"{experiment_name}")
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name),
                  'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        hotpot_sampled_train_list = down_sample_neg(hotpot_train_fitems_list,
                                                    ratio=hotpot_pos_ratio)
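        # Mix the freshly down-sampled HotpotQA items with the FEVER items so a single pass trains both tasks jointly.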
        all_train_data = hotpot_sampled_train_list + fever_train_fitems_list
        random.shuffle(all_train_data)
        train_instance = bert_cs_reader.read(all_train_data)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(
                paired_sequence,
                token_type_ids=paired_segments_ids,
                attention_mask=att_mask,
                mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                if ema is not None and do_ema:
                    updated_model = model.module if hasattr(
                        model, 'module') else model
                    ema(updated_model.named_parameters())
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    # Evaluate both the FEVER and the HotpotQA dev sets.
                    eval_fever_procedure(biterator, fever_dev_instances, model,
                                         device_num, 1, fever_dev_list,
                                         fever_dev_o_dict, debug_mode,
                                         logging_agent, update_step, epoch_i,
                                         file_path_prefix, do_ema, ema, seed)
                    eval_hotpot_procedure(biterator, hotpot_dev_instances,
                                          model, device_num, 1,
                                          hotpot_dev_list, hotpot_dev_o_dict,
                                          debug_mode, logging_agent,
                                          update_step, epoch_i,
                                          file_path_prefix, do_ema, ema, seed)

    if not debug_mode:
        print("Final Saving.")
        save_file_name = f'i({update_step})|e({num_train_epochs})_final_model'
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = Path(file_path_prefix) / save_file_name
        torch.save(model_to_save.state_dict(), str(output_model_file))

        if do_ema and ema is not None:
            print("Final EMA Saving")
            ema_model = ema.get_inference_model()
            save_file_name = f'i({update_step})|e({num_train_epochs})_final_ema_model'
            model_to_save = ema_model.module if hasattr(
                ema_model, 'module') else ema_model
            output_model_file = Path(file_path_prefix) / save_file_name
            torch.save(model_to_save.state_dict(), str(output_model_file))
Example #26
def model_go():
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    bert_model_name = 'bert-base-uncased'
    lazy = False
    # lazy = True
    forward_size = 128
    # batch_size = 64
    batch_size = 128
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 2000
    pos_ratio = 0.2
    do_lower_case = True
    document_top_k = 2
    experiment_name = f'hotpot_v0_slevel_retri_(doc_top_k:{document_top_k})'

    debug_mode = False
    do_ema = True
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    # train_fitems = sentence_level_sampler.get_train_sentence_pair(document_top_k, True, debug_mode)
    # dev_fitems = sentence_level_sampler.get_dev_sentence_pair(document_top_k, False, debug_mode)

    # Load train eval results list
    cur_train_eval_results_list = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
        "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl"
    )

    cur_dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
        "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl"
    )

    train_fitems = get_sentence_pair(document_top_k,
                                     train_list,
                                     cur_train_eval_results_list,
                                     is_training=True,
                                     debug_mode=debug_mode)

    dev_fitems = get_sentence_pair(document_top_k,
                                   dev_list,
                                   cur_dev_eval_results_list,
                                   is_training=False,
                                   debug_mode=debug_mode)

    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(train_fitems)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(
        bert_model_name,
        do_lower_case=do_lower_case,
        cache_dir=bert_pretrain_path)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=128,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name,
                                             cache_dir=bert_pretrain_path)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

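    # Optionally keep an exponential moving average (EMA) of the parameters on a separate device for evaluation.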
    ema = None
    if do_ema:
        ema = EMA(model, model.named_parameters(), device_num=1)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    # # # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name),
              'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        random.shuffle(train_fitems)
        train_instance = bert_cs_reader.read(train_fitems)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(
                paired_sequence,
                token_type_ids=paired_segments_ids,
                attention_mask=att_mask,
                mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                if ema is not None and do_ema:
                    updated_model = model.module if hasattr(
                        model, 'module') else model
                    ema(updated_model.named_parameters())
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances,
                                         num_epochs=1,
                                         shuffle=False)

                    cur_eval_results_list = eval_model(model,
                                                       dev_iter,
                                                       device_num,
                                                       with_probs=True)
                    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                    list_dict_data_tool.append_subfield_from_list_to_dict(
                        cur_eval_results_list,
                        copied_dev_o_dict,
                        'qid',
                        'fid',
                        check=True)
                    # Build results at two probability cut-offs (0.5 and 0.2) to compare filtering strictness.
                    cur_results_dict_v05 = select_top_k_and_to_results_dict(
                        copied_dev_o_dict,
                        top_k=5,
                        score_field_name='prob',
                        filter_value=0.5,
                        result_field='sp')

                    cur_results_dict_v02 = select_top_k_and_to_results_dict(
                        copied_dev_o_dict,
                        top_k=5,
                        score_field_name='prob',
                        filter_value=0.2,
                        result_field='sp')

                    _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05,
                                                         dev_list,
                                                         verbose=False)

                    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02,
                                                         dev_list,
                                                         verbose=False)

                    v02_sp_f1 = metrics_v2['sp_f1']
                    v02_sp_recall = metrics_v2['sp_recall']
                    v02_sp_prec = metrics_v2['sp_prec']

                    v05_sp_f1 = metrics_v5['sp_f1']
                    v05_sp_recall = metrics_v5['sp_recall']
                    v05_sp_prec = metrics_v5['sp_prec']

                    logging_item = {
                        'v02': metrics_v2,
                        'v05': metrics_v5,
                    }

                    print(logging_item)

                    # print(logging_item)
                    if not debug_mode:
                        save_file_name = f'i({update_step})|e({epoch_i})' \
                            f'|v02_f1({v02_sp_f1})|v02_recall({v02_sp_recall})' \
                            f'|v05_f1({v05_sp_f1})|v05_recall({v05_sp_recall})|seed({seed})'

                        # print(save_file_name)
                        logging_agent.incorporate_results({}, save_file_name,
                                                          logging_item)
                        logging_agent.logging_to_file(
                            Path(file_path_prefix) / "log.json")

                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = Path(
                            file_path_prefix) / save_file_name
                        torch.save(model_to_save.state_dict(),
                                   str(output_model_file))

                    if do_ema and ema is not None:
                        ema_model = ema.get_inference_model()
                        master_device_num = 1
                        ema_inference_device_ids = get_ema_gpu_id_list(
                            master_device_num=master_device_num)
                        ema_model = ema_model.to(master_device_num)
                        ema_model = torch.nn.DataParallel(
                            ema_model, device_ids=ema_inference_device_ids)
                        dev_iter = biterator(dev_instances,
                                             num_epochs=1,
                                             shuffle=False)

                        cur_eval_results_list = eval_model(ema_model,
                                                           dev_iter,
                                                           master_device_num,
                                                           with_probs=True)
                        copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                        list_dict_data_tool.append_subfield_from_list_to_dict(
                            cur_eval_results_list,
                            copied_dev_o_dict,
                            'qid',
                            'fid',
                            check=True)
                        # Same two probability cut-offs (0.5 and 0.2) for the EMA model.
                        cur_results_dict_v05 = select_top_k_and_to_results_dict(
                            copied_dev_o_dict,
                            top_k=5,
                            score_field_name='prob',
                            filter_value=0.5,
                            result_field='sp')

                        cur_results_dict_v02 = select_top_k_and_to_results_dict(
                            copied_dev_o_dict,
                            top_k=5,
                            score_field_name='prob',
                            filter_value=0.2,
                            result_field='sp')

                        _, metrics_v5 = ext_hotpot_eval.eval(
                            cur_results_dict_v05, dev_list, verbose=False)

                        _, metrics_v2 = ext_hotpot_eval.eval(
                            cur_results_dict_v02, dev_list, verbose=False)

                        v02_sp_f1 = metrics_v2['sp_f1']
                        v02_sp_recall = metrics_v2['sp_recall']
                        v02_sp_prec = metrics_v2['sp_prec']

                        v05_sp_f1 = metrics_v5['sp_f1']
                        v05_sp_recall = metrics_v5['sp_recall']
                        v05_sp_prec = metrics_v5['sp_prec']

                        logging_item = {
                            'label': 'ema',
                            'v02': metrics_v2,
                            'v05': metrics_v5,
                        }

                        print(logging_item)

                        if not debug_mode:
                            save_file_name = f'ema_i({update_step})|e({epoch_i})' \
                                f'|v02_f1({v02_sp_f1})|v02_recall({v02_sp_recall})' \
                                f'|v05_f1({v05_sp_f1})|v05_recall({v05_sp_recall})|seed({seed})'

                            # print(save_file_name)
                            logging_agent.incorporate_results({},
                                                              save_file_name,
                                                              logging_item)
                            logging_agent.logging_to_file(
                                Path(file_path_prefix) / "log.json")

                            model_to_save = ema_model.module if hasattr(
                                ema_model, 'module') else ema_model
                            output_model_file = Path(
                                file_path_prefix) / save_file_name
                            torch.save(model_to_save.state_dict(),
                                       str(output_model_file))
Example #27
def model_go():
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    lazy = False
    # lazy = True
    forward_size = 64
    # batch_size = 64
    batch_size = 128
    gradient_accumulate_step = int(batch_size / forward_size)
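    # With forward_size=64 and batch_size=128, gradients are accumulated over 2 forward passes per optimizer step.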
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 5000
    do_lower_case = True
    ignore_non_verifiable = True
    experiment_name = f'fever_v0_plevel_retri_(ignore_non_verifiable:{ignore_non_verifiable})'

    debug_mode = False
    max_l = 264
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    train_ruleterm_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "results/doc_retri_results/fever_results/merged_doc_results/m_doc_train.jsonl"
    )
    dev_ruleterm_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "results/doc_retri_results/fever_results/merged_doc_results/m_doc_dev.jsonl"
    )

    # train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_jsonl(config.FEVER_DEV)

    train_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
        'train',
        train_ruleterm_doc_results,
        is_training=True,
        debug=debug_mode,
        ignore_non_verifiable=ignore_non_verifiable)
    dev_fitems = fever_p_level_sampler.get_paragraph_forward_pair(
        'dev',
        dev_ruleterm_doc_results,
        is_training=False,
        debug=debug_mode,
        ignore_non_verifiable=False)

    # down_sample_neg with ratio=None is only called here to print the label statistics.
    fever_p_level_sampler.down_sample_neg(train_fitems, None)
    fever_p_level_sampler.down_sample_neg(dev_fitems, None)

    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(train_fitems)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=max_l,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
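    # The grouping above follows the standard BERT fine-tuning recipe:
    # bias and LayerNorm parameters are excluded from weight decay.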

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    if not debug_mode:
        # # # Create Log File
        file_path_prefix, date = save_tool.gen_file_prefix(
            f"{experiment_name}")
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name),
                  'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        random.shuffle(train_fitems)
        train_instance = bert_cs_reader.read(train_fitems)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(
                paired_sequence,
                token_type_ids=paired_segments_ids,
                attention_mask=att_mask,
                mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances,
                                         num_epochs=1,
                                         shuffle=False)

                    cur_eval_results_list = eval_model(model,
                                                       dev_iter,
                                                       device_num,
                                                       make_int=True,
                                                       with_probs=True)
                    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                    copied_dev_d_list = copy.deepcopy(dev_list)
                    list_dict_data_tool.append_subfield_from_list_to_dict(
                        cur_eval_results_list,
                        copied_dev_o_dict,
                        'qid',
                        'fid',
                        check=True)

                    cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
                        copied_dev_o_dict,
                        score_field_name='prob',
                        top_k=5,
                        filter_value=0.5)

                    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
                        copied_dev_d_list, cur_results_dict_th0_5, 'id',
                        'predicted_docids')
                    # mode = {'standard': False, 'check_doc_id_correct': True}
                    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(
                        copied_dev_d_list, dev_list, max_evidence=5)
                    score_05 = {
                        'ss': strict_score,
                        'pr': pr,
                        'rec': rec,
                        'f1': f1,
                    }

                    # note: cur_eval_results_list has already been appended
                    # to copied_dev_o_dict above; this call re-appends the
                    # same results before the 0.2-threshold selection.
                    list_dict_data_tool.append_subfield_from_list_to_dict(
                        cur_eval_results_list,
                        copied_dev_o_dict,
                        'qid',
                        'fid',
                        check=True)

                    cur_results_dict_th0_2 = select_top_k_and_to_results_dict(
                        copied_dev_o_dict,
                        score_field_name='prob',
                        top_k=5,
                        filter_value=0.2)

                    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
                        copied_dev_d_list, cur_results_dict_th0_2, 'id',
                        'predicted_docids')
                    # mode = {'standard': False, 'check_doc_id_correct': True}
                    strict_score, pr, rec, f1 = fever_scorer.fever_doc_only(
                        copied_dev_d_list, dev_list, max_evidence=5)
                    score_02 = {
                        'ss': strict_score,
                        'pr': pr,
                        'rec': rec,
                        'f1': f1,
                    }

                    logging_item = {
                        'score_02': score_02,
                        'score_05': score_05,
                    }

                    print(logging_item)

                    s02_ss_score = score_02['ss']
                    s05_ss_score = score_05['ss']

                    if not debug_mode:
                        save_file_name = f'i({update_step})|e({epoch_i})' \
                            f'|v02_ofever({s02_ss_score})' \
                            f'|v05_ofever({s05_ss_score})|seed({seed})'

                        # print(save_file_name)
                        logging_agent.incorporate_results({}, save_file_name,
                                                          logging_item)
                        logging_agent.logging_to_file(
                            Path(file_path_prefix) / "log.json")

                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = Path(
                            file_path_prefix) / save_file_name
                        torch.save(model_to_save.state_dict(),
                                   str(output_model_file))
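
# --- Hedged sketch (not part of the original script) ---
# In `model_go` above, gradients are accumulated over
# `gradient_accumulate_step` forward passes, so the effective batch size is
# forward_size * gradient_accumulate_step == batch_size, and the number of
# optimizer updates is derived from that effective batch. A self-contained
# restatement of that bookkeeping:
def _step_bookkeeping_sketch(est_datasize, forward_size, batch_size, num_train_epochs):
    gradient_accumulate_step = int(batch_size / forward_size)
    effective_batch_size = forward_size * gradient_accumulate_step
    num_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * num_train_epochs
    return effective_batch_size, num_optimization_steps

# e.g. _step_bookkeeping_sketch(900_000, 64, 128, 5) == (128, 35155)
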
Example #28
def eval_model_for_downstream_ablation(model_saved_path,
                                       doc_top_k=2,
                                       tag='dev'):
    print(f"Run doc_top_k:{doc_top_k}")
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    # lazy = False
    lazy = True
    forward_size = 256
    # batch_size = 64
    batch_size = 128
    do_lower_case = True
    document_top_k = doc_top_k

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    test_list = common.load_json(config.TEST_FULLWIKI_FILE)

    # Load train eval results list
    # cur_train_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
    #                       "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl")

    cur_dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
        "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl"
    )

    # cur_test_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
    #                       "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/test_p_level_bert_v1_results.jsonl")

    # if tag == 'train':
    #     train_fitems = get_sentence_pair(document_top_k, train_list, cur_train_eval_results_list, is_training=True,
    #                                      debug_mode=debug_mode)
    if tag == 'dev':
        dev_fitems = get_sentence_pair(document_top_k,
                                       dev_list,
                                       cur_dev_eval_results_list,
                                       is_training=False,
                                       debug_mode=debug_mode)

    # elif tag == 'test':
    #     test_fitems = get_sentence_pair(document_top_k, test_list, cur_test_eval_results_list, is_training=False,
    #                                     debug_mode=debug_mode)
    else:
        # The 'train' and 'test' pair-building branches are commented out
        # above, so fail early instead of hitting a NameError further down.
        raise ValueError(f"Tag:{tag} not supported.")

    if debug_mode:
        eval_frequency = 2

    #     dev_list = dev_list[:10]
    #     dev_fitems_list = dev_fitems_list[:296]
    #     train_fitems_list = train_fitems_list[:300]
    # print(dev_list[-1]['_id'])
    # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, '_id')

    bert_tokenizer = BertTokenizer.from_pretrained(
        bert_model_name,
        do_lower_case=do_lower_case,
        cache_dir=bert_pretrain_path)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=128,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name,
                                             cache_dir=bert_pretrain_path)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    if tag == 'train':
        train_instance = bert_cs_reader.read(train_fitems)
    elif tag == 'dev':
        dev_instances = bert_cs_reader.read(dev_fitems)
    elif tag == 'test':
        test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    if tag == 'train':
        train_iter = biterator(train_instance, num_epochs=1, shuffle=False)
        print(len(train_fitems))
    elif tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
        print(len(dev_fitems))
    elif tag == 'test':
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)
        print(len(test_fitems))

    print("Forward size:", forward_size)

    if tag == 'train':
        cur_train_eval_results_list_out = eval_model(model,
                                                     train_iter,
                                                     device_num,
                                                     with_probs=True,
                                                     show_progress=True)
        common.save_jsonl(
            cur_train_eval_results_list_out, config.PRO_ROOT /
            "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/train_s_level_bert_v1_results.jsonl"
        )
    elif tag == 'dev':
        cur_dev_eval_results_list_out = eval_model(model,
                                                   dev_iter,
                                                   device_num,
                                                   with_probs=True,
                                                   show_progress=True)
        common.save_jsonl(
            cur_dev_eval_results_list_out,
            f"hotpot_s_level_{tag}_results_top_k_doc_{document_top_k}.jsonl")

    elif tag == 'test':
        cur_test_eval_results_list_out = eval_model(model,
                                                    test_iter,
                                                    device_num,
                                                    with_probs=True,
                                                    show_progress=True)
        common.save_jsonl(
            cur_test_eval_results_list_out, config.PRO_ROOT /
            "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/test_s_level_bert_v1_results.jsonl"
        )

    if tag == 'train' or tag == 'test':
        exit(0)

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_dev_eval_results_list_out,
        copied_dev_o_dict,
        'qid',
        'fid',
        check=True)
    # 0.5
    cur_results_dict_v05 = select_top_k_and_to_results_dict(
        copied_dev_o_dict,
        top_k=5,
        score_field_name='prob',
        filter_value=0.5,
        result_field='sp')

    cur_results_dict_v02 = select_top_k_and_to_results_dict(
        copied_dev_o_dict,
        top_k=5,
        score_field_name='prob',
        filter_value=0.2,
        result_field='sp')

    _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05,
                                         dev_list,
                                         verbose=False)

    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02,
                                         dev_list,
                                         verbose=False)

    logging_item = {
        'v02': metrics_v2,
        'v05': metrics_v5,
    }

    print(logging_item)
    f1 = metrics_v5['sp_f1']
    em = metrics_v5['sp_em']
    pr = metrics_v5['sp_prec']
    rec = metrics_v5['sp_recall']
    common.save_json(
        logging_item,
        f"top_k_doc:{document_top_k}_em:{em}_pr:{pr}_rec:{rec}_f1:{f1}")
Example #29
def model_go_with_old_data():
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    experiment_name = 'fever_v1_nli'
    lazy = False
    # lazy = True
    forward_size = 16
    # batch_size = 64
    # batch_size = 192
    batch_size = 32
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 3
    eval_frequency = 2000
    do_lower_case = True
    pair_order = 'cq'
    # debug_mode = True
    debug_mode = False
    # est_datasize = 900_000

    num_class = 3
    # num_train_optimization_steps

    train_sent_filtering_prob = 0.35
    dev_sent_filtering_prob = 0.1

    # dev_sent_results_file = config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/i(5000)|e(0)|s01(0.9170917091709171)|s05(0.8842384238423843)|seed(12)_dev_sent_results.json"
    # train_sent_results_file = config.RESULT_PATH / "doc_retri_results/fever_results/sent_results/4-14-sent_results_v0/train_sent_results.jsonl"
    from utest.utest_format_converter_for_old_sent.tool import format_convert
    dev_sent_results_file = format_convert(
        config.PRO_ROOT /
        "results/doc_retri_results/fever_results/sent_results/old_sent_data_by_NSMN/4-15-dev_sent_pred_scores_old_format.jsonl"
    )
    train_sent_results_file = format_convert(
        config.PRO_ROOT /
        "results/doc_retri_results/fever_results/sent_results/old_sent_data_by_NSMN/train_sent_scores_old_format.jsonl"
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    # train_fitems_list = get_inference_pair('train', True, train_sent_results_file, debug_mode, train_sent_filtering_prob)
    dev_debug_num = 2481 if debug_mode else None
    dev_fitems_list, dev_list = get_inference_pair('dev', False,
                                                   dev_sent_results_file,
                                                   dev_debug_num,
                                                   dev_sent_filtering_prob)
    # = common.load_jsonl(config.FEVER_DEV)

    if debug_mode:
        dev_list = dev_list[:50]
        eval_frequency = 1
        # print(dev_list[-1]['_id'])
        # exit(0)

    # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    train_debug_num = 2971 if debug_mode else None
    train_fitems_list, _ = get_inference_pair('train', True,
                                              train_sent_results_file,
                                              train_debug_num,
                                              train_sent_filtering_prob)
    est_datasize = len(train_fitems_list)

    # dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer,
                                        lazy,
                                        is_paired=True,
                                        query_l=64,
                                        example_filter=None,
                                        max_l=364,
                                        pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=False)
    #
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    file_path_prefix = '.'
    if not debug_mode:
        file_path_prefix, date = save_tool.gen_file_prefix(
            f"{experiment_name}")
        # # # Create Log File
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name),
                  'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)

        train_fitems_list, _ = get_inference_pair('train', True,
                                                  train_sent_results_file,
                                                  train_debug_num,
                                                  train_sent_filtering_prob)
        random.shuffle(train_fitems_list)
        train_instance = bert_cs_reader.read(train_fitems_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(
                paired_sequence,
                token_type_ids=paired_segments_ids,
                attention_mask=att_mask,
                mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances,
                                         num_epochs=1,
                                         shuffle=False)

                    cur_eval_results_list = eval_model(model,
                                                       dev_iter,
                                                       device_num,
                                                       with_probs=True,
                                                       make_int=True)

                    results_dict = list_dict_data_tool.list_to_dict(
                        cur_eval_results_list, 'oid')
                    copied_dev_list = copy.deepcopy(dev_list)
                    list_dict_data_tool.append_item_from_dict_to_list(
                        copied_dev_list, results_dict, 'id', 'predicted_label')

                    mode = {'standard': True}
                    strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(
                        copied_dev_list,
                        dev_fitems_list,
                        mode=mode,
                        max_evidence=5)
                    logging_item = {
                        'ss': strict_score,
                        'ac': acc_score,
                        'pr': pr,
                        'rec': rec,
                        'f1': f1,
                    }

                    save_file_name = f'i({update_step})|e({epoch_i})' \
                        f'|ss({strict_score})|ac({acc_score})|pr({pr})|rec({rec})|f1({f1})' \
                        f'|seed({seed})'

                    common.save_jsonl(
                        copied_dev_list,
                        Path(file_path_prefix) /
                        f"{save_file_name}_dev_nli_results.json")

                    # print(save_file_name)
                    logging_agent.incorporate_results({}, save_file_name,
                                                      logging_item)
                    logging_agent.logging_to_file(
                        Path(file_path_prefix) / "log.json")

                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    output_model_file = Path(file_path_prefix) / save_file_name
                    torch.save(model_to_save.state_dict(),
                               str(output_model_file))
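
# --- Hedged sketch (not part of the original script) ---
# `model_go_with_old_data` registers the FEVER labels in the order
# SUPPORTS, REFUTES, NOT ENOUGH INFO, so the expected label indices are
# 0, 1 and 2, with the extra "hidden" token moved to index -2. A tiny
# sanity check of that assumption (assuming ExVocabulary exposes the usual
# AllenNLP-style get_token_index lookup):
def _check_fever_label_vocab_sketch(vocab):
    for token, expected_index in (('SUPPORTS', 0), ('REFUTES', 1),
                                  ('NOT ENOUGH INFO', 2)):
        assert vocab.get_token_index(token, namespace='labels') == expected_index
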
Example #30
def model_eval_ablation(model_path, filter_value=0.2, top_k_sent=5):
    bert_model_name = 'bert-base-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'

    lazy = False
    forward_size = 32
    do_lower_case = True
    pair_order = 'cq'
    debug_mode = False

    maxout_model = False

    num_class = 3

    tag = 'dev'
    exp = 'no_re_train'
    print("Filter value:", filter_value)
    print("top_k_sent:", top_k_sent)
    train_sent_filtering_prob = 0.2
    dev_sent_filtering_prob = filter_value
    test_sent_filtering_prob = 0.2

    # Data dataset and upstream sentence results.
    dev_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl")
    # train_sent_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_train_results.jsonl")
    test_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_test_results.jsonl")

    dev_fitems, dev_list = get_nli_pair('dev', is_training=False,
                                        sent_level_results_list=dev_sent_results_list, debug=debug_mode,
                                        sent_top_k=top_k_sent, sent_filter_value=dev_sent_filtering_prob)
    # train_fitems, train_list = get_nli_pair('train', is_training=True,
    #                                         sent_level_results_list=train_sent_results_list, debug=debug_mode,
    #                                         sent_top_k=5, sent_filter_value=train_sent_filtering_prob)
    test_fitems, test_list = get_nli_pair('test', is_training=False,
                                          sent_level_results_list=test_sent_results_list, debug=debug_mode,
                                          sent_top_k=top_k_sent, sent_filter_value=test_sent_filtering_prob)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    if debug_mode:
        dev_list = dev_list[:100]
        # train_list = train_list[:100]
        test_list = test_list[:100]
        eval_frequency = 2

    # est_datasize = len(train_fitems)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64,
                                        example_filter=None, max_l=384, pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    if not maxout_model:
        model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                                act_type='tanh', use_pretrained_pooler=True, use_sigmoid=False)
    else:
        model = BertPairMaxOutMatcher(bert_encoder, num_of_class=num_class, act_type="gelu", num_of_out_layers=2)

    model.load_state_dict(torch.load(model_path))

    dev_instances = bert_cs_reader.read(dev_fitems)
    # train_instances = bert_cs_reader.read(train_fitems)
    test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)
        common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{dev_sent_filtering_prob}_{exp}.jsonl")

        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_dev_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                          'id', 'predicted_label')

        common.save_jsonl(copied_dev_list, f"nli_{tag}_cp_results_th{dev_sent_filtering_prob}_{exp}.jsonl")
        mode = {'standard': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                        mode=mode, max_evidence=5)
        logging_item = {
            'ss': strict_score, 'ac': acc_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }

        print(logging_item)
        common.save_json(logging_item,
                         f"nli_th{dev_sent_filtering_prob}_{exp}_ss:{strict_score}_ac:{acc_score}_pr:{pr}_rec:{rec}_f1:{f1}.jsonl")

    elif tag == 'test':
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)

        common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{test_sent_filtering_prob}.jsonl")

        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_test_list = copy.deepcopy(test_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_test_list, ema_results_dict,
                                                          'id', 'predicted_label')

        common.save_jsonl(copied_test_list, f"nli_{tag}_cp_results_th{test_sent_filtering_prob}.jsonl")