Example #1
def precompute_forward_items_and_cache():
    # 3 places need to switch from dev to train !!!

    is_training = False
    doc_results = common.load_json(
        # config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_train_doc_retrieval_v8_before_multihop_filtering.json")
        # config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_dev_doc_retrieval_v8_before_multihop_filtering.json")
        config.PRO_ROOT /
        "results/doc_retri_results/doc_retrieval_final_v8/hotpot_test_doc_retrieval_v8_before_multihop_filtering.json"
    )
    doc_results = results_multihop_filtering(doc_results,
                                             multihop_retrieval_top_k=3,
                                             strict_mode=True)

    # db_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_DB)

    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)

    # data_list = common.load_json(config.DEV_FULLWIKI_FILE)
    data_list = common.load_json(config.TEST_FULLWIKI_FILE)
    # data_list = common.load_json(config.TRAIN_FILE)
    append_baseline_context(doc_results, data_list)

    fitem_list = build_full_wiki_document_forward_item(doc_results, data_list,
                                                       is_training,
                                                       t_db_cursor, True)

    print(len(fitem_list))
    common.save_jsonl(
        fitem_list, config.PDATA_ROOT / "content_selection_forward" /
        "hotpot_test_p_level_unlabeled.jsonl")
Example #2
def results_analysis():
    doc_results = common.load_json(
        # config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_train_doc_retrieval_v8_before_multihop_filtering.json")
        config.PRO_ROOT /
        "results/doc_retri_results/doc_retrieval_final_v8/hotpot_dev_doc_retrieval_v8_before_multihop_filtering.json"
    )
    doc_results = results_multihop_filtering(doc_results,
                                             multihop_retrieval_top_k=3,
                                             strict_mode=True)

    # terms_based_results_list = common.load_jsonl(
    #     config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")

    data_list = common.load_json(config.DEV_FULLWIKI_FILE)
    # data_list = common.load_json(config.TRAIN_FILE)

    append_baseline_context(doc_results, data_list)

    len_list = []
    for rset in doc_results['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")

    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    ext_hotpot_eval.eval(doc_results, data_list)
Example #3
def toy_init_results():
    dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    print(len(dev_fullwiki_list))

    # Load rindex file
    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(config.PDATA_ROOT / "reverse_indexing/abs_rindexdb")
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))
    abs_rindexdb.inverted_index.build_Nt_table()
    abs_rindexdb.score_db['default-tf-idf'] = dict()
    load_from_file(abs_rindexdb.score_db['default-tf-idf'],
                   config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    # Load rindex finished

    saved_items = []
    for item in tqdm(dev_fullwiki_list):
        saved_tfidf_item = dict()
        question = item['question']
        qid = item['_id']

        doc_list = get_top_ranked_tf_idf_doc(question, abs_rindexdb, top_k=50)
        saved_tfidf_item['question'] = question
        saved_tfidf_item['qid'] = qid
        saved_tfidf_item['doc_list'] = doc_list

        saved_items.append(saved_tfidf_item)

    common.save_jsonl(saved_items, config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")
Example #4
    def batchnize_dataset(self, data, batch_size=None, shuffle=True):
        batches = []
        max_span_len = self.config["max_span_len"]
        dataset = load_json(data)

        if shuffle:
            random.shuffle(dataset)
            dataset.sort(key=lambda record: len(record["words"]))

        prev_seq_len = len(dataset[0]["words"])
        batch_words, batch_chars, batch_tags = [], [], []

        for record in dataset:
            seq_len = len(record["words"])

            if len(batch_words) == batch_size or prev_seq_len != seq_len:
                batches.append(
                    self.make_each_batch(batch_words, batch_chars,
                                         max_span_len, batch_tags))
                batch_words, batch_chars, batch_tags = [], [], []
                prev_seq_len = seq_len

            batch_words.append(record["words"])
            batch_chars.append(record["chars"])
            batch_tags.append(record["tags"])

        if len(batch_words) > 0:
            batches.append(
                self.make_each_batch(batch_words, batch_chars, max_span_len,
                                     batch_tags))
        if shuffle:
            random.shuffle(batches)
        for batch in batches:
            yield batch
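
The shuffle-then-sort step above makes every emitted batch hold sentences of a single length: a batch is cut whenever batch_size is reached or the sequence length changes, and the finished batches are shuffled again before being yielded. A self-contained sketch of that grouping rule (a standalone helper over plain lengths, not the project's code):

def cut_batches_by_length(lengths, batch_size):
    # mirrors the loop above: sort, then cut a batch on size or length change
    lengths = sorted(lengths)
    batches, current, prev_len = [], [], lengths[0] if lengths else 0
    for n in lengths:
        if len(current) == batch_size or n != prev_len:
            batches.append(current)
            current, prev_len = [], n
        current.append(n)
    if current:
        batches.append(current)
    return batches

print(cut_batches_by_length([5, 5, 5, 7, 7, 9], batch_size=2))  # [[5, 5], [5], [7, 7], [9]]
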
Example #5
def model_perf_binned(dataset_name, task_name, data_file, model_prediction_file, split_type='quantile', bin_num=5,
                      verbose=True):

    d_list = common.load_jsonl(data_file)
    collected_data_dict = list_dict_data_tool.list_to_dict(d_list, key_fields='uid')
    model_prediction_dict = common.load_json(model_prediction_file)

    bined_item = build_entropy_bins(collected_data_dict, bin_num, type=split_type)
    bined_item_results = calculate_per_bin_results_simplify(bined_item, model_prediction_dict,
                                                            task_name=task_name)

    if verbose:
        print('-' * 60)
        print('Data:', dataset_name)
        for model_name, range_items in bined_item_results.items():
            print('Model: {:20s}'.format(model_name))
            print('\t'.join(['{:18s}'.format('Entropy Range'), '{:15s}'.format('# of Example'),
                             '{:10s}'.format('JSD'), '{:10s}'.format('KL'),
                             '{:10s}'.format('Old Acc.'), '{:10s}'.format('New Acc.')]))
            for range_value, model_item in range_items['bin_results'].items():
                print('\t'.join(['{:5f}-{:5f}'.format(range_value[0], range_value[1]),
                                 '{:15s}'.format(format_number(model_item['total_count'])),
                                 '{:10s}'.format(format_number(model_item['average JS div'])),
                                 '{:10s}'.format(format_number(model_item['average KL div'])),
                                 '{:10s}'.format(format_number(model_item['o_acc'])),
                                 '{:10s}'.format(format_number(model_item['m_acc'])),
                                 ]))
        print('-' * 60)
    return bined_item_results
Example #6
def show_nli_binned_plot(y_axis_value):
    dataset_name = 'Natural Language Inference'
    task_name = 'uncertainty_nli'
    snli_data_file = config.CHAOSNLI_SNLI
    mnli_data_file = config.CHAOSNLI_MNLI

    model_pred_file = config.MODEL_PRED_NLI

    d_list_snli = common.load_jsonl(snli_data_file)
    d_list_mnli = common.load_jsonl(mnli_data_file)

    collected_data_dict = {}
    collected_data_dict_snli = list_dict_data_tool.list_to_dict(d_list_snli, key_fields='uid')
    collected_data_dict_mnli = list_dict_data_tool.list_to_dict(d_list_mnli, key_fields='uid')
    collected_data_dict.update(collected_data_dict_snli)
    collected_data_dict.update(collected_data_dict_mnli)

    model_prediction_dict = common.load_json(model_pred_file)

    bin_num = 5
    split_type = 'quantile'
    column_name = 'ChaosNLI-(S+M)'

    bined_item = build_entropy_bins(collected_data_dict, bin_num, type=split_type)
    bined_item_results = calculate_per_bin_results_simplify(bined_item, model_prediction_dict,
                                                            task_name=task_name)

    plot_histogram(bined_item_results, y_axis_value, column_name)
Example #7
def get_sp_position_count():
    train_list = common.load_json(config.TRAIN_FILE)
    c = Counter()
    for item in train_list:
        sp_position_analysis(item, c)

    print(c)
Example #8
def experiment_test_full_wiki():
    multihop_retrieval_top_k = 3
    match_filtering_k = 3
    term_retrieval_top_k = 5

    data_list = common.load_json(config.TEST_FULLWIKI_FILE)
    terms_based_results_list = common.load_jsonl(
        config.RESULT_PATH /
        "doc_retri_results/term_based_methods_results/hotpot_tf_idf_test.jsonl"
    )
    g_score_dict = dict()
    load_from_file(
        g_score_dict, config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    # We need to pass None for the ground-truth data here.
    doc_retri_pred_dict = init_results_v8(
        data_list,
        None,
        terms_based_results_list,
        g_score_dict,
        match_filtering_k=match_filtering_k,
        term_retrieval_top_k=term_retrieval_top_k)

    len_list = []
    for rset in doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results without filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    common.save_json(
        doc_retri_pred_dict,
        "hotpot_test_doc_retrieval_v8_before_multihop_filtering.json")

    # Filtering
    new_doc_retri_pred_dict = results_multihop_filtering(
        doc_retri_pred_dict, multihop_retrieval_top_k=multihop_retrieval_top_k)
    print("Results with filtering:")

    len_list = []
    for rset in new_doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    # ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list)
    common.save_json(new_doc_retri_pred_dict,
                     "hotpot_test_doc_retrieval_v8.json")
Example #9
    def logging_to_file(self, filename):
        if Path(filename).is_file():
            old_logging_list = common.load_json(filename)
            current_saved_key = set()

            for item in self.logging_item_list:
                current_saved_key.add(item['k'])

            for item in old_logging_list:
                if item['k'] not in current_saved_key:
                    raise ValueError("Previous logged item can not be found!")

        common.save_json(self.logging_item_list, filename, indent=2, sort_keys=True)
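
logging_to_file refuses to write a log that silently drops entries: every key 'k' already present in the file on disk must still be present in the in-memory list, otherwise it raises. A self-contained sketch of the same append-only guarantee, using the standard json module in place of the project's common helpers:

import json
from pathlib import Path

def append_only_save(items, filename):
    # items: a list of dicts, each carrying a unique 'k' key (assumption, mirroring the method above)
    path = Path(filename)
    if path.is_file():
        current_keys = {item['k'] for item in items}
        for old_item in json.loads(path.read_text()):
            if old_item['k'] not in current_keys:
                raise ValueError("Previous logged item can not be found!")
    path.write_text(json.dumps(items, indent=2, sort_keys=True))
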
Example #10
    def batchnize_dataset(self,
                          data,
                          data_name=None,
                          batch_size=None,
                          shuffle=True):
        max_span_len = self.config["max_span_len"]
        if data_name == "train":
            max_n_spans = self.config["max_n_spans"]
        else:
            if self.config["max_n_spans"] > 0:
                max_n_spans = 1000000
            else:
                max_n_spans = 0

        dataset = load_json(data)
        for instance_id, record in enumerate(dataset):
            record["instance_id"] = instance_id

        if shuffle:
            random.shuffle(dataset)
            dataset.sort(key=lambda record: len(record["words"]))

        batches = []
        batch_words, batch_chars, batch_tags, batch_ids = [], [], [], []
        prev_seq_len = len(dataset[0]["words"])

        for record in dataset:
            seq_len = len(record["words"])

            if len(batch_words) == batch_size or prev_seq_len != seq_len:
                batches.append(
                    self.make_each_batch_for_targets(batch_words, batch_chars,
                                                     batch_ids, max_span_len,
                                                     max_n_spans, batch_tags))
                batch_words, batch_chars, batch_tags, batch_ids = [], [], [], []
                prev_seq_len = seq_len

            batch_words.append(record["words"])
            batch_chars.append(record["chars"])
            batch_tags.append(record["tags"])
            batch_ids.append(record["instance_id"])

        if len(batch_words) > 0:
            batches.append(
                self.make_each_batch_for_targets(batch_words, batch_chars,
                                                 batch_ids, max_span_len,
                                                 max_n_spans, batch_tags))
        if shuffle:
            random.shuffle(batches)
        for batch in batches:
            yield batch
Example #11
def get_sample_data(size=-1):
    qa_gt_s = common.load_json(config.FEVER_DATA_ROOT / "qa_aug" /
                               "squad_train_turker_groundtruth.json")
    # print(len(qa_gt_s))
    qa_aug_rnei = common.load_json(
        config.FEVER_DATA_ROOT / "qa_aug" /
        "squad_train_refutes_bytype_3x_claim_stoch_answspan_stoch.json")
    # print(len(qa_aug_rnei))
    random.shuffle(qa_aug_rnei)
    for item in qa_aug_rnei:
        sv = random.random()
        if sv > 0.5:
            item['label'] = "REFUTES"
        else:
            item['label'] = "NOT ENOUGH INFO"

    balanced_aug_data = qa_gt_s + qa_aug_rnei[:len(qa_gt_s) * 2]
    print("Total balanced size:", len(balanced_aug_data))
    random.shuffle(balanced_aug_data)
    if size != -1:
        return balanced_aug_data[:size]
    else:
        return balanced_aug_data
Example #12
    def load_dataset(self, filename, keep_number=False, lowercase=True):
        dataset = []
        for record in load_json(filename):
            words = [
                word_convert(word,
                             keep_number=keep_number,
                             lowercase=lowercase) for word in record["words"]
            ]
            dataset.append({
                "sent_id": record["sent_id"],
                "words": words,
                "tags": record["spans"]
            })
        return dataset
Example #13
def get_train_sentence_pair(top_k, is_training, debug=False, cur_train_eval_results_list=None):
    train_list = common.load_json(config.TRAIN_FILE)

    if cur_train_eval_results_list is None:
        cur_train_eval_results_list = common.load_jsonl(
            config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
                              "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl")

    if debug:
        train_list = train_list[:100]
        id_set = set([item['_id'] for item in train_list])
        cur_train_eval_results_list = [item for item in cur_train_eval_results_list if item['qid'] in id_set]

    return get_sentence_pair(top_k, train_list, cur_train_eval_results_list, is_training)
Example #14
def full_wiki_baseline_upperbound():
    dev_fullwiki = common.load_json(config.DEV_FULLWIKI_FILE)
    # dev_fullwiki = common.load_json(config.DEV_DISTRACTOR_FILE)
    upperbound_pred_file = dict()

    upperbound_pred_file['sp'] = dict()
    upperbound_pred_file['sp_doc'] = dict()
    upperbound_pred_file['p_answer'] = dict()

    # print(dev_fullwiki)
    for item in dev_fullwiki:
        qid = item['_id']
        answer = item['answer']
        contexts = item['context']
        supporting_facts = item['supporting_facts']
        # supporting_doc = set([fact[0] for fact in item['supporting_facts']])

        # retrieved_doc_dict = set([context[0] for context in contexts])
        retrieved_doc_dict = dict()

        for doc_title, context_sents in contexts:
            if doc_title not in retrieved_doc_dict:
                retrieved_doc_dict[doc_title] = dict()

            for i, sent in enumerate(context_sents):
                retrieved_doc_dict[doc_title][i] = sent

        upperbound_pred_doc = []
        upperbound_pred_sp = []

        found_answer = False
        for sp_doc, sp_fact_line_num in supporting_facts:
            if sp_doc in retrieved_doc_dict and sp_fact_line_num in retrieved_doc_dict[sp_doc]:
                upperbound_pred_doc.append(sp_doc)
                upperbound_pred_sp.append([sp_doc, sp_fact_line_num])
                if answer in retrieved_doc_dict[sp_doc][sp_fact_line_num]:
                    found_answer = True

        p_answer = answer if found_answer else ""

        upperbound_pred_file['sp'][qid] = upperbound_pred_sp
        upperbound_pred_file['sp_doc'][qid] = upperbound_pred_doc

        upperbound_pred_file['p_answer'][qid] = p_answer

        if all([gt_fact in upperbound_pred_sp for gt_fact in supporting_facts]):
            # If all the evidence was found, also set the answer (this covers yes/no answers).
            upperbound_pred_file['p_answer'][qid] = answer

    ext_hotpot_eval.eval(upperbound_pred_file, dev_fullwiki)
Example #15
def term_based_doc_retri(hotpot_set):
    fullwiki_list = common.load_json(hotpot_set)
    print("{} questions".format(len(fullwiki_list)))

    retri_list = []
    for item in tqdm(fullwiki_list):
        saved_tfidf_item = dict()
        question = item['question']
        qid = item['_id']

        doc_list = lucene_retri_doc(question, top_k=50)
        saved_tfidf_item['question'] = question
        saved_tfidf_item['qid'] = qid
        saved_tfidf_item['doc_list'] = doc_list

        retri_list.append(saved_tfidf_item)
    return retri_list
Example #16
def eval_hotpot_s():
    cur_dev_eval_results_list_out = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpot_p_level_effects/hotpot_s_level_dev_results_top_k_doc_100.jsonl"
    )
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_dev_eval_results_list_out,
        copied_dev_o_dict,
        'qid',
        'fid',
        check=True)
    # 0.5
    cur_results_dict_v05 = select_top_k_and_to_results_dict(
        copied_dev_o_dict,
        top_k=5,
        score_field_name='prob',
        filter_value=0.5,
        result_field='sp')

    # cur_results_dict_v02 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
    #                                                         score_field_name='prob',
    #                                                         filter_value=0.2,
    #                                                         result_field='sp')

    _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05,
                                         dev_list,
                                         verbose=False)

    # _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02, dev_list, verbose=False)

    logging_item = {
        # 'v02': metrics_v2,
        'v05': metrics_v5,
    }

    print(logging_item)
    f1 = metrics_v5['sp_f1']
    em = metrics_v5['sp_em']
    pr = metrics_v5['sp_prec']
    rec = metrics_v5['sp_recall']

    print(em, pr, rec, f1)
Example #17
def inspect_upstream_eval():
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/dev_s_level_bert_v1_results.jsonl"
    )
    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        dev_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

    # 0.5
    # cur_results_dict_v05 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5,
    #                                                         score_field_name='prob',
    #                                                         filter_value=0.5,
    #                                                         result_field='sp')

    cur_results_dict_v02 = select_top_k_and_to_results_dict(
        copied_dev_o_dict,
        top_k=5,
        score_field_name='prob',
        filter_value=0.2,
        result_field='sp')

    # _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05, dev_list, verbose=False)

    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02,
                                         dev_list,
                                         verbose=False)

    v02_sp_f1 = metrics_v2['sp_f1']
    v02_sp_recall = metrics_v2['sp_recall']
    v02_sp_prec = metrics_v2['sp_prec']

    # metrics_v5 is only defined when the 0.5-threshold evaluation above is re-enabled:
    # v05_sp_f1 = metrics_v5['sp_f1']
    # v05_sp_recall = metrics_v5['sp_recall']
    # v05_sp_prec = metrics_v5['sp_prec']

    logging_item = {
        'label': 'ema',
        'v02': metrics_v2,
        # 'v05': metrics_v5,
    }

    print(logging_item)
Example #18
def load_and_eval():
    top_k = 50
    value_threshold = None
    tf_idf_dev_results = common.load_jsonl(config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")
    doc_pred_dict = {'sp_doc': dict()}

    for item in tqdm(tf_idf_dev_results):
        sorted_scored_list = sorted(item['doc_list'], key=lambda x: x[0], reverse=True)
        pred_list = [docid for _, docid in sorted_scored_list[:top_k]]
        # print(sorted_scored_list)

        qid = item['qid']
        doc_pred_dict['sp_doc'][qid] = pred_list

        # break

    dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    ext_hotpot_eval.eval(doc_pred_dict, dev_fullwiki_list)
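
The sort key and tuple unpacking above imply that each entry of item['doc_list'] is a (score, doc_id) pair, higher score meaning a better match; the file being read here is the one written by toy_init_results in Example #3. A tiny illustration with made-up values:

example_doc_list = [(9.3, 'Article B'), (12.7, 'Article A'), (4.1, 'Article C')]
sorted_docs = sorted(example_doc_list, key=lambda x: x[0], reverse=True)
top_2 = [doc_id for _, doc_id in sorted_docs[:2]]   # ['Article A', 'Article B']
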
Example #19
def eval_p_level():
    cur_eval_results_list = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl"
    )

    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)
    # Top_5
    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict,
                                                             top_k=5)

    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5,
                                           dev_list,
                                           verbose=False)

    print(metrics_top5)
Example #20
    def train_knn_epoch(self, batches, name):
        loss_total = 0.
        num_batches = 0
        start_time = time.time()
        train_sents = load_json(self.cfg["train_set"])
        if self.cfg["knn_sampling"] == "random":
            train_sent_ids = [sent_id for sent_id in range(len(train_sents))]
        else:
            train_sent_ids = None

        for batch in batches:
            num_batches += 1
            if num_batches % 100 == 0:
                print("%d" % num_batches, flush=True, end=" ")

            # Setup a batch
            batch = self._add_neighbor_instances_to_batch(batch,
                                                          train_sents,
                                                          train_sent_ids,
                                                          is_train=True)
            # Convert a batch to the input format
            feed_dict = self._get_feed_dict(batch,
                                            is_train=True,
                                            keep_prob=self.cfg["keep_prob"],
                                            lr=self.cfg["lr"])
            # Train a model
            _, train_loss = self.sess.run([self.train_op, self.loss],
                                          feed_dict)

            if math.isnan(train_loss):
                self.logger.info("\n\n\nNAN: Index: %d\n" % num_batches)
                exit()

            loss_total += train_loss

        avg_loss = loss_total / num_batches
        self.logger.info("-- Time: %f seconds" % (time.time() - start_time))
        self.logger.info("-- Averaged loss: %f(%f/%d)" %
                         (avg_loss, loss_total, num_batches))
        return avg_loss, loss_total
Example #21
  def _initialize_config(self):
    # create folders and logger
    os.makedirs(self.cfg["checkpoint_path"], exist_ok=True)
    os.makedirs(os.path.join(self.cfg["summary_path"]), exist_ok=True)
    self.logger = get_logger(
      os.path.join(self.cfg["checkpoint_path"], "log.txt"))

    # load dictionary
    dict_data = load_json(self.cfg["vocab"])
    self.word_dict = dict_data["word_dict"]
    self.char_dict = dict_data["char_dict"]
    self.tag_dict = dict_data["tag_dict"]
    del dict_data
    self.word_vocab_size = len(self.word_dict)
    self.char_vocab_size = len(self.char_dict)
    self.tag_vocab_size = len(self.tag_dict)
    self.rev_word_dict = dict([(idx, word)
                               for word, idx in self.word_dict.items()])
    self.rev_char_dict = dict([(idx, char)
                               for char, idx in self.char_dict.items()])
    self.rev_tag_dict = dict([(idx, tag)
                              for tag, idx in self.tag_dict.items()])
Example #22
    def evaluate_knn_epoch(self, batches, name):
        correct = 0
        p_total = 0
        num_batches = 0
        start_time = time.time()
        train_sents = load_json(self.cfg["train_set"])
        if self.cfg["knn_sampling"] == "random":
            train_sent_ids = [sent_id for sent_id in range(len(train_sents))]
        else:
            train_sent_ids = None

        for batch in batches:
            num_batches += 1
            if num_batches % 100 == 0:
                print("%d" % num_batches, flush=True, end=" ")

            # Setup a batch
            batch = self._add_neighbor_instances_to_batch(batch,
                                                          train_sents,
                                                          train_sent_ids,
                                                          is_train=False)
            # Convert a batch to the input format
            feed_dict = self._get_feed_dict(batch)
            # Classify spans
            predicted_tags = self.sess.run([self.predicts], feed_dict)[0]

            crr_i, p_total_i = count_gold_and_system_outputs(
                batch["tags"], predicted_tags, NULL_LABEL_ID)
            correct += crr_i
            p_total += p_total_i

        p, r, f = f_score(correct, p_total, self.n_gold_spans)
        self.logger.info("-- Time: %f seconds" % (time.time() - start_time))
        self.logger.info(
            "-- {} set\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})"
            .format(name, f, p, correct, p_total, r, correct,
                    self.n_gold_spans))
        return f, p, r, correct, p_total, self.n_gold_spans
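
f_score itself is not shown in this listing; it is called as f_score(correct, p_total, self.n_gold_spans) and unpacked as p, r, f, which matches the standard span-level precision/recall/F1. A hedged sketch of that assumed definition (not necessarily the project's implementation):

def f_score(correct, p_total, n_gold):
    # precision over predicted spans, recall over gold spans, harmonic-mean F1
    p = correct / p_total if p_total else 0.0
    r = correct / n_gold if n_gold else 0.0
    f = 2 * p * r / (p + r) if (p + r) else 0.0
    return p, r, f
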
Example #23
def model_perf(dataset_name, task_name, data_file, model_prediction_file):
    d_list = common.load_jsonl(data_file)
    collected_data_dict = list_dict_data_tool.list_to_dict(d_list,
                                                           key_fields='uid')
    model_prediction_dict = common.load_json(model_prediction_file)
    results_dict, all_correct_set = calculate_divergence_bwt_model_human_simplify(
        collected_data_dict, model_prediction_dict, task_name)
    print('-' * 60)
    print('Data:', dataset_name)
    print("All Correct Count:", len(all_correct_set))
    print('\t'.join([
        '{:20s}'.format('Model Name'), '{:10s}'.format('JSD'),
        '{:10s}'.format('KL'), '{:10s}'.format('Old Acc.'),
        '{:10s}'.format('New Acc.')
    ]))
    for model_name, model_item in results_dict.items():
        print('\t'.join([
            '{:20s}'.format(model_name),
            '{:10s}'.format(format_number(model_item['average JS div'])),
            '{:10s}'.format(format_number(model_item['average KL div'])),
            '{:10s}'.format(format_number(model_item['o_acc'])),
            '{:10s}'.format(format_number(model_item['m_acc'])),
        ]))
    print('-' * 60)
Example #24
def inspect_sampler_squad_examples():
    bert_model_name = "bert-base-uncased"
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    do_lower_case = True
    max_pre_context_length = 315
    max_query_length = 64
    doc_stride = 128
    debug = True

    tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                              do_lower_case=do_lower_case,
                                              cache_dir=bert_pretrain_path)

    squad_train_v2 = common.load_json(config.SQUAD_TRAIN_2_0)

    train_eitem_list = preprocessing_squad(squad_train_v2)
    train_fitem_dict, train_fitem_list = eitems_to_fitems(
        train_eitem_list,
        tokenizer,
        is_training=False,
        max_tokens_for_doc=max_pre_context_length,
        doc_stride=doc_stride,
        debug=debug)
    print(len(train_fitem_list))
Example #25
    def batchnize_dataset(self, data, batch_size=None, shuffle=True):
        max_span_len = self.config["max_span_len"]
        max_n_spans = None
        dataset = load_json(data)

        if shuffle:
            random.shuffle(dataset)

        batch_words, batch_chars, batch_tags = [], [], []

        for record in dataset:
            if len(batch_words) == batch_size:
                yield self.make_each_batch(batch_words, batch_chars,
                                           max_span_len, max_n_spans,
                                           batch_tags)
                batch_words, batch_chars, batch_tags = [], [], []

            batch_words.append(record["words"])
            batch_chars.append(record["chars"])
            batch_tags.append(record["tags"])

        if len(batch_words) > 0:
            yield self.make_each_batch(batch_words, batch_chars, max_span_len,
                                       max_n_spans, batch_tags)
Example #26
def eval_model_for_downstream(model_saved_path):
    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    # lazy = False
    lazy = True
    forward_size = 32
    # batch_size = 64
    batch_size = 128
    do_lower_case = True

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    dev_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_dev_p_level_unlabeled.jsonl")
    train_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_train_p_level_labeled.jsonl")
    test_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_test_p_level_unlabeled.jsonl")

    if debug_mode:
        dev_list = dev_list[:10]
        dev_fitems_list = dev_fitems_list[:296]
        train_fitems_list = train_fitems_list[:300]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, '_id')

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0, max_l=286)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    dev_instances = bert_cs_reader.read(dev_fitems_list)
    train_instance = bert_cs_reader.read(train_fitems_list)
    test_instances = bert_cs_reader.read(test_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    # train_iter = biterator(train_instance, num_epochs=1, shuffle=False)
    # dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
    test_iter = biterator(test_instances, num_epochs=1, shuffle=False)

    print(len(dev_fitems_list))
    print(len(test_fitems_list))
    print(len(train_fitems_list))

    # cur_dev_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, show_progress=True)
    # cur_train_eval_results_list = eval_model(model, train_iter, device_num, with_probs=True, show_progress=True)

    cur_test_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True, show_progress=True)
    common.save_jsonl(cur_test_eval_results_list, "test_p_level_bert_v1_results.jsonl")

    print("Test write finished.")
    exit(0)
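    # NOTE: everything below this exit(0) is unreachable in the current test-only configuration; it
    # references cur_dev_eval_results_list / cur_train_eval_results_list from the commented-out dev
    # and train evaluation calls above and only applies when those are re-enabled.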

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)

    list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    # Top_3
    cur_results_dict_top3 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=3)
    upperbound_results_dict_top3 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top3,
        dev_list)

    # Top_5
    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
    upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top5,
        dev_list)

    cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10)
    upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri(
        cur_results_dict_top10,
        dev_list)

    _, metrics_top3 = ext_hotpot_eval.eval(cur_results_dict_top3, dev_list, verbose=False)
    _, metrics_top3_UB = ext_hotpot_eval.eval(upperbound_results_dict_top3, dev_list, verbose=False)

    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
    _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list, verbose=False)

    _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False)
    _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list, verbose=False)

    logging_item = {
        'top3': metrics_top3,
        'top3_UB': metrics_top3_UB,
        'top5': metrics_top5,
        'top5_UB': metrics_top5_UB,
        'top10': metrics_top10,
        'top10_UB': metrics_top10_UB,
    }

    print(logging_item)

    common.save_jsonl(cur_train_eval_results_list, "train_p_level_bert_v1_results.jsonl")
    common.save_jsonl(cur_dev_eval_results_list, "dev_p_level_bert_v1_results.jsonl")
Example #27
def model_go():
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    experiment_name = 'hotpot_v0_cs'
    lazy = False
    # lazy = True
    forward_size = 16
    # batch_size = 64
    batch_size = 128
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 5000
    pos_ratio = 0.2
    do_lower_case = True

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)

    dev_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_dev_p_level_unlabeled.jsonl")
    train_fitems_list = common.load_jsonl(
        config.PDATA_ROOT / "content_selection_forward" / "hotpot_train_p_level_labeled.jsonl")

    if debug_mode:
        dev_list = dev_list[:10]
        dev_fitems_list = dev_fitems_list[:296]
        train_fitems_list = train_fitems_list[:300]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(sampled_train_list)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(bert_tokenizer, lazy, is_paired=True,
                                                example_filter=lambda x: len(x['context']) == 0, max_l=286)

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                            act_type='tanh', use_pretrained_pooler=True, use_sigmoid=True)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems_list)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    # # # Create Log File
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    # Save the source code.
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        train_instance = bert_cs_reader.read(sampled_train_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                         mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                         labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                    cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True)
                    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                    list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_dev_o_dict,
                                                                          'qid', 'fid', check=True)
                    # Top_5
                    cur_results_dict_top5 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=5)
                    upperbound_results_dict_top5 = append_gt_downstream_to_get_upperbound_from_doc_retri(
                        cur_results_dict_top5,
                        dev_list)

                    cur_results_dict_top10 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=10)
                    upperbound_results_dict_top10 = append_gt_downstream_to_get_upperbound_from_doc_retri(
                        cur_results_dict_top10,
                        dev_list)

                    _, metrics_top5 = ext_hotpot_eval.eval(cur_results_dict_top5, dev_list, verbose=False)
                    _, metrics_top5_UB = ext_hotpot_eval.eval(upperbound_results_dict_top5, dev_list, verbose=False)

                    _, metrics_top10 = ext_hotpot_eval.eval(cur_results_dict_top10, dev_list, verbose=False)
                    _, metrics_top10_UB = ext_hotpot_eval.eval(upperbound_results_dict_top10, dev_list, verbose=False)

                    # top5_doc_f1, top5_UB_sp_f1, top10_doc_f1, top10_Ub_sp_f1
                    # top5_doc_f1 = metrics_top5['doc_f1']
                    # top5_UB_sp_f1 = metrics_top5_UB['sp_f1']
                    # top10_doc_f1 = metrics_top10['doc_f1']
                    # top10_Ub_sp_f1 = metrics_top10_UB['sp_f1']

                    top5_doc_recall = metrics_top5['doc_recall']
                    top5_UB_sp_recall = metrics_top5_UB['sp_recall']
                    top10_doc_recall = metrics_top10['doc_recall']
                    top10_Ub_sp_recall = metrics_top10_UB['sp_recall']

                    logging_item = {
                        'top5': metrics_top5,
                        'top5_UB': metrics_top5_UB,
                        'top10': metrics_top10,
                        'top10_UB': metrics_top10_UB,
                    }

                    # print(logging_item)
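                    # Note: the last recall field below is labeled 't5_sp_recall' although it holds
                    # the top-10 upper-bound value; the result-file names referenced in Examples #13
                    # and #19 show the same doubled label.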
                    save_file_name = f'i({update_step})|e({epoch_i})' \
                        f'|t5_doc_recall({top5_doc_recall})|t5_sp_recall({top5_UB_sp_recall})' \
                        f'|t10_doc_recall({top10_doc_recall})|t5_sp_recall({top10_Ub_sp_recall})|seed({seed})'

                    # print(save_file_name)
                    logging_agent.incorporate_results({}, save_file_name, logging_item)
                    logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = Path(file_path_prefix) / save_file_name
                    torch.save(model_to_save.state_dict(), str(output_model_file))
Example #28
def model_go():
    for some_params in [0]:
        # bert_model_name = 'bert-large-uncased'
        seed = 6
        bert_model_name = 'bert-base-uncased'
        lazy = False
        forward_size = 16
        batch_size = 32
        gradient_accumulate_step = int(batch_size / forward_size)
        warmup_proportion = 0.1
        learning_rate = 5e-5
        num_train_epochs = 4
        do_ema = False
        dev_prob_threshold = 0.1
        train_prob_threshold = 0.35
        debug_mode = False
        experiment_name = f"fever_nli_bert_maxout_l4_on_fulldata"
        # experiment_name = f"bert_fever_nli_baseline_on_fulldata_aug_the_same_gt_mrate({some_params})"
        # experiment_name = f"bert_fever_nli_baseline_on_10p_aug_the_same_gt_mrate({some_params})"

        # data_aug = True
        data_aug = False
        data_aug_file = config.FEVER_DATA_ROOT / "qa_aug/squad_train_turker_groundtruth.json"
        # data_aug_size = int(21_015 * some_params)   # 10p
        data_aug_size = int(208_346 * some_params)

        # training_file = config.FEVER_DATA_ROOT / "fever_1.0/train_10.jsonl"
        training_file = config.FEVER_DATA_ROOT / "fever_1.0/train.jsonl"

        train_sample_top_k = 8

        # est_datasize = 208_346    # full
        # est_datasize = 14_544
        # est_datasize = 21_015 + data_aug_size   # 10p
        est_datasize = 208_346 + data_aug_size

        num_class = 3

        # num_train_optimization_steps
        torch.manual_seed(seed)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()

        unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
        vocab = ExVocabulary(unk_token_num=unk_token_num)
        vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
        vocab.add_token_to_namespace('REFUTES', namespace='labels')
        vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
        vocab.add_token_to_namespace("hidden", namespace="labels")
        vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')
        # Finished build vocabulary.

        # Load standardized sentence file
        dev_upstream_sent_list = common.load_jsonl(config.FEVER_DATA_ROOT /
                                                   "upstream_sentence_selection_Feb16/dev_sent_pred_scores.jsonl")
        dev_sent_after_threshold_filter = fever_ss_sampler.threshold_sampler_insure_unique(
            config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl",
            dev_upstream_sent_list,
            prob_threshold=dev_prob_threshold, top_n=5)

        dev_data_list = fever_nli_sampler.select_sent_with_prob_for_eval(
            config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl", dev_sent_after_threshold_filter,
            None, tokenized=True)

        # print(dev_data_list[0])
        # exit(0)

        train_upstream_sent_list = common.load_jsonl(config.FEVER_DATA_ROOT /
                                                     "upstream_sentence_selection_Feb16/train_sent_scores.jsonl")
        # Finished loading standardized sentence file.

        bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)

        bert_fever_reader = BertReaderFeverNLI(bert_tokenizer, lazy=lazy)

        dev_instances = bert_fever_reader.read(dev_data_list)

        biterator = BasicIterator(batch_size=forward_size)
        biterator.index_with(vocab)

        # print(list(mnli_dev_instances))

        # Load training model
        # model_clf = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_class)
        bert_encoder = BertModel.from_pretrained(bert_model_name)
        model_clf = BertPairMaxOutMatcher(bert_encoder, num_of_class=3)

        ema_tracker = None
        ema_model_copy = None
        if do_ema and ema_tracker is None:
            ema_tracker = EMA(model_clf.named_parameters(), on_cpu=True)
            ema_model_copy = copy.deepcopy(model_clf)

        model_clf.to(device)

        param_optimizer = list(model_clf.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                       num_train_epochs

        print(num_train_optimization_steps)

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=num_train_optimization_steps)

        # optimizer = optim.Adam(optimizer_grouped_parameters, lr=learning_rate)

        # # # Create Log File
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

        model_clf.train()

        if n_gpu > 1:
            model_clf = nn.DataParallel(model_clf)

        forbackward_step = 0
        update_step = 0
        eval_iter_num = 2_000  # Change this to real evaluation.
        best_fever_score = -1

        for n_epoch in range(num_train_epochs):
            print("Resampling...")
            train_sent_after_threshold_filter = \
                fever_ss_sampler.threshold_sampler_insure_unique(training_file,
                                                                 train_upstream_sent_list,
                                                                 train_prob_threshold,
                                                                 top_n=train_sample_top_k)
            #
            train_data_list = fever_nli_sampler.adv_simi_sample_with_prob_v1_1(
                training_file,
                train_sent_after_threshold_filter,
                None,
                tokenized=True)

            aug_d_list = []
            if data_aug:
                aug_d_list = common.load_json(data_aug_file)
                random.shuffle(aug_d_list)
                aug_d_list = aug_d_list[:data_aug_size]

            train_data_list = train_data_list + aug_d_list

            random.shuffle(train_data_list)
            print("Sample data length:", len(train_data_list))
            sampled_train_instances = bert_fever_reader.read(train_data_list)
            #
            train_iter = biterator(sampled_train_instances, shuffle=True, num_epochs=1)

            for i, batch in enumerate(tqdm(train_iter)):
                paired_sequence = batch['paired_sequence']
                paired_segments_ids = batch['paired_segments_ids']
                labels_ids = batch['label']
                att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
                s1_span = batch['bert_s1_span']
                s2_span = batch['bert_s2_span']

                paired_sequence = paired_sequence.to(device)
                paired_segments_ids = paired_segments_ids.to(device)
                labels_ids = labels_ids.to(device)
                att_mask = att_mask.to(device)
                s1_span = s1_span.to(device)
                s2_span = s2_span.to(device)

                loss = model_clf(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                                 s1_span=s1_span, s2_span=s2_span,
                                 mode=BertPairMaxOutMatcher.ForwardMode.TRAIN,
                                 labels=labels_ids)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.

                if gradient_accumulate_step > 1:
                    loss = loss / gradient_accumulate_step

                loss.backward()
                forbackward_step += 1

                if forbackward_step % gradient_accumulate_step == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    update_step += 1
                    if do_ema and ema_tracker is not None:
                        # if model_clf is DataParallel, then we use model_clf.module
                        model_to_track = model_clf.module if hasattr(model_clf,
                                                                     'module') else model_clf
                        ema_tracker(model_to_track.named_parameters())  # whenever we do an optimizer update, also apply the EMA update

                    if update_step % eval_iter_num == 0:
                        print("Update steps:", update_step)
                        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

                        if do_ema and ema_model_copy is not None and ema_tracker is not None:
                            print("EMA evaluation.")
                            EMA.load_ema_to_model(ema_model_copy, ema_tracker)
                            ema_model_copy.to(device)
                            if n_gpu > 1:
                                ema_model_copy = nn.DataParallel(ema_model_copy)
                            dev_data_list = hidden_eval(ema_model_copy, dev_iter, dev_data_list, device)
                        else:
                            dev_data_list = hidden_eval(model_clf, dev_iter, dev_data_list, device)

                        eval_mode = {'check_sent_id_correct': True, 'standard': True}
                        fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score(dev_data_list,
                                                                                         common.load_jsonl(config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl"),
                                                                                         mode=eval_mode,
                                                                                         verbose=False)
                        print("Fever Score(FScore/LScore:/Precision/Recall/F1):", fever_score, label_score, pr, rec, f1)

                        print(f"Dev:{fever_score}/{label_score}")

                        if best_fever_score < fever_score:
                            print("New Best FScore")
                            best_fever_score = fever_score

                            save_path = os.path.join(
                                file_path_prefix,
                                f'i({update_step})_epoch({n_epoch})_dev({fever_score})_lacc({label_score})_seed({seed})'
                            )
                            model_to_save = model_clf.module if hasattr(model_clf,
                                                                        'module') else model_clf
                            output_model_file = os.path.join(file_path_prefix, save_path)
                            torch.save(model_to_save.state_dict(), output_model_file)

            print("Update steps:", update_step)
            dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

            if do_ema and ema_model_copy is not None and ema_tracker is not None:
                print("EMA evaluation.")
                EMA.load_ema_to_model(ema_model_copy, ema_tracker)
                ema_model_copy.to(device)
                if n_gpu > 1:
                    ema_model_copy = nn.DataParallel(ema_model_copy)
                dev_data_list = hidden_eval(ema_model_copy, dev_iter, dev_data_list, device)
            else:
                dev_data_list = hidden_eval(model_clf, dev_iter, dev_data_list, device)

            eval_mode = {'check_sent_id_correct': True, 'standard': True}
            fever_score, label_score, pr, rec, f1 = fever_scorer.fever_score(dev_data_list,
                                                                             common.load_jsonl(config.FEVER_DATA_ROOT / "fever_1.0/shared_task_dev.jsonl"),
                                                                             mode=eval_mode,
                                                                             verbose=False)
            print("Fever Score(FScore/LScore:/Precision/Recall/F1):", fever_score, label_score, pr, rec, f1)

            print(f"Dev:{fever_score}/{label_score}")

            if best_fever_score < fever_score:
                print("New Best FScore")
                best_fever_score = fever_score

                save_path = os.path.join(
                    file_path_prefix,
                    f'i({update_step})_epoch({n_epoch})_dev({fever_score})_lacc({label_score})_seed({seed})'
                )
                model_to_save = model_clf.module if hasattr(model_clf,
                                                            'module') else model_clf
                output_model_file = os.path.join(file_path_prefix, save_path)
                torch.save(model_to_save.state_dict(), output_model_file)
Example #29
def downsample_negative_examples(sent_data_list, selection_prob, same_doc_prob):
    # NOTE: this snippet originally began mid-function; the signature above is a reconstruction
    # inferred from the __main__ call below (downsample_negative_examples(train_sent_data_list, 0.1, 1))
    # and from the variable names used in the body.
    r_list = []
    for p_item in sent_data_list:
        if p_item['label'] == 'false' and not p_item['in_sp_doc']:
            p_v = np.random.rand()
            if p_v < selection_prob:
                r_list.append(p_item)
        elif p_item['label'] == 'false' and p_item['in_sp_doc']:
            p_v = np.random.rand()
            if p_v < same_doc_prob:
                r_list.append(p_item)
        else:
            r_list.append(p_item)

    return r_list


if __name__ == '__main__':
    train_list = common.load_json(config.TRAIN_FILE)
    # train_list = common.load_json(config.DEV_FULLWIKI_FILE)
    # train_list = common.load_json(config.DEV_DISTRACTOR_FILE)
    # print(len(train_list))
    train_sent_data_list = build_sent_match_data_from_distractor_list(
        train_list, is_training=True)
    print(len(train_sent_data_list))
    train_sent_data_list = downsample_negative_examples(
        train_sent_data_list, 0.1, 1)
    print(len(train_sent_data_list))
    neg = 0
    pos = 0
    in_sp_doc = 0
    for p_item in train_sent_data_list:
        if p_item['label'] == 'true':
            pos += 1
Example #30
def doc_retrie_v5_reimpl_tf_idf_upperbound():
    top_k = 10
    dev_fullwiki = common.load_json(config.DEV_FULLWIKI_FILE)

    pred_dev = common.load_json(
        # config.RESULT_PATH / "doc_retri_results/doc_raw_matching_with_disamb_with_hyperlinked_v5_file.json")
        # config.RESULT_PATH / "doc_retri_results/doc_raw_matching_file.json")
        config.RESULT_PATH / "doc_retri_results/doc_retrieval_debug_v6/doc_raw_matching_with_disamb_withiout_hyperlinked_v6_file_debug_4.json")
        # config.RESULT_PATH / "doc_retri_results/doc_raw_matching_with_disamb_withiout_hyperlinked_v5_file.json")

    tf_idf_dev_results = common.load_jsonl(
        config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")

    tf_idf_scored_dict = dict()
    for item in tf_idf_dev_results:
        sorted_scored_list = sorted(item['doc_list'], key=lambda x: x[0], reverse=True)
        pred_list = [docid for _, docid in sorted_scored_list[:top_k]]
        qid = item['qid']
        tf_idf_scored_dict[qid] = pred_list

    pred_v5_sp_doc = pred_dev['sp_doc']
    # dev_fullwiki = common.load_json(config.DEV_DISTRACTOR_FILE)
    upperbound_pred_file = dict()

    upperbound_pred_file['sp'] = dict()
    upperbound_pred_file['sp_doc'] = dict()
    upperbound_pred_file['p_answer'] = dict()

    # print(dev_fullwiki)

    for item in dev_fullwiki:
        qid = item['_id']
        answer = item['answer']
        contexts = item['context']
        supporting_facts = item['supporting_facts']

        tf_idf_docs = tf_idf_scored_dict[qid]

        v5_retrieved_doc = pred_v5_sp_doc[qid]
        # print(v5_retrieved_doc)
        supporting_doc = set([fact[0] for fact in item['supporting_facts']])

        # retrieved_doc_dict = set([context[0] for context in contexts])
        retrieved_doc_dict = dict()

        for doc_title, context_sents in contexts:
            if doc_title not in retrieved_doc_dict:
                retrieved_doc_dict[doc_title] = dict()

            for i, sent in enumerate(context_sents):
                retrieved_doc_dict[doc_title][i] = sent

        upperbound_pred_doc = []
        upperbound_pred_sp = []

        found_answer = False
        for sp_doc in tf_idf_docs:
            if sp_doc in supporting_doc:
                upperbound_pred_doc.append(sp_doc)
                for gt_sp_doc, sp_fact_line_num in supporting_facts:
                    if gt_sp_doc == sp_doc:
                        upperbound_pred_sp.append([sp_doc, sp_fact_line_num])
                    # if answer in retrieved_doc_dict[sp_doc][sp_fact_line_num]:
                        found_answer = True

        for sp_doc in v5_retrieved_doc:
            if sp_doc not in upperbound_pred_doc:
                if sp_doc in supporting_doc:
                    upperbound_pred_doc.append(sp_doc)
                    for gt_sp_doc, sp_fact_line_num in supporting_facts:
                        if gt_sp_doc == sp_doc:
                            upperbound_pred_sp.append([sp_doc, sp_fact_line_num])
                        # if answer in retrieved_doc_dict[sp_doc][sp_fact_line_num]:
                            found_answer = True


                # upperbound_pred_sp.append([sp_doc, sp_fact_line_num])
                # if answer in retrieved_doc_dict[sp_doc][sp_fact_line_num]:
                #     found_answer = True

        p_answer = answer if found_answer else ""

        upperbound_pred_file['sp'][qid] = upperbound_pred_sp
        upperbound_pred_file['sp_doc'][qid] = upperbound_pred_doc

        upperbound_pred_file['p_answer'][qid] = p_answer

        if all([gt_fact in upperbound_pred_sp for gt_fact in supporting_facts]):
            # If all the evidence was found, also set the answer (this covers yes/no answers).
            upperbound_pred_file['p_answer'][qid] = answer

    ext_hotpot_eval.eval(upperbound_pred_file, dev_fullwiki)