Example #1
def experiment_test_full_wiki():
    multihop_retrieval_top_k = 3
    match_filtering_k = 3
    term_retrieval_top_k = 5

    data_list = common.load_json(config.TEST_FULLWIKI_FILE)
    terms_based_results_list = common.load_jsonl(
        config.RESULT_PATH /
        "doc_retri_results/term_based_methods_results/hotpot_tf_idf_test.jsonl"
    )
    g_score_dict = dict()
    load_from_file(
        g_score_dict, config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    # We need to pass None as the ground-truth data (no gold labels here).
    doc_retri_pred_dict = init_results_v8(
        data_list,
        None,
        terms_based_results_list,
        g_score_dict,
        match_filtering_k=match_filtering_k,
        term_retrieval_top_k=term_retrieval_top_k)

    len_list = []
    for rset in doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results without filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    common.save_json(
        doc_retri_pred_dict,
        "hotpot_test_doc_retrieval_v8_before_multihop_filtering.json")

    # Filtering
    new_doc_retri_pred_dict = results_multihop_filtering(
        doc_retri_pred_dict, multihop_retrieval_top_k=multihop_retrieval_top_k)
    print("Results with filtering:")

    len_list = []
    for rset in new_doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    # ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list)
    common.save_json(new_doc_retri_pred_dict,
                     "hotpot_test_doc_retrieval_v8.json")
Example #2
    def logging_to_file(self, filename):
        if Path(filename).is_file():
            old_logging_list = common.load_json(filename)
            current_saved_key = set()

            for item in self.logging_item_list:
                current_saved_key.add(item['k'])

            for item in old_logging_list:
                if item['k'] not in current_saved_key:
                    raise ValueError("Previous logged item can not be found!")

        common.save_json(self.logging_item_list, filename, indent=2, sort_keys=True)
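# Hedged usage sketch (the owning class and keys shown here are illustrative, not from the source):
#     logger.logging_item_list = [{'k': 'epoch_1|step_500', 'dev_acc': 0.87}]
#     logger.logging_to_file("saves/score_log.json")
# If the target file already exists, every 'k' previously saved in it must still be present in
# logging_item_list; otherwise a ValueError is raised instead of silently overwriting the log.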
Example #3
def build_clean_lemma2tags(conceptnet_en_path, dump_path, num_parallels=20):
    # Build a dict mapping a cleaned lemma string to a list of per-token tags.
    def _process(_conceptnet_id_list):
        nlp = spacy.load("en", disable=["parser", "ner", "textcat"])
        new_lemma_conceptnetid = collections.defaultdict(list)
        for _concept in tqdm(_conceptnet_id_list):
            _proc_concept = _concept.split("/")[3]
            _proc_concept = " ".join(_proc_concept.split("_"))
            doc = nlp(_proc_concept)
            _proc_concept = " ".join([token.lemma_ for token in doc])
            _clean_concept = clean_phrase(_proc_concept)
            if _clean_concept not in new_lemma_conceptnetid:
                _attr_list = [[
                    token.tag_,
                ] for token in nlp(_clean_concept)]
                new_lemma_conceptnetid[_clean_concept] = _attr_list
        return new_lemma_conceptnetid

    concept_set = set()
    for _row in conceptnet_dump_iter(conceptnet_en_path):
        for _concept in _row[2:4]:
            if _concept not in concept_set:
                concept_set.add(_concept)
    concept_list = list(concept_set)

    multi_dict = multiprocessing_map(
        func=_process,
        dict_args_list=[{
            "_conceptnet_id_list": _data
        } for _data in split_to_lists(concept_list, num_parallels)],
        num_parallels=num_parallels)

    final_dict = {}
    for _dict in multi_dict:
        final_dict.update(_dict)

    save_json(final_dict, dump_path)
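# Hypothetical invocation sketch (paths are placeholders, not from the source):
#     build_clean_lemma2tags(
#         conceptnet_en_path="data/conceptnet/conceptnet-assertions-en.csv",
#         dump_path="data/conceptnet/clean_lemma2tags.json",
#         num_parallels=20)
# The dumped JSON maps each cleaned, lemmatized concept phrase to a list of one-element
# lists, each holding the spaCy fine-grained tag (token.tag_) of the corresponding token.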
Example #4
def train(local_rank, args):
    # debug = False
    # print("GPU:", gpu)
    # world_size = args.world_size
    args.global_rank = args.node_rank * args.gpus_per_node + local_rank
    args.local_rank = local_rank
    # args.warmup_steps = 20
    debug_count = 1000
    num_epoch = args.epochs

    actual_train_batch_size = args.world_size * args.per_gpu_train_batch_size * args.gradient_accumulation_steps
    args.actual_train_batch_size = actual_train_batch_size

    set_seed(args.seed)
    num_labels = 3  # we are doing NLI, so num_labels = 3; for other tasks this value can be changed.

    max_length = args.max_length

    model_class_item = MODEL_CLASSES[args.model_class_name]
    model_name = model_class_item['model_name']
    do_lower_case = model_class_item[
        'do_lower_case'] if 'do_lower_case' in model_class_item else False

    tokenizer = model_class_item['tokenizer'].from_pretrained(
        model_name,
        cache_dir=str(config.PRO_ROOT / "trans_cache"),
        do_lower_case=do_lower_case)

    model = model_class_item['sequence_classification'].from_pretrained(
        model_name,
        cache_dir=str(config.PRO_ROOT / "trans_cache"),
        num_labels=num_labels)

    padding_token_value = tokenizer.convert_tokens_to_ids(
        [tokenizer.pad_token])[0]
    padding_segement_value = model_class_item["padding_segement_value"]
    padding_att_value = model_class_item["padding_att_value"]
    left_pad = model_class_item[
        'left_pad'] if 'left_pad' in model_class_item else False

    batch_size_per_gpu_train = args.per_gpu_train_batch_size
    batch_size_per_gpu_eval = args.per_gpu_eval_batch_size

    if not args.cpu and not args.single_gpu:
        dist.init_process_group(backend='nccl',
                                init_method='env://',
                                world_size=args.world_size,
                                rank=args.global_rank)

    train_data_str = args.train_data
    train_data_weights_str = args.train_weights
    eval_data_str = args.eval_data

    train_data_name = []
    train_data_path = []
    train_data_list = []
    train_data_weights = []

    eval_data_name = []
    eval_data_path = []
    eval_data_list = []

    train_data_named_path = train_data_str.split(',')
    weights_str = train_data_weights_str.split(
        ',') if train_data_weights_str is not None else None

    eval_data_named_path = eval_data_str.split(',')

    for named_path in train_data_named_path:
        ind = named_path.find(':')
        name = named_path[:ind]
        path = named_path[ind + 1:]
        if name in registered_path:
            d_list = common.load_jsonl(registered_path[name])
        else:
            d_list = common.load_jsonl(path)

        train_data_name.append(name)
        train_data_path.append(path)

        train_data_list.append(d_list)

    if weights_str is not None:
        for weights in weights_str:
            train_data_weights.append(float(weights))
    else:
        for i in range(len(train_data_list)):
            train_data_weights.append(1)

    for named_path in eval_data_named_path:
        ind = named_path.find(':')
        name = named_path[:ind]
        path = named_path[ind + 1:]
        if name in registered_path:
            d_list = common.load_jsonl(registered_path[name])
        else:
            d_list = common.load_jsonl(path)
        eval_data_name.append(name)
        eval_data_path.append(path)

        eval_data_list.append(d_list)

    assert len(train_data_weights) == len(train_data_list)

    batching_schema = {
        'uid':
        RawFlintField(),
        'y':
        LabelFlintField(),
        'input_ids':
        ArrayIndexFlintField(pad_idx=padding_token_value, left_pad=left_pad),
        'token_type_ids':
        ArrayIndexFlintField(pad_idx=padding_segement_value,
                             left_pad=left_pad),
        'attention_mask':
        ArrayIndexFlintField(pad_idx=padding_att_value, left_pad=left_pad),
    }

    data_transformer = NLITransform(model_name, tokenizer, max_length)
    # data_transformer = NLITransform(model_name, tokenizer, max_length, with_element=True)

    eval_data_loaders = []
    for eval_d_list in eval_data_list:
        d_dataset, d_sampler, d_dataloader = build_eval_dataset_loader_and_sampler(
            eval_d_list, data_transformer, batching_schema,
            batch_size_per_gpu_eval)
        eval_data_loaders.append(d_dataloader)

    # Estimate the training size:
    training_list = []
    for i in range(len(train_data_list)):
        print("Build Training Data ...")
        train_d_list = train_data_list[i]
        train_d_name = train_data_name[i]
        train_d_weight = train_data_weights[i]
        cur_train_list = sample_data_list(
            train_d_list, train_d_weight
        )  # change later  # we can apply different sample strategy here.
        print(
            f"Data Name:{train_d_name}; Weight: {train_d_weight}; "
            f"Original Size: {len(train_d_list)}; Sampled Size: {len(cur_train_list)}"
        )
        training_list.extend(cur_train_list)
    estimated_training_size = len(training_list)
    print("Estimated training size:", estimated_training_size)
    # Estimate the training size ends:

    # t_total = estimated_training_size // args.gradient_accumulation_steps * num_epoch
    t_total = estimated_training_size * num_epoch // args.actual_train_batch_size
    if args.warmup_steps <= 0:  # set warmup steps to 0.1 * total steps if the given value is <= 0.
        args.warmup_steps = int(t_total * 0.1)

    if not args.cpu:
        torch.cuda.set_device(args.local_rank)
        model.cuda(args.local_rank)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    if not args.cpu and not args.single_gpu:
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True)

    args_dict = dict(vars(args))
    file_path_prefix = '.'
    if args.global_rank in [-1, 0]:
        print("Total Steps:", t_total)
        args.total_step = t_total
        print("Warmup Steps:", args.warmup_steps)
        print("Actual Training Batch Size:", actual_train_batch_size)
        print("Arguments", pp.pprint(args))

    # Let's build the logger and log everything before the start of the first training epoch.
    if args.global_rank in [
            -1, 0
    ]:  # only do logging if we use cpu or global_rank=0
        if not args.debug_mode:
            file_path_prefix, date = save_tool.gen_file_prefix(
                f"{args.experiment_name}")
            # # # Create Log File
            # Save the source code.
            script_name = os.path.basename(__file__)
            with open(os.path.join(file_path_prefix, script_name),
                      'w') as out_f, open(__file__, 'r') as it:
                out_f.write(it.read())
                out_f.flush()

            # Save option file
            common.save_json(args_dict,
                             os.path.join(file_path_prefix, "args.json"))
            checkpoints_path = Path(file_path_prefix) / "checkpoints"
            if not checkpoints_path.exists():
                checkpoints_path.mkdir()
            prediction_path = Path(file_path_prefix) / "predictions"
            if not prediction_path.exists():
                prediction_path.mkdir()

    global_step = 0

    # print(f"Global Rank:{args.global_rank} ### ", 'Init!')

    for epoch in tqdm(range(num_epoch),
                      desc="Epoch",
                      disable=args.global_rank not in [-1, 0]):
        # Let's build up training dataset for this epoch
        training_list = []
        for i in range(len(train_data_list)):
            print("Build Training Data ...")
            train_d_list = train_data_list[i]
            train_d_name = train_data_name[i]
            train_d_weight = train_data_weights[i]
            cur_train_list = sample_data_list(
                train_d_list, train_d_weight
            )  # change later  # we can apply different sample strategy here.
            print(
                f"Data Name:{train_d_name}; Weight: {train_d_weight}; "
                f"Original Size: {len(train_d_list)}; Sampled Size: {len(cur_train_list)}"
            )
            training_list.extend(cur_train_list)

        random.shuffle(training_list)
        train_dataset = NLIDataset(training_list, data_transformer)

        train_sampler = SequentialSampler(train_dataset)
        if not args.cpu and not args.single_gpu:
            print("Use distributed sampler.")
            train_sampler = DistributedSampler(train_dataset,
                                               args.world_size,
                                               args.global_rank,
                                               shuffle=True)

        train_dataloader = DataLoader(
            dataset=train_dataset,
            batch_size=batch_size_per_gpu_train,
            shuffle=False,  # shuffling is handled by the sampler
            num_workers=0,
            pin_memory=True,
            sampler=train_sampler,
            collate_fn=BaseBatchBuilder(batching_schema))  #
        # training build finished.

        print(debug_node_info(args), "epoch: ", epoch)

        if not args.cpu and not args.single_gpu:
            train_sampler.set_epoch(
                epoch
            )  # setup the epoch to ensure random sampling at each epoch

        for forward_step, batch in enumerate(
                tqdm(train_dataloader,
                     desc="Iteration",
                     disable=args.global_rank not in [-1, 0]), 0):
            model.train()

            batch = move_to_device(batch, local_rank)
            # print(batch['input_ids'], batch['y'])
            if args.model_class_name in ["distilbert", "bart-large"]:
                outputs = model(batch['input_ids'],
                                attention_mask=batch['attention_mask'],
                                labels=batch['y'])
            else:
                outputs = model(batch['input_ids'],
                                attention_mask=batch['attention_mask'],
                                token_type_ids=batch['token_type_ids'],
                                labels=batch['y'])
            loss, logits = outputs[:2]
            # print(debug_node_info(args), loss, logits, batch['uid'])
            # print(debug_node_info(args), loss, batch['uid'])

            # Accumulated loss
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            # if this forward step need model updates
            # handle fp16
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Gradient clipping: only applied when max_grad_norm > 0.
            if (forward_step + 1) % args.gradient_accumulation_steps == 0:
                if args.max_grad_norm > 0:
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()

                global_step += 1

                if args.global_rank in [
                        -1, 0
                ] and args.eval_frequency > 0 and global_step % args.eval_frequency == 0:
                    r_dict = dict()
                    # Eval loop:
                    for i in range(len(eval_data_name)):
                        cur_eval_data_name = eval_data_name[i]
                        cur_eval_data_list = eval_data_list[i]
                        cur_eval_dataloader = eval_data_loaders[i]
                        # cur_eval_raw_data_list = eval_raw_data_list[i]

                        evaluation_dataset(args,
                                           cur_eval_dataloader,
                                           cur_eval_data_list,
                                           model,
                                           r_dict,
                                           eval_name=cur_eval_data_name)

                    # saving checkpoints
                    current_checkpoint_filename = \
                        f'e({epoch})|i({global_step})'

                    for i in range(len(eval_data_name)):
                        cur_eval_data_name = eval_data_name[i]
                        current_checkpoint_filename += \
                            f'|{cur_eval_data_name}#({round(r_dict[cur_eval_data_name]["acc"], 4)})'

                    if not args.debug_mode:
                        # save model:
                        model_output_dir = checkpoints_path / current_checkpoint_filename
                        if not model_output_dir.exists():
                            model_output_dir.mkdir()
                        model_to_save = (
                            model.module if hasattr(model, "module") else model
                        )  # Take care of distributed/parallel training

                        torch.save(model_to_save.state_dict(),
                                   str(model_output_dir / "model.pt"))
                        torch.save(optimizer.state_dict(),
                                   str(model_output_dir / "optimizer.pt"))
                        torch.save(scheduler.state_dict(),
                                   str(model_output_dir / "scheduler.pt"))

                    # save prediction:
                    if not args.debug_mode and args.save_prediction:
                        cur_results_path = prediction_path / current_checkpoint_filename
                        if not cur_results_path.exists():
                            cur_results_path.mkdir(parents=True)
                        for key, item in r_dict.items():
                            common.save_jsonl(
                                item['predictions'],
                                cur_results_path / f"{key}.jsonl")

                        # avoid saving too many things
                        for key, item in r_dict.items():
                            del r_dict[key]['predictions']
                        common.save_json(r_dict,
                                         cur_results_path /
                                         "results_dict.json",
                                         indent=2)

        # End of epoch evaluation.
        if args.global_rank in [-1, 0]:
            r_dict = dict()
            # Eval loop:
            for i in range(len(eval_data_name)):
                cur_eval_data_name = eval_data_name[i]
                cur_eval_data_list = eval_data_list[i]
                cur_eval_dataloader = eval_data_loaders[i]
                # cur_eval_raw_data_list = eval_raw_data_list[i]

                evaluation_dataset(args,
                                   cur_eval_dataloader,
                                   cur_eval_data_list,
                                   model,
                                   r_dict,
                                   eval_name=cur_eval_data_name)

            # saving checkpoints
            current_checkpoint_filename = \
                f'e({epoch})|i({global_step})'

            for i in range(len(eval_data_name)):
                cur_eval_data_name = eval_data_name[i]
                current_checkpoint_filename += \
                    f'|{cur_eval_data_name}#({round(r_dict[cur_eval_data_name]["acc"], 4)})'

            if not args.debug_mode:
                # save model:
                model_output_dir = checkpoints_path / current_checkpoint_filename
                if not model_output_dir.exists():
                    model_output_dir.mkdir()
                model_to_save = (
                    model.module if hasattr(model, "module") else model
                )  # Take care of distributed/parallel training

                torch.save(model_to_save.state_dict(),
                           str(model_output_dir / "model.pt"))
                torch.save(optimizer.state_dict(),
                           str(model_output_dir / "optimizer.pt"))
                torch.save(scheduler.state_dict(),
                           str(model_output_dir / "scheduler.pt"))

            # save prediction:
            if not args.debug_mode and args.save_prediction:
                cur_results_path = prediction_path / current_checkpoint_filename
                if not cur_results_path.exists():
                    cur_results_path.mkdir(parents=True)
                for key, item in r_dict.items():
                    common.save_jsonl(item['predictions'],
                                      cur_results_path / f"{key}.jsonl")

                # avoid saving too many things
                for key, item in r_dict.items():
                    del r_dict[key]['predictions']
                common.save_json(r_dict,
                                 cur_results_path / "results_dict.json",
                                 indent=2)
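# A minimal launcher sketch for the train() function above; the field names mirror those
# referenced inside train() (gpus_per_node, node_rank, world_size), while `num_nodes` and the
# master address/port values are assumptions for illustration only.
import os
import torch.multiprocessing as mp

def launch(args):
    # The env:// initialization inside train() expects these to be set on every node.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    args.world_size = args.gpus_per_node * args.num_nodes  # assumed field: num_nodes
    # Spawn one training process per local GPU; each receives its local_rank as the first argument.
    mp.spawn(train, args=(args,), nprocs=args.gpus_per_node)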
Example #5
def eval_model(model_path, data_file=None, filter_value=0.5):
    seed = 12
    torch.manual_seed(seed)

    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    bert_model_name = "bert-base-uncased"
    lazy = False
    forward_size = 16
    batch_size = 32

    do_lower_case = True

    debug = False

    max_pre_context_length = 320
    max_query_length = 64
    doc_stride = 128
    qa_num_of_layer = 2
    s_filter_value = filter_value
    s_top_k = 5

    tag = 'dev'

    print("Potential total length:",
          max_pre_context_length + max_query_length + 3)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                              do_lower_case=do_lower_case,
                                              cache_dir=bert_pretrain_path)

    # Load Dataset.
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    test_list = common.load_json(config.TEST_FULLWIKI_FILE)
    train_list = common.load_json(config.TRAIN_FILE)

    if data_file is None:
        dev_sentence_level_results = common.load_jsonl(
            config.PRO_ROOT /
            "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/dev_s_level_bert_v1_results.jsonl"
        )
    else:
        dev_sentence_level_results = common.load_jsonl(data_file)

    test_sentence_level_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/test_s_level_bert_v1_results.jsonl"
    )

    train_sentence_level_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/train_s_level_bert_v1_results.jsonl"
    )

    dev_fitem_dict, dev_fitem_list, dev_sp_results_dict = get_qa_item_with_upstream_sentence(
        dev_list,
        dev_sentence_level_results,
        is_training=False,
        tokenizer=tokenizer,
        max_context_length=max_pre_context_length,
        max_query_length=max_query_length,
        filter_value=s_filter_value,
        doc_stride=doc_stride,
        top_k=s_top_k,
        debug_mode=debug)

    test_fitem_dict, test_fitem_list, test_sp_results_dict = get_qa_item_with_upstream_sentence(
        test_list,
        test_sentence_level_results,
        is_training=False,
        tokenizer=tokenizer,
        max_context_length=max_pre_context_length,
        max_query_length=max_query_length,
        filter_value=s_filter_value,
        doc_stride=doc_stride,
        top_k=s_top_k,
        debug_mode=debug)

    # train_fitem_dict, train_fitem_list, _ = get_qa_item_with_upstream_sentence(train_list, train_sentence_level_results,
    #                                                                            is_training=True,
    #                                                                            tokenizer=tokenizer,
    #                                                                            max_context_length=max_pre_context_length,
    #                                                                            max_query_length=max_query_length,
    #                                                                            filter_value=s_filter_value,
    #                                                                            doc_stride=doc_stride,
    #                                                                            top_k=s_top_k,
    #                                                                            debug_mode=debug)

    if debug:
        dev_list = dev_list[:100]

    span_pred_reader = BertPairedSpanPredReader(bert_tokenizer=tokenizer,
                                                lazy=lazy,
                                                example_filter=None)
    bert_encoder = BertModel.from_pretrained(bert_model_name,
                                             cache_dir=bert_pretrain_path)
    model = BertSpan(bert_encoder, qa_num_of_layer)

    model.load_state_dict(torch.load(model_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    iterator = BasicIterator(batch_size=batch_size)

    if tag == 'dev':
        dev_instances = span_pred_reader.read(dev_fitem_list)
        # test_instances = span_pred_reader.read(test_fitem_list)
        eval_iter = iterator(dev_instances, num_epochs=1, shuffle=False)
        # eval_iter = iterator(test_instances, num_epochs=1, shuffle=False)

        cur_eitem_list, cur_eval_dict = span_eval(model,
                                                  eval_iter,
                                                  do_lower_case,
                                                  dev_fitem_dict,
                                                  device_num,
                                                  show_progress=True,
                                                  pred_no_answer=True)
        # cur_eitem_list, cur_eval_dict = span_eval(model, eval_iter, do_lower_case, test_fitem_dict,
        #                                           device_num, show_progress=True)

        cur_results_dict = dict()
        cur_results_dict['answer'] = cur_eval_dict
        cur_results_dict['sp'] = dev_sp_results_dict
        # cur_results_dict['sp'] = test_sp_results_dict

        # common.save_json(cur_results_dict, f"{tag}_qa_sp_results_{filter_value}_doctopk_5.json")

        cur_results_dict['p_answer'] = cur_eval_dict
        _, metrics = ext_hotpot_eval.eval(cur_results_dict,
                                          dev_list,
                                          verbose=False)
        # _, metrics = ext_hotpot_eval.eval(cur_results_dict, test_list, verbose=False)

        logging_item = {
            'score': metrics,
        }

        print(data_file)
        print(logging_item)

    elif tag == 'test':
        # dev_instances = span_pred_reader.read(dev_fitem_list)
        test_instances = span_pred_reader.read(test_fitem_list)
        # eval_iter = iterator(dev_instances, num_epochs=1, shuffle=False)
        eval_iter = iterator(test_instances, num_epochs=1, shuffle=False)

        # cur_eitem_list, cur_eval_dict = span_eval(model, eval_iter, do_lower_case, dev_fitem_dict,
        #                                           device_num, show_progress=True)
        cur_eitem_list, cur_eval_dict = span_eval(model,
                                                  eval_iter,
                                                  do_lower_case,
                                                  test_fitem_dict,
                                                  device_num,
                                                  show_progress=True)

        cur_results_dict = dict()
        cur_results_dict['answer'] = cur_eval_dict
        # cur_results_dict['sp'] = dev_sp_results_dict
        cur_results_dict['sp'] = test_sp_results_dict

        common.save_json(cur_results_dict, f"{tag}_qa_sp_results.json")

        cur_results_dict['p_answer'] = cur_eval_dict
        # _, metrics = ext_hotpot_eval.eval(cur_results_dict, dev_list, verbose=False)
        _, metrics = ext_hotpot_eval.eval(cur_results_dict,
                                          test_list,
                                          verbose=False)

        logging_item = {
            'score': metrics,
        }

        print(logging_item)
Example #6
def evaluation():
    parser = argparse.ArgumentParser()
    parser.add_argument("--cpu",
                        action="store_true",
                        help="If set, we only use CPU.")
    parser.add_argument("--model_class_name",
                        type=str,
                        help="Set the model class of the experiment.",
                        required=True)

    parser.add_argument("--model_checkpoint_path",
                        type=str,
                        help='Set the path of the model checkpoint to load.',
                        required=True)

    parser.add_argument("--output_prediction_path",
                        type=str,
                        default=None,
                        help='Set the path to save the prediction.')

    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=16,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )

    parser.add_argument("--max_length",
                        default=156,
                        type=int,
                        help="Max length of the sequences.")

    parser.add_argument("--eval_data",
                        type=str,
                        help="The training data used in the experiments.")

    args = parser.parse_args()

    if args.cpu:
        args.global_rank = -1
    else:
        args.global_rank = 0

    model_checkpoint_path = args.model_checkpoint_path
    num_labels = 3  # we are doing NLI, so num_labels = 3; for other tasks this value can be changed.

    max_length = args.max_length

    model_class_item = MODEL_CLASSES[args.model_class_name]
    model_name = model_class_item['model_name']
    do_lower_case = model_class_item[
        'do_lower_case'] if 'do_lower_case' in model_class_item else False

    tokenizer = model_class_item['tokenizer'].from_pretrained(
        model_name,
        cache_dir=str(config.PRO_ROOT / "trans_cache"),
        do_lower_case=do_lower_case)

    model = model_class_item['sequence_classification'].from_pretrained(
        model_name,
        cache_dir=str(config.PRO_ROOT / "trans_cache"),
        num_labels=num_labels)

    model.load_state_dict(torch.load(model_checkpoint_path))

    padding_token_value = tokenizer.convert_tokens_to_ids(
        [tokenizer.pad_token])[0]
    padding_segement_value = model_class_item["padding_segement_value"]
    padding_att_value = model_class_item["padding_att_value"]
    left_pad = model_class_item[
        'left_pad'] if 'left_pad' in model_class_item else False

    batch_size_per_gpu_eval = args.per_gpu_eval_batch_size

    eval_data_str = args.eval_data
    eval_data_name = []
    eval_data_path = []
    eval_data_list = []

    eval_data_named_path = eval_data_str.split(',')

    for named_path in eval_data_named_path:
        ind = named_path.find(':')
        name = named_path[:ind]
        path = named_path[ind + 1:]
        if name in registered_path:
            d_list = common.load_jsonl(registered_path[name])
        else:
            d_list = common.load_jsonl(path)
        eval_data_name.append(name)
        eval_data_path.append(path)

        eval_data_list.append(d_list)

    batching_schema = {
        'uid':
        RawFlintField(),
        'y':
        LabelFlintField(),
        'input_ids':
        ArrayIndexFlintField(pad_idx=padding_token_value, left_pad=left_pad),
        'token_type_ids':
        ArrayIndexFlintField(pad_idx=padding_segement_value,
                             left_pad=left_pad),
        'attention_mask':
        ArrayIndexFlintField(pad_idx=padding_att_value, left_pad=left_pad),
    }

    data_transformer = NLITransform(model_name, tokenizer, max_length)
    eval_data_loaders = []
    for eval_d_list in eval_data_list:
        d_dataset, d_sampler, d_dataloader = build_eval_dataset_loader_and_sampler(
            eval_d_list, data_transformer, batching_schema,
            batch_size_per_gpu_eval)
        eval_data_loaders.append(d_dataloader)

    if not args.cpu:
        torch.cuda.set_device(0)
        model.cuda(0)

    r_dict = dict()
    # Eval loop:
    for i in range(len(eval_data_name)):
        cur_eval_data_name = eval_data_name[i]
        cur_eval_data_list = eval_data_list[i]
        cur_eval_dataloader = eval_data_loaders[i]
        # cur_eval_raw_data_list = eval_raw_data_list[i]

        evaluation_dataset(args,
                           cur_eval_dataloader,
                           cur_eval_data_list,
                           model,
                           r_dict,
                           eval_name=cur_eval_data_name)

    # save prediction:
    if args.output_prediction_path is not None:
        cur_results_path = Path(args.output_prediction_path)
        if not cur_results_path.exists():
            cur_results_path.mkdir(parents=True)
        for key, item in r_dict.items():
            common.save_jsonl(item['predictions'],
                              cur_results_path / f"{key}.jsonl")

        # avoid saving too many things
        for key, item in r_dict.items():
            del r_dict[key]['predictions']
        common.save_json(r_dict,
                         cur_results_path / "results_dict.json",
                         indent=2)
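# Hedged command-line sketch for the evaluation() entry point above. The script name, model
# class name, checkpoint path and data name/path are placeholders; --eval_data takes
# comma-separated "name:path" entries, and a name found in registered_path is resolved there.
#     python evaluation.py \
#         --model_class_name roberta-large \
#         --model_checkpoint_path checkpoints/model.pt \
#         --eval_data snli_dev:data/snli_dev.jsonl \
#         --output_prediction_path predictions/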
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_class",
                        default="roberta",
                        type=str,
                        help="model class, one of [bert, roberta]")
    parser.add_argument("--dataset", type=str, default="wn18rr")
    parser.add_argument("--num_workers", default=12, type=int)
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument("--data_dir", default=None, type=str)
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )
    parser.add_argument("--neg_weights", default=None, type=str)

    # extra parameters for prediction
    parser.add_argument("--no_verbose", action="store_true")
    parser.add_argument("--collect_prediction", action="store_true")
    parser.add_argument("--prediction_part", default="0,1", type=str)

    ## Other parameters
    define_hparams_training(parser)
    args = parser.parse_args()

    data_dir = args.data_dir or kgbert_data_dir

    # setup
    setup_prerequisite(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    if args.model_class == "roberta":
        config_class = RobertaConfig
        tokenizer_class = RobertaTokenizer
        model_class = RobertaForSequenceClassification
    elif args.model_class == "bert":
        config_class = BertConfig
        tokenizer_class = BertTokenizer
        model_class = BertForSequenceClassification
    else:
        raise KeyError(args.model_class)

    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=2)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.model_name_or_path, config=config)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab
    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Dataset
    neg_weights = [1., 1., 0.] if args.neg_weights is None else [
        float(_e) for _e in args.neg_weights.split(",")
    ]
    assert len(neg_weights) == 3 and sum(neg_weights) > 0

    train_dataset = LinkPredictionDataset(args.dataset,
                                          "train",
                                          None,
                                          data_dir,
                                          args.model_class,
                                          tokenizer,
                                          args.do_lower_case,
                                          args.max_seq_length,
                                          neg_times=5,
                                          neg_weights=neg_weights)
    dev_dataset = LinkPredictionDataset(
        args.dataset,
        "dev",
        None,
        data_dir,
        args.model_class,
        tokenizer,
        args.do_lower_case,
        args.max_seq_length,
    )
    test_dataset = LinkPredictionDataset(
        args.dataset,
        "test",
        None,
        data_dir,
        args.model_class,
        tokenizer,
        args.do_lower_case,
        args.max_seq_length,
    )

    if args.do_train:
        train(args, train_dataset, model, tokenizer, eval_dataset=dev_dataset)

    if args.do_train and (args.do_eval
                          or args.do_prediction):  # load the best model
        model = model_class.from_pretrained(args.output_dir, config=config)
        model.to(args.device)

    if not args.do_train and args.do_eval:
        pass

    if args.fp16:
        model = setup_eval_model_for_fp16(args, model)

    dataset_list = [train_dataset, dev_dataset, test_dataset]

    if not args.do_train and args.do_prediction:
        path_template = join(args.output_dir, "tuple_ranks_{},{}.json")
        part_param = args.prediction_part.split(",")
        part_param = [int(_e) for _e in part_param]
        assert len(part_param) == 2 and part_param[1] > part_param[0] >= 0
        cur_part_idx, num_parts = part_param

        if args.collect_prediction:
            tuple_ranks_list = []
            for _idx in range(num_parts):
                tuple_ranks_list.append(
                    load_json(path_template.format(_idx, num_parts)))
            tuple_ranks = combine_from_lists(tuple_ranks_list, ordered=True)
            output_str = calculate_metrics_for_link_prediction(tuple_ranks)
            with open(join(args.output_dir, "link_prediction_metrics.txt"),
                      "w",
                      encoding="utf-8") as fp:
                fp.write(output_str)
        else:
            test_raw_examples = test_dataset.raw_examples
            # part
            tgt_raw_examples = [
                _ex for _idx, _ex in enumerate(test_raw_examples)
                if _idx % num_parts == cur_part_idx
            ]
            # evaluate(args, test_dataset, model, tokenizer, None, "test_")
            tuple_ranks = predict(args,
                                  tgt_raw_examples,
                                  dataset_list,
                                  model,
                                  verbose=(not args.no_verbose))
            calculate_metrics_for_link_prediction(tuple_ranks, verbose=True)
            save_json(tuple_ranks,
                      path_template.format(cur_part_idx, num_parts))
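# Note on the prediction sharding above (example values are illustrative): --prediction_part
# encodes "cur_part_idx,num_parts", and test examples are split round-robin via
# `_idx % num_parts == cur_part_idx`. For instance, --prediction_part 1,4 handles test examples
# 1, 5, 9, ... and writes tuple_ranks_1,4.json; a later run with --collect_prediction merges
# all num_parts files and writes link_prediction_metrics.txt.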
Example #8
def eval_model_for_downstream_ablation(model_saved_path,
                                       doc_top_k=2,
                                       tag='dev'):
    print(f"Run doc_top_k:{doc_top_k}")
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    seed = 12
    torch.manual_seed(seed)
    bert_model_name = 'bert-base-uncased'
    # lazy = False
    lazy = True
    # forward_size = 256
    forward_size = 256
    # batch_size = 64
    batch_size = 128
    do_lower_case = True
    document_top_k = doc_top_k

    debug_mode = False
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    test_list = common.load_json(config.TEST_FULLWIKI_FILE)

    # Load train eval results list
    # cur_train_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
    #                       "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/train_p_level_bert_v1_results.jsonl")

    cur_dev_eval_results_list = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
        "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/dev_p_level_bert_v1_results.jsonl"
    )

    # cur_test_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_paragraph_level/04-10-17:44:54_hotpot_v0_cs/"
    #                       "i(40000)|e(4)|t5_doc_recall(0.8793382849426064)|t5_sp_recall(0.879496479212887)|t10_doc_recall(0.888656313301823)|t5_sp_recall(0.8888325134240054)|seed(12)/test_p_level_bert_v1_results.jsonl")

    # if tag == 'train':
    #     train_fitems = get_sentence_pair(document_top_k, train_list, cur_train_eval_results_list, is_training=True,
    #                                      debug_mode=debug_mode)
    if tag == 'dev':
        dev_fitems = get_sentence_pair(document_top_k,
                                       dev_list,
                                       cur_dev_eval_results_list,
                                       is_training=False,
                                       debug_mode=debug_mode)

    # elif tag == 'test':
    #     test_fitems = get_sentence_pair(document_top_k, test_list, cur_test_eval_results_list, is_training=False,
    #                                     debug_mode=debug_mode)

    if debug_mode:
        eval_frequency = 2

    #     dev_list = dev_list[:10]
    #     dev_fitems_list = dev_fitems_list[:296]
    #     train_fitems_list = train_fitems_list[:300]
    # print(dev_list[-1]['_id'])
    # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, '_id')

    bert_tokenizer = BertTokenizer.from_pretrained(
        bert_model_name,
        do_lower_case=do_lower_case,
        cache_dir=bert_pretrain_path)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=128,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name,
                                             cache_dir=bert_pretrain_path)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    if tag == 'train':
        train_instance = bert_cs_reader.read(train_fitems)
    elif tag == 'dev':
        dev_instances = bert_cs_reader.read(dev_fitems)
    elif tag == 'test':
        test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    if tag == 'train':
        train_iter = biterator(train_instance, num_epochs=1, shuffle=False)
        print(len(train_fitems))
    elif tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
        print(len(dev_fitems))
    elif tag == 'test':
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)
        print(len(test_fitems))

    print("Forward size:", forward_size)

    if tag == 'train':
        cur_train_eval_results_list_out = eval_model(model,
                                                     train_iter,
                                                     device_num,
                                                     with_probs=True,
                                                     show_progress=True)
        common.save_jsonl(
            cur_train_eval_results_list_out, config.PRO_ROOT /
            "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/train_s_level_bert_v1_results.jsonl"
        )
    elif tag == 'dev':
        cur_dev_eval_results_list_out = eval_model(model,
                                                   dev_iter,
                                                   device_num,
                                                   with_probs=True,
                                                   show_progress=True)
        common.save_jsonl(
            cur_dev_eval_results_list_out,
            f"hotpot_s_level_{tag}_results_top_k_doc_{document_top_k}.jsonl")

    elif tag == 'test':
        cur_test_eval_results_list_out = eval_model(model,
                                                    test_iter,
                                                    device_num,
                                                    with_probs=True,
                                                    show_progress=True)
        common.save_jsonl(
            cur_test_eval_results_list_out, config.PRO_ROOT /
            "data/p_hotpotqa/hotpotqa_sentence_level/04-19-02:17:11_hotpot_v0_slevel_retri_(doc_top_k:2)/i(12000)|e(2)|v02_f1(0.7153646038858843)|v02_recall(0.7114645831323757)|v05_f1(0.7153646038858843)|v05_recall(0.7114645831323757)|seed(12)/test_s_level_bert_v1_results.jsonl"
        )

    if tag == 'train' or tag == 'test':
        exit(0)

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_dev_eval_results_list_out,
        copied_dev_o_dict,
        'qid',
        'fid',
        check=True)
    # 0.5
    cur_results_dict_v05 = select_top_k_and_to_results_dict(
        copied_dev_o_dict,
        top_k=5,
        score_field_name='prob',
        filter_value=0.5,
        result_field='sp')

    cur_results_dict_v02 = select_top_k_and_to_results_dict(
        copied_dev_o_dict,
        top_k=5,
        score_field_name='prob',
        filter_value=0.2,
        result_field='sp')

    _, metrics_v5 = ext_hotpot_eval.eval(cur_results_dict_v05,
                                         dev_list,
                                         verbose=False)

    _, metrics_v2 = ext_hotpot_eval.eval(cur_results_dict_v02,
                                         dev_list,
                                         verbose=False)

    logging_item = {
        'v02': metrics_v2,
        'v05': metrics_v5,
    }

    print(logging_item)
    f1 = metrics_v5['sp_f1']
    em = metrics_v5['sp_em']
    pr = metrics_v5['sp_prec']
    rec = metrics_v5['sp_recall']
    common.save_json(
        logging_item,
        f"top_k_doc:{document_top_k}_em:{em}_pr:{pr}_rec:{rec}_f1:{f1}")
Example #9
def eval_model_for_downstream_ablation(model_saved_path, top_k_doc):
    bert_model_name = 'bert-base-uncased'
    lazy = True
    # lazy = True
    forward_size = 128
    # batch_size = 64
    # batch_size = 128
    do_lower_case = True

    debug_mode = False
    max_l = 128
    # est_datasize = 900_000
    tag = 'dev'

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    train_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_train_results.jsonl"
    )

    dev_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_dev_results.jsonl"
    )

    test_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_test_results.jsonl"
    )

    train_list = common.load_jsonl(config.FEVER_TRAIN)
    dev_list = common.load_jsonl(config.FEVER_DEV)
    test_list = common.load_jsonl(config.FEVER_TEST)
    # dev_list = common.load_jsonl(config.FEVER_DEV)

    if tag == 'dev':
        dev_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'dev',
            dev_upstream_doc_results,
            is_training=False,
            debug=debug_mode,
            ignore_non_verifiable=False,
            top_k=top_k_doc,
            filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(dev_fitems, None)
    elif tag == 'train':
        train_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'train',
            train_upstream_doc_results,
            is_training=True,
            debug=debug_mode,
            ignore_non_verifiable=False,
            top_k=top_k_doc,
            filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(train_fitems, None)
    elif tag == 'test':
        test_fitems = fever_s_level_sampler.get_sentence_forward_pair(
            'test',
            test_upstream_doc_results,
            is_training=False,
            debug=debug_mode,
            ignore_non_verifiable=False,
            top_k=top_k_doc,
            filter_value=0.00000)
        fever_p_level_sampler.down_sample_neg(test_fitems, None)

    # Just to show the information

    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    test_o_dict = list_dict_data_tool.list_to_dict(test_list, 'id')
    train_o_dict = list_dict_data_tool.list_to_dict(train_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=max_l,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.load_state_dict(torch.load(model_saved_path))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    if tag == 'dev':
        dev_instances = bert_cs_reader.read(dev_fitems)

        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model,
                                           dev_iter,
                                           device_num,
                                           make_int=True,
                                           with_probs=True,
                                           show_progress=True)

        common.save_jsonl(
            cur_eval_results_list,
            f"fever_s_level_{tag}_results_top_k_doc_{top_k_doc}.jsonl")

        copied_dev_o_dict = copy.deepcopy(dev_o_dict)
        copied_dev_d_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list, copied_dev_o_dict, 'qid', 'fid', check=True)

        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
            copied_dev_o_dict,
            score_field_name='prob',
            top_k=5,
            filter_value=0.2,
            result_field='predicted_evidence')

        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
            copied_dev_d_list, cur_results_dict_th0_5, 'id',
            'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}

        strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(
            copied_dev_d_list, dev_list, max_evidence=5)
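        # fever_sent_only presumably scores evidence retrieval only (no label prediction),
        # counting at most 5 predicted evidence sentences per claim.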
        score_05 = {
            'top_k_doc': top_k_doc,
            'ss': strict_score,
            'pr': pr,
            'rec': rec,
            'f1': f1,
        }

        print("Top_k doc:", top_k_doc)
        print(score_05)
        common.save_json(
            score_05,
            f"top_k_doc:{top_k_doc}_ss:{strict_score}_pr:{pr}_rec:{rec}_f1:{f1}.json"
        )

    elif tag == 'test':
        test_instances = bert_cs_reader.read(test_fitems)

        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model,
                                           test_iter,
                                           device_num,
                                           make_int=True,
                                           with_probs=True,
                                           show_progress=True)

        common.save_jsonl(cur_eval_results_list,
                          f"fever_s_level_{tag}_results.jsonl")

        # copied_test_o_dict = copy.deepcopy(test_o_dict)
        # copied_test_d_list = copy.deepcopy(test_list)
        # list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_test_o_dict,
        #                                                       'qid', 'fid', check=True)
        #
        # cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_test_o_dict,
        #                                                           score_field_name='prob',
        #                                                           top_k=5, filter_value=0.5,
        #                                                           result_field='predicted_evidence')
        #
        # list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_test_d_list,
        #                                                                cur_results_dict_th0_5,
        #                                                                'id', 'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}

        # copied_train_o_dict = copy.deepcopy(train_o_dict)
        # copied_train_d_list = copy.deepcopy(train_list)
        # list_dict_data_tool.append_subfield_from_list_to_dict(cur_eval_results_list, copied_train_o_dict,
        #                                                       'qid', 'fid', check=True)
        #
        # cur_results_dict_th0_5 = select_top_k_and_to_results_dict(copied_train_o_dict,
        #                                                           score_field_name='prob',
        #                                                           top_k=5, filter_value=0.5,
        #                                                           result_field='predicted_evidence')
        #
        # list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(copied_train_d_list,
        #                                                                cur_results_dict_th0_5,
        #                                                                'id', 'predicted_evidence')
        # # mode = {'standard': False, 'check_doc_id_correct': True}
        # strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(copied_train_d_list, train_list,
        #                                                          max_evidence=5)
        # score_05 = {
        #     'ss': strict_score,
        #     'pr': pr, 'rec': rec, 'f1': f1,
        # }
        #
        # print(score_05)
    elif tag == 'train':
        train_instances = bert_cs_reader.read(train_fitems)

        train_iter = biterator(train_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model,
                                           train_iter,
                                           device_num,
                                           make_int=True,
                                           with_probs=True,
                                           show_progress=True)

        common.save_jsonl(cur_eval_results_list,
                          f"fever_s_level_{tag}_results.jsonl")

        copied_train_o_dict = copy.deepcopy(train_o_dict)
        copied_train_d_list = copy.deepcopy(train_list)
        list_dict_data_tool.append_subfield_from_list_to_dict(
            cur_eval_results_list,
            copied_train_o_dict,
            'qid',
            'fid',
            check=True)

        cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
            copied_train_o_dict,
            score_field_name='prob',
            top_k=5,
            filter_value=0.5,
            result_field='predicted_evidence')

        list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
            copied_train_d_list, cur_results_dict_th0_5, 'id',
            'predicted_evidence')
        # mode = {'standard': False, 'check_doc_id_correct': True}
        strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(
            copied_train_d_list, train_list, max_evidence=5)
        score_05 = {
            'ss': strict_score,
            'pr': pr,
            'rec': rec,
            'f1': f1,
        }

        print(score_05)
Example #10
def experiment_train_full_wiki():
    multihop_retrieval_top_k = 3
    match_filtering_k = 3
    term_retrieval_top_k = 5
    multihop_strict_mode = True
    debug_mode = None

    # data_list = common.load_json(config.DEV_FULLWIKI_FILE)
    data_list = common.load_json(config.TRAIN_FILE)

    if debug_mode is not None:
        data_list = data_list[:debug_mode]

    terms_based_results_list = common.load_jsonl(
        config.RESULT_PATH /
        "doc_retri_results/term_based_methods_results/hotpot_tf_idf_train.jsonl"
    )

    g_score_dict = dict()
    load_from_file(
        g_score_dict, config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    doc_retri_pred_dict = init_results_v8(
        data_list,
        data_list,
        terms_based_results_list,
        g_score_dict,
        match_filtering_k=match_filtering_k,
        term_retrieval_top_k=term_retrieval_top_k)
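    # data_list is passed both as the questions to process and, presumably, as the
    # ground-truth reference, since gold supporting facts are available for the training split.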

    len_list = []
    for rset in doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results without filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    # common.save_json(doc_retri_pred_dict, f"hotpot_doc_retrieval_v8_before_multihop_filtering_{debug_mode}.json")
    common.save_json(
        doc_retri_pred_dict,
        "hotpot_train_doc_retrieval_v8_before_multihop_filtering.json")

    # Filtering
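    # results_multihop_filtering presumably prunes each question's retrieved set, keeping
    # at most multihop_retrieval_top_k documents reached via the second hop; strict_mode
    # tightens that pruning further.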
    new_doc_retri_pred_dict = results_multihop_filtering(
        doc_retri_pred_dict,
        multihop_retrieval_top_k=multihop_retrieval_top_k,
        strict_mode=multihop_strict_mode)
    print("Results with filtering:")

    len_list = []
    for rset in new_doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list)
    # common.save_json(new_doc_retri_pred_dict, f"hotpot_doc_retrieval_v8_{debug_mode}.json")
    common.save_json(new_doc_retri_pred_dict,
                     "hotpot_train_doc_retrieval_v8.json")
Example #11
def model_eval_ablation(model_path, filter_value=0.2, top_k_sent=5):
    bert_model_name = 'bert-base-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'

    lazy = False
    forward_size = 32
    do_lower_case = True
    pair_order = 'cq'
    debug_mode = False

    maxout_model = False

    num_class = 3

    tag = 'dev'
    exp = 'no_re_train'
    print("Filter value:", filter_value)
    print("top_k_sent:", top_k_sent)
    train_sent_filtering_prob = 0.2
    dev_sent_filtering_prob = filter_value
    test_sent_filtering_prob = 0.2

    # Dataset and upstream sentence-level retrieval results.
    dev_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl")
    # train_sent_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_train_results.jsonl")
    test_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_test_results.jsonl")

    dev_fitems, dev_list = get_nli_pair('dev', is_training=False,
                                        sent_level_results_list=dev_sent_results_list, debug=debug_mode,
                                        sent_top_k=top_k_sent, sent_filter_value=dev_sent_filtering_prob)
    # train_fitems, train_list = get_nli_pair('train', is_training=True,
    #                                         sent_level_results_list=train_sent_results_list, debug=debug_mode,
    #                                         sent_top_k=5, sent_filter_value=train_sent_filtering_prob)
    test_fitems, test_list = get_nli_pair('test', is_training=False,
                                          sent_level_results_list=test_sent_results_list, debug=debug_mode,
                                          sent_top_k=top_k_sent, sent_filter_value=test_sent_filtering_prob)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')
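    # Label namespace: the three FEVER classes plus a 'hidden' placeholder pinned to
    # index -2, presumably serving as a dummy label for unlabeled examples.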

    if debug_mode:
        dev_list = dev_list[:100]
        # train_list = train_list[:100]
        test_list = test_list[:100]
        eval_frequency = 2

    # est_datasize = len(train_fitems)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64,
                                        example_filter=None, max_l=384, pair_order=pair_order)
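    # The NLI reader pairs the retrieved evidence with the claim (pair_order='cq'
    # presumably places the context before the claim), capping the claim at 64 tokens
    # and the whole input pair at 384 tokens.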

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    if not maxout_model:
        model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                                act_type='tanh', use_pretrained_pooler=True, use_sigmoid=False)
    else:
        model = BertPairMaxOutMatcher(bert_encoder, num_of_class=num_class, act_type="gelu", num_of_out_layers=2)

    model.load_state_dict(torch.load(model_path))

    dev_instances = bert_cs_reader.read(dev_fitems)
    # train_instances = bert_cs_reader.read(train_fitems)
    test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)
        common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{dev_sent_filtering_prob}_{exp}.jsonl")

        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_dev_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                          'id', 'predicted_label')

        common.save_jsonl(copied_dev_list, f"nli_{tag}_cp_results_th{dev_sent_filtering_prob}_{exp}.jsonl")
        mode = {'standard': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                        mode=mode, max_evidence=5)
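        # Standard FEVER scoring: the strict score requires both the predicted label to be
        # correct and the gold evidence to be fully covered within the top 5 predicted
        # sentences; acc_score is label accuracy alone.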
        logging_item = {
            'ss': strict_score, 'ac': acc_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }

        print(logging_item)
        common.save_json(logging_item,
                         f"nli_th{dev_sent_filtering_prob}_{exp}_ss:{strict_score}_ac:{acc_score}_pr:{pr}_rec:{rec}_f1:{f1}.json")

    elif tag == 'test':
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)

        common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{test_sent_filtering_prob}.jsonl")

        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_test_list = copy.deepcopy(test_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_test_list, ema_results_dict,
                                                          'id', 'predicted_label')

        common.save_jsonl(copied_test_list, f"nli_{tag}_cp_results_th{test_sent_filtering_prob}.jsonl")