Пример #1
0
def load_and_cache_examples(args, tokenizer, evaluate=False):
    """Return a ``TensorDataset`` of features for the train or dev split.

    Features are read with ``torch.load`` from a cache file under
    ``args.cache_dir`` when that file exists.  Otherwise they are built from
    the examples in ``args.data_dir`` via ``convert_examples_to_features``
    and, when ``args.local_rank`` is -1 or 0, written to the cache file.

    Args:
        args: namespace providing ``cache_dir``, ``model_name_or_path``,
            ``max_seq_length``, ``data_dir``, ``model_type`` and
            ``local_rank``.
        tokenizer: tokenizer passed through to the feature converter; its
            ``cls_token`` and ``sep_token`` attributes are read.
        evaluate: when true the dev examples are used, otherwise the train
            examples.

    Returns:
        ``TensorDataset(input_ids, input_mask, segment_ids, label_ids)``
        where every tensor has dtype ``torch.long``.
    """
    processor = DataProcessor()

    # The cache file name encodes split, last path component of the model
    # name and the maximum sequence length.
    cache_name = 'cached_{}_{}_{}'.format(
        'dev' if evaluate else 'train',
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length))
    cached_features_file = os.path.join(args.cache_dir, cache_name)

    if not os.path.exists(cached_features_file):
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)

        # xlnet puts the cls token at the end, pads on the left and uses
        # dedicated segment ids for the cls and pad tokens.
        is_xlnet = args.model_type in ['xlnet']
        features = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer,
            cls_token_at_end=is_xlnet,
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            pad_on_left=is_xlnet,
            pad_token_segment_id=4 if is_xlnet else 0)

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
    else:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)

    # One long tensor per feature attribute, in the order expected by callers.
    columns = [
        torch.tensor([getattr(feature, field) for feature in features],
                     dtype=torch.long)
        for field in ('input_ids', 'input_mask', 'segment_ids', 'label_id')
    ]
    return TensorDataset(*columns)
Пример #2
0
    def predict(self, texts):
        """Score every item in ``texts`` and return per-label scores.

        Args:
            texts: iterable of mappings that provide an ``"id"`` and a
                ``"text"`` entry.

        Returns:
            A list with one dict per input item, holding the item's ``"id"``
            and a ``"scores"`` dict that pairs ``self.labels`` with the
            corresponding row of the predictor's ``"probabilities"`` output.
        """
        examples = [
            data.InputExample(
                guid=entry.get("id"),
                text=tokenization.convert_to_unicode(entry.get("text")))
            for entry in texts
        ]

        example_string_list = data.convert_examples_to_features(
            examples, self.num_labels, self.max_seq_length, self.tokenizer)

        # Timing covers the log calls and the predictor invocation.
        started = time.time()
        tf.logging.info("***** Running predictions *****")
        tf.logging.info("  Num examples = {}".format(len(example_string_list)))

        predictions = self.predictor({"examples": example_string_list})
        scores = predictions.get("probabilities").tolist()
        finished = time.time()
        elapsed = finished - started
        tf.logging.info("Prediction time: {}s".format(elapsed))

        results = []
        for position, entry in enumerate(texts):
            results.append({
                "id": entry["id"],
                "scores": dict(zip(self.labels, scores[position])),
            })
        return results
Пример #3
0
def val(model, processor, args, label_list, tokenizer, device):
    """Evaluate ``model`` on the dev examples and return its accuracy.

    Args:
        model: module called as ``model(input_ids, segment_ids, input_mask)``;
            its output is reduced with ``argmax(dim=1)`` to get class ids.
        processor: object providing ``get_dev_examples(data_dir)``.
        args: namespace providing ``data_dir``, ``max_seq_length`` and
            ``eval_batch_size``.
        label_list: labels forwarded to ``convert_examples_to_features``.
        tokenizer: tokenizer forwarded to ``convert_examples_to_features``.
        device: device the batches are moved to before the forward pass.

    Returns:
        Fraction of dev examples whose predicted class equals the label
        (``0.0`` when there are no examples).
    """
    eval_examples = processor.get_dev_examples(args.data_dir)
    eval_features = convert_examples_to_features(
        eval_examples, label_list, args.max_seq_length, tokenizer, show_exp=False
    )
    all_input_ids = torch.tensor(
        [f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor(
        [f.input_mask for f in eval_features], dtype=torch.long
    )
    all_segment_ids = torch.tensor(
        [f.segment_ids for f in eval_features], dtype=torch.long
    )
    all_label_ids = torch.tensor(
        [f.label_id for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(
        all_input_ids, all_input_mask, all_segment_ids, all_label_ids
    )
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(
        eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size
    )

    model.eval()
    correct = 0
    with torch.no_grad():
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            output = model(input_ids, segment_ids, input_mask)
            predicted = output.argmax(dim=1)
            # Count every example of the batch.  Comparing only the last
            # element of each batch under-counts whenever batch size > 1.
            correct += int((predicted == label_ids).sum().item())

    total = len(eval_data)
    # Avoid ZeroDivisionError when the dev split is empty.
    pr = correct / total if total else 0.0
    print("\n[Valid]\t[Correct Num: %d]\t[Presion: %f]" % (correct, pr))
    return pr
Пример #4
0
def test(model, processor, args, label_list, tokenizer, device):
    """Evaluate ``model`` on the test file and log misclassified rows.

    Every misclassified example is written to ``"Error data.txt"`` as
    ``text<TAB>predicted_id<TAB>label`` where ``text`` and ``label`` are the
    second and third tab-separated columns of the matching line of
    ``args.test_data_dir``.

    Args:
        model: module called as ``model(input_ids, segment_ids, input_mask)``.
            When ``args.classifier_model == "Classifier_joint_model"`` it
            must return three outputs, of which the first is used.
        processor: object providing ``get_test_examples(path)``.
        args: namespace providing ``seed``, ``test_data_dir``,
            ``max_seq_length``, ``eval_batch_size`` and ``classifier_model``.
        label_list: labels forwarded to ``convert_examples_to_features``.
        tokenizer: tokenizer forwarded to ``convert_examples_to_features``.
        device: device the batches are moved to before the forward pass.

    Returns:
        Fraction of test examples predicted correctly (``0.0`` when there are
        no examples).
    """
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    test_examples = processor.get_test_examples(args.test_data_dir)

    # Raw (text, label) pairs used only for the error report.
    # NOTE(review): assumes line k of the file corresponds to example k
    # returned by the processor - confirm against the processor.
    test_list = []
    with open(args.test_data_dir, "r", encoding='utf-8') as source:
        for line in source:
            _, text_a, label = line.strip("\n").split("\t")
            test_list.append((text_a, label))

    test_features = convert_examples_to_features(
        test_examples, label_list, args.max_seq_length, tokenizer, show_exp=False
    )
    all_input_ids = torch.tensor(
        [f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor(
        [f.input_mask for f in test_features], dtype=torch.long
    )
    all_segment_ids = torch.tensor(
        [f.segment_ids for f in test_features], dtype=torch.long
    )
    all_label_ids = torch.tensor(
        [f.label_id for f in test_features], dtype=torch.long)
    test_data = TensorDataset(
        all_input_ids, all_input_mask, all_segment_ids, all_label_ids
    )
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(
        test_data, sampler=test_sampler, batch_size=args.eval_batch_size
    )

    model.eval()
    correct = 0
    example_index = 0  # running index over examples, not over batches
    # The report is written as UTF-8 (matching the input file) and the
    # context manager guarantees the handle is closed.
    with open("Error data.txt", "w", encoding="utf-8") as error_file, \
            torch.no_grad():
        for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            if args.classifier_model == "Classifier_joint_model":
                # Only the first of the three outputs is used for prediction.
                output, _pad_output, _textcnn_output = model(
                    input_ids, segment_ids, input_mask
                )
            else:
                output = model(input_ids, segment_ids, input_mask)
            predicted = output.argmax(dim=1).cpu().tolist()
            expected = label_ids.cpu().tolist()
            # Score every example in the batch, not just the last one.
            for predicted_id, expected_id in zip(predicted, expected):
                if predicted_id == expected_id:
                    correct += 1
                else:
                    text_a, label = test_list[example_index]
                    error_file.write(
                        text_a + "\t" + str(predicted_id) + "\t" + label + "\n"
                    )
                example_index += 1

    total = len(test_data)
    # Avoid ZeroDivisionError when the test split is empty.
    pr = correct / total if total else 0.0
    print("\n[Test]\t[Correct Num: %d]\t[Presion: %f]" % (correct, pr))
    return pr
Пример #5
0
def ensembletest(
    model1,
    model2,
    model3,
    model4,
    model5,
    processor,
    args,
    label_list,
    tokenizer,
    device,
):
    """Evaluate a five-model ensemble on the test file.

    The ensemble prediction is the ``argmax`` of the summed softmax outputs
    of the five models.  Every misclassified example is written to
    ``"Error data.txt"`` as ``text<TAB>predicted_id<TAB>label`` where
    ``text`` and ``label`` are the second and third tab-separated columns of
    the matching line of ``args.test_data_dir``.

    Args:
        model1, model2, model3, model4, model5: modules called as
            ``model(input_ids, segment_ids, input_mask)``.
        processor: object providing ``get_test_examples(path)``.
        args: namespace providing ``test_data_dir``, ``max_seq_length`` and
            ``eval_batch_size``.
        label_list: labels forwarded to ``convert_examples_to_features``.
        tokenizer: tokenizer forwarded to ``convert_examples_to_features``.
        device: device the batches are moved to before the forward pass.

    Returns:
        Fraction of test examples predicted correctly (``0.0`` when there are
        no examples).
    """
    test_examples = processor.get_test_examples(args.test_data_dir)

    # Raw (text, label) pairs used only for the error report.  The encoding
    # must be passed by keyword: the third positional argument of open() is
    # ``buffering`` and passing 'utf-8' there raises TypeError.
    # NOTE(review): assumes line k of the file corresponds to example k
    # returned by the processor - confirm against the processor.
    test_list = []
    with open(args.test_data_dir, "r", encoding='utf-8') as source:
        for line in source:
            _, text_a, label = line.strip("\n").split("\t")
            test_list.append((text_a, label))

    test_features = convert_examples_to_features(
        test_examples, label_list, args.max_seq_length, tokenizer, show_exp=False
    )
    all_input_ids = torch.tensor(
        [f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor(
        [f.input_mask for f in test_features], dtype=torch.long
    )
    all_segment_ids = torch.tensor(
        [f.segment_ids for f in test_features], dtype=torch.long
    )
    all_label_ids = torch.tensor(
        [f.label_id for f in test_features], dtype=torch.long)
    test_data = TensorDataset(
        all_input_ids, all_input_mask, all_segment_ids, all_label_ids
    )
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(
        test_data, sampler=test_sampler, batch_size=args.eval_batch_size
    )

    models = (model1, model2, model3, model4, model5)
    for member in models:
        member.eval()

    correct = 0
    example_index = 0  # running index over examples, not over batches
    # The report is written as UTF-8 (matching the input file) and the
    # context manager guarantees the handle is closed.
    with open("Error data.txt", "w", encoding="utf-8") as error_file, \
            torch.no_grad():
        for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            # Run all members first, then sum their class probabilities in
            # model order.
            outputs = [
                member(input_ids, segment_ids, input_mask) for member in models
            ]
            summed_probs = None
            for output in outputs:
                probs = F.softmax(output, dim=-1)
                summed_probs = probs if summed_probs is None else summed_probs + probs

            predicted = summed_probs.argmax(dim=1).cpu().tolist()
            expected = label_ids.cpu().tolist()
            # Score every example in the batch, not just the last one.
            for predicted_id, expected_id in zip(predicted, expected):
                if predicted_id == expected_id:
                    correct += 1
                else:
                    text_a, label = test_list[example_index]
                    error_file.write(
                        text_a + "\t" + str(predicted_id) + "\t" + label + "\n"
                    )
                example_index += 1

    total = len(test_data)
    # Avoid ZeroDivisionError when the test split is empty.
    pr = correct / total if total else 0.0
    print("\n[Test]\t[Correct Num: %d]\t[Presion: %f]" % (correct, pr))
    return pr
Пример #6
0
def load_and_cache_examples(args, task, tokenizer, eval_type):
    """Build the ``TensorDataset`` for ``task`` / ``eval_type``, using a cache.

    Features are loaded with ``torch.load`` from a cache file in the task's
    data directory when that file exists; otherwise they are created with
    ``convert_examples_to_features`` and saved there.

    Args:
        args: namespace providing ``task_to_data_dir``, ``local_rank``,
            ``nli_tasks``, ``glue_tasks``, ``model_type``,
            ``model_name_or_path``, ``sample_train``, ``num_samples``,
            ``data_seed`` and ``max_seq_length``.
        task: key into ``processors``, ``output_modes`` and
            ``args.task_to_data_dir``.
        tokenizer: tokenizer forwarded to the feature converter.
        eval_type: ``"train"``, ``"test"`` or ``"dev"``.  ``"test"`` reads
            ``processor.get_dev_examples`` and ``"dev"`` reads
            ``processor.get_validation_examples``.

    Returns:
        Tuple ``(dataset, processor.num_classes)`` where ``dataset`` is
        ``TensorDataset(input_ids, attention_mask, token_type_ids, labels)``.

    Raises:
        ValueError: if features must be built for an unknown ``eval_type``
            or the task's output mode is neither ``"classification"`` nor
            ``"regression"``.
    """
    data_dir = args.task_to_data_dir[task]
    is_train = eval_type == "train"

    # The original condition referenced an undefined name (``evaluate``);
    # it is expressed through ``eval_type`` to mirror the closing barrier.
    if args.local_rank not in [-1, 0] and is_train:
        # Make sure only the first process in distributed training process
        # the dataset, and the others will use the cache
        torch.distributed.barrier()

    if task in args.nli_tasks:
        processor = processors[task](data_dir)
    else:
        processor = processors[task]()

    output_mode = output_modes[task]

    label_list = processor.get_labels()
    if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]:
        # HACK(label indices are swapped in RoBERTa pretrained model)
        label_list[1], label_list[2] = label_list[2], label_list[1]

    model_name = list(filter(None, args.model_name_or_path.split('/'))).pop()
    if is_train:
        if args.sample_train:
            cache_name = 'cached_type_{}_task_{}_sample_train_{}_num_samples_{}_model_{}_data_seed_{}'.format(
                eval_type, task, args.sample_train, args.num_samples,
                model_name, args.data_seed)
        else:
            # here data_seed has no impact.
            cache_name = 'cached_type_{}_task_{}_sample_train_{}_num_samples_{}_model_{}'.format(
                eval_type, task, args.sample_train, args.num_samples,
                model_name)
    else:
        cache_name = 'cached_type_{}_task_{}_model_{}'.format(
            eval_type, task, model_name)
    cached_features_file = join(data_dir, cache_name)

    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        if is_train:
            if args.sample_train:
                # sampled: for old version.
                data_dir = join(data_dir, "sampled_datasets",
                                "seed_" + str(args.data_seed),
                                str(args.num_samples))
            examples = processor.get_train_examples(data_dir)
        elif eval_type == "test":
            examples = processor.get_dev_examples(data_dir)
        elif eval_type == "dev":
            examples = processor.get_validation_examples(data_dir)
        else:
            raise ValueError(
                "Unknown eval_type: {!r} (expected 'train', 'test' or 'dev')"
                .format(eval_type))

        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            # pad on the left for xlnet
            pad_on_left=bool(args.model_type in ["xlnet"]),
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
            output_mode=output_mode,
            no_label=bool(eval_type == "test" and task in args.glue_tasks))
        # Was a print() with a "%s" placeholder that never got interpolated.
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)

    if args.local_rank == 0 and is_train:
        # Make sure only the first process in distributed training process
        # the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
    if output_mode == "classification":
        label_dtype = torch.long
    elif output_mode == "regression":
        label_dtype = torch.float
    else:
        raise ValueError("Unsupported output_mode: {!r}".format(output_mode))
    all_labels = torch.tensor([f.label for f in features], dtype=label_dtype)

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_labels)
    return dataset, processor.num_classes
Пример #7
0
def main():
    """Fine-tune ``BertForMultiChoice`` and/or write predictions with it.

    Steps, driven by the parsed command-line arguments:

    1. Validate arguments, seed the RNGs and load the raw JSON data for the
       requested phases (``do_train`` / ``do_predict``).
    2. Build the model, optionally initialise it from ``init_checkpoint``,
       and create the optimizer.
    3. When ``do_train`` is set, train for ``num_train_epochs`` epochs and
       save the weights to ``<output_dir>/pytorch_model.bin``.
    4. Reload the weights from that file and, when ``do_predict`` is set
       (on rank 0 or in non-distributed mode), write
       ``<output_dir>/predictions.json``.

    Raises:
        ValueError: on an invalid ``gradient_accumulation_steps``, when
            neither ``do_train`` nor ``do_predict`` is set, or when the data
            file for a requested phase is missing.
        ImportError: when apex is required (distributed or fp16) but not
            installed.
    """
    # Imported locally so that every pickle use below refers to a bound name.
    import pickle

    args = config().parser.parse_args()
    # NOTE(review): ``device`` and ``n_gpu`` are read below but never assigned
    # in this function; the setup that would assign them is commented out.
    # They presumably come from module level - confirm before relying on it.
    #     if args.local_rank == -1 or args.no_cuda:
    #         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #         n_gpu = torch.cuda.device_count()
    #     else:
    #         torch.cuda.set_device(args.local_rank)
    #         device = torch.device("cuda", args.local_rank)
    #         n_gpu = 1
    #         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    #         torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # Per-step batch size; the effective batch is restored by accumulation.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
        with open(args.train_file, mode='r') as train_fp:
            raw_train_data = json.load(train_fp)
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )
        with open(args.predict_file, mode='r') as predict_fp:
            raw_test_data = json.load(predict_fp)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)

    train_examples = None
    num_train_steps = None
    bert_config = BertConfig.from_json_file(args.bert_config_file)
    tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)
    if args.do_train:
        # Parsed training examples are cached in the working directory.
        # NOTE: pickle must only be used on trusted, locally produced files.
        if os.path.exists("train_file_baseline.pkl"):
            with open("train_file_baseline.pkl", mode='rb') as cache_fp:
                train_examples = pickle.load(cache_fp)
        else:
            train_examples = read_examples(raw_train_data,
                                           tokenizer=tokenizer,
                                           doc_stride=args.doc_stride,
                                           max_seq_length=args.max_seq_length,
                                           is_training=True)
            with open("train_file_baseline.pkl", mode='wb') as cache_fp:
                pickle.dump(train_examples, cache_fp)
        logger.info("train examples {}".format(len(train_examples)))
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForMultiChoice(bert_config)
    if args.init_checkpoint is not None:
        logger.info('load bert weight')
        state_dict = torch.load(args.init_checkpoint, map_location='cpu')
        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=''):
            """Recursively load ``state_dict`` into ``module`` and its children."""
            local_metadata = {} if metadata is None else metadata.get(
                prefix[:-1], {})

            module._load_from_state_dict(state_dict, prefix, local_metadata,
                                         True, missing_keys, unexpected_keys,
                                         error_msgs)
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + '.')

        load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
        logger.info("missing keys:{}".format(missing_keys))
        logger.info('unexpected keys:{}'.format(unexpected_keys))
        logger.info('error msgs:{}'.format(error_msgs))
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    t_total = num_train_steps
    # ``t_total`` is None for a predict-only run; skip the division then
    # instead of failing with ``None // int``.
    if args.local_rank != -1 and t_total is not None:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if args.do_train:
        cached_train_features_file = args.train_file + '_{0}_{1}_v{2}'.format(
            str(args.max_seq_length), str(args.doc_stride), str(1))
        train_features = None
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except Exception:
            # Best effort: any failure to read the cache (missing file,
            # corrupt pickle, ...) falls back to rebuilding the features.
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate.
            logger.info(
                "  Could not load cached train features from %s; rebuilding",
                cached_train_features_file)
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                is_training=True)

            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_choice_positions = torch.tensor(
            [f.choice_positions for f in train_features], dtype=torch.long)
        all_answer_positions = torch.tensor(
            [f.answer_positions for f in train_features], dtype=torch.long)
        all_choice_positions_mask = torch.tensor(
            [f.choice_positions_mask for f in train_features],
            dtype=torch.long)
        all_answer_positions_mask = torch.tensor(
            [f.answer_positions_mask for f in train_features],
            dtype=torch.long)
        all_choice_labels = torch.tensor(
            [f.choice_labels for f in train_features], dtype=torch.long)
        all_choice_labels_for_consine = torch.tensor(
            [f.choice_labels_for_consine for f in train_features],
            dtype=torch.long)

        train_data = TensorDataset(
            all_input_ids, all_input_mask, all_segment_ids,
            all_choice_positions, all_answer_positions,
            all_choice_positions_mask, all_answer_positions_mask,
            all_choice_labels, all_choice_labels_for_consine)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size,
                                      drop_last=True)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            model.zero_grad()
            epoch_itorator = tqdm(train_dataloader, disable=None)
            for step, batch in enumerate(epoch_itorator):
                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering it-self
                input_ids, input_mask, segment_ids, choice_positions, answer_positions, choice_positions_mask, answer_positions_mask, choice_labels, choice_labels_for_consine = batch
                loss1, loss2 = model(input_ids,
                                     input_mask,
                                     segment_ids,
                                     choice_positions,
                                     answer_positions,
                                     choice_positions_mask,
                                     answer_positions_mask,
                                     choice_labels,
                                     choice_labels_for_consine,
                                     limit_loss=True)
                if loss2 is not None:
                    loss = loss1 + loss2
                else:
                    loss = loss1
                if n_gpu > 1:
                    # mean() to average on multi-gpu.
                    loss1 = loss1.mean()
                    # loss2 may be None (see above); only reduce it if present.
                    if loss2 is not None:
                        loss2 = loss2.mean()
                    loss = loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                # Logging interval of 1 step: the losses are logged every step.
                if (step + 1) % 1 == 0:
                    if loss2 is not None:
                        logger.info(
                            "step: {} #### loss1: {}  loss2: {}".format(
                                step,
                                loss1.cpu().item(),
                                loss2.cpu().item()))
                    else:
                        logger.info("step: {} #### loss1: {}".format(
                            step,
                            loss1.cpu().item()))

    # Save a trained model
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model = BertForMultiChoice(bert_config)
    model.load_state_dict(model_state_dict)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        eval_examples = read_examples(raw_test_data,
                                      tokenizer=tokenizer,
                                      doc_stride=args.doc_stride,
                                      max_seq_length=args.max_seq_length,
                                      is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_choice_positions = torch.tensor(
            [f.choice_positions for f in eval_features], dtype=torch.long)
        all_answer_positions = torch.tensor(
            [f.answer_positions for f in eval_features], dtype=torch.long)
        all_choice_positions_mask = torch.tensor(
            [f.choice_positions_mask for f in eval_features], dtype=torch.long)
        all_answer_positions_mask = torch.tensor(
            [f.answer_positions_mask for f in eval_features], dtype=torch.long)

        # Index of each feature, carried through the loader so results can be
        # mapped back to ``eval_features``.
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_choice_positions,
                                  all_answer_positions,
                                  all_choice_positions_mask,
                                  all_answer_positions_mask, all_example_index)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")

        for input_ids, input_mask, segment_ids, choice_positions, answer_positions, choice_positions_mask, answer_positions_mask, example_indices in tqdm(
                eval_dataloader, desc="Evaluating", disable=None):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            choice_positions = choice_positions.to(device)
            answer_positions = answer_positions.to(device)
            choice_positions_mask = choice_positions_mask.to(device)
            answer_positions_mask = answer_positions_mask.to(device)
            with torch.no_grad():
                batch_probs = model(input_ids, input_mask, segment_ids,
                                    choice_positions, answer_positions,
                                    choice_positions_mask,
                                    answer_positions_mask)  # [24, n]
            for i, example_index in enumerate(example_indices):
                probs = batch_probs[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             logits=probs))
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")

        write_predictions(eval_examples, eval_features, all_results,
                          args.max_answer_length, output_prediction_file)
Пример #8
0
def _preview(examples, limit=3):
    """Return up to ``limit`` ``(text_a, text_b, label)`` triples for display.

    Indexing is bounded by ``len(examples)`` so that a split holding fewer
    than ``limit`` examples does not raise ``IndexError``.
    """
    return [(examples[i].text_a, examples[i].text_b, examples[i].label)
            for i in range(min(limit, len(examples)))]


def _build_dataloader(features, batch_size, sampler_cls):
    """Pack ``features`` into a ``TensorDataset`` and wrap it in a ``DataLoader``.

    Each batch yields ``(input_ids, input_mask, segment_ids, label_ids)`` as
    ``torch.long`` tensors. ``sampler_cls`` is instantiated on the dataset
    (``RandomSampler`` for training, ``SequentialSampler`` for evaluation).
    """
    dataset = TensorDataset(
        torch.tensor([f.input_ids for f in features], dtype=torch.long),
        torch.tensor([f.input_mask for f in features], dtype=torch.long),
        torch.tensor([f.segment_ids for f in features], dtype=torch.long),
        torch.tensor([f.label_id for f in features], dtype=torch.long))
    return DataLoader(dataset,
                      sampler=sampler_cls(dataset),
                      batch_size=batch_size)


def _run_inference(model, dataloader, device, desc, collect_probs=False):
    """Run ``model`` over ``dataloader`` without gradients.

    Args:
        model: classifier; it is called twice per batch, once with labels to
            obtain the loss and once without labels to obtain the logits.
        dataloader: yields ``(input_ids, input_mask, segment_ids, label_ids)``.
        device: device the batch tensors are moved to.
        desc: label shown on the tqdm progress bar.
        collect_probs: when True the logits are passed through a softmax over
            dim 1 and the per-example rows are returned.

    Returns:
        ``(mean_loss, accuracy, probs)``. ``mean_loss`` is averaged over
        batches and ``accuracy`` over examples; both are ``None`` when the
        loader yields nothing. ``probs`` is a list of per-example score rows
        (empty unless ``collect_probs``).
    """
    model.eval()
    total_loss, total_correct = 0, 0
    n_steps, n_examples = 0, 0
    probs = []

    for input_ids, input_mask, segment_ids, label_ids in tqdm(dataloader,
                                                              desc=desc):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            batch_loss = model(input_ids, segment_ids, input_mask, label_ids)
            logits = model(input_ids, segment_ids, input_mask)
            if collect_probs:
                logits = torch.softmax(logits, 1)

        scores = logits.detach().cpu().numpy()
        gold = label_ids.to('cpu').numpy()
        # The sum is later divided by the example count, so `accuracy` is
        # presumably a per-batch count of correct predictions - TODO confirm.
        total_correct += accuracy(scores, gold)
        if collect_probs:
            probs += scores.tolist()

        total_loss += batch_loss.mean().item()
        n_examples += input_ids.size(0)
        n_steps += 1

    mean_loss = total_loss / n_steps if n_steps else None
    mean_accuracy = total_correct / n_examples if n_examples else None
    return mean_loss, mean_accuracy, probs


def _write_results(path, banner, result):
    """Log ``banner`` and write ``result`` to ``path`` as sorted ``key = value`` lines."""
    with open(path, "w") as writer:
        logger.info(banner)
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))


def main():
    """Fine-tune and/or evaluate a BERT sequence classifier.

    Steps, driven by the parsed command-line arguments:

    1. Resolve the model path, pick the device and seed every RNG.
    2. Load the train/dev/test examples once and print a short preview.
    3. With ``do_train``: fine-tune, save weights and config to
       ``output_dir`` and reload the saved model.
    4. With ``do_eval``: score the dev split and write ``eval_results.txt``
       and ``pred_results.txt``.
    5. With ``do_test`` (main process only): score the test split and write
       ``test_results.txt``.

    Raises:
        ValueError: if neither ``do_train`` nor ``do_eval`` is set, or if
            training is requested and ``output_dir`` already has content.
    """
    args = parse_args()

    # specifies the path where the biobert or clinical bert model is saved
    if args.bert_model in ('biobert', 'clinical_bert'):
        args.bert_model = args.model_loc

    print(f"Using bert model: {args.bert_model}")

    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: %s n_gpu: %s", device, n_gpu)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = N2c2ClsProcessor(args.fold_id)
    num_labels = 13
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # Each split is read once here and reused by the phases below instead of
    # being loaded a second time.
    print('TRAIN')
    train = processor.get_train_examples(args.data_dir)
    print(_preview(train))
    print('DEV')
    dev = processor.get_dev_examples(args.data_dir)
    print(_preview(dev))
    print('TEST')
    test = processor.get_test_examples(args.data_dir)
    print(_preview(test))

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else PYTORCH_PRETRAINED_BERT_CACHE
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    model.to(device)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_examples = train
        num_train_optimization_steps = int(
            len(train_examples) /
            args.train_batch_size) * args.num_train_epochs

        # Prepare optimizer. It is only built when training: outside training
        # there is no step count to pass as `t_total`.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        train_dataloader = _build_dataloader(train_features,
                                             args.train_batch_size,
                                             RandomSampler)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            # Reset per epoch: the reported training loss covers the last
            # epoch only.
            tr_loss = 0
            nb_tr_steps = 0
            for batch in tqdm(train_dataloader, desc="Iteration"):
                input_ids, input_mask, segment_ids, label_ids = tuple(
                    t.to(device) for t in batch)
                loss = model(input_ids, segment_ids, input_mask, label_ids)

                loss.backward()

                tr_loss += loss.item()
                nb_tr_steps += 1

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(output_config_file)
        model = BertForSequenceClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    else:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    model.to(device)

    # Mean loss per step of the last training epoch; None when nothing was
    # trained (also avoids dividing by zero when no step ran).
    train_loss = tr_loss / nb_tr_steps if args.do_train and nb_tr_steps else None

    if args.do_eval:
        eval_examples = dev
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        # Run prediction for full data
        eval_dataloader = _build_dataloader(eval_features,
                                            args.eval_batch_size,
                                            SequentialSampler)
        eval_loss, eval_accuracy, probs = _run_inference(model,
                                                         eval_dataloader,
                                                         device,
                                                         "Evaluating",
                                                         collect_probs=True)

        # The sampler is sequential, so rows line up with eval_features.
        pred = {f.guid: p for f, p in zip(eval_features, probs)}

        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': train_loss
        }
        _write_results(os.path.join(args.output_dir, "eval_results.txt"),
                       "***** Eval results *****", result)

        output_pred_file = os.path.join(args.output_dir, "pred_results.txt")
        with open(output_pred_file, 'w') as writer:
            logger.info("***** Writing Eval predictions *****")
            for guid, p in pred.items():
                writer.write(f"{guid}:{p}\n")

    if args.do_test and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        test_examples = test
        test_features = convert_examples_to_features(test_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running testing *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        # Run prediction for full data
        test_dataloader = _build_dataloader(test_features,
                                            args.eval_batch_size,
                                            SequentialSampler)
        test_loss, test_accuracy, _ = _run_inference(model, test_dataloader,
                                                     device, "Testing")

        result = {
            'test_loss': test_loss,
            'test_accuracy': test_accuracy,
            'global_step': global_step,
            'loss': train_loss
        }
        _write_results(os.path.join(args.output_dir, "test_results.txt"),
                       "***** Test results *****", result)
Пример #9
0
def main():
    """Train and evaluate the encoder model on NewsQA features.

    Parses the command-line options, loads the ``bert-base-uncased``
    tokenizer and model, converts the loaded data to features, splits the
    resulting dataset 80/20 into train and dev parts, optionally restores
    weights from ``model/<resume>``, and then for every epoch runs one
    training pass followed by one pass over the dev part, printing the
    averaged loss, F1 and EM of each.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): --training and --use_cuda declare no `type`, so a value
    # given on the command line arrives as a string (e.g. "False" is truthy).
    parser.add_argument('--training', default=True)
    parser.add_argument('--dropout', default=0.1, type=float)
    parser.add_argument('--std-alpha', default=0.003, type=float)
    parser.add_argument('--test-split', default=0.1, type=float)
    parser.add_argument('--epoch', default=10000, type=int)
    parser.add_argument('--epoch_start', default=0, type=int)
    parser.add_argument('--exp-decay-rate', default=0.99, type=float)
    parser.add_argument('--use_cuda', default=True)
    parser.add_argument('--hidden-size', default=767, type=int)
    parser.add_argument('--learning-rate', default=0.05, type=float)
    parser.add_argument('--print-freq', default=250, type=int)
    parser.add_argument('--train-batch-size', default=60, type=int)
    parser.add_argument('--dev-batch-size', default=100, type=int)
    parser.add_argument('--model-config', default='rnn_config.ini')
    parser.add_argument('--config', default='config.json')
    parser.add_argument('--clip', type=float, default=1, help='gradient clipping')
    parser.add_argument('--model-name', default='MultiHeadRln')
    parser.add_argument('--word-dim', default=100, type=int)
    parser.add_argument('--resume', default='MultiHeadRln_18 _300.model', type=str, metavar='PATH', help='path saved params')
    parser.add_argument("--output_dir", default='./checkpoints/', type=str,
                        help="The output directory where the model checkpoints will be written.")
    # Output directories used so far:
    # [/mnt/disk/dagi/BiDAF_multiHead/checkpoints/], [./checkpoints/]
    args = parser.parse_args()
    device = torch.device(f"cuda:{str(gpu_list[0])}" if args.use_cuda and torch.cuda.is_available() else "cpu")

    print('loading NewsQA data...')
    setattr(args, 'device', device)
    setattr(args, 'model_time', strftime('%H_%M_%S', gmtime()))

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')

    data = load_data(story_path='../data', question_filename='../data/newsqa-data-v1',size=10000)
    features = convert_examples_to_features(data, tokenizer, 350, 150, 50, True)
    print("Data loading finished...")
    with open(args.config) as config_file:
        hyp = json.load(config_file)['hyperparams']

    model = EncoderModel(args, hyp).to(device)
    model = nn.DataParallel(model, device_ids=gpu_list)

    model_loss = nn.NLLLoss()
    optimizer  = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
    print("Loading data to RAM: (6: items)")
    all_input_ids = torch.tensor([f.input_ids for f in tqdm(features)], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in tqdm(features)], dtype=torch.long)
    all_seq_lengths = torch.tensor([len(f.input_ids) for f in tqdm(features)], dtype=torch.long)
    all_start_pos = torch.tensor([f.start_position for f in tqdm(features)], dtype=torch.long)
    all_end_pos = torch.tensor([f.end_position for f in tqdm(features)], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in tqdm(features)], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_start_pos, all_end_pos, all_seq_lengths, all_segment_ids, all_example_index)

    # 80/20 split; the sizes are computed once and reused for the report.
    n_train = int(len(dataset)*0.8)
    n_dev = int(len(dataset)) - n_train
    lengths = [n_train, n_dev]
    train_dataset, test_dataset = random_split(dataset, lengths)
    print(f'Training Dataset: {n_train},  Dev Dataset: {n_dev}')
    train_sampler = SequentialSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
    eval_sampler = SequentialSampler(test_dataset)
    # NOTE(review): the dev loader uses --train-batch-size; --dev-batch-size
    # is parsed but not used in the visible code. Confirm which is intended.
    eval_dataloader = DataLoader(test_dataset, sampler=eval_sampler, batch_size=args.train_batch_size)

    if os.path.exists(f'model/{args.resume}'):
        print(f'loading ..........model/{args.resume}')
        model.load_state_dict(torch.load(f'model/{args.resume}', map_location=args.device)) #  map_location
    for e in range(args.epoch_start, args.epoch):
        f1, em, total, loss = 0.0, 0.0, 0, 0.0
        for i, batch in enumerate(train_dataloader):
            batch_loss, out = train(model, bert_model(batch[0]), batch[1], batch[2], batch[3], batch[4], model_loss, optimizer, args)
            pred = list(zip(torch.argmax(out[0], dim=1).cpu().numpy(), torch.argmax(out[1],dim=1).cpu().numpy()))
            ans = list(zip(batch[2].cpu().numpy(), batch[3].cpu().numpy()))
            batch_f1 = compute_f1(ans, pred, len(pred))
            batch_em = compute_em(ans, pred, len(pred))
            # Number of batches seen so far (i is zero-based), so the epoch
            # averages divide by the same count as the progress line below.
            total = i + 1
            f1 += batch_f1
            em += batch_em
            loss += float(batch_loss)
            if i%10 == 0: # print every 10th loop
                print('F1:-'+str(f1/(i+1))+'  EM:-'+str(em/(i+1))+'  Loss:'+str(loss/(i+1)))

        # Keep the averages defined even if the loader produced no batch.
        total = max(total, 1)
        print(f'epoch: {e} /loss: {loss/total:.3f} /F1: {f1/total:.3f} /EM: {em/total:.3f}')

        dev_f1, dev_em, dev_total, dev_loss = 0.0, 0.0, 0, 0.0
        for i,  batch in enumerate(eval_dataloader):
            # Separate name so the last training `batch_loss` is not clobbered
            # and the dev loss really comes from the dev batch.
            dev_batch_loss, out = test(model, bert_model(batch[0]), batch[1], batch[2], batch[3], batch[4], model_loss, args)
            pred = list(zip(torch.argmax(out[0], dim=1).cpu().numpy(), torch.argmax(out[1], dim=1).cpu().numpy()))
            ans = list(zip(batch[2].cpu().numpy(), batch[3].cpu().numpy()))
            dev_f1 += compute_f1(ans, pred, len(pred))
            dev_em += compute_em(ans, pred, len(pred))
            dev_loss += float(dev_batch_loss)
            dev_total += 1  # count batches, not the running sum of indices
        dev_total = max(dev_total, 1)
        print(f'Test:- loss:{dev_loss/dev_total:.3f} /F1: {dev_f1/dev_total} /EM: {dev_em/dev_total}')