Example #1
def train(model):
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}".format(config.device, n_gpu))
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    model_it_self = model.module if hasattr(model, 'module') else model
    global_step = 0
    num_train_steps = data_generator.get_num_train_steps()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
            # Apply weight decay to everything except bias and LayerNorm (gamma/beta) parameters.
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
            ]
    optimizer = BERTAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=config.warmup_proportion,
                         t_total=num_train_steps)
    dev_loader = data_generator.get_dev_loader()
    train_loader = data_generator.get_train_loader()
    for epoch in trange(int(config.num_train_epochs), desc="Epoch"):
        for step, batch in enumerate(tqdm(train_loader, desc="Iteration")):
            batch = tuple(t.to(config.device) for t in batch)
            loss, output = model(batch, global_step, -1)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if config.gradient_accumulation_steps > 1:
                loss = loss / config.gradient_accumulation_steps
            # opt.zero_grad()
            loss.backward()
            if (step + 1) % config.gradient_accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()
                global_step += 1
            #if global_step % config.print_interval == 0:
            #    print_model_result(model_it_self.get_result())

            if global_step % config.eval_interval == 0 or global_step == num_train_steps:
                if config.do_eval:
                    print("\nepoch:{} global:{}\t".format(epoch, global_step))
                    eval_result = model_eval(model_it_self, dev_loader, data_type='dev')
                    # Save the model, using loss as the evaluation metric
                    save_best_model(model_it_self, eval_result['loss'], data_type='dev')
                    if config.SAVE_USE_ACCURACY:
                        save_best_model(model_it_self, eval_result['accuracy'], data_type='dev',
                                        use_accuracy=config.SAVE_USE_ACCURACY)
    shutil.copy(config.train_best_accuracy_model, os.path.join(config.output_dir, 'best_ac_model.bin'))
    shutil.copy(config.train_best_loss_model, os.path.join(config.output_dir, 'best_loss_model.bin'))
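
The loop in Example 1 combines loss averaging across DataParallel replicas with gradient accumulation: the loss is divided by the number of accumulation steps and the optimizer only steps once every few micro-batches. A minimal standalone sketch of that accumulation logic, using a toy linear model and plain SGD in place of BERT and BERTAdam (accumulation_steps is a hypothetical stand-in for config.gradient_accumulation_steps):

import torch

# Toy stand-ins for the real model, optimizer, and data loader.
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = torch.nn.CrossEntropyLoss()
accumulation_steps = 4  # hypothetical value; Example 1 reads it from config
batches = [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(16)]

model.zero_grad()
for step, (x, y) in enumerate(batches):
    loss = loss_fn(model(x), y)
    # Scale so the accumulated gradient matches a single large-batch update.
    loss = loss / accumulation_steps
    loss.backward()
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()   # apply the accumulated gradients
        model.zero_grad()  # clear them before the next accumulation window
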
Example #2
    def do_train(self, data, truncated=False):
        """
        训练模型, 数据集分成2部分,训练集和验证集, 默认比例9:1
        :param data: 输入的数据,注意如果做truncated,那么输入的数据为 [(content,aspect,start_idx, end_idx, label),...,]
        :param truncated: 是否要截断,截断按照 self.left_max_seq_len, self.right_max_seq_len进行
        :return:
        """
        if truncated:
            data, locations = self.do_truncate_data(data)
        train_data_len = int(len(data) * 0.9)
        train_data = data[:train_data_len]
        eval_data = data[train_data_len:]
        train_dataset = load_examples(train_data, self.max_seq_length, self.train_tokenizer, self.label_list, self.reverse_truncate)
        eval_dataset = load_examples(eval_data, self.max_seq_length, self.train_tokenizer, self.label_list, self.reverse_truncate)
        logger.info("训练数据集已加载,开始训练")
        num_train_steps = int(len(train_dataset) / self.train_batch_size) * self.num_train_epochs
        forward_batch_size = int(self.train_batch_size / self.gradient_accumulation_steps)
        # Start training
        params = list(self.train_model.named_parameters())
        all_trainable_params = divide_parameters(params, lr=self.learning_rate, weight_decay_rate=self.weight_decay_rate)
        # Optimizer setup
        optimizer = BERTAdam(all_trainable_params, lr=self.learning_rate,
                             warmup=self.warmup_proportion, t_total=num_train_steps, schedule=self.schedule,
                             s_opt1=self.s_opt1, s_opt2=self.s_opt2, s_opt3=self.s_opt3)

        logger.info("***** 开始训练 *****")
        logger.info("  训练样本数是 = %d", len(train_dataset))
        logger.info("  评估样本数是 = %d", len(eval_dataset))
        logger.info("  前向 batch size = %d", forward_batch_size)
        logger.info("  训练的steps = %d", num_train_steps)
        ########### 训练的配置 ###########
        train_config = TrainingConfig(
            gradient_accumulation_steps = self.gradient_accumulation_steps,
            ckpt_frequency = self.ckpt_frequency,
            log_dir = self.output_dir,
            output_dir = self.output_dir,
            device = self.device)
        # Initialize the trainer for supervised training (not distillation); it can train model_S into a teacher model
        distiller = BasicTrainer(train_config = train_config,
                                 model = self.train_model,
                                 adaptor = BertForGLUESimpleAdaptorTraining)

        train_sampler = RandomSampler(train_dataset)
        # Training dataloader
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=forward_batch_size, drop_last=True)
        # Callback run on the eval dataset
        callback_func = partial(self.do_predict, eval_dataset=eval_dataset)
        with distiller:
            # Start training
            distiller.train(optimizer, scheduler=None, dataloader=train_dataloader,
                            num_epochs=self.num_train_epochs, callback=callback_func)
        logger.info("Training finished")
        return "Done"
Example #3
def train(args):
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        'sst': SstProcessor,
        'polarity': PolarityProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
            args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval and not args.do_test:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
                args.max_seq_length, bert_config.max_position_embeddings))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size * args.num_train_epochs)

    model = BertForSequenceClassification(bert_config, len(label_list))

    if args.do_test:
        model.load_state_dict(torch.load(os.path.join(args.data_dir, "model_best.pt")))
        model.to(device)
        model.eval()
        if args.do_eval:

            eval_example = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_example, label_list, args.max_seq_length, tokenizer)

            logger.info("***** Running eval *****")
            logger.info("  Num examples = %d", len(eval_example))
            logger.info("  Batch size = %d", args.eval_batch_size)
            _ = do_eval(model, device, eval_features, args)

        if args.do_predict:
            print(os.path.dirname(args.data_dir))
            test_example = processor.get_test_examples(os.path.dirname(args.data_dir))
            test_features = convert_examples_to_features(
                test_example, label_list, args.max_seq_length, tokenizer)

            logger.info("***** Running test *****")
            logger.info("  Num examples = %d", len(test_example))
            logger.info("  Batch size = %d", args.eval_batch_size)
            test_logit_list, _ = do_eval(model, device, test_features, args)
            np.save(os.path.join(args.data_dir, 'oof_test'), np.asarray(test_logit_list))

        return 0

    if args.init_checkpoint is not None:
        model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
        # Apply weight decay to everything except bias and LayerNorm (gamma/beta) parameters.
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]

    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        max_score = 0
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, (input_ids, input_mask, segment_ids, label_ids) in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                loss.backward()

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    global_step += 1

            if args.do_eval:
                eval_examples = processor.get_dev_examples(args.data_dir)
                eval_features = convert_examples_to_features(
                    eval_examples, label_list, args.max_seq_length, tokenizer)

                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)

                all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
                all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
                all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

                eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
                if args.local_rank == -1:
                    eval_sampler = SequentialSampler(eval_data)
                else:
                    eval_sampler = DistributedSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                logit_list = []
                labels_eval_list = []
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)
                    with torch.no_grad():
                        tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    logit_list.extend(logits.tolist())
                    labels_eval_list.extend(label_ids.tolist())
                    tmp_eval_accuracy = accuracy(logits, label_ids)
                    # _ = accuracy2(logits, label_ids)
                    # _ = accuracy3(logits, label_ids)
                    # _ = accuracy4(logits, label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1
                # print(epoch)
                # f1 = score2(logit_list, labels_eval_list)

                eval_loss = eval_loss / nb_eval_steps  # len(eval_dataloader)
                eval_accuracy = eval_accuracy / nb_eval_examples  # len(eval_dataloader)
                print("eval_loss", eval_loss)
                result = {'eval_loss': eval_loss,
                          'eval_accuracy': eval_accuracy,
                          'global_step': global_step,
                          'loss': tr_loss / nb_tr_steps,
                          'Epoch': epoch,
                          'Dir': args.data_dir
                          }  # 'loss': loss.item()}

                output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    logger.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))

                if eval_accuracy > max_score:
                    # best_logit_list = logit_list
                    # best_eval_labels = labels_eval_list
                    np.save(os.path.join(args.data_dir, 'oof_train'), np.asarray(logit_list))
                    np.save(os.path.join(args.data_dir, 'oof_train_y'), np.asarray(labels_eval_list))
                    torch.save(model.state_dict(), os.path.join(args.data_dir, "model_%.4f.pt" % eval_accuracy))
                    if os.path.exists(os.path.join(args.data_dir, "model_best.pt")):
                        os.remove(os.path.join(args.data_dir, "model_best.pt"))
                    torch.save(model.state_dict(), os.path.join(args.data_dir, "model_best.pt"))
                    max_score = eval_accuracy
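
The tail of Example 3 keeps only the best checkpoint: whenever validation accuracy improves, it writes an accuracy-stamped copy and overwrites model_best.pt. A minimal sketch of that bookkeeping with a toy model and made-up accuracy values (paths and numbers are illustrative only):

import os
import torch

model = torch.nn.Linear(4, 2)  # stand-in for the fine-tuned BERT model
output_dir = "checkpoints"
os.makedirs(output_dir, exist_ok=True)

max_score = 0.0
for epoch, eval_accuracy in enumerate([0.71, 0.78, 0.75, 0.81]):  # made-up eval results
    if eval_accuracy > max_score:
        # Keep an accuracy-stamped copy and refresh the rolling "best" checkpoint.
        torch.save(model.state_dict(),
                   os.path.join(output_dir, "model_%.4f.pt" % eval_accuracy))
        best_path = os.path.join(output_dir, "model_best.pt")
        if os.path.exists(best_path):
            os.remove(best_path)
        torch.save(model.state_dict(), best_path)
        max_score = eval_accuracy
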
Example #4
def main():
    args = arguments.get_argparse("multiple_choice")
    logger.info(json.dumps(args.__dict__))

    if args.eval_on_train and not args.log_spec:
        args.log_spec = "on_train"

    processors = {
        "race": dataset_processor.RaceProcessor,
        "mctest": dataset_processor.MCTestProcessor,
        "swag": dataset_processor.SwagProcessor,
        "squad": dataset_processor.SquadProcessor,
        "openbookqa": dataset_processor.OpenBookQAProcessor,
        "multirc": dataset_processor.MultiRCProcessor,
        "arc": dataset_processor.ARCProcessor,
        "qa4mre": dataset_processor.QA4MREProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend
        # which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
        if args.fp16:
            logger.info(
                """16-bits training currently not supported in distributed
                training""")
            # (see https://github.com/pytorch/pytorch/pull/13496)
            args.fp16 = False
    logger.info(
        "device %s n_gpu %d distributed training %r",
        device,
        n_gpu,
        bool(args.local_rank != -1),
    )

    if args.gradient_accumulation_steps < 1:
        raise ValueError("""Invalid gradient_accumulation_steps parameter: {},
            should be >= 1""".format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval and not args.do_test:
        raise ValueError(
            """At least one of `do_train` or `do_eval` or `do_test` must
            be True.""")
    if (args.do_train or args.do_eval) and args.do_test:
        raise ValueError(
            "Runing test must be independent of running train and/or dev")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            """Cannot use sequence length {} because the BERT model was only
            trained up to sequence length {}""".format(
                args.max_seq_length, bert_config.max_position_embeddings))

    if args.small_debug:
        args.output_dir = 'debug'
    if os.path.exists(args.output_dir):
        if os.listdir(args.output_dir) != ["args_log.txt"] and not args.small_debug:
            raise ValueError(
                "Output directory already exists and is not empty.")
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.cache_dir, exist_ok=True)

    args_log = os.path.join(args.output_dir, "args_log.txt")
    if not os.path.exists(args_log):
        with open(args_log, "w") as writer:
            writer.write(json.dumps(args.__dict__))
    else:
        print("args_log.txt already exists")

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    if "{}" in args.corenlp_cache_dir:
        args.corenlp_cache_dir = args.corenlp_cache_dir.format(task_name)

    processor = processors[task_name](args.data_dir, args.dataset_option)
    num_options = processor.get_num_options()

    if args.convert_from_ans_extr:
        if args.do_train:
            if args.train_predictions:
                processor.set_candidates("train", args.train_predictions)
            else:
                raise ValueError("train prediction file is missing")
        if args.do_eval:
            if args.eval_predictions:
                processor.set_candidates("dev", args.eval_predictions)
            else:
                raise ValueError("eval prediction file is missing")

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None

    def cache_features(examples, split_name):
        cache_spec_cand = [
            task_name,
            args.dataset_option,
            split_name,
            args.input_ablation,
        ]
        cache_spec = "_".join(
            [str(x) for x in cache_spec_cand if x is not None])
        cache_path = os.path.join(args.cache_dir, "{}.pkl".format(cache_spec))

        if os.path.exists(cache_path) and not args.no_cache:
            features = pickle.load(open(cache_path, "rb"))
        else:
            if args.input_ablation or args.output_statistics:
                corenlp_cache_path = os.path.join(
                    args.corenlp_cache_dir,
                    "{}_{}.pkl".format(task_name, split_name))
                corenlp_cache = pickle.load(open(corenlp_cache_path, "rb"))
            else:
                corenlp_cache = None

            if args.output_statistics:
                output_mcmrc.output_statistics(examples, corenlp_cache)

            tokenized_examples = generate_tokenized_examples(
                examples, tokenizer, args.input_ablation, corenlp_cache,
                args.entity_anonymization)

            if args.output_mturk or args.output_examples:
                for ex in examples:
                    ex.input_ablation = "original"
                original_examples = generate_tokenized_examples(
                    examples, tokenizer, None, None)

                if args.output_examples:
                    output_mcmrc.output_examples(
                        tokenized_examples,
                        original_examples,
                        task_name,
                        'ent_anon'
                        if args.entity_anonymization else args.input_ablation,
                    )

                if args.output_mturk:
                    output_mcmrc.output_mturk(tokenized_examples,
                                              original_examples, task_name,
                                              args.input_ablation)

                exit(1)

            features = convert_examples_to_features(
                tokenized_examples,
                num_options,
                args.max_seq_length,
                args.max_query_length,
                args.max_option_length,
                tokenizer,
            )

            if not args.no_cache:
                with open(cache_path, "wb") as f:
                    pickle.dump(features, f)

        # assert len(examples) == len(features)
        return features

    if args.do_train:
        train_examples = processor.get_train_examples()
        if args.small_debug:
            train_examples = train_examples[:6000]
        num_train_per_epoch = len(train_examples)
        num_train_per_epoch /= args.train_batch_size
        num_train_per_epoch /= args.gradient_accumulation_steps
        num_train_steps = int(num_train_per_epoch * args.num_train_epochs)
        train_features = cache_features(train_examples, "train")

    if args.do_eval:
        if args.eval_on_train:
            eval_examples = processor.get_train_examples()
            eval_features = cache_features(eval_examples, "train")
        else:
            eval_examples = processor.get_dev_examples()
            if args.small_debug:
                eval_examples = eval_examples[:1000]
            eval_features = cache_features(eval_examples, "dev")

    if args.do_test:
        eval_examples = processor.get_test_examples()
        eval_features = cache_features(eval_examples, "test")

    global entity_set
    if args.entity_anonymization:
        if len(entity_set) == 0:
            anon_tag_cache_file = os.path.join(
                args.cache_dir,
                f'{task_name}_anon_tags_{args.entity_anonymization}.pkl')
            if not os.path.exists(anon_tag_cache_file):
                raise ValueError("vocabulary cache cannot be loaded")
            entity_set = pickle.load(open(anon_tag_cache_file, 'rb'))
            tokenizer.vocab_update(sorted(entity_set))
        else:
            anon_tag_cache_file = os.path.join(
                args.cache_dir,
                f'{task_name}_anon_tags_{args.entity_anonymization}.pkl')
            if not os.path.exists(anon_tag_cache_file):
                with open(anon_tag_cache_file, 'wb') as f:
                    pickle.dump(entity_set, f)

    # Prepare model
    model = BertForMultipleChoice(bert_config, num_options)
    if args.init_checkpoint is not None:
        state_dict = torch.load(args.init_checkpoint, map_location="cpu")
        if list(state_dict)[0].startswith("bert."):
            # finetuned on some target dataset
            model.load_state_dict(state_dict)
        else:
            # pretrained language model
            model.bert.load_state_dict(state_dict)

    if args.entity_anonymization and len(entity_set):
        model.bert.embeddings.extend_word_embeddings(len(entity_set))

    if args.limit_vocab_size or args.limit_vocab_freq:
        use_vocab, train_features, eval_features = vocab_selection(
            train_features,
            eval_features,
            args.cache_dir,
            args.output_dir,
            task_name,
            tokenizer,
            args.entity_anonymization,
            args.limit_vocab_size,
            args.limit_vocab_freq,
            num_options=num_options,
        )
        id_to_token = {v: k for k, v in tokenizer.vocab.items()}
        use_tokens = [id_to_token[i] for i in use_vocab]
        logger.info(sorted(use_tokens))
        logger.info(f'{len(use_tokens)}')
        model.bert.embeddings.limit_vocab(use_vocab)

    if args.fp16:
        model.half()

    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.fp16:
        param_optimizer = [
            (n, param.clone().detach().to("cpu").float().requires_grad_())
            for n, param in model.named_parameters()
        ]
    elif args.optimize_on_cpu:
        param_optimizer = [(n,
                            param.clone().detach().to("cpu").requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "gamma", "beta"]
    optimizer_grouped_parameters = [
        {
            # Apply weight decay to everything except bias and LayerNorm (gamma/beta) parameters.
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay_rate": 0.01,
        },
        {
            "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay_rate": 0.0,
        },
    ]
    optimizer = BERTAdam(
        optimizer_grouped_parameters,
        lr=args.learning_rate,
        warmup=args.warmup_proportion,
        t_total=num_train_steps,
    )
    global_step = 0

    if args.enter_debugger:
        model.eval()
        # features = convert_examples_to_features(
        #     eval_examples,
        #     num_options,
        #     args.max_seq_length,
        #     args.max_query_length,
        #     args.max_option_length,
        #     tokenizer,
        # )
        # output = get_predictions(
        #     model, eval_examples, features, args, device
        # )
        # output_logits, output_predictions, eval_loss, eval_accuracy = output
        print("in debugger")
        import pdb
        pdb.set_trace()

    def eval_func(num_epoch=-1, num_step=-1, log_spec=None):
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        model.eval()

        output = get_predictions(model, eval_examples, eval_features, args,
                                 device)
        output_logits, output_predictions, eval_loss, eval_accuracy = output

        model.train()

        output_qids = [e.qid for e in eval_examples]
        output_answers = [e.ans_idx for e in eval_examples]

        result = {
            "eval_loss": eval_loss,
            "eval_accuracy": eval_accuracy,
            "global_step": global_step,
        }

        output_spec = ""
        if num_epoch > -1 and num_step > -1:
            output_spec = "_{}_{}".format(num_epoch, num_step)
        elif log_spec:
            output_spec += "_{}".format(log_spec)

        output_eval_file = os.path.join(
            args.output_dir, "eval_results{}.json".format(output_spec))
        result["spec"] = output_spec
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
        with open(output_eval_file, "w") as writer:
            writer.write(json.dumps(result))

        output_pred_file = os.path.join(
            args.output_dir, "eval_preds{}.jsonl".format(output_spec))
        with open(output_pred_file, "w") as f:
            for qid, ans, pred, logit in zip(output_qids, output_answers,
                                             output_predictions,
                                             output_logits):
                result = {
                    "qid": qid,
                    "answer": chr(ans + ord("A")),
                    "prediction": chr(pred + ord("A")),
                    "logits": logit.tolist(),
                }
                f.write(json.dumps(result) + "\n")

    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for i in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            tmp_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/..
                    # ..sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                param.grad.data /= args.loss_scale
                        is_nan = set_optimizer_params_grad(
                            param_optimizer,
                            model.named_parameters(),
                            test_nan=True)
                        if is_nan:
                            logger.info("""FP16 TRAINING: Nan in gradients,
                                reducing loss scaling""")
                            args.loss_scale /= 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(
                            model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1

                    if global_step % args.save_model_steps == 0:
                        output_model_file = os.path.join(
                            args.output_dir,
                            "pytorch_model_step_{}.bin".format(global_step),
                        )
                        if n_gpu > 1:
                            torch.save(model.module.state_dict(),
                                       output_model_file)
                        else:
                            torch.save(model.state_dict(), output_model_file)

                    tmp_loss += loss.item()
                    if (args.loss_report_steps > 0
                            and global_step % args.loss_report_steps == 0):
                        logger.info("Step loss: {}".format(
                            tmp_loss / args.loss_report_steps))
                        tmp_loss = 0

                    if (args.eval_steps > 0 and global_step > 0
                            and global_step % args.eval_steps == 0
                            and args.do_eval):
                        eval_func(i, global_step, args.log_spec)

            output_model_file = os.path.join(
                args.output_dir, "pytorch_model_epoch_{}.bin".format(i))
            if n_gpu > 1:
                torch.save(model.module.state_dict(), output_model_file)
            else:
                torch.save(model.state_dict(), output_model_file)
            if args.do_eval:
                eval_func(i, global_step, args.log_spec)

    if not args.do_train and args.do_eval:
        eval_func(log_spec=args.log_spec or "dev")

    if args.do_test:
        eval_func(log_spec=args.log_spec or "test")
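
Example 4 caches the converted features on disk (cache_features) so repeated runs skip the expensive preprocessing step. The core compute-or-load pattern, reduced to a minimal sketch with a hypothetical build_features function standing in for convert_examples_to_features:

import os
import pickle

def build_features(examples):
    """Placeholder for the expensive feature-conversion step."""
    return [len(str(e)) for e in examples]

def cached_features(examples, cache_path, no_cache=False):
    # Reuse the pickled features if a cache file exists and caching is enabled.
    if os.path.exists(cache_path) and not no_cache:
        with open(cache_path, "rb") as f:
            return pickle.load(f)
    features = build_features(examples)
    if not no_cache:
        with open(cache_path, "wb") as f:
            pickle.dump(features, f)
    return features

os.makedirs("cache", exist_ok=True)
features = cached_features(["a question", "another question"],
                           os.path.join("cache", "train_features.pkl"))
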
Example #5
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help=
        "The config json file corresponding to the pre-trained BERT model. \n"
        "This specifies the model architecture.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    args = parser.parse_args()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size *
            args.num_train_epochs)

    model = BertForSequenceClassification(bert_config, len(label_list))
    if args.init_checkpoint is not None:
        model.bert.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    # Apply weight decay to everything except bias and LayerNorm (gamma/beta) parameters.
    optimizer_parameters = [{
        'params': [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]

    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    global_step += 1

    if args.do_eval:
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, segment_ids,
                                              input_mask, label_ids)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            # tr_loss/nb_tr_steps only exist when training also ran in this invocation
            'loss': tr_loss / nb_tr_steps if args.do_train else None
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Example #6
def main():
    parser = argparse.ArgumentParser()
    BERT_DIR = "./model/uncased_L-12_H-768_A-12/"
    ## Required parameters
    parser.add_argument("--bert_config_file", default=BERT_DIR+"bert_config.json", \
                        type=str, help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=BERT_DIR+"vocab.txt", type=str, \
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default="out", type=str, \
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--train_file", type=str, \
                        help="SQuAD json for training. E.g., train-v1.1.json", \
                        default="")
    parser.add_argument("--predict_file", type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json", \
                        default="")
    parser.add_argument("--init_checkpoint", type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).", \
                        default=BERT_DIR+"pytorch_model.bin")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument(
        "--max_seq_length",
        default=300,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=128,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=10.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop",
                        default=1000,
                        type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument(
        "--n_best_size",
        default=3,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")

    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument('--eval_period', type=int, default=2000)

    parser.add_argument('--max_n_answers', type=int, default=5)
    parser.add_argument('--merge_query', type=int, default=-1)
    parser.add_argument('--reduce_layers', type=int, default=-1)
    parser.add_argument('--reduce_layers_to_tune', type=int, default=-1)

    parser.add_argument('--only_comp', action="store_true", default=False)

    parser.add_argument('--train_subqueries_file', type=str, default="")  #500
    parser.add_argument('--predict_subqueries_file', type=str,
                        default="")  #500
    parser.add_argument('--prefix', type=str, default="")  #500

    parser.add_argument('--model', type=str, default="qa")  #500
    parser.add_argument('--pooling', type=str, default="max")
    parser.add_argument('--debug', action="store_true", default=False)
    parser.add_argument('--output_dropout_prob', type=float, default=0)
    parser.add_argument('--wait_step', type=int, default=30)
    parser.add_argument('--with_key', action="store_true", default=False)
    parser.add_argument('--add_noise', action="store_true", default=False)

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)
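    # Divide the per-step batch size so that, after `accumulate_gradients`
    # accumulation steps, the effective batch size matches the value passed
    # on the command line.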

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
        if not args.predict_file:
            raise ValueError(
                "If `do_train` is True, then `predict_file` must be specified."
            )

    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.do_train and args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        logger.info("Output directory (%s) already exists and is not empty.",
                    args.output_dir)
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None

    eval_dataloader, eval_examples, eval_features, _ = get_dataloader(
        logger=logger,
        args=args,
        input_file=args.predict_file,
        subqueries_file=args.predict_subqueries_file,
        is_training=False,
        batch_size=args.predict_batch_size,
        num_epochs=1,
        tokenizer=tokenizer)
    if args.do_train:
        train_dataloader, train_examples, _, num_train_steps = get_dataloader(
                logger=logger, args=args, \
                input_file=args.train_file, \
                subqueries_file=args.train_subqueries_file, \
                is_training=True,
                batch_size=args.train_batch_size,
                num_epochs=args.num_train_epochs,
                tokenizer=tokenizer)

    #a = input()
    if args.model == 'qa':
        model = BertForQuestionAnswering(bert_config, 4)
        metric_name = "F1"
    elif args.model == 'classifier':
        if args.reduce_layers != -1:
            bert_config.num_hidden_layers = args.reduce_layers
        model = BertClassifier(bert_config, 2, args.pooling)
        metric_name = "F1"
    elif args.model == "span-predictor":
        if args.reduce_layers != -1:
            bert_config.num_hidden_layers = args.reduce_layers
        if args.with_key:
            Model = BertForQuestionAnsweringWithKeyword
        else:
            Model = BertForQuestionAnswering
        model = Model(bert_config, 2)
        metric_name = "Accuracy"
    else:
        raise NotImplementedError()

    if args.init_checkpoint is not None and args.do_predict and \
                len(args.init_checkpoint.split(','))>1:
        assert args.model == "qa"
        model = [model]
        for i, checkpoint in enumerate(args.init_checkpoint.split(',')):
            if i > 0:
                model.append(BertForQuestionAnswering(bert_config, 4))
            print("Loading from", checkpoint)
            state_dict = torch.load(checkpoint, map_location='cpu')
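            # Checkpoints saved from a (Distributed)DataParallel wrapper prefix
            # every key with 'module.'; stripping the prefix lets the weights
            # load into the unwrapped model.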
            filter = lambda x: x[7:] if x.startswith('module.') else x
            state_dict = {filter(k): v for (k, v) in state_dict.items()}
            model[-1].load_state_dict(state_dict)
            model[-1].to(device)

    else:
        if args.init_checkpoint is not None:
            print("Loading from", args.init_checkpoint)
            state_dict = torch.load(args.init_checkpoint, map_location='cpu')
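            # If the encoder was truncated via --reduce_layers, drop the
            # weights of the removed upper layers so the checkpoint matches
            # the smaller model.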
            if args.reduce_layers != -1:
                state_dict = {k:v for k, v in state_dict.items() \
                    if not '.'.join(k.split('.')[:3]) in \
                    ['encoder.layer.{}'.format(i) for i in range(args.reduce_layers, 12)]}
            if args.do_predict:
                filter = lambda x: x[7:] if x.startswith('module.') else x
                state_dict = {filter(k): v for (k, v) in state_dict.items()}
                model.load_state_dict(state_dict)
            else:
                model.bert.load_state_dict(state_dict)
                if args.reduce_layers_to_tune != -1:
                    # Freeze the embeddings and the lower encoder layers;
                    # requires_grad must be set on parameters, not modules.
                    for p in model.bert.embeddings.parameters():
                        p.requires_grad = False
                    n_layers = 12 if args.reduce_layers == -1 else args.reduce_layers
                    for i in range(n_layers - args.reduce_layers_to_tune):
                        for p in model.bert.encoder.layer[i].parameters():
                            p.requires_grad = False

        model.to(device)

        if args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[args.local_rank],
                output_device=args.local_rank)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

    if args.do_train:
        # Exclude bias and LayerNorm (gamma/beta) parameters from weight
        # decay; parameter names only contain these strings as substrings,
        # so match with any(...) rather than list membership.
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.0
        }]

        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

        global_step = 0
        best_f1 = 0
        wait_step = 0
        stop_training = False
        model.train()

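        # Training loop with periodic evaluation: every `eval_period` steps the
        # model is scored on the dev set, the best checkpoint is written to
        # best-model.pt, and training stops early once the metric has not
        # improved for `wait_step` consecutive evaluations.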
        for epoch in range(int(args.num_train_epochs)):
            for step, batch in tqdm(enumerate(train_dataloader)):
                global_step += 1
                batch = [t.to(device) for t in batch]
                loss = model(batch, global_step)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if global_step % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                if global_step % args.eval_period == 0:
                    model.eval()
                    f1 = predict(args, model, eval_dataloader, eval_examples, eval_features, \
                                  device, write_prediction=False)
                    logger.info("%s: %.3f on epoch=%d" %
                                (metric_name, f1 * 100.0, epoch))
                    if best_f1 < f1:
                        logger.info("Saving model with best %s: %.3f -> %.3f on epoch=%d" % \
                                (metric_name, best_f1*100.0, f1*100.0, epoch))
                        model_state_dict = {
                            k: v.cpu()
                            for (k, v) in model.state_dict().items()
                        }
                        torch.save(
                            model_state_dict,
                            os.path.join(args.output_dir, "best-model.pt"))
                        model.to(device)  # the saved state_dict values were copied to CPU; keep the live model on device
                        best_f1 = f1
                        wait_step = 0
                        stop_training = False
                    else:
                        wait_step += 1
                        if best_f1 > 0.1 and wait_step == args.wait_step:
                            stop_training = True
                    model.train()
            if stop_training:
                break

    elif args.do_predict:
        if type(model) == list:
            model = [m.eval() for m in model]
        else:
            model.eval()
        f1 = predict(args, model, eval_dataloader, eval_examples,
                     eval_features, device)
        logger.info("Final %s score: %.3f%%" % (metric_name, f1 * 100.0))
Exemplo n.º 7
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop",
                        default=1000,
                        type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--optimize_on_cpu',
        default=False,
        action='store_true',
        help=
        "Whether to perform optimization and keep the optimizer averages on CPU"
    )

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(input_file=args.train_file,
                                             is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size *
            args.num_train_epochs)
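        # num_train_steps becomes t_total for BERTAdam's warmup/decay schedule;
        # since train_batch_size was already divided by
        # gradient_accumulation_steps, it counts micro-batches rather than
        # optimizer updates.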

    model = BertForQuestionAnswering(bert_config)
    if args.init_checkpoint is not None:
        model.bert.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))

    if not args.optimize_on_cpu:
        model.to(device)
    # Exclude bias and LayerNorm (gamma/beta) parameters from weight decay;
    # match by substring, since parameter names never equal these strings.
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.0
    }]
    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions)
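        # RandomSampler shuffles features within a single process; under
        # distributed training a DistributedSampler is used instead so each
        # rank sees its own shard of the data.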
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.optimize_on_cpu:
                        model.to('cpu')
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    if args.optimize_on_cpu:
                        model.to(device)
                    global_step += 1

    if args.do_predict:
        eval_examples = read_squad_examples(input_file=args.predict_file,
                                            is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
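        # all_example_index records each row's position so that, after the
        # forward pass, logits can be mapped back to the corresponding
        # eval_feature and its unique_id.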

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, args.verbose_logging)
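        # write_predictions appears to follow the reference BERT SQuAD
        # post-processing: it re-ranks the top n_best_size start/end candidates
        # per feature, discards spans longer than max_answer_length, and writes
        # predictions.json plus nbest_predictions.json.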
Exemplo n.º 8
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        choices=["sentihood_single", "sentihood_NLI_M", "sentihood_QA_M", \
                                "sentihood_NLI_B", "sentihood_QA_B", "semeval_single", \
                                "semeval_NLI_M", "semeval_QA_M", "semeval_NLI_B", "semeval_QA_B"],
                        help="The name of the task to train.")
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help=
        "The config json file corresponding to the pre-trained BERT model. \n"
        "This specifies the model architecture.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        required=True,
        help="Initial checkpoint (usually from a pre-trained BERT model).")

    ## Other parameters
    parser.add_argument("--eval_test",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=4,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    args = parser.parse_args()

    #--task_name sentihood_NLI_M \
    #--data_dir data/sentihood/bert-pair/ \
    #--vocab_file uncased_L-12_H-768_A-12/vocab.txt \
    #--bert_config_file uncased_L-12_H-768_A-12/bert_config.json \
    #--init_checkpoint uncased_L-12_H-768_A-12/pytorch_model.bin \
    #--eval_test \
    #--do_lower_case \
    #--max_seq_length 128 \
    #--train_batch_size 2 \
    #--learning_rate 2e-5 \
    #--num_train_epochs 6.0 \
    #--output_dir senti_results/sentihood/NLI_M \
    #--seed 42
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    #bert_config = BertConfig.from_json_file(args.bert_config_file)

    #if args.max_seq_length > bert_config.max_position_embeddings:
    #   raise ValueError(
    #      "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
    #      args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    # prepare dataloaders
    processors = {
        "sentihood_single": Sentihood_single_Processor,
        "sentihood_NLI_M": Sentihood_NLI_M_Processor,
        "sentihood_QA_M": Sentihood_QA_M_Processor,
        "sentihood_NLI_B": Sentihood_NLI_B_Processor,
        "sentihood_QA_B": Sentihood_QA_B_Processor,
        "semeval_single": Semeval_single_Processor,
        "semeval_NLI_M": Semeval_NLI_M_Processor,
        "semeval_QA_M": Semeval_QA_M_Processor,
        "semeval_NLI_B": Semeval_NLI_B_Processor,
        "semeval_QA_B": Semeval_QA_B_Processor,
    }

    processor = processors[args.task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    # training set
    train_examples = None
    num_train_steps = None
    train_examples = processor.get_train_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size * args.num_train_epochs)

    train_features = convert_examples_to_features_sentihood(
        train_examples, label_list, args.max_seq_length, tokenizer)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)  #乘完了iteration的次数

    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)
    all_detection_label_ids = torch.tensor(
        [f.label_exist_id for f in train_features], dtype=torch.long)
    all_aspects = torch.tensor([f.aspect for f in train_features],
                               dtype=torch.long)
    all_pos = torch.tensor([f.my_pos for f in train_features],
                           dtype=torch.long)
    all_dis = torch.tensor([f.dis for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids, all_detection_label_ids,
                               all_aspects, all_pos, all_dis)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # test set
    if args.eval_test:
        test_examples = processor.get_test_examples(args.data_dir)
        test_features = convert_examples_to_features_sentihood(
            test_examples, label_list, args.max_seq_length, tokenizer)

        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in test_features],
                                     dtype=torch.long)
        all_detection_label_ids = torch.tensor(
            [f.label_exist_id for f in test_features], dtype=torch.long)
        all_aspects = torch.tensor([f.aspect for f in test_features],
                                   dtype=torch.long)
        all_pos = torch.tensor([f.my_pos for f in test_features],
                               dtype=torch.long)
        all_dis = torch.tensor([f.dis for f in test_features],
                               dtype=torch.long)
        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids,
                                  all_detection_label_ids, all_aspects,
                                  all_pos, all_dis)
        test_dataloader = DataLoader(test_data,
                                     batch_size=args.eval_batch_size,
                                     shuffle=False)

    # model and optimizer

    model = BertForSequenceClassification(len(label_list))
    # model = torch.load("model_data/attention_add_model")
    #if args.init_checkpoint is not None:
    # model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
        {
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.01
        },  #xiaobuxing
        {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.0
        }
    ]

    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    #optimizer = transformers.AdamW(optimizer_parameters)
    # train
    output_log_file = os.path.join(args.output_dir, "log.txt")
    print("output_log_file=", output_log_file)
    with open(output_log_file, "w") as writer:
        if args.eval_test:
            writer.write(
                "epoch\tglobal_step\tloss\ttest_loss\ttest_accuracy\n")
        else:
            writer.write("epoch\tglobal_step\tloss\n")

    global_step = 0
    epoch = 0
    # model = torch.load("model_data/attention_add_model6")
    #40
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        epoch += 1
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            # print()
            input_ids, input_mask, segment_ids, label_ids, exist_ids, all_aspects, pos, dis = batch
            loss, _, _ = model(input_ids, segment_ids, input_mask, label_ids,
                               exist_ids, all_aspects, pos, dis)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()  # We have accumulated enough gradients
                model.zero_grad()
                global_step += 1
        # break
        torch.save(model, "model_data/attention_add_model" + str(epoch))
        #eval_test
        if args.eval_test:
            model.eval()
            test_loss, test_accuracy, senti_test_accuracy = 0, 0, 0
            nb_test_steps, nb_test_examples, senti_nb_test_examples = 0, 0, 0
            with open(
                    os.path.join(args.output_dir,
                                 "test_ep_" + str(epoch) + ".txt"),
                    "w") as f_test:
                for input_ids, input_mask, segment_ids, label_ids, exist_ids, all_aspects, pos, dis in test_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)
                    exist_ids = exist_ids.to(device)
                    all_aspects = all_aspects.to(device)
                    dis = dis.to(device)
                    pos = pos.to(device)
                    with torch.no_grad():
                        tmp_test_loss, detect_logits, sent_logits = model(
                            input_ids, segment_ids, input_mask, label_ids,
                            exist_ids, all_aspects, pos, dis)

                    detect_logits = F.softmax(detect_logits, dim=-1)
                    detect_logits = detect_logits.detach().cpu().numpy()
                    exist_ids = exist_ids.to('cpu').numpy()
                    outputs = np.argmax(detect_logits, axis=1)
                    for output_i in range(len(outputs)):
                        f_test.write(str(outputs[output_i]))
                        for ou in detect_logits[output_i]:
                            f_test.write(" " + str(ou))
                        f_test.write("\n")
                    tmp_test_accuracy = np.sum(outputs == exist_ids)

                    test_loss += tmp_test_loss.mean().item()
                    test_accuracy += tmp_test_accuracy

                    sent_logits = F.softmax(sent_logits, dim=-1)
                    sent_logits = sent_logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    senti_outputs = np.argmax(sent_logits, axis=1)
                    for output_i in range(len(senti_outputs)):
                        f_test.write(str(senti_outputs[output_i]))
                        for ou in sent_logits[output_i]:
                            f_test.write(" " + str(ou))
                        f_test.write("\n")
                    senti_tmp_test_accuracy = np.sum(
                        senti_outputs == label_ids)

                    senti_test_accuracy += senti_tmp_test_accuracy

                    nb_test_examples += input_ids.size(0)
                    senti_nb_test_examples += np.sum(exist_ids)
                    nb_test_steps += 1

            test_loss = test_loss / nb_test_steps
            test_accuracy = test_accuracy / nb_test_examples
            senti_accuracy = senti_test_accuracy / senti_nb_test_examples

        result = collections.OrderedDict()
        if args.eval_test:
            result = {
                'epoch': epoch,
                'global_step': global_step,
                'loss': tr_loss / nb_tr_steps,
                'test_loss': test_loss,
                'test_accuracy': test_accuracy,
                'senti_test_accuracy': senti_accuracy,
            }
        else:
            result = {
                'epoch': epoch,
                'global_step': global_step,
                'loss': tr_loss / nb_tr_steps
            }

        logger.info("***** Eval results *****")
        with open(output_log_file, "a+") as writer:
            for key in result.keys():
                logger.info("  %s = %s\n", key, str(result[key]))
                writer.write("%s\t" % (str(result[key])))
            writer.write("\n")
Exemplo n.º 9
0
    def __init__(self, args):
        self.opt = args
        self.writer = SummaryWriter(log_dir=self.opt.output_dir)  # tensorboard
        bert_config = BertConfig.from_json_file(args.bert_config_file)

        if args.max_seq_length > bert_config.max_position_embeddings:
            raise ValueError(
                "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
                .format(args.max_seq_length,
                        bert_config.max_position_embeddings))

        # if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        #     raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
        # os.makedirs(args.output_dir, exist_ok=True)

        self.dataset = ReadData(self.opt)  # Read the data and preprocess it
        self.num_train_steps = None
        self.num_train_steps = int(
            len(self.dataset.train_examples) / self.opt.train_batch_size /
            self.opt.gradient_accumulation_steps * self.opt.num_train_epochs)
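        # Total optimizer updates = examples / batch size / gradient
        # accumulation steps * epochs; this is later passed to BERTAdam as
        # t_total for the warmup schedule.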
        self.opt.label_size = len(self.dataset.label_list)
        args.output_dim = len(self.dataset.label_list)
        print("label size: {}".format(args.output_dim))

        # Initialize the model
        print("initialize model ...")
        if args.model_class == BertForSequenceClassification:
            self.model = BertForSequenceClassification(
                bert_config, len(self.dataset.label_list))
        else:
            self.model = model_classes[args.model_name](bert_config, args)

        if args.init_checkpoint is not None:
            self.model.bert.load_state_dict(
                torch.load(args.init_checkpoint, map_location='cpu'))
        if args.fp16:
            self.model.half()
        # Freeze parameters
        # for name, p in self.model.named_parameters():
        #     if name.startswith('bert.encoder.layer.11') or name.startswith('bert.encoder.layer.10') or name.startswith('bert.encoder.layer.9') or name.startswith('bert.encoder.layer.8'):  # freeze the top layers
        #         p.requires_grad = False
        # Count the model's trainable and non-trainable parameters
        n_trainable_params, n_nontrainable_params = 0, 0
        for p in self.model.parameters():
            n_params = torch.prod(torch.tensor(
                p.shape))  # torch.prod() computes the product of all elements
            if p.requires_grad:  # does this parameter require gradients?
                n_trainable_params += n_params
            else:
                n_nontrainable_params += n_params
        print('n_trainable_params: {0}, n_nontrainable_params: {1}'.format(
            n_trainable_params, n_nontrainable_params))
        self.model.to(args.device)

        # Parallelize across multiple GPUs
        if self.opt.n_gpu > 1:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.opt.gpu_id)

        # Prepare optimizer
        if args.fp16:
            self.param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                                    for n, param in self.model.named_parameters()]
        elif args.optimize_on_cpu:
            self.param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                                    for n, param in self.model.named_parameters()]
        else:
            self.param_optimizer = list(self.model.named_parameters())
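        # For fp16 (or optimize_on_cpu) the optimizer is given detached fp32
        # (or CPU) copies of the parameters, presumably synced back to the
        # model elsewhere in the training loop; otherwise it updates the model
        # parameters in place.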
        # Exclude bias and LayerNorm (gamma/beta) parameters from weight
        # decay; match by substring, since parameter names never equal these
        # strings exactly.
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in self.param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.01
        }, {
            'params': [
                p for n, p in self.param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.0
        }]
        self.optimizer = BERTAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  warmup=args.warmup_proportion,
                                  t_total=self.num_train_steps)
        # Optimizer for the model's own (non-BERT) parameters
        # [p for pname, p in self.param_optimizer if not pname.startswith('module.bert')]
        # self.optimizer_me = torch.optim.Adam(
        #     [{'params': [p for pname, p in self.param_optimizer if not pname.startswith('module.bert')]}], lr=0.001,
        #     weight_decay=0)
        # self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer_me, mode='max',
        #                                           patience=3)  # automatically adjust the LR when the monitored value has stopped increasing for 3 epochs

        self.global_step = 0  # initialize the global step to 0
        self.max_test_acc = 0
        self.max_test_f1 = 0
Exemplo n.º 10
0
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help=
        "The config json file corresponding to the pre-trained BERT model. \n"
        "This specifies the model architecture.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    # Other parameters
    parser.add_argument(
        "--load_all",
        default=False,
        action='store_true',
        help="Whether to load all parameter or only for bert part.")
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--source",
                        default=None,
                        type=str,
                        help="Start point for finetune.")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--multi",
                        default=False,
                        help="Whether to add adapter modules",
                        action='store_true')
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument(
        "--optim",
        default='normal',
        help=
        "Whether to split up the optimiser between adapters and not adapters.")
    parser.add_argument(
        "--sample",
        default='rr',
        help="How to sample tasks, other options 'prop', 'sqrt' or 'anneal'")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--h_aug",
                        default="n/a",
                        help="Size of hidden state for adapters..")
    parser.add_argument("--tasks",
                        default="all",
                        type=str,
                        help="Which set of tasks to train on.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--freeze",
                        default=False,
                        action='store_true',
                        help="Freeze base network weights")
    parser.add_argument("--freeze_regex", default='no', help="Freeze code")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    args = parser.parse_args()
    # config logger
    task_names = args.tasks.split(',')

    output_dir = os.path.join(
        args.output_dir, args.source + '_TO_' + '_'.join(task_names) + '_' +
        os.path.basename(args.bert_config_file).replace('.json', '') + '_' +
        args.freeze_regex + '_' + uuid.uuid4().hex[:8])
    tf_writer = SummaryWriter(os.path.join(output_dir, 'log'))
    os.makedirs(output_dir, exist_ok=True)
    json.dump(vars(args),
              open(os.path.join(output_dir, 'run_config.json'), 'w'),
              indent=2)

    log_file = os.path.join(output_dir, 'std.log')
    if log_file:
        logfile = logging.FileHandler(log_file, 'w')
        logger.addHandler(logfile)

    processors = {
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "rte": RTEProcessor,
        "sts": STSProcessor,
        "qqp": QQPProcessor,
        "qnli": QNLIProcessor,
        "snli": SNLIProcessor,
        "scitail": ScitailProcessor,
        "wnli": WnliProcessor,
        "msmarco": MsMarcoProcessor,
        "wikiqa": WikiqaProcessor
    }
    task_id_mappings = {
        'mnli': 0,
        'mrpc': 1,
        'rte': 2,
        'sts': 3,
        'qqp': 4,
        'qnli': 5,
        'snli': 6,
        'scitail': 7,
        'wnli': 8,
        "msmarco": 9,
        "wikiqa": 10
    }
    task_num_labels = [3, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2]
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    os.makedirs(args.output_dir, exist_ok=True)

    task_names = args.tasks.split(',')
    if args.source is not None:
        output_dir = os.path.join(
            args.output_dir,
            args.source + '_TO_' + '_'.join(task_names) + '_' +
            os.path.basename(args.bert_config_file).replace('.json', '') +
            '_' + uuid.uuid4().hex[:8])
    else:
        output_dir = os.path.join(
            args.output_dir, '_'.join(task_names) + '_' +
            os.path.basename(args.bert_config_file).replace('.json', ''))
    tf_writer = SummaryWriter(os.path.join(output_dir, 'log'))
    os.makedirs(output_dir, exist_ok=True)
    json.dump(vars(args),
              open(os.path.join(output_dir, 'run_config.json'), 'w'),
              indent=2)
    processor_list = [processors[task_name]() for task_name in task_names]
    label_list = [processor.get_labels() for processor in processor_list]

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_tasks = len(task_names)
    if args.do_train:
        train_examples = [
            processor.get_train_examples(args.data_dir + data_dir)
            for processor, data_dir in zip(processor_list, task_names)
        ]
        num_train_examples = [len(tr) for tr in train_examples]
        total_train_examples = sum(num_train_examples)
        num_train_steps = int(total_train_examples / args.train_batch_size *
                              args.num_train_epochs)
        total_tr = num_train_steps
        steps_per_epoch = int(num_train_steps / args.num_train_epochs)

    if args.h_aug != 'n/a':
        # Override the augmented hidden size in the config.
        bert_config.hidden_size_aug = int(args.h_aug)
    model = BertForMultiNLI(bert_config, task_num_labels)

    if args.init_checkpoint is not None:
        if args.load_all:
            missing_keys, unexpected_keys = model.load_state_dict(torch.load(
                args.init_checkpoint, map_location='cpu'),
                                                                  strict=False)
            logger.info('missing keys: {}'.format(missing_keys))
            logger.info('unexpected keys: {}'.format(unexpected_keys))
        elif args.multi:
            partial = torch.load(args.init_checkpoint, map_location='cpu')
            model_dict = model.bert.state_dict()
            update = {}
            for n, p in model_dict.items():
                if 'aug' in n or 'mult' in n:
                    update[n] = p
                    if 'pooler.mult' in n and 'bias' in n:
                        update[n] = partial['pooler.dense.bias']
                    if 'pooler.mult' in n and 'weight' in n:
                        update[n] = partial['pooler.dense.weight']
                else:
                    update[n] = partial[n]
            model.bert.load_state_dict(update)
        else:
            model.bert.load_state_dict(
                torch.load(args.init_checkpoint, map_location='cpu'))
            # Only initialize the BERT-body parameters; the checkpoint keys have no 'bert.' prefix.
            bert_partial = torch.load(args.init_checkpoint, map_location='cpu')
            model_dict = model.state_dict()
            update = {}
            for n, p in model_dict.items():
                if 'bert' in n:
                    update[n[5:]] = bert_partial[n[5:]]
            missing_keys, unexpected_keys = model.bert.load_state_dict(
                update, strict=False)
            logger.info('missing keys: {}'.format(missing_keys))
            logger.info('unexpected keys: {}'.format(unexpected_keys))

    if args.freeze:
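        # Freeze the shared BERT body; keep task-specific adapter ('aug'/'mult'),
        # classifier and LayerNorm ('gamma'/'beta') parameters trainable.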
        for n, p in model.bert.named_parameters():
            if 'aug' in n or 'classifier' in n or 'mult' in n or 'gamma' in n or 'beta' in n:
                continue
            p.requires_grad = False

    def freeze_by_layer(layerno):
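        # Freeze the embeddings, the pooler and every transformer layer except `layerno`.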
        freeze_layers = list(range(12))
        freeze_layers.remove(int(layerno))
        freeze_layers = [
            'encoder.layer.{}.'.format(no) for no in freeze_layers
        ]
        for n, p in model.bert.named_parameters():
            if 'embeddings' in n:
                p.requires_grad = False
            if 'pooler' in n:
                p.requires_grad = False
            for freeze_layer in freeze_layers:
                if n.startswith(freeze_layer):
                    p.requires_grad = False
        for n, p in model.bert.named_parameters():
            logger.info('{}\t{}'.format(p.requires_grad, n))

    tuned, non_tuned = 0, 0  # parameter counts for the summary log below
    total = sum(p.numel() for p in model.bert.parameters())
    if args.freeze_regex in [str(x) for x in range(12)]:
        logger.info('Tune some layer!')
        freeze_by_layer(args.freeze_regex)

    if args.freeze_regex == 'all':
        logger.info('Tune all bias parameters!')
        for n, p in model.bert.named_parameters():
            if "bias" not in n:
                p.requires_grad = False
                non_tuned += p.numel()
            else:
                tuned += p.numel()

    if args.freeze_regex == 'attention_bias':
        logger.info('Tune all attention bias parameters!')
        for n, p in model.bert.named_parameters():
            if "bias" in n and 'attention' in n:
                tuned += p.numel()
            else:
                p.requires_grad = False
                non_tuned += p.numel()

    if args.freeze_regex == 'linear_bias':
        logger.info('Tune all linear bias parameters!')
        for n, p in model.bert.named_parameters():
            if "bias" in n and ('output' in n or 'intermediate' in n):
                tuned += p.numel()
            else:
                p.requires_grad = False
                non_tuned += p.numel()

    if args.freeze_regex == 'layer_norm':
        logger.info('Tune all layer norm parameters!')
        for n, p in model.bert.named_parameters():
            if 'gamma' in n or 'beta' in n:
                tuned += p.numel()
            else:
                p.requires_grad = False
                non_tuned += p.numel()

    if args.freeze_regex == 'attn_self':
        logger.info('Tune all layer attention parameters!')
        for n, p in model.bert.named_parameters():
            if 'attention' in n:
                tuned += p.numel()
            else:
                p.requires_grad = False
                non_tuned += p.numel()

    for n, p in model.bert.named_parameters():
        logger.info('{}\t{}'.format(p.requires_grad, n))

    logger.info('tuned:{}({}), not tuned: {}'.format(tuned,
                                                     round(tuned / total, 6),
                                                     non_tuned))

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    if args.optim == 'normal':
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay) and p.requires_grad
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay) and p.requires_grad
            ],
            'weight_decay_rate':
            0.0
        }]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=total_tr)
    else:
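        # Two optimizers: parameters outside the task-specific attention ('attn') modules
        # use args.learning_rate; the 'attn' parameters get a fixed lr of 3e-4.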
        no_decay = ['bias', 'gamma', 'beta']
        base = ['attn']
        optimizer_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n
                           for nd in no_decay) and not any(nd in n
                                                           for nd in base)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay) and not any(nd in n
                                                               for nd in base)
            ],
            'weight_decay_rate':
            0.0
        }]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=total_tr)
        optimizer_parameters_mult = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay) and any(nd in n
                                                               for nd in base)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay) and any(nd in n
                                                           for nd in base)
            ],
            'weight_decay_rate':
            0.0
        }]
        optimizer_mult = BERTAdam(optimizer_parameters_mult,
                                  lr=3e-4,
                                  warmup=args.warmup_proportion,
                                  t_total=total_tr)
    if args.do_eval:
        eval_loaders = []
        for i, task in enumerate(task_names):
            eval_examples = processor_list[i].get_dev_examples(args.data_dir +
                                                               task_names[i])
            eval_features = convert_examples_to_features(
                eval_examples, label_list[i], args.max_seq_length, tokenizer,
                task)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label_ids)
            eval_sampler = SequentialSampler(eval_data)
            eval_loaders.append(
                DataLoader(eval_data,
                           sampler=eval_sampler,
                           batch_size=args.eval_batch_size))

    global_step = 0
    if args.do_train:
        loaders = []
        logger.info("  Num Tasks = %d", len(train_examples))
        for i, task in enumerate(task_names):
            train_features = convert_examples_to_features(
                train_examples[i], label_list[i], args.max_seq_length,
                tokenizer, task)
            logger.info("***** training data for %s *****", task)
            logger.info("  Data size = %d", len(train_features))

            all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in train_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.long)

            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_label_ids)
            train_sampler = RandomSampler(train_data)
            loaders.append(
                iter(
                    DataLoader(train_data,
                               sampler=train_sampler,
                               batch_size=args.train_batch_size)))
        total_params = sum(p.numel() for p in model.parameters())
        logger.info("  Num param = {}".format(total_params))
        loaders = [cycle(it) for it in loaders]
        model.train()
        best_score = 0.
        if args.sample == 'sqrt' or args.sample == 'prop':
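            # Task-sampling probabilities proportional to dataset size ('prop', alpha=1)
            # or to its square root ('sqrt', alpha=0.5).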
            probs = num_train_examples
            if args.sample == 'prop':
                alpha = 1.
            if args.sample == 'sqrt':
                alpha = 0.5
            probs = [p**alpha for p in probs]
            tot = sum(probs)
            probs = [p / tot for p in probs]
        epoch = 0
        tr_loss = [0. for i in range(num_tasks)]
        nb_tr_steps = [0 for i in range(num_tasks)]
        nb_tr_instances = [0 for i in range(num_tasks)]
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            if args.sample == 'anneal':
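                # Anneal alpha from 1.0 down to 0.2 across epochs, moving from
                # size-proportional sampling towards more uniform task sampling.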
                probs = num_train_examples
                alpha = 1. - 0.8 * epoch / (args.num_train_epochs - 1)
                probs = [p**alpha for p in probs]
                tot = sum(probs)
                probs = [p / tot for p in probs]

            for step in tqdm(range(steps_per_epoch)):
                task_index = np.random.choice(len(task_names), p=probs)
                task_model_index = task_id_mappings[task_names[task_index]]
                batch = next(loaders[task_index])
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, segment_ids, input_mask,
                                task_model_index, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss[task_index] += loss.item() * input_ids.size(0)
                nb_tr_instances[task_index] += input_ids.size(0)
                nb_tr_steps[task_index] += 1
                # if step % 1000 < num_tasks:
                #     logger.info("Task: {}, Step: {}".format(task_names[task_index], step))
                #     logger.info("Loss: {}".format(tr_loss[task_index]/nb_tr_instances[task_index]))
                tf_writer.add_scalar(
                    'loss/{}'.format(task_names[task_index]),
                    tr_loss[task_index] / nb_tr_instances[task_index],
                    nb_tr_steps[task_index])
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    if args.optim != 'normal':
                        optimizer_mult.step()
                    model.zero_grad()
                    global_step += 1
            epoch += 1
            ev_acc = 0.
            for i, task in enumerate(task_names):
                ev_acc += do_eval(model, logger, args, device, tr_loss[i],
                                  nb_tr_steps[i], global_step,
                                  processor_list[i], label_list[i], tokenizer,
                                  eval_loaders[i], task_id_mappings[task],
                                  task_names, task, output_dir)
            logger.info("Total acc: {}".format(ev_acc))
            if ev_acc > best_score:
                best_score = ev_acc
                model_dir = os.path.join(output_dir, "best_model.pth")
                torch.save(model.state_dict(), model_dir)
                output_eval_file = os.path.join(output_dir,
                                                "best_eval_results.txt")
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Best Eval results *****")
                    writer.write("%s = %s\n" % ('best_score', ev_acc))
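
# Illustrative sketch (not part of the original script): the task-sampling scheme used
# in the training loop above, factored into a standalone helper. The helper name and the
# guard against division by zero for single-epoch runs are hypothetical additions.
def compute_task_sampling_probs(num_train_examples, sample, epoch=0, num_train_epochs=1.0):
    if sample == 'prop':      # proportional to dataset size
        alpha = 1.
    elif sample == 'sqrt':    # square-root scaling dampens large datasets
        alpha = 0.5
    elif sample == 'anneal':  # move from proportional towards more uniform over epochs
        alpha = 1. - 0.8 * epoch / max(num_train_epochs - 1, 1e-9)
    else:
        raise ValueError('unknown sampling scheme: {}'.format(sample))
    probs = [n ** alpha for n in num_train_examples]
    total = sum(probs)
    return [p / total for p in probs]
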
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained BERT model. \n"
                             "This specifies the model architecture.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--vocab_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--discr",
                        default=False,
                        action='store_true',
                        help="Whether to do discriminative fine-tuning.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--accumulate_gradients",
                        type=int,
                        default=1,
                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', 
                        type=int, 
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")                       
    args = parser.parse_args()

    processors = {
        "all":AllProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
                            args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
            args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size * args.num_train_epochs)

    model = BertForSequenceClassification(bert_config, len(label_list))
    if args.init_checkpoint is not None:
        model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))

  #  torch.save(model.bert.state_dict(),"pytorch_model.bin")
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    if args.discr:
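        # Discriminative fine-tuning: lower layers (0-5) use lr/1.5, middle layers (6-10)
        # the base lr, and the top layer (11) lr*1.5; bias/LayerNorm parameters get no weight decay.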
        group1=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.']
        group2=['layer.6.','layer.7.','layer.8.','layer.9.','layer.10.']
        group3=['layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay_rate': 0.01, 'lr': args.learning_rate/1.5},
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay_rate': 0.01, 'lr': args.learning_rate},
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay_rate': 0.01, 'lr': args.learning_rate*1.5},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay_rate': 0.0},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay_rate': 0.0, 'lr': args.learning_rate/1.5},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay_rate': 0.0, 'lr': args.learning_rate},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay_rate': 0.0, 'lr': args.learning_rate*1.5},
        ]
    else:
        optimizer_parameters = [
             {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
             {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
             ]
		
    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)
	
    global_step = 0
    eval_examples = processor.get_dev_examples(args.data_dir)
    eval_features_1, eval_features_2, eval_features_3, eval_features_4 = convert_examples_to_features(
        eval_examples, label_list, args.max_seq_length, tokenizer)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features_1], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features_1], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features_1], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features_1], dtype=torch.long)
    all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in eval_features_1], dtype=torch.long)
    eval_data_1 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids)
    eval_dataloader_1 = DataLoader(eval_data_1, batch_size=args.eval_batch_size, shuffle=False)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features_2], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features_2], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features_2], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features_2], dtype=torch.long)
    all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in eval_features_2], dtype=torch.long)
    eval_data_2 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids)
    eval_dataloader_2 = DataLoader(eval_data_2, batch_size=args.eval_batch_size, shuffle=False)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features_3], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features_3], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features_3], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features_3], dtype=torch.long)
    all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in eval_features_3], dtype=torch.long)
    eval_data_3 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids)
    eval_dataloader_3 = DataLoader(eval_data_3, batch_size=args.eval_batch_size, shuffle=False)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features_4], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features_4], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features_4], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features_4], dtype=torch.long)
    all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in eval_features_4], dtype=torch.long)
    eval_data_4 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids)
    eval_dataloader_4 = DataLoader(eval_data_4, batch_size=args.eval_batch_size, shuffle=False)

    if args.do_train:
        train_features_1, train_features_2, train_features_3, train_features_4 = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features_1], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features_1], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features_1], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features_1], dtype=torch.long)
        all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in train_features_1], dtype=torch.long)
        train_data_1 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids)
        if args.local_rank == -1:
            train_sampler_1 = RandomSampler(train_data_1)
        else:
            train_sampler_1 = DistributedSampler(train_data_1)
        train_dataloader_1 = DataLoader(train_data_1, sampler=train_sampler_1, batch_size=args.train_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in train_features_2], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features_2], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features_2], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features_2], dtype=torch.long)
        all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in train_features_2], dtype=torch.long)
        train_data_2 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids)
        if args.local_rank == -1:
            train_sampler_2 = RandomSampler(train_data_2)
        else:
            train_sampler_2 = DistributedSampler(train_data_2)
        train_dataloader_2 = DataLoader(train_data_2, sampler=train_sampler_2, batch_size=args.train_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in train_features_3], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features_3], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features_3], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features_3], dtype=torch.long)
        all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in train_features_3], dtype=torch.long)
        train_data_3 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids)
        if args.local_rank == -1:
            train_sampler_3 = RandomSampler(train_data_3)
        else:
            train_sampler_3 = DistributedSampler(train_data_3)
        train_dataloader_3 = DataLoader(train_data_3, sampler=train_sampler_3, batch_size=args.train_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in train_features_4], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features_4], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features_4], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features_4], dtype=torch.long)
        all_dataset_label_ids = torch.tensor([f.dataset_label_id for f in train_features_4], dtype=torch.long)
        train_data_4 = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_dataset_label_ids)
        if args.local_rank == -1:
            train_sampler_4 = RandomSampler(train_data_4)
        else:
            train_sampler_4 = DistributedSampler(train_data_4)
        train_dataloader_4 = DataLoader(train_data_4, sampler=train_sampler_4, batch_size=args.train_batch_size)


        print("len(train_features_1)=",len(train_features_1))
        print("len(train_features_2)=",len(train_features_2))
        print("len(train_features_3)=",len(train_features_3))
        print("len(train_features_4)=",len(train_features_4))
        a=[]
        for i in range(int(len(train_features_1)/args.train_batch_size)):
            a.append(1)
        for i in range(int(len(train_features_2)/args.train_batch_size)):
            a.append(2)
        for i in range(int(len(train_features_3)/args.train_batch_size)):
            a.append(3)
        for i in range(int(len(train_features_4)/args.train_batch_size)):
            a.append(4)
        print("len(a)=",len(a))
        random.shuffle(a)
        print("a[:20]=",a[:20])

        epoch=0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            random.shuffle(a)
            print("a[:20]=",a[:20])
            epoch+=1
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            # Create fresh iterators each epoch so every dataset is swept once per epoch
            # (re-creating a DataLoader iterator on each step only ever yields its first batch).
            train_iters = [iter(dl) for dl in (train_dataloader_1, train_dataloader_2,
                                               train_dataloader_3, train_dataloader_4)]
            for step, number in enumerate(tqdm(a, desc="Iteration")):
                batch = next(train_iters[number - 1])
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, dataset_label_id = batch
                loss, _ = model(input_ids, segment_ids, input_mask, label_ids, dataset_label_id)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()    # We have accumulated enough gradients
                    model.zero_grad()
                    global_step += 1

            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.bert.state_dict(), os.path.join(args.output_dir,'pytorch_model'+str(epoch)+'.bin'))

            model.eval()
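            # Evaluate on each of the four dev sets (IMDB, Yelp-P, AG News, DBpedia) in turn,
            # writing per-example predictions to disk and accumulating loss/accuracy.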
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            with open(os.path.join(args.output_dir, "results_imdb_ep_"+str(epoch)+".txt"),"w") as f:
                for input_ids, input_mask, segment_ids, label_ids, dataset_label_id in eval_dataloader_1:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, dataset_label_id)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output in outputs:
                        f.write(str(output)+"\n")
                    tmp_eval_accuracy=np.sum(outputs == label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples
            eval_loss_imdb = eval_loss
            eval_accuracy_imdb = eval_accuracy

            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            with open(os.path.join(args.output_dir, "results_yelp_p_ep_"+str(epoch)+".txt"),"w") as f:
                for input_ids, input_mask, segment_ids, label_ids, dataset_label_id in eval_dataloader_2:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, dataset_label_id)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output in outputs:
                        f.write(str(output)+"\n")
                    tmp_eval_accuracy=np.sum(outputs == label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples
            eval_loss_yelp_p = eval_loss
            eval_accuracy_yelp_p = eval_accuracy

            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            with open(os.path.join(args.output_dir, "results_ag_ep_"+str(epoch)+".txt"),"w") as f:
                for input_ids, input_mask, segment_ids, label_ids, dataset_label_id in eval_dataloader_3:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, dataset_label_id)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output in outputs:
                        f.write(str(output)+"\n")
                    tmp_eval_accuracy=np.sum(outputs == label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples
            eval_loss_ag = eval_loss
            eval_accuracy_ag = eval_accuracy

            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            with open(os.path.join(args.output_dir, "results_dbpedia_ep_"+str(epoch)+".txt"),"w") as f:
                for input_ids, input_mask, segment_ids, label_ids, dataset_label_id in eval_dataloader_4:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, dataset_label_id)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output in outputs:
                        f.write(str(output)+"\n")
                    tmp_eval_accuracy=np.sum(outputs == label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples
            eval_loss_dbpedia = eval_loss
            eval_accuracy_dbpedia = eval_accuracy

            result = {'eval_loss_imdb': eval_loss_imdb,
                      'eval_accuracy_imdb': eval_accuracy_imdb,
                      'eval_loss_yelp_p': eval_loss_yelp_p,
                      'eval_accuracy_yelp_p': eval_accuracy_yelp_p,
                      'eval_loss_ag': eval_loss_ag,
                      'eval_accuracy_ag': eval_accuracy_ag,
                      'eval_loss_dbpedia': eval_loss_dbpedia,
                      'eval_accuracy_dbpedia': eval_accuracy_dbpedia,
                      'global_step': global_step,
                      'loss': tr_loss/nb_tr_steps}

            output_eval_file = os.path.join(args.output_dir, "eval_results_ep_"+str(epoch)+".txt")
            print("output_eval_file=",output_eval_file)
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
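
# Illustrative sketch (not part of the original script): the four near-identical
# evaluation blocks above could be folded into one helper along these lines.
# The helper name is hypothetical; the per-batch logic mirrors the code above.
def evaluate_single_loader(model, dataloader, device, results_path):
    model.eval()
    eval_loss, eval_accuracy = 0., 0.
    nb_eval_steps, nb_eval_examples = 0, 0
    with open(results_path, "w") as f:
        for input_ids, input_mask, segment_ids, label_ids, dataset_label_id in dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, dataset_label_id)
            outputs = np.argmax(logits.detach().cpu().numpy(), axis=1)
            for output in outputs:
                f.write(str(output) + "\n")
            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += np.sum(outputs == label_ids.cpu().numpy())
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
    return eval_loss / nb_eval_steps, eval_accuracy / nb_eval_examples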
Exemplo n.º 12
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--iteration', type=str, default='1')
    parser.add_argument('--fs', type=str, default='local',
                        help='must be `local`. Do not change.')

    # Data paths
    parser.add_argument('--data_dir', default='data/', type=str)
    parser.add_argument("--train_file", default='train-v1.1.json', type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default='dev-v1.1.json', type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument('--gt_file', default='dev-v1.1.json', type=str, help='ground truth file needed for evaluation.')

    # Metadata paths
    parser.add_argument('--metadata_dir', default='metadata/', type=str)
    parser.add_argument("--vocab_file", default='vocab.txt', type=str,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--bert_model_option", default='large_uncased', type=str,
                        help="model architecture option. [large_uncased] or [base_uncased]")
    parser.add_argument("--bert_config_file", default='bert_config.json', type=str,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--init_checkpoint", default='pytorch_model.bin', type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")

    # Output and load paths
    parser.add_argument("--output_dir", default='out/', type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--index_file", default='index.hdf5', type=str, help="index output file.")
    parser.add_argument("--question_emb_file", default='question.hdf5', type=str, help="question output file.")

    parser.add_argument('--load_dir', default='out/', type=str)

    # Local paths (if we want to run cmd)
    parser.add_argument('--eval_script', default='evaluate-v1.1.py', type=str)

    # Do's
    parser.add_argument("--do_load", default=False, action='store_true', help='Do load. If eval, do load automatically')
    parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_train_filter", default=False, action='store_true', help='Train filter or not.')
    parser.add_argument("--do_train_sparse", default=False, action='store_true', help='Train sparse or not.')
    parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument('--do_eval', default=False, action='store_true')
    parser.add_argument('--do_embed_question', default=False, action='store_true')
    parser.add_argument('--do_index', default=False, action='store_true')
    parser.add_argument('--do_serve', default=False, action='store_true')

    # Model options: if you change these, you need to train again
    parser.add_argument("--do_case", default=False, action='store_true',
                        help="Whether to lower case the input text. Should be True for uncased "
                             "models and False for cased models.")
    parser.add_argument('--phrase_size', default=961, type=int)
    parser.add_argument('--metric', default='ip', type=str, help='ip | l2')
    parser.add_argument("--use_sparse", default=False, action='store_true')

    # GPU and memory related options
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--train_batch_size", default=12, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=16, type=int, help="Total batch size for predictions.")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--optimize_on_cpu',
                        default=False,
                        action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")

    # Training options: only effective during training
    parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--num_train_filter_epochs", default=1.0, type=float,
                        help="Total number of training epochs for filter to perform.")
    parser.add_argument("--num_train_sparse_epochs", default=3.0, type=float,
                        help="Total number of training epochs for sparse to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
                             "of training.")
    parser.add_argument("--save_checkpoints_steps", default=1000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop", default=1000, type=int,
                        help="How many steps to make in each estimator call.")

    # Prediction options: only effective during prediction
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")

    # Index Options
    parser.add_argument('--dtype', default='float32', type=str)
    parser.add_argument('--filter_threshold', default=-1e9, type=float)
    parser.add_argument('--compression_offset', default=-2, type=float)
    parser.add_argument('--compression_scale', default=20, type=float)
    parser.add_argument('--split_by_para', default=False, action='store_true')

    # Serve Options
    parser.add_argument('--port', default=9009, type=int)

    # Others
    parser.add_argument('--parallel', default=False, action='store_true')
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--draft', default=False, action='store_true')
    parser.add_argument('--draft_num_examples', type=int, default=12)

    args = parser.parse_args()

    # Filesystem routines
    if args.fs == 'local':
        class Processor(object):
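            # Thin local-filesystem wrapper: bind() registers save/load callbacks, and
            # save()/load() route them to per-checkpoint subdirectories of the given path.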
            def __init__(self, path):
                self._save = None
                self._load = None
                self._path = path

            def bind(self, save, load):
                self._save = save
                self._load = load

            def save(self, checkpoint=None, save_fn=None, **kwargs):
                path = os.path.join(self._path, str(checkpoint))
                if save_fn is None:
                    self._save(path, **kwargs)
                else:
                    save_fn(path, **kwargs)

            def load(self, checkpoint, load_fn=None, session=None, **kwargs):
                assert self._path == session
                path = os.path.join(self._path, str(checkpoint), 'model.pt')
                if load_fn is None:
                    self._load(path, **kwargs)
                else:
                    load_fn(path, **kwargs)

        processor = Processor(args.load_dir)
    else:
        raise ValueError(args.fs)

    if not args.do_train:
        args.do_load = True

    # Configure paths
    args.train_file = os.path.join(args.data_dir, args.train_file)
    args.predict_file = os.path.join(args.data_dir, args.predict_file)
    args.gt_file = os.path.join(args.data_dir, args.gt_file)

    args.bert_config_file = os.path.join(args.metadata_dir, args.bert_config_file.replace(".json", "") +
                                         "_" + args.bert_model_option + ".json")
    args.init_checkpoint = os.path.join(args.metadata_dir, args.init_checkpoint.replace(".bin", "") +
                                        "_" + args.bert_model_option + ".bin")
    args.vocab_file = os.path.join(args.metadata_dir, args.vocab_file)
    args.index_file = os.path.join(args.output_dir, args.index_file)

    # Multi-GPU stuff
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    # Seed for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        # raise ValueError("Output directory () already exists and is not empty.")
        pass
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=not args.do_case)

    model = BertPhraseModel(
        bert_config,
        phrase_size=args.phrase_size,
        metric=args.metric,
        use_sparse=args.use_sparse
    )

    print('Number of model parameters:', sum(p.numel() for p in model.parameters()))

    if not args.do_load and args.init_checkpoint is not None:
        state_dict = torch.load(args.init_checkpoint, map_location='cpu')
        # The block below is for Korean BERT checkpoint compatibility: strip the 'bert.' prefix.
        if next(iter(state_dict)).startswith('bert.'):
            state_dict = {key[len('bert.'):]: val for key, val in state_dict.items()}
            state_dict = {key: val for key, val in state_dict.items() if key in model.encoder.bert_model.state_dict()}
        model.encoder.bert.load_state_dict(state_dict)

    if args.fp16:
        model.half()

    if not args.optimize_on_cpu:
        model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif args.parallel or n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_load:
        bind_model(processor, model)
        processor.load(args.iteration, session=args.load_dir)

    if args.do_train:
        train_examples = read_squad_examples(
            input_file=args.train_file, is_training=True, draft=args.draft, draft_num_examples=args.draft_num_examples)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
        ]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

        bind_model(processor, model, optimizer)

        global_step = 0
        train_features, train_features_ = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)

        train_features = inject_noise_to_features_list(train_features,
                                                       clamp=True,
                                                       replace=True,
                                                       shuffle=True)
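        # Noise injection (clamping, token replacement, shuffling) is presumably used here
        # as data augmentation / regularization for the phrase encoder.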

        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)

        all_input_ids_ = torch.tensor([f.input_ids for f in train_features_], dtype=torch.long)
        all_input_mask_ = torch.tensor([f.input_mask for f in train_features_], dtype=torch.long)

        if args.fp16:
            # Only the attention masks are cast to half precision; token ids and
            # start/end positions must remain torch.long for the embedding lookup
            # and the cross-entropy targets.
            all_input_mask = all_input_mask.half()
            all_input_mask_ = all_input_mask_.half()

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_input_ids_, all_input_mask_,
                                   all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for epoch in range(int(args.num_train_epochs)):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch %d" % (epoch + 1))):
                batch = tuple(t.to(device) for t in batch)
                (input_ids, input_mask,
                 input_ids_, input_mask_,
                 start_positions, end_positions) = batch
                loss, _ = model(input_ids, input_mask,
                                input_ids_, input_mask_,
                                start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.optimize_on_cpu:
                        model.to('cpu')
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    if args.optimize_on_cpu:
                        model.to(device)
                    global_step += 1

            processor.save(epoch + 1)

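    # Train only the filter head: plain Adam restricted to model.filter parameters, using the model's second loss output.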
    if args.do_train_filter:
        train_examples = read_squad_examples(
            input_file=args.train_file, is_training=True, draft=args.draft, draft_num_examples=args.draft_num_examples)
        num_train_steps = int(
            len(
                train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_filter_epochs)

        if args.parallel or n_gpu > 1:
            optimizer = Adam(model.module.filter.parameters())
        else:
            optimizer = Adam(model.filter.parameters())

        bind_model(processor, model, optimizer)

        global_step = 0
        train_features, train_features_ = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("***** Running filter training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)

        all_input_ids_ = torch.tensor([f.input_ids for f in train_features_], dtype=torch.long)
        all_input_mask_ = torch.tensor([f.input_mask for f in train_features_], dtype=torch.long)

        if args.fp16:
            # Only the attention masks are cast to half precision; token ids and
            # start/end positions must remain torch.long for the embedding lookup
            # and the cross-entropy targets.
            all_input_mask = all_input_mask.half()
            all_input_mask_ = all_input_mask_.half()

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_input_ids_, all_input_mask_,
                                   all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for epoch in range(int(args.num_train_filter_epochs)):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch %d" % (epoch + 1))):
                batch = tuple(t.to(device) for t in batch)
                (input_ids, input_mask,
                 input_ids_, input_mask_,
                 start_positions, end_positions) = batch
                _, loss = model(input_ids, input_mask,
                                input_ids_, input_mask_,
                                start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.optimize_on_cpu:
                        model.to('cpu')
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    if args.optimize_on_cpu:
                        model.to(device)
                    global_step += 1

            processor.save(epoch + 1)

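    # Train the sparse components: BERTAdam over every parameter except the filter head.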
    if args.do_train_sparse:
        train_examples = read_squad_examples(
            input_file=args.train_file, is_training=True, draft=args.draft, draft_num_examples=args.draft_num_examples)
        num_train_steps = int(
            len(
                train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_sparse_epochs)

        '''
        if args.parallel or n_gpu > 1:
            optimizer = Adam(model.module.sparse_layer.parameters())
        else:
            optimizer = Adam(model.sparse_layer.parameters())
        '''

        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [
            # substring check on the dotted parameter names; the filter head is excluded entirely
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay) and ('filter' not in n)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay) and ('filter' not in n)],
             'weight_decay_rate': 0.0}
        ]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

        bind_model(processor, model, optimizer)

        global_step = 0
        train_features, train_features_ = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("***** Running sparse training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)

        all_input_ids_ = torch.tensor([f.input_ids for f in train_features_], dtype=torch.long)
        all_input_mask_ = torch.tensor([f.input_mask for f in train_features_], dtype=torch.long)

        if args.fp16:
            # Only the attention masks are cast to half precision; token ids and
            # start/end positions must remain torch.long for the embedding lookup
            # and the cross-entropy targets.
            all_input_mask = all_input_mask.half()
            all_input_mask_ = all_input_mask_.half()

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_input_ids_, all_input_mask_,
                                   all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for epoch in range(int(args.num_train_sparse_epochs)):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch %d" % (epoch + 1))):
                batch = tuple(t.to(device) for t in batch)
                (input_ids, input_mask,
                 input_ids_, input_mask_,
                 start_positions, end_positions) = batch
                loss, _ = model(input_ids, input_mask,
                                input_ids_, input_mask_,
                                start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.optimize_on_cpu:
                        model.to('cpu')
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    if args.optimize_on_cpu:
                        model.to(device)
                    global_step += 1

            processor.save(epoch + 1)

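    # Prediction: run the model over the prediction file, write predictions.json, and optionally score it with the eval script.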
    if args.do_predict:
        eval_examples = read_squad_examples(
            input_file=args.predict_file, is_training=False, draft=args.draft,
            draft_num_examples=args.draft_num_examples)
        eval_features, query_eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_input_ids_ = torch.tensor([f.input_ids for f in query_eval_features], dtype=torch.long)
        all_input_mask_ = torch.tensor([f.input_mask for f in query_eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        if args.fp16:
            # Only the attention masks are cast; token ids and example indices stay torch.long.
            all_input_mask = all_input_mask.half()
            all_input_mask_ = all_input_mask_.half()

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_input_ids_, all_input_mask_,
                                  all_example_index)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        logger.info("Start evaluating")

        def get_results():
            for (input_ids, input_mask, input_ids_, input_mask_, example_indices) in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                input_ids_ = input_ids_.to(device)
                input_mask_ = input_mask_.to(device)
                with torch.no_grad():
                    batch_all_logits, bs, be = model(input_ids, input_mask, input_ids_, input_mask_)
                for i, example_index in enumerate(example_indices):
                    all_logits = batch_all_logits[i].detach().cpu().numpy()
                    filter_start_logits = bs[i].detach().cpu().numpy()
                    filter_end_logits = be[i].detach().cpu().numpy()
                    eval_feature = eval_features[example_index.item()]
                    unique_id = int(eval_feature.unique_id)
                    yield RawResult(unique_id=unique_id,
                                    all_logits=all_logits,
                                    filter_start_logits=filter_start_logits,
                                    filter_end_logits=filter_end_logits)

        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        write_predictions(eval_examples, eval_features, get_results(),
                          args.max_answer_length,
                          not args.do_case, output_prediction_file, args.verbose_logging,
                          args.filter_threshold)

        if args.do_eval:
            command = "python %s %s %s" % (args.eval_script, args.gt_file, output_prediction_file)
            import subprocess
            process = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
            output, error = process.communicate()

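    # Question embedding: encode the questions only and write their vectors to args.question_emb_file.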
    if args.do_embed_question:
        question_examples = read_squad_examples(
            question_only=True,
            input_file=args.predict_file, is_training=False, draft=args.draft,
            draft_num_examples=args.draft_num_examples)
        query_eval_features = convert_questions_to_features(
            examples=question_examples,
            tokenizer=tokenizer,
            max_query_length=args.max_query_length)
        question_dataloader = convert_question_features_to_dataloader(query_eval_features, args.fp16, args.local_rank,
                                                                      args.predict_batch_size)

        model.eval()
        logger.info("Start embedding")
        question_results = get_question_results_(question_examples, query_eval_features, question_dataloader, device,
                                                 model)
        path = os.path.join(args.output_dir, args.question_emb_file)
        print('Writing %s' % path)
        write_question_results(question_results, query_eval_features, path)

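    # Indexing: encode document chunks (predict_file may be a "start:end" shard range) and dump start/end vectors, span logits, and optional sparse features to an HDF5 index.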
    if args.do_index:
        if ':' not in args.predict_file:
            predict_files = [args.predict_file]
            offsets = [0]
        else:
            dirname = os.path.dirname(args.predict_file)
            basename = os.path.basename(args.predict_file)
            start, end = list(map(int, basename.split(':')))

            # skip files if possible
            if os.path.exists(args.index_file):
                with h5py.File(args.index_file, 'r') as f:
                    dids = list(map(int, f.keys()))
                start = int(max(dids) / 1000)
                print('%s exists; starting from %d' % (args.index_file, start))

            names = [str(i).zfill(4) for i in range(start, end)]
            predict_files = [os.path.join(dirname, name) for name in names]
            offsets = [int(each) * 1000 for each in names]

        for offset, predict_file in zip(offsets, predict_files):
            try:
                context_examples = read_squad_examples(
                    context_only=True,
                    input_file=predict_file, is_training=False, draft=args.draft,
                    draft_num_examples=args.draft_num_examples)

                for example in context_examples:
                    example.doc_idx += offset

                context_features = convert_documents_to_features(
                    examples=context_examples,
                    tokenizer=tokenizer,
                    max_seq_length=args.max_seq_length,
                    doc_stride=args.doc_stride)

                logger.info("***** Running indexing on %s *****" % predict_file)
                logger.info("  Num orig examples = %d", len(context_examples))
                logger.info("  Num split examples = %d", len(context_features))
                logger.info("  Batch size = %d", args.predict_batch_size)

                all_input_ids = torch.tensor([f.input_ids for f in context_features], dtype=torch.long)
                all_input_mask = torch.tensor([f.input_mask for f in context_features], dtype=torch.long)
                all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
                if args.fp16:
                    # Only the attention mask is cast; token ids and example indices stay torch.long.
                    all_input_mask = all_input_mask.half()

                context_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)

                if args.local_rank == -1:
                    context_sampler = SequentialSampler(context_data)
                else:
                    context_sampler = DistributedSampler(context_data)
                context_dataloader = DataLoader(context_data, sampler=context_sampler,
                                                batch_size=args.predict_batch_size)

                model.eval()
                logger.info("Start indexing")

                def get_context_results():
                    for (input_ids, input_mask, example_indices) in context_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        with torch.no_grad():
                            batch_start, batch_end, batch_span_logits, bs, be, batch_sparse = model(input_ids,
                                                                                                    input_mask)
                        for i, example_index in enumerate(example_indices):
                            start = batch_start[i].detach().cpu().numpy().astype(args.dtype)
                            end = batch_end[i].detach().cpu().numpy().astype(args.dtype)
                            sparse = None
                            if batch_sparse is not None:
                                sparse = batch_sparse[i].detach().cpu().numpy().astype(args.dtype)
                            span_logits = batch_span_logits[i].detach().cpu().numpy().astype(args.dtype)
                            filter_start_logits = bs[i].detach().cpu().numpy().astype(args.dtype)
                            filter_end_logits = be[i].detach().cpu().numpy().astype(args.dtype)
                            context_feature = context_features[example_index.item()]
                            unique_id = int(context_feature.unique_id)
                            yield ContextResult(unique_id=unique_id,
                                                start=start,
                                                end=end,
                                                span_logits=span_logits,
                                                filter_start_logits=filter_start_logits,
                                                filter_end_logits=filter_end_logits,
                                                sparse=sparse)

                t0 = time()
                write_hdf5(context_examples, context_features, get_context_results(),
                           args.max_answer_length, not args.do_case, args.index_file, args.filter_threshold,
                           args.verbose_logging,
                           offset=args.compression_offset, scale=args.compression_scale,
                           split_by_para=args.split_by_para,
                           use_sparse=args.use_sparse)
                print('%s: %.1f mins' % (predict_file, (time() - t0) / 60))
            except Exception as e:
                with open(os.path.join(args.output_dir, 'error_files.txt'), 'a') as fp:
                    fp.write('error file: %s\n' % predict_file)
                    fp.write('error message: %s\n' % str(e))

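    # Serving: expose a small endpoint that embeds a single question and returns its start/end vectors and span logit.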
    if args.do_serve:
        def get(text):
            question_examples = [SquadExample(qas_id='serve', question_text=text)]
            query_eval_features = convert_questions_to_features(
                examples=question_examples,
                tokenizer=tokenizer,
                max_query_length=16)
            question_dataloader = convert_question_features_to_dataloader(query_eval_features, args.fp16,
                                                                          args.local_rank,
                                                                          args.predict_batch_size)

            model.eval()

            question_results = get_question_results_(question_examples, query_eval_features, question_dataloader,
                                                     device, model)
            question_result = next(iter(question_results))
            out = question_result.start.tolist(), question_result.end.tolist(), question_result.span_logit.tolist()
            return out

        serve(get, args.port)
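
All of these scripts build BERTAdam parameter groups that exempt biases and LayerNorm parameters (named gamma/beta in this codebase) from weight decay. Because named_parameters() yields full dotted names, the exemption has to be a substring test rather than list membership. A minimal, self-contained sketch of that grouping, assuming only a generic torch.nn.Module and the weight_decay_rate key that BERTAdam expects:

from torch import nn

def group_parameters_for_weight_decay(model: nn.Module, weight_decay: float = 0.01):
    """Return optimizer parameter groups with biases/LayerNorm params exempt from weight decay."""
    no_decay = ('bias', 'gamma', 'beta')  # same markers used in the scripts above
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    return [
        {'params': decay_params, 'weight_decay_rate': weight_decay},
        {'params': no_decay_params, 'weight_decay_rate': 0.0},
    ]

# Toy usage; in the scripts above the result would be passed to BERTAdam as its first argument.
if __name__ == '__main__':
    toy = nn.Sequential(nn.Linear(4, 4), nn.LayerNorm(4))
    print([len(group['params']) for group in group_parameters_for_weight_decay(toy)])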
Exemplo n.º 13
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default='data/semeval2015/three_joint/TO/',
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--output_dir",
        default='results/semeval2015/three_joint/TO/my_result',
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )
    parser.add_argument(
        "--vocab_file",
        default='uncased_L-12_H-768_A-12/vocab.txt',
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--bert_config_file",
        default='uncased_L-12_H-768_A-12/bert_config.json',
        type=str,
        required=True,
        help=
        "The config json file corresponding to the pre-trained BERT model. \n"
        "This specifies the model architecture.")
    parser.add_argument(
        "--init_checkpoint",
        default='uncased_L-12_H-768_A-12/pytorch_model.bin',
        type=str,
        required=True,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--tokenize_method",
        default='word_split',
        type=str,
        required=True,
        choices=["prefix_match", "unk_replace", "word_split"],
        help=
        "how to solve the unknow words, max prefix match or replace with [UNK] or split to some words"
    )
    parser.add_argument("--use_crf",
                        default=True,
                        required=True,
                        action='store_true',
                        help="Whether to use CRF after Bert sequence_output")

    ## Other parameters
    parser.add_argument("--eval_test",
                        default=True,
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--train_batch_size",
                        default=24,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=30.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    processor = Semeval_Processor()
    label_list = processor.get_labels()
    ner_label_list = processor.get_ner_labels(
        args.data_dir)  # BIO or TO tagging scheme for the NER labels

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file,
        tokenize_method=args.tokenize_method,
        do_lower_case=args.do_lower_case)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    # training set
    train_examples = None
    num_train_steps = None
    train_examples = processor.get_train_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size * args.num_train_epochs)

    train_features, _, _ = convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer,
        ner_label_list, args.tokenize_method)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)
    all_ner_label_ids = torch.tensor([f.ner_label_ids for f in train_features],
                                     dtype=torch.long)
    all_ner_mask = torch.tensor([f.ner_mask for f in train_features],
                                dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids, all_ner_label_ids, all_ner_mask)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # test set
    if args.eval_test:
        test_examples = processor.get_test_examples(args.data_dir)
        test_features, test_tokens, all_b_tokens = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, tokenizer,
            ner_label_list, args.tokenize_method)
        print("test_tokens")
        print(test_tokens)

        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in test_features],
                                     dtype=torch.long)
        all_ner_label_ids = torch.tensor(
            [f.ner_label_ids for f in test_features], dtype=torch.long)
        all_ner_mask = torch.tensor([f.ner_mask for f in test_features],
                                    dtype=torch.long)

        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids,
                                  all_ner_label_ids, all_ner_mask)
        test_dataloader = DataLoader(test_data,
                                     batch_size=args.eval_batch_size,
                                     shuffle=False)

    # model and optimizer

    if args.use_crf:
        model = BertForTABSAJoint_CRF(bert_config, len(label_list),
                                      len(ner_label_list))
    else:
        model = BertForTABSAJoint(bert_config, len(label_list),
                                  len(ner_label_list), args.max_seq_length)

    if args.init_checkpoint is not None:
        model.bert.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]

    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    # train
    output_log_file = os.path.join(args.output_dir, "log.txt")
    print("output_log_file=", output_log_file)
    with open(output_log_file, "w") as writer:
        if args.eval_test:
            writer.write(
                "epoch\tglobal_step\tloss\ttest_loss\ttest_accuracy\n")
        else:
            writer.write("epoch\tglobal_step\tloss\n")

    global_step = 0
    epoch = 0
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        epoch += 1
        model.train()
        tr_loss = 0
        tr_ner_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, ner_label_ids, ner_mask = batch
            loss, ner_loss, _, _ = model(input_ids, segment_ids, input_mask,
                                         label_ids, ner_label_ids, ner_mask)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
                ner_loss = ner_loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
                ner_loss = ner_loss / args.gradient_accumulation_steps
            loss.backward(retain_graph=True)
            ner_loss.backward()

            tr_loss += loss.item()
            tr_ner_loss += ner_loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()  # We have accumulated enough gradients
                model.zero_grad()
                global_step += 1

        torch.save(model.state_dict(),
                   os.path.join(args.output_dir, f'model_ep_{epoch}.bin'))
        # eval_test
        if args.eval_test:

            model.eval()
            test_loss, test_accuracy = 0, 0
            ner_test_loss = 0
            nb_test_steps, nb_test_examples = 0, 0
            with open(
                    os.path.join(args.output_dir,
                                 "test_ep_" + str(epoch) + ".txt"),
                    "w") as f_test:
                f_test.write(
                    'yes_not\tyes_not_pre\tsentence\ttrue_ner\tpredict_ner\n')
                batch_index = 0
                for input_ids, input_mask, segment_ids, label_ids, ner_label_ids, ner_mask in test_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)
                    ner_label_ids = ner_label_ids.to(device)
                    ner_mask = ner_mask.to(device)
                    # test_tokens holds the original words of each sentence: [batch_size, sequence_length]
                    ner_test_tokens = test_tokens[batch_index *
                                                  args.eval_batch_size:
                                                  (batch_index + 1) *
                                                  args.eval_batch_size]
                    ner_b_tokens = all_b_tokens[batch_index *
                                                args.eval_batch_size:
                                                (batch_index + 1) *
                                                args.eval_batch_size]

                    batch_index += 1

                    with torch.no_grad():
                        tmp_test_loss, tmp_ner_test_loss, logits, ner_predict = model(
                            input_ids, segment_ids, input_mask, label_ids,
                            ner_label_ids, ner_mask)

                    # category & polarity
                    logits = F.softmax(logits, dim=-1)
                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)

                    if args.use_crf:
                        # CRF
                        ner_logits = ner_predict
                    else:
                        # softmax
                        ner_logits = torch.argmax(F.log_softmax(ner_predict,
                                                                dim=2),
                                                  dim=2)
                        ner_logits = ner_logits.detach().cpu().numpy()

                    ner_label_ids = ner_label_ids.to('cpu').numpy()
                    ner_mask = ner_mask.to('cpu').numpy()
                    print("outputs")
                    print(outputs)
                    print("ner_label_list")
                    print(ner_label_list)
                    print("ner_test_tokens")
                    print(ner_test_tokens)
                    print("ner_b_tokens")
                    print(ner_b_tokens)
                    print("label_ids")
                    print(label_ids)
                    print("ner_logits")
                    print(ner_logits)

                    for output_i in range(len(outputs)):
                        # category & polarity
                        f_test.write(str(label_ids[output_i]))
                        f_test.write('\t')
                        f_test.write(str(outputs[output_i]))
                        f_test.write('\t')

                        # sentence & ner labels
                        sentence_clean = []
                        aspect_sentiment_clean = []
                        label_true = []
                        label_pre = []
                        sentence_len = len(ner_test_tokens[output_i])
                        aspect_sentiment_len = len(ner_b_tokens[output_i])
                        for i in range(aspect_sentiment_len):
                            if not ner_b_tokens[output_i][i].startswith('##'):
                                aspect_sentiment_clean.append(
                                    ner_b_tokens[output_i][i])

                        for i in range(sentence_len):
                            if not ner_test_tokens[output_i][i].startswith(
                                    '##'):
                                sentence_clean.append(
                                    ner_test_tokens[output_i][i])
                                label_true.append(
                                    ner_label_list[ner_label_ids[output_i][i]])
                                label_pre.append(
                                    ner_label_list[ner_logits[output_i][i]])

                        f_test.write(' '.join(sentence_clean))
                        f_test.write('\t')
                        f_test.write(' '.join(aspect_sentiment_clean))
                        f_test.write('\t')
                        f_test.write(' '.join(label_true))
                        f_test.write("\t")
                        f_test.write(' '.join(label_pre))
                        f_test.write("\n")
                    tmp_test_accuracy = np.sum(outputs == label_ids)
                    test_loss += tmp_test_loss.mean().item()
                    ner_test_loss += tmp_ner_test_loss.mean().item()
                    test_accuracy += tmp_test_accuracy

                    nb_test_examples += input_ids.size(0)
                    nb_test_steps += 1

            test_loss = test_loss / nb_test_steps
            ner_test_loss = ner_test_loss / nb_test_steps
            test_accuracy = test_accuracy / nb_test_examples

        result = collections.OrderedDict()
        if args.eval_test:
            result = {
                'epoch': epoch,
                'global_step': global_step,
                'loss': tr_loss / nb_tr_steps,
                'test_loss': test_loss,
                'ner_test_loss': ner_test_loss,
                'test_accuracy': test_accuracy
            }
        else:
            result = {
                'epoch': epoch,
                'global_step': global_step,
                'loss': tr_loss / nb_tr_steps,
                'ner_loss': tr_ner_loss / nb_tr_steps
            }

        logger.info("***** Eval results *****")
        with open(output_log_file, "a+") as writer:
            for key in result.keys():
                logger.info("  %s = %s\n", key, str(result[key]))
                writer.write("%s\t" % (str(result[key])))
            writer.write("\n")
Exemplo n.º 14
0
def main():
    # parse command-line arguments
    config.parse()
    args = config.args
    for k,v in vars(args).items():
        logger.info(f"{k}:{v}")
    #set seeds
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)

    #arguments check
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
    forward_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size

    # prepare the task
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    # e.g. MNLI: ['contradiction', 'entailment', 'neutral']
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Student configuration
    if args.model_architecture == "electra":
        # load the ElectraConfig from the transformers package
        bert_config_S = ElectraConfig.from_json_file(args.bert_config_file_S)
        # (args.output_encoded_layers == 'true') --> True; output the hidden states by default
        bert_config_S.output_hidden_states = (args.output_encoded_layers == 'true')
        bert_config_S.output_attentions = (args.output_attention_layers=='true')
        # num_labels: number of classes
        bert_config_S.num_labels = num_labels
        assert args.max_seq_length <= bert_config_S.max_position_embeddings
    elif args.model_architecture == "albert":
        # load the AlbertConfig from the transformers package
        bert_config_S = AlbertConfig.from_json_file(args.bert_config_file_S)
        # (args.output_encoded_layers == 'true') --> True; output the hidden states by default
        bert_config_S.output_hidden_states = (args.output_encoded_layers == 'true')
        bert_config_S.output_attentions = (args.output_attention_layers=='true')
        # num_labels: number of classes
        bert_config_S.num_labels = num_labels
        assert args.max_seq_length <= bert_config_S.max_position_embeddings
    else:
        bert_config_S = BertConfig.from_json_file(args.bert_config_file_S)
        assert args.max_seq_length <= bert_config_S.max_position_embeddings

    #read data
    train_dataset = None
    eval_datasets  = None
    num_train_steps = None
    # both electra and bert use the BERT tokenizer
    tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
    # load the datasets and compute the number of training steps
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        if args.aux_task_name:
            aux_train_dataset = load_and_cache_examples(args, args.aux_task_name, tokenizer, evaluate=False, is_aux=True)
            train_dataset = torch.utils.data.ConcatDataset([train_dataset, aux_train_dataset])
        num_train_steps = int(len(train_dataset)/args.train_batch_size) * args.num_train_epochs
        logger.info("训练数据集已加载")
    if args.do_predict:
        eval_datasets = []
        eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
        for eval_task in eval_task_names:
            eval_datasets.append(load_and_cache_examples(args, eval_task, tokenizer, evaluate=True))
        logger.info("预测数据集已加载")


    # Student model
    if args.model_architecture == "electra":
        # only the student model is used; this effectively trains a single (teacher-style) model
        model_S = ElectraSPC(bert_config_S)
    elif args.model_architecture == "albert":
        model_S = AlbertSPC(bert_config_S)
    else:
        # only the student model is used; this effectively trains a single (teacher-style) model
        model_S = BertSPCSimple(bert_config_S, num_labels=num_labels,args=args)
    # initialize the loaded student model's parameters; the student model is used for prediction
    if args.load_model_type=='bert' and args.model_architecture not in ["electra", "albert"]:
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
        if args.only_load_embedding:
            state_weight = {k[5:]:v for k,v in state_dict_S.items() if k.startswith('bert.embeddings')}
            missing_keys,_ = model_S.bert.load_state_dict(state_weight,strict=False)
            logger.info(f"Missing keys {list(missing_keys)}")
        else:
            state_weight = {k[5:]:v for k,v in state_dict_S.items() if k.startswith('bert.')}
            missing_keys,_ = model_S.bert.load_state_dict(state_weight,strict=False)
            print(f"missing_keys,注意丢失的参数{missing_keys}")
        logger.info("Model loaded")
    elif args.load_model_type=='all':
        assert args.tuned_checkpoint_S is not None
        state_dict_S = torch.load(args.tuned_checkpoint_S,map_location='cpu')
        model_S.load_state_dict(state_dict_S)
        logger.info("Model loaded")
    elif args.model_architecture in ["electra", "albert"]:
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
        missing_keys, unexpected_keys = model_S.load_state_dict(state_dict_S,strict=False)
        logger.info(f"missing keys:{missing_keys}")
        logger.info(f"unexpected keys:{unexpected_keys}")
    else:
        logger.info("Model is randomly initialized.")
    # move the model to the device
    model_S.to(device)
    if args.local_rank != -1 or n_gpu > 1:
        if args.local_rank != -1:
            raise NotImplementedError
        elif n_gpu > 1:
            model_S = torch.nn.DataParallel(model_S) #,output_device=n_gpu-1)

    if args.do_train:
        # params is the list of all named parameters of the model
        params = list(model_S.named_parameters())
        all_trainable_params = divide_parameters(params, lr=args.learning_rate)
        logger.info("Number of trainable parameter groups (decay and no-decay): %d", len(all_trainable_params))
        # optimizer setup
        optimizer = BERTAdam(all_trainable_params,lr=args.learning_rate,
                             warmup=args.warmup_proportion,t_total=num_train_steps,schedule=args.schedule,
                             s_opt1=args.s_opt1, s_opt2=args.s_opt2, s_opt3=args.s_opt3)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Forward batch size = %d", forward_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        ########### TRAINING CONFIGURATION ###########
        train_config = TrainingConfig(
            gradient_accumulation_steps = args.gradient_accumulation_steps,
            ckpt_frequency = args.ckpt_frequency,
            log_dir = args.output_dir,
            output_dir = args.output_dir,
            device = args.device)

        # initialize the trainer for plain supervised training (not distillation); it can train model_S into a teacher model
        distiller = BasicTrainer(train_config = train_config,
                                 model = model_S,
                                 adaptor = BertForGLUESimpleAdaptorTraining)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            raise NotImplementedError
        # training dataloader
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.forward_batch_size,drop_last=True)
        # callback run on the eval datasets
        callback_func = partial(predict, eval_datasets=eval_datasets, args=args)
        with distiller:
            # start training
            distiller.train(optimizer, scheduler=None, dataloader=train_dataloader,
                              num_epochs=args.num_train_epochs, callback=callback_func)

    if not args.do_train and args.do_predict:
        res = predict(model_S,eval_datasets,step=0,args=args)
        print(res)
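
A pattern that recurs in these examples is loading a checkpoint whose keys carry a 'bert.' prefix into just the bert sub-module: strip the prefix, keep only the matching keys, and load non-strictly. A small sketch of that helper; the checkpoint path and the sub-module in the usage line are purely illustrative:

import torch
from torch import nn

def load_prefixed_state_dict(submodule: nn.Module, checkpoint_path: str, prefix: str = 'bert.'):
    """Load only the keys starting with `prefix` into `submodule`, with the prefix stripped."""
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    stripped = {k[len(prefix):]: v for k, v in state_dict.items() if k.startswith(prefix)}
    missing, unexpected = submodule.load_state_dict(stripped, strict=False)
    print(f"missing keys: {missing}")
    print(f"unexpected keys: {unexpected}")

# Hypothetical usage mirroring the scripts above:
# load_prefixed_state_dict(model_S.bert, 'pytorch_model.bin')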
Exemplo n.º 15
0
def main():
    #parse arguments
    config.parse()
    args = config.args
    for k,v in vars(args).items():
        logger.info(f"{k}:{v}")
    #set seeds
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)

    #arguments check
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
    forward_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size

    #load bert config
    bert_config_T = BertConfig.from_json_file(args.bert_config_file_T)
    bert_config_S = BertConfig.from_json_file(args.bert_config_file_S)
    assert args.max_seq_length <= bert_config_T.max_position_embeddings
    assert args.max_seq_length <= bert_config_S.max_position_embeddings

    #Prepare GLUE task
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    #read data
    train_dataset = None
    eval_datasets  = None
    num_train_steps = None
    tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        if args.aux_task_name:
            aux_train_dataset = load_and_cache_examples(args, args.aux_task_name, tokenizer, evaluate=False, is_aux=True)
            train_dataset = torch.utils.data.ConcatDataset([train_dataset, aux_train_dataset])
        num_train_steps = int(len(train_dataset)/args.train_batch_size) * args.num_train_epochs
    if args.do_predict:
        eval_datasets = []
        eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
        for eval_task in eval_task_names:
            eval_datasets.append(load_and_cache_examples(args, eval_task, tokenizer, evaluate=True))
    logger.info("Data loaded")


    #Build Model and load checkpoint

    model_S = BertForGLUESimple(bert_config_S, num_labels=num_labels,args=args)
    #Load teacher
    if args.tuned_checkpoint_Ts:
        model_Ts = [BertForGLUESimple(bert_config_T, num_labels=num_labels,args=args) for i in range(len(args.tuned_checkpoint_Ts))]
        for model_T, ckpt_T in zip(model_Ts,args.tuned_checkpoint_Ts):
            logger.info("Load state dict %s" % ckpt_T)
            state_dict_T = torch.load(ckpt_T, map_location='cpu')
            model_T.load_state_dict(state_dict_T)
            model_T.eval()
    else:
        assert args.do_predict is True
    #Load student
    if args.load_model_type=='bert':
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
        state_weight = {k[5:]:v for k,v in state_dict_S.items() if k.startswith('bert.')}
        missing_keys,_ = model_S.bert.load_state_dict(state_weight,strict=False)
        assert len(missing_keys)==0
    elif args.load_model_type=='all':
        assert args.tuned_checkpoint_S is not None
        state_dict_S = torch.load(args.tuned_checkpoint_S,map_location='cpu')
        model_S.load_state_dict(state_dict_S)
    else:
        logger.info("Model is randomly initialized.")
    if args.do_train:
        for model_T in model_Ts:
            model_T.to(device)
    model_S.to(device)

    if args.local_rank != -1 or n_gpu > 1:
        if args.local_rank != -1:
            raise NotImplementedError
        elif n_gpu > 1:
            if args.do_train:
                model_Ts = [torch.nn.DataParallel(model_T) for model_T in model_Ts]
            model_S = torch.nn.DataParallel(model_S) #,output_device=n_gpu-1)

    if args.do_train:
        #parameters
        params = list(model_S.named_parameters())
        all_trainable_params = divide_parameters(params, lr=args.learning_rate)
        logger.info("Length of all_trainable_params: %d", len(all_trainable_params))

        optimizer = BERTAdam(all_trainable_params,lr=args.learning_rate,
                             warmup=args.warmup_proportion,t_total=num_train_steps,schedule=args.schedule,
                             s_opt1=args.s_opt1, s_opt2=args.s_opt2, s_opt3=args.s_opt3)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Forward batch size = %d", forward_batch_size)
        logger.info("  Num backward steps = %d", num_train_steps)

        ########### DISTILLATION ###########
        train_config = TrainingConfig(
            gradient_accumulation_steps = args.gradient_accumulation_steps,
            ckpt_frequency = args.ckpt_frequency,
            log_dir = args.output_dir,
            output_dir = args.output_dir,
            device = args.device)

        distill_config = DistillationConfig(
            temperature = args.temperature,
            kd_loss_type = 'ce')

        logger.info(f"{train_config}")
        logger.info(f"{distill_config}")
        adaptor = partial(BertForGLUESimpleAdaptor, no_logits=False, no_mask = False)


        distiller = MultiTeacherDistiller(train_config = train_config,
                            distill_config = distill_config,
                            model_T = model_Ts, model_S = model_S,
                            adaptor_T=adaptor,
                            adaptor_S=adaptor)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            raise NotImplementedError
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.forward_batch_size,drop_last=True)
        callback_func = partial(predict, eval_datasets=eval_datasets, args=args)
        with distiller:
            distiller.train(optimizer, scheduler=None, dataloader=train_dataloader,
                              num_epochs=args.num_train_epochs, callback=callback_func)

    if not args.do_train and args.do_predict:
        res = predict(model_S,eval_datasets,step=0,args=args)
        print (res)
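
The distiller above is configured with a temperature and a cross-entropy KD loss between teacher and student logits. Independent of the distiller classes used here, the core of that objective is the standard soft-target loss; a minimal sketch, where the temperature value and the averaging over several teachers are illustrative assumptions rather than details read from the code above:

import torch
import torch.nn.functional as F

def soft_cross_entropy(student_logits: torch.Tensor,
                       teacher_logits: torch.Tensor,
                       temperature: float = 4.0) -> torch.Tensor:
    """Soft-target loss: -sum(softmax(teacher/T) * log_softmax(student/T)), averaged over the batch."""
    log_p_student = F.log_softmax(student_logits / temperature, dim=-1)
    p_teacher = F.softmax(teacher_logits / temperature, dim=-1)
    return -(p_teacher * log_p_student).sum(dim=-1).mean()

# With multiple teachers, one simple choice is to average their logits before computing the loss:
# teacher_logits = torch.stack(list_of_teacher_logits, dim=0).mean(dim=0)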
Exemplo n.º 16
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--encoder_learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Bert encoder.")
    parser.add_argument(
        "--decoder_learning_rate",
        default=5e-3,
        type=float,
        help="The initial learning rate for generative decoder.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop",
                        default=1000,
                        type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--optimize_on_cpu',
        default=False,
        action='store_true',
        help=
        "Whether to perform optimization and keep the optimizer averages on CPU"
    )
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=128,
        help=
        'Loss scaling, positive power of 2 values can improve fp16 convergence.'
    )
    parser.add_argument('--reduce_dim',
                        default=0,
                        type=int,
                        help='reduce the dimension of the decoder.')
    parser.add_argument('--decoder_layer',
                        default=2,
                        type=int,
                        help='number of layers in the self-attentive decoder.')
    parser.add_argument('--load_trained_model',
                        default=False,
                        action='store_true',
                        help='load a previously trained model from the output directory.')
    parser.add_argument('--postfix',
                        default='',
                        help='suffix appended to the log, model, and prediction file names.')
    parser.add_argument('--share_input_output_embed',
                        default=False,
                        action='store_true',
                        help='share input and output embeddings of decoder.')
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info(
                "16-bits training currently not supported in distributed training"
            )
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}"
        .format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    bert_config.reduce_dim = args.reduce_dim
    bert_config.decoder_layer = args.decoder_layer
    bert_config.share_embedd = args.share_input_output_embed

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
    #     raise ValueError("Output directory () already exists and is not empty.")
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)
    idx_to_vocab = load_idx_to_token(args.vocab_file)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        print('Preprocess the train dataset.')
        train_examples = read_msmarco_examples(args,
                                               input_file=args.train_file,
                                               is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            p_max_length=P_MAX_LENGTH,
            q_max_length=Q_MAX_LENGTH,
            a_max_length=A_MAX_LENGTH)

        train_input_ids = torch.tensor([f.input_ids for f in train_features],
                                       dtype=torch.long)
        train_input_mask = torch.tensor([f.input_mask for f in train_features],
                                        dtype=torch.long)
        train_segment_ids = torch.tensor(
            [f.segment_ids for f in train_features], dtype=torch.long)
        train_answer_ids = torch.tensor([f.answer_ids for f in train_features],
                                        dtype=torch.long)
        train_answer_mask = torch.tensor(
            [f.answer_mask for f in train_features], dtype=torch.long)
        train_data = TensorDataset(train_input_ids, train_input_mask,
                                   train_segment_ids, train_answer_ids,
                                   train_answer_mask)

    print('Preprocess the dev dataset.')
    eval_examples = read_msmarco_examples(args,
                                          input_file=args.predict_file,
                                          is_training=False)
    eval_features = convert_examples_to_features(examples=eval_examples,
                                                 tokenizer=tokenizer,
                                                 p_max_length=P_MAX_LENGTH,
                                                 q_max_length=Q_MAX_LENGTH,
                                                 a_max_length=A_MAX_LENGTH)

    eval_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                  dtype=torch.long)
    eval_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                   dtype=torch.long)
    eval_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                    dtype=torch.long)
    eval_answer_ids = torch.tensor([f.answer_ids for f in eval_features],
                                   dtype=torch.long)
    eval_answer_mask = torch.tensor([f.answer_mask for f in eval_features],
                                    dtype=torch.long)
    # eval_example_id = torch.tensor([int(f.example_id) for f in eval_features], dtype=torch.long)
    eval_example_id = torch.tensor([int(0) for f in eval_features],
                                   dtype=torch.long)

    eval_data = TensorDataset(eval_input_ids, eval_input_mask,
                              eval_segment_ids, eval_answer_ids,
                              eval_answer_mask, eval_example_id)

    # Prepare model
    # model = BertForQuestionAnswering(bert_config)
    model = BertWithMultiPointer(bert_config)

    # print('state_dict is\n', [k for k in model.state_dict().keys()])
    # exit(0)
    if args.init_checkpoint is not None:
        print('Loading the pre-trained BERT parameters')
        state_dict = torch.load(args.init_checkpoint, map_location='cpu')
        model.bert.load_state_dict(state_dict)
        decoder_embedding = [
            "word_embeddings.weight", "position_embeddings.weight",
            "token_type_embeddings.weight", "LayerNorm.gamma", "LayerNorm.beta"
        ]
        decoder_embedding = ['embeddings.' + n for n in decoder_embedding]
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if k in decoder_embedding:
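                # strip the leading 'embeddings.' prefix (11 characters)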
                name = k[11:]
                new_state_dict[name] = v
        model.decoderEmbedding.load_state_dict(new_state_dict)

    if args.load_trained_model:
        print('Loading the pre-trained Model')
        model_path = args.output_dir + '/models/best_params.pt.' + args.postfix
        state_dict = torch.load(model_path)
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # remove module.
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
        # model = DataParallelModel(model)
        # criterion = DataParallelCriterion(torch.nn.NLLLoss(ignore_index=0))

    # Prepare optimizer
    if args.fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                            for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                            for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)
                       and ('bert' in n or 'decoderEmbedding' in n)],
            'weight_decay_rate': 0.01,
            'lr': args.encoder_learning_rate
        },
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)
                       and ('bert' not in n and 'decoderEmbedding' not in n)],
            'weight_decay_rate': 0.01,
            'lr': args.decoder_learning_rate
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)
                       and ('bert' in n or 'decoderEmbedding' in n)],
            'weight_decay_rate': 0.0,
            'lr': args.encoder_learning_rate
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)
                       and ('bert' not in n and 'decoderEmbedding' not in n)],
            'weight_decay_rate': 0.0,
            'lr': args.decoder_learning_rate
        },
    ]
    optimizer = BERTAdam(optimizer_grouped_parameters,
                         lr=args.encoder_learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        eval_loss, eval_ppl = eval_the_model(args, model, eval_data)
        best_ppl = eval_ppl
        log_path = os.path.join(args.output_dir, 'log.txt.' + args.postfix)
        with open(log_path, 'w') as f:
            f.write('Before train, the average loss on val set is: ' +
                    str(eval_loss.float()) +
                    ' and the average ppl on val set is: ' + str(eval_ppl))
            f.write('\n\n')

        model.train()

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            train_loss = 0
            train_batch = 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering it-self
                input_ids, input_mask, segment_ids, answer_ids, answer_mask = batch
                loss, _ = model(input_ids,
                                segment_ids,
                                input_mask,
                                answer_ids=answer_ids,
                                answer_mask=answer_mask)
                # pred = []
                # target = []
                # for item in model_ret:
                #     pred.append(item[0][0].log())
                #     target.append(item[0][1])
                # target = torch.cat(target)
                # loss = criterion(pred, target)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(
                            param_optimizer,
                            model.named_parameters(),
                            test_nan=True)
                        if is_nan:
                            logger.info(
                                "FP16 TRAINING: Nan in gradients, reducing loss scaling"
                            )
                            args.loss_scale = args.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(
                            model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1

                train_loss += loss.detach().cpu()
                train_batch += 1

            train_loss /= train_batch
            train_ppl = math.pow(math.e, train_loss)
            eval_loss, eval_ppl = eval_the_model(args, model, eval_data)
            with open(log_path, 'a') as f:
                f.write('In epoch-' + str(epoch) +
                        ' the average loss on train set is: ' +
                        str(train_loss.float()) +
                        ' the average ppl on train set is: ' + str(train_ppl))
                f.write('\n')
                f.write('In epoch-' + str(epoch) +
                        ' the average loss on eval set is: ' +
                        str(eval_loss.float()) +
                        ' the average ppl on eval set is: ' + str(eval_ppl))
                f.write('\n\n')
            if eval_ppl < best_ppl:
                best_ppl = eval_ppl
                model_save_path = os.path.join(
                    args.output_dir, 'models',
                    'best_params.pt.' + args.postfix)
                os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
                if os.path.exists(model_save_path):
                    os.remove(model_save_path)
                torch.save(model.state_dict(), model_save_path)

    if args.do_predict:

        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        logger.info("Start evaluating")
        to_save = []
        for input_ids, input_mask, segment_ids, answer_ids, answer_mask, example_id in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            answer_ids = answer_ids.to(device)

            with torch.no_grad():

                def decode_to_vocab(batch, isContext=False):
                    with torch.cuda.device_of(batch):
                        batch = batch.tolist()
                    batch = [[idx_to_vocab[ind] for ind in ex] for ex in batch]

                    def trim(s, t):
                        sentence = []
                        for w in s:
                            if w == t:
                                break
                            sentence.append(w)
                        return sentence

                    batch = [trim(ex, '[SEP]') for ex in batch]

                    def filter_special(tok):
                        return tok not in [
                            '[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]',
                            '[EOS]'
                        ]

                    batch = [list(filter(filter_special, ex)) for ex in batch]

                    return [' '.join(ex) for ex in batch]

                model_ret = model(input_ids,
                                  segment_ids,
                                  input_mask,
                                  answer_ids=answer_ids)
                outs = model_ret[1]
                if type(outs) == tuple:
                    outs, answer_scores = outs
                    outs = outs.data
                    answer_scores = answer_scores.tolist()
                ground_truths = decode_to_vocab(answer_ids)
                decode_answers = decode_to_vocab(outs)
                # decode_contexts = decode_to_vocab(input_ids, isContext=True)
                for answer, ground_truth, answer_score, ex_id in zip(
                        decode_answers, ground_truths, answer_scores,
                        example_id):
                    answer_length = len(answer.split(' '))
                    to_save.append('\t'.join([
                        ground_truth.replace(" ##", ""),
                        answer.replace(" ##", ""),
                        str(1. * answer_score / answer_length),
                        str(ex_id), '\n'
                    ]))

        output_answer_file = os.path.join(args.output_dir, 'predictions',
                                          'predictions.txt.' + args.postfix)
        os.makedirs(os.path.dirname(output_answer_file), exist_ok=True)
        with open(output_answer_file, "w") as f:
            f.writelines(to_save)
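
BERTAdam's `warmup` and `t_total` arguments imply a warmup-then-decay learning-rate multiplier applied on top of the base learning rate. A minimal sketch of the common linear variant (the exact shape depends on the BERTAdam implementation and its `schedule` argument):

def warmup_linear_sketch(progress, warmup=0.1):
    # progress = global_step / t_total, in [0, 1]
    if progress < warmup:
        return progress / warmup       # ramp linearly from 0 to 1 during warmup
    return max(0.0, 1.0 - progress)    # then decay linearly toward 0

# e.g. with warmup_proportion=0.1: multiplier 0.5 at 5% of training,
# roughly 1.0 at the end of warmup, 0.5 at 50%, and 0 at the end.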
Exemplo n.º 17
0
def _train_labeler(args):
  if args.data_setup == 'joint':
    train_gen_list, val_gen_list, crowd_dev_gen, elmo, bert, vocab = get_joint_datasets(args)
  else:
    train_fname = args.train_data
    dev_fname = args.dev_data
    print(train_fname, dev_fname)
    data_gens, elmo = get_datasets([(train_fname, 'train', args.goal),
                              (dev_fname, 'dev', args.goal)], args)
    train_gen_list = [(args.goal, data_gens[0])]
    val_gen_list = [(args.goal, data_gens[1])]
  train_log = SummaryWriter(os.path.join(constant.EXP_ROOT, args.model_id, "log", "train"))
  validation_log = SummaryWriter(os.path.join(constant.EXP_ROOT, args.model_id, "log", "validation"))
  tensorboard = TensorboardWriter(train_log, validation_log)

  if args.model_type == 'labeler':
    print('==> Labeler')
    model = denoising_models.Labeler(args, constant.ANSWER_NUM_DICT[args.goal])
  elif args.model_type == 'filter':
    print('==> Filter')
    model = denoising_models.Filter(args, constant.ANSWER_NUM_DICT[args.goal])
  else:
    print('Invalid model type: -model_type ' + args.model_type)
    raise NotImplementedError

  model.cuda()
  total_loss = 0
  batch_num = 0
  best_macro_f1 = 0.
  start_time = time.time()
  init_time = time.time()

  if args.bert:
    if args.bert_param_path:
      print('==> Loading BERT from ' + args.bert_param_path)
      model.bert.load_state_dict(torch.load(args.bert_param_path, map_location='cpu'))
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
        ]
    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.bert_learning_rate,
                         warmup=args.bert_warmup_proportion,
                         t_total=-1) # TODO: 
  else:
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
  #optimizer = optim.SGD(model.parameters(), lr=1., momentum=0.)

  if args.load:
    load_model(args.reload_model_name, constant.EXP_ROOT, args.model_id, model, optimizer)

  for idx, m in enumerate(model.modules()):
    logging.info(str(idx) + '->' + str(m))

  while True:
    batch_num += 1  # one step here processes a mini-batch from each training signal
    for (type_name, data_gen) in train_gen_list:
      try:
        batch = next(data_gen)
        batch, _ = to_torch(batch)
      except StopIteration:
        logging.info(type_name + " finished at " + str(batch_num))
        print('Done!')
        torch.save({'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()},
                   '{0:s}/{1:s}.pt'.format(constant.EXP_ROOT, args.model_id))
        return
      optimizer.zero_grad()
      loss, output_logits, cls_logits = model(batch, type_name)
      loss.backward()
      total_loss += loss.item()
      optimizer.step()

      if batch_num % args.log_period == 0 and batch_num > 0:
        gc.collect()
        cur_loss = float(1.0 * loss.clone().item())
        elapsed = time.time() - start_time
        train_loss_str = ('|loss {0:3f} | at {1:d}step | @ {2:.2f} ms/batch'.format(cur_loss, batch_num,
                                                                                    elapsed * 1000 / args.log_period))
        start_time = time.time()
        print(train_loss_str)
        logging.info(train_loss_str)
        tensorboard.add_train_scalar('train_loss_' + type_name, cur_loss, batch_num)

      if batch_num % args.eval_period == 0 and batch_num > 0:
        output_index = get_output_index(output_logits, threshold=args.threshold)
        gold_pred_train = get_gold_pred_str(output_index, batch['y'].data.cpu().clone(), args.goal)
        print(gold_pred_train[:10])
        accuracy = sum([set(y) == set(yp) for y, yp in gold_pred_train]) * 1.0 / len(gold_pred_train)

        train_acc_str = '{1:s} Train accuracy: {0:.1f}%'.format(accuracy * 100, type_name)
        if cls_logits is not None:
          cls_accuracy =  sum([(1. if pred > 0. else 0.) == gold for pred, gold in zip(cls_logits, batch['y_cls'].data.cpu().numpy())])  / float(cls_logits.size()[0])
          cls_tp = sum([(1. if pred > 0. else 0.) == 1. and gold == 1. for pred, gold in zip(cls_logits, batch['y_cls'].data.cpu().numpy())])
          cls_precision = cls_tp  / float(sum([1. if pred > 0. else 0. for pred in cls_logits])) 
          cls_recall = cls_tp  / float(sum(batch['y_cls'].data.cpu().numpy()))
          cls_f1 = f1(cls_precision, cls_recall)
          train_cls_acc_str = '{1:s} Train cls accuracy: {0:.2f}%  P: {2:.3f}  R: {3:.3f}  F1: {4:.3f}'.format(cls_accuracy * 100, type_name, cls_precision, cls_recall, cls_f1)
        print(train_acc_str)
        if cls_logits is not None:
          print(train_cls_acc_str)
        logging.info(train_acc_str)
        tensorboard.add_train_scalar('train_acc_' + type_name, accuracy, batch_num)
        if args.goal != 'onto':
          for (val_type_name, val_data_gen) in val_gen_list:
            if val_type_name == type_name:
              eval_batch, _ = to_torch(next(val_data_gen))
              evaluate_batch(batch_num, eval_batch, model, tensorboard, val_type_name, args, args.goal)

    if batch_num % args.eval_period == 0 and batch_num > 0 and args.data_setup == 'joint':
      # Evaluate Loss on the Turk Dev dataset.
      print('---- eval at step {0:d} ---'.format(batch_num))
      crowd_eval_loss, macro_f1 = evaluate_data(batch_num, 'crowd/dev_tree.json', model,
                                                tensorboard, "open", args, elmo, bert, vocab=vocab)

      if best_macro_f1 < macro_f1:
        best_macro_f1 = macro_f1
        save_fname = '{0:s}/{1:s}_best.pt'.format(constant.EXP_ROOT, args.model_id)
        torch.save({'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}, save_fname)
        print(
          'Total {0:.2f} minutes have passed, saving at {1:s} '.format((time.time() - init_time) / 60, save_fname))

    if batch_num % args.eval_period == 0 and batch_num > 0 and args.goal == 'onto':
      # Evaluate Loss on the Turk Dev dataset.
      print('---- OntoNotes: eval at step {0:d} ---'.format(batch_num))
      crowd_eval_loss, macro_f1 = evaluate_data(batch_num, args.dev_data, model, tensorboard,
                                                args.goal, args, elmo)

    if batch_num % args.save_period == 0 and batch_num > 0:
      save_fname = '{0:s}/{1:s}_{2:d}.pt'.format(constant.EXP_ROOT, args.model_id, batch_num)
      torch.save({'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}, save_fname)
      print(
        'Total {0:.2f} minutes have passed, saving at {1:s} '.format((time.time() - init_time) / 60, save_fname))
  # Training finished! 
  torch.save({'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()},
             '{0:s}/{1:s}.pt'.format(constant.EXP_ROOT, args.model_id))
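
The `f1(cls_precision, cls_recall)` call above relies on a helper that is not shown; it is presumably the harmonic mean of precision and recall. A minimal sketch that also guards against a zero denominator:

def f1_sketch(precision, recall):
    # Harmonic mean of precision and recall; defined as 0 when both are 0.
    if precision + recall == 0:
        return 0.0
    return 2.0 * precision * recall / (precision + recall)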
Exemplo n.º 18
0
class Instructor:
    def __init__(self, args):
        self.opt = args
        self.writer = SummaryWriter(log_dir=self.opt.output_dir)  # tensorboard
        bert_config = BertConfig.from_json_file(args.bert_config_file)

        if args.max_seq_length > bert_config.max_position_embeddings:
            raise ValueError(
                "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
                .format(args.max_seq_length,
                        bert_config.max_position_embeddings))

        # if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        #     raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
        # os.makedirs(args.output_dir, exist_ok=True)

        self.dataset = ReadData(self.opt)  # Read the data and preprocess it
        self.num_train_steps = None
        self.num_train_steps = int(
            len(self.dataset.train_examples) / self.opt.train_batch_size /
            self.opt.gradient_accumulation_steps * self.opt.num_train_epochs)
        self.opt.label_size = len(self.dataset.label_list)
        args.output_dim = len(self.dataset.label_list)
        print("label size: {}".format(args.output_dim))

        # Initialize the model
        print("initialize model ...")
        if args.model_class == BertForSequenceClassification:
            self.model = BertForSequenceClassification(
                bert_config, len(self.dataset.label_list))
        else:
            self.model = model_classes[args.model_name](bert_config, args)

        if args.init_checkpoint is not None:
            self.model.bert.load_state_dict(
                torch.load(args.init_checkpoint, map_location='cpu'))
        if args.fp16:
            self.model.half()
        # Freeze parameters
        # for name, p in self.model.named_parameters():
        #     if name.startswith('bert.encoder.layer.11') or name.startswith('bert.encoder.layer.10') or name.startswith('bert.encoder.layer.9') or name.startswith('bert.encoder.layer.8'):  # freeze the last layers
        #         p.requires_grad = False
        # Count the number of model parameters
        n_trainable_params, n_nontrainable_params = 0, 0
        for p in self.model.parameters():
            n_params = torch.prod(torch.tensor(
                p.shape))  # torch.prod() multiplies all elements of the shape
            if p.requires_grad:  # whether this parameter requires gradients
                n_trainable_params += n_params
            else:
                n_nontrainable_params += n_params
        print('n_trainable_params: {0}, n_nontrainable_params: {1}'.format(
            n_trainable_params, n_nontrainable_params))
        self.model.to(args.device)

        # Parallelize across multiple GPUs
        if self.opt.n_gpu > 1:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.opt.gpu_id)

        # Prepare optimizer
        if args.fp16:
            self.param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                                    for n, param in self.model.named_parameters()]
        elif args.optimize_on_cpu:
            self.param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                                    for n, param in self.model.named_parameters()]
        else:
            self.param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in self.param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.01
        }, {
            'params': [p for n, p in self.param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.0
        }]
        self.optimizer = BERTAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  warmup=args.warmup_proportion,
                                  t_total=self.num_train_steps)
        # Configure a separate optimizer for the non-BERT model parameters
        # [p for pname, p in self.param_optimizer if not pname.startswith('module.bert')]
        # self.optimizer_me = torch.optim.Adam(
        #     [{'params': [p for pname, p in self.param_optimizer if not pname.startswith('module.bert')]}], lr=0.001,
        #     weight_decay=0)
        # self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer_me, mode='max',
        #                                           patience=3)  # automatically lower the LR when the monitored metric stops improving for 3 epochs

        self.global_step = 0  # initialize the global step counter
        self.max_test_acc = 0
        self.max_test_f1 = 0

    def do_train(self):  # train the model
        # for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        for i_epoch in range(int(args.num_train_epochs)):
            print('>' * 100)
            print('epoch: ', i_epoch)
            tr_loss = 0
            train_accuracy = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            y_pred = []
            y_true = []
            self.model.train()  # put the model back into training mode; evaluation at the end of each epoch switches it to eval mode
            for step, batch in enumerate(
                    tqdm(self.dataset.train_dataloader, desc="Training")):
                # batch = tuple(t.to(self.opt.device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, \
                input_t_ids, input_t_mask, segment_t_ids, \
                input_without_t_ids, input_without_t_mask, segment_without_t_ids, \
                input_left_t_ids, input_left_t_mask, segment_left_t_ids, \
                input_right_t_ids, input_right_t_mask, segment_right_t_ids, \
                input_left_ids, input_left_mask, segment_left_ids = batch

                input_ids = input_ids.to(self.opt.device)
                segment_ids = segment_ids.to(self.opt.device)
                input_mask = input_mask.to(self.opt.device)
                label_ids = label_ids.to(self.opt.device)

                if self.opt.model_class in [
                        BertForSequenceClassification, CNN
                ]:
                    loss, logits = self.model(input_ids, segment_ids,
                                              input_mask, label_ids)
                else:
                    input_t_ids = input_t_ids.to(self.opt.device)
                    input_t_mask = input_t_mask.to(self.opt.device)
                    segment_t_ids = segment_t_ids.to(self.opt.device)
                    if self.opt.model_class == MemNet:
                        input_without_t_ids = input_without_t_ids.to(
                            self.opt.device)
                        input_without_t_mask = input_without_t_mask.to(
                            self.opt.device)
                        segment_without_t_ids = segment_without_t_ids.to(
                            self.opt.device)
                        loss, logits = self.model(input_without_t_ids,
                                                  segment_without_t_ids,
                                                  input_without_t_mask,
                                                  label_ids, input_t_ids,
                                                  input_t_mask, segment_t_ids)
                    elif self.opt.model_class in [Cabasc]:
                        input_left_t_ids = input_left_t_ids.to(self.opt.device)
                        input_left_t_mask = input_left_t_mask.to(
                            self.opt.device)
                        segment_left_t_ids = segment_left_t_ids.to(
                            self.opt.device)
                        input_right_t_ids = input_right_t_ids.to(
                            self.opt.device)
                        input_right_t_mask = input_right_t_mask.to(
                            self.opt.device)
                        segment_right_t_ids = segment_right_t_ids.to(
                            self.opt.device)
                        loss, logits = self.model(
                            input_ids, segment_ids, input_mask, label_ids,
                            input_t_ids, input_t_mask, segment_t_ids,
                            input_left_t_ids, input_left_t_mask,
                            segment_left_t_ids, input_right_t_ids,
                            input_right_t_mask, segment_right_t_ids)
                    elif self.opt.model_class in [
                            RAM, TNet_LF, MGAN, TT, MLP, TD_BERT, TD_BERT_QA,
                            DTD_BERT
                    ]:
                        input_left_ids = input_left_ids.to(self.opt.device)
                        input_left_mask = input_left_mask.to(self.opt.device)
                        segment_left_ids = segment_left_ids.to(self.opt.device)
                        loss, logits = self.model(
                            input_ids, segment_ids, input_mask, label_ids,
                            input_t_ids, input_t_mask, segment_t_ids,
                            input_left_ids, input_left_mask, segment_left_ids)
                    else:
                        loss, logits = self.model(input_ids, segment_ids,
                                                  input_mask, label_ids,
                                                  input_t_ids, input_t_mask,
                                                  segment_t_ids)

                if self.opt.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                loss.backward()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # compute accuracy
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                tmp_train_accuracy = accuracy(logits, label_ids)
                y_pred.extend(np.argmax(logits, axis=1))
                y_true.extend(label_ids)
                train_accuracy += tmp_train_accuracy
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in self.model.parameters():
                                param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(
                            self.param_optimizer,
                            self.model.named_parameters(),
                            test_nan=True)
                        if is_nan:
                            logger.info(
                                "FP16 TRAINING: Nan in gradients, reducing loss scaling"
                            )
                            args.loss_scale = args.loss_scale / 2
                            self.model.zero_grad()
                            continue
                        self.optimizer.step()
                        # self.optimizer_me.step()
                        copy_optimizer_params_to_model(
                            self.model.named_parameters(),
                            self.param_optimizer)
                    else:
                        self.optimizer.step()
                        # self.optimizer_me.step()
                    self.model.zero_grad()
                    self.global_step += 1
            train_accuracy = train_accuracy / nb_tr_examples
            train_f1 = f1_score(y_true,
                                y_pred,
                                average='macro',
                                labels=np.unique(y_true))
            result = self.do_eval()  # evaluate once at the end of every epoch
            tr_loss = tr_loss / nb_tr_steps
            # self.scheduler.step(result['eval_accuracy'])  # monitor validation accuracy
            self.writer.add_scalar('train_loss', tr_loss, i_epoch)
            self.writer.add_scalar('train_accuracy', train_accuracy, i_epoch)
            self.writer.add_scalar('eval_accuracy', result['eval_accuracy'],
                                   i_epoch)
            self.writer.add_scalar('eval_loss', result['eval_loss'], i_epoch)
            # self.writer.add_scalar('lr', self.optimizer_me.param_groups[0]['lr'], i_epoch)
            print(
                "Results: train_acc: {0:.6f} | train_f1: {1:.6f} | train_loss: {2:.6f} | eval_accuracy: {3:.6f} | eval_loss: {4:.6f} | eval_f1: {5:.6f} | max_test_acc: {6:.6f} | max_test_f1: {7:.6f}"
                .format(train_accuracy, train_f1, tr_loss,
                        result['eval_accuracy'], result['eval_loss'],
                        result['eval_f1'], self.max_test_acc,
                        self.max_test_f1))

    def do_eval(self):  # evaluate accuracy on the dev set
        self.model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        # confidence = []
        y_pred = []
        y_true = []
        for batch in tqdm(self.dataset.eval_dataloader, desc="Evaluating"):
            # batch = tuple(t.to(self.opt.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, \
            input_t_ids, input_t_mask, segment_t_ids, \
            input_without_t_ids, input_without_t_mask, segment_without_t_ids, \
            input_left_t_ids, input_left_t_mask, segment_left_t_ids, \
            input_right_t_ids, input_right_t_mask, segment_right_t_ids, \
            input_left_ids, input_left_mask, segment_left_ids = batch

            input_ids = input_ids.to(self.opt.device)
            segment_ids = segment_ids.to(self.opt.device)
            input_mask = input_mask.to(self.opt.device)
            label_ids = label_ids.to(self.opt.device)

            with torch.no_grad():  # no gradient computation
                if self.opt.model_class in [
                        BertForSequenceClassification, CNN
                ]:
                    loss, logits = self.model(input_ids, segment_ids,
                                              input_mask, label_ids)
                else:
                    input_t_ids = input_t_ids.to(self.opt.device)
                    input_t_mask = input_t_mask.to(self.opt.device)
                    segment_t_ids = segment_t_ids.to(self.opt.device)
                    if self.opt.model_class == MemNet:
                        input_without_t_ids = input_without_t_ids.to(
                            self.opt.device)
                        input_without_t_mask = input_without_t_mask.to(
                            self.opt.device)
                        segment_without_t_ids = segment_without_t_ids.to(
                            self.opt.device)
                        loss, logits = self.model(input_without_t_ids,
                                                  segment_without_t_ids,
                                                  input_without_t_mask,
                                                  label_ids, input_t_ids,
                                                  input_t_mask, segment_t_ids)
                    elif self.opt.model_class in [Cabasc]:
                        input_left_t_ids = input_left_t_ids.to(self.opt.device)
                        input_left_t_mask = input_left_t_mask.to(
                            self.opt.device)
                        segment_left_t_ids = segment_left_t_ids.to(
                            self.opt.device)
                        input_right_t_ids = input_right_t_ids.to(
                            self.opt.device)
                        input_right_t_mask = input_right_t_mask.to(
                            self.opt.device)
                        segment_right_t_ids = segment_right_t_ids.to(
                            self.opt.device)
                        loss, logits = self.model(
                            input_ids, segment_ids, input_mask, label_ids,
                            input_t_ids, input_t_mask, segment_t_ids,
                            input_left_t_ids, input_left_t_mask,
                            segment_left_t_ids, input_right_t_ids,
                            input_right_t_mask, segment_right_t_ids)
                    elif self.opt.model_class in [
                            RAM, TNet_LF, MGAN, TT, MLP, TD_BERT, TD_BERT_QA,
                            DTD_BERT
                    ]:
                        input_left_ids = input_left_ids.to(self.opt.device)
                        input_left_mask = input_left_mask.to(self.opt.device)
                        segment_left_ids = segment_left_ids.to(self.opt.device)
                        loss, logits = self.model(
                            input_ids, segment_ids, input_mask, label_ids,
                            input_t_ids, input_t_mask, segment_t_ids,
                            input_left_ids, input_left_mask, segment_left_ids)
                    else:
                        loss, logits = self.model(input_ids, segment_ids,
                                                  input_mask, label_ids,
                                                  input_t_ids, input_t_mask,
                                                  segment_t_ids)

            # with torch.no_grad():  # no gradient computation
            #     if self.opt.model_class in [BertForSequenceClassification, CNN]:
            #         loss, logits = self.model(input_ids, segment_ids, input_mask, label_ids)
            #     else:
            #         loss, logits = self.model(input_ids, segment_ids, input_mask, labels=label_ids,
            #                                   input_t_ids=input_t_ids,
            #                                   input_t_mask=input_t_mask, segment_t_ids=segment_t_ids)
            # confidence.extend(torch.nn.Softmax(dim=1)(logits)[:, 1].tolist())  # confidence of the positive class
            # loss = F.cross_entropy(logits, label_ids, size_average=False)  # total loss over the mini-batch
            if self.opt.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.fp16 and args.loss_scale != 1.0:
                # rescale loss for fp16 training
                # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                loss = loss * args.loss_scale
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)
            y_pred.extend(np.argmax(logits, axis=1))
            y_true.extend(label_ids)

            # eval_loss += tmp_eval_loss.mean().item()
            eval_loss += loss.item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        # eval_loss = eval_loss / len(self.dataset.eval_examples)
        test_f1 = f1_score(y_true,
                           y_pred,
                           average='macro',
                           labels=np.unique(y_true))
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        if eval_accuracy > self.max_test_acc:
            self.max_test_acc = eval_accuracy
            if self.opt.do_predict:  # only save the model when prediction mode is enabled
                torch.save(self.model, self.opt.model_save_path)
        if test_f1 > self.max_test_f1:
            self.max_test_f1 = test_f1

        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'eval_f1': test_f1,
        }

        # output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        # with open(output_eval_file, "w") as writer:
        #     logger.info("***** Eval results *****")
        #     for key in sorted(result.keys()):
        #         logger.info("  %s = %s", key, str(result[key]))
        #         writer.write("%s = %s\n" % (key, str(result[key])))
        # print("Eval results ==> eval_accuracy: {0}, eval_loss: {1}, max_test_acc: {2}".format(
        #     result['eval_accuracy'], result['eval_loss'], self.max_test_acc))
        return result

    def do_predict(self):
        # Load the saved model, run prediction, and report accuracy
        # Read the test-set data
        # dataset = ReadData(self.opt)  # somewhat redundant: it reads all the data, including the training set
        # Load model
        saved_model = torch.load(self.opt.model_save_path)
        saved_model.to(self.opt.device)
        saved_model.eval()
        nb_test_examples = 0
        test_accuracy = 0
        for batch in tqdm(self.dataset.eval_dataloader, desc="Testing"):
            batch = tuple(t.to(self.opt.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, input_t_ids, input_t_mask, segment_t_ids = batch

            with torch.no_grad():  # Do not calculate gradient
                if self.opt.model_class in [
                        BertForSequenceClassification, CNN
                ]:
                    _, logits = saved_model(input_ids, segment_ids, input_mask,
                                            label_ids)
                else:
                    _, logits = saved_model(input_ids,
                                            segment_ids,
                                            input_mask,
                                            labels=label_ids,
                                            input_t_ids=input_t_ids,
                                            input_t_mask=input_t_mask,
                                            segment_t_ids=segment_t_ids)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_test_accuracy = accuracy(logits, label_ids)
            test_accuracy += tmp_test_accuracy
            nb_test_examples += input_ids.size(0)
        test_accuracy = test_accuracy / nb_test_examples
        return test_accuracy

    def run(self):
        print('> training arguments:')
        for arg in vars(self.opt):
            print('>>> {0}: {1}'.format(arg, getattr(self.opt, arg)))

        self.do_train()
        print('>' * 100)
        if self.opt.do_predict:
            test_accuracy = self.do_predict()
            print("Test Set Accuracy: {}".format(test_accuracy))
        print("Max validate Set Acc: {0}".format(
            self.max_test_acc))  # Output the final test accuracy
        self.writer.close()
        # return self.max_test_acc
        return self.max_test_f1
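
The `accuracy(logits, label_ids)` helper used in this example is not shown; from its usage (summed per batch, then divided by the number of examples) it appears to return the count of correct argmax predictions. A minimal sketch under that assumption:

import numpy as np

def accuracy_sketch(logits, label_ids):
    # Count how many argmax predictions match the gold labels in this batch.
    preds = np.argmax(logits, axis=1)
    return int((preds == label_ids).sum())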
Exemplo n.º 19
0
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)

        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             max_grad_norm=1.0,
                             t_total=num_train_steps,
                             schedule=args.schedule,
                             weight_decay_rate=args.weight_decay_rate)

    train_pos_dataloader = DataLoader(pos_features,
                                      batch_size=args.train_batch_size // 2,
                                      collate_fn=data_gen, shuffle=True)
    neg_features = neg_features[:len(pos_features) * (len(neg_features) // len(pos_features))]
    train_neg_dataloader = DataLoader(neg_features,
                                      batch_size=int(
                                          args.train_batch_size // 2 * (len(neg_features) // len(pos_features))),
                                      collate_fn=data_gen, shuffle=True)
    assert len(train_pos_dataloader) == len(train_neg_dataloader)

    print('start training...')
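
Trimming `neg_features` so the two dataloaders have the same number of batches suggests they are iterated in lockstep, giving each optimization step a fixed ratio of positive to negative examples. A minimal sketch of such a loop, with the model/optimizer passed in explicitly; how the two batches are merged and fed to the model depends on `data_gen` and the model, so the body is illustrative only:

def train_epoch_sketch(model, optimizer, train_pos_dataloader, train_neg_dataloader,
                       gradient_accumulation_steps=1):
    # Illustrative only: iterate the positive and negative loaders in lockstep so
    # every optimization step sees a fixed ratio of positive to negative examples.
    for step, (pos_batch, neg_batch) in enumerate(zip(train_pos_dataloader,
                                                      train_neg_dataloader)):
        # the merge below is a placeholder, not the original forward pass
        loss = model(pos_batch) + model(neg_batch)
        loss.backward()
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()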
Exemplo n.º 20
0
def main():
    #parse arguments
    config.parse()
    args = config.args
    for k, v in vars(args).items():
        logger.info(f"{k}:{v}")
    #set seeds
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)

    #arguments check
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
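    # the per-step (forward) batch is the requested train batch divided by the
    # number of gradient-accumulation steps, keeping the effective batch size fixed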
    forward_batch_size = int(args.train_batch_size /
                             args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size

    #load bert config
    bert_config_S = BertConfig.from_json_file(args.bert_config_file_S)
    assert args.max_seq_length <= bert_config_S.max_position_embeddings

    #read data
    train_examples = None
    train_features = None
    eval_examples = None
    eval_features = None
    num_train_steps = None

    tokenizer = ChineseFullTokenizer(vocab_file=args.vocab_file,
                                     do_lower_case=args.do_lower_case)
    convert_fn = partial(convert_examples_to_features,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length,
                         doc_stride=args.doc_stride,
                         max_query_length=args.max_query_length)
    if args.do_train:
        train_examples, train_features = read_and_convert(
            args.train_file,
            is_training=True,
            do_lower_case=args.do_lower_case,
            read_fn=read_squad_examples,
            convert_fn=convert_fn)
        if args.fake_file_1:
            fake_examples1, fake_features1 = read_and_convert(
                args.fake_file_1,
                is_training=True,
                do_lower_case=args.do_lower_case,
                read_fn=read_squad_examples,
                convert_fn=convert_fn)
            train_examples += fake_examples1
            train_features += fake_features1
        if args.fake_file_2:
            fake_examples2, fake_features2 = read_and_convert(
                args.fake_file_2,
                is_training=True,
                do_lower_case=args.do_lower_case,
                read_fn=read_squad_examples,
                convert_fn=convert_fn)
            train_examples += fake_examples2
            train_features += fake_features2

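        # Total number of optimizer steps, passed as t_total to BERTAdam's learning-rate schedule below.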
        num_train_steps = int(len(train_features) /
                              args.train_batch_size) * args.num_train_epochs

    if args.do_predict:
        eval_examples, eval_features = read_and_convert(
            args.predict_file,
            is_training=False,
            do_lower_case=args.do_lower_case,
            read_fn=read_squad_examples,
            convert_fn=convert_fn)

    #Build Model and load checkpoint
    model_S = BertForQASimple(bert_config_S, args)
    #Load student
    if args.load_model_type == 'bert':
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
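        # Keep only the encoder weights, stripping the leading 'bert.' prefix (5 characters)
        # so the keys match model_S.bert's state dict.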
        state_weight = {
            k[5:]: v
            for k, v in state_dict_S.items() if k.startswith('bert.')
        }
        missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                       strict=False)
        assert len(missing_keys) == 0
    elif args.load_model_type == 'all':
        assert args.tuned_checkpoint_S is not None
        state_dict_S = torch.load(args.tuned_checkpoint_S, map_location='cpu')
        model_S.load_state_dict(state_dict_S)
    else:
        logger.info("Model is randomly initialized.")
    model_S.to(device)

    if args.local_rank != -1 or n_gpu > 1:
        if args.local_rank != -1:
            raise NotImplementedError
        elif n_gpu > 1:
            model_S = torch.nn.DataParallel(model_S)  #,output_device=n_gpu-1)

    if args.do_train:
        #parameters
        params = list(model_S.named_parameters())
        all_trainable_params = divide_parameters(params, lr=args.learning_rate)
        logger.info("Length of all_trainable_params: %d",
                    len(all_trainable_params))

        optimizer = BERTAdam(all_trainable_params,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps,
                             schedule=args.schedule,
                             s_opt1=args.s_opt1,
                             s_opt2=args.s_opt2,
                             s_opt3=args.s_opt3)

        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Forward batch size = %d", forward_batch_size)
        logger.info("  Num backward steps = %d", num_train_steps)

        ########### DISTILLATION ###########
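        # BasicTrainer runs a plain training loop; the adaptor tells it how to read the
        # loss (and other quantities) out of the model's outputs for each batch.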
        train_config = TrainingConfig(
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            ckpt_frequency=args.ckpt_frequency,
            log_dir=args.output_dir,
            output_dir=args.output_dir,
            device=args.device)

        distiller = BasicTrainer(train_config=train_config,
                                 model=model_S,
                                 adaptor=BertForQASimpleAdaptorTraining)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_doc_mask = torch.tensor([f.doc_mask for f in train_features],
                                    dtype=torch.float)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)

        train_dataset = TensorDataset(all_input_ids, all_segment_ids,
                                      all_input_mask, all_doc_mask,
                                      all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            raise NotImplementedError
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.forward_batch_size,
                                      drop_last=True)
        callback_func = partial(predict,
                                eval_examples=eval_examples,
                                eval_features=eval_features,
                                args=args)
        with distiller:
            distiller.train(optimizer,
                            scheduler=None,
                            dataloader=train_dataloader,
                            num_epochs=args.num_train_epochs,
                            callback=callback_func)

    if not args.do_train and args.do_predict:
        res = predict(model_S, eval_examples, eval_features, step=0, args=args)
        print(res)
Exemplo n.º 21
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_config_file", default=None, type=str, required=True,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=None, type=str, required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--init_checkpoint", default=None, type=str,
                        help="Init from checkpoint")
    parser.add_argument("--init_full_model", default=None, type=str,
                        help="Initial full model")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. Should be True for uncased "
                             "models and False for cased models.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.0, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
                             "of training.")
    parser.add_argument("--save_checkpoints_steps", default=2000, type=int,
                        help="How often to save the model checkpoint")
    parser.add_argument("--iterations_per_loop", default=1000, type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument("--n_best_size", default=3, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=100, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--use_history", default=False, action='store_true',
                        help="Use history features")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--optimize_on_cpu',
                        default=False,
                        action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=128,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
    parser.add_argument('--dry_run',
                        action='store_true', default=False,
                        help='Don\'t load model, just load data')

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Warning: output directory () already exists and is not empty.")
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = bdu.read_coqa_examples(
            input_file=args.train_file, is_training=True)
        real_train_example_len = sum(len(ex['questions']) for ex in train_examples)
        num_train_steps = int(
            real_train_example_len / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    if not args.dry_run:
        if args.init_full_model is not None:
            model_state_dict = torch.load(args.init_full_model)
            model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict)
        else:
            model = BertForQuestionAnswering(bert_config, use_history=args.use_history)
            if args.init_checkpoint is not None:
                model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                              output_device=args.local_rank)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Prepare optimizer
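        # For fp16 / optimize_on_cpu, keep an fp32 master copy of the parameters on CPU;
        # gradients are copied into it before each optimizer step and the updated values
        # are copied back to the model afterwards.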
        if args.fp16:
            param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                                for n, param in model.named_parameters()]
        elif args.optimize_on_cpu:
            param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                                for n, param in model.named_parameters()]
        else:
            param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
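        # Exclude biases and LayerNorm parameters (gamma/beta) from weight decay.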
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
            ]
        optimizer = BERTAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        train_features = bdu.convert_coqa_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", real_train_example_len)
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        all_f_history = torch.tensor([f.f_history for f in train_features], dtype=torch.uint8)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions, all_f_history)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"):
            running_loss = []
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-GPU DataParallel does the scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions, f_history = batch
                # Convert to float here.
                f_history_32 = f_history.float()
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions=start_positions, end_positions=end_positions,
                             f_history=f_history_32,
                             debug=True)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                running_loss.append(loss.item())
                if step % 40 == 0:
                    logger.info("epoch {} step {}: avg loss {}".format(epoch_i, step, sum(running_loss) / len(running_loss)))
                    running_loss = []
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            args.loss_scale = args.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1
                if global_step % (args.save_checkpoints_steps // args.train_batch_size) == 0:
                    model_name = os.path.join(args.output_dir, 'model-{}.pth'.format(global_step))
                    # Save a trained model
                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                    print("Step {}: saving model to {}".format(global_step, model_name))
                    torch.save(model_to_save.state_dict(), model_name)

        model_name = os.path.join(args.output_dir, 'model-{}.pth'.format(global_step))
        print("Step {}: saving model to {}".format(global_step, model_name))
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        torch.save(model_to_save.state_dict(), model_name)

    if args.do_predict:
        eval_examples = bdu.read_coqa_examples(
            input_file=args.predict_file, is_training=False)
        eval_features = bdu.convert_coqa_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        all_f_history = torch.tensor([f.f_history for f in eval_features], dtype=torch.uint8)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index, all_f_history)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices, f_history in tqdm(eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            f_history = f_history.to(device)
            with torch.no_grad():
                f_history_32 = f_history.float()
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask, f_history=f_history_32)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             coqa_id=eval_feature.coqa_id,
                                             turn_id=eval_feature.turn_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        bdu.write_predictions(eval_examples, eval_features, all_results,
                              args.n_best_size, args.max_answer_length,
                              args.do_lower_case, output_prediction_file,
                              output_nbest_file, args.verbose_logging)
Exemplo n.º 22
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_config_file", default=None,  type=str,   required=True,
                        help="The config json file corresponding to the pre-trained BERT model. \n"
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=None,  type=str,   required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default=None,  type=str,   required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--init_checkpoint", default=None,  type=str,   
                                help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case", default=False,  action='store_true',
                                help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length", default=128,   type=int,
                                help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=False,   action='store_true',
                                help="Whether to run training.")
    parser.add_argument("--do_eval", default=False,   action='store_true',
                                help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32,   type=int,
                                help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8,   type=int,
                                help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5,   type=float,
                                help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0,   type=float,
                                help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1,   type=float,
                                help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps", default=1000,   type=int,
                                help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda", default=False,   action='store_true',
                                help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                            help="local_rank for distributed training on gpus")
    parser.add_argument("--gpu_ids", type=str, default="0",
                            help="select one gpu to use")
    parser.add_argument('--seed',  type=int,  default=42,
                            help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                            help="Number of updates steps to accumualte before performing a backward/update pass.")                       
    parser.add_argument('--optimize_on_cpu', default=False,action='store_true',
                            help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16', default=False, action='store_true',
                            help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=128,
                            help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
    ################## atec 
    parser.add_argument('--do_submit', default=False, action='store_true',  help="submit results to the atec cloud")
    parser.add_argument('--train_devset', default=False, action='store_true',  help="")
    parser.add_argument("--test_in_file", default=None, type=str,  help="")
    parser.add_argument("--test_out_file", default=None, type=str,  help="")

    args = parser.parse_args()

    if args.local_rank == -1 and not args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = len(args.gpu_ids.split(','))  # use the GPUs selected via --gpu_ids rather than torch.cuda.device_count()
    elif args.no_cuda:
        device = torch.device('cpu')
        n_gpu = 0
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    #if not args.do_train and not args.do_eval:
    #    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
            args.max_seq_length, bert_config.max_position_embeddings))

    #if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
    #    raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir) #, exist_ok=True)


    processor = AtecProcessor()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        if args.train_devset:
            eval_examples = processor.get_dev_examples(args.data_dir)
            train_examples += eval_examples

        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    logger.info('build model')
    model = BertForSequenceClassification(bert_config, len(label_list))
    #model = BertSiameseModel(bert_config, len(label_list))
    if args.init_checkpoint is not None:
        try:
            # just model.bert
            model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
        except RuntimeError as e:
            # all model
            import re
            new_state_dict = collections.OrderedDict()
            state_dict = torch.load(args.init_checkpoint, map_location='cpu')
            for key in state_dict.keys():
                new_key = re.sub("module\.", "", key)
                new_state_dict[new_key] = state_dict[key]
            model.load_state_dict(new_state_dict)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=[int(x) for x in
            args.gpu_ids.split(',')])

    global_step = 0
    tr_loss=None
    if args.do_train:
        # Prepare optimizer
        if args.fp16:
            param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                                for n, param in model.named_parameters()]
        elif args.optimize_on_cpu:
            param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                                for n, param in model.named_parameters()]
        else:
            param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
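        # Parameters whose names contain these substrings (biases, LayerNorm gamma/beta) get no weight decay.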
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
            ]
        optimizer = BERTAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

        from tqdm import tqdm, trange
        from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
        from torch.utils.data.distributed import DistributedSampler

        train_features = convert_examples_to_features(
           train_examples, label_list, args.max_seq_length, tokenizer)
        # train_features = convert_examples_to_siamese_features(
        #    train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # all_tokens_a = torch.tensor([f.tokens_a for f in train_features], dtype=torch.long)
        # all_types_a = torch.tensor([f.types_a for f in train_features], dtype=torch.long)
        # all_mask_a = torch.tensor([f.mask_a for f in train_features], dtype=torch.long)
        # all_tokens_b = torch.tensor([f.tokens_b for f in train_features], dtype=torch.long)
        # all_types_b = torch.tensor([f.types_b for f in train_features], dtype=torch.long)
        # all_mask_b = torch.tensor([f.mask_b for f in train_features], dtype=torch.long)
        # all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        # train_data = TensorDataset(all_tokens_a, all_types_a, all_mask_a,
        #         all_tokens_b, all_types_b, all_mask_b, all_label_ids)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            with tqdm(train_dataloader, desc="Iteration") as pbar:
                for step, batch in enumerate(pbar):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids = batch
                    loss, logits = model(input_ids, segment_ids, input_mask, label_ids)
                    # tokens_a, types_a, mask_a, tokens_b, types_b, mask_b, label_ids = batch
                    # loss, logits = model(tokens_a, types_a, mask_a, tokens_b, types_b,
                    #         mask_b, label_ids)

                    if n_gpu > 1:
                        loss = loss.mean() # mean() to average on multi-gpu.
                    if args.fp16 and args.loss_scale != 1.0:
                        # rescale loss for fp16 training
                        # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                        loss = loss * args.loss_scale
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    loss.backward()
                    tr_loss += loss.item()
                    
                    preds = torch.argmax(logits, dim=1)
                    acc = accuracy_score(label_ids.detach().cpu().numpy(),
                                preds.detach().cpu().numpy())

                    pbar.set_postfix(loss=loss.item(), acc=acc.item())
                    #nb_tr_examples += input_ids.size(0)
                    nb_tr_examples += label_ids.size(0)
                    nb_tr_steps += 1
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        if args.fp16 or args.optimize_on_cpu:
                            if args.fp16 and args.loss_scale != 1.0:
                                # scale down gradients for fp16 training
                                for param in model.parameters():
                                    param.grad.data = param.grad.data / args.loss_scale
                            is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                            if is_nan:
                                logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                                args.loss_scale = args.loss_scale / 2
                                model.zero_grad()
                                continue
                            optimizer.step()
                            copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                        else:
                            optimizer.step()
                        model.zero_grad()
                        global_step += 1

        torch.save(model.state_dict(), os.path.join(args.output_dir,
                                            "pytorch_model.bin"))

    if args.do_eval:
        from tqdm import tqdm, trange
        from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
        from torch.utils.data.distributed import DistributedSampler

        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
           eval_examples, label_list, args.max_seq_length, tokenizer)
        # eval_features = convert_examples_to_siamese_features(
        #    eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # all_tokens_a = torch.tensor([f.tokens_a for f in eval_features], dtype=torch.long)
        # all_types_a = torch.tensor([f.types_a for f in eval_features], dtype=torch.long)
        # all_mask_a = torch.tensor([f.mask_a for f in eval_features], dtype=torch.long)
        # all_tokens_b = torch.tensor([f.tokens_b for f in eval_features], dtype=torch.long)
        # all_types_b = torch.tensor([f.types_b for f in eval_features], dtype=torch.long)
        # all_mask_b = torch.tensor([f.mask_b for f in eval_features], dtype=torch.long)
        # all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        # eval_data = TensorDataset(all_tokens_a, all_types_a, all_mask_a,
        #         all_tokens_b, all_types_b, all_mask_b, all_label_ids)

        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        for batch in tqdm(eval_dataloader):
            # tokens_a, types_a, mask_a, tokens_b, types_b, mask_b, label_ids = batch
            input_ids, input_mask, segment_ids, label_ids = batch
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids)
                # tmp_eval_loss, logits = model(tokens_a, types_a, mask_a,
                #        tokens_b, types_b, mask_b, label_ids)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            preds = np.argmax(logits, axis=1)

            y_true.extend(label_ids)
            y_pred.extend(preds)

            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            #nb_eval_examples += input_ids.size(0)
            nb_eval_examples += label_ids.size # np.array
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'precision': precision_score(y_true, y_pred),
                  'recall': recall_score(y_true, y_pred), 
                  'f1': f1_score(y_true, y_pred)}
        if tr_loss is not None:
            result['loss'] = tr_loss/nb_tr_steps

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results *****")
            writer.write("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_submit:
        eval_examples = processor.get_test_examples(args.test_in_file)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        # eval_features = convert_examples_to_siamese_features(
        #    eval_examples, label_list, args.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        #eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        
        # all_tokens_a = torch.tensor([f.tokens_a for f in eval_features], dtype=torch.long)
        # all_types_a = torch.tensor([f.types_a for f in eval_features], dtype=torch.long)
        # all_mask_a = torch.tensor([f.mask_a for f in eval_features], dtype=torch.long)
        # all_tokens_b = torch.tensor([f.tokens_b for f in eval_features], dtype=torch.long)
        # all_types_b = torch.tensor([f.types_b for f in eval_features], dtype=torch.long)
        # all_mask_b = torch.tensor([f.mask_b for f in eval_features], dtype=torch.long)
        # all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        # eval_data = TensorDataset(all_tokens_a, all_types_a, all_mask_a,
        #        all_tokens_b, all_types_b, all_mask_b, all_label_ids)


        #eval_sampler = SequentialSampler(eval_data)
        #eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        y_pred = []
        batch_size = args.eval_batch_size
        for i in range(0, len(all_label_ids), batch_size):
            input_ids = all_input_ids[i:i+batch_size]
            input_mask = all_input_mask[i:i+batch_size]
            segment_ids = all_segment_ids[i:i+batch_size]
            label_ids = all_label_ids[i:i+batch_size]

            # tokens_a = all_tokens_a[i:i+batch_size]
            # types_a = all_types_a[i:i+batch_size]
            # mask_a = all_mask_a[i:i+batch_size]
            # tokens_b = all_tokens_b[i:i+batch_size]
            # types_b = all_types_b[i:i+batch_size]
            # mask_b = all_mask_b[i:i+batch_size]

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask)
                # logits = model(tokens_a, types_a, mask_a,
                #        tokens_b, types_b, mask_b)

            logits = logits.detach().cpu().numpy()
            preds = np.argmax(logits, axis=1)

            y_pred.extend(preds)

        with open(args.test_out_file, "w") as writer:
            for i, y in enumerate(y_pred):
                writer.write('{}\t{}\n'.format(i+1, y))
Exemplo n.º 23
0
def main():
    parser = argparse.ArgumentParser()
    BERT_DIR = "uncased_L-12_H-768_A-12/"
    ## Required parameters
    parser.add_argument("--bert_config_file", default=BERT_DIR+"bert_config.json", \
                        type=str, help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=BERT_DIR+"vocab.txt", type=str, \
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default="out", type=str, \
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--load", default=False, action='store_true')
    parser.add_argument("--train_file", type=str, \
                        help="SQuAD json for training. E.g., train-v1.1.json", \
                        default="/home/sewon/data/squad/train-v1.1.json")
    parser.add_argument("--predict_file", type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json", \
                        default="/home/sewon/data/squad/dev-v1.1.json")
    parser.add_argument("--init_checkpoint", type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).", \
                        default=BERT_DIR+"pytorch_model.bin")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument(
        "--max_seq_length",
        default=300,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=39,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=300,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=1000.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop",
                        default=1000,
                        type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument(
        "--n_best_size",
        default=3,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument('--eval_period', type=int, default=500)
    parser.add_argument('--max_n_answers', type=int, default=20)
    parser.add_argument('--n_paragraphs', type=str, default='40')
    parser.add_argument('--verbose', action="store_true", default=False)
    parser.add_argument('--wait_step', type=int, default=12)

    # Learning method variation
    parser.add_argument('--loss_type', type=str, default="mml")
    parser.add_argument('--tau', type=float, default=12000.0)

    # For evaluation
    parser.add_argument('--prefix', type=str, default="")  #500
    parser.add_argument('--debug', action="store_true", default=False)

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Output directory () already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO,
        handlers=[
            logging.FileHandler(os.path.join(args.output_dir, "log.txt")),
            logging.StreamHandler()
        ])
    logger = logging.getLogger(__name__)
    logger.info(args)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
        if not args.predict_file:
            raise ValueError(
                "If `do_train` is True, then `predict_file` must be specified."
            )

    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.do_train and args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    model = BertForQuestionAnswering(bert_config,
                                     device,
                                     4,
                                     loss_type=args.loss_type,
                                     tau=args.tau)
    metric_name = "EM"

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    train_split = ',' in args.train_file
    if train_split:
        n_train_files = len(args.train_file.split(','))

    eval_dataloader, eval_examples, eval_features, _ = get_dataloader(
        logger=logger,
        args=args,
        input_file=args.predict_file,
        is_training=False,
        batch_size=args.predict_batch_size,
        num_epochs=1,
        tokenizer=tokenizer)

    if args.do_train:
        train_file = args.train_file
        if train_split:
            train_file = args.train_file.split(',')[0]
        train_dataloader, _, _, num_train_steps = get_dataloader(
                logger=logger,
                args=args,
                input_file=train_file,
                is_training=True,
                batch_size=args.train_batch_size,
                num_epochs=args.num_train_epochs,
                tokenizer=tokenizer)

    if args.init_checkpoint is not None:
        logger.info("Loading from {}".format(args.init_checkpoint))
        state_dict = torch.load(args.init_checkpoint, map_location='cpu')
        if args.do_train and args.init_checkpoint.endswith(
                'pytorch_model.bin'):
            model.bert.load_state_dict(state_dict)
        else:
            filter = lambda x: x[7:] if x.startswith('module.') else x
            state_dict = {filter(k): v for (k, v) in state_dict.items()}
            model.load_state_dict(state_dict)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        no_decay = ['bias', 'gamma', 'beta']
        # exclude bias/gamma/beta (LayerNorm) parameters from weight decay via substring match
        optimizer_parameters = [{
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.01
        }, {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.0
        }]

        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

        global_step = 0
        best_f1 = (-1, -1)
        wait_step = 0
        model.train()
        stop_training = False
        train_losses = []

        for epoch in range(int(args.num_train_epochs)):
            if epoch > 0 and train_split:
                train_file = args.train_file.split(',')[epoch % n_train_files]
                train_dataloader = get_dataloader(
                        logger=logger,
                        args=args,
                        input_file=train_file,
                        is_training=True,
                        batch_size=args.train_batch_size,
                        num_epochs=args.num_train_epochs,
                        tokenizer=tokenizer)[0]

            for step, batch in enumerate(train_dataloader):
                global_step += 1
                batch = [t.to(device) for t in batch]
                loss = model(batch, global_step)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                train_losses.append(loss.detach().cpu())
                loss.backward()
                if global_step % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                if global_step % args.eval_period == 0:
                    model.eval()
                    f1 = predict(logger, args, model, eval_dataloader,
                                 eval_examples, eval_features, device,
                                 write_prediction=False)
                    logger.info(
                        "Step %d Train loss %.2f EM %.2f F1 %.2f on epoch=%d" %
                        (global_step, np.mean(train_losses), f1[0] * 100,
                         f1[1] * 100, epoch))
                    train_losses = []
                    if best_f1 < f1:
                        logger.info("Saving model with best %s: %.2f (F1 %.2f) -> %.2f (F1 %.2f) on epoch=%d" % \
                                (metric_name, best_f1[0]*100, best_f1[1]*100, f1[0]*100, f1[1]*100, epoch))
                        model_state_dict = {
                            k: v.cpu()
                            for (k, v) in model.state_dict().items()
                        }
                        torch.save(
                            model_state_dict,
                            os.path.join(args.output_dir, "best-model.pt"))
                        model = model.to(device)
                        best_f1 = f1
                        wait_step = 0
                        stop_training = False
                    else:
                        wait_step += 1
                        if wait_step == args.wait_step:
                            stop_training = True
                    model.train()
            if stop_training:
                break

        logger.info("Training finished!")

    elif args.do_predict:
        if isinstance(model, list):
            model = [m.eval() for m in model]
        else:
            model.eval()
        f1 = predict(logger,
                     args,
                     model,
                     eval_dataloader,
                     eval_examples,
                     eval_features,
                     device,
                     varying_n_paragraphs=len(args.n_paragraphs) > 1)
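# --- Illustrative sketch (not part of the original snippet) ---
# The training loop above keeps the checkpoint with the best (EM, F1) tuple and
# stops once `wait_step` evaluations pass without improvement. A minimal,
# standalone version of that bookkeeping, assuming the metric is a tuple whose
# ordinary tuple comparison means "better", could look like this:
import os
import torch


class BestCheckpointKeeper:
    """Save the best-scoring model and signal early stopping after `patience` misses."""

    def __init__(self, output_dir, patience):
        self.best = (-1, -1)
        self.patience = patience
        self.wait = 0
        self.path = os.path.join(output_dir, "best-model.pt")

    def update(self, model, metric):
        if metric > self.best:
            # move tensors to CPU before serializing, as in the snippet above
            state = {k: v.cpu() for k, v in model.state_dict().items()}
            torch.save(state, self.path)
            self.best = metric
            self.wait = 0
            return False  # keep training
        self.wait += 1
        return self.wait >= self.patience  # True means stop training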
Exemplo n.º 24
0
def main():
    #parse arguments
    config.parse()
    args = config.args
    for k, v in vars(args).items():
        logger.info(f"{k}:{v}")
    #set seeds
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)

    #arguments check
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
    forward_batch_size = int(args.train_batch_size /
                             args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size

    #load bert config
    bert_config_T = BertConfig.from_json_file(args.bert_config_file_T)
    bert_config_S = BertConfig.from_json_file(args.bert_config_file_S)
    assert args.max_seq_length <= bert_config_T.max_position_embeddings
    assert args.max_seq_length <= bert_config_S.max_position_embeddings

    #Prepare GLUE task
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    #read data
    train_dataset = None
    eval_datasets = None
    num_train_steps = None
    tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)
    # load the datasets
    if args.do_train:
        train_dataset, examples = load_and_cache_examples(args,
                                                          args.task_name,
                                                          tokenizer,
                                                          evaluate=False)
        if args.aux_task_name:
            aux_train_dataset, examples = load_and_cache_examples(
                args,
                args.aux_task_name,
                tokenizer,
                evaluate=False,
                is_aux=True)
            train_dataset = torch.utils.data.ConcatDataset(
                [train_dataset, aux_train_dataset])
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size) * args.num_train_epochs
    if args.do_predict:
        eval_datasets = []
        eval_task_names = ("mnli",
                           "mnli-mm") if args.task_name == "mnli" else (
                               args.task_name, )
        for eval_task in eval_task_names:
            eval_dataset, examples = load_and_cache_examples(args,
                                                             eval_task,
                                                             tokenizer,
                                                             evaluate=True)
            eval_datasets.append(eval_dataset)
    logger.info("数据集加载成功")

    # load the teacher and student models
    model_T = BertForGLUESimple(bert_config_T,
                                num_labels=num_labels,
                                args=args)
    model_S = BertForGLUESimple(bert_config_S,
                                num_labels=num_labels,
                                args=args)
    # load the teacher model parameters
    if args.tuned_checkpoint_T is not None:
        state_dict_T = torch.load(args.tuned_checkpoint_T, map_location='cpu')
        model_T.load_state_dict(state_dict_T)
        model_T.eval()
    else:
        assert args.do_predict is True
    #Load student
    if args.load_model_type == 'bert':
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
        if args.only_load_embedding:
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items()
                if k.startswith('bert.embeddings')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            logger.info(f"Missing keys {list(missing_keys)}")
        else:
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items() if k.startswith('bert.')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            assert len(missing_keys) == 0
        logger.info("Model loaded")
    elif args.load_model_type == 'all':
        assert args.tuned_checkpoint_S is not None
        state_dict_S = torch.load(args.tuned_checkpoint_S, map_location='cpu')
        model_S.load_state_dict(state_dict_S)
        logger.info("Model loaded")
    else:
        logger.info("Student模型没有可加载参数,随机初始化参数 randomly initialized.")
    model_T.to(device)
    model_S.to(device)

    if args.local_rank != -1 or n_gpu > 1:
        if args.local_rank != -1:
            raise NotImplementedError
        elif n_gpu > 1:
            model_T = torch.nn.DataParallel(model_T)  #,output_device=n_gpu-1)
            model_S = torch.nn.DataParallel(model_S)  #,output_device=n_gpu-1)

    if args.do_train:
        #parameters
        params = list(model_S.named_parameters())
        all_trainable_params = divide_parameters(params, lr=args.learning_rate)
        logger.info("Length of all_trainable_params: %d",
                    len(all_trainable_params))
        # optimizer configuration
        optimizer = BERTAdam(all_trainable_params,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps,
                             schedule=args.schedule,
                             s_opt1=args.s_opt1,
                             s_opt2=args.s_opt2,
                             s_opt3=args.s_opt3)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Forward batch size = %d", forward_batch_size)
        logger.info("  Num backward steps = %d", num_train_steps)

        ########### DISTILLATION ###########
        train_config = TrainingConfig(
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            ckpt_frequency=args.ckpt_frequency,
            log_dir=args.output_dir,
            output_dir=args.output_dir,
            device=args.device)
        # matches provides a set of fixed intermediate-layer match configurations
        from matches import matches
        intermediate_matches = None
        if isinstance(args.matches, (list, tuple)):
            intermediate_matches = []
            for match in args.matches:
                intermediate_matches += matches[match]
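        # Each entry pulled from `matches` is assumed to describe one
        # teacher/student layer pairing, e.g. (hypothetical schema):
        #   {'layer_T': 8, 'layer_S': 2, 'feature': 'hidden',
        #    'loss': 'hidden_mse', 'weight': 1}
        # The exact fields depend on the accompanying matches.py.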
        logger.info(f"中间层match信息: {intermediate_matches}")
        distill_config = DistillationConfig(
            temperature=args.temperature,
            intermediate_matches=intermediate_matches)

        logger.info(f"训练配置: {train_config}")
        logger.info(f"蒸馏配置: {distill_config}")
        adaptor_T = partial(BertForGLUESimpleAdaptor,
                            no_logits=args.no_logits,
                            no_mask=args.no_inputs_mask)
        adaptor_S = partial(BertForGLUESimpleAdaptor,
                            no_logits=args.no_logits,
                            no_mask=args.no_inputs_mask)
        # general distiller that supports intermediate-state matching
        distiller = GeneralDistiller(train_config=train_config,
                                     distill_config=distill_config,
                                     model_T=model_T,
                                     model_S=model_S,
                                     adaptor_T=adaptor_T,
                                     adaptor_S=adaptor_S)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            raise NotImplementedError
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.forward_batch_size,
                                      drop_last=True)
        callback_func = partial(predict,
                                eval_datasets=eval_datasets,
                                args=args,
                                examples=examples)
        with distiller:
            distiller.train(optimizer,
                            scheduler=None,
                            dataloader=train_dataloader,
                            num_epochs=args.num_train_epochs,
                            callback=callback_func)

    if not args.do_train and args.do_predict:
        res = predict(model_S,
                      eval_datasets,
                      step=0,
                      args=args,
                      examples=examples,
                      label_list=label_list)
        print(res)
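# --- Illustrative sketch (not part of the original snippet) ---
# The checkpoint-loading branches above repeatedly build dictionaries such as
# {k[5:]: v for k, v in state_dict.items() if k.startswith('bert.')}. The same
# pattern as a small helper (nothing model-specific is assumed beyond the key
# prefix):
def strip_state_dict_prefix(state_dict, prefix):
    """Keep only entries whose keys start with `prefix`, with the prefix removed."""
    return {k[len(prefix):]: v
            for k, v in state_dict.items() if k.startswith(prefix)}

# Usage mirroring the 'bert.' case above:
#   state_weight = strip_state_dict_prefix(state_dict_S, 'bert.')
#   missing_keys, _ = model_S.bert.load_state_dict(state_weight, strict=False)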
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default='/hdd/lujunyu/dataset/multi_turn_corpus/ecommerce/',
        type=str,
        required=False,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_config_file",
        default=BERT_BASE_DIR + 'bert_config.json',
        type=str,
        required=False,
        help=
        "The config json file corresponding to the pre-trained BERT model. \n"
        "This specifies the model architecture.")
    parser.add_argument("--task_name",
                        default='ECD',
                        type=str,
                        required=False,
                        help="The name of the task to train.")
    parser.add_argument(
        "--vocab_file",
        default=BERT_BASE_DIR + 'vocab.txt',
        type=str,
        required=False,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default='/hdd/lujunyu/model/chatbert/ecd_without_pretraining/',
        type=str,
        required=False,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test",
                        default=True,
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--train_batch_size",
                        default=500,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=200,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=1e-3,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=5.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.0,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=200,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_test:
        raise ValueError(
            "At least one of `do_train` or `do_test` must be True.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    bert_config.num_hidden_layers = 2
    # bert_config.num_attention_heads = 6
    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        if args.do_train:
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".
                format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                              do_lower_case=args.do_lower_case)
    train_dataset = ECDDatasetForSP(file_path=os.path.join(
        args.data_dir, "train.txt"),
                                    max_seq_length=args.max_seq_length,
                                    tokenizer=tokenizer)
    eval_dataset = ECDDatasetForSP(file_path=os.path.join(
        args.data_dir, "dev.txt"),
                                   max_seq_length=args.max_seq_length,
                                   tokenizer=tokenizer)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        sampler=RandomSampler(train_dataset),
        num_workers=4)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=args.eval_batch_size,
        sampler=SequentialSampler(eval_dataset),
        num_workers=4)

    num_train_steps = None
    if args.do_train:
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    model = BertForSequenceClassification(bert_config, num_labels=2)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # no_decay = ['bias', 'gamma', 'beta']
    # optimizer_parameters = [
    #     {'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01},
    #     {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0}
    # ]

    optimizer_parameters = [{
        'params': [p for n, p in model.named_parameters()],
        'weight_decay_rate': 0.0
    }]

    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step = 0
    best_acc = 0.0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    global_step += 1

                if (step + 1) % args.save_checkpoints_steps == 0:
                    ### Evaluate every save_checkpoints_steps steps
                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    logits_all = []
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss, logits = model(
                                input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=label_ids)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        for i in range(len(logits)):
                            logits_all += [logits[i]]

                        tmp_eval_accuracy = accuracy(logits,
                                                     label_ids.reshape(-1))

                        eval_loss += tmp_eval_loss.mean().item()
                        eval_accuracy += tmp_eval_accuracy

                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = eval_accuracy / nb_eval_examples

                    result = {
                        'eval_loss': eval_loss,
                        'eval_accuracy': eval_accuracy
                    }

                    output_eval_file = os.path.join(args.output_dir,
                                                    "eval_results_dev.txt")
                    with open(output_eval_file, "a") as writer:
                        logger.info("***** Eval results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                    output_eval_file = os.path.join(args.output_dir,
                                                    "logits_dev.txt")
                    with open(output_eval_file, "w") as f:
                        for i in range(len(logits_all)):
                            for j in range(len(logits_all[i])):
                                f.write(str(logits_all[i][j]))
                                if j == len(logits_all[i]) - 1:
                                    f.write("\n")
                                else:
                                    f.write(" ")

                    ### Save the best checkpoint
                    if best_acc < eval_accuracy:
                        try:  ### Remove 'module' prefix when using DataParallel
                            state_dict = model.module.state_dict()
                        except AttributeError:
                            state_dict = model.state_dict()
                        torch.save(state_dict,
                                   os.path.join(args.output_dir, "model.pt"))
                        best_acc = eval_accuracy
                        logger.info('Saving the best model in {}'.format(
                            os.path.join(args.output_dir, "model.pt")))

                    model.train()
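# --- Illustrative sketch (not part of the original snippet) ---
# The best-checkpoint block above unwraps torch.nn.DataParallel by trying
# model.module.state_dict() first. The same idea as a small helper, relying
# only on standard PyTorch attributes:
def unwrapped_state_dict(model):
    """Return the state_dict of the underlying model, ignoring a DataParallel wrapper."""
    target = model.module if hasattr(model, "module") else model
    return target.state_dict()

# Usage mirroring the save above:
#   torch.save(unwrapped_state_dict(model), os.path.join(args.output_dir, "model.pt"))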
Exemplo n.º 26
0
def main():
    #parse arguments
    config.parse()
    args = config.args
    for k, v in vars(args).items():
        logger.info(f"{k}:{v}")
    #set seeds
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)

    #arguments check
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
    forward_batch_size = int(args.train_batch_size /
                             args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size

    #load bert config
    bert_config_S = BertConfig.from_json_file(args.bert_config_file_S)
    assert args.max_seq_length <= bert_config_S.max_position_embeddings

    #Prepare GLUE task
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    # e.g. for MNLI the labels are ['contradiction', 'entailment', 'neutral']
    label_list = processor.get_labels()
    num_labels = len(label_list)

    #read data
    train_dataset = None
    eval_datasets = None
    num_train_steps = None
    tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)
    # load the datasets and compute the number of training steps
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        if args.aux_task_name:
            aux_train_dataset = load_and_cache_examples(args,
                                                        args.aux_task_name,
                                                        tokenizer,
                                                        evaluate=False,
                                                        is_aux=True)
            train_dataset = torch.utils.data.ConcatDataset(
                [train_dataset, aux_train_dataset])
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size) * args.num_train_epochs
    if args.do_predict:
        eval_datasets = []
        eval_task_names = ("mnli",
                           "mnli-mm") if args.task_name == "mnli" else (
                               args.task_name, )
        for eval_task in eval_task_names:
            eval_datasets.append(
                load_and_cache_examples(args,
                                        eval_task,
                                        tokenizer,
                                        evaluate=True))
    logger.info("数据集已加载")

    # load and initialize only the student model; this effectively trains a teacher model on the MNLI data (only one model is trained)
    model_S = BertForGLUESimple(bert_config_S,
                                num_labels=num_labels,
                                args=args)
    # initialize the student model
    if args.load_model_type == 'bert':
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
        if args.only_load_embedding:
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items()
                if k.startswith('bert.embeddings')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            logger.info(f"Missing keys {list(missing_keys)}")
        else:
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items() if k.startswith('bert.')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            assert len(missing_keys) == 0
        logger.info("Model loaded")
    elif args.load_model_type == 'all':
        assert args.tuned_checkpoint_S is not None
        state_dict_S = torch.load(args.tuned_checkpoint_S, map_location='cpu')
        model_S.load_state_dict(state_dict_S)
        logger.info("Model loaded")
    else:
        logger.info("Model is randomly initialized.")
    model_S.to(device)

    if args.local_rank != -1 or n_gpu > 1:
        if args.local_rank != -1:
            raise NotImplementedError
        elif n_gpu > 1:
            model_S = torch.nn.DataParallel(model_S)  #,output_device=n_gpu-1)

    if args.do_train:
        #parameters
        params = list(model_S.named_parameters())
        all_trainable_params = divide_parameters(params, lr=args.learning_rate)
        logger.info("Length of all_trainable_params: %d",
                    len(all_trainable_params))
        # optimizer setup
        optimizer = BERTAdam(all_trainable_params,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps,
                             schedule=args.schedule,
                             s_opt1=args.s_opt1,
                             s_opt2=args.s_opt2,
                             s_opt3=args.s_opt3)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Forward batch size = %d", forward_batch_size)
        logger.info("  Num backward steps = %d", num_train_steps)

        ########### DISTILLATION ###########
        train_config = TrainingConfig(
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            ckpt_frequency=args.ckpt_frequency,
            log_dir=args.output_dir,
            output_dir=args.output_dir,
            device=args.device)

        # run supervised training instead of distillation; this can be used to train a teacher model
        distiller = BasicTrainer(train_config=train_config,
                                 model=model_S,
                                 adaptor=BertForGLUESimpleAdaptorTraining)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            raise NotImplementedError
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.forward_batch_size,
                                      drop_last=True)
        callback_func = partial(predict,
                                eval_datasets=eval_datasets,
                                args=args)
        with distiller:
            distiller.train(optimizer,
                            scheduler=None,
                            dataloader=train_dataloader,
                            num_epochs=args.num_train_epochs,
                            callback=callback_func)

    if not args.do_train and args.do_predict:
        res = predict(model_S, eval_datasets, step=0, args=args)
        print(res)
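# --- Illustrative sketch (not part of the original snippet) ---
# `partial(predict, eval_datasets=eval_datasets, args=args)` pre-binds the
# evaluation arguments so the trainer only has to supply the remaining ones
# (typically the model and the current step). A self-contained illustration of
# the pattern with a stand-in evaluate() (not the real predict()):
from functools import partial


def evaluate(model, step, eval_datasets=None, args=None):
    print(f"evaluating {model!r} on {len(eval_datasets)} dataset(s) at step {step}")


callback = partial(evaluate, eval_datasets=[["dev"]], args=None)
callback("model_S", step=0)  # the trainer would invoke the callback like this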
Exemplo n.º 27
0
def main():
    BERT_BASE_DIR='../data/weights/cased_L-12_H-768_A-12'

    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_config_file",
                        default="{}/bert_config.json".format(BERT_BASE_DIR),
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained BERT model. \n"
                             "This specifies the model architecture.")
    parser.add_argument("--task_name",
                        default="lm",
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--vocab_file",
                        default="{}/vocab.txt".format(BERT_BASE_DIR),
                        type=str,
                        required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--init_checkpoint",
                        default="{}/pytorch_model.bin".format(BERT_BASE_DIR),
                        type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=True,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=10,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=2,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")

    parser.add_argument('--MBTI',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")

    args = parser.parse_args()

    ## Initialization
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")


    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
            args.max_seq_length, bert_config.max_position_embeddings))


    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)
    save_path = os.path.join(args.output_dir, 'state_dict.pkl')
    print("save_path": save_path)


    ## Load Data
    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    decoder = {v:k for k,v in tokenizer.wordpiece_tokenizer.vocab.items()}

    processors = {
        "lm": LMProcessor,
    }

    task_name = args.task_name.lower()
    if task_name not in processors:
            raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name](tokenizer=tokenizer)
    label_list = processor.get_labels()

    train_examples = processor.get_train_examples(args.data_dir, skip=30, tqdm=tqdm)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size * args.num_train_epochs)

    train_features = convert_tokens_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer, tqdm=tqdm)

    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
    all_label_weights = torch.tensor([f.label_weights for f in train_features], dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_label_weights)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    ## Load Model
    model = BertForMaskedLanguageModelling(bert_config)
    if args.init_checkpoint is not None:
        model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))

    if os.path.isfile(save_path):
        model.load_state_dict(torch.load(save_path, map_location='cpu'))

    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    print(model)

    ## Initialize Optimizer
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
        # exclude bias/gamma/beta (LayerNorm) parameters from weight decay via substring match
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
        ]

    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    ## Training
    val_test="""The next day I was somewhat somnolent, of which you may be sure Miss Frankland took no notice. She retired to her own room when we went for our recreation. My friends scolded me for not coming to them the previous night, but I told them that my parents had continued to move about her room for so long a time that I had fallen fast asleep, and even then had not had enough, as they might have observed how sleepy I had been all day."""
    global_step = 0

    model.train()
    for _ in tqdm(range(int(args.num_train_epochs)), desc="Epoch"):
        tr_loss, nb_tr_examples, nb_tr_steps = 0, 0, 0
        with tqdm(total=len(train_dataloader), desc='Iteration', mininterval=60) as prog:
            for step, batch in enumerate(train_dataloader):

                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, label_weights = batch
                loss, logits = model(input_ids, segment_ids, input_mask, label_ids, label_weights)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()    # We have accumulated enough gradients
                    model.zero_grad()
                prog.update(1)
                prog.desc = 'Iter. loss={:2.6f}'.format(tr_loss/nb_tr_examples)
                if step % 3000 == 10:

                    print('step', step, 'loss', tr_loss/nb_tr_examples)
                    display(predict_masked_words(val_test, processor, tokenizer, model, device=device, max_seq_length=args.max_seq_length))
                    display(predict_next_words(val_test, processor, tokenizer, model, max_seq_length=args.max_seq_length, n=10, device=device))
                    tr_loss, nb_tr_examples, nb_tr_steps = 0, 0, 0

        torch.save(model.state_dict(), save_path)

    global_step += 1

    ## Save
    torch.save(model.state_dict(), save_path)

    ## Test
    display(predict_next_words(val_test, processor, tokenizer, model, max_seq_length=args.max_seq_length, n=10, device=device, debug=False))
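# --- Illustrative sketch (not part of the original snippet) ---
# Every training loop in these snippets follows the same accumulation pattern:
# scale the loss by the number of accumulation steps, call backward() on every
# batch, and step the optimizer only every `accum_steps` batches. Condensed,
# assuming a model whose forward pass returns a scalar loss:
def train_with_accumulation(model, optimizer, dataloader, accum_steps, device):
    model.train()
    model.zero_grad()
    for step, batch in enumerate(dataloader):
        batch = tuple(t.to(device) for t in batch)
        loss = model(*batch)
        loss = loss / accum_steps        # keep gradient magnitudes comparable
        loss.backward()                  # gradients accumulate across calls
        if (step + 1) % accum_steps == 0:
            optimizer.step()             # apply the accumulated update
            model.zero_grad()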
Exemplo n.º 28
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--fine_tune_data_1_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--fine_tune_data_2_dir",
        default=None,
        type=str,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--eval_data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help=
        "The config json file corresponding to the pre-trained BERT model. \n"
        "This specifies the model architecture.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--discr",
                        default=False,
                        action='store_true',
                        help="Whether to do discriminative fine-tuning.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument("--frozen_bert",
                        default=False,
                        action='store_true',
                        help="frozen the gradient of bert encoder")
    parser.add_argument(
        '--layers',
        type=int,
        nargs='+',
        default=[-2],
        help="choose the layers that used for downstream tasks, "
        "-2 means use pooled output, -1 means all layer,"
        "else means the detail layers. default is -2")
    parser.add_argument('--num_datas',
                        default=None,
                        type=int,
                        help="the number of data examples")
    parser.add_argument('--num_test_datas',
                        default=None,
                        type=int,
                        help="the number of data examples")
    parser.add_argument('--pooling_type',
                        default=None,
                        type=str,
                        choices=[None, 'mean', 'max'])
    parser.add_argument(
        '--trunc_medium',
        type=int,
        default=-2,
        help="choose the trunc ways, -2 means choose the first seq_len tokens, "
        "-1 means choose the last seq_len tokens, "
        "0 means choose the first (seq_len // 2) and the last(seq_len // 2). "
        "other positive numbers k mean the first k tokens "
        "and the last (seq_len - k) tokens")
    args = parser.parse_args()

    processors = {
        "ag": AGNewsProcessor,
        "ag_sep": AGNewsProcessor_sep,
        "ag_sep_aug": AGNewsProcessor_sep_aug,
        "binary": BinaryProcessor,
        "imdb": IMDBProcessor,
        "imdb_t_m": IMDBProcessor_trunc_medium,
        "imdb_sep": IMDBProcessor_sep,
        "imdb_sep_aug": IMDBProcessor_sep_aug,
        #"yelp_p": Yelp_p_Processor,
        #"yelp_f": Yelp_f_Processor,
        #"yahoo": Yahoo_Processor,
        "trec": Trec_Processor,
        #"dbpedia":Dbpedia_Processor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    model = BertForSequenceClassification(bert_config,
                                          len(label_list),
                                          args.layers,
                                          pooling=args.pooling_type)
    if args.init_checkpoint is not None:
        model.bert.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))

    if args.frozen_bert:
        for p in model.bert.parameters():
            p.requires_grad = False

    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    if args.discr:
        group1 = ['layer.0.', 'layer.1.', 'layer.2.', 'layer.3.']
        group2 = ['layer.4.', 'layer.5.', 'layer.6.', 'layer.7.']
        group3 = ['layer.8.', 'layer.9.', 'layer.10.', 'layer.11.']
        group_all = [
            'layer.0.', 'layer.1.', 'layer.2.', 'layer.3.', 'layer.4.',
            'layer.5.', 'layer.6.', 'layer.7.', 'layer.8.', 'layer.9.',
            'layer.10.', 'layer.11.'
        ]
        optimizer_parameters = [
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay) and not any(
                        nd in n for nd in group_all)
                ],
                'weight_decay_rate':
                0.01
            },
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if not any(nd in n
                               for nd in no_decay) and any(nd in n
                                                           for nd in group1)
                ],
                'weight_decay_rate':
                0.01,
                'lr':
                args.learning_rate / 2.6
            },
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if not any(nd in n
                               for nd in no_decay) and any(nd in n
                                                           for nd in group2)
                ],
                'weight_decay_rate':
                0.01,
                'lr':
                args.learning_rate
            },
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if not any(nd in n
                               for nd in no_decay) and any(nd in n
                                                           for nd in group3)
                ],
                'weight_decay_rate':
                0.01,
                'lr':
                args.learning_rate * 2.6
            },
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if any(nd in n
                           for nd in no_decay) and not any(nd in n
                                                           for nd in group_all)
                ],
                'weight_decay_rate':
                0.0
            },
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if any(nd in n
                           for nd in no_decay) and any(nd in n
                                                       for nd in group1)
                ],
                'weight_decay_rate':
                0.0,
                'lr':
                args.learning_rate / 2.6
            },
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if any(nd in n
                           for nd in no_decay) and any(nd in n
                                                       for nd in group2)
                ],
                'weight_decay_rate':
                0.0,
                'lr':
                args.learning_rate
            },
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if any(nd in n
                           for nd in no_decay) and any(nd in n
                                                       for nd in group3)
                ],
                'weight_decay_rate':
                0.0,
                'lr':
                args.learning_rate * 2.6
            },
        ]
    else:
        optimizer_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.0
        }]

    data_dir = args.fine_tune_data_1_dir

    global_step = 0

    print("Initiate Training Data 1")
    train_examples = None
    num_train_steps = None
    train_examples = processor.get_train_examples(data_dir,
                                                  data_num=args.num_datas)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size * args.num_train_epochs)

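    # BERTAdam is assumed to follow the original pytorch-pretrained-bert schedule:
    # linear LR warmup over warmup_proportion * t_total steps, then linear decay.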
    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    train_features = convert_examples_to_features(
        train_examples,
        label_list,
        args.max_seq_length,
        tokenizer,
        trunc_medium=args.trunc_medium)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    print("Train 1")
    epoch = 0
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        epoch += 1
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()  # We have accumulated enough gradients
                model.zero_grad()
                global_step += 1

    if args.fine_tune_data_2_dir:
        data_dir = args.fine_tune_data_2_dir
        print("Initiate Training Data 2")
        train_examples = None
        num_train_steps = None
        train_examples = processor.get_train_examples(data_dir,
                                                      data_num=args.num_datas)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size *
            args.num_train_epochs)

        optimizer_2 = BERTAdam(optimizer_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               t_total=num_train_steps)

        train_features = convert_examples_to_features(
            train_examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            trunc_medium=args.trunc_medium)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader_2 = DataLoader(train_data,
                                        sampler=train_sampler,
                                        batch_size=args.train_batch_size)

        print("Train 2")
        epoch = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            epoch += 1
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader_2, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer_2.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    global_step += 1

    print("Initiate Eval Data")
    data_dir = args.eval_data_dir
    eval_examples = processor.get_dev_examples(data_dir,
                                               data_num=args.num_test_datas)
    eval_features = convert_examples_to_features(
        eval_examples,
        label_list,
        args.max_seq_length,
        tokenizer,
        trunc_medium=args.trunc_medium)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_label_ids)
    eval_dataloader = DataLoader(eval_data,
                                 batch_size=args.eval_batch_size,
                                 shuffle=False)

    print("Eval Data")
    epoch = 0
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    with open(os.path.join(args.output_dir, "results_data.txt"), "w") as f:
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluate"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, segment_ids,
                                              input_mask, label_ids)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            outputs = np.argmax(logits, axis=1)
            for output in outputs:
                f.write(str(output) + "\n")
            tmp_eval_accuracy = np.sum(outputs == label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples

    result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy}

    output_eval_file = os.path.join(args.output_dir, "eval_data_results.txt")
    print("output_eval_file=", output_eval_file)
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help=
        "The config json file corresponding to the pre-trained BERT model. \n"
        "This specifies the model architecture.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--load_all",
        default=False,
        action='store_true',
        help="Whether to load all parameter or only for bert part.")
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--multi",
                        default=False,
                        help="Whether to add adapter modules",
                        action='store_true')
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument(
        "--optim",
        default='normal',
        help=
        "Whether to split up the optimiser between adapters and not adapters.")
    parser.add_argument(
        "--sample",
        default='rr',
        help="How to sample tasks, other options 'prop', 'sqrt' or 'anneal'")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--h_aug",
                        default="n/a",
                        help="Size of hidden state for adapters..")
    parser.add_argument("--tasks",
                        default="all",
                        help="Which set of tasks to train on.")
    parser.add_argument(
        "--task_id",
        default=1,
        help="ID of single task to train on if using that setting.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--freeze",
                        default=False,
                        action='store_true',
                        help="Freeze base network weights")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    args = parser.parse_args()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "rte": RTEProcessor,
        "sts": STSProcessor,
        "sst": SSTProcessor,
        "qqp": QQPProcessor,
        "qnli": QNLIProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    if args.tasks == 'inf':
        task_names = ['qnli', 'mnli', 'rte']
        data_dirs = ['QNLI', 'MNLI', 'RTE']
    elif args.tasks == 'all':
        task_names = [
            'cola', 'mrpc', 'mnli', 'rte', 'sts', 'sst', 'qqp', 'qnli'
        ]
        data_dirs = [
            'CoLA', 'MRPC', 'MNLI', 'RTE', 'STS-B', 'SST-2', 'QQP', 'QNLI'
        ]
    elif args.tasks == 'single':
        task_names = [
            'cola', 'mrpc', 'mnli', 'rte', 'sts', 'sst', 'qqp', 'qnli'
        ]
        data_dirs = [
            'CoLA', 'MRPC', 'MNLI', 'RTE', 'STS-B', 'SST-2', 'QQP', 'QNLI'
        ]
        task_names = [task_names[int(args.task_id)]]
        data_dirs = [data_dirs[int(args.task_id)]]
    if task_names[0] not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor_list = [processors[task_name]() for task_name in task_names]
    label_list = [processor.get_labels() for processor in processor_list]

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_tasks = len(task_names)
    if args.do_train:
        train_examples = [
            processor.get_train_examples(args.data_dir + data_dir)
            for processor, data_dir in zip(processor_list, data_dirs)
        ]
        num_train_steps = int(
            len(train_examples[0]) / args.train_batch_size *
            args.num_train_epochs)
        if args.tasks == 'all':
            total_tr = 300 * num_tasks * args.num_train_epochs
        else:
            total_tr = int(0.5 * num_train_steps)

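    # Heuristic step budget: with all tasks, each epoch runs a fixed 300 optimizer
    # steps per task; for a single task, roughly half the estimated training steps.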
    if args.tasks == 'all':
        steps_per_epoch = args.gradient_accumulation_steps * 300 * num_tasks
    else:
        steps_per_epoch = int(num_train_steps / (2. * args.num_train_epochs))
    bert_config.num_tasks = num_tasks
    if args.h_aug != 'n/a':
        bert_config.hidden_size_aug = int(args.h_aug)
    model = BertForMultiTask(bert_config,
                             [len(labels) for labels in label_list])

    if args.init_checkpoint is not None:
        # load all parameter including the classification and model patch
        if args.load_all:
            model.load_state_dict(
                torch.load(args.init_checkpoint, map_location='cpu'))
        elif args.multi:
            partial = torch.load(args.init_checkpoint, map_location='cpu')
            model_dict = model.bert.state_dict()
            update = {}
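            # Build a merged state dict: adapter ('aug'/'mult') weights keep their
            # fresh initialization, except the multi-task pooler which is seeded
            # from the pretrained pooler; all other weights come from the checkpoint.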
            for n, p in model_dict.items():
                if 'aug' in n or 'mult' in n:
                    update[n] = p
                    if 'pooler.mult' in n and 'bias' in n:
                        update[n] = partial['pooler.dense.bias']
                    if 'pooler.mult' in n and 'weight' in n:
                        update[n] = partial['pooler.dense.weight']
                else:
                    update[n] = partial[n]
            model.bert.load_state_dict(update)
        else:
            model.bert.load_state_dict(
                torch.load(args.init_checkpoint, map_location='cpu'))

    if args.freeze:
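        # Freeze the base BERT weights; adapter ('aug'/'mult'), classifier and
        # LayerNorm gamma/beta parameters remain trainable.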
        for n, p in model.bert.named_parameters():
            if 'aug' in n or 'classifier' in n or 'mult' in n or 'gamma' in n or 'beta' in n:
                continue
            p.requires_grad = False

    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
    if args.optim == 'normal':
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.0
        }]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=total_tr)
    else:
        no_decay = ['bias', 'gamma', 'beta']
        base = ['attn']
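        # Split optimizer: parameters whose names contain 'attn' are trained by a
        # separate BERTAdam instance (defined below) with a fixed 3e-4 learning
        # rate; the remaining parameters use the main optimizer.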
        optimizer_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n
                           for nd in no_decay) and not any(nd in n
                                                           for nd in base)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay) and not any(nd in n
                                                               for nd in base)
            ],
            'weight_decay_rate':
            0.0
        }]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=total_tr)
        optimizer_parameters_mult = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay) and any(nd in n
                                                               for nd in base)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay) and any(nd in n
                                                           for nd in base)
            ],
            'weight_decay_rate':
            0.0
        }]
        optimizer_mult = BERTAdam(optimizer_parameters_mult,
                                  lr=3e-4,
                                  warmup=args.warmup_proportion,
                                  t_total=total_tr)
    if args.do_eval:
        eval_loaders = []
        for i, task in enumerate(task_names):
            eval_examples = processor_list[i].get_dev_examples(args.data_dir +
                                                               data_dirs[i])
            eval_features = convert_examples_to_features(
                eval_examples, label_list[i], args.max_seq_length, tokenizer,
                task)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            if task != 'sts':
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.long)
            else:
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.float32)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label_ids)

            if args.local_rank == -1:
                eval_sampler = SequentialSampler(eval_data)
            else:
                eval_sampler = DistributedSampler(eval_data)
            eval_loaders.append(
                DataLoader(eval_data,
                           sampler=eval_sampler,
                           batch_size=args.eval_batch_size))

    global_step = 0
    if args.do_train:
        loaders = []
        logger.info("  Num Tasks = %d", len(train_examples))
        for i, task in enumerate(task_names):
            train_features = convert_examples_to_features(
                train_examples[i], label_list[i], args.max_seq_length,
                tokenizer, task)
            logger.info("***** training data for %s *****", task)
            logger.info("  Data size = %d", len(train_features))

            all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in train_features], dtype=torch.long)
            if task != 'sts':
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.long)
            else:
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.float32)

            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_label_ids)
            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            loaders.append(
                iter(
                    DataLoader(train_data,
                               sampler=train_sampler,
                               batch_size=args.train_batch_size)))
        total_params = sum(p.numel() for p in model.parameters())
        logger.info("  Num param = {}".format(total_params))
        loaders = [cycle(it) for it in loaders]
        model.train()
        best_score = 0.
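        # Task-sampling weights: the hard-coded counts appear to be the per-task
        # training-set sizes; 'prop' samples proportionally to size (alpha=1) and
        # 'sqrt' proportionally to the square root of size (alpha=0.5).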
        if args.sample == 'sqrt' or args.sample == 'prop':
            probs = [6680, 2865, 306798, 1945, 4491, 52616, 284257, 84715]
            if args.sample == 'prop':
                alpha = 1.
            if args.sample == 'sqrt':
                alpha = 0.5
            probs = [p**alpha for p in probs]
            tot = sum(probs)
            probs = [p / tot for p in probs]
        task_id = 0
        epoch = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            if args.sample == 'anneal':
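                # Annealed sampling: alpha decays linearly from 1.0 towards 0.2
                # over the epochs, shifting from size-proportional to flatter sampling.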
                probs = [6680, 2865, 306798, 1945, 4491, 52616, 284257, 84715]
                alpha = 1. - 0.8 * epoch / (args.num_train_epochs - 1)
                probs = [p**alpha for p in probs]
                tot = sum(probs)
                probs = [p / tot for p in probs]

            tr_loss = [0. for i in range(num_tasks)]
            nb_tr_examples, nb_tr_steps = 0, 0
            for step in range(steps_per_epoch):
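                # Choose the task for this step: draw a task id from probs at the
                # start of each accumulation cycle, or cycle tasks round-robin for 'rr'.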
                if args.sample != 'rr':
                    if step % args.gradient_accumulation_steps == 0:
                        task_id = np.random.choice(8, p=probs)
                else:
                    task_id = task_id % num_tasks
                batch = next(loaders[task_id])
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, segment_ids, input_mask, task_id,
                                task_names[task_id], label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss[task_id] += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if step % 1000 < num_tasks:
                    logger.info("Task: {}, Step: {}".format(task_id, step))
                    logger.info("Loss: {}".format(tr_loss[task_id] /
                                                  nb_tr_steps))
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    if args.optim != 'normal':
                        optimizer_mult.step()
                    model.zero_grad()
                    global_step += 1
                    if args.sample == 'rr':
                        task_id += 1  # advance to the next task for round-robin sampling
            epoch += 1
            ev_acc = 0.
            for i, task in enumerate(task_names):
                ev_acc += do_eval(model, logger, args, device, tr_loss[i],
                                  nb_tr_steps, global_step, processor_list[i],
                                  label_list[i], tokenizer, eval_loaders[i],
                                  task, i)
            logger.info("Total acc: {}".format(ev_acc))
            if ev_acc > best_score:
                best_score = ev_acc
                model_dir = os.path.join(args.output_dir, "best_model.pth")
                torch.save(model.state_dict(), model_dir)
            logger.info("Best Total acc: {}".format(best_score))

        ev_acc = 0.
        for i, task in enumerate(task_names):
            ev_acc += do_eval(model, logger, args, device, tr_loss[i],
                              nb_tr_steps, global_step, processor_list[i],
                              label_list[i], tokenizer, eval_loaders[i], task,
                              i)
        logger.info("Total acc: {}".format(ev_acc))
Exemplo n.º 30
0
def main(args):
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    bert_config_file = "distilbert_config.json"
    bert_config = BertConfig.from_json_file(bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
    #     shutil.rmtree(args.output_dir)
    # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    # prepare dataloaders
    processor = Sentihood_QA_M_Processor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    # training set
    num_train_steps = None

    train_data, len_train = get_data(processor, label_list, tokenizer,
                                     args.data_dir, 'train')
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    num_train_steps = int(len_train / args.train_batch_size *
                          args.num_train_epochs)
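    # Note: train_batch_size was already divided by accumulate_gradients above, so
    # num_train_steps counts micro-batches; with accumulation > 1 the BERTAdam
    # schedule (t_total) is stretched relative to the actual number of optimizer steps.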
    logger.info("***** Running training *****")
    logger.info("  Num Training examples = %d", len_train)
    logger.info("  Training Batch size = %d", args.train_batch_size)
    logger.info("  Training Num steps = %d", num_train_steps)

    # test set
    if args.eval_test:
        test_data, len_test = get_data(processor, label_list, tokenizer,
                                       args.data_dir, 'test')
        test_dataloader = DataLoader(test_data,
                                     batch_size=args.eval_batch_size,
                                     shuffle=False)
        logger.info("  Num Test examples = %d", len_test)

    # model and optimizer
    #model = BertForSequenceClassification(bert_config, len(label_list))
    #if args.init_checkpoint is not None:
    #    model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))

    model = BertBinaryClassifier(len(label_list))
    if args.init_checkpoint:
        logger.info("Loading model from checkpoint %s", args.init_checkpoint)
        model.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate':
        0.0
    }]

    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)
    #optimizer = Adam(model.parameters(), lr=args.learning_rate)
    # train
    timestamp = time.strftime('%b-%d-%Y_%H%M', time.localtime())
    output_log_file = os.path.join(args.output_dir,
                                   "log_{}.txt".format(timestamp))
    print("output_log_file=", output_log_file)
    with open(output_log_file, "w") as writer:
        if args.eval_test:
            writer.write(
                "epoch\tglobal_step\ttrain_loss\ttrain_accuracy\ttest_loss\ttest_accuracy\n"
            )
        else:
            writer.write("epoch\tglobal_step\ttrain_loss\ttrain_accuracy\n")

    global_step = 0
    for epoch_num in range(int(args.num_train_epochs)):
        model.train()
        train_loss, train_accuracy, nb_train_steps, nb_train_examples = 0, 0, 0, 0
        for step, batch_data in enumerate(
                tqdm(train_dataloader, desc="Iteration")):
            #token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
            batch = tuple(t.to(device) for t in batch_data)
            input_ids, input_mask, label_ids = batch
            outputs = model(input_ids, input_mask, label_ids)

            #logits is the proba
            loss, logits = outputs[:2]
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            outputs = np.argmax(logits, axis=1)
            train_accuracy += np.sum(outputs == label_ids)

            train_loss += loss.item()
            loss.backward()

            nb_train_examples += input_ids.size(0)
            nb_train_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()  # We have accumulated enough gradients
                model.zero_grad()
                global_step += 1

            if (step + 1) % 50 == 0:
                logger.info(
                    "Epoch = %d, Batch = %d, Batch loss = %f, Avg loss (per batch) = %f",
                    epoch_num + 1, (step + 1), loss.item(),
                    train_loss / (step + 1))

            if global_step % 200 == 0:
                logger.info("Creating a checkpoint.")
                model.eval().cpu()
                ckpt_model_filename = "ckpt_epoch_" + str(
                    epoch_num + 1) + "_examples_" + str(global_step) + ".pth"
                ckpt_model_path = os.path.join(args.output_dir,
                                               ckpt_model_filename)
                torch.save(model.state_dict(), ckpt_model_path)
                model.to(device)
                model.train()  # restore training mode after checkpointing (eval() above disables dropout)

        train_loss /= nb_train_steps
        train_accuracy /= nb_train_examples
        logger.info(
            "After %f epoch, Training loss = %f, Training accuracy = %f",
            epoch_num + 1, train_loss, train_accuracy)

        # eval_test
        if args.eval_test:
            model.eval()
            test_loss, test_accuracy = 0, 0
            nb_test_steps, nb_test_examples = 0, 0
            with open(
                    os.path.join(args.output_dir,
                                 "test_ep_" + str(epoch_num + 1) + ".txt"),
                    "w") as f_test:
                for input_ids, input_mask, label_ids in test_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        outputs = model(input_ids, input_mask, label_ids)

                    tmp_test_loss = outputs[0]
                    logits = outputs[1]
                    logits = logits.detach().cpu().numpy()
                    logits = softmax(logits, axis=1)
                    label_ids = label_ids.to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output_i in range(len(outputs)):
                        f_test.write(str(outputs[output_i]))
                        for ou in logits[output_i]:
                            f_test.write(" " + str(ou))
                        f_test.write("\n")
                    tmp_test_accuracy = np.sum(outputs == label_ids)

                    test_loss += tmp_test_loss.mean().item()
                    test_accuracy += tmp_test_accuracy

                    nb_test_examples += input_ids.size(0)
                    nb_test_steps += 1

            test_loss = test_loss / nb_test_steps
            test_accuracy = test_accuracy / nb_test_examples

        result = collections.OrderedDict()
        if args.eval_test:
            result = {
                'epoch': epoch_num + 1,
                'global_step': global_step,
                'train_loss': train_loss,
                'train_accuracy': train_accuracy,
                'test_loss': test_loss,
                'test_accuracy': test_accuracy
            }
        else:
            result = {
                'epoch': epoch_num + 1,
                'global_step': global_step,
                'train_loss': train_loss,
                'train_accuracy': train_accuracy
            }

        logger.info("***** Eval results *****")
        with open(output_log_file, "a+") as writer:
            print("Final Classfier Results: ", result)
            for key in result.keys():
                logger.info("  %s = %s\n", key, str(result[key]))
                writer.write("%s\t" % (str(result[key])))
            writer.write("\n")

    model.eval().cpu()
    timestamp = time.strftime('%b-%d-%Y_%H%M', time.localtime())
    save_model_filename = "epoch_" + str(
        args.num_train_epochs) + "_" + timestamp + ".model"
    save_model_path = os.path.join(args.output_dir, save_model_filename)
    torch.save(model.state_dict(), save_model_path)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained BERT model. \n"
                             "This specifies the model architecture.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--vocab_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', 
                        type=int, 
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")                       
    parser.add_argument('--optimize_on_cpu',
                        default=False,
                        action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=128,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    args = parser.parse_args()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "news": NewsProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        # torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
            args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))


    processor = processors[task_name]()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    label_list = processor.get_labels()

    print("label_list.size:%d\n" %(len(label_list)))

    # Prepare model
    model = BertForSequenceClassification(bert_config, len(label_list))
    if args.init_checkpoint is not None:
        model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
    if args.fp16:
        model.half()
    model.to(device)
    #if args.local_rank != -1:
        #model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
     #                                                     output_device=args.local_rank)
    #elif n_gpu > 1:
    #    model = torch.nn.DataParallel(model)

    # Prepare optimizer
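    # For fp16 or optimize_on_cpu, keep a float32 master copy of the parameters on
    # CPU: gradients are copied into it before each optimizer step and the updated
    # weights are copied back to the model afterwards.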
    if args.fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                            for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                            for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if n in no_decay], 'weight_decay_rate': 0.0}
        ]
    optimizer = BERTAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = RandomSampler(train_data)
            # train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            args.loss_scale = args.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1

    if args.do_eval:
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = SequentialSampler(eval_data)
            # eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': tr_loss/nb_tr_steps}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))