def save_all(self, path: str, tokenizer: BertTokenizer, label_encoder):
        """Save all files needed for inference

        :param path:
        :param tokenizer: Bert tokenizer
        :param label_encoder: label encoder
        :return:
        """
        torch.save(self.model.state_dict(), os.path.join(path, config.MODEL_NAME))
        tokenizer.save_pretrained(path)
        with open(os.path.join(path, 'label_encoder.pkl'), 'wb') as output:
            pickle.dump(label_encoder, output)
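
# A minimal counterpart to save_all that loads the artifacts back for inference.
# Sketch only: the already-constructed `model` instance and the project-level
# `config` module (providing config.MODEL_NAME, as used above) are assumptions.
import os
import pickle

import torch
from transformers import BertTokenizer

import config  # project-level config module providing MODEL_NAME (assumed)


def load_all(path: str, model):
    """Load the weights, tokenizer and label encoder written by save_all (sketch)."""
    model.load_state_dict(
        torch.load(os.path.join(path, config.MODEL_NAME), map_location="cpu"))
    model.eval()
    tokenizer = BertTokenizer.from_pretrained(path)
    with open(os.path.join(path, 'label_encoder.pkl'), 'rb') as f:
        label_encoder = pickle.load(f)
    return model, tokenizer, label_encoder
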
def save_model(
    args,
    model,
    optimizer,
    src_tokenizer: BertTokenizer,
    tgt_tokenizer: GPT2Tokenizer,
    nstep,
    nepoch,
    bleu,
    loss,
):
    # Log the overall training metrics (epoch, step, loss, BLEU)
    train_metric_log_file = os.path.join(args.output_dir, "training_metric.tsv")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if os.path.exists(train_metric_log_file):
        with open(train_metric_log_file, "a", encoding="utf-8") as fa:
            fa.write("{}\t{}\t{}\t{}\n".format(nepoch, nstep, loss, bleu))
    else:
        with open(train_metric_log_file, "w", encoding="utf-8") as fw:
            fw.write("epoch\tstep\tloss\tbleu\n")
            fw.write("{}\t{}\t{}\t{}\n".format(nepoch, nstep, loss, bleu))

    # Save the model
    model_save_path = os.path.join(
        args.output_dir, "epoch{}_step{}/".format(nepoch, nstep)
    )
    os.makedirs(model_save_path, exist_ok=True)
    model.save_pretrained(model_save_path)
    if args.local_rank == 0 or args.local_rank == -1:
        print(
            "epoch:{} step:{} loss:{} bleu:{} model save complete.".format(
                nepoch, nstep, round(loss, 4), round(bleu, 4)
            )
        )
    if args.save_optimizer:
        torch.save(optimizer, os.path.join(model_save_path, "optimizer.pt"))

    # Save the tokenizers
    src_tokenizer.save_pretrained(os.path.join(model_save_path, "src_tokenizer"))
    tgt_tokenizer.save_pretrained(os.path.join(model_save_path, "tgt_tokenizer"))
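
A minimal sketch of reloading a checkpoint directory written by save_model. `MyModel` is a placeholder for whatever model class produced the checkpoint; the path is hypothetical.

import os

import torch
from transformers import BertTokenizer, GPT2Tokenizer

checkpoint_dir = "output/epoch3_step10000/"  # hypothetical checkpoint directory
model = MyModel.from_pretrained(checkpoint_dir)  # MyModel is a placeholder class
src_tokenizer = BertTokenizer.from_pretrained(os.path.join(checkpoint_dir, "src_tokenizer"))
tgt_tokenizer = GPT2Tokenizer.from_pretrained(os.path.join(checkpoint_dir, "tgt_tokenizer"))
optimizer = torch.load(os.path.join(checkpoint_dir, "optimizer.pt"))  # only written when args.save_optimizer is set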
Example #3
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    with Path('foodbert/data/used_ingredients.json').open() as f:
        used_ingredients = json.load(f)  # Don't separate these
    tokenizer = BertTokenizer(
        vocab_file='foodbert/data/bert-base-cased-vocab.txt',
        do_lower_case=False,
        max_len=128,
        never_split=used_ingredients
    )  # For single-sentence instructions, a longer max length shouldn't be necessary

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling).")

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = (get_dataset(
        data_args, tokenizer=tokenizer, local_rank=training_args.local_rank)
                     if training_args.do_train else None)
    eval_dataset = (get_dataset(data_args,
                                tokenizer=tokenizer,
                                local_rank=training_args.local_rank,
                                evaluate=True)
                    if training_args.do_eval else None)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability)

    # Make sure checkpoint recovery and continued training work on GPU: push all parameters to the GPU first.
    # Solves a bug in Trainer https://github.com/huggingface/transformers/issues/4240
    model.to(training_args.device)
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: BertTokenizer) -> Tuple[int, float]:
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()
    args.train_batch_size = args.per_gpu_batch_size * max(1, args.n_gpu)

    # Pad every sequence in the batch to the same length
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    # create dataloader for training
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)
    # prepare gradient accumulation
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
    # load the model
    model = model.module if hasattr(
        model,
        'module') else model  # take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))
    # Prepare optimizer and schedule(linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    # check if saved optimizer or scheduler state exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, 'optimizer.pt'))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, 'scheduler.pt'))):
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
    # Mixed-precision training
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                'Please install apex from https://www.github.com/nvidia/apex to use fp16 training.'
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    # display log information before training
    logger.info("***** Running training *****")
    logger.info("Num examples = %d", len(train_dataset))
    logger.info("Num Epochs = %d", args.num_train_epochs)
    logger.info("Instantaneous batch size per GPU = %d", args.per_gpu_batch_size)
    logger.info(
        "Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split('-')[-1].split(
                '/')[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)
            logger.info(
                "Continuing training from checkpoint, will skip to saved global step"
            )
            logger.info("Continuing training from epcoh %d", epochs_trained)
            logger.info("Continuing training from global step %d", global_step)
            logger.info("Will skip the first %d step in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info(" Starting fine_tuning")
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc='Epoch',
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc='Iteration',
                              disable=args.local_rank not in [-1, 0])
        if args.local_rank != -1:
            train_sampler.set_epoch(epoch)
        for step, batch in enumerate(epoch_iterator):
            # skip past any already trained step if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            # Mask the input tokens for masked language modeling
            inputs, labels = mask_tokens(batch, tokenizer,
                                         args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, labels=labels)
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                   args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
            if args.local_rank in [
                    -1, 0
            ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                # log metrics
                if args.local_rank == -1 and args.evaluate_during_training:
                    # only evaluate when single GPU otherwise metrics may not average well
                    results = evaluate(args, model, tokenizer)
                    for key, value in results.items():
                        tb_writer.add_scaler("eval_{}".format(key), value,
                                             global_step)
                tb_writer.add_scaler('lr', scheduler.get_lr()[0], global_step)
                tb_writer.add_scaler('loss', (tr_loss - logging_loss) /
                                     args.logging_steps, global_step)
                logging_loss = tr_loss
            if args.local_rank in [
                    -1, 0
            ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                checkpoint_prefix = 'checkpoint'
                # save model checkpoint
                output_dir = os.path.join(
                    args.output_dir, "{}-{}".format(checkpoint_prefix,
                                                    global_step))
                os.makedirs(output_dir, exist_ok=True)
                model_to_save = (model.module
                                 if hasattr(model, "module") else model)
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                logger.info('Saving model checkpoint to %s', output_dir)

                _rotate_checkpoints(args, checkpoint_prefix)

                torch.save(optimizer.state_dict(),
                           os.path.join(output_dir, 'optimizer.pt'))
                torch.save(scheduler.state_dict(),
                           os.path.join(output_dir, 'scheduler.pt'))
                logger.info('Saving optimizer and scheduler states to %s',
                            output_dir)
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
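
The mask_tokens helper used above is not shown in this snippet. Below is a sketch in the spirit of the upstream run_language_modeling example (80% [MASK], 10% random token, 10% unchanged); the -100 ignore index and the exact probabilities are assumptions about the transformers version in use.

import torch

def mask_tokens(inputs: torch.Tensor, tokenizer, args):
    """Prepare masked-LM inputs/labels: mask args.mlm_probability of the non-special tokens."""
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(
        torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # only compute loss on masked tokens

    # 80% of the time, replace masked input tokens with [MASK]
    indices_replaced = torch.bernoulli(
        torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, replace with a random token; the remaining 10% keep the original token
    indices_random = torch.bernoulli(
        torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]
    return inputs, labels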
Example #5
class BuildCustomTransformersVocabulary(object):
    def __init__(self,
                 base_vocab_path='./vocab_small.txt',
                 additional_special_tokens={
                     'additional_special_tokens':
                     ['<num>', '<img>', '<url>', '#E-s', '|||']
                 }):
        self.tokenizer = BertTokenizer(vocab_file=base_vocab_path,
                                       do_lower_case=False,
                                       do_basic_tokenize=True)
        self.tokenizer.add_special_tokens(additional_special_tokens)
        self.no_vocab_tokens = set()

    def get_no_vocab_token(self, text, unk_token='[UNK]', other_split=False):
        """ tokens compare
        @param text:
        @param unk_token:
        @param other_split:  原始拆分出来single token txt, bert tokenizer拆分之后依然拆解为多个token, 是否增加词汇
        @return:
        """
        # text_tokens = self.tokenizer.tokenize(text)  # bert tokenizer根据词汇表处理之后切分出来的token(包含unk)
        origin_tokens = self.tokenize(text)  # 切词之后结果, 不在词汇表中的词没有转为unk

        # # The first approach cannot guarantee a one-to-one mapping: some split-out characters are split again on re-tokenization
        # assert len(text_tokens) == len(origin_tokens)
        for idx, token in enumerate(origin_tokens):
            # Convert with the transformers tokenizer against the base vocabulary
            bert_token = self.tokenizer.tokenize(token)
            # if token != origin_tokens[idx]:
            #     # add the unknown token to the vocabulary
            #     self.no_vocab_tokens.append(origin_tokens[idx])
            if len(bert_token) == 1 and bert_token[0] == unk_token:
                self.no_vocab_tokens.add(token)  # the set handles deduplication
            if other_split and len(bert_token) > 1:
                # A single token is split into several pieces by the BERT tokenizer although it should stay whole
                self.no_vocab_tokens.add(token)

    def _tokenize(self, text):
        """将text拆分为 token list"""
        tokens_list = self.tokenizer.basic_tokenizer.tokenize(
            text, never_split=self.tokenizer.all_special_tokens)

        return tokens_list

    def tokenize(self, text: str, **kwargs):
        """ Converts a string in a sequence of tokens (string), using the tokenizer.
            Split in words for word-based vocabulary or sub-words for sub-word-based
            vocabularies (BPE/SentencePieces/WordPieces).

            Take care of added tokens.

            Args:
                text (:obj:`string`): The sequence to be encoded.
                **kwargs (:obj: `dict`): Arguments passed to the model-specific `prepare_for_tokenization` preprocessing method.
        """
        all_special_tokens = self.tokenizer.all_special_tokens
        text = self.tokenizer.prepare_for_tokenization(text, **kwargs)

        # TODO: should this be in the base class?
        def lowercase_text(t):
            # convert non-special tokens to lowercase
            escaped_special_toks = [
                re.escape(s_tok) for s_tok in all_special_tokens
            ]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            return re.sub(pattern,
                          lambda m: m.groups()[0] or m.groups()[1].lower(), t)

        if self.tokenizer.init_kwargs.get("do_lower_case", False):
            text = lowercase_text(text)

        def split_on_token(tok, text):
            result = []
            split_text = text.split(tok)
            for i, sub_text in enumerate(split_text):
                sub_text = sub_text.rstrip()
                if i == 0 and not sub_text:
                    result += [tok]
                elif i == len(split_text) - 1:
                    if sub_text:
                        result += [sub_text]
                    else:
                        pass
                else:
                    if sub_text:
                        result += [sub_text]
                    result += [tok]
            return result

        def split_on_tokens(tok_list, text):
            if not text.strip():
                return []
            if not tok_list:
                return self._tokenize(text)

            tokenized_text = []
            text_list = [text]
            for tok in tok_list:
                tokenized_text = []
                for sub_text in text_list:
                    if sub_text not in self.tokenizer.unique_added_tokens_encoder:
                        tokenized_text += split_on_token(tok, sub_text)
                    else:
                        tokenized_text += [sub_text]
                text_list = tokenized_text

            return list(
                itertools.chain.from_iterable(
                    (self._tokenize(token)
                     if token not in self.tokenizer.unique_added_tokens_encoder
                     else [token] for token in tokenized_text)))

        added_tokens = self.tokenizer.unique_added_tokens_encoder
        tokenized_text = split_on_tokens(added_tokens, text)
        return tokenized_text

    def update_vocab(self, new_vocab_tokens: list):
        """ 更新原有基础词汇表
        @param new_vocab_tokens:
        @return:
        """
        add_token_num = self.tokenizer.add_tokens(new_vocab_tokens)

        return add_token_num

    def custom_save_vocabulary(self, new_vocab_path):
        """保存新的词汇表"""
        if os.path.exists(new_vocab_path):
            os.remove(new_vocab_path)

        index = 0
        with open(new_vocab_path, mode='w', encoding='utf-8') as writer:
            for token, token_index in sorted(self.tokenizer.vocab.items(),
                                             key=lambda kv: kv[1]):
                if index != token_index:
                    print(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".
                        format(new_vocab_path))
                    index = token_index
                writer.write(token + "\n")
                index += 1

            # Append the newly added tokens to the vocabulary file
            add_tokens_vocab = OrderedDict(self.tokenizer.added_tokens_encoder)
            for token, token_index in sorted(add_tokens_vocab.items(),
                                             key=lambda kv: kv[1]):
                if index != token_index:
                    print(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".
                        format(new_vocab_path))
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return new_vocab_path

    def save_vocab_pretrained(self, vocab_pretrained_path):
        """保存词表预训练全部内容"""
        if not os.path.exists(vocab_pretrained_path):
            # 路径不存在, 创建路径
            os.makedirs(vocab_pretrained_path)
        all_file = self.tokenizer.save_pretrained(
            vocab_pretrained_path)  # writes all tokenizer files
        # model.resize_token_embeddings(len(tokenizer)) -> resize the embedding matrix, since the vocabulary size has changed
        # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary,
        # i.e. the length of the tokenizer.
        return all_file
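
A short usage sketch for the class above; the file paths and the sample texts are placeholders, not taken from the snippet.

builder = BuildCustomTransformersVocabulary(base_vocab_path='./vocab_small.txt')
for text in ["some training sentence", "another <num> sentence"]:  # placeholder corpus
    builder.get_no_vocab_token(text, other_split=True)
added = builder.update_vocab(list(builder.no_vocab_tokens))
print("added {} tokens".format(added))
builder.custom_save_vocabulary('./vocab_extended.txt')  # hypothetical output path
builder.save_vocab_pretrained('./tokenizer_extended')   # hypothetical output directory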
Example #6
    print(i)
    print(s)

tokenizer = BertTokenizer("data/atis/token.vocab",
                          bos_token="<BOS>",
                          eos_token="<EOS>",
                          model_max_len=50)
tokenizer.prepare_for_model(tokenizer.encode(y), return_tensors="pt")

tokenizer.SPECIAL_TOKENS_ATTRIBUTES
tokenizer.encode(y)
tokenizer.encode_plus(y)
y = "<BOS> embedding what is the flight number <EOS>"
ids = tokenizer.encode_plus
tokenizer.decode(tokenizer.encode(y))
tokenizer.save_pretrained("data/atis/save")
tokenizer.save_vocabulary("data/atis/save/saved")

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                          bos_token="<BOS>",
                                          eos_token="<EOS>")
tokenizer.tokenize("i like tea")
special_tokens = {"bos_token": "<BOS>", "eos_token": "<EOS>"}
tokenizer.add_special_tokens(special_tokens)

tokenizer.bos_token_id
tokenizer.eos_token_id
tokenizer.all_special_ids

tokenizer.special_tokens_map
tokenizer.additional_special_tokens
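
Adding special tokens this way grows the vocabulary, so the model's embedding matrix must be resized to match; a minimal sketch (the model name is only an example):

from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_special_tokens({"bos_token": "<BOS>", "eos_token": "<EOS>"})
model = BertModel.from_pretrained("bert-base-uncased")
model.resize_token_embeddings(len(tokenizer))  # account for the newly added ids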
Example #7
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name"
        )

    if data_args.additional_tokens_file:
        with open(data_args.additional_tokens_file, "r") as infile:
            additional_tokens = [l.strip() for l in infile]
        #tokenizer.add_tokens(additional_tokens)
        tokenizer = BertTokenizer(data_args.additional_tokens_file, do_basic_tokenize=False)
    
    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets

    train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
        )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
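
Neither main() above is shown with an entry point; as in the upstream run_language_modeling script it would normally be launched through a standard guard. The flags in the comment below are illustrative, not taken from the snippet.

if __name__ == "__main__":
    # Example invocation (hypothetical paths/flags):
    #   python run_language_modeling.py --output_dir ./out --model_name_or_path bert-base-cased \
    #       --do_train --train_data_file train.txt --do_eval --eval_data_file eval.txt --mlm
    main()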
Example #8
    vectorize = lambda x: vectorize_with_bert(" ".join(preprocess_string(x)),
                                              model, tokenizer, "sum", 1)
    get_similarity = lambda s1, s2: 1 - cosine(vectorize(s1), vectorize(s2))
    df[model_name] = df.apply(
        lambda x: get_similarity(x["concept_1"], x["concept_2"]), axis=1)
    y_true = list(df["class"].values)
    y_prob = list(df[model_name].values)
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    f_beta = lambda pr, re, beta: [((1 + beta**2) * p * r) /
                                   ((beta**2) * p + r)
                                   for p, r in zip(pr, re)]
    f_1_scores = f_beta(precision, recall, beta=1)
    f_1_max = np.nanmax(f_1_scores)
    rows.append((model_name, epoch_i, loss.item(), f_1_max))

# Writing results of the validation to those files.
df.to_csv(output_path_for_results, index=False)
header = ["model", "epoch", "training_loss", "f1_max"]
pd.DataFrame(rows, columns=header).to_csv(output_path_for_results_summary,
                                          index=False)

# In[44]:

output_dir = "../models/bert_small/model_save_{}/".format(
    datetime.datetime.now().strftime('%m_%d_%Y_h%Hm%Ms%S'))
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
Example #9
            acc = ((y_pred_label == label.view(-1)).sum()).item()
            epoch_loss += loss.item()
            epoch_acc += acc
    return epoch_loss / len(iterator), epoch_acc / len(
        iterator.dataset.dataset)


for i in range(epochs):
    train_loss, train_acc = train(model, sentiment_train_loader, optimizer,
                                  criterion, device)
    valid_loss, valid_acc = evaluate(model, sentiment_valid_loader, criterion,
                                     device)
    print("\n")
    print("train loss: ", train_loss, "\t", "train acc:", train_acc)
    print("valid loss: ",
          valid_loss,
          "\t",
          "valid acc:",
          valid_acc,
          end="\n\n")

# Save the model and tokenizer
import os

saved_model = "./saved_model"
saved_tokenizer = "./saved_tokenizer"
os.makedirs(saved_model, exist_ok=True)
os.makedirs(saved_tokenizer, exist_ok=True)
model.save_pretrained(saved_model)
tokenizer.save_pretrained(saved_tokenizer)
Example #10
def train(args, train_dataset, model: BertForMlmWithClassification,
          tokenizer: BertTokenizer) -> Tuple[int, float]:
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(data: List[torch.Tensor]):
        sentences, labels = list(zip(*data))
        if tokenizer._pad_token is None:
            return pad_sequence(sentences, batch_first=True)
        return (
            pad_sequence(sentences,
                         batch_first=True,
                         padding_value=tokenizer.pad_token_id),
            torch.tensor(labels),
        )

    train_sampler = (RandomSampler(train_dataset) if args.local_rank == -1 else
                     DistributedSampler(train_dataset))
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = (
            args.max_steps //
            (len(train_dataloader) // args.gradient_accumulation_steps) + 1)
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    # Take care of distributed/parallel training
    model_to_resize = model.module if hasattr(model, "module") else model
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            batch, class_labels = batch

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, mask_labels = (mask_tokens(batch, tokenizer, args)
                                   if args.mlm else (batch, batch))
            inputs = inputs.to(args.device)
            mask_labels = mask_labels.to(args.device) if args.mlm else None
            class_labels = class_labels.to(args.device)
            model.train()
            outputs = model(input_ids=inputs,
                            masked_lm_labels=mask_labels,
                            class_labels=class_labels)

            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if (args.local_rank in [-1, 0] and args.logging_steps > 0
                        and global_step % args.logging_steps == 0):
                    # Log metrics
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if (args.local_rank in [-1, 0] and args.save_steps > 0
                        and global_step % args.save_steps == 0):
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
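
To make the collate contract above explicit, a tiny stand-alone sketch of what it returns for a labelled batch; the token ids and the padding id of 0 are dummy values.

import torch
from torch.nn.utils.rnn import pad_sequence

data = [(torch.tensor([101, 7592, 102]), 1),
        (torch.tensor([101, 7592, 2088, 999, 102]), 0)]
sentences, labels = list(zip(*data))
batch = pad_sequence(sentences, batch_first=True, padding_value=0)  # 0 stands in for tokenizer.pad_token_id
labels = torch.tensor(labels)
print(batch.shape, labels)  # torch.Size([2, 5]) tensor([1, 0])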
Example #11
def main():

    # gpu profile
    n_gpu, device = gpu_profile(args)
    # set random seed
    seed_set(args.seed, n_gpu)

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    if not os.path.exists(args.output_dir):
        # raise ValueError("Output directory () already exists and is not empty.")
        os.makedirs(args.output_dir, exist_ok=True)
    feature_dir = '../data/rank/' + args.model_type  # + '_large'
    if 'addBad' in args.model_info:
        feature_dir = feature_dir + '_large'

    if not os.path.exists(feature_dir):
        # raise ValueError("Output directory () already exists and is not empty.")
        os.makedirs(feature_dir, exist_ok=True)

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    config = config_class.from_pretrained(args.model_name_or_path)
    vocab_file = os.path.join(args.model_name_or_path, 'vocab.txt')
    logger.info("loading the vocab file from {}".format(vocab_file))
    tokenizer = BertTokenizer(vocab_file=vocab_file,
                              do_lower_case=args.do_lower_case)

    # tokenizer = tokenizer_class.from_pretrained(
    #     vocab_file,
    #     do_lower_case=args.do_lower_case
    # )

    model = model_class.from_pretrained(args.model_name_or_path, config=config)

    if args.our_pretrain_model != '':
        model = load_state_dict(model, args.our_pretrain_model)

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    # bert_config = BertConfig.from_json_file(args.bert_config_file)
    #

    data_processor = Data_processor(tokenizer, args.policies_file,
                                    args.max_seq_length, args.max_query_length)

    if args.do_train:
        train_writer = prepare_summary_writer(args, mode='train')

        # get train features
        train_examples_file = "../data/rank/train_examples.pkl"
        train_features_file = os.path.join(
            feature_dir,
            'train_features_{0}_{1}.pkl'.format(str(args.max_seq_length),
                                                str(args.doc_stride)))
        train_examples = data_processor.get_train_examples(
            args.train_file, train_examples_file)

        valid_examples = None
        valid_features = None
        valid_dataloader = None
        if args.do_valid:
            valid_writer = prepare_summary_writer(args, mode='valid')
            valid_examples = train_examples[4000:]
            train_examples = train_examples[:4000]
        else:
            valid_writer = None

        logger.info("train examples {}".format(len(train_examples)))
        if valid_examples is not None:
            logger.info("valid examples {}".format(len(valid_examples)))

        train_features = data_processor.get_train_features(
            train_examples, train_features_file, args.doc_stride)
        train_dataloader = data_processor.prepare_train_dataloader(
            train_features, args.train_batch_size, args.local_rank)

        num_train_steps = int(
            len(train_features) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

        logger.info("***** Running training *****")
        logger.info("  Num train_features = %d", len(train_features))
        if args.do_valid:
            valid_features_file = os.path.join(
                feature_dir,
                'valid_features_{0}_{1}.pkl'.format(str(args.max_seq_length),
                                                    str(args.doc_stride)))
            valid_features = data_processor.get_valid_features(
                valid_examples, valid_features_file, args.doc_stride)
            logger.info("  Num valid_features = %d", len(valid_features))
            valid_dataloader = data_processor.prepare_train_dataloader(
                valid_features, args.train_batch_size, args.local_rank)
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        del train_examples
        del train_features

        # Prepare model
        # trained_model_file = os.path.join(args.output_dir,
        #                                   "pytorch_model_bert_{}.bin".format(args.max_seq_length))
        # model = prepare_model(args, bert_config, device, n_gpu, trained_model_file)
        # training model

        model = train(args,
                      model,
                      train_dataloader,
                      device,
                      num_train_steps,
                      valid_examples=valid_examples,
                      valid_features=valid_features,
                      valid_dataloader=valid_dataloader,
                      n_gpu=n_gpu,
                      train_writer=train_writer,
                      valid_writer=valid_writer,
                      tokenizer=tokenizer)
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(model_save_path)
        tokenizer.save_pretrained(model_save_path)
        save_config_file()

    if args.do_predict:
        del model
        #trained_model_file = os.path.join(args.output_dir, args.model_type + '_' + args.model_info, "2020-03-22@11_57_57")
        # trained_model_file = os.path.join(args.output_dir + '/' + args.model_type)

        trained_model_file = args.predict_model_path

        model = model_class.from_pretrained(trained_model_file, config=config)

        # prepare predict dataloader
        # pred_features_file = os.path.join(feature_dir,
        #                                    'pred_features_{0}_{1}.pkl'.format(str(args.max_seq_length), str(args.doc_stride)))

        all_results = {}
        predict_file_name = args.predict_file.split('/')[-1].split('.')[0]
        cache_dir = os.path.join('/'.join(args.output_dir.split('/')[:-1]),
                                 'predict_cache_{}'.format(predict_file_name))
        if args.do_bm25:
            cache_dir = cache_dir + '_do_bm25' + '_{}'.format(args.topRate)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)

        # build cache xc test
        # for pred_dataloader, pred_features, pred_examples in data_processor.prepare_pred_dataloader(
        #         args.predict_file, pred_features_file, args.predict_batch_size, args.doc_stride, cache_dir=cache_dir):
        #     pass
        # print('build cache successfully')

        if args.do_bm25:
            data_processor = Data_processor_bm25(tokenizer, args.policies_file,
                                                 args.max_seq_length,
                                                 args.max_query_length)

        for pred_dataloader, pred_features, pred_examples in data_processor.prepare_pred_dataloader(
                args.predict_file,
                args.predict_batch_size,
                args.doc_stride,
                cache_dir=cache_dir,
                topRate=args.topRate):
            # predicting
            results = predict(args, model, pred_dataloader, device, n_gpu)
            output_prediction_file = os.path.join(
                '/'.join(args.output_dir.split('/')[:-1]), "predictions.json")
            # output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
            # output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
            results = write_predictions(
                args,
                pred_examples,
                pred_features,
                results,
                n_best_size=10,
                output_prediction_file=output_prediction_file,
                tokenizer=tokenizer)
            all_results.update(results)

        submit_file_name = '_'.join([
            args.model_type, args.model_info, predict_file_name,
            trained_model_file.split('/')[-1]
        ])
        if args.do_bm25:
            submit_file_name = submit_file_name + '_bm25'
        # generate submit file
        submit_file = os.path.join('/'.join(args.output_dir.split('/')[:-1]),
                                   submit_file_name + '.csv')
        gen_submit_csv(all_results, submit_file)
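
gen_submit_csv is not shown in this snippet; a minimal sketch, under the assumption that all_results maps an example id to its predicted answer string.

import csv

def gen_submit_csv(all_results: dict, submit_file: str):
    """Write one (id, prediction) row per example (assumed submission format)."""
    with open(submit_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for qid, prediction in all_results.items():
            writer.writerow([qid, prediction])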