Example #1
    def __init__(self, hparams):
        """
        input:
            hparams: namespace with the following items:
                'data_dir' (str): Data Directory. default: './official/ebm_nlp_1_00'
                'bioelmo_dir' (str): BioELMo Directory. default: './models/bioelmo'
                'biobert_dir' (str): BioBERT Directory (read below to locate the TF checkpoint)
                'max_length' (int): Max Length. default: 1024
                'lr' (float): Learning Rate. default: 1e-2
                'fine_tune_bioelmo' (bool): Whether to Fine Tune BioELMo. default: False
                'lr_bioelmo' (float): Learning Rate in BioELMo Fine-tuning. default: 1e-4
        """
        super().__init__(hparams)

        # Load Pretrained BioBERT
        DIR_BERT = Path(str(self.hparams.biobert_dir))
        BERT_CKPT_PATH = os.path.splitext(glob(str(DIR_BERT / '*ckpt*'))[0])[0]

        self.bertconfig = BertConfig.from_pretrained('bert-base-cased')
        self.berttokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.biobert_for_pretraining = BertForPreTraining.from_pretrained(
            'bert-base-cased')
        self.biobert_for_pretraining.load_tf_weights(self.bertconfig,
                                                     BERT_CKPT_PATH)
        self.biobert = self.biobert_for_pretraining.bert
        self.biobert_pad_token = self.berttokenizer.pad_token
        self.biobert_output_dim = self.bertconfig.hidden_size

        # Initialize Intermediate Affine Layer
        self.hidden_to_tag = nn.Linear(int(self.biobert_output_dim),
                                       len(self.itol))
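
Not part of the original class: a hedged forward-pass sketch showing how the encoder and affine layer initialised above are typically combined (the method name and batch tensors are assumptions).

    def _forward_sketch(self, input_ids, attention_mask):
        # (batch, seq_len, hidden_size) sequence output from BioBERT
        hidden = self.biobert(input_ids=input_ids,
                              attention_mask=attention_mask)[0]
        # project each token onto the tag space defined by self.itol
        return self.hidden_to_tag(hidden)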
Example #2
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_architecture(args)

        if not hasattr(args, 'max_source_positions'):
            args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
        if not hasattr(args, 'max_target_positions'):
            args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS

        src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
        if len(task.datasets) > 0:
            src_berttokenizer = next(iter(task.datasets.values())).berttokenizer
        else:
            src_berttokenizer = BertTokenizer.from_pretrained(args.bert_model_name)

        def build_embedding(dictionary, embed_dim, path=None):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            emb = Embedding(num_embeddings, embed_dim, padding_idx)
            # if provided, load from preloaded dictionaries
            if path:
                embed_dict = utils.parse_embedding(path)
                utils.load_embedding(embed_dict, dictionary, emb)
            return emb

        if args.share_all_embeddings:
            if src_dict != tgt_dict:
                raise ValueError('--share-all-embeddings requires a joined dictionary')
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    '--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim')
            if args.decoder_embed_path and (
                    args.decoder_embed_path != args.encoder_embed_path):
                raise ValueError('--share-all-embeddings not compatible with --decoder-embed-path')
            encoder_embed_tokens = build_embedding(
                src_dict, args.encoder_embed_dim, args.encoder_embed_path
            )
            decoder_embed_tokens = encoder_embed_tokens
            args.share_decoder_input_output_embed = True
        else:
            encoder_embed_tokens = build_embedding(
                src_dict, args.encoder_embed_dim, args.encoder_embed_path
            )
            decoder_embed_tokens = build_embedding(
                tgt_dict, args.decoder_embed_dim, args.decoder_embed_path
            )
        from_tf = getattr(args, 'from_tf', False)
        bertencoder = BertForPreTraining.from_pretrained(args.bert_model_name, from_tf=from_tf,
                                                         output_hidden_states=True).bert
        args.bert_out_dim = bertencoder.config.hidden_size
        args.bert_num_layers = bertencoder.config.num_hidden_layers
        encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens)
        decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)

        return BertTransformerModel(encoder, decoder, bertencoder, src_berttokenizer, args.mask_cls_sep, args)
Example #3
 def __init__(self, BERT_PATH):
     self.config = BertConfig.from_json_file(BERT_PATH +
                                             "/bert_config.json")
     self.model = BertForPreTraining.from_pretrained(BERT_PATH +
                                                     "/bert_model.ckpt",
                                                     from_tf=True,
                                                     config=self.config)
     self.tokenizer = BertTokenizer(BERT_PATH + "/vocab.txt")
     self.model.eval()
     self.model.cuda(args.gpu_id)
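
A hedged usage sketch for the wrapper above; the class name PretrainedBert and the checkpoint path are placeholders, not taken from the original snippet.

wrapper = PretrainedBert("/path/to/bert_checkpoint_dir")
tokens = wrapper.tokenizer("an example sentence", return_tensors="pt").to("cuda:%d" % args.gpu_id)
with torch.no_grad():
    outputs = wrapper.model(**tokens)   # MLM prediction logits + NSP logits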
Example #4
def load_pretrained_bert(config: BertConfig, model_path: str):
    if model_path.endswith(".index"):
        bert_model = BertForPreTraining.from_pretrained(model_path,
                                                        config=config,
                                                        from_tf=True).bert
    elif model_path.endswith(".pth"):
        bert_model = BertModel.from_pretrained(model_path, config=config)
    else:
        raise ValueError(f"Wrong model path ({model_path})")
    return bert_model
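
A minimal call sketch for load_pretrained_bert; both paths below are hypothetical placeholders.

config = BertConfig.from_json_file("./bert_base/bert_config.json")
tf_bert = load_pretrained_bert(config, "./bert_base/bert_model.ckpt.index")  # TensorFlow checkpoint
pt_bert = load_pretrained_bert(config, "./bert_base/pytorch_model.pth")      # PyTorch weights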
Example #5
 def __init__(self, number_of_classes=16):
     super(BERTClass, self).__init__()
     # self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
     # self.reset_weight(self.bert)
     self.bert = BertForPreTraining.from_pretrained('bert-base-uncased',
                                                    return_dict=False)
     self.bert.cls.seq_relationship = torch.nn.Sequential(
         torch.nn.Dropout(0.3), torch.nn.Linear(768, 512),
         torch.nn.Linear(512, 256), torch.nn.ReLU(),
         torch.nn.Linear(256, number_of_classes), torch.nn.Softmax(dim=-1))  # dim given explicitly to avoid the implicit-dim deprecation warning
Example #6
 def create_baseline_bert_model(self):
     model = BertForPreTraining.from_pretrained(
         self.model_weight_name,
         num_labels=self.num_labels,
         output_attentions=
         False,  # Whether the model returns attentions weights.
         output_hidden_states=
         False,  # Whether the model returns all hidden-states.
     )
     return model
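
A caveat on the snippet above: BertForPreTraining only carries the MLM and NSP heads, so num_labels is stored on the config but not used by this class; a classification head of that size would come from BertForSequenceClassification instead. A hedged sanity check (the model name is an assumption):

model = BertForPreTraining.from_pretrained("bert-base-uncased",
                                            output_attentions=False,
                                            output_hidden_states=False)
assert model.config.output_hidden_states is False   # the kwargs above land on the config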
Example #7
def get_bert_save_dict():
    import os

    state_path = 'data/bert-large.pt'

    if os.path.exists(state_path):
        state = torch.load(state_path)
    else:
        model = BertForPreTraining.from_pretrained(globals.bert_model)
        state = model.state_dict()
        # cache state
        torch.save(state, state_path)
    return state
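
A hedged sketch of the reload path this cache enables, assuming globals.bert_model names a standard BERT checkpoint whose config is cheap to fetch.

config = BertConfig.from_pretrained(globals.bert_model)
model = BertForPreTraining(config)            # randomly initialised skeleton of the right shape
model.load_state_dict(get_bert_save_dict())   # restored from data/bert-large.pt when cached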
Example #8
    def __init__(self, args, base_model_name='bert-base-uncased'):
        super(DialogBERT, self).__init__()

        if args.language == 'chinese': base_model_name = 'bert-base-chinese'

        self.tokenizer = BertTokenizer.from_pretrained(base_model_name,
                                                       cache_dir='./cache/')
        if args.model_size == 'tiny':
            self.encoder_config = BertConfig(vocab_size=30522,
                                             hidden_size=256,
                                             num_hidden_layers=6,
                                             num_attention_heads=2,
                                             intermediate_size=1024)
            self.utt_encoder = BertForPreTraining(self.encoder_config)
        elif args.model_size == 'small':
            self.encoder_config = BertConfig(vocab_size=30522,
                                             hidden_size=512,
                                             num_hidden_layers=8,
                                             num_attention_heads=4,
                                             intermediate_size=2048)
            self.utt_encoder = BertForPreTraining(self.encoder_config)
        else:
            self.encoder_config = BertConfig.from_pretrained(
                base_model_name, cache_dir='./cache/')
            self.utt_encoder = BertForPreTraining.from_pretrained(
                base_model_name,
                config=self.encoder_config,
                cache_dir='./cache/')

        self.context_encoder = BertModel(
            self.encoder_config)  # context encoder: encode context to vector

        self.mlm_mode = 'mse'  # 'mdn', 'mse'
        if self.mlm_mode == 'mdn':
            self.context_mlm_trans = MixtureDensityNetwork(
                self.encoder_config.hidden_size,
                self.encoder_config.hidden_size, 3)
        else:
            self.context_mlm_trans = BertPredictionHeadTransform(
                self.encoder_config
            )  # transform context hidden states back to utterance encodings

        self.dropout = nn.Dropout(self.encoder_config.hidden_dropout_prob)
        self.context_order_trans = SelfSorting(self.encoder_config.hidden_size)
        #       self.context_order_trans = MLP(self.encoder_config.hidden_size, '200-200-200', 1)

        self.decoder_config = deepcopy(self.encoder_config)
        self.decoder_config.is_decoder = True
        self.decoder_config.add_cross_attention = True
        self.decoder = BertLMHeadModel(self.decoder_config)
Example #9
    def _load_google_checkpoint(self):
        logger.info('Loading Checkpoint from Google for Pre training')

        download_and_extract(self.google_checkpoint_location, './')

        checkpoint_dir = os.path.join('./', self.google_checkpoint_root)
        config_location = os.path.join(checkpoint_dir, 'bert_config.json')
        index_location = os.path.join(checkpoint_dir, 'bert_model.ckpt.index')

        logger.info(
            f'Config file: {config_location}. Index file: {index_location}')

        config = BertConfig.from_json_file(config_location)
        self.bert = BertForPreTraining.from_pretrained(index_location,
                                                       config=config,
                                                       from_tf=True)
Example #10
 def from_pretrained(self, model_dir):
     self.encoder_config = BertConfig.from_pretrained(model_dir)
     self.tokenizer = BertTokenizer.from_pretrained(
         path.join(model_dir, 'tokenizer'),
         do_lower_case=args.do_lower_case)
     self.utt_encoder = BertForPreTraining.from_pretrained(
         path.join(model_dir, 'utt_encoder'))
     self.context_encoder = BertForSequenceClassification.from_pretrained(
         path.join(model_dir, 'context_encoder'))
     self.context_mlm_trans = BertPredictionHeadTransform(
         self.encoder_config)
     self.context_mlm_trans.load_state_dict(
         torch.load(path.join(model_dir, 'context_mlm_trans.pkl')))
     self.context_order_trans = SelfSorting(self.encoder_config.hidden_size)
     self.context_order_trans.load_state_dict(
         torch.load(path.join(model_dir, 'context_order_trans.pkl')))
     self.decoder_config = BertConfig.from_pretrained(model_dir)
     self.decoder = BertLMHeadModel.from_pretrained(
         path.join(model_dir, 'decoder'))
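
Not from the original repository: a hedged counterpart showing one way a save routine could write out the directory layout that the loader above expects.

 def save_pretrained(self, model_dir):
     # illustrative only; mirrors the paths read in from_pretrained() above
     self.encoder_config.save_pretrained(model_dir)
     self.tokenizer.save_pretrained(path.join(model_dir, 'tokenizer'))
     self.utt_encoder.save_pretrained(path.join(model_dir, 'utt_encoder'))
     self.context_encoder.save_pretrained(path.join(model_dir, 'context_encoder'))
     torch.save(self.context_mlm_trans.state_dict(),
                path.join(model_dir, 'context_mlm_trans.pkl'))
     torch.save(self.context_order_trans.state_dict(),
                path.join(model_dir, 'context_order_trans.pkl'))
     self.decoder.save_pretrained(path.join(model_dir, 'decoder'))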
Example #11
def test():
    bert_model_path = '../checkpoints/bert-base-chinese/'  # pytorch_model.bin
    bert_config_path = '../checkpoints/bert-base-chinese/'  # bert_config.json
    vocab_path = '../checkpoints/bert-base-chinese/vocab.txt'

    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    # model = BertModel.from_pretrained(bert_model_path, config=bert_config_path)
    model = BertForPreTraining.from_pretrained(bert_model_path,
                                               config=bert_config_path)

    text_batch = ["哈哈哈", "嘿嘿嘿", "嘿嘿嘿", "嘿嘿嘿"]
    encoding = tokenizer(text_batch,
                         return_tensors='pt',
                         padding=True,
                         truncation=True)
    input_ids = encoding['input_ids']
    print(input_ids)
    print(input_ids.shape)
    output1, output2 = model(input_ids)
    print(output1)
    print(output2)
    print(output1.shape)
    print(output2.shape)
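
A version note on the unpacking above: with transformers v4 and later the forward pass returns a BertForPreTrainingOutput by default, so the two-tuple form only works on older releases (or with return_dict=False). A hedged equivalent for newer versions:

    outputs = model(input_ids)
    print(outputs.prediction_logits.shape)        # (batch, seq_len, vocab_size)
    print(outputs.seq_relationship_logits.shape)  # (batch, 2)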
Example #12
    def __init__(self,
                 pretrained_model,
                 tokenizer_name_or_path: str,
                 data_dir: str,
                 batch_size: int,
                 max_train_examples: int = None,
                 max_eval_examples: int = None,
                 train_strategy='train-all-lexical') -> None:
        super(LexicalTrainingModel, self).__init__()

        self.save_hyperparameters()

        if pretrained_model.startswith('google-checkpoint'):
            self._load_google_checkpoint()
        else:
            self.bert = BertForPreTraining.from_pretrained(pretrained_model)

        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name_or_path)

        self.__setup_lexical_for_training()

        self.train_dataset = None
        self.eval_dataset = None
        self.test_dataset = None
Example #13
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    cudnn.deterministic = True

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model 'bert'")
    model = BertForPreTraining.from_pretrained('bert-base-uncased',
                                               return_dict=True)

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            #  args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = BertPretrainingCriterion(vocab_size)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    args.max_predictions_per_seq = 80

    # Data loading code
    traindir = os.path.join(args.data)
    epoch = 0
    training_steps = 0

    writer = None
    enable_tensorboard = args.rank <= 0
    if enable_tensorboard:
        if args.rank == -1:
            # No DDP:
            writer = SummaryWriter(comment='_bert_no_ddp_' + args.data)
        else:
            writer = SummaryWriter(comment='_bert_' + args.dist_backend + '_' +
                                   str(args.world_size) + 'GPUs_' + args.data)

    train_raw_start = time.time()
    while True:
        batch_time = AverageMeter('Time', ':6.3f')
        data_time = AverageMeter('Data', ':6.3f')
        example_speed = AverageMeter('Speed', ':6.3f')
        losses = AverageMeter('Loss', ':.4e')

        files = [
            os.path.join(traindir, f) for f in os.listdir(traindir)
            if os.path.isfile(os.path.join(traindir, f)) and 'training' in f
        ]
        files.sort()
        num_files = len(files)
        random.Random(args.seed + epoch).shuffle(files)
        f_start_id = 0
        if torch.distributed.is_initialized() and get_world_size() > num_files:
            remainder = get_world_size() % num_files
            data_file = files[(f_start_id * get_world_size() + get_rank() +
                               remainder * f_start_id) % num_files]
        else:
            data_file = files[(f_start_id * get_world_size() + get_rank()) %
                              num_files]

        previous_file = data_file
        train_data = pretraining_dataset(data_file,
                                         args.max_predictions_per_seq)
        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_data, shuffle=False)
        else:
            train_sampler = torch.utils.data.RandomSampler(train_data)

        train_dataloader = torch.utils.data.DataLoader(
            train_data,
            sampler=train_sampler,
            batch_size=args.batch_size,
            num_workers=4,
            pin_memory=True)

        pool = ProcessPoolExecutor(1)
        shared_file_list = {}

        for f_id in range(f_start_id + 1, len(files)):
            if get_world_size() > num_files:
                data_file = files[(f_id * get_world_size() + get_rank() +
                                   remainder * f_id) % num_files]
            else:
                data_file = files[(f_id * get_world_size() + get_rank()) %
                                  num_files]

            previous_file = data_file
            dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                         args.max_predictions_per_seq,
                                         shared_file_list, args)
            train_iter = train_dataloader
            end = time.time()
            progress = ProgressMeter(
                len(train_iter),
                [batch_time, data_time, example_speed, losses],
                prefix="Epoch: [{}]".format(epoch))

            for step, batch in enumerate(train_iter):
                training_steps += 1
                batch = [t.to(args.gpu) for t in batch]
                input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                outputs = model(input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask)
                prediction_scores = outputs.prediction_logits
                seq_relationship_score = outputs.seq_relationship_logits
                loss = criterion(prediction_scores, seq_relationship_score,
                                 masked_lm_labels, next_sentence_labels)
                losses.update(loss.item())

                # compute gradient and do SGD step
                # optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                for param in model.parameters():
                    param.grad = None

                # measure elapsed time
                elapsed_time = time.time() - end
                batch_time.update(elapsed_time)
                end = time.time()
                speed = len(batch[0]) / elapsed_time
                example_speed.update(speed)
                global global_steps
                global global_examples

                global_examples += len(batch[0])
                global_steps += 1

                if step % args.print_freq == 0:
                    progress.display(step)
                    if writer is not None:
                        writer.add_scalar('loss/step', loss.item(),
                                          global_steps)
                        writer.add_scalar('speed/step', speed, global_steps)

                if global_steps >= (args.max_step / abs(args.world_size)):
                    break

            if global_steps >= (args.max_step / abs(args.world_size)):
                break

            del train_dataloader
            train_dataloader, data_file = dataset_future.result(timeout=None)

        now = time.time()
        print('Global Steps: ' + str(global_steps))
        print('Total Examples: ' + str(global_examples))
        print('Train duration: ' + str(now - train_raw_start))
        print('Example/Sec: ' + str(global_examples / (now - train_raw_start)))
        epoch += 1
        if epoch >= args.epochs:
            break

    if writer is not None:
        writer.add_scalar('overall_speed/step',
                          global_examples / (now - train_raw_start),
                          global_steps)
        writer.close()
Example #14
    def load_annotations(self, proposal_method, **kwargs):
        logger = logging.getLogger("vmr.trainer")
        logger.info("Preparing data form file {}, please wait...".format(
            self.anno_file))
        self.annos = []
        self.gts = []
        word2vec_cache_prefix = os.path.splitext(self.anno_file)[0]
        word2vec_cache_file = '{}_word2vec_{}.pkl'.format(
            word2vec_cache_prefix, self.word2vec)

        # Define word embedding function
        if os.path.exists(word2vec_cache_file):
            annos_original = None
            # Load word embeddings cache if exists
            logger.info("Word2vec cache exist, load cache file.")
            with open(word2vec_cache_file, 'rb') as F:
                self.annos_query = pickle.load(F)

            def word_embedding(idx, sentence):
                assert self.annos_query[idx]['sentence'] == sentence, \
                    'annotation file {} has been modified, cache file expired!'.format(self.anno_file,)
                return self.annos_query[idx]['query'], self.annos_query[idx][
                    'wordlen']
        else:
            annos_original = []
            # Computing word embeddings if there's no cache
            if self.word2vec == 'BERT':
                # Here we use second-to-last hidden layer
                # See 3.5 Pooling Strategy & Layer Choice in https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#3-extracting-embeddings
                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                bert = BertForPreTraining.from_pretrained('bert-base-uncased',
                                                          return_dict=True)
                bert.to('cuda')

                def word_embedding(idx, sentence):
                    sentence_tokenized = tokenizer(
                        sentence,
                        return_tensors="pt")  # token_num = sentence_num+2
                    for key in sentence_tokenized:
                        sentence_tokenized[key] = sentence_tokenized[key].to(
                            'cuda')
                    with torch.no_grad():
                        query = bert(**sentence_tokenized,
                                     output_hidden_states=True
                                     )['hidden_states'][-2].squeeze_().to(
                                         'cpu')  #(token_num, 768)
                        query = query[1:-1]
                    return query, query.size(
                        0)  #(sentence_len, 768) including punctuations
            elif self.word2vec == 'GloVe':

                def word_embedding(idx, sentence):
                    word2vec = glove_embedding(sentence)
                    return word2vec, word2vec.size(
                        0)  #(sentence_len, 300) including punctuations
            else:
                raise NotImplementedError

        # Loading annotations, generate ground truth for model proposal
        logger.info("loading annotations ...")
        with open(self.anno_file, 'r') as f:
            annos = json.load(f)
        for vid, anno in tqdm(annos.items()):
            duration = anno[
                'duration'] if self.dataset_name != 'tacos' else anno[
                    'num_frames'] / anno['fps']
            # Produce annotations
            for idx in range(len(anno['timestamps'])):
                timestamp = anno['timestamps'][idx]
                sentence = anno['sentences'][idx]
                if timestamp[0] < timestamp[1]:
                    moment = torch.tensor([max(timestamp[0], 0), min(timestamp[1], duration)]) if self.dataset_name != 'tacos' \
                    else torch.tensor(
                            [max(timestamp[0]/anno['fps'],0),
                            min(timestamp[1]/anno['fps'],duration)]
                        )
                    query, wordlen = word_embedding(len(self.annos), sentence)
                    self.avg_wordvec += query.mean(dim=0)
                    if annos_original is not None:
                        annos_original.append({
                            'sentence': sentence,
                            'query': query,
                            'wordlen': wordlen,
                        })
                    adjmat = torch.tensor(anno['dependency_parsing_graph']
                                          [idx]) if self.dep_graph else None
                    if self.consti_mask:
                        constimask = torch.tensor(
                            anno['constituency_parsing_mask'][idx],
                            dtype=torch.float32)
                        layers = torch.linspace(
                            constimask.size(0) - 1, 0, self.tree_depth).long(
                            )  # The original tree is from root to leaf
                        constimask = constimask[layers, :, :]
                    else:
                        constimask = None
                    if self.dep_graph:
                        padding = query.size(0) - adjmat.size(0)
                    adjmat = torch.nn.functional.pad(
                        adjmat,
                        (0, padding, 0,
                         padding), "constant", 0) if self.dep_graph else None
                    if wordlen >= self.max_num_words:
                        wordlen = self.max_num_words
                        query = query[:self.max_num_words]
                        adjmat = adjmat[:self.max_num_words, :self.
                                        max_num_words] if self.dep_graph else None
                    elif self.fix_num_words:
                        padding = self.max_num_words - wordlen
                        query = torch.nn.functional.pad(
                            query, (0, 0, 0, padding), "constant", 0)
                        #print('padded:', query.shape)
                        if self.dep_graph:
                            padding = self.max_num_words - adjmat.size(0)
                        adjmat = torch.nn.functional.pad(
                            adjmat, (0, padding, 0, padding), "constant",
                            0) if self.dep_graph else None

                    self.annos.append({
                        'vid':
                        vid,
                        'moment':
                        moment,
                        'sentence':
                        sentence,
                        'query':
                        query,
                        'querymask':
                        torch.ones(wordlen, dtype=torch.int32),
                        'adjmat':
                        adjmat,
                        'constimask':
                        constimask,
                        'wordlen':
                        wordlen,
                        'duration':
                        duration,
                    })
                    gt_dict = self.__generate_ground_truth__(
                        moment, duration, proposal_method, **kwargs)
                    self.gts.append(gt_dict)

        self.avg_wordvec /= len(self.annos)

        if not os.path.exists(word2vec_cache_file):
            with open(word2vec_cache_file, 'wb') as F:
                word2vec_cache = [{
                    'sentence': anno['sentence'],
                    'query': anno['query'],
                    'wordlen': anno['wordlen']
                } for anno in annos_original]
                pickle.dump(word2vec_cache, F)

        # Loading visual features if in_memory
        if self.in_memory:
            logger.info(
                "Loading visual features from {}, please wait...".format(
                    self.feat_file))
            self.feats, self.seglens = video2feats(self.feat_file,
                                                   annos.keys(),
                                                   self.num_segments,
                                                   self.dataset_name,
                                                   self.upsample)
        logger.info("Dataset prepared!")
Example #15
        labels = data['bert_label'].to(device).long()
        optim.zero_grad()
        outputs = model(input_ids=input_ids,
                        token_type_ids=token_type_ids,
                        attention_mask=attention_mask,
                        labels=labels,
                        next_sentence_label=next_sentence_label)
        loss = outputs['loss']
        losses.append(loss.cpu().detach().numpy())
    loss = np.mean(losses)
    return loss


device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = BertConfig(vocab_size=len(WORDS) + 1)
model = BertForPreTraining.from_pretrained('bert-base-chinese')
model = model.to(device)
# model=nn.DataParallel(model,device_ids=[0,1])
optim = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
NUM_EPOCHS = 5
for epoch in range(NUM_EPOCHS):
    pbar = tqdm(train_loader)
    losses = []
    for data_label in pbar:
        data = data_label[0]
        next_sentence_label = data_label[1].to(device).long()

        input_ids = data['input_ids'].to(device).long()
        token_type_ids = data['token_type_ids'].to(device).long()
        attention_mask = data['attention_mask'].to(device).long()
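
The inner loop body above is cut off; a hedged sketch of a forward/backward step that mirrors the call signature shown in the evaluation fragment at the top of this example (illustrative, not the original code):

        labels = data['bert_label'].to(device).long()
        outputs = model(input_ids=input_ids,
                        token_type_ids=token_type_ids,
                        attention_mask=attention_mask,
                        labels=labels,
                        next_sentence_label=next_sentence_label)
        loss = outputs['loss']
        optim.zero_grad()
        loss.backward()
        optim.step()
        losses.append(loss.item())
        pbar.set_description('epoch %d loss %.4f' % (epoch, loss.item()))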
Example #16
from transformers import BertTokenizer, BertForPreTraining, BertForSequenceClassification
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

max_length=100
k=10
device="cpu"

pretrained_weights = '/data5/private/suyusheng/task_selecte/bert-base-uncased-128/'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights, do_lower_case=True)

fine_tuned_weight = '/data5/private/suyusheng/task_selecte/output_finetune/pytorch_model.bin_1314'
model = BertForPreTraining.from_pretrained(pretrained_weights, output_hidden_states=True,return_dict=True)
model.load_state_dict(torch.load(fine_tuned_weight), strict=False)
model.to(device)


#out_CLS = torch.load("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain_CLS.pt")
out_CLS = torch.load("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain_CLS_res.pt")
out_CLS = out_CLS.to(device)

#with open("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain.json") as f:
with open("/data5/private/suyusheng/task_selecte/data/open_domain_preprocessed/opendomain_res.json") as f:
    out_data = json.load(f)

with open("../data/restaurant/train.json") as f:
    data = json.load(f)
    for index, d in enumerate(tqdm(data)):
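
The loop above is truncated; independent of its original body, here is a hedged, standalone sketch of pulling a [CLS] vector from the hidden states that the model above exposes via output_hidden_states=True and return_dict=True:

enc = tokenizer("an example sentence", return_tensors="pt",
                truncation=True, max_length=max_length)
enc = enc.to(device)
with torch.no_grad():
    out = model(**enc)
cls_vec = out.hidden_states[-1][:, 0]   # shape (1, hidden_size): last layer, [CLS] position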
Example #17
    def __init__(self, args):
        super().__init__()

        self.args = args
        self.use_bert = args.bert
        self.pad_token_id = args.pad_token_id
        self.concat = args.concat

        if self.use_bert:
            # from transformers import AutoModel
            # self.bert = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states = True)
            from transformers import BertConfig, BertForPreTraining
            if args.finetune:
                config = BertConfig.from_json_file(
                    './bert/bert_tiny_finetune/bert_config.json')
                self.bert = BertForPreTraining.from_pretrained(
                    './bert/bert_tiny_finetune/bert_model.ckpt.index',
                    from_tf=True,
                    config=config)
            else:
                config = BertConfig.from_json_file(
                    './bert/bert_tiny/bert_config.json')
                self.bert = BertForPreTraining.from_pretrained(
                    './bert/bert_tiny/bert_model.ckpt.index',
                    from_tf=True,
                    config=config)
                # keeping the weights of the pre-trained encoder frozen
                for param in self.bert.base_model.parameters():
                    param.requires_grad = False
            # bert base uncased has embedding dim = 768, tiny = 128
            if self.concat:
                args.embedding_dim = 256
            else:
                args.embedding_dim = 128
        else:
            # Initialize embedding layer (1)
            self.embedding = nn.Embedding(args.vocab_size, args.embedding_dim)

        # Initialize Context2Query (2)
        self.aligned_att = AlignedAttention(args.embedding_dim)

        rnn_cell = nn.LSTM if args.rnn_cell_type == 'lstm' else nn.GRU

        # Initialize passage encoder (3)
        self.passage_rnn = rnn_cell(
            args.embedding_dim * 2,
            args.hidden_dim,
            bidirectional=args.bidirectional,
            batch_first=True,
        )

        # Initialize question encoder (4)
        self.question_rnn = rnn_cell(
            args.embedding_dim,
            args.hidden_dim,
            bidirectional=args.bidirectional,
            batch_first=True,
        )

        self.dropout = nn.Dropout(self.args.dropout)

        # Adjust hidden dimension if bidirectional RNNs are used
        _hidden_dim = (args.hidden_dim *
                       2 if args.bidirectional else args.hidden_dim)

        # Initialize attention layer for question attentive sum (5)
        self.question_att = SpanAttention(_hidden_dim)

        # Initialize bilinear layer for start positions (6)
        self.start_output = BilinearOutput(_hidden_dim, _hidden_dim)

        # Initialize bilinear layer for end positions (7)
        self.end_output = BilinearOutput(_hidden_dim, _hidden_dim)
Example #18
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the files for bert pretraining.")
    parser.add_argument("--model_name_or_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to pre-trained model")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    # parser.add_argument("--config_name", default="", type=str,
    #                     help="Pretrained config name or path if not the same as model_name")
    # parser.add_argument("--tokenizer_name", default="", type=str,
    #                     help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    # parser.add_argument("--do_eval", action="store_true",
    #                     help="Whether to run eval on the dev set.")
    # parser.add_argument("--do_predict", action="store_true",
    #                     help="Whether to run predictions on the test set."))
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=1000,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    # parser.add_argument("--eval_all_checkpoints", action="store_true",
    #                     help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument(
        "--yago_reference",
        action="store_true",
        help="Use Reference of Yago types as additional inputs.")
    parser.add_argument("--start_task_id", type=int, default=0)
    parser.add_argument("--skip_steps", type=int, default=-1)
    parser.add_argument("--skip_global_steps", type=int, default=-1)
    parser.add_argument("--load_checkpoint", type=str, default="")
    args = parser.parse_args()

    if 'uncased' in args.model_name_or_path:
        args.do_lower_case = True
    else:
        args.do_lower_case = False

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    # args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = (BertConfig,
                                                  BertForPreTraining,
                                                  BertTokenizer)
    # config = BertConfig() #
    bertconfig = BertConfig.from_pretrained(
        'bert-base-uncased',
        do_lower_case=args.do_lower_case,
        cache_dir='/work/smt2/qfeng/Project/huggingface/pretrain/base_uncased')
    # tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
    #                                             do_lower_case=args.do_lower_case,
    #                                             cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased',
        do_lower_case=True,
        cache_dir='/work/smt2/qfeng/Project/huggingface/pretrain/base_uncased')
    # model = model_class.from_pretrained(args.model_name_or_path,
    #                                     from_tf=bool(".ckpt" in args.model_name_or_path),
    #                                     config=config,
    #                                     cache_dir=args.cache_dir if args.cache_dir else None)
    if args.load_checkpoint == "":
        if not args.yago_reference:
            config = bertconfig
            model = BertForPreTraining(config)
        else:
            config = YagoRefBertConfig.from_pretrained(
                'bert-base-uncased'
                if args.do_lower_case else 'bert-base-cased',
                reference_size=REFERENCE_SIZE,
                cache_dir='/work/smt2/qfeng/Project/huggingface/pretrain/base_{}'
                .format('uncased' if args.do_lower_case else 'cased'))
            model = YagoRefBertForPreTraining(config)
    else:
        if 'step' in args.load_checkpoint.split(
                '/')[-1] and args.skip_global_steps is not None:
            assert (args.load_checkpoint.endswith(str(args.skip_global_steps)))
        if not args.yago_reference:
            config = BertConfig.from_pretrained(args.load_checkpoint)
            model = BertForPreTraining.from_pretrained(args.load_checkpoint)
        else:
            config = YagoRefBertConfig.from_pretrained(args.load_checkpoint)
            model = YagoRefBertForPreTraining.from_pretrained(
                args.load_checkpoint)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    # train_dataset = load_and_cache_examples(args, tokenizer, pad_token_label_id, mode="train") # TODO: need total rewritten
    # pickle_list = assign_pickles(args, args.start_task_id)
    # train_dataset = load_and_cache_examples(args, tokenizer, pickle_list)

    global_step, tr_loss = train(args,
                                 model=model,
                                 tokenizer=tokenizer,
                                 pad_token_label_id=pad_token_label_id)
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            "module") else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation TODO: need rewritten:
    """
Example #19
    def __init__(self, metadata, timer, is_ZH, data_manager):
        super().__init__()
        self.timer = timer
        self.timer("bert-init")
        self.batch_per_train = 50
        self.batch_size_eval = 64
        self.max_seq_len = 301
        self.batch_size = 48
        self.weight_decay = 0
        self.learning_rate = 5e-5
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.
        self.total_epoch = 100
        self.logging_step = -1
        self.warmup_steps = 0
        self.metadata = metadata
        self.num_class = self.metadata.get_output_size()

        self.bert_folder = extract_bert_model()

        bertConfig = BertConfig.from_json_file(self.bert_folder +
                                               '/config.json')
        self.model = BertClassification(None, bertConfig, self.num_class)

        self.bertTokenizer = BertTokenizer.from_pretrained(self.bert_folder)
        bertModel = BertForPreTraining.from_pretrained(
            self.bert_folder, num_labels=self.num_class, from_tf=BERT_V == 0)
        self.model.bert = bertModel.bert
        del bertModel.cls
        self.model.to(torch.device("cuda"))
        self.data = data_manager
        self.data.add_pipeline(
            BertPipeline(is_ZH,
                         metadata,
                         self.bertTokenizer,
                         max_length=self.max_seq_len))
        self.train_data_loader = None
        self.test_data_loader = None
        self.valid_data_loader = None
        self.done_training = False
        self.estimate_time_per_batch = None
        self.estimate_valid_time = None
        self.estimate_test_time = None

        # init optimizer and scheduler
        no_decay = ["bias", "LayerNorm.weight"]

        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                self.weight_decay,
            },
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0
            },
        ]

        self.optimizer = AdamW(optimizer_grouped_parameters,
                               lr=self.learning_rate,
                               eps=self.adam_epsilon)
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.total_epoch * self.batch_per_train)

        # first, we only train the classifier
        self.optimizer_only_classifier = optim.Adam(
            self.model.classifier.parameters(), 0.0005)

        self.place = 'cpu'

        self.timer("bert-init")
        print('[bert init] time cost: %.2f' %
              (self.timer.accumulation["bert-init"]))
Example #20
    log.write("tokenizer loaded with custom vocabulary of size %d \n" %
              len(tokenizer))

    #Datasets
    train_dataset = QuestMLMDataset(train_df, tokenizer, target_cols=TARGETS)
    val_dataset = QuestMLMDataset(test_df, tokenizer, target_cols=TARGETS)

    #Load Model
    config = BertConfig.from_json_file(str(path_to_ckpt_config /
                                           "config.json"))
    model = BertPretrain(config, len(TARGETS))
    model = model.cuda()
    log.write("model loaded")
    #Token embeddings of new tokens
    orig_bert = BertForPreTraining.from_pretrained("bert-base-cased")
    orig_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    state_dict = orig_bert.state_dict()
    del state_dict["cls.predictions.decoder.weight"], state_dict[
        "cls.predictions.bias"], state_dict["cls.predictions.decoder.bias"]
    orig_embedding = state_dict["bert.embeddings.word_embeddings.weight"]
    extra_tokens = list(tokenizer.vocab.keys())[len(orig_tokenizer.vocab):]
    new_tokens_as_orig_indices = [
        [i] for i in range(len(orig_tokenizer.vocab))
    ] + [
        orig_tokenizer.encode(t, add_special_tokens=False)
        for t in extra_tokens
    ]
    new_embedding = torch.zeros(len(new_tokens_as_orig_indices),
                                orig_embedding.shape[-1])
    new_embedding.normal_(mean=0.0, std=0.02)
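
    # The snippet stops right after the random initialisation; a hedged, illustrative way to
    # finish it using the index lists built above (not necessarily what the original script does):
    # keep pretrained vectors for original tokens and start each extra token from the mean of
    # its original word-piece embeddings.
    for new_idx, orig_indices in enumerate(new_tokens_as_orig_indices):
        if orig_indices:
            new_embedding[new_idx] = orig_embedding[orig_indices].mean(dim=0)
    state_dict["bert.embeddings.word_embeddings.weight"] = new_embedding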
Example #21
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name"
        )

    if model_args.model_name_or_path:
        model = BertForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = BertForPreTraining(config)
    
    if model_args.cls_model_name_or_path:
        cls_config = AutoConfig.from_pretrained(
            model_args.cls_model_name_or_path,
            num_labels=2,
            finetuning_task="cola",
            cache_dir=model_args.cache_dir,
        )
        cls_model = AutoModelForSequenceClassification.from_pretrained(
            model_args.cls_model_name_or_path,
            from_tf=bool(".ckpt" in model_args.cls_model_name_or_path),
            config=cls_config,
            cache_dir=model_args.cache_dir,
        )
        cls_model.resize_token_embeddings(len(tokenizer))
        mask_selector = MaskSelector(cls_model, training_args)

    model.resize_token_embeddings(len(tokenizer))
    

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets

    train_dataset = get_dataset(data_args, tokenizer=tokenizer, model_args=model_args, cache_dir=model_args.cache_dir) if training_args.do_train else None
    eval_dataset = get_dataset(data_args, model_args=None, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None

   
    data_collator = DataCollatorForMixLM(
        tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
    )
    

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
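`DataCollatorForMixLM` is a project-specific collator that is not shown here; a reasonable assumption is that it builds on the standard masked-LM collation. For reference, a minimal sketch of the usual 80/10/10 masking rule applied by the stock Hugging Face collators (the function name and the `mlm_probability` default are illustrative):

import torch

def mask_tokens(input_ids, tokenizer, mlm_probability=0.15):
    # input_ids: LongTensor of shape (batch, seq_len); modified in place.
    labels = input_ids.clone()
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
        for ids in labels.tolist()
    ]
    probability_matrix.masked_fill_(
        torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # only masked positions contribute to the loss

    # 80% of masked positions become [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    input_ids[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% become a random token; the remaining 10% are left unchanged
    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                      & masked_indices & ~indices_replaced)
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    input_ids[indices_random] = random_words[indices_random]
    return input_ids, labels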
Пример #22
0
def train():
    logger.info('*' * 64)
    logger.info('token:%s' % current_time)
    logger.info('*' * 64)

    parser = ArgumentParser()
    parser.add_argument(
        "--train_file",
        type=str,
        default="./my_test/data/student/part1.txt",
        help="Path or url of the dataset. If empty download from S3.")

    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./cache/',
                        help="Path or url of the dataset cache")
    parser.add_argument("--batch_size",
                        type=int,
                        default=2,
                        help="Batch size for training")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=1,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-4,
                        help="Learning rate")
    # parser.add_argument("--train_precent", type=float, default=0.7, help="Batch size for validation")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=1,
                        help="Number of training epochs")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    # parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--log_step",
                        type=int,
                        default=1,
                        help="Log training loss every N iterations")
    parser.add_argument("--base_model", type=str, default="bert-base-uncased")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )

    args = parser.parse_args()
    logger.info(args)
    device = torch.device(args.device)
    tokenizer = BertTokenizer.from_pretrained(args.base_model)

    train_dataset = BERTDataset(args.train_file,
                                tokenizer,
                                seq_len=args.max_seq_length,
                                corpus_lines=None,
                                on_memory=args.on_memory)
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size)

    model = BertForPreTraining.from_pretrained(args.base_model)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    steps = len(train_data_loader.dataset) // train_data_loader.batch_size
    steps = steps if steps > 0 else 1
    logger.info('steps:%d' % steps)

    lr_warmup = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                                num_warmup_steps=1500,
                                                num_training_steps=steps *
                                                args.n_epochs)

    multi_gpu = False
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        gpu_num = torch.cuda.device_count()
        gpu_list = [int(i) for i in range(gpu_num)]
        model = DataParallel(model, device_ids=gpu_list)
        multi_gpu = True

    if torch.cuda.is_available():
        model.cuda()

    # model.to(device)
    # criterion.to(device)

    def update(engine, batch):
        model.train()
        # input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
        """
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        masked_lm_labels=None,
        next_sentence_label=None,
        """
        # loss = model(input_ids=batch[0],input_mask=batch[1],segment_ids=batch[2],lm_label_ids=batch[3],is_next=batch[4])

        batch = tuple(t.to(device) for t in batch)
        outputs = model(input_ids=batch[0],
                        attention_mask=batch[1],
                        token_type_ids=batch[2],
                        masked_lm_labels=batch[3],
                        next_sentence_label=batch[4])
        loss = outputs[0]

        if multi_gpu:
            loss = loss.mean()
        loss = loss / args.gradient_accumulation_steps
        loss.backward()

        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            lr_warmup.step()
            optimizer.zero_grad()

        return loss.cpu().item()

    trainer = Engine(update)

    # def inference(engine, batch):
    #     model.eval()
    #     with torch.no_grad():
    #         input_ids = batch[0].to(device)
    #         attention_mask = batch[1].to(device)
    #         labels = batch[2].to(device)
    #         output = model(input_ids=input_ids, attention_mask=attention_mask)
    #
    #         predict = output.permute(1, 2, 0)
    #         trg = labels.permute(1, 0)
    #         loss = criterion(predict.to(device), trg.to(device))
    # return predict, trg
    #
    # evaluator = Engine(inference)
    # metrics = {"nll": Loss(criterion, output_transform=lambda x: (x[0], x[1])),
    #            "accuracy": Accuracy(output_transform=lambda x: (x[0], x[1]))}
    # for name, metric in metrics.items():
    #     metric.attach(evaluator, name)
    #
    # @trainer.on(Events.EPOCH_COMPLETED)
    # def log_validation_results(trainer):
    #     evaluator.run(valid_data_loader)
    #     ms = evaluator.state.metrics
    #     logger.info("Validation Results - Epoch: [{}/{}]  Avg accuracy: {:.6f} Avg loss: {:.6f}"
    #           .format(trainer.state.epoch, trainer.state.max_epochs, ms['accuracy'], ms['nll']))

    #
    '''======================early stopping =========================='''
    # def score_function(engine):
    #     val_loss = engine.state.metrics['nll']
    #     return -val_loss
    # handler = EarlyStopping(patience=5, score_function=score_function, trainer=trainer)
    # evaluator.add_event_handler(Events.COMPLETED, handler)
    '''==================print information by iterator========================='''

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        if trainer.state.iteration % args.log_step == 0:
            logger.info("Epoch[{}/{}] Step[{}/{}] Loss: {:.6f}".format(
                trainer.state.epoch, trainer.state.max_epochs,
                trainer.state.iteration % steps, steps,
                trainer.state.output * args.gradient_accumulation_steps))

    '''================add check point========================'''
    checkpoint_handler = ModelCheckpoint(checkpoint_dir,
                                         'checkpoint',
                                         n_saved=3)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED, checkpoint_handler,
        {'BertClassificationModel': getattr(model, 'module', model)
         })  # "getattr" take care of distributed encapsulation
    '''==============run trainer============================='''
    trainer.run(train_data_loader, max_epochs=args.n_epochs)
Пример #23
0
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    ###############
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--pretrain_model",
                        default='bert-base-uncased',
                        type=str,
                        required=True,
                        help="Pre-trained model")
    parser.add_argument("--num_labels_task",
                        default=None,
                        type=int,
                        required=True,
                        help="num_labels_task")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--task",
                        default=None,
                        type=int,
                        required=True,
                        help="Choose Task")
    ###############

    args = parser.parse_args()

    processors = Processor_pretrain

    #num_labels = args.num_labels_task

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}"
        .format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    #tokenizer = BertTokenizer.from_pretrained(args.ernie_model, do_lower_case=args.do_lower_case)
    tokenizer = BertTokenizer.from_pretrained(args.pretrain_model,
                                              do_lower_case=True)

    train_examples = None
    num_train_steps = None
    #aspect_list = None
    #sentiment_list = None
    processor = processors()
    #num_labels = num_labels
    #train_examples, aspect_list, sentiment_list = processor.get_train_examples(args.data_dir)
    train_examples = processor.get_train_examples(args.data_dir)

    if args.task == 1:
        num_labels = len(aspect_list)
    elif args.task == 2:
        num_labels = len(sentiment_list)
    elif args.task == 0:
        print("pretrain")
        num_labels = 0
    else:
        print("What's task?")
        exit()

    num_train_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.pretrain_model,
                                               return_dict=True)

    # Prepare optimizer
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    #no_decay = ['bias', 'LayerNorm.weight']
    no_grad = [
        'bert.encoder.layer.11.output.dense_ent',
        'bert.encoder.layer.11.output.LayerNorm_ent'
    ]
    param_optimizer = [(n, p) for n, p in param_optimizer
                       if not any(nd in n for nd in no_grad)]
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(t_total *
                                                                     0.1),
                                                num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    global_step = 0
    if args.do_train:
        #train_features = convert_examples_to_features( train_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task)
        train_features = convert_examples_to_features(
            train_examples,
            aspect_list=None,
            sentiment_list=None,
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer,
            task_n=args.task)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        #all_masked_lm_positions = torch.tensor([f.masked_lm_positions for f in train_features], dtype=torch.long)
        all_masked_lm_labels = torch.tensor(
            [f.masked_lm_labels for f in train_features], dtype=torch.long)
        if args.task == 1:
            print("Excuting the task 1")
        elif args.task == 2:
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in train_features], dtype=torch.long)
        elif args.task == 0:
            print("Excuting the task 0")
        else:
            print("Wrong here2")

        if args.task == 1:
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_label_ids)
        elif args.task == 2:
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_label_ids)
        elif args.task == 0:
            #train_data = TensorDataset(all_input_ids, all_input_mask, all_masked_lm_positions, all_masked_lm_labels)
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_masked_lm_labels)
        else:
            print("Wrong here1")

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        output_loss_file = os.path.join(args.output_dir, "loss")
        loss_fout = open(output_loss_file, 'w')
        model.train()

        ##########Pre-Process#########

        ###############################

        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(
                    t.to(device) if i != 3 else t for i, t in enumerate(batch))

                if args.task == 1:
                    input_ids, input_mask, label_ids = batch
                elif args.task == 2:
                    input_ids, input_mask, segment_ids, label_ids = batch
                elif args.task == 0:
                    #input_ids, input_mask, masked_lm_positions, masked_lm_labels = batch
                    input_ids, input_mask, masked_lm_labels = batch
                else:
                    print("Wrong here3")

                if args.task == 1:
                    #loss, logits, hidden_states, attentions
                    output = model(input_ids=input_ids,
                                   token_type_ids=None,
                                   attention_mask=input_mask,
                                   labels=None)
                    loss = output.loss
                elif args.task == 2:
                    #loss, logits, hidden_states, attentions
                    output = model(input_ids=input_ids,
                                   token_type_ids=segment_ids,
                                   attention_mask=input_mask,
                                   labels=None)
                    loss = output.loss
                elif args.task == 0:
                    #loss, logits, hidden_states, attentions
                    #output = model(input_ids=input_ids, input_mask=input_mask, position_ids=masked_lm_positions, labels=masked_lm_labels)
                    output = model(input_ids=input_ids,
                                   attention_mask=input_mask,
                                   position_ids=None,
                                   masked_lm_labels=masked_lm_labels)
                    loss = output.loss
                else:
                    print("Wrong!!")

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    ###
                    #optimizer.backward(loss)
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    ###
                else:
                    loss.backward()

                loss_fout.write("{}\n".format(loss.item()))
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    ###
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1
                    ###
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(
                args.output_dir, "pytorch_model.bin_{}".format(global_step))
            torch.save(model_to_save.state_dict(), output_model_file)
        '''
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", 
                        default=1e-8, 
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps", 
                        default=0, 
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory",
                        action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = BERTDataset(args.train_corpus, tokenizer, seq_len=args.max_seq_length,
                                    corpus_lines=None, on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

        else:
            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
            logger.info("** ** * Saving fine-tuned model ** ** * ")
            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(args.output_dir)
            tokenizer.save_pretrained(args.output_dir)
    def __init__(self,
                 scan_encoder_class=None,
                 scan_encoder_args={},
                 bert_class=None,
                 bert_args={},
                 scan_decoder_class=None,
                 scan_decoder_args={},
                 task_configs=[],
                 vocab_args={},
                 loss_weighting=None,
                 optim_class="Adam",
                 optim_args={},
                 scheduler_class=None,
                 scheduler_args={},
                 pretrained_configs=[],
                 cuda=True,
                 devices=[0]):
        """
        """
        super().__init__(optim_class, optim_args, scheduler_class,
                         scheduler_args, pretrained_configs, cuda, devices)

        self.encodes_scans = scan_encoder_class is not None
        if self.encodes_scans:
            self.scan_encoder = getattr(
                modules, scan_encoder_class)(**scan_encoder_args)
            self.scan_encoder = nn.DataParallel(self.scan_encoder,
                                                device_ids=self.devices)

        if bert_class == "BertModelPreTrained":
            self.bert = BertModel.from_pretrained(**bert_args)
        elif bert_class == "BertForPretraining":
            self.bert = BertForPreTraining.from_pretrained(**bert_args)
        elif bert_class == "BertModel":
            bert_args["config"] = BertConfig.from_dict(bert_args["config"])
            self.bert = BertModel(**bert_args)
        else:
            self.bert = getattr(modules, bert_class)(**bert_args)
        self.bert = nn.DataParallel(self.bert, device_ids=self.devices)

        self.decodes_scans = scan_decoder_class is not None
        if self.decodes_scans:
            self.scan_decoder = getattr(
                modules, scan_decoder_class)(**scan_decoder_args)

        self.task_heads = {}
        self.task_inputs = {}
        for task_head_config in task_configs:
            task = task_head_config["task"]
            head_class = getattr(modules, task_head_config["class"])
            args = task_head_config["args"]
            self.task_inputs[task] = (task_head_config["inputs"] if "inputs"
                                      in task_head_config else "pool")

            if "config" in args:
                # bert task heads take config object for parameters, must convert from dict
                config = args["config"]
                args["config"] = namedtuple("Config",
                                            config.keys())(*config.values())

            if head_class is BertOnlyMLMHead:
                embs = self.bert.module.embeddings.word_embeddings.weight
                self.task_heads[task] = head_class(
                    bert_model_embedding_weights=embs, **args)
            else:
                self.task_heads[task] = head_class(**args)

        self.task_heads = torch.nn.ModuleDict(self.task_heads)

        self.vocab = WordPieceVocab(**vocab_args)

        self._build_loss(loss_weighting)

        self._post_init()
def main():
    args = get_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # not parallelizing across GPUs because of deadlocks
    n_gpu = 1 if torch.cuda.device_count() > 0 else 0

    logging.info(f'device: {device} n_gpu: {n_gpu} seed: {args.seed}')
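    # `handle` is presumably created once at module level in the original script;
    # a minimal sketch of the usual setup with the nvidia-ml-py wrapper would be:
    #   nvidia_smi.nvmlInit()
    #   handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)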
    res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    logging.info(
        f'mem: {res.used / (1024**2)} MiB ({100 * (res.used / res.total):.3f}%)'
    )

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # TODO: not sure what this for loop is doing
    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = total_train_examples // args.train_batch_size

    # Prepare model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    model = BertForPreTraining.from_pretrained(args.bert_model)
    model.to(device)

    # Prepare optimizer
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        tmp_fp = f'/media/data_1/darius/data/512epoch_{epoch}_dataset_255.pkl'
        if Path(tmp_fp).is_file():
            logging.info(f'Loading dataset from {tmp_fp}...')
            with open(tmp_fp, 'rb') as f:
                epoch_dataset = pickle.load(f)
        else:
            epoch_dataset = PregeneratedDataset(
                epoch=epoch,
                training_path=args.pregenerated_data,
                tokenizer=tokenizer,
                num_data_epochs=num_data_epochs,
                reduce_memory=args.reduce_memory)
            with open(tmp_fp, 'wb') as f:
                pickle.dump(epoch_dataset, f, protocol=4)
        train_sampler = RandomSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for _, (input_ids, input_mask, segment_ids, lm_label_ids,
                    is_next) in enumerate(train_dataloader):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                lm_label_ids = lm_label_ids.to(device)
                is_next = is_next.to(device)
                # breakpoint()
                outputs = model(input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=lm_label_ids,
                                next_sentence_label=is_next)
                # outputs = model(input_ids, segment_ids,
                #                 input_mask, lm_label_ids, is_next)
                loss = outputs.loss
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1

    # Save a trained model
    logging.info("** ** * Saving fine-tuned model ** ** * ")
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model itself

    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(args.output_dir)
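A short usage note (not part of the original script): because the block above writes the weights, config, and vocabulary into `args.output_dir`, the checkpoint can later be reloaded with the standard helpers. The path below is a hypothetical placeholder.

from transformers import BertForPreTraining, BertTokenizer

model = BertForPreTraining.from_pretrained("path/to/output_dir")
tokenizer = BertTokenizer.from_pretrained("path/to/output_dir")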
Пример #27
0
import torch
from transformers import BertTokenizer, BertForPreTraining

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')

# Generate pseudo-error (noise) sentences and store them in the img2info dictionary.
import json
import random
import pickle
import nltk
# word_tokenize
nltk.download('punkt')
# pos_tag
nltk.download('averaged_perceptron_tagger')
# wordnet
from nltk.corpus import wordnet as wn
from tqdm import tqdm


def build_img2info(json_obj, sim_value):
    # image id as key; (key, caption, noise caption) as value
    img2info = {}
    idx = 0
    for dic in tqdm(json_obj.values(), total=len(json_obj)):
        new_noise = []
        for caption in dic['captions']:
            noise_captions = []
            # morphological analysis (word tokenization)
            morph = nltk.word_tokenize(caption.lower())
            pos = nltk.pos_tag(morph)
Пример #28
0
import argparse

from transformers import (BertTokenizerFast,
                          DataCollatorForNextSentencePrediction,
                          TextDatasetForNextSentencePrediction,
                          BertForPreTraining, Trainer)

parser = argparse.ArgumentParser()
parser.add_argument("--corpus_eval")
parser.add_argument("--block_size", type=int)
parser.add_argument("--model_name_or_path")
parser.add_argument("--token_vocab",
                    default='/home/ubuntu/lrz_share/data/token_vocab/bert/')
args = parser.parse_args()

tokenizer = BertTokenizerFast.from_pretrained(args.token_vocab)

model = BertForPreTraining.from_pretrained(args.model_name_or_path)

data_eval = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path=args.corpus_eval,
    block_size=args.block_size,
)

data_collator = DataCollatorForNextSentencePrediction(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
    block_size=args.block_size)

trainer = Trainer(model=model,
                  data_collator=data_collator,
Пример #29
0
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertConfig, BertForPreTraining
from torch.nn import CrossEntropyLoss
from common import AverageMeter
from custom_metrics import LMAccuracy
from data_loader import Data_pretrain
from config import Config

if __name__ == '__main__':
    #  training_path, file_id, tokenizer, data_name, reduce_memory=False
    tokenizer = BertTokenizer.from_pretrained('./bert_base_pretrain/vocab.txt')
    train_data_path = './data/processed_data0.json'
    txt = Data_pretrain(train_data_path, tokenizer)
    data_iter = DataLoader(txt, shuffle=True, batch_size=2)
    bert_config = BertConfig.from_pretrained(Config.config_path)

    # model = BertForPreTraining(config=bert_config)
    model = BertForPreTraining.from_pretrained(
        './bert_base_pretrain/pytorch_model.bin', config=bert_config)
    model.to(Config.device)
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
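The snippet above is cut off after the parameter grouping. A minimal sketch, assuming it continues the way the other examples in this listing do, would pair the two groups with AdamW and a linear warmup schedule; the learning rate, epoch count, and warmup fraction below are placeholders, and `data_iter` / `optimizer_grouped_parameters` come from the snippet above.

from transformers import AdamW, get_linear_schedule_with_warmup

num_epochs = 3  # placeholder
num_training_steps = len(data_iter) * num_epochs
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps)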
Пример #30
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file",
                        default="manual_description.txt",
                        type=str,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default="out",
        type=str,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=200,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=4.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        default=True,
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of update steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    if not args.do_train:
        raise ValueError("`do_train` must be set; this script only implements training.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_file)
        train_dataset = BERTDataset(args.train_file,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForPreTraining.from_pretrained(
        args.bert_model, config=BertConfig.from_pretrained(args.bert_model))
    model.to(device)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        train_sampler = RandomSampler(train_dataset)

        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        for epoch in trange(1, int(args.num_train_epochs) + 1, desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            print("epoch=", epoch)
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", position=0)):

                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                model.train()
                prediction_scores, seq_relationship_score = model(
                    input_ids=input_ids,
                    attention_mask=input_mask,
                    token_type_ids=segment_ids)

                if lm_label_ids is not None and is_next is not None:
                    loss_fct = CrossEntropyLoss(ignore_index=-1)
                    #masked_lm_loss = loss_fct(prediction_scores.view(-1, model.config.vocab_size),lm_label_ids.view(-1))
                    next_sentence_loss = loss_fct(
                        seq_relationship_score.view(-1, 2), is_next.view(-1))
                    total_loss = next_sentence_loss

                loss = total_loss
                if step % 200 == 0:
                    print(loss)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_steps, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            if epoch % 5 == 0:
                # Save a trained model
                logger.info("** ** * Saving fine-tuned model ** ** * ")
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model itself
                checkpoint_prefix = 'checkpoint' + str(epoch)
                output_dir = os.path.join(
                    args.output_dir, '{}-{}'.format(checkpoint_prefix,
                                                    global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                if args.do_train:
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
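For reference (not part of the example above, which optimizes the next-sentence loss only): when both label sets are supplied, BertForPreTraining's own training objective is the sum of the masked-LM and next-sentence cross-entropies. A minimal sketch using the same variable names as the loop above; `ignore_index=-1` matches the convention this example already uses.

from torch.nn import CrossEntropyLoss

def pretraining_loss(prediction_scores, seq_relationship_score,
                     lm_label_ids, is_next, vocab_size):
    loss_fct = CrossEntropyLoss(ignore_index=-1)
    masked_lm_loss = loss_fct(
        prediction_scores.view(-1, vocab_size), lm_label_ids.view(-1))
    next_sentence_loss = loss_fct(
        seq_relationship_score.view(-1, 2), is_next.view(-1))
    return masked_lm_loss + next_sentence_loss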