Example #1
    def run(self, task):
        """
        Run GLUE training / evaluation.
        """
        params = self.params

        # task parameters
        self.task = task
        params.out_features = N_CLASSES[task]
        self.is_classif = task != 'STS-B'

        # load data
        self.data = self.load_data(task)
        if not self.data['dico'] == self._embedder.dico:
            raise Exception((
                "Dictionary in evaluation data (%i words) seems different than the one "
                +
                "in the pretrained model (%i words). Please verify you used the same dictionary, "
                + "and the same values for max_vocab and min_count.") %
                            (len(self.data['dico']), len(self._embedder.dico)))

        # embedder
        self.embedder = copy.deepcopy(self._embedder)
        self.embedder.cuda()

        # projection layer
        self.proj = nn.Sequential(*[
            nn.Dropout(params.dropout),
            nn.Linear(self.embedder.out_dim, params.out_features)
        ]).cuda()

        # float16
        if params.fp16:
            assert torch.backends.cudnn.enabled
            self.embedder.model = network_to_half(self.embedder.model)
            self.proj = network_to_half(self.proj)

        # optimizer
        self.optimizer = get_optimizer(
            list(self.embedder.get_parameters(params.finetune_layers)) +
            list(self.proj.parameters()), params.optimizer)
        if params.fp16:
            self.optimizer = FP16_Optimizer(self.optimizer,
                                            dynamic_loss_scale=True)

        # train and evaluate the model
        for epoch in range(params.n_epochs):

            # update epoch
            self.epoch = epoch

            # training
            logger.info("GLUE - %s - Training epoch %i ..." % (task, epoch))
            self.train()

            # evaluation
            logger.info("GLUE - %s - Evaluating epoch %i ..." % (task, epoch))
            with torch.no_grad():
                scores = self.eval()
                self.scores.update(scores)
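The train() and eval() calls above are defined elsewhere in the project. As a rough, assumption-labeled sketch (the batch layout, the get_embeddings() call and the loss choice are guesses, not the project's actual code), a single fp16-aware training step compatible with the optimizer built in run() could look like this:

    # Hedged sketch only; F is torch.nn.functional, and x / lengths / y stand in
    # for whatever the real data iterator yields.
    def train_step(self, x, lengths, y):
        self.embedder.train()
        self.proj.train()

        output = self.proj(self.embedder.get_embeddings(x, lengths))
        loss = (F.cross_entropy(output, y) if self.is_classif
                else F.mse_loss(output.squeeze(1), y.float()))

        self.optimizer.zero_grad()
        if self.params.fp16:
            # FP16_Optimizer scales the loss and keeps fp32 master weights
            self.optimizer.backward(loss)
        else:
            loss.backward()
        self.optimizer.step()
        return loss.item()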
Example #2
def create_optimizers(model,
                      args,
                      lr_schedule,
                      prev_optimizer=None,
                      prev_scheduler=None):
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = FusedAdam(params, lr=args.lr)
    optimizer = FP16_Optimizer(optimizer,
                               dynamic_loss_scale=True,
                               verbose=False)

    if prev_optimizer is not None:
        optimizer.load_state_dict(prev_optimizer.state_dict())

    scheduler = None
    if args.warmup < 0:
        print('No learning rate schedule used.')
    else:
        print('Using learning rate schedule.')
        # LambdaLR must wrap the inner optimizer, not the FP16_Optimizer wrapper
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer.optimizer,
                                                      lr_schedule)
        if prev_scheduler is not None:
            # Continue LR schedule from previous scheduler
            scheduler.load_state_dict(prev_scheduler.state_dict())

    loss_model = SimpleDistributedDataParallel(model, args.world_size)
    return loss_model, optimizer, scheduler
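For orientation, a hedged usage sketch of the function above; model, args, lr_schedule and the batch loop are placeholders, and only the optimizer/scheduler handling mirrors the code:

# Assumption-labeled sketch, not project code.
loss_model, optimizer, scheduler = create_optimizers(model, args, lr_schedule)

for batch in batches:
    optimizer.zero_grad()
    loss = loss_model(batch)      # forward through the DDP-wrapped model
    optimizer.backward(loss)      # FP16_Optimizer scales the loss, then backprops
    optimizer.step()              # unscales master grads and updates FusedAdam
    if scheduler is not None:
        scheduler.step()          # advances the LambdaLR schedule on the inner optimizer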
Example #3
File: model.py Project: makar21/core
    def __init__(self, optimizer_class, optimizer_kwargs, criterion, is_fp16=False):
        super(Model, self).__init__()

        self._optimizer_kwargs = optimizer_kwargs

        self._device = 'cuda' if torch.cuda.is_available() else 'cpu'

        logger.info("Model device: {}".format(self.device))

        self._model = self.native_model_factory()

        self._is_fp16 = is_fp16
        if self._is_fp16:
            # Lazy load apex framework
            # noinspection PyUnresolvedReferences
            from apex.fp16_utils import network_to_half, FP16_Optimizer
            self._model = network_to_half(self._model)

        self._gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0

        logger.info("GPU count: {}".format(self._gpu_count))

        self._model = DataParallel(self._model)

        self._model = self._model.to(self.device)

        self._criterion = criterion.to(self.device)

        self._optimizer = optimizer_class(self._model.parameters(), **optimizer_kwargs)

        if self._is_fp16:
            self._optimizer = FP16_Optimizer(self._optimizer)
Example #4
def build_optimizer(model):
    optim_map = {
        "rmsprop": optim.RMSprop,
        "adam": optim.Adam,
        "adamnorm": AdamNormGrad,
        "adamw": AdamW,
        "adadelta": optim.Adadelta,
        "sgd": optim.SGD,
        "sgd_momentum": lambda params, lr: optim.SGD(
            params, lr=lr, weight_decay=1e-4, momentum=0.9),
        "lbfgs": optim.LBFGS,
    }
    # filt = filter(lambda p: p.requires_grad, model.parameters())
    # return optim_map[args.optimizer.lower().strip()](filt, lr=args.lr)
    optimizer = optim_map[args.optimizer.lower().strip()](model.parameters(),
                                                          lr=args.lr)
    if args.half is True:
        return FP16_Optimizer(optimizer, dynamic_loss_scale=True)

    return optimizer
Example #5
 def get_optimizer_fp(self, module):
     """
     Build optimizer.
     """
     assert module in ['model', 'encoder', 'decoder']
     optimizer = get_optimizer(getattr(self, module).parameters(), self.params.optimizer)
     if self.params.fp16:
         optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
     return optimizer
Example #6
 def get_combo_optimizer_fp(self, modules):
     """
     Build optimizer.
     """
     assert isinstance(modules, tuple)
     param_groups = []
     for module in modules:
         assert hasattr(self, module)
         param_groups.extend(getattr(self, module).parameters())
     optimizer = get_optimizer(param_groups, self.params.optimizer)
     if self.params.fp16:
         optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
     return optimizer
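For illustration only, a hedged sketch of how these two helpers might be combined; the module names come from the assert in get_optimizer_fp and are otherwise assumptions:

# Assumption-labeled fragment: one optimizer for the decoder alone, one shared
# optimizer over encoder + decoder, both wrapped by FP16_Optimizer when
# params.fp16 is set.
self.dec_optimizer = self.get_optimizer_fp('decoder')
self.enc_dec_optimizer = self.get_combo_optimizer_fp(('encoder', 'decoder'))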
Example #7
 def set_parameters(self, params):
     """ ? """
     self.params = []
     self.sparse_params = []
     for k, p in params:
         if p.requires_grad:
             if self.method != 'sparseadam' or "embed" not in k:
                 self.params.append(p)
             else:
                 self.sparse_params.append(p)
     if self.method == 'sgd':
         self.optimizer = optim.SGD(self.params,
                                    lr=self.learning_rate,
                                    momentum=self.momentum)
     elif self.method == 'adagrad':
         self.optimizer = optim.Adagrad(self.params, lr=self.learning_rate)
         for group in self.optimizer.param_groups:
             for p in group['params']:
                 self.optimizer.state[p]['sum'] = self.optimizer\
                     .state[p]['sum'].fill_(self.adagrad_accum)
     elif self.method == 'adadelta':
         self.optimizer = optim.Adadelta(self.params, lr=self.learning_rate)
     elif self.method == 'adam':
         if self.fp16:
             from apex.optimizers import FusedAdam
             self.optimizer = FusedAdam(self.params,
                                        lr=self.learning_rate,
                                        betas=self.betas,
                                        eps=1e-9)
         else:
             self.optimizer = optim.Adam(self.params,
                                         lr=self.learning_rate,
                                         betas=self.betas,
                                         eps=1e-9)
     elif self.method == 'sparseadam':
         self.optimizer = MultipleOptimizer([
             optim.Adam(self.params,
                        lr=self.learning_rate,
                        betas=self.betas,
                        eps=1e-8),
             optim.SparseAdam(self.sparse_params,
                              lr=self.learning_rate,
                              betas=self.betas,
                              eps=1e-8)
         ])
     else:
         raise RuntimeError("Invalid optim method: " + self.method)
     if self.fp16:
         from apex.fp16_utils import FP16_Optimizer
         self.optimizer = FP16_Optimizer(self.optimizer,
                                         dynamic_loss_scale=True)
Example #8
File: trainer.py Project: wx-b/mcdons
    def set_optimizer(self):
        """
        Set optimizer parameters
        """
        if not self.model_conf.learn_center:
            if self.conf.optim == 'SGD':
                self.optimizer = getattr(optim, 'SGD')(
                    filter(lambda p: p.requires_grad, self.net.parameters()),
                    lr=self.conf.lr_init,
                    momentum=0.9,
                    nesterov=True,
                    weight_decay=self.conf.w_decay)  # default SGD
            else:
                self.optimizer = getattr(optim, self.conf.optim)(
                    filter(lambda p: p.requires_grad, self.net.parameters()),
                    lr=self.conf.lr_init,
                    weight_decay=self.conf.w_decay)  # optimizer named in self.conf.optim
        else:  # Learn center
            params_model = []
            params_center = []
            for n, p in self.net.named_parameters():
                if 'centers' in n and p.requires_grad:  # TODO: check if classifier is also better if separated
                    params_center.append(p)
                elif p.requires_grad:
                    params_model.append(p)

            if self.conf.optim == 'SGD':
                self.optimizer = getattr(optim,
                                         'SGD')(params_model,
                                                lr=self.conf.lr_init,
                                                momentum=0.9,
                                                nesterov=True,
                                                weight_decay=self.conf.w_decay)
                self.optimizer.add_param_group({'params': params_center})
            else:
                self.optimizer = getattr(optim, self.conf.optim)(
                    params_model,
                    lr=self.conf.lr_init,
                    weight_decay=self.conf.w_decay)
                self.optimizer.add_param_group({
                    'params': params_center
                })  # Other settings are same as the first group by default

        if self.model_conf.use_fp16:
            self.optimizer = FP16_Optimizer(self.optimizer)
        if self.conf.res:
            if self.tp.get_meta('optim') == self.conf.optim:
                if 'optim_state' in self.tp.meta_dict.keys():
                    self.optimizer.load_state_dict(
                        self.tp.get_meta('optim_state'))
                    print('Optimizer Internal State Restored')
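The resume branch above restores optimizer state through the wrapper. A hedged sketch of the matching save side, assuming the trainer's self.tp checkpoint helper has a set_meta counterpart to the get_meta calls shown; apex's FP16_Optimizer exposes state_dict()/load_state_dict() like a regular optimizer, so the same code covers both the wrapped and unwrapped cases:

    # Sketch under assumptions; set_meta is a hypothetical helper.
    def save_optimizer_state(self):
        self.tp.set_meta('optim', self.conf.optim)
        self.tp.set_meta('optim_state', self.optimizer.state_dict())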
Example #9
File: train.py Project: NicolasAG/SGinPG
def build_optimizer(parameters):
    if params.fp16:
        # Use apex's FP16 optimizer for mixed precision and loss scaling
        optimizer = FP16_Optimizer(AdamCosineWithWarmup(parameters,
                                                        betas=(0.9, 0.98),
                                                        eps=1e-6,
                                                        weight_decay=0.01),
                                   dynamic_loss_scale=True)
    else:
        optimizer = AdamCosineWithWarmup(parameters,
                                         betas=(0.9, 0.98),
                                         eps=1e-6,
                                         weight_decay=0.01)
    return optimizer
Example #10
def init_schedule(config, optimizer, train_loader):
    t_total = len(train_loader) * config.epochs
    warmup_steps = t_total * config.warmup_ratio
    if switch:  # `switch` is presumably an fp16 flag set in the surrounding module
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        scheduler = get_linear_schedule_with_warmup(
            optimizer.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    else:
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    # return the (possibly wrapped) optimizer as well, otherwise the
    # FP16_Optimizer wrapper created above would be lost to the caller
    return optimizer, scheduler
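A hedged sketch of the per-batch update implied by init_schedule, which returns both the (possibly FP16-wrapped) optimizer and the warmup scheduler; model, batch_loss and train_loader are placeholders:

# Assumption-labeled sketch, not project code.
optimizer, scheduler = init_schedule(config, optimizer, train_loader)

for batch in train_loader:
    optimizer.zero_grad()
    loss = batch_loss(model, batch)
    if switch:                  # fp16 path: the wrapper handles loss scaling
        optimizer.backward(loss)
    else:
        loss.backward()
    optimizer.step()            # steps the (possibly wrapped) optimizer
    scheduler.step()            # advances the linear warmup/decay schedule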
Example #11
    def run(self):
        """
        Run XNLI training / evaluation.
        """
        params = self.params

        # load data
        self.data = self.load_data()
        assert len(self.data['dico']) == self._embedder.n_words

        # embedder
        self.embedder = copy.deepcopy(self._embedder)
        self.embedder.cuda()

        # projection layer
        self.proj = nn.Sequential(
            *[nn.Dropout(params.dropout),
              nn.Linear(self.embedder.out_dim, 3)]).cuda()

        # float16
        if params.fp16:
            assert torch.backends.cudnn.enabled
            self.embedder.model = network_to_half(self.embedder.model)
            self.proj = network_to_half(self.proj)

        # optimizer
        self.optimizer = get_optimizer(
            list(self.embedder.get_parameters(params.finetune_layers)) +
            list(self.proj.parameters()), params.optimizer)
        if params.fp16:
            self.optimizer = FP16_Optimizer(self.optimizer,
                                            dynamic_loss_scale=True)

        # train and evaluate the model
        for epoch in range(params.n_epochs):

            # update epoch
            self.epoch = epoch

            # training
            logger.info("XNLI - Training epoch %i ..." % epoch)
            self.train()

            # evaluation
            logger.info("XNLI - Evaluating epoch %i ..." % epoch)
            with torch.no_grad():
                scores = self.eval()
                self.scores.update(scores)
Example #12
File: main.py Project: youzhonghui/ANL
def adjust_learning_rate(epoch, pack):
    if pack.optimizer is None:
        if cfg.train.optim == 'sgd' or cfg.train.optim is None:
            pack.optimizer = optim.SGD(pack.net.parameters(),
                                       lr=1,
                                       momentum=cfg.train.momentum,
                                       weight_decay=cfg.train.weight_decay,
                                       nesterov=cfg.train.nesterov)
        else:
            print('WRONG OPTIM SETTING!')
            assert False
        pack.lr_scheduler = optim.lr_scheduler.LambdaLR(
            pack.optimizer, get_lr_func())
        if cfg.base.fp16 and cfg.base.cuda:
            from apex.fp16_utils import FP16_Optimizer
            pack.optimizer = FP16_Optimizer(pack.optimizer,
                                            dynamic_loss_scale=True)

    pack.lr_scheduler.step(epoch)
    return pack.lr_scheduler.get_lr()
Example #13
    def __get_opimizer(self):
        weight_decay = self.args.train['weight_decay']
        if self.args.train['optimizer'] == 'SGD':
            optimizer = optim.SGD(self.net.parameters(), lr=self.getLR(0), momentum=0.9,
                                  weight_decay=weight_decay)
        elif self.args.train['optimizer'] == 'Adam':
            if self.half:
                optimizer = FusedAdam(self.net.parameters(), lr=self.getLR(0))
            else:
                optimizer = optim.Adam(self.net.parameters(), lr=self.getLR(0))
        elif self.args.train['optimizer'] == 'AdamW':
            if self.half:
                optimizer = FusedAdam(self.net.parameters(), lr=self.getLR(0), adam_w_mode=True)
            else:
                optimizer = optim.AdamW(self.net.parameters(), lr=self.getLR(0))
        if self.half:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True,
                                       dynamic_loss_args={'scale_factor': 3})
            optimizer.loss_scale = 512
        if self.args.train['LA'] == True:
            optimizer = Lookahead(optimizer, k=5, alpha=0.5)
        return optimizer
Example #14
def create_supervised_trainer(
        model: torch.nn.Module,
        optimizer: torch.optim.Optimizer,
        loss_fn: torch.nn.Module,
        max_norm: float = None,
        norm_type: int = 2,
        device: torch.cuda.device = None,
        non_blocking: bool = False,
        mixed_precision: bool = False,
        static_loss_scale: int = 512,
        prepare_batch: Callable = _prepare_batch) -> Engine:
    if device:
        model.to(device)

    if mixed_precision:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=static_loss_scale)

    def _process_function(engine: Engine, batch):
        model.train()
        optimizer.zero_grad()
        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)

        if mixed_precision:
            optimizer.backward(loss)
        else:
            loss.backward()

        if max_norm:
            clip_grad_norm_(model.parameters(), max_norm, norm_type)

        optimizer.step()
        return loss.item()

    return Engine(_process_function)
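One caveat with the trainer above: under mixed_precision, clip_grad_norm_(model.parameters(), ...) touches the model's fp16 gradients, while FP16_Optimizer steps on its fp32 master copies, so the clip may not affect the actual update. A hedged alternative for that branch, using the wrapper's clip_master_grads the way Example #16 does:

# Sketch only: drop-in replacement for the clipping branch of _process_function,
# reusing its max_norm / norm_type / mixed_precision variables.
if max_norm:
    if mixed_precision:
        # clips the fp32 master gradients that optimizer.step() will apply
        optimizer.clip_master_grads(max_norm, norm_type)
    else:
        clip_grad_norm_(model.parameters(), max_norm, norm_type)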
Example #15
def main(**kwargs):
    args = DefaultConfig()
    args.parse(kwargs)
    if not torch.cuda.is_available():
        args.cuda = False
        args.device = None
        torch.manual_seed(args.seed)  # set random seed for cpu

    train_iter, val_iter, test_iter, args.vocab_size, vectors = data.load_data(
        args)

    global best_score

    # init model
    model = getattr(models, args.model)(args, vectors)

    # where to save the model
    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
    save_path = os.path.join(args.save_dir,
                             '{}_{}.pth'.format(args.model, args.id))

    if args.cuda:
        #        torch.cuda.set_device(args.device)
        torch.cuda.manual_seed(args.seed)  # set random seed for gpu
        model.cuda()

    # loss function and optimizer
    criterion = F.cross_entropy
    lr1, lr2 = args.lr1, args.lr2
    #    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr1, betas=(0.9, 0.99))
    #    optimizer = model.get_optimizer(lr1, lr2, args.weight_decay)
    optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)

    for i in range(args.max_epochs):
        total_loss = 0.0
        correct = 0
        total = 0

        model.train()

        for idx, batch in enumerate(train_iter):
            # update the model parameters
            # batch size must not be 1 when the model uses BatchNorm layers
            if len(batch) == 1:
                continue
            text, label = batch.text, batch.label
            if args.cuda:
                text, label = text.cuda(), label.cuda()

            optimizer.zero_grad()
            pred = model(text)
            loss = criterion(pred, label)
            #loss.backward()
            optimizer.backward(loss)
            optimizer.step()

            # update running statistics
            total_loss += float(loss.item())
            predicted = pred.max(1)[1]
            total += label.size(0)
            correct += predicted.eq(label).sum().item()

            if idx % 80 == 79:
                # report the average loss over the last 80 batches
                print('[{}, {}] loss: {:.3f} | Acc: {:.3f}%({}/{})'.format(
                    i + 1, idx + 1, total_loss / 80, 100. * correct / total,
                    correct, total))
                total_loss = 0.0
        # compute the score on the validation set and adjust the learning rate accordingly
        f1score = val(model, val_iter, args)
        if f1score > best_score:
            best_score = f1score
            checkpoint = {'state_dict': model.state_dict(), 'config': args}
            torch.save(checkpoint, save_path)
            print('Best tmp model f1score: {}'.format(best_score))
        if f1score < best_score:
            #model.load_state_dict(torch.load(save_path)['state_dict'],map_location={'cuda:1':'cuda:0'})
            model.load_state_dict(torch.load(save_path)['state_dict'])
            lr1 *= args.lr_decay
            lr2 = 2e-4 if lr2 == 0 else lr2 * 0.8
            #optimizer = model.get_optimizer(lr1, lr2, 0)
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=lr1,
                                         betas=(0.9, 0.99))
            # re-wrap so the optimizer.backward(loss) call above keeps working
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
            print('* load previous best model: {}'.format(best_score))
            print('* model lr:{}  emb lr:{}'.format(lr1, lr2))
            if lr1 < args.min_lr:
                print('* training over, best f1 score: {}'.format(best_score))
                break

    # save the final trained model
    args.best_score = best_score
    final_model = {'state_dict': model.state_dict(), 'config': args}
    best_model_path = os.path.join(
        args.save_dir, '{}_{}_{}.pth'.format(args.model, args.text_type,
                                             best_score))
    torch.save(final_model, best_model_path)
    print('Best Final Model saved in {}'.format(best_model_path))

    # run the model on the test set and write the probability and submission files
    if not os.path.exists('result/'):
        os.mkdir('result/')
    probs, test_pred = test(model, test_iter, args)
    result_path = 'result/' + '{}_{}_{}'.format(args.model, args.id,
                                                args.best_score)
    np.save('{}.npy'.format(result_path), probs)
    print('Prob result {}.npy saved!'.format(result_path))

    test_pred[['id', 'class']].to_csv('{}.csv'.format(result_path), index=None)
    print('Result {}.csv saved!'.format(result_path))

    t2 = time.time()
    print('time use: {}'.format(t2 - t1))
Example #16
def main():

    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file if args.rank == 0 else None,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])

    LOGGER.timed_block_start("run")
    LOGGER.register_metric(tags.TRAIN_ITERATION_LOSS,
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("iter_time", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("epoch_time", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("run_time", metric_scope=dllg.RUN_SCOPE)
    LOGGER.register_metric("val_iter_loss", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_items/sec",
                           metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_avg_loss",
                           metric_scope=dllg.EPOCH_SCOPE)

    log_hardware()

    model_name = args.model_name
    parser = models.parse_model_args(model_name, parser)
    parser.parse_args()

    args = parser.parse_args()

    log_args(args)

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    distributed_run = args.world_size > 1
    if distributed_run:
        init_distributed(args, args.world_size, args.rank, args.group_name)

    LOGGER.log(key=tags.RUN_START)
    run_start_time = time.time()

    model_config = models.get_model_config(model_name, args)
    model = models.get_model(model_name,
                             model_config,
                             to_fp16=args.fp16_run,
                             to_cuda=True)

    epoch_start = 0
    if args.resume:
        resume_model_path = args.resume_tacotron2_path if args.model_name == "Tacotron2" else args.resume_waveglow_path
        checkpoint = torch.load(resume_model_path, map_location='cpu')
        epoch_start = checkpoint["epoch"]
        state_dict = checkpoint['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)

        model.load_state_dict(state_dict)
        print("restore model %s" % resume_model_path)

    if distributed_run:
        model = DDP(model)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    if args.fp16_run:
        optimizer = FP16_Optimizer(
            optimizer, dynamic_loss_scale=args.dynamic_loss_scaling)

    try:
        sigma = args.sigma
    except AttributeError:
        sigma = None

    criterion = loss_functions.get_loss_function(model_name, sigma)

    try:
        n_frames_per_step = args.n_frames_per_step
    except AttributeError:
        n_frames_per_step = None

    collate_fn = data_functions.get_collate_function(model_name,
                                                     n_frames_per_step)
    trainset = data_functions.get_data_loader(model_name, args.dataset_path,
                                              args.training_files, args)
    train_sampler = DistributedSampler(trainset) if distributed_run else None
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)

    valset = data_functions.get_data_loader(model_name, args.dataset_path,
                                            args.validation_files, args)

    batch_to_gpu = data_functions.get_batch_to_gpu(model_name)

    iteration = 0
    model.train()

    LOGGER.log(key=tags.TRAIN_LOOP)

    for epoch in range(epoch_start, args.epochs):
        LOGGER.epoch_start()
        epoch_start_time = time.time()
        LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)

        # used to calculate avg items/sec over epoch
        reduced_num_items_epoch = 0

        # used to calculate avg loss over epoch
        train_epoch_avg_loss = 0.0
        num_iters = 0

        # if overflow at the last iteration then do not save checkpoint
        overflow = False

        for i, batch in enumerate(train_loader):
            LOGGER.iteration_start()
            iter_start_time = time.time()
            LOGGER.log(key=tags.TRAIN_ITER_START, value=i)
            print("Batch: {}/{} epoch {}".format(i, len(train_loader), epoch))

            start = time.perf_counter()
            adjust_learning_rate(epoch, optimizer, args.learning_rate,
                                 args.anneal_steps, args.anneal_factor)

            model.zero_grad()
            x, y, num_items = batch_to_gpu(batch)

            if args.fp16_run:
                y_pred = model(fp32_to_fp16(x))
                loss = criterion(fp16_to_fp32(y_pred), y)
            else:
                y_pred = model(x)
                loss = criterion(y_pred, y)

            if distributed_run:
                reduced_loss = reduce_tensor(loss.data, args.world_size).item()
                reduced_num_items = reduce_tensor(num_items.data, 1).item()
            else:
                reduced_loss = loss.item()
                reduced_num_items = num_items.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss)

            train_epoch_avg_loss += reduced_loss
            num_iters += 1

            # accumulate number of items processed in this epoch
            reduced_num_items_epoch += reduced_num_items

            if args.fp16_run:
                optimizer.backward(loss)
                grad_norm = optimizer.clip_master_grads(args.grad_clip_thresh)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)

            optimizer.step()

            overflow = optimizer.overflow if args.fp16_run else False
            iteration += 1

            LOGGER.log(key=tags.TRAIN_ITER_STOP, value=i)

            iter_stop_time = time.time()
            iter_time = iter_stop_time - iter_start_time
            LOGGER.log(key="train_iter_items/sec",
                       value=(reduced_num_items / iter_time))
            LOGGER.log(key="iter_time", value=iter_time)
            LOGGER.iteration_stop()

        LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
        epoch_stop_time = time.time()
        epoch_time = epoch_stop_time - epoch_start_time

        LOGGER.log(key="train_epoch_items/sec",
                   value=(reduced_num_items_epoch / epoch_time))
        LOGGER.log(key="train_epoch_avg_loss",
                   value=(train_epoch_avg_loss /
                          num_iters if num_iters > 0 else 0.0))
        LOGGER.log(key="epoch_time", value=epoch_time)

        LOGGER.log(key=tags.EVAL_START, value=epoch)

        validate(model, criterion, valset, iteration, args.batch_size,
                 args.world_size, collate_fn, distributed_run, args.rank,
                 batch_to_gpu, args.fp16_run)

        LOGGER.log(key=tags.EVAL_STOP, value=epoch)

        if not overflow and (epoch % args.epochs_per_checkpoint
                             == 0) and args.rank == 0:
            checkpoint_path = os.path.join(
                args.output_directory,
                "checkpoint_{}_{}".format(model_name, epoch))
            save_checkpoint(model, epoch, model_config, checkpoint_path)
            save_sample(
                model_name, model, args.waveglow_checkpoint,
                args.tacotron2_checkpoint, args.phrase_path,
                os.path.join(args.output_directory,
                             "sample_{}_{}.wav".format(model_name, iteration)),
                args.sampling_rate, args.fp16_run)

        LOGGER.epoch_stop()

    run_stop_time = time.time()
    run_time = run_stop_time - run_start_time
    LOGGER.log(key="run_time", value=run_time)
    LOGGER.log(key=tags.RUN_FINAL)

    print("training time", run_stop_time - run_start_time)

    LOGGER.timed_block_stop("run")

    if args.rank == 0:
        LOGGER.finish()
Example #17
model.total_parameters()
model.initialize_weights_xavier_uniform()

model = network_to_half(model)
model = model.cuda()
model.load_state_dict(torch.load("CARN_model_checkpoint.pt"))

learning_rate = 1e-4
weight_decay = 1e-6
optimizer = optim.Adam(model.parameters(),
                       lr=learning_rate,
                       weight_decay=weight_decay,
                       amsgrad=True)
# optimizer = optim.SGD(model.parameters(), momentum=0.9, nesterov=True, weight_decay=weight_decay, lr=learning_rate)

optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0, verbose=False)
# optimizer.load_state_dict(torch.load("CARN_adam_checkpoint.pt"))

last_iter = -1  # torch.load("CARN_scheduler_last_iter")
scheduler = CyclicLR(optimizer.optimizer,
                     base_lr=1e-4,
                     max_lr=1e-4,
                     step_size=3 * total_batch,
                     mode="triangular",
                     last_batch_iteration=last_iter)
train_loss = []
train_ssim = []
train_psnr = []

test_loss = []
test_ssim = []
torch.backends.cudnn.benchmark = True

Example #18
N, D_in, D_out = 64, 1024, 16

x = Variable(torch.cuda.FloatTensor(N, D_in ).normal_()).half()
y = Variable(torch.cuda.FloatTensor(N, D_out).normal_()).half()

model = torch.nn.Linear(D_in, D_out).cuda().half()
model = torch.nn.parallel.DistributedDataParallel(model,
                                                  device_ids=[args.local_rank],
                                                  output_device=args.local_rank)

optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
### CONSTRUCT FP16_Optimizer ###
optimizer = FP16_Optimizer(optimizer)
###

loss_fn = torch.nn.MSELoss()

for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    ### CHANGE loss.backward() TO: ###
    optimizer.backward(loss)
    ###
    optimizer.step()

print("final loss = ", loss)
Example #19
def train_network(net, model_ckpt, fold=0):
    # train the network, allow for keyboard interrupt
    try:
        # define optimizer
        # optimizer = optim.SGD(net.parameters(), lr=config.lr, momentum=0.9, weight_decay=configs.l2)

        if config.fp16:
            optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                          net.parameters()),
                                   lr=config.lr,
                                   eps=1e-04)

            from apex.fp16_utils import FP16_Optimizer
            optimizer = FP16_Optimizer(optimizer,
                                       dynamic_loss_scale=True,
                                       verbose=False)

        else:
            optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                          net.parameters()),
                                   lr=config.lr)

        valid_patience = 0
        best_val_loss = None
        best_val_f1 = None
        cycle = 0
        t_ = 0

        if args.resume:
            net, optimizer, start_epoch, best_val_loss = load_checkpoint(
                net, optimizer, model_ckpt)

        if config.reduce_lr_plateau:
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, 'min', config.lr_scale, config.lr_patience, True)

        if config.cosine_annealing:
            cos_lr, cycle_ends = cosine_annealing_lr(config.min_lr,
                                                     config.max_lr,
                                                     config.cycle_size,
                                                     config.epochs,
                                                     config.cycle_size_inc)

        # get the loaders
        train_loader, valid_loader = get_data_loaders(
            imsize=config.imsize,
            num_channels=config.num_channels,
            batch_size=config.batch_size,
            test_size=config.test_size,
            num_workers=config.num_workers,
            preload=config.preload_data,
            external_data=config.external_data,
            mixup=config.mixup)

        # loss = F1Loss()
        if hasattr(config, 'focal_gamma'):
            loss = FocalLoss(config.focal_gamma)
        else:
            loss = FocalLoss()
        # loss = nn.BCEWithLogitsLoss().cuda()
        # if hasattr(config, 'focal_gamma'):
        #     loss = FocalTverskyLoss(gamma = config.focal_gamma)
        # else:
        #     loss = FocalTverskyLoss()

        # training flags
        freeze_bn = False
        save_imgs = False
        train_losses = []
        valid_losses = []
        valid_f1s = []
        lr_hist = []

        print('Training ...')
        print('Saving to ', model_ckpt)
        for e in range(config.epochs):
            print('\n' + 'Epoch {}/{}'.format(e, config.epochs))

            start = time.time()

            t_l = train(net, optimizer, loss, train_loader, freeze_bn)

            v_l, v_f1 = valid(net, optimizer, loss, valid_loader, save_imgs,
                              fold)

            if config.reduce_lr_plateau:
                scheduler.step(v_l)

            if config.cosine_annealing:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = cos_lr[e]
                if (e in cycle_ends):
                    cycle = np.where(cycle_ends == e)[0][0] + 1
                    net.eval()
                    torch.save(
                        net.state_dict(),
                        model_ckpt.replace('best', 'cycle{}'.format(cycle)))
                    print("Cycle {} completed. Saving model to {}".format(
                        cycle,
                        model_ckpt.replace('best', 'cycle{}'.format(cycle))))

            lr_hist.append(optimizer.param_groups[0]['lr'])

            state = {
                'epoch': e,
                'arch': config.model_name,
                'state_dict': net.state_dict(),
                'best_val_loss': best_val_loss,
                'optimizer': optimizer.state_dict(),
            }

            # save the model on best validation loss
            if best_val_loss is None or v_l < best_val_loss:
                best_val_loss = v_l

                net.eval()

                torch.save(state, model_ckpt)
                valid_patience = 0
                print('Best val loss achieved. loss = {:.4f}.'.format(v_l),
                      " Saving model to ", model_ckpt)

            # save the model on best validation f1
            # if best_val_f1 is None or v_f1 > best_val_f1:
            #     net.eval()
            #     torch.save(net.state_dict(), model_ckpt.replace('best', 'bestf1'))
            #     best_val_f1 = v_f1
            #     valid_patience = 0
            #     print('Best val F1 achieved. F1 = {:.4f}.'.
            #         format(v_f1), " Saving model to ", model_ckpt.replace('best', 'bestf1'))

            # if (e > 5):
            #     SUBM_OUT = './subm/{}_{}_epoch{}.csv'.format(
            #                     config.model_name, config.exp_name, str(e))
            #     generate_submission(net, config, SUBM_OUT)

            else:
                valid_patience += 1

            torch.save(state, model_ckpt.replace('best', 'latest'))

            train_losses.append(t_l)
            valid_losses.append(v_l)
            valid_f1s.append(v_f1)

            log_metrics(train_losses, valid_losses, valid_f1s, lr_hist, e,
                        model_ckpt, config)

            t_ += 1
            print('Time: {:d}s'.format(int(time.time() - start)))

    except KeyboardInterrupt:
        pass

    gen_sub = input(
        "\n\nGenerate submission while the GPU is still hot from training? [Y/n]: "
    )
    if gen_sub in ['Y', 'y', 'Yes', 'yes']:
        generate_submission(net, config)
Example #20
    def initialize(self, opt):
        BaseModel.initialize(self, opt)
        if opt.half:
            try:
                from apex.fp16_utils import FP16_Optimizer
            except ImportError:
                print("Please install NVIDIA Apex for safe mixed precision")

        # specify the training losses you want to print out. The program will call base_model.get_current_losses
        self.loss_names = ['D_A', 'G_A', 'cycle_A', 'idt_A', 'D_B', 'G_B', 'cycle_B', 'idt_B']
        # specify the images you want to save/display. The program will call base_model.get_current_visuals
        visual_names_A = ['real_A', 'fake_B', 'rec_A']
        visual_names_B = ['real_B', 'fake_A', 'rec_B']
        if self.isTrain and self.opt.lambda_identity > 0.0:
            visual_names_A.append('idt_A')
            visual_names_B.append('idt_B')

        self.visual_names = visual_names_A + visual_names_B
        # specify the models you want to save to the disk. The program will call base_model.save_networks and base_model.load_networks
        if self.isTrain:
            self.model_names = ['G_A', 'G_B', 'D_A', 'D_B']
        else:  # during test time, only load Gs
            self.model_names = ['G_A', 'G_B']

        # load/define networks
        # The naming convention is different from the one used in the paper
        # Code (paper): G_A (G), G_B (F), D_A (D_Y), D_B (D_X)
        self.netG_A = networks.define_G(opt.input_nc, opt.output_nc, opt.ngf, opt.netG, opt.norm,
                                        not opt.no_dropout, opt.init_type, opt.init_gain, self.gpu_ids)
        self.netG_B = networks.define_G(opt.output_nc, opt.input_nc, opt.ngf, opt.netG, opt.norm,
                                        not opt.no_dropout, opt.init_type, opt.init_gain, self.gpu_ids)

        if opt.half:
            self.netG_A = self.netG_A.half()
            self.netG_B = self.netG_B.half()

        if self.isTrain:
            use_sigmoid = opt.no_lsgan
            self.netD_A = networks.define_D(opt.output_nc, opt.ndf, opt.netD,
                                            opt.n_layers_D, opt.norm, use_sigmoid, opt.init_type, opt.init_gain, self.gpu_ids)
            self.netD_B = networks.define_D(opt.input_nc, opt.ndf, opt.netD,
                                            opt.n_layers_D, opt.norm, use_sigmoid, opt.init_type, opt.init_gain, self.gpu_ids)

            self.fake_A_pool = ImagePool(opt.pool_size)
            self.fake_B_pool = ImagePool(opt.pool_size)
            # define loss functions
            self.criterionGAN = networks.GANLoss(use_lsgan=not opt.no_lsgan, half_precision=opt.half).to(self.device)
            self.criterionCycle = torch.nn.L1Loss()
            self.criterionIdt = torch.nn.L1Loss()
            # initialize optimizers
            self.optimizer_G = torch.optim.Adam(itertools.chain(self.netG_A.parameters(), self.netG_B.parameters()),
                                                lr=opt.lr, betas=(opt.beta1, 0.999))
            self.optimizer_D = torch.optim.Adam(itertools.chain(self.netD_A.parameters(), self.netD_B.parameters()),
                                                lr=opt.lr, betas=(opt.beta1, 0.999))
            if opt.half:
                self.netD_A = self.netD_A.half()
                self.netD_B = self.netD_B.half()
                self.optimizer_G = FP16_Optimizer(self.optimizer_G, dynamic_loss_scale=True)
                self.optimizer_D = FP16_Optimizer(self.optimizer_D, dynamic_loss_scale=True)

            self.optimizers = []
            self.optimizers.append(self.optimizer_G)
            self.optimizers.append(self.optimizer_D)
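The two FP16_Optimizer wrappers built above are consumed in the model's update methods elsewhere; a hedged sketch of how a generator update might route its backward pass through the wrapper when opt.half is set (backward_and_step_G, backward_G and loss_G are hypothetical names modeled on the usual CycleGAN structure):

    # Sketch under assumptions, not the project's actual optimize_parameters code.
    def backward_and_step_G(self):
        self.optimizer_G.zero_grad()
        self.backward_G()                            # computes and stores self.loss_G
        if self.opt.half:
            self.optimizer_G.backward(self.loss_G)   # FP16_Optimizer scaling path
        else:
            self.loss_G.backward()
        self.optimizer_G.step()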
Example #21
    def init_model(self):
        n_layer = int(
            (self.depth - 1) /
            2)  # depth = n_layer * (multi-head + ffn) + linear-softmax
        d_model = self.width
        d_inner = self.width * 2
        vocab_size = self.vocab
        tgt_len = self.bptt_len

        if self.d_embed < 0:
            self.d_embed = d_model

        # Mixed-floating point precision (if fp16 is enabled, storage will be with half-precision)
        if self.fp16 and 'cuda' not in self.device:
            print('WARNING: fp16 requires cuda, ignoring fp16 option')
            self.fp16 = False
        elif self.fp16:
            try:
                from apex.fp16_utils import FP16_Optimizer
                self.optimizer = FP16_Optimizer(
                    self.optimizer,
                    static_loss_scale=args.static_loss_scale,
                    dynamic_loss_scale=args.dynamic_loss_scale,
                    dynamic_loss_args={'init_scale': 2**16})
            except:
                print('WARNING: apex not installed, ignoring fp16 option')
                self.fp16 = False

        if self.restart:
            with open(os.path.join(restart_dir, 'model.pt'), 'rb') as f:
                model = torch.load(f)
            if not self.fp16:
                model = model.float()
            model.apply(self.update_dropout)
            model.apply(self.update_dropatt)
        else:
            model = MemTransformerLM(vocab_size,
                                     n_layer,
                                     self.n_head,
                                     d_model,
                                     self.d_head,
                                     d_inner,
                                     self.dropout,
                                     self.dropatt,
                                     tie_weight=self.tied,
                                     d_embed=self.d_embed,
                                     div_val=self.div_val,
                                     tie_projs=[False],
                                     pre_lnorm=self.pre_lnorm,
                                     tgt_len=self.tgt_len,
                                     ext_len=self.ext_len,
                                     mem_len=self.mem_len,
                                     cutoffs=[],
                                     same_length=self.same_length,
                                     attn_type=self.attn_type,
                                     clamp_len=self.clamp_len,
                                     sample_softmax=-1)
            model.apply(self.weights_init)
            model.word_emb.apply(
                self.weights_init
            )  # ensure embedding init is not overridden by out_layer in case of weight sharing

        self.model = model
        self.n_all_param = sum([p.nelement() for p in model.parameters()])
        self.n_nonemb_param = sum(
            [p.nelement() for p in model.layers.parameters()])

        if self.multi_gpu:
            self.model = self.model.to(self.device)
            if self.gpu0_bsz >= 0:
                self.para_model = BalancedDataParallel(self.gpu0_bsz,
                                                       self.model,
                                                       dim=1).to(self.device)
            else:
                self.para_model = nn.DataParallel(self.model,
                                                  dim=1).to(self.device)
        else:
            self.para_model = self.model.to(self.device)

        return model
Example #22
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=100,
                        metavar='N',
                        help='input batch size for training (default: 100)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=100,
                        metavar='N',
                        help='input batch size for testing (default: 100)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        metavar='N',
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.1,
                        metavar='LR',
                        help='learning rate (default: 0.1)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.9,
                        metavar='M',
                        help='SGD momentum (default: 0.9)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=17,
                        metavar='S',
                        help='random seed (default: 17)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=100,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        type=str,
                        default='./models/model.pt',
                        help='For Saving the current Model')
    parser.add_argument('--data', type=str, default='mnist')
    parser.add_argument('--augment', action='store_true')
    parser.add_argument('--decay_frequency', type=int, default=25000)
    parser.add_argument('--l1', type=float, default=0.0)
    parser.add_argument('--fp16',
                        action='store_true',
                        help='Run in fp16 mode.')
    parser.add_argument('--valid_split', type=float, default=0.1)
    parser.add_argument('--resume', type=str)
    parser.add_argument('--start-epoch', type=int, default=1)
    parser.add_argument('--model', type=str, default='')
    parser.add_argument('--l2', type=float, default=5.0e-4)
    parser.add_argument(
        '--iterations',
        type=int,
        default=1,
        help=
        'How many times the model should be run after each other. Default=1')
    parser.add_argument(
        '--save-features',
        action='store_true',
        help=
        'Resumes a saved model and saves its feature data to disk for plotting.'
    )
    parser.add_argument(
        '--bench',
        action='store_true',
        help='Enables the benchmarking of layers and estimates sparse speedups'
    )
    sparselearning.core.add_sparse_args(parser)

    args = parser.parse_args()

    if args.fp16:
        try:
            from apex.fp16_utils import FP16_Optimizer
        except:
            print('WARNING: apex not installed, ignoring --fp16 option')
            args.fp16 = False

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    print_and_log('\n\n')
    print_and_log('=' * 80)
    print_and_log('=' * 80)
    print_and_log(args)
    torch.manual_seed(args.seed)
    for i in range(args.iterations):
        print_and_log("\nIteration start: {0}/{1}\n".format(
            i + 1, args.iterations))

        if args.data == 'mnist':
            train_loader, valid_loader, test_loader = get_mnist_dataloaders(
                args, validation_split=args.valid_split)
        else:
            train_loader, valid_loader, test_loader = get_cifar10_dataloaders(
                args, args.valid_split)

        if args.model not in models:
            print(
                'You need to select an existing model via the --model argument. Available models include: '
            )
            for key in models:
                print('\t{0}'.format(key))
            raise Exception('You need to select a model')
        else:
            cls, cls_args = models[args.model]
            cls_args.append(args.save_features)
            cls_args.append(args.bench)
            model = cls(*cls_args).to(device)
            print_and_log(model)
            print_and_log('=' * 60)
            print_and_log(args.model)
            print_and_log('=' * 60)

            print_and_log('=' * 60)
            print_and_log('Death mode: {0}'.format(args.death))
            print_and_log('Growth mode: {0}'.format(args.growth))
            print_and_log('Redistribution mode: {0}'.format(
                args.redistribution))
            print_and_log('=' * 60)

        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.l2,
                              nesterov=True)
        lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
                                                 args.decay_frequency,
                                                 gamma=0.1)

        if args.resume:
            if os.path.isfile(args.resume):
                print_and_log("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(args.resume)
                args.start_epoch = checkpoint['epoch']
                model.load_state_dict(checkpoint['state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                print_and_log("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
                print_and_log('Testing...')
                evaluate(args, model, device, test_loader)
                plot_class_feature_histograms(args, model, device,
                                              train_loader, optimizer)
            else:
                print_and_log("=> no checkpoint found at '{}'".format(
                    args.resume))

        if args.fp16:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=None,
                                       dynamic_loss_scale=True,
                                       dynamic_loss_args={'init_scale': 2**16})
            model = model.half()

        mask = None
        if args.sparse:
            decay = CosineDecay(args.death_rate,
                                len(train_loader) * (args.epochs))
            mask = Masking(optimizer,
                           death_mode=args.death,
                           death_rate_decay=decay,
                           growth_mode=args.growth,
                           redistribution_mode=args.redistribution)
            mask.add_module(model, density=args.density)

        for epoch in range(1, args.epochs + 1):

            t0 = time.time()
            train(args, model, device, train_loader, optimizer, epoch,
                  lr_scheduler, mask)

            if args.valid_split > 0.0:
                val_acc = evaluate(args, model, device, valid_loader)

            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()
                },
                is_best=False,
                filename=args.save_model)

            if args.sparse and epoch < args.epochs:
                mask.at_end_of_epoch()

            print_and_log(
                'Current learning rate: {0}. Time taken for epoch: {1}.\n'.
                format(optimizer.param_groups[0]['lr'],
                       time.time() - t0))

        evaluate(args, model, device, test_loader)
        print_and_log("\nIteration end: {0}/{1}\n".format(
            i + 1, args.iterations))
Example #23
def main():
    global args, best_prec1

    args = parser.parse_args()
    setup_logger(args)

    if args.fp16:
        try:
            from apex.fp16_utils import FP16_Optimizer
        except:
            print_and_log(
                'WARNING: apex not installed, ignoring --fp16 option')
            args.fp16 = False

    kwargs = {'num_workers': 1, 'pin_memory': True}
    dataset = args.model.split('_')[0]
    if dataset == 'mnist':
        full_dataset = datasets.MNIST('./data',
                                      train=True,
                                      download=True,
                                      transform=transforms.Compose([
                                          transforms.ToTensor(),
                                          transforms.Normalize((0.1307, ),
                                                               (0.3081, ))
                                      ]))

        if not (args.validate_set):
            train_loader = torch.utils.data.DataLoader(
                full_dataset,
                batch_size=args.batch_size,
                shuffle=True,
                **kwargs)
            val_loader = None
        else:
            train_dataset = split_dataset(full_dataset, split_end=50000)
            val_dataset = split_dataset(full_dataset, split_start=50000)
            train_loader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=args.batch_size,
                shuffle=True,
                **kwargs)
            val_loader = torch.utils.data.DataLoader(
                val_dataset,
                batch_size=args.batch_size,
                shuffle=False,
                **kwargs)

        test_loader = torch.utils.data.DataLoader(datasets.MNIST(
            './data',
            train=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ])),
                                                  batch_size=args.batch_size,
                                                  shuffle=False,
                                                  **kwargs)

    elif dataset == 'cifar10':
        normalize = transforms.Normalize(
            mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
            std=[x / 255.0 for x in [63.0, 62.1, 66.7]])

        if args.augment:
            transform_train = transforms.Compose([
                transforms.ToTensor(),
                transforms.Lambda(lambda x: F.pad(x.unsqueeze(0), (4, 4, 4, 4),
                                                  mode='reflect').squeeze()),
                transforms.ToPILImage(),
                transforms.RandomCrop(32),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ])
        else:
            transform_train = transforms.Compose([
                transforms.ToTensor(),
                normalize,
            ])
        transform_test = transforms.Compose([transforms.ToTensor(), normalize])

        full_dataset = datasets.CIFAR10('./data',
                                        train=True,
                                        download=True,
                                        transform=transform_train)

        if not (args.validate_set):
            train_loader = torch.utils.data.DataLoader(
                full_dataset,
                batch_size=args.batch_size,
                shuffle=True,
                **kwargs)
            val_loader = None
        else:
            train_dataset = split_dataset(full_dataset, split_end=45000)
            val_dataset = split_dataset(full_dataset, split_start=45000)
            train_loader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=args.batch_size,
                shuffle=True,
                **kwargs)
            val_loader = torch.utils.data.DataLoader(
                val_dataset,
                batch_size=args.batch_size,
                shuffle=True,
                **kwargs)

        test_loader = torch.utils.data.DataLoader(datasets.CIFAR10(
            './data', train=False, transform=transform_test),
                                                  batch_size=args.batch_size,
                                                  shuffle=True,
                                                  **kwargs)

    elif dataset == 'imagenet':
        if not (args.data):
            raise Exception(
                'need to specify imagenet dataset location using the --data argument'
            )
        traindir = os.path.join(args.data, 'train')
        valdir = os.path.join(args.data, 'val')
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

        full_dataset = datasets.ImageFolder(
            traindir,
            transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]))

        train_sampler = None

        if not (args.validate_set):
            train_loader = torch.utils.data.DataLoader(
                full_dataset,
                batch_size=args.batch_size,
                shuffle=(train_sampler is None),
                num_workers=args.workers,
                pin_memory=True,
                sampler=train_sampler)
            val_loader = None

        else:
            train_dataset = split_dataset(full_dataset,
                                          split_end=len(full_dataset) - 10000)
            val_dataset = split_dataset(full_dataset,
                                        split_start=len(full_dataset) - 10000)
            train_loader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=args.batch_size,
                shuffle=(train_sampler is None),
                num_workers=args.workers,
                pin_memory=True,
                sampler=train_sampler)

            val_loader = torch.utils.data.DataLoader(
                val_dataset,
                batch_size=args.batch_size,
                shuffle=False,
                num_workers=4,
                pin_memory=True)

        test_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
                                                  batch_size=args.batch_size,
                                                  shuffle=False,
                                                  num_workers=args.workers,
                                                  pin_memory=True)

    else:
        raise RuntimeError(
            'Unknown dataset {}. Dataset is first segment of network name'.
            format(dataset))

    print_and_log(args)
    with open(args.schedule_file, 'r') as stream:
        try:
            loaded_schedule = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print_and_log(exc)

    if args.model == 'mnist_mlp':
        model = mnist_mlp(initial_sparsity=args.initial_sparsity_fc,
                          sparse=not (args.tied),
                          no_batch_norm=args.no_batch_norm)
    elif args.model == 'cifar10_WideResNet':
        model = cifar10_WideResNet(
            args.layers,
            widen_factor=args.widen_factor,
            initial_sparsity_conv=args.initial_sparsity_conv,
            initial_sparsity_fc=args.initial_sparsity_fc,
            sub_kernel_granularity=args.sub_kernel_granularity,
            sparse=not (args.tied))

    elif args.model == 'imagenet_resnet50':
        model = imagenet_resnet50(
            initial_sparsity_conv=args.initial_sparsity_conv,
            initial_sparsity_fc=args.initial_sparsity_fc,
            sub_kernel_granularity=args.sub_kernel_granularity,
            widen_factor=args.widen_factor,
            vanilla_conv1=True,
            vanilla_conv3=True,
            vanilla_downsample=True,
            sparse=not args.sparse_momentum)
    else:
        raise RuntimeError('unrecognized model name ' + repr(args.model))

    model = model.cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                nesterov=args.nesterov,
                                weight_decay=args.weight_decay)

    if args.fp16:
        print_and_log('FP16')
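        # Dynamic loss scaling: the scale starts at init_scale (2**16 here); apex's scaler
        # typically shrinks it when gradients overflow and grows it back after a stretch of
        # overflow-free steps, so no manual loss-scale tuning should be needed.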
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=None,
                                   dynamic_loss_scale=True,
                                   dynamic_loss_args={'init_scale': 2**16})
        model = model.half()

    mask = None
    if not args.dense:
        decay = CosineDecay(args.prune_rate, len(train_loader) * (args.epochs))
        mask = Masking(optimizer,
                       decay,
                       prune_rate=args.prune_rate,
                       prune_mode='magnitude',
                       growth_mode=args.growth,
                       redistribution_mode=args.redistribution,
                       verbose=True,
                       fp16=args.fp16)
        mask.add_module(model, density=args.density)
        #mask.remove_weight_partial_name('downsample', verbose=True)
        #mask.remove_weight('conv1.weight')

    if dataset == 'imagenet':
        print_and_log('setting up data parallel')
        model = torch.nn.DataParallel(model).cuda()
        base_model = model.module
    else:
        base_model = model

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print_and_log("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            #args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            if 'optimizer' in checkpoint:
                optimizer.load_state_dict(checkpoint['optimizer'])
                print_and_log('OPTIM')
                if mask is not None:
                    mask.optimizer = optimizer
            print_and_log("=> loaded checkpoint '{}' ".format(args.resume))
        else:
            print_and_log("=> no checkpoint found at '{}'".format(args.resume))

    if args.copy_mask_from:
        if os.path.isfile(args.copy_mask_from):
            print_and_log("=> loading mask data '{}'".format(
                args.copy_mask_from))
            mask_data = torch.load(args.copy_mask_from)
            filtered_mask_data = collections.OrderedDict([
                (x, y) for (x, y) in mask_data['state_dict'].items()
                if 'mask' in x
            ])
            model.load_state_dict(filtered_mask_data, strict=False)
        else:
            print_and_log("=> no mask checkpoint found at '{}'".format(
                args.copy_mask_from))

    # get the number of model parameters
    model_size = base_model.get_model_size()

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    train_loss_l = []
    test_loss_l = []
    train_prec1_l = []
    test_prec1_l = []
    train_prec5_l = []
    test_prec5_l = []

    val_loss_l = []
    val_prec1_l = []
    val_prec5_l = []

    prune_mode = args.prune_mode
    print_and_log('PRUNE MODE ' + str(prune_mode))

    start_pruning_after_epoch_n = args.start_pruning_after_epoch
    prune_every_epoch_n = args.prune_epoch_frequency
    prune_iterations = args.prune_iterations
    post_prune_epochs = args.post_prune_epochs

    filename = args.model + '_' + repr(args.job_idx)
    n_prunes_done = 0

    if prune_mode:
        ## Special consideration so that pruning mnist_mlp does not use less than 100 parameters in the top layer after pruning
        if args.prune_target_sparsity_fc > 0.9 and args.model == 'mnist_mlp':
            total_available_weights = (1. - args.prune_target_sparsity_fc) * (
                784 * 300 + 300 * 100 + 100 * 10) - 100
            prune_target_sparsity_special = 0.9
            prune_target_sparsity_fc = 1. - total_available_weights / (
                784 * 300 + 300 * 100)
        else:
            prune_target_sparsity_fc = prune_target_sparsity_special = args.prune_target_sparsity_fc

        prune_fraction_fc = 1.0 - (1 - prune_target_sparsity_fc)**(
            1.0 / prune_iterations)
        prune_fraction_conv = 1.0 - (1 - args.prune_target_sparsity_conv)**(
            1.0 / prune_iterations)

        prune_fraction_fc_special = 1.0 - (
            1 - prune_target_sparsity_special)**(1.0 / prune_iterations)

        cubic_pruning_multipliers = (
            1 - np.arange(prune_iterations + 1) / prune_iterations)**3.0
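        # Cubic sparsity schedule (in the spirit of Zhu & Gupta's gradual pruning): after
        # prune iteration i the kept density is 1 - s + s * (1 - i/N)**3, falling from 1.0
        # at i = 0 to the target density 1 - s at i = N. get_prune_fraction_cubic below
        # returns the fraction of the currently remaining weights to drop at step i so the
        # density follows that curve.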

        def get_prune_fraction_cubic(current_prune_iter, final_sparsity):
            return 1 - (1 - final_sparsity + final_sparsity *
                        cubic_pruning_multipliers[current_prune_iter + 1]) / (
                            1 - final_sparsity + final_sparsity *
                            cubic_pruning_multipliers[current_prune_iter])

        nEpochs_to_prune = int(start_pruning_after_epoch_n +
                               prune_every_epoch_n *
                               (prune_iterations - 1)) + post_prune_epochs
        print_and_log(
            'prune fraction fc : {} , prune_fraction conv : {} '.format(
                prune_fraction_fc, prune_fraction_conv))
        print_and_log('nepochs ' + repr(nEpochs_to_prune))

        filename += '_target_' + repr(
            args.prune_target_sparsity_fc) + ',' + repr(
                args.prune_target_sparsity_conv)
        validate(test_loader, model, criterion, 1, 'validate')

    save_checkpoint(
        {
            'model_size': base_model.get_model_size(),
            'model_name': args.model,
            'state_dict': model.state_dict(),
            'args': args
        },
        filename=filename + '_initial')

    current_iteration = 0
    lr_schedule = loaded_schedule['lr_schedule']
    rewire_schedule = loaded_schedule['rewire_period_schedule']
    DeepR_temperature_schedule = loaded_schedule['DeepR_temperature_schedule']
    threshold = 1.0e-3
    if args.resume:
        print_and_log("Validating...")
        validate(test_loader, model, criterion, 1, 'validate')
    for epoch in range(args.start_epoch,
                       nEpochs_to_prune if prune_mode else args.epochs):
        adjust_learning_rate(optimizer, epoch, lr_schedule)
        rewire_period = get_schedule_val(rewire_schedule, epoch)
        DeepR_temperature = get_schedule_val(DeepR_temperature_schedule, epoch)
        print_and_log('rewiring every {} iterations'.format(rewire_period))

        t1 = time.time()
        current_iteration, threshold = train(mask, train_loader, model,
                                             criterion, optimizer, epoch,
                                             current_iteration, rewire_period,
                                             DeepR_temperature, threshold)
        print_and_log('epoch time ' + repr(time.time() - t1))

        if prune_mode and epoch >= start_pruning_after_epoch_n and (
                epoch - start_pruning_after_epoch_n
        ) % prune_every_epoch_n == 0 and n_prunes_done < prune_iterations:
            if args.cubic_prune_schedule:
                base_model.prune(
                    get_prune_fraction_cubic(n_prunes_done,
                                             prune_target_sparsity_fc),
                    get_prune_fraction_cubic(n_prunes_done,
                                             args.prune_target_sparsity_conv),
                    get_prune_fraction_cubic(n_prunes_done,
                                             prune_target_sparsity_special))
            else:
                base_model.prune(prune_fraction_fc, prune_fraction_conv,
                                 prune_fraction_fc_special)
            n_prunes_done += 1
            print_and_log(base_model.get_model_size())

        if not (args.no_validate_train):
            prec1_train, prec5_train, loss_train = validate(
                train_loader, model, criterion, epoch, 'train')
        else:
            prec1_train, prec5_train, loss_train = 0.0, 0.0, 0.0

        if args.validate_set:
            prec1_val, prec5_val, loss_val = validate(val_loader, model,
                                                      criterion, epoch,
                                                      'validate')
        else:
            prec1_val, prec5_val, loss_val = 0.0, 0.0, 0.0

        prec1_test, prec5_test, loss_test = validate(test_loader, model,
                                                     criterion, epoch, 'test')

        test_loss_l.append(loss_test)
        train_loss_l.append(loss_train)
        val_loss_l.append(loss_val)

        test_prec1_l.append(prec1_test)
        train_prec1_l.append(prec1_train)
        val_prec1_l.append(prec1_val)

        test_prec5_l.append(prec5_test)
        train_prec5_l.append(prec5_train)
        val_prec5_l.append(prec5_val)

        # remember best prec@1 and save checkpoint
        filenames = [filename]
        if epoch == args.stop_rewire_epoch:
            filenames += [filename + '_StopRewiringPoint_' + repr(epoch)]
        for f in filenames:
            save_checkpoint(
                {
                    'model_size': base_model.get_model_size(),
                    'test_loss': test_loss_l,
                    'train_loss': train_loss_l,
                    'val_loss': val_loss_l,
                    'test_prec1': test_prec1_l,
                    'train_prec1': train_prec1_l,
                    'val_prec1': val_prec1_l,
                    'test_prec5': test_prec5_l,
                    'train_prec5': train_prec5_l,
                    'val_prec5': val_prec5_l,
                    'model_name': args.model,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'epoch': epoch + 1,
                    'args': args
                },
                filename=f)

        if not args.dense and epoch < args.epochs:
            mask.at_end_of_epoch()

    print_and_log('Best accuracy: ' + repr(best_prec1))
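
The example above follows apex's (now deprecated) FP16_Optimizer recipe: halve the model, wrap the base optimizer, and call optimizer.backward(loss) instead of loss.backward(). Below is a minimal, self-contained sketch of that pattern, assuming apex is installed and a CUDA device is available; the model, data and hyper-parameters are placeholders, not the script's own.

import torch
import torch.nn as nn
from apex.fp16_utils import FP16_Optimizer  # assumes apex is installed

model = nn.Linear(784, 10).cuda().half()                       # fp16 parameters
base_optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
optimizer = FP16_Optimizer(base_optimizer, dynamic_loss_scale=True)  # keeps fp32 master weights
criterion = nn.CrossEntropyLoss()

# Tiny synthetic dataset so the sketch runs on its own.
batches = [(torch.randn(32, 784), torch.randint(0, 10, (32,))) for _ in range(5)]
for inputs, targets in batches:
    inputs, targets = inputs.cuda().half(), targets.cuda()
    optimizer.zero_grad()
    loss = criterion(model(inputs), targets)
    optimizer.backward(loss)   # scaled backward; replaces loss.backward()
    optimizer.step()           # unscales, updates fp32 masters, copies back to fp16
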
Example #24
0
def main():
    parser = argparse.ArgumentParser()
    # # Required arguments
    parser.add_argument('--task',
                        default='multi',
                        type=str,
                        help='Task affecting load data and vectorize feature')
    parser.add_argument(
        '--loss_type',
        default='double',
        type=str,
        help='Select loss double or single, only for multi task'
    )  # only effective for the multi task
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help=
        "Bert pre-trained model selected in the list: bert-base-uncased,bert-large-uncased, "
        "bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,bert-base-chinese,"
        "bert-base-multilingual-cased.")  # 选择预训练模型参数
    parser.add_argument("--debug",
                        default=False,
                        help="Whether run on small dataset")  # 正常情况下都应该选择false
    parser.add_argument(
        "--output_dir",
        default="./SQuAD/output/",
        type=str,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    # # Other arguments
    parser.add_argument("--train_file",
                        default="./SQuAD/version/train.json",
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default="./SQuAD/version/prediction.json",
        type=str,
        help=
        "SQuAD json for predictio ns. E.g., dev-v1.1.json or test-v1.1.json")

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will be "
        "truncated to this length.")

    # # Control arguments
    parser.add_argument("--do_train",
                        default=True,
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=True,
                        help="Whether to run eval on the dev set.")

    parser.add_argument("--train_batch_size",
                        default=18,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=18,
                        type=int,
                        help="Total batch size for predictions.")

    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json file."
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated.This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        help=
        "If true, all of the warnings related to data processing will be printed.A number of "
        "warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        default=False,
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.Positive power of 2: static loss scaling value.\n"
    )
    parser.add_argument(
        '--version_2_with_negative',
        default=False,
        help=
        'If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument(
        '--null_score_diff_threshold',
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null."
    )
    args = parser.parse_args()

    # The if-branch is the single-machine setup and the else-branch the distributed one; since we have no distributed cluster, we train on a single machine with multiple GPUs (10.24)
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    # The next few statements are just boilerplate configuring the logging output format (10.24)
    logging.basicConfig(
        format='%(asctime)s-%(levelname)s-%(name)s-%(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info(
        "device:{}, n_gpu:{}, distributed training:{}, 16-bits training:{}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The following lines set run parameters and random seeds (10.24)
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    random.seed(args.seed)  # set the Python random seed
    np.random.seed(args.seed)  # set the NumPy random seed
    torch.manual_seed(args.seed)  # seed the CPU RNG so results are reproducible
    if n_gpu > 0:  # with multiple GPUs, torch.cuda.manual_seed_all() seeds every device
        torch.cuda.manual_seed_all(args.seed)

    # The next checks are basic argument validation (10.24)
    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")
    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    # Check whether output_dir exists and create it if needed (the stricter check requiring an empty directory is left commented out below) (10.24)
    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #     raise ValueError("Output directory () already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Initialize the tokenizer from the pretrained vocabulary (10.24)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # Two ways to read the data: the single-queue (squad) reader or the multi-channel (multi) reader (10.24)
    if args.task == 'squad':
        read_examples = read_squad_examples
    elif args.task == 'multi':
        read_examples = read_multi_examples
    else:
        raise ValueError('Unsupported task: {}'.format(args.task))

    # Load training examples and compute the number of optimization steps (10.24)
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_examples(
            input_file=args.train_file,
            is_training=True,
            version_2_with_negative=args.version_2_with_negative)
        if args.debug:
            train_examples = train_examples[:100]
        num_train_optimization_steps = \
            int(len(train_examples)/args.train_batch_size/args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare the model (10.24)
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model,
        cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                               'distributed_{}'.format(args.local_rank)))

    # model = torch.nn.DataParallel(model).cuda()
    # Decide whether to use float16 precision (10.24)
    if args.fp16:
        # model.half().cuda()
        model.half()
        # Move the model to the CPU or GPU (10.24)
    model.to(device)

    # Configure the optimizer (10.24)
    if args.do_train:
        param_optimizer = list(model.named_parameters())

        # hack to remove the pooler, which is not used
        # and thus produces None grads that break apex
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        if args.fp16:
            try:
                # from apex.optimizers import FP16_Optimizer
                from apex.fp16_utils import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=True)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    # Train the model (10.24)
    global_step = 0
    if args.do_train:
        # Feature extraction for the training corpus
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)

        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)
        all_start_vector = torch.tensor(
            [f.start_vector for f in train_features], dtype=torch.float)
        all_end_vector = torch.tensor([f.end_vector for f in train_features],
                                      dtype=torch.float)
        all_content_vector = torch.tensor(
            [f.content_vector for f in train_features], dtype=torch.float)

        # # Replacement for all_start_positions and all_end_positions (kept commented out)
        # all1_start_positions = []
        # for i in range(len(train_features)):
        #     for j in range(len(train_features[i].start_position)):
        #         all1_start_positions.append(train_features[i].start_position[j])
        # all_start_positions = torch.tensor([k for k in all1_start_positions], dtype=torch.long)
        # all1_end_positions = []
        # for i in range(len(train_features)):
        #     for j in range(len(train_features[i].end_position)):
        #         all1_end_positions.append(train_features[i].end_position[j])
        # all_end_positions = torch.tensor([k for k in all1_end_positions], dtype=torch.long)
        # ####################################################################

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions, all_start_vector,
                                   all_end_vector, all_content_vector)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)  # random sampler
        else:
            train_sampler = DistributedSampler(train_data)

        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for ep in trange(int(args.num_train_epochs), desc="Epoch"):
            # Re-wrap the model in DataParallel every epoch so training can use multiple GPUs
            model = torch.nn.DataParallel(model).cuda()
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Iteration",
                         disable=args.local_rank not in [-1, 0])):

                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions, start_vector, end_vector, content_vector = batch

                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions, start_vector,
                             end_vector, content_vector, args.loss_type)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                    print("loss率为:{}".format(loss))
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used and handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            print("\n")
            print(ep)
            output_model_file = os.path.join(args.output_dir,
                                             str(ep) + WEIGHTS_NAME)
            output_config_file = os.path.join(args.output_dir,
                                              str(ep) + CONFIG_NAME)

            torch.save(model.state_dict(), output_model_file)
            if isinstance(model, torch.nn.DataParallel):
                model = model.module
            model.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(args.output_dir)

    # Save the fine-tuned model and reload it for prediction (10.25)
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)

    # Move the model to the device again (10.25)
    model.to(device)

    # Run prediction to generate the prediction files
    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        eval_examples = \
            read_examples(input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
        if args.debug:
            eval_examples = eval_examples[:100]
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader,
                desc="Evaluating",
                disable=args.local_rank not in [-1, 0]):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))

        middle_result = os.path.join(args.output_dir, 'middle_result.pkl')
        pickle.dump([eval_examples, eval_features, all_results],
                    open(middle_result, 'wb'))

        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir,
                                                 "null_odds.json")

        if (args.loss_type == 'double'):
            write_predictions_couple_labeling(
                eval_examples, eval_features, all_results, args.n_best_size,
                args.max_answer_length, args.do_lower_case,
                output_prediction_file, output_nbest_file,
                output_null_log_odds_file, args.verbose_logging,
                args.version_2_with_negative, args.null_score_diff_threshold)
        elif (args.loss_type == 'single'):
            write_predictions_single_labeling(
                eval_examples, eval_features, all_results, args.n_best_size,
                args.max_answer_length, args.do_lower_case,
                output_prediction_file, output_nbest_file,
                output_null_log_odds_file, args.verbose_logging,
                args.version_2_with_negative, args.null_score_diff_threshold)
        elif (args.loss_type == 'origin') or (args.task == 'multi'
                                              and args.loss_type == 'squad'):
            write_predictions(eval_examples, eval_features, all_results,
                              args.n_best_size, args.max_answer_length,
                              args.do_lower_case, output_prediction_file,
                              output_nbest_file, output_null_log_odds_file,
                              args.verbose_logging,
                              args.version_2_with_negative,
                              args.null_score_diff_threshold)
        else:
            raise ValueError('{} dataset and {} loss is not support'.format(
                args.task, args.loss_type))
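
Example #24 above picks between dynamic and static loss scaling based on --loss_scale (0 selects dynamic scaling, any positive value becomes a fixed scale). The following is a condensed, hedged sketch of just that decision, reusing the example's parameter grouping; build_fp16_optimizer is an illustrative helper name, not part of the original script.

from apex.optimizers import FusedAdam          # assumes apex is installed
from apex.fp16_utils import FP16_Optimizer

def build_fp16_optimizer(model, lr=3e-5, loss_scale=0):
    # Group parameters as the example does: no weight decay for bias/LayerNorm.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    grouped = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = FusedAdam(grouped, lr=lr, bias_correction=True)
    if loss_scale == 0:
        # 0 means "let apex adjust the scale when overflows occur"
        return FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    # any positive value is used as a fixed scale for the whole run
    return FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
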
Example #25
0
def train():
    if opt.part == 'table2pivot':
        corpus = Table2PivotCorpus(vocab_size=opt.vocab_size, 
                                    max_len=opt.src_max_len, 
                                    batch_size=opt.batch_size,
                                    log_dir=opt.dir,
                                    scale=opt.scale,
                                    mode=opt.mode)
    else:
        corpus = Pivot2TextCorpus(vocab_size=opt.vocab_size, 
                                    src_max_len=opt.src_max_len, 
                                    tgt_max_len=opt.tgt_max_len, 
                                    batch_size=opt.batch_size,
                                    share=opt.share,
                                    log_dir=opt.dir,
                                    scale=opt.scale,
                                    append_rate=opt.append_rate,
                                    drop_rate=opt.drop_rate,
                                    blank_rate=opt.blank_rate,
                                    setting=opt.setting,
                                    mode=opt.mode,
                                    use_feature=opt.feature)

    model = Pivot(emb_size=opt.emb_size,
                    key_emb_size=opt.key_emb_size,
                    pos_emb_size=opt.pos_emb_size,
                    hidden_size=opt.hidden_size,
                    n_hidden=opt.n_hidden,
                    n_block=opt.n_block,
                    ff_size=opt.ff_size,
                    n_head=opt.n_head,
                    enc_layers=opt.enc_layers,
                    dec_layers=opt.dec_layers,
                    dropout=opt.dropout,
                    bidirectional=opt.bidirectional,
                    beam_size=opt.beam_size,
                    max_decoding_step=opt.max_step,
                    minimum_length=opt.minimum_length,
                    label_smoothing=opt.label_smoothing,
                    share=opt.share,
                    part=opt.part,
                    vocab=corpus.vocab,
                    use_feature=opt.feature,
                    arch=opt.arch)
    
    if opt.fp16:
        model.half()
        model.to(device)
        try:
            from apex.fp16_utils import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(model.parameters(),
                              lr=opt.lr,
                              bias_correction=False)
        if opt.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=opt.loss_scale)
    else:
        model.to(device)
        if opt.optimizer == 'adagrad':
            optimizer = optim.Adagrad(model.parameters(), lr=opt.lr, initial_accumulator_value=0.1)
        else:
            optimizer = optim.Adam(model.parameters(), lr=opt.lr)

    learning_rate_scheduler = LearningRateWithMetricsWrapper(torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='max', patience=2))

    predictor = Predictor(dataset=corpus.test_dataset,
                          dataloader=corpus.test_loader,
                          corpus=corpus,
                          cuda_device=opt.gpu)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      learning_rate_scheduler=learning_rate_scheduler,
                      learning_rate_decay=opt.lr_decay,
                      ema_decay=opt.ema_decay,
                      predictor=predictor, 
                      train_loader=corpus.train_loader,
                      train_dataset=corpus.train_dataset,
                      validation_metric=corpus.metrics,
                      cuda_device=opt.gpu,
                      patience=4,
                      num_epochs=opt.epoch,
                      serialization_dir=corpus.log_dir,
                      num_serialized_models_to_keep=3,
                      summary_interval=opt.report,
                      should_log_parameter_statistics=False,
                      grad_norm=opt.grad_norm,
                      fp16=opt.fp16)

    trainer.train()
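
Example #25 halves the model before constructing FusedAdam, so the FP16_Optimizer wrapper sees fp16 parameters and keeps its own fp32 master copies; its Trainer also receives grad_norm and fp16 flags. Below is a hedged sketch of the kind of update step such a trainer could perform with this wrapper; fp16_train_step is an illustrative helper, and clip_master_grads is apex's fp16-aware counterpart of clip_grad_norm_.

import torch

def fp16_train_step(model, batch, optimizer, grad_norm=5.0, fp16=True):
    """One optimizer update; illustrative only, not the repo's Trainer."""
    loss = model(batch)                                # assume the model returns a scalar loss
    optimizer.zero_grad()
    if fp16:
        optimizer.backward(loss)                       # scaled backward through FP16_Optimizer
        optimizer.clip_master_grads(grad_norm)         # clip the fp32 master gradients
    else:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_norm)
    optimizer.step()
    return loss.item()
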
Example #26
0
        model = models.__dict__['resnet50'](low_dim=args.low_dim)
        model = torch.nn.DataParallel(model).cuda()

        optimizer = torch.optim.SGD(model.parameters(),
                                    0.03,
                                    momentum=0.9,
                                    weight_decay=1e-4)
        # optionally resume from a checkpoint
        if args.resume:
            if os.path.isfile(args.resume):
                print("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(args.resume)
                model.load_state_dict(checkpoint['state_dict'])
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.static_loss,
                                           verbose=False)
                optimizer.load_state_dict(checkpoint['optimizer'])
                args.start_epoch = checkpoint['epoch']
                best_prec1 = checkpoint['best_prec1']
                lemniscate = checkpoint['lemniscate']
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(args.resume))

        cudnn.benchmark = True

        # Can work with any model, but it assumes that the model has a
        # feature method, and a classifier method,
        # as in the VGG models in torchvision.
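
Example #26 rebuilds the FP16_Optimizer wrapper before calling load_state_dict on it, because the wrapper's state dict is expected to carry the loss scaler and fp32 master parameters as well. A small sketch of the matching save/resume round trip follows; the file name, static_loss value and helper names are placeholders.

import torch
from apex.fp16_utils import FP16_Optimizer   # assumes apex is installed

def save_ckpt(model, optimizer, epoch, path='checkpoint.pth.tar'):
    torch.save({'epoch': epoch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()},   # includes loss scale + master params
               path)

def resume_ckpt(model, base_optimizer, path='checkpoint.pth.tar', static_loss=128.0):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['state_dict'])
    # Wrap first, then restore, so the wrapper's own state is rebuilt correctly.
    optimizer = FP16_Optimizer(base_optimizer, static_loss_scale=static_loss, verbose=False)
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['epoch']
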
Example #27
0
                   audio_conf=audio_conf,
                   labels=labels,
                   rnn_type=supported_rnns[rnn_type],
                   mixed_precision=args.mixed_precision)
model = model.to(device)
if args.mixed_precision:
    model = convert_model_to_half(model)
print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

parameters = model.parameters()
optimizer = torch.optim.SGD(parameters, lr=3e-4, momentum=0.9, nesterov=True, weight_decay=1e-5)
if args.distributed:
    model = DistributedDataParallel(model)
if args.mixed_precision:
    optimizer = FP16_Optimizer(optimizer,
                               static_loss_scale=args.static_loss_scale,
                               dynamic_loss_scale=args.dynamic_loss_scale)

criterion = CTCLoss()

seconds = int(args.seconds)
batch_size = int(args.batch_size)


def iteration(inputs):
    # targets, align half of the audio
    targets = torch.ones(int(batch_size * ((seconds * 100) / 2)))
    target_sizes = torch.empty(batch_size, dtype=torch.int).fill_(int((seconds * 100) / 2))
    input_percentages = torch.ones(batch_size).fill_(1)
    input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
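
Example #27 forwards both loss-scale flags straight from the command line, and, as the comment in example #28 notes, dynamic_loss_scale takes precedence over static_loss_scale when both are given. An illustrative fragment of that flag handling; the flag names mirror the examples, and the Linear model is only a stand-in.

import argparse
import torch
from apex.fp16_utils import FP16_Optimizer  # assumes apex is installed

parser = argparse.ArgumentParser()
parser.add_argument('--static_loss_scale', type=float, default=1.0)
parser.add_argument('--dynamic_loss_scale', action='store_true')
args = parser.parse_args()

model = torch.nn.Linear(256, 29).cuda().half()   # stand-in for the half-precision model
optimizer = torch.optim.SGD(model.parameters(), lr=3e-4, momentum=0.9,
                            nesterov=True, weight_decay=1e-5)
# If --dynamic_loss_scale is passed it overrides the static value; otherwise the
# fixed static_loss_scale is applied to every step.
optimizer = FP16_Optimizer(optimizer,
                           static_loss_scale=args.static_loss_scale,
                           dynamic_loss_scale=args.dynamic_loss_scale)
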
Example #28
0
def model_main(conf, pitch_classes, time_steps_vocab):
    """
    Run model pipeline from setup specified in <conf>

    Params
    ======
    conf: dict
        config from conf/train_conf.yaml 
    pitch_classes: dict
        Dict of drum pitch mappings (from conf/drum_pitches.yaml)
    time_steps_vocab: dict
        Dict of tick:token mappings (from conf/time_steps_vocab.yaml)
    """
    model_conf = conf['model']
    data_conf = conf['data']

    if model_conf['d_embed'] < 0:
        model_conf['d_embed'] = model_conf['d_model']

    assert model_conf[
        'ext_len'] >= 0, 'extended context length must be non-negative'
    assert model_conf['train_batch_size'] % model_conf['batch_chunk'] == 0

    model_conf['work_dir'] = '{}-{}'.format(model_conf['work_dir'],
                                            data_conf['dataset'])
    model_conf['work_dir'] = os.path.join(model_conf['work_dir'],
                                          time.strftime('%Y%m%d-%H%M%S'))
    #logging = create_exp_dir(model_conf['work_dir'],
    #   scripts_to_save=['train.py', 'mem_transformer.py'], debug=model_conf['debug'])
    logging = create_exp_dir(model_conf['work_dir'],
                             scripts_to_save=None,
                             debug=model_conf['debug'])

    # Set the random seed manually for reproducibility.
    #np.random.seed(model_conf['seed'])
    #torch.manual_seed(model_conf['seed'])
    if torch.cuda.is_available():
        if not model_conf['cuda']:
            print(
                'WARNING: You have a CUDA device, so you should probably run with --cuda'
            )
        else:
            pass
            #torch.cuda.manual_seed_all(model_conf['seed'])

    # Validate `--fp16` option
    if model_conf['fp16']:
        if not model_conf['cuda']:
            print('WARNING: --fp16 requires --cuda, ignoring --fp16 option')
            model_conf['fp16'] = False
        else:
            try:
                from apex.fp16_utils import FP16_Optimizer
            except ImportError:
                print('WARNING: apex not installed, ignoring --fp16 option')
                model_conf['fp16'] = False

    device = torch.device('cuda' if model_conf['cuda'] else 'cpu')

    ###############################################################################
    # Load data
    ###############################################################################
    corpus = get_corpus(data_conf['dataset'], data_conf['data_dir'],
                        pitch_classes, time_steps_vocab, conf['processing'])
    ntokens = corpus.vocab_size
    model_conf['n_token'] = ntokens

    cutoffs, tie_projs = [], [False]
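    # An empty `cutoffs` list presumably leaves the adaptive embedding/softmax with a
    # single cluster over the whole vocabulary, i.e. no adaptive factorisation.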

    eval_batch_size = 10
    tr_iter = corpus.get_iterator('train',
                                  model_conf['train_batch_size'],
                                  model_conf['tgt_len'],
                                  device=device,
                                  ext_len=model_conf['ext_len'])
    va_iter = corpus.get_iterator('valid',
                                  eval_batch_size,
                                  model_conf['tgt_len'],
                                  device=device,
                                  ext_len=model_conf['ext_len'])
    te_iter = corpus.get_iterator('test',
                                  eval_batch_size,
                                  model_conf['tgt_len'],
                                  device=device,
                                  ext_len=model_conf['ext_len'])

    ###############################################################################
    # Build the model
    ###############################################################################
    def init_weight(weight):
        if model_conf['init'] == 'uniform':
            nn.init.uniform_(weight, -model_conf['init_range'],
                             model_conf['init_range'])
        elif model_conf['init'] == 'normal':
            nn.init.normal_(weight, 0.0, model_conf['init_std'])

    def init_bias(bias):
        nn.init.constant_(bias, 0.0)

    def weights_init(m):
        classname = m.__class__.__name__
        if classname.find('Linear') != -1:
            if hasattr(m, 'weight') and m.weight is not None:
                init_weight(m.weight)
            if hasattr(m, 'bias') and m.bias is not None:
                init_bias(m.bias)
        elif classname.find('AdaptiveEmbedding') != -1:
            if hasattr(m, 'emb_projs'):
                for i in range(len(m.emb_projs)):
                    if m.emb_projs[i] is not None:
                        nn.init.normal_(m.emb_projs[i], 0.0,
                                        model_conf['proj_init_std'])
        elif classname.find('Embedding') != -1:
            if hasattr(m, 'weight'):
                init_weight(m.weight)
        elif classname.find('ProjectedAdaptiveLogSoftmax') != -1:
            if hasattr(m, 'cluster_weight') and m.cluster_weight is not None:
                init_weight(m.cluster_weight)
            if hasattr(m, 'cluster_bias') and m.cluster_bias is not None:
                init_bias(m.cluster_bias)
            if hasattr(m, 'out_projs'):
                for i in range(len(m.out_projs)):
                    if m.out_projs[i] is not None:
                        nn.init.normal_(m.out_projs[i], 0.0,
                                        model_conf['proj_init_std'])
        elif classname.find('LayerNorm') != -1:
            if hasattr(m, 'weight'):
                nn.init.normal_(m.weight, 1.0, model_conf['init_std'])
            if hasattr(m, 'bias') and m.bias is not None:
                init_bias(m.bias)
        elif classname.find('TransformerLM') != -1:
            if hasattr(m, 'r_emb'):
                init_weight(m.r_emb)
            if hasattr(m, 'r_w_bias'):
                init_weight(m.r_w_bias)
            if hasattr(m, 'r_r_bias'):
                init_weight(m.r_r_bias)
            if hasattr(m, 'r_bias'):
                init_bias(m.r_bias)

    def update_dropout(m):
        classname = m.__class__.__name__
        if classname.find('Dropout') != -1:
            if hasattr(m, 'p'):
                m.p = model_conf['dropout']

    def update_dropatt(m):
        if hasattr(m, 'dropatt'):
            m.dropatt.p = model_conf['dropatt']

    if model_conf['restart']:
        with open(os.path.join(model_conf['restart_dir'], 'model.pt'),
                  'rb') as f:
            model = torch.load(f)
        if not model_conf['fp16']:
            model = model.float()
        model.apply(update_dropout)
        model.apply(update_dropatt)
    else:
        model = MemTransformerLM(ntokens,
                                 model_conf['n_layer'],
                                 model_conf['n_head'],
                                 model_conf['d_model'],
                                 model_conf['d_head'],
                                 model_conf['d_inner'],
                                 model_conf['dropout'],
                                 model_conf['dropatt'],
                                 tie_weight=model_conf['not_tied'],
                                 d_embed=model_conf['d_embed'],
                                 div_val=model_conf['div_val'],
                                 tie_projs=tie_projs,
                                 pre_lnorm=model_conf['pre_lnorm'],
                                 tgt_len=model_conf['tgt_len'],
                                 ext_len=model_conf['ext_len'],
                                 mem_len=model_conf['mem_len'],
                                 cutoffs=cutoffs,
                                 same_length=model_conf['same_length'],
                                 attn_type=model_conf['attn_type'],
                                 clamp_len=model_conf['clamp_len'],
                                 sample_softmax=model_conf['sample_softmax'])
        model.apply(weights_init)
        model.word_emb.apply(
            weights_init
        )  # ensure embedding init is not overridden by out_layer in case of weight sharing
    model_conf['n_all_param'] = sum([p.nelement() for p in model.parameters()])
    model_conf['n_nonemb_param'] = sum(
        [p.nelement() for p in model.layers.parameters()])

    if model_conf['fp16']:
        model = model.half()

    if model_conf['multi_gpu']:
        model = model.to(device)
        if model_conf['gpu0_bsz'] >= 0:
            para_model = BalancedDataParallel(model_conf['gpu0_bsz'] //
                                              model_conf['batch_chunk'],
                                              model,
                                              dim=1).to(device)
        else:
            para_model = nn.DataParallel(model, dim=1).to(device)
    else:
        para_model = model.to(device)

    #### optimizer
    if model_conf['optim'].lower() == 'sgd':
        if model_conf['sample_softmax'] > 0:
            dense_params, sparse_params = [], []
            for param in model.parameters():
                if param.size() == model.word_emb.weight.size():
                    sparse_params.append(param)
                else:
                    dense_params.append(param)
            optimizer_sparse = optim.SGD(sparse_params,
                                         lr=model_conf['learning_rate'] * 2)
            optimizer = optim.SGD(dense_params,
                                  lr=model_conf['learning_rate'],
                                  momentum=model_conf['mom'])
        else:
            optimizer = optim.SGD(model.parameters(),
                                  lr=model_conf['learning_rate'],
                                  momentum=model_conf['mom'])
    elif model_conf['optim'].lower() == 'adam':
        if model_conf['sample_softmax'] > 0:
            dense_params, sparse_params = [], []
            for param in model.parameters():
                if param.size() == model.word_emb.weight.size():
                    sparse_params.append(param)
                else:
                    dense_params.append(param)
            optimizer_sparse = optim.SparseAdam(sparse_params,
                                                lr=model_conf['learning_rate'])
            optimizer = optim.Adam(dense_params,
                                   lr=model_conf['learning_rate'])
        else:
            optimizer = optim.Adam(model.parameters(),
                                   lr=model_conf['learning_rate'])
    elif model_conf['optim'].lower() == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=model_conf['learning_rate'])

    #### scheduler
    if model_conf['scheduler'] == 'cosine':
        # here we do not set eta_min to lr_min to be backward compatible
        # because in previous versions eta_min is default to 0
        # rather than the default value of lr_min 1e-6
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, model_conf['max_step'],
            eta_min=model_conf['eta_min'])  # should use eta_min arg
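        # T_max is max_step, so the cosine schedule spans the entire run and
        # anneals the learning rate down to eta_min by the final step.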
        if model_conf['sample_softmax'] > 0:
            scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(
                optimizer_sparse,
                model_conf['max_step'],
                eta_min=model_conf['eta_min'])  # should use eta_min arg
    elif model_conf['scheduler'] == 'inv_sqrt':
        # originally used for Transformer (in Attention is all you need)
        def lr_lambda(step):
            # return a multiplier instead of a learning rate
            if step == 0 and model_conf['warmup_steps'] == 0:
                return 1.
            else:
                return 1. / (step ** 0.5) if step > model_conf['warmup_steps'] \
                       else step / (model_conf['warmup_steps'] ** 1.5)

        scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
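        # Worked example of the multiplier (warmup_steps=4000 is a hypothetical
        # value, not one taken from this config):
        #   lr_lambda(1000)  = 1000 / 4000**1.5 ~= 3.95e-3   (linear warmup)
        #   lr_lambda(4000)  = 4000 / 4000**1.5 ~= 1.58e-2   (warmup ends)
        #   lr_lambda(16000) = 1 / 16000**0.5   ~= 7.91e-3   (inverse sqrt decay)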
    elif model_conf['scheduler'] == 'dev_perf':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            factor=model_conf['decay_rate'],
            patience=model_conf['patience'],
            min_lr=model_conf['lr_min'])
        if model_conf['sample_softmax'] > 0:
            scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer_sparse,
                factor=model_conf['decay_rate'],
                patience=model_conf['patience'],
                min_lr=model_conf['lr_min'])
    elif model_conf['scheduler'] == 'constant':
        pass

    if model_conf['cuda'] and model_conf['fp16']:
        # If model_conf['dynamic_loss_scale'] is False, static_loss_scale will be used.
        # If model_conf['dynamic_loss_scale'] is True, it will take precedence over static_loss_scale.
        optimizer = FP16_Optimizer(
            optimizer,
            static_loss_scale=model_conf['static_loss_scale'],
            dynamic_loss_scale=model_conf['dynamic_loss_scale'],
            dynamic_loss_args={'init_scale': 2**16})
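        # Beyond loss scaling, this wrapper (in the apex implementation) keeps
        # an FP32 master copy of the weights: optimizer.backward(loss) scales
        # the loss and populates the master gradients, and optimizer.step()
        # updates the master weights before copying them back to FP16.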

    if model_conf['restart']:
        if os.path.exists(
                os.path.join(model_conf['restart_dir'], 'optimizer.pt')):
            with open(os.path.join(model_conf['restart_dir'], 'optimizer.pt'),
                      'rb') as f:
                opt_state_dict = torch.load(f)
                optimizer.load_state_dict(opt_state_dict)
        else:
            print('Optimizer was not saved. Start from scratch.')

    logging('=' * 100)
    for k, v in model_conf.items():
        logging('    - {} : {}'.format(k, v))
    logging('=' * 100)
    logging('#params = {}'.format(model_conf['n_all_param']))
    logging('#non emb params = {}'.format(model_conf['n_nonemb_param']))

    ###############################################################################
    # Training code
    ###############################################################################

    def evaluate(eval_iter):
        # Turn on evaluation mode which disables dropout.
        model.eval()

        # If the model does not use memory at all, make the ext_len longer.
        # Otherwise, make the mem_len longer and keep the ext_len the same.
        if model_conf['mem_len'] == 0:
            model.reset_length(
                model_conf['eval_tgt_len'], model_conf['ext_len'] +
                model_conf['tgt_len'] - model_conf['eval_tgt_len'],
                model_conf['mem_len'])
        else:
            model.reset_length(
                model_conf['eval_tgt_len'], model_conf['ext_len'],
                model_conf['mem_len'] + model_conf['tgt_len'] -
                model_conf['eval_tgt_len'])

        # Evaluation
        total_len, total_loss = 0, 0.
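        # total_loss is weighted by seq_len, so the value returned below is an
        # average loss per token; perplexity is exp() of that value.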
        with torch.no_grad():
            mems = tuple()
            for i, (data, target, seq_len) in enumerate(eval_iter):
                if model_conf['max_eval_steps'] > 0 and i >= model_conf[
                        'max_eval_steps']:
                    break
                ret = model(data, target, *mems)
                loss, mems = ret[0], ret[1:]
                loss = loss.mean()
                total_loss += seq_len * loss.float().item()
                total_len += seq_len

        # Switch back to the training mode
        model.reset_length(model_conf['tgt_len'], model_conf['ext_len'],
                           model_conf['mem_len'])
        model.train()

        return total_loss / total_len

    def train():
        # Turn on training mode which enables dropout.
        global train_step, train_loss, best_val_loss, eval_start_time, log_start_time
        model.train()
        if model_conf['batch_chunk'] > 1:
            mems = [tuple() for _ in range(model_conf['batch_chunk'])]
        else:
            mems = tuple()
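        # mems carries the segment-level memory returned by the model between
        # batches. With batch_chunk > 1 each chunk keeps its own memory stream,
        # since each chunk sees a different slice of the batch dimension.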
        train_iter = tr_iter.get_varlen_iter(
        ) if model_conf['varlen'] else tr_iter
        for batch, (data, target, seq_len) in enumerate(train_iter):
            model.zero_grad()
            if model_conf['batch_chunk'] > 1:
                data_chunks = torch.chunk(data, model_conf['batch_chunk'], 1)
                target_chunks = torch.chunk(target, model_conf['batch_chunk'],
                                            1)
                for i in range(model_conf['batch_chunk']):
                    data_i = data_chunks[i].contiguous()
                    target_i = target_chunks[i].contiguous()
                    ret = para_model(data_i, target_i, *mems[i])
                    loss, mems[i] = ret[0], ret[1:]
                    loss = loss.float().mean().type_as(
                        loss) / model_conf['batch_chunk']
                    if model_conf['fp16']:
                        optimizer.backward(loss)
                    else:
                        loss.backward()
                    train_loss += loss.float().item()
            else:
                ret = para_model(data, target, *mems)
                loss, mems = ret[0], ret[1:]
                loss = loss.float().mean().type_as(loss)
                if model_conf['fp16']:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                train_loss += loss.float().item()

            if model_conf['fp16']:
                optimizer.clip_master_grads(model_conf['clip'])
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               model_conf['clip'])
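            # With FP16_Optimizer the clipping runs on the FP32 master
            # gradients (clip_master_grads); the FP16 gradients attached to the
            # model are still scaled by the loss scale at this point (assuming
            # the apex-style wrapper used above).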

            optimizer.step()
            if model_conf['sample_softmax'] > 0:
                optimizer_sparse.step()

            # step-wise learning rate annealing
            train_step += 1
            if model_conf['scheduler'] in ['cosine', 'constant', 'dev_perf']:
                # linear warmup stage
                if train_step < model_conf['warmup_steps']:
                    curr_lr = model_conf[
                        'learning_rate'] * train_step / model_conf[
                            'warmup_steps']
                    optimizer.param_groups[0]['lr'] = curr_lr
                    if model_conf['sample_softmax'] > 0:
                        optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2
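                    # e.g. at train_step == warmup_steps // 2 the dense lr is
                    # set to roughly learning_rate / 2, and the sparse optimizer
                    # gets twice that, matching the 2x lr it was created with.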
                else:
                    if model_conf['scheduler'] == 'cosine':
                        scheduler.step(train_step)
                        if model_conf['sample_softmax'] > 0:
                            scheduler_sparse.step(train_step)
            elif model_conf['scheduler'] == 'inv_sqrt':
                scheduler.step(train_step)

            if train_step % model_conf['log_interval'] == 0:
                cur_loss = train_loss / model_conf['log_interval']
                elapsed = time.time() - log_start_time
                log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \
                          '| ms/batch {:5.2f} | loss {:5.2f}'.format(
                    epoch, train_step, batch+1, optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / model_conf['log_interval'], cur_loss)
                log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss))
                logging(log_str)
                train_loss = 0
                log_start_time = time.time()

            if train_step == 1 or train_step % model_conf['eval_interval'] == 0:
                val_loss = evaluate(va_iter)
                logging('-' * 100)
                log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \
                          '| valid loss {:5.2f}'.format(
                    train_step // model_conf['eval_interval'], train_step,
                    (time.time() - eval_start_time), val_loss)
                log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss))
                logging(log_str)
                logging('-' * 100)
                # Save the model if the validation loss is the best we've seen so far.
                if not best_val_loss or val_loss < best_val_loss:
                    create_dir_if_not_exists(
                        os.path.join(model_conf['work_dir'],
                                     f'train_step_{train_step}', ''))
                    if not model_conf['debug']:
                        with open(
                                os.path.join(model_conf['work_dir'],
                                             f'train_step_{train_step}',
                                             'model.pt'), 'wb') as f:
                            torch.save(model, f)
                        with open(
                                os.path.join(model_conf['work_dir'],
                                             f'train_step_{train_step}',
                                             'optimizer.pt'), 'wb') as f:
                            torch.save(optimizer.state_dict(), f)
                    best_val_loss = val_loss

                # dev-performance based learning rate annealing
                if model_conf['scheduler'] == 'dev_perf':
                    scheduler.step(val_loss)
                    if model_conf['sample_softmax'] > 0:
                        scheduler_sparse.step(val_loss)

                eval_start_time = time.time()

            if train_step == model_conf['max_step']:
                break

    # Loop over epochs.
    train_step = 0
    train_loss = 0
    best_val_loss = None

    log_start_time = time.time()
    eval_start_time = time.time()

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in itertools.count(start=1):
            train()
            if train_step == model_conf['max_step']:
                logging('-' * 100)
                logging('End of training')
                break
    except KeyboardInterrupt:
        logging('-' * 100)
        logging('Exiting from training early')

    create_dir_if_not_exists(model_conf['work_dir'])
    # Load the best saved model.
    with open(os.path.join(model_conf['work_dir'], 'model.pt'), 'rb') as f:
        model = torch.load(f)
    para_model = model.to(device)

    # Run on test data.
    test_loss = evaluate(te_iter)
    logging('=' * 100)
    logging('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format(
        test_loss, math.exp(test_loss)))
    logging('=' * 100)
Example #29
0
elif args.scheduler == 'dev_perf':
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=args.decay_rate,
                                                     patience=args.patience,
                                                     min_lr=args.lr_min)
    if args.sample_softmax > 0:
        scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer_sparse,
            factor=args.decay_rate,
            patience=args.patience,
            min_lr=args.lr_min)
elif args.scheduler == 'constant':
    pass

if args.cuda and args.fp16:
    # If args.dynamic_loss_scale is False, static_loss_scale will be used.
    # If args.dynamic_loss_scale is True, it will take precedence over static_loss_scale.
    optimizer = FP16_Optimizer(optimizer,
                               static_loss_scale=args.static_loss_scale,
                               dynamic_loss_scale=args.dynamic_loss_scale,
                               dynamic_loss_args={'init_scale': 2**16})

if args.restart:
    if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')):
        with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f:
            opt_state_dict = torch.load(f)
            optimizer.load_state_dict(opt_state_dict)
    else:
        print('Optimizer was not saved. Start from scratch.')

logging('=' * 100)
for k, v in args.__dict__.items():
    logging('    - {} : {}'.format(k, v))
logging('=' * 100)
logging('#params = {}'.format(args.n_all_param))
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--dev_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for develop")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=3000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument('--model_path',
                        type=str,
                        default='./model',
                        help='save model path')
    parser.add_argument('--load_model', type=str, default=None)
    parser.add_argument('--embedding_dim', type=int, default=300)
    parser.add_argument('--dropout_prob', type=float, default=0.2)

    args = parser.parse_args()
    processors = {"memory": MemoryProcessor, "logic": LogicalProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = "cuda"
        n_gpu = torch.cuda.device_count()
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        dist.init_process_group(backend='nccl')
        torch.backends.cudnn.benchmark = True

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)
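    # Each forward/backward pass therefore sees train_batch_size /
    # accumulate_gradients examples; recovering the original total batch size
    # would require accumulating gradients over that many steps.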

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size *
            args.num_train_epochs)

    vocab_dim = len(tokenization.load_vocab(args.vocab_file))

    model = SequenceClassification(vocab_dim, args.embedding_dim,
                                   args.dropout_prob, len(label_list), device)

    if args.load_model is not None:
        model.load_state_dict(torch.load(args.load_model, map_location='cpu'))

    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    global_step = 0

    if args.local_rank != -1:
        model = DDP(model)
        optimizer = FP16_Optimizer(optimizer)
        '''
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
        '''
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
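    # Note: no loss-scale arguments are passed to FP16_Optimizer in the
    # distributed branch above, so it falls back to its defaults, and the
    # training loop below drives it with plain loss.backward() /
    # optimizer.step(); apex's FP16_Optimizer is normally driven via
    # optimizer.backward(loss) instead.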

    if args.do_train:
        # train set features
        train_features = convert_to_ids(train_examples, label_list,
                                        args.max_seq_length, tokenizer)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_q_ids = torch.tensor([f.que_ids for f in train_features],
                                 dtype=torch.long)
        all_d_ids = torch.tensor([f.des_ids for f in train_features],
                                 dtype=torch.long)
        all_sd_ids = torch.tensor([f.scene_ids for f in train_features],
                                  dtype=torch.long)
        #all_Ld_ids = torch.tensor([f.local_scene_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        train_data = TensorDataset(all_q_ids, all_d_ids, all_sd_ids,
                                   all_label_ids)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      num_workers=1,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # dev set features
        dev_examples = processor.get_dev_examples(args.data_dir)
        dev_features = convert_to_ids(dev_examples, label_list,
                                      args.max_seq_length, tokenizer)

        all_dev_q_ids = torch.tensor([f.que_ids for f in dev_features],
                                     dtype=torch.long)
        all_dev_d_ids = torch.tensor([f.des_ids for f in dev_features],
                                     dtype=torch.long)
        all_dev_sd_ids = torch.tensor([f.scene_ids for f in dev_features],
                                      dtype=torch.long)
        #all_dev_Ld_ids = torch.tensor([f.local_scene_ids for f in dev_features], dtype=torch.long)
        all_dev_label_ids = torch.tensor([f.label_id for f in dev_features],
                                         dtype=torch.long)

        dev_data = TensorDataset(all_dev_q_ids, all_dev_d_ids, all_dev_sd_ids,
                                 all_dev_label_ids)
        if args.local_rank == -1:
            dev_sampler = RandomSampler(dev_data)
        else:
            dev_sampler = DistributedSampler(dev_data)

        dev_dataloader = DataLoader(dev_data,
                                    num_workers=1,
                                    sampler=dev_sampler,
                                    batch_size=args.eval_batch_size)

        model.train()
        losses = []
        dev_accuracy_list = []
        dev_losses = []
        for epoch in range(int(args.num_train_epochs)):

            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for q_ids, d_ids, sd_ids, label_ids in train_dataloader:

                optimizer.zero_grad()

                q_ids = q_ids.to(device)
                d_ids = d_ids.to(device)
                sd_ids = sd_ids.to(device)
                #Ld_ids = Ld_ids.to(device)
                label_ids = label_ids.to(device)

                loss, _ = model.forward(q_ids, d_ids, sd_ids, label_ids)

                tr_loss += loss.item()
                nb_tr_examples += q_ids.size(0)
                nb_tr_steps += 1

                loss.backward()
                optimizer.step()

                global_step += 1
            if (epoch + 1) % 10 == 0:
                if args.task_name == 'memory':
                    torch.save(
                        model.state_dict(),
                        os.path.join(
                            args.model_path,
                            'non_crossPassage_res_memory_model' +
                            str(epoch + 1) + '.bin'))
                else:
                    torch.save(
                        model.state_dict(),
                        os.path.join(
                            args.model_path,
                            'non_crossPassage_res_logic_model' +
                            str(epoch + 1) + '.bin'))
            losses.append(tr_loss / nb_tr_steps)

            #develop dataset evaluation
            dev_accuracy, nb_dev_examples = 0, 0
            for q_ids, d_ids, sd_ids, label_ids in dev_dataloader:

                q_ids = q_ids.to(device)
                d_ids = d_ids.to(device)
                sd_ids = sd_ids.to(device)
                #Ld_ids = Ld_ids.to(device)
                label_ids = label_ids.to(device)

                dev_loss, logits = model.forward(q_ids, d_ids, sd_ids,
                                                 label_ids)

                label_ids = label_ids.to('cpu').numpy()
                logits = logits.to('cpu').detach().numpy()

                tmp_dev_accuracy = accuracy(logits, label_ids)
                dev_accuracy += tmp_dev_accuracy

                nb_dev_examples += q_ids.size(0)

            print('-' * 20)
            print("Epochs : {}".format(epoch + 1))
            print("dev_accuracy : {}".format(dev_accuracy / nb_dev_examples))
            print("train Loss : {}".format(tr_loss / nb_tr_steps))
            print("validataion Loss : {}".format(dev_loss.item()))
            dev_losses.append(dev_loss.item())
            print('-' * 20)

    if args.do_eval:
        eval_examples = processor.get_test_examples(args.data_dir)
        eval_features = convert_to_ids(eval_examples, label_list,
                                       args.max_seq_length, tokenizer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_q_vectors = torch.tensor([f.que_ids for f in eval_features],
                                     dtype=torch.long)
        all_d_vectors = torch.tensor([f.des_ids for f in eval_features],
                                     dtype=torch.long)
        all_sd_vectors = torch.tensor([f.scene_ids for f in eval_features],
                                      dtype=torch.long)
        #all_Ld_vectors = torch.tensor([f.local_scene_ids for f in eval_features], dtype=torch.long)
        all_label_vectors = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)

        eval_data = TensorDataset(all_q_vectors, all_d_vectors, all_sd_vectors,
                                  all_label_vectors)

        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     num_workers=1,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        logit_label_list = []
        for step, (q_vec, d_vec, sd_vec, label_vec) in enumerate(
                tqdm(eval_dataloader, desc="Iteration")):

            q_vec = q_vec.to(device)
            d_vec = d_vec.to(device)
            sd_vec = sd_vec.to(device)
            #Ld_vec = Ld_vec.to(device)
            label_vec = label_vec.to(device)

            tmp_eval_loss, logits = model.forward(q_vec, d_vec, sd_vec,
                                                  label_vec)

            label_ids = label_vec.to('cpu').numpy()
            logits = logits.to('cpu').detach().numpy()

            tmp_eval_accuracy = accuracy(logits, label_ids)

            output = np.argmax(logits, axis=1)

            output = list(output)
            label_ids = list(label_ids)
            logit_label_list.append([output, label_ids])

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += q_vec.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps  # len(eval_dataloader)
        eval_accuracy = eval_accuracy / nb_eval_examples  # len(eval_dataloader)
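        # eval_loss is averaged per batch (nb_eval_steps) while eval_accuracy
        # is averaged per example (nb_eval_examples), which assumes accuracy()
        # returns the number of correct predictions in a batch.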

        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step
        }
        #'loss': tr_loss / nb_tr_steps}  # 'loss': loss.item()}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open('[memory]align_epoch20_output', 'w') as f:
            logit_output_list = []
            Gold_output_list = []
            for labels in logit_label_list:
                for logit in labels[0]:
                    logit_output = convert_id_to_label(logit, label_list)
                    logit_output_list.append(logit_output)
                for Gold in labels[1]:
                    Gold_output = convert_id_to_label(Gold, label_list)
                    Gold_output_list.append(Gold_output)
            for logit, gold in zip(logit_output_list, Gold_output_list):
                f.write(str(logit) + '\t' + str(gold) + '\n')

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))