# --- Reviewer notes (truncated fragment; original single-line formatting kept verbatim) ---
# This chunk starts inside a Net.forward() (flatten -> fc1 -> relu -> dropout2
# -> fc2 -> log_softmax), then falls into script-level setup: model/criterion
# construction, an Adadelta optimizer (lr=1.0, the paper/torch default for
# Adadelta), and the head of a train/valid epoch loop that is cut off at `else:`.
# Japanese comments translated: "エポック数" = "number of epochs";
# "モデルを訓練モードに設定" = "set the model to training mode".
# NOTE(review): CrossEntropyLoss expects raw logits; the forward ends in
# log_softmax, so the loss presumably uses NLLLoss semantics — confirm against
# the (unseen) training step.
x = torch.flatten(x, 1) x = self.fc1(x) x = F.relu(x) x = self.dropout2(x) x = self.fc2(x) output = F.log_softmax(x, dim=1) return output net = Net() print(net) # %% criterion = nn.CrossEntropyLoss() # optimizer = optim.SGD(net.parameters(), lr=0.01) optimizer = optim.Adadelta(net.parameters(), lr=1.0) # %% # エポック数 num_epochs = 30 for epoch in range(num_epochs): print('Epoch {}/{}'.format(epoch + 1, num_epochs)) print('-------------') for phase in ['train', 'valid']: if phase == 'train': # モデルを訓練モードに設定 net.train() else:
# --- Reviewer notes (truncated fragment; original single-line formatting kept verbatim) ---
# CRNN training setup: moves criterion/inputs to CUDA Variables, builds a loss
# averager, then selects Adam / Adadelta / RMSprop from `opt` flags (Adadelta
# deliberately takes torch's defaults — no lr passed).  The trailing `def val`
# is cut off mid-body.
# NOTE(review): inside val(), `for p in crnn.parameters()` freezes the *global*
# `crnn` rather than the `net` parameter the function receives — works only
# because callers pass crnn; confirm and prefer `net.parameters()`.
criterion = criterion.cuda() image = Variable(image) text = Variable(text) length = Variable(length) # loss averager loss_avg = utils.averager() # setup optimizer if opt.adam: optimizer = optim.Adam(crnn.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) elif opt.adadelta: optimizer = optim.Adadelta(crnn.parameters()) else: optimizer = optim.RMSprop(crnn.parameters(), lr=opt.lr) def val(net, dataset, criterion, max_iter=100): print('Start val') for p in crnn.parameters(): p.requires_grad = False net.eval() data_loader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=opt.batchSize, num_workers=int(opt.workers))
def __init__(self, opt):
    """Build the advisor agent: network, loss, optimizer and LR scheduler.

    Args:
        opt (dict): experiment configuration.  Reads (among others)
            'model_name', 'evaluate_every_steps', 'no_cuda', 'gpu',
            'pre_word2vec', model hyper-parameters, 'optimizer',
            'learning_rate', 'valid_patience', and optionally 'model_file'
            for warm-starting.

    Side effects: mutates ``opt['cuda']``, selects the CUDA device, and may
    load parameters from disk.
    """
    super(AdvisorAgent, self).__init__()
    self.model_name = opt['model_name']
    self.evaluate_every = opt['evaluate_every_steps']

    # Pick the architecture by name; BiLSTM is the fallback.
    if self.model_name == 'advisor':
        Module = Advisor
    elif self.model_name == 'hred_db':
        Module = HRED_DB
    elif self.model_name == 'hred_db0':
        Module = HRED_DB0
    elif self.model_name == 'hred':
        Module = HRED
    else:
        Module = BiLSTM

    opt['cuda'] = not opt['no_cuda'] and torch.cuda.is_available()
    if opt['cuda']:
        print('[ Using CUDA ]')
        # BUGFIX: torch.cuda.device(...) is a context manager; instantiating
        # it without `with` does NOT switch the active device.  set_device()
        # performs the switch the original intended.
        torch.cuda.set_device(opt['gpu'])
        # torch.cuda.device([0, 1])
        # It enables benchmark mode in cudnn, which
        # leads to faster runtime when the input sizes do not vary.
        cudnn.benchmark = True
    self.opt = opt

    # Optional pre-trained word embeddings.
    if opt['pre_word2vec']:
        pre_w2v = load_ndarray(opt['pre_word2vec'])
    else:
        pre_w2v = None

    self.evaluator = Evaluator(CrossEntropyLoss(),
                               batch_size=opt['batch_size'],
                               use_cuda=opt['cuda'],
                               model_name=self.model_name)

    self.score_type = 'clf'
    self.model = Module(opt['vocab_size'],
                        opt['word_emb_size'],
                        opt['hidden_size'],
                        init_w2v=pre_w2v,
                        enc_type=opt['enc_type'],
                        rnn_type=opt['rnn_type'],
                        bidirectional=not opt['no_bidirectional'],
                        utter_enc_dropout=opt['utter_enc_dropout'],
                        knowledge_enc_dropout=opt['knowledge_enc_dropout'],
                        atten_type=opt['atten_type'],
                        score_type=self.score_type,
                        use_cuda=opt['cuda']  # , phase=opt['phase']
                        )
    if opt['cuda']:
        self.model.cuda()
        # self.model = torch.nn.DataParallel(self.model, device_ids=[0, 1])

    if self.score_type == 'ranking':
        # MultiLabelMarginLoss
        # For each sample in the mini-batch:
        # loss(x, y) = sum_ij(max(0, 1 - (x[y[j]] - x[i]))) / x.size(0)
        self.loss_fn = MultiLabelMarginLoss()
    else:
        self.loss_fn = CrossEntropyLoss()

    # Only optimize trainable parameters (frozen embeddings are skipped).
    optim_params = [p for p in self.model.parameters() if p.requires_grad]
    lr = opt['learning_rate']
    if opt['optimizer'] == 'sgd':
        self.optimizers = {self.model_name: optim.SGD(optim_params, lr=lr)}
    elif opt['optimizer'] == 'adam':
        self.optimizers = {self.model_name: optim.Adam(optim_params, lr=lr)}
    elif opt['optimizer'] == 'adadelta':
        self.optimizers = {self.model_name: optim.Adadelta(optim_params, lr=lr)}
    elif opt['optimizer'] == 'adagrad':
        self.optimizers = {self.model_name: optim.Adagrad(optim_params, lr=lr)}
    elif opt['optimizer'] == 'adamax':
        self.optimizers = {self.model_name: optim.Adamax(optim_params, lr=lr)}
    elif opt['optimizer'] == 'rmsprop':
        self.optimizers = {self.model_name: optim.RMSprop(optim_params, lr=lr)}
    else:
        raise NotImplementedError('Optimizer not supported.')

    # Shrink the LR when the (minimized) validation metric plateaus.
    self.scheduler = ReduceLROnPlateau(self.optimizers[self.model_name],
                                       mode='min',
                                       patience=opt['valid_patience'] // 3,
                                       verbose=True)

    if opt.get('model_file') and os.path.isfile(opt['model_file']):
        print('Loading existing model parameters from ' + opt['model_file'])
        self.load(opt['model_file'])
def main():
    """Train and evaluate the MNIST example model split across a two-stage Pipe."""
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument("--batch-size", type=int, default=64, metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size", type=int, default=1000, metavar="N",
                        help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs", type=int, default=14, metavar="N",
                        help="number of epochs to train (default: 14)")
    parser.add_argument("--lr", type=float, default=1.0, metavar="LR",
                        help="learning rate (default: 1.0)")
    parser.add_argument("--gamma", type=float, default=0.7, metavar="M",
                        help="Learning rate step gamma (default: 0.7)")
    parser.add_argument("--dry-run", action="store_true", default=False,
                        help="quickly check a single pass")
    parser.add_argument("--seed", type=int, default=1, metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    parser.add_argument("--save-model", action="store_true", default=False,
                        help="For Saving the current Model")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    # BUGFIX: the original used one shared kwargs dict for both loaders, so
    # the test loader silently reused the *training* batch size and
    # --test-batch-size was ignored.
    loader_common = {"num_workers": 1, "pin_memory": True, "shuffle": True}
    train_kwargs = {"batch_size": args.batch_size, **loader_common}
    test_kwargs = {"batch_size": args.test_batch_size, **loader_common}

    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, ))])
    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)
    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = net
    # Split the module sequence 6/6 over two GPUs; each mini-batch is
    # pipelined in 2 chunks.  Inputs must live on the first device.
    model = Pipe(model, balance=[6, 6], devices=[0, 1], chunks=2)
    device = model.devices[0]
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        tic = time.perf_counter()
        train(args, model, device, train_loader, optimizer, epoch)
        toc = time.perf_counter()
        # NOTE(review): 'TRANING' typo kept verbatim (runtime string).
        print(f">>> TRANING Time {toc - tic:0.4f} seconds")
        tic = time.perf_counter()
        test(model, device, test_loader)
        toc = time.perf_counter()
        print(f">>> TESTING Time {toc - tic:0.4f} seconds")
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
# --- Reviewer notes (truncated fragment; original single-line formatting kept verbatim) ---
# Encoder/decoder (seq2seq OCR) training setup plus a truncated val() helper.
# FIXME(review): `if opt . man :` is almost certainly garbled — the parallel
# CRNN snippet in this file uses `if opt.adam:`; confirm and restore.
# NOTE(review): the adadelta branch builds a single `optimizer` over only the
# encoder's parameters, while every other branch builds separate
# encoder_optimizer/decoder_optimizer — the decoder would go untrained and the
# downstream code that steps encoder_optimizer/decoder_optimizer would fail.
# Flagged for the author; not fixable safely from this fragment.
# decoder = torch.nn.DataParallel(decoder, device_ids=range(opt.ngpu)) image = image.cuda() text = text.cuda() criterion = criterion.cuda() # loss averager loss_avg = utils.averager() # setup optimizer if opt . man : encoder_optimizer = optim.Adam(encoder.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) decoder_optimizer = optim.Adam(decoder.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) elif opt.adadelta: optimizer = optim.Adadelta(encoder.parameters(), lr=opt.lr) else: encoder_optimizer = optim.RMSprop(encoder.parameters(), lr=opt.lr) decoder_optimizer = optim.RMSprop(decoder.parameters(), lr=opt.lr) def val(encoder, decoder, criterion, batchsize, dataset, teach_forcing=False, max_iter=100): print('Start val') for e, d in zip(encoder.parameters(), decoder.parameters()): e.requires_grad = False d.requires_grad = False encoder.eval() decoder.eval() data_loader = torch.utils.data.DataLoader(
# --- Reviewer notes (truncated fragment; original single-line formatting kept verbatim) ---
# Tail of a DCGAN-style weights_init(m) (normal init for Conv, N(1, 0.02) for
# BatchNorm weight with zero bias), followed by script-level GAN setup and a
# truncated train() definition.
# NOTE(review): both optimizers are Adadelta with torch defaults (lr=1.0);
# the commented-out Adam lines suggest this was an experiment — confirm the
# choice is intentional for GAN training.
classname = m.__class__.__name__ if classname.find('Conv') != -1: nn.init.normal_(m.weight.data, 0.0, 0.02) elif classname.find('BatchNorm') != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0) netG = Generator(latent_size, hidden_size, 28**2) netD = Discriminator(28**2, 256, 1) netG.apply(weights_init) netD.apply(weights_init) netG.to(device) netD.to(device) criterion = nn.BCELoss() optimizerD = optim.Adadelta(netD.parameters()) optimizerG = optim.Adadelta(netG.parameters()) # optimizerD = torch.optim.Adam(netD.parameters(), lr=lr) # optimizerG = torch.optim.Adam(netG.parameters(), lr=lr) # netG.cuda() summary(netG, input_size=(latent_size, 1, 1)) summary(netD, input_size=(1, 28, 28)) # In[ ]: def train(netG, netD, num_epochs, optG, optD, data_loader, test_data_loader, criterion): for one in data_loader:
def setup_update(self, weights):
    """Initialise the wrapped torch Adadelta optimizer.

    Delegates bookkeeping to the parent class, then constructs the
    underlying ``torch.optim.Adadelta`` from this wrapper's stored
    hyper-parameters (lr, rho, eps, weight_decay).
    """
    super(AdaDeltaTorch, self).setup_update(weights)
    from torch.optim import Adadelta
    self.torch_optimizer = Adadelta(
        self.parameters,
        lr=self.lr,
        rho=self.rho,
        eps=self.eps,
        weight_decay=self.weight_decay,
    )
# --- Reviewer notes on run(config) (original single-line formatting kept verbatim) ---
# Adapts a pre-trained multitask AE/classifier: freezes the classifier and AE
# heads, fine-tunes the encoder on an adaptation set with an autoencoder +
# m-measure objective, anchors to previously-seen data, and tracks test-set
# loss/FER per epoch, checkpointing every model_save_interval epochs.
# FIXME(review): `logging.info('AR Time shift: %d' % )` is a SyntaxError —
# the right-hand operand of % is missing (presumably an nnet[...] key; the key
# name cannot be determined from this file, so it is flagged, not patched).
# NOTE(review): 'Hidden Dimension' logs nnet['feature_dim']; the model is
# built with nnet['hidden_dim'], so this log line likely prints the wrong key.
# NOTE(review): in the adaptation loop, loss.backward()/optimizer.step() are
# commented out after the adapt loss, and `loss +=` later folds the anchor
# terms into the SAME tensor before the single backward() — i.e. one combined
# update per batch.  The second optimizer.zero_grad() makes this order-
# sensitive; do not reorder these statements.
# NOTE(review): torch.save(..., open(model_path, 'wb')) never closes the file
# handle; passing the path string (or `with open(...)`) would be safer.
def run(config): model_dir = os.path.join(config.store_path, config.experiment_name + '.dir') os.makedirs(config.store_path, exist_ok=True) os.makedirs(model_dir, exist_ok=True) logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', filename=os.path.join(model_dir, config.experiment_name), filemode='w') # define a new Handler to log to console as well console = logging.StreamHandler() console.setLevel(logging.INFO) formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') console.setFormatter(formatter) logging.getLogger('').addHandler(console) # Load model nnet = torch.load(config.model, map_location=lambda storage, loc: storage) model = nnetAEClassifierMultitask(nnet['feature_dim'] * nnet['num_frames'], nnet['num_classes'], nnet['encoder_num_layers'], nnet['classifier_num_layers'], nnet['ae_num_layers'], nnet['hidden_dim'], nnet['bn_dim']) model.load_state_dict(nnet['model_state_dict']) # I want to only update the encoder for p in model.classifier.parameters(): p.requires_grad = False for p in model.ae.parameters(): p.requires_grad = False logging.info('Model Parameters: ') logging.info('Encoder Number of Layers: %d' % (nnet['encoder_num_layers'])) logging.info('Classifier Number of Layers: %d' % (nnet['classifier_num_layers'])) logging.info('AE Number of Layers: %d' % (nnet['ae_num_layers'])) logging.info('AR Time shift: %d' % ) logging.info('Hidden Dimension: %d' % (nnet['feature_dim'])) logging.info('Number of Classes: %d' % (nnet['num_classes'])) logging.info('Data dimension: %d' % (nnet['feature_dim'])) logging.info('Bottleneck dimension: %d' % (nnet['bn_dim'])) logging.info('Number of Frames: %d' % (nnet['num_frames'])) logging.info('Optimizer: %s ' % (config.optimizer)) logging.info('Batch Size: %d ' % (config.batch_size)) logging.info('Initial Learning Rate: %f ' % (config.learning_rate)) sys.stdout.flush() if config.use_gpu: # Set environment variable for GPU ID id = get_device_id() 
os.environ["CUDA_VISIBLE_DEVICES"] = id model = model.cuda() criterion_classifier = nn.CrossEntropyLoss() criterion_ae = nn.MSELoss() if config.optimizer == 'adam': optimizer = optim.Adam(model.parameters(), lr=config.learning_rate) elif config.optimizer == 'adadelta': optimizer = optim.Adadelta(model.parameters()) elif config.optimizer == 'sgd': optimizer = optim.SGD(model.parameters(), lr=config.learning_rate) elif config.optimizer == 'adagrad': optimizer = optim.Adagrad(model.parameters(), lr=config.learning_rate) elif config.optimizer == 'rmsprop': optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate) else: raise NotImplementedError("Learning method not supported for the task") model_path = os.path.join(model_dir, config.experiment_name + '__epoch_0.model') torch.save({ 'epoch': 1, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, (open(model_path, 'wb'))) ep_ae_adapt = [] ep_mm_adapt = [] ep_loss_anchor = [] ep_fer_anchor = [] ep_ae_anchor = [] ep_loss_test = [] ep_fer_test = [] ep_ae_test = [] # Load Datasets # Anchor set path = os.path.join(config.egs_dir, config.anchor_set) with open(os.path.join(path, 'lengths.pkl'), 'rb') as f: lengths_anchor = pickle.load(f) labels_anchor = torch.load(os.path.join(path, 'labels.pkl')) anchor_ids = list(labels_anchor.keys()) # Adaptation Set dataset_adapt = nnetDatasetSeqAE(os.path.join(config.egs_dir, config.adapt_set)) data_loader_adapt = torch.utils.data.DataLoader(dataset_adapt, batch_size=config.batch_size, shuffle=True) # Test Set dataset_test = nnetDatasetSeq(os.path.join(config.egs_dir, config.test_set)) data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=config.batch_size, shuffle=True) # Start off with initial performance on test set model.eval() test_losses = [] test_ae_losses = [] test_fer = [] for batch_x, batch_l, lab in data_loader_test: _, indices = torch.sort(batch_l, descending=True) if config.use_gpu: batch_x = 
Variable(batch_x[indices]).cuda() batch_l = Variable(batch_l[indices]).cuda() lab = Variable(lab[indices]).cuda() else: batch_x = Variable(batch_x[indices]) batch_l = Variable(batch_l[indices]) lab = Variable(lab[indices]) # Main forward pass class_out, ae_out = model(batch_x, batch_l) # Convert all the weird tensors to frame-wise form class_out = pad2list(class_out, batch_l) batch_x = pad2list(batch_x, batch_l) ae_out = pad2list(ae_out, batch_l) lab = pad2list(lab, batch_l) loss_classifier = criterion_classifier(class_out, lab) loss_ae = criterion_ae(ae_out, batch_x) test_losses.append(loss_classifier.item()) test_ae_losses.append(loss_ae.item()) if config.use_gpu: test_fer.append(compute_fer(class_out.cpu().data.numpy(), lab.cpu().data.numpy())) else: test_fer.append(compute_fer(class_out.data.numpy(), lab.data.numpy())) print_log = "Initial Testset Error : Adapt (Test) loss: {:.3f} :: Adapt (Test) FER: {:.2f} :: Adapt (Test) AE Loss: {:.3f}".format( np.mean(test_losses), np.mean(test_fer), np.mean(test_ae_losses)) logging.info(print_log) for epoch_i in range(config.epochs): ###################### ##### Adaptation ##### ###################### model.train() adapt_ae_losses = [] adapt_mm_losses = [] anchor_losses = [] anchor_ae_losses = [] anchor_fer = [] test_losses = [] test_ae_losses = [] test_fer = [] # Main training loop for batch_x, batch_l in data_loader_adapt: # First do the adaptation _, indices = torch.sort(batch_l, descending=True) if config.use_gpu: batch_x = Variable(batch_x[indices]).cuda() batch_l = Variable(batch_l[indices]).cuda() else: batch_x = Variable(batch_x[indices]) batch_l = Variable(batch_l[indices]) # Main forward pass optimizer.zero_grad() class_out, ae_out = model(batch_x, batch_l) # Convert all the weird tensors to frame-wise form batch_x = pad2list(batch_x, batch_l) ae_out = pad2list(ae_out, batch_l) class_out = pad2list(class_out, batch_l) loss_ae = criterion_ae(ae_out, batch_x) mm_loss = mmeasure_loss(class_out, 
use_gpu=config.use_gpu) loss = config.adapt_weight * loss_ae - config.mm_weight * mm_loss # Just the autoencoder loss adapt_ae_losses.append(loss_ae.item()) adapt_mm_losses.append(mm_loss.item()) # loss.backward() # optimizer.step() # Now lets try to anchor the parameters as close as possible to previously seen data # Select anchor data randomly ids = [random.choice(anchor_ids) for i in range(config.batch_size)] batch_x = torch.cat([torch.load(os.path.join(path, index))[None, :, :] for index in ids]) batch_l = torch.cat([torch.IntTensor([lengths_anchor[index]]) for index in ids]) lab = torch.cat([labels_anchor[index][None, :] for index in ids]) _, indices = torch.sort(batch_l, descending=True) if config.use_gpu: batch_x = Variable(batch_x[indices]).cuda() batch_l = Variable(batch_l[indices]).cuda() lab = Variable(lab[indices]).cuda() else: batch_x = Variable(batch_x[indices]) batch_l = Variable(batch_l[indices]) lab = Variable(lab[indices]) # Main forward pass optimizer.zero_grad() class_out, ae_out = model(batch_x, batch_l) # Convert all the weird tensors to frame-wise form class_out = pad2list(class_out, batch_l) batch_x = pad2list(batch_x, batch_l) ae_out = pad2list(ae_out, batch_l) lab = pad2list(lab, batch_l) loss_classifier = criterion_classifier(class_out, lab) loss_ae = criterion_ae(ae_out, batch_x) loss += config.anchor_weight * (loss_ae + loss_classifier) # Use all the loss for anchor set anchor_losses.append(loss_classifier.item()) anchor_ae_losses.append(loss_ae.item()) if config.use_gpu: anchor_fer.append(compute_fer(class_out.cpu().data.numpy(), lab.cpu().data.numpy())) else: anchor_fer.append(compute_fer(class_out.data.numpy(), lab.data.numpy())) loss.backward() optimizer.step() ## Test it on the WSJ test set model.eval() for batch_x, batch_l, lab in data_loader_test: _, indices = torch.sort(batch_l, descending=True) if config.use_gpu: batch_x = Variable(batch_x[indices]).cuda() batch_l = Variable(batch_l[indices]).cuda() lab = 
Variable(lab[indices]).cuda() else: batch_x = Variable(batch_x[indices]) batch_l = Variable(batch_l[indices]) lab = Variable(lab[indices]) # Main forward pass class_out, ae_out = model(batch_x, batch_l) # Convert all the weird tensors to frame-wise form class_out = pad2list(class_out, batch_l) batch_x = pad2list(batch_x, batch_l) ae_out = pad2list(ae_out, batch_l) lab = pad2list(lab, batch_l) loss_classifier = criterion_classifier(class_out, lab) loss_ae = criterion_ae(ae_out, batch_x) test_losses.append(loss_classifier.item()) test_ae_losses.append(loss_ae.item()) if config.use_gpu: test_fer.append(compute_fer(class_out.cpu().data.numpy(), lab.cpu().data.numpy())) else: test_fer.append(compute_fer(class_out.data.numpy(), lab.data.numpy())) ep_ae_adapt.append(np.mean(adapt_ae_losses)) ep_mm_adapt.append(np.mean(adapt_mm_losses)) ep_loss_anchor.append(np.mean(anchor_losses)) ep_fer_anchor.append(np.mean(anchor_fer)) ep_ae_anchor.append(np.mean(anchor_ae_losses)) ep_loss_test.append(np.mean(test_losses)) ep_fer_test.append(np.mean(test_fer)) ep_ae_test.append(np.mean(test_ae_losses)) print_log = "Epoch: {:d} Adapt (Test) loss: {:.3f} :: Adapt (Test) FER: {:.2f}".format(epoch_i + 1, ep_loss_test[-1], ep_fer_test[-1]) print_log += " || Anchor loss : {:.3f} :: Anchor FER: {:.2f}".format(ep_loss_anchor[-1], ep_fer_anchor[-1]) print_log += " || AE Loss (Adapt) : {:.3f} :: AE Loss (Anchor) : {:.3f} :: AE Loss (Test) : {:.3f} ".format( ep_ae_adapt[-1], ep_ae_anchor[-1], ep_ae_test[-1]) print_log += " || Adapt mm loss : {:.3f} ".format(ep_mm_adapt[-1]) logging.info(print_log) if (epoch_i + 1) % config.model_save_interval == 0: model_path = os.path.join(model_dir, config.experiment_name + '__epoch_%d' % (epoch_i + 1) + '.model') torch.save({ 'epoch': epoch_i + 1, 'feature_dim': nnet['feature_dim'], 'num_frames': nnet['num_frames'], 'num_classes': nnet['num_classes'], 'encoder_num_layers': nnet['encoder_num_layers'], 'classifier_num_layers': nnet['classifier_num_layers'], 
'ae_num_layers': nnet['ae_num_layers'], 'ep_ae_adapt': ep_ae_adapt, 'ep_mm_adapt': ep_mm_adapt, 'ep_loss_anchor': ep_loss_anchor, 'ep_fer_anchor': ep_fer_anchor, 'ep_ae_anchor': ep_ae_anchor, 'ep_loss_test': ep_loss_test, 'ep_fer_test': ep_fer_test, 'ep_ae_test': ep_ae_test, 'hidden_dim': nnet['hidden_dim'], 'bn_dim': nnet['bn_dim'], 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, (open(model_path, 'wb')))
# --- Reviewer notes on build_torch_optimizer (original single-line formatting kept verbatim) ---
# Dispatches on opt.optim to one of: SGD (momentum=0.9, weight_decay=5e-3
# hard-coded), Adagrad, Adadelta, AdaFactor, Adam (eps=1e-9), SparseAdam
# (dense/sparse params split by 'embed' in the name), or FusedAdam; then
# optionally wraps for fp16 via apex AMP or FP16_Optimizer.
# NOTE(review): in the fp16/AMP branch, the rebound local `model` (the
# [model, model.generator] list returned by apex.amp.initialize) is discarded
# — only `optimizer` is returned.  This relies on AMP patching the modules
# in place; confirm against the apex version in use.
# NOTE(review): the 'embed'-substring heuristic for sparse params is fragile
# (matches any parameter whose name contains 'embed'); see the inline TODO.
def build_torch_optimizer(model, opt): """Builds the PyTorch optimizer. We use the default parameters for Adam that are suggested by the original paper https://arxiv.org/pdf/1412.6980.pdf These values are also used by other established implementations, e.g. https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer https://keras.io/optimizers/ Recently there are slightly different values used in the paper "Attention is all you need" https://arxiv.org/pdf/1706.03762.pdf, particularly the value beta2=0.98 was used there however, beta2=0.999 is still arguably the more established value, so we use that here as well Args: model: The model to optimize. opt. The dictionary of options. Returns: A ``torch.optim.Optimizer`` instance. """ params = [p for p in model.parameters() if p.requires_grad] betas = [opt.adam_beta1, opt.adam_beta2] if opt.optim == 'sgd': optimizer = optim.SGD(params, lr=opt.learning_rate, momentum=0.9, weight_decay=5e-3) elif opt.optim == 'adagrad': optimizer = optim.Adagrad( params, lr=opt.learning_rate, initial_accumulator_value=opt.adagrad_accumulator_init) elif opt.optim == 'adadelta': optimizer = optim.Adadelta(params, lr=opt.learning_rate) elif opt.optim == 'adafactor': optimizer = AdaFactor(params, non_constant_decay=True, enable_factorization=True, weight_decay=0) elif opt.optim == 'adam': optimizer = optim.Adam(params, lr=opt.learning_rate, betas=betas, eps=1e-9) elif opt.optim == 'sparseadam': dense = [] sparse = [] for name, param in model.named_parameters(): if not param.requires_grad: continue # TODO: Find a better way to check for sparse gradients. 
if 'embed' in name: sparse.append(param) else: dense.append(param) optimizer = MultipleOptimizer([ optim.Adam(dense, lr=opt.learning_rate, betas=betas, eps=1e-8), optim.SparseAdam(sparse, lr=opt.learning_rate, betas=betas, eps=1e-8) ]) elif opt.optim == 'fusedadam': # we use here a FusedAdam() copy of an old Apex repo optimizer = FusedAdam(params, lr=opt.learning_rate, betas=betas) else: raise ValueError('Invalid optimizer type: ' + opt.optim) if opt.model_dtype == 'fp16': import apex if opt.optim != 'fusedadam': # In this case use the new AMP API from apex loss_scale = "dynamic" if opt.loss_scale == 0 else opt.loss_scale model, optimizer = apex.amp.initialize( [model, model.generator], optimizer, opt_level=opt.apex_opt_level, loss_scale=loss_scale, keep_batchnorm_fp32=None) else: # In this case use the old FusedAdam with FP16_optimizer wrapper static_loss_scale = opt.loss_scale dynamic_loss_scale = opt.loss_scale == 0 optimizer = apex.optimizers.FP16_Optimizer( optimizer, static_loss_scale=static_loss_scale, dynamic_loss_scale=dynamic_loss_scale) return optimizer
def optimization_algorithms(SCI_optimizer, cnn, LR, SCI_SGD_MOMENTUM, REGULARIZATION):
    """Return a torch optimizer over ``cnn``'s parameters.

    Args:
        SCI_optimizer: optimizer selector — a name string ('Adam', 'AMSGrad',
            'AdamW', 'RMSprop', 'SGD', 'Adadelta', 'Rprop', 'Adamax', 'ASGD')
            or the corresponding numeric code 1-9.
        cnn: the model whose parameters are optimized.
        LR: learning rate.
        SCI_SGD_MOMENTUM: momentum (used by SGD only).
        REGULARIZATION: L2 weight decay, where the optimizer supports it.

    Returns:
        A ``torch.optim.Optimizer`` instance.

    Raises:
        ValueError: for an unknown selector.  (BUGFIX: the original left
        ``optimizer`` unbound on an unmatched selector and crashed with
        ``UnboundLocalError`` at the return statement instead.)
    """
    # NOTE(review): betas=(0.01, 0.999) is unusual — Adam's beta1 is normally
    # ~0.9.  Preserved as-is; confirm it is intentional.
    if type(SCI_optimizer) is str:
        if SCI_optimizer == 'Adam':
            optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.01, 0.999),
                                   weight_decay=REGULARIZATION)
        elif SCI_optimizer == 'AMSGrad':
            optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.01, 0.999),
                                   weight_decay=REGULARIZATION, amsgrad=True)
        elif SCI_optimizer == 'AdamW':
            optimizer = AdamW(cnn.parameters(), lr=LR, betas=(0.01, 0.999),
                              weight_decay=REGULARIZATION)
        elif SCI_optimizer == 'RMSprop':
            optimizer = optim.RMSprop(cnn.parameters(), lr=LR)
        elif SCI_optimizer == 'SGD':
            optimizer = optim.SGD(cnn.parameters(), lr=LR,
                                  momentum=SCI_SGD_MOMENTUM,
                                  weight_decay=REGULARIZATION)
        elif SCI_optimizer == 'Adadelta':
            optimizer = optim.Adadelta(cnn.parameters(), lr=LR,
                                       weight_decay=REGULARIZATION)
        elif SCI_optimizer == 'Rprop':
            optimizer = optim.Rprop(cnn.parameters(), lr=LR)
        elif SCI_optimizer == 'Adamax':
            optimizer = optim.Adamax(cnn.parameters(), lr=LR,
                                     weight_decay=REGULARIZATION)
        elif SCI_optimizer == 'ASGD':
            optimizer = optim.ASGD(cnn.parameters(), lr=LR,
                                   weight_decay=REGULARIZATION)
        else:
            # SparseAdam / Adagrad / LBFGS were commented out in the original.
            raise ValueError('Unknown optimizer name: %r' % (SCI_optimizer,))
    else:
        code = int(SCI_optimizer)
        if code == 1:
            optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.01, 0.999),
                                   weight_decay=REGULARIZATION)
        elif code == 2:
            optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.01, 0.999),
                                   weight_decay=REGULARIZATION, amsgrad=True)
        elif code == 3:
            optimizer = AdamW(cnn.parameters(), lr=LR, betas=(0.01, 0.999),
                              weight_decay=REGULARIZATION)
        elif code == 4:
            optimizer = optim.RMSprop(cnn.parameters(), lr=LR)
        elif code == 5:
            optimizer = optim.SGD(cnn.parameters(), lr=LR,
                                  momentum=SCI_SGD_MOMENTUM,
                                  weight_decay=REGULARIZATION)
        elif code == 6:
            optimizer = optim.Adadelta(cnn.parameters(), lr=LR,
                                       weight_decay=REGULARIZATION)
        elif code == 7:
            optimizer = optim.Rprop(cnn.parameters(), lr=LR)
        elif code == 8:
            optimizer = optim.Adamax(cnn.parameters(), lr=LR,
                                     weight_decay=REGULARIZATION)
        elif code == 9:
            # Numeric ASGD path keeps the extra hyper-parameters the original
            # passed here (they equal torch's defaults).
            optimizer = optim.ASGD(cnn.parameters(), lr=LR, lambd=0.0001,
                                   alpha=0.75, t0=1000000.0,
                                   weight_decay=REGULARIZATION)
        else:
            raise ValueError('Unknown optimizer code: %r' % (SCI_optimizer,))
    return optimizer
# --- Reviewer notes on main() (original single-line formatting kept verbatim) ---
# Fine-tunes a pretrained resnet50 (fc replaced with a 20-way head) on the
# iMet train_20country split, with a random 15%/85% validation/train split,
# Adadelta + StepLR, and loss/accuracy plots at the end.  Large spans of
# commented-out code (official-test evaluation, per-class splitting, the
# dataset-size sweep) are retained by the author as experiment history.
# NOTE(review): subset_indices_valid is a NumPy array, so the list
# comprehension `i not in subset_indices_valid` is an O(n) scan per element —
# O(n^2) overall; a `set(subset_indices_valid)` would make the split linear.
# NOTE(review): model is saved as "mnist_model.pt" even though the data is
# iMet, and the argparse description still says MNIST — confirm naming.
def main(): # Training settings # Use the command line to modify the default settings parser = argparse.ArgumentParser(description='PyTorch MNIST Example') parser.add_argument('--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)') parser.add_argument('--test-batch-size', type=int, default=64, metavar='N', help='input batch size for testing (default: 64)') parser.add_argument('--epochs', type=int, default=10, metavar='N', help='number of epochs to train (default: 14)') parser.add_argument('--lr', type=float, default=1.0, metavar='LR', help='learning rate (default: 1.0)') parser.add_argument( '--step', type=int, default=1, metavar='N', help='number of epochs between learning rate reductions (default: 1)') parser.add_argument('--gamma', type=float, default=0.7, metavar='M', help='Learning rate step gamma (default: 0.7)') parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument( '--log-interval', type=int, default=100, metavar='N', help='how many batches to wait before logging training status') parser.add_argument('--evaluate', action='store_true', default=False, help='evaluate your model on the official test set') parser.add_argument('--load-model', type=str, help='model file path') parser.add_argument('--save-model', action='store_true', default=True, help='For Saving the current Model') parser.add_argument('--test-datasize', action='store_true', default=False, help='train on different sizes of dataset') args = parser.parse_args() use_cuda = not args.no_cuda and torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} torch.manual_seed(args.seed) # Evaluate on the official test set # if args.evaluate: # assert os.path.exists(args.load_model) # # # Set the test model # model 
= Net().to(device) # model = M.resnet18(num_classes=99).to(device) # model.load_state_dict(torch.load(args.load_model)) # # test_dataset = datasets.MNIST('./data', train=False, # transform=transforms.Compose([ # transforms.ToTensor(), # transforms.Normalize((0.1307,), (0.3081,)) # ])) # # test_loader = torch.utils.data.DataLoader( # test_dataset, batch_size=args.test_batch_size, shuffle=True, **kwargs) # # test(model, device, test_loader, analysis=True) # # return # Pytorch has default MNIST dataloader which loads data at each iteration # train_dataset_no_aug = TrainDataset(True, 'data/imet-2020-fgvc7/labels.csv', # 'data/imet-2020-fgvc7/train_20country.csv', 'data/imet-2020-fgvc7/train/', # transform=transforms.Compose([ # Data preprocessing # transforms.ToPILImage(), # Add data augmentation here # transforms.RandomResizedCrop(128), # transforms.ToTensor(), # transforms.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)) # ])) train_dataset_no_aug = TrainDataset( True, 'data/imet-2020-fgvc7/labels.csv', 'data/imet-2020-fgvc7/train_20country.csv', 'data/imet-2020-fgvc7/train/', transform=transforms.Compose([ # Data preprocessing transforms.ToPILImage(), # Add data augmentation here transforms.Resize(255), transforms.RandomCrop(224), transforms.ToTensor(), transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) ])) train_dataset_with_aug = train_dataset_no_aug assert (len(train_dataset_no_aug) == len(train_dataset_with_aug)) # You can assign indices for training/validation or use a random subset for # training by using SubsetRandomSampler. Right now the train and validation # sets are built from the same indices - this is bad! Change it so that # the training and validation sets are disjoint and have the correct relative sizes. 
np.random.seed(args.seed) subset_indices_valid = np.random.choice(len(train_dataset_no_aug), int(0.15 * len(train_dataset_no_aug)), replace=False) subset_indices_train = [ i for i in range(len(train_dataset_no_aug)) if i not in subset_indices_valid ] # subset_indices_train = [] # subset_indices_valid = [] # for target in range(10): # idx = (train_dataset_no_aug.targets == target).nonzero() # indices for each class # idx = idx.numpy().flatten() # val_idx = np.random.choice( len(idx), int(0.15*len(idx)), replace=False ) # val_idx = np.ndarray.tolist(val_idx.flatten()) # train_idx = [i for i in range(len(idx)) if i not in val_idx] # subset_indices_train += np.ndarray.tolist(idx[train_idx]) # subset_indices_valid += np.ndarray.tolist(idx[val_idx]) assert (len(subset_indices_train) + len(subset_indices_valid)) == len(train_dataset_no_aug) assert len(np.intersect1d(subset_indices_train, subset_indices_valid)) == 0 train_loader = torch.utils.data.DataLoader( train_dataset_with_aug, batch_size=args.batch_size, sampler=SubsetRandomSampler(subset_indices_train)) val_loader = torch.utils.data.DataLoader( train_dataset_no_aug, batch_size=args.test_batch_size, sampler=SubsetRandomSampler(subset_indices_valid)) # Load your model [fcNet, ConvNet, Net] #model = Net().to(device) # model = M.resnet50(num_classes=20).to(device) # model.load_state_dict(torch.load(args.load_model)) model = M.resnet50(pretrained=True) model.fc = nn.Linear(model.fc.in_features, 20) model = model.to(device) # model.load_state_dict(torch.load(args.load_model)) # print(model) # summary(model, (1,28,28)) # Try different optimzers here [Adam, SGD, RMSprop] optimizer = optim.Adadelta(model.parameters(), lr=args.lr) # Set your learning rate scheduler scheduler = StepLR(optimizer, step_size=args.step, gamma=args.gamma) # if args.test_datasize: # train_final_loss = [] # val_final_loss = [] # train_size = [] # for i in [1, 2, 4, 8, 16]: # print("Dataset with size 1/{} of original: ".format(i)) # 
subset_indices_train_sub = np.random.choice(subset_indices_train, int(len(subset_indices_train)/i), replace=False) # train_loader_sub = torch.utils.data.DataLoader( # train_dataset_with_aug, batch_size=args.batch_size, # sampler=SubsetRandomSampler(subset_indices_train_sub) # ) # train_losses = [] # val_losses = [] # for epoch in range(1, args.epochs + 1): # train_loss = train(args, model, device, train_loader_sub, optimizer, epoch) # val_loss = validation(model, device, val_loader) # train_losses.append(train_loss) # val_losses.append(val_loss) # scheduler.step() # learning rate scheduler # # You may optionally save your model at each epoch here # print("Train Loss: ", train_losses) # print("Test Loss: ", val_losses) # print("\n") # train_final_loss.append(train_losses[-1]) # val_final_loss.append(val_losses[-1]) # train_size.append(int(len(subset_indices_train)/i)) # # plt.loglog(range(1, args.epochs + 1), train_losses) # plt.loglog(range(1, args.epochs + 1), val_losses) # plt.xlabel("Number of training examples") # plt.ylabel("Loss") # plt.legend(["Training loss", "Val loss"]) # plt.title("Training loss and val loss as a function of the number of training examples on log-log scale") # plt.show() # return # Training loop train_losses = [] val_losses = [] accuracies = [] for epoch in range(1, args.epochs + 1): train_loss = train(args, model, device, train_loader, optimizer, epoch) (accuracy, val_loss) = validation(model, device, val_loader) train_losses.append(train_loss) val_losses.append(val_loss) accuracies.append(accuracy) scheduler.step() # learning rate scheduler # You may optionally save your model at each epoch here if args.save_model: torch.save(model.state_dict(), "mnist_model.pt") plt.plot(range(1, args.epochs + 1), train_losses) plt.plot(range(1, args.epochs + 1), val_losses) plt.xlabel("Epoch") plt.ylabel("Loss") plt.legend(["Training loss", "Val loss"]) plt.title("Training loss and val loss as a function of the epoch") plt.show() plt.plot(range(1, 
args.epochs + 1), accuracies) plt.xlabel("Epoch") plt.ylabel("Accuracy") plt.legend(["Validation Accuracy"]) plt.title("Accuracy in validation set as a function of the epoch") plt.show()
def __init__(self, opt, embedding=None, state_dict=None):
    """Build the reader network, its optimizer, and an optional LR scheduler.

    Args:
        opt: dict of hyper-parameters (optimizer name, learning_rate,
            weight_decay, scheduler settings, embedding sizes, ...).
        embedding: optional pretrained embedding tensor forwarded to DNetwork.
        state_dict: optional checkpoint dict; may carry 'updates', 'network'
            and 'optimizer' entries to resume training.
    """
    self.opt = opt
    # Resume the update counter when restoring from a checkpoint.
    self.updates = state_dict[
        'updates'] if state_dict and 'updates' in state_dict else 0
    self.eval_embed_transfer = True
    self.train_loss = AverageMeter()
    self.network = DNetwork(opt, embedding)
    # Reload checkpoint parameters if a state dictionary was passed.
    if state_dict:
        new_state = set(self.network.state_dict().keys())
        # Drop checkpoint keys the current architecture no longer has.
        for k in list(state_dict['network'].keys()):
            if k not in new_state:
                del state_dict['network'][k]
        # Backfill parameters missing from the checkpoint with current values.
        for k, v in list(self.network.state_dict().items()):
            if k not in state_dict['network']:
                state_dict['network'][k] = v
        self.network.load_state_dict(state_dict['network'])

    # Select optimizer over trainable parameters only.
    parameters = [p for p in self.network.parameters() if p.requires_grad]
    if opt['optimizer'] == 'sgd':
        self.optimizer = optim.SGD(parameters,
                                   opt['learning_rate'],
                                   momentum=opt['momentum'],
                                   weight_decay=opt['weight_decay'])
    elif opt['optimizer'] == 'adamax':
        self.optimizer = optim.Adamax(parameters,
                                      opt['learning_rate'],
                                      weight_decay=opt['weight_decay'])
    elif opt['optimizer'] == 'adam':
        self.optimizer = optim.Adam(parameters,
                                    opt['learning_rate'],
                                    weight_decay=opt['weight_decay'])
    elif opt['optimizer'] == 'adadelta':
        self.optimizer = optim.Adadelta(parameters,
                                        opt['learning_rate'],
                                        rho=0.95)
    else:
        raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer'])
    if state_dict and 'optimizer' in state_dict:
        self.optimizer.load_state_dict(state_dict['optimizer'])

    # Size of the frozen word-embedding slice, excluded from the param count.
    if opt['fix_embeddings']:
        wvec_size = 0
    else:
        wvec_size = (opt['vocab_size'] -
                     opt['tune_partial']) * opt['embedding_dim']

    if opt.get('have_lr_scheduler', False):
        if opt.get('scheduler_type', 'rop') == 'rop':
            self.scheduler = ReduceLROnPlateau(self.optimizer,
                                               mode='max',
                                               factor=opt['lr_gamma'],
                                               patience=3)
        elif opt.get('scheduler_type', 'rop') == 'exp':
            # BUG FIX: was the misspelled name 'ExponentioalLR', which raised
            # NameError as soon as the 'exp' scheduler branch was taken.
            # The correct class (used elsewhere in this file) is ExponentialLR.
            self.scheduler = ExponentialLR(self.optimizer,
                                           gamma=opt.get('lr_gamma', 0.5))
        else:
            milestones = [
                int(step)
                for step in opt.get('multi_step_lr', '10,20,30').split(',')
            ]
            self.scheduler = MultiStepLR(self.optimizer,
                                         milestones=milestones,
                                         gamma=opt.get('lr_gamma'))
    else:
        self.scheduler = None
    self.total_param = sum([p.nelement() for p in parameters]) - wvec_size
def train(data):
    """Train a SeqLabel or SentClassifier model for data.HP_iteration epochs.

    Saves the dataset config next to the model dir, builds the optimizer named
    by data.optimizer, and after every epoch evaluates on dev (saving the model
    whenever the dev score improves) and on test.
    """
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir + ".dset"
    data.save(save_data_name)
    # Pick the model class by task type.
    if data.sentence_classification:
        model = SentClassifier(data)
    else:
        model = SeqLabel(data)
    # Optimizer is chosen case-insensitively from the config string.
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(1)
    best_dev = -10  # sentinel below any real dev score
    # data.HP_iteration = 1
    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" % (idx, data.HP_iteration))
        # NOTE: comparison is case-sensitive here ("SGD") while the optimizer
        # selection above lowercases — LR decay only fires for exactly "SGD".
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        print("Shuffle: first input word list:", data.train_Ids[0][0])
        ## set model in train model
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            # Clamp the final (possibly short) batch.
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(
                instance, data.HP_gpu, True,
                data.sentence_classification)
            instance_count += 1
            loss, tag_seq = model.calculate_loss(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask)
            right, whole = predict_check(tag_seq, batch_label, mask, data.sentence_classification)
            right_token += right
            whole_token += whole
            # print("loss:",loss.item())
            sample_loss += loss.item()
            total_loss += loss.item()
            # Periodic progress log; `end % 500 == 0` only triggers when the
            # instance count lands exactly on a multiple of 500.
            if end % 500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                # NOTE(review): (right_token + 0.) / whole_token divides by
                # zero if no tokens were counted yet — confirm upstream
                # guarantees whole_token > 0 here.
                print(
                    " Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"
                    % (end, temp_cost, sample_loss, right_token, whole_token,
                       (right_token + 0.) / whole_token))
                # Abort on loss explosion / NaN.
                if sample_loss > 1e8 or str(sample_loss) == "nan":
                    print(
                        "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
                    )
                    exit(1)
                sys.stdout.flush()
                sample_loss = 0
            loss.backward()
            optimizer.step()
            model.zero_grad()
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" %
              (end, temp_cost, sample_loss, right_token, whole_token,
               (right_token + 0.) / whole_token))
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print(
            "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s"
            % (idx, epoch_cost, train_num / epoch_cost, total_loss))
        print("totalloss:", total_loss)
        if total_loss > 1e8 or str(total_loss) == "nan":
            print(
                "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
            )
            exit(1)
        # continue
        # Evaluate on the dev split; choose F1 or accuracy as the model
        # selection score depending on data.seg.
        speed, acc, p, r, f, _, _, bal_acc, cm = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish
        if data.seg:
            current_score = f
            print(
                "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, bal_acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (dev_cost, speed, acc, bal_acc, p, r, f))
            print(cm)  #cm.tabulate()
        else:
            current_score = acc
            print(
                "Dev: time: %.2fs speed: %.2fst/s; acc: %.4f; bal_acc: %.4f"
                % (dev_cost, speed, acc, bal_acc))
            print(cm)  #cm.tabulate()
        # Persist a checkpoint whenever the dev score improves.
        if current_score > best_dev:
            if data.seg:
                print("Exceed previous best f score:", best_dev)
            else:
                print("Exceed previous best acc score:", best_dev)
            model_name = data.model_dir + '.' + str(idx) + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
        # ## decode test — reported every epoch regardless of dev improvement.
        speed, acc, p, r, f, _, _, bal_acc, cm = evaluate(data, model, "test")
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print(
                "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, bal_acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (test_cost, speed, acc, bal_acc, p, r, f))
            print(cm)  #cm.tabulate()
        else:
            print(
                "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f; bal_acc: %.4f"
                % (test_cost, speed, acc, bal_acc))
            print(cm)  #cm.tabulate()
        gc.collect()
rel_feat_extractor, rel_decoder, bin_rel_decoder, gcn, vocab, config.schedule_k, config.use_cuda, config.max_entity_num) util.assign_embeddings(word_encoder.word_embeddings, pretrained_embeddings) if config.use_cuda: mymodel.cuda() if os.path.exists(config.load_model_path): state_dict = torch.load(open(config.load_model_path, "rb"), map_location=lambda storage, loc: storage) mymodel.load_state_dict(state_dict) print("Loading previous model successful [%s]" % config.load_model_path) parameters = [p for p in mymodel.parameters() if p.requires_grad] optimizer = optim.Adadelta(parameters) def create_batch_list(sort_batch_tensor: Dict[str, Any], outputs: Dict[str, Any]) -> List[Dict[str, Any]]: new_batch = [] for k in range(len(outputs['ent_span_pred'])): instance = {} instance['tokens'] = sort_batch_tensor['tokens'][k].cpu().numpy() instance['ent_labels'] = sort_batch_tensor['ent_labels'][k].cpu( ).numpy() instance['ent_span_labels'] = sort_batch_tensor['ent_span_labels'][ k].cpu().numpy() instance['candi_rels'] = sort_batch_tensor['candi_rels'][k] instance['rel_labels'] = sort_batch_tensor['rel_labels'][k]
def train(opt):
    """Full training loop for the SAModel video-captioning network.

    Supports resuming from opt.start_from, scheduled sampling, optional
    self-critical (RL) training after opt.self_critical_after epochs,
    periodic validation/checkpointing, and early stopping on opt.patience.
    """
    # load train/valid/test data
    opt.vocab_size = get_nwords(opt.data_path)
    opt.category_size = get_nclasses(opt.data_path)
    mytrain_dset, myvalid_dset, mytest_dset = loaddset(opt)
    writer = SummaryWriter(opt.checkpoint_path)
    # init or load training infos
    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        # NOTE(review): pickle files are normally opened in 'rb' mode; this
        # text-mode open works only on Python 2 — confirm the target runtime.
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '-best.pkl')) as f:
            infos = cPickle.load(f)
        saved_model_opt = infos['opt']
        need_be_same = ["rnn_size", "num_layers"]  # optim needn't same
        for checkme in need_be_same:
            assert vars(saved_model_opt)[checkme] == vars(
                opt
            )[checkme], "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(
                os.path.join(opt.start_from,
                             'histories_' + opt.id + '-best.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '-best.pkl')) as f:
                histories = cPickle.load(f)
        # random seed must be inherited if didn't assign it.
        if opt.seed == 0:
            opt.seed = infos['opt'].seed
    # Resume counters / histories (empty dicts when starting fresh).
    iteration = infos.get('iter', 0) + 1
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    else:
        best_val_score = None
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    model = SAModel(opt)
    if opt.start_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(
            opt.start_from), " %s must be a a path" % opt.start_from
        model.load_state_dict(torch.load(
            os.path.join(opt.start_from, 'model-best.pth')),
                              strict=True)
    model.cuda()
    model.train()
    crit = LanguageModelCriterion()  # evaluates the generated caption
    classify_crit = ClassiferCriterion()  # evaluates the classification result
    rl_crit = RewardCriterion()  # RL training
    # select optimizer
    if opt.optim == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=opt.learning_rate,
                               weight_decay=opt.weight_decay)
    elif opt.optim == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=1.0,
                                   weight_decay=opt.weight_decay)
        # Adadelta adapts its own step size, so LR decay is disabled.
        opt.learning_rate_decay_start = -1
    # training start
    tmp_patience = 0  # epochs-without-improvement counter for early stopping
    # each epoch
    while True:
        update_lr_flag = True  # when a new epoch start, set update_lr_flag to True
        if update_lr_flag:
            # Assign the learning rate (stepwise exponential decay).
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0 and opt.optim != 'adadelta':
                frac = int((epoch - opt.learning_rate_decay_start) /
                           opt.learning_rate_decay_every)
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                myutils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
                #print('epoch {}, lr_decay_start {}, cur_lr {}'.format(epoch, opt.learning_rate_decay_start, opt.current_lr))
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = int((epoch - opt.scheduled_sampling_start) /
                           opt.scheduled_sampling_increase_every)
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                myutils.init_cider_scorer(opt.reward_type)
            else:
                sc_flag = False
            update_lr_flag = False
        #loading train data (a fresh DataLoader each epoch re-shuffles)
        myloader_train = DataLoader(mytrain_dset,
                                    batch_size=opt.batch_size,
                                    collate_fn=data_io.collate_fn,
                                    shuffle=True)
        torch.cuda.synchronize()
        for data, cap, cap_mask, cap_classes, class_mask, feat1, feat2, feat_mask, pos_feat, lens, groundtruth, image_id in myloader_train:
            start = time.time()
            cap = Variable(cap, requires_grad=False).cuda()
            cap_mask = Variable(cap_mask, requires_grad=False).cuda()
            cap_classes = Variable(cap_classes, requires_grad=False).cuda()
            class_mask = Variable(class_mask, requires_grad=False).cuda()
            feat1 = Variable(feat1,
                             requires_grad=False).cuda()
            feat2 = Variable(feat2, requires_grad=False).cuda()
            feat_mask = Variable(feat_mask, requires_grad=False).cuda()
            pos_feat = Variable(pos_feat, requires_grad=False).cuda()
            optimizer.zero_grad()
            if not sc_flag:
                # Cross-entropy phase: language loss + weighted class loss.
                out, category = model(
                    feat1, feat2, feat_mask, pos_feat, cap,
                    cap_mask)  # (B,seq_len+1,29324),(B,seq_len+1,14)
                loss_language = crit(out, cap, cap_mask)
                loss_classify = classify_crit(category, cap_classes, cap_mask,
                                              class_mask)
                # print(loss_language.data[0], loss_classify.data[0])
                # When weight_class is 0, POS-information generation is not
                # trained any more; only the caption is trained.
                loss = loss_language + opt.weight_class * loss_classify
            else:
                # Self-critical phase: sample, compute reward, policy gradient.
                gen_result, sample_logprobs = model.sample(
                    feat1, feat2, feat_mask, pos_feat, {'sample_max': 0})
                reward = myutils.get_self_critical_reward(
                    model, feat1, feat2, feat_mask, pos_feat, groundtruth,
                    gen_result)  # (m,max_length)
                loss = rl_crit(
                    sample_logprobs, gen_result,
                    Variable(torch.from_numpy(reward).float().cuda(),
                             requires_grad=False))
            loss.backward()
            myutils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            # NOTE(review): loss.data[0] is the pre-0.4 PyTorch scalar API —
            # confirm the pinned torch version (loss.item() on >=0.4).
            train_loss = loss.data[0]
            torch.cuda.synchronize()
            end = time.time()
            if not sc_flag:
                print(
                    "iter {} (epoch {}), train_loss = {:.3f}, loss_lang = {:.3f}, loss_class = {:.3f}, time/batch = {:.3f}"
                    .format(iteration, epoch, train_loss,
                            loss_language.data[0], loss_classify.data[0],
                            end - start))
            else:
                print(
                    "iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}"
                    .format(iteration, epoch, np.mean(reward[:, 0]),
                            end - start))
            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                writer.add_scalar('train_loss', train_loss, iteration)
                writer.add_scalar('learning_rate', opt.current_lr, iteration)
                writer.add_scalar('scheduled_sampling_prob', model.ss_prob,
                                  iteration)
                if sc_flag:
                    writer.add_scalar('avg_reward', np.mean(reward[:, 0]),
                                      iteration)
                loss_history[
                    iteration] = train_loss if not sc_flag else np.mean(
                        reward[:, 0])
                lr_history[iteration] = opt.current_lr
                ss_prob_history[iteration] = model.ss_prob
            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0):
                # eval model
                print('validation and save the model...')
                time.sleep(3)
                eval_kwargs = {}
                eval_kwargs.update(
                    vars(opt))  # attend vars(opt) into eval_kwargs
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    model, crit, classify_crit, myvalid_dset, eval_kwargs)
                print('validation is finish!')
                time.sleep(3)
                writer.add_scalar('validation loss', val_loss, iteration)
                if opt.language_eval == 1:
                    for tag, value in lang_stats.items():
                        if type(value) is list:
                            writer.add_scalar(tag, value[-1], iteration)
                        else:
                            writer.add_scalar(tag, value, iteration)
                # Histogram every parameter and its gradient (skip params
                # without gradients via the AttributeError catch).
                for tag, value in model.named_parameters():
                    try:
                        tag = tag.replace('.', '/')
                        writer.add_histogram(tag,
                                             value.data.cpu().numpy(),
                                             iteration)
                        writer.add_histogram(
                            tag + '/grad',
                            (value.grad).data.cpu().numpy(), iteration)
                    except AttributeError:
                        continue
                val_result_history[iteration] = {
                    'loss': val_loss,
                    'lang_stats': lang_stats,
                    'predictions': predictions
                }
                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                else:
                    current_score = -val_loss
                best_flag = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                    tmp_patience = 0
                else:
                    tmp_patience += 1
                if not os.path.exists(opt.checkpoint_path):
                    os.mkdir(opt.checkpoint_path)
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                # Dump miscellaneous information (current training state)
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['val_score'] = lang_stats
                infos['val_sents'] = predictions
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)
                # Additionally snapshot "-best" copies when improving.
                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'histories_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(histories, f)
                if tmp_patience >= opt.patience:
                    break
            iteration += 1
        # Early stop: patience exhausted at the last validation.
        if tmp_patience >= opt.patience:
            print("early stop, trianing is finished!")
            break
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            print("reach max epochs, training is finished!")
            break
        epoch += 1
def main():
    """CLI entry point: parse options, build MNIST loaders, train and test."""
    # Training settings
    cli = argparse.ArgumentParser(description='PyTorch MNIST Example')
    cli.add_argument('--batch-size', type=int, default=64, metavar='N',
                     help='input batch size for training (default: 64)')
    cli.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                     help='input batch size for testing (default: 1000)')
    cli.add_argument('--epochs', type=int, default=14, metavar='N',
                     help='number of epochs to train (default: 14)')
    cli.add_argument('--lr', type=float, default=1.0, metavar='LR',
                     help='learning rate (default: 1.0)')
    cli.add_argument('--gamma', type=float, default=0.7, metavar='M',
                     help='Learning rate step gamma (default: 0.7)')
    cli.add_argument('--no-cuda', action='store_true', default=False,
                     help='disables CUDA training')
    cli.add_argument('--dry-run', action='store_true', default=False,
                     help='quickly check a single pass')
    cli.add_argument('--seed', type=int, default=1, metavar='S',
                     help='random seed (default: 1)')
    cli.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    cli.add_argument('--save-model', action='store_true', default=False,
                     help='For Saving the current Model')
    args = cli.parse_args()

    on_gpu = torch.cuda.is_available() and not args.no_cuda
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if on_gpu else "cpu")

    # DataLoader options; extra worker/pinning/shuffle settings on GPU only.
    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if on_gpu:
        for opts in (train_kwargs, test_kwargs):
            opts.update({'num_workers': 1, 'pin_memory': True, 'shuffle': True})

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307, ), (0.3081, )),
    ])
    mnist_train = datasets.MNIST('./data', train=True, download=True,
                                 transform=transform)
    mnist_test = datasets.MNIST('./data', train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(mnist_train, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(mnist_test, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

    # One train + test pass per epoch, stepping the LR schedule afterwards.
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
def __init__(self, opt, state_dict=None, num_train_step=-1):
    """Build the SAN-BERT network, optimizer, LR scheduler and optional EMA.

    Args:
        opt: config dict (optimizer name, learning_rate, fp16 flags,
            scheduler settings, cuda/multi-GPU flags, ...).
        state_dict: optional checkpoint dict with 'state' / 'optimizer'
            / 'updates' entries for resuming.
        num_train_step: total number of training steps, forwarded as
            t_total to the warmup-aware Adam/Adamax optimizers.
    """
    self.config = opt
    # Resume the update counter when restoring from a checkpoint.
    self.updates = state_dict['updates'] if state_dict and 'updates' in state_dict else 0
    self.local_updates = 0
    self.train_loss = AverageMeter()
    self.network = SANBertNetwork(opt)
    if state_dict:
        self.network.load_state_dict(state_dict['state'], strict=False)
    self.mnetwork = nn.DataParallel(self.network) if opt['multi_gpu_on'] else self.network
    self.total_param = sum([p.nelement() for p in self.network.parameters() if p.requires_grad])
    if opt['cuda']:
        self.network.cuda()
    # Standard BERT recipe: no weight decay on biases and LayerNorm params.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [
        {'params': [p for n, p in self.network.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in self.network.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    # note that adamax are modified based on the BERT code
    if opt['optimizer'] == 'sgd':
        # BUG FIX: was optim.sgd — torch.optim has no lowercase 'sgd'
        # attribute, so selecting this optimizer raised AttributeError.
        self.optimizer = optim.SGD(optimizer_parameters, opt['learning_rate'],
                                   weight_decay=opt['weight_decay'])
    elif opt['optimizer'] == 'adamax':
        self.optimizer = Adamax(optimizer_parameters,
                                opt['learning_rate'],
                                warmup=opt['warmup'],
                                t_total=num_train_step,
                                max_grad_norm=opt['grad_clipping'],
                                schedule=opt['warmup_schedule'])
        # Warmup schedule is built in; disable the external LR scheduler.
        if opt.get('have_lr_scheduler', False):
            opt['have_lr_scheduler'] = False
    elif opt['optimizer'] == 'adadelta':
        self.optimizer = optim.Adadelta(optimizer_parameters,
                                        opt['learning_rate'],
                                        rho=0.95)
    elif opt['optimizer'] == 'adam':
        self.optimizer = Adam(optimizer_parameters,
                              lr=opt['learning_rate'],
                              warmup=opt['warmup'],
                              t_total=num_train_step,
                              max_grad_norm=opt['grad_clipping'],
                              schedule=opt['warmup_schedule'])
        # Warmup schedule is built in; disable the external LR scheduler.
        if opt.get('have_lr_scheduler', False):
            opt['have_lr_scheduler'] = False
    else:
        raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer'])
    if state_dict and 'optimizer' in state_dict:
        self.optimizer.load_state_dict(state_dict['optimizer'])
    if opt['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        # amp wraps both network and optimizer for mixed precision.
        model, optimizer = amp.initialize(self.network, self.optimizer, opt_level=opt['fp16_opt_level'])
        self.network = model
        self.optimizer = optimizer
    if opt.get('have_lr_scheduler', False):
        if opt.get('scheduler_type', 'rop') == 'rop':
            self.scheduler = ReduceLROnPlateau(self.optimizer, mode='max', factor=opt['lr_gamma'], patience=3)
        elif opt.get('scheduler_type', 'rop') == 'exp':
            self.scheduler = ExponentialLR(self.optimizer, gamma=opt.get('lr_gamma', 0.95))
        else:
            milestones = [int(step) for step in opt.get('multi_step_lr', '10,20,30').split(',')]
            self.scheduler = MultiStepLR(self.optimizer, milestones=milestones, gamma=opt.get('lr_gamma'))
    else:
        self.scheduler = None
    # Optional exponential moving average of the parameters.
    self.ema = None
    if opt['ema_opt'] > 0:
        self.ema = EMA(self.config['ema_gamma'], self.network)
        if opt['cuda']:
            self.ema.cuda()
    self.para_swapped = False
    # zero optimizer grad
    self.optimizer.zero_grad()
def train(self):
    """Train self.net for segmentation with per-pixel cross-entropy.

    Periodically writes sample prediction images (every self.sample_step
    steps) and checkpoints (every self.checkpoint_step steps).
    """
    # input must be (N, C), target (N)
    CrossEntropyLoss = nn.CrossEntropyLoss().to(self.device)
    optimizer = optim.Adadelta(self.net.parameters(), lr=self.lr, rho=0.95, eps=1e-07)
    counter = self.counter
    self.checkpoints_to_keep = []
    start_time = time.time()
    for epoch in range(self.epoch):
        # NOTE(review): this sets lr to self.lr/10 at EVERY decay epoch, so
        # the rate never drops below lr/10 — confirm a single-step decay is
        # intended rather than cumulative division.
        if epoch != 0 and epoch % self.decay_epoch == 0:
            optimizer.param_groups[0]['lr'] = self.lr / 10
            print('learning rate decayed')
        for step, (imgs, masks) in enumerate(self.train_loader):
            imgs, masks = imgs.to(self.device), masks.to(self.device)
            preds = self.net(imgs)
            # imgs.shape (N, 3, 224, 224)
            # masks.shape (N, 1, 224, 224)
            # preds.shape (N, 2, 224, 224)
            # Flatten to (N*H*W, C) logits and (N*H*W,) targets for CE loss.
            preds_flat = preds.permute(0, 2, 3, 1).contiguous().view(-1, self.num_classes)
            masks_flat = masks.squeeze(1).view(-1).long()
            # preds_flat.shape (N*224*224, 2)
            # masks_flat.shape (N*224*224, 1)
            self.net.zero_grad()
            loss = CrossEntropyLoss(preds_flat, masks_flat)
            loss.backward()
            optimizer.step()
            counter += 1
            step_end_time = time.time()
            print('[%d/%d][%d/%d] - time_passed: %.2f, CrossEntropyLoss: %.2f'
                  % (epoch, self.epoch, step, self.num_steps,
                     step_end_time - start_time, loss))
            # save sample images
            if step % self.sample_step == 0:
                for num, (imgs, masks) in enumerate(self.test_loader):
                    imgs, masks = imgs.to(self.device), masks.to(self.device)
                    preds = self.net(imgs)
                    # Undo the (0.5, 0.5) normalization for visualization.
                    inverse_normalize = transforms.Normalize(mean=[-0.5 / 0.5, -0.5 / 0.5, -0.5 / 0.5], std=[1 / 0.5, 1 / 0.5, 1 / 0.5])
                    # [:,:,::-1] converts RGB to BGR for cv2.imwrite.
                    imgs = inverse_normalize(imgs[0]).permute(1, 2, 0).detach().cpu().numpy()[:,:,::-1] * 255
                    masks = masks[0].repeat(3, 1, 1).permute(1, 2, 0).detach().cpu().numpy() * 255
                    # argmax over the class channel gives the predicted mask.
                    preds = torch.argmax(preds[0], 0).unsqueeze(2).repeat(1, 1, 3).detach().cpu().numpy() * 255
                    if not os.path.exists(os.path.join(self.sample_dir, self.model_dir())):
                        os.makedirs(os.path.join(self.sample_dir, self.model_dir()))
                    # Side-by-side panel: input | ground truth | prediction.
                    samples = np.hstack((imgs, masks, preds))
                    cv2.imwrite('{}/{}/sample_{}-{}.png'.format(self.sample_dir, self.model_dir(), num, counter), samples)
                    print('Saved images')
            # save checkpoints
            if step % self.checkpoint_step == 0:
                if not os.path.exists(os.path.join(self.checkpoint_dir, self.model_dir())):
                    os.makedirs(os.path.join(self.checkpoint_dir, self.model_dir()))
                self.save_checkpoint(counter, self.ckpt_max_to_keep)
                print("Saved checkpoint")
def train(config):
    """Maximum Classifier Discrepancy (MCD) domain-adaptation training.

    Trains a shared feature extractor G and two classifiers F1/F2 on a
    labeled source domain and an unlabeled target domain:
      Step A: minimize source CE (+ target entropy term) for G, F1, F2.
      Step B: maximize F1/F2 discrepancy on target data (G fixed).
      Step C: minimize the discrepancy w.r.t. G, repeated num_k times.

    Args:
        config: dict of settings (paths, lr, optimizer, num_k, cuda, ...).
    """
    #######################################################
    # ENV
    #######################################################
    use_gpu = torch.cuda.is_available()
    torch.manual_seed(config['seed'])
    if config['cuda']:
        torch.cuda.manual_seed(config['seed'])
    save_path = config['save_path']
    #####################################################
    # DATA
    #####################################################
    source_path = config['source_path']
    target_path = config['target_path']
    num_k = config['num_k']
    num_layer = config['num_layer']
    batch_size = config['batch_size']
    data_transforms = {
        source_path:
        transforms.Compose([
            transforms.Resize(256),
            transforms.RandomHorizontalFlip(),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ]),
        target_path:
        transforms.Compose([
            transforms.Resize(256),
            transforms.RandomHorizontalFlip(),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ]),
    }
    dsets = {
        source_path:
        datasets.ImageFolder(os.path.join(source_path),
                             data_transforms[source_path]),
        target_path:
        datasets.ImageFolder(os.path.join(target_path),
                             data_transforms[target_path])
    }
    # Paired source/target loader for training, and a shuffled one for test.
    train_loader = CVDataLoader()
    train_loader.initialize(dsets[source_path], dsets[target_path],
                            batch_size)  #CVDataLoader.initialize
    dataset = train_loader.load_data()  #CVDataLoader.load_data()
    test_loader = CVDataLoader()  #opt = args
    test_loader.initialize(dsets[source_path], dsets[target_path], batch_size,
                           shuffle=True)
    dataset_test = test_loader.load_data()
    dset_sizes = {
        source_path: len(dsets[source_path]),
        target_path: len(dsets[target_path])
    }
    dset_classes = dsets[source_path].classes
    print('classes' + str(dset_classes))
    option = 'resnet' + config['resnet']
    G = ResBase(option)
    F1 = ResClassifier(num_classes=config['num_classes'],
                       num_layer=config['num_layer'],
                       num_unit=config['num_unit'],
                       prob=config['prob'],
                       middle=config['middle'])
    F2 = ResClassifier(num_classes=config['num_classes'],
                       num_layer=config['num_layer'],
                       num_unit=config['num_unit'],
                       prob=config['prob'],
                       middle=config['middle'])
    F1.apply(weights_init)
    F2.apply(weights_init)
    lr = config['lr']
    if config['cuda']:
        G.cuda()
        F1.cuda()
        F2.cuda()
    if config['optimizer'] == 'momentum':
        optimizer_g = optim.SGD(list(G.features.parameters()),
                                lr=config['lr'],
                                weight_decay=0.0005)
        optimizer_f = optim.SGD(list(F1.parameters()) + list(F2.parameters()),
                                momentum=0.9,
                                lr=config['lr'],
                                weight_decay=0.0005)
    elif config['optimizer'] == 'adam':
        optimizer_g = optim.Adam(G.features.parameters(),
                                 lr=config['lr'],
                                 weight_decay=0.0005)
        optimizer_f = optim.Adam(list(F1.parameters()) + list(F2.parameters()),
                                 lr=config['lr'],
                                 weight_decay=0.0005)
    else:
        # BUG FIX: this branch referenced `args.lr`, but no `args` exists in
        # this function (the rest of it reads config['lr']); taking the
        # Adadelta fallback raised NameError.
        optimizer_g = optim.Adadelta(G.features.parameters(),
                                     lr=config['lr'],
                                     weight_decay=0.0005)
        optimizer_f = optim.Adadelta(list(F1.parameters()) +
                                     list(F2.parameters()),
                                     lr=config['lr'],
                                     weight_decay=0.0005)
    criterion = nn.CrossEntropyLoss().cuda()
    for ep in range(config['num_epoch']):
        G.train()
        F1.train()
        F2.train()
        for batch_idx, data in enumerate(dataset):
            # TODO(review): unclear why this 30000-sample cap exists
            # (translated from the original Korean note).
            if batch_idx * batch_size > 30000:
                break
            if config['cuda']:
                data1 = data['S']
                target1 = data['S_label']
                data2 = data['T']
                target2 = data['T_label']
                data1, target1 = data1.cuda(), target1.cuda()
                data2, target2 = data2.cuda(), target2.cuda()
            # NOTE(review): with config['cuda'] False, data1/data2 are never
            # bound — this loop only works on the CUDA path; confirm intended.
            eta = 1.0
            # Source and target batches are concatenated and fed together.
            data = Variable(torch.cat((data1, data2), 0))
            target1 = Variable(target1)
            # Step A: train G, F1, F2 on the source data.
            optimizer_g.zero_grad()
            optimizer_f.zero_grad()
            output = G(data)  # source and target passed through G together
            output1 = F1(output)
            output_s1 = output1[:batch_size, :]  # source-data slice
            loss1 = criterion(output_s1, target1)  # source cross-entropy
            output_t1 = output1[batch_size:, :]  # target-data logits
            output_t1 = F.softmax(output_t1)  # target softmax
            # Entropy-style regularizer on the mean target distribution.
            entropy_loss = -torch.mean(
                torch.log(torch.mean(output_t1, 0) + 1e-6))
            output2 = F2(output)
            output_s2 = output2[:batch_size, :]  # source data
            loss2 = criterion(output_s2, target1)  # source cross-entropy
            output_t2 = output2[batch_size:, :]  # target-data logits
            output_t2 = F.softmax(output_t2)  # target softmax
            entropy_loss = entropy_loss - torch.mean(
                torch.log(torch.mean(output_t2, 0) + 1e-6))
            # Combined entropy of F1 and F2 (the paper's class-balance loss?).
            all_loss = loss1 + loss2 + 0.01 * entropy_loss
            all_loss.backward()
            optimizer_g.step()
            optimizer_f.step()
            # Step B: train F1, F2 so their disagreement on target data is
            # maximized; G's parameters stay fixed (only optimizer_f steps).
            optimizer_g.zero_grad()
            optimizer_f.zero_grad()
            output = G(data)
            output1 = F1(output)
            output_s1 = output1[:batch_size, :]
            loss1 = criterion(output_s1, target1)
            output_t1 = output1[batch_size:, :]
            output_t1 = F.softmax(output_t1)
            entropy_loss = -torch.mean(
                torch.log(torch.mean(output_t1, 0) + 1e-6))
            output2 = F2(output)
            output_s2 = output2[:batch_size, :]
            loss2 = criterion(output_s2, target1)
            output_t2 = output2[batch_size:, :]
            output_t2 = F.softmax(output_t2)
            entropy_loss = entropy_loss - torch.mean(
                torch.log(torch.mean(output_t2, 0) + 1e-6))
            # L1 discrepancy between the two classifiers on target data.
            loss_dis = torch.mean(torch.abs(output_t1 - output_t2))
            F_loss = loss1 + loss2 - eta * loss_dis + 0.01 * entropy_loss
            F_loss.backward()
            optimizer_f.step()
            # Step C: train G so that F1/F2 discrepancy shrinks; this step
            # is repeated num_k times per batch.
            for i in range(num_k):
                optimizer_g.zero_grad()
                output = G(data)
                output1 = F1(output)
                output_s1 = output1[:batch_size, :]
                loss1 = criterion(output_s1, target1)
                output_t1 = output1[batch_size:, :]
                output_t1 = F.softmax(output_t1)
                entropy_loss = -torch.mean(
                    torch.log(torch.mean(output_t1, 0) + 1e-6))
                output2 = F2(output)
                output_s2 = output2[:batch_size, :]
                loss2 = criterion(output_s2, target1)
                output_t2 = output2[batch_size:, :]
                output_t2 = F.softmax(output_t2)
                entropy_loss = entropy_loss - torch.mean(
                    torch.log(torch.mean(output_t2, 0) + 1e-6))
                # NOTE: the entropy term is computed but only the discrepancy
                # is backpropagated here (original author questioned this too).
                loss_dis = torch.mean(torch.abs(output_t1 - output_t2))
                loss_dis.backward()
                optimizer_g.step()
            if batch_idx % config['log_interval'] == 0:
                print(
                    'Train Ep: {} [{}/{} ({:.0f}%)]\tLoss1: {:.6f}\tLoss2: {:.6f}\t Dis: {:.6f} Entropy: {:.6f}'
                    .format(ep, batch_idx * len(data), 70000,
                            100. * batch_idx / 70000, loss1.data[0],
                            loss2.data[0], loss_dis.data[0],
                            entropy_loss.data[0]))
            if batch_idx == 1 and ep > 1:
                test(test_loader, dataset_test, ep, config)
# Check for GPU availability: device = my_models.device_gpu_cpu() print('using device:', device) dtype = torch.float32 # we will be using float train = True test = True model = None if train: # Create models: model = my_models.model_2() my_models.test_model_size(model, dtype) # test model size output: optimizer = optim.Adadelta(model.parameters()) # Train model: model, loss_data = my_models.train_model(model, optimizer, train_loader, val_loader, device, dtype, epoches=2, print_every=5) # Save model to file: torch.save(model.state_dict(), MODEL_PATH + MODEL_NAME) # Save loss data to file:
def main():
    """Train a small CNN on MNIST with Adadelta + StepLR.

    Parses CLI arguments, builds the train/test DataLoaders, logs a sample
    image grid and the model graph to the module-level TensorBoard `writer`,
    then runs `train`/`test` (defined elsewhere in this file) for each epoch.
    Optionally saves the trained weights to ``mnist_cnn.pt``.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    # pin_memory only helps when copying host->GPU.
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                              batch_size=args.test_batch_size,
                                              shuffle=True,
                                              **kwargs)

    # get some random training images
    dataiter = iter(train_loader)
    # FIX: `dataiter.next()` is the removed Python-2-style iterator call; the
    # builtin next() works with any iterator (old and new DataLoader alike).
    images, labels = next(dataiter)

    # show batch images
    grid = torchvision.utils.make_grid(images)
    writer.add_image('images', grid, 0)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    # show model graph (images are on CPU here; NOTE(review): if add_graph
    # traces on `device`, inputs may need .to(device) — confirm torch version)
    writer.add_graph(model, images)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")

    # close writer
    writer.close()
def main(opt, case):
    """Train a CRNN handwriting recognizer with CTC loss.

    Args:
        opt:  parsed CLI options (experiment dir, cuda flags, optimizer
              choice, loader settings, checkpoint path in ``opt.crnn``, ...).
        case: short label appended to the periodic loss printout.

    Side effects: creates the experiment directory, seeds all RNGs, trains
    for ``opt.niter`` epochs and periodically validates / checkpoints.
    """
    print("Arguments are : " + str(opt))

    if opt.experiment is None:
        opt.experiment = 'expr'
    os.system('mkdir {0}'.format(opt.experiment))

    # Why do we use this?
    opt.manualSeed = random.randint(1, 10000)  # fix seed
    print("Random Seed: ", opt.manualSeed)
    random.seed(opt.manualSeed)
    np.random.seed(opt.manualSeed)
    torch.manual_seed(opt.manualSeed)

    cudnn.benchmark = True

    if torch.cuda.is_available() and not opt.cuda:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )
        opt.cuda = True
        print('Set CUDA to true.')

    train_dataset = dataset.hwrDataset(mode="train")
    assert train_dataset

    # The shuffle needs to be false when the sizing has been done.
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opt.batchSize,
                                               shuffle=False,
                                               num_workers=int(opt.workers),
                                               collate_fn=dataset.alignCollate(
                                                   imgH=opt.imgH,
                                                   imgW=opt.imgW,
                                                   keep_ratio=True))

    test_dataset = dataset.hwrDataset(mode="test",
                                      transform=dataset.resizeNormalize(
                                          (100, 32)))

    nclass = len(opt.alphabet) + 1  # +1 for the CTC blank symbol
    nc = 1  # single-channel (grayscale) input
    criterion = CTCLoss()

    # custom weights initialization called on crnn
    def weights_init(m):
        classname = m.__class__.__name__
        if classname.find('Conv') != -1:
            m.weight.data.normal_(0.0, 0.02)
        elif classname.find('BatchNorm') != -1:
            m.weight.data.normal_(1.0, 0.02)
            m.bias.data.fill_(0)

    crnn = crnn_model.CRNN(opt.imgH, nc, nclass, opt.nh)
    crnn.apply(weights_init)

    if opt.cuda and not opt.uses_old_saving:
        crnn.cuda()
        crnn = torch.nn.DataParallel(crnn, device_ids=range(opt.ngpu))
        criterion = criterion.cuda()

    # FIX: start_epoch was only assigned inside the checkpoint-loading branch,
    # so training from scratch (opt.crnn == '') raised NameError at the epoch
    # loop below. Default to 0; a loaded checkpoint overwrites it.
    start_epoch = 0
    if opt.crnn != '':
        print('Loading pre-trained model from %s' % opt.crnn)
        loaded_model = torch.load(opt.crnn)
        if opt.uses_old_saving:
            print("Assuming model was saved in rudementary fashion")
            crnn.load_state_dict(loaded_model)
            crnn.cuda()
            crnn = torch.nn.DataParallel(crnn, device_ids=range(opt.ngpu))
            criterion = criterion.cuda()
            start_epoch = 0
        else:
            print("Loaded model accuracy: " + str(loaded_model['accuracy']))
            print("Loaded model epoch: " + str(loaded_model['epoch']))
            start_epoch = loaded_model['epoch']
            crnn.load_state_dict(loaded_model['state'])

    # Read this.
    loss_avg = utils.averager()

    # If following the paper's recommendation, using AdaDelta
    if opt.adam:
        optimizer = optim.Adam(crnn.parameters(),
                               lr=opt.lr,
                               betas=(opt.beta1, 0.999))
    elif opt.adadelta:
        optimizer = optim.Adadelta(crnn.parameters(), lr=opt.lr)
    elif opt.adagrad:
        print("Using adagrad")
        optimizer = optim.Adagrad(crnn.parameters(), lr=opt.lr)
    else:
        optimizer = optim.RMSprop(crnn.parameters(), lr=opt.lr)

    converter = utils.strLabelConverter(opt.alphabet)
    best_val_accuracy = 0

    for epoch in range(start_epoch, opt.niter):
        train_iter = iter(train_loader)
        i = 0
        while i < len(train_loader):
            # Re-enable grads (validation code elsewhere freezes them).
            for p in crnn.parameters():
                p.requires_grad = True
            crnn.train()

            cost = train_batch(crnn, criterion, optimizer, train_iter, opt,
                               converter)
            loss_avg.add(cost)
            i += 1

            if i % opt.displayInterval == 0:
                print('[%d/%d][%d/%d] Loss: %f' %
                      (epoch, opt.niter, i, len(train_loader),
                       loss_avg.val()) + " " + case)
                loss_avg.reset()

            if i % opt.valInterval == 0:
                # Best-effort validation/checkpointing; failures are logged,
                # not fatal, so training continues.
                try:
                    val_loss_avg, accuracy = val_batch(crnn, opt, test_dataset,
                                                       converter, criterion)
                    model_state = {
                        'epoch': epoch + 1,
                        'iter': i,
                        'state': crnn.state_dict(),
                        'accuracy': accuracy,
                        'val_loss_avg': val_loss_avg,
                    }
                    utils.save_checkpoint(
                        model_state, accuracy > best_val_accuracy,
                        '{0}/netCRNN_{1}_{2}_{3}.pth'.format(
                            opt.experiment, epoch, i, accuracy),
                        opt.experiment)
                    if accuracy > best_val_accuracy:
                        best_val_accuracy = accuracy
                except Exception as e:
                    print(e)
args.emb_size, args.theta_act, embeddings, args.train_embeddings, args.enc_drop).to(device) print('model: {}'.format(model)) if args.optimizer == 'adam': optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wdecay) elif args.optimizer == 'adagrad': optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wdecay) elif args.optimizer == 'adadelta': optimizer = optim.Adadelta(model.parameters(), lr=args.lr, weight_decay=args.wdecay) elif args.optimizer == 'rmsprop': optimizer = optim.RMSprop(model.parameters(), lr=args.lr, weight_decay=args.wdecay) elif args.optimizer == 'asgd': optimizer = optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay) else: print('Defaulting to vanilla SGD') optimizer = optim.SGD(model.parameters(), lr=args.lr)
params += list(model_tag.parameters()) if opt.task_sc: params += list(model_class.parameters()) params = list( filter(lambda p: p.requires_grad, params)) # must be list, otherwise clip_grad_norm_ will be invalid if opt.optim.lower() == 'sgd': optimizer = optim.SGD(params, lr=opt.lr) elif opt.optim.lower() == 'adam': optimizer = optim.Adam(params, lr=opt.lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=0) # (beta1, beta2) elif opt.optim.lower() == 'adadelta': optimizer = optim.Adadelta(params, rho=0.95, lr=1.0) elif opt.optim.lower() == 'rmsprop': optimizer = optim.RMSprop(params, lr=opt.lr) def decode(data_feats, data_tags, data_class, output_path): data_index = np.arange(len(data_feats)) losses = [] TP, FP, FN, TN = 0.0, 0.0, 0.0, 0.0 TP2, FP2, FN2, TN2 = 0.0, 0.0, 0.0, 0.0 with open(output_path, 'w') as f: for j in range(0, len(data_index), opt.test_batchSize): if opt.testing: words, tags, raw_tags, classes, raw_classes, lens, line_nums = data_reader.get_minibatch_with_class( data_feats, data_tags,
def main():
    """Build data pipelines, model, criterion and optimizer, then train.

    Uses module-level globals (net, loaders, criterion, optimizer, ...) so
    that the `train`/`validation` helpers defined elsewhere in this file can
    reach them. Resumes from a checkpoint when ``args.resume`` is set.
    """
    global net
    global trainloader
    global valloader
    global best_loss
    global log_file
    global optimizer
    global criterion

    # initialize
    start_epoch = 0
    best_loss = np.finfo(np.float32).max

    # augmentation
    random_rotate_func = lambda x: x.rotate(random.randint(-15, 15),
                                            resample=Image.BICUBIC)
    random_scale_func = lambda x: transforms.Scale(int(random.uniform(1.0, 1.4)\
                                  * max(x.size)))(x)
    gaus_blur_func = lambda x: x.filter(PIL.ImageFilter.GaussianBlur(radius=1))
    median_blur_func = lambda x: x.filter(PIL.ImageFilter.MedianFilter(size=3))

    # train preprocessing
    transform_train = transforms.Compose([
        transforms.Lambda(lambd=random_rotate_func),
        transforms.CenterCrop(224),
        transforms.Scale((112, 112)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=MEAN, std=STD),
    ])

    # validation preprocessing
    # NOTE(review): RandomHorizontalFlip in the *validation* transform makes
    # validation non-deterministic — confirm this is intended.
    transform_val = transforms.Compose([
        transforms.CenterCrop(224),
        transforms.Scale((112, 112)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=MEAN, std=STD)
    ])

    print('==> Preparing data..')
    trainset = ImageListDataset(root=args.root,
                                list_path=args.datalist,
                                split='train',
                                transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=8,
                                              pin_memory=True)
    valset = ImageListDataset(root=args.root,
                              list_path=args.datalist,
                              split='val',
                              transform=transform_val)
    valloader = torch.utils.data.DataLoader(valset,
                                            batch_size=args.batch_size,
                                            shuffle=False,
                                            num_workers=8,
                                            pin_memory=True)

    # Create model
    net = None
    if args.model_name == 'ResNet18':
        net = ResNet18()
    elif args.model_name == 'ResNet34':
        net = ResNet34()
    elif args.model_name == 'ResNet50':
        net = ResNet50()
    elif args.model_name == 'DenseNet':
        net = DenseNet121()
    elif args.model_name == 'VGG11':
        net = VGG('VGG11')
    elif args.model_name == 'ResNet152':
        net = ResNet152()
    # FIX: was `args / model_name` (a typo that raised TypeError instead of
    # selecting ResNet101).
    elif args.model_name == 'ResNet101':
        net = ResNet101()

    print('==> Building model..')

    if args.resume:
        # Load checkpoint
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(
            'checkpoint'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load('./checkpoint/{0}/best_model_chkpt.t7'.format(
            args.name))
        net.load_state_dict(checkpoint['net'])
        best_loss = checkpoint['loss']
        start_epoch = checkpoint['epoch'] + 1

    # Choosing of criterion
    if args.criterion == 'MSE':
        criterion = nn.MSELoss()
    else:
        criterion = None  # Add your criterion

    # Choosing of optimizer
    if args.optimizer == 'adam':
        optimizer = optim.Adam(net.parameters(), lr=args.lr)
    elif args.optimizer == 'adadelta':
        optimizer = optim.Adadelta(net.parameters(), lr=args.lr)
    else:
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              momentum=0.9,
                              weight_decay=5e-4)

    # Load on GPU
    if args.cuda:
        print('==> Using CUDA')
        print(torch.cuda.device_count())
        if torch.cuda.device_count() > 1:
            net = torch.nn.DataParallel(net).cuda()
        else:
            net = net.cuda()
        cudnn.benchmark = True
        print('==> model on GPU')
        criterion = criterion.cuda()
    else:
        print('==> model on CPU')

    if not os.path.isdir(args.log_dir_path):
        os.makedirs(args.log_dir_path)
    log_file_path = os.path.join(args.log_dir_path, args.name + '.log')

    # logger file openning
    log_file = open(log_file_path, 'w')
    log_file.write('type,epoch,batch,loss,acc\n')

    print('==> Model')
    print(net)

    try:
        for epoch in range(start_epoch, args.epochs):
            trainloader = torch.utils.data.DataLoader(
                trainset,
                batch_size=args.batch_size,
                shuffle=True,
                num_workers=8,
                pin_memory=True)
            train(epoch)
            validation(epoch)
            print('==> Best loss: {0:.5f}'.format(best_loss))
    except Exception as e:
        # FIX: `e.message` does not exist on Python 3 exceptions; str(e) is
        # the portable spelling.
        print(str(e))
        log_file.write(str(e))
    finally:
        log_file.close()
padPF1 = np.zeros((1, 5)) PF1 = np.vstack((padPF1, PF1)) PF2 = np.asarray(rng.uniform(low=-1, high=1, size=[101, 5])) padPF2 = np.zeros((1, 5)) PF2 = np.vstack((padPF2, PF2)) net = pcnn.textPCNN(parameterlist['max_sentence_word'], parameterlist['classes'], parameterlist['wordvector_dim'], parameterlist['PF_dim'], parameterlist['filter_size'], parameterlist['num_filter'], Wv, PF1, PF2) # criterion = nn.CrossEntropyLoss() # optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.001) optimizer = optim.Adadelta(net.parameters(), lr=1.0, rho=0.95, eps=1e-06, weight_decay=0) np.random.seed(1234) epoch_now = 0 batch_now = 0 print 'a epoch = %d batch' % ( int(len(train)) / int(parameterlist['batch_size']) + 1) for epoch in range(parameterlist['trainepoch']): print 'epoch = %d , start.. ' % epoch_now shuffled_data = [] shuffle_indices = np.random.permutation(np.arange(len(train))) for i in range(len(train)): shuffled_data.append(train[shuffle_indices[i]]) bag_now = 0
def train(data):
    """Train SeqModel for data.HP_iteration epochs with periodic dev/test eval.

    `data` bundles the datasets, hyper-parameters (HP_*) and the optimizer
    name; the best dev-scoring model is saved to ``data.model_dir + '.<epoch>.model'``.
    NOTE(review): uses the legacy ``loss.data[0]`` API (PyTorch <= 0.3) — on
    modern torch this raises; would need ``loss.item()``.
    """
    print("Training model...")
    data.show_data_summary()
    # Persist the processed dataset alongside the model for later decoding.
    save_data_name = data.model_dir + ".dset"
    data.save(save_data_name)
    model = SeqModel(data)
    loss_function = nn.NLLLoss()
    # Optimizer selected by name; unknown names abort the run.
    if data.optimizer.lower() == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=data.HP_lr,
                              momentum=data.HP_momentum,
                              weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=data.HP_lr,
                                   weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=data.HP_lr,
                               weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(0)
    best_dev = -10
    # data.HP_iteration = 1
    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" % (idx, data.HP_iteration))
        # NOTE(review): decay is gated on the exact string "SGD" while the
        # selection above lower-cases — confirm the intended casing.
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0  # running loss since the last progress print
        total_loss = 0   # loss accumulated over the whole epoch
        right_token = 0
        whole_token = 0
        # Fresh shuffle each epoch.
        random.shuffle(data.train_Ids)
        ## set model in train mode
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(
                instance, data.HP_gpu)
            instance_count += 1
            loss, tag_seq = model.neg_log_likelihood_loss(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, batch_label, mask)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            sample_loss += loss.data[0]
            total_loss += loss.data[0]
            # Progress print every 500 training instances.
            # NOTE(review): divides by whole_token — would raise
            # ZeroDivisionError if the first 500 instances are fully masked.
            if end % 500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print(
                    " Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"
                    % (end, temp_cost, sample_loss, right_token, whole_token,
                       (right_token + 0.) / whole_token))
                sys.stdout.flush()
                sample_loss = 0
            loss.backward()
            optimizer.step()
            model.zero_grad()
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" %
              (end, temp_cost, sample_loss, right_token, whole_token,
               (right_token + 0.) / whole_token))
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print(
            "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s"
            % (idx, epoch_cost, train_num / epoch_cost, total_loss))
        # continue
        # Dev evaluation: f-score when segmentation (data.seg), else accuracy.
        speed, acc, p, r, f, _, _ = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish
        if data.seg:
            current_score = f
            print(
                "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" %
                  (dev_cost, speed, acc))
        # Checkpoint only when the dev score improves.
        if current_score > best_dev:
            if data.seg:
                print("Exceed previous best f score:", best_dev)
            else:
                print("Exceed previous best acc score:", best_dev)
            model_name = data.model_dir + '.' + str(idx) + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
        # ## decode test
        speed, acc, p, r, f, _, _ = evaluate(data, model, "test")
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print(
                "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (test_cost, speed, acc, p, r, f))
        else:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" %
                  (test_cost, speed, acc))
        gc.collect()
def train(config_path, experiment_info, thread_queue):
    """MedQA training entry point.

    Reads the YAML/INI-style config at `config_path`, builds the chosen model,
    optimizer and criteria, optionally restores saved weights, then trains for
    the configured number of epochs, evaluating on dev each epoch and saving
    the best checkpoint. Metrics are logged to TensorBoard under
    `experiment_info`. `thread_queue` is passed through to `train_on_model`
    (its semantics live there — verify in that function).
    """
    logger.info('------------MedQA v1.0 Train--------------')
    logger.info(
        '============================loading config file... print config file ========================='
    )
    global_config = read_config(config_path)
    logger.info(open(config_path).read())
    logger.info(
        '^^^^^^^^^^^^^^^^^^^^^^ config file info above ^^^^^^^^^^^^^^^^^^^^^^^^^'
    )

    # set random seed
    seed = global_config['global']['random_seed']
    torch.manual_seed(seed)

    # Shared with other module-level functions (evaluation/testing helpers).
    global gpu_nums, init_embedding_weight, batch_test_data, tensorboard_writer, test_epoch, embedding_layer_name
    test_epoch = 0

    enable_cuda = global_config['train']['enable_cuda']
    device = torch.device("cuda" if enable_cuda else "cpu")
    if torch.cuda.is_available() and not enable_cuda:
        logger.warning("CUDA is avaliable, you can enable CUDA in config file")
    elif not torch.cuda.is_available() and enable_cuda:
        raise ValueError(
            "CUDA is not abaliable, please unable CUDA in config file")

    ############################### load the dataset ############################
    logger.info('reading MedQA h5file dataset...')
    dataset = MedQADataset(global_config)

    logger.info('constructing model...')
    model_choose = global_config['global']['model']
    dataset_h5_path = global_config['data']['dataset_h5']
    logger.info('Using dataset path is : %s' % dataset_h5_path)
    logger.info('### Using model is: %s ###' % model_choose)
    # Model selected by config name; unknown names abort.
    if model_choose == 'SeaReader':
        model = SeaReader(dataset_h5_path, device)
    elif model_choose == 'SimpleSeaReader':
        model = SimpleSeaReader(dataset_h5_path, device)
    elif model_choose == 'TestModel':
        model = TestModel(dataset_h5_path, device)
    elif model_choose == 'cnn_model':
        model = cnn_model(dataset_h5_path, device)
    elif model_choose == 'match-lstm+':
        model = MatchLSTMPlus(dataset_h5_path)
    elif model_choose == 'r-net':
        model = RNet(dataset_h5_path)
    else:
        raise ValueError('model "%s" in config file not recognized' %
                         model_choose)

    print_network(model)

    gpu_nums = torch.cuda.device_count()
    logger.info('dataParallel using %d GPU.....' % gpu_nums)
    if gpu_nums > 1:
        model = torch.nn.DataParallel(model)
    model = model.to(device)

    # weights_init(model)

    # Locate the embedding weight tensor by name; the default assumes the
    # DataParallel ('module.'-prefixed) layout and is overridden by the scan.
    embedding_layer_name = 'module.embedding.embedding_layer.weight'
    for name in model.state_dict().keys():
        if 'embedding_layer.weight' in name:
            embedding_layer_name = name
            break
    # Snapshot of the initial embedding, used later for the L21 regularizer.
    init_embedding_weight = model.state_dict()[embedding_layer_name].clone()

    # Class-weighted CE (weights [0.2, 0.8] — presumably to counter class
    # imbalance; verify against the dataset) plus gate and embedding losses.
    task_criterion = CrossEntropyLoss(
        weight=torch.tensor([0.2, 0.8]).to(device)).to(device)
    gate_criterion = gate_Loss().to(device)
    embedding_criterion = Embedding_reg_L21_Loss(c=0.01).to(device)
    all_criterion = [task_criterion, gate_criterion, embedding_criterion]

    # optimizer
    optimizer_choose = global_config['train']['optimizer']
    optimizer_lr = global_config['train']['learning_rate']
    optimizer_eps = float(global_config['train']['eps'])
    optimizer_param = filter(lambda p: p.requires_grad, model.parameters())
    # NOTE(review): adamax/adadelta branches ignore the configured lr/eps and
    # use library defaults — confirm that is intended.
    if optimizer_choose == 'adamax':
        optimizer = optim.Adamax(optimizer_param)
    elif optimizer_choose == 'adadelta':
        optimizer = optim.Adadelta(optimizer_param)
    elif optimizer_choose == 'adam':
        optimizer = optim.Adam(optimizer_param,
                               lr=optimizer_lr,
                               eps=optimizer_eps)
    elif optimizer_choose == 'sgd':
        optimizer = optim.SGD(optimizer_param, lr=optimizer_lr)
    else:
        raise ValueError('optimizer "%s" in config file not recoginized' %
                         optimizer_choose)

    # Halve-ish LR (factor 0.2) when the monitored loss plateaus.
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode='min',
                                  factor=0.2,
                                  patience=5,
                                  verbose=True)

    # check if exist model weight
    weight_path = global_config['data']['model_path']
    if os.path.exists(weight_path) and global_config['train']['continue']:
        logger.info('loading existing weight............')
        if enable_cuda:
            weight = torch.load(
                weight_path, map_location=lambda storage, loc: storage.cuda())
        else:
            weight = torch.load(weight_path,
                                map_location=lambda storage, loc: storage)
        # weight = pop_dict_keys(weight, ['pointer', 'init_ptr_hidden'])  # partial initial weight
        # todo: later versions may not need this
        if not global_config['train']['keep_embedding']:
            # Drop the embedding parameters to avoid size-mismatch errors
            # when the saved vocab differs from the current one.
            del weight[
                'module.embedding.embedding_layer.weight']
            # # also delete the decision (fully-connected) layer parameters:
            # decision_layer_names=[]
            # for name,w in weight.items():
            #     if 'decision_layer' in name:
            #         decision_layer_names.append(name)
            # for name in decision_layer_names:
            #     del weight[name]
        model.load_state_dict(weight, strict=False)

    # training arguments
    logger.info('start training............................................')
    train_batch_size = global_config['train']['batch_size']
    valid_batch_size = global_config['train']['valid_batch_size']
    test_batch_size = global_config['train']['test_batch_size']

    batch_train_data = dataset.get_dataloader_train(train_batch_size,
                                                    shuffle=False)
    batch_dev_data = dataset.get_dataloader_dev(valid_batch_size,
                                                shuffle=False)
    batch_test_data = dataset.get_dataloader_test(test_batch_size,
                                                  shuffle=False)

    clip_grad_max = global_config['train']['clip_grad_norm']
    enable_char = False

    # tensorboardX writer
    save_cur_experiment_code_path = "savedcodes/" + experiment_info
    save_current_codes(save_cur_experiment_code_path, global_config)
    tensorboard_writer = SummaryWriter(
        log_dir=os.path.join('tensorboard_logdir', experiment_info))

    best_valid_acc = None
    # every epoch
    for epoch in range(global_config['train']['epoch']):
        # train
        model.train()  # set training = True, make sure right dropout
        train_avg_loss, train_avg_binary_acc = train_on_model(
            model=model,
            criterion=all_criterion,
            optimizer=optimizer,
            batch_data=batch_train_data,
            epoch=epoch,
            clip_grad_max=clip_grad_max,
            device=device,
            thread_queue=thread_queue)

        # evaluate
        with torch.no_grad():
            model.eval()  # let training = False, make sure right dropout
            val_avg_loss, val_avg_binary_acc, val_avg_problem_acc = eval_on_model(
                model=model,
                criterion=all_criterion,
                batch_data=batch_dev_data,
                epoch=epoch,
                device=device,
                init_embedding_weight=init_embedding_weight,
                eval_dataset='dev')
            # test_avg_loss, test_avg_binary_acc, test_avg_problem_acc=eval_on_model(model=model,
            #                                      criterion=all_criterion,
            #                                      batch_data=batch_test_data,
            #                                      epoch=epoch,
            #                                      device=device,
            #                                      enable_char=enable_char,
            #                                      batch_char_func=dataset.gen_batch_with_char,
            #                                      init_embedding_weight=init_embedding_weight)

        # save model when best f1 score
        if best_valid_acc is None or val_avg_problem_acc > best_valid_acc:
            epoch_info = 'epoch=%d, val_binary_acc=%.4f, val_problem_acc=%.4f' % (
                epoch, val_avg_binary_acc, val_avg_problem_acc)
            save_model(
                model,
                epoch_info=epoch_info,
                model_weight_path=global_config['data']['model_weight_dir'] +
                experiment_info + "_model_weight.pt",
                checkpoint_path=global_config['data']['checkpoint_path'] +
                experiment_info + "_save.log")
            logger.info("========= saving model weight on epoch=%d =======" %
                        epoch)
            best_valid_acc = val_avg_problem_acc

        tensorboard_writer.add_scalar("train/lr",
                                      optimizer.param_groups[0]['lr'], epoch)
        tensorboard_writer.add_scalar("train/avg_loss", train_avg_loss, epoch)
        tensorboard_writer.add_scalar("train/binary_acc",
                                      train_avg_binary_acc, epoch)
        tensorboard_writer.add_scalar("val/avg_loss", val_avg_loss, epoch)
        tensorboard_writer.add_scalar("val/binary_acc", val_avg_binary_acc,
                                      epoch)
        tensorboard_writer.add_scalar("val/problem_acc", val_avg_problem_acc,
                                      epoch)

        # adjust learning rate (monitors the *train* loss, not val loss)
        scheduler.step(train_avg_loss)

    logger.info('finished.................................')
    tensorboard_writer.close()
TEXT, LABEL, filter_pred=lambda ex: ex.label != 'neutral') TEXT.build_vocab(train) LABEL.build_vocab(train) train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits( (train, val, test), batch_size=50, device=-1, repeat=False) # Build the vocabulary with word embeddings url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec' TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url)) net = CNN(model='multichannel', vocab_size=len(TEXT.vocab), class_number=2) criterion = nn.CrossEntropyLoss() parameters = filter(lambda p: p.requires_grad, net.parameters()) optimizer = optim.Adadelta(parameters, lr=0.5) for epoch in range(50): total_loss = 0 for batch in train_iter: text, label = batch.text.t_(), batch.label label = label - 1 net.zero_grad() logit = net(text) loss = criterion(logit, label) loss.backward() nn.utils.clip_grad_norm(parameters, max_norm=3) optimizer.step() total_loss += loss.data
def train(config):
    """Train the QA model (Model/ModelTRNN), optionally guided by a pretrained
    attention model loaded from ``../pretrain``.

    Loads embeddings and eval files, builds train/dev iterators, trains with
    periodic dev evaluation, saves the best-F1 checkpoint, and halves the LR
    when dev F1 stalls, stopping once LR drops below 1% of the initial value.

    NOTE(review): written against legacy PyTorch (Variable / .volatile /
    loss.data[0]); `criterion` is not defined in this function — presumably a
    module-level global, verify before running.
    """
    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    # with open(config.train_eval_file, "r") as fh:
    #     train_eval_file = json.load(fh)
    with open(config.dev_eval_file, "r") as fh:
        dev_eval_file = json.load(fh)
    with open(config.idx2word_file, 'r') as fh:
        idx2word_dict = json.load(fh)

    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    # Build the experiment directory name from the hyper-parameters so runs
    # are self-describing.
    if config.pre_att_id != '':
        config.save = 'T16-v2-{}-kp0{}-cond{}-ori{}-attcnt{}-gatefuse{}-lr{}-opt{}'.format(config.pre_att_id, config.keep_prob0, int(config.condition), int(config.original_ptr), config.att_cnt, config.gate_fuse, config.init_lr, config.optim)
        if config.use_elmo:
            config.save += "_ELMO"
        if config.train_emb:
            raise ValueError
            # NOTE(review): unreachable — the raise above always fires when
            # train_emb is set.
            config.save += "_TE"
        if config.trnn:
            config.save += '_TRNN'
    else:
        config.save = 'baseline-{}'.format(time.strftime("%Y%m%d-%H%M%S"))
        if config.use_elmo:
            config.save += "_ELMO"
    if config.uniform_graph:
        config.save += '_UNIFORM'
    # non overwriting
    # if os.path.exists(config.save):
    #     sys.exit(1)
    create_exp_dir(config.save, scripts_to_save=['run.py', 'model.py', 'util.py', 'main.py'])

    def logging(s, print_=True, log_=True):
        # Console + append-to-file logger local to this run's directory.
        if print_:
            print(s)
        if log_:
            with open(os.path.join(config.save, 'log.txt'), 'a+') as f_log:
                f_log.write(s + '\n')

    # Optionally load a frozen pretrained attention model + its vocab.
    if config.pre_att_id != '':
        sys.path.insert(0, '../pretrain')
        from data import Vocab
        vocab = Vocab('../pretrain/vocabv2.pkl', 100000, '<unk>')
        # from model8 import StructurePredictor
        # model = StructurePredictor(512, len(vocab), 1, 1, 0.0)
        # model.load_state_dict(torch.load('../skip_thought/{}/st_predictor.pt'.format(config.pre_att_id)))
        model = torch.load('../pretrain/{}/model.pt'.format(config.pre_att_id))
        # if 'gru' in config.pre_att_id:
        #     model.set_gru(True)
        # elif 'add' in config.pre_att_id:
        #     model.set_gru(False)
        # else:
        #     assert False
        model.cuda()
        ori_model = model
        model = nn.DataParallel(model)
        model.eval()
        import re
        # Infer the pretrained model's layer count from its id (e.g. 'ly3'),
        # falling back to inspecting the model itself.
        try:
            nly = int(re.search(r'ly(\d+)', config.pre_att_id).group(1))
        except:
            nly = len(ori_model.enc_net.nets)
        if config.gate_fuse < 3:
            config.num_mixt = nly * 8
        else:
            config.num_mixt = (nly + nly - 1) * 8
        del sys.path[0]
        pre_att_data = {'model': model, 'vocab': vocab}
    else:
        pre_att_data = None

    logging('Config')
    for k, v in config.__dict__.items():
        logging(' - {} : {}'.format(k, v))

    # Optional pre-computed ELMo embedder.
    if config.use_elmo and config.load_elmo:
        ee = torch.load(config.elmo_ee_file)
    else:
        ee = None

    logging("Building model...")
    train_buckets = get_buckets(config.train_record_file, config, limit=True)
    dev_buckets = get_buckets(config.dev_record_file, config, limit=False)

    def build_train_iterator():
        # Fresh shuffled iterator per epoch.
        return DataIterator(train_buckets, config.batch_size, config.para_limit, config.ques_limit, config.char_limit, True, pre_att_data, config, ee, idx2word_dict, 'train')

    def build_dev_iterator():
        return DataIterator(dev_buckets, config.batch_size, config.para_limit, config.ques_limit, config.char_limit, False, pre_att_data, config, ee, idx2word_dict, 'dev')

    model = Model(config, word_mat, char_mat) if not config.trnn else ModelTRNN(config, word_mat, char_mat)
    # logging('nparams {}'.format(sum([p.nelement() for p in model.parameters() if p.requires_grad])))
    # Keep a handle on the underlying module for saving/inspection; the
    # DataParallel wrapper is what gets called.
    ori_model = model.cuda()
    # ori_model.word_emb.cpu()
    # model = ori_model
    model = nn.DataParallel(ori_model)

    lr = config.init_lr
    # optimizer = optim.SGD(model.parameters(), lr=config.init_lr, momentum=config.momentum)
    # NOTE(review): no else-branch — an unrecognized config.optim leaves
    # `optimizer` undefined and fails below.
    if config.optim == "adadelta":  # default
        optimizer = optim.Adadelta(filter(lambda p: p.requires_grad, model.parameters()), lr=config.init_lr, rho=0.95)
    elif config.optim == "sgd":
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=config.init_lr, momentum=config.momentum)
    elif config.optim == "adam":
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=config.init_lr, betas=(config.momentum, 0.999))

    cur_patience = 0
    total_loss = 0
    global_step = 0
    best_dev_F1 = None
    stop_train = False
    start_time = time.time()
    eval_start_time = time.time()
    model.train()

    for epoch in range(10000 * 32 // config.batch_size):
        for data in build_train_iterator():
            context_idxs = Variable(data['context_idxs'])
            ques_idxs = Variable(data['ques_idxs'])
            context_char_idxs = Variable(data['context_char_idxs'])
            ques_char_idxs = Variable(data['ques_char_idxs'])
            context_lens = Variable(data['context_lens'])
            y1 = Variable(data['y1'])
            y2 = Variable(data['y2'])
            graph = data['graph']
            graph_q = data['graph_q']
            # Precomputed attention graphs are inputs, not trainables.
            if graph is not None:
                graph.volatile = False
                graph.requires_grad = False
                graph_q.volatile = False
                graph_q.requires_grad = False
            elmo, elmo_q = data['elmo'], data['elmo_q']
            if elmo is not None:
                elmo.volatile = False
                elmo.requires_grad = False
                elmo_q.volatile = False
                elmo_q.requires_grad = False

            logit1, logit2 = model(context_idxs, ques_idxs, context_char_idxs, ques_char_idxs, context_lens, pre_att=graph, pre_att_q=graph_q, elmo=elmo, elmo_q=elmo_q)
            # Joint loss over answer start (y1) and end (y2) predictions.
            loss = criterion(logit1, y1) + criterion(logit2, y2)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Per-step collection — presumably to curb memory growth from the
            # graph inputs; costly, verify it is still needed.
            import gc; gc.collect()

            total_loss += loss.data[0]
            global_step += 1

            # Periodic train-loss report.
            if global_step % config.period == 0:
                cur_loss = total_loss / config.period
                elapsed = time.time() - start_time
                logging('| epoch {:3d} | step {:6d} | lr {:05.5f} | ms/batch {:5.2f} | train loss {:8.3f}'.format(epoch, global_step, lr, elapsed*1000/config.period, cur_loss))
                total_loss = 0
                start_time = time.time()

            # Periodic dev evaluation + checkpoint/patience logic.
            if global_step % (config.checkpoint * 32 // config.batch_size) == 0:
                model.eval()
                metrics = evaluate_batch(build_dev_iterator(), model, 0, dev_eval_file)
                model.train()

                logging('-' * 89)
                logging('| eval {:6d} in epoch {:3d} | time: {:5.2f}s | dev loss {:8.3f} | EM {:.4f} | F1 {:.4f}'.format(global_step//config.checkpoint, epoch, time.time()-eval_start_time, metrics['loss'], metrics['exact_match'], metrics['f1']))
                debug_s = ''
                if hasattr(ori_model, 'scales'):
                    debug_s += '| scales {} '.format(ori_model.scales.data.cpu().numpy().tolist())
                # if hasattr(ori_model, 'mixt_logits') and (not hasattr(ori_model, 'condition') or not ori_model.condition):
                #     debug_s += '| mixt {}'.format(F.softmax(ori_model.mixt_logits, dim=-1).data.cpu().numpy().tolist())
                if debug_s != '':
                    logging(debug_s)
                logging('-' * 89)

                eval_start_time = time.time()

                dev_F1 = metrics['f1']
                if best_dev_F1 is None or dev_F1 > best_dev_F1:
                    best_dev_F1 = dev_F1
                    # Save the unwrapped module's weights.
                    torch.save(ori_model.state_dict(), os.path.join(config.save, 'model.pt'))
                    cur_patience = 0
                else:
                    cur_patience += 1
                    if cur_patience >= config.patience:
                        # Halve LR; stop once it falls below 1% of init_lr.
                        lr /= 2.0
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr
                        if lr < config.init_lr * 1e-2:
                            stop_train = True
                            break
                        cur_patience = 0
        if stop_train:
            break
    logging('best_dev_F1 {}'.format(best_dev_F1))