def train():
    args = get_train_args()
    current_device = torch.device("cuda" if args.with_cuda else "cpu")
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    if args.use_logging:
        log_filename = str(dt.datetime.now(tz=TIMEZONE))
        logging.basicConfig(level=logging.INFO, format='%(message)s')
        logger = logging.getLogger()
        logger.addHandler(logging.FileHandler(log_filename + '.log', 'a'))
        print = logger.info
    model = pretrained_model(args.pretrained_model, args.num_epochs,
                             args.batch_size, args.max_sentence_length,
                             current_device, args.save_dir, args.patience)
    train_dataset = model.tokenize_data('./train_lyrics.jsonl', args.max_sentence_length)
    valid_dataset = model.tokenize_data('./valid_lyrics.jsonl', args.max_sentence_length, 'valid')
    optimizer = AdamW(model.model.parameters(), lr=args.learning_rate,
                      eps=args.eps, weight_decay=args.weight_decay)
    model.train(train_dataset, valid_dataset, optimizer, args.step_size, args.gamma)
def main():
    global logs_path
    args = get_train_args()
    train_id = args.train_id
    num_processes = args.num_processes
    num_timesteps = args.timesteps
    game = args.game
    level = args.level
    model_save_path = args.save_dir + train_id + ".pkl"
    logs_path = os.path.join(
        args.logs_dir, check_subfolder_availability(args.logs_dir, train_id))
    is_joint = args.joint
    load_model_path = args.load_model
    algo_name = args.algo
    policy_name = args.policy

    print("\n\n===============================================================")
    print("Num processes:\t\t", num_processes)
    print("Train timesteps:\t", num_timesteps)
    print("Model save path:\t", model_save_path)
    print("Logs path:\t\t", logs_path)
    if not is_joint:
        print("Game:\t\t\t", game)
        print("Level:\t\t\t", level)
    else:
        print("Joint Training")
    if load_model_path:
        print("Loading model:\t\t", load_model_path)
    else:
        print("Creating new model")
    print("===============================================================\n\n")

    train(
        train_id=train_id,
        game=game,
        level=level,
        num_processes=num_processes,
        num_timesteps=num_timesteps,
        algo_name=algo_name,
        policy_name=policy_name,
        is_joint=is_joint,
        model_save_path=model_save_path,
        load_model_path=load_model_path,
        logs_path=logs_path,
        hyper_opt=args.hyper_opt,
        short_life=args.short_life,
    )
            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()
    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)
    return results, pred_dict


if __name__ == '__main__':
    args = get_train_args()
    if args.model_name == 'BiDAF':
        print('BiDAF Model')
        main(args)
    else:
        print('QANet Model')
        train_QaNet(args)
                                 save_best_only=True, mode='auto')
    return [checkpoint]


def build_model(args):
    if args.model == "VGG19":
        model = VGG19(num_classes=7, input_shape=(48, 48, 3), dropout=0.5)
    else:
        model = build_resnet(args.model, input_shape=(48, 48, 3), classes=7)
    return model


if __name__ == "__main__":
    args_ = get_train_args()

    # Load data
    X_train, y_train, X_dev, y_dev, X_test, y_test, weight = data_loader(args_)

    # Optimizer, image augmentation, class weights and checkpoint callback
    sgd = optimizer(args_)
    train_generator = train_generator()
    class_weights = class_weights(weight)
    checkpoint = callback(args_)

    model = build_model(args_)
    model.compile(optimizer=sgd,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit_generator(train_generator.flow(X_train, y_train, args_.batch_size),
def main():
    opts = get_train_args()
    print("load data ...")
    data = ELMoDataset(opts)
    dataloader = DataLoader(data, shuffle=True, batch_size=opts.batch_size)
    valid_data = ELMoDataset(opts, split='validation')
    validloader = DataLoader(valid_data, shuffle=True, batch_size=opts.batch_size)

    print("load model ...")
    model = ELMo(opts, [data.word_vocab_size, data.char_size])
    optimizer = optim.Adam(model.parameters(), lr=opts.learning_rate)
    early_stopping = EarlyStopping(5, 0.0)
    if opts.multi:
        model = torch.nn.DataParallel(model)
    if opts.resume:
        print("resume training")
        model.load_state_dict(torch.load('model.pt'))
    model.cuda()
    loss = torch.nn.CrossEntropyLoss(ignore_index=0)
    train_batch_num = math.ceil(data.data_size / opts.batch_size)
    valid_batch_num = math.ceil(valid_data.data_size / opts.batch_size)

    print("start training")
    for epoch in range(1, opts.epochs + 1):
        print("epoch : " + str(epoch))
        model.train()
        epoch_start = time.time()
        epoch_loss = 0
        tot = 0
        for i, batch_data in enumerate(dataloader):
            optimizer.zero_grad()
            word_idx, char_idx = batch_data
            pred = model(word_idx, char_idx)
            train_loss = loss(pred, word_idx[:, 1:].reshape(-1))
            train_loss.backward()
            optimizer.step()
            batch_loss = train_loss.item()
            tot += word_idx.size(0)
            print('\r{:>10} epoch {} progress {} loss: {} perplexity : {}\n'.format(
                '', epoch, tot / len(data), batch_loss, 2 ** batch_loss), end='')
            epoch_loss += batch_loss
        end = time.time()
        time_used = end - epoch_start
        print('one epoch time: {} minutes'.format(time_used / 60))
        print('{} epochs'.format(epoch))
        print('epoch {} loss : {} perplexity : {}'.format(
            epoch, epoch_loss / train_batch_num, 2 ** (epoch_loss / train_batch_num)))

        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for i, batch_data in enumerate(validloader):
                word_idx, char_idx = batch_data
                pred = model(word_idx, char_idx)
                batch_loss = loss(pred, word_idx[:, 1:].reshape(-1))
                valid_loss += batch_loss.item()
        print('valid loss : {} perplexity : {}'.format(
            valid_loss / valid_batch_num, 2 ** (valid_loss / valid_batch_num)))

        with open('log.txt', 'a') as f:
            f.write(str(epoch) + ' epoch :' + str(epoch_loss / train_batch_num) +
                    ' ' + str(2 ** (epoch_loss / train_batch_num)) + '\n')
            f.write(str(epoch) + ' valid :' + str(valid_loss / valid_batch_num) +
                    ' ' + str(2 ** (valid_loss / valid_batch_num)) + '\n')

        # check early stopping
        if early_stopping(valid_loss):
            print("[Training is early stopped in %d Epoch.]" % epoch)
            if not os.path.exists(opts.model_path):
                os.mkdir(opts.model_path)
            state_dict = model.state_dict()
            torch.save(state_dict, os.path.abspath(opts.model_path + '/model.pt'))
            print("[Saved the trained model successfully.]")
            break

        if epoch % opts.save_step == 0:
            print("save model...")
            torch.save(model.state_dict(), 'model.pt')
def train(): """ Main script for training. """ args, train_config = get_train_args() num_classes = args.num_classes # Communicator and Context from nnabla.ext_utils import get_extension_context extension_module = "cudnn" # TODO: Hard coded!!! ctx = get_extension_context(extension_module, device_id=args.device_id, type_config=args.type_config) comm = CommunicatorWrapper(ctx) nn.set_default_context(comm.ctx) # To utilize TensorCore in FP16 channels = 4 if args.type_config == 'half' else 3 from nnabla_ext.cuda import StreamEventHandler stream_event_handler = StreamEventHandler(int(comm.ctx.device_id)) # Create data iterater data, vdata = get_data_iterators(args, comm, channels) # Create mixup object mixup = create_mixup_or_none(train_config.mixup, num_classes, comm) # Network for training t_model = get_model(args, num_classes, test=False, channel_last=args.channel_last, mixup=mixup, channels=channels, label_smoothing=train_config.label_smoothing, ctx_for_loss=comm.ctx_float) # Network for validation v_model = get_model(args, num_classes, test=True, channel_last=args.channel_last, channels=channels) # Solver # lr will be set later solver = MomentumNoWeightDecayBn(1, train_config.momentum) solver.set_parameters(nn.get_parameters()) # Learning rate scheduler learning_rate_scheduler = create_learning_rate_scheduler(train_config) # Monitors monitor = None if comm.rank == 0: if not os.path.isdir(args.monitor_path): os.makedirs(args.monitor_path) monitor = M.Monitor(args.monitor_path) # Epoch runner loss_scaling = train_config.loss_scaling if args.type_config == 'half' else 1 train_epoch = EpochTrainer(t_model, solver, learning_rate_scheduler, data, comm, monitor, loss_scaling, train_config.weight_decay, stream_event_handler, mixup) val_epoch = None if args.val_interval > 0: val_epoch = EpochValidator(v_model, vdata, comm, monitor, stream_event_handler) # Epoch loop for epoch in range(train_config.epochs): # Save parameters if epoch > 0 and epoch % ( args.model_save_interval) == 0 and comm.rank == 0: nn.save_parameters( os.path.join(args.monitor_path, 'param_%03d.h5' % epoch)) # Run validation for examples in an epoch if val_epoch is not None \ and epoch > 0 \ and epoch % args.val_interval == 0: val_epoch.run(epoch) # Run training for examples in an epoch train_epoch.run(epoch) # Run final validation if val_epoch is not None: val_epoch.run(train_config.epochs) # Save the final model. if comm.rank == 0: nn.save_parameters( os.path.join(args.monitor_path, 'param_%03d.h5' % (train_config.epochs)))
def main():
    opts = get_train_args()
    print("load data ...")
    train_data = datasets.ImageFolder(
        root="data/train",
        transform=transforms.Compose([
            transforms.Resize((256, 256)),   # resize to 256 x 256
            # transforms.CenterCrop(256),    # then crop to a square
            transforms.ToTensor(),           # convert to a tensor (auto-normalized to 0~1)
            transforms.Normalize(
                (0.5, 0.5, 0.5),             # normalize to the range -1 ~ 1,
                (0.5, 0.5, 0.5)),            # since the transform is (c - m) / s
        ]))
    test_data = datasets.ImageFolder(
        root="data/test",
        transform=transforms.Compose([
            transforms.Resize((256, 256)),
            # transforms.CenterCrop(256),
            transforms.ToTensor(),
            transforms.Normalize(
                (0.5, 0.5, 0.5),
                (0.5, 0.5, 0.5)),
        ]))
    test_loader = DataLoader(test_data,
                             batch_size=opts.batch_size,
                             shuffle=False,
                             num_workers=opts.num_processes)
    classes = train_data.classes
    print(classes)

    print("load model ...")
    if opts.model == 'resnet':
        model = models.resnet50(progress=True)
        model.load_state_dict(torch.load('resnet_model.pt'))
    elif opts.model == 'vggnet':
        model = models.vgg13_bn(progress=True)
        model.load_state_dict(torch.load('vggnet_model.pt'))
    elif opts.model == 'googlenet':
        model = models.googlenet(progress=True)
        model.load_state_dict(torch.load('googlenet_model.pt'))
    elif opts.model == 'densenet':
        model = models.densenet121(progress=True)
        model.load_state_dict(torch.load('densenet_model.pt'))
    else:
        model = models.resnext50_32x4d(progress=True)
        model.load_state_dict(torch.load('resnext_model.pt'))
    print(opts.model)
    model.cuda()
    model.eval()  # switch to evaluation mode before inference

    print("start inference")
    idx = 0
    with torch.no_grad():
        with open(opts.model + '_result.txt', 'a') as f:
            for i, (inputs, labels) in enumerate(test_loader):
                inputs = inputs.cuda()
                outputs = model(inputs)
                _, predicted = outputs.max(1)
                for j, meta in enumerate(predicted):
                    predicted_class = classes[meta]
                    plant_class = predicted_class.split('_')[0]
                    disease_class = predicted_class.split('_')[1]
                    f.write(
                        str(test_loader.dataset.samples[idx][0].split('/')[-1].split('.')[0]) +
                        '\t' + plant_class + '\t' + disease_class + '\n')
                    idx += 1
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)
            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()
    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)
    return results, pred_dict


if __name__ == '__main__':
    parser = argparse.ArgumentParser('Test a trained model on SQuAD')
    parser.add_argument('--use_adv',
                        default="no",
                        help='Whether or not to test/train on adversarial dataset.')
    args, unknown = parser.parse_known_args()
    use_adv = (args.use_adv == 'yes')
    main(get_train_args(use_adv))
def main():
    args, log = get_train_args()
    log.info('[Program starts. Loading data...]')
    train, dev, dev_y, embedding, opt = load_data(vars(args))
    device, args.gpu_ids = util.get_available_devices()
    log.info('[Data loaded.]')
    if args.save_dawn_logs:
        dawn_start = datetime.now()
        log.info('dawn_entry: epoch\tf1Score\thours')

    model = DRQA(opt, embedding=embedding)
    model = nn.DataParallel(model, args.gpu_ids)
    model = model.to(device)
    epoch_0 = 0
    best_val_score = 0.0
    # ema = util.EMA(model, args.ema_decay)

    # Get optimizer and scheduler
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adamax(parameters, weight_decay=opt['weight_decay'])
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)
    train_loss = AverageMeter()

    for epoch in range(epoch_0, epoch_0 + args.epochs):
        log.warning('Epoch {}'.format(epoch))

        # Train
        batches = BatchGen(train, batch_size=args.batch_size, gpu=args.cuda)
        start = datetime.now()
        updates = 0
        model.train()
        for i, batch in enumerate(batches):
            # Transfer to GPU
            inputs = [e.to(device) for e in batch[:7]]
            target_s = batch[7].to(device)
            target_e = batch[8].to(device)
            optimizer.zero_grad()

            # Forward
            score_s, score_e = model(*inputs)
            loss = F.nll_loss(score_s, target_s) + F.nll_loss(score_e, target_e)
            train_loss.update(loss.item())

            # Backward and clip gradients
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), opt['grad_clipping'])
            optimizer.step()
            updates += 1

            if i % args.log_per_updates == 0:
                log.info('> epoch [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'.format(
                    epoch, updates, train_loss.value,
                    str((datetime.now() - start) / (i + 1) * (len(batches) - i - 1)).split('.')[0]))
        log.debug('\n')

        # Eval
        batches = BatchGen(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
        predictions = []
        for i, batch in enumerate(batches):
            model.eval()
            inputs = [e.to(device) for e in batch[:7]]

            # Run forward
            with torch.no_grad():
                score_s, score_e = model(*inputs)

            # Get argmax text spans
            text = batch[-2]
            spans = batch[-1]
            pred = []
            max_len = opt['max_len'] or score_s.size(1)
            for j in range(score_s.size(0)):
                scores = torch.ger(score_s[j], score_e[j])
                scores.triu_().tril_(max_len - 1)
                scores = scores.cpu().clone().numpy()
                s_idx, e_idx = np.unravel_index(np.argmax(scores), scores.shape)
                s_offset, e_offset = spans[j][s_idx][0], spans[j][e_idx][1]
                pred.append(text[j][s_offset:e_offset])
            predictions.extend(pred)
            log.debug('> evaluating [{}/{}]'.format(i, len(batches)))
        em, f1 = score(predictions, dev_y)
        log.warning("dev EM: {} F1: {}".format(em, f1))
        if args.save_dawn_logs:
            time_diff = datetime.now() - dawn_start
            log.warning("dawn_entry: {}\t{}\t{}".format(
                epoch, f1 / 100.0, float(time_diff.total_seconds() / 3600.0)))

        # Save
        if not args.save_last_only or epoch == epoch_0 + args.epochs - 1:
            model_file = os.path.join(args.model_dir, 'checkpoint_epoch_{}.pt'.format(epoch))
            params = {
                'state_dict': {
                    'network': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'updates': updates,
                    'loss': train_loss.state_dict()
                },
                'config': opt,
                'epoch': epoch,
                'em': em,
                'f1': f1,
                'best_eval': best_val_score,
                'random_state': random.getstate(),
                'torch_state': torch.random.get_rng_state(),
                # 'torch_cuda_state': torch.cuda.get_rng_state()
            }
            torch.save(params, model_file)
            log.info('model saved to {}'.format(model_file))
            if f1 > best_val_score:
                best_val_score = f1
                copyfile(model_file, os.path.join(args.model_dir, 'best_model.pt'))
                log.info('[new best model saved.]')
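# Hedged illustration (not part of the script above): how the argmax span decoding in the
# eval loop works. torch.ger builds the outer product scores[i, j] = P(start=i) * P(end=j),
# and triu_/tril_ zero out spans with end < start or length > max_len before the argmax.
# The tensors below are made up for the example.
import numpy as np
import torch

score_s = torch.tensor([0.1, 0.6, 0.3])    # hypothetical start probabilities
score_e = torch.tensor([0.2, 0.3, 0.5])    # hypothetical end probabilities
max_len = 15
scores = torch.ger(score_s, score_e)       # 3 x 3 matrix of joint span scores
scores.triu_().tril_(max_len - 1)          # keep only legal spans (start <= end, bounded length)
scores_np = scores.numpy()
s_idx, e_idx = np.unravel_index(np.argmax(scores_np), scores_np.shape)
print(s_idx, e_idx)                        # -> 1 2, the highest-scoring legal span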
def train(): """ Main script for training. """ args, train_config = get_train_args() num_classes = args.num_classes # Communicator and Context from nnabla.ext_utils import get_extension_context extension_module = "cudnn" # TODO: Hard coded!!! ctx = get_extension_context(extension_module, device_id=args.device_id, type_config=args.type_config) comm = CommunicatorWrapper(ctx) nn.set_default_context(comm.ctx) # To utilize TensorCore in FP16 channels = 4 if args.type_config == 'half' else 3 from nnabla_ext.cuda import StreamEventHandler stream_event_handler = StreamEventHandler(int(comm.ctx.device_id)) # Create data iterater data, vdata = get_data_iterators(args, comm, channels, args.spatial_size) # Create mixup object mixup = create_mixup_or_none(train_config.mixup, num_classes, comm) # Load model for fine-tuning if args.finetune: assert args.model_load_path is not None, "`--model-load-path` must be set in finetuning mode." if comm.rank == 0: logger.info(f'Loading parameter file `{args.model_load_path}.`') logger.info( "NOTE: It doesn't verify the compatibility between the parameter file and the architecture you choose." ) nn.load_parameters(args.model_load_path) # String assumption that the last two paramters is the classification layer. param_keys = list(nn.get_parameters().keys()) bkey = param_keys[-1] wkey = param_keys[-2] if comm.rank == 0: logger.info( f'Removing the last two parameter for fine tuning under an assumption that those correspond to the final affine layer parameters; `{wkey}` and `{bkey}`.' ) nn.parameter.pop_parameter(wkey) nn.parameter.pop_parameter(bkey) # Network for training t_model = get_model(args, num_classes, test=False, channel_last=args.channel_last, mixup=mixup, channels=channels, spatial_size=args.spatial_size, label_smoothing=train_config.label_smoothing, ctx_for_loss=comm.ctx_float) # Network for validation v_model = get_model(args, num_classes, test=True, channel_last=args.channel_last, spatial_size=args.spatial_size, channels=channels) # Solver # lr will be set later solver = MomentumNoWeightDecayBn(1, train_config.momentum) solver.set_parameters(nn.get_parameters()) # Learning rate scheduler learning_rate_scheduler = create_learning_rate_scheduler(train_config) # Monitors monitor = None if comm.rank == 0: if not os.path.isdir(args.monitor_path): os.makedirs(args.monitor_path) monitor = M.Monitor(args.monitor_path) save_args(args, train_config) # Epoch runner loss_scaling = train_config.loss_scaling if args.type_config == 'half' else 1 train_epoch = EpochTrainer(t_model, solver, learning_rate_scheduler, data, comm, monitor, loss_scaling, train_config.weight_decay, stream_event_handler, mixup) val_epoch = None if args.val_interval > 0: val_epoch = EpochValidator(v_model, vdata, comm, monitor, stream_event_handler) # Epoch loop for epoch in range(train_config.epochs): # Save parameters if epoch > 0 and epoch % ( args.model_save_interval) == 0 and comm.rank == 0: nn.save_parameters( os.path.join(args.monitor_path, 'param_%03d.h5' % epoch)) # Run validation for examples in an epoch if val_epoch is not None \ and epoch > 0 \ and epoch % args.val_interval == 0: val_epoch.run(epoch) # Run training for examples in an epoch train_epoch.run(epoch) # Run final validation if val_epoch is not None: val_epoch.run(train_config.epochs) # Save the final model. if comm.rank == 0: nn.save_parameters( os.path.join(args.monitor_path, 'param_%03d.h5' % (train_config.epochs)))
def train():
    # Check NNabla version
    if utils.get_nnabla_version_integer() < 11900:
        raise ValueError(
            'Please update nnabla to v1.19.0 or later, since the memory efficiency '
            'of the core engine was improved in v1.19.0.')
    parser, args = get_train_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    comm = CommunicatorWrapper(ctx)
    nn.set_default_context(comm.ctx)
    ext = import_extension_module(args.context)

    # Set up monitors for logging
    monitor_path = args.output
    monitor = Monitor(monitor_path)
    monitor_best_epoch = MonitorSeries('Best epoch', monitor, interval=1)
    monitor_traing_loss = MonitorSeries('Training loss', monitor, interval=1)
    monitor_validation_loss = MonitorSeries('Validation loss', monitor, interval=1)
    monitor_lr = MonitorSeries('learning rate', monitor, interval=1)
    monitor_time = MonitorTimeElapsed("training time per iteration", monitor, interval=1)

    if comm.rank == 0:
        if not os.path.isdir(args.output):
            os.makedirs(args.output)

    # Initialize DataIterator for MUSDB18.
    train_source, valid_source, args = load_datasources(parser, args)
    train_iter = data_iterator(
        train_source,
        args.batch_size,
        RandomState(args.seed),
        with_memory_cache=False,
    )
    valid_iter = data_iterator(
        valid_source,
        1,
        RandomState(args.seed),
        with_memory_cache=False,
    )
    if comm.n_procs > 1:
        train_iter = train_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)
        valid_iter = valid_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

    # Calculate max_iter per GPU device.
    # Scale max_iter, learning rate and weight decay according to the number of GPU devices
    # for multi-GPU training.
    default_batch_size = 16
    train_scale_factor = (comm.n_procs * args.batch_size) / default_batch_size
    max_iter = int((train_source._size // args.batch_size) // comm.n_procs)
    weight_decay = args.weight_decay * train_scale_factor
    args.lr = args.lr * train_scale_factor

    # Calculate the statistics (mean and variance) of the dataset
    scaler_mean, scaler_std = utils.get_statistics(args, train_source)

    # clear cache memory
    ext.clear_memory_cache()

    max_bin = utils.bandwidth_to_max_bin(train_source.sample_rate, args.nfft,
                                         args.bandwidth)

    # Get X-UMX/UMX computation graph and variables as namedtuple
    model = get_model(args, scaler_mean, scaler_std, max_bin=max_bin)

    # Create Solver and set parameters.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # Initialize Early Stopping
    es = utils.EarlyStopping(patience=args.patience)

    # Initialize LR Scheduler (ReduceLROnPlateau)
    lr_scheduler = ReduceLROnPlateau(lr=args.lr,
                                     factor=args.lr_decay_gamma,
                                     patience=args.lr_decay_patience)
    best_epoch = 0

    # AverageMeter for mean loss calculation over the epoch
    losses = utils.AverageMeter()

    # Training loop.
    for epoch in trange(args.epochs):
        # TRAINING
        losses.reset()
        for batch in range(max_iter):
            model.mixture_audio.d, model.target_audio.d = train_iter.next()
            solver.zero_grad()
            model.loss.forward(clear_no_need_grad=True)
            if comm.n_procs > 1:
                all_reduce_callback = comm.get_all_reduce_callback()
                model.loss.backward(clear_buffer=True,
                                    communicator_callbacks=all_reduce_callback)
            else:
                model.loss.backward(clear_buffer=True)
            solver.weight_decay(weight_decay)
            solver.update()
            losses.update(model.loss.d.copy(), args.batch_size)
        training_loss = losses.get_avg()

        # clear cache memory
        ext.clear_memory_cache()

        # VALIDATION
        losses.reset()
        for batch in range(int(valid_source._size // comm.n_procs)):
            x, y = valid_iter.next()
            dur = int(valid_source.sample_rate * args.valid_dur)
            sp, cnt = 0, 0
            loss_tmp = nn.NdArray()
            loss_tmp.zero()
            while 1:
                model.vmixture_audio.d = x[Ellipsis, sp:sp + dur]
                model.vtarget_audio.d = y[Ellipsis, sp:sp + dur]
                model.vloss.forward(clear_no_need_grad=True)
                cnt += 1
                sp += dur
                loss_tmp += model.vloss.data
                if x[Ellipsis, sp:sp + dur].shape[-1] < dur or x.shape[-1] == cnt * dur:
                    break
            loss_tmp = loss_tmp / cnt
            if comm.n_procs > 1:
                comm.all_reduce(loss_tmp, division=True, inplace=True)
            losses.update(loss_tmp.data.copy(), 1)
        validation_loss = losses.get_avg()

        # clear cache memory
        ext.clear_memory_cache()

        lr = lr_scheduler.update_lr(validation_loss, epoch=epoch)
        solver.set_learning_rate(lr)
        stop = es.step(validation_loss)

        if comm.rank == 0:
            monitor_best_epoch.add(epoch, best_epoch)
            monitor_traing_loss.add(epoch, training_loss)
            monitor_validation_loss.add(epoch, validation_loss)
            monitor_lr.add(epoch, lr)
            monitor_time.add(epoch)
            if validation_loss == es.best:
                best_epoch = epoch
                # save best model
                if args.umx_train:
                    nn.save_parameters(os.path.join(args.output, 'best_umx.h5'))
                else:
                    nn.save_parameters(os.path.join(args.output, 'best_xumx.h5'))

        if args.umx_train:
            # Early stopping for UMX after `args.patience` (140) epochs without improvement
            if stop:
                print("Apply Early Stopping")
                break
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from args import get_train_args

config = get_train_args()
D = config.connector_dim
Nh = config.num_heads
Dword = config.glove_dim
Dchar = config.char_dim
batch_size = config.batch_size
dropout = config.dropout
dropout_char = config.dropout_char
Lc = config.para_limit
Lq = config.ques_limit


def mask_logits(inputs, mask):
    mask = mask.type(torch.float32)
    return inputs + (-1e30) * (1 - mask)


class Initialized_Conv1d(nn.Module):
    def __init__(self, in_channels, out_channels,
                 kernel_size=1, relu=False, stride=1,
def train():
    # Check NNabla version
    if get_nnabla_version_integer() < 11900:
        raise ValueError(
            'Please update nnabla to v1.19.0 or later, since the memory efficiency '
            'of the core engine was improved in v1.19.0.')
    parser, args = get_train_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    comm = CommunicatorWrapper(ctx)
    nn.set_default_context(comm.ctx)
    ext = import_extension_module(args.context)

    # Set up monitors for logging
    monitor_path = os.path.join(args.output, args.target)
    monitor = Monitor(monitor_path)
    monitor_traing_loss = MonitorSeries('Training loss', monitor, interval=1)
    monitor_lr = MonitorSeries('learning rate', monitor, interval=1)
    monitor_time = MonitorTimeElapsed("training time per epoch", monitor, interval=1)

    if comm.rank == 0:
        if not os.path.isdir(args.output):
            os.makedirs(args.output)

    # Initialize DataIterator for MUSDB.
    train_source, args = load_datasources(parser, args)
    train_iter = data_iterator(train_source,
                               args.batch_size,
                               RandomState(args.seed),
                               with_memory_cache=False)
    if comm.n_procs > 1:
        train_iter = train_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

    # Scale max_iter, learning rate and weight decay according to the number of GPU devices
    # for multi-GPU training.
    default_batch_size = 6
    train_scale_factor = (comm.n_procs * args.batch_size) / default_batch_size
    max_iter = int(train_source._size // (comm.n_procs * args.batch_size))
    weight_decay = args.weight_decay * train_scale_factor
    args.lr = args.lr * train_scale_factor
    print(f"max_iter per GPU-device: {max_iter}")

    # Calculate the statistics (mean and variance) of the dataset
    scaler_mean, scaler_std = get_statistics(args, train_source)

    # clear cache memory
    ext.clear_memory_cache()

    # Create input variables.
    mixture_audio = nn.Variable([args.batch_size] +
                                list(train_source._get_data(0)[0].shape))
    target_audio = nn.Variable([args.batch_size] +
                               list(train_source._get_data(0)[1].shape))

    with open(f"./configs/{args.target}.yaml") as file:
        # Load target-specific hyperparameters
        hparams = yaml.load(file, Loader=yaml.FullLoader)

    # Create training graph
    mix_spec = spectogram(*stft(mixture_audio,
                                n_fft=hparams['fft_size'],
                                n_hop=hparams['hop_size'],
                                patch_length=256),
                          mono=(hparams['n_channels'] == 1))
    target_spec = spectogram(*stft(target_audio,
                                   n_fft=hparams['fft_size'],
                                   n_hop=hparams['hop_size'],
                                   patch_length=256),
                             mono=(hparams['n_channels'] == 1))
    with nn.parameter_scope(args.target):
        d3net = D3NetMSS(hparams,
                         comm=comm.comm,
                         input_mean=scaler_mean,
                         input_scale=scaler_std,
                         init_method='xavier')
        pred_spec = d3net(mix_spec)

    loss = F.mean(F.squared_error(pred_spec, target_spec))
    loss.persistent = True

    # Create Solver and set parameters.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # Initialize LR Scheduler (AnnealingScheduler)
    lr_scheduler = AnnealingScheduler(init_lr=args.lr,
                                      anneal_steps=[40],
                                      anneal_factor=0.1)

    # AverageMeter for mean loss calculation over the epoch
    losses = AverageMeter()

    for epoch in range(args.epochs):
        # TRAINING
        losses.reset()
        for batch in range(max_iter):
            mixture_audio.d, target_audio.d = train_iter.next()
            solver.zero_grad()
            loss.forward(clear_no_need_grad=True)
            if comm.n_procs > 1:
                all_reduce_callback = comm.get_all_reduce_callback()
                loss.backward(clear_buffer=True,
                              communicator_callbacks=all_reduce_callback)
            else:
                loss.backward(clear_buffer=True)
            solver.weight_decay(weight_decay)
            solver.update()
            losses.update(loss.d.copy(), args.batch_size)
        training_loss = losses.get_avg()

        # clear cache memory
        ext.clear_memory_cache()

        lr = lr_scheduler.get_learning_rate(epoch)
        solver.set_learning_rate(lr)

        if comm.rank == 0:
            monitor_traing_loss.add(epoch, training_loss)
            monitor_lr.add(epoch, lr)
            monitor_time.add(epoch)

            # save intermediate weights
            nn.save_parameters(f"{os.path.join(args.output, args.target)}.h5")

    if comm.rank == 0:
        # save final weights
        nn.save_parameters(f"{os.path.join(args.output, args.target)}_final.h5")
def main():
    opts = get_train_args()
    print("load data ...")
    data = DataSet('data/modified_triples.txt')
    dataloader = DataLoader(data, shuffle=True, batch_size=opts.batch_size)

    print("load model ...")
    if opts.model_type == 'transe':
        model = TransE(opts, data.ent_tot, data.rel_tot)
    elif opts.model_type == "distmult":
        model = DistMult(opts, data.ent_tot, data.rel_tot)
    if opts.optimizer == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=opts.lr)
    elif opts.optimizer == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=opts.lr)
    model.cuda()
    model.relation_normalize()
    loss = torch.nn.MarginRankingLoss(margin=opts.margin)

    print("start training")
    for epoch in range(1, opts.epochs + 1):
        print("epoch : " + str(epoch))
        model.train()
        epoch_start = time.time()
        epoch_loss = 0
        tot = 0
        cnt = 0
        for i, batch_data in enumerate(dataloader):
            optimizer.zero_grad()
            batch_h, batch_r, batch_t, batch_n = batch_data
            batch_h = torch.LongTensor(batch_h).cuda()
            batch_r = torch.LongTensor(batch_r).cuda()
            batch_t = torch.LongTensor(batch_t).cuda()
            batch_n = torch.LongTensor(batch_n).cuda()
            pos_score, neg_score, dist = model.forward(batch_h, batch_r, batch_t, batch_n)
            pos_score = pos_score.cpu()
            neg_score = neg_score.cpu()
            dist = dist.cpu()
            train_loss = loss(pos_score, neg_score,
                              torch.ones(pos_score.size(-1))) + dist
            train_loss.backward()
            optimizer.step()
            batch_loss = torch.sum(train_loss)
            epoch_loss += batch_loss
            batch_size = batch_h.size(0)
            tot += batch_size
            cnt += 1
            print('\r{:>10} epoch {} progress {} loss: {}\n'.format(
                '', epoch, tot / len(data), train_loss), end='')
        end = time.time()
        time_used = end - epoch_start
        epoch_loss /= cnt
        print('one epoch time: {} minutes'.format(time_used / 60))
        print('{} epochs'.format(epoch))
        print('epoch {} loss: {}'.format(epoch, epoch_loss))

        if epoch % opts.save_step == 0:
            print("save model...")
            model.entity_normalize()
            torch.save(model.state_dict(), 'model.pt')

    print("save model...")
    model.entity_normalize()
    torch.save(model.state_dict(), 'model.pt')
    print("[Saving embeddings of whole entities & relations...]")
    save_embeddings(model, opts, data.id2ent, data.id2rel)
    print("[Embedding results are saved successfully.]")
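# Hedged illustration (not part of the script above): the semantics of MarginRankingLoss
# as it is called in the training loop. With target y = 1 the loss is
# mean(max(0, -(pos - neg) + margin)), i.e. it pushes the positive-triple score above the
# corrupted-triple score by at least the margin. The scores below are made up.
import torch

loss_fn = torch.nn.MarginRankingLoss(margin=1.0)
pos = torch.tensor([0.9, 0.4])    # hypothetical scores for true triples
neg = torch.tensor([0.2, 0.5])    # hypothetical scores for corrupted triples
target = torch.ones(2)
print(loss_fn(pos, neg, target))  # mean([max(0, 0.3), max(0, 1.1)]) = 0.7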
            saver.save(step, model, results[args.metric_name], device)
            ema.resume(model)

            # Log to console
            results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                    for k, v in results.items())
            log.info('Dev {}'.format(results_str))

            # Log to TensorBoard
            log.info('Visualizing in TensorBoard...')
            for k, v in results.items():
                tbx.add_scalar('dev/{}'.format(k), v, step)
            util.visualize(tbx,
                           pred_dict=pred_dict,
                           eval_path=args.dev_eval_file,
                           step=step,
                           split='dev',
                           num_visuals=args.num_visuals)


if __name__ == '__main__':
    main(get_train_args())
def train():
    # Check NNabla version
    if utils.get_nnabla_version_integer() < 11900:
        raise ValueError(
            'Please update nnabla to v1.19.0 or later, since the memory efficiency '
            'of the core engine was improved in v1.19.0.')
    parser, args = get_train_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    comm = CommunicatorWrapper(ctx)
    nn.set_default_context(comm.ctx)
    ext = import_extension_module(args.context)

    # Set up monitors for logging
    monitor_path = args.output
    monitor = Monitor(monitor_path)
    monitor_best_epoch = MonitorSeries('Best epoch', monitor, interval=1)
    monitor_traing_loss = MonitorSeries('Training loss', monitor, interval=1)
    monitor_validation_loss = MonitorSeries('Validation loss', monitor, interval=1)
    monitor_lr = MonitorSeries('learning rate', monitor, interval=1)
    monitor_time = MonitorTimeElapsed("training time per iteration", monitor, interval=1)

    if comm.rank == 0:
        print("Mixing coef. is {}, i.e., MDL = {}*TD-Loss + FD-Loss".format(
            args.mcoef, args.mcoef))
        if not os.path.isdir(args.output):
            os.makedirs(args.output)

    # Initialize DataIterator for MUSDB.
    train_source, valid_source, args = load_datasources(parser, args)
    train_iter = data_iterator(train_source,
                               args.batch_size,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)
    valid_iter = data_iterator(valid_source,
                               1,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)
    if comm.n_procs > 1:
        train_iter = train_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)
        valid_iter = valid_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

    # Calculate max_iter per GPU device.
    max_iter = int((train_source._size // args.batch_size) // comm.n_procs)
    weight_decay = args.weight_decay * comm.n_procs
    print("max_iter", max_iter)

    # Calculate the statistics (mean and variance) of the dataset
    scaler_mean, scaler_std = utils.get_statistics(args, train_source)

    max_bin = utils.bandwidth_to_max_bin(train_source.sample_rate, args.nfft,
                                         args.bandwidth)

    unmix = OpenUnmix_CrossNet(input_mean=scaler_mean,
                               input_scale=scaler_std,
                               nb_channels=args.nb_channels,
                               hidden_size=args.hidden_size,
                               n_fft=args.nfft,
                               n_hop=args.nhop,
                               max_bin=max_bin)

    # Create input variables.
    mixture_audio = nn.Variable([args.batch_size] +
                                list(train_source._get_data(0)[0].shape))
    target_audio = nn.Variable([args.batch_size] +
                               list(train_source._get_data(0)[1].shape))
    vmixture_audio = nn.Variable(
        [1] + [2, valid_source.sample_rate * args.valid_dur])
    vtarget_audio = nn.Variable(
        [1] + [8, valid_source.sample_rate * args.valid_dur])

    # Create training graph
    mix_spec, M_hat, pred = unmix(mixture_audio)
    Y = Spectrogram(*STFT(target_audio, n_fft=unmix.n_fft, n_hop=unmix.n_hop),
                    mono=(unmix.nb_channels == 1))
    loss_f = mse_loss(mix_spec, M_hat, Y)
    loss_t = sdr_loss(mixture_audio, pred, target_audio)
    loss = args.mcoef * loss_t + loss_f
    loss.persistent = True

    # Create Solver and set parameters.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # Create validation graph
    vmix_spec, vM_hat, vpred = unmix(vmixture_audio, test=True)
    vY = Spectrogram(*STFT(vtarget_audio, n_fft=unmix.n_fft, n_hop=unmix.n_hop),
                     mono=(unmix.nb_channels == 1))
    vloss_f = mse_loss(vmix_spec, vM_hat, vY)
    vloss_t = sdr_loss(vmixture_audio, vpred, vtarget_audio)
    vloss = args.mcoef * vloss_t + vloss_f
    vloss.persistent = True

    # Initialize Early Stopping
    es = utils.EarlyStopping(patience=args.patience)

    # Initialize LR Scheduler (ReduceLROnPlateau)
    lr_scheduler = ReduceLROnPlateau(lr=args.lr,
                                     factor=args.lr_decay_gamma,
                                     patience=args.lr_decay_patience)
    best_epoch = 0

    # Training loop.
    for epoch in trange(args.epochs):
        # TRAINING
        losses = utils.AverageMeter()
        for batch in range(max_iter):
            mixture_audio.d, target_audio.d = train_iter.next()
            solver.zero_grad()
            loss.forward(clear_no_need_grad=True)
            if comm.n_procs > 1:
                all_reduce_callback = comm.get_all_reduce_callback()
                loss.backward(clear_buffer=True,
                              communicator_callbacks=all_reduce_callback)
            else:
                loss.backward(clear_buffer=True)
            solver.weight_decay(weight_decay)
            solver.update()
            losses.update(loss.d.copy(), args.batch_size)
        training_loss = losses.avg

        # clear cache memory
        ext.clear_memory_cache()

        # VALIDATION
        vlosses = utils.AverageMeter()
        for batch in range(int(valid_source._size // comm.n_procs)):
            x, y = valid_iter.next()
            dur = int(valid_source.sample_rate * args.valid_dur)
            sp, cnt = 0, 0
            loss_tmp = nn.NdArray()
            loss_tmp.zero()
            while 1:
                vmixture_audio.d = x[Ellipsis, sp:sp + dur]
                vtarget_audio.d = y[Ellipsis, sp:sp + dur]
                vloss.forward(clear_no_need_grad=True)
                cnt += 1
                sp += dur
                loss_tmp += vloss.data
                if x[Ellipsis, sp:sp + dur].shape[-1] < dur or x.shape[-1] == cnt * dur:
                    break
            loss_tmp = loss_tmp / cnt
            if comm.n_procs > 1:
                comm.all_reduce(loss_tmp, division=True, inplace=True)
            vlosses.update(loss_tmp.data.copy(), 1)
        validation_loss = vlosses.avg

        # clear cache memory
        ext.clear_memory_cache()

        lr = lr_scheduler.update_lr(validation_loss, epoch=epoch)
        solver.set_learning_rate(lr)
        stop = es.step(validation_loss)

        if comm.rank == 0:
            monitor_best_epoch.add(epoch, best_epoch)
            monitor_traing_loss.add(epoch, training_loss)
            monitor_validation_loss.add(epoch, validation_loss)
            monitor_lr.add(epoch, lr)
            monitor_time.add(epoch)
            if validation_loss == es.best:
                # save best model
                nn.save_parameters(os.path.join(args.output, 'best_xumx.h5'))
                best_epoch = epoch

        if stop:
            print("Apply Early Stopping")
            break
def main():
    early_stopping = EarlyStopping(5, 0.0)
    opts = get_train_args()
    print("load data ...")
    train_data = datasets.ImageFolder(
        root="data/train",
        transform=transforms.Compose([
            transforms.Resize((256, 256)),   # resize to 256 x 256
            # transforms.CenterCrop(256),    # then crop to a square
            transforms.ToTensor(),           # convert to a tensor (auto-normalized to 0~1)
            transforms.Normalize(
                (0.5, 0.5, 0.5),             # normalize to the range -1 ~ 1,
                (0.5, 0.5, 0.5)),            # since the transform is (c - m) / s
        ]))
    valid_data = datasets.ImageFolder(
        root="data/val",
        transform=transforms.Compose([
            transforms.Resize((256, 256)),
            # transforms.CenterCrop(128),
            transforms.ToTensor(),
            transforms.Normalize(
                (0.5, 0.5, 0.5),
                (0.5, 0.5, 0.5)),
        ]))
    train_loader = DataLoader(train_data,
                              batch_size=opts.batch_size,
                              shuffle=True,
                              num_workers=opts.num_processes)
    valid_loader = DataLoader(valid_data,
                              batch_size=opts.batch_size,
                              shuffle=True,
                              num_workers=opts.num_processes)
    classes = train_data.classes
    print(classes)

    print("load model ...")
    if opts.model == 'resnet':
        model = models.resnet50(progress=True)
    elif opts.model == 'vggnet':
        model = models.vgg13_bn(progress=True)
    elif opts.model == 'googlenet':
        model = models.googlenet(progress=True)
    elif opts.model == 'densenet':
        model = models.densenet121(progress=True)
    else:
        model = models.resnext50_32x4d(progress=True)
    print(opts.model)
    optimizer = optim.Adam(model.parameters(), lr=opts.lr)
    model.cuda()
    loss = torch.nn.CrossEntropyLoss()
    batch_nums = np.round(14400 / opts.batch_size)
    valid_nums = np.round(1600 / opts.batch_size)

    print("start training")
    for epoch in range(1, opts.epochs + 1):
        print("epoch : " + str(epoch))
        model.train()
        epoch_loss = 0
        tot = 0
        cnt = 0
        for i, (inputs, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            inputs, labels = inputs.cuda(), labels.cuda()
            train_loss = loss(model(inputs), labels)
            train_loss.backward()
            optimizer.step()
            batch_loss = train_loss.item()
            epoch_loss += batch_loss
            cnt += 1
            tot += labels.size(0)  # track processed samples so the progress ratio is meaningful
            print('\r{:>10} epoch {} progress {} loss: {}\n'.format(
                '', epoch, tot / 14400, train_loss))
        with open(str(opts.model) + ' log.txt', 'a') as f:
            f.write(str(epoch) + ' loss : ' + str(epoch_loss / batch_nums) + '\n')

        model.eval()
        valid_loss = 0
        total = 0
        correct = 0
        with torch.no_grad():
            for i, (inputs, labels) in enumerate(valid_loader):
                inputs, labels = inputs.cuda(), labels.cuda()
                outputs = model(inputs)
                batch_loss = loss(outputs, labels).item()
                valid_loss += batch_loss
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()
        acc = 100 * correct / total
        with open(str(opts.model) + ' log.txt', 'a') as f:
            f.write(str(epoch) + ' loss : ' + str(valid_loss / valid_nums) +
                    ' acc : ' + str(acc) + '\n')

        # check early stopping
        if early_stopping(valid_loss):
            print("[Training is early stopped in %d Epoch.]" % epoch)
            torch.save(model.state_dict(), str(opts.model) + '_model.pt')
            print("[Saved the trained model successfully.]")
            break

        if epoch % opts.save_step == 0:
            print("save model...")
            torch.save(model.state_dict(), str(opts.model) + '_model.pt')

    print("save model...")
    torch.save(model.state_dict(), str(opts.model) + '_model.pt')