def train(): logging("Training text AE") # gan: preparation if args.niters_gan_schedule != "": gan_schedule = [int(x) for x in args.niters_gan_schedule.split("-")] else: gan_schedule = [] niter_gan = 1 best_test_loss = None impatience = 0 print("Begin!\n") for epoch in range(1, args.epochs + 1): # update gan training schedule if epoch in gan_schedule: niter_gan += 1 logging("GAN training loop schedule: {}".format(niter_gan)) total_loss_ae = 0 epoch_start_time = time.time() start_time = time.time() # train ae train_data = corpus.batchify(corpus.train, args.batch_size, shuffle=True) for niter, data in enumerate(train_data): # if niter == 1: # print(data) total_loss_ae, start_time = train_ae(epoch, data, total_loss_ae, start_time, niter) if niter % 10 == 0: autoencoder.noise_anneal(args.noise_anneal) logging('[{}/{}][{}/{}]'.format( epoch, args.epochs, niter, train_batches_num)) # eval # corpus是全局的,如果用过一次,生成器就遍历完了,下次在用就不会生成数据,所以test_data这里必须设成函数内局部变量 test_data = corpus.batchify(corpus.test, eval_batch_size, shuffle=False) test_loss, accuracy = evaluate_autoencoder(test_data, epoch) # print(2) logging('| end of epoch {:3d} | time: {:5.2f}s | test loss {:5.2f} | ' 'test ppl {:5.2f} | acc {:3.3f}'.format(epoch, (time.time() - epoch_start_time), test_loss, math.exp(test_loss), accuracy)) save_ckpt("ckpt_epoch%d" % epoch, args.save, autoencoder, args, corpus) if best_test_loss is None or test_loss < best_test_loss: impatience = 0 best_test_loss = test_loss logging("New saving model: epoch {}. best valid score={:.6f}".format(epoch, best_test_loss)) save_ckpt("ckpt_epoch%d-best@%.6f" % (epoch, best_test_loss), args.save, autoencoder, args, corpus) else: if not args.no_earlystopping and epoch >= args.min_epochs: impatience += 1 if impatience > args.patience: logging("Ending training") sys.exit()
def train(): logging("Training text AE") # gan: preparation if args.niters_gan_schedule != "": gan_schedule = [int(x) for x in args.niters_gan_schedule.split("-")] else: gan_schedule = [] niter_gan = 1 best_test_loss = None impatience = 0 for epoch in range(1, args.epochs + 1): # update gan training schedule if epoch in gan_schedule: niter_gan += 1 logging("GAN training loop schedule: {}".format(niter_gan)) total_loss_ae = 0 epoch_start_time = time.time() start_time = time.time() niter = 0 # train ae for i in range(len(train_data)): # print("train batch %d" % i) total_loss_ae, start_time = train_ae(epoch, train_data[niter], total_loss_ae, start_time, niter) niter += 1 if niter % 10 == 0: autoencoder.noise_anneal(args.noise_anneal) logging('[{}/{}][{}/{}]'.format(epoch, args.epochs, niter, len(train_data))) # eval test_loss, accuracy = evaluate_autoencoder(test_data, epoch) logging('| end of epoch {:3d} | time: {:5.2f}s | test loss {:5.2f} | ' 'test ppl {:5.2f} | acc {:3.3f}'.format( epoch, (time.time() - epoch_start_time), test_loss, math.exp(test_loss), accuracy)) save_ckpt("ckpt_epoch%d" % epoch, args.save, autoencoder, args, corpus) if best_test_loss is None or test_loss < best_test_loss: impatience = 0 best_test_loss = test_loss logging( "New saving model: epoch {}. best valid score={.6f}".format( epoch, best_test_loss)) save_ckpt("ckpt_epoch%d-best@%.6f" % (epoch, best_test_loss), args.save, autoencoder, args, corpus) else: if not args.no_earlystopping and epoch >= args.min_epochs: impatience += 1 if impatience > args.patience: logging("Ending training") sys.exit()
def train_model(model, train_iter, epoch):
    total_epoch_loss, total_epoch_acc, steps, best_acc = 0., 0., 0, .7
    weight_p, bias_p = [], []
    for name, p in model.named_parameters():
        if p.requires_grad:
            if 'bias' in name:
                bias_p.append(p)
            else:
                weight_p.append(p)
    optim = torch.optim.Adam(
        [{'params': weight_p, 'weight_decay': args.l2_reg_lambda},
         {'params': bias_p, 'weight_decay': 0}],
        lr=args.learning_rate * (args.decay_rate ** epoch))
    if torch.cuda.is_available():
        model.cuda()
    model.train()
    for id, text, label in train_iter:
        # if len(label) != args.batch_size: continue
        label = torch.autograd.Variable(label).long()
        if torch.cuda.is_available():
            text = text.cuda()
            label = label.cuda()
        optim.zero_grad()
        output = model(text)  # [batch_size, sentences]
        loss = loss_ft(output, label)
        acc = calc_acc(output, label)
        loss.backward()
        # clip_gradient(model, 1e-1)
        optim.step()
        steps += 1
        total_epoch_loss += loss.item()
        total_epoch_acc += acc
        if steps == 1 or steps % args.evaluate_every == 0:
            logger.info('\n--- Train Step: %d ---' % steps)
            logger.info('Acc = %.4f, Loss = %.4f' % (acc, loss))
            print('\n--- Train Step: %d ---' % steps)
            print('Acc = %.4f, Loss = %.4f' % (acc, loss))
            writer.add_scalar('Train/Loss', loss, epoch * len(train_iter) + steps)
            writer.add_scalar('Train/Acc', acc * 100, epoch * len(train_iter) + steps)
            writer.flush()
        # save checkpoint if acc achieves best acc
        if acc > best_acc and steps % args.evaluate_every == 0:
            utils.save_ckpt(print_datetime, epoch, model, model_name, acc)
            best_acc = acc
    # averaged loss and acc
    return total_epoch_loss / steps, total_epoch_acc / steps
def proc_func(infile, outfile, csv_path, csv_index):
    img_path = img_folder + infile.split('.')[0] + '/'
    video2frames(src_folder + infile, img_path)
    feature = OpticalFlowAnalyzer(img_path).analyze()
    np.savez(dst_folder + outfile, feature)
    [csv_old, index_old] = load_ckpt(ckpt_path).split('#')
    # compare strings by value, not identity ("is" was a bug here)
    ckpt_index = str(max(csv_index, int(index_old)) if csv_path == csv_old else csv_index)
    ckpt_info = csv_path + '#' + ckpt_index
    save_ckpt(ckpt_info, ckpt_path)
def proc_func(infile, outfile, csv_path, csv_index):
    try:
        audio_analyzer = AudioAnalyzer(src_folder + infile)
        audio_analyzer.compute_features()
        feature = audio_analyzer.analyze()
        np.savez(dst_folder + outfile, **feature)
    except Exception:
        # avoid a bare except, which would also swallow KeyboardInterrupt
        pass
    [csv_old, index_old] = load_ckpt(ckpt_path).split('#')
    ckpt_index = str(max(csv_index, int(index_old)) if csv_path == csv_old else csv_index)
    ckpt_info = csv_path + '#' + ckpt_index
    save_ckpt(ckpt_info, ckpt_path)
def proc_func(csv_path):
    start_time = time.time()
    print(csv_path + ' has begun ...')
    ckpt_path = ckpt_folder + csv_path + '.ckpt'
    check_path(ckpt_path, binary=True)
    ckpt_index = read_ckpt(ckpt_path)
    csv_file = csv.reader(open(csv_folder + csv_path + '.csv'))
    _ = next(csv_file)  # skip the header row
    rows = [row for row in csv_file]
    print('start from checkpoint ' + str(ckpt_index + 1) + ' in ' + str(csv_path))
    for i in tqdm(range(ckpt_index + 1, len(rows))):
        process(rows[i][0], rows[i][1])
        save_ckpt(i, ckpt_path)
    print(csv_path + ' has been done in ' + str(time.time() - start_time))
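# The CSV pipelines above lean on tiny file-based checkpoint helpers
# (save_ckpt / load_ckpt / read_ckpt) that are never shown. A minimal sketch of
# what they might look like, assuming the checkpoint is a small text file
# holding either a "csv_name#row_index" marker or a bare row index:
def save_ckpt(ckpt_info, ckpt_path):
    """Hypothetical helper: overwrite the checkpoint file with the latest marker."""
    with open(ckpt_path, 'w') as f:
        f.write(str(ckpt_info))


def load_ckpt(ckpt_path):
    """Hypothetical helper: return the raw checkpoint string (e.g. 'file#42')."""
    with open(ckpt_path) as f:
        return f.read().strip()


def read_ckpt(ckpt_path):
    """Hypothetical helper: return the last processed row index as an int."""
    content = load_ckpt(ckpt_path)
    return int(content) if content else -1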
def main():
    parser = get_args()
    args, unparsed = parser.parse_known_args()
    if len(unparsed) != 0:
        raise NameError("Argument {} not recognized".format(unparsed))
    logger = GOATLogger(args.mode, args.save, args.log_freq)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.cpu:
        device = torch.device('cpu')
    else:
        if not torch.cuda.is_available():
            raise RuntimeError("GPU unavailable.")
        args.devices = torch.cuda.device_count()
        args.batch_size *= args.devices
        torch.backends.cudnn.benchmark = True
        device = torch.device('cuda')
        torch.cuda.manual_seed(args.seed)

    # Get data
    train_loader, val_loader, vocab_size, num_answers = prepare_data(args)

    # Set up model
    model = Model(vocab_size, args.word_embed_dim, args.hidden_size,
                  args.resnet_out, num_answers)
    model = nn.DataParallel(model).to(device)
    logger.loginfo("Parameters: {:.3f}M".format(
        sum(p.numel() for p in model.parameters()) / 1e6))

    # Set up optimizer
    optim = torch.optim.Adamax(model.parameters(), lr=2e-3)

    last_epoch = 0
    bscore = 0.0
    if args.resume:
        logger.loginfo("Initialized from ckpt: " + args.resume)
        ckpt = torch.load(args.resume, map_location=device)
        last_epoch = ckpt['epoch']
        model.load_state_dict(ckpt['state_dict'])
        optim.load_state_dict(ckpt['optim_state_dict'])

    if args.mode == 'eval':
        _ = evaluate(val_loader, model, last_epoch, device, logger, args.data_root)
        return

    # Train
    for epoch in range(last_epoch, args.epoch):
        train(train_loader, model, optim, epoch, device, logger)
        score = evaluate(val_loader, model, epoch, device, logger)
        bscore = save_ckpt(score, bscore, epoch, model, optim, args.save, logger)

    logger.loginfo("Done")
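# main() above expects save_ckpt to both persist the checkpoint and return the
# updated best score. A minimal sketch under that assumption; the file names
# and the keep-latest-plus-best policy are guesses, not the original helper:
import os

import torch


def save_ckpt(score, bscore, epoch, model, optim, save_dir, logger):
    """Hypothetical helper: save the latest ckpt; track and return the best score."""
    os.makedirs(save_dir, exist_ok=True)
    state = {'epoch': epoch,
             'state_dict': model.state_dict(),
             'optim_state_dict': optim.state_dict()}
    torch.save(state, os.path.join(save_dir, 'last.pth'))
    if score > bscore:
        torch.save(state, os.path.join(save_dir, 'best.pth'))
        logger.loginfo("New best score: {:.4f} at epoch {}".format(score, epoch))
        return score
    return bscore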
def train(model, data_loader_train, data_loader_test, optimizer,
          criterion_1, criterion_2, cfg, train_args, test_args):
    if train_args['load_ckpt'] is not None:
        load_ckpt(train_args, model)
    model.train()
    lr = train_args['lr']
    for epoch in range(train_args['epoch']):
        print('epoch #: %d' % epoch)
        for i, data in tqdm(enumerate(data_loader_train)):
            # target = data['B_bins'].squeeze().long()  # .to(cfg['device'])
            output, pred_depth = model.train_nyuv2(data)
            output_softmax = output['b_fake_softmax']
            output_logit = output['b_fake_logit'].cpu()
            # weights = calc_weights(output_softmax.cpu(), target.clone().detach())
            loss_1 = criterion_1(output_logit, data['B_bins'].squeeze().long())
            loss_2 = criterion_2(imgrad_yx(pred_depth.cpu().clone()), data['E'].cpu())
            loss = loss_1 + loss_2
            # zero_grad must come before backward(); the original zeroed the
            # gradients between backward() and step(), which discards the update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr = poly_lr_scheduler(optimizer, train_args['lr'],
                                   i + epoch * len(data_loader_train))
            if i % 10 == 0:
                step = i + epoch * len(data_loader_train)
                Img = vutils.make_grid(data['A'].data.cpu(), normalize=True, scale_each=True)
                GT_depth = vutils.make_grid(data['B'].data.cpu(), normalize=True, scale_each=True)
                Estimated_depth = vutils.make_grid(pred_depth.data.cpu(),
                                                   normalize=True, scale_each=True)
                Edge = vutils.make_grid(data['E'].unsqueeze(1).repeat(1, 3, 1, 1).data.cpu(),
                                        normalize=True, scale_each=True)
                inputs = vutils.make_grid(
                    (data['A'] * data['E'].unsqueeze(1).repeat(1, 3, 1, 1)).data.cpu(),
                    normalize=True, scale_each=True)
                writer.add_image('RGB', Img, step)
                writer.add_image('GT_Depth', GT_depth, step)
                writer.add_image('Predicted_Depth', Estimated_depth, step)
                writer.add_image('Edge', Edge, step)
                writer.add_image('inputs', inputs, step)
            del output['b_fake_softmax'], output_softmax, output['b_fake_logit'], \
                output_logit, pred_depth, loss
        print(lr)
        test(model, data_loader_test, cfg, test_args)
        save_ckpt(train_args['batchsize'], save_dir='.',
                  step=i + epoch * len(data_loader_train), epoch=epoch,
                  model=model, optimizer=optimizer)
def auto_train(self, cur_idx):
    tot_iter = Config.min_iters
    while True:
        utils.save_as_pkl("data/train_iter.pkl", tot_iter)
        seg_train()
        loss = utils.get_loss()
        if loss < Config.max_allowed_loss:
            Config.last_ckpt = cur_idx
            with open("best_ckpt.txt", "a", encoding="utf-8") as file:
                file.write("best checkpoint: stage_" + str(Config.last_ckpt) + "_ckpt\n")
            break
        else:
            print("Restart training ...",
                  utils.read_from_pkl("data/selected_sents.pkl"),
                  "examples with train iters of", tot_iter,
                  "failed ... Abnormal loss:", loss)
            tot_iter += 5
            if tot_iter > Config.max_iters:
                break
            # drop the failed checkpoint and save a fresh one before retrying
            utils.del_checkpoint()
            utils.save_ckpt()
def main():
    args = parse_args()
    print('Called with args:')
    print(args)

    ## Dataset
    dataset_path = os.path.join(args.root, 'Human3.6M')
    dataset_train1 = DatasetHuman36m(root=dataset_path, mode='train',
                                     time_window=args.time_window,
                                     pose_norm=bin_to_bool(args.pose_norm),
                                     output_mode=args.data_mode)
    dataset_path2 = os.path.join(args.root, 'MADS')
    dataset_train2 = DatasetMads(root=dataset_path2, mode='train',
                                 time_window=args.time_window,
                                 pose_norm=bin_to_bool(args.pose_norm),
                                 output_mode=args.data_mode)
    dataset_list = [dataset_train1, dataset_train2]
    dataset_train = DatasetConcat(dataset_list)
    dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=args.bs,
                                                   shuffle=True,
                                                   num_workers=args.num_workers,
                                                   drop_last=True)
    dataset_val = DatasetHuman36m(root=dataset_path, mode='val',
                                  time_window=args.time_window,
                                  pose_norm=bin_to_bool(args.pose_norm),
                                  output_mode=args.data_mode)
    steps_per_epoch = len(dataset_train) // args.bs

    ## Model
    assert args.time_window > 0  # Multiple frame
    if args.data_mode == 'op':
        in_channels = len(REST_INDS) * 3
    elif args.data_mode == 'kp':
        in_channels = len(REST_INDS) * 2
    else:
        raise NotImplementedError
    model = BasicTemporalModel(in_channels=in_channels, num_features=args.num_features,
                               num_blocks=args.num_blocks, time_window=args.time_window)
    model.cuda()
    model = nn.DataParallel(model)

    ## Optimizer
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
    elif args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

    ## Logging
    output_dir = os.path.join(args.out_dir, args.out_name)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(os.path.join(output_dir, 'log.txt'), 'w') as f:
        f.write(str(args))
    if bin_to_bool(args.use_tfboard):
        tblogger = SummaryWriter(output_dir)
    else:
        tblogger = None

    # Average in each log_steps
    avg_loss = 0.
    stat_dict = OrderedDict()
    for key in ['All', 'LToe', 'LHeel', 'RToe', 'RHeel']:
        stat_dict[key] = {'Accuracy': 0., 'TP': 0., 'TN': 0., 'FP': 0., 'FN': 0.}

    ## Training
    model.train()
    global_step = 0
    best_f1 = 0.  # To pick the best model on val set
    best_pre = 0.
    best_rec = 0.
    best_epoch = -1
    for epoch in range(args.num_epochs):
        scheduler.step()
        dataiterator_train = iter(dataloader_train)
        for step in range(steps_per_epoch):
            inputs = next(dataiterator_train)
            # NOTE: Need to check if there exists any valid label
            mask = inputs[-1]
            if mask.sum() == 0:
                continue
            global_step += 1
            optimizer.zero_grad()
            body, label, mask = inputs
            body = body.cuda()
            label = label.cuda()
            mask = mask.cuda()
            pred_logit = model(body)
            pred_prob = torch.sigmoid(pred_logit)
            loss = balanced_loss(pred_logit, label, mask)
            loss.backward()
            optimizer.step()

            # Tracking stats
            avg_loss += loss.item() / args.log_steps
            pred_label = pred_prob.detach()
            pred_label[pred_label < 0.5] = 0
            pred_label[pred_label >= 0.5] = 1
            mask_np = mask.cpu().numpy()
            # Recover it back to {-1, 0, 1}
            label_np = label.cpu().numpy() + (mask_np - 1)
            pred_label_np = pred_label.cpu().numpy()
            res = label_np == pred_label_np
            # NOTE: Original definition
            TN = np.logical_and(pred_label_np == 0, label_np == 0)
            TP = np.logical_and(pred_label_np == 1, label_np == 1)
            FN = np.logical_and(pred_label_np == 0, label_np == 1)
            FP = np.logical_and(pred_label_np == 1, label_np == 0)
            for i, key in enumerate(stat_dict):
                if key == 'All':
                    stat_dict[key]['Accuracy'] += res.sum() / mask_np.sum() / args.log_steps
                    stat_dict[key]['TP'] += TP.sum()
                    stat_dict[key]['TN'] += TN.sum()
                    stat_dict[key]['FP'] += FP.sum()
                    stat_dict[key]['FN'] += FN.sum()
                else:
                    stat_dict[key]['Accuracy'] += (res[:, i - 1].sum() /
                                                   mask_np[:, i - 1].sum() / args.log_steps)
                    stat_dict[key]['TP'] += TP[:, i - 1].sum()
                    stat_dict[key]['TN'] += TN[:, i - 1].sum()
                    stat_dict[key]['FP'] += FP[:, i - 1].sum()
                    stat_dict[key]['FN'] += FN[:, i - 1].sum()

            # Logging
            if global_step % args.log_steps == 0:
                log_str = 'Global Step: {}, Train Epoch: {}/{} [{}/{}], Loss: {:.6f}\n'.format(
                    global_step, epoch + 1, args.num_epochs, step + 1,
                    steps_per_epoch, avg_loss)
                if bin_to_bool(args.use_tfboard):
                    tblogger.add_scalar('loss', avg_loss, global_step)
                for key in stat_dict:
                    acc = stat_dict[key]['Accuracy']
                    tp = stat_dict[key]['TP']
                    tn = stat_dict[key]['TN']
                    fp = stat_dict[key]['FP']
                    fn = stat_dict[key]['FN']
                    del stat_dict[key]['TP']
                    del stat_dict[key]['TN']
                    del stat_dict[key]['FP']
                    del stat_dict[key]['FN']
                    eps = 1e-6
                    pre = tp / (tp + fp + eps)
                    rec = tp / (tp + fn + eps)
                    f1 = 2. * pre * rec / (pre + rec + eps)
                    stat_dict[key]['Precision'] = pre
                    stat_dict[key]['Recall'] = rec
                    stat_dict[key]['F1'] = f1
                    log_str += ('\t\t{}: Accuracy: {:.4f}, Precision: {:.4f}, '
                                'Recall: {:.4f}, F1: {:.4f}\n').format(key, acc, pre, rec, f1)
                    if bin_to_bool(args.use_tfboard):
                        for subkey in stat_dict[key]:
                            name = 'train/' + key + '/' + subkey
                            tblogger.add_scalar(name, stat_dict[key][subkey], global_step)
                print(log_str)
                avg_loss = 0.
                for key in stat_dict:
                    stat_dict[key]['Accuracy'] = 0.
                    stat_dict[key]['TP'] = 0.
                    stat_dict[key]['TN'] = 0.
                    stat_dict[key]['FP'] = 0.
                    stat_dict[key]['FN'] = 0.

            # Save ckpt
            if args.ckpt_steps > 0 and global_step % args.ckpt_steps == 0:
                curr_f1, curr_pre, curr_rec = evaluate(model, dataset_val, args,
                                                       global_step, tblogger)
                if curr_pre > best_pre:
                    best_pre = curr_pre
                    best = True
                else:
                    best = False
                print('Saving ckpt...')
                save_ckpt(output_dir, args, epoch, global_step, model, optimizer, best)
                print('Ckpt is saved!')

        # Save ckpt after each epoch
        if args.ckpt_steps == 0:
            curr_f1, curr_pre, curr_rec = evaluate(model, dataset_val, args,
                                                   global_step, tblogger)
            if curr_pre > best_pre:
                best_pre = curr_pre
                best = True
                best_epoch = epoch
            else:
                best = False
            print('Saving ckpt...')
            save_ckpt(output_dir, args, epoch, global_step, model, optimizer, best)
            print('Ckpt is saved!')

    print('Finish training, best precision: {} at epoch {}'.format(best_pre, best_epoch))
                          sampler=args.sampler, batch_size=args.bs, num_aug=args.aug)
criterion = MaximalCodingRateReduction(gam1=args.gam1, gam2=args.gam2, eps=args.eps)
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.mom,
                      weight_decay=args.wd)
scheduler = lr_scheduler.MultiStepLR(optimizer, [30, 60], gamma=0.1)
utils.save_params(model_dir, vars(args))

## Training
for epoch in range(args.epo):
    for step, (batch_imgs, _, batch_idx) in enumerate(trainloader):
        batch_features = net(batch_imgs.cuda())
        loss, loss_empi, loss_theo = criterion(batch_features, batch_idx)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        utils.save_state(model_dir, epoch, step, loss.item(), *loss_empi, *loss_theo)
        if step % 20 == 0:
            utils.save_ckpt(model_dir, net, epoch)
    scheduler.step()
utils.save_ckpt(model_dir, net, epoch)
print("training complete.")
import sys

import utils
from parameters import *
from trainer import Trainer

if __name__ == '__main__':
    config = get_parameters()
    config.command = 'python ' + ' '.join(sys.argv)
    print(config)
    trainer = Trainer(config)
    trainer.train()
    utils.save_ckpt(trainer, final=True)
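# The entry point above hands the whole Trainer to utils.save_ckpt with a
# `final` flag. A minimal sketch of such a helper, assuming the Trainer exposes
# `model`, `optimizer`, `step`, and `config.save_dir` (all of these attribute
# names are guesses for illustration, not the original API):
import os

import torch


def save_ckpt(trainer, final=False):
    """Hypothetical helper: persist a Trainer's model/optimizer state."""
    os.makedirs(trainer.config.save_dir, exist_ok=True)
    name = 'final.pth' if final else 'step_{}.pth'.format(trainer.step)
    torch.save({'step': trainer.step,
                'model': trainer.model.state_dict(),
                'optimizer': trainer.optimizer.state_dict()},
               os.path.join(trainer.config.save_dir, name))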
def train():
    args = parse_args()
    ckpt_path = "models_chunk_twin_context"
    os.system("mkdir -p {}".format(ckpt_path))
    logger = init_logging("chunk_model", "{}/train.log".format(ckpt_path))
    # log args only after the logger exists (the original logged before init_logging)
    args_msg = [' %s: %s' % (name, value) for (name, value) in vars(args).items()]
    logger.info('args:\n' + '\n'.join(args_msg))

    csv_file = open(args.csv_file, 'w', newline='')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(header)

    batch_size = args.batch_size
    device = torch.device("cuda:0")
    reg_weight = args.reg_weight

    ctc_crf_base.init_env(args.den_lm_fst_path, gpus)

    model = CAT_Chunk_Model(args.feature_size, args.hdim, args.output_unit,
                            args.dropout, args.lamb, reg_weight)
    lr = args.origin_lr
    optimizer = optim.Adam(model.parameters(), lr=lr)
    epoch = 0
    prev_cv_loss = np.inf
    if args.checkpoint:
        checkpoint = torch.load(args.checkpoint)
        epoch = checkpoint['epoch']
        lr = checkpoint['lr']
        prev_cv_loss = checkpoint['cv_loss']
        model.load_state_dict(checkpoint['model'])
    model.cuda()
    model = nn.DataParallel(model)
    model.to(device)

    reg_model = CAT_RegModel(args.feature_size, args.hdim, args.output_unit,
                             args.dropout, args.lamb)
    loaded_reg_model = torch.load(args.regmodel_checkpoint)
    reg_model.load_state_dict(loaded_reg_model)
    reg_model.cuda()
    reg_model = nn.DataParallel(reg_model)
    reg_model.to(device)

    prev_epoch_time = timeit.default_timer()

    model.train()
    reg_model.eval()
    while True:
        # training stage
        epoch += 1
        gc.collect()
        if epoch > 2:
            cate_list = list(range(1, args.cate, 1))
            random.shuffle(cate_list)
        else:
            cate_list = range(1, args.cate, 1)
        for cate in cate_list:
            pkl_path = args.tr_data_path + "/" + str(cate) + ".pkl"
            if not os.path.exists(pkl_path):
                continue
            tr_dataset = SpeechDatasetMemPickel(pkl_path)
            jitter = random.randint(-args.jitter_range, args.jitter_range)
            chunk_size = args.default_chunk_size + jitter
            tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=True,
                                       num_workers=0, collate_fn=PadCollateChunk(chunk_size))
            train_chunk_model(model, reg_model, tr_dataloader, optimizer, epoch,
                              chunk_size, TARGET_GPUS, args, logger)

        # cv stage
        model.eval()
        cv_losses_sum = []
        cv_cls_losses_sum = []
        count = 0
        cate_list = range(1, args.cate, 1)
        for cate in cate_list:
            pkl_path = args.dev_data_path + "/" + str(cate) + ".pkl"
            if not os.path.exists(pkl_path):
                continue
            cv_dataset = SpeechDatasetMemPickel(pkl_path)
            cv_dataloader = DataLoader(cv_dataset, batch_size=batch_size, shuffle=False,
                                       num_workers=0,
                                       collate_fn=PadCollateChunk(args.default_chunk_size))
            validate_count = validate_chunk_model(model, reg_model, cv_dataloader, epoch,
                                                  cv_losses_sum, cv_cls_losses_sum,
                                                  args, logger)
            count += validate_count
        cv_loss = np.sum(np.asarray(cv_losses_sum)) / count
        cv_cls_loss = np.sum(np.asarray(cv_cls_losses_sum)) / count

        # save model
        save_ckpt({
            'cv_loss': cv_loss,
            'model': model.module.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr': lr,
            'epoch': epoch
        }, epoch < args.min_epoch or cv_loss <= prev_cv_loss, ckpt_path,
            "model.epoch.{}".format(epoch))

        csv_row = [epoch, (timeit.default_timer() - prev_epoch_time) / 60, lr, cv_loss]
        prev_epoch_time = timeit.default_timer()
        csv_writer.writerow(csv_row)
        csv_file.flush()
        plot_train_figure(args.csv_file, args.figure_file)

        if epoch < args.min_epoch or cv_loss <= prev_cv_loss:
            prev_cv_loss = cv_loss

        lr = adjust_lr(optimizer, args.origin_lr, lr, cv_loss, prev_cv_loss,
                       epoch, args.min_epoch)
        if lr < args.stop_lr:
            print("rank {} lr is too low, finish training".format(args.rank),
                  datetime.datetime.now(), flush=True)
            break

        model.train()

    ctc_crf_base.release_env(gpus)
def main(args):
    model = Model()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=args.nesterov)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.step_size,
                                                gamma=args.gamma)
    if args.scheduler == 'multistep':
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.milestones,
                                                         gamma=args.gamma)
    elif args.scheduler == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.step_size)
    criterion = torch.nn.CrossEntropyLoss()

    model = model.cuda()
    criterion = criterion.cuda()
    start_epoch = 0

    # Check the number of parameters of your model
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print(f"Number of parameters: {pytorch_total_params}")

    if not os.path.exists('{}'.format(args.savepath)):
        os.makedirs('{}'.format(args.savepath))

    # resume
    if args.resume:
        model, optimizer, start_epoch = load_ckpt(model, optimizer, args)

    # Dataloader
    if args.dataset == 'cifar10':
        normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                                         std=[0.2023, 0.1994, 0.2010])
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
        transform_train.transforms.insert(0, RandAugment(args.rand_n, args.rand_m))
        transform_val = transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
        trainset = CIFAR10(root=args.datapath, train=True, download=True,
                           transform=transform_train)
        valset = CIFAR10(root=args.datapath, train=False, download=True,
                         transform=transform_val)
    elif args.dataset == 'cifar100':
        normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                                         std=[0.2023, 0.1994, 0.2010])
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
        transform_val = transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
        trainset = CIFAR100(root=args.datapath, train=True, download=True,
                            transform=transform_train)
        valset = CIFAR100(root=args.datapath, train=False, download=True,
                          transform=transform_val)
    elif args.dataset == 'ImageNet':
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        transform_train = transforms.Compose([
            transforms.RandomResizedCrop(image_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
        transform_val = transforms.Compose([
            transforms.Resize(image_size + 32),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            normalize,
        ])
        trainset = ImageNet(root=args.datapath, split='train', download=False,
                            transform=transform_train)
        valset = ImageNet(root=args.datapath, split='val', download=False,
                          transform=transform_val)
    elif args.dataset == 'tiny-imagenet-200':  # fixed typo: was "args.dataeset"
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        transform_train = transforms.Compose([
            transforms.RandomResizedCrop(image_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
        transform_val = transforms.Compose([
            transforms.Resize(image_size + 32),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            normalize,
        ])
        # torchvision's ImageFolder takes no split/download arguments;
        # point it at the train/val subdirectories instead.
        trainset = ImageFolder(root=os.path.join(args.datapath, 'train'),
                               transform=transform_train)
        valset = ImageFolder(root=os.path.join(args.datapath, 'val'),
                             transform=transform_val)

    train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.num_workers,
                                               pin_memory=False)
    val_loader = torch.utils.data.DataLoader(valset, batch_size=args.batch_size,
                                             shuffle=False, num_workers=args.num_workers,
                                             pin_memory=False)

    # start training
    last_top1_acc = 0
    acc1_valid = 0
    best_acc1 = 0
    is_best = False
    for epoch in range(start_epoch, args.epochs):
        print("\n----- epoch: {}, lr: {} -----".format(
            epoch, optimizer.param_groups[0]["lr"]))

        # train for one epoch
        start_time = time.time()
        last_top1_acc = train(train_loader, epoch, model, optimizer, criterion)
        elapsed_time = time.time() - start_time
        print('==> {:.2f} seconds to train this epoch\n'.format(elapsed_time))

        # validate for one epoch
        start_time = time.time()
        acc1_valid = validate(val_loader, model, criterion)
        elapsed_time = time.time() - start_time
        print('==> {:.2f} seconds to validate this epoch\n'.format(elapsed_time))

        # learning rate scheduling
        scheduler.step()

        summary = [epoch, last_top1_acc, acc1_valid.item()]
        is_best = acc1_valid > best_acc1
        best_acc1 = max(acc1_valid, best_acc1)
        save_summary('rexnetv1', args.dataset, args.name, summary)

        checkpoint = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        save_ckpt(checkpoint, is_best, args)

    print(f"Last Top-1 Accuracy: {last_top1_acc}")
    print(f"Best valid Top-1 Accuracy: {best_acc1}")
    print(f"Number of parameters: {pytorch_total_params}")
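# main() above resumes through load_ckpt(model, optimizer, args) and saves
# through save_ckpt(checkpoint, is_best, args), neither of which is shown.
# A minimal sketch of the pair, assuming checkpoints live under args.savepath
# (the file names are assumptions):
import os
import shutil

import torch


def save_ckpt(checkpoint, is_best, args):
    """Hypothetical helper: always write the latest ckpt; copy it when best."""
    path = os.path.join(args.savepath, 'checkpoint.pth')
    torch.save(checkpoint, path)
    if is_best:
        shutil.copyfile(path, os.path.join(args.savepath, 'model_best.pth'))


def load_ckpt(model, optimizer, args):
    """Hypothetical helper: restore model/optimizer and return the next epoch."""
    checkpoint = torch.load(os.path.join(args.savepath, 'checkpoint.pth'))
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['epoch']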
criterion = nn.CrossEntropyLoss()
optimizer = SGD(net.parameters(), lr=args.lr, momentum=args.mom, weight_decay=args.wd)

## Training
for label_batch_id in range(class_batch_num):
    subtrainset = tf.get_subset(class_batch_list[label_batch_id, :], trainset)
    trainloader = DataLoader(subtrainset, batch_size=args.bs, drop_last=True, num_workers=4)
    print("training starts on label batch:{}".format(label_batch_id))
    os.makedirs(os.path.join(model_dir, 'checkpoints',
                             'labelbatch{}'.format(label_batch_id)))
    for epoch in range(args.epo):
        lr_schedule(epoch, optimizer)
        for step, (batch_imgs, batch_lbls) in enumerate(trainloader):
            features = net(batch_imgs.cuda())
            loss = criterion(features, batch_lbls.cuda())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            utils.save_state(model_dir, label_batch_id, epoch, step, loss.item())
        utils.save_ckpt(model_dir, net, epoch, label_batch_id)
print("training complete.")
def main(args):
    # turn off tensorflow verbose
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    args = utils.make_save_dirs(args)
    args['y_shape'] = (64, 64, 1)
    args['x_shape'] = (64, 64, 128, 2)

    # Load data
    utils.save_model_settings(args)
    utils.make_tensorboard_summaries(args)
    loss_train = np.zeros(args['n_epochs'] + 1)
    loss_val = np.zeros(args['n_epochs'] + 1)
    cost_train = np.zeros(args['n_epochs'] + 1)

    # Launch the graph
    with tf.Session() as sess:
        X_train, y_train = utils.dataset_from_tfRecords(args['train_dataset'], args)
        # X_val, y_val = utils.dataset_from_tfRecords(args['val_dataset'], args)

        # tf Graph input
        is_training = tf.placeholder(tf.bool)
        which_dataset = tf.placeholder(tf.string)
        x = X_train
        y = y_train

        # Construct model
        if args['arch'] == '3d':
            pred = nets.encoding3d(x, args)

        # Define loss and optimizer
        cost, loss = utils.calc_cost(args, y, pred)
        optimizer = tf.train.AdamOptimizer(learning_rate=args['lr']).minimize(cost)
        saver = tf.train.Saver(max_to_keep=0)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        iters_per_epoch = np.floor(18000 / args['batch_size'])
        iters_per_epoch = 100
        iters_per_epoch_val = np.floor(2000 / args['batch_size'])
        loss_epoch_train = []
        cost_epoch_train = []
        args['n_epochs'] *= 100
        for iteration in range(int(iters_per_epoch * args['n_epochs'])):
            loss_, cost_, _ = sess.run([loss, cost, optimizer],
                                       feed_dict={is_training: True,
                                                  which_dataset: 'train'})
            loss_epoch_train.append(loss_)
            cost_epoch_train.append(cost_)
            if iteration % iters_per_epoch == 0:
                epoch = int(np.floor(iteration / iters_per_epoch)) + 1
                # store and show training loss
                loss_train[epoch] = sum(loss_epoch_train) / len(loss_epoch_train)
                cost_train[epoch] = sum(cost_epoch_train) / len(cost_epoch_train)
                # the original printed cost under the "Loss" label and vice versa
                print('Train: \tLoss = %6.6f\tCost = %6.6f'
                      % (loss_train[epoch], cost_train[epoch]))
                '''
                # store and show val loss
                loss_epoch_val = []
                for i in range(iters_per_epoch_val):
                    loss_, = sess.run(loss, feed_dict={is_training: True,
                                                       which_dataset: 'val'})
                    loss_epoch_val.append(loss_)
                loss_val[epoch] = mean(loss_epoch_val)
                print('Val: \tLoss = %6.6f' % (loss_val[epoch]))
                '''
                loss_epoch_train = []
                cost_epoch_train = []
                utils.save_ckpt(saver, args, sess, epoch)

        # Save all of the model parameters as a .mat file!
        utils.save_network_mat(sess, args, {'loss_train': loss_train,
                                            'cost_train': cost_train})
def main_worker(gpu, ngpus_per_node, args):
    csv_file = None
    csv_writer = None
    args.gpu = gpu
    args.rank = args.start_rank + gpu
    TARGET_GPUS = [args.gpu]
    logger = None
    ckpt_path = "models"
    os.system("mkdir -p {}".format(ckpt_path))
    if args.rank == 0:
        logger = init_logging(args.model, "{}/train.log".format(ckpt_path))
        args_msg = [' %s: %s' % (name, value) for (name, value) in vars(args).items()]
        logger.info('args:\n' + '\n'.join(args_msg))
        csv_file = open(args.csv_file, 'w', newline='')
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header)

    gpus = torch.IntTensor(TARGET_GPUS)
    ctc_crf_base.init_env(args.den_lm_fst_path, gpus)
    dist.init_process_group(backend='nccl', init_method=args.dist_url,
                            world_size=args.world_size, rank=args.rank)
    torch.cuda.set_device(args.gpu)

    model = CAT_Model(args.arch, args.feature_size, args.hdim, args.output_unit,
                      args.layers, args.dropout, args.lamb, args.ctc_crf)
    if args.rank == 0:
        params_msg = params_num(model)
        logger.info('\n'.join(params_msg))

    lr = args.origin_lr
    optimizer = optim.Adam(model.parameters(), lr=lr)
    epoch = 0
    prev_cv_loss = np.inf
    if args.checkpoint:
        checkpoint = torch.load(args.checkpoint)
        epoch = checkpoint['epoch']
        lr = checkpoint['lr']
        prev_cv_loss = checkpoint['cv_loss']
        model.load_state_dict(checkpoint['model'])

    model.cuda(args.gpu)
    model = nn.parallel.DistributedDataParallel(model, device_ids=TARGET_GPUS)

    tr_dataset = SpeechDatasetPickel(args.tr_data_path)
    tr_sampler = DistributedSampler(tr_dataset)
    tr_dataloader = DataLoader(tr_dataset, batch_size=args.gpu_batch_size, shuffle=False,
                               num_workers=args.data_loader_workers, pin_memory=True,
                               collate_fn=PadCollate(), sampler=tr_sampler)
    cv_dataset = SpeechDatasetPickel(args.dev_data_path)
    cv_dataloader = DataLoader(cv_dataset, batch_size=args.gpu_batch_size, shuffle=False,
                               num_workers=args.data_loader_workers, pin_memory=True,
                               collate_fn=PadCollate())

    prev_epoch_time = timeit.default_timer()

    while True:
        # training stage
        epoch += 1
        tr_sampler.set_epoch(epoch)  # important for data shuffling
        gc.collect()
        train(model, tr_dataloader, optimizer, epoch, args, logger)
        cv_loss = validate(model, cv_dataloader, epoch, args, logger)

        # save model; csv_writer/csv_file only exist on rank 0
        if args.rank == 0:
            save_ckpt({
                'cv_loss': cv_loss,
                'model': model.module.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr': lr,
                'epoch': epoch
            }, cv_loss <= prev_cv_loss, ckpt_path, "model.epoch.{}".format(epoch))
            csv_row = [epoch, (timeit.default_timer() - prev_epoch_time) / 60, lr, cv_loss]
            prev_epoch_time = timeit.default_timer()
            csv_writer.writerow(csv_row)
            csv_file.flush()
            plot_train_figure(args.csv_file, args.figure_file)

        if epoch < args.min_epoch or cv_loss <= prev_cv_loss:
            prev_cv_loss = cv_loss
        else:
            args.annealing_epoch = 0

        lr = adjust_lr_distribute(optimizer, args.origin_lr, lr, cv_loss, prev_cv_loss,
                                  epoch, args.annealing_epoch, args.gpu_batch_size,
                                  args.world_size)
        if lr < args.stop_lr:
            print("rank {} lr is too low, finish training".format(args.rank),
                  datetime.datetime.now(), flush=True)
            break

    ctc_crf_base.release_env(gpus)
def main():
    global args, best_prec1
    args = parser.parse_args()

    gpu_num = torch.cuda.device_count()
    if args.distributed:
        args.rank, args.size = init_processes(args.dist_addr, args.dist_port,
                                              gpu_num, args.dist_backend)
        print("=> using {} GPUS for distributed training".format(args.size))
    else:
        args.rank = 0
        print("=> using {} GPUS for training".format(gpu_num))

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](feature_dim=args.feature_dim, pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch](feature_dim=args.feature_dim)

    if args.sampled:
        if args.rank > 0:
            assert args.distributed
        assert args.sample_num <= args.num_classes
        model = models.HFClassifier(model, args.rank, args.feature_dim,
                                    args.sample_num, args.num_classes)
    else:
        model = models.Classifier(model, args.feature_dim, args.num_classes)

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model, [args.rank])
        print('create DistributedDataParallel model successfully', args.rank)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            if args.sampled:
                with ParameterClient(args.tmp_client_id) as client:
                    cls_resume = args.resume.replace('.pth.tar', '_cls.h5')
                    if os.path.isfile(cls_resume):
                        client.resume(cls_resume)
                        print("=> loaded checkpoint '{}' (epoch {})".format(
                            cls_resume, checkpoint['epoch']))
                    else:
                        print("=> no checkpoint found at '{}'".format(cls_resume))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.25, 0.25, 0.25])
    train_dataset = FileListDataset(
        args.train_filelist, args.train_prefix,
        transforms.Compose([
            transforms.Resize(args.input_size),
            transforms.ToTensor(),
            normalize,
        ]))
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers, pin_memory=True,
                                               sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(
        FileListDataset(args.val_filelist, args.val_prefix,
                        transforms.Compose([
                            transforms.Resize(args.input_size),
                            transforms.ToTensor(),
                            normalize,
                        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args.sampled)
        return

    assert max(args.lr_steps) < args.epochs
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_steps,
                                                        args.gamma)
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args.sampled)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, args.sampled)

        # remember best prec@1 and save checkpoint
        if args.rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_ckpt({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, args.save_path, epoch + 1, is_best)
            if args.sampled:
                with ParameterClient(args.tmp_client_id) as client:
                    client.snapshot('{}_epoch_{}_cls.h5'.format(args.save_path, epoch + 1))
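# save_ckpt above receives (state, save_path, epoch, is_best) but its body is
# not shown. A minimal sketch following the common torchvision-example
# convention of copying the latest checkpoint to a "best" file; the file
# naming is an assumption:
import shutil

import torch


def save_ckpt(state, save_path, epoch, is_best):
    """Hypothetical helper: write a per-epoch ckpt; duplicate it when best."""
    filename = '{}_epoch_{}.pth.tar'.format(save_path, epoch)
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, '{}_best.pth.tar'.format(save_path))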
def train(model, train_data_loader, val_data_loader, optimizer, scheduler,
          num_epochs, writer=None):
    best_val_loss = np.finfo(float).max
    best_model = None
    try:
        for epoch_i in range(num_epochs):
            start_time = time.time()
            model.train()
            scheduler.step()
            print('Epoch {}/{}: lr {}'.format(epoch_i + 1, num_epochs,
                                              scheduler.get_lr()), end='')
            if writer:
                writer.add_scalar('lr', scheduler.get_lr()[0], global_step=epoch_i)
            running_loss = 0.0
            running_corrects = 0.0
            for idx, (inputs, labels, raw_images) in enumerate(train_data_loader):
                if writer and idx % write_image_freq == 0:
                    writer.add_image('raw-crop-label',
                                     cat_image_show(raw_images[0:20], inputs[0:20],
                                                    draw_label_tensor(labels[0:20])),
                                     global_step=idx)
                    for name, param in model.module.named_parameters():
                        writer.add_histogram(name, param.clone().cpu().data.numpy(),
                                             global_step=idx)
                if use_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()
                optimizer.zero_grad()
                outputs = model(inputs)
                # summed (not averaged) loss; divided by dataset size below.
                # reduction='sum' replaces the deprecated size_average=False.
                loss = F.cross_entropy(outputs, labels, reduction='sum')
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                _, preds = torch.max(F.softmax(outputs, dim=1), 1)
                running_corrects += torch.sum(preds == labels).item()
            train_dataset_size = len(train_data_loader.dataset)
            epoch_loss = running_loss / train_dataset_size
            epoch_acc = running_corrects / train_dataset_size
            print('\t{:5s} loss {:.4f} acc {:.4f}'.format('train', epoch_loss, epoch_acc))
            train_end_time = time.time()
            val_loss, _ = val(model, val_data_loader, epoch_i, writer)
            val_end_time = time.time()
            if best_val_loss > val_loss:
                best_model = model  # NOTE: keeps a reference, not a deep copy
                best_val_loss = val_loss
            train_time = train_end_time - start_time
            val_time = val_end_time - train_end_time
            print('\ttime train {:.4f} val {:.4f}'.format(train_time, val_time))
            if writer:
                writer.add_scalar('loss_epoch_train', epoch_loss, global_step=epoch_i)
                writer.add_scalar('acc_epoch_train', epoch_acc, global_step=epoch_i)
                writer.add_scalar('time_epoch_train', train_time, global_step=epoch_i)
                writer.add_scalar('time_epoch_val', val_time, global_step=epoch_i)
            save_ckpt(output_dir, best_model, optimizer, epoch_i, batch_size)
    except (RuntimeError, KeyboardInterrupt):
        save_ckpt(output_dir, best_model, optimizer, epoch_i, batch_size)
        print(traceback.format_exc())
def main(args):
    args.color_t = torch.rand(700, 3)
    if not os.path.exists(args.ckpt_dir):
        os.makedirs(args.ckpt_dir)
    if not os.path.exists(args.summary_dir):
        os.makedirs(args.summary_dir)
    device = torch.device("cuda" if not args.nocuda and torch.cuda.is_available()
                          else "cpu")

    if args.last_ckpt:
        print("=> loading checkpoint '{}'".format(args.last_ckpt))
        model = torch.load(args.last_ckpt, map_location=device)
        finetune_parameters = \
            list(model.propagate_cell.z_what_from_transit_net.parameters()) + \
            list(model.propagate_cell.object_transit_net.parameters()) + \
            list(model.propagate_cell.infer_edge_type.parameters())
        for param in list(model.parameters()):
            param.requires_grad = False
        for param in list(finetune_parameters):
            param.requires_grad = True
        optimizer = torch.optim.RMSprop(finetune_parameters, lr=args.lr)
        global_step = 0
    else:
        model = SCALOR(args)
        model.to(device)
        optimizer = torch.optim.RMSprop(model.parameters(), lr=args.lr)
        global_step = 0
    model.train()

    writer = SummaryWriter(args.summary_dir)
    args.global_step = global_step
    log_tau_gamma = np.log(args.tau_end) / args.tau_ep

    D = torch.load(args.experience_replay)
    num_train = D.size

    for epoch in range(int(args.start_epoch), args.epochs):
        local_count = 0
        last_count = 0
        end_time = time.time()
        for _ in range(num_train // args.batch_size):
            args.phase_generate = False
            chunk_size = epoch + 5
            chunk_size = min(chunk_size, args.chunk_size)
            observations, actions, rewards, nonterminals = D.sample(args.batch_size,
                                                                    chunk_size)

            tau = np.exp(global_step * log_tau_gamma)
            tau = max(tau, args.tau_end)
            args.tau = tau

            global_step += 1
            log_phase = global_step % args.print_freq == 0 or global_step == 1
            args.global_step = global_step
            args.log_phase = log_phase

            sample = observations[:, :, 0:3].permute(1, 0, 2, 3, 4) / 255
            imgs = sample.to(device)
            actions = actions.to(device)
            y_seq, log_like, kl_z_what, kl_z_where, kl_z_depth, \
                kl_z_pres, kl_z_bg, kl_edge_type, log_imp, counting, \
                log_disc_list, log_prop_list, scalor_log_list = model(imgs, actions)

            log_like = log_like.mean(dim=0)
            kl_z_what = kl_z_what.mean(dim=0)
            kl_z_where = kl_z_where.mean(dim=0)
            kl_z_depth = kl_z_depth.mean(dim=0)
            kl_z_pres = kl_z_pres.mean(dim=0)
            kl_z_bg = kl_z_bg.mean(0)
            kl_edge_type = kl_edge_type.mean(0)

            kl_weight = 1
            # total_loss = -log_like + kl_weight * (kl_z_what + kl_z_where + kl_z_depth
            #                                       + kl_z_pres + kl_z_bg + kl_edge_type)
            total_loss = kl_z_what + kl_edge_type

            optimizer.zero_grad()
            total_loss.backward()
            clip_grad_norm_(model.parameters(), args.cp)
            optimizer.step()

            local_count += imgs.data.shape[0]

            if log_phase:
                time_inter = time.time() - end_time
                end_time = time.time()
                count_inter = local_count - last_count
                print_scalor(global_step, epoch, local_count, count_inter,
                             num_train, total_loss, log_like, kl_z_what, kl_z_where,
                             kl_z_pres, kl_z_depth, time_inter)

                writer.add_scalar('train/total_loss', total_loss.item(),
                                  global_step=global_step)
                writer.add_scalar('train/log_like', log_like.item(),
                                  global_step=global_step)
                writer.add_scalar('train/What_KL', kl_z_what.item(),
                                  global_step=global_step)
                writer.add_scalar('train/Where_KL', kl_z_where.item(),
                                  global_step=global_step)
                writer.add_scalar('train/Pres_KL', kl_z_pres.item(),
                                  global_step=global_step)
                writer.add_scalar('train/Depth_KL', kl_z_depth.item(),
                                  global_step=global_step)
                writer.add_scalar('train/Bg_KL', kl_z_bg.item(),
                                  global_step=global_step)
                writer.add_scalar('train/Edge_KL', kl_edge_type.item(),
                                  global_step=global_step)
                writer.add_scalar('train/tau', tau, global_step=global_step)

                log_summary(args, writer, imgs, y_seq, global_step, log_disc_list,
                            log_prop_list, scalor_log_list, prefix='train')
                last_count = local_count

                # if global_step % args.generate_freq == 0:
                ############################# do generation #############################
                model.eval()
                with torch.no_grad():
                    args.phase_generate = True
                    y_seq, log_like, kl_z_what, kl_z_where, kl_z_depth, \
                        kl_z_pres, kl_z_bg, kl_edge_type, log_imp, counting, \
                        log_disc_list, log_prop_list, scalor_log_list = model(imgs, actions)
                    args.phase_generate = False
                    log_summary(args, writer, imgs, y_seq, global_step, log_disc_list,
                                log_prop_list, scalor_log_list, prefix='generate')
                model.train()
                ############################# end generation ############################

            if global_step % args.save_epoch_freq == 0 or global_step == 1:
                save_ckpt(args.ckpt_dir, model, optimizer, global_step, epoch,
                          local_count, args.batch_size, num_train)
    [csv_old, index_old] = load_ckpt(ckpt_path).split('#')
    ckpt_index = str(max(csv_index, int(index_old)) if csv_path == csv_old else csv_index)
    ckpt_info = csv_path + '#' + ckpt_index
    save_ckpt(ckpt_info, ckpt_path)


if __name__ == "__main__":
    check_path(ckpt_path, binary=True)
    ckpt_info = load_ckpt(ckpt_path)
    if ckpt_info is None or '#' not in ckpt_info:
        ckpt_chunk = csv_paths[0]
        ckpt_index = -1
        save_ckpt(ckpt_chunk + '#' + str(ckpt_index), ckpt_path)
    else:
        ckpt_chunk = ckpt_info.split('#')[0]
        ckpt_index = int(ckpt_info.split('#')[1])
        print('continue from checkpoint ' + ckpt_chunk + ' ' + str(ckpt_index))
    for csv_path in csv_paths:
        print(csv_path + ' has begun ...')
        csv_file = csv.reader(open(csv_folder + csv_path + '.csv'))
        _ = next(csv_file)  # skip the header row
        rows = [row for row in csv_file]
        start_time = time.time()
        Parallel(n_jobs=n_proc, backend='multiprocessing')(
            delayed(proc_func)(rows[i][0], rows[i][1], csv_path, i)
            for i in tqdm(range(ckpt_index + 1, len(rows))))
def main(_):
    torch.random.manual_seed(FLAGS.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    path = FLAGS.dataset_path
    train_path = os.path.join(path, "train.txt")
    validation_path = os.path.join(path, "valid.txt")
    test_path = os.path.join(path, "test.txt")
    entity2id, relation2id = data.create_mappings(train_path)

    batch_size = FLAGS.batch_size
    vector_length = FLAGS.vector_length
    margin = FLAGS.margin
    norm = FLAGS.norm
    learning_rate = FLAGS.lr
    epochs = FLAGS.epochs
    device = torch.device('cuda') if (FLAGS.use_gpu and torch.cuda.is_available()) \
        else torch.device('cpu')

    print('Loading data...')
    train_set = data.FB15KDataset(train_path, entity2id, relation2id)
    train_generator = torch_data.DataLoader(train_set, batch_size=batch_size)
    validation_set = data.FB15KDataset(validation_path, entity2id, relation2id)
    validation_generator = torch_data.DataLoader(validation_set,
                                                 batch_size=FLAGS.validation_batch_size)
    test_set = data.FB15KDataset(test_path, entity2id, relation2id)
    test_generator = torch_data.DataLoader(test_set,
                                           batch_size=FLAGS.validation_batch_size)

    print('Loading model...')
    model = model_definition.TransE(entity_count=len(entity2id),
                                    relation_count=len(relation2id),
                                    dim=vector_length, margin=margin,
                                    device=device, norm=norm)  # type: torch.nn.Module
    model = model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    start_epoch_id = 1
    step = 0
    best_score = -1
    utils.create_dir_not_exists('./checkpoint')
    if os.path.exists(FLAGS.checkpoint_path):
        # if a checkpoint exists, load the model and resume training
        start_epoch_id, best_score = utils.load_ckpt(FLAGS.checkpoint_path,
                                                     model, optimizer)

    print(model)
    print(f'Training and test on {device}...')
    start = time.time()

    # Training loop
    for epoch_id in range(start_epoch_id, epochs + 1):
        print('Epoch [{:>3}/{:>3}]'.format(epoch_id, epochs), end='')
        loss_impacting_samples_count = 0
        samples_count = 0
        model.train()

        for local_heads, local_relations, local_tails in train_generator:
            local_heads, local_relations, local_tails = (local_heads.to(device),
                                                         local_relations.to(device),
                                                         local_tails.to(device))
            positive_triples = torch.stack((local_heads, local_relations, local_tails),
                                           dim=1)

            # Preparing negatives.
            # Generate a binary tensor to replace either head or tail:
            # 1 means replace the head, 0 means replace the tail.
            head_or_tail = torch.randint(high=2, size=local_heads.size(), device=device)
            random_entities = torch.randint(high=len(entity2id), size=local_heads.size(),
                                            device=device)
            broken_heads = torch.where(head_or_tail == 1, random_entities, local_heads)
            broken_tails = torch.where(head_or_tail == 0, random_entities, local_tails)
            negative_triples = torch.stack((broken_heads, local_relations, broken_tails),
                                           dim=1)

            optimizer.zero_grad()
            loss, pd, nd = model(positive_triples, negative_triples)  # loss, pos_dis, neg_dis
            loss.mean().backward()

            loss = loss.data.cpu()
            loss_impacting_samples_count += loss.nonzero().size()[0]
            samples_count += loss.size()[0]

            optimizer.step()
            step += 1

        if epoch_id % FLAGS.validation_freq == 0:
            model.eval()
            hits_at_1_score, hits_at_3_score, hits_at_10_score, mrr_score = test(
                model=model, data_generator=validation_generator,
                entities_count=len(entity2id), device=device,
                epoch_id=epoch_id, metric_suffix="val")
            score = hits_at_10_score
            if score > best_score:
                best_score = score
                improve = '*'  # mark improved results with a '*'
                utils.save_ckpt(model, optimizer, epoch_id, best_score,
                                FLAGS.checkpoint_path)
            else:
                improve = ''
            print('Train loss: {:5.2}, val Hit@1: {:>5.2%}, Hit@3: {:>5.2%}, '
                  'Hit@10: {:>5.2%}, MRR: {:>5.2%}, time: {} {}'.format(
                      loss.mean().item(), hits_at_1_score, hits_at_3_score,
                      hits_at_10_score, mrr_score, utils.time_since(start), improve))

    # Testing the best checkpoint on the test dataset
    print('Training done, start evaluating on test data...')
    utils.load_ckpt(FLAGS.checkpoint_path, model, optimizer)
    best_model = model.to(device)
    best_model.eval()
    hits_at_1_score, hits_at_3_score, hits_at_10_score, mrr_score = test(
        model=best_model, data_generator=test_generator,
        entities_count=len(entity2id), device=device,
        epoch_id=1, metric_suffix="test")
    print('Test Hit@1: {0:>5.2%}, Hit@3: {1:>5.2%}, Hit@10: {2:>5.2%}, '
          'MRR: {3:>5.2%}, total time: {4}'.format(hits_at_1_score, hits_at_3_score,
                                                   hits_at_10_score, mrr_score,
                                                   utils.time_since(start)))
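# The TransE script above uses utils.save_ckpt / utils.load_ckpt, but the
# helpers themselves are not shown; load_ckpt must return
# (start_epoch_id, best_score). A minimal sketch under those assumptions
# (the checkpoint dict keys are guesses):
import torch


def save_ckpt(model, optimizer, epoch_id, best_score, checkpoint_path):
    """Hypothetical helper: persist model/optimizer plus resume metadata."""
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch_id': epoch_id,
                'best_score': best_score}, checkpoint_path)


def load_ckpt(checkpoint_path, model, optimizer):
    """Hypothetical helper: restore state; return (next_epoch_id, best_score)."""
    ckpt = torch.load(checkpoint_path)
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    return ckpt['epoch_id'] + 1, ckpt['best_score']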
def main():
    # torch.autograd.set_detect_anomaly(True)
    torch.backends.cudnn.benchmark = True
    # os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    args = get_config()[0]

    torch.manual_seed(args.train.seed)
    torch.cuda.manual_seed(args.train.seed)
    torch.cuda.manual_seed_all(args.train.seed)
    np.random.seed(args.train.seed)

    model_dir = os.path.join(args.model_dir, args.exp_name)
    summary_dir = os.path.join(args.summary_dir, args.exp_name)
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)
    if not os.path.isdir(summary_dir):
        os.makedirs(summary_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.train.num_gpu = torch.cuda.device_count()

    with open(os.path.join(summary_dir, 'config.yaml'), 'w') as f:
        yaml.dump(args, f)

    if args.data.dataset == 'mnist':
        train_data = MultiMNIST(args, mode='train')
        test_data = MultiMNIST(args, mode='test')
        val_data = MultiMNIST(args, mode='val')
    elif args.data.dataset == 'blender':
        train_data = Blender(args, mode='train')
        test_data = Blender(args, mode='test')
        val_data = Blender(args, mode='val')
    else:
        # raising NotImplemented (a constant, not an exception) was a bug
        raise NotImplementedError

    train_loader = DataLoader(train_data, batch_size=args.train.batch_size,
                              shuffle=True, drop_last=True, num_workers=6)
    num_train = len(train_data)
    test_loader = DataLoader(test_data, batch_size=args.train.batch_size * 4,
                             shuffle=False, drop_last=True, num_workers=6)
    num_test = len(test_data)
    val_loader = DataLoader(val_data, batch_size=args.train.batch_size * 4,
                            shuffle=False, drop_last=True, num_workers=6)
    num_val = len(val_data)

    model = GNM(args)
    model.to(device)
    num_gpu = 1
    if device.type == 'cuda' and torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        num_gpu = torch.cuda.device_count()
        model = nn.DataParallel(model)
    model.train()

    optimizer = torch.optim.RMSprop(model.parameters(), lr=args.train.lr)

    global_step = 0
    if args.last_ckpt:
        global_step, args.train.start_epoch = \
            load_ckpt(model, optimizer, args.last_ckpt, device)

    args.train.global_step = global_step
    args.log.phase_log = False

    writer = SummaryWriter(summary_dir)

    end_time = time.time()
    for epoch in range(int(args.train.start_epoch), args.train.epoch):
        local_count = 0
        last_count = 0
        for batch_idx, sample in enumerate(train_loader):
            imgs = sample.to(device)
            hyperparam_anneal(args, global_step)

            global_step += 1
            phase_log = global_step % args.log.print_step_freq == 0 or global_step == 1
            args.train.global_step = global_step
            args.log.phase_log = phase_log

            pa_recon, log_like, kl, _, _, _, log = model(imgs)

            aux_kl_pres, aux_kl_where, aux_kl_depth, aux_kl_what, aux_kl_bg, kl_pres, \
                kl_where, kl_depth, kl_what, kl_global_all, kl_bg = kl

            aux_kl_pres_raw = aux_kl_pres.mean(dim=0)
            aux_kl_where_raw = aux_kl_where.mean(dim=0)
            aux_kl_depth_raw = aux_kl_depth.mean(dim=0)
            aux_kl_what_raw = aux_kl_what.mean(dim=0)
            aux_kl_bg_raw = aux_kl_bg.mean(dim=0)
            kl_pres_raw = kl_pres.mean(dim=0)
            kl_where_raw = kl_where.mean(dim=0)
            kl_depth_raw = kl_depth.mean(dim=0)
            kl_what_raw = kl_what.mean(dim=0)
            kl_bg_raw = kl_bg.mean(dim=0)
            log_like = log_like.mean(dim=0)

            aux_kl_pres = aux_kl_pres_raw * args.train.beta_aux_pres
            aux_kl_where = aux_kl_where_raw * args.train.beta_aux_where
            aux_kl_depth = aux_kl_depth_raw * args.train.beta_aux_depth
            aux_kl_what = aux_kl_what_raw * args.train.beta_aux_what
            aux_kl_bg = aux_kl_bg_raw * args.train.beta_aux_bg
            kl_pres = kl_pres_raw * args.train.beta_pres
            kl_where = kl_where_raw * args.train.beta_where
            kl_depth = kl_depth_raw * args.train.beta_depth
            kl_what = kl_what_raw * args.train.beta_what
            kl_bg = kl_bg_raw * args.train.beta_bg
            kl_global_raw = kl_global_all.sum(dim=-1).mean(dim=0)
            kl_global = kl_global_raw * args.train.beta_global

            total_loss = -(log_like - kl_pres - kl_where - kl_depth - kl_what - kl_bg -
                           kl_global - aux_kl_pres - aux_kl_where - aux_kl_depth -
                           aux_kl_what - aux_kl_bg)

            optimizer.zero_grad()
            total_loss.backward()
            clip_grad_norm_(model.parameters(), args.train.cp)
            optimizer.step()

            local_count += imgs.data.shape[0]

            if phase_log:
                bs = imgs.size(0)
                time_inter = time.time() - end_time
                count_inter = local_count - last_count
                print_schedule(global_step, epoch, local_count, count_inter,
                               num_train, total_loss, time_inter)
                end_time = time.time()

                for name, param in model.named_parameters():
                    writer.add_histogram('param/' + name,
                                         param.cpu().detach().numpy(), global_step)
                    if param.grad is not None:
                        writer.add_histogram('grad/' + name,
                                             param.grad.cpu().detach(), global_step)
                        if len(param.size()) != 1:
                            writer.add_scalar('grad_std/' + name + '.grad',
                                              param.grad.cpu().detach().std().item(),
                                              global_step)
                            writer.add_scalar('grad_mean/' + name + '.grad',
                                              param.grad.cpu().detach().mean().item(),
                                              global_step)

                for key, value in log.items():
                    if value is None:
                        continue
                    if key == 'importance_map_full_res_norm':
                        writer.add_histogram('inside_value/' + key,
                                             value[value > 0].cpu().detach().numpy(),
                                             global_step)
                    else:
                        writer.add_histogram('inside_value/' + key,
                                             value.cpu().detach().numpy(), global_step)

                grid_image = make_grid(
                    imgs.cpu().detach()[:args.log.num_summary_img].view(
                        -1, args.data.inp_channel, args.data.img_h, args.data.img_w),
                    args.log.num_img_per_row, normalize=False, pad_value=1)
                writer.add_image('train/1-image', grid_image, global_step)

                grid_image = make_grid(
                    pa_recon[0].cpu().detach()[:args.log.num_summary_img].clamp(0, 1).view(
                        -1, args.data.inp_channel, args.data.img_h, args.data.img_w),
                    args.log.num_img_per_row, normalize=False, pad_value=1)
                writer.add_image('train/2-reconstruction_overall', grid_image, global_step)

                if args.arch.phase_background:
                    grid_image = make_grid(
                        pa_recon[1].cpu().detach()[:args.log.num_summary_img].clamp(0, 1).view(
                            -1, args.data.inp_channel, args.data.img_h, args.data.img_w),
                        args.log.num_img_per_row, normalize=False, pad_value=1)
                    writer.add_image('train/3-reconstruction-fg', grid_image, global_step)

                    grid_image = make_grid(
                        pa_recon[2].cpu().detach()[:args.log.num_summary_img].clamp(0, 1).view(
                            -1, 1, args.data.img_h, args.data.img_w),
                        args.log.num_img_per_row, normalize=False, pad_value=1)
                    writer.add_image('train/4-reconstruction-alpha', grid_image, global_step)

                    grid_image = make_grid(
                        pa_recon[3].cpu().detach()[:args.log.num_summary_img].clamp(0, 1).view(
                            -1, args.data.inp_channel, args.data.img_h, args.data.img_w),
                        args.log.num_img_per_row, normalize=False, pad_value=1)
                    writer.add_image('train/5-reconstruction-bg', grid_image, global_step)

                bbox = visualize(
                    imgs[:args.log.num_summary_img].cpu(),
                    log['z_pres'].view(bs, args.arch.num_cell ** 2,
                                       -1)[:args.log.num_summary_img].cpu().detach(),
                    log['z_where_scale'].view(bs, args.arch.num_cell ** 2,
                                              -1)[:args.log.num_summary_img].cpu().detach(),
                    log['z_where_shift'].view(bs, args.arch.num_cell ** 2,
                                              -1)[:args.log.num_summary_img].cpu().detach(),
                    only_bbox=True, phase_only_display_pres=False)
                bbox = bbox.view(args.log.num_summary_img, -1, 3, args.data.img_h,
                                 args.data.img_w).sum(1).clamp(0.0, 1.0)
                bbox_img = imgs[:args.log.num_summary_img].cpu().expand(
                    -1, 3, -1, -1).contiguous()
                bbox_img[bbox.sum(dim=1, keepdim=True).expand(-1, 3, -1, -1) > 0.5] = \
                    bbox[bbox.sum(dim=1, keepdim=True).expand(-1, 3, -1, -1) > 0.5]
                grid_image = make_grid(bbox_img, args.log.num_img_per_row,
                                       normalize=False, pad_value=1)
                writer.add_image('train/6-bbox', grid_image, global_step)

                bbox_white = visualize(
                    imgs[:args.log.num_summary_img].cpu(),
                    log['z_pres'].view(bs, args.arch.num_cell ** 2,
                                       -1)[:args.log.num_summary_img].cpu().detach(),
                    log['z_where_scale'].view(bs, args.arch.num_cell ** 2,
                                              -1)[:args.log.num_summary_img].cpu().detach(),
                    log['z_where_shift'].view(bs, args.arch.num_cell ** 2,
                                              -1)[:args.log.num_summary_img].cpu().detach(),
                    only_bbox=True, phase_only_display_pres=True)
                bbox_white = bbox_white.view(args.log.num_summary_img, -1, 3,
                                             args.data.img_h,
                                             args.data.img_w).sum(1).clamp(0.0, 1.0)
                bbox_white_img = imgs[:args.log.num_summary_img].cpu().expand(
                    -1, 3, -1, -1).contiguous()
                bbox_white_img[bbox_white.sum(dim=1, keepdim=True).expand(-1, 3, -1, -1) > 0.5] = \
                    bbox_white[bbox_white.sum(dim=1, keepdim=True).expand(-1, 3, -1, -1) > 0.5]
                grid_image = make_grid(bbox_white_img, args.log.num_img_per_row,
                                       normalize=False, pad_value=1)
                writer.add_image('train/6a-bbox-white', grid_image, global_step)

                grid_image = make_grid(
                    log['recon_from_q_g'].cpu().detach()[:args.log.num_summary_img].clamp(0, 1).view(
                        -1, args.data.inp_channel, args.data.img_h, args.data.img_w),
                    args.log.num_img_per_row, normalize=False, pad_value=1)
                writer.add_image('train/7-reconstruction_from_q_g', grid_image, global_step)

                if args.arch.phase_background:
                    grid_image = make_grid(
                        log['recon_from_q_g_fg'].cpu().detach()[:args.log.num_summary_img].clamp(0, 1).view(
                            -1, args.data.inp_channel, args.data.img_h, args.data.img_w),
                        args.log.num_img_per_row, normalize=False, pad_value=1)
                    writer.add_image('train/8-recon_from_q_g-fg', grid_image, global_step)

                    grid_image = make_grid(
                        log['recon_from_q_g_alpha'].cpu().detach()[:args.log.num_summary_img].clamp(0, 1).view(
                            -1, 1, args.data.img_h, args.data.img_w),
                        args.log.num_img_per_row, normalize=False, pad_value=1)
                    writer.add_image('train/9-recon_from_q_g-alpha', grid_image, global_step)

                    grid_image = make_grid(
                        log['recon_from_q_g_bg'].cpu().detach()[:args.log.num_summary_img].clamp(0, 1).view(
                            -1, args.data.inp_channel, args.data.img_h, args.data.img_w),
                        args.log.num_img_per_row, normalize=False, pad_value=1)
                    writer.add_image('train/a-background_from_q_g', grid_image, global_step)

                writer.add_scalar('train/total_loss', total_loss.item(), global_step=global_step)
                writer.add_scalar('train/log_like', log_like.item(), global_step=global_step)
                writer.add_scalar('train/What_KL', kl_what.item(), global_step=global_step)
                writer.add_scalar('train/bg_KL', kl_bg.item(), global_step=global_step)
                writer.add_scalar('train/Where_KL', kl_where.item(), global_step=global_step)
                writer.add_scalar('train/Pres_KL', kl_pres.item(), global_step=global_step)
                writer.add_scalar('train/Depth_KL', kl_depth.item(), global_step=global_step)
                writer.add_scalar('train/kl_global', kl_global.item(), global_step=global_step)
                writer.add_scalar('train/What_KL_raw', kl_what_raw.item(), global_step=global_step)
                writer.add_scalar('train/bg_KL_raw', kl_bg_raw.item(), global_step=global_step)
                writer.add_scalar('train/Where_KL_raw', kl_where_raw.item(), global_step=global_step)
                writer.add_scalar('train/Pres_KL_raw', kl_pres_raw.item(), global_step=global_step)
                writer.add_scalar('train/Depth_KL_raw', kl_depth_raw.item(), global_step=global_step)
                writer.add_scalar('train/aux_What_KL', aux_kl_what.item(), global_step=global_step)
                writer.add_scalar('train/aux_bg_KL', aux_kl_bg.item(), global_step=global_step)
                writer.add_scalar('train/aux_Where_KL', aux_kl_where.item(), global_step=global_step)
                writer.add_scalar('train/aux_Pres_KL', aux_kl_pres.item(), global_step=global_step)
                writer.add_scalar('train/aux_Depth_KL', aux_kl_depth.item(), global_step=global_step)
                writer.add_scalar('train/aux_What_KL_raw', aux_kl_what_raw.item(), global_step=global_step)
                writer.add_scalar('train/aux_bg_KL_raw', aux_kl_bg_raw.item(), global_step=global_step)
                writer.add_scalar('train/aux_Where_KL_raw', aux_kl_where_raw.item(), global_step=global_step)
                writer.add_scalar('train/aux_Pres_KL_raw', aux_kl_pres_raw.item(), global_step=global_step)
                writer.add_scalar('train/aux_Depth_KL_raw', aux_kl_depth_raw.item(), global_step=global_step)
                writer.add_scalar('train/kl_global_raw', kl_global_raw.item(), global_step=global_step)
                writer.add_scalar('train/tau_pres', args.train.tau_pres, global_step=global_step)
                for i in range(args.arch.draw_step):
                    writer.add_scalar(f'train/kl_global_raw_step_{i}',
                                      kl_global_all[:, i].mean().item(),
                                      global_step=global_step)
                writer.add_scalar('train/log_prob_x_given_g',
                                  log['log_prob_x_given_g'].mean(0).item(),
                                  global_step=global_step)
                elbo = (log_like.item() - kl_pres_raw.item() - kl_where_raw.item() -
                        kl_depth_raw.item() - kl_what_raw.item() - kl_bg_raw.item() -
                        kl_global_raw.item())
                writer.add_scalar('train/elbo', elbo, global_step=global_step)

                ################################ generation ################################
                with torch.no_grad():
                    model.eval()
                    if num_gpu > 1:
                        sample = model.module.sample()[0]
                    else:
                        sample = model.sample()[0]
                    model.train()
                    grid_image = make_grid(sample[0].cpu().detach().clamp(0, 1),
                                           args.log.num_img_per_row,
                                           normalize=False, pad_value=1)
                    writer.add_image('generation/1-image', grid_image, global_step)
                    if args.arch.phase_background:
                        grid_image = make_grid(sample[1].cpu().detach().clamp(0, 1),
                                               args.log.num_img_per_row,
                                               normalize=False, pad_value=1)
                        writer.add_image('generation/2-fg', grid_image, global_step)
                        grid_image = make_grid(sample[2].cpu().detach().clamp(0, 1),
                                               args.log.num_img_per_row,
                                               normalize=False, pad_value=1)
                        writer.add_image('generation/3-alpha', grid_image, global_step)
                        grid_image = make_grid(sample[3].cpu().detach().clamp(0, 1),
                                               args.log.num_img_per_row,
                                               normalize=False, pad_value=1)
                        writer.add_image('generation/4-bg', grid_image, global_step)
                ############################## generation end ##############################

                last_count = local_count

        ################################ ll computing ################################
        # only for logging; the final ll should be computed using 100 particles
        if epoch % args.log.compute_nll_freq == 0:
            print(f'val nll at the end of epoch {epoch}')
            model.eval()
            args.log.phase_nll = True
            elbo_list = []
            kl_list = []
            ll_list = []
            with torch.no_grad():
                args.log.phase_log = False
                for batch_idx, sample in enumerate(val_loader):
                    imgs = sample.to(device)
                    ll_sample_list = []
                    for i in range(args.log.nll_num_sample):
                        _, log_like, kl, log_imp, _, _, _ = model(imgs)
                        aux_kl_pres, aux_kl_where, aux_kl_depth, aux_kl_what, \
                            aux_kl_bg, kl_pres, kl_where, kl_depth, kl_what, \
                            kl_global_all, kl_bg = kl
                        log_imp_pres, log_imp_depth, log_imp_what, log_imp_where, \
                            log_imp_bg, log_imp_g = log_imp
                        ll_sample_list.append(
                            (log_like + log_imp_pres + log_imp_depth + log_imp_what +
                             log_imp_where + log_imp_bg + log_imp_g).cpu())
                        # Only use one sample for elbo
                        if i == 0:
                            elbo_list.append(
                                (log_like - kl_pres - kl_where - kl_depth - kl_what -
                                 kl_bg - kl_global_all.sum(dim=1)).cpu())
                            # the snippet is truncated here; the KL sum is completed
                            # by analogy with the elbo term directly above
                            kl_list.append(
                                (kl_pres + kl_where + kl_depth + kl_what + kl_bg +
                                 kl_global_all.sum(dim=1)).cpu())
kl_global_all.sum(dim=1)).cpu()) ll_sample = log_mean_exp(torch.stack(ll_sample_list, dim=1), dim=1) ll_list.append(ll_sample) ll_all = torch.cat(ll_list, dim=0) elbo_all = torch.cat(elbo_list, dim=0) kl_all = torch.cat(kl_list, dim=0) writer.add_scalar('val/ll', ll_all.mean(0).item(), global_step=epoch) writer.add_scalar('val/elbo', elbo_all.mean(0).item(), global_step=epoch) writer.add_scalar('val/kl', kl_all.mean(0).item(), global_step=epoch) args.log.phase_nll = False model.train() if epoch % (args.log.compute_nll_freq * 10) == 0: print(f'test nll at the end of epoch {epoch}') model.eval() args.log.phase_nll = True elbo_list = [] kl_list = [] ll_list = [] with torch.no_grad(): args.log.phase_log = False for batch_idx, sample in enumerate(test_loader): imgs = sample.to(device) ll_sample_list = [] for i in range(args.log.nll_num_sample): _, log_like, kl, log_imp, _, _, _ = \ model(imgs) aux_kl_pres, aux_kl_where, aux_kl_depth, aux_kl_what, \ aux_kl_bg, kl_pres, kl_where, kl_depth, kl_what, \ kl_global_all, kl_bg = kl log_imp_pres, log_imp_depth, log_imp_what, log_imp_where, log_imp_bg, log_imp_g = log_imp ll_sample_list.append( (log_like + log_imp_pres + log_imp_depth + log_imp_what + log_imp_where + log_imp_bg + log_imp_g).cpu()) # Only use one sample for elbo if i == 0: elbo_list.append((log_like - kl_pres - kl_where - kl_depth - kl_what - kl_bg - kl_global_all.sum(dim=1)).cpu()) kl_list.append( (kl_pres + kl_where + kl_depth + kl_what + kl_bg + kl_global_all.sum(dim=1)).cpu()) ll_sample = log_mean_exp(torch.stack(ll_sample_list, dim=1), dim=1) ll_list.append(ll_sample) ll_all = torch.cat(ll_list, dim=0) elbo_all = torch.cat(elbo_list, dim=0) kl_all = torch.cat(kl_list, dim=0) writer.add_scalar('test/ll', ll_all.mean(0).item(), global_step=epoch) writer.add_scalar('test/elbo', elbo_all.mean(0).item(), global_step=epoch) writer.add_scalar('test/kl', kl_all.mean(0).item(), global_step=epoch) args.log.phase_nll = False model.train() if epoch % args.log.save_epoch_freq == 0 and epoch != 0: save_ckpt(model_dir, model, optimizer, global_step, epoch, local_count, args.train.batch_size, num_train) save_ckpt(model_dir, model, optimizer, global_step, epoch, local_count, args.train.batch_size, num_train)
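# The importance-weighted log-likelihood above stacks nll_num_sample log-weights
# per image and reduces them with log_mean_exp, which is defined elsewhere. A
# minimal sketch of what that helper is assumed to compute (logsumexp minus
# log K, which keeps the reduction numerically stable for large log-weights):
import math
import torch

def log_mean_exp_sketch(x, dim):
    # log( (1/K) * sum_k exp(x_k) ) = logsumexp(x) - log K
    return torch.logsumexp(x, dim=dim) - math.log(x.size(dim))

# Usage: for log_weights of shape [batch, nll_num_sample],
# log_mean_exp_sketch(log_weights, dim=1) yields a [batch] ll estimate.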
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu
    args.rank = args.start_rank + gpu
    TARGET_GPUS = [args.gpu]
    gpus = torch.IntTensor(TARGET_GPUS)
    logger = None
    ckpt_path = "models_chunk_twin_context"
    os.makedirs(ckpt_path, exist_ok=True)
    if args.rank == 0:
        logger = init_logging("chunk_model", "{}/train.log".format(ckpt_path))
        args_msg = ['  %s: %s' % (name, value) for (name, value) in vars(args).items()]
        logger.info('args:\n' + '\n'.join(args_msg))
        csv_file = open(args.csv_file, 'w', newline='')
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header)

    ctc_crf_base.init_env(args.den_lm_fst_path, gpus)
    # print("rank {} init process group".format(args.rank),
    #       datetime.datetime.now(), flush=True)
    dist.init_process_group(backend='nccl', init_method=args.dist_url,
                            world_size=args.world_size, rank=args.rank)
    torch.cuda.set_device(args.gpu)

    model = CAT_Chunk_Model(args.feature_size, args.hdim, args.output_unit,
                            args.dropout, args.lamb, args.reg_weight, args.ctc_crf)
    if args.rank == 0:
        params_msg = params_num(model)
        logger.info('\n'.join(params_msg))

    lr = args.origin_lr
    optimizer = optim.Adam(model.parameters(), lr=lr)
    epoch = 0
    prev_cv_loss = np.inf
    if args.checkpoint:
        checkpoint = torch.load(args.checkpoint)
        epoch = checkpoint['epoch']
        lr = checkpoint['lr']
        prev_cv_loss = checkpoint['cv_loss']
        model.load_state_dict(checkpoint['model'])

    model.cuda(args.gpu)
    model = nn.parallel.DistributedDataParallel(model, device_ids=TARGET_GPUS)

    reg_model = CAT_RegModel(args.feature_size, args.hdim, args.output_unit,
                             args.dropout, args.lamb)
    loaded_reg_model = torch.load(args.regmodel_checkpoint)
    reg_model.load_state_dict(loaded_reg_model)
    reg_model.cuda(args.gpu)
    reg_model = nn.parallel.DistributedDataParallel(reg_model, device_ids=TARGET_GPUS)

    model.train()
    reg_model.eval()
    prev_epoch_time = timeit.default_timer()

    while True:
        # training stage
        epoch += 1
        gc.collect()
        if epoch > 2:
            cate_list = list(range(1, args.cate))
            random.shuffle(cate_list)
        else:
            cate_list = range(1, args.cate)
        for cate in cate_list:
            pkl_path = args.tr_data_path + "/" + str(cate) + ".pkl"
            if not os.path.exists(pkl_path):
                continue
            batch_size = int(args.gpu_batch_size * 2 / cate)
            if batch_size < 2:
                batch_size = 2
            # print("rank {} pkl path {} batch size {}".format(
            #     args.rank, pkl_path, batch_size))
            tr_dataset = SpeechDatasetMemPickel(pkl_path)
            if len(tr_dataset) < args.world_size:
                continue
            jitter = random.randint(-args.jitter_range, args.jitter_range)
            chunk_size = args.default_chunk_size + jitter
            tr_sampler = DistributedSampler(tr_dataset)
            tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size,
                                       shuffle=False, num_workers=0,
                                       collate_fn=PadCollateChunk(chunk_size),
                                       drop_last=True, sampler=tr_sampler)
            tr_sampler.set_epoch(epoch)  # important for data shuffling across epochs
            print("rank {} lengths_cate: {}, chunk_size: {}, training epoch: {}".format(
                args.rank, cate, chunk_size, epoch))
            train_chunk_model(model, reg_model, tr_dataloader, optimizer,
                              epoch, chunk_size, TARGET_GPUS, args, logger)

        # cv stage
        model.eval()
        cv_losses_sum = []
        cv_cls_losses_sum = []
        count = 0
        cate_list = range(1, args.cate)
        for cate in cate_list:
            pkl_path = args.dev_data_path + "/" + str(cate) + ".pkl"
            if not os.path.exists(pkl_path):
                continue
            batch_size = int(args.gpu_batch_size * 2 / cate)
            if batch_size < 2:
                batch_size = 2
            cv_dataset = SpeechDatasetMemPickel(pkl_path)
            cv_dataloader = DataLoader(cv_dataset, batch_size=batch_size,
                                       shuffle=False, num_workers=0,
                                       collate_fn=PadCollateChunk(args.default_chunk_size),
                                       drop_last=True)
            validate_count = validate_chunk_model(model, reg_model, cv_dataloader,
                                                  epoch, cv_losses_sum,
                                                  cv_cls_losses_sum, args, logger)
            count += validate_count
        cv_loss = np.sum(np.asarray(cv_losses_sum)) / count
        cv_cls_loss = np.sum(np.asarray(cv_cls_losses_sum)) / count
        # print("mean_cv_loss: {}, mean_cv_cls_loss: {}".format(cv_loss, cv_cls_loss))
        if args.rank == 0:
            save_ckpt({
                'cv_loss': cv_loss,
                'model': model.module.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr': lr,
                'epoch': epoch
            }, epoch < args.min_epoch or cv_loss <= prev_cv_loss, ckpt_path,
                "model.epoch.{}".format(epoch))
            csv_row = [epoch, (timeit.default_timer() - prev_epoch_time) / 60, lr, cv_loss]
            prev_epoch_time = timeit.default_timer()
            csv_writer.writerow(csv_row)
            csv_file.flush()
            plot_train_figure(args.csv_file, args.figure_file)

        if epoch < args.min_epoch or cv_loss <= prev_cv_loss:
            prev_cv_loss = cv_loss
        else:
            args.annealing_epoch = 0
        lr = adjust_lr_distribute(optimizer, args.origin_lr, lr, cv_loss,
                                  prev_cv_loss, epoch, args.annealing_epoch,
                                  args.gpu_batch_size, args.world_size)
        if lr < args.stop_lr:
            print("rank {} lr is too small, finishing training".format(args.rank),
                  datetime.datetime.now(), flush=True)
            break
        model.train()

    ctc_crf_base.release_env(gpus)
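# tr_sampler.set_epoch(epoch) above is what makes the distributed shuffle differ
# between epochs: DistributedSampler seeds its permutation with the epoch
# number, so omitting the call replays the same sample order on every pass. A
# minimal sketch of the pattern (TensorDataset here is a stand-in for
# SpeechDatasetMemPickel, not the repo's dataset class):
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

def make_epoch_loader(dataset, batch_size, epoch):
    sampler = DistributedSampler(dataset)  # requires an initialized process group
    sampler.set_epoch(epoch)               # re-seed the shuffle for this epoch
    return DataLoader(dataset, batch_size=batch_size, shuffle=False,
                      sampler=sampler, drop_last=True)

# Example (still needs dist.init_process_group to have run):
# ds = TensorDataset(torch.zeros(100, 8))
# loader = make_epoch_loader(ds, batch_size=4, epoch=3)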
def main():
    global args, best_prec1
    args = parser.parse_args()

    gpu_num = torch.cuda.device_count()
    print("=> using {} GPUS for training".format(gpu_num))
    if gpu_num > 0:
        args.cuda = True
    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

    path = os.path.join(args.data_folder, args.dataset)
    if args.dataset == 'mnist':
        num_classes = 10
        normalize = transforms.Normalize((0.1307,), (0.3081,))
        train_loader = torch.utils.data.DataLoader(
            datasets.MNIST(path, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               normalize,
                           ])),
            batch_size=args.batch_size, shuffle=True, **kwargs)
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(path, train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               normalize,
                           ])),
            batch_size=args.test_batch_size, shuffle=True, **kwargs)
    elif args.dataset == 'cifar10':
        # classes = ['plane', 'car', 'bird', 'cat', 'deer',
        #            'dog', 'frog', 'horse', 'ship', 'truck']
        num_classes = 10
        normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                                         std=[0.2023, 0.1994, 0.2010])
        train_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10(root=path, train=True, download=True,
                             transform=transforms.Compose([
                                 transforms.RandomCrop(32, padding=4),
                                 transforms.RandomHorizontalFlip(),
                                 transforms.ToTensor(),
                                 normalize,
                             ])),
            batch_size=args.batch_size, shuffle=True, **kwargs)
        test_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10(root=path, train=False, download=True,
                             transform=transforms.Compose([
                                 transforms.ToTensor(),
                                 normalize,
                             ])),
            batch_size=args.test_batch_size, shuffle=False, **kwargs)
    else:
        raise ValueError('{} is not supported'.format(args.dataset))

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](num_classes=num_classes,
                                           conv_init=args.conv_init,
                                           pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch](num_classes=num_classes,
                                           conv_init=args.conv_init)

    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
    else:
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    if args.evaluate:
        test(test_loader, model, criterion)
        return

    assert max(args.lr_steps) < args.epochs
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_steps, args.gamma)
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)
        lr_scheduler.step()

        # evaluate on test set
        prec1 = test(test_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_ckpt({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, args.save_path, epoch + 1, is_best)
    print('best test top-1 accuracy: {}'.format(best_prec1))
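# The MultiStepLR above multiplies every parameter group's lr by args.gamma each
# time a milestone in args.lr_steps is passed; the assert guarantees that all
# milestones fall inside the run. A self-contained illustration with dummy
# values (milestones [2, 4] and gamma 0.1 are hypothetical, not this script's
# defaults):
import torch

_p = [torch.nn.Parameter(torch.zeros(1))]
_opt = torch.optim.SGD(_p, lr=0.1, momentum=0.9)
_sched = torch.optim.lr_scheduler.MultiStepLR(_opt, milestones=[2, 4], gamma=0.1)
for _epoch in range(6):
    _opt.step()
    _sched.step()
    # lr stays 0.1 until milestone 2, drops to 0.01 until milestone 4, then 0.001
    print(_epoch, _opt.param_groups[0]['lr'])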
def main():
    start_time = time()

    init_out_dir()
    last_step = get_last_ckpt_step()
    if last_step >= 0:
        my_log(f'\nCheckpoint found: {last_step}\n')
    else:
        clear_log()
    print_args()

    net_init, net_apply, net_init_cache, net_apply_fast = get_net()

    rng, rng_net = jrand.split(jrand.PRNGKey(args.seed))
    in_shape = (args.batch_size, args.L, args.L, 1)
    out_shape, params_init = net_init(rng_net, in_shape)
    _, cache_init = net_init_cache(params_init, jnp.zeros(in_shape), (-1, -1))

    # sample_fun = get_sample_fun(net_apply, None)
    sample_fun = get_sample_fun(net_apply_fast, cache_init)
    log_q_fun = get_log_q_fun(net_apply)

    need_beta_anneal = args.beta_anneal_step > 0

    opt_init, opt_update, get_params = optimizers.adam(args.lr)

    @jit
    def update(step, opt_state, rng):
        params = get_params(opt_state)
        rng, rng_sample = jrand.split(rng)
        spins = sample_fun(args.batch_size, params, rng_sample)
        log_q = log_q_fun(params, spins) / args.L**2
        energy = energy_fun(spins) / args.L**2

        def neg_log_Z_fun(params, spins):
            log_q = log_q_fun(params, spins) / args.L**2
            energy = energy_fun(spins) / args.L**2
            beta = args.beta
            if need_beta_anneal:
                beta *= jnp.minimum(step / args.beta_anneal_step, 1)
            neg_log_Z = log_q + beta * energy
            return neg_log_Z

        loss_fun = partial(expect, log_q_fun, neg_log_Z_fun,
                           mean_grad_expected_is_zero=True)
        grads = grad(loss_fun)(params, spins, spins)
        opt_state = opt_update(step, grads, opt_state)

        return spins, log_q, energy, opt_state, rng

    if last_step >= 0:
        params_init = load_ckpt(last_step)
    opt_state = opt_init(params_init)

    my_log('Training...')
    for step in range(last_step + 1, args.max_step + 1):
        spins, log_q, energy, opt_state, rng = update(step, opt_state, rng)

        if args.print_step and step % args.print_step == 0:
            # Use the final beta, not the annealed beta
            free_energy = log_q / args.beta + energy
            my_log(', '.join([
                f'step = {step}',
                f'F = {free_energy.mean():.8g}',
                f'F_std = {free_energy.std():.8g}',
                f'S = {-log_q.mean():.8g}',
                f'E = {energy.mean():.8g}',
                f'time = {time() - start_time:.3f}',
            ]))

        if args.save_step and step % args.save_step == 0:
            params = get_params(opt_state)
            save_ckpt(params, step)
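# `expect` above acts as a score-function (REINFORCE) estimator: for discrete
# spins, grad E_q[f] = E_q[(f - b) * grad log q], and mean_grad_expected_is_zero
# indicates a mean baseline b is subtracted for variance reduction. A minimal
# sketch of such a loss under those assumptions (names follow main(); the real
# helper's signature may differ):
import jax.numpy as jnp
from jax import lax

def reinforce_loss(log_q_fun, f_fun, params, spins):
    f = lax.stop_gradient(f_fun(params, spins))  # per-sample signal, no gradient
    baseline = f.mean()                          # mean baseline for variance reduction
    log_q = log_q_fun(params, spins)
    # Differentiating this mean w.r.t. params reproduces the REINFORCE gradient.
    return jnp.mean((f - baseline) * log_q)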
def train(self):
    # Seed
    np.random.seed(self.manual_seed)
    random.seed(self.manual_seed)
    torch.manual_seed(self.manual_seed)

    # For fast training
    cudnn.benchmark = True

    # For BatchNorm
    self.G.train()
    self.D.train()

    # Fixed noise for sampling from G
    fixed_noise = torch.randn(self.batch_size, self.z_dim, device=self.device)
    if self.num_of_classes < self.batch_size:
        fixed_labels = torch.from_numpy(
            np.tile(np.arange(self.num_of_classes),
                    self.batch_size // self.num_of_classes + 1)[:self.batch_size]).to(self.device)
    else:
        fixed_labels = torch.from_numpy(np.arange(self.batch_size)).to(self.device)

    # For gan loss; float fill values so these work as BCE targets
    # (an integer fill would give a long tensor in recent PyTorch)
    label = torch.full((self.batch_size,), 1.0, device=self.device)
    ones = torch.full((self.batch_size,), 1.0, device=self.device)

    # Losses file
    log_file_name = os.path.join(self.save_path, 'log.txt')
    log_file = open(log_file_name, "wt")

    # Init
    start_time = time.time()
    G_losses = []
    D_losses_real = []
    D_losses_fake = []
    D_losses = []
    D_xs = []
    D_Gz_trainDs = []
    D_Gz_trainGs = []

    # Instance noise - mean (0) and std tensors for the injected noise
    inst_noise_mean = torch.full(
        (self.batch_size, 3, self.imsize, self.imsize), 0.0, device=self.device)
    inst_noise_std = torch.full(
        (self.batch_size, 3, self.imsize, self.imsize), self.inst_noise_sigma,
        device=self.device)

    # Start training
    for self.step in range(self.start, self.total_step):

        # Instance noise std is linearly annealed from self.inst_noise_sigma
        # to 0 over self.inst_noise_sigma_iters
        inst_noise_sigma_curr = 0 if self.step > self.inst_noise_sigma_iters else \
            (1 - self.step / self.inst_noise_sigma_iters) * self.inst_noise_sigma
        inst_noise_std.fill_(inst_noise_sigma_curr)

        # ================== TRAIN D ================== #
        for _ in range(self.d_steps_per_iter):
            # Zero grad
            self.reset_grad()

            # TRAIN with REAL

            # Get real images & real labels
            real_images, real_labels = self.get_real_samples()

            # Get D output for real images & real labels
            inst_noise = torch.normal(mean=inst_noise_mean, std=inst_noise_std).to(self.device)
            d_out_real = self.D(real_images + inst_noise, real_labels)

            # Compute D loss with real images & real labels
            if self.adv_loss == 'hinge':
                d_loss_real = torch.nn.ReLU()(ones - d_out_real).mean()
            elif self.adv_loss == 'wgan_gp':
                d_loss_real = -d_out_real.mean()
            else:
                label.fill_(1)
                d_loss_real = self.criterion(d_out_real, label)

            # Backward
            d_loss_real.backward()

            # TRAIN with FAKE

            # Create random noise
            z = torch.randn(self.batch_size, self.z_dim, device=self.device)

            # Generate fake images for same real labels
            fake_images = self.G(z, real_labels)

            # Get D output for fake images & same real labels
            inst_noise = torch.normal(mean=inst_noise_mean, std=inst_noise_std).to(self.device)
            d_out_fake = self.D(fake_images.detach() + inst_noise, real_labels)

            # Compute D loss with fake images & real labels
            if self.adv_loss == 'hinge':
                d_loss_fake = torch.nn.ReLU()(ones + d_out_fake).mean()
            elif self.adv_loss == 'dcgan':
                label.fill_(0)
                d_loss_fake = self.criterion(d_out_fake, label)
            else:
                d_loss_fake = d_out_fake.mean()

            # Backward
            d_loss_fake.backward()

            # If WGAN_GP, compute GP and add to D loss
            if self.adv_loss == 'wgan_gp':
                d_loss_gp = self.lambda_gp * self.compute_gradient_penalty(
                    real_images, real_labels, fake_images.detach())
                d_loss_gp.backward()

            # Optimize
            self.D_optimizer.step()

        # ================== TRAIN G ================== #
        for _ in range(self.g_steps_per_iter):
            # Zero grad
            self.reset_grad()

            # Get real images & real labels (only need real labels)
            real_images, real_labels = self.get_real_samples()

            # Create random noise
            z = torch.randn(self.batch_size, self.z_dim, device=self.device)

            # Generate fake images for same real labels
            fake_images = self.G(z, real_labels)

            # Get D output for fake images & same real labels
            inst_noise = torch.normal(mean=inst_noise_mean, std=inst_noise_std).to(self.device)
            g_out_fake = self.D(fake_images + inst_noise, real_labels)

            # Compute G loss with fake images & real labels
            if self.adv_loss == 'dcgan':
                label.fill_(1)
                g_loss = self.criterion(g_out_fake, label)
            else:
                g_loss = -g_out_fake.mean()

            # Backward + Optimize
            g_loss.backward()
            self.G_optimizer.step()

        # Print out log info
        if self.step % self.log_step == 0:
            G_losses.append(g_loss.mean().item())
            D_losses_real.append(d_loss_real.mean().item())
            D_losses_fake.append(d_loss_fake.mean().item())
            D_loss = D_losses_real[-1] + D_losses_fake[-1]
            if self.adv_loss == 'wgan_gp':
                D_loss += d_loss_gp.mean().item()
            D_losses.append(D_loss)
            D_xs.append(d_out_real.mean().item())
            D_Gz_trainDs.append(d_out_fake.mean().item())
            D_Gz_trainGs.append(g_out_fake.mean().item())
            curr_time = time.time()
            curr_time_str = datetime.datetime.fromtimestamp(curr_time).strftime('%Y-%m-%d %H:%M:%S')
            elapsed = str(datetime.timedelta(seconds=(curr_time - start_time)))
            log = ("[{}] : Elapsed [{}], Iter [{} / {}], G_loss: {:.4f}, "
                   "D_loss: {:.4f}, D_loss_real: {:.4f}, D_loss_fake: {:.4f}, "
                   "D(x): {:.4f}, D(G(z))_trainD: {:.4f}, D(G(z))_trainG: {:.4f}\n"
                   .format(curr_time_str, elapsed, self.step, self.total_step,
                           G_losses[-1], D_losses[-1], D_losses_real[-1],
                           D_losses_fake[-1], D_xs[-1], D_Gz_trainDs[-1],
                           D_Gz_trainGs[-1]))
            print(log)
            log_file.write(log)
            log_file.flush()
            utils.make_plots(G_losses, D_losses, D_losses_real, D_losses_fake,
                             D_xs, D_Gz_trainDs, D_Gz_trainGs, self.log_step,
                             self.save_path)

        # Sample images
        if self.step % self.sample_step == 0:
            self.G.eval()
            fake_images = self.G(fixed_noise, fixed_labels)
            self.G.train()
            sample_images = utils.denorm(fake_images.detach()[:self.save_n_images])
            # Save batch images
            vutils.save_image(sample_images,
                              os.path.join(self.sample_path,
                                           'fake_{:05d}.png'.format(self.step)))
            # Save gif
            utils.make_gif(sample_images[0].cpu().numpy().transpose(1, 2, 0) * 255,
                           self.step, self.sample_path, self.name,
                           max_frames_per_gif=self.max_frames_per_gif)

        # Save model
        if self.step % self.model_save_step == 0:
            utils.save_ckpt(self)
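# For reference, the hinge branch above in compact form: D minimizes
# relu(1 - D(x)) + relu(1 + D(G(z))), pushing real scores above +1 and fake
# scores below -1, while G minimizes -D(G(z)). A minimal functional sketch:
import torch.nn.functional as F

def d_hinge_loss(d_out_real, d_out_fake):
    return F.relu(1.0 - d_out_real).mean() + F.relu(1.0 + d_out_fake).mean()

def g_hinge_loss(d_out_fake):
    return -d_out_fake.mean()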
def main(opts):
    # Set up model
    model_map = {
        'v3_resnet50': network.deeplabv3_resnet50,
        'v3plus_resnet50': network.deeplabv3plus_resnet50,
        'v3_resnet101': network.deeplabv3_resnet101,
        'v3plus_resnet101': network.deeplabv3plus_resnet101,
        'v3_mobilenet': network.deeplabv3_mobilenet,
        'v3plus_mobilenet': network.deeplabv3plus_mobilenet
    }
    best_score = 0.0
    epoch = 0
    if opts.ckpt is not None and os.path.isfile(opts.ckpt):
        checkpoint = torch.load(opts.ckpt, map_location=torch.device('cpu'))
        checkpoint['teacher_opts']['save_val_results'] = opts.save_val_results
        checkpoint['teacher_opts']['ckpt'] = opts.ckpt
        opts = utils.Bunch(checkpoint['teacher_opts'])

    model = model_map[opts.model](num_classes=opts.num_classes,
                                  output_stride=opts.output_stride, opts=opts)
    teacher = None
    utils.set_bn_momentum(model.backbone, momentum=0.01)

    macs, params = utils.count_flops(model, opts)
    if opts.count_flops:
        return utils.create_result(opts, macs, params)

    # Set up optimizer and criterion
    optimizer = torch.optim.SGD(params=[
        {'params': model.backbone.parameters(), 'lr': 0.1 * opts.lr},
        {'params': model.classifier.parameters(), 'lr': opts.lr},
    ], lr=opts.lr, momentum=0.9, weight_decay=opts.weight_decay)
    scheduler = utils.PolyLR(optimizer, opts.total_epochs * len(train_loader), power=0.9)
    criterion = nn.CrossEntropyLoss(ignore_index=255, reduction='mean')

    # Load from checkpoint
    if opts.ckpt is not None and os.path.isfile(opts.ckpt):
        checkpoint = torch.load(opts.ckpt, map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint["model_state"])
        model = nn.DataParallel(model)
        model.to(device)
        optimizer.load_state_dict(checkpoint["optimizer_state"])
        scheduler.load_state_dict(checkpoint["scheduler_state"])
        epoch = checkpoint.get("epoch", 0)
        best_score = checkpoint.get('best_score', 0.0)
        print("Model restored from %s" % opts.ckpt)
        del checkpoint  # free memory
    else:
        model = nn.DataParallel(model)
        model.to(device)

    if opts.save_val_results:
        score = validate(model)
        print(metrics.to_str(score))
        return

    if opts.mode == "student":
        checkpoint = torch.load(opts.teacher_ckpt, map_location=torch.device('cpu'))
        checkpoint['teacher_opts']['at_type'] = opts.at_type
        teacher_opts = utils.Bunch(checkpoint['teacher_opts'])
        teacher = model_map[teacher_opts.model](num_classes=opts.num_classes,
                                                output_stride=teacher_opts.output_stride,
                                                opts=teacher_opts)
        teacher.load_state_dict(checkpoint["model_state"])
        teacher = nn.DataParallel(teacher)
        teacher.to(device)
        for param in teacher.parameters():
            param.requires_grad = False

    # ===== Train =====
    for epoch in tqdm(range(epoch, opts.total_epochs)):
        if opts.mode == "teacher":
            train_teacher(model, optimizer, criterion, scheduler)
        else:
            train_student(model, teacher, optimizer, criterion, scheduler)
        score = validate(model)
        print(metrics.to_str(score))
        utils.save_result(score, opts)
        if score['Mean IoU'] > best_score or \
                (opts.max_epochs != opts.total_epochs and epoch + 1 == opts.total_epochs):
            best_score = score['Mean IoU']
            utils.save_ckpt(opts.data_root, opts, model, optimizer, scheduler,
                            best_score, epoch + 1)
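# utils.PolyLR above is constructed with max_iters = total_epochs * len(train_loader),
# so it is presumably the usual polynomial decay lr = base_lr * (1 - t/T)^power.
# A minimal scheduler sketch under that assumption (not the repo's exact class):
from torch.optim.lr_scheduler import _LRScheduler

class PolyLRSketch(_LRScheduler):
    def __init__(self, optimizer, max_iters, power=0.9, last_epoch=-1):
        self.max_iters = max_iters
        self.power = power
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # Decays every group's base lr toward 0 as last_epoch approaches
        # max_iters; the min() guards against stepping past max_iters.
        factor = (1 - min(self.last_epoch, self.max_iters) / self.max_iters) ** self.power
        return [base_lr * factor for base_lr in self.base_lrs]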
def main(args):
    args.color_t = torch.rand(700, 3)
    if not os.path.exists(args.ckpt_dir):
        os.mkdir(args.ckpt_dir)
    if not os.path.exists(args.summary_dir):
        os.mkdir(args.summary_dir)

    device = torch.device("cuda" if not args.nocuda and torch.cuda.is_available() else "cpu")

    train_data = TrainStation(args=args, train=True)
    train_loader = DataLoader(train_data, batch_size=args.batch_size,
                              shuffle=True, num_workers=args.workers, drop_last=True)
    num_train = len(train_data)

    model = SCALOR(args)
    model.to(device)
    model.train()

    optimizer = torch.optim.RMSprop(model.parameters(), lr=args.lr)

    global_step = 0
    if args.last_ckpt:
        global_step, args.start_epoch = \
            load_ckpt(model, optimizer, args.last_ckpt, device)

    writer = SummaryWriter(args.summary_dir)
    args.global_step = global_step

    log_tau_gamma = np.log(args.tau_end) / args.tau_ep

    for epoch in range(int(args.start_epoch), args.epochs):
        local_count = 0
        last_count = 0
        end_time = time.time()

        for batch_idx, (sample, counting_gt) in enumerate(train_loader):
            tau = np.exp(global_step * log_tau_gamma)
            tau = max(tau, args.tau_end)
            args.tau = tau

            global_step += 1
            log_phase = global_step % args.print_freq == 0 or global_step == 1
            args.global_step = global_step
            args.log_phase = log_phase

            imgs = sample.to(device)

            y_seq, log_like, kl_z_what, kl_z_where, kl_z_depth, \
                kl_z_pres, kl_z_bg, log_imp, counting, \
                log_disc_list, log_prop_list, scalor_log_list = model(imgs)

            log_like = log_like.mean(dim=0)
            kl_z_what = kl_z_what.mean(dim=0)
            kl_z_where = kl_z_where.mean(dim=0)
            kl_z_depth = kl_z_depth.mean(dim=0)
            kl_z_pres = kl_z_pres.mean(dim=0)
            kl_z_bg = kl_z_bg.mean(dim=0)

            total_loss = -(log_like - kl_z_what - kl_z_where -
                           kl_z_depth - kl_z_pres - kl_z_bg)

            optimizer.zero_grad()
            total_loss.backward()
            clip_grad_norm_(model.parameters(), args.cp)
            optimizer.step()

            local_count += imgs.data.shape[0]

            if log_phase:
                time_inter = time.time() - end_time
                end_time = time.time()
                count_inter = local_count - last_count

                print_scalor(global_step, epoch, local_count, count_inter,
                             num_train, total_loss, log_like, kl_z_what,
                             kl_z_where, kl_z_pres, kl_z_depth, time_inter)

                writer.add_scalar('train/total_loss', total_loss.item(), global_step=global_step)
                writer.add_scalar('train/log_like', log_like.item(), global_step=global_step)
                writer.add_scalar('train/What_KL', kl_z_what.item(), global_step=global_step)
                writer.add_scalar('train/Where_KL', kl_z_where.item(), global_step=global_step)
                writer.add_scalar('train/Pres_KL', kl_z_pres.item(), global_step=global_step)
                writer.add_scalar('train/Depth_KL', kl_z_depth.item(), global_step=global_step)
                writer.add_scalar('train/Bg_KL', kl_z_bg.item(), global_step=global_step)
                # writer.add_scalar('train/Bg_alpha_KL', kl_z_bg_mask.item(), global_step=global_step)
                writer.add_scalar('train/tau', tau, global_step=global_step)

                log_summary(args, writer, imgs, y_seq, global_step, log_disc_list,
                            log_prop_list, scalor_log_list, prefix='train')

                last_count = local_count

            if global_step % args.generate_freq == 0:
                ####################################### do generation ####################################
                model.eval()
                with torch.no_grad():
                    args.phase_generate = True
                    y_seq, log_like, kl_z_what, kl_z_where, kl_z_depth, \
                        kl_z_pres, kl_z_bg, log_imp, counting, \
                        log_disc_list, log_prop_list, scalor_log_list = model(imgs)
                    args.phase_generate = False
                    log_summary(args, writer, imgs, y_seq, global_step, log_disc_list,
                                log_prop_list, scalor_log_list, prefix='generate')
                model.train()
                ####################################### end generation ####################################

            if global_step % args.save_epoch_freq == 0 or global_step == 1:
                save_ckpt(args.ckpt_dir, model, optimizer, global_step, epoch,
                          local_count, args.batch_size, num_train)
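# The Gumbel-softmax temperature used above follows
# tau(t) = exp(t * ln(tau_end) / tau_ep), clamped below at tau_end: an
# exponential decay from 1.0 that lands on tau_end exactly at step tau_ep. A
# standalone check with hypothetical values (0.3 and 20000 are placeholders,
# not this script's defaults):
import numpy as np

def tau_schedule(step, tau_end=0.3, tau_ep=20000):
    tau = np.exp(step * np.log(tau_end) / tau_ep)
    return max(tau, tau_end)

assert abs(tau_schedule(0) - 1.0) < 1e-9       # starts at 1
assert abs(tau_schedule(20000) - 0.3) < 1e-9   # hits tau_end at step tau_ep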
f'samples{args.samples}'
                         f'{args.tail}')
os.makedirs(model_dir, exist_ok=True)
utils.save_params(model_dir, vars(args))
print(model_dir)

## Data
trainset, testset, num_classes = L.load_dataset(args.data, data_dir=args.data_dir)
X_train, y_train = F.get_samples(trainset, args.samples)
X_train, y_train = X_train.to(device), y_train.to(device)

## Architecture
net = L.load_architecture(args.data, args.arch)
net = net.to(device)

## Training
with torch.no_grad():
    Z_train = net.init(X_train, y_train)
losses_train = net.get_loss()
X_train, Z_train = F.to_cpu(X_train, Z_train)

## Saving
utils.save_loss(model_dir, 'train', losses_train)
utils.save_ckpt(model_dir, 'model', net)

## Plotting
plot.plot_loss_mcr(model_dir, 'train')
print(model_dir)
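# utils.save_params and utils.save_ckpt above are assumed to be thin persistence
# wrappers: one writes the run configuration next to the model directory, the
# other stores the constructed network's weights. A minimal sketch of such
# helpers (file names and layout hypothetical, not the repo's exact API):
import json
import os
import torch

def save_params_sketch(model_dir, params):
    with open(os.path.join(model_dir, 'params.json'), 'w') as f:
        json.dump(params, f, indent=2, default=str)

def save_ckpt_sketch(model_dir, name, net):
    torch.save(net.state_dict(), os.path.join(model_dir, name + '.pt'))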