def clean(self):
    """
    Clean the trainer for a new patch. Don't touch the model, as it depends
    on transfer learning options.
    """
    self.iiter = 0
    print(colored('Finished patch %s' % self.image_name, 'yellow'))
    torch.cuda.empty_cache()
    self.loss_min = None
    self.history = u.History(self.args.epochs)
def __init__(self, args, outpath, dtype=torch.cuda.FloatTensor):
    self.args = args
    self.dtype = dtype
    self.outpath = outpath
    if args.loss == 'mse':
        self.loss_fn = torch.nn.MSELoss().type(self.dtype)
    else:
        self.loss_fn = torch.nn.L1Loss().type(self.dtype)
    self.elapsed = None
    self.iiter = 0
    self.iter_to_be_saved = list(range(0, self.args.epochs, int(self.args.save_every))) \
        if self.args.save_every is not None else [0]
    self.loss_min = None
    self.outchannel = args.imgchannel
    self.history = u.History(self.args.epochs)
    self.imgpath = None
    self.image_name = None
    self.img = None
    self.img_ = None
    self.mask = None
    self.mask_ = None
    self.out_best = None
    self.out_old = None
    self.zfill = u.ten_digit(self.args.epochs)

    # build input tensors
    self.input_type = 'noise3d' if args.datadim == '3d' else 'noise'
    self.input_ = None
    self.input_old = None
    self.add_noise_ = None
    self.add_data_ = None
    self.add_data_weight = None
    self.input_list = []

    # build network
    self.net = None
    self.parameters = None
    self.num_params = None
    self.optimizer = None
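
# NOTE: `u.History` and `u.ten_digit` are project utilities that are not part
# of this excerpt. The sketch below is a minimal guess consistent with their
# use above (a per-epoch loss buffer sized to `epochs`, and a digit count for
# zero-padding iteration numbers in filenames); the interface and the
# digit-count rule are assumptions, not the actual implementation.
import numpy as np


def ten_digit(number):
    """Number of decimal digits in `number`, used as the zfill width."""
    return len(str(int(number)))


class History:
    """Fixed-size per-epoch loss buffer (assumed interface)."""

    def __init__(self, epochs):
        self.loss = np.full(epochs, np.nan)

    def append(self, epoch, value):
        self.loss[epoch] = value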
def train_model(dataset, paths, device):
    """The main function for executing network training. It loads the
       specified dataset iterator, saliency model, and helper classes.
       Training is then performed in a new session by iterating over
       all batches for a number of epochs. After validation on an
       independent set, the model is saved and the training history
       is updated.

    Args:
        dataset (str): Denotes the dataset to be used during training.
        paths (dict, str): A dictionary with all path elements.
        device (str): Represents either "cpu" or "gpu".
    """
    iterator = data.get_dataset_iterator("train", dataset, paths["data"])
    next_element, train_init_op, valid_init_op = iterator
    input_images, ground_truths = next_element[:2]

    input_plhd = tf.placeholder_with_default(input_images,
                                             (None, None, None, 3),
                                             name="input")
    #training = tf.placeholder(tf.bool, name="training")  ## For BN

    msi_net = model_bn.MSINET(is_train=True)
    predicted_maps = msi_net.forward(input_plhd)
    optimizer, loss = msi_net.train(ground_truths, predicted_maps,
                                    config.PARAMS["learning_rate"])

    n_train_data = getattr(data, dataset.upper()).n_train
    n_valid_data = getattr(data, dataset.upper()).n_valid

    n_train_batches = int(np.ceil(n_train_data / config.PARAMS["batch_size"]))
    n_valid_batches = int(np.ceil(n_valid_data / config.PARAMS["batch_size"]))

    history = utils.History(n_train_batches, n_valid_batches,
                            dataset, paths["history"], device)

    progbar = utils.Progbar(n_train_data, n_train_batches,
                            config.PARAMS["batch_size"],
                            config.PARAMS["n_epochs"],
                            history.prior_epochs)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = msi_net.restore(sess, dataset, paths, device)

        print(">> Start training on %s..." % dataset.upper())

        for epoch in range(config.PARAMS["n_epochs"]):
            sess.run(train_init_op)

            for batch in range(n_train_batches):
                #_, error = sess.run([optimizer, loss], feed_dict={training: True})
                _, error = sess.run([optimizer, loss])
                history.update_train_step(error)
                progbar.update_train_step(batch)

            sess.run(valid_init_op)

            for batch in range(n_valid_batches):
                #error = sess.run(loss, feed_dict={training: False})
                error = sess.run(loss)
                history.update_valid_step(error)
                progbar.update_valid_step()

            msi_net.save(saver, sess, dataset, paths["latest"], device)
            history.save_history()

            progbar.write_summary(history.get_mean_train_error(),
                                  history.get_mean_valid_error())

            if history.valid_history[-1] == min(history.valid_history):
                msi_net.save(saver, sess, dataset, paths["best"], device)
                msi_net.optimize(sess, dataset, paths["best"], device)
                print("\tBest model!", flush=True)
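
# NOTE: The `utils.History` used by `train_model` is not shown. Judging from
# the calls above, it accumulates per-batch errors, reduces them to per-epoch
# means, and resumes the epoch count from a saved file. The sketch below is a
# plausible implementation under those assumptions; the file name, storage
# format, and reduction logic are guesses.
import os
import numpy as np


class History:
    """Per-epoch train/valid error tracker with resume support (sketch)."""

    def __init__(self, n_train_batches, n_valid_batches, dataset, path, device):
        self.n_train_batches = n_train_batches
        self.n_valid_batches = n_valid_batches
        self.path = os.path.join(path, "history_%s_%s.npy" % (dataset, device))
        self.train_history, self.valid_history = [], []
        if os.path.isfile(self.path):  # resume epoch counting across runs
            saved = np.load(self.path, allow_pickle=True).item()
            self.train_history = list(saved["train"])
            self.valid_history = list(saved["valid"])
        self.prior_epochs = len(self.train_history)
        self._train_errors, self._valid_errors = [], []

    def update_train_step(self, error):
        self._train_errors.append(float(error))

    def update_valid_step(self, error):
        self._valid_errors.append(float(error))

    def get_mean_train_error(self):
        # Mean over the current epoch's batches; also appended to the history.
        mean = float(np.mean(self._train_errors[-self.n_train_batches:]))
        self.train_history.append(mean)
        return mean

    def get_mean_valid_error(self):
        mean = float(np.mean(self._valid_errors[-self.n_valid_batches:]))
        self.valid_history.append(mean)
        return mean

    def save_history(self):
        np.save(self.path, {"train": self.train_history,
                            "valid": self.valid_history})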
# log_dir = 'logs/' + str(np.max(existing_logs)+1)
# print('Logging output via tensorboard to', log_dir)
# else:
#     log_dir = None
#     print('Not logging output')

# Set up dataset splits and ranking evaluation.
#train_triples, val_triples, test_triples = utils.train_val_test_split(all_triples, val_size=5000, test_size=5000, random_state=0)
filtered = False
train_ranker = RankingEvaluation(train_triples[:5000], num_nodes,
                                 triples_to_filter=all_triples if filtered else None,
                                 device=device, show_progress=True)
val_ranker = RankingEvaluation(val_triples, num_nodes,
                               triples_to_filter=all_triples if filtered else None,
                               device=device, show_progress=True)
#test_ranker = RankingEvaluation(test_triples, num_nodes, filter_triples=all_triples if filtered else None, show_progress=True)

history = utils.History()

#node_features = load_image_features(num_nodes, entity_map)
node_features = None

utils.seed_all(0)
# TODO: Make device parameter obsolete by moving everything to the device once .to(device) is called.
# net = UnsupervisedRGCN(num_nodes, num_relations, train_triples,
#                        embedding_size=200, dropout=0,  # embedding_size=500, dropout=0.5
#                        num_sample_train=10, num_sample_eval=10, activation=F.elu,
#                        node_features=node_features, device=device)
net = DistMult(500, num_nodes, num_relations, 0)
net.to(device)

optimizer = torch.optim.Adam(
    filter(lambda parameter: parameter.requires_grad, net.parameters()), lr=0.001)

train_via_classification(net, train_triples, val_triples, optimizer, num_nodes,
                         train_ranker, val_ranker, num_epochs=35, batch_size=64,
                         batch_size_eval=512, device=device, history=history)
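
# NOTE: `RankingEvaluation` is defined elsewhere. From its usage (called with
# a model and a batch size, returning mean rank, mean reciprocal rank, and
# hits@1/3/10), an unfiltered, tail-corruption-only sketch could look like
# this; the real class also supports filtering via `triples_to_filter`, which
# is omitted here for brevity.
import torch


class RankingEvaluation:
    """Ranks each triple's true tail against all candidate entities (sketch)."""

    def __init__(self, triples, num_nodes, triples_to_filter=None,
                 device='cpu', show_progress=False):
        self.triples = torch.as_tensor(triples, dtype=torch.long, device=device)
        self.num_nodes = num_nodes
        self.device = device

    @torch.no_grad()
    def __call__(self, net, batch_size=512):
        all_entities = torch.arange(self.num_nodes, device=self.device)
        ranks = []
        for head, rel, tail in self.triples:
            # Score (head, rel, e) for every entity e, in chunks of batch_size.
            candidates = torch.stack([head.expand(self.num_nodes),
                                      rel.expand(self.num_nodes),
                                      all_entities], dim=1)
            scores = torch.cat([net(candidates[i:i + batch_size])
                                for i in range(0, self.num_nodes, batch_size)])
            # Rank of the true tail among all candidates (1 = best).
            ranks.append((scores > scores[tail]).sum().item() + 1)
        ranks = torch.tensor(ranks, dtype=torch.float)
        hits = lambda k: (ranks <= k).float().mean().item()
        return (ranks.mean().item(), (1.0 / ranks).mean().item(),
                hits(1), hits(3), hits(10))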
def logp(z):  # log posterior distribution
    x = netG(z)
    lpr = -0.5 * (z**2).view(z.shape[0], -1).sum(-1)  # log prior
    llh = -0.5 * ((x[..., ij[:, 0], ij[:, 1]] - vals)**2).view(
        x.shape[0], -1).sum(-1) / args.alpha  # log likelihood
    return llh + lpr


optimizer = optim.Adam(netI.parameters(), lr=args.lr, amsgrad=True,
                       betas=(0.5, 0.9))
w = torch.FloatTensor(args.batch_size, args.nw).to(device)
history = utils.History(args.outdir)
plotter = utils.Plotter(args.outdir, netG, netI, args.condfile,
                        torch.randn(64, args.nw).to(device))

for i in range(args.niter):  # was `xrange`, which is Python 2 only
    optimizer.zero_grad()
    w.normal_(0, 1)
    z = netI(w)
    z = z.view(z.shape[0], z.shape[1], 1, 1)
    err = -logp(z).mean()
    ent = utils.sample_entropy(z)
    kl = err - ent  # KL(q || p) up to a constant: E_q[-log p(z)] - H(q)
    kl.backward()
    optimizer.step()
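
# NOTE: `utils.sample_entropy` estimates the entropy H(q) of the sampler from
# a batch of its outputs; its implementation is not shown here. One plausible
# choice is a nearest-neighbor (Kozachenko-Leonenko) estimator, sketched below
# up to additive constants, which do not affect the gradients.
import math
import torch


def sample_entropy(z, eps=1e-8):
    """Nearest-neighbor entropy estimate of a batch of samples (sketch)."""
    z = z.view(z.shape[0], -1)
    n, d = z.shape
    # Pairwise distances; push the diagonal away so a point is not its own neighbor.
    dist = torch.cdist(z, z) + 1e10 * torch.eye(n, device=z.device)
    nn_dist = dist.min(dim=1).values
    return d * torch.log(nn_dist + eps).mean() + math.log(n - 1)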
def train_via_ranking(net, train_triples, val_triples, optimizer, num_nodes,
                      train_ranker, val_ranker, num_epochs, batch_size,
                      batch_size_eval, device, margin=1, history=None,
                      save_best_to=None, dry_run=False, ranking_eval=True):
    #writer = SummaryWriter()
    if history is None:
        history = utils.History()

    loss_function = SimplifiedMarginRankingLoss(margin)

    if dry_run:  # use first batch only
        train_triples = train_triples[:batch_size]
        val_triples = val_triples[:batch_size_eval]

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))

        # -------------------- Training --------------------
        net.train()
        train_dataset = TriplesDatasetRanking(train_triples, num_nodes)
        train_loader = DataLoader(train_dataset, batch_size=batch_size,
                                  shuffle=True)
        train_loader_tqdm = tqdm(train_loader)
        batches_history = utils.History()
        #running_metrics = collections.defaultdict(lambda: 0)

        for batch, (batch_triples, batch_negative_triples) in enumerate(train_loader_tqdm):
            batch_triples = batch_triples.to(device)
            batch_negative_triples = batch_negative_triples.to(device)

            # Sanity check: Train on 0 inputs.
            #print('WARNING: Sanity check enabled')
            #batch_triples = torch.zeros_like(batch_triples)
            #batch_negative_triples = torch.zeros_like(batch_negative_triples)

            optimizer.zero_grad()
            output = net(batch_triples)
            output_negative = net(batch_negative_triples)
            loss = loss_function(output, output_negative)
            loss.backward()
            optimizer.step()

            batches_history.log_metric('loss', loss.item())
            batches_history.log_metric(
                'acc', (output > output_negative).float().mean().item())
            batches_history.log_metric(
                'mean_diff', (output - output_negative).mean().item())
            batches_history.log_metric(
                'median_diff', (output - output_negative).median().item())

            if batch % 10 == 0:
                train_loader_tqdm.set_postfix(batches_history.latest())

        #for key in running_metrics:
        #    running_metrics[key] /= len(batches)

        del batch_triples, batch_negative_triples, output, output_negative, loss
        torch.cuda.empty_cache()

        # -------------------- Testing --------------------
        net.eval()
        with torch.no_grad():
            val_dataset = TriplesDatasetRanking(val_triples, num_nodes)
            val_loader = DataLoader(val_dataset, batch_size=batch_size_eval,
                                    shuffle=False)
            val_batches_history = utils.History()

            for batch, (batch_triples, batch_negative_triples) in enumerate(val_loader):
                # TODO: Does it actually make sense to move these to CUDA?
                # They are just used as indices.
                batch_triples = batch_triples.to(device)
                batch_negative_triples = batch_negative_triples.to(device)

                output = net(batch_triples)
                output_negative = net(batch_negative_triples)
                # TODO: Especially getting the loss takes quite some time (as
                # much as a single prediction for DistMult), maybe replace it
                # by a running metric directly in torch.
                loss = loss_function(output, output_negative)

                val_batches_history.log_metric('loss', loss.item())
                val_batches_history.log_metric(
                    'acc', (output > output_negative).float().mean().item())
                val_batches_history.log_metric(
                    'mean_diff', (output - output_negative).mean().item())
                val_batches_history.log_metric(
                    'median_diff', (output - output_negative).median().item())

            del batch_triples, batch_negative_triples, output, output_negative, loss
            torch.cuda.empty_cache()

            #for key in running_metrics:
            #    running_metrics[key] /= len(batches)

        # TODO: Maybe implement these metrics in a batched fashion.
        history.log_metric('loss', batches_history.mean('loss'),
                           val_batches_history.mean('loss'), 'Loss', print_=True)
        #writer.add_scalar('test/loss', batches_history.mean('loss'), epoch)
        #writer.add_scalar('test/val_loss', val_batches_history.mean('loss'), epoch)
        history.log_metric('acc', batches_history.mean('acc'),
                           val_batches_history.mean('acc'), 'Accuracy', print_=True)
        history.log_metric('mean_diff', batches_history.mean('mean_diff'),
                           val_batches_history.mean('mean_diff'),
                           'Mean Difference', print_=True)
        history.log_metric('median_diff', batches_history.mean('median_diff'),
                           val_batches_history.mean('median_diff'),
                           'Median Difference', print_=True)

        # -------------------- Ranking --------------------
        if ranking_eval:
            mean_rank, mean_rec_rank, hits_1, hits_3, hits_10 = train_ranker(
                net, batch_size=batch_size_eval)
            val_mean_rank, val_mean_rec_rank, val_hits_1, val_hits_3, val_hits_10 = val_ranker(
                net, batch_size=batch_size_eval)

            history.log_metric('mean_rank', mean_rank, val_mean_rank,
                               'Mean Rank', print_=True)
            history.log_metric('mean_rec_rank', mean_rec_rank, val_mean_rec_rank,
                               'Mean Rec Rank', print_=True)
            history.log_metric('hits_1', hits_1, val_hits_1, 'Hits@1', print_=True)
            history.log_metric('hits_3', hits_3, val_hits_3, 'Hits@3', print_=True)
            history.log_metric('hits_10', hits_10, val_hits_10, 'Hits@10', print_=True)

        # -------------------- Saving --------------------
        if save_best_to is not None and (
                epoch == 0 or history['val_mean_rec_rank'][-1] >= np.max(
                    history['val_mean_rec_rank'][:-1])):
            # TODO: Using save on the model here directly gives an error.
            torch.save(net.state_dict(), save_best_to)
            print()
            print('Saving model after epoch {} to {}'.format(
                epoch + 1, save_best_to))

        print('-' * 80)
        print()

    return history
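
# NOTE: `SimplifiedMarginRankingLoss` is not part of this excerpt. Since it
# receives positive and negative scores plus a `margin`, a plausible reading
# is the standard hinge on the score difference, sketched below; the exact
# reduction (mean vs. sum) is an assumption.
import torch
import torch.nn as nn


class SimplifiedMarginRankingLoss(nn.Module):
    """Hinge loss pushing positive scores above negative ones by `margin` (sketch)."""

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, positive_scores, negative_scores):
        # Zero loss once positive > negative + margin; linear penalty otherwise.
        return torch.relu(self.margin - positive_scores + negative_scores).mean()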
def train_via_classification(net, train_triples, val_triples, optimizer, num_nodes,
                             train_ranker, val_ranker, num_epochs, batch_size,
                             batch_size_eval, device, history=None,
                             save_best_to=None, dry_run=False, ranking_eval=True):
    #if log_dir is not None:
    #    writer = SummaryWriter(log_dir=log_dir)
    if history is None:
        history = utils.History()

    loss_function = nn.BCEWithLogitsLoss()

    if dry_run:
        train_triples = train_triples[:batch_size]
        val_triples = val_triples[:batch_size_eval]

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))

        # -------------------- Training --------------------
        net.train()
        train_dataset = TriplesDatasetClassification(train_triples, num_nodes)
        train_loader = DataLoader(train_dataset, batch_size=batch_size,
                                  shuffle=True)
        train_loader_tqdm = tqdm(train_loader)
        batches_history = utils.History()
        #running_metrics = collections.defaultdict(lambda: 0)

        for batch, (batch_triples, batch_labels) in enumerate(train_loader_tqdm):
            batch_triples = batch_triples.to(device)
            batch_labels = batch_labels.to(device)

            # Sanity check 1: Train on 0 inputs.
            #train_loader_tqdm.set_description('WARNING: Sanity check enabled')
            #batch_triples = torch.zeros_like(batch_triples)

            # Sanity check 2: Train on 0 targets.
            #train_loader_tqdm.set_description('WARNING: Sanity check enabled')
            #batch_labels = torch.zeros_like(batch_labels)

            # Sanity check 3: Overfit on a single batch.
            #train_loader_tqdm.set_description('WARNING: Sanity check enabled')
            #if epoch == 0 and batch == 0:
            #    fixed_batch_values = batch_triples, batch_labels
            #else:
            #    batch_triples, batch_labels = fixed_batch_values

            # Sanity check 4: Overfit on a few batches.
            #train_loader_tqdm.set_description('WARNING: Sanity check enabled')
            #if epoch == 0:
            #    if batch == 0:
            #        fixed_batch_values = []
            #    if batch < 10:
            #        fixed_batch_values.append((batch_triples, batch_labels))
            #    else:
            #        break
            #else:
            #    if batch < len(fixed_batch_values):
            #        batch_triples, batch_labels = fixed_batch_values[batch]
            #    else:
            #        break

            optimizer.zero_grad()
            output = net(batch_triples)
            #print(output)
            loss = loss_function(output, batch_labels)
            loss.backward()
            optimizer.step()

            batches_history.log('loss', loss.item())
            batches_history.log('acc', (torch.sigmoid(output).round() ==
                                        batch_labels).float().mean().item())

            if batch % 10 == 0:
                train_loader_tqdm.set_postfix(batches_history.last())

        #for key in running_metrics:
        #    running_metrics[key] /= len(batches)

        del batch_triples, batch_labels, output, loss
        torch.cuda.empty_cache()

        # -------------------- Testing --------------------
        net.eval()
        with torch.no_grad():
            val_dataset = TriplesDatasetClassification(val_triples, num_nodes)
            val_loader = DataLoader(val_dataset, batch_size=batch_size_eval,
                                    shuffle=False)
            val_batches_history = utils.History()

            for batch, (batch_triples, batch_labels) in enumerate(val_loader):
                # TODO: Does it actually make sense to move these to CUDA?
                # They are just used as indices.
                batch_triples = batch_triples.to(device)
                batch_labels = batch_labels.to(device)

                output = net(batch_triples)
                loss = loss_function(output, batch_labels)

                val_batches_history.log('loss', loss.item())
                val_batches_history.log(
                    'acc', (torch.sigmoid(output).round() ==
                            batch_labels).float().mean().item())

            del batch_triples, batch_labels, output, loss
            torch.cuda.empty_cache()

            #for key in running_metrics:
            #    running_metrics[key] /= len(batches)

        history.log('loss', batches_history.mean('loss'),
                    val_batches_history.mean('loss'), print_=True)
        history.log('acc', batches_history.mean('acc'),
                    val_batches_history.mean('acc'), print_=True)
        #if log_dir is not None:
        #    writer.add_scalar('loss', batches_history.mean('loss'), epoch)
        #    writer.add_scalar('val_loss', val_batches_history.mean('loss'), epoch)
        #    writer.add_scalar('acc', batches_history.mean('acc'), epoch)
        #    writer.add_scalar('val_acc', val_batches_history.mean('acc'), epoch)

        # -------------------- Ranking --------------------
        if ranking_eval:
            mean_rank, mean_rec_rank, hits_1, hits_3, hits_10 = train_ranker(
                net, batch_size=batch_size_eval)
            val_mean_rank, val_mean_rec_rank, val_hits_1, val_hits_3, val_hits_10 = val_ranker(
                net, batch_size=batch_size_eval)

            history.log('mean_rank', mean_rank, val_mean_rank, print_=True)
            history.log('mean_rec_rank', mean_rec_rank, val_mean_rec_rank, print_=True)
            history.log('hits_1', hits_1, val_hits_1, print_=True)
            history.log('hits_3', hits_3, val_hits_3, print_=True)
            history.log('hits_10', hits_10, val_hits_10, print_=True)
            #if log_dir is not None:
            #    writer.add_scalar('mean_rank', mean_rank, epoch)
            #    writer.add_scalar('val_mean_rank', val_mean_rank, epoch)
            #    writer.add_scalar('mean_rec_rank', mean_rec_rank, epoch)
            #    writer.add_scalar('val_mean_rec_rank', val_mean_rec_rank, epoch)
            #    writer.add_scalar('hits_1', hits_1, epoch)
            #    writer.add_scalar('val_hits_1', val_hits_1, epoch)
            #    writer.add_scalar('hits_3', hits_3, epoch)
            #    writer.add_scalar('val_hits_3', val_hits_3, epoch)
            #    writer.add_scalar('hits_10', hits_10, epoch)
            #    writer.add_scalar('val_hits_10', val_hits_10, epoch)

        # -------------------- Saving --------------------
        # TODO: Pass val_mean_rec_rank as a parameter here.
        if not dry_run and save_best_to is not None and (
                epoch == 0 or history.values['val_mean_rec_rank'][-1] >=
                np.max(history.values['val_mean_rec_rank'][:-1])):
            # If there is no substring {epoch}, this has no effect.
            save_best_to = save_best_to.format(epoch=epoch)
            # TODO: Using save on the model here directly gives an error.
            torch.save(net.state_dict(), save_best_to)
            print()
            print('Saving model after epoch {} to {}'.format(
                epoch + 1, save_best_to))

        print('-' * 80)
        print()

    return history
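
# NOTE: `TriplesDatasetClassification` is defined elsewhere. For
# BCEWithLogitsLoss to apply, it presumably yields (triple, binary label)
# pairs, pairing each true triple with a randomly corrupted negative. A
# minimal sketch under that assumption (the 1:1 negative ratio and the
# head-or-tail corruption scheme are guesses):
import torch
from torch.utils.data import Dataset


class TriplesDatasetClassification(Dataset):
    """True triples with label 1, randomly corrupted ones with label 0 (sketch)."""

    def __init__(self, triples, num_nodes):
        self.triples = torch.as_tensor(triples, dtype=torch.long)
        self.num_nodes = num_nodes

    def __len__(self):
        return 2 * len(self.triples)  # one negative per positive

    def __getitem__(self, idx):
        triple = self.triples[idx % len(self.triples)].clone()
        if idx < len(self.triples):
            return triple, torch.tensor(1.0)
        # Corrupt the head or the tail with a uniformly sampled entity.
        position = 0 if torch.rand(()).item() < 0.5 else 2
        triple[position] = torch.randint(self.num_nodes, ())
        return triple, torch.tensor(0.0)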
def _common(args, mode, validation=False, val_ratio=0, aloi_file=None, **kwargs):
    data_mode = 'test' if mode == 'test' else 'train'
    database = vdao.VDAO(args.dataset_dir, args.file, mode=data_mode,
                         val_set=validation, val_ratio=val_ratio,
                         aloi_file=aloi_file)

    # Set tensorflow session configurations
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = ''
    K.set_session(tf.Session(config=config))

    print('save results: {}'.format(args.save_dir))

    # Useful metrics to record
    metrics_list = [
        metrics.fnr, metrics.fpr, metrics.distance, metrics.f1,
        metrics.tp, metrics.tn, metrics.fp, metrics.fn
    ]
    meters = {func.__name__: func for func in metrics_list}
    thresholds = kwargs.pop('thresholds', 0.5)

    arch = archs.__dict__[args.arch.lower()]
    arch_params = utils.parse_kwparams(args.arch_params)

    logger = {}
    # Apply func to data coming from all specified layers
    for layer in VDAO.LAYER_NAME:
        print('layer: {}'.format(layer))
        database.set_layer(layer)
        cross_history = utils.History()
        outputs = []
        roc = metrics.ROC() if validation is True else None

        # Apply func to each partition of the data
        for group_idx, (samples, set_size) in enumerate(
                database.load_generator(**utils.parse_kwparams(args.cv_params))):

            # Load old model or create a new one
            if args.load_model is not None:
                try:
                    model = arch(load_path=args.load_model,
                                 save_path=args.save_dir,
                                 layer=layer,
                                 group_idx=group_idx)
                except FileNotFoundError:
                    print('file not found for layer {}'.format(layer))
                    continue
            else:
                model = arch(
                    load_path=args.load_model,
                    save_path=args.save_dir,
                    layer=layer,
                    group_idx=group_idx,
                    # input_shape=samples[0][0].shape[1:],
                    input_shape=next(iter(samples.values()))[0].shape[1:],
                    weight_decay=args.weight_decay,
                    **arch_params)

            if mode == 'train':
                output = _train(args, model, samples, set_size, meters,
                                cross_history, roc=roc)
                print('\nFinished training {}'.format(group_idx + 1))
            else:
                if type(thresholds) is dict:
                    group_thresholds = thresholds[layer][group_idx]
                else:
                    group_thresholds = thresholds
                output = _eval(args, model, samples, set_size[data_mode], meters,
                               threshold=group_thresholds)

            outputs += [output]

        if mode == 'train':
            logger[layer] = {'history': cross_history}
            if roc is not None:
                logger[layer].update({'roc': roc})
        else:
            logger[layer] = {'output': outputs}

        print('\n' + '* ' * 80 + '\n\n')

    return logger
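
# NOTE: The metric functions collected in `meters` live in the project's
# `metrics` module. For reference, standard definitions of two of them,
# assuming `(y_true, y_pred)` arrays of binary labels; the actual signatures
# in the module may differ.
import numpy as np


def fnr(y_true, y_pred):
    """False-negative rate: missed positives over all actual positives."""
    fn = np.sum((y_true == 1) & (y_pred == 0))
    tp = np.sum((y_true == 1) & (y_pred == 1))
    return fn / max(fn + tp, 1)


def fpr(y_true, y_pred):
    """False-positive rate: false alarms over all actual negatives."""
    fp = np.sum((y_true == 0) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    return fp / max(fp + tn, 1)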