def train(path):
    """Train an XGBoost petrel-vs-noise classifier on the features CSV at *path*.

    Grid-searches XGBoost hyperparameters with 4-fold CV, saves the best
    estimator to '<name>_model.pkl', and logs parameters, metrics and
    figures to a Comet.ml experiment.

    Args:
        path: path to a per-recording features CSV file.
    """
    name = os.path.splitext(os.path.basename(path))[0]
    print('Processing: ', name)
    features = pd.read_csv(path, index_col=None)
    # Keep only the curated feature columns; avoid shadowing `name` above.
    selected_features_names = [feat_name for feat_name, desc in selected_features]
    features = features[selected_features_names]
    # First `split_idx` rows are a noise-only hold-out block; the rest is the
    # train/test pool.
    split_idx = 1200
    features = features.drop(['sound.files'], axis=1)
    noise_only_df, df = features.iloc[:split_idx], features.iloc[split_idx:]
    y = df.pop('petrel')
    X = df.values
    y_noise = noise_only_df.pop('petrel')
    X_noise = noise_only_df.values
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y)
    hyperparams = {
        'n_estimators': [100, 300, 500, 1000],
        'learning_rate': [0.1],
        'gamma': [0.0, 0.5],
        'max_depth': [2, 3, 4],
        'min_child_weight': [1, 2],
        'subsample': [1.0, 0.8],
        'reg_alpha': [0.0, 0.1],
        'reg_lambda': [1, 2, 3]
    }
    clf = model_selection.GridSearchCV(
        estimator=xg.XGBClassifier(objective='binary:logistic', n_jobs=-1),
        param_grid=hyperparams,
        cv=4)
    fit_params = clf.fit(X_train, y_train)
    estimator = fit_params.best_estimator_
    joblib.dump(estimator, name + '_model.pkl')
    test_pred = estimator.predict(X_test)
    metrics = calculate_metrics(test_pred, y_test)
    # Accuracy on the noise-only block measures false-positive behaviour.
    noise_pred = estimator.predict(X_noise)
    noise_detection_accuracy = accuracy_score(y_noise, noise_pred)
    # SECURITY NOTE: hard-coded API key; prefer an environment variable.
    experiment = Experiment(api_key="4PdGdUZmGf6P8QsMa5F2zB4Ui",
                            project_name="storm petrels",
                            workspace="tracewsl")
    experiment.set_name(name)
    experiment.log_parameter('name', name)
    experiment.log_multiple_params(fit_params.best_params_)
    experiment.log_multiple_metrics(metrics)
    experiment.log_metric('Noise detection accuracy', noise_detection_accuracy)
    experiment.log_figure('Confusion matrix',
                          get_confusion_matrix_figure(test_pred, y_test))
    # Fixed typo in the logged figure name: 'importnace' -> 'importance'.
    experiment.log_figure('Feature importance',
                          get_feature_importance_figure(estimator,
                                                        list(df.columns.values)))
labels = Variable(labels) # Forward + Backward + Optimize optimizer.zero_grad() outputs = rnn(images) loss = criterion(outputs, labels) loss.backward() optimizer.step() # Compute train accuracy _, predicted = torch.max(outputs.data, 1) total += labels.size(0) correct += float((predicted == labels.data).sum()) # Log accuracy to Comet.ml experiment.log_metric("accuracy", 100 * correct / total, step=step) step += 1 if (i + 1) % 100 == 0: print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' % (epoch + 1, hyper_params['num_epochs'], i + 1, len(train_dataset) // hyper_params['batch_size'], loss.data.item())) with experiment.test(): # Test the Model correct = 0 total = 0 for images, labels in test_loader: images = Variable( images.view(-1, hyper_params['sequence_length'], hyper_params['input_size']))
# --- fragment: interior of a TensorFlow char-RNN training loop; the loop over
# `n` and the sampling loop body are outside this chunk ---
hprev_val = np.zeros([1, hidden_size])
p = 0  # reset
# Prepare inputs: targets are the inputs shifted by one character.
input_vals = [char_to_ix[ch] for ch in data[p:p + seq_length]]
target_vals = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]
input_vals = one_hot(input_vals)
target_vals = one_hot(target_vals)
# One optimisation step; the returned hidden state is fed back in next time.
hprev_val, loss_val, _ = sess.run([hprev, loss, updates],
                                  feed_dict={inputs: input_vals,
                                             targets: target_vals,
                                             init_state: hprev_val})
# log the loss to Comet.ml
experiment.log_metric("loss", loss_val, step=n)
if n % 500 == 0:
    # Log Progress
    print('iter: %d, p: %d, loss: %f' % (n, p, loss_val))
    # Do sampling: generate sample_length characters starting from a random
    # seed sequence, carrying a copy of the current hidden state.
    sample_length = 200
    start_ix = random.randint(0, len(data) - seq_length)
    sample_seq_ix = [char_to_ix[ch] for ch in data[start_ix:start_ix + seq_length]]
    ixes = []
    sample_prev_state_val = np.copy(hprev_val)
    for t in range(sample_length):
def main():
    """Train an NLI sentence-pair classifier (MultiNLI or SNLI) with optional
    Comet.ml logging, periodic validation, best-model snapshots and early
    stopping."""
    args = get_args()
    hyperparams = vars(args)
    if not args.no_comet:
        experiment = Experiment(api_key="5yzCYxgDmFnt1fhJWTRQIkETT",
                                log_code=True)
        experiment.log_multiple_params(hyperparams)
    # torchtext fields: fixed-length tokenized text, non-sequential labels,
    # and a raw field carrying the sentence pair.
    text_field = data.Field(tokenize=custom_tokenizer,
                            fix_length=args.sentence_len,
                            unk_token='<**UNK**>')
    label_field = data.Field(sequential=False, unk_token=None)
    pair_field = data.RawField()
    if args.dataset == 'multinli':
        print('Loading MultiNLI Dataset')
        train = get_dataset(text_field, label_field, pair_field, 'train')
        val = get_dataset(text_field, label_field, pair_field, args.val_set)
    elif args.dataset == 'snli':
        print('Loading SNLI Dataset')
        train, val, test = datasets.SNLI.splits(text_field, label_field)
        del test  # test split is unused; free it
    else:
        raise Exception('Incorrect Dataset Specified')
    text_field.build_vocab(train, max_size=args.max_vocab_size)
    label_field.build_vocab(train, val)
    if args.word_vectors:
        text_field.vocab.load_vectors(args.word_vectors)
    # torchtext device convention here: -1 = CPU, None = current CUDA device.
    device = -1
    if args.cuda:
        device = None
    print('Generating Iterators')
    train_iter, val_iter = data.BucketIterator.splits(
        (train, val),
        batch_size=args.batch_size,
        shuffle=True,
        sort_key=sort_key,
        device=device)
    train_iter.repeat = False
    # Derive model dimensions from the built vocabularies.
    args.n_embed = len(text_field.vocab)
    args.d_out = len(label_field.vocab)
    args.n_cells = args.n_layers
    if args.bidir:
        args.n_cells *= 2  # bidirectional models double the cell count
    print(args)
    if args.load_model:
        model = torch.load(args.load_model)
    else:
        model = MODELS[args.model_type](args)
        # NOTE(review): embedding load is placed in the fresh-model branch so
        # a reloaded model keeps its trained embeddings — confirm intent.
        print('Loading Word Embeddings')
        model.embed.weight.data = text_field.vocab.vectors
    criterion = nn.CrossEntropyLoss()
    # Optimize only parameters with requires_grad (embeddings may be frozen).
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
    if args.cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    print(model)
    print('Training Model')
    best_val_acc = 0.0
    val_acc_history = []
    for epoch in range(1, args.n_epochs + 1):
        # For the DA model, unfreeze embeddings once validation accuracy
        # crosses the configured threshold.
        if (args.model_type == 'DA') and (best_val_acc >= args.DA_embed_train):
            model.embed.weight.requires_grad = True
        train_iter.init_epoch()
        for batch_ind, batch in enumerate(train_iter):
            model.train()
            optimizer.zero_grad()
            out = model(batch)
            loss = criterion(out, batch.label)
            loss.backward()
            # Clip gradients of trainable parameters to max-norm 10.
            clip_grad_norm(
                filter(lambda p: p.requires_grad, model.parameters()), 10)
            optimizer.step()
            # Periodic mid-epoch validation.
            if (batch_ind != 0) and (batch_ind % args.dev_every == 0):
                val_correct, val_loss = evaluate(val_iter, model, criterion)
                val_accuracy = 100 * val_correct / len(val)
                print(' Batch Step {}/{}, Val Loss: {:.4f}, Val Accuracy: {:.4f}'.
                      format(batch_ind, len(train) // args.batch_size, val_loss,
                             val_accuracy))
        # End-of-epoch evaluation on both splits.
        train_correct, train_loss = evaluate(train_iter, model, criterion)
        val_correct, val_loss = evaluate(val_iter, model, criterion)
        val_accuracy = 100 * val_correct / len(val)
        val_acc_history.append(val_accuracy)
        stop_training = early_stop(val_acc_history)
        if not args.no_comet:
            experiment.log_metric("Train loss", train_loss)
            experiment.log_metric("Val loss", val_loss)
            experiment.log_metric("Accuracy (val)", val_accuracy)
            experiment.log_metric("Accuracy (train)",
                                  100 * train_correct / len(train))
        # Snapshot on a new best val accuracy, but only once above 60%.
        if args.save_model and (val_accuracy > best_val_acc):
            best_val_acc = val_accuracy
            if best_val_acc > 60:
                snapshot_path = '../saved_models/Model_{}_acc_{:.4f}_epoch_{}_model.pt'.format(
                    args.model_type, val_accuracy, epoch)
                if args.cuda:
                    # Save a CPU copy for portability, then move back to GPU.
                    torch.save(model.cpu(), snapshot_path)
                    model = model.cuda()
                else:
                    torch.save(model, snapshot_path)
        print('Epoch: {}, Train Loss: {:.4f}, Val Loss: {:.4f}, Train Acc: {:.2f}, Val Acc: {:.2f}, Best Val Acc: {:.2f}'.
              format(epoch, train_loss, val_loss,
                     100 * train_correct / len(train), val_accuracy,
                     best_val_acc))
        if stop_training:
            print('Early stop triggered.')
            break
def train_traditional(hyper_params, teacher, student, sf_teacher, sf_student,
                      trainloader, valloader, args):
    """Traditional stage-wise knowledge distillation, then classifier training.

    Stages 0-1: fit the student's intermediate features to the teacher's
    (MSE loss), one stage at a time, reloading the previous stage's best
    weights. Stage 2: train the classifier head with cross-entropy.
    Every stage logs its metrics to a fresh Comet.ml experiment.
    """
    for stage in range(2):
        # Load previous stage model (except zeroth stage)
        if stage != 0:
            hyper_params['stage'] = stage - 1
            student.load_state_dict(
                torch.load(
                    get_savename(hyper_params,
                                 args.dataset,
                                 mode='traditional-stage',
                                 p=args.percentage)))
        # update hyperparams dictionary
        hyper_params['stage'] = stage
        # Freeze all stages except current stage
        student = unfreeze_trad(student, hyper_params['stage'])
        project_name = 'trad-kd-' + hyper_params[
            'dataset'] + '-' + hyper_params['model']
        experiment = Experiment(api_key="1jNZ1sunRoAoI2TyremCNnYLO",
                                project_name=project_name,
                                workspace="semseg_kd")
        experiment.log_parameters(hyper_params)
        optimizer = torch.optim.Adam(student.parameters(),
                                     lr=hyper_params['learning_rate'])
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=1e-2,
            steps_per_epoch=len(trainloader),
            epochs=hyper_params['num_epochs'])
        # Feature-matching loss for the intermediate stages.
        criterion = nn.MSELoss()
        savename = get_savename(hyper_params,
                                args.dataset,
                                mode='traditional-stage',
                                p=args.percentage)
        lowest_val_loss = 100  # large sentinel; train_stage tracks the minimum
        for epoch in range(hyper_params['num_epochs']):
            student, lowest_val_loss, train_loss, val_loss = train_stage(
                model=student,
                teacher=teacher,
                stage=hyper_params['stage'],
                sf_student=sf_student,
                sf_teacher=sf_teacher,
                train_loader=trainloader,
                val_loader=valloader,
                loss_function=criterion,
                optimiser=optimizer,
                scheduler=scheduler,
                epoch=epoch,
                num_epochs=hyper_params['num_epochs'],
                savename=savename,
                lowest_val=lowest_val_loss,
                args=args)
            experiment.log_metric('train_loss', train_loss)
            experiment.log_metric('val_loss', val_loss)
            print(round(val_loss, 6))
    # Classifier training: reload the best stage-1 weights, then train the
    # final (classifier) stage.
    hyper_params['stage'] = 1
    student.load_state_dict(
        torch.load(
            get_savename(hyper_params,
                         args.dataset,
                         mode='traditional-stage',
                         p=args.percentage)))
    hyper_params['stage'] = 2
    # Freeze all stages except current stage
    student = unfreeze_trad(student, hyper_params['stage'])
    project_name = 'trad-kd-' + hyper_params['dataset'] + '-' + hyper_params[
        'model']
    experiment = Experiment(api_key="1jNZ1sunRoAoI2TyremCNnYLO",
                            project_name=project_name,
                            workspace="semseg_kd")
    experiment.log_parameters(hyper_params)
    optimizer = torch.optim.Adam(student.parameters(),
                                 lr=hyper_params['learning_rate'])
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=1e-2,
        steps_per_epoch=len(trainloader),
        epochs=hyper_params['num_epochs'])
    # Ignore the void/unlabelled class index for each dataset.
    if hyper_params['dataset'] == 'camvid':
        criterion = nn.CrossEntropyLoss(ignore_index=11)
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=250)
        hyper_params['num_classes'] = 19
    savename = get_savename(hyper_params,
                            args.dataset,
                            mode='traditional-kd',
                            p=args.percentage)
    highest_iou = 0
    for epoch in range(hyper_params['num_epochs']):
        # NOTE(review): num_classes is hard-coded to 12 here even though
        # hyper_params['num_classes'] may be set to 19 above — confirm intent.
        student, highest_iou, train_loss, val_loss, avg_iou, avg_pixel_acc, avg_dice_coeff = train(
            model=student,
            train_loader=trainloader,
            val_loader=valloader,
            num_classes=12,
            loss_function=criterion,
            optimiser=optimizer,
            scheduler=scheduler,
            epoch=epoch,
            num_epochs=hyper_params['num_epochs'],
            savename=savename,
            highest_iou=highest_iou,
            args=args)
        experiment.log_metric('train_loss', train_loss)
        experiment.log_metric('val_loss', val_loss)
        experiment.log_metric('avg_iou', avg_iou)
        experiment.log_metric('avg_pixel_acc', avg_pixel_acc)
        experiment.log_metric('avg_dice_coeff', avg_dice_coeff)
# --- fragment: tail of a skopt search setup; the stray `random_state=42)`
# closes a call opened before this chunk ---
random_state=42)
# Persist optimizer state after each iteration so a crashed search can resume.
checkpoint_callback = skopt.callbacks.CheckpointSaver(
    f'D:\\FINKI\\8_dps\\Project\\MODELS\\skopt_checkpoints\\{EXPERIMENT_ID}.pkl'
)
hyperparameters_optimizer.fit(X_train, y_train, callback=[checkpoint_callback])
skopt.dump(hyperparameters_optimizer, f'saved_models\\{EXPERIMENT_ID}.pkl')
y_pred = hyperparameters_optimizer.best_estimator_.predict(X_test)
# Log every cross-validation candidate as its own Comet.ml experiment:
# the "params" entry becomes parameters, every other cv_results_ column a metric.
for i in range(len(hyperparameters_optimizer.cv_results_['params'])):
    exp = Experiment(
        api_key='A8Lg71j9LtIrsv0deBA0DVGcR',
        project_name=ALGORITHM,
        workspace="8_dps",
        auto_output_logging='native',
    )
    exp.set_name(f'{EXPERIMENT_ID}_{i+1}')
    exp.add_tags([
        DS,
        SEGMENTS_LENGTH,
    ])
    for k, v in hyperparameters_optimizer.cv_results_.items():
        if k == "params":
            exp.log_parameters(dict(v[i]))
        else:
            exp.log_metric(k, v[i])
    exp.end()

#%%
# score mae_calculator_d1.eval(d1.cpu().detach().numpy(), d1_label.cpu().detach().numpy()) mae_calculator_d2.eval(d2.cpu().detach().numpy(), d2_label.cpu().detach().numpy()) mae_calculator_d3.eval(d3.cpu().detach().numpy(), d3_label.cpu().detach().numpy()) mae_calculator_final.eval(d.cpu().detach().numpy(), d1_label.cpu().detach().numpy()) print("count ", mae_calculator_d1.count) print("d1_val ", mae_calculator_d1.get_mae()) print("d2_val ", mae_calculator_d2.get_mae()) print("d3_val ", mae_calculator_d3.get_mae()) print("dfinal_val ", mae_calculator_final.get_mae()) experiment.log_metric("d1_val", mae_calculator_d1.get_mae()) experiment.log_metric("d2_val", mae_calculator_d2.get_mae()) experiment.log_metric("d3_val", mae_calculator_d3.get_mae()) experiment.log_metric("dfinal_val", mae_calculator_final.get_mae()) exit() while current_epoch < TOTAL_EPOCH: experiment.log_current_epoch(current_epoch) current_epoch += 1 print("start epoch ", current_epoch) loss_sum = 0 sample = 0 start_time = time() counting = 0 for train_img, label in train_loader_pacnn: net.train()
def main(cmd=None, stdout=True):
    """Run an active-learning dialogue-state-tracking experiment.

    Builds a model id from the CLI args, optionally (re)trains a seed model,
    then alternates label solicitation (per the chosen strategy) with
    re-fitting until the environment ends, logging to Comet.ml throughout.

    Args:
        cmd: optional argv-style input forwarded to get_args.
        stdout: forwarded to get_args.
    """
    args = get_args(cmd, stdout)
    # Encode the full configuration in the model id / log filename.
    model_id = "seed_{}_strat_{}_noise_fn_{}_noise_fp_{}_num_passes_{}_seed_size_{}_model_{}_batch_size_{}_gamma_{}_label_budget_{}_epochs_{}".format(
        args.seed, args.strategy, args.noise_fn, args.noise_fp,
        args.num_passes, args.seed_size, args.model, args.batch_size,
        args.gamma, args.label_budget, args.epochs)
    # Log to a per-run file and mirror to stdout.
    logging.basicConfig(
        filename="{}/{}.txt".format(args.dout, model_id),
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO)
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logger = Experiment(comet_ml_key, project_name="ActiveDialogue")
    logger.log_parameters(vars(args))
    # NOTE(review): model_arch stays unbound if args.model is neither "glad"
    # nor "gce" — PartialEnv below would then raise NameError.
    if args.model == "glad":
        model_arch = GLAD
    elif args.model == "gce":
        model_arch = GCE
    env = PartialEnv(load_dataset, model_arch, args)
    if args.seed_size:
        with logger.train():
            if not env.load('seed'):
                logging.info("No loaded seed. Training now.")
                env.seed_fit(args.seed_epochs, prefix="seed")
                logging.info("Seed completed.")
            else:
                logging.info("Loaded seed.")
                if args.force_seed:
                    logging.info("Training seed regardless.")
                    env.seed_fit(args.seed_epochs, prefix="seed")
        env.load('seed')
    use_strategy = False
    if args.strategy == "entropy":
        use_strategy = True
        strategy = partial_entropy
    elif args.strategy == "bald":
        use_strategy = True
        strategy = partial_bald
    if use_strategy:
        # Wrap the raw scoring function in a thresholding policy.
        if args.threshold_strategy == "fixed":
            strategy = FixedThresholdStrategy(strategy, args, True)
        elif args.threshold_strategy == "variable":
            strategy = VariableThresholdStrategy(strategy, args, True)
        elif args.threshold_strategy == "randomvariable":
            strategy = StochasticVariableThresholdStrategy(
                strategy, args, True)
    ended = False
    i = 0
    initial_metrics = env.metrics(True)
    logger.log_current_epoch(i)
    logging.info("Initial metrics: {}".format(initial_metrics))
    for k, v in initial_metrics.items():
        logger.log_metric(k, v)
    with logger.train():
        while not ended:
            i += 1
            # Observe environment state
            logger.log_current_epoch(i)
            if env.can_label:
                # Obtain label request from strategy
                # (BALD needs multiple stochastic forward passes).
                obs, preds = env.observe(20 if args.strategy == "bald" else 1)
                if args.strategy != "bald":
                    preds = preds[0]
                if args.strategy == "aggressive":
                    label_request = aggressive(preds)
                elif args.strategy == "random":
                    label_request = random(preds)
                elif args.strategy == "passive":
                    label_request = passive(preds)
                elif use_strategy:
                    label_request = strategy.observe(preds)
                else:
                    raise ValueError()
                # Label solicitation
                labeled = env.label(label_request)
                if use_strategy:
                    # Feed (requested, total) counts back to adapt the threshold.
                    strategy.update(
                        sum([
                            np.sum(s.flatten())
                            for s in label_request.values()
                        ]),
                        sum([
                            np.sum(np.ones_like(s).flatten())
                            for s in label_request.values()
                        ]))
            else:
                break
            # Environment stepping
            ended = env.step()
            # Fit every al_batch of items
            best = env.fit(prefix=model_id, reset_model=True)
            for k, v in best.items():
                logger.log_metric(k, v)
            env.load(prefix=model_id)
    # Final fit
    final_metrics = env.fit(epochs=args.final_epochs,
                            prefix="final_fit_" + model_id,
                            reset_model=True)
    for k, v in final_metrics.items():
        logger.log_metric("Final " + k, v)
        logging.info("Final " + k + ": " + str(v))
    logging.info("Run finished.")
def main():
    """Train a vanilla GAN on MNIST, logging losses and samples to Comet.ml.

    Trains DiscriminatorNet/GeneratorNet with BCE loss and Adam, plots
    generated samples from a fixed noise batch after each epoch, and saves
    both models' weights at the end.
    """
    # SECURITY NOTE: hard-coded API key; prefer an environment variable.
    experiment = Experiment(api_key="1x1ZQpvbtvDyO2s5DrlUyYpzv",
                            project_name="GAN1",
                            workspace="verlyn-fischer")
    discriminator_path = 'models/discriminator.pth'
    generator_path = 'models/generator.pth'

    # Load data and create a loader so we can iterate over it.
    data = mnist_data()
    data_loader = torch.utils.data.DataLoader(data, batch_size=100, shuffle=True)

    discriminator = DiscriminatorNet()
    generator = GeneratorNet()
    d_optimizer = optim.Adam(discriminator.parameters(), lr=0.0002)
    g_optimizer = optim.Adam(generator.parameters(), lr=0.0002)
    loss = nn.BCELoss()

    # Fixed noise batch so per-epoch sample plots are comparable.
    num_test_samples = 16
    test_noise = noise(num_test_samples)

    # Total number of epochs to train
    num_epochs = 70
    for epoch in range(num_epochs):
        print(f'Epoch: {epoch}')
        for n_batch, (real_batch, _) in enumerate(data_loader):
            N = real_batch.size(0)
            # 1. Train Discriminator. Fake data is detached so no generator
            # gradients are computed in this step.
            real_data = Variable(images_to_vectors(real_batch))
            fake_data = generator(noise(N)).detach()
            d_error, d_pred_real, d_pred_fake = \
                train_discriminator(d_optimizer, real_data, fake_data,
                                    discriminator, loss)
            # 2. Train Generator on freshly generated (non-detached) data.
            fake_data = generator(noise(N))
            g_error = train_generator(g_optimizer, fake_data, discriminator,
                                      loss)
            # Log batch errors to Comet.ml.
            experiment.log_metric("d_error", d_error, step=n_batch)
            experiment.log_metric("g_error", g_error, step=n_batch)
        # Plot test images after each epoch.
        test_images = vectors_to_images(generator(test_noise))
        test_images = test_images.data
        plot_test_images(test_images, experiment, False)
    # Save models and log final images.
    torch.save(discriminator.state_dict(), discriminator_path)
    # BUG FIX: previously the discriminator's state_dict was written to
    # generator_path; save the generator's weights instead.
    torch.save(generator.state_dict(), generator_path)
    test_images = vectors_to_images(generator(test_noise))
    test_images = test_images.data
    plot_test_images(test_images, experiment, True)
class CometMLMonitor(MonitorBase):
    """
    Send scalar data and the graph to https://www.comet.ml.

    Note:
        1. comet_ml requires you to `import comet_ml` before importing tensorflow or tensorpack.
        2. The "automatic output logging" feature of comet_ml will make the training progress bar
           appear to freeze. Therefore the feature is disabled by default.
    """

    def __init__(self, experiment=None, tags=None, **kwargs):
        """
        Args:
            experiment (comet_ml.Experiment): if provided, invalidate all other arguments
            tags (list[str]): experiment tags
            kwargs: arguments used to initialize :class:`comet_ml.Experiment`,
                such as project name, API key, etc.
                Refer to its documentation for details.

        Raises:
            ValueError: if both `experiment` and any of `tags`/`kwargs` are given.
        """
        if experiment is not None:
            # Explicit validation instead of `assert`: asserts are stripped
            # under `python -O`, silently ignoring the conflicting arguments.
            if tags is not None or len(kwargs) != 0:
                raise ValueError(
                    "When `experiment` is provided, `tags` and other "
                    "keyword arguments must not be used!")
            self._exp = experiment
        else:
            from comet_ml import Experiment
            kwargs.setdefault(
                'log_code', True
            )  # though it's not functioning, git patch logging requires it
            kwargs.setdefault('auto_output_logging', None)
            self._exp = Experiment(**kwargs)
            if tags is not None:
                self._exp.add_tags(tags)
        self._exp.set_code("Code logging is impossible ...")
        self._exp.log_dependency('tensorpack', __git_version__)

    @property
    def experiment(self):
        """
        The :class:`comet_ml.Experiment` instance.
        """
        return self._exp

    def _before_train(self):
        # Upload the model graph once, before training starts.
        self._exp.set_model_graph(tf.get_default_graph())

    @HIDE_DOC
    def process_scalar(self, name, val):
        self._exp.log_metric(name, val, step=self.global_step)

    @HIDE_DOC
    def process_image(self, name, val):
        self._exp.set_step(self.global_step)
        for idx, v in enumerate(val):
            # Suffix with the index only when logging more than one image.
            log_name = "{}_step{}{}".format(
                name, self.global_step,
                "_" + str(idx) if len(val) > 1 else "")
            self._exp.log_image(v,
                                image_format="jpeg",
                                name=log_name,
                                image_minmax=(0, 255))

    def _after_train(self):
        self._exp.end()

    def _after_epoch(self):
        self._exp.log_epoch_end(self.epoch_num)
# --- fragment: cross-validate one autoencoder per dataset (3 total) and log
# per-dataset and overall mean scores ---
groups = data_reader.groups
all_scores = []
for i in range(3):
    ae = Autoencoder(config[i]["encoder"],
                     config[i]["decoder"],
                     input_shape=input_shapes[i],
                     latent_shape=latent_shape,
                     loss="mean_squared_error",
                     optimizer_params=None)
    experiment.log_multiple_params(config[i])
    # Grouped CV: `groups` keeps related samples in the same fold.
    scores = ae.cross_validate(data[i],
                               groups,
                               experiment=experiment,
                               epochs=10000,
                               n_splits=4,
                               log_prefix=f"dataset_{i}_")
    all_scores.append(scores)
    mean_scores = np.mean(scores)
    experiment.log_metric(f"mean_scores_{i}", mean_scores)
    experiment.log_other(f"scores_{i}", scores)
experiment.log_metric(f"mean_all_scores", np.mean(all_scores))
print(all_scores)
experiment.add_tag("vanilla-resnet") from torch import nn, optim from tqdm import trange from film_test.resnet import resnet18 from film_test.traintest import train, test, device EPOCHS = 24 net = resnet18(num_classes=2) net = net.to(device) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) trainloader, testloader = qa_cifar() for epoch in trange(EPOCHS): experiment.log_metric("epoch", epoch) train( net, trainloader, epoch, optimizer, criterion, qa=True, comet=experiment) test(net, testloader, criterion, qa=True, comet=experiment)
# --- fragment: ResNet-32 CIFAR-10 training driver with best-accuracy tracking ---
net = ResNet_32(BasicBlock, [7, 7, 7], num_classes=10).cuda()
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)
epoch = 185  # total number of epochs
batch_size = 128
num_workers = 16
train_loader = get_training_dataloader(batch_size, num_workers)
test_loader = get_test_dataloader(batch_size, num_workers)
# Drop the learning rate 10x at epochs 95 and 140.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [95, 140], gamma=0.1)
start_time = time.time()
step = 0
best_acc = 0.0
for i in range(1, epoch + 1):
    step = train(net, i, train_loader, optimizer, criterion, step)
    t_acc = eval_training(net, i, test_loader, step)
    if t_acc > best_acc:
        best_acc = t_acc
    scheduler.step()
    # Log the post-step learning rate for this epoch.
    experiment.log_metric('lr', value=optimizer.param_groups[0]['lr'], step=i)
end_time = time.time()
print("time cost: %.4f min" % (float(end_time - start_time) / 60))
print("best test_set acc : %.4f" % best_acc)
def train_net(net):
    """Train an equilibrium-propagation network and log metrics to Comet.ml.

    Per mini-batch: run the free phase, measure energy/cost/error, then run a
    weakly clamped phase (with a randomly signed beta) to accumulate weight
    updates. Validates after each epoch and saves the network's parameters.
    All hyperparameters come from net.hyperparameters.
    """
    path = net.path
    hidden_sizes = net.hyperparameters["hidden_sizes"]
    n_epochs = net.hyperparameters["n_epochs"]
    batch_size = net.hyperparameters["batch_size"]
    n_it_neg = net.hyperparameters["n_it_neg"]  # n_iterations in free phase
    n_it_pos = net.hyperparameters[
        "n_it_pos"]  # n_iterations in weekly clamped phase
    epsilon = net.hyperparameters["epsilon"]
    beta = net.hyperparameters["beta"]
    alphas = net.hyperparameters["alphas"]
    # Echo the configuration before training starts.
    print("name = %s" % (path))
    print("architecture = 784-" +
          "-".join([str(n) for n in hidden_sizes]) + "-10")
    print("number of epochs = %i" % (n_epochs))
    print("batch_size = %i" % (batch_size))
    print("n_it_neg = %i" % (n_it_neg))
    print("n_it_pos = %i" % (n_it_pos))
    print("epsilon = %.1f" % (epsilon))
    print("beta = %.1f" % (beta))
    print("learning rates: " + " ".join(
        ["alpha_W%i=%.3f" % (k + 1, alpha)
         for k, alpha in enumerate(alphas)]) + "\n")
    experiment = Experiment(project_name='eqprop')
    experiment.log_parameters({
        'original_implementation': True,
        'net_type': 1,
        'max_steps': n_it_neg,
        'use_predictors': False
    })
    # 50000 training and 10000 validation samples, split into mini-batches.
    n_batches_train = 50000 // batch_size
    n_batches_valid = 10000 // batch_size
    start_time = time.perf_counter()
    for epoch in range(n_epochs):
        ### TRAINING ###
        # CUMULATIVE SUM OF TRAINING ENERGY, TRAINING COST AND TRAINING ERROR
        measures_sum = [0., 0., 0.]
        gW = [0.] * len(alphas)
        for index in range(n_batches_train):
            # CHANGE THE INDEX OF THE MINI BATCH (= CLAMP X AND INITIALIZE THE
            # HIDDEN AND OUTPUT LAYERS WITH THE PERSISTENT PARTICLES)
            net.change_mini_batch_index(index)
            # FREE PHASE
            net.free_phase(n_it_neg, epsilon)
            # MEASURE THE ENERGY, COST AND ERROR AT THE END OF THE FREE PHASE
            # RELAXATION
            measures = net.measure()
            measures_sum = [
                measure_sum + measure
                for measure_sum, measure in zip(measures_sum, measures)
            ]
            measures_avg = [
                measure_sum / (index + 1) for measure_sum in measures_sum
            ]
            # measures_avg[-1] corresponds to the error rate, which we want in percentage
            measures_avg[-1] *= 100.
            stdout.write("\repoch-%2i-train-%5i E=%.1f C=%.5f error=%.3f%%" %
                         (epoch, (index + 1) * batch_size, measures_avg[0],
                          measures_avg[1], measures_avg[2]))
            stdout.flush()
            _step = epoch * n_batches_train + index
            experiment.log_metric('energy', measures_avg[0], step=_step)
            experiment.log_metric('cost', measures_avg[1], step=_step)
            experiment.log_metric('accuracy', 100 - measures_avg[2], step=_step)
            # WEAKLY CLAMPED PHASE
            sign = 2 * np.random.randint(0, 2) - 1  # random sign +1 or -1
            beta = np.float32(sign * beta)  # choose the sign of beta at random
            Delta_logW = net.weakly_clamped_phase(n_it_pos, epsilon, beta,
                                                  *alphas)
            gW = [
                gW1 + Delta_logW1 for gW1, Delta_logW1 in zip(gW, Delta_logW)
            ]
        stdout.write("\n")
        # Report the mean relative weight change per layer for this epoch.
        dlogW = [100. * gW1 / n_batches_train for gW1 in gW]
        print(" " + " ".join([
            "dlogW%i=%.3f%%" % (k + 1, dlogW1)
            for k, dlogW1 in enumerate(dlogW)
        ]))
        net.training_curves["training error"].append(measures_avg[-1])
        ### VALIDATION ###
        # CUMULATIVE SUM OF VALIDATION ENERGY, VALIDATION COST AND VALIDATION ERROR
        measures_sum = [0., 0., 0.]
        for index in range(n_batches_valid):
            # CHANGE THE INDEX OF THE MINI BATCH (= CLAMP X AND INITIALIZE THE
            # HIDDEN AND OUTPUT LAYERS WITH THE PERSISTENT PARTICLES)
            net.change_mini_batch_index(n_batches_train + index)
            # FREE PHASE
            net.free_phase(n_it_neg, epsilon)
            # MEASURE THE ENERGY, COST AND ERROR AT THE END OF THE FREE PHASE
            # RELAXATION
            measures = net.measure()
            measures_sum = [
                measure_sum + measure
                for measure_sum, measure in zip(measures_sum, measures)
            ]
            measures_avg = [
                measure_sum / (index + 1) for measure_sum in measures_sum
            ]
            # measures_avg[-1] corresponds to the error rate, which we want in percentage
            measures_avg[-1] *= 100.
            stdout.write("\r valid-%5i E=%.1f C=%.5f error=%.2f%%" %
                         ((index + 1) * batch_size, measures_avg[0],
                          measures_avg[1], measures_avg[2]))
            stdout.flush()
        stdout.write("\n")
        net.training_curves["validation error"].append(measures_avg[-1])
        duration = (time.perf_counter() - start_time) / 60.
        print((" duration=%.1f min" % (duration)))
        # SAVE THE PARAMETERS OF THE NETWORK AT THE END OF THE EPOCH
        net.save_params()
# --- fragment: main train/test loop with best-accuracy tracking; the accuracy
# plot at the end is cut off by the chunk boundary ---
best_train_acc = 0
best_test_acc = 0
for epoch in range(start_epoch, start_epoch + 2000):
    print(model_name)
    train_loss, train_acc = train(epoch)
    test_loss, test_acc = test(epoch)
    training_loss_list.append(train_loss)
    testing_loss_list.append(test_loss)
    training_acc_list.append(train_acc)
    testing_acc_list.append(test_acc)
    if (train_acc > best_train_acc):
        best_train_acc = train_acc
    if (test_acc > best_test_acc):
        best_test_acc = test_acc
    # Comet epochs are 1-based, hence epoch + 1.
    experiment.log_metric("best_train_acc", best_train_acc, epoch=epoch + 1)
    experiment.log_metric("best_test_acc", best_test_acc, epoch=epoch + 1)
    experiment.log_metric("train_acc", train_acc, epoch=epoch + 1)
    experiment.log_metric("test_acc", test_acc, epoch=epoch + 1)
    # NOTE(review): plotting placed inside the epoch loop (consistent with
    # overwrite=True re-uploading the figure each epoch) — confirm intent.
    plt.plot(training_loss_list, color='blue', label='Training')
    plt.plot(testing_loss_list, color='red', label='Testing', alpha=.5)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Loss plot')
    plt.legend()
    plt.savefig("./loss_plot_" + model_name + ".png", format='png')
    experiment.log_figure(figure=plt, figure_name='loss_plot', overwrite=True)
    plt.close()
    plt.plot(training_acc_list, color='blue', label='Training')
# --- fragment: GAN optimizer steps plus per-step loss logging; the final
# log_metric call is truncated at the chunk boundary ---
dis_optimizer.step()
gen_optimizer.zero_grad()
# retain_graph=True: presumably the graph is shared with the discriminator
# loss backward above — confirm against the full training step.
gen_loss.backward(retain_graph=True)
gen_optimizer.step()
# end='\r' keeps the progress line in place; flush so it shows immediately.
print(
    "(Global Step {}) (Epoch {}) (Step {}) (Img Dis Loss {}) (Img Gen Loss {}) (Seq Dis Loss {}) (Seq Gen Loss {}) (Dis Loss {}) (Gen Loss {})"
    .format(global_step, epoch, i, img_dis_loss.item(), img_gen_loss.item(),
            seq_dis_loss.item(), seq_gen_loss.item(), dis_loss.item(),
            gen_loss.item()),
    end='\r',
    flush=True)
experiment.log_metric("img_dis_loss", img_dis_loss.item(), step=global_step)
experiment.log_metric("img_gen_loss", img_gen_loss.item(), step=global_step)
experiment.log_metric("seq_dis_loss", seq_dis_loss.item(), step=global_step)
experiment.log_metric("seq_gen_loss", seq_gen_loss.item(), step=global_step)
experiment.log_metric("dis_loss", dis_loss.item(), step=global_step)
experiment.log_metric("gen_loss", gen_loss.item(),
# --- fragment: RPN3D evaluation session; the stray `)` below closes a call
# opened before this chunk ---
)
with tf.Session(config=config) as sess:
    model = RPN3D(
        cls=cfg.DETECT_OBJ,
        single_batch_size=args.single_batch_size,
        avail_gpus=cfg.GPU_AVAILABLE.split(',')
    )
    # Restore the latest checkpoint if one exists.
    if tf.train.get_checkpoint_state(save_model_dir):
        print("Reading model parameters from %s" % save_model_dir)
        model.saver.restore(
            sess, tf.train.latest_checkpoint(save_model_dir))
    counter = 0
    with experiment.test():
        for batch in iterate_data(
                val_dir,
                shuffle=False,
                aug=False,
                is_testset=False,
                batch_size=args.single_batch_size * cfg.GPU_USE_COUNT,
                multi_gpu_sum=cfg.GPU_USE_COUNT):
            experiment.log_metric("counter", counter)
            # With --vis, predict_step also returns visualisation images.
            if args.vis:
                tags, results, front_images, bird_views, heatmaps = model.predict_step(
                    sess, batch, summary=False, vis=True)
            else:
                tags, results = model.predict_step(sess, batch, summary=False,
                                                   vis=False)
            # ret: A, B
            # A: (N) tag
            # B: (N, N') (class, x, y, z, h, w, l, rz, score)
            # Write one KITTI-style label file per tag.
            for tag, result in zip(tags, results):
                of_path = os.path.join(args.output_path, 'data', tag + '.txt')
                with open(of_path, 'w+') as f:
                    labels = box3d_to_label([result[:, 1:8]], [result[:, 0]],
                                            [result[:, -1]],
                                            coordinate='lidar')[0]
                    for line in labels:
                        f.write(line)
# --- fragment: periodic checkpointing and per-class IoU logging; the first
# line closes a format() call begun before this chunk ---
step, mean_IU)
filepath = os.path.join(args.snapshot_dir, filename)
save_checkpoint(model.student, filepath, optimizer=None, meta=None)
# Extra snapshots every 10000 steps plus a few early milestones.
if step % 10000 == 0 or step in [100, 200, 300, 1000]:
    filename = 'CS_scenes_step-{:d}_mIU-{:.4f}.pth'.format(
        step, mean_IU)
    filepath = os.path.join(args.snapshot_dir, filename)
    torch.save(model.student.state_dict(), filepath)
    filename = 'mmseg_step-{:d}_mIU-{:.4f}.pth'.format(
        step, mean_IU)
    filepath = os.path.join(args.snapshot_dir, filename)
    save_checkpoint(model.student, filepath, optimizer=None, meta=None)
    # checkpoint = {'state_dict': weights_to_cpu(get_state_dict(model.student))}
    # torch.save(checkpoint, filename)
# Comet logging only when an API key was supplied on the command line.
if args.api_key:
    experiment.log_metric('mean_IU', mean_IU, step=step)
    for i in range(len(trainset.class_name)):
        experiment.log_metric(trainset.class_name[i], IU_array[i], step=step)
val_log.close()
# --- fragment: optional repulsive-constraint optimisation step, then model
# saving; the dict literal and `else` below belong to an `if` above this chunk ---
    'batch_repulsive': br,
    'bandwidth_repulsive': bandwidth_repulsive,
    'lambda_repulsive': args.lambda_repulsive
}
else:
    kwargs = {}
data, target = data.cpu(), target.cpu()
# The repulsive constraint is enabled only when --repulsive was given.
info_batch = optimize(net,
                      optimizer,
                      batch=(data, target),
                      add_repulsive_constraint=args.repulsive is not None,
                      **kwargs)
step += 1
for k, v in info_batch.items():
    experiment.log_metric('train_{}'.format(k), v, step=step)

# Save the model; refuse to overwrite an existing checkpoint.
if not Path.exists(savepath / 'models'):
    os.makedirs(savepath / 'models')
model_path = savepath / 'models' / '{}_{}epochs.pt'.format(
    model_name, epoch + 1)
if not Path.exists(model_path):
    torch.save(net.state_dict(), model_path)
else:
    raise ValueError(
        'Error trying to save file at location {}: File already exists'.format(
            model_path))
def train(normal_digit, anomalies, folder, file, p_train, p_test):
    """Train a PCA reconstruction-based anomaly detector on MNIST.

    One digit class is treated as "normal"; the classes in *anomalies*
    contaminate the train/test sets at rates *p_train* / *p_test*.
    Samples are flagged as anomalous by reconstruction error.  Metrics are
    printed, logged to a (disabled) Comet experiment, and appended to
    ``results_<file>.csv`` inside *folder*.

    Args:
        normal_digit: digit class considered normal.
        anomalies: iterable of digit classes used as contamination.
        folder: output directory for the results CSV.
        file: suffix of the results CSV file name.
        p_train: contamination rate of the training set.
        p_test: contamination rate of the test set; also reused as ALPHA,
            the significance level of the detection threshold.
    """
    # Create an experiment (disabled=True: nothing is uploaded)
    experiment = Experiment(project_name="deep-stats-thesis",
                            workspace="stecaron",
                            disabled=True)
    experiment.add_tag("mnist_kpca")

    # General parameters
    DOWNLOAD_MNIST = True
    PATH_DATA = os.path.join(os.path.expanduser("~"), 'Downloads/mnist')

    # Define training parameters
    hyper_params = {
        "TRAIN_SIZE": 2000,
        "TRAIN_NOISE": p_train,
        "TEST_SIZE": 800,
        "TEST_NOISE": p_test,
        # on which class we want to learn outliers
        "CLASS_SELECTED": [normal_digit],
        # which class we want to corrupt our dataset with
        "CLASS_CORRUPTED": anomalies,
        "INPUT_DIM": 28 * 28,  # In the case of MNIST
        "ALPHA": p_test,  # level of significance for the test
        # hyperparameters gamma in rbf kPCA
        "GAMMA": [1],
        "N_COMP": [30]
    }

    # Log experiment parameters
    experiment.log_parameters(hyper_params)

    # Load data
    train_data, test_data = load_mnist(PATH_DATA, download=DOWNLOAD_MNIST)

    # Normalize data
    train_data.data = train_data.data / 255.
    test_data.data = test_data.data / 255.

    # Build "train" and "test" datasets by sampling majority (normal) and
    # minority (corrupted) indices, then concatenating.
    # FIX: use .targets throughout — the .train_labels/.test_labels aliases
    # are deprecated (and removed in newer torchvision); the original code
    # already mixed both spellings.
    id_maj_train = numpy.random.choice(
        numpy.where(numpy.isin(train_data.targets,
                               hyper_params["CLASS_SELECTED"]))[0],
        int((1 - hyper_params["TRAIN_NOISE"]) * hyper_params["TRAIN_SIZE"]),
        replace=False)
    id_min_train = numpy.random.choice(
        numpy.where(numpy.isin(train_data.targets,
                               hyper_params["CLASS_CORRUPTED"]))[0],
        int(hyper_params["TRAIN_NOISE"] * hyper_params["TRAIN_SIZE"]),
        replace=False)
    id_train = numpy.concatenate((id_maj_train, id_min_train))

    id_maj_test = numpy.random.choice(
        numpy.where(numpy.isin(test_data.targets,
                               hyper_params["CLASS_SELECTED"]))[0],
        int((1 - hyper_params["TEST_NOISE"]) * hyper_params["TEST_SIZE"]),
        replace=False)
    id_min_test = numpy.random.choice(
        numpy.where(numpy.isin(test_data.targets,
                               hyper_params["CLASS_CORRUPTED"]))[0],
        int(hyper_params["TEST_NOISE"] * hyper_params["TEST_SIZE"]),
        replace=False)
    id_test = numpy.concatenate((id_min_test, id_maj_test))

    train_data.data = train_data.data[id_train]
    train_data.targets = train_data.targets[id_train]
    test_data.data = test_data.data[id_test]
    test_data.targets = test_data.targets[id_test]

    # Binary targets: True marks a corrupted (anomalous) sample
    train_data.targets = numpy.isin(train_data.targets,
                                    hyper_params["CLASS_CORRUPTED"])
    test_data.targets = numpy.isin(test_data.targets,
                                   hyper_params["CLASS_CORRUPTED"])

    # Flatten the data and transform to numpy array
    train_data.data = train_data.data.view(-1, 28 * 28).numpy()
    test_data.data = test_data.data.view(-1, 28 * 28).numpy()

    # Train PCA.  (An rbf kernel-PCA + grid-search variant existed here but
    # was disabled upstream; plain PCA is what actually runs.)
    param_grid = [{"n_components": hyper_params["N_COMP"]}]
    kpca = PCA()
    kpca.fit(train_data.data)
    X_kpca = kpca.transform(train_data.data)
    X_train_back = kpca.inverse_transform(X_kpca)
    X_test_back = kpca.inverse_transform(kpca.transform(test_data.data))

    # Compute the distance between original data and reconstruction
    dist_train = numpy.linalg.norm(train_data.data - X_train_back, ord=2,
                                   axis=1)
    dist_test = numpy.linalg.norm(test_data.data - X_test_back, ord=2,
                                  axis=1)

    # Test performances on train: flag the ALPHA fraction with the largest
    # reconstruction error
    train_anomalies_ind = numpy.argsort(dist_train)[int(
        (1 - hyper_params["ALPHA"]) *
        hyper_params["TRAIN_SIZE"]):int(hyper_params["TRAIN_SIZE"])]
    train_predictions = numpy.zeros(hyper_params["TRAIN_SIZE"])
    train_predictions[train_anomalies_ind] = 1
    train_recall = metrics.recall_score(train_data.targets, train_predictions)
    train_precision = metrics.precision_score(train_data.targets,
                                              train_predictions)
    train_f1_score = metrics.f1_score(train_data.targets, train_predictions)
    train_auc = metrics.roc_auc_score(train_data.targets, train_predictions)

    print(f"Train Precision: {train_precision}")
    print(f"Train Recall: {train_recall}")
    print(f"Train F1 Score: {train_f1_score}")
    print(f"Train AUC: {train_auc}")

    experiment.log_metric("train_precision", train_precision)
    experiment.log_metric("train_recall", train_recall)
    experiment.log_metric("train_f1_score", train_f1_score)
    experiment.log_metric("train_auc", train_auc)

    # Test performances on test: empirical p-value of each test distance
    # w.r.t. the train distance distribution
    test_probs = numpy.array(
        [numpy.sum(xi >= dist_train) / len(dist_train) for xi in dist_test],
        dtype=float)
    test_anomalies_ind = numpy.argwhere(
        test_probs >= 1 - hyper_params["ALPHA"])
    test_predictions = numpy.zeros(hyper_params["TEST_SIZE"])
    test_predictions[test_anomalies_ind] = 1
    test_recall = metrics.recall_score(test_data.targets, test_predictions)
    test_precision = metrics.precision_score(test_data.targets,
                                             test_predictions)
    test_f1_score = metrics.f1_score(test_data.targets, test_predictions)
    test_auc = metrics.roc_auc_score(test_data.targets, test_probs)
    test_average_precision = metrics.average_precision_score(
        test_data.targets, test_predictions)

    print(f"Test Precision: {test_precision}")
    print(f"Test Recall: {test_recall}")
    print(f"Test F1 Score: {test_f1_score}")
    print(f"Test AUC: {test_auc}")
    print(f"Test average Precision: {test_average_precision}")

    experiment.log_metric("test_precision", test_precision)
    experiment.log_metric("test_recall", test_recall)
    experiment.log_metric("test_f1_score", test_f1_score)
    experiment.log_metric("test_auc", test_auc)
    experiment.log_metric("test_average_precision", test_average_precision)

    # Save the results in the output file
    col_names = [
        "timestamp", "precision", "recall", "f1_score", "average_precision",
        "auc"
    ]
    results_file = os.path.join(folder, "results_" + file + ".csv")
    if os.path.exists(results_file):
        df_results = pandas.read_csv(results_file, names=col_names, header=0)
    else:
        df_results = pandas.DataFrame(columns=col_names)

    # FIX: DataFrame.append was removed in pandas 2.0 — use pandas.concat.
    # This also stops numpy.concatenate from coercing the numeric metrics
    # to strings before they reach the CSV.
    new_row = pandas.DataFrame([[
        datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y-%m-%d %H:%M:%S'),
        test_precision, test_recall, test_f1_score,
        test_average_precision, test_auc
    ]], columns=col_names)
    df_results = pandas.concat([df_results, new_row], ignore_index=True)
    df_results.to_csv(results_file)
class Logger:
    """Accumulates per-round / per-episode simulation statistics and
    optionally forwards them to a Comet.ml experiment.

    When ``send_logs`` is False no Comet experiment is created and all
    remote-logging calls are skipped; local accumulators (``sent_mb``,
    ``current_speed``, ``stations``) are still updated.
    """

    def __init__(self, send_logs, tags, parameters, experiment=None):
        # default station count until the first round reports a real value
        self.stations = 5
        self.send_logs = send_logs
        if self.send_logs:
            if experiment is None:
                # Credentials come from the first comet_token.json found
                # anywhere below the working directory.
                json_loc = glob.glob("./**/comet_token.json")[0]
                with open(json_loc, "r") as f:
                    kwargs = json.load(f)
                self.experiment = Experiment(**kwargs)
            else:
                self.experiment = experiment
        self.sent_mb = 0
        # rolling window of per-round megabytes used for throughput estimate
        self.speed_window = deque(maxlen=100)
        self.step_time = None
        self.current_speed = 0
        # FIX: define last_speed up-front so reading it before the first
        # call to log_episode() no longer raises AttributeError.
        self.last_speed = 0
        if self.send_logs:
            if tags is not None:
                self.experiment.add_tags(tags)
            if parameters is not None:
                self.experiment.log_parameters(parameters)

    def begin_logging(self, episode_count, steps_per_ep, sigma, theta, step_time):
        """Record run-level parameters at the start of training."""
        self.step_time = step_time
        if self.send_logs:
            self.experiment.log_parameter("Episode count", episode_count)
            self.experiment.log_parameter("Steps per episode", steps_per_ep)
            self.experiment.log_parameter("theta", theta)
            self.experiment.log_parameter("sigma", sigma)

    def log_round(self, states, reward, cumulative_reward, info, loss,
                  observations, step):
        """Log one round's statistics.

        ``info`` is an iterable of ``"mb|cw|stations|fairness"`` strings,
        one per agent; fields are averaged across agents.
        """
        if self.send_logs:
            # BUG FIX: this call was previously unguarded and crashed with
            # AttributeError when send_logs=False (self.experiment is only
            # created in that mode).
            self.experiment.log_histogram_3d(states, name="Observations",
                                             step=step)
        info = [[j for j in i.split("|")] for i in info]
        info = np.mean(np.array(info, dtype=np.float32), axis=0)
        try:
            round_mb = info[0]
        except Exception as e:
            print(info)
            print(reward)
            raise e
        self.speed_window.append(round_mb)
        self.current_speed = np.mean(
            np.asarray(self.speed_window) / self.step_time)
        self.sent_mb += round_mb
        CW = info[1]
        self.stations = info[2]
        fairness = info[3]
        if self.send_logs:
            self.experiment.log_metric("Round reward", np.mean(reward),
                                       step=step)
            self.experiment.log_metric("Per-ep reward",
                                       np.mean(cumulative_reward), step=step)
            self.experiment.log_metric("Megabytes sent", self.sent_mb,
                                       step=step)
            self.experiment.log_metric("Round megabytes sent", round_mb,
                                       step=step)
            self.experiment.log_metric("Chosen CW", CW, step=step)
            self.experiment.log_metric("Station count", self.stations,
                                       step=step)
            self.experiment.log_metric("Current throughput",
                                       self.current_speed, step=step)
            self.experiment.log_metric("Fairness index", fairness, step=step)
            for i, obs in enumerate(observations):
                self.experiment.log_metric(f"Observation {i}", obs, step=step)
            self.experiment.log_metrics(loss, step=step)

    def log_episode(self, cumulative_reward, speed, step):
        """Log end-of-episode totals and reset per-episode accumulators."""
        if self.send_logs:
            self.experiment.log_metric("Cumulative reward", cumulative_reward,
                                       step=step)
            self.experiment.log_metric("Speed", speed, step=step)
        self.sent_mb = 0
        self.last_speed = speed
        self.speed_window = deque(maxlen=100)
        self.current_speed = 0

    def end(self):
        """Finish the Comet experiment, if one was created."""
        if self.send_logs:
            self.experiment.end()
running_loss += loss.item() total_batches = total_batches + 1.0 # running loss intervals div = 125 if i == 0: div = 1 if i % div == 0 and i != 0: # print every 1024 mini-batches print( '[%d, %d] E[loss]: %.20f loss: %.20f acc 1: %.20f acc 5: %.20f]' % (epoch, i, float(expected_loss) / (float(i + 1)), float(running_loss) / float(div), float(acc_1) / float(total_batches), float(acc_5) / float(total_batches))) experiment.log_metric('Epoch', epoch) experiment.log_metric('Training_iteration', i) experiment.log_metric( 'E[loss]', float(expected_loss) / float(i + 1)) experiment.log_metric('Running_loss', float(running_loss) / float(div)) experiment.log_metric( 'Acc@1', float(acc_1) / float(total_batches)) experiment.log_metric( 'Acc@5', float(acc_5) / float(total_batches)) experiment.log_metric( 'Acc@10', float(acc_10) / float(total_batches))
class CorefSolver():
    """Trainer/decoder for a coreference-aware transformer summarizer.

    Wraps model construction (`make_model`), the training loop with
    periodic validation (`train`), greedy/beam-search inference (`test`,
    `beam_decode`) and beam unpacking (`_from_beam`).  Relies on
    project-local components (data_utils, Encoder/Decoder stacks, Beam,
    comet_ml Experiment) and requires CUDA (`model.cuda()`).
    """

    def __init__(self, args):
        # args: parsed CLI namespace; kept whole because many methods read it.
        self.args = args
        self.data_utils = data_utils(args)
        self.disable_comet = args.disable_comet
        self.model = self.make_model(
            src_vocab=self.data_utils.vocab_size,
            tgt_vocab=self.data_utils.vocab_size,
            N=args.num_layer,
            dropout=args.dropout,
            entity_encoder_type=args.entity_encoder_type)
        print(self.model)
        if self.args.train:
            # training-only artifacts: text log and checkpoint directory
            self.outfile = open(self.args.logfile, 'w')
            self.model_dir = make_save_dir(args.model_dir)
            # self.logfile = os.path.join(args.logdir, args.exp_name)
            # self.log = SummaryWriter(self.logfile)
        self.w_valid_file = args.w_valid_file

    def make_model(self, src_vocab, tgt_vocab, N=6, dropout=0.1, d_model=512,
                   entity_encoder_type='linear', d_ff=2048, h=8):
        "Helper: Construct a model from hyperparameters."
        c = copy.deepcopy
        attn = MultiHeadedAttention(h, d_model)
        # single-head attention used by the NER-fusion decoder layers
        attn_ner = MultiHeadedAttention(1, d_model, dropout)
        ff = PositionwiseFeedForward(d_model, d_ff, dropout)
        position = PositionalEncoding(d_model, dropout)
        embed = Embeddings(d_model, src_vocab)
        word_embed = nn.Sequential(embed, c(position))
        print('pgen', self.args.pointer_gen)
        # Choose how coreference-cluster (entity) text is encoded.
        if entity_encoder_type == 'transformer':
            # entity_encoder = nn.Sequential(embed, Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), 1))
            print('transformer')
            entity_encoder = Seq_Entity_Encoder(
                embed,
                Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), 2))
        elif entity_encoder_type == 'albert':
            albert_tokenizer = AlbertTokenizer.from_pretrained(
                'albert-base-v2')
            albert = AlbertModel.from_pretrained('albert-base-v2')
            entity_encoder = Albert_Encoder(albert, albert_tokenizer, d_model)
        elif entity_encoder_type == 'gru':
            entity_encoder = RNNEncoder(embed, 'GRU', d_model, d_model,
                                        num_layers=1, dropout=0.1,
                                        bidirectional=True)
            print('gru')
        elif entity_encoder_type == 'lstm':
            entity_encoder = RNNEncoder(embed, 'LSTM', d_model, d_model,
                                        num_layers=1, dropout=0.1,
                                        bidirectional=True)
            print('lstm')
        # NOTE: no branch for the default 'linear' type — entity_encoder
        # would be unbound for it; presumably only the four types above are
        # used in practice (confirm against the CLI argument choices).
        if self.args.ner_at_embedding:
            # NER features injected at the embedding layer
            model = EncoderDecoderOrg(
                Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
                DecoderOrg(
                    DecoderLayerOrg(d_model, c(attn), c(attn), c(ff),
                                    dropout), N, d_model, tgt_vocab,
                    self.args.pointer_gen), word_embed, word_embed,
                entity_encoder)
        else:
            if self.args.ner_last:
                decoder = Decoder(
                    DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout),
                    N, d_model, tgt_vocab, self.args.pointer_gen,
                    self.args.ner_last)
            else:
                # fused NER attention inside every decoder layer
                decoder = Decoder(
                    DecoderLayer_ner(d_model, c(attn), c(attn), attn_ner,
                                     c(ff), dropout, self.args.fusion), N,
                    d_model, tgt_vocab, self.args.pointer_gen,
                    self.args.ner_last)
            model = EncoderDecoder(
                Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
                decoder, word_embed, word_embed, entity_encoder)
        # This was important from their code.
        # Initialize parameters with Glorot / fan_avg.
        for p in model.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # levels = 3
        # num_chans = [d_model] * (args.levels)
        # k_size = 5
        # tcn = TCN(embed, d_model, num_channels, k_size, dropout=dropout)
        return model.cuda()

    def train(self):
        """Run the training loop: Comet setup, Noam-style LR schedule,
        periodic console/Comet logging and periodic validation with
        checkpointing."""
        if not self.disable_comet:
            # logging
            hyper_params = {
                "num_layer": self.args.num_layer,
                "pointer_gen": self.args.pointer_gen,
                "ner_last": self.args.ner_last,
                "entity_encoder_type": self.args.entity_encoder_type,
                "fusion": self.args.fusion,
                "dropout": self.args.dropout,
            }
            COMET_PROJECT_NAME = 'summarization'
            COMET_WORKSPACE = 'timchen0618'
            self.exp = Experiment(
                api_key='mVpNOXSjW7eU0tENyeYiWZKsl',
                project_name=COMET_PROJECT_NAME,
                workspace=COMET_WORKSPACE,
                auto_output_logging='simple',
                auto_metric_logging=None,
                display_summary=False,
            )
            self.exp.log_parameters(hyper_params)
            self.exp.add_tags([
                '%s entity_encoder' % self.args.entity_encoder_type,
                self.args.fusion
            ])
            if self.args.ner_last:
                self.exp.add_tag('ner_last')
            if self.args.ner_at_embedding:
                self.exp.add_tag('ner_at_embedding')
            self.exp.set_name(self.args.exp_name)
            self.exp.add_tag('coreference')
        print('ner_last ', self.args.ner_last)
        print('ner_at_embedding', self.args.ner_at_embedding)
        # dataloader & optimizer
        data_yielder = self.data_utils.data_yielder(num_epoch=100)
        optim = torch.optim.Adam(self.model.parameters(),
                                 lr=1e-7,
                                 betas=(0.9, 0.998),
                                 eps=1e-8,
                                 amsgrad=True)  #get_std_opt(self.model)
        # entity_optim = torch.optim.Adam(self.entity_encoder.parameters(), lr=1e-7, betas=(0.9, 0.998), eps=1e-8, amsgrad=True)
        total_loss = []
        start = time.time()
        print('*' * 50)
        print('Start Training...')
        print('*' * 50)
        start_step = 0
        # if loading from checkpoint
        if self.args.load_model:
            state_dict = torch.load(self.args.load_model)['state_dict']
            self.model.load_state_dict(state_dict)
            print("Loading model from " + self.args.load_model + "...")
            # encoder_state_dict = torch.load(self.args.entity_encoder)['state_dict']
            # self.entity_encoder.load_state_dict(encoder_state_dict)
            # print("Loading entity_encoder from %s" + self.args.entity_encoder + "...")
            start_step = int(torch.load(self.args.load_model)['step'])
            print('Resume training from step %d ...' % start_step)
        warmup_steps = 10000
        d_model = 512
        lr = 1e-7
        for step in range(start_step, self.args.total_steps):
            self.model.train()
            batch = data_yielder.__next__()
            optim.zero_grad()
            # entity_optim.zero_grad()
            #update lr — Noam-style warmup/decay, refreshed every 400 steps
            if step % 400 == 1:
                lr = (1 / (d_model**0.5)) * min(
                    (1 / (step / 4)**0.5), step * (1 / (warmup_steps**1.5)))
                for param_group in optim.param_groups:
                    param_group['lr'] = lr
                # for param_group in entity_optim.param_groups:
                #     param_group['lr'] = lr
            batch['src'] = batch['src'].long()
            batch['tgt'] = batch['tgt'].long()
            batch['ner'] = batch['ner'].long()
            batch['src_extended'] = batch['src_extended'].long()
            # forward the model
            if self.args.entity_encoder_type == 'albert':
                # tokenize raw entity text with ALBERT's own tokenizer
                d = self.model.entity_encoder.tokenizer.batch_encode_plus(
                    batch['ner_text'],
                    return_attention_masks=True,
                    max_length=10,
                    add_special_tokens=False,
                    pad_to_max_length=True,
                    return_tensors='pt')
                ner_mask = d['attention_mask'].cuda().unsqueeze(1)
                ner = d['input_ids'].cuda()
                # print('ner', ner.size())
                # print('ner_mask', ner_mask.size())
                # print('src_mask', batch['src_mask'].size())
            if self.args.entity_encoder_type == 'gru' or self.args.entity_encoder_type == 'lstm':
                ner_feat = self.model.entity_encoder(
                    batch['ner'].transpose(0, 1), batch['cluster_len'])[1]
            elif self.args.entity_encoder_type == 'transformer':
                mask = gen_mask(batch['cluster_len'])
                ner_feat = self.model.entity_encoder(batch['ner'], mask)
            # NOTE(review): the flattened source is ambiguous about whether
            # this padding call sits inside the 'transformer' branch or at
            # this level; placed here so gru/lstm also get ner/ner_mask, but
            # the 'albert' path would then hit an unbound ner_feat — confirm
            # against the upstream repository.
            ner, ner_mask = self.data_utils.pad_ner_feature(
                ner_feat.squeeze(), batch['num_clusters'],
                batch['src'].size(0))
            # print('ner', ner.size())
            # print('ner_mask', ner_mask.size())
            if self.args.ner_at_embedding:
                out = self.model.forward(batch['src'], batch['tgt'], ner,
                                         batch['src_mask'],
                                         batch['tgt_mask'],
                                         batch['src_extended'],
                                         len(batch['oov_list']))
            else:
                out = self.model.forward(batch['src'], batch['tgt'], ner,
                                         batch['src_mask'],
                                         batch['tgt_mask'],
                                         batch['src_extended'],
                                         len(batch['oov_list']), ner_mask)
            # print out info (first sample of the batch, for logging below)
            pred = out.topk(1, dim=-1)[1].squeeze().detach().cpu().numpy()[0]
            gg = batch['src_extended'].long().detach().cpu().numpy()[0][:100]
            tt = batch['tgt'].long().detach().cpu().numpy()[0]  # unused below
            yy = batch['y'].long().detach().cpu().numpy()[0]
            #compute loss & update
            loss = self.model.loss_compute(out, batch['y'].long())
            loss.backward()
            optim.step()
            # entity_optim.step()
            total_loss.append(loss.detach().cpu().numpy())
            # logging information
            if step % self.args.print_every_steps == 1:
                elapsed = time.time() - start
                print("Epoch Step: %d Loss: %f Time: %f lr: %6.6f" %
                      (step, np.mean(total_loss), elapsed,
                       optim.param_groups[0]['lr']))
                self.outfile.write("Epoch Step: %d Loss: %f Time: %f\n" %
                                   (step, np.mean(total_loss), elapsed))
                print(
                    'src:\n',
                    self.data_utils.id2sent(gg, False, False,
                                            batch['oov_list']))
                print(
                    'tgt:\n',
                    self.data_utils.id2sent(yy, False, False,
                                            batch['oov_list']))
                print(
                    'pred:\n',
                    self.data_utils.id2sent(pred, False, False,
                                            batch['oov_list']))
                print('oov_list:\n', batch['oov_list'])
                # sample a greedy decode of the first batch element
                if ner_mask != None and not self.args.ner_at_embedding:
                    pp = self.model.greedy_decode(
                        batch['src_extended'].long()[:1], ner[:1],
                        batch['src_mask'][:1], 100, self.data_utils.bos,
                        len(batch['oov_list']), self.data_utils.vocab_size,
                        True, ner_mask[:1])
                else:
                    pp = self.model.greedy_decode(
                        batch['src_extended'].long()[:1], ner[:1],
                        batch['src_mask'][:1], 100, self.data_utils.bos,
                        len(batch['oov_list']), self.data_utils.vocab_size,
                        True)
                pp = pp.detach().cpu().numpy()
                print(
                    'pred_greedy:\n',
                    self.data_utils.id2sent(pp[0], False, False,
                                            batch['oov_list']))
                print()
                start = time.time()
                if not self.disable_comet:
                    # self.log.add_scalar('Loss/train', np.mean(total_loss), step)
                    self.exp.log_metric('Train Loss', np.mean(total_loss),
                                        step=step)
                    self.exp.log_metric('Learning Rate',
                                        optim.param_groups[0]['lr'],
                                        step=step)
                    self.exp.log_text('Src: ' + self.data_utils.id2sent(
                        gg, False, False, batch['oov_list']))
                    self.exp.log_text('Tgt:' + self.data_utils.id2sent(
                        yy, False, False, batch['oov_list']))
                    self.exp.log_text('Pred:' + self.data_utils.id2sent(
                        pred, False, False, batch['oov_list']))
                    self.exp.log_text('Pred Greedy:' + self.data_utils.id2sent(
                        pp[0], False, False, batch['oov_list']))
                    self.exp.log_text('OOV:' + ' '.join(batch['oov_list']))
                total_loss = []
            ##########################
            # validation
            ##########################
            if step % self.args.valid_every_steps == 2:
                print('*' * 50)
                print('Start Validation...')
                print('*' * 50)
                self.model.eval()
                val_yielder = self.data_utils.data_yielder(1, valid=True)
                total_loss = []
                fw = open(self.w_valid_file, 'w')
                for batch in val_yielder:
                    with torch.no_grad():
                        batch['src'] = batch['src'].long()
                        batch['tgt'] = batch['tgt'].long()
                        batch['ner'] = batch['ner'].long()
                        batch['src_extended'] = batch['src_extended'].long()
                        ### ner ######
                        if self.args.entity_encoder_type == 'albert':
                            d = self.model.entity_encoder.tokenizer.batch_encode_plus(
                                batch['ner_text'],
                                return_attention_masks=True,
                                max_length=10,
                                add_special_tokens=False,
                                pad_to_max_length=True,
                                return_tensors='pt')
                            ner_mask = d['attention_mask'].cuda().unsqueeze(1)
                            ner = d['input_ids'].cuda()
                        if self.args.entity_encoder_type == 'gru' or self.args.entity_encoder_type == 'lstm':
                            ner_feat = self.model.entity_encoder(
                                batch['ner'].transpose(0, 1),
                                batch['cluster_len'])[1]
                        elif self.args.entity_encoder_type == 'transformer':
                            mask = gen_mask(batch['cluster_len'])
                            ner_feat = self.model.entity_encoder(
                                batch['ner'], mask)
                        # NOTE(review): same placement ambiguity as in the
                        # training loop above — confirm upstream.
                        ner, ner_mask = self.data_utils.pad_ner_feature(
                            ner_feat.squeeze(), batch['num_clusters'],
                            batch['src'].size(0))
                        ### ner ######
                        if self.args.ner_at_embedding:
                            out = self.model.forward(batch['src'],
                                                     batch['tgt'], ner,
                                                     batch['src_mask'],
                                                     batch['tgt_mask'],
                                                     batch['src_extended'],
                                                     len(batch['oov_list']))
                        else:
                            out = self.model.forward(batch['src'],
                                                     batch['tgt'], ner,
                                                     batch['src_mask'],
                                                     batch['tgt_mask'],
                                                     batch['src_extended'],
                                                     len(batch['oov_list']),
                                                     ner_mask)
                        loss = self.model.loss_compute(out, batch['y'].long())
                        total_loss.append(loss.item())
                        if self.args.ner_at_embedding:
                            pred = self.model.greedy_decode(
                                batch['src_extended'].long(), ner,
                                batch['src_mask'], self.args.max_len,
                                self.data_utils.bos, len(batch['oov_list']),
                                self.data_utils.vocab_size)
                        else:
                            pred = self.model.greedy_decode(
                                batch['src_extended'].long(), ner,
                                batch['src_mask'], self.args.max_len,
                                self.data_utils.bos, len(batch['oov_list']),
                                self.data_utils.vocab_size,
                                ner_mask=ner_mask)
                        for l in pred:
                            # l[1:] strips the BOS token
                            sentence = self.data_utils.id2sent(
                                l[1:], True, self.args.beam_size != 1,
                                batch['oov_list'])
                            fw.write(sentence)
                            fw.write("\n")
                fw.close()
                # files_rouge = FilesRouge()
                # scores = files_rouge.get_scores(self.w_valid_file, self.args.valid_tgt_file, avg=True)
                scores = cal_rouge_score(self.w_valid_file,
                                         self.args.valid_ref_file)
                r1_score = scores['rouge1']
                r2_score = scores['rouge2']
                print('=============================================')
                print('Validation Result -> Loss : %6.6f' %
                      (sum(total_loss) / len(total_loss)))
                print(scores)
                print('=============================================')
                self.outfile.write(
                    '=============================================\n')
                self.outfile.write('Validation Result -> Loss : %6.6f\n' %
                                   (sum(total_loss) / len(total_loss)))
                self.outfile.write(
                    '=============================================\n')
                # self.model.train()
                # self.log.add_scalar('Loss/valid', sum(total_loss)/len(total_loss), step)
                # self.log.add_scalar('Score/valid', r1_score, step)
                if not self.disable_comet:
                    self.exp.log_metric('Valid Loss',
                                        sum(total_loss) / len(total_loss),
                                        step=step)
                    self.exp.log_metric('R1 Score', r1_score, step=step)
                    self.exp.log_metric('R2 Score', r2_score, step=step)
                #Saving Checkpoint — name encodes step (in 10k units),
                #validation loss and ROUGE scores
                w_step = int(step / 10000)
                print('Saving ' + str(w_step) + 'w_model.pth!\n')
                self.outfile.write('Saving ' + str(w_step) + 'w_model.pth\n')
                model_name = str(w_step) + 'w_' + '%6.6f' % (
                    sum(total_loss) / len(total_loss)
                ) + '%2.3f_' % r1_score + '%2.3f_' % r2_score + 'model.pth'
                state = {'step': step, 'state_dict': self.model.state_dict()}
                torch.save(state, os.path.join(self.model_dir, model_name))
                # entity_encoder_name = str(w_step) + '0w_' + '%6.6f'%(sum(total_loss)/len(total_loss)) + '%2.3f_'%r1_score + 'entity_encoder.pth'
                # state = {'step': step, 'state_dict': self.entity_encoder.state_dict()}
                # torch.save(state, os.path.join(self.model_dir, entity_encoder_name))

    def test(self):
        """Decode the test set with greedy search (beam_size == 1) or beam
        search, writing one sentence per line to args.pred_dir/args.filename."""
        #prepare model
        path = self.args.load_model
        # entity_encoder_path = self.args.entity_encoder
        state_dict = torch.load(path)['state_dict']
        max_len = self.args.max_len
        model = self.model
        model.load_state_dict(state_dict)
        # entity_encoder_dict = torch.load(entity_encoder_path)['state_dict']
        # self.entity_encoder.load_state_dict(entity_encoder_dict)
        pred_dir = make_save_dir(self.args.pred_dir)
        filename = self.args.filename
        #start decoding
        data_yielder = self.data_utils.data_yielder(num_epoch=1)
        total_loss = []
        start = time.time()
        #file
        f = open(os.path.join(pred_dir, filename), 'w')
        self.model.eval()
        # (commented-out upstream) OpenNMT-style BeamSearch decode_strategy
        # construction was here.
        step = 0
        for batch in data_yielder:
            #print(batch['src'].data.size())
            step += 1
            if step % 100 == 0:
                print('%d batch processed. Time elapsed: %f min.' %
                      (step, (time.time() - start) / 60.0))
                start = time.time()
            ### ner ###
            if self.args.entity_encoder_type == 'albert':
                d = self.model.entity_encoder.tokenizer.batch_encode_plus(
                    batch['ner_text'],
                    return_attention_masks=True,
                    max_length=10,
                    add_special_tokens=False,
                    pad_to_max_length=True,
                    return_tensors='pt')
                ner_mask = d['attention_mask'].cuda().unsqueeze(1)
                ner = d['input_ids'].cuda()
            else:
                ner_mask = None
                ner = batch['ner'].long()
            with torch.no_grad():
                if self.args.beam_size == 1:
                    if self.args.ner_at_embedding:
                        out = self.model.greedy_decode(
                            batch['src_extended'].long(),
                            self.model.entity_encoder(ner),
                            batch['src_mask'], max_len, self.data_utils.bos,
                            len(batch['oov_list']),
                            self.data_utils.vocab_size)
                    else:
                        out = self.model.greedy_decode(
                            batch['src_extended'].long(),
                            self.model.entity_encoder(ner),
                            batch['src_mask'], max_len, self.data_utils.bos,
                            len(batch['oov_list']),
                            self.data_utils.vocab_size,
                            ner_mask=ner_mask)
                else:
                    ret = self.beam_decode(batch, max_len,
                                           len(batch['oov_list']))
                    out = ret['predictions']
            for l in out:
                sentence = self.data_utils.id2sent(l[1:], True,
                                                   self.args.beam_size != 1,
                                                   batch['oov_list'])
                #print(l[1:])
                f.write(sentence)
                f.write("\n")

    def beam_decode(self, batch, max_len, oov_nums):
        """Beam-search decode a single-example batch (asserts batch_size==1).

        Returns the dict produced by _from_beam: {"predictions", "scores"}.
        """
        src = batch['src'].long()
        src_mask = batch['src_mask']
        src_extended = batch['src_extended'].long()
        bos_token = self.data_utils.bos
        beam_size = self.args.beam_size
        vocab_size = self.data_utils.vocab_size
        batch_size = src.size(0)

        # tile a tensor beam_size times along dim 0 (3-D / 2-D variants)
        def rvar(a):
            return a.repeat(beam_size, 1, 1)

        def rvar2(a):
            return a.repeat(beam_size, 1)

        # flatten / unflatten the (beam, batch) leading dimensions
        def bottle(m):  # currently unused
            return m.view(batch_size * beam_size, -1)

        def unbottle(m):
            return m.view(beam_size, batch_size, -1)

        ### ner ###
        if self.args.entity_encoder_type == 'albert':
            d = self.model.entity_encoder.tokenizer.batch_encode_plus(
                batch['ner_text'],
                return_attention_masks=True,
                max_length=10,
                add_special_tokens=False,
                pad_to_max_length=True,
                return_tensors='pt')
            ner_mask = d['attention_mask'].cuda().unsqueeze(1)
            ner = d['input_ids'].cuda()
        else:
            ner_mask = None
            ner = batch['ner'].long()
        ner = self.model.entity_encoder(ner)
        if self.args.ner_at_embedding:
            memory = self.model.encode(src, src_mask, ner)
        else:
            memory = self.model.encode(src, src_mask)
        assert batch_size == 1
        beam = [
            Beam(beam_size,
                 self.data_utils.pad,
                 bos_token,
                 self.data_utils.eos,
                 min_length=self.args.min_length) for i in range(batch_size)
        ]
        # replicate encoder state across the beam
        memory = rvar(memory)
        ner = rvar(ner)
        src_mask = rvar(src_mask)
        src_extended = rvar2(src_extended)
        for i in range(self.args.max_len):
            if all((b.done() for b in beam)):
                break
            # Construct batch x beam_size nxt words.
            # Get all the pending current beam words and arrange for forward.
            inp = torch.stack([b.get_current_state()
                               for b in beam]).t().contiguous().view(-1, 1)
            #inp -> [1, 3]
            # map copied OOV ids (>= vocab_size) to 0 before embedding lookup
            inp_mask = inp < self.data_utils.vocab_size
            inp = inp * inp_mask.long()
            decoder_input = inp
            if self.args.ner_at_embedding:
                final_dist = self.model.decode(memory, ner, src_mask,
                                               decoder_input, None,
                                               src_extended, oov_nums)
            else:
                final_dist = self.model.decode(memory, ner, src_mask,
                                               decoder_input, None,
                                               src_extended, oov_nums,
                                               ner_mask=ner_mask)
            # final_dist, decoder_hidden, attn_dist_p, p_gen = self.seq2seq_model.model_copy.decoder(
            #     decoder_input, decoder_hidden,
            #     post_encoder_outputs, post_enc_padding_mask,
            #     extra_zeros, post_enc_batch_extend_vocab
            # )
            # # Run one step.
            # print('inp', inp.size())
            # decoder_outputs: beam x rnn_size
            # (b) Compute a vector of batch*beam word scores.
            out = unbottle(final_dist)
            out[:, :, 2] = 0  #no unk
            # out.size -> [3, 1, vocab]
            # (c) Advance each beam.
            for j, b in enumerate(beam):
                b.advance(out[:, j])
                # decoder_hidden = self.beam_update(j, b.get_current_origin(), beam_size, decoder_hidden)
        # (4) Extract sentences from beam.
        ret = self._from_beam(beam)
        return ret

    def _from_beam(self, beam):
        """Unpack finished Beam objects into n-best hypothesis/score lists."""
        ret = {"predictions": [], "scores": []}
        for b in beam:
            n_best = self.args.n_best
            scores, ks = b.sort_finished(minimum=n_best)
            hyps = []
            for i, (times, k) in enumerate(ks[:n_best]):
                hyp = b.get_hyp(times, k)
                hyps.append(hyp)
            ret["predictions"].append(hyps)
            ret["scores"].append(scores)
        return ret
def train(rank, defparams, hyper):
    """Distributed WGAN-GP training loop for conditional hadron-shower generation.

    Trains a conditional generator (``aG``) against a critic (``aD``) with a
    gradient penalty, optionally alongside a supervised energy regressor
    (``aE``, "calibration") and a post-processing network (``aP``).  Metrics
    and example shower images are logged to a Comet.ml experiment.

    Parameters
    ----------
    rank : int
        Process rank hint; overwritten below from ``SLURM_PROCID``.
    defparams : dict
        Run configuration (experiment tag, seed, restore flags/paths,
        output path, DDP rendezvous file, ...).
    hyper : dict
        Hyperparameters (learning rates, batch size, latent size ``z``,
        loss weights, critic/generator update counts, ...).
    """
    # Defensive copies so the caller's dicts are never mutated.
    params = {}
    for param in defparams.keys():
        params[param] = defparams[param]
    hyperp = {}
    for hp in hyper.keys():
        hyperp[hp] = hyper[hp]

    experiment = Experiment(api_key="keGmeIz4GfKlQZlOP6cit4QOi",
                            project_name="hadron-shower",
                            workspace="engineren")
    experiment.add_tag(params['exp'])
    experiment.log_parameters(hyperp)

    device = torch.device("cuda")
    torch.manual_seed(params["seed"])

    # DDP bootstrap: topology comes from the SLURM environment; processes
    # rendezvous through a shared file.
    world_size = int(os.environ["SLURM_NNODES"])
    rank = int(os.environ["SLURM_PROCID"])
    dist.init_process_group(backend='nccl',
                            world_size=world_size,
                            rank=rank,
                            init_method=params["DDP_init_file"])

    # Networks: critic, generator, energy regressor, post-processor.
    aD = DCGAN_D(hyperp["ndf"]).to(device)
    aG = DCGAN_G(hyperp["ngf"], hyperp["z"]).to(device)
    aE = energyRegressor().to(device)
    aP = PostProcess_Size1Conv_EcondV2(48, 13, 3, 128, bias=True,
                                       out_funct='none').to(device)

    optimizer_g = torch.optim.Adam(aG.parameters(), lr=hyperp["L_gen"],
                                   betas=(0.5, 0.9))
    optimizer_d = torch.optim.Adam(aD.parameters(), lr=hyperp["L_crit"],
                                   betas=(0.5, 0.9))
    optimizer_e = torch.optim.SGD(aE.parameters(), lr=hyperp["L_calib"])
    optimizer_p = torch.optim.Adam(aP.parameters(), lr=hyperp["L_post"],
                                   betas=(0.5, 0.9))

    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    # Initialize Amp (mixed precision).  Two losses: critic and generator.
    models, optimizers = amp.initialize([aG, aD], [optimizer_g, optimizer_d],
                                        opt_level="O1", num_losses=2)
    #aD = nn.DataParallel(aD)
    #aG = nn.DataParallel(aG)
    #aE = nn.DataParallel(aE)
    aG, aD = models
    optimizer_g, optimizer_d = optimizers

    # Wrap all nets for multi-node training (one visible GPU per process).
    aG = nn.parallel.DistributedDataParallel(aG, device_ids=[0])
    aD = nn.parallel.DistributedDataParallel(aD, device_ids=[0])
    aE = nn.parallel.DistributedDataParallel(aE, device_ids=[0])
    aP = nn.parallel.DistributedDataParallel(aP, device_ids=[0])

    experiment.set_model_graph(str(aG), overwrite=False)
    experiment.set_model_graph(str(aD), overwrite=False)

    if params["restore_pp"]:
        aP.load_state_dict(
            torch.load(params["restore_path_PP"] + params["post_saved"],
                       map_location=torch.device(device)))

    if params["restore"]:
        # Resume GAN training from a checkpoint.
        checkpoint = torch.load(params["restore_path"])
        aG.load_state_dict(checkpoint['Generator'])
        aD.load_state_dict(checkpoint['Critic'])
        optimizer_g.load_state_dict(checkpoint['G_optimizer'])
        optimizer_d.load_state_dict(checkpoint['D_optimizer'])
        itr = checkpoint['iteration']
    else:
        aG.apply(weights_init)
        aD.apply(weights_init)
        itr = 0

    # Energy regressor: train from scratch (c0) or load pre-trained (c1).
    if params["c0"]:
        aE.apply(weights_init)
    elif params["c1"]:
        aE.load_state_dict(
            torch.load(params["calib_saved"],
                       map_location=torch.device(device)))

    one = torch.tensor(1.0).to(device)
    mone = (one * -1).to(device)  # used to flip the sign of lossP's gradient below

    print('loading data...')
    paths_list = [
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part1.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part2.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part3.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part4.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part5.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part6.hdf5',
        '/beegfs/desy/user/eren/data_generator/pion/hcal_only/pion40part7.hdf5'
    ]
    train_data = PionsDataset(paths_list, core=True)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_data, num_replicas=world_size, rank=rank)
    dataloader = DataLoader(train_data,
                            batch_size=hyperp["batch_size"],
                            num_workers=0,
                            shuffle=False,
                            drop_last=True,
                            pin_memory=True,
                            sampler=train_sampler)
    print('done')

    #scheduler_g = optim.lr_scheduler.StepLR(optimizer_g, step_size=1, gamma=params["gamma_g"])
    #scheduler_d = optim.lr_scheduler.StepLR(optimizer_d, step_size=1, gamma=params["gamma_crit"])
    #scheduler_e = optim.lr_scheduler.StepLR(optimizer_e, step_size=1, gamma=params["gamma_calib"])
    #writer = SummaryWriter()

    e_criterion = nn.L1Loss()  # for energy regressor training

    dataiter = iter(dataloader)
    BATCH_SIZE = hyperp["batch_size"]
    LATENT = hyperp["z"]
    EXP = params["exp"]
    KAPPA = hyperp["kappa"]
    LAMBD = hyperp["lambda"]
    ## Post-Processing loss weights
    LDP = hyperp["LDP"]
    wMMD = hyperp["wMMD"]
    wMSE = hyperp["wMSE"]
    ## IO paths
    OUTP = params['output_path']

    for iteration in range(50000):
        iteration += itr + 1  # continue the iteration count when restored

        #---------------------TRAIN D------------------------
        for p in aD.parameters():  # reset requires_grad
            p.requires_grad_(True)  # they are set to False below in training G
        for e in aE.parameters():  # reset requires_grad (constrainer)
            e.requires_grad_(True)  # they are set to False below in training G

        for i in range(hyperp["ncrit"]):
            aD.zero_grad()
            aE.zero_grad()

            noise = np.random.uniform(-1, 1, (BATCH_SIZE, LATENT))
            noise = torch.from_numpy(noise).float()
            noise = noise.view(
                -1, LATENT, 1, 1, 1)  #[BS, nz] --> [Bs,nz,1,1,1] Needed for Generator
            noise = noise.to(device)

            batch = next(dataiter, None)
            if batch is None:
                # Sampler exhausted: restart the loader for a new pass.
                dataiter = iter(dataloader)
                # FIX: iterator .next() was removed (PyTorch >= 1.13);
                # use the built-in next(), matching the call above.
                batch = next(dataiter)

            real_label = batch['energy']  ## energy label
            real_label = real_label.to(device)

            with torch.no_grad():
                noisev = noise  # totally freeze G, training D
            fake_data = aG(noisev, real_label).detach()

            real_data = batch['shower']  # 48x48x48 calo image
            real_data = real_data.to(device)
            real_data.requires_grad_(True)

            #### supervised-training for energy regressor!
            if params["train_calib"]:
                output = aE(real_data.float())
                e_loss = e_criterion(output, real_label.view(BATCH_SIZE, 1))
                e_loss.backward()
                optimizer_e.step()
            ######

            # train with real data
            disc_real = aD(real_data.float(), real_label.float())

            # train with fake data
            fake_data = fake_data.unsqueeze(
                1)  ## transform to [BS, 1, 48, 48, 48]
            disc_fake = aD(fake_data, real_label.float())

            # train with interpolated data (WGAN-GP penalty term)
            gradient_penalty = calc_gradient_penalty(aD,
                                                     real_data.float(),
                                                     fake_data,
                                                     real_label,
                                                     BATCH_SIZE,
                                                     device,
                                                     DIM=13)

            ## wasserstein-1 distace
            w_dist = torch.mean(disc_fake) - torch.mean(disc_real)
            # final disc cost
            disc_cost = torch.mean(disc_fake) - torch.mean(
                disc_real) + LAMBD * gradient_penalty

            with amp.scale_loss(disc_cost, optimizer_d) as scaled_loss:
                scaled_loss.backward()
            optimizer_d.step()

            #--------------Log to COMET ML ---------- (only on the last critic step)
            if i == hyperp["ncrit"] - 1:
                experiment.log_metric("L_crit", disc_cost, step=iteration)
                experiment.log_metric("gradient_pen", gradient_penalty,
                                      step=iteration)
                experiment.log_metric("Wasserstein Dist", w_dist,
                                      step=iteration)
                if params["train_calib"]:
                    experiment.log_metric("L_const", e_loss, step=iteration)

        #---------------------TRAIN G------------------------
        for p in aD.parameters():
            p.requires_grad_(False)  # freeze D
        for c in aE.parameters():
            c.requires_grad_(False)  # freeze C

        gen_cost = None
        for i in range(hyperp["ngen"]):
            aG.zero_grad()

            noise = np.random.uniform(-1, 1, (BATCH_SIZE, LATENT))
            noise = torch.from_numpy(noise).float()
            noise = noise.view(
                -1, LATENT, 1, 1, 1)  #[BS, nz] --> [Bs,nz,1,1,1] Needed for Generator
            noise = noise.to(device)

            batch = next(dataiter, None)
            if batch is None:
                dataiter = iter(dataloader)
                # FIX: use built-in next() instead of removed .next()
                batch = next(dataiter)

            real_label = batch['energy']  ## energy label
            real_label = real_label.to(device)

            noise.requires_grad_(True)

            real_data = batch['shower']  # 48x48x48 calo image
            real_data = real_data.to(device)

            fake_data = aG(noise, real_label.float())
            fake_data = fake_data.unsqueeze(
                1)  ## transform to [BS, 1, 48, 48, 48]

            ## calculate loss function
            gen_cost = aD(fake_data.float(), real_label.float())

            ## label conditioning — auxiliary energy term is currently
            ## disabled (both outputs forced to 0.0, so aux_errG == 0).
            #output_g = aE(fake_data)
            #output_r = aE(real_data.float())
            output_g = 0.0  #for now
            output_r = 0.0  #for now

            aux_fake = (output_g - real_label)**2
            aux_real = (output_r - real_label)**2
            aux_errG = torch.abs(aux_fake - aux_real)

            ## Total loss function for generator
            g_cost = -torch.mean(gen_cost) + KAPPA * torch.mean(aux_errG)

            with amp.scale_loss(g_cost, optimizer_g) as scaled_loss_G:
                scaled_loss_G.backward()
            optimizer_g.step()

            #--------------Log to COMET ML ----------
            experiment.log_metric("L_Gen", g_cost, step=iteration)

            ## plot example image every 100 iterations
            if iteration % 100 == 0.0 or iteration == 1:
                # project the first generated shower onto the three axes
                image = fake_data.view(-1, 48, 13, 13).cpu().detach().numpy()
                cmap = mpl.cm.viridis
                cmap.set_bad('white', 1.)

                figExIm = plt.figure(figsize=(6, 6))
                axExIm1 = figExIm.add_subplot(1, 1, 1)
                image1 = np.sum(image[0], axis=0)
                masked_array1 = np.ma.array(image1, mask=(image1 == 0.0))
                im1 = axExIm1.imshow(masked_array1,
                                     filternorm=False,
                                     interpolation='none',
                                     cmap=cmap,
                                     vmin=0.01,
                                     vmax=100,
                                     norm=mpl.colors.LogNorm(),
                                     origin='lower')
                figExIm.patch.set_facecolor('white')
                axExIm1.set_xlabel('y [cells]', family='serif')
                axExIm1.set_ylabel('x [cells]', family='serif')
                figExIm.colorbar(im1)
                experiment.log_figure(figure=plt, figure_name="x-y")

                figExIm = plt.figure(figsize=(6, 6))
                axExIm2 = figExIm.add_subplot(1, 1, 1)
                image2 = np.sum(image[0], axis=1)
                masked_array2 = np.ma.array(image2, mask=(image2 == 0.0))
                im2 = axExIm2.imshow(masked_array2,
                                     filternorm=False,
                                     interpolation='none',
                                     cmap=cmap,
                                     vmin=0.01,
                                     vmax=100,
                                     norm=mpl.colors.LogNorm(),
                                     origin='lower')
                figExIm.patch.set_facecolor('white')
                axExIm2.set_xlabel('y [cells]', family='serif')
                axExIm2.set_ylabel('z [layers]', family='serif')
                figExIm.colorbar(im2)
                experiment.log_figure(figure=plt, figure_name="y-z")

                figExIm = plt.figure(figsize=(6, 6))
                axExIm3 = figExIm.add_subplot(1, 1, 1)
                image3 = np.sum(image[0], axis=2)
                masked_array3 = np.ma.array(image3, mask=(image3 == 0.0))
                im3 = axExIm3.imshow(masked_array3,
                                     filternorm=False,
                                     interpolation='none',
                                     cmap=cmap,
                                     vmin=0.01,
                                     vmax=100,
                                     norm=mpl.colors.LogNorm(),
                                     origin='lower')
                figExIm.patch.set_facecolor('white')
                axExIm3.set_xlabel('x [cells]', family='serif')
                axExIm3.set_ylabel('z [layers]', family='serif')
                figExIm.colorbar(im3)
                #experiment.log_metric("L_aux", aux_errG, step=iteration)
                experiment.log_figure(figure=plt, figure_name="x-z")

                ## E-sum monitoring: total-energy histograms, real vs generated
                figEsum = plt.figure(figsize=(6, 6 * 0.77 / 0.67))
                axEsum = figEsum.add_subplot(1, 1, 1)
                etot_real = getTotE(real_data.cpu().detach().numpy(),
                                    xbins=13, ybins=13)
                etot_fake = getTotE(image, xbins=13, ybins=13)
                axEsumReal = axEsum.hist(etot_real,
                                         bins=25,
                                         range=[0, 1500],
                                         weights=np.ones_like(etot_real) /
                                         (float(len(etot_real))),
                                         label="orig",
                                         color='blue',
                                         histtype='stepfilled')
                axEsumFake = axEsum.hist(etot_fake,
                                         bins=25,
                                         range=[0, 1500],
                                         weights=np.ones_like(etot_fake) /
                                         (float(len(etot_fake))),
                                         label="generated",
                                         color='red',
                                         histtype='stepfilled')
                axEsum.text(0.25,
                            0.81,
                            "WGAN",
                            horizontalalignment='left',
                            verticalalignment='top',
                            transform=axEsum.transAxes,
                            color='red')
                axEsum.text(0.25,
                            0.87,
                            'GEANT 4',
                            horizontalalignment='left',
                            verticalalignment='top',
                            transform=axEsum.transAxes,
                            color='blue')
                experiment.log_figure(figure=plt, figure_name="E-sum")

            #end = timer()
            #print(f'---train G elapsed time: {end - start}')

        if params["train_postP"]:
            #---------------------TRAIN P------------------------
            for p in aD.parameters():
                p.requires_grad_(False)  # freeze D
            for c in aG.parameters():
                c.requires_grad_(False)  # freeze G

            lossP = None
            for i in range(1):
                noise = np.random.uniform(-1, 1, (BATCH_SIZE, LATENT))
                noise = torch.from_numpy(noise).float()
                noise = noise.view(
                    -1, LATENT, 1, 1, 1)  #[BS, nz] --> [Bs,nz,1,1,1] Needed for Generator
                noise = noise.to(device)

                batch = next(dataiter, None)
                if batch is None:
                    dataiter = iter(dataloader)
                    # FIX: use built-in next() instead of removed .next()
                    batch = next(dataiter)

                real_label = batch['energy']  ## energy label
                real_label = real_label.to(device)

                noise.requires_grad_(True)

                real_data = batch['shower']  # calo image
                real_data = real_data.to(device)

                ## forward pass to generator
                fake_data = aG(noise, real_label.float())
                fake_data = fake_data.unsqueeze(
                    1)  ## transform to [BS, 1, layer, size, size]

                ### first LossD_P: critic score of the post-processed shower
                fake_dataP = aP(fake_data.float(), real_label.float())
                lossD_P = aD(fake_dataP.float(), real_label.float())
                lossD_P = lossD_P.mean()

                ## lossFixP: keep post-processed showers close to the raw ones
                real_sorted = real_data.view(BATCH_SIZE, -1)
                fake_sorted = fake_dataP.view(BATCH_SIZE, -1)
                real_sorted, _ = torch.sort(real_sorted, dim=1,
                                            descending=True)  #.view(900,1)
                fake_sorted, _ = torch.sort(fake_sorted, dim=1,
                                            descending=True)  #.view(900,1)

                lossFixPp1 = mmd_hit_sortKernel(real_sorted.float(),
                                                fake_sorted,
                                                kernel_size=100,
                                                stride=50,
                                                cutoff=2000,
                                                alpha=200)
                lossFixPp2 = F.mse_loss(fake_dataP.view(BATCH_SIZE, -1),
                                        fake_data.detach().view(
                                            BATCH_SIZE, -1),
                                        reduction='mean')
                lossFixP = wMMD * lossFixPp1 + wMSE * lossFixPp2

                lossP = LDP * lossD_P - lossFixP
                # backward with gradient=mone flips the sign (ascend on lossP)
                lossP.backward(mone)
                optimizer_p.step()

        if iteration % 100 == 0 or iteration == 1:
            print('iteration: {}, critic loss: {}'.format(
                iteration, disc_cost.cpu().data.numpy()))
            if rank == 0:
                # Only the first rank writes checkpoints.
                torch.save(
                    {
                        'Generator': aG.state_dict(),
                        'Critic': aD.state_dict(),
                        'G_optimizer': optimizer_g.state_dict(),
                        'D_optimizer': optimizer_d.state_dict(),
                        'iteration': iteration
                    }, OUTP + '{0}/wgan_itrs_{1}.pth'.format(EXP, iteration))
                if params["train_calib"]:
                    torch.save(
                        aE.state_dict(),
                        OUTP + '/{0}/netE_itrs_{1}.pth'.format(EXP, iteration))
                if params["train_postP"]:
                    torch.save(
                        aP.state_dict(),
                        OUTP + '{0}/netP_itrs_{1}.pth'.format(EXP, iteration))
def train(normal_digit, anomalies, folder, file, p_train, p_test):
    """Train a convolutional autoencoder on a corrupted MNIST subset and
    evaluate reconstruction-based anomaly detection.

    The training set is built mostly from ``normal_digit`` with a fraction
    ``p_train`` of ``anomalies`` mixed in; the test set uses fraction
    ``p_test``.  Reconstruction p-values are computed for the test set,
    plotted, scored (precision/recall/F1/AP/AUC), logged to Comet, and
    appended to ``results_<file>.csv`` in ``folder``.

    Parameters
    ----------
    normal_digit : int
        The digit class treated as "normal".
    anomalies : list[int]
        Digit classes used to corrupt the dataset.
    folder : str
        Output directory for figures and the results CSV.
    file : str
        Suffix used in output file names.
    p_train, p_test : float
        Contamination rates for the train and test sets.
    """
    # Create an experiment (disabled=True: nothing is uploaded to Comet)
    experiment = Experiment(project_name="deep-stats-thesis",
                            workspace="stecaron",
                            disabled=True)
    experiment.add_tag("mnist_conv_ae")

    # General parameters
    DOWNLOAD_MNIST = True
    PATH_DATA = os.path.join(os.path.expanduser("~"), 'Downloads/mnist')
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Define training parameters
    hyper_params = {
        "EPOCH": 75,
        "NUM_WORKERS": 10,
        "BATCH_SIZE": 256,
        "LR": 0.001,
        "TRAIN_SIZE": 4000,
        "TRAIN_NOISE": p_train,
        "TEST_SIZE": 800,
        "TEST_NOISE": p_test,
        # on which class we want to learn outliers
        "CLASS_SELECTED": [normal_digit],
        # which class we want to corrupt our dataset with
        "CLASS_CORRUPTED": anomalies,
        "ALPHA": p_test,
        "MODEL_NAME": "mnist_ae_model",
        "LOAD_MODEL": False,
        "LOAD_MODEL_NAME": "mnist_ae_model"
    }

    # Log experiment parameters
    experiment.log_parameters(hyper_params)

    # Load data
    train_data, test_data = load_mnist(PATH_DATA, download=DOWNLOAD_MNIST)

    # Train the autoencoder
    model = ConvAutoEncoder2()
    optimizer = torch.optim.Adam(model.parameters(), lr=hyper_params["LR"])
    #loss_func = nn.MSELoss()
    loss_func = nn.BCELoss()

    # Build "train" and "test" datasets: majority ids from the normal class,
    # minority ids from the corrupting classes, sampled without replacement.
    # NOTE(review): train_labels/test_labels are deprecated torchvision
    # aliases of .targets — consider switching; kept here for compatibility.
    id_maj_train = numpy.random.choice(numpy.where(
        numpy.isin(train_data.train_labels,
                   hyper_params["CLASS_SELECTED"]))[0],
                                       int((1 - hyper_params["TRAIN_NOISE"]) *
                                           hyper_params["TRAIN_SIZE"]),
                                       replace=False)
    id_min_train = numpy.random.choice(numpy.where(
        numpy.isin(train_data.train_labels,
                   hyper_params["CLASS_CORRUPTED"]))[0],
                                       int(hyper_params["TRAIN_NOISE"] *
                                           hyper_params["TRAIN_SIZE"]),
                                       replace=False)
    id_train = numpy.concatenate((id_maj_train, id_min_train))

    id_maj_test = numpy.random.choice(numpy.where(
        numpy.isin(test_data.test_labels,
                   hyper_params["CLASS_SELECTED"]))[0],
                                      int((1 - hyper_params["TEST_NOISE"]) *
                                          hyper_params["TEST_SIZE"]),
                                      replace=False)
    id_min_test = numpy.random.choice(numpy.where(
        numpy.isin(test_data.test_labels,
                   hyper_params["CLASS_CORRUPTED"]))[0],
                                      int(hyper_params["TEST_NOISE"] *
                                          hyper_params["TEST_SIZE"]),
                                      replace=False)
    id_test = numpy.concatenate((id_min_test, id_maj_test))

    train_data.data = train_data.data[id_train]
    train_data.targets = train_data.targets[id_train]
    test_data.data = test_data.data[id_test]
    test_data.targets = test_data.targets[id_test]

    # Relabel: 1 = anomaly (corrupted class), 0 = normal digit.
    train_data.targets = torch.from_numpy(
        numpy.isin(train_data.train_labels,
                   hyper_params["CLASS_CORRUPTED"])).type(torch.int32)
    test_data.targets = torch.from_numpy(
        numpy.isin(test_data.test_labels,
                   hyper_params["CLASS_CORRUPTED"])).type(torch.int32)

    train_loader = Data.DataLoader(dataset=train_data,
                                   batch_size=hyper_params["BATCH_SIZE"],
                                   shuffle=True,
                                   num_workers=hyper_params["NUM_WORKERS"])
    # Test loader delivers the whole test set in one batch.
    test_loader = Data.DataLoader(dataset=test_data,
                                  batch_size=test_data.data.shape[0],
                                  shuffle=False,
                                  num_workers=hyper_params["NUM_WORKERS"])

    model.train()
    if hyper_params["LOAD_MODEL"]:
        model = torch.load(hyper_params["LOAD_MODEL_NAME"])
    else:
        train_mnist(train_loader,
                    model,
                    criterion=optimizer,
                    n_epoch=hyper_params["EPOCH"],
                    experiment=experiment,
                    device=device,
                    model_name=hyper_params["MODEL_NAME"],
                    loss_func=loss_func,
                    loss_type="binary")

    # Compute p-values
    model.to(device)
    pval, test_errors = compute_reconstruction_pval(
        train_loader, model, test_loader, device)
    pval_order = numpy.argsort(pval)

    # Plot p-values against the uniform line and the BH rejection line.
    x_line = numpy.arange(0, len(test_data), step=1)
    y_line = numpy.linspace(0, 1, len(test_data))
    y_adj = numpy.arange(0, len(test_data),
                         step=1) / len(test_data) * hyper_params["ALPHA"]
    zoom = int(0.2 * len(test_data))  # nb of points to zoom
    #index = numpy.isin(test_data.test_labels, hyper_params["CLASS_CORRUPTED"]).astype(int)
    index = numpy.array(test_data.targets).astype(int)

    fig, (ax1, ax2) = plt.subplots(2, 1)
    ax1.scatter(numpy.arange(0, len(pval), 1),
                pval[pval_order],
                c=index[pval_order].reshape(-1))
    ax1.plot(x_line, y_line, color="green")
    ax1.plot(x_line, y_adj, color="red")
    ax1.set_title(
        f'Entire test dataset with {int(hyper_params["TEST_NOISE"] * 100)}% of noise'
    )
    ax1.set_xticklabels([])
    ax2.scatter(numpy.arange(0, zoom, 1),
                pval[pval_order][0:zoom],
                c=index[pval_order].reshape(-1)[0:zoom])
    ax2.plot(x_line[0:zoom], y_line[0:zoom], color="green")
    ax2.plot(x_line[0:zoom], y_adj[0:zoom], color="red")
    ax2.set_title('Zoomed in')
    ax2.set_xticklabels([])
    experiment.log_figure(figure_name="empirical_test_hypothesis",
                          figure=fig,
                          overwrite=True)
    plt.savefig(os.path.join(folder, "pvalues_" + file + ".png"))
    plt.show()

    # Compute some stats
    precision, recall, f1_score, average_precision, roc_auc = test_performances(
        pval, index, hyper_params["ALPHA"])
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1_score}")
    print(f"AUC: {roc_auc}")
    print(f"Average Precison: {average_precision}")
    experiment.log_metric("precision", precision)
    experiment.log_metric("recall", recall)
    experiment.log_metric("f1_score", f1_score)
    experiment.log_metric("auc", roc_auc)
    experiment.log_metric("average_precision", average_precision)

    # Show the 25 most anomalous examples (smallest p-values).
    fig, axs = plt.subplots(5, 5)
    fig.tight_layout()
    axs = axs.ravel()
    for i in range(25):
        image = test_data.data[pval_order[i]]
        axs[i].imshow(image, cmap='gray')
        axs[i].axis('off')
    experiment.log_figure(figure_name="rejetcted_observations",
                          figure=fig,
                          overwrite=True)
    plt.show()

    # Show the 25 most "normal" examples (largest p-values).
    fig, axs = plt.subplots(5, 5)
    fig.tight_layout()
    axs = axs.ravel()
    for i in range(25):
        image = test_data.data[pval_order[int(len(pval) - 1) - i]]
        axs[i].imshow(image, cmap='gray')
        axs[i].axis('off')
    experiment.log_figure(figure_name="better_observations",
                          figure=fig,
                          overwrite=True)
    plt.show()

    # Save the results in the output file
    col_names = [
        "timestamp", "precision", "recall", "f1_score", "average_precision",
        "auc"
    ]
    results_file = os.path.join(folder, "results_" + file + ".csv")
    if os.path.exists(results_file):
        df_results = pandas.read_csv(results_file, names=col_names, header=0)
    else:
        df_results = pandas.DataFrame(columns=col_names)
    new_row = pandas.DataFrame(numpy.concatenate(
        (numpy.array(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y-%m-%d %H:%M:%S')).reshape(1),
         precision.reshape(1), recall.reshape(1), f1_score.reshape(1),
         average_precision.reshape(1), roc_auc.reshape(1))).reshape(1, -1),
                               columns=col_names)
    # FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pandas.concat is the supported replacement and is behaviorally
    # equivalent here (ignore_index renumbers the rows).
    df_results = pandas.concat([df_results, new_row], ignore_index=True)
    df_results.to_csv(results_file)
data, teach=True) learn.model, net = learn.model.to(args.gpu), net.to(args.gpu) teacher = learn.model sf_student, sf_teacher = get_features(net, teacher, experiment=expt) if args.api_key: project_name = expt + '-' + hyper_params['model'] + '-' + hyper_params[ 'dataset'] experiment = Experiment(api_key=args.api_key, project_name=project_name, workspace=args.workspace) experiment.log_parameters(hyper_params) optimizer = torch.optim.Adam(net.parameters(), lr=hyper_params["learning_rate"]) loss_function2 = nn.MSELoss() loss_function = nn.CrossEntropyLoss() savename = get_savename(hyper_params, experiment=expt) best_val_acc = 0 for epoch in range(hyper_params['num_epochs']): student, train_loss, val_loss, val_acc, best_val_acc = train( net, teacher, data, sf_teacher, sf_student, loss_function, loss_function2, optimizer, hyper_params, epoch, savename, best_val_acc) if args.api_key: experiment.log_metric("train_loss", train_loss) experiment.log_metric("val_loss", val_loss) experiment.log_metric("val_acc", val_acc * 100)
add_to_feed=feed_means_stds, minibatch_size=minibatch_size, save_after=10, save_path=_path_sav_fold("models")) plot_cost_name = "{}_nll_costs".format(exp_name) costs = [float(c) for c in costs] plot_cost_graph(plot_cost_name, costs, "{}.png".format(_path_sav_fold(plot_cost_name))) logger.info("--> Duration: {:.4f}".format(timers.tac())) # Comet.ml if use_comet: for cost in costs: experiment.log_metric("cost", cost) elif training_type == "adversarial": timers.tic() logger.info("Compiling adversarial model.") forward, backward_masks = spn.compile_adversarial( learning_rate=learning_rate) logger.info("--> Duration: {:.4f}".format(timers.tac())) timers.tic() logger.info("Fitting.") feed_means_stds = { spn.leaf_layer.means: leaf_means,
adam_stop = True return eval_acc """ Train model on Natural Language Inference task """ epoch = 1 #nli_net.load_state_dict(torch.load(os.path.join(params.outputdir, params.criticmodelname))) #print("\nCritic Loaded") while not stop_training and epoch <= params.n_epochs: with experiment.train(): train_accc, train_losss = trainepoch(epoch, RL_train=False) experiment.log_metric("Train Accuracy", train_accc, step=epoch) experiment.log_metric("Train Loss", sum(train_losss) / len(train_losss), step=epoch) with experiment.test(): eval_accc = evaluate(epoch, 'valid') experiment.log_metric("Validation Accuracy", eval_accc, step=epoch) epoch += 1 # Run best model on test set. nli_net.load_state_dict( torch.load(os.path.join(params.outputdir, params.criticmodelname))) print("\nCritic Loaded") #actorModel.load_state_dict(torch.load(os.path.join(params.outputdir, params.actormodelname))) #print("\nActor Loaded") #print(evaluate(epoch, 'train'))
global_step = 0 for epoch in range(1, num_epochs + 1): print("Epoch: {}/{}".format(epoch, num_epochs)) with experiment.train(): for train_step in range(train_steps): global_step += 1 # Perform training step on batch and record metrics loss, accuracy = model.train_on_batch( train_text[train_step], train_labels[train_step]) train_loss.append(loss) train_accuracy.append(accuracy) experiment.log_metric('loss', np.mean(train_loss), step=global_step) experiment.log_metric('accuracy', np.mean(train_accuracy), step=global_step) # Every evaluate_steps evaluate model on validation set if (train_step + 1) % evaluate_steps == 0 or ( train_step + 1) == train_steps: with experiment.validate(): for val_step in range(val_steps): # Perform evaluation step on batch and record metrics loss, accuracy = model.test_on_batch( val_text[val_step], val_labels[val_step]) val_loss.append(loss)
c_loss = mixup_criterion(c_obj_fn, code, m_label_a, m_label_b, lam) else: code, output = model(m_batch) cce_loss = criterion(output, m_label) c_loss = c_obj_fn(code, m_label) loss = cce_loss + (parser['c_loss_weight'] * c_loss) #print(loss) optimizer.zero_grad() loss.backward() for param in c_obj_fn.parameters(): param.grad.data *= (parser['c_loss_lr'] / (parser['c_loss_weight'] * parser['lr'])) optimizer.step() pbar.set_description('epoch: %d loss: %.3f'%(epoch, loss)) pbar.update(1) experiment.log_metric('trn_loss', loss) #lr_scheduler.step() #validation phase model.eval() with torch.set_grad_enabled(False): embeddings_dev = [] data_y_dev = [] with tqdm(total = len(devset_gen), ncols = 70) as pbar: for m_batch, m_label in devset_gen: m_batch = m_batch.to(device) code, _ = model(m_batch) m_label = list(m_label.numpy()) embeddings_dev.extend(list(code.cpu().numpy())) #>>> (16, 64?) data_y_dev.extend(m_label) pbar.set_description('epoch%d: Extract ValEmbeddings'%(epoch))
class Experiment:
    """
    A helper class to facilitate the training and validation procedure of the
    GoTurnRemix model.

    Parameters
    ----------
    learning_rate: float
        Learning rate to train the model. The optimizer is SGD and the loss is L1 Loss
    image_size: int
        The size of the input image. This has to be fixed before the data is created
    data_path: Path
        Path to the data folder. If the folder name includes "pickle", then the data
        saved as pickles are loaded
    augment: bool
        Perform augmentation on the images before training
    logs_path: Path
        Path to save the validation predictions at the end of each epoch
    models_path: Path
        Path to save the model state at the end of each epoch
    save_name: str
        Name of the folder in which the logs and models are saved.
        If not provided, the current datetime is used
    comet_api: str
        Comet.ml API key; when provided, parameters and metrics are logged to Comet
    """

    def __init__(self,
                 learning_rate: float,
                 image_size: int,
                 data_path: Path,
                 augment: bool = True,
                 logs_path: Path = None,
                 models_path: Path = None,
                 save_name: str = None,
                 comet_api: str = None):
        self.image_size = image_size
        self.logs_path = logs_path
        self.models_path = models_path
        # Model lives on the GPU for the whole experiment (hard requirement:
        # every tensor below is moved with .cuda()).
        self.model = GoTurnRemix()
        self.model.cuda()
        self.criterion = torch.nn.L1Loss()
        # Only parameters with requires_grad are optimized (frozen backbone
        # layers, if any, are skipped).
        self.optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                                self.model.parameters()),
                                         lr=learning_rate)
        # Default run name: current datetime, made filesystem-safe.
        self.model_name = str(datetime.datetime.now()).split('.')[0].replace(
            ':', '-').replace(' ', '-')
        self.model_name = save_name if save_name else self.model_name
        self.augment = augment
        self.data = Data(data_path,
                         target_size=self.image_size,
                         transforms=augment)
        # Comet logger is optional; self.comet stays None when no API key.
        self.comet = None
        if comet_api:
            self.comet = Comet(api_key=comet_api)
            self.comet.log_parameter('learning_rate', learning_rate)
            self.comet.log_parameter('image_size', image_size)
            self.comet.log_parameter('augment', augment)

    def __train_step__(self, data):
        """
        Performs one step of the training procedure.

        Parameters
        ----------
        data
            data obtained from @Data.__getitem__

        Returns
        -------
        Loss at the end of training step
        """
        if self.comet:
            self.comet.train()
        previous_cropped, current_cropped, bbox, scale, crop = data
        # Scale pixel values from [0, 255] to [0, 1] and move to GPU.
        previous_cropped = torch.div(previous_cropped, 255).float().cuda()
        current_cropped = torch.div(current_cropped, 255).float().cuda()
        # NOTE(review): torch.autograd.Variable is deprecated — tensors are
        # autograd-aware directly; presumably kept for legacy compatibility.
        previous_cropped = torch.autograd.Variable(previous_cropped,
                                                   requires_grad=True)
        current_cropped = torch.autograd.Variable(current_cropped,
                                                  requires_grad=True)
        bbox = bbox.requires_grad_(True).float().cuda()
        self.optimizer.zero_grad()
        preds = self.model(previous_cropped, current_cropped)
        # Free the input tensors eagerly to reduce GPU memory pressure.
        del previous_cropped
        del current_cropped
        gc.collect()
        loss = self.criterion(preds, bbox)
        if self.comet:
            self.comet.log_metric('loss', loss)
        loss.backward()
        self.optimizer.step()
        return loss

    def __test__(self):
        """
        Test tracking of the model on the last video of the dataset.

        Unlike __validate__, each predicted box is fed back as the anchor for
        the next frame, so errors can accumulate — this measures true
        tracking performance.

        Returns
        -------
        Test loss and test predictions
        """
        # Set model to evaluation mode
        if self.comet:
            self.comet.test()
        self.model.eval()
        test_preds = []
        test_loss = []
        video_frames = self.data.video_frames[-1]
        video_annotations = self.data.video_annotations[-1]
        # Seed the tracker with the ground-truth box of the first frame.
        p_a = video_annotations[0]
        p_f = video_frames[0]
        test_preds.append(p_a)
        for i in tqdm(range(1, len(video_annotations)), desc='Validating'):
            c_a = video_annotations[i]
            c_f = video_frames[i]
            p_c, c_c, bbox, scale, crop = self.data.make_crops(
                p_f, c_f, p_a, c_a)
            p_c = torch.div(torch.from_numpy(p_c),
                            255).unsqueeze(0).float().cuda()
            c_c = torch.div(torch.from_numpy(c_c),
                            255).unsqueeze(0).float().cuda()
            bbox = torch.tensor(bbox, requires_grad=False).float().cuda()
            preds = self.model(p_c, c_c)
            del p_c
            del c_c
            gc.collect()
            loss = torch.nn.functional.l1_loss(preds, bbox)
            if self.comet:
                # NOTE(review): logged under 'val_loss' even though this is
                # the test pass — confirm whether 'test_loss' was intended.
                self.comet.log_metric('val_loss', loss)
            test_loss.append(loss.item())
            # Map the network output back to image coordinates.
            preds = self.data.get_bbox(preds.cpu().detach().numpy()[0],
                                       self.image_size, scale, crop)
            test_preds.append(preds)
            # Propagate the PREDICTED box to the next frame (tracking mode).
            p_a = preds
            p_f = c_f
        return test_loss, test_preds

    def __validate__(self):
        """
        Performs validation on the model using the last video of the dataset.

        Uses the ground-truth box of each frame as the anchor for the next
        one (teacher forcing), so per-frame losses are independent.

        Returns
        -------
        Validation loss and validation predictions
        """
        # Set model to evaluation mode
        if self.comet:
            self.comet.validate()
        self.model.eval()
        validation_preds = []
        validation_loss = []
        video_frames = self.data.video_frames[-1]
        video_annotations = self.data.video_annotations[-1]
        p_a = video_annotations[0]
        p_f = video_frames[0]
        validation_preds.append(p_a)
        for i in tqdm(range(1, len(video_annotations)), desc='Validating'):
            c_a = video_annotations[i]
            c_f = video_frames[i]
            p_c, c_c, bbox, scale, crop = self.data.make_crops(
                p_f, c_f, p_a, c_a)
            p_c = torch.div(torch.from_numpy(p_c),
                            255).unsqueeze(0).float().cuda()
            c_c = torch.div(torch.from_numpy(c_c),
                            255).unsqueeze(0).float().cuda()
            bbox = torch.tensor(bbox, requires_grad=False).float().cuda()
            preds = self.model(p_c, c_c)
            del p_c
            del c_c
            gc.collect()
            loss = torch.nn.functional.l1_loss(preds, bbox)
            if self.comet:
                self.comet.log_metric('val_loss', loss)
            validation_loss.append(loss.item())
            preds = self.data.get_bbox(preds.cpu().detach().numpy()[0],
                                       self.image_size, scale, crop)
            validation_preds.append(preds)
            # Propagate the GROUND-TRUTH box (unlike __test__).
            p_a = c_a
            p_f = c_f
        return validation_loss, validation_preds

    def train(self,
              epochs: int,
              batch_size: int,
              validate: bool = True,
              test: bool = True):
        """
        Trains the model for @epochs number of epochs.

        Parameters
        ----------
        epochs: int
            Number of epochs to train the model
        batch_size: int
            The size of each batch when training the model
        validate: bool, default=True
            If True, validation occurs at the end of each epoch
            The results are saved in @logs_path and models are saved in @models_path
        test: bool, default=True
            If True, the model is tested for tracking at the end of the
            training procedure. The results are saved in @logs_path

        Returns
        -------
        list: List containing the training loss at the end of each epoch
        """
        if self.comet:
            self.comet.log_parameter('epochs', epochs)
            self.comet.log_parameter('batch_size', batch_size)
        loss_per_epoch = []
        preds_per_epoch = []
        # Set the model to training mode
        self.model.train()
        # Create a DataLoader to feed data to the model
        dataloader = torch.utils.data.DataLoader(dataset=self.data,
                                                 batch_size=batch_size,
                                                 shuffle=True)
        # Run for @epochs number of epochs
        for epoch in range(epochs):
            if self.comet:
                self.comet.log_metric('epoch', epoch)
            running_loss = []
            for step, data in enumerate(
                    tqdm(dataloader,
                         total=int(len(self.data) / batch_size),
                         desc='Epoch {}'.format(epoch))):
                loss = self.__train_step__(data)
                running_loss.append(loss.item())
            training_loss = sum(running_loss) / len(running_loss)
            if self.comet:
                self.comet.log_metric('mean_train_loss', training_loss)
            loss_per_epoch.append(sum(running_loss) / len(running_loss))
            if validate:
                validation_loss, validation_preds = self.__validate__()
                if self.comet:
                    self.comet.log_metric('mean_validation_loss',
                                          validation_loss)
                preds_per_epoch.append(validation_preds)
                print('Validation loss: {}'.format(
                    sum(validation_loss) / len(validation_loss)))
            # Save the model at this stage
            if self.models_path:
                # NOTE(review): mkdir without parents=True — assumes
                # models_path itself already exists.
                (self.models_path / self.model_name).mkdir(exist_ok=True)
                torch.save(self.model,
                           (self.models_path / self.model_name /
                            'epoch_{}'.format(epoch)).resolve())
            print('Training Loss: {}'.format(training_loss))
            # Save the validation frames, ground truths and predictions at this stage
            if self.logs_path:
                (self.logs_path / self.model_name).mkdir(exist_ok=True)
                # Same file is overwritten each epoch with the accumulated
                # per-epoch predictions.
                save = {
                    'frames': self.data.video_frames[-1],
                    'truth': self.data.video_annotations[-1],
                    'preds': preds_per_epoch
                }
                np.save(
                    str((self.logs_path / self.model_name /
                         'preds_per_epoch.npy').resolve()), save)
        # Test the model and save the results
        if test:
            test_loss, test_preds = self.__test__()
            if self.logs_path:
                (self.logs_path / self.model_name).mkdir(exist_ok=True)
                save = {
                    'frames': self.data.video_frames[-1],
                    'truth': self.data.video_annotations[-1],
                    'preds': test_preds,
                    'loss': test_loss
                }
                np.save(
                    str((self.logs_path / self.model_name /
                         'test_preds.npy').resolve()), save)
        return loss_per_epoch