def do_test(model, data, measures):
    start_time = time.time()
    input_size = len(_g.vocab)
    if not _g.args.quiet:
        print('Testing...')
    criterion = nn.NLLLoss(ignore_index=_g.vocab.stoi[_g.padding_symbol])
    losses = None
    so_far = 0
    try:
        for i, batch in zip(range(len(data)), data):  # TODO necessary for now to do it this way
            loss = _t.evaluate(model, criterion, _u.to_one_hot(batch.before, input_size),
                               batch.after, measures)
            loss = loss.unsqueeze(dim=1)
            losses = loss if losses is None else torch.cat((losses, loss), dim=1)
            so_far = i + 1
        if not _g.args.quiet:
            print('Testing done successfully')
    except KeyboardInterrupt:
        print('\nExiting earlier than expected. Wait a moment!')
    losses = losses.mean(dim=1)
    text = 'Test {} elements in {}.'.format(so_far * data.batch_size,
                                            _u.pretty_print_time(time.time() - start_time))
    eval_measures = _u.to_builtin({n: (x, y) for n, x, y in
                                   zip(['loss'] + list(measures.keys()), losses[::2], losses[1::2])})
    for i, j in eval_measures.items():
        text += ' ' + i + ' {:5.6f}({:5.6f}).'.format(j[0], j[1])
    if not _g.args.quiet:
        print(text)
def train_transfer(output, models=['linear', 'tree', 'forest', 'svr']):
    data = get_predictions(output)
    print('Primary predictions loaded.')
    [X, y, X_train, y_train, X_test, y_test, X_scaled, y_scaled,
     X_train_scaled, y_train_scaled, X_test_scaled, y_scaler] \
        = pre.split_pipeline(data, output)
    print('Data preprocessed.')
    regressors = tra.build(X_train, y_train, X_train_scaled, y_train_scaled, models)
    best_regressor = tra.evaluate(regressors, X_train, y_train, X_train_scaled, y_train_scaled,
                                  X_test, y_test, X_test_scaled, y_scaler)
    print('Regressors evaluated. Best regressor is:\n' + str(best_regressor))
    if 'SVR' in str(best_regressor):
        best_regressor.fit(X_scaled, y_scaled)
    else:
        best_regressor.fit(X, y)
    print('Regressor fit.')
    tra.print_results(best_regressor, X, X_scaled, y, y_scaler)
    tra.save(best_regressor, X, output + '_transfer')
    print('Regressor saved.')
    tra.upload(output + '_transfer')
    print('Regressor uploaded.')
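# Usage sketch (not from the original module): 'runtime' is a hypothetical
# prediction target name and the model subset is illustrative; it assumes the
# get_predictions / pre / tra helpers used above are importable in this context.
if __name__ == '__main__':
    train_transfer('runtime', models=['linear', 'svr'])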
def main():
    args = get_training_args()
    # Logger
    log = Logger(file=args.log_file, verbose=args.verbose, flush=True)
    # Prepare data
    batches, dic = get_dataset(args, log)
    # Create model
    network = instantiate_network(args, dic, log)
    # Train model
    train(args, network, batches["train"], batches["dev"], log)
    # Test
    test_accuracy = evaluate(args, network, batches["test"])
    # Print final result
    log(f"Test accuracy: {test_accuracy*100:.2f}%")
    # Explain if the model is SoPa
    if args.model_type == "sopa":
        explain(args, network, batches["explain"], log)
def test_model(dataroot, model_path, batch_size, device):
    trainset, validset, validset_subjects, class_weights = get_dataset(
        dataroot, folds_train=(0, 1, 2), folds_valid=(3, ))
    class_weights = class_weights.to(device)
    valid_loader = DataLoader(validset, batch_size=batch_size, num_workers=6, shuffle=False)

    model = get_model()
    sd = torch.load(model_path)
    # PBT saves the model as part of a dict that also contains other information about the individual
    if 'model' in sd and 'sd' in sd['model']:
        sd = sd['model']['sd']
    model.load_state_dict(sd)
    model.to(device)

    valid_loss, cm, auc, prec, rec, f1 = evaluate(model, valid_loader, class_weights, device)
    print(f"Results for model {model_path}")
    print(f"valid_loss={valid_loss:.4e}")
    print(f"auc={auc:.4f}")
    print(f"prec={prec:.4f}")
    print(f"rec={rec:.4f}")
    print(f"f1={f1:.4f}")
    print(f"cm=\n{cm}")
    return {
        'valid_loss': valid_loss,
        'cm': cm,
        'auc': auc,
        'prec': prec,
        'rec': rec,
        'f1': f1
    }
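# Usage sketch (hypothetical paths, not from the original code): runs a saved
# checkpoint over the validation fold and prints the metrics dict returned above.
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    metrics = test_model(dataroot='data/',                  # hypothetical dataset root
                         model_path='checkpoints/best.pt',  # hypothetical checkpoint file
                         batch_size=64,
                         device=device)
    print(metrics['auc'], metrics['f1'])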
        return os.path.realpath("project_git_repo/cpd35-clustering-demo")
    else:
        return os.getcwd()


PROJECT_DIR = find_project_dir()
SCRIPT_DIR = os.path.join(PROJECT_DIR, "assets/jupyterlab")
DATA_DIR = os.path.join(PROJECT_DIR, "assets/data_asset")
sys.path.append(os.path.normpath(SCRIPT_DIR))
print(SCRIPT_DIR)
print(DATA_DIR)

from training import train, evaluate, clusterings

reference_df = pd.read_csv(os.path.join(DATA_DIR, "credit_risk_reference.csv"))
input_df = reference_df.drop(['Risk'], axis=1)

# Train the models and select the winning one
results = []
for (clustering_name, clustering_op) in clusterings:
    print(clustering_name)
    model = train(input_df, clustering_name, clustering_op)
    result = evaluate(reference_df, clustering_op)
    print("---")
    results.append(result)

best_score_idx = np.argmax([r['v_measure'] for r in results])
print("The winner is: '{}' with V-measure: {}!".format(
    clusterings[best_score_idx][0], results[best_score_idx]['v_measure']))
def main(config_file):
    # read from config
    local_config = __import__(config_file)
    model_name = local_config.INPUTS['MODEL']
    model = getattr(__import__('.models', fromlist=[model_name]), model_name)
    batch_size = local_config.INPUTS['BATCHSIZE']
    optimizer = local_config.INPUTS['OPTIMIZER']
    num_epochs = local_config.INPUTS['EPOCHS']
    no_classes = local_config.INPUTS['CLASSES']
    learning_rate = local_config.INPUTS['LR']

    # logging
    start_time = time.time()
    date = time.strftime('%d-%m-%Y-%H-%M-%S', time.localtime())
    log_path = f'./birdsong/run_log/{model_name}_{date}'
    state_fname, log_fname, summ_tensor_board = logger.create_log(log_path)
    writer = SummaryWriter(str(summ_tensor_board))

    # Upsampling
    train_df = upsample_df(TRAIN, 400)

    # Augmentation
    noiser = SoundscapeNoise('storage/noise_slices', scaling=1)

    ds_train = SpectralDataset(train_df, INPUT_DIR, enhancement_func=exponent,
                               augmentation_func=noiser)
    ds_test = SpectralDataset(TEST, INPUT_DIR, enhancement_func=exponent)
    dl_train = DataLoader(ds_train, batch_size, num_workers=4, pin_memory=PIN, shuffle=True)
    dl_test = DataLoader(ds_test, batch_size, num_workers=4, pin_memory=PIN, shuffle=True)
    print('Dataloaders initialized')

    time_axis = ds_test.shape[1]
    freq_axis = ds_test.shape[0]
    net = model(time_axis=time_axis, freq_axis=freq_axis, no_classes=no_classes)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)

    # local vars
    best_acc = 0

    for epoch in range(num_epochs):
        train(net, dl_train, epoch, optimizer, criterion, DEVICE)

        train_stats, train_conf_matrix = evaluate(net, dl_train, criterion, no_classes, DEVICE)
        print(f'Train Loss: {train_stats[0]:.5f}, Train Acc: {train_stats[1]:.5f}')

        test_stats, test_conf_matrix = evaluate(net, dl_test, criterion, no_classes, DEVICE)
        print(f'Test Loss: {test_stats[0]:.5f}, Test Acc: {test_stats[1]:.5f}')

        is_best = test_stats[1] > best_acc
        best_acc = max(test_stats[1], best_acc)
        print('Best Accuracy: {:.5f}'.format(best_acc))

        logger.save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': net.state_dict(),
            'best_accuracy': best_acc
        }, is_best, filename=state_fname)

        """
        print('Making images')
        img_path = log_path + '/train' + '_' + str(epoch) + '.png'
        img = plot_conf_mat(img_path, train_conf_matrix)
        img_path = log_path + '/test' + '_' + str(epoch) + '.png'
        img = plot_conf_mat(img_path, test_conf_matrix)
        """

        print('Writing logs')
        logger.write_summary(writer, epoch, train_stats, test_stats)
        logger.dump_log_txt(date, start_time, local_config, train_stats, test_stats,
                            best_acc, log_fname)
        print('Done for now')

    writer.close()
    print('Finished Training')
epochs = 100
train_losses = []
val_losses = []
for epoch in range(epochs):  # how many times to go through whole dataset?

    # ----- PURELY FOR PLOTTING THE HYPOTHESIS OVER THE WHOLE INPUT DOMAIN --------
    all_preds = nn(X).detach().numpy()               # make predictions for all inputs (not just minibatch)
    h_plot = h_ax.plot(X.numpy(), all_preds, c='g')  # plot predictions for all inputs
    fig.canvas.draw()
    # -----------------------------------------------------------------------------

    train(nn, train_loader, criterion, optimiser, epoch, fig, loss_ax, train_losses, p)
    idx = len(train_loader) * epoch  # index of current batch
    evaluate(nn, val_loader, criterion, epoch, fig, loss_ax, val_losses, idx)

    h_plot.pop(0).remove()  # remove the previous plot

'''
# --------- OLD ----------
for batch_idx, batch in enumerate(train_loader):  # for each minibatch from dataloader
    print(batch)
    x, y = batch            # unpack the minibatch
    h = nn(x)               # make predictions for this minibatch
    loss = criterion(h, y)  # evaluate loss for this batch
    loss.backward()         # differentiate loss with respect to parameters that the optimiser is tracking
    optimiser.step()        # take optimisation step
def train_binary_model(path, epochs=100, ft_epochs=100, learning_rate=0.01,
                       classes_to_match: Union[int, List[int]] = 0,
                       classes_to_drop: Union[int, List[int]] = None):
    """
    Train a smaller binary model for empty/not-empty classification and save it under the given path.

    The method first loads the datasets using :py:doc:`generate_datasets.py <training.generate_datasets.py>`
    methods. Then the model is trained, saved and finally evaluated. Training is run in two steps: the model
    is first trained with synthetic data and then finetuned with real data. Early stopping is used to
    prevent overfitting.

    Args:
      path(str): The directory to save the trained model to.
      epochs(int): The number of epochs. (Default value = 100)
      ft_epochs: The number of finetuning epochs. (Default value = 100)
      learning_rate: The learning rate for the Adadelta optimizer. (Default value = 0.01)
      classes_to_match(Union[int, list[int]]): The classes to match as class 1. (Default value = 0)
      classes_to_drop(Union[int, list[int]]): The classes to drop from the dataset. (Default value = None)

    Returns:
      None
    """
    os.makedirs(path, exist_ok=True)
    concat_machine, concat_hand, concat_out, real_training, real_validation = load_datasets(
        TRANSFORMED_DATASET_NAMES)

    batch_size = 192
    train_generator = ToBinaryGenerator(concat_machine.train, concat_hand.train, concat_out.train,
                                        classes_to_match=classes_to_match,
                                        classes_to_drop=classes_to_drop,
                                        batch_size=batch_size, shuffle=True, truncate=True)
    dev_generator = ToBinaryGenerator(concat_machine.test, concat_hand.test, concat_out.test,
                                      classes_to_match=classes_to_match,
                                      classes_to_drop=classes_to_drop,
                                      batch_size=batch_size, shuffle=True, truncate=True)
    ft_train_generator = ToBinaryGenerator(real_training.train,
                                           classes_to_match=classes_to_match,
                                           classes_to_drop=classes_to_drop,
                                           batch_size=batch_size, shuffle=True, truncate=True)
    ft_dev_generator = ToBinaryGenerator(real_training.test,
                                         classes_to_match=classes_to_match,
                                         classes_to_drop=classes_to_drop,
                                         batch_size=batch_size, shuffle=True, truncate=True)
    test_generator = ToBinaryGenerator(real_validation.test,
                                       classes_to_match=classes_to_match,
                                       classes_to_drop=classes_to_drop,
                                       batch_size=batch_size, shuffle=False)

    # Run training on the GPU
    with tf.device('/GPU:0'):
        # Keras Model
        print("Creating model..")
        model = Sequential()
        model.add(Conv2D(16, (5, 5), strides=2, input_shape=(28, 28, 1)))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(4, 4)))
        model.add(Conv2D(32, (2, 2)))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Flatten())  # 32
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.25))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        # def mean_pred(_, y):
        #     return keras.backend.mean(y)

        print("Compiling model..")
        model.compile(
            loss=keras.losses.BinaryCrossentropy(from_logits=True),
            optimizer=keras.optimizers.Adadelta(learning_rate),
            metrics=[keras.metrics.binary_accuracy, 'mse'],
        )
        print(model.summary())

        print("Training model")
        model.fit_generator(train_generator, validation_data=dev_generator, epochs=epochs,
                            callbacks=[
                                EarlyStopping(monitor='val_accuracy', restore_best_weights=True,
                                              patience=3, min_delta=0.0001),
                            ])

        print("Finetuning model")
        # Validate finetuning on the held-out real split rather than the finetuning training data
        model.fit_generator(ft_train_generator, validation_data=ft_dev_generator, epochs=ft_epochs,
                            callbacks=[
                                EarlyStopping(monitor='val_accuracy', restore_best_weights=True,
                                              patience=3, min_delta=0.0001),
                            ])

        models.save_model(model, path + "model.h5", save_format='h5')

        print("Evaluating")
        print("Training dev",
              list(zip(model.metrics_names, model.evaluate_generator(dev_generator))))
        print("Finetuning dev",
              list(zip(model.metrics_names, model.evaluate_generator(ft_dev_generator))))
        print("Test",
              list(zip(model.metrics_names, model.evaluate_generator(test_generator))))
        evaluate(model, test_generator, binary=True)
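# Usage sketch (hypothetical output directory and class choices, not from the
# original module): trains the binary empty/not-empty classifier and writes
# model.h5 under the given directory (the path is concatenated with "model.h5",
# so the trailing slash matters).
if __name__ == '__main__':
    train_binary_model('models/binary/',
                       epochs=50, ft_epochs=20,
                       classes_to_match=0,   # class(es) mapped to label 1
                       classes_to_drop=None)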
def run():
    ## Load config from JSON file
    dir_path = os.path.dirname(os.path.realpath(__file__))
    config_path = os.path.join(dir_path, "experiment", FLAGS.config)
    if not os.path.exists(config_path):
        raise FileNotFoundError
    if not os.path.exists(FLAGS.data_path):
        raise FileNotFoundError

    with open(config_path, "r") as f:
        config = json.load(f)
    config["gpu"] = torch.cuda.is_available()

    ## Load data
    df = dl.load_raw_text_file(FLAGS.data_path, num_examples=30000)

    # Index languages for input and output
    inp_index = LanguageIndex(phrases=df["es"].values)
    targ_index = LanguageIndex(df["eng"].values)
    vocab_inp_size = len(inp_index.word2idx)
    vocab_tar_size = len(targ_index.word2idx)

    # Convert sentences into tokenized tensors
    input_tensor, target_tensor = dl.convert_tensor(df, inp_index, targ_index)

    # Split into training and test set
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
        input_tensor, target_tensor, test_size=0.2)

    train_dataset = MyData(input_tensor_train, target_tensor_train)
    val_dataset = MyData(input_tensor_val, target_tensor_val)

    # Convert to DataLoader objects
    train_dataset = data.DataLoader(train_dataset,
                                    batch_size=config['batch_size'],
                                    drop_last=True,
                                    shuffle=True)
    eval_dataset = data.DataLoader(val_dataset,
                                   batch_size=config['batch_size'],
                                   drop_last=False,
                                   shuffle=True)

    # Models
    model = Seq2Seq(config, vocab_inp_size, vocab_tar_size)
    scorer = create_scorer(config['metrics'])

    if config['gpu']:
        model = model.cuda()

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.get("learning_rate", .001))

    for name, param in model.named_parameters():
        if 'bias' in name:
            torch.nn.init.constant_(param, 0.0)
        elif 'weight' in name:
            torch.nn.init.xavier_normal_(param)
    print("Weights initialized")

    ## Train and evaluate over epochs
    all_train_avg_loss = []
    all_eval_avg_loss = []
    all_eval_avg_acc = []

    for epoch in range(FLAGS.epochs):
        run_state = (epoch, FLAGS.epochs)

        # Train needs to return model and optimizer, otherwise the model keeps
        # restarting from zero at every epoch
        model, optimizer, train_avg_loss = train(model, optimizer, train_dataset,
                                                 run_state, config['debug'])
        all_train_avg_loss.append(train_avg_loss)

        # Return validation-set loss and accuracy
        eval_avg_loss, eval_acc = evaluate(model, eval_dataset, targ_index, scorer,
                                           config['debug'])
        all_eval_avg_loss.append(eval_avg_loss)
        all_eval_avg_acc.append(eval_acc)

        # Save model checkpoint
        checkpoint_dict = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': eval_avg_loss,
        }
        checkpoint_path = '{}/epoch_{:0.0f}_val_loss_{:0.3f}.pt'.format(
            FLAGS.model_checkpoint_dir, epoch, eval_avg_loss)
        torch.save(checkpoint_dict, checkpoint_path)

    # Export model learning-curve info
    df = pd.DataFrame({
        'epoch': range(FLAGS.epochs),
        'train_loss': all_train_avg_loss,
        'eval_loss': all_eval_avg_loss,
        'eval_acc': all_eval_avg_acc
    })
    now = datetime.now()
    current_time = now.strftime("%Y%m%d%H%M%S")
    export_path = '{}/{}_{:0.0f}_bz_{}_val_loss_{:0.3f}.csv'.format(
        FLAGS.metrics_dir, current_time, FLAGS.epochs, config['batch_size'], eval_avg_loss)
    df.to_csv(export_path, index=False)
def main(config_file):
    # read from config
    local_config = __import__(config_file)
    model_name = local_config.INPUTS['MODEL']
    model = getattr(__import__('.models', fromlist=[model_name]), model_name)
    batch_size = local_config.INPUTS['BATCHSIZE']
    optimizer_name = local_config.INPUTS['OPTIMIZER']
    optimizer = getattr(__import__('torch.optim', fromlist=[optimizer_name]), optimizer_name)
    num_epochs = local_config.INPUTS['EPOCHS']
    no_classes = local_config.INPUTS['CLASSES']
    learning_rate = local_config.INPUTS['LR']

    # logging
    start_time = time.time()
    date = time.strftime('%d-%m-%Y-%H-%M-%S', time.localtime())
    log_path = f'./birdsong/run_log/{model_name}_{date}'
    state_fname, log_fname, summ_tensor_board = logger.create_log(log_path)
    writer = SummaryWriter(str(summ_tensor_board))

    params = {'input_dir': INPUT_DIR,
              'batchsize': batch_size,
              'window': 5000,
              'stride': 2000,
              'spectrogram_func': mel_s,
              'augmentation_func': None}

    ds_test = SoundDataset(TEST, **params)
    ds_train = SoundDataset(TRAIN, **params)
    dl_test = DataLoader(ds_test, batch_size)
    dl_train = DataLoader(ds_train, batch_size)
    print('dataloaders initialized')

    time_axis = ds_test.shape[1]
    freq_axis = ds_test.shape[0]
    net = model(time_axis=time_axis, freq_axis=freq_axis, no_classes=no_classes)

    criterion = nn.CrossEntropyLoss()
    optimizer = optimizer(net.parameters(), lr=learning_rate)

    # local vars
    best_acc = 0

    for epoch in range(num_epochs):
        train(net, dl_train, epoch, optimizer, criterion, DEVICE)

        train_stats, train_conf_matrix = evaluate(net, dl_train, criterion, no_classes, DEVICE)
        print(f'Train Loss: {train_stats[0]:.5f}, Train Acc: {train_stats[1]:.5f}')

        test_stats, test_conf_matrix = evaluate(net, dl_test, criterion, no_classes, DEVICE)
        print(f'Test Loss: {test_stats[0]:.5f}, Test Acc: {test_stats[1]:.5f}')

        is_best = test_stats[1] > best_acc
        best_acc = max(test_stats[1], best_acc)
        print('Best Accuracy: {:.5f}'.format(best_acc))

        logger.save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': net.state_dict(),
            'best_accuracy': best_acc
        }, is_best, filename=state_fname)

        img_path = log_path + '/train' + '_' + str(epoch) + '.png'
        img = plot_conf_mat(img_path, train_conf_matrix)
        img_path = log_path + '/test' + '_' + str(epoch) + '.png'
        img = plot_conf_mat(img_path, test_conf_matrix)

        logger.write_summary(writer, epoch, train_stats, test_stats, img)
        logger.dump_log_txt(date, start_time, local_config, train_stats, test_stats,
                            best_acc, log_fname)

    writer.close()
    print('Finished Training')
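# Minimal sketch of the config module this entry point expects; the module name
# and all values below are illustrative assumptions, only the INPUTS keys read
# above are required.

# --- contents of a hypothetical my_config.py ---
INPUTS = {
    'MODEL': 'BirdConvNet',   # class name resolved from the local .models package (assumption)
    'BATCHSIZE': 32,
    'OPTIMIZER': 'Adam',      # attribute name resolved from torch.optim
    'EPOCHS': 50,
    'CLASSES': 10,
    'LR': 1e-3,
}

# The script would then be launched as: main('my_config')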
)

from collections import Counter
print(Counter(test_data.labels))

"""
Train
"""
fc_dim = train_data[0][0].shape[0]
# model = Net(fc_dim)
model = LogisticRegression(fc_dim)
print(f"Optimization parameters: {params['optimization']}")
trained_model = Train(model, weights, train_data, test_data, **params['optimization'])
print(trained_model.model)

test_gen = torch.utils.data.DataLoader(test_data, batch_size=test_data.__len__(), shuffle=False)
for test_data, test_labels in test_gen:
    evaluate(trained_model, test_data, test_labels)

# model.load_state_dict(torch.load(PATH))
def main(args):

    set_seed(SEED)

    train_transforms, test_transforms = get_transforms(args.dataset)
    print(f"Data transformations:\n{train_transforms}\n")

    # Get the dataloaders
    train_loader, test_loader = get_dataloaders(args.dataset, args.batch_size, args.workers,
                                                train_transforms, test_transforms)

    # Architecture
    if args.dataset == 'mnist':
        in_channels = 1
    else:
        raise NotImplementedError()
    if args.activation == 'relu':
        activation = nn.ReLU(inplace=True)
    else:
        raise NotImplementedError()
    if args.pooling == 'max':
        pooling = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
    else:
        raise NotImplementedError()
    drop_rate = args.drop_rate

    # Build model
    model = LeNet5(in_channels, activation, pooling, drop_rate)
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu)
        model = model.cuda()

    # Weight normal initialization
    if args.init_weights:
        model.apply(normal_initialization)

    # Loss function & optimizer
    if args.criterion == 'ce':
        criterion = nn.CrossEntropyLoss()
    else:
        raise NotImplementedError()

    if args.optimizer == 'sgd':  # Issue
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay, nesterov=args.nesterov)
    elif args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr,
                               weight_decay=args.weight_decay)
    else:
        raise NotImplementedError()

    # Resume (after the optimizer exists so its state can be restored)
    start_epoch = 0
    if args.resume is not None:
        model, optimizer, start_epoch = load_training_state(model, optimizer, args.resume)

    scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=0, threshold=1e-2, verbose=True)

    # Output folder
    output_folder = os.path.join(args.output_folder, args.training_name)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    log_path = os.path.join(args.output_folder, 'logs', args.training_name)
    if os.path.exists(log_path):
        rmtree(log_path)
    logger = SummaryWriter(log_path)

    # Train
    best_loss = math.inf
    mb = master_bar(range(args.nb_epochs))
    for epoch_idx in mb:
        # Training
        train_epoch(model, train_loader, optimizer, criterion, mb,
                    tb_logger=logger, epoch=start_epoch + epoch_idx)

        # Evaluation
        val_loss, accuracy = evaluate(model, test_loader, criterion)

        mb.first_bar.comment = f"Epoch {start_epoch+epoch_idx+1}/{start_epoch+args.nb_epochs}"
        mb.write(f'Epoch {start_epoch+epoch_idx+1}/{start_epoch+args.nb_epochs} - '
                 f'Validation loss: {val_loss:.4} (Acc@1: {accuracy:.2%})')

        # State saving
        if val_loss < best_loss:
            print(f"Validation loss decreased {best_loss:.4} --> {val_loss:.4}: saving state...")
            best_loss = val_loss
            torch.save(dict(epoch=start_epoch + epoch_idx,
                            model_state_dict=model.state_dict(),
                            optimizer_state_dict=optimizer.state_dict(),
                            val_loss=val_loss),
                       os.path.join(output_folder, "training_state.pth"))

        if logger is not None:
            current_iter = (start_epoch + epoch_idx + 1) * len(train_loader)
            logger.add_scalar(f"Validation loss", val_loss, current_iter)
            logger.add_scalar(f"Error rate", 1 - accuracy, current_iter)
            logger.flush()

        scheduler.step(val_loss)
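# Invocation sketch (not from the original script): the Namespace fields mirror
# the args attributes read above; the values are illustrative assumptions.
from argparse import Namespace

args = Namespace(
    dataset='mnist', batch_size=128, workers=4,
    activation='relu', pooling='max', drop_rate=0.0,
    gpu=0, init_weights=True, resume=None,
    criterion='ce', optimizer='sgd', lr=0.01, momentum=0.9,
    weight_decay=5e-4, nesterov=True,
    output_folder='outputs', training_name='lenet5_mnist', nb_epochs=10,
)
main(args)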
def run():
    USE_CUDA = torch.cuda.is_available()

    config_path = os.path.join("experiments", FLAGS.config)

    if not os.path.exists(config_path):
        raise FileNotFoundError

    with open(config_path, "r") as f:
        config = json.load(f)
    config["gpu"] = torch.cuda.is_available()

    if "wancong" in config_path:
        dataset = PolyDataset()
        eval_dataset = PolyDataset("test_small.txt")
    else:
        dataset = ToyDataset(5, 15)
        eval_dataset = ToyDataset(5, 15, type='eval')

    BATCHSIZE = 30
    train_loader = data.DataLoader(dataset, batch_size=BATCHSIZE, shuffle=False,
                                   collate_fn=pad_collate, drop_last=True)
    eval_loader = data.DataLoader(eval_dataset, batch_size=BATCHSIZE, shuffle=False,
                                  collate_fn=pad_collate, drop_last=True)
    config["batch_size"] = BATCHSIZE

    # Models
    model = Seq2Seq(config)
    print(f"total number of parameters: {count_parameters(model)}")

    if USE_CUDA:
        model = model.cuda()

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.get("learning_rate", .001))

    print("=" * 60)
    print(model)
    print("=" * 60)
    for k, v in sorted(config.items(), key=lambda i: i[0]):
        print(" (" + k + ") : " + str(v))
    print()
    print("=" * 60)

    print("\nInitializing weights...")
    for name, param in model.named_parameters():
        if 'bias' in name:
            torch.nn.init.constant_(param, 0.0)
        elif 'weight' in name:
            torch.nn.init.xavier_normal_(param)

    for epoch in range(FLAGS.epochs):
        run_state = (epoch, FLAGS.epochs, FLAGS.train_size)

        # Train needs to return model and optimizer, otherwise the model keeps
        # restarting from zero at every epoch
        model, optimizer = train(model, optimizer, train_loader, run_state)
        evaluate(model, eval_loader)
    return train_iterator, valid_iterator, test_iterator, text_field.vocab


# load data
print("\nLoading data...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
text_field = data.Field(tokenize='spacy', lower=True, include_lengths=True, batch_first=True)
label_field = data.Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
docid_field = data.RawField()
train_iter, dev_iter, test_iter, text_voca = fall_data(docid_field, text_field, label_field,
                                                       device=-1, repeat=False)

# train or eval
if args.test:
    best_model = model.LSTM().to(device)
    optimizer = optim.Adam(best_model.parameters(), lr=args.lr)
    t.load_checkpoint(destination_folder + '/model.pt', best_model, optimizer)
    t.evaluate(best_model, test_iter)
else:
    print('start training')
    wandb.init()
    model = model.LSTM(text_voca).to(device)
    wandb.watch(model)  # watch the instantiated network, not the module it comes from
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    eval_every = len(train_iter) // 2
    t.train(model=model, optimizer=optimizer, train_loader=train_iter, valid_loader=dev_iter,
            num_epochs=args.epochs, eval_every=eval_every, file_path=destination_folder,
            device=device)
def do_train(model, tdata, vdata, measures):
    start_time = time.time()
    input_size = len(_g.vocab)
    # TODO parameter sanity checker
    torch.save(model, _g.args.path + '/' + _g.args.filename)
    if not _g.args.quiet:
        print('Training...')
    optimizer = optim.Adam(model.parameters(), lr=_g.args.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, _g.args.decay, _g.args.decay_factor)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.85, patience=30)
    criterion = nn.NLLLoss(ignore_index=_g.vocab.stoi[_g.padding_symbol], reduce=False)
    best_eval_loss = None
    t_prints, e_prints = None, None
    epoch_times = None
    saving = False
    try:
        for epoch, (tbatch, vbatch) in enumerate(zip(tdata, vdata)):
            if _g.args.time:
                if time.time() - start_time > _g.args.time:
                    break
            elif epoch == _g.args.epochs:
                break
            epoch_start_time = time.time()
            t_losses = _t.tbtt(model, criterion, optimizer,
                               _u.to_one_hot(tbatch.before, input_size), tbatch.after)
            t_losses = t_losses.unsqueeze(dim=1)
            e_losses = _t.evaluate(model, criterion,
                                   _u.to_one_hot(vbatch.before, input_size), vbatch.after, measures)
            e_losses = e_losses.unsqueeze(dim=1)
            t_prints = t_losses if t_prints is None else torch.cat((t_prints, t_losses), dim=1)
            e_prints = e_losses if e_prints is None else torch.cat((e_prints, e_losses), dim=1)
            epoch_end_time = time.time()
            epoch_time = torch.tensor(epoch_end_time - epoch_start_time)
            epoch_times = epoch_time if epoch_times is None else torch.stack((epoch_times, epoch_time), dim=0)
            if (epoch + 1) % _g.args.print_every == 0:
                t_prints = t_prints.mean(dim=1)
                e_prints = e_prints.mean(dim=1)
                if not _g.args.quiet:
                    _u.pretty_print(_g.args.epochs, _g.args.time, epoch + 1,
                                    epoch_end_time - start_time,
                                    _u.to_builtin(epoch_times.mean()),
                                    _u.to_builtin(torch.chunk(t_prints, 2)),
                                    _u.to_builtin({n: (x, y) for n, x, y in
                                                   zip(['loss'] + list(measures.keys()),
                                                       e_prints[::2], e_prints[1::2])}))
                t_prints, e_prints = None, None
            if not best_eval_loss or e_losses[0].item() < best_eval_loss:
                saving = True
                best_eval_loss = e_losses[0].item()
                torch.save(model, _g.args.path + '/' + _g.args.filename)
                saving = False
            scheduler.step()
        if not _g.args.quiet:
            print('Training done successfully')
    except KeyboardInterrupt:
        print('\nExiting earlier than expected. Wait a moment!')
        if saving:
            # In case it was interrupted while saving
            torch.save(model, _g.args.path + '/' + _g.args.filename)
def run():
    USE_CUDA = torch.cuda.is_available()
    FLAGS.config = 'example_seq2seq.json'
    config_path = os.path.join("experiments", FLAGS.config)
    print(FLAGS.config)

    if not os.path.exists(config_path):
        raise FileNotFoundError

    with open(config_path, "r") as f:
        config = json.load(f)
    config["gpu"] = torch.cuda.is_available()

    writer = SummaryWriter('experiments/finally')

    # dataset = ToyDataset(5, 15)
    # eval_dataset = ToyDataset(5, 15, type='eval')
    dataset = Toy_Numbers(10)
    eval_dataset = Toy_Numbers(10, train=False)
    BATCHSIZE = 32
    train_loader = data.DataLoader(dataset, batch_size=BATCHSIZE, shuffle=False,
                                   collate_fn=pad_collate, drop_last=True)
    eval_loader = data.DataLoader(eval_dataset, batch_size=BATCHSIZE, shuffle=False,
                                  collate_fn=pad_collate, drop_last=True)
    config["batch_size"] = BATCHSIZE

    # Models
    model = Seq2Seq(config)
    model = model.float()

    # dataiter = iter(train_loader)
    # sample_input = dataiter.next()
    # writer.add_graph(model, sample_input)
    # writer.close()

    if USE_CUDA:
        model = model.cuda()

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.get("learning_rate", .001))

    print("=" * 60)
    print(model)
    print("=" * 60)
    for k, v in sorted(config.items(), key=lambda i: i[0]):
        print(" (" + k + ") : " + str(v))
    print()
    print("=" * 60)

    print("\nInitializing weights...")
    for name, param in model.named_parameters():
        if 'bias' in name:
            torch.nn.init.constant_(param, 0.0)
        elif 'weight' in name:
            torch.nn.init.xavier_normal_(param)

    for epoch in range(FLAGS.epochs):
        run_state = (epoch, FLAGS.epochs, FLAGS.train_size)

        # Train needs to return model and optimizer, otherwise the model keeps
        # restarting from zero at every epoch
        model, optimizer = train(model, optimizer, train_loader, run_state, writer)
        # print("losses", l_list)
        # for i in l_list:
        #     print(i)
        evaluate(model, eval_loader, writer)
def run(args):

    print('\nSettings: \n', args, '\n')

    args.model_signature = str(dt.datetime.now())[0:19].replace(' ', '_')
    args.model_signature = args.model_signature.replace(':', '_')

    ########## Find GPUs
    (gpu_config, n_gpu_used) = set_gpus(args.n_gpu)

    ########## Data, model, and optimizer setup
    mnist = MNIST(args)

    x = tf.placeholder(tf.float32, [None, 28, 28, 1])

    if args.model == 'hvae':
        if not args.K:
            raise ValueError('Must set number of flow steps when using HVAE')
        elif not args.temp_method:
            raise ValueError('Must set tempering method when using HVAE')
        model = HVAE(args, mnist.avg_logit)
    elif args.model == 'cnn':
        model = VAE(args, mnist.avg_logit)
    else:
        raise ValueError('Invalid model choice')

    elbo = model.get_elbo(x, args)
    nll = model.get_nll(x, args)

    optimizer = AdamaxOptimizer(learning_rate=args.learn_rate, eps=args.adamax_eps)
    opt_step = optimizer.minimize(-elbo)

    ########## Tensorflow and saver setup
    sess = tf.Session(config=gpu_config)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    savepath = os.path.join(args.checkpoint_dir, args.model_signature, 'model.ckpt')

    if not os.path.exists(args.checkpoint_dir):
        os.makedirs(args.checkpoint_dir)

    ########## Test that GPU memory is sufficient
    if n_gpu_used > 0:
        try:
            x_test = mnist.next_test_batch()
            (t_e, t_n) = sess.run((elbo, nll), {x: x_test})
            mnist.batch_idx_test = 0  # Reset batch counter if it works
        except:
            raise MemoryError("""
            Likely insufficient GPU memory
            Reduce test batch by lowering the -tbs parameter
            """)

    ########## Training Loop
    train_elbo_hist = []
    val_elbo_hist = []

    # For early stopping
    best_elbo = -np.inf
    es_epochs = 0
    epoch = 0

    train_times = []

    for epoch in range(1, args.epochs + 1):
        t0 = time.time()
        train_elbo = train(epoch, mnist, opt_step, elbo, x, args, sess)
        train_elbo_hist.append(train_elbo)
        train_times.append(time.time() - t0)
        print('One epoch took {:.2f} seconds'.format(time.time() - t0))

        val_elbo = validate(mnist, elbo, x, sess)
        val_elbo_hist.append(val_elbo)

        if val_elbo > best_elbo:
            # Save the model that currently generalizes best
            es_epochs = 0
            best_elbo = val_elbo
            saver.save(sess, savepath)
            best_model_epoch = epoch
        elif args.early_stopping_epochs > 0:
            es_epochs += 1
            if es_epochs >= args.early_stopping_epochs:
                print('***** STOPPING EARLY ON EPOCH {} of {} *****'.format(epoch, args.epochs))
                break

        print('--> Early stopping: {}/{} (Best ELBO: {:.4f})'.format(
            es_epochs, args.early_stopping_epochs, best_elbo))
        print('\t Current val ELBO: {:.4f}\n'.format(val_elbo))

        if np.isnan(val_elbo):
            raise ValueError('NaN encountered!')

    train_times = np.array(train_times)
    mean_time = np.mean(train_times)
    std_time = np.std(train_times)
    print('Average train time per epoch: {:.2f} +/- {:.2f}'.format(mean_time, std_time))

    ########## Evaluation
    # Restore the best-performing model
    saver.restore(sess, savepath)

    test_elbos = np.zeros(args.n_nll_runs)
    test_nlls = np.zeros(args.n_nll_runs)

    for i in range(args.n_nll_runs):
        print('\n---- Test run {} of {} ----\n'.format(i + 1, args.n_nll_runs))
        (test_elbos[i], test_nlls[i]) = evaluate(mnist, elbo, nll, x, args, sess)

    mean_elbo = np.mean(test_elbos)
    std_elbo = np.std(test_elbos)
    mean_nll = np.mean(test_nlls)
    std_nll = np.std(test_nlls)

    print('\nTest ELBO: {:.2f} +/- {:.2f}'.format(mean_elbo, std_elbo))
    print('Test NLL: {:.2f} +/- {:.2f}'.format(mean_nll, std_nll))

    ########## Logging, Saving, and Plotting
    with open(args.logfile, 'a') as ff:
        print('----------------- Test ID {} -----------------'.format(args.model_signature), file=ff)
        print(args, file=ff)
        print('Stopped after {} epochs'.format(epoch), file=ff)
        print('Best model from epoch {}'.format(best_model_epoch), file=ff)
        print('Average train time per epoch: {:.2f} +/- {:.2f}'.format(mean_time, std_time), file=ff)
        print('FINAL VALIDATION ELBO: {:.2f}'.format(val_elbo_hist[-1]), file=ff)
        print('Test ELBO: {:.2f} +/- {:.2f}'.format(mean_elbo, std_elbo), file=ff)
        print('Test NLL: {:.2f} +/- {:.2f}\n'.format(mean_nll, std_nll), file=ff)

    if not os.path.exists(args.pickle_dir):
        os.makedirs(args.pickle_dir)

    train_dict = {
        'train_elbo': train_elbo_hist,
        'val_elbo': val_elbo_hist,
        'args': args
    }
    pickle.dump(train_dict,
                open(os.path.join(args.pickle_dir, args.model_signature + '.p'), 'wb'))

    if not os.path.exists(args.plot_dir):
        os.makedirs(args.plot_dir)

    tf_gen_samples = model.get_samples(args)
    np_gen_samples = sess.run(tf_gen_samples)
    plot_digit_samples(np_gen_samples, args)

    plot_training_curve(train_elbo_hist, val_elbo_hist, args)

    ########## Email notification upon test completion
    try:
        msg_text = """Test completed for ID {0}.

        Parameters: {1}

        Test ELBO: {2:.2f} +/- {3:.2f}
        Test NLL: {4:.2f} +/- {5:.2f}
        """.format(args.model_signature, args, mean_elbo, std_elbo, mean_nll, std_nll)

        msg = MIMEText(msg_text)
        msg['Subject'] = 'Test ID {0} Complete'.format(args.model_signature)
        msg['To'] = args.receiver
        msg['From'] = args.sender

        s = smtplib.SMTP('localhost')
        s.sendmail(args.sender, [args.receiver], msg.as_string())
        s.quit()
    except:
        print('Unable to send email from sender {0} to receiver {1}'.format(args.sender, args.receiver))