from time import time

from sklearn.model_selection import GridSearchCV


def grid_search(pipeline, parameters, train_path, test_path):
    # `parameters` is passed in explicitly; the original relied on a
    # module-level global of the same name.
    X_train, y_train = load_dataset(train_path)
    X_test, y_test = load_dataset(test_path)
    # Labels arrive wrapped in sequences here; take the first element.
    target_names = list(set([i[0] for i in y_train]))
    print("%d documents (training set)" % len(X_train))
    print("%d documents (test set)" % len(X_test))
    print("%d categories" % len(target_names))
    print()

    # GridSearchCV requires at least 2 folds; cv=1 raises a ValueError.
    gridsearch = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=1)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    print(parameters)
    t0 = time()
    gridsearch.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best dev score: %0.3f" % gridsearch.best_score_)
    print("Best parameters set:")
    best_parameters = gridsearch.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print("Best test score: %0.3f" % gridsearch.score(X_test, y_test))
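# Usage sketch (illustrative, not from the source): a pipeline and parameter
# grid of the shape grid_search() above expects. The step names, grid values,
# and dataset paths are assumptions.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

example_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LinearSVC()),
])
example_parameters = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],  # unigrams vs. unigrams+bigrams
    "clf__C": [0.1, 1.0, 10.0],              # SVM regularization strength
}
# grid_search(example_pipeline, example_parameters, "train.xlsx", "test.xlsx")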
import itertools
from time import time

from sklearn.svm import LinearSVC


def grid_search(parameters, train_path, test_path):
    result = {
        'accuracy': None,
        'precision': None,
        'recall': None,
        'f1_score': None
    }
    report_columns = list(parameters.keys()) + list(result.keys())
    report_log(report_columns)

    print("Load data")
    X_train, y_train = load_dataset(train_path)
    X_test, y_test = load_dataset(test_path)
    train_corpus = [[y, x] for x, y in zip(X_train, y_train)]
    test_corpus = [[y, x] for x, y in zip(X_test, y_test)]
    train_tagged = convert_to_tagged(train_corpus)
    test_tagged = convert_to_tagged(test_corpus)
    target_names = list(set(y_train))
    print("%d documents (training set)" % len(X_train))
    print("%d documents (test set)" % len(X_test))
    print("%d categories" % len(target_names))
    print()

    # Exhaustively enumerate every combination of the parameter grid.
    model_count = len(list(itertools.product(*parameters.values())))
    print("Start optimizing hyperparameters")
    for i, parameter_set in enumerate(itertools.product(*parameters.values())):
        print("\nTraining model %d/%d" % (i + 1, model_count))
        params = dict(zip(parameters.keys(), parameter_set))

        t0 = time()
        transformer = get_model(train_tagged, params)
        # Doc2Vec.iter was removed in gensim 4.x; `epochs` is the current name.
        transformer.train(train_tagged,
                          total_examples=transformer.corpus_count,
                          epochs=transformer.epochs)
        print("finish training doc2vec")
        X_train_vec, y_train_label = vec_for_learning(transformer, train_tagged)
        X_test_vec, y_test_label = vec_for_learning(transformer, test_tagged)

        print("start training classifier")
        model = LinearSVC()
        estimator = model.fit(X_train_vec, y_train_label)
        train_time = time() - t0
        print("train time: %dm %0.3fs" % (train_time // 60, train_time % 60))

        t0 = time()
        y_pred = estimator.predict(X_test_vec)
        test_time = time() - t0
        print("test time: %dm %0.3fs" % (test_time // 60, test_time % 60))

        result = get_metrics(y_test_label, y_pred)
        report_log(list(params.values()) + list(result.values()))
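# A minimal sketch (an assumption, not the project's actual helpers) of the
# two functions used above, following the common gensim Doc2Vec pattern: each
# [label, text] pair becomes a TaggedDocument, and the trained model infers a
# vector per document for the downstream classifier.
from gensim.models.doc2vec import TaggedDocument


def convert_to_tagged_sketch(corpus):
    # corpus: list of [label, text] pairs, as built in grid_search() above
    return [TaggedDocument(words=text.split(), tags=[label])
            for label, text in corpus]


def vec_for_learning_sketch(model, tagged_docs):
    labels, vectors = zip(*[(doc.tags[0], model.infer_vector(doc.words))
                            for doc in tagged_docs])
    return list(vectors), list(labels)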
    return regressors, targets


# --- command-line mode dispatch ---
if args.mode == "train-test":
    if not (args.train and args.test):
        parser.error("Mode train-test requires --train and --test")
    if not args.s:
        parser.error("Mode train-test requires --s")
    train_path = os.path.abspath(args.train)
    test_path = os.path.abspath(args.test)
    model_path = os.path.abspath(args.s)

    print("Train model")
    print("Load data")
    X_train, y_train = load_dataset(train_path)
    X_test, y_test = load_dataset(test_path)
    train_corpus = [[y, x] for x, y in zip(X_train, y_train)]
    test_corpus = [[y, x] for x, y in zip(X_test, y_test)]
    train_tagged = convert_to_tagged(train_corpus)
    test_tagged = convert_to_tagged(test_corpus)
    target_names = list(set(y_train))
    print("%d documents (training set)" % len(X_train))
    print("%d documents (test set)" % len(X_test))
    print("%d categories" % len(target_names))
    print()
    print("Training model")
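# Sketch of the argument parser the mode check above assumes. The flag names
# come from the code; the help texts are guesses, and any other modes the
# script supports are omitted.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--mode", required=True,
                    help="run mode, e.g. train-test")
parser.add_argument("--train", help="path to the training dataset")
parser.add_argument("--test", help="path to the test dataset")
parser.add_argument("--s", help="path where the trained model is saved")
args = parser.parse_args()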
import pickle
from os.path import join
from time import time

from sklearn.metrics import confusion_matrix

from util.model_evaluation import get_metrics, plot_confusion_matrix

# Load the fitted transformers and the classifier from the snapshot files.
# `with` blocks close the file handles after unpickling; the original left
# them open. `cwd` is assumed to be set earlier in the script.
t0 = time()
with open(join(cwd, "snapshots", "x_transformer.pkl"), "rb") as f:
    x_transformer = pickle.load(f)
with open(join(cwd, "snapshots", "y_transformer.pkl"), "rb") as f:
    y_transformer = pickle.load(f)
with open(join(cwd, "snapshots", "ch2.pkl"), "rb") as f:
    ch2 = pickle.load(f)
with open(join(cwd, "snapshots", "model.pkl"), "rb") as f:
    estimator = pickle.load(f)
duration = time() - t0
print("Load model time: %0.3fs" % duration)

test_path = join(cwd, "data", "test.xlsx")
X_test, y_test = load_dataset(test_path)

# Flatten the nested labels, vectorize, apply chi-squared feature selection,
# predict, and map the predicted ids back to label names.
t0 = time()
y_test = [item for sublist in y_test for item in sublist]
X = x_transformer.transform(X_test)
X = ch2.transform(X)
y = estimator.predict(X)
y_pred = y_transformer.inverse_transform(y)
duration = time() - t0
print("Predict time: %0.3fs" % duration)

get_metrics(y_test, y_pred)
classes = set(y_test)
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes, normalize=False)
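# For symmetry, a sketch of the save side that would produce the snapshots
# loaded above. This is an assumption, not shown in the source: each fitted
# object is pickled under the same file name the evaluation script reads back.
import pickle
from os.path import join


def save_snapshots_sketch(cwd, x_transformer, y_transformer, ch2, estimator):
    for name, obj in [("x_transformer", x_transformer),
                      ("y_transformer", y_transformer),
                      ("ch2", ch2),
                      ("model", estimator)]:
        with open(join(cwd, "snapshots", "%s.pkl" % name), "wb") as f:
            pickle.dump(obj, f)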
def run(args):
    print('\nMODEL SETTINGS: \n', args, '\n')
    print("Random Seed: ", args.manual_seed)

    train_loader, val_loader, test_loader, args = load_dataset(args)

    encoder = MLP_encoder(args)
    decoder = MLP_decoder(args)
    if args.flow == "planar":
        model = VAE.PlanarVAE(encoder, decoder, args)
    elif args.flow == "NICE":  # NICE-planar
        model = VAE.NICEVAE_amor(encoder, decoder, args)
    elif args.flow == "NICE_MLP":
        model = VAE.NICEVAE(encoder, decoder, args)
    elif args.flow == "syl_orthogonal":
        model = VAE.Sylvester_ortho_VAE(encoder, decoder, args)
    elif args.flow == "real":
        model = VAE.RealNVPVAE(encoder, decoder, args)
    else:
        # Fail fast instead of leaving `model` unbound on an unknown flow.
        raise ValueError("Unknown flow type: %s" % args.flow)

    if args.vampprior:
        # Draw one random batch from the training set to initialize the
        # VampPrior pseudo-inputs (or learn them from scratch when
        # data_as_pseudo is off).
        load = torch.utils.data.DataLoader(train_loader.dataset,
                                           batch_size=args.num_pseudos,
                                           shuffle=True)
        pseudo_inputs = next(iter(load))[0] if args.data_as_pseudo else None
        model.init_pseudoinputs(pseudo_inputs)

    if args.cuda:
        print("Model on GPU")
        model.cuda()
    print(model)

    optimizer = optim.RMSprop(model.parameters(), lr=args.learning_rate,
                              momentum=0.9)

    #### Training
    train_loss = []
    val_loss = []
    t = time.time()
    for epoch in range(1, args.epochs + 1):
        tr_loss = train(epoch, train_loader, model, optimizer, args)
        train_loss.append(tr_loss.mean())
        v_loss = evaluate(val_loader, model, args)
        val_loss.append(v_loss)

    train_loss = np.hstack(train_loss)
    val_loss = np.hstack(val_loss)
    # plot_training_curve(train_loss, val_loss)
    results = {
        "train_loss": train_loss.tolist(),
        "val_loss": val_loss.tolist()
    }

    #### Testing
    validation_loss = evaluate(val_loader, model, args)
    test_loss, log_likelihood = evaluate(test_loader, model, args, testing=True)
    results["ELBO"] = test_loss
    results["log_likelihood"] = log_likelihood
    elapsed = time.time() - t
    results["Running time"] = elapsed

    # Save the results (out_dir is assumed to end with a path separator).
    json_dir = args.out_dir + f"{args.flow}perm_k_{args.num_flows}_RMSProp_lr{args.learning_rate}_4"
    print("Saving data at: " + json_dir)
    output_folder = pathlib.Path(json_dir)
    output_folder.mkdir(parents=True, exist_ok=True)
    results_json = json.dumps(results, indent=4, sort_keys=True)
    (output_folder / "results.json").write_text(results_json)
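# Usage sketch (illustrative): reading back the results.json written by run().
# The directory name mirrors the f-string above; the flow, num_flows, and
# learning-rate values in the path are placeholders, not from the source.
import json
import pathlib

out = pathlib.Path("out/planarperm_k_4_RMSProp_lr0.0005_4/results.json")
if out.exists():
    results = json.loads(out.read_text())
    print("test ELBO:", results["ELBO"])
    print("log-likelihood:", results["log_likelihood"])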