Example #1
from time import time

from sklearn.model_selection import GridSearchCV


# `load_dataset` is a project-local helper; `parameters` (the search grid)
# is read from module scope; see the sketch after this example.
def grid_search(pipeline, train_path, test_path):
    X_train, y_train = load_dataset(train_path)
    X_test, y_test = load_dataset(test_path)
    target_names = list(set([i[0] for i in y_train]))
    print("%d documents (training set)" % len(X_train))
    print("%d documents (test set)" % len(X_test))
    print("%d categories" % len(target_names))
    print()

    # cv=1 is invalid in scikit-learn (k-fold needs at least 2 splits); use e.g. cv=3.
    gridsearch = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    print(parameters)
    t0 = time()
    gridsearch.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()
    print("Best dev score: %0.3f" % gridsearch.best_score_)
    print("Best parameters set:")
    best_parameters = gridsearch.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print("Best test score: %0.3f" % gridsearch.score(X_test, y_test))
Example #2

import itertools
from time import time

from sklearn.svm import LinearSVC


# load_dataset, convert_to_tagged, get_model, vec_for_learning, get_metrics
# and report_log are project-local helpers.
def grid_search(parameters, train_path, test_path):
    result = {
        'accuracy': None,
        'precision': None,
        'recall': None,
        'f1_score': None
    }
    report_columns = list(parameters.keys()) + list(result.keys())
    report_log(report_columns)

    print("Load data")
    X_train, y_train = load_dataset(train_path)
    X_test, y_test = load_dataset(test_path)
    train_corpus = [[y, x] for x, y in zip(X_train, y_train)]
    test_corpus = [[y, x] for x, y in zip(X_test, y_test)]

    train_tagged = convert_to_tagged(train_corpus)
    test_tagged = convert_to_tagged(test_corpus)

    target_names = list(set(y_train))

    print("%d documents (training set)" % len(X_train))
    print("%d documents (test set)" % len(X_test))
    print("%d categories" % len(target_names))
    print()

    model_count = len(list(itertools.product(*parameters.values())))
    print("Start optimizing hyperparameters")
    for i, parameter_set in enumerate(itertools.product(*parameters.values())):
        print("\nTraining model %d/%d" % (i + 1, model_count))
        params = dict(zip(parameters.keys(), parameter_set))
        t0 = time()
        transformer = get_model(train_tagged, params)

        # gensim 4.x removed the old `iter` attribute in favor of `epochs`.
        transformer.train(train_tagged,
                          total_examples=transformer.corpus_count,
                          epochs=transformer.epochs)

        print("finish training doc2vec")

        X_train_vec, y_train_label = vec_for_learning(transformer,
                                                      train_tagged)
        X_test_vec, y_test_label = vec_for_learning(transformer, test_tagged)

        print("start training classifier")
        model = LinearSVC()

        estimator = model.fit(X_train_vec, y_train_label)
        train_time = time() - t0
        print("train time: %dm %0.3fs" % (train_time / 60, train_time - 60 *
                                          (train_time // 60)))

        t0 = time()
        y_pred = estimator.predict(X_test_vec)
        test_time = time() - t0
        print("test time: %dm %0.3fs" % (test_time / 60, test_time - 60 *
                                         (test_time // 60)))

        result = get_metrics(y_test_label, y_pred)
        report_log(list(params.values()) + list(result.values()))
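
A minimal sketch of a parameters grid this Doc2Vec variant could iterate over. The keys here are hypothetical and must match whatever the project's get_model helper expects:

# Hypothetical Doc2Vec hyperparameter grid; every combination is trained once.
parameters = {
    "vector_size": [100, 300],
    "window": [5, 10],
    "min_count": [1, 2],
}

grid_search(parameters, "data/train.xlsx", "data/test.xlsx")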
Example #3
    return regressors, targets


if args.mode == "train-test":
    if not (args.train and args.test):
        parser.error("Mode train-test requires --train and --test")
    if not args.s:
        parser.error("Mode train-test requires --s")
    train_path = os.path.abspath(args.train)
    test_path = os.path.abspath(args.test)
    model_path = os.path.abspath(args.s)

    print("Train model")

    print("Load data")
    X_train, y_train = load_dataset(train_path)
    X_test, y_test = load_dataset(test_path)
    train_corpus = [[y, x] for x, y in zip(X_train, y_train)]
    test_corpus = [[y, x] for x, y in zip(X_test, y_test)]

    train_tagged = convert_to_tagged(train_corpus)
    test_tagged = convert_to_tagged(test_corpus)

    target_names = list(set(y_train))

    print("%d documents (training set)" % len(X_train))
    print("%d documents (test set)" % len(X_test))
    print("%d categories" % len(target_names))
    print()

    print("Training model")
Example #4
import os
import pickle
from os.path import join
from time import time

from sklearn.metrics import confusion_matrix

from util.model_evaluation import get_metrics, plot_confusion_matrix

# Assumed here: the original defines `cwd` elsewhere; os.getcwd() stands in for it.
cwd = os.getcwd()

t0 = time()
# Load the pickled transformers, feature selector and classifier;
# `with` blocks ensure the snapshot files are closed after loading.
with open(join(cwd, "snapshots", "x_transformer.pkl"), "rb") as f:
    x_transformer = pickle.load(f)
with open(join(cwd, "snapshots", "y_transformer.pkl"), "rb") as f:
    y_transformer = pickle.load(f)
with open(join(cwd, "snapshots", "ch2.pkl"), "rb") as f:
    ch2 = pickle.load(f)
with open(join(cwd, "snapshots", "model.pkl"), "rb") as f:
    estimator = pickle.load(f)
duration = time() - t0
print("Load model time: %0.3fs" % duration)

test_path = join(cwd, "data", "test.xlsx")
X_test, y_test = load_dataset(test_path)
t0 = time()
y_test = [item for sublist in y_test for item in sublist]  # flatten nested label lists
X = x_transformer.transform(X_test)
X = ch2.transform(X)
y = estimator.predict(X)
y_pred = y_transformer.inverse_transform(y)
duration = time() - t0
print("Predict time: %0.3fs" % duration)
get_metrics(y_test, y_pred)

# confusion_matrix orders labels by sorted value, so sort the class names to match.
classes = sorted(set(y_test))
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm,
                      classes,
                      normalize=False)
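
For completeness, a sketch of the saving side that would produce the snapshots loaded above. The file names mirror the loads, but this step is assumed rather than shown in the original:

# Hypothetical counterpart that writes the pickled snapshots.
def save_snapshots(x_transformer, y_transformer, ch2, estimator):
    for name, obj in [("x_transformer", x_transformer),
                      ("y_transformer", y_transformer),
                      ("ch2", ch2),
                      ("model", estimator)]:
        with open(join(cwd, "snapshots", name + ".pkl"), "wb") as f:
            pickle.dump(obj, f)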
Example #5
def run(args):

    print('\nMODEL SETTINGS: \n', args, '\n')
    print("Random Seed: ", args.manual_seed)

    train_loader, val_loader, test_loader, args = load_dataset(args)

    encoder = MLP_encoder(args)
    decoder = MLP_decoder(args)
    if args.flow == "planar":
        model = VAE.PlanarVAE(encoder, decoder, args)
    elif args.flow == "NICE":  # NICE-planar
        model = VAE.NICEVAE_amor(encoder, decoder, args)
    elif args.flow == "NICE_MLP":
        model = VAE.NICEVAE(encoder, decoder, args)
    elif args.flow == "syl_orthogonal":
        model = VAE.Sylvester_ortho_VAE(encoder, decoder, args)
    elif args.flow == "real":
        model = VAE.RealNVPVAE(encoder, decoder, args)

    if args.vampprior:
        load = torch.utils.data.DataLoader(train_loader.dataset,
                                           batch_size=args.num_pseudos,
                                           shuffle=True)
        pseudo_inputs = next(iter(load))[0] if args.data_as_pseudo else None
        model.init_pseudoinputs(pseudo_inputs)

    if args.cuda:
        print("Model on GPU")
        model.cuda()

    print(model)

    optimizer = optim.RMSprop(model.parameters(),
                              lr=args.learning_rate,
                              momentum=0.9)

    #### Training
    train_loss = []
    val_loss = []

    t = time.time()
    for epoch in range(1, args.epochs + 1):

        tr_loss = train(epoch, train_loader, model, optimizer, args)
        train_loss.append(tr_loss.mean())

        v_loss = evaluate(val_loader, model, args)
        val_loss.append(v_loss)

    train_loss = np.hstack(train_loss)
    val_loss = np.hstack(val_loss)
    #plot_training_curve(train_loss, val_loss)
    results = {
        "train_loss": train_loss.tolist(),
        "val_loss": val_loss.tolist()
    }

    #### Testing

    validation_loss = evaluate(val_loader, model, args)
    test_loss, log_likelihood = evaluate(test_loader,
                                         model,
                                         args,
                                         testing=True)
    results["ELBO"] = test_loss
    results["log_likelihood"] = log_likelihood

    elapsed = time.time() - t
    results["Running time"] = elapsed

    # Save the results.
    json_dir = args.out_dir + f"{args.flow}perm_k_{args.num_flows}_RMSProp_lr{args.learning_rate}_4"
    print("Saving data at: " + json_dir)
    output_folder = pathlib.Path(json_dir)
    output_folder.mkdir(parents=True, exist_ok=True)
    results_json = json.dumps(results, indent=4, sort_keys=True)
    (output_folder / "results.json").write_text(results_json)