def wrapper_kaggle(epochs=40, validate_ratio=0.1, save_prob=True):
    """Train a dropout network on the Kaggle prediction-score features.

    The training data is divided once into a stratified train/validation
    split, the network is trained through ``train_dropout_net`` and the
    result is persisted: it is either handed to ``save_csv`` or, when
    ``save_prob`` is true, pickled as per-class probability features.

    Args:
        epochs: Passed to ``train_dropout_net`` as ``n_epochs``.
        validate_ratio: Passed to ``StratifiedShuffleSplit`` as ``test_size``.
        save_prob: Forwarded to ``train_dropout_net``. When true, its return
            value is unpacked as ``(train_prob, validate_prob, test_prob)``
            and pickled; otherwise the return value goes to ``save_csv``.

    Returns:
        None. Results are written to disk.
    """
    train_x_1, test_x_1 = read_all_predict_score()
    # The aggregated-vector features are loaded but currently only supply the
    # labels (train_y from the second call wins); the feature concatenation
    # below is disabled.
    train_x_2, train_y, test_x_2 = read_aggregated_vectors()
    train_x_3, train_y, test_x_3 = read_aggregated_vectors(google=False)

    old_train_x = train_x_1  # np.concatenate((train_x_1, train_x_2, train_x_3), axis=1)
    test_x = test_x_1  # np.concatenate((test_x_1, test_x_2, test_x_3), axis=1)
    old_train_y = np.asarray(train_y)

    # Single stratified train/validation split. Taking the first (and only)
    # element explicitly avoids relying on loop variables leaking out of a
    # ``for`` statement.
    splitter = StratifiedShuffleSplit(y=old_train_y, n_iter=1, test_size=validate_ratio)
    train_index, test_index = next(iter(splitter))
    train_x = old_train_x[train_index]
    validate_x = old_train_x[test_index]
    train_y = old_train_y[train_index]
    validate_y = old_train_y[test_index]

    # Dataset info. The number of outputs is counted over *all* labels so a
    # class that happens to be absent from the validation split is not lost.
    dim = train_x[0].shape[0]
    n_out = len(np.unique(old_train_y))
    datasets = (train_x, train_y, validate_x, validate_y, test_x)

    n_layers = 1

    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print("input dimension is %d, output dimension is %d" % (dim, n_out))

    return_val = train_dropout_net(
        datasets=datasets,
        use_bias=True,
        n_epochs=epochs,
        dim=dim,
        lr_rate=0.02,
        n_out=n_out,
        dropout=True,
        dropout_rates=[0.7],
        n_hidden=[100],
        activations=[tanh] * n_layers,
        batch_size=50,
        update_rule='adagrad',
        no_test_y=True,
        save_prob=save_prob
    )

    if not save_prob:
        save_csv(return_val)
        return

    train_prob, validate_prob, test_prob = return_val
    # Scatter the split probabilities back into the original row order of the
    # full training set.
    saved_train_prob = np.zeros((old_train_x.shape[0], n_out))
    saved_train_prob[train_index] = train_prob
    saved_train_prob[test_index] = validate_prob
    save_path = "D:/data/nlpdata/pickled_data/" + SST_KAGGLE + "_prob.pkl"
    print("saving probability feature to %s" % save_path)

    # Context manager guarantees the file is closed even if pickling fails.
    with open(Path(save_path), "wb") as f:
        pkl.dump((saved_train_prob, test_prob), f, -1)
# Example #2
# 0
def wrapper_kaggle(valid_portion=0.1):
    """Train the conv net on the prediction-score features and write a CSV.

    Loads the score features and labels, holds out a stratified validation
    portion, trains through ``train_lecun_net`` and writes the returned
    predictions as ``PhraseId,Sentiment`` rows to a fixed CSV path.

    Args:
        valid_portion: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        None. The predictions are written to disk.
    """
    import csv

    all_x, test_x = read_all_predict_score()
    _, raw_labels, _ = read_sst_kaggle_pickle()
    labels = np.asarray(raw_labels)

    # Features are passed on as loaded; no reshape to (n, 18, 5) is applied
    # here even though img_size below is (18, 5).
    fit_x, valid_x, fit_y, valid_y = train_test_split(
        all_x, labels, test_size=valid_portion, stratify=labels
    )

    sample_shape = fit_x[0].shape
    # Same output as the two-item print statement, valid on Python 2 and 3.
    print("%s %s" % ("input dimension is", sample_shape))

    # Number of classes is taken from the labels present in the validation split.
    class_count = len(np.unique(valid_y))

    best_prediction = train_lecun_net(
        # data
        datasets=(fit_x, fit_y, valid_x, valid_y, test_x),
        no_test_y=True,
        # architecture
        img_size=(18, 5),
        filter_size=(7, 2),
        pool_size=(2, 1),
        nkerns=10,
        n_hidden=500,
        n_out=class_count,
        activation=leaky_relu,
        dropout_rate=0.5,
        # NOTE(review): spelled ``user_bias`` here; possibly meant
        # ``use_bias`` -- confirm against train_lecun_net's signature.
        user_bias=True,
        # optimisation
        n_epochs=10,
        lr_rate=0.05,
        batch_size=100,
        update_rule="adagrad",
    )

    out_path = Path("C:/Users/Song/Course/571/hw3/kaggle_result.csv")
    # Binary mode, as the Python 2 csv module expects.
    with open(out_path, "wb") as handle:
        out = csv.writer(handle, delimiter=",")
        out.writerow(["PhraseId", "Sentiment"])
        # Ids are the fixed range 156061..222352; zip stops at the shorter
        # of the id range and the prediction sequence.
        out.writerows(zip(np.arange(156061, 222353), best_prediction))