Example #1
import theano.tensor as T
from theano import function
# dataset, models and loss are project-local modules


def test(theta):
    print('loading data...')
    _, _, dataTe = dataset.load(name='mnist.pkl.gz')

    print('building the graph...')
    # fprop
    x = T.matrix('x', 'float32')
    F = models.create_mlp(x, theta)
    # zero-one loss
    y = T.ivector('y')
    ell = loss.create_zeroone(F, y)
    # compile everything into one callable graph
    f_graph = function(
        inputs=[],
        outputs=ell,
        givens={x: dataTe[0], y: dataTe[1]}
    )

    print('firing the graph...')
    er = f_graph()
    print('error rate = %5.4f' % (er,))
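# Minimal sketch (an assumption, not from the original project) of what
# the loss.create_zeroone helper above could look like: the zero-one
# error is the mean rate of argmax mispredictions.
def create_zeroone(F, y):
    # F: (batch, n_classes) class scores; y: integer vector of true labels
    return T.neq(T.argmax(F, axis=1), y).mean()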
Example #2
import pandas as pd
from keras.layers import Dense, concatenate
from keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# create_mlp, create_cnn and process_house_attributes are project-local helpers


def train_model(struct_list, opt, no_epochs, test_size, data, images):

    split = train_test_split(data, images, test_size=test_size, random_state=42)
    (trainAttrX, testAttrX, trainImagesX, testImagesX) = split
    out_scaler = MinMaxScaler()
    trainY = out_scaler.fit_transform(trainAttrX["price"].values.reshape(-1, 1))
    # scale the test targets with the statistics fitted on the training
    # targets (transform, not fit_transform, to avoid leakage)
    testY = out_scaler.transform(testAttrX["price"].values.reshape(-1, 1))
    trainAttrX, testAttrX, labelizer, cs = process_house_attributes(data, trainAttrX, testAttrX)

    mlp = create_mlp(trainAttrX.shape[1], regress=False)
    cnn = create_cnn(64, 64, 3, struct_list, regress=False)

    combinedInput = concatenate([mlp.output, cnn.output])
    x = Dense(4, activation="relu", name="fc_0")(combinedInput)
    x = Dense(1, activation="linear", name="fc_1")(x)
    model = Model(inputs=[mlp.input, cnn.input], outputs=x)
    # accuracy is not a meaningful metric for regression, so compile with
    # the MAPE loss only
    model.compile(loss="mean_absolute_percentage_error", optimizer=opt)
    history_callback = model.fit([trainAttrX, trainImagesX], trainY,
                                 validation_data=([testAttrX, testImagesX], testY),
                                 epochs=no_epochs, batch_size=25)
    hist_df = pd.DataFrame(history_callback.history)
    return (model, hist_df, labelizer, cs, out_scaler)
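# Hypothetical usage sketch (argument values and the data/images variables
# are assumptions): train the mixed-input model, then inspect its loss history.
from keras.optimizers import Adam

model, hist_df, labelizer, cs, out_scaler = train_model(
    struct_list=[16, 32, 64], opt=Adam(lr=1e-3, decay=1e-3 / 200),
    no_epochs=200, test_size=0.25, data=data, images=images)
print(hist_df[['loss', 'val_loss']].tail())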
Example #3
import numpy as np
from keras.optimizers import Adam
# df, train and test are assumed to be prepared pandas DataFrames;
# datasets and models are project-local modules

# plt.figure()
# plt.scatter(df['area'], df['price'])
# plt.show()

# Scale the output to the range [0, 1]
max_price = train['price'].max()
train_y = train['price'] / max_price
test_y = test['price'] / max_price

# Processing data
(train_x, test_x) = datasets.process_house_attributes(df, train, test)

# Create model
model = models.create_mlp(train_x.shape[1], regress=True)
opt = Adam(lr=0.001, decay=0.001 / 200)
model.compile(loss='mean_absolute_percentage_error', optimizer=opt)

# Train model
model.fit(train_x, train_y, validation_data=(test_x, test_y),
          epochs=200, batch_size=8)

# Predict house prices
preds = model.predict(test_x)

diff = preds.flatten() - test_y
percent_diff = (diff / test_y) * 100
abs_percent_diff = np.abs(percent_diff)

mean = np.mean(abs_percent_diff)
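# Continuation sketch (assumption): report the spread as well, mirroring
# the mean/std summary printed in Example #5 below
std = np.std(abs_percent_diff)
print('mean = {:.2f}%, std = {:.2f}%'.format(mean, std))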
Example #4
# (continuing from a train_test_split over the attribute and image data,
# as in the other examples on this page)
(trainAttrX, testAttrX, trainImagesX, testImagesX) = split

# find the largest house price in the training set and use it to
# scale our house prices to the range [0, 1] (will lead to better
# training and convergence)
maxPrice = trainAttrX["price"].max()
trainY = trainAttrX["price"] / maxPrice
testY = testAttrX["price"] / maxPrice

# process the house attributes data by performing min-max scaling
# on continuous features, one-hot encoding on categorical features,
# and then finally concatenating them together
(trainAttrX, testAttrX) = datasets.process_house_attributes(df, trainAttrX, testAttrX)

# create the MLP and CNN models
mlp = models.create_mlp(trainAttrX.shape[1], regress=False)
cnn = models.create_cnn(64, 64, 3, regress=False)

# create the input to our final set of layers as the *output* of both
# the MLP and CNN
combinedInput = concatenate([mlp.output, cnn.output])

# our final FC layer head will have two dense layers, the final one
# being our regression head
x = Dense(4, activation="relu")(combinedInput)
x = Dense(1, activation="linear")(x)

# our final model will accept categorical/numerical data on the MLP
# input and images on the CNN input, outputting a single value (the
# predicted price of the house)
model = Model(inputs=[mlp.input, cnn.input], outputs=x)
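# Follow-up sketch (assumption): the sibling examples on this page compile
# this two-branch model with a MAPE loss and a decaying Adam optimizer,
# then fit on both input branches at once
from keras.optimizers import Adam

opt = Adam(lr=1e-3, decay=1e-3 / 200)
model.compile(loss="mean_absolute_percentage_error", optimizer=opt)
model.fit([trainAttrX, trainImagesX], trainY,
          validation_data=([testAttrX, testImagesX], testY),
          epochs=200, batch_size=8)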
Example #5
import keras
import numpy as np
import pandas as pd
from keras.layers import Dense, concatenate
from keras.models import Model
from keras.optimizers import Adam
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
# datasets and models are project-local modules


def main():
    print('Training the join cardinality estimator')
    is_train = True
    num_rows, num_columns = 16, 16
    # target = 'join_selectivity'
    target = 'mbr_tests_selectivity'
    datasets_features_path = 'data/spatial_descriptors/spatial_descriptors_small_datasets.csv'
    datasets_histograms_path = 'data/histograms/small_datasets'
    join_results_path = 'data/join_results/join_results_small_datasets_no_bit.csv'
    features_df = datasets.load_datasets_feature(datasets_features_path)
    join_data, ds1_histograms, ds2_histograms, ds_all_histogram, ds_bops_histogram = datasets.load_join_data(
        features_df, join_results_path, datasets_histograms_path, num_rows,
        num_columns)

    train_attributes, test_attributes, ds1_histograms_train, ds1_histograms_test, ds2_histograms_train, ds2_histograms_test, ds_all_histogram_train, ds_all_histogram_test, ds_bops_histogram_train, ds_bops_histogram_test = train_test_split(
        join_data,
        ds1_histograms,
        ds2_histograms,
        ds_all_histogram,
        ds_bops_histogram,
        test_size=0.20,
        random_state=42)

    # train_attributes, val_attributes, ds1_histograms_train, ds1_histograms_val, ds2_histograms_train, ds2_histograms_val, ds_all_histogram_train, ds_all_histogram_val = train_test_split(
    #     train_attributes, ds1_histograms_train, ds2_histograms_train, ds_all_histogram_train, test_size=0.20, random_state=32)

    num_features = len(train_attributes.columns) - 10
    # print (join_data)
    X_train = pd.DataFrame.to_numpy(
        train_attributes[[i for i in range(num_features)]])
    X_test = pd.DataFrame.to_numpy(
        test_attributes[[i for i in range(num_features)]])
    y_train = train_attributes[target]
    y_test = test_attributes[target]
    # y_train = train_attributes['result_size']
    # y_test = test_attributes['result_size']

    mlp = models.create_mlp(X_train.shape[1], regress=False)
    cnn1 = models.create_cnn(num_rows, num_columns, 1, regress=False)
    # cnn2 = models.create_cnn(num_rows, num_columns, 1, regress=False)
    # cnn3 = models.create_cnn(num_rows, num_columns, 1, regress=False)

    # combined_input = concatenate([mlp.output, cnn1.output, cnn2.output, cnn3.output])
    combined_input = concatenate([mlp.output, cnn1.output])

    x = Dense(4, activation="relu")(combined_input)
    x = Dense(1, activation="linear")(x)

    # model = Model(inputs=[mlp.input, cnn1.input, cnn2.input, cnn3.input], outputs=x)
    model = Model(inputs=[mlp.input, cnn1.input], outputs=x)

    EPOCHS = 40
    LR = 1e-2
    # opt = Adam(lr=1e-4, decay=1e-4 / 200)
    opt = Adam(lr=LR, decay=LR / EPOCHS)
    model.compile(loss="mean_absolute_percentage_error", optimizer=opt)

    # print (model.summary())

    # train the model
    if is_train:
        print("[INFO] training model...")
        # model.fit(
        #     [X_train, ds1_histograms_train, ds2_histograms_train], y_train,
        #     validation_data=([X_test, ds1_histograms_test, ds2_histograms_test], y_test),
        #     epochs=EPOCHS, batch_size=128)
        model.fit([X_train, ds_bops_histogram_train],
                  y_train,
                  validation_data=([X_test, ds_bops_histogram_test], y_test),
                  epochs=EPOCHS,
                  batch_size=256)

        model.save('trained_models/model.h5')
        model.save_weights('trained_models/model_weights.h5')
    else:
        model = keras.models.load_model('trained_models/model.h5')
        model.load_weights('trained_models/model_weights.h5')

    print('Test on small datasets')
    y_pred = model.predict([X_test, ds_bops_histogram_test])

    print('r2 score: {}'.format(r2_score(y_test, y_pred)))

    diff = y_pred.flatten() - y_test
    percent_diff = (diff / y_test)
    abs_percent_diff = np.abs(percent_diff)

    # test_attributes['join_selectivity_pred'] = y_pred
    # test_attributes['percent_diff'] = abs_percent_diff
    # test_attributes.to_csv('prediction_small.csv')

    # compute the mean and standard deviation of the absolute percentage
    # difference
    mean = np.mean(abs_percent_diff)
    std = np.std(abs_percent_diff)

    print('mean = {}, std = {}'.format(mean, std))

    print('Test on large datasets')
    datasets_features_path = 'data/spatial_descriptors/spatial_descriptors_large_datasets.csv'
    datasets_histograms_path = 'data/histograms/large_datasets'
    join_results_path = 'data/join_results/join_results_large_datasets_no_bit.csv'
    features_df = datasets.load_datasets_feature(datasets_features_path)
    join_data, ds1_histograms, ds2_histograms, ds_all_histogram, ds_bops_histogram = datasets.load_join_data(
        features_df, join_results_path, datasets_histograms_path, num_rows,
        num_columns)

    X_test = pd.DataFrame.to_numpy(join_data[[i for i in range(num_features)]])
    y_test = join_data[target]

    y_pred = model.predict([X_test, ds_bops_histogram])

    print('r2 score: {}'.format(r2_score(y_test, y_pred)))

    diff = y_pred.flatten() - y_test
    percent_diff = (diff / y_test)
    abs_percent_diff = np.abs(percent_diff)
    mean = np.mean(abs_percent_diff)
    std = np.std(abs_percent_diff)

    print('mean = {}, std = {}'.format(mean, std))
Example #6
import numpy as np
import pandas as pd
# clean_data and create_mlp are project-local helpers; X_train, y_train,
# X_test and y_test are assumed to be prepared from the cleaned frame

# Comment these out once they have been run
df = pd.read_csv("../input/train.csv")
df = clean_data(df)
df.to_csv('../output/cleaned_df.csv', index=False)
# Then uncomment this
#df = pd.read_csv("../output/cleaned_df.csv")

# define params
batch_size = 4096
hidden_units = [150, 150, 150]
dropout_rates = [0.20, 0.20, 0.20, 0.20]
label_smoothing = 1e-2
learning_rate = 3e-3

# build model
clf = create_mlp(
    X_train.shape[1], 5, hidden_units, dropout_rates, label_smoothing, learning_rate
)
# fit model
clf.fit(X_train, y_train, epochs=100, batch_size=batch_size)

models = []
models.append(clf)

# evaluate
test_pred = clf.predict(X_test)
test_pred = np.rint(test_pred)
test_acc = np.sum(test_pred == y_test) / (y_test.shape[0] * 5)
print("test accuracy: " + str(test_acc))

# Random Search CV
### IMPORTANT: memory intensive, will likely raise an out-of-memory error
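# Hedged sketch (an assumption, the original snippet is truncated here) of
# the Random Search CV step the warning above refers to: wrap the builder
# for scikit-learn and sample a few hyper-parameter candidates with 3-fold
# CV. It is memory intensive because every candidate refits the network.
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

def build_clf(units=150, dropout=0.2, lr=3e-3):
    return create_mlp(X_train.shape[1], 5, [units] * 3,
                      [dropout] * 4, label_smoothing, lr)

search = RandomizedSearchCV(
    KerasClassifier(build_fn=build_clf, epochs=20, batch_size=batch_size),
    param_distributions={'units': [100, 150, 200],
                         'dropout': [0.1, 0.2, 0.3],
                         'lr': [1e-3, 3e-3, 1e-2]},
    n_iter=5, cv=3)
# search.fit(X_train, y_train)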
Example #7
import numpy as np
from time import time
from keras.optimizers import Adam
# create_mlp, truco_qlearn and ExperienceReplay are project-local helpers

state_cards = 9
epochs = 10000
# epsilon decays linearly from 0.99 down to 0.025 over the first half of
# training, then stays fixed at 0.025
epsilon = np.linspace(0.025, 0.99, num=epochs // 2)
epsilon = np.flip(epsilon)
epsilon = np.append(epsilon, 0.025 * np.ones(epochs // 2))
num_actions = 3  # [play_left, play_mid, play_right]
max_memory = 8192
batch_size = 64
input_size = state_cards + 2
lr = .0005

# optimizer
opt = Adam(lr=lr, decay=1e-3 / 200)

# model
model = create_mlp(input_size=input_size, layers=[128, 256, 256, 128])
model.compile(optimizer=opt, loss="mse")
model.summary()

# define environment
# num_cards is assumed to be defined elsewhere in the original script
env = truco_qlearn(state_cards=state_cards, num_cards=num_cards)

# initialize experience replay object
exp_replay = ExperienceReplay(max_memory=max_memory,
                              env_dim=input_size,
                              discount=.99)

# training loop
losses = []
ti = time()
for e in range(epochs):
Example #8
# (continuing from an argparse/data-loading setup that provides args and
# the attributes DataFrame df, as in the sibling examples above)
images = datasets.load_house_images(df, args['dataset'])
# scale pixel intensities to [0, 1]
images = images / 255.0

print('[INFO] processing data...')
split = train_test_split(df, images, test_size=0.25, random_state=42)
(train_attr_x, test_attr_x, train_images_x, test_images_x) = split

max_price = train_attr_x['price'].max()
train_y = train_attr_x['price'] / max_price
test_y = test_attr_x['price'] / max_price

(train_attr_x,
 test_attr_x) = datasets.process_house_attributes(df, train_attr_x,
                                                  test_attr_x)

mlp = models.create_mlp(train_attr_x.shape[1], regress=False)
cnn = models.create_cnn(64, 64, 3, regress=False)

combined_input = concatenate([mlp.output, cnn.output])

x = Dense(4, activation='relu')(combined_input)
x = Dense(1, activation='linear')(x)

model = Model(inputs=[mlp.input, cnn.input], outputs=x)

opt = Adam(lr=1e-3, decay=1e-3 / 200)
model.compile(loss='mean_absolute_percentage_error', optimizer=opt)

print('[INFO] training model...')
# the original snippet is truncated here; the continuation below mirrors
# the validation/fit pattern used in Examples #2 and #3 on this page
model.fit([train_attr_x, train_images_x],
          train_y,
          validation_data=([test_attr_x, test_images_x], test_y),
          epochs=200, batch_size=8)