Example #1
def fisher():
	X_raw, y_raw, _ = helper.load_dataset(skip_every=11)
	_, _, X_raw = myPCA4(X_raw.shape[0]-15, X_raw, return_coeffs=True)
	print("loaded dataset")
	i_list = range(X_raw.shape[0])
	num_correct = 0
	for i_ind, i in enumerate(i_list):
		print("i = ", i)
		# print("(y_raw.shape) = ", (y_raw.shape))
		X = np.zeros((X_raw.shape[0]-1, X_raw.shape[1]))
		y = np.zeros((y_raw.shape[0]-1,1)).flatten()
		X[:i, :], X[i:, :] = X_raw[:i, :], X_raw[i+1:, :]
		X_meann = np.mean(X, axis=0)
		y[:i], y[i:] = y_raw[:i], y_raw[i+1:]
		classes, counts = np.unique(y, return_counts=True)
		means = []
		Xs = []
		# means = {}
		classes = classes.tolist()
		for c in classes:
			# means[c] = np.mean(X[y==c], axis=0)
			Xs.append(X[y==c])
			means.append(np.expand_dims(np.mean(Xs[-1], axis=0), axis=0))
			Xs[-1] -= means[-1]
		meann = np.expand_dims(np.mean(X, axis=0), axis=0)
		# [print(Xselem.shape) for Xselem in Xs]
		# [print(term.shape) for term in means]
		# print(meann.shape)
		mu_mat = np.concatenate(means, axis=0) - meann
		mean_wts = np.array(counts).reshape((mu_mat.shape[0], 1))
		wtd_mu_mat = mu_mat * mean_wts
		# print("wtd_mu_mat.shape = ", wtd_mu_mat.shape)
		Sb = np.matmul(mu_mat.T, wtd_mu_mat)
		Sw = np.zeros((X.shape[1], X.shape[1]))
		# print("Sw.shape = ", Sw.shape)
		for c_ind, c in enumerate(classes):
			Sw += np.matmul(Xs[c_ind].T, Xs[c_ind])
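		# Fisher's criterion: solve the generalized eigenproblem Sb v = lambda Sw v and keep the leading C-1 discriminant directions.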
		eigvals, eigvecs = SLA.eig(Sb, Sw)
		# eigvals, eigvecs = LA.eig(np.matmul(LA.inv(Sw), Sb))
		# sort by eigenvalue (largest first) before keeping the top C-1 columns
		order = np.argsort(eigvals.real)[::-1]
		eigvecs = eigvecs.real[:, order[:len(classes)-1]]
		eigCoeffs = np.matmul(X - X_meann, eigvecs)
		X_test = X_raw[i, :].reshape((1, X.shape[1])) - X_meann
		eigCoeffs_test = np.matmul(X_test, eigvecs)
		nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto', metric='euclidean').fit(eigCoeffs)
		dists, indices = nbrs.kneighbors(eigCoeffs_test)
		if (y[indices[0,0]] == y_raw[i]):
			num_correct += 1
		# print(eigvals)
		# break
		# eigvals, eigvecs = SLA.eigh(Sb, Sw, eigvals_only=False)
		# print("eigvecs.shape = ", eigvecs.shape)
	return num_correct
Example #2
def eig_face_method(drop_first_n, k_vals):
	X_raw, y_raw, _ = helper.load_dataset(skip_every=11)
	print("loaded dataset")
	ik_mat = np.zeros((X_raw.shape[0], len(k_vals)))
	k_list = np.zeros((1, len(k_vals))).flatten().tolist()
	k_vals = sorted(k_vals, reverse=True)
	# i_list = np.random.permutation(X_raw.shape[0])[:50].tolist()
	i_list = range(X_raw.shape[0])
	if debug:
		debug_mat = np.zeros((len(i_list), 3))
	for i_ind, i in enumerate(i_list):
		print("i = ", i)
		# print("(y_raw.shape) = ", (y_raw.shape))
		X = np.zeros((X_raw.shape[0]-1, X_raw.shape[1]))
		y = np.zeros((y_raw.shape[0]-1,1)).flatten()
		X[:i, :], X[i:, :] = X_raw[:i, :], X_raw[i + 1:, :]
		y[:i], y[i:] = y_raw[:i], y_raw[i+1:]
		# X_meann, eigVecs, eigCoeffs = myPCA2(min(X.shape[0], k_vals[0]+1), X, drop_first_n=drop_first_n)
		X_meann, eigVecs, eigCoeffs = myPCA4(min(X.shape[0], k_vals[0]+1), X, drop_first_n=drop_first_n)
		X = X - X_meann
		X_test = X_raw[i,:].reshape((1, X.shape[1]))-X_meann
		eigCoeffs_test = np.matmul(X_test, eigVecs)
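		# k_vals is sorted in descending order, so zeroing columns k: progressively restricts both coefficient sets to the top-k eigenfaces without refitting PCA.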
		for ind_k, k in enumerate(k_vals):
			# print("k, ind_k, k_vals[0], k-eigCoeffs.shape[1], eigCoeffs.shape=", k, ind_k, k_vals[0]+1, k - eigCoeffs.shape[1], eigCoeffs.shape)
			eigCoeffs_test[:, k:] = 0
			eigCoeffs[:, k:] = 0
			# print("eigCoeffs[0,:]=", eigCoeffs[0,:])
			nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto', metric='euclidean').fit(eigCoeffs)
			dists, indices = nbrs.kneighbors(eigCoeffs_test)
			if debug:
				debug_mat[i_ind,0] = y[indices[0,0]]
				debug_mat[i_ind,1] = y_raw[i]
				debug_mat[i_ind,2] = indices[0,0]
				# print("k = ", k, "y[indices[0,0]] = ", y[indices[0, 0]], "y_raw[i]=", y_raw[i], "indices[0,0]=", indices[0,0], "dist=", dists)
			if y[indices[0, 0]] == y_raw[i]:
				ik_mat[i, ind_k] = 1
				k_list[ind_k] += 1
	if debug:
		print("debug_mat\n", debug_mat)
	k_list.reverse()  # undo the descending sort so the counts line up with the caller's k_vals
	return k_list
Example #3
def correlation_method():
	X, y, _ = helper.load_dataset(skip_every=11)
	print("loaded dataset")
	""" For each face, get its nearest nbr(other than itself) and 
	see if it belongs to the same class
	and repeat this many_times """
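	# n_neighbors=2: column 0 of the result is the point itself (distance 0), so column 1 gives the nearest other face.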
	nbrs = NearestNeighbors(n_neighbors=2, algorithm='auto', metric='euclidean').fit(X)
	raw_distances, raw_indices = nbrs.kneighbors(X)
	dists, indices = raw_distances[:,1], raw_indices[:,1]
	pred_y = y[indices]
	num_correct = np.sum(pred_y == y)
	if debug:
		printr = np.zeros((y.size, 3))
		printr[:,0]=y
		printr[:,1]=pred_y
		printr[:,2]=dists
		print("printr:\n", printr)
	print("num_correct = ", num_correct, "num_total = ", y.size)
	return num_correct*100.0 / y.size, y.size
Example #4
def main():
    features, labels = helper.load_dataset()
    labels = helper.convert_labels(labels, to_numbers=True)

    num_classes = 12
    cat_targets = to_categorical(np.array(labels), num_classes)
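    # One-hot encode the integer labels into num_classes columns.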
    # print(cat_targets.shape)
    # print(cat_targets[:10])

    (x_train, y_train), (x_test, y_test) = helper.split_to_sets(features, cat_targets)
    # print(labels[:10])

    print(x_train.shape, x_test.shape)
    print(y_train.shape, y_test.shape)

    # model = larger_model()
    # model.fit(x_train, y_train,
    #           batch_size=100,
    #           epochs=100,
    #           validation_data=(x_test, y_test),
    #           shuffle=True)
    # scores = model.evaluate(x_test, y_test, verbose=1)
    # print('Test loss:', scores[0])
    # print('Test accuracy:', scores[1])

    model = new_cifar_model()
    model.fit(x_train, y_train,
              batch_size=28,
              epochs=500,
              validation_data=(x_test, y_test),
              shuffle=True)

    test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
    print("Large CNN Error: %.2f%%" % (100-test_acc*100))
    print("test-loss: ", test_loss)

    # #predictions = model.predict(test_images)

    model.save('model_2.h5')
Example #5
def main():
	correlation_accu, num_total = correlation_method()
	# k_vals = [1, 5, 15, 40, 70, 100, 130, 150]
	# k_vals = [1, 60, 130]
	k_vals = [1, 5, 10, 15, 25, 50, 70, 90, 125, 150]
	no_drop_eig_face = [x * 100.0 / num_total for x in eig_face_method(0, k_vals)]
	drop_eig_face = [x * 100.0 / num_total for x in eig_face_method(3, k_vals)]
	linear_accu, _ = linear()
	X_raw, y_raw, _ = helper.load_dataset(skip_every=11)
	fisher_accu = fisher2(X_raw, y_raw) * 100.0 / num_total
	# plt.figure(figsize=(15.0, 9.0))
	fig = plt.figure()
	plt.plot(k_vals, no_drop_eig_face, color='C1', label="Eigenfaces")
	plt.plot(k_vals, drop_eig_face, color='C2', label="Eigenfaces w/o first 3 components")
	plt.axhline(y=fisher_accu, color='C3', label="Fisherfaces")
	plt.axhline(y=linear_accu, color='C4', label="Linear Subspace")
	plt.axhline(y=correlation_accu, color='C5', label="Correlation")
	plt.gca().legend(loc='lower right')  # show legend
	# plt.figure(figsize=(15.0, 9.0))
	fig.savefig("my_plot.png")
	# plt.figure()
	plt.show()
Example #6
dataset = params['dataset']
model = params['model']
seed = params['seed']
n_train = params['n_train']

params.update(helper.get_model_config())
fit_direc = 'Fits/' + dataset + '/' + params['base_model'] + '/'
params['fit_file'] = fit_direc + "fit-" + params[
    'file_base'] + "-train%i-%s.P" % (n_train, model)
result_direc = 'Results/' + dataset + '/' + params['base_model'] + '/'
helper.init_direc(result_direc)
params['result_file'] = result_direc + "pred-" + params[
    'file_base'] + "-train%i-%s.P" % (n_train, model)

### Load dataset, get submodels, and limit n_train appropriately
data = helper.load_dataset(params['test_file'])
all_sms = data['X'].keys()
sms = helper.get_sms(model, all_sms)
params['model'] = model
params['sms'] = sms

if 'rand' in model:
    data['X'].update(helper.set_rand_splits(model, sms, data['X']['all']))
X = data['X']
y = data['Y']
# y_train: used for getting prior class probability
y_train = helper.load_dataset(params['train_file'])['Y'][:n_train]

M = Model(params)
p, M.fit, M.trained = helper.load_fits(params['fit_file'])
Example #7
    'enweiros', 'enweirosd', 'encweiros', 'encweirosd'
]
versions = 20

summary = np.zeros((len(base_clfs), 23))

for cid, bclf in enumerate(base_clfs):
    print(cid, bclf)
    base_clf = base_clfs[bclf]
    csvfile = open('csv/%s.csv' % bclf, 'w')

    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(header)
    for dataset in tqdm(datasets):
        # Load dataset
        X, y, X_, y_ = h.load_dataset(dataset)

        # Prepare place for results
        bacs = [[] for i in range(versions + 3)]

        # Iterate folds
        for fold in range(5):
            # Get division
            X_train, X_test = X_[fold]
            y_train, y_test = y_[fold]

            # Evaluating regular clf
            bacs[0].append(
                m.regular_bac(base_clf, X_train, y_train, X_test, y_test))

            # Evaluating over and undersampling
Example #8
#Get the update rule. Here we will use a more advanced optimization algorithm: ADAM [1]
params = lasagne.layers.get_all_params(net['out'], trainable=True)
updates = lasagne.updates.adam(loss, params)

test_prediction = lasagne.layers.get_output(net['out'], deterministic=True)
test_loss = lasagne.objectives.squared_error(test_prediction, target_var)
test_loss = test_loss.mean()


# Note that train_fn has an "updates" rule: whenever we call this function, it updates the parameters of the model.
train_fn = theano.function([input_var, target_var], [loss, prediction], updates=updates, name='train')
val_fn = theano.function([input_var, target_var], [test_loss, test_prediction], name='validation')
get_preds = theano.function([input_var], test_prediction, name='get_preds')

X_train, y_train, X_test, y_test, X_valid, y_valid = load_dataset()

x_train = X_train.reshape(-1, 3, 64, 64)
x_test = X_test.reshape(-1, 3, 64, 64)
y_train = y_train.reshape(-1, 3, 32, 32)
y_test = y_test.reshape(-1, 3, 32, 32)



import time
epochs = 10  # You can reduce the number of epochs to run it faster (or run it for longer for a better model)
batch_size=128

#Run the training function per mini-batches
n_examples = x_train.shape[0]
n_batches = n_examples // batch_size  # integer number of mini-batches per epoch
Example #9
                                                positive_data, positive_data)
    score_temp_0 = modelMain[temp_class_index].forward(positive_data)
    mainloss_p = mainloss_p + torch.log(
        torch.sigmoid(1 * score_temp_0) + 1e-2).mean()

    loss = -1.0 * mainloss_p + 0.1 * loss_pen
    optimizer[temp_class_index].zero_grad()
    loss.backward()
    optimizer[temp_class_index].step()

    return loss_pen.data, finished_epoch, mainloss_p.data


# ####################################################################################################

OCDataset = helper.load_dataset(opt.dataset)
oc_dataset = OCDataset(opt)
labels = oc_dataset.labels
num_class = oc_dataset.num_classes
modelMain = {}
optimizer = {}
training_iterator = {}
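# Build a separate recognizer and training iterator for each class label.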
for label_index in range(num_class):
    if opt.dataset == 'mnist':
        modelMain[label_index] = Recognizer_mlp(opt)
    elif opt.dataset == 'cifar10':
        modelMain[label_index] = Recognizer_mlp_cifar(opt)
    if cuda:
        modelMain[label_index].cuda()
    training_iterator[label_index] = oc_dataset.get_training_iterator(
        label_index)
Example #10
    g_loss_logger = helper.get_logger(opt.log_port, 'g_loss')
    viz_image_logger = Visdom(port=opt.log_port, env="images")

# Loss function
criteria = torch.nn.BCELoss()

# Initialize generator and discriminator
generator = Generator(opt.batch_size, opt.latent_dim, opt.channels)
discriminator = Discriminator(opt.batch_size, opt.channels)

if cuda:
    generator.cuda()
    discriminator.cuda()
    criteria.cuda()

dataloader = helper.load_dataset(opt.dataset, opt.img_size, opt.batch_size)

# Optimizers
optimizer_G = torch.optim.Adam(generator.parameters(),
                               lr=opt.lr,
                               betas=(opt.b1, opt.b2))
optimizer_D = torch.optim.Adam(discriminator.parameters(),
                               lr=opt.lr,
                               betas=(opt.b1, opt.b2))

Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor

# ----------
#  Training
# ----------
Example #11
# Including the minority class counts
# IR (imbalance ratio)
header = ['idx', 'dbname', 'features', 'samples', 'majority', 'minority', 'ir']
datasets = h.datasets_for_groups(
    ["imb_IRhigherThan9p1", "imb_IRhigherThan9p2"])

csvfile = open('csv/datasets.csv', 'w')
writer = csv.writer(csvfile, delimiter=',')
writer.writerow(header)

for i, dataset in enumerate(datasets):
    dbname = dataset[1].replace("_", "-")
    #print(dbname)

    # Load and analyze dataset
    X, y, _, _ = h.load_dataset(dataset)
    classes, c_counts = np.unique(y, return_counts=True)
    majority_c = 0 if c_counts[0] > c_counts[1] else 1
    minority_c = 1 - majority_c

    n_samples = len(y)
    n_features = X.shape[1]
    n_majority = np.sum(y == majority_c)
    n_minority = np.sum(y == minority_c)
    IR = n_majority / n_minority

    #print("%i samples (%i maj / %i min)" % (
    #    n_samples, n_majority, n_minority
    #))
    #print("IR = %.2f" % IR)
Example #12
def linear():
	X_raw, y_raw, z_raw = helper.load_dataset(skip_every=11) ###3, 5, 6
	classes, counts = np.unique(y_raw, return_counts=True)
	num_classes = classes.shape[0]
	img_size = X_raw.shape[1]
	num_img = X_raw.shape[0]
	bases = np.zeros((num_classes, 3, img_size))
	X_norm = X_raw/np.sqrt(np.sum(X_raw*X_raw, axis=1)).reshape(num_img,1)
	# bases1 = X_raw[z_raw == 3]
	# bases2 = X_raw[z_raw == 5]
	# bases3 = X_raw[z_raw == 6]

	indx = [2, 3, 5]
	rep_indx = [6,4,6]
	# rep_indx = [2,3,5]

	bases[:,0,:] = X_norm[z_raw == indx[0]]
	bases[:,1,:] = X_norm[z_raw == indx[1]]
	bases[:,2,:] = X_norm[z_raw == indx[2]]

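	# Gram-Schmidt: orthonormalise the three images of each class so they span that class's 3-D linear subspace.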
	for i in range(num_classes):
		bases[i,1,:] = bases[i,1,:] - (np.matmul(bases[i,1,:], bases[i,0,:].T))*bases[i,0,:]
		bases[i,1,:] /= np.sqrt(np.sum(bases[i,1,:]*bases[i,1,:]))
		
		bases[i,2,:] = bases[i,2,:] - (np.matmul(bases[i,2,:], bases[i,0,:].T))*bases[i,0,:]
		bases[i,2,:] = bases[i,2,:] - (np.matmul(bases[i,2,:], bases[i,1,:].T))*bases[i,1,:]
		# bases[i,0,:] /= np.sqrt(np.sum(bases[i,0,:]*bases[i,0,:]))
		bases[i,2,:] /= np.sqrt(np.sum(bases[i,2,:]*bases[i,2,:]))
	

	correct = 0
	for i in range(num_img):
		pose = int(z_raw[i])
		# print(pose)
		class1 = int(y_raw[i])
		restore_base = np.zeros((3, img_size))
		restore_base[:,:] = bases[class1,:,:]
		if(pose == indx[0]):
			# Test images taken in the first basis pose are skipped entirely.
			continue
		elif(pose == indx[1]):
			# continue
			bases[class1,1,:] = X_norm[i+(rep_indx[1]-pose)]
			bases[class1,1,:] = bases[class1,1,:] - (np.matmul(bases[class1,1,:], bases[class1,0,:].T))*bases[class1,0,:]
			bases[class1,1,:] /= np.sqrt(np.sum(bases[class1,1,:]*bases[class1,1,:]))
			
			bases[class1,2,:] = X_norm[i+(indx[2]-pose)]
			bases[class1,2,:] = bases[class1,2,:] - (np.matmul(bases[class1,2,:], bases[class1,0,:].T))*bases[class1,0,:]
			bases[class1,2,:] = bases[class1,2,:] - (np.matmul(bases[class1,2,:], bases[class1,1,:].T))*bases[class1,1,:]
			bases[class1,2,:] /= np.sqrt(np.sum(bases[class1,2,:]*bases[class1,2,:]))
			# bases[class1,0,:] /= np.sqrt(np.sum(bases[class1,0,:]*bases[class1,0,:]))
		elif(pose == indx[2]):
			# Test images taken in the third basis pose are likewise skipped.
			continue


		img = X_raw[i]
		min_val = -1
		ans = -1
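		# Project the test image onto every class's 3-D subspace and predict the class with the smallest reconstruction error.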
		for j in range(num_classes):
			img1 = img.T
			obt_img = (np.matmul(bases[j,0,:], img1))*bases[j,0,:] + (np.matmul(bases[j,1,:],img1))*bases[j,1,:] + (np.matmul(bases[j,2,:],img1))*bases[j,2,:]
			diff = obt_img - img
			diff_val = np.sqrt(np.sum(diff*diff))
			if(min_val != -1):
				if(min_val > diff_val):
					min_val = diff_val
					ans = j
			else:
				min_val = diff_val
				ans = j

		if(ans == y_raw[i]):
			correct+=1
		bases[class1,:,:] = restore_base


	print("LinSub: Correct Predictions = ", correct, "num_total = ", y_raw.size)
	return (correct * 100.0 / y_raw.size), y_raw.size
Example #13
def fisher_driver():
	X_raw, y_raw, poses = helper.load_dataset(skip_every=11)
	print("fisher2(face reco:):", fisher2(X_raw, y_raw))
Example #14
#h = T.nnet.bn.batch_normalization(h , model['gamma'], model['beta'], h.mean(0, keepdims=True), h.mean(0, keepdims=True))
h = T.nnet.relu(h)

h = T.nnet.abstract_conv.conv2d_grad_wrt_inputs(h, model['W2_t'], (batch_size, 32, 32, 32), filter_shape=(3,32,filter_size,filter_size), border_mode='half', subsample=(2, 2))
output = T.tanh(h + model['b1_t'].dimshuffle('x', 0, 'x', 'x'))

#%%
loss = T.mean(T.sqr(output - output_))
updates = lasagne.updates.adam(loss, model.values())

train_function = theano.function(inputs=[input_, output_], outputs=[loss, output], updates=updates, name='train_fct')
test_function = theano.function(inputs=[input_, output_], outputs=[loss, output], name='test_fct')

#%%

trainx, trainy, testx, testy, _, _ = load_dataset()
trainx = trainx.reshape((-1, 3, 64, 64)).astype('float32')
trainy = trainy.reshape((-1,3,64, 64)).astype('float32')
testx = testx.reshape((-1, 3, 64, 64)).astype('float32')
testy = testy.reshape((-1, 3, 64, 64)).astype('float32')

#%%
num_batches = trainx.shape[0] // batch_size

for epoch in range(num_epochs):
    for i in range(num_batches):
        batch_x = trainx[i*batch_size : (i+1)*batch_size, :, :, :]
        batch_y = trainy[i*batch_size : (i+1)*batch_size, :, :, :]
        loss_train, predictions = train_function(batch_x, batch_y)
        print(loss_train)
    
Example #15
def main():
    # reading the command line arguments
    parser = argparse.ArgumentParser(
        description='Read in file paths and other parameters.')
    parser.add_argument('--asm_path',
                        help='path to the asm training files.',
                        default="gs://uga-dsp/project1/data/asm/",
                        type=str)
    parser.add_argument('--bytes_path',
                        help='path to the bytes training files.',
                        default="gs://uga-dsp/project1/data/bytes/",
                        type=str)
    parser.add_argument('--train_files',
                        help='path to the file containing the train files.',
                        default="gs://uga-dsp/project1/files/X_train.txt",
                        type=str)
    parser.add_argument('--test_files',
                        help='path to the file containing the test files.',
                        default="gs://uga-dsp/project1/files/X_test.txt",
                        type=str)
    parser.add_argument('--train_labels',
                        help='path to the file containing the train labels.',
                        default="gs://uga-dsp/project1/files/y_train.txt",
                        type=str)
    parser.add_argument('--test_labels',
                        help='path to the file containing the test labels.',
                        default="gs://uga-dsp/project1/files/y_train.txt",
                        type=str)
    parser.add_argument(
        '--outfile',
        help='path to the output file containing labels for final test set.',
        default="gs://p1-models/RF_Large_Predictions.csv",
        type=str)
    parser.add_argument('--model_path',
                        help='path to the folder for saving the final model.',
                        default="gs://models/",
                        type=str)
    parser.add_argument('--n_parts',
                        help='an integer specifying the number of partitions.',
                        default=50,
                        type=int)
    parser.add_argument('--mem_lim',
                        help='a string specifying the memory limit.',
                        default='10G',
                        type=str)
    parser.add_argument(
        '--max_depth',
        help='maximum depth of the tree in Random Forest Classifier.',
        default=7,
        type=int)
    parser.add_argument(
        '--classifier',
        choices=['lr', 'nb', 'rf'],
        help='classifier algorithm to be used for the classification task.',
        default='rf',
        type=str)
    args = parser.parse_args()

    # initializing the variables
    print("Initializing the variables....")
    asm_path = args.asm_path
    bytes_path = args.bytes_path
    train_files = args.train_files
    test_files = args.test_files
    train_labels = args.train_labels
    test_labels = args.test_labels
    outfile = args.outfile
    model_path = args.model_path
    n_parts = args.n_parts
    memory_limit = args.mem_lim
    max_depth = args.max_depth
    classifier = args.classifier

    sc = spark_session_setup(memory_limit=memory_limit)

    # loading the dataset
    print("loading the dataset...")
    train_df, test_df = load_dataset(sc,
                                     asm_path=asm_path,
                                     bytes_path=bytes_path,
                                     X_train=train_files,
                                     y_train=train_labels,
                                     X_test=test_files,
                                     y_test=test_labels,
                                     n_parts=n_parts)

    # building the model
    print("building the model...")
    stages = build_pipeline(classifier=classifier, max_depth=max_depth)
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(train_df)

    # saving the model and writing the predictions into the output file
    if model_path:
        model.save(model_path)
    print("generating the predictions...")
    predictions = model.transform(test_df)
    write_to_file(predictions, outfile)
Example #16
start = time.time()
helper.init_config(argv[1])
params = helper.parse_args(argv)
dataset = params['dataset']
model = params['model']
seed = params['seed']
n_train = params['n_train']

params.update(helper.get_model_config())
fit_direc = 'Fits/' + dataset + '/' + params['base_model'] + '/'
helper.init_direc(fit_direc)
params['fit_file'] = fit_direc + "fit-" + params[
    'file_base'] + "-train%i-%s.P" % (n_train, model)

### Load dataset, get submodels, and limit n_train appropriately
data = helper.load_dataset(params['train_file'])
assert np.shape(data['X']['all'])[0] >= n_train  #Ensure enough data

all_sms = data['X'].keys()
sms = helper.get_sms(model, all_sms)
params['model'] = model
params['sms'] = sms

if 'rand' in model:
    data['X'].update(
        helper.set_rand_splits(model, sms, data['X']['all'][:n_train, :]))
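# Keep only the first n_train rows of each submodel's features; (1, 0) placeholders become empty arrays.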
X = {
    sm: data['X'][sm][:n_train, :]
    if np.shape(data['X'][sm]) != (1, 0) else np.array([])
    for sm in sms
}
Example #17
def exp(dataset_name):
    cfg_path = './cfg/{}'.format(dataset_name)
    pn_cfg = helper.read_cfg_file(os.path.join(cfg_path, 'PN'))
    upu_cfg = helper.read_cfg_file(os.path.join(cfg_path, 'uPU'))
    nnpu_cfg = helper.read_cfg_file(os.path.join(cfg_path, 'nnPU'))
    assert \
        upu_cfg['dataset']['dataset_name'] == \
        nnpu_cfg['dataset']['dataset_name'] and \
        upu_cfg['network']['network_name'] == \
        nnpu_cfg['network']['network_name'] and \
        pn_cfg['dataset']['dataset_name'] == \
        nnpu_cfg['dataset']['dataset_name'] and \
        pn_cfg['network']['network_name'] == \
        nnpu_cfg['network']['network_name']
    exp_name = 'exp_{}_{}_{}'.format(
        nnpu_cfg['dataset']['dataset_name'],
        nnpu_cfg['network']['network_name'],
        helper.get_unique_name()
    )
    log_data = helper.LogData()

    # upu and nnpu.
    PuDataset, PnDataset = helper.load_dataset(upu_cfg)
    pu_dataset = PuDataset(upu_cfg['dataset'])
    training_iterator = pu_dataset.get_training_iterator()

    Network = helper.load_network(upu_cfg)
    upu_trainer = trainer.TrainerBase(upu_cfg['trainer'])
    upu_trainer.setup_network(Network(upu_cfg['network'], pu_dataset.prior))
    nnpu_trainer = trainer.TrainerBase(nnpu_cfg['trainer'])
    nnpu_trainer.setup_network(Network(nnpu_cfg['network'], pu_dataset.prior))

    epoch = 0
    upu_train_accum, nnpu_train_accum = [], []
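    # Train uPU and nnPU on the same stream of mini-batches; at each epoch boundary report mean train loss and 0-1 test error for both.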
    for data in training_iterator:
        upu_train_accum.append(upu_trainer.train(data))
        nnpu_train_accum.append(nnpu_trainer.train(data))
        if training_iterator.epoch_finished > epoch:
            epoch = training_iterator.epoch_finished

            # train losses.
            upu_train_loss = sum(upu_train_accum) / float(len(upu_train_accum))
            nnpu_train_loss = sum(nnpu_train_accum) / float(len(nnpu_train_accum))
            upu_train_accum.clear()
            nnpu_train_accum.clear()

            # test 0-1 losses.
            test_iter = pu_dataset.get_testing_iterator()
            upu_test_loss = upu_trainer.evaluate_error(copy.deepcopy(test_iter))
            nnpu_test_loss = nnpu_trainer.evaluate_error(test_iter)

            print(
                'Epoch: {0:>5}, upu train: {1:7.4f}, upu test: {2:7.4f}, '
                'nnpu train: {3:7.4f}, nnpu test: {4:7.4f}'
                .format(epoch, upu_train_loss, upu_test_loss, nnpu_train_loss,
                        nnpu_test_loss))
            log_data.log_loss('upu train', upu_train_loss)
            log_data.log_loss('nnpu train', nnpu_train_loss)
            log_data.log_loss('upu test', upu_test_loss)
            log_data.log_loss('nnpu test', nnpu_test_loss)

    # pn.
    pn_dataset = PnDataset(pn_cfg['dataset'], pu_dataset.prior)
    pn_trainer = trainer.TrainerBase(pn_cfg['trainer'])
    pn_trainer.setup_network(Network(pn_cfg['network'], pu_dataset.prior))
    pn_training_iterator = pn_dataset.get_training_iterator()
    epoch = 0
    pn_accum = []
    for data in pn_training_iterator:
        pn_accum.append(pn_trainer.train(data))
        if pn_training_iterator.epoch_finished > epoch:
            epoch = pn_training_iterator.epoch_finished

            pn_train_loss = sum(pn_accum) / float(len(pn_accum))
            pn_accum.clear()

            test_set = pn_dataset.get_testing_iterator()
            pn_test_loss = pn_trainer.evaluate_error(test_set)
            log_data.log_loss('pn test', pn_test_loss)
            log_data.log_loss('pn train', pn_train_loss)
            print('Epoch: {0:>5}, pn train: {1:7.4f}, pn test: {2:7.4f}'
                  .format(epoch, pn_train_loss, pn_test_loss))
    helper.save_log_data(log_data, exp_name)
    helper.settle_saved_data(exp_name)
    return exp_name