def load_mnist(lazy):
    """
    Load the mnist dataset from an HDF5 file and check that it matches mlpython's version.
    """
    dataset_name = 'mnist'

    start = time.time()
    import mlpython.datasets.store as mlstore
    mldatasets = mlstore.get_classification_problem(dataset_name, load_to_memory=(not lazy))
    print "mlpython version loaded ({0:.2f}sec).".format(time.time() - start)

    start = time.time()
    dataset_name = os.path.join(os.environ['MLPYTHON_DATASET_REPO'], dataset_name + ".h5")
    dataset = mldata.dataset_store.load(dataset_name, lazy=lazy)
    print "mldata version loaded ({0:.2f}sec).".format(time.time() - start)

    print "Comparing first 1000..."
    count = 0
    for (e1, t1), (e2, t2) in itertools.izip(dataset, itertools.chain(*mldatasets)):
        #print t1, t2
        assert_array_almost_equal(e1, e2)
        assert_equal(t1, t2)
        count += 1
        if count >= 1000:
            break
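# Hedged usage sketch (not part of the original test): the comparison above can
# be run for both code paths. This assumes the module-level imports (time, os,
# itertools, the numpy.testing asserts, mldata) are in place and that
# MLPYTHON_DATASET_REPO points at a repository containing mnist.h5. The helper
# name is illustrative.
def check_mnist_both_modes():
    load_mnist(lazy=True)   # compare while streaming from the HDF5 file
    load_mnist(lazy=False)  # compare with everything loaded in memory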
def train():
    sys.argv.pop(0)  # Remove first argument

    # Check that every option expected by the parent script is present.
    if 5 != len(sys.argv):
        print "Usage: python run_stacked_autoencoders_nnet.py lr hidden_size n_epochs n_cdk seed"
        print ""
        print "Ex.: python run_stacked_autoencoders_nnet.py 0.01 50 10 10 1234"
        sys.exit()

    # Build the constructor call from the command-line arguments.
    str_ParamOption = "lr=" + sys.argv[0] + ", " + "hidden_size=" + sys.argv[1] + ", " + "n_epochs=" + sys.argv[2] + ", " + \
                      "CDk=" + sys.argv[3] + ", " + "seed=" + sys.argv[4]
    try:
        objectString = 'myObject = RBM(' + str_ParamOption + ')'
        exec objectString
        #code = compile(objectString, '<string>', 'exec')
        #exec code
    except Exception as inst:
        print "Error while instantiating RBM (required hyper-parameters are probably missing)"
        print inst

    print "Loading dataset..."
    trainset, validset, testset = dataset_store.get_classification_problem('ocr_letters')

    print "Training..."
    myObject.train(trainset)

    # Store the trained weights, biases and hidden size to a file.
    pickle.dump((myObject.W, myObject.b, myObject.hidden_size),
                open("Models/RBM/model%d.pkl" % experiment_number, 'wb'))
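# Hedged alternative sketch (not in the original script): the exec-based
# construction above can be replaced by parsing the arguments into typed values
# and passing them as keyword arguments. The RBM keyword names match the ones
# used in the quick-training script further below; the helper name itself is
# illustrative.
def build_rbm_from_args(args):
    lr, hidden_size, n_epochs, CDk, seed = args[:5]
    return RBM(lr=float(lr),
               hidden_size=int(hidden_size),
               n_epochs=int(n_epochs),
               CDk=int(CDk),
               seed=int(seed))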
def get_representation():
    # Load the trained weights and the corresponding arguments.
    (W, b, hidden_size) = pickle.load(open("Models/RBM/model%d.pkl" % experiment_number, 'rb'))

    # Rebuild the model with the saved hidden size.
    myObject = RBM(hidden_size=hidden_size)

    print "Loading dataset..."
    trainset, validset, testset = dataset_store.get_classification_problem('ocr_letters')

    encoded_trainset = []
    encoded_validset = []
    encoded_testset = []

    print "Initializing..."
    myObject.initialize(W, b)

    print "Encoding the trainset..."
    counter = 0  # Inelegant, I know! I use this to only use the first 1000 values.
    for input, target in trainset:
        # Encode the sample.
        h = myObject.encode(input)
        encoded_trainset.append(h)
        # counter += 1
        # if counter == 1000:
        #     break

    # Save the encoded trainset to a file.
    filename = "Models/RBM/trainset%d.pkl" % (experiment_number)
    pickle.dump(np.asarray(encoded_trainset), open(filename, 'wb'))

    counter = 0
    print "Encoding the validset..."
    for input, target in validset:
        # Encode the sample.
        h = myObject.encode(input)
        encoded_validset.append(h)
        # counter += 1
        # if counter == 1000:
        #     break

    filename = "Models/RBM/validset%d.pkl" % (experiment_number)
    pickle.dump(np.asarray(encoded_validset), open(filename, 'wb'))

    # Note: only need to do it for the best hyper-params at the end.
    print "Encoding the testset..."
    for input, target in testset:
        # Encode the sample.
        h = myObject.encode(input)
        encoded_testset.append(h)

    filename = "Models/RBM/testset%d.pkl" % (experiment_number)
    pickle.dump(np.asarray(encoded_testset), open(filename, 'wb'))
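# Hedged sketch (not in the original script): the encoded splits written above
# can be reloaded later by following the same file-name convention; the helper
# name is illustrative.
def load_encoded_rbm_sets(experiment_number):
    encoded = []
    for name in ('trainset', 'validset', 'testset'):
        filename = "Models/RBM/%s%d.pkl" % (name, experiment_number)
        encoded.append(pickle.load(open(filename, 'rb')))
    # Returns [encoded_trainset, encoded_validset, encoded_testset] as arrays.
    return encoded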
def sklearn_convex(classifier, algorithm, max_evals=100, seed=1,
                   filename='none', preproc=[], loss=None):
    global suppress_output

    if suppress_output:
        dump_file = None
    else:
        dump_file = filename + '.dump'

    estim = hyperopt_estimator(classifier=classifier,
                               algo=algorithm,
                               preprocessing=preproc,
                               max_evals=max_evals,
                               trial_timeout=240,
                               fit_increment_dump_filename=dump_file,
                               loss_fn=loss)

    filename = filename + '.out'

    dataset_store.download('convex')
    trainset, validset, testset = dataset_store.get_classification_problem('convex')

    X_train = trainset.data.mem_data[0]
    y_train = trainset.data.mem_data[1]
    X_valid = validset.data.mem_data[0]
    y_valid = validset.data.mem_data[1]
    X_test = testset.data.mem_data[0]
    y_test = testset.data.mem_data[1]

    X_fulltrain = np.concatenate((X_train, X_valid))
    y_fulltrain = np.concatenate((y_train, y_valid))

    print(y_train.shape)
    print(y_valid.shape)
    print(y_test.shape)

    #find_model(X_train, y_train, X_test, y_test, estim, filename)
    find_model(X_fulltrain, y_fulltrain, X_test, y_test, estim, filename)
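# Hedged usage sketch (not in the original file): one way to drive the search
# above is with hyperopt-sklearn's generic classifier space and the TPE
# algorithm. It assumes hpsklearn and hyperopt are installed and importable;
# the 'convex_tpe' file-name prefix and the helper name are illustrative.
def run_convex_tpe_search():
    from hpsklearn import any_classifier
    from hyperopt import tpe
    sklearn_convex(classifier=any_classifier('clf'),
                   algorithm=tpe.suggest,
                   max_evals=100,
                   filename='convex_tpe')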
def convex():
    dataset_store.download('convex')
    trainset, validset, testset = dataset_store.get_classification_problem('convex')

    X_train = trainset.data.mem_data[0]
    y_train = trainset.data.mem_data[1]
    X_valid = validset.data.mem_data[0]
    y_valid = validset.data.mem_data[1]
    X_test = testset.data.mem_data[0]
    y_test = testset.data.mem_data[1]

    X_fulltrain = np.concatenate((X_train, X_valid))
    y_fulltrain = np.concatenate((y_train, y_valid))

    # Fit PCA on the training data only, then apply the same projection to the test set.
    pca = PCA()
    X_train_pca = pca.fit_transform(X_fulltrain)
    X_test_pca = pca.transform(X_test)

    clfs = [MultinomialNB(), SVC(), KNeighborsClassifier(), SGDClassifier()]
    pca_clfs = [SVC(), KNeighborsClassifier(), SGDClassifier()]

    print("Convex\n")
    with open("convex_baselines.txt", 'w') as f:
        for clf in clfs:
            clf.fit(X_fulltrain, y_fulltrain)
            pred = clf.predict(X_test)
            score = metrics.f1_score(y_test, pred)
            print("Classifier: %s\nScore: %f\n" % (clf, score))
            f.write("Classifier: %s\nScore: %f\n\n" % (clf, score))

        for clf in pca_clfs:
            clf.fit(X_train_pca, y_fulltrain)
            pred = clf.predict(X_test_pca)
            score = metrics.f1_score(y_test, pred)
            print("Classifier: PCA + %s\nScore: %f\n" % (clf, score))
            f.write("Classifier: PCA + %s\nScore: %f\n\n" % (clf, score))
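# Hedged refactoring sketch (not in the original file): both loops above repeat
# the same fit/predict/score/report steps; a small helper keeps that logic in
# one place. The helper name and the 'label' argument are illustrative.
def evaluate_clf(clf, X_tr, y_tr, X_te, y_te, f, label=""):
    clf.fit(X_tr, y_tr)
    pred = clf.predict(X_te)
    score = metrics.f1_score(y_te, pred)
    print("Classifier: %s%s\nScore: %f\n" % (label, clf, score))
    f.write("Classifier: %s%s\nScore: %f\n\n" % (label, clf, score))
    return score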
def get_dictionary():
    """
    Train the sparse coding model and save the dictionary and parameters to a file.
    """
    sys.argv.pop(0)  # Remove first argument

    # Check that every option expected by the parent script is present.
    if 5 != len(sys.argv):
        print "Usage: python run_sparse_code.py lr size L1 n_epochs seed"
        print ""
        print "Ex.: python run_sparse_code.py 0.1 20 0.1 5 1234"
        sys.exit()

    # Build the constructor call from the command-line arguments.
    str_ParamOption = "lr=" + sys.argv[0] + ", " + "size=" + sys.argv[1] + ", " + "L1=" + sys.argv[2] + ", " + \
                      "n_epochs=" + sys.argv[3] + ", " + "seed=" + sys.argv[4]
    str_ParamOptionValue = sys.argv[0] + "\t" + sys.argv[1] + "\t" + sys.argv[2] + "\t" + sys.argv[3] + "\t" + sys.argv[4]
    try:
        objectString = 'myObject = SparseCode(' + str_ParamOption + ')'
        exec objectString
        #code = compile(objectString, '<string>', 'exec')
        #exec code
    except Exception as inst:
        print "Error while instantiating SparseCode (required hyper-parameters are probably missing)"
        print inst

    print "Loading dataset..."
    trainset, validset, testset = dataset_store.get_classification_problem('ocr_letters')

    print "Training..."
    myObject.train(trainset)

    # Store the trained dictionary and the parameters to a file.
    pickle.dump((myObject.dictionary, myObject.lr, myObject.hidden_size, myObject.L1),
                open("Models/SC/dictionary%d.pkl" % experiment_number, 'wb'))

    myObject.show_filters()
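# Hedged sketch (not in the original script): str_ParamOptionValue above is a
# tab-separated record of the hyper-parameters; one way to keep track of runs
# is to append it to a shared log file, locking the file so parallel
# experiments do not interleave their writes. The 'results_sc.txt' file name
# and the helper name are illustrative.
def log_hyperparams(str_ParamOptionValue, filename="Models/SC/results_sc.txt"):
    import fcntl
    f = open(filename, 'a')
    fcntl.flock(f, fcntl.LOCK_EX)  # guard against concurrent experiment runs
    f.write(str_ParamOptionValue + "\n")
    fcntl.flock(f, fcntl.LOCK_UN)
    f.close()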
import os
import itertools
import numpy as np
import fcntl
import copy
from string import Template

import mlpython.datasets.store as dataset_store
import mlpython.mlproblems.generic as mlpb
from rbm import RBM
#from autoencoder import Autoencoder

print "Loading dataset..."
trainset, validset, testset = dataset_store.get_classification_problem('ocr_letters')

print "Train RBM for 10 iterations... (this might take a few minutes)"
rbm = RBM(n_epochs=10,
          hidden_size=200,
          lr=0.01,
          CDk=1,
          seed=1234)
rbm.train(mlpb.SubsetFieldsProblem(trainset))
rbm.show_filters()
# Build the constructor call from the command-line arguments.
str_ParamOption = "lr=" + sys.argv[0] + ", " + "dc=" + sys.argv[1] + ", " + "sizes=" + sys.argv[2] + ", " + "L2=" + \
                  sys.argv[3] + ", " + "L1=" + sys.argv[4] + ", " + "seed=" + sys.argv[5] + ", " + "tanh=" + sys.argv[6]
str_ParamOptionValue = sys.argv[0] + "\t" + sys.argv[1] + "\t" + sys.argv[2] + "\t" + sys.argv[3] + "\t" + \
                       sys.argv[4] + "\t" + sys.argv[5] + "\t" + sys.argv[6]
try:
    objectString = 'myObject = NeuralNetwork(n_epochs=1,' + str_ParamOption + ')'
    exec objectString
    # code = compile(objectString, '<string>', 'exec')
    # exec code
except Exception as inst:
    print "Error while instantiating NeuralNetwork (required hyper-parameters are probably missing)"
    print inst

print "Loading dataset..."
trainset, validset, testset = dataset_store.get_classification_problem('ocr_letters')

print "Training..."

# Early stopping with a look-ahead of 5 epochs.
best_val_error = np.inf
best_it = 0
str_header = 'best_it\t'
look_ahead = 5
n_incr_error = 0
for stage in range(1, 500 + 1, 1):
    if not n_incr_error < look_ahead:
        break
    myObject.n_epochs = stage
    myObject.train(trainset)
    n_incr_error += 1
    outputs, costs = myObject.test(trainset)
    errors = np.mean(costs, axis=0)
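    # Hedged continuation sketch (not in the original fragment): a look-ahead
    # early-stopping loop would typically also evaluate on validset and reset
    # the counter when the validation error improves. Which entry of `errors`
    # holds the classification error is an assumption about the cost layout
    # returned by NeuralNetwork.test.
    outputs, costs = myObject.test(validset)
    errors = np.mean(costs, axis=0)
    if errors[0] < best_val_error:
        best_val_error = errors[0]
        best_it = stage
        n_incr_error = 0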
def get_representation():
    """
    Load the dictionary, convert the datasets to a sparse representation and save them to files.
    """
    # Load the dictionary and the corresponding arguments.
    (dictionary, lr, hidden_size, L1) = pickle.load(open("Models/SC/dictionary%d.pkl" % experiment_number, 'rb'))

    # Rebuild the model with the saved hyper-parameters.
    myObject = SparseCode(lr, hidden_size, L1)

    print "Loading dataset..."
    trainset, validset, testset = dataset_store.get_classification_problem('ocr_letters')

    encoded_trainset = []
    trainset_out = []
    encoded_validset = []
    validset_out = []
    encoded_testset = []
    testset_out = []

    print "Initializing..."
    myObject.initialize_dictionary(dictionary)

    print "Encoding the trainset..."
    #counter = 0  # Inelegant, I know! I use this to only use the first 1000 values.
    for input, target in trainset:
        # Run ISTA to infer the sparse code.
        h = myObject.infer(input)
        encoded_trainset.append(h)
        trainset_out.append(target)
        # counter += 1
        # if counter == 1000:
        #     break

    # Save the encoded trainset and its targets to files.
    filename = "Models/SC/trainset%d.pkl" % (experiment_number)
    pickle.dump(np.asarray(encoded_trainset), open(filename, 'wb'))
    filename = "Models/train_outputs.pkl"
    pickle.dump(np.asarray(trainset_out), open(filename, 'wb'))

    #counter = 0
    print "Encoding the validset..."
    for input, target in validset:
        # Run ISTA to infer the sparse code.
        h = myObject.infer(input)
        encoded_validset.append(h)
        validset_out.append(target)
        # counter += 1
        # if counter == 1000:
        #     break

    filename = "Models/SC/validset%d.pkl" % (experiment_number)
    pickle.dump(np.asarray(encoded_validset), open(filename, 'wb'))
    filename = "Models/valid_outputs.pkl"
    pickle.dump(np.asarray(validset_out), open(filename, 'wb'))

    # Note: only need to do it for the best hyper-params at the end.
    print "Encoding the testset..."
    for input, target in testset:
        # Run ISTA to infer the sparse code.
        h = myObject.infer(input)
        encoded_testset.append(h)
        testset_out.append(target)

    filename = "Models/SC/testset%d.pkl" % (experiment_number)
    pickle.dump(np.asarray(encoded_testset), open(filename, 'wb'))
    filename = "Models/test_outputs.pkl"
    pickle.dump(np.asarray(testset_out), open(filename, 'wb'))