def run(is_test=False):
    count = []
    word2idx = {}
    Config.is_test = is_test
    if not os.path.exists(Config.checkpoint_dir):
        os.makedirs(Config.checkpoint_dir)
    if not os.path.exists(Config.vector_dir):
        os.makedirs(Config.vector_dir)
    train_data = read_data('%s/%s.train.txt' % (Config.data_dir, Config.data_name), count, word2idx)
    valid_data = read_data('%s/%s.valid.txt' % (Config.data_dir, Config.data_name), count, word2idx)
    test_data = read_data('%s/%s.test.txt' % (Config.data_dir, Config.data_name), count, word2idx)
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    save_obj('%s/idx2word.pkl' % Config.vector_dir, idx2word)
    save_obj('%s/word2idx.pkl' % Config.vector_dir, word2idx)
    Config.nwords = len(word2idx)
    tf.reset_default_graph()
    with tf.Session() as sess:
        model = MemN2N(Config, sess, True)
        model.build_model()
        if Config.is_test:
            model.run(valid_data, test_data)
        else:
            model.run(train_data, valid_data)
        tf.summary.FileWriter("./logs", graph=tf.get_default_graph())
def get_embeddings_tokenizer(filename1, filename2, EMBEDDING_DIM):
    """Get the embeddings and the tokenizer for words."""
    data = read_data(filename1, clean=False)[1:] + read_data(filename2, clean=False)[1:]
    texts = []
    for d in data:
        raw = d.split()
        texts.append(raw[0])
    word_tokenizer = Tokenizer()
    word_tokenizer.fit_on_texts(texts)
    word_tokenizer.word_index['<<SPAD>>'] = len(word_tokenizer.word_index) + 1
    word_visit = [0 for i in range(len(word_tokenizer.word_index) + 1)]
    embedding_matrix = np.random.random((len(word_tokenizer.word_index) + 1, EMBEDDING_DIM))
    embedding_matrix = embedding_matrix.astype(np.float64)
    for d in data:
        raw = d.split()
        label = raw[0]
        word_index = word_tokenizer.word_index[label]
        if word_visit[word_index] == 0:
            vector = [float(x) for x in raw[1:EMBEDDING_DIM + 1]]
            word_visit[word_index] = 1
        else:
            vector = (embedding_matrix[word_index] + np.array([float(x) for x in raw[1:EMBEDDING_DIM + 1]])) / 2
        embedding_matrix[word_index] = vector
    return embedding_matrix, word_tokenizer
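# A hypothetical usage sketch, not from the original source: the matrix
# returned above is shaped (vocab_size + 1, EMBEDDING_DIM), which is the
# layout a frozen Keras Embedding layer expects. The file names below are
# placeholders.
from keras.layers import Embedding

embedding_matrix, word_tokenizer = get_embeddings_tokenizer(
    "vectors_a.txt", "vectors_b.txt", EMBEDDING_DIM=300)
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=300,
                            weights=[embedding_matrix],  # seed with pre-trained vectors
                            trainable=False)             # keep the vectors fixed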
def main():
    trainData = dh.read_data('./movie-data/ratings-train.csv')
    testData = dh.read_data('./movie-data/ratings-test.csv')
    movFeat = dh.read_data('./movie-data/movie-features.csv')
    start = time.time()
    q3a(trainData, testData)
    q3b(movFeat, trainData, testData)
    q3c(movFeat, trainData, testData)
    q3d(movFeat, trainData, testData)
    print "Overall time: ", time.time() - start
def q4b():
    print "\n------------------------------------------Question B------------------------------------------"
    trainMatrix = dh.read_data('./data/zip.train')
    testMatrix = dh.read_data('./data/zip.test')
    graph_setup("k", "Error", "PCA degree vs Error")
    PCA = 1
    gammaMin = 0.000
    gammaMax = 0.015
    gammaNum = 10
    cMin = 0.0
    cMax = 25.0
    cNum = 25
    PCAmin = 0
    PCAmax = 100
    PCAnum = 100
    rbf, trainSVMY, testSVMY, trainXIn, testXIn, trainY, testY, testError, trainError, cvError, gamma, C, k, runTime, valErrors = ln.margin_svm(
        trainMatrix, testMatrix, PCA=PCA, matrixList1=[2], matrixList2=[8],
        gammaMin=gammaMin, gammaMax=gammaMax, gNum=gammaNum,
        cMin=cMin, cMax=cMax, cNum=cNum,
        PCAmin=PCAmin, PCAmax=PCAmax, PCAnum=PCAnum)
    for tmpk, gammaDict in valErrors.items():
        tmpC, tmpgamma = min(gammaDict, key=gammaDict.get)
        print "K: ", tmpk, "\tC: ", tmpC, "\tGamma: ", tmpgamma, \
            "\tcvError: ", gammaDict[(tmpC, tmpgamma)], \
            "\ttrainError: ", trainError[tmpk], "\ttestError: ", testError[tmpk]
    print "Optimal Setup:\tGamma: ", gamma, "\tRuntime: ", runTime, "\tC: ", C, "\tFeatures: ", k
    print "trainError: ", trainError[k], "\ttestError: ", testError[k], "\tcvError: ", cvError[k]
    PCA_graph_add(trainError, "Train Error", 'blue')
    PCA_graph_add(testError, "Test Error", 'green')
    PCA_graph_add(cvError, "CV Error", 'red')
    plt.legend()
    plt.savefig("q3b.eps", format='eps', dpi=1000)
def run_all2(self):
    for i in range(9):
        print("epoch " + str(i + 1) + " out of 9")
        prices = data_handler.read_data("./Data/lob_datatrial000" + str(i + 1) + ".csv", "MIC")
        X, y = data_handler.split_data(prices, self.steps)
        self.train(X, y, 100, 0)
    time = data_handler.read_data("./Data/lob_data.csv", "TIME")
    prices = data_handler.read_data("./Data/lob_data.csv", "MIC")
    X, y = data_handler.split_data(prices, self.steps)
    self.test(X, y, verbose=1)
def main():
    print "------------- MEMM based POS-TAGGER -------------------- "
    tagger = MEMMTagger()
    trainset = read_data("oct27.train")
    devset = read_data("oct27.dev")
    testset = read_data("oct27.test")
    tagger.train(trainset)
    print '----------- Dev Set Results ----------------- '
    tagger.test(devset)
    print '----------- Test Set Results ---------------- '
    tagger.test(testset)
def main():
    print "----------- Structured Perceptron POS-TAGGER ---------- "
    tagger = StructuredPerceptronTagger()
    trainset = read_data("oct27.train")
    devset = read_data("oct27.dev")
    testset = read_data("oct27.test")
    tagger.train(trainset)
    print '----------- Dev Set Results ----------------- '
    tagger.test(devset)
    print '----------- Test Set Results ---------------- '
    tagger.test(testset)
def run_all(self):
    time = data_handler.read_data("./Data/lob_data.csv", "TIME")
    prices = data_handler.read_data("./Data/lob_data.csv", "MIC")
    X, y = data_handler.split_data(prices, self.steps)
    split_ratio = [9, 1]
    train_X, test_X = data_handler.split_train_test_data(X, split_ratio)
    train_X = train_X.reshape((-1, self.steps, 1))
    test_X = test_X.reshape((-1, self.steps, 1))
    train_y, test_y = data_handler.split_train_test_data(y, split_ratio)
    self.train(train_X, train_y, 200, verbose=1)
    self.test(test_X, test_y, verbose=1)
    self.save()
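# A minimal sketch of the windowing that data_handler.split_data is used for
# above. This is an assumption inferred from the reshape to (-1, steps, 1),
# not the project's actual helper: each sample is `steps` consecutive prices
# and the target is the price that follows the window.
import numpy as np

def split_data_sketch(prices, steps):
    X, y = [], []
    for i in range(len(prices) - steps):
        X.append(prices[i:i + steps])  # window of `steps` past prices
        y.append(prices[i + steps])    # next price is the regression target
    return np.array(X), np.array(y)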
def q4a():
    # extract data
    print "------------------------------------------Question A------------------------------------------"
    trainMatrix = dh.read_data('./data/zip.train')
    testMatrix = dh.read_data('./data/zip.test')
    PCA = 0
    gammaMin = 0.0
    gammaMax = 0.05
    gammaNum = 500
    cMin = 0.0
    cMax = 0.1
    cNum = 10
    PCAmin = 0
    PCAmax = 100
    PCAnum = 100
    rbf, trainSVMY, testSVMY, trainXIn, testXIn, trainY, testY, testError, trainError, cvError, gamma, C, k, runTime, valErrors = ln.margin_svm(
        trainMatrix, testMatrix, PCA=PCA, matrixList1=[2], matrixList2=[8],
        gammaMin=gammaMin, gammaMax=gammaMax, gNum=gammaNum,
        cMin=cMin, cMax=cMax, cNum=cNum,
        PCAmin=PCAmin, PCAmax=PCAmax, PCAnum=PCAnum)
    for tmpk, gammaDict in valErrors.items():
        tmpC, tmpgamma = min(gammaDict, key=gammaDict.get)
        print "K: ", tmpk, "\tC: ", tmpC, "\tGamma: ", tmpgamma, \
            "\tcvError: ", gammaDict[(tmpC, tmpgamma)], \
            "\ttrainError: ", trainError[tmpk], "\ttestError: ", testError[tmpk]
    print "Optimal Setup:\tGamma: ", gamma, "\tRuntime: ", runTime, "\tC: ", C, "\tFeatures: ", k
    print "trainError: ", trainError[k], "\ttestError: ", testError[k], "\tcvError: ", cvError[k]
    graph_setup("Gamma", "Error", "RBF Kernel SVM")
    graph_add(valErrors, "Train Error", 'red')
    plt.legend()
    plt.savefig("q3a.eps", format='eps', dpi=1000)
def get_features_importance():
    seed(47)
    np.random.seed(47)
    datasets = read_data('multiclass')
    dic = {}
    l = []
    for atype in datasets.keys():
        print(atype)
        dic[atype] = {}
        df = datasets[atype]
        X, y = df[df.columns[:-1]], df[df.columns[-1]]
        sss = StratifiedShuffleSplit(n_splits=1, test_size=.2)
        for l in [learners[1]]:
            for train_index, test_index in sss.split(X, y):
                train_df = df.iloc[train_index]
                test_df = df.iloc[test_index]
                tuner = get_tuner(l)
                default_config = tuner.default_config
                clf = tuner.get_clf(default_config)
                x_train, y_train = train_df[train_df.columns[:-1]], train_df[train_df.columns[-1]]
                x_test, y_test = test_df[test_df.columns[:-1]], test_df[test_df.columns[-1]]
                clf.fit(x_train, y_train)
                prediction = clf.predict(x_test)
                cm = confusion_matrix(y_test, prediction)
                key_feats_indices = np.argsort(clf.feature_importances_)[::-1][:5]
                for index in key_feats_indices:
                    print("%s: %s" % (df.columns[index], clf.feature_importances_[index]), end="; ")
                import pdb
                pdb.set_trace()
                print()
def cross_validation():
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.2)
    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]
    A: np.ndarray = fill_averages(df_train_data)
    U, Vh = perform_svd(A)
    min_k = 2
    max_k = 50
    print("Starting cross validation")
    ks = []
    errs = []
    # Winning K = 10
    for k in range(min_k, max_k + 1):
        prediction_matrix = make_predictions(k, U, Vh)
        err = calc_rmse(df_test_data, prediction_matrix)
        print("K = {0}, RMSE = {1}".format(k, err))
        ks.append(k)
        errs.append(err)
    plt.plot(ks, errs)
    plt.show()
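# A hypothetical sketch of the rank-k step that make_predictions is assumed
# to perform above. Since perform_svd returns only U and Vh, the singular
# values are presumably already folded into one of the factors; that is an
# assumption, not the original code.
def make_predictions_sketch(k, U, Vh):
    # Keep the k leading singular directions and rebuild the ratings matrix.
    return U[:, :k] @ Vh[:k, :]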
def read_layered_subword(filename):
    """Read data as subwords."""
    text_data = read_data(filename)
    text_layered = break_in_subword(text_data)
    return text_layered
def run():
    logging.config.fileConfig("logging_config.ini")
    print("Processing data")
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.1)
    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]
    train_samples: np.ndarray = dh.df_as_array(df_train_data)
    test_samples: np.ndarray = dh.df_as_array(df_test_data)
    mean_predictions = calculate_all_means(df_train_data)
    # initialize variables needed for training
    k = 100
    bu = np.zeros(paths.num_users)
    bm = np.zeros(paths.num_movies)
    user_features = np.zeros((paths.num_users, k))
    movie_features = np.zeros((k, paths.num_movies))
    train(k, mean_predictions, user_features, movie_features, bu, bm, train_samples, test_samples)
    print("Calculating predictions and writing file")
    prediction_matrix = final_predictions(mean_predictions, user_features, movie_features, bu, bm)
    dh.write_submission(prediction_matrix)
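# A hedged sketch of the biased matrix-factorisation model the shapes above
# imply; this is an assumption about final_predictions, not the original
# code, and it further assumes mean_predictions broadcasts over the
# (num_users, num_movies) grid (e.g. per-movie means).
import numpy as np

def final_predictions_sketch(mean_predictions, user_features, movie_features, bu, bm):
    # baseline mean + user bias + movie bias + rank-k interaction term
    return (mean_predictions
            + bu[:, np.newaxis]
            + bm[np.newaxis, :]
            + user_features @ movie_features)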
def playing_with_data():
    #folder_data = '/home/maja/PhDProject/human_data/data/'
    folder_data = '/home/maja/PhDProject/data/'
    folder_data = '/home/maja/PhDProject/human_data/data/'
    folder_data = '/home/maja/PhDProject/data/'
    folder_specific = '2013_07_31/'  # 'HT_2013_04_02/'
    folder_specific = 'others/'
    folder_specific = '2013_08_10/'
    file_data = folder_specific + '2013_07_31_0002.abf'  # 2013_04_02_0013.abf
    file_data = folder_specific + '2013_07_03 PR1_0000.abf'
    file_data = folder_specific + '2013_09_03_0002.abf'
    #file_data = folder_specific + '2013_09_03_0006.abf'
    file_data = folder_specific + '2013_09_05_0009_afterNBQX.abf'
    file_data = folder_specific + '2013_09_05_0019_synch.abf'
    file_data = folder_specific + '2013_09_05_0017.abf'
    file_data = folder_specific + '2013_10_08_0002.abf'
    folder_save = '/home/maja/PhDProject/data/2013_07_31/saved/'
    folder_save = '/home/maja/PhDProject/human_data/data/others/'
    file_save = folder_save + 'all_data_gabaB.npz'
    #file_save = folder_save + 'data.dat'
    data, scale, fs = dh_temp.read_data(folder_data, file_data)
    dh_temp.save_data(folder_save, file_save, data, scale, fs)
    del data, scale, fs
    display.plot_data(folder_save, file_save, x_scale='ms')
def run():
    logging.config.fileConfig("logging_config.ini")
    print("Processing data")
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.1)
    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]
    train(df_train_data, df_test_data)
def execute_approach():
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    A: np.ndarray = fill_averages(df_data)
    U, Vh = perform_svd(A)
    # K = 10 was the winning value from the cross validation
    k = 10
    prediction_matrix = make_predictions(k, U, Vh)
    assert prediction_matrix.shape == (paths.num_users, paths.num_movies)
    dh.write_submission(prediction_matrix)
def run():
    logging.config.fileConfig("logging_config.ini")
    print("Processing data")
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.1)
    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]
    # cross_validation(df_train_data, df_test_data)
    # assign the best result from cross validation to K
    K = 10
    train(K, df_train_data, df_test_data)
def run():
    logging.config.fileConfig("logging_config.ini")
    print("Processing data")
    df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
    data_dict: dict = dh.split_original_data(df_data, 0.1)
    df_train_data: pd.DataFrame = data_dict["train_data"]
    df_test_data: pd.DataFrame = data_dict["test_data"]
    print("Calculating initialization data")
    mean_predictions = calculate_all_means(df_train_data)
    train_samples: np.ndarray = dh.df_as_array(df_train_data)
    # Perform either cross validation or a single run using the best result
    # cross_validation(df_train_data, train_samples, df_test_data, mean_predictions)
    k = 10
    execute_approach(k, df_train_data, train_samples, df_test_data, mean_predictions)
def get_embeddings_tokenizer(filename, EMBEDDING_DIM):
    """Get the embeddings and the tokenizer for words."""
    data = read_data(filename)
    texts = []
    embedding_matrix = np.random.randn(len(data), EMBEDDING_DIM)
    embedding_matrix = embedding_matrix.astype(np.float64)
    for i in range(len(data)):
        raw = data[i].split()
        label = raw[0]
        vector = [float(x) for x in raw[1:EMBEDDING_DIM + 1]]
        texts.append(label)
        embedding_matrix[i] = vector
    word_tokenizer = Tokenizer()
    word_tokenizer.fit_on_texts(texts)
    word_tokenizer.word_index['<<SPAD>>'] = len(word_tokenizer.word_index) + 1
    return embedding_matrix, word_tokenizer
def execute(res=''):
    seed(47)
    np.random.seed(47)
    datasets = read_data()
    dic = {}
    l = []
    for atype in datasets.keys():
        print(atype)
        dic[atype] = {}
        df = datasets[atype]
        #import pdb
        #pdb.set_trace()
        X, y = df[df.columns[:-1]], df[df.columns[-1]]
        sss = StratifiedShuffleSplit(n_splits=5, test_size=.2)
        for m in metrics:
            print(m)
            dic[atype][m] = {}
            for l in learners:
                dic[atype][m][l] = {'flash': [], 'default': []}
                for train_index, test_index in sss.split(X, y):
                    train_df = df.iloc[train_index]
                    test_df = df.iloc[test_index]
                    tuner = get_tuner(l)
                    best_config = tuning(tuner, train_df, project_name="", metric=m)
                    default_config = tuner.default_config
                    x_train, y_train = train_df[train_df.columns[:-1]], train_df[train_df.columns[-1]]
                    x_test, y_test = test_df[test_df.columns[:-1]], test_df[test_df.columns[-1]]
                    tuned_score = measure_fitness(tuner, x_train, y_train, x_test, y_test, best_config, m)
                    default_score = measure_fitness(tuner, x_train, y_train, x_test, y_test, default_config, m)
                    dic[atype][m][l]['flash'].append(tuned_score)
                    dic[atype][m][l]['default'].append(default_score)
                print(l, dic[atype][m][l])
            print()
        print("*" * 10)
    with open('dump/flash.pickle', 'wb') as handle:
        pickle.dump(dic, handle)
def init(data=[], files=globe.dic['files'], replot=globe.dic['replots']):
    dic = globe.dic
    for i, filename in enumerate(files):
        if dic['Verbose'] > 0:
            print "loading", filename
        sys_err = dic['sys_err_default']
        if len(filename.split('#')) == 2:
            sys_err = float(filename.split('#')[1].strip())
            filename = filename.split('#')[0].strip()
        if dic['outputs']:
            output = dic['outputs'].pop()
        else:
            output = '.'.join(filename.split('.')[:-1])
        dic['numbered'] = 0
        # Now read data file
        blocks = make_blocks(read_data(filename))
        if blocks:
            for j, b in enumerate(blocks):
                if dic['GroupBy'] == 'files':
                    data.append([[i, j], filename, output, b, sys_err])
                elif dic['GroupBy'] == 'blocks':
                    data.append([[j, i], filename, output, b, sys_err])
    data.sort(key=lambda x: x[0])
    data = structure(data)
    for i, filename in enumerate(replot):
        if dic['Verbose'] > 0:
            print "reloading data from", filename
        if len(filename.split('#')) == 2:
            filename = filename.split('#')[0].strip()
        data = data + reload_plot(filename)
    return data
"""Divide the data into train and test sets.""" nrmlzd_list, size = find_distribution(data) train_size = int(0.8 * size) test_size = int(0.2 * size) print(train_size, test_size) train = [] test = [] train_id = [] test_id = [] counter = 0 for each in nrmlzd_list: for i in range(0, train_size): data[each[i]]['id'] = each[i] train.append(data[each[i]]) train_id.append(each[i]) #pdb.set_trace() for j in range(0, test_size): data[each[j]]['id'] = each[j] test.append(data[each[j]]) test_id.append(each[j]) #pdb.set_trace() print(len(train), len(test)) return train, test if __name__ == "__main__": data = read_data("final_codemixed.json") train, test = divde_train_test(data) write_to_file(train, "train_data.json") write_to_file(test, "test_data.json")
loader.load()

from indicnlp.transliterate.unicode_transliterate import ItransTransliterator


def transliterate(data, lang):
    """Transliterator."""
    total = len(data)
    new_data = list()
    for i in range(len(data)):
        printProgressBar(i + 1, total, prefix='Progress:', suffix='Complete', length=50)
        # Use the lang argument rather than the module-level LANG constant.
        new_data.append(ItransTransliterator.to_itrans(data[i], lang))
    return new_data


if __name__ == "__main__":
    LANG = 'hi'
    INPUT_FILE = "/home/chrizandr/code-mixing/data/IITB.en-hi.hi"
    OUTPUT_FILE = "/home/chrizandr/code-mixing/data/IITB.en-hi.hi.roman"
    print("Reading data")
    original_text = read_data(INPUT_FILE, encoding="UNI")
    print("Transliterating")
    romanized_text = transliterate(original_text, LANG)
    print("Writing to file")
    write_data(OUTPUT_FILE, romanized_text, encoding="UNI")
def q4c():
    print "\n------------------------------------------Question C------------------------------------------"
    trainMatrix = dh.read_data('./data/zip.train')
    testMatrix = dh.read_data('./data/zip.test')
    PCA = 2
    print "PCA: "
    gammaMin = 0.0
    gammaMax = 0.02
    gammaNum = 10
    cMin = 0.0
    cMax = 10
    cNum = 20
    PCAmin = 0
    PCAmax = 100
    PCAnum = 100
    rbf, trainSVMY, testSVMY, trainX, testX, trainY, testY, testError, trainError, cvError, gamma, C, k, runTime, valErrors = ln.margin_svm(
        trainMatrix, testMatrix, PCA=PCA, matrixList1=[1],
        matrixList2=[0, 2, 3, 4, 5, 6, 7, 8, 9],
        gammaMin=gammaMin, gammaMax=gammaMax, gNum=gammaNum,
        cMin=cMin, cMax=cMax, cNum=cNum,
        PCAmin=PCAmin, PCAmax=PCAmax, PCAnum=PCAnum)
    for tmpk, gammaDict in valErrors.items():
        tmpC, tmpgamma = min(gammaDict, key=gammaDict.get)
        print "K: ", tmpk, "\tC: ", tmpC, "\tGamma: ", tmpgamma, \
            "\tcvError: ", gammaDict[(tmpC, tmpgamma)], \
            "\ttrainError: ", trainError[tmpk], "\ttestError: ", testError[tmpk]
    print "gamma: ", gamma, "\ttime: ", runTime, "\tC: ", C, "\tFeatures: ", k
    print "trainError: ", trainError[k], "\ntestError: ", testError[k], "\ncvError: ", cvError[k]
    #graph_setup("k", "Error", "graph")
    #PCA_graph_add(trainError, "Train Error", 'blue')
    #PCA_graph_add(testError, "Test Error", 'green')
    #PCA_graph_add(cvError, "CV Error", 'red')
    #plt.legend()
    #plt.savefig("q3c-pca.png")
    graph_setup("Feature 1", "Feature 2", "PCA Test Set Results")
    q4c_graph(testX, testY, rbf, "pca-contour-test.eps")
    graph_setup("feature1", "feature2", "PCA Train Set Results")
    q4c_graph(trainX, trainY, rbf, "pca-contour-train.eps", 1)

    # --------------------------------------------------------------------------
    # NEXT SECTION
    # --------------------------------------------------------------------------
    trainMatrix = dh.read_data('./data/features.train')
    testMatrix = dh.read_data('./data/features.test')
    PCA = 3
    print "\nFeature: "
    rbf, trainSVMY, testSVMY, trainX, testX, trainY, testY, testError, trainError, cvError, gamma, C, k, runTime, valErrors = ln.margin_svm(
        trainMatrix, testMatrix, PCA=PCA, matrixList1=[1],
        matrixList2=[0, 2, 3, 4, 5, 6, 7, 8, 9],
        gammaMin=gammaMin, gammaMax=gammaMax, gNum=gammaNum,
        cMin=cMin, cMax=cMax, cNum=cNum,
        PCAmin=PCAmin, PCAmax=PCAmax, PCAnum=PCAnum)
    for tmpk, gammaDict in valErrors.items():
        tmpC, tmpgamma = min(gammaDict, key=gammaDict.get)
        print "K: ", tmpk, "\tC: ", tmpC, "\tGamma: ", tmpgamma, \
            "\tcvError: ", gammaDict[(tmpC, tmpgamma)], \
            "\ttrainError: ", trainError[tmpk], "\ttestError: ", testError[tmpk]
    print "gamma: ", gamma, "\ttime: ", runTime, "\tC: ", C, "\tFeatures: ", k
    print "trainError: ", trainError[k], "\ntestError: ", testError[k], "\ncvError: ", cvError[k]
    #PCA_graph_add(trainError, "Train Error", 'blue')
    #PCA_graph_add(testError, "Test Error", 'green')
    #PCA_graph_add(cvError, "CV Error", 'red')
    #plt.legend()
    #plt.savefig("q3c-feature.png")
    graph_setup("Feature 1", "Feature 2", "Feature Data Test Set Results")
    q4c_graph(testX, testY, rbf, "feature-contour-test.eps")
    graph_setup("Feature 1", "Feature 2", "Feature Data Train Set Results")
    q4c_graph(trainX, trainY, rbf, "feature-contour-train.eps", 1)
""".""" from data_handler import read_data, write_data, break_in_subword import pdb INPUT = "data/IITB.en-hi.hi.roman.clean" OUTPUT = "data/IITB.en-hi.hi.syll" print "Reading" data = read_data(INPUT, encoding="UNI", clean=True) print "Breaking" new_data = break_in_subword(data, sentences=True) print "Writing" write_data(OUTPUT, new_data, encoding="UNI") pdb.set_trace()
loader.load()

from indicnlp.transliterate.unicode_transliterate import ItransTransliterator


def transliterate(data, lang):
    """Transliterator."""
    total = len(data)
    new_data = list()
    for i in range(len(data)):
        print(i, len(data))
        # printProgressBar(i+1, total, prefix='Progress:', suffix='Complete', length=50)
        try:
            # Use the lang argument rather than the module-level LANG constant.
            new_data.append(ItransTransliterator.to_itrans(data[i], lang))
        except IndexError:
            print(data[i])
    return new_data


if __name__ == "__main__":
    LANG = 'hi'
    INPUT_FILE = "/home/chrizandr/code-mixing/data/IITB.en-hi.hi"
    OUTPUT_FILE = "/home/chrizandr/code-mixing/data/IITB.en-hi.hi.roman"
    print("Reading data")
    original_text = read_data(INPUT_FILE, encoding="UNI", clean=False)
    print("Transliterating")
    romanized_text = transliterate(original_text, LANG)
    print("Writing to file")
    write_data(OUTPUT_FILE, romanized_text, encoding="UNI")
import sys

from keras.callbacks import ModelCheckpoint

import data_handler
import model
from data_handler import usable_chars

train_x, train_y = data_handler.read_data("train.txt", 50)
test_x, test_y = data_handler.read_data("test.txt", 50)

lstm = model.Model(y_shape=len(usable_chars), batch_size=50)

weights_file = "weights.hdf5"
checkpoint = ModelCheckpoint(weights_file, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

if len(sys.argv) > 1:
    lstm.load_weights(sys.argv[1])
else:
    lstm.train_model(train_x, train_y, test_x, test_y, epochs=50, callbacks=callbacks_list)
    lstm.load_weights("weights.hdf5")

acc, loss = lstm.test_model(train_x, train_y)
print("Training Accuracy:", acc, "Training Loss:", loss)
        self.n_features = 1

    def train(self, X, y, epochs, verbose):
        self.model.fit(X, y, epochs=epochs, verbose=verbose)

    def test(self, X, y):
        for i in range(len(X)):
            input = X[i].reshape((1, self.steps, 1))
            yhat = self.model.predict(input, verbose=1)
            print(y[i], yhat[0][0], np.mean(input[0]))

# model.fit(X, y, epochs=200, verbose=1)


if __name__ == "__main__":
    # numpy.set_printoptions(threshold=sys.maxsize)
    time = data_handler.read_data("lob_datatrial0001.csv", "TIME")
    prices = data_handler.read_data("lob_datatrial0001.csv", "MIC")
    # splitting data into chunks of 4
    steps = 59
    reshape = True
    # X, y = data_handler.split_data(prices, steps, reshape)
    # split_ratio = [9,1]
    # train_X, test_X = data_handler.split_train_test_data(X, split_ratio)
    # train_X = train_X.reshape((-1, steps, 1))
    # test_X = test_X.reshape((-1, steps, 1))
    # train_y, test_y = data_handler.split_train_test_data(y, split_ratio)
import data_handler
import general_functions
from evaluation import evaluation
import cPickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

if __name__ == "__main__":
    ###
    # 1 read data into dictionary
    ###
    raw_data_dictionary = data_handler.read_data("./data")

    ###
    # 2 generate id list and text list from dictionary
    ###
    id_list, raw_text_list = data_handler.generate_lists_from_dictionary(raw_data_dictionary)

    ###
    # 3 stem texts and remove stopwords
    ###
    num_data = 20000
    stemmed_text_list_all = data_handler.stem_text_and_remove_stopwords(raw_text_list, "Krovetz")
    stemmed_text_list = stemmed_text_list_all[:num_data]
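    ###
    # 4 vectorise the stemmed texts (hypothetical continuation: the otherwise
    #   unused TfidfVectorizer import above suggests this step, but it is an
    #   assumption, not the original code, and it assumes stemmed_text_list
    #   holds one string per document)
    ###
    # vectorizer = TfidfVectorizer()
    # tfidf_matrix = vectorizer.fit_transform(stemmed_text_list)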
vec_spec = vehicle_spec.VehicleSpec(angle_norm=30, image_crop_vert=[220, 480])

data_path = '/home/elschuer/data/LaneKeepingE2E/images_train_augmented/'
desc_file = 'data_labels.csv'
contains_full_path = True
model_name = 'nvidia_model.h5'
convert_image = False
image_channels = 1

data_handler = data_handler.DataHandler(
    data_path, desc_file, vehicle_spec=vec_spec,
    contains_full_path=contains_full_path,
    convert_image=convert_image, image_channels=1)
data_handler.read_data()

if analyze_data:
    data_analyzer = data_analyzer.DataAnalyzer()
    data_analyzer.showDataDistribution(data_handler.y_data)
    data_analyzer.print_samples_not_equal_zero(data_handler.y_data)

model_trainer = ModelTrainer(epochs=10, data_handler=data_handler, model_name=model_name)
model_trainer.train_model()

if shutdown_on_finish:
    os.system("shutdown now -h")