def run_transform(name, data_x, data_y, transformer):
    print("Working on {}...".format(name))
    report_name = "reports/{}_nn_output.txt".format(name)
    sys.stdout = open(report_name, "w")

    # 2. transform the data
    transform_x = transformer(data_x, data_y, name)
    plot_corr(name, pd.DataFrame(data=transform_x), data_y)

    kmeans_name = "{} KMeans Clustered".format(name)
    gmm_name = "{} GMM Clustered".format(name)

    # 3. cluster the transformed data
    kmeans_clustered = kmeans(kmeans_name, transform_x, data_y)
    gmm_clustered = gmm(gmm_name, transform_x, data_y)

    # 4. run neural network on the transformed data
    x_train, x_test, y_train, y_test = split_data(transform_x, data_y)
    run_nn(name, x_train, x_test, y_train, y_test)

    # 5. call run_nn on the clusters from step 3 (clustered from dimensionally reduced data)
    kmx_train, kmx_test, kmy_train, kmy_test = split_data(kmeans_clustered, data_y)
    run_nn(kmeans_name, kmx_train, kmx_test, kmy_train, kmy_test)

    gmmx_train, gmmx_test, gmmy_train, gmmy_test = split_data(gmm_clustered, data_y)
    run_nn(gmm_name, gmmx_train, gmmx_test, gmmy_train, gmmy_test)

    sys.stdout = sys.__stdout__
    print("Finished {}!".format(name))
    print()
def make_src_tgt_data():
    train_dir = "train.txt"
    eval_dir = "eval.txt"
    test_dir = "test.txt"

    train_data = utils.load_data(train_dir)
    eval_data = utils.load_data(eval_dir)
    test_data = utils.load_data(test_dir)

    train_data = utils.filter_sentences_with_punct(train_data)
    eval_data = utils.filter_sentences_with_punct(eval_data)
    test_data = utils.filter_sentences_with_punct(test_data)

    token_train_data = utils.tokenize_data(train_data)
    token_eval_data = utils.tokenize_data(eval_data)
    token_test_data = utils.tokenize_data(test_data)

    index2word, word2index = utils.build_vocab_with_nltk(token_train_data, 10000)

    train_src_data, train_tgt_data = utils.split_data(token_train_data, Limits)
    eval_src_data, eval_tgt_data = utils.split_data(token_eval_data, Limits)
    test_src_data, test_tgt_data = utils.split_data(token_test_data, Limits)

    utils.save_data("train_src_data.txt", train_src_data)
    utils.save_data("train_tgt_data.txt", train_tgt_data)
    utils.save_data("eval_src_data.txt", eval_src_data)
    utils.save_data("eval_tgt_data.txt", eval_tgt_data)
    utils.save_data("test_src_data.txt", test_src_data)
    utils.save_data("test_tgt_data.txt", test_tgt_data)
    utils.save_data("vocab.txt", index2word)
def _data(data_pth, split_val=True, verbose=1):
    data = np.load(data_pth, allow_pickle=True)
    x, y = data['x'], data['y']
    x = x[:, :, np.newaxis]

    x_train, y_train, x_test, y_test = split_data(x, y)
    class_weights_dict = calc_class_weights(y_train)

    if split_val:
        x_train, y_train, x_val, y_val = split_data(x_train, y_train)
        y_train = to_one_hot(y_train, dimension=10)
        y_test = to_one_hot(y_test, dimension=10)
        y_val = to_one_hot(y_val, dimension=10)
        if verbose:
            print('\nx_train shape: %s'
                  '\ny_train shape: %s'
                  '\nx_test shape: %s'
                  '\ny_test shape: %s'
                  '\nx_val shape: %s'
                  '\ny_val shape: %s'
                  % (x_train.shape, y_train.shape, x_test.shape,
                     y_test.shape, x_val.shape, y_val.shape))
        return x_train, y_train, x_test, y_test, x_val, y_val, class_weights_dict
    else:
        y_train = to_one_hot(y_train, dimension=10)
        y_test = to_one_hot(y_test, dimension=10)
        if verbose:
            print('\nx_train shape: %s'
                  '\ny_train shape: %s'
                  '\nx_test shape: %s'
                  '\ny_test shape: %s'
                  % (x_train.shape, y_train.shape, x_test.shape, y_test.shape))
        return x_train, y_train, x_test, y_test, class_weights_dict
def load_dataset(cfg):
    length_std = float(cfg['length_std'])
    length_mean = float(cfg['length_mean'])
    noise_std = float(cfg['noise_std'])
    length = int(cfg['length'])
    nb_past_steps = int(cfg['nb_past_steps'])
    nb_future_steps = int(cfg['nb_future_steps'])
    train_fraction = float(cfg['train_fraction'])
    test_fraction = float(cfg['test_fraction'])
    valid_fraction = float(cfg['valid_fraction'])

    sequence = generate_sequence(length_std, length_mean, noise_std, length)
    xs, ys = utils.sequence_to_supervised(sequence, nb_past_steps, nb_future_steps)
    xs = np.expand_dims(xs, axis=2)
    ys = np.expand_dims(ys, axis=1)

    x_train, x_valid, x_test = utils.split_data(xs, train_fraction,
                                                valid_fraction, test_fraction)
    y_train, y_valid, y_test = utils.split_data(ys, train_fraction,
                                                valid_fraction, test_fraction)

    return x_train, y_train, x_valid, y_valid, x_test, y_test
def load_data(cfg):
    xml_path = cfg['xml_path']
    nb_past_steps = int(cfg['nb_past_steps'])
    nb_future_steps = int(cfg['nb_future_steps'])
    train_fraction = float(cfg['train_fraction'])
    valid_fraction = float(cfg['valid_fraction'])
    test_fraction = float(cfg['test_fraction'])

    xs, ys = load_glucose_data(xml_path, nb_past_steps, nb_future_steps)
    ys = np.expand_dims(ys, axis=1)

    x_train, x_valid, x_test = utils.split_data(xs, train_fraction,
                                                valid_fraction, test_fraction)
    y_train, y_valid, y_test = utils.split_data(ys, train_fraction,
                                                valid_fraction, test_fraction)

    # scale data
    scale = float(cfg['scale'])
    x_train *= scale
    y_train *= scale
    x_valid *= scale
    y_valid *= scale
    x_test *= scale
    y_test *= scale

    return x_train, y_train, x_valid, y_valid, x_test, y_test
def main():
    isconcat = True
    #isconcat = False

    #modelname = 'lambdanet-b512-model.h5'
    modelname = 'lambdanet-b512-l01-model.h5'
    #modelname = 'lambdanet-b512-windows-model.h5'

    #predfilename = 'lamdbanet-b512-pred.dat'      # acc1:76 lambdanet max pooling result
    #predfilename = 'lambdanet-b512-pred.dat'      # acc1:60 lambdanet avg pooling result
    #predfilename = 'rank-pred.dat'                # acc1:65 svm result
    predfilename = 'lambdanet-b512-l01-pred.dat'   # acc1:74 lambdanet max pooling result
    #predfilename = 'lambdanet-b512-windows-pred.dat'

    stage = 3
    if stage <= 0:
        utils.prepare_data(vocab_size)
    if stage <= 1:
        utils.split_data(n=5)
        #utils.split_data(n=10)
    if stage <= 2:
        #train(n=5, isconcat=isconcat, modelname=modelname)
        train_lambda(n=5, isconcat=isconcat, modelname=modelname)
        #train_lambda(n=10, isconcat=isconcat, modelname=modelname)
    if stage <= 3:
        predict(n=5, isconcat=isconcat, modelname=modelname, predfilename=predfilename)
    if stage <= 4:
        utils.calc_metric(n=5, predfilename=predfilename)
        utils.calc_metric_method(n=5, predfilename=predfilename)
def find_best_c(x, y, share):
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)
    best_c = 2 ** -7
    best_f1 = 0
    for i in range(-7, 7):
        c = 2 ** i
        v = train(x_train, y_train, c)
        p, r = utils.process_result(test(x_check, y_check, v))
        f1 = utils.f1(p, r)
        if f1 > best_f1:
            best_f1 = f1
            best_c = c
    return best_c
def main(FLAG):
    Model = SimpleModel(FLAG.input_dim, FLAG.hidden_dim, FLAG.output_dim,
                        optimizer=tf.train.RMSPropOptimizer(FLAG.learning_rate))

    image, label = load_dataset()
    image, label = image_augmentation(image, label, horizon_flip=True,
                                      control_brightness=True)
    label = label / 96.

    (train_X, train_y), (valid_X, valid_y), (test_X, test_y) = split_data(image, label)

    if FLAG.Mode == "validation":
        lr_list = 10 ** np.random.uniform(-6, -2, 20)
        Model.validation(train_X, train_y, valid_X, valid_y, lr_list)
    elif FLAG.Mode == "train":
        Model.train(train_X, train_y, valid_X, valid_y, FLAG.batch_size,
                    FLAG.Epoch, FLAG.save_graph, FLAG.save_model)
        pred_Y = Model.predict(test_X[123])
        print(pred_Y)
        print(test_y[123])
        print(np.mean(np.square(pred_Y - test_y[123])))
def main():
    # initialise the models
    vmodel = VideoNet().to(device)
    amodel = AudioNet().to(device)
    avmodel = AVNet().to(device)

    vmodel.load_state_dict(torch.load('vmodel_final.pt'))
    amodel.load_state_dict(torch.load('amodel_final.pt'))
    avmodel.load_state_dict(torch.load('avmodel_final.pt'))
    print('loaded model')

    params = list(vmodel.parameters()) + list(amodel.parameters()) + list(avmodel.parameters())
    # optimiser = optim.Adam(params, lr=LR)
    optimiser = optim.SGD(params, lr=LR, momentum=0.9)

    # ensure no extra files like .DS_Store are present
    list_vid = os.listdir('data/train/full_vid')
    train_list, val_list = utils.split_data(list_vid, 0.8, 0.2)

    # log the lists for reference
    utils.log_list(train_list, 'data/train_list.txt')
    utils.log_list(val_list, 'data/val_list.txt')
    # uncomment the following to read a previous list
    # train_list = utils.read_list('data/train_list.txt')
    # val_list = utils.read_list('data/val_list.txt')

    train_list = ['video_001.mp4']

    composed = transforms.Compose([Resize(256), RandomCrop(224)])
    # composed = transforms.Compose([Resize(256)])

    train_loader = torch.utils.data.DataLoader(AVDataset(train_list[:1], transform=composed),
                                               batch_size=batch_size, shuffle=False, num_workers=4)
    val_loader = torch.utils.data.DataLoader(AVDataset(train_list[:1], transform=composed),
                                             batch_size=batch_size, shuffle=False, num_workers=4)

    l, p, cam = val(vmodel, amodel, avmodel, val_loader)
    print(p, cam.shape)

    import skvideo.io
    vids = skvideo.io.vread('data/train/' + 'snippet/video_001.mp4')
    # print('vids', vids)
    findcam(np.expand_dims(vids, 0), np.abs(cam.cpu().numpy()))
def test1():
    top_n = int(input("Input the number of recommendations\n"))
    k = int(input("Input the number of related users\n"))
    data = utils.get_data()
    train, test = utils.split_data(data, 2, 1, 1)
    del data
    user = int(input("Input the user id \n"))

    # NOTE: part of the body was elided in the source ("******"); it appears to have
    # printed the user's training movies and timed the computation of `weight` and
    # `related_users`, which are used below.
    print("The train set contains the movies of the user: "******"it takes ",
          (end_time - start_time).seconds, " seconds to get W")

    start_time = datetime.datetime.now()
    rec = get_recommendation(user, train, top_n, k, weight, related_users)
    end_time = datetime.datetime.now()
    print("it takes ", (end_time - start_time).seconds,
          " seconds to get recommend for one user")
    print(rec)

    for item in rec:
        print(item),
        if item in test[user]:
            print(" True")
        else:
            print(" False")
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)

    X, y, sentences, index_to_word = utils.load_sentiment_data(FLAGS.max_length)
    vocab_size, n_classes = X.shape[2], y.shape[1]
    X_train, y_train, X_test, y_test = utils.split_data(X, y)

    with tf.Session() as sess:
        deep_pdf = SentimentRNN(
            sess,
            vocab_size=vocab_size,
            n_classes=n_classes,
            batch_size=FLAGS.batch_size,
            keep_prob=FLAGS.keep_prob,
            max_length=FLAGS.max_length,
            n_recurrent_layers=FLAGS.n_recurrent_layers,
            n_fc_layers=FLAGS.n_fc_layers,
            recurrent_layer_width=FLAGS.recurrent_layer_width,
            fc_layer_width=FLAGS.fc_layer_width,
            checkpoint_dir=FLAGS.checkpoint_dir,
            epoch=FLAGS.epoch)

        if FLAGS.is_train:
            deep_pdf.train(FLAGS, X_train, y_train, X_test, y_test)
        else:
            deep_pdf.load(FLAGS.checkpoint_dir)
def main(): print("--------- Naive Bayes ---------") data_set = utils.read_data('dataset.txt') train_data, test_data = utils.split_data(data_set, 0.3) prob_matrix, prob_yi = train(train_data) test(prob_matrix, prob_yi, test_data)
def find_best_c(x, y, share, count):
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)
    best_f1 = 0
    best_c = -1
    c = 10
    while c <= 40:
        w1, w2 = train(x_train, y_train, c, count)
        p, r = utils.process_result(test(x_check, y_check, w1, w2))
        f1 = utils.f1(p, r)
        if f1 > best_f1:
            best_f1 = f1
            best_c = c
        c += 10
    return best_c
def parse_coauthor(file):
    """
    Parse & convert coauthor data into a DataFrame.

    Args:
    -- file: coauthor file address, encoded in utf-8.

    coauthor file:
    https://lfs.aminer.cn/lab-datasets/aminerdataset/AMiner-Coauthor.zip

    ETA 10 min
    """
    with open(file, encoding='utf-8') as f:
        data = f.readlines()

    def process(d):
        df = pd.DataFrame(columns=['1st', '2nd', 'num'])
        for c in d:
            c = c.lstrip('#')
            c = c.rstrip('\n')
            df = df.append(
                {col: val for col, val in zip(df.columns, c.split('\t'))},
                ignore_index=True)
        return df

    coauthor_df = multiprocess(process, split_data(data, size=2000))
    coauthor_df['num'] = coauthor_df['num'].astype('int64')
    return coauthor_df
def choose_best_vec():
    train_sets_names = ["A", "B", "C", "D"]
    print("Choosing best vectorization method:")
    clf = LinearDiscriminantAnalysis()
    best_score = 0
    best_vec_method = ""
    for vec_method in train_sets_names:
        X_train, y_train, X_test, _ = training_data()
        X_train, X_test = extract_feature(X_train, X_test, vec_method)
        # Split training set into predefined train and cross-validation sets
        X_t, X_cv, y_t, y_cv, _ = split_data(X_train, y_train)
        model = clf.fit(X_t, y_t)
        y_pred = model.predict(X_cv)
        score = accuracy_score(y_cv, y_pred)
        print("Method:", vec_method, "cv accuracy:", score)
        if score > best_score:
            best_score = score
            best_vec_method = vec_method
    print("Best vectorization method:", best_vec_method, "with score of:", best_score)
    return best_vec_method
def __init__(self, env, batch: np.array, V=None, lr=1e-4, pi_eval=None):
    self.env = env
    self.batch = batch
    # for tc
    self.V = V
    if V is not None:
        self.pi = LinearPolicy(V.get_feature_vec_len(), env.action_space.n, lr)
    else:
        self.pi = LinearPolicy(env.observation_space.shape[0], env.action_space.n, lr)
    self.pi_eval = pi_eval

    # process the batch of data, generating one-hot vectors
    data_x, data_y = utils.pre_process_batch(env, None, batch, V)
    self.train_x, self.train_y, _, _ = utils.split_data(data_x, data_y, ratio=0.0)
    assert len(self.train_x) == len(data_x)

    # generate a separate validation set
    #val_batch = utils.generate_batch(self.env, self.pi_eval, 0.05 * len(self.batch))
    #self.test_x, self.test_y = utils.pre_process_batch(env, None, val_batch, V)

    print('Number of training trajs {}'.format(len(self.batch)))
    print('Number of training steps {}'.format(len(self.train_x)))
    #print('Number of testing trajs {}'.format(len(val_batch)))
    #print('Number of testing steps {}'.format(len(self.test_x)))

    self.compute_ris_estimates()
def prepare_data(dataset_path, vectorization='bow', verbose=False):
    X, y = utils.prepare_xy(dataset_path)

    bow_path = 'app/saves/embeddings/bow.pickle'
    tfidf_path = 'app/saves/embeddings/tfidf.pickle'

    if vectorization == 'bow':
        if os.path.isfile(bow_path):
            X = load_vectors(bow_path)
        else:
            X = vecotrize_bow(X, save_path=bow_path)
    elif vectorization == 'tfidf':
        if os.path.isfile(tfidf_path):
            X = load_vectors(tfidf_path)
        else:
            X = vectorize_tfidf(X, save_path=tfidf_path)
    else:
        raise ValueError('Method not implemented!')

    X_train, X_test, X_val, y_train, y_test, y_val = utils.split_data(X, y)

    if verbose:
        print('Train')
        print(X_train.shape)
        print('Validation')
        print(X_val.shape)
        print('Test')
        print(X_test.shape)

    return X_train, X_val, X_test, y_train, y_val, y_test
def main(_):
    try:
        os.makedirs(FLAGS.save_dir)
    except:
        pass

    print('Load data')
    full_data = utils.load_age_data(FLAGS.data_dir)
    train_data, val_data = utils.split_data(full_data)
    traingen, valgen = data.AgeDatagen(FLAGS, train_data), data.AgeDatagen(FLAGS, val_data)

    # model = Model(FLAGS, '/cpu:0')
    model = AgeModel(FLAGS)

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        sess.run(tf.assign(model.lr, FLAGS.learning_rate))

        print('Let the train begin!')
        for epoch in range(FLAGS.epochs):
            sess.run(tf.assign(model.lr, FLAGS.learning_rate))
            FLAGS.learning_rate *= FLAGS.decay_rate
            pbar = tqdm(range(FLAGS.steps))
            for _ in pbar:
                x, y = next(traingen)
                loss, _ = sess.run([model.loss, model.train_op],
                                   {model.inputs: x, model.targets: y})
                pbar.set_description("loss: {:.2f}, ".format(loss))
def test_nn_framework():
    # Input: (M x T x B) matrix representing blocks in all ASTs
    all_matrix = utils.load_data("./data-created/q4_array_of_ast_matrices.npy")
    validation_matrix = utils.create_validation_matrix(all_matrix)
    print(all_matrix.shape)
    print(validation_matrix.shape)

    # Split into training/dev/test sets
    train_matrix, dev_matrix, test_matrix = utils.split_data(all_matrix)
    print(train_matrix.shape)
    print(dev_matrix.shape)
    print(test_matrix.shape)

    # Create model
    num_timesteps = all_matrix.shape[1]
    num_blocks = all_matrix.shape[2]
    model = create_model(num_timesteps, num_blocks)

    # Train the model
    #train_matrix = all_matrix[0, :, :]
    #train_labels = validation_matrix[0, :, :]
    #train_model(train_matrix, train_labels)
    train_model(model, all_matrix, validation_matrix)

    print(utils.accuracy_from_onehot_matrix(
        model.predict(test_matrix),
        utils.create_validation_matrix(test_matrix)))
def item_mean_test(ratings, min_num_ratings, verbose=False, p_test=0.1):
    """
    Splits the data set into train and test and computes the RMSE using the
    item mean as the prediction.

    :param ratings: initial data set (sparse matrix of size n x p, n items and p users)
    :param min_num_ratings: all users and items must have at least
        min_num_ratings ratings per user and per item to be kept
    :param verbose: True if the user wants details to be printed
    :param p_test: share of the data set to be dedicated to the test set
    :return: RMSE value of the prediction using item means as predictions
    """
    _, train, test = split_data(ratings, min_num_ratings, verbose=verbose, p_test=p_test)

    cumulated_rmse = 0

    # find the RMSE share due to each item
    for item in range(train.shape[0]):
        # compute the mean of non-zero ratings for the current item
        current_train_ratings = train[item]
        current_non_zero_train_ratings = current_train_ratings[
            current_train_ratings.nonzero()]
        if current_non_zero_train_ratings.shape[1] != 0:
            mean = current_non_zero_train_ratings.mean()
            # compute the rmse with all non-zero ratings of the current item in the test set
            current_test_ratings = test[item]
            current_non_zero_test_ratings = current_test_ratings[
                current_test_ratings.nonzero()].todense()
            cumulated_rmse += calculate_mse(current_non_zero_test_ratings, mean)

    cumulated_rmse = np.sqrt(float(cumulated_rmse) / test.nnz)
    return cumulated_rmse
def grade1():
    print("=" * 20 + "Grading Problem 1" + "=" * 20)
    marks = 0.0
    try:
        X, Y = utils.load_data2('data2.csv')
        X, Y = utils.preprocess(X, Y)
        X_train, Y_train, X_test, Y_test = utils.split_data(X, Y)
        W, train_mses, test_mses = p1.ista(X_train, Y_train, X_test, Y_test, _lambda=0.1)
        assert train_mses[-1] < 0.2
        marks += 1.5
    except:
        print('Train Error is large')
    try:
        assert test_mses[-1] < 0.25
        marks += 1.5
    except:
        print('Test Error is large')
    print("Marks obtained in Problem 1: ", marks)
    return marks
def main():
    [data, labels] = load_data()
    [x_train, y_train, x_test, y_test] = split_data(data, labels)
    val_labels = y_test

    # invert and normalise pixel values (cast to float so in-place division works)
    x_train = np.array(x_train, dtype='float32')
    x_train = 255 - x_train
    x_train /= 255
    x_test = np.array(x_test, dtype='float32')
    x_test = 255 - x_test
    x_test /= 255

    # multi-convnet model
    y_train = convert_to_multi_output_target(y_train)
    y_test = convert_to_multi_output_target(y_test)
    model = train(x_train, y_train, x_test, y_test)

    # convnet model
    #y_train = convert_to_general_target(y_train)
    #y_test = convert_to_general_target(y_test)
    #model = train(x_train, y_train, x_test, y_test, default_model='convnet')

    # pretrained model
    #model = load_model('my_model.h5')

    pred(model, x_test, val_labels)
def main():
    df_train, df_evaluate = read_challenge_data()
    df_train = process_and_filter_data(df_train, config)
    df_evaluate = process_evaluate_data(df_evaluate, config)

    X_train, X_test, y_train, y_test = split_data(df_train, config)

    run_baseline(X_train, X_test, y_train, y_test, config)
    gb_clf, y_pred_gb = run_gradient_boosting_classifier(X_train, X_test, y_train, y_test, config)
    rf_clf, y_pred_rf = run_random_forest(X_train, X_test, y_train, y_test, config)
    xgb_clf, y_pred_xgb = run_xgboost(X_train, X_test, y_train, y_test, config)

    # Voting classifier.
    voting_clf = VotingClassifier(estimators=[("rf", rf_clf), ("gb", gb_clf), ("xgb", xgb_clf)],
                                  voting="soft").fit(X_train, y_train)

    stratified_shuffle_split = StratifiedKFold(n_splits=10)
    cross_val_score_ = cross_val_score(voting_clf, X_train, y_train,
                                       cv=stratified_shuffle_split).mean()
    LOGGER.info(f"Voting classifier cross validation score: {cross_val_score_}")

    if config["test"]:
        print(classification_report(voting_clf.predict(X_test), y_test))

    final_model = voting_clf
    final_predictions = final_model.predict(df_evaluate)
    pd.DataFrame(final_predictions).to_csv("data/101617.txt", index=False, header=False)
def grade1():
    marks = 0
    try:
        X = np.random.rand(110, 5)
        Y = np.random.rand(110, 1)
        X_train, Y_train, X_test, Y_test = utils.split_data(X, Y, 0.75)
        assert np.allclose(np.vstack([X_train, X_test]), X)
        assert np.allclose(np.vstack([Y_train, Y_test]), Y)
        assert len(X_train) == 82 and len(Y_train) == 82
        marks += 0.5
    except:
        print('Q1 split_data() incorrect', file=stderr)
        return marks
    try:
        x = np.array([9.71711545, 5.27658861, 0.74957658, 7.25267862, 1.57512235,
                      4.95493874, 4.6645458, 8.81014817, 5.6875507, 8.9270358]).reshape(10, 1)
        y = np.array([7.4395211, 1.29711056, 4.99824035, 1.87706798, 0.93306619,
                      6.65645683, 8.6573449, 2.54946024, 1.3023241, 6.52289899]).reshape(10, 1)
        w = 0.513244
        b = 1.839345
        assert np.isclose(single_var_reg.mse(x, y, w, b), 4.319008411331635)
        marks += 0.5
    except:
        print('Q1 mse() incorrect', file=stderr)
        return marks
    try:
        X, Y = utils.load_data1('data1.csv')
        X_train, Y_train, X_test, Y_test = utils.split_data(X, Y)
        w, b, train_mses, test_mses = single_var_reg.ordinary_least_squares(
            X_train, Y_train, X_test, Y_test)
        assert train_mses[-1] < 52
        assert test_mses[-1] < 68
        for i in range(len(train_mses) - 1):
            assert train_mses[i] >= train_mses[i + 1]
        marks += 3
    except:
        print('Q1 ordinary_least_squares() incorrect', file=stderr)
        return marks
    return marks
def main():
    # parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--emb_file', default='data/giga5_glv.pkl', type=str)
    parser.add_argument('--corpus_file', default='data/semcor_corpus.pkl', type=str)
    parser.add_argument('--load_corpus_from_file', default=True, type=bool)
    parser.add_argument('--model_name', default='AverageLinear', type=str)
    parser.add_argument('--context_size', default=1, type=int)
    parser.add_argument('--lr', default=0.1, type=float)
    parser.add_argument('--max_epochs', default=5, type=int)
    parser.add_argument('--clip_grad', default=5.0, type=float)
    parser.add_argument('--batch_size', default=16, type=int)
    parser.add_argument('--val_iter', default=500, type=int)
    parser.add_argument('--print_iter', default=200, type=int)
    parser.add_argument('--hidden_size', default=50, type=int)
    args = parser.parse_args()

    # set random seeds
    #np.random.seed(100)
    #torch.manual_seed(10)

    # get the data
    if args.load_corpus_from_file and os.path.exists(args.corpus_file):
        # load the corpus from file if it already exists
        with open(args.corpus_file, 'rb') as cf:
            dataset = pickle.load(cf)
        print("Loaded corpus from {}!".format(args.corpus_file))
    else:
        dataset = SemCor(args.context_size)
        print("Parsed corpus!")
        # save the corpus for next time
        with open(args.corpus_file, 'wb') as cf:
            pickle.dump(dataset, cf)
        print("Saved corpus to {} !".format(args.corpus_file))

    # split up the data
    train_data, val_data, test_data = split_data(dataset)

    # load embeddings from file
    emb_weight_matrix_df = pd.read_pickle(args.emb_file)
    emb_weight_matrix = torch.tensor(emb_weight_matrix_df.values)

    # make the model (elif keeps the AverageLinear branch from falling through
    # to the "invalid model name" error)
    if args.model_name == "AverageLinear":
        model = AverageLinear(dataset.max_num_senses, emb_weight_matrix,
                              emb_weight_matrix_df)
    elif args.model_name == "LSTMEncoder":
        model = LSTMEncoder(dataset.max_num_senses, args.hidden_size,
                            emb_weight_matrix, emb_weight_matrix_df)
    else:
        raise Exception("Invalid model name: {}".format(args.model_name))

    # run the model
    train(model, train_data, val_data, args)
    test(model, test_data)
def eval_on_dataset(self, dataset, is_dataset_csv=False):
    if is_dataset_csv:
        dataset = utils.load_data_from_csv(dataset, self.use_cols)

    # Split data into features and labels
    x, y_true = utils.split_data(dataset, self.y_col)

    # Classify data
    y_pred = self.predict(x)

    # Evaluate predictions
    return utils.eval_predictions(y_true, y_pred)
def sub_split(args):
    ts = args.testsize
    if 0 < ts < 1:
        data_train, data_test, names = ul.split_data(args.file, ts)
        header = ','.join(names)
        ul.write_array(f"{1 - ts}_{args.output}", data_train, header=header)
        ul.write_array(f"{ts}_{args.output}", data_test, header=header)
    else:
        print("error")
def validate(self):
    # Validate the model on a held-out split of the training data
    train_x, y = self.load_train()
    train_x, test_x, train_y, test_y = u.split_data(train_x, y)
    self.fit(train_x, train_y)
    print("Validating")
    preds = self.compute_predict(test_x)
    print(metrics.classification_report(test_y, preds))
def prepare_quries_answers(args):
    chat_data = utils.load_data(args.data_dir)
    chat_data = utils.filter_sentences(chat_data, args.whitelist)
    index2word, word2index = utils.build_vocab(chat_data, max_words=args.max_words)
    Limits.q_max_len, Limits.a_max_len, Limits.q_min_len, Limits.a_min_len = \
        args.q_max_len, args.a_max_len, args.q_min_len, args.a_min_len
    queries, answers = utils.split_data(chat_data, Limits)
    queries, answers = utils.vectorize(queries, answers, word2index, sort_by_len=True)
    return queries, answers, index2word, word2index
def main():
    # Extract list of stopwords
    with open(conf.STOPWORDS_FILE, 'r') as f:
        stopword_list = f.read()
    sw = set([w.strip() for w in stopword_list.split()])

    # Split data into ham and spam folders
    train_data_path = os.path.abspath(os.path.join(conf.TRAIN_DIR))
    train_file_path = os.path.abspath(os.path.join(conf.TRAIN_FILE))
    utils.split_data(train_file_path, train_data_path)

    # Process training data and prepare sets of (features, label) data
    spam_path = os.path.join(train_data_path, '0')  # label 0 for spam
    ham_path = os.path.join(train_data_path, '1')   # label 1 for ham
    spam_mails = utils.get_dir_data(spam_path)
    ham_mails = utils.get_dir_data(ham_path)
    spam_set, ham_set = process_train_data(spam_mails, ham_mails, stopwords=sw)

    # 5-fold cross validation with training data to report result metrics
    precision, recall, F1, ham_mails_accuracy, spam_mails_accuracy = \
        get_matrix(spam_set, ham_set, conf.NUM_FOLDS)
    print "Precision : %.4f" % precision
    print "Recall : %.4f" % recall
    print "F1 : %.4f" % F1
    print "Spam Mails Accuracy : %.2f" % spam_mails_accuracy
    print "Ham Mails Accuracy : %.2f" % ham_mails_accuracy

    # Model training on 100% of the train data
    train_set = spam_set + ham_set
    classifier = NaiveBayesClassifier.train(train_set)

    # Top 20 informative features
    print classifier.show_most_informative_features(20)

    # Classify the given test data
    test_data_path = os.path.abspath(os.path.join(conf.TEST_DIR))
    output_dir_path = os.path.abspath(os.path.join(conf.OUTPUT_DIR))
    if not os.path.exists(output_dir_path):
        os.makedirs(output_dir_path)
    output_file_path = os.path.join(output_dir_path, conf.OUTPUT_FILE)
    test_mails = utils.get_dir_data_with_filename(test_data_path)
    utils.write_file(output_file_path,
                     classify_data(classifier, test_mails, stopwords=sw))
train_size = train_data[0].shape[0]
test_size = test_data[0].shape[0]
num_feas = len(utils.FIELD_SIZES)

min_round = 1
num_round = 200
early_stop_round = 5
batch_size = 1024

field_sizes = utils.FIELD_SIZES
field_offsets = utils.FIELD_OFFSETS

algo = 'pnn1'

if algo in {'fnn', 'ccpm', 'pnn1', 'pnn2'}:
    train_data = utils.split_data(train_data)
    test_data = utils.split_data(test_data)
    tmp = []
    for x in field_sizes:
        if x > 0:
            tmp.append(x)
    field_sizes = tmp
    print('remove empty fields', field_sizes)

if algo == 'lr':
    lr_params = {
        'input_dim': input_dim,
        'opt_algo': 'gd',
        'learning_rate': 0.1,
        'l2_weight': 0,
        'random_seed': 0
def init_data(share):
    data_lines = open('wdbc.data').readlines()
    data = [x.split(',') for x in data_lines]
    # numpy.random.shuffle(data)
    return utils.split_data(data, share)
# parameters
batch_size = 32
num_epochs = 1000
training_split = .8
do_random_crop = False
num_classes = 2
dataset_name = 'imdb'
input_shape = (48, 48, 3)
images_path = '../datasets/imdb_crop/'
log_file_path = 'log_files/gender_training.log'
trained_models_path = '../trained_models/gender_models/simple_CNN'

# data loader
data_loader = DataLoader(dataset_name)
ground_truth_data = data_loader.get_data()
train_keys, val_keys = split_data(ground_truth_data, training_split)
image_generator = ImageGenerator(ground_truth_data, batch_size, input_shape[:2],
                                 train_keys, val_keys, None,
                                 path_prefix=images_path,
                                 vertical_flip_probability=0,
                                 do_random_crop=do_random_crop)

# model parameters/compilation
model = simple_CNN(input_shape, num_classes)
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

# model callbacks
    reg_term = sum(T.sum(p ** 2) for p in params)

    def loss(y, t):
        return loss_function(y, t) + lambda_reg * reg_term

    return nn.objectives.Objective(l_out, loss_function=loss)


train_labels = p.read_csv(os.path.join(base_dir, 'data/trainLabels.csv'))

labels_split = p.DataFrame(list(train_labels.image.str.split('_')),
                           columns=['id', 'eye'])
labels_split['level'] = train_labels.level
labels_split['id'] = labels_split['id'].astype('int')

id_train, y_train, id_valid, y_valid = split_data(train_labels, labels_split,
                                                  valid_size=10, SEED=SEED,
                                                  pairs=True)

# Change train dataset to oversample other labels.
# Total sizes:
# (        image
#  level
#  0       25810
#  1        2443
#  2        5292
#  3         873
#  4         708,
#             image
#  level
#  0      0.734783
#  1      0.069550
#  2      0.150658