def init_test_data():
    """Populate the test database with fixture data.

    Opens a connection via ``data_loader``, (re)initializes the schema,
    then loads the test rows into it.
    """
    db_conn = data_loader.get_connection()
    data_loader.init_database(db_conn)
    data_loader.load_test_data(db_conn)
def test(self):
    """Run inference over the test set and save the generated images.

    Restores the latest checkpoint from ``self._checkpoint_dir``, then
    writes ``self._num_imgs_to_save`` images via ``self.save_images``.
    Side effects: sets ``self.inputs`` and ``self._num_imgs_to_save``.
    """
    print("Testing the results")
    self.inputs = data_loader.load_test_data(
        self._dataset_name, False, self._do_flipping)
    self.model_test_setup()
    saver = tf.train.Saver()
    init = tf.global_variables_initializer()
    start = time.time()
    with tf.Session() as sess:
        sess.run(init)
        # Restore the most recent checkpoint (None if none exists —
        # NOTE(review): restore would then fail; confirm a checkpoint
        # is guaranteed to be present).
        chkpt_fname = tf.train.latest_checkpoint(self._checkpoint_dir)
        saver.restore(sess, chkpt_fname)
        # Queue runners feed the input pipeline; must be stopped/joined
        # before the session closes.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        self._num_imgs_to_save = cyclegan_datasets.DATASET_TO_SIZES[
            self._dataset_name]
        self.save_images(sess, 0)
        coord.request_stop()
        coord.join(threads)
    print("rate", (time.time()-start))
def _read(self, file_path: str) -> Iterator[Instance]:
    """Yield AllenNLP ``Instance``s for the requested split.

    :param file_path: split selector — 'train' or 'test' (not a real path)
    :return: generator of ``Instance`` objects
    """
    # Pick the training-data loader according to the augmentation mode.
    if self.augmentation is None:  # fix: identity comparison for None
        train_loader_fn = data_loader.load_train_data
    elif self.augmentation == 'EDA':
        train_loader_fn = data_loader.load_train_data_with_EDA
    elif self.augmentation == 'CDA':
        # NOTE(review): CDA loader is not implemented — train_loader_fn
        # stays unbound, so the 'train' branch below would raise NameError.
        pass
    else:
        print("Invalid augmentation:", self.augmentation)
    if file_path == 'train':
        X, Y = train_loader_fn(self.scenario)
        X = X.tolist()
        Y = Y.tolist()
        for text, label in zip(X, Y):
            yield self.text_to_instance(
                tokens=[Token(x) for x in self.tokenizer(text)],
                ID='__',  # placeholder ID for every training row
                label=label)
    elif file_path == 'test':
        X, Y, IDs = data_loader.load_test_data()
        X = X.tolist()
        Y = Y.tolist()
        for text, label, ID in zip(X, Y, IDs):
            yield self.text_to_instance(
                tokens=[Token(x) for x in self.tokenizer(text)],
                ID=ID,  # unique ID for every test row
                label=label)
    else:
        # BUG fix: the original referenced an undefined name `data` here,
        # which raised NameError instead of reporting the bad split.
        print("Invalid split parameter:", file_path)
def q1():
    """Question 1: run P300 target detection for every selected subject."""
    weights_file = f'{data_path}/{model_name}'
    model.load_state_dict(torch.load(weights_file))
    model.eval()
    detector = P300(model=model)
    for person in selected_persons:
        brain, event = load_test_data(sorted_path, person)
        detected = detector.get_target(brain, event)
        print(detected)
def load_data(train_data_path='./aclImdb/train/',
              test_data_path='./aclImdb/test/'):
    """Load IMDB text data, folding the validation split back into training.

    :param train_data_path: directory with the training reviews
    :param test_data_path: directory with the test reviews
    :return: (train_texts, train_labels, test_texts, test_labels)
    """
    print("Load Data...")
    train_texts, train_labels, valid_texts, valid_labels = load_train_data(
        train_data_path, 0.1)
    test_texts, test_labels = load_test_data(test_data_path)
    # Merge the held-out validation split back into the training set.
    train_texts = np.append(train_texts, valid_texts)
    train_labels = np.append(train_labels, valid_labels)
    print("Done loading data!\n")
    return train_texts, train_labels, test_texts, test_labels
def test(fake_user):
    """Print NDCG/precision/MAP at k=10 and k=20 for generated user embeddings."""
    user_emb, ui_matrix = data_loader.load_user_info()
    test_item, test_attribute = data_loader.load_test_data()
    # similar_uid = get_similar_user(fake_user, user_emb, 20)
    scores = {}
    for k in (10, 20):
        scores['ndcg', k] = ndcg_at_k(test_item, fake_user, user_emb, ui_matrix, k)
        scores['p', k] = p_at_k(test_item, fake_user, user_emb, ui_matrix, k)
        scores['map', k] = map_at_k(test_item, fake_user, user_emb, ui_matrix, k)
    print(
        'test:ndcg@10:{:.3f}, ndcg@20:{:.3f}, p@10:{:.3f}, p@20:{:.3f}, map@10:{:.3f}, map@20:{:.3f}'
        .format(scores['ndcg', 10], scores['ndcg', 20], scores['p', 10],
                scores['p', 20], scores['map', 10], scores['map', 20]))
def test(model, tar = True):
    """Return classification accuracy (percent) of ``model`` on the test loader.

    :param model: network returning (class_output, domain_output)
    :param tar: passed through to the data loader to pick the target domain
    """
    alpha = 0
    loader = data_loader.load_test_data(tar=tar, person=args.person)
    model.eval()
    correct = 0
    with torch.no_grad():
        for samples, labels in loader:
            samples = samples.to(DEVICE)
            labels = labels.to(DEVICE).squeeze()
            logits, _ = model(input_data=samples, alpha=alpha)
            _, predicted = torch.max(logits.data, 1)
            correct += (predicted == labels.long()).sum().item()
    return float(correct) / len(loader.dataset) * 100
def test(model, dataset_name, epoch):
    """Return classification accuracy (percent) of ``model`` on ``dataset_name``.

    ``epoch`` is accepted for interface compatibility but not used here.
    """
    alpha = 0
    loader = data_loader.load_test_data(dataset_name)
    model.eval()
    correct = 0
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)
            logits, _ = model(input_data=images, alpha=alpha)
            _, predicted = torch.max(logits.data, 1)
            correct += (predicted == labels.long()).sum().item()
    return float(correct) / len(loader.dataset) * 100
def train_and_predict():
    """Train the U-Net on the training set, then predict and save test masks.

    Side effects: writes 'weights.h5', 'imgs_mask_test.npy' and one PNG per
    predicted mask under 'preds/'.
    """
    DataCreator()
    print('-' * 30)
    print('Loading and preprocessing train data...')
    print('-' * 30)
    imgs_train, imgs_mask_train = DataLoader()
    imgs_train = preprocess(imgs_train)
    imgs_mask_train = preprocess(imgs_mask_train)
    imgs_train = imgs_train.astype('float32')
    mean = np.mean(imgs_train)  # mean for data centering
    std = np.std(imgs_train)  # std for data normalization
    #imgs_train -= mean
    #imgs_train /= std
    imgs_mask_train = imgs_mask_train.astype('float32')
    imgs_mask_train /= 255.  # scale masks to [0, 1]
    print('-' * 30)
    print('Creating and compiling model...')
    print('-' * 30)
    model = get_unet()
    model_checkpoint = ModelCheckpoint('weights.h5',
                                       monitor='val_loss',
                                       save_best_only=True)
    print('-' * 30)
    print('Fitting model...')
    print('-' * 30)
    model.fit(imgs_train,
              imgs_mask_train,
              batch_size=32,
              nb_epoch=numberOfEpochs,
              verbose=1,
              shuffle=True,
              validation_split=0.2,
              callbacks=[model_checkpoint])
    # TESTS DATA --------------
    print('-' * 30)
    print('Loading and preprocessing test data...')
    print('-' * 30)
    imgs_test, imgs_id_test = load_test_data()
    imgs_test = preprocess(imgs_test)
    imgs_test = imgs_test.astype('float32')
    # imgs_test -= mean
    # NOTE(review): test images are divided by std while mean-centering is
    # commented out (and training images are not normalized at all) —
    # confirm this asymmetry is intentional.
    imgs_test /= std
    print('-' * 30)
    print('Loading saved weights...')
    print('-' * 30)
    model.load_weights('weights.h5')
    print('-' * 30)
    print('Predicting masks on test data...')
    print('-' * 30)
    imgs_mask_test = model.predict(imgs_test, verbose=1)
    np.save('imgs_mask_test.npy', imgs_mask_test)
    print('-' * 30)
    print('Saving predicted masks to files...')
    print('-' * 30)
    pred_dir = 'preds'
    if not os.path.exists(pred_dir):
        os.mkdir(pred_dir)
    #for image, image_id in zip(imgs_mask_test, imgs_id_test):
    # Idiom fix: enumerate replaces the manually incremented counter.
    for i, image in enumerate(imgs_mask_test):
        image = (image[:, :, 0] * 255.).astype(np.uint8)
        imsave(os.path.join(pred_dir, str(i) + '_pred.png'), image)
def main():
    """Run TF-IDF vectorization plus a grid-searched SVM or MLP classifier.

    Usage: ``script.py {svm|nn}``. Side effects: writes a submission CSV via
    ``data_loader.save_submission`` and pickles the CV results and best
    estimator under 'pickles/'.
    """
    if len(sys.argv) != 2 or sys.argv[1] not in ['svm', 'nn']:
        print("Invalid command. Expected 'svm' or 'nn'.")
        return
    c_name = sys.argv[1]
    print('Running job: TF-IDF vectorization and ' + c_name.upper() +
          ' classifier.')
    # Shuffle the training rows deterministically (fixed random_state).
    train_data = data_loader.load_train_data().sample(
        frac=1, random_state=42).reset_index(drop=True)
    test_data = data_loader.load_test_data()
    if c_name == 'svm':
        classifier = LinearSVC(random_state=42)
        param_grid = {
            'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
            'classifier__C': [0.1, 1]
        }
    else:
        classifier = MLPClassifier((50, ),
                                   solver='lbfgs',
                                   learning_rate_init=1e-4,
                                   tol=1e-6,
                                   max_iter=200,
                                   random_state=42)
        param_grid = {'vectorizer__ngram_range': [(1, 1)]}
    pipe = Pipeline([('vectorizer', TfidfVectorizer()),
                     ('classifier', classifier)])
    cv_grid = GridSearchCV(pipe,
                           n_jobs=2,
                           cv=5,
                           verbose=3,
                           param_grid=param_grid)
    start_time = time.time()
    cv_grid.fit(train_data.text, train_data.sentiment)
    end_time = time.time()
    print('Total fit time: {}'.format(end_time - start_time))
    # Classification report (computed on the training data itself).
    pred = cv_grid.predict(train_data.text)
    cr = classification_report(train_data.sentiment, pred)
    print(cr)
    # Test predictions
    pred = cv_grid.predict(test_data.text)
    print('Predictions finished.')
    # Save predictions
    results = pd.DataFrame({'Id': test_data.index, 'Prediction': pred})
    results = results.set_index('Id')
    data_loader.save_submission(results,
                                'tfidf_' + c_name.upper() + '_submission.csv')
    print('Predictions saved.')
    # Save classification results
    cvr_path = path.join(
        'pickles',
        'tfidf_' + c_name.upper() + '_cross_validation_results')  # Cross validation results
    be_path = path.join('pickles', 'tfidf_' + c_name.upper() +
                        '_best_estimator')  # Best estimator
    # Resource fix: the original passed open(...) straight into dump(),
    # leaking the file handles; close them deterministically instead.
    with open(cvr_path, 'wb') as cvr_file:
        dump(cv_grid.cv_results_, cvr_file)
    with open(be_path, 'wb') as be_file:
        dump(cv_grid.best_estimator_, be_file)
    print('Classification results saved.')
# Load training data according to the chosen augmentation strategy.
if args.eda:  # with EDA
    X_train, Y_train = data_loader.load_train_data_with_EDA(
        scenario=args.scenario)
elif args.cda:  # with CDA
    # NOTE(review): CDA is not implemented — X_train/Y_train stay undefined
    # and the .tolist() calls below would raise NameError. Confirm intended.
    pass
else:  # without any Data Augmentation
    X_train, Y_train = data_loader.load_train_data(args.scenario)
X_train = X_train.tolist()
Y_train = Y_train.tolist()  # convert to list
labels_train = labels_str_to_int(Y_train)  # convert labels to integers

# Test data:
X_test, Y_test, test_IDs = data_loader.load_test_data()
X_test = X_test.tolist()
Y_test = Y_test.tolist()
test_IDs = test_IDs.tolist()  # convert to list
labels_test = labels_str_to_int(Y_test)  # convert labels to integers
testIDs_idx = np.linspace(
    0, len(test_IDs), len(test_IDs), False
)  # can't create a tensor of strings, so create a corresponding list of indexes; we use that to index into test_IDs
print("testIDs indexes:", len(testIDs_idx))

# Dispatch to the selected model.
if args.model == 'bert':
    run_bert()
elif args.model == 'xlnet':
    run_xlnet()
def load_test_data_from_file(test_file_path):
    """Load the test data and drop rows containing missing values.

    The raw data marks missing fields with the literal string ' ?';
    these are converted to NaN and the affected rows removed.
    """
    frame = load_test_data(test_file_path)
    frame = frame.replace(' ?', np.nan)
    return frame.dropna()
elif args.eda and args.cda:  # both EDA and CDA
    train_passages, Y_train = data_loader.load_train_data_with_EDA_and_CDA(
        scenario=args.scenario)
else:  # without any Data Augmentation
    train_passages, Y_train = data_loader.load_train_data(
        scenario=args.scenario, N_WORDS=DOCUMENT_LENGTH, exp=experiment)
print("\nTrain Set ---- X: {} | Y: {} | Distribution: {}".format(
    len(train_passages), len(Y_train), Counter(Y_train)))
print("Y train preview:", Y_train[:3])

# Load test data (same for each scenario, with or without augmentation):
test_passages, Y_test, test_IDs = data_loader.load_test_data(
    N_WORDS=DOCUMENT_LENGTH)
print(
    "Test Set ---- X: {} | Y: {} | Distribution: {} | Test IDs: {}, preview: {}"
    .format(len(test_passages), len(Y_test), Counter(Y_test), len(test_IDs),
            test_IDs[:3]))
print("Y test preview:", Y_test[:3])

# Sanity check: expected corpus sizes per scenario (401 train passages for
# scenario 'A', 400 otherwise; always 198 test passages).
if args.scenario == 'A':
    assert len(train_passages) == len(Y_train) == 401
else:
    assert len(train_passages) == len(Y_train) == 400
assert len(test_passages) == len(Y_test) == 198

prediction_probs = predict(algo)
# ------------------------------------------------------------------------------ # top settings # ------------------------------------------------------------------------------ n_training_data = 10000 n_test_data = 1000 n_epoch = 10 mini_batch_size = 100 learn_rate = 0.005 # ------------------------------------------------------------------------------ # step 1: generate data # ------------------------------------------------------------------------------ training_data = data_loader.load_training_data(n_training_data) test_dataset,test_label = data_loader.load_test_data (n_test_data ) # ------------------------------------------------------------------------------ # step 2: setup the model # ------------------------------------------------------------------------------ label_x = tf.placeholder(tf.float32, [None, 784]) label_y = tf.placeholder(tf.float32, [None, 10]) y_pred=add_layer('output_layer', label_x, 784, 10, tf.nn.softmax) ''' W = tf.Variable(tf.random_normal([784, 10])); # [1, 784] x [784, 10] = [1, 10] b = tf.Variable(tf.zeros([10])) z_pred = tf.matmul(label_x, W) + b y_pred = tf.nn.softmax(z_pred) ''' # ------------------------------------------------------------------------------
imgs_train, imgs_mask_train = DataLoader()
imgs_train = preprocess(imgs_train)
imgs_mask_train = preprocess(imgs_mask_train)
#imgs_train = imgs_train.astype('float32')
# NOTE(review): mean/std are computed on the training images before any
# float32 cast (the cast above is commented out) — confirm this is intended.
mean = np.mean(imgs_train)  # mean for data centering
std = np.std(imgs_train)  # std for data normalization

model = get_unet()
model_checkpoint = ModelCheckpoint('weights.h5',
                                   monitor='loss',
                                   save_best_only=True)

print('-' * 30)
print('Loading and preprocessing test data...')
print('-' * 30)
imgs_test, imgs_id_test = load_test_data()
imgs_test = preprocess(imgs_test)
imgs_test = imgs_test.astype('float32')
# Normalize test images with the training-set statistics.
imgs_test -= mean
imgs_test /= std

print('-' * 30)
print('Loading saved weights...')
print('-' * 30)
model.load_weights('weights.h5')

print('-' * 30)
print('Predicting masks on test data...')
print('-' * 30)
imgs_mask_test = model.predict(imgs_test, verbose=1)
def main():
    """Train, reconstruct, or representation-learn with the encoder-decoder
    model, depending on the TRAIN_MODE / TEST_MODE / LEARN_MODE flags."""
    model = enc_dec(params_dict, DEVICE)
    model = model.to(DEVICE)
    ###################### Debugging #############################
    # optimizer = optim.Adam(model.parameters(), lr=LR)
    # optimizer.zero_grad()
    # data = torch.randn(BATCH_SIZE, TIME_STEPS, INPUT_DIM).to(DEVICE)
    # wt_batch = torch.randn(BATCH_SIZE, TIME_STEPS, INPUT_DIM).to(DEVICE)
    # enc_reps, dec_reps = model(data, False)
    # print("Encoded: ")
    # for rep in enc_reps:
    #     print(rep.shape, torch.max(rep).item(), torch.min(rep).item())
    # print("Decoded: ")
    # for i in range(len(dec_reps)):
    #     print(i)
    #     for rep in dec_reps[i]:
    #         print(rep.shape, torch.max(rep).item(), torch.min(rep).item())
    # loss_dict = all_losses(enc_reps, dec_reps[-1], True, True)
    # for key in loss_dict:
    #     print(key, loss_dict[key].item())
    # loss = loss_dict[FINAL_CRTR]
    # loss.backward()
    # optimizer.step()
    # return
    # assert(False)
    ###################### Debugging #############################
    if (TRAIN_MODE):
        start = time.time()
        ################## Prepare Data ######################
        print('Prepare data...')
        test_set = load_test_data(INPUT_DIM, TIME_STEPS)
        train_loader = get_data_loader('train_length.csv', INPUT_DIM,
                                       TIME_STEPS, BATCH_SIZE, True)
        valid_loader = get_data_loader('valid_length.csv', INPUT_DIM,
                                       TIME_STEPS, BATCH_SIZE, False)
        time_elapsed = time.time() - start
        print("Getting data takes %fs." % time_elapsed)
        print("#####################################")
        print("Training...")
        # Resume from the last saved state (epoch, curriculum level,
        # model and optimizer weights) if requested and available.
        if (RESUME and os.path.exists(LAST_WEIGTHS_PATH)):
            states = torch.load(LAST_WEIGTHS_PATH)
            start_epoch = states['epoch']
            start_level = states['level']
            model.load_state_dict(states['state_dict'])
            optimizer = init_optimizer(model, start_level)
            optimizer.load_state_dict(states['optimizer'])
            print('Resume from level', start_level, end=' ')
            print('epoch %d' % start_epoch)
        else:
            print("Start fresh run.")
            start_level = None
            start_epoch = 0
            optimizer = init_optimizer(model, start_level)
            # Optionally warm-start a fresh run from pre-trained weights.
            if (INIT_WEIGHTS != None):
                print('Initial weights: ', INIT_WEIGHTS)
                states = torch.load(INIT_WEIGHTS)
                model.load_state_dict(states['state_dict'])
                optimizer.load_state_dict(states['optimizer'])
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
        min_criterion_vals, best_epoch, best_epochs = train(
            model, optimizer, scheduler, train_loader, valid_loader, test_set,
            start_level, start_epoch)
        # Save results in a csv file
        if (STORE_RESULTS):
            write_results(CSVFILEPATH, min_criterion_vals, best_epoch,
                          best_epochs)
    else:
        # Non-training modes require pre-trained weights on disk.
        if (WEIGHTS_PATH != None and os.path.exists(WEIGHTS_PATH)):
            print(
                'Reconstruct/Represenation Learning by using model weights at:'
            )
            print(WEIGHTS_PATH)
            states = torch.load(WEIGHTS_PATH)
            model.load_state_dict(states['state_dict'])
        else:
            print('Caution: Weights path does not exist', WEIGHTS_PATH)
            return
        if (TEST_MODE):
            print("Reconstructing...")
            test_set = load_test_data(INPUT_DIM, TIME_STEPS)
            test(model, test_set, 'pred.wav')
        if (LEARN_MODE):
            repr_store_path = CONFIG_DIR + 'repr/'
            rep_learning(model, repr_store_path)
            for fold in [train_fold, valid_fold, test_fold]:
                store_rep_for_mfn(fold)
def _test_svm():
    """Smoke test: fit a binary SVC and print its support vectors."""
    X = [[0, 0], [1, 1]]
    y = [0, 1]
    clf = svm.SVC(gamma="scale").fit(X, y)
    print(clf.support_vectors_)


def _test_one_against_one():
    """Smoke test: multi-class SVC with a one-vs-rest decision function."""
    X = [[0, 0], [1, 1], [2, 2], [3, 3]]
    y = [1, 2, 3, 4]
    clf = svm.SVC(gamma="scale", decision_function_shape="ovr").fit(X, y)
    print(clf.support_vectors_)
    print(clf.predict([[-0.1, -0.1], [1.1, 1.1], [2.1, 2.1], [4, 4]]))


def training():
    """Fit a one-vs-one SVC on the training set and return the model."""
    dataset = data_loader.load_training_data()
    return svm.SVC(gamma="scale",
                   decision_function_shape="ovo").fit(dataset[0], dataset[1])


if __name__ == "__main__":
    # _test_svm()
    # _test_one_against_one()
    model = training()
    test_data = data_loader.load_test_data()
    predicts = model.predict(test_data[0])
    # Accuracy: fraction of predictions matching the ground-truth labels.
    matches = [pair for pair in zip(test_data[1], predicts) if pair[0] == pair[1]]
    print(len(matches) / len(test_data[1]))
# Data Science Bowl directory layout: train data includes the original
# stage-1 training set, extra data, and the released stage-1 test labels.
TRAIN_DIRS = [os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'data',
                           'stage1_train'),
              os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'extra_data'),
              os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'stage1_test',
                           'DSB2018_stage1_test-master', 'stage1_test')]
TEST_DIR = os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'stage2_test')
IMG_DIR_NAME = 'images'
MASK_DIR_NAME = 'masks'

train_df = read_train_data_properties(TRAIN_DIRS, IMG_DIR_NAME, MASK_DIR_NAME)
test_df = read_test_data_properties(TEST_DIR, IMG_DIR_NAME)
x_train, y_train, contour_train, no_contour_train = load_train_data(train_df)
# Stack mask, contour and contour-free mask into one multi-channel target.
y_train_full = np.array([
    np.concatenate((x, y, z), axis=2)
    for x, y, z in zip(y_train, contour_train, no_contour_train)
])
labels_train = get_train_labels(train_df)
x_test = load_test_data(test_df)
model_paths = train(train_df, y_train_full, labels_train)
y_prediction = inference(x_test, model_paths)
# Run-length encode the predictions for the Kaggle submission format.
y_test_rle, y_test_ids = get_rle_encoding(test_df, y_prediction)
sub = pd.DataFrame()
sub['ImageId'] = y_test_ids
sub['EncodedPixels'] = pd.Series(y_test_rle).apply(
    lambda x: ' '.join(str(y) for y in x))
sub.to_csv('sub-dsbowl2018.csv', index=False)
sub.head()
def test_from_beams(data_testset,
                    beams_dir,
                    predict_only=True,
                    sample_best=False):
    """Rerank pre-computed beam files and write/evaluate final predictions.

    :param data_testset: path/name of the test set (also selects the reader)
    :param beams_dir: folder containing tab-separated beam/score .txt files
    :param predict_only: when False, also reduce predictions per MR group and
        run the BLEU and E2E metrics scripts
    :param sample_best: when True, keep only least-error beams and sample one
        prediction at random instead of taking the top beam
    """
    test_source_file = os.path.join(config.DATA_DIR, 'test_source_dict.json')
    test_target_file = os.path.join(config.DATA_DIR, 'test_target.txt')
    predictions_final_file = os.path.join(config.PREDICTIONS_DIR,
                                          'predictions_final.txt')
    predictions_reduced_file = os.path.join(config.METRICS_DIR,
                                            'predictions_reduced.txt')
    test_reference_file = os.path.join(config.METRICS_DIR,
                                       'test_references.txt')
    print('Loading test data...', end=' ')
    sys.stdout.flush()
    # Load and preprocess the test data
    data_loader.load_test_data(data_testset)
    print('DONE')
    print('Extracting beams...')
    sys.stdout.flush()
    # Read all beam files in the given beams folder
    beam_files = glob.glob(os.path.join(beams_dir, '*.txt'))
    print('-> Beam files found:')
    print('\n'.join(beam_files))
    # Combine all beam files into a single DataFrame
    df_beams = pd.concat(
        (pd.read_csv(f, sep='\t', header=None, encoding='utf8')
         for f in beam_files),
        axis=1,
        ignore_index=True)
    assert len(df_beams.columns) > 1
    # Combine beams and their corresponding scores into tuples
    beams = []
    for i in range(0, len(df_beams.columns), 2):
        beams.append(list(zip(df_beams.iloc[:, i], df_beams.iloc[:, i + 1])))
    # Transpose the list of beams so as to have all beams of a single sample per line
    beams = list(map(list, zip(*beams)))
    print('DONE')
    print('Reranking...')
    sys.stdout.flush()
    # Score the slot alignment in the beams, and rerank the beams accordingly
    if sample_best:
        beams = postprocessing.rerank_beams(beams,
                                            keep_n=10,
                                            keep_least_errors_only=True)
    else:
        beams = postprocessing.rerank_beams(beams, keep_n=10)
    print('DONE')
    print('Evaluating...')
    sys.stdout.flush()
    with io.open(test_source_file, 'r', encoding='utf8') as f_test_source, \
            io.open(predictions_final_file, 'w', encoding='utf8') as f_predictions_final:
        mrs = json.load(f_test_source, object_pairs_hook=OrderedDict)
        if sample_best:
            # Sample one of the kept beams at random per input.
            predictions = [
                random.choice(prediction_beams)[0]
                for prediction_beams in beams
            ]
        else:
            # Take the top-ranked beam per input.
            predictions = [
                prediction_beams[0][0] for prediction_beams in beams
            ]
        # Post-process the generated utterances
        predictions_final = postprocessing.finalize_utterances(
            predictions, mrs)
        for prediction in predictions_final:
            f_predictions_final.write(prediction + '\n')
        if not predict_only:
            # Create a file with a single prediction for each group of the same MRs
            if 'rest_e2e' in data_testset:
                test_mrs, _ = data_loader.read_rest_e2e_dataset_test(
                    data_testset)
            elif 'tv' in data_testset:
                test_mrs, _, _ = data_loader.read_tv_dataset_test(data_testset)
            elif 'laptop' in data_testset:
                test_mrs, _, _ = data_loader.read_laptop_dataset_test(
                    data_testset)
            elif 'hotel' in data_testset:
                test_mrs, _, _ = data_loader.read_hotel_dataset_test(
                    data_testset)
            elif 'video_game' in data_testset:
                test_mrs, _ = data_loader.read_video_game_dataset_test(
                    data_testset)
            else:
                raise FileNotFoundError
            with io.open(predictions_reduced_file, 'w',
                         encoding='utf8') as f_predictions_reduced:
                for i in range(len(test_mrs)):
                    if i == 0 or test_mrs[i] != test_mrs[i - 1]:
                        f_predictions_reduced.write(predictions_final[i] + '\n')
    if not predict_only:
        # Depending on the OS, the tensor2tensor BLEU script might require a different way of executing
        if sys.executable is not None:
            bleu_script = 'python ' + os.path.join(
                os.path.dirname(sys.executable), 't2t-bleu')
        else:
            bleu_script = 't2t-bleu'
        metrics_script = 'python ' + os.path.join(config.METRICS_DIR,
                                                  'measure_scores.py')
        # Run the tensor2tensor internal BLEU script
        os.system(bleu_script + ' --translation=' + predictions_final_file +
                  ' --reference=' + test_target_file)
        # Run the metrics script provided by the E2E NLG Challenge
        os.system(metrics_script + ' ' + test_reference_file + ' ' +
                  predictions_reduced_file)
    print('DONE')
import numpy as np
from nltk.corpus import stopwords
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.stem.wordnet import WordNetLemmatizer
import data_loader
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')
# %matplotlib inline
plt.set_cmap('RdYlBu')

import pre_processing

# Load the comment splits; test labels come from a separate file.
train, valid = data_loader.load_train_data('data/train.csv')
test = data_loader.load_test_data('data/test.csv', 'data/test_labels.csv')

# The six target label columns predicted for each comment.
list_classes = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
train_y = train[list_classes].values
valid_y = valid[list_classes].values
test_y = test[list_classes].values

# Replace missing text fields with empty strings.
train = train.fillna('')
valid = valid.fillna('')
test = test.fillna('')

"""## Data Exploration"""

print(train.shape)
def train(g, d, train_loader, neg_loader, epoches, g_optim, d_optim, neg_lens):
    """Adversarial training loop for generator ``g`` and discriminator ``d``.

    Each epoch first trains the discriminator on (real, generated, negative)
    triples, then trains the generator to fool the discriminator. After every
    epoch the generator is evaluated on the held-out test attributes.

    :param g: generator network (attributes -> user embedding)
    :param d: discriminator network ((attributes, user embedding) -> prob)
    :param train_loader: loader yielding (uid, mid, attr, user_emb) positives
    :param neg_loader: loader yielding negative samples in the same format
    :param epoches: number of training epochs
    :param g_optim: optimizer for the generator
    :param d_optim: optimizer for the discriminator
    :param neg_lens: cap on how many negative samples to consume per epoch
    """
    g = g.to(device)
    d = d.to(device)
    time.sleep(0.1)  # keep tqdm output from interleaving with the prints
    print("start training on {}".format(device))
    time.sleep(0.1)
    bce_loss = torch.nn.BCELoss()
    for e in tqdm(range(epoches)):
        start_time = time.time()
        idx = 0
        d_loss = 0.0
        neg_iter = iter(neg_loader)
        # --- Train the discriminator d ---
        for _, _, real_attr, real_user_emb in train_loader:
            if idx > neg_lens:
                break
            # BUG fix: iterator objects have no .next() method in Python 3;
            # use the builtin next() instead.
            _, _, neg_attr, neg_user_emb = next(neg_iter)
            # Attributes and user embeddings of positive samples
            real_attr = real_attr.to(device)
            real_user_emb = real_user_emb.to(device)
            # Attributes and user embeddings of negative samples
            neg_attr = neg_attr.to(device)
            neg_user_emb = neg_user_emb.to(device)
            # Generator produces a fake user embedding
            fake_user_emb = g(real_attr)
            fake_user_emb = fake_user_emb.to(device)
            # Discriminator scores all three pairings
            d_real, d_logit_real = d(real_attr, real_user_emb)
            d_fake, d_logit_fake = d(real_attr, fake_user_emb)
            d_neg, d_logit_neg = d(neg_attr, neg_user_emb)
            # Discriminator loss: real -> 1, fake and negative -> 0
            d_optim.zero_grad()
            d_loss_real = bce_loss(d_real, torch.ones_like(d_real))
            d_loss_fake = bce_loss(d_fake, torch.zeros_like(d_fake))
            d_loss_neg = bce_loss(d_neg, torch.zeros_like(d_neg))
            d_loss = torch.mean(d_loss_real + d_loss_fake + d_loss_neg)
            d_loss.backward()
            d_optim.step()
            idx += batch_size
        # --- Train the generator g ---
        g_loss = 0.0
        for uid, mid, attr, user_emb in train_loader:
            g_optim.zero_grad()
            attr = attr.to(device)
            # Generate a fake user embedding
            fake_user_emb = g(attr)
            fake_user_emb = fake_user_emb.to(device)
            # Generator loss: push the discriminator to output 1 for fakes
            d_fake, d_logit_fake = d(attr, fake_user_emb)
            g_loss = bce_loss(d_fake, torch.ones_like(d_fake))
            g_loss.backward()
            g_optim.step()
        end_time = time.time()
        print("\nepoch:{}: time:{:.2f}, d_loss:{:.3f}, g_loss:{:.3f}".format(
            e + 1, end_time - start_time, d_loss, g_loss))
        # Evaluate the generator on the held-out test items after each epoch
        test_item, test_attribute = data_loader.load_test_data()
        test_item = torch.tensor(test_item).to(device)
        test_attribute = torch.tensor(test_attribute,
                                      dtype=torch.long).to(device)
        fake_user = g(test_attribute)
        eval.test(fake_user.cpu().detach().numpy())
        time.sleep(0.1)
            leven_cost += max(i2 - i1, j2 - j1)
        elif tag == 'insert':
            leven_cost += (j2 - j1)
        elif tag == 'delete':
            leven_cost += (i2 - i1)
    return leven_cost


def defaultdict_from_dict(dic):
    """Wrap a plain dict in a ``defaultdict(int)`` so missing keys map to 0."""
    dd = defaultdict(int)
    dd.update(dic)
    return dd


# Load data (test pinyin and Chinese-character sequences)
test_pny_list, test_han_list = load_test_data()

# 1. Acoustic model -----------------------------------

# 2. Language model -------------------------------------------
# Load the pinyin vocabulary and build word<->id mappings.
with open('vocab/pny_vocab.json', "r", encoding='utf-8') as f:
    pny_dict_w2id = json.load(f)
pny_dict_w2id = defaultdict_from_dict(pny_dict_w2id)
pny_dict_id2w = {v: k for k, v in pny_dict_w2id.items()}
# Load the Chinese-character vocabulary and build word<->id mappings.
with open('vocab/han_vocab.json', "r", encoding='utf-8') as f:
    han_dict_w2id = json.load(f)
han_dict_w2id = defaultdict_from_dict(han_dict_w2id)
han_dict_id2w = {v: k for k, v in han_dict_w2id.items()}
def main(nb_epoch=1,
         data_augmentation=True,
         noise=True,
         maxout=True,
         dropout=True,
         l1_reg=False,
         l2_reg=True):
    """Train and evaluate a CIFAR-10 CNN with configurable regularization.

    :param nb_epoch: number of training epochs
    :param data_augmentation: use Keras ImageDataGenerator augmentation
    :param noise: prepend a GaussianNoise input layer
    :param maxout: use a MaxoutDense layer instead of Dense(512)
    :param dropout: insert Dropout layers
    :param l1_reg: apply L1 weight regularization (mutually exclusive with l2)
    :param l2_reg: apply L2 weight regularization
    Side effects: writes a score .txt file and a loss-curve .png.
    """
    # l1 and l2 regularization shouldn't be true in the same time
    if l1_reg and l2_reg:
        print("No need to run l1 and l2 regularization in the same time")
        quit()
    # print settings for this experiment
    print("number of epoch: {0}".format(nb_epoch))
    print("data augmentation: {0}".format(data_augmentation))
    print("noise: {0}".format(noise))
    print("maxout: {0}".format(maxout))
    print("dropout: {0}".format(dropout))
    print("l1: {0}".format(l1_reg))
    print("l2: {0}".format(l2_reg))
    # the data, shuffled and split between train and test sets
    (X_train, y_train), (X_test, y_test) = cifar10.load_data()
    # split the validation dataset
    X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                          y_train,
                                                          test_size=0.2,
                                                          random_state=0)
    print('X_train shape:', X_train.shape)
    print(X_train.shape[0], 'train samples')
    print(X_valid.shape[0], 'valid samples')
    print(X_test.shape[0], 'test samples')
    # convert class vectors to binary class matrices
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_valid = np_utils.to_categorical(y_valid, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    X_train = X_train.astype('float32')
    X_valid = X_valid.astype('float32')
    X_test = X_test.astype('float32')
    X_train /= 255
    X_valid /= 255
    X_test /= 255
    ##### try loading data using data_loader.py ####
    # NOTE(review): this re-loads the data and overwrites the keras.cifar10
    # arrays prepared above — confirm which loading path is intended.
    data_loader.download_and_extract(data_path, data_url)
    class_names = data_loader.load_class_names()
    print(class_names)
    images_train, cls_train, labels_train = data_loader.load_training_data()
    images_test, cls_test, labels_test = data_loader.load_test_data()
    X_train, Y_train = images_train, labels_train
    X_test, Y_test = images_test, labels_test
    X_train, X_valid, Y_train, Y_valid = train_test_split(X_train,
                                                          Y_train,
                                                          test_size=0.2,
                                                          random_state=0)
    print("Size of:")
    print("- Training-set:\t\t{}".format(len(X_train)))
    print("- Validation-set:\t\t{}".format(len(X_valid)))
    print("- Test-set:\t\t{}".format(len(X_test)))
    model = Sequential()
    if noise:
        model.add(
            GaussianNoise(sigma,
                          input_shape=(img_channels, img_rows, img_cols)))
    model.add(
        Convolution2D(32,
                      3,
                      3,
                      border_mode='same',
                      input_shape=(img_channels, img_rows, img_cols)))
    model.add(Activation('relu'))
    model.add(Convolution2D(32, 3, 3))
    model.add(Activation('relu'))
    # model.add(MaxPooling2D(pool_size=(2, 2)))
    if dropout:
        model.add(Dropout(0.25))
    model.add(Convolution2D(64, 3, 3, border_mode='same'))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, 3, 3))
    model.add(Activation('relu'))
    # model.add(MaxPooling2D(pool_size=(2, 2)))
    if dropout:
        model.add(Dropout(0.25))
    model.add(Flatten())
    if maxout:
        model.add(MaxoutDense(512, nb_feature=4, init='glorot_uniform'))
    else:
        # NOTE(review): when neither l1 nor l2 is set a plain Dense(512) is
        # added; otherwise a regularized Dense(512) — but if l1/l2 were both
        # combined with the plain branch two Dense layers could never occur
        # since the branches are exclusive here.
        if not (l1_reg or l2_reg):
            model.add(Dense(512))
        # activation regularization not implemented yet
        if l1_reg:
            model.add(Dense(512, W_regularizer=l1(l1_weight)))
        elif l2_reg:
            model.add(Dense(512, W_regularizer=l2(l2_weight)))
    model.add(Activation('relu'))
    if dropout:
        model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    # let's train the model using SGD + momentum (how original).
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])
    start_time = time.time()
    if not data_augmentation:
        his = model.fit(X_train,
                        Y_train,
                        batch_size=batch_size,
                        nb_epoch=nb_epoch,
                        validation_data=(X_valid, Y_valid),
                        shuffle=True)
    else:
        # this will do preprocessing and realtime data augmentation
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            featurewise_std_normalization=
            False,  # divide inputs by std of the dataset
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=True,  # apply ZCA whitening
            rotation_range=
            0,  # randomly rotate images in the range (degrees, 0 to 180)
            width_shift_range=
            0.1,  # randomly shift images horizontally (fraction of total width)
            height_shift_range=
            0.1,  # randomly shift images vertically (fraction of total height)
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images
        # compute quantities required for featurewise normalization
        # (std, mean, and principal components if ZCA whitening is applied)
        datagen.fit(X_train)
        # fit the model on the batches generated by datagen.flow()
        his = model.fit_generator(datagen.flow(X_train,
                                               Y_train,
                                               batch_size=batch_size),
                                  samples_per_epoch=X_train.shape[0],
                                  nb_epoch=nb_epoch,
                                  validation_data=(X_valid, Y_valid))
    # evaluate our model
    score = model.evaluate(X_test, Y_test, verbose=0)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
    print('training time', time.time() - start_time)
    # write test accuracy to a file
    output_file_name = './output_l1l2/train_val_loss_with_dropout_epochs_{0}_data_augmentation_{1}_noise_{2}_maxout_{3}_dropout_{4}_l1_{5}_l2_{6}_sigma_{7}_l1weight_{8}_l2weight_{9}.txt'.format(
        nb_epoch, data_augmentation, noise, maxout, dropout, l1_reg, l2_reg,
        sigma, l1_weight, l2_weight)
    print(output_file_name)
    with open(output_file_name, "w") as text_file:
        text_file.write('Test score: {}'.format(score[0]))
        text_file.write('\n')
        text_file.write('Test accuracy: {}'.format(score[1]))
        # redundant: the with-statement already closes the file
        text_file.close()
    # visualize training history
    train_loss = his.history['loss']
    val_loss = his.history['val_loss']
    plt.plot(range(1,
                   len(train_loss) + 1),
             train_loss,
             color='blue',
             label='train loss')
    plt.plot(range(1,
                   len(val_loss) + 1),
             val_loss,
             color='red',
             label='val loss')
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.xlabel('#epoch')
    plt.ylabel('loss')
    # @TODO what's the deal around here ~"~"?
    output_fig_name = './output_no_maxout/train_val_loss_with_dropout_epochs_{0}_data_augmentation_{1}_noise_{2}_maxout_{3}_dropout_{4}_l1_{5}_l2_{6}_sigma_{7}_l1weight_{8}_l2weight_{9}.png'.format(
        nb_epoch, data_augmentation, noise, maxout, dropout, l1_reg, l2_reg,
        sigma, l1_weight, l2_weight)
    plt.savefig(output_fig_name, dpi=300)
    plt.show()
def test(data_testset, predict_only=True, reranking=True):
    """Run T2T inference on the given test set, postprocess the generated
    beams, and optionally score the final predictions.

    :param data_testset: path/name of the test set; also used to select the
        dataset-specific MR reader when predict_only is False.
    :param predict_only: if True, only write the finalized predictions; if
        False, additionally produce the reduced predictions file and run the
        BLEU and E2E metrics scripts.
    :param reranking: if True (and beam scores are present), rerank the beams
        by their slot-alignment score before picking the top one.
    """
    test_source_file = os.path.join(config.DATA_DIR, 'test_source_dict.json')
    test_target_file = os.path.join(config.DATA_DIR, 'test_target.txt')
    predictions_file = os.path.join(config.PREDICTIONS_DIR, 'predictions.txt')
    predictions_final_file = os.path.join(config.PREDICTIONS_DIR, 'predictions_final.txt')
    predictions_reduced_file = os.path.join(config.METRICS_DIR, 'predictions_reduced.txt')
    test_reference_file = os.path.join(config.METRICS_DIR, 'test_references.txt')

    print('Loading test data...', end=' ')
    sys.stdout.flush()

    # Load and preprocess the test data
    data_loader.load_test_data(data_testset)

    print('DONE')
    print('Predicting...')
    sys.stdout.flush()

    # TODO: set DECODE_FILE and PREDICTION_FILE environment variables from here instead of the shell script
    # Run inference for the test samples
    os.system('bash ' + os.path.join(config.T2T_DIR, 't2t_test_script.sh'))

    print('DONE')
    print('Extracting beams...')
    sys.stdout.flush()

    # Read in the beams and their log-probs as produced by the T2T beam search.
    # With beams, columns alternate (utterance, score); otherwise there is a
    # single utterance column.
    df_predictions = pd.read_csv(predictions_file, sep='\t', header=None, encoding='utf8')
    beams_present = len(df_predictions.columns) > 1

    if beams_present:
        # Combine beams and their corresponding scores into tuples
        beams = []
        for i in range(0, len(df_predictions.columns), 2):
            beams.append(list(zip(df_predictions.iloc[:, i], df_predictions.iloc[:, i + 1])))

        # Transpose the list of beams so as to have all beams of a single sample per line
        beams = list(map(list, zip(*beams)))
    else:
        # No scores available: wrap each prediction as a one-element beam list
        beams = [[(beam, )] for beam in df_predictions.iloc[:, 0].tolist()]

    print('DONE')
    print('Reranking...')
    sys.stdout.flush()

    # Score the slot alignment in the beams, and rerank the beams accordingly
    if reranking and beams_present:
        beams = postprocessing.rerank_beams(beams)

    print('DONE')
    print('Evaluating...')
    sys.stdout.flush()

    with io.open(test_source_file, 'r', encoding='utf8') as f_test_source, \
            io.open(predictions_final_file, 'w', encoding='utf8') as f_predictions_final:
        mrs = json.load(f_test_source, object_pairs_hook=OrderedDict)
        # Keep only the top beam of each sample, then relexicalize
        predictions = [prediction_beams[0][0] for prediction_beams in beams]
        predictions_final = postprocessing.finalize_utterances(predictions, mrs)

        for prediction in predictions_final:
            f_predictions_final.write(prediction + '\n')

        if not predict_only:
            # Create a file with a single prediction for each group of the same MRs
            if 'rest_e2e' in data_testset:
                test_mrs, _ = data_loader.read_rest_e2e_dataset_test(data_testset)
            elif 'tv' in data_testset:
                test_mrs, _, _ = data_loader.read_tv_dataset_test(data_testset)
            elif 'laptop' in data_testset:
                test_mrs, _, _ = data_loader.read_laptop_dataset_test(data_testset)
            elif 'hotel' in data_testset:
                test_mrs, _, _ = data_loader.read_hotel_dataset_test(data_testset)
            elif 'video_game' in data_testset:
                test_mrs, _ = data_loader.read_video_game_dataset_test(data_testset)
            else:
                # Improvement: carry the offending dataset name in the error
                raise FileNotFoundError('Unknown test dataset: ' + str(data_testset))

            with io.open(predictions_reduced_file, 'w', encoding='utf8') as f_predictions_reduced:
                for i in range(len(test_mrs)):
                    # Only the first prediction of each run of identical MRs is kept
                    if i == 0 or test_mrs[i] != test_mrs[i - 1]:
                        f_predictions_reduced.write(predictions_final[i] + '\n')

    if not predict_only:
        # Depending on the OS, the tensor2tensor BLEU script might require a different way of executing
        if sys.executable is not None:
            bleu_script = 'python ' + os.path.join(os.path.dirname(sys.executable), 't2t-bleu')
        else:
            bleu_script = 't2t-bleu'

        metrics_script = 'python ' + os.path.join(config.METRICS_DIR, 'measure_scores.py')

        # Run the tensor2tensor internal BLEU script
        os.system(bleu_script + ' --translation=' + predictions_final_file +
                  ' --reference=' + test_target_file)

        # Run the metrics script provided by the E2E NLG Challenge
        os.system(metrics_script + ' ' + test_reference_file + ' ' + predictions_reduced_file)

    print('DONE')
def main(options):
    """Train the VGG-encoder image classifier with early stopping.

    Loads training (and optional test) images, builds the TF-1.x graph, then
    runs an epoch loop: each time the validation loss reaches a new minimum it
    prints confusion matrices, stores misclassified examples, and pickles the
    extracted features; otherwise it increments a patience counter, stopping
    once patience exceeds the limit.

    :param options: parsed CLI options; reads train_data_dir, test_data_dir,
        tb_log_dir and features_dir attributes.
    """
    train_data_dir = options.train_data_dir
    train_files = os.listdir(train_data_dir)
    test_data_dir = options.test_data_dir
    test_files = os.listdir(test_data_dir)

    data = data_loader.load_training_data(train_data_dir, train_files)
    images, labels, label_dict, imsize = data
    # Test data is labelled with the same label_dict mapping as training data.
    test_data = data_loader.load_test_data(test_data_dir, test_files, label_dict)
    test_images, test_labels = test_data
    num_classes = len(label_dict)
    test_info = test_images, test_labels

    has_test_data = False
    if len(test_images) > 0:
        has_test_data = True

    # Per-class example counts (labels appear to be one-hot, columns indexed
    # by label_dict — TODO confirm against data_loader).
    for key in label_dict:
        ex_count = np.sum(labels[:, label_dict[key]] == 1.)
        print("Number of examples of " + str(key) + ": " + str(ex_count))
    print("Feature Mapping: ", label_dict)
    print("Image Size: " + str(imsize))

    train_info, val_info = data_loader.merge_and_split_data(images, labels)
    train_data, train_labels = train_info
    val_data, val_labels = val_info
    print("Merged and Split Training Data")
    print("Training Data Size: ", len(train_data))
    print("Validation Data Size: ", len(val_data))

    if has_test_data:
        for key in label_dict:
            ex_count = np.sum(test_labels[:, label_dict[key]] == 1.)
            print("Number of test examples of " + str(key) + ": " + str(ex_count))
        print("Test Data Size: ", len(test_images))

    # Build input queue, placeholders and the model graph.
    data_queue = data_queues.QueueManager(train_data, train_labels)
    placeholders = utils.create_placeholders(imsize, num_classes)
    indata, answer, is_training, keep_prob, learning_rate = placeholders
    runnables = vgg_encoder_model.setup_model(indata, answer, imsize, is_training,
                                              keep_prob, num_classes, learning_rate)
    train_step, loss, predictions, accuracy, summaries, fc_2 = runnables
    train_summary, val_summary, test_summary = summaries
    init = tf.global_variables_initializer()
    print("Setup Model")

    log_dir = options.tb_log_dir
    features_dir = options.features_dir

    with tf.device('/gpu:0'):
        with tf.Session() as sess:
            # Separate TensorBoard writers per data split.
            train_writer = tf.summary.FileWriter(log_dir + "/train", sess.graph)
            val_writer = tf.summary.FileWriter(log_dir + "/val", sess.graph)
            test_writer = tf.summary.FileWriter(log_dir + "/test", sess.graph)
            sess.run(init)

            patience = 0
            max_patience = 1000  # stop after 1000 epochs without a new best val loss
            min_val_loss = float("inf")
            epoch = 0
            while patience < max_patience:
                epoch += 1
                print("EPOCH: ", str(epoch))
                start = time.time()
                # statistics tuples: (avg_loss, accuracy, confusion_matrix,
                # misclassified, [features]) — per model_runner's contract.
                train_statistics = model_runner.process_train_data(
                    data_queue, placeholders, runnables, train_writer, num_classes, sess)
                avg_train_loss, train_acc = train_statistics[:2]
                train_confusion_matrix = train_statistics[2]
                train_misclassified = train_statistics[3]
                end = time.time()

                val_statistics = model_runner.process_data(
                    val_info, placeholders, runnables, val_writer, num_classes, False, sess)
                avg_val_loss, val_acc = val_statistics[:2]
                val_confusion_matrix = val_statistics[2]
                val_misclassified = val_statistics[3]
                val_features = val_statistics[4]

                if has_test_data:
                    test_statistics = model_runner.process_data(
                        test_info, placeholders, runnables, test_writer, num_classes, True, sess)
                    avg_test_loss, test_acc = test_statistics[:2]
                    test_confusion_matrix = test_statistics[2]
                    test_misclassified = test_statistics[3]
                    test_features = test_statistics[4]
                    print("train_loss: " + str(avg_train_loss) + " val_loss: " +
                          str(avg_val_loss) + " test_loss: " + str(avg_test_loss))
                else:
                    print("train_loss: " + str(avg_train_loss) + " val_loss: " + str(avg_val_loss))
                print("train_acc: " + str(train_acc))
                print("val_acc: " + str(val_acc))
                if has_test_data:
                    print("test_acc: ", str(test_acc))
                print("Training Time: ", str(end - start))

                # New best validation loss: report, persist artifacts, reset patience.
                if avg_val_loss < min_val_loss:
                    min_val_loss = avg_val_loss
                    print("Training Confusion:\n", train_confusion_matrix)
                    print("Validation Confusion:\n", val_confusion_matrix)
                    if has_test_data:
                        print("Test Confusion:\n", test_confusion_matrix)
                        utils.store_misclassified(train_misclassified, val_misclassified,
                                                  test_misclassified=test_misclassified)
                    else:
                        utils.store_misclassified(train_misclassified, val_misclassified)

                    # Pickle filename encodes the label mapping, e.g. "cat_0_dog_1.p".
                    pickle_name = [k + "_{}".format(v) for k, v in label_dict.items()]
                    pickle_name = '_'.join(pickle_name)
                    pickle_name = pickle_name + ".p"
                    final_features = {'val': val_features}
                    if has_test_data:
                        final_features['test'] = test_features
                    with open(os.path.join(features_dir, pickle_name), 'wb') as outf:
                        pickle.dump(final_features, outf, protocol=pickle.HIGHEST_PROTOCOL)
                    patience = 0
                else:
                    patience += 1
def test_all(data_testset, reranking=True):
    """Run T2T inference with every model checkpoint and relexicalize all of
    the resulting prediction files, then score them with the t2t BLEU script.

    :param data_testset: path/name of the test set to load and preprocess.
    :param reranking: if True (and beam scores are present), rerank each
        file's beams by slot-alignment score before taking the top beam.
    """
    test_source_file = os.path.join(config.DATA_DIR, 'test_source_dict.json')
    test_target_file = os.path.join(config.DATA_DIR, 'test_target.txt')

    # Prepare the output folder
    if not os.path.exists(config.PREDICTIONS_BATCH_LEX_DIR):
        os.makedirs(config.PREDICTIONS_BATCH_LEX_DIR)

    print('Loading test data...', end=' ')
    sys.stdout.flush()

    # Load and preprocess the test data
    data_loader.load_test_data(data_testset)

    print('DONE')
    print('Predicting...')
    sys.stdout.flush()

    # Run inference for the test samples using each checkpoint of the model
    os.system('bash ' + os.path.join(config.T2T_DIR, 't2t_test_all_script.sh'))

    print('DONE')
    print('Evaluating...')
    sys.stdout.flush()

    # Relexicalize all prediction files (one per checkpoint)
    for predictions_file in glob.glob(os.path.join(config.PREDICTIONS_BATCH_DIR, '*')):
        predictions_final_file = os.path.join(config.PREDICTIONS_BATCH_LEX_DIR,
                                              os.path.basename(predictions_file))

        # Read in the beams and their log-probs as produced by the T2T beam
        # search; with beams present, columns alternate (utterance, score).
        df_predictions = pd.read_csv(predictions_file, sep='\t', header=None, encoding='utf8')
        beams_present = len(df_predictions.columns) > 1

        if beams_present:
            # Combine beams and their corresponding scores into tuples
            beams = []
            for i in range(0, len(df_predictions.columns), 2):
                beams.append(list(zip(df_predictions.iloc[:, i], df_predictions.iloc[:, i + 1])))

            # Transpose the list of beams so as to have all beams of a single sample per line
            beams = list(map(list, zip(*beams)))
        else:
            # No scores: wrap each prediction as a one-element beam list
            beams = [[(beam, )] for beam in df_predictions.iloc[:, 0].tolist()]

        # Score the slot alignment in the beams, and rerank the beams accordingly
        if reranking and beams_present:
            beams = postprocessing.rerank_beams(beams)

        # Postprocess the generated utterances and save them to a new file
        with io.open(test_source_file, 'r', encoding='utf8') as f_test_source, \
                io.open(predictions_final_file, 'w', encoding='utf8') as f_predictions_final:
            mrs = json.load(f_test_source, object_pairs_hook=OrderedDict)
            # Keep only the top beam of each sample, then relexicalize
            predictions = [prediction_beams[0][0] for prediction_beams in beams]
            predictions_final = postprocessing.finalize_utterances(predictions, mrs)

            for prediction in predictions_final:
                f_predictions_final.write(prediction + '\n')

    # Depending on the OS, the tensor2tensor BLEU script might require a different way of executing
    if sys.executable is not None:
        bleu_script = 'python ' + os.path.join(os.path.dirname(sys.executable), 't2t-bleu')
    else:
        bleu_script = 't2t-bleu'

    # Run the tensor2tensor internal BLEU script over the whole directory
    os.system(bleu_script + ' --translations_dir=' + config.PREDICTIONS_BATCH_LEX_DIR +
              ' --reference=' + test_target_file +
              ' --event_dir=' + config.PREDICTIONS_BATCH_EVENT_DIR)

    print('DONE')
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier

# load data (provided method)
# Train is split 90/10 into train/validation; both splits returned as DataFrames.
train_data, valid_data = data_loader.load_train_data('Data/adult.data', valid_rate=0.1, is_df=True)
test_data = data_loader.load_test_data('Data/adult.test', is_df=True)

#update fields
# Map raw native-country values to coarser region buckets. NOTE: the raw keys
# carry a leading space (as read from the adult dataset); ' ?' denotes missing.
native_country_dict = {
    ' ?': '?',
    ' Cambodia': 'Africa',
    ' Canada': 'North America',
    ' China': 'Asia',
    ' Columbia': 'Latin America',
    ' Cuba': 'Latin America',
    ' Dominican-Republic': 'Latin America',
    ' Ecuador': 'Latin America',
    ' El-Salvador': 'Latin America',
    ' England': 'Europe',
    ' France': 'Europe',
    ' Germany': 'Europe',
import augmentation_methods as am
import data_loader as dl
import word_vectors as wv
import data_preprocessing as dp
import classifier as cl
import testing as t
import visualization as vis

if __name__ == "__main__":
    # get original data in tokenized form
    orig_corpus, y_train_orig = dl.load_train_data()
    test_corpus, y_test_orig = dl.load_test_data()

    # develop word vectors from the original training corpus
    word_vectors = wv.get_word_vectors(orig_corpus)

    # augment corpora — each method gets its own copy so the original
    # corpus/labels stay untouched
    corpus_method_1, y_train_method_1 = am.method_1(orig_corpus.copy(),
                                                    y_train_orig.copy(), word_vectors)
    corpus_method_2, y_train_method_2 = am.method_2(orig_corpus.copy(),
                                                    y_train_orig.copy(), word_vectors)
    corpus_method_3, y_train_method_3 = am.method_3(orig_corpus.copy(),
                                                    y_train_orig.copy(), word_vectors)

    # process data so it is in a form (tf-idf) that can be fed to classifiers;
    # the vectorizer fit on the original corpus is reused for all other corpora
    X_orig, vectorizer = dp.process_corpus_orig(orig_corpus)
    X_method_1 = dp.process_corpus(corpus_method_1, vectorizer)
    X_method_2 = dp.process_corpus(corpus_method_2, vectorizer)
    X_method_3 = dp.process_corpus(corpus_method_3, vectorizer)
    X_test = dp.process_corpus(test_corpus, vectorizer)

    # train classifiers on original corpus and all augmented corpora
    classifier_orig = cl.train_classifier_bayes(X_orig, y_train_orig)
    classifier_method_1 = cl.train_classifier_bayes(X_method_1, y_train_method_1)
def get_embedding_matrix():
    """Build a (max_words, embedding_dim) matrix of GloVe vectors.

    Row i holds the embedding of the word with tokenizer index i; words
    without a GloVe vector keep a zero row. Relies on the module-level
    ``max_words``, ``embedding_dim`` and ``word_index`` — NOTE(review):
    ``word_index`` is only assigned in commented-out code below; presumably
    it is defined elsewhere in this module — verify.
    """
    embeddings_index = glove_word_embeddings.load_embeddings_index()
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in word_index.items():
        if i < max_words:
            embedding_vector = embeddings_index.get(word)
            # Words not found in the embedding index keep their zero row.
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix


# texts, labels = data_loader.load_data()
# Loading test data for test results with loading weights
texts, labels = data_loader.load_test_data()
print("Test data loaded, length: ", len(texts))
print("Test data loaded, length: ", len(labels))

# Fit the tokenizer on the test texts and convert them to integer sequences.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts=texts)
sequences = tokenizer.texts_to_sequences(texts)
#word_index = tokenizer.word_index
#print("Found %s unique tokens." % len(word_index))
#data = pad_sequences(sequences, maxlen=max_len)
#labels = np.asarray(labels)
#print("Shape of data tensor:", data.shape)