def main():
    """Evaluate a SavedModel image classifier on the validation or test split.

    Loads the SavedModel from --save-dir, feeds the chosen dataset through
    the restored graph, and prints the accuracy.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--save-dir', '-s', type=str, default='model')
    # --val and --test write the same dest, so the two flags are mutually
    # exclusive toggles of a single boolean (default: test split).
    parser.add_argument('--val', '-v', dest='val', action='store_true')
    parser.add_argument('--test', '-t', dest='val', action='store_false')
    parser.add_argument('--resize', dest='resize', action='store_true')
    parser.set_defaults(val=False, resize=False)
    args = parser.parse_args()
    if args.val:
        # Validation path: load training data and take the val portion that
        # prepare_dataset splits off (augmentation disabled for eval).
        images, labels = load_data(True)
        _, val_dset = prepare_dataset(images, labels, train=True, augment=False)
    else:
        images, labels = load_data(False)
        test_dset = prepare_dataset(images, labels, train=False, resize=args.resize)
    dataset = val_dset if args.val else test_dset
    graph = tf.Graph()
    with graph.as_default():
        with tf.Session(graph=graph) as sess:
            # Restore the serving graph; tensor names below must match the
            # names used when the model was exported.
            tf.saved_model.loader.load(sess,
                                       [tf.saved_model.tag_constants.SERVING],
                                       args.save_dir)
            X = graph.get_tensor_by_name('images_ph:0')
            logits_op = graph.get_tensor_by_name('dense_2/BiasAdd:0')
            acc, num_correct, num_samples = check_accuracy(
                sess, dataset, X, logits_op)
            print('{} acc: {:.2%} ({}/{})'.format('val' if args.val else 'test',
                                                  acc, num_correct, num_samples))
def get_test_data(test_file=None, level='word'):
    """Load the held-out split at the requested granularity.

    Returns a pair ``(inputs, vocab)`` where ``inputs`` is a two-element
    list at word level and a four-element list (word + char tensors) at
    character level.
    """
    if level == 'word':
        a_side, b_side, vocab = load_data(test_file, level=level, test=True)
        return [a_side, b_side], vocab
    a_side, b_side, a_chars, b_chars, vocab = load_data(
        test_file, level=level, test=True)
    return [a_side, b_side, a_chars, b_chars], vocab
def get_data(train_file=None, test_file=None, level='word'):
    """Load train and test splits.

    Returns ``(train_inputs, test_inputs, vocab)``; the lists carry extra
    character tensors when ``level`` is not 'word'. The test call's vocab
    is discarded in favour of the training vocab.
    """
    if level == 'word':
        a_tr, b_tr, c_tr, vocab = load_data(train_file, level=level)
        a_te, b_te, _ = load_data(test_file, level=level, test=True)
        return [a_tr, b_tr, c_tr], [a_te, b_te], vocab
    (a_tr, b_tr, c_tr,
     a_char_tr, b_char_tr, c_char_tr, vocab) = load_data(train_file,
                                                         level=level)
    a_te, b_te, a_char_te, b_char_te, _ = load_data(
        test_file, level=level, test=True)
    return ([a_tr, b_tr, c_tr, a_char_tr, b_char_tr, c_char_tr],
            [a_te, b_te, a_char_te, b_char_te], vocab)
def preprocess():
    """Load raw text, vectorize it, and carve out a dev split.

    Returns (x, y_np, data_processor, class_processor, x_dev, y_dev) where
    x/y_np are the full vectorized data and x_dev/y_dev are the shuffled
    dev-sample slice.  NOTE(review): the function returns the FULL x/y as the
    "train" arrays (the commented-out np.delete calls below suggest removing
    the dev rows was intended but never enabled) — confirm with callers.
    """
    # Load Data
    print("Data Preprocess Stage...")
    data_text, class_list = data_preprocess.load_data(FLAGS.data_file, FLAGS.class_file, FLAGS.char)
    # Build Vocabulary — pad/trim every document to the longest one seen.
    data_max_length = max([len(s.split(" ")) for s in data_text])
    print("Data Max Length: ", data_max_length)
    data_processor = learn.preprocessing.VocabularyProcessor(data_max_length)
    print("Data Processor Made")
    x = np.array(list(data_processor.fit_transform(data_text)))
    del data_text  # free the raw text early; x can be large
    print("Data Transformed to NPArray")
    class_processor = learn.preprocessing.VocabularyProcessor(1)
    print("Class Processor Made")
    y_np = np.array(list(class_processor.fit_transform(class_list)))
    del class_list
    print("Class Transformed to NPArray")
    y_max = np.max(y_np)
    print("Number of Class: ", y_max)
    #y = np.zeros((y_np.shape[0], y_max), dtype=int)
    #print("Zero NPArray for Class Made")
    #y_np = y_np.ravel()
    #y[np.arange(y_np.size), y_np-1] = 1
    #y = tf.one_hot(y_np, y_max)
    #print("One-Hot Encoding for Class Finished")
    #del y_np
    # Randomly shuffle data (fixed seed for reproducibility)
    np.random.seed(10)
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y_np)))
    shuffle_indices = np.random.permutation(np.arange(len(y_np)))
    # Keep only the tail of the permutation as the dev sample.
    shuffle_indices = shuffle_indices[dev_sample_index:]
    #x_shuffled = x[shuffle_indices]
    #del x
    #y_shuffled = y[shuffle_indices]
    #del y
    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    # x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    x_dev = x[shuffle_indices]
    #np.delete(x, shuffle_indices)
    y_dev = y_np[shuffle_indices]
    #np.delete(y, shuffle_indices)
    #del x_shuffled, y_shuffled
    if (FLAGS.char):
        print("Data Character Size: {:d}".format(len(data_processor.vocabulary_)))
    print("Class List Size: {:d}".format(len(class_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_np), len(y_dev)))
    return x, y_np, data_processor, class_processor, x_dev, y_dev
def test_env():
    """Smoke-test the recsim environment: build it, reset, and take one step."""
    path = '../master_capston/the-movies-dataset/'
    features_embedding_movies = pd.read_csv(
        os.path.join(path, 'movie_embedding_features.csv'))
    sampler = LTSDocumentSampler(dataset=features_embedding_movies)
    # Number of items returned in each recommendation slate.
    slate_size = 3
    # NOTE(review): presumably the size of the candidate pool offered to the
    # agent per slate — confirm against LTSDocumentSampler/environment docs.
    num_candidates = 10
    format_data = data_preprocess.load_data(path)
    # print(format_data.head())
    # print(format_data.shape)
    # NOTE(review): this re-reads the same CSV loaded above — redundant I/O.
    features_embedding_movies = pd.read_csv(
        os.path.join(path, 'movie_embedding_features.csv'))
    positive_user_ids, positive_history_data = data_preprocess.get_user_positive(
        format_data)
    user_sampler = LTSStaticUserSampler(positive_user_ids,
                                        positive_history_data,
                                        features_embedding_movies)
    LTSUserModel = UserModel(user_sampler, slate_size, LTSResponse)
    ltsenv = environment.Environment(LTSUserModel, sampler, num_candidates,
                                     slate_size, resample_documents=True)
    lts_gym_env = recsim_gym.RecSimGymEnv(ltsenv, clicked_engagement_reward)
    observation_0 = lts_gym_env.reset()
    # print(observation_0['user'][:5])
    # print('Observation 0')
    # print('Available documents')
    # doc_strings = ['doc_id ' + key + " kaleness " + str(value) for key, value
    #                in observation_0['doc'].items()]
    # print('\n'.join(doc_strings))
    recommendation_slate_0 = [0, 1, 2]
    observation_1, reward, done, _ = lts_gym_env.step(recommendation_slate_0)
    print(observation_1['user'][:5])
    # print('Noisy user state observation')
    # print(observation_0['user'])
    print(lts_gym_env.observation_space)
    print(lts_gym_env.action_space)


# test_doc_model()
# test_user_model()
# test_env()
def main(unused_argv):
    """Train the road-image CNN with tf.estimator, then evaluate and print metrics.

    Paths are hard-coded Windows locations; `cnn_model_fn` and `dt` are
    defined elsewhere in the project.
    """
    # Load training and eval data
    (train_data, train_labels) = dt.load_data("E:\\BUAA\\实验室\\阶段2:\\training_set")
    train_data = train_data.astype(np.float32)
    #train_labels = train_labels.astype(np.float32)
    print(train_labels.dtype)
    (eval_data, eval_labels) = dt.load_data("E:\\BUAA\\实验室\\阶段2:\\test_set")
    eval_data = eval_data.astype(np.float32)
    #eval_labels = eval_labels.astype(np.float32)
    # Create the Estimator
    road_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir="E:\\BUAA\\实验室\\阶段2:\\road_convnet_model")
    # Set up logging for predictions — logs the tensor named 'softmax_tensor'
    # (defined inside cnn_model_fn) every 50 steps.
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=50)
    # Train the model; num_epochs=None repeats the data until `steps` is hit.
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": train_data},
                                                        y=train_labels,
                                                        batch_size=100,
                                                        num_epochs=None,
                                                        shuffle=True)
    road_classifier.train(input_fn=train_input_fn,
                          steps=20000,
                          hooks=[logging_hook])
    # Evaluate the model and print results (single pass, no shuffling)
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": eval_data},
                                                       y=eval_labels,
                                                       num_epochs=1,
                                                       shuffle=False)
    eval_results = road_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
def test_user_model():
    """Sample one user from the static user sampler and build its observation."""
    dataset_dir = '../master_capston/the-movies-dataset/'
    ratings = data_preprocess.load_data(dataset_dir)
    # print(ratings.head())
    # print(ratings.shape)
    movie_embeddings = pd.read_csv(
        os.path.join(dataset_dir, 'movie_embedding_features.csv'))
    user_ids, history = data_preprocess.get_user_positive(ratings)
    sampler = LTSStaticUserSampler(user_ids, history, movie_embeddings)
    sampled_user = sampler.sample_user()
    sampled_user.create_observation()
def main():
    """Offline-evaluation driver.

    Builds a recsim gym environment from filtered MovieLens ratings and
    evaluates a popularity recommender on the chosen (train or test) split.
    All knobs are hard-coded locals below.
    """
    path = 'dataset/'
    use_100k = True  # switch between the 100k and 1M MovieLens exports
    if use_100k:
        rating_file_name = 'process_small_rating.csv'
        embedding_file_name = 'movie_embedding_features.csv'
    else:
        rating_file_name = 'process_1m_rating.csv'
        embedding_file_name = 'movie_embedding_features_1m.csv'
    # train_mode = True
    np.random.seed(0)  # deterministic sampling for reproducible evaluation
    test_mode = True        # evaluate on the test split instead of train
    offline_mode = True
    sample_user_randomly = False
    # Only referenced by the commented-out RL agent below.
    save_model_path = "model_test/save_model/49"
    # build enviroment
    # Number of items returned in each recommendation slate.
    slate_size = 5
    # NOTE(review): presumably the candidate-pool size per slate — confirm.
    num_candidates = 30
    rating_pivot = 4          # ratings >= this count as positive feedback
    time_budget = -1          # -1: let the sampler pick from the range below
    time_budget_range = [2, 6]
    min_num_positive_rating = 40  # drop users with fewer positive ratings
    min_num_rating = 70           # drop users with fewer total ratings
    resample_documents = True
    format_data = data_preprocess.load_data(path, file_name=rating_file_name)
    features_embedding_movies = pd.read_csv(
        os.path.join(path, embedding_file_name))
    positive_user_ids, positive_history_data = data_preprocess.get_user_positive(
        format_data, number_positive_pivot=min_num_positive_rating)
    print("unique user id : ", len(np.unique(positive_user_ids)))
    print(len(positive_history_data))
    # restrict number of total rating
    positive_user_ids, positive_history_data = data_preprocess.generate_new_dataset(
        positive_history_data, num_rating_pivot=min_num_rating)
    print("unique user id : ", len(np.unique(positive_user_ids)))
    print(len(positive_history_data))
    # generate train and test set
    # user_details = positive_history_data.groupby('userId').size().reset_index()
    # user_details.columns = ['userId', 'number of rating']
    # print(user_details.describe())
    train_set, test_set = data_preprocess.generate_train_test_data(
        positive_history_data)
    print("train set size : ", len(train_set))
    print("test set size : ", len(test_set))
    # Peel the most recent interactions off the train set to serve as each
    # user's observable history.
    users_history_data, train_set = data_preprocess.create_recent_history(
        train_set)
    train_set = train_set.astype({'rating': 'float64'})
    test_set = test_set.astype({'rating': 'float64'})
    print("user history set size : ", len(users_history_data))
    print("new train set size : ", len(train_set))
    user_details = test_set.groupby('userId').size().reset_index()
    user_details.columns = ['userId', 'number of rating']
    print(user_details.describe())
    # check the train set and test set quality
    offline_dataset = train_set
    if test_mode:
        offline_dataset = test_set
    sampler = LTSDocumentSampler(dataset=features_embedding_movies,
                                 num_candidate=num_candidates)
    user_sampler = LTSStaticUserSampler(users_history_data,
                                        features_embedding_movies,
                                        offline_data=offline_dataset,
                                        offline_mode=offline_mode,
                                        time_budget=time_budget,
                                        random=sample_user_randomly,
                                        time_budget_range=time_budget_range)
    # need to handle where we update dataset with num candidate< available
    func_select_train_set = select_dataset(features_embedding_movies,
                                           train_set)
    func_select_test_set = select_dataset(features_embedding_movies, test_set)
    # user_train_set = func(user_id=39)
    # print(len(user_train_set))
    LTSUserModel = UserModel(user_sampler,
                             offline_mode=offline_mode,
                             rating_pivot=rating_pivot,
                             slate_size=slate_size,
                             response_ctor=LTSResponse)
    ltsenv = CustomSingleUserEnviroment(
        LTSUserModel, sampler, num_candidates, slate_size,
        resample_documents=resample_documents, offline_mode=offline_mode,
        select_subset_func=func_select_train_set)
    if test_mode:
        # Rebuild the environment against the test-set selector instead.
        ltsenv = CustomSingleUserEnviroment(
            LTSUserModel, sampler, num_candidates, slate_size,
            resample_documents=resample_documents, offline_mode=offline_mode,
            select_subset_func=func_select_test_set)
    lts_gym_env = recsim_gym.RecSimGymEnv(ltsenv, clicked_engagement_reward)
    max_num_user = len(np.unique(users_history_data['userId']))
    with tf.Session() as sess:
        popularAgent = PopularityRecommender(train_set, slate_size, "1")
        # RL_Agent = create_RL_Agent(sess,lts_gym_env,slate_size,save_model_path=save_model_path)
        # contentAgent = contentModel(slate_size,embedding_size=30)
        evaluate_agent_offline(popularAgent, slate_size, max_num_user,
                               lts_gym_env)
def build_NN_classifier(filename, option, model_name=None):
    """Train and evaluate an MLP classifier on QM-descriptor data.

    Args:
        filename: path to the raw data file passed to data_preprocess.load_data.
        option: 'default' — 10 runs with default hyper-parameters;
                'train'   — 10 runs, each grid-searching its own parameters;
                'test'    — 10 runs with a fixed, hand-picked parameter set.
        model_name: unused; kept for backward-compatible interface.
    """
    # LOAD DATA
    descriptors = qm_descriptors
    X, Y = data_preprocess.load_data(filename, descriptors)

    # IF DOWNSAMPLING:
    #print('>> Down sampling.')
    #smaller_x, smaller_y = data_preprocess.do_down_sampling(X,Y)

    def _fresh_split():
        # One new 80/20 split per run so reported accuracies reflect
        # split-to-split variance, not a single lucky partition.
        return data_preprocess.split_data(X, Y, partition=0.20)

    if option == 'default':
        print('Training Logist...')
        print('*-----------------------------*')
        print('Training on default parameters.')
        accuracies_default = []
        for _ in range(10):
            x_train, x_valid, y_train, y_valid = _fresh_split()
            accuracies_default.append(
                train_NN(x_train, y_train, x_valid, y_valid))
        # Bug fix: message previously claimed "3 default runs" although the
        # loop performs 10.
        print('Average accuracy over 10 default runs: %.2f' %
              numpy.mean(accuracies_default))
    elif option == 'train':
        print('*-----------------------------*')
        print('Searching for best parameters.')  # fixed typo "Searchig"
        params = []
        accuracies = []
        for _ in range(10):
            x_train, x_valid, y_train, y_valid = _fresh_split()
            best_parameters = scan_parameters(x_train, y_train)
            params.append(best_parameters)
            accuracies.append(
                train_NN(x_train, y_train, x_valid, y_valid, best_parameters))
        print('*-----------------------------*')
        print('Summary of Results.')
        print('*-----------------------------*')
        for i, accuracy in enumerate(accuracies):
            print('Run ' + str(i + 1) + ' ', params[i], ' : ', accuracy)
    elif option == 'test':
        print('TESTING')
        print('*-----------------------------*')
        params_dict = {
            'hidden_layer_sizes': (100, 100),
            'solver': 'adam',
            'alpha': 0.001,
            # NOTE(review): the list-valued entry is inconsistent with the
            # scalar values above — confirm train_NN expects 'max_iter' as a
            # list rather than an int before changing it.
            'max_iter': [400]
        }
        print(params_dict)
        acc_list = []
        for _ in range(10):
            x_train, x_valid, y_train, y_valid = _fresh_split()
            acc_list.append(
                train_NN(x_train, y_train, x_valid, y_valid, params_dict))
        print('Summary of Results.')
        print('*-----------------------------*')
        print('Average accuracy over 10 runs: %.2f' % numpy.mean(acc_list))
def main(options):
    """End-to-end experiment driver.

    Loads or builds the train/val/test datasets, trains the attention model,
    rebuilds it with a merged (train + test OOV) embedding table, and writes
    TREC-format predictions plus ranking or classification metrics.
    """
    args = get_default_args()
    set_args(args, options)
    mode, dataset_name = args['mode'], args['dataset']
    # default setting
    args['raw_data'] = "data/%s/" % args['dataset']
    args['qrels_file'] = "data/%s/qrels.all.txt" % args['dataset']
    print_args(args)
    # get train/val/test names for specific dataset
    train_name, val_name, test_name, train_set, val_set, test_set, num_classes, with_url = config_dataset(
        args)
    max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict(
        int), defaultdict(int)
    vocab = {'word': {}, '3gram': {}}
    test_vocab = {'word': {}, '3gram': {}}
    train_vocab_emb, test_vocab_emb = None, None

    ############################# LOAD DATA ##################################
    data_name = ("data_m%s_%s_%s_%s" %
                 (mode, dataset_name, train_name, test_name)).lower()
    if args["load_data"]:
        # Fast path: reload previously serialized datasets/vocabs from disk.
        train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True)
        test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, test_name),
            False)
        # twitter/TwitterURL carve their val set out of train later instead.
        if dataset_name != 'twitter' and dataset_name != 'TwitterURL':
            val_dataset, _, _, _, _, _ = load_data(
                "%s/%s/%s" % (args["experimental_data"], data_name, val_name),
                False)
        if args['embedding'] == 'glove':
            train_vocab_emb, test_vocab_emb = construct_vocab_emb(
                "%s/%s" % (args["experimental_data"], data_name),
                vocab['word'],
                test_vocab['word'],
                300,
                "word",
                base_embed_path=args["base_embed_path"],
                type=args["embedding"])
        print('load dataset successfully')
    else:
        # Slow path: build datasets from raw data, then serialize for reuse.
        train_dataset = gen_data(args["raw_data"], train_set, vocab,
                                 test_vocab, True, max_query_len, max_doc_len,
                                 max_url_len, num_classes, args)
        print("create training set successfully...")
        if dataset_name != 'twitter' and dataset_name != 'TwitterURL':
            val_dataset = gen_data(args["raw_data"], val_set, vocab,
                                   test_vocab, False, max_query_len,
                                   max_doc_len, max_url_len, num_classes,
                                   args)
            print("create validation set successfully...")
        test_dataset = gen_data(args["raw_data"], test_set, vocab, test_vocab,
                                False, max_query_len, max_doc_len,
                                max_url_len, num_classes, args)
        train_vocab_emb, test_vocab_emb = construct_vocab_emb(
            "%s/%s" % (args["experimental_data"], data_name),
            vocab['word'],
            test_vocab['word'],
            300,
            "word",
            base_embed_path=args["base_embed_path"])
        save_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True, train_dataset, max_query_len, max_doc_len, max_url_len,
            vocab, train_vocab_emb)
        print("save training set successfully...")
        if dataset_name != 'twitter' and dataset_name != 'TwitterURL':
            save_data("%s/%s/%s" %
                      (args["experimental_data"], data_name, val_name),
                      False,
                      val_dataset,
                      vocab=test_vocab,
                      vocab_emb=test_vocab_emb)
            print("save val set successfully...")
        save_data("%s/%s/%s" %
                  (args["experimental_data"], data_name, test_name),
                  False,
                  test_dataset,
                  vocab=test_vocab,
                  vocab_emb=test_vocab_emb)
        print("save test set successfully...")

    if dataset_name == 'twitter' or dataset_name == 'TwitterURL':
        # These datasets have no pre-built val split: sample one from train.
        val_split = args['val_split']
        num_samples, _ = train_dataset["query_word_input"].shape
        # randomly sample queries and all their documents if query_random is True
        # otherwise, query-doc pairs are randomly sampled
        query_random = True if dataset_name == 'twitter' else False
        if query_random:
            del train_dataset["overlap_feat"]
            val_indices = sample_aaai_val_set(args["raw_data"], train_set,
                                              val_split)
        else:
            val_split = 0.1
            val_indices, val_set = [], set()
            # Rejection-sample distinct row indices until val_split is filled.
            for i in range(int(num_samples * val_split)):
                val_index = np.random.randint(num_samples)
                while val_index in val_set:
                    val_index = np.random.randint(num_samples)
                val_indices.append(val_index)
                val_set.add(val_index)
        val_dataset = {}
        for key in train_dataset:
            #print(key, train_dataset[key].shape)
            val_dataset[key] = train_dataset[key][val_indices]
            train_dataset[key] = np.delete(train_dataset[key], val_indices, 0)

    # shuffle the train dataset explicitly to make results reproducible
    # whether the performance will be affected remains a question
    keys, values = [], []
    for key in train_dataset:
        if train_dataset[key].size == 0:
            continue
        keys.append(key)
        values.append(train_dataset[key])
    # zip/unzip keeps all feature arrays row-aligned through the shuffle.
    zipped_values = list(zip(*values))
    random.shuffle(zipped_values)
    shuffled_values = list(zip(*zipped_values))
    for i, key in enumerate(keys):
        train_dataset[key] = np.array(shuffled_values[i])
    print('after shuffle:', train_dataset['id'][:5], train_dataset['sim'][:5],
          train_dataset['query_word_input'][:5])

    # merge the vocabulory of train and test set
    merged_vocab = {}
    merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word'])
    merged_vocab['3gram'] = merge_two_dicts(vocab['3gram'],
                                            test_vocab['3gram'])
    print("TRAIN vocab: word(%d) 3gram(%d)" %
          (len(vocab['word']), len(vocab['3gram'])))
    print("TEST vocab: word(%d) 3gram(%d)" %
          (len(test_vocab['word']), len(test_vocab['3gram'])))
    print("MERGED vocab: word(%d) 3gram(%d)" %
          (len(merged_vocab['word']), len(merged_vocab['3gram'])))
    vocab_inv, vocab_size = {}, {}
    for key in vocab:
        vocab_inv[key] = invert_dict(merged_vocab[key])
        # Training uses the train-only vocab sizes; updated before testing.
        vocab_size[key] = len(vocab[key])
    print(vocab_size)

    # Print data samples for debug purpose
    print_dataset(mode, train_dataset, vocab_inv)
    print_dataset(mode, test_dataset, vocab_inv)

    ############################ TRAIN MODEL #################################
    # create model
    model = create_attention_model(max_query_len,
                                   max_doc_len,
                                   max_url_len,
                                   vocab_size,
                                   train_vocab_emb,
                                   args["nb_filters"],
                                   args["nb_layers"],
                                   embed_size=300,
                                   dropout_rate=args['dropout'],
                                   trainable=args["trainable"],
                                   weighting=args['weighting'],
                                   mask=args["mask"],
                                   conv_option=args['conv_option'],
                                   model_option=args['model_option'],
                                   join=args['join'],
                                   num_classes=num_classes,
                                   with_url=with_url,
                                   highway=args['highway'],
                                   att=args['co_attention'],
                                   ext_feat=args["external_feat"],
                                   encoder_option=args['encoder_option'])
    # Model/checkpoint name encodes the full hyper-parameter configuration.
    model_name = (
        "model_N%s_data%s_mo%s_e%s_c%s_NumFilter%d_nblayer%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f_Join%s_H%s_Att%s"
        % (mode, train_name, args['model_option'], args["encoder_option"],
           args['conv_option'], args["nb_filters"], args["nb_layers"],
           args["trainable"], args['dropout'], args['weighting'],
           args['mask'], args['batch_size'], args['val_split'], args['join'],
           args['highway'], args['co_attention'])).lower()
    model_path = "%s/%s/%s" % (args['experimental_data'], data_name,
                               model_name)
    print(model_path)
    if args['optimizer'] == "adam":
        opt = optimizers.Adam(lr=args["learning_rate"],
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=None,
                              decay=0.0,
                              amsgrad=True)
        print('use Adam optimizer')
    elif args['optimizer'] == "sgd":
        opt = optimizers.SGD(lr=args["learning_rate"],
                             decay=1e-6,
                             momentum=0.9,
                             nesterov=True)
        print('use SGD optimizer')
    elif args['optimizer'] == 'rmsprop':
        opt = optimizers.RMSprop(lr=args["learning_rate"],
                                 rho=0.9,
                                 epsilon=None,
                                 decay=0.0)
        print('use RMSprop optimizer')
    if num_classes <= 2:
        model.compile(loss='binary_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])
    else:
        print('compile model with categorical cross-entropy')
        model.compile(loss='categorical_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])
    class_weight = None
    if args['dataset'] == 'Quora':
        #class_weight = {0:1, 1:2}
        print('apply class weight:', class_weight)
    print(model.summary())
    print('model init weights sum: %.4f' % get_model_weights(model))
    if not args['load_model']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=4)
        checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights",
                                     monitor='val_loss',
                                     save_best_only=True,
                                     verbose=1)
        lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.5,
                                       patience=2,
                                       min_lr=0.0001,
                                       verbose=1)
        model.fit(
            train_dataset,
            train_dataset['sim'],
            #validation_split=0.05,
            batch_size=args['batch_size'],
            validation_data=(val_dataset, val_dataset['sim']),
            epochs=args['epochs'],
            shuffle=False,
            callbacks=[checkpoint, lr_reducer, early_stopping],
            class_weight=class_weight,
            verbose=args['verbose'])

    ############################ TEST MODEL #################################
    print('load best model from %s.best.weights' % model_path)
    model.load_weights("%s.best.weights" % model_path)
    # load trained vocab embedding.
    trained_vocab_emb = model.get_layer('word-embedding').get_weights()[0]
    # merge trained vocab embedding with test OOV word embeddings
    merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300))
    merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb
    merged_vocab_emb[
        len(vocab['word']):len(merged_vocab['word']), :] = test_vocab_emb
    for key in vocab:
        vocab_size[key] = len(merged_vocab[key])
    print(vocab_size)
    # Rebuild the model with the enlarged vocab for inference on test OOVs.
    new_model = create_attention_model(max_query_len,
                                       max_doc_len,
                                       max_url_len,
                                       vocab_size,
                                       merged_vocab_emb,
                                       args["nb_filters"],
                                       args["nb_layers"],
                                       embed_size=300,
                                       dropout_rate=args['dropout'],
                                       trainable=args["trainable"],
                                       weighting=args['weighting'],
                                       mask=args["mask"],
                                       conv_option=args['conv_option'],
                                       model_option=args['model_option'],
                                       join=args['join'],
                                       num_classes=num_classes,
                                       with_url=with_url,
                                       highway=args['highway'],
                                       att=args['co_attention'],
                                       ext_feat=args["external_feat"],
                                       encoder_option=args['encoder_option'])
    new_model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    #print(new_model.summary())
    # Copy every trained layer except the embedding (replaced above).
    for layer_id in range(len(model.layers)):
        layer = model.layers[layer_id]
        if layer.name != 'word-embedding':
            new_model.layers[layer_id].set_weights(layer.get_weights())
    print('copy weight done.')
    val_predictions = new_model.predict(val_dataset)
    predictions = new_model.predict(test_dataset)
    if dataset_name == 'twitter' or dataset_name == 'TrecQA':
        # Ranking datasets: score with the positive-class probability and
        # report MAP/P30/MRR via trec-style evaluation.
        val_predictions = val_predictions[:, 1]
        predictions = predictions[:, 1]
        print(predictions[:10])
        predictions_file = "%s/%s/predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(predictions_file, 'w') as f:
            for i in range(test_dataset['id'].shape[0]):
                f.write("%s %.4f %s\n" % (test_dataset['id'][i],
                                          predictions[i], args['mode']))
        print('write predictions with trec format to %s' % predictions_file)
        val_predictions_file = "%s/%s/val_predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(val_predictions_file, 'w') as f:
            for i in range(val_dataset['id'].shape[0]):
                f.write("%s %.4f %s\n" % (val_dataset['id'][i],
                                          val_predictions[i], args['mode']))
        map, mrr, p30 = evaluate(val_predictions_file, args["qrels_file"])
        print('write val predictions with trec format to %s' %
              val_predictions_file)
        print('Validation MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
        map, mrr, p30 = evaluate(predictions_file, args["qrels_file"])
        print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
    else:
        # Classification datasets: argmax predictions, report accuracy,
        # macro/micro P/R/F1 and the confusion matrix.
        preds = np.argmax(predictions, axis=-1)
        labels = np.argmax(test_dataset['sim'], axis=-1)
        corrects = preds == labels
        predictions_file = "%s/%s/predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(predictions_file, 'w') as f:
            f.write("id label pred prob model\n")
            for i in range(len(preds)):
                f.write("%s %s %s %.4f %s\n" %
                        (test_dataset['id'][i], labels[i], preds[i],
                         predictions[i][preds[i]], args['mode']))
        print('write predictions with trec format to %s' % predictions_file)
        val_preds = np.argmax(val_predictions, axis=-1)
        val_labels = np.argmax(val_dataset['sim'], axis=-1)
        val_corrects = val_preds == val_labels
        val_predictions_file = "%s/%s/val_predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(val_predictions_file, 'w') as f:
            for i in range(val_dataset['id'].shape[0]):
                f.write("%s %s %s %.4f %s\n" %
                        (val_dataset['id'][i], val_labels[i], val_preds[i],
                         val_predictions[i][val_preds[i]], args['mode']))
        print('write val predictions with trec format to %s' %
              val_predictions_file)
        print('val accuracy: %.4f' %
              (np.count_nonzero(val_corrects) * 1.0 / len(val_preds)))
        print('accuracy: %.4f' %
              (np.count_nonzero(corrects) * 1.0 / len(preds)))
        macro_prec = precision_score(labels, preds, average="macro")
        macro_recall = recall_score(labels, preds, average="macro")
        print('Macro Precision: %.3f, Recall: %.3f, F1: %.3f' %
              (macro_prec, macro_recall, 2 * macro_prec * macro_recall /
               (macro_prec + macro_recall)))
        print('Micro Precision: %.3f, Recall: %.3f, F1: %.3f' %
              (precision_score(labels, preds, average="micro"),
               recall_score(labels, preds, average="micro"),
               f1_score(labels, preds, average="micro")))
        print('Confusion matrix:', confusion_matrix(labels, preds))
def main(options): args = get_default_args() set_args(args, options) print_args(args) mode = args['mode'] train_name, test_name = args['split']['train'], args['split']['test'] if train_name == 'train_all': train_set = ['train_2011', 'test_2011', 'train_2013', 'test_2013'] train_set.remove(test_name) else: train_set = [train_name] test_set = [test_name] print("train_set", train_set) print("test_set", test_set) max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict( int), defaultdict(int) vocab = {'word': {}, '3gram': {}, 'url': {}} test_vocab = {'word': {}, '3gram': {}, 'url': {}} train_vocab_emb, test_vocab_emb = None, None ############################# LOAD DATA ################################## data_name = ("data_m%s_%s_%s" % (mode, train_name, test_name)).lower() if args["load_data"]: train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True) test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, test_name), False) print('load dataset successfully') else: #vocab = build_vocab(args["raw_data"], train_set, test_set, vocab) #print('build vocab done. 
%d' % len(vocab['word'])) train_dataset = gen_data(args["raw_data"], train_set, vocab, test_vocab, True, max_query_len, max_doc_len, max_url_len, args) print("create training set successfully...") test_dataset = gen_data(args["raw_data"], test_set, vocab, test_vocab, False, max_query_len, max_doc_len, max_url_len, args) train_vocab_emb, test_vocab_emb = construct_vocab_emb( "%s/%s" % (args["experimental_data"], data_name), vocab['word'], test_vocab['word'], 300, "word", base_embed_path=args["base_embed_path"]) save_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True, train_dataset, max_query_len, max_doc_len, max_url_len, vocab, train_vocab_emb) print("save training set successfully...") save_data("%s/%s/%s" % (args["experimental_data"], data_name, test_name), False, test_dataset, vocab=test_vocab, vocab_emb=test_vocab_emb) print("save test set successfully...") if mode == 'dssm': train_dataset = convert_data_to_dssm_format(train_dataset, vocab, is_train_or_val=True) test_dataset = convert_data_to_dssm_format(test_dataset, vocab, is_train_or_val=False) print('data convertion done!') val_split = args['val_split'] num_samples, _ = train_dataset["query_word_input"].shape # randomly sample queries and all their documents if query_random is True # otherwise, query-doc pairs are randomly sampled query_random = True if query_random: val_indices = sample_val(train_set, num_samples=num_samples, val_split=val_split) else: val_indices, val_set = [], set() for i in range(int(num_samples * val_split)): val_index = np.random.randint(num_samples) while val_index in val_set: val_index = np.random.randint(num_samples) val_indices.append(val_index) val_set.add(val_index) print(val_indices[:5], np.sum(np.array(val_indices))) # sample validation set for debug purpose # val_indices = val_indices[:100] train_dataset["query_word_weight"] = train_dataset[ "query_word_weight"][:, :args['deeplevel']] train_dataset["query_3gram_weight"] = train_dataset[ 
"query_3gram_weight"][:, :args['deeplevel']] train_dataset["doc_word_weight"] = train_dataset[ "doc_word_weight"][:, :args['deeplevel']] train_dataset["doc_3gram_weight"] = train_dataset[ "doc_3gram_weight"][:, :args['deeplevel']] train_dataset["url_3gram_weight"] = train_dataset[ "url_3gram_weight"][:, :args['deeplevel']] test_dataset["query_word_weight"] = test_dataset[ "query_word_weight"][:, :args['deeplevel']] test_dataset["query_3gram_weight"] = test_dataset[ "query_3gram_weight"][:, :args['deeplevel']] test_dataset["doc_word_weight"] = test_dataset[ "doc_word_weight"][:, :args['deeplevel']] test_dataset["doc_3gram_weight"] = test_dataset[ "doc_3gram_weight"][:, :args['deeplevel']] test_dataset["url_3gram_weight"] = test_dataset[ "url_3gram_weight"][:, :args['deeplevel']] # print("SHAPEEEEEEEEEEEEEEEEEEEE: {}".format(len(train_dataset["query_word_weight"][100]))) val_dataset = {} for key in train_dataset: val_dataset[key] = train_dataset[key][val_indices] train_dataset[key] = np.delete(train_dataset[key], val_indices, 0) # shuffle the train dataset explicitly to make results reproducible # whether the performance will be affected remains a question keys, values = [], [] for key in train_dataset: keys.append(key) values.append(train_dataset[key]) zipped_values = list(zip(*values)) random.shuffle(zipped_values) shuffled_values = list(zip(*zipped_values)) for i, key in enumerate(keys): train_dataset[key] = np.array(shuffled_values[i]) print('after shuffle:', train_dataset['id'][:5], train_dataset['sim'][:5], train_dataset['query_word_input'][:5]) # sample training dataset for debug purpose # sample_num = 1000 # for key in train_dataset: # train_dataset[key] = train_dataset[key][:sample_num] # merge the vocabulory of train and test set print("TRAIN vocab: word(%d) 3gram(%d) url(%d)" % (len(vocab['word']), len(vocab['3gram']), len(vocab['url']))) print("TEST vocab: word(%d) 3gram(%d) url(%d)" % (len( test_vocab['word']), len(test_vocab['3gram']), 
len(test_vocab['url']))) merged_vocab = {'url': vocab['url'], '3gram': vocab['3gram']} merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word']) print("merged vocab: word(%d) 3gram(%d) url(%d)" % (len(merged_vocab['word']), len( merged_vocab['3gram']), len(merged_vocab['url']))) vocab_inv, vocab_size = {}, {} vocab['char'] = merge_two_dicts(vocab['3gram'], vocab['url']) test_vocab['char'] = merge_two_dicts(test_vocab['3gram'], test_vocab['url']) merged_vocab['char'] = merge_two_dicts(vocab['char'], test_vocab['char']) for key in vocab: vocab_inv[key] = invert_dict(merged_vocab[key]) vocab_size[key] = len(vocab[key]) print(vocab_size) # Print data samples for debug purpose # print_dataset(mode, train_dataset, vocab_inv) # print_dataset(mode, test_dataset, vocab_inv) ############################ TRAIN MODEL ################################# model = None if mode == 'deep_twitter': model = create_attention_model(max_query_len, max_doc_len, max_url_len, vocab_size, train_vocab_emb, args["nb_filters"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"], weighting=args['weighting'], mask=args["mask"], conv_option=args['conv_option'], model_option=args['model_option'], external=args["external_feat"], norm_weight=args['norm_weight'], cos_norm=args['cos'], only_word=args['only_word'], only_char=args['only_char'], pooling=args['pooling'], deeplevel=args['deeplevel']) elif mode == 'dssm': model = create_dssm_model(max_query_len, max_doc_len, max_url_len, vocab_size, train_vocab_emb, args["nb_filters"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"]) model_name = ( "model_N%s_data%s_mo%s_c%s_NumFilter%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f" % (mode, train_name, args['model_option'], args['conv_option'], args["nb_filters"], args["trainable"], args['dropout'], args['weighting'], args['mask'], args['batch_size'], args['val_split'])).lower() model_path = "%s/%s/%s" % (args['experimental_data'], data_name, model_name) 
print(model_path) if args['optimizer'] == "adam": opt = optimizers.Adam(lr=args["learning_rate"], beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False) elif args['optimizer'] == "sgd": opt = optimizers.SGD(lr=args["learning_rate"], decay=1e-6, momentum=0.9, nesterov=True) model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy']) print(model.summary()) model_weights, parameter_num = get_model_weights(model) print('model init weights sum: {} of {} parameters'.format( model_weights, parameter_num)) # if not args['load_model']: early_stopping = EarlyStopping(monitor='val_loss', patience=4) checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights", monitor='val_loss', save_best_only=True, verbose=1) lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=3, min_lr=0.0001) fit_mode = "fit" if fit_mode == "fit": model.fit( train_dataset, train_dataset['sim'], # validation_split=0.05, batch_size=args['batch_size'], validation_data=(val_dataset, val_dataset['sim']), epochs=args['epochs'], shuffle=False, callbacks=[checkpoint, lr_reducer, early_stopping], verbose=2) else: train_steps, train_batches = batch_iter( train_dataset, train_dataset["sim"], batch_size=args['batch_size']) valid_steps, valid_batches = batch_iter( val_dataset, val_dataset["sim"], batch_size=args['batch_size']) model.fit_generator( train_batches, train_steps, epochs=args['epochs'], validation_data=valid_batches, validation_steps=valid_steps, callbacks=[checkpoint, lr_reducer, early_stopping], verbose=2) #plot_model(model, to_file='model.png') ############################ TEST MODEL ################################# print('load best model from %s.best.weights' % model_path) model.load_weights("%s.best.weights" % model_path) if mode == 'deep_twitter': # load trained vocab embedding. 
if args["only_char"]: merged_vocab_emb = None else: embedding_layer_name = 'word_embedding' trained_vocab_emb = model.get_layer( embedding_layer_name).get_weights()[0] # merge trained vocab embedding with test OOV word embeddings merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300)) merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb merged_vocab_emb[len(vocab['word']):len(merged_vocab['word'] ), :] = test_vocab_emb for key in vocab: vocab_size[key] = len(merged_vocab[key]) print(vocab_size) new_model = create_attention_model(max_query_len, max_doc_len, max_url_len, vocab_size, merged_vocab_emb, args["nb_filters"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"], weighting=args['weighting'], mask=args["mask"], conv_option=args['conv_option'], model_option=args['model_option'], external=args["external_feat"], norm_weight=args['norm_weight'], cos_norm=args['cos'], only_word=args['only_word'], only_char=args['only_char'], pooling=args['pooling'], deeplevel=args['deeplevel']) new_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # print(new_model.summary()) num_layers = 0 for layer in model.layers: num_layers += 1 for layer_id in range(num_layers): layer = model.layers[layer_id] if not args["only_char"] and layer.name != embedding_layer_name: new_model.layers[layer_id].set_weights(layer.get_weights()) print('copy weight done.') predictions = new_model.predict(test_dataset) elif mode == 'dssm': getter = K.function([model.layers[0].input, model.layers[1].input], model.layers[-2].output) print('create DSSM functional getter...') num_samples, _, _ = test_dataset['query_3gram_input'].shape batch_size = 128 num_batch = int(math.ceil(num_samples * 1.0 / batch_size)) predictions = np.zeros((num_samples, )) for i in range(num_batch): start_idx, end_idx = i * batch_size, min(num_samples, (i + 1) * batch_size) predictions[start_idx:end_idx] = getter([ 
test_dataset['query_3gram_input'][start_idx:end_idx], test_dataset['doc_3gram_input'][start_idx:end_idx] ])[:, 0] #predictions = getter([test_dataset['query_3gram_input'], test_dataset['doc_3gram_input']]) print(predictions[:10]) predictions_file = "%s/%s/predictions_%s.txt" % (args["experimental_data"], data_name, model_name) with open(predictions_file, 'w') as f: for i in range(test_dataset['id'].shape[0]): f.write("%s %.4f %s\n" % (test_dataset['id'][i], predictions[i], args['mode'])) print('write predictions with trec format to %s' % predictions_file) map, mrr, p30 = evaluate(predictions_file, args["qrels_file"]) print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
def test_custom_env():
    """Smoke-test the custom RecSim environment end to end.

    Loads the movies dataset, builds an offline single-user environment,
    then steps it with a fixed slate until the simulated user terminates,
    printing observations and rewards along the way.  No return value.
    """
    path = '../master_capston/the-movies-dataset/'
    features_embedding_movies = pd.read_csv(
        os.path.join(path, 'movie_embedding_features.csv'))
    # this mean the number of items in the recommendation return from the agent
    slate_size = 3
    # i am assuming this number mean the # of possible items to send to the agent for recommend for each slate
    num_candidates = 11
    format_data = data_preprocess.load_data(path)
    # NOTE(review): the embeddings CSV is read a second time here -- the
    # first read above appears redundant; confirm before removing.
    features_embedding_movies = pd.read_csv(
        os.path.join(path, 'movie_embedding_features.csv'))
    positive_user_ids, positive_history_data = data_preprocess.get_user_positive(
        format_data)
    # generate train and test set
    train_set, test_set = data_preprocess.generate_train_test_data(
        positive_history_data)
    users_history_data, train_set = data_preprocess.create_recent_history(
        train_set, embedding_features_data=features_embedding_movies)
    offline_mode = True
    # presumably ratings >= rating_pivot count as positive -- TODO confirm
    rating_pivot = 4
    sampler = LTSDocumentSampler(dataset=features_embedding_movies,
                                 num_candidate=num_candidates)
    user_sampler = LTSStaticUserSampler(users_history_data,
                                        features_embedding_movies,
                                        offline_data=test_set,
                                        offline_mode=offline_mode)
    # need to handle where we update dataset with num candidate < available
    func = select_dataset(features_embedding_movies, test_set)
    LTSUserModel = UserModel(user_sampler,
                             offline_mode=offline_mode,
                             rating_pivot=rating_pivot,
                             slate_size=slate_size,
                             response_ctor=LTSResponse)
    ltsenv = CustomSingleUserEnviroment(LTSUserModel,
                                        sampler,
                                        num_candidates,
                                        slate_size,
                                        resample_documents=False,
                                        offline_mode=True,
                                        select_subset_func=func)
    lts_gym_env = recsim_gym.RecSimGymEnv(ltsenv, clicked_engagement_reward)
    observation_0 = lts_gym_env.reset()
    # NOTE(review): the "******" below is corrupted/redacted source text and
    # is not valid Python as it stands -- restore the original expression.
    print("current user : "******"current history of user items :",
          observation_0['user']['record_ids'])
    print("candidate recommend docs ids : ", observation_0['doc'].keys())
    done = False
    # Step the environment with a fixed slate until the episode ends.
    while (not done):
        # for i in range(4):
        recommendation_slate_0 = [0, 1, 2]
        observation_1, reward, done, _ = lts_gym_env.step(
            recommendation_slate_0)
        print("response : ", observation_1['response'])
        print("reward : ", reward)
        print("next history of recommend items :",
              observation_1['user']['record_ids'])
        print("total remaind candidate items to recommend : ",
              len(observation_1['doc'].keys()))
        print("docs ids : ", observation_1['doc'].keys())


# test_custom_env()
from data_preprocess import load_data from models.Fac_Model import Fac_Model from tensorflow import keras from tensorflow.keras.optimizers import Adam from tensorflow.keras.losses import CategoricalCrossentropy from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau X_train, X_test, X_val, y_train, y_test, y_val = load_data() model = Fac_Model(7) model.build(input_shape=(None, 48, 48, 1)) model.summary() model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) reducePlateau = ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, min_delta=0.0001, patience=1, verbose=1) history = model.fit(X_train, y_train, epochs=14, batch_size=128, steps_per_epoch=250, validation_data=(X_val, y_val), verbose=1, callbacks=[reducePlateau]) result = model.evaluate(X_test, y_test, verbose=1) print('Resultado Final: ', result)
def build_logist(filename, option, model_name=None):
    """Train/evaluate a logistic-regression model on QM descriptors.

    Parameters
    ----------
    filename : str
        Path of the data file passed to ``data_preprocess.load_data``.
    option : str
        One of:
        ``'default'`` -- 10 runs with default hyper-parameters;
        ``'train'``   -- 10 runs, each with a per-run parameter search;
        ``'RFE'``     -- recursive feature elimination ranking;
        ``'test'``    -- 10 runs with a fixed, hand-picked parameter set.
    model_name : str, optional
        Currently unused; kept for interface compatibility.

    All results are printed; nothing is returned.
    """
    # LOAD DATA: featurize using the module-level QM descriptor list.
    descriptors = qm_descriptors
    X, Y = data_preprocess.load_data(filename, descriptors)
    if option == 'default':
        print('Training Logist...')
        print('*-----------------------------*')
        print('Training on default parameters.')
        accuracies_default = []
        # Average over 10 random 80/20 splits to smooth out split variance.
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            accuracies_default.append(
                train_logist(x_train, y_train, x_valid, y_valid))
        print('Average accuracy over 10 default runs: %.2f' %
              numpy.mean(accuracies_default))
    elif option == 'train':
        print('*-----------------------------*')
        # FIX: corrected user-facing typo "Searchig".
        print('Searching for best parameters.')
        params = []
        accuracies = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            best_parameters = scan_parameters(x_train, y_train)
            params.append(best_parameters)
            accuracy = train_logist(x_train, y_train, x_valid, y_valid,
                                    best_parameters)
            accuracies.append(accuracy)
        print('*-----------------------------*')
        print('Summary of Results.')
        print('*-----------------------------*')
        for i in range(len(accuracies)):
            print('Run ' + str(i + 1) + ' ', params[i], ' : ', accuracies[i])
    elif option == 'RFE':
        print('*-----------------------------*')
        print('Recursive feature estimation.')
        #http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE
        ranking = perform_RFE(X, Y)
        print('*-----------------------------*')
        print('Ranking of descriptors.')
        print('*-----------------------------*')
        for d in range(len(qm_descriptors)):
            print(qm_descriptors[d], ranking[d])
    elif option == 'test':
        print('TESTING')
        print('*-----------------------------*')
        # Hand-picked parameter set; the l2 alternative is kept for reference.
        #penalties = 'l2'
        #Cs = 0.001
        #weights = None
        penalties = 'l1'
        Cs = 10
        weights = None
        params_dict = {'C': Cs, 'class_weight': weights, 'penalty': penalties}
        print(params_dict)
        acc_list = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            acc_list.append(
                train_logist(x_train, y_train, x_valid, y_valid, params_dict))
        print('Summary of Results.')
        print('*-----------------------------*')
        # BUG FIX: the loop performs 10 runs, but the message claimed 20.
        print('Average accuracy over 10 runs: %.2f' % numpy.mean(acc_list))
def train():
    """Train the seq2seq encoder/attention-decoder model.

    Reads all hyper-parameters from the module-global ``args`` namespace and
    relies on ``load_data``/``train_input_fn``/``Encoder``/``Decoder`` defined
    elsewhere in the project.  Checkpoints are written under
    ``args.checkpoint``.  No return value.
    """
    # args is a global variable in this task
    BATCH_SIZE = args.batch_size
    EPOCH = args.epoch
    EMBED_DIM = args.embeddingDim
    MAXLEN = args.maxLen
    NUM_UNITS = args.units
    LEARNING_RATE = args.learning_rate
    DROPOUT = args.dropout
    METHOD = args.method
    GPUNUM = args.gpuNum
    CKPT = args.checkpoint
    LIMIT = args.limit
    start_word = "<s>"
    end_word = "</s>"
    # Here, tokenizer saves all info to split data.
    # Itself is not a part of data.
    train_source_tensor, train_source_tokenizer, train_target_tensor, train_target_tokenizer = \
        load_data(pad_length=MAXLEN, limit=LIMIT)
    buffer_size = len(train_source_tensor)
    train_source_tensor, val_source_tensor, train_target_tensor, val_target_tensor = \
        train_test_split(train_source_tensor, train_target_tensor, random_state=2019)
    # TODO: check if we need target tokenizer
    training_steps = len(train_source_tensor) // BATCH_SIZE
    # +1 for the padding index 0, which word_index does not contain.
    vocab_source_size = len(train_source_tokenizer.word_index) + 1
    print("vocab_input_size: ", vocab_source_size)
    vocab_target_size = len(train_target_tokenizer.word_index) + 1
    print("vocab_target_size: ", vocab_target_size)
    # Global step; incremented by optimizer.apply_gradients below.
    step = tf.Variable(0, trainable=False)
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate=0.001)
    # set up checkpoint
    if not os.path.exists(CKPT):
        os.makedirs(CKPT)
    else:
        print(
            "Warning: current Checkpoint dir already exist! ",
            "\nPlease consider to choose a new dir to save your checkpoint!")
    # BUG FIX: the keyword was misspelled "optimzier", so the optimizer state
    # was tracked under a bogus attribute name in saved checkpoints.
    # NOTE(review): encoder/decoder weights are still NOT tracked by this
    # checkpoint -- confirm whether that is intentional.
    checkpoint = tf.train.Checkpoint(optimizer=optimizer)
    checkpoint_prefix = os.path.join(CKPT, "ckpt")
    dataset = train_input_fn(train_source_tensor, train_target_tensor,
                             buffer_size, EPOCH, BATCH_SIZE)
    apply_loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    encoder = Encoder(vocab_source_size,
                      EMBED_DIM,
                      NUM_UNITS,
                      dropout_rate=DROPOUT,
                      batch_size=BATCH_SIZE)
    decoder = Decoder(vocab_target_size,
                      EMBED_DIM,
                      NUM_UNITS,
                      batch_size=BATCH_SIZE,
                      method=None,
                      dropout_rate=DROPOUT)

    def train_wrapper(source, target):
        """Run one teacher-forced batch; returns the mean per-step loss."""
        with tf.GradientTape() as tape:
            # encoder_state is re-initialized per epoch in the loop below.
            source_out, source_state = encoder(source, encoder_state,
                                               activation="tanh")
            # Seed the decoder with the <s> token for every sequence.
            initial = tf.expand_dims(
                [train_target_tokenizer.word_index[start_word]] * BATCH_SIZE,
                1)
            attention_state = tf.zeros((BATCH_SIZE, 1, EMBED_DIM))
            # cur_total_loss is a sum of loss for current steps, namely batch loss
            cur_total_loss, cur_loss = 0, 0
            for i in range(1, target.shape[1]):
                output_state, source_state, attention_state = decoder(
                    initial, source_state, source_out, attention_state)
                # TODO: check for the case where target is 0
                cur_loss = apply_loss(target[:, i], output_state)
                # 0 should be the padding value in target; mask padded
                # positions (binary 0/1 mask) out of the loss.
                mask = tf.math.logical_not(tf.math.equal(target[:, i], 0))
                mask = tf.cast(mask, dtype=cur_loss.dtype)
                cur_loss *= mask
                cur_total_loss += tf.reduce_mean(cur_loss)
                # Teacher forcing: feed the ground-truth token to the next step.
                initial = tf.expand_dims(target[:, i], 1)
            batch_loss = cur_total_loss / target.shape[1]
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(cur_total_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables), global_step=step)
        return batch_loss

    for epoch in range(EPOCH):
        per_epoch_loss = 0
        start = time.time()
        # Re-initialize the 4-layer encoder LSTM state each epoch.
        encoder_hidden = encoder.initialize_hidden_state()
        encoder_ceil = encoder.initialize_cell_state()
        encoder_state = [[encoder_hidden, encoder_ceil],
                         [encoder_hidden, encoder_ceil],
                         [encoder_hidden, encoder_ceil],
                         [encoder_hidden, encoder_ceil]]
        # TODO : Double check to make sure all re-initialization is performed
        for idx, data in enumerate(dataset.take(training_steps)):
            source, target = data
            cur_total_loss = train_wrapper(source, target)
            per_epoch_loss += cur_total_loss
            if idx % 10 == 0:
                print("current learning rate is:" +
                      str(optimizer._learning_rate))
                print('Epoch {}/{} Batch {}/{} Loss {:.4f}'.format(
                    epoch + 1, EPOCH, idx + 10, training_steps,
                    cur_total_loss.numpy()))
                # Halve the learning rate once past the first few global steps.
                if step >= 5:
                    optimizer._learning_rate /= 2.0
        print('Epoch {}/{} Total Loss per epoch {:.4f} - {} sec'.format(
            epoch + 1, EPOCH, per_epoch_loss / training_steps,
            time.time() - start))
        # TODO: for evaluation add bleu score
        if epoch % 10 == 0:
            print('Saving checkpoint for each 10 epochs')
            checkpoint.save(file_prefix=checkpoint_prefix)
def test_autoencoder(finetune_lr=0.01,momentum=0.5,training_epochs=30,dataset='grayscale.pkl.gz',batch_size=10,pretrain='output/gray_pre.save',model_save='output/gray.save'):
    """ Take pre-trained models as input. Fold the network and fine-tune weights.

    :type finetune_lr: float
    :param finetune_lr: learning rate used in the finetune stage

    :type momentum: float
    :param momentum: momentum used in the finetune stage

    :type training_epochs: int
    :param training_epochs: maximal number of iterations to run the optimizer

    :type dataset: string
    :param dataset: path to the pickled dataset

    :type batch_size: int
    :param batch_size: the size of a minibatch

    :type pretrain: string
    :param pretrain: path of the pickled, pre-trained stacked RBM

    :type model_save: string
    :param model_save: path the fine-tuned autoencoder is pickled to
    """
    print 'loading data'
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    x_mean = datasets[3]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # numpy random generator
    numpy_rng = numpy.random.RandomState(123)

    # load trained model
    print 'loading the model'
    f = file(pretrain,'rb')
    s_rbm = cPickle.load(f)
    f.close()
    # NOTE(review): the next statement is a bare attribute access with no
    # effect -- likely leftover debug code.
    s_rbm.rbm_layers
    n_layers_rbm = s_rbm.n_layers

    # Unfold the pre-trained stacked RBMs into an autoencoder for fine-tuning.
    bb = AutoEncoder(None, numpy_rng,s_rbm.rbm_layers,n_layers_rbm)
    #return bb
    print 'getting the fine-tuning functions'
    train_fn, validate_model, test_model = bb.build_finetune_functions(
        datasets=datasets,
        batch_size=batch_size,
        learning_rate=finetune_lr,
        momentum=momentum
    )

    print '... fine-tuning the model'
    # early-stopping parameters
    patience = 10 * n_train_batches  # look as this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is
                            # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                            # go through this many
                            # minibatches before checking the network
                            # on the validation set; in this case we
                            # check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0
    print n_train_batches
    print patience, patience_increase, validation_frequency, best_validation_loss
    # NOTE(review): the done_looping early-stop condition is deliberately
    # commented out, so training always runs the full training_epochs.
    while (epoch < training_epochs):  # and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_fn(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                validation_losses = validate_model()
                this_validation_loss = numpy.mean(validation_losses)
                print(
                    'epoch %i, minibatch %i/%i, validation error %f ' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss
                    )
                )
                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    # test it on the test set
                    test_losses = test_model()
                    test_score = numpy.mean(test_losses)
                    print((' epoch %i, minibatch %i/%i, test error of '
                           'best model %f ') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score))
            # patience exhausted: set the (currently unused) stop flag and
            # leave the minibatch loop for this epoch
            if patience <= iter:
                done_looping = True
                break

    # Persist the fine-tuned autoencoder.
    f = file(model_save,'wb')
    cPickle.dump(bb,f,protocol=cPickle.HIGHEST_PROTOCOL)
    f.close()
    return bb
def train(args):
    """Build and train (or evaluate) the CIFAR-style TF1 model.

    For each learning rate in the sweep list, constructs a fresh graph,
    trains for ``args.epochs`` epochs (unless ``args.eval`` is set, in which
    case the latest checkpoint is restored and evaluated on the test split),
    logging summaries and saving checkpoints per learning rate.
    """
    images, labels = load_data(True)
    train_dset, val_dset = prepare_dataset(images,
                                           labels,
                                           True,
                                           augment=args.augment)
    # images, labels = load_data(False)
    # test_dset = prepare_dataset(images, labels, False)
    # learning_rate = (1 + 11*np.random.random(10)) * 1e-5
    # learning_rate = [7.453742807604199e-05]  # augment
    learning_rate = [0.00021837457574664458]  # no augment
    for lr in learning_rate:
        tf.reset_default_graph()
        graph = tf.Graph()
        with graph.as_default():
            global_step = tf.Variable(0, name='global_step', trainable=False)
            # global_step = tf.get_variable('global_step', shape=[None], dtype=tf.int32, initializer=tf.constant_initializer(0), trainable=False)
            # input placeholder -- augmentation crops 32x32 down to 24x24
            input_shape = (32, 32) if not args.augment else (24, 24)
            X = tf.placeholder(tf.float32, [None, *input_shape, 3],
                               name='images_ph')
            Y = tf.placeholder(tf.int32, [None], name='labels_ph')
            logits_op, loss_op, acc_op, train_op = build_model(
                X, Y, lr, global_step, lrn=args.lrn,
                full_model=args.full_model)
            # add variables to summary; one log dir per learning rate
            log_dir = os.path.join(args.log_dir, 'lr-{:.8f}'.format(lr), '')
            if not args.eval:
                rm_dir(log_dir)  # start each training run with a clean log dir
            tf.summary.scalar('loss', loss_op)
            tf.summary.scalar('train_acc', acc_op)
            tf.summary.histogram('loss', loss_op)
            summary_op = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(log_dir, graph)
            # model saver -- eval mode restores from args.save_dir directly
            save_dir = os.path.join(args.save_dir, 'lr-{:.8f}'.format(lr),
                                    '') if not args.eval else args.save_dir
            print('save at {}'.format(save_dir))
            saver = tf.train.Saver(filename=save_dir)
        # run training session
        config = tf.ConfigProto(log_device_placement=False)
        with tf.Session(graph=graph, config=config) as sess:
            sess.run(tf.global_variables_initializer())
            if args.eval:
                print('Resume from {}'.format(args.save_dir))
                saver.restore(sess, tf.train.latest_checkpoint(save_dir))
                step = sess.run(global_step)
                # NOTE(review): placed inside the eval branch because `step`
                # is only defined here -- confirm original intent.
                print('lr={}, normalization: {}, initializer:{}, step: {}'.
                      format(lr, args.lrn, args.init, step))
            if not args.eval:
                for e in range(args.epochs):
                    print('Epoch {}'.format(e))
                    for x_batch, y_batch in train_dset:
                        _, acc, summary = sess.run(
                            [train_op, acc_op, summary_op],
                            feed_dict={
                                X: x_batch,
                                Y: y_batch
                            })
                        step = sess.run(global_step)
                        summary_writer.add_summary(summary, step)
                    # checkpoint and validate once per epoch
                    saver.save(sess, save_dir, global_step=global_step)
                    acc, num_correct, num_samples = check_accuracy(
                        sess, val_dset, X, logits_op)
                    step = sess.run(global_step)
                    print('step:{}, val acc: {:.2%} ({}/{})'.format(
                        step, acc, num_correct, num_samples))
            else:
                # evaluation path: score the restored model on the test split
                images, labels = load_data(False)
                test_dset = prepare_dataset(images,
                                            labels,
                                            train=False,
                                            augment=args.augment)
                acc, num_correct, num_samples = check_accuracy(
                    sess, test_dset, X, logits_op)
                step = sess.run(global_step)
                print('step:{}, test acc: {:.2%} ({}/{})'.format(
                    step, acc, num_correct, num_samples))
def test_train_agent_offline():
    """Train an actor-critic recommender agent on the offline MovieLens data.

    Builds the offline RecSim environment from rating/embedding CSVs, trains
    the DDPG-style agent, and writes the run configuration to
    ``<model_folder>/config.json``.  All hyper-parameters are hard-coded
    locals below.  No return value.
    """
    path = 'dataset'
    # Dataset selection: MovieLens 100k vs 1M file names.
    use_100k = True
    if use_100k:
        rating_file_name = 'process_small_rating.csv'
        embedding_file_name = 'movie_embedding_features.csv'
    else:
        rating_file_name = 'process_1m_rating.csv'
        embedding_file_name = 'movie_embedding_features_1m.csv'
    np.random.seed(0)  # reproducible runs
    # Leftovers of a hyper-parameter sweep; only single values are used now.
    # list_slate = [3,5,7,9]
    # list_time_budget = [2,4,6,8]
    # list_num_candidate = [20,30]
    list_slate = [5]
    list_time_budget = [4]
    list_num_candidate = [20]
    # Environment parameters.
    slate_size = 5
    time_budget = -4  # negative value -- presumably a sentinel enabling time_budget_range; confirm
    time_budget_range = [2, 6]
    num_candidates = 30
    min_num_positive_rating = 40
    min_num_rating = 70
    test_mode = False
    offline_mode = True
    rating_pivot = 4
    resample_documents = True
    #agent params
    embedding_size = 30
    num_positive_hisotry_items = 10
    num_action_vector = 1
    s_dim = num_positive_hisotry_items * embedding_size  # state vector size
    a_dim = num_action_vector * embedding_size  # action vector size
    actor_lr = 0.001
    critic_lr = 0.001
    hidden_layer_1 = 32
    hidden_layer_2 = 16
    tau = 0.01
    batch_size = 32
    gamma = 0.75
    buffer_size = 20000
    # train agent
    max_eps = 50
    sample_user_randomly = False
    max_num_user = 100
    save_frequenly = 10
    model_folder = "model_test"
    # config_file =
    save_model_path = "save_model"
    log_path = "logs/scalars/"
    history_path = "history_log"
    model_count = 0
    # for slate_size in list_slate:
    #     for time_budget in list_time_budget:
    #         for num_candidates in list_num_candidate:
    #             curret_model_path = os.path.join(save_model_path, str(slate_size), str(time_budget), str(num_candidates))
    #             current_log_path = os.path.join(log_path, str(slate_size), str(time_budget), str(num_candidates))
    #             current_history_path = os.path.join(history_path, str(slate_size), str(time_budget), str(num_candidates))
    curret_model_path = os.path.join(model_folder, save_model_path)
    current_log_path = os.path.join(model_folder, log_path)
    current_history_path = os.path.join(model_folder, history_path)
    # Create output directories on first run.
    if not os.path.exists(curret_model_path):
        print("make model path")
        os.makedirs(curret_model_path)
    if not os.path.exists(current_log_path):
        os.makedirs(current_log_path)
    if not os.path.exists(current_history_path):
        os.makedirs(current_history_path)
    with tf.Session() as sess:
        # train_mode = True
        # build enviroment
        format_data = data_preprocess.load_data(path,
                                                file_name=rating_file_name)
        features_embedding_movies = pd.read_csv(
            os.path.join(path, embedding_file_name))
        # Keep only users with enough positive ratings.
        positive_user_ids, positive_history_data = data_preprocess.get_user_positive(
            format_data, min_num_positive_rating)
        print("unique user id : ", len(np.unique(positive_user_ids)))
        print(len(positive_history_data))
        # restrict number of total rating
        positive_user_ids, positive_history_data = data_preprocess.generate_new_dataset(
            positive_history_data, num_rating_pivot=min_num_rating)
        print("unique user id : ", len(np.unique(positive_user_ids)))
        print(len(positive_history_data))
        # generate train and test set
        train_set, test_set = data_preprocess.generate_train_test_data(
            positive_history_data)
        users_history_data, train_set = data_preprocess.create_recent_history(
            train_set)
        # check the train set and test set quality (ratings-per-user stats)
        user_details = train_set.groupby('userId').size().reset_index()
        user_details.columns = ['userId', 'number of rating']
        print("train set quality : ", user_details.describe())
        user_details = test_set.groupby('userId').size().reset_index()
        user_details.columns = ['userId', 'number of rating']
        print("test set quality : ", user_details.describe())
        # Choose which split the offline user simulator replays.
        offline_dataset = train_set
        if test_mode:
            offline_dataset = test_set
        sampler = LTSDocumentSampler(dataset=features_embedding_movies,
                                     num_candidate=num_candidates)
        user_sampler = LTSStaticUserSampler(
            users_history_data,
            features_embedding_movies,
            offline_data=offline_dataset,
            offline_mode=offline_mode,
            time_budget=time_budget,
            random=sample_user_randomly,
            time_budget_range=time_budget_range)
        # need to handle where we update dataset with num candidate < available
        func_select_train_set = select_dataset(features_embedding_movies,
                                               train_set)
        func_select_test_set = select_dataset(features_embedding_movies,
                                              test_set)
        # user_train_set = func(user_id=39)
        # print(len(user_train_set))
        LTSUserModel = UserModel(user_sampler,
                                 offline_mode=offline_mode,
                                 rating_pivot=rating_pivot,
                                 slate_size=slate_size,
                                 response_ctor=LTSResponse)
        ltsenv = CustomSingleUserEnviroment(
            LTSUserModel,
            sampler,
            num_candidates,
            slate_size,
            resample_documents=resample_documents,
            offline_mode=offline_mode,
            select_subset_func=func_select_train_set)
        # In test mode, rebuild the environment over the test split instead.
        if test_mode:
            ltsenv = CustomSingleUserEnviroment(
                LTSUserModel,
                sampler,
                num_candidates,
                slate_size,
                resample_documents=resample_documents,
                offline_mode=offline_mode,
                select_subset_func=func_select_test_set)
        lts_gym_env = recsim_gym.RecSimGymEnv(ltsenv,
                                              clicked_engagement_reward)
        # simulated environment
        # build agent (actor-critic with replay buffer and exploration noise)
        actor = Actor(sess, s_dim, a_dim, batch_size, slate_size,
                      embedding_size, tau, actor_lr, hidden_layer_1,
                      hidden_layer_2)
        critic = Critic(sess, s_dim, a_dim, slate_size, embedding_size, gamma,
                        tau, critic_lr, hidden_layer_1, hidden_layer_2)
        buffer = RelayBuffer(buffer_size, s_dim, a_dim)
        noise_model = Noise(a_dim)
        agent = Actor_Critic_Agent(sess, lts_gym_env.observation_space,
                                   lts_gym_env.action_space, actor, critic,
                                   buffer, noise_model, slate_size,
                                   embedding_size)
        #train section
        # max_num_user = len(np.unique(users_history_data['userId']))
        history = agent.train(max_eps, max_num_user, batch_size, lts_gym_env,
                              save_frequenly, curret_model_path,
                              current_log_path, current_history_path)
        # # # print(history.keys())
        # history_table = pd.DataFrame(history)
        # history_table.to_csv(os.path.join(current_history_path,"history_record.csv"),index=False)
        # # print("finish training for model : ",model_count )
        # model_count += 1
        #evaluate section
        # Persist the full run configuration for later evaluation.
        config_info = {
            "use_teriminal_info": True,
            "use_100k": use_100k,
            "slate_size": slate_size,
            "num_candidates": num_candidates,
            "time_budget": time_budget,
            "time_budget_range": time_budget_range,
            "min_num_rating": min_num_rating,
            "min_num_positive_rating": min_num_positive_rating,
            "actor_lr": actor_lr,
            "critic_lr": critic_lr,
            "hidden_layer_1": hidden_layer_1,
            "hidden_layer_2": hidden_layer_2,
            "batch_size": batch_size,
            "tau": tau,
            "gamma": gamma,
            "buffer_size": buffer_size,
            "max_eps": max_eps,
            "max_num_user": max_num_user,
            "sample_user_randomly": sample_user_randomly,
            "save_frequenly": save_frequenly,
        }
        config_file_name = 'config.json'
        with open(os.path.join(model_folder, config_file_name), 'w') as fp:
            json.dump(config_info, fp, indent=4)
) # top_k tf.flags.DEFINE_integer("top_k", 1, "Allow evaluate ranking") FLAGS = tf.flags.FLAGS # FLAGS._parse_flags() # print("\nParameters:") # for attr, value in sorted(FLAGS.__flags.items()): # print("{}={}".format(attr.upper(), value)) # print("") # CHANGE THIS: Load data. Load your own data here # TODO: Modify Eval_train if FLAGS.eval_train: x_raw, y_raw = data_preprocess.load_data(FLAGS.data_file, FLAGS.class_file, FLAGS.char) class_vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "class_voca") class_processor = learn.preprocessing.VocabularyProcessor.restore( class_vocab_path) y_test = np.array(list(class_processor.transform(y_raw)), dtype="float32") y_test = y_test.ravel() # y_test = np.argmax(y_test, axis=1) else: x_raw = [ "a masterpiece four years in the making", "everything is off.", "what the f**k", "i love you", "hello, ma friend?", "go to hell", "do you want to be killed?" ] y_test = [1, 0, 0, 1, 1, 0, 0] # Map data into vocabulary
def test_DBN(finetune_lr=0.1, pretraining_epochs=50, pretrain_lr=0.001, k=1, training_epochs=1, dataset='grayscale.pkl.gz', batch_size=10, hidden_layers_sizes=[1000, 200, 50], pretrain_model='gray_pre1.save', logfile='newLog'): """ Demonstrates how to train and test a Deep Belief Network. This is demonstrated on MNIST. :type finetune_lr: float :param finetune_lr: learning rate used in the finetune stage :type pretraining_epochs: int :param pretraining_epochs: number of epoch to do pretraining :type pretrain_lr: float :param pretrain_lr: learning rate to be used during pre-training :type k: int :param k: number of Gibbs steps in CD/PCD :type training_epochs: int :param training_epochs: maximal number of iterations ot run the optimizer :type dataset: string :param dataset: path the the pickled dataset :type batch_size: int :param batch_size: the size of a minibatch """ f = open(logfile, "w") datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # numpy random generator numpy_rng = numpy.random.RandomState(123) print >> f, '... building the model' # construct the Deep Belief Network dbn = DBN( numpy_rng=numpy_rng, n_ins=48 * 64, # 26 * 56 hidden_layers_sizes=hidden_layers_sizes, n_outs=4) # start-snippet-2 ######################### # PRETRAINING THE MODEL # ######################### print >> f, '... getting the pretraining functions' pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x, batch_size=batch_size, k=k) print >> f, '... 
pre-training the model' best_obj = -99999999 start_time = time.clock() ## Pre-train layer-wise for i in xrange(dbn.n_layers): # go through pretraining epochs for epoch in xrange(pretraining_epochs): # go through the training set c = [] for batch_index in xrange(n_train_batches): c.append(pretraining_fns[i](index=batch_index, lr=pretrain_lr)) cost_e = numpy.mean(c) print >> f, 'Pre-training layer %i, epoch %d, cost ' % (i, epoch), print >> f, cost_e if cost_e > best_obj: best_obj = cost_e end_time = time.clock() # end-snippet-2 ff = file(pretrain_model, 'wb') cPickle.dump(dbn, ff, protocol=cPickle.HIGHEST_PROTOCOL) ff.close print >> f, ('The pretraining code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) print >> f, ("final pretraining cost: %f" % best_obj) f.close() # sys.stdout.close() return best_obj ######################## # FINETUNING THE MODEL # ######################## # get the training, validation and testing function for the model print '... getting the finetuning functions' train_fn, validate_model, test_model = dbn.build_finetune_functions( datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr) print '... finetuning the model' # early-stopping parameters patience = 4 * n_train_batches # look as this many examples regardless patience_increase = 2. # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatches before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf test_score = 0. 
start_time = time.clock() done_looping = False epoch = 0 while (epoch < training_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = train_fn(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: validation_losses = validate_model() this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if (this_validation_loss < best_validation_loss * improvement_threshold): patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = test_model() test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print(('Optimization complete with best validation score of %f %%, ' 'obtained at iteration %i, ' 'with test performance %f %%') % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The fine tuning code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def main(options):
    """End-to-end train/evaluate pipeline for the 'deep_twitter' ranking model.

    Loads (or generates and saves) train/test datasets and vocabularies,
    holds out a validation split, trains the attention model with early
    stopping, then rebuilds the model with a merged train+test word
    embedding matrix to score the test set, and writes TREC-format
    predictions which are evaluated for MAP/P30/MRR.

    ``options`` holds command-line overrides that are merged into the
    default argument dict.
    """
    args = get_default_args()
    load_best_args(args, options, get_best_args())
    set_args(args, options)
    print_args(args)
    mode = args['mode']
    train_name, test_name = args['split']['train'], args['split']['test']
    # 'train_all' means: train on every TREC year except the held-out test year.
    if train_name == 'train_all':
        train_set = ['trec-2011', 'trec-2012', 'trec-2013', 'trec-2014']
        train_set.remove(test_name)
    else:
        train_set = [train_name]
    test_set = test_name
    print('train_set: {}, test_set: {}'.format(train_set, test_set))
    # Per-field maximum lengths, filled in by gen_data (defaultdicts so
    # unseen field names start at 0).
    max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict(
        int), defaultdict(int)
    vocab = {'word': {}, '3gram': {}, 'url': {}}
    test_vocab = {'word': {}, '3gram': {}, 'url': {}}
    ############################# LOAD DATA ##################################
    data_name = ("data_m%s_%s_%s" % (mode, train_name, test_name)).lower()
    if args["load_data"]:
        # Reload a previously generated experiment directory.
        train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True)
        test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, test_name),
            False)
        print('load dataset successfully')
    else:
        # Build datasets from raw data; gen_data also populates vocab,
        # test_vocab and the max_*_len dicts in place.
        train_dataset = gen_data(args["raw_data"], train_set, vocab,
                                 test_vocab, True, max_query_len, max_doc_len,
                                 max_url_len, args)
        print("create training set successfully...")
        test_dataset = gen_data(args["raw_data"], [test_set], vocab,
                                test_vocab, False, max_query_len, max_doc_len,
                                max_url_len, args)
        train_vocab_emb, test_vocab_emb = construct_vocab_emb(
            "%s/%s" % (args["experimental_data"], data_name), vocab['word'],
            test_vocab['word'], 300, "word",
            base_embed_path=args["base_embed_path"])
        save_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True, train_dataset, max_query_len, max_doc_len, max_url_len,
            vocab, train_vocab_emb)
        print("save training set successfully...")
        save_data("%s/%s/%s" % (args["experimental_data"], data_name,
                                test_name),
                  False,
                  test_dataset,
                  vocab=test_vocab,
                  vocab_emb=test_vocab_emb)
        print("save test set successfully...")
    val_split = args['val_split']
    num_samples, _ = train_dataset["query_word_input"].shape
    # randomly sample queries and all their documents if query_random is True
    # otherwise, query-doc pairs are randomly sampled
    query_random = True
    if query_random:
        val_indices = sample_val_set(args["raw_data"], train_set, val_split)
    else:
        # Rejection-sample distinct row indices for the validation split.
        val_indices, val_set = [], set()
        for i in range(int(num_samples * val_split)):
            val_index = np.random.randint(num_samples)
            while val_index in val_set:
                val_index = np.random.randint(num_samples)
            val_indices.append(val_index)
            val_set.add(val_index)
    # Move the sampled rows out of train_dataset into val_dataset.
    val_dataset = {}
    for key in train_dataset:
        val_dataset[key] = train_dataset[key][val_indices]
        train_dataset[key] = np.delete(train_dataset[key], val_indices, 0)
    # shuffle the train dataset explicitly to make results reproducible
    # whether the performance will be affected remains a question
    keys, values = [], []
    for key in train_dataset:
        keys.append(key)
        values.append(train_dataset[key])
    # Zip all fields row-wise so one shuffle permutes every field identically.
    zipped_values = list(zip(*values))
    random.shuffle(zipped_values)
    shuffled_values = list(zip(*zipped_values))
    for i, key in enumerate(keys):
        train_dataset[key] = np.array(shuffled_values[i])
    # NOTE(review): the format string has only two placeholders but three
    # arguments — query_word_input samples are never printed; confirm intent.
    print('after shuffle: id {}, sim {}, query_word_input'.format(
        train_dataset['id'][:3], train_dataset['sim'][:3],
        train_dataset['query_word_input'][:3]))
    # merge the vocabulory of train and test set
    merged_vocab = {'url': vocab['url'], '3gram': vocab['3gram']}
    merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word'])
    # NOTE(review): prints test_vocab's 3gram size, not merged_vocab's —
    # possibly intentional, verify.
    print("merged vocab: word(%d) 3gram(%d)" % (len(merged_vocab['word']),
                                                len(test_vocab['3gram'])))
    vocab_inv, vocab_size = {}, {}
    vocab['char'] = merge_two_dicts(vocab['3gram'], vocab['url'])
    test_vocab['char'] = merge_two_dicts(test_vocab['3gram'],
                                         test_vocab['url'])
    merged_vocab['char'] = merge_two_dicts(vocab['char'], test_vocab['char'])
    # Inverse lookup uses the merged vocab, but sizes are taken from the
    # train-only vocab (test-OOV words are appended later before testing).
    for key in vocab:
        vocab_inv[key] = invert_dict(merged_vocab[key])
        vocab_size[key] = len(vocab[key])
    print(vocab_size)
    # Print data samples for debug purpose
    print_dataset(mode, train_dataset, vocab_inv)
    print_dataset(mode, test_dataset, vocab_inv)
    ############################ TRAIN MODEL #################################
    model = None
    if mode == 'deep_twitter':
        model = create_attention_model(max_query_len,
                                       max_doc_len,
                                       max_url_len,
                                       vocab_size,
                                       train_vocab_emb,
                                       args["nb_filters"],
                                       embed_size=300,
                                       dropout_rate=args['dropout'],
                                       trainable=args["trainable"],
                                       weighting=args['weighting'],
                                       mask=args["mask"],
                                       conv_option=args['conv_option'],
                                       model_option=args['model_option'])
    # Encode the hyper-parameter configuration into the checkpoint name.
    model_name = (
        "model_N%s_data%s_mo%s_c%s_NumFilter%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f"
        % (mode, train_name, args['model_option'], args['conv_option'],
           args["nb_filters"], args["trainable"], args['dropout'],
           args['weighting'], args['mask'], args['batch_size'],
           args['val_split'])).lower()
    model_path = "%s/%s/%s" % (args['experimental_data'], data_name,
                               model_name)
    print(model_path)
    if args['optimizer'] == "adam":
        opt = optimizers.Adam(lr=args["learning_rate"],
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=None,
                              decay=0.0,
                              amsgrad=True)
        print('use Adam optimizer')
    elif args['optimizer'] == "sgd":
        opt = optimizers.SGD(lr=args["learning_rate"],
                             decay=1e-6,
                             momentum=0.9,
                             nesterov=True)
        print('use SGD optimizer')
    elif args['optimizer'] == 'rmsprop':
        opt = optimizers.RMSprop(lr=args["learning_rate"],
                                 rho=0.9,
                                 epsilon=None,
                                 decay=0.0)
        print('use RMSprop optimizer')
    # NOTE(review): if args['optimizer'] is none of the above, `opt` is
    # unbound and the next line raises NameError — confirm allowed values.
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    print(model.summary())
    print('model init weights sum: %.4f' % get_model_weights(model))
    if not args['load_model']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=4)
        checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights",
                                     monitor='val_loss',
                                     save_best_only=True,
                                     verbose=1)
        lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.5,
                                       patience=2,
                                       min_lr=0.0001,
                                       verbose=1)
        #print(train_dataset['id'][:3], val_dataset['id'][:3], val_dataset['id'][-3:])
        # shuffle=False: dataset was already shuffled above for
        # reproducibility.
        model.fit(train_dataset,
                  train_dataset['sim'],
                  validation_data=(val_dataset, val_dataset['sim']),
                  batch_size=args['batch_size'],
                  epochs=args['epochs'],
                  shuffle=False,
                  callbacks=[checkpoint, lr_reducer, early_stopping],
                  verbose=args['verbose'])
    ############################ TEST MODEL #################################
    print('load best model from %s.best.weights' % model_path)
    model.load_weights("%s.best.weights" % model_path)
    if mode == 'deep_twitter':
        # load trained vocab embedding.
        trained_vocab_emb = model.get_layer('sequential_2').get_weights()[0]
        # merge trained vocab embedding with test OOV word embeddings:
        # rows [0, len(train vocab)) come from training, the rest from the
        # pretrained test embeddings.
        merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300))
        merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb
        merged_vocab_emb[
            len(vocab['word']):len(merged_vocab['word']), :] = test_vocab_emb
        for key in vocab:
            vocab_size[key] = len(merged_vocab[key])
        print(vocab_size)
        # Rebuild the model with the enlarged embedding table, then copy
        # every trained layer except the embedding layer itself.
        new_model = create_attention_model(max_query_len,
                                           max_doc_len,
                                           max_url_len,
                                           vocab_size,
                                           merged_vocab_emb,
                                           args["nb_filters"],
                                           embed_size=300,
                                           dropout_rate=args['dropout'],
                                           trainable=args["trainable"],
                                           weighting=args['weighting'],
                                           mask=args["mask"],
                                           conv_option=args['conv_option'],
                                           model_option=args['model_option'])
        new_model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
        print(new_model.summary())
        num_layers = 0
        for layer in model.layers:
            num_layers += 1
        for layer_id in range(num_layers):
            layer = model.layers[layer_id]
            if layer.name != 'sequential_2':
                new_model.layers[layer_id].set_weights(layer.get_weights())
        print('copy weight done.')
    predictions = new_model.predict(test_dataset)
    print(predictions[:10])
    # Write "<id> <score> <run-tag>" lines for TREC evaluation.
    predictions_file = "%s/%s/predictions_%s.txt" % (
        args["experimental_data"], data_name, model_name)
    with open(predictions_file, 'w') as f:
        for i in range(test_dataset['id'].shape[0]):
            f.write("%s %.4f %s\n" % (test_dataset['id'][i], predictions[i],
                                      args['mode']))
    print('write predictions with trec format to %s' % predictions_file)
    # NOTE(review): `map` shadows the builtin; harmless here but rename
    # (e.g. map_score) on next touch.
    map, mrr, p30 = evaluate(predictions_file, args["qrels_file"])
    print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
def sgd_optimization_mnist(learning_rate=0.13, momentum=0, n_epochs=25,
                           dataset='grayscale_seg_binary_data.pkl.gz',
                           batch_size=50):
    """
    Demonstrate stochastic gradient descent (with momentum) optimization of
    a log-linear model, originally demonstrated on MNIST but here applied to
    48x64 grayscale segmentation data with 2 output classes.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type momentum: float
    :param momentum: momentum coefficient for the SGD velocity updates

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the dataset file (same pickled format as
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz)

    :type batch_size: int
    :param batch_size: number of examples per minibatch
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    # (Python 2 integer division: any trailing partial batch is dropped)
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    # numpy.array(train_set_y.eval()).max() + 1
    # n_in=48*64: rasterized 48x64 input images; n_out=2: binary labels.
    classifier = LogisticRegression(input=x, n_in=48 * 64, n_out=2)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W,b)
    # g_W = T.grad(cost=cost, wrt=classifier.W)
    # g_b = T.grad(cost=cost, wrt=classifier.b)
    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    # updates = [(classifier.W, classifier.W - learning_rate * g_W),
    #            (classifier.b, classifier.b - learning_rate * g_b)]
    # LK change: sgd with momentum. (tested. okay)
    # updates = [(classifier.model_update_W, momentum * classifier.model_update_W - learning_rate * g_W),
    #            (classifier.model_update_b, momentum * classifier.model_update_b - learning_rate * g_b),
    #            (classifier.W, classifier.W + classifier.model_update_W),
    #            (classifier.b, classifier.b + classifier.model_update_b)]
    # LK change2: sgd with momentum 2.
    grads = T.grad(cost, classifier.params)
    # updates1 = [
    #     (model_update_i, model_update_i * momentum - learning_rate * grad_i)
    #     for model_update_i, grad_i in zip(classifier.model_update, grads)
    # ]
    # updates2 = [
    #     (param_i, param_i + model_update_i)
    #     for param_i, model_update_i in zip(classifier.params, classifier.model_update)
    # ]
    # Momentum SGD: both update lists read the *old* velocity (Theano applies
    # all updates from the pre-update values), so param gets
    # old_velocity*momentum - lr*grad and velocity is refreshed in lock-step.
    updates1 = [(param_i,
                 param_i + model_update_i * momentum - learning_rate * grad_i)
                for param_i, model_update_i, grad_i in zip(
                    classifier.params, classifier.model_update, grads)]
    updates2 = [
        (model_update_i, model_update_i * momentum - learning_rate * grad_i)
        for model_update_i, grad_i in zip(classifier.model_update, grads)
    ]
    updates = updates1 + updates2

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print '... training the model'
    # early-stopping parameters
    patience = 5000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)

                    print((' epoch %i, minibatch %i/%i, test error of'
                           ' best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%,'
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print 'The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.1fs' % ((end_time - start_time)))
# # # update target networks # self.actor.update_target_network() # self.critic.update_target_network() # # return np.amax(q_value), critic_loss path = '../master_capston/the-movies-dataset/' features_embedding_movies = pd.read_csv( os.path.join(path, 'movie_embedding_features.csv')) sampler = LTSDocumentSampler(dataset=features_embedding_movies) slate_size = 3 num_candidates = 15 format_data = data_preprocess.load_data(path) # print(format_data.head()) # print(format_data.shape) features_embedding_movies = pd.read_csv( os.path.join(path, 'movie_embedding_features.csv')) positive_user_ids, positive_history_data = data_preprocess.get_user_positive( format_data) user_sampler = LTSStaticUserSampler(positive_user_ids, positive_history_data, features_embedding_movies) LTSUserModel = UserModel(user_sampler, slate_size, LTSResponse) ltsenv = environment.Environment(LTSUserModel, sampler, num_candidates, slate_size, resample_documents=True)
def build_SVM(filename, option, svm_type=None, poly_degree=None):
    """Load the QM-descriptor dataset and run one of several SVM workflows.

    Fixes applied: ``== None`` comparisons replaced with ``is None``
    (PEP 8), unused loop counters renamed to ``_``, spaces before call
    parentheses removed, and the 'Searchig' typo in a status message
    corrected.

    :param filename: path handed to ``data_preprocess.load_data``.
    :param option: one of
        'default' - 10 runs with default SVM parameters;
        'train'   - per-run parameter search via ``scan_parameters``;
        'RFE'     - recursive feature elimination ranking of descriptors;
        'test'    - 10 runs with a fixed hand-picked parameter set.
    :param svm_type: defaults to 'linear'.  NOTE(review): only defaulted
        here, never forwarded to the training helpers — confirm intent.
    :param poly_degree: defaults to 2.  NOTE(review): likewise unused
        downstream.
    """
    # LOAD DATA
    descriptors = qm_descriptors
    X, Y = data_preprocess.load_data(filename, descriptors)

    if svm_type is None:
        svm_type = 'linear'
    if poly_degree is None:
        poly_degree = 2
        #print('training polynomial SVM of degree', poly_degree)

    if option == 'default':
        print('Training SVM...')
        print('*-----------------------------*')
        print('Training on default parameters.')
        accuracies_default = []
        # 10 random 80/20 splits to average out split variance.
        for _ in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            accuracies_default.append(
                train_SVM(x_train, y_train, x_valid, y_valid))
        print('Average accuracy over 10 default runs: %.2f' %
              numpy.mean(accuracies_default))
    elif option == 'train':
        print('*-----------------------------*')
        print('Searching for best parameters.')
        params = []
        accuracies = []
        for _ in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            best_parameters = scan_parameters(x_train, y_train)
            params.append(best_parameters)
            accuracy = train_SVM(x_train, y_train, x_valid, y_valid,
                                 best_parameters)
            accuracies.append(accuracy)
        print('*-----------------------------*')
        print('Summary of Results.')
        print('*-----------------------------*')
        for i in range(len(accuracies)):
            print('Run ' + str(i + 1) + ' ', params[i], ' : ', accuracies[i])
    elif option == 'RFE':
        print('*-----------------------------*')
        print('Recursive feature estimation.')
        #http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE
        ranking = perform_RFE(X, Y)
        print('*-----------------------------*')
        print('Ranking of descriptors.')
        print('*-----------------------------*')
        for d in range(len(qm_descriptors)):
            print(qm_descriptors[d], ranking[d])
    elif option == 'test':
        print('TESTING')
        print('*-----------------------------*')
        # Hand-picked parameters; an earlier trial set (rbf, C=1, gamma=1,
        # degree=3, no class weights) is kept in history.
        kernels = 'rbf'
        Cs = 10
        gammas = 0.1
        degrees = 3
        weights = None
        params_dict = {
            'kernel': kernels,
            'C': Cs,
            'class_weight': weights,
            'degree': degrees,
            'gamma': gammas
        }
        acc_list = []
        for _ in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            acc_list.append(
                train_SVM(x_train, y_train, x_valid, y_valid, params_dict))
        print('Summary of Results.')
        print('*-----------------------------*')
        print('Average accuracy over 10 runs: %.2f' % numpy.mean(acc_list))
def train(opts):
    """Train a word-level language model (Theano, Python 2).

    Builds the computational graph from ``opts``, optimizes with gradient
    clipping and an ``opts['optimizer']``-selected update rule, validates by
    perplexity with patience-based early stopping, and returns the final
    ``(valid_err, test_err)`` perplexities (or ``1., 1., 1.`` on NaN cost).
    """
    print '==> Training a language model'
    print ' [Word only]'
    #---------------------------------------------------------
    # prepare ingredients
    #---------------------------------------------------------
    print '==> Loading dictionaries: ',
    # load word dictionary
    print 'word dict,',
    if opts['word_dictionary']:
        with open(opts['word_dictionary'], 'rb') as f:
            word_dict = pkl.load(f)  # word -> index
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk  # index -> word
    print 'Done'

    # reload options
    if opts['reload_'] and os.path.exists(opts['saveto']):
        with open('%s.pkl' % opts['saveto'], 'rb') as f:
            reloaded_options = pkl.load(f)
        opts.update(reloaded_options)

    # load training data
    train = load_data(path=opts['train_text'])

    # initialize params
    print '==> Building model:'
    params = init_params(opts)

    # reload parameters
    if opts['reload_'] and os.path.exists(opts['saveto']):
        params = load_params(opts['saveto'], params)

    # convert params to Theano shared variabel
    tparams = init_tparams(params)

    # build computational graph
    trng, is_train, x_word_input, x_mask, cost = build_model(tparams, opts)
    inps = [x_word_input, x_mask]

    print '==> Building f_cost...',
    f_cost = theano.function(inps, cost)
    print 'Done'

    # get gradients
    print '==> Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # gradient clipping: rescale all gradients when the global norm
    # exceeds tau
    print 'gradient clipping...',
    grad_norm = tensor.sqrt(tensor.sum([tensor.sum(g**2.) for g in grads]))
    tau = opts['gradclip']
    grad_clipped = []
    for g in grads:
        grad_clipped.append(
            tensor.switch(tensor.ge(grad_norm, tau), g * tau / grad_norm, g))
    print 'Done'

    # build optimizer
    # NOTE(review): eval() dispatches on the optimizer name from opts —
    # fine for trusted configs, but eval on external input is unsafe.
    lr = tensor.scalar(name='lr')
    print '==> Building optimizers...',
    f_grad_shared, f_update = eval(opts['optimizer'])(lr, tparams,
                                                      grad_clipped, inps,
                                                      cost)
    print 'Done'

    #---------------------------------------------------------
    # start optimization
    #---------------------------------------------------------
    print '==> Optimization:'

    # reload history
    history_errs = []
    if opts['reload_'] and os.path.exists(opts['saveto']):
        history_errs = list(numpy.load(opts['saveto'])['history_errs'])
    best_p = None
    bad_counter = 0

    # load validation and test data
    if opts['valid_text']:
        valid_lines = []
        with open(opts['valid_text'], 'r') as f:
            for l in f:
                valid_lines.append(l)
        n_valid_lines = len(valid_lines)
    if opts['test_text']:
        test_lines = []
        with open(opts['test_text'], 'r') as f:
            for l in f:
                test_lines.append(l)
        n_test_lines = len(test_lines)

    # initialize some values
    uidx = 0  # update counter
    estop = False  # early stopping flag
    lrate = opts['lrate']
    batch_size = opts['batch_size']

    # outer loop: epochs
    for eidx in xrange(opts['max_epochs']):
        n_samples = 0  # sample counter

        # shuffle training data every epoch
        print '==> Shuffling sentences...',
        shuffle(train)
        print 'Done'

        # learning rate decay
        if eidx >= opts['lr_decay_start']:
            lrate /= opts['lr_decay']
        print 'epoch = ', eidx, 'lr = ', lrate

        # training iterator (KFold used only to carve index batches)
        kf_train = KFold(len(train),
                         n_folds=len(train) / (batch_size - 1),
                         shuffle=False)

        # inner loop: batches
        for _, index in kf_train:
            n_samples += len(index)
            uidx += 1

            # is_train=1 at training time
            is_train.set_value(1.)

            # get a batch
            x = [train[i] for i in index]

            # format input data
            x_word_input_, x_mask_ = txt_to_word_inps(x, word_dict, opts)

            # compute cost (mask is inverted before being fed in)
            cost = f_grad_shared(x_word_input_, (1 - x_mask_))

            # update parameters
            f_update(lrate)

            # check cost
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # display cost
            if numpy.mod(uidx, opts['dispFreq']) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost

            # save params
            if numpy.mod(uidx, opts['saveFreq']) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(opts['saveto'], history_errs=history_errs,
                            **params)
                pkl.dump(opts, open('%s.pkl' % opts['saveto'], 'wb'))
                print 'Done'

            # compute validation/test perplexity
            if numpy.mod(uidx, opts['validFreq']) == 0:
                print "Computing Dev/Test Perplexity"
                # is_train=0 at valid/test time
                is_train.set_value(0.)
                valid_err = perplexity(f_cost, valid_lines, word_dict, opts)
                test_err = perplexity(f_cost, test_lines, word_dict, opts)
                history_errs.append([valid_err, test_err])

                # save the best params
                if len(history_errs) > 1:
                    if uidx == 0 or valid_err <= numpy.array(
                            history_errs)[:, 0].min():
                        best_p = unzip(tparams)
                        print 'Saving best params...',
                        # NOTE(review): saves `params`, not the freshly
                        # unzipped `best_p` — confirm this is intended.
                        numpy.savez(opts['savebestto'],
                                    history_errs=history_errs, **params)
                        pkl.dump(opts,
                                 open('%s.pkl' % opts['savebestto'], 'wb'))
                        print 'Done'
                        bad_counter = 0
                    # patience: count checks with no improvement over the
                    # best error seen more than `patience` checks ago
                    if len(history_errs
                           ) > opts['patience'] and valid_err >= numpy.array(
                               history_errs)[:-opts['patience'], 0].min():
                        bad_counter += 1
                        if bad_counter > opts['patience']:
                            print 'Early Stop!'
                            estop = True
                            break
                print 'Valid ', valid_err, 'Test ', test_err
        # inner loop: end

        print 'Seen %d samples' % n_samples

        # early stopping
        if estop:
            break
    # outer loop: end

    if best_p is not None:
        zipp(best_p, tparams)

    # compute validation/test perplexity at the end of training
    is_train.set_value(0.)
    valid_err = perplexity(f_cost, valid_lines, word_dict, opts)
    test_err = perplexity(f_cost, test_lines, word_dict, opts)
    print 'Valid ', valid_err, 'Test ', test_err

    # save everithing
    params = copy.copy(best_p)
    numpy.savez(opts['saveto'],
                zipped_params=best_p,
                valid_err=valid_err,
                test_err=test_err,
                history_errs=history_errs,
                **params)

    return valid_err, test_err
def train(opts):
    """Train a word-level language model (Theano, Python 2).

    NOTE(review): this is a token-for-token duplicate of an earlier
    module-level ``train(opts)`` in this file; at import time this later
    definition silently shadows the earlier one.  Consider deleting one.

    Builds the computational graph from ``opts``, optimizes with gradient
    clipping and an ``opts['optimizer']``-selected update rule, validates by
    perplexity with patience-based early stopping, and returns the final
    ``(valid_err, test_err)`` perplexities (or ``1., 1., 1.`` on NaN cost).
    """
    print '==> Training a language model'
    print ' [Word only]'
    #---------------------------------------------------------
    # prepare ingredients
    #---------------------------------------------------------
    print '==> Loading dictionaries: ',
    # load word dictionary
    print 'word dict,',
    if opts['word_dictionary']:
        with open(opts['word_dictionary'], 'rb') as f:
            word_dict = pkl.load(f)  # word -> index
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk  # index -> word
    print 'Done'

    # reload options
    if opts['reload_'] and os.path.exists(opts['saveto']):
        with open('%s.pkl' % opts['saveto'], 'rb') as f:
            reloaded_options = pkl.load(f)
        opts.update(reloaded_options)

    # load training data
    train = load_data(path=opts['train_text'])

    # initialize params
    print '==> Building model:'
    params = init_params(opts)

    # reload parameters
    if opts['reload_'] and os.path.exists(opts['saveto']):
        params = load_params(opts['saveto'], params)

    # convert params to Theano shared variabel
    tparams = init_tparams(params)

    # build computational graph
    trng, is_train, x_word_input, x_mask, cost = build_model(tparams, opts)
    inps = [x_word_input, x_mask]

    print '==> Building f_cost...',
    f_cost = theano.function(inps, cost)
    print 'Done'

    # get gradients
    print '==> Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # gradient clipping: rescale all gradients when the global norm
    # exceeds tau
    print 'gradient clipping...',
    grad_norm = tensor.sqrt(tensor.sum([tensor.sum(g**2.) for g in grads]))
    tau = opts['gradclip']
    grad_clipped = []
    for g in grads:
        grad_clipped.append(
            tensor.switch(tensor.ge(grad_norm, tau), g * tau / grad_norm, g))
    print 'Done'

    # build optimizer
    # NOTE(review): eval() dispatches on the optimizer name from opts —
    # fine for trusted configs, but eval on external input is unsafe.
    lr = tensor.scalar(name='lr')
    print '==> Building optimizers...',
    f_grad_shared, f_update = eval(opts['optimizer'])(lr, tparams,
                                                      grad_clipped, inps,
                                                      cost)
    print 'Done'

    #---------------------------------------------------------
    # start optimization
    #---------------------------------------------------------
    print '==> Optimization:'

    # reload history
    history_errs = []
    if opts['reload_'] and os.path.exists(opts['saveto']):
        history_errs = list(numpy.load(opts['saveto'])['history_errs'])
    best_p = None
    bad_counter = 0

    # load validation and test data
    if opts['valid_text']:
        valid_lines = []
        with open(opts['valid_text'], 'r') as f:
            for l in f:
                valid_lines.append(l)
        n_valid_lines = len(valid_lines)
    if opts['test_text']:
        test_lines = []
        with open(opts['test_text'], 'r') as f:
            for l in f:
                test_lines.append(l)
        n_test_lines = len(test_lines)

    # initialize some values
    uidx = 0  # update counter
    estop = False  # early stopping flag
    lrate = opts['lrate']
    batch_size = opts['batch_size']

    # outer loop: epochs
    for eidx in xrange(opts['max_epochs']):
        n_samples = 0  # sample counter

        # shuffle training data every epoch
        print '==> Shuffling sentences...',
        shuffle(train)
        print 'Done'

        # learning rate decay
        if eidx >= opts['lr_decay_start']:
            lrate /= opts['lr_decay']
        print 'epoch = ', eidx, 'lr = ', lrate

        # training iterator (KFold used only to carve index batches)
        kf_train = KFold(len(train),
                         n_folds=len(train) / (batch_size - 1),
                         shuffle=False)

        # inner loop: batches
        for _, index in kf_train:
            n_samples += len(index)
            uidx += 1

            # is_train=1 at training time
            is_train.set_value(1.)

            # get a batch
            x = [train[i] for i in index]

            # format input data
            x_word_input_, x_mask_ = txt_to_word_inps(x, word_dict, opts)

            # compute cost (mask is inverted before being fed in)
            cost = f_grad_shared(x_word_input_, (1 - x_mask_))

            # update parameters
            f_update(lrate)

            # check cost
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # display cost
            if numpy.mod(uidx, opts['dispFreq']) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost

            # save params
            if numpy.mod(uidx, opts['saveFreq']) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(opts['saveto'], history_errs=history_errs,
                            **params)
                pkl.dump(opts, open('%s.pkl' % opts['saveto'], 'wb'))
                print 'Done'

            # compute validation/test perplexity
            if numpy.mod(uidx, opts['validFreq']) == 0:
                print "Computing Dev/Test Perplexity"
                # is_train=0 at valid/test time
                is_train.set_value(0.)
                valid_err = perplexity(f_cost, valid_lines, word_dict, opts)
                test_err = perplexity(f_cost, test_lines, word_dict, opts)
                history_errs.append([valid_err, test_err])

                # save the best params
                if len(history_errs) > 1:
                    if uidx == 0 or valid_err <= numpy.array(
                            history_errs)[:, 0].min():
                        best_p = unzip(tparams)
                        print 'Saving best params...',
                        # NOTE(review): saves `params`, not the freshly
                        # unzipped `best_p` — confirm this is intended.
                        numpy.savez(opts['savebestto'],
                                    history_errs=history_errs, **params)
                        pkl.dump(opts,
                                 open('%s.pkl' % opts['savebestto'], 'wb'))
                        print 'Done'
                        bad_counter = 0
                    # patience: count checks with no improvement over the
                    # best error seen more than `patience` checks ago
                    if len(history_errs
                           ) > opts['patience'] and valid_err >= numpy.array(
                               history_errs)[:-opts['patience'], 0].min():
                        bad_counter += 1
                        if bad_counter > opts['patience']:
                            print 'Early Stop!'
                            estop = True
                            break
                print 'Valid ', valid_err, 'Test ', test_err
        # inner loop: end

        print 'Seen %d samples' % n_samples

        # early stopping
        if estop:
            break
    # outer loop: end

    if best_p is not None:
        zipp(best_p, tparams)

    # compute validation/test perplexity at the end of training
    is_train.set_value(0.)
    valid_err = perplexity(f_cost, valid_lines, word_dict, opts)
    test_err = perplexity(f_cost, test_lines, word_dict, opts)
    print 'Valid ', valid_err, 'Test ', test_err

    # save everithing
    params = copy.copy(best_p)
    numpy.savez(opts['saveto'],
                zipped_params=best_p,
                valid_err=valid_err,
                test_err=test_err,
                history_errs=history_errs,
                **params)

    return valid_err, test_err
# print(str(b+1)+"): Label "+label_list[b]+ " corresponds with topic "+str(max_indexes[i])) # print("Topic: "+tps[max_indexes[i]][1]) # else: # print(str(b+1)+"): Label "+label_list[b]+ " has no clear topic match") # ## for index, score in sorted(model[bow_corpus[i]], key=lambda tup: -1*tup[1]): ## print(index, score, labels[i]) ## print(model.get_document_topics(bow_corpus[i])) ## print("\nScore: {}\t \nTopic: {}".format(score, model.print_topic(index, 10))) # return scores, topics if __name__ == "__main__": print("Loading and splitting data for training and testing.\n") data = dp.load_data() training_data, testing_data = dp.get_split_data(data) files = list(data.keys()) training_files = [] training_labels = [] training_content = [] testing_files = [] testing_labels = [] testing_content = [] training_keys = list(training_data.keys()) for i,f in enumerate(files): if files[i] in training_keys: training_files.append(files[i]) training_labels.append(training_data[files[i]]['label']) training_content.append(training_data[files[i]]['content']) else: