def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning rate')
    parser.add_argument('--batch_size', type=int, default=256, help='Batch size')
    parser.add_argument('--sample_every', type=int, default=2000, help='Sample generator output every x steps')
    parser.add_argument('--summary_every', type=int, default=50, help='Write summaries every x steps')
    parser.add_argument('--save_model_every', type=int, default=1500, help='Save the model every x steps')
    parser.add_argument('--sample_size', type=int, default=300, help='Sampled output size')
    parser.add_argument('--top_k', type=int, default=5, help='Sample from top k predictions')
    parser.add_argument('--max_epochs', type=int, default=50, help='Max epochs')
    parser.add_argument('--beta1', type=float, default=0.5, help='Momentum for the Adam update')
    parser.add_argument('--resume_model', type=str, default=None, help='Pre-trained model path to resume from')
    # parser.add_argument('--text_dir', type=str, default='Data/generator_training_data',
    #                     help='Directory containing text files')
    parser.add_argument('--text_dir', type=str,
                        default='Data/Session/user-filter-200000items-session10.csv-map-5to100.csv',
                        help='Directory containing text files')
    parser.add_argument('--data_dir', type=str, default='Data', help='Data directory')
    parser.add_argument('--seed', type=str,
                        default='f78c95a8-9256-4757-9a9f-213df5c6854e,1151b040-8022-4965-96d2-8a4605ce456c',
                        help='Seed for text generation')
    parser.add_argument('--sample_percentage', type=float, default=0.5,
                        help='fraction of the data held out for testing, e.g. 0.2 gives an 80/20 train/test split')
    parser.add_argument('--filter_sizes', nargs='?', default='[2,3,4]', help='Specify the filter sizes')
    parser.add_argument('--num_filters', type=int, default=100, help='Number of filters per filter size (default: 128)')
    parser.add_argument('--loss_type', nargs='?', default='square_loss', help='Specify a loss type (square_loss or log_loss)')
    parser.add_argument('--l2_reg_lambda', type=float, default=0, help='L2 regularization lambda (default: 0.0)')
    parser.add_argument('--allow_soft_placement', default=True, help='Allow soft device placement')
    parser.add_argument('--log_device_placement', default=False, help='Log placement of ops on devices')
    parser.add_argument('--dropout_keep_prob', type=float, default=0.5, help='Dropout keep probability (default: 0.5)')
    args = parser.parse_args()

    dl = data_loader.Data_Loader({'model_type': 'generator', 'dir_name': args.text_dir})
    # text_samples=16390600 vocab=947255 session100
    all_samples = dl.item
    items = dl.item_dict

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(all_samples)))
    text_samples = all_samples[shuffle_indices]

    # Split train/test set
    # TODO: this is very crude, should use cross-validation
    dev_sample_index = -1 * int(args.sample_percentage * float(len(text_samples)))
    x_train, x_dev = text_samples[:dev_sample_index], text_samples[dev_sample_index:]

    # Create subsessions (training set only): each session (length 100, shortest
    # subsession 5) is expanded into all of its left-zero-padded prefixes.
    subseqtrain = []
    for i in range(len(x_train)):
        seq = x_train[i]
        lenseq = len(seq)
        for j in range(lenseq - 4):
            subseqend = seq[:len(seq) - j]
            subseqbeg = [0] * j
            subseq = np.append(subseqbeg, subseqend)
            subseqtrain.append(subseq)
    x_train = np.array(subseqtrain)  # list to ndarray
    del subseqtrain

    # Randomly shuffle the expanded training data
    np.random.seed(10)
    shuffle_train = np.random.permutation(np.arange(len(x_train)))
    x_train = x_train[shuffle_train]
    print "generating subsessions is done!"
    print "shape", x_train.shape[0]
    print "dataset", args.text_dir

    model_options = {
        'vocab_size': len(items),
        'residual_channels': 64,
    }

    cnn = TextCNN_hv(sequence_length=x_train.shape[1],
                     num_classes=len(items),
                     vocab_size=len(items),
                     embedding_size=model_options['residual_channels'],
                     filter_sizes=eval(args.filter_sizes),
                     num_filters=args.num_filters,
                     loss_type=args.loss_type,
                     l2_reg_lambda=args.l2_reg_lambda)
    print "embedding_size", model_options['residual_channels']

    session_conf = tf.ConfigProto(
        allow_soft_placement=args.allow_soft_placement,  # fall back to another device if the assigned one is unavailable
        log_device_placement=args.log_device_placement)  # whether to log op placement
    sess = tf.Session(config=session_conf)

    with sess.as_default():
        # Define the training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        sess.run(tf.global_variables_initializer())

        step = 1
        for epoch in range(args.max_epochs):
            batch_no = 0
            batch_size = args.batch_size
            while (batch_no + 1) * batch_size < x_train.shape[0]:
                start = time.clock()
                text_batch = x_train[batch_no * batch_size:(batch_no + 1) * batch_size, :]
                _, loss, prediction = sess.run(
                    [train_op, cnn.loss, cnn.arg_max_prediction],
                    feed_dict={cnn.wholesession: text_batch,
                               cnn.dropout_keep_prob: args.dropout_keep_prob})
                end = time.clock()

                if step % args.sample_every == 0:
                    print "-------------------------------------------------------train1"
                    print "LOSS: {}\tEPOCH: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                        loss, epoch, batch_no, step, x_train.shape[0] / args.batch_size)
                    print "TIME FOR BATCH", end - start
                    print "TIME FOR EPOCH (mins)", (end - start) * (x_train.shape[0] / args.batch_size) / 60.0

                if step % args.sample_every == 0:
                    print "-------------------------------------------------------test1"
                    if (batch_no + 1) * batch_size < x_dev.shape[0]:
                        text_batch = x_dev[batch_no * batch_size:(batch_no + 1) * batch_size, :]
                        loss = sess.run([cnn.loss],
                                        feed_dict={cnn.wholesession: text_batch,
                                                   cnn.dropout_keep_prob: 1.0})
                        print "LOSS: {}\tEPOCH: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                            loss, epoch, batch_no, step, x_dev.shape[0] / args.batch_size)

                batch_no += 1

                if step % args.sample_every == 0:
                    print "********************************************************accuracy"
                    batch_no_test = 0
                    batch_size_test = batch_size * 2
                    curr_preds_5, rec_preds_5, ndcg_preds_5 = [], [], []
                    curr_preds_20, rec_preds_20, ndcg_preds_20 = [], [], []
                    while (batch_no_test + 1) * batch_size_test < x_dev.shape[0]:
                        # no need to evaluate everything: only a few batches early on,
                        # a larger sample after several multiples of sample_every
                        if step / args.sample_every < 10:
                            if batch_no_test > 2:
                                break
                        else:
                            if batch_no_test > 500:
                                break
                        text_batch = x_dev[batch_no_test * batch_size_test:(batch_no_test + 1) * batch_size_test, :]
                        [probs] = sess.run([cnn.probs_flat],
                                           feed_dict={cnn.wholesession: text_batch,
                                                      cnn.dropout_keep_prob: 1.0})
                        for bi in range(probs.shape[0]):
                            pred_words_5 = utils.sample_top_k(probs[bi], top_k=args.top_k)  # top_k=5
                            pred_words_20 = utils.sample_top_k(probs[bi], top_k=args.top_k + 15)
                            true_word = text_batch[bi][-1]
                            predictmap_5 = {ch: i for i, ch in enumerate(pred_words_5)}
                            predictmap_20 = {ch: i for i, ch in enumerate(pred_words_20)}

                            rank_5 = predictmap_5.get(true_word)
                            rank_20 = predictmap_20.get(true_word)
                            if rank_5 is None:
                                curr_preds_5.append(0.0)
                                rec_preds_5.append(0.0)
                                ndcg_preds_5.append(0.0)
                            else:
                                curr_preds_5.append(1.0 / (rank_5 + 1))             # MRR@5
                                rec_preds_5.append(1.0)                             # HIT@5
                                ndcg_preds_5.append(1.0 / math.log(rank_5 + 2, 2))  # NDCG@5
                            if rank_20 is None:
                                curr_preds_20.append(0.0)
                                rec_preds_20.append(0.0)
                                ndcg_preds_20.append(0.0)
                            else:
                                curr_preds_20.append(1.0 / (rank_20 + 1))             # MRR@20
                                rec_preds_20.append(1.0)                              # HIT@20
                                ndcg_preds_20.append(1.0 / math.log(rank_20 + 2, 2))  # NDCG@20

                        batch_no_test += 1
                        print "BATCH_NO: {}".format(batch_no_test)
                        print "Accuracy mrr_5:", sum(curr_preds_5) / float(len(curr_preds_5))
                        print "Accuracy mrr_20:", sum(curr_preds_20) / float(len(curr_preds_20))
                        print "Accuracy hit_5:", sum(rec_preds_5) / float(len(rec_preds_5))
                        print "Accuracy hit_20:", sum(rec_preds_20) / float(len(rec_preds_20))
                        print "Accuracy ndcg_5:", sum(ndcg_preds_5) / float(len(ndcg_preds_5))
                        print "Accuracy ndcg_20:", sum(ndcg_preds_20) / float(len(ndcg_preds_20))
                step += 1
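# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the script above; assumes only numpy): the
# subsession expansion in the training loop turns one session into all of its
# left-zero-padded prefixes, keeping the most recent items at the end,
# e.g. 12345 --> 12345, 01234, 00123 (down to `min_len` real items).
import numpy as np

def expand_subsessions(session, min_len=5):
    out = []
    for j in range(len(session) - (min_len - 1)):
        out.append(np.append([0] * j, session[:len(session) - j]))
    return np.array(out)

# expand_subsessions([1, 2, 3, 4, 5], min_len=3)
# -> [[1 2 3 4 5], [0 1 2 3 4], [0 0 1 2 3]]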
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--top_k', type=int, default=5, help='Sample from top k predictions')
    parser.add_argument('--beta1', type=float, default=0.9, help='Adam hyperparameter')
    parser.add_argument('--datapath', type=str, default='Data/Session/user-filter-20000items-session5.csv',
                        help='data path')
    parser.add_argument('--eval_iter', type=int, default=10, help='Sample generator output every x steps')
    parser.add_argument('--save_para_every', type=int, default=10, help='Save model parameters every x steps')
    parser.add_argument('--tt_percentage', type=float, default=0.2,
                        help='test fraction, e.g. 0.2 gives an 80/20 train/test split')
    parser.add_argument('--is_generatesubsession', type=bool, default=False,
                        help='whether to generate subsessions, e.g. 12345-->01234,00123,00012; '
                             'may be useful for very long sequences')
    args = parser.parse_args()

    dl = data_loader_recsys.Data_Loader({'model_type': 'generator', 'dir_name': args.datapath})
    all_samples = dl.item
    items = dl.item_dict
    print "len(items)", len(items)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(all_samples)))
    all_samples = all_samples[shuffle_indices]

    # Split train/test set
    dev_sample_index = -1 * int(args.tt_percentage * float(len(all_samples)))
    train_set, valid_set = all_samples[:dev_sample_index], all_samples[dev_sample_index:]

    model_para = {
        # all parameters should be consistent with those in nextitrec.py!
        'item_size': len(items),
        'dilated_channels': 100,
        'dilations': [1, 2, 1, 2, 1, 2, ],
        'kernel_size': 3,
        'learning_rate': 0.001,
        'batch_size': 128,
        'iterations': 2,  # unused here, can be removed
        'is_negsample': False  # False denotes no negative sampling
    }

    itemrec = generator_recsys.NextItNet_Decoder(model_para)
    itemrec.train_graph(model_para['is_negsample'])
    itemrec.predict_graph(model_para['is_negsample'], reuse=True)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver()
    saver.restore(sess, "Data/Models/generation_model/model_nextitnet.ckpt")

    batch_no_test = 0
    batch_size_test = model_para['batch_size']
    curr_preds_5, rec_preds_5, ndcg_preds_5 = [], [], []
    curr_preds_20, rec_preds_20, ndcg_preds_20 = [], [], []
    while (batch_no_test + 1) * batch_size_test < valid_set.shape[0]:
        item_batch = valid_set[batch_no_test * batch_size_test:(batch_no_test + 1) * batch_size_test, :]
        [probs] = sess.run([itemrec.g_probs],
                           feed_dict={itemrec.input_predict: item_batch})
        for bi in range(probs.shape[0]):
            pred_items_5 = utils.sample_top_k(probs[bi][-1], top_k=args.top_k)  # top_k=5
            pred_items_20 = utils.sample_top_k(probs[bi][-1], top_k=args.top_k + 15)
            true_item = item_batch[bi][-1]
            predictmap_5 = {ch: i for i, ch in enumerate(pred_items_5)}
            predictmap_20 = {ch: i for i, ch in enumerate(pred_items_20)}

            rank_5 = predictmap_5.get(true_item)
            rank_20 = predictmap_20.get(true_item)
            if rank_5 is None:
                curr_preds_5.append(0.0)
                rec_preds_5.append(0.0)
                ndcg_preds_5.append(0.0)
            else:
                curr_preds_5.append(1.0 / (rank_5 + 1))             # MRR@5
                rec_preds_5.append(1.0)                             # HIT@5
                ndcg_preds_5.append(1.0 / math.log(rank_5 + 2, 2))  # NDCG@5
            if rank_20 is None:
                curr_preds_20.append(0.0)
                rec_preds_20.append(0.0)
                ndcg_preds_20.append(0.0)
            else:
                curr_preds_20.append(1.0 / (rank_20 + 1))             # MRR@20
                rec_preds_20.append(1.0)                              # HIT@20
                ndcg_preds_20.append(1.0 / math.log(rank_20 + 2, 2))  # NDCG@20

        batch_no_test += 1
        print "BATCH_NO: {}".format(batch_no_test)
        print "Accuracy mrr_5:", sum(curr_preds_5) / float(len(curr_preds_5))
        print "Accuracy mrr_20:", sum(curr_preds_20) / float(len(curr_preds_20))
        print "Accuracy hit_5:", sum(rec_preds_5) / float(len(rec_preds_5))
        print "Accuracy hit_20:", sum(rec_preds_20) / float(len(rec_preds_20))
        print "Accuracy ndcg_5:", sum(ndcg_preds_5) / float(len(ndcg_preds_5))
        print "Accuracy ndcg_20:", sum(ndcg_preds_20) / float(len(ndcg_preds_20))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--top_k', type=int, default=5, help='Sample from top k predictions')
    parser.add_argument('--beta1', type=float, default=0.9, help='Adam hyperparameter')
    # history_sequences_20181014_fajie_smalltest.csv
    parser.add_argument('--datapath', type=str,
                        default='Data/Session/history_sequences_20181014_fajie_transfer_pretrain_small.csv',
                        help='data path')
    parser.add_argument('--eval_iter', type=int, default=10, help='Sample generator output every x steps')
    parser.add_argument('--save_para_every', type=int, default=10, help='Save model parameters every x steps')
    parser.add_argument('--tt_percentage', type=float, default=0.5,
                        help='test fraction, e.g. 0.2 gives an 80/20 train/test split')
    parser.add_argument('--is_generatesubsession', type=bool, default=False,
                        help='whether to generate subsessions, e.g. 12345-->01234,00123,00012; '
                             'may be useful for very long sequences')
    parser.add_argument('--padtoken', type=str, default='0',
                        help='the padding token at the beginning of the sequence')
    args = parser.parse_args()

    dl = data_loader_recsys.Data_Loader({'model_type': 'generator', 'dir_name': args.datapath})
    all_samples = dl.item
    items = dl.item_dict
    print "len(items)", len(items)

    if args.padtoken in items:
        padtoken = items[args.padtoken]  # the padding token at the beginning of the sequence
    else:
        # padtoken = sys.maxint
        padtoken = len(items) + 1

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(all_samples)))
    all_samples = all_samples[shuffle_indices]

    # Split train/test set
    dev_sample_index = -1 * int(args.tt_percentage * float(len(all_samples)))
    train_set, valid_set = all_samples[:dev_sample_index], all_samples[dev_sample_index:]

    if args.is_generatesubsession:
        train_set = generatesubsequence(train_set, padtoken)

    model_para = {
        # if you change the parameters here, do not forget to change them in nextitrec_generate.py as well
        'item_size': len(items),
        'dilated_channels': 64,
        # if you use nextitnet_residual_block, you can use [1, 4, ];
        # if you use nextitnet_residual_block_one, tune it yourself -- [1, 2, 4, ] is suggested for a trial.
        # When you change it, do not forget to change it in nextitrec_generate.py.
        # If removing the residual network does not obviously hurt performance, your data probably does not
        # have strong sequential structure -- change the dataset and try again.
        'dilations': [1, 4, 1, 4, 1, 4, 1, 4, ],
        'kernel_size': 3,
        'learning_rate': 0.001,
        'batch_size': 2,
        'iterations': 400,
        'is_negsample': False  # False denotes using the full softmax
    }

    itemrec = generator_recsys_cau.NextItNet_Decoder(model_para)
    itemrec.train_graph(model_para['is_negsample'])
    optimizer = tf.train.AdamOptimizer(model_para['learning_rate'],
                                       beta1=args.beta1).minimize(itemrec.loss)
    itemrec.predict_graph(model_para['is_negsample'], reuse=True)
    tf.add_to_collection("dilate_input", itemrec.dilate_input)
    tf.add_to_collection("context_embedding", itemrec.context_embedding)

    # sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver()

    numIters = 1
    for iter in range(model_para['iterations']):
        batch_no = 0
        batch_size = model_para['batch_size']
        while (batch_no + 1) * batch_size < train_set.shape[0]:
            start = time.time()
            item_batch = train_set[batch_no * batch_size:(batch_no + 1) * batch_size, :]
            _, loss, results = sess.run(
                [optimizer, itemrec.loss, itemrec.arg_max_prediction],
                feed_dict={itemrec.itemseq_input: item_batch})
            end = time.time()

            if numIters % args.eval_iter == 0:
                print "-------------------------------------------------------train1"
                print "LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                    loss, iter, batch_no, numIters, train_set.shape[0] / batch_size)
                print "TIME FOR BATCH", end - start
                print "TIME FOR ITER (mins)", (end - start) * (train_set.shape[0] / batch_size) / 60.0

            if numIters % args.eval_iter == 0:
                print "-------------------------------------------------------test1"
                # when train_set is much larger than valid_set this 'if' may not hold;
                # it has no impact on the final results
                if (batch_no + 1) * batch_size < valid_set.shape[0]:
                    item_batch = valid_set[batch_no * batch_size:(batch_no + 1) * batch_size, :]
                    loss = sess.run([itemrec.loss_test],
                                    feed_dict={itemrec.input_predict: item_batch})
                    print "LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                        loss, iter, batch_no, numIters, valid_set.shape[0] / batch_size)

            batch_no += 1

            if numIters % args.eval_iter == 0:
                batch_no_test = 0
                batch_size_test = batch_size * 1
                curr_preds_5, rec_preds_5, ndcg_preds_5 = [], [], []
                curr_preds_20, rec_preds_20, ndcg_preds_20 = [], [], []
                while (batch_no_test + 1) * batch_size_test < valid_set.shape[0]:
                    # evaluate only a few batches early on, a larger sample later
                    if numIters / args.eval_iter < 10:
                        if batch_no_test > 20:
                            break
                    else:
                        if batch_no_test > 500:
                            break
                    item_batch = valid_set[batch_no_test * batch_size_test:(batch_no_test + 1) * batch_size_test, :]
                    [probs] = sess.run([itemrec.g_probs],
                                       feed_dict={itemrec.input_predict: item_batch})
                    for bi in range(probs.shape[0]):
                        pred_items_5 = utils.sample_top_k(probs[bi][-1], top_k=args.top_k)  # top_k=5
                        pred_items_20 = utils.sample_top_k(probs[bi][-1], top_k=args.top_k + 15)
                        true_item = item_batch[bi][-1]
                        predictmap_5 = {ch: i for i, ch in enumerate(pred_items_5)}
                        predictmap_20 = {ch: i for i, ch in enumerate(pred_items_20)}

                        rank_5 = predictmap_5.get(true_item)
                        rank_20 = predictmap_20.get(true_item)
                        if rank_5 is None:
                            curr_preds_5.append(0.0)
                            rec_preds_5.append(0.0)
                            ndcg_preds_5.append(0.0)
                        else:
                            curr_preds_5.append(1.0 / (rank_5 + 1))             # MRR@5
                            rec_preds_5.append(1.0)                             # HIT@5
                            ndcg_preds_5.append(1.0 / math.log(rank_5 + 2, 2))  # NDCG@5
                        if rank_20 is None:
                            curr_preds_20.append(0.0)
                            rec_preds_20.append(0.0)
                            ndcg_preds_20.append(0.0)
                        else:
                            curr_preds_20.append(1.0 / (rank_20 + 1))             # MRR@20
                            rec_preds_20.append(1.0)                              # HIT@20
                            ndcg_preds_20.append(1.0 / math.log(rank_20 + 2, 2))  # NDCG@20

                    batch_no_test += 1
                    print "BATCH_NO: {}".format(batch_no_test)
                    print "Accuracy mrr_5:", sum(curr_preds_5) / float(len(curr_preds_5))
                    print "Accuracy mrr_20:", sum(curr_preds_20) / float(len(curr_preds_20))
                    print "Accuracy hit_5:", sum(rec_preds_5) / float(len(rec_preds_5))
                    print "Accuracy hit_20:", sum(rec_preds_20) / float(len(rec_preds_20))
                    print "Accuracy ndcg_5:", sum(ndcg_preds_5) / float(len(ndcg_preds_5))
                    print "Accuracy ndcg_20:", sum(ndcg_preds_20) / float(len(ndcg_preds_20))

            numIters += 1
            if numIters % args.save_para_every == 0:
                save_path = saver.save(
                    sess, "Data/Models/generation_model/model_nextitnet_transfer_pretrain.ckpt")
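# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the script above): for a stack of dilated
# causal convolutions, the receptive field is 1 + sum((kernel_size - 1) * d)
# over the dilation list, assuming each entry in 'dilations' corresponds to one
# convolution layer with that dilation. Under that assumption the setting above
# ([1, 4] * 4, kernel_size=3) sees the last 41 positions of a session; make sure
# this covers your sequence length.
def receptive_field(dilations, kernel_size=3):
    return 1 + sum((kernel_size - 1) * d for d in dilations)

# receptive_field([1, 4, 1, 4, 1, 4, 1, 4], kernel_size=3)  # -> 41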
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--top_k', type=int, default=5, help='Sample from top k predictions')
    parser.add_argument('--beta1', type=float, default=0.9, help='Adam hyperparameter')
    parser.add_argument('--datapath', type=str, default='Data/Session/user-filter-20000items-session5.csv',
                        help='data path')
    parser.add_argument('--eval_iter', type=int, default=1000, help='Sample generator output every x steps')
    parser.add_argument('--save_para_every', type=int, default=1000, help='Save model parameters every x steps')
    parser.add_argument('--tt_percentage', type=float, default=0.2,
                        help='test fraction, e.g. 0.2 gives an 80/20 train/test split')
    parser.add_argument('--is_generatesubsession', type=bool, default=False,
                        help='whether to generate subsessions')
    args = parser.parse_args()

    exps = pd.read_csv('exp.csv')
    cPid = os.getpid()
    train_time = 0
    test_time = 0

    for i, row in exps.iterrows():
        gc.collect()
        args.expname = row['name']
        args.sessionid = row['sessionid']
        args.itemid = row['itemid']
        args.data_folder = row['path']
        args.valid_data = row['test']
        args.train_data = row['train']
        args.freq = row['freq']
        args.model_type = 'generator'
        print("\n\n############################################\n", args.train_data, ' --- ', args.valid_data)
        with open("LOGGER_" + args.expname + ".txt", "a") as myfile:
            myfile.write(row['train'] + ", " + row['test'] + "\n")

        train_data = os.path.join(args.data_folder, args.train_data)
        args.dir_name = train_data
        dl = data_loader_recsys.Data_Loader(vars(args))
        train_set = dl.item
        items = dl.item_dict
        print("len(train items)", len(items))

        valid_data = os.path.join(args.data_folder, args.valid_data)
        args.dir_name = valid_data
        vdl = data_loader_recsys.Data_Loader(vars(args), testFlag=True,
                                             itemsIDs=dl.itemsIDs,
                                             max_doc=dl.max_document_length,
                                             vocab_proc=dl.vocab_processor)
        valid_set = vdl.item
        items2 = vdl.item_dict
        print("len(valid items)", len(items2))

        model_para = {
            # if you change the parameters here, do not forget to change them in nextitrec_generate.py as well
            'item_size': len(items),
            'dilated_channels': 100,  # larger is better, up to 512 or 1024
            # if you use nextitnet_residual_block, you can use [1, 4, 1, 4, 1, 4, ];
            # if you use nextitnet_residual_block_one, tune it yourself -- [1, 2, 4, ] is suggested for a trial.
            # When you change it, do not forget to change it in nextitrec_generate.py.
            'dilations': [1, 2, 4, ],  # you should tune this hyperparameter, refer to the paper
            'kernel_size': 3,
            'learning_rate': 0.001,  # you should tune this hyperparameter
            'batch_size': 32,        # you should tune this hyperparameter
            'epochs': 10,            # if your dataset is small, consider adding regularization to prevent overfitting
            'is_negsample': False    # False denotes no negative sampling
        }

        tf.compat.v1.reset_default_graph()
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            itemrec = generator_recsys.NextItNet_Decoder(model_para)
            itemrec.train_graph(model_para['is_negsample'])
            optimizer = tf.compat.v1.train.AdamOptimizer(model_para['learning_rate'],
                                                         beta1=args.beta1).minimize(itemrec.loss)
            itemrec.predict_graph(model_para['is_negsample'], reuse=True)

        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)

        for e in range(model_para['epochs']):
            print("\n############################\nEPOCH #:", e)
            batch_no = 0
            batch_size = model_para['batch_size']
            losses = []
            t1 = time.time()
            while (batch_no + 1) * batch_size < train_set.shape[0]:
                batch_no += 1
                item_batch = train_set[(batch_no - 1) * batch_size:batch_no * batch_size, :]
                _, loss, results = sess.run(
                    [optimizer, itemrec.loss, itemrec.arg_max_prediction],
                    feed_dict={itemrec.itemseq_input: item_batch})
                losses.append(loss)
                if batch_no % 100 == 0:
                    print('Finished Batch:', batch_no)
            print('Train Loss:', np.mean(losses), valid_set.shape[0])
            train_time += (time.time() - t1)

            batch_no_test = 0
            batch_size_test = batch_size * 1
            MRR = [[], [], [], [], []]
            Rec = [[], [], [], [], []]
            cov = [[], [], [], [], []]
            pop = [[], [], [], [], []]
            Ks = [1, 3, 5, 10, 20]
            t1 = time.time()
            while (batch_no_test + 1) * batch_size_test < valid_set.shape[0]:
                batch_no_test += 1
                item_batch = valid_set[(batch_no_test - 1) * batch_size_test:batch_no_test * batch_size_test, :]
                [probs] = sess.run([itemrec.g_probs],
                                   feed_dict={itemrec.input_predict: item_batch})
                for bi in range(probs.shape[0]):
                    true_item = item_batch[bi][-1]
                    if true_item == 1:
                        continue
                    if args.freq != 0 and true_item != 0 and dl.freqs[true_item] > args.freq:
                        continue
                    for k in range(len(Ks)):
                        pred_items = utils.sample_top_k(probs[bi][-1], top_k=Ks[k])
                        predictmap = {ch: i for i, ch in enumerate(pred_items)}
                        # print(pred_items, predictmap)  # debug only: very verbose
                        for p in pred_items:
                            if p == 1:
                                continue
                            if p not in cov[k]:
                                cov[k].append(p)
                            pop[k].append(dl.freqs[p])
                        rank = predictmap.get(true_item)
                        if rank is None:
                            mrr = 0.0
                            rec = 0.0
                        else:
                            mrr = 1.0 / (rank + 1)
                            rec = 1.0
                        MRR[k].append(mrr)
                        Rec[k].append(rec)
            test_time += (time.time() - t1) / len(Ks)

            Rec[:] = [np.mean(x) for x in Rec]
            MRR[:] = [np.mean(x) for x in MRR]
            cov[:] = [len(x) / len(items) for x in cov]
            maxi = max(dl.freqs.values())
            pop[:] = [np.mean(x) / maxi for x in pop]
            print("MRR@20:", MRR[-1])
            print("Recall@20:", Rec[-1])
            print("Cov@20:", cov[-1])
            print("Pop@20:", pop[-1])

            with open("LOGGER_" + args.expname + ".txt", "a") as myfile:
                myfile.write('EPOCH #:' + str(e))
                myfile.write(str(Rec[0]) + ',' + str(Rec[1]) + ',' + str(Rec[2]) + ',' + str(Rec[3]) + ',' + str(Rec[4]) + ',' +
                             str(MRR[0]) + ',' + str(MRR[1]) + ',' + str(MRR[2]) + ',' + str(MRR[3]) + ',' + str(MRR[4]))
                myfile.write("\nCOV:" + str(cov[0]) + ',' + str(cov[1]) + ',' + str(cov[2]) + ',' + str(cov[3]) + ',' + str(cov[4]))
                myfile.write("\nPOP:" + str(pop[0]) + ',' + str(pop[1]) + ',' + str(pop[2]) + ',' + str(pop[3]) + ',' + str(pop[4]))
                myfile.write("\nTrainTime:" + str(train_time))
                myfile.write("\nTestTime:" + str(test_time))
                myfile.write("\n############################################\n")
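# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the script above; `freqs` stands in for the
# training-frequency dict the loader exposes as dl.freqs): the aggregate numbers
# logged above are simply
#   Cov@k = |unique items ever recommended in a top-k list| / |catalogue|
#   Pop@k = mean training frequency of recommended items / max frequency
def coverage_and_popularity(topk_lists, freqs, n_items):
    recommended = set()
    freq_sum, freq_count = 0.0, 0
    for topk in topk_lists:
        for item in topk:
            recommended.add(item)
            freq_sum += freqs.get(item, 0)
            freq_count += 1
    cov = len(recommended) / float(n_items)
    pop = (freq_sum / freq_count) / max(freqs.values()) if freq_count else 0.0
    return cov, pop

# coverage_and_popularity([[3, 7, 9], [7, 2, 9]], freqs={2: 10, 3: 4, 7: 50, 9: 1}, n_items=100)
# -> (0.04, ~0.39): 4 distinct items out of 100 recommended; mean frequency 19.3 vs. max 50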
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--top_k', type=int, default=5, help='Sample from top k predictions')
    parser.add_argument('--beta1', type=float, default=0.9, help='Adam hyperparameter')
    parser.add_argument('--datapath', type=str, default='Data/Session/ratings_seq20_order.txt', help='data path')
    parser.add_argument('--eval_iter', type=int, default=10, help='Sample generator output every x steps')
    parser.add_argument('--save_para_every', type=int, default=10, help='Save model parameters every x steps')
    parser.add_argument('--tt_percentage', type=float, default=0.2,
                        help='test fraction, e.g. 0.2 gives an 80/20 train/test split')
    parser.add_argument('--masked_lm_prob', type=float, default=0.4,
                        help='fraction of items to mask, e.g. 0.2 masks 20 percent of the items')
    parser.add_argument('--max_predictions_per_seq', type=int, default=50,
                        help='maximum number of masked tokens')
    parser.add_argument('--max_position', type=int, default=100,
                        help='maximum length for the positional embedding; it has to be larger than the sequence length')
    parser.add_argument('--has_positionalembedding', type=bool, default=False,
                        help='whether to add a positional embedding before the CNN')
    parser.add_argument('--padtoken', type=str, default='-1',
                        help='the padding token at the beginning of the sequence')
    parser.add_argument('--is_shuffle', type=bool, default=False,
                        help='whether to shuffle the training and testing sequences, e.g. 012345-->051324')
    args = parser.parse_args()

    dl = data_loader_recsys.Data_Loader({'model_type': 'generator', 'dir_name': args.datapath})
    all_samples = dl.item
    items = dl.item_dict  # key is the original token, value is the mapped id, i.e., 0, 1, 2, 3...
    itemlist = items.values()
    item_size = len(items)  # the first token is 'unk'
    print "len(items)", item_size

    if args.padtoken in items:
        padtoken = items[args.padtoken]  # the padding token at the beginning of the sequence
    else:
        padtoken = sys.maxint

    max_predictions_per_seq = args.max_predictions_per_seq
    masked_lm_prob = args.masked_lm_prob

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(all_samples)))
    all_samples = all_samples[shuffle_indices]

    # Split train/test set
    dev_sample_index = -1 * int(args.tt_percentage * float(len(all_samples)))
    train_set, valid_set = all_samples[:dev_sample_index], all_samples[dev_sample_index:]

    if args.is_shuffle:
        train_set = shuffleseq(train_set, padtoken)

    model_para = {
        # if you change the parameters here, do not forget to change them in nextitrec_generate.py as well
        'item_size': item_size,
        'dilated_channels': 64,
        # if you use nextitnet_residual_block, you can use [1, 4, ];
        # if you use nextitnet_residual_block_one, tune it yourself -- [1, 2, 4, ] is suggested for a trial
        'dilations': [1, 4, 1, 4, ],
        'kernel_size': 3,
        'learning_rate': 0.001,
        'batch_size': 8,  # a proper batch size helps a lot, e.g. 256 or 512
        'iterations': 400,
        'max_position': args.max_position,  # maximum length for the positional embedding; must exceed the sequence length
        'has_positionalembedding': args.has_positionalembedding,
        'is_negsample': False,  # False denotes no negative sampling
        'top_k': args.top_k
    }

    itemrec = generator_recsys.GRec_Archi(model_para)
    itemrec.train_graph()
    optimizer = tf.train.AdamOptimizer(model_para['learning_rate'],
                                       beta1=args.beta1).minimize(itemrec.loss)
    itemrec.predict_graph(reuse=True)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver()

    numIters = 1
    for iter in range(model_para['iterations']):
        batch_no = 0
        batch_size = model_para['batch_size']
        while (batch_no + 1) * batch_size < train_set.shape[0]:
            start = time.time()
            item_batch = train_set[batch_no * batch_size:(batch_no + 1) * batch_size, :]
            # original input          1 2 3 4 5 6 7 8 9
            # item_batch[:, 1:-1]       2 3 4 5 6 7 8
            # output_tokens_batch       2 0 4 5 0 7 8
            # maskedpositions_batch   [1 4]
            # maskedlabels_batch      [3 6]
            output_tokens_batch, maskedpositions_batch, maskedlabels_batch, masked_lm_weights_batch = \
                create_masked_lm_predictions_frombatch(item_batch,
                                                       masked_lm_prob,
                                                       max_predictions_per_seq,
                                                       items=itemlist,
                                                       rng=None,
                                                       item_size=item_size)
            _, loss = sess.run(
                [optimizer, itemrec.loss],
                feed_dict={
                    itemrec.itemseq_output: item_batch[:, 1:],        # 2 3 4 5 6 7 8 9
                    itemrec.itemseq_input_en: output_tokens_batch,    # 1 2 0 4 5 0 7 8 9
                    itemrec.itemseq_input_de: item_batch,             # 1 2 3 4 5 6 7 8 9
                    itemrec.masked_position: maskedpositions_batch,   # [1 4]
                    itemrec.masked_items: maskedlabels_batch,         # [3 6]
                    itemrec.label_weights: masked_lm_weights_batch    # [1.0, 1.0, ...] (unused)
                })
            end = time.time()

            if numIters % args.eval_iter == 0:
                print "-------------------------------------------------------train1"
                print "LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                    loss, iter, batch_no, numIters, train_set.shape[0] / batch_size)
                print "TIME FOR BATCH", end - start
                # print "TIME FOR ITER (mins)", (end - start) * (train_set.shape[0] / batch_size) / 60.0

            if numIters % args.eval_iter == 0:
                print "-------------------------------------------------------test1"
                batch_no_valid = 0
                batch_size_valid = batch_size
                if (batch_no_valid + 1) * batch_size_valid < valid_set.shape[0]:
                    start = time.time()
                    item_batch = valid_set[batch_no_valid * batch_size_valid:(batch_no_valid + 1) * batch_size_valid, :]
                    output_tokens_batch, maskedpositions_batch, maskedlabels_batch, masked_lm_weights_batch = \
                        create_masked_lm_predictions_frombatch(item_batch,
                                                               masked_lm_prob,
                                                               max_predictions_per_seq,
                                                               items=itemlist,
                                                               rng=None,
                                                               item_size=item_size)
                    loss = sess.run(
                        [itemrec.loss],
                        feed_dict={
                            itemrec.itemseq_output: item_batch[:, 1:],
                            itemrec.itemseq_input_en: output_tokens_batch,
                            itemrec.itemseq_input_de: item_batch,
                            itemrec.masked_position: maskedpositions_batch,
                            itemrec.masked_items: maskedlabels_batch,
                            itemrec.label_weights: masked_lm_weights_batch
                        })
                    end = time.time()
                    print "LOSS: {}\tITER: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}".format(
                        loss, iter, batch_no_valid, numIters, valid_set.shape[0] / batch_size_valid)
                    print "TIME FOR BATCH", end - start

            batch_no += 1

            if numIters % args.eval_iter == 0:
                batch_no_test = 0
                batch_size_test = batch_size * 1
                curr_preds_5, rec_preds_5, ndcg_preds_5 = [], [], []
                curr_preds_20, rec_preds_20, ndcg_preds_20 = [], [], []
                while (batch_no_test + 1) * batch_size_test < valid_set.shape[0]:
                    # evaluate only a few batches early on, a larger sample later
                    if numIters / args.eval_iter < 20:
                        if batch_no_test > 10:
                            break
                    else:
                        if batch_no_test > 100:
                            break
                    item_batch = valid_set[batch_no_test * batch_size_test:(batch_no_test + 1) * batch_size_test, :]
                    # output_tokens_batch, maskedpositions_batch, maskedlabels_batch = create_masked_predictions_frombatch(item_batch)
                    [probs] = sess.run(
                        [itemrec.log_probs],
                        feed_dict={
                            itemrec.itemseq_input_en: item_batch[:, 0:-1],  # 1 2 3 4 5 6 7 8
                            itemrec.itemseq_input_de: item_batch[:, 0:-1],  # 1 2 3 4 5 6 7 8
                            # itemrec.itemseq_input_en: item_batch,
                            # itemrec.itemseq_input_de: item_batch,
                        })
                    for bi in range(probs.shape[0]):
                        pred_items_5 = utils.sample_top_k(probs[bi], top_k=args.top_k)  # top_k=5
                        pred_items_20 = utils.sample_top_k(probs[bi], top_k=args.top_k + 15)
                        true_item = item_batch[bi][-1]
                        predictmap_5 = {ch: i for i, ch in enumerate(pred_items_5)}
                        predictmap_20 = {ch: i for i, ch in enumerate(pred_items_20)}

                        rank_5 = predictmap_5.get(true_item)
                        rank_20 = predictmap_20.get(true_item)
                        if rank_5 is None:
                            curr_preds_5.append(0.0)
                            rec_preds_5.append(0.0)
                            ndcg_preds_5.append(0.0)
                        else:
                            curr_preds_5.append(1.0 / (rank_5 + 1))             # MRR@5
                            rec_preds_5.append(1.0)                             # HIT@5
                            ndcg_preds_5.append(1.0 / math.log(rank_5 + 2, 2))  # NDCG@5
                        if rank_20 is None:
                            curr_preds_20.append(0.0)
                            rec_preds_20.append(0.0)
                            ndcg_preds_20.append(0.0)
                        else:
                            curr_preds_20.append(1.0 / (rank_20 + 1))             # MRR@20
                            rec_preds_20.append(1.0)                              # HIT@20
                            ndcg_preds_20.append(1.0 / math.log(rank_20 + 2, 2))  # NDCG@20

                    batch_no_test += 1
                    if numIters / args.eval_iter < 20:
                        if batch_no_test == 10:
                            print "BATCH_NO: {}".format(batch_no_test)
                            print "mrr_5:", sum(curr_preds_5) / float(len(curr_preds_5)), \
                                "mrr_20:", sum(curr_preds_20) / float(len(curr_preds_20)), \
                                "hit_5:", sum(rec_preds_5) / float(len(rec_preds_5)), \
                                "hit_20:", sum(rec_preds_20) / float(len(rec_preds_20)), \
                                "ndcg_5:", sum(ndcg_preds_5) / float(len(ndcg_preds_5)), \
                                "ndcg_20:", sum(ndcg_preds_20) / float(len(ndcg_preds_20))
                    else:
                        if batch_no_test == 100:
                            print "BATCH_NO: {}".format(batch_no_test)
                            print "mrr_5:", sum(curr_preds_5) / float(len(curr_preds_5)), \
                                "mrr_20:", sum(curr_preds_20) / float(len(curr_preds_20)), \
                                "hit_5:", sum(rec_preds_5) / float(len(rec_preds_5)), \
                                "hit_20:", sum(rec_preds_20) / float(len(rec_preds_20)), \
                                "ndcg_5:", sum(ndcg_preds_5) / float(len(ndcg_preds_5)), \
                                "ndcg_20:", sum(ndcg_preds_20) / float(len(ndcg_preds_20))

            numIters += 1
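# ---------------------------------------------------------------------------
# Illustrative sketch (not the real create_masked_lm_predictions_frombatch; a
# hypothetical single-sequence stand-in assuming mask id 0 and numpy only): it
# mirrors the behaviour sketched in the comments above -- randomly replace a
# fraction of positions (never the first token) with the mask id and return the
# corrupted sequence plus the masked positions, their true labels, and unit
# loss weights.
import random
import numpy as np

def mask_sequence(seq, masked_lm_prob=0.4, max_predictions=50, mask_id=0, rng=None):
    rng = rng or random.Random()
    n_mask = min(max_predictions, len(seq) - 1,
                 max(1, int(round(len(seq) * masked_lm_prob))))
    positions = sorted(rng.sample(range(1, len(seq)), n_mask))  # never mask position 0
    corrupted = np.array(seq)
    labels = [seq[p] for p in positions]
    corrupted[positions] = mask_id
    weights = [1.0] * len(positions)
    return corrupted, positions, labels, weights

# mask_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], masked_lm_prob=0.2)
# -> e.g. ([1 2 0 4 5 0 7 8 9], [2, 5], [3, 6], [1.0, 1.0])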