def __init__(self):
    """Load the proxy list and the names list from disk.

    Reads ``self.proxy_filename`` and ``self.names_filename`` (assumed to be
    set on the class before this runs — TODO confirm against the class body)
    and populates ``self._proxy_list_file`` and ``self.dataCache.names_list``
    with the non-empty lines of each file.
    """
    self.logger = logging.getLogger(__name__)
    try:
        if os.path.getsize(self.proxy_filename) and utils.file_len(self.proxy_filename) > 0:
            # Keep only non-empty lines; close the file deterministically
            # (the original leaked the handle).
            with open(self.proxy_filename) as fp:
                self._proxy_list_file = list(filter(None, (line.rstrip() for line in fp)))
        else:
            self.is_use_proxy = False
        self.dataCache.names_list = []
        if os.path.getsize(self.names_filename) and utils.file_len(self.names_filename) > 0:
            with open(self.names_filename, encoding="utf8") as fp:
                self.dataCache.names_list = list(filter(None, (line.rstrip() for line in fp)))
        else:
            self.logger.error("Please input a names list.")
        # NOTE: the original re-assigned self.logger a second time here; the
        # duplicate assignment was a no-op and has been removed.
    except IOError:
        # Best-effort: a missing/unreadable file leaves the defaults in place.
        print("proxy/name file I/O error")
def filter(cls, field, value, infile=None, outfile=None):
    """Filters a GTF file based on a column in the data.

    :param field: Str field to examine.
    :type field: str.
    :param value: Criteria to filter column on.
    :type value: str.
    :param infile: Optional input path; defaults to stdin.
    :param outfile: Optional output path; defaults to stdout.
    """
    # Setting output file (default to stdout)
    log('Opening GTF output file: %s', outfile or 'stdout')
    fo = sys.stdout
    if outfile is not None:
        fo = open(outfile, 'wb')
    # Perform different loop depending on input file
    log('Opening GTF input file: %s', infile or 'stdin')
    fi = sys.stdin
    count = 0
    if infile is not None:
        # Infile was provided
        try:
            fi = open(infile, 'rb')
        except IOError:
            # BUG FIX: the original swallowed the error and fell through to
            # reading stdin with `infile` still set; surface the failure.
            sys.stderr.write('Cannot open file %s' % infile)
            raise
        log('Estimating time to compute...')
        flen = file_len(infile)
        log('CSV reader starting...')
        csvfile = csv.reader(fi, delimiter='\t')
        log('Beginning import')
        count = 0
        prev = None
        for row in csvfile:
            percent = int(float(count) / float(flen) * 100)
            if len(row) == 9:  # well-formed GTF rows have exactly 9 columns
                gtf = GTF(row)
                if prev != percent:
                    sys.stderr.write("\rPercent Complete: %d%%" % percent)
                    sys.stderr.flush()
                # BUG FIX: getattr replaces eval('gtf.' + field), which
                # executed arbitrary code from the `field` argument.
                if getattr(gtf, field) == value:
                    fo.write('\t'.join(row) + '\n')
            count += 1
            prev = percent
        fi.close()
    else:
        count = 0
        interval = 100
        for line in sys.stdin:
            gtf = GTF(line.strip('\n').split('\t'))
            if getattr(gtf, field) == value:
                fo.write(line)
            if count % interval == 0:
                sys.stderr.write("\rLines Read: %d" % count)
                sys.stderr.flush()
            count += 1
        # BUG FIX: do not close sys.stdin (the original called fi.close() here).
    # BUG FIX: only close the output handle we opened ourselves.
    if fo is not sys.stdout:
        fo.close()
    log('\nFinished filtering file')
def __init__(self):
    """Initialise parser state and pre-count the lines of the room map file."""
    # Parsing bookkeeping.
    self.last_line = ""
    self.room_count = 0
    self.lineLimit = 50
    # Room queues at the various parsing stages.
    self.in_house_rooms = []  # Only rooms being parsed
    self.open_rooms = []
    self.enqueued_rooms = []
    # Total line count of the source map, used elsewhere as a progress bound.
    self.lines_in_file = utils.file_len("maps/rooms.txt")
def set_data():
    """Append the posted values to a per-section CSV, keeping at most 60 rows.

    Reads ``section_id`` and ``values`` from the request and maintains a
    sliding window: once the file reaches 60 lines, the oldest line is
    dropped before the new one is appended.
    """
    section_id = request.values.get("section_id")
    path = "current_dataset/" + section_id + ".csv"
    if file_len(path) == 60:
        # BUG FIX: the original tried to write() to a handle opened in read
        # mode (and interleaved reads/writes on it), which raises at runtime.
        # Read the survivors first, then rewrite the file.
        with open(path) as fh:
            rows = fh.readlines()[1:]  # drop the oldest row
        with open(path, "w") as fh:
            fh.writelines(rows)
            fh.write(request.values.get("values"))
    else:
        with open(path, "a") as fh:
            fh.write(request.values.get("values"))
    return "OK"
def extract_image_urls(source, save_loc='image_urls.csv', batch_size=20000):
    """Stream a JSON-lines dump and save extracted image URLs in batches.

    :param source: path to the JSON-lines input file.
    :param save_loc: CSV path the URL batches are appended to.
    :param batch_size: number of articles buffered per flush.
    """
    articles = []
    num = 0
    _max = file_len(source)
    print(_max)
    with open(source, 'rb') as file_reader:
        for article in jl.reader(file_reader):
            articles.append(article)
            num += 1
            if num % batch_size == 0:
                # Flush a full batch and reset the buffer.
                save_as_csv(get_image_url(articles), path=save_loc)
                print('Processed {} of {}'.format(num, _max))
                articles = []
            if num == _max:
                print('Processed {} of {}'.format(num, _max))
                print('Finished extracting URLs')
    # BUG FIX: only flush the remainder when there is one — the original
    # unconditionally saved here, emitting an empty batch when the total
    # was an exact multiple of batch_size.
    if articles:
        save_as_csv(get_image_url(articles), path=save_loc)
def image_fetcher(source, save_loc=os.curdir, start_index=0, batch_size=50, _max=None, max_retries=1):
    """Fetch images referenced by `source` in fixed-size batches.

    :param source: path whose line count bounds the iteration when `_max` is None.
    :param save_loc: directory images are written to.
    :param start_index: first line index to process.
    :param batch_size: lines handed to get_images per call.
    :param _max: optional count of lines to process from start_index.
    :param max_retries: retry depth forwarded to get_images.
    """
    const_start = start_index
    # `_max` is always concrete after this line, so the original
    # `elif _max is None:` branch inside the loop was dead code — removed.
    _max = int(const_start + _max) if _max is not None else file_len(source)
    start_of_process = time.time()
    total = 0
    while True:
        start_time = time.time()
        # BUG FIX: `>=` instead of `==` — when batch_size does not divide the
        # range evenly the old equality test never fired and the loop ran
        # forever past the end of the file.
        if start_index >= _max:
            return
        get_images(filepath=source, save_loc=save_loc, batch_size=batch_size,
                   start_index=start_index, _max=_max, recursion_depth=max_retries)
        start_index += batch_size
        end_time = time.time()
        total_time = end_time - start_of_process
        total += batch_size
        print('Processed {} in {:.3g} seconds'.format(batch_size, end_time - start_time))
        print('Progress: {} in ~{:.3g}'.format(total, int(total_time) / 60))
        print(
            '---------------------------------------------------------------------'
        )
def main():
    """Convert the TWPC JSON-lines corpus into CSV batches.

    Streams articles from ``twpc_helper.source_path``, enriches/normalises
    each record, and flushes every ``twpc_helper.batch_size`` articles (and
    at end-of-file) via ``save_as_csv``.
    """
    # Never clobber an existing output file; `options` picks a new path.
    if os.path.exists(twpc_helper.save_path):
        twpc_helper.save_path = options(twpc_helper.save_path)
        print(f"New path: {twpc_helper.save_path}")
    articles = []
    end = file_len(twpc_helper.source_path)  # total records, for progress/final flush
    with open(twpc_helper.source_path, "rb") as f:
        for i, article in enumerate(jl_reader(f), start=1):
            article["contents"] = list(filter(None, article["contents"]))
            article["category"], article["subcategory"] = get_categories(article)
            # Ugly code, but significantly speeds up the process if a category is set:
            # skip articles outside the requested category, but still honour the
            # periodic flush/progress report before skipping.
            if twpc_helper.category is not None and not article["category"] == twpc_helper.category:
                if i % twpc_helper.batch_size == 0 or i == end:
                    if len(articles) > 0:
                        save_as_csv(articles)
                        articles = []
                    print(f"Progress: {i} / {end}.")
                continue
            article["text"] = stringify_contents(article)
            article["date"], article["time"] = unix_to_dt(article)
            article["image_url"], article["image_caption"] = get_image_url_and_caption(article)
            article["author_bio"] = get_author_bio(article)
            # Missing titles become NaN so downstream pandas handling is uniform.
            if article["title"] is None or article["title"] == "":
                article["title"] = np.nan
            # An empty author marks a compilation piece — presumably resolved by
            # the helper; verify against get_author_if_compilation.
            if article["author"] == "":
                article["author"], article['subtype'] = get_author_if_compilation(article)
            else:
                article["subtype"] = "standalone"
            discard_properties(article)
            articles.append(article)
            # Flush a full batch, or the remainder at end-of-file.
            if i % twpc_helper.batch_size == 0 or i == end:
                save_as_csv(articles)
                articles = []
                print(f"Progress: {i} / {end}.")
def main():
    """IRGAN-style adversarial training of a recommender (Python 2 / TF1).

    Alternates discriminator epochs (on negatives sampled by the generator)
    with policy-gradient generator epochs, logging precision to gen_log.txt
    and checkpointing the best generator.
    """
    print "load model..."
    # Load pretrained (DNS) user/item parameters to warm-start the generator.
    param = cPickle.load(open(workdir + "model_dns_ori.pkl"))
    generator = GEN(ITEM_NUM, USER_NUM, EMB_DIM, lamda=0.0 / BATCH_SIZE, param=param, initdelta=INIT_DELTA,
                    learning_rate=0.001)
    discriminator = DIS(ITEM_NUM, USER_NUM, EMB_DIM, lamda=0.1 / BATCH_SIZE, param=None, initdelta=INIT_DELTA,
                        learning_rate=0.001)
    # Grow GPU memory on demand instead of reserving it all upfront.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    # Baseline evaluation of both models before adversarial training starts.
    print "gen ", simple_test(sess, generator)
    print "dis ", simple_test(sess, discriminator)
    dis_log = open(workdir + 'dis_log.txt', 'w')
    gen_log = open(workdir + 'gen_log.txt', 'w')
    # minimax training: D learns to separate true items from G's samples,
    # G learns (via policy gradient) to fool D.
    best = 0.  # best precision@k seen so far, used for checkpointing
    for epoch in range(15):
        if epoch >= 0:
            # ---- Train D on generator-sampled negatives ----
            for d_epoch in range(100):
                if d_epoch % 5 == 0:
                    # Refresh the D training file by sampling from G's current
                    # distribution, then re-count its lines for batching.
                    generate_for_d(sess, generator, DIS_TRAIN_FILE)
                    train_size = ut.file_len(DIS_TRAIN_FILE)
                index = 1
                while True:
                    if index > train_size:
                        break
                    # Full batch while one fits, then a final partial batch.
                    if index + BATCH_SIZE <= train_size + 1:
                        input_user, input_item, input_label = ut.get_batch_data(DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        input_user, input_item, input_label = ut.get_batch_data(DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE
                    _ = sess.run(discriminator.d_updates,
                                 feed_dict={discriminator.u: input_user, discriminator.i: input_item,
                                            discriminator.label: input_label})
            # ---- Train G with policy gradient, using D's reward ----
            for g_epoch in range(50):  # 50
                for u in user_pos_train:
                    sample_lambda = 0.2
                    pos = user_pos_train[u]
                    # G's logits over all items for this user -> softmax distribution.
                    rating = sess.run(generator.all_logits, {generator.u: u})
                    exp_rating = np.exp(rating)
                    prob = exp_rating / np.sum(exp_rating)  # prob is generator distribution p_\theta
                    # Mix in extra mass on the known positives (importance sampling proposal).
                    pn = (1 - sample_lambda) * prob
                    pn[pos] += sample_lambda * 1.0 / len(pos)
                    # Now, pn is the Pn in importance sampling, prob is generator distribution p_\theta
                    sample = np.random.choice(np.arange(ITEM_NUM), 2 * len(pos), p=pn)
                    ###########################################################################
                    # Get reward and adapt it with importance sampling
                    ###########################################################################
                    reward = sess.run(discriminator.reward, {discriminator.u: u, discriminator.i: sample})
                    # Re-weight D's reward by p_theta / Pn to correct for sampling from pn.
                    reward = reward * prob[sample] / pn[sample]
                    ###########################################################################
                    # Update G
                    ###########################################################################
                    _ = sess.run(generator.gan_updates,
                                 {generator.u: u, generator.i: sample, generator.reward: reward})
                # Evaluate and log after every generator epoch.
                result = simple_test(sess, generator)
                print "epoch ", epoch, "gen: ", result
                buf = '\t'.join([str(x) for x in result])
                gen_log.write(str(epoch) + '\t' + buf + '\n')
                gen_log.flush()
                # result[1] is presumably precision@5 — verify against simple_test.
                p_5 = result[1]
                if p_5 > best:
                    print 'best: ', result
                    best = p_5
                    generator.save_model(sess, "ml-100k/gan_generator.pkl")
    gen_log.close()
    dis_log.close()
def main():
    """IRGAN adversarial training, 300-epoch variant (Python 2 / TF1).

    Same minimax scheme as the 15-epoch variant: D trains on G-sampled
    negatives, G trains by policy gradient on D's reward.
    """
    print "load model..."
    # NOTE(review): `param` is loaded but never used — both models are built
    # with param=None below, so the generator starts from scratch here.
    param = cPickle.load(open(workdir + "model_dns_ori.pkl"))
    generator = GEN(ITEM_NUM, USER_NUM, EMB_DIM, lamda=0.0 / BATCH_SIZE, param=None, initdelta=INIT_DELTA,
                    learning_rate=0.001)
    discriminator = DIS(ITEM_NUM, USER_NUM, EMB_DIM, lamda=0.1 / BATCH_SIZE, param=None, initdelta=INIT_DELTA,
                        learning_rate=0.001)
    # Allocate GPU memory on demand.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    print "gen ", simple_test(sess, generator)
    print "dis ", simple_test(sess, discriminator)
    dis_log = open(workdir + 'dis_log.txt', 'w')
    gen_log = open(workdir + 'gen_log.txt', 'w')
    # minimax training
    best = 0.  # best metric seen so far (checkpoint trigger)
    for epoch in range(300):
        if epoch >= 0:
            # ---- Train D ----
            for d_epoch in range(100):
                if d_epoch % 5 == 0:
                    # Resample D's training file from G every 5 sub-epochs.
                    generate_for_d(sess, generator, DIS_TRAIN_FILE)
                    train_size = ut.file_len(DIS_TRAIN_FILE)
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_user, input_item, input_label = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        # Final partial batch.
                        input_user, input_item, input_label = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE
                    _ = sess.run(discriminator.d_updates,
                                 feed_dict={
                                     discriminator.u: input_user,
                                     discriminator.i: input_item,
                                     discriminator.label: input_label
                                 })
            # ---- Train G ----
            for g_epoch in range(50):  # 50
                for u in user_pos_train:
                    sample_lambda = 0.2
                    pos = user_pos_train[u]
                    rating = sess.run(generator.all_logits, {generator.u: u})
                    exp_rating = np.exp(rating)
                    prob = exp_rating / np.sum(
                        exp_rating)  # prob is generator distribution p_\theta
                    # Importance-sampling proposal: mix extra mass onto positives.
                    pn = (1 - sample_lambda) * prob
                    pn[pos] += sample_lambda * 1.0 / len(pos)
                    # Now, pn is the Pn in importance sampling, prob is generator distribution p_\theta
                    sample = np.random.choice(np.arange(ITEM_NUM), 2 * len(pos), p=pn)
                    ###########################################################################
                    # Get reward and adapt it with importance sampling
                    ###########################################################################
                    reward = sess.run(discriminator.reward, {
                        discriminator.u: u,
                        discriminator.i: sample
                    })
                    # Correct D's reward for sampling from pn instead of p_theta.
                    reward = reward * prob[sample] / pn[sample]
                    ###########################################################################
                    # Update G
                    ###########################################################################
                    _ = sess.run(
                        generator.gan_updates, {
                            generator.u: u,
                            generator.i: sample,
                            generator.reward: reward
                        })
                result = simple_test(sess, generator)
                print "epoch ", epoch, "gen: ", result
                buf = '\t'.join([str(x) for x in result])
                gen_log.write(str(epoch) + '\t' + buf + '\n')
                gen_log.flush()
                # NOTE(review): this variant keys the checkpoint on result[4]
                # (the sibling variants use result[1]) — confirm which metric
                # index simple_test puts where.
                p_5 = result[4]
                if p_5 > best:
                    print 'best: ', result
                    best = p_5
                    generator.save_model(sess, "ml-100k/gan_generator.pkl")
    gen_log.close()
    dis_log.close()
def main():
    """IRGAN adversarial training of a recommender (Python 3 / TF1).

    Alternates discriminator epochs on generator-sampled negatives with
    policy-gradient generator epochs; logs per-epoch metrics to gen_log.txt.
    """
    print("loading model...")
    generator = GEN(ITEM_NUM, USER_NUM, EMB_DIM, lamda=0.0 / BATCH_SIZE, param=None, initdelta=INIT_DELTA,
                    learning_rate=0.001)
    discriminator = DIS(ITEM_NUM, USER_NUM, EMB_DIM, lamda=0.1 / BATCH_SIZE, param=None, initdelta=INIT_DELTA,
                        learning_rate=0.001)
    # Allocate GPU memory on demand.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        # Baseline metrics before adversarial training.
        print("gen ", simple_test(sess, generator))
        print("dis ", simple_test(sess, discriminator))
        dis_log = open(workdir + 'dis_log.txt', 'w')
        gen_log = open(workdir + 'gen_log.txt', 'w')
        best = 0.  # best precision seen so far
        for epoch in range(15):
            if epoch >= 0:
                # ---- Train D ----
                for d_epoch in range(100):
                    if d_epoch % 5 == 0:
                        # Refresh D's training file from G every 5 sub-epochs.
                        generate_for_d(sess, generator, DIS_TRAIN_FILE)
                        train_size = ut.file_len(DIS_TRAIN_FILE)
                    index = 1
                    while True:
                        if index > train_size:
                            break
                        if index + BATCH_SIZE <= train_size + 1:
                            input_user, input_item, input_label = ut.get_batch_data(DIS_TRAIN_FILE, index, BATCH_SIZE)
                        else:
                            # Final partial batch.
                            input_user, input_item, input_label = ut.get_batch_data(DIS_TRAIN_FILE, index, train_size - index + 1)
                        index += BATCH_SIZE
                        _ = sess.run(discriminator.d_updates, feed_dict={
                            discriminator.u: input_user, discriminator.i: input_item, discriminator.label: input_label
                        })
                # ---- Train G via policy gradient ----
                for g_epoch in range(50):
                    for u in user_pos_train:
                        sample_lambda = 0.2
                        pos = user_pos_train[u]
                        # Softmax over G's logits = generator distribution p_theta.
                        rating = sess.run(generator.all_logits, {generator.u: u})
                        exp_rating = np.exp(rating)
                        prob = exp_rating / np.sum(exp_rating)
                        # Importance-sampling proposal pn: extra mass on positives.
                        pn = (1 - sample_lambda) * prob
                        pn[pos] += sample_lambda * 1.0 / len(pos)
                        sample = np.random.choice(np.arange(ITEM_NUM), 2 * len(pos), p=pn)
                        # D's reward, corrected by p_theta / pn for the proposal.
                        reward = sess.run(discriminator.reward, {discriminator.u: u, discriminator.i: sample})
                        reward = reward * prob[sample] / pn[sample]
                        _ = sess.run(generator.gan_updates,
                                     {generator.u: u, generator.i: sample, generator.reward: reward})
                    # Evaluate and log after each generator epoch.
                    result = simple_test(sess, generator)
                    print("epoch ", epoch, "gen: ", result)
                    buf = '\t'.join([str(x) for x in result])
                    gen_log.write(str(epoch) + '\t' + buf + '\n')
                    gen_log.flush()
                    # result[1] is presumably precision@5 — verify against simple_test.
                    p_5 = result[1]
                    if p_5 > best:
                        print('best: ', result)
                        best = p_5
        gen_log.close()
        dis_log.close()
def main():
    """IRGAN learning-to-rank adversarial training (PyTorch, multi-GPU).

    Alternates discriminator epochs on generator-sampled documents with
    policy-gradient generator epochs; tracks the best p@5 / ndcg@5 and prints
    a final evaluation sweep.
    """
    p_best_val = 0.0
    ndcg_best_val = 0.0
    for epoch in range(30):
        if epoch >= 0:
            # ---- Train D on generator-sampled negatives ----
            print('Training D ...')
            for d_epoch in range(100):
                if d_epoch % 30 == 0:
                    # Refresh D's training file from G, then re-count lines.
                    generate_for_d(DIS_TRAIN_FILE)
                    train_size = ut.file_len(DIS_TRAIN_FILE)
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_pos, input_neg = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        # Final partial batch.
                        input_pos, input_neg = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE
                    # Concatenate positives and negatives with 1/0 labels.
                    pred_data = []
                    pred_data.extend(input_pos)
                    pred_data.extend(input_neg)
                    pred_data = np.asarray(pred_data)
                    pred_data_label = [1.0] * len(input_pos)
                    pred_data_label.extend([0.0] * len(input_neg))
                    pred_data_label = np.asarray(pred_data_label)
                    # D loss plus L2 weight decay on its parameters.
                    loss_d = discriminator(torch.tensor(pred_data), torch.tensor(pred_data_label)) \
                        + WEIGHT_DECAY * (criterion(D_w1) + criterion(D_w2) + criterion(D_b1) + criterion(D_b2))
                    optimizer_D.zero_grad()
                    loss_d.backward()
                    optimizer_D.step()
                print("\r[D Epoch %d/%d] [loss: %f]" % (d_epoch, 100, loss_d.item()))
            # ---- Train G via policy gradient on D's reward ----
            print('Training G ...')
            for g_epoch in range(30):
                num = 0
                for query in query_pos_train.keys():
                    pos_list = query_pos_train[query]
                    pos_set = set(pos_list)
                    all_list = query_index_url[query]
                    all_list_feature = [
                        query_url_feature[query][url] for url in all_list
                    ]
                    all_list_feature = np.asarray(all_list_feature)
                    # pdb.set_trace()
                    # Score every candidate document with G on the GPU.
                    with torch.cuda.device(device[0]):
                        all_list_score = generator.module.pred_score(
                            torch.tensor(all_list_feature).cuda())
                    all_list_score = all_list_score.detach().cpu().numpy()
                    # softmax for all (max-shifted for numerical stability)
                    exp_rating = np.exp(all_list_score - np.max(all_list_score))
                    prob = exp_rating / np.sum(exp_rating)
                    # Importance-sampling proposal: extra mass on known positives.
                    prob_IS = prob * (1.0 - LAMBDA)
                    for i in range(len(all_list)):
                        if all_list[i] in pos_set:
                            prob_IS[i] += (LAMBDA / (1.0 * len(pos_list)))
                    # pdb.set_trace()
                    # Sample 5x the positive count from the proposal.
                    choose_index = np.random.choice(np.arange(len(all_list)),
                                                    [5 * len(pos_list)],
                                                    p=prob_IS.reshape(-1, ))
                    choose_list = np.array(all_list)[choose_index]
                    choose_feature = [
                        query_url_feature[query][url] for url in choose_list
                    ]
                    # Importance weights p_theta / proposal.
                    choose_IS = np.array(prob)[choose_index] / np.array(
                        prob_IS)[choose_index]
                    choose_index = np.asarray(choose_index)
                    choose_feature = np.asarray(choose_feature)
                    choose_IS = np.asarray(choose_IS)
                    # Reward from D; detached so G's update doesn't backprop into D.
                    with torch.cuda.device(device[0]):
                        choose_reward = discriminator.module.get_reward(
                            torch.tensor(choose_feature).cuda())
                    choose_reward.detach_()
                    # G loss plus L2 weight decay on its parameters.
                    loss_g = generator(torch.tensor(all_list_feature).cuda(), torch.tensor(choose_index), choose_reward, torch.tensor(choose_IS)) \
                        + WEIGHT_DECAY * (criterion(G_w1) + criterion(G_w2) + criterion(G_b1) + criterion(G_b2))
                    # pdb.set_trace()
                    optimizer_G.zero_grad()
                    loss_g.backward()
                    optimizer_G.step()
                    num += 1
                    # if num == 200:
                    #     pdb.set_trace()
                print("\r[G Epoch %d/%d] [loss: %f]" % (g_epoch, 30, loss_g.item()))
                # pdb.set_trace()
                # Track the best p@5, tie-broken by ndcg@5.
                p_5 = precision_at_k(device, generator, query_pos_test,
                                     query_pos_train, query_url_feature, k=5)
                ndcg_5 = ndcg_at_k(device, generator, query_pos_test,
                                   query_pos_train, query_url_feature, k=5)
                if p_5 > p_best_val:
                    p_best_val = p_5
                    ndcg_best_val = ndcg_5
                    print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)
                elif p_5 == p_best_val:
                    if ndcg_5 > ndcg_best_val:
                        ndcg_best_val = ndcg_5
                        print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)
    #validation
    # p_5 = precision_at_k(val_loader, 5)
    # if p_5 > p_best_val:
    #     p_best_val = p_5
    #     print("Best:", "gen p@5 ", p_5)
    #     torch.save(recipe_emb.state_dict(), 'saved_models/recipe_emb_%d_%.3f.pth' % (epoch, p_5))
    #     param_num = 1
    #     for param in DG_param:
    #         torch.save(param, 'saved_models/param%d_%d_%.3f.pt' % (param_num, epoch, p_5))
    #         param_num += 1
    # Final evaluation sweep over several cutoffs.
    p_1_best = precision_at_k(device, generator, query_pos_test,
                              query_pos_train, query_url_feature, k=1)
    p_3_best = precision_at_k(device, generator, query_pos_test,
                              query_pos_train, query_url_feature, k=3)
    p_5_best = precision_at_k(device, generator, query_pos_test,
                              query_pos_train, query_url_feature, k=5)
    p_10_best = precision_at_k(device, generator, query_pos_test,
                               query_pos_train, query_url_feature, k=10)
    ndcg_1_best = ndcg_at_k(device, generator, query_pos_test,
                            query_pos_train, query_url_feature, k=1)
    ndcg_3_best = ndcg_at_k(device, generator, query_pos_test,
                            query_pos_train, query_url_feature, k=3)
    ndcg_5_best = ndcg_at_k(device, generator, query_pos_test,
                            query_pos_train, query_url_feature, k=5)
    ndcg_10_best = ndcg_at_k(device, generator, query_pos_test,
                             query_pos_train, query_url_feature, k=10)
    # map_best = MAP(sess, generator, query_pos_test, query_pos_train, query_url_feature)
    # mrr_best = MRR(sess, generator, query_pos_test, query_pos_train, query_url_feature)
    print("Best ", "p@1 ", p_1_best, "p@3 ", p_3_best, "p@5 ", p_5_best,
          "p@10 ", p_10_best)
    # NOTE(review): the last label below says "p@10" but prints ndcg_10_best —
    # looks like a copy/paste label typo in the output string.
    print("Best ", "ndcg@1 ", ndcg_1_best, "ndcg@3 ", ndcg_3_best, "ndcg@5 ",
          ndcg_5_best, "p@10 ", ndcg_10_best)
# Build the discriminator and one optimizer per model.
discriminator = Discriminator(
    ITEM_NUM, USER_NUM, EMB_DIM, lamda=0.0 / BATCH_SIZE, param=None, initdelta=INIT_DELTA)
g_optimizer = torch.optim.SGD(
    generator.parameters(), lr=0.001, momentum=0.9)
# BUG FIX: this was assigned to g_optimizer a second time, so d_optimizer
# (used by the loop below) was never defined.
d_optimizer = torch.optim.SGD(
    discriminator.parameters(), lr=0.001, momentum=0.9)

for epoch in range(15):
    if epoch >= 0:
        for d_epoch in range(100):
            if d_epoch % 5 == 0:
                # Refresh D's training file from G, then re-count its lines.
                generate_for_d(generator, DIS_TRAIN_FILE)
                train_size = ut.file_len(DIS_TRAIN_FILE)
            index = 1
            while True:
                if index > train_size:
                    break
                if index + BATCH_SIZE <= train_size + 1:
                    users, items, labels = ut.get_batch_data(
                        DIS_TRAIN_FILE, index, BATCH_SIZE)
                else:
                    # Final partial batch.
                    users, items, labels = ut.get_batch_data(
                        DIS_TRAIN_FILE, index, train_size - index + 1)
                # BUG FIX: the increment was missing, so this loop never
                # advanced past the first batch (infinite loop).
                index += BATCH_SIZE
                loss_d = discriminator(users, items, labels)
                d_optimizer.zero_grad()
                loss_d.backward()
                d_optimizer.step()
def main():
    """IRGAN learning-to-rank adversarial training (TF1).

    Alternates discriminator epochs on generator-sampled documents with
    policy-gradient generator epochs, checkpoints the best generator, then
    reloads it for a final evaluation sweep.
    """
    discriminator = DIS(FEATURE_SIZE, HIDDEN_SIZE, WEIGHT_DECAY, D_LEARNING_RATE, param=None)
    generator = GEN(FEATURE_SIZE, HIDDEN_SIZE, WEIGHT_DECAY, G_LEARNING_RATE, temperature=TEMPERATURE, param=None)
    # Allocate GPU memory on demand.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())
    print('start adversarial training')
    p_best_val = 0.0
    ndcg_best_val = 0.0
    for epoch in range(30):
        if epoch >= 0:
            # G generate negative for D, then train D
            print('Training D ...')
            for d_epoch in range(100):
                if d_epoch % 30 == 0:
                    # Refresh D's training file from G, then re-count lines.
                    generate_for_d(sess, generator, DIS_TRAIN_FILE)
                    train_size = ut.file_len(DIS_TRAIN_FILE)
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_pos, input_neg = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        # Final partial batch.
                        input_pos, input_neg = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE
                    # Concatenate positives and negatives with 1/0 labels.
                    pred_data = []
                    pred_data.extend(input_pos)
                    pred_data.extend(input_neg)
                    pred_data = np.asarray(pred_data)
                    pred_data_label = [1.0] * len(input_pos)
                    pred_data_label.extend([0.0] * len(input_neg))
                    pred_data_label = np.asarray(pred_data_label)
                    _ = sess.run(discriminator.d_updates,
                                 feed_dict={
                                     discriminator.pred_data: pred_data,
                                     discriminator.pred_data_label: pred_data_label
                                 })
            # Train G
            print('Training G ...')
            for g_epoch in range(30):
                for query in query_pos_train.keys():
                    pos_list = query_pos_train[query]
                    pos_set = set(pos_list)
                    all_list = query_index_url[query]
                    all_list_feature = [
                        query_url_feature[query][url] for url in all_list
                    ]
                    all_list_feature = np.asarray(all_list_feature)
                    all_list_score = sess.run(
                        generator.pred_score,
                        {generator.pred_data: all_list_feature})
                    # softmax for all (max-shifted for numerical stability)
                    exp_rating = np.exp(all_list_score - np.max(all_list_score))
                    prob = exp_rating / np.sum(exp_rating)
                    # Importance-sampling proposal: extra mass on positives.
                    prob_IS = prob * (1.0 - LAMBDA)
                    for i in range(len(all_list)):
                        if all_list[i] in pos_set:
                            prob_IS[i] += (LAMBDA / (1.0 * len(pos_list)))
                    # Sample 5x the positive count from the proposal.
                    choose_index = np.random.choice(np.arange(len(all_list)),
                                                    [5 * len(pos_list)],
                                                    p=prob_IS)
                    choose_list = np.array(all_list)[choose_index]
                    choose_feature = [
                        query_url_feature[query][url] for url in choose_list
                    ]
                    # Importance weights p_theta / proposal.
                    choose_IS = np.array(prob)[choose_index] / np.array(
                        prob_IS)[choose_index]
                    choose_index = np.asarray(choose_index)
                    choose_feature = np.asarray(choose_feature)
                    choose_IS = np.asarray(choose_IS)
                    # D's reward drives G's policy-gradient update.
                    choose_reward = sess.run(
                        discriminator.reward,
                        feed_dict={discriminator.pred_data: choose_feature})
                    _ = sess.run(generator.g_updates,
                                 feed_dict={
                                     generator.pred_data: all_list_feature,
                                     generator.sample_index: choose_index,
                                     generator.reward: choose_reward,
                                     generator.important_sampling: choose_IS
                                 })
                # Track the best p@5 (tie-broken by ndcg@5) and checkpoint G.
                p_5 = precision_at_k(sess, generator, query_pos_test,
                                     query_pos_train, query_url_feature, k=5)
                ndcg_5 = ndcg_at_k(sess, generator, query_pos_test,
                                   query_pos_train, query_url_feature, k=5)
                if p_5 > p_best_val:
                    p_best_val = p_5
                    ndcg_best_val = ndcg_5
                    generator.save_model(sess, GAN_MODEL_BEST_FILE)
                    print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)
                elif p_5 == p_best_val:
                    if ndcg_5 > ndcg_best_val:
                        ndcg_best_val = ndcg_5
                        generator.save_model(sess, GAN_MODEL_BEST_FILE)
                        print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)
    sess.close()
    # Reload the best checkpoint into a fresh session for the final sweep.
    param_best = cPickle.load(open(GAN_MODEL_BEST_FILE))
    assert param_best is not None
    generator_best = GEN(FEATURE_SIZE, HIDDEN_SIZE, WEIGHT_DECAY, G_LEARNING_RATE,
                         temperature=TEMPERATURE, param=param_best)
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())
    p_1_best = precision_at_k(sess, generator_best, query_pos_test,
                              query_pos_train, query_url_feature, k=1)
    p_3_best = precision_at_k(sess, generator_best, query_pos_test,
                              query_pos_train, query_url_feature, k=3)
    p_5_best = precision_at_k(sess, generator_best, query_pos_test,
                              query_pos_train, query_url_feature, k=5)
    p_10_best = precision_at_k(sess, generator_best, query_pos_test,
                               query_pos_train, query_url_feature, k=10)
    ndcg_1_best = ndcg_at_k(sess, generator_best, query_pos_test,
                            query_pos_train, query_url_feature, k=1)
    ndcg_3_best = ndcg_at_k(sess, generator_best, query_pos_test,
                            query_pos_train, query_url_feature, k=3)
    ndcg_5_best = ndcg_at_k(sess, generator_best, query_pos_test,
                            query_pos_train, query_url_feature, k=5)
    ndcg_10_best = ndcg_at_k(sess, generator_best, query_pos_test,
                             query_pos_train, query_url_feature, k=10)
    map_best = MAP(sess, generator_best, query_pos_test, query_pos_train,
                   query_url_feature)
    mrr_best = MRR(sess, generator_best, query_pos_test, query_pos_train,
                   query_url_feature)
    print("Best ", "p@1 ", p_1_best, "p@3 ", p_3_best, "p@5 ", p_5_best,
          "p@10 ", p_10_best)
    # NOTE(review): the last label below says "p@10" but prints ndcg_10_best —
    # looks like a copy/paste label typo in the output string.
    print("Best ", "ndcg@1 ", ndcg_1_best, "ndcg@3 ", ndcg_3_best, "ndcg@5 ",
          ndcg_5_best, "p@10 ", ndcg_10_best)
    print("Best MAP ", map_best)
    print("Best MRR ", mrr_best)
def main():
    """IRGAN adversarial recommender training (Python 3 / TF1, managed session).

    Same scheme as the sibling variants: D trains on G-sampled negatives,
    G trains by policy gradient on D's importance-weighted reward.
    """
    print("loading model...")
    generator = GEN(ITEM_NUM, USER_NUM, EMB_DIM, lamda=0.0 / BATCH_SIZE,
                    param=None, initdelta=INIT_DELTA, learning_rate=0.001)
    discriminator = DIS(ITEM_NUM, USER_NUM, EMB_DIM, lamda=0.1 / BATCH_SIZE,
                        param=None, initdelta=INIT_DELTA, learning_rate=0.001)
    # Allocate GPU memory on demand.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        # Baseline metrics before adversarial training.
        print("gen ", simple_test(sess, generator))
        print("dis ", simple_test(sess, discriminator))
        dis_log = open(workdir + 'dis_log.txt', 'w')
        gen_log = open(workdir + 'gen_log.txt', 'w')
        best = 0.  # best precision seen so far
        for epoch in range(15):
            if epoch >= 0:
                # ---- Train D ----
                for d_epoch in range(100):
                    if d_epoch % 5 == 0:
                        # Refresh D's training file from G every 5 sub-epochs.
                        generate_for_d(sess, generator, DIS_TRAIN_FILE)
                        train_size = ut.file_len(DIS_TRAIN_FILE)
                    index = 1
                    while True:
                        if index > train_size:
                            break
                        if index + BATCH_SIZE <= train_size + 1:
                            input_user, input_item, input_label = ut.get_batch_data(
                                DIS_TRAIN_FILE, index, BATCH_SIZE)
                        else:
                            # Final partial batch.
                            input_user, input_item, input_label = ut.get_batch_data(
                                DIS_TRAIN_FILE, index, train_size - index + 1)
                        index += BATCH_SIZE
                        _ = sess.run(discriminator.d_updates,
                                     feed_dict={
                                         discriminator.u: input_user,
                                         discriminator.i: input_item,
                                         discriminator.label: input_label
                                     })
                # ---- Train G via policy gradient ----
                for g_epoch in range(50):
                    for u in user_pos_train:
                        sample_lambda = 0.2
                        pos = user_pos_train[u]
                        # Softmax over G's logits = generator distribution p_theta.
                        rating = sess.run(generator.all_logits, {generator.u: u})
                        exp_rating = np.exp(rating)
                        prob = exp_rating / np.sum(exp_rating)
                        # Importance-sampling proposal pn: extra mass on positives.
                        pn = (1 - sample_lambda) * prob
                        pn[pos] += sample_lambda * 1.0 / len(pos)
                        sample = np.random.choice(np.arange(ITEM_NUM), 2 * len(pos), p=pn)
                        # D's reward, corrected by p_theta / pn for the proposal.
                        reward = sess.run(discriminator.reward, {
                            discriminator.u: u,
                            discriminator.i: sample
                        })
                        reward = reward * prob[sample] / pn[sample]
                        _ = sess.run(
                            generator.gan_updates, {
                                generator.u: u,
                                generator.i: sample,
                                generator.reward: reward
                            })
                    # Evaluate and log after each generator epoch.
                    result = simple_test(sess, generator)
                    print("epoch ", epoch, "gen: ", result)
                    buf = '\t'.join([str(x) for x in result])
                    gen_log.write(str(epoch) + '\t' + buf + '\n')
                    gen_log.flush()
                    # result[1] is presumably precision@5 — verify against simple_test.
                    p_5 = result[1]
                    if p_5 > best:
                        print('best: ', result)
                        best = p_5
        gen_log.close()
        dis_log.close()
os.path.basename(os.path.normpath(csv))) return unique_events, events_to_sources csv = pd.read_csv(args.data, header=0, names=['timestamp', 'source', 'event'], dtype={ 'source': str, 'event': str }, parse_dates=[0], chunksize=args.chunksize) length = file_len(args.data) print('There are {} lines of data in {}.'.format(length, args.data)) t_start = time.time() for i, chunk in enumerate(csv): data = defaultdict(list) t0 = time.time() for row in chunk.itertuples(index=False): timestamp, source, event = row if pd.notnull(source): data[source].append((timestamp, event)) print('Demuxing chunk: {} seconds.'.format(pretty_float(time.time() - t0)),
def main():
    """IRGAN learning-to-rank adversarial training (framework-agnostic wrappers).

    D and G expose train()/get_prob()/get_reward-style methods directly;
    otherwise the scheme matches the sibling variants: D trains on G-sampled
    documents, G trains on D's importance-weighted reward.
    """
    # call discriminator, generator
    discriminator = DIS(FEATURE_SIZE, HIDDEN_SIZE, WEIGHT_DECAY, D_LEARNING_RATE)
    generator = GEN(FEATURE_SIZE, HIDDEN_SIZE, WEIGHT_DECAY, G_LEARNING_RATE, temperature=TEMPERATURE)
    print('start adversarial training')
    p_best_val = 0.0
    ndcg_best_val = 0.0
    for epoch in range(30):
        if epoch >= 0:
            # G generate negative for D, then train D
            print('Training D ...')
            for d_epoch in range(100):
                if d_epoch % 30 == 0:
                    # Refresh D's training file from G, then re-count lines.
                    generate_for_d(generator, DIS_TRAIN_FILE)
                    train_size = ut.file_len(DIS_TRAIN_FILE)
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_pos, input_neg = ut.get_batch_data(DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        # Final partial batch.
                        input_pos, input_neg = ut.get_batch_data(DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE
                    pred_data = []
                    # prepare pos and neg data
                    pred_data.extend(input_pos)
                    pred_data.extend(input_neg)
                    pred_data = np.asarray(pred_data)
                    # prepare pos and neg labels (1.0 for positives, 0.0 for negatives)
                    pred_data_label = [1.0] * len(input_pos)
                    pred_data_label.extend([0.0] * len(input_neg))
                    pred_data_label = np.asarray(pred_data_label)
                    # train D on this batch
                    discriminator.train(pred_data, pred_data_label)
            # Train G
            print('Training G ...')
            for g_epoch in range(10):
                start_time = time.time()
                print ('now_ G_epoch : ', str(g_epoch))
                for query in query_pos_train.keys():
                    pos_list = query_pos_train[query]
                    pos_set = set(pos_list)
                    # all candidate urls for this query
                    all_list = query_index_url[query]
                    # their feature vectors
                    all_list_feature = [query_url_feature[query][url] for url in all_list]
                    all_list_feature = np.asarray(all_list_feature)
                    # G generates a probability for every candidate url
                    prob = generator.get_prob(all_list_feature[np.newaxis, :])
                    prob = prob[0]
                    prob = prob.reshape([-1])
                    # importance sampling: shift some probability mass onto positives
                    prob_IS = prob * (1.0 - LAMBDA)
                    for i in range(len(all_list)):
                        if all_list[i] in pos_set:
                            prob_IS[i] += (LAMBDA / (1.0 * len(pos_list)))
                    # G samples 5x the positive-doc count from the proposal
                    choose_index = np.random.choice(np.arange(len(all_list)), [5 * len(pos_list)], p=prob_IS)
                    # chosen urls
                    choose_list = np.array(all_list)[choose_index]
                    # chosen features
                    choose_feature = [query_url_feature[query][url] for url in choose_list]
                    # importance weights: prob / proposal prob
                    # (loss => prob * reward * prob / importance sampling prob)
                    choose_IS = np.array(prob)[choose_index] / np.array(prob_IS)[choose_index]
                    choose_index = np.asarray(choose_index)
                    choose_feature = np.asarray(choose_feature)
                    choose_IS = np.asarray(choose_IS)
                    # reward from D ((prob - 0.5) * 2, per the helper's name)
                    choose_reward = discriminator.get_preresult(choose_feature)
                    # policy-gradient update of G
                    generator.train(choose_feature[np.newaxis, :], choose_reward.reshape([-1])[np.newaxis, :], choose_IS[np.newaxis, :])
                print("train end--- %s seconds ---" % (time.time() - start_time))
                # Track the best p@5 (tie-broken by ndcg@5) and checkpoint G.
                p_5 = precision_at_k(generator, query_pos_test, query_pos_train, query_url_feature, k=5)
                ndcg_5 = ndcg_at_k(generator, query_pos_test, query_pos_train, query_url_feature, k=5)
                if p_5 > p_best_val:
                    p_best_val = p_5
                    ndcg_best_val = ndcg_5
                    generator.save_model(GAN_MODEL_BEST_FILE)
                    print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)
                elif p_5 == p_best_val:
                    if ndcg_5 > ndcg_best_val:
                        ndcg_best_val = ndcg_5
                        generator.save_model(GAN_MODEL_BEST_FILE)
                        print("Best:", "gen p@5 ", p_5, "gen ndcg@5 ", ndcg_5)
# Build a per-user summary CSV: user_id, prediction (if any), record count.
# argv[1]: directory of per-user CSVs; argv[2]: predictions file; argv[3]: output CSV.
start_time = time.time()
user_directory = sys.argv[1]
user_prediction_file_path = sys.argv[2]
set_ids = set()
output_directory = sys.argv[3]
# Context-manage the output handle (the original left it to a manual close).
with open(output_directory, "a") as output_csv:
    with open(user_prediction_file_path) as open_file_object:
        for line in open_file_object:
            infos = line.rstrip("\n").split(',')
            user_id = infos[0]
            set_ids.add(user_id)
            user_prediction = infos[1]
            file_length = file_len(user_directory + '/' + user_id + '.csv')  # number of records for a given user
            output_csv.write(user_id + ',' + user_prediction + ',' + str(file_length) + '\n')
    # Write user_id that has not any prediction (i.e, not in accio's matches)
    # Iterate os.listdir directly (the identity list comprehension added nothing)
    # and avoid shadowing the builtin `file`.
    for filename in os.listdir(user_directory):
        cur_id = filename.split('.')[0]
        if cur_id not in set_ids:
            # print(cur_id)
            output_csv.write(cur_id + ',' + ',' + str(file_len(user_directory + '/' + filename)) + '\n')
# files = [filename for filename in os.listdir(user_directory)]
# output_csv = open(output_directory + '/' + 'users_data.csv', "a")
# for file in files:
def main():
    # Generate password masks from an input wordlist: optionally split the
    # list into per-line-length files, run the statistical mask algorithm on
    # each eligible split file, then log aggregate coverage/rejection stats.
    # NOTE(review): Python 2 code (bare `print` statements); formatting
    # reconstructed from a collapsed source line -- verify block nesting.
    start_time = time.time()
    options = get_arguments()
    logging.info("Options")
    logging.info(options)
    # Verbose flag toggles between DEBUG and ERROR logging.
    logging_level = logging.DEBUG if options["verbose"] else logging.ERROR
    print options['verbose']
    print logging_level
    logging.getLogger().setLevel(logging_level)
    filepath = options['filepath']
    clear_file(output_path)
    print "Start mask generation for file " + filepath
    if not os.path.isfile(filepath):
        print("File path {} does not exist. Exiting...".format(filepath))
        sys.exit()
    # split files if requested, get line counts
    if options['split']:
        total_lines, rejected_lines = split_files(filepath, options["max_line_length"])
    else:
        # Splits already exist from a previous run; just count lines.
        total_lines = file_len(filepath)
        rejected_lines = file_len(split_path + "/rejected_lines")
    all_masks = []
    cumulated_generated_space = 0
    treated_lines = 0
    # only open split files of correct length
    for filename in os.listdir(split_path):
        if filename == "rejected_lines":
            continue
        # Split files appear to be named "file_<length>"; skip overlong ones.
        if int(filename.split("file_")[1]) <= options['max_line_length']:
            with open(os.path.join(split_path, filename), 'r') as fp:
                # lines_read, generated_space, masks = learning_algorithm(fp)
                lines_read, generated_space, masks = stat_algorithm(
                    fp, options["max_mask_combinations"],
                    options["mask_rejection_ratio"])
                treated_lines += lines_read
                cumulated_generated_space += generated_space
                print_status(lines_read, len(masks), cumulated_generated_space)
                print_masks_to_file(masks, lines_read, generated_space)
                all_masks += masks
                fp.close()  # redundant: the `with` block already closes fp
            logging.info("--- %s seconds ---" % (time.time() - start_time))
    else:
        # for/else: no `break` exists in the loop, so this always runs after
        # the loop finishes -- effectively plain post-loop code.
        # NOTE(review): interpreted as for/else; confirm against the original.
        total_hits = 0
        total_generated_space = 0
        for mask in all_masks:
            total_hits += mask.hitcount
            total_generated_space += mask.generated_space
        else:
            # Summary statistics over the whole run.
            rejection_ratio = rejected_lines / float(total_lines) * 100
            coverage_ratio = total_hits / float(total_lines) * 100
            logging.info("Total Lines : " + str(total_lines))
            logging.info("Total Rejected Lines : " + str(rejected_lines))
            logging.info("Rejection Ratio : " + str(rejection_ratio))
            logging.info("\n")
            logging.info("Total treated lines : " + str(treated_lines))
            logging.info("Total hits : " + str(total_hits))
            logging.info("Coverage Ratio: {0:.2f}%".format(coverage_ratio))
            logging.info("Generated space " + str(total_generated_space))
            print "Masks Generated : " + str(len(all_masks))
            for mask in all_masks:
                print mask.maskstring
            # "Game Over" when the masks generate more candidates than allowed.
            if total_generated_space > options['max_generated_space']:
                print "Game Over"
            else:
                print "Victory"
            print_masks_to_file(all_masks, total_lines, total_generated_space)
            logging.info("--- %s seconds ---" % (time.time() - start_time))
def main():
    """Pre-train the ranking discriminator on uniformly sampled negatives.

    Runs 200 epochs of mini-batch updates over DIS_TRAIN_FILE, tracking the
    best validation P@5 (ties broken by NDCG@5) and saving the best
    parameters to MLE_MODEL_BEST_FILE.  Finally reloads the best parameters
    and reports P@k, NDCG@k, MAP and MRR.

    Relies on module-level globals: DIS, generate_uniform, ut, BATCH_SIZE,
    the query_* data dicts, the metric helpers and the file-path constants.
    """
    discriminator = DIS(FEATURE_SIZE, HIDDEN_SIZE, WEIGHT_DECAY, D_LEARNING_RATE, param=None)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # grab GPU memory on demand
    sess = tf.Session(config=config)
    # NOTE(review): deprecated API; tf.global_variables_initializer() on
    # TF >= 0.12 (kept to match this file's TF version).
    sess.run(tf.initialize_all_variables())

    print('start random negative sampling with log ranking discriminator')
    generate_uniform(DIS_TRAIN_FILE)
    train_size = ut.file_len(DIS_TRAIN_FILE)

    p_best_val = 0.0
    ndcg_best_val = 0.0

    for epoch in range(200):
        index = 1
        # Loop condition replaces the original `while True` + explicit break.
        while index <= train_size:
            # The final batch may be short: clamp to the remaining lines.
            batch = min(BATCH_SIZE, train_size - index + 1)
            input_pos, input_neg = ut.get_batch_data(DIS_TRAIN_FILE, index, batch)
            index += BATCH_SIZE

            # Positives labeled 1.0, negatives 0.0, concatenated in order.
            pred_data = np.asarray(list(input_pos) + list(input_neg))
            pred_data_label = np.asarray([1.0] * len(input_pos) + [0.0] * len(input_neg))

            sess.run(discriminator.d_updates,
                     feed_dict={
                         discriminator.pred_data: pred_data,
                         discriminator.pred_data_label: pred_data_label
                     })

        # Validation: keep the checkpoint with the best P@5; on a P@5 tie,
        # prefer the better NDCG@5 (matching the original control flow,
        # which does not update ndcg_best_val on a strict P@5 improvement).
        p_5 = precision_at_k(sess, discriminator, query_pos_test, query_pos_train, query_url_feature, k=5)
        ndcg_5 = ndcg_at_k(sess, discriminator, query_pos_test, query_pos_train, query_url_feature, k=5)
        if p_5 > p_best_val:
            p_best_val = p_5
            discriminator.save_model(sess, MLE_MODEL_BEST_FILE)
            print("Best: ", " p@5 ", p_5, "ndcg@5 ", ndcg_5)
        elif p_5 == p_best_val and ndcg_5 > ndcg_best_val:
            ndcg_best_val = ndcg_5
            discriminator.save_model(sess, MLE_MODEL_BEST_FILE)
            print("Best: ", " p@5 ", p_5, "ndcg@5 ", ndcg_5)
    sess.close()

    # Reload the best parameters and evaluate on the test split.
    # Fix: open in binary mode via a context manager -- the original
    # `cPickle.load(open(...))` leaked the file handle.
    with open(MLE_MODEL_BEST_FILE, 'rb') as model_file:
        param_best = cPickle.load(model_file)
    assert param_best is not None
    discriminator_best = DIS(FEATURE_SIZE, HIDDEN_SIZE, WEIGHT_DECAY, D_LEARNING_RATE, param=param_best)
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())

    p_1_best = precision_at_k(sess, discriminator_best, query_pos_test, query_pos_train, query_url_feature, k=1)
    p_3_best = precision_at_k(sess, discriminator_best, query_pos_test, query_pos_train, query_url_feature, k=3)
    p_5_best = precision_at_k(sess, discriminator_best, query_pos_test, query_pos_train, query_url_feature, k=5)
    p_10_best = precision_at_k(sess, discriminator_best, query_pos_test, query_pos_train, query_url_feature, k=10)
    ndcg_1_best = ndcg_at_k(sess, discriminator_best, query_pos_test, query_pos_train, query_url_feature, k=1)
    ndcg_3_best = ndcg_at_k(sess, discriminator_best, query_pos_test, query_pos_train, query_url_feature, k=3)
    ndcg_5_best = ndcg_at_k(sess, discriminator_best, query_pos_test, query_pos_train, query_url_feature, k=5)
    ndcg_10_best = ndcg_at_k(sess, discriminator_best, query_pos_test, query_pos_train, query_url_feature, k=10)
    map_best = MAP(sess, discriminator_best, query_pos_test, query_pos_train, query_url_feature)
    mrr_best = MRR(sess, discriminator_best, query_pos_test, query_pos_train, query_url_feature)

    print("Best ", "p@1 ", p_1_best, "p@3 ", p_3_best, "p@5 ", p_5_best, "p@10 ", p_10_best)
    # Fix: last label said "p@10" but the value is ndcg@10.
    print("Best ", "ndcg@1 ", ndcg_1_best, "ndcg@3 ", ndcg_3_best, "ndcg@5 ", ndcg_5_best, "ndcg@10 ", ndcg_10_best)
    print("Best MAP ", map_best)
    print("Best MRR ", mrr_best)
def main():
    """Adversarially train a generator/discriminator pair (IRGAN-style CF).

    Loads pre-trained generator parameters from a pickle, alternates
    discriminator updates on generated negatives with policy-gradient
    generator updates (importance sampling), and checkpoints the best
    generator by validation P@5 (result[1]).
    """
    print("load model...")
    # param = pickle.load(open(workdir + "model_dns_ori.pkl"))  # .pkl is Python's format for persisting objects
    with open(workdir + "model_dns_ori.pkl", 'rb') as data_file:
        # encoding='bytes' lets Python 3 read a Python 2 pickle.
        param = pickle.load(data_file, encoding='bytes')
    # param = cPickle.load(open(workdir + "model_dns_ori.pkl"))
    # with open(workdir + "model_dns_ori.pkl",'rb') as data_file:
    #     param = pickle.load(data_file,encoding='bytes')
    print(param)
    generator = GEN(ITEM_NUM, USER_NUM, EMB_DIM, lamda=0.0 / BATCH_SIZE, param=param,
                    initdelta=INIT_DELTA, learning_rate=0.001)
    discriminator = DIS(ITEM_NUM, USER_NUM, EMB_DIM, lamda=0.1 / BATCH_SIZE, param=None,
                        initdelta=INIT_DELTA, learning_rate=0.001)
    # Session configuration: run parameters and GPU device options.
    config = tf.ConfigProto()
    # allow_growth: start with a small GPU memory allocation and grow on
    # demand; memory is not released, which can fragment.
    config.gpu_options.allow_growth = True
    # A Session executes the graph and stores the current variable values.
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    print("gen ", simple_test(sess, generator))
    print("dis ", simple_test(sess, discriminator))
    dis_log = open(workdir + 'dis_log.txt', 'w')
    gen_log = open(workdir + 'gen_log.txt', 'w')
    # minimax training
    best = 0.
    for epoch in range(15):
        if epoch >= 0:
            # Train D: every 5 d_epochs, regenerate the negative-sample file.
            for d_epoch in range(100):
                if d_epoch % 5 == 0:
                    generate_for_d(sess, generator, DIS_TRAIN_FILE)
                    train_size = ut.file_len(DIS_TRAIN_FILE)
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_user, input_item, input_label = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        # Final (short) batch: only the remaining lines.
                        input_user, input_item, input_label = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE
                    _ = sess.run(discriminator.d_updates,
                                 feed_dict={
                                     discriminator.u: input_user,
                                     discriminator.i: input_item,
                                     discriminator.label: input_label
                                 })
            # Train G
            for g_epoch in range(50):  # 50
                for u in user_pos_train:
                    sample_lambda = 0.2
                    pos = user_pos_train[u]
                    rating = sess.run(generator.all_logits, {generator.u: u})
                    exp_rating = np.exp(rating)
                    prob = exp_rating / np.sum(
                        exp_rating)  # prob is generator distribution p_\theta
                    # Mix the generator distribution with a uniform mass on
                    # the known positives to build the proposal pn.
                    pn = (1 - sample_lambda) * prob
                    pn[pos] += sample_lambda * 1.0 / len(pos)
                    # Now, pn is the Pn in importance sampling, prob is generator distribution p_\theta
                    sample = np.random.choice(np.arange(ITEM_NUM), 2 * len(pos), p=pn)
                    ###########################################################################
                    # Get reward and adapt it with importance sampling
                    ###########################################################################
                    reward = sess.run(discriminator.reward, {
                        discriminator.u: u,
                        discriminator.i: sample
                    })
                    reward = reward * prob[sample] / pn[sample]
                    ###########################################################################
                    # Update G
                    ###########################################################################
                    _ = sess.run(
                        generator.gan_updates, {
                            generator.u: u,
                            generator.i: sample,
                            generator.reward: reward
                        })
                # NOTE(review): evaluation nested inside the g_epoch loop to
                # match the reference implementation -- confirm placement.
                result = simple_test(sess, generator)
                print("epoch ", epoch, "gen: ", result)
                buf = '\t'.join([str(x) for x in result])
                gen_log.write(str(epoch) + '\t' + buf + '\n')
                gen_log.flush()
                # result[1] is presumably P@5 -- checkpoint on improvement.
                p_5 = result[1]
                if p_5 > best:
                    print('best: ', result)
                    best = p_5
                    generator.save_model(sess, "ml-100k/gan_generator.pkl")
    gen_log.close()
    dis_log.close()
unique_events.update(local_events) for event in eoi.event: if event in local_events: events_to_sources[safe_filename(event)].append(os.path.basename(os.path.normpath(csv))) return unique_events, events_to_sources csv = pd.read_csv(args.data, header=0, names=['timestamp', 'source', 'event'], dtype={'source': str, 'event': str}, parse_dates=[0], chunksize=args.chunksize) length = file_len(args.data) print('There are {} lines of data in {}.'.format(length, args.data)) t_start = time.time() for i, chunk in enumerate(csv): data = defaultdict(list) t0 = time.time() for row in chunk.itertuples(index=False): timestamp, source, event = row if pd.notnull(source): data[source].append((timestamp, event)) print('Demuxing chunk: {} seconds.'.format(pretty_float(time.time()-t0)), end=' ')
def main():
    """Adversarial training for learning-to-rank (IRGAN pointwise LTR).

    Both networks start from the same pre-trained parameters.  Each epoch
    (after the first) the generator produces negatives for the
    discriminator, the discriminator is trained and checkpointed on its
    best validation P@5/NDCG@5, then the generator is updated by policy
    gradient using the discriminator's reward.
    """
    print("load initial model ...")
    # NOTE(review): file handle from open() is never closed here.
    param_nn = cPickle.load(open(DIS_MODEL_FILE_NN))
    assert param_nn is not None
    discriminator = DIS(FEATURE_SIZE, HIDDEN_SIZE, D_WEIGHT_DECAY, D_LEARNING_RATE,
                        loss='log', param=param_nn)
    generator = GEN(FEATURE_SIZE, HIDDEN_SIZE, G_WEIGHT_DECAY, G_LEARNING_RATE,
                    param=param_nn)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # grow GPU memory on demand
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())
    print('start adversarial training')
    p_best_val = 0.0
    ndcg_best_val = 0.0
    for epoch in range(30):
        if epoch > 0:
            # G generate negative for D, then train D
            print('Training D ...')
            generate_for_d(sess, generator, DIS_TRAIN_FILE)
            train_size = ut.file_len(DIS_TRAIN_FILE)
            for d_epoch in range(30):
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_pos, input_neg = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        # Final (short) batch: only the remaining lines.
                        input_pos, input_neg = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE
                    _ = sess.run(discriminator.d_updates,
                                 feed_dict={
                                     discriminator.pos_data: input_pos,
                                     discriminator.neg_data: input_neg
                                 })
                # Validate after every d_epoch; keep best by P@5, tie-break
                # on NDCG@5.
                p_5 = precision_at_k(sess, discriminator, query_pos_test,
                                     query_pos_train, query_url_feature, k=5)
                ndcg_5 = ndcg_at_k(sess, discriminator, query_pos_test,
                                   query_pos_train, query_url_feature, k=5)
                if p_5 > p_best_val:
                    p_best_val = p_5
                    ndcg_best_val = ndcg_5
                    discriminator.save_model(sess, GAN_MODEL_BEST_FILE)
                    print("Best: ", "dis p@5 ", p_5, "dis ndcg@5 ", ndcg_5)
                elif p_5 == p_best_val:
                    if ndcg_5 > ndcg_best_val:
                        ndcg_best_val = ndcg_5
                        discriminator.save_model(sess, GAN_MODEL_BEST_FILE)
                        print("Best: ", "dis p@5 ", p_5, "dis ndcg@5 ", ndcg_5)
        # Train G
        print('Training G ...')
        for g_epoch in range(50):  # 50
            for query in query_pos_train.keys():
                pos_list = query_pos_train[query]
                # candidate_list = list(set(query_url_feature[query].keys()) - set(pos_list))
                # All documents of the query (positives included) are candidates.
                candidate_list = list(query_url_feature[query].keys())
                if len(candidate_list) <= 0:
                    continue
                candidate_list_feature = [
                    query_url_feature[query][url] for url in candidate_list
                ]
                candidate_list_feature = np.asarray(candidate_list_feature)
                candidate_list_score = sess.run(
                    generator.pred_score,
                    {generator.pred_data: candidate_list_feature})
                # softmax for all
                exp_rating = np.exp(candidate_list_score)
                prob = exp_rating / np.sum(exp_rating)
                # Sample as many negatives as there are positives, following
                # the generator's softmax distribution.
                neg_index = np.random.choice(np.arange(len(candidate_list)),
                                             size=[len(pos_list)], p=prob)
                neg_list = np.array(candidate_list)[neg_index]
                pos_list_feature = [
                    query_url_feature[query][url] for url in pos_list
                ]
                neg_list_feature = [
                    query_url_feature[query][url] for url in neg_list
                ]
                neg_index = np.asarray(neg_index)
                # every negative samples have a reward
                neg_reward = sess.run(discriminator.reward,
                                      feed_dict={
                                          discriminator.pos_data: pos_list_feature,
                                          discriminator.neg_data: neg_list_feature
                                      })
                # Method 1: softmax before gather
                _ = sess.run(generator.gan_updates,
                             feed_dict={
                                 generator.pred_data: candidate_list_feature,
                                 generator.sample_index: neg_index,
                                 generator.reward: neg_reward
                             })
    print('Best p@5: ', p_best_val, 'Best ndcg@5: ', ndcg_best_val)
def main():
    # Adversarial training of a co-author recommendation GAN (Python 2).
    # Each epoch: D is trained on real vs. generator-faked co-authors (with
    # weight clipping via clip_D, WGAN-style -- confirm), then G is updated
    # by policy gradient using D's reward.  Both networks checkpoint on
    # their best validation result[2] (presumably P@5 -- verify).
    i_file_output = 0
    print "load model..."
    generator = GEN(AUTHER_NUM, EMB_DIM, lamda=0.0 / BATCH_SIZE, param=None,
                    initdelta=INIT_DELTA, learning_rate=FLAGS.init_lr_gen,
                    lr_decay_step=FLAGS.lr_decay_iter_gen)
    discriminator = DIS(AUTHER_NUM, EMB_DIM, lamda=0.01 / BATCH_SIZE, param=None,
                        initdelta=INIT_DELTA, learning_rate=FLAGS.init_lr_dis,
                        lr_decay_step=FLAGS.lr_decay_iter_dis)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # grow GPU memory on demand
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    dis_log = open(outputdir + 'dis_log.txt', 'w')
    gen_log = open(outputdir + 'gen_log.txt', 'w')
    # minimax training
    best_gen = 0.
    best_dis = 0.
    draw_count_D = 0
    draw_count_G = 0
    for epoch in range(FLAGS.epochs):  # 5000
        if epoch >= 0:
            # Train D
            generate_for_d(sess, generator, DIS_TRAIN_FILE)
            train_size = ut.file_len(DIS_TRAIN_FILE)  # generate file length
            for d_epoch in range(5):
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_auther, input_coauther_real, input_coauther_fake = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        # Final (short) batch: only the remaining lines.
                        input_auther, input_coauther_real, input_coauther_fake = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE
                    # d_updates trains D; clip_D clamps its weights.
                    _ = sess.run(
                        [discriminator.d_updates, discriminator.clip_D],
                        feed_dict={
                            discriminator.auther: input_auther,
                            discriminator.co_real: input_coauther_real,
                            discriminator.co_fake: input_coauther_fake
                        })
            result = simple_test(sess, discriminator)
            buf = '\t'.join([str(x) for x in result])
            dis_log.write(str(epoch) + '\t' + buf + '\n')
            dis_log.flush()
            p_5 = result[2]
            if p_5 > best_dis:
                print 'best_dis: ', epoch, result
                best_dis = p_5
                discriminator.save_model(sess, outputdir + "gan_discriminator.pkl")
            # Train G
            for g_epoch in range(1):
                for u in auther_pos_train:
                    sample_lambda = 0.2
                    pos = list(set(auther_pos_train[u]))
                    sample_times = 128
                    # G's softmax over all authors for author u.
                    rating = sess.run(generator.softmax_logits, {generator.auther: [u]})
                    prob = np.reshape(rating, [-1])
                    sample = np.random.choice(np.arange(AUTHER_NUM), size=sample_times, p=prob)
                    ###########################################################################
                    # Get reward and adapt it with importance sampling
                    ###########################################################################
                    reward = sess.run(
                        discriminator.reward, {
                            discriminator.auther: np.tile(u, (sample_times)),
                            discriminator.co_fake: sample
                        })
                    ###########################################################################
                    # Update G
                    ###########################################################################
                    _ = sess.run(
                        generator.gan_updates, {
                            generator.auther: np.tile(u, (sample_times)),
                            generator.co: sample,
                            generator.reward: reward
                        })
            result = simple_test(sess, generator)
            buf = '\t'.join([str(x) for x in result])
            gen_log.write(str(epoch) + '\t' + buf + '\n')
            gen_log.flush()
            p_5 = result[2]
            if p_5 > best_gen:
                print 'best_gen: ', epoch, result
                best_gen = p_5
                generator.save_model(sess, outputdir + "gan_generator.pkl")
                draw_count_G += 1
    gen_log.close()
    dis_log.close()
def main():
    """PyTorch port of the IRGAN CF minimax training loop.

    Alternates discriminator training on generated negatives with
    policy-gradient generator updates (importance sampling).  Uses the
    module-level ``generator``, ``discriminator``, ``optimizer_D``,
    ``optimizer_G``, ``criterion``, ``lamda``, the ``D_*_embeddings``
    regularization tensors, and ``device`` -- all defined elsewhere in
    this file.
    """
    best = 0.  # NOTE(review): tracked nowhere below in this block -- verify
    gen_log = open(workdir + 'gen_log.txt', 'w')
    for epoch in range(15):
        if epoch >= 0:
            # Train D: every 5 d_epochs, regenerate the negative-sample file.
            for d_epoch in range(100):
                if d_epoch % 5 == 0:
                    generate_for_d(generator, DIS_TRAIN_FILE)
                    train_size = ut.file_len(DIS_TRAIN_FILE)
                index = 1
                while True:
                    if index > train_size:
                        break
                    if index + BATCH_SIZE <= train_size + 1:
                        input_user, input_item, input_label = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, BATCH_SIZE)
                    else:
                        # Final (short) batch: only the remaining lines.
                        input_user, input_item, input_label = ut.get_batch_data(
                            DIS_TRAIN_FILE, index, train_size - index + 1)
                    index += BATCH_SIZE
                    # pre_logits = discriminator.module.pre_logits(input_user, input_item)
                    # D loss plus L2-style regularization on D's parameters.
                    D_loss = discriminator(input_user, input_item, torch.tensor(input_label)) \
                        + lamda * (criterion(D_user_embeddings) + criterion(D_item_embeddings)
                                   + criterion(D_item_bias))
                    optimizer_D.zero_grad()
                    D_loss.backward()
                    optimizer_D.step()
                print("\r[D Epoch %d/%d] [loss: %f]" % (d_epoch, 100, D_loss.item()))
            for g_epoch in range(50):
                for u in user_pos_train:
                    sample_lambda = 0.2
                    pos = user_pos_train[u]
                    rating = generator.module.all_logits(u)
                    # detach_ severs the graph in place before moving to numpy.
                    rating = rating.detach_().cpu().numpy()
                    exp_rating = np.exp(rating)
                    prob = exp_rating / np.sum(
                        exp_rating)  # prob is generator distribution p_\theta
                    # Proposal pn: mix of p_theta and uniform mass on positives.
                    pn = (1 - sample_lambda) * prob
                    pn[pos] += sample_lambda * 1.0 / len(pos)
                    # Now, pn is the Pn in importance sampling, prob is generator distribution p_\theta
                    sample = np.random.choice(np.arange(ITEM_NUM), 2 * len(pos), p=pn)
                    ###########################################################################
                    # Get reward and adapt it with importance sampling
                    ###########################################################################
                    reward = discriminator.module.get_reward(u, sample)
                    reward = reward.detach_().cpu().numpy() * prob[sample] / pn[sample]
                    ###########################################################################
                    # Update G
                    ###########################################################################
                    # NOTE(review): the `with` block extent was reconstructed;
                    # confirm whether the optimizer steps belong inside it.
                    with torch.cuda.device(device[0]):
                        G_loss = generator(u, torch.tensor(sample), torch.tensor(reward))
                        optimizer_G.zero_grad()
                        G_loss.backward()
                        optimizer_G.step()
                print("\r[G Epoch %d/%d] [loss: %f]" % (g_epoch, 50, G_loss.item()))
            result = simple_test(generator)
            print("epoch ", epoch, "gen: ", result)
            buf = '\t'.join([str(x) for x in result])
            gen_log.write(str(epoch) + '\t' + buf + '\n')
            gen_log.flush()
    gen_log.close()