import numpy as np


def _get_rewards_01(config, data_loader, x_fake_for_rewards, eof_code, sess,
                    first, all_bleu_metrics):
    """Full-sequence Monte-Carlo BLEU rewards for policy-gradient training.

    `Bleu`, `samples_no_padding`, and `_compute_rl_rewards_01` are defined
    elsewhere in this repo.
    """
    batch_size = config['batch_size']
    gan_type = config['gan_type']      # unused here
    seq_len = config['seq_len']
    vocab_size = config['vocab_size']  # unused here
    rl_bleu_ref_count = data_loader.num_batch * batch_size  # all of the training set
    rl_mc_samples = 1
    gamma_discount = 0.9
    rewards = np.zeros((batch_size, seq_len), np.float32)

    # On the first call, build one BLEU-n metric per prefix length t, each
    # scored against all training references truncated to t tokens; cache the
    # metrics in `all_bleu_metrics` and reuse them on later calls.
    if first:
        train_refs = data_loader.get_as_lol_no_padding()
        # train_refs = data_loader.random_some(rl_bleu_ref_count, seq_len + 1)
        bleu_metric_2 = [Bleu.from_references_indices(2, [l[:t] for l in train_refs])
                         for t in range(2, seq_len + 1)]
        bleu_metric_3 = [Bleu.from_references_indices(3, [l[:t] for l in train_refs])
                         for t in range(3, seq_len + 1)]
        bleu_metric_4 = [Bleu.from_references_indices(4, [l[:t] for l in train_refs])
                         for t in range(4, seq_len + 1)]
        bleu_metric_5 = [Bleu.from_references_indices(5, [l[:t] for l in train_refs])
                         for t in range(5, seq_len + 1)]
        all_bleu_metrics = [bleu_metric_2, bleu_metric_3,
                            bleu_metric_4, bleu_metric_5]
        first = False

    # Average per-token rewards over rl_mc_samples independent generator draws.
    for _ in range(rl_mc_samples):
        samples_for_rewards = sess.run(x_fake_for_rewards)
        gen_seq_list = samples_no_padding(samples_for_rewards, eof_code)
        for b in range(len(gen_seq_list)):
            rewards[b, :] += _compute_rl_rewards_01(
                gen_seq_list[b], all_bleu_metrics, gamma_discount, seq_len)
    rewards = rewards / (1.0 * rl_mc_samples)
    return samples_for_rewards, rewards, first, all_bleu_metrics
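
# A minimal usage sketch (not from the original source): how _get_rewards_01
# could drive a REINFORCE-style generator update. `num_adv_steps`, `x_ph`,
# `rewards_ph`, and `g_train_op` are hypothetical placeholders for the number
# of adversarial steps, the generator's input, its per-token reward feed, and
# its policy-gradient train op; only the call signature of _get_rewards_01
# comes from the code above.
#
#   first, all_bleu_metrics = True, None
#   for _ in range(num_adv_steps):
#       samples, rewards, first, all_bleu_metrics = _get_rewards_01(
#           config, data_loader, x_fake_for_rewards, eof_code, sess,
#           first, all_bleu_metrics)
#       sess.run(g_train_op, feed_dict={x_ph: samples, rewards_ph: rewards})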

def _get_rewards_02(config, data_loader, x_fake_for_rewards, given_num, r_x,
                    r_gen_x, r_gen_x_sample, eof_code, sess, first,
                    all_bleu_metrics):
    """SeqGAN-style rollout rewards, with BLEU in place of a discriminator.

    For every prefix length `given_num_i`, the rollout op `r_gen_x` completes
    the prefix and the completion is scored by `_compute_rl_rewards_02`; the
    full sample itself supplies the reward for the last token. Adapted from a
    discriminator-based reward loop (`discriminator.ypred_for_auc`), which
    BLEU now replaces. `r_gen_x_sample` is kept for interface compatibility
    but is unused here.
    """
    batch_size = config['batch_size']
    gan_type = config['gan_type']      # unused here
    seq_len = config['seq_len']
    vocab_size = config['vocab_size']  # unused here
    rl_bleu_ref_count = data_loader.num_batch * batch_size  # all of the training set
    rl_mc_samples = config['mc_samples']
    gamma_discount = 0.5

    # On the first call, build one BLEU-n metric (n = 2..5) over the full
    # unpadded training references and cache them in `all_bleu_metrics`.
    if first:
        train_refs = data_loader.get_as_lol_no_padding()
        # train_refs = data_loader.random_some(rl_bleu_ref_count, seq_len + 1)
        all_bleu_metrics = [Bleu.from_references_indices(n, train_refs)
                            for n in (2, 3, 4, 5)]
        first = False

    rewards = list()
    samples_for_rewards = sess.run(x_fake_for_rewards)
    for i in range(rl_mc_samples):
        # Rewards for tokens 1..seq_len-1: roll out a completion of each prefix.
        for given_num_i in range(1, seq_len):
            feed = {r_x: samples_for_rewards, given_num: given_num_i}
            roll_out_samples = sess.run(r_gen_x, feed)
            ypred = _compute_rl_rewards_02(roll_out_samples, all_bleu_metrics,
                                           gamma_discount, eof_code)
            if i == 0:
                rewards.append(ypred)
            else:
                rewards[given_num_i - 1] += ypred
        # Reward for the last token: score the full sample itself.
        ypred = _compute_rl_rewards_02(samples_for_rewards, all_bleu_metrics,
                                       gamma_discount, eof_code)
        if i == 0:
            rewards.append(ypred)
        else:
            rewards[len(samples_for_rewards[0]) - 1] += ypred

    # Average over the MC samples; result is batch_size x seq_len.
    reward_res = np.transpose(np.array(rewards)) / (1.0 * rl_mc_samples)
    if config['pg_baseline']:
        reward_res -= config['pg_baseline_val']  # e.g. 2.0 for the EMNLP data
    return samples_for_rewards, reward_res, first, all_bleu_metrics
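
# A minimal usage sketch (assumptions labeled): `r_x`, `given_num`, and
# `r_gen_x` are assumed to come from a SeqGAN-style rollout copy of the
# generator; `x_ph`, `rewards_ph`, and `g_train_op` are hypothetical, as in
# the sketch after _get_rewards_01. Note that `rewards` collects one vector
# per prefix length plus one for the full sample, so the transpose yields a
# batch_size x seq_len matrix whose column t is the reward for token t+1.
#
#   first, all_bleu_metrics = True, None
#   samples, reward_res, first, all_bleu_metrics = _get_rewards_02(
#       config, data_loader, x_fake_for_rewards, given_num, r_x, r_gen_x,
#       r_gen_x_sample, eof_code, sess, first, all_bleu_metrics)
#   assert reward_res.shape == (config['batch_size'], config['seq_len'])
#   sess.run(g_train_op, feed_dict={x_ph: samples, rewards_ph: reward_res})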