class MGAIL(object):
    """TensorFlow 1.x graph for model-based adversarial imitation learning.

    The constructor wires together a forward (dynamics) model, a
    discriminator and a policy, and hands each of them a training objective.
    When ``environment.use_airl`` is set, the discriminator additionally
    receives the previous state, the done flags and the policy's action
    log-probability.
    """

    def __init__(self, environment):
        """Build placeholders, networks, replay buffers and objectives.

        Args:
            environment: Project configuration/environment object.  It is
                read for sizes, learning rates and mode flags, its ``step``
                method is called inside the policy roll-out, and its
                ``sigma`` attribute is overwritten here.
        """
        self.env = environment

        # Create placeholders for all the inputs
        self.states_ = tf.placeholder(
            "float", shape=(None, ) + self.env.state_size,
            name='states_')  # Batch x State, previous state
        self.states = tf.placeholder(
            "float", shape=(None, ) + self.env.state_size,
            name='states')  # Batch x State, current_state
        self.actions = tf.placeholder(
            "float", shape=(None, self.env.action_size),
            name='action')  # Batch x Action
        self.label = tf.placeholder("float", shape=(None, 1), name='label')
        self.gamma = tf.placeholder("float", shape=(), name='gamma')
        self.temp = tf.placeholder("float", shape=(), name='temperature')
        self.noise = tf.placeholder("float", shape=(), name='noise_flag')
        self.do_keep_prob = tf.placeholder("float", shape=(),
                                           name='do_keep_prob')
        if self.env.use_airl:
            self.done_ph = tf.placeholder(name="dones", shape=(None, ),
                                          dtype=tf.float32)

        # Create MGAIL blocks
        self.forward_model = ForwardModel(
            state_size=self.env.state_size[0]
            if self.env.obs_mode == 'state' else self.env.encoder_feat_size,
            action_size=self.env.action_size,
            encoding_size=self.env.fm_size,
            lr=self.env.fm_lr,
            forward_model_type=self.env.forward_model_type,
            obs_mode=self.env.obs_mode,
            use_scale_dot_product=self.env.use_scale_dot_product,
            use_skip_connection=self.env.use_skip_connection,
            use_dropout=self.env.use_dropout)

        # Width of the state features fed to the policy / discriminator.
        # 1024 is the hard-coded pixel-encoder width used by the original
        # code (self.env.encoder_feat_size[0] was left commented out there).
        if self.env.obs_mode == 'pixel':
            policy_input_feat = 1024
        else:
            policy_input_feat = self.env.state_size[0]
        # The discriminator also sees the action unless it is state-only.
        if self.env.state_only:
            feat_in_dim = policy_input_feat
        else:
            feat_in_dim = policy_input_feat + self.env.action_size

        self.discriminator = Discriminator(
            in_dim=feat_in_dim,
            out_dim=self.env.disc_out_dim,
            size=self.env.d_size,
            lr=self.env.d_lr,
            do_keep_prob=self.do_keep_prob,
            weight_decay=self.env.weight_decay,
            use_airl=self.env.use_airl,
            phi_hidden_size=self.env.phi_size,
            state_only=self.env.state_only,
        )

        self.policy = Policy(in_dim=policy_input_feat,
                             out_dim=self.env.action_size,
                             size=self.env.p_size,
                             lr=self.env.p_lr,
                             do_keep_prob=self.do_keep_prob,
                             n_accum_steps=self.env.policy_accum_steps,
                             weight_decay=self.env.weight_decay)

        # Create experience buffers
        self.er_agent = ER(
            memory_size=self.env.er_agent_size,
            state_dim=self.env.state_size,
            action_dim=self.env.action_size,
            reward_dim=1,  # stub connection
            qpos_dim=self.env.qpos_size,
            qvel_dim=self.env.qvel_size,
            batch_size=self.env.batch_size,
            history_length=1)

        self.er_expert = common.load_er(
            fname=os.path.join(self.env.run_dir, self.env.expert_data),
            batch_size=self.env.batch_size,
            history_length=1,
            traj_length=2)

        self.env.sigma = self.er_expert.actions_std / self.env.noise_intensity

        if self.env.obs_mode == 'pixel':
            current_states = ops.preprocess(self.states, bits=8)
            current_states_feat = ops.encoder(current_states,
                                              reuse=tf.AUTO_REUSE)
            prev_states = ops.preprocess(self.states_, bits=8)
            prev_states_feat = ops.encoder(prev_states, reuse=tf.AUTO_REUSE)
        else:
            # Normalize the inputs
            prev_states = common.normalize(self.states_,
                                           self.er_expert.states_mean,
                                           self.er_expert.states_std)
            current_states = common.normalize(self.states,
                                              self.er_expert.states_mean,
                                              self.er_expert.states_std)
            prev_states_feat = prev_states
            current_states_feat = current_states

        if self.env.continuous_actions:
            actions = common.normalize(self.actions,
                                       self.er_expert.actions_mean,
                                       self.er_expert.actions_std)
        else:
            actions = self.actions

        # 1. Forward Model
        initial_gru_state = np.ones((1, self.forward_model.encoding_size))
        forward_model_prediction, _, divergence_loss = \
            self.forward_model.forward(
                [prev_states_feat, actions, initial_gru_state])
        if self.env.obs_mode == 'pixel':
            forward_model_prediction = ops.decoder(
                forward_model_prediction,
                data_shape=self.env.state_size,
                reuse=tf.AUTO_REUSE)
            self.forward_model_prediction = ops.postprocess(
                forward_model_prediction, bits=8, dtype=tf.uint8)
        else:
            self.forward_model_prediction = forward_model_prediction

        forward_model_loss = tf.reduce_mean(
            tf.square(current_states - forward_model_prediction)
        ) + self.env.forward_model_lambda * tf.reduce_mean(divergence_loss)
        self.forward_model.train(objective=forward_model_loss)

        if self.env.use_airl:
            # 1.1 action log prob
            logits = self.policy.forward(current_states_feat)
            if self.env.continuous_actions:
                mean, logstd = logits, tf.log(tf.ones_like(logits))
                std = tf.exp(logstd)

                n_elts = tf.cast(tf.reduce_prod(mean.shape[1:]),
                                 tf.float32)  # first dimension is batch size
                # 0.5 rather than ``1 / 2`` so the term is not silently
                # zeroed under integer division.
                log_normalizer = n_elts / 2. * (np.log(2 * np.pi).astype(
                    np.float32)) + 0.5 * tf.reduce_sum(logstd, axis=1)

                # Diagonal Gaussian action probability, for every action
                # NOTE(review): this divides by ``2 * std`` and halves
                # ``sum(logstd)``, which matches a Gaussian log-density only
                # if ``std`` is really a variance -- confirm the intent.
                action_logprob = -tf.reduce_sum(
                    tf.square(actions - mean) /
                    (2 * std), axis=1) - log_normalizer
            else:
                # Override since the implementation of
                # tfp.RelaxedOneHotCategorical yields positive values.
                if actions.shape[1:] != logits.shape[1:]:
                    actions = tf.cast(actions, tf.int8)
                    values = tf.one_hot(actions,
                                        logits.shape.as_list()[-1],
                                        dtype=tf.float32)
                    assert values.shape == logits.shape, (values.shape,
                                                          logits.shape)
                else:
                    values = actions

                # [0]'s implementation (see line below) seems to be an
                # approximation to the actual Gumbel Softmax density.
                # TODO: to confirm 'action' or 'value'
                action_logprob = -tf.reduce_sum(
                    -values * tf.nn.log_softmax(logits, axis=-1), axis=-1)

            # 2. Discriminator
            self.discriminator.airl_entropy_weight = \
                self.env.airl_entropy_weight
            labels = self.label  # 1 for expert, 0 for policy
            d, self.disc_shaped_reward_output, self.disc_reward = \
                self.discriminator.forward(
                    state=current_states_feat,
                    action=actions,
                    prev_state=prev_states_feat,
                    done_inp=self.done_ph,
                    log_policy_act_prob=action_logprob,
                )

            # 2.1 0-1 accuracy
            # NOTE(review): ``labels`` has a single column here, so its
            # argmax over axis 1 is always 0 -- this metric may be
            # uninformative in AIRL mode; confirm against the shape of ``d``.
            correct_predictions = tf.equal(tf.argmax(d, 1),
                                           tf.argmax(labels, 1))
            self.discriminator.acc = tf.reduce_mean(
                tf.cast(correct_predictions, "float"))

            # 2.2 prediction
            d_cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=labels,
                logits=d,
                name="disc_loss",
            )
            # Generator reward, for reference:
            #   r(s, a) = log D(s, a) - log(1 - D(s, a))
            # which simplifies to
            #   r(s, a) = f(s, a) - log pi(a | s),
            # i.e. an entropy-regularized objective.
        else:
            # 2. Discriminator
            labels = tf.concat([1 - self.label, self.label], 1)
            d, _, _ = self.discriminator.forward(state=current_states_feat,
                                                 action=actions)

            # 2.1 0-1 accuracy
            correct_predictions = tf.equal(tf.argmax(d, 1),
                                           tf.argmax(labels, 1))
            self.discriminator.acc = tf.reduce_mean(
                tf.cast(correct_predictions, "float"))

            # 2.2 prediction
            d_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=d, labels=labels)

        # cost sensitive weighting (weight true=expert, predict=agent mistakes)
        is_expert = tf.to_float(tf.equal(tf.squeeze(self.label), 1.))
        is_agent = tf.to_float(tf.equal(tf.squeeze(self.label), 0.))
        d_loss_weighted = (
            self.env.cost_sensitive_weight *
            tf.multiply(is_expert, d_cross_entropy) +
            tf.multiply(is_agent, d_cross_entropy))

        discriminator_loss = tf.reduce_mean(d_loss_weighted)
        self.discriminator.train(objective=discriminator_loss)

        # 3. Collect experience
        mu = self.policy.forward(current_states_feat)
        if self.env.continuous_actions:
            a = common.denormalize(mu, self.er_expert.actions_mean,
                                   self.er_expert.actions_std)
            eta = tf.random_normal(shape=tf.shape(a), stddev=self.env.sigma)
            self.action_test = tf.squeeze(a + self.noise * eta)
        else:
            a = common.gumbel_softmax(logits=mu, temperature=self.temp)
            self.action_test = tf.argmax(a, dimension=1)

        # 4.3 AL
        def policy_loop(current_state_policy_update, t, total_cost,
                        total_trans_err, env_term_sig, prev_state):
            """One roll-out step: act, score the action, predict next state."""
            if self.env.obs_mode == 'pixel':
                current_state_feat_policy_update = ops.encoder(
                    current_state_policy_update, reuse=True)
                prev_state_feat_policy_update = ops.encoder(prev_state,
                                                            reuse=True)
            else:
                current_state_feat_policy_update = current_state_policy_update
                prev_state_feat_policy_update = prev_state

            mu = self.policy.forward(current_state_feat_policy_update,
                                     reuse=True)

            if self.env.continuous_actions:
                eta = self.env.sigma * tf.random_normal(shape=tf.shape(mu))
                action = mu + eta

                if self.env.use_airl:
                    mean, logstd = mu, tf.log(
                        tf.ones_like(mu) * self.env.sigma)
                    std = tf.exp(logstd)

                    n_elts = tf.cast(
                        tf.reduce_prod(mean.shape[1:]),
                        tf.float32)  # first dimension is batch size
                    log_normalizer = n_elts / 2. * (np.log(
                        2 * np.pi).astype(np.float32)) + 0.5 * tf.reduce_sum(
                            logstd, axis=1)

                    # Diagonal Gaussian action probability, for every action
                    # NOTE(review): same std-vs-variance question as in the
                    # discriminator-training branch above.
                    action_logprob = -tf.reduce_sum(
                        tf.square(action - mean) /
                        (2 * std), axis=1) - log_normalizer
            else:
                action = common.gumbel_softmax_sample(
                    logits=mu, temperature=self.temp)

                if self.env.use_airl:
                    # The log-probability must be taken under the logits
                    # produced for THIS roll-out state (``mu``).  The
                    # previous code read the enclosing scope's ``logits``,
                    # i.e. the policy output for the placeholder batch.
                    if action.shape[1:] != mu.shape[1:]:
                        action_ids = tf.cast(action, tf.int8)
                        values = tf.one_hot(action_ids,
                                            mu.shape.as_list()[-1],
                                            dtype=tf.float32)
                        assert values.shape == mu.shape, (values.shape,
                                                          mu.shape)
                    else:
                        values = action

                    # [0]'s implementation (see line below) seems to be an
                    # approximation to the actual Gumbel Softmax density.
                    # TODO: to confirm 'action' or 'value'
                    action_logprob = -tf.reduce_sum(
                        -values * tf.nn.log_softmax(mu, axis=-1), axis=-1)

            # minimize the gap between agent logit (d[:,0]) and expert logit
            # (d[:,1])
            if self.env.use_airl:
                d, shaped_reward_output, reward = self.discriminator.forward(
                    state=current_state_feat_policy_update,
                    action=action,
                    prev_state=prev_state_feat_policy_update,
                    done_inp=tf.cast(env_term_sig, tf.float32),
                    log_policy_act_prob=action_logprob,
                    reuse=True)

                transfer_algs = ('mairlTransfer', 'mairlImit4Transfer')
                if self.env.alg in transfer_algs:
                    reward_for_updating_policy = reward
                else:  # 'mairlImit'
                    reward_for_updating_policy = shaped_reward_output

                if self.env.train_mode and self.env.alg not in transfer_algs:
                    ent_bonus = (-self.env.airl_entropy_weight *
                                 tf.stop_gradient(action_logprob))
                    policy_reward = reward_for_updating_policy + ent_bonus
                else:
                    policy_reward = reward_for_updating_policy

                cost = tf.reduce_mean(-policy_reward) * self.env.policy_al_w
            else:
                d, _, _ = self.discriminator.forward(
                    state=current_state_feat_policy_update,
                    action=action,
                    reuse=True)
                cost = self.al_loss(d)

            # add step cost
            total_cost += tf.multiply(tf.pow(self.gamma, t), cost)

            # get action
            if self.env.continuous_actions:
                a_sim = common.denormalize(action,
                                           self.er_expert.actions_mean,
                                           self.er_expert.actions_std)
            else:
                a_sim = tf.argmax(action, dimension=1)

            # get next state
            state_env, _, env_term_sig = self.env.step(
                a_sim, mode='tensorflow')[:3]
            state_e = common.normalize(state_env,
                                       self.er_expert.states_mean,
                                       self.er_expert.states_std)
            state_e = tf.stop_gradient(state_e)

            state_a, _, divergence_loss_a = self.forward_model.forward(
                [current_state_feat_policy_update, action, initial_gru_state],
                reuse=True)
            if self.env.obs_mode == 'pixel':
                state_a = ops.decoder(state_a,
                                      data_shape=self.env.state_size,
                                      reuse=True)

            # The original guarded this with ``if True:`` and an unreachable
            # alternative that kept ``state_a`` as the next state.
            state, nu = common.re_parametrization(state_e=state_e,
                                                  state_a=state_a)

            total_trans_err += tf.reduce_mean(abs(nu))
            t += 1

            if self.env.obs_mode == 'pixel':
                state = tf.slice(state, [0, 0, 0, 0], [1, -1, -1, -1])
            return (state, t, total_cost, total_trans_err, env_term_sig,
                    current_state_policy_update)

        def policy_stop_condition(current_state_policy_update, t, cost,
                                  trans_err, env_term_sig, prev_state):
            """Continue while not done, under step and error budgets."""
            cond = tf.logical_not(
                env_term_sig)  # not done: env_term_sig = False
            cond = tf.logical_and(cond, t < self.env.n_steps_train)
            cond = tf.logical_and(
                cond, trans_err < self.env.total_trans_err_allowed)
            return cond

        # Roll out from the first state of the fed batch only.
        if self.env.obs_mode == 'pixel':
            state_0 = tf.slice(current_states, [0, 0, 0, 0], [1, -1, -1, -1])
        else:
            state_0 = tf.slice(current_states, [0, 0], [1, -1])

        # Loop variables: state, t, total_cost, total_trans_err, done, prev.
        loop_outputs = tf.while_loop(policy_stop_condition, policy_loop,
                                     [state_0, 0., 0., 0., False, state_0])

        self.policy.train(objective=loop_outputs[2])

    def al_loss(self, d):
        """Return the weighted cross-entropy of ``d`` against the expert class.

        ``d`` is split into two columns along axis 1 (agent logit, expert
        logit); the target puts all mass on the expert column.  The mean
        loss is scaled by ``self.env.policy_al_w``.
        """
        logit_agent, logit_expert = tf.split(axis=1,
                                             num_or_size_splits=2,
                                             value=d)

        # Cross entropy loss
        labels = tf.concat(
            [tf.zeros_like(logit_agent),
             tf.ones_like(logit_expert)], 1)
        d_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=d, labels=labels)

        loss = tf.reduce_mean(d_cross_entropy)

        return loss * self.env.policy_al_w
class MainModel(nn.Module):
    """Couples a sequence generator with a discriminator and their losses.

    ``forward`` draws ``K`` generated roll-outs, scores real and generated
    data with the discriminator, and returns both training losses.
    """

    def __init__(self, marker_num, neighbor_num, embed_dim, d_model, d_inner, d_q, d_k, d_v, n_head, candi_size,
                 max_time, beta, cuda_id, K, discount, regular, dropout=0.1):
        """Create the sub-networks, the shared marker embeddings and the loss objects."""
        super().__init__()

        # Plain hyper-parameters kept for later use.
        self.K = K
        self.marker_num = marker_num
        self.discount = discount
        self.regular = regular

        # Sub-networks (kept in the original registration order).
        self.generator = Generator(
            marker_num, neighbor_num, embed_dim, d_model, d_inner,
            d_q, d_k, d_v, n_head, candi_size, max_time, beta, cuda_id,
            dropout=dropout,
        )
        self.discriminator = Discriminator(
            marker_num, embed_dim, d_model, d_inner,
            d_q, d_k, d_v, n_head, beta, cuda_id, K,
            dropout=dropout,
        )

        # Embedding table handed to both generator and discriminator.
        self.marker_embeddings = nn.Parameter(torch.ones(marker_num, d_model))

        self.d_loss_func = D_Loss(K)
        self.g_loss_func = PolicyGradient(discount, regular, K, cuda_id)

    def forward(self, marker_data, time_data, mask_data):
        """Run ``K`` generator roll-outs and return ``(d_loss, g_loss)``."""
        rollouts = [
            self.generator.forward(marker_data, time_data, mask_data,
                                   self.marker_embeddings)
            for _ in range(self.K)
        ]

        gen_markers = [sample[0] for sample in rollouts]
        # Generated times are detached before they reach the discriminator.
        gen_times = [sample[1].detach() for sample in rollouts]
        gen_masks = [sample[2] for sample in rollouts]
        gen_p_neighbor = [sample[3] for sample in rollouts]
        gen_p_sample = [sample[4] for sample in rollouts]

        scores = self.discriminator.forward(
            marker_data, time_data, mask_data,
            gen_markers, gen_times, gen_masks,
            self.marker_embeddings,
        )
        true_reward, true_masks, bogus_reward, bogus_masks = scores

        d_loss = self.d_loss_func.forward(true_reward, true_masks,
                                          bogus_reward, bogus_masks)
        g_loss = self.g_loss_func.forward(gen_p_neighbor, gen_p_sample,
                                          bogus_reward, bogus_masks)
        return d_loss, g_loss
class DCGAN:
    """Unconditional DCGAN trained on MNIST digits (TensorFlow 1.x graph mode)."""

    def __init__(self, img_shape, epochs=50000, lr_gen=0.0002, lr_dc=0.0002, z_shape=100, batch_size=100, beta1=0.5, epochs_for_sample=50):
        """Load MNIST and build the generator/discriminator losses and train ops.

        Args:
            img_shape: ``(rows, cols, channels)`` of the images fed to the
                discriminator.  MNIST digits are resized to ``rows x cols``.
            epochs: Number of training iterations (one batch each).
            lr_gen: Adam learning rate for the generator.
            lr_dc: Adam learning rate for the discriminator.
            z_shape: Length of the latent noise vector.
            batch_size: Number of images per training batch.
            beta1: Adam ``beta1`` used by both optimizers.
            epochs_for_sample: Sample images and log losses every this many
                iterations.
        """
        self.rows, self.cols, self.channels = img_shape
        self.batch_size = batch_size
        self.epochs = epochs
        self.z_shape = z_shape
        self.epochs_for_sample = epochs_for_sample
        self.generator = Generator(img_shape)
        self.discriminator = Discriminator(img_shape)

        # Fixed latent batch so successive sample grids are comparable.
        # 25 = 5 x 5 grid drawn by generate_sample().  (The original read an
        # undefined name ``fixed_z``.)
        self.fixed_z = np.random.uniform(-1, 1, (25, 1, 1, self.z_shape))

        mnist = tf.keras.datasets.mnist
        (x_train, _), (x_test, _) = mnist.load_data()

        X = np.concatenate([x_train, x_test])
        X = np.reshape(X, (-1, 28, 28, 1))
        # Resize to the placeholder's spatial size (was hard-coded to 64x64,
        # which only matched img_shape == (64, 64, 1)).
        X = tf.image.resize_images(X, [self.rows, self.cols])
        self.X = (X / 127.5) - 1  # Scale between -1 and 1

        self.phX = tf.placeholder(dtype=tf.float32, shape=[None, self.rows, self.cols, self.channels])
        self.phZ = tf.placeholder(dtype=tf.float32, shape=[None, 1, 1, self.z_shape])
        # Scalar fed with an already-computed loss value for TensorBoard.
        self.loss_plot = tf.placeholder(dtype=tf.float32, shape=[])

        self.gen_out = self.generator.forward(self.phZ)

        disc_logits_fake = self.discriminator.forward(self.gen_out)
        disc_logits_real = self.discriminator.forward(self.phX)

        disc_loss_fake = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_logits_fake, labels=tf.zeros_like(disc_logits_fake)))
        disc_loss_real = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_logits_real, labels=tf.ones_like(disc_logits_real)))
        self.disc_loss = tf.add(disc_loss_fake, disc_loss_real)

        # The generator wants its fakes classified as real.
        self.gen_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_logits_fake, labels=tf.ones_like(disc_logits_fake)))

        self.disc_train = tf.train.AdamOptimizer(lr_dc, beta1=beta1).minimize(
            self.disc_loss, var_list=self.discriminator.variables)
        self.gen_train = tf.train.AdamOptimizer(lr_gen, beta1=beta1).minimize(
            self.gen_loss, var_list=self.generator.variables)

    def train(self):
        """Alternate one discriminator and one generator update per iteration."""
        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init)

        train_writer = tf.summary.FileWriter('./logs')
        train_writer.add_graph(tf.get_default_graph())

        dc_plot = tf.summary.scalar('Discriminator', self.loss_plot)
        gen_plot = tf.summary.scalar('Generator', self.loss_plot)

        # Materialise the resized dataset once.  The original evaluated
        # ``self.X`` inside the loop, resizing every image on each iteration.
        X_numpy = self.sess.run(self.X)

        for i in range(self.epochs):
            idx = np.random.randint(0, len(X_numpy), self.batch_size)
            batch_X = X_numpy[idx]
            batch_Z = np.random.uniform(-1, 1, (self.batch_size, 1, 1, self.z_shape))

            _, d_loss = self.sess.run([self.disc_train, self.disc_loss],
                                      feed_dict={self.phX: batch_X, self.phZ: batch_Z})

            # Fresh noise for the generator step.
            batch_Z = np.random.uniform(-1, 1, (self.batch_size, 1, 1, self.z_shape))
            _, g_loss = self.sess.run([self.gen_train, self.gen_loss], feed_dict={self.phZ: batch_Z})

            # NOTE(review): the collapsed source does not show whether the
            # summaries were written inside this ``if``; they are kept here
            # so that the integer step below is exact.
            if i % self.epochs_for_sample == 0:
                self.generate_sample(i)
                print("Epoch: " + str(i) + " Discriminator loss: " + str(d_loss) + " Generator loss: " + str(g_loss))
                step = i // self.epochs_for_sample
                train_writer.add_summary(self.sess.run(dc_plot, feed_dict={self.loss_plot: d_loss}), step)
                train_writer.add_summary(self.sess.run(gen_plot, feed_dict={self.loss_plot: g_loss}), step)

        # Flush pending events to disk.
        train_writer.close()

    def generate_sample(self, epoch):
        """Save a 5x5 grid of generated images to ``samples/<epoch>.png``."""
        import os  # local import: only needed to create the output directory

        c = 5
        r = 5

        imgs = self.sess.run(self.gen_out, feed_dict={self.phZ: self.fixed_z})
        imgs = imgs * 0.5 + 0.5  # scale between 0, 1

        fig, axs = plt.subplots(c, r)
        cnt = 0
        for i in range(c):
            for j in range(r):
                axs[i, j].imshow(imgs[cnt, :, :, 0], cmap="gray")
                axs[i, j].axis('off')
                cnt += 1

        os.makedirs("samples", exist_ok=True)
        fig.savefig("samples/%05d.png" % epoch)
        plt.close(fig)
class GAN_CLS(object):
    """Trainer for a text-conditioned GAN with a matching-aware discriminator."""

    def __init__(self, args, data_loader, SUPERVISED=True):
        """Store hyper-parameters, set up file logging and build the networks.

        Args:
            args: Parsed arguments object; the attributes read are the ones
                copied onto ``self`` below.
            data_loader: Iterable yielding dict batches with the keys
                ``'true_imgs'``, ``'true_embds'`` and ``'false_imgs'``.
            SUPERVISED: Stored on the instance; not used by this class.
        """
        self.data_loader = data_loader
        self.num_epochs = args.num_epochs
        self.batch_size = args.batch_size

        self.log_step = args.log_step
        self.sample_step = args.sample_step

        self.log_dir = args.log_dir
        self.checkpoint_dir = args.checkpoint_dir
        self.sample_dir = args.sample_dir
        self.final_model = args.final_model
        self.model_save_step = args.model_save_step

        self.img_size = args.img_size
        self.z_dim = args.z_dim
        self.text_embed_dim = args.text_embed_dim
        self.text_reduced_dim = args.text_reduced_dim
        self.learning_rate = args.learning_rate
        self.beta1 = args.beta1
        self.beta2 = args.beta2
        self.l1_coeff = args.l1_coeff
        self.resume_epoch = args.resume_epoch
        self.resume_idx = args.resume_idx
        self.SUPERVISED = SUPERVISED

        # Logger setting
        os.makedirs(self.log_dir, exist_ok=True)
        log_name = datetime.datetime.now().strftime('%Y-%m-%d') + '.log'
        # The original passed the quoted string '__name__', creating a logger
        # literally called "__name__" instead of one named after the module.
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        self.formatter = logging.Formatter(
            '%(asctime)s:%(levelname)s:%(message)s')
        self.file_handler = logging.FileHandler(
            os.path.join(self.log_dir, log_name))
        self.file_handler.setFormatter(self.formatter)
        self.logger.addHandler(self.file_handler)

        self.build_model()

    def smooth_label(self, tensor, offset):
        """Return ``tensor`` shifted by ``offset`` (label smoothing)."""
        return tensor + offset

    @staticmethod
    def dump_imgs(images_Array, name):
        """Pickle ``images_Array`` to ``<name>.pickle``.

        Declared static: the original signature had no ``self``, so calling
        it on an instance raised ``TypeError``.
        """
        with open('{}.pickle'.format(name), 'wb') as file:
            dump(images_Array, file)

    def build_model(self):
        """Create the generator, discriminator, optimizers and loss."""
        # 1. Network initialization
        self.gen = Generator(batch_size=self.batch_size,
                             img_size=self.img_size,
                             z_dim=self.z_dim,
                             text_embed_dim=self.text_embed_dim,
                             text_reduced_dim=self.text_reduced_dim)

        self.disc = Discriminator(batch_size=self.batch_size,
                                  img_size=self.img_size,
                                  text_embed_dim=self.text_embed_dim,
                                  text_reduced_dim=self.text_reduced_dim)

        self.gen_optim = optim.Adam(self.gen.parameters(),
                                    lr=self.learning_rate,
                                    betas=(self.beta1, self.beta2))

        self.disc_optim = optim.Adam(self.disc.parameters(),
                                     lr=self.learning_rate,
                                     betas=(self.beta1, self.beta2))

        # Joint optimizer kept for backward compatibility; train_model() no
        # longer steps it (see the note there).
        self.cls_gan_optim = optim.Adam(itertools.chain(
            self.gen.parameters(), self.disc.parameters()),
                                        lr=self.learning_rate,
                                        betas=(self.beta1, self.beta2))

        print('------------- Generator Model Info ---------------')
        self.print_network(self.gen, 'G')
        print('------------------------------------------------')

        print('------------- Discriminator Model Info ---------------')
        self.print_network(self.disc, 'D')
        print('------------------------------------------------')

        self.criterion = nn.BCELoss().cuda()

        self.gen.train()
        self.disc.train()

    def print_network(self, model, name):
        """Print ``model``, its ``name`` and its total parameter count."""
        num_params = sum(p.numel() for p in model.parameters())

        print(model)
        print(name)
        print("Total number of parameters: {}".format(num_params))

    def load_checkpoints(self, resume_epoch, idx):
        """Restore the trained generator and discriminator."""
        print('Loading the trained models from epoch {} and iteration {}...'.
              format(resume_epoch, idx))
        G_path = os.path.join(self.checkpoint_dir,
                              '{}-{}-G.ckpt'.format(resume_epoch, idx))
        D_path = os.path.join(self.checkpoint_dir,
                              '{}-{}-D.ckpt'.format(resume_epoch, idx))
        # map_location keeps tensors on the storage they are loaded into.
        self.gen.load_state_dict(
            torch.load(G_path, map_location=lambda storage, loc: storage))
        self.disc.load_state_dict(
            torch.load(D_path, map_location=lambda storage, loc: storage))

    def train_model(self):
        """Run the adversarial training loop and save checkpoints.

        Each batch performs one generator update followed by one
        discriminator update.  Requires CUDA, as in the original code.
        """
        data_loader = self.data_loader

        start_epoch = 0
        if self.resume_epoch >= 0:
            start_epoch = self.resume_epoch
            self.load_checkpoints(self.resume_epoch, self.resume_idx)

        print('--------------- Model Training Started ---------------')
        start_time = time.time()

        for epoch in range(start_epoch, self.num_epochs):
            print("Epoch: {}".format(epoch + 1))
            for idx, batch in enumerate(data_loader):
                print("Index: {}".format(idx + 1), end="\t")
                true_imgs = batch['true_imgs'].float().cuda()
                true_embed = batch['true_embds'].float().cuda()
                false_imgs = batch['false_imgs'].float().cuda()

                n_items = true_imgs.size(0)
                real_labels = torch.ones(n_items).cuda()
                fake_labels = torch.zeros(n_items).cuda()
                # One-sided label smoothing: real targets become 0.9.
                smooth_real_labels = self.smooth_label(real_labels, -0.1)

                # 2. Training the generator
                # The discriminator output must stay attached to the graph;
                # the original re-wrapped it with Variable(x.data, ...),
                # which cut the graph so the generator never got gradients.
                self.gen.zero_grad()
                z = torch.randn(n_items, self.z_dim).cuda()
                fake_imgs = self.gen(true_embed, z)
                fake_out, _ = self.disc(fake_imgs, true_embed)

                gen_loss = self.criterion(fake_out, real_labels)
                gen_loss.backward()
                self.gen_optim.step()

                # 3. Training the discriminator
                # zero_grad also clears the gradients the generator step
                # left on the discriminator's parameters.
                self.disc.zero_grad()
                true_out, _ = self.disc(true_imgs, true_embed)
                false_out, _ = self.disc(false_imgs, true_embed)
                # detach(): the generator is not updated in this step.
                fake_out_d, _ = self.disc(fake_imgs.detach(), true_embed)

                # Matching-aware loss: real image + right text is "real";
                # mismatched image/text and generated image are "fake".
                sr = self.criterion(true_out, smooth_real_labels)
                sw = self.criterion(false_out, fake_labels)
                sf = self.criterion(fake_out_d, fake_labels)

                disc_loss = sr + (sw + sf) / 2
                disc_loss.backward()
                self.disc_optim.step()
                # NOTE: cls_gan_optim.step() is intentionally not called;
                # with gradients now flowing it would apply a second update
                # to both networks from the same gradients.

                # Logging
                loss = {
                    'G_loss': gen_loss.item(),
                    'D_loss': disc_loss.item(),
                }

                # 4. Logging INFO into log_dir
                # NOTE(review): losses are logged every iteration; the
                # elapsed-time header is added only every log_step batches.
                parts = []
                if (idx + 1) % self.log_step == 0:
                    end_time = time.time() - start_time
                    end_time = datetime.timedelta(seconds=end_time)
                    parts.append("Elapsed [{}], Epoch [{}/{}], Idx [{}]".format(
                        end_time, epoch + 1, self.num_epochs, idx))

                parts.extend("{}: {:.4f}".format(net, loss_value)
                             for net, loss_value in loss.items())
                log = ", ".join(parts)
                self.logger.info(log)
                print(log)

                # 5. Saving generated images -- disabled in the original
                # (it was wrapped in a string literal).  To re-enable, every
                # sample_step batches:
                #   concat_imgs = (torch.cat((true_imgs, fake_imgs), 0) + 1) / 2
                #   save_path = os.path.join(
                #       self.sample_dir,
                #       '{}-{}-images.jpg'.format(epoch, idx + 1))
                #   self.dump_imgs(concat_imgs.detach().cpu().numpy(), save_path)

                # 6. Saving the checkpoints & final model
                if (idx + 1) % self.model_save_step == 0:
                    os.makedirs(self.checkpoint_dir, exist_ok=True)
                    G_path = os.path.join(
                        self.checkpoint_dir,
                        '{}-{}-G.ckpt'.format(epoch, idx + 1))
                    D_path = os.path.join(
                        self.checkpoint_dir,
                        '{}-{}-D.ckpt'.format(epoch, idx + 1))
                    torch.save(self.gen.state_dict(), G_path)
                    torch.save(self.disc.state_dict(), D_path)
                    print('Saved model checkpoints into {}...\n'.format(
                        self.checkpoint_dir))

        print('--------------- Model Training Completed ---------------')
        # Saving final model into final_model directory
        os.makedirs(self.final_model, exist_ok=True)
        G_path = os.path.join(self.final_model, '{}-G.pth'.format('final'))
        D_path = os.path.join(self.final_model, '{}-D.pth'.format('final'))
        torch.save(self.gen.state_dict(), G_path)
        torch.save(self.disc.state_dict(), D_path)
        print('Saved final model into {}...'.format(self.final_model))
class DCGAN:
    """Conditional DCGAN trained on labelled images read from TFRecords."""

    def __init__(self, img_shape, epochs=50000, lr_gen=0.0001, lr_disc=0.0001, z_shape=100, num_classes = 256, batch_size=100, beta1=0.5, epochs_for_sample=500):
        """Build the input pipeline, the networks, the losses and the train ops.

        Args:
            img_shape: ``(rows, cols, channels)`` of the training images.
            epochs: Number of training iterations (one batch each).
            lr_gen: Adam learning rate for the generator.
            lr_disc: Adam learning rate for the discriminator.
            z_shape: Length of the latent noise vector.
            num_classes: Width of the one-hot label vector.
            batch_size: Number of examples per batch.
            beta1: Adam ``beta1`` used by both optimizers.
            epochs_for_sample: Print the losses every this many iterations.
        """
        self.rows, self.cols, self.channels = img_shape
        self.batch_size = batch_size
        self.epochs = epochs
        self.z_shape = z_shape
        self.num_classes = num_classes
        self.epochs_for_sample = epochs_for_sample
        self.generator = Generator(self.z_shape, self.num_classes, img_shape, self.batch_size)
        self.discriminator = Discriminator(self.channels, self.num_classes, img_shape)
        self.samples = []
        self.losses = []
        # No live session yet; generate_random_sample() checks this.
        self.sess = None

        self.SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))

        # Default paths.
        self.DEFAULT_LABEL_FILE = os.path.join(self.SCRIPT_PATH,
                                               './labels/256-common-hangul.txt')
        self.DEFAULT_TFRECORDS_DIR = os.path.join(self.SCRIPT_PATH, 'tfrecords-output')

        # Read the label list; the context manager closes the handle the
        # original left open.  NOTE(review): the networks are sized from the
        # ``num_classes`` argument, not from this file -- confirm they agree.
        with io.open(self.DEFAULT_LABEL_FILE, 'r', encoding='utf-8') as label_file:
            self.labels = label_file.read().splitlines()

        print('Processing data...')

        tf_record_pattern = os.path.join(self.DEFAULT_TFRECORDS_DIR, '%s-*' % 'train')
        self.train_data_files = tf.gfile.Glob(tf_record_pattern)

        # Input pipeline.  ``repeat()`` with no count cycles indefinitely;
        # the original referenced an undefined ``self.train_epoch`` here and
        # raised AttributeError.  The number of batches consumed is bounded
        # by ``self.epochs`` in train().
        dataset = tf.data.TFRecordDataset(self.train_data_files)
        dataset = dataset.map(lambda x: get_image(x, self.num_classes))
        dataset = dataset.repeat()
        dataset = dataset.shuffle(buffer_size=3 * self.batch_size)
        dataset = dataset.batch(self.batch_size)
        dataset = dataset.prefetch(buffer_size=1)  # reduce GPU starvation

        # Make iterator for dataset
        self.iterator = dataset.make_initializable_iterator()
        self.next_element = self.iterator.get_next()

        self.phX = tf.placeholder(tf.float32, [None, self.rows, self.cols, self.channels])
        self.phZ = tf.placeholder(tf.float32, [None, self.z_shape])
        self.phY_g = tf.placeholder(tf.float32, [None, self.num_classes])
        # Label broadcast over every pixel for the discriminator.
        self.phY_d = tf.placeholder(tf.float32, shape=(None, self.rows, self.cols, self.num_classes))

        self.gen_out = self.generator.forward(self.phZ, self.phY_g)

        disc_logits_fake = self.discriminator.forward(self.gen_out, self.phY_d)
        disc_logits_real = self.discriminator.forward(self.phX, self.phY_d)

        disc_fake_loss = cost(tf.zeros_like(disc_logits_fake), disc_logits_fake)
        disc_real_loss = cost(tf.ones_like(disc_logits_real), disc_logits_real)

        self.disc_loss = tf.add(disc_fake_loss, disc_real_loss)
        self.gen_loss = cost(tf.ones_like(disc_logits_fake), disc_logits_fake)

        train_vars = tf.trainable_variables()

        # NOTE(review): substring tests -- any variable whose name contains
        # the letter 'd' (or 'g') is selected.  Verify against the variable
        # names created by Generator / Discriminator.
        self.disc_vars = [var for var in train_vars if 'd' in var.name]
        self.gen_vars = [var for var in train_vars if 'g' in var.name]

        self.disc_train = tf.train.AdamOptimizer(lr_disc, beta1=beta1).minimize(self.disc_loss, var_list=self.disc_vars)
        self.gen_train = tf.train.AdamOptimizer(lr_gen, beta1=beta1).minimize(self.gen_loss, var_list=self.gen_vars)

    def train(self):
        """Train for ``self.epochs`` batches, then sample and close the session."""
        init = [tf.global_variables_initializer(), self.iterator.initializer]
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.sess.run(init)

        # Initialize the queue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=self.sess, coord=coord)

        for i in range(self.epochs):
            # Get a random batch of images and labels.
            train_labels, train_images = self.sess.run(self.next_element)

            # Real images: reshape and rescale before feeding D.
            batch_X = train_images.reshape((self.batch_size, self.rows, self.cols, self.channels))
            batch_X = batch_X * 2 - 1

            # Z noise for Generator
            batch_Z = np.random.uniform(-1, 1, (self.batch_size, self.z_shape))

            # Label input for Generator
            batch_Y_g = train_labels.reshape([self.batch_size, self.num_classes])

            # Label input for Discriminator: broadcast over all pixels.
            batch_Y_d = train_labels.reshape([self.batch_size, 1, 1, self.num_classes])
            batch_Y_d = batch_Y_d * np.ones([self.batch_size, self.rows, self.cols, self.num_classes])

            feed = {self.phX: batch_X, self.phZ: batch_Z, self.phY_g: batch_Y_g, self.phY_d: batch_Y_d}
            _, d_loss = self.sess.run([self.disc_train, self.disc_loss], feed_dict=feed)

            # Fresh noise for the generator step.
            feed[self.phZ] = np.random.uniform(-1, 1, (self.batch_size, self.z_shape))
            _, g_loss = self.sess.run([self.gen_train, self.gen_loss], feed_dict=feed)

            if i % self.epochs_for_sample == 0:
                print(f"Epoch: {i}. Discriminator loss: {d_loss}. Generator loss: {g_loss}")

            # Save losses to view after training
            # NOTE(review): the collapsed source does not show whether this
            # append sat inside the ``if`` above; it is kept per-iteration.
            self.losses.append((d_loss, g_loss))

        # Save training generator samples
        with open('train_samples.pkl', 'wb') as f:
            pkl.dump(self.samples, f)

        # Sample from the TRAINED weights, i.e. the still-open session.
        self.generate_random_sample()

        # Stop queue threads and close session.
        coord.request_stop()
        coord.join(threads)
        self.sess.close()
        self.sess = None

    def generate_random_sample(self, class_index=0):
        """Save a 7x7 grid of generated images for one class.

        If a training session is open it is used, so the samples reflect the
        trained weights.  Otherwise a temporary session is created and the
        generator is restored from the latest checkpoint in ``checkpoints``.

        Args:
            class_index: Column of the one-hot conditioning vector set to 1.

        Raises:
            ValueError: If ``class_index`` is out of range, or no session is
                open and no checkpoint can be found.
        """
        if not 0 <= class_index < self.num_classes:
            raise ValueError(
                f"class_index must be in [0, {self.num_classes}), got {class_index}")

        c = 7
        r = 7
        n_sample = 100

        # Noise and labels must have the same batch dimension.
        z = np.random.uniform(-1, 1, (n_sample, self.z_shape))

        # One-hot conditioning vector.  The original wrote the value 4 into
        # column 0 of a hard-coded 256-wide array.
        batch_Y_g = np.zeros(shape=[n_sample, self.num_classes])
        batch_Y_g[:, class_index] = 1

        sess = getattr(self, 'sess', None)
        owns_session = sess is None
        if owns_session:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
        try:
            if owns_session:
                sess.run(tf.global_variables_initializer())
                # Only generator variables are restored.
                saver = tf.train.Saver(var_list=self.gen_vars)
                checkpoint = tf.train.latest_checkpoint('checkpoints')
                if checkpoint is None:
                    raise ValueError("No checkpoint found in 'checkpoints' and no open training session")
                saver.restore(sess, checkpoint)

            samples = sess.run(self.gen_out, feed_dict={self.phZ: z, self.phY_g: batch_Y_g})
        finally:
            if owns_session:
                sess.close()

        fig, axs = plt.subplots(c, r)
        cnt = 0
        for i in range(c):
            for j in range(r):
                axs[i, j].imshow(samples[cnt, :, :, 0], cmap="gray")
                axs[i, j].axis('off')
                cnt += 1

        os.makedirs("generated", exist_ok=True)
        fig.savefig("generated/generated_test_1.png")
        plt.close(fig)
class DCGAN:
    """DCGAN trained on images read from ``./data/celebA/`` (TensorFlow 1.x graph mode).

    The constructor loads the images, builds the generator/discriminator
    graph and creates one Adam training op per network. ``train`` alternates
    one discriminator step and one generator step per iteration, and
    ``generate_sample`` writes a 5x5 grid of generated images.
    """

    def __init__(self, img_shape, sample_folder_name, iterations=15000,
                 lr_gen=0.0001, lr_dc=0.00005, z_shape=100, batch_size=64,
                 beta1=0.7, sample_interval=1000):
        """Load the training images and build the GAN graph.

        Args:
            img_shape: ``(rows, cols, channels)`` of the discriminator input.
                Training images are resized to ``(rows, cols)`` on load.
            sample_folder_name: Folder that receives the sample grids.
            iterations: Number of training iterations.
            lr_gen: Adam learning rate of the generator.
            lr_dc: Adam learning rate of the discriminator.
            z_shape: Length of the generator's noise vector.
            batch_size: Number of images per training step.
            beta1: Adam ``beta1`` used by both optimizers.
            sample_interval: A sample grid is written every this many iterations.
        """
        # Create sample folder (no error if it already exists)
        os.makedirs(f"{sample_folder_name}/", exist_ok=True)
        self.SAMPLE_FOLDER_NAME = sample_folder_name

        # Unpack image shape
        self.rows, self.cols, self.channels = img_shape
        self.batch_size = batch_size
        self.iterations = iterations
        self.z_shape = z_shape
        self.sample_interval = sample_interval

        # NOTE(review): Generator() takes no shape argument, so its output
        # size is fixed elsewhere - confirm it agrees with img_shape.
        self.generator = Generator()
        self.discriminator = Discriminator(img_shape)

        # Load CelebA dataset: the first n_train file names in sorted order
        dir_data = "./data/celebA/"
        n_train = 200000
        train_names = np.sort(os.listdir(dir_data))[:n_train]

        # Resize to the requested (rows, cols) instead of a hard-coded 28x28,
        # so the loaded data always matches the phX placeholder below.
        target_size = (self.rows, self.cols)
        X = np.array([
            image.img_to_array(
                image.load_img(os.path.join(dir_data, name),
                               target_size=target_size))
            for name in train_names
        ])

        # Values 0~255 -> scale to -1~1
        self.X = X / 127.5 - 1

        # Create placeholders for input
        self.phX = tf.placeholder(
            tf.float32, [None, self.rows, self.cols, self.channels])
        self.phZ = tf.placeholder(tf.float32, [None, self.z_shape])

        # Generator forward pass
        self.gen_out = self.generator.forward(self.phZ)

        # Discriminator predictions: fake images first, then real images
        dc_logits_fake = self.discriminator.forward(self.gen_out)
        dc_logits_real = self.discriminator.forward(self.phX)

        # Cost functions: fake -- 0; real -- 1
        dc_fake_loss = cost(tf.zeros_like(dc_logits_fake), dc_logits_fake)
        dc_real_loss = cost(tf.ones_like(dc_logits_real), dc_logits_real)
        self.dc_loss = tf.add(dc_fake_loss, dc_real_loss)

        # Generator tries to fool D so that it outputs 1 for fake images
        self.gen_loss = cost(tf.ones_like(dc_logits_fake), dc_logits_fake)

        # Split trainable variables between D and G.
        # NOTE(review): this is a plain substring test on the variable name;
        # a name containing both letters ends up in both lists - verify
        # against the naming used inside Generator/Discriminator.
        train_vars = tf.trainable_variables()
        dc_vars = [var for var in train_vars if 'd' in var.name]
        gen_vars = [var for var in train_vars if 'g' in var.name]

        # Create training ops
        self.dc_train = tf.train.AdamOptimizer(lr_dc, beta1=beta1).minimize(
            self.dc_loss, var_list=dc_vars)
        self.gen_train = tf.train.AdamOptimizer(lr_gen, beta1=beta1).minimize(
            self.gen_loss, var_list=gen_vars)

    def train(self):
        """Run ``self.iterations`` alternating discriminator/generator steps."""
        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        # Init all vars
        self.sess.run(init)

        for i in range(self.iterations):
            # Random batch of real images (sampled with replacement) and noise
            idx = np.random.randint(0, len(self.X), self.batch_size)
            batch_X = self.X[idx]
            batch_Z = np.random.uniform(-1, 1, (self.batch_size, self.z_shape))

            # Train D and store its loss; reshape to the placeholder's shape
            batch_X = batch_X.reshape(
                [-1, self.rows, self.cols, self.channels])
            _, d_loss = self.sess.run([self.dc_train, self.dc_loss],
                                      feed_dict={
                                          self.phX: batch_X,
                                          self.phZ: batch_Z
                                      })

            # Fresh noise batch for G
            batch_Z = np.random.uniform(-1, 1, (self.batch_size, self.z_shape))
            # Train G and store its loss
            _, g_loss = self.sess.run([self.gen_train, self.gen_loss],
                                      feed_dict={self.phZ: batch_Z})

            # Generate samples and print loss
            if i % self.sample_interval == 0:
                self.generate_sample(i)
                print(
                    f"Epoch:{i}. Discriminator loss: {d_loss}. Generator loss {g_loss}"
                )

    def generate_sample(self, iteration):
        """Save a 5x5 grid of generated images as ``<sample folder>/<iteration>.png``.

        Args:
            iteration: Training iteration, used as the file name.
        """
        # 5 x 5 = 25 images per figure
        c, r = 5, 5
        z = np.random.uniform(-1, 1, (25, self.z_shape))
        imgs = self.sess.run(self.gen_out, feed_dict={self.phZ: z})

        # Scale back to values (0,1), currently (-1,1)
        imgs = imgs * 0.5 + 0.5
        # NOTE(review): cvtColor is given the whole 4-D batch and only channel
        # 0 is displayed below - confirm this is the intended rendering for
        # colour images.
        imgs = cv2.cvtColor(imgs, cv2.COLOR_BGR2RGB)

        fig, axs = plt.subplots(c, r)
        count = 0
        for i in range(c):
            for j in range(r):
                axs[i, j].imshow(imgs[count, :, :, 0])
                axs[i, j].axis('off')
                count += 1

        # save image
        fig.savefig(f"{self.SAMPLE_FOLDER_NAME}/{iteration}.png")
        plt.close()
# MGAIL: builds the full TensorFlow training graph in __init__ and exposes
# al_loss() for the policy's adversarial cost.
#
# __init__(environment, use_irl=False), in the order the code runs:
#   * creates the input placeholders (states_, states, actions, label, gamma,
#     temperature, noise flag, dropout keep-prob, log-probs);
#   * instantiates ForwardModel, then DiscriminatorIRL (out_dim=1,
#     state_only=True) when use_irl is set or Discriminator (out_dim=2)
#     otherwise, then Policy;
#   * creates the agent replay buffer (ER) and loads the expert buffer with
#     common.load_d4rl_er, then sets env.sigma = expert actions_std /
#     env.noise_intensity;
#   * normalizes states (and actions when env.continuous_actions) with the
#     expert statistics;
#   * 1. forward model: mean squared error between the normalized `states`
#     and the model prediction from (states_, actions, initial GRU state);
#   * 2. discriminator: the IRL branch builds its loss from log_p_tau,
#     log_q_tau and log_pq returned by the discriminator; the other branch
#     uses softmax cross-entropy on 2-way logits. In both branches the
#     label == 1 rows are scaled by env.cost_sensitive_weight;
#   * 3. action sampling: continuous actions are the denormalized policy
#     output plus `noise`-gated Gaussian noise; discrete actions are the
#     argmax of a gumbel-softmax sample;
#   * 4.3 policy objective: a tf.while_loop over policy_loop that steps the
#     environment, re-parametrizes the next state against the forward model,
#     and accumulates gamma**t * al_loss. It stops on the env termination
#     signal, on t reaching env.n_steps_train, or on the accumulated
#     transition error reaching env.total_trans_err_allowed.
#
# al_loss(d=None, log_p=None, log_q=None, log_pq=None): returns the mean
# adversarial cost times env.policy_al_w, from logits `d` (non-IRL) or from
# the three log terms (IRL).
#
# NOTE(review): in the IRL branch, round(discrim_output) is compared with
# argmax(labels, 1); if discrim_output is (batch, 1) the comparison
# broadcasts to (batch, batch) - confirm the output shape of
# DiscriminatorIRL.forward.
# NOTE(review): self.action_probs_test is assigned only on the continuous
# path and self.action_means appears once (the continuous assignment is
# commented out); the nesting of these lines cannot be confirmed from this
# text - verify against callers.
class MGAIL(object): def __init__(self, environment, use_irl=False): self.use_irl = use_irl self.env = environment # Create placeholders for all the inputs self.states_ = tf.compat.v1.placeholder("float", shape=(None, self.env.state_size), name='states_') # Batch x State self.states = tf.compat.v1.placeholder("float", shape=(None, self.env.state_size), name='states') # Batch x State self.actions = tf.compat.v1.placeholder("float", shape=(None, self.env.action_size), name='action') # Batch x Action self.label = tf.compat.v1.placeholder("float", shape=(None, 1), name='label') self.gamma = tf.compat.v1.placeholder("float", shape=(), name='gamma') self.temp = tf.compat.v1.placeholder("float", shape=(), name='temperature') self.noise = tf.compat.v1.placeholder("float", shape=(), name='noise_flag') self.do_keep_prob = tf.compat.v1.placeholder("float", shape=(), name='do_keep_prob') self.lprobs = tf.compat.v1.placeholder('float', shape=(None, 1), name='log_probs') # Create MGAIL blocks self.forward_model = ForwardModel(state_size=self.env.state_size, action_size=self.env.action_size, encoding_size=self.env.fm_size, lr=self.env.fm_lr) # MODIFYING THE NEW DISCRIMINATOR: if self.use_irl: self.discriminator = DiscriminatorIRL(in_dim=self.env.state_size + self.env.action_size, out_dim=1, size=self.env.d_size, lr=self.env.d_lr, do_keep_prob=self.do_keep_prob, weight_decay=self.env.weight_decay, state_only=True, gamma=self.gamma, state_size = self.env.state_size, action_size = self.env.action_size) # END MODIFYING THE NEW DISCRIMINATOR else: self.discriminator = Discriminator(in_dim=self.env.state_size + self.env.action_size, out_dim=2, size=self.env.d_size, lr=self.env.d_lr, do_keep_prob=self.do_keep_prob, weight_decay=self.env.weight_decay) self.policy = Policy(in_dim=self.env.state_size, out_dim=self.env.action_size, size=self.env.p_size, lr=self.env.p_lr, do_keep_prob=self.do_keep_prob, n_accum_steps=self.env.policy_accum_steps, weight_decay=self.env.weight_decay) # Create 
experience buffers self.er_agent = ER(memory_size=self.env.er_agent_size, state_dim=self.env.state_size, action_dim=self.env.action_size, batch_size=self.env.batch_size, history_length=1) self.er_expert = common.load_d4rl_er(h5path=os.path.join(self.env.run_dir, self.env.expert_data), batch_size=self.env.batch_size, history_length=1, traj_length=2) self.env.sigma = self.er_expert.actions_std / self.env.noise_intensity # Normalize the inputs states_ = common.normalize(self.states_, self.er_expert.states_mean, self.er_expert.states_std) states = common.normalize(self.states, self.er_expert.states_mean, self.er_expert.states_std) if self.env.continuous_actions: actions = common.normalize(self.actions, self.er_expert.actions_mean, self.er_expert.actions_std) else: actions = self.actions # 1. Forward Model initial_gru_state = np.ones((1, self.forward_model.encoding_size)) forward_model_prediction, _ = self.forward_model.forward([states_, actions, initial_gru_state]) forward_model_loss = tf.reduce_mean(tf.square(states-forward_model_prediction)) self.forward_model.train(objective=forward_model_loss) # 2. 
Discriminator labels = tf.concat([1 - self.label, self.label], 1) lprobs = self.lprobs # MODIFIED DISCRIMINATOR SECTION if self.use_irl: self.discrim_output, log_p_tau, log_q_tau, log_pq = self.discriminator.forward(states_, actions, states, lprobs) correct_predictions = tf.equal(tf.cast(tf.round(self.discrim_output), tf.int64), tf.argmax(labels, 1)) self.discriminator.acc = tf.reduce_mean(tf.cast(correct_predictions, "float")) d_cross_entropy = self.label*(log_p_tau-log_pq) + (1-self.label)*(log_q_tau-log_pq) d_loss_weighted = self.env.cost_sensitive_weight * tf.multiply(tf.compat.v1.to_float(tf.equal(tf.squeeze(self.label), 1.)), d_cross_entropy) +\ tf.multiply(tf.compat.v1.to_float(tf.equal(tf.squeeze(self.label), 0.)), d_cross_entropy) discriminator_loss = -tf.reduce_mean(d_loss_weighted) self.discriminator.train(objective=discriminator_loss) # END MODIFIED DISCRIMINATOR SECTION else: d = self.discriminator.forward(states, actions) # 2.1 0-1 accuracy correct_predictions = tf.equal(tf.argmax(d, 1), tf.argmax(labels, 1)) self.discriminator.acc = tf.reduce_mean(tf.cast(correct_predictions, "float")) # 2.2 prediction d_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=d, labels=labels) # cost sensitive weighting (weight true=expert, predict=agent mistakes) d_loss_weighted = self.env.cost_sensitive_weight * tf.multiply(tf.compat.v1.to_float(tf.equal(tf.squeeze(self.label), 1.)), d_cross_entropy) +\ tf.multiply(tf.compat.v1.to_float(tf.equal(tf.squeeze(self.label), 0.)), d_cross_entropy) discriminator_loss = tf.reduce_mean(d_loss_weighted) self.discriminator.train(objective=discriminator_loss) # 3. 
Collect experience mu = self.policy.forward(states) if self.env.continuous_actions: a = common.denormalize(mu, self.er_expert.actions_mean, self.er_expert.actions_std) eta = tf.random.normal(shape=tf.shape(a), stddev=self.env.sigma) self.action_test = a + self.noise * eta # self.action_means = mu N = tf.shape(self.action_test)[0] expanded_sigma= tf.repeat(tf.expand_dims(tf.cast(self.env.sigma, dtype=tf.float32), 0), N, axis=0) self.action_probs_test = common.compute_action_probs_tf(self.action_test, mu, expanded_sigma) else: a = common.gumbel_softmax(logits=mu, temperature=self.temp) self.action_test = tf.compat.v1.argmax(a, dimension=1) self.action_means = tf.squeeze(mu) # 4.3 AL def policy_loop(state_, t, total_cost, total_trans_err, _): mu = self.policy.forward(state_, reuse=True) if self.env.continuous_actions: eta = self.env.sigma * tf.random.normal(shape=tf.shape(mu)) action = mu + eta N = tf.shape(action)[0] expanded_sigma= tf.repeat(tf.expand_dims(tf.cast(self.env.sigma, dtype=tf.float32), 0), N, axis=0) a_prob = common.compute_action_probs_tf(action, mu, expanded_sigma) else: action = common.gumbel_softmax_sample(logits=mu, temperature=self.temp) a_prob = 0.5 # get action if self.env.continuous_actions: a_sim = common.denormalize(action, self.er_expert.actions_mean, self.er_expert.actions_std) else: a_sim = tf.compat.v1.argmax(action, dimension=1) # get next state state_env, _, env_term_sig, = self.env.step(a_sim, mode='tensorflow')[:3] state_e = common.normalize(state_env, self.er_expert.states_mean, self.er_expert.states_std) state_e = tf.stop_gradient(state_e) state_a, _ = self.forward_model.forward([state_, action, initial_gru_state], reuse=True) state, nu = common.re_parametrization(state_e=state_e, state_a=state_a) total_trans_err += tf.reduce_mean(abs(nu)) t += 1 # minimize the gap between agent logit (d[:,0]) and expert logit (d[:,1]) # MODIFIED DISCRIMINATOR SECTION: if self.use_irl: self.discrim_output, log_p_tau, log_q_tau, log_pq = 
self.discriminator.forward(state_, action, state, a_prob, reuse=True) cost = self.al_loss(log_p=log_p_tau, log_q=log_q_tau, log_pq=log_pq) else: d = self.discriminator.forward(state_, action, reuse=True) cost = self.al_loss(d=d) # END MODIFIED DISCRIMINATOR SECTION # add step cost total_cost += tf.multiply(tf.pow(self.gamma, t), cost) return state, t, total_cost, total_trans_err, env_term_sig def policy_stop_condition(state_, t, cost, trans_err, env_term_sig): cond = tf.logical_not(env_term_sig) cond = tf.logical_and(cond, t < self.env.n_steps_train) cond = tf.logical_and(cond, trans_err < self.env.total_trans_err_allowed) return cond state_0 = tf.slice(states, [0, 0], [1, -1]) loop_outputs = tf.while_loop(policy_stop_condition, policy_loop, [state_0, 0., 0., 0., False]) self.policy.train(objective=loop_outputs[2]) def al_loss(self, d=None, log_p=None, log_q=None, log_pq=None): if not self.use_irl: logit_agent, logit_expert = tf.split(axis=1, num_or_size_splits=2, value=d) labels = tf.concat([tf.zeros_like(logit_agent), tf.ones_like(logit_expert)], 1) d_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=d, labels=labels) else: # USING IRL d_cross_entropy = - (log_p - log_pq) + (log_q - log_pq) loss = tf.reduce_mean(d_cross_entropy) return loss*self.env.policy_al_w
class DCGAN:
    """GAN for MNIST digits assembled as a TensorFlow 1.x placeholder graph.

    Construction loads MNIST, wires generator and discriminator together and
    creates one Adam training op per network. ``train`` performs one
    discriminator step and one generator step per iteration, and
    ``generate_sample`` saves a 7x7 grid of generated digits.
    """

    def __init__(self, img_shape, epochs=50000, lr_gen=0.0001, lr_disc=0.0001,
                 z_shape=100, batch_size=64, beta1=0.5, epochs_for_sample=500):
        """Store the hyper-parameters, load MNIST and build the graph.

        Args:
            img_shape: ``(rows, cols, channels)`` of one image.
            epochs: Number of training iterations.
            lr_gen: Adam learning rate of the generator.
            lr_disc: Adam learning rate of the discriminator.
            z_shape: Length of the generator's noise vector.
            batch_size: Images (and noise vectors) per step.
            beta1: Adam ``beta1`` for both optimizers.
            epochs_for_sample: Interval, in iterations, between sample grids.
        """
        # Architecture and schedule settings
        self.rows, self.cols, self.channels = img_shape
        self.batch_size = batch_size
        self.epochs = epochs
        self.z_shape = z_shape
        self.epochs_for_sample = epochs_for_sample

        # The two underlying networks
        self.generator = Generator(img_shape, self.batch_size)
        self.discriminator = Discriminator(img_shape)

        # Train and test digits are pooled; the labels are not used.
        (train_digits, _), (test_digits, _) = tf.keras.datasets.mnist.load_data()
        pixels = np.concatenate([train_digits, test_digits])
        # Pixel values 0..255 mapped to -1..1
        self.X = pixels / 127.5 - 1

        self.phX = tf.placeholder(tf.float32, [None, self.rows, self.cols])
        self.phZ = tf.placeholder(tf.float32, [None, self.z_shape])

        self.gen_out = self.generator.forward(self.phZ)

        # Discriminator is applied to the fake batch first, then the real one.
        logits_fake = self.discriminator.forward(self.gen_out)
        logits_real = self.discriminator.forward(self.phX)

        # Discriminator targets: fake -> 0, real -> 1
        loss_on_fake = cost(tf.zeros_like(logits_fake), logits_fake)
        loss_on_real = cost(tf.ones_like(logits_real), logits_real)
        self.disc_loss = tf.add(loss_on_fake, loss_on_real)
        # Generator target: fake -> 1
        self.gen_loss = cost(tf.ones_like(logits_fake), logits_fake)

        # Variables are assigned to a network by a substring test on the name.
        disc_vars = []
        gen_vars = []
        for variable in tf.trainable_variables():
            if 'd' in variable.name:
                disc_vars.append(variable)
            if 'g' in variable.name:
                gen_vars.append(variable)

        disc_optimizer = tf.train.AdamOptimizer(lr_disc, beta1=beta1)
        self.disc_train = disc_optimizer.minimize(self.disc_loss,
                                                  var_list=disc_vars)
        gen_optimizer = tf.train.AdamOptimizer(lr_gen, beta1=beta1)
        self.gen_train = gen_optimizer.minimize(self.gen_loss,
                                                var_list=gen_vars)

    def train(self):
        """Run ``self.epochs`` alternating discriminator/generator updates."""
        init_op = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init_op)

        noise_shape = (self.batch_size, self.z_shape)
        for step in range(self.epochs):
            # Real images are drawn with replacement.
            picks = np.random.randint(0, len(self.X), self.batch_size)
            d_feed = {
                self.phX: self.X[picks],
                self.phZ: np.random.uniform(-1, 1, noise_shape),
            }
            _, d_loss = self.sess.run([self.disc_train, self.disc_loss],
                                      feed_dict=d_feed)

            # The generator step uses a fresh noise batch.
            g_feed = {self.phZ: np.random.uniform(-1, 1, noise_shape)}
            _, g_loss = self.sess.run([self.gen_train, self.gen_loss],
                                      feed_dict=g_feed)

            if step % self.epochs_for_sample != 0:
                continue
            self.generate_sample(step)
            print(
                f"Epoch: {step}. Discriminator loss: {d_loss}. Generator loss: {g_loss}"
            )

    def generate_sample(self, epoch):
        """Write a 7x7 grid of generated digits to ``samples/<epoch>.png``.

        Args:
            epoch: Training iteration, used as the file name.
        """
        grid_rows, grid_cols = 7, 7
        noise = np.random.uniform(-1, 1, (self.batch_size, self.z_shape))
        generated = self.sess.run(self.gen_out, feed_dict={self.phZ: noise})
        # scale between 0, 1
        generated = generated * 0.5 + 0.5

        fig, axs = plt.subplots(grid_rows, grid_cols)
        # axs.flat walks the grid row by row, matching the nested i/j loops.
        for position, axis in enumerate(axs.flat):
            axis.imshow(generated[position, :, :, 0], cmap="gray")
            axis.axis('off')
        fig.savefig("samples/%d.png" % epoch)
        plt.close()
def main(args):
    """Train a DCGAN on MNIST, logging losses and per-epoch sample grids.

    Args:
        args: Parsed options. This function reads ``dataset_root`` (must
            support ``mkdir``), ``batch_size`` and ``epochs``; the whole
            object is also passed to ``get_summary_writer_log_dir``.
    """
    transform = transforms.ToTensor()
    args.dataset_root.mkdir(parents=True, exist_ok=True)
    train_dataset = torchvision.datasets.MNIST(
        args.dataset_root, train=True, download=True, transform=transform
    )
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=args.batch_size,
        pin_memory=True,
        num_workers=8,
    )

    log_dir = get_summary_writer_log_dir(args)
    summary_writer = SummaryWriter(
        str(log_dir),
        flush_secs=5
    )

    noise_vector_size = 100
    # BCELoss needs floating-point targets. torch.full infers the dtype from
    # the fill value, so integer labels would yield integer tensors; the
    # labels are floats and the dtype is also passed explicitly below.
    real_label = 1.0
    fake_label = 0.0

    # Same noise every epoch, so the sample grids are comparable over time.
    fixed_noise = torch.randn(16, noise_vector_size, 1, 1).to(DEVICE)
    fixed_noise_generator_output = dict()

    generator = Generator(noise_vector_size, 28, 28, 1).to(DEVICE)
    discriminator = Discriminator(28, 28, 1).to(DEVICE)

    criterion = nn.BCELoss()

    generator_optimiser = optim.Adam(generator.parameters(), 0.0002, (0.5, 0.999))
    discriminator_optimiser = optim.Adam(discriminator.parameters(), 0.0002, (0.5, 0.999))

    os.makedirs("generated_digits", exist_ok=True)

    step = 0
    for i in range(args.epochs):
        for j, (batch, labels) in enumerate(train_loader):
            batch = batch.to(DEVICE)
            # The dataset labels are used only for their shape (batch size).
            labels = labels.to(DEVICE)

            real_targets = torch.full(labels.shape, real_label,
                                      dtype=torch.float, device=DEVICE)
            fake_targets = torch.full(labels.shape, fake_label,
                                      dtype=torch.float, device=DEVICE)

            # Discriminator step: real batch, then a detached fake batch so
            # no gradient flows into the generator here.
            discriminator.zero_grad()
            d_real_output = discriminator(batch).view(-1)
            Dx = d_real_output.mean().item()
            d_real_error = criterion(d_real_output, real_targets)
            d_real_error.backward()

            noise = torch.randn(len(labels), noise_vector_size, 1, 1).to(DEVICE)
            fake_data = generator(noise)
            d_fake_output = discriminator(fake_data.detach()).view(-1)
            d_fake_error = criterion(d_fake_output, fake_targets)
            d_fake_error.backward()
            DGz = d_fake_output.mean().item()
            d_error = d_fake_error + d_real_error
            discriminator_optimiser.step()

            # Generator step: the updated discriminator scores the same fake
            # batch, and the generator is trained towards the "real" label.
            generator.zero_grad()
            d_output = discriminator(fake_data).view(-1)
            g_error = criterion(d_output, real_targets)
            g_error.backward()
            generator_optimiser.step()

            # Log plain floats so no autograd graph is kept alive.
            d_loss = d_error.item()
            g_loss = g_error.item()
            print(f"epoch: {i}, step: {j+1}/{len(train_loader)}, Dx: {Dx:.5f}, DGz: {DGz:.5f}, D loss: {d_loss:.5f}, G loss: {g_loss:.5f}")
            summary_writer.add_scalars(
                "loss",
                {"D": d_loss, "G": g_loss},
                step
            )
            step += 1

        # Once per epoch: render the fixed noise and save the grid as <epoch>.png
        with torch.no_grad():
            fixed_output = generator(fixed_noise).detach().cpu()
        fixed_noise_generator_output[i] = vutils.make_grid(fixed_output, padding=2, normalize=True)
        plt.imshow(np.transpose(fixed_noise_generator_output[i], (1, 2, 0)))
        plt.axis("off")
        plt.tight_layout()
        plt.savefig(f"{str(log_dir)}/{i}.png", dpi=400, bbox_inches=0)
        # Close the figure so images do not pile up on the same axes.
        plt.close()

    summary_writer.close()
class DCGAN:
    """Label-conditioned MNIST GAN with swapped discriminator targets.

    The digit label is written into each training image as a one-hot vector
    (last column and last row), and into the first ten entries of the
    generator's input vector. In this variant the discriminator is trained
    towards 1 for generated images and 0 for real ones, and the generator
    towards 0.
    """

    def __init__(self, img_shape, epochs=50000, lr_gen=0.0001, lr_disc=0.0001,
                 z_shape=11, batch_size=64, beta1=0.5, epochs_for_sample=10000):
        """Store the hyper-parameters, prepare labelled MNIST and build the graph.

        Args:
            img_shape: ``(rows, cols, channels)`` of one image.
            epochs: Number of training iterations.
            lr_gen: Adam learning rate of the generator.
            lr_disc: Adam learning rate of the discriminator.
            z_shape: Length of the generator input; the first ten entries
                carry the one-hot digit.
            batch_size: Images (and input vectors) per step.
            beta1: Adam ``beta1`` for both optimizers.
            epochs_for_sample: Interval, in iterations, between sample grids.
        """
        self.rows, self.cols, self.channels = img_shape
        self.batch_size = batch_size
        self.epochs = epochs
        self.z_shape = z_shape
        self.epochs_for_sample = epochs_for_sample

        self.generator = Generator(img_shape, self.batch_size, self.z_shape)
        self.discriminator = Discriminator(img_shape)
        # Number of samples in the latest grid whose decoded digit matched.
        self.matching = 0

        (train_x, train_y), (test_x, test_y) = tf.keras.datasets.mnist.load_data()
        images = np.concatenate([train_x, test_x])
        digits = np.concatenate([train_y, test_y])
        # One-hot vectors as long as an image row
        digit_code = indices_to_one_hot(digits, self.rows)

        # Scale between -1 and 1, then overwrite the last column and the last
        # row of every image with the digit code.
        self.X = images / 127.5 - 1
        self.X[:, :, -1] = digit_code
        # FIXME: the same vector is used for both the column and the row
        self.X[:, -1, :] = digit_code

        self.phX = tf.placeholder(tf.float32, [None, self.rows, self.cols])
        self.phZ = tf.placeholder(tf.float32, [None, self.z_shape])

        self.gen_out = self.generator.forward(self.phZ)

        # Discriminator is applied to the fake batch first, then the real one.
        logits_fake = self.discriminator.forward(self.gen_out)
        logits_real = self.discriminator.forward(self.phX)

        # Swapped targets: fake -> 1, real -> 0 for the discriminator
        loss_on_fake = cost(tf.ones_like(logits_fake), logits_fake)
        loss_on_real = cost(tf.zeros_like(logits_real), logits_real)
        self.disc_loss = tf.add(loss_on_fake, loss_on_real)
        # ... and fake -> 0 for the generator
        self.gen_loss = cost(tf.zeros_like(logits_fake), logits_fake)

        # Variables are assigned to a network by a substring test on the name.
        all_vars = tf.trainable_variables()
        disc_vars = [v for v in all_vars if 'd' in v.name]
        gen_vars = [v for v in all_vars if 'g' in v.name]

        self.disc_train = tf.train.AdamOptimizer(lr_disc, beta1=beta1).minimize(
            self.disc_loss, var_list=disc_vars)
        self.gen_train = tf.train.AdamOptimizer(lr_gen, beta1=beta1).minimize(
            self.gen_loss, var_list=gen_vars)

    def _digit_noise(self):
        """Return ``(z, digits)``: generator inputs and the digit encoded in each.

        ``z`` is uniform noise in [-1, 1] whose first ten entries are replaced
        by a one-hot code of a randomly drawn digit.
        """
        z = np.random.uniform(-1, 1, (self.batch_size, self.z_shape))
        z[:, :10] = 0.
        digits = np.random.randint(10, size=self.batch_size)
        np.put_along_axis(z, digits[..., np.newaxis], 1, axis=1)
        return z, digits

    def train(self):
        """Run ``self.epochs`` alternating discriminator/generator updates."""
        init_op = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init_op)

        for step in range(self.epochs):
            # Real images are drawn with replacement.
            picks = np.random.randint(0, len(self.X), self.batch_size)
            real_batch = self.X[picks]

            d_noise, _ = self._digit_noise()
            _, d_loss = self.sess.run(
                [self.disc_train, self.disc_loss],
                feed_dict={self.phX: real_batch, self.phZ: d_noise})

            # The generator step uses a fresh input batch.
            g_noise, _ = self._digit_noise()
            _, g_loss = self.sess.run([self.gen_train, self.gen_loss],
                                      feed_dict={self.phZ: g_noise})

            if step % self.epochs_for_sample == 0:
                self.generate_sample(step)
                print(f"Epoch: {step}. Discriminator loss: {d_loss}. Generator loss: {g_loss}. Matching digit indicators: {self.matching}")

    def generate_sample(self, epoch):
        """Save a 7x7 grid of generated digits and update ``self.matching``.

        Each tile is titled with the requested digit, and the digit decoded
        from the image's last column is printed next to it.

        Args:
            epoch: Training iteration, zero-padded into the file name.
        """
        grid_rows, grid_cols = 7, 7
        z, wanted = self._digit_noise()
        generated = self.sess.run(self.gen_out, feed_dict={self.phZ: z})
        # scale between 0, 1
        generated = generated * 0.5 + 0.5

        # Decode the digit from the last column of every generated image.
        decoded = np.argmax(generated[:, :, -1, 0], axis=1)
        self.matching = np.sum(wanted == decoded)

        fig, axs = plt.subplots(grid_rows, grid_cols)
        fig.suptitle(f"Matching indices: {self.matching}")
        # axs.flat walks the grid row by row, matching the nested i/j loops.
        for position, axis in enumerate(axs.flat):
            axis.imshow(generated[position, :, :, 0], cmap="gray")
            axis.axis('off')
            axis.set_title(str(wanted[position]), size=7, pad=0.5)
            axis.text(30, 13.5, str(decoded[position]), size=7,
                      verticalalignment='center')

        padded_epoch = str(epoch).zfill(len(str(self.epochs)))
        fig.savefig("samples/targets_swapped_" + padded_epoch + ".png")
        plt.close()
# Discriminator training pass (fragment of a larger training script; the
# enclosing loop that defines `total_batch` is outside this excerpt).
#
# What the statements below do, in order:
#   * build the negative-sample path prefix from NEGATIVE_FILE and
#     total_batch, and call generate_samples() to produce generator output;
#   * wrap the real data and the generated `gene.txt` in a DisDataIter;
#   * for q in range(2): iterate the data, move tensors to CUDA when
#     opt.cuda is set, flatten the targets, run the discriminator, compute
#     dis_criterion and take one optimizer step per batch;
#   * reset the iterator, compute exp(total_loss / total_words), and build
#     the path 'discriminator<total_batch>.pkl' under m_save_path/c_cat,
#     creating that directory if needed.
#
# NOTE(review): the paths are joined with a literal backslash, so they are
# Windows-style paths - confirm the target platform.
# NOTE(review): total_words stays 0 if the iterator yields nothing, which
# makes the f_loss division fail - confirm the iterator is never empty.
# NOTE(review): `n` and `samples_lenth` are assigned but not read within
# this excerpt.
NEGATIVE_FILE1 = NEGATIVE_FILE + '\\' + str(total_batch) + '\\gene' samples_lenth = generate_samples(generator, BATCH_SIZE, GENERATED_NUM, NEGATIVE_FILE1,x_info,x_ids,start_id_list,end_id_list,bank_dict) NEGATIVE_FILEtxt = NEGATIVE_FILE + '\\' + str(total_batch) + '\\gene.txt' dis_data_iter = DisDataIter(real_data_id1, NEGATIVE_FILEtxt, BATCH_SIZE) for q in range(2): total_loss = 0. total_words = 0. n = 0 for (data, target) in dis_data_iter: n+=1 data = Variable(data) target = Variable(target) if opt.cuda: data, target = data.cuda(), target.cuda() target = target.contiguous().view(-1) pred = discriminator.forward(data) loss = dis_criterion(pred, target) # negative log likelihood loss total_loss += loss.item() total_words += data.size(0) * data.size(1) dis_optimizer.zero_grad() loss.backward() dis_optimizer.step() dis_data_iter.reset() f_loss = math.exp(total_loss/ total_words) d_save_path = os.path.join(m_save_path, c_cat) if not os.path.exists(d_save_path): os.mkdir(d_save_path) d_save_path = os.path.join(d_save_path, 'discriminator'+str(total_batch)+'.pkl')
# Divergence estimation and plotting (fragment of a larger script; `x`, `y`,
# `i`, `phi`, `values`, `model` and `directory` are defined outside this
# excerpt).
#
# What the statements below do, in order:
#   * for num_epochs iterations: pull one batch from the `x` and `y`
#     generators and call model.train(x_batch, y_batch, args.loss_type);
#   * draw fresh batches from samplers.distribution1(0, 10000) and
#     samplers.distribution1(i, 10000);
#   * with loss_type "JSD": append the negated model.loss_JSD(...) of the two
#     model outputs to `values`; with "WD": append the mean difference of the
#     model outputs on the two batches;
#   * plot `values` against `phi`, label the axes per loss type, and (JSD
#     branch) save the figure as '<directory>_JSD_phi.png'.
#
# NOTE(review): model.train is called here with data and a loss name; if
# `model` subclasses torch.nn.Module this overrides Module.train(mode) -
# confirm that is intended.
# NOTE(review): the entries appended to `values` look like torch tensors
# that may still require grad; confirm plt.plot accepts them as-is.
for epoch in range(num_epochs): x_batch = torch.from_numpy(next(x)) y_batch = torch.from_numpy(next(y)) model.train(x_batch.type(torch.FloatTensor), y_batch.type(torch.FloatTensor), args.loss_type) #torch.save(model.state_dict(), os.path.join(directory, 'best_params_'+str(i)+'.pt')) x_dist = samplers.distribution1(0, 10000) y_dist = samplers.distribution1(i, 10000) x_dist_batch = torch.from_numpy(next(x_dist)) y_dist_batch = torch.from_numpy(next(y_dist)) x_value = x_dist_batch.type(torch.FloatTensor) y_value = y_dist_batch.type(torch.FloatTensor) if args.loss_type == "JSD": print("JSD") jsd = model.loss_JSD( model.forward(x_dist_batch.type(torch.FloatTensor)), model.forward(y_dist_batch.type(torch.FloatTensor))) values.append(-jsd) elif args.loss_type == "WD": wd = torch.mean(model.forward(x_value) - model.forward(y_value)) values.append(wd) plt.plot(phi, values, 'o-') if args.loss_type == "JSD": plt.ylabel("JSD") plt.xlabel("phi") plt.title("JSD vs phi") plt.savefig(directory + '_JSD_phi.png', bbox_inches='tight') elif args.loss_type == "WD": plt.ylabel("Wasserstein Distance") plt.xlabel("Phi")
class DCGAN:
    """MNIST DCGAN built as a TensorFlow 1.x placeholder graph.

    ``__init__`` loads the data and builds the graph, ``train`` runs the
    alternating optimisation, and ``generate_sample`` stores a 7x7 grid of
    generated digits.
    """

    def __init__(self, img_shape, epochs=50000, lr_gen=0.0001, lr_dc=0.0001,
                 z_shape=100, batch_size=64, beta1=0.5, epochs_for_sample=500):
        """Store the hyper-parameters, load MNIST and build the graph.

        Args:
            img_shape: ``(rows, cols, channels)`` of one image.
            epochs: Number of training iterations.
            lr_gen: Learning rate for the generator.
            lr_dc: Learning rate for the discriminator.
            z_shape: Length of the generator input vector.
            batch_size: Images (and noise vectors) per training step.
            beta1: Adam ``beta1`` for both optimizers.
            epochs_for_sample: Interval for generating sample images.
        """
        rows, cols, channels = img_shape
        self.rows = rows
        self.cols = cols
        self.channels = channels
        self.batch_size = batch_size
        self.epochs = epochs
        self.z_shape = z_shape
        self.epochs_for_sample = epochs_for_sample

        self.generator = Generator(img_shape, self.batch_size)
        self.discriminator = Discriminator(img_shape)

        # MNIST: the labels are dropped and the train/test split is merged.
        train_part, test_part = tf.keras.datasets.mnist.load_data()
        raw = np.concatenate([train_part[0], test_part[0]])
        # 0..255 -> -1..1
        self.X = raw / 127.5 - 1

        # Input placeholders
        self.phX = tf.placeholder(tf.float32, [None, self.rows, self.cols])
        self.phZ = tf.placeholder(tf.float32, [None, self.z_shape])

        # Generator output, then discriminator logits (fake first, real second)
        self.gen_out = self.generator.forward(self.phZ)
        fake_logits = self.discriminator.forward(self.gen_out)
        real_logits = self.discriminator.forward(self.phX)

        # The discriminator should answer 0 for fake and 1 for real images.
        fake_term = cost(tf.zeros_like(fake_logits), fake_logits)
        real_term = cost(tf.ones_like(real_logits), real_logits)
        self.dc_loss = tf.add(fake_term, real_term)

        # The generator wants the discriminator to answer 1 for fake images.
        self.gen_loss = cost(tf.ones_like(fake_logits), fake_logits)

        # Variables are assigned to a network by a substring test on the name.
        trainable = tf.trainable_variables()
        dc_vars = list(filter(lambda v: 'd' in v.name, trainable))
        gen_vars = list(filter(lambda v: 'g' in v.name, trainable))

        # Training ops
        self.dc_train = tf.train.AdamOptimizer(lr_dc, beta1=beta1).minimize(
            self.dc_loss, var_list=dc_vars)
        self.gen_train = tf.train.AdamOptimizer(lr_gen, beta1=beta1).minimize(
            self.gen_loss, var_list=gen_vars)

    def _draw_noise(self):
        """Return a ``(batch_size, z_shape)`` array of uniform noise in [-1, 1]."""
        return np.random.uniform(-1, 1, (self.batch_size, self.z_shape))

    def train(self):
        """Run ``self.epochs`` alternating discriminator/generator updates."""
        init_op = tf.global_variables_initializer()
        self.sess = tf.Session()
        # Initialize all variables
        self.sess.run(init_op)

        iteration = 0
        while iteration < self.epochs:
            # Random indices in [0, len(X)), drawn with replacement
            chosen = np.random.randint(0, len(self.X), self.batch_size)
            real_images = self.X[chosen]
            noise_for_d = self._draw_noise()

            # Discriminator update
            _, d_loss = self.sess.run([self.dc_train, self.dc_loss],
                                      feed_dict={
                                          self.phX: real_images,
                                          self.phZ: noise_for_d
                                      })

            # Generator update on a new noise batch
            noise_for_g = self._draw_noise()
            _, g_loss = self.sess.run([self.gen_train, self.gen_loss],
                                      feed_dict={self.phZ: noise_for_g})

            # Periodically write samples and report both losses
            if iteration % self.epochs_for_sample == 0:
                self.generate_sample(iteration)
                print(
                    f"Epoch: {iteration}. Discriminator loss: {d_loss}. Generator loss: {g_loss}"
                )
            iteration += 1

    def generate_sample(self, epoch):
        """Save a 7x7 grid of generated digits to ``DCGAN 01/samples/<epoch>.png``.

        Args:
            epoch: Training iteration, used as the file name.
        """
        n_rows = 7
        n_cols = 7

        # One full batch is generated; the first 49 images fill the grid.
        noise = self._draw_noise()
        pictures = self.sess.run(self.gen_out, feed_dict={self.phZ: noise})
        # Back from [-1, 1] to [0, 1]
        pictures = pictures * 0.5 + 0.5

        fig, axs = plt.subplots(n_rows, n_cols)
        for k in range(n_rows * n_cols):
            # Row-major position of the k-th image in the grid
            row, col = divmod(k, n_cols)
            axs[row, col].imshow(pictures[k, :, :, 0], cmap="gray")
            axs[row, col].axis("off")

        # Save images
        fig.savefig("DCGAN 01/samples/%d.png" % epoch)
        plt.close()
# Per-image training loop head (fragment of a larger script; the generator
# update and the optimizer step for the discriminator lie outside this
# excerpt).
#
# For each index in inp_list the statements below:
#   * read the input image and its label image from config.data_dir /
#     config.label_dir and convert both with utils.image2tensor, moving them
#     to CUDA when available;
#   * zero the discriminator optimizer's gradients;
#   * run the discriminator on the label image (mask_r, D_real), the
#     generator on the input image (masks, f1, f2, x), and the discriminator
#     on the generator output x (mask_f, D_fake);
#   * combine the losses as D_loss = BCE(D_real, label_real) +
#     BCE(D_fake, label_fake) + config.gamma * d_map_loss(masks[-1], mask_f,
#     mask_r).
#
# NOTE(review): the paths are built by plain string concatenation, so
# config.data_dir and config.label_dir presumably end with a path
# separator - confirm.
# NOTE(review): x is passed to the discriminator without detach() in this
# excerpt; confirm whether gradients are meant to reach the generator during
# the discriminator update.
# Batch size is 1 by default. # It's not neccessary to use data loader # As I'm loading images one by one. for index in tqdm(range(len(inp_list))): img = mpimg.imread(config.data_dir + inp_list[index]) img = utils.image2tensor(img) label_img = mpimg.imread(config.label_dir + label_list[index]) label_img = utils.image2tensor(label_img) if torch.cuda.is_available(): img = img.cuda() label_img = label_img.cuda() # Train Discriminator optim_d.zero_grad() mask_r, D_real = model_discriminator.forward(label_img) masks, f1, f2, x = model_gen.forward(img) mask_f, D_fake = model_discriminator.forward(x) # Eq9 # L_map is the loss between the features extraced from # interior layers of the discriminator and the final attention map map_loss = d_map_loss(masks[-1], mask_f, mask_r) # -log(D(R)) D_loss_real = BCE_loss(D_real, label_real) # -log(1-D(O)) where O = G(z) D_loss_fake = BCE_loss(D_fake, label_fake) # Eq8. Gamma default to 0.05 D_loss = D_loss_real + D_loss_fake + config.gamma * map_loss
class MGAIL(object):
    """Builds one TF graph tying together forward model, discriminator and policy.

    Constructing an instance only defines placeholders, sub-networks, replay
    buffers and training objectives; nothing is run in a session here.
    """

    def __init__(self, environment, reweight, ensemble):
        """Create placeholders, MGAIL blocks, buffers and all training objectives.

        Args:
            environment: object supplying the sizes, learning rates and other
                settings read below, and a ``step`` method that is called with
                ``mode='tensorflow'``.  Its ``sigma`` attribute is set here.
            reweight: if truthy, the forward model is trained with the
                ``ex_wts`` placeholder and its ``reweight`` graph is built.
            ensemble: first dimension of the ``ex_wts`` placeholder; also
                passed on to ``ForwardModel``.
        """
        env = environment
        self.env = env
        self.reweight = reweight
        self.ensemble = ensemble

        # ---- Placeholders for all the inputs ----
        state_shape = (None, env.state_size)    # Batch x State
        action_shape = (None, env.action_size)  # Batch x Action
        self.states_ = tf.placeholder("float", shape=state_shape, name='states_')
        self.states = tf.placeholder("float", shape=state_shape, name='states')
        self.actions = tf.placeholder("float", shape=action_shape, name='action')
        self.label = tf.placeholder("float", shape=(None, 1), name='label')
        self.gamma = tf.placeholder("float", shape=(), name='gamma')
        self.temp = tf.placeholder("float", shape=(), name='temperature')
        self.noise = tf.placeholder("float", shape=(), name='noise_flag')
        self.do_keep_prob = tf.placeholder("float", shape=(), name='do_keep_prob')
        # The *_e inputs feed the validation side of the reweighting graph.
        self.states_e_ = tf.placeholder("float", shape=state_shape, name='states_e_')
        self.states_e = tf.placeholder("float", shape=state_shape, name='states_e')
        self.actions_e = tf.placeholder("float", shape=action_shape, name='action_e')
        self.ex_wts_ = tf.placeholder("float", shape=(self.ensemble, None), name='ex_wts')

        # ---- MGAIL blocks ----
        self.forward_model = ForwardModel(
            state_size=env.state_size,
            action_size=env.action_size,
            encoding_size=env.fm_size,
            lr=env.fm_lr,
            ensemble=self.ensemble,
        )
        self.discriminator = Discriminator(
            in_dim=env.state_size + env.action_size,
            out_dim=2,
            size=env.d_size,
            lr=env.d_lr,
            do_keep_prob=self.do_keep_prob,
            weight_decay=env.weight_decay,
        )
        self.policy = Policy(
            in_dim=env.state_size,
            out_dim=env.action_size,
            size=env.p_size,
            lr=env.p_lr,
            do_keep_prob=self.do_keep_prob,
            n_accum_steps=env.policy_accum_steps,
            weight_decay=env.weight_decay,
        )

        # ---- Experience buffers ----
        self.er_agent = ER(
            memory_size=env.er_agent_size,
            state_dim=env.state_size,
            action_dim=env.action_size,
            reward_dim=1,  # stub connection
            qpos_dim=env.qpos_size,
            qvel_dim=env.qvel_size,
            batch_size=env.batch_size,
            history_length=1,
        )
        expert_file = os.path.join(env.run_dir, env.expert_data)
        self.er_expert = common.load_er(fname=expert_file,
                                        batch_size=env.batch_size,
                                        history_length=1,
                                        traj_length=2)
        expert = self.er_expert
        env.sigma = expert.actions_std / env.noise_intensity

        # ---- Normalize the inputs with the expert statistics ----
        def norm_state(raw):
            return common.normalize(raw, expert.states_mean, expert.states_std)

        def norm_action(raw):
            # Only continuous actions are normalized; others pass through as-is.
            if env.continuous_actions:
                return common.normalize(raw, expert.actions_mean, expert.actions_std)
            return raw

        prev_states = norm_state(self.states_)
        cur_states = norm_state(self.states)
        actions = norm_action(self.actions)
        prev_states_e = norm_state(self.states_e_)
        cur_states_e = norm_state(self.states_e)
        actions_e = norm_action(self.actions_e)

        # ---- 1. Forward Model ----
        def ones_gru_state():
            # A fresh all-ones array of shape (1, encoding_size) on every call.
            return np.ones((1, self.forward_model.encoding_size))

        initial_gru_state = ones_gru_state()
        self.forward_model.train(x_=[prev_states, actions, initial_gru_state],
                                 y_=cur_states,
                                 ex_wts=self.ex_wts_ if self.reweight else None)
        if self.reweight:
            gru_state_rw = ones_gru_state()
            gru_state_val = ones_gru_state()
            self.forward_model.reweight(
                x_=[prev_states, actions, gru_state_rw],
                y_=cur_states,
                x_val_=[prev_states_e, actions_e, gru_state_val],
                y_val_=cur_states_e,
                bsize_a=env.batch_size,
                bsize_b=env.batch_size,
            )

        # 1.1 prediction (for development)
        # self.forward_model.predict(x_=[states_, actions, initial_gru_state], y_=states)

        # ---- 2. Discriminator ----
        # Two-column target: column 0 is (1 - label), column 1 is label.
        one_hot = tf.concat([1 - self.label, self.label], 1)
        disc_logits = self.discriminator.forward(cur_states, actions)

        # 2.1 0-1 accuracy
        hits = tf.equal(tf.argmax(disc_logits, 1), tf.argmax(one_hot, 1))
        self.discriminator.acc = tf.reduce_mean(tf.cast(hits, "float"))

        # 2.2 prediction
        xent = tf.nn.softmax_cross_entropy_with_logits(logits=disc_logits, labels=one_hot)
        # cost sensitive weighting (weight true=expert, predict=agent mistakes):
        # rows with label 1 are scaled by cost_sensitive_weight, rows with
        # label 0 keep their plain cross entropy.
        label_is_one = tf.to_float(tf.equal(tf.squeeze(self.label), 1.))
        weighted_ones = env.cost_sensitive_weight * tf.multiply(label_is_one, xent)
        label_is_zero = tf.to_float(tf.equal(tf.squeeze(self.label), 0.))
        plain_zeros = tf.multiply(label_is_zero, xent)
        self.discriminator.train(objective=tf.reduce_mean(weighted_ones + plain_zeros))

        # ---- 3. Collect experience ----
        policy_out = self.policy.forward(cur_states)
        if env.continuous_actions:
            mean_action = common.denormalize(policy_out, expert.actions_mean, expert.actions_std)
            jitter = tf.random_normal(shape=tf.shape(mean_action), stddev=env.sigma)
            # self.noise scales the sampled jitter added to the action.
            self.action_test = tf.squeeze(mean_action + self.noise * jitter)
        else:
            soft_sample = common.gumbel_softmax(logits=policy_out, temperature=self.temp)
            self.action_test = tf.argmax(soft_sample, dimension=1)

        # ---- 4.3 AL ----
        def rollout_step(state, step, cost_sum, trans_err_sum, _prev_term_sig):
            """Body of the while loop: act, score, step the env, accumulate."""
            out = self.policy.forward(state, reuse=True)
            if env.continuous_actions:
                action = out + env.sigma * tf.random_normal(shape=tf.shape(out))
            else:
                action = common.gumbel_softmax_sample(logits=out, temperature=self.temp)

            # minimize the gap between agent logit (d[:,0]) and expert logit (d[:,1])
            step_logits = self.discriminator.forward(state, action, reuse=True)
            step_cost = self.al_loss(step_logits)

            # add step cost, discounted by gamma ** step
            cost_sum = cost_sum + tf.multiply(tf.pow(self.gamma, step), step_cost)

            # action in the form handed to the environment
            if env.continuous_actions:
                env_action = common.denormalize(action, expert.actions_mean, expert.actions_std)
            else:
                env_action = tf.argmax(action, dimension=1)

            # next state from the environment; no gradient flows through it
            env_state, _, term_sig = env.step(env_action, mode='tensorflow')[:3]
            next_state_env = tf.stop_gradient(
                common.normalize(env_state, expert.states_mean, expert.states_std))

            # next state from the forward model
            next_state_model, _ = self.forward_model.forward(
                inputs=[state, action, initial_gru_state],
                is_training=False,
                dtype=tf.float32,
                w_dict=None,
                ex_wts=None,
                reuse=True)

            next_state, nu = common.re_parametrization(state_e=next_state_env,
                                                       state_a=next_state_model)
            trans_err_sum = trans_err_sum + tf.reduce_mean(abs(nu))
            return next_state, step + 1, cost_sum, trans_err_sum, term_sig

        def keep_rolling(state, step, cost_sum, trans_err_sum, term_sig):
            """Loop condition: not terminated, under step and error budgets."""
            go_on = tf.logical_not(term_sig)
            go_on = tf.logical_and(go_on, step < env.n_steps_train)
            return tf.logical_and(go_on, trans_err_sum < env.total_trans_err_allowed)

        # The rollout starts from the first row of the normalized state batch.
        start_state = tf.slice(cur_states, [0, 0], [1, -1])
        rollout = tf.while_loop(keep_rolling, rollout_step,
                                [start_state, 0., 0., 0., False])
        # Index 2 of the loop variables is the accumulated cost.
        self.policy.train(objective=rollout[2])

    def al_loss(self, d):
        """Return the policy cost for discriminator output ``d``.

        ``d`` is split into two equal halves along axis 1.  The cost is the
        mean softmax cross entropy of ``d`` against a target of zeros for the
        first half and ones for the second, scaled by ``self.env.policy_al_w``.
        """
        agent_part, expert_part = tf.split(axis=1, num_or_size_splits=2, value=d)

        # Cross entropy loss
        target = tf.concat([tf.zeros_like(agent_part), tf.ones_like(expert_part)], 1)
        xent = tf.nn.softmax_cross_entropy_with_logits(logits=d, labels=target)
        return tf.reduce_mean(xent)*self.env.policy_al_w