def wgangp_loss(image_shape, noise_shape, generator, discriminator, K):
    # WGAN-GP: Wasserstein critic loss with a gradient penalty on interpolated samples.
    opt = Adam(lr=1e-4, beta_1=0, beta_2=0.9)

    epsilon_input = K.placeholder(shape=(None, 1, 1, 1))
    real_img = Input(shape=image_shape)
    noise = Input(shape=noise_shape)
    fake_img = generator(noise)

    d_real = discriminator(real_img)
    d_fake = discriminator(fake_img)
    d_loss_real = K.mean(d_real, axis=-1)
    d_loss_fake = K.mean(d_fake, axis=-1)

    # Gradient penalty: penalize deviation of the critic's gradient norm from 1
    # on random interpolations between real and generated images.
    mixed_input = Input(shape=image_shape,
                        tensor=epsilon_input * real_img + (1 - epsilon_input) * fake_img)
    grad_mixed = K.gradients(discriminator(mixed_input), [mixed_input])[0]
    norm_grad_mixed = K.sqrt(K.sum(K.square(grad_mixed), axis=[1, 2, 3]))
    grad_penalty = K.mean(K.square(norm_grad_mixed - 1))

    d_loss = d_loss_fake - d_loss_real + 10 * grad_penalty
    d_training_updates = opt.get_updates(discriminator.trainable_weights, [], d_loss)
    d_train = K.function([real_img, noise, epsilon_input], [d_loss], d_training_updates)

    g_loss = -K.mean(d_fake, axis=-1)
    g_training_updates = opt.get_updates(generator.trainable_weights, [], g_loss)
    g_train = K.function([noise], [g_loss], g_training_updates)

    return d_train, g_train
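# A minimal, hypothetical usage sketch for the two functions returned by
# wgangp_loss above. The generator/discriminator models, shapes, batch size,
# and the sample_real_images helper are assumptions, not part of the original code.
import numpy as np
from keras import backend as K

def train_wgangp_sketch(generator, discriminator, sample_real_images,
                        n_steps=1000, batch_size=32, n_critic=5,
                        image_shape=(64, 64, 3), noise_shape=(100,)):
    d_train, g_train = wgangp_loss(image_shape, noise_shape, generator, discriminator, K)
    for _ in range(n_steps):
        # Several critic updates per generator update, as in the WGAN-GP paper.
        for _ in range(n_critic):
            real_batch = sample_real_images(batch_size)
            noise_batch = np.random.normal(size=(batch_size,) + noise_shape)
            eps = np.random.uniform(size=(batch_size, 1, 1, 1))
            d_loss, = d_train([real_batch, noise_batch, eps])
        noise_batch = np.random.normal(size=(batch_size,) + noise_shape)
        g_loss, = g_train([noise_batch])
    return d_loss, g_loss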
def _build_optimizer(self):
    """Build optimizers and loss functions.

    Returns:
        [actor optimizer, critic optimizer].
    """
    # actor optimizer
    actions = K.placeholder(shape=(None, 1))
    advantages = K.placeholder(shape=(None, 1))

    action_pred = self.actor.output
    entropy = K.sum(action_pred * K.log(action_pred + 1e-10), axis=1)
    closs = K.binary_crossentropy(actions, action_pred)
    actor_loss = K.mean(closs * K.flatten(advantages)) - 0.01 * entropy

    actor_optimizer = Adam(lr=self.actor_lr)
    actor_updates = actor_optimizer.get_updates(self.actor.trainable_weights, [],
                                                actor_loss)
    actor_train = K.function([self.actor.input, actions, advantages], [],
                             updates=actor_updates)

    # critic optimizer
    discounted_reward = K.placeholder(shape=(None, 1))
    value = self.critic.output
    critic_loss = K.mean(K.square(discounted_reward - value))

    critic_optimizer = Adam(lr=self.critic_lr)
    critic_updates = critic_optimizer.get_updates(self.critic.trainable_weights, [],
                                                  critic_loss)
    critic_train = K.function([self.critic.input, discounted_reward], [],
                              updates=critic_updates)

    return [actor_train, critic_train]
def __init__(self, latent_dim, hidden_dim, exploration_probability, clip_value,
             value_decay, data, batch_size, exploration_decay_rate):
    self.latent_dim = latent_dim
    self.words = data["words"]
    self.depth = 1 + max(len(w) for w in self.words)
    depth = self.depth
    self.hidden_dim = hidden_dim
    self.characters = data["characters"]
    self.charset = data["charset"]
    self.charmap = data["charmap"]
    self.wordcount = len(self.words)
    self.charcount = len(self.charset)
    self.generator = Generator("generator", latent_dim, depth, self.charcount, hidden_dim,
                               exploration_probability, exploration_decay_rate)
    self.discriminator = Discriminator("discriminator", depth, self.charcount, hidden_dim)
    self.clip_value = np.float32(clip_value)
    self.value_decay = theano.shared(np.float32(value_decay), "value_decay")
    self.batch_size = batch_size
    self.word_vectors = np.vstack([self.word_to_vector(word).reshape((1, -1))
                                   for word in self.words]).astype(np.int32)

    xreal = Input((depth,), name="xreal", dtype="int32")
    batch_n = T.iscalar("batch_n")
    srng = RandomStreams(seed=234)
    z = srng.normal(size=(batch_n, latent_dim))
    e = srng.uniform(size=(batch_n, depth), low=0, high=1)
    ex = srng.random_integers(size=(batch_n, latent_dim), low=0, high=self.charcount)
    # z = Input((latent_dim,), name="z", dtype="float32")
    # e = Input((depth,), name="e", dtype="float32")
    # ex = Input((depth,), name="ex", dtype="int32")
    # xreal = T.imatrix("xreal")
    # z = T.fmatrix("z")
    # e = T.fmatrix("e")
    # ex = T.imatrix("ex")

    _, xfake = self.generator.policy(z, e, ex)
    xfake = theano.gradient.zero_grad(xfake)
    # print("xfake: {}, {}".format(xfake, xfake.type))
    # print("xreal: {}, {}".format(xreal, xreal.type))
    _, yfake = self.discriminator.discriminator(xfake)
    _, yreal = self.discriminator.discriminator(xreal)

    # WGAN-style critic loss with weight-clipping constraints.
    dloss = T.mean(yfake, axis=None) - T.mean(yreal, axis=None)
    dconstraints = {p: ClipConstraint(self.clip_value) for p in self.discriminator.clip_params}
    dopt = Adam(1e-4)
    dupdates = dopt.get_updates(self.discriminator.params, dconstraints, dloss)

    # Discounted value targets for the generator's value head, computed by
    # scanning the (reversed) critic outputs.
    n = z.shape[0]
    outputs_info = [T.zeros((n,), dtype='float32')]
    yfaker = T.transpose(yfake[:, ::-1], (1, 0))
    vtarget, _ = theano.scan(reward_function, outputs_info=outputs_info,
                             sequences=yfaker, non_sequences=self.value_decay)
    vtarget = T.transpose(vtarget, (1, 0))[:, ::-1]
    # print("vtarget: {}, {}, {}".format(vtarget, vtarget.ndim, vtarget.type))
    _, vpred = self.generator.value(z, xfake)
    gloss = T.mean(T.abs_(vtarget - vpred), axis=None)
    gopt = Adam(1e-5)
    gupdates = gopt.get_updates(self.generator.params, {}, gloss)

    self.discriminator_train_function = theano.function([xreal, batch_n], [dloss],
                                                        updates=dupdates)
    self.generator_train_function = theano.function([batch_n], [gloss], updates=gupdates)
    self.generator_sample_function = theano.function([batch_n], [xfake])
    self.test_function = theano.function([xreal, batch_n], [dloss, gloss])
def actor_optimizer(self):
    action = K.placeholder(shape=[None, self.action_size])
    log_old_pi = K.placeholder(shape=[None, ])
    advantages = K.placeholder(shape=[None, ])

    mu = self.actor.output
    std = 0.1
    # Log-probability of the action under a Gaussian policy with fixed std.
    log_pi = -0.5 * K.square((action - mu) / std) - 0.5 * K.log(2 * np.pi) - K.log(std)

    # PPO clipped surrogate objective.
    ratio = K.exp(log_pi - log_old_pi)
    clipped_ratio = K.clip(ratio, 1 - self.clip, 1 + self.clip)
    returns = K.minimum(ratio * advantages, clipped_ratio * advantages)
    returns = -K.mean(returns)

    entropy = K.sum(K.exp(log_pi) * log_pi, axis=1)
    entropy = K.mean(entropy)

    loss = returns + self.entropy * entropy

    optimizer = Adam(lr=self.actor_lr)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
    train = K.function([self.actor.input, action, log_old_pi, advantages], [loss],
                       updates=updates)
    return train
def __init__(self, state_size, action_size, action_max):
    states_in = Input(shape=[state_size])
    h1 = Dense(units=H1_UNITS, activation='linear')(states_in)
    h1 = BatchNormalization()(h1)
    h1 = Activation('relu')(h1)
    h2 = Dense(units=H2_UNITS, activation='linear')(h1)
    h2 = BatchNormalization()(h2)
    h2 = Activation('relu')(h2)
    raw_actions = Dense(units=action_size, activation='tanh')(h2)
    actions = Lambda(lambda ra: ra * action_max)(raw_actions)
    self.model = Model(inputs=states_in, outputs=actions)

    # TODO: I have not fully worked through the policy-gradient update below.
    # action_gradients holds dQ/da from the critic, so minimizing
    # -action_gradients * actions ascends the critic's Q-value.
    action_gradients = Input(shape=[action_size])
    loss = K.mean(-action_gradients * actions)
    # Incorporate any additional losses here (e.g. from regularizers)

    optimizer = Adam(lr=ACTOR_LR)
    updates_op = optimizer.get_updates(params=self.model.trainable_weights, loss=loss)
    self.train_fn = K.function(
        inputs=[self.model.input, action_gradients, K.learning_phase()],
        outputs=[],
        updates=updates_op)
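# A hypothetical sketch of how the train_fn above is typically driven in
# DDPG-style training: the critic supplies dQ/da for the actor's own actions,
# and the actor ascends that gradient. `critic.get_action_gradients` and the
# actor/critic wrapper objects are assumptions, not part of the original class.
def update_actor_sketch(actor, critic, states):
    actions = actor.model.predict_on_batch(states)
    # Gradient of the critic's Q-value w.r.t. the actions, shape (batch, action_size).
    action_gradients = critic.get_action_gradients([states, actions, 0])[0]
    # learning_phase = 1 so BatchNormalization runs in training mode.
    actor.train_fn([states, action_gradients, 1])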
def optimizer(self):
    """
    grad Loss = - mean_t ( G_t * grad log pi(s_t, a_t) ) over an entire episode
    """
    # Placeholders
    states_pl = self.model.input
    actions_onehot_pl = K.placeholder(name='actions', shape=(None, self.output_dim))
    return_pl = K.placeholder(shape=(None, ))

    # Loss
    pi_pl = self.model.output
    pi_vec = K.sum(actions_onehot_pl * pi_pl, axis=1)
    loss_vec = -K.log(pi_vec) * K.stop_gradient(return_pl)
    loss = K.mean(loss_vec)

    # Apply updates
    opt = Adam(self.lr)
    pars = self.model.trainable_weights
    updates = opt.get_updates(loss=loss, params=pars)

    return K.function(inputs=[states_pl, actions_onehot_pl, return_pl],
                      outputs=[],
                      updates=updates)
def actor_optimizer(self):
    action = K.placeholder(shape=(None, self.action_size))
    advantages = K.placeholder(shape=(None, ))
    policy = self.actor.output

    good_prob = K.sum(((action - policy[:, :self.action_size]) /
                       (policy[:, self.action_size:] + 1e-9))**2 +
                      K.log(policy[:, self.action_size:] + 1e-9), axis=1)
    # good_prob = -K.sum(-2 * ((action - policy[:, :self.action_size]) / (policy[:, self.action_size:] + 1e-9)) - K.log(policy[:, self.action_size:] + 1e-9), axis=1)
    # good_prob = K.sum(action * policy, axis=1)
    # eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
    # eligibility = good_prob * K.stop_gradient(advantages)

    eligibility = good_prob * K.stop_gradient(advantages)
    # eligibility = -K.stop_gradient(advantages)
    loss = K.sum(eligibility)

    entropy = K.sum((policy**2) * K.log((policy**2) + 1e-10), axis=1)
    actor_loss = loss + 0.01 * entropy
    # actor_loss = K.sum((action - policy[:, :self.action_size]))  # / (policy[:, self.action_size:] + 1e-9)) + K.sum(K.log(policy[:, self.action_size:] + 1e10))

    optimizer = Adam(lr=self.actor_lr)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
    train = K.function([self.actor.input, action, advantages], [], updates=updates)
    return train
def set_mc_optimizer_fcn(self, output_action_num):
    # Build the Monte-Carlo (REINFORCE) optimization function.
    # Args: input state, action and (normalized) return
    # -> cross-entropy loss weighted by the return.

    # Set the action placeholder
    action_pseudo = K.placeholder(shape=[None, output_action_num])
    value_normdis = K.placeholder(shape=[None, ])

    # Probability of the taken action under the policy output
    action_prob = K.sum(action_pseudo * self.policy_model.output, axis=1)

    # Cross-entropy loss weighted by the normalized return
    cross_entropy = K.log(action_prob) * value_normdis
    loss = -K.sum(cross_entropy)

    # Declare train function with optimizer
    optimizer = Adam(lr=self.conf_lrn_rate)
    updates = optimizer.get_updates(self.policy_model.trainable_weights, [], loss)
    train_fcn = K.function(
        [self.policy_model.input, action_pseudo, value_normdis], [],
        updates=updates)
    return train_fcn
def optimizer(self):
    """
    grad L = - E_t [ Adv(t) * grad_theta log pi(s_t, a_t) ]
    where E_t is the average over an episode
    """
    # Placeholders
    state_pl = self.model.input
    action_onehot_pl = K.placeholder(name='action_onehot', shape=(None, self.output_dim))
    adv_pl = K.placeholder(name='advantage', shape=(None, ))

    # Set up loss
    pi_pl = self.model.output
    pi_vec = K.sum(action_onehot_pl * pi_pl, axis=1)
    loss_vec = -K.log(pi_vec) * K.stop_gradient(adv_pl)
    loss = K.mean(loss_vec)

    # Get updates
    opt = Adam(self.lr)
    pars = self.model.trainable_weights
    updates = opt.get_updates(loss=loss, params=pars)

    return K.function(inputs=[state_pl, action_onehot_pl, adv_pl],
                      outputs=[],
                      updates=updates)
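# A minimal, hypothetical usage sketch for the K.function returned by optimizer()
# above: one-hot encode the sampled actions and call the update once per episode.
# The function/argument names are assumptions, not part of the original class.
import numpy as np

def reinforce_update_sketch(train_fn, output_dim, states, actions, advantages):
    # states: (T, state_dim); actions: (T,) integer action ids; advantages: (T,)
    action_onehot = np.zeros((len(actions), output_dim), dtype=np.float32)
    action_onehot[np.arange(len(actions)), actions] = 1.0
    # Applies one gradient step on the policy network.
    train_fn([states, action_onehot, advantages])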
def compile_train_fn(model, learning_rate=1e-5):
    """ Build the CTC training routine for speech models.
    Args:
        model: A keras model (built=True) instance
        learning_rate (float): currently unused; the Adam lr below is hard-coded.
    Returns:
        train_fn (theano.function): Function that takes in acoustic inputs,
            and updates the model. Returns network outputs and ctc cost
    """
    logger.info("Building train_fn")
    acoustic_input = model.inputs[0]
    network_output = model.outputs[0]

    label = K.placeholder(ndim=2, dtype='int32')
    label_lens = K.placeholder(ndim=2, dtype='int32')
    ctc_input_lengths = K.placeholder(ndim=2, dtype='int32')

    # ctc_cost = K.mean(K.ctc_batch_cost(label, network_output, ctc_input_lengths, label_lens))
    ctc_cost = K.ctc_batch_cost(label, network_output, ctc_input_lengths, label_lens)

    trainable_vars = model.trainable_weights
    optimz = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, decay=0.0, epsilon=10e-8)
    # optimz = SGD(lr=1e-03, clipnorm=100, decay=1e-6, momentum=0.9, nesterov=True)
    updates = list(optimz.get_updates(trainable_vars, [], ctc_cost))

    train_fn = K.function(
        [acoustic_input, ctc_input_lengths, label, label_lens, K.learning_phase()],
        [network_output, ctc_cost],
        updates)
    return train_fn
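# A hypothetical sketch of one training step with the CTC train_fn above.
# Shapes and argument names are assumptions: acoustic features (batch, time, freq),
# zero-padded int32 labels (batch, max_label_len), and per-example lengths.
import numpy as np

def ctc_train_step_sketch(train_fn, features, labels, label_lens, input_lens):
    # The length placeholders are int32 with ndim=2, so reshape to (batch, 1).
    labels = np.asarray(labels, dtype=np.int32)
    input_lens = np.asarray(input_lens, dtype=np.int32).reshape(-1, 1)
    label_lens = np.asarray(label_lens, dtype=np.int32).reshape(-1, 1)
    # Input order matches the K.function built above; final 1 = training phase.
    network_output, ctc_cost = train_fn([features, input_lens, labels, label_lens, 1])
    return float(np.mean(ctc_cost))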
def training_function(training_model):
    def sim(v1, v2):
        return K.sum(v1 * v2, axis=-1)

    target_out = training_model.outputs[0]
    relevant_out = training_model.outputs[1]
    violate_out = training_model.outputs[2]

    margin = K.placeholder(shape=(None, ))
    loss = K.abs(margin + sim(target_out, violate_out) - sim(target_out, relevant_out))

    # adam = Adadelta(lr=0.01)
    adam = Adam(lr=1e-4)
    updates = adam.get_updates(params=training_model.trainable_weights, loss=loss)

    return K.function(
        inputs=[
            training_model.inputs[0],  # target_inputs: p_input
            training_model.inputs[1],  # l_input
            training_model.inputs[2],  # relevant_input
            training_model.inputs[3],  # violate_input
            margin                     # traj distance
        ],
        outputs=[loss],
        updates=updates)
def build_actor_optimizer(self):
    action = keras.backend.placeholder(shape=[None, self.action_size])
    advantages = keras.backend.placeholder(shape=[None, ])
    policy = self.actor.output

    action_prob = keras.backend.sum(action * policy, axis=1)
    cross_entropy = keras.backend.log(action_prob + 1e-6) * advantages
    cross_entropy = -keras.backend.mean(cross_entropy)

    entropy = keras.backend.sum(policy * keras.backend.log(policy + 1e-6), axis=1)
    entropy = keras.backend.mean(entropy)

    loss = cross_entropy + self.entropy * entropy

    optimizer = Adam(lr=self.actor_lr)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
    train = keras.backend.function(
        [self.actor.input[0], self.actor.input[1], action, advantages],
        [loss],
        updates=updates)
    return train
def optimizer(self):
    """
    The gradient of the loss function L is

        grad L = grad_pars V(s_t) * ( V(s_t) - Q(s_t, a_t) + alpha * log( pi(s_t, a_t) ) )
    """
    # Find the terms in the bracket
    S_pl = self.model.input
    Pi_pl = K.placeholder(shape=(None, 1))
    Q_pl = K.placeholder(shape=(None, 1))
    V_pl = self.model.output
    temp = V_pl - Q_pl + self.alpha * K.log(Pi_pl)

    # Find the gradient
    pars = self.model.trainable_weights
    grads = tf.gradients(V_pl, pars, -temp)  # scalar multiply by temp

    # Clip gradients
    if self.clipnorm == True:
        grads = tf.clip_by_global_norm(grads, self.clipnorm_val)[0]

    # Do learning.
    # To get Keras to apply updates given custom gradients (i.e. run the line above),
    # I had to alter the source code. It was easy to do. See line X in the
    # get_updates function.
    opt = Adam(self.lr)
    loss = grads  # placeholder, keras doesn't use it
    updates = opt.get_updates(loss=loss, params=pars, grads=grads)

    # This function will apply updates when called
    func = K.function(inputs=[S_pl, Q_pl, Pi_pl], outputs=[], updates=updates)
    return func
def actor_optimizer(self):
    action = K.placeholder(shape=(None, 1))
    advantages = K.placeholder(shape=(None, 1))
    # self.model.outputs
    mu, sigma_sq = self.actor.output
    # mu, sigma_sq = self.actor.predict(state)

    # Entropy of the Gaussian policy
    entropy_loss = ENTROPY_BETA * (-K.mean(0.5 * (K.log(2. * np.pi * sigma_sq) + 1.)))

    # Probability density function (PDF)
    # if sigma_sq is not None:  # problem with clip, don't use TF tensor as bool error
    # sigma_sq = np.clip(sigma_sq, 1e-3, None)
    p1 = -((action - mu)**2) / (2 * K.clip(sigma_sq, 1e-3, None))  # clip min only
    p2 = -K.log(K.sqrt(2 * np.pi * sigma_sq))

    # log prob(a|s) given theta
    log_prob = p1 + p2
    # log_prob * score fn = advantage
    log_prob_v = advantages * log_prob
    loss_policy_v = -K.mean(log_prob_v)

    # sum losses
    loss_v = loss_policy_v + entropy_loss

    optimizer = Adam(lr=self.actor_lr)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss_v)
    train = K.function([self.actor.input, action, advantages], [], updates=updates)
    return train
def actor_optimizer(self):
    action = K.placeholder(shape=(None, self.action_size))
    advantages = K.placeholder(shape=(None, ))
    policy = self.actor.output

    good_prob = K.sum(action * policy, axis=1)
    eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)  # 1e-10 to 1e-8
    loss = -K.sum(eligibility)

    entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)  # 1e-10 to 1e-8
    actor_loss = loss + 0.01 * entropy

    optimizer = Adam(lr=self.actor_lr)
    # updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
    # train = K.function([self.actor.input, action, advantages], [], updates=updates)
    updates = optimizer.get_updates(params=self.actor.trainable_weights, loss=actor_loss)
    train = K.function([self.actor.input, action, advantages], [], updates=updates)
    return train
def critic_optimizer(self):
    discounted_reward = K.placeholder(shape=(None, ), name="discounted_reward")
    value = self.critic.output
    critic_loss = K.square(discounted_reward - value)

    optimizer = Adam(lr=self.critic_lr,
                     beta_1=self.beta_1,
                     beta_2=self.beta_2,
                     epsilon=self.epsilon)
    with self.__global_score_list_lock:
        updates = optimizer.get_updates(self.critic.trainable_weights, [], critic_loss)

    # Actor weights are included in the outputs, presumably for debugging/inspection.
    weights_0 = self.actor.trainable_weights[0]
    weights_1 = self.actor.trainable_weights[1]
    weights_2 = self.actor.trainable_weights[2]
    weights_3 = self.actor.trainable_weights[3]

    train = K.function(
        [self.critic.input, discounted_reward],
        [critic_loss, value, weights_0, weights_1, weights_2, weights_3],
        updates=updates)

    global_logger.info("discounted_reward: {0}".format(discounted_reward))
    global_logger.info("critic output value: {0}".format(value))
    global_logger.info("self.critic.trainable_weights: {0}".format(
        self.critic.trainable_weights))
    return train
def compile_train_fn(self, learning_rate=2e-4):
    """ Build the CTC training routine for speech models.
    Args:
        learning_rate (float)
    Returns:
        train_fn (theano.function): Function that takes in acoustic inputs,
            and updates the model. Returns network outputs and ctc cost
    """
    logger.info("Building train_fn")
    f_inputs = [self.acoustic_input, self.ctc_in_lens]
    f_outputs = []
    f_updates = []

    for branch in self.branch_outputs:
        labels, label_lens = self.branch_labels[branch.name]
        f_inputs.append(labels)
        f_inputs.append(label_lens)

        if K.backend() == 'tensorflow':
            network_output = branch.output
            ctc_cost = K.mean(
                K.ctc_batch_cost(labels, network_output, self.ctc_in_lens, label_lens))
        else:
            network_output = branch.output.dimshuffle((1, 0, 2))
            ctc_cost = ctc_th.gpu_ctc(network_output, labels, self.ctc_in_lens).mean()

        f_outputs.extend([network_output, ctc_cost])

        trainable_vars = self.branch_vars[branch.name]
        optmz = Adam(lr=learning_rate, clipnorm=100)
        f_updates.extend(optmz.get_updates(trainable_vars, [], ctc_cost))

    f_inputs.append(K.learning_phase())
    self.train_fn = K.function(f_inputs, f_outputs, f_updates)
    return self.train_fn
def build_actor(self, state_size, action_size):
    h1_size = 64
    h2_size = 32
    h3_size = 16

    states = Input(shape=[state_size], name='states')
    h1 = Dense(h1_size, activation='relu', name='hidden1')(states)
    h2 = Dense(h2_size, activation='relu', name='hidden2')(h1)
    h3 = Dense(h3_size, activation='relu', name='hidden3')(h2)

    # Sigmoid keeps the raw actions in [0, 1]; the Lambda below rescales them to
    # the task's action range. The task's step function has a safety that reduces
    # overly high inputs to the max speed.
    actions_0_1 = Dense(action_size, activation='sigmoid', name='actions_0_1')(h3)
    actions = Lambda(lambda x: (x * self.action_range) + self.action_low,
                     name='output_actions')(actions_0_1)

    self.model = Model(inputs=states, outputs=actions)

    action_gradients = Input(shape=([self.action_size]), name='action_grads')
    loss = K.mean(-action_gradients * actions)

    optimizer = Adam()
    updates_op = optimizer.get_updates(params=self.model.trainable_weights, loss=loss)
    self.train_fn = K.function(
        inputs=[self.model.input, action_gradients, K.learning_phase()],
        outputs=[],
        updates=updates_op)
def model_optimizer(self):
    target = K.placeholder(shape=[None, self.action_size])
    weight = K.placeholder(shape=[None, ])

    # Huber loss.
    clip_delta = 1.0
    pred = self.model.output
    err = target - pred
    cond = K.abs(err) < clip_delta
    squared_loss = 0.5 * K.square(err)
    linear_loss = clip_delta * (K.abs(err) - 0.5 * clip_delta)
    loss1 = tf.where(cond, squared_loss, linear_loss)

    # PER loss: multiply the Huber loss by the importance-sampling weights.
    weighted_loss = tf.multiply(tf.expand_dims(weight, -1), loss1)
    loss = K.mean(weighted_loss, axis=-1)

    optimizer = Adam(lr=self.learning_rate)
    updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
    train = K.function([self.model.input, target, weight], [err], updates=updates)
    return train
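# A hypothetical sketch of how the train function above is used with prioritized
# experience replay: the returned TD errors update the sampled transitions'
# priorities. The `memory.sample` / `memory.update_priority` API is an assumption,
# not part of the original class.
import numpy as np

def per_train_step_sketch(train_fn, memory, batch_size):
    states, targets, is_weights, idxs = memory.sample(batch_size)
    # train_fn returns [err]; err has shape (batch, action_size).
    errors = train_fn([states, targets, is_weights])[0]
    new_priorities = np.abs(errors).max(axis=1)
    for idx, priority in zip(idxs, new_priorities):
        memory.update_priority(idx, priority)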
def build_actor_optimizer(self):
    action = keras.backend.placeholder(shape=[None, self.action_size])
    advantages = keras.backend.placeholder(shape=[None, ])
    policy = self.actor.output

    action_prob = keras.backend.sum(action * policy, axis=1)
    log_prob_actions_v = keras.backend.log(action_prob + 1e-6) * advantages
    loss_p = -keras.backend.sum(log_prob_actions_v)
    loss = keras.backend.mean(loss_p)

    optimizer = Adam(lr=self.actor_lr)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
    train = keras.backend.function(
        [self.actor.input[0], self.actor.input[1], action, advantages],
        [loss],
        updates=updates)
    return train
def __build_train_fn(self):
    action_prob_placeholder = self.speaker_model.output
    action_onehot_placeholder = K.placeholder(shape=(None, self.alphabet_size),
                                              name="action_onehot")
    reward_placeholder = K.placeholder(shape=(None, ), name="reward")

    action_prob = K.sum(action_prob_placeholder * action_onehot_placeholder, axis=1)
    log_action_prob = K.log(action_prob)
    loss = -log_action_prob * reward_placeholder

    ## Add entropy to the loss
    entropy = K.sum(action_prob_placeholder * K.log(action_prob_placeholder + 1e-10),
                    axis=1)
    entropy = K.sum(entropy)

    loss = loss + 0.1 * entropy
    loss = K.mean(loss)

    adam = Adam()
    updates = adam.get_updates(params=self.speaker_model.trainable_weights, loss=loss)

    self.train_fn = K.function(
        inputs=[self.speaker_model.input,
                action_onehot_placeholder,
                reward_placeholder],
        outputs=[loss, entropy],
        updates=updates)
def critic_optimizer(self):
    target = K.placeholder(shape=[None, ])
    loss = K.mean(K.square(target - self.critic.output))

    optimizer = Adam(lr=self.critic_lr)
    updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
    train = K.function([self.critic.input, target], [], updates=updates)
    return train
def __build_train_fn(self):
    # def loss(discount_r):
    #     def f(y_true, y_pred):
    #         action_prob = K.sum(y_true * y_pred, axis=1)
    #         action_prob = K.log(action_prob)
    #         policy_loss = -K.sum(discount_r) * K.mean(action_prob)
    #         policy_loss = K.print_tensor(policy_loss)
    #         return policy_loss
    #     return f
    # discount_reward_ = Input(shape=(1,))
    # state = Input(shape=(6400,))
    # pi_action = self.model(state)
    # model = Model([state, discount_reward_], pi_action)
    # adam = Adam(lr=1e-4)
    # rmsprop = RMSprop(lr=1e-4, clipnorm=1)  # 10
    # model.compile(optimizer=rmsprop, loss=loss(discount_reward_))

    action_prob_placeholder = self.model.output
    action_onehot_placeholder = K.placeholder(shape=(None, 2))
    discount_reward_placeholder = K.placeholder(shape=(None,))

    action_prob = K.sum(action_prob_placeholder * action_onehot_placeholder, axis=1)
    log_action_prob = K.log(action_prob)
    loss = -log_action_prob * discount_reward_placeholder
    loss = K.sum(loss)

    adam = Adam(lr=1e-4)  # , decay=0.99)
    rmsprop = RMSprop(lr=1e-4, decay=0.99)
    updates = adam.get_updates(params=self.model.trainable_weights, loss=loss)

    self.train_fn = K.function(
        inputs=[self.model.input,
                action_onehot_placeholder,
                discount_reward_placeholder],
        outputs=[loss],
        updates=updates)
def build_critic_optimizer(self):
    y = keras.backend.placeholder(shape=(None, 1))
    value = self.critic.output

    # Huber loss
    error = tf.abs(y - value)
    quadratic = keras.backend.clip(error, 0.0, 1.0)
    linear = error - quadratic
    loss = keras.backend.mean(0.5 * keras.backend.square(quadratic) + linear)

    optimizer = Adam(lr=self.critic_lr)
    updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
    train = keras.backend.function(
        [self.critic.input[0], self.critic.input[1], y],
        [loss],
        updates=updates)
    return train
def actor_optimizer(self):
    action = K.placeholder(shape=(None, 1))
    advantages = K.placeholder(shape=(None, 1))
    # mu = K.placeholder(shape=(None, self.action_size))
    # sigma_sq = K.placeholder(shape=(None, self.action_size))
    mu, sigma_sq = self.actor.output

    pdf = 1. / K.sqrt(2. * np.pi * sigma_sq) * K.exp(-K.square(action - mu) / (2. * sigma_sq))
    log_pdf = K.log(pdf + K.epsilon())
    entropy = K.sum(0.5 * (K.log(2. * np.pi * sigma_sq) + 1.))

    exp_v = log_pdf * advantages
    exp_v = K.sum(exp_v + 0.01 * entropy)
    actor_loss = -exp_v

    optimizer = Adam(lr=self.actor_lr)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
    train = K.function([self.actor.input, action, advantages], [], updates=updates)
    return train
def build_model(self, target=False):
    states = Input(shape=(self.state_size, ), name='states')
    net = Dense(units=40, activation='relu')(states)
    net = Dense(units=20, activation='relu')(net)
    actions = Dense(units=self.action_size, activation='tanh', name='actions')(net)

    if target:
        self.target = Model(inputs=states, outputs=actions)
        return

    self.model = Model(inputs=states, outputs=actions)

    action_gradients = Input(shape=(self.action_size, ))
    loss = K.mean(-action_gradients * actions)

    optimizer = Adam(lr=0.0001)
    updates_op = optimizer.get_updates(params=self.model.trainable_weights, loss=loss)
    self.train_fn = K.function(
        inputs=[self.model.input, action_gradients, K.learning_phase()],
        outputs=[],
        updates=updates_op)
def K_function_train():
    # Training can be done directly with K.function.
    import numpy as np
    from keras import backend as K
    from keras.layers.core import Dense
    from keras.models import Sequential
    from keras.optimizers import Adam

    y = K.placeholder(shape=[None, 1])

    model = Sequential()
    model.add(Dense(24, input_dim=3, activation='relu'))
    model.add(Dense(1))

    loss = K.mean(K.square(model.output - y))
    optimizer = Adam(lr=0.001)
    updates = optimizer.get_updates(model.trainable_weights, [], loss)
    train = K.function([model.input, y], [model.output, loss], updates=updates)

    # Try training
    data = np.random.randn(2, 3)
    target = np.random.randn(2, 1)
    output = model.predict(data)
    for i in range(1000):
        temp = train([data, target])
        if i % 100 == 0:
            print(i, temp)

    # Check the results
    print('data', data)
    print('target', target)
    print('predict', model.predict(data))
def actor_optimizer(self):
    # Placeholders for the actions and advantages coming in
    action = K.placeholder(shape=(None, 1))
    advantages = K.placeholder(shape=(None, 1))
    # mu = K.placeholder(shape=(None, self.action_size))
    # sigma_sq = K.placeholder(shape=(None, self.action_size))
    mu, sigma_sq = self.actor.output

    # Custom loss from the Gaussian PDF formula; K.exp is element-wise exponential
    pdf = 1. / K.sqrt(2. * np.pi * sigma_sq) * K.exp(-K.square(action - mu) / (2. * sigma_sq))
    # log pdf why?
    log_pdf = K.log(pdf + K.epsilon())

    # entropy looks different from log(sqrt(2 * pi * e * sigma_sq))
    # Sum of the values in a tensor, alongside the specified axis.
    entropy = K.sum(0.5 * (K.log(2. * np.pi * sigma_sq) + 1.))

    exp_v = log_pdf * advantages
    # entropy is made small before being added to exp_v
    exp_v = K.sum(exp_v + 0.01 * entropy)
    # loss is a negation
    actor_loss = -exp_v

    # Use the custom loss to perform updates with Adam, i.e. get gradients
    optimizer = Adam(lr=self.actor_lr)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
    # Adjust params with the custom train function
    train = K.function([self.actor.input, action, advantages], [], updates=updates)
    # Return the custom train function
    return train
def __init__(self, render=False):
    self.render = True
    self.state_dim = env.observation_space.n
    self.action_count = env.action_space.n
    self.update_frequency = 5
    self.discount_factor = 0.9
    self.running_variance = RunningVariance()
    print('state_dim: {}, action_count: {}, update_frequency: {} '.format(
        self.state_dim, self.action_count, self.update_frequency))

    # actor network
    actor = Sequential()
    actor.add(Dense(16, input_shape=(self.state_dim,), activation='relu',
                    kernel_initializer='he_uniform'))
    actor.add(Dense(self.action_count, activation='softmax',
                    kernel_initializer='he_uniform'))
    actor.summary()
    self.actor = actor

    # actor optimizer
    action = K.placeholder(shape=[None, self.action_count])
    advantage = K.placeholder(shape=[None, ])
    # Probability of the taken (one-hot) action under the policy.
    action_prob = K.sum(action * self.actor.output, axis=1)
    cross_entropy = K.log(action_prob) * advantage
    loss = -K.sum(cross_entropy)
    optimizer = Adam(lr=0.005)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
    train = K.function([self.actor.input, action, advantage], [], updates=updates)
    self.actor_optimizer = train

    # critic network
    critic = Sequential()
    critic.add(Dense(16, input_shape=(self.state_dim,), activation='relu',
                     kernel_initializer='he_uniform'))
    critic.add(Dense(1, activation='linear', kernel_initializer='he_uniform'))
    critic.summary()
    self.critic = critic

    # critic optimizer
    target = K.placeholder(shape=[None, ])
    loss = K.mean(K.square(target - self.critic.output))
    optimizer = Adam(lr=0.001)
    updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
    train = K.function([self.critic.input, target], [], updates=updates)
    self.critic_optimizer = train
def scores_from_adgan_generator(x_test, prior_gen, generator, n_seeds=8, k=5,
                                z_lr=0.25, gen_lr=5e-5):
    generator.trainable = True
    initial_weights = generator.get_weights()

    gen_opt = Adam(lr=gen_lr, beta_1=0.5)
    z_opt = Adam(lr=z_lr, beta_1=0.5)

    x_ph = K.placeholder((1, ) + x_test.shape[1:])
    z = K.variable(prior_gen(1))
    rec_loss = K.mean(K.square(x_ph - generator(z)))

    z_train_fn = K.function([x_ph], [rec_loss],
                            updates=z_opt.get_updates(rec_loss, [z]))
    g_train_fn = K.function([x_ph, K.learning_phase()], [rec_loss],
                            updates=gen_opt.get_updates(rec_loss,
                                                        generator.trainable_weights))

    gen_opt_initial_params = gen_opt.get_weights()
    z_opt_initial_params = z_opt.get_weights()

    scores = []
    for x in x_test:
        x = np.expand_dims(x, axis=0)
        losses = []
        for j in range(n_seeds):
            # Reset z, the generator, and both optimizers for every seed.
            K.set_value(z, prior_gen(1))
            generator.set_weights(initial_weights)
            gen_opt.set_weights(gen_opt_initial_params)
            z_opt.set_weights(z_opt_initial_params)
            for _ in range(k):
                z_train_fn([x])
                g_train_fn([x, 1])
            loss = z_train_fn([x])[0]
            losses.append(loss)
        score = -np.mean(losses)
        scores.append(score)
    return np.array(scores)
def critic_optimizer(self):
    discounted_reward = K.placeholder(shape=(None, ))
    value = self.critic.output
    loss = K.mean(K.square(discounted_reward - value))

    optimizer = Adam(lr=self.critic_lr)
    updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
    train = K.function([self.critic.input, discounted_reward], [], updates=updates)
    return train
def optimizer(self):
    action = K.placeholder(shape=[None, 5])
    discounted_rewards = K.placeholder(shape=[None, ])

    # Calculate the cross-entropy loss function
    action_prob = K.sum(action * self.model.output, axis=1)
    cross_entropy = K.log(action_prob) * discounted_rewards
    loss = -K.sum(cross_entropy)

    # Create the training function
    optimizer = Adam(lr=self.learning_rate)
    updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
    train = K.function([self.model.input, action, discounted_rewards], [],
                       updates=updates)
    return train
def actor_optimizer(self):
    action = K.placeholder(shape=(None, self.action_size))
    advantages = K.placeholder(shape=(None, ))
    policy = self.actor.output

    good_prob = K.sum(action * policy, axis=1)
    eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
    loss = -K.sum(eligibility)

    entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
    actor_loss = loss + 0.01 * entropy

    optimizer = Adam(lr=self.actor_lr)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
    train = K.function([self.actor.input, action, advantages], [], updates=updates)
    return train
regularizers += _regularizers
constraints += _consts
updates += _updates

print("parameters:")
print(params)
print("regularizers:")
print(regularizers)
print("constraints:")
print(constraints)
print("updates:")
print(updates)

"""updates"""
optimizer = Adam()
_updates = optimizer.get_updates(params, constraints, train_loss)
updates += _updates
print("after Adam, updates:")
for update in updates:
    print(update)

train_ins = [X_train, y, weights]
test_ins = [X_test, y, weights]
predict_ins = [X_test]

"""Get functions"""
print("compile: _train")
_train = K.function(train_ins, [train_loss], updates=updates)
print("compile: _train_with_acc")
_train_with_acc = K.function(train_ins, [train_loss, train_accuracy], updates=updates)