def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_): """ In order to get this to work we need to be careful not to update the actor parameters when updating the critic. This can be an issue when the Concatenating networks together. The first first network becomes a part of the second. However you can still access the first network by itself but an updates on the second network will effect the first network. Care needs to be taken to make sure only the parameters of the second network are updated. """ super(DPG, self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_) self._Fallen = T.bcol("Fallen") ## because float64 <= float32 * int32, need to use int16 or int8 self._Fallen.tag.test_value = np.zeros((self._batch_size, 1), dtype=np.dtype('int8')) self._fallen_shared = theano.shared(np.zeros((self._batch_size, 1), dtype='int8'), broadcastable=(False, True)) self._Action = T.matrix("Action2") self._Action.tag.test_value = np.random.rand(self._batch_size, self._action_length) self._Tmp_Target = T.col("Tmp_Target") self._Tmp_Target.tag.test_value = np.zeros( (self._batch_size, 1), dtype=np.dtype(self.getSettings()['float_type'])) self._tmp_target_shared = theano.shared(np.zeros( (self._batch_size, 1), dtype=self.getSettings()['float_type']), broadcastable=(False, True)) self._modelTarget = copy.deepcopy(model) # print ("Initial W " + str(self._w_o.get_value()) ) self._learning_rate = self.getSettings()['learning_rate'] self._discount_factor = self.getSettings()['discount_factor'] self._rho = self.getSettings()['rho'] self._rms_epsilon = self.getSettings()['rms_epsilon'] self._weight_update_steps = self.getSettings( )['steps_until_target_network_update'] self._updates = 0 self._decay_weight = self.getSettings()['regularization_weight'] self._critic_regularization_weight = self.getSettings( )["critic_regularization_weight"] self._critic_learning_rate = self.getSettings()["critic_learning_rate"] # self._q_valsA = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) # self._q_valsA_drop = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) # self._q_valsNextState = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) # self._q_valsTargetNextState = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) # self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) # self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsActA = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsActTarget = lasagne.layers.get_output( self._modelTarget.getActorNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) # self._q_valsActA_drop = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) inputs_1 = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getActionSymbolicVariable(): self._model.getActions() } self._q_valsA = lasagne.layers.get_output( self._model.getCriticNetwork(), inputs_1) 
inputs_1_policy = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getActionSymbolicVariable(): self._q_valsActA } self._q_vals_train_policy = lasagne.layers.get_output( self._model.getCriticNetwork(), inputs_1_policy) inputs_2 = { self._modelTarget.getStateSymbolicVariable(): self._model.getResultStates(), self._modelTarget.getActionSymbolicVariable(): self._model.getActions() } self._q_valsB_ = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), inputs_2, deterministic=True) self._q_func = self._q_valsA self._q_funcB = self._q_valsB_ # self._q_funcTarget = self._q_valsTarget # self._q_func_drop = self._q_valsA_drop # self._q_funcTarget_drop = self._q_valsTarget_drop self._q_funcAct = self._q_valsActA # self._q_funcAct_drop = self._q_valsActA_drop # self._q_funcAct = theano.function(inputs=[State], outputs=self._q_valsActA, allow_input_downcast=True) # self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsB )), self._Fallen) self._diff = self._Tmp_Target - self._q_func # self._diff_drop = self._target - self._q_func_drop # loss = 0.5 * self._diff ** 2 loss = T.pow(self._diff, 2) self._loss = T.mean(loss) # self._loss_drop = T.mean(0.5 * self._diff_drop ** 2) # assert len(lasagne.layers.helper.get_all_params(self._l_outA)) == 16 # Need to remove the action layers from these params self._params = lasagne.layers.helper.get_all_params( self._model.getCriticNetwork()) print("******Number of Layers is: " + str( len( lasagne.layers.helper.get_all_params( self._model.getCriticNetwork())))) print("******Number of Action Layers is: " + str( len( lasagne.layers.helper.get_all_params( self._model.getActorNetwork())))) self._actionParams = lasagne.layers.helper.get_all_params( self._model.getActorNetwork()) self._givens_ = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getActionSymbolicVariable(): self._model.getActions(), # self._Action: self._q_valsActTarget, # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._Fallen: self._fallen_shared self._Tmp_Target: self._tmp_target_shared } self._actGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._model.getActionSymbolicVariable(): self._model.getActions(), # self._Fallen: self._fallen_shared # self._tmp_diff: self._tmp_diff_shared } self._critic_regularization = ( self._critic_regularization_weight * lasagne.regularization.regularize_network_params( self._model.getCriticNetwork(), lasagne.regularization.l2)) ## MSE update self._value_grad = T.grad(self._loss + self._critic_regularization, self._params) print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.adam(self._value_grad, self._params, self._critic_learning_rate, beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon) self._givens_grad = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), self._model.getActionSymbolicVariable(): self._model.getActions(), } ## Some cool stuff to backprop action gradients self._action_grad = T.matrix("Action_Grad") 
self._action_grad.tag.test_value = np.zeros( (self._batch_size, self._action_length), dtype=np.dtype(self.getSettings()['float_type'])) self._action_grad_shared = theano.shared( np.zeros((self._batch_size, self._action_length), dtype=self.getSettings()['float_type'])) ### Maximize wrt q function self._action_mean_grads = T.grad( cost=None, wrt=self._actionParams, known_grads={self._q_valsActA: self._action_grad_shared}), print("Action grads: ", self._action_mean_grads[0]) ## When passing in gradients it needs to be a proper list of gradient expressions self._action_mean_grads = list(self._action_mean_grads[0]) # print ("isinstance(self._action_mean_grads, list): ", isinstance(self._action_mean_grads, list)) # print ("Action grads: ", self._action_mean_grads) self._actionGRADUpdates = lasagne.updates.adam( self._action_mean_grads, self._actionParams, self._learning_rate, beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon) self._actGradGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._model.getActionSymbolicVariable(): self._model.getActions(), # self._Fallen: self._fallen_shared, # self._advantage: self._advantage_shared, # self._KL_Weight: self._kl_weight_shared } # theano.gradient.grad_clip(x, lower_bound, upper_bound) # // TODO # self._actionUpdates = lasagne.updates.adam(-T.mean(self._q_vals_train_policy) + # (self._decay_weight * lasagne.regularization.regularize_network_params( # self._model.getActorNetwork(), lasagne.regularization.l2)), self._actionParams, # self._learning_rate, beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon) if ('train_extra_value_function' in self.getSettings() and (self.getSettings()['train_extra_value_function'] == True)): self._valsA = lasagne.layers.get_output( self._model._value_function, self._model.getStateSymbolicVariable(), deterministic=True) self._valsA_drop = lasagne.layers.get_output( self._model._value_function, self._model.getStateSymbolicVariable(), deterministic=False) self._valsNextState = lasagne.layers.get_output( self._model._value_function, self._model.getResultStateSymbolicVariable(), deterministic=True) self._valsTargetNextState = lasagne.layers.get_output( self._modelTarget._value_function, self._model.getResultStateSymbolicVariable(), deterministic=True) self._valsTarget = lasagne.layers.get_output( self._modelTarget._value_function, self._model.getStateSymbolicVariable(), deterministic=True) # self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsB )), self._Fallen) # self._target = self._model.getRewardSymbolicVariable() + ((self._discount_factor * self._q_valsTargetNextState ) * self._NotFallen) + (self._NotFallen - 1) self._v_target = self._model.getRewardSymbolicVariable() + ( self._discount_factor * self._valsTargetNextState) self._v_diff = self._v_target - self._valsA # loss = 0.5 * self._diff ** 2 loss_v = T.pow(self._v_diff, 2) self._v_loss = T.mean(loss_v) self._params_value = lasagne.layers.helper.get_all_params( self._model._value_function) self._givens_value = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._NotFallen: self._NotFallen_shared # self._model.getActionSymbolicVariable(): self._actions_shared, } 
self._value_regularization = ( self._critic_regularization_weight * lasagne.regularization.regularize_network_params( self._model._value_function, lasagne.regularization.l2)) self._value_grad = T.grad( self._v_loss + self._value_regularization, self._params_value) print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_value = lasagne.updates.adam( self._value_grad, self._params_value, self._critic_learning_rate, beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon) ## TD update DPG.compile(self)
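# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the DPG class above): that class feeds
# gradients with respect to the actor's output (typically dQ/da from the
# critic) into the actor parameters through `known_grads`.  The numpy toy
# below shows the same chain rule, dQ/dtheta = dQ/da * da/dtheta evaluated at
# a = pi(s), for a linear actor a = W s and a quadratic critic
# Q(s, a) = -||a - a_star||^2.  All names here are hypothetical and the model
# is deliberately tiny.
import numpy as np


def dpg_actor_step_sketch(W_actor, s, a_star, lr=0.05):
    """One deterministic-policy-gradient ascent step for a linear actor."""
    a = W_actor.dot(s)                  # actor output a = pi(s)
    dQ_da = -2.0 * (a - a_star)         # critic gradient w.r.t. the action
    dQ_dW = np.outer(dQ_da, s)          # chain rule: dQ/dW = dQ/da * da/dW
    return W_actor + lr * dQ_dW         # step up the critic's action-gradient


if __name__ == "__main__":
    rng = np.random.RandomState(0)
    W = rng.randn(2, 3) * 0.1
    s = np.array([1.0, 0.5, -0.5])
    a_star = np.array([0.5, -0.25])     # action the critic prefers in state s
    for _ in range(500):
        W = dpg_actor_step_sketch(W, s, a_star)
    print(np.allclose(W.dot(s), a_star, atol=1e-4))   # actor moved toward a_star
# ---------------------------------------------------------------------------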
def __init__(self, num_actions): # remember parameters self.num_actions = num_actions self.batch_size = BATCH_SIZE self.discount_rate = DISCOUNT_RATE self.history_length = HISTORY_LENGTH self.screen_dim = DIMS self.img_height = SCREEN_HEIGHT self.img_width = SCREEN_WIDTH self.clip_error = CLIP_ERROR self.input_color_scale = COLOR_SCALE self.target_steps = TARGET_STEPS self.train_iterations = TRAIN_STEPS self.train_counter = 0 self.momentum = MOMENTUM self.update_rule = UPDATE_RULE self.learning_rate = LEARNING_RATE self.rms_decay = RMS_DECAY self.rms_epsilon = RMS_EPSILON self.rng = np.random.RandomState(RANDOM_SEED) # set seed lasagne.random.set_rng(self.rng) # prepare tensors once and reuse them states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') # terminals are bool for our case terminals = T.bcol('terminals') # create shared theano variables self.states_shared = theano.shared( np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width), dtype=theano.config.floatX)) # !broadcast ? self.rewards_shared = theano.shared( np.zeros((self.batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( #np.zeros((self.batch_size, 1), dtype='int32'), np.zeros((self.batch_size, 1), dtype='int8'), broadcastable=(False, True)) # can add multiple nets here self.l_primary = self.build_network() if self.target_steps > 0: self.l_secondary = self.build_network() self.copy_to_secondary() """ # input scale i.e. division can be applied to input directly also to normalize """ # define output symbols q_vals = lasagne.layers.get_output(self.l_primary, states / self.input_color_scale) if self.target_steps > 0: q_vals_secondary = lasagne.layers.get_output(self.l_secondary, next_states / self.input_color_scale) else: # why this ? q_vals_secondary = lasagne.layers.get_output(self.l_primary, next_states / self.input_color_scale) q_vals_secondary = theano.gradient.disconnected_grad(q_vals_secondary) # target = r + max target = (rewards + (T.ones_like(terminals) - terminals) * self.discount_rate * T.max(q_vals_secondary, axis=1, keepdims=True)) """ # check what this does """ diff = target - q_vals[T.arange(self.batch_size), actions.reshape((-1,))].reshape((-1, 1)) # print shape ? if self.clip_error > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
        quadratic_part = T.minimum(abs(diff), self.clip_error)
        linear_part = abs(diff) - quadratic_part
        loss = 0.5 * quadratic_part ** 2 + self.clip_error * linear_part
    else:
        loss = 0.5 * diff ** 2
    loss = T.sum(loss)

    params = lasagne.layers.helper.get_all_params(self.l_primary)
    givens = {
        states: self.states_shared,
        next_states: self.next_states_shared,
        rewards: self.rewards_shared,
        actions: self.actions_shared,
        terminals: self.terminals_shared
    }

    g_time = time.time()
    logger.info("graph compiling")

    if self.update_rule == 'deepmind_rmsprop':
        updates = deepmind_rmsprop(loss, params, self.learning_rate,
                                   self.rms_decay, self.rms_epsilon)
    elif self.update_rule == 'rmsprop':
        updates = lasagne.updates.rmsprop(loss, params, self.learning_rate,
                                          self.rms_decay, self.rms_epsilon)
    else:
        raise ValueError("Unrecognized update: {}".format(self.update_rule))

    if self.momentum > 0:
        updates = lasagne.updates.apply_momentum(updates, None, self.momentum)

    self._train = theano.function([], [loss, q_vals], updates=updates,
                                  givens=givens)
    self._q_vals = theano.function([], q_vals,
                                   givens={states: self.states_shared})

    logger.info("Theano Graph Compiled !! %f", time.time() - g_time)
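# ---------------------------------------------------------------------------
# Illustrative sketch of the clipped-error loss above (a Huber-style loss):
# it is quadratic for |diff| <= clip_error and linear beyond, so the gradient
# d loss / d diff is exactly the clipped diff, as the comment above explains.
# Plain numpy with hypothetical names, independent of the Theano graph.
import numpy as np


def clipped_td_loss(diff, clip_error=1.0):
    quadratic_part = np.minimum(np.abs(diff), clip_error)
    linear_part = np.abs(diff) - quadratic_part
    loss = 0.5 * quadratic_part ** 2 + clip_error * linear_part
    grad = np.clip(diff, -clip_error, clip_error)   # constant outside the clip bounds
    return loss, grad


if __name__ == "__main__":
    diffs = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
    loss, grad = clipped_td_loss(diffs, clip_error=1.0)
    print(loss)   # [2.5   0.125 0.    0.125 2.5  ]
    print(grad)   # [-1.  -0.5  0.   0.5  1. ]
# ---------------------------------------------------------------------------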
def __init__(self, n_actions, replay_memory, build_network, updates, screen_size, initial_weights_file=None): self.screen_width, self.screen_height = screen_size self.mood_q = None self.last_q = 0 self.n_parameter_updates = 0 self.alpha = 0.00025 # update frequency ? # gradient momentum ? 0.95 # squared gradient momentum ? 0.95 # min squared gradient ? 0.01 self.save_every_n_frames = 100000 # ~ once per hour self.final_exploration_frame = 1000000 self.replay_start_size = 50000 self.i_action = 0 self.state = None self.initial_epsilon = 1 self.final_epsilon = 0.1 self.epsilon = self.initial_epsilon self.gamma = 0.99 self.replay_memory = replay_memory self.log_frequency = 1 self.minibatch_size = 32 # self.replay_memory_size = 1000000 self.target_network_update_frequency = 10000 s0_var = T.tensor4("s0", dtype=theano.config.floatX) a0_var = T.bmatrix("a0") r0_var = T.wcol("r0") s1_var = T.tensor4("s1", dtype=theano.config.floatX) future_reward_indicator_var = T.bcol("future_reward_indicator") self.n_actions = n_actions self.a_lookup = np.eye(self.n_actions, dtype=np.int8) self.network = build_network(n_actions=self.n_actions, input_var=T.cast(s0_var, 'float32') / np.float32(256), screen_size=(self.screen_height, self.screen_width)) print("Compiling forward.") self.forward = theano.function([s0_var], lasagne.layers.get_output(self.network, deterministic=True)) self.network_stale = build_network(n_actions=self.n_actions, input_var=T.cast(s1_var, 'float32') / np.float32(256), screen_size=(self.screen_height, self.screen_width)) print("Compiling forward_stale.") self.forward_stale = theano.function([s1_var], lasagne.layers.get_output(self.network_stale, deterministic=True)) self._update_network_stale() out = lasagne.layers.get_output(self.network) out_stale = lasagne.layers.get_output(self.network_stale) self.loss, self.err, __y, __q = build_loss(out=out, out_stale=out_stale, a0_var=a0_var, r0_var=r0_var, future_reward_indicator_var=future_reward_indicator_var, gamma=self.gamma) params = lasagne.layers.get_all_params(self.network, trainable=True) print("Compiling train_fn.") self.train_fn = theano.function([s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var], [self.loss, self.err, T.transpose(__y), T.transpose(__q), out, out_stale], updates=updates(self.loss, params)) print("Compiling loss_fn.") self.loss_fn = theano.function([s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var], self.loss)
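# ---------------------------------------------------------------------------
# Illustrative sketch of the exploration schedule implied by the
# hyper-parameters above (epsilon annealed from initial_epsilon to
# final_epsilon by final_exploration_frame, with no annealing before
# replay_start_size frames).  This is only a plausible reading of those
# settings, not the class's own method; the function name is hypothetical.
def epsilon_at_frame(frame, initial_epsilon=1.0, final_epsilon=0.1,
                     replay_start_size=50000,
                     final_exploration_frame=1000000):
    if frame < replay_start_size:
        return initial_epsilon          # pure exploration while the replay memory fills
    progress = min(1.0, (frame - replay_start_size) /
                   float(final_exploration_frame - replay_start_size))
    return initial_epsilon + progress * (final_epsilon - initial_epsilon)


if __name__ == "__main__":
    for frame in (0, 50000, 500000, 1000000, 2000000):
        print(frame, round(epsilon_at_frame(frame), 3))
# ---------------------------------------------------------------------------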
def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_): super(CACLA,self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_) # create a small convolutional neural network self._Fallen = T.bcol("Fallen") self._Fallen.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype('int8')) self._fallen_shared = theano.shared( np.zeros((self._batch_size, 1), dtype='int8'), broadcastable=(False, True)) self._usingDropout = True """ self._target_shared = theano.shared( np.zeros((self._batch_size, 1), dtype='float64'), broadcastable=(False, True)) """ self._critic_regularization_weight = self.getSettings()["critic_regularization_weight"] self._critic_learning_rate = self.getSettings()["critic_learning_rate"] # primary network self._model = model # Target network self._modelTarget = copy.deepcopy(model) self._q_valsA = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsA_drop = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsTargetNextState = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsActA = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsActTarget = lasagne.layers.get_output(self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsActA_drop = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_func = self._q_valsA self._q_funcTarget = self._q_valsTarget self._q_func_drop = self._q_valsA_drop self._q_funcTarget_drop = self._q_valsTarget_drop self._q_funcAct = self._q_valsActA self._q_funcAct_drop = self._q_valsActA_drop # self._q_funcAct = theano.function(inputs=[self._model.getStateSymbolicVariable()], outputs=self._q_valsActA, allow_input_downcast=True) # self._target = (self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState )) * theano.tensor.maximum(1.0, theano.tensor.ceil(self._model.getRewardSymbolicVariable())) # Did not understand how the maximum was working # self._target = (self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState )) * theano.tensor.ceil(self._model.getRewardSymbolicVariable()) ## Don't need to use dropout for the target network self._target = (self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState )) * self._Fallen # self._target = self._model.getTargetSymbolicVariable() ## When there is no dropout in the network it will have no affect here self._diff = self._target - self._q_func self._diff_drop = self._target - self._q_func_drop loss = 0.5 * self._diff ** 2 self._loss = T.mean(loss) # self._loss_drop = T.mean(0.5 * (self._diff_drop ** 2)) self._params = lasagne.layers.helper.get_all_params(self._model.getCriticNetwork()) self._actionParams = lasagne.layers.helper.get_all_params(self._model.getActorNetwork()) self._givens_ = { 
self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), self._model.getRewardSymbolicVariable(): self._model.getRewards(), self._Fallen: self._fallen_shared # self._model.getActionSymbolicVariable(): self._actions_shared, } self._actGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._next_states_shared, # self._model.getRewardSymbolicVariable(): self._rewards_shared, self._model.getActionSymbolicVariable(): self._model.getActions() } self._critic_regularization = (self._critic_regularization_weight * lasagne.regularization.regularize_network_params( self._model.getCriticNetwork(), lasagne.regularization.l2)) self._actor_regularization = (self._regularization_weight * lasagne.regularization.regularize_network_params( self._model.getActorNetwork(), lasagne.regularization.l2)) # SGD update # self._updates_ = lasagne.updates.rmsprop(self._loss + self._critic_regularization, self._params, # self._learning_rate, self._rho, self._rms_epsilon) # TD update self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) + self._critic_regularization, self._params, self._critic_learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon) # actDiff1 = (self._model.getActionSymbolicVariable() - self._q_valsActTarget) #TODO is this correct? # actDiff = (actDiff1 - (self._model.getActionSymbolicVariable() - self._q_valsActA)) # self._actDiff = ((self._model.getActionSymbolicVariable() - self._q_valsActA)) # Target network does not work well here? self._actDiff = ((self._model.getActionSymbolicVariable() - self._q_valsActA_drop)) # Target network does not work well here? # self._actLoss = 0.5 * (self._actDiff ** 2) ## Should produce a single column vector or costs for each sample in the batch self._actLoss_ = T.mean(T.pow(self._actDiff, 2),axis=1) # self._actLoss = T.sum(self._actLoss)/float(self._batch_size) self._actLoss = T.mean(self._actLoss_) # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2)) ## Computes the distance between actions weighted by the distances between the states that result in those actions """ state_sum = T.mean(T.pow(self._model.getStateSymbolicVariable(),2), axis=1) Distance = ((((state_sum + T.reshape(state_sum, (1,-1)).T) - 2*T.dot(self._model.getStateSymbolicVariable(), self._model.getStateSymbolicVariable().T)))) action_sum = T.mean(T.pow(self._q_valsActA_drop,2), axis=1) Distance_action = ((((action_sum + T.reshape(action_sum, (1,-1)).T) - 2*T.dot(self._q_valsActA_drop, self._q_valsActA_drop.T)))) weighted_dist = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(Distance, Distance_action) self._weighted_mean_dist = T.mean(weighted_dist, axis=1) """ ## Entropy from A3C, make sure network is not producing same action for everything.. 
# self.entropy = -T.mean(T.sum(self._q_valsActA_drop, axis=0)) # self._weighted_entropy = -T.mean(self._weighted_mean_dist) self._weighted_entropy = 0 self._actionUpdates = lasagne.updates.rmsprop(self._actLoss + self._actor_regularization + (0.00001 * self._weighted_entropy), self._actionParams, self._learning_rate , self._rho, self._rms_epsilon) # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) + # (self._regularization_weight * lasagne.regularization.regularize_network_params( # self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams, # self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon) self._givens_grad = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._model.getActionSymbolicVariable(): self._actions_shared, } ## Bellman error self._bellman = self._target - self._q_funcTarget CACLA.compile(self)
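# ---------------------------------------------------------------------------
# Illustrative sketch of the CACLA-style actor rule that the actor loss above
# supports: move the policy toward the executed action, but only for samples
# whose TD error is positive.  The positive-TD filtering is not visible in
# the Theano graph above and is assumed to happen in the surrounding training
# code; the linear "actor" and all names below are hypothetical.
import numpy as np


def cacla_actor_update_sketch(W_actor, states, actions, td_errors, lr=0.05):
    """states: (n, d_s), actions: (n, d_a), td_errors: (n,)."""
    keep = td_errors > 0.0                    # CACLA: keep only positive TD errors
    if not np.any(keep):
        return W_actor
    s, a = states[keep], actions[keep]
    pred = s.dot(W_actor)                     # linear stand-in for the actor network
    grad = s.T.dot(pred - a) / len(s)         # d/dW of mean 0.5 * ||pred - a||^2
    return W_actor - lr * grad


if __name__ == "__main__":
    rng = np.random.RandomState(1)
    W = np.zeros((4, 2))
    states = rng.randn(32, 4)
    actions = states.dot(np.ones((4, 2))) + 0.1 * rng.randn(32, 2)
    td_errors = rng.randn(32)
    for _ in range(1000):
        W = cacla_actor_update_sketch(W, states, actions, td_errors)
    print(W.round(2))    # close to all-ones: the actor imitated the kept actions
# ---------------------------------------------------------------------------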
def __init__(self, input_size, output_size, build_network=simple_network2, discount=0.99, learningRate=0.001, frozen_network_update_time=1000): print "Initializing new Q network" self.input_size = input_size self.output_size = output_size self.discount = discount self.learningRate = learningRate self.frozen_network_update_time = frozen_network_update_time self.frozen_timer = 0 self.epoch = 0 # logging variables self.log = { "batchMeanQValue": [], "batchMeanTargetQValue": [], "cost": [], 'performance': [], 'epoch': [] } # symbolic inputs sym_state = T.tensor4('state') #Batchsize, channels, X, Y sym_action = T.icol('action') sym_reward = T.col('reward') sym_isDone = T.bcol('isDone') sym_nextState = T.tensor4('nextState') # networks self.network = build_network(input_size, output_size) self.frozen_network = build_network(input_size, output_size) self.update_frozen_network() # forward pass print "Compiling forward passes" self.forward_pass = theano.function([sym_state], lasagne.layers.get_output( self.network, sym_state, deterministic=True)) self.frozen_forward_pass = theano.function([sym_state], lasagne.layers.get_output( self.frozen_network, sym_state, deterministic=True)) #clipped_reward = T.clip(sym_reward,-1,1) #cost function definition cost, error, q_action, q_target = self.build_cost_function( sym_state, sym_action, sym_reward, sym_isDone, sym_nextState) params = lasagne.layers.get_all_params(self.network, trainable=True) update_function = lasagne.updates.rmsprop( cost, params, learning_rate=self.learningRate) # training function print "Compiling training function" self._train = theano.function( [sym_state, sym_action, sym_reward, sym_isDone, sym_nextState], [cost, error, q_action, q_target], updates=update_function)
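# ---------------------------------------------------------------------------
# Illustrative sketch of the frozen-network bookkeeping used above: the live
# parameters are copied into the frozen (target) copy every
# frozen_network_update_time training steps, so the Q-targets only change
# periodically.  Plain Python/numpy stand-ins for the Lasagne parameter
# lists; the class and method names below are hypothetical.
import numpy as np


class FrozenCopySketch(object):
    def __init__(self, params, update_every=1000):
        self.params = params                            # live parameters (list of arrays)
        self.frozen = [p.copy() for p in params]        # target copy
        self.update_every = update_every
        self.counter = 0

    def after_train_step(self):
        self.counter += 1
        if self.counter >= self.update_every:
            # hard copy, analogous to update_frozen_network() above
            self.frozen = [p.copy() for p in self.params]
            self.counter = 0


if __name__ == "__main__":
    params = [np.zeros(3)]
    sync = FrozenCopySketch(params, update_every=3)
    for step in range(7):
        params[0] += 1.0                # pretend a gradient step changed the weights
        sync.after_train_step()
        print(step, params[0][0], sync.frozen[0][0])
# ---------------------------------------------------------------------------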
def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_): super(MBPG, self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_) # scale = (bounds[1][i]-bounds[0][i])/2.0 # create a small convolutional neural network # self._action_std_scaling = (self._action_bounds[1] - self._action_bounds[0]) / 2.0 self._NotFallen = T.bcol("Not_Fallen") ## because float64 <= float32 * int32, need to use int16 or int8 self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1), dtype=np.dtype('int8')) self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1), dtype='int8'), broadcastable=(False, True)) self._advantage = T.col("Advantage") self._advantage.tag.test_value = np.zeros( (self._batch_size, 1), dtype=np.dtype(self.getSettings()['float_type'])) self._advantage_shared = theano.shared(np.zeros( (self._batch_size, 1), dtype=self.getSettings()['float_type']), broadcastable=(False, True)) self._dyna_target = T.col("DYNA_Target") self._dyna_target.tag.test_value = np.zeros( (self._batch_size, 1), dtype=np.dtype(self.getSettings()['float_type'])) self._dyna_target_shared = theano.shared(np.zeros( (self._batch_size, 1), dtype=self.getSettings()['float_type']), broadcastable=(False, True)) self._KL_Weight = T.scalar("KL_Weight") self._KL_Weight.tag.test_value = np.zeros( (1), dtype=np.dtype(self.getSettings()['float_type']))[0] self._kl_weight_shared = theano.shared( np.ones((1), dtype=self.getSettings()['float_type'])[0]) self._kl_weight_shared.set_value( self.getSettings()['previous_value_regularization_weight']) """ self._target_shared = theano.shared( np.zeros((self._batch_size, 1), dtype='float64'), broadcastable=(False, True)) """ self._critic_regularization_weight = self.getSettings( )["critic_regularization_weight"] self._critic_learning_rate = self.getSettings()["critic_learning_rate"] # primary network self._model = model # Target network self._modelTarget = copy.deepcopy(model) self._q_valsA = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsA_drop = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsNextState = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) self._q_valsTargetNextState = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) self._q_valsTarget = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsTarget_drop = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsActA = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)[:, :self._action_length] # self._q_valsActA = scale_action(self._q_valsActA, self._action_bounds) self._q_valsActASTD = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)[:, self._action_length:] ## prevent value from being 0 """ if ( 'use_fixed_std' in self.getSettings() and ( self.getSettings()['use_fixed_std'])): self._q_valsActASTD = ( T.ones_like(self._q_valsActA)) * self.getSettings()['exploration_rate'] # self._q_valsActASTD = ( T.ones_like(self._q_valsActA)) * 
self.getSettings()['exploration_rate'] else: """ self._q_valsActASTD = ((self._q_valsActASTD) * self.getSettings()['exploration_rate']) + 2e-2 self._q_valsActTarget = lasagne.layers.get_output( self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable())[:, :self._action_length] # self._q_valsActTarget = scale_action(self._q_valsActTarget, self._action_bounds) self._q_valsActTargetSTD = lasagne.layers.get_output( self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable())[:, self._action_length:] """ if ( 'use_fixed_std' in self.getSettings() and ( self.getSettings()['use_fixed_std'])): self._q_valsActTargetSTD = (T.ones_like(self._q_valsActTarget)) * self.getSettings()['exploration_rate'] # self._q_valsActTargetSTD = (self._action_std_scaling * T.ones_like(self._q_valsActTarget)) * self.getSettings()['exploration_rate'] else: """ self._q_valsActTargetSTD = ( (self._q_valsActTargetSTD) * self.getSettings()['exploration_rate']) + 2e-2 self._q_valsActA_drop = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_func = self._q_valsA self._q_funcTarget = self._q_valsTarget self._q_func_drop = self._q_valsA_drop self._q_funcTarget_drop = self._q_valsTarget_drop self._q_funcAct = self._q_valsActA self._q_funcAct_drop = self._q_valsActA_drop # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._NotFallen # self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsTargetNextState )), self._NotFallen) + (self._NotFallen - 1) self._target = self._model.getRewardSymbolicVariable() + ( self._discount_factor * self._q_valsTargetNextState) self._diff = self._target - self._q_func self._diff_drop = self._target - self._q_func_drop # loss = 0.5 * self._diff ** 2 loss = 0.5 * T.pow(self._diff, 2) self._loss = T.mean(loss) self._loss_drop = T.mean(0.5 * self._diff_drop**2) self._params = lasagne.layers.helper.get_all_params( self._model.getCriticNetwork()) self._actionParams = lasagne.layers.helper.get_all_params( self._model.getActorNetwork()) self._givens_ = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._NotFallen: self._NotFallen_shared # self._model.getActionSymbolicVariable(): self._actions_shared, } self._actGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), self._model.getActionSymbolicVariable(): self._model.getActions(), # self._NotFallen: self._NotFallen_shared, self._advantage: self._advantage_shared, # self._KL_Weight: self._kl_weight_shared } self._allGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), self._model.getRewardSymbolicVariable(): self._model.getRewards(), self._model.getActionSymbolicVariable(): self._model.getActions(), # self._NotFallen: self._NotFallen_shared, self._advantage: self._advantage_shared, # self._KL_Weight: self._kl_weight_shared } self._critic_regularization = ( self._critic_regularization_weight * 
lasagne.regularization.regularize_network_params( self._model.getCriticNetwork(), lasagne.regularization.l2)) self._actor_regularization = ( self._regularization_weight * lasagne.regularization.regularize_network_params( self._model.getActorNetwork(), lasagne.regularization.l2)) self._kl_firstfixed = T.mean( kl(self._q_valsActTarget, self._q_valsActTargetSTD, self._q_valsActA, self._q_valsActASTD, self._action_length)) # self._actor_regularization = (( self.getSettings()['previous_value_regularization_weight']) * self._kl_firstfixed ) # self._actor_regularization = (( self._KL_Weight ) * self._kl_firstfixed ) + (10*(self._kl_firstfixed>self.getSettings()['kl_divergence_threshold'])* # T.square(self._kl_firstfixed-self.getSettings()['kl_divergence_threshold'])) self._actor_entropy = 0.5 * T.mean((2 * np.pi * self._q_valsActASTD)) # SGD update # self._updates_ = lasagne.updates.rmsprop(self._loss + (self._regularization_weight * lasagne.regularization.regularize_network_params( # self._model.getCriticNetwork(), lasagne.regularization.l2)), self._params, self._learning_rate, self._rho, # self._rms_epsilon) self._value_grad = T.grad(self._loss + self._critic_regularization, self._params) ## Clipping the max gradient """ for x in range(len(self._value_grad)): self._value_grad[x] = T.clip(self._value_grad[x] , -0.1, 0.1) """ if (self.getSettings()['optimizer'] == 'rmsprop'): print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.rmsprop(self._value_grad, self._params, self._learning_rate, self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.momentum( self._value_grad, self._params, self._critic_learning_rate, momentum=self._rho) elif (self.getSettings()['optimizer'] == 'adam'): print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.adam(self._value_grad, self._params, self._critic_learning_rate, beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'adagrad'): print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.adagrad( self._value_grad, self._params, self._critic_learning_rate, epsilon=self._rms_epsilon) else: print("Unknown optimization method: ", self.getSettings()['optimizer']) sys.exit(-1) ## Need to perform an element wise operation or replicate _diff for this to work properly. # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA), # theano.tensor.tile((self._advantage * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here? 
    ## advantage = Q(a,s) - V(s) = (r + gamma*V(s')) - V(s)
    # self._advantage = (((self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState)) * self._NotFallen)) - self._q_func
    self._Advantage = self._advantage  # * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards
    # self._log_prob = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActA, self._q_valsActASTD, self._action_length)
    # self._log_prob_target = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActTarget, self._q_valsActTargetSTD, self._action_length)
    ### Only change the std
    self._prob = likelihood(self._model.getActionSymbolicVariable(),
                            self._q_valsActTarget, self._q_valsActASTD,
                            self._action_length)
    self._prob_target = likelihood(self._model.getActionSymbolicVariable(),
                                   self._q_valsActTarget,
                                   self._q_valsActTargetSTD,
                                   self._action_length)
    ## This does the sum already
    self._r = (self._prob / self._prob_target)
    self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
        (self._r), self._Advantage)
    ppo_epsilon = self.getSettings()['kl_divergence_threshold']
    self._actLoss_2 = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
        theano.tensor.clip(self._r, 1.0 - ppo_epsilon, 1 + ppo_epsilon),
        self._Advantage)
    self._actLoss_ = theano.tensor.minimum((self._actLoss_),
                                           (self._actLoss_2))
    self._actLoss = (-1.0 * (T.mean(self._actLoss_) +
                             (self.getSettings()['std_entropy_weight'] *
                              self._actor_entropy))) + self._actor_regularization
    self._policy_grad = T.grad(self._actLoss, self._actionParams)
    self._policy_grad = lasagne.updates.total_norm_constraint(
        self._policy_grad, 5)
    if (self.getSettings()['optimizer'] == 'rmsprop'):
        self._actionUpdates = lasagne.updates.rmsprop(
            self._policy_grad, self._actionParams, self._learning_rate,
            self._rho, self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'momentum'):
        self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       momentum=self._rho)
    elif (self.getSettings()['optimizer'] == 'adam'):
        self._actionUpdates = lasagne.updates.adam(self._policy_grad,
                                                   self._actionParams,
                                                   self._learning_rate,
                                                   beta1=0.9,
                                                   beta2=0.999,
                                                   epsilon=1e-08)
    else:
        print("Unknown optimization method: ",
              self.getSettings()['optimizer'])

    if (('train_state_encoding' in self.getSettings())
            and (self.getSettings()['train_state_encoding'])):
        self._encoded_state = lasagne.layers.get_output(
            self._model.getEncodeNet(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._encoding_loss = T.mean(
            T.pow(self._encoded_state - self._model.getStates(), 2))
        self._full_loss = (
            self._loss + self._critic_regularization +
            (-1.0 * self.getSettings()['policy_loss_weight'] *
             (T.mean(self._actLoss_) +
              (self.getSettings()['std_entropy_weight'] *
               self._actor_entropy))) +
            (self._actor_regularization + self._encoding_loss))
    else:
        self._full_loss = (
            self._loss + self._critic_regularization +
            (-1.0 * self.getSettings()['policy_loss_weight'] *
             (T.mean(self._actLoss_) +
              (self.getSettings()['std_entropy_weight'] *
               self._actor_entropy))) + self._actor_regularization)

    if (('train_state_encoding' in self.getSettings())
            and (self.getSettings()['train_state_encoding'])):
        self._encodeParams = lasagne.layers.helper.get_all_params(
            self._model.getEncodeNet())
        self._all_Params = self._params + self._actionParams + self._encodeParams
    else:
        # self._all_Params = self._params + self._actionParams[-3:]
        self._all_Params = self._params + self._actionParams
    print("Num params: ", len(self._all_Params), " params: ",
len(self._params), " act params: ", len(self._actionParams)) self._both_grad = T.grad(self._full_loss, self._all_Params) self._both_grad = lasagne.updates.total_norm_constraint( self._both_grad, 5) if (self.getSettings()['optimizer'] == 'rmsprop'): self._collectiveUpdates = lasagne.updates.rmsprop( self._both_grad, self._all_Params, self._learning_rate, self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): self._collectiveUpdates = lasagne.updates.momentum( self._both_grad, self._all_Params, self._learning_rate, momentum=self._rho) elif (self.getSettings()['optimizer'] == 'adam'): self._collectiveUpdates = lasagne.updates.adam(self._both_grad, self._all_Params, self._learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08) else: print("Unknown optimization method: ", self.getSettings()['optimizer']) # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) + # (self._regularization_weight * lasagne.regularization.regularize_network_params( # self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams, # self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon) self._givens_grad = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._model.getActionSymbolicVariable(): self._actions_shared, } ### _q_valsA because the predicted state is stored in self._model.getStateSymbolicVariable() self._diff_dyna = self._dyna_target - self._q_valsNextState # loss = 0.5 * self._diff ** 2 loss = 0.5 * T.pow(self._diff_dyna, 2) self._loss_dyna = T.mean(loss) self._dyna_grad = T.grad(self._loss_dyna + self._critic_regularization, self._params) self._givens_dyna = { # self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._NotFallen: self._NotFallen_shared # self._model.getActionSymbolicVariable(): self._actions_shared, self._dyna_target: self._dyna_target_shared } if (self.getSettings()['optimizer'] == 'rmsprop'): self._DYNAUpdates = lasagne.updates.rmsprop( self._dyna_grad, self._params, self._learning_rate, self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): self._DYNAUpdates = lasagne.updates.momentum(self._dyna_grad, self._params, self._learning_rate, momentum=self._rho) elif (self.getSettings()['optimizer'] == 'adam'): self._DYNAUpdates = lasagne.updates.adam(self._dyna_grad, self._params, self._learning_rate, beta1=0.9, beta2=0.999, epsilon=self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'adagrad'): self._DYNAUpdates = lasagne.updates.adagrad( self._dyna_grad, self._params, self._learning_rate, epsilon=self._rms_epsilon) else: print("Unknown optimization method: ", self.getSettings()['optimizer']) ## Bellman error self._bellman = self._target - self._q_funcTarget ## Some cool stuff to backprop action gradients self._action_grad = T.matrix("Action_Grad") self._action_grad.tag.test_value = np.zeros( (self._batch_size, self._action_length), dtype=np.dtype(self.getSettings()['float_type'])) self._action_grad_shared = theano.shared( np.zeros((self._batch_size, self._action_length), dtype=self.getSettings()['float_type'])) self._action_mean_grads = T.grad( cost=None, wrt=self._actionParams, known_grads={self._q_valsActA: 
self._action_grad_shared}), # print ("Action grads: ", self._action_mean_grads[0]) ## When passing in gradients it needs to be a proper list of gradient expressions self._action_mean_grads = list(self._action_mean_grads[0]) # print ("isinstance(self._action_mean_grads, list): ", isinstance(self._action_mean_grads, list)) # print ("Action grads: ", self._action_mean_grads) self._actionGRADUpdates = lasagne.updates.adagrad( self._action_mean_grads, self._actionParams, self._learning_rate, epsilon=self._rms_epsilon) self._actGradGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._model.getActionSymbolicVariable(): self._model.getActions(), # self._NotFallen: self._NotFallen_shared, # self._advantage: self._advantage_shared, # self._KL_Weight: self._kl_weight_shared } """ self._get_grad = theano.function([], outputs=T.grad(cost=None, wrt=[self._model._actionInputVar] + self._params, known_grads={self._forward: self._fd_grad_target_shared}), allow_input_downcast=True, givens= { self._model.getStateSymbolicVariable() : self._model.getStates(), # self._model.getResultStateSymbolicVariable() : self._model.getResultStates(), self._model.getActionSymbolicVariable(): self._model.getActions(), # self._fd_grad_target : self._fd_grad_target_shared }) """ MBPG.compile(self)
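# ---------------------------------------------------------------------------
# Illustrative sketch of the clipped surrogate objective built above:
# r = prob / prob_target, and the per-sample objective is
# min(r * A, clip(r, 1 - eps, 1 + eps) * A), whose mean the class maximizes
# (its loss negates it).  Plain numpy, detached from the Theano graph; the
# names below are hypothetical.
import numpy as np


def clipped_surrogate(prob_new, prob_old, advantage, ppo_epsilon=0.2):
    r = prob_new / prob_old
    unclipped = r * advantage
    clipped = np.clip(r, 1.0 - ppo_epsilon, 1.0 + ppo_epsilon) * advantage
    return np.minimum(unclipped, clipped)   # objective to maximize (mean over the batch)


if __name__ == "__main__":
    prob_old = np.array([0.2, 0.2, 0.2, 0.2])
    prob_new = np.array([0.1, 0.2, 0.4, 0.8])       # ratios 0.5, 1.0, 2.0, 4.0
    advantage = np.array([1.0, 1.0, 1.0, -1.0])
    print(clipped_surrogate(prob_new, prob_old, advantage))
    # [ 0.5  1.   1.2 -4. ]: gains are clipped at 1 + eps, losses are not.
# ---------------------------------------------------------------------------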
def fit(self, data, sample_store=10000000, store_type='gpu'):
    '''
    Trains the network.

    Parameters
    --------
    data : pandas.DataFrame
        Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
    sample_store : int
        If additional negative samples are used (n_sample > 0), GPU utilization can be made more efficient by precomputing a large batch of negative samples (and recomputing it when necessary). This parameter regulates the size of that precomputed ID set. Its value is the maximum number of int values (IDs) to be stored. Precomputed IDs are stored in RAM. For the most efficient computation, a balance must be found between generating few samples and interrupting GPU computation frequently but briefly, and generating many samples and interrupting it rarely but for longer.
    store_type : 'cpu', 'gpu'
        Where to store the negative sample buffer (sample store). The cpu mode is legacy and is no longer supported.
    '''
    self.predict = None
    self.error_during_train = False
    itemids = data[self.item_key].unique()
    self.n_items = len(itemids)
    self.itemidmap = pd.Series(data=np.arange(self.n_items), index=itemids,
                               name='ItemIdx')
    data['ItemIdx'] = self.itemidmap[data[self.item_key].values].values
    offset_sessions = self.init(data)
    pop = data.groupby(self.item_key).size()
    if self.logq:
        self.P0 = theano.shared(
            pop[self.itemidmap.index.values].values.astype(
                theano.config.floatX),
            name='P0', borrow=False)
    if self.n_sample:
        pop = pop[self.itemidmap.index.values].values**self.sample_alpha
        pop = pop.cumsum() / pop.sum()
        pop[-1] = 1
        if sample_store:
            generate_length = sample_store // self.n_sample
            if generate_length <= 1:
                sample_store = 0
                print('No example store was used')
            elif store_type == 'cpu':
                neg_samples = self.generate_neg_samples(pop, generate_length)
                sample_pointer = 0
                print('Created sample store with {} batches of samples (type=CPU)'
                      .format(generate_length))
            elif store_type == 'gpu':
                P = theano.shared(pop.astype(theano.config.floatX), name='P')
                self.ST = theano.shared(
                    np.zeros((generate_length, self.n_sample), dtype='int64'))
                self.STI = theano.shared(np.asarray(0, dtype='int64'))
                X = mrng.uniform((generate_length * self.n_sample, ))
                updates_st = OrderedDict()
                updates_st[self.ST] = gpu_searchsorted(
                    P, X, dtype_int64=True).reshape(
                        (generate_length, self.n_sample))
                updates_st[self.STI] = np.asarray(0, dtype='int64')
                generate_samples = theano.function([], updates=updates_st)
                generate_samples()
                sample_pointer = 0
                print('Created sample store with {} batches of samples (type=GPU)'
                      .format(generate_length))
            else:
                print('Invalid store type {}'.format(store_type))
                raise NotImplementedError
        else:
            print('No example store was used')
    X = T.ivector(name='X')
    Y = T.ivector(name='Y')
    M = T.iscalar(name='M')
    R = T.bcol(name='R')
    H_new, Y_pred, sparams, full_params, sidxs = self.model(
        X, self.H, M, R, Y, self.dropout_p_hidden, self.dropout_p_embed)
    cost = self.loss_function(Y_pred, M) / self.batch_size
    params = [
        self.Wx if self.embedding or self.constrained_embedding else self.Wx[1:],
        self.Wh, self.Wrz, self.Bh
    ]
    updates = self.RMSprop(cost, params, full_params, sparams, sidxs)
    for i in range(len(self.H)):
        updates[self.H[i]] = H_new[i]
    if hasattr(self, 'STI'):
        updates[self.STI] = self.STI + 1
train_function = function(inputs=[X, Y, M, R], outputs=cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') base_order = np.argsort( data.groupby(self.session_key)[self.time_key].min().values ) if self.time_sort else np.arange(len(offset_sessions) - 1) data_items = data.ItemIdx.values for epoch in range(self.n_epochs): t0 = time.time() for i in range(len(self.layers)): self.H[i].set_value(np.zeros((self.batch_size, self.layers[i]), dtype=theano.config.floatX), borrow=True) c = [] cc = [] session_idx_arr = np.random.permutation( len(offset_sessions) - 1) if self.train_random_order else base_order iters = np.arange(self.batch_size) maxiter = iters.max() start = offset_sessions[session_idx_arr[iters]] end = offset_sessions[session_idx_arr[iters] + 1] finished = False while not finished: minlen = (end - start).min() out_idx = data_items[start] for i in range(minlen - 1): in_idx = out_idx out_idx = data_items[start + i + 1] if self.n_sample and store_type == 'cpu': if sample_store: if sample_pointer == generate_length: neg_samples = self.generate_neg_samples( pop, generate_length) sample_pointer = 0 sample = neg_samples[sample_pointer] sample_pointer += 1 else: sample = self.generate_neg_samples(pop, 1) y = np.hstack([out_idx, sample]) else: y = out_idx if self.n_sample: if sample_pointer == generate_length: generate_samples() sample_pointer = 0 sample_pointer += 1 reset = (start + i + 1 == end - 1) cost = train_function(in_idx, y, len(iters), reset.reshape(len(reset), 1)) c.append(cost) cc.append(len(iters)) if np.isnan(cost): print(str(epoch) + ': NaN error!') self.error_during_train = True return start = start + minlen - 1 finished_mask = (end - start <= 1) n_finished = finished_mask.sum() iters[finished_mask] = maxiter + np.arange(1, n_finished + 1) maxiter += n_finished valid_mask = (iters < len(offset_sessions) - 1) n_valid = valid_mask.sum() if (n_valid == 0) or (n_valid < 2 and self.n_sample == 0): finished = True break mask = finished_mask & valid_mask sessions = session_idx_arr[iters[mask]] start[mask] = offset_sessions[sessions] end[mask] = offset_sessions[sessions + 1] iters = iters[valid_mask] start = start[valid_mask] end = end[valid_mask] if n_valid < len(valid_mask): for i in range(len(self.H)): tmp = self.H[i].get_value(borrow=True) tmp = tmp[valid_mask] self.H[i].set_value(tmp, borrow=True) c = np.array(c) cc = np.array(cc) avgc = np.sum(c * cc) / np.sum(cc) if np.isnan(avgc): print('Epoch {}: NaN error!'.format(str(epoch))) self.error_during_train = True return t1 = time.time() dt = t1 - t0 print( 'Epoch{} --> loss: {:.6f} \t({:.2f}s) \t[{:.2f} mb/s | {:.0f} e/s]' .format(epoch + 1, avgc, dt, len(c) / dt, np.sum(cc) / dt)) if hasattr(self, 'ST'): del (self.ST) del (self.STI)
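# ---------------------------------------------------------------------------
# Illustrative sketch of the negative-sample store logic above: item
# popularity is raised to sample_alpha, turned into a cumulative
# distribution, and uniform random draws are mapped to item indices with a
# searchsorted, in batches that are regenerated when exhausted.  This is a
# CPU/numpy stand-in for the GPU path; the names below are hypothetical.
import numpy as np


def make_sample_store(pop_counts, sample_alpha, n_sample, sample_store, rng):
    pop = pop_counts.astype(np.float64) ** sample_alpha
    cdf = pop.cumsum() / pop.sum()
    cdf[-1] = 1.0                                   # guard against rounding
    generate_length = sample_store // n_sample

    def generate():
        u = rng.random_sample(generate_length * n_sample)
        return np.searchsorted(cdf, u).reshape(generate_length, n_sample)

    return generate


if __name__ == "__main__":
    rng = np.random.RandomState(42)
    pop_counts = np.array([100, 10, 1, 1])          # item 0 is by far the most popular
    generate = make_sample_store(pop_counts, sample_alpha=0.75, n_sample=2,
                                 sample_store=20, rng=rng)
    batch = generate()                              # 10 rows of 2 negative samples each
    print(batch.shape, np.bincount(batch.ravel(), minlength=4))
# ---------------------------------------------------------------------------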
def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_): super(A_CACLA, self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_) # create a small convolutional neural network self._actor_buffer_states = [] self._actor_buffer_result_states = [] self._actor_buffer_actions = [] self._actor_buffer_rewards = [] self._actor_buffer_falls = [] self._actor_buffer_diff = [] self._NotFallen = T.bcol("Not_Fallen") ## because float64 <= float32 * int32, need to use int16 or int8 self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1), dtype=np.dtype('int8')) self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1), dtype='int8'), broadcastable=(False, True)) self._tmp_diff = T.col("Tmp_Diff") self._tmp_diff.tag.test_value = np.zeros( (self._batch_size, 1), dtype=np.dtype(self.getSettings()['float_type'])) self._tmp_diff_shared = theano.shared(np.zeros( (self._batch_size, 1), dtype=self.getSettings()['float_type']), broadcastable=(False, True)) self._dyna_target = T.col("DYNA_Target") self._dyna_target.tag.test_value = np.zeros( (self._batch_size, 1), dtype=np.dtype(self.getSettings()['float_type'])) self._dyna_target_shared = theano.shared(np.zeros( (self._batch_size, 1), dtype=self.getSettings()['float_type']), broadcastable=(False, True)) self._KL_Weight = T.scalar("KL_Weight") self._KL_Weight.tag.test_value = np.zeros( (1), dtype=np.dtype(self.getSettings()['float_type']))[0] self._kl_weight_shared = theano.shared( np.ones((1), dtype=self.getSettings()['float_type'])[0]) self._kl_weight_shared.set_value(1.0) """ self._target_shared = theano.shared( np.zeros((self._batch_size, 1), dtype='float64'), broadcastable=(False, True)) """ self._critic_regularization_weight = self.getSettings( )["critic_regularization_weight"] self._critic_learning_rate = self.getSettings()["critic_learning_rate"] ## Target network self._modelTarget = copy.deepcopy(model) self._q_valsA = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsA_drop = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsNextState = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) self._q_valsTargetNextState = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) self._q_valsTarget = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsTarget_drop = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsActA = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsActTarget = lasagne.layers.get_output( self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsActA_drop = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_func = self._q_valsA self._q_funcTarget = self._q_valsTarget self._q_func_drop = self._q_valsA_drop self._q_funcTarget_drop = self._q_valsTarget_drop self._q_funcAct = self._q_valsActA self._q_funcAct_drop = self._q_valsActA_drop # self._target = 
(self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._NotFallen # self._target = self._model.getRewardSymbolicVariable() + ((self._discount_factor * self._q_valsTargetNextState ) * self._NotFallen) + (self._NotFallen - 1) self._target = self._model.getRewardSymbolicVariable() + ( self._discount_factor * self._q_valsTargetNextState) self._diff = self._target - self._q_func self._diff_drop = self._target - self._q_func_drop # loss = 0.5 * self._diff ** 2 loss = T.pow(self._diff, 2) self._loss = T.mean(loss) self._loss_drop = T.mean(0.5 * self._diff_drop**2) self._params = lasagne.layers.helper.get_all_params( self._model.getCriticNetwork()) self._actionParams = lasagne.layers.helper.get_all_params( self._model.getActorNetwork()) self._givens_ = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._NotFallen: self._NotFallen_shared # self._model.getActionSymbolicVariable(): self._actions_shared, } self._actGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), self._model.getActionSymbolicVariable(): self._model.getActions(), # self._NotFallen: self._NotFallen_shared self._tmp_diff: self._tmp_diff_shared } self._critic_regularization = ( self._critic_regularization_weight * lasagne.regularization.regularize_network_params( self._model.getCriticNetwork(), lasagne.regularization.l2)) self._actor_regularization = ( (self._regularization_weight * lasagne.regularization.regularize_network_params( self._model.getActorNetwork(), lasagne.regularization.l2))) if (self.getSettings()['use_previous_value_regularization']): self._actor_regularization = self._actor_regularization + ( (self.getSettings()['previous_value_regularization_weight']) * change_penalty(self._model.getActorNetwork(), self._modelTarget.getActorNetwork())) elif ('regularization_type' in self.getSettings() and (self.getSettings()['regularization_type'] == 'KL_Divergence')): self._kl_firstfixed = T.mean( kl( self._q_valsActTarget, T.ones_like(self._q_valsActTarget) * self.getSettings()['exploration_rate'], self._q_valsActA, T.ones_like(self._q_valsActA) * self.getSettings()['exploration_rate'], self._action_length)) #self._actor_regularization = (( self._KL_Weight ) * self._kl_firstfixed ) + (10*(self._kl_firstfixed>self.getSettings()['kl_divergence_threshold'])* # T.square(self._kl_firstfixed-self.getSettings()['kl_divergence_threshold'])) self._actor_regularization = (self._kl_firstfixed) * ( self.getSettings()['kl_divergence_threshold']) print("Using regularization type : ", self.getSettings()['regularization_type']) # SGD update # self._updates_ = lasagne.updates.rmsprop(self._loss, self._params, self._learning_rate, self._rho, # self._rms_epsilon) self._value_grad = T.grad(self._loss + self._critic_regularization, self._params) ## Clipping the max gradient """ for x in range(len(self._value_grad)): self._value_grad[x] = T.clip(self._value_grad[x] , -0.1, 0.1) """ if (self.getSettings()['optimizer'] == 'rmsprop'): print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.rmsprop(self._value_grad, self._params, self._learning_rate, 
self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.momentum( self._value_grad, self._params, self._critic_learning_rate, momentum=self._rho) elif (self.getSettings()['optimizer'] == 'adam'): print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.adam(self._value_grad, self._params, self._critic_learning_rate, beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'adagrad'): print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.adagrad( self._value_grad, self._params, self._critic_learning_rate, epsilon=self._rms_epsilon) else: print("Unknown optimization method: ", self.getSettings()['optimizer']) sys.exit(-1) ## TD update """ if (self.getSettings()['optimizer'] == 'rmsprop'): self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) + self._critic_regularization, self._params, self._critic_learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): self._updates_ = lasagne.updates.momentum(T.mean(self._q_func) + self._critic_regularization, self._params, self._critic_learning_rate * -T.mean(self._diff), momentum=self._rho) elif ( self.getSettings()['optimizer'] == 'adam'): self._updates_ = lasagne.updates.adam(T.mean(self._q_func), self._params, self._critic_learning_rate * -T.mean(self._diff), beta1=0.9, beta2=0.999, epsilon=1e-08) else: print ("Unknown optimization method: ", self.getSettings()['optimizer']) sys.exit(-1) """ ## Need to perform an element wise operation or replicate _diff for this to work properly. # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA), theano.tensor.tile((self._diff * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here? self._actDiff = (self._model.getActionSymbolicVariable() - self._q_valsActA_drop) # self._actDiff = ((self._model.getActionSymbolicVariable() - self._q_valsActA)) # Target network does not work well here? # self._actDiff_drop = ((self._model.getActionSymbolicVariable() - self._q_valsActA_drop)) # Target network does not work well here? 
## This should be a single column vector # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(( T.transpose(T.sum(T.pow(self._actDiff, 2),axis=1) )), (self._diff * (1.0/(1.0-self._discount_factor)))) # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(( T.reshape(T.sum(T.pow(self._actDiff, 2),axis=1), (self._batch_size, 1) )), # (self._tmp_diff * (1.0/(1.0-self._discount_factor))) # self._actLoss_ = (T.mean(T.pow(self._actDiff, 2),axis=1)) self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)( (T.mean(T.pow(self._actDiff, 2), axis=1)), (self._tmp_diff * (1.0 / (1.0 - self._discount_factor)))) # self._actLoss = T.sum(self._actLoss)/float(self._batch_size) self._actLoss = T.mean(self._actLoss_) # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2)) self._policy_grad = T.grad(self._actLoss + self._actor_regularization, self._actionParams) ## Clipping the max gradient """ for x in range(len(self._policy_grad)): self._policy_grad[x] = T.clip(self._policy_grad[x] , -0.5, 0.5) """ if (self.getSettings()['optimizer'] == 'rmsprop'): self._actionUpdates = lasagne.updates.rmsprop( self._policy_grad, self._actionParams, self._learning_rate, self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): self._actionUpdates = lasagne.updates.momentum(self._policy_grad, self._actionParams, self._learning_rate, momentum=self._rho) elif (self.getSettings()['optimizer'] == 'adam'): self._actionUpdates = lasagne.updates.adam( self._policy_grad, self._actionParams, self._learning_rate, beta1=0.9, beta2=0.999, epsilon=self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'adagrad'): self._actionUpdates = lasagne.updates.adagrad( self._policy_grad, self._actionParams, self._learning_rate, epsilon=self._rms_epsilon) else: print("Unknown optimization method: ", self.getSettings()['optimizer']) # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) + # (self._regularization_weight * lasagne.regularization.regularize_network_params( # self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams, # self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon) self._givens_grad = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._model.getActionSymbolicVariable(): self._model.getActions(), } ### Noisey state updates # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._NotFallen # self._target_dyna = theano.gradient.disconnected_grad(self._q_func) ### _q_valsA because the predicted state is stored in self._model.getStateSymbolicVariable() self._diff_dyna = self._dyna_target - self._q_valsNextState # loss = 0.5 * self._diff ** 2 loss = T.pow(self._diff_dyna, 2) self._loss_dyna = T.mean(loss) self._dyna_grad = T.grad(self._loss_dyna + self._critic_regularization, self._params) self._givens_dyna = { # self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._NotFallen: 
self._NotFallen_shared # self._model.getActionSymbolicVariable(): self._actions_shared, self._dyna_target: self._dyna_target_shared } if (self.getSettings()['optimizer'] == 'rmsprop'): self._DYNAUpdates = lasagne.updates.rmsprop( self._dyna_grad, self._params, self._learning_rate, self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): self._DYNAUpdates = lasagne.updates.momentum(self._dyna_grad, self._params, self._learning_rate, momentum=self._rho) elif (self.getSettings()['optimizer'] == 'adam'): self._DYNAUpdates = lasagne.updates.adam(self._dyna_grad, self._params, self._learning_rate, beta1=0.9, beta2=0.999, epsilon=self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'adagrad'): self._DYNAUpdates = lasagne.updates.adagrad( self._dyna_grad, self._params, self._learning_rate, epsilon=self._rms_epsilon) else: print("Unknown optimization method: ", self.getSettings()['optimizer']) ## Bellman error self._bellman = self._target - self._q_funcTarget # self._target = self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState ) ### Give v(s') the next state and v(s) (target) the current state self._diff_adv = (self._discount_factor * self._q_func) - (self._q_valsTargetNextState) self._diff_adv_givens = { self._model.getStateSymbolicVariable(): self._model.getResultStates(), self._model.getResultStateSymbolicVariable(): self._model.getStates(), } A_CACLA.compile(self)
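## The A_CACLA actor update above weights the per-sample squared difference between the taken action
## and the policy mean by the TD-based term stored in self._tmp_diff, scaled by 1/(1-gamma).
## A minimal NumPy sketch of that weighting; the array arguments are hypothetical stand-ins for the Theano tensors:
import numpy as np

def cacla_weighted_actor_loss(actions_taken, policy_means, tmp_diff, discount_factor=0.99):
    """Batch mean of (mean squared action difference) * (scaled TD term),
    mirroring the Elemwise multiply used to build self._actLoss_ above."""
    act_diff = actions_taken - policy_means                      # (batch, action_dim)
    per_sample = np.mean(act_diff ** 2, axis=1, keepdims=True)   # (batch, 1)
    weight = tmp_diff * (1.0 / (1.0 - discount_factor))          # scale back to return units
    return np.mean(per_sample * weight)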
def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_): super(PPOCritic,self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_) # create a small convolutional neural network self._Fallen = T.bcol("Fallen") ## because float64 <= float32 * int32, need to use int16 or int8 self._Fallen.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype('int8')) self._fallen_shared = theano.shared( np.zeros((self._batch_size, 1), dtype='int8'), broadcastable=(False, True)) self._advantage = T.col("Advantage") self._advantage.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype(self.getSettings()['float_type'])) self._advantage_shared = theano.shared( np.zeros((self._batch_size, 1), dtype=self.getSettings()['float_type']), broadcastable=(False, True)) self._dyna_target = T.col("DYNA_Target") self._dyna_target.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype(self.getSettings()['float_type'])) self._dyna_target_shared = theano.shared( np.zeros((self._batch_size, 1), dtype=self.getSettings()['float_type']), broadcastable=(False, True)) self._KL_Weight = T.scalar("KL_Weight") self._KL_Weight.tag.test_value = np.zeros((1),dtype=np.dtype(self.getSettings()['float_type']))[0] self._kl_weight_shared = theano.shared( np.ones((1), dtype=self.getSettings()['float_type'])[0]) self._kl_weight_shared.set_value(self.getSettings()['previous_value_regularization_weight']) """ self._target_shared = theano.shared( np.zeros((self._batch_size, 1), dtype='float64'), broadcastable=(False, True)) """ self._critic_regularization_weight = self.getSettings()["critic_regularization_weight"] self._critic_learning_rate = self.getSettings()["critic_learning_rate"] # primary network self._model = model # Target network self._modelTarget = copy.deepcopy(model) self._q_valsA = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsA_drop = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsNextState = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) self._q_valsTargetNextState = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsActA = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)[:,:self._action_length] self._q_valsActASTD = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)[:,self._action_length:] ## prevent value from being 0 self._q_valsActASTD = (self._q_valsActASTD * self.getSettings()['exploration_rate']) + 1e-1 self._q_valsActTarget = lasagne.layers.get_output(self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable())[:,:self._action_length] self._q_valsActTargetSTD = lasagne.layers.get_output(self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable())[:,self._action_length:] self._q_valsActTargetSTD = (self._q_valsActTargetSTD * 
self.getSettings()['exploration_rate']) + 1e-1 self._q_valsActA_drop = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_func = self._q_valsA self._q_funcTarget = self._q_valsTarget self._q_func_drop = self._q_valsA_drop self._q_funcTarget_drop = self._q_valsTarget_drop self._q_funcAct = self._q_valsActA self._q_funcAct_drop = self._q_valsActA_drop # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._Fallen self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsTargetNextState )), self._Fallen) self._diff = self._target - self._q_func self._diff_drop = self._target - self._q_func_drop # loss = 0.5 * self._diff ** 2 loss = T.pow(self._diff, 2) self._loss = T.mean(loss) self._loss_drop = T.mean(0.5 * self._diff_drop ** 2) self._params = lasagne.layers.helper.get_all_params(self._model.getCriticNetwork()) self._actionParams = lasagne.layers.helper.get_all_params(self._model.getActorNetwork()) self._givens_ = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), self._model.getRewardSymbolicVariable(): self._model.getRewards(), self._Fallen: self._fallen_shared # self._model.getActionSymbolicVariable(): self._actions_shared, } self._actGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), self._model.getActionSymbolicVariable(): self._model.getActions(), # self._Fallen: self._fallen_shared, self._advantage: self._advantage_shared, # self._KL_Weight: self._kl_weight_shared } self._critic_regularization = (self._critic_regularization_weight * lasagne.regularization.regularize_network_params( self._model.getCriticNetwork(), lasagne.regularization.l2)) self._actor_regularization = (self._regularization_weight * lasagne.regularization.regularize_network_params( self._model.getActorNetwork(), lasagne.regularization.l2)) self._kl_firstfixed = T.mean(kl(self._q_valsActTarget, self._q_valsActTargetSTD, self._q_valsActA, self._q_valsActASTD, self._action_length)) # self._actor_regularization = (( self.getSettings()['previous_value_regularization_weight']) * self._kl_firstfixed ) # self._actor_regularization = (( self._KL_Weight ) * self._kl_firstfixed ) + (10*(self._kl_firstfixed>self.getSettings()['kl_divergence_threshold'])* # T.square(self._kl_firstfixed-self.getSettings()['kl_divergence_threshold'])) self._actor_entropy = 0.5 * T.mean(T.log(2 * np.pi * self._q_valsActASTD ) + 1 ) # SGD update # self._updates_ = lasagne.updates.rmsprop(self._loss + (self._regularization_weight * lasagne.regularization.regularize_network_params( # self._model.getCriticNetwork(), lasagne.regularization.l2)), self._params, self._learning_rate, self._rho, # self._rms_epsilon) # TD update if (self.getSettings()['optimizer'] == 'rmsprop'): self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) + self._critic_regularization, self._params, self._critic_learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): self._updates_ = lasagne.updates.momentum(T.mean(self._q_func) + self._critic_regularization, self._params, 
self._critic_learning_rate * -T.mean(self._diff), momentum=self._rho) elif ( self.getSettings()['optimizer'] == 'adam'): self._updates_ = lasagne.updates.adam(T.mean(self._q_func) + self._critic_regularization, self._params, self._critic_learning_rate * -T.mean(self._diff), beta1=0.9, beta2=0.999, epsilon=1e-08) else: print ("Unknown optimization method: ", self.getSettings()['optimizer']) sys.exit(-1) ## Need to perform an element wise operation or replicate _diff for this to work properly. # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA), # theano.tensor.tile((self._advantage * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here? ## advantage = Q(a,s) - V(s) = (r + gamma*V(s')) - V(s) # self._advantage = (((self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState)) * self._Fallen)) - self._q_func # self._Advantage = self._diff # * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards self._Advantage = self._advantage * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards # self._log_prob = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActA, self._q_valsActASTD, self._action_length) # self._log_prob_target = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActTarget, self._q_valsActTargetSTD, self._action_length) self._prob = likelihood(self._model.getActionSymbolicVariable(), self._q_valsActA, self._q_valsActASTD, self._action_length) self._prob_target = likelihood(self._model.getActionSymbolicVariable(), self._q_valsActTarget, self._q_valsActTargetSTD, self._action_length) # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target).dot(self._Advantage)) ) # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target) * (self._Advantage)) ) # self._actLoss_ = ( ((self._log_prob) * self._Advantage) ) # self._actLoss_ = ( ((self._log_prob)) ) ## This does the sum already # self._actLoss_ = ( (self._log_prob).dot( self._Advantage) ) self._r = (self._prob / self._prob_target) self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._r), self._Advantage) ppo_epsilon = self.getSettings()['kl_divergence_threshold'] self._actLoss_2 = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(theano.tensor.clip(self._r, 1.0 - ppo_epsilon, 1.0 + ppo_epsilon), self._Advantage) self._actLoss_ = theano.tensor.minimum((self._actLoss_), (self._actLoss_2)) # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(T.exp(self._log_prob - self._log_prob_target), self._Advantage) # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._log_prob), self._Advantage) # self._actLoss_ = T.mean(self._log_prob) # self._policy_entropy = 0.5 * T.mean(T.log(2 * np.pi * self._q_valsActASTD ) + 1 ) ## - because update computes gradient DESCENT updates # self._actLoss = -1.0 * ((T.mean(self._actLoss_)) + (self._actor_regularization )) # self._entropy = -1.
* T.sum(T.log(self._q_valsActA + 1e-8) * self._q_valsActA, axis=1, keepdims=True) ## - because update computes gradient DESCENT updates self._actLoss = (-1.0 * (T.mean(self._actLoss_) + (1e-2 * self._actor_entropy))) + self._actor_regularization # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2)) self._policy_grad = T.grad(self._actLoss , self._actionParams) self._policy_grad = lasagne.updates.total_norm_constraint(self._policy_grad, 5) if (self.getSettings()['optimizer'] == 'rmsprop'): self._actionUpdates = lasagne.updates.rmsprop(self._policy_grad, self._actionParams, self._learning_rate , self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): self._actionUpdates = lasagne.updates.momentum(self._policy_grad, self._actionParams, self._learning_rate , momentum=self._rho) elif ( self.getSettings()['optimizer'] == 'adam'): self._actionUpdates = lasagne.updates.adam(self._policy_grad, self._actionParams, self._learning_rate , beta1=0.9, beta2=0.999, epsilon=1e-08) else: print ("Unknown optimization method: ", self.getSettings()['optimizer']) # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) + # (self._regularization_weight * lasagne.regularization.regularize_network_params( # self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams, # self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon) self._givens_grad = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._model.getActionSymbolicVariable(): self._actions_shared, } ### _q_valsA because the predicted state is stored in self._model.getStateSymbolicVariable() self._diff_dyna = self._dyna_target - self._q_valsNextState # loss = 0.5 * self._diff ** 2 loss = T.pow(self._diff_dyna, 2) self._loss_dyna = T.mean(loss) self._dyna_grad = T.grad(self._loss_dyna + self._critic_regularization , self._params) self._givens_dyna = { # self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._Fallen: self._fallen_shared # self._model.getActionSymbolicVariable(): self._actions_shared, self._dyna_target: self._dyna_target_shared } if (self.getSettings()['optimizer'] == 'rmsprop'): self._DYNAUpdates = lasagne.updates.rmsprop(self._dyna_grad, self._params, self._learning_rate , self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): self._DYNAUpdates = lasagne.updates.momentum(self._dyna_grad, self._params, self._learning_rate , momentum=self._rho) elif ( self.getSettings()['optimizer'] == 'adam'): self._DYNAUpdates = lasagne.updates.adam(self._dyna_grad, self._params, self._learning_rate , beta1=0.9, beta2=0.999, epsilon=self._rms_epsilon) elif ( self.getSettings()['optimizer'] == 'adagrad'): self._DYNAUpdates = lasagne.updates.adagrad(self._dyna_grad, self._params, self._learning_rate, epsilon=self._rms_epsilon) else: print ("Unknown optimization method: ", self.getSettings()['optimizer']) ## Bellman error self._bellman = self._target - self._q_funcTarget PPOCritic.compile(self)
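## The PPOCritic actor objective above is the clipped surrogate: the probability ratio self._r times the
## advantage, against the same product with the ratio clipped to [1 - eps, 1 + eps], taking the element-wise
## minimum (here the 'kl_divergence_threshold' setting plays the role of the clip parameter).
## A small NumPy sketch of that quantity, assuming per-sample probabilities and advantages as column vectors:
import numpy as np

def ppo_clipped_surrogate(prob_new, prob_old, advantage, epsilon=0.2):
    """Batch mean of min(r * A, clip(r, 1 - eps, 1 + eps) * A)."""
    r = prob_new / prob_old
    unclipped = r * advantage
    clipped = np.clip(r, 1.0 - epsilon, 1.0 + epsilon) * advantage
    return np.mean(np.minimum(unclipped, clipped))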
def __init__(self, n_actions, replay_memory, initial_weights_file=None): self.mood_q = None self.last_q = 0 self.n_parameter_updates = 0 self.ignore_feedback = False self.alpha = 0.00025 # update frequency ? # gradient momentum ? 0.95 # squared gradient momentum ? 0.95 # min squared gradient ? 0.01 self.save_every_n_frames = 100000 # ~ once per hour self.final_exploration_frame = 1000000 self.replay_start_size = 50000 self.i_frames = 0 self.state = None self.initial_epsilon = 1 self.final_epsilon = 0.1 self.epsilon = self.initial_epsilon self.gamma = 0.99 self.replay_memory = replay_memory self.log_frequency = 50 self.minibatch_size = 32 # self.replay_memory_size = 1000000 self.target_network_update_frequency = 10000 s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var = T.tensor4("s0", dtype=theano.config.floatX), T.bmatrix( "a0"), T.wcol( "r0"), T.tensor4("s1", dtype=theano.config.floatX), T.bcol( "future_reward_indicator") self.n_actions = n_actions self.a_lookup = np.eye(self.n_actions, dtype=np.int8) self.network = build_cnn(n_actions=self.n_actions, input_var=T.cast(s0_var, 'float32') / np.float32(256)) print("Compiling forward.") self.forward = theano.function([s0_var], lasagne.layers.get_output(self.network, deterministic=True)) self.network_stale = build_cnn(n_actions=self.n_actions, input_var=T.cast(s1_var, 'float32') / np.float32(256)) print("Compiling forward stale.") self.forward_stale = theano.function([s1_var], lasagne.layers.get_output(self.network_stale, deterministic=True)) if initial_weights_file is not None: with np.load(initial_weights_file) as initial_weights: param_values = [initial_weights['arr_%d' % i] for i in range(len(initial_weights.files))] lasagne.layers.set_all_param_values(self.network, param_values) self._update_network_stale() out = lasagne.layers.get_output(self.network) out_stale = lasagne.layers.get_output(self.network_stale) self.loss, self.err, __y, __q = build_loss(out=out, out_stale=out_stale, a0_var=a0_var, r0_var=r0_var, future_reward_indicator_var=future_reward_indicator_var, gamma=self.gamma) params = lasagne.layers.get_all_params(self.network, trainable=True) updates = lasagne.updates.rmsprop(self.loss, params, learning_rate=0.0002, rho=0.95, epsilon=1e-6) # TODO RMSPROP in the paper has slightly different definition (see Lua) print("Compiling train_fn.") self.train_fn = theano.function([s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var], [self.loss, self.err, T.transpose(__y), T.transpose(__q), out, out_stale], updates=updates) print("Compiling loss_fn.") self.loss_fn = theano.function([s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var], self.loss)
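## The constructor above fixes initial_epsilon, final_epsilon, replay_start_size and final_exploration_frame,
## but the annealing itself is not part of this snippet. A plausible linear schedule consistent with those
## hyperparameters could look like the sketch below (the function name and exact shape are assumptions):

def annealed_epsilon(i_frames, initial_epsilon=1.0, final_epsilon=0.1,
                     replay_start_size=50000, final_exploration_frame=1000000):
    """Fully random before the replay buffer is warmed up, then a linear decay
    from initial_epsilon to final_epsilon over the exploration window."""
    if i_frames < replay_start_size:
        return initial_epsilon
    frac = min(1.0, (i_frames - replay_start_size) /
               float(final_exploration_frame - replay_start_size))
    return initial_epsilon + frac * (final_epsilon - initial_epsilon)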
def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_): super(A3C2,self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_) # create a small convolutional neural network self._Fallen = T.bcol("Fallen") ## because float64 <= float32 * int32, need to use int16 or int8 self._Fallen.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype('int8')) self._fallen_shared = theano.shared( np.zeros((self._batch_size, 1), dtype='int8'), broadcastable=(False, True)) self._advantage = T.col("Tmp_Diff") self._advantage.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype(self.getSettings()['float_type'])) self._advantage_shared = theano.shared( np.zeros((self._batch_size, 1), dtype=self.getSettings()['float_type']), broadcastable=(False, True)) """ self._target_shared = theano.shared( np.zeros((self._batch_size, 1), dtype='float64'), broadcastable=(False, True)) """ self._critic_regularization_weight = self.getSettings()["critic_regularization_weight"] self._critic_learning_rate = self.getSettings()["critic_learning_rate"] # primary network self._model = model # Target network self._modelTarget = copy.deepcopy(model) self._q_valsA = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsA_drop = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsTargetNextState = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable()) self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable()) self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsActA = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)[:,:self._action_length] self._q_valsActASTD = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)[:,self._action_length:] ## prevent value from being 0 self._q_valsActASTD = self._q_valsActASTD + 1e-3 self._q_valsActTarget = lasagne.layers.get_output(self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable())[:,:self._action_length] self._q_valsActTargetSTD = lasagne.layers.get_output(self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable())[:,self._action_length:] self._q_valsActTargetSTD = self._q_valsActTargetSTD + 1e-3 self._q_valsActA_drop = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_func = self._q_valsA self._q_funcTarget = self._q_valsTarget self._q_func_drop = self._q_valsA_drop self._q_funcTarget_drop = self._q_valsTarget_drop self._q_funcAct = self._q_valsActA self._q_funcAct_drop = self._q_valsActA_drop N = self._model.getStateSymbolicVariable().shape[0] # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._Fallen self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsTargetNextState )), self._Fallen) self._diff = self._target - self._q_func # self._Advantage = self._diff self._diff_drop = self._target - self._q_func_drop 
# loss = 0.5 * self._diff ** 2 loss = T.pow(self._diff, 2) self._loss = T.mean(loss) self._loss_drop = T.mean(0.5 * self._diff_drop ** 2) self._params = lasagne.layers.helper.get_all_params(self._model.getCriticNetwork()) self._actionParams = lasagne.layers.helper.get_all_params(self._model.getActorNetwork()) self._givens_ = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), self._model.getRewardSymbolicVariable(): self._model.getRewards(), self._Fallen: self._fallen_shared # self._model.getActionSymbolicVariable(): self._actions_shared, } self._actGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), self._model.getRewardSymbolicVariable(): self._model.getRewards(), self._model.getActionSymbolicVariable(): self._model.getActions(), self._Fallen: self._fallen_shared # self._advantage: self._advantage_shared } self._critic_regularization = (self._critic_regularization_weight * lasagne.regularization.regularize_network_params( self._model.getCriticNetwork(), lasagne.regularization.l2)) self._actor_regularization = (self._regularization_weight * lasagne.regularization.regularize_network_params( self._model.getActorNetwork(), lasagne.regularization.l2)) # SGD update # self._updates_ = lasagne.updates.rmsprop(self._loss + (self._regularization_weight * lasagne.regularization.regularize_network_params( # self._model.getCriticNetwork(), lasagne.regularization.l2)), self._params, self._learning_rate, self._rho, # self._rms_epsilon) # TD update if (self.getSettings()['optimizer'] == 'rmsprop'): self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) + self._critic_regularization, self._params, self._critic_learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): self._updates_ = lasagne.updates.momentum(T.mean(self._q_func) + self._critic_regularization, self._params, self._critic_learning_rate * -T.mean(self._diff), momentum=self._rho) elif ( self.getSettings()['optimizer'] == 'adam'): self._updates_ = lasagne.updates.adam(T.mean(self._q_func) + self._critic_regularization, self._params, self._critic_learning_rate * -T.mean(self._diff), beta1=0.9, beta2=0.999, epsilon=1e-08) else: print ("Unknown optimization method: ", self.getSettings()['optimizer']) sys.exit(-1) ## Need to perform an element wise operation or replicate _diff for this to work properly. # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA), # theano.tensor.tile((self._advantage * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here? # self._actLoss # self._actDiff = (self._model.getActionSymbolicVariable() - self._q_valsActA) # self._actDiff = ((self._model.getActionSymbolicVariable() - self._q_valsActA)) # Target network does not work well here? # self._actDiff_drop = ((self._model.getActionSymbolicVariable() - self._q_valsActA_drop)) # Target network does not work well here? 
## This should be a single column vector # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(( (T.mean(T.pow(self._actDiff, 2),axis=1) )), (self._diff * (1.0/(1.0-self._discount_factor)))) # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(( T.reshape(T.sum(T.pow(self._actDiff, 2),axis=1), (self._batch_size, 1) )), # (self._advantage * (1.0/(1.0-self._discount_factor))) # self._actLoss_ = (T.mean(T.pow(self._actDiff, 2),axis=1)) # self._Advantage = theano.tensor.tile(theano.gradient.disconnected_grad(self._diff), self._action_length) # self._Advantage = theano.gradient.disconnected_grad(self._diff) # self._Advantage = theano.tensor.clip(self._diff * (1.0/(1.0-self._discount_factor)), 0, 100000.0) ## scale back to same as rewards self._Advantage = self._diff # * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards self._log_prob = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActA, self._q_valsActASTD, self._action_length) self._log_prob_target = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActTarget, self._q_valsActTargetSTD, self._action_length) # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target).dot(self._Advantage)) ) # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target) * (self._Advantage)) ) # self._actLoss_ = ( ((self._log_prob) * self._Advantage) ) # self._actLoss_ = ( ((self._log_prob)) ) ## This does the sum already # self._actLoss_ = ( (self._log_prob).dot( self._Advantage) ) self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(T.exp(self._log_prob - self._log_prob_target), self._Advantage) # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._log_prob), self._Advantage) # self._actLoss_ = T.mean(self._log_prob) self._policy_entropy = 0.5 * T.mean(T.log(2 * np.pi * self._q_valsActASTD ) + 1 ) ## - because update computes gradient DESCENT updates self._actLoss = -1.0 * ((T.mean(self._actLoss_)) + (self._policy_entropy * 1e-2)) # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2)) # self._policy_grad = T.grad(self._actLoss , self._actionParams) # self._policy_grad = self._actLoss # self._policy_grad = lasagne.updates.total_norm_constraint(self._policy_grad, 5) # steps, self._actionUpdates = get_adam_steps_and_updates(self._policy_grad, self._actionParams, self._learning_rate) # self._actionUpdates = adam_updates(self._actLoss, self._actionParams, self._learning_rate) if (self.getSettings()['optimizer'] == 'rmsprop'): self._actionUpdates = lasagne.updates.rmsprop(self._actLoss , self._actionParams, self._learning_rate , self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): self._actionUpdates = lasagne.updates.momentum(self._actLoss , self._actionParams, self._learning_rate , momentum=self._rho) elif ( self.getSettings()['optimizer'] == 'adam'): self._actionUpdates = lasagne.updates.adam(self._actLoss , self._actionParams, self._learning_rate , beta1=0.9, beta2=0.999, epsilon=1e-08) else: print ("Unknown optimization method: ", self.getSettings()['optimizer']) # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) + # (self._regularization_weight * lasagne.regularization.regularize_network_params( # self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams, # self._learning_rate * 0.5 * 
(-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon) self._givens_grad = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._model.getActionSymbolicVariable(): self._actions_shared, } ## Bellman error self._bellman = self._target - self._q_funcTarget A3C2.compile(self)
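## The A3C2 policy loss above depends on a loglikelihood() helper (not shown here) for a diagonal Gaussian
## and adds a small entropy bonus before negating for gradient descent. A NumPy sketch of that surrogate,
## under the assumption that loglikelihood() sums the Gaussian log density over action dimensions:
import numpy as np

def gaussian_loglikelihood(actions, means, stds):
    """Diagonal-Gaussian log density, summed over action dimensions; returns shape (batch, 1)."""
    var = stds ** 2
    return np.sum(-0.5 * np.log(2.0 * np.pi * var) - 0.5 * ((actions - means) ** 2) / var,
                  axis=1, keepdims=True)

def a3c_actor_loss(actions, means, stds, old_means, old_stds, advantage, entropy_coeff=1e-2):
    """Importance-weighted policy-gradient surrogate with an entropy bonus,
    negated so that a descent-based optimizer maximizes it."""
    ratio = np.exp(gaussian_loglikelihood(actions, means, stds)
                   - gaussian_loglikelihood(actions, old_means, old_stds))
    entropy = 0.5 * np.mean(np.log(2.0 * np.pi * stds ** 2) + 1.0)  # standard Gaussian entropy
    return -1.0 * (np.mean(ratio * advantage) + entropy_coeff * entropy)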
def __init__(self, n_actions, replay_memory, build_network, updates, screen_size, initial_weights_file=None): self.screen_width, self.screen_height = screen_size self.mood_q = None self.last_q = 0 self.n_parameter_updates = 0 self.alpha = 0.00025 # update frequency ? # gradient momentum ? 0.95 # squared gradient momentum ? 0.95 # min squared gradient ? 0.01 self.save_every_n_frames = 100000 # ~ once per hour self.final_exploration_frame = 1000000 self.replay_start_size = 50000 self.i_action = 0 self.state = None self.initial_epsilon = 1 self.final_epsilon = 0.1 self.epsilon = self.initial_epsilon self.gamma = 0.99 self.replay_memory = replay_memory self.log_frequency = 1 self.minibatch_size = 32 # self.replay_memory_size = 1000000 self.target_network_update_frequency = 10000 s0_var = T.tensor4("s0", dtype=theano.config.floatX) a0_var = T.bmatrix("a0") r0_var = T.wcol("r0") s1_var = T.tensor4("s1", dtype=theano.config.floatX) future_reward_indicator_var = T.bcol("future_reward_indicator") self.n_actions = n_actions self.a_lookup = np.eye(self.n_actions, dtype=np.int8) self.network = build_network( n_actions=self.n_actions, input_var=T.cast(s0_var, 'float32') / np.float32(256), screen_size=(self.screen_height, self.screen_width)) print("Compiling forward.") self.forward = theano.function([s0_var], lasagne.layers.get_output( self.network, deterministic=True)) self.network_stale = build_network( n_actions=self.n_actions, input_var=T.cast(s1_var, 'float32') / np.float32(256), screen_size=(self.screen_height, self.screen_width)) print("Compiling forward_stale.") self.forward_stale = theano.function([s1_var], lasagne.layers.get_output( self.network_stale, deterministic=True)) self._update_network_stale() out = lasagne.layers.get_output(self.network) out_stale = lasagne.layers.get_output(self.network_stale) self.loss, self.err, __y, __q = build_loss( out=out, out_stale=out_stale, a0_var=a0_var, r0_var=r0_var, future_reward_indicator_var=future_reward_indicator_var, gamma=self.gamma) params = lasagne.layers.get_all_params(self.network, trainable=True) print("Compiling train_fn.") self.train_fn = theano.function( [s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var], [ self.loss, self.err, T.transpose(__y), T.transpose(__q), out, out_stale ], updates=updates(self.loss, params)) print("Compiling loss_fn.") self.loss_fn = theano.function( [s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var], self.loss)
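## build_loss() is not defined in this snippet; the usual one-step Q-learning target it would be expected to
## implement, using the stale (target) network and the future_reward_indicator mask, looks roughly like the
## NumPy sketch below (function names are hypothetical):
import numpy as np

def q_learning_targets(q_stale_s1, r0, future_reward_indicator, gamma=0.99):
    """y = r + gamma * max_a' Q_stale(s', a'), with the indicator zeroing terminal transitions."""
    return r0 + gamma * future_reward_indicator * np.max(q_stale_s1, axis=1, keepdims=True)

def q_learning_td_error(q_s0, a0_onehot, targets):
    """TD error between the Q-value of the action actually taken and the bootstrapped target."""
    q_taken = np.sum(q_s0 * a0_onehot, axis=1, keepdims=True)
    return targets - q_taken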
def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_): super(TRPOCritic, self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_) # create a small convolutional neural network self._Fallen = T.bcol("Fallen") ## because float64 <= float32 * int32, need to use int16 or int8 self._Fallen.tag.test_value = np.zeros((self._batch_size, 1), dtype=np.dtype('int8')) self._fallen_shared = theano.shared(np.zeros((self._batch_size, 1), dtype='int8'), broadcastable=(False, True)) self._advantage = T.col("Advantage") self._advantage.tag.test_value = np.zeros( (self._batch_size, 1), dtype=np.dtype(self.getSettings()['float_type'])) self._advantage_shared = theano.shared(np.zeros( (self._batch_size, 1), dtype=self.getSettings()['float_type']), broadcastable=(False, True)) self._KL_Weight = T.scalar("KL_Weight") self._KL_Weight.tag.test_value = np.zeros( (1), dtype=np.dtype(self.getSettings()['float_type']))[0] self._kl_weight_shared = theano.shared( np.ones((1), dtype=self.getSettings()['float_type'])[0]) self._kl_weight_shared.set_value( self.getSettings()['previous_value_regularization_weight']) """ self._target_shared = theano.shared( np.zeros((self._batch_size, 1), dtype='float64'), broadcastable=(False, True)) """ self._critic_regularization_weight = self.getSettings( )["critic_regularization_weight"] self._critic_learning_rate = self.getSettings()["critic_learning_rate"] # primary network self._model = model # Target network self._modelTarget = copy.deepcopy(model) self._q_valsA = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsA_drop = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsTargetNextState = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) self._q_valsTarget = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsTarget_drop = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsActA = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)[:, :self._action_length] self._q_valsActASTD = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)[:, self._action_length:] ## prevent value from being 0 self._q_valsActASTD = (self._q_valsActASTD * self.getSettings()['exploration_rate']) + 1e-3 self._q_valsActTarget_ = lasagne.layers.get_output( self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable()) self._q_valsActTarget = self._q_valsActTarget_[:, :self._action_length] self._q_valsActTargetSTD = self._q_valsActTarget_[:, self._action_length:] self._q_valsActTargetSTD = ( self._q_valsActTargetSTD * self.getSettings()['exploration_rate']) + 1e-3 self._q_valsActA_drop = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_func = self._q_valsA self._q_funcTarget = self._q_valsTarget self._q_func_drop = self._q_valsA_drop self._q_funcTarget_drop = self._q_valsTarget_drop self._q_funcAct = self._q_valsActA self._q_funcAct_drop = self._q_valsActA_drop # self._target = (self._model.getRewardSymbolicVariable() + 
(np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._Fallen self._target = T.mul( T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsTargetNextState)), self._Fallen) self._diff = self._target - self._q_func self._diff_drop = self._target - self._q_func_drop # loss = 0.5 * self._diff ** 2 loss = T.pow(self._diff, 2) self._loss = T.mean(loss) self._loss_drop = T.mean(0.5 * self._diff_drop**2) self._params = lasagne.layers.helper.get_all_params( self._model.getCriticNetwork()) self._actionParams = lasagne.layers.helper.get_all_params( self._model.getActorNetwork()) self._givens_ = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), self._model.getRewardSymbolicVariable(): self._model.getRewards(), self._Fallen: self._fallen_shared # self._model.getActionSymbolicVariable(): self._actions_shared, } self._actGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), self._model.getRewardSymbolicVariable(): self._model.getRewards(), self._model.getActionSymbolicVariable(): self._model.getActions(), self._Fallen: self._fallen_shared, # self._advantage: self._advantage_shared, # self._KL_Weight: self._kl_weight_shared } self._critic_regularization = ( self._critic_regularization_weight * lasagne.regularization.regularize_network_params( self._model.getCriticNetwork(), lasagne.regularization.l2)) # self._actor_regularization = ( (self._regularization_weight * lasagne.regularization.regularize_network_params( # self._model.getActorNetwork(), lasagne.regularization.l2)) ) self._kl_firstfixed = kl(self._q_valsActTarget, self._q_valsActTargetSTD, self._q_valsActA, self._q_valsActASTD, self._action_length).mean() # self._actor_regularization = (( self.getSettings()['previous_value_regularization_weight']) * self._kl_firstfixed ) self._actor_regularization = ( (self._KL_Weight) * self._kl_firstfixed) + ( (self._kl_firstfixed > self.getSettings()['kl_divergence_threshold']) * T.square(self._kl_firstfixed - self.getSettings()['kl_divergence_threshold'])) # SGD update # self._updates_ = lasagne.updates.rmsprop(self._loss + (self._regularization_weight * lasagne.regularization.regularize_network_params( # self._model.getCriticNetwork(), lasagne.regularization.l2)), self._params, self._learning_rate, self._rho, # self._rms_epsilon) # TD update if (self.getSettings()['optimizer'] == 'rmsprop'): self._updates_ = lasagne.updates.rmsprop( T.mean(self._q_func) + self._critic_regularization, self._params, self._critic_learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): self._updates_ = lasagne.updates.momentum( T.mean(self._q_func) + self._critic_regularization, self._params, self._critic_learning_rate * -T.mean(self._diff), momentum=self._rho) elif (self.getSettings()['optimizer'] == 'adam'): self._updates_ = lasagne.updates.adam( T.mean(self._q_func) + self._critic_regularization, self._params, self._critic_learning_rate * -T.mean(self._diff), beta1=0.9, beta2=0.999, epsilon=1e-08) else: print("Unknown optimization method: ", self.getSettings()['optimizer']) sys.exit(-1) ## Need to perform an element wise operation or replicate _diff for this to work properly. 
# self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA), # theano.tensor.tile((self._advantage * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here? ## advantage = Q(a,s) - V(s) = (r + gamma*V(s')) - V(s) # self._advantage = (((self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState)) * self._Fallen)) - self._q_func self._Advantage = self._diff # * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards self._log_prob = loglikelihood(self._model.getActionSymbolicVariable(), self._q_valsActA, self._q_valsActASTD, self._action_length) self._log_prob_target = loglikelihood( self._model.getActionSymbolicVariable(), self._q_valsActTarget, self._q_valsActTargetSTD, self._action_length) # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target).dot(self._Advantage)) ) # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target) * (self._Advantage)) ) # self._actLoss_ = ( ((self._log_prob) * self._Advantage) ) # self._actLoss_ = ( ((self._log_prob)) ) ## This does the sum already # self._actLoss_ = ( (self._log_prob).dot( self._Advantage) ) self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)( T.exp(self._log_prob - self._log_prob_target), self._Advantage) # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(T.exp(self._log_prob - self._log_prob_target), self._advantage) # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._log_prob), self._Advantage) # self._actLoss_ = T.mean(self._log_prob) # self._policy_entropy = 0.5 * T.mean(T.log(2 * np.pi * self._q_valsActASTD ) + 1 ) ## - because update computes gradient DESCENT updates # self._actLoss = -1.0 * ((T.mean(self._actLoss_)) + (self._actor_regularization )) # self._entropy = -1. 
* T.sum(T.log(self._q_valsActA + 1e-8) * self._q_valsActA, axis=1, keepdims=True) ## - because update computes gradient DESCENT updates self._actLoss = (T.mean(self._actLoss_)) # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2)) self._policy_grad = T.grad(self._actLoss, self._actionParams) if (self.getSettings()['optimizer'] == 'rmsprop'): self._actionUpdates = lasagne.updates.rmsprop( self._policy_grad, self._actionParams, self._learning_rate, self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): self._actionUpdates = lasagne.updates.momentum(self._policy_grad, self._actionParams, self._learning_rate, momentum=self._rho) elif (self.getSettings()['optimizer'] == 'adam'): self._actionUpdates = lasagne.updates.adam(self._policy_grad, self._actionParams, self._learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08) else: print("Unknown optimization method: ", self.getSettings()['optimizer']) N = self._model.getStateSymbolicVariable().shape[0] params = self._actionParams surr = self._actLoss * (-1.0) self.pg = flatgrad(surr, params) prob_mean_fixed = theano.gradient.disconnected_grad(self._q_valsActA) prob_std_fixed = theano.gradient.disconnected_grad(self._q_valsActASTD) kl_firstfixed = kl(prob_mean_fixed, prob_std_fixed, self._q_valsActA, self._q_valsActASTD, self._action_length).sum() / N grads = T.grad(kl_firstfixed, params) self.flat_tangent = T.vector(name="flat_tan") shapes = [var.get_value(borrow=True).shape for var in params] start = 0 tangents = [] for shape in shapes: size = np.prod(shape) tangents.append( T.reshape(self.flat_tangent[start:start + size], shape)) start += size self.gvp = T.add( *[T.sum(g * tangent) for (g, tangent) in zipsame(grads, tangents)]) #pylint: disable=E1111 # Fisher-vector product self.fvp = flatgrad(self.gvp, params) self.ent = entropy(self._q_valsActASTD).mean() self.kl = kl(self._q_valsActTarget, self._q_valsActTargetSTD, self._q_valsActA, self._q_valsActASTD, self._action_length).mean() self.losses = [surr, self.kl, self.ent] self.loss_names = ["surr", "kl", "ent"] self.args = [ self._model.getStateSymbolicVariable(), self._model.getActionSymbolicVariable(), self._model.getResultStateSymbolicVariable(), self._model.getRewardSymbolicVariable(), self._Fallen # self._advantage # self._q_valsActTarget_ ] self.args_fvp = [ self._model.getStateSymbolicVariable(), # self._model.getActionSymbolicVariable() # self._advantage, # self._q_valsActTarget_ ] # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) + # (self._regularization_weight * lasagne.regularization.regularize_network_params( # self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams, # self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon) self._givens_grad = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._model.getActionSymbolicVariable(): self._actions_shared, } ## Bellman error self._bellman = self._target - self._q_funcTarget TRPOCritic.compile(self)
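## self.fvp above yields Fisher-vector products by differentiating the gradient-vector product self.gvp;
## the natural-gradient step that consumes it (conjugate gradient plus a line search) is not part of this
## constructor. A standard NumPy sketch of the conjugate-gradient piece, assuming fvp is a callable that
## maps a flat parameter-space vector to a flat Fisher-vector product:
import numpy as np

def conjugate_gradient(fvp, g, iters=10, tol=1e-10):
    """Approximately solve F x = g where F is accessed only through Fisher-vector products;
    x is then the natural-gradient search direction for the TRPO update."""
    x = np.zeros_like(g)
    r = g.copy()
    p = g.copy()
    r_dot = r.dot(r)
    for _ in range(iters):
        Ap = fvp(p)
        alpha = r_dot / (p.dot(Ap) + 1e-12)
        x += alpha * p
        r -= alpha * Ap
        new_r_dot = r.dot(r)
        if new_r_dot < tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x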
def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_): super(Distillation, self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_) # create a small convolutional neural network ### Load expert policy files self._expert_policies = [] file_name_ = "" for i in range(len(self.getSettings()['expert_policy_files'])): file_name = self.getSettings( )['expert_policy_files'][i] + '/' + self.getSettings( )['model_type'] + '/' + getAgentName() + '.pkl' if (file_name_ == file_name): ## To help save memory when experts are the same self._expert_policies.append(model_) else: print("Loading pre compiled network: ", file_name) f = open(file_name, 'rb') model_ = dill.load(f) f.close() self._expert_policies.append( model_) # expert model, load the 2 expert models file_name_ = file_name self._actor_buffer_states = [] self._actor_buffer_result_states = [] self._actor_buffer_actions = [] self._actor_buffer_rewards = [] self._actor_buffer_falls = [] self._actor_buffer_diff = [] self._NotFallen = T.bcol("Not_Fallen") ## because float64 <= float32 * int32, need to use int16 or int8 self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1), dtype=np.dtype('int8')) self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1), dtype='int8'), broadcastable=(False, True)) self._tmp_diff = T.col("Tmp_Diff") self._tmp_diff.tag.test_value = np.zeros( (self._batch_size, 1), dtype=np.dtype(self.getSettings()['float_type'])) self._tmp_diff_shared = theano.shared( np.zeros((self._batch_size, 1), dtype=self.getSettings()['float_type']), broadcastable=(False, True)) # define a shared variable, initialized to 0 self._critic_regularization_weight = self.getSettings( )["critic_regularization_weight"] self._critic_learning_rate = self.getSettings()["critic_learning_rate"] ## Target network self._modelTarget = copy.deepcopy(model) # the target model is the model being updated self._q_valsA = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) # deterministic critic output of the primary model for the current state self._q_valsA_drop = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) # non-deterministic (dropout) critic output for the current state self._q_valsNextState = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) # critic value of the next state self._q_valsTargetNextState = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True) # target model's value of the next state self._q_valsTarget = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) # target model's value of the current state self._q_valsTarget_drop = lasagne.layers.get_output( self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) # target model's state value (with dropout) self._q_valsActA = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsActTarget = lasagne.layers.get_output( self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) #remove the random self._q_valsActA_drop = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) # actor output self._q_func = self._q_valsA self._q_funcTarget = self._q_valsTarget self._q_func_drop = self._q_valsA_drop self._q_funcTarget_drop = self._q_valsTarget_drop self._q_funcAct = self._q_valsActA self._q_funcAct_drop = 
self._q_valsActA_drop self._target = self._model.getRewardSymbolicVariable() + ( self._discount_factor * self._q_valsTargetNextState) # self._model.getRewardSymbolicVariable() fetches the rewards via getRewards() = self._rewards_shared, which starts at 0 and is updated continuously self._diff = self._target - self._q_func self._diff_drop = self._target - self._q_func_drop # target of the updated model minus the critic output of the original model loss = T.pow(self._diff, 2) self._loss = T.mean(loss) # the difference in reward between the two models self._loss_drop = T.mean(0.5 * self._diff_drop**2) self._params = lasagne.layers.helper.get_all_params( self._model.getCriticNetwork()) self._actionParams = lasagne.layers.helper.get_all_params( self._model.getActorNetwork()) self._givens_ = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), self._model.getRewardSymbolicVariable(): self._model.getRewards() } self._actGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), self._model.getActionSymbolicVariable(): self._model.getActions(), self._tmp_diff: self._tmp_diff_shared } self._critic_regularization = ( self._critic_regularization_weight * lasagne.regularization.regularize_network_params( self._model.getCriticNetwork(), lasagne.regularization.l2)) self._actor_regularization = ( (self._regularization_weight * lasagne.regularization.regularize_network_params( self._model.getActorNetwork(), lasagne.regularization.l2))) if (self.getSettings()['use_previous_value_regularization']): self._actor_regularization = self._actor_regularization + ( (self.getSettings()['previous_value_regularization_weight']) * change_penalty(self._model.getActorNetwork(), self._modelTarget.getActorNetwork())) elif ('regularization_type' in self.getSettings() and (self.getSettings()['regularization_type'] == 'KL_Divergence')): self._kl_firstfixed = T.mean( kl( self._q_valsActTarget, T.ones_like(self._q_valsActTarget) * self.getSettings()['exploration_rate'], self._q_valsActA, T.ones_like(self._q_valsActA) * self.getSettings()['exploration_rate'], self._action_length)) self._actor_regularization = (self._kl_firstfixed) * ( self.getSettings()['kl_divergence_threshold']) print("Using regularization type : ", self.getSettings()['regularization_type']) # SGD update self._value_grad = T.grad(self._loss + self._critic_regularization, self._params) if (self.getSettings()['optimizer'] == 'rmsprop'): print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.rmsprop(self._value_grad, self._params, self._learning_rate, self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.momentum( self._value_grad, self._params, self._critic_learning_rate, momentum=self._rho) elif (self.getSettings()['optimizer'] == 'adam'): print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.adam(self._value_grad, self._params, self._critic_learning_rate, beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'adagrad'): print("Optimizing Value Function with ", self.getSettings()['optimizer'], " method") self._updates_ = lasagne.updates.adagrad( self._value_grad, self._params, self._critic_learning_rate, epsilon=self._rms_epsilon) else: print("Unknown optimization method: ", self.getSettings()['optimizer']) sys.exit(-1) ## TD update ## Need to perform an element wise operation 
or replicate _diff for this to work properly. self._actDiff = (self._model.getActionSymbolicVariable() - self._q_valsActA_drop) # output of the updated model's actor minus the actor value of the original model self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)( (T.mean(T.pow(self._actDiff, 2), axis=1)), (self._tmp_diff)) self._actLoss = T.mean(self._actLoss_) self._policy_grad = T.grad(self._actLoss + self._actor_regularization, self._actionParams) ## Clipping the max gradient if (self.getSettings()['optimizer'] == 'rmsprop'): self._actionUpdates = lasagne.updates.rmsprop( self._policy_grad, self._actionParams, self._learning_rate, self._rho, self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'momentum'): self._actionUpdates = lasagne.updates.momentum(self._policy_grad, self._actionParams, self._learning_rate, momentum=self._rho) elif (self.getSettings()['optimizer'] == 'adam'): self._actionUpdates = lasagne.updates.adam( self._policy_grad, self._actionParams, self._learning_rate, beta1=0.9, beta2=0.999, epsilon=self._rms_epsilon) elif (self.getSettings()['optimizer'] == 'adagrad'): self._actionUpdates = lasagne.updates.adagrad( self._policy_grad, self._actionParams, self._learning_rate, epsilon=self._rms_epsilon) else: print("Unknown optimization method: ", self.getSettings()['optimizer']) self._givens_grad = { self._model.getStateSymbolicVariable(): self._model.getStates() } ## Bellman error self._bellman = self._target - self._q_funcTarget ### Give v(s') the next state and v(s) (target) the current state self._diff_adv = (self._discount_factor * self._q_func) - ( self._q_valsTargetNextState ) # gamma * critic output - critic output at the next state self._diff_adv_givens = { self._model.getStateSymbolicVariable(): self._model.getResultStates(), self._model.getResultStateSymbolicVariable(): self._model.getStates(), } Distillation.compile(self)
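## The Distillation actor update above regresses the student's action output toward the expert-labelled
## action, with each sample weighted by self._tmp_diff. A minimal NumPy sketch of that weighted regression
## loss, with hypothetical array inputs:
import numpy as np

def distillation_actor_loss(expert_actions, student_actions, weights):
    """Batch mean of (mean squared action difference per sample) * (per-sample weight)."""
    per_sample = np.mean((expert_actions - student_actions) ** 2, axis=1, keepdims=True)
    return np.mean(per_sample * weights)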
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):
        """
            In order to get this to work we need to be careful not to
            update the actor parameters when updating the critic. This can be
            an issue when concatenating networks together. The first network
            becomes a part of the second. However, you can still access the
            first network by itself, but an update on the second network will
            affect the first network. Care needs to be taken to make sure
            only the parameters of the second network are updated.
        """
        super(QProp, self).__init__(model, n_in, n_out, state_bounds,
                                    action_bounds, reward_bound, settings_)
        # if ('train_extra_value_function' in self.getSettings() and (self.getSettings()['train_extra_value_function'] == True)):
        self._experience = ExperienceMemory(
            n_in,
            n_out,
            self.getSettings()['expereince_length'],
            continuous_actions=True,
            settings=self.getSettings())
        self._experience.setStateBounds(copy.deepcopy(self.getStateBounds()))
        self._experience.setRewardBounds(copy.deepcopy(self.getRewardBounds()))
        self._experience.setActionBounds(copy.deepcopy(self.getActionBounds()))
        self._use_basic_polcy_grad = False
        self._Fallen = T.bcol("Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        self._Fallen.tag.test_value = np.zeros((self._batch_size, 1),
                                               dtype=np.dtype('int8'))
        self._fallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                     dtype='int8'),
                                            broadcastable=(False, True))
        self._Action = T.matrix("Action2")
        self._Action.tag.test_value = np.random.rand(self._batch_size,
                                                     self._action_length)
        self._Tmp_Target = T.col("Tmp_Target")
        self._Tmp_Target.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))
        self._tmp_target_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                                broadcastable=(False, True))
        self._Advantage = T.col("Advantage")
        self._Advantage.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))
        self._advantage_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                               broadcastable=(False, True))
        self._QProp_N = T.col("QProp_N")
        self._QProp_N.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))
        self._QProp_N_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                             broadcastable=(False, True))
        self._modelTarget = copy.deepcopy(model)
        self._modelTarget2 = copy.deepcopy(model)
        self._learning_rate = self.getSettings()['learning_rate']
        self._discount_factor = self.getSettings()['discount_factor']
        self._rho = self.getSettings()['rho']
        self._rms_epsilon = self.getSettings()['rms_epsilon']
        self._weight_update_steps = self.getSettings(
        )['steps_until_target_network_update']
        self._updates = 0
        self._decay_weight = self.getSettings()['regularization_weight']
        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActTarget_State = lasagne.layers.get_output(
            self._modelTarget2.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
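        ## The policy is treated as a Gaussian with a fixed standard deviation taken
        ## from the 'exploration_rate' setting; the two tensors built next broadcast
        ## that std to the shape of the current and (second) target actor means, so
        ## the likelihood ratio for the on-policy update can be formed further below.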
        self._q_valsActASTD = T.ones_like(
            self._q_valsActA) * self.getSettings()['exploration_rate']
        self._q_valsActTargetSTD = T.ones_like(
            self._q_valsActTarget_State) * self.getSettings()['exploration_rate']
        inputs_1 = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions()
        }
        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(), inputs_1)
        inputs_2 = {
            self._modelTarget.getStateSymbolicVariable():
            self._model.getResultStates(),
            self._modelTarget.getActionSymbolicVariable():
            self._model.getActions()
        }
        self._q_valsB_ = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(), inputs_2, deterministic=True)
        self._q_func = self._q_valsA
        self._q_funcB = self._q_valsB_
        self._q_funcAct = self._q_valsActA
        self._diff = self._Tmp_Target - self._q_func
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)
        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        print("******Number of Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getCriticNetwork()))))
        print("******Number of Action Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getActorNetwork()))))
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
            self._Tmp_Target: self._tmp_target_shared
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
        }
        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))
        ## MSE update
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.adam(self._value_grad,
                                              self._params,
                                              self._critic_learning_rate,
                                              beta1=0.9,
                                              beta2=0.9,
                                              epsilon=self._rms_epsilon)
        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
        }
        ## Some cool stuff to backprop action gradients
        self._action_grad = T.matrix("Action_Grad")
        self._action_grad.tag.test_value = np.zeros(
            (self._batch_size, self._action_length),
            dtype=np.dtype(self.getSettings()['float_type']))
        self._action_grad_shared = theano.shared(
            np.zeros((self._batch_size, self._action_length),
                     dtype=self.getSettings()['float_type']))
        ### Maximize wrt q function
        self._action_mean_grads = T.grad(
            cost=None,
            wrt=self._actionParams,
            known_grads={self._q_valsActA: self._action_grad_shared}),
        print("Action grads: ", self._action_mean_grads[0])
        ## When passing in gradients it needs to be a proper list of gradient expressions
        self._action_mean_grads = list(self._action_mean_grads[0])
        self._actionGRADUpdates = lasagne.updates.adam(
            self._action_mean_grads,
            self._actionParams,
            self._learning_rate,
            beta1=0.9,
            beta2=0.9,
            epsilon=self._rms_epsilon)
        self._actGradGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
        }
        self._actor_regularization = (
            self._regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getActorNetwork(), lasagne.regularization.l2))
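        ## The known_grads call above is the DPG-style actor update: the critic's
        ## gradient with respect to the action, dQ(s, a)/da, is computed elsewhere and
        ## written into self._action_grad_shared, and T.grad chains it through the
        ## actor output self._q_valsActA, giving dQ/da * da/dtheta for the Adam step
        ## in self._actionGRADUpdates. (A sketch of the intent, assuming the shared
        ## buffer is filled with the action gradients for the current minibatch; the
        ## sign convention is set by whatever fills that buffer.)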
        ### update Actor wrt to Q function
        """
        inputs_1_ = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._q_valsActA: self._model.getActions()
        }
        q = self._model.getCriticNetwork()(self._model.getStateSymbolicVariable(), self._q_valsActA)
        self._q_valsA_ = lasagne.layers.get_output(self._model.getCriticNetwork(), inputs_1_)
        # self._q_valsA_ = lasagne.layers.get_output(self._model.getCriticNetwork(), self._q_valsActA)
        self._q_val2 = theano.function([self._model.getStateSymbolicVariable()], self._q_valsA_)
        self._actionUpdates = lasagne.updates.adam(-T.mean(self._q_valsA_), self._actionParams,
                                                   self._learning_rate, beta1=0.9, beta2=0.9,
                                                   epsilon=self._rms_epsilon)
        """
        ## Compute on-policy policy gradient
        self._prob = likelihood(self._model.getActionSymbolicVariable(),
                                self._q_valsActA, self._q_valsActASTD,
                                self._action_length)
        ### How should this work if the target network is very odd, as in not a slightly outdated copy?
        self._prob_target = likelihood(self._model.getActionSymbolicVariable(),
                                       self._q_valsActTarget_State,
                                       self._q_valsActTargetSTD,
                                       self._action_length)
        ## This does the sum already
        self._r = (self._prob / self._prob_target)
        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            (self._r), self._Advantage)
        ppo_epsilon = self.getSettings()['kl_divergence_threshold']
        self._actLoss_2 = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            theano.tensor.clip(self._r, 1.0 - ppo_epsilon, 1.0 + ppo_epsilon),
            self._Advantage)
        self._actLoss_ = theano.tensor.minimum((self._actLoss_),
                                               (self._actLoss_2))
        self._actLoss = (T.mean(self._actLoss_)) - self._actor_regularization
        self._policy_grad = T.grad(-1.0 * self._actLoss, self._actionParams)
        self._policy_grad = lasagne.updates.total_norm_constraint(
            self._policy_grad, 5)
        if (self.getSettings()['optimizer'] == 'rmsprop'):
            self._actionUpdates = lasagne.updates.rmsprop(
                self._policy_grad, self._actionParams, self._learning_rate,
                self._rho, self._rms_epsilon)
        elif (self.getSettings()['optimizer'] == 'momentum'):
            self._actionUpdates = lasagne.updates.momentum(
                self._policy_grad,
                self._actionParams,
                self._learning_rate,
                momentum=self._rho)
        elif (self.getSettings()['optimizer'] == 'adam'):
            self._actionUpdates = lasagne.updates.adam(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       beta1=0.9,
                                                       beta2=0.999,
                                                       epsilon=1e-08)
        self._qprop_loss = self._actLoss + T.mean(
            (self._QProp_N * self._q_func))
        self._policy_grad_loss = self._actLoss
        # if ('train_extra_value_function' in self.getSettings() and (self.getSettings()['train_extra_value_function'] == True)):
        self._valsA = lasagne.layers.get_output(
            self._model._value_function,
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._valsA_drop = lasagne.layers.get_output(
            self._model._value_function,
            self._model.getStateSymbolicVariable(),
            deterministic=False)
        self._valsNextState = lasagne.layers.get_output(
            self._model._value_function,
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._valsTargetNextState = lasagne.layers.get_output(
            self._modelTarget._value_function,
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        self._valsTarget = lasagne.layers.get_output(
            self._modelTarget._value_function,
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._v_target = self._model.getRewardSymbolicVariable() + (
            self._discount_factor * self._valsTargetNextState)
        self._v_diff = self._v_target - self._valsA
        loss_v = T.pow(self._v_diff, 2)
        self._v_loss = T.mean(loss_v)
        self._params_value = lasagne.layers.helper.get_all_params(
            self._model._value_function)
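        ## The actor surrogate assembled above is the PPO-style clipped objective,
        ##   L_clip = mean( min(r * A, clip(r, 1 - eps, 1 + eps) * A) ) - regularization,
        ## with r = pi(a|s) / pi_old(a|s) and eps taken from 'kl_divergence_threshold',
        ## and self._qprop_loss adds the Q-Prop control-variate term mean(QProp_N * Q(s, a)).
        ## A small made-up example of the clipping: with eps = 0.2, A = 1.0 and r = 1.5,
        ## the unclipped term is 1.5 while the clipped term is 1.2, so the min keeps 1.2
        ## and the gradient with respect to r vanishes for that sample.
        ## The separate state-value network (self._model._value_function) is fitted to
        ## the one-step TD target r + gamma * V_target(s') defined just above.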
        self._givens_value = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getResultStateSymbolicVariable():
            self._model.getResultStates(),
            self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        }
        self._value_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model._value_function, lasagne.regularization.l2))
        self._value_grad = T.grad(self._v_loss + self._value_regularization,
                                  self._params_value)
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_value = lasagne.updates.adam(self._value_grad,
                                                   self._params_value,
                                                   self._critic_learning_rate,
                                                   beta1=0.9,
                                                   beta2=0.9,
                                                   epsilon=self._rms_epsilon)
        self._actGivens_PPO = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._model.getActionSymbolicVariable(): self._model.getActions(),
            # self._NotFallen: self._NotFallen_shared,
            self._Advantage: self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }
        QProp.compile(self)
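        ## A note on the givens dictionaries used throughout this constructor: they map
        ## each symbolic input to a shared variable (e.g. self._Advantage to
        ## self._advantage_shared), so the Theano functions compiled in QProp.compile
        ## read their minibatches directly from those shared buffers instead of taking
        ## explicit arguments; the update code elsewhere is expected to write each
        ## batch into the shared variables before calling the compiled functions.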