def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
             reward_bound, settings_):
    """Build the Theano computation graphs for the MBPG algorithm.

    Constructs, in order:
      * symbolic inputs (fall indicator, advantage, DYNA target, KL weight)
        and their shared-variable counterparts,
      * the critic (value) TD loss, its gradients and per-optimizer updates,
      * a PPO-style clipped-ratio policy loss and its updates,
      * a combined actor+critic loss (optionally with a state-encoding term),
      * a DYNA value-regression graph,
      * machinery for back-propagating externally supplied action gradients,
    then compiles everything via ``MBPG.compile``.

    Parameters mirror the base-class constructor: ``model`` holds the
    actor/critic Lasagne networks plus shared state/action/reward storage
    (project type); ``settings_`` is the hyper-parameter dict.
    """
    super(MBPG, self).__init__(model, n_in, n_out, state_bounds,
                               action_bounds, reward_bound, settings_)

    settings = self.getSettings()
    float_type = settings['float_type']

    # --- Symbolic inputs and their shared-memory counterparts -------------
    ## because float64 <= float32 * int32, need to use int16 or int8
    self._NotFallen = T.bcol("Not_Fallen")
    self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1),
                                              dtype=np.dtype('int8'))
    self._NotFallen_shared = theano.shared(
        np.zeros((self._batch_size, 1), dtype='int8'),
        broadcastable=(False, True))

    # Externally computed advantage estimates, one column per sample.
    self._advantage = T.col("Advantage")
    self._advantage.tag.test_value = np.zeros((self._batch_size, 1),
                                              dtype=np.dtype(float_type))
    self._advantage_shared = theano.shared(
        np.zeros((self._batch_size, 1), dtype=float_type),
        broadcastable=(False, True))

    # Regression target for the DYNA-style value update.
    self._dyna_target = T.col("DYNA_Target")
    self._dyna_target.tag.test_value = np.zeros((self._batch_size, 1),
                                                dtype=np.dtype(float_type))
    self._dyna_target_shared = theano.shared(
        np.zeros((self._batch_size, 1), dtype=float_type),
        broadcastable=(False, True))

    # Scalar weight for a KL regularization term (kept for experimentation).
    self._KL_Weight = T.scalar("KL_Weight")
    self._KL_Weight.tag.test_value = np.zeros(
        (1), dtype=np.dtype(float_type))[0]
    self._kl_weight_shared = theano.shared(
        np.ones((1), dtype=float_type)[0])
    self._kl_weight_shared.set_value(
        settings['previous_value_regularization_weight'])

    self._critic_regularization_weight = settings[
        "critic_regularization_weight"]
    self._critic_learning_rate = settings["critic_learning_rate"]

    # Primary network and a deep-copied target network.
    self._model = model
    self._modelTarget = copy.deepcopy(model)

    # --- Network outputs --------------------------------------------------
    # These getters return the fixed symbolic graph inputs; aliased locally
    # for readability (the same objects are also used as `givens` keys).
    state_var = self._model.getStateSymbolicVariable()
    next_state_var = self._model.getResultStateSymbolicVariable()

    self._q_valsA = lasagne.layers.get_output(
        self._model.getCriticNetwork(), state_var, deterministic=True)
    self._q_valsA_drop = lasagne.layers.get_output(
        self._model.getCriticNetwork(), state_var, deterministic=False)
    self._q_valsNextState = lasagne.layers.get_output(
        self._model.getCriticNetwork(), next_state_var, deterministic=True)
    self._q_valsTargetNextState = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(), next_state_var,
        deterministic=True)
    self._q_valsTarget = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(), state_var, deterministic=True)
    self._q_valsTarget_drop = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(), state_var, deterministic=False)

    # The actor head outputs [mean | std] concatenated along features.
    self._q_valsActA = lasagne.layers.get_output(
        self._model.getActorNetwork(), state_var,
        deterministic=True)[:, :self._action_length]
    self._q_valsActASTD = lasagne.layers.get_output(
        self._model.getActorNetwork(), state_var,
        deterministic=True)[:, self._action_length:]
    ## Scale the std head and add a floor to prevent the value from being 0.
    self._q_valsActASTD = (self._q_valsActASTD *
                           settings['exploration_rate']) + 2e-2

    self._q_valsActTarget = lasagne.layers.get_output(
        self._modelTarget.getActorNetwork(),
        state_var)[:, :self._action_length]
    self._q_valsActTargetSTD = lasagne.layers.get_output(
        self._modelTarget.getActorNetwork(),
        state_var)[:, self._action_length:]
    self._q_valsActTargetSTD = (self._q_valsActTargetSTD *
                                settings['exploration_rate']) + 2e-2

    self._q_valsActA_drop = lasagne.layers.get_output(
        self._model.getActorNetwork(), state_var, deterministic=False)

    # Aliases used by the rest of the class.
    self._q_func = self._q_valsA
    self._q_funcTarget = self._q_valsTarget
    self._q_func_drop = self._q_valsA_drop
    self._q_funcTarget_drop = self._q_valsTarget_drop
    self._q_funcAct = self._q_valsActA
    self._q_funcAct_drop = self._q_valsActA_drop

    # --- Critic (TD) loss -------------------------------------------------
    # One-step TD target r + gamma * V_target(s'). The fall indicator is
    # intentionally not applied here.
    self._target = self._model.getRewardSymbolicVariable() + (
        self._discount_factor * self._q_valsTargetNextState)
    self._diff = self._target - self._q_func
    self._diff_drop = self._target - self._q_func_drop
    self._loss = T.mean(0.5 * T.pow(self._diff, 2))
    self._loss_drop = T.mean(0.5 * self._diff_drop ** 2)

    self._params = lasagne.layers.helper.get_all_params(
        self._model.getCriticNetwork())
    self._actionParams = lasagne.layers.helper.get_all_params(
        self._model.getActorNetwork())

    # Input substitutions for the compiled functions.
    self._givens_ = {
        state_var: self._model.getStates(),
        next_state_var: self._model.getResultStates(),
        self._model.getRewardSymbolicVariable(): self._model.getRewards(),
    }
    self._actGivens = {
        state_var: self._model.getStates(),
        self._model.getActionSymbolicVariable(): self._model.getActions(),
        self._advantage: self._advantage_shared,
    }
    self._allGivens = {
        state_var: self._model.getStates(),
        next_state_var: self._model.getResultStates(),
        self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        self._model.getActionSymbolicVariable(): self._model.getActions(),
        self._advantage: self._advantage_shared,
    }

    # --- Regularizers and entropy -----------------------------------------
    self._critic_regularization = (
        self._critic_regularization_weight *
        lasagne.regularization.regularize_network_params(
            self._model.getCriticNetwork(), lasagne.regularization.l2))
    self._actor_regularization = (
        self._regularization_weight *
        lasagne.regularization.regularize_network_params(
            self._model.getActorNetwork(), lasagne.regularization.l2))
    # Mean KL between the target (old) and current policy Gaussians;
    # kl() is a project helper. Kept for diagnostics / alternative schemes.
    self._kl_firstfixed = T.mean(
        kl(self._q_valsActTarget, self._q_valsActTargetSTD,
           self._q_valsActA, self._q_valsActASTD, self._action_length))
    # NOTE(review): a Gaussian's differential entropy is
    # 0.5*log(2*pi*e*sigma^2); this linear-in-sigma surrogate is kept as-is.
    self._actor_entropy = 0.5 * T.mean((2 * np.pi * self._q_valsActASTD))

    # --- Critic updates ---------------------------------------------------
    self._value_grad = T.grad(self._loss + self._critic_regularization,
                              self._params)
    optimizer = settings['optimizer']
    if optimizer == 'rmsprop':
        print("Optimizing Value Function with ", optimizer, " method")
        self._updates_ = lasagne.updates.rmsprop(
            self._value_grad, self._params, self._learning_rate, self._rho,
            self._rms_epsilon)
    elif optimizer == 'momentum':
        print("Optimizing Value Function with ", optimizer, " method")
        self._updates_ = lasagne.updates.momentum(
            self._value_grad, self._params, self._critic_learning_rate,
            momentum=self._rho)
    elif optimizer == 'adam':
        print("Optimizing Value Function with ", optimizer, " method")
        # NOTE(review): beta2=0.9 differs from Adam's usual 0.999; kept
        # as in the original — confirm intent.
        self._updates_ = lasagne.updates.adam(
            self._value_grad, self._params, self._critic_learning_rate,
            beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon)
    elif optimizer == 'adagrad':
        print("Optimizing Value Function with ", optimizer, " method")
        self._updates_ = lasagne.updates.adagrad(
            self._value_grad, self._params, self._critic_learning_rate,
            epsilon=self._rms_epsilon)
    else:
        print("Unknown optimization method: ", optimizer)
        sys.exit(-1)

    # --- PPO-style clipped policy loss ------------------------------------
    self._Advantage = self._advantage  # already scaled by the caller

    ### Only the std differs between numerator and denominator: both use
    ### the target network's mean (likelihood() is a project helper).
    self._prob = likelihood(self._model.getActionSymbolicVariable(),
                            self._q_valsActTarget, self._q_valsActASTD,
                            self._action_length)
    self._prob_target = likelihood(self._model.getActionSymbolicVariable(),
                                   self._q_valsActTarget,
                                   self._q_valsActTargetSTD,
                                   self._action_length)
    ## likelihood() already sums over action dimensions.
    self._r = (self._prob / self._prob_target)

    ppo_epsilon = settings['kl_divergence_threshold']
    # Unclipped and clipped surrogate objectives; the pessimistic
    # (element-wise minimum) of the two is what gets maximized.
    self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
        self._r, self._Advantage)
    # BUGFIX: the clipped ratio and the advantage were previously wrapped
    # in a single tuple passed as one argument to the element-wise
    # multiply; pass them as two separate tensor arguments.
    self._actLoss_2 = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
        theano.tensor.clip(self._r, 1.0 - ppo_epsilon, 1.0 + ppo_epsilon),
        self._Advantage)
    self._actLoss_ = theano.tensor.minimum(self._actLoss_, self._actLoss_2)
    self._actLoss = (-1.0 * (T.mean(self._actLoss_) +
                             (settings['std_entropy_weight'] *
                              self._actor_entropy))
                     ) + self._actor_regularization

    self._policy_grad = T.grad(self._actLoss, self._actionParams)
    # Global gradient-norm clipping for the policy update.
    self._policy_grad = lasagne.updates.total_norm_constraint(
        self._policy_grad, 5)
    if optimizer == 'rmsprop':
        self._actionUpdates = lasagne.updates.rmsprop(
            self._policy_grad, self._actionParams, self._learning_rate,
            self._rho, self._rms_epsilon)
    elif optimizer == 'momentum':
        self._actionUpdates = lasagne.updates.momentum(
            self._policy_grad, self._actionParams, self._learning_rate,
            momentum=self._rho)
    elif optimizer == 'adam':
        self._actionUpdates = lasagne.updates.adam(
            self._policy_grad, self._actionParams, self._learning_rate,
            beta1=0.9, beta2=0.999, epsilon=1e-08)
    else:
        # NOTE(review): unlike the critic branch this does not sys.exit();
        # preserved so behavior is unchanged.
        print("Unknown optimization method: ", optimizer)

    # --- Combined actor+critic loss (optionally with state encoding) ------
    policy_term = (-1.0 * settings['policy_loss_weight'] *
                   (T.mean(self._actLoss_) +
                    (settings['std_entropy_weight'] * self._actor_entropy)))
    if (('train_state_encoding' in settings) and
            settings['train_state_encoding']):
        self._encoded_state = lasagne.layers.get_output(
            self._model.getEncodeNet(), state_var, deterministic=True)
        # Reconstruction loss tying the encoding back to the raw state.
        self._encoding_loss = T.mean(
            T.pow(self._encoded_state - self._model.getStates(), 2))
        self._full_loss = (self._loss + self._critic_regularization +
                           policy_term +
                           (self._actor_regularization +
                            self._encoding_loss))
        self._encodeParams = lasagne.layers.helper.get_all_params(
            self._model.getEncodeNet())
        self._all_Params = (self._params + self._actionParams +
                            self._encodeParams)
    else:
        self._full_loss = (self._loss + self._critic_regularization +
                           policy_term + self._actor_regularization)
        self._all_Params = self._params + self._actionParams
    print("Num params: ", len(self._all_Params), " params: ",
          len(self._params), " act params: ", len(self._actionParams))

    self._both_grad = T.grad(self._full_loss, self._all_Params)
    self._both_grad = lasagne.updates.total_norm_constraint(
        self._both_grad, 5)
    if optimizer == 'rmsprop':
        self._collectiveUpdates = lasagne.updates.rmsprop(
            self._both_grad, self._all_Params, self._learning_rate,
            self._rho, self._rms_epsilon)
    elif optimizer == 'momentum':
        self._collectiveUpdates = lasagne.updates.momentum(
            self._both_grad, self._all_Params, self._learning_rate,
            momentum=self._rho)
    elif optimizer == 'adam':
        self._collectiveUpdates = lasagne.updates.adam(
            self._both_grad, self._all_Params, self._learning_rate,
            beta1=0.9, beta2=0.999, epsilon=1e-08)
    else:
        print("Unknown optimization method: ", optimizer)

    self._givens_grad = {
        state_var: self._model.getStates(),
    }

    # --- DYNA value-regression graph --------------------------------------
    ### _q_valsNextState is used because the predicted state is loaded
    ### into the result-state storage for this update.
    self._diff_dyna = self._dyna_target - self._q_valsNextState
    self._loss_dyna = T.mean(0.5 * T.pow(self._diff_dyna, 2))
    self._dyna_grad = T.grad(self._loss_dyna + self._critic_regularization,
                             self._params)
    self._givens_dyna = {
        next_state_var: self._model.getResultStates(),
        self._dyna_target: self._dyna_target_shared,
    }
    if optimizer == 'rmsprop':
        self._DYNAUpdates = lasagne.updates.rmsprop(
            self._dyna_grad, self._params, self._learning_rate, self._rho,
            self._rms_epsilon)
    elif optimizer == 'momentum':
        self._DYNAUpdates = lasagne.updates.momentum(
            self._dyna_grad, self._params, self._learning_rate,
            momentum=self._rho)
    elif optimizer == 'adam':
        self._DYNAUpdates = lasagne.updates.adam(
            self._dyna_grad, self._params, self._learning_rate,
            beta1=0.9, beta2=0.999, epsilon=self._rms_epsilon)
    elif optimizer == 'adagrad':
        self._DYNAUpdates = lasagne.updates.adagrad(
            self._dyna_grad, self._params, self._learning_rate,
            epsilon=self._rms_epsilon)
    else:
        print("Unknown optimization method: ", optimizer)

    ## Bellman error (diagnostic only; not optimized directly).
    self._bellman = self._target - self._q_funcTarget

    # --- Backprop of externally supplied action gradients -----------------
    self._action_grad = T.matrix("Action_Grad")
    self._action_grad.tag.test_value = np.zeros(
        (self._batch_size, self._action_length),
        dtype=np.dtype(float_type))
    self._action_grad_shared = theano.shared(
        np.zeros((self._batch_size, self._action_length),
                 dtype=float_type))
    # BUGFIX: a stray trailing comma previously turned this into a 1-tuple
    # that then had to be unpacked with [0]; build the proper list of
    # per-parameter gradient expressions directly.
    self._action_mean_grads = list(T.grad(
        cost=None,
        wrt=self._actionParams,
        known_grads={self._q_valsActA: self._action_grad_shared}))
    self._actionGRADUpdates = lasagne.updates.adagrad(
        self._action_mean_grads, self._actionParams, self._learning_rate,
        epsilon=self._rms_epsilon)
    self._actGradGivens = {
        state_var: self._model.getStates(),
    }

    # Compile all Theano functions defined above.
    MBPG.compile(self)
def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
             reward_bound, settings_):
    """Build the Theano computation graphs for the A_CACLA algorithm.

    Constructs the critic TD loss and per-optimizer update rules, the
    CACLA-style actor loss (squared action difference weighted by a
    per-sample TD-error column), an optional previous-value / KL
    regularizer, a DYNA value-regression graph, and an advantage
    diagnostic, then compiles everything via ``A_CACLA.compile``.

    Parameters mirror the base-class constructor: ``model`` holds the
    actor/critic Lasagne networks plus shared state/action/reward
    storage (project type); ``settings_`` is the hyper-parameter dict.
    """
    super(A_CACLA, self).__init__(model, n_in, n_out, state_bounds,
                                  action_bounds, reward_bound, settings_)
    # create a small convolutional neural network
    # Replay-style buffers for actor training samples (filled elsewhere).
    self._actor_buffer_states = []
    self._actor_buffer_result_states = []
    self._actor_buffer_actions = []
    self._actor_buffer_rewards = []
    self._actor_buffer_falls = []
    self._actor_buffer_diff = []

    # Fall-indicator column (declared but currently unused in the givens).
    self._NotFallen = T.bcol("Not_Fallen")
    ## because float64 <= float32 * int32, need to use int16 or int8
    self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1),
                                              dtype=np.dtype('int8'))
    self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                    dtype='int8'),
                                           broadcastable=(False, True))

    # Per-sample TD-error column that weights the CACLA actor loss.
    self._tmp_diff = T.col("Tmp_Diff")
    self._tmp_diff.tag.test_value = np.zeros(
        (self._batch_size, 1),
        dtype=np.dtype(self.getSettings()['float_type']))
    self._tmp_diff_shared = theano.shared(np.zeros(
        (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                          broadcastable=(False, True))

    # Regression target for the DYNA-style value update.
    self._dyna_target = T.col("DYNA_Target")
    self._dyna_target.tag.test_value = np.zeros(
        (self._batch_size, 1),
        dtype=np.dtype(self.getSettings()['float_type']))
    self._dyna_target_shared = theano.shared(np.zeros(
        (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                             broadcastable=(False, True))

    # Scalar KL-regularization weight (initialized to 1.0 here).
    self._KL_Weight = T.scalar("KL_Weight")
    self._KL_Weight.tag.test_value = np.zeros(
        (1), dtype=np.dtype(self.getSettings()['float_type']))[0]
    self._kl_weight_shared = theano.shared(
        np.ones((1), dtype=self.getSettings()['float_type'])[0])
    self._kl_weight_shared.set_value(1.0)
    """
    self._target_shared = theano.shared(
        np.zeros((self._batch_size, 1), dtype='float64'),
        broadcastable=(False, True))
    """
    self._critic_regularization_weight = self.getSettings(
    )["critic_regularization_weight"]
    self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
    ## Target network
    # NOTE(review): unlike sibling classes, self._model is not assigned
    # here — presumably set by the base-class constructor; confirm.
    self._modelTarget = copy.deepcopy(model)

    # --- Network outputs --------------------------------------------------
    self._q_valsA = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    self._q_valsA_drop = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)
    self._q_valsNextState = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getResultStateSymbolicVariable(),
        deterministic=True)
    self._q_valsTargetNextState = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getResultStateSymbolicVariable(),
        deterministic=True)
    self._q_valsTarget = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    self._q_valsTarget_drop = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)
    # Actor outputs (full action vector; no mean/std split in this class).
    self._q_valsActA = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    self._q_valsActTarget = lasagne.layers.get_output(
        self._modelTarget.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    self._q_valsActA_drop = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)

    # Aliases used by the rest of the class.
    self._q_func = self._q_valsA
    self._q_funcTarget = self._q_valsTarget
    self._q_func_drop = self._q_valsA_drop
    self._q_funcTarget_drop = self._q_valsTarget_drop
    self._q_funcAct = self._q_valsActA
    self._q_funcAct_drop = self._q_valsActA_drop

    # --- Critic (TD) loss -------------------------------------------------
    # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._NotFallen
    # self._target = self._model.getRewardSymbolicVariable() + ((self._discount_factor * self._q_valsTargetNextState ) * self._NotFallen) + (self._NotFallen - 1)
    # One-step TD target r + gamma * V_target(s'); no fall masking here.
    self._target = self._model.getRewardSymbolicVariable() + (
        self._discount_factor * self._q_valsTargetNextState)
    self._diff = self._target - self._q_func
    self._diff_drop = self._target - self._q_func_drop
    # loss = 0.5 * self._diff ** 2
    # NOTE(review): no 0.5 factor here, unlike self._loss_drop below —
    # only scales the gradient, but confirm it is intentional.
    loss = T.pow(self._diff, 2)
    self._loss = T.mean(loss)
    self._loss_drop = T.mean(0.5 * self._diff_drop**2)

    self._params = lasagne.layers.helper.get_all_params(
        self._model.getCriticNetwork())
    self._actionParams = lasagne.layers.helper.get_all_params(
        self._model.getActorNetwork())

    # Input substitutions for the compiled critic function.
    self._givens_ = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
        self._model.getResultStateSymbolicVariable():
        self._model.getResultStates(),
        self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        # self._NotFallen: self._NotFallen_shared
        # self._model.getActionSymbolicVariable(): self._actions_shared,
    }
    # Input substitutions for the compiled actor function.
    self._actGivens = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
        # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
        # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        self._model.getActionSymbolicVariable(): self._model.getActions(),
        # self._NotFallen: self._NotFallen_shared
        self._tmp_diff: self._tmp_diff_shared
    }

    # --- Regularizers -----------------------------------------------------
    self._critic_regularization = (
        self._critic_regularization_weight *
        lasagne.regularization.regularize_network_params(
            self._model.getCriticNetwork(), lasagne.regularization.l2))
    self._actor_regularization = (
        (self._regularization_weight *
         lasagne.regularization.regularize_network_params(
             self._model.getActorNetwork(), lasagne.regularization.l2)))
    if (self.getSettings()['use_previous_value_regularization']):
        # Penalize drift from the target (previous) actor parameters;
        # change_penalty() is a project helper.
        self._actor_regularization = self._actor_regularization + (
            (self.getSettings()['previous_value_regularization_weight']) *
            change_penalty(self._model.getActorNetwork(),
                           self._modelTarget.getActorNetwork()))
    elif ('regularization_type' in self.getSettings() and
          (self.getSettings()['regularization_type'] == 'KL_Divergence')):
        # KL between target and current policies with a fixed std;
        # kl() is a project helper. NOTE(review): this *replaces* the L2
        # term rather than adding to it — confirm intent.
        self._kl_firstfixed = T.mean(
            kl(
                self._q_valsActTarget,
                T.ones_like(self._q_valsActTarget) *
                self.getSettings()['exploration_rate'], self._q_valsActA,
                T.ones_like(self._q_valsActA) *
                self.getSettings()['exploration_rate'],
                self._action_length))
        #self._actor_regularization = (( self._KL_Weight ) * self._kl_firstfixed ) + (10*(self._kl_firstfixed>self.getSettings()['kl_divergence_threshold'])*
        #                                             T.square(self._kl_firstfixed-self.getSettings()['kl_divergence_threshold']))
        self._actor_regularization = (self._kl_firstfixed) * (
            self.getSettings()['kl_divergence_threshold'])
        print("Using regularization type : ",
              self.getSettings()['regularization_type'])

    # --- Critic updates ---------------------------------------------------
    # SGD update
    # self._updates_ = lasagne.updates.rmsprop(self._loss, self._params, self._learning_rate, self._rho,
    #                                    self._rms_epsilon)
    self._value_grad = T.grad(self._loss + self._critic_regularization,
                              self._params)
    ## Clipping the max gradient
    """
    for x in range(len(self._value_grad)):
        self._value_grad[x] = T.clip(self._value_grad[x] , -0.1, 0.1)
    """
    if (self.getSettings()['optimizer'] == 'rmsprop'):
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.rmsprop(self._value_grad,
                                                 self._params,
                                                 self._learning_rate,
                                                 self._rho,
                                                 self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'momentum'):
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.momentum(
            self._value_grad,
            self._params,
            self._critic_learning_rate,
            momentum=self._rho)
    elif (self.getSettings()['optimizer'] == 'adam'):
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        # NOTE(review): beta2=0.9 differs from Adam's usual 0.999.
        self._updates_ = lasagne.updates.adam(self._value_grad,
                                              self._params,
                                              self._critic_learning_rate,
                                              beta1=0.9,
                                              beta2=0.9,
                                              epsilon=self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'adagrad'):
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.adagrad(
            self._value_grad,
            self._params,
            self._critic_learning_rate,
            epsilon=self._rms_epsilon)
    else:
        print("Unknown optimization method: ",
              self.getSettings()['optimizer'])
        sys.exit(-1)
    ## TD update
    """
    if (self.getSettings()['optimizer'] == 'rmsprop'):
        self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) +
          self._critic_regularization, self._params,
          self._critic_learning_rate * -T.mean(self._diff), self._rho,
          self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'momentum'):
        self._updates_ = lasagne.updates.momentum(T.mean(self._q_func) +
          self._critic_regularization, self._params,
          self._critic_learning_rate * -T.mean(self._diff),
          momentum=self._rho)
    elif ( self.getSettings()['optimizer'] == 'adam'):
        self._updates_ = lasagne.updates.adam(T.mean(self._q_func),
          self._params,
          self._critic_learning_rate * -T.mean(self._diff), beta1=0.9,
          beta2=0.999, epsilon=1e-08)
    else:
        print ("Unknown optimization method: ",
               self.getSettings()['optimizer'])
        sys.exit(-1)
    """

    # --- CACLA actor loss -------------------------------------------------
    ## Need to perform an element wise operation or replicate _diff for this to work properly.
    # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA), theano.tensor.tile((self._diff * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here?
    # Difference between the taken action and the (dropout) policy output.
    self._actDiff = (self._model.getActionSymbolicVariable() -
                     self._q_valsActA_drop)
    # self._actDiff = ((self._model.getActionSymbolicVariable() - self._q_valsActA)) # Target network does not work well here?
    # self._actDiff_drop = ((self._model.getActionSymbolicVariable() - self._q_valsActA_drop)) # Target network does not work well here?
    ## This should be a single column vector
    # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(( T.transpose(T.sum(T.pow(self._actDiff, 2),axis=1) )), (self._diff * (1.0/(1.0-self._discount_factor))))
    # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(( T.reshape(T.sum(T.pow(self._actDiff, 2),axis=1), (self._batch_size, 1) )),
    #                                                                    (self._tmp_diff * (1.0/(1.0-self._discount_factor)))
    # self._actLoss_ = (T.mean(T.pow(self._actDiff, 2),axis=1))
    # Mean squared action difference per sample, weighted by the TD-error
    # column scaled by the discounted-return horizon 1/(1-gamma).
    self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
        (T.mean(T.pow(self._actDiff, 2), axis=1)),
        (self._tmp_diff * (1.0 / (1.0 - self._discount_factor))))
    # self._actLoss = T.sum(self._actLoss)/float(self._batch_size)
    self._actLoss = T.mean(self._actLoss_)
    # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink
    # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2))
    self._policy_grad = T.grad(self._actLoss + self._actor_regularization,
                               self._actionParams)
    ## Clipping the max gradient
    """
    for x in range(len(self._policy_grad)):
        self._policy_grad[x] = T.clip(self._policy_grad[x] , -0.5, 0.5)
    """
    if (self.getSettings()['optimizer'] == 'rmsprop'):
        self._actionUpdates = lasagne.updates.rmsprop(
            self._policy_grad, self._actionParams, self._learning_rate,
            self._rho, self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'momentum'):
        self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       momentum=self._rho)
    elif (self.getSettings()['optimizer'] == 'adam'):
        self._actionUpdates = lasagne.updates.adam(
            self._policy_grad,
            self._actionParams,
            self._learning_rate,
            beta1=0.9,
            beta2=0.999,
            epsilon=self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'adagrad'):
        self._actionUpdates = lasagne.updates.adagrad(
            self._policy_grad,
            self._actionParams,
            self._learning_rate,
            epsilon=self._rms_epsilon)
    else:
        # NOTE(review): unlike the critic branch this does not sys.exit().
        print("Unknown optimization method: ",
              self.getSettings()['optimizer'])

    # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) +
    #   (self._regularization_weight * lasagne.regularization.regularize_network_params(
    #       self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams,
    #           self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
    self._givens_grad = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
        # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
        # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        # self._model.getActionSymbolicVariable(): self._model.getActions(),
    }

    # --- DYNA value-regression graph --------------------------------------
    ### Noisey state updates
    # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._NotFallen
    # self._target_dyna = theano.gradient.disconnected_grad(self._q_func)
    ### _q_valsA because the predicted state is stored in self._model.getStateSymbolicVariable()
    self._diff_dyna = self._dyna_target - self._q_valsNextState
    # loss = 0.5 * self._diff ** 2
    loss = T.pow(self._diff_dyna, 2)
    self._loss_dyna = T.mean(loss)
    self._dyna_grad = T.grad(self._loss_dyna + self._critic_regularization,
                             self._params)
    self._givens_dyna = {
        # self._model.getStateSymbolicVariable(): self._model.getStates(),
        self._model.getResultStateSymbolicVariable():
        self._model.getResultStates(),
        # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        # self._NotFallen: self._NotFallen_shared
        # self._model.getActionSymbolicVariable(): self._actions_shared,
        self._dyna_target: self._dyna_target_shared
    }
    if (self.getSettings()['optimizer'] == 'rmsprop'):
        self._DYNAUpdates = lasagne.updates.rmsprop(
            self._dyna_grad, self._params, self._learning_rate, self._rho,
            self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'momentum'):
        self._DYNAUpdates = lasagne.updates.momentum(self._dyna_grad,
                                                     self._params,
                                                     self._learning_rate,
                                                     momentum=self._rho)
    elif (self.getSettings()['optimizer'] == 'adam'):
        self._DYNAUpdates = lasagne.updates.adam(self._dyna_grad,
                                                 self._params,
                                                 self._learning_rate,
                                                 beta1=0.9,
                                                 beta2=0.999,
                                                 epsilon=self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'adagrad'):
        self._DYNAUpdates = lasagne.updates.adagrad(
            self._dyna_grad,
            self._params,
            self._learning_rate,
            epsilon=self._rms_epsilon)
    else:
        print("Unknown optimization method: ",
              self.getSettings()['optimizer'])

    ## Bellman error (diagnostic only; not optimized directly).
    self._bellman = self._target - self._q_funcTarget

    # --- Advantage diagnostic ---------------------------------------------
    # self._target = self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState )
    ### Give v(s') the next state and v(s) (target) the current state
    self._diff_adv = (self._discount_factor * self._q_func) - (
        self._q_valsTargetNextState)
    # The state/result-state substitutions are deliberately swapped here
    # so the graph above evaluates V on the intended inputs.
    self._diff_adv_givens = {
        self._model.getStateSymbolicVariable():
        self._model.getResultStates(),
        self._model.getResultStateSymbolicVariable():
        self._model.getStates(),
    }

    # Compile all Theano functions defined above.
    A_CACLA.compile(self)
def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
             reward_bound, settings_):
    """Build the symbolic Theano/Lasagne graphs for a PPO-style agent.

    Constructs (symbolically, nothing is evaluated here):
      * the critic TD loss ``mean((r + gamma*V_target(s'))*fallen - V(s))^2``
        and its optimizer updates,
      * a Gaussian policy whose mean/std are the two halves of the actor
        network's output,
      * the PPO importance-ratio surrogate loss with an adaptive KL penalty
        against the target ("old") policy plus an entropy bonus,
      * the ``givens`` dictionaries binding symbolic inputs to the shared
        minibatch variables held by ``model``.

    :param model: container exposing actor/critic Lasagne networks and the
        shared input variables (states, actions, rewards, ...).
    :param n_in: state dimension (forwarded to the base class).
    :param n_out: action dimension (forwarded to the base class).
    :param state_bounds: per-dimension state normalization bounds.
    :param action_bounds: per-dimension action bounds.
    :param reward_bound: reward scaling bound.
    :param settings_: algorithm settings dictionary.
    """
    super(PPOCritic2, self).__init__(model, n_in, n_out, state_bounds,
                                     action_bounds, reward_bound, settings_)
    # create a small convolutional neural network
    # Terminal-state mask: multiplies the bootstrap target, zeroing it on
    # "fallen" steps (int8 column so float32 * int8 stays float32).
    self._Fallen = T.bcol("Fallen")
    ## because float64 <= float32 * int32, need to use int16 or int8
    self._Fallen.tag.test_value = np.zeros((self._batch_size, 1),
                                           dtype=np.dtype('int8'))
    self._fallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                 dtype='int8'),
                                        broadcastable=(False, True))
    # Advantage input; declared but its feed is commented out in _actGivens
    # below — this class derives its advantage from the TD error instead.
    self._advantage = T.col("Advantage")
    self._advantage.tag.test_value = np.zeros(
        (self._batch_size, 1),
        dtype=np.dtype(self.getSettings()['float_type']))
    self._advantage_shared = theano.shared(np.zeros(
        (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                           broadcastable=(False, True))
    # Adaptive KL-penalty coefficient, seeded from the settings file.
    self._KL_Weight = T.scalar("KL_Weight")
    self._KL_Weight.tag.test_value = np.zeros(
        (1), dtype=np.dtype(self.getSettings()['float_type']))[0]
    self._kl_weight_shared = theano.shared(
        np.ones((1), dtype=self.getSettings()['float_type'])[0])
    self._kl_weight_shared.set_value(
        self.getSettings()['previous_value_regularization_weight'])
    """
    self._target_shared = theano.shared(
        np.zeros((self._batch_size, 1), dtype='float64'),
        broadcastable=(False, True))
    """
    self._critic_regularization_weight = self.getSettings(
    )["critic_regularization_weight"]
    self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
    # primary network
    self._model = model
    # Target network
    self._modelTarget = copy.deepcopy(model)
    # Critic outputs: deterministic versions for targets/evaluation,
    # stochastic (dropout) versions for the *_drop loss variants.
    self._q_valsA = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    self._q_valsA_drop = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)
    self._q_valsTargetNextState = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getResultStateSymbolicVariable(),
        deterministic=True)
    self._q_valsTarget = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    self._q_valsTarget_drop = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)
    # Actor output is split in half: first _action_length units are the
    # Gaussian mean, the rest the (pre-scaling) standard deviation.
    self._q_valsActA = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)[:, :self._action_length]
    self._q_valsActASTD = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)[:, self._action_length:]
    ## prevent value from being 0
    self._q_valsActASTD = (self._q_valsActASTD *
                           self.getSettings()['exploration_rate']) + 5e-2
    self._q_valsActTarget = lasagne.layers.get_output(
        self._modelTarget.getActorNetwork(),
        self._model.getStateSymbolicVariable())[:, :self._action_length]
    self._q_valsActTargetSTD = lasagne.layers.get_output(
        self._modelTarget.getActorNetwork(),
        self._model.getStateSymbolicVariable())[:, self._action_length:]
    self._q_valsActTargetSTD = (
        self._q_valsActTargetSTD *
        self.getSettings()['exploration_rate']) + 5e-2
    self._q_valsActA_drop = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)
    # Convenience aliases used throughout the class.
    self._q_func = self._q_valsA
    self._q_funcTarget = self._q_valsTarget
    self._q_func_drop = self._q_valsA_drop
    self._q_funcTarget_drop = self._q_valsTarget_drop
    self._q_funcAct = self._q_valsActA
    self._q_funcAct_drop = self._q_valsActA_drop
    # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._Fallen
    # Bellman target (r + gamma * V_target(s')) masked by the fall flag.
    self._target = T.mul(
        T.add(self._model.getRewardSymbolicVariable(),
              T.mul(self._discount_factor, self._q_valsTargetNextState)),
        self._Fallen)
    self._diff = self._target - self._q_func
    self._diff_drop = self._target - self._q_func_drop
    # loss = 0.5 * self._diff ** 2
    loss = T.pow(self._diff, 2)
    self._loss = T.mean(loss)
    self._loss_drop = T.mean(0.5 * self._diff_drop**2)
    self._params = lasagne.layers.helper.get_all_params(
        self._model.getCriticNetwork())
    self._actionParams = lasagne.layers.helper.get_all_params(
        self._model.getActorNetwork())
    # Substitutions binding symbolic inputs to the shared minibatch data.
    self._givens_ = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
        self._model.getResultStateSymbolicVariable():
        self._model.getResultStates(),
        self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        self._Fallen: self._fallen_shared
        # self._model.getActionSymbolicVariable(): self._actions_shared,
    }
    self._actGivens = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
        self._model.getResultStateSymbolicVariable():
        self._model.getResultStates(),
        self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        self._model.getActionSymbolicVariable(): self._model.getActions(),
        self._Fallen: self._fallen_shared,
        # self._advantage: self._advantage_shared,
        self._KL_Weight: self._kl_weight_shared
    }
    self._critic_regularization = (
        self._critic_regularization_weight *
        lasagne.regularization.regularize_network_params(
            self._model.getCriticNetwork(), lasagne.regularization.l2))
    # self._actor_regularization = ( (self._regularization_weight * lasagne.regularization.regularize_network_params(
    # self._model.getActorNetwork(), lasagne.regularization.l2)) )
    # Mean KL(old policy || new policy), used for the adaptive PPO penalty.
    self._kl_firstfixed = T.mean(
        kl(self._q_valsActTarget, self._q_valsActTargetSTD,
           self._q_valsActA, self._q_valsActASTD, self._action_length))
    # self._actor_regularization = (( self.getSettings()['previous_value_regularization_weight']) * self._kl_firstfixed )
    # Adaptive KL penalty plus a hard quadratic barrier (weight 10) that
    # switches on once the KL exceeds the configured threshold.
    self._actor_regularization = (
        (self._KL_Weight) * self._kl_firstfixed) + (
            10 * (self._kl_firstfixed >
                  self.getSettings()['kl_divergence_threshold']) *
            T.square(self._kl_firstfixed -
                     self.getSettings()['kl_divergence_threshold']))
    # SGD update
    # self._updates_ = lasagne.updates.rmsprop(self._loss, self._params, self._learning_rate, self._rho,
    # self._rms_epsilon)
    # NOTE(review): the rmsprop branch uses self._learning_rate while the
    # momentum/adam branches use self._critic_learning_rate — confirm this
    # asymmetry is intentional.
    if (self.getSettings()['optimizer'] == 'rmsprop'):
        self._updates_ = lasagne.updates.rmsprop(
            self._loss  # + self._critic_regularization
            ,
            self._params,
            self._learning_rate,
            self._rho,
            self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'momentum'):
        self._updates_ = lasagne.updates.momentum(
            self._loss  # + self._critic_regularization
            ,
            self._params,
            self._critic_learning_rate,
            momentum=self._rho)
    elif (self.getSettings()['optimizer'] == 'adam'):
        self._updates_ = lasagne.updates.adam(
            self._loss  # + self._critic_regularization
            ,
            self._params,
            self._critic_learning_rate,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08)
    else:
        print("Unknown optimization method: ",
              self.getSettings()['optimizer'])
        sys.exit(-1)
    ## TD update
    """
    if (self.getSettings()['optimizer'] == 'rmsprop'):
        self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) +
            self._critic_regularization, self._params,
            self._critic_learning_rate * -T.mean(self._diff), self._rho,
            self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'momentum'):
        self._updates_ = lasagne.updates.momentum(T.mean(self._q_func) +
            self._critic_regularization, self._params,
            self._critic_learning_rate * -T.mean(self._diff),
            momentum=self._rho)
    elif ( self.getSettings()['optimizer'] == 'adam'):
        self._updates_ = lasagne.updates.adam(T.mean(self._q_func),
            self._params, self._critic_learning_rate * -T.mean(self._diff),
            beta1=0.9, beta2=0.999, epsilon=1e-08)
    else:
        print ("Unknown optimization method: ",
            self.getSettings()['optimizer'])
        sys.exit(-1)
    """
    ## Need to perform an element wise operation or replicate _diff for this to work properly.
    # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA),
    # theano.tensor.tile((self._advantage * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here?
    ## advantage = Q(a,s) - V(s) = (r + gamma*V(s')) - V(s)
    # self._advantage = (((self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState)) * self._Fallen)) - self._q_func
    # TD error rescaled by 1/(1-gamma) to bring it back to reward scale.
    self._Advantage = self._diff * (1.0 / (1.0 - self._discount_factor)
                                    )  ## scale back to same as rewards
    # Gaussian log-likelihood of the taken actions under the current and
    # the target (old) policies; their difference gives the PPO ratio.
    self._log_prob = loglikelihood(self._model.getActionSymbolicVariable(),
                                   self._q_valsActA, self._q_valsActASTD,
                                   self._action_length)
    self._log_prob_target = loglikelihood(
        self._model.getActionSymbolicVariable(), self._q_valsActTarget,
        self._q_valsActTargetSTD, self._action_length)
    # self._prob = likelihood(self._model.getActionSymbolicVariable(), self._q_valsActA, self._q_valsActASTD, self._action_length)
    # self._prob_target = likelihood(self._model.getActionSymbolicVariable(), self._q_valsActTarget, self._q_valsActTargetSTD, self._action_length)
    # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target).dot(self._Advantage)) )
    # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target) * (self._Advantage)) )
    # self._actLoss_ = ( ((self._log_prob) * self._Advantage) )
    # self._actLoss_ = ( ((self._log_prob)) ) ## This does the sum already
    # self._actLoss_ = ( (self._log_prob).dot( self._Advantage) )
    # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._prob / self._prob_target), self._Advantage)
    # Surrogate objective: importance ratio exp(logp - logp_old) times the
    # advantage, element-wise.
    self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
        T.exp(self._log_prob - self._log_prob_target), self._Advantage)
    # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._log_prob), self._Advantage)
    # self._actLoss_ = T.mean(self._log_prob)
    # self._policy_entropy = 0.5 * T.mean(T.log(2 * np.pi * self._q_valsActASTD ) + 1 )
    ## - because update computes gradient DESCENT updates
    # self._actLoss = -1.0 * ((T.mean(self._actLoss_)) + (self._actor_regularization ))
    # self._entropy = -1. * T.sum(T.log(self._q_valsActA + 1e-8) * self._q_valsActA, axis=1, keepdims=True)
    ## - because update computes gradient DESCENT updates
    # Negated surrogate + KL penalty - small entropy bonus (to maximize).
    self._actLoss = (-1.0 * T.mean(self._actLoss_)) + (
        1.0 * self._actor_regularization) + (-1e-3 *
                                             entropy(self._q_valsActASTD))
    # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink
    # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2))
    self._policy_grad = T.grad(self._actLoss, self._actionParams)
    if (self.getSettings()['optimizer'] == 'rmsprop'):
        self._actionUpdates = lasagne.updates.rmsprop(
            self._policy_grad, self._actionParams, self._learning_rate,
            self._rho, self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'momentum'):
        self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       momentum=self._rho)
    elif (self.getSettings()['optimizer'] == 'adam'):
        self._actionUpdates = lasagne.updates.adam(self._policy_grad,
                                                   self._actionParams,
                                                   self._learning_rate,
                                                   beta1=0.9,
                                                   beta2=0.999,
                                                   epsilon=1e-08)
    else:
        print("Unknown optimization method: ",
              self.getSettings()['optimizer'])
    # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) +
    # (self._regularization_weight * lasagne.regularization.regularize_network_params(
    # self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams,
    # self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
    self._givens_grad = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
        # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
        # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        # self._model.getActionSymbolicVariable(): self._actions_shared,
    }
    ## Bellman error
    self._bellman = self._target - self._q_funcTarget
    PPOCritic2.compile(self)
def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
             reward_bound, settings_):
    """Build the symbolic Theano/Lasagne graphs for a TRPO agent.

    Constructs (symbolically, nothing is evaluated here):
      * a TD-style critic update (gradient of mean V scaled by the negated
        mean TD error, plus L2 regularization),
      * a Gaussian policy (mean/std halves of the actor output, optionally
        a fixed std via the ``use_fixed_std`` setting),
      * the importance-ratio surrogate loss with an externally supplied
        advantage (fed through ``self._advantage_shared``),
      * the natural-gradient machinery: flattened policy gradient
        (``self.pg``) and Fisher-vector product (``self.fvp``) built from
        the KL against a gradient-detached copy of the current policy.

    :param model: container exposing actor/critic Lasagne networks and the
        shared input variables (states, actions, rewards, ...).
    :param n_in: state dimension (forwarded to the base class).
    :param n_out: action dimension (forwarded to the base class).
    :param state_bounds: per-dimension state normalization bounds.
    :param action_bounds: per-dimension action bounds.
    :param reward_bound: reward scaling bound.
    :param settings_: algorithm settings dictionary.
    """
    super(TRPO, self).__init__(model, n_in, n_out, state_bounds,
                               action_bounds, reward_bound, settings_)
    # create a small convolutional neural network
    # self._Fallen = T.bcol("Fallen")
    ## because float64 <= float32 * int32, need to use int16 or int8
    # self._Fallen.tag.test_value = np.zeros((self._batch_size,1),dtype=np.dtype('int8'))
    # self._fallen_shared = theano.shared(
    # np.zeros((self._batch_size, 1), dtype='int8'),
    # broadcastable=(False, True))
    # Externally computed advantage estimates, fed per minibatch.
    self._advantage = T.col("Advantage")
    self._advantage.tag.test_value = np.zeros(
        (self._batch_size, 1),
        dtype=np.dtype(self.getSettings()['float_type']))
    self._advantage_shared = theano.shared(np.zeros(
        (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                           broadcastable=(False, True))
    # KL-penalty coefficient, seeded from the settings file.
    self._KL_Weight = T.scalar("KL_Weight")
    self._KL_Weight.tag.test_value = np.zeros(
        (1), dtype=np.dtype(self.getSettings()['float_type']))[0]
    self._kl_weight_shared = theano.shared(
        np.ones((1), dtype=self.getSettings()['float_type'])[0])
    self._kl_weight_shared.set_value(
        self.getSettings()['previous_value_regularization_weight'])
    """
    self._target_shared = theano.shared(
        np.zeros((self._batch_size, 1), dtype='float64'),
        broadcastable=(False, True))
    """
    self._critic_regularization_weight = self.getSettings(
    )["critic_regularization_weight"]
    self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
    # primary network
    self._model = model
    # Target network
    self._modelTarget = copy.deepcopy(model)
    # Critic outputs: deterministic for targets/evaluation, stochastic
    # (dropout) for the *_drop variants.
    self._q_valsA = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    self._q_valsA_drop = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)
    self._q_valsNextState = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getResultStateSymbolicVariable(),
        deterministic=True)
    self._q_valsTargetNextState = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getResultStateSymbolicVariable(),
        deterministic=True)
    self._q_valsTarget = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    self._q_valsTarget_drop = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)
    # Actor output split: first half Gaussian mean, second half std.
    self._q_valsActA = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)[:, :self._action_length]
    # self._q_valsActA = scale_action(self._q_valsActA, self._action_bounds)
    self._q_valsActASTD = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)[:, self._action_length:]
    ## prevent value from being 0
    if ('use_fixed_std' in self.getSettings()
            and (self.getSettings()['use_fixed_std'])):
        # Constant exploration std instead of the learned one.
        self._q_valsActASTD = (T.ones_like(
            self._q_valsActA)) * self.getSettings()['exploration_rate']
        # self._q_valsActASTD = ( T.ones_like(self._q_valsActA)) * self.getSettings()['exploration_rate']
    else:
        self._q_valsActASTD = (
            (self._q_valsActASTD) *
            self.getSettings()['exploration_rate']) + 2e-2
    self._q_valsActTarget = lasagne.layers.get_output(
        self._modelTarget.getActorNetwork(),
        self._model.getStateSymbolicVariable())[:, :self._action_length]
    # self._q_valsActTarget = scale_action(self._q_valsActTarget, self._action_bounds)
    self._q_valsActTargetSTD = lasagne.layers.get_output(
        self._modelTarget.getActorNetwork(),
        self._model.getStateSymbolicVariable())[:, self._action_length:]
    if ('use_fixed_std' in self.getSettings()
            and (self.getSettings()['use_fixed_std'])):
        self._q_valsActTargetSTD = (T.ones_like(
            self._q_valsActTarget)) * self.getSettings(
            )['exploration_rate']
        # self._q_valsActTargetSTD = (self._action_std_scaling * T.ones_like(self._q_valsActTarget)) * self.getSettings()['exploration_rate']
    else:
        self._q_valsActTargetSTD = (
            (self._q_valsActTargetSTD) *
            self.getSettings()['exploration_rate']) + 2e-2
    self._q_valsActA_drop = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)
    # Convenience aliases used throughout the class.
    self._q_func = self._q_valsA
    self._q_funcTarget = self._q_valsTarget
    self._q_func_drop = self._q_valsA_drop
    self._q_funcTarget_drop = self._q_valsTarget_drop
    self._q_funcAct = self._q_valsActA
    self._q_funcAct_drop = self._q_valsActA_drop
    # self._target = (self._model.getRewardSymbolicVariable() + (np.array([self._discount_factor] ,dtype=np.dtype(self.getSettings()['float_type']))[0] * self._q_valsTargetNextState )) * self._NotFallen
    # self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsTargetNextState )), self._NotFallen) + (self._NotFallen - 1)
    # Bellman target r + gamma * V_target(s'); no terminal mask here.
    self._target = self._model.getRewardSymbolicVariable() + (
        self._discount_factor * self._q_valsTargetNextState)
    self._diff = self._target - self._q_func
    self._diff_drop = self._target - self._q_func_drop
    # loss = 0.5 * self._diff ** 2
    loss = T.pow(self._diff, 2)
    self._loss = T.mean(loss)
    self._loss_drop = T.mean(0.5 * self._diff_drop**2)
    self._params = lasagne.layers.helper.get_all_params(
        self._model.getCriticNetwork())
    #if ( 'use_fixed_std' in self.getSettings() and ( self.getSettings()['use_fixed_std'])):
    # self._actionParams = lasagne.layers.helper.get_all_params(self._model.getActorNetwork())
    #else:
    self._actionParams = lasagne.layers.helper.get_all_params(
        self._model.getActorNetwork())
    # Substitutions binding symbolic inputs to the shared minibatch data.
    self._givens_ = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
        self._model.getResultStateSymbolicVariable():
        self._model.getResultStates(),
        self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        # self._NotFallen: self._NotFallen_shared
        # self._model.getActionSymbolicVariable(): self._actions_shared,
    }
    self._actGivens = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
        # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
        # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        self._model.getActionSymbolicVariable(): self._model.getActions(),
        # self._Fallen: self._fallen_shared,
        self._advantage: self._advantage_shared,
        # self._KL_Weight: self._kl_weight_shared
    }
    self._critic_regularization = (
        self._critic_regularization_weight *
        lasagne.regularization.regularize_network_params(
            self._model.getCriticNetwork(), lasagne.regularization.l2))
    # self._actor_regularization = ( (self._regularization_weight * lasagne.regularization.regularize_network_params(
    # self._model.getActorNetwork(), lasagne.regularization.l2)) )
    # Mean KL(old policy || new policy).
    self._kl_firstfixed = kl(self._q_valsActTarget,
                             self._q_valsActTargetSTD, self._q_valsActA,
                             self._q_valsActASTD,
                             self._action_length).mean()
    # self._actor_regularization = (( self.getSettings()['previous_value_regularization_weight']) * self._kl_firstfixed )
    # KL penalty plus a quadratic barrier beyond the KL threshold (defined
    # here but note the actor loss below does not add it).
    self._actor_regularization = (
        (self._KL_Weight) * self._kl_firstfixed) + (
            (self._kl_firstfixed >
             self.getSettings()['kl_divergence_threshold']) *
            T.square(self._kl_firstfixed -
                     self.getSettings()['kl_divergence_threshold']))
    # SGD update
    # self._updates_ = lasagne.updates.rmsprop(self._loss + (self._regularization_weight * lasagne.regularization.regularize_network_params(
    # self._model.getCriticNetwork(), lasagne.regularization.l2)), self._params, self._learning_rate, self._rho,
    # self._rms_epsilon)
    # TD update: optimize mean(V) with the learning rate scaled by the
    # negated mean TD error, which implements a TD(0)-style critic step.
    if (self.getSettings()['optimizer'] == 'rmsprop'):
        self._updates_ = lasagne.updates.rmsprop(
            T.mean(self._q_func) + self._critic_regularization,
            self._params,
            self._critic_learning_rate * -T.mean(self._diff), self._rho,
            self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'momentum'):
        self._updates_ = lasagne.updates.momentum(
            T.mean(self._q_func) + self._critic_regularization,
            self._params,
            self._critic_learning_rate * -T.mean(self._diff),
            momentum=self._rho)
    elif (self.getSettings()['optimizer'] == 'adam'):
        self._updates_ = lasagne.updates.adam(
            T.mean(self._q_func) + self._critic_regularization,
            self._params,
            self._critic_learning_rate * -T.mean(self._diff),
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08)
    else:
        print("Unknown optimization method: ",
              self.getSettings()['optimizer'])
        sys.exit(-1)
    ## Need to perform an element wise operation or replicate _diff for this to work properly.
    # self._actDiff = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._model.getActionSymbolicVariable() - self._q_valsActA),
    # theano.tensor.tile((self._advantage * (1.0/(1.0-self._discount_factor))), self._action_length)) # Target network does not work well here?
    ## advantage = Q(a,s) - V(s) = (r + gamma*V(s')) - V(s)
    # self._advantage = (((self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState)) * self._Fallen)) - self._q_func
    # Advantage comes straight from the externally fed shared variable.
    self._Advantage = self._advantage  # * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards
    # self._Advantage = self._diff # * (1.0/(1.0-self._discount_factor)) ## scale back to same as rewards
    self._log_prob = loglikelihood(self._model.getActionSymbolicVariable(),
                                   self._q_valsActA, self._q_valsActASTD,
                                   self._action_length)
    self._log_prob_target = loglikelihood(
        self._model.getActionSymbolicVariable(), self._q_valsActTarget,
        self._q_valsActTargetSTD, self._action_length)
    # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target).dot(self._Advantage)) )
    # self._actLoss_ = ( (T.exp(self._log_prob - self._log_prob_target) * (self._Advantage)) )
    # self._actLoss_ = ( ((self._log_prob) * self._Advantage) )
    # self._actLoss_ = ( ((self._log_prob)) ) ## This does the sum already
    # self._actLoss_ = ( (self._log_prob).dot( self._Advantage) )
    # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(T.exp(self._log_prob - self._log_prob_target), self._Advantage)
    # Surrogate objective: importance ratio times advantage, element-wise.
    self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
        T.exp(self._log_prob - self._log_prob_target), self._Advantage)
    # self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)((self._log_prob), self._Advantage)
    # self._actLoss_ = T.mean(self._log_prob)
    # self._policy_entropy = 0.5 * T.mean(T.log(2 * np.pi * self._q_valsActASTD ) + 1 )
    ## - because update computes gradient DESCENT updates
    # self._actLoss = -1.0 * ((T.mean(self._actLoss_)) + (self._actor_regularization ))
    # self._entropy = -1. * T.sum(T.log(self._q_valsActA + 1e-8) * self._q_valsActA, axis=1, keepdims=True)
    ## - because update computes gradient DESCENT updates
    self._actLoss = (-1.0 * T.mean(self._actLoss_))
    # self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop ** 2)/float(self._batch_size)) # because the number of rows can shrink
    # self._actLoss_drop = (T.mean(0.5 * self._actDiff_drop ** 2))
    self._policy_grad = T.grad(self._actLoss, self._actionParams)
    if (self.getSettings()['optimizer'] == 'rmsprop'):
        self._actionUpdates = lasagne.updates.rmsprop(
            self._policy_grad, self._actionParams, self._learning_rate,
            self._rho, self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'momentum'):
        self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       momentum=self._rho)
    elif (self.getSettings()['optimizer'] == 'adam'):
        self._actionUpdates = lasagne.updates.adam(self._policy_grad,
                                                   self._actionParams,
                                                   self._learning_rate,
                                                   beta1=0.9,
                                                   beta2=0.999,
                                                   epsilon=1e-08)
    else:
        print("Unknown optimization method: ",
              self.getSettings()['optimizer'])
    # --- TRPO natural-gradient pieces ---
    # N: symbolic minibatch size used to average the surrogate and KL.
    N = self._model.getStateSymbolicVariable().shape[0]
    params = self._actionParams
    surr = self._actLoss * (1.0 / N)
    # Flattened policy gradient.
    self.pg = flatgrad(surr, params)
    # KL between a gradient-detached snapshot of the current policy and
    # the policy itself; its Hessian-vector product gives the Fisher-vector
    # product used by conjugate gradient.
    prob_mean_fixed = theano.gradient.disconnected_grad(self._q_valsActA)
    prob_std_fixed = theano.gradient.disconnected_grad(self._q_valsActASTD)
    kl_firstfixed = kl(prob_mean_fixed, prob_std_fixed, self._q_valsActA,
                       self._q_valsActASTD,
                       self._action_length).sum() / N
    grads = T.grad(kl_firstfixed, params)
    # Flat tangent vector, reshaped per-parameter to form grad(KL) . v.
    self.flat_tangent = T.vector(name="flat_tan")
    shapes = [var.get_value(borrow=True).shape for var in params]
    start = 0
    tangents = []
    for shape in shapes:
        size = np.prod(shape)
        tangents.append(
            T.reshape(self.flat_tangent[start:start + size], shape))
        start += size
    self.gvp = T.add(
        *[T.sum(g * tangent)
          for (g, tangent) in zipsame(grads, tangents)])  #pylint: disable=E1111
    # Fisher-vector product
    self.fvp = flatgrad(self.gvp, params)
    self.ent = entropy(self._q_valsActASTD).mean()
    self.kl = kl(self._q_valsActTarget, self._q_valsActTargetSTD,
                 self._q_valsActA, self._q_valsActASTD,
                 self._action_length).mean()
    self.losses = [surr, self.kl, self.ent]
    self.loss_names = ["surr", "kl", "ent"]
    # Explicit input lists for the compiled TRPO functions.
    self.args = [
        self._model.getStateSymbolicVariable(),
        self._model.getActionSymbolicVariable(),
        self._advantage
        # self._q_valsActTarget_
    ]
    self.args_fvp = [
        self._model.getStateSymbolicVariable(),
        # self._model.getActionSymbolicVariable()
        # self._advantage,
        # self._q_valsActTarget_
    ]
    # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) +
    # (self._regularization_weight * lasagne.regularization.regularize_network_params(
    # self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams,
    # self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
    self._givens_grad = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
        # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
        # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
        # self._model.getActionSymbolicVariable(): self._actions_shared,
    }
    ## Bellman error
    self._bellman = self._target - self._q_funcTarget
    TRPO.compile(self)
def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
             reward_bound, settings_):
    """Build the symbolic graphs for policy distillation from expert policies.

    Loads the pre-compiled expert policy networks listed in
    ``settings['expert_policy_files']`` (deduplicating identical paths to
    save memory), sets up replay buffers for actor data, and constructs a
    CACLA-style actor loss: squared distance between the taken action and
    the current policy output, weighted element-wise by ``self._tmp_diff``,
    alongside a standard TD critic loss.

    :param model: container exposing actor/critic Lasagne networks and the
        shared input variables (states, actions, rewards, ...).
    :param n_in: state dimension (forwarded to the base class).
    :param n_out: action dimension (forwarded to the base class).
    :param state_bounds: per-dimension state normalization bounds.
    :param action_bounds: per-dimension action bounds.
    :param reward_bound: reward scaling bound.
    :param settings_: algorithm settings dictionary.
    """
    super(Distillation, self).__init__(model, n_in, n_out, state_bounds,
                                       action_bounds, reward_bound,
                                       settings_)
    # create a small convolutional neural network
    ### Load expert policy files
    # NOTE(review): dill.load on these files executes arbitrary pickled
    # code — the expert files must be trusted.
    self._expert_policies = []
    file_name_ = ""
    for i in range(len(self.getSettings()['expert_policy_files'])):
        file_name = self.getSettings(
        )['expert_policy_files'][i] + '/' + self.getSettings(
        )['model_type'] + '/' + getAgentName() + '.pkl'
        if (file_name_ == file_name):
            ## To help save memory when experts are the same
            self._expert_policies.append(model_)
        else:
            print("Loading pre compiled network: ", file_name)
            f = open(file_name, 'rb')
            model_ = dill.load(f)
            f.close()
            self._expert_policies.append(
                model_)  # expert model, load the 2 expert models
        file_name_ = file_name
    # Buffers accumulating on-policy samples for the actor update.
    self._actor_buffer_states = []
    self._actor_buffer_result_states = []
    self._actor_buffer_actions = []
    self._actor_buffer_rewards = []
    self._actor_buffer_falls = []
    self._actor_buffer_diff = []
    # Terminal-state mask (declared; not fed in the givens below).
    self._NotFallen = T.bcol("Not_Fallen")
    ## because float64 <= float32 * int32, need to use int16 or int8
    self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1),
                                              dtype=np.dtype('int8'))
    self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                    dtype='int8'),
                                           broadcastable=(False, True))
    # Per-sample weight for the actor loss (e.g. a TD/advantage signal).
    self._tmp_diff = T.col("Tmp_Diff")
    self._tmp_diff.tag.test_value = np.zeros(
        (self._batch_size, 1),
        dtype=np.dtype(self.getSettings()['float_type']))
    self._tmp_diff_shared = theano.shared(
        np.zeros((self._batch_size, 1),
                 dtype=self.getSettings()['float_type']),
        broadcastable=(False, True))  # shared variable initialized to zeros
    self._critic_regularization_weight = self.getSettings(
    )["critic_regularization_weight"]
    self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
    ## Target network
    # The target model is the model being updated toward.
    # NOTE(review): self._model is used below but not assigned here —
    # presumably set by the base-class constructor; confirm.
    self._modelTarget = copy.deepcopy(model)
    self._q_valsA = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)  # deterministic critic output for the state
    self._q_valsA_drop = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)  # stochastic (dropout) critic output
    self._q_valsNextState = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getResultStateSymbolicVariable(),
        deterministic=True)  # critic value of the next state
    self._q_valsTargetNextState = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getResultStateSymbolicVariable(),
        deterministic=True)  # target model's value of the next state
    self._q_valsTarget = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)  # target model's value of the current state
    self._q_valsTarget_drop = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)  # target model's stochastic output
    self._q_valsActA = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    self._q_valsActTarget = lasagne.layers.get_output(
        self._modelTarget.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)  # remove the random
    self._q_valsActA_drop = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)  # stochastic actor output
    # Convenience aliases used throughout the class.
    self._q_func = self._q_valsA
    self._q_funcTarget = self._q_valsTarget
    self._q_func_drop = self._q_valsA_drop
    self._q_funcTarget_drop = self._q_valsTarget_drop
    self._q_funcAct = self._q_valsActA
    self._q_funcAct_drop = self._q_valsActA_drop
    # Bellman target r + gamma * V_target(s').
    self._target = self._model.getRewardSymbolicVariable() + (
        self._discount_factor * self._q_valsTargetNextState
    )  # getRewardSymbolicVariable() retrieves the rewards; getRewards() is
    # self._rewards_shared, continually updated during training
    self._diff = self._target - self._q_func
    self._diff_drop = self._target - self._q_func_drop  # TD target minus the dropout critic output
    loss = T.pow(self._diff, 2)
    self._loss = T.mean(loss)  # mean squared TD error
    self._loss_drop = T.mean(0.5 * self._diff_drop**2)
    self._params = lasagne.layers.helper.get_all_params(
        self._model.getCriticNetwork())
    self._actionParams = lasagne.layers.helper.get_all_params(
        self._model.getActorNetwork())
    # Substitutions binding symbolic inputs to the shared minibatch data.
    self._givens_ = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
        self._model.getResultStateSymbolicVariable():
        self._model.getResultStates(),
        self._model.getRewardSymbolicVariable(): self._model.getRewards()
    }
    self._actGivens = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
        self._model.getActionSymbolicVariable(): self._model.getActions(),
        self._tmp_diff: self._tmp_diff_shared
    }
    self._critic_regularization = (
        self._critic_regularization_weight *
        lasagne.regularization.regularize_network_params(
            self._model.getCriticNetwork(), lasagne.regularization.l2))
    self._actor_regularization = (
        (self._regularization_weight *
         lasagne.regularization.regularize_network_params(
             self._model.getActorNetwork(), lasagne.regularization.l2)))
    # Optionally penalize drift from the previous (target) policy, either
    # by parameter distance or by KL divergence with fixed stds.
    if (self.getSettings()['use_previous_value_regularization']):
        self._actor_regularization = self._actor_regularization + (
            (self.getSettings()['previous_value_regularization_weight']) *
            change_penalty(self._model.getActorNetwork(),
                           self._modelTarget.getActorNetwork()))
    elif ('regularization_type' in self.getSettings() and
          (self.getSettings()['regularization_type'] == 'KL_Divergence')):
        self._kl_firstfixed = T.mean(
            kl(
                self._q_valsActTarget,
                T.ones_like(self._q_valsActTarget) *
                self.getSettings()['exploration_rate'], self._q_valsActA,
                T.ones_like(self._q_valsActA) *
                self.getSettings()['exploration_rate'],
                self._action_length))
        self._actor_regularization = (self._kl_firstfixed) * (
            self.getSettings()['kl_divergence_threshold'])
        print("Using regularization type : ",
              self.getSettings()['regularization_type'])
    # SGD update
    self._value_grad = T.grad(self._loss + self._critic_regularization,
                              self._params)
    # NOTE(review): rmsprop uses self._learning_rate while the other
    # branches use self._critic_learning_rate, and adam uses beta2=0.9
    # (not the usual 0.999) — confirm both are intentional.
    if (self.getSettings()['optimizer'] == 'rmsprop'):
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.rmsprop(self._value_grad,
                                                 self._params,
                                                 self._learning_rate,
                                                 self._rho,
                                                 self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'momentum'):
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.momentum(
            self._value_grad,
            self._params,
            self._critic_learning_rate,
            momentum=self._rho)
    elif (self.getSettings()['optimizer'] == 'adam'):
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.adam(self._value_grad,
                                              self._params,
                                              self._critic_learning_rate,
                                              beta1=0.9,
                                              beta2=0.9,
                                              epsilon=self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'adagrad'):
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.adagrad(
            self._value_grad,
            self._params,
            self._critic_learning_rate,
            epsilon=self._rms_epsilon)
    else:
        print("Unknown optimization method: ",
              self.getSettings()['optimizer'])
        sys.exit(-1)
    ## TD update
    ## Need to perform an element wise operation or replicate _diff for this to work properly.
    # Action taken minus the current (dropout) policy output.
    self._actDiff = (self._model.getActionSymbolicVariable() -
                     self._q_valsActA_drop)
    # CACLA-style loss: per-sample squared action error weighted by the
    # externally supplied tmp_diff signal.
    self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
        (T.mean(T.pow(self._actDiff, 2), axis=1)), (self._tmp_diff))
    self._actLoss = T.mean(self._actLoss_)
    self._policy_grad = T.grad(self._actLoss + self._actor_regularization,
                               self._actionParams)
    ## Clipping the max gradient
    if (self.getSettings()['optimizer'] == 'rmsprop'):
        self._actionUpdates = lasagne.updates.rmsprop(
            self._policy_grad, self._actionParams, self._learning_rate,
            self._rho, self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'momentum'):
        self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       momentum=self._rho)
    elif (self.getSettings()['optimizer'] == 'adam'):
        self._actionUpdates = lasagne.updates.adam(
            self._policy_grad,
            self._actionParams,
            self._learning_rate,
            beta1=0.9,
            beta2=0.999,
            epsilon=self._rms_epsilon)
    elif (self.getSettings()['optimizer'] == 'adagrad'):
        self._actionUpdates = lasagne.updates.adagrad(
            self._policy_grad,
            self._actionParams,
            self._learning_rate,
            epsilon=self._rms_epsilon)
    else:
        print("Unknown optimization method: ",
              self.getSettings()['optimizer'])
    self._givens_grad = {
        self._model.getStateSymbolicVariable(): self._model.getStates()
    }
    ## Bellman error
    self._bellman = self._target - self._q_funcTarget
    ### Give v(s') the next state and v(s) (target) the current state
    # gamma * v(current input) - v_target(next input); the swapped givens
    # below feed result-states into the "state" slot and vice versa.
    self._diff_adv = (self._discount_factor * self._q_func) - (
        self._q_valsTargetNextState)
    self._diff_adv_givens = {
        self._model.getStateSymbolicVariable():
        self._model.getResultStates(),
        self._model.getResultStateSymbolicVariable():
        self._model.getStates(),
    }
    Distillation.compile(self)