def getAdvantageGrads(self, states, next_states, alreadyNormed=False):
    """ States are normalized here unless alreadyNormed is True. """
    if (alreadyNormed == False):
        states = norm_state(states, self._state_bounds)
        next_states = norm_state(next_states, self._state_bounds)
    states = np.array(states, dtype=self._settings['float_type'])
    # Cast next_states as well so both inputs share the configured float type.
    next_states = np.array(next_states, dtype=self._settings['float_type'])
    self._model.setStates(states)
    self._model.setResultStates(next_states)
    return self._get_grad()
def predict(self,
            state,
            deterministic_=True,
            evaluation_=False,
            p=None,
            sim_index=None,
            bootstrapping=False):
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    # The actor outputs [mean, std] columns; take the mean columns and
    # scale them back into the environment's action range.
    action_ = scale_action(
        self._model.getActorNetwork().predict(
            state, batch_size=1)[:, :self._action_length],
        self._action_bounds)
    return action_
def q_value(self, state):
    """ Returns the scaled value estimate; state should NOT be normalized. """
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    self._modelTarget.setStates(state)
    # Map the normalized value back to reward units and undo the
    # discounting; the scaling is the same whether or not parameter
    # scaling is disabled.
    value = scale_reward(self._q_val(), self.getRewardBounds()) * (
        1.0 / (1.0 - self.getSettings()['discount_factor']))
    return value
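# A minimal sketch (not part of the class) of the value rescaling used
# above, assuming scale_reward is a linear map from the normalized range
# [-1, 1] back to the reward bounds; _scale_reward_sketch is a
# hypothetical stand-in that only illustrates the arithmetic.
def _scale_reward_sketch(r_norm, reward_bounds):
    low, high = reward_bounds
    # Linear map from [-1, 1] to [low, high].
    return (r_norm + 1.0) * 0.5 * (high - low) + low

# Example: a normalized value of 0.5 with reward bounds (0, 1) and a
# discount factor of 0.99 expands to an undiscounted-return estimate.
gamma = 0.99
value = _scale_reward_sketch(0.5, (0.0, 1.0)) * (1.0 / (1.0 - gamma))
print(value)  # 0.75 * 100 = 75.0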
def q_value(self, state):
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    # The extra 0 input is presumably the backend learning-phase flag
    # (inference mode); the result is scaled back to reward units and
    # undiscounted.
    value = scale_reward(
        self._value([state, 0])[0], self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
    return value
def predict_std(self, state, deterministic_=True):
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    if (('disable_parameter_scaling' in self._settings)
            and (self._settings['disable_parameter_scaling'])):
        action_std = self._q_action_std()
    else:
        # Scale the normalized std into action units using the
        # bound-derived scale.
        action_std = self._q_action_std() * (action_bound_std(
            self._action_bounds))
    return action_std
def predict(self,
            state,
            deterministic_=True,
            evaluation_=False,
            p=None,
            sim_index=None,
            bootstrapping=False):
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    if (('disable_parameter_scaling' in self._settings)
            and (self._settings['disable_parameter_scaling'])):
        action_ = self._q_action()
    else:
        # Map the network's normalized output back into the action bounds.
        action_ = scale_action(self._q_action(), self._action_bounds)
    return action_
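# A minimal sketch of the normalize-predict-scale round trip used by
# predict() above. The linear helpers below are hypothetical stand-ins
# for norm_state and scale_action, assuming both map linearly between
# the bounded range [low, high] and the normalized range [-1, 1].
import numpy as np

def _norm_sketch(x, bounds):
    low, high = np.array(bounds[0]), np.array(bounds[1])
    return (x - low) / (high - low) * 2.0 - 1.0  # [low, high] -> [-1, 1]

def _scale_sketch(x, bounds):
    low, high = np.array(bounds[0]), np.array(bounds[1])
    return (x + 1.0) * 0.5 * (high - low) + low  # [-1, 1] -> [low, high]

state_bounds = ([0.0, -5.0], [10.0, 5.0])
s = np.array([2.5, 0.0])
s_norm = _norm_sketch(s, state_bounds)  # network input
a_norm = np.tanh(s_norm)                # stand-in for the policy network
a = _scale_sketch(a_norm, ([-1.0, -2.0], [1.0, 2.0]))
print(s_norm, a)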
def q_values(self, state):
    """ Returns a vector of q values; state should NOT be normalized. """
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=theano.config.floatX)
    self._model.setStates(state)
    self._modelTarget.setStates(state)
    action = self._q_action()
    self._model.setActions(action)
    self._modelTarget.setActions(action)
    if ('train_extra_value_function' in self.getSettings()
            and (self.getSettings()['train_extra_value_function'] == True)):
        q_vals = self._vals_extra()
    else:
        q_vals = self._q_val()
    # Map back to reward units and undo discounting; the scaling is the
    # same whether or not parameter scaling is disabled.
    return scale_reward(q_vals, self.getRewardBounds()) * (
        1.0 / (1.0 - self.getSettings()['discount_factor']))
def q_valueWithDropout(self, state):
    state = np.array(state, dtype=self._settings['float_type'])
    state = norm_state(state, self._state_bounds)
    self._model.setStates(state)
    return scale_reward(self._q_val_drop(), self.getRewardBounds())
def predict_std(self, state, deterministic_=True):
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    # Scale the normalized std prediction into action units.
    action_std = self._q_action_std([state])[0] * action_bound_std(
        self._action_bounds)
    return action_std
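# A minimal sketch of why the std is multiplied by a bound-derived scale
# rather than passed through scale_action: a standard deviation is a
# width, not a position, so only the range matters, not the offset.
# _action_bound_std_sketch is a hypothetical stand-in for
# action_bound_std, assuming it returns half the bound range.
import numpy as np

def _action_bound_std_sketch(bounds):
    low, high = np.array(bounds[0]), np.array(bounds[1])
    return (high - low) / 2.0  # half-width of each action dimension

action_bounds = ([-1.0, -4.0], [1.0, 4.0])
std_norm = np.array([0.1, 0.25])  # network output in normalized units
print(std_norm * _action_bound_std_sketch(action_bounds))  # [0.1, 1.0]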
def getGrads(self, states, alreadyNormed=False):
    """ States are normalized here unless alreadyNormed is True. """
    if (alreadyNormed == False):
        states = norm_state(states, self._state_bounds)
    states = np.array(states, dtype=self._settings['float_type'])
    self._model.setStates(states)
    return self._get_grad()
def predict_std(self, state, deterministic_=True):
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    if (('disable_parameter_scaling' in self._settings)
            and (self._settings['disable_parameter_scaling'])):
        action_std = self._q_action_std()[0]
    else:
        action_std = self._q_action_std()[0] * (action_bound_std(
            self._action_bounds))
    return action_std
def getGrads(self, states, actions=None, alreadyNormed=False):
    """ States are normalized here unless alreadyNormed is True. """
    if (alreadyNormed == False):
        states = norm_state(states, self._state_bounds)
    states = np.array(states, dtype=theano.config.floatX)
    self._model.setStates(states)
    if (actions is None):
        # Default to the current policy's actions for these states.
        actions = self.predict_batch(states)
    self._model.setActions(actions)
    return self._get_state_grad()
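# A minimal sketch of what a state-gradient query like the one above
# computes: the derivative of the critic's value with respect to each
# state input. The finite-difference estimate below only illustrates
# the quantity; _get_state_grad is assumed to return the analytic
# gradient from the computation graph instead.
import numpy as np

def _fd_state_grad(q_func, state, eps=1e-5):
    grad = np.zeros_like(state)
    for i in range(len(state)):
        bump = np.zeros_like(state)
        bump[i] = eps
        # Central difference in each state dimension.
        grad[i] = (q_func(state + bump) - q_func(state - bump)) / (2 * eps)
    return grad

q = lambda s: -np.sum(s ** 2)  # toy critic
print(_fd_state_grad(q, np.array([0.5, -1.0])))  # approx [-1.0, 2.0]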
def predictWithDropout(self, state, deterministic_=True):
    state = np.array(state, dtype=self._settings['float_type'])
    state = norm_state(state, self._state_bounds)
    # Take the mean columns of the actor output and scale them back to
    # the environment's action range.
    action_ = scale_action(
        self._model.getActorNetwork().predict(
            state, batch_size=1)[:, :self._action_length],
        self._action_bounds)
    return action_
def q_value(self, state):
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    self._modelTarget.setStates(state)
    value = scale_reward(
        self._model.getCriticNetwork().predict(state, batch_size=1),
        self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))
    return value
def predict(self,
            state,
            deterministic_=True,
            evaluation_=False,
            p=None,
            sim_index=None,
            bootstrapping=False):
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    action_ = scale_action(
        self._model.getActorNetwork().predict(
            state, batch_size=1)[:, :self._action_length],
        self._action_bounds)
    return action_
def getGrads(self, states, alreadyNormed=False):
    """ States are normalized here unless alreadyNormed is True. """
    if (alreadyNormed == False):
        states = norm_state(states, self._state_bounds)
    states = np.array(states, dtype=self._settings['float_type'])
    # The extra 0 input is presumably the backend learning-phase flag
    # (inference mode).
    grads = np.array(self._get_gradients([states, 0]),
                     dtype=self._settings['float_type'])
    return grads
def predict_std(self, state, deterministic_=True):
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    if (('disable_parameter_scaling' in self._settings)
            and (self._settings['disable_parameter_scaling'])):
        action_std = self._model.getActorNetwork().predict(
            state, batch_size=1)[:, self._action_length:]
    else:
        action_std = self._model.getActorNetwork().predict(
            state, batch_size=1)[:, self._action_length:] * (
                action_bound_std(self._action_bounds))
    return action_std
def q_values(self, state):
    """ Returns a vector of q values; state should NOT be normalized. """
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    self._modelTarget.setStates(state)
    # Map back to reward units and undo discounting; the scaling is the
    # same whether or not parameter scaling is disabled.
    return scale_reward(self._q_val(), self.getRewardBounds()) * (
        1.0 / (1.0 - self.getSettings()['discount_factor']))
def predict(self,
            state,
            deterministic_=True,
            evaluation_=False,
            p=None,
            sim_index=None,
            bootstrapping=False):
    state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    if (('disable_parameter_scaling' in self._settings)
            and (self._settings['disable_parameter_scaling'])):
        action_ = self._q_action()[0]
    else:
        # Transform the normalized action back into the action bounds.
        action_ = scale_action(self._q_action()[0], self._action_bounds)
    return action_
def q_valueWithDropout(self, state):
    # Skip state normalization when parameter scaling is disabled.
    if not (('disable_parameter_scaling' in self._settings)
            and (self._settings['disable_parameter_scaling'])):
        state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    # The value scaling is the same in either case.
    return scale_reward(self._q_val_drop(), self.getRewardBounds())[0] * (
        1.0 / (1.0 - self.getSettings()['discount_factor']))
def q_valueWithDropout(self, state):
    # Skip state normalization when parameter scaling is disabled.
    if not (('disable_parameter_scaling' in self._settings)
            and (self._settings['disable_parameter_scaling'])):
        state = norm_state(state, self._state_bounds)
    state = np.array(state, dtype=self._settings['float_type'])
    self._model.setStates(state)
    # The value scaling is the same in either case.
    return scale_reward(self._q_val_drop(), self.getRewardBounds()) * (
        1.0 / (1.0 - self.getSettings()['discount_factor']))
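# A minimal sketch of how dropout-time queries like _q_val_drop can be
# used for uncertainty estimates: averaging several stochastic forward
# passes (MC dropout). This is an illustration under that assumption,
# not a description of _q_val_drop itself; _noisy_value below is a
# hypothetical stand-in for a network evaluated with dropout active.
import numpy as np

rng = np.random.default_rng(0)

def _noisy_value(state):
    # Stand-in: a deterministic value plus dropout-induced noise.
    return -np.sum(state ** 2) + rng.normal(scale=0.05)

state = np.array([0.5, -1.0])
samples = np.array([_noisy_value(state) for _ in range(50)])
print(samples.mean(), samples.std())  # value estimate and its spread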
def predictWithDropout(self, state, deterministic_=True):
    state = np.array(state, dtype=self._settings['float_type'])
    state = norm_state(state, self._state_bounds)
    self._model.setStates(state)
    action_ = scale_action(self._q_action_drop()[0], self._action_bounds)
    return action_