def step(self, observations, policy, action, remove_base=False, terminal_step=False, **vals):
    """Update the network.

    Args:
        observations (list): real-valued list of observations from the environment.
        policy (list): list of length num_actions; the action probabilities of the
            behaviour (control) policy in the given state.
        action (int): the action taken by the behaviour policy.
        terminal_step (bool): if True, the discounts are zeroed for this update.

    Returns:
        predictions (list): the predictions for each GVF given the observations and policy.
    """
    # get the next feature vector
    phi_next = self.function_approximation.get_features(observations)
    if type(self.last_phi) is np.ndarray:
        discounts = self.discounts
        if terminal_step:
            discounts = np.zeros(self.discounts.shape)
        # calculate the importance-sampling ratios for the action taken
        rho = (self.policies / policy)[:, action]
        # update the traces based on the new visitation
        self.eligibility_traces = accumulate(
            self.eligibility_traces, discounts, self.traces_lambda,
            self.last_phi, rho)
        # calculate the new cumulants
        current_cumulants = np.array(
            [cumulant.cumulant(observations) for cumulant in self.cumulants])
        # get a vector of TD errors corresponding to the performance.
        td_error = calculate_temporal_difference_error(
            self.weights, current_cumulants, discounts, phi_next,
            self.last_phi)
        self.td_error = td_error
        # update the weights based on the calculated TD error
        self.weights = update_weights(
            td_error, self.eligibility_traces, self.weights, discounts,
            self.traces_lambda, self.step_sizes, self.last_phi,
            self.bias_correction)
        # update the bias-correction term
        self.bias_correction = update_h_trace(
            self.bias_correction, td_error, self.step_size_bias_correction,
            self.eligibility_traces, self.last_phi)
        # maintain verifiers
        self.rupee, self.tau, self.eligibility_avg = \
            update_rupee(
                beta_naught=self.rupee_beta,
                tau=self.tau,
                delta_e=self.eligibility_avg,
                h=self.bias_correction,
                e=self.eligibility_traces,
                delta=td_error,
                alpha=self.step_sizes,
                phi=self.last_phi
            )
        self.ude, self.delta_avg, self.delta_var = update_ude(
            self.ude_beta,
            self.delta_avg,
            self.delta_var,
            td_error
        )
        self.avg_error = self.avg_error * 0.9 + 0.1 * np.abs(td_error)
    self.last_phi = phi_next
    self.last_prediction = np.inner(self.weights, phi_next)
    return self.last_prediction
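
# Hedged sketch: `accumulate` and `calculate_temporal_difference_error` are
# imported from elsewhere in the repository. The minimal versions below show the
# standard off-policy TD(lambda) forms they are assumed to implement, with one
# GVF per row of the weight/trace matrices; the argument order mirrors the calls
# above, but the exact shapes and broadcasting are assumptions, not the
# project's actual API.

import numpy as np


def accumulate_sketch(traces, discounts, lambdas, phi, rho):
    """Accumulating traces scaled by the importance ratio:
    e <- rho * (gamma * lambda * e + phi).

    traces: (num_gvfs, num_features); discounts, lambdas, rho: (num_gvfs,);
    phi: (num_features,).
    """
    return rho[:, None] * (
        discounts[:, None] * lambdas[:, None] * traces + phi[None, :])


def td_error_sketch(weights, cumulants, discounts, phi_next, phi):
    """TD error per GVF: delta = c + gamma * w.phi' - w.phi."""
    return cumulants + discounts * (weights @ phi_next) - weights @ phi
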
def step(self, observations, remove_base=False, **vals):
    """Update the network."""
    # pad and normalise the observations before building the feature vector
    add = len(self.min_obs) - len(observations)
    observations = np.concatenate(
        (observations, np.zeros(add))
    )  # we don't have the predictions in the first layer, so concat zeros
    self.min_obs = np.minimum(observations, self.min_obs)
    self.max_obs = np.maximum(observations, self.max_obs)
    observations += np.abs(self.min_obs)
    observations = np.divide(
        observations,
        (np.abs(self.max_obs) + np.abs(self.min_obs)),
        where=(np.abs(self.max_obs) + np.abs(self.min_obs)) != 0)
    observations[np.isnan(observations)] = 0
    observations[np.isinf(observations)] = 0
    # we take off the protected range, as those observations exist only to
    # serve as cumulants.
    if remove_base:
        phi_next = self.function_approximation.get_features(observations)
    else:
        phi_next = self.function_approximation.get_features(
            observations[self.protected_range:])
    phi_next = np.concatenate((phi_next, [1]))[:, None]
    if type(self.last_phi) is np.ndarray:
        # update the traces based on the new visitation
        self.eligibility_traces = accumulate(
            self.eligibility_traces, self.discounts, self.traces_lambda,
            phi_next)
        # calculate the new cumulants
        current_cumulants = np.array([
            cumulant.cumulant(observations) for cumulant in self.cumulants
        ])[:, None]
        # get a vector of TD errors corresponding to the performance.
        td_error = calculate_temporal_difference_error(
            self.weights, current_cumulants, self.discounts, phi_next,
            self.last_phi)
        # get the predictions for the new feature vector
        predictions = self.predict(phi_next)
        # update the running trace of maximum meta-weight updates and the
        # adaptive step sizes
        if self.use_step_sizes:
            self.n = update_normalizer_accumulation(
                self.n, self.beta, self.eligibility_traces, self.last_phi,
                self.h, td_error)
            self.beta = update_meta_weights(
                self.beta, self.last_phi, self.meta_step_size, td_error,
                self.h, self.n)
            self.step_sizes = normalize_step_size(
                calculate_step_size(self.beta), self.beta,
                self.eligibility_traces, self.discounts, self.last_phi,
                phi_next)
        # update the weights based on the calculated TD error
        self.weights = update_weights(
            td_error, self.eligibility_traces, self.weights, self.step_sizes)
        # update the meta-weight update trace
        self.h = update_meta_weight_update_trace(
            self.h, self.eligibility_traces, self.last_phi, td_error,
            self.step_sizes)
        # maintain verifiers
        self.rupee, self.tau, self.eligibility_avg, self.rupee_h_trace = \
            update_rupee(
                self.rupee_beta,
                self.tau,
                self.eligibility_avg,
                self.rupee_h_trace,
                self.eligibility_traces,
                td_error,
                self.step_sizes,
                self.last_phi
            )
        self.ude, self.delta_avg, self.delta_var = update_ude(
            self.ude_beta, self.delta_avg, self.delta_var, td_error)
        self.estimated_return, self.synced_prediction, self.reward_history, \
            self.gamma_history, self.prediction_history = \
            update_verifier(
                self.reward_history,
                self.gamma_history,
                self.prediction_history,
                self.discounts,
                current_cumulants,
                predictions
            )
        # cache the predictions so they can be returned below
        self.last_prediction = predictions[:, 0]
        self.avg_error = self.avg_error * 0.9 + 0.1 * np.abs(td_error)
    self.last_phi = phi_next
    return self.last_prediction
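
# Hedged sketch: the `use_step_sizes` branch above follows a TIDBD/AutoStep-style
# scheme in which the meta-weights `beta` are log step sizes and the resulting
# step sizes are normalised so that a single update cannot overshoot. The
# functions below show the forms `update_meta_weights`, `calculate_step_size`,
# and `normalize_step_size` are assumed to take; the per-GVF-row shapes and the
# omission of the normaliser `n` are simplifying assumptions, not the
# repository's actual signatures.

import numpy as np


def update_meta_weights_sketch(beta, phi, meta_step_size, delta, h):
    """TIDBD meta-weight update: beta_ij += theta * delta_i * phi_j * h_ij."""
    return beta + meta_step_size * delta[:, None] * phi[None, :] * h


def calculate_step_size_sketch(beta):
    """Step sizes are the exponentiated meta-weights: alpha = exp(beta)."""
    return np.exp(beta)


def normalize_step_size_sketch(alpha, traces, discounts, phi, phi_next):
    """Scale alpha down wherever the effective step size of one update exceeds 1."""
    # effective step size per GVF: sum_j alpha_ij * e_ij * (phi_j - gamma_i * phi'_j)
    effective = np.sum(
        alpha * traces * (phi[None, :] - discounts[:, None] * phi_next[None, :]),
        axis=1)
    return alpha / np.maximum(effective, 1.0)[:, None]
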
def step(self, observations, remove_base=False, **vals):
    """Update the network."""
    # pad and normalise the observations before building the feature vector
    add = len(self.min_obs) - len(observations)
    observations = np.concatenate(
        (observations, np.zeros(add))
    )  # we don't have the predictions in the first layer, so concat zeros
    self.min_obs = np.minimum(observations, self.min_obs)
    self.max_obs = np.maximum(observations, self.max_obs)
    observations += np.abs(self.min_obs)
    observations = np.divide(
        observations,
        (np.abs(self.max_obs) + np.abs(self.min_obs)),
        where=(np.abs(self.max_obs) + np.abs(self.min_obs)) != 0)
    observations[np.isnan(observations)] = 0
    observations[np.isinf(observations)] = 0
    # we take off the protected range, as those observations exist only to
    # serve as cumulants.
    if remove_base:
        phi_next = self.function_approximation.get_features(observations)
    else:
        phi_next = self.function_approximation.get_features(
            observations[self.protected_range:])
    phi_next = np.concatenate((phi_next, [1]))[:, None]
    if type(self.last_phi) is np.ndarray:
        # update the traces based on the new visitation
        self.eligibility_traces = accumulate(
            self.eligibility_traces, self.discounts, self.traces_lambda,
            phi_next)
        # calculate the new cumulants
        current_cumulants = np.array([
            cumulant.cumulant(observations) for cumulant in self.cumulants
        ])[:, None]
        # get a vector of TD errors corresponding to the performance.
        td_error = calculate_temporal_difference_error(
            self.weights, current_cumulants, self.discounts, phi_next,
            self.last_phi)
        # get the predictions for the new feature vector
        predictions = self.predict(phi_next)
        # update the running trace of maximum meta-weight updates and the
        # adaptive step sizes
        if self.use_step_sizes:
            self.n = update_normalizer_accumulation(
                self.n, self.beta, self.eligibility_traces, self.last_phi,
                self.h, td_error)
            self.beta = update_meta_weights(
                self.beta, self.last_phi, self.meta_step_size, td_error,
                self.h)
            self.beta = normalize_meta_weights(
                self.beta, self.eligibility_traces, self.discounts,
                self.last_phi, phi_next)
            self.step_sizes = calculate_step_size(self.beta)
        # update the weights based on the calculated TD error
        self.weights = update_weights(
            td_error, self.eligibility_traces, self.weights, self.step_sizes)
        # update the meta-weight update trace
        self.h = update_meta_weight_update_trace(
            self.h, self.eligibility_traces, self.last_phi, td_error,
            self.step_sizes)
        # maintain verifiers
        self.rupee, self.tau, self.eligibility_avg, self.rupee_h_trace = \
            update_rupee(
                self.ude_beta,
                self.tau,
                self.eligibility_avg,
                self.rupee_h_trace,
                self.eligibility_traces,
                td_error,
                self.step_sizes,
                self.last_phi
            )
        self.ude, self.delta_avg, self.delta_var = update_ude(
            self.ude_beta, self.delta_avg, self.delta_var, td_error)
        self.estimated_return, self.synced_prediction, self.reward_history, \
            self.gamma_history, self.prediction_history = \
            update_verifier(
                self.reward_history,
                self.gamma_history,
                self.prediction_history,
                self.discounts,
                current_cumulants,
                predictions
            )
        # regenerate any predictions whose learning has diverged
        if len(np.where(np.isnan(td_error))[0]) > 0:
            print("regenning", np.where(np.isnan(td_error))[0])
        self.generate_prediction(
            np.where(np.isnan(td_error))[0], observations)
        self.generate_prediction(
            np.where(np.isinf(td_error))[0], observations)
        self.generate_prediction(
            np.where(np.isnan(self.error()))[0], observations)
        self.generate_prediction(
            np.where(np.isinf(self.error()))[0], observations)
        self.last_prediction = predictions[:, 0]
        # Unexpected demon error: reset the UDE statistics for any predictions
        # whose error estimate has diverged.
        self.ude_beta[np.where(np.isnan(self.ude))] = (
            10 * np.average(self.step_sizes))
        self.ude_beta[np.where(np.isinf(self.ude))] = (
            10 * np.average(self.step_sizes))
        self.delta_avg[np.where(np.isnan(self.ude))] = 0
        self.delta_avg[np.where(np.isinf(self.ude))] = 0
        self.delta_var[np.where(np.isnan(self.ude))] = 0
        self.delta_var[np.where(np.isinf(self.ude))] = 0
        self.ude[np.where(np.isinf(self.ude))] = 0
        self.ude[np.where(np.isnan(self.ude))] = 0
        # RUPEE
        # beta is shared between RUPEE and UDE
        # todo: does sharing beta make sense: rupee seems to use 0.1 alpha, not 10 alpha
        # self.tau[np.where(np.isnan(self.rupee))] = 0.001
        # self.tau[np.where(np.isinf(self.rupee))] = 0.001
        # self.rupee_h_trace[:, np.where(np.isnan(self.rupee))] = np.zeros(self.eligibility_traces.shape[1])
        # self.rupee_h_trace[:, np.where(np.isinf(self.rupee))] = np.zeros(self.eligibility_traces.shape[1])
        # self.eligibility_avg[:, np.where(np.isnan(self.rupee))] = np.zeros(self.eligibility_traces.shape[1])
        # self.eligibility_avg[:, np.where(np.isinf(self.rupee))] = np.zeros(self.eligibility_traces.shape[1])
    self.last_phi = phi_next
    return self.last_prediction
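
# Hedged sketch: the reset logic above clears the statistics behind the
# Unexpected Demon Error (UDE) whenever they diverge. The function below shows
# the standard UDE recurrence those statistics are assumed to follow:
# exponential moving averages of the TD error and of its variance, combined
# into |mean| / (std + eps). The argument and return order mirrors the
# `update_ude(...)` calls above, but the exact signature is an assumption.

import numpy as np


def update_ude_sketch(beta, delta_avg, delta_var, delta, eps=1e-8):
    """One UDE update; all array arguments share the TD error's shape."""
    delta_avg = (1 - beta) * delta_avg + beta * delta             # running mean of delta
    delta_var = (1 - beta) * delta_var + beta * (delta - delta_avg) ** 2
    ude = np.abs(delta_avg) / (np.sqrt(delta_var) + eps)          # unexpected demon error
    return ude, delta_avg, delta_var
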