Example #1
    def step(self, observations, policy, action, remove_base=False, terminal_step=False, **vals):
        """Update the Network
        Args:
            observations (list): real-valued list of observations from the environment.
            policy (list): list of length num_actions; the policy of the control policy for the given state.
        Returns:
            predictions (list): the predictions for each GVF given the observations and policy.
        """
        # get the next feature vector
        phi_next = self.function_approximation.get_features(observations)
        if isinstance(self.last_phi, np.ndarray):
            discounts = self.discounts
            if terminal_step:
                discounts = np.zeros(self.discounts.shape)
            # calculate the importance-sampling ratio
            rho = (self.policies/policy)[:, action]
            # update the traces based on the new visitation
            self.eligibility_traces = accumulate(self.eligibility_traces, discounts, self.traces_lambda, self.last_phi, rho)
            # calculate the new cumulants
            current_cumulants = np.array([cumulant.cumulant(observations) for cumulant in self.cumulants])
            # get a vector of TD errors, one per GVF
            td_error = calculate_temporal_difference_error(self.weights, current_cumulants, discounts, phi_next,
                                                           self.last_phi)
            self.td_error = td_error
            # update the weights based on the calculated TD error
            self.weights = update_weights(td_error, self.eligibility_traces, self.weights, discounts, self.traces_lambda, self.step_sizes, self.last_phi, self.bias_correction)
            # update bias correction term
            self.bias_correction = update_h_trace(self.bias_correction, td_error,
                                                  self.step_size_bias_correction,
                                                  self.eligibility_traces, self.last_phi)

            # maintain verifiers
            self.rupee, self.tau, self.eligibility_avg = \
                update_rupee(
                    beta_naught=self.rupee_beta,
                    tau=self.tau,
                    delta_e=self.eligibility_avg,
                    h=self.bias_correction,
                    e=self.eligibility_traces,
                    delta=td_error,
                    alpha=self.step_sizes,
                    phi=self.last_phi
                )
            self.ude, self.delta_avg, self.delta_var = update_ude(
                self.ude_beta,
                self.delta_avg,
                self.delta_var,
                td_error
            )
            self.avg_error = self.avg_error * 0.9 + 0.1 * np.abs(td_error)

        self.last_phi = phi_next
        self.last_prediction = np.inner(self.weights, phi_next)
        return self.last_prediction
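
The helper functions called above are not shown in this example. The following is a minimal sketch of what they could look like for a plain GTD(lambda) learner with accumulating traces; the shapes (weights, traces and h of shape (num_gvfs, num_features); discounts, lambdas, step sizes and rho of shape (num_gvfs,); phi of shape (num_features,); a scalar secondary step size) and the exact update rules are assumptions for illustration, not the original implementations.

import numpy as np

def accumulate(traces, discounts, lambdas, phi, rho):
    # accumulating eligibility traces with off-policy correction:
    # e <- rho * (gamma * lambda * e + phi)
    return rho[:, None] * (discounts[:, None] * lambdas[:, None] * traces + phi[None, :])

def calculate_temporal_difference_error(weights, cumulants, discounts, phi_next, phi):
    # delta = c + gamma * (w . phi') - (w . phi), one value per GVF
    return cumulants + discounts * weights.dot(phi_next) - weights.dot(phi)

def update_weights(td_error, traces, weights, discounts, lambdas, step_sizes, phi, h):
    # GTD(lambda)-style update:
    # w <- w + alpha * (delta * e - gamma * (1 - lambda) * (e . h) * phi)
    # (textbook GTD(lambda) applies the correction to the next feature vector)
    correction = (discounts * (1 - lambdas) * np.sum(traces * h, axis=1))[:, None] * phi[None, :]
    return weights + step_sizes[:, None] * (td_error[:, None] * traces - correction)

def update_h_trace(h, td_error, step_size_h, traces, phi):
    # secondary weights used by the gradient correction:
    # h <- h + alpha_h * (delta * e - (h . phi) * phi)
    return h + step_size_h * (td_error[:, None] * traces - h.dot(phi)[:, None] * phi[None, :])
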
Example #2
    def step(self, observations, remove_base=False, **vals):
        """Update the Network"""
        # get the next feature vector
        add = len(self.min_obs) - len(observations)
        observations = np.concatenate(
            (observations, np.zeros(add))
        )  # we don't have the predictions in the first layer, so concat zeros
        self.min_obs = np.minimum(observations, self.min_obs)
        self.max_obs = np.maximum(observations, self.max_obs)
        observations += np.abs(self.min_obs)
        # divide only where the running range is non-zero; other entries stay at 0
        observations = np.divide(
            observations, (np.abs(self.max_obs) + np.abs(self.min_obs)),
            out=np.zeros_like(observations),
            where=(np.abs(self.max_obs) + np.abs(self.min_obs)) != 0)
        observations[np.isnan(observations)] = 0
        observations[np.isinf(observations)] = 0
        # the protected observations exist only to serve as cumulants, so they are normally stripped from the features
        if remove_base:
            phi_next = self.function_approximation.get_features(observations)
        else:
            phi_next = self.function_approximation.get_features(
                observations[self.protected_range:])
        phi_next = np.concatenate((phi_next, [1]))[:, None]
        if isinstance(self.last_phi, np.ndarray):
            # update the traces based on the new visitation
            self.eligibility_traces = accumulate(self.eligibility_traces,
                                                 self.discounts,
                                                 self.traces_lambda, phi_next)
            # calculate the new cumulants
            current_cumulants = np.array([
                cumulant.cumulant(observations) for cumulant in self.cumulants
            ])[:, None]
            # get a vector of TD errors, one per GVF
            td_error = calculate_temporal_difference_error(
                self.weights, current_cumulants, self.discounts, phi_next,
                self.last_phi)
            # compute the current predictions before applying the weight update
            predictions = self.predict(phi_next)
            # update the running trace of maximum meta_weight updates
            if self.use_step_sizes:
                self.n = update_normalizer_accumulation(
                    self.n, self.beta, self.eligibility_traces, self.last_phi,
                    self.h, td_error)
                self.beta = update_meta_weights(self.beta, self.last_phi,
                                                self.meta_step_size, td_error,
                                                self.h, self.n)
                self.step_sizes = normalize_step_size(
                    calculate_step_size(self.beta), self.beta,
                    self.eligibility_traces, self.discounts, self.last_phi,
                    phi_next)

            # update weights
            self.weights = update_weights(td_error, self.eligibility_traces,
                                          self.weights, self.step_sizes)
            # update beta trace
            self.h = update_meta_weight_update_trace(self.h,
                                                     self.eligibility_traces,
                                                     self.last_phi, td_error,
                                                     self.step_sizes)

            # maintain verifiers
            self.rupee, self.tau, self.eligibility_avg, self.rupee_h_trace = \
                update_rupee(
                    self.rupee_beta,
                    self.tau,
                    self.eligibility_avg,
                    self.rupee_h_trace,
                    self.eligibility_traces,
                    td_error,
                    self.step_sizes,
                    self.last_phi
                )
            self.ude, self.delta_avg, self.delta_var = update_ude(
                self.ude_beta, self.delta_avg, self.delta_var, td_error)

            self.estimated_return, self.synced_prediction, self.reward_history, self.gamma_history, self.prediction_history = \
                update_verifier(
                    self.reward_history,
                    self.gamma_history,
                    self.prediction_history,
                    self.discounts,
                    current_cumulants,
                    predictions
                )

            self.avg_error = self.avg_error * 0.9 + 0.1 * np.abs(td_error)

        self.last_phi = phi_next
        return self.last_prediction
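
The block at the top of this step method rescales the raw observations into roughly [0, 1] using running minima and maxima before feature construction. Below is a self-contained sketch of that normalization written as a standalone class; the class name, the zero initialization of the running extrema, and the 1-D float shapes are assumptions for illustration, not part of the original network.

import numpy as np

class RunningMinMaxNormalizer:
    """Rescale each observation component into [0, 1] using running extrema."""

    def __init__(self, num_obs):
        # assumed initialization; the original network may seed these differently
        self.min_obs = np.zeros(num_obs)
        self.max_obs = np.zeros(num_obs)

    def __call__(self, observations):
        observations = np.asarray(observations, dtype=float)
        self.min_obs = np.minimum(observations, self.min_obs)
        self.max_obs = np.maximum(observations, self.max_obs)
        # shift so the smallest value seen so far maps to zero
        shifted = observations + np.abs(self.min_obs)
        span = np.abs(self.max_obs) + np.abs(self.min_obs)
        # divide only where the running range is non-zero; other entries stay at 0
        scaled = np.divide(shifted, span, out=np.zeros_like(shifted), where=span != 0)
        scaled[~np.isfinite(scaled)] = 0
        return scaled

# usage sketch: norm = RunningMinMaxNormalizer(3); norm([0.5, -2.0, 0.0])
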
Example #3
    def step(self, observations, remove_base=False, **vals):
        """Update the Network"""
        # get the next feature vector
        add = len(self.min_obs) - len(observations)
        observations = np.concatenate(
            (observations, np.zeros(add))
        )  # we don't have the predictions in the first layer, so concat zeros
        self.min_obs = np.minimum(observations, self.min_obs)
        self.max_obs = np.maximum(observations, self.max_obs)
        observations += np.abs(self.min_obs)
        # divide only where the running range is non-zero; other entries stay at 0
        observations = np.divide(
            observations, (np.abs(self.max_obs) + np.abs(self.min_obs)),
            out=np.zeros_like(observations),
            where=(np.abs(self.max_obs) + np.abs(self.min_obs)) != 0)
        observations[np.isnan(observations)] = 0
        observations[np.isinf(observations)] = 0
        # the protected observations exist only to serve as cumulants, so they are normally stripped from the features
        if remove_base:
            phi_next = self.function_approximation.get_features(observations)
        else:
            phi_next = self.function_approximation.get_features(
                observations[self.protected_range:])
        phi_next = np.concatenate((phi_next, [1]))[:, None]
        if isinstance(self.last_phi, np.ndarray):
            # update the traces based on the new visitation
            self.eligibility_traces = accumulate(self.eligibility_traces,
                                                 self.discounts,
                                                 self.traces_lambda, phi_next)
            # calculate the new cumulants
            current_cumulants = np.array([
                cumulant.cumulant(observations) for cumulant in self.cumulants
            ])[:, None]
            # get a vector of TD errors, one per GVF
            td_error = calculate_temporal_difference_error(
                self.weights, current_cumulants, self.discounts, phi_next,
                self.last_phi)
            # compute the current predictions before applying the weight update
            predictions = self.predict(phi_next)
            # update the running trace of maximum meta_weight updates
            if self.use_step_sizes:
                self.n = update_normalizer_accumulation(
                    self.n, self.beta, self.eligibility_traces, self.last_phi,
                    self.h, td_error)
                self.beta = update_meta_weights(self.beta, self.last_phi,
                                                self.meta_step_size, td_error,
                                                self.h)
                self.beta = normalize_meta_weights(self.beta,
                                                   self.eligibility_traces,
                                                   self.discounts,
                                                   self.last_phi, phi_next)
                self.step_sizes = calculate_step_size(self.beta)
            # update weights
            self.weights = update_weights(td_error, self.eligibility_traces,
                                          self.weights, self.step_sizes)
            # update beta trace
            self.h = update_meta_weight_update_trace(self.h,
                                                     self.eligibility_traces,
                                                     self.last_phi, td_error,
                                                     self.step_sizes)
            # print "begin \t", self.ude_beta.shape, \
            #     "\nele \t", self.eligibility_avg.shape, \
            #     "\nh tra \t", self.rupee_h_trace.shape, \
            #     "\nele t \t", self.eligibility_traces.shape, \
            #     "\ntd er \t", td_error.shape, \
            #     "\nss \t", self.step_sizes.shape, \
            #     "\nphi \t", self.last_phi.shape, \
            #     "\ntau \t",  self.tau.shape,\
            #     '\n'

            # maintain verifiers
            self.rupee, self.tau, self.eligibility_avg, self.rupee_h_trace =\
                update_rupee(
                    self.ude_beta,
                    self.tau,
                    self.eligibility_avg,
                    self.rupee_h_trace,
                    self.eligibility_traces,
                    td_error,
                    self.step_sizes,
                    self.last_phi
                )
            self.ude, self.delta_avg, self.delta_var = update_ude(
                self.ude_beta, self.delta_avg, self.delta_var, td_error)

            self.estimated_return, self.synced_prediction, self.reward_history, self.gamma_history, self.prediction_history = \
                update_verifier(
                    self.reward_history,
                    self.gamma_history,
                    self.prediction_history,
                    self.discounts,
                    current_cumulants,
                    predictions
                )
            nan_idx = np.where(np.isnan(td_error))[0]
            if len(nan_idx) > 0:
                print("regenerating predictions for GVFs", nan_idx)
            self.generate_prediction(nan_idx, observations)
            self.generate_prediction(
                np.where(np.isinf(td_error))[0], observations)
            self.generate_prediction(
                np.where(np.isnan(self.error()))[0], observations)
            self.generate_prediction(
                np.where(np.isinf(self.error()))[0], observations)
            self.last_prediction = predictions[:, 0]

            # Unexpected demon error
            self.ude_beta[np.where(np.isnan(
                self.ude))] = (10 * np.average(self.step_sizes))
            self.ude_beta[np.where(np.isinf(
                self.ude))] = (10 * np.average(self.step_sizes))
            self.delta_avg[np.where(np.isnan(self.ude))] = 0
            self.delta_avg[np.where(np.isinf(self.ude))] = 0
            self.delta_var[np.where(np.isnan(self.ude))] = 0
            self.delta_var[np.where(np.isinf(self.ude))] = 0
            self.ude[np.where(np.isinf(self.ude))] = 0
            self.ude[np.where(np.isnan(self.ude))] = 0

            # RUPEE
            # beta is shared between RUPEE and UDE
            # todo: does sharing beta make sense: rupee seems to use 0.1 alpha, not 10 alpha
            # self.tau[np.where(np.isnan(self.rupee))] = 0.001
            # self.tau[np.where(np.isinf(self.rupee))] = 0.001
            # self.rupee_h_trace[:,np.where(np.isnan(self.rupee))] = np.zeros(self.eligibility_traces.shape[1])
            # self.rupee_h_trace[:,np.where(np.isinf(self.rupee))] = np.zeros(self.eligibility_traces.shape[1])
            # self.eligibility_avg[:,np.where(np.isnan(self.rupee))] = np.zeros(self.eligibility_traces.shape[1])
            # self.eligibility_avg[:,np.where(np.isinf(self.rupee))] = np.zeros(self.eligibility_traces.shape[1])

        self.last_phi = phi_next
        return self.last_prediction
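
All three examples maintain RUPEE and UDE as online verifiers of learning progress. Below is a minimal sketch of what update_rupee and update_ude might compute, following the usual definitions (RUPEE as sqrt(|h . avg(delta * e)|) with a bias-corrected moving average, UDE as the absolute moving mean of the TD error over its moving standard deviation); the argument order, the shapes (per-GVF vectors of shape (num_gvfs,), traces and h of shape (num_gvfs, num_features), phi of shape (num_features,)), the scalar step size alpha, and the epsilon constant are assumptions for illustration, not this codebase's actual signatures.

import numpy as np

def update_rupee(beta_naught, tau, delta_e, h, e, delta, alpha, phi):
    # RUPEE ~= sqrt(|h . avg(delta * e)|), one value per GVF;
    # h is a secondary trace of the same kind gradient-TD maintains
    h = h + alpha * (delta[:, None] * e - h.dot(phi)[:, None] * phi[None, :])
    tau = (1 - beta_naught) * tau + beta_naught      # bias-corrected decay rate
    beta = (beta_naught / tau)[:, None]
    delta_e = (1 - beta) * delta_e + beta * delta[:, None] * e
    rupee = np.sqrt(np.abs(np.sum(h * delta_e, axis=1)))
    return rupee, tau, delta_e, h

def update_ude(beta, delta_avg, delta_var, delta, epsilon=1e-8):
    # UDE = |moving mean of delta| / (sqrt(moving variance of delta) + eps)
    diff = delta - delta_avg
    delta_avg = delta_avg + beta * diff
    delta_var = (1 - beta) * (delta_var + beta * diff ** 2)
    ude = np.abs(delta_avg) / (np.sqrt(delta_var) + epsilon)
    return ude, delta_avg, delta_var
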