Example No. 1
    # During training epochs, we want to train the agent after every [parameters.update_frequency] actions it takes.
    # We also want to display, after each training episode (not after every training step), the average Bellman
    # residual and the average of the V values obtained during the last episode, hence the last two arguments.
    agent.attach(
        bc.TrainerController(evaluate_on='action',
                             periodicity=parameters.update_frequency,
                             show_episode_avg_V_value=True,
                             show_avg_Bellman_residual=True))

    # At the end of every epoch, the learning rate can be modified using a LearningRateController. Here we
    # want to update the learning rate after every training epoch (periodicity=1), according to the given parameters.
    agent.attach(
        bc.LearningRateController(
            initial_learning_rate=parameters.learning_rate,
            learning_rate_decay=parameters.learning_rate_decay,
            periodicity=1))
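    # Illustration (not part of the original example): the two parameters above describe a multiplicative
    # decay applied once per training epoch, so after n epochs the learning rate would be roughly
    #     parameters.learning_rate * parameters.learning_rate_decay ** n
    # (assuming the decay is applied multiplicatively; the exact rule is internal to deer's controller).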

    # Same for the discount factor.
    agent.attach(
        bc.DiscountFactorController(
            initial_discount_factor=parameters.discount,
            discount_factor_growth=parameters.discount_inc,
            discount_factor_max=parameters.discount_max,
            periodicity=1))
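    # Illustration (not part of the original example): assuming the controller moves the discount factor
    # geometrically toward 1 and caps it at the maximum, the value after n epochs would be roughly
    #     min(parameters.discount_max, 1 - (1 - parameters.discount) * parameters.discount_inc ** n)
    # The exact update rule is internal to deer's DiscountFactorController.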

    # As with the discount factor and the learning rate, the epsilon parameter of the agent's epsilon-greedy
    # policy can be updated periodically. This controller has a few more capabilities: it lets one choose more
    # precisely when to update epsilon (after every X actions, episodes or epochs), and epsilon can also be
    # reset every episode or epoch, or never (hence the reset_every='none').
    agent.attach(
        bc.EpsilonController(initial_e=parameters.epsilon_start,
                             e_decays=parameters.epsilon_decay,
                             e_min=parameters.epsilon_min,
                             evaluate_on='action',
                             periodicity=1,
                             reset_every='none'))
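
For intuition, here is a minimal standalone sketch (not taken from the example above) of the kind of schedule the EpsilonController's parameters describe: a decay from epsilon_start towards epsilon_min, evaluated once per action. The exact update rule inside deer may differ.

    def epsilon_at(step, e_start, e_min, e_decays):
        # Linear interpolation from e_start down to e_min over e_decays steps,
        # then held constant at e_min.
        frac = min(step / float(e_decays), 1.0)
        return e_start + frac * (e_min - e_start)

    # epsilon_at(0, 1.0, 0.1, 10000)     -> 1.0
    # epsilon_at(5000, 1.0, 0.1, 10000)  -> 0.55
    # epsilon_at(20000, 1.0, 0.1, 10000) -> 0.1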
Example No. 2
    # During training epochs, we want to train the agent after every [parameters.update_frequency] actions it takes.
    # We also want to display, after each training episode (not after every training step), the average Bellman
    # residual and the average of the V values obtained during the last episode, hence the last two arguments.
    agent.attach(bc.TrainerController(
        evaluateOn='action',
        periodicity=parameters.update_frequency,
        showEpisodeAvgVValue=True,
        showAvgBellmanResidual=True))
    
    # At the end of every epoch, the learning rate can be modified using a LearningRateController. Here we
    # want to update the learning rate after every training epoch (periodicity=1), according to the given parameters.
    agent.attach(bc.LearningRateController(
        initialLearningRate=parameters.learning_rate, 
        learningRateDecay=parameters.learning_rate_decay,
        periodicity=1))
    
    # Same for the discount factor.
    agent.attach(bc.DiscountFactorController(
        initialDiscountFactor=parameters.discount, 
        discountFactorGrowth=parameters.discount_inc, 
        discountFactorMax=parameters.discount_max,
        periodicity=1))
    
    # As with the discount factor and the learning rate, the epsilon parameter of the agent's epsilon-greedy
    # policy can be updated periodically. This controller has a few more capabilities: it lets one choose more
    # precisely when to update epsilon (after every X actions, episodes or epochs), and epsilon can also be
    # reset every episode or epoch, or never (hence the resetEvery='none').
    agent.attach(bc.EpsilonController(
        initialE=parameters.epsilon_start, 
        eDecays=parameters.epsilon_decay, 
        eMin=parameters.epsilon_min,
        evaluateOn='action',
        periodicity=1,
        resetEvery='none'))

Example No. 3
        k=parameters.k if hasattr(parameters, 'k') else 0,
        learn_representation=parameters.learn_representation,
        reward_type=parameters.reward_type,
        train_q=train_q,
    ))
    # At the end of every epoch, the learning rate can be modified using a LearningRateController. Here we
    # want to update the learning rate after every training epoch (periodicity=1), according to the given parameters.
    agent.attach(bc.LearningRateController(
        initial_learning_rate=parameters.learning_rate,
        learning_rate_decay=parameters.learning_rate_decay,
        periodicity=1))

    # Same for the discount factor.
    agent.attach(bc.DiscountFactorController(
        initial_discount_factor=parameters.discount,
        discount_factor_growth=parameters.discount_inc,
        discount_factor_max=parameters.discount_max,
        evaluate_on='action',
        periodicity=1))

    # During training epochs, we want to train the agent after every [parameters.update_frequency] actions it takes.
    # We also want to display, after each training episode (not after every training step), the average Bellman
    # residual and the average of the V values obtained during the last episode, hence the last two arguments.
    agent.attach(bc.TrainerController(
        evaluate_on='action',
        periodicity=parameters.update_frequency,
        show_episode_avg_V_value=True,
        show_avg_Bellman_residual=True))

    agent.run(parameters.epochs, parameters.steps_per_epoch, break_on_done=True)
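
For scale, a back-of-the-envelope sketch (hypothetical numbers, not from the example) of how many training updates a run like this triggers, given that the TrainerController fires every update_frequency actions:

    epochs = 10               # hypothetical
    steps_per_epoch = 1000    # hypothetical
    update_frequency = 4      # hypothetical
    total_actions = epochs * steps_per_epoch
    training_updates = total_actions // update_frequency
    print(training_updates)   # 2500 network updates over the whole run
                              # (fewer if episodes end early, since break_on_done=True)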

Example No. 4
    def __init__(self,
                 inputDims,
                 q_network,
                 actions,
                 file='',
                 replay_memory_size=Parameters.REPLAY_MEMORY_SIZE,
                 replay_start_size=Parameters.BATCH_SIZE,
                 batch_size=Parameters.BATCH_SIZE,
                 random_state=np.random.RandomState(),
                 exp_priority=0,
                 batch_type='sequential',
                 train_policy=None,
                 test_policy=None,
                 only_full_history=True,
                 reward_as_input=False):

        CompleteLearner.__init__(self, actions, file)
        self.polfile = open(self.file + 'policy.txt', "w")
        # if replay_start_size == None:
        #     replay_start_size = max(inputDims[i][0] for i in range(len(inputDims)))
        # elif replay_start_size < max(inputDims[i][0] for i in range(len(inputDims))):
        #     raise AgentError("Replay_start_size should be greater than the biggest history of a state.")

        self._controllers = []

        # --- Bind controllers to the agent ---
        # For comments, please refer to run_toy_env.py
        self.attach(bc.VerboseController(evaluate_on='epoch', periodicity=1))

        self.attach(
            bc.TrainerController(evaluate_on='action',
                                 periodicity=Parameters.UPDATE_FREQUENCY,
                                 show_episode_avg_V_value=True,
                                 show_avg_Bellman_residual=True))

        self.attach(
            bc.LearningRateController(
                initial_learning_rate=Parameters.LEARNING_RATE,
                learning_rate_decay=Parameters.LEARNING_RATE_DECAY,
                periodicity=10000))

        self.attach(
            bc.DiscountFactorController(
                initial_discount_factor=Parameters.DISCOUNT,
                discount_factor_growth=Parameters.DISCOUNT_INC,
                discount_factor_max=Parameters.DISCOUNT_MAX,
                periodicity=10000))

        self.attach(
            bc.EpsilonController(initial_e=Parameters.EPSILON_START,
                                 e_decays=Parameters.EPSILON_DECAY,
                                 e_min=Parameters.EPSILON_MIN,
                                 evaluate_on='action',
                                 periodicity=1000,
                                 reset_every='none'))

        # self.attach(bc.InterleavedTestEpochController(
        #     id=0,
        #     epoch_length=Parameters.STEPS_PER_TEST,
        #     controllers_to_disable=[0, 1, 2, 3, 4],
        #     periodicity=2,
        #     show_score=True,
        #     summarize_every=-1))

        self.obs = []
        self.reward_as_input = reward_as_input
        self._network = q_network
        self._replay_memory_size = replay_memory_size
        self._replay_start_size = replay_start_size  # make sure you gather this many observations before learning
        self._batch_size = batch_size
        self._batch_type = batch_type
        self._random_state = random_state
        self._exp_priority = exp_priority
        self._only_full_history = only_full_history
        # DataSet signature: inputDimensions, n_actions, observation_type=np.float32,
        #                    random_state=None, max_size=1000, batch_type='random', only_full_history=True
        self._dataset = DataSet(inputDimensions=inputDims,
                                n_actions=len(actions),
                                max_size=replay_memory_size,
                                random_state=random_state,
                                batch_type=self._batch_type,
                                only_full_history=self._only_full_history)
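        # Illustration (not part of the original code): the DataSet acts as the replay memory.
        # It keeps up to replay_memory_size transitions; once at least replay_start_size
        # observations have been gathered, minibatches of batch_size transitions are drawn
        # from it (here with batch_type='sequential') for each training update.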
        self._tmp_dataset = None  # Will be created by startTesting() when necessary
        self._mode = -1
        self._mode_epochs_length = 0
        self._total_mode_reward = 0
        self._training_loss_averages = []
        self._Vs_on_last_episode = []
        #self._in_episode = False

        self._selected_action = -1
        self._state = []
        for i in range(len(inputDims)):
            self._state.append(np.zeros(inputDims[i], dtype=config.floatX))
        if train_policy is None:
            self._train_policy = EpsilonGreedyPolicy(q_network, len(actions),
                                                     random_state, 0.05)
        else:
            self._train_policy = train_policy
        if test_policy is None:
            self._test_policy = EpsilonGreedyPolicy(q_network, len(actions),
                                                    random_state, 0.)
        else:
            self._test_policy = test_policy

        self.initEpisode()
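
The two default policies above differ only in their epsilon: 0.05 during training (a little residual exploration) and 0. at test time (purely greedy). A minimal standalone sketch of epsilon-greedy action selection, independent of deer's EpsilonGreedyPolicy class:

    import numpy as np

    def epsilon_greedy(q_values, epsilon, rng=np.random):
        # With probability epsilon pick a uniformly random action,
        # otherwise pick the action with the highest Q-value.
        if rng.rand() < epsilon:
            return rng.randint(len(q_values))
        return int(np.argmax(q_values))

    # epsilon_greedy([0.1, 0.7, 0.3], epsilon=0.05)  -> index 1 most of the time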