show_episode_avg_V_value=True, show_avg_Bellman_residual=True)) # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. agent.attach( bc.LearningRateController( initial_learning_rate=parameters.learning_rate, learning_rate_decay=parameters.learning_rate_decay, periodicity=1)) # Same for the discount factor. agent.attach( bc.DiscountFactorController( initial_discount_factor=parameters.discount, discount_factor_growth=parameters.discount_inc, discount_factor_max=parameters.discount_max, periodicity=1)) # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every # episode or epoch (or never, hence the resetEvery='none'). agent.attach( bc.EpsilonController(initial_e=parameters.epsilon_start, e_decays=parameters.epsilon_decay, e_min=parameters.epsilon_min, evaluate_on='action', periodicity=1, reset_every='none'))
evaluateOn='action', periodicity=parameters.update_frequency, showEpisodeAvgVValue=True, showAvgBellmanResidual=True)) # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. agent.attach(bc.LearningRateController( initialLearningRate=parameters.learning_rate, learningRateDecay=parameters.learning_rate_decay, periodicity=1)) # Same for the discount factor. agent.attach(bc.DiscountFactorController( initialDiscountFactor=parameters.discount, discountFactorGrowth=parameters.discount_inc, discountFactorMax=parameters.discount_max, periodicity=1)) # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every # episode or epoch (or never, hence the resetEvery='none'). agent.attach(bc.EpsilonController( initialE=parameters.epsilon_start, eDecays=parameters.epsilon_decay, eMin=parameters.epsilon_min, evaluateOn='action', periodicity=1, resetEvery='none'))
k=parameters.k if hasattr(parameters, 'k') else 0, learn_representation=parameters.learn_representation, reward_type=parameters.reward_type, train_q=train_q, )) # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. agent.attach(bc.LearningRateController( initial_learning_rate=parameters.learning_rate, learning_rate_decay=parameters.learning_rate_decay, periodicity=1)) # Same for the discount factor. agent.attach(bc.DiscountFactorController( initial_discount_factor=parameters.discount, discount_factor_growth=parameters.discount_inc, discount_factor_max=parameters.discount_max, evaluate_on='action', periodicity=1)) # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. # Plus, we also want to display after each training episode (!= than after every training) the average bellman # residual and the average of the V values obtained during the last episode, hence the two last arguments. agent.attach(bc.TrainerController( evaluate_on='action', periodicity=parameters.update_frequency, show_episode_avg_V_value=True, show_avg_Bellman_residual=True)) agent.run(parameters.epochs, parameters.steps_per_epoch, break_on_done=True)
def __init__(self, inputDims, q_network, actions, file='',
             replay_memory_size=Parameters.REPLAY_MEMORY_SIZE,
             replay_start_size=Parameters.BATCH_SIZE,
             batch_size=Parameters.BATCH_SIZE,
             random_state=None,
             exp_priority=0,
             batch_type='sequential',
             train_policy=None,
             test_policy=None,
             only_full_history=True,
             reward_as_input=False):
    """Set up controllers, replay dataset, state buffers and policies.

    Args:
        inputDims: One entry per observation input. Forwarded to ``DataSet``
            as ``inputDimensions``; each entry is also used as the shape of a
            zero-filled array in ``self._state``.
        q_network: Stored as ``self._network`` and handed to the default
            epsilon-greedy policies.
        actions: Forwarded to ``CompleteLearner.__init__``; ``len(actions)``
            is used as the number of actions.
        file: Forwarded to ``CompleteLearner.__init__``.
        replay_memory_size: Used as ``max_size`` of the replay ``DataSet``.
        replay_start_size: Number of observations to gather before learning
            starts (stored only; not enforced in this method).
        batch_size: Stored as ``self._batch_size``.
        random_state: Random generator shared by the dataset and the default
            policies. ``None`` (the default) creates a fresh
            ``np.random.RandomState()`` for this instance.
        exp_priority: Stored as ``self._exp_priority``.
        batch_type: Forwarded to ``DataSet``.
        train_policy: Policy used for training. ``None`` selects an
            ``EpsilonGreedyPolicy`` with epsilon 0.05.
        test_policy: Policy used for testing. ``None`` selects an
            ``EpsilonGreedyPolicy`` with epsilon 0.
        only_full_history: Forwarded to ``DataSet``.
        reward_as_input: Stored as ``self.reward_as_input``.
    """
    # A RandomState built in the signature is created once, at definition
    # time, and would be silently shared by every agent that relies on the
    # default. Build one per instance instead.
    if random_state is None:
        random_state = np.random.RandomState()

    CompleteLearner.__init__(self, actions, file)
    # NOTE(review): self.file is presumably set by CompleteLearner.__init__ --
    # confirm. The handle stays open on the instance; it is not closed here.
    self.polfile = open(self.file + 'policy.txt', "w")

    # Disabled sanity check on replay_start_size, kept for reference:
    # if replay_start_size == None:
    #     replay_start_size = max(inputDims[i][0] for i in range(len(inputDims)))
    # elif replay_start_size < max(inputDims[i][0] for i in range(len(inputDims))):
    #     raise AgentError("Replay_start_size should be greater than the biggest history of a state.")

    self._controllers = []

    # --- Bind controllers to the agent ---
    # For comments, please refer to run_toy_env.py
    self.attach(bc.VerboseController(evaluate_on='epoch', periodicity=1))
    self.attach(
        bc.TrainerController(
            evaluate_on='action',
            periodicity=Parameters.UPDATE_FREQUENCY,
            show_episode_avg_V_value=True,
            show_avg_Bellman_residual=True))
    self.attach(
        bc.LearningRateController(
            initial_learning_rate=Parameters.LEARNING_RATE,
            learning_rate_decay=Parameters.LEARNING_RATE_DECAY,
            periodicity=10000))
    self.attach(
        bc.DiscountFactorController(
            initial_discount_factor=Parameters.DISCOUNT,
            discount_factor_growth=Parameters.DISCOUNT_INC,
            discount_factor_max=Parameters.DISCOUNT_MAX,
            periodicity=10000))
    self.attach(
        bc.EpsilonController(
            initial_e=Parameters.EPSILON_START,
            e_decays=Parameters.EPSILON_DECAY,
            e_min=Parameters.EPSILON_MIN,
            evaluate_on='action',
            periodicity=1000,
            reset_every='none'))
    # Disabled interleaved test-epoch controller, kept for reference:
    # self.attach(bc.InterleavedTestEpochController(
    #     id=0,
    #     epoch_length=Parameters.STEPS_PER_TEST,
    #     controllers_to_disable=[0, 1, 2, 3, 4],
    #     periodicity=2,
    #     show_score=True,
    #     summarize_every=-1))

    self.obs = []
    self.reward_as_input = reward_as_input
    self._network = q_network
    self._replay_memory_size = replay_memory_size
    # Make sure you gather this many observations before learning.
    self._replay_start_size = replay_start_size
    self._batch_size = batch_size
    self._batch_type = batch_type
    self._random_state = random_state
    self._exp_priority = exp_priority
    self._only_full_history = only_full_history

    n_actions = len(actions)
    self._dataset = DataSet(
        inputDimensions=inputDims,
        n_actions=n_actions,
        max_size=replay_memory_size,
        random_state=random_state,
        batch_type=self._batch_type,
        only_full_history=self._only_full_history)
    self._tmp_dataset = None  # Will be created by startTesting() when necessary
    self._mode = -1
    self._mode_epochs_length = 0
    self._total_mode_reward = 0
    self._training_loss_averages = []
    self._Vs_on_last_episode = []
    self._selected_action = -1

    # One zero-initialised buffer per observation input.
    self._state = [np.zeros(dims, dtype=config.floatX) for dims in inputDims]

    # Fall back to epsilon-greedy policies when none are supplied:
    # epsilon = 0.05 while training, fully greedy (epsilon = 0) while testing.
    if train_policy is None:
        train_policy = EpsilonGreedyPolicy(q_network, n_actions, random_state, 0.05)
    self._train_policy = train_policy

    if test_policy is None:
        test_policy = EpsilonGreedyPolicy(q_network, n_actions, random_state, 0.)
    self._test_policy = test_policy

    self.initEpisode()