def __init__(self, env: Environment, model: RQModelBase,
             memory_capacity=1000000,
             mini_batch_size=32,
             discount_factor=0.96,
             epsilon_start=1.0,
             epsilon_end=0.05,
             epsilon_decay_over_steps=1000000,
             mean_reward_for_episodes=100,
             transfer_target_steps=1000):
    super(DunStrategy, self).__init__(env, model)
    self.env = env
    self.model = model
    # Experience replay buffer for off-policy DQN updates
    self.replay_memory = DqnReplayMemory(memory_capacity,
                                         state_shape=env.state_shape,
                                         state_dtype=np.float32,
                                         action_dtype=np.uint16)
    # Copy online-network weights into the target network every N steps
    self.transfer_target_steps = transfer_target_steps
    self.mini_batch_size = mini_batch_size
    # Linear epsilon-greedy exploration schedule
    self.epsilon_start = epsilon_start
    self.epsilon_end = epsilon_end
    self.epsilon_decay_over_steps = epsilon_decay_over_steps
    self.epsilon = epsilon_start
    self.discount_factor = discount_factor
    # Mean reward for last N episodes
    self.mean_reward = IncrementalMean(mean_reward_for_episodes)
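# A minimal sketch of the linear epsilon decay the fields above imply; the
# method name `update_epsilon` and the external `step` counter are
# illustrative assumptions, not part of the original class.
def update_epsilon(self, step: int):
    # Interpolate from epsilon_start down to epsilon_end over
    # epsilon_decay_over_steps steps, then hold at epsilon_end.
    fraction = min(step / float(self.epsilon_decay_over_steps), 1.0)
    self.epsilon = self.epsilon_start + fraction * (self.epsilon_end - self.epsilon_start)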
def train_irl(sess: tf.Session, model: RQModelBase, replay: StateActionReplay, epochs: int):
    avg_delta_loss = IncrementalMean(50)
    loss = 0.0
    for epoch in range(epochs):
        if len(replay) > IRL_BATCH_SIZE:
            states, actions = replay.sample(IRL_BATCH_SIZE)
            new_loss = model.train_r(sess, states, actions)
            avg_delta_loss.add(new_loss - loss)
            loss = new_loss
            print('IRL: Epoch: {0}/{1} Loss: {2:.3f} AvgLossDelta: {3:.3f}'
                  .format(epoch, epochs, loss, avg_delta_loss.value))
            # Stop early once the average change in loss is negligible in
            # magnitude (the loss delta is negative while the loss decreases)
            if abs(avg_delta_loss.value) < IRL_LOSS_DELTA_STOP:
                print('No significant change in loss, stopping training')
                return
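# For reference, a plausible sketch of the `IncrementalMean` helper used
# throughout this module, assuming it tracks a running mean over the last
# `size` values (unbounded when size is None). This is inferred from usage,
# not the repository's actual implementation.
from collections import deque

class IncrementalMean:
    def __init__(self, size=None):
        self.values = deque(maxlen=size)

    def add(self, value):
        self.values.append(value)

    @property
    def value(self):
        return sum(self.values) / len(self.values) if self.values else 0.0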
def __init__(self, env: Environment, model: RQModelBase,
             memory_capacity=100000,
             discount_factor=0.96,
             batch_size=64,
             epsilon=0.5):
    super(DunMcStrategy, self).__init__(env, model)
    # Stores states together with full Monte Carlo Q-value targets
    self.replay = GenericMemory(memory_capacity, [
        ('state', np.float32, env.state_shape),
        ('q', np.float32, env.num_actions),
    ])
    self.discount_factor = discount_factor
    self.epsilon = epsilon
    self.batch_size = batch_size
    # Mean reward over all episodes so far (no window)
    self.mean_reward = IncrementalMean()
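# A short sketch of how discounted Monte Carlo returns could be computed for
# a finished episode before being written into the ('state', 'q') memory
# above; `compute_returns` is an illustrative helper, not part of the class.
def compute_returns(episode_rewards, discount_factor):
    # Backward recursion: G_t = r_t + gamma * G_{t+1}
    returns = np.zeros(len(episode_rewards), dtype=np.float32)
    g = 0.0
    for t in reversed(range(len(episode_rewards))):
        g = episode_rewards[t] + discount_factor * g
        returns[t] = g
    return returns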
def __init__(self, env: Environment, model: RQModelBase,
             memory_capacity=10000,
             discount_factor=0.96,
             batch_size=32,
             epsilon=0.5):
    super(DunBroadStrategy, self).__init__(env, model)
    # Stores states together with U-value targets for every action
    self.replay = GenericMemory(memory_capacity, [
        ('state', np.float32, env.state_shape),
        ('u', np.float32, env.num_actions),
    ])
    self.discount_factor = discount_factor
    self.epsilon = epsilon
    self.batch_size = batch_size
    # Evaluate at most batch_size candidate actions per state
    self.actions_to_sample = min(env.num_actions, batch_size)
    # Preallocated buffer holding the successor state of each sampled action
    self.next_states = np.zeros((self.actions_to_sample,) + env.state_shape,
                                dtype=np.float32)
    self.mean_reward = IncrementalMean(100)
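# Hedged sketch of how the preallocated `next_states` buffer might be used:
# sample candidate actions, collect each action's successor state, and score
# all successors in one batched forward pass. `env.simulate` and
# `model.predict_u` are assumed interfaces for illustration only.
def sample_action_values(self, sess, state):
    actions = np.random.choice(self.env.num_actions,
                               self.actions_to_sample, replace=False)
    rewards = np.zeros(self.actions_to_sample, dtype=np.float32)
    for i, action in enumerate(actions):
        self.next_states[i], rewards[i] = self.env.simulate(state, action)
    u_values = self.model.predict_u(sess, self.next_states)  # assumed API
    return actions, rewards + self.discount_factor * u_values.max(axis=1)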
def __init__(self, env: gym.Env, ppo: ProximalPolicyOptimization,
             num_steps: int, logger: TfLogger = None):
    # Fixed-size rollout buffer; one record per environment step
    self.rollout = GenericMemory(num_steps, [
        ('observations', np.float32, env.observation_space.shape),
        ('actions', np.float32, env.action_space.shape),
        ('rewards', np.float32, ()),
        ('values', np.float32, ()),
        ('next_is_terminal', np.bool_, ()),
    ])
    self.env = env
    self.ppo = ppo
    self.observation = env.reset()
    self.num_steps = num_steps
    # Running mean of episode rewards over the last 20 episodes
    self.running_reward = IncrementalMean(20)
    self.episode_reward = 0.0
    self.episode = 0
    self.logger = logger
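# Minimal sketch of a rollout-collection loop consistent with the buffer
# layout above; `ppo.act` returning (action, value) and `rollout.append` are
# assumed interfaces, not confirmed APIs of this repository.
def collect(self):
    for _ in range(self.num_steps):
        action, value = self.ppo.act(self.observation)  # assumed API
        next_observation, reward, done, _ = self.env.step(action)
        self.rollout.append(self.observation, action, reward, value, done)
        self.episode_reward += reward
        if done:
            self.running_reward.add(self.episode_reward)
            self.episode_reward = 0.0
            self.episode += 1
            next_observation = self.env.reset()
        self.observation = next_observation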