Example #1
    def __init__(self,
                 env: Environment,
                 model: RQModelBase,
                 memory_capacity=1000000,
                 mini_batch_size=32,
                 discount_factor=0.96,
                 epsilon_start=1.0,
                 epsilon_end=0.05,
                 epsilon_decay_over_steps=1000000,
                 mean_reward_for_episodes=100,
                 transfer_target_steps=1000):
        super(DunStrategy, self).__init__(env, model)
        self.env = env
        self.model = model
        self.replay_memory = DqnReplayMemory(memory_capacity,
                                             state_shape=env.state_shape,
                                             state_dtype=np.float32,
                                             action_dtype=np.uint16)

        self.transfer_target_steps = transfer_target_steps
        self.mini_batch_size = mini_batch_size
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay_over_steps = epsilon_decay_over_steps
        self.epsilon = epsilon_start
        self.discount_factor = discount_factor

        # Mean reward for last N episodes
        self.mean_reward = IncrementalMean(mean_reward_for_episodes)
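The constructor above only stores the exploration schedule (epsilon_start, epsilon_end, epsilon_decay_over_steps); the decay itself is applied elsewhere in the strategy. A minimal sketch of the linear annealing this implies, using a hypothetical free-standing helper rather than anything taken from the class:

def linear_epsilon(step, start=1.0, end=0.05, decay_steps=1_000_000):
    # Linearly anneal epsilon from `start` toward `end` over `decay_steps`
    # environment steps, then hold it at `end`.
    fraction = min(step / decay_steps, 1.0)
    return start + fraction * (end - start)

# linear_epsilon(0) gives 1.0; by step 1_000_000 and beyond it stays near 0.05.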
Example #2
def train_irl(sess: tf.Session, model: RQModelBase, replay: StateActionReplay,
              epochs: int):
    avg_delta_loss = IncrementalMean(50)
    loss = 0
    for epoch in range(epochs):
        if len(replay) > IRL_BATCH_SIZE:
            states, actions = replay.sample(IRL_BATCH_SIZE)
            new_loss = model.train_r(sess, states, actions)
            avg_delta_loss.add(new_loss - loss)
            loss = new_loss
            print('IRL: Epoch: {0}/{1} Loss: {2:.3f} AvgLossDelta: {3:.3f}'.
                  format(epoch, epochs, loss, avg_delta_loss.value))
            if avg_delta_loss.value < IRL_LOSS_DELTA_STOP:
                print('No significant change in loss, stopping training')
                return
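IncrementalMean(50) here acts as a running mean of the last 50 loss deltas and drives the early-stopping check against IRL_LOSS_DELTA_STOP. The class itself is project-specific and not shown in these excerpts; a minimal stand-in with the same add()/.value interface, assuming a fixed-length window, could look like this:

from collections import deque

class WindowedMean:
    # Stand-in for IncrementalMean: mean over the last `capacity` added values
    # (or over all values added so far when capacity is None).
    def __init__(self, capacity=None):
        self._values = deque(maxlen=capacity)

    def add(self, value):
        self._values.append(value)

    @property
    def value(self):
        return sum(self._values) / len(self._values) if self._values else 0.0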
Example #3
    def __init__(self,
                 env: Environment,
                 model: RQModelBase,
                 memory_capacity=100000,
                 discount_factor=0.96,
                 batch_size=64,
                 epsilon=0.5):
        super(DunMcStrategy, self).__init__(env, model)
        self.replay = GenericMemory(memory_capacity, [
            ('state', np.float32, env.state_shape),
            ('q', np.float32, env.num_actions),
        ])
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.mean_reward = IncrementalMean()
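GenericMemory is constructed from (name, dtype, shape) field specifications, which suggests a buffer backed by a NumPy structured array. It is not defined in these excerpts; the following is a sketch of one plausible implementation of that interface, not the project's actual class:

import numpy as np

class StructuredRingBuffer:
    # Sketch of a GenericMemory-like buffer: fixed capacity, one structured-array
    # row per entry, oldest entries overwritten once the buffer is full.
    def __init__(self, capacity, fields):
        self.fields = [name for name, _, _ in fields]
        self.buffer = np.zeros(capacity, dtype=fields)
        self.capacity = capacity
        self.size = 0
        self.cursor = 0

    def append(self, *values):
        self.buffer[self.cursor] = values
        self.cursor = (self.cursor + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size):
        # Assumes at least one entry has been appended.
        idx = np.random.randint(0, self.size, size=batch_size)
        return tuple(self.buffer[name][idx] for name in self.fields)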
Example #4
    def __init__(self,
                 env: Environment,
                 model: RQModelBase,
                 memory_capacity=10000,
                 discount_factor=0.96,
                 batch_size=32,
                 epsilon=0.5):
        super(DunBroadStrategy, self).__init__(env, model)
        self.replay = GenericMemory(memory_capacity, [
            ('state', np.float32, env.state_shape),
            ('u', np.float32, env.num_actions),
        ])
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.batch_size = batch_size

        self.actions_to_sample = min(env.num_actions, batch_size)
        self.next_states = np.zeros(
            (self.actions_to_sample, ) + env.state_shape, dtype=np.float32)
        self.mean_reward = IncrementalMean(100)
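The preallocated next_states buffer has shape (actions_to_sample,) + env.state_shape, so the candidate next states for up to batch_size actions can be written into it and evaluated in a single forward pass instead of being reallocated every step. A small standalone illustration of that shape arithmetic, with an assumed state shape that is not taken from the source:

import numpy as np

state_shape = (84, 84, 4)          # assumed example shape
num_actions, batch_size = 6, 32
actions_to_sample = min(num_actions, batch_size)
next_states = np.zeros((actions_to_sample,) + state_shape, dtype=np.float32)
print(next_states.shape)           # (6, 84, 84, 4)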
Example #5
    def __init__(self,
                 env: gym.Env,
                 ppo: ProximalPolicyOptimization,
                 num_steps: int,
                 logger: TfLogger = None):
        self.rollout = GenericMemory(
            num_steps,
            [
                # ('observations', np.float32, env.state_shape),
                ('observations', np.float32, env.observation_space.shape),
                ('actions', np.float32, env.action_space.shape),
                ('rewards', np.float32, ()),
                ('values', np.float32, ()),
                ('next_is_terminal', np.bool_, ())
            ])
        self.env = env
        self.ppo = ppo
        self.observation = env.reset()
        self.num_steps = num_steps
        self.running_reward = IncrementalMean(20)
        self.episode_reward = 0.0
        self.episode = 0
        self.logger = logger
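This constructor only prepares the rollout buffer and episode counters; the step-collection loop lives elsewhere. A sketch of what such a loop could look like, assuming the old gym step API used above (env.reset() returning an observation, env.step() returning a 4-tuple) and a stand-in act_fn(observation) -> (action, value) in place of whatever ProximalPolicyOptimization actually exposes:

def collect_rollout(env, act_fn, rollout, num_steps, observation, episode_reward=0.0):
    # Fill `rollout` with num_steps transitions and return the carried-over
    # observation and running episode reward. `rollout.append(...)` mirrors the
    # field order declared in the constructor above (assumed interface).
    for _ in range(num_steps):
        action, value = act_fn(observation)
        next_observation, reward, done, _ = env.step(action)
        rollout.append(observation, action, reward, value, done)
        episode_reward += reward
        if done:
            observation, episode_reward = env.reset(), 0.0
        else:
            observation = next_observation
    return observation, episode_reward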