Exemplo n.º 1
0
    def __init__(self, config):
        """Wire up the schedules, simulator and replay buffer from ``config``.

        Two ``LinearSchedule`` objects are created from the ``eps_*`` and
        ``lr_*`` fields of ``config``; a ``TrafficSimulator`` and a
        ``ReplayBuffer`` are constructed; the action function is cached and
        ``self.build()`` is invoked last.
        """
        cfg = config
        self._config = cfg

        # Schedules for epsilon and for the learning rate.
        self._eps_schedule = LinearSchedule(
            cfg.eps_begin, cfg.eps_end, cfg.nsteps)
        self._lr_schedule = LinearSchedule(
            cfg.lr_begin, cfg.lr_end, cfg.lr_nsteps)

        # Environment simulator and experience storage.
        self._sim = TrafficSimulator(config)
        self._bf = ReplayBuffer(10000, config)

        # The action function must be cached before build() runs.
        self._action_fn = self.get_action_fn()
        self.build()
Exemplo n.º 2
0
    def __init__(self, config):
        """Wire up the schedules, queues and replay buffer from ``config``.

        Two ``LinearSchedule`` objects are created from the ``eps_*`` and
        ``lr_*`` fields of ``config``; an ``Order_Queue`` and a
        ``Message_Queue`` are opened from ``order_path`` / ``message_path``;
        a ``ReplayBuffer`` is constructed; the action function is cached and
        ``self.build()`` is invoked last.
        """
        cfg = config
        self._config = cfg

        # Schedules for epsilon and for the learning rate.
        self._eps_schedule = LinearSchedule(
            cfg.eps_begin, cfg.eps_end, cfg.nsteps)
        self._lr_schedule = LinearSchedule(
            cfg.lr_begin, cfg.lr_end, cfg.lr_nsteps)

        # Data sources and experience storage.
        self._oq = Order_Queue(cfg.order_path)
        self._mq = Message_Queue(cfg.message_path)
        self._bf = ReplayBuffer(1000000, config)

        # The action function must be cached before build() runs.
        self._action_fn = self.get_action_fn()
        self.build()
Exemplo n.º 3
0
def purge_round():
    """Evaluate every saved leader against the others and retire the weakest.

    Each checkpoint found in ``LEADER_DIR`` is loaded into a fresh
    ``DQNAgent``.  Every candidate is then scored with ``evaluate`` against
    an ``EnsembleDQNAgent`` built from all the *other* loaded candidates.
    Candidates are ranked by score, highest first; everything after the first
    ``NUM_LEADERS`` entries is moved from ``LEADER_DIR`` to ``GRAVEYARD_DIR``.

    An empty ``LEADER_DIR`` is handled gracefully: nothing is evaluated and
    nothing is moved (the previous ``zip(*...)`` unpacking raised
    ``ValueError`` in that case).
    """
    candidates = []  # list[(filename, agent)], in directory-listing order

    # Load in all of the leaders
    for leader_checkpoint in os.listdir(LEADER_DIR):
        path = os.path.join(LEADER_DIR, leader_checkpoint)
        candidate_leader = try_gpu(
            DQNAgent(6,
                     LinearSchedule(0.05, 0.05, 1),
                     OBSERVATION_MODE,
                     lr=LR,
                     max_grad_norm=GRAD_CLIP_NORM,
                     name=leader_checkpoint))
        # map_location keeps the tensors on the storage they were
        # deserialized to instead of the device recorded in the checkpoint.
        candidate_leader.load_state_dict(
            torch.load(path, map_location=lambda storage, loc: storage))
        candidates.append((leader_checkpoint, candidate_leader))

    # Kept as a tuple so the ensemble receives the same container type as
    # before.
    candidate_leaders = tuple(agent for _, agent in candidates)

    candidate_scores = []  # list[(filename, score)]
    for i, (filename, candidate_leader) in enumerate(candidates):
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print("EVALUATING {}".format(candidate_leader.name))
        # Opponents are all candidates except the one being evaluated.
        leaders = EnsembleDQNAgent(candidate_leaders[:i] +
                                   candidate_leaders[i + 1:])
        candidate_scores.append((filename,
                                 evaluate(candidate_leader, leaders,
                                          EPISODES_EVALUATE_PURGE)))
    sorted_scores = sorted(candidate_scores, key=lambda x: x[1], reverse=True)

    print("SCORES: {}".format(sorted_scores))
    for filename, score in sorted_scores[NUM_LEADERS:]:
        print("PURGING ({}, {})".format(filename, score))
        leader_path = os.path.join(LEADER_DIR, filename)
        graveyard_path = os.path.join(GRAVEYARD_DIR, filename)
        os.rename(leader_path, graveyard_path)
Exemplo n.º 4
0
 def __init__(self,
              total_timesteps=100000,
              buffer_size=50000,
              type_buffer="HER",
              prioritized_replay=True,
              prioritized_replay_alpha=0.6,
              prioritized_replay_beta0=0.4,
              prioritized_replay_beta_iters=None,
              prioritized_replay_eps=1e-6):
     """Create the replay buffer and, if prioritized, its beta schedule.

     Args:
         total_timesteps: used as the beta schedule length when
             ``prioritized_replay_beta_iters`` is None.
         buffer_size: passed as the first argument to the buffer
             constructor and stored as ``self.buffer_size``.
         type_buffer: ``"PER"`` selects ``PrioritizedReplayBuffer``,
             ``"HER"`` selects ``HighlightReplayBuffer``.  Only consulted
             when ``prioritized_replay`` is true.
         prioritized_replay: when false a plain ``ReplayBuffer`` is used
             and ``self.beta_schedule`` is None.
         prioritized_replay_alpha: ``alpha`` passed to the prioritized
             buffer constructor.
         prioritized_replay_beta0: initial value of the beta schedule
             (which ends at 1.0).
         prioritized_replay_beta_iters: length of the beta schedule;
             defaults to ``total_timesteps``.
         prioritized_replay_eps: stored as ``self.prioritized_replay_eps``.

     Raises:
         ValueError: if ``prioritized_replay`` is true and ``type_buffer``
             is neither ``"PER"`` nor ``"HER"``.
     """
     self.buffer_size = buffer_size
     self.prioritized_replay_eps = prioritized_replay_eps
     self.type_buffer = type_buffer
     if prioritized_replay:
         if type_buffer == "PER":
             self.replay_buffer = PrioritizedReplayBuffer(
                 buffer_size, alpha=prioritized_replay_alpha)
         elif type_buffer == "HER":
             self.replay_buffer = HighlightReplayBuffer(
                 buffer_size, alpha=prioritized_replay_alpha)
         else:
             # Previously this fell through and left self.replay_buffer
             # undefined, surfacing later as an AttributeError.
             raise ValueError(
                 "Unsupported type_buffer {!r}; expected 'PER' or 'HER'".
                 format(type_buffer))
         if prioritized_replay_beta_iters is None:
             prioritized_replay_beta_iters = total_timesteps
         self.beta_schedule = LinearSchedule(
             prioritized_replay_beta_iters,
             initial_p=prioritized_replay_beta0,
             final_p=1.0)
     else:
         self.replay_buffer = ReplayBuffer(buffer_size)
         self.beta_schedule = None
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 seed,
                 fc1=400,
                 fc2=300,
                 update_times=10,
                 weight_decay=1.e-5):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents; one noise process is
                created per agent
            seed (int): random seed, applied to ``random`` and
                ``numpy.random`` and forwarded to the networks and the
                replay buffer
            fc1, fc2: forwarded to the ``Actor``/``Critic`` constructors
                (presumably hidden-layer sizes -- confirm against those
                classes)
            update_times (int): stored as ``self.update_times``
            weight_decay (float): weight decay of the critic optimizer
        """
        self.state_size = state_size
        self.action_size = action_size
        # NOTE: random.seed() and np.random.seed() return None, so
        # self.seed and self.n_seed are always None; the assignments are
        # kept so the attributes continue to exist.
        self.seed = random.seed(seed)
        self.num_agents = num_agents
        self.n_seed = np.random.seed(seed)
        self.update_times = update_times
        self.n_step = 0

        # One independent exploration-noise process per agent.
        self.noise = [
            rm.OrnsteinUhlenbeckProcess(size=(action_size, ),
                                        std=LinearSchedule(0.2))
            for _ in range(num_agents)
        ]

        # critic local and target network (Q-Learning)
        self.critic_local = Critic(state_size, action_size, fc1, fc2,
                                   seed).to(device)

        self.critic_target = Critic(state_size, action_size, fc1, fc2,
                                    seed).to(device)
        # Start the target network as an exact copy of the local one.
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # actor local and target network (Policy gradient)
        self.actor_local = Actor(state_size, action_size, fc1, fc2,
                                 seed).to(device)
        self.actor_target = Actor(state_size, action_size, fc1, fc2,
                                  seed).to(device)
        self.actor_target.load_state_dict(self.actor_local.state_dict())

        # optimizer for critic and actor network
        # The caller-supplied weight_decay is honoured here; it used to be
        # ignored in favour of a hard-coded 1.e-5 (which is the default).
        self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                           lr=CRITIC_LR,
                                           weight_decay=weight_decay)
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                          lr=ACTOR_LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.a_step = 0
Exemplo n.º 6
0
    def __init__(self, size=5000, epsilon=0.01,
                 alpha=0.6, beta=0.4, num_steps=int(1e6)):
        """Set up the prioritized buffer state.

        The parent class is initialised with ``size``; ``beta`` becomes a
        ``LinearSchedule`` running from the given value to 1.0 over
        ``num_steps``; ``probs`` is a deque bounded by ``size``; and the
        running maximum probability starts at ``epsilon``.
        """
        super(PrioritizedReplayBuffer, self).__init__(size=size)

        # Plain hyper-parameters.
        self.size, self.epsilon, self.alpha = size, epsilon, alpha

        # beta is exposed as a schedule object rather than a number.
        beta_schedule = LinearSchedule(min_val=beta,
                                       max_val=1.0,
                                       num_steps=num_steps)
        self.beta = beta_schedule

        self.probs = deque(maxlen=size)
        self._max_prob = epsilon
Exemplo n.º 7
0
def main(num_epochs):
    """Run ``dqn_learing`` for ``num_epochs`` with the module-level settings.

    The exploration schedule is ``LinearSchedule(300000, 0.1)``; all other
    hyper-parameters come from module-level constants.
    """
    eps_schedule = LinearSchedule(300000, 0.1)

    dqn_learing(dataLoader=videoLoader,
                num_epochs=num_epochs,
                feature_size=2048,
                num_classes=101,
                r_p=0.01,
                q_func=QNet,
                exploration=eps_schedule,
                replay_buffer_size=REPLAY_BUFFER_SIZE,
                batch_size=BATCH_SIZE,
                gamma=GAMMA,
                learning_starts=LEARNING_STARTS,
                learning_freq=LEARNING_FREQ,
                frame_history_len=FRAME_HISTORY_LEN,
                target_update_freq=TARGER_UPDATE_FREQ)
Exemplo n.º 8
0
def train_maze(output_path):
    """Train a ``NatureQN`` model on ``EnvMaze`` and return its results.

    Args:
        output_path: passed to ``Config.set_paths``.

    Returns:
        The pair ``(evaluation_result_list, oos_evalution_result_list)``
        unpacked from ``model.run``.
    """
    cfg = Config()
    cfg.set_paths(output_path)

    maze = EnvMaze(n=cfg.maze_size, hard=cfg.hard)

    # Exploration and learning-rate schedules.
    exploration = LinearExploration(
        maze, cfg.eps_begin, cfg.eps_end, cfg.eps_nsteps, cfg.env_name)
    learning_rate = LinearSchedule(
        cfg.lr_begin, cfg.lr_end, cfg.lr_nsteps, cfg.env_name)

    # Build the model and train it.
    print(cfg.output_path)
    agent = NatureQN(maze, cfg)
    agent.bfs_len = maze.get_bfs_length()
    in_sample, out_of_sample = agent.run(exploration, learning_rate)
    return in_sample, out_of_sample
Exemplo n.º 9
0
 def __init__(self,
              total_timesteps=100000,
              buffer_size=50000,
              prioritized_replay=False,
              prioritized_replay_alpha=0.6,
              prioritized_replay_beta0=0.4,
              prioritized_replay_beta_iters=None,
              prioritized_replay_eps=1e-6):
     """Create the replay buffer and, if prioritized, its beta schedule.

     Args:
         total_timesteps: used as the beta schedule length when
             ``prioritized_replay_beta_iters`` is None.
         buffer_size: passed as the first argument to the buffer
             constructor and stored as ``self.buffer_size``.
         prioritized_replay: when true a ``PrioritizedReplayBuffer`` is
             used, otherwise a plain ``ReplayBuffer``.
         prioritized_replay_alpha: ``alpha`` passed to the prioritized
             buffer constructor.
         prioritized_replay_beta0: initial value of the beta schedule
             (which ends at 1.0).
         prioritized_replay_beta_iters: length of the beta schedule;
             defaults to ``total_timesteps``.
         prioritized_replay_eps: stored as ``self.prioritized_replay_eps``.
     """
     self.buffer_size = buffer_size
     # Previously accepted but dropped; kept so later code can read it.
     self.prioritized_replay_eps = prioritized_replay_eps
     if prioritized_replay:
         self.replay_buffer = PrioritizedReplayBuffer(
             buffer_size, alpha=prioritized_replay_alpha)
         if prioritized_replay_beta_iters is None:
             prioritized_replay_beta_iters = total_timesteps
         beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                        initial_p=prioritized_replay_beta0,
                                        final_p=1.0)
     else:
         self.replay_buffer = ReplayBuffer(buffer_size)
         beta_schedule = None
     # The schedule used to live only in a local variable and was lost when
     # __init__ returned; expose it on the instance.
     self.beta_schedule = beta_schedule
Exemplo n.º 10
0
# Actors taking part in the game.
invader = Invader(speed=1)
guard = Guard2Targets(speed=1)
target = Target(speed=0)

env = Environment2Targets([32, 32], guard, invader, target)

# Pairs an optimizer constructor with the keyword arguments to call it with.
OptimizerSpec = namedtuple("OptimizerSpec", "constructor kwargs")

# Running training statistics, appended to during learning.
Statistic = {
    "mean_episode_rewards": [],
    "best_mean_episode_rewards": [],
}

optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs={"lr": LEARNING_RATE, "alpha": ALPHA, "eps": EPS},
)

exploration_schedule = LinearSchedule(100000, 0.1)


# Construct an epsilon-greedy policy with the given exploration schedule
def select_epilson_greedy_action(model, obs, t):
    """Pick an action epsilon-greedily for timestep ``t``.

    A uniform random draw is compared with ``exploration_schedule.value(t)``.
    If the draw is greater, the observation is converted to a tensor, scaled
    by 1/255, and the index of the largest model output along dim 1 is
    returned (moved to CPU).  Otherwise a random action index in
    ``[0, NUM_ACTIONS)`` is returned as a 1x1 ``IntTensor``.
    """
    draw = random.random()
    epsilon = exploration_schedule.value(t)

    # Explore unless the draw strictly exceeds the current epsilon.
    if not draw > epsilon:
        return torch.IntTensor([[random.randrange(NUM_ACTIONS)]])

    # Exploit: add a batch dimension and normalise the observation.
    batch = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
    outputs = model(Variable(batch)).data
    best_action = outputs.max(1)[1]
    return best_action.cpu()


# vis = visdom.Visdom(port=8124)
Exemplo n.º 11
0
    student_config.lr_nsteps = args.nsteps_train / 2
    student_config.exp_policy = args.exp_policy

    # make env
    env = gym.make(student_config.env_name)
    if hasattr(student_config, 'skip_frame'):
        env = MaxAndSkipEnv(env, skip=student_config.skip_frame)
    if hasattr(student_config, 'preprocess_state'
               ) and student_config.preprocess_state is not None:
        env = PreproWrapper(env,
                            prepro=greyscale,
                            shape=(80, 80, 1),
                            overwrite_render=student_config.overwrite_render)

    # exploration strategy
    if student_config.exp_policy == 'egreedy':
        exp_schedule = LinearExploration(env, student_config.eps_begin,
                                         student_config.eps_end,
                                         student_config.eps_nsteps)
    else:
        exp_schedule = LinearGreedyExploration(env, student_config.eps_begin,
                                               student_config.eps_end,
                                               student_config.eps_nsteps)
    # learning rate schedule
    lr_schedule = LinearSchedule(student_config.lr_begin,
                                 student_config.lr_end,
                                 student_config.lr_nsteps)

    # train model
    model = DistilledQN(env, student_config)
    model.run(exp_schedule, lr_schedule)
Exemplo n.º 12
0
You'll find the results, log and video recordings of your agent every 250k under
the corresponding file in the results folder. A good way to monitor the progress
of the training is to use Tensorboard. The starter code writes summaries of different
variables.

To launch tensorboard, open a Terminal window and run 
tensorboard --logdir=results/
Then, connect remotely to 
address-ip-of-the-server:6006 
6006 is the default port used by tensorboard.
"""
if __name__ == '__main__':
    # Create the base environment, then wrap it with MaxAndSkipEnv and
    # PreproWrapper.
    env = gym.make(config.env_name)
    env = MaxAndSkipEnv(env, skip=config.skip_frame)
    env = PreproWrapper(
        env,
        shape=(80, 80, 1),
        prepro=greyscale,
        overwrite_render=config.overwrite_render,
    )

    # Exploration schedule built from the eps_* settings.
    exp_schedule = LinearExploration(
        env,
        config.eps_begin,
        config.eps_end,
        config.eps_nsteps,
    )

    # Learning-rate schedule built from the lr_* settings.
    lr_schedule = LinearSchedule(
        config.lr_begin,
        config.lr_end,
        config.lr_nsteps,
    )

    # Build the model and train it with both schedules.
    model = NatureQN(env, config)
    model.run(exp_schedule, lr_schedule)
Exemplo n.º 13
0
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          callback=None):
    """Train a deepq model.
    Parameters
    -------
    env : gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimizer for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.
    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # NOTE: local variable names in this function are part of its contract,
    # because `callback` receives `locals()`. Do not rename them.

    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        # Observation placeholder is fixed to 84x84 inputs.
        return U.BatchInput([84, 84], name=name)

    # The number of actions is hard-coded to 2 here and in act_params.
    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=2,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': 2,
    }
    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    # NOTE(review): inside the loop env.step() is unpacked as a 4-tuple, so
    # this initial `obs` is presumably the whole tuple rather than just the
    # observation -- confirm against the environment before changing it.
    obs = env.step(0)
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            action = act(np.array(obs)[None],
                         update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                # The environment is not reset here (the reset call is
                # disabled); only a new reward accumulator is started.
                #obs = env.reset()
                episode_rewards.append(0)

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                # Pass the sampled importance weights through. Previously a
                # vector of ones was always used, which discarded the
                # weights returned by the prioritized buffer. For the
                # uniform buffer `weights` is all ones, so nothing changes.
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            # Mean over up to the last 100 *finished* episodes (the
            # in-progress one at index -1 is excluded).
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                #logger.record_tabular("steps", t)
                #logger.record_tabular("episodes", num_episodes)
                #logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                #logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                #logger.dump_tabular()
                print("steps: {}".format(t))
                print("episodes: {}".format(num_episodes))
                print("mean 100 episode reward: {}".format(mean_100ep_reward))
                print("% time spent exploring: {}".format(
                    int(100 * exploration.value(t))))

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                # Checkpoint only when the running mean reward improved.
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    #if print_freq is not None:
                    #logger.log("Saving model due to mean reward increase: {} -> {}".format(
                    #           saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            # Restore the best checkpoint before the temp dir is removed.
            #if print_freq is not None:
            #logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
Exemplo n.º 14
0
class model(object):
    """Base class for an order-execution agent driven by a limit order book.

    The constructor builds the schedules, the order/message queues and the
    replay buffer from ``config``.  ``build``, ``initialize``,
    ``get_random_action`` and ``get_best_action`` are empty stubs here
    (presumably overridden by subclasses -- confirm).
    """

    def __init__(self, config):
        """Create schedules, queues and replay buffer, then call ``build()``."""
        self._config = config
        self._eps_schedule = LinearSchedule(self._config.eps_begin,
                                            self._config.eps_end,
                                            self._config.nsteps)
        self._lr_schedule = LinearSchedule(self._config.lr_begin,
                                           self._config.lr_end,
                                           self._config.lr_nsteps)
        self._oq = Order_Queue(self._config.order_path)
        self._mq = Message_Queue(self._config.message_path)
        self._bf = ReplayBuffer(1000000, config)

        # Cached so sampling_buffer() can pass it to simulate_an_episode().
        self._action_fn = self.get_action_fn()

        self.build()

    def build(self):
        """Stub; does nothing in this class."""
        pass

    def initialize(self):
        """Stub; does nothing in this class."""
        pass

    def get_random_action(self, state):
        """Stub; returns None here. ``get_action`` indexes the result with [0]."""
        pass

    def get_best_action(self, state):
        """Stub; returns None here. ``get_action`` indexes the result with [0]."""
        ### return action, q value
        pass

    def get_action(self, state):
        """Choose an action epsilon-greedily.

        With probability ``self._eps_schedule.get_epsilon()`` the first
        element of ``get_random_action(state)`` is returned, otherwise the
        first element of ``get_best_action(state)``.
        """
        if np.random.random() < self._eps_schedule.get_epsilon():
            return self.get_random_action(state)[0]
        else:
            return self.get_best_action(state)[0]

    def get_random_action_fn(self):
        """Return a function that picks a uniformly random action.

        The returned ``random_action_fn(t, amount, state, mid_price)`` draws
        an integer action in ``[0, config.L)`` and returns ``(price, action)``.
        Only ``mid_price`` is used from its arguments.
        """
        def random_action_fn(t, amount, state, mid_price):
            action = np.random.randint(
                self._config.L)  # action = L for market order
            # Price is offset from the mid price by (action - L // 2)
            # multiples of base_point.
            price = (action -
                     self._config.L // 2) * self._config.base_point + mid_price
            return (price, action)

        return random_action_fn

    def get_action_fn(self):
        """Return a function that picks an action via ``get_action``.

        The returned ``action_fn(t, amount, state, mid_price)`` returns
        ``(price, action)`` using the same price formula as
        ``get_random_action_fn``.  ``t`` and ``amount`` are unused.
        """
        def action_fn(t, amount, state, mid_price):
            action = self.get_action(state)
            price = (action -
                     self._config.L // 2) * self._config.base_point + mid_price
            return (price, action)

        return action_fn

    def pad_state(self, states, state_history):
        """Stack recent states along a new last axis and left-pad with zeros.

        ``states`` is a sequence of ``(state, it)`` pairs.  The ``state``
        parts are stacked on a new trailing axis and zero-padded at the front
        of that axis up to length ``state_history``.  The pad spec has three
        entries, so each ``state`` is expected to be 2-D.

        Returns ``([padded_states], [last_it])``: each element wrapped in a
        one-item list, with ``last_it`` taken from the last pair.
        """
        tmp_states, tmp_its = zip(*states)
        tmp_state = np.concatenate(
            [np.expand_dims(state, -1) for state in tmp_states], axis=-1)
        tmp_state = np.pad(tmp_state,
                           ((0, 0), (0, 0),
                            (state_history - tmp_state.shape[-1], 0)),
                           'constant',
                           constant_values=0)
        tmp_it = tmp_its[-1]
        return ([tmp_state], [tmp_it])

    def simulate_an_episode(self, amount, T, H, start_time, order_direction,
                            action_fn, depth):
        """Simulate one execution episode against the recorded order flow.

        The horizon ``H`` is split into steps of length ``H // T``.  At each
        step the current state is recorded, ``action_fn`` is asked for a
        ``(price, action)``, the own order is updated, and the messages up to
        the next step time are replayed.  The episode ends early once the own
        order is fully traded; otherwise a final step with action
        ``config.L`` is appended after the loop.

        Returns:
            ``(states, rewards, actions, done_mask[1:])``.
        """
        dH = H // T  # length of one decision step
        self._mq.reset()
        lob_data = self._oq.create_orderbook_time(start_time, self._mq)
        # Start with no own order in the book.
        lob = Limit_Order_book(**lob_data,
                               own_amount_to_trade=0,
                               own_init_price=-order_direction *
                               Limit_Order_book._DUMMY_VARIABLE,
                               own_trade_type=order_direction)
        rewards = []
        states = []
        actions = []
        done_mask = []

        amount_remain = amount
        cum_reward = 0

        for t in range(start_time, start_time + H - dH, dH):
            tmp1 = 1.0 * amount_remain / amount  # fraction of amount remaining
            tmp2 = 1.0 * (start_time + H - t) / H  # fraction of time remaining
            state = (lob.display_book(depth),
                     np.array([tmp1, tmp2], dtype=float))
            state = self.process_state(state)
            states.append(state)

            mid_price = lob.get_mid_price()
            # Feed the policy the last `state_history` states, zero-padded.
            state_input = self.pad_state(states[-self._config.state_history:],
                                         self._config.state_history)
            price, action = action_fn(start_time + H - t, amount_remain,
                                      state_input, mid_price)
            actions.append(action)
            done_mask.append(False)

            lob.update_own_order(price, amount_remain)

            # Replay messages until the next decision time.
            for idx, message in self._mq.pop_to_next_time(t + dH):
                lob.process(**message)
                if lob.own_amount_to_trade == 0:
                    # Fully traded mid-step: record the terminal state and
                    # the reward earned since the previous step.
                    done_mask.append(True)
                    state = (lob.display_book(depth),
                             np.array([
                                 0, 1.0 * (start_time + H - self._mq._time) / H
                             ],
                                      dtype=float))
                    state = self.process_state(state)
                    states.append(state)
                    rewards.append(lob.own_reward - cum_reward)
                    break
            if done_mask[-1]:
                break
            else:
                # Not finished yet: record the reward earned during this
                # step, then carry the cumulative reward and the remaining
                # amount into the next step.
                rewards.append(lob.own_reward - cum_reward)
                cum_reward = lob.own_reward
                amount_remain = lob.own_amount_to_trade

        if not done_mask[-1]:
            # The loop ended with amount still outstanding: add one final
            # step.
            tmp1 = 1.0 * amount_remain / amount
            tmp2 = 1.0 * (start_time + H - t - dH) / H
            state = (lob.display_book(depth),
                     np.array([tmp1, tmp2], dtype=float))
            state = self.process_state(state)
            states.append(state)
            done_mask.append(False)

            # NOTE(review): presumably re-prices the own order at an extreme
            # level so it executes like a market order -- confirm.
            lob.update_own_order(lob.own_trade_type *
                                 Limit_Order_book._DUMMY_VARIABLE)
            if lob.own_amount_to_trade == 0:
                rewards.append(lob.own_reward - cum_reward)
            else:
                # Amount still remains after the final order: use
                # -_DUMMY_VARIABLE as the reward.
                rewards.append(-Limit_Order_book._DUMMY_VARIABLE)
            tmp1 = 1.0 * lob.own_amount_to_trade / amount
            state = (lob.display_book(depth), np.array([tmp1, 0], dtype=float))
            state = self.process_state(state)
            states.append(state)
            actions.append(self._config.L)
            done_mask.append(True)
        # The first done flag is dropped from the returned mask.
        return (states, rewards, actions, done_mask[1:])

    def sampling_buffer(self):
        """Simulate one episode per training window and store it in the buffer.

        Start times run from ``train_start`` to ``train_end`` in steps of
        ``H``.
        """
        for start_time in range(self._config.train_start,
                                self._config.train_end, self._config.H):
            states, rewards, actions, done_mask = self.simulate_an_episode(
                self._config.I, self._config.T, self._config.H, start_time,
                self._config.direction, self._action_fn, self._config.depth)
            self._bf.store(states, actions, rewards, done_mask)

    def process_state(self, state):
        """Cast the book part of ``state`` to float32 and rescale its columns.

        Columns 0 and 2 are divided by 1e6, columns 1 and 3 by 1e2.  The
        second element of ``state`` is returned unchanged.
        """
        state_book, state_it = state
        # astype returns a new array, so the caller's book is not modified.
        state_book = state_book.astype('float32')
        state_book[:, 0] /= 1.e6
        state_book[:, 1] /= 1.e2
        state_book[:, 2] /= 1.e6
        state_book[:, 3] /= 1.e2
        return (state_book, state_it)
Exemplo n.º 15
0
    model.compile(optimizer=Adam(lr=0.001), loss='mae')
    model.summary()
    return model


model = create_model()

# Restore saved weights only when the file exists and is readable.
if os.path.isfile(WEIGHTS_PATH):
    if os.access(WEIGHTS_PATH, os.R_OK):
        model.load_weights(WEIGHTS_PATH)

# memory = deque(maxlen=MEMORY)
env = gym.make(ENV_NAME)

replay_buffer = PrioritizedReplayBuffer(
    MEMORY,
    alpha=prioritized_replay_alpha,
)
# beta goes from its initial value to 1.0.
beta_schedule = LinearSchedule(
    prioritized_replay_beta_iters,
    final_p=1.0,
    initial_p=prioritized_replay_beta0,
)
# epsilon goes from eps0 to 0.1.
eps_schedule = LinearSchedule(eps_iters, final_p=0.1, initial_p=eps0)


def get_action(ob, epsilon):
    moves = env.action_space
    q_values = []

    observations = np.expand_dims(ob, 0)

    for move in moves:
        action = np.zeros((80, ))
        if move > 99:
            action[move - 29] = 1
        else:
Exemplo n.º 16
0
class model(object):
    """Base class for an agent trained against a ``TrafficSimulator``.

    ``build``, ``initialize``, ``get_random_action`` and ``get_best_action``
    do nothing in this class.
    """

    def __init__(self, config):
        """Create schedules, simulator and replay buffer, then call ``build()``."""
        cfg = config
        self._config = cfg
        self._eps_schedule = LinearSchedule(
            cfg.eps_begin, cfg.eps_end, cfg.nsteps)
        self._lr_schedule = LinearSchedule(
            cfg.lr_begin, cfg.lr_end, cfg.lr_nsteps)
        self._sim = TrafficSimulator(config)
        self._bf = ReplayBuffer(10000, config)

        self._action_fn = self.get_action_fn()

        self.build()

    def build(self):
        """Do nothing in this class."""

    def initialize(self):
        """Do nothing in this class."""

    def get_random_action(self, state):
        """Do nothing in this class (returns None)."""

    def get_best_action(self, state):
        """Do nothing in this class (returns None).

        The original note on this method reads: return action, q value.
        """

    def get_action(self, state):
        """Epsilon-greedy choice between the random and the best action.

        Returns element 0 of whichever action method was selected.
        """
        explore = np.random.random() < self._eps_schedule.get_epsilon()
        chooser = self.get_random_action if explore else self.get_best_action
        return chooser(state)[0]

    def get_random_action_fn(self):
        """Return a function that ignores its state and draws from 0..4."""
        def random_action_fn(state):
            return np.random.randint(5)

        return random_action_fn

    def get_action_fn(self):
        """Return the bound ``get_action`` method."""
        return self.get_action

    def pad_state(self, states, state_history):
        """Stack ``states`` on a new last axis and zero-pad it at the front.

        The last axis is padded up to ``state_history`` entries; the result
        is returned wrapped in a one-element list.
        """
        stacked = np.stack(states, axis=-1)
        missing = state_history - stacked.shape[-1]
        padded = np.pad(stacked,
                        ((0, 0), (missing, 0)),
                        'constant',
                        constant_values=0)
        return [padded]

    def simulate_an_episode(self, T, action_fn):
        """Run the simulator for ``T`` steps, choosing actions via ``action_fn``.

        Returns ``(states, rewards, actions)``, one entry per step.
        """
        states, rewards, actions = [], [], []

        self._sim.reset()
        history_len = self._config.state_history

        for _ in range(T):
            states.append(self._sim.state())

            # The policy sees the most recent states, zero-padded.
            window = self.pad_state(states[-history_len:], history_len)
            chosen = action_fn(window)
            actions.append(chosen)

            rewards.append(self._sim.progress(chosen))

        return (states, rewards, actions)

    def sampling_buffer(self):
        """Simulate ``config.nBufferSample`` episodes and store each one."""
        for episode_idx in range(self._config.nBufferSample):
            # Progress message every 20 episodes.
            if episode_idx % 20 == 0:
                print("Sample buffer: ", episode_idx)
            episode = self.simulate_an_episode(self._config.T,
                                               self._action_fn)
            states, rewards, actions = episode
            self._bf.store(states, actions, rewards)
Exemplo n.º 17
0
        outside_value=student_config.exp_outside_value)
    # exp_schedule = LinearExploration(env, student_config.eps_begin,
    #         student_config.eps_end, student_config.eps_nsteps)

    # learning rate schedule
    lr_schedule = PiecewiseSchedule(
        student_config.lr_endpoints,
        outside_value=student_config.lr_outside_value)
    # lr_schedule  = LinearSchedule(student_config.lr_begin, student_config.lr_end,
    #         student_config.lr_nsteps)

    # teacher choice strategy
    num_teachers = len(args.teacher_checkpoint_dirs)
    if args.choose_teacher_q in ['eps_greedy_bandit']:
        eps_schedule = LinearSchedule(student_config.teacher_choice_eps_begin,
                                      student_config.teacher_choice_eps_end,
                                      student_config.teacher_choice_eps_nsteps)
    if args.bandit_reward_method == 'rolling_avg':
        bandit_alpha_schedule = LinearSchedule(
            student_config.teacher_choice_bandit_alpha_begin,
            student_config.teacher_choice_bandit_alpha_end,
            student_config.teacher_choice_bandit_alpha_nsteps)
    elif args.bandit_reward_method == 'avg_all_time':
        bandit_alpha_schedule = None

    if args.choose_teacher_q == 'random_bandit':
        choose_teacher_strategy = RandomBandit(num_teachers,
                                               args.bandit_reward_method,
                                               bandit_alpha_schedule)
    elif args.choose_teacher_q == 'eps_greedy_bandit':
        choose_teacher_strategy = EpsilonGreedyBandit(
Exemplo n.º 18
0
  elif FLAGS.network_type == 'recurrent_q':
    network = network.RecurrentQ(FLAGS)

  elif FLAGS.network_type == 'transfer_q':
    network = network.TransferQ(FLAGS)

  elif FLAGS.network_type == 'deep_ac':
    network = network.DeepAC(FLAGS)

  else: raise NotImplementedError

  # Initialize exploration strategy
  exp_schedule = LinearExploration(env, FLAGS.epsilon, FLAGS.eps_end, FLAGS.eps_nsteps)

  # Initialize exploration rate schedule
  lr_schedule  = LinearSchedule(FLAGS.learning_rate, FLAGS.lr_end, FLAGS.lr_nsteps)

  # train model
  model = None
  if FLAGS.model_type == 'q':
    model = Model(env, record_env, network, FLAGS)
  elif FLAGS.model_type == 'ac':
    model = ModelAC(env, record_env, network, FLAGS)
  else:
    raise NotImplementedError

  if FLAGS.record_only:
    model.record_videos(FLAGS.model_path+'checkpoint')
  else:
    try:
      model.run(exp_schedule, lr_schedule)
Exemplo n.º 19
0
Arquivo: deepq.py Projeto: shukon/SDC
def learn(env,
          lr=1e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          action_repeat=4,
          batch_size=32,
          learning_starts=1000,
          gamma=0.99,
          target_network_update_freq=500,
          model_identifier='agent'):
    """Train a deep Q-learning model.

    The policy network weights are written to ``model_identifier + '.pt'``
    and the training curves are visualized periodically during training
    and once more when training ends, including when it is interrupted
    with Ctrl-C.

    Parameters
    ----------
    env: gym.Env
        environment to train on
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to take
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate
        is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    action_repeat: int
        select an action on every n-th frame and repeat it for the
        intermediate frames
    batch_size: int
        size of a batch sampled from the replay buffer for training
    learning_starts: int
        how many steps of the model to collect transitions for before
        learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    model_identifier: string
        identifier of the agent; used as the checkpoint file stem
    """
    # Number of training steps between intermediate checkpoints/plots.
    checkpoint_interval = 1000

    episode_rewards = [0.0]
    training_losses = []
    actions = get_action_set()
    action_size = len(actions)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build networks; the target network starts as a frozen copy of the
    # policy network.
    policy_net = DQN(action_size, device).to(device)
    target_net = DQN(action_size, device).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Create replay buffer
    replay_buffer = ReplayBuffer(buffer_size)

    # Create optimizer
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize environment and get first state
    obs = get_state(env.reset())

    # Iterate over the total number of time steps
    try:
        for t in range(total_timesteps):

            # Select action
            action_id = select_exploratory_action(obs, policy_net, action_size,
                                                  exploration, t)
            env_action = actions[action_id]

            # Perform the action `action_repeat` times (frame skipping),
            # stopping early if the episode terminates.
            for _ in range(action_repeat):
                new_obs, rew, done, _ = env.step(env_action)
                episode_rewards[-1] += rew
                # if episode_rewards[-1] < termination_reward:
                #     done = True
                if done:
                    break

            # Store transition in the replay buffer.
            # NOTE(review): only the reward of the last repeated frame is
            # stored, not the sum over the repeated frames -- confirm this
            # is intended.
            new_obs = get_state(new_obs)
            replay_buffer.add(obs, action_id, rew, new_obs, float(done))
            obs = new_obs

            if done:
                # Start new episode after previous episode has terminated
                print("timestep: " + str(t) + " \t reward: " +
                      str(episode_rewards[-1]))
                obs = get_state(env.reset())
                episode_rewards.append(0.0)

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer.
                loss = perform_qlearning_step(policy_net, target_net,
                                              optimizer, replay_buffer,
                                              batch_size, gamma, device)
                training_losses.append(loss)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target_net(policy_net, target_net)

            # Checkpoint on the step counter `t`. The condition must not use
            # `total_timesteps`: that value is constant across the loop, so
            # it would either fire on every single step or never.
            if t % checkpoint_interval == 0:
                # Save the trained policy network
                torch.save(policy_net.state_dict(), model_identifier + '.pt')

                # Visualize the training loss and cumulative reward curves
                visualize_training(episode_rewards, training_losses,
                                   model_identifier)
    except KeyboardInterrupt:
        # Allow the user to stop training early; the final save below
        # still runs.
        pass
    finally:

        # Save the trained policy network
        torch.save(policy_net.state_dict(), model_identifier + '.pt')

        # Visualize the training loss and cumulative reward curves
        visualize_training(episode_rewards, training_losses, model_identifier)
Exemplo n.º 20
0
def challenger_round():
    """Train an ensemble of challenger agents against the current leaders.

    Builds NUM_LEADERS challenger DQN agents (optionally replaced by the
    checkpoints found in CHALLENGER_DIR), pairs them with leader agents
    loaded from LEADER_DIR (or random agents when there are not enough
    checkpoints), and then plays episodes in the module-level ``env``
    until TRAIN_FRAMES frames have been seen. Challenger checkpoints are
    written to LEADER_DIR every SAVE_FREQ frames.

    Relies on module-level configuration (LEADER_DIR, CHALLENGER_DIR,
    EPS_START, EPS_END, TRAIN_FRAMES, NUM_LEADERS, OBSERVATION_MODE, ...)
    and returns nothing.

    NOTE: this is Python 2 code (``print`` statements, ``xrange``).
    """
    challengers = []
    leaders = []
    leader_checkpoints = os.listdir(LEADER_DIR)
    # Need to share the same schedule with all challengers, so they all anneal
    # at same rate
    epsilon_schedule = LinearSchedule(EPS_START, EPS_END, TRAIN_FRAMES)
    for i in xrange(NUM_LEADERS):
        # presumably 6 is the number of actions -- TODO confirm against
        # DQNAgent's signature
        challenger = try_gpu(
            DQNAgent(6,
                     epsilon_schedule,
                     OBSERVATION_MODE,
                     lr=LR,
                     max_grad_norm=GRAD_CLIP_NORM))
        if i < len(leader_checkpoints):
            # A checkpoint exists for this slot: the leader gets its own
            # schedule with equal begin/end values (0.1), and both the
            # challenger and the leader start from the same saved weights.
            leader = try_gpu(
                DQNAgent(6, LinearSchedule(0.1, 0.1, 500000),
                         OBSERVATION_MODE))
            leader_path = os.path.join(LEADER_DIR, leader_checkpoints[i])
            print "LOADING CHECKPOINT: {}".format(leader_path)
            challenger.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
            leader.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
        else:
            # No checkpoint for this slot: the challenger keeps its fresh
            # weights and plays against a random agent.
            leader = RandomAgent(6)
            print "INITIALIZING NEW CHALLENGER AND LEADER"
        challengers.append(challenger)
        leaders.append(leader)

    if CHALLENGER_DIR is not None:
        # Discard the challengers built above and use the checkpoints in
        # CHALLENGER_DIR instead (one challenger per file).
        challengers = []
        # Load every checkpoint in CHALLENGER_DIR as a challenger
        for checkpoint in os.listdir(CHALLENGER_DIR):
            path = os.path.join(CHALLENGER_DIR, checkpoint)
            print "LOADING FROM CHALLENGER_DIR: {}".format(path)
            challenger = try_gpu(
                DQNAgent(6,
                         LinearSchedule(0.05, 0.05, 1),
                         CHALLENGER_OBSERVATION_MODE,
                         lr=LR,
                         max_grad_norm=GRAD_CLIP_NORM,
                         name=checkpoint))
            challenger.load_state_dict(
                torch.load(path, map_location=lambda storage, loc: storage))
            challengers.append(challenger)

    # From here on `challenger` / `leader` refer to the ensembles, not to
    # the individual agents built in the loops above.
    challenger = EnsembleDQNAgent(challengers)
    leader = EnsembleDQNAgent(leaders)
    if OPPONENT is not None or HUMAN:
        # The leader ensemble is replaced by a no-op agent in this mode.
        leader = NoOpAgent()
    replay_buffer = ReplayBuffer(1000000)
    # Rolling window of the most recent 1000 episode rewards, used for the
    # average shown in the progress bar.
    rewards = collections.deque(maxlen=1000)
    frames = 0  # number of training frames seen
    episodes = 0  # number of training episodes that have been played
    with tqdm(total=TRAIN_FRAMES) as progress:
        # Each loop completes a single episode
        while frames < TRAIN_FRAMES:
            states = env.reset()
            challenger.reset()
            leader.reset()
            episode_reward = 0.
            episode_frames = 0
            # Each loop completes a single step, duplicates _evaluate() to
            # update at the appropriate frame #s
            for _ in xrange(MAX_EPISODE_LENGTH):
                frames += 1
                episode_frames += 1
                # states[0] is fed to the challenger, states[1] to the leader.
                action1 = challenger.act(states[0])
                action2 = leader.act(states[1])
                next_states, reward, done = env.step(action1, action2)
                episode_reward += reward

                # NOTE: state and next_state are LazyFrames and must be
                # converted to np.arrays
                # Only the challenger's side of the transition is stored.
                replay_buffer.add(
                    Experience(states[0], action1._action_index, reward,
                               next_states[0], done))
                states = next_states

                # Train on a batch of 32 every 4th frame, once the buffer
                # holds more than 50000 experiences.
                if len(replay_buffer) > 50000 and \
                        frames % 4 == 0:
                    experiences = replay_buffer.sample(32)
                    challenger.update_from_experiences(experiences)

                # Sync the challenger's target every 10000 frames.
                if frames % 10000 == 0:
                    challenger.sync_target()

                if frames % SAVE_FREQ == 0:
                    # Checkpoints go into LEADER_DIR, named
                    # "<agent name>-<frame count>".
                    # TODO: Don't access internals
                    for agent in challenger._agents:
                        path = os.path.join(LEADER_DIR,
                                            agent.name + "-{}".format(frames))
                        print "SAVING CHECKPOINT TO: {}".format(path)
                        torch.save(agent.state_dict(), path)
                    #path = os.path.join(
                    #        LEADER_DIR, challenger.name + "-{}".format(frames))
                    #torch.save(challenger.state_dict(), path)

                # Stop mid-episode once the frame budget is exhausted.
                if frames >= TRAIN_FRAMES:
                    break

                if done:
                    break

            # Evaluate every 300 episodes; `episodes` is incremented after
            # this check, so the first evaluation runs after the very first
            # episode.
            if episodes % 300 == 0:
                print "Evaluation: {}".format(
                    evaluate(challenger, leader, EPISODES_EVALUATE_TRAIN))
            print "Episode reward: {}".format(episode_reward)
            episodes += 1
            rewards.append(episode_reward)
            stats = challenger.stats
            stats["Avg Episode Reward"] = float(sum(rewards)) / len(rewards)
            stats["Num Episodes"] = episodes
            stats["Replay Buffer Size"] = len(replay_buffer)
            progress.set_postfix(stats, refresh=False)
            # Advance the progress bar by the frames played this episode.
            progress.update(episode_frames)
            episode_frames = 0