def purge_round():
    candidate_leaders_map = {}  # {filename --> agent}

    # Load in all of the leaders
    for leader_checkpoint in os.listdir(LEADER_DIR):
        path = os.path.join(LEADER_DIR, leader_checkpoint)
        candidate_leader = try_gpu(
            DQNAgent(6, LinearSchedule(0.05, 0.05, 1), OBSERVATION_MODE,
                     lr=LR, max_grad_norm=GRAD_CLIP_NORM,
                     name=leader_checkpoint))
        candidate_leader.load_state_dict(
            torch.load(path, map_location=lambda storage, loc: storage))
        candidate_leaders_map[leader_checkpoint] = candidate_leader

    candidate_scores = []  # list[(filename, score)]
    filenames, candidate_leaders = zip(*candidate_leaders_map.items())
    for i, (filename, candidate_leader) in enumerate(
            zip(filenames, candidate_leaders)):
        print "EVALUATING {}".format(candidate_leader.name)
        leaders = EnsembleDQNAgent(
            candidate_leaders[:i] + candidate_leaders[i + 1:])
        candidate_scores.append(
            (filename, evaluate(candidate_leader, leaders,
                                EPISODES_EVALUATE_PURGE)))

    sorted_scores = sorted(candidate_scores, key=lambda x: x[1], reverse=True)
    print "SCORES: {}".format(sorted_scores)
    for filename, score in sorted_scores[NUM_LEADERS:]:
        print "PURGING ({}, {})".format(filename, score)
        leader_path = os.path.join(LEADER_DIR, filename)
        graveyard_path = os.path.join(GRAVEYARD_DIR, filename)
        os.rename(leader_path, graveyard_path)

def __init__(self,
             total_timesteps=100000,
             buffer_size=50000,
             type_buffer="HER",
             prioritized_replay=True,
             prioritized_replay_alpha=0.6,
             prioritized_replay_beta0=0.4,
             prioritized_replay_beta_iters=None,
             prioritized_replay_eps=1e-6):
    self.buffer_size = buffer_size
    self.prioritized_replay_eps = prioritized_replay_eps
    self.type_buffer = type_buffer
    if prioritized_replay:
        if type_buffer == "PER":
            self.replay_buffer = PrioritizedReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
        if type_buffer == "HER":
            self.replay_buffer = HighlightReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                            initial_p=prioritized_replay_beta0,
                                            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.beta_schedule = None

def __init__(self, state_size, action_size, num_agents, seed,
             fc1=400, fc2=300, update_times=10, weight_decay=1.e-5):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents acting in the environment
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.num_agents = num_agents
    self.n_seed = np.random.seed(seed)
    self.update_times = update_times
    self.n_step = 0

    # One Ornstein-Uhlenbeck noise process per agent for exploration
    self.noise = []
    for i in range(num_agents):
        self.noise.append(
            rm.OrnsteinUhlenbeckProcess(size=(action_size, ),
                                        std=LinearSchedule(0.2)))

    # critic local and target network (Q-Learning)
    self.critic_local = Critic(state_size, action_size, fc1, fc2,
                               seed).to(device)
    self.critic_target = Critic(state_size, action_size, fc1, fc2,
                                seed).to(device)
    self.critic_target.load_state_dict(self.critic_local.state_dict())

    # actor local and target network (Policy gradient)
    self.actor_local = Actor(state_size, action_size, fc1, fc2,
                             seed).to(device)
    self.actor_target = Actor(state_size, action_size, fc1, fc2,
                              seed).to(device)
    self.actor_target.load_state_dict(self.actor_local.state_dict())

    # optimizer for critic and actor network
    self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                       lr=CRITIC_LR,
                                       weight_decay=weight_decay)
    self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                      lr=ACTOR_LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.a_step = 0

def __init__(self, size=5000, epsilon=0.01, alpha=0.6, beta=0.4,
             num_steps=int(1e6)):
    super(PrioritizedReplayBuffer, self).__init__(size=size)
    self.size = size
    self.epsilon = epsilon
    self.alpha = alpha
    self.beta = LinearSchedule(min_val=beta, max_val=1.0, num_steps=num_steps)
    self.probs = deque(maxlen=size)
    self._max_prob = epsilon

def main(num_epochs):
    exploration_schedule = LinearSchedule(300000, 0.1)
    dqn_learing(
        dataLoader=videoLoader,
        num_epochs=num_epochs,
        feature_size=2048,
        num_classes=101,
        r_p=0.01,
        q_func=QNet,
        exploration=exploration_schedule,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )

def train_maze(output_path):
    config = Config()
    config.set_paths(output_path)
    env = EnvMaze(n=config.maze_size, hard=config.hard)

    # exploration strategy
    exp_schedule = LinearExploration(env, config.eps_begin, config.eps_end,
                                     config.eps_nsteps, config.env_name)

    # learning rate schedule
    lr_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                 config.lr_nsteps, config.env_name)

    # train model
    print(config.output_path)
    model = NatureQN(env, config)
    model.bfs_len = env.get_bfs_length()
    evaluation_result_list, oos_evaluation_result_list = model.run(
        exp_schedule, lr_schedule)
    return evaluation_result_list, oos_evaluation_result_list

def __init__(self,
             total_timesteps=100000,
             buffer_size=50000,
             prioritized_replay=False,
             prioritized_replay_alpha=0.6,
             prioritized_replay_beta0=0.4,
             prioritized_replay_beta_iters=None,
             prioritized_replay_eps=1e-6):
    self.buffer_size = buffer_size
    if prioritized_replay:
        self.replay_buffer = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        # Store the schedule on the instance so it is available after __init__
        self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                            initial_p=prioritized_replay_beta0,
                                            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.beta_schedule = None

invader = Invader(speed=1)
guard = Guard2Targets(speed=1)
target = Target(speed=0)
env = Environment2Targets([32, 32], guard, invader, target)

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])
Statistic = {"mean_episode_rewards": [], "best_mean_episode_rewards": []}
optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
)
exploration_schedule = LinearSchedule(100000, 0.1)


# Construct an epsilon-greedy policy with the given exploration schedule
def select_epilson_greedy_action(model, obs, t):
    sample = random.random()
    eps_threshold = exploration_schedule.value(t)
    if sample > eps_threshold:
        obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
        # Use volatile = True if variable is only used in inference mode,
        # i.e. don't save the history
        return model(Variable(obs)).data.max(1)[1].cpu()
    else:
        return torch.IntTensor([[random.randrange(NUM_ACTIONS)]])


# vis = visdom.Visdom(port=8124)

student_config.lr_nsteps = args.nsteps_train / 2
student_config.exp_policy = args.exp_policy

# make env
env = gym.make(student_config.env_name)
if hasattr(student_config, 'skip_frame'):
    env = MaxAndSkipEnv(env, skip=student_config.skip_frame)
if hasattr(student_config, 'preprocess_state') and \
        student_config.preprocess_state is not None:
    env = PreproWrapper(env,
                        prepro=greyscale,
                        shape=(80, 80, 1),
                        overwrite_render=student_config.overwrite_render)

# exploration strategy
if student_config.exp_policy == 'egreedy':
    exp_schedule = LinearExploration(env, student_config.eps_begin,
                                     student_config.eps_end,
                                     student_config.eps_nsteps)
else:
    exp_schedule = LinearGreedyExploration(env, student_config.eps_begin,
                                           student_config.eps_end,
                                           student_config.eps_nsteps)

# learning rate schedule
lr_schedule = LinearSchedule(student_config.lr_begin, student_config.lr_end,
                             student_config.lr_nsteps)

# train model
model = DistilledQN(env, student_config)
model.run(exp_schedule, lr_schedule)

You'll find the results, logs and video recordings of your agent every 250k
steps under the corresponding file in the results folder. A good way to
monitor the progress of the training is to use Tensorboard. The starter code
writes summaries of different variables.

To launch tensorboard, open a Terminal window and run

    tensorboard --logdir=results/

Then, connect remotely to

    address-ip-of-the-server:6006

6006 is the default port used by tensorboard.
"""

if __name__ == '__main__':
    # make env
    env = gym.make(config.env_name)
    env = MaxAndSkipEnv(env, skip=config.skip_frame)
    env = PreproWrapper(env,
                        prepro=greyscale,
                        shape=(80, 80, 1),
                        overwrite_render=config.overwrite_render)

    # exploration strategy
    exp_schedule = LinearExploration(env, config.eps_begin, config.eps_end,
                                     config.eps_nsteps)

    # learning rate schedule
    lr_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                 config.lr_nsteps)

    # train model
    model = NatureQN(env, config)
    model.run(exp_schedule, lr_schedule)

def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env : gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of
        every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is
        annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress.
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is
        restored at the end of the training. If you do not wish to restore
        the best version at the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before
        learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial
        value to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act
        function.
    """
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput([84, 84], name=name) act, train, update_target, debug = build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=2, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': 2, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.step(0) with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: #obs = env.reset() episode_rewards.append(0) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and \
                    len(episode_rewards) % print_freq == 0:
                #logger.record_tabular("steps", t)
                #logger.record_tabular("episodes", num_episodes)
                #logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                #logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                #logger.dump_tabular()
                print("steps: {}".format(t))
                print("episodes: {}".format(num_episodes))
                print("mean 100 episode reward: {}".format(mean_100ep_reward))
                print("% time spent exploring: {}".format(
                    int(100 * exploration.value(t))))

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or \
                        mean_100ep_reward > saved_mean_reward:
                    #if print_freq is not None:
                    #    logger.log("Saving model due to mean reward increase: {} -> {}".format(
                    #        saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward

        if model_saved:
            #if print_freq is not None:
            #    logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)
    return ActWrapper(act, act_params)

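# Illustration (added, not part of the original source): the exploration epsilon in
# `learn` above comes from the baselines-style LinearSchedule, which interpolates
# linearly from `initial_p` to `final_p` over `schedule_timesteps` steps and then
# holds `final_p`. A minimal sketch, assuming the default arguments of `learn`
# (max_timesteps=100000, exploration_fraction=0.1, exploration_final_eps=0.02) and
# the same LinearSchedule import used by the code above.
exploration_demo = LinearSchedule(schedule_timesteps=10000,
                                  initial_p=1.0,
                                  final_p=0.02)
for step in (0, 5000, 10000, 50000):
    # Expected values: 1.00 at step 0, 0.51 at step 5000, 0.02 from step 10000 on.
    print("epsilon at step {}: {:.2f}".format(step, exploration_demo.value(step)))
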
class model(object):
    def __init__(self, config):
        self._config = config
        self._eps_schedule = LinearSchedule(self._config.eps_begin,
                                            self._config.eps_end,
                                            self._config.nsteps)
        self._lr_schedule = LinearSchedule(self._config.lr_begin,
                                           self._config.lr_end,
                                           self._config.lr_nsteps)
        self._oq = Order_Queue(self._config.order_path)
        self._mq = Message_Queue(self._config.message_path)
        self._bf = ReplayBuffer(1000000, config)
        self._action_fn = self.get_action_fn()
        self.build()

    def build(self):
        pass

    def initialize(self):
        pass

    def get_random_action(self, state):
        pass

    def get_best_action(self, state):
        ### return action, q value
        pass

    def get_action(self, state):
        if np.random.random() < self._eps_schedule.get_epsilon():
            return self.get_random_action(state)[0]
        else:
            return self.get_best_action(state)[0]

    def get_random_action_fn(self):
        def random_action_fn(t, amount, state, mid_price):
            action = np.random.randint(
                self._config.L)  # action = L for market order
            price = (action -
                     self._config.L // 2) * self._config.base_point + mid_price
            return (price, action)

        return random_action_fn

    def get_action_fn(self):
        def action_fn(t, amount, state, mid_price):
            action = self.get_action(state)
            price = (action -
                     self._config.L // 2) * self._config.base_point + mid_price
            return (price, action)

        return action_fn

    def pad_state(self, states, state_history):
        tmp_states, tmp_its = zip(*states)
        tmp_state = np.concatenate(
            [np.expand_dims(state, -1) for state in tmp_states], axis=-1)
        tmp_state = np.pad(tmp_state,
                           ((0, 0), (0, 0),
                            (state_history - tmp_state.shape[-1], 0)),
                           'constant',
                           constant_values=0)
        tmp_it = tmp_its[-1]
        return ([tmp_state], [tmp_it])

    def simulate_an_episode(self, amount, T, H, start_time, order_direction,
                            action_fn, depth):
        dH = H // T
        self._mq.reset()
        lob_data = self._oq.create_orderbook_time(start_time, self._mq)
        lob = Limit_Order_book(**lob_data,
                               own_amount_to_trade=0,
                               own_init_price=-order_direction *
                               Limit_Order_book._DUMMY_VARIABLE,
                               own_trade_type=order_direction)
        rewards = []
        states = []
        actions = []
        done_mask = []
        amount_remain = amount
        cum_reward = 0
        for t in range(start_time, start_time + H - dH, dH):
            tmp1 = 1.0 * amount_remain / amount  # amount remaining
            tmp2 = 1.0 * (start_time + H - t) / H  # time remaining
            state = (lob.display_book(depth),
                     np.array([tmp1, tmp2], dtype=float))
            state = self.process_state(state)
            states.append(state)
            mid_price = lob.get_mid_price()
            state_input = self.pad_state(states[-self._config.state_history:],
                                         self._config.state_history)
            price, action = action_fn(start_time + H - t, amount_remain,
                                      state_input, mid_price)
            actions.append(action)
            done_mask.append(False)
            lob.update_own_order(price, amount_remain)
            for idx, message in self._mq.pop_to_next_time(t + dH):
                lob.process(**message)
                if lob.own_amount_to_trade == 0:
                    done_mask.append(True)
                    state = (lob.display_book(depth),
                             np.array([
                                 0, 1.0 * (start_time + H - self._mq._time) / H
                             ], dtype=float))
                    state = self.process_state(state)
                    states.append(state)
                    rewards.append(lob.own_reward - cum_reward)
                    break
            if done_mask[-1]:
                break
            else:
                # Order not yet fully executed: record the reward earned over
                # this interval and carry the remaining amount forward.
                rewards.append(lob.own_reward - cum_reward)
                cum_reward = lob.own_reward
                amount_remain = lob.own_amount_to_trade

        if not done_mask[-1]:
            tmp1 = 1.0 * amount_remain / amount
            tmp2 = 1.0 * (start_time + H - t - dH) / H
            state = (lob.display_book(depth),
                     np.array([tmp1, tmp2], dtype=float))
            state = self.process_state(state)
            states.append(state)
            done_mask.append(False)
            lob.update_own_order(lob.own_trade_type *
                                 Limit_Order_book._DUMMY_VARIABLE)
            if lob.own_amount_to_trade == 0:
                rewards.append(lob.own_reward - cum_reward)
            else:
                rewards.append(-Limit_Order_book._DUMMY_VARIABLE)
            tmp1 = 1.0 * lob.own_amount_to_trade / amount
            state = (lob.display_book(depth),
                     np.array([tmp1, 0], dtype=float))
            state = self.process_state(state)
            states.append(state)
            actions.append(self._config.L)
            done_mask.append(True)
        return (states, rewards, actions, done_mask[1:])

    def sampling_buffer(self):
        for start_time in range(self._config.train_start,
                                self._config.train_end, self._config.H):
            states, rewards, actions, done_mask = self.simulate_an_episode(
                self._config.I, self._config.T, self._config.H, start_time,
                self._config.direction, self._action_fn, self._config.depth)
            self._bf.store(states, actions, rewards, done_mask)

    def process_state(self, state):
        state_book, state_it = state
        state_book = state_book.astype('float32')
        state_book[:, 0] /= 1.e6
        state_book[:, 1] /= 1.e2
        state_book[:, 2] /= 1.e6
        state_book[:, 3] /= 1.e2
        return (state_book, state_it)

    model.compile(optimizer=Adam(lr=0.001), loss='mae')
    model.summary()
    return model


model = create_model()
if os.path.isfile(WEIGHTS_PATH) and os.access(WEIGHTS_PATH, os.R_OK):
    model.load_weights(WEIGHTS_PATH)

# memory = deque(maxlen=MEMORY)
env = gym.make(ENV_NAME)
replay_buffer = PrioritizedReplayBuffer(MEMORY,
                                        alpha=prioritized_replay_alpha)
beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                               initial_p=prioritized_replay_beta0,
                               final_p=1.0)
eps_schedule = LinearSchedule(eps_iters, initial_p=eps0, final_p=0.1)


def get_action(ob, epsilon):
    moves = env.action_space
    q_values = []
    observations = np.expand_dims(ob, 0)
    for move in moves:
        action = np.zeros((80, ))
        if move > 99:
            action[move - 29] = 1
        else:

class model(object):
    def __init__(self, config):
        self._config = config
        self._eps_schedule = LinearSchedule(self._config.eps_begin,
                                            self._config.eps_end,
                                            self._config.nsteps)
        self._lr_schedule = LinearSchedule(self._config.lr_begin,
                                           self._config.lr_end,
                                           self._config.lr_nsteps)
        self._sim = TrafficSimulator(config)
        self._bf = ReplayBuffer(10000, config)
        self._action_fn = self.get_action_fn()
        self.build()

    def build(self):
        pass

    def initialize(self):
        pass

    def get_random_action(self, state):
        pass

    def get_best_action(self, state):
        ### return action, q value
        pass

    def get_action(self, state):
        if np.random.random() < self._eps_schedule.get_epsilon():
            return self.get_random_action(state)[0]
        else:
            return self.get_best_action(state)[0]

    def get_random_action_fn(self):
        def random_action_fn(state):
            action = np.random.randint(5)
            return action

        return random_action_fn

    def get_action_fn(self):
        return self.get_action

    def pad_state(self, states, state_history):
        tmp_states = states
        tmp_state = np.concatenate(
            [np.expand_dims(state, -1) for state in tmp_states], axis=-1)
        tmp_state = np.pad(tmp_state,
                           ((0, 0),
                            (state_history - tmp_state.shape[-1], 0)),
                           'constant',
                           constant_values=0)
        return [tmp_state]

    def simulate_an_episode(self, T, action_fn):
        rewards = []
        states = []
        actions = []
        self._sim.reset()
        cum_reward = 0
        for t in range(T):
            state = self._sim.state()
            states.append(state)
            state_input = self.pad_state(states[-self._config.state_history:],
                                         self._config.state_history)
            action = action_fn(state_input)
            actions.append(action)
            reward = self._sim.progress(action)
            rewards.append(reward)
        return (states, rewards, actions)

    def sampling_buffer(self):
        for s in range(self._config.nBufferSample):
            if s % 20 == 0:
                print("Sample buffer: ", s)
            states, rewards, actions = self.simulate_an_episode(
                self._config.T, self._action_fn)
            self._bf.store(states, actions, rewards)

    outside_value=student_config.exp_outside_value)
# exp_schedule = LinearExploration(env, student_config.eps_begin,
#                                  student_config.eps_end, student_config.eps_nsteps)

# learning rate schedule
lr_schedule = PiecewiseSchedule(
    student_config.lr_endpoints,
    outside_value=student_config.lr_outside_value)
# lr_schedule = LinearSchedule(student_config.lr_begin, student_config.lr_end,
#                              student_config.lr_nsteps)

# teacher choice strategy
num_teachers = len(args.teacher_checkpoint_dirs)
if args.choose_teacher_q in ['eps_greedy_bandit']:
    eps_schedule = LinearSchedule(student_config.teacher_choice_eps_begin,
                                  student_config.teacher_choice_eps_end,
                                  student_config.teacher_choice_eps_nsteps)
if args.bandit_reward_method == 'rolling_avg':
    bandit_alpha_schedule = LinearSchedule(
        student_config.teacher_choice_bandit_alpha_begin,
        student_config.teacher_choice_bandit_alpha_end,
        student_config.teacher_choice_bandit_alpha_nsteps)
elif args.bandit_reward_method == 'avg_all_time':
    bandit_alpha_schedule = None

if args.choose_teacher_q == 'random_bandit':
    choose_teacher_strategy = RandomBandit(num_teachers,
                                           args.bandit_reward_method,
                                           bandit_alpha_schedule)
elif args.choose_teacher_q == 'eps_greedy_bandit':
    choose_teacher_strategy = EpsilonGreedyBandit(

elif FLAGS.network_type == 'recurrent_q':
    network = network.RecurrentQ(FLAGS)
elif FLAGS.network_type == 'transfer_q':
    network = network.TransferQ(FLAGS)
elif FLAGS.network_type == 'deep_ac':
    network = network.DeepAC(FLAGS)
else:
    raise NotImplementedError

# Initialize exploration strategy
exp_schedule = LinearExploration(env, FLAGS.epsilon, FLAGS.eps_end,
                                 FLAGS.eps_nsteps)

# Initialize learning rate schedule
lr_schedule = LinearSchedule(FLAGS.learning_rate, FLAGS.lr_end,
                             FLAGS.lr_nsteps)

# train model
model = None
if FLAGS.model_type == 'q':
    model = Model(env, record_env, network, FLAGS)
elif FLAGS.model_type == 'ac':
    model = ModelAC(env, record_env, network, FLAGS)
else:
    raise NotImplementedError

if FLAGS.record_only:
    model.record_videos(FLAGS.model_path + 'checkpoint')
else:
    try:
        model.run(exp_schedule, lr_schedule)

def learn(env,
          lr=1e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          action_repeat=4,
          batch_size=32,
          learning_starts=1000,
          gamma=0.99,
          target_network_update_freq=500,
          model_identifier='agent'):
    """ Train a deep q-learning model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to take
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is
        annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    action_repeat: int
        select an action on every n-th frame and repeat it for the
        intermediate frames
    batch_size: int
        size of a batch sampled from the replay buffer for training
    learning_starts: int
        how many steps of the model to collect transitions for before
        learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    model_identifier: string
        identifier of the agent
    """
    episode_rewards = [0.0]
    training_losses = []
    actions = get_action_set()
    action_size = len(actions)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build networks
    policy_net = DQN(action_size, device).to(device)
    target_net = DQN(action_size, device).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Create replay buffer
    replay_buffer = ReplayBuffer(buffer_size)

    # Create optimizer
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * total_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Initialize environment and get first state
    obs = get_state(env.reset())

    # Iterate over the total number of time steps
    try:
        for t in range(total_timesteps):
            # Select action
            action_id = select_exploratory_action(obs, policy_net,
                                                  action_size, exploration, t)
            env_action = actions[action_id]

            # Repeat the selected action `action_repeat` times (frame skipping)
            for f in range(action_repeat):
                new_obs, rew, done, _ = env.step(env_action)
                episode_rewards[-1] += rew
                # if episode_rewards[-1] < termination_reward:
                #     done = True
                if done:
                    break

            # Store transition in the replay buffer.
            new_obs = get_state(new_obs)
            replay_buffer.add(obs, action_id, rew, new_obs, float(done))
            obs = new_obs

            if done:
                # Start new episode after previous episode has terminated
                print("timestep: " + str(t) + " \t reward: " +
                      str(episode_rewards[-1]))
                obs = get_state(env.reset())
                episode_rewards.append(0.0)

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled
                # from the replay buffer.
                loss = perform_qlearning_step(policy_net, target_net,
                                              optimizer, replay_buffer,
                                              batch_size, gamma, device)
                training_losses.append(loss)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target_net(policy_net, target_net)

            if t % 1000 == 0:
                # Save the trained policy network
                torch.save(policy_net.state_dict(), model_identifier + '.pt')
                # Visualize the training loss and cumulative reward curves
                visualize_training(episode_rewards, training_losses,
                                   model_identifier)
    except KeyboardInterrupt:
        pass
    finally:
        # Save the trained policy network
        torch.save(policy_net.state_dict(), model_identifier + '.pt')
        # Visualize the training loss and cumulative reward curves
        visualize_training(episode_rewards, training_losses, model_identifier)

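# Usage sketch (added, not part of the original source): a minimal, hypothetical call
# to the `learn` function above. The environment name and hyperparameters are
# illustrative assumptions; any Gym environment compatible with the surrounding
# project's get_state()/get_action_set() helpers is assumed.
if __name__ == "__main__":
    import gym

    env = gym.make("CarRacing-v0")  # assumed environment, not confirmed by the source
    learn(env,
          lr=1e-4,
          total_timesteps=100000,
          action_repeat=4,
          model_identifier='agent')
    env.close()
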
def challenger_round():
    challengers = []
    leaders = []
    leader_checkpoints = os.listdir(LEADER_DIR)
    # Need to share the same schedule with all challengers, so they all anneal
    # at same rate
    epsilon_schedule = LinearSchedule(EPS_START, EPS_END, TRAIN_FRAMES)
    for i in xrange(NUM_LEADERS):
        challenger = try_gpu(
            DQNAgent(6,
                     epsilon_schedule,
                     OBSERVATION_MODE,
                     lr=LR,
                     max_grad_norm=GRAD_CLIP_NORM))
        if i < len(leader_checkpoints):
            leader = try_gpu(
                DQNAgent(6, LinearSchedule(0.1, 0.1, 500000),
                         OBSERVATION_MODE))
            leader_path = os.path.join(LEADER_DIR, leader_checkpoints[i])
            print "LOADING CHECKPOINT: {}".format(leader_path)
            challenger.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
            leader.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
        else:
            leader = RandomAgent(6)
            print "INITIALIZING NEW CHALLENGER AND LEADER"
        challengers.append(challenger)
        leaders.append(leader)

    if CHALLENGER_DIR is not None:
        challengers = []
        # Load in all of the leaders
        for checkpoint in os.listdir(CHALLENGER_DIR):
            path = os.path.join(CHALLENGER_DIR, checkpoint)
            print "LOADING FROM CHALLENGER_DIR: {}".format(path)
            challenger = try_gpu(
                DQNAgent(6,
                         LinearSchedule(0.05, 0.05, 1),
                         CHALLENGER_OBSERVATION_MODE,
                         lr=LR,
                         max_grad_norm=GRAD_CLIP_NORM,
                         name=checkpoint))
            challenger.load_state_dict(
                torch.load(path, map_location=lambda storage, loc: storage))
            challengers.append(challenger)

    challenger = EnsembleDQNAgent(challengers)
    leader = EnsembleDQNAgent(leaders)
    if OPPONENT is not None or HUMAN:
        leader = NoOpAgent()
    replay_buffer = ReplayBuffer(1000000)
    rewards = collections.deque(maxlen=1000)
    frames = 0  # number of training frames seen
    episodes = 0  # number of training episodes that have been played
    with tqdm(total=TRAIN_FRAMES) as progress:
        # Each loop completes a single episode
        while frames < TRAIN_FRAMES:
            states = env.reset()
            challenger.reset()
            leader.reset()
            episode_reward = 0.
            episode_frames = 0
            # Each loop completes a single step, duplicates _evaluate() to
            # update at the appropriate frame #s
            for _ in xrange(MAX_EPISODE_LENGTH):
                frames += 1
                episode_frames += 1
                action1 = challenger.act(states[0])
                action2 = leader.act(states[1])
                next_states, reward, done = env.step(action1, action2)
                episode_reward += reward

                # NOTE: state and next_state are LazyFrames and must be
                # converted to np.arrays
                replay_buffer.add(
                    Experience(states[0], action1._action_index, reward,
                               next_states[0], done))
                states = next_states

                if len(replay_buffer) > 50000 and frames % 4 == 0:
                    experiences = replay_buffer.sample(32)
                    challenger.update_from_experiences(experiences)

                if frames % 10000 == 0:
                    challenger.sync_target()

                if frames % SAVE_FREQ == 0:
                    # TODO: Don't access internals
                    for agent in challenger._agents:
                        path = os.path.join(
                            LEADER_DIR, agent.name + "-{}".format(frames))
                        print "SAVING CHECKPOINT TO: {}".format(path)
                        torch.save(agent.state_dict(), path)
                    #path = os.path.join(
                    #    LEADER_DIR, challenger.name + "-{}".format(frames))
                    #torch.save(challenger.state_dict(), path)

                if frames >= TRAIN_FRAMES:
                    break

                if done:
                    break

            if episodes % 300 == 0:
                print "Evaluation: {}".format(
                    evaluate(challenger, leader, EPISODES_EVALUATE_TRAIN))
                print "Episode reward: {}".format(episode_reward)

            episodes += 1
            rewards.append(episode_reward)
            stats = challenger.stats
            stats["Avg Episode Reward"] = float(sum(rewards)) / len(rewards)
            stats["Num Episodes"] = episodes
            stats["Replay Buffer Size"] = len(replay_buffer)
            progress.set_postfix(stats, refresh=False)
            progress.update(episode_frames)
            episode_frames = 0
