def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)

    # The parameters below were the best found in a simple random search;
    # they are good enough to make the humanoid walk, but it is not certain
    # that they are an absolute best.
    env = RewScale(env, 0.1)
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.2, entcoeff=0.0,
                             optim_epochs=10,
                             optim_stepsize=3e-4,
                             optim_batchsize=64,
                             gamma=0.99,
                             lam=0.95,
                             schedule='linear',
                             )
    env.close()
    if model_path:
        U.save_state(model_path)
    return pi
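# A minimal sketch of how a policy saved by train() might be restored and replayed,
# assuming the same helpers used above (make_mujoco_env, U.load_state) and the
# standard MlpPolicy.act(stochastic, ob) interface; the path handling is illustrative.
def visualize(model_path, seed=0):
    pi = train(num_timesteps=1, seed=seed)   # build the graph only
    U.load_state(model_path)                 # restore the weights written by train()
    env = make_mujoco_env('Humanoid-v2', seed=seed)
    ob = env.reset()
    while True:
        action = pi.act(stochastic=False, ob=ob)[0]
        ob, _, done, _ = env.step(action)
        env.render()
        if done:
            ob = env.reset()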
def save(self, path):
    """Save model to a pickle located at `path`"""
    with tempfile.TemporaryDirectory() as td:
        U.save_state(os.path.join(td, "model"))
        arc_name = os.path.join(td, "packed.zip")
        with zipfile.ZipFile(arc_name, 'w') as zipf:
            for root, dirs, files in os.walk(td):
                for fname in files:
                    file_path = os.path.join(root, fname)
                    if file_path != arc_name:
                        zipf.write(file_path, os.path.relpath(file_path, td))
        with open(arc_name, "rb") as f:
            model_data = f.read()
    with open(path, "wb") as f:
        dill.dump((model_data, self._act_params), f)
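# For symmetry, a sketch of the matching loader: it unpickles the blob written by
# save(), unpacks the zipped checkpoint into a temporary directory, and restores the
# TensorFlow variables with U.load_state. build_act and ActWrapper are assumed to be
# the same helpers used with this wrapper elsewhere; treat this as illustrative.
@staticmethod
def load(path, num_cpu=16):
    with open(path, "rb") as f:
        model_data, act_params = dill.load(f)
    act = build_act(**act_params)
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        U.load_state(os.path.join(td, "model"))
    return ActWrapper(act, act_params)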
def maybe_save_model(savedir, container, state):
    """This function checkpoints the model and state of the training algorithm."""
    if savedir is None:
        return
    start_time = time.time()
    model_dir = "model-{}".format(state["num_iters"])
    U.save_state(os.path.join(savedir, model_dir, "saved"))
    if container is not None:
        container.put(os.path.join(savedir, model_dir), model_dir)
    relatively_safe_pickle_dump(state, os.path.join(savedir, 'training_state.pkl.zip'), compression=True)
    if container is not None:
        container.put(os.path.join(savedir, 'training_state.pkl.zip'), 'training_state.pkl.zip')
    relatively_safe_pickle_dump(state["monitor_state"], os.path.join(savedir, 'monitor_state.pkl'))
    if container is not None:
        container.put(os.path.join(savedir, 'monitor_state.pkl'), 'monitor_state.pkl')
    logger.log("Saved model in {} seconds\n".format(time.time() - start_time))
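# A hedged sketch of the companion loader: it looks for the pickled training state
# written by maybe_save_model and, if found, restores both the state dict and the
# TensorFlow checkpoint. pickle_load is assumed to be the read-side counterpart of
# relatively_safe_pickle_dump.
def maybe_load_model(savedir, container):
    """Load model if present at the specified path."""
    if savedir is None:
        return
    state_path = os.path.join(savedir, 'training_state.pkl.zip')
    if container is not None:
        found_model = container.get(savedir, 'training_state.pkl.zip')
    else:
        found_model = os.path.exists(state_path)
    if found_model:
        state = pickle_load(state_path, compression=True)
        model_dir = "model-{}".format(state["num_iters"])
        if container is not None:
            container.get(savedir, model_dir)
        U.load_state(os.path.join(savedir, model_dir, "saved"))
        logger.log("Loaded model checkpoint at {} iterations".format(state["num_iters"]))
        return state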
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4,
          ckpt_dir=None, log_dir=None, task_name=None, verbose=False):

    val_per_iter = int(max_iters / 10)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # placeholders
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
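# A hedged sketch of how the behavior-cloning pretrainer above might be invoked,
# assuming the GAIL-style expert dataset loader (Mujoco_Dset) and the MlpPolicy
# factory used elsewhere in these examples; the expert path is illustrative only.
def pretrain(env, expert_path="data/deterministic.trpo.Hopper.0.00.npz"):
    dataset = Mujoco_Dset(expert_path=expert_path, traj_limitation=-1)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=100, num_hid_layers=2)

    ckpt_fname = learn(env, policy_fn, dataset,
                       max_iters=int(1e4), ckpt_dir="checkpoint",
                       task_name="bc_pretrain", verbose=True)
    return ckpt_fname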
def save(self, save_path):
    tf_util.save_state(save_path, sess=self.sess)
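# The matching restore is typically a one-liner; a sketch assuming tf_util exposes
# load_state with the same session keyword as the save_state call above.
def load(self, load_path):
    tf_util.load_state(load_path, sess=self.sess)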
def learn(env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=1000, learning_starts=50, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, job_id=None, outdir="/tmp/rosrl/experiments/discrete/deepq/"): """Train a deepqn model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. action_no: int number of actions available in action space actions_discr: Box space Discretized actions Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. """ # Create all the functions necessary to train the model # sess = tf.Session() # sess.__enter__() if job_id is not None: #Directory for log and Tensorboard data outdir = '/tmp/rosrl/' + str( env.__class__.__name__) + '/deepq/' + 'sim_' + job_id else: outdir = '/tmp/rosrl/' + str(env.__class__.__name__) + '/deepq/' #TODO This should not go here. 
Instead pass both action_no and actions as arguments to learn function #Discrete actions goal_average_steps = 2 max_number_of_steps = 20 last_time_steps = np.ndarray(0) n_bins = 10 epsilon_decay_rate = 0.99 ######## it = 1 ###### # Number of states is huge so in order to simplify the situation # typically, we discretize the space to: n_bins ** number_of_features joint1_bins = pandas.cut([-np.pi / 2, np.pi / 2], bins=n_bins, retbins=True)[1][1:-1] joint2_bins = pandas.cut([-np.pi / 2, np.pi / 2], bins=n_bins, retbins=True)[1][1:-1] joint3_bins = pandas.cut([-np.pi / 2, np.pi / 2], bins=n_bins, retbins=True)[1][1:-1] action_bins = pandas.cut([-np.pi / 2, np.pi / 2], bins=n_bins, retbins=True)[1][1:-1] difference_bins = abs(joint1_bins[0] - joint1_bins[1]) actions_discr = [(difference_bins, 0.0, 0.0), (-difference_bins, 0.0, 0.0), (0.0, difference_bins, 0.0), (0.0, -difference_bins, 0.0), (0.0, 0.0, difference_bins), (0.0, 0.0, -difference_bins), (0.0, 0.0, 0.0)] action_no = 7 actions = [0, 1, 2, 3, 4, 5, 6] # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape # with tf.Session(config=tf.ConfigProto()) as session: def make_obs_ph(name): return U.BatchInput(observation_space_shape, name=name) act, train, update_target, debug = build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=action_no, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, #'num_actions': env.action_space.n, 'num_actions': action_no, } act = ActWrapper(act, act_params) # TODO: include also de Prioritized buffer # # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int( exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: # Log training stuff using tf primitives summary_writer = tf.summary.FileWriter( outdir, graph=tf.get_default_graph()) # render the environment to visualize the progress env.render() sim_r = 0 sim_t = 0 done_quant = 0 model_saved = False model_file = os.path.join(td, "model") for e in range(150): # run 10 episodes print("Episode: ", e) # reset the environment obs = env.reset() print("observation: ", obs[:3]) episode_rewards = [0.0] for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} ## TODO: review in more detail if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # # Compute the threshold such that the KL divergence between perturbed and non-perturbed # # policy is comparable to eps-greedy exploration with eps = exploration.value(t). 
# # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # # for detailed explanation. update_param_noise_threshold = -np.log( 1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] if isinstance(env.action_space, gym.spaces.MultiBinary): env_action = np.zeros(env.action_space.n) env_action[action] = 1 else: env_action = action update_eps = exploration.value(t) update_param_noise_threshold = 0. # Choose action action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] reset = False new_obs, rew, done, _ = step(env, actions_discr[action], obs[:3]) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew # RK: removed this, too many prints # print("reward: ", rew) # Log the episode reward #summary = tf.Summary(value=[tf.Summary.Value(tag="Episode reward", simple_value = episode_rewards[-1]/(t + 1))]) #summary_writer.add_summary(summary, t+ e*max_timesteps) # print("average episode reward: ", episode_rewards[-1]/(t + 1)) sim_r += rew sim_t += 1 if done: # summary = tf.Summary(value=[tf.Summary.Value(tag="Mean episode reward", simple_value = episode_rewards[-1]/(t + 1))]) # summary_writer.add_summary(summary, t) done_quant += 1 print("Done!") obs = env.reset() episode_rewards.append(0.0) reset = True if t + e * max_timesteps > learning_starts and t % train_freq == 0: # TODO review if prioritized_replay is needed # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # Minimize the error in Bellman's equation on a batch sampled from replay buffer. obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None #td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) #[td_errors, weighted_error] = train(obses_t, actions, rewards, obses_tp1, dones, weights) [ td_error, weighted_error, q_t_selected_target, rew_t_ph ] = train(obses_t, actions, rewards, obses_tp1, dones, weights) #logger.log("Evaluating losses...") #logger.log("q_t_selected_target", q_t_selected_target) #logger.log("Episode reward", episode_rewards[-1]) # TODO review if prioritized_replay is needed if prioritized_replay: new_priorities = np.abs( td_errors) + prioritized_replay_eps replay_buffer.update_priorities( batch_idxes, new_priorities) if t + e * max_timesteps > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() mean_100ep_reward = round(np.mean(episode_rewards[-6:-1]), 1) #print("SIMPLE ROBOTICS -> Episode rewards",episode_rewards) #print("SIMPLE ROBOTICS -> np.mean(Episode rewards)", len(episode_rewards)) #print("SIMPLE ROBOTICS -> mean_100ep_reward", mean_100ep_reward) #print("line 383 -> SIMULATION_REWARD", sim_r / 5 * max_timesteps) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() print("steps", t) print("episodes", num_episodes) print("mean 100 episode reward", mean_100ep_reward) print("% time spent exploring", int(100 * exploration.value(t))) if (checkpoint_freq is not None and t + e * max_timesteps > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) opt_r = 1 - (sim_r / sim_t) # Log training stuff using tf primitives summary_writer = tf.summary.FileWriter(outdir + '/error/', graph=tf.get_default_graph()) summary = tf.Summary( value=[tf.Summary.Value(tag="Simulation error", simple_value=opt_r)]) summary_writer.add_summary(summary, job_id) summary_writer.flush() summary_writer_done = tf.summary.FileWriter(outdir + '/done/', graph=tf.get_default_graph()) summary_done = tf.Summary( value=[tf.Summary.Value(tag="No. dones", simple_value=done_quant)]) summary_writer_done.add_summary(summary_done, job_id) summary_writer_done.flush() print("OPT_r", opt_r) print("No. of times it converges: ", done_quant) # act_tmp = act # session.close() # tf.reset_default_graph() return act, opt_r
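# A hypothetical invocation of the learn() above, assuming a discrete-action
# ROS/gym environment and the MLP q-function builder from baselines.deepq.models;
# the environment id and hyperparameters are illustrative only.
import gym
from baselines import deepq

env = gym.make("ScaraArm-v0")          # hypothetical environment id
q_model = deepq.models.mlp([64, 64])
act, opt_r = learn(env, q_func=q_model, lr=5e-4, max_timesteps=1000, job_id="0")
act.save("scara_deepq.pkl")            # ActWrapper.save, as shown earlier in these examples
print("final optimization score:", opt_r)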
lenbuffer = deque(maxlen=100)      # rolling buffer for episode lengths
rewbuffer = deque(maxlen=100)      # rolling buffer for episode rewards
ext_rewbuffer = deque(maxlen=100)  # rolling buffer for extrinsic episode rewards
int_rewbuffer = deque(maxlen=100)  # rolling buffer for intrinsic episode rewards
distbuffer = deque(maxlen=100)
tstart = time.time()
writer = U.FileWriter(tensorboard_dir)
loss_stats = stats(["pol_surr", "pol_entpen", "vf_ext_loss", "vf_int_loss", "kl", "ent", "aux_loss"])
ep_stats = stats(["Reward_Ext", "Reward_Int", "Episode_Length", "Episode_This_Iter", "Distance"])

while timesteps_so_far < args.max_timesteps:
    # Save model
    if iters_so_far % args.save_per_iter == 0 and iters_so_far > 0 and ckpt_dir is not None:
        U.save_state(os.path.join(ckpt_dir, task_name), counter=iters_so_far)

    logger.log2("********** Iteration %i ************" % iters_so_far)
    seg = seg_gen.next()
    losses = policy.train(seg, args.optim_batchsize, args.optim_epochs)

    lrlocal = (seg["ep_lens"], seg["ep_rets_ext"], seg["ep_rets_int"], seg["ep_dists"])  # local values
    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
    lens, rews_ext, rews_int, dists = map(flatten_lists, zip(*listoflrpairs))
    lenbuffer.extend(lens)
    ext_rewbuffer.extend(rews_ext)
    int_rewbuffer.extend(rews_int)
    # rewbuffer.extend(list(np.array(rews_ext) + np.array(rews_int)))
    distbuffer.extend(dists)
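# At start-up, a run like the one above could resume from the newest counter-suffixed
# checkpoint it writes; a minimal sketch, assuming U.load_state accepts the path
# returned by tf.train.latest_checkpoint.
latest = tf.train.latest_checkpoint(ckpt_dir)
if latest is not None:
    U.load_state(latest)
    logger.log2("Resumed from checkpoint %s" % latest)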
def main():
    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
    env = StarCraft2Env(map_name="8m", reward_only_positive=False, reward_scale_rate=200,
                        state_last_action=True, obs_last_action=True,
                        obs_timestep_number=True, state_timestep_number=True)  # reward_defeat=-200
    env_info = env.get_env_info()

    n_episodes = 2500  # 4000 # 2000
    timesteps = 500000
    n_agents = env_info["n_agents"]
    n_actions = env_info["n_actions"]
    output_len = n_actions
    lr = 0.002
    buffer_size = 70000  # int(timesteps * 0.1)  # keep it small -- ideally about 1/10 of the training steps; tried 70000, 200 (test), 80000, 20000
    batch_size = 32  # 32
    gamma = 0.99
    num_agents = 8
    local_obs_len = 179  # local obs: 80; global state: 168
    global_state_len = 348  # 179 + 169
    hidden_vector_len = 256  # 128 # 1 256
    tau = 0.001
    num_exploring = buffer_size  # buffer_size
    action_low = -1
    action_high = 1
    save_freq = 10000
    critic_output_len = 1

    logdir = "tensorboard/%s/%s_lr%s/%s" % ("BicNet", timesteps, lr, start_time)
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None, output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()

    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents,
                         local_obs_len, output_len, hidden_vector_len)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(), num_agents,
                           global_state_len, critic_output_len, hidden_vector_len, n_actions)
    sess.run(tf.global_variables_initializer())
    replay_buffer = ReplayBuffer(buffer_size)
    action_noise = OU_noise(decay_period=timesteps - buffer_size)
    action_noise.reset()

    # model_file_load = os.path.join(str(350000) + "_" + "model_segment_training2/", "defeat_zerglings")
    # U.load_state(model_file_load, sess)
    U.initialize()

    t = 0
    step_train = 0
    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0
        local_obs = env.get_obs()
        local_obs = np.array(local_obs)
        global_state = env.get_state()
        global_state_expand = np.zeros(
            [local_obs.shape[0], local_obs.shape[1] + global_state.shape[0]])
        reward_hl_own_old = []
        reward_hl_en_old = []
        episode_reward_agent = [0 for n in range(n_agents)]
        for i in range(local_obs.shape[0]):
            global_state_expand[i] = np.append(local_obs[i], global_state.flatten())
            reward_hl_own_old.append(env.get_agent_health(i))
            reward_hl_en_old.append(env.get_enemy_health(i))

        while not terminated:
            t = t + 1
            critic_input = np.expand_dims(global_state_expand, axis=0)
            actor_input = np.expand_dims(local_obs, axis=0)
            action = actor.predict(actor_input)[0]
            act_with_noise = action  # np.clip(action + action_noise.get_noise(step_train), action_low, action_high)
            act_mat_norm = (act_with_noise + 1) / 2
            actions = []
            dead_unit = []
            rew_expand = np.zeros((n_agents, 1))

            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]
                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]
                if (sum_avail_act == 0):
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act
                index = np.random.choice(np.array(avail_actions_ind), p=act_prob.ravel())
                actions.append(index)
                if (len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0):
                    dead_unit.append(agent_id)

            reward_base, terminated, info = env.step(actions)
            new_local_obs = env.get_obs()
            new_local_obs = np.array(new_local_obs)
            new_global_state = env.get_state()
            new_global_state_expand = np.zeros([
                new_local_obs.shape[0],
                new_local_obs.shape[1] + new_global_state.shape[0]
            ])
            reward_hl_own_new = []
            reward_hl_en_new = []
            for i in range(new_local_obs.shape[0]):
                new_global_state_expand[i] = np.append(
                    new_local_obs[i], new_global_state.flatten())
                reward_hl_own_new.append(env.get_agent_health(i))
                reward_hl_en_new.append(env.get_enemy_health(i))

            for i in range(n_agents):
                if (i in dead_unit):
                    rew_expand[i] = 0
                else:
                    rew_expand[i] = -0.05
                    if (actions[i] > 5):
                        target_id = actions[i] - 6
                        health_reduce_en = reward_hl_en_old[target_id] - reward_hl_en_new[target_id]
                        if (health_reduce_en > 0):
                            rew_expand[i] += 2 + health_reduce_en * 5
                            # if (reward_base > 50):
                            #     rew_expand[i] += 20
                        else:
                            rew_expand[i] += 1
                    else:
                        rew_expand[i] += (reward_hl_own_new[i] - reward_hl_own_old[i]) * 5
                if (terminated):
                    if (info["battle_won"] is False):
                        rew_expand[i] += -10
                    else:
                        rew_expand[i] += 10
                episode_reward_agent[i] += rew_expand[i]

            replay_buffer.add(local_obs, global_state_expand, act_with_noise, rew_expand,
                              terminated, new_local_obs, new_global_state_expand)
            episode_reward += reward_base
            local_obs = new_local_obs
            global_state_expand = new_global_state_expand

            if (t == num_exploring):
                print("training starts")
            if (t >= num_exploring):
                local_s_batch, global_s_batch, a_batch, r_batch, done_batch, local_s2_batch, global_s2_batch = replay_buffer.sample_batch(
                    batch_size)  # [group0:[batch_size, trace.dimension], group1, ... group8]
                target_q = r_batch + gamma * critic.predict_target(
                    global_s2_batch, actor.predict_target(local_s2_batch))
                predicted_q_value, _ = critic.train(
                    global_s_batch, a_batch,
                    np.reshape(target_q, (batch_size, num_agents, critic_output_len)))
                a_outs = actor.predict(local_s_batch)  # a_outs and a_batch are identical
                grads = critic.action_gradients(global_s_batch, a_outs)  # gradient of Q with respect to the actions
                actor.train(local_s_batch, grads)
                step_train = step_train + 1

                actor.update_target_network()
                critic.update_target_network()

                if (t % save_freq == 0):
                    model_file_save = os.path.join(
                        "model/" + str(step_train) + "_" + "training_steps_model/", "8m")
                    U.save_state(model_file_save)
                    print("Model has been trained for %s steps" % (step_train))
                    # replay_buffer.save()

        print("steps until now : %s, episode: %s, episode reward: %s" % (t, e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward_episode", episode_reward)
        for i in range(n_agents):
            logger.record_tabular("reward_agent_" + str(i), episode_reward_agent[i])
        logger.dump_tabular()

    # model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings")
    # U.save_state(model_file_save)
    env.close()
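# To evaluate a saved BicNet checkpoint, the commented-out load call in main() suggests
# the counterpart; a sketch assuming the same session object and save-path convention
# used during training, with an illustrative checkpoint index.
def load_for_eval(sess, step_to_load=350000):
    model_file_load = os.path.join(
        "model/" + str(step_to_load) + "_" + "training_steps_model/", "8m")
    U.load_state(model_file_load, sess)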
def learn( env, var_func, cvar_func, nb_atoms, run_alpha=None, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=0.95, target_network_update_freq=500, num_cpu=4, callback=None, periodic_save_freq=1000000, periodic_save_path=None, grad_norm_clip=None, ): """Train a CVaR DQN model. Parameters ------- env: gym.Env environment to train on var_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. cvar_func: function same as var_func nb_atoms: int number of atoms used in CVaR discretization run_alpha: float optimize CVaR_alpha while running. None if you want random alpha each episode. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the best model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. periodic_save_freq: int How often do we save the model - periodically periodic_save_path: str Where do we save the model - periodically grad_norm_clip: float Clip gradient to this value. No clipping if None Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/distdeepq/categorical.py for details on the act function. """ # Create all the functions necessary to train the model sess = make_session(num_cpu=num_cpu) sess.__enter__() obs_space_shape = env.observation_space.shape def make_obs_ph(name): return U.BatchInput(obs_space_shape, name=name) act, train, update_target, debug = build_train( make_obs_ph=make_obs_ph, var_func=var_func, cvar_func=cvar_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, nb_atoms=nb_atoms, grad_norm_clipping=grad_norm_clip) act_params = { 'make_obs_ph': make_obs_ph, 'cvar_func': cvar_func, 'var_func': var_func, 'num_actions': env.action_space.n, 'nb_atoms': nb_atoms } # Create the replay buffer replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. 
exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True episode = 0 alpha = 1. # --------------------------------- RUN --------------------------------- with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): print('Target reached') model_saved = False break # Take action and update exploration to the newest value update_eps = exploration.value(t) update_param_noise_threshold = 0. action = act(np.array(obs)[None], alpha, update_eps=update_eps)[0] reset = False new_obs, rew, done, _ = env.step(action) # ===== DEBUG ===== # s = np.ones_like(np.array(obs)[None]) # a = np.ones_like(act(np.array(obs)[None], run_alpha, update_eps=update_eps)) # r = np.array([0]) # s_ = np.ones_like(np.array(obs)[None]) # d = np.array([False]) # s = obs[None] # a = np.array([action]) # r = np.array([rew]) # s_ = new_obs[None] # d = np.array([done]) # if t % 100 == 0: # for f in debug: # print(f(s, a, r, s_, d)) # print('-------------') # # # print([sess.run(v) for v in tf.global_variables('cvar_dqn/cvar_func')]) # # print([sess.run(v) for v in tf.global_variables('cvar_dqn/var_func')]) # ================= # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if run_alpha is None: alpha = np.random.random() if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() # Log results and periodically save the model mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.record_tabular("(current alpha)", "%.2f" % alpha) logger.dump_tabular() # save and report best model if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward # save periodically if periodic_save_freq is not None and periodic_save_path is not None and t > learning_starts: if t % periodic_save_freq == 0: ActWrapper(act, act_params).save("{}-{}.pkl".format( periodic_save_path, int(t / periodic_save_freq))) if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act, act_params)
def learn( env, p_dist_func, lr=2.5e-4, eps=0.0003125, max_timesteps=100000, buffer_size=50000, exp_t1=1e6, exp_p1=0.1, exp_t2=25e6, exp_p2=0.01, # exploration_fraction=0.1, # exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=0.95, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, callback=None, dist_params=None): """Train a distdeepq model. Parameters ------- env: gym.Env environment to train on p_dist_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/distdeepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = make_session(num_cpu=num_cpu) sess.__enter__() #logger.configure() def make_obs_ph(name): return U.BatchInput(env.observation_space.shape, name=name) if dist_params is None: raise ValueError('dist_params is required') # z, dz = build_z(**dist_params) act, train, update_target, debug = distdeepq.build_train( make_obs_ph=make_obs_ph, p_dist_func=p_dist_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr, epsilon=eps), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise, dist_params=dist_params) act_params = { 'make_obs_ph': make_obs_ph, 'p_dist_func': p_dist_func, 'num_actions': env.action_space.n, 'dist_params': dist_params } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. #exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), # initial_p=1.0, # final_p=exploration_final_eps) #exploration = PiecewiseSchedule([(0, 1.0),(max_timesteps/25, 0.1), # (max_timesteps, 0.01)], outside_value=0.01) exploration = PiecewiseSchedule([(0, 1.0), (exp_t1, exp_p1), (exp_t2, exp_p2)], outside_value=exp_p2) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] reset = False new_obs, rew, done, _ = env.step(action) # rew = rew-1 for proposed loss with new metric # rew = rew-1 # Store transition in the replay buffer. replay_buffer.add(obs, action, np.sign(rew), new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) # debug['pi'] = tf.Print(debug['pi'], [debug['pi'], "target pi"]) # tf.Print(debug['mu'], [debug['mu'], "target mu"]) # tf.Print(debug['sigma'], [debug['sigma'], "target sigma"]) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act, act_params)
def learn( env, policy_func, *, timesteps_per_batch, # what to train on max_kl, cg_iters, gamma, lam, # advantage estimation entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None, load_model, # 'True' means load the model, 'False' build new model model_path): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) entbonus = entcoeff * meanent vferr = U.mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = U.mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("pol") ] vf_var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("vf") ] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n( [U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out # Load neural net variables from file or Initialize if load_model == True: print("Loading model...") model_file = tf.train.get_checkpoint_state(model_path) U.load_state(model_file.model_checkpoint_path) else: U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = 
deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], seg["adv"] fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular() # Save the model if iters_so_far % 5 == 0: U.save_state(model_path + '/model-' + str(episodes_so_far) + '.cptk') print("Model saved")
def train(model_file, game="CartPole-v1"): """Train at a game.""" with tf_util.make_session(8): env = gym.make(game) def make_placeholder(name): """Make a placeholder input.""" return tf_util.BatchInput(env.observation_space.shape, name=name) act_params = { 'make_obs_ph': make_placeholder, 'q_func': model, 'num_actions': env.action_space.n } act, train, update_target, debug = deepq.build_train( **act_params, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4) ) act = ActWrapper(act, act_params) replay_buffer = ReplayBuffer(50000) exploration = LinearSchedule( schedule_timesteps=100000, initial_p=1.0, final_p=0.02 ) tf_util.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for t in itertools.count(): action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) if not len(episode_rewards) % 100: env.render() if t > 1000: obses_t, actions, rewards, obses_tp1, dones = ( replay_buffer.sample(32) ) train( obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards) ) if not t % 1000: update_target() if not t % 3000: if model_file: tf_util.save_state(model_file) yield act if done and len(episode_rewards) % 10 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular( "mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1) ) logger.record_tabular( "% time spent exploring", int(100 * exploration.value(t)) ) logger.dump_tabular()
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, **network_kwargs): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: U.load_state(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: U.load_state(load_path) logger.log('Loaded model from {}'.format(load_path)) for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return act
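# The learn() above is the upstream baselines DQN entry point; a typical call builds an
# environment, trains, and saves the returned act wrapper. A sketch with illustrative
# hyperparameters, assuming the network string 'mlp' is resolved by build_q_func as the
# docstring describes and that the returned ActWrapper exposes the save helper shown
# earlier in these examples.
import gym

env = gym.make("CartPole-v0")
act = learn(env, network='mlp', lr=1e-3, total_timesteps=100000,
            buffer_size=50000, exploration_fraction=0.1,
            exploration_final_eps=0.02, print_freq=10)
act.save("cartpole_model.pkl")
env.close()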
def learn(env, q_func, num_actions=3, lr=5e-4, max_timesteps=1000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((64, 64), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] num_episodes = 0 saved_mean_reward = None path_memory = np.zeros((64, 64)) obs = env.reset() # Select all marines first player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = player_relative + path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) for i in range(len(player_x)): xy = [player_x[i], player_y[i]] obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy])]) group_id = 0 group_list = [] unit_xy_list = [] for i in range(len(player_x)): if i % 4 != 0: continue if group_id > 2: break xy = [player_x[i], player_y[i]] unit_xy_list.append(xy) if (len(unit_xy_list) >= 1): for idx, xy in enumerate(unit_xy_list): if (idx == 0): obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) ]) else: obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy]) ]) obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_CONTROL_GROUP, [[_CONTROL_GROUP_SET], [group_id]]) ]) unit_xy_list = [] group_list.append(group_id) group_id += 1 if (len(unit_xy_list) >= 1): for idx, xy in enumerate(unit_xy_list): if (idx == 0): obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) ]) else: obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy]) ]) obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, [[_CONTROL_GROUP_SET], [group_id]]) ]) group_list.append(group_id) group_id += 1 return obs reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if t % 1000 == 0: ActWrapper.save(ActWrapper, "mineral_shards.pkl") if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. 
update_param_noise_threshold = -np.log( 1. - exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] reset = False rew = 0 #select marines player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = player_relative + path_memory player = [] while (len(group_list) > 0): group_id = np.random.choice(group_list) obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_CONTROL_GROUP, [[_CONTROL_GROUP_RECALL], [group_id]]) ]) selected = obs[0].observation["screen"][_SELECTED] player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() if (len(player_y) > 0): player = [int(player_x.mean()), int(player_y.mean())] break else: group_list.remove(group_id) if (len(player) == 2): if (player[0] > 32): screen = shift(LEFT, player[0] - 32, screen) elif (player[0] < 32): screen = shift(RIGHT, 32 - player[0], screen) if (player[1] > 32): screen = shift(UP, player[1] - 32, screen) elif (player[1] < 32): screen = shift(DOWN, 32 - player[1], screen) coord = [player[0], player[1]] path_memory_ = np.array(path_memory, copy=True) if (action == 0): #UP if (player[1] >= 16): coord = [player[0], player[1] - 16] path_memory_[player[1] - 16:player[1], player[0]] = -1 elif (player[1] > 0): coord = [player[0], 0] path_memory_[0:player[1], player[0]] = -1 #else: # rew -= 1 elif (action == 1): #DOWN if (player[1] <= 47): coord = [player[0], player[1] + 16] path_memory_[player[1]:player[1] + 16, player[0]] = -1 elif (player[1] > 47): coord = [player[0], 63] path_memory_[player[1]:63, player[0]] = -1 #else: # rew -= 1 elif (action == 2): #LEFT if (player[0] >= 16): coord = [player[0] - 16, player[1]] path_memory_[player[1], player[0] - 16:player[0]] = -1 elif (player[0] < 16): coord = [0, player[1]] path_memory_[player[1], 0:player[0]] = -1 #else: # rew -= 1 elif (action == 3): #RIGHT if (player[0] <= 47): coord = [player[0] + 16, player[1]] path_memory_[player[1], player[0]:player[0] + 16] = -1 elif (player[0] > 47): coord = [63, player[1]] path_memory_[player[1], player[0]:63] = -1 path_memory = np.array(path_memory_) if _MOVE_SCREEN not in obs[0].observation["available_actions"]: for i in range(len(player_x)): xy = [player_x[i], player_y[i]] obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) ]) #obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] # else: # new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] new_screen = player_relative + path_memory selected = obs[0].observation["screen"][_SELECTED] player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() rew = obs[0].reward done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. 
replay_buffer.add(screen, action, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew #episode_minerals[-1] += obs[0].reward if done: obs = env.reset() player_relative = obs[0].observation["screen"][ _PLAYER_RELATIVE] screen = player_relative + path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 32): screen = shift(LEFT, player[0] - 32, screen) elif (player[0] < 32): screen = shift(RIGHT, 32 - player[0], screen) if (player[1] > 32): screen = shift(UP, player[1] - 32, screen) elif (player[1] < 32): screen = shift(DOWN, 32 - player[1], screen) # Select all marines first obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) for i in range(len(player_x)): xy = [player_x[i], player_y[i]] obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) ]) group_id = 0 group_list = [] unit_xy_list = [] for i in range(len(player_x)): if i % 4 != 0: continue if group_id > 2: break xy = [player_x[i], player_y[i]] unit_xy_list.append(xy) if (len(unit_xy_list) >= 1): for idx, xy in enumerate(unit_xy_list): if (idx == 0): obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_POINT, [[0], xy]) ]) else: obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_POINT, [[1], xy]) ]) obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_CONTROL_GROUP, [[_CONTROL_GROUP_SET], [group_id]]) ]) unit_xy_list = [] group_list.append(group_id) group_id += 1 if (len(unit_xy_list) >= 1): for idx, xy in enumerate(unit_xy_list): if (idx == 0): obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_POINT, [[0], xy]) ]) else: obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_POINT, [[1], xy]) ]) obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_CONTROL_GROUP, [[_CONTROL_GROUP_SET], [group_id]]) ]) group_list.append(group_id) group_id += 1 episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) #mean_100ep_mineral = round(np.mean(episode_minerals[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) #logger.record_tabular("mean 100 episode mineral", mean_100ep_mineral) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act)
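The marine-centering logic above repeatedly calls a `shift` helper that is defined elsewhere in the original project. The zero-padding implementation below is only a plausible sketch of what it might do (the direction constants and the function body are assumptions), included to make the re-centering of the 64x64 screen around the selected marine easier to follow.

import numpy as np

UP, DOWN, LEFT, RIGHT = 0, 1, 2, 3  # hypothetical direction codes

def shift(direction, number, matrix):
    """Shift a 2-D screen `number` cells in `direction`, filling vacated cells with zeros."""
    if number <= 0:
        return matrix
    out = np.zeros_like(matrix)
    if direction == UP:
        out[:-number, :] = matrix[number:, :]
    elif direction == DOWN:
        out[number:, :] = matrix[:-number, :]
    elif direction == LEFT:
        out[:, :-number] = matrix[:, number:]
    elif direction == RIGHT:
        out[:, number:] = matrix[:, :-number]
    return out

# Toy example: re-center a 64x64 screen so the marine at (x=50, y=10) lands on column 32.
screen = np.zeros((64, 64))
screen[10, 50] = 1
player = [50, 10]  # (x, y)
if player[0] > 32:
    screen = shift(LEFT, player[0] - 32, screen)
elif player[0] < 32:
    screen = shift(RIGHT, 32 - player[0], screen)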
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) restore_model_from_file=None, save_model_with_prefix, # this is the naming of the saved model file. Usually here we set indication of the target goal: # for example 3dof_ppo1_H. # That way we can only select which networks we can execute to the real robot. We do not have to send all files or folder. # Naming of the model file should be self explanatory. job_id=None, # this variable is used for indentifing Spearmint iteration number. It is usually set by the Spearmint iterator outdir="/tmp/rosrl/experiments/continuous/ppo1/"): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() """ Here we add a possibility to resume from a previously saved model if a model file is provided """ if restore_model_from_file: # saver = tf.train.Saver(tf.all_variables()) saver = tf.train.import_meta_graph(restore_model_from_file) saver.restore( tf.get_default_session(), tf.train.latest_checkpoint('./')) #restore_model_from_file) logger.log("Loaded model from {}".format(restore_model_from_file)) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer 
for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" if save_model_with_prefix: if job_id is not None: basePath = '/tmp/rosrl/' + str( env.__class__.__name__) + '/ppo1/' + job_id else: basePath = '/tmp/rosrl/' + str(env.__class__.__name__) + '/ppo1/' # Create the writer for TensorBoard logs summary_writer = tf.summary.FileWriter(outdir, graph=tf.get_default_graph()) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpRewSEM", np.std(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) """ Save the model at every itteration """ if save_model_with_prefix: #if np.mean(rewbuffer) > -50.0: if iters_so_far % 10 == 0: basePath = outdir + "/models/" if not os.path.exists(basePath): os.makedirs(basePath) modelF = basePath + save_model_with_prefix + "_afterIter_" + str( iters_so_far) + ".model" U.save_state(modelF) logger.log("Saved model to 
file :{}".format(modelF)) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() summary = tf.Summary(value=[ tf.Summary.Value(tag="EpRewMean", simple_value=np.mean(rewbuffer)) ]) summary_writer.add_summary(summary, timesteps_so_far)
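The `pol_surr` term built above is the standard PPO clipped surrogate. A small NumPy illustration of the same quantity, detached from TensorFlow, may help when reading the loss construction; the numbers are arbitrary.

import numpy as np

def clipped_surrogate(ratio, adv, clip_param=0.2):
    # L_CLIP = E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)]; the training loss is its negative
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return np.minimum(surr1, surr2).mean()

ratio = np.array([0.5, 1.0, 1.5])   # pi_new / pi_old for three sample actions
adv = np.array([1.0, -1.0, 2.0])    # standardized advantage estimates
print(clipped_surrogate(ratio, adv))  # the 1.5 ratio with positive advantage is clipped at 1.2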
def learn(env, q_func, num_actions=4, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def agent(): """Run the agent, connecting to a (remote) host started independently.""" agent_module, agent_name = FLAGS.agent.rsplit(".", 1) agent_cls = getattr(importlib.import_module(agent_module), agent_name) with lan_sc2_env.LanSC2Env( host=FLAGS.host, config_port=FLAGS.config_port, race=sc2_env.Race[FLAGS.agent_race], step_mul=FLAGS.step_mul, realtime=FLAGS.realtime, agent_interface_format=sc2_env.parse_agent_interface_format( feature_screen=FLAGS.feature_screen_size, feature_minimap=FLAGS.feature_minimap_size, rgb_screen=FLAGS.rgb_screen_size, rgb_minimap=FLAGS.rgb_minimap_size, action_space=FLAGS.action_space, use_unit_counts=True, use_camera_position=True, show_cloaked=True, show_burrowed_shadows=True, show_placeholders=True, send_observation_proto=True, crop_to_playable_area=True, raw_crop_to_playable_area=True, allow_cheating_layers=True, add_cargo_to_units=True, use_feature_units=FLAGS.use_feature_units), visualize=FLAGS.render) as env: agents = [agent_cls()] logging.info("Connected, starting run_loop.") try: run_loop.run_loop(agents, env) except lan_sc2_env.RestartError: pass logging.info("Done.") def make_obs_ph(name): return BatchInput((1, 16, 16), name=name) act_x, train_x, update_target_x, debug_x = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq_x") act_y, train_y, update_target_y, debug_y = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq_y") act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer_x = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) replay_buffer_y = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule_x = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer_x = ReplayBuffer(buffer_size) replay_buffer_y = ReplayBuffer(buffer_size) beta_schedule_x = None beta_schedule_y = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. 
U.initialize() update_target_x() update_target_y() #time.sleep(30) # Stagger startups, otherwise tshey seem to conflict somehow episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() action_blacklist = ['0'] #function_id = numpy.random.choice(obs[0].observation.available_actions) #step forward a noop so units and prob appear obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] screen = (player_relative == _PLAYER_NEUTRAL).astype(int) #+ path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [0, 0] reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join("model/", "nexus_wars") print(model_file) for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. - exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action_x = act_x(np.expand_dims(np.array(screen)[None], axis=0), update_eps=update_eps, **kwargs)[0] action_y = act_y(np.expand_dims(np.array(screen)[None], axis=0), update_eps=update_eps, **kwargs)[0] reset = False coord = [player[0], player[1]] rew = 0 coord = [action_x, action_y] observation_spec = env.observation_spec() action_spec = env.action_spec() #get available actions avail_actions_now = obs[0].observation.available_actions #ready for actions yet? 4 actions = nothing to do yet if len(avail_actions_now) > 5: #game state is ready for random action commands, get them and args function_id = numpy.random.choice( obs[0].observation.available_actions) args = [[numpy.random.randint(0, size) for size in arg.sizes] for arg in action_spec[0].functions[function_id].args] #issue random command and arg obs = env.step( actions=[sc2_actions.FunctionCall(function_id, args)]) #obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) else: #step no matter wat obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) player_relative = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] new_screen = (player_relative == _PLAYER_NEUTRAL).astype(int) player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() # resolve the cannot convert float NaN to integer issue if len(player_x) == 0: player_x = np.array([0]) if len(player_y) == 0: player_y = np.array([0]) player = [int(player_x.mean()), int(player_y.mean())] rew = obs[0].reward done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. 
replay_buffer_x.add(screen, action_x, rew, new_screen, float(done)) replay_buffer_y.add(screen, action_y, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew reward = episode_rewards[-1] if done: obs = env.reset() player_relative = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] screen = (player_relative == _PLAYER_NEUTRAL).astype(int) player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] # Select all marines first env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) episode_rewards.append(0.0) #episode_minerals.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience_x = replay_buffer_x.sample( batch_size, beta=beta_schedule_x.value(t)) (obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x, weights_x, batch_idxes_x) = experience_x experience_y = replay_buffer_y.sample( batch_size, beta=beta_schedule_y.value(t)) (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y else: obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x = replay_buffer_x.sample( batch_size) weights_x, batch_idxes_x = np.ones_like(rewards_x), None obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample( batch_size) weights_y, batch_idxes_y = np.ones_like(rewards_y), None td_errors_x = train_x(np.expand_dims(obses_t_x, axis=1), actions_x, rewards_x, np.expand_dims(obses_tp1_x, axis=1), dones_x, weights_x) td_errors_y = train_y(np.expand_dims(obses_t_y, axis=1), actions_y, rewards_y, np.expand_dims(obses_tp1_y, axis=1), dones_y, weights_y) if prioritized_replay: new_priorities_x = np.abs( td_errors_x) + prioritized_replay_eps new_priorities_y = np.abs( td_errors_y) + prioritized_replay_eps replay_buffer_x.update_priorities(batch_idxes_x, new_priorities_x) replay_buffer_y.update_priorities(batch_idxes_y, new_priorities_y) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target_x() update_target_y() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("reward", reward) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act_x), ActWrapper(act_y)
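The learner above keeps two independent Q-networks, one per screen axis, and the training loop interprets the pair `(action_x, action_y)` as a coordinate. The snippet below is a hedged sketch (not part of the original code) of how the two returned act functions could be combined greedily at evaluation time; the observation shaping mirrors the training loop.

import numpy as np

def pick_coordinate(act_x, act_y, screen):
    # shape (1, 1, H, W), matching how the training loop feeds the act functions
    obs = np.expand_dims(np.array(screen)[None], axis=0)
    x = act_x(obs, update_eps=0.0)[0]  # greedy: epsilon set to zero
    y = act_y(obs, update_eps=0.0)[0]
    return [int(x), int(y)]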
def learn( env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts asyn 之下该参数修改为在replay_buffer的数据大小下开始? gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph def make_obs_ph(name): return ObservationInput(env.observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. 探索率 exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td # 在这里我们如果指定了checkpoint_path,则模型在训练时会保存当前网络状态至指定路径,不过其使用的是tensorflow的Saver() # 程序中断后,再次运行前会从保存的状态开始恢复训练,没有保存强化学习部分的参数,需要改动 # 但应当注意的是,需要保存的内容包括replay_buffer model_file = os.path.join(td, "model_tn") # 将两端路径名/文件名 合在一起 model_saved = False if tf.train.latest_checkpoint(td) is not None: load_state(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True # 在最大步数内训练 for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True # 这里选择动作 action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. # 这里直接将observation放入了buffer,DQN论文中则是将序列作为状态,也许是在atari_wrappers中已经做好了相关转换 replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs env.render() # 将即时回报加入回报序列,如果多actor的话,这里应该怎么修改?相当于查看一下整体的mean_reward?或是在每个actor上单独计算(better) episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True # 经过learning_starts步后开始训练网络(先在buffer中存入一定量数据) # 每经过train_freq步进行一次梯度下降 if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) # note how beta is used here (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) # np.ones_like() : Return an array of ones with the same shape and type as a given array. weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # print(td_errors) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() # mean reward over recent episodes (note: an episode is a full rollout, not a single step) # mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) mean_100ep_reward = round(np.mean(episode_rewards[-21:-1]), 2) num_episodes = len(episode_rewards) # below: log training progress and save the network parameters if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 20 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() # the model is saved only every checkpoint_freq steps and only when the mean reward has increased if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward # training is finished at this point # after training, restore the best model saved along the way if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) load_state(model_file) # return an ActWrapper, usable e.g. for act.save("cartpole_model.pkl") return act
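Because the variant above loads the latest checkpoint found in `checkpoint_path` before training, pointing successive runs at the same directory resumes training where the previous run stopped. A hedged usage sketch follows; the environment name and the `deepq.models.mlp` builder are illustrative assumptions, not dictated by the code above.

import gym
from baselines import deepq

env = gym.make("CartPole-v0")
model = deepq.models.mlp([64])
act = learn(
    env,
    q_func=model,
    lr=5e-4,
    max_timesteps=100000,
    checkpoint_freq=10000,
    checkpoint_path="./cartpole_ckpt",  # reuse across runs to resume from the saved state
    print_freq=10,
)
act.save("cartpole_model.pkl")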
def learn( env, actor_deque, action_pipes, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None): """Train a deepq model. Parameters ------- env: gym.Env environment to train on actor_deque: structure is --> (ac_num, obs, action, new_obs, rew, done) action_pipes: structure is --> pipes_conn1 = [pipes[i][1] for i in range(0, 2)] use --> action_pipes[actor_num].send(s) default is str 至于为什么一处为deque,一处为pipe. well, actor需要接受action来执行下一步,此前为阻塞状态. 而trainer是响应式的,无论哪个actor有数据都要进行计算,使用deque.empty()很方便, q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts asyn 之下该参数修改为在replay_buffer的数据大小下开始? gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() def make_obs_ph(name): return ObservationInput(env.observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. 探索率 exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None # obs = env.reset() reset = True done = None end = 100 # 传输一个非正常动作,结束训练 with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model_tn") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_state(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True # 在最大步数内训练 t = 0 while t <= max_timesteps: if callback is not None: if callback(locals(), globals()): break if actor_deque.empty() is True: pass # time.sleep() else: actor_information = actor_deque.get() if actor_information[2] is None: # 表示其为一轮开始 ac_num = actor_information[0] new_obs = actor_information[3] done = False # important # print("ac_num "+str(ac_num)+" start") else: ac_num = actor_information[0] obs = actor_information[1] action = actor_information[2] new_obs = actor_information[3] rew = actor_information[4] done = actor_information[5] replay_buffer.add(obs, action, rew, new_obs, float(done)) if done: # done 与start是不会共存的 # obs = env.reset() # episode_rewards.append(0.0) reset = True else: # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(new_obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False action_pipes[ac_num-1].send(env_action) # 这里ac_num与pipe位置没有对齐 # 经过learning_starts步后开始训练网络(先在buffer中存入一定量数据) # 每经过train_freq步进行一次梯度下降 if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) # 注意beta的用法 (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) # np.ones_like() : Return an array of ones with the same shape and type as a given array. 
weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() # below: log training progress and save the network parameters if print_freq is not None and t % print_freq == 0: logger.record_tabular("total_steps", t) # logger.record_tabular("episodes", num_episodes) # logger.record_tabular("mean 20 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() # save the model every checkpoint_freq steps (the mean-reward check is not used in this variant) if checkpoint_freq is not None and t > learning_starts and t % checkpoint_freq == 0: save_state(model_file) model_saved = True t += 1 # training is finished at this point # end = True for i in range(0, len(action_pipes)): action_pipes[i].send(end) # end = 100 # after training, restore the best saved model (disabled here) # if model_saved: # if print_freq is not None: # logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) # load_state(model_file) # return an ActWrapper, usable e.g. for act.save("cartpole_model.pkl") return act
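The asynchronous trainer above only documents the actor side of its protocol in the docstring: actors push (ac_num, obs, action, new_obs, rew, done) tuples into the shared queue and block on their pipe for the next action, and the trainer broadcasts the sentinel value 100 when training ends. The sketch below is an assumed actor loop written to match that description; it is not code from the original project.

def actor_loop(ac_num, env, actor_deque, pipe, end_sentinel=100):
    obs = env.reset()
    # announce the start of an episode: the action slot is None by convention,
    # which the trainer interprets as "send me an action for this fresh observation"
    actor_deque.put((ac_num, None, None, obs, 0.0, False))
    while True:
        action = pipe.recv()            # blocks until the trainer answers
        if action == end_sentinel:      # trainer signals end of training
            break
        new_obs, rew, done, _ = env.step(action)
        actor_deque.put((ac_num, obs, action, new_obs, rew, done))
        obs = new_obs
        if done:
            # the trainer does not reply to a terminal transition, so start a new episode
            obs = env.reset()
            actor_deque.put((ac_num, None, None, obs, 0.0, False))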
def train_DISCARL(env_id, num_timesteps, seed, render, max_steps_episode, clip_action=False, ckpt_dir=None, restore_dir=None, n=1.0): def policy_pro(name, ob_space, ac_space): # return MlpPolicy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, # hid_size=64, num_hid_layers=2) return MlpPolicy_Pro.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, tau=3e-4, hid_size=64, num_hid_layers=4) # inverted double pendulum: 4 hidden layers def policy_adv(name, ob_space, ac_space): # return MlpPolicy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, # hid_size=64, num_hid_layers=2) return MlpPolicy_Adv.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, tau=3e-4, hid_size=64, num_hid_layers=4) # inverted double pendulum: 4 hidden layers env = gym.make(env_id) env.update_adversary(n) set_global_seeds(seed) env.seed(seed) save_timestep_period = num_timesteps if ckpt_dir: print('logging to ' + ckpt_dir) pro_pi, rew, timesteps_so_far, len_mean = PPO_RARL_DISCARL2_v5.learn( env, policy_pro, policy_adv, max_timesteps=num_timesteps, timesteps_per_batch=2048, clip_param=0.02, entcoeff=0.0, optim_epochs=10, optim_stepsize=5e-4, optim_batchsize=64, max_steps_episode=max_steps_episode, gamma=0.99, lam=0.95, lr_l=5e-4, lr_a=5e-4, schedule='linear', clip_action=clip_action, restore_dir=restore_dir, ckpt_dir=None, save_timestep_period=save_timestep_period, ) if ckpt_dir: # print(model_path) U.save_state(ckpt_dir) env.close() return pro_pi, len_mean, timesteps_so_far
def learn( env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, epoch_steps=20000, gpu_memory=1.0, double_q=False, scope="deepq", directory='.', nb_test_steps=10000, ): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model config = tf.ConfigProto(allow_soft_placement=True) config.gpu_options.per_process_gpu_memory_fraction = gpu_memory config.gpu_options.polling_inactive_delay_msecs = 25 sess = tf.Session(config=config) sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph def make_obs_ph(name): return ObservationInput(env.observation_space, name=name) act, act_greedy, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise, double_q=bool(double_q), scope=scope) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True #recording records = {'loss': [], 'online_reward': [], 'test_reward': []} with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_state(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True ep_losses, ep_means, losses = [], [], [] print("===== LEARNING STARTS =====") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. timelimit_env = env while (not hasattr(timelimit_env, '_elapsed_steps')): timelimit_env = timelimit_env.env if timelimit_env._elapsed_steps < timelimit_env._max_episode_steps: # Store transition in the replay buffer. 
replay_buffer.add(obs, action, rew, new_obs, float(done)) else: replay_buffer.add(obs, action, rew, new_obs, float(not done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if losses: ep_losses.append(np.mean(losses)) losses = [] if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) losses.append(td_errors) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if (t + 1) % epoch_steps == 0 and (t + 1) > learning_starts: test_reward = test(env, act_greedy, nb_test_steps=nb_test_steps) records['test_reward'].append(test_reward) records['loss'].append(np.mean(ep_losses)) records['online_reward'].append( round(np.mean(episode_rewards[-101:-1]), 1)) pickle.dump(records, open(os.path.join(directory, "records.pkl"), "wb")) print("==== EPOCH %d ===" % ((t + 1) / epoch_steps)) print(tabulate([[k, v[-1]] for (k, v) in records.items()])) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and (t + 1) > learning_starts and num_episodes > 100 and (t + 1) % checkpoint_freq == 0): print("Saving model to model_%d.pkl" % (t + 1)) act.save( os.path.join(directory, "model_" + str(t + 1) + ".pkl")) if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_state(model_file) return act, records
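The buffer-insertion branch above distinguishes a genuine terminal state from an episode cut off by gym's TimeLimit wrapper, so that time-limit cutoffs do not stop value bootstrapping in the Q-target. A standalone version of that check, mirroring the branch above, is:

def terminal_flag_for_buffer(done, elapsed_steps, max_episode_steps):
    """Return the `done` float to store in the replay buffer."""
    if elapsed_steps < max_episode_steps:
        return float(done)       # termination (or not) inside the time limit is taken at face value
    return float(not done)       # the episode was cut off by the time limit, so keep bootstrapping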
def save_model(dict_state): """Save the TensorFlow session to a fixed checkpoint path and pickle the training-state dict alongside it.""" save_state("saved_model/model.ckpt") relatively_safe_pickle_dump(dict_state, "saved_model/model_state.pkl.zip", compression=True)
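A matching restore helper is a natural counterpart to save_model above. The sketch below is an assumption, not code from the original project, and presumes a baselines version that still provides `load_state` in tf_util and `pickle_load` (the companion to relatively_safe_pickle_dump) in misc_util.

from baselines.common.misc_util import pickle_load
from baselines.common.tf_util import load_state

def load_model():
    # restore the TensorFlow variables, then unpickle the compressed training-state dict
    load_state("saved_model/model.ckpt")
    return pickle_load("saved_model/model_state.pkl.zip", compression=True)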
def save(self, path): """Save the current TensorFlow session state to `path`.""" save_state(path)
def learn( update_flag, end_train_flag, total_step, net_list, net_list_lock, mem_queue, env, q_func, lr=5e-4, max_timesteps=1000000, buffer_size=100000, batch_size=32, checkpoint_freq=10000, checkpoint_path=None, learning_starts=5000, gamma=1.0, target_network_update_freq=500, # asyn中 trainer要比正常运行快,这些参数都有待商议 actor_network_update_freq=500, # 最好比actor那边小点(到也没必要,trainer这边运行速度肯定比actor快得多) prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability batch_size: int size of a batched sampled from replay buffer for training checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts asyn 之下该参数修改为在replay_buffer的数据大小下开始? gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model # sess = tf.Session() config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction = 0.2 # 占用GPU20%的显存 sess = tf.Session(config=config) # sess = U.single_threaded_session() # 限制使用单核心 sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph def make_obs_ph(name): return ObservationInput(env.observation_space, name=name) act, train, update_target, init_actor_qfunc, update_actor_qfunc, debug = build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer replay_buffer = MemBufferThread( mem_queue, max_timesteps=max_timesteps, buffer_size=buffer_size, batch_size=batch_size, prioritized_replay=prioritized_replay, prioritized_replay_alpha=prioritized_replay_alpha, prioritized_replay_beta0=prioritized_replay_beta0, prioritized_replay_beta_iters=prioritized_replay_beta_iters, prioritized_replay_eps=prioritized_replay_eps) replay_buffer.setDaemon(True) # 设置子线程与主线程一起退出,需在start之前 replay_buffer.start() # Initialize the parameters and copy them to the target network. U.initialize() update_target() init_actor_qfunc(sess=sess, net_list=net_list) # 初始化结束后,先为actor传递一次网络 # update_actor_qfunc(sess=sess, net_list=net_list, net_list_lock=net_list_lock) update_flag.value += 1 # 设置标志位,允许各actor复制初始网络 with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model_tn") # 将两端路径名/文件名 合在一起 model_saved = False if tf.train.latest_checkpoint(td) is not None: load_state(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True t = 0 # 在最大步数内训练, infinite # for t in range(max_timesteps): while True: if callback is not None: if callback(locals(), globals()): break # 一直等待replay_buffer的数据足够多,才开始训练网络 while replay_buffer.__len__() < learning_starts: # print(replay_buffer.__len__()) time.sleep(1) # Minimize the error in Bellman's equation on a batch sampled from replay buffer. obses_t, actions, rewards, obses_tp1, dones, weights = replay_buffer.sample( total_step.value) td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # print(td_errors) if prioritized_replay: replay_buffer.update_priorities(td_errors) if t % target_network_update_freq == 0: # Update target network periodically. 
update_target() # update the actor network if t % actor_network_update_freq == 0: update_actor_qfunc(sess=sess, net_list=net_list, net_list_lock=net_list_lock) # time.sleep(0.05) # should not be needed # save the model every checkpoint_freq iterations if (checkpoint_freq is not None and t % checkpoint_freq == 0): logger.log("Saving model") save_state(model_file) # TensorFlow-style checkpoint, used to resume training model_saved = True act.save("n_robot_model.pkl") # saves only the act function, handy for inspecting results # act.save("cartpole_model.pkl") # saves only the act function, handy for inspecting results # act.save("MountainCar_model.pkl") t += 1 # # 4 is the number of actors; max_timesteps is the per-actor step limit, so the trainer would stop when the actors finish (did not work well) # if (total_step.value+4)/4 + 1000 >= max_timesteps: # break if end_train_flag.value == 4: # 4 is the number of actors break # training is finished here # return an ActWrapper, e.g. for act.save("cartpole_model.pkl") or other uses print("end training") if model_saved: # logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) logger.log("Restored model") load_state(model_file) # replay_buffer.join() return act
def save(self, save_path): tf_util.save_state(save_path, sess=self.sess)
def learn(env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, callback=None): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput(env.observation_space.shape, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, double_q=True, grad_norm_clipping=10 ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) U.load_state(model_file) return ActWrapper(act, act_params)
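# A minimal usage sketch for the learn() above (the num_cpu variant that returns an ActWrapper).
# The environment name, network width and output file are illustrative, and deepq.models.mlp is
# assumed to be available as in the baselines deepq examples.
import gym
from baselines import deepq

def example_train_cartpole():
    env = gym.make("CartPole-v0")
    model = deepq.models.mlp([64])                 # small fully connected Q-network
    act = learn(env, q_func=model, lr=1e-3, max_timesteps=100000, buffer_size=50000,
                exploration_fraction=0.1, exploration_final_eps=0.02,
                print_freq=10, num_cpu=4)
    act.save("cartpole_model.pkl")                 # ActWrapper.save pickles the act function
    env.close()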
def learn(env, policy_func, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) save_name=None, save_per_acts=3, reload_name=None ): # Setup losses and stuff # ---------------------------------------- ob_space = env.sensor_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() if reload_name: saver = tf.train.Saver() saver.restore(tf.get_default_session(), reload_name) print("Loaded model successfully.") # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************"%iters_so_far) seg = seg_gen.__next__() 
add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses,_,_ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_"+name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank()==0: logger.dump_tabular() #print(iters_so_far, save_per_acts) if save_name and (iters_so_far % save_per_acts == 0): base_path = os.path.dirname(os.path.abspath(__file__)) print(base_path) out_name = os.path.join(base_path, 'models', save_name + '_' + str(iters_so_far) + ".model") U.save_state(out_name) print ("Saved model successfully.")
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, save_model_with_prefix=None, restore_model_from_file=None, outdir="/tmp/rosrl/experiments/continuous/acktr/"): obfilter = ZFilter(env.observation_space.shape) # Risto change max_pathlength = env.max_episode_steps stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async_=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() """ Here we add a possibility to resume from a previously saved model if a model file is provided """ if restore_model_from_file: saver = tf.train.Saver() saver.restore(tf.get_default_session(), restore_model_from_file) logger.log("Loaded model from {}".format(restore_model_from_file)) # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 if save_model_with_prefix: # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/' summary_writer = tf.summary.FileWriter(outdir, graph=tf.get_default_graph()) while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() elif kl < desired_kl / 2: logger.log("kl too low") tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() else: logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", 
np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() """ Save the model at every iteration """ if save_model_with_prefix: if np.mean([path["reward"].sum() for path in paths]) > -50.0: # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/' summary = tf.Summary(value=[ tf.Summary.Value(tag="EpRewMean", simple_value=np.mean([ path["reward"].sum() for path in paths ])) ]) summary_writer.add_summary(summary, i) if not os.path.exists(outdir): os.makedirs(outdir) modelF = outdir + '/' + save_model_with_prefix + "_afterIter_" + str( i) + ".model" U.save_state(modelF) logger.log("Saved model to file: {}".format(modelF)) i += 1 coord.request_stop() coord.join(enqueue_threads)
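# A hedged setup sketch for the ACKTR learn() above, mirroring the baselines continuous-control
# run script: GaussianMlpPolicy and NeuralNetValueFunction come from baselines.acktr, while the
# save/restore arguments added in this variant are shown with illustrative values. The target
# environment is assumed to expose max_episode_steps directly (as the ROS-style envs this
# variant was written for do); the line copying it from env.spec is a workaround for plain Gym envs.
import gym
import tensorflow as tf
from baselines.acktr.policies import GaussianMlpPolicy
from baselines.acktr.value_functions import NeuralNetValueFunction

def example_run(num_timesteps=1000000):
    env = gym.make("Reacher-v2")                              # illustrative MuJoCo task
    env.max_episode_steps = env.spec.max_episode_steps        # workaround, see lead-in comment
    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)
        learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
              timesteps_per_batch=2500, desired_kl=0.002, num_timesteps=num_timesteps,
              save_model_with_prefix="acktr_reacher",         # checkpoints: <outdir>/<prefix>_afterIter_<i>.model
              restore_model_from_file=None,                   # or a checkpoint path to resume training
              outdir="/tmp/rosrl/experiments/continuous/acktr/")
    env.close()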
def learn(env, q_func, beta1=0.9, beta2=0.999, epsilon=1e-8, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, exploration_schedule=None, start_lr=5e-4, end_lr=5e-4, start_step=0, end_step=1, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, model_directory=None, lamda=0.1): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer beta1: float beta1 parameter for adam beta2: float beta2 parameter for adam epsilon: float epsilon parameter for adam max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability exploration_schedule: Schedule a schedule for exploration chance train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return U.BatchInput(observation_space_shape, name=name) global_step = tf.Variable(0, trainable=False) lr = interpolated_decay(start_lr, end_lr, global_step, start_step, end_step) act, train, update_target, debug = multiheaded_build_graph.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr, beta1=beta1, beta2=beta2, epsilon=epsilon), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise, global_step=global_step, lamda=lamda, ) tf.summary.FileWriter(logger.get_dir(), graph_def=sess.graph_def) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. if exploration_schedule is None: exploration = LinearSchedule(schedule_timesteps=int( exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) else: exploration = exploration_schedule # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False if model_directory is None: model_directory = pathlib.Path(td) model_file = str(model_directory / "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] if isinstance(env.action_space, gym.spaces.MultiBinary): env_action = np.zeros(env.action_space.n) env_action[action] = 1 else: env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) act.save(str(model_directory / "act_model.pkl")) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return act
class ActWrapper(object): def __init__(self, act): self._act = act #self._act_params = act_params @staticmethod def load(path, act_params, num_cpu=16): with open(path, "rb") as f: model_data = dill.load(f) act = deepq.build_act(**act_params) sess = U.make_session(num_cpu=num_cpu) sess.__enter__() with tempfile.TemporaryDirectory() as td: arc_path = os.path.join(td, "packed.zip") with open(arc_path, "wb") as f: f.write(model_data) zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td) U.load_state(os.path.join(td, "model")) return ActWrapper(act) def __call__(self, *args, **kwargs): return self._act(*args, **kwargs) def save(self, path): # Save model to a pickle located at `path` with tempfile.TemporaryDirectory() as td: U.save_state(os.path.join(td, "model")) arc_name = os.path.join(td, "packed.zip") with zipfile.ZipFile(arc_name, 'w') as zipf: for root, dirs, files in os.walk(td): for fname in files: file_path = os.path.join(root, fname) if file_path != arc_name: zipf.write(file_path, os.path.relpath(file_path, td)) with open(arc_name, "rb") as f: model_data = f.read() with open(path, "wb") as f: dill.dump((model_data), f) def load(path, act_params, num_cpu=16): """ Load act function that was returned by learn function. Parameters ---------- path: str path to the act function pickle num_cpu: int number of cpus to use for executing the policy Returns ------- act: ActWrapper function that takes a batch of observations and returns actions. """ return ActWrapper.load(path, num_cpu=num_cpu, act_params=act_params) def learn( env, q_func, num_actions=4, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None ): # Train a deepq model. # Parameters # ------- # env: pysc2.env.SC2Env # environment to train on # q_func: (tf.Variable, int, str, bool) -> tf.Variable # the model that takes the following inputs: # observation_in: object # the output of observation placeholder # num_actions: int # number of actions # scope: str # reuse: bool # should be passed to outer variable scope # and returns a tensor of shape (batch_size, num_actions) with values of every action. # lr: float # learning rate for adam optimizer # max_timesteps: int # number of env steps to optimizer for # buffer_size: int # size of the replay buffer # exploration_fraction: float # fraction of entire training period over which the exploration rate is annealed # exploration_final_eps: float # final value of random action probability # train_freq: int # update the model every `train_freq` steps. # set to None to disable printing # batch_size: int # size of a batched sampled from replay buffer for training # print_freq: int # how often to print out training progress # set to None to disable printing # checkpoint_freq: int # how often to save the model. This is so that the best version is restored # at the end of the training. If you do not wish to restore the best version at # the end of the training set this variable to None. 
# learning_starts: int # how many steps of the model to collect transitions for before learning starts # gamma: float # discount factor # target_network_update_freq: int # update the target network every `target_network_update_freq` steps. # prioritized_replay: True # if True prioritized replay buffer will be used. # prioritized_replay_alpha: float # alpha parameter for prioritized replay buffer # prioritized_replay_beta0: float # initial value of beta for prioritized replay buffer # prioritized_replay_beta_iters: int # number of iterations over which beta will be annealed from initial value # to 1.0. If set to None equals to max_timesteps. # prioritized_replay_eps: float # epsilon to add to the TD errors when updating priorities. # num_cpu: int # number of cpus to use for training # callback: (locals, globals) -> None # function called at every steps with state of the algorithm. # If callback returns true training stops. # Returns # ------- # act: ActWrapper # Wrapper over act function. Adds ability to save it and load it. # See header of baselines/deepq/categorical.py for details on the act function. # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((64, 64), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10 ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] # episode_minerals = [0.0] saved_mean_reward = None path_memory = np.zeros((64,64)) obs = env.reset() # Select all marines first obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = player_relative + path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if(player[0]>32): screen=shift(LEFT, player[0]-32, screen) elif(player[0]<32): screen=shift(RIGHT, 32- player[0], screen) if(player[1]>32): screen = shift(UP, player[1]-32, screen) elif(player[1]<32): screen = shift(DOWN, 32- player[1], screen) reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. 
if param_noise_threshold >= 0.: update_param_noise_threshold = update_param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] reset = False coord = [player[0], player[1]] rew = 0 path_memory_ = np.array(path_memory, copy=True) if(action == 0): #UP if(player[1] >= 16): coord = [player[0], player[1] - 16] path_memory_[player[1] - 16 : player[1], player[0]] = -1 elif(player[1] > 0): coord = [player[0], 0] path_memory_[0 : player[1], player[0]] = -1 #else: # rew -= 1 elif(action == 1): #DOWN if(player[1] <= 47): coord = [player[0], player[1] + 16] path_memory_[player[1] : player[1] + 16, player[0]] = -1 elif(player[1] > 47): coord = [player[0], 63] path_memory_[player[1] : 63, player[0]] = -1 #else: # rew -=1 elif(action == 2): # LEFT if(player[0] >= 16): coord = [player[0] - 16, player[1]] path_memory_[player[1], player[0] - 16 : player[0]] = -1 elif(player[0] < 16): coord = [0, player[1]] path_memory_[player[1], 0 : player[0]] = -1 #else: # rew -= 1 elif(action == 3): #RIGHT if(player[0] <= 47): coord = [player[0] + 16, player[1]] path_memory_[player[1], player[0] : player[0] + 16] = -1 elif(player[0] > 47): coord = [63, player[1]] path_memory_[player[1], player[0] : 63] = -1 #else: # rew -= 1 #else: #Cannot move, give minus reward # # if(path_memory[coord[1],coord[0]] !=0): # rew -= 0.5 path_memory = np.array(path_memory_) #print("action : %s Coord : %s" % (action, coord)) if _MOVE_SCREEN not in obs[0].observation["available_actions"]: obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) new_action = [sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])] # else: # new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] new_screen = player_relative + path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if(player[0]>32): new_screen = shift(LEFT, player[0]-32, new_screen) elif(player[0]<32): new_screen = shift(RIGHT, 32 - player[0], new_screen) if(player[1]>32): new_screen = shift(UP, player[1]-32, new_screen) elif(player[1]<32): new_screen = shift(DOWN, 32 - player[1], new_screen) rew = obs[0].reward done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. 
replay_buffer.add(screen, action, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew #episode_minerals[-1] += obs[0].reward if done: obs = env.reset() player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = player_relative + path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if(player[0]>32): screen = shift(LEFT, player[0]-32, screen) elif(player[0]<32): screen = shift(RIGHT, 32 - player[0], screen) if(player[1]>32): screen = shift(UP, player[1]-32, screen) elif(player[1]<32): screen = shift(DOWN, 32 - player[1], screen) # Select all marines first env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) episode_rewards.append(0.0) #episode_minerals.append(0.0) path_memory = np.zeros((64,64)) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) #mean_100ep_mineral = round(np.mean(episode_minerals[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) #logger.record_tabular("mean 100 episode mineral", mean_100ep_mineral) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_req is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) U.load_state(model_file) return ActWrapper(act) def intToCoordinate(num, size=64): if size !=64: num = num * size * size // 4096 y = num // size x = num - size * y return [x, y] UP, DOWN, LEFT, RIGHT = 'up', 'down', 'left', 'right' def shift(direction, number, matrix): ''' shift given 2D matrix in-place the given number of rows or columns in the specified (UP, DOWN, LEFT, RIGHT) direction and return it ''' if direction in (UP): matrix = np.roll(matrix, -number, axis=0) matrix[number:,:] = -2 return matrix elif direction in (DOWN): matrix = np.roll(matrix, number, axis=0) matrix[:number,:] = -2 return matrix elif direction in (LEFT): matrix = np.roll(matrix, -number, axis=1) matrix[:,number:] = -2 return matrix elif direction in (RIGHT): matrix = np.roll(matrix, number, axis=1) matrix[:,:number] = -2 return matrix 
else: return matrix
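# Tiny self-contained check of intToCoordinate() above: it maps a flat screen index back to an
# [x, y] pair; for screen sizes other than 64 the index is first rescaled from the 4096-cell grid.
assert intToCoordinate(0) == [0, 0]
assert intToCoordinate(65) == [1, 1]              # second row, second column of the 64x64 screen
assert intToCoordinate(4095) == [63, 63]          # last cell of the default grid
assert intToCoordinate(100, size=32) == [25, 0]   # 100 * 32 * 32 // 4096 == 25 -> x=25, y=0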
def learn(env, q_func, num_actions=4, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((64, 64), name=name) act_x, train_x, update_target_x, debug_x = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq_x") act_y, train_y, update_target_y, debug_y = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq_y") act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer_x = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) replay_buffer_y = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule_x = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer_x = ReplayBuffer(buffer_size) replay_buffer_y = ReplayBuffer(buffer_size) beta_schedule_x = None beta_schedule_y = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target_x() update_target_y() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # Select all marines first obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = player_relative #+ path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join("model/", "mineral_shards") print(model_file) for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
- exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action_x = act_x(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] reset = False coord = [player[0], player[1]] rew = 0 coord = [action_x, action_y] if _MOVE_SCREEN not in obs[0].observation["available_actions"]: obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] # else: # new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] new_screen = player_relative #+ path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] rew = obs[0].reward done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. replay_buffer_x.add(screen, action_x, rew, new_screen, float(done)) replay_buffer_y.add(screen, action_y, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew reward = episode_rewards[-1] if done: obs = env.reset() player_relative = obs[0].observation["screen"][ _PLAYER_RELATIVE] screen = player_relative #+ path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] # Select all marines first env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) episode_rewards.append(0.0) #episode_minerals.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience_x = replay_buffer_x.sample( batch_size, beta=beta_schedule_x.value(t)) (obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x, weights_x, batch_idxes_x) = experience_x experience_y = replay_buffer_y.sample( batch_size, beta=beta_schedule_y.value(t)) (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y else: obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x = replay_buffer_x.sample( batch_size) weights_x, batch_idxes_x = np.ones_like(rewards_x), None obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample( batch_size) weights_y, batch_idxes_y = np.ones_like(rewards_y), None td_errors_x = train_x(obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x, weights_x) td_errors_y = train_x(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y) if prioritized_replay: new_priorities_x = np.abs( td_errors_x) + prioritized_replay_eps new_priorities_y = np.abs( td_errors_y) + prioritized_replay_eps replay_buffer_x.update_priorities(batch_idxes_x, new_priorities_x) replay_buffer_y.update_priorities(batch_idxes_y, new_priorities_y) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target_x() update_target_y() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("reward", reward) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act_x), ActWrapper(act_y)
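# Hedged inference sketch for the two-head variant above: learn() returns one ActWrapper per
# coordinate axis, and their greedy predictions together form the move target fed to
# _MOVE_SCREEN. The stochastic=False keyword is assumed to be supported by the underlying
# build_act function, as in baselines deepq; num_actions is assumed to equal the screen width
# so each head outputs a pixel coordinate directly.
import numpy as np

def greedy_move_target(act_x, act_y, screen):
    # screen: the same 64x64 preprocessed array fed to the networks during training
    x = act_x(np.array(screen)[None], stochastic=False)[0]
    y = act_y(np.array(screen)[None], stochastic=False)[0]
    return [int(x), int(y)]                       # coordinate passed to _MOVE_SCREEN above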
def learn(env, q_func, num_actions=4, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((32, 32), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq") # # act_y, train_y, update_target_y, debug_y = deepq.build_train( # make_obs_ph=make_obs_ph, # q_func=q_func, # num_actions=num_actions, # optimizer=tf.train.AdamOptimizer(learning_rate=lr), # gamma=gamma, # grad_norm_clipping=10, # scope="deepq_y" # ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) # replay_buffer_y = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) # beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters, # initial_p=prioritized_replay_beta0, # final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) # replay_buffer_y = ReplayBuffer(buffer_size) beta_schedule = None # beta_schedule_y = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule( schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() # update_target_y() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # Select all marines first obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = (player_relative == _PLAYER_NEUTRAL).astype(int) #+ path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 16): screen = shift(LEFT, player[0] - 16, screen) elif (player[0] < 16): screen = shift(RIGHT, 16 - player[0], screen) if (player[1] > 16): screen = shift(UP, player[1] - 16, screen) elif (player[1] < 16): screen = shift(DOWN, 16 - player[1], screen) reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join("model/", "mineral_shards") print(model_file) for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
- exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act( np.array(screen)[None], update_eps=update_eps, **kwargs)[0] # action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] reset = False coord = [player[0], player[1]] rew = 0 if (action == 0): #UP if (player[1] >= 8): coord = [player[0], player[1] - 8] #path_memory_[player[1] - 16 : player[1], player[0]] = -1 elif (player[1] > 0): coord = [player[0], 0] #path_memory_[0 : player[1], player[0]] = -1 #else: # rew -= 1 elif (action == 1): #DOWN if (player[1] <= 23): coord = [player[0], player[1] + 8] #path_memory_[player[1] : player[1] + 16, player[0]] = -1 elif (player[1] > 23): coord = [player[0], 31] #path_memory_[player[1] : 63, player[0]] = -1 #else: # rew -= 1 elif (action == 2): #LEFT if (player[0] >= 8): coord = [player[0] - 8, player[1]] #path_memory_[player[1], player[0] - 16 : player[0]] = -1 elif (player[0] < 8): coord = [0, player[1]] #path_memory_[player[1], 0 : player[0]] = -1 #else: # rew -= 1 elif (action == 3): #RIGHT if (player[0] <= 23): coord = [player[0] + 8, player[1]] #path_memory_[player[1], player[0] : player[0] + 16] = -1 elif (player[0] > 23): coord = [31, player[1]] #path_memory_[player[1], player[0] : 63] = -1 if _MOVE_SCREEN not in obs[0].observation["available_actions"]: obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] # else: # new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] new_screen = (player_relative == _PLAYER_NEUTRAL).astype( int) #+ path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 16): new_screen = shift(LEFT, player[0] - 16, new_screen) elif (player[0] < 16): new_screen = shift(RIGHT, 16 - player[0], new_screen) if (player[1] > 16): new_screen = shift(UP, player[1] - 16, new_screen) elif (player[1] < 16): new_screen = shift(DOWN, 16 - player[1], new_screen) rew = obs[0].reward done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. replay_buffer.add(screen, action, rew, new_screen, float(done)) # replay_buffer_y.add(screen, action_y, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew reward = episode_rewards[-1] if done: obs = env.reset() player_relative = obs[0].observation["screen"][ _PLAYER_RELATIVE] screen = (player_relative == _PLAYER_NEUTRAL).astype( int) #+ path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] # Select all marines first env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) episode_rewards.append(0.0) #episode_minerals.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience # experience_y = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) # (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(batch_size) # weights_y, batch_idxes_y = np.ones_like(rewards_y), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # td_errors_y = train_x(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps # new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() # update_target_y() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("reward", reward) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}". format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act)
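The loop above keeps the agent's view centred by sliding the 32x32 neutral-unit map whenever the marine drifts away from the centre, using a shift helper and UP/DOWN/LEFT/RIGHT constants that are defined elsewhere in this codebase and do not appear in this excerpt. A minimal sketch of what such a helper could look like, assuming it translates the 2-D array and zero-fills the exposed border (names and semantics here are assumptions, not the project's implementation):

import numpy as np

# Hypothetical direction constants; the real module defines its own.
UP, DOWN, LEFT, RIGHT = 0, 1, 2, 3

def shift(direction, number, matrix):
    """Translate `matrix` by `number` cells in `direction`, zero-filling the
    cells that become exposed (a sketch, assuming number >= 1)."""
    shifted = np.zeros_like(matrix)
    if direction == UP:        # content moves up; bottom rows become zero
        shifted[:-number, :] = matrix[number:, :]
    elif direction == DOWN:    # content moves down; top rows become zero
        shifted[number:, :] = matrix[:-number, :]
    elif direction == LEFT:    # content moves left; right columns become zero
        shifted[:, :-number] = matrix[:, number:]
    elif direction == RIGHT:   # content moves right; left columns become zero
        shifted[:, number:] = matrix[:, :-number]
    return shifted

With that reading, shift(LEFT, player[0] - 16, screen) slides the marine's column back toward index 16, which is what the centring logic above relies on.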
def learn(env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, callback=None): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimize for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from the replay buffer for training print_freq: int how often to print out training progress. set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: bool if True, a prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None, equals max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every step with the state of the algorithm. If callback returns true, training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput(env.observation_space.shape, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] reset = False new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) U.load_state(model_file) return ActWrapper(act, act_params)
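For the gym variant above, the docstring pins down the q_func contract: a callable that receives (observation_in, num_actions, scope, reuse) and returns a (batch_size, num_actions) tensor of Q-values. A minimal sketch of a compatible network and call, assuming TensorFlow 1.x and a standard gym environment (the layer sizes and environment id below are arbitrary illustration choices, not taken from this code):

import gym
import tensorflow as tf

def mlp_q_func(observation_in, num_actions, scope, reuse=False):
    # Two hidden layers followed by a linear head with one Q-value per action.
    with tf.variable_scope(scope, reuse=reuse):
        out = tf.layers.flatten(observation_in)
        out = tf.layers.dense(out, 64, activation=tf.nn.relu)
        out = tf.layers.dense(out, 64, activation=tf.nn.relu)
        return tf.layers.dense(out, num_actions, activation=None)

env = gym.make("CartPole-v0")   # any small discrete-action environment works for a smoke test
act = learn(env,
            q_func=mlp_q_func,
            lr=5e-4,
            max_timesteps=100000,
            exploration_fraction=0.1,
            exploration_final_eps=0.02,
            print_freq=10)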
def task_train(self): self.episode_rewards = [0.0] self.episode_steps = [0.0] self.saved_mean_reward = None obs = self.env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(self.max_timesteps): if self.callback is not None: if self.callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - self.exploration.value(t) + self.exploration.value(t) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = self.env.step(env_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs self.episode_rewards[-1] += rew self.episode_steps[-1] += 1 if done: obs = self.env.reset() self.episode_rewards.append(0.0) self.episode_steps.append(0.0) reset = True if t > self.learning_starts and t % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones, weights) if self.prioritized_replay: new_priorities = np.abs(td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities(batch_idxes, new_priorities) if t > self.learning_starts and t % self.target_network_update_freq == 0: # Update target network periodically. self.update_target() mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1) num_episodes = len(self.episode_rewards) if done and self.print_freq is not None and len(self.episode_rewards) % self.print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(t))) logger.dump_tabular() if (self.checkpoint_freq is not None and t > self.learning_starts and num_episodes > 100 and t % self.checkpoint_freq == 0): if self.saved_mean_reward is None or mean_100ep_reward > self.saved_mean_reward: if self.print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( self.saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True self.saved_mean_reward = mean_100ep_reward if num_episodes >= self.max_episodes: break if model_saved: if self.print_freq is not None: logger.log("Restored model with mean reward: {}".format(self.saved_mean_reward)) U.load_state(model_file) return self.act, self.episode_rewards, self.episode_steps
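Every variant in this file queries exploration.value(t) (or self.exploration.value(t)) once per step; the schedule itself is built before the loop from exploration_fraction and exploration_final_eps. A self-contained sketch of the linear annealing this implies, interpolating from initial_p to final_p over schedule_timesteps and holding final_p afterwards (a re-implementation for illustration, not the project's class):

class LinearSchedule:
    """Linearly interpolate from initial_p to final_p over schedule_timesteps,
    then keep returning final_p (sketch of the schedule used above)."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Fraction of the schedule that has elapsed, clipped to 1.0 after the end.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# e.g. the first 10% of a 100k-step run annealed from 1.0 down to 0.02:
exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
assert abs(exploration.value(0) - 1.0) < 1e-9
assert abs(exploration.value(10000) - 0.02) < 1e-9
assert abs(exploration.value(50000) - 0.02) < 1e-9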
def learn(env, q_func, num_actions=3, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None, demo_replay=[]): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimize for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from the replay buffer for training print_freq: int how often to print out training progress. set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: bool if True, a prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None, equals max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every step with the state of the algorithm. If callback returns true, training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() #def make_obs_ph(name): # return U.BatchInput((64, 64), name=name) obs_spec = env.observation_spec()[0] screen_dim = obs_spec['feature_screen'][1:3] def make_obs_ph(name): return ObservationInput(Box(low=0.0, high=screen_dim[0], shape=(screen_dim[0],screen_dim[1],1)), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule( schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # Select all marines first player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] screen = player_relative obs, xy_per_marine = common.init(env, obs) group_id = 0 reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
- exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True # custom process for DefeatZerglingsAndBanelings obs, screen, player = common.select_marine(env, obs) action = act( np.array(screen)[None], update_eps=update_eps, **kwargs)[0] reset = False rew = 0 new_action = None obs, new_action = common.marine_action(env, obs, player, action) army_count = env._obs[0].observation.player_common.army_count try: if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]: obs = env.step(actions=new_action) else: new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) except Exception as e: #print(e) 1 # Do nothing player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] new_screen = player_relative rew += obs[0].reward done = obs[0].step_type == environment.StepType.LAST selected = obs[0].observation["feature_screen"][_SELECTED] player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() if (len(player_y) > 0): player = [int(player_x.mean()), int(player_y.mean())] if (len(player) == 2): if (player[0] > 32): new_screen = common.shift(LEFT, player[0] - 32, new_screen) elif (player[0] < 32): new_screen = common.shift(RIGHT, 32 - player[0], new_screen) if (player[1] > 32): new_screen = common.shift(UP, player[1] - 32, new_screen) elif (player[1] < 32): new_screen = common.shift(DOWN, 32 - player[1], new_screen) # Store transition in the replay buffer. replay_buffer.add(screen, action, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew reward = episode_rewards[-1] if done: print("Episode Reward : %s" % episode_rewards[-1]) obs = env.reset() player_relative = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] screen = player_relative group_list = common.init(env, obs) # Select all marines first #env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])]) episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("reward", reward) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}". format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act)
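When prioritized_replay is enabled, each training step above feeds the sampled batch's absolute TD errors back into the buffer as new priorities, offset by prioritized_replay_eps so that a transition with zero error still has a non-zero chance of being sampled again. A small numeric illustration of that update and of the resulting sampling probabilities (the alpha exponent lives inside the replay buffer; the 0.6 below simply mirrors the default argument):

import numpy as np

td_errors = np.array([0.50, -0.02, 1.30, 0.00])   # per-transition TD errors from train(...)
prioritized_replay_eps = 1e-6
alpha = 0.6

# Priority is |TD error| + eps; sampling probability is p_i**alpha / sum_j p_j**alpha.
new_priorities = np.abs(td_errors) + prioritized_replay_eps
probs = new_priorities ** alpha / np.sum(new_priorities ** alpha)

print(new_priorities)   # [0.500001, 0.020001, 1.300001, 0.000001]
print(probs)            # the zero-error transition keeps a tiny but non-zero probability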
def train(env, eval_env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, my_skill_set=None, log_dir=None, num_eval_episodes=10, render=False, render_eval=False, commit_for=1): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimize for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from the replay buffer for training print_freq: int how often to print out training progress. set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: bool if True, a prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None, equals max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every step with the state of the algorithm. If callback returns true, training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model if my_skill_set: assert commit_for>=1, "commit_for >= 1" save_idx = 0 with U.single_threaded_session() as sess: ## restore if my_skill_set: action_shape = my_skill_set.len else: action_shape = env.action_space.n # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return U.BatchInput(observation_space_shape, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=action_shape, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': action_shape, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() # sess.run(tf.variables_initializer(new_variables)) # sess.run(tf.global_variables_initializer()) update_target() if my_skill_set: ## restore skills my_skill_set.restore_skillset(sess=sess) episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True model_saved = False model_file = os.path.join(log_dir, "model", "deepq") # save the initial act model print("Saving the starting model") os.makedirs(os.path.dirname(model_file), exist_ok=True) act.save(model_file + '.pkl') for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True paction = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] if(my_skill_set): skill_obs = obs.copy() primitive_id = paction rew = 0. 
for _ in range(commit_for): ## break actions into primitives and their params action = my_skill_set.pi(primitive_id=primitive_id, obs = skill_obs.copy(), primitive_params=None) new_obs, skill_rew, done, _ = env.step(action) if render: # print(action) env.render() sleep(0.1) rew += skill_rew skill_obs = new_obs terminate_skill = my_skill_set.termination(new_obs) if done or terminate_skill: break else: action= paction env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) if render: env.render() sleep(0.1) # Store transition in the replay buffer for the outer env replay_buffer.add(obs, paction, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True print("Time:%d, episodes:%d"%(t,len(episode_rewards))) # add hindsight experience if t > learning_starts and t % train_freq == 0: # print('Training!') # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() # print(len(episode_rewards), episode_rewards[-11:-1]) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if (checkpoint_freq is not None and t > learning_starts and num_episodes > 50 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) act.save(model_file + '%d.pkl'%save_idx) save_idx += 1 model_saved = True saved_mean_reward = mean_100ep_reward # else: # print(saved_mean_reward, mean_100ep_reward) if (eval_env is not None) and t > learning_starts and t % target_network_update_freq == 0: # dumping other stats logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("%d time spent exploring", int(100 * exploration.value(t))) print("Testing!") eval_episode_rewards = [] eval_episode_successes = [] for i in range(num_eval_episodes): eval_episode_reward = 0. eval_obs = eval_env.reset() eval_obs_start = eval_obs.copy() eval_done = False while(not eval_done): eval_paction = act(np.array(eval_obs)[None])[0] if(my_skill_set): eval_skill_obs = eval_obs.copy() eval_primitive_id = eval_paction eval_r = 0. 
for _ in range(commit_for): ## break actions into primitives and their params eval_action, _ = my_skill_set.pi(primitive_id=eval_primitive_id, obs = eval_skill_obs.copy(), primitive_params=None) eval_new_obs, eval_skill_rew, eval_done, eval_info = eval_env.step(eval_action) # print('env reward:%f'%eval_skill_rew) if render_eval: print("Render!") eval_env.render() print("rendered!") eval_r += eval_skill_rew eval_skill_obs = eval_new_obs eval_terminate_skill = my_skill_set.termination(eval_new_obs) if eval_done or eval_terminate_skill: break else: eval_action= eval_paction env_action = eval_action reset = False eval_new_obs, eval_r, eval_done, eval_info = eval_env.step(env_action) if render_eval: # print("Render!") eval_env.render() # print("rendered!") eval_episode_reward += eval_r # print("eval_r:%f, eval_episode_reward:%f"%(eval_r, eval_episode_reward)) eval_obs = eval_new_obs eval_episode_success = (eval_info["done"]=="goal reached") if(eval_episode_success): logger.info("success, training epoch:%d,starting config:"%t) eval_episode_rewards.append(eval_episode_reward) eval_episode_successes.append(eval_episode_success) combined_stats = {} # print(eval_episode_successes, np.mean(eval_episode_successes)) combined_stats['eval/return'] = normal_mean(eval_episode_rewards) combined_stats['eval/success'] = normal_mean(eval_episode_successes) combined_stats['eval/episodes'] = (len(eval_episode_rewards)) for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) print("dumping the stats!") logger.dump_tabular() if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) U.load_state(model_file)
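In the skill-based variant above, the discrete output of the DQN (paction) is interpreted as the index of a pre-trained skill, and the commit_for loop executes that skill for up to commit_for low-level environment steps, stopping early when the skill's own termination test fires or the episode ends. The my_skill_set object is supplied by the caller and its class is not shown in this excerpt; a hypothetical interface that would satisfy the calls made above (len, pi, termination, restore_skillset) might look like:

class SkillSet:
    """Hypothetical container of pre-trained low-level skills, matching the calls
    made in the training/eval loops above; names and semantics are assumptions,
    not the original API."""

    def __init__(self, skills):
        self.skills = skills          # list of callables: observation -> primitive action

    @property
    def len(self):
        # The outer DQN's action space has one discrete action per skill.
        return len(self.skills)

    def pi(self, primitive_id, obs, primitive_params=None):
        # Run the selected skill's policy for one low-level step.
        return self.skills[primitive_id](obs)

    def termination(self, obs):
        # Whether the running skill should hand control back to the DQN;
        # a real skill set would decide this from obs, this sketch never terminates early.
        return False

    def restore_skillset(self, sess):
        # Load the skills' pre-trained weights into the TF session (no-op in this sketch).
        pass

Note that the two call sites disagree slightly: the training loop uses the return value of pi() directly as the action, while the evaluation loop unpacks two values from it, so the real method presumably returns an (action, info) pair.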
def learn( env, q_func, # takes an observation, the number of actions, etc. and returns a Q-value for each action num_actions=16, # available actions: up down left right lr=5e-4, max_timesteps=100000, buffer_size=50000, # size of the replay buffer exploration_fraction=0.1, # over the first 10% of training, the exploration rate is annealed from 1 to 0.02 exploration_final_eps=0.02, # final value of random action probability train_freq=1, # update the model every `train_freq` steps. batch_size=32, # size of a batch sampled from the replay buffer for training print_freq=1, checkpoint_freq=10000, learning_starts=1000, # time for the model to collect transitions before learning starts gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, # if None, beta is annealed over max_timesteps prioritized_replay_eps=1e-6, num_cpu=16, # number of cpus to use for training param_noise=False, # whether or not to use parameter space noise param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimize for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from the replay buffer for training print_freq: int how often to print out training progress. set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: bool if True, a prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None, equals max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every step with the state of the algorithm. If callback returns true, training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph( name ): # Creates a placeholder for a batch of tensors of a given shape and dtype return U_b.BatchInput((16, 16), name=name) act_x, train_x, update_target_x, debug_x = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, # clip gradient norms to this value scope="deepq_x") act_y, train_y, update_target_y, debug_y = deepq.build_train( #because there are two players in the game make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq_y") act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer_x = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) replay_buffer_y = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule_x = LinearSchedule( prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, # 0.4->1 final_p=1.0) beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer_x = ReplayBuffer(buffer_size) replay_buffer_y = ReplayBuffer(buffer_size) beta_schedule_x = None beta_schedule_y = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. ---环境初始化 U.initialize() update_target_x() update_target_y() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # start a new episode # Select all marines first ---选择所有个体,获得新的观察 obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) # Apply actions, step the world forward, and return observations. 
# Inspect the player_relative layer of the returned screen observation: 1 marks our own units' positions and 3 marks the minerals' positions (this is the matrix printed in the terminal) player_relative = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] #obs is a 'TimeStep' whose type is tuple of ['step_type', 'reward', 'discount', 'observation'];step_type.first or mid or last # mineral locations as a 0/1 matrix screen = (player_relative == _PLAYER_NEUTRAL).astype( int ) #+ path_memory screen=1 or 0 to indicate the location of mineral # friendly units' locations, as row/column indices player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero( ) #the location of team member: row, col <-> y,x # print(player_relative) # print('*************') # print(screen) # print(_PLAYER_FRIENDLY) # # print(player_x) # print(player_y) # print('ssss') # if (len(player_x) == 0): # player_x = np.array([0]) # # print('player_x from null to 0') # # print(player_x) # if (len(player_y) == 0): # player_y = np.array([0]) # # print('player_y from null to 0') # # print(player_y) player = [int(player_x.mean()), int(player_y.mean())] reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join("model/", "mineral_shards") # path used to save the model print(model_file) for t in range(max_timesteps): # print('timestep=',t) if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) # outputs a value annealed from 1 down to 0.02 update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. - exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True # actions obtained after exploration action_x = act_x(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] # print('action_x is ',action_x) action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] # print('action_y is ',action_y) reset = False # coord = [player[0], player[1]] rew = 0 #reward coord = [action_x, action_y] if _MOVE_SCREEN not in obs[0].observation["available_actions"]: obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] # else: # new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) player_relative = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] # print(player_relative) new_screen = (player_relative == _PLAYER_NEUTRAL).astype(int) # print(_PLAYER_FRIENDLY) # print(player_x) # print(player_y) # print('ssssss2') # if (len(player_x) == 0): # player_x = np.array([0]) # # print('player_x from null to 0') # # print(player_x) # if (len(player_y) == 0): # player_y = np.array([0]) # # print('player_y from null to 0') # # print(player_y) # player = [int(player_x.mean()), int(player_y.mean())] rew = obs[0].reward done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. 
replay_buffer_x.add(screen, action_x, rew, new_screen, float(done)) replay_buffer_y.add(screen, action_y, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew reward = episode_rewards[-1] if done: obs = env.reset() # player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] # screent = (player_relative == _PLAYER_NEUTRAL).astype(int) # # player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() # player = [int(player_x.mean()), int(player_y.mean())] # Select all marines first env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) episode_rewards.append(0.0) # print("episode_rewards is ", episode_rewards) print('num_episodes is', len(episode_rewards)) #episode_minerals.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: #train_freq=1: update the model every `train_freq` steps # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience_x = replay_buffer_x.sample( batch_size, beta=beta_schedule_x.value(t)) (obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x, weights_x, batch_idxes_x) = experience_x experience_y = replay_buffer_y.sample( batch_size, beta=beta_schedule_y.value(t)) (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y else: obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x = replay_buffer_x.sample( batch_size) weights_x, batch_idxes_x = np.ones_like( rewards_x ), None # weights_x is an array padded with 1 which has the same shape as rewards_x obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample( batch_size) weights_y, batch_idxes_y = np.ones_like(rewards_y), None td_errors_x = train_x(obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x, weights_x) td_errors_y = train_y(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y) if prioritized_replay: new_priorities_x = np.abs( td_errors_x) + prioritized_replay_eps new_priorities_y = np.abs( td_errors_y) + prioritized_replay_eps replay_buffer_x.update_priorities(batch_idxes_x, new_priorities_x) replay_buffer_y.update_priorities(batch_idxes_y, new_priorities_y) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target_x() update_target_y() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) # round: sishewuru value num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("reward", reward) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act_x), ActWrapper(act_y)
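This two-network variant factorises the move target on a 16x16 screen into independent column and row choices: deepq_x learns Q-values over the 16 possible x coordinates and deepq_y over the 16 possible y coordinates, so instead of ranking 16 * 16 = 256 joint coordinates each network only ranks 16, at the cost of assuming the two axes can be chosen independently. A compact sketch of how the two greedy outputs combine into the _MOVE_SCREEN argument used above (the Q-values are random stand-ins for the two networks' outputs):

import numpy as np

num_actions = 16                             # one Q-value per column (x) and per row (y)

q_values_x = np.random.rand(num_actions)     # stand-in for the deepq_x head's output
q_values_y = np.random.rand(num_actions)     # stand-in for the deepq_y head's output

action_x = int(np.argmax(q_values_x))        # greedy column choice
action_y = int(np.argmax(q_values_y))        # greedy row choice
coord = [action_x, action_y]                 # screen target passed to the _MOVE_SCREEN call

# Joint action space: 16 * 16 = 256 combinations, but each network only ever scores 16 of them.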
def learn( env, sess, actor, critic, replay_buffer, action_noise, num_exploring, max_timesteps=100000, train_freq=1, #1 batch_size=32, #32 print_freq=1, save_freq=10000, #10000 gamma=1.0, target_network_update_freq=1, #500, num_agents=9, output_len=4, # num_baneling=4, # num_zergling=6, # unit_flag_friend=0.4, #48 # unit_flag_baneling=0.7, #9 # unit_flag_zergling=1, #105 action_low=-1, action_high=1): # """Train a deepq model. # # Parameters # ------- # env: pysc2.env.SC2Env # environment to train on # q_func: (tf.Variable, int, str, bool) -> tf.Variable # the model that takes the following inputs: # observation_in: object # the output of observation placeholder # num_actions: int # number of actions # scope: str # reuse: bool # should be passed to outer variable scope # and returns a tensor of shape (batch_size, num_actions) with values of every action. # lr: float # learning rate for adam optimizer # max_timesteps: int # number of env steps to optimizer for # buffer_size: int # size of the replay buffer # train_freq: int # update the model every `train_freq` steps. # set to None to disable printing # batch_size: int # size of a batched sampled from replay buffer for training # print_freq: int # how often to print out training progress # set to None to disable printing # checkpoint_freq: int # how often to save the model. This is so that the best version is restored # at the end of the training. If you do not wish to restore the best version at # the end of the training set this variable to None. # learning_starts: int # how many steps of the model to collect transitions for before learning starts # gamma: float # discount factor # target_network_update_freq: int # update the target network every `target_network_update_freq` steps. # num_cpu: int # number of cpus to use for training # callback: (locals, globals) -> None # function called at every steps with state of the algorithm. # If callback returns true training stops. # # Returns # ------- # act: ActWrapper # Wrapper over act function. Adds ability to save it and load it. # See # # # of baselines/deepq/categorical.py for details on the act function. 
# """ # # Create all the functions necessary to train the model # # tf.reset_default_graph() # config = tf.ConfigProto() # config.gpu_options.allow_growth = True # sess = tf.Session(config = config) # sess.__enter__() obs = env.reset() action_noise.reset() episode_rewards = [0.0] obs, _ = common_group.init(env, obs) # model_file_load = os.path.join(str(40000) + "_" + "model_segment_training/", "defeat_zerglings") # U.load_state(model_file_load, sess) U.initialize() min_dist = 5 punish = -0.01 eps_time = 1 # build screen_expand player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] screen = np.zeros((player_relative.shape[0] - 9, player_relative.shape[1])) for i in range(player_relative.shape[0] - 9): for j in range(player_relative.shape[1]): screen[i, j] = round(player_relative[i, j] / 3, 1) screen_expand = screenConcat(screen, num_agents) # select the army so that MOVE_SCREEN becomes available obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() # Post-process player_x and player_y so they always hold two elements, one position per agent # Case where no agent is found; normally does not happen, since the map has no opposing player if (len(player_x) == 0): player_x = np.array([0]) player_y = np.array([0]) # Case where the two agents overlap at the same point if (len(player_x) == 1): player_x = np.append(player_x, player_x[0]) player_y = np.append(player_y, player_y[0]) pos_agent1_target = [player_x[0], player_y[0]] pos_agent2_target = [player_x[1], player_y[1]] with tempfile.TemporaryDirectory() as td: for t in range(max_timesteps): startTime = datetime.datetime.now() # feed the observation to the actor to get actions screen_input = np.expand_dims(screen_expand, axis=0) action = actor.predict(screen_input)[0] # (2, 4) rnn_out = actor.rnn_out_pre(screen_input) # action[0] = MaxMinNormalization(action[0], getMax(action[0]), getMin(action[0])) # action[1] = MaxMinNormalization(action[1], getMax(action[1]), getMin(action[1])) act_with_noise = np.clip( action + action_noise.get_noise(t - num_exploring), action_low, action_high) act_prob = (act_with_noise + 1) / 2 #act_with_noise # act_prob_sum = act_prob.sum(axis=1) act_index = [0, 1, 2, 3] # if(act_prob_sum[0] == 0): # prob = (np.array(act_prob[0]) + 1) / len(act_prob[0]) # else: # prob = act_prob[0]/act_prob_sum[0] # # a1 = np.random.choice(np.array(act_index), p=prob.ravel()) # # if (act_prob_sum[1] == 0): # prob = (np.array(act_prob[1]) + 1) / len(act_prob[1]) # else: # prob = act_prob[1] / act_prob_sum[1] # # a2 = np.random.choice(np.array(act_index), p=prob.ravel()) # a1 = act_with_noise[0] # a2 = act_with_noise[1] # pick the highest-probability action for each agent a1 = int(np.argmax(act_prob[0])) a2 = int(np.argmax(act_prob[1])) # execute the actions pos_agent1 = [player_x[0], player_y[0]] pos_agent2 = [player_x[1], player_y[1]] diff_1toTarget1 = (pos_agent1_target[0] - pos_agent1[0]) * (pos_agent1_target[0] - pos_agent1[0]) + (pos_agent1_target[1] - pos_agent1[1]) * (pos_agent1_target[1] - pos_agent1[1]) diff_2toTarget1 = (pos_agent1_target[0] - pos_agent2[0]) * (pos_agent1_target[0] - pos_agent2[0]) + (pos_agent1_target[1] - pos_agent2[1]) * (pos_agent1_target[1] - pos_agent2[1]) diff_1toTarget2 = (pos_agent2_target[0] - pos_agent1[0]) * (pos_agent2_target[0] - pos_agent1[0]) + (pos_agent2_target[1] - pos_agent1[1]) * (pos_agent2_target[1] - pos_agent1[1]) diff_2toTarget2 = (pos_agent2_target[0] - pos_agent2[0]) * (pos_agent2_target[0] - pos_agent2[0]) + (pos_agent2_target[1] - pos_agent2[1]) * (pos_agent2_target[1] - pos_agent2[1]) if ((diff_1toTarget1 > diff_2toTarget1) and (diff_1toTarget2 < diff_2toTarget2)): pos_agent1 = [player_x[1], player_y[1]] pos_agent2 = [player_x[0], player_y[0]] # If an agent is already at the border and still moves toward it, apply a penalty pos_agent1_target, punish_1 = obtainTargetPos(a1, pos_agent1) pos_agent2_target, punish_2 = obtainTargetPos(a2, pos_agent2) player_relative_old = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] mineral_y_old, mineral_x_old = ( player_relative_old == _PLAYER_NEUTRAL).nonzero() if (len(mineral_x_old) == 0): mineral_x_old = np.array([0]) mineral_y_old = np.array([0]) obs = env.step_rewrite(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [_SELECT_POINT_ACT, pos_agent1]) ]) obs = env.step_rewrite(actions=[ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, pos_agent1_target]) ]) obs = env.step_rewrite(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [_SELECT_POINT_ACT, pos_agent2]) ]) obs = env.step_rewrite(actions=[ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, pos_agent2_target]) ]) obs = env._step() flag_end = obs[0].step_type == environment.StepType.LAST rew = obs[0].reward # get the new observation player_relative = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] new_screen = np.zeros( (player_relative.shape[0] - 9, player_relative.shape[1])) for i in range(player_relative.shape[0] - 9): for j in range(player_relative.shape[1]): new_screen[i, j] = round(player_relative[i, j] / 3, 1) new_screen_expand = screenConcat(new_screen, num_agents) # get the new distances from the minerals to the agents player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() # Post-process player_x and player_y so they always hold two elements, one position per agent # Case where no agent is found; normally does not happen, since the map has no opposing player if (len(player_x) == 0): player_x = np.array([0]) player_y = np.array([0]) # Case where the two agents overlap at the same point if (len(player_x) == 1): player_x = np.append(player_x, player_x[0]) player_y = np.append(player_y, player_y[0]) # Compute each agent's distance to the closest mineral from before the step; if it is below the threshold, the mineral counts as collected reward_dist_a1 = False reward_dist_a2 = False for i in range(len(mineral_x_old)): dist_agent1 = (mineral_x_old[i] - player_x[0]) * (mineral_x_old[i] - player_x[0]) + (mineral_y_old[i] - player_y[0]) * (mineral_y_old[i] - player_y[0]) dist_agent2 = (mineral_x_old[i] - player_x[1]) * (mineral_x_old[i] - player_x[1]) + (mineral_y_old[i] - player_y[1]) * (mineral_y_old[i] - player_y[1]) if (dist_agent1 < min_dist and rew > 0): reward_dist_a1 = True break if (dist_agent2 < min_dist and rew > 0): reward_dist_a2 = True break # Compute each agent's reward from the mineral-to-agent distances before and after the step rew_expand = np.zeros((num_agents, 1)) # collect-mineral reward if (reward_dist_a1 and rew > 0): rew_expand[0] = rew if (reward_dist_a2 and rew > 0): rew_expand[1] = rew # if(reward_dist_a1 or reward_dist_a2 or rew==1): # rew_expand[0] += rew*10 # rew_expand[1] += rew*10 # give a small penalty every step if (punish_1): rew_expand[0] += -10 # penalty for running into the border rew_expand[0] += punish * eps_time if (punish_2): rew_expand[1] += -10 rew_expand[1] += punish * eps_time # if (punish_1 or punish_2): # rew_expand[0] += -10 # rew_expand[1] += -10 replay_buffer.add(screen_expand, act_with_noise, rew_expand, flag_end, new_screen_expand) episode_rewards[-1] += rew # rew.sum(axis=0) # make the new observation the current observation screen_expand = new_screen_expand eps_time += 1 if (flag_end): eps_time = 1 reward = episode_rewards[-1] print("Episode Reward : %s" % reward) obs = env.reset() action_noise.reset() print('num_episodes is', len(episode_rewards)) episode_rewards.append(0.0) # get the initial observation player_relative = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] screen = np.zeros( (player_relative.shape[0] - 9, player_relative.shape[1])) for i in range(player_relative.shape[0] - 9): for j in range(player_relative.shape[1]): screen[i, j] = round(player_relative[i, j] / 3, 1) screen_expand = screenConcat(screen, num_agents) # select all units obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) # compute the initial distances from the minerals to each agent player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() # Post-process player_x and player_y so they always hold two elements, one position per agent if (len(player_x) == 0): player_x = np.array([0]) player_y = np.array([0]) if (len(player_x) == 1): player_x = np.append(player_x, player_x[0]) player_y = np.append(player_y, player_y[0]) pos_agent1_target = [player_x[0], player_y[0]] pos_agent2_target = [player_x[1], player_y[1]] if (t > num_exploring) and (t % train_freq == 0): #t % train_freq == 0: # trainStartTime = datetime.datetime.now() print("training starts") s_batch, a_batch, r_batch, done_batch, s2_batch = replay_buffer.sample_batch( batch_size ) #[group0:[batch_size, trace.dimension], group1, ... group8] target_q = r_batch + gamma * critic.predict_target( s2_batch, actor.predict_target(s2_batch)) rnn_c_out = critic.predict_target_rnn( s2_batch, actor.predict_target(s2_batch)) predicted_q_value, _ = critic.train( s_batch, a_batch, np.reshape(target_q, (batch_size, num_agents, output_len))) a_outs = actor.predict(s_batch) # a_outs and a_batch are exactly the same grads = critic.action_gradients(s_batch, a_outs) # gradient of Q with respect to the action a actor.train(s_batch, grads) # computes the gradient of the action with respect to θ and the final policy gradient if (t > num_exploring) and ( t % target_network_update_freq == 0): #t % target_network_update_freq == 0: actor.update_target_network() critic.update_target_network() if (t > num_exploring) and ((t - num_exploring) % save_freq == 0): # saveStartTime = datetime.datetime.now() model_file_save = os.path.join( str(t) + "_" + "model_segment_training2/", "defeat_zerglings") U.save_state(model_file_save) replay_buffer.save() elif (t == max_timesteps - 1): model_file_save = os.path.join( str(t) + "_" + "model_segment_training2/", "defeat_zerglings") U.save_state(model_file_save) # mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if flag_end and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("reward", reward) # logger.record_tabular("mean 100 episode reward", # mean_100ep_reward) logger.dump_tabular() endTime = datetime.datetime.now() time_used = str(endTime - startTime) print("t = %d, time used = %s" % (t, time_used))
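The critic update in this last, DDPG-style variant bootstraps from the target networks: the training target is the observed reward plus gamma times the target critic's value of the next state under the target actor's action, which is what target_q = r_batch + gamma * critic.predict_target(s2_batch, actor.predict_target(s2_batch)) computes before the critic is regressed onto it. A minimal numeric illustration of that target, with hypothetical numbers standing in for the two predict_target calls:

import numpy as np

gamma = 1.0                                # the discount used above

r_batch = np.array([0.0, 1.0, 0.0])        # rewards for a 3-transition batch
q_target_next = np.array([2.5, 0.0, 1.2])  # stand-in for critic.predict_target(s2, actor.predict_target(s2))

# y_i = r_i + gamma * Q'(s'_i, mu'(s'_i)); the critic's train() then fits Q(s_i, a_i) to y_i,
# and the actor is updated with the critic's action gradients.
target_q = r_batch + gamma * q_target_next
print(target_q)                            # [2.5, 1.0, 1.2]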