) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for t in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 if is_solved: # Show off the result env.render() else:
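# A minimal sketch of the linear annealing that baselines' LinearSchedule is assumed to
# perform (initial_p at t=0, final_p once schedule_timesteps is reached, constant after);
# the class name below is illustrative, not part of the original code.
class LinearScheduleSketch:
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule completed, clipped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# With the settings above: value(0) == 1.0, value(5000) == 0.51, value(10000) == 0.02.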
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, stage1_total_timesteps=None, stage2_total_timesteps=None, buffer_size=50000, exploration_fraction=0.3, initial_exploration_p=1.0, exploration_final_eps=0.0, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1, gamma=1.0, target_network_update_freq=100, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, double_q=True, obs_dim=None, qmdp_expert=None, stage1_td_error_threshold=1e-3, pretrain_experience=None, flatten_belief=False, num_experts=None, **network_kwargs): """Train a bootstrap-dqn model in two stages (residual training without, then with, exploration). Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. Runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimize for stage1_total_timesteps: int number of stage-1 (replay-only) steps; if both stage arguments are None, total_timesteps is split evenly between the two stages stage2_total_timesteps: int number of stage-2 (exploration) steps buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from the replay buffer for training print_freq: int how often to print out training progress. set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training, set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: bool if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from its initial value to 1.0. If set to None, defaults to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every step with the state of the algorithm. If callback returns true, training stops. load_path: str path to load the model from. (default: None) double_q: bool whether to use double-DQN targets obs_dim: int width of the environment-observation part of each flattened observation; the remaining entries are treated as the observed belief qmdp_expert: callable taking (obs, bel) and returning QMDP q-values for each action stage1_td_error_threshold: float mean absolute TD error below which stage 1 terminates early pretrain_experience: batches of (obs, val, action, rew, new_obs, done) added to the replay buffer before training flatten_belief: bool if True, observed beliefs are mapped through qmdp_expert.flatten_to_belief num_experts: int number of experts, i.e. the size of the belief vector fed to the Q function **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) nenvs = env.num_envs print("{} envs".format(nenvs)) assert pretrain_experience is not None and qmdp_expert is not None and num_experts is not None # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph # import IPython; IPython.embed() #assert isinstance(env.envs[0].env.env.env, ExplicitBayesEnv) #belief_space = env.envs[0].env.env.env.belief_space #observation_space = env.envs[0].env.env.env.internal_observation_space obs_space = env.observation_space assert obs_dim is not None observation_space = Box(obs_space.low[:obs_dim], obs_space.high[:obs_dim], dtype=np.float32) #belief_space = Box(obs_space.low[obs_dim:], obs_space.high[obs_dim:], dtype=np.float32) observed_belief_space = Box(obs_space.low[obs_dim:], obs_space.high[obs_dim:], dtype=np.float32) belief_space = Box(np.zeros(num_experts), np.ones(num_experts), dtype=np.float32) # rocksample num_experts = belief_space.high.size # print("Num experts", num_experts) def make_obs_ph(name): return ObservationInput(observation_space, name=name) def make_bel_ph(name): return ObservationInput(belief_space, name=name) q_func = build_q_func(network, num_experts, **network_kwargs) print('=============== got qfunc ============== ') if stage1_total_timesteps is None and stage2_total_timesteps is None: stage1_total_timesteps = total_timesteps // 2 stage2_total_timesteps = total_timesteps // 2 total_timesteps = stage1_total_timesteps + stage2_total_timesteps act, train, update_target, debug = rbqnfe_staged.build_train( make_obs_ph=make_obs_ph, make_bel_ph=make_bel_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise, double_q=double_q) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=initial_exploration_p, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. 
U.initialize() update_target() episode_reward = np.zeros(nenvs, dtype=np.float32) saved_mean_reward = None reset = True epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_episodes = 0 episode_rewards_history = deque(maxlen=1000) episode_step = np.zeros(nenvs, dtype=int) episodes = 0 #scalar # Load model with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") print("Model will be saved at ", model_file) model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) print('Loaded model from {}'.format(load_path)) t = 0 accumulated_td_errors = deque(maxlen=100) # copy all pre-experiences for expert, experience in enumerate(pretrain_experience): obs, val, action, rew, new_obs, done = experience obs, bel = obs[:, :-observed_belief_space. shape[0]], obs[:, -observed_belief_space.shape[0]:] if flatten_belief: bel = qmdp_expert.flatten_to_belief(bel, approximate=True).transpose() new_obs, new_bel = new_obs[:, :-observed_belief_space. shape[0]], new_obs[:, -observed_belief_space. shape[0]:] if flatten_belief: new_bel = qmdp_expert.flatten_to_belief( new_bel, approximate=True).transpose() # rocksample specific new_expert_qval = qmdp_expert(new_obs, new_bel) expert_qval = qmdp_expert(obs, bel) obs = obs.astype(np.float32) bel = bel.astype(np.float32) expert_qval = expert_qval.astype(np.float32) action = action.astype(np.float32) rew = rew.astype(np.float32).ravel() new_obs = new_obs.astype(np.float32) new_bel = new_bel.astype(np.float32) new_expert_qval = new_expert_qval.astype(np.float32) replay_buffer.add(obs, bel, expert_qval, action, rew, new_obs, new_bel, new_expert_qval, done) print("Added {} samples to ReplayBuffer".format( len(replay_buffer._storage))) # Stage 1: Train Residual without exploration, just with batches from replay buffer while t < stage1_total_timesteps: if callback is not None: if callback(locals(), globals()): break kwargs = {} update_param_noise_threshold = 0. obs = env.reset() episode_reward = np.zeros(nenvs, dtype=np.float32) episode_step[:] = 0 obs, bel = obs[:, :-observed_belief_space. shape[0]], obs[:, -observed_belief_space.shape[0]:] expert_qval = qmdp_expert(obs, bel) t += 1 # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) obses_t, bels_t, expert_qvals, actions, rewards, obses_tp1, bels_tp1, expert_qvals_1, dones, weights, batch_idxes = experience else: experience = replay_buffer.sample(batch_size) obses_t, bels_t, expert_qvals, actions, rewards, obses_tp1, bels_tp1, expert_qvals_1, dones = experience weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, bels_t, expert_qvals, actions, rewards, obses_tp1, bels_tp1, expert_qvals_1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) accumulated_td_errors.append(np.mean(np.abs(td_errors))) if np.random.rand() < 0.01: print("Stage 1 TD error", np.around(td_errors, 1)) if t % target_network_update_freq == 0: # Update target network periodically. 
print("Update target") update_target() if len(accumulated_td_errors) == 100 and np.mean( np.abs(accumulated_td_errors)) < stage1_td_error_threshold: if saved_mean_reward is not None: save_variables(model_file) print("Breaking due to low td error", np.mean(accumulated_td_errors)) break if t % print_freq == 0: # Just to get test rewards obs = env.reset() episode_reward = np.zeros(nenvs, dtype=np.float32) episode_step[:] = 0 obs, bel = obs[:, :-observed_belief_space. shape[0]], obs[:, -observed_belief_space.shape[0]:] expert_qval = qmdp_expert(obs, bel) episode_rewards_history = [] horizon = 100 while len(episode_rewards_history) < 1000: action, q_values = act(np.array(obs)[None], np.array(bel)[None], np.array(expert_qval)[None], update_eps=0, **kwargs) env_action = action new_obs, rew, done, info = env.step(env_action) new_obs, new_bel = new_obs[:, :-observed_belief_space.shape[ 0]], new_obs[:, -observed_belief_space.shape[0]:] new_expert_qval = qmdp_expert(new_obs, new_bel) if flatten_belief: new_bel = qmdp_expert.flatten_to_belief(new_bel) obs = new_obs bel = new_bel expert_qval = new_expert_qval episode_reward += 0.95**episode_step * rew episode_step += 1 for d in range(len(done)): if done[d]: epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 mean_100ep_reward = round(np.mean(episode_rewards_history), 2) num_episodes = episodes logger.record_tabular("stage", 1) logger.record_tabular("steps", t) logger.record_tabular("mean 1000 episode reward", mean_100ep_reward) logger.record_tabular("td errors", np.mean(accumulated_td_errors)) logger.dump_tabular() print("episodes ", num_episodes, "steps {}/{}".format(t, total_timesteps)) print("mean reward", mean_100ep_reward) print("exploration", int(100 * exploration.value(t))) if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}". format(saved_mean_reward, mean_100ep_reward)) print("saving model") save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) # Post stage1 saving stage1_model_file = os.path.join(td, "stage1_model") save_variables(stage1_model_file) update_target() print("===========================================") print(" Stage 1 complete ") print("===========================================") stage1_total_timesteps = t episode_rewards_history = deque(maxlen=1000) # Stage 2: Train Resisual with explorationi t = 0 while t < stage2_total_timesteps: if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} update_eps = exploration.value(t) update_param_noise_threshold = 0. obs = env.reset() episode_reward = np.zeros(nenvs, dtype=np.float32) episode_step[:] = 0 obs, bel = obs[:, :-observed_belief_space. 
shape[0]], obs[:, -observed_belief_space.shape[0]:] expert_qval = qmdp_expert(obs, bel) start_time = timer.time() horizon = 100 for m in range(horizon): action, q_values = act(np.array(obs)[None], np.array(bel)[None], np.array(expert_qval)[None], update_eps=update_eps, **kwargs) env_action = action new_obs, rew, done, info = env.step(env_action) new_obs, new_bel = new_obs[:, :-observed_belief_space.shape[ 0]], new_obs[:, -observed_belief_space.shape[0]:] new_expert_qval = qmdp_expert(new_obs, new_bel) if flatten_belief: new_bel = qmdp_expert.flatten_to_belief(new_bel) # Store transition in the replay buffer. replay_buffer.add(obs, bel, expert_qval, action, rew, new_obs, new_bel, new_expert_qval, done) # if np.random.rand() < 0.05: # # # write to file # # with open('rbqn_fixed_expert.csv', 'a') as f: # # out = ','.join(str(np.around(x,2)) for x in [bel[0], obs[0], q_values[0]]) # # f.write(out + "\n") # print(np.around(bel[-1], 2), rew[-1], np.around(q_values[-1], 1), np.around(expert_qval[-1], 1)) obs = new_obs bel = new_bel expert_qval = new_expert_qval episode_reward += 0.95**episode_step * rew episode_step += 1 # print(action, done, obs) for d in range(len(done)): if done[d]: epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 print("Took {}".format(timer.time() - start_time)) t += 1 if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) if experience is None: continue obses_t, bels_t, expert_qvals, actions, rewards, obses_tp1, bels_tp1, expert_qvals_1, dones, weights, batch_idxes = experience else: experience = replay_buffer.sample(batch_size) if experience is None: continue obses_t, bels_t, expert_qvals, actions, rewards, obses_tp1, bels_tp1, expert_qvals_1, dones = experience weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, bels_t, expert_qvals, actions, rewards, obses_tp1, bels_tp1, expert_qvals_1, dones, weights) if np.random.rand() < 0.01: print("TD error", np.around(td_errors, 1)) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) accumulated_td_errors.append(np.mean(td_errors)) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
print("Update target") update_target() mean_100ep_reward = round(np.mean(episode_rewards_history), 2) num_episodes = episodes if print_freq is not None and num_episodes % print_freq == 0: logger.record_tabular("stage", 2) logger.record_tabular("steps", t + stage1_total_timesteps) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 1000 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.record_tabular("td errors", np.mean(accumulated_td_errors)) logger.dump_tabular() print("episodes ", num_episodes, "steps {}/{}".format(t, total_timesteps)) print("mean reward", mean_100ep_reward) print("exploration", int(100 * exploration.value(t))) if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}". format(saved_mean_reward, mean_100ep_reward)) print("saving model") save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) return act
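# The staged trainer above relies on qmdp_expert in two ways: it is called on batches of
# observations and beliefs to obtain per-action expert Q-values, and (when flatten_belief
# is set) its flatten_to_belief maps the observed belief encoding onto the expert simplex.
# A hedged interface stub of that assumed contract; the class and its zero Q-values are
# placeholders, not the real (domain-specific, e.g. rocksample) expert.
import numpy as np

class QMDPExpertStub:
    def __init__(self, num_actions, num_experts):
        self.num_actions = num_actions
        self.num_experts = num_experts

    def __call__(self, obs, bel):
        # obs: (batch, obs_dim), bel: (batch, num_experts) -> (batch, num_actions) Q-values.
        return np.zeros((np.asarray(obs).shape[0], self.num_actions), dtype=np.float32)

    def flatten_to_belief(self, bel, approximate=True):
        # Domain-specific in the original; this stub just renormalises and returns experts
        # on the first axis, since the calling code transposes the result.
        bel = np.asarray(bel, dtype=np.float32).T
        return bel / np.maximum(bel.sum(axis=0, keepdims=True), 1e-8)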
def learn(env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return U.BatchInput(observation_space_shape, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return act
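# A schematic numpy sketch of the proportional prioritization that PrioritizedReplayBuffer
# is assumed to implement (Schaul et al., 2015): priorities come from |TD error| + eps,
# sampling probabilities use the alpha exponent, and importance weights use beta. The real
# buffer uses segment trees; this only clarifies the alpha/beta/eps parameters above.
import numpy as np

def prioritized_sample_probs(td_errors, alpha=0.6, eps=1e-6):
    priorities = np.abs(td_errors) + eps          # new_priorities, as in the code above
    scaled = priorities ** alpha                  # alpha = 0 -> uniform, alpha = 1 -> greedy
    return scaled / scaled.sum()

def importance_weights(probs, idxes, beta=0.4):
    n = len(probs)
    weights = (n * probs[idxes]) ** (-beta)       # beta is annealed towards 1.0 by beta_schedule
    return weights / weights.max()                # normalised, as in the baselines buffer

td_errors = np.array([0.1, 2.0, 0.5, 0.0])
probs = prioritized_sample_probs(td_errors)
idxes = np.random.choice(len(probs), size=2, p=probs)
weights = importance_weights(probs, idxes)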
replay_buffer = ReplayBuffer(args.replay_buffer_size) # mem = ReplayBuffer(args.memory_capacity) # schedule of epsilon annealing exploration = LinearSchedule(args.final_exploration_step, args.final_exploration, 1) # import pdb # pdb.set_trace() # Training loop dqn.online_net.train() timestamp = 0 for episode in range(args.max_episodes): epsilon = exploration.value(episode) state, done = env.reset(), False if args.agent == 'BootstrappedDQN': k = random.randrange(args.nheads) elif args.agent == 'VariationalDQN': dqn.online_net.freeze_noise() elif args.agent == 'BayesBackpropDQN': dqn.online_net.reset_noise() elif args.agent == 'MNFDQN': dqn.online_net.reset_noise() while not done: timestamp += 1 if args.agent == 'BootstrappedDQN': action = dqn.act_single_head(state[None], k)
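# Sketch of what act_single_head is assumed to do for BootstrappedDQN: the online network
# produces one Q-vector per head, a head k is fixed for the whole episode (sampled above
# with random.randrange(args.nheads)), and actions are greedy with respect to that head
# only. Names here are illustrative; the real network is dqn.online_net.
import numpy as np

def act_single_head_sketch(q_values_per_head, k):
    # q_values_per_head: (nheads, num_actions) array for a single state.
    return int(np.argmax(q_values_per_head[k]))

# e.g. with 3 heads over 2 actions, head 1 is greedy towards action 0:
# act_single_head_sketch(np.array([[0., 1.], [2., 1.], [0., 3.]]), k=1) == 0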
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, initial_exploration_p=1.0, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=100, prioritized_replay=True, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, pretraining_obs=None, pretraining_targets=None, pretrain_steps=1000, pretrain_experience=None, pretrain_num_episodes=0, double_q=True, expert_qfunc=None, aggrevate_steps=0, pretrain_lr=1e-4, sampling_starts=0, beb_agent=None, qvalue_file="qvalue.csv", **network_kwargs): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) beb_agent: takes Q values and suggests actions after adding beb bonus **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) nenvs = env.num_envs print("Bayes-DeepQ:", env.num_envs) print("Total timesteps", total_timesteps) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, train_target, copy_target_to_q, debug = brl_deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), pretrain_optimizer=tf.train.AdamOptimizer(learning_rate=pretrain_lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise, double_q=double_q) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=initial_exploration_p, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") print("Model will be saved at ", model_file) model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) print('Loaded model from {}'.format(load_path)) if pretraining_obs is not None: # pretrain target and copy to qfunc print("Pretrain steps ", pretrain_steps) for i in range(pretrain_steps): pretrain_errors = train_target(pretraining_obs, pretraining_targets) if i % 500 == 0: print("Step {}".format(i), np.mean(pretrain_errors)) if np.mean(pretrain_errors) < 1e-5: break min_rew = 0 # copy all pre-experiences if pretrain_experience is not None: for obs, action, rew, new_obs, done in zip(*pretrain_experience): replay_buffer.add(obs, action, rew, new_obs, float(done)) print("Added {} samples to ReplayBuffer".format( len(replay_buffer._storage))) min_rew = min(rew, min_rew) print("Pretrain Error", np.mean(pretrain_errors)) else: print("Skipping pretraining") update_target() print("Save the pretrained model", model_file) save_variables(model_file) episode_reward = np.zeros(nenvs, dtype=np.float32) saved_mean_reward = None obs = env.reset() reset = True epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_episodes = 0 episode_rewards_history = deque(maxlen=100) episode_step = np.zeros(nenvs, dtype=int) episodes = 0 #scalar start = 0 if expert_qfunc is None: aggrevate_steps = 0 # if pretraining_obs is None or pretraining_obs.size == 0: # episode_rewards = [] # else: # episode_rewards = [[0.0]] * pretrain_num_episodes # start = len(pretraining_obs) # if print_freq is not None: # for t in range(0, 
len(pretraining_obs), print_freq): # logger.record_tabular("steps", t) # logger.record_tabular("episodes", pretrain_num_episodes) # logger.record_tabular("mean 100 episode reward", min_rew) # logger.record_tabular("% time spent exploring", 0) # logger.dump_tabular() # print("pretraining episodes", pretrain_num_episodes, "steps {}/{}".format(t, total_timesteps)) with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") print("Aggrevate: Model will be saved at ", model_file) model_saved = False for i in range(aggrevate_steps): obses_t, values = [], [] for j in range(30): # TODO: 30 should be changed to max horizon? t = np.random.randint(50) + 1 obs = env.reset() for k in range(t): action, value = act(np.array(obs)[None], update_eps=exploration.value(i)) obs, rew, done, _ = env.step(action) obses_t.extend(obs) # Roll out expert policy episode_reward[:] = 0 dones = np.array([False] * obs.shape[0]) for k in range(51 - t): obs, rew, done, _ = env.step( [expert_qfunc.step(o) for o in obs]) dones[done] = True rew[dones] = 0 episode_reward += 0.95**k * rew # TODO: change this to exploration-savvy action # action = np.random.randint(env.action_space.n, size=len(obs)) # Rocksample specific, take sensing actions # prob = np.array([1] * 6 + [2] * (env.action_space.n - 6), dtype=np.float32) # prob = prob / np.sum(prob) # action = np.random.choice(env.action_space.n, p=prob, size=len(action)) # new_obs, rew, done, _ = env.step(action) # value = rew.copy() # value[np.logical_not(done)] += gamma * np.max(expert_qfunc.value(new_obs[np.logical_not(done)]), axis=1) # current_value[tuple(np.array([np.arange(len(action)), action]))] = value # episode reward # episode_reward[np.logical_not(done)] += np.max(current_value[np.logical_not(done)], axis=1) # episode_rewards_history.extend(np.max(current_value, axis=1)) value[tuple([np.arange(len(action)), action])] = episode_reward values.extend(value) print("Aggrevate got {} / {} new data".format( obs.shape[0] * 30, len(obses_t))) # print("Mean expected cost at the explored points", np.mean(np.max(values, axis=1))) for j in range(1000): obs, val = np.array(obses_t), np.array(values) # indices = np.random.choice(len(obs), min(1000, len(obses_t))) aggrevate_errors = train_target(obs, val) if np.mean(aggrevate_errors) < 1e-5: print("Aggrevate Step {}, {}".format(i, j), np.mean(aggrevate_errors)) break print("Aggrevate Step {}, {}".format(i, j), np.mean(aggrevate_errors)) update_target() print("Save the aggrevate model", i, model_file) # Evaluate current policy episode_reward[:] = 0 obs = env.reset() num_episodes = 0 k = np.zeros(len(obs)) while num_episodes < 100: action, _ = act(np.array(obs)[None], update_eps=0.0) # print(action) obs, rew, done, _ = env.step(action) episode_reward += 0.95**k * rew k += 1 for d in range(len(done)): if done[d]: episode_rewards_history.append(episode_reward[d]) episode_reward[d] = 0. k[d] = 0 num_episodes += 1 mean_1000ep_reward = round(np.mean(episode_rewards_history), 2) print("Mean discounted reward", mean_1000ep_reward) logger.record_tabular("mean 100 episode reward", mean_1000ep_reward) logger.dump_tabular() save_variables(model_file) t = 0 # could start from pretrain-steps epoch = 0 while True: epoch += 1 if t >= total_timesteps: break if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. 
# Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True # no randomization # update_eps = 0 print('update_eps', int(100 * exploration.value(t))) qv_error = [] obs = env.reset() for m in range(100): action, q_values = act(np.array(obs)[None], update_eps=update_eps, **kwargs) if beb_agent is not None: action = beb_agent.step(obs, action, q_values, exploration.value(t)) # if expert_qfunc is not None: # v = expert_qfunc.value(obs) # qv_error += [v - q_values[0]] env_action = action reset = False new_obs, rew, done, info = env.step(env_action) if t >= sampling_starts: # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, done) obs = new_obs episode_reward += rew episode_step += 1 for d in range(len(done)): if done[d]: # Episode done. # discount(np.array(rewards), gamma) consider doing discount epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 t += 100 * nenvs if t > learning_starts: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if target_network_update_freq is not None and t > sampling_starts \ and epoch % target_network_update_freq == 0: # Update target network periodically. print("Update target") update_target() mean_1000ep_reward = round(np.mean(episode_rewards_history), 2) num_episodes = episodes if print_freq is not None: logger.record_tabular("steps", t) logger.record_tabular("td errors", np.mean(td_errors)) logger.record_tabular("td errors std", np.std(np.abs(td_errors))) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 1000 episode reward", mean_1000ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() print("episodes", num_episodes, "steps {}/{}".format(t, total_timesteps)) if (checkpoint_freq is not None and t > learning_starts and len(episode_rewards_history) >= 1000): if saved_mean_reward is None or mean_1000ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_1000ep_reward)) print("saving model") save_variables(model_file) model_saved = True saved_mean_reward = mean_1000ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) return act
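# Several of the evaluation loops above accumulate a discounted return incrementally via
# episode_reward += 0.95**k * rew, with a per-env step counter k that resets on done. A
# scalar sketch of that bookkeeping (0.95 matches the discount used in those loops):
def discounted_return(rewards, gamma=0.95):
    ret, step = 0.0, 0
    for rew in rewards:
        ret += gamma ** step * rew   # same incremental update as episode_reward above
        step += 1
    return ret

# discounted_return([1.0, 1.0, 1.0]) == 1.0 + 0.95 + 0.95**2 == 2.8525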
class DQNAgent_Vanila_simple(agent): def __init__(self, model, opt, learning=True): super().__init__() self.memory = ReplayBuffer(3000) self.previous_state = None self.previous_action = None self.previous_legal_actions = None self.step = 0 self.model = model self.opt = opt self.loss = 0 self.batch_size = 10 self.test_q = 0 self.max_tile = 0 #self.test_q = 0 self.epsilon_schedule = LinearSchedule(1000000, initial_p=0.99, final_p=0.01) self.learning = learning def should_explore(self): self.epsilon = self.epsilon_schedule.value(self.step) return random.random() < self.epsilon def action(self): if self.learning: self.step += 1 legalActions = self.legal_actions(deepcopy(self.gb.board)) if len(legalActions) == 0: print(111111111111111111111111111111111111111) board = deepcopy(self.gb.board) board = oneHotMap(board) if self.learning and self.should_explore(): q_values = None action = random.choice(legalActions) choice = self.actions[action] else: #mark state = torch.from_numpy(board).type( torch.FloatTensor).cuda().view(-1, 17, 4, 4) action, q_values = self.predict(state, legalActions) choice = self.actions[action] if self.learning: reward = self.gb.currentReward if reward != 0: reward = np.log2(reward) if (self.previous_state is not None and self.previous_action is not None): self.memory.add(self.previous_state, self.previous_action, self.previous_legal_actions, reward, legalActions, board, 0) self.previous_state = board self.previous_action = action self.previous_legal_actions = legalActions if self.learning: self.update() return choice def enableLearning(self): self.model.train() self.learning = True self.max_tile = 0 self.reset() def disableLearning(self): self.model.eval() self.learning = False def end_episode(self): if not self.learning: m = np.max(self.gb.board) if m > self.max_tile: self.max_tile = m return #print(self.gb.board) board = deepcopy(self.gb.board) board = oneHotMap(board) #legalActions = self.legal_actions(deepcopy(self.gb.board)) #print(legalActions) self.memory.add(self.previous_state, self.previous_action, self.previous_legal_actions, self.gb.currentReward, [], board, 1) self.reset() def reset(self): self.previous_state = None self.previous_action = None self.previous_legal_actions = None def update(self): if self.step < self.batch_size: return batch = self.memory.sample(self.batch_size) (states, actions, legal_actions, reward, next_legal_actions, next_states, is_terminal) = batch terminal = torch.tensor(is_terminal).type(torch.cuda.FloatTensor) reward = torch.tensor(reward).type(torch.cuda.FloatTensor) states = torch.from_numpy(states).type(torch.FloatTensor).cuda().view( -1, 17, 4, 4) next_states = torch.from_numpy(next_states).type( torch.FloatTensor).cuda().view(-1, 17, 4, 4) # Current Q Values _, q_values = self.predict_batch(states) batch_index = torch.arange(self.batch_size, dtype=torch.long) #print(actions) #print(q_values) q_values = q_values[batch_index, actions] #print(q_values) # Calculate target q_actions_next, q_values_next = self.predict_batch( next_states, legalActions=next_legal_actions) #print(q_values_next) q_max = q_values_next.max(1)[0].detach() q_max = (1 - terminal) * q_max # if sum(terminal == 1) > 0: # print(reward) # print( (terminal == 1).nonzero()) # print(terminal) # print(next_legal_actions) # print(q_max) # input() q_target = reward + 0.99 * q_max self.opt.zero_grad() loss = self.model.loss_function(q_target, q_values) loss.backward() self.opt.step() #train_loss = loss_vae.item() + loss_dqn.item() self.loss += loss.item() / len(states) def 
predict_batch(self, input, legalActions=None): input = input #print(legalActions) q_values = self.model(input) if legalActions is None: values, q_actions = q_values.max(1) else: isNotlegal = True # print(legalActions) # print(q_values) q_values_true = torch.full((self.batch_size, 4), -100000000).cuda() for i, action in enumerate(legalActions): q_values_true[i, action] = q_values[i, action] values, q_actions = q_values_true.max(1) q_values = q_values_true #print(q_values_true) ''' while isNotlegal: isNotlegal = False values, q_actions = q_values.max(1) #print(q_values) #print(values) #print(q_actions) for i, action in enumerate(q_actions): #print(legalActions[i]) if len(legalActions[i]) == 0: continue if action.item() not in legalActions[i]: isNotlegal = True # print(i) # print(action.item()) # print(q_values) q_values[i, action] = -1 # print(q_values) # print("*********************") ''' return q_actions, q_values def predict(self, input, legalActions): q_values = self.model(input) for action in range(4): if action not in legalActions: q_values[0, action] = -100000000 action = torch.argmax(q_values) if int(action.item()) not in legalActions: print(legalActions, q_values, action) print("!!!!!!!!!!!!!!!!!!!!!!!!!") return action.item(), q_values def legal_actions(self, copy_gb): legalActions = [] for i in range(4): try_gb = gameboard(4, deepcopy(copy_gb)) changed = try_gb.takeAction(self.actions[i]) if changed: legalActions.append(i) return legalActions '''
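# oneHotMap is used above to turn the 4x4 2048 board into the (17, 4, 4) tensor the network
# expects, but its definition is not shown. A plausible sketch consistent with the
# .view(-1, 17, 4, 4) reshape: channel 0 marks empty cells and channel log2(tile) marks a
# tile of that value (2 -> 1, 4 -> 2, ..., 65536 -> 16). Treat this as an assumption.
import numpy as np

def one_hot_map_sketch(board):
    board = np.asarray(board)
    encoded = np.zeros((17, 4, 4), dtype=np.float32)
    for r in range(4):
        for c in range(4):
            tile = board[r, c]
            channel = 0 if tile == 0 else int(np.log2(tile))
            encoded[channel, r, c] = 1.0
    return encoded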
def learn(env, q_func, num_actions=4, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None): # Create all the functions necessary to train the model # Returns a session that will use <num_cpu> CPUs only sess = U.make_session(num_cpu=num_cpu) sess.__enter__() # Creates a placeholder for a batch of tensors of a given shape and dtype def make_obs_ph(name): return U.BatchInput((64, 64), name=name) # act, train, update_target are functions, debug is a dict act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) # Choose between a prioritized replay buffer and a normal replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1 exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # SC2-specific part starts here # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None path_memory = np.zeros((64, 64)) obs = env.reset() # Select all marines obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) # obs is a tuple, obs[0] is 'pysc2.env.environment.TimeStep', obs[0].observation is a dictionary. player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] # Use path_memory to remember the trajectory traversed so far screen = player_relative + path_memory # Get the center position of the two marines player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: update_param_noise_threshold = -np.log( 1. - exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True # np.array()[None] wraps one extra dimension around the array, e.g.
[1] -> [[1]] action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] reset = False coord = [player[0], player[1]] rew = 0 # There are only four actions (up, down, left, right). After a move, a row of -3 is left along the traversed path so that it cancels out the mineral shard id (=3), indicating the shard was collected. path_memory_ = np.array(path_memory, copy=True) if (action == 0): # UP if (player[1] >= 16): coord = [player[0], player[1] - 16] path_memory_[player[1] - 16:player[1], player[0]] = -3 elif (player[1] > 0): coord = [player[0], 0] path_memory_[0:player[1], player[0]] = -3 elif (action == 1): # DOWN if (player[1] <= 47): coord = [player[0], player[1] + 16] path_memory_[player[1]:player[1] + 16, player[0]] = -3 elif (player[1] > 47): coord = [player[0], 63] path_memory_[player[1]:63, player[0]] = -3 elif (action == 2): # LEFT if (player[0] >= 16): coord = [player[0] - 16, player[1]] path_memory_[player[1], player[0] - 16:player[0]] = -3 elif (player[0] < 16): coord = [0, player[1]] path_memory_[player[1], 0:player[0]] = -3 elif (action == 3): # RIGHT if (player[0] <= 47): coord = [player[0] + 16, player[1]] path_memory_[player[1], player[0]:player[0] + 16] = -3 elif (player[0] > 47): coord = [63, player[1]] path_memory_[player[1], player[0]:63] = -3 # Update path_memory path_memory = np.array(path_memory_) # If the marines cannot be moved, they presumably have not been selected yet, so select them if _MOVE_SCREEN not in obs[0].observation["available_actions"]: obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) # Move the marines new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] # Get the observation returned by the environment obs = env.step(actions=new_action) # Re-read player_relative here, because the obs from the previous line is a tuple carrying several pieces of information, # but all we store in the replay_buffer is the reduced screen image player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] new_screen = player_relative + path_memory # Get the reward rew = obs[0].reward # StepType.LAST means done done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer replay_buffer.add(screen, action, rew, new_screen, float(done)) # Once the transition is stored, replace the old screen with the new one screen = new_screen episode_rewards[-1] += rew if done: # Re-read the friendly/enemy/neutral position map obs = env.reset() # player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] # # Still unclear why path_memory needs to be added here # screen = player_relative + path_memory # player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() # player = [int(player_x.mean()), int(player_y.mean())] # # Select all marines (why do this on the done observation?) # env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) episode_rewards.append(0.0) # Clear path_memory path_memory = np.zeros((64, 64)) reset = True # Periodically sample experience from the replay buffer for training, and update the target network if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # train here comes from deepq.build_train td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # target network if t > learning_starts and t % target_network_update_freq == 0: # Also from deepq.build_train # Update target network periodically update_target() # Log to track the reward mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() # Save the model whenever it improves if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act)
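# The parameter-noise branch above sets the KL threshold with
# -log(1 - eps + eps / num_actions) (Plappert et al., 2017, Appendix C.1), so that the
# perturbed policy is roughly as stochastic as eps-greedy with the current eps. A quick
# numeric check of that expression:
import numpy as np

def param_noise_kl_threshold(eps, num_actions):
    return -np.log(1.0 - eps + eps / float(num_actions))

# e.g. for 4 actions: eps=0.02 -> ~0.0151, eps=1.0 -> log(4) ~ 1.386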
class MemBufferThread(threading.Thread): # Note the variable-arguments concept def __init__(self, mem_queue, max_timesteps=1000000, buffer_size=50000, batch_size=32, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6): threading.Thread.__init__(self) self.mem_queue = mem_queue self.prioritized_replay = prioritized_replay self.batch_size = batch_size self.batch_idxes = None self.prioritized_replay_eps = prioritized_replay_eps # Create the replay buffer if prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(buffer_size) self.beta_schedule = None def __len__(self): return self.replay_buffer.__len__() def sample(self, t): if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(t)) # the choice of this t value is debatable, (obses_t, actions, rewards, obses_tp1, dones, weights, self.batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) # np.ones_like() : Return an array of ones with the same shape and type as a given array. weights, self.batch_idxes = np.ones_like(rewards), None return obses_t, actions, rewards, obses_tp1, dones, weights def update_priorities(self, td_errors): new_priorities = np.abs(td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities(self.batch_idxes, new_priorities) def run(self): # flag = 1 while True: if self.mem_queue.full() is True: print("the mem_queue is full") # if self.replay_buffer.__len__() >= 100000 and self.replay_buffer.__len__() % 100 == 0: # bool(flag): # # print("replay_buffer is 100000 !") # print('') # flag = 0 if self.mem_queue.empty() is not True: single_mem = self.mem_queue.get() self.replay_buffer.add(single_mem[0], single_mem[1], single_mem[2], single_mem[3], single_mem[4])
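# A hedged usage sketch of MemBufferThread: an actor pushes transitions onto mem_queue,
# the thread drains them into its replay buffer, and the trainer samples from it. The
# transition layout (obs, action, rew, new_obs, done) follows the single_mem indexing in
# run(); the queue type and sizes here are illustrative assumptions.
import queue
import numpy as np

mem_queue = queue.Queue(maxsize=10000)
mem_thread = MemBufferThread(mem_queue, buffer_size=50000, batch_size=32)
mem_thread.daemon = True   # let the program exit even though run() loops forever
mem_thread.start()

# Actor side: push transitions as they are generated.
obs = np.zeros(4, dtype=np.float32)
new_obs = np.ones(4, dtype=np.float32)
mem_queue.put((obs, 0, 1.0, new_obs, 0.0))

# Learner side: once enough transitions have accumulated, sample a training batch.
if len(mem_thread) >= 32:
    obses_t, actions, rewards, obses_tp1, dones, weights = mem_thread.sample(t=0)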
def evaluate(self, num_episodes, render=False): with U.make_session(NUM_CORES): self.t0 = time.time() env = self.env.env # Create all the functions necessary to train the model act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name), q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4) ) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. U.initialize() update_target() self.episode_count += 1 state = env.reset() self.scores = [0.0] episode_q = [] for t in itertools.count(): action = act(state[None], update_eps=exploration.value(t))[0] observation, reward, done, _ = env.step(action) replay_buffer.add(state, action, reward, observation, float(done)) state = observation self.scores[-1] += reward episode_q.append(float(debug['q_values'](state[None]).max())) if render: env.render() if done: print('{0}, score: {1} ({2})'.format(len(self.scores), self.scores[-1], np.mean(self.scores[-100:]))) self.evaluation.info['q_values'].append(np.mean(episode_q)) if len(self.scores) >= num_episodes: return self.final_evaluation() state = env.reset() episode_q = [] self.scores.append(0) if self.env.solved(self.scores): self.evaluation.info['solved'] = len(self.scores) # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if t % 1000 == 0: update_target() U.reset() return self.final_evaluation()
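# self.env.solved(...) above is not shown; for CartPole-style tasks it is typically a
# moving-average criterion over recent episode scores. A hedged sketch of that assumed
# check (the 100-episode window and 195.0 threshold are illustrative, not from the source):
import numpy as np

def solved_sketch(scores, window=100, threshold=195.0):
    if len(scores) < window:
        return False
    return float(np.mean(scores[-window:])) >= threshold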
def do_agent_exploration(updates_queue: multiprocessing.Queue, q_func_vars_trained_queue: multiprocessing.Queue, network, seed, config, lr, total_timesteps, learning_starts, buffer_size, exploration_fraction, exploration_initial_eps, exploration_final_eps, train_freq, batch_size, print_freq, checkpoint_freq, gamma, target_network_update_freq, prioritized_replay, prioritized_replay_alpha, prioritized_replay_beta0, prioritized_replay_beta_iters, prioritized_replay_eps, experiment_name, load_path, network_kwargs): env = DotaEnvironment() sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, _, _, debug = deepq.build_train( scope='deepq_act', make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=exploration_initial_eps, final_p=exploration_final_eps) U.initialize() reward_shaper = ActionAdviceRewardShaper(config=config) reward_shaper.load() reward_shaper.generate_merged_demo() full_exp_name = '{}-{}'.format(date.today().strftime('%Y%m%d'), experiment_name) experiment_dir = os.path.join('experiments', full_exp_name) os.makedirs(experiment_dir, exist_ok=True) summary_dir = os.path.join(experiment_dir, 'summaries') os.makedirs(summary_dir, exist_ok=True) summary_writer = tf.summary.FileWriter(summary_dir) checkpoint_dir = os.path.join(experiment_dir, 'checkpoints') os.makedirs(checkpoint_dir, exist_ok=True) stats_dir = os.path.join(experiment_dir, 'stats') os.makedirs(stats_dir, exist_ok=True) with tempfile.TemporaryDirectory() as td: td = checkpoint_dir or td os.makedirs(td, exist_ok=True) model_file = os.path.join(td, "best_model") model_saved = False saved_mean_reward = None # if os.path.exists(model_file): # print('Model is loading') # load_variables(model_file) # logger.log('Loaded model from {}'.format(model_file)) # model_saved = True # elif load_path is not None: # load_variables(load_path) # logger.log('Loaded model from {}'.format(load_path)) def synchronize_q_func_vars(): updates_queue.put( UpdateMessage(UPDATE_STATUS_SEND_WEIGHTS, None, None)) q_func_vars_trained = q_func_vars_trained_queue.get() update_q_func_expr = [] for var, var_trained in zip(debug['q_func_vars'], q_func_vars_trained): update_q_func_expr.append(var.assign(var_trained)) update_q_func_expr = tf.group(*update_q_func_expr) sess.run(update_q_func_expr) synchronize_q_func_vars() episode_rewards = [] act_step_t = 0 while act_step_t < total_timesteps: # Reset the environment obs = env.reset() obs = StatePreprocessor.process(obs) episode_rewards.append(0.0) done = False # Demo preservation variables demo_picked = 0 demo_picked_step = 0 # Demo switching statistics demo_switching_stats = [(0, 0)] # Sample the episode until it is completed act_started_step_t = act_step_t while not done: # Take action and update exploration to the newest value biases, demo_indexes = reward_shaper.get_action_potentials_with_indexes( obs, act_step_t) update_eps = exploration.value(act_step_t) actions, is_randoms = 
act(np.array(obs)[None], biases, update_eps=update_eps) action, is_random = actions[0], is_randoms[0] if not is_random: bias_demo = demo_indexes[action] if bias_demo != demo_switching_stats[-1][1]: demo_switching_stats.append( (act_step_t - act_started_step_t, bias_demo)) if bias_demo != 0 and demo_picked == 0: demo_picked = bias_demo demo_picked_step = act_step_t + 1 pairs = env.step(action) action, (new_obs, rew, done, _) = pairs[-1] logger.log( f'{act_step_t}/{total_timesteps} obs {obs} action {action}' ) # Compute state on the real reward but learn from the normalized version episode_rewards[-1] += rew rew = np.sign(rew) * np.log(1 + np.abs(rew)) new_obs = StatePreprocessor.process(new_obs) if len(new_obs) == 0: done = True else: transition = (obs, action, rew, new_obs, float(done), act_step_t) obs = new_obs act_step_t += 1 if act_step_t - demo_picked_step >= MIN_STEPS_TO_FOLLOW_DEMO_FOR: demo_picked = 0 reward_shaper.set_demo_picked(act_step_t, demo_picked) updates_queue.put( UpdateMessage(UPDATE_STATUS_CONTINUE, transition, demo_picked)) # Post episode logging summary = tf.Summary(value=[ tf.Summary.Value(tag="rewards", simple_value=episode_rewards[-1]) ]) summary_writer.add_summary(summary, act_step_t) summary = tf.Summary( value=[tf.Summary.Value(tag="eps", simple_value=update_eps)]) summary_writer.add_summary(summary, act_step_t) summary = tf.Summary(value=[ tf.Summary.Value(tag="episode_steps", simple_value=act_step_t - act_started_step_t) ]) summary_writer.add_summary(summary, act_step_t) mean_5ep_reward = round(float(np.mean(episode_rewards[-5:])), 1) num_episodes = len(episode_rewards) if print_freq is not None and num_episodes % print_freq == 0: logger.record_tabular("steps", act_step_t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 5 episode reward", mean_5ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(act_step_t))) logger.dump_tabular() # Wait for the learning to finish and synchronize synchronize_q_func_vars() # Record demo_switching_stats if num_episodes % 10 == 0: save_demo_switching_stats(demo_switching_stats, stats_dir, num_episodes) if checkpoint_freq is not None and num_episodes % checkpoint_freq == 0: # Periodically save the model rec_model_file = os.path.join( td, "model_{}_{:.2f}".format(num_episodes, mean_5ep_reward)) save_variables(rec_model_file) # Check whether the model is the best so far if saved_mean_reward is None or mean_5ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_5ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_5ep_reward updates_queue.put(UpdateMessage(UPDATE_STATUS_FINISH, None, None))
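# The actor above tracks episode statistics on the raw reward but stores a compressed
# reward, sign(r) * log(1 + |r|), in the transition it sends to the learner. A quick
# standalone check of that transform (assumption: it is only meant to squash reward
# magnitudes while preserving sign and ordering):
import numpy as np

def squash_reward(rew):
    return np.sign(rew) * np.log1p(np.abs(rew))

for r in [-1000.0, -1.0, 0.0, 1.0, 5.0, 1000.0]:
    print(f"{r:>8.1f} -> {squash_reward(r):+.3f}")
# Large rewards are pulled toward zero, small ones are nearly unchanged,
# and the ordering of rewards is preserved.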
def train(env, eval_env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, my_skill_set=None, log_dir = None, num_eval_episodes=10, render=False, render_eval = False, commit_for = 1 ): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model if my_skill_set: assert commit_for>=1, "commit_for >= 1" save_idx = 0 with U.single_threaded_session() as sess: ## restore if my_skill_set: action_shape = my_skill_set.len else: action_shape = env.action_space.n # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return U.BatchInput(observation_space_shape, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=action_shape, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': action_shape, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() # sess.run(tf.variables_initializer(new_variables)) # sess.run(tf.global_variables_initializer()) update_target() if my_skill_set: ## restore skills my_skill_set.restore_skillset(sess=sess) episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True model_saved = False model_file = os.path.join(log_dir, "model", "deepq") # save the initial act model print("Saving the starting model") os.makedirs(os.path.dirname(model_file), exist_ok=True) act.save(model_file + '.pkl') for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True paction = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] if(my_skill_set): skill_obs = obs.copy() primitive_id = paction rew = 0. 
for _ in range(commit_for): ## break actions into primitives and their params action = my_skill_set.pi(primitive_id=primitive_id, obs = skill_obs.copy(), primitive_params=None) new_obs, skill_rew, done, _ = env.step(action) if render: # print(action) env.render() sleep(0.1) rew += skill_rew skill_obs = new_obs terminate_skill = my_skill_set.termination(new_obs) if done or terminate_skill: break else: action= paction env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) if render: env.render() sleep(0.1) # Store transition in the replay buffer for the outer env replay_buffer.add(obs, paction, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True print("Time:%d, episodes:%d"%(t,len(episode_rewards))) # add hindsight experience if t > learning_starts and t % train_freq == 0: # print('Training!') # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() # print(len(episode_rewards), episode_rewards[-11:-1]) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if (checkpoint_freq is not None and t > learning_starts and num_episodes > 50 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) act.save(model_file + '%d.pkl'%save_idx) save_idx += 1 model_saved = True saved_mean_reward = mean_100ep_reward # else: # print(saved_mean_reward, mean_100ep_reward) if (eval_env is not None) and t > learning_starts and t % target_network_update_freq == 0: # dumping other stats logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("%d time spent exploring", int(100 * exploration.value(t))) print("Testing!") eval_episode_rewards = [] eval_episode_successes = [] for i in range(num_eval_episodes): eval_episode_reward = 0. eval_obs = eval_env.reset() eval_obs_start = eval_obs.copy() eval_done = False while(not eval_done): eval_paction = act(np.array(eval_obs)[None])[0] if(my_skill_set): eval_skill_obs = eval_obs.copy() eval_primitive_id = eval_paction eval_r = 0. 
for _ in range(commit_for): ## break actions into primitives and their params eval_action, _ = my_skill_set.pi(primitive_id=eval_primitive_id, obs = eval_skill_obs.copy(), primitive_params=None) eval_new_obs, eval_skill_rew, eval_done, eval_info = eval_env.step(eval_action) # print('env reward:%f'%eval_skill_rew) if render_eval: print("Render!") eval_env.render() print("rendered!") eval_r += eval_skill_rew eval_skill_obs = eval_new_obs eval_terminate_skill = my_skill_set.termination(eval_new_obs) if eval_done or eval_terminate_skill: break else: eval_action= eval_paction env_action = eval_action reset = False eval_new_obs, eval_r, eval_done, eval_info = eval_env.step(env_action) if render_eval: # print("Render!") eval_env.render() # print("rendered!") eval_episode_reward += eval_r # print("eval_r:%f, eval_episode_reward:%f"%(eval_r, eval_episode_reward)) eval_obs = eval_new_obs eval_episode_success = (eval_info["done"]=="goal reached") if(eval_episode_success): logger.info("success, training epoch:%d,starting config:"%t) eval_episode_rewards.append(eval_episode_reward) eval_episode_successes.append(eval_episode_success) combined_stats = {} # print(eval_episode_successes, np.mean(eval_episode_successes)) combined_stats['eval/return'] = normal_mean(eval_episode_rewards) combined_stats['eval/success'] = normal_mean(eval_episode_successes) combined_stats['eval/episodes'] = (len(eval_episode_rewards)) for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) print("dumping the stats!") logger.dump_tabular() if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) U.load_state(model_file)
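# Both the training and evaluation loops above expand one high-level "primitive" choice
# into up to commit_for low-level environment steps. A small helper capturing that
# pattern; skill_set is a hypothetical object mirroring the pi()/termination() interface
# used above, not the project's actual class.
def roll_out_skill(env, skill_set, primitive_id, obs, commit_for=1, render=False):
    """Execute one skill for at most commit_for steps and sum its reward."""
    total_rew, done, info = 0.0, False, {}
    for _ in range(commit_for):
        action = skill_set.pi(primitive_id=primitive_id, obs=obs.copy(),
                              primitive_params=None)
        obs, rew, done, info = env.step(action)
        if render:
            env.render()
        total_rew += rew
        # Stop early if the episode ends or the skill signals termination.
        if done or skill_set.termination(obs):
            break
    return obs, total_rew, done, info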
def do_network_training(updates_queue: multiprocessing.Queue, weights_queue: multiprocessing.Queue, network, seed, config, lr, total_timesteps, learning_starts, buffer_size, exploration_fraction, exploration_initial_eps, exploration_final_eps, train_freq, batch_size, print_freq, checkpoint_freq, gamma, target_network_update_freq, prioritized_replay, prioritized_replay_alpha, prioritized_replay_beta0, prioritized_replay_beta_iters, prioritized_replay_eps, experiment_name, load_path, network_kwargs): _ = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) def make_obs_ph(name): return ObservationInput(DotaEnvironment.get_observation_space(), name=name) _, train, update_target, debug = deepq.build_train( scope='deepq_train', make_obs_ph=make_obs_ph, q_func=q_func, num_actions=DotaEnvironment.get_action_space().n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, ) if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None U.initialize() update_target() reward_shaper = ActionAdviceRewardShaper(config=config) reward_shaper.load() reward_shaper.generate_merged_demo() full_exp_name = '{}-{}'.format(date.today().strftime('%Y%m%d'), experiment_name) experiment_dir = os.path.join('experiments', full_exp_name) os.makedirs(experiment_dir, exist_ok=True) learning_dir = os.path.join(experiment_dir, 'learning') learning_summary_writer = tf.summary.FileWriter(learning_dir) update_step_t = 0 should_finish = False while not should_finish: message = updates_queue.get() logger.log(f'do_network_training ← {message}') if message.status == UPDATE_STATUS_CONTINUE: transition = message.transition replay_buffer.add(*transition) next_act_step = transition[5] + 1 reward_shaper.set_demo_picked(next_act_step, message.demo_picked) if update_step_t >= learning_starts and update_step_t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(update_step_t)) (obses_t, actions, rewards, obses_tp1, dones, ts, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones, ts = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None biases_t = [] for obs_t, timestep in zip(obses_t, ts): biases_t.append( reward_shaper.get_action_potentials(obs_t, timestep)) biases_tp1 = [] for obs_tp1, timestep in zip(obses_tp1, ts): biases_tp1.append( reward_shaper.get_action_potentials( obs_tp1, timestep + 1)) td_errors, weighted_error = train(obses_t, biases_t, actions, rewards, obses_tp1, biases_tp1, dones, weights) # Loss logging summary = tf.Summary(value=[ tf.Summary.Value(tag='weighted_error', simple_value=weighted_error) ]) learning_summary_writer.add_summary(summary, update_step_t) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if update_step_t % target_network_update_freq == 0: # Update target network periodically. 
update_target() update_step_t += 1 elif message.status == UPDATE_STATUS_SEND_WEIGHTS: q_func_vars = get_session().run(debug['q_func_vars']) weights_queue.put(q_func_vars) elif message.status == UPDATE_STATUS_FINISH: should_finish = True else: logger.log(f'Unknown status in UpdateMessage: {message.status}')
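# In the prioritized-replay branch above, priorities are |td_error| + eps, beta is
# annealed from prioritized_replay_beta0 toward 1.0, and each sampled transition carries
# an importance-sampling weight. A sketch of that weight computation under the usual
# proportional-prioritization scheme (assumption: sampling probabilities are p_i**alpha
# normalized over the buffer):
import numpy as np

def importance_weights(priorities, sampled_idx, alpha, beta):
    probs = priorities ** alpha
    probs = probs / probs.sum()
    n = len(priorities)
    # w_i = (N * P(i)) ** (-beta), normalized by the largest weight for stability.
    weights = (n * probs[sampled_idx]) ** (-beta)
    return weights / weights.max()

priorities = np.array([0.5, 0.1, 2.0, 0.01]) + 1e-6
print(importance_weights(priorities, np.array([0, 2]), alpha=0.6, beta=0.4))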
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=3000, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=3000, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, **network_kwargs ): sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name), q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), # gamma=gamma, # grad_norm_clipping=10, # param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(100000), initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. U.initialize() update_target() old_state = None formula_LTLf_1 = "!F(die)" monitoring_RightToLeft = MonitoringSpecification( ltlf_formula=formula_LTLf_1, r=1, c=-10, s=1, f=-10 ) monitoring_specifications = [monitoring_RightToLeft] stepCounter = 0 done = False def RightToLeftConversion(observation) -> TraceStep: print(stepCounter) if(done and not(stepCounter>=199)): die=True else: die=False dictionary={'die': die} print(dictionary) return dictionary multi_monitor = MultiRewardMonitor( monitoring_specifications=monitoring_specifications, obs_to_trace_step=RightToLeftConversion ) episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) episodeCounter=0 num_episodes=0 for t in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] #print(action) new_obs, rew, done, _ = env.step(action) stepCounter+=1 rew, is_perm = multi_monitor(new_obs) old_state=new_obs # Store transition in the replay buffer. 
replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 if episodeCounter % 100 == 0 or episodeCounter<1: # Show off the result #print("coming here Again and Again") env.render() if done: episodeCounter+=1 num_episodes+=1 obs = env.reset() episode_rewards.append(0) multi_monitor.reset() stepCounter=0 else: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if t % 1000 == 0: update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) if done and len(episode_rewards) % 10 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean 100 episode reward", round(np.mean(episode_rewards[-101:-1]), 1)) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 500 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) act.save_act() #save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward # if model_saved: # if print_freq is not None: # logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) # load_variables(model_file) return act
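# The checkpointing logic above saves the model only when the rolling mean episode
# reward improves on the best value seen so far. The same pattern in isolation;
# save_fn is a stand-in for whatever persists the weights (here, act.save_act()),
# purely illustrative:
def maybe_checkpoint(mean_reward, saved_mean_reward, save_fn, verbose=True):
    """Return the new best mean reward, saving the model when it improves."""
    if saved_mean_reward is None or mean_reward > saved_mean_reward:
        if verbose:
            print("Saving model due to mean reward increase: "
                  "{} -> {}".format(saved_mean_reward, mean_reward))
        save_fn()
        return mean_reward
    return saved_mean_reward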
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=5, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, **network_kwargs): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the trained model from. (default: None)(used in test stage) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) med_libs = MedLibs() '''Define Q network inputs: observation place holder(make_obs_ph), num_actions, scope, reuse outputs(tensor of shape batch_size*num_actions): values of each action, Q(s,a_{i}) ''' q_func = build_q_func(network, **network_kwargs) ''' To put observations into a placeholder ''' # TODO: Can only deal with Discrete and Box observation spaces for now # observation_space = env.observation_space (default) # Use sub_obs_space instead observation_space = med_libs.subobs_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) ''' Customize action ''' # TODO: subset of action space. action_dim = med_libs.sub_act_dim ''' Returns: deepq.build_train() act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. act is computed by [build_act] or [build_act_with_param_noise] train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. update_target: () -> () copy the parameters from optimized Q function to the target Q function. debug: {str: function} a bunch of functions to print debug data like q_values. ''' act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=action_dim, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, double_q=True, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': action_dim, } '''Contruct an act object using ActWrapper''' act = ActWrapper(act, act_params) ''' Create the replay buffer''' if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None '''Create the schedule for exploration starting from 1.''' exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) ''' Initialize all the uninitialized variables in the global scope and copy them to the target network. ''' U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() sub_obs = med_libs.custom_obs(obs) # TODO: customize observations pre_obs = obs reset = True mydict = med_libs.action_dict already_starts = False with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: # load_path: a trained model/policy load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) ''' Training loop starts''' t = 0 while t < total_timesteps: if callback is not None: if callback(locals(), globals()): break kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). update_param_noise_threshold = -np.log(1. 
- exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True ''' Choose action: take action and update exploration to the newest value ''' # TODO: Mixed action strategy # Normal status, action is easily determined by rules, use [obs] action = med_libs.simple_case_action(obs) # Distraction status, action is determined by Q, with [sub_obs] if action == -10: action = act(np.array(sub_obs)[None], update_eps=update_eps, **kwargs)[0] action = med_libs.action_Q_env( action ) # TODO:action_Q_env, from Q_action(0~2) to env_action(2~4) reset = False ''' Step action ''' new_obs, rew, done, d_info = env.step(action) d_att_last = int(pre_obs[0][0]) d_att_now = int(obs[0][0]) d_att_next = int(new_obs[0][0]) #TODO: you can customize reward here. ''' Store transition in the replay buffer.''' pre_obs = obs obs = new_obs sub_new_obs = med_libs.custom_obs(new_obs) if (d_att_last == 0 and d_att_now == 1) and not already_starts: already_starts = True if already_starts and d_att_now == 1: replay_buffer.add(sub_obs, action, rew, sub_new_obs, float(done)) episode_rewards[-1] += rew # Sum of rewards t = t + 1 print( '>> Iteration:{}, State[d_att,cd_activate,L4_available,ssl4_activate,f_dc]:{}' .format(t, sub_obs)) print( 'Dis_Last:{}, Dis_Now:{}, Dis_Next:{},Reward+Cost:{}, Action:{}' .format( d_att_last, d_att_now, d_att_next, rew, list(mydict.keys())[list( mydict.values()).index(action)])) # update sub_obs sub_obs = sub_new_obs # Done and Reset if done: print('Done infos: ', d_info) print('======= end =======') obs = env.reset() sub_obs = med_libs.custom_obs(obs) # TODO: custom obs pre_obs = obs # TODO: save obs at t-1 already_starts = False episode_rewards.append(0.0) reset = True # Update the Q network parameters if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # Calculate td-errors actions = med_libs.action_env_Q( actions ) # TODO:action_env_Q, from env_action(2~4) to Q_action(0~2) td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically, copy weights of Q to target Q update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) return act
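# The med_libs.action_Q_env / action_env_Q calls above translate between the Q-network's
# action indices and the environment's action ids. Going by the inline comments
# (Q actions 0~2 correspond to env actions 2~4) this is a fixed offset; the sketch below
# assumes that offset is exactly 2 and is only illustrative -- the real mapping lives in med_libs.
import numpy as np

Q_TO_ENV_OFFSET = 2  # hypothetical constant implied by the comments above

def action_q_to_env(q_actions):
    return np.asarray(q_actions) + Q_TO_ENV_OFFSET

def action_env_to_q(env_actions):
    return np.asarray(env_actions) - Q_TO_ENV_OFFSET

assert action_env_to_q(action_q_to_env([0, 1, 2])).tolist() == [0, 1, 2]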
def learn(env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return BatchInput(observation_space_shape, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) load_state(model_file) return act
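# When param_noise is enabled, the loop above replaces eps-greedy exploration with a
# perturbation threshold of -log(1 - eps + eps/|A|), described in the comments as making
# the KL divergence between perturbed and non-perturbed policies comparable to eps-greedy
# exploration with the same eps. Evaluating the formula for a few eps values (illustrative only):
import numpy as np

def param_noise_threshold(eps, num_actions):
    return -np.log(1.0 - eps + eps / float(num_actions))

for eps in (1.0, 0.5, 0.1, 0.02):
    print(eps, param_noise_threshold(eps, num_actions=4))
# The threshold shrinks toward 0 as exploration is annealed, so the allowed
# policy perturbation shrinks with it.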
def learn( env, q_func, num_actions=3, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, #NO discounted reward target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None, demo_replay=[]): #Create functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((64, 64), name=name) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize U, reward, obs, environment U.initialize() #update_target() #WHAT DOES THIS DO OR HOW DO I DO THIS IF I AM NOT USING DEEPQ.BUILD_TRAIN episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # Init action_vector action_vector = [[]] track_unit_vector = np.array([[]]) #Init feature_vector feature_vector = [[]] #feature_vector = FeatureObservation.PopulateFeatureVector(env, obs) #print(feature_vector) # Initialize Unit LastActionTaken Vector - This is for determining which units need to select actions still! time_between_actions = 9.0 #frames?? #ActionVector = np.array([[]], dtype=[('unit_id', 'int'), ('x_pos', 'float'), ('y_pos', 'float'), # ('last_action', 'float')]) #unit_count_for_ActionVector = 0 #for unit in feature_vector: # if unit[1] == 1: #Add unit identifier to last action taken vector # np.append(ActionVector, [unit_count_for_ActionVector, unit[2], unit[3], (-1) * time_between_actions], axis=1) # unit_count_for_ActionVector = unit_count_for_ActionVector + 1 reset = True # WHAT IS RESET with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") First = True for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break #EXPLORATION SPACE kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
- exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True #Populate Observation Vector feature_vector = FeatureObservation.PopulateFeatureVector( env, obs, feature_vector, action_vector) print(feature_vector) track_unit_vector = TrackUnits.track(track_unit_vector, feature_vector) print(track_unit_vector) #for each unit in the vector, get an action for that unit... for u in range(0, feature_vector.shape[0]): if feature_vector[u][1] == 1: xy = UnitAction.take_action(feature_vector, u, q_func) obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [[ 0 ], [feature_vector[u][3], feature_vector[u][2]]]) ]) #if movement then move #if action then action obs = env.step(actions=[ sc2_actions.FunctionCall(_ATTACK_SCREEN, [[0], xy]) ]) #update track_unit_vector #DO ACTIONS obs, screen, player = common.select_marine(env, obs) #get action from training model thingy #action = act() reset = False rew = 0 new_action = None #obs, new_action = common.marine_action(env, obs, player, action) new_screen = obs[0].observation["screen"][_PLAYER_RELATIVE] army_count = env._obs.observation.player_common.army_count rew += obs[0].reward / army_count game_info = sc_pb.ResponseGameInfo feature_vector = FeatureObservation.PopulateFeatureVector(env, obs) output = q_func(feature_vector) print(output) # available_actions = obs[0].observation["available_actions"] # for i in available_actions: # print(i) # print("") # #select marine and see what we can do now... # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_POINT, [[0], [feature_vector[0][2], # feature_vector[0][3]]])]) # obs = env.step(actions=[sc2_actions.FunctionCall(_ATTACK_SCREEN, [_NOT_QUEUED, [feature_vector[12][2], # feature_vector[12][3]]])]) # available_actions = obs[0].observation["available_actions"] # for i in available_actions: # print(i) # print("") for t in range(max_timesteps): for unit in FeatureVector: game_info = sc_pb.ResponseGameInfo #obs, screen, player = common.select_marine(env, obs) action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] #reset = False #rew = 0 #new_action = None #obs, new_action = common.marine_action(env, obs, player, action) #Make decisions for each ally unit based on Feature Vector fed into #for unit in FeatureVector: # if unit[1] == 1: # Then Friendly needs to make decision #do things #if ally army count > 0 make army actions try: if army_count > 0 and _ATTACK_SCREEN in obs[0].observation[ "available_actions"]: obs = env.step(actions=new_action) else: new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) except Exception as e: print(e)
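# The try/except above falls back to a no-op whenever the attack action is not currently
# available. A compact version of that guard; the function ids are resolved through
# pysc2's FUNCTIONS table here for self-containment, whereas the snippet above uses
# module-level constants (illustrative sketch, not the project's helper).
from pysc2.lib import actions as sc2_actions

_NO_OP = sc2_actions.FUNCTIONS.no_op.id
_ATTACK_SCREEN = sc2_actions.FUNCTIONS.Attack_screen.id

def safe_attack(obs, xy):
    """Attack the screen point xy if possible, otherwise do nothing."""
    if _ATTACK_SCREEN in obs[0].observation["available_actions"]:
        return [sc2_actions.FunctionCall(_ATTACK_SCREEN, [[0], xy])]
    return [sc2_actions.FunctionCall(_NO_OP, [])]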
def learn( env, q_func, # input obs,num od actions etc and obtain q value for each action num_actions=16, # available actions: up down left right lr=5e-4, max_timesteps=100000, buffer_size=50000, # size of the replay buffer exploration_fraction=0.1, # during the first 10% training period, exploration rate is decreased from 1 to 0.02 exploration_final_eps=0.02, # final value of random action probability train_freq=1, # update the model every `train_freq` steps. batch_size=32, # size of a batched sampled from replay buffer for training print_freq=1, checkpoint_freq=10000, learning_starts=1000, # time for the model to collect transitions before learning starts gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, # beta keeps to be beta0 prioritized_replay_eps=1e-6, num_cpu=16, # number of cpus to use for training param_noise=False, # whether or not to use parameter space noise param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph( name ): # Creates a placeholder for a batch of tensors of a given shape and dtype return U_b.BatchInput((16, 16), name=name) act_x, train_x, update_target_x, debug_x = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, # clip gradient norms to this value scope="deepq_x") act_y, train_y, update_target_y, debug_y = deepq.build_train( #because there are two players in the game make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq_y") act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer_x = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) replay_buffer_y = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule_x = LinearSchedule( prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, # 0.4->1 final_p=1.0) beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer_x = ReplayBuffer(buffer_size) replay_buffer_y = ReplayBuffer(buffer_size) beta_schedule_x = None beta_schedule_y = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. ---环境初始化 U.initialize() update_target_x() update_target_y() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # start a new episode # Select all marines first ---选择所有个体,获得新的观察 obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) # Apply actions, step the world forward, and return observations. 
# Inspect the player_relative feature map in the returned observation: value 1 marks our own units' positions and value 3 marks mineral positions (this is the matrix printed to the terminal) player_relative = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] #obs is a 'TimeStep' whose type is tuple of ['step_type', 'reward', 'discount', 'observation'];step_type.first or mid or last # Mineral locations as a 0/1 matrix screen = (player_relative == _PLAYER_NEUTRAL).astype( int ) #+ path_memory screen=1 or 0 to indicate the location of mineral # Friendly unit locations, given as row/column indices player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero( ) #the location of team member: row, col <-> y,x # print(player_relative) # print('*************') # print(screen) # print(_PLAYER_FRIENDLY) # # print(player_x) # print(player_y) # if (len(player_x) == 0): # player_x = np.array([0]) # # print('player_x from null to 0') # # print(player_x) # if (len(player_y) == 0): # player_y = np.array([0]) # # print('player_y from null to 0') # # print(player_y) player = [int(player_x.mean()), int(player_y.mean())] reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join("model/", "mineral_shards") # path where the model checkpoint is saved print(model_file) for t in range(max_timesteps): # print('timestep=',t) if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) # yields a value annealed from 1 down to 0.02 update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. - exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True # actions obtained after exploration action_x = act_x(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] # print('action_x is ',action_x) action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] # print('action_y is ',action_y) reset = False # coord = [player[0], player[1]] rew = 0 #reward coord = [action_x, action_y] if _MOVE_SCREEN not in obs[0].observation["available_actions"]: obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] # else: # new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) player_relative = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] # print(player_relative) new_screen = (player_relative == _PLAYER_NEUTRAL).astype(int) # print(_PLAYER_FRIENDLY) # print(player_x) # print(player_y) # if (len(player_x) == 0): # player_x = np.array([0]) # # print('player_x from null to 0') # # print(player_x) # if (len(player_y) == 0): # player_y = np.array([0]) # # print('player_y from null to 0') # # print(player_y) # player = [int(player_x.mean()), int(player_y.mean())] rew = obs[0].reward done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. 
replay_buffer_x.add(screen, action_x, rew, new_screen, float(done)) replay_buffer_y.add(screen, action_y, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew reward = episode_rewards[-1] if done: obs = env.reset() # player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] # screent = (player_relative == _PLAYER_NEUTRAL).astype(int) # # player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() # player = [int(player_x.mean()), int(player_y.mean())] # Select all marines first env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) episode_rewards.append(0.0) # print("episode_rewards is ", episode_rewards) print('num_episodes is', len(episode_rewards)) #episode_minerals.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: #train_freq=1: update the model every `train_freq` steps # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience_x = replay_buffer_x.sample( batch_size, beta=beta_schedule_x.value(t)) (obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x, weights_x, batch_idxes_x) = experience_x experience_y = replay_buffer_y.sample( batch_size, beta=beta_schedule_y.value(t)) (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y else: obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x = replay_buffer_x.sample( batch_size) weights_x, batch_idxes_x = np.ones_like( rewards_x ), None # weights_x is an array padded with 1 which has the same shape as rewards_x obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample( batch_size) weights_y, batch_idxes_y = np.ones_like(rewards_y), None td_errors_x = train_x(obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x, weights_x) td_errors_y = train_y(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y) if prioritized_replay: new_priorities_x = np.abs( td_errors_x) + prioritized_replay_eps new_priorities_y = np.abs( td_errors_y) + prioritized_replay_eps replay_buffer_x.update_priorities(batch_idxes_x, new_priorities_x) replay_buffer_y.update_priorities(batch_idxes_y, new_priorities_y) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target_x() update_target_y() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) # round: sishewuru value num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("reward", reward) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act_x), ActWrapper(act_y)
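# The observation fed to both Q-functions above is a binary screen marking neutral units
# (the mineral shards), and the marines' own position is taken as the mean of their
# (y, x) pixels. The same extraction in isolation; the 16x16 resolution matches the
# BatchInput((16, 16)) placeholder above, and the constants mirror the PySC2
# player_relative encoding used here (1 = friendly, 3 = neutral):
import numpy as np

_PLAYER_FRIENDLY, _PLAYER_NEUTRAL = 1, 3

def extract_features(player_relative):
    mineral_map = (player_relative == _PLAYER_NEUTRAL).astype(int)
    ys, xs = (player_relative == _PLAYER_FRIENDLY).nonzero()
    # Guard against frames where no friendly unit is visible (the commented-out
    # checks in the loop above handle the same case).
    player = [int(xs.mean()), int(ys.mean())] if len(xs) else [0, 0]
    return mineral_map, player

demo = np.zeros((16, 16), dtype=int)
demo[3, 4] = _PLAYER_FRIENDLY
demo[10, 12] = _PLAYER_NEUTRAL
print(extract_features(demo))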
def learn( env, var_func, cvar_func, nb_atoms, run_alpha=None, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=0.95, target_network_update_freq=500, num_cpu=4, callback=None, periodic_save_freq=1000000, periodic_save_path=None, grad_norm_clip=None, ): """Train a CVaR DQN model. Parameters ------- env: gym.Env environment to train on var_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. cvar_func: function same as var_func nb_atoms: int number of atoms used in CVaR discretization run_alpha: float optimize CVaR_alpha while running. None if you want random alpha each episode. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the best model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. periodic_save_freq: int How often do we save the model - periodically periodic_save_path: str Where do we save the model - periodically grad_norm_clip: float Clip gradient to this value. No clipping if None Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/distdeepq/categorical.py for details on the act function. """ # Create all the functions necessary to train the model sess = make_session(num_cpu=num_cpu) sess.__enter__() obs_space_shape = env.observation_space.shape def make_obs_ph(name): return U.BatchInput(obs_space_shape, name=name) act, train, update_target, debug = build_train( make_obs_ph=make_obs_ph, var_func=var_func, cvar_func=cvar_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, nb_atoms=nb_atoms, grad_norm_clipping=grad_norm_clip) act_params = { 'make_obs_ph': make_obs_ph, 'cvar_func': cvar_func, 'var_func': var_func, 'num_actions': env.action_space.n, 'nb_atoms': nb_atoms } # Create the replay buffer replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. 
exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True episode = 0 alpha = 1. # --------------------------------- RUN --------------------------------- with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): print('Target reached') model_saved = False break # Take action and update exploration to the newest value update_eps = exploration.value(t) update_param_noise_threshold = 0. action = act(np.array(obs)[None], alpha, update_eps=update_eps)[0] reset = False new_obs, rew, done, _ = env.step(action) # ===== DEBUG ===== # s = np.ones_like(np.array(obs)[None]) # a = np.ones_like(act(np.array(obs)[None], run_alpha, update_eps=update_eps)) # r = np.array([0]) # s_ = np.ones_like(np.array(obs)[None]) # d = np.array([False]) # s = obs[None] # a = np.array([action]) # r = np.array([rew]) # s_ = new_obs[None] # d = np.array([done]) # if t % 100 == 0: # for f in debug: # print(f(s, a, r, s_, d)) # print('-------------') # # # print([sess.run(v) for v in tf.global_variables('cvar_dqn/cvar_func')]) # # print([sess.run(v) for v in tf.global_variables('cvar_dqn/var_func')]) # ================= # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if run_alpha is None: alpha = np.random.random() if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() # Log results and periodically save the model mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.record_tabular("(current alpha)", "%.2f" % alpha) logger.dump_tabular() # save and report best model if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward # save periodically if periodic_save_freq is not None and periodic_save_path is not None and t > learning_starts: if t % periodic_save_freq == 0: ActWrapper(act, act_params).save("{}-{}.pkl".format( periodic_save_path, int(t / periodic_save_freq))) if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act, act_params)
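# For reference, a tiny self-contained numpy illustration of what the CVaR
# quantities above refer to: with nb_atoms equally likely return atoms, CVaR at
# level alpha is (approximately) the average of the worst alpha-fraction of
# atoms. This is only a hedged sketch of the concept, not the library's
# discretization.
import numpy as np

def cvar_from_atoms(atoms, alpha):
    """Approximate CVaR_alpha as the mean of the lowest alpha-fraction of atoms."""
    atoms = np.sort(np.asarray(atoms, dtype=float))
    k = max(1, int(np.ceil(alpha * len(atoms))))
    return atoms[:k].mean()

atoms = np.random.randn(50)          # e.g. nb_atoms = 50 sampled returns
print(cvar_from_atoms(atoms, 0.25))  # CVaR at alpha = 0.25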
# Create the replay buffer
replay_buffer = create_replay_buffer(replay, 50000)
# Create the schedule for exploration starting from 1 (every action is random) down to
# 0.02 (only 2% of actions stay random).
exploration = LinearSchedule(schedule_timesteps=300000,
                             initial_p=1.0,
                             final_p=0.02)
# Initialize the parameters and copy them to the target network.
U.initialize()
update_target()

episode_rewards = [0.0]
obs = env.reset()
for t in itertools.count():
    # Take action and update exploration to the newest value
    action = act(obs[None], update_eps=exploration.value(t))[0]
    new_obs, rew, done, _ = env.step(action)
    # Store transition in the replay buffer.
    if replay != 'None':
        replay_buffer.add(obs, action, rew, new_obs, float(done))
    obs = new_obs

    episode_rewards[-1] += rew
    if done:
        obs = env.reset()
        episode_rewards.append(0)

    is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
    if is_solved:
        break
    else:
def learn( policy, env, seed, training, use_adda, adda_lr, adda_batch, total_timesteps=int(80e6), lrschedule='linear', nsteps=20, max_grad_norm=None, lr=7e-4, epsilon=0.1, alpha=0.99, gamma=0.99, log_interval=1000, #alpha and epsilon for RMSprop used in Model() exploration_fraction=0.8, exploration_final_eps=0.001, target_network_update_freq=10000 ): # Additional arguments for epsilon greedy tf.reset_default_graph() set_global_seeds(seed) nenvs = env.num_envs # 16, from train() in run_doom.py -> used by env, which is a parameter in learn() print('Num Envs {}'.format(nenvs)) ob_space = env.observation_space # (84,84,1) ac_space = env.action_space # Discrete(6) print('RL SEED: ', seed) model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, use_adda=use_adda, adda_lr=adda_lr, adda_batch=adda_batch, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule, seed=seed) print('Model Obj created') #import sys; sys.exit() runner = Runner(env, model, nsteps=nsteps, gamma=gamma) if training: nbatch = nenvs * nsteps # 16*20 exploration = LinearSchedule( schedule_timesteps=50000000, initial_p=1.0, final_p=exploration_final_eps ) # U want to hit lowest epsilon value in 50e6 steps model.update_target() tstart = time.time() save_step = 0 for update in range( 1, total_timesteps // nbatch + 1 ): # For 100k steps, loop is from 1 to 313 -> runs 312 updates update_eps = exploration.value(update * nbatch) # Performs 1 update step (320 total_timesteps). For 16 envs nd nstep = 20, shapes r: (16*20,84,84,1), (320,), (320,) resp obs, rewards, actions = runner.run(update_eps) action_value_loss, cur_lr = model.train( obs, rewards, actions, update) # Computes TD Error nseconds = time.time() - tstart fps = int((update * nbatch) / nseconds) # Save model every 1e6 steps (each iteration of loop makes 320 steps. 320*3125 = 1e6 steps. So update % 3125) if update % 3125 == 0 or update == 1: model.save_model(save_step) save_step += 1 #print('Model Saved') # Update target network every 10k steps if update % 31 == 0: #print('Target Network Updated') model.update_target() if update % log_interval == 0 or update == 1: logger.record_tabular("learning rate", cur_lr) #logger.record_tabular("adda learning rate", cur_adda_lr) logger.record_tabular("epsilon", update_eps) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update * nbatch) logger.record_tabular("fps", fps) logger.record_tabular("action_value_loss", float(action_value_loss)) #logger.record_tabular("mapping_loss", float(mapping_loss_val)) #logger.record_tabular("adversary_loss", float(adversary_loss_val)) logger.record_tabular("time_elapsed", nseconds) logger.dump_tabular() else: snapshots = [66] seeds = [0] #snapshot_rewards = np.zeros(shape=(seeds, snapshots+1)) seed_list = ['Seed 0', 'Seed 1', 'Seed 2'] for seed in seeds: #seed = seed + 1 snapshot_reward = [] #snapshot_health = [] print('################### Seed {}!!! 
###################'.format( seed)) tstart = time.time() #for snapshot in range(snapshots+1): for snapshot in snapshots: model.load_model(snapshot, seed, adda_mode=True) #print('##################################################') print('Evaluating snapshot {}!!!'.format(snapshot)) reward = runner.runner_eval_parallel(num_episodes=1, num_envs=nenvs) #reward = runner.runner_eval(num_episodes=1000) snapshot_reward.append(reward) #snapshot_health.append(health) print('Mean Reward of every ST decLR 40e6 snapshot on target: ', snapshot_reward) print('Max Reward: ', max(snapshot_reward)) #snapshot_rewards[seed] = snapshot_reward #print('Mean Health of every snapshot: ', snapshot_health) print('##################################################') nseconds = time.time() - tstart print('\n') print('Time Elapsed:', nseconds) epochs = np.arange(0, snapshots + 1) #plt.figure() #plt.plot(epochs, np.array(snapshot_reward), '-o', label = seed_list[seed]) #plt.legend(loc = 'lower right') #plt.xlabel('TimeSteps (1e6)') #plt.ylabel('Mean Reward after 1000 episodes') #plt.savefig('TargetEnv_on_SourceModel with ADDA every 10 steps after 20e6.png') # Create plot of mean reward of all seed values with std devns mean = [] std = [] for x, y, z in zip(snapshot_rewards[0], snapshot_rewards[1], snapshot_rewards[2]): mean_val = np.mean([x, y, z]) std_val = np.std([x, y, z]) mean.append(mean_val) std.append(std_val) epochs = np.arange(0, snapshots + 1) lower = np.array(mean) - np.array(std) upper = np.array(mean) + np.array(std) print('Mean of 3 seeds: ', mean) print('Std Devn of 3 seeds: ', std) '''
class DQNEvaluator(Evaluator): def __init__(self, config, env_creator): self.config = config self.local_timestep = 0 self.episode_rewards = [0.0] self.episode_lengths = [0.0] if "cartpole" in self.config["env_config"]: self.env = env_creator(self.config["env_config"]) else: self.env = wrap_deepmind( env_creator(self.config["env_config"]), clip_rewards=False, frame_stack=True, scale=True) self.obs = self.env.reset() self.sess = U.make_session() self.sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = self.env.observation_space.shape def make_obs_ph(name): return BatchInput(observation_space_shape, name=name) if "cartpole" in self.config["env_config"]: q_func = models.mlp([64]) else: q_func = models.cnn_to_mlp( convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True, ) act, self.train, self.update_target, debug = build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=self.env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=self.config["lr"]), gamma=self.config["gamma"], grad_norm_clipping=10, param_noise=False ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': self.env.action_space.n, } self.act = ActWrapper(act, act_params) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.config["exploration_fraction"] * self.config["schedule_max_timesteps"]), initial_p=1.0, final_p=self.config["exploration_final_eps"]) # Initialize the parameters and copy them to the target network. U.initialize() self.update_target() def sample(self): obs, actions, rewards, new_obs, dones = [], [], [], [], [] for _ in range( self.config["sample_batch_size"] + self.config["n_step"] - 1): update_eps = self.exploration.value(self.local_timestep) action = self.act( np.array(self.obs)[None], update_eps=update_eps)[0] obs_tp1, reward, done, _ = self.env.step(action) obs.append(self.obs) actions.append(action) rewards.append(np.sign(reward)) new_obs.append(obs_tp1) dones.append(1.0 if done else 0.0) self.obs = obs_tp1 self.episode_rewards[-1] += reward self.episode_lengths[-1] += 1 if done: self.obs = self.env.reset() self.episode_rewards.append(0.0) self.episode_lengths.append(0.0) self.local_timestep += 1 # N-step Q adjustments if self.config["n_step"] > 1: # Adjust for steps lost from truncation self.local_timestep -= (self.config["n_step"] - 1) adjust_nstep( self.config["n_step"], self.config["gamma"], obs, actions, rewards, new_obs, dones) batch = SampleBatch({ "obs": obs, "actions": actions, "rewards": rewards, "new_obs": new_obs, "dones": dones, "weights": np.ones_like(rewards)}) assert batch.count == self.config["sample_batch_size"] # td_errors = self.agent.compute_td_error(batch) batch.data["obs"] = [pack(o) for o in batch["obs"]] batch.data["new_obs"] = [pack(o) for o in batch["new_obs"]] # new_priorities = ( # np.abs(td_errors) + self.config["prioritized_replay_eps"]) # batch.data["weights"] = new_priorities return batch def compute_gradients(self, samples): raise NotImplementedError def apply_gradients(self, grads): raise NotImplementedError def compute_apply(self, samples): return self.train( samples["obs"], samples["actions"], samples["rewards"], samples["new_obs"], samples["dones"], samples["weights"]) def get_weights(self): raise NotImplementedError def set_weights(self, weights): raise NotImplementedError def stats(self): mean_100ep_reward = 
round(np.mean(self.episode_rewards[-101:-1]), 5) mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 5) return { "mean_100ep_reward": mean_100ep_reward, "mean_100ep_length": mean_100ep_length, "num_episodes": len(self.episode_rewards), "local_timestep": self.local_timestep, }
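# The sampler above pads each rollout with n_step - 1 extra transitions and then
# calls adjust_nstep to turn 1-step rewards into n-step returns. A generic,
# self-contained sketch of that reward transformation follows (terminal
# handling omitted; this is not the imported adjust_nstep implementation):
import numpy as np

def nstep_returns(rewards, gamma, n):
    """Discounted n-step returns for a single non-terminating reward sequence."""
    rewards = np.asarray(rewards, dtype=float)
    out = np.zeros(len(rewards) - n + 1)
    for i in range(len(out)):
        out[i] = sum(gamma**j * rewards[i + j] for j in range(n))
    return out

print(nstep_returns([1.0, 0.0, 2.0, 1.0], gamma=0.99, n=3))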
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, temperature, prioritization, env_name, alpha, beta0, beta_iters, eps, max_timesteps, rank_method, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] self.prioritization = prioritization self.env_name = env_name self.temperature = temperature self.rank_method = rank_method # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) stage_shapes['w'] = (None, ) self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. 
buffer_shapes = { key: (self.T if key != 'o' else self.T + 1, *input_shapes[key]) for key, val in input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T + 1, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size if self.prioritization == 'energy': self.buffer = ReplayBufferEnergy(buffer_shapes, buffer_size, self.T, self.sample_transitions, self.prioritization, self.env_name) elif self.prioritization == 'tderror': self.buffer = PrioritizedReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions, alpha, self.env_name) if beta_iters is None: beta_iters = max_timesteps self.beta_schedule = LinearSchedule(beta_iters, initial_p=beta0, final_p=1.0) else: self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn( *u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * ( self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def get_td_errors(self, o, g, u): o, g = self._preprocess_og(o, g, g) vals = [self.td_error_tf] r = np.ones((o.reshape(-1, self.dimo).shape[0], 1)) feed = { self.target.o_tf: o.reshape(-1, self.dimo), self.target.g_tf: g.reshape(-1, self.dimg), self.bath_tf_r: r, self.main.o_tf: o.reshape(-1, self.dimo), self.main.g_tf: g.reshape(-1, self.dimg), self.main.u_tf: u.reshape(-1, self.dimu) } td_errors = self.sess.run(vals, feed_dict=feed) td_errors = td_errors.copy() return td_errors def store_episode(self, episode_batch, dump_buffer, w_potential, w_linear, w_rotational, rank_method, clip_energy, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ if self.prioritization == 'tderror': self.buffer.store_episode(episode_batch, dump_buffer) elif self.prioritization == 'energy': self.buffer.store_episode(episode_batch, w_potential, w_linear, w_rotational, rank_method, clip_energy) else: self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode_batch) if self.prioritization == 'energy': if not self.buffer.current_size == 0 and not len( episode_batch['ag']) == 0: transitions = self.sample_transitions( episode_batch, 
num_normalizing_transitions, 'none', 1.0, True) elif self.prioritization == 'tderror': transitions, weights, episode_idxs = \ self.sample_transitions(self.buffer, episode_batch, num_normalizing_transitions, beta=0) else: transitions = self.sample_transitions( episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions[ 'g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def dump_buffer(self, epoch): self.buffer.dump_buffer(epoch) def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! critic_loss, actor_loss, Q_grad, pi_grad, td_error = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf, self.td_error_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad, td_error def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self, t): if self.prioritization == 'energy': transitions = self.buffer.sample(self.batch_size, self.rank_method, temperature=self.temperature) weights = np.ones_like(transitions['r']).copy() elif self.prioritization == 'tderror': transitions, weights, idxs = self.buffer.sample( self.batch_size, beta=self.beta_schedule.value(t)) else: transitions = self.buffer.sample(self.batch_size) weights = np.ones_like(transitions['r']).copy() o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o_2, ag_2, g) transitions['w'] = weights.flatten().copy() # note: ordered dict transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] if self.prioritization == 'tderror': return (transitions_batch, idxs) else: return transitions_batch def stage_batch(self, t, batch=None): # if batch is None: if self.prioritization == 'tderror': batch, idxs = self.sample_batch(t) else: batch = self.sample_batch(t) assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) if self.prioritization == 'tderror': return idxs def train(self, t, dump_buffer, stage=True): if not self.buffer.current_size == 0: if stage: if self.prioritization == 'tderror': idxs = self.stage_batch(t) else: self.stage_batch(t) critic_loss, actor_loss, Q_grad, pi_grad, td_error = self._grads() if self.prioritization == 'tderror': new_priorities = np.abs(td_error) + self.eps # td_error if dump_buffer: T = self.buffer.buffers['u'].shape[1] episode_idxs = idxs // T t_samples = idxs % T batch_size = td_error.shape[0] with self.buffer.lock: for i in range(batch_size): self.buffer.buffers['td'][episode_idxs[i]][ t_samples[i]] = td_error[i] self.buffer.update_priorities(idxs, new_priorities) self._update(Q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): 
res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) batch_tf['w'] = tf.reshape(batch_tf['w'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.td_error_tf = tf.stop_gradient(target_tf) - self.main.Q_tf self.errors_tf = tf.square(self.td_error_tf) self.errors_tf = tf.reduce_mean(batch_tf['w'] * self.errors_tf) self.Q_loss_tf = tf.reduce_mean(self.errors_tf) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()

    def logs(self, prefix=''):
        logs = []
        logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))]
        logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))]
        logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))]
        logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))]

        if prefix != '' and not prefix.endswith('/'):
            return [(prefix + '/' + key, val) for key, val in logs]
        else:
            return logs

    def __getstate__(self):
        """Our policies can be loaded from pkl, but after unpickling you cannot continue training.
        """
        excluded_subnames = [
            '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main',
            'target', 'lock', 'env', 'sample_transitions', 'stage_shapes',
            'create_actor_critic'
        ]

        state = {
            k: v
            for k, v in self.__dict__.items()
            if all([subname not in k for subname in excluded_subnames])
        }
        state['buffer_size'] = self.buffer_size
        state['tf'] = self.sess.run(
            [x for x in self._global_vars('') if 'buffer' not in x.name])
        return state

    def __setstate__(self, state):
        if 'sample_transitions' not in state:
            # We don't need this for playing the policy.
            state['sample_transitions'] = None
            state['env_name'] = None  # No need for playing the policy
        self.__init__(**state)
        # set up stats (they are overwritten in __init__)
        for k, v in state.items():
            if k[-6:] == '_stats':
                self.__dict__[k] = v
        # load TF variables
        vars = [x for x in self._global_vars('') if 'buffer' not in x.name]
        assert (len(vars) == len(state["tf"]))
        node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])]
        self.sess.run(node)
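# The target network in the DDPG class above is updated with Polyak averaging
# (see update_target_net_op). A self-contained numpy illustration of that
# update rule, with made-up values:
import numpy as np

def polyak_update(target, main, polyak=0.95):
    """target <- polyak * target + (1 - polyak) * main"""
    return polyak * target + (1.0 - polyak) * main

target_w = np.zeros(3)
main_w = np.ones(3)
for _ in range(5):
    target_w = polyak_update(target_w, main_w)
print(target_w)  # drifts slowly towards main_w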
def learn(env, q_func, num_actions=64 * 64, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((64, 64), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] episode_minerals = [0.0] saved_mean_reward = None path_memory = np.zeros((64, 64)) obs = env.reset() # Select all marines first step_result = env.step( actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] obs = player_relative + path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 32): obs = shift(LEFT, player[0] - 32, obs) elif (player[0] < 32): obs = shift(RIGHT, 32 - player[0], obs) if (player[1] > 32): obs = shift(UP, player[1] - 32, obs) elif (player[1] < 32): obs = shift(DOWN, 32 - player[1], obs) reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
- exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] reset = False coord = [player[0], player[1]] rew = 0 path_memory_ = np.array(path_memory, copy=True) if (action == 0): #UP if (player[1] >= 16): coord = [player[0], player[1] - 16] path_memory_[player[1] - 16:player[1], player[0]] = -1 elif (player[1] > 0): coord = [player[0], 0] path_memory_[0:player[1], player[0]] = -1 else: rew -= 1 elif (action == 1): #DOWN if (player[1] <= 47): coord = [player[0], player[1] + 16] path_memory_[player[1]:player[1] + 16, player[0]] = -1 elif (player[1] > 47): coord = [player[0], 63] path_memory_[player[1]:63, player[0]] = -1 else: rew -= 1 elif (action == 2): #LEFT if (player[0] >= 16): coord = [player[0] - 16, player[1]] path_memory_[player[1], player[0] - 16:player[0]] = -1 elif (player[0] < 16): coord = [0, player[1]] path_memory_[player[1], 0:player[0]] = -1 else: rew -= 1 elif (action == 3): #RIGHT if (player[0] <= 47): coord = [player[0] + 16, player[1]] path_memory_[player[1], player[0]:player[0] + 16] = -1 elif (player[0] > 47): coord = [63, player[1]] path_memory_[player[1], player[0]:63] = -1 else: rew -= 1 else: #Cannot move, give minus reward rew -= 1 if (path_memory[coord[1], coord[0]] != 0): rew -= 0.5 path_memory = np.array(path_memory_) #print("action : %s Coord : %s" % (action, coord)) new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] step_result = env.step(actions=new_action) player_relative = step_result[0].observation["screen"][ _PLAYER_RELATIVE] new_obs = player_relative + path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 32): new_obs = shift(LEFT, player[0] - 32, new_obs) elif (player[0] < 32): new_obs = shift(RIGHT, 32 - player[0], new_obs) if (player[1] > 32): new_obs = shift(UP, player[1] - 32, new_obs) elif (player[1] < 32): new_obs = shift(DOWN, 32 - player[1], new_obs) rew += step_result[0].reward * 10 done = step_result[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew episode_minerals[-1] += step_result[0].reward if done: obs = env.reset() player_relative = obs[0].observation["screen"][ _PLAYER_RELATIVE] obs = player_relative + path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 32): obs = shift(LEFT, player[0] - 32, obs) elif (player[0] < 32): obs = shift(RIGHT, 32 - player[0], obs) if (player[1] > 32): obs = shift(UP, player[1] - 32, obs) elif (player[1] < 32): obs = shift(DOWN, 32 - player[1], obs) # Select all marines first env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) episode_rewards.append(0.0) episode_minerals.append(0.0) path_memory = np.zeros((64, 64)) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) mean_100ep_mineral = round(np.mean(episode_minerals[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("mean 100 episode mineral", mean_100ep_mineral) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act)
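# The agent above keeps the marine centred by calling shift(...) on the 64x64
# observation whenever the unit is off-centre. Below is a hedged sketch of one
# plausible shift helper using np.roll; the real helper and its edge handling
# are not shown in this file, so treat the constants and the wrap-around
# behaviour here as assumptions.
import numpy as np

UP, DOWN, LEFT, RIGHT = 0, 1, 2, 3

def shift(direction, n, matrix):
    """Shift a 2-D map n cells in the given direction (wrapping at the edges)."""
    if direction == UP:
        return np.roll(matrix, -n, axis=0)
    if direction == DOWN:
        return np.roll(matrix, n, axis=0)
    if direction == LEFT:
        return np.roll(matrix, -n, axis=1)
    return np.roll(matrix, n, axis=1)

obs = np.zeros((64, 64))
obs[40, 20] = 1                   # marine at row 40, column 20
obs = shift(UP, 40 - 32, obs)     # recentre vertically
obs = shift(RIGHT, 32 - 20, obs)  # recentre horizontally
print(np.argwhere(obs == 1))      # -> [[32 32]]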
class DeepqLearner: def __init__(self, env, q_func, config=DEEPQ_CONFIG, callback=None): self.env = env self.q_func = q_func self.config = config self.callback = callback # Create all the functions necessary to train the model gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=config["gpu_memory_fraction"]) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph def make_obs_ph(name): return ObservationInput(env.observation_space, name=name) act, self.train, self.update_target, self.debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=config["lr"]), gamma=config["gamma"], grad_norm_clipping=10, param_noise=config["param_noise"]) act_params = { # 'make_obs_ph': make_obs_ph, # 'q_func': q_func, 'num_actions': env.action_space.n, } self.act = ActWrapper(act, act_params) # Create the replay buffer self.config = config self.replay_buffer = None self.beta_schedule = None self.make_replay_buffer() # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(config["exploration_fraction"] * config["max_timesteps"]), initial_p=1.0, final_p=config["exploration_final_eps"]) # Initialize the parameters and copy them to the target network. U.initialize() self.update_target() self.t = 0 self.episode_rewards = [0.0] self.num_episodes = 1 self.saved_mean_reward = None self.saved_episode_num = None self.episode_frames = 0 self.model_file = None self.start_time = 0 self.episode_start_time = 0 def make_replay_buffer(self): if self.config["prioritized_replay"]: self.replay_buffer = PrioritizedReplayBuffer( self.config["buffer_size"], alpha=self.config["prioritized_replay_alpha"]) if self.config["prioritized_replay_beta_iters"] is None: self.config["prioritized_replay_beta_iters"] = self.config[ "max_timesteps"] self.beta_schedule = LinearSchedule( self.config["prioritized_replay_beta_iters"], initial_p=self.config["prioritized_replay_beta0"], final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.config["buffer_size"]) self.beta_schedule = None def run(self): reset = True obs = self.env.reset() self.start_time = time.time() self.episode_start_time = time.time() with tempfile.TemporaryDirectory() as td: td = self.config["checkpoint_path"] or td self.model_file = os.path.join(td, "model") if tf.train.latest_checkpoint(td) is not None: load_state(self.model_file) logger.log('Loaded model from {}'.format(self.model_file)) for self.t in range(self.config["max_timesteps"]): if self.callback is not None: if self.callback(locals(), globals()): break # Determine next action to take, then take that action and observe results action = self._action(obs, reset) env_action = action new_obs, rew, done, _ = self.env.step(env_action) self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs # Increment typical values reset = False self.episode_frames += 1 self.episode_rewards[-1] += rew # See if done with episode if done: obs = self._reset() reset = True # Do training and deepq updating as needed if self.t > self.config["learning_starts"]: if self.t % self.config["train_freq"] == 0: self._train() if self.t % self.config["target_network_update_freq"] == 0: self.update_target() def _action(self, obs, reset): # Take action and update exploration to the newest value kwargs = {} if not 
self.config["param_noise"]: update_eps = self.exploration.value(self.t) # update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. - self.exploration.value(self.t) + self.exploration.value(self.t) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True return self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] def _train(self): try: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.config["prioritized_replay"]: experience = self.replay_buffer.sample( self.config["batch_size"], beta=self.beta_schedule.value(self.t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.config["batch_size"]) weights, batch_idxes = np.ones_like(rewards), None # Determine errors td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones, weights) if self.config["prioritized_replay"]: new_priorities = np.abs( td_errors) + self.config["prioritized_replay_eps"] self.replay_buffer.update_priorities(batch_idxes, new_priorities) except Exception as e: self.make_replay_buffer() print(e) def _reset(self): self.attempt_print() self.attempt_checkpoint() self.episode_rewards.append(0.0) self.num_episodes += 1 self.episode_frames = 0 self.episode_start_time = time.time() return self.env.reset() def calc_mean_100ep_reward(self): if self.num_episodes <= 1: return None return round(np.mean(self.episode_rewards[-101:-1]), 1) def attempt_print(self): p_freq = self.config["print_freq"] if p_freq is not None and self.num_episodes % p_freq == 0: logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.t))) logger.record_tabular("reward - current", self.episode_rewards[-1]) logger.record_tabular("reward - mean", self.calc_mean_100ep_reward()) logger.record_tabular("reward - saved", self.saved_mean_reward) logger.record_tabular("episode # - current", self.num_episodes) logger.record_tabular("episode # - saved", self.saved_episode_num) logger.record_tabular("steps - total", self.t) logger.record_tabular("steps - episode", self.episode_frames) logger.record_tabular( "time - ep duration", str(time.time() - self.episode_start_time) + "s") logger.record_tabular("time - remaining", self.estimate_time_remaining()) logger.dump_tabular() def estimate_time_remaining(self): duration = time.time() - self.start_time if duration <= 0: return "Unknown" time_remaining = self.t / duration * (self.config["max_timesteps"] - self.t) / 60.0 suffix = "" # Format based on time if time_remaining < MINUTE: suffix = " seconds" elif time_remaining < HOUR: suffix = " minutes" time_remaining = time_remaining / MINUTE elif time_remaining < DAY: suffix = " hours" time_remaining = time_remaining / HOUR else: suffix = " days" time_remaining = time_remaining / DAY # Round remaining time and return time_remaining = round(time_remaining * 100.0) / 100.0 return str(time_remaining) + suffix def attempt_checkpoint(self): # Determine if we're going to checkpoint c_freq = self.config["checkpoint_freq"] if c_freq is not None \ and 
self.num_episodes > 100 \ and self.t > self.config["learning_starts"] \ and self.num_episodes % c_freq == 0: # Determine if reward is growing mean_100ep_reward = self.calc_mean_100ep_reward() if self.saved_mean_reward is None or mean_100ep_reward > self.saved_mean_reward: if self.config["print_freq"] is not None: logger.log( "Saving model due to mean reward increase: {} -> {}". format(self.saved_mean_reward, mean_100ep_reward)) self.saved_mean_reward = mean_100ep_reward self.saved_episode_num = self.num_episodes save_state(self.model_file) def save(self, save_path): print("Saving model to " + save_path) self.act.save(save_path)
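# Hedged usage sketch (assumed wiring, not taken from the source): driving the
# DeepqLearner above is just construct -> run -> save. The CartPole env and the
# baselines mlp q_func are illustrative choices, and the DEEPQ_CONFIG defaults
# are relied on for everything else.
import gym
from baselines.deepq import models

env = gym.make("CartPole-v0")
learner = DeepqLearner(env, q_func=models.mlp([64]))
learner.run()
learner.save("cartpole_deepq.pkl")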
def learn(env, q_func, num_actions=4, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((16, 16), name=name) act_x, train_x, update_target_x, debug_x = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope='deep_x') act_y, train_y, update_target_y, debug_y = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope='deep_y') act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer_x = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) replay_buffer_y = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule_x = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer_x = ReplayBuffer(buffer_size) replay_buffer_y = ReplayBuffer(buffer_size) beta_schedule_x = None beta_schedule_y = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) U.initialize() update_target_x() update_target_y() episode_rewards = [0.0] episode_beacons = [0.0] saved_mean_reward = None obs = env.reset() # Select marines obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = (player_relative == _PLAYER_NEUTRAL).astype(int) player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] #print(np.array(screen)[None].shape) reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join("model/", "mineral_shards") print(model_file) for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
                        - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            action_x = act_x(np.array(screen)[None],
                             update_eps=update_eps,
                             **kwargs)[0]
            action_y = act_y(np.array(screen)[None],
                             update_eps=update_eps,
                             **kwargs)[0]
            reset = False
            rew = 0
            coord = [action_x, action_y]

            change_x = coord[0] - player[0]
            change_y = coord[1] - player[1]
            change_m = np.sqrt((change_x**2) + (change_y**2))

            if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
                obs = env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])
            else:
                new_action = [
                    sc2_actions.FunctionCall(_MOVE_SCREEN,
                                             [_NOT_QUEUED, coord])
                ]
                obs = env.step(actions=new_action)

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            new_screen = (player_relative == _PLAYER_NEUTRAL).astype(int)

            player_y, player_x = (
                player_relative == _PLAYER_FRIENDLY).nonzero()
            player = [int(player_x.mean()), int(player_y.mean())]

            rew = obs[0].reward * 10
            done = obs[0].step_type == environment.StepType.LAST

            replay_buffer_x.add(screen, action_x, rew, new_screen, float(done))
            replay_buffer_y.add(screen, action_y, rew, new_screen, float(done))

            screen = new_screen
            episode_rewards[-1] += rew
            episode_beacons[-1] += obs[0].reward
            reward = episode_rewards[-1]

            if done:
                obs = env.reset()
                player_relative = obs[0].observation["screen"][
                    _PLAYER_RELATIVE]
                screen = (player_relative == _PLAYER_NEUTRAL).astype(int)

                player_y, player_x = (
                    player_relative == _PLAYER_FRIENDLY).nonzero()
                player = [int(player_x.mean()), int(player_y.mean())]

                env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])
                episode_rewards.append(0.0)
                episode_beacons.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience_x = replay_buffer_x.sample(
                        batch_size, beta=beta_schedule_x.value(t))
                    (obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x,
                     weights_x, batch_idxes_x) = experience_x
                    experience_y = replay_buffer_y.sample(
                        batch_size, beta=beta_schedule_y.value(t))
                    (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y,
                     weights_y, batch_idxes_y) = experience_y
                else:
                    obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x = replay_buffer_x.sample(
                        batch_size)
                    weights_x, batch_idxes_x = np.ones_like(rewards_x), None
                    obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(
                        batch_size)
                    weights_y, batch_idxes_y = np.ones_like(rewards_y), None

                td_errors_x = train_x(obses_t_x, actions_x, rewards_x,
                                      obses_tp1_x, dones_x, weights_x)
                td_errors_y = train_y(obses_t_y, actions_y, rewards_y,
                                      obses_tp1_y, dones_y, weights_y)

                if prioritized_replay:
                    new_priorities_x = np.abs(
                        td_errors_x) + prioritized_replay_eps
                    new_priorities_y = np.abs(
                        td_errors_y) + prioritized_replay_eps
                    replay_buffer_x.update_priorities(batch_idxes_x,
                                                      new_priorities_x)
                    replay_buffer_y.update_priorities(batch_idxes_y,
                                                      new_priorities_y)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
update_target_x() update_target_y() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) mean_100ep_beacon = round(np.mean(episode_beacons[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("mean 100 episode beacon", mean_100ep_beacon) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act)
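# The loop above factors the SC2 move target into two independent DQNs, one per
# screen axis, both trained on the same scalar reward. A minimal hedged sketch of
# that factored action selection; select_xy, q_net_x, q_net_y and screen_size are
# illustrative names only, not functions defined in this codebase.
import numpy as np

def select_xy(q_net_x, q_net_y, screen, eps, screen_size=16):
    """Pick an (x, y) move target with per-axis epsilon-greedy exploration."""
    coords = []
    for q_net in (q_net_x, q_net_y):
        if np.random.rand() < eps:
            coords.append(np.random.randint(screen_size))          # explore this axis
        else:
            coords.append(int(np.argmax(q_net(screen[None])[0])))  # greedy on this axis
    return coords[0], coords[1]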
class LineModel_PPO1: REWARD_RIVAL_DMG = 250 def __init__( self, statesize, actionsize, hero, ob, ac, policy_func=None, update_target_period=100, scope="ppo1", schedule_timesteps=10000, initial_p=0, final_p=0, gamma=0.99, lam=0.95, optim_epochs=4, optim_stepsize=1e-3, # optimization hypers schedule='linear', max_timesteps=40e6): self.act = None self.train = None self.update_target = None self.debug = None self.state_size = statesize self.action_size = actionsize # 50=8*mov+10*attack+10*skill1+10*skill2+10*skill3+回城+hold self.gamma = gamma # discount rate self.lam = lam self.hero = hero self.scope = scope # todo:英雄1,2普攻距离为2,后续需修改 self.att_dist = 2 self.act_times = 0 self.train_times = 0 self.update_target_period = update_target_period self.exploration = LinearSchedule(schedule_timesteps=5000, initial_p=initial_p, final_p=final_p) # Initialize history arrays self.obs = np.array([ob for _ in range(update_target_period)]) self.rews = np.zeros(update_target_period, 'float32') self.vpreds = np.zeros(update_target_period, 'float32') self.news = np.zeros(update_target_period, 'int32') self.acs = np.array([ac for _ in range(update_target_period)]) self.prevacs = self.acs.copy() self.schedule = schedule self.max_timesteps = max_timesteps self.t = 0 self.optim_epochs = optim_epochs self.optim_stepsize = optim_stepsize self.ep_rets = [] self.ep_lens = [] self.cur_ep_ret = 0 self.cur_ep_len = 0 self.lenbuffer = deque( maxlen=100) # rolling buffer for episode lengths self.rewbuffer = deque( maxlen=100) # rolling buffer for episode rewards self.episodes_so_far = 0 self.timesteps_so_far = 0 self.iters_so_far = 0 self.exploration = LinearSchedule( schedule_timesteps=schedule_timesteps, initial_p=initial_p, final_p=final_p) policy_func = LinePPOModel if LinePPOModel is None else policy_func self._build_model(input_space=statesize, action_size=actionsize, policy_func=policy_func) self.tstart = time.time() def _build_model( self, input_space, action_size, policy_func, clip_param=0.2, entcoeff=0.01, # clipping parameter epsilon, entropy coeff adam_epsilon=1e-5): sess = U.get_session() if sess is None: sess = U.make_session(8) sess.__enter__() # Setup losses and stuff # ---------------------------------------- with tf.variable_scope(self.scope): self.pi = policy_func( "pi", input_space, action_size) # Construct network for new policy self.oldpi = policy_func("oldpi", input_space, action_size) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = self.pi.pdtype.sample_placeholder([None]) kloldnew = self.oldpi.pd.kl(self.pi.pd) ent = self.pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(self.pi.pd.logp(ac) - self.oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(self.pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss var_list = self.pi.get_trainable_variables() # more debug info debug_atarg = atarg pi_ac = self.pi.pd.logp(ac) opi_ac = 
self.oldpi.pd.logp(ac) vpred = U.mean(self.pi.vpred) pi_pd = U.mean(self.pi.pd.flatparam()) opi_pd = self.oldpi.pd.flatparam()[0] kl_oldnew = kloldnew[0] grads = tf.gradients(total_loss, var_list) losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] debugs = [ debug_atarg, pi_ac, opi_ac, vpred, pi_pd, opi_pd, kl_oldnew, total_loss ] self.lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + debugs + [var_list, grads] + [U.flatgrad(total_loss, var_list)]) self.adam = MpiAdam(var_list, epsilon=adam_epsilon) self.assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame( self.oldpi.get_variables(), self.pi.get_variables()) ]) self.compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() self.adam.sync() def load(self, name): saver = tf.train.Saver(var_list=tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope)) sess = U.get_session() saver.restore(sess, name) def save(self, name): saver = tf.train.Saver(var_list=tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope)) sess = U.get_session() saver.save(sess, name) @staticmethod def gen_input(cur_state, hero_name, rival_hero): cur_line_input = Line_Input_Lite(cur_state, hero_name, rival_hero) cur_state_input = cur_line_input.gen_line_input() return cur_state_input def remember(self, cur_state, new_state, vpred, prevac): hero_name = self.hero action = cur_state.get_hero_action(hero_name) if action is not None: selected_action_idx = action.output_index reward = action.reward # 暂时将1v1的rival_hero 定义为对面英雄 for hero in cur_state.heros: if hero.hero_name != hero_name: rival_hero = hero.hero_name break cur_line_input = Line_Input_Lite(cur_state, hero_name, rival_hero) cur_state_input = cur_line_input.gen_line_input() new_line_input = Line_Input_Lite(new_state, hero_name, rival_hero) new_state_input = new_line_input.gen_line_input() new = True if new_state.get_hero(hero_name).hp <= 0 else False i = self.t % self.update_target_period self.obs[i] = cur_state_input self.vpreds[i] = vpred self.news[i] = new self.acs[i] = selected_action_idx self.prevacs[i] = prevac self.rews[i] = reward self.t += 1 self.cur_ep_ret += reward self.cur_ep_len += 1 if new: self.ep_rets.append(self.cur_ep_ret) self.ep_lens.append(self.cur_ep_len) self.cur_ep_ret = 0 self.cur_ep_len = 0 def get_memory_size(self): return self.iters_so_far def add_vtarg_and_adv(self, seg, gamma, lam): """ Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) """ new = np.append( seg["new"], 0 ) # last element is only used for last vtarg, but we already zeroed it if last new = 1 vpred = np.append(seg["vpred"], seg["nextvpred"]) T = len(seg["rew"]) seg["adv"] = gaelam = np.empty(T, 'float32') rew = seg["rew"] lastgaelam = 0 for t in reversed(range(T)): nonterminal = 1 - new[t + 1] delta = rew[t] + gamma * vpred[t + 1] * nonterminal - vpred[t] gaelam[ t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam # print('gaelam', gaelam[t], 'rew', rew[t], 'vpred_t+1', vpred[t+1], 'vpred_t', vpred[t]) seg["tdlamret"] = seg["adv"] + seg["vpred"] # 需要下一次行动的vpred,所以需要在执行完一次act之后计算是否replay def replay(self, seg_list, batch_size): print(self.scope + " training") if self.schedule == 'constant': cur_lrmult = 1.0 elif self.schedule == 'linear': cur_lrmult = max( 1.0 - float(self.timesteps_so_far) / self.max_timesteps, 0) # Here we do a bunch of optimization epochs over the data # 批量计算的思路是,每次将所有战斗的g值得到,然后求平均,优化。循环多次 newlosses_list = [] logger.log("Optimizing...") loss_names = 
["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] logger.log(fmt_row(13, loss_names)) for _ in range(self.optim_epochs): g_list = [] for seg in seg_list: self.add_vtarg_and_adv(seg, self.gamma, self.lam) # print(seg) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg[ "adv"], seg["tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not self.pi.recurrent) if hasattr(self.pi, "ob_rms"): self.pi.ob_rms.update( ob) # update running mean/std for policy self.assign_old_eq_new( ) # set old parameter values to new parameter values # 完整的拿所有行为 batch = d.next_batch(d.n) # print("ob", batch["ob"], "ac", batch["ac"], "atarg", batch["atarg"], "vtarg", batch["vtarg"]) *newlosses, debug_atarg, pi_ac, opi_ac, vpred, pi_pd, opi_pd, kl_oldnew, total_loss, var_list, grads, g = \ self.lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) # print("debug_atarg", debug_atarg, "pi_ac", pi_ac, "opi_ac", opi_ac, "vpred", vpred, "pi_pd", pi_pd, # "opi_pd", opi_pd, "kl_oldnew", kl_oldnew, "var_mean", np.mean(g), "total_loss", total_loss) if np.isnan(np.mean(g)): print('output nan, ignore it!') else: g_list.append(g) newlosses_list.append(newlosses) # 批量计算之后求平均在优化模型 if len(g_list) > 0: avg_g = np.mean(g_list, axis=0) self.adam.update(avg_g, self.optim_stepsize * cur_lrmult) logger.log(fmt_row(13, np.mean(newlosses_list, axis=0))) logger.log("Evaluating losses...") losses = [] for seg in seg_list: self.add_vtarg_and_adv(seg, self.gamma, self.lam) # print(seg) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not self.pi.recurrent) # 完整的拿所有行为 batch = d.next_batch(d.n) newlosses = self.compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) print(losses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): if np.isinf(lossval): debug = True logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(self.flatten_lists, zip(*listoflrpairs)) self.lenbuffer.extend(lens) self.rewbuffer.extend(rews) last_rew = self.rewbuffer[-1] if len(self.rewbuffer) > 0 else 0 logger.record_tabular("LastRew", last_rew) logger.record_tabular( "LastLen", 0 if len(self.lenbuffer) <= 0 else self.lenbuffer[-1]) logger.record_tabular("EpLenMean", np.mean(self.lenbuffer)) logger.record_tabular("EpRewMean", np.mean(self.rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) self.episodes_so_far += len(lens) self.timesteps_so_far += sum(lens) self.iters_so_far += 1 logger.record_tabular("EpisodesSoFar", self.episodes_so_far) logger.record_tabular("TimestepsSoFar", self.timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - self.tstart) 
logger.record_tabular("IterSoFar", self.iters_so_far) logger.record_tabular("CalulateActions", self.act_times) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() def flatten_lists(self, listoflists): return [el for list_ in listoflists for el in list_] def get_actions(self, state_inputs): self.act_times += len(state_inputs) stochastic = True explor_value = self.exploration.value(self.act_times) actions, vpreds = self.pi.acts(stochastic=stochastic, update_eps=explor_value, ob=state_inputs) return actions, explor_value, vpreds def get_action(self, state_input): self.act_times += 1 stochastic = True explor_value = self.exploration.value(self.act_times) actions, vpred = self.pi.act(stochastic=stochastic, update_eps=explor_value, ob=state_input) actions = np.array([actions]) return actions, explor_value, vpred @staticmethod # 只使用当前帧(做决定帧)+下一帧来计算奖惩,目的是在游戏结束时候可以计算所有之前行为的奖惩,不会因为需要延迟n下而没法计算 # 另外最核心的是,ppo本身就不要要求奖惩值是根据上一个行动来得到的 def cal_target_ppo(prev_state, cur_state, next_state, hero_name, rival_hero_name, line_idx): # 只计算当前帧的得失,得失为金币获取情况 + 敌方血量变化 # 获得小兵死亡情况, 根据小兵属性计算他们的金币情况 cur_rival_hero = cur_state.get_hero(rival_hero_name) rival_team = cur_rival_hero.team cur_hero = cur_state.get_hero(hero_name) cur_rival_hero = cur_state.get_hero(rival_hero_name) next_hero = next_state.get_hero(hero_name) next_rival_hero = next_state.get_hero(rival_hero_name) # 找到英雄附近死亡的敌方小兵 dead_units = StateUtil.get_dead_units_in_line( next_state, rival_team, line_idx, cur_hero, StateUtil.GOLD_GAIN_RADIUS) dead_golds = sum([ StateUtil.get_unit_value(u.unit_name, u.cfg_id) for u in dead_units ]) dead_unit_str = (','.join([u.unit_name for u in dead_units])) # 如果英雄有小额金币变化,则忽略 gold_delta = next_hero.gold - cur_hero.gold if gold_delta % 10 == 3 or gold_delta % 10 == 8 or gold_delta == int( dead_golds / 2) + 3: gold_delta -= 3 # 很难判断英雄的最后一击,所以我们计算金币变化,超过死亡单位一半的金币作为英雄获得金币 gold_delta = gold_delta * 2 - dead_golds if gold_delta < 0: print('获得击杀金币不应该小于零', cur_state.tick, 'dead_units', dead_unit_str, 'gold_gain', (next_hero.gold - cur_hero.gold)) gold_delta = 0 # if dead_golds > 0: # print('dead_gold', dead_golds, 'delta_gold', gold_delta, "hero", hero_name, "tick", cur_state.tick) # 计算对指定敌方英雄造成的伤害,计算接受的伤害 # 伤害信息和击中信息都有延迟,在两帧之后(但是一般会出现在同一条信息中,偶尔也会出现在第二条中) # 这里只计算下一帧中英雄对对方造成的伤害 # 扩大自己受到伤害的惩罚 # 扩大对方低血量下受到伤害的奖励 # 扩大攻击伤害的权重 # TODO 防御型辅助型法术的定义,辅助法术不能乱放,否则惩罚 dmg = next_state.get_hero_total_dmg( hero_name, rival_hero_name) / float(cur_rival_hero.maxhp) dmg *= 3 * cur_rival_hero.maxhp / float(cur_rival_hero.hp + cur_rival_hero.maxhp) # 估算玩家接收的伤害时候,只考虑下一帧中的变化,像塔的攻击需要飞行所有有延迟这种情况这里不需要考虑 self_hp_loss = (cur_hero.hp - next_hero.hp) / float(cur_hero.maxhp) / 2 if ( cur_hero.hp >= next_hero.hp >= next_hero.hp) else 0 self_hp_loss *= 3 * cur_hero.maxhp / float(cur_hero.hp + cur_hero.maxhp) dmg_delta = int((dmg - self_hp_loss) * LineModel.REWARD_RIVAL_DMG) # 统计和更新变量 # print('reward debug info, hero: %s, max_gold: %s, gold_gain: %s, dmg: %s, hp_loss: %s, dmg_delta: %s, ' # 'dead_units: %s' # % ( # hero_name, str(dead_golds), str(gold_delta), str(dmg), str(self_hp_loss), str(dmg_delta), dead_unit_str)) # 最大奖励是击杀小兵和塔的金币加上对方一条命血量的奖励 # 最大惩罚是被对方造成了一条命伤害 # 零分为获得了所有的死亡奖励 reward = float(gold_delta + dmg_delta) / 100 # 特殊情况处理 # 鼓励攻击对方小兵,塔 if_hit_unit = next_state.if_hero_hit_any_unit(hero_name, rival_hero_name) if if_hit_unit is not None: # print("物理攻击到了小兵", if_hit_unit) reward += 0.01 if_hit_tower = next_state.if_hero_hit_tower(hero_name) if if_hit_tower is not None: # print("物理攻击到了塔", if_hit_tower) reward += 0.01 # 将所有奖励缩小 final_reward = reward / 10 
final_reward = min(max(final_reward, -1), 1) # 特殊奖励,放在最后面 # 英雄击杀最后一击,直接最大奖励(因为gamma的存在,扩大这个惩罚) if cur_rival_hero.hp > 0 and next_rival_hero.hp <= 0: # print('对线英雄%s死亡' % rival_hero_name) dmg_hit_rival = next_state.get_hero_total_dmg( hero_name, rival_hero_name) if dmg_hit_rival > 0: # print('英雄%s对对方造成了最后一击' % hero_name) final_reward = 1 if cur_hero.hp > 0 and next_hero.hp <= 0: final_reward = 0 elif cur_hero.hp > 0 and next_hero.hp <= 0: print('英雄死亡') final_reward = -5 return final_reward @staticmethod def assert_tower_in_input(cur_state, hero_name, rival_hero): # 如果敌方塔要攻击英雄的话,检查塔的信息是不是在input中 att_info = cur_state.if_tower_attack_hero(hero_name) if att_info is not None: tower = str(att_info.atker) tower_info = cur_state.get_obj(tower) hero_info = cur_state.get_hero(hero_name) model_input = LineModel_PPO1.gen_input(cur_state, hero_name, rival_hero) if model_input[44] == Line_Input_Lite.normalize_value_static( int(tower)): print('yes found attack tower in input', tower, 'distance', model_input[50], 'cal_distance', StateUtil.cal_distance2(tower_info.pos, hero_info.pos)) else: print('not found attack tower in input', tower, 'distance', model_input[50], 'cal_distance', StateUtil.cal_distance2(tower_info.pos, hero_info.pos)) @staticmethod # 只使用当前帧(做决定帧)+下一帧来计算奖惩,目的是在游戏结束时候可以计算所有之前行为的奖惩,不会因为需要延迟n下而没法计算 # 另外最核心的是,ppo本身就不要要求奖惩值是根据上一个行动来得到的 def cal_target_ppo_2(prev_state, cur_state, next_state, hero_name, rival_hero_name, line_idx): LineModel_PPO1.assert_tower_in_input(cur_state, hero_name, rival_hero_name) # 只计算当前帧的得失,得失为金币获取情况 + 敌方血量变化 # 获得小兵死亡情况, 根据小兵属性计算他们的金币情况 cur_rival_hero = cur_state.get_hero(rival_hero_name) rival_team = cur_rival_hero.team cur_hero = cur_state.get_hero(hero_name) cur_rival_hero = cur_state.get_hero(rival_hero_name) next_hero = next_state.get_hero(hero_name) next_rival_hero = next_state.get_hero(rival_hero_name) # 找到英雄附近死亡的敌方小兵 dead_units = StateUtil.get_dead_units_in_line( next_state, rival_team, line_idx, cur_hero, StateUtil.GOLD_GAIN_RADIUS) dead_golds = sum([ StateUtil.get_unit_value(u.unit_name, u.cfg_id) for u in dead_units ]) # 如果英雄有小额金币变化,则忽略 gold_delta = next_hero.gold - cur_hero.gold if gold_delta % 10 == 3 or gold_delta % 10 == 8 or gold_delta == int( dead_golds / 2) + 3: gold_delta -= 3 # 很难判断英雄的最后一击,所以我们计算金币变化,超过死亡单位一半的金币作为英雄获得金币 if gold_delta > 0: gold_delta = gold_delta * 2 - dead_golds if gold_delta < 0: print('获得击杀金币不应该小于零', cur_state.tick, 'dead_golds', dead_golds, 'gold_delta', (next_hero.gold - cur_hero.gold)) gold_delta = 0 # if dead_golds > 0: # print('dead_gold', dead_golds, 'delta_gold', gold_delta, "hero", hero_name, "tick", cur_state.tick) reward = float(gold_delta) / 100 # 将所有奖励缩小 final_reward = reward / 100 final_reward = min(max(final_reward, -1), 1) # 特殊奖励,放在最后面 # 英雄击杀最后一击,直接最大奖励(因为gamma的存在,扩大这个惩罚) if cur_rival_hero.hp > 0 and next_rival_hero.hp <= 0: # print('对线英雄%s死亡' % rival_hero_name) dmg_hit_rival = next_state.get_hero_total_dmg( hero_name, rival_hero_name) if dmg_hit_rival > 0: # print('英雄%s对对方造成了最后一击' % hero_name) final_reward = 1 if cur_hero.hp > 0 and next_hero.hp <= 0: final_reward = 0 elif cur_hero.hp > 0 and next_hero.hp <= 0: print('英雄死亡') final_reward = -1 return final_reward
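# add_vtarg_and_adv above is a GAE(lambda) pass over one trajectory segment. Below is a
# self-contained restatement of that computation in plain NumPy; rewards, values, news
# (new-episode flags) and next_value are hypothetical inputs shaped like the seg["rew"],
# seg["vpred"], seg["new"] and seg["nextvpred"] fields the method consumes.
import numpy as np

def gae_advantages(rewards, values, news, next_value, gamma=0.99, lam=0.95):
    T = len(rewards)
    values = np.append(values, next_value)   # bootstrap value for the step after the segment
    news = np.append(news, 0)
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - news[t + 1]
        delta = rewards[t] + gamma * values[t + 1] * nonterminal - values[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    tdlamret = adv + values[:-1]             # TD(lambda) return targets for the value head
    return adv, tdlamret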
class HRAAdaptive(object): """HRAAdaptive using HRA architecture""" def __init__(self, name, choices, reward_types, network_config, reinforce_config): super(HRAAdaptive, self).__init__() self.name = name self.choices = choices self.network_config = network_config self.reinforce_config = reinforce_config self.replace_frequency = reinforce_config.replace_frequency self.memory = PrioritizedReplayBuffer( self.reinforce_config.memory_size, 0.6) self.learning = True self.reward_types = reward_types self.steps = 0 self.episode = 0 self.reward_history = [] self.best_reward_mean = -sys.maxsize self.beta_schedule = LinearSchedule( self.reinforce_config.beta_timesteps, initial_p=self.reinforce_config.beta_initial, final_p=self.reinforce_config.beta_final) self.epsilon_schedule = LinearSchedule( self.reinforce_config.epsilon_timesteps, initial_p=self.reinforce_config.starting_epsilon, final_p=self.reinforce_config.final_epsilon) self.reset() self.eval_model = HRAModel(self.name + "_eval", self.network_config, use_cuda) self.target_model = HRAModel(self.name + "_target", self.network_config, use_cuda) reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name if not network_config.restore_network: clear_summary_path(reinforce_summary_path) else: self.restore_state() self.summary = SummaryWriter(log_dir=reinforce_summary_path) def __del__(self): self.save() self.summary.close() def should_explore(self): self.epsilon = self.epsilon_schedule.value(self.steps) self.summary.add_scalar(tag='%s/Epsilon' % self.name, scalar_value=self.epsilon, global_step=self.steps) return random.random() < self.epsilon def predict(self, state): self.steps += 1 if (self.previous_state is not None and self.previous_action is not None): self.memory.add(self.previous_state, self.previous_action, self.reward_list(), state, 0) if self.learning and self.should_explore(): action = random.choice(list(range(len(self.choices)))) q_values = None combined_q_values = None choice = self.choices[action] else: _state = Tensor(state).unsqueeze(0) model_start_time = time.time() action, q_values, combined_q_values = self.eval_model.predict( _state, self.steps, self.learning) choice = self.choices[action] self.model_time += time.time() - model_start_time if self.learning and self.steps % self.replace_frequency == 0: logger.debug("Replacing target model for %s" % self.name) self.target_model.replace(self.eval_model) if (self.learning and self.steps > self.reinforce_config.update_start and self.steps % self.reinforce_config.update_steps == 0): update_start_time = time.time() self.update() self.update_time += time.time() - update_start_time self.clear_current_rewards() self.previous_state = state self.previous_action = action return choice, q_values, combined_q_values def disable_learning(self): logger.info("Disabled Learning for %s agent" % self.name) self.save() self.learning = False self.episode = 0 def end_episode(self, state): if not self.learning: return self.reward_history.append(self.total_reward) logger.info("End of Episode %d with total reward %.2f, epsilon %.2f" % (self.episode + 1, self.total_reward, self.epsilon)) self.episode += 1 self.summary.add_scalar(tag='%s/Episode Reward' % self.name, scalar_value=self.total_reward, global_step=self.episode) for reward_type in self.reward_types: tag = '%s/Decomposed Reward/%s' % (self.name, reward_type) value = self.decomposed_total_reward[reward_type] self.summary.add_scalar(tag=tag, scalar_value=value, global_step=self.episode) self.memory.add(self.previous_state, 
self.previous_action, self.reward_list(), state, 1) self.episode_time = time.time() - self.episode_time logger.debug("Episode Time: %.2f, " "Model prediction time: %.2f, " "Updated time: %.2f, " "Update fit time: %.2f" % (self.episode_time, self.model_time, self.update_time, self.fit_time)) self.save() self.reset() def reset(self): self.clear_current_rewards() self.clear_episode_rewards() self.previous_state = None self.previous_action = None self.episode_time = time.time() self.update_time = 0 self.fit_time = 0 self.model_time = 0 def reward_list(self): reward = [0] * len(self.reward_types) for i, reward_type in enumerate(sorted(self.reward_types)): reward[i] = self.current_reward[reward_type] return reward def clear_current_rewards(self): self.current_reward = {} for reward_type in self.reward_types: self.current_reward[reward_type] = 0 def clear_episode_rewards(self): self.total_reward = 0 self.decomposed_total_reward = {} for reward_type in self.reward_types: self.decomposed_total_reward[reward_type] = 0 def reward(self, reward_type, value): self.current_reward[reward_type] += value self.decomposed_total_reward[reward_type] += value self.total_reward += value def restore_state(self): restore_path = self.network_config.network_path + "/adaptive.info" if self.network_config.network_path and os.path.exists(restore_path): logger.info("Restoring state from %s" % self.network_config.network_path) with open(restore_path, "rb") as file: info = pickle.load(file) self.steps = info["steps"] self.best_reward_mean = info["best_reward_mean"] self.episode = info["episode"] logger.info( "Continuing from %d episode (%d steps) with best reward mean %.2f" % (self.episode, self.steps, self.best_reward_mean)) def save(self, force=False): info = { "steps": self.steps, "best_reward_mean": self.best_reward_mean, "episode": self.episode } if force: logger.info("Forced to save network") self.eval_model.save_network() self.target_model.save_network() # pickle.dump needs a file object, not a path string with open(self.network_config.network_path + "/adaptive.info", "wb") as file: pickle.dump(info, file, protocol=pickle.HIGHEST_PROTOCOL) if (len(self.reward_history) >= self.network_config.save_steps and self.episode % self.network_config.save_steps == 0): total_reward = sum( self.reward_history[-self.network_config.save_steps:]) current_reward_mean = total_reward / self.network_config.save_steps if current_reward_mean >= self.best_reward_mean: self.best_reward_mean = current_reward_mean info["best_reward_mean"] = current_reward_mean logger.info("Saving network. Found new best reward (%.2f)" % current_reward_mean) self.eval_model.save_network() self.target_model.save_network() with open(self.network_config.network_path + "/adaptive.info", "wb") as file: pickle.dump(info, file, protocol=pickle.HIGHEST_PROTOCOL) else: logger.info("The best reward is still %.2f. 
Not saving" % current_reward_mean) def update(self): if len(self.memory) <= self.reinforce_config.batch_size: return beta = self.beta_schedule.value(self.steps) self.summary.add_scalar(tag='%s/Beta' % self.name, scalar_value=beta, global_step=self.steps) batch = self.memory.sample(self.reinforce_config.batch_size, beta) (states, actions, reward, next_states, is_terminal, weights, batch_idxes) = batch self.summary.add_histogram(tag='%s/Batch Indices' % self.name, values=Tensor(batch_idxes), global_step=self.steps) states = Tensor(states) next_states = Tensor(next_states) terminal = FloatTensor(is_terminal) reward = FloatTensor(reward) batch_index = torch.arange(self.reinforce_config.batch_size, dtype=torch.long) # Find the target values q_actions, q_values, _ = self.eval_model.predict_batch(states) q_values = q_values[:, batch_index, actions] _, q_next, _ = self.target_model.predict_batch(next_states) q_next = q_next.mean(2).detach() q_next = (1 - terminal) * q_next q_target = reward.t() + self.reinforce_config.discount_factor * q_next # Update the model fit_start_time = time.time() self.eval_model.fit(q_values, q_target, self.steps) self.fit_time += time.time() - fit_start_time # Update priorities td_errors = q_values - q_target td_errors = torch.sum(td_errors, 0) new_priorities = torch.abs(td_errors) + 1e-6 # prioritized_replay_eps self.memory.update_priorities(batch_idxes, new_priorities.data)
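# HRAAdaptive keeps one Q-head per reward type (Hybrid Reward Architecture) and acts on
# the sum of the heads. A minimal sketch of that combination step; q_heads is a
# hypothetical tensor shaped (num_reward_types, batch, num_actions), i.e. what
# eval_model.predict_batch is assumed to produce in the update() method above.
import torch

def combine_and_choose(q_heads):
    combined_q = q_heads.sum(dim=0)      # (batch, num_actions): summed over reward types
    actions = combined_q.argmax(dim=1)   # greedy action per sample
    return actions, combined_q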
class DQN(RL_AGENT): def __init__(self, env, network_policy, gamma=1.0, exploration_fraction=0.02, exploration_final_eps=0.01, steps_total=50000000, size_buffer=1000000, prioritized_replay=True, alpha_prioritized_replay=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, type_optimizer='Adam', lr=5e-4, eps=1.5e-4, time_learning_starts=20000, freq_targetnet_update=8000, freq_train=4, size_batch=32, callback=None, load_path=None, # for debugging device=torch.device("cuda" if torch.cuda.is_available() else "cpu"), seed=42, **network_kwargs): super(DQN, self).__init__(env, gamma, seed) self.create_replay_buffer(prioritized_replay, prioritized_replay_eps, size_buffer, alpha_prioritized_replay, prioritized_replay_beta0, prioritized_replay_beta_iters, steps_total) self.exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * steps_total), initial_p=1.0, final_p=exploration_final_eps) self.network_policy = network_policy # an instance of DQN_NETWORK, which contains an instance of FEATURE_EXTRACTOR and 1 additional head self.optimizer = eval('optim.%s' % type_optimizer)(self.network_policy.parameters(), lr=lr, eps=eps) # initialize target network self.network_target = copy.deepcopy(self.network_policy) for param in self.network_target.parameters(): param.requires_grad = False self.network_target.eval() self.size_batch = size_batch self.time_learning_starts = time_learning_starts self.freq_train = freq_train self.freq_targetnet_update = freq_targetnet_update self.t, self.steps_total = 0, steps_total self.device = device self.step_last_print, self.time_last_print = 0, None def load_checkpoint(self, checkpoint): """ loads checkpoint saved by utils/save_checkpoint """ self.load_state_dict(checkpoint['agent_state_dict']) self.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) self.t = checkpoint['t'] self.gamma = checkpoint['gamma'] self.seed = checkpoint['seed'] self.exploration = checkpoint['exploration'] self.observation_space = checkpoint['observation_space'] self.action_space = checkpoint['action_space'] self.beta_schedule = checkpoint['beta_schedule'] self.replay_buffer = checkpoint['replay_buffer'] self.size_batch = checkpoint['size_batch'] self.time_learning_starts = checkpoint['time_learning_starts'] self.freq_train = checkpoint['freq_train'] self.freq_targetnet_update = checkpoint['freq_targetnet_update'] self.steps_total = checkpoint['steps_total'] self.device = checkpoint['device'] self.step_last_print = checkpoint['step_last_print'] self.time_last_print = checkpoint['time_last_print'] print('checkpoint loaded with replay buffer of size %d' % (len(self.replay_buffer))) def create_replay_buffer(self, prioritized_replay, prioritized_replay_eps, size_buffer, alpha_prioritized_replay, prioritized_replay_beta0, prioritized_replay_beta_iters, steps_total): self.prioritized_replay = prioritized_replay self.prioritized_replay_eps = prioritized_replay_eps if prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer(size_buffer, alpha=alpha_prioritized_replay) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = steps_total self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(size_buffer) self.beta_schedule = None pass def decide(self, obs, eval=False): # Validated by Harry 17h45 23-11-2019 """ input observation and output action some through the computations of the policy network """ if eval or 
random.random() > self.exploration.value(self.t): with torch.no_grad(): return int(torch.argmax(self.network_policy(obs))) else: # explore return self.action_space.sample() def step(self, obs_curr, action, reward, obs_next, done, eval=False): """ an agent step: in this step the agent does whatever it needs """ if obs_next is not None: self.replay_buffer.add(obs_curr, action, np.sign(reward), obs_next, done) # clip rewards, done is the flag for whether obs_next is terminal if self.t >= self.time_learning_starts: if len(self.replay_buffer) >= self.size_batch and self.t % self.freq_train == 0: self.update() if self.t % self.freq_targetnet_update == 0: self.sync_parameters() self.t += 1 def update(self): """ update the parameters of the DQN model using the weighted sampled Bellman error """ # sample a batch if self.prioritized_replay: experience = self.replay_buffer.sample(self.size_batch, beta=self.beta_schedule.value(self.t)) (batch_obs_curr, batch_action, batch_reward, batch_obs_next, batch_done, weights, batch_idxes) = experience else: batch_obs_curr, batch_action, batch_reward, batch_obs_next, batch_done = self.replay_buffer.sample(self.size_batch) weights, batch_idxes = np.ones_like(batch_reward), None batch_action, batch_reward = torch.tensor(batch_action, dtype=torch.int64, device=self.device).view(-1, 1), torch.tensor(batch_reward, dtype=torch.float32, device=self.device) weights = torch.tensor(weights, dtype=torch.float32, device=self.device) # calculate the weighted Bellman error index_nonterm_trans = np.argwhere(batch_done == False).reshape(-1) values_next = torch.zeros_like(batch_reward, dtype=torch.float32) values_next[index_nonterm_trans] = self.network_target(batch_obs_next[index_nonterm_trans]).max(1)[0].detach() values_curr = self.network_policy(batch_obs_curr).gather(1, index=batch_action).view(-1) error_bellman = F.smooth_l1_loss(values_curr, batch_reward + self.gamma * values_next, reduction='none') # Huber loss error_bellman_weighted = torch.dot(error_bellman, weights) # calculate gradient w.r.t. to the weighted Bellman error self.optimizer.zero_grad() error_bellman_weighted.backward() # gradient clipping for param in self.network_policy.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() # update prioritized replay, if used if self.prioritized_replay: new_priorities = np.abs(error_bellman.detach().cpu().numpy()) + self.prioritized_replay_eps self.replay_buffer.update_priorities(batch_idxes, new_priorities) def sync_parameters(self): """ synchronize the parameters of self.network_policy and self.network_target """ self.network_target.load_state_dict(self.network_policy.state_dict()) for param in self.network_target.parameters(): param.requires_grad = False self.network_target.eval() def reset_parameters(self): self.network_policy.reset_parameters()
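# Hedged usage sketch for the agent above: the plain interaction loop that drives
# decide() and step(). `env` is assumed to be a gym-style environment already wrapped so
# that observations are tensors the policy network accepts, and `agent` an
# already-constructed DQN instance; both names are illustrative.
obs = env.reset()
for _ in range(int(agent.steps_total)):
    action = agent.decide(obs)
    obs_next, reward, done, _ = env.step(action)
    agent.step(obs, action, reward, obs_next, done)  # stores transition, trains, syncs target
    obs = env.reset() if done else obs_next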
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=100000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, **network_kwargs): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = PiecewiseSchedule([(0, 1.0), (int(1e6), 0.1), (int(1e7), 0.01)], outside_value=0.01) '''exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) ''' # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) return act
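# This variant swaps the usual LinearSchedule for a PiecewiseSchedule: epsilon anneals
# from 1.0 to 0.1 over the first 1e6 steps, then to 0.01 by 1e7, and stays at 0.01
# afterwards. A quick illustration of the two schedule classes (both imported from
# baselines.common.schedules; the probe timesteps below are only for demonstration):
from baselines.common.schedules import LinearSchedule, PiecewiseSchedule

piecewise = PiecewiseSchedule([(0, 1.0), (int(1e6), 0.1), (int(1e7), 0.01)],
                              outside_value=0.01)
linear = LinearSchedule(schedule_timesteps=100000, initial_p=1.0, final_p=0.02)
for step in (0, 500000, 5000000, 20000000):
    print(step, round(piecewise.value(step), 3), round(linear.value(step), 3))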
reset = False new_obs, rew, done, info = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if done: num_iters_since_reset = 0 obs = env.reset() reset = True if (num_iters > max(5 * args.batch_size, args.replay_buffer_size // 20) and num_iters % args.learning_freq == 0): # Sample a bunch of transitions from replay buffer if args.prioritized: experience = replay_buffer.sample( args.batch_size, beta=beta_schedule.value(num_iters)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( args.batch_size) weights = np.ones_like(rewards) # Minimize the error in Bellman's equation and compute TD-error td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # Update the priorities in the replay buffer if args.prioritized: new_priorities = np.abs(td_errors) + args.prioritized_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # Update target network.
def learn(env, q_func, num_actions=4, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((32, 32), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq") # # act_y, train_y, update_target_y, debug_y = deepq.build_train( # make_obs_ph=make_obs_ph, # q_func=q_func, # num_actions=num_actions, # optimizer=tf.train.AdamOptimizer(learning_rate=lr), # gamma=gamma, # grad_norm_clipping=10, # scope="deepq_y" # ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) # replay_buffer_y = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) # beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters, # initial_p=prioritized_replay_beta0, # final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) # replay_buffer_y = ReplayBuffer(buffer_size) beta_schedule = None # beta_schedule_y = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule( schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() # update_target_y() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # Select all marines first obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = (player_relative == _PLAYER_NEUTRAL).astype(int) #+ path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 16): screen = shift(LEFT, player[0] - 16, screen) elif (player[0] < 16): screen = shift(RIGHT, 16 - player[0], screen) if (player[1] > 16): screen = shift(UP, player[1] - 16, screen) elif (player[1] < 16): screen = shift(DOWN, 16 - player[1], screen) reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join("model/", "mineral_shards") print(model_file) for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
- exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act( np.array(screen)[None], update_eps=update_eps, **kwargs)[0] # action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] reset = False coord = [player[0], player[1]] rew = 0 if (action == 0): #UP if (player[1] >= 8): coord = [player[0], player[1] - 8] #path_memory_[player[1] - 16 : player[1], player[0]] = -1 elif (player[1] > 0): coord = [player[0], 0] #path_memory_[0 : player[1], player[0]] = -1 #else: # rew -= 1 elif (action == 1): #DOWN if (player[1] <= 23): coord = [player[0], player[1] + 8] #path_memory_[player[1] : player[1] + 16, player[0]] = -1 elif (player[1] > 23): coord = [player[0], 31] #path_memory_[player[1] : 63, player[0]] = -1 #else: # rew -= 1 elif (action == 2): #LEFT if (player[0] >= 8): coord = [player[0] - 8, player[1]] #path_memory_[player[1], player[0] - 16 : player[0]] = -1 elif (player[0] < 8): coord = [0, player[1]] #path_memory_[player[1], 0 : player[0]] = -1 #else: # rew -= 1 elif (action == 3): #RIGHT if (player[0] <= 23): coord = [player[0] + 8, player[1]] #path_memory_[player[1], player[0] : player[0] + 16] = -1 elif (player[0] > 23): coord = [31, player[1]] #path_memory_[player[1], player[0] : 63] = -1 if _MOVE_SCREEN not in obs[0].observation["available_actions"]: obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] # else: # new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] new_screen = (player_relative == _PLAYER_NEUTRAL).astype( int) #+ path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 16): new_screen = shift(LEFT, player[0] - 16, new_screen) elif (player[0] < 16): new_screen = shift(RIGHT, 16 - player[0], new_screen) if (player[1] > 16): new_screen = shift(UP, player[1] - 16, new_screen) elif (player[1] < 16): new_screen = shift(DOWN, 16 - player[1], new_screen) rew = obs[0].reward done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. replay_buffer.add(screen, action, rew, new_screen, float(done)) # replay_buffer_y.add(screen, action_y, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew reward = episode_rewards[-1] if done: obs = env.reset() player_relative = obs[0].observation["screen"][ _PLAYER_RELATIVE] screen = (player_relative == _PLAYER_NEUTRAL).astype( int) #+ path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] # Select all marines first env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) episode_rewards.append(0.0) #episode_minerals.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience # experience_y = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) # (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(batch_size) # weights_y, batch_idxes_y = np.ones_like(rewards_y), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # td_errors_y = train_x(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps # new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() # update_target_y() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("reward", reward) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}". format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act)
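# The loop above keeps the marine centred at (16, 16) of the 32x32 screen by shifting the
# observation after every step. shift() and LEFT/RIGHT/UP/DOWN are defined elsewhere in
# this codebase; the sketch below is only one plausible implementation of that centring
# helper using np.roll, and should be read as an assumption, not the original code.
import numpy as np

UP, DOWN, LEFT, RIGHT = range(4)

def shift(direction, number, matrix):
    """Shift a 2-D screen `number` cells in `direction` (wrap-around version)."""
    if direction == LEFT:
        return np.roll(matrix, -number, axis=1)
    if direction == RIGHT:
        return np.roll(matrix, number, axis=1)
    if direction == UP:
        return np.roll(matrix, -number, axis=0)
    return np.roll(matrix, number, axis=0)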
def learn(env, q_func, policy_fn, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None): # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return BatchInput(observation_space_shape, name=name) scope = "ampi" reuse=None grad_norm_clipping=None num_actions=env.action_space.n optimizer_q=tf.train.AdamOptimizer(learning_rate=lr) optimizer_pi=tf.train.AdamOptimizer(learning_rate=lr) act = build_act(make_obs_ph, q_func, num_actions=env.action_space.n, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # add ob_space = env.observation_space ac_space = env.action_space pi, act = policy_fn(obs_t_input.get(), ob_space, ac_space, scope="pi_func") # train pi pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/pi_func") pi_tp1, act_tp1 = policy_fn(obs_tp1_input.get(), ob_space, ac_space, scope="target_pi_func") # target pi target_pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_pi_func") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evaluation q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # Q_{train}(a,s) q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # y_j act_best = tf.argmax(pi, axis=1) # argmax \pi(s_{j+1}) q_tp1_sampled = tf.reduce_sum(q_tp1 * tf.one_hot(act_best, num_actions), 1) # Q_{target}(s_{j+1}, argmax(\pi(s_{j+1}))) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_sampled q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # Regression loss td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # argmax_a Q_{target}(s_j, a) z_j = tf.argmax(q_tp1, axis=1) # max Q(s',a') # classification loss cl_error = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi, labels=z_j) # Q optimization if grad_norm_clipping is not None: gradients_q = optimizer_q.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients_q): if grad is not None: gradients_q[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_q = optimizer_q.apply_gradients(gradients_q) else: optimize_q = 
optimizer_q.minimize(weighted_error, var_list=q_func_vars) # pi optimization if grad_norm_clipping is not None: gradients_pi = optimizer_pi.compute_gradients(cl_error, var_list=pi_func_vars) for i, (grad, var) in enumerate(gradients_pi): if grad is not None: gradients_pi[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_pi = optimizer_pi.apply_gradients(gradients_pi) else: optimize_pi = optimizer_pi.minimize(cl_error, var_list=pi_func_vars) # update_target Q update_target_expr = [] for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # update_target pi update_target_pi = [] for var, var_target in zip(sorted(pi_func_vars, key=lambda v: v.name), sorted(target_pi_func_vars, key=lambda v: v.name)): update_target_pi.append(var_target.assign(var)) update_target_pi = tf.group(*update_target_pi) # Create callable functions train = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=[td_error, cl_error], updates=[optimize_q, optimize_pi] ) update_target = U.function([], [], updates=[update_target_expr, update_target_pi]) q_values = U.function([obs_t_input], q_t) debug = {'q_values': q_values} # Create the replay buffer replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = env.action_space.sample() # not used, just so we have the datatype stochastic=True ac1, vpred1 = act(stochastic, np.array(obs)[None]) action = ac1[0] #action, _ = pi.act(stochastic, obs) #action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() # Log training progress and checkpoint the best model mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) load_state(model_file) return act
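The function above couples two updates: the Q network is regressed toward a TD target whose bootstrap action comes from the (target) policy network, and the policy logits are trained with a sparse softmax cross-entropy loss toward the greedy action of the target Q network. The NumPy sketch below shows how the two training targets are formed for one batch; it is illustrative only, and the function name and array arguments are hypothetical rather than part of the code above.

import numpy as np

def ampi_batch_targets(rew, done, q_tp1_target, pi_logits_tp1, gamma=1.0):
    # rew, done: (B,) rewards and terminal flags; q_tp1_target, pi_logits_tp1: (B, A)
    # bootstrap action is chosen by the policy network rather than by max_a Q
    a_pi = np.argmax(pi_logits_tp1, axis=1)
    q_boot = q_tp1_target[np.arange(len(rew)), a_pi]
    # regression target for the Q network: r + gamma * (1 - done) * Q_target(s', argmax pi(s'))
    td_target = rew + gamma * (1.0 - done) * q_boot
    # classification label for the policy network: greedy action under the target Q network
    z = np.argmax(q_tp1_target, axis=1)
    return td_target, z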
action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] reset = False new_obs, rew, done, info = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if done: num_iters_since_reset = 0 obs = env.reset() reset = True if (num_iters > max(5 * args.batch_size, args.replay_buffer_size // 20) and num_iters % args.learning_freq == 0): # Sample a bunch of transitions from replay buffer if args.prioritized: experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(num_iters)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size) weights = np.ones_like(rewards) # Minimize the error in Bellman's equation and compute TD-error td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # Update the priorities in the replay buffer if args.prioritized: new_priorities = np.abs(td_errors) + args.prioritized_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # Update target network. if num_iters % args.target_update_freq == 0: update_target() if start_time is not None:
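When args.prioritized is set, the snippet above relies on a baselines-style proportional prioritized replay buffer: sampling takes an annealed beta for the importance-sampling correction, and after each gradient step the priorities are refreshed with the absolute TD errors. Below is a minimal sketch of that bookkeeping, assuming the baselines PrioritizedReplayBuffer/LinearSchedule API and a train function that returns per-sample TD errors; the helper name is hypothetical.

import numpy as np

def prioritized_update(replay_buffer, train, beta_schedule, t, batch_size, prioritized_eps=1e-6):
    # beta is annealed toward 1.0 so the importance-sampling correction becomes exact late in training
    obs_t, actions, rewards, obs_tp1, dones, weights, idxes = replay_buffer.sample(
        batch_size, beta=beta_schedule.value(t))
    td_errors = train(obs_t, actions, rewards, obs_tp1, dones, weights)
    # each transition's new priority is its absolute TD error, offset by a small epsilon
    # so that no transition ends up with zero sampling probability
    replay_buffer.update_priorities(idxes, np.abs(td_errors) + prioritized_eps)
    return td_errors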
class DQN(BaseAgent): def __init__(self, env, name='default', alg_name='dqn', network_type='mini-mlp', total_timesteps=5e7, batch_size=32, lr=1e-3, gamma=0.99, buffer_size=1e6, final_eps=0.05, exploration_fraction=0.1, training_start=1e5, target_update_freq=1e4, optimizer=tf.train.AdamOptimizer, gradient_clipping=None, reward_clipping=False, tau=1., double_q=False, dueling=False, prioritized_replay=False, prioritized_replay_alpha=0.5, prioritized_replay_beta_init=0.4, prioritized_replay_beta_fraction=1.0, prioritized_replay_eps=1e-6, rolling_reward_mean=20, solved_callback=None, render_training=False, **kwargs): """ Implementation of the Deep Q Learning (DQN) algorithm formulated by Mnih et al. Contains some well-known improvements over the vanilla DQN. Parameters ---------- env: gym.Environment (gym) environment the agent shall learn from and act on name: str descriptive name of this DQN configuration, e.g. 'atari-breakout' network_type: str which network from 'networks.py' to use total_timesteps: int or float number of training timesteps batch_size: int size of minibatch per backprop lr: float learning rate gamma: float discount factor gamma for bellman target buffer_size: int or float maximum number of transitions in the replay buffer final_eps: float value to which epsilon is annealed exploration_fraction: float fraction of training timesteps over which epsilon is annealed training_start: int timestep at which training of the q network begins target_update_freq: int frequency of target network updates (in timesteps) optimizer: tf.Optimizer optimizer class which shall be used, such as Adam or RMSprop gradient_clipping: int if not None, gradients are clipped to this value by norm reward_clipping: bool if True, rewards are clipped to their sign (-1, 0 or +1) tau: float interpolation constant for soft update. 1.0 corresponds to a full synchronisation of network weights, as in the original DQN paper double_q: bool enables Double Q Learning for DQN dueling: bool splits the network architecture into advantage and value streams. The value stream V(s) is updated on every transition, which should stabilize learning prioritized_replay: bool if True, use (proportional) prioritized replay prioritized_replay_alpha: float alpha for weighting prioritization prioritized_replay_beta_init: float initial value of beta for prioritized replay buffer prioritized_replay_beta_fraction: float fraction of total timesteps to anneal beta to 1.0 prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. rolling_reward_mean: int window over which the rolling mean reward in the statistics is computed solved_callback: function function which receives the episode rewards as an array and must return a bool. If it returns True, training is considered done and is stopped early.
render_training: bool whether to render the environment while training """ # instance name self.name = name # environment to act on / learn from self.env = env # basic DQN parameters self.total_timesteps = float(total_timesteps) self.buffer_size = int(float(buffer_size)) self.batch_size = batch_size self.final_eps = final_eps self.lr = float(lr) self.gamma = float(gamma) self.exploration_fraction = float(exploration_fraction) self.training_start = int(float(training_start)) self.target_update_freq = int(float(target_update_freq)) # tf.Optimizer self.optimizer = optimizer # minor changes as suggested in some papers self.gradient_clipping = int( gradient_clipping) if gradient_clipping is not None else None self.reward_clipping = int( reward_clipping) if reward_clipping is not None else None # enhancements to DQN published in papers self.tau = float(tau) self.double_q = double_q self.dueling = dueling self.prioritized_replay = prioritized_replay self.prioritized_replay_alpha = float(prioritized_replay_alpha) self.prioritized_replay_beta_init = float(prioritized_replay_beta_init) self.prioritized_replay_beta_fraction = float( prioritized_replay_beta_fraction) self.prioritized_replay_eps = float(prioritized_replay_eps) # function to determine whether agent is able to act well enough self.solved_callback = solved_callback # call env.render() each training step self.render_training = render_training # sliding window for reward calc self.rolling_reward_mean = rolling_reward_mean # stores latest measure for best policy, e.g. best mean over last N episodes self.latest_best = 0.0 super().__init__(env, alg_name, name, **kwargs) # calculate timestep where epsilon reaches its final value self.schedule_timesteps = int(self.total_timesteps * self.exploration_fraction) # sanity checks assert 0.0 < self.tau <= 1.0 # env specific parameter self.obs_shape = env.observation_space.shape self.num_actions = env.action_space.n # tf scopes self.Q_SCOPE = 'q_network' self.TARGET_SCOPE = 'target_network' # build Q and target network; using different scopes to distinguish variables for gradient computation self.q_t_in, self.q_t = build_network(self.obs_shape, self.num_actions, network_type=network_type, dueling=self.dueling, scope=self.Q_SCOPE, summaries=True) self.target_tp1_in, self.target_tp1 = build_network( self.obs_shape, self.num_actions, dueling=self.dueling, network_type=network_type, scope=self.TARGET_SCOPE) # double Q learning needs to pass observations t+1 to the q networks for action selection # so we reuse already created q network variables but with different input if self.double_q: self.q_tp1_in, self.q_tp1 = build_network( self.obs_shape, self.num_actions, dueling=self.dueling, network_type=network_type, scope=self.Q_SCOPE, reuse=True) # create replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, self.prioritized_replay_alpha) else: self.replay_buffer = ReplayBuffer(self.buffer_size) # list of variables of the different networks. 
required for copying # Q to target network and excluding target network variables from backprop self.q_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.Q_SCOPE) self.target_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.TARGET_SCOPE) # placeholders used in loss function self._L_r = tf.placeholder(tf.float32, (None, ), name='loss_rewards') self._L_a = tf.placeholder(tf.int32, (None, ), name='loss_actions') self._L_d = tf.placeholder(tf.float32, (None, ), name='loss_dones') # pointer to td error vector self._td_errors = tf.placeholder(tf.float32, (None, ), name='td_errors') # configure prioritized replay if self.prioritized_replay: self._is_weights = tf.placeholder( tf.float32, (None, ), name='importance_sampling_weights') # schedule for PR beta beta_steps = int(self.total_timesteps * self.prioritized_replay_beta_fraction) self.pr_beta = LinearSchedule( beta_steps, initial_p=prioritized_replay_beta_init, final_p=1.0) # epsilon schedule self.eps = LinearSchedule(self.schedule_timesteps, final_p=final_eps) # init optimizer self.opt = self.optimizer(self.lr) # specify loss function, only include Q network variables for gradient computation self.gradients = self.opt.compute_gradients(self._loss(), var_list=self.q_net_vars) # clip gradients by norm if self.gradient_clipping is not None: for idx, (grad, var) in enumerate(self.gradients): if grad is not None: self.gradients[idx] = (tf.clip_by_norm( grad, self.gradient_clipping), var) # create training op self.train_op = self.opt.apply_gradients(self.gradients) # update_target_fn will be called periodically to copy Q network to target Q network # variable lists are sorted by name to ensure that correct values are copied self.update_target_ops = [] for var_q, var_target in zip( sorted(self.q_net_vars, key=lambda v: v.name), sorted(self.target_net_vars, key=lambda v: v.name)): v_update = var_target.assign(self.tau * var_q + (1 - self.tau) * var_target) self.update_target_ops.append(v_update) self.update_target_ops = tf.group(*self.update_target_ops) # global tf.Session and Graph init self.sess = tf.Session() # init tensorboard, variables and debug self._finalize_init() # sync networks before training self.sess.run(self.update_target_ops) def _setup_tensorboard(self): """ Adds all variables that might help debugging to Tensorboard. At the end, the FileWriter is constructed pointing to the specified directory. 
""" # more placeholders for summarised variables; along with summaries self.eps_ph = tf.placeholder(tf.float32, (), name='epsilon') self.rew_ph = tf.placeholder(tf.float32, (), name='rolling-reward') scalar_summary('epsilon', self.eps_ph) scalar_summary('reward', self.rew_ph) # display q_values while training for a_i in range(self.num_actions): scalar_summary('QTa_{}'.format(a_i + 1), tf.reduce_mean(self.target_tp1[:, a_i]), scope='Q-Values') scalar_summary('Qa_{}'.format(a_i + 1), tf.reduce_mean(self.q_t[:, a_i]), scope='Q-Values') # plot network weights with tf.variable_scope('weights'): for qv in self.q_net_vars: tf.summary.histogram('{}'.format(qv.name), qv) for tv in self.target_net_vars: tf.summary.histogram('{}'.format(tv.name), tv) # gradient histograms with tf.variable_scope('gradients'): for g in self.gradients: tf.summary.histogram('{}-grad'.format(g[1].name), g[0]) def _loss(self): """ Defines loss as layed out in the original Nature paper """ with tf.variable_scope('loss'): # either use maximum target q or use value from target network while the action is chosen by the q net if self.double_q: act_tp1_idxs = tf.stop_gradient(tf.argmax(self.q_tp1, axis=1)) q_tp1 = tf.reduce_sum( self.target_tp1 * tf.one_hot(act_tp1_idxs, self.num_actions), axis=1) else: q_tp1 = tf.reduce_max(self.target_tp1, axis=1) # bellman target y = self._L_r + (self.gamma * (1.0 - self._L_d) * q_tp1) # select q value of taken action qj = tf.reduce_sum(self.q_t * tf.one_hot(self._L_a, self.num_actions), axis=1) # TD errors self._td_errors = qj - y # apply huber loss loss = tf.losses.huber_loss(y, qj) if self.use_tensorboard: scalar_summary('target', tf.reduce_mean(y)) scalar_summary('huber-loss', tf.reduce_mean(loss)) tf.summary.histogram('selected_Q', qj) # importance sampling weights if self.prioritized_replay: updates = tf.reduce_mean(self._is_weights * loss) else: updates = tf.reduce_mean(loss) return updates def _build_feed_dict(self, obs_t, ac_t, rew_t, obs_tp1, dones, eps, rolling_rew, weights=None): """ Takes minibatch and returns feed dict for a tf.Session based on the algorithms configuration. """ # first, add data required in all DQN configs feed_d = { self.q_t_in: obs_t, self.target_tp1_in: obs_tp1, self._L_r: rew_t, self._L_a: ac_t, self._L_d: dones } # pass obs t+1 to q network if self.double_q: feed_d[self.q_tp1_in] = obs_tp1 # importance sampling weights if self.prioritized_replay: feed_d[self._is_weights] = weights # variables only necessary for TensorBoard visualisation if self.use_tensorboard: feed_d[self.eps_ph] = eps feed_d[self.rew_ph] = rolling_rew return feed_d def learn(self): """ Learns Q function for a given amount of timesteps """ # reset env, store first observation obs_t = self.env.reset() # save all episode rewards episode_reward_series = [[0.0]] episode_rewards = [] self.logger.info( 'Starting Exploration, training will start at step {}.'.format( self.training_start)) for t in tqdm(range(int(self.total_timesteps))): # decide on action either by policy or chose a random one epsilon = self.eps.value(t) _rand = np.random.choice([True, False], p=[epsilon, 1 - epsilon]) if _rand: action = self.env.action_space.sample() else: action = np.argmax(self.sess.run(self.q_t, {self.q_t_in: [obs_t]}), axis=1) assert len(action) == 1, 'only one action can be taken!' 
action = action[0] # act on environment with chosen action obs_tp1, reward, done, _ = self.env.step(action) # clip reward if self.reward_clipping: reward = 1 if reward > 0 else -1 if reward < 0 else 0 # store new transition self.replay_buffer.add(obs_t, action, reward, obs_tp1, float(done)) # new observation will be current one in next iteration obs_t = obs_tp1 # append current rewards to episode reward series episode_reward_series[-1].append(reward) if self.render_training: self.env.render() if t == self.training_start: self.logger.info('Training starts now! (t = {})'.format(t)) # final calculations and env reset if done: # calculate total reward episode_rewards.append(np.sum(episode_reward_series[-1])) episode_reward_series.append([0.0]) # reset env to initial state obs_t = self.env.reset() # start training after warmup period if t >= self.training_start: # calculate rolling reward rolling_r = np.mean(episode_rewards[-self.rolling_reward_mean:]) if len(episode_rewards) > 0 else 0.0 # post episode stuff: printing and saving if done: result_table = [['t', t], ['episode', len(episode_rewards)], ['mean_reward [{}]'.format(self.rolling_reward_mean), rolling_r], ['epsilon', epsilon]] print('\n{}'.format(tabulate(result_table))) # if the policy improved, save as new best ... achieving a good reward in one episode # might not be the best metric. continuously achieving good rewards would be better if len(episode_rewards) >= 25: mr = np.mean( episode_rewards[-self.rolling_reward_mean:]) if mr >= self.latest_best: self.latest_best = mr self.logger.info( 'Saving new best policy with mean[{}]_r = {} ...' .format(self.rolling_reward_mean, mr)) self._save('best') # save latest policy self._save() # write current values to csv log self.csvlog.write('{}, {}, {}\n'.format( len(episode_rewards), epsilon, episode_rewards[-1])) # sample batch of transitions randomly for training and build feed dictionary # prioritized replay needs a beta and returns weights. if self.prioritized_replay: o_t, a_t, r_t, o_tp1, do, is_ws, batch_idxs = self.replay_buffer.sample( self.batch_size, self.pr_beta.value(t)) feed = self._build_feed_dict(o_t, a_t, r_t, o_tp1, do, epsilon, rolling_r, weights=is_ws) else: o_t, a_t, r_t, o_tp1, do = self.replay_buffer.sample( self.batch_size) feed = self._build_feed_dict(o_t, a_t, r_t, o_tp1, do, epsilon, rolling_r) # run training (and summary) operations if self.use_tensorboard: summary, _, td_errors = self.sess.run( [self.merge_op, self.train_op, self._td_errors], feed_dict=feed) self.writer.add_summary(summary, t) else: # also fetch the TD errors so prioritized replay can update priorities below _, td_errors = self.sess.run([self.train_op, self._td_errors], feed_dict=feed) # new td errors needed to update buffer weights if self.prioritized_replay: new_prios = np.abs(td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities(batch_idxs, new_prios) # sync target network every C steps if (t - self.training_start) % self.target_update_freq == 0: self.sess.run(self.update_target_ops) if self.solved_callback is not None: if self.solved_callback(episode_rewards): self.logger.info('Solved!') break # total reward of last episode episode_rewards.append(np.sum(episode_reward_series[-1])) # finalize training, e.g.
set flags, write done-file self._finalize_training() def run(self, render=True): """ Runs the policy on the given environment """ if not self.is_trained: self.logger.warning('Trying to run untrained model!') # set necessary parameters to their defaults epsilon = self.final_eps reward = 0.0 obs = self.env.reset() while True: # decide on action either by policy or choose a random one _rand = np.random.choice([True, False], p=[epsilon, 1 - epsilon]) if _rand: action = self.env.action_space.sample() else: action = np.argmax(self.sess.run(self.q_t, {self.q_t_in: [obs]}), axis=1) assert len(action) == 1, 'only one action can be taken!' action = action[0] # act on environment with chosen action obs, rew, done, _ = self.env.step(action) reward += rew if render: self.env.render() if done: self.logger.info('Done! Reward {}'.format(reward)) reward = 0.0 obs = self.env.reset()
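A minimal usage sketch for the agent defined above, assuming a standard Gym environment and a 'mini-mlp' entry in networks.py; the environment choice and hyperparameter values here are illustrative, not taken from the original configuration.

import gym

env = gym.make('CartPole-v1')
agent = DQN(env,
            name='cartpole-test',
            network_type='mini-mlp',
            total_timesteps=1e5,       # much shorter run than the 5e7 default
            training_start=1e3,
            target_update_freq=500,
            double_q=True,
            dueling=False,
            prioritized_replay=False)

agent.learn()           # train; stops early if a solved_callback is given and fires
agent.run(render=True)  # roll out the learned policy with epsilon = final_eps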