class DQNAgent:
    def __init__(self, identifier, actions, observation_shape, num_steps, x=0.0, y=0.0):
        self.id = identifier
        self.actions = actions
        self.x = x
        self.y = y
        self.yellow_steps = 0
        self.postponed_action = None
        self.obs = None
        self.current_action = None
        self.weights = np.ones(32)
        self.td_errors = np.ones(32)
        self.pre_train = 2500
        self.prioritized = False
        self.prioritized_eps = 1e-4
        self.batch_size = 32
        self.buffer_size = 30000
        self.learning_freq = 500
        self.target_update = 5000

        # Create all the functions necessary to train the model
        self.act, self.train, self.update_target, self.debug = deepq.build_train(
            make_obs_ph=lambda name: TrafficTfInput(observation_shape, name=name),
            q_func=dueling_model,
            num_actions=len(actions),
            optimizer=tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-4),
            gamma=0.99,
            double_q=True,
            scope="deepq" + identifier
        )

        # Create the replay buffer
        if self.prioritized:
            self.replay_buffer = PrioritizedReplayBuffer(size=self.buffer_size, alpha=0.6)
            self.beta_schedule = LinearSchedule(num_steps // 4, initial_p=0.4, final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.01 (99% of actions are selected according to values predicted by the model).
        self.exploration = LinearSchedule(schedule_timesteps=int(num_steps * 0.1),
                                          initial_p=1.0,
                                          final_p=0.01)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

    def take_action(self, t):
        if self.postponed_action is None:
            # Take action and update exploration to the newest value
            action = self.act(np.array(self.obs)[None], update_eps=self.exploration.value(t))[0]
        else:
            # Take action postponed by yellow light transition
            action = self.postponed_action
            self.postponed_action = None
        return action

    def store(self, rew, new_obs, done):
        # Store transition in the replay buffer.
        self.replay_buffer.add(self.obs, self.current_action, rew, new_obs, float(done))

    def learn(self, t):
        # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
        if t > self.pre_train:
            if self.prioritized:
                experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, self.weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size)
                self.weights = np.ones_like(rewards)

            # Minimize the error in Bellman's equation and compute the TD-error
            self.td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones, self.weights)

            # Update the priorities in the replay buffer
            if self.prioritized:
                new_priorities = np.abs(self.td_errors) + self.prioritized_eps
                self.replay_buffer.update_priorities(batch_idxes, new_priorities)

            self.update_target_network(t)

    def update_target_network(self, t):
        # Update target network periodically.
        if t % self.target_update == 0:
            self.update_target()

    def add_fingerprint_to_obs(self, obs, weights, identifier, td_errors):
        idx = 0
        for w in weights:
            obs[2, identifier, idx] = w
            idx += 1
        for td in td_errors:
            obs[2, identifier, idx] = td
            idx += 1
        return obs

    def add_fingerprint(self, weights, identifier, td_errors):
        self.obs = self.add_fingerprint_to_obs(self.obs, weights, identifier, td_errors)
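# A minimal, self-contained sketch of the exploration annealing used by the agent
# above: epsilon decays linearly over the first 10% of training and then stays at
# 0.01. It only assumes baselines' LinearSchedule, which the class already relies
# on; the step values below are illustrative and not taken from the original code.
from baselines.common.schedules import LinearSchedule

num_steps = 100000
exploration = LinearSchedule(schedule_timesteps=int(num_steps * 0.1),
                             initial_p=1.0, final_p=0.01)
for t in (0, 5000, 10000, 50000):
    # prints 1.0, 0.505, 0.01, 0.01: fully random at the start, 1% random after 10% of training
    print(t, round(exploration.value(t), 3))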
def learn(env, network, seed=None, use_crm=False, use_rs=False, lr=5e-4,
          total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1,
          exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100,
          checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000,
          gamma=1.0, target_network_update_freq=500, prioritized_replay=False,
          prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6,
          param_noise=False, callback=None, load_path=None, **network_kwargs):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names
        of registered models in baselines.common.models (mlp, cnn, conv_only). If a function,
        should take an observation tensor and return a latent variable tensor, which will be
        mapped to the Q function heads (see build_q_func in baselines.deepq.models for details)
    seed: int or None
        prng seed. Runs with the same seed "should" give the same results. If None, no seeding is used.
    use_crm: bool
        use counterfactual experience to train the policy
    use_rs: bool
        use reward shaping
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress; set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored at the end of
        training. If you do not wish to restore the best version at the end of training,
        set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial value to 1.0.
        If set to None, it defaults to total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns True, training stops.
    load_path: str
        path to load the model from. (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
""" # Adjusting hyper-parameters by considering the number of RM states for crm if use_crm: rm_states = env.get_num_rm_states() buffer_size = rm_states * buffer_size batch_size = rm_states * batch_size # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = env.step(env_action) # Store transition in the replay buffer. 
if use_crm: # Adding counterfactual experience (this will alrady include shaped rewards if use_rs=True) experiences = info["crm-experience"] elif use_rs: # Include only the current experince but shape the reward experiences = [(obs, action, info["rs-reward"], new_obs, float(done))] else: # Include only the current experience (standard deepq) experiences = [(obs, action, rew, new_obs, float(done))] # Adding the experiences to the replay buffer for _obs, _action, _r, _new_obs, _done in experiences: replay_buffer.add(_obs, _action, _r, _new_obs, _done) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) return act
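# Hedged sketch of the experience-storage pattern used in the CRM branch above:
# with use_crm, the environment's info dict supplies a whole list of
# (obs, action, reward, next_obs, done) tuples rather than a single transition,
# and every tuple is pushed into the replay buffer. The dummy observations,
# rewards, and the three-tuple "reward machine" example below are placeholders;
# only baselines' ReplayBuffer API is taken as given.
import numpy as np
from baselines.deepq.replay_buffer import ReplayBuffer

buffer = ReplayBuffer(1000)
obs, new_obs = np.zeros(4), np.ones(4)
# roughly what info["crm-experience"] might contain for a small reward machine
experiences = [(obs, 1, 0.0, new_obs, 0.0),
               (obs, 1, 1.0, new_obs, 1.0),
               (obs, 1, 0.0, new_obs, 0.0)]
for _obs, _action, _r, _new_obs, _done in experiences:
    buffer.add(_obs, _action, _r, _new_obs, _done)
# later, the training step samples batches exactly as in the standard deepq loop
obses_t, actions, rewards, obses_tp1, dones = buffer.sample(2)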
def __init__(self, learning_rate=5e-4, # could use linearschedule here as well? gamma=.99, epsilon_max=1.0, epsilon_min=0.001, epsilon_decay_steps=300000, learning_starts=1000, train_freq=100, target_update_freq=5000, max_buffer_size=100000, batch_size=16, prioritized_replay_beta_iters = 300000, # in reality this would be max_steps -- for now just much larger than decay steps training=True, indicate_nonrandom_action=False, prioritized=True, prioritized_alpha = .6, # b=.7, a=.5 for rank-based prioritization prioritized_beta = .4, # "rank-based likely not as good for sparse-reward structures" ... clipping limits outliers save_file='C:/Users/lbianculli/venv1/sc_bot/minigames/collect_minerals/logs/network_saves', save_dir='C:/Users/lbianculli/venv1/sc_bot/minigames/collect_minerals/logs/ckpts/', ckpt_name='collect_minerals_6-23', summary_path='C:/Users/lbianculli/venv1/sc_bot/minigames/collect_minerals/logs/summaries/', buffer_path='C:/Users/lbianculli/venv1/sc_bot/minigames/collect_minerals/logs/buffers/buffer_6-23', logdir='C:/Users/lbianculli/venv1/sc_bot/minigames/collect_minerals/logs/variable_logs.txt', log=True): super(DQNMoveOnlyAgent, self).__init__() # NN hparams self.learning_rate = learning_rate self.gamma = gamma # agent hparams self.epsilon_max = epsilon_max self.epsilon_min = epsilon_min self.epsilon_decay_steps = epsilon_decay_steps self.learning_starts = learning_starts self.train_freq = train_freq self.target_update_freq = target_update_freq self.indicate_nonrandom_action = indicate_nonrandom_action # not sure exactly self.prioritized = prioritized self.prioritized_alpha = prioritized_alpha self.prioritized_beta = prioritized_beta self.save_file = save_file self.batch_size = batch_size self.log = log # other self.training = training self.max_reward = 0 self.total_reward = 0 self.last_state = None self.last_action = None if self.prioritized: self.buffer_file = buffer_path + '_prioritized.p' self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=self.prioritized_beta, final_p=1.0) else: self.buffer_file = buffer_path + '.p' # load and set epsilon if os.path.isfile(self.save_file + '.npy'): self.epsilon, self.initial_step = np.load(self.save_file + '.npy') # can i just use loaded step for epsilon as well? 
print(f'epsilon loaded: {self.epsilon}') else: self.epsilon = 1.0 self.initial_step = 0 self.epsilons = [self.epsilon] # for saving and loading files if save_dir: self.online_save_dir = save_dir + 'online/' # for use in checkpoints self.target_save_dir = save_dir + 'target/' if ckpt_name: self.ckpt_name = ckpt_name if summary_path: self.online_summary_path = summary_path + 'online/' # for use in TB summaries self.target_summary_path = summary_path + 'target/' if self.log: self.init_logger(logdir) # build network if save_dir and ckpt_name: self.online_save_path = self.online_save_dir + ckpt_name + '.ckpt' self.target_save_path = self.target_save_dir + ckpt_name + '.ckpt' print("Building models...") tf.reset_default_graph() self.online_network = PlayerRelativeCNN(spatial_dims=FEATURE_SCREEN_SIZE, learning_rate=self.learning_rate, save_path=self.online_save_path, summary_path=self.online_summary_path, name='DQN') if self.training: # set up target_net and initialize replay buffer self.target_network = PlayerRelativeCNN(spatial_dims=FEATURE_SCREEN_SIZE, learning_rate=self.learning_rate, save_path = self.target_save_path, summary_path = self.target_summary_path, name='target_network') # initialize tf session config = tf.ConfigProto() config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) print('Initialization complete.') # check for and load networks/buffer if possible if os.path.isfile(self.online_save_path + '.index') and os.path.isfile(self.target_save_path + '.index'): self.online_network.load(self.sess) self.target_network.load(self.sess) # check for buffer to load if os.path.isfile(self.buffer_file): with open(self.buffer_file, 'rb') as f: self.replay_buffer = pickle.load(f) else: if self.prioritized: # alpha = 0 is same as uniform self.replay_buffer = PrioritizedReplayBuffer(max_buffer_size, self.prioritized_alpha) else: self.replay_buffer = ReplayBuffer(max_buffer_size) self.online_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'DQN') self.target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'target_network') self.online_network._init_train_fn(self.online_vars, grad_norm_clipping=10) # what are good values for clip? self.target_network._init_train_fn(self.target_vars, grad_norm_clipping=10) print('online and target models loaded.') self._tf_init() if self.training: self._update_target_network() # do i still need this? else: self._tf_init()
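# The constructor above persists epsilon and the step counter with numpy and the
# replay buffer with pickle, then restores them on the next run. A minimal sketch
# of that save/restore round-trip; the temporary paths here are placeholders for
# the original Windows log directories, and only numpy, pickle, and baselines'
# ReplayBuffer are assumed.
import os
import pickle
import tempfile
import numpy as np
from baselines.deepq.replay_buffer import ReplayBuffer

tmp = tempfile.mkdtemp()
save_file = os.path.join(tmp, "network_saves")
buffer_file = os.path.join(tmp, "buffer.p")

# save epsilon and the current step, then load them back (np.save appends ".npy")
np.save(save_file, np.array([0.37, 120000]))
epsilon, initial_step = np.load(save_file + ".npy")

# pickle the replay buffer the same way the agent does between runs
buf = ReplayBuffer(1000)
buf.add(np.zeros((64, 64)), 3, 1.0, np.zeros((64, 64)), 0.0)
with open(buffer_file, "wb") as f:
    pickle.dump(buf, f)
with open(buffer_file, "rb") as f:
    buf = pickle.load(f)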
def learn(env, network, seed=None, lr=1e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, multiplayer=False, callback=None, load_path=None, load_path_1=None, load_path_2=None, **network_kwargs): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # This was all handled in not the most elegant way # Variables have a _1 or _2 appended to them to separate them # and a bunch of if statementss to have the _2 variables not do anything in single-player # when in multiplayer Space Invaders, need to not reward players for other player dying isSpaceInvaders = False if "SpaceInvaders" in str(env): isSpaceInvaders = True # put a limit on the amount of memory used, otherwise TensorFlow will consume nearly everything # this leaves 1 GB free on my computer, others may need to change it # Create all the functions necessary to train the model # Create two separate TensorFlow sessions graph_1 = tf.Graph() sess_1 = tf.Session(graph=graph_1) if multiplayer: graph_2 = tf.Graph() sess_2 = tf.Session(graph=graph_2) else: # set session 2 to None if it's not being used sess_2 = None set_global_seeds(seed) # specify the q functions as separate objects q_func_1 = build_q_func(network, **network_kwargs) if multiplayer: q_func_2 = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) # build everything for the first model # pass in the session and the "_1" suffix act_1, train_1, update_target_1, debug_1 = deepq.build_train( sess=sess_1, make_obs_ph=make_obs_ph, q_func=q_func_1, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise, scope="deepq") # a lot of if multiplayer statements duplicating these actions for a second network # pass in session 2 and "_2" instead if multiplayer: act_2, train_2, update_target_2, debug_2 = deepq.build_train( sess=sess_2, make_obs_ph=make_obs_ph, q_func=q_func_2, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise, scope="deepq") # separate act_params for each wrapper act_params_1 = { 'make_obs_ph': make_obs_ph, 'q_func': q_func_1, 'num_actions': env.action_space.n, } if multiplayer: act_params_2 = { 'make_obs_ph': make_obs_ph, 'q_func': q_func_2, 'num_actions': env.action_space.n, } # make the act wrappers act_1 = ActWrapper(act_1, act_params_1) if multiplayer: act_2 = ActWrapper(act_2, act_params_2) # I need to return something if it's single-player else: act_2 = None # Create the replay buffer # separate replay buffers are required for each network # this is required for competitive because the replay buffers hold rewards # and player 2 has different rewards than player 1 if prioritized_replay: replay_buffer_1 = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if multiplayer: replay_buffer_2 = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer_1 = ReplayBuffer(buffer_size) if multiplayer: replay_buffer_2 = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. 
# initialize both sessions U.initialize(sess_1) if multiplayer: U.initialize(sess_2) # the session was passed into these functions when they were created # the separate update functions work within the different sessions update_target_1() if multiplayer: update_target_2() # keep track of rewards for both models separately episode_rewards_1 = [0.0] saved_mean_reward_1 = None if multiplayer: episode_rewards_2 = [0.0] saved_mean_reward_2 = None obs = env.reset() reset = True # storing stuff in a temporary directory while it's working with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file_1 = os.path.join(td, "model_1") temp_file_1 = os.path.join(td, "temp_1") model_saved_1 = False if multiplayer: model_file_2 = os.path.join(td, "model_2") temp_file_2 = os.path.join(td, "temp_2") model_saved_2 = False if tf.train.latest_checkpoint(td) is not None: if multiplayer: # load both models if multiplayer is on load_variables(model_file_1, sess_1) logger.log('Loaded model 1 from {}'.format(model_file_1)) model_saved_1 = True load_variables(model_file_2, sess_2) logger.log('Loaded model 2 from {}'.format(model_file_2)) model_saved_2 = True # otherwise just load the first one else: load_variables(model_file_1, sess_1) logger.log('Loaded model from {}'.format(model_file_1)) model_saved_1 = True # I have separate load variables for single-player and multiplayer # this should be None if multiplayer is on elif load_path is not None: load_variables(load_path, sess_1) logger.log('Loaded model from {}'.format(load_path)) # load the separate models in for multiplayer # should load the variables into the appropriate sessions # my format may restrict things to working properly only when a Player 1 model is loaded into session 1, and same for Player 2 # however, in practice, the models won't work properly otherwise elif multiplayer: if load_path_1 is not None: load_variables(load_path_1, sess_1) logger.log('Loaded model 1 from {}'.format(load_path_1)) if load_path_2 is not None: load_variables(load_path_2, sess_2) logger.log('Loaded model 2 from {}'.format(load_path_2)) # actual training starts here for t in range(total_timesteps): # use this for updating purposes actual_t = t + 1 if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. 
- exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True # receive model 1's action based on the model and observation action_1 = act_1(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action_1 = action_1 # do the same for model 2 if in multiplayer if multiplayer: action_2 = act_2(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action_2 = action_2 reset = False # apply actions to the environment if multiplayer: new_obs, rew_1, rew_2, done, _ = env.step( env_action_1, env_action_2) # apply single action if there isn't a second model else: new_obs, rew_1, rew_2, done, _ = env.step(env_action_1) # manual clipping for Space Invaders multiplayer if isSpaceInvaders and multiplayer: # don't reward a player when the other player dies # change the reward to 0 # the only time either player will get rewarded 200 is when the other player dies if rew_1 >= 200: rew_1 = rew_1 - 200.0 if rew_2 >= 200: rew_2 = rew_2 - 200.0 # manually clip the rewards using the sign function rew_1 = np.sign(rew_1) rew_2 = np.sign(rew_2) combo_factor = 0.25 rew_1_combo = rew_1 + combo_factor * rew_2 rew_2_combo = rew_2 + combo_factor * rew_1 rew_1 = rew_1_combo rew_2 = rew_2_combo # Store transition in the replay buffers replay_buffer_1.add(obs, action_1, rew_1, new_obs, float(done)) if multiplayer: # pass reward_2 to the second player # this reward will vary based on the game replay_buffer_2.add(obs, action_2, rew_2, new_obs, float(done)) obs = new_obs # separate rewards for each model episode_rewards_1[-1] += rew_1 if multiplayer: episode_rewards_2[-1] += rew_2 if done: obs = env.reset() episode_rewards_1.append(0.0) if multiplayer: episode_rewards_2.append(0.0) reset = True if actual_t > learning_starts and actual_t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # sample from the two replay buffers if prioritized_replay: experience_1 = replay_buffer_1.sample( batch_size, beta=beta_schedule.value(t)) (obses_t_1, actions_1, rewards_1, obses_tp1_1, dones_1, weights_1, batch_idxes_1) = experience_1 # keep all the variables with separate names if multiplayer: experience_2 = replay_buffer_2.sample( batch_size, beta=beta_schedule.value(t)) (obses_t_2, actions_2, rewards_2, obses_tp1_2, dones_2, weights_2, batch_idxes_2) = experience_2 # do the same if there's no prioritization else: obses_t_1, actions_1, rewards_1, obses_tp1_1, dones_1 = replay_buffer_1.sample( batch_size) weights_1, batch_idxes_1 = np.ones_like(rewards_1), None if multiplayer: obses_t_2, actions_2, rewards_2, obses_tp1_2, dones_2 = replay_buffer_2.sample( batch_size) weights_2, batch_idxes_2 = np.ones_like( rewards_2), None # actually train the model based on the samples td_errors_1 = train_1(obses_t_1, actions_1, rewards_1, obses_tp1_1, dones_1, weights_1) if multiplayer: td_errors_2 = train_2(obses_t_2, actions_2, rewards_2, obses_tp1_2, dones_2, weights_2) # give new priority weights to the observations if prioritized_replay: new_priorities_1 = np.abs( td_errors_1) + prioritized_replay_eps replay_buffer_1.update_priorities(batch_idxes_1, new_priorities_1) if multiplayer: new_priorities_2 = np.abs( td_errors_2) + prioritized_replay_eps replay_buffer_2.update_priorities( batch_idxes_2, new_priorities_2) if actual_t > learning_starts and actual_t % target_network_update_freq == 0: # Update target networks periodically. 
update_target_1() if multiplayer: update_target_2() # this section is for the purposes of logging stuff # calculate the average reward over the last 100 episodes mean_100ep_reward_1 = round(np.mean(episode_rewards_1[-101:-1]), 1) if multiplayer: mean_100ep_reward_2 = round( np.mean(episode_rewards_2[-101:-1]), 1) num_episodes = len(episode_rewards_1) # every given number of episodes log and print out the appropriate stuff if done and print_freq is not None and len( episode_rewards_1) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) # print out both rewards if multiplayer if multiplayer: logger.record_tabular("mean 100 episode reward 1", mean_100ep_reward_1) logger.record_tabular("mean 100 episode reward 2", mean_100ep_reward_2) else: logger.record_tabular("mean 100 episode reward", mean_100ep_reward_1) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() # save best-performing version of each model # I've opted out of this for competitive multiplayer because it's difficult to determine what's "best" if (checkpoint_freq is not None and actual_t > learning_starts and num_episodes > 100 and actual_t % checkpoint_freq == 0): # if there's a best reward, save it as the new best model if saved_mean_reward_1 is None or mean_100ep_reward_1 > saved_mean_reward_1: if print_freq is not None: if multiplayer: logger.log( "Saving model 1 due to mean reward increase: {} -> {}" .format(saved_mean_reward_1, mean_100ep_reward_1)) else: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward_1, mean_100ep_reward_1)) save_variables(model_file_1, sess_1) model_saved_1 = True saved_mean_reward_1 = mean_100ep_reward_1 if multiplayer and (saved_mean_reward_2 is None or mean_100ep_reward_2 > saved_mean_reward_2): if print_freq is not None: logger.log( "Saving model 2 due to mean reward increase: {} -> {}" .format(saved_mean_reward_2, mean_100ep_reward_2)) save_variables(model_file_2, sess_2) model_saved_2 = True saved_mean_reward_2 = mean_100ep_reward_2 # restore models at the end to the best performers if model_saved_1: if print_freq is not None: logger.log("Restored model 1 with mean reward: {}".format( saved_mean_reward_1)) load_variables(model_file_1, sess_1) if multiplayer and model_saved_2: if print_freq is not None: logger.log("Restored model 2 with mean reward: {}".format( saved_mean_reward_2)) load_variables(model_file_2, sess_2) return act_1, act_2, sess_1, sess_2
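# Small self-contained restatement of the Space Invaders reward shaping used in
# the multiplayer loop above: the 200-point bonus a player receives when the
# other player dies is removed, rewards are sign-clipped, and a fraction of the
# partner's reward is mixed in. Pure numpy, no environment needed; the sample
# inputs are illustrative.
import numpy as np

def shape_rewards(rew_1, rew_2, combo_factor=0.25):
    # don't reward a player for the other player dying
    if rew_1 >= 200:
        rew_1 -= 200.0
    if rew_2 >= 200:
        rew_2 -= 200.0
    # clip to {-1, 0, 1}, then share a fraction of the partner's reward
    rew_1, rew_2 = np.sign(rew_1), np.sign(rew_2)
    return rew_1 + combo_factor * rew_2, rew_2 + combo_factor * rew_1

print(shape_rewards(230.0, 10.0))   # kill bonus removed, both clipped to 1, each gets 1.25
print(shape_rewards(200.0, 0.0))    # the partner's death alone yields no reward: (0.0, 0.0)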
# Create the environment
env = gym.make("CartPole-v0")
# Create all the functions necessary to train the model
act, train, update_target, debug = deepq.build_train(
    make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
    q_func=model,
    num_actions=env.action_space.n,
    optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
)
# Create the replay buffer
replay_buffer = ReplayBuffer(50000)
# Create the schedule for exploration starting from 1 (every action is random) down to
# 0.02 (98% of actions are selected according to values predicted by the model).
exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
# Initialize the parameters and copy them to the target network.
U.initialize()
update_target()

episode_rewards = [0.0]
obs = env.reset()
for t in itertools.count():
    # Take action and update exploration to the newest value
    action = act(obs[None], update_eps=exploration.value(t))[0]
    new_obs, rew, done, _ = env.step(action)
    # Store transition in the replay buffer.
    replay_buffer.add(obs, action, rew, new_obs, float(done))
    obs = new_obs
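    # The excerpt stops here. The remainder of the loop, sketched after the standard
    # baselines custom_cartpole example (the 1000-step warm-up, 1000-step target
    # update, and the 200-reward "solved" test are that example's defaults, not
    # taken from this excerpt; assumes `from baselines import logger`).
    episode_rewards[-1] += rew
    if done:
        obs = env.reset()
        episode_rewards.append(0.0)

    is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
    if is_solved:
        # Show off the result
        env.render()
    else:
        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if t > 1000:
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
            train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
        # Update target network periodically.
        if t % 1000 == 0:
            update_target()

    if done and len(episode_rewards) % 10 == 0:
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", len(episode_rewards))
        logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
        logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
        logger.dump_tabular()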
def learn(env, q_func, num_actions=3, lr=5e-4, max_timesteps=1000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((64, 64), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] num_episodes = 0 saved_mean_reward = None path_memory = np.zeros((64, 64)) obs = env.reset() # Select all marines first player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = player_relative + path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) for i in range(len(player_x)): xy = [player_x[i], player_y[i]] obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy])]) group_id = 0 group_list = [] unit_xy_list = [] for i in range(len(player_x)): if i % 4 != 0: continue if group_id > 2: break xy = [player_x[i], player_y[i]] unit_xy_list.append(xy) if (len(unit_xy_list) >= 1): for idx, xy in enumerate(unit_xy_list): if (idx == 0): obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) ]) else: obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy]) ]) obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_CONTROL_GROUP, [[_CONTROL_GROUP_SET], [group_id]]) ]) unit_xy_list = [] group_list.append(group_id) group_id += 1 if (len(unit_xy_list) >= 1): for idx, xy in enumerate(unit_xy_list): if (idx == 0): obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) ]) else: obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy]) ]) obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP, [[_CONTROL_GROUP_SET], [group_id]]) ]) group_list.append(group_id) group_id += 1 return obs reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if t % 1000 == 0: ActWrapper.save(ActWrapper, "mineral_shards.pkl") if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. 
update_param_noise_threshold = -np.log( 1. - exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] reset = False rew = 0 #select marines player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = player_relative + path_memory player = [] while (len(group_list) > 0): group_id = np.random.choice(group_list) obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_CONTROL_GROUP, [[_CONTROL_GROUP_RECALL], [group_id]]) ]) selected = obs[0].observation["screen"][_SELECTED] player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() if (len(player_y) > 0): player = [int(player_x.mean()), int(player_y.mean())] break else: group_list.remove(group_id) if (len(player) == 2): if (player[0] > 32): screen = shift(LEFT, player[0] - 32, screen) elif (player[0] < 32): screen = shift(RIGHT, 32 - player[0], screen) if (player[1] > 32): screen = shift(UP, player[1] - 32, screen) elif (player[1] < 32): screen = shift(DOWN, 32 - player[1], screen) coord = [player[0], player[1]] path_memory_ = np.array(path_memory, copy=True) if (action == 0): #UP if (player[1] >= 16): coord = [player[0], player[1] - 16] path_memory_[player[1] - 16:player[1], player[0]] = -1 elif (player[1] > 0): coord = [player[0], 0] path_memory_[0:player[1], player[0]] = -1 #else: # rew -= 1 elif (action == 1): #DOWN if (player[1] <= 47): coord = [player[0], player[1] + 16] path_memory_[player[1]:player[1] + 16, player[0]] = -1 elif (player[1] > 47): coord = [player[0], 63] path_memory_[player[1]:63, player[0]] = -1 #else: # rew -= 1 elif (action == 2): #LEFT if (player[0] >= 16): coord = [player[0] - 16, player[1]] path_memory_[player[1], player[0] - 16:player[0]] = -1 elif (player[0] < 16): coord = [0, player[1]] path_memory_[player[1], 0:player[0]] = -1 #else: # rew -= 1 elif (action == 3): #RIGHT if (player[0] <= 47): coord = [player[0] + 16, player[1]] path_memory_[player[1], player[0]:player[0] + 16] = -1 elif (player[0] > 47): coord = [63, player[1]] path_memory_[player[1], player[0]:63] = -1 path_memory = np.array(path_memory_) if _MOVE_SCREEN not in obs[0].observation["available_actions"]: for i in range(len(player_x)): xy = [player_x[i], player_y[i]] obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) ]) #obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] # else: # new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] new_screen = player_relative + path_memory selected = obs[0].observation["screen"][_SELECTED] player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() rew = obs[0].reward done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. 
replay_buffer.add(screen, action, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew #episode_minerals[-1] += obs[0].reward if done: obs = env.reset() player_relative = obs[0].observation["screen"][ _PLAYER_RELATIVE] screen = player_relative + path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 32): screen = shift(LEFT, player[0] - 32, screen) elif (player[0] < 32): screen = shift(RIGHT, 32 - player[0], screen) if (player[1] > 32): screen = shift(UP, player[1] - 32, screen) elif (player[1] < 32): screen = shift(DOWN, 32 - player[1], screen) # Select all marines first obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) for i in range(len(player_x)): xy = [player_x[i], player_y[i]] obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy]) ]) group_id = 0 group_list = [] unit_xy_list = [] for i in range(len(player_x)): if i % 4 != 0: continue if group_id > 2: break xy = [player_x[i], player_y[i]] unit_xy_list.append(xy) if (len(unit_xy_list) >= 1): for idx, xy in enumerate(unit_xy_list): if (idx == 0): obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_POINT, [[0], xy]) ]) else: obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_POINT, [[1], xy]) ]) obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_CONTROL_GROUP, [[_CONTROL_GROUP_SET], [group_id]]) ]) unit_xy_list = [] group_list.append(group_id) group_id += 1 if (len(unit_xy_list) >= 1): for idx, xy in enumerate(unit_xy_list): if (idx == 0): obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_POINT, [[0], xy]) ]) else: obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_POINT, [[1], xy]) ]) obs = env.step(actions=[ sc2_actions.FunctionCall( _SELECT_CONTROL_GROUP, [[_CONTROL_GROUP_SET], [group_id]]) ]) group_list.append(group_id) group_id += 1 episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) #mean_100ep_mineral = round(np.mean(episode_minerals[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) #logger.record_tabular("mean 100 episode mineral", mean_100ep_mineral) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act)
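# A hedged sketch of how the pysc2 trainer above might be invoked, in the spirit
# of the mineral-shards examples. The SC2Env construction is omitted because its
# keyword arguments differ across pysc2 versions; `env` is assumed to be an SC2Env
# running CollectMineralShards, and the hyperparameter values are illustrative.
# cnn_to_mlp is the standard baselines dueling CNN builder.
from baselines import deepq

model = deepq.models.cnn_to_mlp(
    convs=[(16, 8, 4), (32, 4, 2)],   # (filters, kernel, stride) per conv layer
    hiddens=[256],
    dueling=True,
)
act = learn(env,                      # assumed: pysc2 SC2Env on CollectMineralShards
            q_func=model,
            num_actions=4,            # up / down / left / right, as encoded above
            lr=5e-4,
            max_timesteps=2000000,
            buffer_size=100000,
            exploration_fraction=0.5,
            exploration_final_eps=0.01,
            train_freq=4,
            learning_starts=100000,
            target_network_update_freq=1000,
            gamma=0.99,
            prioritized_replay=True)
act.save("mineral_shards.pkl")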
def learn(env, network, seed=None, lr=5e-4, total_timesteps=1000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, save_path=None, **network_kwargs): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. """ logger = logging.getLogger() coloredlogs.install( level='DEBUG', fmt= '%(asctime)s,%(msecs)03d %(filename)s[%(process)d] %(levelname)s %(message)s' ) logger.setLevel(logging.DEBUG) # DATAVAULT: Set up list of action meanings and two lists to store episode # and total sums for each possible action in the list. 
action_names = env.unwrapped.get_action_meanings() action_episode_sums = [] action_total_sums = [] for x in range(len(action_names)): action_episode_sums.append(0) action_total_sums.append(0) # And obviously, you need a datavault item dv = DataVault() # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) #DATAVAULT: This is where you usually want to scrape data - in the timestep loop for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. 
- exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True # if environment is pacman, limit moves to four directions name = env.unwrapped.spec.id if name == "MsPacmanNoFrameskip-v4": while True: step_return = act(np.array(obs)[None], update_eps=update_eps, **kwargs) action = step_return[0][0] env_action = action q_values = np.squeeze(step_return[1]) # test for break condition if 1 <= action <= 4: break else: step_return = act(np.array(obs)[None], update_eps=update_eps, **kwargs) action = step_return[0][0] q_values = np.squeeze(step_return[1]) env_action = action reset = False new_obs, rew, done, info = env.step(env_action) # DATAVAULT: after each step, we push the information out to the datavault lives = env.ale.lives() #store_data(self, action, action_name, action_episode_sums, action_total_sums, reward, done, info, lives, q_values, observation, mean_reward): action_episode_sums, action_total_sums = dv.store_data( action, action_names[action], action_episode_sums, action_total_sums, rew, done, info, lives, q_values, new_obs, saved_mean_reward) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() if (len(episode_rewards[-101:-1]) > 0): mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) else: mean_100ep_reward = 0 num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) dv.make_dataframes() print("Save path is: ") print(save_path) # use parent dir to save data, so we can keep the current folder small and portable directory = os.path.abspath(os.path.join(save_path, os.pardir)) csv_path = os.path.join(directory, 'CSVs') os.mkdir(csv_path) dv.df_to_csv(csv_path) return act
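# Note on the MsPacman branch in the loop above: actions are restricted to the four
# movement actions (ALE indices 1-4, i.e. UP, RIGHT, LEFT, DOWN) by re-sampling
# act() until the chosen index falls in that range. Because the greedy choice is
# deterministic, the loop only exits quickly when an exploration step happens to
# draw a valid action, so it can iterate many times once update_eps is small.
# A tiny self-contained analogue of that rejection loop with a stand-in sampler
# (the rng and the 9-action count are illustrative, not from the original code):
import numpy as np

rng = np.random.default_rng(0)

def sample_action():
    # stand-in for act(np.array(obs)[None], ...)[0][0] on a 9-action ALE game
    return int(rng.integers(0, 9))

while True:
    action = sample_action()
    if 1 <= action <= 4:   # keep only the four movement actions
        break
print("chosen action:", action)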
with U.make_session(8):
    # Create all the functions necessary to train the model
    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=lambda name: tf.placeholder(tf.int32, [None, None], name=name),
        q_func=model,
        num_actions=s.action_space,
        optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
    )
    # Create the replay buffer
    replay_buffer = ReplayBuffer(50000)
    # Create the schedule for exploration starting from 1 (every action is random) down to
    # 0.02 (98% of actions are selected according to values predicted by the model).
    exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    for t in itertools.count(start=1):
        # Take action and update exploration to the newest value
        action = act(np.array(student_history)[None],
                     update_eps=exploration.value(t))[0]  # FIXME: shape (0, ) instead of (None, None)
        (correct, time_passed), reward, done = s.do_exercise(action)
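        # The excerpt is cut off here. A hedged sketch of how the rest of the loop
        # would typically proceed, mirroring the generic deepq loop used elsewhere in
        # this section. How `student_history` is extended with (action, correct,
        # time_passed) and how the tutoring environment is reset are assumptions
        # specific to this setup, not part of the original snippet.
        prev_history = np.array(student_history)
        student_history = update_history(student_history, action, correct, time_passed)  # assumed helper
        # Store transition in the replay buffer.
        replay_buffer.add(prev_history, action, reward, np.array(student_history), float(done))
        episode_rewards[-1] += reward
        if done:
            student_history = s.reset()   # assumed reset API
            episode_rewards.append(0.0)
        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if t > 1000:
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
            train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
        # Update target network periodically.
        if t % 500 == 0:
            update_target()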
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, **network_kwargs ): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) ############################## RL-S Prepare ############################################# # model saved name saved_name = "0817" ##### # Setup Training Record ##### save_new_data = False create_new_file = False create_new_file_rule = create_new_file save_new_data_rule = save_new_data create_new_file_RL = False save_new_data_RL = save_new_data create_new_file_replay_buffer = False save_new_data_replay_buffer = save_new_data is_training = False trajectory_buffer = deque(maxlen=20) if create_new_file_replay_buffer: if osp.exists("recorded_replay_buffer.txt"): os.remove("recorded_replay_buffer.txt") else: replay_buffer_dataset = np.loadtxt("recorded_replay_buffer.txt") for data in replay_buffer_dataset: obs, action, rew, new_obs, done = _extract_data(data) replay_buffer.add(obs, action, rew, new_obs, done) recorded_replay_buffer_outfile = open("recorded_replay_buffer.txt","a") recorded_replay_buffer_format = " ".join(("%f",)*31)+"\n" ##### # Setup Rule-based Record ##### create_new_file_rule = True # create state database if create_new_file_rule: if osp.exists("state_index_rule.dat"): os.remove("state_index_rule.dat") os.remove("state_index_rule.idx") if osp.exists("visited_state_rule.txt"): os.remove("visited_state_rule.txt") if osp.exists("visited_value_rule.txt"): os.remove("visited_value_rule.txt") visited_state_rule_value = [] visited_state_rule_counter = 0 else: visited_state_rule_value = np.loadtxt("visited_value_rule.txt") visited_state_rule_value = visited_state_rule_value.tolist() visited_state_rule_counter = len(visited_state_rule_value) visited_state_rule_outfile = open("visited_state_rule.txt", "a") visited_state_format = " ".join(("%f",)*14)+"\n" visited_value_rule_outfile = open("visited_value_rule.txt", "a") visited_value_format = " ".join(("%f",)*2)+"\n" visited_state_tree_prop = rindex.Property() visited_state_tree_prop.dimension = 14 visited_state_dist = np.array([[0.2, 2, 10, 0.2, 2, 10, 0.2, 2, 10, 0.2, 2, 10, 0.2, 2]]) visited_state_rule_tree = rindex.Index('state_index_rule',properties=visited_state_tree_prop) ##### # Setup RL-based Record ##### if create_new_file_RL: if osp.exists("state_index_RL.dat"): 
os.remove("state_index_RL.dat") os.remove("state_index_RL.idx") if osp.exists("visited_state_RL.txt"): os.remove("visited_state_RL.txt") if osp.exists("visited_value_RL.txt"): os.remove("visited_value_RL.txt") if create_new_file_RL: visited_state_RL_value = [] visited_state_RL_counter = 0 else: visited_state_RL_value = np.loadtxt("visited_value_RL.txt") visited_state_RL_value = visited_state_RL_value.tolist() visited_state_RL_counter = len(visited_state_RL_value) visited_state_RL_outfile = open("visited_state_RL.txt", "a") visited_state_format = " ".join(("%f",)*14)+"\n" visited_value_RL_outfile = open("visited_value_RL.txt", "a") visited_value_format = " ".join(("%f",)*2)+"\n" visited_state_tree_prop = rindex.Property() visited_state_tree_prop.dimension = 14 visited_state_dist = np.array([[0.2, 2, 10, 0.2, 2, 10, 0.2, 2, 10, 0.2, 2, 10, 0.2, 2]]) visited_state_RL_tree = rindex.Index('state_index_RL',properties=visited_state_tree_prop) ############################## RL-S Prepare End ############################################# # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action, q_function_cz = act(np.array(obs)[None], update_eps=update_eps, **kwargs) # RLS_action = generate_RLS_action(obs,q_function_cz,action,visited_state_rule_value, # visited_state_rule_tree,visited_state_RL_value, # visited_state_RL_tree,is_training) RLS_action = 0 env_action = RLS_action reset = False new_obs, rew, done, _ = env.step(env_action) ########### Record data in trajectory buffer and local file, but not in replay buffer ########### trajectory_buffer.append((obs, action, float(rew), new_obs, float(done))) # Store transition in the replay buffer. 
# replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew # safe driving is 1, collision is 0 while len(trajectory_buffer)>10: # if safe driving for 10(can be changed) steps, the state is regarded as safe obs_left, action_left, rew_left, new_obs_left, done_left = trajectory_buffer.popleft() # save this state in local replay buffer file if save_new_data_replay_buffer: recorded_data = _wrap_data(obs_left, action_left, rew_left, new_obs_left, done_left) recorded_replay_buffer_outfile.write(recorded_replay_buffer_format % tuple(recorded_data)) # put this state in replay buffer replay_buffer.add(obs_left[0], action_left, float(rew_left), new_obs_left[0], float(done_left)) action_to_record = action_left r_to_record = rew_left obs_to_record = obs_left # save this state in rule-based or RL-based visited state if action_left == 0: if save_new_data_rule: visited_state_rule_value.append([action_to_record,r_to_record]) visited_state_rule_tree.insert(visited_state_rule_counter, tuple((obs_to_record-visited_state_dist).tolist()[0]+(obs_to_record+visited_state_dist).tolist()[0])) visited_state_rule_outfile.write(visited_state_format % tuple(obs_to_record[0])) visited_value_rule_outfile.write(visited_value_format % tuple([action_to_record,r_to_record])) visited_state_rule_counter += 1 else: if save_new_data_RL: visited_state_RL_value.append([action_to_record,r_to_record]) visited_state_RL_tree.insert(visited_state_RL_counter, tuple((obs_to_record-visited_state_dist).tolist()[0]+(obs_to_record+visited_state_dist).tolist()[0])) visited_state_RL_outfile.write(visited_state_format % tuple(obs_to_record[0])) visited_value_RL_outfile.write(visited_value_format % tuple([action_to_record,r_to_record])) visited_state_RL_counter += 1 ################# Record data end ######################## if done: """ Get collision or out of multilane map """ ####### Record the trajectory data and add data in replay buffer ######### _, _, rew_right, _, _ = trajectory_buffer[-1] while len(trajectory_buffer)>0: obs_left, action_left, rew_left, new_obs_left, done_left = trajectory_buffer.popleft() action_to_record = action_left r_to_record = (rew_right-rew_left)*gamma**len(trajectory_buffer) + rew_left # record in local replay buffer file if save_new_data_replay_buffer: obs_to_record = obs_left recorded_data = _wrap_data(obs_left, action_left, r_to_record, new_obs_left, done_left) recorded_replay_buffer_outfile.write(recorded_replay_buffer_format % tuple(recorded_data)) # record in replay buffer for trainning replay_buffer.add(obs_left[0], action_left, float(r_to_record), new_obs_left[0], float(done_left)) # save visited rule/RL state data in local file if action_left == 0: if save_new_data_rule: visited_state_rule_value.append([action_to_record,r_to_record]) visited_state_rule_tree.insert(visited_state_rule_counter, tuple((obs_to_record-visited_state_dist).tolist()[0]+(obs_to_record+visited_state_dist).tolist()[0])) visited_state_rule_outfile.write(visited_state_format % tuple(obs_to_record[0])) visited_value_rule_outfile.write(visited_value_format % tuple([action_to_record,r_to_record])) visited_state_rule_counter += 1 else: if save_new_data_RL: visited_state_RL_value.append([action_to_record,r_to_record]) visited_state_RL_tree.insert(visited_state_RL_counter, tuple((obs_to_record-visited_state_dist).tolist()[0]+(obs_to_record+visited_state_dist).tolist()[0])) visited_state_RL_outfile.write(visited_state_format % tuple(obs_to_record[0])) 
visited_value_RL_outfile.write(visited_value_format % tuple([action_to_record,r_to_record])) visited_state_RL_counter += 1 ####### Recorded ##### obs = env.reset() episode_rewards.append(0.0) reset = True ############### Trainning Part Start ##################### if not is_training: # don't need to train the model continue if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward rew_str = str(mean_100ep_reward) path = osp.expanduser("~/models/carlaok_checkpoint/"+saved_name+"_"+rew_str) act.save(path) #### close the file #### visited_state_rule_outfile.close() visited_value_rule_outfile.close() recorded_replay_buffer_outfile.close() if not is_training: testing_record_outfile.close() #### close the file ### if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) load_variables(model_file) return act
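# Illustrative sketch of the trajectory relabeling used above. When an episode ends,
# the snippet pops the whole trajectory buffer and gives each step its own reward plus
# the terminal reward discounted by the number of steps remaining:
# r' = (r_T - r_t) * gamma**k + r_t. A standalone version (function name is mine):
from collections import deque

def relabel_on_done(trajectory, gamma):
    """trajectory: iterable of (obs, action, rew, new_obs, done), oldest first."""
    buf = deque(trajectory)
    _, _, rew_terminal, _, _ = buf[-1]
    relabeled = []
    while buf:
        obs, act, rew, new_obs, done = buf.popleft()
        rew_prime = (rew_terminal - rew) * gamma ** len(buf) + rew
        relabeled.append((obs, act, rew_prime, new_obs, done))
    return relabeled

# Toy usage: two safe steps followed by a collision reward of -1.
toy_traj = [("s0", 1, 1.0, "s1", 0.0), ("s1", 0, 1.0, "s2", 0.0), ("s2", 0, -1.0, "s3", 1.0)]
relabeled_traj = relabel_on_done(toy_traj, gamma=0.95)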
def learn(env, q_func, num_actions=4, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def agent(): """Run the agent, connecting to a (remote) host started independently.""" agent_module, agent_name = FLAGS.agent.rsplit(".", 1) agent_cls = getattr(importlib.import_module(agent_module), agent_name) with lan_sc2_env.LanSC2Env( host=FLAGS.host, config_port=FLAGS.config_port, race=sc2_env.Race[FLAGS.agent_race], step_mul=FLAGS.step_mul, realtime=FLAGS.realtime, agent_interface_format=sc2_env.parse_agent_interface_format( feature_screen=FLAGS.feature_screen_size, feature_minimap=FLAGS.feature_minimap_size, rgb_screen=FLAGS.rgb_screen_size, rgb_minimap=FLAGS.rgb_minimap_size, action_space=FLAGS.action_space, use_unit_counts=True, use_camera_position=True, show_cloaked=True, show_burrowed_shadows=True, show_placeholders=True, send_observation_proto=True, crop_to_playable_area=True, raw_crop_to_playable_area=True, allow_cheating_layers=True, add_cargo_to_units=True, use_feature_units=FLAGS.use_feature_units), visualize=FLAGS.render) as env: agents = [agent_cls()] logging.info("Connected, starting run_loop.") try: run_loop.run_loop(agents, env) except lan_sc2_env.RestartError: pass logging.info("Done.") def make_obs_ph(name): return BatchInput((1, 16, 16), name=name) act_x, train_x, update_target_x, debug_x = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq_x") act_y, train_y, update_target_y, debug_y = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq_y") act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer_x = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) replay_buffer_y = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule_x = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer_x = ReplayBuffer(buffer_size) replay_buffer_y = ReplayBuffer(buffer_size) beta_schedule_x = None beta_schedule_y = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. 
U.initialize() update_target_x() update_target_y() #time.sleep(30) # Stagger startups, otherwise tshey seem to conflict somehow episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() action_blacklist = ['0'] #function_id = numpy.random.choice(obs[0].observation.available_actions) #step forward a noop so units and prob appear obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE] screen = (player_relative == _PLAYER_NEUTRAL).astype(int) #+ path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [0, 0] reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join("model/", "nexus_wars") print(model_file) for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. - exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action_x = act_x(np.expand_dims(np.array(screen)[None], axis=0), update_eps=update_eps, **kwargs)[0] action_y = act_y(np.expand_dims(np.array(screen)[None], axis=0), update_eps=update_eps, **kwargs)[0] reset = False coord = [player[0], player[1]] rew = 0 coord = [action_x, action_y] observation_spec = env.observation_spec() action_spec = env.action_spec() #get available actions avail_actions_now = obs[0].observation.available_actions #ready for actions yet? 4 actions = nothing to do yet if len(avail_actions_now) > 5: #game state is ready for random action commands, get them and args function_id = numpy.random.choice( obs[0].observation.available_actions) args = [[numpy.random.randint(0, size) for size in arg.sizes] for arg in action_spec[0].functions[function_id].args] #issue random command and arg obs = env.step( actions=[sc2_actions.FunctionCall(function_id, args)]) #obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) else: #step no matter wat obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])]) player_relative = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] new_screen = (player_relative == _PLAYER_NEUTRAL).astype(int) player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() # resolve the cannot convert float NaN to integer issue if len(player_x) == 0: player_x = np.array([0]) if len(player_y) == 0: player_y = np.array([0]) player = [int(player_x.mean()), int(player_y.mean())] rew = obs[0].reward done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. 
replay_buffer_x.add(screen, action_x, rew, new_screen, float(done)) replay_buffer_y.add(screen, action_y, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew reward = episode_rewards[-1] if done: obs = env.reset() player_relative = obs[0].observation["feature_screen"][ _PLAYER_RELATIVE] screent = (player_relative == _PLAYER_NEUTRAL).astype(int) player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] # Select all marines first env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) episode_rewards.append(0.0) #episode_minerals.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience_x = replay_buffer_x.sample( batch_size, beta=beta_schedule_x.value(t)) (obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x, weights_x, batch_idxes_x) = experience_x experience_y = replay_buffer_y.sample( batch_size, beta=beta_schedule_y.value(t)) (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y else: obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x = replay_buffer_x.sample( batch_size) weights_x, batch_idxes_x = np.ones_like(rewards_x), None obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample( batch_size) weights_y, batch_idxes_y = np.ones_like(rewards_y), None td_errors_x = train_x(np.expand_dims(obses_t_x, axis=1), actions_x, rewards_x, np.expand_dims(obses_tp1_x, axis=1), dones_x, weights_x) td_errors_y = train_x(np.expand_dims(obses_t_y, axis=1), actions_y, rewards_y, np.expand_dims(obses_tp1_y, axis=1), dones_y, weights_y) if prioritized_replay: new_priorities_x = np.abs( td_errors_x) + prioritized_replay_eps new_priorities_y = np.abs( td_errors_y) + prioritized_replay_eps replay_buffer_x.update_priorities(batch_idxes_x, new_priorities_x) replay_buffer_y.update_priorities(batch_idxes_y, new_priorities_y) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target_x() update_target_y() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("reward", reward) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act_x), ActWrapper(act_y)
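# Illustrative numpy sketch, not the deepq.build_train graph. Each train_x / train_y
# call above minimizes the 1-step Bellman error; written out explicitly for a batch
# (helper name is mine):
import numpy as np

def one_step_td_errors(q_t, q_tp1, actions, rewards, dones, gamma):
    """q_t, q_tp1: (batch, num_actions) Q-values from the online and target nets."""
    q_t_selected = q_t[np.arange(len(actions)), actions]
    q_tp1_best = q_tp1.max(axis=1)
    targets = rewards + gamma * (1.0 - dones) * q_tp1_best  # no bootstrap past terminals
    return q_t_selected - targets

batch_q_t = np.random.randn(4, 3)
batch_q_tp1 = np.random.randn(4, 3)
errs = one_step_td_errors(batch_q_t, batch_q_tp1,
                          actions=np.array([0, 2, 1, 1]),
                          rewards=np.ones(4),
                          dones=np.array([0.0, 0.0, 1.0, 0.0]),
                          gamma=0.99)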
class DDPG(Algorithm): """DDPG algorithm.""" def __init__(self, logdir, env_fn, policy_fn, qf_fn, nenv=1, optimizer=torch.optim.Adam, buffer_size=10000, frame_stack=1, learning_starts=1000, update_period=1, batch_size=256, policy_lr=1e-4, qf_lr=1e-3, qf_weight_decay=0.01, gamma=0.99, noise_theta=0.15, noise_sigma=0.2, noise_sigma_final=0.01, noise_decay_period=10000, target_update_period=1, target_smoothing_coef=0.005, reward_scale=1, gpu=True, eval_num_episodes=1, record_num_episodes=1, log_period=1000): """Init.""" self.logdir = logdir self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts')) self.env_fn = env_fn self.nenv = nenv self.eval_num_episodes = eval_num_episodes self.record_num_episodes = record_num_episodes self.gamma = gamma self.buffer_size = buffer_size self.frame_stack = frame_stack self.learning_starts = learning_starts self.update_period = update_period self.batch_size = batch_size if target_update_period < self.update_period: self.target_update_period = self.update_period else: self.target_update_period = target_update_period - ( target_update_period % self.update_period) self.reward_scale = reward_scale self.target_smoothing_coef = target_smoothing_coef self.log_period = log_period self.device = torch.device( 'cuda:0' if gpu and torch.cuda.is_available() else 'cpu') self.t = 0 self.env = VecEpisodeLogger(env_fn(nenv=nenv)) self.policy_fn = policy_fn self.qf_fn = qf_fn eval_env = VecFrameStack(self.env, self.frame_stack) self.pi = policy_fn(eval_env) self.qf = qf_fn(eval_env) self.target_pi = policy_fn(eval_env) self.target_qf = qf_fn(eval_env) self.pi.to(self.device) self.qf.to(self.device) self.target_pi.to(self.device) self.target_qf.to(self.device) self.optimizer = optimizer self.policy_lr = policy_lr self.qf_lr = qf_lr self.qf_weight_decay = qf_weight_decay self.opt_pi = optimizer(self.pi.parameters(), lr=policy_lr) self.opt_qf = optimizer(self.qf.parameters(), lr=qf_lr, weight_decay=qf_weight_decay) self.target_pi.load_state_dict(self.pi.state_dict()) self.target_qf.load_state_dict(self.qf.state_dict()) self.noise_schedule = LinearSchedule(noise_decay_period, noise_sigma_final, noise_sigma) self._actor = DDPGActor(self.pi, self.env.action_space, noise_theta, self.noise_schedule.value(self.t)) self.buffer = ReplayBuffer(buffer_size, frame_stack) self.data_manager = ReplayBufferDataManager(self.buffer, self.env, self._actor, self.device, self.learning_starts, self.update_period) self.qf_criterion = torch.nn.MSELoss() if self.env.action_space.__class__.__name__ == 'Discrete': raise ValueError("Action space must be continuous!") self.low = torch.from_numpy(self.env.action_space.low).to(self.device) self.high = torch.from_numpy(self.env.action_space.high).to( self.device) def _norm_actions(self, ac): if self.low is not None and self.high is not None: return 2 * (ac - self.low) / (self.high - self.low) - 1.0 else: return ac def loss(self, batch): """Loss function.""" # compute QFunction loss. 
with torch.no_grad(): target_action = self.target_pi(batch['next_obs']).normed_action target_q = self.target_qf(batch['next_obs'], target_action).value qtarg = self.reward_scale * batch['reward'].float() + ( (1.0 - batch['done']) * self.gamma * target_q) q = self.qf(batch['obs'], self._norm_actions(batch['action'])).value assert qtarg.shape == q.shape qf_loss = self.qf_criterion(q, qtarg) # compute policy loss action = self.pi(batch['obs'], deterministic=True).normed_action q = self.qf(batch['obs'], action).value pi_loss = -q.mean() # log losses if self.t % self.log_period < self.update_period: logger.add_scalar('loss/qf', qf_loss, self.t, time.time()) logger.add_scalar('loss/pi', pi_loss, self.t, time.time()) return pi_loss, qf_loss def step(self): """Step optimization.""" self._actor.update_sigma(self.noise_schedule.value(self.t)) self.t += self.data_manager.step_until_update() if self.t % self.target_update_period == 0: soft_target_update(self.target_pi, self.pi, self.target_smoothing_coef) soft_target_update(self.target_qf, self.qf, self.target_smoothing_coef) if self.t % self.update_period == 0: batch = self.data_manager.sample(self.batch_size) pi_loss, qf_loss = self.loss(batch) # update self.opt_qf.zero_grad() qf_loss.backward() self.opt_qf.step() self.opt_pi.zero_grad() pi_loss.backward() self.opt_pi.step() return self.t def evaluate(self): """Evaluate.""" eval_env = VecFrameStack(self.env, self.frame_stack) self.pi.eval() misc.set_env_to_eval_mode(eval_env) # Eval policy os.makedirs(os.path.join(self.logdir, 'eval'), exist_ok=True) outfile = os.path.join(self.logdir, 'eval', self.ckptr.format.format(self.t) + '.json') stats = rl_evaluate(eval_env, self.pi, self.eval_num_episodes, outfile, self.device) logger.add_scalar('eval/mean_episode_reward', stats['mean_reward'], self.t, time.time()) logger.add_scalar('eval/mean_episode_length', stats['mean_length'], self.t, time.time()) # Record policy os.makedirs(os.path.join(self.logdir, 'video'), exist_ok=True) outfile = os.path.join(self.logdir, 'video', self.ckptr.format.format(self.t) + '.mp4') rl_record(eval_env, self.pi, self.record_num_episodes, outfile, self.device) self.pi.train() misc.set_env_to_train_mode(self.env) self.data_manager.manual_reset() def save(self): """Save.""" state_dict = { 'pi': self.pi.state_dict(), 'qf': self.qf.state_dict(), 'target_pi': self.target_pi.state_dict(), 'target_qf': self.target_qf.state_dict(), 'opt_pi': self.opt_pi.state_dict(), 'opt_qf': self.opt_qf.state_dict(), 'env': misc.env_state_dict(self.env), 't': self.t } buffer_dict = self.buffer.state_dict() state_dict['buffer_format'] = nest.get_structure(buffer_dict) self.ckptr.save(state_dict, self.t) # save buffer seperately and only once (because it can be huge) np.savez( os.path.join(self.ckptr.ckptdir, 'buffer.npz'), **{f'{i:04d}': x for i, x in enumerate(nest.flatten(buffer_dict))}) def load(self, t=None): """Load.""" state_dict = self.ckptr.load(t) if state_dict is None: self.t = 0 return self.t self.pi.load_state_dict(state_dict['pi']) self.qf.load_state_dict(state_dict['qf']) self.target_pi.load_state_dict(state_dict['target_pi']) self.target_qf.load_state_dict(state_dict['target_qf']) self.opt_pi.load_state_dict(state_dict['opt_pi']) self.opt_qf.load_state_dict(state_dict['opt_qf']) misc.env_load_state_dict(self.env, state_dict['env']) self.t = state_dict['t'] buffer_format = state_dict['buffer_format'] buffer_state = dict( np.load(os.path.join(self.ckptr.ckptdir, 'buffer.npz'))) buffer_state = nest.flatten(buffer_state) 
self.buffer.load_state_dict( nest.pack_sequence_as(buffer_state, buffer_format)) self.data_manager.manual_reset() return self.t def close(self): """Close environment.""" try: self.env.close() except Exception: pass
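# Illustrative sketch of the soft target update used by the DDPG class above:
# soft_target_update(target, online, tau) moves every target parameter a fraction tau
# of the way toward the online parameter each call. A minimal PyTorch version
# (function name is mine, not the library helper):
import torch

def soft_target_update_sketch(target_net, online_net, tau):
    with torch.no_grad():
        for tp, p in zip(target_net.parameters(), online_net.parameters()):
            tp.mul_(1.0 - tau).add_(tau * p)

online = torch.nn.Linear(4, 2)
target = torch.nn.Linear(4, 2)
target.load_state_dict(online.state_dict())   # start from identical weights
soft_target_update_sketch(target, online, tau=0.005)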
def learn(env, args): logger.configure('./rainbow_log', ['stdout', 'csv']) ob = env.reset() ob_shape = ob.shape num_action = int(env.action_space.n) agent = RainbowAgent(ob_shape, num_action, args) replay_buffer = PrioritizedReplayBuffer_NStep( args.buffer_size, alpha=args.prioritized_replay_alpha) args.prioritized_replay_beta_iters = args.max_timesteps beta_schedule = LinearSchedule(args.prioritized_replay_beta_iters, initial_p=args.prioritized_replay_beta0, final_p=1.0) episode_rewards = [0.0] saved_mean_reward = None n_step_seq = [] agent.sample_noise() agent.update_target() for t in range(args.max_timesteps): action = agent.act(ob) new_ob, rew, done, _ = env.step(action) # Append new step n_step_seq.append((ob, action, rew, new_ob, done)) ob = new_ob episode_rewards[-1] += rew if done or t % args.max_steps_per_episode == 0: ob = env.reset() episode_rewards.append(0.0) # Add to experience replay once collect enough steps if len(n_step_seq) >= args.nstep: replay_buffer.add(n_step_seq) n_step_seq = [] if t > args.learning_starts and t % args.replay_period == 0: # Replay experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(t)) (obs_n, actions_n, rewards_n, obs_next_n, dones_n, weights, batch_idxes) = experience # Update network kl_errors = agent.update(obs_n, actions_n, rewards_n, obs_next_n, dones_n, weights) agent.sample_noise() # Update priorities in buffer replay_buffer.update_priorities(batch_idxes, np.abs(kl_errors) + 1e-6) if t > args.learning_starts and t % args.target_network_update_freq == 0: # Update target periodically agent.update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and args.print_freq is not None and len( episode_rewards) % args.print_freq == 0: """
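# Illustrative sketch of an n-step return, as stored by the Rainbow agent above. The
# exact handling inside PrioritizedReplayBuffer_NStep may differ; this is only the
# textbook form: discount and sum the first n rewards, stopping the bootstrap at a
# terminal.
def n_step_return(rewards, dones, gamma):
    """rewards/dones from one stored n-step sequence, oldest first."""
    ret, discount = 0.0, 1.0
    for rew, done in zip(rewards, dones):
        ret += discount * rew
        if done:
            return ret, 0.0            # terminal: nothing to bootstrap from
        discount *= gamma
    return ret, discount               # multiply the bootstrap value by this

ret, bootstrap_coef = n_step_return([1.0, 0.0, 2.0], [False, False, False], gamma=0.99)
# ret == 1.0 + 0.99 * 0.0 + 0.99**2 * 2.0 and bootstrap_coef == 0.99**3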
def learn( # env flags env, raw_env, use_2D_env=True, use_multiple_starts=False, use_rich_reward=False, total_timesteps=100000, # dqn network=identity_fn, exploration_fraction=0.1, exploration_final_eps=0.02, # hr use_feedback=False, use_real_feedback=False, only_use_hr_until=int(1e3), trans_to_rl_in=int(2e4), good_feedback_acc=0.7, bad_feedback_acc=0.7, # dqn training lr=5e-4, batch_size=32, dqn_epochs=3, train_freq=1, target_network_update_freq=500, learning_starts=1000, param_noise=True, gamma=1.0, # hr training feedback_lr=1e-3, feedback_epochs=4, feedback_batch_size=16, feedback_minibatch_size=8, min_feedback_buffer_size=32, feedback_training_prop=0.7, feedback_training_new_prop=0.4, # replay buffer buffer_size=50000, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, # rslts saving and others checkpoint_freq=10000, checkpoint_path=None, print_freq=100, load_path=None, callback=None, seed=0, **network_kwargs): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. 
Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. """ # Create all the functions necessary to train the model # sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) hr_func = build_hr_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space observation_space.dtype = np.float32 def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train_rl, train_hr, evaluate_hr, update_target, debug = build_train( make_obs_ph=make_obs_ph, q_func=q_func, hr_func=hr_func, num_actions=env.action_space.n, rl_optimizer=tf.train.AdamOptimizer(learning_rate=lr), hr_optimizer=tf.train.AdamOptimizer(learning_rate=feedback_lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'hr_func': hr_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() obs, cor = obs['obs'], obs['nonviz_sensor'] reset = True if use_feedback and use_real_feedback: import pylsl print("looking for an EEG_Pred stream...", end="", flush=True) feedback_LSL_stream = pylsl.StreamInlet( pylsl.resolve_stream('type', 'EEG_Pred')[0]) print(" done") target_position = raw_env.robot.get_target_position() if use_2D_env: judge_action, *_ = run_dijkstra(raw_env, target_position) else: judge_action = judge_action_1D(raw_env, target_position) state_action_buffer = deque(maxlen=100) action_idx_buffer = deque(maxlen=100) feedback_buffer_train = [] feedback_buffer_valid = [] performance = {"feedback": [], "sparse_reward": [], "rich_reward": []} epi_feedback_test_num = 0 with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if use_feedback: update_rl_importance = (t - only_use_hr_until) / trans_to_rl_in update_rl_importance = np.clip(update_rl_importance, 0, 1) kwargs['update_rl_importance'] = update_rl_importance if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. 
# Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False raw_env.action_idx = t new_obs, rewards_dict, done, _ = env.step(env_action) new_obs, new_cor = new_obs['obs'], new_obs['nonviz_sensor'] sparse_reward = rewards_dict["sparse"] rich_reward = rewards_dict["rich"] rew = rich_reward if use_rich_reward else sparse_reward # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) state_action_buffer.append([obs, action]) action_idx_buffer.append(t) action_idxs, feedbacks, correct_feedbacks = \ get_simulated_feedback([cor] if use_2D_env else [obs], [action], [t], judge_action, good_feedback_acc, bad_feedback_acc) performance["feedback"].extend(correct_feedbacks) performance["sparse_reward"].append(sparse_reward) performance["rich_reward"].append(rich_reward) obs, cor = new_obs, new_cor if use_feedback: if use_real_feedback: feedbacks, action_idxs = get_feedback_from_LSL( feedback_LSL_stream) feedback_epi_buffer = [ state_action_buffer[action_idx_buffer.index(a_idx)] + [feedback] for a_idx, feedback in zip(action_idxs, feedbacks) ] # add feedbacks into feedback replay buffer if feedback_epi_buffer: epi_feedback_test_num += len(feedback_epi_buffer) * ( 1 - feedback_training_prop) epi_test_int = int(epi_feedback_test_num) epi_feedback_test_num -= epi_test_int epi_test_inds = np.random.choice(len(feedback_epi_buffer), epi_test_int, replace=False) epi_train_inds = [ ind for ind in range(len(feedback_epi_buffer)) if ind not in epi_test_inds ] feedback_buffer_train.extend( [feedback_epi_buffer[ind] for ind in epi_train_inds]) feedback_buffer_valid.extend( [feedback_epi_buffer[ind] for ind in epi_test_inds]) episode_rewards[-1] += rew if done: obs = env.reset() obs, cor = obs['obs'], obs['nonviz_sensor'] episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
for _ in range(dqn_epochs): if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train_rl(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs( td_errors) + prioritized_replay_eps replay_buffer.update_priorities( batch_idxes, new_priorities) # train feedback regressor if use_feedback and len( feedback_buffer_train ) >= min_feedback_buffer_size and t <= only_use_hr_until: for i in range(feedback_epochs): if i < feedback_epochs * feedback_training_new_prop: inds = np.arange( len(feedback_buffer_train) - feedback_batch_size, len(feedback_buffer_train)) else: inds = np.random.choice(len(feedback_buffer_train), feedback_batch_size, replace=False) np.random.shuffle(inds) for start in range(0, feedback_batch_size, feedback_minibatch_size): end = start + feedback_minibatch_size obses = np.asarray([ feedback_buffer_train[idx][0] for idx in inds[start:end] ]) actions = np.asarray([ feedback_buffer_train[idx][1] for idx in inds[start:end] ]) feedbacks = np.asarray([ feedback_buffer_train[idx][2] for idx in inds[start:end] ]) pred, loss = train_hr(obses, actions, feedbacks) obs_train = np.asarray( [feedback[0] for feedback in feedback_buffer_train]) actions_train = np.asarray( [feedback[1] for feedback in feedback_buffer_train]) feedbacks_train = np.asarray( [feedback[2] for feedback in feedback_buffer_train]) obs_valid = np.asarray( [feedback[0] for feedback in feedback_buffer_valid]) actions_valid = np.asarray( [feedback[1] for feedback in feedback_buffer_valid]) feedbacks_valid = np.asarray( [feedback[2] for feedback in feedback_buffer_valid]) train_acc, train_loss = evaluate_hr(obs_train, actions_train, feedbacks_train) valid_acc, valid_loss = evaluate_hr(obs_valid, actions_valid, feedbacks_valid) print( "HR: train acc {:>4.2f}, loss {:>5.2f}; valid acc {:>4.2f}, loss {:>5.2f}" .format(train_acc, train_loss, valid_acc, valid_loss)) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) return act, performance
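# Illustrative sketch of the simulated human feedback above. get_simulated_feedback
# compares the taken action against an oracle policy and reports the label with
# accuracy good_feedback_acc / bad_feedback_acc; its exact implementation is not shown
# here, so this standalone version (name and signature are mine) only captures the
# idea for a single step:
import numpy as np

def simulate_feedback(taken_action, oracle_action, good_acc, bad_acc, rng=np.random):
    correct = 1.0 if taken_action == oracle_action else 0.0
    keep_prob = good_acc if correct else bad_acc
    return correct if rng.rand() < keep_prob else 1.0 - correct  # maybe flip the label

feedback = simulate_feedback(taken_action=2, oracle_action=2, good_acc=0.7, bad_acc=0.7)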
dist_params={ #'Vmin': args.vmin, #'Vmax': args.vmax, 'nb_atoms': args.nb_atoms }) approximate_num_iters = args.num_steps / 4 exploration = PiecewiseSchedule([(0, 1.0), (approximate_num_iters / 50, 0.1), (approximate_num_iters / 5, 0.01)], outside_value=0.01) if args.prioritized: replay_buffer = PrioritizedReplayBuffer(args.replay_buffer_size, args.prioritized_alpha) beta_schedule = LinearSchedule(approximate_num_iters, initial_p=args.prioritized_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(args.replay_buffer_size) U.initialize() update_target() num_iters = 0 # Load the model state = maybe_load_model(savedir, container) if state is not None: num_iters, replay_buffer = state["num_iters"], state["replay_buffer"] monitored_env.set_state(state["monitor_state"])
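# Illustrative sketch that mirrors, but does not reproduce, baselines' PiecewiseSchedule:
# the exploration schedule above interpolates linearly between (timestep, value)
# endpoints and returns outside_value beyond the last endpoint.
def piecewise_value(t, endpoints, outside_value):
    for (l_t, l_v), (r_t, r_v) in zip(endpoints[:-1], endpoints[1:]):
        if l_t <= t < r_t:
            alpha = (t - l_t) / float(r_t - l_t)
            return l_v + alpha * (r_v - l_v)
    return outside_value

eps_at_start = piecewise_value(0, [(0, 1.0), (1000, 0.1), (5000, 0.01)], outside_value=0.01)
eps_at_end = piecewise_value(10000, [(0, 1.0), (1000, 0.1), (5000, 0.01)], outside_value=0.01)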
def learn(env, network, seed=None, pool=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_initial_eps=1.0, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=100, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, experiment_name='unnamed', load_path=None, **network_kwargs): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. experiment_name: str name of the experiment (default: trial) load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=exploration_initial_eps, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() reward_shaper = ActionAdviceRewardShaper('../completed-observations') reward_shaper.load() full_exp_name = '{}-{}'.format(date.today().isoformat(), experiment_name) experiment_dir = os.path.join('experiments', full_exp_name) if not os.path.exists(experiment_dir): os.makedirs(experiment_dir) summary_dir = os.path.join(experiment_dir, 'summaries') os.makedirs(summary_dir, exist_ok=True) summary_writer = tf.summary.FileWriter(summary_dir) checkpoint_dir = os.path.join(experiment_dir, 'checkpoints') os.makedirs(checkpoint_dir, exist_ok=True) with tempfile.TemporaryDirectory() as td: td = checkpoint_dir or td os.makedirs(td, exist_ok=True) model_file = os.path.join(td, "best_model") model_saved = False saved_mean_reward = None if os.path.exists(model_file): print('Model is loading') load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) episode_rewards = [] update_step_t = 0 while update_step_t < total_timesteps: # Reset the environment obs = env.reset() obs = StatePreprocessor.process(obs) episode_rewards.append(0.0) reset = True done = False # Sample the episode until it is completed act_step_t = update_step_t while not done: if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(act_step_t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(act_step_t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
- exploration.value(act_step_t) + exploration.value(act_step_t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True biases = reward_shaper.get_action_potentials(obs) action = act(np.array(obs)[None], biases, update_eps=update_eps, **kwargs)[0] reset = False pairs = env.step(action) action, (new_obs, rew, done, _) = pairs[-1] # Write down the real reward but learn from normalized version episode_rewards[-1] += rew rew = np.sign(rew) * np.log(1 + np.abs(rew)) new_obs = StatePreprocessor.process(new_obs) logger.log('{}/{} obs {} action {}'.format( act_step_t, total_timesteps, obs, action)) act_step_t += 1 if len(new_obs) == 0: done = True else: replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs # Post episode logging summary = tf.Summary(value=[ tf.Summary.Value(tag="rewards", simple_value=episode_rewards[-1]) ]) summary_writer.add_summary(summary, act_step_t) summary = tf.Summary( value=[tf.Summary.Value(tag="eps", simple_value=update_eps)]) summary_writer.add_summary(summary, act_step_t) summary = tf.Summary(value=[ tf.Summary.Value(tag="episode_steps", simple_value=act_step_t - update_step_t) ]) summary_writer.add_summary(summary, act_step_t) mean_5ep_reward = round(np.mean(episode_rewards[-5:]), 1) num_episodes = len(episode_rewards) if print_freq is not None and num_episodes % print_freq == 0: logger.record_tabular("steps", act_step_t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 5 episode reward", mean_5ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(act_step_t))) logger.dump_tabular() # Do the learning start = time.time() while update_step_t < min(act_step_t, total_timesteps): if update_step_t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(update_step_t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None biases_t = pool.map(reward_shaper.get_action_potentials, obses_t) biases_tp1 = pool.map(reward_shaper.get_action_potentials, obses_tp1) td_errors, weighted_error = train(obses_t, biases_t, actions, rewards, obses_tp1, biases_tp1, dones, weights) # Loss logging summary = tf.Summary(value=[ tf.Summary.Value(tag='weighted_error', simple_value=weighted_error) ]) summary_writer.add_summary(summary, update_step_t) if prioritized_replay: new_priorities = np.abs( td_errors) + prioritized_replay_eps replay_buffer.update_priorities( batch_idxes, new_priorities) if update_step_t % target_network_update_freq == 0: # Update target network periodically. 
update_target() update_step_t += 1 stop = time.time() logger.log("Learning took {:.2f} seconds".format(stop - start)) if checkpoint_freq is not None and num_episodes % checkpoint_freq == 0: # Periodically save the model and the replay buffer rec_model_file = os.path.join( td, "model_{}_{:.2f}".format(num_episodes, mean_5ep_reward)) save_variables(rec_model_file) buffer_file = os.path.join( td, "buffer_{}_{}".format(num_episodes, update_step_t)) with open(buffer_file, 'wb') as foutput: cloudpickle.dump(replay_buffer, foutput) # Check whether it is best if saved_mean_reward is None or mean_5ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_5ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_5ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) return act
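The variant above threads per-state action potentials (the `biases` returned by ActionAdviceRewardShaper.get_action_potentials) into both action selection and the training op. As an illustration only, here is a minimal numpy sketch of one way such potentials could bias greedy action selection, assuming the potentials are simply added to the predicted Q-values; the actual graph built by this project's build_train may combine them differently.

import numpy as np

def biased_greedy_action(q_values, action_potentials, epsilon, rng=np.random):
    """Pick an action from Q-values biased by advice potentials.

    q_values:          shape (num_actions,) network predictions
    action_potentials: shape (num_actions,) advice scores (assumed additive)
    epsilon:           probability of taking a uniformly random action
    """
    if rng.rand() < epsilon:
        return rng.randint(len(q_values))
    # Assumption: potentials are added to the Q-values before the argmax.
    return int(np.argmax(q_values + action_potentials))

# Example: the advice pushes the agent toward action 2 despite lower Q-value.
q = np.array([0.1, 0.5, 0.2])
pot = np.array([0.0, 0.0, 0.6])
print(biased_greedy_action(q, pot, epsilon=0.05))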
class DoublePrioritizedReplayBuffer(ReplayBuffer): def __init__(self, size, alpha, epsilon, timesteps, initial_p, final_p): super(DoublePrioritizedReplayBuffer, self).__init__(size) assert alpha > 0 self._alpha = alpha self._epsilon = epsilon self._beta_schedule = LinearSchedule(timesteps, initial_p=initial_p, final_p=final_p) it_capacity = 1 while it_capacity < size: it_capacity *= 2 self._it_sum = SumSegmentTree(it_capacity) self._it_min = MinSegmentTree(it_capacity) self._max_priority = 1.0 self._it_sum2 = SumSegmentTree(it_capacity) self._it_min2 = MinSegmentTree(it_capacity) self._max_priority2 = 1.0 def add(self, *args, **kwargs): idx = self._next_idx super().add(*args, **kwargs) self._it_sum[idx] = self._max_priority ** self._alpha self._it_min[idx] = self._max_priority ** self._alpha self._it_sum2[idx] = self._max_priority2 ** self._alpha self._it_min2[idx] = self._max_priority2 ** self._alpha def _sample_proportional(self, batch_size): res = [] for _ in range(batch_size): mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1) idx = self._it_sum.find_prefixsum_idx(mass) res.append(idx) return res def _sample_proportional2(self, batch_size): res = [] for _ in range(batch_size): mass = random.random() * self._it_sum2.sum(0, len(self._storage) - 1) idx = self._it_sum2.find_prefixsum_idx(mass) res.append(idx) return res def sample(self, batch_size, time_step): beta = self._beta_schedule.value(time_step) assert beta > 0 idxes = self._sample_proportional(batch_size) self.idxes = idxes # keep to update priorities later weights = [] p_min = self._it_min.min() / self._it_sum.sum() max_weight = (p_min * len(self._storage)) ** (-beta) for idx in idxes: p_sample = self._it_sum[idx] / self._it_sum.sum() weight = (p_sample * len(self._storage)) ** (-beta) weights.append(weight / max_weight) weights = np.array(weights) encoded_sample = self._encode_sample(idxes) return encoded_sample + (weights,) def sample_qmap(self, batch_size, time_step, n_steps=1): beta = self._beta_schedule.value(time_step) assert beta > 0 idxes = self._sample_proportional2(batch_size) self.idxes2 = idxes # keep to update priorities later weights = [] p_min = self._it_min2.min() / self._it_sum2.sum() max_weight = (p_min * len(self._storage)) ** (-beta) for idx in idxes: p_sample = self._it_sum2[idx] / self._it_sum2.sum() weight = (p_sample * len(self._storage)) ** (-beta) weights.append(weight / max_weight) weights = np.array(weights) encoded_sample = self._encode_qmap_sample(idxes, n_steps) return encoded_sample + (weights,) def update_priorities(self, td_errors): priorities = np.abs(td_errors) + self._epsilon idxes = self.idxes assert len(idxes) == len(priorities) for idx, priority in zip(idxes, priorities): assert priority > 0 assert 0 <= idx < len(self._storage) self._it_sum[idx] = priority ** self._alpha self._it_min[idx] = priority ** self._alpha self._max_priority = max(self._max_priority, priority) def update_priorities_qmap(self, td_errors): priorities = np.abs(td_errors) + self._epsilon idxes = self.idxes2 assert len(idxes) == len(priorities) for idx, priority in zip(idxes, priorities): assert priority > 0 assert 0 <= idx < len(self._storage) self._it_sum2[idx] = priority ** self._alpha self._it_min2[idx] = priority ** self._alpha self._max_priority2 = max(self._max_priority2, priority)
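The two sample methods above compute standard proportional-prioritization importance weights: P(i) is the transition's priority mass, and each weight is (N * P(i))^-beta normalized by the largest weight in the buffer. A small self-contained sketch of the same formula with plain numpy (no segment trees), useful for sanity-checking the normalization:

import numpy as np

def importance_weights(priorities, beta, alpha=0.6):
    """Importance-sampling weights for proportional prioritized replay.

    priorities: per-transition priorities (e.g. |td_error| + eps)
    beta:       bias-correction exponent annealed toward 1.0
    alpha:      prioritization exponent applied to the raw priorities
    """
    priorities = np.asarray(priorities, dtype=np.float64) ** alpha
    probs = priorities / priorities.sum()      # P(i)
    n = len(priorities)
    weights = (n * probs) ** (-beta)           # (N * P(i))^-beta
    return weights / weights.max()             # normalize by the max weight, as the buffer does

print(importance_weights([1.0, 0.5, 2.0, 0.1], beta=0.4))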
def startTraining(): # Create the environment print('START ENV', RC.GB_CLIENT_ID(), RC.gbRobotHandle()) env = RobotOperationEnvironment(RC.GB_CLIENT_ID(), RC.GB_CSERVER_ROBOT_ID, RC.gbRobotHandle()) #print('ACTION_SPACE', env.action_space.shape) # Create all the functions necessary to train the model act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: BatchInput(env.observation_space.shape, name=name), q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), ) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] obs = env.reset() print("Manipulator DEEPQ Training Experiment Start.") for t in itertools.count(): print('Episode ', len(episode_rewards), 'Step ', t, '--------------') print('Start waiting for the next action', env._robot.getOperationState()) while (env._robot.getOperationState() != RC.CROBOT_STATE_READY): time.sleep(0.01) # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] print('Generated action:', action) new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 if is_solved: # Show off the result #env.render() pass else: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( 32) print('Generated actions:', actions) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if t % 1000 == 0: update_target() if done and len(episode_rewards) % 10 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1)) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular()
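This loop, like the others, relies on LinearSchedule to anneal epsilon from 1.0 (every action random) down to the final value over a fixed number of steps and hold it there. A minimal re-implementation matching that documented behaviour; the real class lives in baselines.common.schedules:

class SimpleLinearSchedule:
    """Linearly interpolate from initial_p to final_p over schedule_timesteps,
    then stay at final_p for all later steps."""

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# After 10000 steps the schedule reaches final_p and stays there.
sched = SimpleLinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
print(sched.value(0), sched.value(5000), sched.value(20000))  # 1.0, 0.51, 0.02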
def learn(env, q_func, num_actions=3, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None, demo_replay=[]): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((64, 64), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # Select all marines first #env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])]) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = player_relative obs = common.init(env, player_relative, obs) group_id = 0 reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") First = True for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
- exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True # custom process for DefeatZerglingsAndBanelings obs, screen, player = common.select_marine(env, obs) action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] reset = False rew = 0 new_action = None obs, new_action = common.marine_action(env, obs, player, action) army_count = env._obs.observation.player_common.army_count #print(army_count) #print(env._obs.observation.player_common.idle_worker_count) try: if army_count > 0 and _ATTACK_SCREEN in obs[0].observation[ "available_actions"]: obs = env.step(actions=new_action) else: new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) except Exception as e: #print(e) 1 # Do nothing player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] new_screen = player_relative rew += obs[0].reward game_info = sc_pb.ResponseGameInfo obs_tuple = features.Features(screen_size_px=(256, 256), minimap_size_px=(256, 256), hide_specific_actions=True) test = obs_tuple.transform_obs( env._obs.observation)["multi_select"] #test1 = test["screen"][_UNIT_TYPE] #env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])]) if First: with open('output.txt', 'wb') as abc: np.savetxt(abc, test, delimiter=",") #print(test2) #for value in test2: #print(str(value)) #test = obs_tuple.transform_obs(obs=obs) #if First: #with open('output.txt', 'w') as f: #for tt in test: #f.write(' '.join(str(s) for s in tt) + '\n') First = False done = obs[0].step_type == environment.StepType.LAST # selected = obs[0].observation["screen"][_SELECT_ARMY] # if First: # print('hello') # for value in selected: # print(value) # First=False player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() if (len(player_y) > 0): player = [int(player_x.mean()), int(player_y.mean())] if (len(player) == 2): if (player[0] > 32): new_screen = common.shift(LEFT, player[0] - 32, new_screen) elif (player[0] < 32): new_screen = common.shift(RIGHT, 32 - player[0], new_screen) if (player[1] > 32): new_screen = common.shift(UP, player[1] - 32, new_screen) elif (player[1] < 32): new_screen = common.shift(DOWN, 32 - player[1], new_screen) # Store transition in the replay buffer. replay_buffer.add(screen, action, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew if done: print("Episode Reward : %s" % episode_rewards[-1]) obs = env.reset() player_relative = obs[0].observation["screen"][ _PLAYER_RELATIVE] screen = player_relative group_list = common.init(env, player_relative, obs) # Select all marines first #env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])]) episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act)
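Before storing a transition, the loop above recenters the feature layer on the mean position of the friendly units via the project's common.shift helper. A hedged numpy sketch of that centering idea, assuming a zero-padded shift and a 64x64 screen (the real helper's signature and padding may differ):

import numpy as np

_PLAYER_FRIENDLY = 1  # assumed id of friendly units in the player_relative layer

def shift_layer(layer, dy, dx):
    """Shift a 2D layer by (dy, dx); positive dy moves content down,
    positive dx moves it right. Exposed borders are zero-padded."""
    h, w = layer.shape
    out = np.zeros_like(layer)
    out[max(0, dy):min(h, h + dy), max(0, dx):min(w, w + dx)] = \
        layer[max(0, -dy):min(h, h - dy), max(0, -dx):min(w, w - dx)]
    return out

def center_on_friendlies(player_relative, size=64):
    """Shift the layer so the friendly-unit centroid sits at the screen center."""
    ys, xs = (player_relative == _PLAYER_FRIENDLY).nonzero()
    if len(ys) == 0:
        return player_relative
    cy, cx = int(ys.mean()), int(xs.mean())
    return shift_layer(player_relative, size // 2 - cy, size // 2 - cx)

# Example: a single friendly unit at (10, 20) ends up near (32, 32).
layer = np.zeros((64, 64), dtype=int)
layer[10, 20] = _PLAYER_FRIENDLY
print(center_on_friendlies(layer).nonzero())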
def main(): print('main') stats_file = pathlib.Path('stats.csv') if stats_file.exists(): stats_file.unlink() broker = dqn.env.Broker('http://localhost:5000') env = dqn.env.HaliteEnv(broker) with U.make_session(num_cpu=4): observation_shape = env.observation_space.shape def make_obs_ph(name): import dqn.tf_util as U return U.BatchInput(observation_shape, name=name) # Create all the functions necessary to train the model act, train, update_target, debug = dqn.graph.build_train( make_obs_ph=make_obs_ph, q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), ) act = dqn.play.ActWrapper( act, { 'make_obs_ph': make_obs_ph, 'q_func': model, 'num_actions': env.action_space.n, }) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=30000, initial_p=1.0, final_p=0.03) # Initialize the parameters and copy them to the target network. U.initialize() update_target() learning_starts = 1000 target_network_update_freq = 500 checkpoint_freq = 20 episode_rewards = [0.0] wins = [False] saved_mean_reward = None obs = env.reset() for t in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, info = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) wins.append(info['win']) win_rate = round(np.mean(wins[-100:]), 4) is_solved = t > 100 and win_rate >= 99 if is_solved: print('solved') break else: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > learning_starts: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( 32) actions = np.argmax(actions, axis=1) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if t > learning_starts and t % target_network_update_freq == 0: update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 4) num_episodes = len(episode_rewards) exploration_rate = int(100 * exploration.value(t)) if done: info = { 'date': str(dt.datetime.now()), 'episode': len(episode_rewards), **info, 'win_rate': win_rate, 'mean_100ep_reward': mean_100ep_reward, 'exploration_rate': exploration_rate, } print('episode', info) if not stats_file.exists(): with stats_file.open('w') as fp: fp.write(','.join(info.keys()) + '\n') with stats_file.open('a') as fp: fp.write(','.join(map(str, info.values())) + '\n') if done and num_episodes % 10 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean episode reward", mean_100ep_reward) logger.record_tabular("mean win rate", win_rate) logger.record_tabular("% time spent exploring", exploration_rate) logger.dump_tabular() if done and (t > learning_starts and num_episodes > 100 and num_episodes % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: logger.log( "Saving model due to mean reward increase: {} -> {}". format(saved_mean_reward, mean_100ep_reward)) act.save('dqn_model.pkl') saved_mean_reward = mean_100ep_reward act.save('dqn_model.pkl') env.close()
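The Halite loop above appends one stats row per episode to stats.csv by hand, writing the header only when the file does not exist yet. A hedged equivalent using csv.DictWriter, assuming every episode produces the same set of keys (the manual header logic above relies on the same assumption):

import csv
import pathlib

def append_stats_row(stats_file: pathlib.Path, row: dict) -> None:
    """Append one dict of episode stats to a CSV, writing the header on first use."""
    new_file = not stats_file.exists()
    with stats_file.open('a', newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames=list(row.keys()))
        if new_file:
            writer.writeheader()
        writer.writerow(row)

append_stats_row(pathlib.Path('stats.csv'),
                 {'episode': 1, 'win_rate': 0.37, 'mean_100ep_reward': 12.5})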
def learn(self): with U.make_session(8): # Create the environment env = gym.make(self._args.env) # Create all the functions necessary to train the model act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: ObservationInput( env.observation_space, name=name), q_func=self.model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer( learning_rate=self._args.learning_rate), ) # Create the replay buffer replay_buffer = ReplayBuffer(self._args.replay_buffer_size) # Create the schedule for exploration starting from 1 till min_exploration_rate. exploration = LinearSchedule( schedule_timesteps=self._args.exploration_duration, initial_p=1.0, final_p=self._args.min_exploration_rate) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for t in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0) mean_episode_reward = np.mean(episode_rewards[-101:-1]) # Show learned agent: if mean_episode_reward >= self._render_reward_threshold: env.render() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if t > 1000: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( 32) train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) # Update target network periodically. if t % 1000 == 0: update_target() if done and len(episode_rewards) % 10 == 0: self._reward_buffer_mutex.acquire() self._reward_buffer.append(mean_episode_reward) logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean episode reward", round(mean_episode_reward, 1)) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() self._reward_buffer_changed = True self._reward_buffer_mutex.release()
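Every snippet calls train(obses_t, actions, rewards, obses_tp1, dones, weights) to "minimize the error in Bellman's equation". A numpy sketch of the quantity being minimized for one batch, assuming a plain one-step target network; the real graph in baselines additionally supports double Q-learning, gradient clipping, and importance weights on the squared error:

import numpy as np

def td_errors(q_online, q_target, actions, rewards, dones, gamma=0.99):
    """One-step TD errors for a batch.

    q_online: (batch, num_actions) Q(s_t, .) from the online network
    q_target: (batch, num_actions) Q(s_{t+1}, .) from the target network
    """
    batch = np.arange(len(actions))
    q_taken = q_online[batch, actions]                     # Q(s_t, a_t)
    bootstrap = (1.0 - dones) * gamma * q_target.max(axis=1)
    targets = rewards + bootstrap                          # r + gamma * max_a' Q'(s', a')
    return q_taken - targets                               # weights scale the squared error

q_online = np.array([[1.0, 2.0], [0.5, 0.1]])
q_target = np.array([[0.0, 3.0], [1.0, 1.0]])
print(td_errors(q_online, q_target, actions=np.array([1, 0]),
                rewards=np.array([1.0, 0.0]), dones=np.array([0.0, 1.0])))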
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, temperature, prioritization, env_name, alpha, beta0, beta_iters, eps, max_timesteps, rank_method, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] self.prioritization = prioritization self.env_name = env_name self.temperature = temperature self.rank_method = rank_method # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None,) stage_shapes['w'] = (None,) self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. 
buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key]) for key, val in input_shapes.items()} buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T+1, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size if self.prioritization == 'energy': self.buffer = ReplayBufferEnergy(buffer_shapes, buffer_size, self.T, self.sample_transitions, self.prioritization, self.env_name) elif self.prioritization == 'tderror': self.buffer = PrioritizedReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions, alpha, self.env_name) if beta_iters is None: beta_iters = max_timesteps self.beta_schedule = LinearSchedule(beta_iters, initial_p=beta0, final_p=1.0) else: self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def get_td_errors(self, o, g, u): o, g = self._preprocess_og(o, g, g) vals = [self.td_error_tf] r = np.ones((o.reshape(-1, self.dimo).shape[0],1)) feed = { self.target.o_tf: o.reshape(-1, self.dimo), self.target.g_tf: g.reshape(-1, self.dimg), self.bath_tf_r: r, self.main.o_tf: o.reshape(-1, self.dimo), self.main.g_tf: g.reshape(-1, self.dimg), self.main.u_tf: u.reshape(-1, self.dimu) } td_errors = self.sess.run(vals, feed_dict=feed) td_errors = td_errors.copy() return td_errors def store_episode(self, episode_batch, dump_buffer, w_potential, w_linear, w_rotational, rank_method, clip_energy, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ if self.prioritization == 'tderror': self.buffer.store_episode(episode_batch, dump_buffer) elif self.prioritization == 'energy': self.buffer.store_episode(episode_batch, w_potential, w_linear, w_rotational, rank_method, clip_energy) else: self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch(episode_batch) if self.prioritization == 'energy': if not self.buffer.current_size==0 and not len(episode_batch['ag'])==0: transitions = self.sample_transitions(episode_batch, num_normalizing_transitions, 
'none', 1.0, True) elif self.prioritization == 'tderror': transitions, weights, episode_idxs = \ self.sample_transitions(self.buffer, episode_batch, num_normalizing_transitions, beta=0) else: transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def dump_buffer(self, epoch): self.buffer.dump_buffer(epoch) def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! critic_loss, actor_loss, Q_grad, pi_grad, td_error = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf, self.td_error_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad, td_error def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self, t): if self.prioritization == 'energy': transitions = self.buffer.sample(self.batch_size, self.rank_method, temperature=self.temperature) weights = np.ones_like(transitions['r']).copy() elif self.prioritization == 'tderror': transitions, weights, idxs = self.buffer.sample(self.batch_size, beta=self.beta_schedule.value(t)) else: transitions = self.buffer.sample(self.batch_size) weights = np.ones_like(transitions['r']).copy() o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) transitions['w'] = weights.flatten().copy() # note: ordered dict transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] if self.prioritization == 'tderror': return (transitions_batch, idxs) else: return transitions_batch def stage_batch(self, t, batch=None): # if batch is None: if self.prioritization == 'tderror': batch, idxs = self.sample_batch(t) else: batch = self.sample_batch(t) assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) if self.prioritization == 'tderror': return idxs def train(self, t, dump_buffer, stage=True): if not self.buffer.current_size==0: if stage: if self.prioritization == 'tderror': idxs = self.stage_batch(t) else: self.stage_batch(t) critic_loss, actor_loss, Q_grad, pi_grad, td_error = self._grads() if self.prioritization == 'tderror': new_priorities = np.abs(td_error) + self.eps # td_error if dump_buffer: T = self.buffer.buffers['u'].shape[1] episode_idxs = idxs // T t_samples = idxs % T batch_size = td_error.shape[0] with self.buffer.lock: for i in range(batch_size): self.buffer.buffers['td'][episode_idxs[i]][t_samples[i]] = td_error[i] self.buffer.update_priorities(idxs, new_priorities) self._update(Q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = 
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) batch_tf['w'] = tf.reshape(batch_tf['w'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.td_error_tf = tf.stop_gradient(target_tf) - self.main.Q_tf self.errors_tf = tf.square(self.td_error_tf) self.errors_tf = tf.reduce_mean(batch_tf['w'] * self.errors_tf) self.Q_loss_tf = tf.reduce_mean(self.errors_tf) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix != '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = ['_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic'] state = {k: v for k, v in self.__dict__.items() if all(subname not in k for subname in excluded_subnames)} state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None state['env_name'] = None # No need for playing the policy self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert len(vars) == len(state["tf"]) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node)
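Unlike the DQN snippets, which hard-copy the online weights into the target network every target_network_update_freq steps, the DDPG class above builds a Polyak-averaged target update op (target <- polyak * target + (1 - polyak) * main). A numpy sketch of the same rule applied to flat parameter arrays:

import numpy as np

def polyak_update(target_params, main_params, polyak=0.95):
    """Soft target update: target <- polyak * target + (1 - polyak) * main."""
    return [polyak * t + (1.0 - polyak) * m
            for t, m in zip(target_params, main_params)]

target = [np.zeros(3)]
main = [np.ones(3)]
for _ in range(10):
    target = polyak_update(target, main, polyak=0.9)
print(target[0])  # slowly approaches the online parameters (about 0.65 after 10 updates)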
def learn(env, q_func, num_actions=4, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U2.BatchInput((16, 16), name=name) act_x, train_x, update_target_x, debug_x = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq_x") act_y, train_y, update_target_y, debug_y = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq_y") act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer_x = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) replay_buffer_y = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule_x = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer_x = ReplayBuffer(buffer_size) replay_buffer_y = ReplayBuffer(buffer_size) beta_schedule_x = None beta_schedule_y = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target_x() update_target_y() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # Select all marines first obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = (player_relative == _PLAYER_NEUTRAL).astype(int) #+ path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join("model/", "mineral_shards") print(model_file) for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
- exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action_x = act_x(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] reset = False rew = 0 coord = [action_x, action_y] if _MOVE_SCREEN not in obs[0].observation["available_actions"]: obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] obs = env.step(actions=new_action) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] new_screen = (player_relative == _PLAYER_NEUTRAL).astype(int) player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] rew = obs[0].reward done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. replay_buffer_x.add(screen, action_x, rew, new_screen, float(done)) replay_buffer_y.add(screen, action_y, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew reward = episode_rewards[-1] if done: obs = env.reset() player_relative = obs[0].observation["screen"][ _PLAYER_RELATIVE] screen = (player_relative == _PLAYER_NEUTRAL).astype(int) player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] # Select all marines first env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience_x = replay_buffer_x.sample( batch_size, beta=beta_schedule_x.value(t)) (obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x, weights_x, batch_idxes_x) = experience_x experience_y = replay_buffer_y.sample( batch_size, beta=beta_schedule_y.value(t)) (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y else: obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x = replay_buffer_x.sample( batch_size) weights_x, batch_idxes_x = np.ones_like(rewards_x), None obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample( batch_size) weights_y, batch_idxes_y = np.ones_like(rewards_y), None td_errors_x = train_x(obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x, weights_x) td_errors_y = train_y(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y) if prioritized_replay: new_priorities_x = np.abs( td_errors_x) + prioritized_replay_eps new_priorities_y = np.abs( td_errors_y) + prioritized_replay_eps replay_buffer_x.update_priorities(batch_idxes_x, new_priorities_x) replay_buffer_y.update_priorities(batch_idxes_y, new_priorities_y) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target_x() update_target_y() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("reward", reward) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U2.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U2.load_state(model_file) return ActWrapper(act_x), ActWrapper(act_y)
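This variant factorizes the 2-D screen coordinate into two independent Q-heads, one per axis, each with its own replay buffer, schedule, and target network. A small sketch of the selection step, assuming each head outputs one Q-value per coordinate along its axis; the payoff is that the joint action space of width * height pixels is never enumerated:

import numpy as np

def select_coordinate(q_x, q_y, epsilon, rng=np.random):
    """Pick an (x, y) screen target from two per-axis Q-value vectors."""
    num_x, num_y = len(q_x), len(q_y)
    if rng.rand() < epsilon:
        return rng.randint(num_x), rng.randint(num_y)
    # Each axis is chosen greedily and independently of the other.
    return int(np.argmax(q_x)), int(np.argmax(q_y))

x, y = select_coordinate(np.random.randn(16), np.random.randn(16), epsilon=0.1)
print(x, y)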
def learn(env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, callback=None): """Train a deepq model. Parameters ------- env : gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput(env.observation_space.shape, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)
    return ActWrapper(act, act_params)
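The prioritized branch of this training loop samples with a beta that is annealed towards 1.0 and scales the loss by importance weights before updating priorities with |TD error| + prioritized_replay_eps. A minimal NumPy sketch of that weighting, using a toy stand-in for LinearSchedule and made-up priorities:

import numpy as np

class ToyLinearSchedule:
    # Stand-in for LinearSchedule: interpolate from initial_p to final_p over schedule_timesteps.
    def __init__(self, schedule_timesteps, initial_p, final_p):
        self.schedule_timesteps, self.initial_p, self.final_p = schedule_timesteps, initial_p, final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

beta_schedule = ToyLinearSchedule(schedule_timesteps=100000, initial_p=0.4, final_p=1.0)

# Hypothetical priorities for five stored transitions, i.e. |TD error| + prioritized_replay_eps.
priorities = np.array([0.5, 0.1, 2.0, 0.8, 0.3])
alpha = 0.6
probs = priorities ** alpha
probs /= probs.sum()

for t in [0, 50000, 100000]:
    beta = beta_schedule.value(t)
    weights = (len(priorities) * probs) ** (-beta)    # importance-sampling correction
    weights /= weights.max()                          # normalize so the largest weight is 1
    print(t, round(beta, 2), np.round(weights, 3))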
if __name__ == '__main__':
    with U.make_session(num_cpu=1):
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()

        reward_list = []  # list for saving sum of reward to file
        episode_rewards = [0.0]
        obs = env.reset()
        for t in itertools.count():
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            reward_list.append(rew)  # append reward to list
            obs = new_obs
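The reward_list and episode_rewards bookkeeping in the script above is intended to be written to a file. One possible way to do that is sketched below; the filename and CSV layout are illustrative, not taken from the original script.

import csv

def save_rewards(reward_list, episode_rewards, path="rewards.csv"):
    # Dump per-step rewards and per-episode reward sums to a CSV file.
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["step", "reward"])
        for step, rew in enumerate(reward_list):
            writer.writerow([step, rew])
        writer.writerow([])
        writer.writerow(["episode", "sum_of_rewards"])
        for episode, ep_rew in enumerate(episode_rewards):
            writer.writerow([episode, ep_rew])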
    gamma=0.99,
    grad_norm_clipping=10,
    double_q=args.double_q,
    param_noise=args.param_noise
)

approximate_num_iters = args.num_steps / 4
exploration = PiecewiseSchedule([
    (0, 1.0),
    (approximate_num_iters / 50, 0.1),
    (approximate_num_iters / 5, 0.01)
], outside_value=0.01)

if args.prioritized:
    replay_buffer = PrioritizedReplayBuffer(args.replay_buffer_size, args.prioritized_alpha)
    beta_schedule = LinearSchedule(approximate_num_iters, initial_p=args.prioritized_beta0, final_p=1.0)
else:
    replay_buffer = ReplayBuffer(args.replay_buffer_size)

U.initialize()
update_target()
num_iters = 0

# Load the model
state = maybe_load_model(savedir, container)
if state is not None:
    num_iters, replay_buffer = state["num_iters"], state["replay_buffer"]
    monitored_env.set_state(state["monitor_state"])

start_time, start_steps = None, None
steps_per_iter = RunningAvg(0.999)
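For intuition about the piecewise exploration schedule above, here is a short sketch with a minimal stand-in for PiecewiseSchedule (linear interpolation between the listed endpoints, constant outside them); num_steps = 2e7 is an assumed value for args.num_steps.

def piecewise(endpoints, outside_value):
    # Minimal stand-in: linear interpolation between (t, value) endpoints.
    def value(t):
        for (l_t, l_v), (r_t, r_v) in zip(endpoints[:-1], endpoints[1:]):
            if l_t <= t < r_t:
                frac = (t - l_t) / (r_t - l_t)
                return l_v + frac * (r_v - l_v)
        return outside_value
    return value

num_steps = 2e7                       # assumed value for args.num_steps
approximate_num_iters = num_steps / 4
eps = piecewise([(0, 1.0),
                 (approximate_num_iters / 50, 0.1),
                 (approximate_num_iters / 5, 0.01)],
                outside_value=0.01)

for t in [0, 50000, 100000, 1000000, 10000000]:
    print(t, round(eps(t), 4))        # 1.0, 0.55, 0.1, 0.01, 0.01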
dueling = True
layer_norm = True
activation_fn = tf.nn.elu

# Q-map
if args.qmap:
    q_map_model = ConvDeconvMap(
        convs=[(32, 8, 2), (32, 6, 2), (64, 4, 2)],
        middle_hiddens=[1024],
        deconvs=[(64, 4, 2), (32, 6, 2), (env.action_space.n, 4, 1)],
        coords_shape=coords_shape,
        dueling=dueling,
        layer_norm=layer_norm,
        activation_fn=activation_fn
    )
    q_map_random_schedule = LinearSchedule(schedule_timesteps=n_steps, initial_p=0.1, final_p=0.05)
else:
    q_map_model = None
    q_map_random_schedule = None

# DQN
if args.dqn:
    dqn_model = ConvMlp(
        convs=[(32, 8, 2), (32, 6, 2), (32, 4, 2)],
        hiddens=[1024],
        dueling=True,
    )
    exploration_schedule = LinearSchedule(schedule_timesteps=n_steps, initial_p=1.0, final_p=0.05)
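Each (filters, kernel_size, stride) tuple above defines one convolutional layer. The sketch below traces the feature-map sizes such a stack would produce under 'same' padding for an assumed 64x64 observation; the actual input resolution and padding depend on the environment wrapper and on the ConvDeconvMap/ConvMlp implementations.

import math

def trace_conv_stack(input_hw, convs):
    # With 'same' padding the spatial size after each layer only depends on the stride.
    h, w = input_hw
    for i, (filters, kernel, stride) in enumerate(convs):
        h, w = math.ceil(h / stride), math.ceil(w / stride)
        print("conv%d: %dx%dx%d (kernel %d, stride %d)" % (i, h, w, filters, kernel, stride))

# Assumed 64x64 input, using the Q-map encoder specification from above.
trace_conv_stack((64, 64), [(32, 8, 2), (32, 6, 2), (64, 4, 2)])
# conv0: 32x32x32, conv1: 16x16x32, conv2: 8x8x64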
def learn(env,
          q_func,
          isKfac=False,
          kfac_paras=None,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    isKfac: bool
        if True the K-FAC optimizer is used instead of Adam
    kfac_paras: dict
        hyperparameters passed to the K-FAC optimizer
        (momentum, clip_kl, epsilon, stats_decay, cold_iter, fisher_metric)
    lr: float
        learning rate for the optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version
        at the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return U.BatchInput(observation_space_shape, name=name) if isKfac: from baselines.deepq.kfac import KfacOptimizer optimizer = KfacOptimizer(learning_rate=lr, momentum=kfac_paras['momentum'], clip_kl=kfac_paras['clip_kl'], kfac_update=1, epsilon=kfac_paras['epsilon'], stats_decay=kfac_paras['stats_decay'], async=1, cold_iter=kfac_paras['cold_iter']) act, train, update_target, debug, queue_runner = deepq.build_train( make_obs_ph=make_obs_ph, # lambda name: U.Uint8Input(env.observation_space.shape, name=name), isKfac=True, fisher_metric=kfac_paras['fisher_metric'], q_func=q_func, num_actions=env.action_space.n, optimizer=optimizer, gamma=0.99, grad_norm_clipping=10, ) else: act, train, update_target, debug, _ = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() if isKfac: enqueue_threads = queue_runner.create_threads(sess, coord=tf.train.Coordinator(), start=True) reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") import time start = time.time() for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] if isinstance(env.action_space, gym.spaces.MultiBinary): env_action = np.zeros(env.action_space.n) env_action[action] = 1 else: env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. 
replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.record_tabular("fps", int(t * 1.0 / (time.time() - start))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) U.load_state(model_file) return act
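A hedged usage sketch for the K-FAC variant above: the kfac_paras keys mirror those consumed in the function body, but the numeric values, the environment, and the deepq.models.mlp helper are assumptions, not settings from the original experiments.

import gym
from baselines import deepq

def main():
    env = gym.make("CartPole-v0")
    q_func = deepq.models.mlp([64])          # small fully-connected Q-network
    kfac_paras = {                           # illustrative hyperparameters only
        'momentum': 0.9,
        'clip_kl': 0.001,
        'epsilon': 0.01,
        'stats_decay': 0.99,
        'cold_iter': 10,
        'fisher_metric': 'gn',
    }
    act = learn(env, q_func, isKfac=True, kfac_paras=kfac_paras,
                lr=1e-3, max_timesteps=100000, print_freq=10)
    act.save("cartpole_kfac_model.pkl")      # ActWrapper can be saved to disk

if __name__ == '__main__':
    main()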
def learn(env,
          p_dist_func,
          lr=5e-4,
          eps=0.0003125,
          max_timesteps=100000,
          buffer_size=50000,
          exp_t1=1e6,
          exp_p1=0.1,
          exp_t2=25e6,
          exp_p2=0.01,
          # exploration_fraction=0.1,
          # exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=0.95,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          callback=None,
          dist_params=None,
          n_action=None,
          action_map=None):
    """Train a distdeepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    p_dist_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    eps: float
        epsilon for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exp_t1, exp_p1, exp_t2, exp_p2: float
        breakpoints and values of the piecewise exploration schedule: epsilon starts
        at 1.0, reaches exp_p1 at step exp_t1 and exp_p2 at step exp_t2, then stays at exp_p2.
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version
        at the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    param_noise: bool
        whether or not to use parameter space noise
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.
    dist_params: dict
        parameters of the return distribution model (required)
    n_action: int
        number of discrete actions
    action_map: array-like
        maps a discrete action index to the value passed to env.step()

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/distdeepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = make_session(num_cpu=num_cpu) sess.__enter__() logger.configure(dir=os.path.join( '.', datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))) # logger.configure() def make_obs_ph(name): return ObservationInput(env.observation_space, name=name) if dist_params is None: raise ValueError('dist_params is required') # z, dz = build_z(**dist_params) act, train, update_target, debug = distdeepq_mog.build_train( make_obs_ph=make_obs_ph, p_dist_func=p_dist_func, # num_actions=env.action_space.n, n_action=n_action, optimizer=tf.train.AdamOptimizer(learning_rate=lr, epsilon=eps), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise, dist_params=dist_params) act_params = { 'make_obs_ph': make_obs_ph, 'p_dist_func': p_dist_func, 'num_actions': n_action, 'dist_params': dist_params } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. #exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), # initial_p=1.0, # final_p=exploration_final_eps) # exploration = PiecewiseSchedule([(0, 1.0),(max_timesteps/25, 0.1), # (max_timesteps, 0.01)], outside_value=0.01) exploration = PiecewiseSchedule([(0, 1.0), (exp_t1, exp_p1), (exp_t2, exp_p2)], outside_value=exp_p2) # Initialize the parameters and copy them to the target network. U.initialize() update_target() avg_success_list = deque(maxlen=100) avg_collision_list = deque(maxlen=100) avg_derail_list = deque(maxlen=100) episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] reset = False action_val = action_map[action] new_obs, rew, done, info = env.step(action_val) # env.render() # rew = rew-1 for proposed loss with new metric # rew = rew-1 # Store transition in the replay buffer. 
replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) if info == 1: avg_success_list.append(1.0) avg_collision_list.append(0.0) avg_derail_list.append(0.0) elif info == -1: avg_success_list.append(0.0) avg_collision_list.append(1.0) avg_derail_list.append(0.0) elif info == -2: avg_success_list.append(0.0) avg_collision_list.append(0.0) avg_derail_list.append(1.0) else: avg_success_list.append(0.0) avg_collision_list.append(0.0) avg_derail_list.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) # debug['pi'] = tf.Print(debug['pi'], [debug['pi'], "target pi"]) # tf.Print(debug['mu'], [debug['mu'], "target mu"]) # tf.Print(debug['sigma'], [debug['sigma'], "target sigma"]) logger.record_tabular("Success rate", np.mean(avg_success_list)) logger.record_tabular("Collision rate", np.mean(avg_collision_list)) logger.record_tabular("Derailment rate", np.mean(avg_derail_list)) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act, act_params)
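The distdeepq loop above selects a discrete action index and converts it with action_map before stepping the environment. A small sketch of how such a map might be built for a one-dimensional continuous control, purely as an illustration (the original action space and bounds are not shown in this excerpt):

import numpy as np

n_action = 11
max_u = 1.0                                   # assumed action bound
action_map = np.linspace(-max_u, max_u, n_action)

action = 7                                    # index chosen by the greedy/exploratory policy
action_val = action_map[action]               # value actually passed to env.step()
print(action, action_val)                     # index 7 maps to 0.4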