def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
          log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
          trust_region=True, alpha=0.99, delta=1):
    print("Running Acer Simple")
    print(locals())
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps,
                  nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
                  trust_region=trust_region, alpha=alpha, delta=delta)

    runner = Runner(env=env, model=model, nsteps=nsteps, nstack=nstack)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size)
    else:
        buffer = None
    nbatch = nenvs * nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()
    for acer.steps in range(0, total_timesteps, nbatch):  # nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    env.close()
def __init__(self, env, nsteps, nstack, size):
    self.env = env
    self.nsteps = nsteps
    self.nstack = nstack
    self.size = size
    self.buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=size)
    self.file_dir = None
    self.flag = 3
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
          log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
          trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs):
    '''
    Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf)
    Train an agent with given network architecture on a given environment using ACER.

    Parameters:
    ----------

    network:            policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small,
                        conv_only - see baselines.common/models.py for the full list) specifying a standard network
                        architecture, or a function that takes a tensorflow tensor as input and returns a tuple
                        (output_tensor, extra_feed), where output_tensor is the last network layer output and
                        extra_feed is None for feed-forward nets or a dictionary describing how to feed state into
                        the network for recurrent nets. See baselines.common/policies.py/lstm for more details on
                        using recurrent nets in policies.

    env:                environment. Needs to be vectorized for parallel environment simulation.
                        The environments produced by gym.make can be wrapped using the
                        baselines.common.vec_env.DummyVecEnv class.

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is
                        nsteps * nenv, where nenv is the number of environment copies simulated in parallel)
                        (default: 20)

    nstack:             int, size of the frame stack, i.e. the number of frames passed to the step model.
                        Frames are stacked along the channel dimension (last image dimension) (default: 4)

    total_timesteps:    int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

    q_coef:             float, value function loss coefficient in the optimization objective (analog of vf_coef
                        for other actor-critic methods)

    ent_coef:           float, policy entropy coefficient in the optimization objective (default: 0.01)

    max_grad_norm:      float, gradient norm clipping coefficient. If set to None, no clipping (default: 10)

    lr:                 float, learning rate for RMSProp (the current implementation has RMSProp hardcoded in)
                        (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1]
                        that takes the fraction of training progress as input and returns the fraction of the
                        learning rate (specified as lr) as output.

    rprop_epsilon:      float, RMSProp epsilon (stabilizes the square root computation in the denominator of the
                        RMSProp update) (default: 1e-5)

    rprop_alpha:        float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting factor (default: 0.99)

    log_interval:       int, number of updates between logging events (default: 100)

    buffer_size:        int, size of the replay buffer (default: 50k)

    replay_ratio:       int, how many (on average) batches of data to sample from the replay buffer after each
                        batch is collected from the environment (default: 4)

    replay_start:       int, sampling from the replay buffer does not start until the replay buffer has at least
                        that many samples (default: 10k)

    c:                  float, importance weight clipping factor (default: 10)

    trust_region:       bool, whether or not the algorithm estimates the KL divergence between the old and updated
                        policy and uses it to determine the step size (default: True)

    delta:              float, max KL divergence between the old policy and the updated policy (default: 1)

    alpha:              float, momentum factor in the Polyak (exponential moving average) averaging of the model
                        parameters (default: 0.99)

    load_path:          str, path to load the model from (default: None)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                        and the arguments to a particular type of network. For instance, the 'mlp' network
                        architecture has arguments num_hidden and num_layers.

    '''

    print("Running Acer Simple")
    print(locals())
    set_global_seeds(seed)

    if not isinstance(env, VecFrameStack):
        env = VecFrameStack(env, 1)

    policy = build_policy(env, network, estimate_q=True, **network_kwargs)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space

    nstack = env.nstack
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps,
                  ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, max_grad_norm=max_grad_norm,
                  lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
                  trust_region=trust_region, alpha=alpha, delta=delta)

    runner = Runner(env=env, model=model, nsteps=nsteps)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size)
    else:
        buffer = None
    nbatch = nenvs * nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()

    for acer.steps in range(0, total_timesteps, nbatch):  # nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    return model
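# Illustrative sketch (not part of the original source): the docstring above states that lrschedule
# may be a function mapping a training-progress fraction in [0, 1] to a learning-rate fraction in
# [0, 1]. A custom schedule can therefore be a plain Python callable; the call below is commented
# out because it assumes a vectorized environment `venv` built elsewhere.
def sqrt_decay_schedule(progress):
    # Assumes the baselines-style convention where `progress` is the elapsed fraction of training
    # (the built-in 'linear' schedule returns 1 - progress); this variant decays more slowly.
    return (1.0 - progress) ** 0.5

# model = learn(network='cnn', env=venv, lrschedule=sqrt_decay_schedule)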
def learn(policy, env, flags):
    """
    :param policy:
    :param baselines.common.vec_env.VecEnv env:
    :param baselines.acer.flags.AcerFlags flags:
    """
    print("Running Acer Simple")
    print(flags)

    flags.total_timesteps = int(flags.total_timesteps)

    # disable gpu before creating any tensor
    if not flags.use_gpu:
        tf_util.disable_gpu()

    tf.reset_default_graph()
    set_global_seeds(flags.seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, num_procs=nenvs, flags=flags)

    runner = Runner(env=env, model=model, nsteps=flags.nsteps, nstack=flags.nstack)
    if flags.replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=flags.nsteps, nstack=flags.nstack, size=flags.buffer_size)
    else:
        buffer = None
    nbatch = nenvs * flags.nsteps
    acer = Acer(runner, model, buffer, flags.log_interval, flags.stats_interval)

    saver = tf.train.Saver(max_to_keep=3, keep_checkpoint_every_n_hours=flags.permanent_save_hours)
    checkpoint_dir = os.path.join(flags.save_dir, 'checkpoints')
    checkpoint_path = os.path.join(checkpoint_dir, 'model')
    os.makedirs(checkpoint_dir, exist_ok=True)

    # load checkpoint
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint: {}".format(latest_checkpoint))
        saver.restore(model.sess, latest_checkpoint)
        start_steps = model.GSwrapper.get(model.sess)
        if hasattr(env, 'restore_state'):
            env.restore_state(checkpoint_dir, start_steps)
    else:
        start_steps = 0

    coordinator = tf.train.Coordinator()

    def signal_handler(sig, frame):
        if not coordinator.should_stop():
            coordinator.request_stop()
            print("Stopping training...")
        else:
            print("Stop already requested, please wait...")

    signal.signal(signal.SIGINT, signal_handler)
    print("Press CTRL+C to stop")

    acer.tstart = time.time()
    for acer.steps in range(start_steps, flags.total_timesteps, nbatch):
        # on policy training
        acer.call(on_policy=True)

        # off policy training
        if flags.replay_ratio > 0 and buffer.has_atleast(flags.replay_start):
            n = np.random.poisson(flags.replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

        # saving
        do_save = (((acer.steps // nbatch) + 1) % flags.save_interval == 0) or coordinator.should_stop()
        if do_save:
            save_steps = acer.steps + nbatch
            print("Saving at t=%s" % save_steps)
            model.GSwrapper.set(model.sess, save_steps)
            saver.save(model.sess, save_path=checkpoint_path, global_step=save_steps)
            if hasattr(env, 'save_state'):
                env.save_state(checkpoint_dir, save_steps)

        if coordinator.should_stop():
            break

    env.close()
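# Illustrative sketch (assumption, not from the original source): the attributes this flags-based
# learn() variant reads from its `flags` argument. The real object is a baselines.acer.flags.AcerFlags
# instance; a SimpleNamespace with the same fields is shown here only to document them, the values are
# placeholders, and Model() may read further fields not listed.
from types import SimpleNamespace

example_flags = SimpleNamespace(
    seed=0,
    use_gpu=True,
    total_timesteps=int(80e6),
    nsteps=20,
    nstack=4,
    replay_ratio=4,
    replay_start=10000,
    buffer_size=50000,
    log_interval=100,
    stats_interval=1000,
    save_dir='./acer_run',
    save_interval=1000,
    permanent_save_hours=4,
)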
def learn(policy, env, seed, env_id, learn_time, expert_buffer_size, perform=False, use_expert=False,
          save_networks=False, network_saving_dir=None, total_timesteps=int(80e6), nsteps=20, nstack=4,
          q_coef=0.5, ent_coef=0.01, max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5,
          rprop_alpha=0.99, gamma=0.99, log_interval=10, buffer_size=50000, replay_ratio=4,
          replay_start=10000, c=10.0, trust_region=True, alpha=0.99, delta=1):
    print(locals())
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space  # Box(84, 84, 1)
    ac_space = env.action_space  # Discrete(4)

    if use_expert:
        expert = Expert(env=env, nsteps=nsteps, nstack=nstack, size=expert_buffer_size)  # Exp1: 50000; Exp2: 25000; Exp3: 10000
        expert_dir = os.path.join('./expert') + '/expert.pkl'
        file_dir = '/home/zhangxiaoqin/Projects/conda/atari_v1/'
        # expert.load_file_human(file_dir)
        expert.load_file(expert_dir)
    else:
        expert = None

    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps,
                  nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c, trust_region=trust_region,
                  alpha=alpha, delta=delta, network_saving_dir=network_saving_dir,
                  use_expert=use_expert, expert=expert)

    runner = Runner(env=env, model=model, nsteps=nsteps, nstack=nstack, env_id=env_id)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size)
    else:
        buffer = None

    if perform:
        model.load()

    nbatch = nenvs * nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()
    for acer.steps in range(0, total_timesteps, nbatch):  # nbatch samples, 1 on_policy call and multiple off-policy calls
        # once learn_time steps have elapsed, stop using the expert demonstrations and train normally
        if acer.steps > learn_time and use_expert:
            print('-------------------------')
            print('Reuse the normal networks')
            print('-------------------------')
            use_expert = False
            expert = None

        acer.call(perform, save_networks, use_expert, expert, on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start) and not perform:
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(perform, save_networks, use_expert, expert, on_policy=False)  # no simulation steps in this

    # dir = os.path.join('./models/', 'test.m')
    # model.save('./models/test_2.pkl')

    env.close()
class Expert:
    def __init__(self, env, nsteps, nstack, size):
        self.env = env
        self.nsteps = nsteps
        self.nstack = nstack
        self.size = size
        self.buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=size)
        self.file_dir = None
        self.flag = 3

    def load_file(self, file_dir):
        # load pickled expert transitions and push them into the replay buffer
        self.file_dir = file_dir
        expert_file = open(self.file_dir, 'rb')
        expert_data = pickle.load(expert_file)
        expert_file.close()
        for step_sample in expert_data:
            # print('----------')
            # print(step_sample[5].shape)
            # print('----------')
            self.buffer.put(step_sample[0], step_sample[1], step_sample[2],
                            step_sample[3], step_sample[4], step_sample[5])
            # if self.flag > 0:
            #     print(self.flag, '**************************************')
            #     print(step_sample[0], step_sample[1], step_sample[2], step_sample[3], step_sample[4], step_sample[5])
            #     self.flag = self.flag - 1
        del expert_data
        gc.collect()

    def update_obs(self, obs, dones=None):
        if dones is not None:
            self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None]
        self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
        self.obs[:, :, :, -self.nc:] = obs[:, :, :, :]

    def load_file_human(self, file_dir='/home/zhangxiaoqin/atari_v1/'):
        # NOTE: actions/mus/states/rewards/dones below are expected to come from a policy step;
        # the corresponding calls are commented out, so this loader is incomplete as written.
        import agc.dataset as ds
        import agc.util as util
        import cv2

        env_name = 'spaceinvaders'
        nsteps = 20
        next_file_point = 1
        file_point = np.arange(16, dtype=int)
        frame_point = np.zeros((16), dtype=int)  # f_p[0][0] first_line->file_num, sec_line->frame_num
        dataset = ds.AtariDataset(file_dir)
        all_trajectories = dataset.trajectories
        screenshoot_dir = os.path.join(file_dir, 'screens/spaceinvaders')
        flag = 1

        # pick the first 16 trajectory ids that exist in the dataset
        i = 1
        k = 0
        while k < 16:
            if i in dataset.trajectories['spaceinvaders']:
                file_point[k] = i
                k = k + 1
            i = i + 1

        init_obs = np.zeros((16, 84, 84, 4), dtype=np.uint8)
        enc_obs = np.split(init_obs, 4, axis=3)  # so now list of obs steps
        mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], []

        while flag:
            for _ in range(nsteps):
                # actions, mus, states = self.model.step(self.obs, state=self.states, mask=self.dones)
                obs = np.zeros([16, 84, 84, 1], dtype=np.uint8)
                for i in np.arange(16):
                    pic_path = os.path.join(screenshoot_dir, str(file_point[i]), str(frame_point[i])) + '.png'
                    pic = cv2.imread(pic_path)
                    pic = cv2.cvtColor(pic, cv2.COLOR_RGB2GRAY)
                    pic = cv2.resize(pic, (84, 84), interpolation=cv2.INTER_AREA)
                    obs[i, :, :, :] = pic[:, :, None]
                    if frame_point[i] < all_trajectories['spaceinvaders'][file_point[i]][-1]['frame']:
                        frame_point[i] = frame_point[i] + 1
                    else:
                        # trajectory exhausted: restart frame counter and move on to the next valid trajectory
                        frame_point[i] = 0
                        file_point[i] = next_file_point
                        next_file_point = next_file_point + 1
                        while next_file_point not in dataset.trajectories['spaceinvaders'] and next_file_point <= 514:
                            next_file_point = next_file_point + 1
                        if next_file_point > 514:
                            flag = False

                mb_obs.append(np.copy(self.obs))
                mb_actions.append(actions)
                mb_mus.append(mus)
                mb_dones.append(self.dones)
                # obs, rewards, dones, _ = self.env.step(actions)
                # env.render()
                # aa, bb, cc, dd = self.env_s.step(actions[0])
                # self.env_s.render()
                # if cc == True:
                #     self.env_s.reset()
                # states information for statefull models like LSTM
                self.states = states
                self.dones = dones
                # self.update_obs(obs, dones)
                mb_rewards.append(rewards)
                enc_obs.append(obs)

            mb_obs.append(np.copy(self.obs))
            mb_dones.append(self.dones)

        enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0)
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
        mb_masks = mb_dones  # Used for statefull models like LSTM's to mask state when done
        mb_dones = mb_dones[:, 1:]  # Used for calculating returns. The dones array is now aligned with rewards

    def get(self):
        return self.buffer.get()

    def set_tf(self, sess, expert_train_model, ob_space, ac_space, nenvs, nsteps):
        nact = ac_space.n
        nbatch = nenvs * nsteps

        self.A = tf.placeholder(tf.int32, [nbatch])  # actions
        self.D = tf.placeholder(tf.float32, [nbatch])  # dones
        self.R = tf.placeholder(tf.float32, [nbatch])  # rewards, not returns
        self.MU = tf.placeholder(tf.float32, [nbatch, nact])  # mu's
        self.LR = tf.placeholder(tf.float32, [])
        eps = 1e-6

        # step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        # params = find_trainable_variables("model")
        # print("Params {}".format(len(params)))
        # for var in params:
        #     print(var)

        # create polyak averaged model
        # ema = tf.train.ExponentialMovingAverage(alpha)
        # ema_apply_op = ema.apply(params)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i
        v = tf.reduce_sum(tf.stop_gradient(expert_train_model.pi) * expert_train_model.q, axis=-1)  # shape is [nenvs * (nsteps + 1)]
        s_v = tf.reduce_sum(expert_train_model.pi * tf.stop_gradient(expert_train_model.q), axis=-1)
        v = strip(v, nenvs, nsteps, True)
        s_v = strip(s_v, nenvs, nsteps, True)

        # strip off last step
        # f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [expert_train_model.pi, expert_polyak_model.pi, expert_train_model.q])
        fq = lambda var: strip(var, nenvs, nsteps)
        q_i = get_by_index(fq(expert_train_model.q), self.A)
        # v = tf.reduce_max(fq(expert_train_model.q), axis=1)

        # one_hot_A = tf.one_hot(self.A, nact)
        # pi = fq(expert_train_model.pi)
        # loss_policy = tf.reduce_mean(tf.square(pi - one_hot_A))

        # Get pi and q values for actions taken
        # v = strip(v, nenvs, nsteps, True)
        # loss_q = -tf.reduce_mean(q_i - tf.reshape(v, [nenvs * nsteps, 1]))
        loss_q = tf.nn.relu(tf.reduce_mean(v - q_i))
        loss_policy = -tf.reduce_mean(s_v - tf.stop_gradient(q_i))

        self.expert_loss = loss_q + loss_policy
        # self.expert_loss = loss_policy
        self.loss_q = loss_q
        self.loss_policy = loss_policy


# batch/sequence helper used by Expert.set_tf: drops the last step of each sequence
def strip(var, nenvs, nsteps, flat=False):
    vars = batch_to_seq(var, nenvs, nsteps + 1, flat)
    return seq_to_batch(vars[:-1], flat)
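# Illustrative sketch (assumption, not from the original source): the per-frame preprocessing that
# load_file_human applies to each Atari Grand Challenge screenshot, isolated into a standalone helper
# so the grayscale + 84x84 downsampling step can be tested on its own. Requires OpenCV (cv2).
import cv2
import numpy as np

def preprocess_screenshot(frame_rgb):
    """Convert an HxWx3 uint8 RGB screenshot to the (84, 84, 1) uint8 observation format used above."""
    gray = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2GRAY)
    small = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return small[:, :, None].astype(np.uint8)

# quick self-check on a synthetic Atari-sized frame
_dummy = np.zeros((210, 160, 3), dtype=np.uint8)
assert preprocess_screenshot(_dummy).shape == (84, 84, 1)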
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
          log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
          trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs):
    '''
    Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf)
    Train an agent with given network architecture on a given environment using ACER.

    Parameters:
    ----------

    network:            policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small,
                        conv_only - see baselines.common/models.py for the full list) specifying a standard network
                        architecture, or a function that takes a tensorflow tensor as input and returns a tuple
                        (output_tensor, extra_feed), where output_tensor is the last network layer output and
                        extra_feed is None for feed-forward nets or a dictionary describing how to feed state into
                        the network for recurrent nets. See baselines.common/policies.py/lstm for more details on
                        using recurrent nets in policies.

    env:                environment. Needs to be vectorized for parallel environment simulation.
                        The environments produced by gym.make can be wrapped using the
                        baselines.common.vec_env.DummyVecEnv class.

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is
                        nsteps * nenv, where nenv is the number of environment copies simulated in parallel)
                        (default: 20)

    nstack:             int, size of the frame stack, i.e. the number of frames passed to the step model.
                        Frames are stacked along the channel dimension (last image dimension) (default: 4)

    total_timesteps:    int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

    q_coef:             float, value function loss coefficient in the optimization objective (analog of vf_coef
                        for other actor-critic methods)

    ent_coef:           float, policy entropy coefficient in the optimization objective (default: 0.01)

    max_grad_norm:      float, gradient norm clipping coefficient. If set to None, no clipping (default: 10)

    lr:                 float, learning rate for RMSProp (the current implementation has RMSProp hardcoded in)
                        (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1]
                        that takes the fraction of training progress as input and returns the fraction of the
                        learning rate (specified as lr) as output.

    rprop_epsilon:      float, RMSProp epsilon (stabilizes the square root computation in the denominator of the
                        RMSProp update) (default: 1e-5)

    rprop_alpha:        float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting factor (default: 0.99)

    log_interval:       int, number of updates between logging events (default: 100)

    buffer_size:        int, size of the replay buffer (default: 50k)

    replay_ratio:       int, how many (on average) batches of data to sample from the replay buffer after each
                        batch is collected from the environment (default: 4)

    replay_start:       int, sampling from the replay buffer does not start until the replay buffer has at least
                        that many samples (default: 10k)

    c:                  float, importance weight clipping factor (default: 10)

    trust_region:       bool, whether or not the algorithm estimates the KL divergence between the old and updated
                        policy and uses it to determine the step size (default: True)

    delta:              float, max KL divergence between the old policy and the updated policy (default: 1)

    alpha:              float, momentum factor in the Polyak (exponential moving average) averaging of the model
                        parameters (default: 0.99)

    load_path:          str, path to load the model from (default: None)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                        and the arguments to a particular type of network. For instance, the 'mlp' network
                        architecture has arguments num_hidden and num_layers.

    '''

    print("Running Acer Simple")

    learn_params = {
        "network": network,
        "seed": seed,
        "nsteps": nsteps,
        "total_timesteps": total_timesteps,
        "q_coef": q_coef,
        "ent_coef": ent_coef,
        "max_grad_norm": max_grad_norm,
        "lr": lr,
        "lrschedule": lrschedule,
        "rprop_epsilon": rprop_epsilon,
        "rprop_alpha": rprop_alpha,
        "gamma": gamma,
        "log_interval": log_interval,
        "buffer_size": buffer_size,
        "replay_ratio": replay_ratio,
        "replay_start": replay_start,
        "c": c,
        "trust_region": trust_region,
        "alpha": alpha,
        "delta": delta,
        "load_path": load_path,
        **network_kwargs
    }

    with open("params.json") as f:
        params = json.load(f)

    params["replay_start"] = min(params["replay_start"], params["buffer_size"])
    params["buffer_size"] = min(params["buffer_size"], params["disk_buffer_size"])
    nsteps, buffer_size, disk_buffer_size = params['nsteps'], params['buffer_size'], params['disk_buffer_size']

    # values from params.json override the keyword arguments
    for k, v in params.items():
        if k in learn_params:
            learn_params[k] = v

    # print(locals())

    with open("model_params.pkl", "wb") as f:
        pkl.dump(learn_params, f)

    env, policy, nenvs, ob_space, ac_space, nstack, model = create_model(**learn_params)

    # *** UNCOMMENT IF YOU WANT TO LOAD OLD VARIABLES
    load_variables("actor.ckpt")
    # ***

    # runner = HaliteRunner(model=model, env=env, gamma=gamma, nsteps=nsteps)
    runner = HaliteRunner(model)  # reads the params json now

    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size, disk_size=disk_buffer_size)
    else:
        buffer = None
    nbatch = nenvs * nsteps
    acer = Acer(runner, model, buffer, log_interval, nsteps)
    acer.tstart = time.time()
    for acer.steps in range(0, total_timesteps, nbatch):  # nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = replay_ratio  # np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    return model
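# Illustrative sketch (assumption, not from the original source): the body above expects a params.json
# file whose keys override the keyword arguments of learn(); at minimum it must contain the keys the
# code reads directly ("nsteps", "buffer_size", "disk_buffer_size", "replay_start"). Values are placeholders.
example_params = {
    "nsteps": 20,
    "buffer_size": 50000,
    "disk_buffer_size": 100000,  # on-disk capacity; buffer_size is clamped to this value above
    "replay_start": 10000,
    "replay_ratio": 4,
}
# with open("params.json", "w") as f:
#     json.dump(example_params, f, indent=2)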
def learn(policy, env, seed, n_steps=20, n_stack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, learning_rate=7e-4, lr_schedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99,
          gamma=0.99, log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000,
          correction_term=10.0, trust_region=True, alpha=0.99, delta=1):
    """
    Train an ACER model.

    :param policy: (ACERPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param env: (Gym environment) The environment to learn from
    :param seed: (int) The initial seed for training
    :param n_steps: (int) The number of steps to run for each environment
    :param n_stack: (int) The number of stacked frames
    :param total_timesteps: (int) The total number of samples
    :param q_coef: (float) Q function coefficient for the loss calculation
    :param ent_coef: (float) Entropy coefficient for the loss calculation
    :param max_grad_norm: (float) The maximum value for the gradient clipping
    :param learning_rate: (float) The learning rate
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param rprop_epsilon: (float) RMSProp optimizer epsilon
    :param rprop_alpha: (float) RMSProp optimizer decay
    :param gamma: (float) Discount factor
    :param log_interval: (int) The number of timesteps before logging.
    :param buffer_size: (int) The buffer size in number of steps
    :param replay_ratio: (float) The average number of replay learning updates per on-policy update,
        drawn from a Poisson distribution
    :param replay_start: (int) The minimum number of steps in the buffer before replay learning starts
    :param correction_term: (float) The correction term for the importance weights
    :param trust_region: (bool) Enable trust region policy optimization loss
    :param alpha: (float) The decay rate for the exponential moving average of the parameters
    :param delta: (float) Trust region delta value
    """
    print("Running Acer Simple")
    print(locals())
    set_global_seeds(seed)

    n_envs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, n_envs=n_envs, n_steps=n_steps,
                  n_stack=n_stack, num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, learning_rate=learning_rate, rprop_alpha=rprop_alpha,
                  rprop_epsilon=rprop_epsilon, total_timesteps=total_timesteps, lr_schedule=lr_schedule,
                  correction_term=correction_term, trust_region=trust_region, alpha=alpha, delta=delta)

    runner = Runner(env=env, model=model, n_steps=n_steps, n_stack=n_stack)
    if replay_ratio > 0:
        buffer = Buffer(env=env, n_steps=n_steps, n_stack=n_stack, size=buffer_size)
    else:
        buffer = None
    n_batch = n_envs * n_steps
    acer = Acer(runner, model, buffer, log_interval)
    acer.t_start = time.time()
    for acer.steps in range(0, total_timesteps, n_batch):  # n_batch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            samples_number = np.random.poisson(replay_ratio)
            for _ in range(samples_number):
                acer.call(on_policy=False)  # no simulation steps in this

    env.close()
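# Illustrative sketch (not from the original source): in every variant above, replay_ratio is the mean
# of the Poisson draw that decides how many off-policy (replay) updates follow each on-policy update,
# so the expected number of replay updates per environment batch equals replay_ratio.
import numpy as np

replay_ratio = 4
draws = np.random.poisson(lam=replay_ratio, size=100_000)
print(draws.mean())  # close to 4.0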