def main():
    env_id = 'PongNoFrameskip-v4'
    # env_id = 'MsPacmanNoFrameskip-v4'
    # env_id = 'BreakoutNoFrameskip-v4'
    num_env = 16
    num_steps = 5
    num_batch = num_env * num_steps
    seed = 0

    env_args = {'episode_life': False, 'clip_rewards': False, 'scale': False,
                'transpose_image': True}
    env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)

    network = ConvVAE([84, 84], 2048)

    observs = []
    actions = []
    next_observs = []

    observ = env.reset()
    observ = observ.transpose(0, 3, 2, 1)
    observ = tensor(observ)
    print(observ.shape)
    out = network(observ)[0]
    print(out.shape)
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

    model = A2C(policy_fn, env, lr_schedule=lr_schedule)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
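# Usage sketch (illustrative, not part of the original script): assumes the stable-baselines
# names used above (A2C, CnnPolicy, make_atari_env, VecFrameStack) are already imported.
# The argument values are example choices; 'linear' decays the learning rate over training.
train(env_id='PongNoFrameskip-v4', num_timesteps=int(1e6), seed=0,
      policy='cnn', lr_schedule='linear', num_env=16)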
def play(env_id, num_timesteps, seed, policy, lr_schedule, num_env, sil_update, sil_beta, load_path):
    policy_fn = CnnPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env_args = {'episode_life': False, 'clip_rewards': False, 'scale': True}
    env = make_video_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args)
    env = VecFrameStack(env, 4)
    model = SelfImitationA2C.load(load_path, env=env)
    print(model.params)

    return_ = np.zeros((env.num_envs,))
    terminals_ = np.zeros((env.num_envs,), dtype=bool)
    print(model.env)
    observ = env.reset()
    while True:
        actions, values, states, _ = model.step(observ, None, None)
        next_observ, rewards, terminals, _ = env.step(actions)
        print(rewards)
        return_ += rewards
        terminals_ |= terminals
        observ = next_observ
        # print('terminals', terminals_)
        # Stop once every environment has finished at least one episode.
        if terminals_.all():
            break

    for mp4_file in Path('/tmp/video').glob('*.mp4'):
        if int(mp4_file.stat().st_size) < 100:
            mp4_file.unlink()
    print(return_)
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env, sil_update, sil_beta,
          tensorboard_log, tb_log_name):
    """
    Train a self-imitation A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    :param sil_update: (int) The number of self-imitation learning updates per training step
    :param sil_beta: (float) The weight of the self-imitation learning loss
    :param tensorboard_log: (str) The log location for tensorboard
    :param tb_log_name: (str) The name of the tensorboard run
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env_args = {'episode_life': False, 'clip_rewards': False, 'scale': True}
    env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)

    model = SelfImitationA2C(policy_fn, env, lr_schedule=lr_schedule,
                             tensorboard_log=tensorboard_log, verbose=1,
                             sil_update=sil_update, sil_beta=sil_beta)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed, tb_log_name=tb_log_name)
    env.close()
def main():
    env_id = 'BreakoutNoFrameskip-v4'
    num_env = 5
    seed = 0

    env_args = {'episode_life': False, 'clip_rewards': False}
    env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)

    graph = tf.Graph()
    with graph.as_default():
        sess = tf_util.make_session(graph=graph)

        with tf.variable_scope('input', reuse=False):
            input_x, process_x = observation_input(env.observation_space, num_env)
            print(env.action_space.shape)
            pdtype = make_proba_dist_type(env.action_space)
            actions_ph = pdtype.sample_placeholder([num_env], name="action_ph")
            one_hot_actions = tf.one_hot(actions_ph, env.action_space.n)
        print(input_x, process_x)
        print('action', actions_ph, one_hot_actions)

        beta = 0.1
        mu, sigma_sq, recons_x = build_network(process_x, one_hot_actions)
        print(mu)
        print(sigma_sq)
        print(recons_x)

        with tf.name_scope('losses'):
            recons_loss = tf.losses.mean_squared_error(input_x, recons_x, scope='recons_loss')
            kl_divergence = -tf.reduce_mean(
                0.5 * (tf.add(1., sigma_sq) - tf.pow(mu, 2) - tf.exp(sigma_sq)),
                name='kl_divergence')
            loss = tf.add(recons_loss, tf.multiply(kl_divergence, beta), name='objective')
            print(loss)

        summary = utility.summary({recons_loss: 'recons_loss',
                                   kl_divergence: 'kl_divergence',
                                   mu: 'phi_mu',
                                   sigma_sq: 'sigma_sq',
                                   recons_x: 'recons_x',
                                   input_x: 'input_x',
                                   }, env.observation_space.shape)

        optimizer = tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5)
        train_op = optimizer.minimize(loss)

        for event_file in LOG_DIR.glob('event*'):
            event_file.unlink()
        writer = tf.summary.FileWriter(LOG_DIR.as_posix(), sess.graph)
        sess.run(tf.global_variables_initializer())

        observ = env.reset()
        actions = [env.action_space.sample() for _ in range(num_env)]
        print(env.observation_space)
        print(observ.shape)
        recons_image, summary_ = sess.run([recons_x, summary],
                                          feed_dict={input_x: observ, actions_ph: actions})
        writer.add_summary(summary_, 0)
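# For reference, a NumPy sketch (an illustration, not part of the original graph) of the
# closed-form KL term built above, assuming `sigma_sq` holds the *log* variance of the
# diagonal Gaussian posterior q(z|x) and the prior is N(0, I):
#   KL(q || p) = -0.5 * mean(1 + log_var - mu^2 - exp(log_var))
import numpy as np

def diagonal_gaussian_kl(mu, log_var):
    """Mean KL divergence between N(mu, exp(log_var)) and the standard normal prior."""
    return -0.5 * np.mean(1.0 + log_var - np.square(mu) - np.exp(log_var))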
def train(env_id, num_timesteps, seed, num_cpu):
    """
    train an ACKTR model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    model = ACKTR(CnnPolicy, env, nprocs=num_cpu)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
def test(env_id, seed, policy):
    """
    Run a trained PPO2 model in the atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """
    # if 'lstm' in policy:
    #     print('LSTM policies not supported for drawing')
    #     return 1
    env = DummyVecEnv([PadEnvRender for _ in range(1)])  # Needed for LSTM policies
    # else:
    #     env = PadEnvRender()
    env = VecFrameStack(env, 8)

    model = PPO2.load('./pad_5combo_ppo2.pkl', env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            done = done.any()
            episode_rew += rew
            time.sleep(1 / 24.)
        if done:
            print('Episode reward:', episode_rew)
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """
    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy, 'lnlstm': CnnLnLstmPolicy,
              'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=128, nminibatches=4,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1)
    model.learn(total_timesteps=num_timesteps)
def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
    if "num_population" in args.__dict__:
        args.num_cpu = args.num_population * 2
    assert not (registered_env[args.env][3] is ThreadingType.NONE and args.num_cpu != 1), \
        "Error: cannot have more than 1 CPU for the environment {}".format(args.env)

    if env_kwargs is not None and env_kwargs.get("use_srl", False):
        srl_model = MultiprocessSRLModel(args.num_cpu, args.env, env_kwargs)
        env_kwargs["state_dim"] = srl_model.state_dim
        env_kwargs["srl_pipe"] = srl_model.pipe

    envs = [makeEnv(args.env, args.seed, i, args.log_dir, allow_early_resets=True,
                    env_kwargs=env_kwargs)
            for i in range(args.num_cpu)]
    envs = SubprocVecEnv(envs)
    envs = VecFrameStack(envs, args.num_stack)

    if args.srl_model != "raw_pixels" and args.algo_type == "v2":
        envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
        envs = loadRunningAverage(envs, load_path_normalise=load_path_normalise)

    return envs
def train(env_id, num_timesteps, seed, policy, n_envs=8, nminibatches=4, n_steps=128):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update. For recurrent
        policies, the number of environments run in parallel should be a multiple of
        nminibatches.
    :param n_steps: (int) The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment copies
        running in parallel)
    """
    env = VecFrameStack(make_atari_env(env_id, n_envs, seed), 4)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy, 'lnlstm': CnnLnLstmPolicy,
              'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=n_steps, nminibatches=nminibatches,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1)
    model.learn(total_timesteps=num_timesteps)
    del model
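# The `learning_rate` and `cliprange` lambdas above rely on stable-baselines 2.x passing the
# remaining-progress fraction `f` (annealed from roughly 1 to 0 over training) to any callable
# schedule. A named equivalent, shown as an optional sketch rather than original code:
def linear_schedule(initial_value):
    """Return a schedule that linearly anneals from initial_value towards 0."""
    def schedule(progress_remaining):
        return progress_remaining * initial_value
    return schedule

# e.g. PPO2(..., learning_rate=linear_schedule(2.5e-4), cliprange=linear_schedule(0.1), ...)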
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """
    # Construct a fresh monitored environment per worker; sharing one Monitor-wrapped
    # instance across all DummyVecEnv slots would step the same env repeatedly.
    env = DummyVecEnv([lambda: Monitor(PadEnv(), './logs', allow_early_resets=True)
                       for _ in range(16)])
    env = VecFrameStack(env, 8)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy, 'lnlstm': CnnLnLstmPolicy,
              'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=256, nminibatches=4,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1)
    # model = model.load('./pad_4combo_ppo2.pkl', env)
    try:
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print('Keyboard Interrupted')
    model.save('./pad_5combo_ppo2.pkl')
def main():
    beta = 0
    env_id = 'MsPacmanNoFrameskip-v4'
    num_env = 16
    num_steps = 5
    num_batch = num_env * num_steps
    seed = 0

    env_args = {'episode_life': False, 'clip_rewards': False, 'scale': False}
    env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)

    graph = tf.Graph()
    with graph.as_default():
        sess = tf_util.make_session(graph=graph)
        policy = ReconstructionModule(sess, env.observation_space, env.action_space, num_batch)

        def save(save_path: Path, params):
            data = {
                'policy': ReconstructionModule,
            }
            params = sess.run(params)
            _save_to_file(save_path, data=data, params=params)

        print(policy.mu)
        print(policy.log_sigma_sq)
        print(policy.recons_x)

        params = find_trainable_variables('model')
        tf.global_variables_initializer().run(session=sess)

        def load(load_path: Path):
            _data, load_params = _load_from_file(LOG_DIR)
            restores = []
            for param, load_param in zip(params, load_params):
                restores.append(param.assign(load_param))
            sess.run(restores)

        with tf.name_scope('losses'):
            recons_losses = tf.squared_difference(policy.next_process_x, policy.recons_x)
            recons_loss = tf.reduce_mean(recons_losses, name='reconstruction_loss')

        summary = utility.summary(
            {
                policy.capacity_ph: 'capacity',
                recons_losses: 'recons_losses',
                policy.process_x: 'process_x',
                policy.next_process_x: 'next_process_x',
                policy.recons_x: 'recons_x',
            },
            env.observation_space.shape,
            ignore=['recons_x', 'recons_losses', 'process_x'])

        # optimizer = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=0.99, epsilon=1e-5)
        optimizer = tf.train.AdamOptimizer(5e-4)
        train_op = optimizer.minimize(recons_loss)

        for event_file in LOG_DIR.glob('event*'):
            event_file.unlink()
        writer = tf.summary.FileWriter(LOG_DIR.as_posix(), sess.graph)
        sess.run(tf.global_variables_initializer())

        observs = []
        actions = []
        next_observs = []
        observ = env.reset()

        global_step = 0
        while True:
            if global_step > 100_000:
                break
            print('\rGlobal Step {}/{}'.format(global_step, 100_000), end='', flush=True)
            action = [env.action_space.sample() for _ in range(num_env)]
            next_observ, rewards, terminals, _ = env.step(action)
            observs.extend(observ)
            actions.extend(action)
            next_observs.extend(next_observ)
            observ = next_observ
            global_step += num_env

            if len(observs) == num_batch:
                feed_dict = {
                    policy.input_x: np.asarray(observs),
                    policy.next_input_x: np.asarray(next_observs),
                    policy.actions_ph: np.asarray(actions),
                    policy.capacity_ph: _calculate_encoding_capacity(global_step),
                }
                if global_step % (5 * num_batch) == 0:
                    summary_, _ = sess.run([summary, train_op], feed_dict=feed_dict)
                    writer.add_summary(summary_, global_step)
                else:
                    _ = sess.run([train_op], feed_dict=feed_dict)
                observs = []
                actions = []
                next_observs = []

        save(LOG_DIR, params)
def main():
    beta = 750
    # env_id = 'PongNoFrameskip-v4'
    env_id = 'MsPacmanNoFrameskip-v4'
    # env_id = 'BreakoutNoFrameskip-v4'
    num_env = 16
    num_steps = 5
    num_batch = num_env * num_steps
    seed = 0

    env_args = {'episode_life': False, 'clip_rewards': False, 'scale': False}
    env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)

    graph = tf.Graph()
    with graph.as_default():
        sess = tf_util.make_session(graph=graph)
        policy = ReconstructionModule(sess, env.observation_space, env.action_space,
                                      num_batch, use_batch_norm=True)

        def save(save_path: Path, params):
            data = {
                'policy': ReconstructionModule,
            }
            params = sess.run(params)
            _save_to_file(save_path, data=data, params=params)

        print(policy.mu)
        print(policy.log_sigma_sq)
        print(policy.recons_x)

        params = find_trainable_variables('model')
        tf.global_variables_initializer().run(session=sess)

        def load(load_path: Path):
            _data, load_params = _load_from_file(LOG_DIR)
            restores = []
            for param, load_param in zip(params, load_params):
                restores.append(param.assign(load_param))
            sess.run(restores)

        with tf.name_scope('losses'):
            recons_losses = tf.squared_difference(policy.next_process_x, policy.recons_x)
            recons_loss = tf.reduce_mean(recons_losses, name='reconstruction_loss')
            kl_divergences = -0.5 * (tf.add(1., policy.log_sigma_sq)
                                     - tf.square(policy.mu)
                                     - tf.exp(policy.log_sigma_sq))
            kl_divergence = tf.reduce_mean(kl_divergences, name='kl_divergence')
            coefed_kl = tf.multiply(tf.abs(kl_divergence - policy.capacity_ph), beta)
            loss = tf.add(recons_loss, coefed_kl, name='objective')
            # loss = tf.add(recons_loss, tf.multiply(kl_divergence, beta), name='objective')

        summary = utility.summary({
            loss: 'loss',
            policy.capacity_ph: 'capacity',
            kl_divergences: 'kl_divergences',
            recons_losses: 'recons_losses',
            policy.process_x: 'process_x',
            policy.next_process_x: 'next_process_x',
            policy.mu: 'phi_mu',
            policy.log_sigma_sq: 'log_sigma_sq',
            policy.recons_x: 'recons_x',
            coefed_kl: 'coefed_KL',
        }, env.observation_space.shape, ignore=['recons_x', 'recons_losses', 'process_x'])

        optimizer = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=0.99, epsilon=1e-5)
        # optimizer = tf.train.AdamOptimizer(5e-5)
        train_op = optimizer.minimize(loss)

        for event_file in LOG_DIR.glob('event*'):
            event_file.unlink()
        writer = tf.summary.FileWriter(LOG_DIR.as_posix(), sess.graph)
        sess.run(tf.global_variables_initializer())

        observs = []
        actions = []
        next_observs = []
        observ = env.reset()

        for step in tqdm(range(1, 2_000_000 + 1, num_env)):
            action = [env.action_space.sample() for _ in range(num_env)]
            next_observ, rewards, terminals, _ = env.step(action)
            observs.extend(observ)
            actions.extend(action)
            next_observs.extend(next_observ)
            observ = next_observ

            if len(observs) == num_batch:
                feed_dict = {policy.input_x: np.asarray(observs),
                             policy.next_input_x: np.asarray(next_observs),
                             policy.actions_ph: np.asarray(actions),
                             policy.capacity_ph: _calculate_encoding_capacity(step),
                             }
                if (step // num_env + 1) % 1000 == 0:
                    summary_, _ = sess.run([summary, train_op], feed_dict=feed_dict)
                    writer.add_summary(summary_, step)
                else:
                    _ = sess.run([train_op], feed_dict=feed_dict)
                observs = []
                actions = []
                next_observs = []

        save(LOG_DIR, params)
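# `_calculate_encoding_capacity` is referenced in both training loops above but not shown.
# A hypothetical sketch of a linear capacity-annealing schedule, in the spirit of the
# capacity-constrained objective built above (loss = reconstruction + beta * |KL - C|);
# the constants are illustrative assumptions, not values from the original code.
def _calculate_encoding_capacity(step, max_capacity=25.0, anneal_steps=2_000_000):
    """Linearly increase the KL capacity target C from 0 up to max_capacity over anneal_steps."""
    return min(max_capacity, max_capacity * step / anneal_steps)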