def main(dir, interval):
    """Resume a training run from the most recent snapshot stored in *dir*.

    Restores the algorithm callable and its saved keyword state, rebuilds the
    environment from the pickled env maker, then hands control back to the
    algorithm so it continues from where the snapshot left off.
    """
    with logger.session(dir):
        snapshot_saver = SnapshotSaver(dir, interval=interval)
        snapshot = snapshot_saver.get_state()
        alg_state = snapshot['alg_state']
        # The environment itself is not pickled; recreate it from its maker.
        training_env = alg_state['env_maker'].make()
        resume_alg = snapshot['alg']
        resume_alg(env=training_env, snapshot_saver=snapshot_saver, **alg_state)
def main(dir):
    """Endlessly replay the latest snapshotted policy from *dir* on screen.

    Polls until a snapshot exists, then runs one rendered episode per loop
    iteration using the most recently saved policy.
    """
    env = None
    while True:
        snapshot = SnapshotSaver(dir).get_state()
        if snapshot is None:
            # Nothing saved yet — wait a moment and poll again.
            time.sleep(1)
            continue
        alg_state = snapshot['alg_state']
        if env is None:
            # Build the environment once, on the first available snapshot.
            env = alg_state['env_maker'].make()
        policy = alg_state['policy']
        obs = env.reset()
        finished = False
        while not finished:
            action, _ = policy.get_action(obs)
            obs, _, finished, _ = env.step(action)
            env.render()
def main(dir):
    """Endlessly replay the latest snapshotted policy from *dir*, recording video.

    Same loop as the plain viewer, but the environment is created with a
    video callable that records every episode.
    """
    env = None
    while True:
        snapshot = SnapshotSaver(dir).get_state()
        if snapshot is None:
            # Nothing saved yet — wait a moment and poll again.
            time.sleep(1)
            continue
        alg_state = snapshot['alg_state']
        if env is None:
            # Record every episode (videos go to the maker's monitor dir).
            env = alg_state['env_maker'].make(
                video_callable=lambda episode_id: True)
        policy = alg_state['policy']
        obs = env.reset()
        finished = False
        while not finished:
            action, _ = policy.get_action(obs)
            obs, _, finished, _ = env.step(action)
            env.render()
def run(v):
    """Train TRPO on Pendulum-v0 with a configurable baseline.

    Args:
        v: variant dict; must contain 'seed' (int, seeds numpy's RNG) and
           'baseline' — one of 'mlp', 'time_dependent', or 'linear_feature'.

    Raises:
        ValueError: if v['baseline'] is not one of the recognized names.
    """
    np.random.seed(v['seed'])
    env_maker = EnvMaker('Pendulum-v0')
    env = env_maker.make()
    policy = GaussianMLPPolicy(
        observation_space=env.observation_space,
        action_space=env.action_space,
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=chainer.functions.tanh,
    )
    if v['baseline'] == 'mlp':
        baseline = MLPBaseline(
            observation_space=env.observation_space,
            action_space=env.action_space,
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=chainer.functions.tanh,
        )
    elif v['baseline'] == 'time_dependent':
        baseline = TimeDependentBaseline(
            observation_space=env.observation_space,
            action_space=env.action_space,
            env_spec=env.spec,
        )
    elif v['baseline'] == 'linear_feature':
        baseline = LinearFeatureBaseline(
            observation_space=env.observation_space,
            action_space=env.action_space,
            env_spec=env.spec,
        )
    else:
        # Name the bad value instead of raising a bare, messageless ValueError.
        raise ValueError(
            "Unknown baseline kind: {!r}; expected 'mlp', "
            "'time_dependent', or 'linear_feature'".format(v['baseline']))
    trpo(
        env=env,
        env_maker=env_maker,
        n_envs=16,
        policy=policy,
        baseline=baseline,
        batch_size=10000,
        n_iters=100,
        snapshot_saver=SnapshotSaver(logger.get_dir()),
    )
def run(v):
    """Train TRPO with an MLP baseline on CartPole-v0.

    Args:
        v: variant dict; must contain 'seed' (int) used to seed numpy's RNG.
    """
    np.random.seed(v['seed'])
    env_maker = EnvMaker('CartPole-v0')
    env = env_maker.make()
    # Policy and baseline take the same space/spec keyword arguments.
    space_kwargs = dict(
        observation_space=env.observation_space,
        action_space=env.action_space,
        env_spec=env.spec,
    )
    policy = CategoricalMLPPolicy(**space_kwargs)
    baseline = MLPBaseline(**space_kwargs)
    trpo(
        env=env,
        env_maker=env_maker,
        n_envs=16,
        policy=policy,
        baseline=baseline,
        batch_size=2000,
        n_iters=100,
        snapshot_saver=SnapshotSaver(logger.get_dir()),
    )
#!/usr/bin/env python
"""Train A2C with a CNN policy on Atari Pong, logging to data/local/a2c-pong."""
import os
import shutil

import numpy as np

import logger
from algs import a2c
from env_makers import EnvMaker
from models import CategoricalCNNPolicy
from utils import SnapshotSaver

log_dir = "data/local/a2c-pong"

np.random.seed(42)

# Clean up existing logs. shutil.rmtree replaces shelling out to `rm -rf`
# (portable, no shell-string construction); ignore_errors matches rm -f's
# tolerance of a missing directory.
shutil.rmtree(log_dir, ignore_errors=True)

with logger.session(log_dir):
    env_maker = EnvMaker('PongNoFrameskip-v4')
    env = env_maker.make()
    policy = CategoricalCNNPolicy(
        env.observation_space, env.action_space, env.spec)
    vf = policy.create_vf()
    a2c(
        env=env,
        env_maker=env_maker,
        n_envs=16,
        policy=policy,
        vf=vf,
        snapshot_saver=SnapshotSaver(log_dir, interval=10),
    )
import shutil

log_dir = "data/local/trpo-cartpole"

np.random.seed(42)

# Clean up existing logs. shutil.rmtree replaces shelling out to `rm -rf`
# (portable, no shell-string construction); ignore_errors matches rm -f's
# tolerance of a missing directory.
shutil.rmtree(log_dir, ignore_errors=True)

with logger.session(log_dir):
    env_maker = EnvMaker('CartPole-v0')
    env = env_maker.make()
    # Policy and baseline take the same space/spec keyword arguments.
    policy = CategoricalMLPPolicy(
        observation_space=env.observation_space,
        action_space=env.action_space,
        env_spec=env.spec
    )
    baseline = MLPBaseline(
        observation_space=env.observation_space,
        action_space=env.action_space,
        env_spec=env.spec
    )
    trpo(
        env=env,
        env_maker=env_maker,
        n_envs=16,
        policy=policy,
        baseline=baseline,
        batch_size=2000,
        n_iters=100,
        snapshot_saver=SnapshotSaver(log_dir)
    )