def train(args, extra_args):
    env_type, env_id = run.get_env_type(args.env)
    if args.alg == 'gail':
        env_type += '_gail'
        args.alg = 'bgail'
    elif args.alg not in ['bgail', 'gail']:
        raise NotImplementedError
    learn = run.get_learn_function(args.alg)
    alg_kwargs = run.get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args)

    logger.configure(os.path.join(
        "log", "GAIL", args.env,
        "subsample_{}".format(extra_args["data_subsample_freq"]),
        "traj_{}".format(extra_args["num_expert_trajs"]),
        "batch_size_{}".format(extra_args["timesteps_per_batch"]),
        "seed_{}".format(args.seed)))

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))
    model = learn(env=env, seed=args.seed,
                  save_path=args.save_path, load_path=args.load_path,
                  render=args.render, **alg_kwargs)
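# Usage sketch (hypothetical values, not from the source): train() above expects
# an argparse-style namespace plus an extra_args dict carrying the keys it reads
# (data_subsample_freq, num_expert_trajs, timesteps_per_batch).
from types import SimpleNamespace

args = SimpleNamespace(env='Hopper-v2', alg='gail', seed=0,
                       save_path=None, load_path=None, render=False)
extra_args = {'data_subsample_freq': 20,
              'num_expert_trajs': 4,
              'timesteps_per_batch': 1024}
train(args, extra_args)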
def train(params_dict: dict):
    ncpu = multiprocessing.cpu_count()
    # ncpu = 1
    env_id = params_dict['env_params']['env']
    total_timesteps = float(params_dict['training_params']['num_timesteps'])

    learn = get_learn_function(params_dict['model_params']['alg'])
    alg_kwargs = get_learn_function_defaults(params_dict['model_params']['alg'], 'atari')
    alg_kwargs['network'] = params_dict['model_params']['network']
    alg_kwargs['lr'] = 0.0001

    if 'frame_stack' in params_dict['env_params'] and params_dict['env_params']['frame_stack']:
        wrapper_kwargs = {'frame_stack': True}
    else:
        wrapper_kwargs = {}

    env = make_vec_env(env_id, 'atari', ncpu, seed=None, wrapper_kwargs=wrapper_kwargs)
    # env = VecFrameStack(env, 4)

    model = learn(env=env, seed=None, total_timesteps=total_timesteps, **alg_kwargs)
    return model, env
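# Usage sketch (hypothetical values): a params_dict with the nested keys that
# train() above actually reads.
params_dict = {
    'env_params': {'env': 'PongNoFrameskip-v4', 'frame_stack': True},
    'training_params': {'num_timesteps': 1e6},
    'model_params': {'alg': 'ppo2', 'network': 'cnn'},
}
model, env = train(params_dict)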
def test_coexistence(learn_fn, network_fn):
    '''
    Test if more than one model can exist at a time
    '''
    if learn_fn == 'deepq':
        # TODO enable multiple DQN models to be useable at the same time
        # github issue https://github.com/openai/baselines/issues/656
        return
    if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)
    make_session(make_default=True, graph=tf.Graph())
    model1 = learn(seed=1)
    make_session(make_default=True, graph=tf.Graph())
    model2 = learn(seed=2)

    model1.step(env.observation_space.sample())
    model2.step(env.observation_space.sample())
def test_multidiscrete_identity(alg):
    '''
    Test if the algorithm (with an mlp policy)
    can learn an identity transformation (i.e. return observation as an action)
    '''
    kwargs = learn_kwargs[alg]
    kwargs.update(common_kwargs)

    learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
    env_fn = lambda: MultiDiscreteIdentityEnv((3, 3), episode_len=100)
    simple_test(env_fn, learn_fn, 0.9)
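# simple_test is defined in the shared test utilities, not in these snippets.
# A minimal sketch of the contract implied by its call sites (an assumption,
# not the library's verbatim code): train on the toy env, then check that the
# average per-step reward clears the given threshold.
def simple_test(env_fn, learn_fn, min_avg_reward, n_trials=1000):
    env = DummyVecEnv([env_fn])
    model = learn_fn(env)
    ob = env.reset()
    total_reward = 0.0
    for _ in range(n_trials):
        # model.step returns the batch of actions first.
        actions = model.step(ob)[0]
        ob, rewards, dones, _ = env.step(actions)
        total_reward += rewards[0]
    assert total_reward / n_trials > min_avg_reward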
def test_serialization(learn_fn, network_fn):
    '''
    Test if the trained model can be serialized
    '''
    if network_fn.endswith('lstm') and learn_fn in ['acer', 'acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    def make_env():
        env = MnistEnv(episode_len=100)
        env.seed(10)
        return env

    env = DummyVecEnv([make_env])
    ob = env.reset().copy()
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs)

    with tempfile.TemporaryDirectory() as td:
        model_path = os.path.join(td, 'serialization_test_model')

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=100)
            model.save(model_path)
            mean1, std1 = _get_action_stats(model, ob)
            variables_dict1 = _serialize_variables()

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=0, load_path=model_path)
            mean2, std2 = _get_action_stats(model, ob)
            variables_dict2 = _serialize_variables()

        for k, v in variables_dict1.items():
            np.testing.assert_allclose(
                v, variables_dict2[k], atol=0.01,
                err_msg='saved and loaded variable {} value mismatch'.format(k))

        np.testing.assert_allclose(mean1, mean2, atol=0.5)
        np.testing.assert_allclose(std1, std2, atol=0.5)
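# _serialize_variables and _get_action_stats are helpers defined elsewhere in
# the test module; this is one plausible implementation consistent with how
# test_serialization uses them (an assumption, not the module's verbatim code).
def _serialize_variables():
    # Snapshot every trainable variable in the default session, keyed by name.
    sess = tf.get_default_session()
    variables = tf.trainable_variables()
    values = sess.run(variables)
    return {var.name: value for var, value in zip(variables, values)}

def _get_action_stats(model, ob, n_trials=1000):
    # Sample the stochastic policy repeatedly at a fixed observation and
    # summarize the resulting action distribution.
    actions = np.array([model.step(ob)[0] for _ in range(n_trials)])
    return np.mean(actions, axis=0), np.std(actions, axis=0)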
def test_continuous_identity(alg):
    '''
    Test if the algorithm (with an mlp policy)
    can learn an identity transformation (i.e. return observation as an action)
    to a required precision
    '''
    kwargs = learn_kwargs[alg]
    kwargs.update(common_kwargs)

    learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
    env_fn = lambda: BoxIdentityEnv((1,), episode_len=100)
    simple_test(env_fn, learn_fn, -0.1)
def test_fixed_sequence(alg, rnn):
    '''
    Test if the algorithm (with a given policy)
    can learn to reproduce a fixed sequence of actions
    '''
    kwargs = learn_kwargs[alg]
    kwargs.update(common_kwargs)

    env_fn = lambda: FixedSequenceEnv(n_actions=10, episode_len=5)
    learn = lambda e: get_learn_function(alg)(env=e, network=rnn, **kwargs)
    simple_test(env_fn, learn, 0.7)
def test_mnist(alg):
    '''
    Test if the algorithm can learn to classify MNIST digits.
    Uses CNN policy.
    '''
    learn_kwargs = learn_args[alg]
    learn_kwargs.update(common_kwargs)

    learn = get_learn_function(alg)
    learn_fn = lambda e: learn(env=e, **learn_kwargs)
    env_fn = lambda: MnistEnv(episode_len=100)

    simple_test(env_fn, learn_fn, 0.6)
def test_mnist(alg):
    '''
    Test if the algorithm can learn to classify MNIST digits.
    Uses CNN policy.
    '''
    learn_kwargs = learn_args[alg]
    learn_kwargs.update(common_kwargs)

    learn = get_learn_function(alg)
    learn_fn = lambda e: learn(env=e, **learn_kwargs)
    env_fn = lambda: MnistEnv(seed=0, episode_len=100)

    simple_test(env_fn, learn_fn, 0.6)
def test_env_after_learn(algo):
    def make_env():
        # acktr requires too much RAM, fails on travis
        env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
def test_fetchreach(alg):
    '''
    Test if the algorithm (with an mlp policy)
    can learn the FetchReach task
    '''
    kwargs = common_kwargs.copy()
    kwargs.update(learn_kwargs[alg])

    learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)

    def env_fn():
        env = gym.make('FetchReach-v1')
        env.seed(0)
        return env

    reward_per_episode_test(env_fn, learn_fn, -15)
def test_cartpole(alg):
    '''
    Test if the algorithm (with an mlp policy)
    can learn to balance the cartpole
    '''
    kwargs = common_kwargs.copy()
    kwargs.update(learn_kwargs[alg])

    learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)

    def env_fn():
        env = gym.make('CartPole-v0')
        env.seed(0)
        return env

    reward_per_episode_test(env_fn, learn_fn, 100)
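# reward_per_episode_test also lives in the shared test utilities. A minimal
# sketch of what such a harness might do (assumed, not the library's verbatim
# code): train, roll out full episodes, and require that the mean episode
# reward beats the threshold.
def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_episodes=10):
    env = DummyVecEnv([env_fn])
    model = learn_fn(env)
    episode_rewards = []
    for _ in range(n_episodes):
        ob = env.reset()
        done, total = False, 0.0
        while not done:
            actions = model.step(ob)[0]
            ob, rewards, dones, _ = env.step(actions)
            done, total = dones[0], total + rewards[0]
        episode_rewards.append(total)
    assert np.mean(episode_rewards) >= min_avg_reward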
def test_env_after_learn(algo):
    def make_env():
        env = gym.make('PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)
    network = cnn(one_dim_bias=True)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network=network, env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
def train(args, extra_args):
    env_type, env_id = run.get_env_type(args.env)
    if args.alg == 'gail':
        env_type += '_gail'
        args.alg = 'bgail'
    elif args.alg not in ['bgail', 'gail']:
        raise NotImplementedError
    learn = run.get_learn_function(args.alg)
    alg_kwargs = run.get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))
    model = learn(env=env, seed=args.seed,
                  save_path=args.save_path, load_path=args.load_path,
                  render=args.render, **alg_kwargs)
#!/usr/bin/python3
# https://stackoverflow.com/questions/45068568/is-it-possible-to-create-a-new-gym-environment-in-openai
import gym
import gym_banana
import baselines.run as r

time_cycles = 2464

env = gym.make('Banana-v0')
env.num_envs = 1

learn = r.get_learn_function("ppo2")
model = learn(network='mlp', env=env, total_timesteps=time_cycles)
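# Optional follow-up (an assumption, not part of the original script): persist
# the trained weights, mirroring the model.save usage in test_serialization above.
model.save('banana_ppo2_model')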
    'nsteps': 2048,
    'noptepochs': 10,
    'save_interval': 20,
    'log_interval': 1,
    'save_path': save_path,
    'model_load_path': model_load_path,
    'seed': 0,
    'reward_scale': 1,
    'flatten_dict_observations': True,
    'transfer_weights': False
}
args = SimpleNamespace(**args_dict)

# Prepare the environment and learning algorithm
env_type, env_id = get_env_type(args.env)
learn = get_learn_function(args.alg)
alg_kwargs = get_learn_function_defaults(args.alg, env_type)
env = build_env(args)
alg_kwargs['network'] = args.network

# The path where we will store the results of this experiment
full_path = args.save_path + '/' + args.env + '-' + args.alg

# Make the folders where we will store the checkpoints, models and epoch results
if not os.path.exists(full_path):
    os.makedirs(full_path)
    os.makedirs(full_path + '/checkpoints')

print("About to start learning model")
model = learn(env=env,