def ray_runner(self, num_runs, flow_params, version):
    alg_run = 'PPO'
    HORIZON = 10

    agent_cls = get_agent_class(alg_run)
    config = agent_cls._default_config.copy()
    config['num_workers'] = 1
    config['sample_batch_size'] = 50  # arbitrary
    config['train_batch_size'] = 50  # arbitrary
    config['sgd_minibatch_size'] = 10
    config['num_sgd_iter'] = 1
    config['horizon'] = HORIZON

    # save the flow params for replay
    flow_json = json.dumps(
        flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    create_env, env_name = make_create_env(params=flow_params, version=version)

    # Register as rllib env
    register_env(env_name, create_env)

    alg = ppo.PPOAgent(env=env_name, config=config)
    for i in range(num_runs):
        alg.train()
        checkpoint_path = alg.save('benchmark_tmp')
        # the original asserted on the formatted string itself, which is always
        # truthy; check that the checkpoint index file actually exists instead
        self.assertTrue(os.path.exists('%s.index' % checkpoint_path))
def instantiate_agent(**kwargs):
    # start Ray only if it is not already running
    try:
        ray.get([])
    except Exception:
        ray.init()
    register_env("my_env", lambda ec: pong_py.PongJSEnv())
    trainer = ppo.PPOAgent(env="my_env", config={"env_config": {}})
    return trainer
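# A possible simplification of the snippet above, as a hedged sketch: it
# assumes a Ray release that provides ray.is_initialized(); check your
# version before relying on it.
def instantiate_agent_alt(**kwargs):
    if not ray.is_initialized():
        ray.init()
    register_env("my_env", lambda ec: pong_py.PongJSEnv())
    return ppo.PPOAgent(env="my_env", config={"env_config": {}})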
def select_agent(config):
    algo = config["algo"].lower()
    env = config["env_config"]["env"]
    expe_config = merge_env_algo_config(config)

    if algo == "ppo":
        agent = ppo.PPOAgent(config=expe_config, env=env)
    elif algo == "apex":
        agent = dqn.ApexAgent(config=expe_config, env=env)
    elif algo == "apex_film_frozen":
        agent = FilmFrozenApex(config=expe_config, env=env)
    elif algo == "apex_vision_frozen":
        agent = VisionFrozenApex(config=expe_config, env=env)
    elif algo == "apex_film_frozen_loaded_weight":
        agent = FilmFrozenApexLoadedWeight(config=expe_config, env=env)
    elif algo == "apex_vision_frozen_loaded_weight":
        agent = VisionFrozenApexLoadedWeight(config=expe_config, env=env)
    else:
        raise NotImplementedError("Only PPO and the Apex variants are available")

    return agent
        graphics_render=False,
        delivery_locations=[(2, 2), (2, 7), (7, 2), (7, 7)]))

env = DeepLogisticsMultiEnv1()
register_env("DeepLogisticsMultiEnv1", lambda config: DeepLogisticsMultiEnv1())

# one PPO policy graph per agent id
policy_graphs = {
    k: (PPOPolicyGraph, env.observation_space, env.action_space,
        dict(gamma=0.95))
    for k, a in env.agents.items()
}
policy_ids = list(policy_graphs.keys())

trainer = ppo.PPOAgent(
    env="DeepLogisticsMultiEnv1",
    config=dict(
        multiagent=dict(
            policy_graphs=policy_graphs,
            policy_mapping_fn=lambda agent_id: agent_id),
        callbacks=dict(on_episode_end=tune.function(
            DeepLogisticsMultiEnv.on_episode_end))
        # num_envs_per_worker=4,
        # num_workers=2
    ))

while True:
    print(":D")
    print(trainer.train())
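# Hedged variant (not part of the original script): if all agents should share
# a single PPO policy instead of one policy per agent id, the multiagent block
# can be reduced to a single entry. The names below are assumptions made for
# illustration only.
shared_policy_graphs = {
    "shared": (PPOPolicyGraph, env.observation_space, env.action_space,
               dict(gamma=0.95))
}
shared_multiagent_config = dict(
    policy_graphs=shared_policy_graphs,
    policy_mapping_fn=lambda agent_id: "shared")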
    # 'num_workers': 32,
    'num_workers': 4,
    'gamma': 0.995,
    'lambda': 0.95,
    'clip_param': 0.1,
    'num_sgd_iter': 3,
    # 'sgd_batchsize': 4096,
    # 'sgd_batchsize': 128,
    'sgd_stepsize': 1e-4,
    'use_gae': True,
    'horizon': 4096,
    'entropy_coeff': 0.0,
    # 'devices': ['/gpu:0', '/gpu:1', '/gpu:2', '/gpu:3', '/gpu:4', '/gpu:5',
    #             '/gpu:6', '/gpu:7'],
    # 'devices': ['/gpu:0'],
    # 'num_gpus_per_worker': 1,
    'tf_session_args': {
        'gpu_options': {'allow_growth': True}
    }
})

alg = ppo.PPOAgent(config=config, env=env_name)

for i in range(10000000):
    result = alg.train()
    print('result = {}'.format(result))
    if i % 10000 == 0:
        checkpoint = alg.save()
        print('checkpoint saved at', checkpoint)
def create_env(env_config):
    pass_params_to_gym(env_name)
    env = gym.envs.make(env_name)
    return env


if __name__ == '__main__':
    register_env(env_name, lambda env_config: create_env(env_config))

    config = ppo.DEFAULT_CONFIG.copy()
    horizon = 10
    num_cpus = 4
    ray.init(num_cpus=num_cpus, redirect_output=True)
    config["num_workers"] = num_cpus
    config["timesteps_per_batch"] = 10
    config["sgd_batchsize"] = 10
    config["num_sgd_iter"] = 10
    config["gamma"] = 0.999
    config["horizon"] = horizon
    config["use_gae"] = True
    config["model"].update({"fcnet_hiddens": [256, 256]})

    options = {
        "multiagent_obs_shapes": [3, 3],
        "multiagent_act_shapes": [1, 1],
        "multiagent_shared_model": True,
        "multiagent_fcnet_hiddens": [[32, 32]] * 2
    }
    config["model"].update({"custom_options": options})

    alg = ppo.PPOAgent(env=env_name, config=config)
    for i in range(1):
        alg.train()
register_env("prosthetics_env", create_env) # Set up agent (agent references environment) # Use default integrator accuracy in test mode #agent_config['env_config'] = {'visualize': not args.token and args.visualize, 'integrator_accuracy': integrator_accuracy} agent_config['env_config'] = {'visualize': not args.token and args.visualize, "difficulty": 0} print('agent_config:\n%s' % (agent_config,)) if agent_type == 'DDPG': agent = ddpg.DDPGAgent( env="prosthetics_env", config=agent_config ) elif agent_type == 'PPO': agent = ppo.PPOAgent( env="prosthetics_env", config=agent_config ) else: raise ValueError('Unspported agent type') agent.restore(args.checkpoint) # Different inference procedure if using LSTM or not use_lstm = False if 'model' in agent_config: if 'use_lstm' in agent_config['model']: if agent_config['model']['use_lstm']: use_lstm = True if args.token: # Submit to competition
def test_ray(self):
    """
    Integration test for ray/rllib + flow
    """
    # Test 1: test_two_level_ray
    config = ppo.DEFAULT_CONFIG.copy()
    num_workers = 1
    ray.init(num_cpus=num_workers, redirect_output=False)
    config["num_workers"] = num_workers
    config["timesteps_per_batch"] = min(HORIZON * num_workers, 128)
    config["num_sgd_iter"] = 1
    config["model"].update({"fcnet_hiddens": [3, 3]})
    config["gamma"] = 0.999
    config["min_steps_per_task"] = HORIZON
    config["horizon"] = HORIZON
    config["sgd_batchsize"] = 4

    additional_env_params = {
        "target_velocity": 8,
        "scenario_type": LoopScenario
    }
    additional_net_params = {
        "length": 260,
        "lanes": 1,
        "speed_limit": 30,
        "resolution": 40
    }
    vehicle_params = [
        dict(veh_id="rl",
             num_vehicles=1,
             acceleration_controller=(RLController, {}),
             routing_controller=(ContinuousRouter, {})),
        dict(veh_id="idm",
             num_vehicles=21,
             acceleration_controller=(IDMController, {}),
             routing_controller=(ContinuousRouter, {}))
    ]

    flow_params = dict(
        sumo=dict(sim_step=0.1, no_step_log=False),
        env=dict(horizon=HORIZON, additional_params=additional_env_params),
        net=dict(no_internal_links=False,
                 additional_params=additional_net_params),
        veh=vehicle_params,
        initial=dict(spacing="uniform", bunching=30, min_gap=0))

    flow_env_name = "WaveAttenuationPOEnv"
    create_env, env_name = make_create_env(flow_env_name, flow_params, 0)

    # Register as rllib env
    registry.register_env(env_name, create_env)

    alg = ppo.PPOAgent(env=env_name, registry=registry.get_registry(),
                       config=config)
    for i in range(1):
        alg.train()
        checkpoint_path = alg.save()
        # as in the first snippet, assert on the checkpoint index file itself
        self.assertTrue(os.path.exists("%s.index" % checkpoint_path))

    # Test 2: test_two_level_ray
    # Integration test for two-level fcnet policy
    # FIXME(cathywu) ray restart currently not supported, so need to tie
    # integration tests together for the time being.
    # reload(ppo)
    # reload(registry)
    config = ppo.DEFAULT_CONFIG.copy()
    num_workers = 1
    # ray.init(num_cpus=num_workers, redirect_output=True)
    config["num_workers"] = num_workers
    config["timesteps_per_batch"] = min(HORIZON * num_workers, 128)
    config["num_sgd_iter"] = 1
    config["model"].update({"fcnet_hiddens": [3, 3]})
    config["gamma"] = 0.999
    config["min_steps_per_task"] = HORIZON
    config["horizon"] = HORIZON
    config["sgd_batchsize"] = 4
    config["model"].update({"fcnet_hiddens": [5, 3]})
    options = {
        "num_subpolicies": 2,
        "fn_choose_subpolicy": fn_choose_subpolicy,
        "hierarchical_fcnet_hiddens": [[3, 3]] * 2
    }
    config["model"].update({"custom_options": options})
import os

import ray
import ray.rllib.agents.ppo as ppo
from ray.tune.logger import pretty_print

# Based on https://github.com/ray-project/ray/blob/master/doc/source/rllib-training.rst#python-api
ray.init(log_to_driver=False)

config = ppo.DEFAULT_CONFIG.copy()
config["num_gpus"] = int(os.environ.get("SM_NUM_GPUS", 0))
checkpoint_dir = os.environ.get("SM_MODEL_DIR", "/Users/nadzeya/gym")
config["num_workers"] = 1
agent = ppo.PPOAgent(config=config, env="CartPole-v0")

# Can optionally call agent.restore(path) to load a checkpoint.
for i in range(5):
    # Perform one iteration of training the policy with PPO
    result = agent.train()
    print(pretty_print(result))

checkpoint = agent.save(checkpoint_dir=checkpoint_dir)
print("checkpoint saved at", checkpoint)
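# Hedged follow-up sketch (not part of the original script): restore the
# checkpoint saved above and run a single rollout with the agent's
# compute_action() API. "CartPole-v0" and `checkpoint` come from the snippet
# above; the rollout loop itself is an assumption about the old gym API.
import gym

agent.restore(checkpoint)
env = gym.make("CartPole-v0")
obs = env.reset()
done, total_reward = False, 0.0
while not done:
    action = agent.compute_action(obs)
    obs, reward, done, _ = env.step(action)
    total_reward += reward
print("rollout reward:", total_reward)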
agent = ppo.PPOAgent(
    env="prosthetics",
    config={
        # Discount factor
        "gamma": 0.998,
        # If true, use the Generalized Advantage Estimator (GAE)
        # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
        "use_gae": True,
        # Time horizon
        "horizon": 300,
        # Reward clipping (std)
        "clip_rewards": False,
        # Number of workers
        "num_workers": 72 * 4 + 5,  # 72 * 3 + 5
        # GAE(lambda) parameter
        "lambda": 0.95,
        # Initial coefficient for KL divergence
        "kl_coeff": 0.2,
        # Number of SGD iterations in each outer loop
        "num_sgd_iter": 10,
        # Stepsize of SGD
        "sgd_stepsize": 3e-4,
        # Total timesteps collected per training batch
        "timesteps_per_batch": 4000,
        # Size of rollout fragments collected from each worker
        "sample_batch_size": 128,
        # Coefficient of the value function loss
        "vf_loss_coeff": 1.0,
        # Coefficient of the entropy regularizer
        "entropy_coeff": 0.001,
        # PPO clip parameter
        "clip_param": 0.2,
        # Target value for KL divergence
        "kl_target": 0.01,
        # Number of GPUs to use for SGD
        "num_gpus": 1,
        # Whether to allocate GPUs for workers (if > 0).
        "num_gpus_per_worker": 0,
        # Whether to allocate CPUs for workers (if > 0).
        "num_cpus_per_worker": 1,
        # Observation preprocessor preference
        "preprocessor_pref": "rllib",
        # Whether to rollout "complete_episodes" or "truncate_episodes"
        "batch_mode": "complete_episodes",
        # Which observation filter to apply to the observation
        "observation_filter": "MeanStdFilter",
        # Use the sync samples optimizer instead of the multi-gpu one
        "simple_optimizer": True,
        # Override model config
        "model": {
            # Whether to use LSTM model
            "use_lstm": True,
            # Max seq length for LSTM training.
            "max_seq_len": 40,
            "fcnet_hiddens": [256, 256],
            "lstm_cell_size": 256
        },
    })
def main():
    # bs.init()
    # env_config = {}
    # test = BlueSkyEnv(env_config)
    # test.reset()
    # action = dict(SUP0=5)
    # for i in range(500):
    #     kaas = test.step(5)
    #     print('iteration loop nr: ' + str(i))
    #     print(kaas)

    # def make_env(i, n_cpu):
    #     def _init():
    #         env = gym.make('bluesky-v0', NodeID=i, n_cpu=n_cpu)
    #         return env
    #     return _init()
    #
    # n_cpu = 8
    # env = SubprocVecEnv([make_env(i, n_cpu) for i in range(n_cpu)])
    # policy_kwargs = dict(act_fun=tf.nn.tanh,
    #                      net_arch=[128, 128, dict(vf=[128, 128], pi=[128, 128])])
    # model = PPO2(MlpPolicy, env, verbose=0,
    #              tensorboard_log='/home/dennis/tensorboard/PPO2_2e6',
    #              n_steps=500, learning_rate=0.003, vf_coef=0.8, noptepochs=4,
    #              nminibatches=4, full_tensorboard_log=True,
    #              policy_kwargs=policy_kwargs, ent_coef=0.01)
    # model.learn(total_timesteps=2000000)
    # model.save("PPO2_1_222")

    # model = PPO2.load("PPO2_1")
    # env = gym.make('bluesky-v0', NodeID=0)
    # for i_episode in range(20):
    #     obs = env.reset()
    #     while True:
    #         action, _states = model.predict(obs)
    #         obs, rewards, dones, info = env.step(action)
    #         env.render()
    #         if dones:
    #             print("Episode finished after {} timesteps".format(t + 1))
    #             break

    # ################### NEW TEST ##################################
    # gym.envs.register(
    #     id='bluesky-v0',
    #     entry_point='gym_bluesky.envs:BlueSkyEnv',
    #     kwargs={'NodeID': 0,
    #             'n_cpu': 1,
    #             'scenfile': None})
    # env_config = EnvConfig()
    # env_config = 'kaas'
    # 'horizon': 500,
    # 'batch_mode': 'complete_episodes',
    # test = BlueSkyEnv(env_config)

    bs.settings
    ray.init()

    # env_creator = lambda config: make_env(config, 0, 0)
    register_env("Bluesky", lambda config: MultiEnv(config))
    # print('hallo2')
    # low_obs = np.array([-1, -1, -1, 0, 0, 0])
    # high_obs = np.array([1, 1, 1, 1, 1, 1])

    trainer = ppo.PPOAgent(
        env="Bluesky",
        config={
            "log_level": "INFO",
            'num_workers': 1,
            "vf_share_layers": True,
            # 'ignore_worker_failures': True,
            # 'num_cpus_per_worker': 16,
            'num_envs_per_worker': 2,
            'env_config': {
                'nr_nodes': 12
            },
            'horizon': 500,
            'batch_mode': 'complete_episodes',
            'model': {
                'fcnet_hiddens': [256, 256],
                "use_lstm": False
            },
            'sample_batch_size': 200,
            'train_batch_size': 4000,
            'vf_clip_param': 50
        })

    for i in range(151):
        trainer.train()
        if i % 10 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint, i)
    print('hallo3')
import ray
from ray.tune.registry import register_env
from ray.rllib.agents import ppo


def env_creator(env_config):
    import gym
    return gym.make("CartPole-v0")  # or return your own custom env


register_env("my_env", env_creator)
ray.init()
trainer = ppo.PPOAgent(
    env="my_env",
    config={
        "env_config": {},  # config to pass to env creator
    })

while True:
    print(trainer.train())
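# Alternative, as a hedged sketch: the same registered env can be trained via
# Tune instead of driving the train loop by hand. This assumes a Ray release
# where tune.run accepts the "PPO" trainable name and a stop criterion; it is
# not part of the original snippet.
from ray import tune

tune.run(
    "PPO",
    stop={"training_iteration": 10},
    config={"env": "my_env", "env_config": {}},
)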
import ray
import ray.rllib.agents.ppo as ppo
from ray.tune.logger import pretty_print

ray.init()
config = ppo.DEFAULT_CONFIG.copy()
config['num_workers'] = 16
agent = ppo.PPOAgent(config, env='FetchPickAndPlace-v1')

for i in range(3000):
    result = agent.train()
    print(pretty_print(result))
    if i % 100 == 0:
        checkpoint = agent.save()
        print('checkpoint saved at', checkpoint)
        self.action = action

    def increment(self):
        self.value += 1
        return self.value, self.action


ray.init(ignore_reinit_error=True)
count = Counter.remote()

config = ppo.DEFAULT_CONFIG.copy()
config["num_gpus"] = 1
config["num_workers"] = 0
config["eager"] = False
agent = ppo.PPOAgent(config=config, env=TuxEnv)
agent.restore("project/checkpoint-50")

prev_img = None


def drive(img, kart: pystk.Kart):
    """
    @img: (120,160,3) RGB image
    return: pystk.Action
    """
    img = np.asarray(img) / 255.0
    i, old_action = ray.get(count.increment.remote())
    # print(i, old_action)
    prev = ray.get(count.get_img.remote())
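# Hedged sketch of the actor pattern used above: the snippet begins mid-class,
# so the full Counter definition is not shown. The fields and get_img() method
# below are assumptions made only to illustrate how a Ray actor shares state
# across calls to drive(); they are not the original implementation.
import ray


@ray.remote
class CounterSketch:
    def __init__(self):
        self.value = 0
        self.action = None
        self.img = None

    def increment(self):
        self.value += 1
        return self.value, self.action

    def get_img(self):
        return self.img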
"num_subpolicies": 2, "fn_choose_subpolicy": fn_choose_subpolicy, "hierarchical_fcnet_hiddens": [[32, 32]] * 2 } config["model"].update({"custom_options": options}) flow_env_name = "TwoLoopsMergePOEnv" exp_tag = "merge_two_level_policy_example" this_file = os.path.basename(__file__)[:-3] # filename without '.py' flow_params["flowenv"] = flow_env_name flow_params["exp_tag"] = exp_tag flow_params["module"] = os.path.basename(__file__)[:-3] config['model']['custom_options'].update({ 'flowenv': flow_env_name, 'exp_tag': exp_tag, 'module': this_file }) create_env, env_name = make_create_env(flow_env_name, flow_params, version=0, exp_tag=exp_tag) # Register as rllib env register_rllib_env(env_name, create_env) alg = ppo.PPOAgent(env=env_name, registry=get_registry(), config=config) for i in range(2): alg.train() if i % 20 == 0: alg.save() # save checkpoint