def check_support_multiagent(alg, config): register_env("multi_mountaincar", lambda _: MultiMountainCar(2)) register_env("multi_cartpole", lambda _: MultiCartpole(2)) if "DDPG" in alg: a = get_agent_class(alg)(config=config, env="multi_mountaincar") else: a = get_agent_class(alg)(config=config, env="multi_cartpole") try: a.train() finally: a.stop()
def check_support_multiagent(alg, config): register_env("multi_mountaincar", lambda _: MultiMountainCar(2)) register_env("multi_cartpole", lambda _: MultiCartpole(2)) config["log_level"] = "ERROR" if "DDPG" in alg: a = get_agent_class(alg)(config=config, env="multi_mountaincar") else: a = get_agent_class(alg)(config=config, env="multi_cartpole") try: a.train() finally: a.stop()
def check_support_multiagent(alg, config): register_env("multi_agent_mountaincar", lambda _: MultiAgentMountainCar({"num_agents": 2})) register_env("multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": 2})) config["log_level"] = "ERROR" for _ in framework_iterator(config, frameworks=("torch", "tf")): if alg in ["DDPG", "APEX_DDPG", "SAC"]: a = get_agent_class(alg)( config=config, env="multi_agent_mountaincar") else: a = get_agent_class(alg)(config=config, env="multi_agent_cartpole") print(a.train()) a.stop()
def check_support(alg, config, test_eager=False, test_trace=True):
    config["framework"] = "tfe"
    config["log_level"] = "ERROR"
    # Test both continuous and discrete actions.
    for cont in [True, False]:
        if cont and alg in ["DQN", "APEX", "SimpleQ"]:
            continue
        elif not cont and alg in ["DDPG", "APEX_DDPG", "TD3"]:
            continue
        if cont:
            config["env"] = "Pendulum-v0"
        else:
            config["env"] = "CartPole-v0"
        a = get_agent_class(alg)
        if test_eager:
            print("tf-eager: alg={} cont.act={}".format(alg, cont))
            config["eager_tracing"] = False
            tune.run(
                a, config=config, stop={"training_iteration": 1}, verbose=1)
        if test_trace:
            config["eager_tracing"] = True
            print("tf-eager-tracing: alg={} cont.act={}".format(alg, cont))
            tune.run(
                a, config=config, stop={"training_iteration": 1}, verbose=1)

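# A minimal sketch of exercising the eager/tracing check above for a couple
# of algorithms; assumes ray has been initialized by the surrounding test
# harness, and the single-worker config is illustrative.
check_support("PPO", {"num_workers": 0}, test_eager=True)
check_support("DQN", {"num_workers": 0}, test_trace=True)
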
def run(args, config):
    local_mode = False
    if args.dbg:
        local_mode = True
    ray.init(local_mode=local_mode)

    cls = get_agent_class(args._run)
    agent = cls(env=args.env, config=config)
    agent.restore(args.checkpoint)

    num_steps = int(1e9)

    render_frameskip = args.render_action_repeat
    if render_frameskip == -1:  # default - read from config
        # Fall back to the env config's value if the CLI did not override it.
        render_frameskip = cfg_param('skip_frames',
                                     config.get('env_config', None))

    log.info('Using render frameskip %d! \n\n\n', render_frameskip)

    rollout_loop(
        agent,
        args.env,
        num_steps,
        num_episodes=args.num_episodes,
        no_render=args.no_render,
        fps=args.fps,
        frameskip=render_frameskip,
    )

def check_support(alg, config, stats, check_bounds=False, name=None):
    covered_a = set()
    covered_o = set()
    config["log_level"] = "ERROR"
    for a_name, action_space in ACTION_SPACES_TO_TEST.items():
        for o_name, obs_space in OBSERVATION_SPACES_TO_TEST.items():
            print("=== Testing", alg, action_space, obs_space, "===")
            stub_env = make_stub_env(action_space, obs_space, check_bounds)
            register_env("stub_env", lambda c: stub_env())
            stat = "ok"
            a = None
            try:
                if a_name in covered_a and o_name in covered_o:
                    stat = "skip"  # Speed up tests by avoiding full grid.
                else:
                    a = get_agent_class(alg)(config=config, env="stub_env")
                    a.train()
                    covered_a.add(a_name)
                    covered_o.add(o_name)
            except UnsupportedSpaceException:
                stat = "unsupported"
            except Exception as e:
                stat = "ERROR"
                print(e)
                print(traceback.format_exc())
            finally:
                if a:
                    try:
                        a.stop()
                    except Exception as e:
                        print("Ignoring error stopping agent", e)
            print(stat)
            print()
            stats[name or alg, a_name, o_name] = stat

def __init__(self, sim_config, algoConfig, checkPointPath): import ray from ray.tune import run_experiments from ray.tune.registry import register_env from ray.rllib.agents.registry import get_agent_class from v2i import V2I # Do Essentials algoConfig["EXP_NAME"]["config"]["num_workers"] = 2 algoConfig["EXP_NAME"]["config"]["num_envs_per_worker"] = 1 algoConfig["EXP_NAME"]["config"]["train_batch_size"] = algoConfig[ "EXP_NAME"]["config"]["num_workers"] * algoConfig["EXP_NAME"][ "config"]["sgd_minibatch_size"] simConfigYaml = readYaml(sim_config) self.lstmEnabled = False if simConfigYaml['config']['enable-lstm']: algoConfig['EXP_NAME']['config']['model']['use_lstm'] = True self.lstmEnabled = True else: algoConfig['EXP_NAME']['config']['model']['use_lstm'] = False env_creator_name = "v2i-v0" register_env(env_creator_name, lambda config: V2I.V2I(sim_config, "train")) ray.init() cls = get_agent_class('PPO') self.agent = cls(env=env_creator_name, config=algoConfig["EXP_NAME"]["config"]) self.agent.restore(checkPointPath) print("Loaded Checkpoint -> %s" % (checkPointPath))
def test_export(algo_name, failures): cls = get_agent_class(algo_name) if "DDPG" in algo_name: algo = cls(config=CONFIGS[name], env="Pendulum-v0") else: algo = cls(config=CONFIGS[name], env="CartPole-v0") for _ in range(3): res = algo.train() print("current status: " + str(res)) export_dir = "/tmp/export_dir_%s" % algo_name print("Exporting model ", algo_name, export_dir) algo.export_policy_model(export_dir) if not os.path.exists(os.path.join(export_dir, "saved_model.pb")) \ or not os.listdir(os.path.join(export_dir, "variables")): failures.append(algo_name) shutil.rmtree(export_dir) print("Exporting checkpoint", algo_name, export_dir) algo.export_policy_checkpoint(export_dir) if not os.path.exists(os.path.join(export_dir, "model.meta")) \ or not os.path.exists(os.path.join(export_dir, "model.index")) \ or not os.path.exists(os.path.join(export_dir, "checkpoint")): failures.append(algo_name) shutil.rmtree(export_dir)
def _restore(agent_type, run_name, ckpt, env_name, extra_config=None,
             existing_agent=None):
    assert isinstance(agent_type, str) or issubclass(agent_type, Trainer)
    if existing_agent is not None:
        agent = existing_agent
    else:
        change_model = None
        use_activation_model = False
        if agent_type == "PPOAgentWithActivation":
            cls = PPOAgentWithActivation
            change_model = "fc_with_activation"
            use_activation_model = True
        elif agent_type == "PPOAgentWithMask":
            cls = PPOAgentWithMask
            change_model = "fc_with_mask"
            use_activation_model = True
        elif (not isinstance(agent_type, str)) and issubclass(
                agent_type, Trainer):
            cls = agent_type
        else:
            cls = get_agent_class(run_name)
        is_es_agent = run_name == "ES"
        config = build_config(ckpt, extra_config, is_es_agent, change_model,
                              use_activation_model)
        # The original passed `config` as an extra positional arg with no
        # format placeholder; use %s so stdlib logging formats it correctly.
        logger.info("The config of restored agent: %s", config)
        agent = cls(env=env_name, config=config)
    if ckpt is not None:
        ckpt = os.path.abspath(os.path.expanduser(ckpt))  # Remove relative dir
        agent.restore(ckpt)
    return agent

def run(args, parser):
    config = {}
    # Load configuration from file.
    config_dir = os.path.dirname(args.checkpoint)
    config_path = os.path.join(config_dir, "params.pkl")
    if not os.path.exists(config_path):
        config_path = os.path.join(config_dir, "../params.pkl")
    if not os.path.exists(config_path):
        if not args.config:
            raise ValueError(
                "Could not find params.pkl in either the checkpoint dir or "
                "its parent directory.")
    else:
        with open(config_path, 'rb') as f:
            config = pickle.load(f)
    if "num_workers" in config:
        config["num_workers"] = min(2, config["num_workers"])
    config = merge_dicts(config, args.config)
    if not args.env:
        if not config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = config.get("env")

    ray.init()
    cls = get_agent_class(args.run)
    agent = cls(env=args.env, config=config)
    agent.restore(args.checkpoint)
    num_steps = int(args.steps)
    rollout(agent, args.env, num_steps, args.out, args.no_render)

def __init__(self, sim_config, algoConfig, checkPointPath): import ray from ray.tune import run_experiments from ray.tune.registry import register_env from ray.rllib.agents.registry import get_agent_class from v2i import V2I # Do Essentials algoConfig["EXP_NAME"]["config"]["num_workers"] = 2 algoConfig["EXP_NAME"]["config"]["num_envs_per_worker"] = 1 algoConfig["EXP_NAME"]["config"]["train_batch_size"] = algoConfig[ "EXP_NAME"]["config"]["num_workers"] * algoConfig["EXP_NAME"][ "config"]["sample_batch_size"] env_creator_name = "v2i-v0" register_env(env_creator_name, lambda config: V2I.V2I(sim_config, "train")) ray.init() cls = get_agent_class('IMPALA') self.agent = cls(env=env_creator_name, config=algoConfig["EXP_NAME"]["config"]) self.agent.restore(checkPointPath) print("Loaded Checkpoint -> %s" % (checkPointPath))
def _register_all():
    from ray.rllib.agents.trainer import Trainer, with_common_config
    from ray.rllib.agents.registry import ALGORITHMS, get_agent_class
    from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS

    for key in list(ALGORITHMS.keys()) + list(CONTRIBUTED_ALGORITHMS.keys(
    )) + ["__fake", "__sigmoid_fake_data", "__parameter_tuning"]:
        register_trainable(key, get_agent_class(key))

    def _see_contrib(name):
        """Returns a dummy agent class warning that the algo is in contrib/."""

        class _SeeContrib(Trainer):
            _name = "SeeContrib"
            _default_config = with_common_config({})

            def setup(self, config):
                raise NameError(
                    "Please run `contrib/{}` instead.".format(name))

        return _SeeContrib

    # Also register the aliases minus contrib/ to give a good error message.
    for key in list(CONTRIBUTED_ALGORITHMS.keys()):
        assert key.startswith("contrib/")
        alias = key.split("/", 1)[1]
        register_trainable(alias, _see_contrib(alias))

def get_default_config(params, env):
    """
    Return the default configuration for a specific type of algorithm.

    :param params: (dict) general dictionary containing every configuration
        parameter (env, network, inflow ...)
    :return: (dict) the merged algorithm configuration
    """
    # Get the original config from the algorithm.
    config = get_agent_class(Params.training_alg)._default_config.copy()

    # Apply algorithm-agnostic changes.
    config = env_config(config)
    config = eval_config(config)
    config = model_config(config)
    config = flow_config(params, config)
    config = performance_config(config)

    if Params.training_alg == "PPO":
        config = ppo_config(config)
    elif Params.training_alg == "MARWIL":
        config = marwil_config(config)
    elif Params.training_alg == "contrib/MADDPG":
        config = maddpg_config(config, env)
    else:
        raise NotImplementedError(
            f"{Params.training_alg} has not been implemented")
    return config

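# A minimal usage sketch, assuming Params.training_alg is set to "PPO" and
# the helper config functions above are importable; the params dict here is
# hypothetical, and env is only consulted on the MADDPG path.
config = get_default_config(params={"inflow": 300}, env=None)
print(sorted(config.keys()))
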
def backtest(self, checkpoint_path):
    agent_config, assets, currency, datapoints, granularity, _ = \
        get_instruments_from_checkpoint(checkpoint_path)

    config = {
        'assets': assets,
        'currency': currency,
        'granularity': granularity,
        'datapoints': datapoints,
        'df_complete': {},
        'df_features': {},
        'variables': self.config_spec_variables
    }
    for asset in assets:
        config['df_complete'][asset] = self.df[asset]['rollout']
        config['df_features'][asset] = self.df[asset][
            'rollout'].loc[:, self.df[asset]['rollout'].columns != 'Date']

    register_env(env_name, lambda _: TradingEnv(config))
    ray.init()

    cls = get_agent_class('PPO')
    agent = cls(env=env_name, config=agent_config)
    agent.restore(checkpoint_path)

    num_steps = int(len(config['df_complete'][assets[0]]))
    no_render = False
    rollout(agent, env_name, num_steps, no_render)

def check_support(alg, config, stats, check_bounds=False):
    for a_name, action_space in ACTION_SPACES_TO_TEST.items():
        for o_name, obs_space in OBSERVATION_SPACES_TO_TEST.items():
            print("=== Testing", alg, action_space, obs_space, "===")
            stub_env = make_stub_env(action_space, obs_space, check_bounds)
            register_env("stub_env", lambda c: stub_env())
            stat = "ok"
            a = None
            try:
                a = get_agent_class(alg)(config=config, env="stub_env")
                a.train()
            except UnsupportedSpaceException:
                stat = "unsupported"
            except Exception as e:
                stat = "ERROR"
                print(e)
                print(traceback.format_exc())
            finally:
                if a:
                    try:
                        a.stop()
                    except Exception as e:
                        print("Ignoring error stopping agent", e)
            print(stat)
            print()
            stats[alg, a_name, o_name] = stat

def test_pettingzoo_env(self):
    register_env("prison", lambda _: PettingZooEnv(simple_spread_v0.env()))

    agent_class = get_agent_class("PPO")
    config = deepcopy(agent_class._default_config)

    test_env = PettingZooEnv(simple_spread_v0.env())
    obs_space = test_env.observation_space
    act_space = test_env.action_space
    test_env.close()

    config["multiagent"] = {
        "policies": {
            # The first tuple value is None -> uses the default policy.
            "av": (None, obs_space, act_space, {}),
        },
        "policy_mapping_fn": lambda agent_id: "av"
    }
    config["log_level"] = "DEBUG"
    config["num_workers"] = 0
    config["rollout_fragment_length"] = 30
    config["train_batch_size"] = 200
    config["horizon"] = 200  # After n steps, force reset simulation.
    config["no_done_at_end"] = False

    agent = agent_class(env="prison", config=config)
    agent.train()

def create_tf_serving_model(self, algorithm=None, env_string=None):
    self.register_env_creator()
    self.register_algorithms_and_preprocessors()
    if ray.__version__ >= "0.6.5":
        from ray.rllib.agents.registry import get_agent_class
    else:
        from ray.rllib.agents.agent import get_agent_class
    cls = get_agent_class(algorithm)

    with open(os.path.join(MODEL_OUTPUT_DIR, "params.json")) as config_json:
        config = json.load(config_json)
    use_torch = config.get("use_pytorch", False)
    if not use_torch:
        if "callbacks" in config:
            # The saved config stores callbacks as a stringified class, e.g.
            # "<class 'module.MyCallbacks'>". Recover the bare class name and
            # resolve it back to an object in scope. The original resolution
            # call (`ast.literal_eval()(callback_cls)`) was garbled; eval()
            # here is an assumption about the intended behavior.
            callback_cls_str = config["callbacks"]
            callback_cls_name = callback_cls_str.split("'")[-2].split(".")[-1]
            config["callbacks"] = eval(callback_cls_name)
        print("Loaded config for TensorFlow serving.")
        config["monitor"] = False
        config["num_workers"] = 1
        config["num_gpus"] = 0
        agent = cls(env=env_string, config=config)
        checkpoint = os.path.join(MODEL_OUTPUT_DIR, "checkpoint")
        agent.restore(checkpoint)
        export_tf_serving(agent, MODEL_OUTPUT_DIR)

def training_workflow(config_, reporter):
    # Build the trainer.
    cls = get_agent_class(args.algo)
    trainer = cls(env=CityflowGymEnv, config=config_)
    for i in range(args.epoch):
        res = trainer.train()
        reporter(**res)

def setup_PPO_exp():
    alg_run = 'PPO'
    agent_cls = get_agent_class(alg_run)
    config = agent_cls._default_config.copy()
    config['num_workers'] = n_cpus
    config['train_batch_size'] = horizon * rollouts
    config['gamma'] = discount_rate
    config['use_gae'] = True
    config['lambda'] = 0.97
    config['kl_target'] = 0.02
    config['num_sgd_iter'] = 10
    config['clip_actions'] = False  # FIXME(ev) temporary ray bug
    config['horizon'] = horizon
    config['model'].update({'fcnet_hiddens': [32, 32]})

    # Save the flow params for replay.
    flow_json = json.dumps(flow_params, cls=FlowParamsEncoder,
                           sort_keys=True, indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    create_env, gym_name = make_create_env(params=flow_params, version=0)

    # Register as an rllib env.
    register_env(gym_name, create_env)
    return alg_run, gym_name, config

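# A sketch of how the returned triple is typically consumed in Flow-style
# experiment scripts; the experiment name "flow_ppo_exp" and the stop/
# checkpoint settings are illustrative, not taken from the source.
import ray
from ray.tune import run_experiments

alg_run, gym_name, config = setup_PPO_exp()
ray.init(num_cpus=n_cpus + 1)
run_experiments({
    "flow_ppo_exp": {
        "run": alg_run,
        "env": gym_name,
        "config": config,
        "checkpoint_freq": 20,
        "stop": {"training_iteration": 200},
    },
})
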
def test_ckpt_restore(use_object_store, alg_name, failures):
    cls = get_agent_class(alg_name)
    # Note: the original indexed CONFIGS with an undefined `name`; the key
    # must be `alg_name`.
    if "DDPG" in alg_name:
        alg1 = cls(config=CONFIGS[alg_name], env="Pendulum-v0")
        alg2 = cls(config=CONFIGS[alg_name], env="Pendulum-v0")
    else:
        alg1 = cls(config=CONFIGS[alg_name], env="CartPole-v0")
        alg2 = cls(config=CONFIGS[alg_name], env="CartPole-v0")

    for _ in range(3):
        res = alg1.train()
        print("current status: " + str(res))

    # Sync the models.
    if use_object_store:
        alg2.restore_from_object(alg1.save_to_object())
    else:
        alg2.restore(alg1.save())

    for _ in range(10):
        if "DDPG" in alg_name:
            obs = np.random.uniform(size=3)
        else:
            obs = np.random.uniform(size=4)
        a1 = get_mean_action(alg1, obs)
        a2 = get_mean_action(alg2, obs)
        print("Checking computed actions", alg1, obs, a1, a2)
        if abs(a1 - a2) > .1:
            failures.append((alg_name, [a1, a2]))

def check_support(alg, config, stats, check_bounds=False, name=None):
    for a_name, action_space in ACTION_SPACES_TO_TEST.items():
        for o_name, obs_space in OBSERVATION_SPACES_TO_TEST.items():
            print("=== Testing", alg, action_space, obs_space, "===")
            stub_env = make_stub_env(action_space, obs_space, check_bounds)
            register_env("stub_env", lambda c: stub_env())
            stat = "ok"
            a = None
            try:
                a = get_agent_class(alg)(config=config, env="stub_env")
                a.train()
            except UnsupportedSpaceException:
                stat = "unsupported"
            except Exception as e:
                stat = "ERROR"
                print(e)
                print(traceback.format_exc())
            finally:
                if a:
                    try:
                        a.stop()
                    except Exception as e:
                        print("Ignoring error stopping agent", e)
            print(stat)
            print()
            stats[name or alg, a_name, o_name] = stat

def ckpt_restore_test(use_object_store, alg_name, failures):
    cls = get_agent_class(alg_name)
    if "DDPG" in alg_name or "SAC" in alg_name:
        alg1 = cls(config=CONFIGS[alg_name], env="Pendulum-v0")
        alg2 = cls(config=CONFIGS[alg_name], env="Pendulum-v0")
        env = gym.make("Pendulum-v0")
    else:
        alg1 = cls(config=CONFIGS[alg_name], env="CartPole-v0")
        alg2 = cls(config=CONFIGS[alg_name], env="CartPole-v0")
        env = gym.make("CartPole-v0")

    for _ in range(2):
        res = alg1.train()
        print("current status: " + str(res))

    # Sync the models.
    if use_object_store:
        alg2.restore_from_object(alg1.save_to_object())
    else:
        alg2.restore(alg1.save())

    for _ in range(5):
        if "DDPG" in alg_name or "SAC" in alg_name:
            obs = np.clip(np.random.uniform(size=3),
                          env.observation_space.low,
                          env.observation_space.high)
        else:
            obs = np.clip(np.random.uniform(size=4),
                          env.observation_space.low,
                          env.observation_space.high)
        a1 = get_mean_action(alg1, obs)
        a2 = get_mean_action(alg2, obs)
        print("Checking computed actions", alg1, obs, a1, a2)
        if abs(a1 - a2) > .1:
            failures.append((alg_name, [a1, a2]))

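# A sketch of driving ckpt_restore_test over both restore paths, mirroring
# RLlib's own checkpoint tests; the algorithm list is illustrative and must
# match the keys present in CONFIGS.
failures = []
for use_object_store in [False, True]:
    for alg_name in ["PPO", "DQN", "SAC"]:
        ckpt_restore_test(use_object_store, alg_name, failures)
assert not failures, failures
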
def _register_all(): from ray.rllib.agents.registry import ALGORITHMS from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS for key in list(ALGORITHMS.keys()) + list(CONTRIBUTED_ALGORITHMS.keys( )) + ["__fake", "__sigmoid_fake_data", "__parameter_tuning"]: from ray.rllib.agents.registry import get_agent_class register_trainable(key, get_agent_class(key))
def get_agent_class(agent_name):
    """Return the class that corresponds to agent_name."""
    if agent_name in CUSTOM_ALGORITHMS:
        return CUSTOM_ALGORITHMS[agent_name]
    else:
        return reg.get_agent_class(agent_name)

def check_support(alg, config): config["eager"] = True if alg in ["APEX_DDPG", "TD3", "DDPG", "SAC"]: config["env"] = "Pendulum-v0" else: config["env"] = "CartPole-v0" a = get_agent_class(alg) tune.run(a, config=config, stop={"training_iteration": 0})
def get_agent(agent_name):
    try:
        agent_class = get_agent_class(agent_name.upper())
    except Exception as e:
        print("%s Loading basic algorithm" % e)
        # We use PG as the base class for experiments.
        agent_class = type(agent_name.upper(), (MaxAgent, ), {})
    return agent_class

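# A minimal usage sketch: built-in names resolve through RLlib's registry,
# while unknown names fall back to a MaxAgent-derived stub class. The name
# "mybot" is hypothetical.
ppo_cls = get_agent("ppo")      # resolves to RLlib's PPO trainer class
stub_cls = get_agent("mybot")   # unknown name -> MaxAgent subclass
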
def check_support(alg, config, stats, check_bounds=False, name=None):
    covered_a = set()
    covered_o = set()
    config["log_level"] = "ERROR"
    first_error = None
    torch = config.get("use_pytorch", False)
    for a_name, action_space in ACTION_SPACES_TO_TEST.items():
        for o_name, obs_space in OBSERVATION_SPACES_TO_TEST.items():
            print("=== Testing {} (torch={}) A={} S={} ===".format(
                alg, torch, action_space, obs_space))
            stub_env = make_stub_env(action_space, obs_space, check_bounds)
            register_env("stub_env", lambda c: stub_env())
            stat = "ok"
            a = None
            try:
                if a_name in covered_a and o_name in covered_o:
                    stat = "skip"  # Speed up tests by avoiding full grid.
                else:
                    a = get_agent_class(alg)(config=config, env="stub_env")
                    if alg not in ["DDPG", "ES", "ARS", "SAC"]:
                        if o_name in ["atari", "image"]:
                            if torch:
                                assert isinstance(a.get_policy().model,
                                                  TorchVisionNetV2)
                            else:
                                assert isinstance(a.get_policy().model,
                                                  VisionNetV2)
                        elif o_name in ["vector", "vector2"]:
                            if torch:
                                assert isinstance(a.get_policy().model,
                                                  TorchFCNetV2)
                            else:
                                assert isinstance(a.get_policy().model,
                                                  FCNetV2)
                    a.train()
                    covered_a.add(a_name)
                    covered_o.add(o_name)
            except UnsupportedSpaceException:
                stat = "unsupported"
            except Exception as e:
                stat = "ERROR"
                print(e)
                print(traceback.format_exc())
                first_error = first_error if first_error is not None else e
            finally:
                if a:
                    try:
                        a.stop()
                    except Exception as e:
                        print("Ignoring error stopping agent", e)
            print(stat)
            print()
            stats[name or alg, a_name, o_name] = stat

    # If anything went wrong, re-raise the first error seen.
    if first_error is not None:
        raise first_error

def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"): """Run heuristic policies vs a learned agent. The learned agent should eventually reach a reward of ~5 with use_lstm=False, and ~7 with use_lstm=True. The reason the LSTM policy can perform better is since it can distinguish between the always_same vs beat_last heuristics. """ def select_policy(agent_id): if agent_id == "player1": return "learned" else: return random.choice(["always_same", "beat_last"]) config = { "env": RockPaperScissors, "gamma": 0.9, # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), "num_workers": 0, "num_envs_per_worker": 4, "rollout_fragment_length": 10, "train_batch_size": 200, "multiagent": { "policies_to_train": ["learned"], "policies": { "always_same": (AlwaysSameHeuristic, Discrete(3), Discrete(3), {}), "beat_last": (BeatLastHeuristic, Discrete(3), Discrete(3), {}), "learned": (None, Discrete(3), Discrete(3), { "model": { "use_lstm": use_lstm }, "framework": "torch" if args.torch else "tf", }), }, "policy_mapping_fn": select_policy, }, "framework": "torch" if args.torch else "tf", } cls = get_agent_class(trainer) if isinstance(trainer, str) else trainer trainer_obj = cls(config=config) env = trainer_obj.workers.local_worker().env for _ in range(args.stop_iters): results = trainer_obj.train() print(results) # Timesteps reached. if results["timesteps_total"] > args.stop_timesteps: break # Reward (difference) reached -> all good, return. elif env.player1_score - env.player2_score > args.stop_reward: return # Reward (difference) not reached: Error if `as_test`. if args.as_test: raise ValueError( "Desired reward difference ({}) not reached! Only got to {}.". format(args.stop_reward, env.player1_score - env.player2_score))
def run(args, parser, env_config={}):
    if not args.config:
        # Load configuration from file; params.json is saved in the model
        # directory during ray training by default.
        config_dir = os.path.dirname(args.checkpoint)
        config_path = os.path.join(config_dir, "params.json")
        with open(config_path) as f:
            args.config = json.load(f)

    if not args.env:
        if not args.config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = args.config.get("env")

    ray.init()

    config = args.config
    config["monitor"] = False
    config["num_workers"] = 1
    config["num_gpus"] = 0
    env_config = config["env_config"]

    from gameserver_env import GameServerEnv
    env = GameServerEnv(env_config)

    if ray.__version__ >= "0.6.5":
        from ray.rllib.agents.registry import get_agent_class
    else:
        from ray.rllib.agents.agent import get_agent_class
    cls = get_agent_class(args.algorithm)
    agent = cls(env=GameServerEnv, config=config)
    agent.restore(args.checkpoint)

    num_episodes = int(args.evaluate_episodes)
    env = wrappers.Monitor(env, OUTPUT_DIR, force=True,
                           video_callable=lambda episode_id: True)

    all_rewards = []
    for episode in range(num_episodes):
        steps = 0
        state = env.reset()
        done = False
        reward_total = 0.0
        while not done:
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            steps += 1
            state = next_state
        all_rewards.append(reward_total)
        print("Episode reward: %s. Episode steps: %s" % (reward_total, steps))

    print("Mean Reward:", np.mean(all_rewards))
    print("Max Reward:", np.max(all_rewards))
    print("Min Reward:", np.min(all_rewards))

def create_tf_serving_model(self, algorithm=None, env_string=None,
                            config=None):
    self.register_env_creator()
    cls = get_agent_class(algorithm)
    config["monitor"] = False
    config["num_workers"] = 1
    config["num_gpus"] = 0
    agent = cls(env=env_string, config=config)
    checkpoint = os.path.join(MODEL_OUTPUT_DIR, "checkpoint")
    agent.restore(checkpoint)
    export_tf_serving(agent, MODEL_OUTPUT_DIR)

def train_and_export(algo_name, num_steps, model_dir, ckpt_dir, prefix):
    cls = get_agent_class(algo_name)
    alg = cls(config={}, env="CartPole-v0")
    for _ in range(num_steps):
        alg.train()

    # Export tensorflow checkpoint for fine-tuning.
    alg.export_policy_checkpoint(ckpt_dir, filename_prefix=prefix)
    # Export tensorflow SavedModel for online serving.
    alg.export_policy_model(model_dir)

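# A hypothetical driver for train_and_export; the directory paths and prefix
# are placeholders, and ray.init() is assumed to have been called by the
# caller.
train_and_export(
    algo_name="PPO",
    num_steps=1,
    model_dir="/tmp/model_export_dir",
    ckpt_dir="/tmp/ckpt_export_dir",
    prefix="model.ckpt",
)
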
def run(args, parser):
    def create_environment(env_config={}):
        return RoboschoolReacher()

    if not args.config:
        # Load configuration from file; params.json is saved in the model
        # directory during ray training by default.
        config_dir = os.path.dirname(args.checkpoint)
        config_path = os.path.join(config_dir, "params.json")
        with open(config_path) as f:
            args.config = json.load(f)

    if not args.env:
        if not args.config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = args.config.get("env")

    ray.init()
    register_env(args.env, create_environment)

    if ray.__version__ >= "0.6.5":
        from ray.rllib.agents.registry import get_agent_class
    else:
        from ray.rllib.agents.agent import get_agent_class
    cls = get_agent_class(args.algorithm)

    config = args.config
    config["monitor"] = False
    config["num_workers"] = 1
    config["num_gpus"] = 0

    agent = cls(env=args.env, config=config)
    agent.restore(args.checkpoint)
    num_episodes = int(args.evaluate_episodes)

    env = RoboschoolReacher()
    all_rewards = []
    # Use max_steps as the stopping condition, since this env does not
    # return done=True.
    max_steps = 100
    for episode in range(num_episodes):
        steps = 0
        state = env.reset()
        done = False
        reward_total = 0.0
        while steps < max_steps:
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            steps += 1
            state = next_state
        all_rewards.append(reward_total)
        print("Episode reward: %s. Episode steps: %s" % (reward_total, steps))

    print("Mean Reward:", np.mean(all_rewards))
    print("Max Reward:", np.max(all_rewards))
    print("Min Reward:", np.min(all_rewards))

def _doTestFaultFatal(self, alg, config):
    register_env("fault_env", lambda c: FaultInjectEnv(c))
    agent_cls = get_agent_class(alg)

    # Test that a real error is raised when we run out of workers.
    config["num_workers"] = 2
    config["ignore_worker_failures"] = True
    config["env_config"] = {"bad_indices": [1, 2]}
    a = agent_cls(config=config, env="fault_env")
    self.assertRaises(Exception, lambda: a.train())
    a.stop()

def test_export(algo_name, failures):
    def valid_tf_model(model_dir):
        return os.path.exists(os.path.join(model_dir, "saved_model.pb")) \
            and os.listdir(os.path.join(model_dir, "variables"))

    def valid_tf_checkpoint(checkpoint_dir):
        return os.path.exists(os.path.join(checkpoint_dir, "model.meta")) \
            and os.path.exists(os.path.join(checkpoint_dir, "model.index")) \
            and os.path.exists(os.path.join(checkpoint_dir, "checkpoint"))

    cls = get_agent_class(algo_name)
    # Note: the original indexed CONFIGS with an undefined `name`; the key
    # must be `algo_name`.
    if "DDPG" in algo_name:
        algo = cls(config=CONFIGS[algo_name], env="Pendulum-v0")
    else:
        algo = cls(config=CONFIGS[algo_name], env="CartPole-v0")

    for _ in range(3):
        res = algo.train()
        print("current status: " + str(res))

    export_dir = "/tmp/export_dir_%s" % algo_name
    print("Exporting model ", algo_name, export_dir)
    algo.export_policy_model(export_dir)
    if not valid_tf_model(export_dir):
        failures.append(algo_name)
    shutil.rmtree(export_dir)

    print("Exporting checkpoint", algo_name, export_dir)
    algo.export_policy_checkpoint(export_dir)
    if not valid_tf_checkpoint(export_dir):
        failures.append(algo_name)
    shutil.rmtree(export_dir)

    print("Exporting default policy", algo_name, export_dir)
    algo.export_model([ExportFormat.CHECKPOINT, ExportFormat.MODEL],
                      export_dir)
    if not valid_tf_model(os.path.join(export_dir, ExportFormat.MODEL)) \
            or not valid_tf_checkpoint(
                os.path.join(export_dir, ExportFormat.CHECKPOINT)):
        failures.append(algo_name)
    shutil.rmtree(export_dir)

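# A sketch of running the export test over several algorithms; the list is
# illustrative and must match the keys of CONFIGS, and ray is assumed to be
# initialized by the test harness.
failures = []
for algo_name in ["PPO", "DQN", "DDPG"]:
    test_export(algo_name, failures)
print("Export test failures:", failures)
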