Example #1
def evaluate_model(args):
    if args.model_path == '':
        print('Cannot evaluate model, no --model_path set')
        exit(1)

    def get_env():
        # Simulator env uses a single map, so better for evaluation/testing.
        # DiscreteWrapper just converts wheel velocities to high level discrete actions.
        return DiscreteWrapper(simulator.Simulator(
            map_name=args.map,
            max_steps=2000,
        ))

    # Rather than reuse the env, another one is created later because I can't
    # figure out how to provide register_env with an object, so the lambda
    # below creates a fresh env instead.
    register_env('DuckieTown-Simulator', lambda _: get_env())
    trainer = PPOTrainer(
        env="DuckieTown-Simulator",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-ppo",
            },
        },
    )
    trainer.restore(args.model_path)

    sim_env = get_env()

    # Standard OpenAI Gym reset/action/step/render loop.
    # This matches how the `enjoy_reinforcement.py` script works, see: https://git.io/J3js2
    done = False
    observation = sim_env.reset()
    episode_reward = 0
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, _ = sim_env.step(action)
        episode_reward += reward
        sim_env.render()

    print(f'Episode complete, total reward: {episode_reward}')
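Note that the "image-ppo" custom model named in the config must be registered with RLlib's ModelCatalog before the trainer is built. A minimal sketch of that registration, assuming a hypothetical ImagePPOModel class defined elsewhere in the project:

from ray.rllib.models import ModelCatalog

# "ImagePPOModel" is a placeholder for the project's own image-processing model class.
ModelCatalog.register_custom_model("image-ppo", ImagePPOModel)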
Example #2
import logging
import os
import pickle

from ray.rllib.agents.ppo import PPOTrainer
from tqdm import trange

LOGGER = logging.getLogger(__name__)  # stand-in for the module's own logger setup


class PPOrl(object):
    def __init__(self, env, env_config, config):
        self.config = config
        self.config['env_config'] = env_config
        self.env = env(env_config)
        self.agent = PPOTrainer(config=self.config, env=env)

    def fit(self, checkpoint=None, n_iter=2000, save_checkpoint=10):
        if checkpoint is None:
            checkpoint = os.path.join(os.getcwd(), 'data/checkpoint_rl.pkl')
        for idx in trange(n_iter):
            result = self.agent.train()
            LOGGER.warning('result: %s', result)
            if (idx + 1) % save_checkpoint == 0:
                LOGGER.warning('Save checkpoint at: {}'.format(idx + 1))
                state = self.agent.save_to_object()
                with open(checkpoint, 'wb') as fp:
                    pickle.dump(state, fp, protocol=pickle.HIGHEST_PROTOCOL)
        return result

    def predict(self, checkpoint=None):
        if checkpoint is not None:
            with open(checkpoint, 'rb') as fp:
                state = pickle.load(fp)
            self.agent.restore_from_object(state)
        done = False
        episode_reward = 0
        obs = self.env.reset()
        actions = []
        while not done:
            action = self.agent.compute_action(obs)
            actions.append(action)
            obs, reward, done, info = self.env.step(action)
            episode_reward += reward
        results = {'action': actions, 'reward': episode_reward}
        return results
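A hedged usage sketch for the wrapper above; MyEnv and the config values are illustrative placeholders, not part of the original code:

from ray.rllib.agents.ppo import DEFAULT_CONFIG

# MyEnv stands in for any Gym-style environment class that accepts an env_config dict.
runner = PPOrl(env=MyEnv, env_config={}, config=DEFAULT_CONFIG.copy())
runner.fit(n_iter=100, save_checkpoint=20)
print(runner.predict()['reward'])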
Example #3
def single_test(defaultconfig,
                training_trials,
                evaluation_trials,
                check,
                lr=0.00005,
                num_workers=4,
                num_gpus=0.25):
    ray.shutdown()
    ray.init(**ray_init_kwargs)
    config = ppo.DEFAULT_CONFIG.copy()
    if (num_gpus > 0):
        config["num_gpus"] = num_gpus
    config["num_workers"] = num_workers
    config["lr"] = lr
    config["train_batch_size"] = 8000
    config["num_sgd_iter"] = 5
    config["env_config"] = defaultconfig
    trainer = Trainer(config=config, env=qsdl.QSDEnv)
    for i in range(training_trials):
        result = trainer.train()
        print("train iteration", i + 1, "/", training_trials, " avg_reward =",
              result["episode_reward_mean"], " timesteps =",
              result["timesteps_total"])
        if i % check == check - 1:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
    avgR = 0
    for i in range(evaluation_trials):
        env = qsdl.QSDEnv(defaultconfig)
        obs = env.reset()
        done = False
        while not done:
            action = trainer.compute_action(obs)
            obs, r, done, _ = env.step(action)
            avgR += r
    return avgR / evaluation_trials
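A hedged call sketch for single_test; the environment config below mirrors the defaultconfig dictionary built in Example #13 and is an assumption here:

# rho, q and d come from the qsdl problem setup (see Example #13).
cfg = {"rho": rho, "q": q, "quantization": 20, "d": d, "separable": True}
avg_reward = single_test(cfg, training_trials=50, evaluation_trials=1000, check=10)
print("average evaluation reward:", avg_reward)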
Example #4
                            rnn_config['model'],
                            'happy')
     state = dummy_model.get_initial_state()
     state_list.append((key, [s.detach().numpy() for s in state]))
 state_list = dict(state_list)
 iters = 100
 tdiff = 0.0
 tot = [0.0, 0.0]
 for i in range(iters):
     obs = ff.reset()
     r1 = 0.0
     state2 = deepcopy(state_list)
     while True:
         policy_id = list(obs.keys())[0]
         logits = trainer.compute_action(obs[policy_id],
                                         state2[policy_id],
                                         policy_id=policy_id,
                                         full_fetch=False)
         state2[policy_id] = logits[1]
         turn = int(list(obs.values())[0][6]/0.025 + 0.0125)
         if turn < 2:
             action = 0
         elif turn == 2:
             action = 6
         elif len(obs) == 13:
             if turn == 3:
                 action = 4
             elif turn == 4:
                 action = 7
             elif turn == 5:
                 action = 8
             else:
Example #5
        doneAll = False
        i = 0
        drift_data = []
        timestep_data = []
        while not doneAll:
            i += 1
            done = False
            # env.render()
            observation = env.resetFromFrame(startFrame=0, resetYaw=0, startFromRef=True)
            # drawAxis()
            pause = False

            drift = []
            while not done and not doneAll:
                if not pause:
                    action = agent.compute_action(observation)
                    observation, reward, f_done, info = env.step(action)
                    done = f_done
                    drift.append(calcDrift(env.robot_pos, env.starting_robot_pos, env.target))
                # drawLine(
                #     env.robot_pos,
                #     [env.flat_env.robot.walk_target_x, env.flat_env.robot.walk_target_y, 0],
                #     [0, 0, 1],
                # )

                # time.sleep(1.0 / fps)

                # keys = pybullet.getKeyboardEvents()
                # if qKey in keys and keys[qKey] & pybullet.KEY_WAS_TRIGGERED:
                #     print("QUIT")
                #     doneAll = True
Example #6
        drift_data = []
        timestep_data = []
        while not doneAll:
            i += 1
            done = False
            # env.render()
            observation = env.resetFromFrame(startFrame=0, resetYaw=0, startFromRef=True)
            # drawAxis()
            pause = False

            drift = []
            while not done and not doneAll:
                action = dict()
                if not pause:
                    if 'high_level_agent' in observation:
                        action['high_level_agent'] = agent.compute_action(observation['high_level_agent'], policy_id='high_level_policy')
                        # if(not pause):
                            # pause = True
                    else:
                        action[env.low_level_agent_id] = agent.compute_action(observation[env.low_level_agent_id], policy_id='low_level_policy')
                    observation, reward, f_done, info = env.step(action)
                    done = f_done['__all__']
                    drift.append(calcDrift(env.robot_pos, env.starting_robot_pos, env.target))
                    # if(f_done['__all__']):
                        # print("Done")
                # targetHL = np.array([
                #     np.cos(env.highLevelDegTarget),
                #     np.sin(env.highLevelDegTarget),
                #     0
                # ]) * 5
                # drawLine(env.robot_pos, env.robot_pos + targetHL, [0, 1, 0])
Example #7
    "agent_names": agent_names,
    "env_id": env_id,
    "phase": 0
})

for i in range(1):
    obs = env.reset()

    done = False
    step = 0
    while not done:
        env.render()
        actions = env.act(obs)

        actions[1] = ppo_agent.compute_action(
            observation=penv.featurize(obs[1]), policy_id="ppo_policy")
        actions[3] = ppo_agent.compute_action(
            observation=penv.featurize(obs[3]), policy_id="ppo_policy")

        obs, reward, done, info = env.step(actions)
        features = penv.featurize(obs[1])
        for i in range(13):
            print("i:", i)
            print(features["board"][:, :, i])
            print("======")
        print(obs[1]["board"])
        print()
        print(obs[1]["bomb_life"])
        print("step:", step)
Example #8
    # env = Monitor(env, "gym_monitor_results", write_upon_reset=True, force=True)
    env = launch_and_wrap_duckieenv(config["env_config"], seed)
    print(env)
    # config['env_config'] del 'action_type'
    for i in range(1):
        steps = 0
        env = env.env.env.env
        print("After unwrapping, env is {}".format(env))
        obs = env.reset()
        env.render(render_mode)
        print("env's seed is {}".format(env.seed_value))
        print("default robot speed is {}".format(env.robot_speed))
        print("frame rate is {}".format(env.frame_rate))
        done = False
        while not done:
            action = trainer.compute_action(obs, explore=False)
            tuple_action = np.clip(np.array([1 + action[0], 1 - action[0]]), 0., 1.)
            angle_vel_action = convert_to_vel_angle_actions(tuple_action)

            # angle_vel_action[0] = min(angle_vel_action[0], 0.58)

            actions.append(convert_to_vel_angle_actions(tuple_action))
            # print("angle_vel_action is {}".format(angle_vel_action))
            # print("action is {}".format(action))
            # print("angle_vel action is {}".format(angle_vel_action))
            obs, reward, done, info = env.step(angle_vel_action)
            total_reward += reward
            # print("obs has shape{}".format(obs.shape))
            # print("speed is {}".format(env.speed))
            print("reward is {}".format(reward))
            steps += 1
Example #9
        # Play from the command line against the trained agent
        # in an actual (non-RLlib-wrapped) open-spiel env.
        human_player = 1
        env = Environment("connect_four")

        while num_episodes < args.num_episodes_human_play:
            print("You play as {}".format("o" if human_player else "x"))
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == human_player:
                    action = ask_user_for_action(time_step)
                else:
                    obs = np.array(
                        time_step.observations["info_state"][player_id])
                    action = trainer.compute_action(obs, policy_id="main")
                    # In case computer chooses an invalid action, pick a
                    # random one.
                    legal = time_step.observations["legal_actions"][player_id]
                    if action not in legal:
                        action = np.random.choice(legal)
                time_step = env.step([action])
                print(f"\n{env.get_state}")

            print(f"\n{env.get_state}")

            print("End of game!")
            if time_step.rewards[human_player] > 0:
                print("You win")
            elif time_step.rewards[human_player] < 0:
                print("You lose")
Example #10
# fdb733b6
checkpoint = 600
checkpoint_dir = "/home/lucius/ray_results/two_policies_vs_static_agents/PPO_RllibPomme_0_2020-06-09_23-39-347whmqdrs"
ppo_agent.restore("{}/checkpoint_{}/checkpoint-{}".format(
    checkpoint_dir, checkpoint, checkpoint))

agent_list = []
for agent_id in range(4):
    agent_list.append(agents.StaticAgent())
env = pommerman.make("PommeTeam-v0", agent_list=agent_list)

for i in range(1):
    obs = env.reset()

    done = False
    while not done:
        env.render()
        actions = env.act(obs)
        actions[0] = ppo_agent.compute_action(observation=featurize(obs[0]),
                                              policy_id="policy_0")
        actions[2] = ppo_agent.compute_action(observation=featurize(obs[2]),
                                              policy_id="policy_0")
        obs, reward, done, info = env.step(actions)
        print("reward:", reward)
        print("done:", done)
        print("info:", info)
        print("=========")
    env.render(close=True)
    # env.close()
Example #11
import gym
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.tune.logger import pretty_print

# Assumes ray.init() has already been called.
config2 = DEFAULT_CONFIG.copy()
config2['num_workers'] = 4
config2['num_sgd_iter'] = 30
config2['sgd_minibatch_size'] = 128
config2['model']['fcnet_hiddens'] = [100, 100]
config2['num_cpus_per_worker'] = 0

agent2 = PPOTrainer(config2, 'CartPole-v0')
for i in range(2):
    result = agent2.train()
    print(pretty_print(result))

checkpoint_path = agent2.save()
print(checkpoint_path)
trained_config = config2.copy()

test_agent = PPOTrainer(trained_config, 'CartPole-v0')
test_agent.restore(checkpoint_path)

env = gym.make('CartPole-v0')
state = env.reset()
done = False
cumulative_reward = 0

while not done:
    action = test_agent.compute_action(state)
    state, reward, done, _ = env.step(action)
    cumulative_reward += reward
print(cumulative_reward)

#tensorboard --logdir="/Users/guoqiong/ray_results/PPO_CartPole-v0_2020-04-29_18-58-22yq1yq16u/" --host=0.0.0.0
Example #12
class KandboxAgentRLLibPPO(KandboxAgentPlugin):
    title = "Kandbox Plugin - Agent - realtime - by rllib ppo"
    slug = "ri_agent_rl_ppo"
    author = "Kandbox"
    author_url = "https://github.com/qiyangduan"
    description = "RLLibPPO for GYM for RL."
    version = "0.1.0"
    default_config = {
        "nbr_of_actions": 4,
        "n_epochs": 1000,
        "nbr_of_days_planning_window": 6,
        "model_path": "default_model_path",
        "working_dir": "/tmp",
        "checkpoint_path_key": "ppo_checkpoint_path",
    }
    config_form_spec = {
        "type": "object",
        "properties": {},
    }

    def __init__(self, agent_config, kandbox_config):
        self.agent_config = agent_config
        self.current_best_episode_reward_mean = -99

        env_config = agent_config["env_config"]

        if "rules_slug_config_list" not in env_config.keys():
            if "rules" not in env_config.keys():
                log.error("no rules_slug_config_list and no rules ")
            else:
                env_config["rules_slug_config_list"] = [
                    [rule.slug, rule.config] for rule in env_config["rules"]
                ]
                env_config.pop("rules", None)

        # self.env_class = env_class = agent_config["env"]

        self.kandbox_config = self.default_config.copy()
        self.kandbox_config.update(kandbox_config)

        # self.trained_model = trained_model
        self.kandbox_config["create_datetime"] = datetime.now()

        # self.trainer = None
        self.env_config = env_config
        # self.load_model(env_config=self.env_config)
        print(
            f"KandboxAgentRLLibPPO __init__ called, at time {self.kandbox_config['create_datetime']}"
        )
        # import pdb

        # pdb.set_trace()
        if not ray.is_initialized():
            ray.init(ignore_reinit_error=True, log_to_driver=False)
        # ray.init(redis_address="localhost:6379")

    def build_model(self):

        trainer_config = DEFAULT_CONFIG.copy()

        trainer_config["num_workers"] = 0
        # trainer_config["train_batch_size"] = 640
        # trainer_config["sgd_minibatch_size"] = 160
        # trainer_config["num_sgd_iter"] = 100

        trainer_config["exploration_config"] = {
            "type": "Random",
        }
        # EpsilonGreedy(Exploration):
        # trainer_config["exploration_config"] = {
        #         "type": "Curiosity",
        #         "eta": 0.2,
        #         "lr": 0.001,
        #         "feature_dim": 128,
        #         "feature_net_config": {
        #             "fcnet_hiddens": [],
        #             "fcnet_activation": "relu",
        #         },
        #         "sub_exploration": {
        #             "type": "StochasticSampling",
        #         }
        #     }

        # trainer_config["log_level"] = "DEBUG"
        """
        if env_config is not None:
            for x in env_config.keys():
                trainer_config[x] = env_config[x]
        """

        # trainer_config["env_config"] = copy.deepcopy(env_config)  #  {"rules": "qiyang_role"}

        trainer_config.update(self.agent_config)

        self.trainer = PPOTrainer(trainer_config, self.agent_config["env"])
        # self.config["trainer"] = self.trainer
        return self.trainer

    def load_model(self):  # , allow_empty = None
        env_config = self.agent_config["env_config"]
        self.trainer = self.build_model()

        # if (model_path is not None) & (os.path.exists(model_path)):
        if "ppo_checkpoint_path" in env_config.keys():
            # raise FileNotFoundError("can not find model at path: {}".format(model_path))
            if os.path.exists(env_config["ppo_checkpoint_path"]):
                self.trainer.restore(env_config["ppo_checkpoint_path"])
                print("Reloaded model from path: {} ".format(
                    env_config["ppo_checkpoint_path"]))

            else:
                print(
                    "Env_config has ppo_checkpoint_path = {}, but no files found. I am returning an initial model"
                    .format(env_config["ppo_checkpoint_path"]))

        else:
            print(
                "Env_config has no ppo_checkpoint_path, returning an initial model"
            )
        # self.config["model_path"] = model_path
        # self.config["trainer"] = self.trainer
        # self.config["policy"] = self.trainer.workers.local_worker().get_policy()
        self.policy = self.trainer.workers.local_worker().get_policy()
        return self.trainer

    def train_model(self):

        # self.trainer = self.build_model()
        for i in range(self.kandbox_config["n_epochs"]):
            result = self.trainer.train()
            # print(pretty_print(result))
            print(
                "Finished training iteration {}, Result: episodes_this_iter:{}, policy_reward_max: {}, episode_reward_max {}, episode_reward_mean {}, info.num_steps_trained: {}..."
                .format(
                    i,
                    result["episodes_this_iter"],
                    result["policy_reward_max"],
                    result["episode_reward_max"],
                    result["episode_reward_mean"],
                    result["info"]["num_steps_trained"],
                ))
            if result[
                    "episode_reward_mean"] > self.current_best_episode_reward_mean * 1.1:
                model_path = self.save_model()
                print(
                    "Model is saved after 10 percent increase, episode_reward_mean = {},  file = {}"
                    .format(result["episode_reward_mean"], model_path))
                self.current_best_episode_reward_mean = result[
                    "episode_reward_mean"]

        return self.save_model()

    def save_model(self):

        checkpoint_dir = "{}/model_checkpoint_org_{}_team_{}".format(
            self.agent_config["env_config"]["working_dir"],
            self.agent_config["env_config"]["org_code"],
            self.agent_config["env_config"]["team_id"],
        )
        _path = self.trainer.save(checkpoint_dir=checkpoint_dir)

        # exported_model_dir = "{}/exported_ppo_model_org_{}_team_{}".format(
        #     self.agent_config["env_config"]["working_dir"], self.agent_config["env_config"]["org_code"], self.agent_config["env_config"]["team_id"]
        # )
        # self.trainer.get_policy().export_model(exported_model_dir + "/1")

        return _path  # self.trainer

    def predict_action(self, observation=None):

        action = self.trainer.compute_action(observation)
        return action

    def predict_action_list(self, env=None, job_code=None, observation=None):
        actions = []
        if env is not None:
            self.env = env
        else:
            env = self.env

        if job_code is None:
            job_i = env.current_job_i
        else:
            job_i = env.jobs_dict[job_code].job_index

        observation = env._get_observation()

        # export_dir = "/Users/qiyangduan/temp/kandbox/exported_ppo_model_org_duan3_team_3/1"
        # loaded_policy = tf.saved_model.load(export_dir)
        # loaded_policy.signatures["serving_default"](observations=observation)

        predicted_action = self.trainer.compute_action(observation)
        # V predicted_action = self.policy.compute_action(observation)

        for _ in range(len(env.workers)):  # hist_job_workers_ranked:
            if len(actions) >= self.kandbox_config["nbr_of_actions"]:
                return actions
            actions.append(list(predicted_action).copy())
            max_i = np.argmax(predicted_action[0:len(env.workers)])
            predicted_action[max_i] = 0

        return actions

    def predict_action_dict_list(self,
                                 env=None,
                                 job_code=None,
                                 observation=None):
        if env is not None:
            self.env = env
        else:
            env = self.env

        curr_job = copy.deepcopy(env.jobs_dict[job_code])

        if job_code is None:
            job_i = env.current_job_i
        else:
            job_i = curr_job.job_index
            env.current_job_i = job_i

        observation = env._get_observation()

        action = self.predict_action(observation=observation)
        action_dict = env.decode_action_into_dict_native(action=action)

        action_day = int(action_dict.scheduled_start_minutes / 1440)
        curr_job.requested_start_min_minutes = action_day * 1440
        curr_job.requested_start_max_minutes = (action_day + 1) * 1440

        action_dict_list = self.env.recommendation_server.search_action_dict_on_worker_day(
            a_worker_code_list=action_dict.scheduled_worker_codes,
            curr_job=curr_job,
            max_number_of_matching=3,
        )
        return action_dict_list
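A hedged usage sketch for the plugin above; MyKandboxEnv and the config values are illustrative placeholders, not part of the original code:

agent = KandboxAgentRLLibPPO(
    agent_config={
        "env": MyKandboxEnv,  # hypothetical RLlib-compatible env class
        "env_config": {"working_dir": "/tmp", "org_code": "org_1", "team_id": 1},
    },
    kandbox_config={"n_epochs": 10},
)
agent.load_model()   # builds the PPOTrainer and restores a checkpoint if one is configured
agent.train_model()  # trains, saving a checkpoint whenever the reward-improvement check passes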
Example #13
def fulltest(total_trials,
             training_trials,
             d,
             m,
             q,
             train_check,
             evaluation_trials=5000,
             lr=0.00005,
             num_workers=4,
             num_gpus=0.25,
             SDP=True,
             LG=False,
             local_SDP=False,
             dep=True,
             rngvec=np.ones(1000)):
    quantization = 20
    separable = True
    bigvec = np.zeros((total_trials, int(training_trials / train_check) + 1))
    vec_SDP = []
    vec_local_SDP = []
    vec_LG = []

    for j in range(total_trials):
        print("Starting round", j, "of", total_trials)
        rho, _ = qsdl.generate_initial_state(d,
                                             m,
                                             rng=rngvec[j],
                                             depolarized=dep)

        if local_SDP == True:
            lg = max_SDP_sim_order(q, rho, len(d), 1250, d)
            vec_local_SDP.append(lg)
            print("local SDP-based")
            print(lg)
        if SDP == True:
            sdpr = sdp.SDP(rho, q, len(d))
            vec_SDP.append(sdpr)
            print("SDP")
            print(sdpr)
        if LG == True:
            lg = LG_sim_order(copy.copy(q), copy.copy(rho), len(d), 2500, d)
            vec_LG.append(lg)
            print("LG")
            print(lg)

        print("RLNN: ")
        print(bigvec[-1])
        defaultconfig = {
            "rho": copy.copy(rho),
            "q": copy.copy(q),
            "quantization": quantization,
            "d": d,
            "separable": True
        }
        vec = []
        ray.shutdown()
        ray.init(**ray_init_kwargs)
        config = ppo.DEFAULT_CONFIG.copy()
        if (num_gpus > 0):
            config["num_gpus"] = num_gpus
        config["num_workers"] = num_workers
        config["lr"] = lr
        config["train_batch_size"] = 8000
        config["num_sgd_iter"] = 5
        config["env_config"] = defaultconfig
        trainer = Trainer(config=config, env=qsdl.QSDEnv)
        for i in range(training_trials):
            result = trainer.train()
            print("train iteration", i + 1, "/", training_trials,
                  " avg_reward =", result["episode_reward_mean"],
                  " timesteps =", result["timesteps_total"])
            #         if i % check == check-1:
            #             checkpoint = trainer.save()
            #             print("checkpoint saved at", checkpoint)
            if i == 0 or (i + 1) % train_check == 0:
                rew = 0
                for _ in range(evaluation_trials):
                    env = qsdl.QSDEnv(defaultconfig)
                    obs = env.reset()
                    done = False
                    while not done:
                        action = trainer.compute_action(obs)
                        obs, r, done, _ = env.step(action)
                        rew += r
                vec.append(rew / evaluation_trials)
        bigvec[j] = vec
    return bigvec, vec_SDP, vec_local_SDP, vec_LG
Example #14
trainer_config["train_batch_size"] = 400
trainer_config["sgd_minibatch_size"] = 64
trainer_config["num_sgd_iter"] = 10




trainer = PPOTrainer(trainer_config, SIR)
for i in range(200):
    print("Training iteration {}...".format(i))
    trainer.train()



env = SIR()
state = env.reset()

done = False
#max_state = -1
cumulative_reward = 0

total_states = list()
while not done:
    action = trainer.compute_action(state)
    state, reward, done, results = env.step(action)
    #max_state = max(max_state, state)
    total_states.append(state)
    cumulative_reward += reward

print("Cumulative reward you've received is: {}. Congratulations!".format(cumulative_reward))
print("Final state is", state)
Example #15
def main() -> None:
    ray.init()
    np.random.seed(0)

    # instructions = {
    #     0: [Instruction(time=0, x=5, y=5)],
    #     1: [Instruction(time=1, x=5, y=5), Instruction(time=1, x=1, y=5)],
    #     2: [Instruction(time=2, x=5, y=5, rng=np.random.default_rng())],
    # }
    # task = Task(
    #     target_x=1,
    #     target_y=5,
    #     instructions=instructions,
    #     tot_frames=4,
    #     width=42,
    #     height=42,
    # )

    # task = ODR(target_x=1, target_y=5, width=42, height=42)
    # task = Gap(target_x=1, target_y=5, width=42, height=42)
    task = ODRDistract(target_x=1, target_y=5, width=42, height=42)

    def env_creator(env_config):
        return Environment(env_config)  # return an env instance

    register_env("my_env", env_creator)

    # trainer_config = DEFAULT_CONFIG.copy()
    # trainer_config["num_workers"] = 1
    # trainer_config["train_batch_size"] = 20  # 100
    # trainer_config["sgd_minibatch_size"] = 15  # 32
    # trainer_config["num_sgd_iter"] = 50

    trainer = PPOTrainer(
        env="my_env",
        config={
            "env_config": {"task": task},
            "framework": "torch",
            "num_workers": 1,
            "train_batch_size": 10,
            "sgd_minibatch_size": 5,
            "num_sgd_iter": 10,
            # "model": {
            #     # Whether to wrap the model with an LSTM.
            #     "use_lstm": True,
            #     # Max seq len for training the LSTM, defaults to 20.
            #     "max_seq_len": task.tot_frames - 1,
            #     # # Size of the LSTM cell.
            #     "lstm_cell_size": task.tot_frames - 1,
            #     # # Whether to feed a_{t-1}, r_{t-1} to LSTM.
            #     # # "lstm_use_prev_action_reward": False,
            # },
        },
    )

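    # Note: this second trainer replaces the PPOTrainer created above, so only the
    # A2C trainer is actually trained and evaluated in the loop below.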
    trainer = A2CTrainer(
        env="my_env",
        config={
            "env_config": {"task": task},
            "framework": "torch",
            "num_workers": 1,
            "train_batch_size": 10,
            # "model": {
            #     # Whether to wrap the model with an LSTM.
            #     "use_lstm": True,
            #     # Max seq len for training the LSTM, defaults to 20.
            #     "max_seq_len": task.tot_frames - 1,
            #     # # Size of the LSTM cell.
            #     "lstm_cell_size": task.tot_frames - 1,
            #     # # Whether to feed a_{t-1}, r_{t-1} to LSTM.
            #     # # "lstm_use_prev_action_reward": False,
            # },
        },
    )

    # trainer = DQNTrainer(
    #     env="my_env",
    #     config={
    #         "env_config": {"task": task},
    #         "framework": "torch",
    #         "num_workers": 1,
    #         "train_batch_size": 10,
    #         # "model": {
    #         #     # Whether to wrap the model with an LSTM.
    #         #     "use_lstm": True,
    #         #     # Max seq len for training the LSTM, defaults to 20.
    #         #     "max_seq_len": task.tot_frames - 1,
    #         #     # # Size of the LSTM cell.
    #         #     "lstm_cell_size": task.tot_frames - 1,
    #         #     # # Whether to feed a_{t-1}, r_{t-1} to LSTM.
    #         #     # # "lstm_use_prev_action_reward": False,
    #         # },
    #     },
    # )

    env = Environment(env_config={"task": task})

    for i in range(200):
        print(f"Training iteration {i}...")
        trainer.train()

        done = False
        cumulative_reward = 0.0
        observation = env.reset()

        while not done:
            action = trainer.compute_action(observation)

            observation, reward, done, results = env.step(action)
            print(f"Time: {env.time}. Action: {action}")
            cumulative_reward += reward
        print(
            f"Last step reward: {reward: .3e}; Cumulative reward: {cumulative_reward:.3e}"
        )
Example #16
    agentLow = PPOTrainer(config_low)
    experiment_name = "HWalk_Low_Mimic"
    experiment_id = "PPO_HumanoidBulletEnvLow-v0_699c9_00000_0_2021-04-18_22-14-39"
    checkpoint_num = "1930"
    agentLow.restore(
        "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".format(
            experiment_name, experiment_id, checkpoint_num, checkpoint_num))

    # agent.export_policy_model("out/model", "default_policy")
    # agent.import_model("out/model")

    # agent.get_policy("default_policy").import_model_from_h5

    agentHigh = PPOTrainer(config_hier)
    lowWeight = agentLow.get_policy().get_weights()
    highWeight = agentHigh.get_policy("low_level_policy").get_weights()
    importedPolicy = {
        hw: lowWeight[lw]
        for hw, lw in zip(highWeight.keys(), lowWeight.keys())
    }
    s1 = agentLow.get_policy().get_state()
    s11 = OrderedDict([(k.replace("default_policy", "low_level_policy"), v)
                       for k, v in s1['_optimizer_variables'].items()])
    importedPolicy['_optimizer_variables'] = s11
    agentHigh.get_policy("low_level_policy").set_state(importedPolicy)

    obs = single_env.low_level_obs_space.sample()
    print(agentLow.compute_action(obs))
    print(agentHigh.compute_action(obs, policy_id='low_level_policy'))
    print("=============================================================")
    ray.shutdown()
Example #17
# ================= Enjoy a trained agent =================

t_end = 10.0  # Total duration of the simulation(s) in seconds

try:
    env = env_creator(rllib_cfg["env_config"])
    test_agent = Trainer(agent_cfg, env="my_custom_env")
    test_agent.restore(checkpoint_path)
    t_init = time.time()
    t_prev = t_init
    while t_prev - t_init < t_end:
        observ = env.reset()
        done = False
        cumulative_reward = 0
        while not done:
            if not (t_prev - t_init < t_end):
                break
            action = test_agent.compute_action(observ, explore=False)
            observ, reward, done, _ = env.step(action)
            cumulative_reward += reward
            env.render()
            sleep(max(env.dt - (time.time() - t_prev), 0.0))
            t_prev = time.time()
        print(cumulative_reward)
except KeyboardInterrupt:
    print("Interrupting testing...")

# ================= Terminate the Ray backend =================

ray.shutdown()