Example #1
    def test_nested_action_spaces(self):
        config = DEFAULT_CONFIG.copy()
        config["env"] = RandomEnv
        # Write output to check whether actions are written correctly.
        tmp_dir = os.popen("mktemp -d").read()[:-1]
        if not os.path.exists(tmp_dir):
            # Last resort: Resolve via the underlying tempdir (and cut the
            # leading "/tmp" from tmp_dir).
            tmp_dir = ray._private.utils.tempfile.gettempdir() + tmp_dir[4:]
            assert os.path.exists(tmp_dir), f"'{tmp_dir}' not found!"
        config["output"] = tmp_dir
        # Switch off OPE as we don't write action-probs.
        # TODO: We should probably always write those if `output` is given.
        config["input_evaluation"] = []

        # Pretend actions in offline files are already normalized.
        config["actions_in_input_normalized"] = True

        for _ in framework_iterator(config):
            for name, action_space in SPACES.items():
                config["env_config"] = {
                    "action_space": action_space,
                }
                for flatten in [False, True]:
                    print(f"A={action_space} flatten={flatten}")
                    shutil.rmtree(config["output"])
                    config["_disable_action_flattening"] = not flatten
                    trainer = PGTrainer(config)
                    trainer.train()
                    trainer.stop()

                    # Check actions in output file (whether properly flattened
                    # or not).
                    reader = JsonReader(
                        inputs=config["output"],
                        ioctx=trainer.workers.local_worker().io_context,
                    )
                    sample_batch = reader.next()
                    if flatten:
                        assert isinstance(sample_batch["actions"], np.ndarray)
                        assert len(sample_batch["actions"].shape) == 2
                        assert sample_batch["actions"].shape[0] == len(
                            sample_batch)
                    else:
                        tree.assert_same_structure(
                            trainer.get_policy().action_space_struct,
                            sample_batch["actions"],
                        )

                    # Test whether the offline data can be properly read by
                    # an accordingly configured BCTrainer.
                    config["input"] = config["output"]
                    del config["output"]
                    bc_trainer = BCTrainer(config=config)
                    bc_trainer.train()
                    bc_trainer.stop()
                    config["output"] = tmp_dir
                    config["input"] = "sampler"
Example #2
    def test_itr_batches(self):
        """Test that the json reader iterates over batches of rows correctly."""
        rllib_dir = Path(__file__).parent.parent.parent.parent
        print("rllib dir={}".format(rllib_dir))
        data_file = os.path.join(rllib_dir,
                                 "rllib/tests/data/pendulum/large.json")
        print("data_file={} exists={}".format(data_file,
                                              os.path.isfile(data_file)))

        ioctx = IOContext(config={"train_batch_size": 1200}, worker_index=0)
        reader = JsonReader([data_file], ioctx)
        assert len(reader.next()) == 1200
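
For reference, a minimal standalone sketch of the same pattern outside a test (the file path and batch size below are assumptions, not values from the test):

from ray.rllib.offline import IOContext, JsonReader

# Hypothetical offline file, e.g. written earlier via config["output"].
data_file = "/tmp/pendulum-out/output.json"

# train_batch_size in the IOContext controls how many rows next() returns,
# which is exactly what the test above asserts.
ioctx = IOContext(config={"train_batch_size": 256}, worker_index=0)
reader = JsonReader([data_file], ioctx)
batch = reader.next()
print(batch.count)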
Example #3
    def _init(self, config, env_name):
        self._policy_graph = self.config["multiagent"]["policy_graphs"]

        self.local_evaluator = self.make_local_evaluator(
            env_name, self._policy_graph, self.config)
        self.remote_evaluators = self.make_remote_evaluators(
            env_name, self._policy_graph, self.config["num_workers"])

        self.train_batch_size = self.config["train_batch_size"]
        self.num_sgd_iter = self.config["num_sgd_iter"]
        self.num_train = self.config["num_train"]
        self.expert_path = self.config["expert_path"]
        self.theta_lr = self.config["theta_lr"]

        # Read one batch of expert demonstrations and compute its feature
        # expectations; theta is initialized to the same shape.
        expert_reader = JsonReader(self.expert_path)
        self.expert_samples = expert_reader.next()
        self.expert_features = self.calculate_expected_feature(self.expert_samples)
        self.theta = np.random.uniform(size=self.expert_features.shape)
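
calculate_expected_feature is not shown in this snippet. A hedged sketch of what it plausibly computes in this IRL-style setup (an assumption, not the project's actual code) is the per-feature mean over the expert batch's observations:

import numpy as np

def calculate_expected_feature(self, batch):
    # Assumed behavior: feature expectations = mean observation vector over
    # the expert SampleBatch (batch["obs"] has shape [num_steps, obs_dim]).
    return np.mean(batch["obs"], axis=0)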
Example #4
File: mixed_input.py Project: zzmcdc/ray
    def __init__(self, dist: Dict[JsonReader, float], ioctx: IOContext):
        """Initialize a MixedInput.

        Args:
            dist (dict): dict mapping JSONReader paths or "sampler" to
                probabilities. The probabilities must sum to 1.0.
            ioctx (IOContext): current IO context object.
        """
        if sum(dist.values()) != 1.0:
            raise ValueError("Values must sum to 1.0: {}".format(dist))
        self.choices = []
        self.p = []
        for k, v in dist.items():
            if k == "sampler":
                self.choices.append(ioctx.default_sampler_input())
            else:
                self.choices.append(JsonReader(k))
            self.p.append(v)
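
A minimal usage sketch for this constructor (the path and mixing ratios are assumptions): mix the worker's online sampler with an offline JSON directory.

dist = {
    "/tmp/offline-data": 0.7,  # hypothetical JsonReader path
    "sampler": 0.3,            # the worker's online sampler
}
mixed_input = MixedInput(dist, ioctx)  # ioctx: the worker's IOContext
batch = mixed_input.next()             # each call draws one source at random

In practice the same dict is usually supplied as the "input" config setting, from which RLlib constructs the MixedInput itself.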
Example #5
    def __init__(self, dist: Dict[JsonReader, float], ioctx: IOContext):
        """Initialize a MixedInput.

        Args:
            dist (dict): dict mapping JSONReader paths or "sampler" to
                probabilities. The probabilities must sum to 1.0.
            ioctx (IOContext): current IO context object.
        """
        if sum(dist.values()) != 1.0:
            raise ValueError("Values must sum to 1.0: {}".format(dist))
        self.choices = []
        self.p = []
        for k, v in dist.items():
            if k == "sampler":
                self.choices.append(ioctx.default_sampler_input())
            elif isinstance(k, FunctionType):
                self.choices.append(k(ioctx))
            elif isinstance(k, str) and registry_contains_input(k):
                input_creator = registry_get_input(k)
                self.choices.append(input_creator(ioctx))
            else:
                self.choices.append(JsonReader(k, ioctx))
            self.p.append(v)
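
The newer variant above additionally accepts plain functions (called with the IOContext) and registered input names as keys. A hedged sketch with a function key (the path is hypothetical):

def make_offline_reader(ioctx):
    # FunctionType key: gets called with the worker's IOContext.
    return JsonReader("/tmp/offline-data", ioctx)

dist = {
    "sampler": 0.5,
    make_offline_reader: 0.5,
}
mixed_input = MixedInput(dist, ioctx)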
Example #6
    def offline_evaluation(self, iteration):
        self._agent.eval_mode = True

        validation_dataset = [
            os.path.join(self.dataset_path, f)
            for f in os.listdir(self.dataset_path)
            if os.path.isfile(os.path.join(self.dataset_path, f))
        ]
        validation_dataset = sorted(validation_dataset)

        rewards = []
        for n_eps in range(len(validation_dataset)):
            reader = JsonReader(validation_dataset[n_eps])

            with open(validation_dataset[n_eps], "r") as f:
                sb = f.readlines()

            # Read one batch per line of the file and collect all rewards.
            for _ in range(len(sb)):
                batch = reader.next()
                for episode in batch.split_by_episode():
                    for r in episode["rewards"]:
                        rewards.append(r)

        # Magnitude of the (rounded) minimum reward, passed to the
        # importance-sampling estimator as a reward shift.
        rewards_shift = abs(round(min(rewards), 5))

        actions = []
        estimation = {
            "dm/score": [],
            "dm/pred_reward_mean": [],
            "dm/pred_reward_total": [],
            "is/V_prev": [],
            "is/V_step_IS": [],
            "is/V_gain_est": [],
        }
        for n_eps in range(len(validation_dataset)):
            reader = JsonReader(validation_dataset[n_eps])
            batch = reader.next()
            for episode in batch.split_by_episode():
                action = []
                selected_action_prob = []
                all_actions_prob = []
                for i in range(len(episode["eps_id"])):
                    _action, _action_prob = self._agent.step(
                        episode["rewards"][i], episode["obs"][i])
                    action.append(_action)
                    selected_action_prob.append(_action_prob[_action])
                    all_actions_prob.append(_action_prob)

                is_estimation = self.is_estimator.estimate(
                    episode, all_actions_prob, rewards_shift)

                actions.extend(action)
                action = np.array([action])
                action_prob = np.array([selected_action_prob])

                # Concatenate the actions onto the observations as the
                # predictor's input: obs has shape [T, obs_dim], while the
                # flat action vector is reshaped into a [T, 1] column first.
                obs = torch.Tensor(
                    np.concatenate(
                        (episode["obs"],
                         np.reshape(action, (action[0].shape[0], 1))),
                        axis=1,
                    )
                )
                scores_raw = self.predictor.predict(obs).detach().numpy()
                scores = {}
                scores["score"] = (scores_raw * action_prob).mean()
                scores["pred_reward_mean"] = scores_raw.mean()
                scores["pred_reward_total"] = scores_raw.sum()

                # DM Estimation ------------------------
                estimation["dm/score"].append(scores["score"])
                estimation["dm/pred_reward_mean"].append(
                    scores["pred_reward_mean"])
                estimation["dm/pred_reward_total"].append(
                    scores["pred_reward_total"])

                # IS Estimation -----------------------
                estimation["is/V_prev"].append(is_estimation["V_prev"])
                estimation["is/V_step_IS"].append(is_estimation["V_step_IS"])
                estimation["is/V_gain_est"].append(is_estimation["V_gain_est"])

        est_mean = pd.DataFrame.from_dict(estimation).mean(axis=0)

        summary = tf.Summary(value=[
            tf.Summary.Value(tag="Eval/DM/score",
                             simple_value=est_mean["dm/score"]),
            tf.Summary.Value(
                tag="Eval/DM/pred_reward_mean",
                simple_value=est_mean["dm/pred_reward_mean"],
            ),
            tf.Summary.Value(
                tag="Eval/DM/pred_reward_total",
                simple_value=est_mean["dm/pred_reward_total"],
            ),
            tf.Summary.Value(tag="Eval/is/V_prev",
                             simple_value=est_mean["is/V_prev"]),
            tf.Summary.Value(
                tag="Eval/is/V_step_IS",
                simple_value=est_mean["is/V_step_IS"],
            ),
            tf.Summary.Value(
                tag="Eval/is/V_gain_est",
                simple_value=est_mean["is/V_gain_est"],
            ),
            tf.Summary.Value(
                tag="Eval/actions_prob",
                simple_value=float(actions.count(1)) / len(actions),
            ),
        ])
        self._summary_writer.add_summary(summary, iteration)
Example #7
File: test_crr.py Project: parasj/ray
def input_reading_fn(ioctx):
    return JsonReader(ioctx.config["input_config"]["paths"], ioctx)
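
A hedged sketch of how such a reader function is wired into an algorithm config (the path is hypothetical; "input" and "input_config" are standard RLlib offline settings):

config = {
    # "input" may be a callable taking an IOContext and returning an InputReader.
    "input": input_reading_fn,
    "input_config": {"paths": ["/tmp/offline-data/output.json"]},
}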
Example #8
File: test_ope.py Project: parasj/ray
    def setUpClass(cls):
        ray.init()
        rllib_dir = Path(__file__).parent.parent.parent.parent
        train_data = os.path.join(rllib_dir, "tests/data/cartpole/large.json")
        eval_data = train_data

        env_name = "CartPole-v0"
        cls.gamma = 0.99
        n_episodes = 40
        cls.q_model_config = {"n_iters": 600}

        config = (
            DQNConfig()
            .environment(env=env_name)
            .training(gamma=cls.gamma)
            .rollouts(num_rollout_workers=3, batch_mode="complete_episodes")
            .framework("torch")
            .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", 0)))
            .offline_data(input_=train_data)
            .evaluation(
                evaluation_interval=None,
                evaluation_duration=n_episodes,
                evaluation_num_workers=1,
                evaluation_duration_unit="episodes",
                evaluation_config={"input": eval_data},
                off_policy_estimation_methods={
                    "is": {"type": ImportanceSampling},
                    "wis": {"type": WeightedImportanceSampling},
                    "dm_fqe": {
                        "type": DirectMethod,
                        "q_model_config": {"type": FQETorchModel},
                    },
                    "dr_fqe": {
                        "type": DoublyRobust,
                        "q_model_config": {"type": FQETorchModel},
                    },
                },
            )
        )
        cls.algo = config.build()

        # Train DQN for evaluation policy
        for _ in range(n_episodes):
            cls.algo.train()

        # Read n_episodes of data, assuming that one line is one episode
        reader = JsonReader(eval_data)
        cls.batch = reader.next()
        for _ in range(n_episodes - 1):
            cls.batch = concat_samples([cls.batch, reader.next()])
        cls.n_episodes = len(cls.batch.split_by_episode())
        print("Episodes:", cls.n_episodes, "Steps:", cls.batch.count)

        cls.mean_ret = {}
        cls.std_ret = {}
        cls.losses = {}

        # Simulate Monte-Carlo rollouts
        mc_ret = []
        env = gym.make(env_name)
        for _ in range(n_episodes):
            obs = env.reset()
            done = False
            rewards = []
            while not done:
                act = cls.algo.compute_single_action(obs)
                obs, reward, done, _ = env.step(act)
                rewards.append(reward)
            ret = 0
            for r in reversed(rewards):
                ret = r + cls.gamma * ret
            mc_ret.append(ret)

        cls.mean_ret["simulation"] = np.mean(mc_ret)
        cls.std_ret["simulation"] = np.std(mc_ret)
Example #9
    def setUpClass(cls):
        ray.init(ignore_reinit_error=True)
        rllib_dir = Path(__file__).parent.parent.parent.parent
        print("rllib dir={}".format(rllib_dir))
        data_file = os.path.join(rllib_dir, "tests/data/cartpole/large.json")
        print("data_file={} exists={}".format(data_file, os.path.isfile(data_file)))

        env_name = "CartPole-v0"
        cls.gamma = 0.99
        train_steps = 20000
        n_batches = 20  # Approx. equal to n_episodes
        n_eval_episodes = 100

        config = (
            DQNConfig()
            .environment(env=env_name)
            .training(gamma=cls.gamma)
            .rollouts(num_rollout_workers=3)
            .exploration(
                explore=True,
                exploration_config={
                    "type": "SoftQ",
                    "temperature": 1.0,
                },
            )
            .framework("torch")
            .rollouts(batch_mode="complete_episodes")
        )
        cls.trainer = config.build()

        # Train DQN for evaluation policy
        tune.run(
            "DQN",
            config=config.to_dict(),
            stop={"timesteps_total": train_steps},
            verbose=0,
        )

        # Read n_batches of data
        reader = JsonReader(data_file)
        cls.batch = reader.next()
        for _ in range(n_batches - 1):
            cls.batch = cls.batch.concat(reader.next())
        cls.n_episodes = len(cls.batch.split_by_episode())
        print("Episodes:", cls.n_episodes, "Steps:", cls.batch.count)

        cls.mean_ret = {}
        cls.std_ret = {}

        # Simulate Monte-Carlo rollouts
        mc_ret = []
        env = gym.make(env_name)
        for _ in range(n_eval_episodes):
            obs = env.reset()
            done = False
            rewards = []
            while not done:
                act = cls.trainer.compute_single_action(obs)
                obs, reward, done, _ = env.step(act)
                rewards.append(reward)
            ret = 0
            for r in reversed(rewards):
                ret = r + cls.gamma * ret
            mc_ret.append(ret)

        cls.mean_ret["simulation"] = np.mean(mc_ret)
        cls.std_ret["simulation"] = np.std(mc_ret)

        # Optional configs for the model-based estimators
        cls.model_config = {"k": 2, "n_iters": 10}
        ray.shutdown()