Example #1
    def custom_loss(self, policy_loss, loss_inputs):
        # create a new input reader per worker
        reader = JsonReader(self.options["custom_options"]["input_files"])
        input_ops = reader.tf_input_ops()

        # define a secondary loss by building a graph copy with weight sharing
        with tf.variable_scope(self.scope,
                               reuse=tf.AUTO_REUSE,
                               auxiliary_name_scope=False):
            logits, _ = self._build_layers_v2(
                {
                    "obs":
                    restore_original_dimensions(input_ops["obs"],
                                                self.obs_space)
                }, self.num_outputs, self.options)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # compute the IL loss
        action_dist = Categorical(logits)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(
            -action_dist.logp(input_ops["actions"]))
        return policy_loss + 10 * self.imitation_loss
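Example #1 targets the legacy ModelV1-style API, where custom options live under self.options["custom_options"]. A minimal, hedged sketch of how a model defining the custom_loss above might be registered and configured in that API generation; the registered name, the class name, and the file path below are illustrative placeholders, not taken from the example:

# Hedged sketch (legacy custom_options API). CustomLossModel stands in for the
# class that defines the custom_loss() shown above; the path is a placeholder.
from ray.rllib.models import ModelCatalog

ModelCatalog.register_custom_model("custom_loss_model", CustomLossModel)

config = {
    "model": {
        "custom_model": "custom_loss_model",
        "custom_options": {
            "input_files": "/tmp/expert-data",  # JSON files fed to JsonReader
        },
    },
}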
Example #2
    def custom_loss(self, policy_loss, loss_inputs):
        # create a new input reader per worker
        reader = JsonReader(
            self.model_config["custom_model_config"]["input_files"])
        input_ops = reader.tf_input_ops(
            self.model_config["custom_model_config"].get("expert_size", 1))

        # define a secondary loss by building a graph copy with weight sharing
        obs = restore_original_dimensions(
            tf.cast(input_ops["obs"], tf.float32), self.obs_space)
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        # print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # compute the IL loss
        self.policy_loss = policy_loss
        (action_scores, model_logits,
         dist) = self.get_q_value_distributions(logits)
        model_logits = tf.squeeze(model_logits)
        action_dist = Categorical(model_logits, self.model_config)

        expert_logits = tf.cast(input_ops["actions"], tf.int32)
        expert_action = tf.math.argmax(expert_logits)
        expert_action_one_hot = tf.one_hot(expert_action, self.num_outputs)
        model_action = action_dist.deterministic_sample()
        model_action_one_hot = tf.one_hot(model_action, self.num_outputs)
        model_expert = model_action_one_hot * expert_action_one_hot
        imitation_loss = 0
        loss_type = self.model_config["custom_model_config"].get("loss", "ce")
        if loss_type == "ce":
            imitation_loss = tf.reduce_mean(-action_dist.logp(expert_logits))
        elif loss_type == "kl":
            expert_dist = Categorical(
                tf.one_hot(expert_logits, self.num_outputs), self.model_config)
            imitation_loss = tf.reduce_mean(-action_dist.kl(expert_dist))
        elif loss_type == "dqfd":
            max_value = float("-inf")
            Q_select = model_logits  # TODO: clarify difference between action_scores, dist and logits
            for a in range(self.num_outputs):
                max_value = tf.maximum(
                    Q_select[a] + 0.8 * tf.cast(model_expert[a], tf.float32),
                    max_value)
            imitation_loss = tf.reduce_mean(
                1 * (max_value - Q_select[tf.cast(expert_action, tf.int32)]))

        self.imitation_loss = imitation_loss
        total_loss = self.model_config["custom_model_config"]["lambda1"] * policy_loss \
                     + self.model_config["custom_model_config"]["lambda2"] \
                     * self.imitation_loss
        return total_loss
Example #3
    def test_agent_output_infos(self):
        """Verify that the infos dictionary is written to the output files.

        Note, with torch this is always the case.
        """
        output_config = {"store_infos": True}
        for fw in framework_iterator(frameworks=("torch", "tf")):
            self.write_outputs(self.test_dir, fw, output_config=output_config)
            self.assertEqual(len(os.listdir(self.test_dir + fw)), 1)
            reader = JsonReader(self.test_dir + fw + "/*.json")
            data = reader.next()
            assert "infos" in data
Example #4
    def __init__(
        self, obs_space, action_space, num_outputs, model_config, name, input_files
    ):
        super().__init__(obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        self.input_files = input_files
        # Create a new input reader per worker.
        self.reader = JsonReader(self.input_files)
        self.fcnet = TorchFC(
            self.obs_space, self.action_space, num_outputs, model_config, name="fcnet"
        )
Example #5
    def __init__(self,
                 inputs,
                 num_samples_per_task=0,
                 policy_id="human_0",
                 transform=None,
                 target_transform=None,
                 dataset_transform=None):
        super(Behaviour, self).__init__(meta_split='train',
                                        target_transform=target_transform,
                                        dataset_transform=dataset_transform)

        self.transform = transform
        self._datasets = []
        for input_path in inputs:
            # Cache all samples in rllib, to ensure stochastic loading
            data_paths = [
                osp.join(input_path, f) for f in os.listdir(input_path)
            ]
            print(f"Loading from {input_path}: {len(data_paths)} files")
            # dataset = ShuffledInput(JsonReader(data_paths), n=num_samples_per_task)
            dataset = ShuffledInput(JsonReader(data_paths))
            self._datasets.append(dataset)

        self.num_tasks = len(inputs)
        self.num_samples_per_task = num_samples_per_task
        self.policy_id = policy_id
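A hedged sketch of how one of these per-task readers might be consumed; the method name and the returned (obs, action) pairing are assumptions, not part of the original class:

    # Hedged sketch: draw up to num_samples_per_task expert transitions for one
    # task from its ShuffledInput/JsonReader. The method name is an assumption.
    def sample_task(self, task_index):
        reader = self._datasets[task_index]
        rows = []
        while len(rows) < self.num_samples_per_task:
            batch = reader.next()              # a SampleBatch read from the JSON files
            for row in batch.rows():
                rows.append((row["obs"], row["actions"]))
                if len(rows) >= self.num_samples_per_task:
                    break
        return rows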
Example #6
 def test_multiple_output_workers(self):
     ray.shutdown()
     ray.init(num_cpus=4, ignore_reinit_error=True)
     for fw in framework_iterator(frameworks=["tf", "torch"]):
         agent = PG(
             env="CartPole-v0",
             config={
                 "num_workers": 2,
                 "output": self.test_dir + fw,
                 "rollout_fragment_length": 250,
                 "framework": fw,
             },
         )
         agent.train()
         self.assertEqual(len(os.listdir(self.test_dir + fw)), 2)
         reader = JsonReader(self.test_dir + fw + "/*.json")
         reader.next()
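The same write-then-read round trip, sketched outside the test harness; the output directory is a placeholder and the PG import path depends on the Ray version:

# Hedged sketch: train briefly with JSON output enabled, then read it back.
import ray
from ray.rllib.algorithms.pg import PG              # import path varies by Ray version
from ray.rllib.offline.json_reader import JsonReader

ray.init(ignore_reinit_error=True)
algo = PG(
    env="CartPole-v0",
    config={
        "num_workers": 2,
        "output": "/tmp/cartpole-out",               # each worker writes its own *.json
        "rollout_fragment_length": 250,
    },
)
algo.train()

reader = JsonReader("/tmp/cartpole-out/*.json")
batch = reader.next()                                # SampleBatch with "obs", "actions", ...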
Example #7
File: test_io.py  Project: xuman2019/ray
 def testReadWrite(self):
     ioctx = IOContext(self.test_dir, {}, 0, None)
     writer = JsonWriter(
         self.test_dir, ioctx, max_file_size=5000, compress_columns=["obs"])
     for i in range(100):
         writer.write(make_sample_batch(i))
     reader = JsonReader(self.test_dir + "/*.json")
     seen_a = set()
     seen_o = set()
     for i in range(1000):
         batch = reader.next()
         seen_a.add(batch["actions"][0])
         seen_o.add(batch["obs"][0])
     self.assertGreater(len(seen_a), 90)
     self.assertLess(len(seen_a), 101)
     self.assertGreater(len(seen_o), 90)
     self.assertLess(len(seen_o), 101)
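make_sample_batch is a helper from test_io.py; a rough, hedged approximation of it (the real helper's exact fields and shapes may differ):

# Hedged approximation of the make_sample_batch(i) helper used by these tests.
import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

def make_sample_batch(i):
    return SampleBatch({
        "obs": np.array([i]),
        "actions": np.array([i]),
    })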
Example #8
 def testAbortOnAllEmptyInputs(self):
     open(self.test_dir + "/empty", "w").close()
     reader = JsonReader([
         self.test_dir + "/empty",
     ])
     self.assertRaises(ValueError, lambda: reader.next())
     with open(self.test_dir + "/empty1", "w") as f:
         for _ in range(100):
             f.write("\n")
     with open(self.test_dir + "/empty2", "w") as f:
         for _ in range(100):
             f.write("\n")
     reader = JsonReader([
         self.test_dir + "/empty1",
         self.test_dir + "/empty2",
     ])
     self.assertRaises(ValueError, lambda: reader.next())
Example #9
 def testReadWrite(self):
     ioctx = IOContext(self.test_dir, {}, 0, None)
     writer = JsonWriter(
         self.test_dir, ioctx, max_file_size=5000, compress_columns=["obs"])
     for i in range(100):
         writer.write(make_sample_batch(i))
     reader = JsonReader(self.test_dir + "/*.json")
     seen_a = set()
     seen_o = set()
     for i in range(1000):
         batch = reader.next()
         seen_a.add(batch["actions"][0])
         seen_o.add(batch["obs"][0])
     self.assertGreater(len(seen_a), 90)
     self.assertLess(len(seen_a), 101)
     self.assertGreater(len(seen_o), 90)
     self.assertLess(len(seen_o), 101)
Example #10
File: test_io.py  Project: novahe/ray
 def test_skips_over_empty_lines_and_files(self):
     open(self.test_dir + "/empty", "w").close()
     with open(self.test_dir + "/f1", "w") as f:
         f.write("\n")
         f.write("\n")
         f.write(_to_json(make_sample_batch(0), []))
     with open(self.test_dir + "/f2", "w") as f:
         f.write(_to_json(make_sample_batch(1), []))
         f.write("\n")
     reader = JsonReader([
         self.test_dir + "/empty",
         self.test_dir + "/f1",
         "file://" + self.test_dir + "/f2",
     ])
     seen_a = set()
     for i in range(100):
         batch = reader.next()
         seen_a.add(batch["actions"][0])
     self.assertEqual(len(seen_a), 2)
Example #11
File: test_io.py  Project: novahe/ray
 def test_skips_over_corrupted_lines(self):
     with open(self.test_dir + "/f1", "w") as f:
         f.write(_to_json(make_sample_batch(0), []))
         f.write("\n")
         f.write(_to_json(make_sample_batch(1), []))
         f.write("\n")
         f.write(_to_json(make_sample_batch(2), []))
         f.write("\n")
         f.write(_to_json(make_sample_batch(3), []))
         f.write("\n")
         f.write("{..corrupted_json_record")
     reader = JsonReader([
         self.test_dir + "/f1",
     ])
     seen_a = set()
     for i in range(10):
         batch = reader.next()
         seen_a.add(batch["actions"][0])
     self.assertEqual(len(seen_a), 4)
Example #12
 def testSkipsOverCorruptedLines(self):
     with open(self.test_dir + "/f1", "w") as f:
         f.write(_to_json(make_sample_batch(0), []))
         f.write("\n")
         f.write(_to_json(make_sample_batch(1), []))
         f.write("\n")
         f.write(_to_json(make_sample_batch(2), []))
         f.write("\n")
         f.write(_to_json(make_sample_batch(3), []))
         f.write("\n")
         f.write("{..corrupted_json_record")
     reader = JsonReader([
         self.test_dir + "/f1",
     ])
     seen_a = set()
     for i in range(10):
         batch = reader.next()
         seen_a.add(batch["actions"][0])
     self.assertEqual(len(seen_a), 4)
Example #13
 def testSkipsOverEmptyLinesAndFiles(self):
     open(self.test_dir + "/empty", "w").close()
     with open(self.test_dir + "/f1", "w") as f:
         f.write("\n")
         f.write("\n")
         f.write(_to_json(make_sample_batch(0), []))
     with open(self.test_dir + "/f2", "w") as f:
         f.write(_to_json(make_sample_batch(1), []))
         f.write("\n")
     reader = JsonReader([
         self.test_dir + "/empty",
         self.test_dir + "/f1",
         "file:" + self.test_dir + "/f2",
     ])
     seen_a = set()
     for i in range(100):
         batch = reader.next()
         seen_a.add(batch["actions"][0])
     self.assertEqual(len(seen_a), 2)
Example #14
 def testSkipsOverEmptyLinesAndFiles(self):
     ioctx = IOContext(self.test_dir, {}, 0, None)
     open(self.test_dir + "/empty", "w").close()
     with open(self.test_dir + "/f1", "w") as f:
         f.write("\n")
         f.write("\n")
         f.write(_to_json(make_sample_batch(0), []))
     with open(self.test_dir + "/f2", "w") as f:
         f.write(_to_json(make_sample_batch(1), []))
         f.write("\n")
     reader = JsonReader(ioctx, [
         self.test_dir + "/empty",
         self.test_dir + "/f1",
         "file:" + self.test_dir + "/f2",
     ])
     seen_a = set()
     for i in range(100):
         batch = reader.next()
         seen_a.add(batch["actions"][0])
     self.assertEqual(len(seen_a), 2)
Example #15
 def testSkipsOverCorruptedLines(self):
     ioctx = IOContext(self.test_dir, {}, 0, None)
     with open(self.test_dir + "/f1", "w") as f:
         f.write(_to_json(make_sample_batch(0), []))
         f.write("\n")
         f.write(_to_json(make_sample_batch(1), []))
         f.write("\n")
         f.write(_to_json(make_sample_batch(2), []))
         f.write("\n")
         f.write(_to_json(make_sample_batch(3), []))
         f.write("\n")
         f.write("{..corrupted_json_record")
     reader = JsonReader(ioctx, [
         self.test_dir + "/f1",
     ])
     seen_a = set()
     for i in range(10):
         batch = reader.next()
         seen_a.add(batch["actions"][0])
     self.assertEqual(len(seen_a), 4)
Example #16
    def custom_loss(self, policy_loss, loss_inputs):
        # Create a new input reader per worker.
        reader = JsonReader(self.model_config["custom_model_config"]["input_files"])
        input_ops = reader.tf_input_ops()

        # Define a secondary loss by building a graph copy with weight sharing.
        obs = restore_original_dimensions(
            tf.cast(input_ops["obs"], tf.float32), self.obs_space
        )
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # Compute the IL loss.
        action_dist = Categorical(logits, self.model_config)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(-action_dist.logp(input_ops["actions"]))
        return policy_loss + 10 * self.imitation_loss
Example #17
    def custom_loss(self, policy_loss, loss_inputs):
        # create a new input reader per worker
        reader = JsonReader(self.options["custom_options"]["input_files"])
        input_ops = reader.tf_input_ops()

        # define a secondary loss by building a graph copy with weight sharing
        logits, _ = self._build_layers_v2({
            "obs": restore_original_dimensions(input_ops["obs"],
                                               self.obs_space)
        }, self.num_outputs, self.options)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # compute the IL loss
        action_dist = Categorical(logits)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(
            -action_dist.logp(input_ops["actions"]))
        return policy_loss + 10 * self.imitation_loss
Example #18
File: test_io.py  Project: novahe/ray
 def test_abort_on_all_empty_inputs(self):
     open(self.test_dir + "/empty", "w").close()
     reader = JsonReader([
         self.test_dir + "/empty",
     ])
     self.assertRaises(ValueError, lambda: reader.next())
     with open(self.test_dir + "/empty1", "w") as f:
         for _ in range(100):
             f.write("\n")
     with open(self.test_dir + "/empty2", "w") as f:
         for _ in range(100):
             f.write("\n")
     reader = JsonReader([
         self.test_dir + "/empty1",
         self.test_dir + "/empty2",
     ])
     self.assertRaises(ValueError, lambda: reader.next())
Example #19
 def testAbortOnAllEmptyInputs(self):
     ioctx = IOContext(self.test_dir, {}, 0, None)
     open(self.test_dir + "/empty", "w").close()
     reader = JsonReader(ioctx, [
         self.test_dir + "/empty",
     ])
     self.assertRaises(ValueError, lambda: reader.next())
     with open(self.test_dir + "/empty1", "w") as f:
         for _ in range(100):
             f.write("\n")
     with open(self.test_dir + "/empty2", "w") as f:
         for _ in range(100):
             f.write("\n")
     reader = JsonReader(ioctx, [
         self.test_dir + "/empty1",
         self.test_dir + "/empty2",
     ])
     self.assertRaises(ValueError, lambda: reader.next())
Example #20
    def _make_evaluator(self, cls, env_creator, policy_graph, worker_index,
                        config):
        def session_creator():
            logger.debug("Creating TF session {}".format(
                config["tf_session_args"]))
            return tf.Session(config=tf.ConfigProto(
                **config["tf_session_args"]))

        if isinstance(config["input"], FunctionType):
            input_creator = config["input"]
        elif config["input"] == "sampler":
            input_creator = (lambda ioctx: ioctx.default_sampler_input())
        elif isinstance(config["input"], dict):
            input_creator = (
                lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx),
                                            config["shuffle_buffer_size"]))
        else:
            input_creator = (
                lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx),
                                            config["shuffle_buffer_size"]))

        if isinstance(config["output"], FunctionType):
            output_creator = config["output"]
        elif config["output"] is None:
            output_creator = (lambda ioctx: NoopOutput())
        elif config["output"] == "logdir":
            output_creator = (lambda ioctx: JsonWriter(
                ioctx.log_dir,
                ioctx,
                max_file_size=config["output_max_file_size"],
                compress_columns=config["output_compress_columns"]))
        else:
            output_creator = (lambda ioctx: JsonWriter(
                config["output"],
                ioctx,
                max_file_size=config["output_max_file_size"],
                compress_columns=config["output_compress_columns"]))

        if config["input"] == "sampler":
            input_evaluation = []
        else:
            input_evaluation = config["input_evaluation"]

        return cls(
            env_creator,
            self.config["multiagent"]["policy_graphs"] or policy_graph,
            policy_mapping_fn=self.config["multiagent"]["policy_mapping_fn"],
            policies_to_train=self.config["multiagent"]["policies_to_train"],
            tf_session_creator=(session_creator
                                if config["tf_session_args"] else None),
            batch_steps=config["sample_batch_size"],
            batch_mode=config["batch_mode"],
            episode_horizon=config["horizon"],
            preprocessor_pref=config["preprocessor_pref"],
            sample_async=config["sample_async"],
            compress_observations=config["compress_observations"],
            num_envs=config["num_envs_per_worker"],
            observation_filter=config["observation_filter"],
            clip_rewards=config["clip_rewards"],
            clip_actions=config["clip_actions"],
            env_config=config["env_config"],
            model_config=config["model"],
            policy_config=config,
            worker_index=worker_index,
            monitor_path=self.logdir if config["monitor"] else None,
            log_dir=self.logdir,
            log_level=config["log_level"],
            callbacks=config["callbacks"],
            input_creator=input_creator,
            input_evaluation=input_evaluation,
            output_creator=output_creator,
            remote_worker_envs=config["remote_worker_envs"],
            async_remote_worker_envs=config["async_remote_worker_envs"])
Example #21
 def testAgentOutputOk(self):
     self.writeOutputs(self.test_dir)
     self.assertEqual(len(os.listdir(self.test_dir)), 1)
     reader = JsonReader(self.test_dir + "/*.json")
     reader.next()
Example #22
    def _make_worker(
        self,
        *,
        cls: Callable,
        env_creator: Callable[[EnvContext], EnvType],
        validate_env: Optional[Callable[[EnvType], None]],
        policy_cls: Type[Policy],
        worker_index: int,
        num_workers: int,
        config: TrainerConfigDict,
        spaces: Optional[Dict[PolicyID, Tuple[gym.spaces.Space,
                                              gym.spaces.Space]]] = None,
    ) -> Union[RolloutWorker, "ActorHandle"]:
        def session_creator():
            logger.debug("Creating TF session {}".format(
                config["tf_session_args"]))
            return tf1.Session(config=tf1.ConfigProto(
                **config["tf_session_args"]))

        if isinstance(config["input"], FunctionType):
            input_creator = config["input"]
        elif config["input"] == "sampler":
            input_creator = (lambda ioctx: ioctx.default_sampler_input())
        elif isinstance(config["input"], dict):
            input_creator = (
                lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx),
                                            config["shuffle_buffer_size"]))
        else:
            input_creator = (
                lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx),
                                            config["shuffle_buffer_size"]))

        if isinstance(config["output"], FunctionType):
            output_creator = config["output"]
        elif config["output"] is None:
            output_creator = (lambda ioctx: NoopOutput())
        elif config["output"] == "logdir":
            output_creator = (lambda ioctx: JsonWriter(
                ioctx.log_dir,
                ioctx,
                max_file_size=config["output_max_file_size"],
                compress_columns=config["output_compress_columns"]))
        else:
            output_creator = (lambda ioctx: JsonWriter(
                config["output"],
                ioctx,
                max_file_size=config["output_max_file_size"],
                compress_columns=config["output_compress_columns"]))

        if config["input"] == "sampler":
            input_evaluation = []
        else:
            input_evaluation = config["input_evaluation"]

        # Fill in the default policy_cls if 'None' is specified in multiagent.
        if config["multiagent"]["policies"]:
            tmp = config["multiagent"]["policies"]
            _validate_multiagent_config(tmp, allow_none_graph=True)
            # TODO: (sven) Allow for setting observation and action spaces to
            #  None as well, in which case, spaces are taken from env.
            #  It's tedious to have to provide these in a multi-agent config.
            for k, v in tmp.items():
                if v[0] is None:
                    tmp[k] = (policy_cls, v[1], v[2], v[3])
            policy_spec = tmp
        # Otherwise, policy spec is simply the policy class itself.
        else:
            policy_spec = policy_cls

        if worker_index == 0:
            extra_python_environs = config.get(
                "extra_python_environs_for_driver", None)
        else:
            extra_python_environs = config.get(
                "extra_python_environs_for_worker", None)

        worker = cls(
            env_creator=env_creator,
            validate_env=validate_env,
            policy_spec=policy_spec,
            policy_mapping_fn=config["multiagent"]["policy_mapping_fn"],
            policies_to_train=config["multiagent"]["policies_to_train"],
            tf_session_creator=(session_creator
                                if config["tf_session_args"] else None),
            rollout_fragment_length=config["rollout_fragment_length"],
            batch_mode=config["batch_mode"],
            episode_horizon=config["horizon"],
            preprocessor_pref=config["preprocessor_pref"],
            sample_async=config["sample_async"],
            compress_observations=config["compress_observations"],
            num_envs=config["num_envs_per_worker"],
            observation_fn=config["multiagent"]["observation_fn"],
            observation_filter=config["observation_filter"],
            clip_rewards=config["clip_rewards"],
            clip_actions=config["clip_actions"],
            env_config=config["env_config"],
            model_config=config["model"],
            policy_config=config,
            worker_index=worker_index,
            num_workers=num_workers,
            monitor_path=self._logdir if config["monitor"] else None,
            log_dir=self._logdir,
            log_level=config["log_level"],
            callbacks=config["callbacks"],
            input_creator=input_creator,
            input_evaluation=input_evaluation,
            output_creator=output_creator,
            remote_worker_envs=config["remote_worker_envs"],
            remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
            soft_horizon=config["soft_horizon"],
            no_done_at_end=config["no_done_at_end"],
            seed=(config["seed"] +
                  worker_index) if config["seed"] is not None else None,
            fake_sampler=config["fake_sampler"],
            extra_python_environs=extra_python_environs,
            spaces=spaces,
        )

        return worker
Example #23
File: test_io.py  Project: novahe/ray
 def test_agent_output_ok(self):
     for fw in framework_iterator(frameworks=("torch", "tf")):
         self.write_outputs(self.test_dir, fw)
         self.assertEqual(len(os.listdir(self.test_dir + fw)), 1)
         reader = JsonReader(self.test_dir + fw + "/*.json")
         reader.next()
Example #24
    def _make_worker(
            self,
            *,
            cls: Callable,
            env_creator: Callable[[EnvContext], EnvType],
            validate_env: Optional[Callable[[EnvType], None]],
            policy_cls: Type[Policy],
            worker_index: int,
            num_workers: int,
            config: TrainerConfigDict,
            spaces: Optional[Dict[PolicyID, Tuple[gym.spaces.Space,
                                                  gym.spaces.Space]]] = None,
    ) -> Union[RolloutWorker, ActorHandle]:
        def session_creator():
            logger.debug("Creating TF session {}".format(
                config["tf_session_args"]))
            return tf1.Session(
                config=tf1.ConfigProto(**config["tf_session_args"]))

        def valid_module(class_path):
            if isinstance(class_path, str) and "." in class_path:
                module_path, class_name = class_path.rsplit(".", 1)
                try:
                    spec = importlib.util.find_spec(module_path)
                    if spec is not None:
                        return True
                except (ModuleNotFoundError, ValueError):
                    print(
                        f"module {module_path} not found while trying to get "
                        f"input {class_path}")
            return False

        if isinstance(config["input"], FunctionType):
            input_creator = config["input"]
        elif config["input"] == "sampler":
            input_creator = (lambda ioctx: ioctx.default_sampler_input())
        elif isinstance(config["input"], dict):
            input_creator = (
                lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx),
                                            config["shuffle_buffer_size"]))
        elif isinstance(config["input"], str) and \
                registry_contains_input(config["input"]):
            input_creator = registry_get_input(config["input"])
        elif "d4rl" in config["input"]:
            env_name = config["input"].split(".")[-1]
            input_creator = (lambda ioctx: D4RLReader(env_name, ioctx))
        elif valid_module(config["input"]):
            input_creator = (lambda ioctx: ShuffledInput(from_config(
                config["input"], ioctx=ioctx)))
        else:
            input_creator = (
                lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx),
                                            config["shuffle_buffer_size"]))

        if isinstance(config["output"], FunctionType):
            output_creator = config["output"]
        elif config["output"] is None:
            output_creator = (lambda ioctx: NoopOutput())
        elif config["output"] == "logdir":
            output_creator = (lambda ioctx: JsonWriter(
                ioctx.log_dir,
                ioctx,
                max_file_size=config["output_max_file_size"],
                compress_columns=config["output_compress_columns"]))
        else:
            output_creator = (lambda ioctx: JsonWriter(
                config["output"],
                ioctx,
                max_file_size=config["output_max_file_size"],
                compress_columns=config["output_compress_columns"]))

        if config["input"] == "sampler":
            input_evaluation = []
        else:
            input_evaluation = config["input_evaluation"]

        # Assert everything is correct in "multiagent" config dict (if given).
        ma_policies = config["multiagent"]["policies"]
        if ma_policies:
            for pid, policy_spec in ma_policies.copy().items():
                assert isinstance(policy_spec, (PolicySpec, list, tuple))
                # Class is None -> Use `policy_cls`.
                if policy_spec.policy_class is None:
                    ma_policies[pid] = ma_policies[pid]._replace(
                        policy_class=policy_cls)
            policies = ma_policies

        # Create a policy_spec (MultiAgentPolicyConfigDict),
        # even if no "multiagent" setup given by user.
        else:
            policies = policy_cls

        if worker_index == 0:
            extra_python_environs = config.get(
                "extra_python_environs_for_driver", None)
        else:
            extra_python_environs = config.get(
                "extra_python_environs_for_worker", None)

        worker = cls(
            env_creator=env_creator,
            validate_env=validate_env,
            policy_spec=policies,
            policy_mapping_fn=config["multiagent"]["policy_mapping_fn"],
            policies_to_train=config["multiagent"]["policies_to_train"],
            tf_session_creator=(session_creator
                                if config["tf_session_args"] else None),
            rollout_fragment_length=config["rollout_fragment_length"],
            count_steps_by=config["multiagent"]["count_steps_by"],
            batch_mode=config["batch_mode"],
            episode_horizon=config["horizon"],
            preprocessor_pref=config["preprocessor_pref"],
            sample_async=config["sample_async"],
            compress_observations=config["compress_observations"],
            num_envs=config["num_envs_per_worker"],
            observation_fn=config["multiagent"]["observation_fn"],
            observation_filter=config["observation_filter"],
            clip_rewards=config["clip_rewards"],
            normalize_actions=config["normalize_actions"],
            clip_actions=config["clip_actions"],
            env_config=config["env_config"],
            policy_config=config,
            worker_index=worker_index,
            num_workers=num_workers,
            record_env=config["record_env"],
            log_dir=self._logdir,
            log_level=config["log_level"],
            callbacks=config["callbacks"],
            input_creator=input_creator,
            input_evaluation=input_evaluation,
            output_creator=output_creator,
            remote_worker_envs=config["remote_worker_envs"],
            remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
            soft_horizon=config["soft_horizon"],
            no_done_at_end=config["no_done_at_end"],
            seed=(config["seed"] + worker_index)
            if config["seed"] is not None else None,
            fake_sampler=config["fake_sampler"],
            extra_python_environs=extra_python_environs,
            spaces=spaces,
        )

        return worker
Example #25
    def test_marwil_loss_function(self):
        """
        To generate the historic data used in this test case, first run:
        $ ./train.py --run=PPO --env=CartPole-v0 \
          --stop='{"timesteps_total": 50000}' \
          --config='{"output": "/tmp/out", "batch_mode": "complete_episodes"}'
        """
        rllib_dir = Path(__file__).parent.parent.parent.parent
        print("rllib dir={}".format(rllib_dir))
        data_file = os.path.join(rllib_dir, "tests/data/cartpole/small.json")
        print("data_file={} exists={}".format(data_file,
                                              os.path.isfile(data_file)))

        config = (marwil.MARWILConfig().rollouts(
            num_rollout_workers=0).offline_data(input_=[data_file])
                  )  # Learn from offline data.

        for fw, sess in framework_iterator(config, session=True):
            reader = JsonReader(inputs=[data_file])
            batch = reader.next()

            trainer = config.build(env="CartPole-v0")
            policy = trainer.get_policy()
            model = policy.model

            # Calculate our own expected values (to then compare against the
            # agent's loss output).
            cummulative_rewards = compute_advantages(batch, 0.0, config.gamma,
                                                     1.0, False,
                                                     False)["advantages"]
            if fw == "torch":
                cummulative_rewards = torch.tensor(cummulative_rewards)
            if fw != "tf":
                batch = policy._lazy_tensor_dict(batch)
            model_out, _ = model(batch)
            vf_estimates = model.value_function()
            if fw == "tf":
                model_out, vf_estimates = policy.get_session().run(
                    [model_out, vf_estimates])
            adv = cummulative_rewards - vf_estimates
            if fw == "torch":
                adv = adv.detach().cpu().numpy()
            adv_squared = np.mean(np.square(adv))
            c_2 = 100.0 + 1e-8 * (adv_squared - 100.0)
            c = np.sqrt(c_2)
            exp_advs = np.exp(config.beta * (adv / c))
            dist = policy.dist_class(model_out, model)
            logp = dist.logp(batch["actions"])
            if fw == "torch":
                logp = logp.detach().cpu().numpy()
            elif fw == "tf":
                logp = sess.run(logp)
            # Calculate all expected loss components.
            expected_vf_loss = 0.5 * adv_squared
            expected_pol_loss = -1.0 * np.mean(exp_advs * logp)
            expected_loss = expected_pol_loss + config.vf_coeff * expected_vf_loss

            # Calculate the algorithm's loss (to check against our own
            # calculation above).
            batch.set_get_interceptor(None)
            postprocessed_batch = policy.postprocess_trajectory(batch)
            loss_func = (MARWILTF2Policy.loss
                         if fw != "torch" else MARWILTorchPolicy.loss)
            if fw != "tf":
                policy._lazy_tensor_dict(postprocessed_batch)
                loss_out = loss_func(policy, model, policy.dist_class,
                                     postprocessed_batch)
            else:
                loss_out, v_loss, p_loss = policy.get_session().run(
                    # policy._loss is create by TFPolicy, and is basically the
                    # loss tensor of the static graph.
                    [
                        policy._loss,
                        policy._marwil_loss.v_loss,
                        policy._marwil_loss.p_loss,
                    ],
                    feed_dict=policy._get_loss_inputs_dict(postprocessed_batch,
                                                           shuffle=False),
                )

            # Check all components.
            if fw == "torch":
                check(policy.v_loss, expected_vf_loss, decimals=4)
                check(policy.p_loss, expected_pol_loss, decimals=4)
            elif fw == "tf":
                check(v_loss, expected_vf_loss, decimals=4)
                check(p_loss, expected_pol_loss, decimals=4)
            else:
                check(policy._marwil_loss.v_loss, expected_vf_loss, decimals=4)
                check(policy._marwil_loss.p_loss,
                      expected_pol_loss,
                      decimals=4)
            check(loss_out, expected_loss, decimals=3)
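Read together, the hand-computed quantities above amount to the following (a sketch; RLlib maintains the normalizer c with a moving average rather than the one-shot update used in the test):

A = R - V(s), \qquad c = \sqrt{100 + 10^{-8}\left(\mathbb{E}[A^2] - 100\right)}
L_{\mathrm{vf}} = \tfrac{1}{2}\,\mathbb{E}[A^2], \qquad
L_{\mathrm{pol}} = -\,\mathbb{E}\!\left[e^{\beta A / c}\,\log \pi(a \mid s)\right], \qquad
L = L_{\mathrm{pol}} + c_{\mathrm{vf}}\,L_{\mathrm{vf}}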
Example #26
    def _make_worker(self, cls, env_creator, policy, worker_index, config):
        def session_creator():
            logger.debug("Creating TF session {}".format(
                config["tf_session_args"]))
            return tf.Session(config=tf.ConfigProto(
                **config["tf_session_args"]))

        if isinstance(config["input"], FunctionType):
            input_creator = config["input"]
        elif config["input"] == "sampler":
            input_creator = (lambda ioctx: ioctx.default_sampler_input())
        elif isinstance(config["input"], dict):
            input_creator = (
                lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx),
                                            config["shuffle_buffer_size"]))
        else:
            input_creator = (
                lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx),
                                            config["shuffle_buffer_size"]))

        if isinstance(config["output"], FunctionType):
            output_creator = config["output"]
        elif config["output"] is None:
            output_creator = (lambda ioctx: NoopOutput())
        elif config["output"] == "logdir":
            output_creator = (lambda ioctx: JsonWriter(
                ioctx.log_dir,
                ioctx,
                max_file_size=config["output_max_file_size"],
                compress_columns=config["output_compress_columns"]))
        else:
            output_creator = (lambda ioctx: JsonWriter(
                config["output"],
                ioctx,
                max_file_size=config["output_max_file_size"],
                compress_columns=config["output_compress_columns"]))

        if config["input"] == "sampler":
            input_evaluation = []
        else:
            input_evaluation = config["input_evaluation"]

        # Fill in the default policy if 'None' is specified in multiagent
        if config["multiagent"]["policies"]:
            tmp = config["multiagent"]["policies"]
            _validate_multiagent_config(tmp, allow_none_graph=True)
            for k, v in tmp.items():
                if v[0] is None:
                    tmp[k] = (policy, v[1], v[2], v[3])
            policy = tmp

        return cls(env_creator,
                   policy,
                   policy_mapping_fn=config["multiagent"]["policy_mapping_fn"],
                   policies_to_train=config["multiagent"]["policies_to_train"],
                   tf_session_creator=(session_creator
                                       if config["tf_session_args"] else None),
                   batch_steps=config["sample_batch_size"],
                   batch_mode=config["batch_mode"],
                   episode_horizon=config["horizon"],
                   preprocessor_pref=config["preprocessor_pref"],
                   sample_async=config["sample_async"],
                   compress_observations=config["compress_observations"],
                   num_envs=config["num_envs_per_worker"],
                   observation_filter=config["observation_filter"],
                   clip_rewards=config["clip_rewards"],
                   clip_actions=config["clip_actions"],
                   env_config=config["env_config"],
                   model_config=config["model"],
                   policy_config=config,
                   worker_index=worker_index,
                   monitor_path=self._logdir if config["monitor"] else None,
                   log_dir=self._logdir,
                   log_level=config["log_level"],
                   callbacks=config["callbacks"],
                   input_creator=input_creator,
                   input_evaluation=input_evaluation,
                   output_creator=output_creator,
                   remote_worker_envs=config["remote_worker_envs"],
                   remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
                   soft_horizon=config["soft_horizon"],
                   no_done_at_end=config["no_done_at_end"],
                   seed=(config["seed"] +
                         worker_index) if config["seed"] is not None else None,
                   _fake_sampler=config.get("_fake_sampler", False))
Example #27
 def testAgentOutputOk(self):
     self.writeOutputs(self.test_dir)
     self.assertEqual(len(os.listdir(self.test_dir)), 1)
     reader = JsonReader(self.test_dir + "/*.json")
     reader.next()
Example #28
    def _make_worker(
        self,
        *,
        cls: Callable,
        env_creator: EnvCreator,
        validate_env: Optional[Callable[[EnvType], None]],
        policy_cls: Type[Policy],
        worker_index: int,
        num_workers: int,
        recreated_worker: bool = False,
        config: AlgorithmConfigDict,
        spaces: Optional[Dict[PolicyID, Tuple[gym.spaces.Space,
                                              gym.spaces.Space]]] = None,
    ) -> Union[RolloutWorker, ActorHandle]:
        def session_creator():
            logger.debug("Creating TF session {}".format(
                config["tf_session_args"]))
            return tf1.Session(config=tf1.ConfigProto(
                **config["tf_session_args"]))

        def valid_module(class_path):
            if (isinstance(class_path, str) and not os.path.isfile(class_path)
                    and "." in class_path):
                module_path, class_name = class_path.rsplit(".", 1)
                try:
                    spec = importlib.util.find_spec(module_path)
                    if spec is not None:
                        return True
                except (ModuleNotFoundError, ValueError):
                    print(
                        f"module {module_path} not found while trying to get "
                        f"input {class_path}")
            return False

        # A callable returning an InputReader object to use.
        if isinstance(config["input"], FunctionType):
            input_creator = config["input"]
        # Use RLlib's Sampler classes (SyncSampler or AsynchSampler, depending
        # on `config.sample_async` setting).
        elif config["input"] == "sampler":
            input_creator = lambda ioctx: ioctx.default_sampler_input()
        # Ray Dataset input -> Use `config.input_config` to construct DatasetReader.
        elif config["input"] == "dataset":
            # Input dataset shards should have already been prepared.
            # We just need to take the proper shard here.
            input_creator = lambda ioctx: DatasetReader(
                ioctx, self._ds_shards[worker_index])
        # Dict: Mix of different input methods with different ratios.
        elif isinstance(config["input"], dict):
            input_creator = lambda ioctx: ShuffledInput(
                MixedInput(config["input"], ioctx), config[
                    "shuffle_buffer_size"])
        # A pre-registered input descriptor (str).
        elif isinstance(config["input"], str) and registry_contains_input(
                config["input"]):
            input_creator = registry_get_input(config["input"])
        # D4RL input.
        elif "d4rl" in config["input"]:
            env_name = config["input"].split(".")[-1]
            input_creator = lambda ioctx: D4RLReader(env_name, ioctx)
        # Valid python module (class path) -> Create using `from_config`.
        elif valid_module(config["input"]):
            input_creator = lambda ioctx: ShuffledInput(
                from_config(config["input"], ioctx=ioctx))
        # JSON file or list of JSON files -> Use JsonReader (shuffled).
        else:
            input_creator = lambda ioctx: ShuffledInput(
                JsonReader(config["input"], ioctx), config[
                    "shuffle_buffer_size"])

        if isinstance(config["output"], FunctionType):
            output_creator = config["output"]
        elif config["output"] is None:
            output_creator = lambda ioctx: NoopOutput()
        elif config["output"] == "dataset":
            output_creator = lambda ioctx: DatasetWriter(
                ioctx, compress_columns=config["output_compress_columns"])
        elif config["output"] == "logdir":
            output_creator = lambda ioctx: JsonWriter(
                ioctx.log_dir,
                ioctx,
                max_file_size=config["output_max_file_size"],
                compress_columns=config["output_compress_columns"],
            )
        else:
            output_creator = lambda ioctx: JsonWriter(
                config["output"],
                ioctx,
                max_file_size=config["output_max_file_size"],
                compress_columns=config["output_compress_columns"],
            )

        # Assert everything is correct in "multiagent" config dict (if given).
        ma_policies = config["multiagent"]["policies"]
        if ma_policies:
            for pid, policy_spec in ma_policies.copy().items():
                assert isinstance(policy_spec, PolicySpec)
                # Class is None -> Use `policy_cls`.
                if policy_spec.policy_class is None:
                    ma_policies[pid].policy_class = policy_cls
            policies = ma_policies

        # Create a policy_spec (MultiAgentPolicyConfigDict),
        # even if no "multiagent" setup given by user.
        else:
            policies = policy_cls

        if worker_index == 0:
            extra_python_environs = config.get(
                "extra_python_environs_for_driver", None)
        else:
            extra_python_environs = config.get(
                "extra_python_environs_for_worker", None)

        worker = cls(
            env_creator=env_creator,
            validate_env=validate_env,
            policy_spec=policies,
            policy_mapping_fn=config["multiagent"]["policy_mapping_fn"],
            policies_to_train=config["multiagent"]["policies_to_train"],
            tf_session_creator=(session_creator
                                if config["tf_session_args"] else None),
            rollout_fragment_length=config["rollout_fragment_length"],
            count_steps_by=config["multiagent"]["count_steps_by"],
            batch_mode=config["batch_mode"],
            episode_horizon=config["horizon"],
            preprocessor_pref=config["preprocessor_pref"],
            sample_async=config["sample_async"],
            compress_observations=config["compress_observations"],
            num_envs=config["num_envs_per_worker"],
            observation_fn=config["multiagent"]["observation_fn"],
            observation_filter=config["observation_filter"],
            clip_rewards=config["clip_rewards"],
            normalize_actions=config["normalize_actions"],
            clip_actions=config["clip_actions"],
            env_config=config["env_config"],
            policy_config=config,
            worker_index=worker_index,
            num_workers=num_workers,
            recreated_worker=recreated_worker,
            log_dir=self._logdir,
            log_level=config["log_level"],
            callbacks=config["callbacks"],
            input_creator=input_creator,
            output_creator=output_creator,
            remote_worker_envs=config["remote_worker_envs"],
            remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
            soft_horizon=config["soft_horizon"],
            no_done_at_end=config["no_done_at_end"],
            seed=(config["seed"] +
                  worker_index) if config["seed"] is not None else None,
            fake_sampler=config["fake_sampler"],
            extra_python_environs=extra_python_environs,
            spaces=spaces,
            disable_env_checking=config["disable_env_checking"],
        )

        return worker
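For reference, a hedged sketch of config values that would route the _make_worker above into the ShuffledInput(JsonReader(...)) fallback branch; paths and sizes are placeholders:

# Hedged sketch: values that exercise the JsonReader branch of _make_worker.
config = {
    "input": "/tmp/cartpole-out/*.json",  # not "sampler"/"dataset", not a dict,
                                          # not registered, not d4rl, not a module path
    "shuffle_buffer_size": 1000,          # size of the ShuffledInput buffer
    "output": None,                       # -> NoopOutput
}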
Example #29
    def test_marwil_loss_function(self):
        """
        To generate the historic data used in this test case, first run:
        $ ./train.py --run=PPO --env=CartPole-v0 \
          --stop='{"timesteps_total": 50000}' \
          --config='{"output": "/tmp/out", "batch_mode": "complete_episodes"}'
        """
        rllib_dir = Path(__file__).parent.parent.parent.parent
        print("rllib dir={}".format(rllib_dir))
        data_file = os.path.join(rllib_dir, "tests/data/cartpole/small.json")
        print("data_file={} exists={}".format(data_file,
                                              os.path.isfile(data_file)))
        config = marwil.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        # Learn from offline data.
        config["input"] = [data_file]

        for fw in framework_iterator(config, frameworks=["torch", "tf2"]):
            reader = JsonReader(inputs=[data_file])
            batch = reader.next()

            trainer = marwil.MARWILTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            model = policy.model

            # Calculate our own expected values (to then compare against the
            # agent's loss output).
            cummulative_rewards = compute_advantages(batch, 0.0,
                                                     config["gamma"], 1.0,
                                                     False,
                                                     False)["advantages"]
            if fw == "torch":
                cummulative_rewards = torch.tensor(cummulative_rewards)
            batch = policy._lazy_tensor_dict(batch)
            model_out, _ = model.from_batch(batch)
            vf_estimates = model.value_function()
            adv = cummulative_rewards - vf_estimates
            if fw == "torch":
                adv = adv.detach().cpu().numpy()
            adv_squared = np.mean(np.square(adv))
            c_2 = 100.0 + 1e-8 * (adv_squared - 100.0)
            c = np.sqrt(c_2)
            exp_advs = np.exp(config["beta"] * (adv / c))
            logp = policy.dist_class(model_out, model).logp(batch["actions"])
            if fw == "torch":
                logp = logp.detach().cpu().numpy()
            # Calculate all expected loss components.
            expected_vf_loss = 0.5 * adv_squared
            expected_pol_loss = -1.0 * np.mean(exp_advs * logp)
            expected_loss = \
                expected_pol_loss + config["vf_coeff"] * expected_vf_loss

            # Calculate the algorithm's loss (to check against our own
            # calculation above).
            batch.set_get_interceptor(None)
            postprocessed_batch = policy.postprocess_trajectory(batch)
            loss_func = marwil.marwil_tf_policy.marwil_loss if fw != "torch" \
                else marwil.marwil_torch_policy.marwil_loss
            loss_out = loss_func(policy, model, policy.dist_class,
                                 policy._lazy_tensor_dict(postprocessed_batch))

            # Check all components.
            if fw == "torch":
                check(policy.v_loss, expected_vf_loss, decimals=4)
                check(policy.p_loss, expected_pol_loss, decimals=4)
            else:
                check(policy.loss.v_loss, expected_vf_loss, decimals=4)
                check(policy.loss.p_loss, expected_pol_loss, decimals=4)
            check(loss_out, expected_loss, decimals=3)
Example #30
    def _make_worker(self, cls, env_creator, policy, worker_index, config):
        def session_creator():
            logger.debug("Creating TF session {}".format(
                config["tf_session_args"]))
            return tf.Session(config=tf.ConfigProto(
                **config["tf_session_args"]))

        if isinstance(config["input"], FunctionType):
            input_creator = config["input"]
        elif config["input"] == "sampler":
            input_creator = (lambda ioctx: ioctx.default_sampler_input())
        elif isinstance(config["input"], dict):
            input_creator = (
                lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx),
                                            config["shuffle_buffer_size"]))
        else:
            input_creator = (
                lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx),
                                            config["shuffle_buffer_size"]))

        if isinstance(config["output"], FunctionType):
            output_creator = config["output"]
        elif config["output"] is None:
            output_creator = (lambda ioctx: NoopOutput())
        elif config["output"] == "logdir":
            output_creator = (lambda ioctx: JsonWriter(
                ioctx.log_dir,
                ioctx,
                max_file_size=config["output_max_file_size"],
                compress_columns=config["output_compress_columns"]))
        else:
            output_creator = (lambda ioctx: JsonWriter(
                config["output"],
                ioctx,
                max_file_size=config["output_max_file_size"],
                compress_columns=config["output_compress_columns"]))

        if config["input"] == "sampler":
            input_evaluation = []
        else:
            input_evaluation = config["input_evaluation"]

        # Fill in the default policy if 'None' is specified in multiagent
        if config["multiagent"]["policies"]:
            tmp = config["multiagent"]["policies"]
            _validate_multiagent_config(tmp, allow_none_graph=True)
            for k, v in tmp.items():
                if v[0] is None:
                    tmp[k] = (policy, v[1], v[2], v[3])
            policy = tmp

        worker = cls(
            env_creator,
            policy,
            policy_mapping_fn=config["multiagent"]["policy_mapping_fn"],
            policies_to_train=config["multiagent"]["policies_to_train"],
            tf_session_creator=(session_creator
                                if config["tf_session_args"] else None),
            rollout_fragment_length=config["rollout_fragment_length"],
            batch_mode=config["batch_mode"],
            episode_horizon=config["horizon"],
            preprocessor_pref=config["preprocessor_pref"],
            sample_async=config["sample_async"],
            compress_observations=config["compress_observations"],
            num_envs=config["num_envs_per_worker"],
            observation_filter=config["observation_filter"],
            clip_rewards=config["clip_rewards"],
            clip_actions=config["clip_actions"],
            env_config=config["env_config"],
            model_config=config["model"],
            policy_config=config,
            worker_index=worker_index,
            num_workers=self._num_workers,
            monitor_path=self._logdir if config["monitor"] else None,
            log_dir=self._logdir,
            log_level=config["log_level"],
            callbacks=config["callbacks"],
            input_creator=input_creator,
            input_evaluation=input_evaluation,
            output_creator=output_creator,
            remote_worker_envs=config["remote_worker_envs"],
            remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
            soft_horizon=config["soft_horizon"],
            no_done_at_end=config["no_done_at_end"],
            seed=(config["seed"] +
                  worker_index) if config["seed"] is not None else None,
            _fake_sampler=config.get("_fake_sampler", False))

        # Check for correct policy class (only locally, remote Workers should
        # create the exact same Policy types).
        if type(worker) is RolloutWorker:
            actual_class = type(worker.get_policy())

            # Pytorch case: Policy must be a TorchPolicy.
            if config["use_pytorch"]:
                assert issubclass(actual_class, TorchPolicy), \
                    "Worker policy must be subclass of `TorchPolicy`, " \
                    "but is {}!".format(actual_class.__name__)
            # non-Pytorch case:
            # Policy may be None AND must not be a TorchPolicy.
            else:
                assert issubclass(actual_class, type(None)) or \
                       (issubclass(actual_class, Policy) and
                        not issubclass(actual_class, TorchPolicy)), "Worker " \
                       "policy must be subclass of `Policy`, but NOT " \
                       "`TorchPolicy` (your class={})! If you have a torch " \
                       "Trainer, make sure to set `use_pytorch=True` in " \
                       "your Trainer's config)!".format(actual_class.__name__)

        return worker
Example #31
 def testAgentOutputOk(self):
     self.writeOutputs(self.test_dir)
     self.assertEqual(len(os.listdir(self.test_dir)), 1)
     ioctx = IOContext(self.test_dir, {}, 0, None)
     reader = JsonReader(ioctx, self.test_dir + "/*.json")
     reader.next()
Example #32
class TorchCustomLossModel(TorchModelV2, nn.Module):
    """PyTorch version of the CustomLossModel above."""
    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name, input_files):
        super().__init__(obs_space, action_space, num_outputs, model_config,
                         name)
        nn.Module.__init__(self)

        self.input_files = input_files
        # Create a new input reader per worker.
        self.reader = JsonReader(self.input_files)
        self.fcnet = TorchFC(self.obs_space,
                             self.action_space,
                             num_outputs,
                             model_config,
                             name="fcnet")

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        # Delegate to our FCNet.
        return self.fcnet(input_dict, state, seq_lens)

    @override(ModelV2)
    def custom_loss(self, policy_loss, loss_inputs):
        """Calculates a custom loss on top of the given policy_loss(es).

        Args:
            policy_loss (List[TensorType]): The list of already calculated
                policy losses (as many as there are optimizers).
            loss_inputs (TensorStruct): Struct of np.ndarrays holding the
                entire train batch.

        Returns:
            List[TensorType]: The altered list of policy losses. In case the
                custom loss should have its own optimizer, make sure the
                returned list is one larger than the incoming policy_loss list.
                In case you simply want to mix in the custom loss into the
                already calculated policy losses, return a list of altered
                policy losses (as done in this example below).
        """
        # Get the next batch from our input files.
        batch = self.reader.next()

        # Define a secondary loss by building a graph copy with weight sharing.
        obs = restore_original_dimensions(torch.from_numpy(
            batch["obs"]).float(),
                                          self.obs_space,
                                          tensorlib="torch")
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # Compute the IL loss.
        action_dist = TorchCategorical(logits, self.model_config)
        imitation_loss = torch.mean(
            -action_dist.logp(torch.from_numpy(batch["actions"])))
        self.imitation_loss_metric = imitation_loss.item()
        self.policy_loss_metric = np.mean([l.item() for l in policy_loss])

        # Add the imitation loss to each already calculated policy loss term.
        # Alternatively (if custom loss has its own optimizer):
        # return policy_loss + [10 * self.imitation_loss]
        return [loss_ + 10 * imitation_loss for loss_ in policy_loss]

    def metrics(self):
        return {
            "policy_loss": self.policy_loss_metric,
            "imitation_loss": self.imitation_loss_metric,
        }
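A hedged sketch of how a model like TorchCustomLossModel is usually registered and handed its input_files through custom_model_config; the registered name and the path are placeholders:

# Hedged sketch: registering the torch model above and passing input_files
# via custom_model_config (extra kwargs are forwarded to the model's __init__).
from ray.rllib.models import ModelCatalog

ModelCatalog.register_custom_model("custom_loss_model", TorchCustomLossModel)

config = {
    "framework": "torch",
    "model": {
        "custom_model": "custom_loss_model",
        "custom_model_config": {
            "input_files": "/tmp/expert-data",  # JSON files for the JsonReader
        },
    },
}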