Пример #1
0
    def log(
        self,
        log_writer: SummaryWriter,
        task_outputs: Optional[List[Any]],
        render: Optional[Dict[str, List[Dict[str, Any]]]],
        num_steps: int,
    ):
        viz_order, all_episodes = self._auto_viz_order(task_outputs)
        if viz_order is None:
            get_logger().debug("trajectory viz returning without visualizing")
            return

        for page, current_ids in enumerate(viz_order):
            figs = []
            for episode_id in current_ids:
                # assert episode_id in all_episodes
                if episode_id not in all_episodes:
                    get_logger().warning(
                        "skipping viz for missing episode {}".format(episode_id)
                    )
                    continue
                figs.append(self.make_fig(all_episodes[episode_id], episode_id))
            if len(figs) == 0:
                continue
            log_writer.add_figure(
                "{}/{}_group{}".format(self.mode, self.label, page),
                figs,
                global_step=num_steps,
            )
            plt.close(
                "all"
            )  # close all current figures (SummaryWriter already closes all figures we log)
Пример #2
0
    def log(
        self,
        log_writer: SummaryWriter,
        task_outputs: Optional[List[Any]],
        render: Optional[Dict[str, List[Dict[str, Any]]]],
        num_steps: int,
    ):
        if render is None:
            return

        if (
            self.action_names is None
            and task_outputs is not None
            and len(task_outputs) > 0
            and self.action_names_path is not None
        ):
            self.action_names = list(
                self._access(task_outputs[0], self.action_names_path)
            )

        viz_order, _ = self._auto_viz_order(task_outputs)
        if viz_order is None:
            get_logger().debug("actor viz returning without visualizing")
            return

        for page, current_ids in enumerate(viz_order):
            figs = []
            for episode_id in current_ids:
                # assert episode_id in render
                if episode_id not in render:
                    get_logger().warning(
                        "skipping viz for missing episode {}".format(episode_id)
                    )
                    continue
                episode_src = [
                    step["actor_probs"]
                    for step in render[episode_id]
                    if "actor_probs" in step
                ]
                assert len(episode_src) == len(render[episode_id])
                figs.append(self.make_fig(episode_src, episode_id))
            if len(figs) == 0:
                continue
            log_writer.add_figure(
                "{}/{}_group{}".format(self.mode, self.label, page),
                figs,
                global_step=num_steps,
            )
            plt.close(
                "all"
            )  # close all current figures (SummaryWriter already closes all figures we log)
Пример #3
0
    def log(
        self,
        log_writer: SummaryWriter,
        task_outputs: Optional[List[Any]],
        render: Optional[Dict[str, List[Dict[str, Any]]]],
        num_steps: int,
    ):
        if render is None:
            return

        viz_order, _ = self._auto_viz_order(task_outputs)
        if viz_order is None:
            get_logger().debug("tensor viz returning without visualizing")
            return

        for page, current_ids in enumerate(viz_order):
            figs = []
            for episode_id in current_ids:
                if episode_id not in render or len(render[episode_id]) == 0:
                    get_logger().warning(
                        "skipping viz for missing or 0-length episode {}".format(
                            episode_id
                        )
                    )
                    continue
                episode_src = [
                    step[self.datum_id]
                    for step in render[episode_id]
                    if self.datum_id in step
                ]
                if len(episode_src) > 0:
                    # If the last episode for an inference worker is of length 1, there's no captured rollout sources
                    figs.append(self.make_fig(episode_src, episode_id))
            if len(figs) == 0:
                continue
            log_writer.add_figure(
                "{}/{}_group{}".format(self.mode, self.label, page),
                figs,
                global_step=num_steps,
            )
            plt.close(
                "all"
            )  # close all current figures (SummaryWriter already closes all figures we log)
Пример #4
0
    def log(
        self,
        log_writer: SummaryWriter,
        task_outputs: Optional[List[Any]],
        render: Optional[Dict[str, List[Dict[str, Any]]]],
        num_steps: int,
    ):
        if render is None:
            return

        datum_id = self._source_to_str(self.vector_task_sources[0], is_vector_task=True)

        viz_order, _ = self._auto_viz_order(task_outputs)
        if viz_order is None:
            get_logger().debug("agent view viz returning without visualizing")
            return

        for page, current_ids in enumerate(viz_order):
            images = []  # list of lists of rgb frames
            for episode_id in current_ids:
                # assert episode_id in render
                if episode_id not in render:
                    get_logger().warning(
                        "skipping viz for missing episode {}".format(episode_id)
                    )
                    continue
                images.append(
                    [
                        self._overlay_label(step[datum_id], episode_id)
                        for step in render[episode_id]
                    ]
                )
            if len(images) == 0:
                continue
            vid = self.make_vid(images)
            if vid is not None:
                log_writer.add_vid(
                    "{}/{}_group{}".format(self.mode, self.label, page),
                    vid,
                    global_step=num_steps,
                )
Пример #5
0
    def log(
            self,
            start_time_str: str,
            nworkers: int,
            test_steps: Sequence[int] = (),
            metrics_file: Optional[str] = None,
    ):
        finalized = False

        log_writer: Optional[SummaryWriter] = None
        if not self.disable_tensorboard:
            log_writer = SummaryWriter(
                log_dir=self.log_writer_path(start_time_str),
                filename_suffix="__{}_{}".format(self.mode,
                                                 self.local_start_time_str),
            )

        # To aggregate/buffer metrics from trainers/testers
        collected: List[LoggingPackage] = []
        last_train_steps = 0
        last_offpolicy_steps = 0
        last_train_time = time.time()
        # test_steps = sorted(test_steps, reverse=True)
        test_results: List[Dict] = []
        unfinished_workers = nworkers

        try:
            while True:
                try:
                    package: Union[LoggingPackage, Union[
                        Tuple[str, Any],
                        Tuple[str, Any,
                              Any]]] = self.queues["results"].get(timeout=1)

                    if isinstance(package, LoggingPackage):
                        pkg_mode = package.mode

                        if pkg_mode == "train":
                            collected.append(package)
                            if len(collected) >= nworkers:

                                collected = sorted(
                                    collected,
                                    key=lambda pkg: (
                                        pkg.training_steps,
                                        pkg.off_policy_steps,
                                    ),
                                )

                                if (
                                        collected[nworkers - 1].training_steps
                                        == collected[0].training_steps
                                        and collected[nworkers -
                                                      1].off_policy_steps
                                        == collected[0].off_policy_steps
                                ):  # ensure nworkers have provided the same num_steps
                                    (
                                        last_train_steps,
                                        last_offpolicy_steps,
                                        last_train_time,
                                    ) = self.process_train_packages(
                                        log_writer=log_writer,
                                        pkgs=collected[:nworkers],
                                        last_steps=last_train_steps,
                                        last_offpolicy_steps=
                                        last_offpolicy_steps,
                                        last_time=last_train_time,
                                    )
                                    collected = collected[nworkers:]
                                elif len(collected) > 2 * nworkers:
                                    get_logger().warning(
                                        "Unable to aggregate train packages from all {} workers"
                                        "after {} packages collected".format(
                                            nworkers, len(collected)))
                        elif pkg_mode == "valid":  # they all come from a single worker
                            if (package.training_steps
                                    is not None):  # no validation samplers
                                self.process_eval_package(
                                    log_writer=log_writer, pkg=package)
                            if (
                                    finalized
                                    and self.queues["checkpoints"].empty()
                            ):  # assume queue is actually empty after trainer finished and no checkpoints in queue
                                break
                        elif pkg_mode == "test":
                            collected.append(package)
                            if len(collected) >= nworkers:
                                collected = sorted(
                                    collected, key=lambda x: x.training_steps
                                )  # sort by num_steps
                                if (
                                        collected[nworkers - 1].training_steps
                                        == collected[0].training_steps
                                ):  # ensure nworkers have provided the same num_steps
                                    self.process_test_packages(
                                        log_writer=log_writer,
                                        pkgs=collected[:nworkers],
                                        all_results=test_results,
                                    )

                                    collected = collected[nworkers:]
                                    with open(metrics_file, "w") as f:
                                        json.dump(
                                            test_results,
                                            f,
                                            indent=4,
                                            sort_keys=True,
                                            cls=NumpyJSONEncoder,
                                        )
                                        get_logger().info(
                                            "Updated {} up to checkpoint {}".
                                            format(
                                                metrics_file,
                                                test_steps[len(test_results) -
                                                           1],
                                            ))
                        else:
                            get_logger().error(
                                f"Runner received unknown package of type {pkg_mode}"
                            )
                    else:
                        pkg_mode = package[0]

                        if pkg_mode == "train_stopped":
                            if package[1] == 0:
                                finalized = True
                                if not self.running_validation:
                                    get_logger().info(
                                        "Terminating runner after trainer done (no validation)"
                                    )
                                    break
                            else:
                                raise Exception(
                                    "Train worker {} abnormally terminated".
                                    format(package[1] - 1))
                        elif pkg_mode == "valid_stopped":
                            raise Exception(
                                "Valid worker {} abnormally terminated".format(
                                    package[1] - 1))
                        elif pkg_mode == "test_stopped":
                            if package[1] == 0:
                                unfinished_workers -= 1
                                if unfinished_workers == 0:
                                    get_logger().info(
                                        "Last tester finished. Terminating")
                                    finalized = True
                                    break
                            else:
                                raise RuntimeError(
                                    "Test worker {} abnormally terminated".
                                    format(package[1] - 1))
                        else:
                            get_logger().error(
                                f"Runner received invalid package tuple {package}"
                            )
                except queue.Empty as _:
                    if all(p.exitcode is not None
                           for p in itertools.chain(*self.processes.values())):
                        break
        except KeyboardInterrupt:
            get_logger().info("KeyboardInterrupt. Terminating runner.")
        except Exception:
            get_logger().error("Encountered Exception. Terminating runner.")
            get_logger().exception(traceback.format_exc())
        finally:
            if finalized:
                get_logger().info("Done")
            if log_writer is not None:
                log_writer.close()
            self.close()
            return test_results
Пример #6
0
    def make_tensorboard_summary(self):
        all_experiments = list(self.experiment_to_train_events_paths_map.keys())

        for experiment_name in all_experiments:
            summary_writer = SummaryWriter(
                os.path.join(self.tensorboard_output_summary_folder, experiment_name)
            )

            test_labels = (
                sorted(list(self.test_data[experiment_name].keys()))
                if len(self.test_data) > 0
                else []
            )
            for test_label in test_labels:
                train_label = test_label.replace("valid", "test").replace(
                    "test", "train"
                )
                if train_label not in self.train_data[experiment_name]:
                    print(
                        f"Missing matching 'train' label {train_label} for eval label {test_label}. Skipping"
                    )
                    continue
                train_data = self.train_data[experiment_name][train_label]
                test_data = self.test_data[experiment_name][test_label]
                scores, times, steps = self._eval_vs_train_time_steps(
                    test_data, train_data
                )
                for score, t, step in zip(scores, times, steps):
                    summary_writer.add_scalar(
                        test_label, score, global_step=step, walltime=t
                    )

            valid_labels = sorted(
                [
                    key
                    for key in list(self.train_data[experiment_name].keys())
                    if "valid" in key
                ]
            )
            for valid_label in valid_labels:
                train_label = valid_label.replace("valid", "train")
                assert (
                    train_label in self.train_data[experiment_name]
                ), f"Missing matching 'train' label {train_label} for valid label {valid_label}"
                train_data = self.train_data[experiment_name][train_label]
                valid_data = self.train_data[experiment_name][valid_label]
                scores, times, steps = self._eval_vs_train_time_steps(
                    valid_data, train_data
                )
                for score, t, step in zip(scores, times, steps):
                    summary_writer.add_scalar(
                        valid_label, score, global_step=step, walltime=t
                    )

            train_labels = sorted(
                [
                    key
                    for key in list(self.train_data[experiment_name].keys())
                    if "train" in key
                ]
            )
            for train_label in train_labels:
                scores, times, steps = self._train_vs_time_steps(
                    self.train_data[experiment_name][train_label]
                )
                for score, t, step in zip(scores, times, steps):
                    summary_writer.add_scalar(
                        train_label, score, global_step=step, walltime=t
                    )

            summary_writer.close()