Example #1
0
 def worker_devices(self, mode: str):
     """Return the list of devices on which `mode` workers should run.

     Reads `gpu_ids` from `self.config.machine_params(mode)`. If the list is
     empty, a single CPU device is used. If the first entry maps to the CPU
     device, all entries must be equal (i.e. all 'cpu'); otherwise all entries
     must be non-negative CUDA ordinals smaller than the visible device count.

     # Parameters
     mode : Mode string understood by `self.config.machine_params`
         (e.g. "train", "valid", "test").

     # Returns
     A non-empty list of devices: `torch.device` objects in the CPU case,
     otherwise the gpu ids exactly as given by the config.
     """
     # Note: Avoid instantiating preprocessors in machine_params (use Builder if needed)
     devices = self.config.machine_params(mode)["gpu_ids"]
     if len(devices) > 0:
         if torch.device(devices[0]) == torch.device("cpu"):
             # CPU path: mixing 'cpu' with gpu ids is not allowed.
             assert all_equal(
                 devices
             ), "Specified devices {} must be all non-negative integers or all equal to 'cpu'".format(
                 devices
             )
             devices = [torch.device(d) for d in devices]
         else:
             assert all(
                 [gpu_id >= 0 for gpu_id in devices]
             ), "all gpu_ids must be >= 0"
             # Fixed assertion message: the previous wording ("{} CUDA devices
             # available for requested ...") read as a success statement even
             # when the assertion failed.
             assert torch.cuda.device_count() > max(
                 set(devices)
             ), "Only {} CUDA devices available but {} requested gpu ids {}".format(
                 torch.cuda.device_count(), mode, devices
             )
     else:
         # No gpu ids configured: fall back to a single CPU worker.
         devices = [torch.device("cpu")]
     get_logger().info(
         "Using {} {} workers on devices {}".format(len(devices), mode, devices)
     )
     return devices
Example #2
0
    def process_test_packages(
        self,
        log_writer: Optional[SummaryWriter],
        pkgs: List[LoggingPackage],
        all_results: Optional[List[Any]] = None,
    ):
        """Aggregate test-mode logging packages from all workers.

        Merges the per-worker metric trackers, writes mean scalars and the
        evaluated-task count to `log_writer` (when given), optionally appends
        the aggregated results to `all_results`, logs a one-line summary, and
        forwards any visualization data to `self.visualizer`.
        """
        mode = pkgs[0].mode
        assert mode == "test"

        training_steps = pkgs[0].training_steps

        tracker = ScalarMeanTracker()
        task_metric_dicts: List[Any] = []
        render = {}
        checkpoint_names = []
        for pkg in pkgs:
            tracker.add_scalars(
                scalars=pkg.metrics_tracker.means(), n=pkg.metrics_tracker.counts()
            )
            task_metric_dicts.extend(pkg.metric_dicts)
            if pkg.viz_data is not None:
                render.update(pkg.viz_data)
            checkpoint_names.append(pkg.checkpoint_file_name)

        # Every worker must have evaluated the same checkpoint.
        assert all_equal(checkpoint_names)

        message_parts = [f"{mode} {training_steps} steps:"]

        means = tracker.means()
        for key in sorted(means):
            if log_writer is not None:
                log_writer.add_scalar(f"{mode}/{key}", means[key], training_steps)
            message_parts.append(key + " {:.3g}".format(means[key]))

        if all_results is not None:
            aggregated = copy.deepcopy(means)
            aggregated["training_steps"] = training_steps
            aggregated["tasks"] = task_metric_dicts
            all_results.append(aggregated)

        num_tasks = sum(pkg.num_non_empty_metrics_dicts_added for pkg in pkgs)
        if log_writer is not None:
            log_writer.add_scalar(f"{mode}/num_tasks_evaled", num_tasks, training_steps)

        message_parts.append(
            "tasks {} checkpoint {}".format(num_tasks, checkpoint_names[0])
        )
        get_logger().info(" ".join(message_parts))

        if self.visualizer is not None:
            self.visualizer.log(
                log_writer=log_writer,
                task_outputs=task_metric_dicts,
                render=render,
                num_steps=training_steps,
            )
Example #3
0
    def worker_devices(self, mode: str):
        """Return the devices assigned to `mode` workers.

        # Parameters
        mode : Mode string understood by `self.config.machine_params`.

        # Returns
        The `devices` list from the resolved `MachineParams`.

        # Raises
        AssertionError : If the devices mix CPU and GPU entries.
        """
        machine_params: MachineParams = MachineParams.instance_from(
            self.config.machine_params(mode)
        )
        devices = machine_params.devices

        # A CPU `torch.device` has `index is None`, so the bare `d.index >= 0`
        # comparison raised `TypeError: '>=' not supported between 'NoneType'
        # and 'int'` instead of the intended AssertionError whenever CPU and
        # GPU devices were mixed. Guard on `index is not None` first.
        assert all_equal(devices) or all(
            d.index is not None and d.index >= 0 for d in devices
        ), f"Cannot have a mix of CPU and GPU devices (`devices == {devices}`)"

        get_logger().info(
            "Using {} {} workers on devices {}".format(len(devices), mode, devices)
        )
        return devices