def worker_devices(self, mode: str):
    """Return the devices to use for `mode` workers from the experiment config's `machine_params`."""
    # Note: Avoid instantiating preprocessors in machine_params (use Builder if needed)
    devices = self.config.machine_params(mode)["gpu_ids"]
    if len(devices) > 0:
        if torch.device(devices[0]) == torch.device("cpu"):
            assert all_equal(
                devices
            ), "Specified devices {} must be all non-negative integers or all equal to 'cpu'".format(
                devices
            )
            devices = [torch.device(d) for d in devices]
        else:
            assert all(
                [gpu_id >= 0 for gpu_id in devices]
            ), "all gpu_ids must be >= 0"
            assert torch.cuda.device_count() > max(
                set(devices)
            ), "{} CUDA devices available for requested {} gpu ids {}".format(
                torch.cuda.device_count(), mode, devices
            )
    else:
        devices = [torch.device("cpu")]
    get_logger().info(
        "Using {} {} workers on devices {}".format(len(devices), mode, devices)
    )
    return devices
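# `all_equal` is used above but not defined in this section. A minimal sketch of
# what it is assumed to do (check that every element of a sequence equals the
# first) is given below for illustration only; the real helper may differ.
def _all_equal_sketch(values) -> bool:
    """Return True if `values` is empty or every element equals the first one."""
    return len(values) == 0 or all(v == values[0] for v in values)


# Under that assumption, `["cpu", "cpu"]` passes the first assert above while
# `["cpu", 0]` does not; for integer gpu ids such as `[0, 1, 1]` the second
# branch only checks that every id is a non-negative, available CUDA index.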
def process_test_packages(
    self,
    log_writer: Optional[SummaryWriter],
    pkgs: List[LoggingPackage],
    all_results: Optional[List[Any]] = None,
):
    """Aggregate test-mode logging packages from all workers, log scalar metrics, and optionally append the aggregated results to `all_results`."""
    mode = pkgs[0].mode
    assert mode == "test"

    training_steps = pkgs[0].training_steps

    all_metrics_tracker = ScalarMeanTracker()
    metric_dicts_list, render, checkpoint_file_name = [], {}, []
    for pkg in pkgs:
        all_metrics_tracker.add_scalars(
            scalars=pkg.metrics_tracker.means(), n=pkg.metrics_tracker.counts()
        )
        metric_dicts_list.extend(pkg.metric_dicts)
        if pkg.viz_data is not None:
            render.update(pkg.viz_data)
        checkpoint_file_name.append(pkg.checkpoint_file_name)

    assert all_equal(checkpoint_file_name)

    message = [f"{mode} {training_steps} steps:"]

    metric_means = all_metrics_tracker.means()
    for k in sorted(metric_means.keys()):
        if log_writer is not None:
            log_writer.add_scalar(f"{mode}/{k}", metric_means[k], training_steps)
        message.append(k + " {:.3g}".format(metric_means[k]))

    if all_results is not None:
        results = copy.deepcopy(metric_means)
        results.update({"training_steps": training_steps, "tasks": metric_dicts_list})
        all_results.append(results)

    num_tasks = sum([pkg.num_non_empty_metrics_dicts_added for pkg in pkgs])
    if log_writer is not None:
        log_writer.add_scalar(f"{mode}/num_tasks_evaled", num_tasks, training_steps)

    message.append(
        "tasks {} checkpoint {}".format(num_tasks, checkpoint_file_name[0])
    )
    get_logger().info(" ".join(message))

    if self.visualizer is not None:
        self.visualizer.log(
            log_writer=log_writer,
            task_outputs=metric_dicts_list,
            render=render,
            num_steps=training_steps,
        )
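# `ScalarMeanTracker` is assumed to keep count-weighted running means per scalar
# key: `add_scalars(scalars, n)` folds in a batch of per-key means with their
# counts, and `means()` / `counts()` read the aggregates back out. The sketch
# below only illustrates that assumed behavior; it is not the project's
# implementation.
from collections import defaultdict
from typing import Dict, Union


class _ScalarMeanTrackerSketch:
    def __init__(self) -> None:
        self._sums: Dict[str, float] = defaultdict(float)
        self._counts: Dict[str, int] = defaultdict(int)

    def add_scalars(
        self, scalars: Dict[str, float], n: Union[int, Dict[str, int]] = 1
    ) -> None:
        # Each incoming value is treated as a mean over `n` samples, so it is
        # re-weighted by its count before being added to the running sum.
        for k, v in scalars.items():
            count = n[k] if isinstance(n, dict) else n
            self._sums[k] += v * count
            self._counts[k] += count

    def means(self) -> Dict[str, float]:
        return {k: self._sums[k] / self._counts[k] for k in self._sums}

    def counts(self) -> Dict[str, int]:
        return dict(self._counts)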
def worker_devices(self, mode: str):
    machine_params: MachineParams = MachineParams.instance_from(
        self.config.machine_params(mode)
    )
    devices = machine_params.devices

    assert all_equal(devices) or all(
        d.index >= 0 for d in devices
    ), f"Cannot have a mix of CPU and GPU devices (`devices == {devices}`)"

    get_logger().info(
        "Using {} {} workers on devices {}".format(len(devices), mode, devices)
    )
    return devices
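# Hypothetical illustration (not part of the runner): the assertion above lets a
# device list through when it is either entirely identical (e.g. all CPU) or
# made up solely of non-negative GPU indices, and rejects a CPU/GPU mix. The
# helper below mirrors that condition under the assumption that `all_equal`
# simply checks that all entries are equal; it adds an explicit `type == "cuda"`
# check so CPU devices (whose `.index` is None) are never compared against 0.
def _devices_are_homogeneous_sketch(devices) -> bool:
    return all(d == devices[0] for d in devices) or all(
        d.type == "cuda" and d.index >= 0 for d in devices
    )


# For example (no GPU is needed just to construct the device objects):
#   _devices_are_homogeneous_sketch([torch.device("cpu")] * 2)                        -> True
#   _devices_are_homogeneous_sketch([torch.device("cuda:0"), torch.device("cuda:1")]) -> True
#   _devices_are_homogeneous_sketch([torch.device("cpu"), torch.device("cuda:0")])    -> False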