def test_submit_models(self):
    """Submit models to a CGO execution engine and relay trial metrics to the advisor.

    The models come from ``_load_mnist(2)`` and are submitted to a
    ``CGOExecutionEngine`` built with four ``GPUDevice("test", i)`` entries.
    The trial-execution part only runs when CUDA is available with at least
    two devices; the advisor/engine shutdown at the end always runs.
    """
    _reset()
    nni.retiarii.debug_configs.framework = 'pytorch'
    os.makedirs('generated', exist_ok=True)
    from nni.runtime import protocol
    import nni.runtime.platform.test as tt

    # The same file backs both protocol directions, so data sent through the
    # protocol can be read back below with protocol.receive().
    # NOTE(review): these handles are handed to `protocol` and never closed
    # here -- confirm whether `protocol` is expected to own them.
    protocol._set_out_file(open('generated/debug_protocol_out_file.py', 'wb'))
    protocol._set_in_file(open('generated/debug_protocol_out_file.py', 'rb'))

    models = _load_mnist(2)

    advisor = RetiariiAdvisor()
    cgo_engine = CGOExecutionEngine(
        devices=[GPUDevice("test", gpu_idx) for gpu_idx in range(4)],
        batch_waiting_time=0,
    )
    set_execution_engine(cgo_engine)
    submit_models(*models)
    time.sleep(3)

    if torch.cuda.is_available() and torch.cuda.device_count() >= 2:
        _, data = protocol.receive()
        params = nni.load(data)
        tt.init_params(params)

        trial_thread = threading.Thread(target=CGOExecutionEngine.trial_execute_graph)
        trial_thread.start()

        last_metric = None
        while True:
            time.sleep(1)
            if tt._last_metric:
                metric = tt.get_last_metric()
                if metric != last_metric:
                    if 'value' in metric:
                        metric['value'] = json.dumps(metric['value'])
                    advisor.handle_report_metric_data(metric)
                    last_metric = metric
            # Checked on every pass, including passes where the metric is
            # unchanged; otherwise the loop could spin forever once the trial
            # thread has exited and no new metric arrives.
            if not trial_thread.is_alive():
                break
        trial_thread.join()

    advisor.stopping = True
    advisor.default_worker.join()
    advisor.assessor_worker.join()
    cgo_engine.join()
def _construct_devices(self):
    """Build the list of ``GPUDevice`` objects described by ``self.config.training_service``.

    If the training service has a ``machine_list`` attribute, one device is
    created per entry of each machine's ``gpu_indices``, tagged with that
    machine's ``host``. Otherwise the training service's own ``gpu_indices``
    are used and every device is tagged ``'local'``.

    Returns:
        A list of ``GPUDevice`` instances, in configuration order.
    """
    training_service = self.config.training_service
    if hasattr(training_service, 'machine_list'):
        devices = []
        for machine in training_service.machine_list:
            devices.extend(GPUDevice(machine.host, gpu_idx) for gpu_idx in machine.gpu_indices)
        return devices
    return [GPUDevice('local', gpu_idx) for gpu_idx in training_service.gpu_indices]
def _construct_devices(self):
    """Collect a ``GPUDevice`` for every GPU index of every configured machine.

    Only training services exposing ``machine_list`` contribute devices; for
    any other training service the result is an empty list.

    Returns:
        A list of ``GPUDevice(machine.host, gpu_idx)`` in configuration order.

    Raises:
        AssertionError: if a machine's ``gpu_indices`` is ``None``.
    """
    training_service = self.config.training_service
    collected = []
    if not hasattr(training_service, 'machine_list'):
        return collected

    for machine in training_service.machine_list:
        assert machine.gpu_indices is not None, \
            'gpu_indices must be set in RemoteMachineConfig for CGO execution engine'
        collected.extend(GPUDevice(machine.host, gpu_idx) for gpu_idx in machine.gpu_indices)
    return collected
def test_dedup_input_two_devices(self):
    """Check that assembling the optimized plan on two devices yields two physical models.

    ``DedupInputOptimizer.convert`` is applied to the plan returned by
    ``self._build_logical_with_mnist(3)``; the plan is then assembled by a
    ``CGOExecutionEngine`` that has two ``GPUDevice("test", i)`` entries.
    """
    _reset()
    lp, _models = self._build_logical_with_mnist(3)
    DedupInputOptimizer().convert(lp)

    advisor = RetiariiAdvisor()
    available_devices = [GPUDevice("test", 0), GPUDevice("test", 1)]
    cgo = CGOExecutionEngine(devices=available_devices, batch_waiting_time=0)
    try:
        phy_models = cgo._assemble(lp)
        # assertEqual reports the actual count on failure, unlike assertTrue.
        self.assertEqual(len(phy_models), 2)
    finally:
        # Shut the advisor workers and the engine down even when assembling
        # or the assertion fails, in the same order as on the success path.
        advisor.stopping = True
        advisor.default_worker.join()
        advisor.assessor_worker.join()
        cgo.join()
def _construct_devices(self, training_service):
    """Collect a ``GPUDevice`` for every GPU index of every machine in ``training_service``.

    Args:
        training_service: Training-service config. It contributes devices only
            if it has a ``machine_list`` attribute, in which case it is
            treated as a ``RemoteConfig``.

    Returns:
        A list of ``GPUDevice(machine.host, gpu_idx)`` in configuration order;
        empty when ``training_service`` has no ``machine_list``.

    Raises:
        AssertionError: if a machine's ``gpu_indices`` is ``None`` or is not
            a list.
    """
    if not hasattr(training_service, 'machine_list'):
        return []

    remote_config = cast(RemoteConfig, training_service)
    found = []
    for machine in remote_config.machine_list:
        indices = machine.gpu_indices
        assert indices is not None, \
            'gpu_indices must be set in RemoteMachineConfig for CGO execution engine'
        assert isinstance(indices, list), 'gpu_indices must be a list'
        found.extend(GPUDevice(machine.host, gpu_idx) for gpu_idx in indices)
    return found