Example #1
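Spawns an inference server in a child process over a multiprocessing pipe, submits ten random 3D tensors through an IInference client, and waits for each returned future.
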
def test_inference3d_in_proc(tiny_model_3d, log_queue):
    config = tiny_model_3d["config"]
    in_channels = config["input_channels"]
    model = TinyConvNet3d(in_channels=in_channels)
    handler_conn, inference_conn = mp.Pipe()
    p = mp.Process(target=run,
                   kwargs={
                       "conn": inference_conn,
                       "model": model,
                       "config": config,
                       "log_queue": log_queue
                   })
    p.start()
    client = create_client(IInference, handler_conn)
    try:
        client.set_devices([torch.device("cpu")])
        f = []
        n = 10
        for i in range(n):
            data = TikTensor(torch.rand(in_channels, 15, 15, 15))
            f.append(client.forward(data))

        for i in range(n):
            f[i].result(timeout=10)
            print("received ", i)
    finally:
        client.shutdown()
Example #2
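Runs a training server in a child process, uploads a small labeled TikTensorBatch to its training dataset, and resumes training on the CPU.
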
def test_training_in_proc(tiny_model_2d, log_queue):
    config = tiny_model_2d["config"]
    config["num_iterations_per_update"] = 10
    in_channels = config["input_channels"]
    model = TinyConvNet2d(in_channels=in_channels)
    handler_conn, training_conn = mp.Pipe()
    p = mp.Process(target=run,
                   kwargs={
                       "conn": training_conn,
                       "model": model,
                       "config": config,
                       "log_queue": log_queue
                   })
    p.start()
    client = create_client(ITraining, handler_conn)
    try:
        client.set_devices([torch.device("cpu")])
        data = TikTensorBatch([
            TikTensor(torch.zeros(in_channels, 15, 15), ((1, ), (1, ))),
            TikTensor(torch.ones(in_channels, 9, 9), ((2, ), (2, ))),
        ])
        labels = TikTensorBatch([
            TikTensor(torch.ones(in_channels, 15, 15, dtype=torch.uint8),
                      ((1, ), (1, ))),
            TikTensor(torch.full((in_channels, 9, 9), 2, dtype=torch.uint8),
                      ((2, ), (2, ))),
        ])
        client.update_dataset("training", data, labels)
        client.resume_training()
    finally:
        client.shutdown()
Example #3
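Server-side method that validates the model config, launches the handler process over a pipe, and returns a future with the device-setup result; start-up and set_devices failures are returned as futures with the exception already set.
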
    def load_model(self, model: Model, state: ModelState,
                   devices: list) -> RPCFuture[SetDeviceReturnType]:
        log_dir = model.config.get(LOGGING, {}).get(DIRECTORY, "")
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
            self.logger.info("log dir: %s", os.path.abspath(log_dir))

        self._start_logging_handler()
        incomplete_msg = get_error_msg_for_incomplete_config(model.config)
        if incomplete_msg:
            raise ValueError(incomplete_msg)

        # todo: move test_transforms elsewhere
        self.test_transforms = model.config.get(TESTING, {}).get(
            TRANSFORMS, {"Normalize": {}})

        if not devices:
            devices = ["cpu"]

        cuda_visible_devices, handler_devices = self.get_cuda_and_handler_device_names(
            devices)

        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cuda_visible_devices)
        self.logger.info("Set CUDA_VISIBLE_DEVICES to '%s'",
                         os.environ["CUDA_VISIBLE_DEVICES"])

        server_conn, handler_conn = mp.Pipe()
        p = mp.Process(
            target=run_handler,
            name="Handler",
            kwargs={
                "conn": handler_conn,
                "config": model.config,
                "model_file": model.code,
                "model_state": state.model_state,
                "optimizer_state": state.optimizer_state,
                "log_queue": self.log_queue,
            },
        )
        try:
            p.start()
        except Exception as e:
            self.logger.error(e)
            err_fut = RPCFuture()
            err_fut.set_exception(e)
            return err_fut
        else:
            self.handler = create_client(IHandler, server_conn)
            try:
                tik_fut = self.handler.set_devices(handler_devices)
            except Exception as e:
                self.logger.exception("set_devices failed")
                err_fut = RPCFuture()
                err_fut.set_exception(e)
                return err_fut
            else:
                self.logger.info("got tik_fut")
                fut = tik_fut.map(convert_to_SetDeviceReturnType)
                self.logger.info("converted tik_fut")
                return fut
Example #4
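Closure helper that spawns an RPC server of the given class in a child process and returns a connected client; data and log_queue come from the enclosing scope.
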
    def _spawn(iface_cls, srv_cls):
        child, parent = mp.Pipe()

        p = mp.Process(target=_run_srv, args=(srv_cls, parent, log_queue))
        p.start()

        data["client"] = client = create_client(iface_cls, child)
        data["process"] = p
        return client
Example #5
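Starts a dedicated model-session process and returns both the process handle and an IRPCModelSession client connected to it over a pipe.
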
def start_model_session_process(
    model_zip: bytes, devices: List[str], log_queue: Optional[_mp.Queue] = None
) -> Tuple[_mp.Process, IRPCModelSession]:
    client_conn, server_conn = _mp.Pipe()
    proc = _mp.Process(
        target=_run_model_session_process,
        name="ModelSessionProcess",
        kwargs={"conn": server_conn, "devices": devices, "log_queue": log_queue, "model_zip": model_zip},
    )
    proc.start()
    return proc, _mp_rpc.create_client(IRPCModelSession, client_conn)
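A hypothetical usage sketch for this helper; the import path, the archive name, and the shutdown call are assumptions modeled on the other examples here, not part of the snippet above:

from tiktorch.server.session.process import start_model_session_process  # assumed import path

with open("model.zip", "rb") as f:  # hypothetical model archive
    model_zip = f.read()

proc, session = start_model_session_process(model_zip=model_zip, devices=["cpu"])
try:
    pass  # drive the model through the IRPCModelSession proxy here
finally:
    session.shutdown()  # assumed to mirror client.shutdown() in the other examples
    proc.join(timeout=20)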
Example #6
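Pytest fixture that yields an ITestApi client backed by a server child process and tears both down after the test.
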
def client(log_queue):
    child, parent = mp.Pipe()

    p = mp.Process(target=_srv, args=(parent, log_queue))
    p.start()

    client = create_client(ITestApi, child, timeout=10)

    yield client

    client.shutdown()
    p.join()
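The _srv target itself is not shown. A minimal sketch of what such an entry point could look like, assuming tiktorch's MPServer and a hypothetical TestApi implementation of ITestApi (both assumptions, not taken from this example):

from tiktorch.rpc.mp import MPServer  # assumed import path

class TestApi(ITestApi):  # hypothetical implementation of the interface used above
    def compute(self, a, b):
        return a + b

def _srv(conn, log_queue):
    # serve ITestApi calls arriving on conn until a shutdown request ends the loop
    MPServer(TestApi(), conn).listen()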
Example #7
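Fixture for the 3D tiny model: starts the handler process, puts it on the CPU, yields an IHandler client, and shuts the handler down asynchronously on teardown.
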
def client3d(tiny_model_3d, log_queue):
    client_conn, handler_conn = mp.Pipe()

    p = mp.Process(target=run_handler,
                   name="Handler",
                   kwargs={
                       "conn": handler_conn,
                       **tiny_model_3d,
                       "log_queue": log_queue,
                   })
    p.start()

    cl = create_client(IHandler, client_conn)
    cl.set_devices(["cpu"])
    yield cl

    cl.shutdown.async_().result(timeout=30)
    p.join(timeout=20)
Example #8
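2D counterpart of Example #1: a single zero tensor is pushed through the IInference client and its future is awaited.
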
def test_inference2d_in_proc(tiny_model_2d, log_queue):
    config = tiny_model_2d["config"]
    in_channels = config["input_channels"]
    model = TinyConvNet2d(in_channels=in_channels)
    handler_conn, inference_conn = mp.Pipe()
    p = mp.Process(target=run,
                   kwargs={
                       "conn": inference_conn,
                       "model": model,
                       "config": config,
                       "log_queue": log_queue
                   })
    p.start()
    client = create_client(IInference, handler_conn)
    try:
        client.set_devices([torch.device("cpu")])
        data = TikTensor(torch.zeros(in_channels, 15, 15), (0, ))
        f = client.forward(data)
        f.result(timeout=10)
    finally:
        client.shutdown()
Example #9
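Exercises client-side timeouts: with timeout=0.001 both synchronous calls and unbounded result() waits raise TimeoutError, while an explicit per-call timeout still succeeds.
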
def test_future_timeout(client: ITestApi, log_queue):
    child, parent = mp.Pipe()

    p = mp.Process(target=_srv, args=(parent, log_queue))
    p.start()

    client = create_client(ITestApi, child, timeout=0.001)  # shadows the injected client fixture with a 1 ms-timeout client

    with pytest.raises(TimeoutError):
        client.compute(1, 2)

    with pytest.raises(TimeoutError):
        client.compute.async_(1, 2).result()

    with pytest.raises(TimeoutError):
        client.compute_fut(1, 2).result()

    client.compute.async_(1, 2).result(timeout=3)

    client.shutdown()
    p.join()
Example #10
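Reproduces a race between sending a request and resolving its future by wrapping the connection in a proxy whose send() blocks after transmitting.
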
def test_race_condition(log_queue):
    class SlowConn:
        def __init__(self, conn):
            self._conn = conn

        def send(self, *args):
            self._conn.send(*args)
            # Block so future will be resolved earlier than we return value
            time.sleep(0.5)

        def __getattr__(self, name):
            return getattr(self._conn, name)

    child, parent = mp.Pipe()

    p = mp.Process(target=_srv, args=(parent, log_queue))
    p.start()

    client = create_client(ITestApi, SlowConn(child))

    client.fast_compute(2, 2)

    client.shutdown().result()
Example #11
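Handler constructor: validates the config, imports the user model from the uploaded module bytes, restores its state, and spawns dry-run, training, and inference child processes plus a device-setter thread.
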
    def __init__(
        self,
        config: dict,
        model_file: bytes,
        model_state: bytes,
        optimizer_state: bytes,
        log_queue: Optional[mp.Queue] = None,
    ) -> None:
        """
        :param config: configuration dict
        :param model_file: bytes of file describing the neural network model
        :param model_state: binarized model state dict
        :param optimizer_state: binarized optimizer state dict
        :param log_queue: queue to which this process and its children send log records
        """
        assert model_file
        for required in [MODEL_CLASS_NAME]:
            if required not in config:
                raise ValueError(f"{required} missing in config")

        self.config = config

        self.shutdown_event = threading.Event()

        self.logger = logging.getLogger(__name__)
        self.logger.info("started")
        self.valid_shapes: Optional[List[Point]] = None
        self.shrinkage: Optional[Point] = None
        self.idle_devices: List[torch.device] = []
        self.training_devices: List[torch.device] = []
        self.inference_devices: List[torch.device] = []

        self.tempdir = tempfile.mkdtemp()
        user_module_name = "usermodel"
        with open(os.path.join(self.tempdir, user_module_name + ".py"),
                  "wb") as f:
            f.write(model_file)

        sys.path.insert(0, self.tempdir)
        user_module = importlib.import_module(user_module_name)

        model_class = getattr(user_module, self.config[MODEL_CLASS_NAME])
        self.model: torch.nn.Module = model_class(**self.config.get(MODEL_INIT_KWARGS, {}))
        self.logger.debug("created user model")

        if model_state:
            self.logger.debug("load model state")
            try:
                self.model.load_state_dict(
                    torch.load(io.BytesIO(model_state), map_location="cpu"))
            except Exception as e:
                self.logger.exception(e)
            else:
                self.logger.info("restored model state")

        try:
            self.logger.debug("start dryrun process")
            handler2dryrun_conn, dryrun2handler_conn = mp.Pipe()
            self._dry_run_proc = mp.Process(
                name="DryRun",
                target=run_dryrun,
                kwargs={
                    "conn": dryrun2handler_conn,
                    "config": config,
                    "model": self.model,
                    "log_queue": log_queue
                },
            )
            self._dry_run_proc.start()
            self._dry_run: IDryRun = create_client(IDryRun,
                                                   handler2dryrun_conn)

            self.logger.debug("start training process")
            handler2training_conn, training2handler_conn = mp.Pipe()
            self._training_proc = mp.Process(
                target=run_training,
                name="Training",
                kwargs={
                    "conn": training2handler_conn,
                    "config": config,
                    "model": self.model,
                    "optimizer_state": optimizer_state,
                    "log_queue": log_queue,
                },
            )
            self._training_proc.start()
            self._training: ITraining = create_client(ITraining,
                                                      handler2training_conn)

            self.logger.debug("start inference process")
            handler2inference_conn, inference2handler_conn = mp.Pipe()
            self._inference_proc = mp.Process(
                target=run_inference,
                name="Inference",
                kwargs={
                    "conn": inference2handler_conn,
                    "config": config,
                    "model": self.model,
                    "log_queue": log_queue
                },
            )
            self._inference_proc.start()
            self._inference: IInference = create_client(
                IInference, handler2inference_conn)

            # start the device-setter thread, which waits for dry-run processes to finish
            self.new_device_names: queue.Queue = queue.Queue()
            self.device_setter_thread = threading.Thread(
                target=add_logger(self.logger)(self._device_setter_worker),
                name="DeviceSetter",
            )
            self.device_setter_thread.start()
        except Exception as e:
            self.logger.exception(e)
            self.shutdown()