Exemplo n.º 1
0
 def terminate(self):
     """Abort the running training task and roll the job/model status back.

     Does nothing when no task has been started (``self._task`` is falsy).
     Otherwise a ``KeyboardInterrupt`` is set as the task's exception, the
     training job is recorded with ``Status.FAIL``, and the model's
     ``ModelStatus.TRAINING`` entry is swapped for ``ModelStatus.DRAFT``.
     """
     task = self._task
     if not task:
         return

     # Setting a KeyboardInterrupt on the task is meant to trigger PyTorch
     # Lightning's graceful shutdown, the same way a ^C would.
     task.set_exception(KeyboardInterrupt())

     failed_update = TrainingJobUpdate(_id=self._id, status=Status.FAIL)
     model_train_curd.update(failed_update)

     # Move the model from TRAINING back to DRAFT and persist the change.
     trained_model = ModelService.get_model_by_id(self.model_id)
     status_list = trained_model.model_status
     status_list.remove(ModelStatus.TRAINING)
     status_list.append(ModelStatus.DRAFT)
     ModelService.update_model(trained_model)
Exemplo n.º 2
0
    def start(self):
        """Submit the training run to the executor and mark job and model as in progress.

        ``self.trainer_engine.fit`` is submitted to ``self._executor`` with
        ``self.model`` and ``self._data_loader_kwargs``; the resulting future is
        kept in ``self._task``. The training job is then recorded as
        ``Status.RUNNING`` and the model's ``ModelStatus.DRAFT`` entry is replaced
        by ``ModelStatus.TRAINING``.

        When the future completes, the job is recorded as ``Status.PASS`` and the
        exported model is printed, unless the future was cancelled or ended with
        an exception, in which case the job is recorded as ``Status.FAIL``.
        """
        def training_done_callback(future):
            # Done-callbacks fire for every completion, not only successful ones.
            # `cancelled()` is checked first because `exception()` raises
            # CancelledError on a cancelled future.
            if future.cancelled() or future.exception() is not None:
                model_train_curd.update(TrainingJobUpdate(_id=self._id, status=Status.FAIL))
                return

            model_train_curd.update(TrainingJobUpdate(_id=self._id, status=Status.PASS))
            # TODO: save to database and update model_status, engine
            print(self.export_model())

        self._task = self._executor.submit(self.trainer_engine.fit, self.model, **self._data_loader_kwargs)
        self._task.add_done_callback(training_done_callback)
        # NOTE(review): if the task finishes before this line runs, the RUNNING
        # update below overwrites the final PASS/FAIL status - confirm whether
        # that ordering can occur in practice.
        model_train_curd.update(TrainingJobUpdate(_id=self._id, status=Status.RUNNING))

        # Flag the model as being trained and persist the change.
        model_bo = ModelService.get_model_by_id(self.model_id)
        model_bo.model_status.remove(ModelStatus.DRAFT)
        model_bo.model_status.append(ModelStatus.TRAINING)
        ModelService.update_model(model_bo)
Exemplo n.º 3
0
    def from_training_job(cls, training_job: TrainingJob) -> 'PyTorchTrainer':
        """Build a trainer from a training job description.

        The model referenced by ``training_job.model`` is looked up, its weights
        are fetched to a local cache and loaded with ``torch.load``, and the
        network is passed through ``freeze``. A ``FineTuneModule`` and a
        ``PyTorchDataModule`` are then created from the job's settings and
        handed to ``cls``.

        Args:
            training_job (TrainingJob): Job description providing the model
                reference, loss function, data module settings, optimizer and
                LR-scheduler properties, and epoch limits.

        Returns:
            PyTorchTrainer: The object returned by ``cls(...)``.

        Raises:
            ValueError: If the referenced model's engine is not ``Engine.PYTORCH``.
        """
        # TODO: only support fine-tune

        model_bo = ModelService.get_model_by_id(training_job.model)
        if model_bo.engine != Engine.PYTORCH:
            raise ValueError(
                f'Model engine expected `{Engine.PYTORCH}`, but got {model_bo.engine}.'
            )

        # Fetch the weights into the local cache, then load and freeze the network.
        cache_path = get_remote_model_weight(model_bo)
        net = torch.load(cache_path)
        freeze(module=net, n=-1, train_bn=True)

        # Keyword arguments for the PyTorch Lightning module.
        # The loss is instantiated by evaluating the job's loss-function string;
        # `eval` runs with this frame's globals and locals.
        fine_tune_module_kwargs = dict(
            net=net,
            loss=eval(str(training_job.loss_function))(),  # nosec
            batch_size=training_job.data_module.batch_size,
            num_workers=training_job.data_module.num_workers,
        )

        # Optional hyperparameters are forwarded only when they are truthy.
        optional_hparams = (
            ('lr', training_job.optimizer_property.lr),
            ('lr_scheduler_gamma', training_job.lr_scheduler_property.gamma),
            ('step_size', training_job.lr_scheduler_property.step_size),
        )
        for hparam_name, hparam_value in optional_hparams:
            if hparam_value:
                fine_tune_module_kwargs[hparam_name] = hparam_value

        lightning_module = FineTuneModule(**fine_tune_module_kwargs)

        data_module_settings = training_job.data_module.dict(exclude_none=True)
        lightning_data = PyTorchDataModule(**data_module_settings)

        # Epoch limits from the job are applied last so they are merged on top
        # of the fixed trainer options.
        epoch_limits = training_job.dict(
            exclude_none=True, include={'min_epochs', 'max_epochs'})
        engine_options = {
            'default_root_dir': training_job.data_module.data_dir or OUTPUT_DIR,
            'weights_summary': None,
            'progress_bar_refresh_rate': 1,
            'num_sanity_val_steps': 0,
            'gpus': 1,  # TODO: set GPU number
        }
        engine_options.update(epoch_limits)

        return cls(
            id=training_job.id,
            model=lightning_module,
            data_loader_kwargs={'datamodule': lightning_data},
            trainer_kwargs=engine_options,
        )
Exemplo n.º 4
0
def generate_model_graph(*, id: str):  # noqa
    """Render a model's computation graph and return it as text.

    Args:
        id (str): Model object ID.

    Returns:
        dict: ``{'dot': <graph as string>}``. The string is empty when the
            model's engine is not ``Engine.PYTORCH``.
    """
    model_bo = ModelService.get_model_by_id(id)
    if model_bo.engine != Engine.PYTORCH:
        # Only PyTorch models are rendered; everything else yields an empty graph.
        return {'dot': ''}

    network = torch.load(model_bo.saved_path)

    # Zero-filled dummy input with a batch dimension of 1; the remaining
    # dimensions come from the model's first registered input.
    trailing_dims = model_bo.inputs[0].shape[1:]
    dummy_input = torch.zeros(1, *trailing_dims, dtype=torch.float, requires_grad=False)
    prediction = network(dummy_input)

    # The graph is labelled with the named parameters plus the input as 'x'.
    graph_params = dict(network.named_parameters())
    graph_params['x'] = dummy_input

    return {'dot': str(make_dot(prediction, params=graph_params))}
Exemplo n.º 5
0
async def get_model_structure(id: str):  # noqa
    """
    Return the structure graph of a model (layers as nodes, connections between layers as edges).

    Arguments:
        id (str): Model object ID.

    Returns:
        The result of `Structure.from_model` applied to the loaded network.

    Raises:
        ValueError: If the model's engine is not `Engine.PYTORCH`.
    """
    model_bo = ModelService.get_model_by_id(id)
    if model_bo.engine != Engine.PYTORCH:
        reason = (f'model {id} is not supported for editing. '
                  f'Currently only support model with engine=PYTORCH')
        raise ValueError(reason)

    # Fetch the weights into the local cache, then build the DAG from the loaded network.
    weight_file = get_remote_model_weight(model=model_bo)
    return Structure.from_model(torch.load(weight_file))
Exemplo n.º 6
0
def test_get_model_by_id():
    """Looking a model up by ID returns a model carrying that same ID."""
    # Take the first model returned for 'ResNet50' as the reference.
    reference = ModelService.get_models('ResNet50')[0]
    fetched = ModelService.get_model_by_id(reference.id)

    assert fetched.id == reference.id
Exemplo n.º 7
0
def update_finetune_model_as_new(id: str,
                                 updated_layer: Structure,
                                 dry_run: bool = False):  # noqa
    """
    Temporary function for finetune CV models. The function's functionality is overlapped with
    `update_model_structure_as_new`. Please use the `update_model_structure_as_new` in next release.

    The model's network is loaded, every layer listed in `updated_layer` is rebuilt with the
    merged (old + provided) constructor arguments, and the modified network is run once on
    random input to derive the new output shapes. Unless `dry_run` is set, the modified network
    is saved under a new version and registered as a new model.

    Examples:
        Fine-tune the model by modify the layer with name 'fc' (last layer). The layer
        has a changed argument out_features = 10. op_='M' indicates the operation to this layer ('fc')
        is 'Modify'. There is no changes in layer connections.
        Therefore, the structure change summary is
            [M] fc: (...) out_features=10

        >>> from collections import OrderedDict
        >>> structure_data = {
        ...     'layer': OrderedDict({'fc': {'out_features': 10, 'op_': 'M', 'type_': 'torch.nn.Linear'}})
        ... }
        >>> update_finetune_model_as_new(id=..., updated_layer=Structure.parse_obj(structure_data))

    Args:
        id (str): ID of the model to be updated.
        updated_layer (Structure): Contains layers to be fine-tuned.
        dry_run (bool): Test run for verify if the provided parameter (i.e. model specified in `id`
            and updated layers) is valid.

    Returns:
        `True` if `updated_layer` contains no layers; `{'id': <new model ID>}` after the new model
        has been registered; `None` in dry-run mode.

    Raises:
        ValueError: If the model's engine is not `Engine.PYTORCH`, or a layer requests an
            operation other than `Operation.MODIFY`.
        ModelStructureError: If a layer name is not an attribute of the network, or the provided
            layer type differs from the existing layer's type.
    """
    if not updated_layer.layer:
        return True
    model = ModelService.get_model_by_id(id)
    if model.engine != Engine.PYTORCH:
        raise ValueError(f'model {id} is not supported for editing. '
                         f'Currently only support model with engine=PYTORCH')
    # download model as local cache
    cache_path = get_remote_model_weight(model=model)
    net = torch.load(cache_path)

    for layer_name, layer_param in updated_layer.layer.items():
        layer_op = layer_param.op_

        # update layer
        if layer_op == Operation.MODIFY:

            # check if the layer name exists
            # TODO check if layer path exists eg."layer1.0.conv1"
            if not hasattr(net, layer_name):
                raise ModelStructureError(
                    f'Structure layer name `{layer_name}` not found in model {id}.'
                )
            net_layer = getattr(net, layer_name)

            # check if the provided type matches the original type
            layer_type = type(net_layer)
            # NOTE(review): `eval` on a caller-supplied type string; `type_` looks like an
            # enum restricted to known layer types - confirm it cannot carry arbitrary code.
            layer_type_provided = eval(layer_param.type_.value)  # nosec
            if layer_type is not layer_type_provided:
                raise ModelStructureError(
                    f'Expect `{layer_name}.type_` to be {layer_type}, '
                    f'but got {layer_type_provided}')

            # get layer parameters
            layer_param_old = layer_param.parse_layer_obj(net_layer)
            layer_param_data = layer_param_old.dict(exclude_none=True,
                                                    exclude={'type_', 'op_'})

            # replace 'null' with None. See reason :class:`ModelLayer`.
            layer_param_update_data = {
                k: (None if v == 'null' else v)
                for k, v in layer_param.dict(
                    exclude_none=True, exclude={'type_', 'op_'}).items()
            }

            # update the layer parameters: provided values override the existing ones,
            # then the layer is rebuilt from scratch and swapped into the network
            layer_param_data.update(layer_param_update_data)
            layer = layer_type(**layer_param_data)
            setattr(net, layer_name, layer)

        else:
            # if layer_op is Operation.ADD,
            #     1. check if the layer name not exists
            #     2. add a layer
            #     3. change the `forward` function according to the connections
            # if layer_op is Operation.DELETE,
            #     1. check if the layer exists
            #     2. delete the layer
            #     3. change the `forward` function
            raise ValueError(
                'Operation not permitted. Please use `update_model_structure_as_new`.'
            )

    # build one random input tensor per registered model input, with batch size `bs`
    bs = 1
    input_tensors = [
        torch.rand(bs, *input_.shape[1:]).type(
            model_data_type_to_torch(input_.dtype))
        for input_ in model.inputs
    ]

    # parse output tensors: run the modified network once to learn the new output shapes
    output_tensors = net(*input_tensors)
    if not isinstance(output_tensors, (list, tuple)):
        output_tensors = (output_tensors, )
    output_shapes = [
        IOShape(shape=[bs, *output_tensor.shape[1:]],
                dtype=type_to_data_type(output_tensor.dtype))
        for output_tensor in output_tensors
    ]

    if not dry_run:
        # TODO return validation result for dry_run mode
        # TODO apply Semantic Versioning https://semver.org/
        # TODO resolve duplicate model version problem in a more efficient way
        version = ModelVersion(model.version.ver + 1)
        previous_models = ModelService.get_models(
            architecture=model.architecture,
            task=model.task,
            framework=model.framework,
            engine=Engine.NONE)
        if previous_models:
            # continue numbering after the highest version already registered
            last_version = max(previous_models,
                               key=lambda k: k.version.ver).version.ver
            version = ModelVersion(last_version + 1)

        saved_path = generate_path_plain(architecture=model.architecture,
                                         task=model.task,
                                         framework=model.framework,
                                         engine=Engine.NONE,
                                         version=version)
        saved_path.parent.mkdir(parents=True, exist_ok=True)
        # Persist the modified network (`net`), not the model record (`model`) it was
        # loaded from; otherwise the fine-tuned layers are never written to disk.
        torch.save(net, saved_path.with_suffix('.pt'))
        # NOTE(review): the file is written with a '.pt' suffix while `weight` below receives
        # `saved_path` unchanged, and `architecture` is taken from `model.name` while the
        # lookup after registration uses `model.architecture` - confirm both are intended.
        mlmodelin = MLModel(dataset='',
                            metric={key: 0
                                    for key in model.metric.keys()},
                            task=model.task,
                            inputs=model.inputs,
                            outputs=output_shapes,
                            architecture=model.name,
                            framework=model.framework,
                            engine=Engine.NONE,
                            model_status=[ModelStatus.DRAFT],
                            parent_model_id=model.id,
                            version=version,
                            weight=saved_path)
        register_model(mlmodelin, convert=False, profile=False)

        # fetch the freshly registered model to report its ID
        model_bo = ModelService.get_models(architecture=model.architecture,
                                           task=model.task,
                                           framework=model.framework,
                                           engine=Engine.NONE,
                                           version=version)[0]

        return {'id': model_bo.id}
Exemplo n.º 8
0
def get_model(*, id: str):  # noqa
    """Look up a model by ID and return it converted with `ModelDetailOut.from_bo`.

    Args:
        id (str): Model object ID.
    """
    return ModelDetailOut.from_bo(ModelService.get_model_by_id(id))