Example #1
def test_model_log_with_databricks_runtime():
    dbr = "8.3.x-snapshot-gpu-ml-scala2.12"
    with TempDir(chdr=True) as tmp, mock.patch(
            "mlflow.models.model.get_databricks_runtime", return_value=dbr):
        sig = ModelSignature(
            inputs=Schema([ColSpec("integer", "x"),
                           ColSpec("integer", "y")]),
            outputs=Schema([ColSpec(name=None, type="double")]),
        )
        input_example = {"x": 1, "y": 2}
        local_path, r = _log_model_with_signature_and_example(
            tmp, sig, input_example)

        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        assert loaded_model.run_id == r.info.run_id
        assert loaded_model.artifact_path == "some/path"
        assert loaded_model.flavors == {
            "flavor1": {
                "a": 1,
                "b": 2
            },
            "flavor2": {
                "x": 1,
                "y": 2
            },
        }
        assert loaded_model.signature == sig
        path = os.path.join(
            local_path, loaded_model.saved_input_example_info["artifact_path"])
        x = _dataframe_from_json(path)
        assert x.to_dict(orient="records")[0] == input_example
        assert loaded_model.databricks_runtime == dbr
Example #2
def _dataframe_from_json(path_or_str,
                         schema: Schema = None,
                         pandas_orient: str = "split") -> pd.DataFrame:
    """
    Parse json into pandas.DataFrame. A schema can be passed to ensure correct type parsing and to
    make any necessary conversions (e.g. string -> binary for binary columns).

    :param path_or_str: Path to a json file or a json string.
    :param schema: MLflow schema used when parsing the data.
    :param pandas_orient: pandas data frame convention used to store the data.
    :return: pandas.DataFrame.
    """
    if schema is not None:
        dtypes = dict(zip(schema.column_names(), schema.column_types()))
        df = pd.read_json(path_or_str, orient=pandas_orient,
                          dtype=dtypes)[schema.column_names()]
        binary_cols = [
            i for i, x in enumerate(schema.column_types())
            if x == DataType.binary
        ]

        for i in binary_cols:
            col = df.columns[i]
            df[col] = np.array(df[col].map(_base64decode), dtype=np.bytes_)
        return df
    else:
        return pd.read_json(path_or_str, orient=pandas_orient, dtype=False)
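A minimal usage sketch for the function above (hypothetical data; a sketch only, not part of the original source): without a schema, pd.read_json is called with dtype=False, so pandas keeps the parsed types as-is.

import pandas as pd

# Round-trip a small frame through JSON without a schema.
df = pd.DataFrame({"x": [1, 2], "y": [0.5, 1.5]})
parsed = _dataframe_from_json(df.to_json(orient="split"))
assert parsed.equals(df)  # holds for plain int/float columns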
Example #3
def test_model_log():
    with TempDir(chdr=True) as tmp:
        sig = ModelSignature(
            inputs=Schema([ColSpec("integer", "x"),
                           ColSpec("integer", "y")]),
            outputs=Schema([ColSpec(name=None, type="double")]),
        )
        input_example = {"x": 1, "y": 2}
        local_path, r = _log_model_with_signature_and_example(
            tmp, sig, input_example)

        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        assert loaded_model.run_id == r.info.run_id
        assert loaded_model.artifact_path == "some/path"
        assert loaded_model.flavors == {
            "flavor1": {
                "a": 1,
                "b": 2
            },
            "flavor2": {
                "x": 1,
                "y": 2
            },
        }
        assert loaded_model.signature == sig
        path = os.path.join(
            local_path, loaded_model.saved_input_example_info["artifact_path"])
        x = _dataframe_from_json(path)
        assert x.to_dict(orient="records")[0] == input_example
        assert not hasattr(loaded_model, "databricks_runtime")
Example #4
    def on_train_end(self, args, state, control, **kwargs):
        input_schema = Schema([ColSpec(name="text", type="string")])
        output_schema = Schema([TensorSpec(np.dtype(np.float64), (-1, -1))])
        signature = ModelSignature(inputs=input_schema, outputs=output_schema)

        pyfunc.log_model(
            # artifact path is _relative_ to run root in mlflow
            artifact_path="bert_classifier_model",
            # Dir with the module files for dependencies
            code_path=[
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "models.py"),
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "utils.py")
            ],
            python_model=MLFlowBertClassificationModel(),
            artifacts={
                "model": state.best_model_checkpoint,
            },
            conda_env={
                'name':
                'classifier-env',
                'channels': ['defaults', 'pytorch', 'pypi'],
                'dependencies': [
                    'python=3.8.8', 'pip', 'pytorch=1.8.0', {
                        'pip': [
                            'transformers==4.4.2', 'mlflow==1.15.0',
                            'numpy==1.20.1'
                        ]
                    }
                ]
            },
            signature=signature,
            await_registration_for=5,
            registered_model_name=self.registered_name)
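A hedged sketch of consuming the model logged above (the registry name and version here are hypothetical; mlflow.pyfunc.load_model and the "models:/" URI scheme are standard MLflow):

import pandas as pd
import mlflow.pyfunc

# Load a registered version of the model and score one example matching the
# string "text" input schema declared in on_train_end.
model = mlflow.pyfunc.load_model("models:/bert_classifier/1")  # hypothetical name/version
preds = model.predict(pd.DataFrame({"text": ["an example sentence"]}))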
Example #5
def test_model_load_input_example_failures():
    with TempDir(chdr=True) as tmp:
        input_example = np.array([[3, 4, 5]], dtype=np.int32)
        sig = ModelSignature(
            inputs=Schema([
                TensorSpec(type=input_example.dtype, shape=input_example.shape)
            ]),
            outputs=Schema([ColSpec(name=None, type="double")]),
        )

        local_path, _ = _log_model_with_signature_and_example(
            tmp, sig, input_example)
        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        loaded_example = loaded_model.load_input_example(local_path)
        assert loaded_example is not None

        with pytest.raises(FileNotFoundError,
                           match="No such file or directory"):
            loaded_model.load_input_example(
                os.path.join(local_path, "folder_which_does_not_exist"))

        path = os.path.join(
            local_path, loaded_model.saved_input_example_info["artifact_path"])
        os.remove(path)
        with pytest.raises(FileNotFoundError,
                           match="No such file or directory"):
            loaded_model.load_input_example(local_path)
Example #6
def test_spark_schema_inference(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType

    pandas_df_with_all_types = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"])
    schema = _infer_schema(pandas_df_with_all_types)
    assert schema == Schema(
        [ColSpec(x, x) for x in pandas_df_with_all_types.columns])
    spark_session = pyspark.sql.SparkSession(
        pyspark.SparkContext.getOrCreate())

    struct_fields = []
    for t in schema.input_types():
        # pyspark _parse_datatype_string() expects "timestamp" instead of "datetime"
        if t == DataType.datetime:
            struct_fields.append(
                StructField("datetime", _parse_datatype_string("timestamp"),
                            True))
        else:
            struct_fields.append(
                StructField(t.name, _parse_datatype_string(t.name), True))
    spark_schema = StructType(struct_fields)
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types,
                                            schema=spark_schema)
    schema = _infer_schema(sparkdf)
    assert schema == Schema(
        [ColSpec(x, x) for x in pandas_df_with_all_types.columns])
Example #7
def test_model_log():
    with TempDir(chdr=True) as tmp:
        experiment_id = mlflow.create_experiment("test")
        sig = ModelSignature(
            inputs=Schema([ColSpec("integer", "x"),
                           ColSpec("integer", "y")]),
            outputs=Schema([ColSpec(name=None, type="double")]))
        input_example = {"x": 1, "y": 2}
        with mlflow.start_run(experiment_id=experiment_id) as r:
            Model.log("some/path",
                      TestFlavor,
                      signature=sig,
                      input_example=input_example)

        local_path = _download_artifact_from_uri("runs:/{}/some/path".format(
            r.info.run_id),
                                                 output_path=tmp.path(""))
        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        assert loaded_model.run_id == r.info.run_id
        assert loaded_model.artifact_path == "some/path"
        assert loaded_model.flavors == {
            "flavor1": {
                "a": 1,
                "b": 2
            },
            "flavor2": {
                "x": 1,
                "y": 2
            },
        }
        assert loaded_model.signature == sig
        path = os.path.join(
            local_path, loaded_model.saved_input_example_info["artifact_path"])
        x = _dataframe_from_json(path)
        assert x.to_dict(orient="records")[0] == input_example
Example #8
def _dataframe_from_json(path_or_str,
                         schema: Schema = None,
                         pandas_orient: str = "split",
                         precise_float=False) -> pd.DataFrame:
    """
    Parse json into pandas.DataFrame. A schema can be passed to ensure correct type parsing and to
    make any necessary conversions (e.g. string -> binary for binary columns).

    :param path_or_str: Path to a json file or a json string.
    :param schema: MLflow schema used when parsing the data.
    :param pandas_orient: pandas data frame convention used to store the data.
    :param precise_float: If True, use a higher-precision (but slower) float parser when reading json.
    :return: pandas.DataFrame.
    """
    if schema is not None:
        dtypes = dict(zip(schema.column_names(), schema.pandas_types()))
        df = pd.read_json(path_or_str,
                          orient=pandas_orient,
                          dtype=dtypes,
                          precise_float=precise_float)
        actual_cols = set(df.columns)
        for type_, name in zip(schema.column_types(), schema.column_names()):
            if type_ == DataType.binary and name in actual_cols:
                df[name] = df[name].map(
                    lambda x: base64.decodebytes(bytes(x, "utf8")))
        return df
    else:
        return pd.read_json(path_or_str,
                            orient=pandas_orient,
                            dtype=False,
                            precise_float=precise_float)
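For the schema branch, a minimal sketch (hypothetical column names; assumes the Schema/ColSpec classes from mlflow.types used throughout these examples): the schema's pandas types become the dtype mapping passed to pd.read_json.

import pandas as pd
from mlflow.types import Schema, ColSpec

# Parse with explicit dtypes derived from an MLflow schema.
schema = Schema([ColSpec("long", "x"), ColSpec("double", "y")])
df = pd.DataFrame({"x": [1, 2], "y": [0.5, 1.5]})
parsed = _dataframe_from_json(df.to_json(orient="split"), schema=schema)
assert parsed.equals(df)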
Example #9
def test_model_log_with_input_example_succeeds():
    with TempDir(chdr=True) as tmp:
        sig = ModelSignature(
            inputs=Schema([
                ColSpec("integer", "a"),
                ColSpec("string", "b"),
                ColSpec("boolean", "c"),
                ColSpec("string", "d"),
                ColSpec("datetime", "e"),
            ]),
            outputs=Schema([ColSpec(name=None, type="double")]),
        )
        input_example = pd.DataFrame(
            {
                "a": np.int32(1),
                "b": "test string",
                "c": True,
                "d": date.today(),
                "e": np.datetime64("2020-01-01T00:00:00"),
            },
            index=[0],
        )

        local_path, _ = _log_model_with_signature_and_example(
            tmp, sig, input_example)
        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        path = os.path.join(
            local_path, loaded_model.saved_input_example_info["artifact_path"])
        x = _dataframe_from_json(path, schema=sig.inputs)

        # date column will get deserialized into string
        input_example["d"] = input_example["d"].apply(lambda x: x.isoformat())
        assert x.equals(input_example)
Example #10
def train(
    proj_name: str,
    Model: str,
    dataset_cls: str,
    net_fn: str,
    net_args: Dict,
    dataset_args: Dict,
):
    """ Train Function """

    dataset_module = importlib.import_module(
        f"manythings.data.dta_{dataset_cls}")
    dataset_cls_ = getattr(dataset_module, dataset_cls)

    network_module = importlib.import_module(f"manythings.networks.{net_fn}")
    network_fn_ = getattr(network_module, net_fn)

    model_module = importlib.import_module(f"manythings.models.{Model}")
    model_cls_ = getattr(model_module, Model)

    config = {
        "model": Model,
        "dataset_cls": dataset_cls,
        "net_fn": net_fn,
        "net_args": net_args,
        "dataset_args": dataset_args
    }

    input_schema = Schema([
        TensorSpec(np.dtype(np.uint8), (-1, 71), "encoder_input"),
        TensorSpec(np.dtype(np.uint8), (-1, 93), "decoder_input")
    ])

    output_schema = Schema([TensorSpec(np.dtype(np.float32), (-1, 93))])

    signature = ModelSignature(inputs=input_schema, outputs=output_schema)
    data = dataset_cls_()
    data.load_or_generate()
    data.preprocess()

    with wandb.init(project=proj_name, config=config):
        """"""
        config = wandb.config
        model = model_cls_(dataset_cls_, network_fn_, net_args, dataset_args)

        callbacks = [
            WandbCallback(
                # training_data=(
                #     [data.encoder_input_data, data.decoder_input_data],
                #     data.decoder_target_data
                # ),
                # log_weights=True,
                # log_gradients=True
            )
        ]

        model.fit(callbacks=callbacks)
        mlflow.keras.save_model(model.network,
                                "saved_models/seq2seq",
                                signature=signature)
Example #11
def test_model_save_load():
    m = Model(artifact_path="some/path",
              run_id="123",
              flavors={
                  "flavor1": {
                      "a": 1,
                      "b": 2
                  },
                  "flavor2": {
                      "x": 1,
                      "y": 2
                  },
              },
              signature=ModelSignature(
                  inputs=Schema(
                      [ColSpec("integer", "x"),
                       ColSpec("integer", "y")]),
                  outputs=Schema([ColSpec(name=None, type="double")])),
              saved_input_example_info={
                  "x": 1,
                  "y": 2
              })
    assert m.get_input_schema() == m.signature.inputs
    assert m.get_output_schema() == m.signature.outputs
    x = Model(artifact_path="some/other/path", run_id="1234")
    assert x.get_input_schema() is None
    assert x.get_output_schema() is None

    n = Model(artifact_path="some/path",
              run_id="123",
              flavors={
                  "flavor1": {
                      "a": 1,
                      "b": 2
                  },
                  "flavor2": {
                      "x": 1,
                      "y": 2
                  },
              },
              signature=ModelSignature(
                  inputs=Schema(
                      [ColSpec("integer", "x"),
                       ColSpec("integer", "y")]),
                  outputs=Schema([ColSpec(name=None, type="double")])),
              saved_input_example_info={
                  "x": 1,
                  "y": 2
              })
    n.utc_time_created = m.utc_time_created
    assert m == n
    n.signature = None
    assert m != n
    with TempDir() as tmp:
        m.save(tmp.path("model"))
        o = Model.load(tmp.path("model"))
    assert m == o
    assert m.to_json() == o.to_json()
    assert m.to_yaml() == o.to_yaml()
Example #12
def test_schema_creation_with_named_and_unnamed_spec():
    with pytest.raises(MlflowException) as ex:
        Schema([
            TensorSpec(np.dtype("float64"), (-1, ), "blah"),
            TensorSpec(np.dtype("float64"), (-1, ))
        ])
    assert "Creating Schema with a combination of named and unnamed columns" in ex.value.message

    with pytest.raises(MlflowException) as ex:
        Schema([ColSpec("double", "blah"), ColSpec("double")])
    assert "Creating Schema with a combination of named and unnamed columns" in ex.value.message
Example #13
def test_model_load_input_example_no_signature():
    with TempDir(chdr=True) as tmp:
        input_example = np.array([[3, 4, 5]], dtype=np.int32)
        sig = ModelSignature(
            inputs=Schema([TensorSpec(type=input_example.dtype, shape=input_example.shape)]),
            outputs=Schema([ColSpec(name=None, type="double")]),
        )

        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example=None)
        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        loaded_example = loaded_model.load_input_example(local_path)
        assert loaded_example is None
Example #14
def test_model_load_input_example_scipy():
    with TempDir(chdr=True) as tmp:
        input_example = csc_matrix(np.arange(0, 12, 0.5).reshape(3, 8))
        sig = ModelSignature(
            inputs=Schema([TensorSpec(type=input_example.data.dtype, shape=input_example.shape)]),
            outputs=Schema([ColSpec(name=None, type="double")]),
        )

        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example)
        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        loaded_example = loaded_model.load_input_example(local_path)

        assert isinstance(loaded_example, csc_matrix)
        assert np.array_equal(input_example.data, loaded_example.data)
Example #15
def test_schema_creation():
    # can create schema with named col specs
    Schema([ColSpec("double", "a"), ColSpec("integer", "b")])

    # can create schema with unnamed col specs
    Schema([ColSpec("double"), ColSpec("integer")])

    # can create schema with multiple named tensor specs
    Schema([TensorSpec(np.dtype("float64"), (-1,), "a"), TensorSpec(np.dtype("uint8"), (-1,), "b")])

    # can create schema with single unnamed tensor spec
    Schema([TensorSpec(np.dtype("float64"), (-1,))])

    # combination of tensor and col spec is not allowed
    with pytest.raises(MlflowException) as ex:
        Schema([TensorSpec(np.dtype("float64"), (-1,)), ColSpec("double")])
    assert "Please choose one of" in ex.value.message

    # combination of named and unnamed inputs is not allowed
    with pytest.raises(MlflowException) as ex:
        Schema(
            [TensorSpec(np.dtype("float64"), (-1,), "blah"), TensorSpec(np.dtype("float64"), (-1,))]
        )
    assert "Creating Schema with a combination of named and unnamed inputs" in ex.value.message

    with pytest.raises(MlflowException) as ex:
        Schema([ColSpec("double", "blah"), ColSpec("double")])
    assert "Creating Schema with a combination of named and unnamed inputs" in ex.value.message

    # multiple unnamed tensor specs is not allowed
    with pytest.raises(MlflowException) as ex:
        Schema([TensorSpec(np.dtype("double"), (-1,)), TensorSpec(np.dtype("double"), (-1,))])
    assert "Creating Schema with multiple unnamed TensorSpecs is not supported" in ex.value.message
Example #16
def test_model_info():
    with TempDir(chdr=True) as tmp:
        sig = ModelSignature(
            inputs=Schema([ColSpec("integer", "x"),
                           ColSpec("integer", "y")]),
            outputs=Schema([ColSpec(name=None, type="double")]),
        )
        input_example = {"x": 1, "y": 2}

        experiment_id = mlflow.create_experiment("test")
        with mlflow.start_run(experiment_id=experiment_id) as run:
            model_info = Model.log("some/path",
                                   TestFlavor,
                                   signature=sig,
                                   input_example=input_example)
        local_path = _download_artifact_from_uri("runs:/{}/some/path".format(
            run.info.run_id),
                                                 output_path=tmp.path(""))

        assert model_info.run_id == run.info.run_id
        assert model_info.artifact_path == "some/path"
        assert model_info.model_uri == "runs:/{}/some/path".format(
            run.info.run_id)

        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        assert model_info.utc_time_created == loaded_model.utc_time_created
        assert model_info.model_uuid == loaded_model.model_uuid

        assert model_info.flavors == {
            "flavor1": {
                "a": 1,
                "b": 2
            },
            "flavor2": {
                "x": 1,
                "y": 2
            },
        }

        path = os.path.join(
            local_path, model_info.saved_input_example_info["artifact_path"])
        x = _dataframe_from_json(path)
        assert x.to_dict(orient="records")[0] == input_example

        assert model_info.signature_dict == sig.to_dict()

        assert Version(model_info.mlflow_version) == Version(
            loaded_model.mlflow_version)
Example #17
def test_spark_schema_inference(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType

    pandas_df_with_all_types = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"]
    )
    schema = _infer_schema(pandas_df_with_all_types)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    spark_schema = StructType(
        [StructField(t.name, _parse_datatype_string(t.name), True) for t in schema.column_types()]
    )
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=spark_schema)
    schema = _infer_schema(sparkdf)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
Example #18
def test_schema_inference_on_dataframe(pandas_df_with_all_types):
    basic_types = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"])
    schema = _infer_schema(basic_types)
    assert schema == Schema([ColSpec(x, x) for x in basic_types.columns])

    ext_types = pandas_df_with_all_types[[
        "boolean_ext", "integer_ext", "string_ext"
    ]].copy()
    expected_schema = Schema([
        ColSpec(DataType.boolean, "boolean_ext"),
        ColSpec(DataType.long, "integer_ext"),
        ColSpec(DataType.string, "string_ext"),
    ])
    schema = _infer_schema(ext_types)
    assert schema == expected_schema
Example #19
    def from_dict(cls, signature_dict: Dict[str, Any]):
        """
        Deserialize from dictionary representation.

        :param signature_dict: Dictionary representation of model signature.
                               Expected dictionary format:
                               `{'inputs': <json string>, 'outputs': <json string>}`

        :return: ModelSignature populated with the data from the dictionary.
        """
        inputs = Schema.from_json(signature_dict["inputs"])
        if "outputs" in signature_dict and signature_dict["outputs"] is not None:
            outputs = Schema.from_json(signature_dict["outputs"])
            return cls(inputs, outputs)
        else:
            return cls(inputs)
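A round-trip sketch of the dictionary format described in the docstring (a sketch under the assumption that ModelSignature.to_dict emits the matching {'inputs': <json string>, 'outputs': <json string>} layout):

from mlflow.models.signature import ModelSignature
from mlflow.types import Schema, ColSpec

sig = ModelSignature(inputs=Schema([ColSpec("double", "x")]))
d = sig.to_dict()  # d["inputs"] is a JSON string; d["outputs"] is None here
assert ModelSignature.from_dict(d) == sig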
Example #20
def test_dtype(nparray, dtype):
    schema = _infer_schema(nparray)
    assert schema == Schema([TensorSpec(np.dtype(dtype), (-1, ))])
    spec = schema.inputs[0]
    recreated_spec = TensorSpec.from_json_dict(**spec.to_dict())
    assert spec == recreated_spec
    enforced_array = _enforce_tensor_spec(nparray, spec)
    assert isinstance(enforced_array, np.ndarray)
Example #21
def test_model_signature():
    signature1 = ModelSignature(inputs=Schema(
        [ColSpec(DataType.boolean),
         ColSpec(DataType.binary)]),
                                outputs=Schema([
                                    ColSpec(name=None, type=DataType.double),
                                    ColSpec(name=None, type=DataType.double)
                                ]))
    signature2 = ModelSignature(inputs=Schema(
        [ColSpec(DataType.boolean),
         ColSpec(DataType.binary)]),
                                outputs=Schema([
                                    ColSpec(name=None, type=DataType.double),
                                    ColSpec(name=None, type=DataType.double)
                                ]))
    assert signature1 == signature2
    signature3 = ModelSignature(inputs=Schema(
        [ColSpec(DataType.boolean),
         ColSpec(DataType.binary)]),
                                outputs=Schema([
                                    ColSpec(name=None, type=DataType.float),
                                    ColSpec(name=None, type=DataType.double)
                                ]))
    assert signature3 != signature1
    as_json = json.dumps(signature1.to_dict())
    signature4 = ModelSignature.from_dict(json.loads(as_json))
    assert signature1 == signature4
    signature5 = ModelSignature(inputs=Schema(
        [ColSpec(DataType.boolean),
         ColSpec(DataType.binary)]),
                                outputs=None)
    as_json = json.dumps(signature5.to_dict())
    signature6 = ModelSignature.from_dict(json.loads(as_json))
    assert signature5 == signature6
Example #22
def test_content_types(tensor_spec: TensorSpec, request_input: RequestInput):
    input_schema = Schema(inputs=[tensor_spec])

    inference_request = InferenceRequest(
        parameters=Parameters(content_type=PandasCodec.ContentType),
        inputs=[request_input],
    )
    data = decode_inference_request(inference_request)

    # _enforce_schema will raise if something fails
    _enforce_schema(data, input_schema)
Example #23
def test_signature_inference_infers_datetime_types_as_expected():
    col_name = "datetime_col"
    test_datetime = np.datetime64("2021-01-01")
    test_series = pd.Series(pd.to_datetime([test_datetime]))
    test_df = test_series.to_frame(col_name)

    signature = infer_signature(test_series)
    assert signature.inputs == Schema([ColSpec(DataType.datetime)])

    signature = infer_signature(test_df)
    assert signature.inputs == Schema([ColSpec(DataType.datetime, name=col_name)])

    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    spark_df = spark.range(1).selectExpr(
        "current_timestamp() as timestamp", "current_date() as date"
    )
    signature = infer_signature(spark_df)
    assert signature.inputs == Schema(
        [ColSpec(DataType.datetime, name="timestamp"), ColSpec(DataType.datetime, name="date")]
    )
Example #24
def test_schema_inference_on_dictionary(dict_of_ndarrays):
    # test dictionary
    schema = _infer_schema(dict_of_ndarrays)
    assert schema == Schema([
        TensorSpec(tensor.dtype, _get_tensor_shape(tensor), name)
        for name, tensor in dict_of_ndarrays.items()
    ])
    # test exception is raised if non-numpy data in dictionary
    with pytest.raises(TypeError):
        _infer_schema({"x": 1})
    with pytest.raises(TypeError):
        _infer_schema({"x": [1]})
Example #25
def test_model_log_with_input_example_succeeds():
    with TempDir(chdr=True) as tmp:
        experiment_id = mlflow.create_experiment("test")
        sig = ModelSignature(
            inputs=Schema([
                ColSpec("integer", "a"),
                ColSpec("string", "b"),
                ColSpec("boolean", "c"),
                ColSpec("string", "d"),
                ColSpec("datetime", "e"),
            ]),
            outputs=Schema([ColSpec(name=None, type="double")]),
        )
        input_example = pd.DataFrame(
            {
                "a": np.int32(1),
                "b": "test string",
                "c": True,
                "d": date.today(),
                "e": np.datetime64("2020-01-01T00:00:00"),
            },
            index=[0],
        )
        with mlflow.start_run(experiment_id=experiment_id) as r:
            Model.log("some/path",
                      TestFlavor,
                      signature=sig,
                      input_example=input_example)

        local_path = _download_artifact_from_uri("runs:/{}/some/path".format(
            r.info.run_id),
                                                 output_path=tmp.path(""))
        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        path = os.path.join(
            local_path, loaded_model.saved_input_example_info["artifact_path"])
        x = _dataframe_from_json(path, schema=sig.inputs)

        # date column will get deserialized into string
        input_example["d"] = input_example["d"].apply(lambda x: x.isoformat())
        assert x.equals(input_example)
Example #26
def test_schema_inference_on_numpy_array(pandas_df_with_all_types):
    for col in pandas_df_with_all_types:
        data = pandas_df_with_all_types[col].to_numpy()
        schema = _infer_schema(data)
        assert schema == Schema([TensorSpec(type=data.dtype, shape=(-1, ))])

    # test boolean
    schema = _infer_schema(np.array([True, False, True], dtype=np.bool_))
    assert schema == Schema([TensorSpec(np.dtype(np.bool_), (-1, ))])

    # test bytes
    schema = _infer_schema(np.array([bytes([1])], dtype=np.bytes_))
    assert schema == Schema([TensorSpec(np.dtype("S1"), (-1, ))])

    # test (u)ints
    for t in [
            np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32,
            np.uint64, np.int64
    ]:
        schema = _infer_schema(np.array([1, 2, 3], dtype=t))
        assert schema == Schema([TensorSpec(np.dtype(t), (-1, ))])

    # test floats
    for t in [np.float16, np.float32, np.float64]:
        schema = _infer_schema(np.array([1.1, 2.2, 3.3], dtype=t))
        assert schema == Schema([TensorSpec(np.dtype(t), (-1, ))])

    if hasattr(np, "float128"):
        schema = _infer_schema(np.array([1.1, 2.2, 3.3], dtype=np.float128))
        assert schema == Schema([TensorSpec(np.dtype(np.float128), (-1, ))])
Example #27
def test_spark_type_mapping(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import (
        BooleanType,
        IntegerType,
        LongType,
        FloatType,
        DoubleType,
        StringType,
        BinaryType,
        TimestampType,
    )
    from pyspark.sql.types import StructField, StructType

    assert isinstance(DataType.boolean.to_spark(), BooleanType)
    assert isinstance(DataType.integer.to_spark(), IntegerType)
    assert isinstance(DataType.long.to_spark(), LongType)
    assert isinstance(DataType.float.to_spark(), FloatType)
    assert isinstance(DataType.double.to_spark(), DoubleType)
    assert isinstance(DataType.string.to_spark(), StringType)
    assert isinstance(DataType.binary.to_spark(), BinaryType)
    assert isinstance(DataType.datetime.to_spark(), TimestampType)
    pandas_df_with_all_types = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"])
    schema = _infer_schema(pandas_df_with_all_types)
    expected_spark_schema = StructType([
        StructField(t.name, t.to_spark(), True) for t in schema.input_types()
    ])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()
    spark_session = pyspark.sql.SparkSession(
        pyspark.SparkContext.getOrCreate())
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types,
                                            schema=actual_spark_schema)
    schema2 = _infer_schema(sparkdf)
    assert schema == schema2

    # test unnamed columns
    schema = Schema([ColSpec(col.type) for col in schema.inputs])
    expected_spark_schema = StructType([
        StructField(str(i), t.to_spark(), True)
        for i, t in enumerate(schema.input_types())
    ])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # test single unnamed column is mapped to just a single spark type
    schema = Schema([ColSpec(DataType.integer)])
    spark_type = schema.as_spark_schema()
    assert isinstance(spark_type, IntegerType)
Example #28
def _infer_schema(data):
    res = []
    for _, col in enumerate(data):
        t = col.type.replace("tensor(", "").replace(")", "")
        if t in ["bool"]:
            dt = DataType.boolean
        elif t in ["int8", "uint8", "int16", "uint16", "int32"]:
            dt = DataType.integer
        elif t in ["uint32", "int64"]:
            dt = DataType.long
        elif t in ["float16", "bfloat16", "float"]:
            dt = DataType.float
        elif t in ["double"]:
            dt = DataType.double
        elif t in ["string"]:
            dt = DataType.string
        else:
            raise Exception("Unsupported type: " + t)
        res.append(ColSpec(type=dt, name=col.name))
    return Schema(res)
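A minimal sketch of how this flavor-specific _infer_schema consumes its input (FakeValueInfo is a hypothetical stand-in: the function only needs objects exposing .name and a "tensor(<type>)" string in .type):

from collections import namedtuple

FakeValueInfo = namedtuple("FakeValueInfo", ["name", "type"])  # hypothetical stand-in
schema = _infer_schema([FakeValueInfo("probs", "tensor(float)")])
assert schema == Schema([ColSpec(type=DataType.float, name="probs")])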
Example #29
def test_model_signature_with_colspec_and_tensorspec():
    signature1 = ModelSignature(inputs=Schema([ColSpec(DataType.double)]))
    signature2 = ModelSignature(inputs=Schema([TensorSpec(np.dtype("float"), (-1, 28, 28))]))
    assert signature1 != signature2
    assert signature2 != signature1

    signature3 = ModelSignature(
        inputs=Schema([ColSpec(DataType.double)]),
        outputs=Schema([TensorSpec(np.dtype("float"), (-1, 28, 28))]),
    )
    signature4 = ModelSignature(
        inputs=Schema([ColSpec(DataType.double)]), outputs=Schema([ColSpec(DataType.double)]),
    )
    assert signature3 != signature4
    assert signature4 != signature3
Example #30
def _infer_schema(data: Any) -> Schema:
    """
    Infer an MLflow schema from a dataset.

    This method captures the column names and data types from the user data. The signature
    represents model input and output as data frames with (optionally) named columns and data
    types specified as one of the types defined in :py:class:`DataType`. This method will raise
    an exception if the user data contains incompatible types or is not passed in one of the
    supported formats (containers).

    The input should be one of these:
      - pandas.DataFrame or pandas.Series
      - dictionary of { name -> numpy.ndarray}
      - numpy.ndarray
      - pyspark.sql.DataFrame

    The element types should be mappable to one of :py:class:`mlflow.models.signature.DataType`.

    NOTE: Multidimensional (>2d) arrays (aka tensors) are not supported at this time.

    :param data: Dataset to infer from.

    :return: Schema
    """

    if isinstance(data, dict):
        res = []
        for col in data.keys():
            ary = data[col]
            if not isinstance(ary, np.ndarray):
                raise TypeError(
                    "Data in the dictionary must be of type numpy.ndarray")
            dims = len(ary.shape)
            if dims == 1:
                res.append(ColSpec(type=_infer_numpy_array(ary), name=col))
            else:
                raise TensorsNotSupportedException(
                    "Data in the dictionary must be 1-dimensional, "
                    "got shape {}".format(ary.shape))
        return Schema(res)
    elif isinstance(data, pd.Series):
        return Schema([ColSpec(type=_infer_numpy_array(data.values))])
    elif isinstance(data, pd.DataFrame):
        return Schema([
            ColSpec(type=_infer_numpy_array(data[col].values), name=col)
            for col in data.columns
        ])
    elif isinstance(data, np.ndarray):
        if len(data.shape) > 2:
            raise TensorsNotSupportedException(
                "Attempting to infer schema from numpy array with "
                "shape {}".format(data.shape))
        if data.dtype == object:
            data = pd.DataFrame(data).infer_objects()
            return Schema([
                ColSpec(type=_infer_numpy_array(data[col].values))
                for col in data.columns
            ])
        if len(data.shape) == 1:
            return Schema([ColSpec(type=_infer_numpy_dtype(data.dtype))])
        elif len(data.shape) == 2:
            return Schema([ColSpec(type=_infer_numpy_dtype(data.dtype))] *
                          data.shape[1])
    elif _is_spark_df(data):
        return Schema([
            ColSpec(type=_infer_spark_type(field.dataType), name=field.name)
            for field in data.schema.fields
        ])
    raise TypeError(
        "Expected one of (pandas.DataFrame, numpy array, "
        "dictionary of (name -> numpy.ndarray), pyspark.sql.DataFrame) "
        "but got '{}'".format(type(data)))