예제 #1
0
    def test_collect_feature_data_and_vis_attributes(self, df):
        """Check that profiled attributes have the expected pre-logging format.

        Each feature-data attribute must parse into a
        ``FeatureDataInModelVersion`` message, pass the shared correctness
        checks, and be mirrored by a training-data attribute whose value
        equals the message's content parsed as JSON.
        """
        # every column but the last is treated as input; the last is the output
        in_df = df.iloc[:, :-1]
        out_df = df.iloc[:, [-1]]

        profiles = RegisteredModelVersion._compute_training_data_profile(
            in_df, out_df)
        attrs = RegisteredModelVersion._collect_feature_data_and_vis_attributes(
            profiles)

        for attr_key, attr_val in attrs.items():
            # only feature-data attributes are inspected directly; their
            # training-data counterparts are looked up by key below
            if not attr_key.startswith(
                    _deployable_entity._FEATURE_DATA_ATTR_PREFIX):
                continue

            feature_data = _utils.json_to_proto(attr_val,
                                                FeatureDataInModelVersion)
            self.assert_feature_data_correctness(feature_data, in_df, out_df)

            is_missing_values = (
                feature_data.profiler_name == "MissingValuesProfiler")
            suffix = "MissingValues" if is_missing_values else "Distribution"
            normalized_name = RegisteredModelVersion._normalize_attribute_key(
                feature_data.feature_name + suffix)
            vis_key = (_deployable_entity._TRAINING_DATA_ATTR_PREFIX +
                       normalized_name)

            expected_content = json.loads(feature_data.content)
            assert attrs[vis_key] == expected_content
예제 #2
0
    def _get_url_for_artifact(self,
                              dataset_component_path,
                              method,
                              part_num=0):
        """
        Obtains a URL to use for accessing stored artifacts.

        Parameters
        ----------
        dataset_component_path : str
            Filepath in dataset component blob.
        method : {'GET', 'PUT'}
            HTTP method to request for the generated URL. Matched
            case-insensitively; the upper-cased value is what gets sent to
            the backend.
        part_num : int, optional
            If using Multipart Upload, number of part to be uploaded.

        Returns
        -------
        response_msg : `_DatasetVersionService.GetUrlForDatasetBlobVersioned.Response`
            Backend response. Its ``url`` field may have been rewritten for
            localhost URLs (see the comments in the body).

        Raises
        ------
        ValueError
            If `method` is neither 'GET' nor 'PUT'.

        """
        # normalize once so the value that was validated is the value sent
        method = method.upper()
        if method not in ("GET", "PUT"):
            raise ValueError("`method` must be one of {'GET', 'PUT'}")

        Message = _DatasetVersionService.GetUrlForDatasetBlobVersioned
        msg = Message(
            path_dataset_component_blob_path=dataset_component_path,
            method=method,
            part_number=part_num,
        )
        data = _utils.proto_to_json(msg)
        endpoint = "{}://{}/api/v1/modeldb/dataset-version/dataset/{}/datasetVersion/{}/getUrlForDatasetBlobVersioned".format(
            self._conn.scheme,
            self._conn.socket,
            self.dataset_id,
            self.id,
        )
        response = _utils.make_request("POST", endpoint, self._conn, json=data)
        _utils.raise_for_http_error(response)

        response_msg = _utils.json_to_proto(response.json(), Message.Response)

        url = response_msg.url
        # accommodate port-forwarded NFS store
        if 'https://localhost' in url[:20]:
            # swap the leading "https" for plain "http"
            url = 'http' + url[5:]
        # restore a percent-encoded colon after "localhost" (either hex case)
        if 'localhost%3a' in url[:20]:
            url = url.replace('localhost%3a', 'localhost:')
        if 'localhost%3A' in url[:20]:
            url = url.replace('localhost%3A', 'localhost:')
        response_msg.url = url

        return response_msg
예제 #3
0
    def _get_url_for_artifact(self, key, method, artifact_type=0, part_num=0):
        """
        Requests a URL from the registry for accessing an artifact of this
        model version.

        Parameters
        ----------
        key
            Artifact key, forwarded as the request's ``key`` field.
        method : {'GET', 'PUT'}
            HTTP method to request for the generated URL. Matched
            case-insensitively; the upper-cased value is what gets sent to
            the backend.
        artifact_type : optional
            Forwarded as the request's ``artifact_type`` field; defaults to 0.
        part_num : int, optional
            Forwarded as the request's ``part_number`` field; defaults to 0.

        Returns
        -------
        `_RegistryService.GetUrlForArtifact.Response`
            Backend response parsed from the JSON body.

        Raises
        ------
        ValueError
            If `method` is neither 'GET' nor 'PUT'.

        """
        # normalize once so the value that was validated is the value sent
        method = method.upper()
        if method not in ("GET", "PUT"):
            raise ValueError("`method` must be one of {'GET', 'PUT'}")

        Message = _RegistryService.GetUrlForArtifact
        msg = Message(
            model_version_id=self.id,
            key=key,
            method=method,
            artifact_type=artifact_type,
            part_number=part_num,
        )
        data = _utils.proto_to_json(msg)
        endpoint = "{}://{}/api/v1/registry/model_versions/{}/getUrlForArtifact".format(
            self._conn.scheme, self._conn.socket, self.id)
        response = _utils.make_request("POST", endpoint, self._conn, json=data)
        _utils.raise_for_http_error(response)
        return _utils.json_to_proto(response.json(), Message.Response)
예제 #4
0
    def get_code(self):
        """
        Gets the code version.

        Returns
        -------
        dict or zipfile.ZipFile
            Either:
                - a dictionary containing Git snapshot information with at most the following items:
                    - **filepaths** (*list of str*)
                    - **repo_url** (*str*) – Remote repository URL
                    - **commit_hash** (*str*) – Commit hash
                    - **is_dirty** (*bool*)
                - a `ZipFile <https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile>`_
                  containing Python source code files

        Raises
        ------
        TypeError
            If this object is not a Project, Experiment, or ExperimentRun.
        RuntimeError
            If the backend response contains neither a Git snapshot nor a
            code archive.

        """
        # TODO: remove this circular dependency
        from ._project import Project
        from ._experiment import Experiment
        from ._experimentrun import ExperimentRun
        if isinstance(self, Project):  # TODO: not this
            Message = self._service.GetProjectCodeVersion
            endpoint = "getProjectCodeVersion"
        elif isinstance(self, Experiment):
            Message = self._service.GetExperimentCodeVersion
            endpoint = "getExperimentCodeVersion"
        elif isinstance(self, ExperimentRun):
            Message = self._service.GetExperimentRunCodeVersion
            endpoint = "getExperimentRunCodeVersion"
        else:
            # fail with a clear error instead of an unbound-local failure on
            # `Message` below
            raise TypeError(
                "get_code() is not supported for objects of type {}".format(
                    type(self).__name__))
        msg = Message(id=self.id)
        data = _utils.proto_to_json(msg)
        response = _utils.make_request("GET",
                                       self._request_url.format(endpoint),
                                       self._conn,
                                       params=data)
        _utils.raise_for_http_error(response)

        response_msg = _utils.json_to_proto(_utils.body_to_json(response),
                                            Message.Response)
        code_ver_msg = response_msg.code_version
        # `code` is a oneof: either a Git snapshot or an uploaded archive
        which_code = code_ver_msg.WhichOneof('code')
        if which_code == 'git_snapshot':
            git_snapshot_msg = code_ver_msg.git_snapshot
            # only fields that are actually set are included in the result
            git_snapshot = {}
            if git_snapshot_msg.filepaths:
                git_snapshot['filepaths'] = git_snapshot_msg.filepaths
            if git_snapshot_msg.repo:
                git_snapshot['repo_url'] = git_snapshot_msg.repo
            if git_snapshot_msg.hash:
                git_snapshot['commit_hash'] = git_snapshot_msg.hash
            # is_dirty is a ternary enum; UNKNOWN means "omit the key"
            if git_snapshot_msg.is_dirty != _CommonCommonService.TernaryEnum.UNKNOWN:
                git_snapshot[
                    'is_dirty'] = git_snapshot_msg.is_dirty == _CommonCommonService.TernaryEnum.TRUE
            return git_snapshot
        elif which_code == 'code_archive':
            # download artifact from artifact store
            # pylint: disable=no-member
            # this method should only be called on ExperimentRun, which does have _get_url_for_artifact()
            url = self._get_url_for_artifact(
                "verta_code_archive", "GET",
                code_ver_msg.code_archive.artifact_type).url

            response = _utils.make_request("GET", url, self._conn)
            _utils.raise_for_http_error(response)

            # wrap the downloaded bytes in memory so they can be read as a ZIP
            code_archive = six.BytesIO(response.content)
            return zipfile.ZipFile(
                code_archive, 'r')  # TODO: return a util class instead, maybe
        else:
            raise RuntimeError("unable find code in response")
예제 #5
0
    def test_profile_training_data(self, model_version):
        """Integration test: a logged training data profile yields well-formed attributes."""
        pd = pytest.importorskip("pandas")
        np = pytest.importorskip("numpy")

        n_rows = 100
        output_name = "Output_Col"
        column_names = [
            "Continuous_Numeric",
            "Discrete_Numeric",
            "Discrete_String",
            "Freeform_String",
            "Other",
            output_name,
        ]
        # columns for which profile attributes are expected to be logged
        profiled_columns = ["Continuous_Numeric", "Discrete_Numeric", output_name]

        # one generated column per entry of `column_names`, in the same order
        continuous = np.random.random(n_rows)
        discrete = np.random.choice(5, n_rows)
        discrete_strings = np.random.choice(["a", "b", "c", "d", "e"],
                                            size=n_rows)
        freeform_strings = [
            uuid.uuid4().hex.upper()[0:10] for _ in range(n_rows)
        ]
        timestamps = [datetime.datetime.now() for _ in range(n_rows)]
        outputs = np.random.choice(2, n_rows)

        # assemble the dataframe row by row
        rows = list(
            zip(
                continuous,
                discrete,
                discrete_strings,
                freeform_strings,
                timestamps,
                outputs,
            ))
        df = pd.DataFrame(rows, columns=column_names)

        # log inputs (everything but the output column) and outputs separately
        inputs = df.loc[:, df.columns != output_name]
        outputs_df = pd.DataFrame(df[output_name])
        model_version.log_training_data_profile(inputs, outputs_df)

        # fetch the logged attributes back for validation
        attributes = model_version.get_attributes()
        feature_key = _deployable_entity._FEATURE_DATA_ATTR_PREFIX + "{}"
        # attribute "2" is checked below as the missing-values summary,
        # attribute "3" as the distribution summary
        missing_summary = _utils.json_to_proto(
            model_version.get_attribute(feature_key.format("2")),
            FeatureDataInModelVersion,
        )
        distribution_summary = _utils.json_to_proto(
            model_version.get_attribute(feature_key.format("3")),
            FeatureDataInModelVersion,
        )

        def bucket_count(summary):
            """Return the number of discrete-histogram buckets in `summary`."""
            content = json.loads(summary.content)
            return len(content["discreteHistogram"]["buckets"])

        # each profiled column gets a missing-values and a distribution
        # summary, plus an equal number of attributes for visualization
        expected_attr_count = len(profiled_columns) * 2 * 2
        assert len(attributes.keys()) == expected_attr_count

        assert (distribution_summary.summary_type_name ==
                "verta.discreteHistogram.v1")
        assert distribution_summary.profiler_name == "BinaryHistogramProfiler"
        assert bucket_count(distribution_summary) <= 5

        assert missing_summary.summary_type_name == "verta.discreteHistogram.v1"
        assert missing_summary.profiler_name == "MissingValuesProfiler"
        assert bucket_count(missing_summary) == 2

        # reference distribution attributes can be fetched back as histograms
        for column in profiled_columns:
            distribution_key = (_deployable_entity._TRAINING_DATA_ATTR_PREFIX +
                                column + "Distribution")
            histogram = model_version.get_attribute(distribution_key)
            assert isinstance(histogram, _verta_data_type._VertaDataType)