Example #1
def test_normalize_sensor_tags_not_ok():
    tag_list_as_list_of_strings_nonsense = [
        NON_RESOLVABLE_TAG_NAME1,
        NON_RESOLVABLE_TAG_NAME2,
    ]
    with pytest.raises(SensorTagNormalizationError):
        normalize_sensor_tags(tag_list_as_list_of_strings_nonsense)
Example #2
def test_load_series_dry_run(dates, ncs_reader):
    valid_tag_list_no_asset = normalize_sensor_tags(["TRC-123", "TRC-321"])
    for frame in ncs_reader.load_series(dates[0],
                                        dates[1],
                                        valid_tag_list_no_asset,
                                        dry_run=True):
        assert len(frame) == 0
Example #3
def dataset_config(mock_file_system, mock_assets_config):
    train_start_date = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    train_end_date = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")
    return {
        "type": "TimeSeriesDataset",
        "train_start_date": train_start_date,
        "train_end_date": train_end_date,
        "tag_list": normalize_sensor_tags(["TRC-FIQ -39-0706", "GRA-EM-23-0003ARV.PV"]),
        "data_provider": DataLakeProvider(
            storage=mock_file_system, assets_config=mock_assets_config
        ),
    }
Example #4
    def tags(self) -> typing.List[SensorTag]:
        """
        The input tags for this model

        Returns
        -------
        typing.List[SensorTag]
        """
        return normalize_sensor_tags(
            g.metadata["dataset"]["tag_list"],
            asset=g.metadata["dataset"].get("asset"),
            default_asset=g.metadata["dataset"].get("default_asset"),
        )
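Because this property reads from Flask's request-global `g`, it only works inside an application context. Below is a self-contained sketch of the same normalization logic with a plain dict standing in for `g.metadata`; the `ModelView` class name and the import path are assumptions, not taken from the source.

import typing

# Assumed import path; in the source this comes from the gordo project.
from gordo_dataset.sensor_tag import SensorTag, normalize_sensor_tags


class ModelView:  # hypothetical stand-in for the class this property belongs to
    metadata = {"dataset": {"tag_list": ["TRC-123"], "default_asset": None}}

    @property
    def tags(self) -> typing.List[SensorTag]:
        # Same lookup pattern as above, minus the Flask request context
        dataset = self.metadata["dataset"]
        return normalize_sensor_tags(
            dataset["tag_list"],
            asset=dataset.get("asset"),
            default_asset=dataset.get("default_asset"),
        )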
Example #5
def test_with_conflicted_file_types_with_preferable_csv(dates, assets_config):
    ncs_reader = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
        remove_status_codes=[0],
        lookup_for=["csv"],
        partition_by="year",
    )

    valid_tag_list = normalize_sensor_tags(["TRC-324"])
    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    tags_series = [v for v in series_gen]
    assert len(tags_series) == 1
    trc_324_series = tags_series[0]
    # The CSV file contains only 1 row
    assert len(trc_324_series) == 1
Example #6
def test_parquet_files_lookup(dates, assets_config):
    ncs_reader = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
        remove_status_codes=[0],
        lookup_for=["yearly_parquet", "csv"],
        partition_by=PartitionBy.YEAR,
    )

    valid_tag_list = normalize_sensor_tags(["TRC-323"])
    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    tags_series = [v for v in series_gen]
    assert len(tags_series) == 1
    trc_323_series = tags_series[0]
    assert trc_323_series.name == "TRC-323"
    assert trc_323_series.dtype.name == "float64"
    assert len(trc_323_series) == 20
Example #7
def test_load_series_with_filter_bad_data(dates, remove_status_codes,
                                          assets_config):

    ncs_reader = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
        remove_status_codes=remove_status_codes,
        lookup_for=["yearly_parquet", "csv"],
        partition_by=PartitionBy.YEAR,
    )

    valid_tag_list = normalize_sensor_tags(["TRC-322"])
    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    # Check that the bad data in the files under tests/gordo/data_provider/data/datalake/TRC-322
    # is filtered out. 20 rows exist; 5 of them have the value 0.
    n_expected = 15 if remove_status_codes != [] else 20
    assert all(len(series) == n_expected for series in series_gen)
Example #8
def test_monthly_parquet(dates, assets_config):
    ncs_reader = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
    )

    valid_tag_list = normalize_sensor_tags(["TRC-325"])
    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    tags_series = [v for v in series_gen]
    assert len(tags_series) == 1
    index = tags_series[0].index
    assert len(index) == 20
    dr1 = pd.date_range(start="2001-05-10T00:00:00+00:00",
                        periods=10,
                        freq="1T")
    dr2 = pd.date_range(start="2001-06-10T00:00:00+00:00",
                        periods=10,
                        freq="1T")
    dr = dr1.append(dr2)
    assert index.equals(dr)
Example #9
    def target_tags(self) -> typing.List[SensorTag]:
        """
        The target tags for this model

        Returns
        -------
        typing.List[SensorTag]
        """
        # TODO refactor this part to have the same tag preparation logic as in TimeSeriesDataset
        orig_target_tag_list = g.metadata["dataset"].get("target_tag_list", [])
        if orig_target_tag_list:
            return normalize_sensor_tags(
                orig_target_tag_list,
                asset=g.metadata["dataset"].get("asset"),
                default_asset=g.metadata["dataset"].get("default_asset"),
            )
        else:
            return self.tags
Example #10
def test_can_handle_tag_unknow_prefix_raise(ncs_reader):
    with pytest.raises(ValueError):
        ncs_reader.can_handle_tag(normalize_sensor_tags(["XYZ-123"])[0])
Example #11
def test_load_series_invalid_year(start_date, end_date, frame_len, ncs_reader):
    valid_tag_list = normalize_sensor_tags(["TRC-123"])
    frame = next(ncs_reader.load_series(start_date, end_date, valid_tag_list))
    assert len(frame) == frame_len
Example #12
def test_load_series_known_prefix(dates, ncs_reader):
    valid_tag_list_no_asset = normalize_sensor_tags(["TRC-123", "TRC-321"])
    for frame in ncs_reader.load_series(dates[0], dates[1],
                                        valid_tag_list_no_asset):
        assert len(frame) == 20
Example #13
@pytest.fixture
def dates():
    return (
        dateutil.parser.isoparse("2000-01-01T08:56:00+00:00"),
        dateutil.parser.isoparse("2001-09-01T10:01:00+00:00"),
    )


@pytest.mark.parametrize(
    "tag_to_check",
    [normalize_sensor_tags(["TRC-123"])[0],
     SensorTag("XYZ-123", "1776-TROC")],
)
def test_can_handle_tag_ok(tag_to_check, ncs_reader):
    assert ncs_reader.can_handle_tag(tag_to_check)


@pytest.mark.parametrize(
    "tag_to_check",
    [SensorTag("TRC-123", None),
     SensorTag("XYZ-123", "123-XXX")])
def test_can_handle_tag_notok(tag_to_check, ncs_reader):
    assert not ncs_reader.can_handle_tag(tag_to_check)
Example #14
def test_normalize_sensor_tags_ok(good_input_tags, asset, default_asset,
                                  expected_output_tags):
    tag_list_as_list_of_sensor_tag = normalize_sensor_tags(
        good_input_tags, asset, default_asset=default_asset)
    assert tag_list_as_list_of_sensor_tag == expected_output_tags
Example #15
def get_machine_log_items(machine: Machine) -> Tuple[List[Metric], List[Param]]:
    """
    Create flat lists of MLflow logging entities from multilevel dictionary

    For more information, see the mlflow docs:
    https://www.mlflow.org/docs/latest/python_api/mlflow.tracking.html#mlflow.tracking.MlflowClient.log_batch

    Parameters
    ----------
    machine: Machine

    Returns
    -------
    metrics: List[Metric]
        List of MLFlow Metric objects to log.
    params: List[Param]
        List of MLFlow Param objects to log.
    """

    metrics: List[Metric] = list()
    build_metadata = machine.metadata.build_metadata

    # Project/machine parameters
    keys = ["project_name", "name"]
    params = [Param(attr, getattr(machine, attr)) for attr in keys]

    # Dataset parameters
    dataset_keys = [
        "train_start_date",
        "train_end_date",
        "resolution",
        "row_filter",
        "row_filter_buffer_size",
    ]
    params.extend(Param(k, str(getattr(machine.dataset, k))) for k in dataset_keys)

    # Model parameters
    model_keys = ["model_creation_date", "model_builder_version", "model_offset"]
    params.extend(Param(k, str(getattr(build_metadata.model, k))) for k in model_keys)

    # Parse cross-validation split metadata
    splits = build_metadata.model.cross_validation.splits
    params.extend(Param(k, str(v)) for k, v in splits.items())

    # Parse cross-validation metrics

    tag_list = normalize_sensor_tags(
        machine.dataset.tag_list, asset=machine.dataset.asset
    )
    scores = build_metadata.model.cross_validation.scores

    keys = sorted(list(scores.keys()))
    subkeys = ["mean", "max", "min", "std"]

    n_folds = len(scores[keys[0]]) - len(subkeys)
    for k in keys:
        # Skip per tag data, produces too many params for MLflow
        if any(t.name in k for t in tag_list):
            continue

        # Summary stats per metric
        for sk in subkeys:
            metrics.append(Metric(f"{k}-{sk}", scores[k][f"fold-{sk}"], epoch_now(), 0))
        # Append value for each fold with increasing steps
        metrics.extend(
            Metric(k, scores[k][f"fold-{i+1}"], epoch_now(), i) for i in range(n_folds)
        )

    # Parse fit metrics
    try:
        meta_params = build_metadata.model.model_meta["history"]["params"]
    except KeyError:
        logger.debug(
            "Key 'build-metadata.model.history.params' not found found in metadata."
        )
    else:
        metrics.extend(
            Metric(k, float(getattr(build_metadata.model, k)), epoch_now(), 0)
            for k in ["model_training_duration_sec"]
        )
        for m in meta_params["metrics"]:
            data = build_metadata.model.model_meta["history"][m]
            metrics.extend(
                Metric(m, float(x), timestamp=epoch_now(), step=i)
                for i, x in enumerate(data)
            )
        params.extend(
            Param(k, str(meta_params[k]))
            for k in (p for p in meta_params if p != "metrics")
        )

    return metrics, params
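For context, here is a minimal sketch of how the returned lists could be consumed, following the MlflowClient.log_batch documentation linked in the docstring above; the experiment id and the `machine` object are assumed to already exist.

from mlflow.tracking import MlflowClient

client = MlflowClient()
run = client.create_run(experiment_id="0")  # or reuse an existing run's id
metrics, params = get_machine_log_items(machine)
# log_batch accepts the Metric and Param entities built above in one call
client.log_batch(run.info.run_id, metrics=metrics, params=params)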
Example #16
def test_normalize_iroc_tags():
    normalized_tags = normalize_sensor_tags(IROC_MANY_ASSETS_TAG_LIST)
    assert normalized_tags == IROC_MANY_ASSETS_SENSOR_TAG_LIST
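Taken together, the calls above imply roughly the following contract for `normalize_sensor_tags`. The stub below is reconstructed from the usage shown in Examples #1, #4, #9, #14, and #15; it is not copied from the library, so treat names and types as inferred.

import typing

def normalize_sensor_tags(
    sensor_tags: typing.List[str],               # e.g. ["TRC-123", "TRC-321"]
    asset: typing.Optional[str] = None,          # explicit asset (Example #15)
    default_asset: typing.Optional[str] = None,  # fallback asset (Examples #4, #9, #14)
) -> typing.List["SensorTag"]:
    """Resolve raw tag names to SensorTag entries, raising
    SensorTagNormalizationError for unresolvable names (Example #1)."""
    ...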