Example #1
@pytest.fixture  # decorator assumed: request.param implies an (indirectly) parametrized pytest fixture
def dataset_file(request):
    bucket_name = "somebucket"
    with mock_s3():
        client = boto3.client("s3", region_name=os.environ.get("AWS_REGION"))
        client.create_bucket(Bucket=bucket_name)
        client.put_object(
            Bucket=bucket_name,
            Key=f"train/{request.param}",
            Body=f"contents={request.param}",
        )

        dsf = DatasetFile(request.param, bucket_name)
        dsf.cli = client
        yield dsf
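
The fixture above depends on request.param, so a consuming test has to parametrize it indirectly. A minimal sketch of such a test follows; the test name and file name are illustrative, not taken from the project.

import pytest

@pytest.mark.parametrize("dataset_file", ["RetailDemandTRM.csv"], indirect=True)
def test_dataset_file_fixture(dataset_file):
    # The fixture yields a DatasetFile wired to a mocked S3 bucket, so the
    # test can exercise S3-backed behavior without touching real AWS.
    assert dataset_file.cli is not None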
Example #2
def createdatasetgroup(event, context) -> (Status, str):
    """
    Create/ monitor Amazon Forecast dataset group creation
    :param event: lambda event
    :param context: lambda context
    :return: dataset group status and the list of dataset group names
    """
    config = Config.from_sfn(event)
    dataset_file = DatasetFile(event.get("dataset_file"), event.get("bucket"))
    dataset_groups = config.dataset_groups(dataset_file)
    datasets = config.datasets(dataset_file)

    # dataset group creation returns an ARN immediately; creation of all dependent dataset groups is safe
    # dataset group update returns immediately; update of all dependent dataset groups is safe
    for dataset_group in dataset_groups:
        if dataset_group.status == Status.DOES_NOT_EXIST:
            dataset_group.create()

        if dataset_group.status != Status.ACTIVE:
            raise ValueError(
                f"Dataset group {dataset_group.dataset_group_name} is {dataset_group.status}, expected ACTIVE"
            )

        dataset_group.update(datasets, dataset_file)

    # at this point, we are guaranteed that all dataset groups are active (or an error was thrown)
    return (
        Status.ACTIVE,
        [dataset_group.dataset_group_name for dataset_group in dataset_groups],
    )
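
Based only on the keys read above (dataset_file and bucket), the Step Functions event passed to this handler looks roughly like the sketch below; the values, and any additional fields Config.from_sfn may require, are assumptions.

sample_event = {
    "bucket": "forecast-data-bucket",       # hypothetical S3 bucket name
    "dataset_file": "RetailDemandTRM.csv",  # hypothetical uploaded file name
}
# status, dataset_group_names = createdatasetgroup(sample_event, context=None)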
Example #3
def build_message(event):
    """
    Build a message for SNS to publish
    :param event: the lambda event containing the message
    :return: the message to publish
    """
    message = ""
    error = None
    file = DatasetFile(event.get("dataset_file"), event.get("bucket"))

    if "statesError" in event.keys():
        logger.info("State error message encountered")
        message += f"There was an error running the forecast for {file.prefix}\n\n"
        error = event.get("statesError")
    if "serviceError" in event.keys():
        logger.info("Service error message encountered")
        message += (
            f"There was a service error running the forecast for {file.prefix}\n\n"
        )
        error = event.get("serviceError")

    if error:
        error_type = error.get("Error", "Unknown")
        error_cause = json.loads(error.get("Cause", "{}"))
        error_message = error_cause.get("errorMessage")
        stack_trace = error_cause.get("stackTrace")

        message += f"Message: {error_message}\n\n"
        message += f"Details: (caught {error_type})\n\n"
        if stack_trace:
            message += f"\n".join(stack_trace)
    else:
        message = f"Forecast for {file.prefix} is ready!"

    return message
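
As a hedged illustration, a Step Functions catch result carries Error and Cause fields, so an error event handled by build_message might look like the following; all values here are made up.

import json

example_event = {
    "bucket": "forecast-data-bucket",       # hypothetical
    "dataset_file": "RetailDemandTRM.csv",  # hypothetical
    "serviceError": {
        "Error": "ResourceNotFoundException",
        "Cause": json.dumps(
            {"errorMessage": "Dataset group not found", "stackTrace": []}
        ),
    },
}
# build_message(example_event) would then produce the service-error message for the file prefix.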
Example #4
def test_status_still_good(forecast_stub, configuration_data, expected_dataset_arns):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    predictor = config.predictor(dataset_file)

    predictor.cli = forecast_stub.client
    forecast_stub.add_response(
        "list_predictors",
        {
            "Predictors": [
                {"PredictorArn": "arn:", "CreationTime": datetime.now(timezone.utc)}
            ]
        },
    )
    forecast_stub.add_response(
        "describe_dataset_group", {"DatasetArns": expected_dataset_arns}
    )
    for arn in expected_dataset_arns:
        forecast_stub.add_response(
            "describe_dataset", {"Status": "ACTIVE", "DatasetArn": arn}
        )
    forecast_stub.add_response(
        "describe_predictor",
        {"CreationTime": datetime.now(timezone.utc), "Status": "ACTIVE"},
    )

    assert predictor.status == Status.ACTIVE
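
The forecast_stub fixture used throughout these tests is not shown in this listing. A minimal sketch of how such a fixture could be built with botocore's Stubber follows; the region and the fixture body are assumptions, not the project's actual conftest.

import boto3
import pytest
from botocore.stub import Stubber

@pytest.fixture
def forecast_stub():
    # Wrap a real Amazon Forecast client in a Stubber so tests can queue
    # canned responses with add_response() / add_client_error() and reach
    # the underlying client via forecast_stub.client.
    client = boto3.client("forecast", region_name="us-east-1")
    with Stubber(client) as stubber:
        yield stubber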
Example #5
def test_config_valid(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("some_new_key.csv", "some_bucket")
    errors = config.validate()
    assert not errors
Example #6
def test_predictor_history(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    predictor = config.predictor(dataset_file, "RetailDemandTNPTS")

    predictor.cli = forecast_stub.client
    forecast_stub.add_response(
        "list_predictors",
        {
            "Predictors": [
                {
                    "CreationTime": datetime(2015, 1, 1),
                    "PredictorArn": "arn:2015-1-1",
                    "Status": "ACTIVE",
                },
                {
                    "CreationTime": datetime(2017, 1, 1),
                    "PredictorArn": "arn:2017-1-1",
                    "Status": "CREATE_IN_PROGRESS",
                },
            ]
        },
    )

    history = predictor.history()
    assert history[0].get("CreationTime") == datetime(2017, 1, 1)
    assert history[1].get("CreationTime") == datetime(2015, 1, 1)
Example #7
    def validate(self, event: dict):
        # Use next()'s default so a missing or empty Records list raises RecordNotFound below
        record = next(iter(event.get("Records", [])), {})
        if not record:
            raise RecordNotFound

        # Make sure this event version is supported
        event_version = record.get("eventVersion")
        if version.parse(event_version).major != S3_EVENT_STRUCTURE_MAJOR:
            raise RecordNotSupported(
                f"The event version {event_version} is not supported by this solution."
            )

        # Make sure there's a bucket in the event structure
        bucket = record.get("s3", {}).get("bucket", {}).get("name")
        if not bucket:
            raise BucketNotFound

        # Make sure there's a key in the event structure
        key = record.get("s3", {}).get("object", {}).get("key")
        if not key:
            raise KeyNotFound

        # The name of the event is the stem of the file without extensions
        file = DatasetFile(key=key, bucket=bucket)

        return bucket, key, file
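
For reference, a trimmed S3 event notification in the shape this validator expects (a 2.x eventVersion, a bucket name, and an object key) might look like the following; the bucket and key values are hypothetical.

sample_s3_event = {
    "Records": [
        {
            "eventVersion": "2.1",
            "s3": {
                "bucket": {"name": "forecast-data-bucket"},      # hypothetical
                "object": {"key": "train/RetailDemandTRM.csv"},  # hypothetical
            },
        }
    ]
}
# bucket, key, file = notifier.validate(sample_s3_event)  # assuming an instance exposing validate()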
Example #8
def test_config_dependent_dataset_dependencies(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("DatasetsFromRetailDemandTRMProphet",
                               "some_bucket")
    datasets = config.datasets(dataset_file)
Example #9
def test_dataset_default(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("some_new_key.csv", "some_bucket")

    ds = config.dataset(dataset_file)
    assert ds.data_frequency == "D"
    assert ds.dataset_type == DatasetType.TARGET_TIME_SERIES
    assert ds.dataset_domain == DatasetDomain.RETAIL
    assert ds.dataset_name == "some_new_key"
    assert ds.dataset_schema == {
        "Attributes": [
            {
                "AttributeName": "item_id",
                "AttributeType": "string"
            },
            {
                "AttributeName": "timestamp",
                "AttributeType": "timestamp",
            },
            {
                "AttributeName": "demand",
                "AttributeType": "float"
            },
        ]
    }
Example #10
def test_config_required_datasets(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("some_new_key.csv", "some_bucket")

    assert config.required_datasets(dataset_file) == ["TARGET_TIME_SERIES"]
Example #11
def test_forecast_history(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    forecast = config.forecast(dataset_file, "RetailDemandTNPTS")

    forecast.cli = forecast_stub.client
    forecast_stub.add_response(
        "list_forecasts",
        {
            "Forecasts": [
                {
                    "LastModificationTime": datetime(2015, 1, 1),
                    "ForecastArn": "arn:2015-1-1",
                    "Status": "ACTIVE",
                },
                {
                    "LastModificationTime": datetime(2017, 1, 1),
                    "ForecastArn": "arn:2017-1-1",
                    "Status": "CREATE_IN_PROGRESS",
                },
            ]
        },
    )

    history = forecast.history()
    assert history[0].get("LastModificationTime") == datetime(2017, 1, 1)
    assert history[1].get("LastModificationTime") == datetime(2015, 1, 1)
Example #12
def test_forecast_arn(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    forecast = config.forecast(dataset_file, "RetailDemandTNPTS")

    forecast.cli = forecast_stub.client
    forecast_stub.add_response(
        "list_forecasts",
        {
            "Forecasts": [
                {
                    "LastModificationTime": datetime(2015, 1, 1),
                    "ForecastArn": "arn:2015-1-1",
                },
                {
                    "LastModificationTime": datetime(2017, 1, 1),
                    "ForecastArn": "arn:2017-1-1",
                },
            ]
        },
    )

    assert forecast.arn == "arn:2017-1-1"
Example #13
def test_predictor_arn(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    predictor = config.predictor(dataset_file, "RetailDemandTNPTS")

    predictor.cli = forecast_stub.client
    forecast_stub.add_response(
        "list_predictors",
        {
            "Predictors": [
                {
                    "CreationTime": datetime(2015, 1, 1),
                    "PredictorArn": "arn:2015-1-1"
                },
                {
                    "CreationTime": datetime(2017, 1, 1),
                    "PredictorArn": "arn:2017-1-1"
                },
            ]
        },
    )

    assert predictor.arn == "arn:2017-1-1"
Example #14
def test_dataset_import_job_arn(configuration_data, forecast_stub, mocker):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTRM.csv", "some_bucket")
    dataset_import_job = config.dataset_import_job(dataset_file)

    # create some job history
    forecast_stub.add_response(
        "list_dataset_import_jobs",
        {
            "DatasetImportJobs": [
                {
                    "LastModificationTime": datetime(2015, 1, 1),
                    "DatasetImportJobArn": "arn:2015-1-1",
                },
                {
                    "LastModificationTime": datetime(2017, 1, 1),
                    "DatasetImportJobArn": "arn:aws:forecast:abcdefghijkl:us-east-1:dataset-import-job/RetailDemandTRM/RetailDemandTRM_2017_01_01_00_00_00",
                },
                {
                    "LastModificationTime": datetime(2016, 1, 1),
                    "DatasetImportJobArn": "arn:2016-1-1",
                },
            ]
        },
    )

    dataset_import_job.cli = forecast_stub.client
    assert (
        dataset_import_job.arn
        == "arn:aws:forecast:abcdefghijkl:us-east-1:dataset-import-job/RetailDemandTRM/RetailDemandTRM_2017_01_01_00_00_00"
    )
Example #15
def test_config_dataset_groups(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTRMProphet", "some_bucket")
    dsgs = config.dataset_groups(dataset_file)

    assert len(dsgs) == 2
Example #16
def test_config_required_datasets_override(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("Override.csv", "some_bucket")
    required_datasets = config.required_datasets(dataset_file)
    assert "TARGET_TIME_SERIES" in required_datasets
    assert "RELATED_TIME_SERIES" in required_datasets
    assert "ITEM_METADATA" in required_datasets
Example #17
def test_config_dependent_dataset_groups(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTRMProphet", "some_bucket")

    dependents = config.dependent_dataset_groups(dataset_file)
    assert len(dependents) == 2
    assert "DatasetsFromRetailDemandTRMProphet" in dependents
Example #18
def test_dataset_group_mismatch(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("Mismatch.csv", "some_bucket")
    with pytest.raises(ValueError) as excinfo:
        config.dataset_group(dataset_file)

    assert "must match" in str(excinfo.value)
Example #19
def test_duplicate_timeseries(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandDuplicateDatasets.csv",
                               "some_bucket")
    with pytest.raises(ValueError) as excinfo:
        config.required_datasets(dataset_file)

    assert "duplicate dataset types" in str(excinfo.value)
Example #20
def test_config_predictor_from_dependent(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTRMProphet", "some_bucket")

    predictor = config.predictor(dataset_file,
                                 "DatasetsFromRetailDemandTRMProphet")
    assert (predictor.validator.expected_params["AlgorithmArn"] ==
            "arn:aws:forecast:::algorithm/CNN-QR")
Example #21
def test_missing_timeseries(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandForgottenDatasets.csv",
                               "some_bucket")
    with pytest.raises(ValueError) as excinfo:
        config.required_datasets(dataset_file)

    assert "you must configure a TARGET_TIME_SERIES dataset" in str(
        excinfo.value)
Example #22
def test_status_not_yet_created(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    forecast = config.forecast(dataset_file, "RetailDemandTNPTS")

    forecast.cli = forecast_stub.client
    forecast_stub.add_response("list_forecasts", {"Forecasts": []})

    assert forecast.status == Status.DOES_NOT_EXIST
    forecast_stub.assert_no_pending_responses()
Example #23
def test_init_forecast(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    forecast = config.forecast(dataset_file, "RetailDemandTNPTS")
    dataset_group = config.dataset_group(dataset_file)

    assert (
        forecast._dataset_group.dataset_group_name == dataset_group.dataset_group_name
    )
    assert forecast._forecast_config == config.config_item(dataset_file, "Forecast")
Example #24
def test_dataset_import_timestamp_format_none(configuration_data,
                                              forecast_stub):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTRM.csv", "some_bucket")
    dataset = config.dataset(dataset_file)

    forecast_stub.add_response("list_dataset_import_jobs",
                               {"DatasetImportJobs": []})
    dataset.cli = forecast_stub.client

    assert dataset.timestamp_format is None
Example #25
    def _copy_dataset(self, source: DatasetFileDataset) -> DatasetFileDataset:
        """
        Athena works against folders of .csv files, but not single .csv files. This copies the single file to a
        temporary location under the forecast data bucket (under /raw) so its data can be consumed properly
        :param source: DatasetFileDataset of source input
        :return: DatasetFileDataset of destination (under 'raw')
        """
        dest = source.dataset_file.copy(
            "raw", self.unique_id, str(source.dataset_file.data_type)
        )
        copied_dataset_file = DatasetFile(key=dest, bucket=source.dataset_file.bucket)
        return DatasetFileDataset(
            dataset=source.dataset, dataset_file=copied_dataset_file
        )
Example #26
def test_init_predictor(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    predictor = config.predictor(dataset_file, "RetailDemandTNPTS")

    predictor.cli = forecast_stub.client

    assert predictor._dataset_file == dataset_file
    for k, v in config.config_item(dataset_file, "Predictor").items():
        if k != "MaxAge":
            assert predictor._predictor_params.get(k) == v
Example #27
def test_dataset_status_lifecycle(configuration_data, forecast_stub):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTRM.csv", "some_bucket")
    dataset = config.dataset(dataset_file)

    forecast_stub.add_client_error("describe_dataset", "ResourceNotFoundException")
    forecast_stub.add_response("describe_dataset", {"Status": "ACTIVE"})

    dataset.cli = forecast_stub.client

    assert dataset.status == Status.DOES_NOT_EXIST
    assert dataset.status == "ACTIVE"
Example #28
    def datasets(self, dataset_file: DatasetFile) -> List[Dataset]:
        """
        Get all datasets that would be referenced by a dataset group.
        :param dataset_file: The dataset file to use
        :return: A list of all datasets that are codependent with dataset_file
        """
        required = self.required_datasets(dataset_file)
        dataset_templates = []
        for data_type in required:
            dataset_file.data_type = DatasetType[data_type]
            ds = self.dataset(dataset_file)
            dataset_templates.append(ds)

        return dataset_templates
Example #29
@pytest.fixture  # decorator assumed: this generator function is consumed as a pytest fixture
def etl_forecast_trm(sfn_configuration_data, s3_valid_files):
    """This represents a single file upload"""
    config = Config.from_sfn(sfn_configuration_data)

    with mock_sts():
        dataset_file = DatasetFile(key="train/RetailDemandTRM.csv", bucket="testbucket")
        forecast = config.forecast(dataset_file, "RetailDemandTRM")
        yield ForecastETL(
            workgroup="primary",
            schema="default",
            config=config,
            dataset_file=dataset_file,
            forecast=forecast,
        )
Example #30
    def _get_datasets(
        self,
    ) -> (
            Union[None, DatasetFileDataset],
            Union[None, DatasetFileDataset],
            Union[None, DatasetFileDataset],
    ):
        """
        Gets the datasets and dataset files associated with this forecast
        :return: (ts, rts, md)
        """
        datasets = self.config.datasets(self.dataset_file)
        prefix = f"s3://{self.dataset_file.bucket}/train/{self.dataset_file.prefix}"

        ts, rts, md = None, None, None

        for dataset in datasets:
            if dataset.dataset_type == DatasetType.TARGET_TIME_SERIES:
                ts = DatasetFileDataset(
                    dataset,
                    DatasetFile.from_s3_path(prefix +
                                             dataset.dataset_type.suffix),
                )
            elif dataset.dataset_type == DatasetType.RELATED_TIME_SERIES:
                rts = DatasetFileDataset(
                    dataset,
                    DatasetFile.from_s3_path(prefix +
                                             dataset.dataset_type.suffix),
                )
            elif dataset.dataset_type == DatasetType.ITEM_METADATA:
                md = DatasetFileDataset(
                    dataset,
                    DatasetFile.from_s3_path(prefix +
                                             dataset.dataset_type.suffix),
                )

        return (ts, rts, md)
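
For orientation, the prefix built above points at the train/ folder of the data bucket; assuming dataset_type.suffix maps to the file-name endings shown below (an assumption, not confirmed by this listing), the three inputs sit side by side under the same prefix.

# Hypothetical example keys; the suffixes are assumed, not taken from the source.
target_key = "train/RetailDemandTRM.csv"             # TARGET_TIME_SERIES
related_key = "train/RetailDemandTRM.related.csv"    # RELATED_TIME_SERIES (optional)
metadata_key = "train/RetailDemandTRM.metadata.csv"  # ITEM_METADATA (optional)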