Пример #1
0
    def transform(self, data: DataEntry) -> DataEntry:
        """
        Convert ``data[self.field]`` to a numpy array and check its rank.

        Parameters
        ----------
        data
            Entry holding the field to convert. It is modified in place.

        Returns
        -------
        DataEntry
            The same ``data`` object, with ``data[self.field]`` replaced by
            a numpy array of dtype ``self.dtype``.

        Notes
        -----
        The rank check is delegated to ``assert_data_error`` (defined
        elsewhere); presumably it raises a data error when the number of
        dimensions differs from ``self.expected_ndim`` -- confirm against
        its definition.
        """
        value = np.asarray(data[self.field], dtype=self.dtype)

        # The placeholders in the message are not filled in here; ``value``
        # and ``self`` are handed over as keyword arguments so that
        # ``assert_data_error`` can resolve them (presumably via str.format).
        assert_data_error(
            value.ndim == self.expected_ndim,
            'Input for field "{self.field}" does not have the required '
            "dimension (field: {self.field}, ndim observed: {value.ndim}, "
            "expected ndim: {self.expected_ndim})",
            value=value,
            self=self,
        )
        data[self.field] = value
        return data
Пример #2
0
 def transform(self, data: DataEntry) -> DataEntry:
     """
     Convert ``data[self.field]`` to a numpy array and check its rank.

     Parameters
     ----------
     data
         Entry holding the field to convert. It is modified in place.

     Returns
     -------
     DataEntry
         The same ``data`` object, with ``data[self.field]`` replaced by
         a numpy array of dtype ``self.dtype``.

     Notes
     -----
     The rank check is delegated to ``assert_data_error`` (defined
     elsewhere); presumably it raises a data error when the condition is
     false -- confirm against its definition.
     """
     value = data[self.field]
     if isinstance(value, (int, float, np.number)):
         # Scalars are not iterable, so the list() detour used below would
         # raise TypeError for them; convert them directly instead.
         value = np.asarray(value, dtype=self.dtype)
     else:
         # Converting the raw value directly produced "ValueError: setting
         # an array element with a sequence" on our test, hence the
         # intermediate list.
         # see https://stackoverflow.com/questions/43863748/
         value = np.asarray(list(value), dtype=self.dtype)
     # NOTE(review): the condition accepts arrays with MORE dimensions than
     # ``expected_ndim`` although the message reads like an exact
     # requirement -- confirm that ">=" is intended.
     assert_data_error(
         value.ndim >= self.expected_ndim,
         'Input for field "{self.field}" does not have the required '
         "dimension (field: {self.field}, ndim observed: {value.ndim}, "
         "expected ndim: {self.expected_ndim})",
         value=value,
         self=self,
     )
     data[self.field] = value
     return data
Пример #3
0
def _observe_static_feature(
    name: str,
    values: Any,
    observed: Optional[List[Set[Any]]],
) -> List[Set[Any]]:
    """
    Record the static feature values of one series.

    Parameters
    ----------
    name
        Name of the field; only used in the error message.
    values
        Static feature vector of the current series (empty when the series
        does not carry the field).
    observed
        Value sets collected so far, one per feature position, or ``None``
        if no series has been processed yet.

    Returns
    -------
    List[Set[Any]]
        The value sets, updated with ``values``.
    """
    if observed is None:
        # The first series fixes the number of static features.
        observed = [set() for _ in range(len(values))]

    assert_data_error(
        len(observed) == len(values),
        f"Not all {name} vectors have the same length {{}} != {{}}.",
        len(observed),
        len(values),
    )
    for position, value in enumerate(values):
        observed[position].add(value)
    return observed


def _check_dynamic_feature(
    name: str,
    feature: Any,
    num_features: Optional[int],
    target_length: Optional[int] = None,
) -> int:
    """
    Validate the dynamic feature of one series and return the feature count.

    Parameters
    ----------
    name
        Name of the field; only used in the error messages.
    feature
        Feature matrix of the current series, or ``None`` if the series
        does not carry the field.
    num_features
        Number of features seen in previous series, or ``None`` if this is
        the first series.
    target_length
        Length of the target of the current series. When given, the first
        feature row must have this length; when ``None`` the time
        dimension is not checked.

    Returns
    -------
    int
        The number of features every series is expected to have.
    """
    if feature is None:
        # Field absent: only consistent if no earlier series carried it.
        assert_data_error(
            num_features is None or num_features == 0,
            f"{name} was found for some instances but not others.",
        )
        return 0

    if num_features is None:
        # First series: it defines the expected number of features.
        num_features = len(feature)
    else:
        assert_data_error(
            num_features == len(feature),
            "Found instances with different number of features in "
            f"{name}, found one with {{}} and another with {{}}.",
            num_features,
            len(feature),
        )

    assert_data_error(
        np.all(np.isfinite(feature)),
        "Features values have to be finite and cannot exceed single "
        "precision floating point range.",
    )

    # An empty feature matrix has no first row to measure, so the time
    # dimension can only be checked when at least one feature is present.
    if target_length is not None and len(feature) > 0:
        num_time_steps = len(feature[0])
        assert_data_error(
            num_time_steps == target_length,
            f"Each feature in {name} has to have the same length as "
            f"the target. Found an instance with {name} of length {{}} "
            "and a target of length {}.",
            num_time_steps,
            target_length,
        )
    return num_features


def calculate_dataset_statistics(ts_dataset: Any) -> DatasetStatistics:
    """
    Computes the statistics of a given Dataset.

    The dataset is traversed once. For every series the target is
    summarised (ignoring NaN entries, which are counted as missing values)
    and the static and dynamic feature fields are checked for consistency
    across series.

    Parameters
    ----------
    ts_dataset
        Dataset of which to compute the statistics. It must support
        ``len()`` and iteration over its entries.

    Returns
    -------
    DatasetStatistics
        NamedTuple containing the statistics.
    """
    num_time_observations = 0
    num_time_series = 0
    max_target_length = 0
    min_target = 1e20
    max_target = -1e20
    sum_target = 0.0
    sum_abs_target = 0.0
    integer_dataset = True
    observed_feat_static_cat: Optional[List[Set[int]]] = None
    observed_feat_static_real: Optional[List[Set[float]]] = None
    num_past_feat_dynamic_real: Optional[int] = None
    num_feat_dynamic_real: Optional[int] = None
    num_feat_dynamic_cat: Optional[int] = None
    num_missing_values = 0

    scale_histogram = ScaleHistogram()

    with tqdm(enumerate(ts_dataset, start=1), total=len(ts_dataset)) as it:
        # enumerate starts at 1, so after the loop ``num_time_series`` holds
        # the number of series (and stays 0 for an empty dataset).
        for num_time_series, ts in it:

            # TARGET
            target = ts[FieldName.TARGET]
            missing = np.isnan(target)
            observed_target = target[~missing]
            num_observations = len(observed_target)

            # Counted for every series, including those made of NaN only.
            num_missing_values += int(missing.sum())

            if num_observations > 0:
                # 'nan' is handled in observed_target definition
                assert_data_error(
                    np.all(np.isfinite(observed_target)),
                    "Target values have to be finite (e.g., not inf, -inf, "
                    "or None) and cannot exceed single precision floating "
                    "point range.",
                )

                num_time_observations += num_observations
                max_target_length = max(num_observations, max_target_length)
                min_target = float(min(min_target, observed_target.min()))
                max_target = float(max(max_target, observed_target.max()))
                sum_target += float(observed_target.sum())
                sum_abs_target += float(np.abs(observed_target).sum())
                integer_dataset = integer_dataset and bool(
                    np.all(np.mod(observed_target, 1) == 0))

            # after checks for inf and None
            scale_histogram.add(observed_target)

            # FEAT_STATIC_CAT
            observed_feat_static_cat = _observe_static_feature(
                "feat_static_cat",
                ts[FieldName.FEAT_STATIC_CAT]
                if FieldName.FEAT_STATIC_CAT in ts else [],
                observed_feat_static_cat,
            )

            # FEAT_STATIC_REAL
            observed_feat_static_real = _observe_static_feature(
                "feat_static_real",
                ts[FieldName.FEAT_STATIC_REAL]
                if FieldName.FEAT_STATIC_REAL in ts else [],
                observed_feat_static_real,
            )

            # FEAT_DYNAMIC_CAT
            num_feat_dynamic_cat = _check_dynamic_feature(
                "feat_dynamic_cat",
                ts[FieldName.FEAT_DYNAMIC_CAT]
                if FieldName.FEAT_DYNAMIC_CAT in ts else None,
                num_feat_dynamic_cat,
                target_length=len(target),
            )

            # FEAT_DYNAMIC_REAL (the legacy field name is the fallback)
            feat_dynamic_real = None
            if FieldName.FEAT_DYNAMIC_REAL in ts:
                feat_dynamic_real = ts[FieldName.FEAT_DYNAMIC_REAL]
            elif FieldName.FEAT_DYNAMIC_REAL_LEGACY in ts:
                feat_dynamic_real = ts[FieldName.FEAT_DYNAMIC_REAL_LEGACY]
            num_feat_dynamic_real = _check_dynamic_feature(
                "feat_dynamic_real",
                feat_dynamic_real,
                num_feat_dynamic_real,
                target_length=len(target),
            )

            # PAST_FEAT_DYNAMIC_REAL (its length is not compared with the
            # target length)
            num_past_feat_dynamic_real = _check_dynamic_feature(
                "past_feat_dynamic_real",
                ts[FieldName.PAST_FEAT_DYNAMIC_REAL]
                if FieldName.PAST_FEAT_DYNAMIC_REAL in ts else None,
                num_past_feat_dynamic_real,
            )

    assert_data_error(num_time_series > 0, "Time series dataset is empty!")
    assert_data_error(
        num_time_observations > 0,
        "Only empty time series found in the dataset!",
    )

    # The two checks above guarantee non-zero denominators below.
    mean_target_length = num_time_observations / num_time_series
    mean_target = sum_target / num_time_observations
    mean_abs_target = sum_abs_target / num_time_observations

    # A dataset only counts as integer-valued if it is also non-negative.
    integer_dataset = integer_dataset and min_target >= 0.0

    # Internal invariant: one histogram entry was added per series.
    assert len(scale_histogram) == num_time_series

    return DatasetStatistics(
        integer_dataset=integer_dataset,
        max_target=max_target,
        mean_abs_target=mean_abs_target,
        mean_target=mean_target,
        mean_target_length=mean_target_length,
        max_target_length=max_target_length,
        min_target=min_target,
        num_missing_values=num_missing_values,
        feat_static_real=observed_feat_static_real
        if observed_feat_static_real else [],
        feat_static_cat=observed_feat_static_cat
        if observed_feat_static_cat else [],
        num_past_feat_dynamic_real=num_past_feat_dynamic_real,
        num_feat_dynamic_real=num_feat_dynamic_real,
        num_feat_dynamic_cat=num_feat_dynamic_cat,
        num_time_observations=num_time_observations,
        num_time_series=num_time_series,
        scale_histogram=scale_histogram,
    )
Пример #4
0
def calculate_dataset_statistics(ts_dataset: Any) -> DatasetStatistics:
    """
    Computes the statistics of a given Dataset.

    The dataset is traversed once. For every series the target is
    summarised (ignoring NaN entries, which are counted as missing values)
    and the ``cat`` and ``dynamic_feat`` fields are checked for consistency
    across series.

    Parameters
    ----------
    ts_dataset
        Dataset of which to compute the statistics. It must support
        ``len()`` and iteration over its entries.

    Returns
    -------
    DatasetStatistics
        NamedTuple containing the statistics.
    """
    num_time_observations = 0
    num_time_series = 0
    min_target = 1e20
    max_target = -1e20
    sum_target = 0.0
    sum_abs_target = 0.0
    integer_dataset = True
    observed_cats: Optional[List[Set[int]]] = None
    num_cats: Optional[int] = None
    num_dynamic_feat: Optional[int] = None
    num_missing_values = 0

    scale_histogram = ScaleHistogram()

    with tqdm(enumerate(ts_dataset, start=1), total=len(ts_dataset)) as it:
        # enumerate starts at 1, so after the loop ``num_time_series`` holds
        # the number of series (and stays 0 for an empty dataset).
        for num_time_series, ts in it:
            target = ts['target']
            missing = np.isnan(target)
            observed_target = target[~missing]
            cat = ts['cat'] if 'cat' in ts else []  # FIXME
            num_observations = len(observed_target)

            # Counted for every series, including those made of NaN only.
            num_missing_values += int(missing.sum())

            if num_observations > 0:
                # Validate before aggregating. NaN entries were already
                # dropped from ``observed_target``, so only +/-inf can
                # trip this check.
                assert_data_error(
                    np.all(np.isfinite(observed_target)),
                    'Target values have to be finite (e.g., not "inf", "-inf", '
                    '"nan", or null) and cannot exceed single precision floating '
                    'point range.',
                )

                num_time_observations += num_observations
                min_target = float(min(min_target, observed_target.min()))
                max_target = float(max(max_target, observed_target.max()))
                sum_target += float(observed_target.sum())
                sum_abs_target += float(np.abs(observed_target).sum())
                integer_dataset = integer_dataset and bool(
                    np.all(np.mod(observed_target, 1) == 0))

            # Added once per series, after the finiteness check.
            scale_histogram.add(observed_target)

            if num_cats is None:
                # The first series fixes the number of categorical features.
                num_cats = len(cat)
                observed_cats = [set() for _ in range(num_cats)]

            # needed to type check
            assert num_cats is not None
            assert observed_cats is not None

            assert_data_error(
                num_cats == len(cat),
                'Not all cat vectors have the same length {} != {}.',
                num_cats,
                len(cat),
            )
            for i, c in enumerate(cat):
                observed_cats[i].add(c)

            dynamic_feat = ts['dynamic_feat'] if 'dynamic_feat' in ts else None

            if dynamic_feat is None:
                # Field absent: only consistent if no earlier series
                # carried dynamic features.
                assert_data_error(
                    num_dynamic_feat is None or num_dynamic_feat == 0,
                    'dynamic_feat was found for some instances but not others.',
                )
                num_dynamic_feat = 0
            else:
                if num_dynamic_feat is None:
                    # first dynamic_feat found
                    num_dynamic_feat = dynamic_feat.shape[0]
                else:
                    assert_data_error(
                        num_dynamic_feat == dynamic_feat.shape[0],
                        'Found instances with different number of features in '
                        'dynamic_feat, found one with {} and another with {}.',
                        num_dynamic_feat,
                        dynamic_feat.shape[0],
                    )

                assert_data_error(
                    np.all(np.isfinite(dynamic_feat)),
                    'Features values have to be finite and cannot exceed single '
                    'precision floating point range.',
                )
                # The second axis is compared with the target length.
                num_dynamic_feat_time_steps = dynamic_feat.shape[1]
                assert_data_error(
                    num_dynamic_feat_time_steps == len(target),
                    'Each feature in dynamic_feat has to have the same length as '
                    'the target. Found an instance with dynamic_feat of length {} '
                    'and a target of length {}.',
                    num_dynamic_feat_time_steps,
                    len(target),
                )

    assert_data_error(num_time_series > 0, 'Time series dataset is empty!')
    assert_data_error(
        num_time_observations > 0,
        'Only empty time series found in the dataset!',
    )

    # The two checks above guarantee non-zero denominators below.
    mean_target_length = num_time_observations / num_time_series
    mean_target = sum_target / num_time_observations
    mean_abs_target = sum_abs_target / num_time_observations

    # A dataset only counts as integer-valued if it is also non-negative.
    integer_dataset = integer_dataset and min_target >= 0.0

    # Internal invariant: one histogram entry was added per series.
    assert len(scale_histogram) == num_time_series

    return DatasetStatistics(
        cats=observed_cats if observed_cats is not None else [],
        integer_dataset=integer_dataset,
        max_target=max_target,
        mean_abs_target=mean_abs_target,
        mean_target=mean_target,
        mean_target_length=mean_target_length,
        min_target=min_target,
        num_missing_values=num_missing_values,
        num_dynamic_feat=num_dynamic_feat if num_dynamic_feat else 0,
        num_time_observations=num_time_observations,
        num_time_series=num_time_series,
        scale_histogram=scale_histogram,
    )