Example #1
def test_check_timedelta(es):
    time_units = list(Timedelta._readable_units.keys())
    expanded_units = list(Timedelta._readable_units.values())
    exp_to_standard_unit = {e: t for e, t in zip(expanded_units, time_units)}
    singular_units = [u[:-1] for u in expanded_units]
    sing_to_standard_unit = {s: t for s, t in zip(singular_units, time_units)}
    to_standard_unit = merge(exp_to_standard_unit, sing_to_standard_unit)
    full_units = singular_units + expanded_units + time_units + time_units

    strings = ["2 {}".format(u) for u in singular_units + expanded_units +
               time_units]
    strings += ["2{}".format(u) for u in time_units]
    for i, s in enumerate(strings):
        unit = full_units[i]
        standard_unit = unit
        if unit in to_standard_unit:
            standard_unit = to_standard_unit[unit]

        if standard_unit == 'o':
            s = (s, 'logs')
        td = _check_timedelta(s)
        if standard_unit != 'w':
            assert td.value == 2
            assert td.unit == standard_unit
        else:
            assert td.value == 2 * 7

    td = _check_timedelta(2)
    assert td.value == 2
    assert td.unit == Timedelta._generic_unit
    td = _check_timedelta((2, 'logs'))
    assert td.value == 2
    assert td.unit == Timedelta._Observations
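For orientation, a minimal sketch of the normalization this test exercises, assuming the import path shown later in Example #11 and the older value/unit API asserted above:

# Sketch only; API and import path assumed from the surrounding examples.
from featuretools.utils.wrangle import _check_timedelta

td = _check_timedelta("2 days")      # string carrying both value and unit
assert td.value == 2 and td.unit == 'd'

td = _check_timedelta((2, 'logs'))   # observation-count timedelta
assert td.value == 2                 # td.unit is the observations unit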
Example #3
def test_has_multiple_units():
    single_unit = pd.DateOffset(months=3)
    multiple_units = pd.DateOffset(months=3, years=3, days=5)
    single_td = _check_timedelta(single_unit)
    multiple_td = _check_timedelta(multiple_units)
    assert single_td.has_multiple_units() is False
    assert multiple_td.has_multiple_units() is True
Example #4
def test_pd_dateoffset_to_timedelta_math():
    base = pd.to_datetime("2020-01-31")
    add = _check_timedelta(pd.DateOffset(months=2))
    res = base + add
    assert res == pd.to_datetime("2020-03-31")

    base_2 = pd.to_datetime("2020-01-31")
    add_2 = _check_timedelta(pd.DateOffset(months=2, days=3))
    res_2 = base_2 + add_2
    assert res_2 == pd.to_datetime("2020-04-03")

    base_3 = pd.to_datetime("2019-09-20")
    sub = _check_timedelta(pd.offsets.BDay(10))
    res_3 = base_3 - sub
    assert res_3 == pd.to_datetime("2019-09-06")
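The calendar-aware behavior above comes from pandas DateOffset semantics, which a fixed-length pd.Timedelta cannot reproduce; a standalone pandas-only comparison:

import pandas as pd

base = pd.to_datetime("2020-01-31")
print(base + pd.DateOffset(months=1))  # 2020-02-29, clamped to month end
print(base + pd.Timedelta(days=31))    # 2020-03-02, fixed-length arithmetic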
Example #5
    def query_by_values(self,
                        instance_vals,
                        variable_id=None,
                        columns=None,
                        time_last=None,
                        training_window=None):
        """Query instances that have variable with given value

        Args:
            instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
                Instance(s) to match.
            variable_id (str) : Variable to query on. If None, query on index.
            columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.TimeStamp) : Query data up to and including this
                time. Only applies if entity has a time index.
            training_window (Timedelta, optional):
                Window defining how much time before the cutoff time data
                can be used when calculating features. If None, all data before cutoff time is used.

        Returns:
            pd.DataFrame : instances that match constraints with ids in order of underlying dataframe
        """
        if not variable_id:
            variable_id = self.index

        instance_vals = self._vals_to_series(instance_vals, variable_id)

        training_window = _check_timedelta(training_window)

        if training_window is not None:
            assert training_window.has_no_observations(
            ), "Training window cannot be in observations"

        if instance_vals is None:
            df = self.df.copy()

        elif instance_vals.shape[0] == 0:
            df = self.df.head(0)

        else:
            df = self.df[self.df[variable_id].isin(instance_vals)]

            df = df.set_index(self.index, drop=False)

            # ensure filtered df has same categories as original
            # workaround for issue below
            # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
            if pdtypes.is_categorical_dtype(self.df[variable_id]):
                categories = pd.api.types.CategoricalDtype(
                    categories=self.df[variable_id].cat.categories)
                df[variable_id] = df[variable_id].astype(categories)

        df = self._handle_time(df=df,
                               time_last=time_last,
                               training_window=training_window)

        if columns is not None:
            df = df[columns]

        return df
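A hypothetical call against this method; the entity set and entity names are illustrative, not from the original example:

# Hypothetical usage; `es` and its 'log' entity are assumed, not given above.
import pandas as pd

recent = es['log'].query_by_values(
    instance_vals=[1, 2, 3],
    time_last=pd.Timestamp("2020-01-01"),
    training_window="30 days",  # normalized internally by _check_timedelta
)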
Example #6
def test_relative_month():
    td_time = "1 month"
    td = _check_timedelta(td_time)
    assert td.get_value('mo') == 1
    assert isinstance(td.delta_obj, relativedelta)

    time = pd.to_datetime('2020-01-31')
    assert time + td == pd.to_datetime('2020-02-29')

    td_time = "6 months"
    td = _check_timedelta(td_time)
    assert td.get_value('mo') == 6
    assert isinstance(td.delta_obj, relativedelta)

    time = pd.to_datetime('2020-01-31')
    assert time + td == pd.to_datetime('2020-07-31')
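The same month-end clamping can be checked with dateutil directly, since delta_obj here is a relativedelta:

import pandas as pd
from dateutil.relativedelta import relativedelta

# relativedelta clamps to the last valid day of the target month
assert pd.to_datetime('2020-01-31') + relativedelta(months=1) == pd.to_datetime('2020-02-29')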
Example #7
    def __init__(self, base_features, parent_entity, primitive, use_previous=None,
                 where=None):
        if hasattr(base_features, '__iter__'):
            base_features = [_check_feature(bf) for bf in base_features]
            msg = "all base features must share the same entity"
            assert len(set([bf.entity for bf in base_features])) == 1, msg
        else:
            base_features = [_check_feature(base_features)]

        self.child_entity = base_features[0].entity
        self.parent_entity = parent_entity.entityset.metadata[parent_entity.id]

        if where is not None:
            self.where = _check_feature(where)
            msg = "Where feature must be defined on child entity {}".format(
                self.child_entity.id)
            assert self.where.entity.id == self.child_entity.id, msg

        if use_previous:
            assert self.child_entity.time_index is not None, (
                "Applying function that requires time index to entity that "
                "doesn't have one")

            self.use_previous = _check_timedelta(use_previous)
            assert len(base_features) > 0
            time_index = base_features[0].entity.time_index
            time_col = base_features[0].entity[time_index]
            assert time_index is not None, ("Use previous can only be defined "
                                            "on entities with a time index")
            assert _check_time_against_column(self.use_previous, time_col)

        super(AggregationFeature, self).__init__(parent_entity,
                                                 base_features,
                                                 primitive=primitive)
Example #8
    def _handle_time(self, df, time_last=None, training_window=None):
        """
        Filter a dataframe for all instances before time_last.
        If this entity does not have a time index, return the original
        dataframe.
        """
        if self.time_index:
            if time_last is not None and not df.empty:
                df = df[df[self.time_index] <= time_last]
                if training_window is not None:
                    training_window = _check_timedelta(training_window)
                    mask = df[self.time_index] >= time_last - training_window
                    if self.last_time_index is not None:
                        lti_slice = self.last_time_index.reindex(df.index)
                        lti_mask = lti_slice >= time_last - training_window
                        mask = mask | lti_mask
                    else:
                        logger.warning(
                            "Using training_window but last_time_index is "
                            "not set on entity %s" % (self.id))
                    df = df[mask]

        for secondary_time_index, columns in self.secondary_time_index.items():
            # should we use ignore time last here?
            if time_last is not None and not df.empty:
                mask = df[secondary_time_index] >= time_last
                df.loc[mask, columns] = np.nan

        return df
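Stripped of the last_time_index handling, the window filter above reduces to a plain pandas mask; a self-contained sketch:

import pandas as pd

df = pd.DataFrame({'time': pd.to_datetime(['2020-01-01', '2020-06-01', '2020-12-31'])})
time_last = pd.Timestamp('2020-12-31')
window = pd.Timedelta(days=90)
# keep rows at or before time_last and within the training window
print(df[(df['time'] <= time_last) & (df['time'] >= time_last - window)])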
Example #9
    def __init__(self, entity, base_features, **kwargs):
        assert all(isinstance(f, PrimitiveBase) for f in base_features), \
            "All base features must be features"
        if len(set([bf.hash() for bf in base_features])) != len(base_features):
            raise ValueError(u"Duplicate base features ({}): {}".format(
                self.__class__, base_features))

        self.entity_id = entity.id
        self.entityset = entity.entityset

        # P TODO: where should this logic go?
        # not all primitives support use previous so doesn't make sense to have
        # in base
        if self.use_previous:
            self.use_previous = _check_timedelta(self.use_previous)
            assert len(self.base_features) > 0
            time_index = self.base_features[0].entity.time_index
            time_col = self.base_features[0].entity[time_index]
            assert time_index is not None, ("Use previous can only be defined "
                                            "on entities with a time index")
            assert _check_time_against_column(self.use_previous, time_col)

        self.base_features = base_features
        # variable type can be declared or inferred from first base feature
        self.additional_attributes = kwargs

        assert self._check_input_types(), ("Provided inputs don't match input "
                                           "type requirements")
        super(PrimitiveBase, self).__init__(**kwargs)
Example #10
    def __init__(self, entity, base_features, **kwargs):
        assert all(isinstance(f, PrimitiveBase) for f in base_features), \
            "All base features must be features"
        if len(set([bf.hash() for bf in base_features])) != len(base_features):
            raise ValueError(u"Duplicate base features ({}): {}".format(
                self.__class__, base_features))

        self.entity_id = entity.id
        self.entityset = entity.entityset.metadata

        # P TODO: where should this logic go?
        # not all primitives support use previous so doesn't make sense to have
        # in base
        if self.use_previous:
            self.use_previous = _check_timedelta(self.use_previous)
            assert len(self.base_features) > 0
            time_index = self.base_features[0].entity.time_index
            time_col = self.base_features[0].entity[time_index]
            assert time_index is not None, ("Use previous can only be defined "
                                            "on entities with a time index")
            assert _check_time_against_column(self.use_previous, time_col)

        self.base_features = base_features
        # variable type can be declared or inferred from first base feature
        self.additional_attributes = kwargs

        assert self._check_input_types(), ("Provided inputs don't match input "
                                           "type requirements")
        super(PrimitiveBase, self).__init__(**kwargs)
Example #11
    def __init__(self, value, unit=None):
        """
        Args:
            value (float, str) : Value of timedelta, or string providing
                both unit and value.
            unit (str) : Unit of time delta.
        """
        # TODO: check if value is int or float
        if is_string(value):
            from featuretools.utils.wrangle import _check_timedelta
            td = _check_timedelta(value)
            value, unit = td.value, td.unit

        self.value = value
        self._original_unit = None  # records the original unit when it is converted to days, so get_name can report it
        unit = self._check_unit_plural(unit)
        assert unit in self._readable_units or unit in self._readable_to_unit
        if unit in self._readable_to_unit:
            unit = self._readable_to_unit[unit]

        # convert week-based units to days
        if unit in self._convert_to_days:
            self._original_unit = unit
            self.value = self.value * self._convert_to_days[unit]
            unit = 'd'

        self.unit = unit
        self.delta_obj = self.get_unit_type()
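Given the conversion branch above, a week-denominated value is observable only through _original_unit; a sketch assuming this version of the class:

# Sketch against the Timedelta variant above; import path varies by version.
td = Timedelta(2, 'w')           # weeks are folded into days on construction
assert td.unit == 'd' and td.value == 14
assert td._original_unit == 'w'  # kept so get_name can still report weeks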
Example #12
    def query_by_values(self,
                        instance_vals,
                        variable_id=None,
                        columns=None,
                        time_last=None,
                        training_window=None):
        """Query instances that have variable with given value

        Args:
            instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
                Instance(s) to match.
            variable_id (str) : Variable to query on. If None, query on index.
            columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.TimeStamp) : Query data up to and including this
                time. Only applies if entity has a time index.
            training_window (Timedelta, optional):
                Data older than time_last by more than this will be ignored

        Returns:
            pd.DataFrame : instances that match constraints with ids in order of underlying dataframe
        """
        instance_vals = self._vals_to_series(instance_vals, variable_id)

        training_window = _check_timedelta(training_window)
        if training_window is not None:
            assert (isinstance(training_window, Timedelta) and
                    training_window.is_absolute()),\
                "training window must be an absolute Timedelta"

        if instance_vals is None:
            df = self.df.copy()

        elif instance_vals.shape[0] == 0:
            df = self.df.head(0)

        elif variable_id is None or variable_id == self.index:
            df = self.df.reindex(instance_vals)
            df.dropna(subset=[self.index], inplace=True)

        else:
            df = self.df[self.df[variable_id].isin(instance_vals)]

            df = df.set_index(self.index, drop=False)

            # ensure filtered df has same categories as original
            # workaround for issue below
            # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
            if pdtypes.is_categorical_dtype(self.df[variable_id]):
                categories = pd.api.types.CategoricalDtype(
                    categories=self.df[variable_id].cat.categories)
                df[variable_id] = df[variable_id].astype(categories)

        df = self._handle_time(df=df,
                               time_last=time_last,
                               training_window=training_window)

        if columns is not None:
            df = df[columns]

        return df
Example #13
def bin_cutoff_times(cutoff_time, bin_size):
    binned_cutoff_time = cutoff_time.copy()
    if isinstance(bin_size, int):
        binned_cutoff_time['time'] = binned_cutoff_time['time'].apply(lambda x: x // bin_size * bin_size)  # floor to the bin boundary
    else:
        bin_size = _check_timedelta(bin_size).get_pandas_timedelta()
        binned_cutoff_time['time'] = datetime_round(binned_cutoff_time['time'], bin_size)
    return binned_cutoff_time
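The integer branch floors each value to a bin boundary (note the integer division); the same idiom on plain ints:

# The floor-to-bin idiom from the integer branch above.
bin_size = 10
print([v // bin_size * bin_size for v in (3, 17, 25)])  # [0, 10, 20]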
Example #14
def test_relative_year():
    td_time = "1 years"
    td = _check_timedelta(td_time)
    assert td.get_value("Y") == 1
    assert isinstance(td.delta_obj, relativedelta)

    time = pd.to_datetime('2020-02-29')
    assert time + td == pd.to_datetime('2021-02-28')
Example #15
def bin_cutoff_times(cutoff_time, bin_size):
    binned_cutoff_time = cutoff_time.copy()
    if isinstance(bin_size, int):
        binned_cutoff_time['time'] = binned_cutoff_time['time'].apply(lambda x: x // bin_size * bin_size)  # floor to the bin boundary
    else:
        bin_size = _check_timedelta(bin_size).get_pandas_timedelta()
        binned_cutoff_time['time'] = datetime_round(binned_cutoff_time['time'], bin_size)
    return binned_cutoff_time
Example #16
    def check_value(self, value, unit):
        if isinstance(value, str):
            from featuretools.utils.wrangle import _check_timedelta
            td = _check_timedelta(value)
            self.times = td.times
        elif isinstance(value, dict):
            self.times = value
        else:
            self.times = {unit: value}
Example #17
def test_pd_dateoffset_to_timedelta():
    single_temporal = pd.DateOffset(months=3)
    single_td = _check_timedelta(single_temporal)
    assert single_td.get_value('mo') == 3
    assert single_td.delta_obj == pd.DateOffset(months=3)

    mult_temporal = pd.DateOffset(years=10, months=3, days=5)
    mult_td = _check_timedelta(mult_temporal)
    expected = {'Y': 10, 'mo': 3, 'd': 5}
    assert mult_td.get_value() == expected
    assert mult_td.delta_obj == mult_temporal
    # get_name() for multiple values is not deterministic
    assert len(mult_td.get_name()) == len("10 Years 3 Months 5 Days")

    special_dateoffset = pd.offsets.BDay(100)
    special_td = _check_timedelta(special_dateoffset)
    assert special_td.get_value("businessdays") == 100
    assert special_td.delta_obj == special_dateoffset
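The BDay offset in the last assertion is ordinary pandas; a quick standalone check mirroring Example #4:

import pandas as pd

# 2019-09-20 is a Friday; ten business days earlier is 2019-09-06
print(pd.to_datetime("2019-09-20") - pd.offsets.BDay(10))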
Example #18
    def query_by_values(self,
                        instance_vals,
                        variable_id=None,
                        columns=None,
                        time_last=None,
                        training_window=None):
        """Query instances that have variable with given value

        Args:
            instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
                Instance(s) to match.
            variable_id (str) : Variable to query on. If None, query on index.
            columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.TimeStamp) : Query data up to and including this
                time. Only applies if entity has a time index.
            training_window (Timedelta, optional):
                Data older than time_last by more than this will be ignored

        Returns:
            pd.DataFrame : instances that match constraints
        """
        instance_vals = self._vals_to_series(instance_vals, variable_id)

        training_window = _check_timedelta(training_window)
        if training_window is not None:
            assert (isinstance(training_window, Timedelta) and
                    training_window.is_absolute()),\
                "training window must be an absolute Timedelta"

        if instance_vals is None:
            df = self.df

        elif instance_vals.shape[0] == 0:
            df = self.df.head(0)

        elif variable_id is None or variable_id == self.index:
            df = self.df.reindex(instance_vals)
            df.dropna(subset=[self.index], inplace=True)

        else:
            df = self.df.merge(instance_vals.to_frame(),
                               how="inner",
                               left_on=variable_id,
                               right_on=variable_id).set_index(self.index,
                                                               drop=False)

            # ensure filtered df has same categories as original
            if pdtypes.is_categorical_dtype(self.df[variable_id]):
                categories = pd.api.types.CategoricalDtype(
                    categories=self.df[variable_id].cat.categories)
                df[variable_id] = df[variable_id].astype(categories)

        return self._filter_and_sort(df=df,
                                     time_last=time_last,
                                     training_window=training_window,
                                     columns=columns)
Example #19
def bin_cutoff_times(cutoff_time, bin_size):
    binned_cutoff_time = cutoff_time.ww.copy()
    if isinstance(bin_size, int):
        binned_cutoff_time["time"] = binned_cutoff_time["time"].apply(
            lambda x: x // bin_size * bin_size)  # floor to the bin boundary
    else:
        bin_size = _check_timedelta(bin_size)
        binned_cutoff_time["time"] = datetime_round(binned_cutoff_time["time"],
                                                    bin_size)
    return binned_cutoff_time
Example #20
    def query_by_values(self, instance_vals, variable_id=None, columns=None,
                        time_last=None, training_window=None):
        """Query instances that have variable with given value

        Args:
            instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
                Instance(s) to match.
            variable_id (str) : Variable to query on. If None, query on index.
            columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.TimeStamp) : Query data up to and including this
                time. Only applies if entity has a time index.
            training_window (Timedelta, optional):
                Data older than time_last by more than this will be ignored

        Returns:
            pd.DataFrame : instances that match constraints
        """
        instance_vals = self._vals_to_series(instance_vals, variable_id)

        training_window = _check_timedelta(training_window)
        if training_window is not None:
            assert (isinstance(training_window, Timedelta) and
                    training_window.is_absolute()),\
                "training window must be an absolute Timedelta"

        if instance_vals is None:
            df = self.df.copy()

        elif instance_vals.shape[0] == 0:
            df = self.df.head(0)

        elif variable_id is None or variable_id == self.index:
            df = self.df.reindex(instance_vals)
            df.dropna(subset=[self.index], inplace=True)

        else:
            df = self.df.merge(instance_vals.to_frame(variable_id),
                               how="inner", on=variable_id)
            df = df.set_index(self.index, drop=False)

            # ensure filtered df has same categories as original
            # workaround for issue below
            # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
            if pdtypes.is_categorical_dtype(self.df[variable_id]):
                categories = pd.api.types.CategoricalDtype(categories=self.df[variable_id].cat.categories)
                df[variable_id] = df[variable_id].astype(categories)

        df = self._handle_time(df=df,
                               time_last=time_last,
                               training_window=training_window)

        if columns is not None:
            df = df[columns]

        return df
Example #21
    def query_by_values(self, instance_vals, variable_id=None, columns=None,
                        time_last=None, training_window=None):
        """Query instances that have variable with given value

        Args:
            instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
                Instance(s) to match.
            variable_id (str) : Variable to query on. If None, query on index.
            columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.TimeStamp) : Query data up to and including this
                time. Only applies if entity has a time index.
            training_window (Timedelta, optional):
                Data older than time_last by more than this will be ignored

        Returns:
            pd.DataFrame : instances that match constraints
        """
        instance_vals = self._vals_to_series(instance_vals, variable_id)

        training_window = _check_timedelta(training_window)
        if training_window is not None:
            assert (isinstance(training_window, Timedelta) and
                    training_window.is_absolute()),\
                "training window must be an absolute Timedelta"

        if instance_vals is None:
            df = self.df

        elif variable_id is None or variable_id == self.index:
            df = self.df.reindex(instance_vals)
            df.dropna(subset=[self.index], inplace=True)

        elif variable_id in self.indexed_by:
            # some variables are indexed ahead of time
            index = self.indexed_by[variable_id]

            # generate pd.Series of all values from the index. Indexing
            # is much faster on this type.
            to_append = [pd.Series(index[v]) for v in instance_vals
                         if v in index]
            my_id_vals = pd.Series([]).append(to_append)
            df = self.df.loc[my_id_vals]

        else:
            # filter by "row.variable_id IN instance_vals"
            mask = self.df[variable_id].isin(instance_vals)
            df = self.df[mask]

        return self._filter_and_sort(df=df,
                                     time_last=time_last,
                                     training_window=training_window,
                                     columns=columns)
Example #22
    def __init__(
        self,
        base_features,
        parent_dataframe_name,
        primitive,
        relationship_path=None,
        use_previous=None,
        where=None,
        name=None,
    ):
        base_features = _validate_base_features(base_features)

        for bf in base_features:
            if bf.number_output_features > 1:
                raise ValueError("Cannot stack on whole multi-output feature.")

        self.child_dataframe_name = base_features[0].dataframe_name
        entityset = base_features[0].entityset
        relationship_path, self._path_is_unique = self._handle_relationship_path(
            entityset, parent_dataframe_name, relationship_path
        )

        self.parent_dataframe_name = parent_dataframe_name

        if where is not None:
            self.where = _validate_base_features(where)[0]
            msg = "Where feature must be defined on child dataframe {}".format(
                self.child_dataframe_name
            )
            assert self.where.dataframe_name == self.child_dataframe_name, msg

        if use_previous:
            assert entityset[self.child_dataframe_name].ww.time_index is not None, (
                "Applying function that requires time index to dataframe that "
                "doesn't have one"
            )
            self.use_previous = _check_timedelta(use_previous)
            assert len(base_features) > 0
            time_index = base_features[0].dataframe.ww.time_index
            time_col = base_features[0].dataframe.ww[time_index]
            assert time_index is not None, (
                "Use previous can only be defined " "on dataframes with a time index"
            )
            assert _check_time_against_column(self.use_previous, time_col)

        super(AggregationFeature, self).__init__(
            dataframe=entityset[parent_dataframe_name],
            base_features=base_features,
            relationship_path=relationship_path,
            primitive=primitive,
            name=name,
        )
Example #23
    def _handle_time(self, entity_id, df, time_last=None, training_window=None, include_cutoff_time=True):
        """
        Filter a dataframe for all instances before time_last.
        If the DataTable does not have a time index, return the original
        dataframe.
        """
        dt = self[entity_id]
        if is_instance(df, ks, 'DataFrame') and isinstance(time_last, np.datetime64):
            time_last = pd.to_datetime(time_last)
        if dt.time_index:
            df_empty = df.empty if isinstance(df, pd.DataFrame) else False
            if time_last is not None and not df_empty:
                if include_cutoff_time:
                    df = df[df[dt.time_index] <= time_last]
                else:
                    df = df[df[dt.time_index] < time_last]
                if training_window is not None:
                    training_window = _check_timedelta(training_window)
                    if include_cutoff_time:
                        mask = df[dt.time_index] > time_last - training_window
                    else:
                        mask = df[dt.time_index] >= time_last - training_window
                    if dt.last_time_index is not None:
                        lti_slice = dt.last_time_index.reindex(df.index)
                        if include_cutoff_time:
                            lti_mask = lti_slice > time_last - training_window
                        else:
                            lti_mask = lti_slice >= time_last - training_window
                        mask = mask | lti_mask
                    else:
                        warnings.warn(
                            "Using training_window but last_time_index is "
                            "not set on entity %s" % (dt.id)
                        )

                    df = df[mask]

        for secondary_time_index, columns in dt.secondary_time_index.items():
            # should we use ignore time last here?
            df_empty = df.empty if isinstance(df, pd.DataFrame) else False
            if time_last is not None and not df_empty:
                mask = df[secondary_time_index] >= time_last
                if isinstance(df, dd.DataFrame):
                    for col in columns:
                        df[col] = df[col].mask(mask, np.nan)
                elif is_instance(df, ks, 'DataFrame'):
                    df.loc[mask, columns] = None
                else:
                    df.loc[mask, columns] = np.nan

        return df
Example #24
    def __init__(self,
                 base_features,
                 parent_entity,
                 primitive,
                 relationship_path=None,
                 use_previous=None,
                 where=None,
                 name=None):
        if hasattr(base_features, '__iter__'):
            base_features = [_check_feature(bf) for bf in base_features]
            msg = "all base features must share the same entity"
            assert len(set([bf.entity for bf in base_features])) == 1, msg
        else:
            base_features = [_check_feature(base_features)]

        for bf in base_features:
            if bf.number_output_features > 1:
                raise ValueError("Cannot stack on whole multi-output feature.")

        self.child_entity = base_features[0].entity

        relationship_path, self._path_is_unique = \
            self._handle_relationship_path(parent_entity, relationship_path)

        self.parent_entity = parent_entity.entityset.metadata[parent_entity.id]

        if where is not None:
            self.where = _check_feature(where)
            msg = "Where feature must be defined on child entity {}".format(
                self.child_entity.id)
            assert self.where.entity.id == self.child_entity.id, msg

        if use_previous:
            assert self.child_entity.time_index is not None, (
                "Applying function that requires time index to entity that "
                "doesn't have one")

            self.use_previous = _check_timedelta(use_previous)
            assert len(base_features) > 0
            time_index = base_features[0].entity.time_index
            time_col = base_features[0].entity[time_index]
            assert time_index is not None, ("Use previous can only be defined "
                                            "on entities with a time index")
            assert _check_time_against_column(self.use_previous, time_col)

        super(AggregationFeature,
              self).__init__(entity=parent_entity,
                             base_features=base_features,
                             relationship_path=relationship_path,
                             primitive=primitive,
                             name=name)
Example #25
    def __init__(self,
                 value,
                 unit=None,
                 entity=None,
                 data=None,
                 inclusive=False):
        """
        Args:
            value (float, str) : Value of timedelta, or string providing
                both unit and value.
            unit (str) : Unit of time delta.
            entity (str, optional) : Entity id to use if unit equals
                "observations".
            data (pd.Series, optional) : series of timestamps to use
                with observations. Can be calculated later.
            inclusive (bool, optional) : if True, include events that are
                exactly timedelta distance away from the original time/observation
        """
        # TODO: check if value is int or float
        if isinstance(value, basestring):
            from featuretools.utils.wrangle import _check_timedelta
            td = _check_timedelta(value)
            value, unit = td.value, td.unit

        self.value = value
        self._original_unit = None  # records the original unit when it is converted to days, so get_name can report it
        unit = self._check_unit_plural(unit)
        assert unit in self._readable_units or unit in self._readable_to_unit
        if unit in self._readable_to_unit:
            unit = self._readable_to_unit[unit]

        # convert week-based units to days
        if unit in self._convert_to_days:
            self._original_unit = unit
            self.value = self.value * self._convert_to_days[unit]
            unit = 'd'

        self.unit = unit

        if unit == self._Observations and entity is None:
            raise Exception("Must define entity to use %s as unit" % (unit))

        self.entity = entity
        self.data = data

        self.inclusive = inclusive
Example #26
    def __init__(self,
                 base_feature,
                 group_feature,
                 time_index=None,
                 where=None,
                 use_previous=None):
        """Summary

        Args:
            agg_feature (type): subclass of :class:`.AggregationPrimitive`;
                aggregation method being used.  This is passed by the
                constructors of the cumfeat subclasses
            base_feature (:class:`.PrimitiveBase` or :class:`.Variable`): Feature
                or variable calculated on
            group_feature (:class:`.PrimitiveBase` or :class:`.Variable`): Feature
                or variable used to group the rows before computation
            where (optional[:class:`.PrimitiveBase`]):
            use_previous (optional[:class:`.Timedelta`]):
        """
        self.return_type = self.agg_feature.return_type

        base_feature = self._check_feature(base_feature)

        td_entity_id = None
        if isinstance(use_previous, basestring):
            td_entity_id = base_feature.entity.id
        self.use_previous = _check_timedelta(use_previous,
                                             entity_id=td_entity_id)

        group_feature = self._check_feature(group_feature)
        self.group_feature = group_feature

        self.base_features = [base_feature, group_feature]

        if time_index is None:
            entity = base_feature.entity
            time_index = IdentityFeature(entity[entity.time_index])
        self.base_features += [time_index]

        if where is not None:
            self.where = where

        super(CumFeature, self).__init__(*self.base_features)
Example #27
    def __init__(self, value, unit=None, entity=None, data=None, inclusive=False):
        """
        Args:
            value (float, str) : Value of timedelta, or string providing
                both unit and value.
            unit (str) : Unit of time delta.
            entity (str, optional) : Entity id to use if unit equals
                "observations".
            data (pd.Series, optional) : series of timestamps to use
                with observations. Can be calculated later.
            inclusive (bool, optional) : if True, include events that are
                exactly timedelta distance away from the original time/observation
        """
        # TODO: check if value is int or float
        if isinstance(value, basestring):
            from featuretools.utils.wrangle import _check_timedelta
            td = _check_timedelta(value)
            value, unit = td.value, td.unit

        self.value = value
        self._original_unit = None  # records the original unit when it is converted to days, so get_name can report it
        unit = self._check_unit_plural(unit)
        assert unit in self._readable_units or unit in self._readable_to_unit
        if unit in self._readable_to_unit:
            unit = self._readable_to_unit[unit]

        # convert week-based units to days
        if unit in self._convert_to_days:
            self._original_unit = unit
            self.value = self.value * self._convert_to_days[unit]
            unit = 'd'

        self.unit = unit

        if unit == self._Observations and entity is None:
            raise Exception("Must define entity to use %s as unit" % (unit))

        self.entity = entity
        self.data = data

        self.inclusive = inclusive
Example #28
    def __init__(self, base_feature, group_feature, time_index=None,
                 where=None, use_previous=None):
        """Summary

        Args:
            agg_feature (type): subclass of :class:`.AggregationPrimitive`;
                aggregation method being used.  This is passed by the
                constructors of the cumfeat subclasses
            base_feature (:class:`.PrimitiveBase` or :class:`.Variable`): Feature
                or variable calculated on
            group_feature (:class:`.PrimitiveBase` or :class:`.Variable`): Feature
                or variable used to group the rows before computation
            where (optional[:class:`.PrimitiveBase`]):
            use_previous (optional[:class:`.Timedelta`]):
        """
        self.return_type = self.agg_feature.return_type

        base_feature = self._check_feature(base_feature)

        td_entity_id = None
        if isinstance(use_previous, basestring):
            td_entity_id = base_feature.entity.id
        self.use_previous = _check_timedelta(
            use_previous, entity_id=td_entity_id)

        group_feature = self._check_feature(group_feature)
        self.group_feature = group_feature

        self.base_features = [base_feature, group_feature]

        if time_index is None:
            entity = base_feature.entity
            time_index = IdentityFeature(entity[entity.time_index])
        self.base_features += [time_index]

        if where is not None:
            self.where = where

        super(CumFeature, self).__init__(*self.base_features)
Example #29
def test_check_timedelta(es):
    time_units = list(Timedelta._readable_units.keys())
    expanded_units = list(Timedelta._readable_units.values())
    exp_to_standard_unit = {e: t for e, t in zip(expanded_units, time_units)}
    singular_units = [u[:-1] for u in expanded_units]
    sing_to_standard_unit = {s: t for s, t in zip(singular_units, time_units)}
    to_standard_unit = {}
    to_standard_unit.update(exp_to_standard_unit)
    to_standard_unit.update(sing_to_standard_unit)
    full_units = singular_units + expanded_units + time_units + time_units

    strings = ["2 {}".format(u) for u in singular_units + expanded_units +
               time_units]
    strings += ["2{}".format(u) for u in time_units]
    for i, s in enumerate(strings):
        unit = full_units[i]
        standard_unit = unit
        if unit in to_standard_unit:
            standard_unit = to_standard_unit[unit]

        td = _check_timedelta(s)
        assert td.get_value(standard_unit) == 2
Example #30
    def query_by_values(self,
                        instance_vals,
                        variable_id=None,
                        columns=None,
                        time_last=None,
                        training_window=None,
                        return_sorted=False,
                        start=None,
                        end=None,
                        random_seed=None,
                        shuffle=False):
        """Query instances that have variable with given value

        Args:
            instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
                Instance(s) to match
            variable_id (str) : Variable to query on. If None, query on index.
            columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.TimeStamp) : Query data up to and including this
                time. Only applies if entity has a time index.
            training_window (dict[str-> :class:`Timedelta`] or :class:`Timedelta`, optional):
                Data older than time_last by more than this will be ignored
            return_sorted (bool) : Return instances in the same order as
                the instance_vals are passed.
            start (int) : If provided, only return instances equal to or after this index
            end (int) : If provided, only return instances before this index
            random_seed (int) : Provided to the shuffling procedure
            shuffle (bool) : If True, values will be shuffled before returning

        Returns:
            pd.DataFrame : instances that match constraints
        """
        instance_vals = self._vals_to_series(instance_vals, variable_id)

        training_window = _check_timedelta(training_window)
        if training_window is not None:
            assert (isinstance(training_window, Timedelta) and
                    training_window.is_absolute()),\
                "training window must be an absolute Timedelta"

        if instance_vals is None:
            df = self.df

        elif variable_id is None or variable_id == self.index:
            df = self.df.loc[instance_vals]
            df.dropna(subset=[self.index], inplace=True)

        elif variable_id in self.indexed_by:
            # some variables are indexed ahead of time
            index = self.indexed_by[variable_id]

            # generate pd.Series of all values from the index. Indexing
            # is much faster on this type.
            to_append = [
                pd.Series(index[v]) for v in instance_vals if v in index
            ]
            my_id_vals = pd.Series([]).append(to_append)
            df = self.df.loc[my_id_vals]

        else:
            # filter by "row.variable_id IN instance_vals"
            mask = self.df[variable_id].isin(instance_vals)
            df = self.df[mask]

        sortby = variable_id if (return_sorted and not shuffle) else None
        return self._filter_and_sort(df=df,
                                     time_last=time_last,
                                     training_window=training_window,
                                     columns=columns,
                                     sortby=sortby,
                                     start=start,
                                     end=end,
                                     shuffle=shuffle,
                                     random_seed=random_seed)
Example #31
def test_check_pd_timedelta(es):
    pdtd = pd.Timedelta(5, 'm')
    td = _check_timedelta(pdtd)
    assert td.get_value('s') == 300
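The 300-second result is plain pandas unit conversion, independent of featuretools:

import pandas as pd
assert pd.Timedelta(minutes=5).total_seconds() == 300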
Example #32
def test_check_pd_timedelta(es):
    pdtd = pd.Timedelta(5, 'm')
    td = _check_timedelta(pdtd)
    assert td.unit == 's'
    assert td.value == 300
Example #33
    def query_by_values(self, entity_id, instance_vals, variable_id=None, columns=None,
                        time_last=None, training_window=None, include_cutoff_time=True):
        """Query instances that have variable with given value

        Args:
            entity_id (str): The id of the entity to query
            instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
                Instance(s) to match.
            variable_id (str) : Variable to query on. If None, query on index.
            columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.TimeStamp) : Query data up to and including this
                time. Only applies if entity has a time index.
            training_window (Timedelta, optional):
                Window defining how much time before the cutoff time data
                can be used when calculating features. If None, all data before cutoff time is used.
            include_cutoff_time (bool):
                If True, data at cutoff time are included in calculating features

        Returns:
            pd.DataFrame : instances that match constraints with ids in order of underlying dataframe
        """
        entity = self[entity_id]
        if not variable_id:
            variable_id = entity.index

        instance_vals = _vals_to_series(instance_vals, variable_id)

        training_window = _check_timedelta(training_window)

        if training_window is not None:
            assert training_window.has_no_observations(), "Training window cannot be in observations"

        if instance_vals is None:
            df = entity.df.copy()

        elif isinstance(instance_vals, pd.Series) and instance_vals.empty:
            df = entity.df.head(0)

        else:
            if is_instance(instance_vals, (dd, ks), 'Series'):
                df = entity.df.merge(instance_vals.to_frame(), how="inner", on=variable_id)
            elif isinstance(instance_vals, pd.Series) and is_instance(entity.df, ks, 'DataFrame'):
                df = entity.df.merge(ks.DataFrame({variable_id: instance_vals}), how="inner", on=variable_id)
            else:
                df = entity.df[entity.df[variable_id].isin(instance_vals)]

            if isinstance(entity.df, pd.DataFrame):
                df = df.set_index(entity.index, drop=False)

            # ensure filtered df has same categories as original
            # workaround for issue below
            # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
            if pdtypes.is_categorical_dtype(entity.df[variable_id]):
                categories = pd.api.types.CategoricalDtype(categories=entity.df[variable_id].cat.categories)
                df[variable_id] = df[variable_id].astype(categories)

        df = self._handle_time(entity_id=entity_id,
                               df=df,
                               time_last=time_last,
                               training_window=training_window,
                               include_cutoff_time=include_cutoff_time)

        if columns is not None:
            df = df[columns]

        return df
Example #34
def test_check_pd_timedelta(es):
    pdtd = pd.Timedelta(5, "m")
    td = _check_timedelta(pdtd)
    assert td.get_value("s") == 300
Example #35
    def query_by_values(self, instance_vals, variable_id=None, columns=None,
                        time_last=None, training_window=None,
                        return_sorted=False, start=None, end=None,
                        random_seed=None, shuffle=False):
        """Query instances that have variable with given value

        Args:
            instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
                Instance(s) to match.
            variable_id (str) : Variable to query on. If None, query on index.
            columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.TimeStamp) : Query data up to and including this
                time. Only applies if entity has a time index.
            training_window (Timedelta, optional):
                Data older than time_last by more than this will be ignored
            return_sorted (bool) : Return instances in the same order as
                the instance_vals are passed.
            start (int) : If provided, only return instances equal to or after this index.
            end (int) : If provided, only return instances before this index.
            random_seed (int) : Provided to the shuffling procedure.
            shuffle (bool) : If True, values will be shuffled before returning.

        Returns:
            pd.DataFrame : instances that match constraints
        """
        instance_vals = self._vals_to_series(instance_vals, variable_id)

        training_window = _check_timedelta(training_window)
        if training_window is not None:
            assert (isinstance(training_window, Timedelta) and
                    training_window.is_absolute()),\
                "training window must be an absolute Timedelta"

        if instance_vals is None:
            df = self.df

        elif variable_id is None or variable_id == self.index:
            df = self.df.reindex(instance_vals)
            df.dropna(subset=[self.index], inplace=True)

        elif variable_id in self.indexed_by:
            # some variables are indexed ahead of time
            index = self.indexed_by[variable_id]

            # generate pd.Series of all values from the index. Indexing
            # is much faster on this type.
            to_append = [pd.Series(index[v]) for v in instance_vals
                         if v in index]
            my_id_vals = pd.Series([]).append(to_append)
            df = self.df.loc[my_id_vals]

        else:
            # filter by "row.variable_id IN instance_vals"
            mask = self.df[variable_id].isin(instance_vals)
            df = self.df[mask]

        sortby = variable_id if (return_sorted and not shuffle) else None
        return self._filter_and_sort(df=df,
                                     time_last=time_last,
                                     training_window=training_window,
                                     columns=columns,
                                     sortby=sortby,
                                     start=start,
                                     end=end,
                                     shuffle=shuffle,
                                     random_seed=random_seed)