Example #1
    def _calculate_agg_features(self, features, frame, df_trie):
        test_feature = features[0]
        child_entity = test_feature.base_features[0].entity

        base_frame = df_trie.get_node(test_feature.relationship_path).value
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        features = [f for f in features if f.get_name() not in frame.columns]
        if not len(features):
            return frame

        # handle where
        where = test_feature.where
        if where is not None and not base_frame.empty:
            base_frame = base_frame.loc[base_frame[where.get_name()]]

        # when no child data, just add all the features to frame with nan
        if base_frame.empty:
            for f in features:
                frame[f.get_name()] = np.nan
        else:
            relationship_path = test_feature.relationship_path

            groupby_var = get_relationship_variable_id(relationship_path)

            # if the use_previous property exists on this feature, include only the
            # instances from the child entity included in that Timedelta
            use_previous = test_feature.use_previous
            if use_previous and not base_frame.empty:
                # Filter by use_previous values
                time_last = self.time_last
                if use_previous.is_absolute():
                    time_first = time_last - use_previous
                    ti = child_entity.time_index
                    if ti is not None:
                        base_frame = base_frame[base_frame[ti] >= time_first]
                else:
                    n = use_previous.value
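                    # a relative use_previous means "keep only the most
                    # recent n rows of each group" rather than a time window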

                    def last_n(df):
                        return df.iloc[-n:]

                    base_frame = base_frame.groupby(groupby_var,
                                                    observed=True,
                                                    sort=False).apply(last_n)

            to_agg = {}
            agg_rename = {}
            to_apply = set()
            # apply multivariable and time-dependent features as we find them, and
            # save aggregable features for later
            for f in features:
                if _can_agg(f):
                    variable_id = f.base_features[0].get_name()

                    if variable_id not in to_agg:
                        to_agg[variable_id] = []

                    func = f.get_function()

                    # for some reason, using the string "count" is significantly
                    # faster than any method a primitive can return
                    # https://stackoverflow.com/questions/55731149/use-a-function-instead-of-string-in-pandas-groupby-agg
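                    # (most likely because the string dispatches to pandas'
                    # cythonized groupby count, while an arbitrary callable
                    # falls back to a per-group Python call)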
                    if is_python_2() and func == pd.Series.count.__func__:
                        func = "count"
                    elif func == pd.Series.count:
                        func = "count"

                    funcname = func
                    if callable(func):
                        # if the same function is being applied to the same
                        # variable twice, wrap it in a partial to avoid
                        # duplicate functions
                        funcname = str(id(func))
                        if u"{}-{}".format(variable_id,
                                           funcname) in agg_rename:
                            func = partial(func)
                            funcname = str(id(func))

                        func.__name__ = funcname

                    to_agg[variable_id].append(func)
                    # this is used below to rename columns that pandas names for us
                    agg_rename[u"{}-{}".format(variable_id,
                                               funcname)] = f.get_name()
                    continue

                to_apply.add(f)

            # Apply the non-aggregable functions to generate a new dataframe, and merge
            # it with the existing one
            if len(to_apply):
                wrap = agg_wrapper(to_apply, self.time_last)
                # groupby_var can be both the name of the index and a column,
                # to silence pandas warning about ambiguity we explicitly pass
                # the column (in actuality grouping by both index and group would
                # work)
                to_merge = base_frame.groupby(base_frame[groupby_var],
                                              observed=True,
                                              sort=False).apply(wrap)
                frame = pd.merge(left=frame,
                                 right=to_merge,
                                 left_index=True,
                                 right_index=True,
                                 how='left')

            # Apply the aggregate functions to generate a new dataframe, and merge
            # it with the existing one
            if len(to_agg):
                # groupby_var can be both the name of the index and a column,
                # to silence pandas warning about ambiguity we explicitly pass
                # the column (in actuality grouping by both index and group would
                # work)
                to_merge = base_frame.groupby(base_frame[groupby_var],
                                              observed=True,
                                              sort=False).agg(to_agg)
                # rename columns to the correct feature names
                to_merge.columns = [
                    agg_rename["-".join(x)] for x in to_merge.columns.ravel()
                ]
                to_merge = to_merge[list(agg_rename.values())]

                # workaround for pandas bug where categories are in the wrong order
                # see: https://github.com/pandas-dev/pandas/issues/22501
                if pdtypes.is_categorical_dtype(frame.index):
                    categories = pdtypes.CategoricalDtype(
                        categories=frame.index.categories)
                    to_merge.index = to_merge.index.astype(object).astype(
                        categories)

                frame = pd.merge(left=frame,
                                 right=to_merge,
                                 left_index=True,
                                 right_index=True,
                                 how='left')

        # Handle default values
        fillna_dict = {}
        for f in features:
            feature_defaults = {
                name: f.default_value
                for name in f.get_feature_names()
            }
            fillna_dict.update(feature_defaults)

        frame.fillna(fillna_dict, inplace=True)

        # convert boolean dtypes to floats as appropriate
        # pandas behavior: https://github.com/pydata/pandas/issues/3752
        for f in features:
            if (f.number_output_features == 1
                    and f.variable_type == variable_types.Numeric
                    and frame[f.get_name()].dtype.name in ['object', 'bool']):
                frame[f.get_name()] = frame[f.get_name()].astype(float)

        return frame
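
The agg_rename bookkeeping above is the subtle part: aggregating one column
with several functions gives pandas MultiIndex result columns of the form
(variable, funcname), which the code flattens with "-".join and maps back to
feature names. A minimal sketch of that round trip on a toy frame (the column
names, feature names, and data are illustrative, not from the original):

import pandas as pd

# toy child frame: two children per parent
base_frame = pd.DataFrame({
    "parent_id": [1, 1, 2, 2],
    "value": [10, 20, 30, 40],
})

to_agg = {"value": ["count", "mean"]}              # mirrors to_agg above
agg_rename = {"value-count": "COUNT(children)",    # mirrors agg_rename above
              "value-mean": "MEAN(children.value)"}

to_merge = base_frame.groupby("parent_id", sort=False).agg(to_agg)
# aggregating with several functions yields MultiIndex columns such as
# ("value", "count"); flatten and rename exactly as the method does
to_merge.columns = [agg_rename["-".join(x)] for x in to_merge.columns.ravel()]
print(to_merge)
#            COUNT(children)  MEAN(children.value)
# parent_id
# 1                        2                  15.0
# 2                        2                  35.0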
Example #2
    def _calculate_agg_features(self, features, entity_frames):
        test_feature = features[0]
        entity = test_feature.entity
        child_entity = test_feature.base_features[0].entity

        assert entity.id in entity_frames and child_entity.id in entity_frames

        frame = entity_frames[entity.id]
        base_frame = entity_frames[child_entity.id]
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        features = [f for f in features if f.get_name() not in frame.columns]
        if not len(features):
            return frame

        # handle where
        where = test_feature.where
        if where is not None and not base_frame.empty:
            base_frame = base_frame.loc[base_frame[where.get_name()]]

        # when no child data, just add all the features to frame with nan
        if base_frame.empty:
            for f in features:
                frame[f.get_name()] = np.nan
        else:
            relationship_path = self.entityset.find_backward_path(
                entity.id, child_entity.id)

            groupby_var = get_relationship_variable_id(relationship_path)

            # if the use_previous property exists on this feature, include only the
            # instances from the child entity included in that Timedelta
            use_previous = test_feature.use_previous
            if use_previous and not base_frame.empty:
                # Filter by use_previous values
                time_last = self.time_last
                if use_previous.is_absolute():
                    time_first = time_last - use_previous
                    ti = child_entity.time_index
                    if ti is not None:
                        base_frame = base_frame[base_frame[ti] >= time_first]
                else:
                    n = use_previous.value

                    def last_n(df):
                        return df.iloc[-n:]

                    base_frame = base_frame.groupby(groupby_var,
                                                    observed=True,
                                                    sort=False).apply(last_n)

            to_agg = {}
            agg_rename = {}
            to_apply = set()
            # apply multivariable and time-dependent features as we find them, and
            # save aggregable features for later
            for f in features:
                if _can_agg(f):
                    variable_id = f.base_features[0].get_name()

                    if variable_id not in to_agg:
                        to_agg[variable_id] = []

                    func = f.get_function()
                    funcname = func
                    if callable(func):
                        # make sure func has a unique name due to how pandas names aggregations
                        func.__name__ = f.name
                        funcname = f.name

                    to_agg[variable_id].append(func)
                    # this is used below to rename columns that pandas names for us
                    agg_rename[u"{}-{}".format(variable_id,
                                               funcname)] = f.get_name()
                    continue

                to_apply.add(f)

            # Apply the non-aggregable functions to generate a new dataframe, and merge
            # it with the existing one
            if len(to_apply):
                wrap = agg_wrapper(to_apply, self.time_last)
                # groupby_var can be both the name of the index and a column,
                # to silence pandas warning about ambiguity we explicitly pass
                # the column (in actuality grouping by both index and group would
                # work)
                to_merge = base_frame.groupby(base_frame[groupby_var],
                                              observed=True,
                                              sort=False).apply(wrap)
                frame = pd.merge(left=frame,
                                 right=to_merge,
                                 left_index=True,
                                 right_index=True,
                                 how='left')

            # Apply the aggregate functions to generate a new dataframe, and merge
            # it with the existing one
            if len(to_agg):
                # groupby_var can be both the name of the index and a column,
                # to silence pandas warning about ambiguity we explicitly pass
                # the column (in actuality grouping by both index and group would
                # work)
                to_merge = base_frame.groupby(base_frame[groupby_var],
                                              observed=True,
                                              sort=False).agg(to_agg)
                # rename columns to the correct feature names
                to_merge.columns = [
                    agg_rename["-".join(x)] for x in to_merge.columns.ravel()
                ]
                to_merge = to_merge[list(agg_rename.values())]

                # workaround for pandas bug where categories are in the wrong order
                # see: https://github.com/pandas-dev/pandas/issues/22501
                if pdtypes.is_categorical_dtype(frame.index):
                    categories = pdtypes.CategoricalDtype(
                        categories=frame.index.categories)
                    to_merge.index = to_merge.index.astype(object).astype(
                        categories)

                frame = pd.merge(left=frame,
                                 right=to_merge,
                                 left_index=True,
                                 right_index=True,
                                 how='left')

        # Handle default values
        # 1. handle non-scalar default values
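        # (fillna expects scalar replacement values, so iterable defaults
        # are written cell by cell below)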
        iterfeats = [
            f for f in features if hasattr(f.default_value, '__iter__')
        ]
        for f in iterfeats:
            nulls = pd.isnull(frame[f.get_name()])
            for ni in nulls[nulls].index:
                frame.at[ni, f.get_name()] = f.default_value

        # 2. handle scalar default values
        fillna_dict = {
            f.get_name(): f.default_value
            for f in features if f not in iterfeats
        }
        frame.fillna(fillna_dict, inplace=True)

        # convert boolean dtypes to floats as appropriate
        # pandas behavior: https://github.com/pydata/pandas/issues/3752
        for f in features:
            if (not f.expanding and f.variable_type == variable_types.Numeric
                    and frame[f.get_name()].dtype.name in ['object', 'bool']):
                frame[f.get_name()] = frame[f.get_name()].astype(float)

        return frame
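
Both versions rely on pandas labelling the result of a callable aggregation
with the callable's __name__, so two callables that share a name on the same
column would collide (pandas rejects duplicate function names). A minimal
sketch of the rename trick (the toy data and feature-style names are
illustrative):

import pandas as pd

df = pd.DataFrame({"parent_id": [1, 1, 2], "value": [1.0, 2.0, 3.0]})

def total(s):
    return s.sum()

def spread(s):
    return s.max() - s.min()

# give each callable the name pandas should use for its result column
total.__name__ = "SUM(children.value)"
spread.__name__ = "RANGE(children.value)"

result = df.groupby("parent_id").agg({"value": [total, spread]})
print(list(result.columns.ravel()))
# [('value', 'SUM(children.value)'), ('value', 'RANGE(children.value)')]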
def approximate_features(features,
                         cutoff_time,
                         window,
                         entityset,
                         backend,
                         training_window=None,
                         profile=None):
    '''Given a list of features and cutoff_times to be passed to
    calculate_feature_matrix, calculates approximate values of some features
    to speed up calculations.  Cutoff times are sorted into
    window-sized buckets and the approximate feature values are only calculated
    at one cutoff time for each bucket.


    .. note:: This only approximates DirectFeatures of AggregationFeatures on
        the target entity. In future versions, it may also be possible to
        approximate these features on other top-level entities.

    Args:
        features (list[:class:`.FeatureBase`]): if these features depend on
            aggregation features of the prediction entity, the approximate
            values for those aggregation features will be calculated

        cutoff_time (pd.DataFrame): specifies what time to calculate
            the features for each instance at.  A DataFrame with
            'instance_id' and 'time' columns.

        window (Timedelta or str): frequency to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if the window is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        entityset (:class:`.EntitySet`): An already initialized entityset.

        training_window (`Timedelta`, optional):
            Window defining how much older than the cutoff time data
            can be and still be included when calculating the feature.
            If None, all older data is used.

        profile (bool, optional): Enables profiling if True

    '''
    approx_fms_by_entity = {}
    all_approx_feature_set = None
    target_entity = features[0].entity
    target_index_var = target_entity.index

    to_approximate, all_approx_feature_set = gather_approximate_features(
        features, backend)

    target_time_colname = 'target_time'
    cutoff_time[target_time_colname] = cutoff_time['time']
    target_instance_colname = target_index_var
    cutoff_time[target_instance_colname] = cutoff_time['instance_id']
    approx_cutoffs = bin_cutoff_times(cutoff_time.copy(), window)
    cutoff_df_time_var = 'time'
    cutoff_df_instance_var = 'instance_id'
    # should this order be by dependencies so that calculate_feature_matrix
    # doesn't skip approximating something?
    for approx_entity_id, approx_features in to_approximate.items():
        # Gather associated instance_ids from the approximate entity
        cutoffs_with_approx_e_ids = approx_cutoffs.copy()
        frames = entityset.get_pandas_data_slice(
            [approx_entity_id, target_entity.id], target_entity.id,
            cutoffs_with_approx_e_ids[target_instance_colname])

        if frames is not None:
            path = entityset.find_path(approx_entity_id, target_entity.id)
            rvar = get_relationship_variable_id(path)
            parent_instance_frame = frames[approx_entity_id][target_entity.id]
            cutoffs_with_approx_e_ids[rvar] = \
                cutoffs_with_approx_e_ids.merge(parent_instance_frame[[rvar]],
                                                left_on=target_index_var,
                                                right_index=True,
                                                how='left')[rvar].values
            new_approx_entity_index_var = rvar

            # Select only columns we care about
            columns_we_want = [
                target_instance_colname, new_approx_entity_index_var,
                cutoff_df_time_var, target_time_colname
            ]

            cutoffs_with_approx_e_ids = cutoffs_with_approx_e_ids[
                columns_we_want]
        cutoffs_with_approx_e_ids = cutoffs_with_approx_e_ids.drop_duplicates()
            cutoffs_with_approx_e_ids.dropna(
                subset=[new_approx_entity_index_var], inplace=True)
        else:
            cutoffs_with_approx_e_ids = pd.DataFrame()

        if cutoffs_with_approx_e_ids.empty:
            approx_fms_by_entity[approx_entity_id] = gen_empty_approx_features_df(
                approx_features)
            continue

        cutoffs_with_approx_e_ids.sort_values(
            [cutoff_df_time_var, new_approx_entity_index_var], inplace=True)
        # CFM assumes specific column names for cutoff_time argument
        rename = {new_approx_entity_index_var: cutoff_df_instance_var}
        cutoff_time_to_pass = cutoffs_with_approx_e_ids.rename(columns=rename)
        cutoff_time_to_pass = cutoff_time_to_pass[[
            cutoff_df_instance_var, cutoff_df_time_var
        ]]

        cutoff_time_to_pass.drop_duplicates(inplace=True)
        approx_fm = calculate_feature_matrix(
            approx_features,
            entityset,
            cutoff_time=cutoff_time_to_pass,
            training_window=training_window,
            approximate=None,
            cutoff_time_in_index=False,
            chunk_size=cutoff_time_to_pass.shape[0],
            profile=profile)

        approx_fms_by_entity[approx_entity_id] = approx_fm

    # Include entity because we only want to ignore features that
    # are base_features/dependencies of the top level entity we're
    # approximating.
    # For instance, if target entity is sessions, and we're
    # approximating customers.COUNT(sessions.COUNT(log.value)),
    # we could also just want the feature COUNT(log.value)
    # defined on sessions
    # as a first class feature in the feature matrix.
    # Unless we signify to only ignore it as a dependency of
    # a feature defined on customers, we would ignore computing it
    # and pandas_backend would error
    return approx_fms_by_entity, all_approx_feature_set
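
bin_cutoff_times itself is not shown here; a plausible minimal sketch of the
bucketing it implies, assuming window is a pandas frequency string and using
dt.floor (the flooring approach and the toy data are assumptions, not the
original implementation):

import pandas as pd

cutoff_time = pd.DataFrame({
    "instance_id": [1, 2, 3, 4],
    "time": pd.to_datetime(["2020-01-01 03:00", "2020-01-01 17:00",
                            "2020-01-02 08:00", "2020-01-02 09:30"]),
})

# floor every cutoff time to the start of its 24-hour bucket; all instances
# in the same bucket then share one approximate feature calculation
binned = cutoff_time.copy()
binned["time"] = binned["time"].dt.floor("1D")
print(binned)
#    instance_id       time
# 0            1 2020-01-01
# 1            2 2020-01-01
# 2            3 2020-01-02
# 3            4 2020-01-02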