def _calculate_agg_features(self, features, frame, df_trie):
    test_feature = features[0]
    child_entity = test_feature.base_features[0].entity
    base_frame = df_trie.get_node(test_feature.relationship_path).value

    # Sometimes approximate features get computed in a previous filter frame
    # and put in the current one dynamically, so there may be existing
    # features here already
    features = [f for f in features if f.get_name() not in frame.columns]
    if not len(features):
        return frame

    # handle where clause
    where = test_feature.where
    if where is not None and not base_frame.empty:
        base_frame = base_frame.loc[base_frame[where.get_name()]]

    # when there is no child data, just add all the features to frame as nan
    if base_frame.empty:
        for f in features:
            frame[f.get_name()] = np.nan
    else:
        relationship_path = test_feature.relationship_path
        groupby_var = get_relationship_variable_id(relationship_path)

        # if the use_previous property exists on this feature, include only
        # the instances from the child entity included in that Timedelta
        use_previous = test_feature.use_previous
        if use_previous and not base_frame.empty:
            # Filter by use_previous values
            time_last = self.time_last
            if use_previous.is_absolute():
                time_first = time_last - use_previous
                ti = child_entity.time_index
                if ti is not None:
                    base_frame = base_frame[base_frame[ti] >= time_first]
            else:
                n = use_previous.value

                def last_n(df):
                    return df.iloc[-n:]

                base_frame = base_frame.groupby(groupby_var,
                                                observed=True,
                                                sort=False).apply(last_n)

        to_agg = {}
        agg_rename = {}
        to_apply = set()
        # apply multivariable and time-dependent features as we find them,
        # and save aggregable features for later
        for f in features:
            if _can_agg(f):
                variable_id = f.base_features[0].get_name()

                if variable_id not in to_agg:
                    to_agg[variable_id] = []

                func = f.get_function()

                # for some reason, using the string "count" is significantly
                # faster than any method a primitive can return
                # https://stackoverflow.com/questions/55731149/use-a-function-instead-of-string-in-pandas-groupby-agg
                if is_python_2() and func == pd.Series.count.__func__:
                    func = "count"
                elif func == pd.Series.count:
                    func = "count"

                funcname = func
                if callable(func):
                    # if the same function is being applied to the same
                    # variable twice, wrap it in a partial to avoid
                    # duplicate functions
                    funcname = str(id(func))
                    if u"{}-{}".format(variable_id, funcname) in agg_rename:
                        func = partial(func)
                        funcname = str(id(func))

                    func.__name__ = funcname

                to_agg[variable_id].append(func)
                # this is used below to rename columns that pandas names for us
                agg_rename[u"{}-{}".format(variable_id, funcname)] = f.get_name()
                continue

            to_apply.add(f)

        # Apply the non-aggregable functions to generate a new dataframe,
        # and merge it with the existing one
        if len(to_apply):
            wrap = agg_wrapper(to_apply, self.time_last)
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var],
                                          observed=True,
                                          sort=False).apply(wrap)
            frame = pd.merge(left=frame, right=to_merge,
                             left_index=True, right_index=True, how='left')

        # Apply the aggregate functions to generate a new dataframe, and
        # merge it with the existing one
        if len(to_agg):
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var],
                                          observed=True,
                                          sort=False).agg(to_agg)
            # rename columns to the correct feature names
            to_merge.columns = [agg_rename["-".join(x)]
                                for x in to_merge.columns.ravel()]
            to_merge = to_merge[list(agg_rename.values())]

            # workaround for pandas bug where categories are in the wrong order
            # see: https://github.com/pandas-dev/pandas/issues/22501
            if pdtypes.is_categorical_dtype(frame.index):
                categories = pdtypes.CategoricalDtype(
                    categories=frame.index.categories)
                to_merge.index = to_merge.index.astype(object).astype(categories)

            frame = pd.merge(left=frame, right=to_merge,
                             left_index=True, right_index=True, how='left')

    # Handle default values
    fillna_dict = {}
    for f in features:
        feature_defaults = {name: f.default_value
                            for name in f.get_feature_names()}
        fillna_dict.update(feature_defaults)

    frame.fillna(fillna_dict, inplace=True)

    # convert boolean dtypes to floats as appropriate
    # pandas behavior: https://github.com/pydata/pandas/issues/3752
    for f in features:
        if (f.number_output_features == 1 and
                f.variable_type == variable_types.Numeric and
                frame[f.get_name()].dtype.name in ['object', 'bool']):
            frame[f.get_name()] = frame[f.get_name()].astype(float)

    return frame
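# --- Illustrative sketch (not part of the original module) ---
# Why the agg_rename bookkeeping above works: pandas names each aggregated
# column after the callable's __name__, so giving every callable a unique
# name lets us map the resulting ("variable", "funcname") MultiIndex columns
# back to feature names. The toy data and feature names below are
# hypothetical; this mirrors the technique, not the exact implementation.
def _example_agg_rename():
    import pandas as pd

    # two children of parent "a", one child of parent "b"
    base_frame = pd.DataFrame({"parent_id": ["a", "a", "b"],
                               "value": [1.0, 2.0, 3.0]})

    def my_mean(s):
        return s.mean()

    # mimic the funcname = str(id(func)) trick used above
    my_mean.__name__ = str(id(my_mean))

    to_agg = {"value": [my_mean, "count"]}
    agg_rename = {"value-" + my_mean.__name__: "MEAN(children.value)",
                  "value-count": "COUNT(children)"}

    to_merge = base_frame.groupby("parent_id").agg(to_agg)
    # agg() with a dict of lists yields MultiIndex columns such as
    # ("value", "<id>") and ("value", "count"); joining the levels with "-"
    # recovers the keys used to look up the real feature names
    to_merge.columns = [agg_rename["-".join(x)]
                        for x in to_merge.columns.ravel()]
    return to_merge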
def _calculate_agg_features(self, features, entity_frames):
    test_feature = features[0]
    entity = test_feature.entity
    child_entity = test_feature.base_features[0].entity
    assert entity.id in entity_frames and child_entity.id in entity_frames

    frame = entity_frames[entity.id]
    base_frame = entity_frames[child_entity.id]

    # Sometimes approximate features get computed in a previous filter frame
    # and put in the current one dynamically, so there may be existing
    # features here already
    features = [f for f in features if f.get_name() not in frame.columns]
    if not len(features):
        return frame

    # handle where clause
    where = test_feature.where
    if where is not None and not base_frame.empty:
        base_frame = base_frame.loc[base_frame[where.get_name()]]

    # when there is no child data, just add all the features to frame as nan
    if base_frame.empty:
        for f in features:
            frame[f.get_name()] = np.nan
    else:
        relationship_path = self.entityset.find_backward_path(
            entity.id, child_entity.id)
        groupby_var = get_relationship_variable_id(relationship_path)

        # if the use_previous property exists on this feature, include only
        # the instances from the child entity included in that Timedelta
        use_previous = test_feature.use_previous
        if use_previous and not base_frame.empty:
            # Filter by use_previous values
            time_last = self.time_last
            if use_previous.is_absolute():
                time_first = time_last - use_previous
                ti = child_entity.time_index
                if ti is not None:
                    base_frame = base_frame[base_frame[ti] >= time_first]
            else:
                n = use_previous.value

                def last_n(df):
                    return df.iloc[-n:]

                base_frame = base_frame.groupby(groupby_var,
                                                observed=True,
                                                sort=False).apply(last_n)

        to_agg = {}
        agg_rename = {}
        to_apply = set()
        # apply multivariable and time-dependent features as we find them,
        # and save aggregable features for later
        for f in features:
            if _can_agg(f):
                variable_id = f.base_features[0].get_name()

                if variable_id not in to_agg:
                    to_agg[variable_id] = []

                func = f.get_function()

                funcname = func
                if callable(func):
                    # make sure func has a unique name due to how pandas
                    # names aggregations
                    func.__name__ = f.name
                    funcname = f.name

                to_agg[variable_id].append(func)
                # this is used below to rename columns that pandas names for us
                agg_rename[u"{}-{}".format(variable_id, funcname)] = f.get_name()
                continue

            to_apply.add(f)

        # Apply the non-aggregable functions to generate a new dataframe,
        # and merge it with the existing one
        if len(to_apply):
            wrap = agg_wrapper(to_apply, self.time_last)
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var],
                                          observed=True,
                                          sort=False).apply(wrap)
            frame = pd.merge(left=frame, right=to_merge,
                             left_index=True, right_index=True, how='left')

        # Apply the aggregate functions to generate a new dataframe, and
        # merge it with the existing one
        if len(to_agg):
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var],
                                          observed=True,
                                          sort=False).agg(to_agg)
            # rename columns to the correct feature names
            to_merge.columns = [agg_rename["-".join(x)]
                                for x in to_merge.columns.ravel()]
            to_merge = to_merge[list(agg_rename.values())]

            # workaround for pandas bug where categories are in the wrong order
            # see: https://github.com/pandas-dev/pandas/issues/22501
            if pdtypes.is_categorical_dtype(frame.index):
                categories = pdtypes.CategoricalDtype(
                    categories=frame.index.categories)
                to_merge.index = to_merge.index.astype(object).astype(categories)

            frame = pd.merge(left=frame, right=to_merge,
                             left_index=True, right_index=True, how='left')

    # Handle default values
    # 1. handle non-scalar default values
    iterfeats = [f for f in features
                 if hasattr(f.default_value, '__iter__')]
    for f in iterfeats:
        nulls = pd.isnull(frame[f.get_name()])
        for ni in nulls[nulls].index:
            frame.at[ni, f.get_name()] = f.default_value

    # 2. handle scalar default values
    fillna_dict = {f.get_name(): f.default_value
                   for f in features if f not in iterfeats}
    frame.fillna(fillna_dict, inplace=True)

    # convert boolean dtypes to floats as appropriate
    # pandas behavior: https://github.com/pydata/pandas/issues/3752
    for f in features:
        if (not f.expanding and
                f.variable_type == variable_types.Numeric and
                frame[f.get_name()].dtype.name in ['object', 'bool']):
            frame[f.get_name()] = frame[f.get_name()].astype(float)

    return frame
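# --- Illustrative sketch (not part of the original module) ---
# The two-step default handling above exists because fillna() only accepts
# scalar replacements: iterable defaults must be written cell by cell with
# .at, which stores the object without trying to broadcast it. The column
# name and default value below are hypothetical.
def _example_iterable_default():
    import pandas as pd

    frame = pd.DataFrame({"TREND(log.value)": [None, 0.5, None]},
                         dtype=object)
    default_value = []  # an iterable default that fillna() cannot fill with

    # locate the missing cells, then write the iterable default one cell
    # at a time
    nulls = pd.isnull(frame["TREND(log.value)"])
    for ni in nulls[nulls].index:
        frame.at[ni, "TREND(log.value)"] = default_value
    return frame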
def approximate_features(features, cutoff_time, window, entityset, backend,
                         training_window=None, profile=None):
    '''Given a list of features and cutoff_times to be passed to
    calculate_feature_matrix, calculates approximate values of some features
    to speed up calculations. Cutoff times are sorted into window-sized
    buckets and the approximate feature values are only calculated at one
    cutoff time for each bucket.

    .. note:: this only approximates DirectFeatures of AggregationFeatures
        on the target entity. In future versions, it may also be possible
        to approximate these features on other top-level entities.

    Args:
        features (list[:class:`.FeatureBase`]): if these features are
            dependent on aggregation features on the prediction entity, the
            approximate values for the aggregation features will be
            calculated

        cutoff_time (pd.DataFrame): specifies what time to calculate
            the features for each instance at. A DataFrame with
            'instance_id' and 'time' columns.

        window (Timedelta or str): frequency to group instances with similar
            cutoff times by for features with costly calculations. For
            example, if the window is 24 hours, all instances with cutoff
            times on the same day will use the same calculation for
            expensive features.

        entityset (:class:`.EntitySet`): An already initialized entityset.

        backend: computational backend used to gather the approximate
            features.

        training_window (Timedelta, optional): Window defining how much
            older than the cutoff time data can be to be included when
            calculating the feature. If None, all older data is used.

        profile (bool, optional): Enables profiling if True.
    '''
    approx_fms_by_entity = {}
    all_approx_feature_set = None
    target_entity = features[0].entity
    target_index_var = target_entity.index

    to_approximate, all_approx_feature_set = gather_approximate_features(
        features, backend)

    target_time_colname = 'target_time'
    cutoff_time[target_time_colname] = cutoff_time['time']
    target_instance_colname = target_index_var
    cutoff_time[target_instance_colname] = cutoff_time['instance_id']
    approx_cutoffs = bin_cutoff_times(cutoff_time.copy(), window)
    cutoff_df_time_var = 'time'
    cutoff_df_instance_var = 'instance_id'
    # should this order be by dependencies so that calculate_feature_matrix
    # doesn't skip approximating something?
    for approx_entity_id, approx_features in to_approximate.items():
        # Gather associated instance_ids from the approximate entity
        cutoffs_with_approx_e_ids = approx_cutoffs.copy()
        frames = entityset.get_pandas_data_slice(
            [approx_entity_id, target_entity.id], target_entity.id,
            cutoffs_with_approx_e_ids[target_instance_colname])

        if frames is not None:
            path = entityset.find_path(approx_entity_id, target_entity.id)
            rvar = get_relationship_variable_id(path)
            parent_instance_frame = frames[approx_entity_id][target_entity.id]
            cutoffs_with_approx_e_ids[rvar] = \
                cutoffs_with_approx_e_ids.merge(
                    parent_instance_frame[[rvar]],
                    left_on=target_index_var,
                    right_index=True,
                    how='left')[rvar].values
            new_approx_entity_index_var = rvar

            # Select only columns we care about
            columns_we_want = [target_instance_colname,
                               new_approx_entity_index_var,
                               cutoff_df_time_var,
                               target_time_colname]

            cutoffs_with_approx_e_ids = cutoffs_with_approx_e_ids[columns_we_want]
            cutoffs_with_approx_e_ids = cutoffs_with_approx_e_ids.drop_duplicates()
            cutoffs_with_approx_e_ids.dropna(
                subset=[new_approx_entity_index_var], inplace=True)
        else:
            cutoffs_with_approx_e_ids = pd.DataFrame()

        if cutoffs_with_approx_e_ids.empty:
            approx_fms_by_entity[approx_entity_id] = \
                gen_empty_approx_features_df(approx_features)
            continue

        cutoffs_with_approx_e_ids.sort_values(
            [cutoff_df_time_var, new_approx_entity_index_var], inplace=True)
        # CFM assumes specific column names for cutoff_time argument
        rename = {new_approx_entity_index_var: cutoff_df_instance_var}
        cutoff_time_to_pass = cutoffs_with_approx_e_ids.rename(columns=rename)
        cutoff_time_to_pass = cutoff_time_to_pass[[cutoff_df_instance_var,
                                                   cutoff_df_time_var]]

        cutoff_time_to_pass.drop_duplicates(inplace=True)
        approx_fm = calculate_feature_matrix(
            approx_features,
            entityset,
            cutoff_time=cutoff_time_to_pass,
            training_window=training_window,
            approximate=None,
            cutoff_time_in_index=False,
            chunk_size=cutoff_time_to_pass.shape[0],
            profile=profile)

        approx_fms_by_entity[approx_entity_id] = approx_fm

    # Include entity because we only want to ignore features that
    # are base_features/dependencies of the top level entity we're
    # approximating.
    # For instance, if the target entity is sessions, and we're
    # approximating customers.COUNT(sessions.COUNT(log.value)),
    # we could also just want the feature COUNT(log.value)
    # defined on sessions as a first-class feature in the feature matrix.
    # Unless we signify to only ignore it as a dependency of a feature
    # defined on customers, we would skip computing it and pandas_backend
    # would error.
    return approx_fms_by_entity, all_approx_feature_set
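# --- Illustrative sketch (not part of the original module) ---
# The idea behind bin_cutoff_times: floor each cutoff time to the start of
# its window-sized bucket so every instance in a bucket shares one
# approximate calculation time. The real function may align bucket
# boundaries differently; the toy data below is hypothetical.
def _example_bin_cutoff_times():
    import pandas as pd

    cutoff_time = pd.DataFrame({
        "instance_id": [1, 2, 3],
        "time": pd.to_datetime(["2019-01-01 03:12",
                                "2019-01-01 17:40",
                                "2019-01-02 01:05"]),
    })

    binned = cutoff_time.copy()
    # instances 1 and 2 fall in the same 24-hour bucket, so their
    # expensive aggregation features would be computed only once
    binned["time"] = binned["time"].dt.floor("24h")
    return binned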
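# --- Illustrative sketch (not part of the original module) ---
# Hypothetical call site for approximate_features. The es and backend
# arguments are assumed to be an initialized EntitySet and a computational
# backend; this mirrors the signature above rather than documenting an
# official API.
def _example_approximate_usage(features, cutoff_time, es, backend):
    # bucket cutoff times into 24-hour windows before approximating
    approx_fms_by_entity, all_approx_feature_set = approximate_features(
        features,
        cutoff_time,
        window="24 hours",
        entityset=es,
        backend=backend,
        training_window=None,
        profile=False)
    return approx_fms_by_entity, all_approx_feature_set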