def gather_approximate_features(feature_set):
    """
    Collect the aggregation features that can be approximated.

    A target feature contributes when ``feature_set.uses_full_entity`` is
    False for it and it is a DirectFeature (or a chain of DirectFeatures)
    whose innermost base feature is an AggregationFeature.

    Args:
        feature_set (FeatureSet): Features whose target features are searched.

    Returns:
        Trie[RelationshipPath, set[str]]: For each accumulated relationship
            path, the unique names of the aggregation features found there.
    """
    approximable = Trie(default=set, path_constructor=RelationshipPath)

    for target in feature_set.target_features:
        if feature_set.uses_full_entity(target, check_dependents=True):
            continue
        if not isinstance(target, DirectFeature):
            continue

        # Walk down the chain of direct features, concatenating their paths.
        full_path = target.relationship_path
        innermost = target.base_features[0]
        while isinstance(innermost, DirectFeature):
            full_path = full_path + innermost.relationship_path
            innermost = innermost.base_features[0]

        if isinstance(innermost, AggregationFeature):
            approximable.get_node(full_path).value.add(innermost.unique_name())

    return approximable
def gather_approximate_features(feature_set):
    """
    Collect the aggregation features that can be approximated.

    A target feature contributes when ``feature_set.uses_full_entity`` is
    False for it and it is a DirectFeature (or a chain of DirectFeatures)
    whose innermost base feature is an AggregationFeature.

    Args:
        feature_set (FeatureSet): Features whose target features are searched.

    Returns:
        tuple: ``(trie, names)`` where ``trie`` maps each accumulated
            RelationshipPath to a list of the aggregation features found
            there, and ``names`` is the set of their unique names.
    """
    # Edges are RelationshipPaths; each node holds a list of features.
    features_by_path = Trie(default=list, path_constructor=RelationshipPath)
    # Unique names of every feature stored in the trie.
    approximable_names = set()

    for target in feature_set.target_features:
        if feature_set.uses_full_entity(target, check_dependents=True):
            continue
        if not isinstance(target, DirectFeature):
            continue

        # Walk down the chain of direct features, concatenating their paths.
        full_path = target.relationship_path
        innermost = target.base_features[0]
        while isinstance(innermost, DirectFeature):
            full_path = full_path + innermost.relationship_path
            innermost = innermost.base_features[0]

        if isinstance(innermost, AggregationFeature):
            features_by_path.get_node(full_path).value.append(innermost)
            approximable_names.add(innermost.unique_name())

    return features_by_path, approximable_names
def test_feature_trie_ignores_approximate_features(es):
    """Features listed in the approximate trie must be left out of the feature trie."""
    value = ft.IdentityFeature(es['log']['value'], )
    agg = ft.AggregationFeature(value, es['sessions'],
                                primitive=ft.primitives.Mean)
    agg_of_agg = ft.AggregationFeature(agg, es['customers'],
                                       primitive=ft.primitives.Sum)
    direct = ft.DirectFeature(agg_of_agg, es['sessions'])

    # Mark agg_of_agg as already approximated at the direct feature's path.
    approximated = Trie(default=list, path_constructor=RelationshipPath)
    approximated_node = approximated.get_node(direct.relationship_path)
    approximated_node.value = [agg_of_agg]

    feature_set = FeatureSet([direct, agg],
                             approximate_feature_trie=approximated)
    trie = feature_set.feature_trie

    # Since agg_of_agg is ignored it and its dependencies should not be in the
    # trie.
    ignored_subtrie = trie.get_node(direct.relationship_path)
    for _path, (_, _, node_feature_names) in ignored_subtrie:
        assert not node_feature_names

    expected_root = (False, set(), {direct.unique_name(), agg.unique_name()})
    assert trie.value == expected_root

    expected_agg_node = (False, set(), {value.unique_name()})
    assert trie.get_node(agg.relationship_path).value == expected_agg_node
def test_get_node():
    """A node returned by get_node is a live sub-trie sharing state with the root."""
    root = Trie(default=lambda: 'default')
    root.get_node([1, 2, 3]).value = '123'
    root.get_node([1, 2, 4]).value = '124'

    # Paths are resolved relative to the node they are requested from.
    subtree = root.get_node([1, 2])
    leaf_three = subtree.get_node([3])
    assert leaf_three.value == '123'
    leaf_four = subtree.get_node([4])
    assert leaf_four.value == '124'

    # Writes through the sub-trie are visible from the root.
    subtree.get_node([4, 5]).value = '1245'
    deep_node = root.get_node([1, 2, 4, 5])
    assert deep_node.value == '1245'
def test_get_node():
    """A node returned by get_node is a live sub-trie sharing state with the root."""
    root = Trie(default=lambda: "default")
    root.get_node([1, 2, 3]).value = "123"
    root.get_node([1, 2, 4]).value = "124"

    # Paths are resolved relative to the node they are requested from.
    subtree = root.get_node([1, 2])
    leaf_three = subtree.get_node([3])
    assert leaf_three.value == "123"
    leaf_four = subtree.get_node([4])
    assert leaf_four.value == "124"

    # Writes through the sub-trie are visible from the root.
    subtree.get_node([4, 5]).value = "1245"
    deep_node = root.get_node([1, 2, 4, 5])
    assert deep_node.value == "1245"
def test_iteration():
    """Iterating a trie yields ``(path, value)`` pairs for every node, root first."""
    t = Trie(default=lambda: 'default', path_constructor=tuple)
    t.get_node((1, 2, 3)).value = '123'
    t.get_node((1, 2, 4)).value = '124'

    expected = [
        ((), 'default'),
        ((1, ), 'default'),
        ((1, 2), 'default'),
        ((1, 2, 3), '123'),
        ((1, 2, 4), '124'),
    ]

    # Compare the whole sequence at once: an element-by-element loop over
    # enumerate(t) would pass vacuously if the trie yielded too few items.
    assert list(t) == expected
def test_setting_and_getting():
    """Node values start at the default and can be set and overwritten independently."""
    trie = Trie(default=lambda: 'default')

    # A node that was never written reports the default value.
    assert trie.get_node([1, 2, 3]).value == 'default'

    trie.get_node([1, 2, 3]).value = '123'
    trie.get_node([1, 2, 4]).value = '124'
    assert trie.get_node([1, 2, 3]).value == '123'
    assert trie.get_node([1, 2, 4]).value == '124'

    # An intermediate node keeps the default until it is written itself.
    prefix = [1]
    assert trie.get_node(prefix).value == 'default'
    trie.get_node(prefix).value = '1'
    assert trie.get_node(prefix).value == '1'

    # Existing values can be replaced.
    trie.get_node([1, 2, 3]).value = 'updated'
    assert trie.get_node([1, 2, 3]).value == 'updated'
def test_iteration():
    """Iterating a trie yields ``(path, value)`` pairs for every node, root first."""
    t = Trie(default=lambda: "default", path_constructor=tuple)
    t.get_node((1, 2, 3)).value = "123"
    t.get_node((1, 2, 4)).value = "124"

    expected = [
        ((), "default"),
        ((1, ), "default"),
        ((1, 2), "default"),
        ((1, 2, 3), "123"),
        ((1, 2, 4), "124"),
    ]

    # Compare the whole sequence at once: an element-by-element loop over
    # enumerate(t) would pass vacuously if the trie yielded too few items.
    assert list(t) == expected
def test_setting_and_getting():
    """Node values start at the default and can be set and overwritten independently."""
    trie = Trie(default=lambda: "default")

    # A node that was never written reports the default value.
    assert trie.get_node([1, 2, 3]).value == "default"

    trie.get_node([1, 2, 3]).value = "123"
    trie.get_node([1, 2, 4]).value = "124"
    assert trie.get_node([1, 2, 3]).value == "123"
    assert trie.get_node([1, 2, 4]).value == "124"

    # An intermediate node keeps the default until it is written itself.
    prefix = [1]
    assert trie.get_node(prefix).value == "default"
    trie.get_node(prefix).value = "1"
    assert trie.get_node(prefix).value == "1"

    # Existing values can be replaced.
    trie.get_node([1, 2, 3]).value = "updated"
    assert trie.get_node([1, 2, 3]).value == "updated"
def __init__(self,
             entityset,
             feature_set,
             time_last=None,
             training_window=None,
             precalculated_features=None):
    """
    Store the inputs needed to calculate a feature set.

    Args:
        entityset: Stored as ``self.entityset``.

        feature_set (FeatureSet): The features to calculate values for.

        time_last (pd.Timestamp, optional): Last allowed time. Data from
            exactly this time not allowed. When None, ``datetime.now()`` is
            used.

        training_window (Timedelta, optional): Window defining how much time
            before the cutoff time data can be used when calculating
            features. If None, all data before cutoff time is used.

        precalculated_features (Trie[RelationshipPath -> pd.DataFrame]):
            Maps RelationshipPaths to dataframes of precalculated_features.
            When None, an empty trie is created.
    """
    self.entityset = entityset
    self.feature_set = feature_set
    self.training_window = training_window

    self.time_last = datetime.now() if time_last is None else time_last

    if precalculated_features is not None:
        self.precalculated_features = precalculated_features
    else:
        self.precalculated_features = Trie(path_constructor=RelationshipPath)
def test_precalculated_features(pd_es):
    """Precalculated features are read from the supplied trie instead of being computed."""
    message = "This primitive should never be used because the features are precalculated"

    class ErrorPrim(AggregationPrimitive):
        """A primitive whose function raises an error."""

        name = "error_prim"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})

        def get_function(self, agg_type="pandas"):
            def error(s):
                raise RuntimeError(message)

            return error

    value = ft.Feature(pd_es["log"].ww["value"])
    session_agg = ft.Feature(value,
                             parent_dataframe_name="sessions",
                             primitive=ErrorPrim)
    customer_agg = ft.Feature(session_agg,
                              parent_dataframe_name="customers",
                              primitive=ErrorPrim)
    direct = ft.Feature(customer_agg, dataframe_name="sessions")

    # Set up a FeatureSet which knows which features are precalculated.
    known_precalculated = Trie(default=set, path_constructor=RelationshipPath)
    precalculated_names = known_precalculated.get_node(
        direct.relationship_path).value
    precalculated_names.add(customer_agg.unique_name())
    feature_set = FeatureSet([direct],
                             approximate_feature_trie=known_precalculated)

    # Fake precalculated data.
    values = [0, 1, 2]
    fake_parent_fm = pd.DataFrame({customer_agg.get_name(): values})
    fm_trie = Trie(path_constructor=RelationshipPath)
    fm_trie.get_node(direct.relationship_path).value = fake_parent_fm

    calculator = FeatureSetCalculator(pd_es,
                                      feature_set=feature_set,
                                      precalculated_features=fm_trie)

    instance_ids = [0, 2, 3, 5]
    fm = calculator.run(np.array(instance_ids))

    expected_column = [values[0], values[0], values[1], values[2]]
    assert list(fm[direct.get_name()]) == expected_column

    # Calculating without precalculated features should error.
    with pytest.raises(RuntimeError, match=message):
        FeatureSetCalculator(
            pd_es, feature_set=FeatureSet([direct])).run(instance_ids)
def _build_feature_trie(self):
    """
    Create the feature trie and populate it from the target features.

    Every node starts as ``(False, set(), set())``. Each target feature is
    handed to ``_add_feature_to_trie`` together with
    ``self.approximate_feature_trie``.

    Returns:
        Trie: The populated trie, keyed by RelationshipPath.
    """
    def empty_node():
        # Fresh tuple and sets per node so nodes never share mutable state.
        return (False, set(), set())

    trie = Trie(default=empty_node, path_constructor=RelationshipPath)

    for target in self.target_features:
        self._add_feature_to_trie(trie, target, self.approximate_feature_trie)

    return trie
def _build_feature_trie(self):
    """
    Create the feature trie and populate it from the target features.

    The trie maps RelationshipPaths to a tuple of (bool, set[str], set[str]).
    The bool represents whether the full entity df is needed at that node,
    the first set contains the names of features which are needed on the
    full entity, and the second set contains the names of the rest of the
    features.

    Returns:
        Trie: The populated trie.
    """
    def empty_node():
        # Fresh tuple and sets per node so nodes never share mutable state.
        return (False, set(), set())

    trie = Trie(default=empty_node, path_constructor=RelationshipPath)

    for target in self.target_features:
        self._add_feature_to_trie(trie, target)

    return trie
def test_precalculated_features(es):
    """Precalculated features are read from the supplied trie instead of being computed."""
    message = 'This primitive should never be used because the features are precalculated'

    class ErrorPrim(AggregationPrimitive):
        """A primitive whose function raises an error."""

        name = "error_prim"
        input_types = [Numeric]
        return_type = Numeric

        def get_function(self):
            def error(s):
                raise RuntimeError(message)

            return error

    value = ft.Feature(es['log']['value'])
    session_agg = ft.Feature(value,
                             parent_entity=es['sessions'],
                             primitive=ErrorPrim)
    customer_agg = ft.Feature(session_agg,
                              parent_entity=es['customers'],
                              primitive=ErrorPrim)
    direct = ft.Feature(customer_agg, entity=es['sessions'])

    # Set up a FeatureSet which knows which features are precalculated.
    known_precalculated = Trie(default=set, path_constructor=RelationshipPath)
    precalculated_names = known_precalculated.get_node(
        direct.relationship_path).value
    precalculated_names.add(customer_agg.unique_name())
    feature_set = FeatureSet([direct],
                             approximate_feature_trie=known_precalculated)

    # Fake precalculated data.
    values = [0, 1, 2]
    fake_parent_fm = pd.DataFrame({customer_agg.get_name(): values})
    fm_trie = Trie(path_constructor=RelationshipPath)
    fm_trie.get_node(direct.relationship_path).value = fake_parent_fm

    calculator = FeatureSetCalculator(es,
                                      feature_set=feature_set,
                                      precalculated_features=fm_trie)

    instance_ids = [0, 2, 3, 5]
    fm = calculator.run(np.array(instance_ids))

    expected_column = [values[0], values[0], values[1], values[2]]
    assert list(fm[direct.get_name()]) == expected_column

    # Calculating without precalculated features should error.
    with pytest.raises(RuntimeError, match=message):
        FeatureSetCalculator(
            es, feature_set=FeatureSet([direct])).run(instance_ids)
def __init__(self, features, approximate_feature_trie=None):
    """
    Index the target features and their dependencies.

    Args:
        features (list[Feature]): Features of the target entity. Must be
            non-empty: the target entity id is read from the first feature.

        approximate_feature_trie (Trie[RelationshipPath, set[str]], optional):
            Dependency features to ignore because they have already been
            approximated. For example, if one of the target features is a
            direct feature of a feature A and A is included in
            approximate_feature_trie then neither A nor its dependencies
            will appear in FeatureSet.feature_trie. When None, an empty
            trie is used.
    """
    self.target_eid = features[0].entity.id
    self.target_features = features
    self.target_feature_names = {f.unique_name() for f in features}

    # Test against None explicitly rather than relying on the truthiness of
    # a Trie instance. The default node value is a set so that the fallback
    # trie matches the documented Trie[RelationshipPath, set[str]] contract.
    if approximate_feature_trie is None:
        approximate_feature_trie = Trie(default=set,
                                        path_constructor=RelationshipPath)
    self.approximate_feature_trie = approximate_feature_trie

    # Maps the unique name of each feature to the actual feature. This is
    # necessary because features do not support equality and so cannot be
    # used as dictionary keys. The equality operator on features produces a
    # new feature (which will always be truthy).
    self.features_by_name = {f.unique_name(): f for f in features}

    # Unique name of a feature -> unique names of the features that use it.
    feature_dependents = defaultdict(set)
    for f in features:
        deps = f.get_dependencies(deep=True)
        for dep in deps:
            feature_dependents[dep.unique_name()].add(f.unique_name())
            self.features_by_name[dep.unique_name()] = dep
            subdeps = dep.get_dependencies(deep=True)
            for sd in subdeps:
                feature_dependents[sd.unique_name()].add(dep.unique_name())

    # feature names (keys) and the features that rely on them (values).
    self.feature_dependents = {
        fname: [
            self.features_by_name[dname]
            for dname in feature_dependents[fname]
        ]
        for fname in self.features_by_name
    }

    # Presumably built lazily on first access of feature_trie -- the
    # accessor is not visible in this block.
    self._feature_trie = None
def run(self, instance_ids):
    """
    Calculate values of features for the given instances of the target entity.

    Summary of algorithm:
    1. Construct a trie where the edges are relationships and each node
        contains a set of features for a single entity. See
        FeatureSet._build_feature_trie.
    2. Initialize a trie for storing dataframes.
    3. Traverse the trie using depth first search. At each node calculate
        the features and store the resulting dataframe in the dataframe
        trie (so that its values can be used by features which depend on
        these features). See _calculate_features_for_entity.
    4. Get the dataframe at the root of the trie (for the target entity) and
        return the columns corresponding to the requested features.

    Args:
        instance_ids (list): List of instance id for which to build
            features. Must be non-empty.

    Returns:
        pd.DataFrame : Pandas DataFrame of calculated feature values.
            Indexed by instance_ids. Columns in same order as features
            passed in.
    """
    # Local import keeps this method self-contained; it is a no-op if the
    # module already imports pandas.
    import pandas as pd

    assert len(instance_ids) > 0, "0 instance ids provided"

    feature_trie = self.feature_set.feature_trie

    df_trie = Trie(path_constructor=RelationshipPath)
    full_entity_df_trie = Trie(path_constructor=RelationshipPath)

    target_entity = self.entityset[self.feature_set.target_eid]
    self._calculate_features_for_entity(
        entity_id=self.feature_set.target_eid,
        feature_trie=feature_trie,
        df_trie=df_trie,
        full_entity_df_trie=full_entity_df_trie,
        precalculated_trie=self.precalculated_features,
        filter_variable=target_entity.index,
        filter_values=instance_ids)

    # The dataframe for the target entity should be stored at the root of
    # df_trie.
    df = df_trie.value

    if df.empty:
        return self.generate_default_df(instance_ids=instance_ids)

    # fill in empty rows with default values
    # NOTE(review): `in` on a Series tests its index labels, so this
    # presumably relies on df being indexed by the target index values --
    # confirm against _calculate_features_for_entity.
    missing_ids = [
        i for i in instance_ids if i not in df[target_entity.index]
    ]
    if missing_ids:
        default_df = self.generate_default_df(instance_ids=missing_ids,
                                              extra_columns=df.columns)
        # DataFrame.append was removed in pandas 2.0; pd.concat with the
        # same ordering and sort flag is the equivalent call.
        df = pd.concat([df, default_df], sort=True)

    df.index.name = self.entityset[self.feature_set.target_eid].index

    column_list = []
    for feat in self.feature_set.target_features:
        column_list.extend(feat.get_feature_names())

    return df[column_list]
def approximate_features(feature_set, cutoff_time, window, entityset, training_window=None):
    '''Given a set of features and cutoff_times to be passed to
    calculate_feature_matrix, calculates approximate values of some features
    to speed up calculations. Cutoff times are sorted into
    window-sized buckets and the approximate feature values are only calculated
    at one cutoff time for each bucket.

    ..note:: this only approximates DirectFeatures of AggregationFeatures, on
        the target entity. In future versions, it may also be possible to
        approximate these features on other top-level entities

    ..note:: ``cutoff_time`` is modified in place: a ``'target_time'`` column
        holding a copy of its ``'time'`` column is added before binning.

    Args:
        feature_set (:class:`.FeatureSet`): The features to be calculated.
            Its ``approximate_feature_trie`` determines which features are
            approximated and at which relationship path.

        cutoff_time (pd.DataFrame): specifies what time to calculate
            the features for each instance at. The resulting feature matrix will use data
            up to and including the cutoff_time. A DataFrame with
            'instance_id' and 'time' columns.

        window (Timedelta or str): frequency to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        entityset (:class:`.EntitySet`): An already initialized entityset.

        training_window (`Timedelta`, optional):
            Window defining how much older than the cutoff time data
            can be to be included when calculating the feature. If None, all older data is used.

    Returns:
        Trie: keyed by RelationshipPath; the node for each path that has
            features to approximate holds the feature matrix calculated for
            those features.
    '''
    approx_fms_trie = Trie(path_constructor=RelationshipPath)

    target_time_colname = 'target_time'
    # Keep the original cutoff times next to the binned ones. This writes to
    # the caller's dataframe; only the binning below works on a copy.
    cutoff_time[target_time_colname] = cutoff_time['time']
    approx_cutoffs = bin_cutoff_times(cutoff_time.copy(), window)
    cutoff_df_time_var = 'time'
    cutoff_df_instance_var = 'instance_id'
    # should this order be by dependencies so that calculate_feature_matrix
    # doesn't skip approximating something?
    for relationship_path, approx_feature_names in feature_set.approximate_feature_trie:
        # Nodes with nothing to approximate are skipped.
        if not approx_feature_names:
            continue

        # Works on a fresh copy of the binned cutoffs for every path; the
        # helper also returns the name of the column it used for the
        # approximated entity's index.
        cutoffs_with_approx_e_ids, new_approx_entity_index_var = \
            _add_approx_entity_index_var(entityset, feature_set.target_eid,
                                         approx_cutoffs.copy(), relationship_path)

        # Select only columns we care about
        columns_we_want = [
            new_approx_entity_index_var, cutoff_df_time_var,
            target_time_colname
        ]

        cutoffs_with_approx_e_ids = cutoffs_with_approx_e_ids[columns_we_want]
        cutoffs_with_approx_e_ids = cutoffs_with_approx_e_ids.drop_duplicates()
        # Drop rows that have no value in the approximated entity's index column.
        cutoffs_with_approx_e_ids.dropna(subset=[new_approx_entity_index_var],
                                         inplace=True)

        # Resolve the stored names back to feature objects.
        approx_features = [
            feature_set.features_by_name[name]
            for name in approx_feature_names
        ]
        if cutoffs_with_approx_e_ids.empty:
            approx_fm = gen_empty_approx_features_df(approx_features)
        else:
            cutoffs_with_approx_e_ids.sort_values(
                [cutoff_df_time_var, new_approx_entity_index_var],
                inplace=True)
            # CFM assumes specific column names for cutoff_time argument
            rename = {new_approx_entity_index_var: cutoff_df_instance_var}
            cutoff_time_to_pass = cutoffs_with_approx_e_ids.rename(
                columns=rename)
            cutoff_time_to_pass = cutoff_time_to_pass[[
                cutoff_df_instance_var, cutoff_df_time_var
            ]]

            # Dropping 'target_time' above can leave repeated
            # (instance, time) rows; calculate each pair only once.
            cutoff_time_to_pass.drop_duplicates(inplace=True)
            # approximate=None so this call does not approximate again;
            # chunk_size is set to the full number of rows.
            approx_fm = calculate_feature_matrix(
                approx_features,
                entityset,
                cutoff_time=cutoff_time_to_pass,
                training_window=training_window,
                approximate=None,
                cutoff_time_in_index=False,
                chunk_size=cutoff_time_to_pass.shape[0])

        approx_fms_trie.get_node(relationship_path).value = approx_fm

    return approx_fms_trie
def run(self, instance_ids, progress_callback=None, include_cutoff_time=True):
    """
    Calculate values of features for the given instances of the target dataframe.

    Summary of algorithm:
    1. Construct a trie where the edges are relationships and each node
        contains a set of features for a single dataframe. See
        FeatureSet._build_feature_trie.
    2. Initialize a trie for storing dataframes.
    3. Traverse the trie using depth first search. At each node calculate
        the features and store the resulting dataframe in the dataframe
        trie (so that its values can be used by features which depend on
        these features). See _calculate_features_for_dataframe.
    4. Get the dataframe at the root of the trie (for the target dataframe) and
        return the columns corresponding to the requested features.

    Args:
        instance_ids (np.ndarray or pd.Categorical): Instance ids for which
            to build features. Must be non-empty.

        progress_callback (callable): function to be called with incremental
            progress updates. When None, a no-op callback is used.

        include_cutoff_time (bool): If True, data at cutoff time are included
            in calculating features.

    Returns:
        pd.DataFrame : Pandas DataFrame of calculated feature values.
            Indexed by instance_ids. Columns in same order as features
            passed in.
    """
    assert len(instance_ids) > 0, "0 instance ids provided"

    if progress_callback is None:
        # do nothing for the progress call back if not provided
        def progress_callback(*args):
            pass

    feature_trie = self.feature_set.feature_trie

    df_trie = Trie(path_constructor=RelationshipPath)
    full_dataframe_trie = Trie(path_constructor=RelationshipPath)

    target_dataframe = self.entityset[self.feature_set.target_df_name]
    self._calculate_features_for_dataframe(
        dataframe_name=self.feature_set.target_df_name,
        feature_trie=feature_trie,
        df_trie=df_trie,
        full_dataframe_trie=full_dataframe_trie,
        precalculated_trie=self.precalculated_features,
        filter_column=target_dataframe.ww.index,
        filter_values=instance_ids,
        progress_callback=progress_callback,
        include_cutoff_time=include_cutoff_time,
    )

    # The dataframe for the target dataframe should be stored at the root of
    # df_trie.
    df = df_trie.value

    # Fill in empty rows with default values. This only works for pandas dataframes
    # and is not currently supported for Dask dataframes.
    if isinstance(df, pd.DataFrame):
        # Remember the index dtype before rows are added and reindexed.
        index_dtype = df.index.dtype.name
        if df.empty:
            return self.generate_default_df(instance_ids=instance_ids)

        # NOTE(review): `in` on a Series tests its index labels, so this
        # presumably relies on df being indexed by the target index values
        # -- confirm against _calculate_features_for_dataframe.
        missing_ids = [
            i for i in instance_ids
            if i not in df[target_dataframe.ww.index]
        ]
        if missing_ids:
            default_df = self.generate_default_df(instance_ids=missing_ids,
                                                  extra_columns=df.columns)
            # DataFrame.append was removed in pandas 2.0; pd.concat with the
            # same ordering and sort flag is the equivalent call.
            df = pd.concat([default_df, df], sort=True)

        df.index.name = self.entityset[
            self.feature_set.target_df_name].ww.index

        # Order by instance_ids
        unique_instance_ids = pd.unique(instance_ids)
        unique_instance_ids = unique_instance_ids.astype(
            instance_ids.dtype)
        df = df.reindex(unique_instance_ids)

        # Keep categorical index if original index was categorical
        if index_dtype == "category":
            df.index = df.index.astype("category")

    column_list = []

    for feat in self.feature_set.target_features:
        column_list.extend(feat.get_feature_names())

    # For dd / ps DataFrames the index column is returned as well.
    if is_instance(df, (dd, ps), "DataFrame"):
        column_list.extend([target_dataframe.ww.index])

    return df[column_list]
def run(self, instance_ids, progress_callback=None):
    """
    Calculate values of features for the given instances of the target entity.

    Summary of algorithm:
    1. Construct a trie where the edges are relationships and each node
        contains a set of features for a single entity. See
        FeatureSet._build_feature_trie.
    2. Initialize a trie for storing dataframes.
    3. Traverse the trie using depth first search. At each node calculate
        the features and store the resulting dataframe in the dataframe
        trie (so that its values can be used by features which depend on
        these features). See _calculate_features_for_entity.
    4. Get the dataframe at the root of the trie (for the target entity) and
        return the columns corresponding to the requested features.

    Args:
        instance_ids (np.ndarray or pd.Categorical): Instance ids for which
            to build features. Must be non-empty.

        progress_callback (callable): function to be called with incremental
            progress updates. When None, a no-op callback is used.

    Returns:
        pd.DataFrame : Pandas DataFrame of calculated feature values.
            Indexed by instance_ids. Columns in same order as features
            passed in.
    """
    assert len(instance_ids) > 0, "0 instance ids provided"

    if progress_callback is None:
        # do nothing for the progress call back if not provided
        def progress_callback(*args):
            pass

    feature_trie = self.feature_set.feature_trie

    df_trie = Trie(path_constructor=RelationshipPath)
    full_entity_df_trie = Trie(path_constructor=RelationshipPath)

    target_entity = self.entityset[self.feature_set.target_eid]
    self._calculate_features_for_entity(
        entity_id=self.feature_set.target_eid,
        feature_trie=feature_trie,
        df_trie=df_trie,
        full_entity_df_trie=full_entity_df_trie,
        precalculated_trie=self.precalculated_features,
        filter_variable=target_entity.index,
        filter_values=instance_ids,
        progress_callback=progress_callback)

    # The dataframe for the target entity should be stored at the root of
    # df_trie.
    df = df_trie.value

    if df.empty:
        return self.generate_default_df(instance_ids=instance_ids)

    # fill in empty rows with default values
    # NOTE(review): `in` on a Series tests its index labels, so this
    # presumably relies on df being indexed by the target index values --
    # confirm against _calculate_features_for_entity.
    missing_ids = [
        i for i in instance_ids if i not in df[target_entity.index]
    ]
    if missing_ids:
        default_df = self.generate_default_df(instance_ids=missing_ids,
                                              extra_columns=df.columns)
        # DataFrame.append was removed in pandas 2.0; pd.concat with the
        # same ordering and sort flag is the equivalent call.
        df = pd.concat([df, default_df], sort=True)

    df.index.name = self.entityset[self.feature_set.target_eid].index

    # Order by instance_ids
    unique_instance_ids = pd.unique(instance_ids)
    # pd.unique changes the dtype for Categorical, so reset it.
    unique_instance_ids = unique_instance_ids.astype(instance_ids.dtype)
    df = df.reindex(unique_instance_ids)

    column_list = []
    for feat in self.feature_set.target_features:
        column_list.extend(feat.get_feature_names())

    return df[column_list]