def _calculate_agg_features(self, features, entity_frames):
    """Compute aggregation features for one parent entity.

    Groups the child entity's frame by the link variable of the
    backward relationship path, applies each feature's aggregation
    function (or, for non-aggregable features, a row-wise wrapper),
    and merges the results into the parent entity's frame on its index.

    All features passed in are assumed to share the same parent entity,
    child entity, `where` clause and `use_previous` window (only the
    first feature is inspected for those) -- TODO confirm with callers.

    Returns the parent frame with the new feature columns merged in.
    """
    test_feature = features[0]
    entity = test_feature.entity
    child_entity = test_feature.base_features[0].entity
    assert entity.id in entity_frames and child_entity.id in entity_frames
    frame = entity_frames[entity.id]
    base_frame = entity_frames[child_entity.id]
    # Sometimes approximate features get computed in a previous filter frame
    # and put in the current one dynamically,
    # so there may be existing features here
    features = [f for f in features if f.get_name() not in frame.columns]
    if not len(features):
        # nothing left to compute; parent frame is returned unchanged
        return frame

    # handle where clause for all functions below: keep only child rows
    # where the boolean where-column is truthy
    where = test_feature.where
    if where is not None:
        base_frame = base_frame[base_frame[where.get_name()]]

    relationship_path = self.entityset.find_backward_path(
        entity.id, child_entity.id)

    groupby_var = Relationship._get_link_variable_name(relationship_path)

    # if the use_previous property exists on this feature, include only the
    # instances from the child entity included in that Timedelta
    use_previous = test_feature.use_previous
    if use_previous and not base_frame.empty:
        # Filter by use_previous values
        time_last = self.time_last
        if use_previous.is_absolute():
            # absolute window: keep child rows whose time index falls
            # within [time_last - use_previous, time_last]
            time_first = time_last - use_previous
            ti = child_entity.time_index
            if ti is not None:
                base_frame = base_frame[base_frame[ti] >= time_first]
        else:
            # relative window: keep the last n rows per group
            n = use_previous.value

            def last_n(df):
                return df.iloc[-n:]

            base_frame = base_frame.groupby(groupby_var,
                                            observed=True,
                                            sort=False).apply(last_n)

    to_agg = {}
    agg_rename = {}
    to_apply = set()
    # apply multivariable and time-dependent features as we find them, and
    # save aggregable features for later
    for f in features:
        if _can_agg(f):
            variable_id = f.base_features[0].get_name()
            if variable_id not in to_agg:
                to_agg[variable_id] = []
            func = f.get_function()
            funcname = func
            if callable(func):
                funcname = func.__name__

            to_agg[variable_id].append(func)
            # this is used below to rename columns that pandas names for us
            agg_rename[u"{}-{}".format(variable_id, funcname)] = \
                f.get_name()
            continue

        to_apply.add(f)

    # Apply the non-aggregable functions generate a new dataframe, and merge
    # it with the existing one
    if len(to_apply):
        wrap = agg_wrapper(to_apply, self.time_last)
        # groupby_var can be both the name of the index and a column,
        # to silence pandas warning about ambiguity we explicitly pass
        # the column (in actuality grouping by both index and group would
        # work)
        to_merge = base_frame.groupby(base_frame[groupby_var],
                                      observed=True, sort=False).apply(wrap)
        # drop the extra index level introduced by the group-wise apply
        to_merge.reset_index(1, drop=True, inplace=True)
        frame = pd.merge(left=frame, right=to_merge,
                         left_index=True, right_index=True, how='left')

    # Apply the aggregate functions to generate a new dataframe, and merge
    # it with the existing one
    if len(to_agg):
        # groupby_var can be both the name of the index and a column,
        # to silence pandas warning about ambiguity we explicitly pass
        # the column (in actuality grouping by both index and group would
        # work)
        to_merge = base_frame.groupby(base_frame[groupby_var],
                                      observed=True, sort=False).agg(to_agg)
        # rename columns to the correct feature names
        # (agg produces a (variable, funcname) MultiIndex; join with '-'
        # to match the keys built in agg_rename above)
        to_merge.columns = [
            agg_rename["-".join(x)] for x in to_merge.columns.ravel()
        ]
        to_merge = to_merge[list(agg_rename.values())]

        # workaround for pandas bug where categories are in the wrong order
        # see: https://github.com/pandas-dev/pandas/issues/22501
        if pdtypes.is_categorical_dtype(frame.index):
            categories = pdtypes.CategoricalDtype(
                categories=frame.index.categories)
            to_merge.index = to_merge.index.astype(object).astype(
                categories)

        frame = pd.merge(left=frame, right=to_merge,
                         left_index=True, right_index=True, how='left')

    # Handle default values
    # 1. handle non scalar default values
    iterfeats = [
        f for f in features if hasattr(f.default_value, '__iter__')
    ]
    for f in iterfeats:
        nulls = pd.isnull(frame[f.get_name()])
        for ni in nulls[nulls].index:
            frame.at[ni, f.get_name()] = f.default_value

    # 2. handle scalars default values
    fillna_dict = {
        f.get_name(): f.default_value
        for f in features if f not in iterfeats
    }
    frame.fillna(fillna_dict, inplace=True)

    # convert boolean dtypes to floats as appropriate
    # pandas behavior: https://github.com/pydata/pandas/issues/3752
    for f in features:
        if (not f.expanding and
                f.variable_type == variable_types.Numeric and
                frame[f.get_name()].dtype.name in ['object', 'bool']):
            frame[f.get_name()] = frame[f.get_name()].astype(float)

    return frame
def _calculate_agg_features(self, features, entity_frames):
    """Compute aggregation features for one parent entity.

    Groups the child entity's frame by the backward-relationship link
    variable, applies each feature's aggregation function (or, for
    non-aggregable features, a row-wise wrapper), merges the results
    into the parent frame on ``entity.index``, and writes the updated
    frame back into ``entity_frames``.  Returns None.

    All features passed in are assumed to share the same parent entity,
    child entity, `where` clause and `use_previous` window (only the
    first feature is inspected for those) -- TODO confirm with callers.
    """
    test_feature = features[0]
    use_previous = test_feature.use_previous
    base_features = test_feature.base_features
    where = test_feature.where
    entity = test_feature.entity
    child_entity = base_features[0].entity
    assert entity.id in entity_frames and child_entity.id in entity_frames
    index_var = entity.index
    frame = entity_frames[entity.id]
    base_frame = entity_frames[child_entity.id]
    # Sometimes approximate features get computed in a previous filter frame
    # and put in the current one dynamically,
    # so there may be existing features here
    features = [f for f in features if f.get_name() not in frame.columns]
    if not len(features):
        # nothing left to compute; entity_frames already holds the frame
        return

    # handle where clause for all functions below: keep only child rows
    # where the boolean where-column is truthy
    if where is not None:
        base_frame = base_frame[base_frame[where.get_name()]]

    relationship_path = self.entityset.find_backward_path(
        entity.id, child_entity.id)

    groupby_var = Relationship._get_link_variable_name(relationship_path)

    # if the use_previous property exists on this feature, include only the
    # instances from the child entity included in that Timedelta
    if use_previous and not base_frame.empty:
        # Filter by use_previous values
        time_last = self.time_last
        if use_previous.is_absolute():
            # absolute window: keep child rows whose time index falls
            # within [time_last - use_previous, time_last]
            time_first = time_last - use_previous
            ti = child_entity.time_index
            if ti is not None:
                base_frame = base_frame[base_frame[ti] >= time_first]
        else:
            # relative window: keep the last n rows per group
            n = use_previous.value

            def last_n(df):
                return df.iloc[-n:]

            base_frame = base_frame.groupby(groupby_var).apply(last_n)

    if not base_frame.empty:
        if groupby_var not in base_frame:
            # This occured sometimes. I think it might have to do with category
            # but not sure. TODO: look into when this occurs
            no_instances = True
        # if the foreign key column in the child (base_frame) that links to
        # frame is an integer and the id column in the parent is an object or
        # category dtype, the .isin() call errors.
        elif (frame[index_var].dtype != base_frame[groupby_var].dtype or
                frame[index_var].dtype.name.find('category') > -1):
            try:
                frame_as_obj = frame[index_var].astype(object)
                base_frame_as_obj = base_frame[groupby_var].astype(object)
            except ValueError:
                msg = u"Could not join {}.{} (dtype={}) with {}.{} (dtype={})"
                raise ValueError(
                    msg.format(entity.id, index_var,
                               frame[index_var].dtype, child_entity.id,
                               groupby_var, base_frame[groupby_var].dtype))
            else:
                no_instances = check_no_related_instances(
                    frame_as_obj.values, base_frame_as_obj.values)
        else:
            no_instances = check_no_related_instances(
                frame[index_var].values, base_frame[groupby_var].values)

    # `no_instances` is only read when base_frame is non-empty, so the
    # short-circuiting `or` keeps this safe for the empty case
    if base_frame.empty or no_instances:
        for f in features:
            set_default_column(entity_frames[entity.id], f)
        return

    def wrap_func_with_name(func, name):
        # give each aggregation callable a unique __name__ so pandas
        # does not collide/deduplicate same-named functions in .agg()
        def inner(x):
            return func(x)
        inner.__name__ = name
        return inner

    to_agg = {}
    agg_rename = {}
    to_apply = set()
    # apply multivariable and time-dependent features as we find them, and
    # save aggregable features for later
    for f in features:
        if _can_agg(f):
            variable_id = f.base_features[0].get_name()
            if variable_id not in to_agg:
                to_agg[variable_id] = []
            func = f.get_function()
            # make sure function names are unique
            random_id = str(uuid.uuid1())
            func = wrap_func_with_name(func, random_id)
            funcname = random_id

            to_agg[variable_id].append(func)
            # used below to rename the pandas-generated columns back to
            # the feature names
            agg_rename[u"{}-{}".format(variable_id, funcname)] = f.get_name()
            continue

        to_apply.add(f)

    # Apply the non-aggregable functions generate a new dataframe, and merge
    # it with the existing one
    if len(to_apply):
        wrap = agg_wrapper(to_apply, self.time_last)
        # groupby_var can be both the name of the index and a column,
        # to silence pandas warning about ambiguity we explicitly pass
        # the column (in actuality grouping by both index and group would
        # work)
        to_merge = base_frame.groupby(base_frame[groupby_var]).apply(wrap)
        # drop the extra index level introduced by the group-wise apply
        to_merge.reset_index(1, drop=True, inplace=True)
        frame = pd.merge(left=frame, right=to_merge,
                         left_on=index_var, right_index=True, how='left')

    # Apply the aggregate functions to generate a new dataframe, and merge
    # it with the existing one
    # Do the [variables] accessor on to_merge because the agg call returns
    # a dataframe with columns that contain the dataframes we want
    if len(to_agg):
        # groupby_var can be both the name of the index and a column,
        # to silence pandas warning about ambiguity we explicitly pass
        # the column (in actuality grouping by both index and group would
        # work)
        to_merge = base_frame.groupby(base_frame[groupby_var]).agg(to_agg)
        # we apply multiple functions to each column, creating
        # a multiindex as the column
        # rename the columns to a concatenation of the two indexes
        to_merge.columns = [
            u"{}-{}".format(n1, n2) for n1, n2 in to_merge.columns.ravel()
        ]
        # to enable a rename
        to_merge = to_merge.rename(columns=agg_rename)
        # BUGFIX: materialize the dict_values view into a list; pandas
        # column selection requires a list-like of hashables and rejects
        # a raw dict_values object
        variables = list(agg_rename.values())
        to_merge = to_merge[variables]
        frame = pd.merge(left=frame, right=to_merge,
                         left_on=index_var, right_index=True, how='left')

    # Handle default values
    # 1. handle non scalar default values
    iterfeats = [
        f for f in features if hasattr(f.default_value, '__iter__')
    ]
    for f in iterfeats:
        nulls = pd.isnull(frame[f.get_name()])
        for ni in nulls[nulls].index:
            frame.at[ni, f.get_name()] = f.default_value

    # 2. handle scalars default values
    fillna_dict = {
        f.get_name(): f.default_value
        for f in features if f not in iterfeats
    }
    frame.fillna(fillna_dict, inplace=True)

    # convert boolean dtypes to floats as appropriate
    # pandas behavior: https://github.com/pydata/pandas/issues/3752
    for f in features:
        if (not f.expanding and
                f.variable_type == variable_types.Numeric and
                frame[f.get_name()].dtype.name in ['object', 'bool']):
            frame[f.get_name()] = frame[f.get_name()].astype(float)

    # publish the updated parent frame back into the shared dict
    entity_frames[entity.id] = frame
def _calculate_agg_features(self, features, entity_frames):
    """Compute aggregation features for one parent entity.

    Groups the child entity's frame by the backward-relationship link
    variable, applies each feature's aggregation function (or, for
    non-aggregable features, a row-wise wrapper), and merges the
    results into the parent frame on ``entity.index``.

    All features passed in are assumed to share the same parent entity,
    child entity, `where` clause and `use_previous` window (only the
    first feature is inspected for those) -- TODO confirm with callers.

    Returns the parent frame with the new feature columns merged in.
    """
    test_feature = features[0]
    use_previous = test_feature.use_previous
    base_features = test_feature.base_features
    where = test_feature.where
    entity = test_feature.entity
    child_entity = base_features[0].entity
    assert entity.id in entity_frames and child_entity.id in entity_frames
    index_var = entity.index
    frame = entity_frames[entity.id]
    base_frame = entity_frames[child_entity.id]
    # Sometimes approximate features get computed in a previous filter frame
    # and put in the current one dynamically,
    # so there may be existing features here
    features = [f for f in features if f.get_name() not in frame.columns]
    if not len(features):
        # nothing left to compute; parent frame is returned unchanged
        return frame

    # handle where clause for all functions below: keep only child rows
    # where the boolean where-column is truthy
    if where is not None:
        base_frame = base_frame[base_frame[where.get_name()]]

    relationship_path = self.entityset.find_backward_path(entity.id,
                                                          child_entity.id)

    groupby_var = Relationship._get_link_variable_name(relationship_path)

    # if the use_previous property exists on this feature, include only the
    # instances from the child entity included in that Timedelta
    if use_previous and not base_frame.empty:
        # Filter by use_previous values
        time_last = self.time_last
        if use_previous.is_absolute():
            # absolute window: keep child rows whose time index falls
            # within [time_last - use_previous, time_last]
            time_first = time_last - use_previous
            ti = child_entity.time_index
            if ti is not None:
                base_frame = base_frame[base_frame[ti] >= time_first]
        else:
            # relative window: keep the last n rows per group
            n = use_previous.value

            def last_n(df):
                return df.iloc[-n:]

            base_frame = base_frame.groupby(groupby_var).apply(last_n)

    if not base_frame.empty:
        if groupby_var not in base_frame:
            # This occured sometimes. I think it might have to do with category
            # but not sure. TODO: look into when this occurs
            no_instances = True
        # if the foreign key column in the child (base_frame) that links to
        # frame is an integer and the id column in the parent is an object or
        # category dtype, the .isin() call errors.
        elif (frame[index_var].dtype != base_frame[groupby_var].dtype or
              frame[index_var].dtype.name.find('category') > -1):
            try:
                frame_as_obj = frame[index_var].astype(object)
                base_frame_as_obj = base_frame[groupby_var].astype(object)
            except ValueError:
                msg = u"Could not join {}.{} (dtype={}) with {}.{} (dtype={})"
                raise ValueError(msg.format(entity.id, index_var,
                                            frame[index_var].dtype,
                                            child_entity.id, groupby_var,
                                            base_frame[groupby_var].dtype))
            else:
                no_instances = check_no_related_instances(
                    frame_as_obj.values, base_frame_as_obj.values)
        else:
            no_instances = check_no_related_instances(
                frame[index_var].values, base_frame[groupby_var].values)

    # `no_instances` is only read when base_frame is non-empty, so the
    # short-circuiting `or` keeps this safe for the empty case
    if base_frame.empty or no_instances:
        for f in features:
            set_default_column(entity_frames[entity.id], f)
        return frame

    def wrap_func_with_name(func, name):
        # give each aggregation callable a unique __name__ so pandas
        # does not collide/deduplicate same-named functions in .agg()
        def inner(x):
            return func(x)
        inner.__name__ = name
        return inner

    to_agg = {}
    agg_rename = {}
    to_apply = set()
    # apply multivariable and time-dependent features as we find them, and
    # save aggregable features for later
    for f in features:
        if _can_agg(f):
            variable_id = f.base_features[0].get_name()
            if variable_id not in to_agg:
                to_agg[variable_id] = []
            func = f.get_function()
            # make sure function names are unique
            random_id = str(uuid.uuid1())
            func = wrap_func_with_name(func, random_id)
            funcname = random_id

            to_agg[variable_id].append(func)
            # used below to rename the pandas-generated columns back to
            # the feature names
            agg_rename[u"{}-{}".format(variable_id, funcname)] = \
                f.get_name()
            continue

        to_apply.add(f)

    # Apply the non-aggregable functions generate a new dataframe, and merge
    # it with the existing one
    if len(to_apply):
        wrap = agg_wrapper(to_apply, self.time_last)
        # groupby_var can be both the name of the index and a column,
        # to silence pandas warning about ambiguity we explicitly pass
        # the column (in actuality grouping by both index and group would
        # work)
        to_merge = base_frame.groupby(base_frame[groupby_var]).apply(wrap)
        # drop the extra index level introduced by the group-wise apply
        to_merge.reset_index(1, drop=True, inplace=True)
        frame = pd.merge(left=frame, right=to_merge,
                         left_on=index_var, right_index=True,
                         how='left')

    # Apply the aggregate functions to generate a new dataframe, and merge
    # it with the existing one
    # Do the [variables] accessor on to_merge because the agg call returns
    # a dataframe with columns that contain the dataframes we want
    if len(to_agg):
        # groupby_var can be both the name of the index and a column,
        # to silence pandas warning about ambiguity we explicitly pass
        # the column (in actuality grouping by both index and group would
        # work)
        to_merge = base_frame.groupby(base_frame[groupby_var]).agg(to_agg)
        # we apply multiple functions to each column, creating
        # a multiindex as the column
        # rename the columns to a concatenation of the two indexes
        to_merge.columns = [u"{}-{}".format(n1, n2)
                            for n1, n2 in to_merge.columns.ravel()]
        # to enable a rename
        to_merge = to_merge.rename(columns=agg_rename)
        variables = list(agg_rename.values())
        to_merge = to_merge[variables]
        frame = pd.merge(left=frame, right=to_merge, left_on=index_var,
                         right_index=True, how='left')

    # Handle default values
    # 1. handle non scalar default values
    iterfeats = [f for f in features
                 if hasattr(f.default_value, '__iter__')]
    for f in iterfeats:
        nulls = pd.isnull(frame[f.get_name()])
        for ni in nulls[nulls].index:
            frame.at[ni, f.get_name()] = f.default_value

    # 2. handle scalars default values
    fillna_dict = {f.get_name(): f.default_value for f in features
                   if f not in iterfeats}
    frame.fillna(fillna_dict, inplace=True)

    # convert boolean dtypes to floats as appropriate
    # pandas behavior: https://github.com/pydata/pandas/issues/3752
    for f in features:
        if (not f.expanding and
                f.variable_type == variable_types.Numeric and
                frame[f.get_name()].dtype.name in ['object', 'bool']):
            frame[f.get_name()] = frame[f.get_name()].astype(float)

    return frame