def set_time_index(self, variable_id, already_sorted=False):
    # check time type
    if self.df.empty:
        time_to_check = vtypes.DEFAULT_DTYPE_VALUES[self[variable_id]._default_pandas_dtype]
    else:
        time_to_check = self.df[variable_id].iloc[0]
    time_type = _check_time_type(time_to_check)
    if time_type is None:
        raise TypeError("%s time index not recognized as numeric or"
                        " datetime" % (self.id))

    if self.entityset.time_type is None:
        self.entityset.time_type = time_type
    elif self.entityset.time_type != time_type:
        raise TypeError("%s time index is %s type which differs from"
                        " other entityset time indexes" % (self.id, time_type))

    # use stable sort
    if not already_sorted:
        # sort by time variable, then by index
        self.df.sort_values([variable_id, self.index],
                            kind="mergesort", inplace=True)

    t = vtypes.NumericTimeIndex
    if col_is_datetime(self.df[variable_id]):
        t = vtypes.DatetimeTimeIndex
    self.convert_variable_type(variable_id, t, convert_data=False)
    self.time_index = variable_id
def set_time_index(self, variable_id, already_sorted=False):
    if variable_id is not None:
        # check time type
        time_type = _check_time_type(self.df[variable_id].iloc[0])
        if time_type is None:
            raise TypeError("%s time index not recognized as numeric or"
                            " datetime" % (self.id))

        if self.entityset.time_type is None:
            self.entityset.time_type = time_type
        elif self.entityset.time_type != time_type:
            raise TypeError("%s time index is %s type which differs from"
                            " other entityset time indexes" % (self.id, time_type))

        # use stable sort
        if not already_sorted:
            # sort by time variable, then by index
            self.df.sort_values([variable_id, self.index],
                                kind="mergesort", inplace=True)

        t = vtypes.TimeIndex
        if col_is_datetime(self.df[variable_id]):
            t = vtypes.DatetimeTimeIndex
        self.convert_variable_type(variable_id, t, convert_data=False)
    else:
        # TODO: add test for this
        if not already_sorted:
            # sort by index
            self.df.sort_values([self.index], kind="mergesort", inplace=True)

    super(Entity, self).set_time_index(variable_id)
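# Usage sketch (hypothetical data, not part of the source above): set_time_index
# is normally reached through the public featuretools API rather than called
# directly. A minimal example assuming the legacy (pre-1.0) featuretools API,
# where entity_from_dataframe accepts a time_index argument:

import pandas as pd
import featuretools as ft

events = pd.DataFrame({
    "id": [1, 2, 3],
    "timestamp": pd.to_datetime(["2019-01-01", "2019-01-02", "2019-01-03"]),
})
es = ft.EntitySet(id="example")
# time_index="timestamp" ultimately triggers Entity.set_time_index, which
# validates the time type against the entityset and stable-sorts the dataframe
es = es.entity_from_dataframe(entity_id="events", dataframe=events,
                              index="id", time_index="timestamp")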
def set_secondary_time_index(self, secondary_time_index):
    if secondary_time_index is not None:
        for time_index in secondary_time_index:
            time_type = _check_time_type(self.df[time_index].iloc[0])
            if time_type is None:
                raise TypeError("%s time index not recognized as numeric or"
                                " datetime" % (self.id))
            if self.entityset.time_type != time_type:
                raise TypeError("%s time index is %s type which differs from"
                                " other entityset time indexes" % (self.id, time_type))

    self.secondary_time_index = secondary_time_index or {}
def set_secondary_time_index(self, secondary_time_index): if secondary_time_index is not None: for time_index in secondary_time_index: time_type = _check_time_type(self.df[time_index].iloc[0]) if time_type is None: raise TypeError("%s time index not recognized as numeric or" " datetime" % (self.id)) if self.entityset.time_type != time_type: raise TypeError("%s time index is %s type which differs from" " other entityset time indexes" % (self.id, time_type)) super(Entity, self).set_secondary_time_index(secondary_time_index)
def set_secondary_time_index(self, secondary_time_index):
    if secondary_time_index is not None:
        for time_index in secondary_time_index:
            if self.df.empty:
                time_to_check = vtypes.DEFAULT_DTYPE_VALUES[self[time_index]._default_pandas_dtype]
            else:
                time_to_check = self.df[time_index].iloc[0]
            time_type = _check_time_type(time_to_check)
            if time_type is None:
                raise TypeError("%s time index not recognized as numeric or"
                                " datetime" % (self.id))
            if self.entityset.time_type != time_type:
                raise TypeError("%s time index is %s type which differs from"
                                " other entityset time indexes" % (self.id, time_type))

    self.secondary_time_index = secondary_time_index or {}
def set_secondary_time_index(self, secondary_time_index):
    for time_index, columns in secondary_time_index.items():
        if is_instance(self.df, (dd, ks), 'DataFrame') or self.df.empty:
            time_to_check = vtypes.DEFAULT_DTYPE_VALUES[self[time_index]._default_pandas_dtype]
        else:
            time_to_check = self.df[time_index].head(1).iloc[0]
        time_type = _check_time_type(time_to_check)
        if time_type is None:
            raise TypeError("%s time index not recognized as numeric or"
                            " datetime" % (self.id))
        if self.entityset.time_type != time_type:
            raise TypeError("%s time index is %s type which differs from"
                            " other entityset time indexes" % (self.id, time_type))
        if time_index not in columns:
            columns.append(time_index)

    self.secondary_time_index = secondary_time_index
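# Usage sketch (hypothetical data): a secondary time index marks columns whose
# values only become known at a later time than the entity's primary time
# index. Assuming the legacy featuretools entity_from_dataframe API:

import pandas as pd
import featuretools as ft

orders = pd.DataFrame({
    "order_id": [1, 2],
    "order_date": pd.to_datetime(["2019-01-01", "2019-01-05"]),
    "cancel_date": pd.to_datetime(["2019-01-03", "2019-01-08"]),
    "cancel_reason": ["out of stock", "changed mind"],
})
es = ft.EntitySet(id="shop")
# cancel_reason is only known as of cancel_date, so it is grouped under the
# secondary time index; set_secondary_time_index validates cancel_date's type
# against the entityset's time type
es = es.entity_from_dataframe(entity_id="orders", dataframe=orders,
                              index="order_id", time_index="order_date",
                              secondary_time_index={"cancel_date": ["cancel_reason"]})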
def _check_cutoff_time_type(cutoff_time, es_time_type): """ Check that the cutoff time values are of the proper type given the entityset time type """ # Check that cutoff_time time type matches entityset time type if isinstance(cutoff_time, tuple): cutoff_time_value = cutoff_time[0] time_type = _check_time_type(cutoff_time_value) is_numeric = time_type == NumericTimeIndex is_datetime = time_type == DatetimeTimeIndex else: cutoff_time_dtype = cutoff_time['time'].dtype.name is_numeric = cutoff_time_dtype in PandasTypes._pandas_numerics is_datetime = cutoff_time_dtype in PandasTypes._pandas_datetimes if es_time_type == NumericTimeIndex and not is_numeric: raise TypeError("cutoff_time times must be numeric: try casting " "via pd.to_numeric()") if es_time_type == DatetimeTimeIndex and not is_datetime: raise TypeError("cutoff_time times must be datetime type: try casting " "via pd.to_datetime()")
def _check_cutoff_time_type(cutoff_time, es_time_type): """ Check that the cutoff time values are of the proper type given the entityset time type """ # Check that cutoff_time time type matches entityset time type if isinstance(cutoff_time, tuple): cutoff_time_value = cutoff_time[0] time_type = _check_time_type(cutoff_time_value) is_numeric = time_type == 'numeric' is_datetime = time_type == Datetime else: cutoff_time_col = cutoff_time.ww['time'] is_numeric = cutoff_time_col.ww.schema.is_numeric is_datetime = cutoff_time_col.ww.schema.is_datetime if es_time_type == "numeric" and not is_numeric: raise TypeError("cutoff_time times must be numeric: try casting " "via pd.to_numeric()") if es_time_type == Datetime and not is_datetime: raise TypeError("cutoff_time times must be datetime type: try casting " "via pd.to_datetime()")
def calculate_feature_matrix(features, entityset=None, cutoff_time=None,
                             instance_ids=None, entities=None,
                             relationships=None, cutoff_time_in_index=False,
                             training_window=None, approximate=None,
                             save_progress=None, verbose=False,
                             chunk_size=None, profile=False):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[PrimitiveBase]): Feature definitions to be calculated.
        entityset (EntitySet): An already initialized entityset. Required if
            `entities` and `relationships` not provided.
        cutoff_time (pd.DataFrame or Datetime): Specifies at which time to
            calculate the features for each instance. Can either be a
            DataFrame with 'instance_id' and 'time' columns, a DataFrame with
            the name of the index variable in the target entity and a time
            column, or a single value to calculate for all instances. If the
            dataframe has more than two columns, any additional columns will
            be added to the resulting feature matrix.
        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.
        entities (dict[str -> tuple(pd.DataFrame, str, str)]): dictionary of
            entities. Entries take the format
            {entity id: (dataframe, id column, (time_column))}.
        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).
        cutoff_time_in_index (bool): If True, return a DataFrame with a
            MultiIndex where the second index is the cutoff time (first is
            instance id). DataFrame will be sorted by (time, instance_id).
        training_window (dict[str -> Timedelta] or Timedelta, optional):
            Window or windows defining how much older than the cutoff time
            data can be to be included when calculating the feature. To
            specify which entities to apply windows to, use a dictionary
            mapping entity id -> Timedelta. If None, all older data is used.
        approximate (Timedelta or str): Frequency to group instances with
            similar cutoff times by for features with costly calculations.
            For example, if bucket is 24 hours, all instances with cutoff
            times on the same day will use the same calculation for
            expensive features.
        verbose (bool, optional): Print progress info. The time granularity is
            per chunk.
        profile (bool, optional): Enables profiling if True.
        chunk_size (int or float or None or "cutoff time"): Number of rows of
            the output feature matrix to calculate at a time. If passed an
            integer greater than 0, will try to use that many rows per chunk.
            If passed a float value between 0 and 1, sets the chunk size to
            that percentage of all instances. If passed the string
            "cutoff time", rows are split per cutoff time.
        save_progress (str, optional): path to save intermediate computational
            results.
    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, PrimitiveBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if entities is not None and relationships is not None:
            entityset = EntitySet("entityset", entities, relationships)

    target_entity = entityset[features[0].entity.id]
    pass_columns = []

    if not isinstance(cutoff_time, pd.DataFrame):
        if isinstance(cutoff_time, list):
            raise TypeError("cutoff_time must be a single value or DataFrame")

        if cutoff_time is None:
            if entityset.time_type == NumericTimeIndex:
                cutoff_time = np.inf
            else:
                cutoff_time = datetime.now()

        if instance_ids is None:
            index_var = target_entity.index
            instance_ids = target_entity.df[index_var].tolist()

        cutoff_time = [cutoff_time] * len(instance_ids)
        map_args = [(id, time) for id, time in zip(instance_ids, cutoff_time)]
        cutoff_time = pd.DataFrame(map_args, columns=['instance_id', 'time'])
    else:
        cutoff_time = cutoff_time.copy()

        # handle how columns are named in cutoff_time
        if "instance_id" not in cutoff_time.columns:
            if target_entity.index not in cutoff_time.columns:
                raise AttributeError(
                    'Name of the index variable in the target entity'
                    ' or "instance_id" must be present in cutoff_time')
            # rename to instance_id
            cutoff_time.rename(columns={target_entity.index: "instance_id"},
                               inplace=True)

        if "time" not in cutoff_time.columns:
            # take the first column that isn't instance_id and assume it is time
            not_instance_id = [c for c in cutoff_time.columns
                               if c != "instance_id"]
            cutoff_time.rename(columns={not_instance_id[0]: "time"},
                               inplace=True)

    if cutoff_time['time'].dtype == object:
        if (entityset.time_type == NumericTimeIndex and
                cutoff_time['time'].dtype.name.find('int') == -1 and
                cutoff_time['time'].dtype.name.find('float') == -1):
            raise TypeError("cutoff_time times must be numeric: try casting "
                            "via pd.to_numeric(cutoff_time['time'])")
        elif (entityset.time_type == DatetimeTimeIndex and
                cutoff_time['time'].dtype.name.find('time') == -1):
            raise TypeError("cutoff_time times must be datetime type: try "
                            "casting via pd.to_datetime(cutoff_time['time'])")

    pass_columns = [column_name for column_name in cutoff_time.columns[2:]]

    if _check_time_type(cutoff_time['time'].iloc[0]) is None:
        raise ValueError("cutoff_time time values must be datetime or numeric")

    backend = PandasBackend(entityset, features)

    # Get dictionary of features to approximate
    if approximate is not None:
        to_approximate, all_approx_feature_set = gather_approximate_features(
            features, backend)
    else:
        to_approximate = defaultdict(list)
        all_approx_feature_set = None

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationPrimitive):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        deps = feature.get_deep_dependencies(all_approx_feature_set)
        for dependency in deps:
            if (isinstance(dependency, AggregationPrimitive) and
                    dependency not in to_approximate[dependency.entity.id]):
                no_unapproximated_aggs = False
                break

    cutoff_df_time_var = 'time'
    target_time = '_original_time'
    num_per_chunk = calc_num_per_chunk(chunk_size, cutoff_time.shape)

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time.copy(), approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var]

        cutoff_time_to_pass = binned_cutoff_time
    else:
        cutoff_time_to_pass = cutoff_time

    if num_per_chunk == "cutoff time":
        iterator = cutoff_time_to_pass.groupby(cutoff_df_time_var)
    else:
        iterator = get_next_chunk(cutoff_time=cutoff_time_to_pass,
                                  time_variable=cutoff_df_time_var,
                                  num_per_chunk=num_per_chunk)

    # if verbose, create progress bar
    if verbose:
        chunks = []
        if num_per_chunk == "cutoff time":
            for _, group in iterator:
                chunks.append(group)
        else:
            for chunk in iterator:
                chunks.append(chunk)

        pbar_string = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                       "Progress: {l_bar}{bar}| "
                       "Calculated: {n}/{total} chunks")
        iterator = make_tqdm_iterator(iterable=chunks,
                                      total=len(chunks),
                                      bar_format=pbar_string)

    feature_matrix = []
    backend = PandasBackend(entityset, features)

    for chunk in iterator:
        # if not using chunks, pull out the group dataframe
        if isinstance(chunk, tuple):
            chunk = chunk[1]
        _feature_matrix = calculate_chunk(features, chunk, approximate,
                                          entityset, training_window,
                                          profile, verbose, save_progress,
                                          backend, no_unapproximated_aggs,
                                          cutoff_df_time_var, target_time,
                                          pass_columns)
        feature_matrix.append(_feature_matrix)
        # Do a manual garbage collection in case objects from calculate_chunk
        # weren't collected automatically
        gc.collect()

    if verbose:
        iterator.close()

    feature_matrix = pd.concat(feature_matrix)
    feature_matrix.sort_index(level='time', kind='mergesort', inplace=True)
    if not cutoff_time_in_index:
        feature_matrix.reset_index(level='time', drop=True, inplace=True)

    if save_progress and os.path.exists(os.path.join(save_progress, 'temp')):
        shutil.rmtree(os.path.join(save_progress, 'temp'))

    return feature_matrix
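# End-to-end sketch (hypothetical, using the public featuretools demo data):
# the typical call path pairs dfs-generated feature definitions with a
# cutoff-time frame. ft.dfs and ft.demo.load_mock_customer are real public
# API; the exact cutoff values below are made up.

import pandas as pd
import featuretools as ft

es = ft.demo.load_mock_customer(return_entityset=True)
feature_defs = ft.dfs(entityset=es, target_entity="customers",
                      features_only=True)
cutoff_time = pd.DataFrame({
    "instance_id": [1, 2],
    "time": pd.to_datetime(["2014-01-01", "2014-01-02"]),
})
fm = ft.calculate_feature_matrix(features=feature_defs, entityset=es,
                                 cutoff_time=cutoff_time,
                                 cutoff_time_in_index=True)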
def calculate_feature_matrix(features, entityset=None, cutoff_time=None,
                             instance_ids=None, entities=None,
                             relationships=None, cutoff_time_in_index=False,
                             training_window=None, approximate=None,
                             save_progress=None, verbose=False,
                             chunk_size=None, n_jobs=1, dask_kwargs=None):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[:class:`.FeatureBase`]): Feature definitions to be calculated.
        entityset (EntitySet): An already initialized entityset. Required if
            `entities` and `relationships` not provided.
        cutoff_time (pd.DataFrame or Datetime): Specifies at which time to
            calculate the features for each instance. The resulting feature
            matrix will use data up to and including the cutoff_time. Can
            either be a DataFrame with 'instance_id' and 'time' columns, a
            DataFrame with the name of the index variable in the target
            entity and a time column, or a single value to calculate for all
            instances. If the dataframe has more than two columns, any
            additional columns will be added to the resulting feature matrix.
        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.
        entities (dict[str -> tuple(pd.DataFrame, str, str)]): dictionary of
            entities. Entries take the format
            {entity id: (dataframe, id column, (time_column))}.
        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).
        cutoff_time_in_index (bool): If True, return a DataFrame with a
            MultiIndex where the second index is the cutoff time (first is
            instance id). DataFrame will be sorted by (time, instance_id).
        training_window (Timedelta or str, optional): Window defining how much
            time before the cutoff time data can be used when calculating
            features. If ``None``, all data before cutoff time is used.
            Defaults to ``None``.
        approximate (Timedelta or str): Frequency to group instances with
            similar cutoff times by for features with costly calculations.
            For example, if bucket is 24 hours, all instances with cutoff
            times on the same day will use the same calculation for
            expensive features.
        verbose (bool, optional): Print progress info. The time granularity is
            per chunk.
        chunk_size (int or float or None or "cutoff time"): Number of rows of
            the output feature matrix to calculate at a time. If passed an
            integer greater than 0, will try to use that many rows per chunk.
            If passed a float value between 0 and 1, sets the chunk size to
            that percentage of all instances. If passed the string
            "cutoff time", rows are split per cutoff time.
        n_jobs (int, optional): number of parallel processes to use when
            calculating feature matrix.
        dask_kwargs (dict, optional): Dictionary of keyword arguments to be
            passed when creating the dask client and scheduler. Even if
            n_jobs is not set, using `dask_kwargs` will enable
            multiprocessing. Main parameters:

            cluster (str or dask.distributed.LocalCluster):
                cluster or address of cluster to send tasks to. If
                unspecified, a cluster will be created.
            diagnostics port (int):
                port number to use for web dashboard. If left unspecified,
                web interface will not be enabled.

            Valid keyword arguments for LocalCluster will also be accepted.
        save_progress (str, optional): path to save intermediate computational
            results.
    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, FeatureBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if entities is not None and relationships is not None:
            entityset = EntitySet("entityset", entities, relationships)

    target_entity = entityset[features[0].entity.id]
    pass_columns = []

    if not isinstance(cutoff_time, pd.DataFrame):
        if isinstance(cutoff_time, list):
            raise TypeError("cutoff_time must be a single value or DataFrame")

        if cutoff_time is None:
            if entityset.time_type == NumericTimeIndex:
                cutoff_time = np.inf
            else:
                cutoff_time = datetime.now()

        if instance_ids is None:
            index_var = target_entity.index
            df = target_entity._handle_time(target_entity.df,
                                            time_last=cutoff_time,
                                            training_window=training_window)
            instance_ids = df[index_var].tolist()

        cutoff_time = [cutoff_time] * len(instance_ids)
        map_args = [(id, time) for id, time in zip(instance_ids, cutoff_time)]
        cutoff_time = pd.DataFrame(map_args, columns=['instance_id', 'time'])

    cutoff_time = cutoff_time.reset_index(drop=True)

    # handle how columns are named in cutoff_time
    # maybe add _check_time_dtype helper function
    if "instance_id" not in cutoff_time.columns:
        if target_entity.index not in cutoff_time.columns:
            raise AttributeError(
                'Name of the index variable in the target entity'
                ' or "instance_id" must be present in cutoff_time')
        # rename to instance_id
        cutoff_time.rename(columns={target_entity.index: "instance_id"},
                           inplace=True)

    if "time" not in cutoff_time.columns:
        # take the first column that isn't instance_id and assume it is time
        not_instance_id = [c for c in cutoff_time.columns
                           if c != "instance_id"]
        cutoff_time.rename(columns={not_instance_id[0]: "time"}, inplace=True)

    # Check that cutoff_time time type matches entityset time type
    if entityset.time_type == NumericTimeIndex:
        if cutoff_time['time'].dtype.name not in PandasTypes._pandas_numerics:
            raise TypeError("cutoff_time times must be numeric: try casting "
                            "via pd.to_numeric(cutoff_time['time'])")
    elif entityset.time_type == DatetimeTimeIndex:
        if cutoff_time['time'].dtype.name not in PandasTypes._pandas_datetimes:
            raise TypeError("cutoff_time times must be datetime type: try "
                            "casting via pd.to_datetime(cutoff_time['time'])")

    assert (cutoff_time[['instance_id', 'time']].duplicated().sum() == 0), \
        "Duplicated rows in cutoff time dataframe."

    pass_columns = [column_name for column_name in cutoff_time.columns[2:]]

    if _check_time_type(cutoff_time['time'].iloc[0]) is None:
        raise ValueError("cutoff_time time values must be datetime or numeric")

    # make sure dtype of instance_id in cutoff time
    # is same as column it references
    target_entity = features[0].entity
    dtype = entityset[target_entity.id].df[target_entity.index].dtype
    cutoff_time["instance_id"] = cutoff_time["instance_id"].astype(dtype)

    feature_set = FeatureSet(features)

    # Get features to approximate
    if approximate is not None:
        approximate_feature_trie = gather_approximate_features(feature_set)
        # Make a new FeatureSet that ignores approximated features
        feature_set = FeatureSet(
            features, approximate_feature_trie=approximate_feature_trie)

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationFeature):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        if approximate is not None:
            all_approx_features = {f for _, feats
                                   in feature_set.approximate_feature_trie
                                   for f in feats}
        else:
            all_approx_features = set()
        deps = feature.get_dependencies(deep=True, ignored=all_approx_features)
        for dependency in deps:
            if isinstance(dependency, AggregationFeature):
                no_unapproximated_aggs = False
                break

    cutoff_df_time_var = 'time'
    target_time = '_original_time'
    num_per_chunk = calc_num_per_chunk(chunk_size, cutoff_time.shape)

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time.copy(), approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var]

        cutoff_time_to_pass = binned_cutoff_time
    else:
        cutoff_time_to_pass = cutoff_time

    if num_per_chunk == "cutoff time":
        iterator = cutoff_time_to_pass.groupby(cutoff_df_time_var)
    else:
        iterator = get_next_chunk(cutoff_time=cutoff_time_to_pass,
                                  time_variable=cutoff_df_time_var,
                                  num_per_chunk=num_per_chunk)

    chunks = []
    if num_per_chunk == "cutoff time":
        for _, group in iterator:
            chunks.append(group)
    else:
        for chunk in iterator:
            chunks.append(chunk)

    if n_jobs != 1 or dask_kwargs is not None:
        feature_matrix = parallel_calculate_chunks(
            chunks=chunks,
            feature_set=feature_set,
            approximate=approximate,
            training_window=training_window,
            verbose=verbose,
            save_progress=save_progress,
            entityset=entityset,
            n_jobs=n_jobs,
            no_unapproximated_aggs=no_unapproximated_aggs,
            cutoff_df_time_var=cutoff_df_time_var,
            target_time=target_time,
            pass_columns=pass_columns,
            dask_kwargs=dask_kwargs or {})
    else:
        feature_matrix = linear_calculate_chunks(
            chunks=chunks,
            feature_set=feature_set,
            approximate=approximate,
            training_window=training_window,
            verbose=verbose,
            save_progress=save_progress,
            entityset=entityset,
            no_unapproximated_aggs=no_unapproximated_aggs,
            cutoff_df_time_var=cutoff_df_time_var,
            target_time=target_time,
            pass_columns=pass_columns)

    feature_matrix = pd.concat(feature_matrix)
    feature_matrix.sort_index(level='time', kind='mergesort', inplace=True)
    if not cutoff_time_in_index:
        feature_matrix.reset_index(level='time', drop=True, inplace=True)

    if save_progress and os.path.exists(os.path.join(save_progress, 'temp')):
        shutil.rmtree(os.path.join(save_progress, 'temp'))

    return feature_matrix
def calculate_feature_matrix(features, entityset=None, cutoff_time=None,
                             instance_ids=None, entities=None,
                             relationships=None, cutoff_time_in_index=False,
                             training_window=None, approximate=None,
                             save_progress=None, verbose=False,
                             chunk_size=None, n_jobs=1, dask_kwargs=None,
                             progress_callback=None, include_cutoff_time=True):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[:class:`.FeatureBase`]): Feature definitions to be calculated.
        entityset (EntitySet): An already initialized entityset. Required if
            `entities` and `relationships` not provided.
        cutoff_time (pd.DataFrame or Datetime): Specifies times at which to
            calculate the features for each instance. The resulting feature
            matrix will use data up to and including the cutoff_time. Can
            either be a DataFrame or a single value. If a DataFrame is
            passed, the instance ids for which to calculate features must be
            in a column with the same name as the target entity index or a
            column named `instance_id`. The cutoff time values in the
            DataFrame must be in a column with the same name as the target
            entity time index or a column named `time`. If the DataFrame has
            more than two columns, any additional columns will be added to
            the resulting feature matrix. If a single value is passed, this
            value will be used for all instances.
        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.
        entities (dict[str -> tuple(pd.DataFrame, str, str, dict[str -> Variable])]):
            dictionary of entities. Entries take the format
            {entity id -> (dataframe, id column, (time_column), (variable_types))}.
            Note that time_column and variable_types are optional.
        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).
        cutoff_time_in_index (bool): If True, return a DataFrame with a
            MultiIndex where the second index is the cutoff time (first is
            instance id). DataFrame will be sorted by (time, instance_id).
        training_window (Timedelta or str, optional): Window defining how much
            time before the cutoff time data can be used when calculating
            features. If ``None``, all data before cutoff time is used.
            Defaults to ``None``.
        approximate (Timedelta or str): Frequency to group instances with
            similar cutoff times by for features with costly calculations.
            For example, if bucket is 24 hours, all instances with cutoff
            times on the same day will use the same calculation for
            expensive features.
        verbose (bool, optional): Print progress info. The time granularity is
            per chunk.
        chunk_size (int or float or None): maximum number of rows of the
            output feature matrix to calculate at a time. If passed an
            integer greater than 0, will try to use that many rows per chunk.
            If passed a float value between 0 and 1, sets the chunk size to
            that percentage of all rows. If None and n_jobs > 1, it will be
            set to 1/n_jobs.
        n_jobs (int, optional): number of parallel processes to use when
            calculating feature matrix.
        dask_kwargs (dict, optional): Dictionary of keyword arguments to be
            passed when creating the dask client and scheduler. Even if
            n_jobs is not set, using `dask_kwargs` will enable
            multiprocessing. Main parameters:

            cluster (str or dask.distributed.LocalCluster):
                cluster or address of cluster to send tasks to. If
                unspecified, a cluster will be created.
            diagnostics port (int):
                port number to use for web dashboard. If left unspecified,
                web interface will not be enabled.

            Valid keyword arguments for LocalCluster will also be accepted.
        save_progress (str, optional): path to save intermediate computational
            results.
        progress_callback (callable): function to be called with incremental
            progress updates. Has the following parameters:

            update: percentage change (float between 0 and 100) in progress
                since last call
            progress_percent: percentage (float between 0 and 100) of total
                computation completed
            time_elapsed: total time in seconds that has elapsed since start
                of call
        include_cutoff_time (bool): Include data at cutoff times in feature
            calculations. Defaults to ``True``.
    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, FeatureBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if entities is not None and relationships is not None:
            entityset = EntitySet("entityset", entities, relationships)

    if any(isinstance(es.df, dd.DataFrame) for es in entityset.entities):
        if approximate:
            msg = "Using approximate is not supported with Dask Entities"
            raise ValueError(msg)
        if training_window:
            msg = "Using training_window is not supported with Dask Entities"
            raise ValueError(msg)

    target_entity = entityset[features[0].entity.id]
    pass_columns = []

    if not isinstance(cutoff_time, pd.DataFrame):
        if isinstance(cutoff_time, list):
            raise TypeError("cutoff_time must be a single value or DataFrame")

        if isinstance(cutoff_time, dd.DataFrame):
            msg = "cannot use Dask DataFrame for cutoff_time: " \
                  "cutoff_time must be a single value or a pandas DataFrame"
            raise TypeError(msg)

        if cutoff_time is None:
            if entityset.time_type == NumericTimeIndex:
                cutoff_time = np.inf
            else:
                cutoff_time = datetime.now()

        if instance_ids is None:
            index_var = target_entity.index
            df = target_entity._handle_time(
                target_entity.df,
                time_last=cutoff_time,
                training_window=training_window,
                include_cutoff_time=include_cutoff_time)
            instance_ids = list(df[index_var])

        cutoff_time = [cutoff_time] * len(instance_ids)
        map_args = [(id, time) for id, time in zip(instance_ids, cutoff_time)]
        cutoff_time = pd.DataFrame(map_args, columns=['instance_id', 'time'])

    cutoff_time = cutoff_time.reset_index(drop=True)

    # handle how columns are named in cutoff_time
    # maybe add _check_time_dtype helper function
    if "instance_id" not in cutoff_time.columns:
        if target_entity.index not in cutoff_time.columns:
            raise AttributeError(
                'Cutoff time DataFrame must contain a column with either the same name'
                ' as the target entity index or a column named "instance_id"')
        # rename to instance_id
        cutoff_time.rename(columns={target_entity.index: "instance_id"},
                           inplace=True)

    if "time" not in cutoff_time.columns:
        if target_entity.time_index and target_entity.time_index not in cutoff_time.columns:
            raise AttributeError(
                'Cutoff time DataFrame must contain a column with either the same name'
                ' as the target entity time_index or a column named "time"')
        # rename to time
        cutoff_time.rename(columns={target_entity.time_index: "time"},
                           inplace=True)

    # Make sure user supplies only one valid name for instance id and time columns
    if "instance_id" in cutoff_time.columns and target_entity.index in cutoff_time.columns and \
            "instance_id" != target_entity.index:
        raise AttributeError(
            'Cutoff time DataFrame cannot contain both a column named "instance_id" and a column'
            ' with the same name as the target entity index')

    if "time" in cutoff_time.columns and target_entity.time_index in cutoff_time.columns and \
            "time" != target_entity.time_index:
        raise AttributeError(
            'Cutoff time DataFrame cannot contain both a column named "time" and a column'
            ' with the same name as the target entity time index')

    # Check that cutoff_time time type matches entityset time type
    if entityset.time_type == NumericTimeIndex:
        if cutoff_time['time'].dtype.name not in PandasTypes._pandas_numerics:
            raise TypeError("cutoff_time times must be numeric: try casting "
                            "via pd.to_numeric(cutoff_time['time'])")
    elif entityset.time_type == DatetimeTimeIndex:
        if cutoff_time['time'].dtype.name not in PandasTypes._pandas_datetimes:
            raise TypeError("cutoff_time times must be datetime type: try "
                            "casting via pd.to_datetime(cutoff_time['time'])")

    assert (cutoff_time[['instance_id', 'time']].duplicated().sum() == 0), \
        "Duplicated rows in cutoff time dataframe."

    pass_columns = [col for col in cutoff_time.columns
                    if col not in ['instance_id', 'time']]

    if _check_time_type(cutoff_time['time'].iloc[0]) is None:
        raise ValueError("cutoff_time time values must be datetime or numeric")

    # make sure dtype of instance_id in cutoff time
    # is same as column it references
    target_entity = features[0].entity
    dtype = entityset[target_entity.id].df[target_entity.index].dtype
    cutoff_time["instance_id"] = cutoff_time["instance_id"].astype(dtype)

    feature_set = FeatureSet(features)

    # Get features to approximate
    if approximate is not None:
        approximate_feature_trie = gather_approximate_features(feature_set)
        # Make a new FeatureSet that ignores approximated features
        feature_set = FeatureSet(
            features, approximate_feature_trie=approximate_feature_trie)

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationFeature):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        if approximate is not None:
            all_approx_features = {f for _, feats
                                   in feature_set.approximate_feature_trie
                                   for f in feats}
        else:
            all_approx_features = set()
        deps = feature.get_dependencies(deep=True, ignored=all_approx_features)
        for dependency in deps:
            if isinstance(dependency, AggregationFeature):
                no_unapproximated_aggs = False
                break

    cutoff_df_time_var = 'time'
    target_time = '_original_time'

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time.copy(), approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var]

        cutoff_time_to_pass = binned_cutoff_time
    else:
        cutoff_time_to_pass = cutoff_time

    chunk_size = _handle_chunk_size(chunk_size, cutoff_time.shape[0])
    tqdm_options = {'total': (cutoff_time.shape[0] / FEATURE_CALCULATION_PERCENTAGE),
                    'bar_format': PBAR_FORMAT,
                    'disable': True}

    if verbose:
        tqdm_options.update({'disable': False})
    elif progress_callback is not None:
        # allows us to utilize progress_bar updates without printing to anywhere
        tqdm_options.update({'file': open(os.devnull, 'w'), 'disable': False})

    progress_bar = make_tqdm_iterator(**tqdm_options)
    progress_bar._instances.clear()

    if n_jobs != 1 or dask_kwargs is not None:
        feature_matrix = parallel_calculate_chunks(
            cutoff_time=cutoff_time_to_pass,
            chunk_size=chunk_size,
            feature_set=feature_set,
            approximate=approximate,
            training_window=training_window,
            save_progress=save_progress,
            entityset=entityset,
            n_jobs=n_jobs,
            no_unapproximated_aggs=no_unapproximated_aggs,
            cutoff_df_time_var=cutoff_df_time_var,
            target_time=target_time,
            pass_columns=pass_columns,
            progress_bar=progress_bar,
            dask_kwargs=dask_kwargs or {},
            progress_callback=progress_callback,
            include_cutoff_time=include_cutoff_time)
    else:
        feature_matrix = calculate_chunk(
            cutoff_time=cutoff_time_to_pass,
            chunk_size=chunk_size,
            feature_set=feature_set,
            approximate=approximate,
            training_window=training_window,
            save_progress=save_progress,
            entityset=entityset,
            no_unapproximated_aggs=no_unapproximated_aggs,
            cutoff_df_time_var=cutoff_df_time_var,
            target_time=target_time,
            pass_columns=pass_columns,
            progress_bar=progress_bar,
            progress_callback=progress_callback,
            include_cutoff_time=include_cutoff_time)

    # ensure rows are sorted by input order
    if isinstance(feature_matrix, pd.DataFrame):
        feature_matrix = feature_matrix.reindex(
            pd.MultiIndex.from_frame(cutoff_time[["instance_id", "time"]],
                                     names=feature_matrix.index.names))
        if not cutoff_time_in_index:
            feature_matrix.reset_index(level='time', drop=True, inplace=True)

    if save_progress and os.path.exists(os.path.join(save_progress, 'temp')):
        shutil.rmtree(os.path.join(save_progress, 'temp'))

    # force to 100% since we saved last 5 percent
    previous_progress = progress_bar.n
    progress_bar.update(progress_bar.total - progress_bar.n)

    if progress_callback is not None:
        update, progress_percent, time_elapsed = update_progress_callback_parameters(
            progress_bar, previous_progress)
        progress_callback(update, progress_percent, time_elapsed)

    progress_bar.refresh()
    progress_bar.close()

    return feature_matrix
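# Usage sketch for the progress hooks in this version (hypothetical handler;
# feature_defs, es, and cutoff_time reuse the names from the earlier sketch):
# progress_callback receives (update, progress_percent, time_elapsed) and can
# feed any external progress display, while include_cutoff_time=False excludes
# rows stamped exactly at the cutoff time from the calculation window.

def log_progress(update, progress_percent, time_elapsed):
    # update: percentage points gained since the last callback invocation
    print("+%.1f%% -> %.1f%% after %.1fs" % (update, progress_percent, time_elapsed))

fm = ft.calculate_feature_matrix(features=feature_defs, entityset=es,
                                 cutoff_time=cutoff_time,
                                 progress_callback=log_progress,
                                 include_cutoff_time=False)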
def calculate_feature_matrix(features, entityset=None, cutoff_time=None,
                             instance_ids=None, entities=None,
                             relationships=None, cutoff_time_in_index=False,
                             training_window=None, approximate=None,
                             save_progress=None, verbose=False,
                             chunk_size=None, n_jobs=1, dask_kwargs=None,
                             profile=False):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[PrimitiveBase]): Feature definitions to be calculated.
        entityset (EntitySet): An already initialized entityset. Required if
            `entities` and `relationships` not provided.
        cutoff_time (pd.DataFrame or Datetime): Specifies at which time to
            calculate the features for each instance. Can either be a
            DataFrame with 'instance_id' and 'time' columns, a DataFrame with
            the name of the index variable in the target entity and a time
            column, or a single value to calculate for all instances. If the
            dataframe has more than two columns, any additional columns will
            be added to the resulting feature matrix.
        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.
        entities (dict[str -> tuple(pd.DataFrame, str, str)]): dictionary of
            entities. Entries take the format
            {entity id: (dataframe, id column, (time_column))}.
        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).
        cutoff_time_in_index (bool): If True, return a DataFrame with a
            MultiIndex where the second index is the cutoff time (first is
            instance id). DataFrame will be sorted by (time, instance_id).
        training_window (Timedelta, optional): Window defining how much older
            than the cutoff time data can be to be included when calculating
            the feature. If None, all older data is used.
        approximate (Timedelta or str): Frequency to group instances with
            similar cutoff times by for features with costly calculations.
            For example, if bucket is 24 hours, all instances with cutoff
            times on the same day will use the same calculation for
            expensive features.
        verbose (bool, optional): Print progress info. The time granularity is
            per chunk.
        profile (bool, optional): Enables profiling if True.
        chunk_size (int or float or None or "cutoff time"): Number of rows of
            the output feature matrix to calculate at a time. If passed an
            integer greater than 0, will try to use that many rows per chunk.
            If passed a float value between 0 and 1, sets the chunk size to
            that percentage of all instances. If passed the string
            "cutoff time", rows are split per cutoff time.
        n_jobs (int, optional): number of parallel processes to use when
            calculating feature matrix.
        dask_kwargs (dict, optional): Dictionary of keyword arguments to be
            passed when creating the dask client and scheduler. Even if
            n_jobs is not set, using `dask_kwargs` will enable
            multiprocessing. Main parameters:

            cluster (str or dask.distributed.LocalCluster):
                cluster or address of cluster to send tasks to. If
                unspecified, a cluster will be created.
            diagnostics port (int):
                port number to use for web dashboard. If left unspecified,
                web interface will not be enabled.

            Valid keyword arguments for LocalCluster will also be accepted.
        save_progress (str, optional): path to save intermediate computational
            results.
    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, PrimitiveBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if entities is not None and relationships is not None:
            entityset = EntitySet("entityset", entities, relationships)

    target_entity = entityset[features[0].entity.id]
    pass_columns = []

    if not isinstance(cutoff_time, pd.DataFrame):
        if isinstance(cutoff_time, list):
            raise TypeError("cutoff_time must be a single value or DataFrame")

        if cutoff_time is None:
            if entityset.time_type == NumericTimeIndex:
                cutoff_time = np.inf
            else:
                cutoff_time = datetime.now()

        if instance_ids is None:
            index_var = target_entity.index
            instance_ids = target_entity.df[index_var].tolist()

        cutoff_time = [cutoff_time] * len(instance_ids)
        map_args = [(id, time) for id, time in zip(instance_ids, cutoff_time)]
        cutoff_time = pd.DataFrame(map_args, columns=['instance_id', 'time'])
    else:
        cutoff_time = cutoff_time.reset_index(drop=True)

        # handle how columns are named in cutoff_time
        if "instance_id" not in cutoff_time.columns:
            if target_entity.index not in cutoff_time.columns:
                raise AttributeError(
                    'Name of the index variable in the target entity'
                    ' or "instance_id" must be present in cutoff_time')
            # rename to instance_id
            cutoff_time.rename(columns={target_entity.index: "instance_id"},
                               inplace=True)

        if "time" not in cutoff_time.columns:
            # take the first column that isn't instance_id and assume it is time
            not_instance_id = [c for c in cutoff_time.columns
                               if c != "instance_id"]
            cutoff_time.rename(columns={not_instance_id[0]: "time"},
                               inplace=True)

    if cutoff_time['time'].dtype == object:
        if (entityset.time_type == NumericTimeIndex and
                cutoff_time['time'].dtype.name.find('int') == -1 and
                cutoff_time['time'].dtype.name.find('float') == -1):
            raise TypeError("cutoff_time times must be numeric: try casting "
                            "via pd.to_numeric(cutoff_time['time'])")
        elif (entityset.time_type == DatetimeTimeIndex and
                cutoff_time['time'].dtype.name.find('time') == -1):
            raise TypeError("cutoff_time times must be datetime type: try "
                            "casting via pd.to_datetime(cutoff_time['time'])")

    assert (cutoff_time[['instance_id', 'time']].duplicated().sum() == 0), \
        "Duplicated rows in cutoff time dataframe."

    pass_columns = [column_name for column_name in cutoff_time.columns[2:]]

    if _check_time_type(cutoff_time['time'].iloc[0]) is None:
        raise ValueError("cutoff_time time values must be datetime or numeric")

    backend = PandasBackend(entityset, features)

    # make sure dtype of instance_id in cutoff time
    # is same as column it references
    target_entity = features[0].entity
    dtype = entityset[target_entity.id].df[target_entity.index].dtype
    cutoff_time["instance_id"] = cutoff_time["instance_id"].astype(dtype)

    # Get dictionary of features to approximate
    if approximate is not None:
        to_approximate, all_approx_feature_set = gather_approximate_features(
            features, backend)
    else:
        to_approximate = defaultdict(list)
        all_approx_feature_set = None

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationPrimitive):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        deps = feature.get_deep_dependencies(all_approx_feature_set)
        for dependency in deps:
            if (isinstance(dependency, AggregationPrimitive) and
                    dependency not in to_approximate[dependency.entity.id]):
                no_unapproximated_aggs = False
                break

    cutoff_df_time_var = 'time'
    target_time = '_original_time'
    num_per_chunk = calc_num_per_chunk(chunk_size, cutoff_time.shape)

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time.copy(), approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var]

        cutoff_time_to_pass = binned_cutoff_time
    else:
        cutoff_time_to_pass = cutoff_time

    if num_per_chunk == "cutoff time":
        iterator = cutoff_time_to_pass.groupby(cutoff_df_time_var)
    else:
        iterator = get_next_chunk(cutoff_time=cutoff_time_to_pass,
                                  time_variable=cutoff_df_time_var,
                                  num_per_chunk=num_per_chunk)

    chunks = []
    if num_per_chunk == "cutoff time":
        for _, group in iterator:
            chunks.append(group)
    else:
        for chunk in iterator:
            chunks.append(chunk)

    if n_jobs != 1 or dask_kwargs is not None:
        feature_matrix = parallel_calculate_chunks(
            chunks=chunks,
            features=features,
            approximate=approximate,
            training_window=training_window,
            verbose=verbose,
            save_progress=save_progress,
            entityset=entityset,
            n_jobs=n_jobs,
            no_unapproximated_aggs=no_unapproximated_aggs,
            cutoff_df_time_var=cutoff_df_time_var,
            target_time=target_time,
            pass_columns=pass_columns,
            dask_kwargs=dask_kwargs or {})
    else:
        feature_matrix = linear_calculate_chunks(
            chunks=chunks,
            features=features,
            approximate=approximate,
            training_window=training_window,
            profile=profile,
            verbose=verbose,
            save_progress=save_progress,
            entityset=entityset,
            no_unapproximated_aggs=no_unapproximated_aggs,
            cutoff_df_time_var=cutoff_df_time_var,
            target_time=target_time,
            pass_columns=pass_columns)

    feature_matrix = pd.concat(feature_matrix)
    feature_matrix.sort_index(level='time', kind='mergesort', inplace=True)
    if not cutoff_time_in_index:
        feature_matrix.reset_index(level='time', drop=True, inplace=True)

    if save_progress and os.path.exists(os.path.join(save_progress, 'temp')):
        shutil.rmtree(os.path.join(save_progress, 'temp'))

    return feature_matrix
def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
                             entities=None, relationships=None,
                             entityset=None, cutoff_time_in_index=False,
                             training_window=None, approximate=None,
                             save_progress=None, verbose=False,
                             chunk_size=None, profile=False):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[PrimitiveBase]): Feature definitions to be calculated.
        cutoff_time (pd.DataFrame or Datetime): Specifies at which time to
            calculate the features for each instance. Can either be a
            DataFrame with 'instance_id' and 'time' columns, a DataFrame with
            the name of the index variable in the target entity and a time
            column, a list of values, or a single value to calculate for all
            instances. If the dataframe has more than two columns, any
            additional columns will be added to the resulting feature matrix.
        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.
        entities (dict[str -> tuple(pd.DataFrame, str, str)]): dictionary of
            entities. Entries take the format
            {entity id: (dataframe, id column, (time_column))}.
        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).
        entityset (EntitySet): An already initialized entityset. Required if
            entities and relationships are not defined.
        cutoff_time_in_index (bool): If True, return a DataFrame with a
            MultiIndex where the second index is the cutoff time (first is
            instance id). DataFrame will be sorted by (time, instance_id).
        training_window (dict[str -> Timedelta] or Timedelta, optional):
            Window or windows defining how much older than the cutoff time
            data can be to be included when calculating the feature. To
            specify which entities to apply windows to, use a dictionary
            mapping entity id -> Timedelta. If None, all older data is used.
        approximate (Timedelta or str): Frequency to group instances with
            similar cutoff times by for features with costly calculations.
            For example, if bucket is 24 hours, all instances with cutoff
            times on the same day will use the same calculation for
            expensive features.
        verbose (bool, optional): Print progress info. The time granularity is
            per chunk.
        profile (bool, optional): Enables profiling if True.
        chunk_size (int or float or None or "cutoff time"): Number of rows of
            the output feature matrix to calculate at a time. If passed an
            integer greater than 0, will try to use that many rows per chunk.
            If passed a float value between 0 and 1, sets the chunk size to
            that percentage of all instances. If passed the string
            "cutoff time", rows are split per cutoff time.
        save_progress (str, optional): path to save intermediate computational
            results.
    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, PrimitiveBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if entities is not None and relationships is not None:
            entityset = EntitySet("entityset", entities, relationships)

    if entityset is not None:
        for f in features:
            f.entityset = entityset

    entityset = features[0].entityset
    target_entity = features[0].entity
    pass_columns = []

    if not isinstance(cutoff_time, pd.DataFrame):
        if cutoff_time is None:
            if entityset.time_type == NumericTimeIndex:
                cutoff_time = np.inf
            else:
                cutoff_time = datetime.now()

        if instance_ids is None:
            index_var = target_entity.index
            instance_ids = target_entity.df[index_var].tolist()

        if not isinstance(cutoff_time, list):
            cutoff_time = [cutoff_time] * len(instance_ids)

        map_args = [(id, time) for id, time in zip(instance_ids, cutoff_time)]
        df_args = pd.DataFrame(map_args, columns=['instance_id', 'time'])
        to_calc = df_args.values
        cutoff_time = pd.DataFrame(to_calc, columns=['instance_id', 'time'])
    else:
        cutoff_time = cutoff_time.copy()

        # handle how columns are named in cutoff_time
        if "instance_id" not in cutoff_time.columns:
            if target_entity.index not in cutoff_time.columns:
                raise AttributeError(
                    'Name of the index variable in the target entity'
                    ' or "instance_id" must be present in cutoff_time')
            # rename to instance_id
            cutoff_time.rename(columns={target_entity.index: "instance_id"},
                               inplace=True)

        if "time" not in cutoff_time.columns:
            # take the first column that isn't instance_id and assume it is time
            not_instance_id = [c for c in cutoff_time.columns
                               if c != "instance_id"]
            cutoff_time.rename(columns={not_instance_id[0]: "time"},
                               inplace=True)

    pass_columns = [column_name for column_name in cutoff_time.columns[2:]]

    if _check_time_type(cutoff_time['time'].iloc[0]) is None:
        raise ValueError("cutoff_time time values must be datetime or numeric")

    backend = PandasBackend(entityset, features)

    # Get dictionary of features to approximate
    if approximate is not None:
        to_approximate, all_approx_feature_set = gather_approximate_features(
            features, backend)
    else:
        to_approximate = defaultdict(list)
        all_approx_feature_set = None

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationPrimitive):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        deps = feature.get_deep_dependencies(all_approx_feature_set)
        for dependency in deps:
            if (isinstance(dependency, AggregationPrimitive) and
                    dependency not in to_approximate[dependency.entity.id]):
                no_unapproximated_aggs = False
                break

    cutoff_df_time_var = 'time'
    target_time = '_original_time'
    num_per_chunk = calc_num_per_chunk(chunk_size, cutoff_time.shape)

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time.copy(), approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var]

        cutoff_time_to_pass = binned_cutoff_time
    else:
        cutoff_time_to_pass = cutoff_time

    if num_per_chunk == "cutoff time":
        iterator = cutoff_time_to_pass.groupby(cutoff_df_time_var)
    else:
        iterator = get_next_chunk(cutoff_time=cutoff_time_to_pass,
                                  time_variable=cutoff_df_time_var,
                                  num_per_chunk=num_per_chunk)

    # if verbose, create progress bar
    if verbose:
        chunks = []
        if num_per_chunk == "cutoff time":
            for _, group in iterator:
                chunks.append(group)
        else:
            for chunk in iterator:
                chunks.append(chunk)

        pbar_string = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                       "Progress: {l_bar}{bar}| "
                       "Calculated: {n}/{total} chunks")
        iterator = make_tqdm_iterator(iterable=chunks,
                                      total=len(chunks),
                                      bar_format=pbar_string)

    feature_matrix = []
    backend = PandasBackend(entityset, features)

    for chunk in iterator:
        # if not using chunks, pull out the group dataframe
        if isinstance(chunk, tuple):
            chunk = chunk[1]
        _feature_matrix = calculate_chunk(features, chunk, approximate,
                                          entityset, training_window,
                                          profile, verbose, save_progress,
                                          backend, no_unapproximated_aggs,
                                          cutoff_df_time_var, target_time,
                                          pass_columns)
        feature_matrix.append(_feature_matrix)
        # Do a manual garbage collection in case objects from calculate_chunk
        # weren't collected automatically
        gc.collect()

    if verbose:
        iterator.close()

    feature_matrix = pd.concat(feature_matrix)
    feature_matrix.sort_index(level='time', kind='mergesort', inplace=True)
    if not cutoff_time_in_index:
        feature_matrix.reset_index(level='time', drop=True, inplace=True)

    if save_progress and os.path.exists(os.path.join(save_progress, 'temp')):
        shutil.rmtree(os.path.join(save_progress, 'temp'))

    return feature_matrix
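# Usage sketch for options specific to this older signature (names reused from
# the earlier sketches; entity id "transactions" and the window length are
# hypothetical): a per-entity training_window dict, as described in the
# docstring above, combined with the "cutoff time" chunking mode. The
# two-argument ft.Timedelta(value, unit) form is assumed for this era of the
# library.

fm = ft.calculate_feature_matrix(features=feature_defs, entityset=es,
                                 cutoff_time=cutoff_time,
                                 training_window={"transactions": ft.Timedelta(30, "d")},
                                 chunk_size="cutoff time",
                                 verbose=True)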