Example #1
    def compute(self, df, datetime_column, groupby_columns=None):
        if groupby_columns is None:
            groupby_columns = []

        generic_check_compute_arguments(datetime_column, groupby_columns)

        # drop all rows where the timestamp is null
        df_copy = df.dropna(subset=[datetime_column]).copy()
        if nothing_to_do(df_copy, min_len=2):
            logger.warning('The time series has less than 2 rows with values, can not apply window.')
            return df_copy

        df_copy.loc[:, datetime_column] = pd.to_datetime(df_copy[datetime_column])
        raw_columns = df_copy.select_dtypes(include=['float', 'int']).columns.tolist()

        if groupby_columns:
            grouped = df_copy.groupby(groupby_columns)
            computed_groups = []
            identifiers_number = len(groupby_columns)
            for group_id, group in grouped:
                logger.info("Computing for group {}".format(group_id))

                try:
                    if self.params.causal_window:
                        computed_df = self._compute_causal_stats(group, datetime_column, raw_columns, df_id=group_id)
                    else:
                        computed_df = self._compute_bilateral_stats(group, datetime_column, raw_columns, df_id=group_id)
                except Exception as e:
                    from future.utils import raise_
                    # issues with left border, cf https://github.com/pandas-dev/pandas/issues/26005
                    if str(e) == 'skiplist_init failed':
                        raise_(Exception, "Window width is too small", sys.exc_info()[2])
                    else:
                        raise_(Exception, "Compute stats failed. Check the full error log for more info: {}".format(str(e)), sys.exc_info()[2])
                if not nothing_to_do(group, min_len=2):
                    group_id = format_group_id(group_id, identifiers_number)
                    computed_df[groupby_columns] = pd.DataFrame([group_id], index=computed_df.index)
                computed_groups.append(computed_df)
            final_df = pd.concat(computed_groups, sort=True)
        else:
            try:
                if self.params.causal_window:
                    final_df = self._compute_causal_stats(df_copy, datetime_column, raw_columns)
                else:
                    final_df = self._compute_bilateral_stats(df_copy, datetime_column, raw_columns)
            except Exception as e:
                from future.utils import raise_
                if str(e) == 'skiplist_init failed':
                    raise_(Exception, "Window width is too small", sys.exc_info()[2])
                else:
                    raise_(Exception, "Compute stats failed. Check the full error log for more info: {}".format(str(e)), sys.exc_info()[2])

        return final_df.reset_index(drop=True)
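
A note on the error handling above: future.utils.raise_ re-raises a new exception while preserving the original traceback on both Python 2 and 3, which is why both branches route through it. A minimal sketch of the pattern (the failing expression and message are illustrative):

import sys
from future.utils import raise_

try:
    1 / 0
except Exception as e:
    # re-raise with a friendlier message but keep the original traceback
    raise_(Exception, "Compute stats failed: {}".format(e), sys.exc_info()[2])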
Example #2
    def _compute_bilateral_stats(self, df, datetime_column, raw_columns, df_id=''):

        if nothing_to_do(df, min_len=2):
            logger.info('The time series {} has less than 2 rows with values, can not apply window.'.format(df_id))
            return df
        if has_duplicates(df, datetime_column):
            logger.error('The time series {} contains duplicate timestamps.'.format(df_id))
            raise ValueError('The time series {} contains duplicate timestamps.'.format(df_id))

        reference_df = df.set_index(datetime_column).sort_index().copy()
        new_df = pd.DataFrame(index=reference_df.index)

        frequency = infer_frequency(reference_df)
        if frequency:
            window_description_in_row = convert_time_freq_to_row_freq(frequency, self.params.window_description)
        else:
            logger.error('The input time series is not equispaced. Cannot compute bilateral window.')  # pandas limitation
            raise ValueError('The input time series is not equispaced. Cannot compute bilateral window.')  # pandas limitation

        # compute all stats except mean and sum; these stats don't need a win_type
        roller_without_win_type = reference_df.rolling(window=window_description_in_row, center=True)
        new_df = self._compute_stats_without_win_type(roller_without_win_type, raw_columns, new_df, reference_df)

        # compute mean and sum, the only operations on which win_type has an effect
        roller_with_win_type = reference_df.rolling(window=window_description_in_row, win_type=self.params.window_type, center=True)
        new_df = self._compute_stats_with_win_type(roller_with_win_type, raw_columns, new_df)

        return new_df.rename_axis(datetime_column).reset_index()
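
A standalone sketch of the centered (bilateral) rolling used above, assuming an equispaced hourly series; win_type (which requires scipy) only affects mean and sum:

import numpy as np
import pandas as pd

idx = pd.date_range('2021-01-01', periods=10, freq='H')
series_df = pd.DataFrame({'value': np.arange(10, dtype=float)}, index=idx)
# stats such as min/max need no win_type
centered_min = series_df.rolling(window=3, center=True).min()
# mean/sum are the only aggregations where win_type matters
weighted_mean = series_df.rolling(window=3, win_type='triang', center=True).mean()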
    def _detect_segment(self,
                        df,
                        datetime_column,
                        filter_column,
                        filter_function,
                        df_id=''):

        if has_duplicates(df, datetime_column):
            raise ValueError(
                'The time series {} contains duplicate timestamps.'.format(
                    df_id))

        if nothing_to_do(df, min_len=0):
            logger.warning(
                'The time series {} is empty, can not compute.'.format(df_id))
            return pd.DataFrame(columns=df.columns)

        df_copy = df.copy()
        df_copy.loc[:, datetime_column] = pd.to_datetime(df_copy[datetime_column])
        df_copy = df_copy.set_index(datetime_column).sort_index()

        segment_indexes = self._detect_time_segment(df_copy, filter_column,
                                                    filter_function)

        mask_dict = {}
        if len(segment_indexes) > 0:
            for segment_index, (start, end) in enumerate(segment_indexes):
                mask = (df_copy.index >= start) & (df_copy.index <= end)
                mask_dict[segment_index] = mask

            df_labeled = df_copy.copy()
            df_labeled['interval_id'] = np.nan

            for k, v in mask_dict.items():
                df_labeled.loc[v, 'interval_id'] = str(int(k))

            segment_df = df_labeled.loc[np.logical_or.reduce(
                list(mask_dict.values()))].sort_index()
        else:
            segment_df = pd.DataFrame(columns=df_copy.columns)

        return segment_df.rename_axis(datetime_column).reset_index()
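
The segment-labeling step above boils down to combining per-segment boolean masks. A self-contained sketch (the segment bounds are made up for illustration):

import numpy as np
import pandas as pd

idx = pd.date_range('2021-01-01', periods=6, freq='D')
df = pd.DataFrame({'value': range(6)}, index=idx)
masks = {
    0: (df.index >= idx[0]) & (df.index <= idx[1]),
    1: (df.index >= idx[4]) & (df.index <= idx[5]),
}
df['interval_id'] = np.nan
for segment_index, mask in masks.items():
    df.loc[mask, 'interval_id'] = str(segment_index)
# keep only the rows covered by at least one segment
segments = df.loc[np.logical_or.reduce(list(masks.values()))].sort_index()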
Example #4
    def transform(self, df, datetime_column, groupby_columns=None):
        if groupby_columns is None:
            groupby_columns = []

        generic_check_compute_arguments(datetime_column, groupby_columns)
        df_copy = df.copy()

        # drop all rows where the timestamp is null
        df_copy = df_copy.dropna(subset=[datetime_column])
        if nothing_to_do(df_copy, min_len=2):
            logger.warning('The time series has less than 2 rows with values, can not resample.')
            return df_copy

        df_copy.loc[:, datetime_column] = pd.to_datetime(df_copy[datetime_column])
        # when there are multiple time series, their time ranges are not necessarily the same
        # we thus compute a unified time index for all partitions
        reference_time_index = self._compute_full_time_index(df_copy, datetime_column)
        columns_to_resample = [col for col in df_copy.select_dtypes([int, float]).columns.tolist() if col != datetime_column and col not in groupby_columns]
        category_columns = [col for col in df.select_dtypes([object, bool]).columns.tolist() if col != datetime_column and col not in columns_to_resample and
                            col not in groupby_columns]
        if groupby_columns:
            grouped = df_copy.groupby(groupby_columns)
            resampled_groups = []
            identifiers_number = len(groupby_columns)
            for group_id, group in grouped:
                logger.info("Computing for group: {}".format(group_id))
                group_resampled = self._resample(group.drop(groupby_columns, axis=1), datetime_column, columns_to_resample, category_columns,
                                                 reference_time_index,
                                                 df_id=group_id)
                group_id = format_group_id(group_id, identifiers_number)
                group_resampled[groupby_columns] = pd.DataFrame([group_id], index=group_resampled.index)
                resampled_groups.append(group_resampled)
            df_resampled = pd.concat(resampled_groups, sort=True)
        else:
            df_resampled = self._resample(df_copy, datetime_column, columns_to_resample, category_columns, reference_time_index)

        df_resampled = df_resampled[df.columns].reset_index(drop=True)

        return df_resampled
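
The helper _compute_full_time_index is not shown in this excerpt. A plausible minimal version, assuming a fixed target frequency rather than one inferred from the data (the name and signature are hypothetical):

import pandas as pd

def compute_full_time_index(df, datetime_column, freq='D'):
    # hypothetical helper: one unified index spanning all time series
    return pd.date_range(df[datetime_column].min(),
                         df[datetime_column].max(),
                         freq=freq)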
    def compute(self,
                df,
                datetime_column,
                threshold_dict,
                groupby_columns=None):

        generic_check_compute_arguments(datetime_column, groupby_columns)
        df_copy = df.copy()

        # drop all rows where the timestamp is null
        df_copy = df_copy.dropna(subset=[datetime_column])
        if nothing_to_do(df_copy, min_len=0):
            logger.warning('The time series is empty, can not compute.')
            return pd.DataFrame(columns=df_copy.columns)

        lower_threshold, upper_threshold, filter_column = None, None, None
        for column, threshold_tuple in threshold_dict.items():
            filter_column = column
            lower_threshold, upper_threshold = threshold_tuple

        filter_function = self._between_min_max_mask(lower_threshold,
                                                     upper_threshold)

        if groupby_columns:
            grouped = df.groupby(groupby_columns)
            filtered_groups = []
            for group_id, group in grouped:
                logger.info("Computing for group {}".format(group_id))
                filtered_df = self._detect_segment(group, datetime_column,
                                                   filter_column,
                                                   filter_function)
                filtered_groups.append(filtered_df)
            return pd.concat(filtered_groups, sort=True).reset_index(drop=True)
        else:
            return self._detect_segment(df, datetime_column, filter_column,
                                        filter_function)
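
_between_min_max_mask is also not shown here; a plausible shape, given how filter_function is used, is a closure returning a boolean row mask (this reconstruction is an assumption, not the source):

def between_min_max_mask(lower_threshold, upper_threshold):
    # hypothetical reconstruction: returns a function usable as filter_function
    def mask(series):
        return (series >= lower_threshold) & (series <= upper_threshold)
    return mask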
Example #6
    def _compute_causal_stats(self, df, datetime_column, raw_columns, df_id=''):

        if nothing_to_do(df, min_len=2):
            logger.info('The time series {} has less than 2 rows with values, can not apply window.'.format(df_id))
            return df
        if has_duplicates(df, datetime_column):
            logger.error('The time series {} contains duplicate timestamps.'.format(df_id))
            raise ValueError('The time series {} contains duplicate timestamps.'.format(df_id))

        reference_df = df.set_index(datetime_column).sort_index().copy()
        new_df = pd.DataFrame(index=reference_df.index)

        # compute all stats except mean and sum; the syntax is the same whether or not we have a window type
        roller_without_window_type = reference_df.rolling(window=self.params.window_description, closed=self.params.closed_option)
        new_df = self._compute_stats_without_win_type(roller_without_window_type, raw_columns, new_df, reference_df)

        # compute mean and sum, the only operations that might need a win_type
        # when using win_type, window must be defined in terms of rows and not time unit (pandas limitation)
        compute_sum_and_mean = len(set(self.params.aggregation_types).intersection(set(['average', 'sum']))) > 0
        if compute_sum_and_mean and self.params.window_type:
            # row-based rolling is always bounded on both sides of the window, so we shift 1 row down when closed is 'left'
            if self.params.closed_option == 'left':
                shifted_df = reference_df.shift(1)
            else:
                shifted_df = reference_df

            frequency = infer_frequency(reference_df)
            if frequency:
                window_description_in_row = convert_time_freq_to_row_freq(frequency, self.params.window_description)
            else:
                raise ValueError('The input time series is not equispaced. Cannot apply window with time unit.')  # pandas limitation

            roller_with_window = shifted_df.rolling(window=window_description_in_row, win_type=self.params.window_type, closed=self.params.closed_option)
            new_df = self._compute_stats_with_win_type(roller_with_window, raw_columns, new_df)

        return new_df.rename_axis(datetime_column).reset_index()
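
A standalone sketch of the causal-window logic above: a time-based window can be left-closed directly, while a row-based window (required when a win_type is set) is emulated by shifting one row down. The data and window sizes are illustrative:

import numpy as np
import pandas as pd

idx = pd.date_range('2021-01-01', periods=6, freq='D')
df = pd.DataFrame({'value': np.arange(6, dtype=float)}, index=idx)
# time-based causal window; closed='left' excludes the current row
causal_sum = df.rolling(window='3D', closed='left').sum()
# row-based windows are closed on both sides, so shift first to mimic closed='left'
weighted_mean = df.shift(1).rolling(window=3, win_type='triang').mean()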
    def compute(self,
                df,
                datetime_column,
                extrema_column,
                groupby_columns=None):
        if groupby_columns is None:
            groupby_columns = []
        generic_check_compute_arguments(datetime_column, groupby_columns)
        df_copy = df.copy()

        # drop all rows where the timestamp is null
        df_copy = df_copy.dropna(subset=[datetime_column])
        if nothing_to_do(df_copy, min_len=2):
            logger.warning(
                'The time series has less than 2 rows with values, can not find extrema.'
            )
            return df_copy

        numerical_columns = df_copy.select_dtypes(
            include=['float', 'int']).columns.tolist()
        if extrema_column not in numerical_columns:
            raise ValueError(
                "The chosen extrema column, {}, is not of type float or int.".
                format(extrema_column))

        df_copy.loc[:, datetime_column] = pd.to_datetime(df_copy[datetime_column])
        extrema_df_list = []
        identifiers_number = len(groupby_columns)
        if groupby_columns:
            grouped = df_copy.groupby(groupby_columns)
            for group_id, group in grouped:
                logger.info("Computing for group: {}".format(group_id))
                extrema_neighbor_df_list, extrema_value = self._find_extrema_neighbor_zone(
                    group, datetime_column, extrema_column, df_id=group_id)
                group_id = format_group_id(group_id, identifiers_number)
                if len(extrema_neighbor_df_list) == 0:
                    extrema_df = pd.DataFrame([group_id],
                                              columns=groupby_columns)
                    extrema_df_list.append(extrema_df)
                else:
                    for extrema_neighbor_df in extrema_neighbor_df_list:
                        rolling_df = self.params.window_aggregator.compute(
                            extrema_neighbor_df, datetime_column)
                        extrema_df = rolling_df.loc[
                            rolling_df[extrema_column] ==
                            extrema_value].copy()  # avoid .loc warning
                        extrema_df[groupby_columns] = pd.DataFrame(
                            [group_id], index=extrema_df.index)
                        extrema_df_list.append(extrema_df)

            final_df = pd.concat(extrema_df_list, sort=True)
            final_df = final_df.reset_index(drop=True)
        else:
            extrema_neighbor_df_list, extrema_value = self._find_extrema_neighbor_zone(
                df_copy, datetime_column, extrema_column)
            for extrema_neighbor_df in extrema_neighbor_df_list:
                rolling_df = self.params.window_aggregator.compute(
                    extrema_neighbor_df, datetime_column)
                extrema_df = rolling_df.loc[rolling_df[extrema_column] ==
                                            extrema_value].reset_index(
                                                drop=True)
                extrema_df_list.append(extrema_df)

            if len(extrema_df_list) > 0:
                final_df = pd.concat(extrema_df_list)
                final_df = final_df.reset_index(drop=True)
            else:
                final_df = pd.DataFrame(None)

        return final_df
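
_find_extrema_neighbor_zone is defined elsewhere; its core idea, sketched here with a made-up series and a fixed neighborhood radius, is to locate the global extremum and keep the rows around it:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'date': pd.date_range('2021-01-01', periods=10, freq='D'),
    'value': np.random.default_rng(0).normal(size=10),
})
extrema_value = df['value'].max()
extrema_time = df.loc[df['value'].idxmax(), 'date']
# keep rows within +/- 2 days of the extremum (the radius is illustrative)
neighborhood = df[(df['date'] >= extrema_time - pd.Timedelta('2D')) &
                  (df['date'] <= extrema_time + pd.Timedelta('2D'))]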
Example #8
    def _resample(self, df, datetime_column, columns_to_resample, category_columns, reference_time_index, df_id=''):
        """
        1. Move datetime column to the index.
        2. Merge the original datetime index with the full_time_index.
        3. Create a numerical index of the df and save the corresponding index.
        """

        if has_duplicates(df, datetime_column):
            raise ValueError('The time series {} contains duplicate timestamps.'.format(df_id))

        if nothing_to_do(df, min_len=2):
            logger.warning('The time series {} has less than 2 rows with values, can not resample.'.format(df_id))
            return df

        # `scipy.interpolate.interp1d` does not like empty columns, so we need to filter these out first
        filtered_columns_to_resample = filter_empty_columns(df, columns_to_resample)
        if len(filtered_columns_to_resample) == 0:
            logger.warning('All numerical columns are empty for the time series {}.'.format(df_id))
            return pd.DataFrame({datetime_column: reference_time_index}, columns=[datetime_column] + columns_to_resample)

        df_resample = df.set_index(datetime_column).sort_index().copy()
        # merge the reference time index with the original index, which has the data
        # cf: https://stackoverflow.com/questions/47148446/pandas-resample-interpolate-is-producing-nans
        df_resample = df_resample.reindex(df_resample.index.union(reference_time_index))

        # `scipy.interpolate.interp1d` only works with numerical index, so we create one
        df_resample['numerical_index'] = range(len(df_resample))
        reference_index = df_resample.loc[reference_time_index, 'numerical_index']
        category_imputation_index = pd.Index([])

        df_resample = df_resample.rename_axis(datetime_column).reset_index()
        for filtered_column in filtered_columns_to_resample:

            df_without_nan = df.dropna(subset=[filtered_column], how='all')
            interpolation_index_mask = (df_resample[datetime_column] >= df_without_nan[datetime_column].min()) & (
                    df_resample[datetime_column] <= df_without_nan[datetime_column].max())
            interpolation_index = df_resample.index[interpolation_index_mask]

            extrapolation_index_mask = (df_resample[datetime_column] < df_without_nan[datetime_column].min()) | (
                    df_resample[datetime_column] > df_without_nan[datetime_column].max())
            extrapolation_index = df_resample.index[extrapolation_index_mask]

            index_with_data = df_resample.loc[interpolation_index, filtered_column].dropna(how='all').index

            if self.params.interpolation_method not in ['constant', 'none']:
                interpolation_function = interpolate.interp1d(index_with_data,
                                                              df_resample.loc[index_with_data, filtered_column],
                                                              kind=self.params.interpolation_method,
                                                              axis=0,
                                                              fill_value='extrapolate')

                df_resample.loc[interpolation_index, filtered_column] = interpolation_function(df_resample.loc[interpolation_index].index)
                if self.params.extrapolation_method == "interpolation":
                    df_resample.loc[extrapolation_index, filtered_column] = interpolation_function(df_resample.loc[extrapolation_index].index)
            elif self.params.interpolation_method == 'constant':
                if self.params.extrapolation_method == 'interpolation':
                    df_resample.loc[:, filtered_column] = df_resample.loc[:, filtered_column].fillna(self.params.constant_value)
                else:
                    df_resample.loc[interpolation_index, filtered_column] = df_resample.loc[interpolation_index, filtered_column].fillna(
                        self.params.constant_value)

            if self.params.extrapolation_method == "clip":
                temp_df = df_resample.copy().ffill().bfill()
                df_resample.loc[extrapolation_index, filtered_column] = temp_df.loc[extrapolation_index, filtered_column]
            elif self.params.extrapolation_method == "no_extrapolation":
                reference_index = reference_index[~reference_index.isin(extrapolation_index.values)]
            category_imputation_index = category_imputation_index.union(extrapolation_index).union(interpolation_index)

        if len(category_columns) > 0 and len(category_imputation_index) > 0 and self.params.category_imputation_method != "empty":
            df_processed = df_resample.loc[category_imputation_index]
            df_resample.loc[category_imputation_index] = self._fill_in_category_values(df_processed, category_columns)
        df_resampled = df_resample.loc[reference_index].drop('numerical_index', axis=1)
        return df_resampled
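
The interpolation core above relies on scipy.interpolate.interp1d over a numerical index; a minimal sketch with made-up positions and values:

import numpy as np
from scipy import interpolate

x = np.array([0, 2, 5])            # numerical positions of rows with data
y = np.array([1.0, 3.0, 9.0])      # observed values at those positions
f = interpolate.interp1d(x, y, kind='linear', axis=0, fill_value='extrapolate')
values_on_full_index = f(np.arange(7))   # interpolate + extrapolate over 0..6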