Пример #1
0
    def test_warning_on_non_uniform_time_steps(self):
        """Rolling unevenly spaced time stamps must record exactly one warning."""
        expected_message = (
            "Your time stamps are not uniformly sampled, which makes rolling "
            "nonsensical in some domains.")

        with warnings.catch_warnings(record=True) as caught:
            # The first series jumps from time 2 to time 4, so its sampling
            # is not uniform; the second series is evenly spaced.
            uneven = pd.DataFrame({
                "a": [1, 2, 3, 4],
                "b": [5, 6, 7, 8],
                "time": [1, 2, 4, 5],
            }).assign(id=1)
            even = pd.DataFrame({
                "a": [10, 11],
                "b": [12, 13],
                "time": range(20, 22),
            }).assign(id=2)

            df_full = pd.concat([uneven, even], ignore_index=True)

            dataframe_functions.roll_time_series(
                df_full,
                column_id="id",
                column_sort="time",
                column_kind=None,
                rolling_direction=1,
            )

            self.assertEqual(len(caught), 1)
            self.assertEqual(str(caught[0].message), expected_message)
Пример #2
0
    def test_rolling_with_larger_shift(self):
        """Roll with a step of two, forwards and backwards, and check the windows."""
        df_full = pd.concat(
            [
                pd.DataFrame({
                    "a": [1, 2, 3, 4],
                    "b": [5, 6, 7, 8],
                    "time": range(4),
                }).assign(id=1),
                pd.DataFrame({
                    "a": [10, 11],
                    "b": [12, 13],
                    "time": range(20, 22),
                }).assign(id=2),
            ],
            ignore_index=True,
        )
        # df_full is
        #     a   b  time  id
        # 0   1   5     0   1
        # 1   2   6     1   1
        # 2   3   7     2   1
        # 3   4   8     3   1
        # 4  10  12    20   2
        # 5  11  13    21   2

        # (rolling_direction, expected ids, expected "a", expected "b")
        cases = [
            (
                2,
                [(1, 1), (1, 1), (1, 3), (1, 3), (1, 3), (1, 3),
                 (2, 21), (2, 21)],
                [1.0, 2.0, 1.0, 2.0, 3.0, 4.0, 10.0, 11.0],
                [5.0, 6.0, 5.0, 6.0, 7.0, 8.0, 12.0, 13.0],
            ),
            (
                -2,
                [(1, 0), (1, 0), (1, 0), (1, 0), (1, 2), (1, 2),
                 (2, 20), (2, 20)],
                [1.0, 2.0, 3.0, 4.0, 3.0, 4.0, 10.0, 11.0],
                [5.0, 6.0, 7.0, 8.0, 7.0, 8.0, 12.0, 13.0],
            ),
        ]

        for direction, expected_ids, expected_a, expected_b in cases:
            rolled = dataframe_functions.roll_time_series(
                df_full,
                column_id="id",
                column_sort="time",
                column_kind=None,
                rolling_direction=direction,
                n_jobs=0,
            )

            self.assertListEqual(list(rolled["id"]), expected_ids)
            self.assertListEqual(list(rolled["a"].values), expected_a)
            self.assertListEqual(list(rolled["b"].values), expected_b)
Пример #3
0
    def test_negative_rolling(self):
        """Backward rolling, with and without a cap on the number of time shifts."""
        df_full = pd.DataFrame({
            "a": [1, 2, 3, 4, 10, 11],
            "b": [5, 6, 7, 8, 12, 13],
            "time": [0, 1, 2, 3, 20, 21],
            "id": [1, 1, 1, 1, 2, 2],
        })

        # Each expected window label together with the number of rows it holds.
        window_lengths = [
            ("id=1, shift=-3", 1),
            ("id=1, shift=-2", 2),
            ("id=1, shift=-1", 3),
            ("id=2, shift=-1", 1),
            ("id=1, shift=0", 4),
            ("id=2, shift=0", 2),
        ]
        correct_indices = [label
                           for label, length in window_lengths
                           for _ in range(length)]
        correct_values_a = [4, 3, 4, 2, 3, 4, 11, 1, 2, 3, 4, 10, 11]
        correct_values_b = [8, 7, 8, 6, 7, 8, 13, 5, 6, 7, 8, 12, 13]

        # (extra keyword arguments, number of leading expected rows that are
        # dropped because the corresponding shifts are not produced)
        cases = [
            ({}, 0),
            ({"maximum_number_of_timeshifts": None}, 0),
            ({"maximum_number_of_timeshifts": 1}, 3),
            ({"maximum_number_of_timeshifts": 2}, 1),
            ({"maximum_number_of_timeshifts": 4}, 0),
        ]

        for extra_kwargs, skipped in cases:
            df = dataframe_functions.roll_time_series(
                df_full, column_id="id", column_sort="time",
                column_kind=None, rolling_direction=-1, **extra_kwargs)

            self.assertListEqual(list(df["id"].values), correct_indices[skipped:])
            self.assertListEqual(list(df["a"].values), correct_values_a[skipped:])
            self.assertListEqual(list(df["b"].values), correct_values_b[skipped:])
Пример #4
0
    def test_positive_rolling(self):
        """Forward rolling with the default, a loose and a tight max_timeshift."""
        df_full = pd.DataFrame({
            "a": [1, 2, 3, 4, 10, 11],
            "b": [5, 6, 7, 8, 12, 13],
            "time": [0, 1, 2, 3, 20, 21],
            "id": [1, 1, 1, 1, 2, 2],
        })
        # df_full is
        #     a   b  time  id
        # 0   1   5     0   1
        # 1   2   6     1   1
        # 2   3   7     2   1
        # 3   4   8     3   1
        # 4  10  12    20   2
        # 5  11  13    21   2

        unrestricted = (
            [0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 20, 21, 21],
            [1.0, 1.0, 2.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 10.0, 10.0, 11.0],
            [5.0, 5.0, 6.0, 5.0, 6.0, 7.0, 5.0, 6.0, 7.0, 8.0, 12.0, 12.0, 13.0],
        )
        # With max_timeshift=2 the last window of id 1 loses its oldest row.
        restricted = (
            [0, 1, 1, 2, 2, 2, 3, 3, 3, 20, 21, 21],
            [1.0, 1.0, 2.0, 1.0, 2.0, 3.0, 2.0, 3.0, 4.0, 10.0, 10.0, 11.0],
            [5.0, 5.0, 6.0, 5.0, 6.0, 7.0, 6.0, 7.0, 8.0, 12.0, 12.0, 13.0],
        )

        cases = [
            ({}, unrestricted),
            ({"max_timeshift": 4}, unrestricted),
            ({"max_timeshift": 2}, restricted),
        ]

        for extra_kwargs, (expected_ids, expected_a, expected_b) in cases:
            df = dataframe_functions.roll_time_series(
                df_full, column_id="id", column_sort="time",
                column_kind=None, rolling_direction=1, **extra_kwargs)

            self.assertListEqual(list(df["id"]), expected_ids)
            self.assertListEqual(list(df["a"].values), expected_a)
            self.assertListEqual(list(df["b"].values), expected_b)
def create_features_by_tsfresh(path, dataset, years, features):
    """Roll ``dataset``, extract tsfresh features and keep those relevant to AQI.

    :param path: base path; used for the CSV dump and passed to
        ``get_raw_AQI_data``.
    :param dataset: frame holding the ``features`` columns plus ``time`` and ``id``.
    :param years: forwarded unchanged to ``get_raw_AQI_data``.
    :param features: list of column names to roll and extract features from.
    :return: the selected feature frame, re-indexed from 0.
    """
    wanted_columns = features + ['time', 'id']
    data = dataset.copy()[wanted_columns]

    data_rolled = roll_time_series(
        data,
        column_id="id",
        column_sort="time",
        max_timeshift=7 * 24,
        n_jobs=8,
    )
    extracted = extract_features(
        data_rolled,
        column_id="id",
        column_sort="time",
        n_jobs=8,
    )
    # The return value is ignored, so the imputation is relied on to happen in place.
    impute(extracted)
    print(extracted.shape)
    # NOTE(review): the target ends with "/" and therefore names no file;
    # confirm the intended output file name.
    extracted.to_csv(path + '/modified_data_after_feature_extraction/')

    AQI = get_raw_AQI_data(path, years)
    # Re-label the raw AQI values with the feature index so both line up
    # positionally for the selection step.
    AQI_data = pd.Series(data=AQI['AQI'].values,
                         index=extracted.index,
                         name='AQI')
    print(AQI_data.shape)

    selected_features = select_features(extracted, AQI_data)
    print(selected_features.shape)
    selected_features.index = range(selected_features.shape[0])
    return selected_features
Пример #6
0
    def test_dict_rolling(self):
        """Backward rolling of a dict of frames yields string-labelled windows."""
        ids = [1, 1, 1, 1, 2, 2]
        df_dict = {
            "a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": ids}),
            "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": ids}),
        }

        df = dataframe_functions.roll_time_series(
            df_dict,
            column_id="id",
            column_sort=None,
            column_kind=None,
            rolling_direction=-1,
        )

        # Window label and the number of rows expected in that window.
        window_lengths = [
            ("id=1, shift=-3", 1), ("id=1, shift=-2", 2),
            ("id=1, shift=-1", 3), ("id=2, shift=-1", 1),
            ("id=1, shift=0", 4), ("id=2, shift=0", 2),
        ]
        correct_indices = [label
                           for label, length in window_lengths
                           for _ in range(length)]
        expected_values = {
            "a": [4, 3, 4, 2, 3, 4, 11, 1, 2, 3, 4, 10, 11],
            "b": [8, 7, 8, 6, 7, 8, 13, 5, 6, 7, 8, 12, 13],
        }

        for kind in ("a", "b"):
            self.assertListEqual(list(df[kind]["id"].values), correct_indices)

        for kind in ("a", "b"):
            self.assertListEqual(list(df[kind]["_value"].values),
                                 expected_values[kind])
Пример #7
0
  def tsfresh_run(forecast,season,insample=True,forecast_out=None):
      """Build rolling-window tsfresh features for a single time series.

      :param forecast: frame to roll. With ``insample=True`` it must provide
          ``Target`` and ``Date`` after ``reset_index()``; otherwise the
          ``index`` column produced by ``reset_index()`` is dropped.
      :param season: window length; only windows holding at least ``season``
          rows are kept. Also passed as ``n_jobs`` to ``extract_features``.
      :param insample: when True, the target column is merged back in and
          features flagged by ``constant_feature_detect`` are dropped.
      :param forecast_out: frame whose index labels the result when
          ``insample`` is False. Its index name is set to ``"Date"`` in place.
      :return: the imputed feature frame.
      :raises ValueError: if ``insample`` is False and ``forecast_out`` is None.
      """
      # Fail before the expensive rolling/extraction instead of hitting an
      # AttributeError on None at the very end.
      if not insample and forecast_out is None:
          raise ValueError("forecast_out is required when insample is False")

      df_roll_prep = forecast.reset_index()
      if insample:
          df_roll_prep = df_roll_prep.drop(["Target", "Date"], axis=1)
          target = forecast["Target"]
      else:
          df_roll_prep = df_roll_prep.drop(["index"], axis=1)
      # A single constant id: the whole frame is treated as one series.
      df_roll_prep["id"] = 1

      df_roll = roll_time_series(df_roll_prep, column_id="id", column_sort=None,
                                 column_kind=None, rolling_direction=1,
                                 max_timeshift=season - 1)
      # Discard the shorter windows at the start of the series.
      counts = df_roll['id'].value_counts()
      df_roll_cut = df_roll[df_roll['id'].isin(counts[counts >= season].index)]

      ### TS feature extraction
      # column_sort="sort": relies on the rolled frame carrying a "sort" column
      # (no sort column was supplied to roll_time_series above).
      concat_df = extract_features(df_roll_cut.ffill(), column_id="id",
                                   column_sort="sort", n_jobs=season,
                                   show_warnings=False, disable_progressbar=True)

      if insample:
          concat_df = concat_df.dropna(axis=1, how="all")
          # Target values of the kept windows, built once and reused for both
          # the re-indexing and the merge.
          kept_ids = df_roll_cut['id'].value_counts().index
          target_frame = target[kept_ids].sort_index().to_frame()
          concat_df.index = target_frame.index
          concat_df = pd.merge(target_frame, concat_df, left_index=True,
                               right_index=True, how="left")
          concat_df_list = constant_feature_detect(data=concat_df, threshold=0.95)
          concat_df = concat_df.drop(concat_df_list, axis=1)
      else:
          forecast_out.index.name = "Date"
          concat_df.index = forecast_out.index

      concat_df = impute(concat_df)

      return concat_df
    def test_positive_rolling(self):
        """Forward rolling labels every window with its id and shift."""
        df_full = pd.DataFrame({
            "a": [1, 2, 3, 4, 10, 11],
            "b": [5, 6, 7, 8, 12, 13],
            "time": [0, 1, 2, 3, 20, 21],
            "id": [1, 1, 1, 1, 2, 2],
        })

        df = dataframe_functions.roll_time_series(
            df_full, column_id="id", column_sort="time",
            column_kind=None, rolling_direction=1)

        # Window label and the number of rows expected in that window.
        window_lengths = [
            ("id=1, shift=3", 1),
            ("id=1, shift=2", 2),
            ("id=1, shift=1", 3),
            ("id=2, shift=1", 1),
            ("id=1, shift=0", 4),
            ("id=2, shift=0", 2),
        ]
        correct_indices = [label
                           for label, length in window_lengths
                           for _ in range(length)]

        self.assertListEqual(list(df["id"]), correct_indices)

        expected_values = (
            ("a", [1, 1, 2, 1, 2, 3, 10, 1, 2, 3, 4, 10, 11]),
            ("b", [5, 5, 6, 5, 6, 7, 12, 5, 6, 7, 8, 12, 13]),
        )
        for column, expected in expected_values:
            self.assertListEqual(list(df[column].values), expected)
    def test_stacked_rolling(self):
        """Backward rolling of a stacked (long format) frame with a kind column."""
        df_full = pd.DataFrame({
            "a": [1, 2, 3, 4, 10, 11],
            "b": [5, 6, 7, 8, 12, 13],
            "time": [0, 1, 2, 3, 20, 21],
            "id": [1, 1, 1, 1, 2, 2],
        })

        # Stack the two value columns on top of each other under "_value".
        stacked_parts = [
            df_full[["time", "id", kind]].rename(columns={kind: "_value"})
            for kind in ("a", "b")
        ]
        df_stacked = pd.concat(stacked_parts, ignore_index=True)
        df_stacked["kind"] = ["a"] * 6 + ["b"] * 6

        df = dataframe_functions.roll_time_series(
            df_stacked, column_id="id", column_sort="time",
            column_kind="kind", rolling_direction=-1)

        # Window label and its expected row count (both kinds together).
        window_lengths = [
            ("id=1, shift=-3", 2),
            ("id=1, shift=-2", 4),
            ("id=1, shift=-1", 6),
            ("id=2, shift=-1", 2),
            ("id=1, shift=0", 8),
            ("id=2, shift=0", 4),
        ]
        correct_indices = [label
                           for label, length in window_lengths
                           for _ in range(length)]

        self.assertListEqual(list(df["id"].values), correct_indices)

        self.assertListEqual(list(df["kind"].values), ["a", "b"] * 13)
        expected_values = [4, 8, 3, 7, 4, 8, 2, 6, 3, 7, 4, 8, 11, 13,
                           1, 5, 2, 6, 3, 7, 4, 8, 10, 12, 11, 13]
        self.assertListEqual(list(df["_value"].values), expected_values)
Пример #10
0
    def test_order_rolling(self):
        """Fixed-size windows are labelled (id, last time stamp), ordered per id."""
        first_class = pd.DataFrame({
            "x": [1, 2, 3, 4],
            "time": [1, 15, 132, 145],
        }).assign(initial_id=1)
        second_class = pd.DataFrame({
            "x": [5, 6, 7],
            "time": [16, 133, 146],
        }).assign(initial_id=2)
        df_full = pd.concat([first_class, second_class], ignore_index=True)

        # min_timeshift == max_timeshift, so only windows of exactly
        # window_size rows are requested.
        window_size = 2
        timeshift = window_size - 1
        df_rolled = dataframe_functions.roll_time_series(
            df_full,
            column_id="initial_id",
            column_sort="time",
            min_timeshift=timeshift,
            max_timeshift=timeshift,
        )

        # Every expected window contributes window_size (= 2) rows.
        expected_windows = [(1, 15), (1, 132), (1, 145), (2, 133), (2, 146)]
        correct_indices = [window
                           for window in expected_windows
                           for _ in range(2)]

        self.assertListEqual(list(df_rolled["id"]), correct_indices)
Пример #11
0
    def test_warning_on_non_uniform_time_steps(self):
        """A gap in the time stamps of one id triggers a single warning."""
        with warnings.catch_warnings(record=True) as recorded:
            # id 1 skips time step 3; id 2 is evenly spaced.
            df_full = pd.DataFrame({
                "a": [1, 2, 3, 4, 10, 11],
                "b": [5, 6, 7, 8, 12, 13],
                "time": [1, 2, 4, 5, 20, 21],
                "id": [1, 1, 1, 1, 2, 2],
            })

            dataframe_functions.roll_time_series(
                df_full, column_id="id", column_sort="time",
                column_kind=None, rolling_direction=1)

            self.assertEqual(len(recorded), 1)
            only_warning = recorded[0]
            self.assertEqual(str(only_warning.message),
                             "Your time stamps are not uniformly sampled, which makes rolling "
                             "nonsensical in some domains.")
Пример #12
0
    def gen_rolling_feature(self,
                            window_size,
                            settings="comprehensive",
                            full_settings=None,
                            n_jobs=1):
        '''
        Generate rolling-window aggregation features with tsfresh.

        The frame in self.df is rolled into windows of exactly window_size
        rows (min and max time shift are both window_size - 1), features are
        extracted per window, imputed in place and stored in
        self.roll_feature_df. The new column names are appended to
        self.feature_col.

        TODO: relationship with scale should be figured out.

        :param window_size: int, the length of each rolling window.
        :param settings: str or dict. A str must be one of "comprehensive",
               "minimal" and "efficient" and is looked up in DEFAULT_PARAMS.
               Anything else is handed to tsfresh as default_fc_parameters.
               The value is defaulted to "comprehensive".
        :param full_settings: dict. When truthy it is handed to tsfresh as
               kind_to_fc_parameters and settings is not used for extraction.
               The value is defaulted to None.
        :param n_jobs: int. The number of processes to use for parallelization.

        :return: the tsdataset instance.
        '''
        assert not self._has_generate_agg_feature,\
            "Only one of gen_global_feature and gen_rolling_feature should be called."

        default_fc_parameters = settings
        if isinstance(settings, str):
            assert settings in ["comprehensive", "minimal", "efficient"], \
                f"settings str should be one of \"comprehensive\", \"minimal\", \"efficient\"\
                    , but found {settings}."
            default_fc_parameters = DEFAULT_PARAMS[settings]

        timeshift = window_size - 1
        df_rolled = roll_time_series(self.df,
                                     column_id=self.id_col,
                                     column_sort=self.dt_col,
                                     max_timeshift=timeshift,
                                     min_timeshift=timeshift,
                                     n_jobs=n_jobs)

        # Exactly one of the two tsfresh parameter styles is forwarded.
        if full_settings:
            fc_kwargs = {"kind_to_fc_parameters": full_settings}
        else:
            fc_kwargs = {"default_fc_parameters": default_fc_parameters}

        self.roll_feature_df = extract_features(df_rolled,
                                                column_id=self.id_col,
                                                column_sort=self.dt_col,
                                                n_jobs=n_jobs,
                                                **fc_kwargs)
        impute_tsfresh(self.roll_feature_df)

        generated_columns = list(self.roll_feature_df.columns)
        self.feature_col += generated_columns
        self.roll_additional_feature = generated_columns
        self._has_generate_agg_feature = True
        return self
Пример #13
0
    def test_stacked_rolling(self):
        """Backward rolling of a stacked frame yields (id, start time) windows."""
        df_full = pd.concat(
            [
                pd.DataFrame({
                    "a": [1, 2, 3, 4],
                    "b": [5, 6, 7, 8],
                    "time": range(4),
                }).assign(id=1),
                pd.DataFrame({
                    "a": [10, 11],
                    "b": [12, 13],
                    "time": range(20, 22),
                }).assign(id=2),
            ],
            ignore_index=True,
        )

        # Stack the two value columns on top of each other under "_value".
        stacked_parts = [
            df_full[["time", "id", kind]].rename(columns={kind: "_value"})
            for kind in ("a", "b")
        ]
        df_stacked = pd.concat(stacked_parts, ignore_index=True)
        df_stacked["kind"] = ["a"] * 6 + ["b"] * 6
        # df_stacked is
        #     time  id  _value kind
        # 0      0   1       1    a
        # 1      1   1       2    a
        # 2      2   1       3    a
        # 3      3   1       4    a
        # 4     20   2      10    a
        # 5     21   2      11    a
        # 6      0   1       5    b
        # 7      1   1       6    b
        # 8      2   1       7    b
        # 9      3   1       8    b
        # 10    20   2      12    b
        # 11    21   2      13    b

        df = dataframe_functions.roll_time_series(
            df_stacked,
            column_id="id",
            column_sort="time",
            column_kind="kind",
            rolling_direction=-1,
            n_jobs=0,
        )

        # (window id, time steps in the window); every time step contributes
        # one row per kind, hence the factor of two.
        window_steps = [((1, 0), 4), ((1, 1), 3), ((1, 2), 2), ((1, 3), 1),
                        ((2, 20), 2), ((2, 21), 1)]
        correct_indices = [window
                           for window, steps in window_steps
                           for _ in range(2 * steps)]
        self.assertListEqual(list(df["id"].values), correct_indices)

        self.assertListEqual(list(df["kind"].values), ["a", "b"] * 13)
        expected_values = [
            1., 5., 2., 6., 3., 7., 4., 8.,
            2., 6., 3., 7., 4., 8.,
            3., 7., 4., 8.,
            4., 8.,
            10., 12., 11., 13.,
            11., 13.,
        ]
        self.assertListEqual(list(df["_value"].values), expected_values)
def create_new_features(df_x, s_y, x_train_cols=None):
    """
    Create new Features from Input-Dataframe by using TSFRESH

    Note: an "id" column is added to ``df_x`` in place, and it is carried
    over into the feature matrix built here.

    :param df_x: Dataframe containing Time-Series (needs a "Date" column)
    :param s_y: Series of Target-Var, used for the feature selection
    :param x_train_cols: columns already selected for the train set. When
        omitted, None or empty, a fresh feature selection against ``s_y`` is
        performed; otherwise exactly these columns are returned.
    :return: Dataframe containing created Features
    """
    # A None default replaces the former mutable ``[]`` default, which is a
    # single list object shared between all calls.
    if x_train_cols is None:
        x_train_cols = []

    # add id column (same id for every row, because only one time series is
    # considered in this dataset)
    df_x["id"] = 1

    # create roll time series for generating time series features
    df_x_rolled = roll_time_series(
        df_x,
        column_id="id",
        column_sort="Date",
        column_kind=None,
        rolling_direction=1,
        max_timeshift=TSFRESH_TIME_WINDOWS - 1,
    )

    x = df_x.set_index("Date")

    # for each variable in input df new features are generated
    for current_feature in FEATURES:
        # noinspection PyTypeChecker
        generated_features = extract_features(
            df_x_rolled,
            column_id="id",
            n_jobs=3,
            column_kind=None,
            column_value=current_feature,
            impute_function=impute,
            default_fc_parameters=settings,
        )

        x = pd.concat([x, generated_features], axis=1)
        print(f"\nNew shape of Feature-Matrix: {x.shape}")

    print(f"\nAmount of Features before selection: {len(x.columns)}")

    # check if features of train set are already selected
    # (len() rather than truthiness so that array-like column collections,
    # which may not support bool(), keep working)
    if len(x_train_cols) == 0:
        # select relevant features for train set
        selected_features = feature_selection.select_features(x, s_y)
        print(f"\nAmount of Features after selection: "
              f"{len(selected_features.columns)}")
    else:
        # no selection is needed, features are already selected for train set
        selected_features = x[x_train_cols]

    return selected_features
Пример #15
0
def segment(dir_path):
    """Create segments of time series.

    Reads every CSV file found under ``dir_path``, concatenates them, keeps
    every 10th row, gives all rows the same id and rolls the result with
    ``roll_time_series``. The intermediate and rolled frames are printed;
    nothing is returned.

    Splitting the rows into fixed-size segments with distinct ids is
    currently disabled, so every row gets id 1.

    :param dir_path: directory searched for ``.csv`` files.
    """
    # Context manager so the params file handle is closed instead of leaked.
    with open("params.yaml") as params_file:
        # NOTE(review): ``target`` is not used by the active code; the read is
        # kept so a missing file or key still fails here as before.
        target = yaml.safe_load(params_file)["clean"]["target"]

    filepaths = find_files(dir_path, file_extension=".csv")

    # NOTE(review): ``output_columns`` is not used by the active code either;
    # kept so a missing output-features file still fails here as before.
    output_columns = np.array(
        pd.read_csv(DATA_PATH / OUTPUT_FEATURES_PATH, index_col=0)).reshape(-1)

    dfs = [pd.read_csv(filepath, index_col=0) for filepath in filepaths]

    combined_df = pd.concat(dfs, ignore_index=True)
    # Downsample: keep every 10th row.
    combined_df = combined_df[::10]
    print(combined_df)

    # A single id for all rows, i.e. the data is treated as one time series.
    n_rows = len(combined_df)
    combined_df["id"] = np.ones(n_rows)
    print(combined_df)

    df_rolled = roll_time_series(combined_df, column_id="id", column_sort=None)
    print(df_rolled)
Пример #16
0
    def test_dict_rolling_maxshift_1(self):
        """Backward rolling of a dict of frames with windows capped at two rows."""
        ids = [1, 1, 1, 1, 2, 2]
        df_dict = {
            "a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": ids}),
            "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": ids}),
        }
        # Input per kind:
        #   a: _value  id        b: _value  id
        #        1     1              5     1
        #        2     1              6     1
        #        3     1              7     1
        #        4     1              8     1
        #       10     2             12     2
        #       11     2             13     2

        df = dataframe_functions.roll_time_series(
            df_dict,
            column_id="id",
            column_sort=None,
            column_kind=None,
            rolling_direction=-1,
            max_timeshift=1,
            n_jobs=0,
        )

        correct_indices = [(1, 0), (1, 0), (1, 1), (1, 1), (1, 2), (1, 2),
                           (1, 3), (2, 0), (2, 0), (2, 1)]
        expected_values = {
            "a": [1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0],
            "b": [5.0, 6.0, 6.0, 7.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0],
        }

        for kind in ("a", "b"):
            self.assertListEqual(list(df[kind]["id"].values), correct_indices)

        for kind in ("a", "b"):
            self.assertListEqual(list(df[kind]["_value"].values),
                                 expected_values[kind])
Пример #17
0
    def test_dict_rolling(self):
        """Backward rolling of a dict of frames yields integer window ids."""
        ids = [1, 1, 1, 1, 2, 2]
        df_dict = {
            "a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": ids}),
            "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": ids}),
        }

        df = dataframe_functions.roll_time_series(
            df_dict, column_id="id", column_sort=None,
            column_kind=None, rolling_direction=-1)
        # Expected result per kind (shown for "a"; "b" has the same sort/id
        # columns with its own values):
        #     _value  sort  id
        #        1.0   0.0   0
        #        2.0   1.0   0
        #        3.0   2.0   0
        #        4.0   3.0   0
        #        2.0   1.0   1
        #        3.0   2.0   1
        #        4.0   3.0   1
        #        3.0   2.0   2
        #        4.0   3.0   2
        #        4.0   3.0   3
        #       10.0   4.0   4
        #       11.0   5.0   4
        #       11.0   5.0   5

        correct_indices = [0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 4, 4, 5]
        expected_values = {
            "a": [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0],
            "b": [5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0],
        }

        for kind in ("a", "b"):
            self.assertListEqual(list(df[kind]["id"].values), correct_indices)

        for kind in ("a", "b"):
            self.assertListEqual(list(df[kind]["_value"].values),
                                 expected_values[kind])
Пример #18
0
def test_features_on_btc():
    """Rolling six rows gives six windows and one feature row per window."""
    frame = pd.DataFrame({
        "id": [1, 1, 1, 1, 2, 2],
        "time": [1, 2, 3, 4, 8, 9],
        "x": [1, 2, 3, 4, 10, 11],
        "y": [5, 6, 7, 8, 12, 13],
    })
    expected_windows = 6

    rolled = roll_time_series(frame, column_id="id", column_sort="time")
    assert rolled['id'].nunique() == expected_windows

    features = extract_features(rolled, column_id="id", column_sort="time")
    assert features.shape[0] == expected_windows
Пример #19
0
def add_tsfresh_participant(data, tsfresh_features, columns, k):
    """Append rolling tsfresh features to every participant's frame.

    :param data: indexable collection of frames, addressed as
        ``data[0] .. data[len(data) - 1]``; each entry is replaced by the
        frame with the extracted features concatenated column-wise.
    :param tsfresh_features: ``'minimal'``, ``'efficient'`` or
        ``'comprehensive'``; any other value falls back to the minimal set.
    :param columns: names of the columns to extract features from.
    :param k: passed to ``roll_time_series`` as ``max_timeshift``.
    :return: the same ``data`` object, updated in place.
    """
    # Pick the feature calculator settings; 'minimal' and every unrecognised
    # value both end up with the minimal parameter set.
    if tsfresh_features == 'efficient':
        settings = EfficientFCParameters()
    elif tsfresh_features == 'comprehensive':
        settings = ComprehensiveFCParameters()
    else:
        settings = MinimalFCParameters()

    for participant in range(len(data)):
        frame = data[participant]

        # Temporary helper columns required for rolling.
        frame['id'] = 0
        frame['index'] = frame.index

        # Rolling creates new ids, one per window.
        rolled_series = roll_time_series(frame,
                                         column_id='id',
                                         column_sort='index',
                                         max_timeshift=k)

        # One feature frame per requested column. The index is reset because
        # extraction re-labels the rows by window id.
        all_features = [
            extract_features(rolled_series,
                             default_fc_parameters=settings,
                             column_id='id',
                             column_sort='index',
                             column_value=column).reset_index(drop=True)
            for column in columns
        ]
        extracted = pd.concat(all_features, axis=1)

        # Remove the helper columns again before attaching the features.
        del frame['id']
        del frame['index']

        data[participant] = pd.concat([frame, extracted], axis=1)

    return data
Пример #20
0
    def transform(self, X, y=None):
        """Roll ``X`` into windows using the settings stored on this object.

        :param X: the data handed to ``roll_time_series`` as first argument.
        :param y: ignored.
        :return: whatever ``roll_time_series`` returns for ``X``.
        """
        # Passed positionally, so the order must match the signature of
        # roll_time_series.
        roll_arguments = (
            self.column_id,
            self.column_sort,
            self.column_kind,
            self.rolling_direction,
            self.max_timeshift,
            self.min_timeshift,
            self.chunksize,
            self.n_jobs,
            self.show_warnings,
            self.disable_progressbar,
            self.distributor,
        )
        return roll_time_series(X, *roll_arguments)
Пример #21
0
def get_resample_features(data, window, settings=MinimalFCParameters()):
    """Roll a single time series and extract tsfresh features per window.

    All rows of ``data`` are treated as one series (a constant ``id`` of 1),
    rolled forward over the ``Timestamp`` column, and features are extracted
    for every rolled window.

    Parameters
    ----------
    data : DataFrame
        Input events. Must contain a ``Timestamp`` column, which is used for
        sorting. The frame passed in is NOT modified; the helper ``id``
        column is added to a copy.
        NOTE(review): the original description says the series is expected
        to be sampled hourly -- confirm against the callers.
    window : int
        Forwarded to ``roll_time_series`` as ``max_timeshift``.
    settings : object
        Forwarded to ``extract_features`` as ``default_fc_parameters``, e.g.
        ``ComprehensiveFCParameters()`` or ``MinimalFCParameters()``. The
        default instance is created once, when this function is defined.

    Returns
    -------
    DataFrame
        The extracted features, with the index passed through
        ``pd.to_datetime``.
    """
    # ``assign`` returns a new frame, so the caller's DataFrame does not
    # silently gain (or have overwritten) an ``id`` column.
    single_series = data.assign(id=1)

    df_roll_time = roll_time_series(
        single_series,
        column_id="id",
        column_sort='Timestamp',
        column_kind=None,
        rolling_direction=1,
        max_timeshift=window)

    X_features = extract_features(
        df_roll_time,
        column_id="id",
        column_sort='Timestamp',
        default_fc_parameters=settings,
    )

    # Expose the feature rows on a datetime index.
    X_features.index = pd.to_datetime(X_features.index)
    return X_features
Пример #22
0
    def test_stacked_rolling(self):
        """Backward rolling (rolling_direction=-1) of a stacked frame with a kind column."""
        frame_one = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)})
        frame_one["id"] = 1
        frame_two = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)})
        frame_two["id"] = 2

        wide = pd.concat([frame_one, frame_two], ignore_index=True)

        # Stack the two value columns on top of each other, "a" rows first.
        stacked_parts = [
            wide[["time", "id", kind]].rename(columns={kind: "_value"})
            for kind in ("a", "b")
        ]
        df_stacked = pd.concat(stacked_parts, ignore_index=True)
        df_stacked["kind"] = ["a"] * 6 + ["b"] * 6

        # df_stacked is
        #     time  id  _value kind
        # 0      0   1       1    a
        # 1      1   1       2    a
        # 2      2   1       3    a
        # 3      3   1       4    a
        # 4     20   2      10    a
        # 5     21   2      11    a
        # 6      0   1       5    b
        # 7      1   1       6    b
        # 8      2   1       7    b
        # 9      3   1       8    b
        # 10    20   2      12    b
        # 11    21   2      13    b

        rolled = dataframe_functions.roll_time_series(
            df_stacked,
            column_id="id",
            column_sort="time",
            column_kind="kind",
            rolling_direction=-1,
        )

        # Every window start appears once per kind and per remaining time step.
        expected_ids = [0] * 8 + [1] * 6 + [2] * 4 + [3] * 2 + [20] * 4 + [21] * 2
        self.assertListEqual(list(rolled["id"].values), expected_ids)

        print(rolled["_value"].values)
        self.assertListEqual(list(rolled["kind"].values), ["a", "b"] * 13)

        expected_values = [
            1., 5., 2., 6., 3., 7., 4., 8.,
            2., 6., 3., 7., 4., 8.,
            3., 7., 4., 8.,
            4., 8.,
            10., 12., 11., 13.,
            11., 13.,
        ]
        self.assertListEqual(list(rolled["_value"].values), expected_values)
Пример #23
0
    def _extract_features(self, data_frame):
        """Roll ``data_frame`` and extract minimal plus index-based features.

        The frame is rolled with ``max_timeshift=self.memory``; two feature
        sets (``MinimalFCParameters`` and ``IndexBasedFCParameters``) are
        extracted from the rolled data and concatenated column-wise. NaN and
        infinite entries of the result are replaced by ``0.0``.

        :param data_frame: the data handed to ``roll_time_series``.
        :return: the combined feature frame.
        """
        # Column arguments shared by the rolling and both extraction calls.
        column_args = {
            "column_id": self.column_id,
            "column_sort": self.time_stamp,
        }

        df_rolled = roll_time_series(
            data_frame,
            max_timeshift=self.memory,
            **column_args
        )

        feature_frames = [
            tsfresh.extract_features(
                df_rolled,
                default_fc_parameters=tsfresh.feature_extraction.
                MinimalFCParameters(),
                **column_args
            ),
            tsfresh.extract_features(
                df_rolled,
                default_fc_parameters=tsfresh.feature_extraction.settings.
                IndexBasedFCParameters(),
                **column_args
            ),
        ]

        extracted_features = pd.concat(feature_frames, axis=1)

        # Drop the per-set frames before collecting, as only the combined
        # frame is needed from here on.
        del feature_frames
        gc.collect()

        # Zero out NaN first, then infinities.
        for is_invalid in (np.isnan, np.isinf):
            extracted_features[is_invalid(extracted_features)] = 0.0

        return extracted_features
Пример #24
0
    def gen_rolling_feature(self,
                            window_size,
                            settings="comprehensive",
                            full_settings=None,
                            n_jobs=1):
        '''
        Generate aggregated features for every rolling window of each series.
        The rolling and the feature extraction are delegated to tsfresh.
        Make sure that the specified column name does not contain '__'.

        TODO: relationship with scale should be figured out.

        :param window_size: int, length of the rolling window. Both the maximum and
               the minimum timeshift passed to tsfresh are window_size - 1.
        :param settings: str or dict. A string must be one of "comprehensive",
               "minimal" and "efficient". Anything else is passed to tsfresh unchanged
               as default_fc_parameters. The value is defaulted to "comprehensive".
        :param full_settings: dict. Passed to tsfresh as kind_to_fc_parameters; when it
               is truthy, settings is not forwarded. The value is defaulted to None.
        :param n_jobs: int. The number of processes to use for parallelization.

        :return: the tsdataset instance.
        '''
        from tsfresh.utilities.dataframe_functions import roll_time_series
        from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
        from tsfresh import extract_features
        from tsfresh.feature_extraction import ComprehensiveFCParameters, \
            MinimalFCParameters, EfficientFCParameters

        # Preset name -> parameter class; only the requested one is instantiated.
        preset_classes = {
            "comprehensive": ComprehensiveFCParameters,
            "minimal": MinimalFCParameters,
            "efficient": EfficientFCParameters,
        }

        assert not self._has_generate_agg_feature,\
            "Only one of gen_global_feature and gen_rolling_feature should be called."

        if not isinstance(settings, str):
            default_fc_parameters = settings
        else:
            assert settings in preset_classes, \
                "settings str should be one of 'comprehensive', 'minimal', 'efficient'"\
                f", but found {settings}."
            default_fc_parameters = preset_classes[settings]()

        shortest_series = self.df.groupby(self.id_col).size().min()
        assert window_size < shortest_series + 1, "gen_rolling_feature "\
            "should have a window_size smaller than shortest time series length."

        timeshift = window_size - 1
        df_rolled = roll_time_series(self.df,
                                     column_id=self.id_col,
                                     column_sort=self.dt_col,
                                     max_timeshift=timeshift,
                                     min_timeshift=timeshift,
                                     n_jobs=n_jobs)

        # Exactly one of the two parameter arguments is handed to tsfresh.
        if full_settings:
            fc_kwargs = {"kind_to_fc_parameters": full_settings}
        else:
            fc_kwargs = {"default_fc_parameters": default_fc_parameters}

        self.roll_feature_df = extract_features(df_rolled,
                                                column_id=self.id_col,
                                                column_sort=self.dt_col,
                                                n_jobs=n_jobs,
                                                **fc_kwargs)
        impute_tsfresh(self.roll_feature_df)

        generated_columns = list(self.roll_feature_df.columns)
        self.feature_col += generated_columns
        # Separate list object, so later edits of one do not leak into the other.
        self.roll_additional_feature = list(generated_columns)
        self._has_generate_agg_feature = True
        return self
# Build one feature matrix per (room, measurement) pair and collect them all.
feature_matrix_list = []

# room_params maps each room to its number of measurements (1-based below).
for room in room_params:
    for measurement in range(1, room_params[room] + 1):
        print(f'Room: {room}, Measurement: {measurement}')
        # NOTE(review): presumably returns a DataFrame for one sensor node;
        # the loader is not visible here -- confirm.
        subset_data = loader.return_experiment_measurement(
            room_location=room, measurement_no=measurement, sensor_node=1)
        # Index by entry_id, then re-create entry_id as a regular column
        # because set_index moved it out of the columns.
        subset_data.set_index('entry_id', inplace=True)
        subset_data['entry_id'] = subset_data.index
        # The target is captured before the rows are re-sorted; it keeps the
        # entry_id index.
        target_vector = subset_data['binary_target']
        subset_data.sort_values(['node_id', 'entry_id'], inplace=True)
        # Roll forward per node_id, ordered by entry_id, with windows limited
        # by WINDOW_SIZE.
        ts_for_rolling = subset_data[FEATURES]
        ts_for_rolling = roll_time_series(ts_for_rolling,
                                          column_id='node_id',
                                          column_sort='entry_id',
                                          column_kind=None,
                                          rolling_direction=1,
                                          max_timeshift=WINDOW_SIZE)
        # Extract features for every rolled window.
        temp_df = extract_features(ts_for_rolling,
                                   n_jobs=NCORES,
                                   column_sort='entry_id',
                                   column_id='node_id',
                                   column_kind=None,
                                   show_warnings=False)
        impute_df = impute(temp_df)
        # Keep only the features selected against the binary target.
        # NOTE(review): this relies on the feature index lining up with
        # target_vector's entry_id index -- verify for the tsfresh version in use.
        temp_sel_df = select_features(X=impute_df,
                                      y=target_vector,
                                      n_jobs=NCORES)
        # Attach the selected features to the raw and join columns (index join).
        join_df = subset_data[FEATURES + JOIN_FEATURES]
        join_df = join_df.join(temp_sel_df)
        feature_matrix_list.append(join_df)
Пример #26
0
        # NOTE(review): this fragment starts inside a loop whose header is not
        # visible here; m, site_ID, folder and filename come from that scope.
        # Parse timestamps of the form 'YYYY-MM-DD HH:MM:SS' while reading.
        dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
        data = pd.read_csv(folder + filename, index_col=0, date_parser=dateparse)
        # Tag every row with the site id so the sites can be told apart later.
        data['id'] = [site_ID for _ in range(data.shape[0])]
        data.rename(columns={'AQI_': 'AQI'}, inplace=True)
        # Copy the index into a regular column for use as the sort column.
        data['time'] = data.index

        # Keep only the value, sort and id columns.
        data = data[['AQI', 'time', 'id']]
        # data = data.iloc[0:40, :]
        print(data.shape)
        # data = drop_missing_weeks(data, years)
        data_list[m] = data

    # Stack the per-site frames row-wise.
    data = pd.concat(data_list, axis=0)
    print(data.shape)

    # Roll per site with windows of at most 7*24 time shifts, then extract
    # features for every window.
    data_rolled = roll_time_series(data, column_id="id", column_sort="time", max_timeshift=7*24, n_jobs=8)
    features = extract_features(data_rolled, column_id="id", column_sort="time", n_jobs=8)
    impute(features)
    print(features.shape)
    # NOTE(review): assumes features has exactly one row per row of data, in
    # the same order -- confirm.
    features['time'] = data['time'].values
    features = drop_missing_weeks(features, years, typical_index=False)
    # 'time' was only needed by drop_missing_weeks.
    features.drop(['time'], axis=1, inplace=True)

    # Build the target series on the same index as the feature frame.
    AQI = get_raw_AQI_data(path, years)
    AQI_data = pd.Series(data=AQI['AQI'].values, index=features.index, name='AQI')
    print(AQI.shape)

    # Keep the features selected against the AQI target and save them.
    selected_features = select_features(features, AQI_data)
    print(selected_features.shape)
    selected_features.to_csv('./data/modified_data_after_feature_extraction/AQI_features.csv', index=False)
Пример #27
0
    def test_negative_rolling(self):
        """Backward rolling (rolling_direction=-1) for several max_timeshift values."""
        first_class = pd.DataFrame({
            "a": [1, 2, 3, 4],
            "b": [5, 6, 7, 8],
            "time": range(4)
        })
        first_class["id"] = 1
        second_class = pd.DataFrame({
            "a": [10, 11],
            "b": [12, 13],
            "time": range(20, 22)
        })
        second_class["id"] = 2

        df_full = pd.concat([first_class, second_class], ignore_index=True)
        # df_full is
        #     a   b  time  id
        # 0   1   5     0   1
        # 1   2   6     1   1
        # 2   3   7     2   1
        # 3   4   8     3   1
        # 4  10  12    20   2
        # 5  11  13    21   2

        # Expectation when the window length is not limited by max_timeshift.
        unlimited = (
            [0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 20, 20, 21],
            [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0],
            [5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0],
        )
        shift_one = (
            [0, 0, 1, 1, 2, 2, 3, 20, 20, 21],
            [1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0],
            [5.0, 6.0, 6.0, 7.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0],
        )
        shift_two = (
            [0, 0, 0, 1, 1, 1, 2, 2, 3, 20, 20, 21],
            [1.0, 2.0, 3.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0],
            [5.0, 6.0, 7.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0],
        )

        # (extra keyword arguments, expected result); the first scenario
        # deliberately omits max_timeshift altogether.
        scenarios = [
            ({}, unlimited),
            ({"max_timeshift": None}, unlimited),
            ({"max_timeshift": 1}, shift_one),
            ({"max_timeshift": 2}, shift_two),
            ({"max_timeshift": 4}, unlimited),
        ]

        for extra_kwargs, (correct_indices, correct_values_a, correct_values_b) in scenarios:
            df = dataframe_functions.roll_time_series(df_full,
                                                      column_id="id",
                                                      column_sort="time",
                                                      column_kind=None,
                                                      rolling_direction=-1,
                                                      **extra_kwargs)

            self.assertListEqual(list(df["id"].values), correct_indices)
            self.assertListEqual(list(df["a"].values), correct_values_a)
            self.assertListEqual(list(df["b"].values), correct_values_b)
Пример #28
0
 def get_rolling_windows(self):
     """Roll ``self.df`` and keep the result in ``self.df_rolled``.

     The id column is ``self.id`` and the sort column is
     ``self.column_sort``; every other rolling option is left to
     ``roll_time_series``.
     """
     rolled = roll_time_series(
         self.df,
         column_id=self.id,
         column_sort=self.column_sort,
     )
     self.df_rolled = rolled
Пример #29
0
    def preprocess(self,
                   feature_window: int,
                   aggregations: dict,
                   strict_feature_window: bool = True,
                   include_target: bool = True,
                   drop_na_target: bool = True) -> tuple:
        """Build rolling-window features and the shifted forecasting target.

        Works on a copy of ``self.df``: the datetime column is reduced to
        dates, every series is rolled forward into windows of at most
        ``feature_window`` rows, the target shifted by
        ``self.forecast_horizon`` is joined to each window via the date of
        its last row, and the windows are finally aggregated per
        ``(id, ref_date)``.

        Side effects: sets ``self.n_strict_rolling_win`` and
        ``self.n_rolling_win``.

        :param feature_window: number of rows per rolling window; must not
            exceed ``self.max_feature_window``.
        :param aggregations: mapping passed to ``DataFrameGroupBy.agg``
            (column -> aggregation(s)). It is deep-copied, never modified.
        :param strict_feature_window: keep only windows holding exactly
            ``feature_window`` rows.
        :param include_target: additionally aggregate the target column with
            ``'last'`` and expose it under its original name.
        :param drop_na_target: drop rows without a shifted target.
        :return: tuple ``(df_aggregated, df_rolled)``.
        :raises AssertionError: if ``feature_window`` is too large.
        """
        assert feature_window <= self.max_feature_window, \
            "Try smaller integer feature window!"
        df = self.df.copy()
        id_col = self.id_col
        datetime_col = self.datetime_col
        target_col = self.target_col
        forecast_horizon = self.forecast_horizon
        df[datetime_col] = pd.to_datetime(df[datetime_col],
                                          format=self.dt_format).dt.date
        # NOTE(review): _map_timeid is not visible here; it presumably maps
        # each date to an integer time index -- confirm.
        df['timeID'] = super(RollWin, RollWin)._map_timeid(df, datetime_col)
        # tsfresh replaces the id column by the window id, so keep the
        # original id around as the "kind" of the series.
        df['kind'] = df[id_col]

        # Process target variable outside rolling window implementation.
        # Copy first: adding a column to a slice of ``df`` would otherwise
        # trigger pandas' SettingWithCopy handling.
        df_target = df[[id_col, datetime_col, target_col]].copy()
        df_target['target_shift'] = df_target.groupby(
            id_col)[target_col].shift(-forecast_horizon)
        df_target = df_target.rename(columns={datetime_col: 'ref_date'})
        # Keyword form: a positional ``axis`` is rejected by pandas >= 2.0.
        df_target = df_target.drop(columns=[target_col])

        # Apply rolling and do some processing
        df_rolled = roll_time_series(df,
                                     column_id=id_col,
                                     column_sort='timeID',
                                     column_kind='kind',
                                     rolling_direction=1,
                                     max_timeshift=feature_window - 1)
        df_rolled = df_rolled.rename(columns={id_col: 'winID', 'kind': id_col})
        first_cols = [id_col, 'winID', 'timeID', datetime_col]
        remaining_cols = sorted(set(df_rolled.columns) - set(first_cols))
        df_rolled = df_rolled[first_cols + remaining_cols].sort_values(
            by=[id_col, 'winID', 'timeID']).reset_index(drop=True)
        # The reference date of a window is the date of its last row.
        df_rolled['ref_date'] = df_rolled.groupby(
            [id_col, 'winID'])[datetime_col].transform('last')

        df_rolled = pd.merge(df_rolled,
                             df_target,
                             how='left',
                             on=[id_col, 'ref_date'])
        first_cols = [
            id_col, 'ref_date', 'winID', datetime_col, 'timeID', 'target_shift'
        ]
        remaining_cols = sorted(set(df_rolled.columns) - set(first_cols))
        df_rolled = df_rolled[first_cols + remaining_cols]

        # Row-wise flag: does the row belong to a window of full length?
        # Computed once and reused for the statistic and the filter below.
        is_full_window = df_rolled.groupby(
            [id_col, 'winID'])['timeID'].transform(len) == feature_window

        self.n_strict_rolling_win = df_rolled[is_full_window].dropna(
            subset=['target_shift'])['timeID'].nunique()

        if strict_feature_window:
            df_rolled = df_rolled[is_full_window]

        if drop_na_target:
            # Reassign instead of dropping in place on a filtered frame.
            df_rolled = df_rolled.dropna(subset=['target_shift'])

        self.n_rolling_win = df_rolled['timeID'].nunique()

        # TODO: set default aggregations as mean and last

        aggregations_local = deepcopy(aggregations)

        if include_target:
            aggregations_local[target_col] = 'last'

        df_aggregated = df_rolled.groupby([id_col,
                                           'ref_date']).agg(aggregations_local)
        df_aggregated.reset_index(inplace=True)
        # Flatten two-level column labels to "<column>_<aggregation>". Only
        # tuples are joined: a plain string label that happens to be two
        # characters long (such as 'id') must be left untouched.
        df_aggregated.columns = [
            col[0] + '_' + col[1]
            if isinstance(col, tuple) and len(col) == 2 else col
            for col in df_aggregated.columns
        ]
        # Group keys come out of the flattening as "<name>_"; strip that.
        df_aggregated.columns = [
            col[:-1] if isinstance(col, str) and col.endswith('_') else col
            for col in df_aggregated.columns
        ]
        df_aggregated['month'] = df_aggregated['ref_date'].map(
            lambda x: x.month)
        # TODO: get dummies for all categoricals
        # Use the configured id column rather than a hard-coded ``.id``.
        df_aggregated = pd.concat(
            [df_aggregated,
             pd.get_dummies(list(df_aggregated[id_col]))], axis=1)
        if include_target:
            df_aggregated.rename(columns={target_col + '_last': target_col},
                                 inplace=True)

        cols = list(df_aggregated.columns)
        # Lead with the key columns that actually exist: the target keeps its
        # configured name and is absent when it was not aggregated.
        first_cols = [
            col for col in (id_col, 'ref_date', target_col, 'month')
            if col in cols
        ]
        remaining_cols = set(cols) - set(first_cols)
        df_aggregated = df_aggregated[first_cols + sorted(remaining_cols)]

        return df_aggregated, df_rolled