예제 #1
0
    def test_extract_features_after_randomisation_per_sample(self):
        """Check that shuffling the input rows does not change the extracted features.

        Features are extracted with ``parallelization='per_sample'`` from the
        sample data and from a row-shuffled version of it. After sorting both
        results by index, the column sets must match and every column must be
        almost equal.
        """
        df = self.create_test_data_sample()
        # ``sample(frac=1)`` already returns a new, shuffled frame, so no
        # explicit copy of ``df`` is needed beforehand.
        df_random = df.sample(frac=1)

        extracted_features = extract_features(
            df, self.settings, "id", "sort", "kind", "val",
            parallelization='per_sample').sort_index()
        extracted_features_from_random = extract_features(
            df_random, self.settings, "id", "sort", "kind", "val",
            parallelization='per_sample').sort_index()

        six.assertCountEqual(self, extracted_features.columns,
                             extracted_features_from_random.columns)

        for col in extracted_features:
            # assert_array_almost_equal raises on mismatch and returns None on
            # success, so wrapping it in assertIsNone adds nothing.
            np.testing.assert_array_almost_equal(
                extracted_features[col],
                extracted_features_from_random[col])
예제 #2
0
    def test_extract_features(self):
        """Compare selected extracted feature columns with hard-coded expected values.

        Runs the extraction on the sample data and on the one-valued time
        series fixture, using ``self.n_jobs`` for both calls.
        """
        # todo: implement more methods and test more aspects
        sample_df = self.create_test_data_sample()
        sample_features = extract_features(sample_df, column_id="id", column_sort="sort",
                                           column_kind="kind", column_value="val",
                                           n_jobs=self.n_jobs)
        self.assertIsInstance(sample_features, pd.DataFrame)

        # (column name, expected values) pairs, checked in this order.
        expected_sample = (
            ("a__maximum", [71, 77]),
            ("a__sum_values", [691, 1017]),
            ("a__abs_energy", [32211, 63167]),
            ("b__sum_values", [757, 695]),
            ("b__minimum", [3, 1]),
            ("b__abs_energy", [36619, 35483]),
            ("b__mean", [37.85, 34.75]),
            ("b__median", [39.5, 28.0]),
        )
        for column, values in expected_sample:
            self.assertTrue(np.all(getattr(sample_features, column) == np.array(values)))

        one_valued_df = self.create_one_valued_time_series()
        one_valued_features = extract_features(one_valued_df, column_id="id", column_sort="sort",
                                               column_kind="kind", column_value="val",
                                               n_jobs=self.n_jobs)

        self.assertIsInstance(one_valued_features, pd.DataFrame)
        expected_one_valued = (
            ("a__maximum", [1.0, 6.0]),
            ("a__sum_values", [1.0, 11.0]),
            ("a__count_above_mean", [0, 1]),
        )
        for column, values in expected_one_valued:
            self.assertTrue(np.all(getattr(one_valued_features, column) == np.array(values)))
예제 #3
0
    def test_extract_features_per_sample_equals_per_kind(self):
        """Check that 'per_sample' and 'per_kind' parallelization yield the same features.

        Both runs use the same sample data and ``self.settings``; the column
        sets must match and every column must be almost equal.
        """
        df = self.create_test_data_sample()

        features_per_sample = extract_features(
            df, self.settings, "id", "sort", "kind", "val",
            parallelization='per_sample')
        features_per_kind = extract_features(
            df, self.settings, "id", "sort", "kind", "val",
            parallelization='per_kind')

        six.assertCountEqual(self, features_per_sample.columns,
                             features_per_kind.columns)

        for col in features_per_sample.columns:
            # The numpy helper raises on mismatch by itself; an extra
            # assertIsNone around its (always None) return value is redundant.
            np.testing.assert_array_almost_equal(features_per_sample[col],
                                                 features_per_kind[col])
예제 #4
0
    def test_extract_features_for_one_time_series(self):
        """Check selected ``b__*`` / ``a__*`` feature values using ComprehensiveFCParameters.

        The same settings object is used for the sample data and for the
        one-valued time series fixture.
        """
        # todo: implement more methods and test more aspects
        df = self.create_test_data_sample()
        settings = ComprehensiveFCParameters()

        features = extract_features(df, default_fc_parameters=settings,
                                    column_value="val", column_id="id",
                                    column_kind="kind", column_sort="sort")

        self.assertIsInstance(features, pd.DataFrame)
        # Expected values per column, verified in the listed order.
        for name, expected in (
                ("b__sum_values", [757, 695]),
                ("b__minimum", [3, 1]),
                ("b__abs_energy", [36619, 35483]),
                ("b__mean", [37.85, 34.75]),
                ("b__median", [39.5, 28.0])):
            self.assertTrue(np.all(getattr(features, name) == np.array(expected)))

        df_sts = self.create_one_valued_time_series()
        features_sts = extract_features(df_sts, default_fc_parameters=settings,
                                        column_value="val", column_id="id",
                                        column_kind="kind", column_sort="sort")

        self.assertIsInstance(features_sts, pd.DataFrame)
        for name, expected in (
                ("a__maximum", [1.0, 6.0]),
                ("a__sum_values", [1.0, 11.0]),
                ("a__count_above_mean", [0, 1])):
            self.assertTrue(np.all(getattr(features_sts, name) == np.array(expected)))
예제 #5
0
    def test_extract_features_for_one_time_series(self):
        """Extract with ComprehensiveFCParameters and compare chosen columns to fixed values."""
        # todo: implement more methods and test more aspects
        sample_frame = self.create_test_data_sample()
        settings = ComprehensiveFCParameters()

        sample_result = extract_features(sample_frame, default_fc_parameters=settings,
                                         column_value="val", column_id="id",
                                         column_kind="kind", column_sort="sort")

        self.assertIsInstance(sample_result, pd.DataFrame)
        sample_expectations = [
            ("b__sum_values", np.array([757, 695])),
            ("b__minimum", np.array([3, 1])),
            ("b__abs_energy", np.array([36619, 35483])),
            ("b__mean", np.array([37.85, 34.75])),
            ("b__median", np.array([39.5, 28.0])),
        ]
        for column, wanted in sample_expectations:
            matches = getattr(sample_result, column) == wanted
            self.assertTrue(np.all(matches))

        single_value_frame = self.create_one_valued_time_series()
        single_value_result = extract_features(single_value_frame, default_fc_parameters=settings,
                                               column_value="val", column_id="id",
                                               column_kind="kind", column_sort="sort")

        self.assertIsInstance(single_value_result, pd.DataFrame)
        single_value_expectations = [
            ("a__maximum", np.array([1.0, 6.0])),
            ("a__sum_values", np.array([1.0, 11.0])),
            ("a__count_above_mean", np.array([0, 1])),
        ]
        for column, wanted in single_value_expectations:
            matches = getattr(single_value_result, column) == wanted
            self.assertTrue(np.all(matches))
예제 #6
0
    def test_extract_features(self):
        """Extract features with ``self.n_jobs`` workers and verify known column values."""
        # todo: implement more methods and test more aspects
        df = self.create_test_data_sample()
        extracted = extract_features(df, column_id="id", column_sort="sort",
                                     column_kind="kind", column_value="val",
                                     n_jobs=self.n_jobs)
        self.assertIsInstance(extracted, pd.DataFrame)

        # Ordered (column, expected values) table for the sample data.
        checks = (
            ("a__maximum", [71, 77]),
            ("a__sum_values", [691, 1017]),
            ("a__abs_energy", [32211, 63167]),
            ("b__sum_values", [757, 695]),
            ("b__minimum", [3, 1]),
            ("b__abs_energy", [36619, 35483]),
            ("b__mean", [37.85, 34.75]),
            ("b__median", [39.5, 28.0]),
        )
        for col, expected in checks:
            self.assertTrue(np.all(getattr(extracted, col) == np.array(expected)))

        df_sts = self.create_one_valued_time_series()
        extracted_sts = extract_features(df_sts, column_id="id", column_sort="sort",
                                         column_kind="kind", column_value="val",
                                         n_jobs=self.n_jobs)

        self.assertIsInstance(extracted_sts, pd.DataFrame)
        checks_sts = (
            ("a__maximum", [1.0, 6.0]),
            ("a__sum_values", [1.0, 11.0]),
            ("a__count_above_mean", [0, 1]),
        )
        for col, expected in checks_sts:
            self.assertTrue(np.all(getattr(extracted_sts, col) == np.array(expected)))
예제 #7
0
    def test_from_columns_correct_for_different_kind_datatypes(self):
        """Round-trip settings through ``from_columns`` when the kind values are integers.

        ``settings.from_columns()`` is meant to save feature extraction /
        selection results for later reuse, and it does so by parsing the column
        names of the extracted dataframe. With 'long' format input this loses
        the type of the 'kind' column: even for int32 kind values, the top
        level keys of the resulting settings dict (one per kind) are str.

        The test extracts minimal features, infers settings from the resulting
        columns, extracts again with those settings and checks the shape of
        the second result.
        """
        long_df = pd.DataFrame({
            'id': [1, 1, 1, 1],
            'time': [1, 1, 2, 2],
            'kind': [1, 2, 1, 2],
            'value': [1, 2, 3, 4]
        })
        column_kwargs = dict(column_id='id', column_sort='time',
                             column_kind='kind', column_value='value')

        features = extract_features(long_df,
                                    default_fc_parameters=MinimalFCParameters(),
                                    **column_kwargs)
        sample_settings = from_columns(features)
        X = extract_features(long_df,
                             kind_to_fc_parameters=sample_settings,
                             **column_kwargs)

        # One id row; two kinds, each with every minimal feature.
        assert X.shape == (1, 2 * len(MinimalFCParameters()))
예제 #8
0
    def test_extract_features_after_randomisation(self):
        """Check that shuffling the input rows does not change the extracted features.

        Both the original and the shuffled frame are processed with
        ``n_jobs=self.n_jobs``; results are sorted by index before the column
        sets and the per-column values are compared.
        """
        df = self.create_test_data_sample()
        # ``sample`` returns a new frame, so copying ``df`` first is unnecessary.
        df_random = df.sample(frac=1)

        extracted_features = extract_features(
            df,
            column_id="id",
            column_sort="sort",
            column_kind="kind",
            column_value="val",
            n_jobs=self.n_jobs).sort_index()
        extracted_features_from_random = extract_features(
            df_random,
            column_id="id",
            column_sort="sort",
            column_kind="kind",
            column_value="val",
            n_jobs=self.n_jobs).sort_index()

        self.assertCountEqual(extracted_features.columns,
                              extracted_features_from_random.columns)

        for col in extracted_features:
            # The numpy assertion raises on mismatch; its return value is
            # always None, so no assertIsNone wrapper is needed.
            np.testing.assert_array_almost_equal(
                extracted_features[col],
                extracted_features_from_random[col])
예제 #9
0
    def test_profiling_cumulative_file_written_out(self):
        """Check that a profiled extraction with "cumulative" sorting leaves a file behind.

        The test asserts that a file exists at the requested path inside
        ``self.directory`` after ``extract_features`` was called with
        ``profile=True``, and removes that file afterwards.
        """
        profiling_filename = os.path.join(self.directory, "test_profiling_cumulative.txt")
        profiling_sorting = "cumulative"

        df = pd.DataFrame(data={"id": np.repeat([1, 2], 10), "val": np.random.normal(0, 1, 20)})
        try:
            extract_features(df, column_id="id",
                             column_value="val", n_jobs=self.n_jobs,
                             profile=True, profiling_filename=profiling_filename,
                             profiling_sorting=profiling_sorting)

            self.assertTrue(os.path.isfile(profiling_filename))
        finally:
            # Remove the output even if the extraction raises part-way, so no
            # stale profiling file is left behind for later runs.
            if os.path.isfile(profiling_filename):
                os.remove(profiling_filename)
예제 #10
0
    def test_profiling_cumulative_file_written_out(self):
        """Check that profiling with "cumulative" sorting writes the requested file.

        A file must exist at the chosen path under ``self.directory`` once
        ``extract_features`` returns; the file is deleted again at the end.
        """
        profiling_filename = os.path.join(self.directory, "test_profiling_cumulative.txt")
        profiling_sorting = "cumulative"

        df = pd.DataFrame(data={"id": np.repeat([1, 2], 10), "val": np.random.normal(0, 1, 20)})
        try:
            extract_features(df, column_id="id", column_value="val", n_jobs=self.n_jobs,
                             profile=True, profiling_filename=profiling_filename,
                             profiling_sorting=profiling_sorting)

            self.assertTrue(os.path.isfile(profiling_filename))
        finally:
            # Cleanup lives in ``finally`` so that an exception raised during
            # extraction cannot leave the profiling file on disk.
            if os.path.isfile(profiling_filename):
                os.remove(profiling_filename)
예제 #11
0
 def test_from_column_correct_for_comprehensive_fc_parameters(self):
     """Check that settings inferred via ``from_columns`` reproduce the original features.

     Features are extracted with ComprehensiveFCParameters, settings are
     inferred from the resulting columns, and a second extraction with those
     settings must give an equal frame (after sorting both by index).
     """
     raw = {"value": [1, 2, 3], "id": [1, 1, 1]}
     shared_kwargs = dict(column_id="id", column_value="value", n_jobs=0)

     fset = ComprehensiveFCParameters()
     X_org = extract_features(pd.DataFrame(raw),
                              default_fc_parameters=fset,
                              **shared_kwargs)

     inferred_fset = from_columns(X_org)
     # A fresh input frame is built for the second run, as in the first.
     X_new = extract_features(pd.DataFrame(raw),
                              kind_to_fc_parameters=inferred_fset,
                              **shared_kwargs)

     assert_frame_equal(X_org.sort_index(), X_new.sort_index())
예제 #12
0
    def test_extract_features_with_and_without_parallelization(self):
        """Compare extraction with ``n_jobs=self.n_jobs`` against extraction with ``n_jobs=0``.

        Both results must have the same columns and almost equal values.
        """
        df = self.create_test_data_sample()
        column_kwargs = dict(column_id="id", column_sort="sort",
                             column_kind="kind", column_value="val")

        parallel_result = extract_features(df, n_jobs=self.n_jobs, **column_kwargs)
        serial_result = extract_features(df, n_jobs=0, **column_kwargs)

        six.assertCountEqual(self, parallel_result.columns, serial_result.columns)

        for name in parallel_result.columns:
            np.testing.assert_array_almost_equal(parallel_result[name], serial_result[name])
예제 #13
0
    def test_extract_features_with_and_without_parallelization(self):
        """Check that the ``n_jobs=self.n_jobs`` and ``n_jobs=0`` runs agree.

        The column sets must be equal and each column almost equal.
        """
        sample = self.create_test_data_sample()

        with_jobs = extract_features(
            sample,
            column_id="id", column_sort="sort",
            column_kind="kind", column_value="val",
            n_jobs=self.n_jobs)
        without_jobs = extract_features(
            sample,
            column_id="id", column_sort="sort",
            column_kind="kind", column_value="val",
            n_jobs=0)

        six.assertCountEqual(self, with_jobs.columns, without_jobs.columns)

        for column in with_jobs.columns:
            left = with_jobs[column]
            right = without_jobs[column]
            np.testing.assert_array_almost_equal(left, right)
예제 #14
0
    def test_from_columns(self):
        """Exercise ``from_columns`` on invalid input, hand-written names and real output.

        Three parts:
        1. invalid arguments must raise TypeError / ValueError,
        2. a list of feature names must be parsed into the expected settings
           dict for the kind ``tsn``,
        3. settings inferred from a full ComprehensiveFCParameters extraction
           must reproduce the same features when used for a second extraction.
        """
        tsn = "TEST_TIME_SERIES"

        # NOTE(review): the original repeated this exact assertion twice; a
        # second, different non-string input may have been intended - confirm.
        self.assertRaises(TypeError, from_columns, 42)
        self.assertRaises(ValueError, from_columns, ["This is not a column name"])
        self.assertRaises(ValueError, from_columns, ["This__neither"])
        self.assertRaises(ValueError, from_columns, ["This__also__not"])

        # Aggregate functions
        feature_names = [tsn + '__sum_values', tsn + "__median", tsn + "__length", tsn + "__sample_entropy"]

        # Aggregate functions with params
        feature_names += [tsn + '__quantile__q_10', tsn + '__quantile__q_70', tsn + '__number_peaks__n_30',
                          tsn + '__value_count__value_inf', tsn + '__value_count__value_-inf',
                          tsn + '__value_count__value_nan']

        # Apply functions
        feature_names += [tsn + '__ar_coefficient__k_20__coeff_4', tsn + '__ar_coefficient__coeff_10__k_-1']

        kind_to_fc_parameters = from_columns(feature_names)

        six.assertCountEqual(self, list(kind_to_fc_parameters[tsn].keys()),
                             ["sum_values", "median", "length", "sample_entropy", "quantile", "number_peaks",
                              "ar_coefficient", "value_count"])

        self.assertEqual(kind_to_fc_parameters[tsn]["sum_values"], None)
        self.assertEqual(kind_to_fc_parameters[tsn]["ar_coefficient"],
                         [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}])

        # np.PINF / np.NINF / np.NaN were removed in NumPy 2.0; np.inf and
        # np.nan are the same values (np.nan is the very object the np.NaN
        # alias referred to), so the comparison is unchanged on older NumPy.
        self.assertEqual(kind_to_fc_parameters[tsn]["value_count"],
                         [{"value": np.inf}, {"value": -np.inf}, {"value": np.nan}])

        # test that it passes for all functions
        fset = ComprehensiveFCParameters()
        X_org = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                                 default_fc_parameters=fset,
                                 column_id="id", column_value="value",
                                 n_jobs=0)

        inferred_fset = from_columns(X_org)

        X_new = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                                 kind_to_fc_parameters=inferred_fset,
                                 column_id="id", column_value="value",
                                 n_jobs=0)

        assert_frame_equal(X_org.sort_index(), X_new.sort_index())
예제 #15
0
    def test_from_columns(self):
        """Test ``from_columns`` error handling, name parsing and round-tripping.

        Invalid arguments must raise, a hand-built list of feature names must
        be turned into the expected per-kind settings, and settings inferred
        from a ComprehensiveFCParameters extraction must yield an equal frame
        when fed back into ``extract_features``.
        """
        tsn = "TEST_TIME_SERIES"

        # NOTE(review): this assertion appeared twice, verbatim, in the
        # original; possibly a different invalid input was meant - confirm.
        self.assertRaises(TypeError, from_columns, 42)
        self.assertRaises(ValueError, from_columns, ["This is not a column name"])
        self.assertRaises(ValueError, from_columns, ["This__neither"])
        self.assertRaises(ValueError, from_columns, ["This__also__not"])

        # Aggregate functions
        feature_names = [tsn + '__sum_values', tsn + "__median", tsn + "__length", tsn + "__sample_entropy"]

        # Aggregate functions with params
        feature_names += [tsn + '__quantile__q_10', tsn + '__quantile__q_70', tsn + '__number_peaks__n_30',
                          tsn + '__value_count__value_inf', tsn + '__value_count__value_-inf',
                          tsn + '__value_count__value_nan']

        # Apply functions
        feature_names += [tsn + '__ar_coefficient__k_20__coeff_4', tsn + '__ar_coefficient__coeff_10__k_-1']

        kind_to_fc_parameters = from_columns(feature_names)
        parsed = kind_to_fc_parameters[tsn]

        six.assertCountEqual(self, list(parsed.keys()),
                             ["sum_values", "median", "length", "sample_entropy", "quantile", "number_peaks",
                              "ar_coefficient", "value_count"])

        self.assertEqual(parsed["sum_values"], None)
        self.assertEqual(parsed["ar_coefficient"],
                         [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}])

        # The np.PINF / np.NINF / np.NaN aliases no longer exist in NumPy 2.0.
        # np.inf / np.nan denote the same objects on older releases, so the
        # expectation itself is unchanged.
        self.assertEqual(parsed["value_count"],
                         [{"value": np.inf}, {"value": -np.inf}, {"value": np.nan}])

        # test that it passes for all functions
        fset = ComprehensiveFCParameters()
        X_org = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                                 default_fc_parameters=fset,
                                 column_id="id", column_value="value",
                                 n_jobs=0)

        inferred_fset = from_columns(X_org)

        X_new = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                                 kind_to_fc_parameters=inferred_fset,
                                 column_id="id", column_value="value",
                                 n_jobs=0)

        assert_frame_equal(X_org.sort_index(), X_new.sort_index())
예제 #16
0
    def test_extract_features_for_index_based_functions(self):
        """Check ``linear_trend_timewise`` slopes and that the result is indexed by id.

        Uses the fixture with a time index, requests the "slope" attribute of
        ``linear_trend_timewise`` and ``linear_trend``, and compares the
        timewise slopes of kinds 'a' and 'b' with fixed reference values.
        """
        df = self.create_test_data_sample_with_time_index()

        settings = {
            'linear_trend_timewise': [{"attr": "slope"}],
            'linear_trend': [{"attr": "slope"}]
        }

        extracted_features = extract_features(
            df,
            default_fc_parameters=settings,
            column_value="val",
            column_id="id",
            column_kind="kind",
            column_sort="sort")

        self.assertIsInstance(extracted_features, pd.DataFrame)

        slope_a = extracted_features['a__linear_trend_timewise__attr_"slope"'].values
        slope_b = extracted_features['b__linear_trend_timewise__attr_"slope"'].values

        # (values, position, reference slope); elements are read lazily so
        # the checks run in the listed order.
        for slopes, position, reference in (
                (slope_a, 0, -0.001347117),
                (slope_a, 1, 0.052036340),
                (slope_b, 0, 0.021898496),
                (slope_b, 1, -0.012312)):
            self.assertAlmostEqual(slopes[position], reference)

        # Test that the index of the returned df is the ID and not the timestamp
        result_dtype_differs_from_input_index = extracted_features.index.dtype != df.index.dtype
        self.assertTrue(result_dtype_differs_from_input_index)
        result_dtype_matches_id = extracted_features.index.dtype == df['id'].dtype
        self.assertTrue(result_dtype_matches_id)

        result_ids = sorted(extracted_features.index.unique().tolist())
        input_ids = sorted(df['id'].unique().tolist())
        self.assertEqual(result_ids, input_ids)
예제 #17
0
 def test_extract_features_without_settings(self):
     """Check that extraction with only ``column_id`` given yields a maximum feature per value column."""
     frame = pd.DataFrame(data={"id": np.repeat([1, 2], 10),
                                "value1": np.random.normal(0, 1, 20),
                                "value2": np.random.normal(0, 1, 20)})
     column_names = list(extract_features(frame, column_id="id").columns)
     for expected_name in ("value1__maximum", "value2__maximum"):
         self.assertIn(expected_name, column_names)
예제 #18
0
    def test_extract_features(self):
        """Extract features with ``self.settings`` and compare chosen columns to fixed values."""
        # todo: implement more methods and test more aspects
        df = self.create_test_data_sample()
        result = extract_features(df, self.settings, "id", "sort", "kind", "val")

        self.assertIsInstance(result, pd.DataFrame)

        # Ordered table of (column, expected values).
        expectations = (
            ("a__maximum", [71, 77]),
            ("a__sum_values", [691, 1017]),
            ("a__abs_energy", [32211, 63167]),
            ("b__sum_values", [757, 695]),
            ("b__minimum", [3, 1]),
            ("b__abs_energy", [36619, 35483]),
            ("b__mean", [37.85, 34.75]),
            ("b__median", [39.5, 28.0]),
        )
        for column, values in expectations:
            all_equal = np.all(getattr(result, column) == np.array(values))
            self.assertTrue(all_equal)
예제 #19
0
    def test_extract_features_after_randomisation(self):
        """Check that the extracted features do not depend on the row order of the input.

        The sample data and a shuffled version of it are processed with
        ``n_jobs=self.n_jobs``; after sorting by index the column sets must
        match and each column must be almost equal.
        """
        df = self.create_test_data_sample()
        # ``sample(frac=1)`` yields a new shuffled frame; a prior copy is redundant.
        df_random = df.sample(frac=1)

        extracted_features = extract_features(df, column_id="id", column_sort="sort", column_kind="kind",
                                              column_value="val",
                                              n_jobs=self.n_jobs).sort_index()
        extracted_features_from_random = extract_features(df_random, column_id="id", column_sort="sort",
                                                          column_kind="kind",
                                                          column_value="val",
                                                          n_jobs=self.n_jobs).sort_index()

        six.assertCountEqual(self, extracted_features.columns, extracted_features_from_random.columns)

        for col in extracted_features:
            # The numpy helper raises on mismatch and always returns None, so
            # it is called directly instead of through assertIsNone.
            np.testing.assert_array_almost_equal(extracted_features[col],
                                                 extracted_features_from_random[col])
예제 #20
0
    def test_extract_index_preservation(self):
        """Check that the result's index holds the same set of values as the input frame's index."""
        df = self.create_test_data_nearly_numerical_indices()
        extracted_features = extract_features(
            df,
            column_id="id", column_sort="sort",
            column_kind="kind", column_value="val",
            n_jobs=self.n_jobs)

        self.assertIsInstance(extracted_features, pd.DataFrame)

        input_index_values = set(df.index)
        output_index_values = set(extracted_features.index)
        self.assertTrue(input_index_values == output_index_values)
예제 #21
0
 def test_extract_features_without_settings(self):
     """Check that extraction without explicit settings yields a maximum feature per value column."""
     frame = pd.DataFrame(data={"id": np.repeat([1, 2], 10),
                                "value1": np.random.normal(0, 1, 20),
                                "value2": np.random.normal(0, 1, 20)})
     result = extract_features(frame, column_id="id", n_jobs=self.n_jobs)
     produced = list(result.columns)
     for wanted in ("value1__maximum", "value2__maximum"):
         self.assertIn(wanted, produced)
예제 #22
0
    def test_extract_index_preservation(self):
        """Check that the result is indexed by exactly the ids found in the input's "id" column."""
        source = self.create_test_data_nearly_numerical_indices()
        features = extract_features(
            source,
            column_id="id",
            column_sort="sort",
            column_kind="kind",
            column_value="val",
            n_jobs=self.n_jobs)

        self.assertIsInstance(features, pd.DataFrame)

        expected_ids = set(source["id"])
        actual_ids = set(features.index)
        self.assertEqual(expected_ids, actual_ids)
예제 #23
0
    def test_profiling_file_written_out(self):
        """Check that a profiled extraction leaves a file at ``profiling_filename``.

        The file is placed in ``self.directory`` and removed again afterwards.
        """
        df = pd.DataFrame(data={"id": np.repeat([1, 2], 10), "val": np.random.normal(0, 1, 20)})
        profiling_filename = os.path.join(self.directory, "test_profiling.txt")
        try:
            # The extraction result itself is not inspected by this test.
            extract_features(df, column_id="id", column_value="val", n_jobs=self.n_jobs,
                             profile=True, profiling_filename=profiling_filename)

            self.assertTrue(os.path.isfile(profiling_filename))
        finally:
            # Always clean up, including when the extraction raises after the
            # file has been created.
            if os.path.isfile(profiling_filename):
                os.remove(profiling_filename)
예제 #24
0
    def test_extraction_runs_through(self):
        """Run an extraction with EfficientFCParameters on a two-row frame and check the result's index."""
        rows = [[0, 0, 0, 0], [1, 0, 0, 0]]
        data = pd.DataFrame(rows, columns=["id", "time", "kind", "value"])
        fc_parameters = EfficientFCParameters()

        extracted_features = extract_features(
            data,
            default_fc_parameters=fc_parameters,
            column_kind="kind",
            column_value="value",
            column_sort="time",
            column_id="id")

        # One result row per id in the input.
        six.assertCountEqual(self, extracted_features.index, [0, 1])
예제 #25
0
    def test_calculate_ts_features_after_randomisation(self):
        """Check that shuffling the input rows does not change the extracted features.

        Features are extracted with ``self.settings`` from the sample data and
        from a shuffled version; after sorting by index, the column names and
        the per-column values must agree.
        """
        df = self.create_test_data_sample()
        # ``sample`` returns a new frame, so ``df`` need not be copied first.
        df_random = df.sample(frac=1)

        extracted_features = extract_features(df, self.settings, "id", "sort",
                                              "kind", "val").sort_index()
        extracted_features_from_random = extract_features(
            df_random, self.settings, "id", "sort", "kind",
            "val").sort_index()

        # ``assertItemsEqual`` only exists on Python 2's TestCase. Comparing
        # the sorted column names performs the same order-insensitive check
        # and works on both Python 2 and Python 3.
        self.assertEqual(sorted(extracted_features.columns),
                         sorted(extracted_features_from_random.columns))

        for col in extracted_features:
            # Raises on mismatch by itself; no assertIsNone wrapper required.
            np.testing.assert_array_almost_equal(
                extracted_features[col],
                extracted_features_from_random[col])
예제 #26
0
    def test_extraction_runs_through(self):
        """Smoke test: extract with EfficientFCParameters and check the result is indexed by 0 and 1."""
        rfs = EfficientFCParameters()
        column_names = ["id", "time", "kind", "value"]
        data = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]], columns=column_names)

        result = extract_features(data, default_fc_parameters=rfs,
                                  column_kind="kind", column_value="value",
                                  column_sort="time", column_id="id")

        expected_index = [0, 1]
        six.assertCountEqual(self, result.index, expected_index)
예제 #27
0
    def test_profiling_file_written_out(self):
        """Check that ``profile=True`` leaves a file at the requested path in ``self.directory``.

        The file is deleted again once the check has run.
        """
        df = pd.DataFrame(data={"id": np.repeat([1, 2], 10), "val": np.random.normal(0, 1, 20)})
        profiling_filename = os.path.join(self.directory, "test_profiling.txt")
        try:
            # Only the side effect (the written file) matters here, so the
            # returned features are not kept.
            extract_features(df, column_id="id",
                             column_value="val", n_jobs=self.n_jobs,
                             profile=True, profiling_filename=profiling_filename)

            self.assertTrue(os.path.isfile(profiling_filename))
        finally:
            # Remove the file even when the extraction fails midway.
            if os.path.isfile(profiling_filename):
                os.remove(profiling_filename)
예제 #28
0
    def test_profiling_cumulative_file_written_out(self):
        """Check that profiling with "cumulative" sorting writes ``test_profiling_cumulative.txt``.

        The extraction runs with ``parallelization='per_sample'`` and
        ``self.n_processes``; the file name is relative, so it is resolved
        against the current working directory. The file is removed afterwards.
        """
        profiling_filename = "test_profiling_cumulative.txt"
        profiling_sorting = "cumulative"

        df = pd.DataFrame(data={
            "id": np.repeat([1, 2], 10),
            "val": np.random.normal(0, 1, 20)
        })
        try:
            extract_features(df,
                             column_id="id",
                             column_value="val",
                             parallelization='per_sample',
                             n_processes=self.n_processes,
                             profile=True,
                             profiling_filename=profiling_filename,
                             profiling_sorting=profiling_sorting)

            self.assertTrue(os.path.isfile(profiling_filename))
        finally:
            # Clean up in every case so a run that raises does not leave the
            # file in the working directory.
            if os.path.isfile(profiling_filename):
                os.remove(profiling_filename)
예제 #29
0
    def test_extraction_runs_through(self):
        """Smoke test: extract with ReasonableFeatureExtractionSettings and check the result's index."""
        settings = ReasonableFeatureExtractionSettings()

        data = pd.DataFrame(
            [[0, 0, 0, 0],
             [1, 0, 0, 0]],
            columns=["id", "time", "kind", "value"])

        result = extract_features(
            data,
            feature_extraction_settings=settings,
            column_kind="kind", column_value="value",
            column_sort="time", column_id="id")

        six.assertCountEqual(self, result.index, [0, 1])
예제 #30
0
    def test_profiling_file_written_out(self):
        """Check that enabling profiling on the settings object leaves the configured file behind.

        ``PROFILING`` and ``PROFILING_FILENAME`` are set on a fresh
        FeatureExtractionSettings instance; after the extraction a file must
        exist under that (relative) name. The file is removed afterwards.
        """
        fes = FeatureExtractionSettings()
        fes.PROFILING = True
        fes.PROFILING_FILENAME = "test_profiling.txt"

        df = pd.DataFrame(data={"id": np.repeat([1, 2], 10), "value": np.random.normal(0, 1, 20)})
        try:
            # The returned features are irrelevant for this test.
            extract_features(df, column_id="id", column_value="value", feature_extraction_settings=fes)

            self.assertTrue(os.path.isfile(fes.PROFILING_FILENAME))
        finally:
            # Remove the file even if the extraction raises part-way.
            if os.path.isfile(fes.PROFILING_FILENAME):
                os.remove(fes.PROFILING_FILENAME)
예제 #31
0
    def test_extraction_runs_through(self):
        """Extract with MinimalFCParameters and check the exact set of result columns and index values."""
        mfs = MinimalFCParameters()

        data = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]], columns=["id", "time", "kind", "value"])

        extracted_features = extract_features(
            data,
            default_fc_parameters=mfs,
            column_kind="kind", column_value="value",
            column_sort="time", column_id="id")

        # Every expected column is prefixed with the single kind value "0".
        expected_columns = [
            "0__median", "0__standard_deviation", "0__sum_values",
            "0__maximum", "0__variance", "0__minimum", "0__mean",
            "0__length", "0__root_mean_square", "0__absolute_maximum",
        ]
        self.assertCountEqual(extracted_features.columns, expected_columns)
        self.assertCountEqual(extracted_features.index, [0, 1])
예제 #32
0
    def test_extraction_runs_through(self):
        """Extract with MinimalFeatureExtractionSettings and check the result's columns and index."""
        mfs = MinimalFeatureExtractionSettings()

        data = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]], columns=["id", "time", "kind", "value"])

        extracted_features = extract_features(
            data,
            feature_extraction_settings=mfs,
            column_kind="kind", column_value="value",
            column_sort="time", column_id="id")

        # All expected columns carry the kind value "0" as prefix.
        expected_columns = [
            "0__median", "0__standard_deviation", "0__sum_values",
            "0__maximum", "0__variance", "0__minimum", "0__mean",
            "0__length",
        ]
        six.assertCountEqual(self, extracted_features.columns, expected_columns)
        six.assertCountEqual(self, extracted_features.index, [0, 1])
예제 #33
0
    def test_extract_features_alphabetically_sorted(self):
        """Verify the configuration parts of each feature column name are sorted."""
        sample = self.create_test_data_sample()

        extracted = extract_features(
            sample,
            column_id="id",
            column_sort="sort",
            column_kind="kind",
            column_value="val",
        )

        for column in extracted.columns:
            # Column names are "__"-separated. The first two parts (the kind of
            # the column and the calculator name) are dropped, which leaves only
            # the parts describing the calculator's configuration.
            config_parts = column.split("__")[2:]

            self.assertEqual(config_parts, sorted(config_parts))
예제 #34
0
    def test_distributor_map_reduce_and_close_are_called(self):
        """A distributor passed to extract_features must be used and then closed."""
        sample = self.create_test_data_sample()

        # Stand-in restricted to the DistributorBaseClass interface; map_reduce
        # yields no results and close does nothing.
        distributor = Mock(spec=DistributorBaseClass)
        distributor.map_reduce.return_value = []
        distributor.close.return_value = None

        extract_features(
            timeseries_container=sample,
            column_id="id",
            column_sort="sort",
            column_kind="kind",
            column_value="val",
            default_fc_parameters=self.name_to_param,
            distributor=distributor,
        )

        self.assertTrue(distributor.close.called)
        self.assertTrue(distributor.map_reduce.called)
예제 #35
0
    def test_extraction_runs_through(self):
        """Smoke test: extraction with MinimalFCParameters completes.

        Checks the produced feature column names and the index only; the
        feature values themselves are not inspected.
        """
        # Two ids (0 and 1) with a single observation each, all of kind 0.
        frame = pd.DataFrame(
            [[0, 0, 0, 0], [1, 0, 0, 0]],
            columns=["id", "time", "kind", "value"],
        )
        minimal_parameters = MinimalFCParameters()
        expected_columns = [
            "0__median",
            "0__standard_deviation",
            "0__sum_values",
            "0__maximum",
            "0__variance",
            "0__minimum",
            "0__mean",
            "0__length",
        ]

        result = extract_features(
            frame,
            default_fc_parameters=minimal_parameters,
            column_kind="kind",
            column_value="value",
            column_sort="time",
            column_id="id",
        )

        six.assertCountEqual(self, result.columns, expected_columns)
        six.assertCountEqual(self, result.index, [0, 1])
예제 #36
0
    def test_distributor_map_reduce_and_close_are_called(self):
        """extract_features must call both map_reduce() and close() on its distributor."""
        # Mock limited to the DistributorBaseClass interface, with inert
        # return values for the two methods under observation.
        fake_distributor = Mock(spec=DistributorBaseClass)
        fake_distributor.map_reduce.return_value = []
        fake_distributor.close.return_value = None

        extraction_kwargs = dict(
            timeseries_container=self.create_test_data_sample(),
            column_id="id",
            column_sort="sort",
            column_kind="kind",
            column_value="val",
            default_fc_parameters=self.name_to_param,
            distributor=fake_distributor,
        )
        extract_features(**extraction_kwargs)

        self.assertTrue(fake_distributor.close.called)
        self.assertTrue(fake_distributor.map_reduce.called)
예제 #37
0
 def test_extract_features_uses_only_kind_to_fc_settings(self):
     """Run extraction configured solely through ``kind_to_fc_parameters``.

     Only kind "a" is configured (maximum and minimum); the test checks
     that the result contains two rows.
     """
     sample = self.create_test_data_sample()
     per_kind_settings = {"a": {"maximum": None, "minimum": None}}

     result = extract_features(sample,
                               column_id="id",
                               column_sort="sort",
                               column_kind="kind",
                               column_value="val",
                               n_jobs=self.n_jobs,
                               kind_to_fc_parameters=per_kind_settings)

     assert len(result) == 2
예제 #38
0
    def test_extract_features_per_sample_equals_per_kind(self):
        """The three parallelization modes must agree on columns and values.

        Features are extracted with 'per_sample', 'per_kind' and 'serial'
        parallelization; the per-sample result is the reference the other two
        are compared against.
        """
        sample = self.create_test_data_sample()
        column_kwargs = dict(column_id="id", column_sort="sort",
                             column_kind="kind", column_value="val")

        per_sample = extract_features(sample, parallelization='per_sample',
                                      n_processes=self.n_processes, **column_kwargs)
        per_kind = extract_features(sample, parallelization='per_kind',
                                    n_processes=self.n_processes, **column_kwargs)
        serial = extract_features(sample, parallelization='serial', **column_kwargs)

        others = (per_kind, serial)

        # Same set of feature names, regardless of column order.
        for other in others:
            six.assertCountEqual(self, per_sample.columns, other.columns)

        # Same values, column by column (assert_array_almost_equal returns
        # None on success and raises otherwise).
        for name in per_sample.columns:
            reference = per_sample[name]
            for other in others:
                self.assertIsNone(
                    np.testing.assert_array_almost_equal(reference, other[name]))
예제 #39
0
    def test_distributor_close_is_called(self):
        """extract_features should call close() on the distributor handed to it."""
        sample = self.create_test_data_sample()

        distributor = MapDistributor()
        # Replace close() on this instance with a mock so the call can be observed.
        distributor.close = Mock(return_value=None)

        extract_features(timeseries_container=sample, column_id="id", column_sort="sort",
                         column_kind="kind", column_value="val",
                         default_fc_parameters=self.name_to_param,
                         distributor=distributor)

        self.assertTrue(distributor.close.called)
예제 #40
0
    def test_profiling_file_written_out(self):
        """Check that ``profile=True`` writes the requested profiling file.

        The file is created relative to the current working directory, so a
        stale copy left behind by an earlier aborted run is deleted first;
        otherwise the existence check could succeed without anything having
        been written. Cleanup runs in a ``finally`` block so that a failing
        extraction or assertion does not leave the file behind.
        """
        profiling_filename = "test_profiling.txt"

        # A leftover file would make the isfile() assertion trivially true.
        if os.path.isfile(profiling_filename):
            os.remove(profiling_filename)

        # Two ids with ten random observations each.
        df = pd.DataFrame(data={
            "id": np.repeat([1, 2], 10),
            "val": np.random.normal(0, 1, 20)
        })

        try:
            # Only the side effect (the written file) matters; the result is discarded.
            extract_features(df,
                             column_id="id",
                             column_value="val",
                             parallelization='per_kind',
                             n_processes=self.n_processes,
                             profile=True,
                             profiling_filename=profiling_filename)

            self.assertTrue(os.path.isfile(profiling_filename))
        finally:
            if os.path.isfile(profiling_filename):
                os.remove(profiling_filename)
예제 #41
0
    def test_extract_features_custom_function(self):
        """Extract features using a locally defined function as a calculator.

        The settings mix a calculator referenced by name ('mean') with a
        function object used directly as a key and given two parameter sets
        (p=1 and p=-1). The test then checks the resulting columns for kind
        "a" of the sample data.
        """
        df = self.create_test_data_sample()

        # Custom calculator: length of the series shifted by the parameter p.
        def custom_function(x, p):
            return len(x) + p

        # NOTE(review): PickeableSettings is defined outside this block; it is
        # presumably a picklable settings mapping - confirm against its definition.
        settings = PickeableSettings({
            'mean': None,
            custom_function: [{
                "p": 1
            }, {
                "p": -1
            }],
        })

        extracted_features = extract_features(df,
                                              default_fc_parameters=settings,
                                              column_value="val",
                                              column_id="id",
                                              column_kind="kind",
                                              column_sort="sort")

        self.assertIsInstance(extracted_features, pd.DataFrame)

        # Columns are looked up as <kind>__<function name>__<parameter>_<value>.
        mean_a = extracted_features['a__mean'].values
        custom_function_a_1 = extracted_features[
            'a__custom_function__p_1'].values
        custom_function_a_m1 = extracted_features[
            'a__custom_function__p_-1'].values

        self.assertAlmostEqual(mean_a[0], 34.55)
        self.assertAlmostEqual(mean_a[1], 50.85)
        # With custom_function = len(x) + p, the expected 21 (p=1) and 19 (p=-1)
        # correspond to a series length of 20 for both ids.
        self.assertAlmostEqual(custom_function_a_1[0], 21)
        self.assertAlmostEqual(custom_function_a_1[1], 21)
        self.assertAlmostEqual(custom_function_a_m1[0], 19)
        self.assertAlmostEqual(custom_function_a_m1[1], 19)