def test_extract_features_after_randomisation_per_sample(self):
    """Shuffling the input rows must not change the features ('per_sample')."""
    df = self.create_test_data_sample()
    shuffled = df.copy().sample(frac=1)
    features = extract_features(
        df, self.settings, "id", "sort", "kind", "val",
        parallelization='per_sample').sort_index()
    features_shuffled = extract_features(
        shuffled, self.settings, "id", "sort", "kind", "val",
        parallelization='per_sample').sort_index()
    six.assertCountEqual(self, features.columns, features_shuffled.columns)
    for column in features:
        # assert_array_almost_equal returns None on success, raises otherwise.
        self.assertIsNone(
            np.testing.assert_array_almost_equal(features[column],
                                                 features_shuffled[column]))
def test_extract_features(self):
    """Check a handful of known feature values on the sample data."""
    # todo: implement more methods and test more aspects
    df = self.create_test_data_sample()
    features = extract_features(df, column_id="id", column_sort="sort",
                                column_kind="kind", column_value="val",
                                n_jobs=self.n_jobs)
    self.assertIsInstance(features, pd.DataFrame)
    expected = [
        ("a__maximum", [71, 77]),
        ("a__sum_values", [691, 1017]),
        ("a__abs_energy", [32211, 63167]),
        ("b__sum_values", [757, 695]),
        ("b__minimum", [3, 1]),
        ("b__abs_energy", [36619, 35483]),
        ("b__mean", [37.85, 34.75]),
        ("b__median", [39.5, 28.0]),
    ]
    for column, values in expected:
        self.assertTrue(np.all(features[column] == np.array(values)))

    # Same check on a time series with a single-valued id.
    df_sts = self.create_one_valued_time_series()
    features_sts = extract_features(df_sts, column_id="id", column_sort="sort",
                                    column_kind="kind", column_value="val",
                                    n_jobs=self.n_jobs)
    self.assertIsInstance(features_sts, pd.DataFrame)
    self.assertTrue(np.all(features_sts["a__maximum"] == np.array([1.0, 6.0])))
    self.assertTrue(np.all(features_sts["a__sum_values"] == np.array([1.0, 11.0])))
    self.assertTrue(np.all(features_sts["a__count_above_mean"] == np.array([0, 1])))
def test_extract_features_per_sample_equals_per_kind(self):
    """'per_sample' and 'per_kind' parallelization must agree exactly."""
    df = self.create_test_data_sample()
    per_sample = extract_features(df, self.settings, "id", "sort", "kind",
                                  "val", parallelization='per_sample')
    per_kind = extract_features(df, self.settings, "id", "sort", "kind",
                                "val", parallelization='per_kind')
    six.assertCountEqual(self, per_sample.columns, per_kind.columns)
    for column in per_sample.columns:
        self.assertIsNone(
            np.testing.assert_array_almost_equal(per_sample[column],
                                                 per_kind[column]))
def test_extract_features_for_one_time_series(self):
    """Known feature values when extracting with explicit settings."""
    # todo: implement more methods and test more aspects
    df = self.create_test_data_sample()
    settings = ComprehensiveFCParameters()
    features = extract_features(df, default_fc_parameters=settings,
                                column_value="val", column_id="id",
                                column_kind="kind", column_sort="sort")
    self.assertIsInstance(features, pd.DataFrame)
    self.assertTrue(np.all(features.b__sum_values == np.array([757, 695])))
    self.assertTrue(np.all(features.b__minimum == np.array([3, 1])))
    self.assertTrue(np.all(features.b__abs_energy == np.array([36619, 35483])))
    self.assertTrue(np.all(features.b__mean == np.array([37.85, 34.75])))
    self.assertTrue(np.all(features.b__median == np.array([39.5, 28.0])))

    # Repeat on the single-valued time series fixture.
    df_sts = self.create_one_valued_time_series()
    features_sts = extract_features(df_sts, default_fc_parameters=settings,
                                    column_value="val", column_id="id",
                                    column_kind="kind", column_sort="sort")
    self.assertIsInstance(features_sts, pd.DataFrame)
    self.assertTrue(np.all(features_sts.a__maximum == np.array([1.0, 6.0])))
    self.assertTrue(np.all(features_sts.a__sum_values == np.array([1.0, 11.0])))
    self.assertTrue(np.all(features_sts.a__count_above_mean == np.array([0, 1])))
def test_extract_features_for_one_time_series(self):
    """Spot-check extracted values for kind 'b' and the one-valued series."""
    # todo: implement more methods and test more aspects
    df = self.create_test_data_sample()
    settings = ComprehensiveFCParameters()
    result = extract_features(df, default_fc_parameters=settings,
                              column_value="val", column_id="id",
                              column_kind="kind", column_sort="sort")
    self.assertIsInstance(result, pd.DataFrame)
    checks = {
        "b__sum_values": [757, 695],
        "b__minimum": [3, 1],
        "b__abs_energy": [36619, 35483],
        "b__mean": [37.85, 34.75],
        "b__median": [39.5, 28.0],
    }
    for name, values in checks.items():
        self.assertTrue(np.all(result[name] == np.array(values)))

    one_valued = self.create_one_valued_time_series()
    result_sts = extract_features(one_valued, default_fc_parameters=settings,
                                  column_value="val", column_id="id",
                                  column_kind="kind", column_sort="sort")
    self.assertIsInstance(result_sts, pd.DataFrame)
    self.assertTrue(np.all(result_sts["a__maximum"] == np.array([1.0, 6.0])))
    self.assertTrue(np.all(result_sts["a__sum_values"] == np.array([1.0, 11.0])))
    self.assertTrue(np.all(result_sts["a__count_above_mean"] == np.array([0, 1])))
def test_extract_features(self):
    """Extraction returns a DataFrame with the expected feature values."""
    # todo: implement more methods and test more aspects
    df = self.create_test_data_sample()
    result = extract_features(df, column_id="id", column_sort="sort",
                              column_kind="kind", column_value="val",
                              n_jobs=self.n_jobs)
    self.assertIsInstance(result, pd.DataFrame)
    self.assertTrue(np.all(result["a__maximum"] == np.array([71, 77])))
    self.assertTrue(np.all(result["a__sum_values"] == np.array([691, 1017])))
    self.assertTrue(np.all(result["a__abs_energy"] == np.array([32211, 63167])))
    self.assertTrue(np.all(result["b__sum_values"] == np.array([757, 695])))
    self.assertTrue(np.all(result["b__minimum"] == np.array([3, 1])))
    self.assertTrue(np.all(result["b__abs_energy"] == np.array([36619, 35483])))
    self.assertTrue(np.all(result["b__mean"] == np.array([37.85, 34.75])))
    self.assertTrue(np.all(result["b__median"] == np.array([39.5, 28.0])))

    # The one-valued fixture exercises the minimal-length edge case.
    one_valued = self.create_one_valued_time_series()
    result_sts = extract_features(one_valued, column_id="id",
                                  column_sort="sort", column_kind="kind",
                                  column_value="val", n_jobs=self.n_jobs)
    self.assertIsInstance(result_sts, pd.DataFrame)
    self.assertTrue(np.all(result_sts["a__maximum"] == np.array([1.0, 6.0])))
    self.assertTrue(np.all(result_sts["a__sum_values"] == np.array([1.0, 11.0])))
    self.assertTrue(np.all(result_sts["a__count_above_mean"] == np.array([0, 1])))
def test_from_columns_correct_for_different_kind_datatypes(self):
    """Settings round-trip works even when the 'kind' column is numeric.

    `from_columns()` reconstructs a settings dict by parsing the extracted
    column names, so with the 'long' input format the dtype of the 'kind'
    column is lost: numeric kind values (e.g. int32) come back as str keys.
    The resulting dict must nevertheless drive a second extraction run.
    """
    df = pd.DataFrame({
        'id': [1, 1, 1, 1],
        'time': [1, 1, 2, 2],
        'kind': [1, 2, 1, 2],
        'value': [1, 2, 3, 4],
    })
    features = extract_features(df, column_id='id', column_sort='time',
                                column_kind='kind', column_value='value',
                                default_fc_parameters=MinimalFCParameters())
    sample_settings = from_columns(features)
    X = extract_features(df, column_id='id', column_sort='time',
                         column_kind='kind', column_value='value',
                         kind_to_fc_parameters=sample_settings)
    # One id, all minimal features for each of the two kinds.
    assert X.shape == (1, 2 * len(MinimalFCParameters()))
def test_extract_features_after_randomisation(self):
    """Row order of the input must not influence the extracted features."""
    df = self.create_test_data_sample()
    shuffled = df.copy().sample(frac=1)
    features = extract_features(df, column_id="id", column_sort="sort",
                                column_kind="kind", column_value="val",
                                n_jobs=self.n_jobs).sort_index()
    features_shuffled = extract_features(shuffled, column_id="id",
                                         column_sort="sort",
                                         column_kind="kind",
                                         column_value="val",
                                         n_jobs=self.n_jobs).sort_index()
    self.assertCountEqual(features.columns, features_shuffled.columns)
    for column in features:
        self.assertIsNone(
            np.testing.assert_array_almost_equal(features[column],
                                                 features_shuffled[column]))
def test_profiling_cumulative_file_written_out(self):
    """Profiling with 'cumulative' sorting writes the profile file to disk.

    The profile file is removed in a finally block so a failing assertion
    does not leave a stale file behind for later test runs.
    """
    profiling_filename = os.path.join(self.directory,
                                      "test_profiling_cumulative.txt")
    df = pd.DataFrame(data={"id": np.repeat([1, 2], 10),
                            "val": np.random.normal(0, 1, 20)})
    try:
        extract_features(df, column_id="id", column_value="val",
                         n_jobs=self.n_jobs, profile=True,
                         profiling_filename=profiling_filename,
                         profiling_sorting="cumulative")
        self.assertTrue(os.path.isfile(profiling_filename))
    finally:
        # Clean up even if extraction or the assertion fails.
        if os.path.isfile(profiling_filename):
            os.remove(profiling_filename)
def test_profiling_cumulative_file_written_out(self):
    """Profiling with 'cumulative' sorting writes the profile file to disk.

    Cleanup happens in a finally block so a failed assertion does not leak
    the profile file into the test directory.
    """
    profiling_filename = os.path.join(self.directory,
                                      "test_profiling_cumulative.txt")
    df = pd.DataFrame(data={"id": np.repeat([1, 2], 10),
                            "val": np.random.normal(0, 1, 20)})
    try:
        extract_features(df, column_id="id", column_value="val",
                         n_jobs=self.n_jobs, profile=True,
                         profiling_filename=profiling_filename,
                         profiling_sorting="cumulative")
        self.assertTrue(os.path.isfile(profiling_filename))
    finally:
        # Remove the file regardless of the test outcome.
        if os.path.isfile(profiling_filename):
            os.remove(profiling_filename)
def test_from_column_correct_for_comprehensive_fc_parameters(self):
    """Extract, infer settings from the columns, re-extract: results match."""
    fset = ComprehensiveFCParameters()
    X_org = extract_features(pd.DataFrame({"value": [1, 2, 3],
                                           "id": [1, 1, 1]}),
                             default_fc_parameters=fset,
                             column_id="id", column_value="value", n_jobs=0)
    inferred_fset = from_columns(X_org)
    X_new = extract_features(pd.DataFrame({"value": [1, 2, 3],
                                           "id": [1, 1, 1]}),
                             kind_to_fc_parameters=inferred_fset,
                             column_id="id", column_value="value", n_jobs=0)
    assert_frame_equal(X_org.sort_index(), X_new.sort_index())
def test_extract_features_with_and_without_parallelization(self):
    """Parallel (n_jobs>0) and serial (n_jobs=0) extraction must agree."""
    df = self.create_test_data_sample()
    parallel = extract_features(df, column_id="id", column_sort="sort",
                                column_kind="kind", column_value="val",
                                n_jobs=self.n_jobs)
    serial = extract_features(df, column_id="id", column_sort="sort",
                              column_kind="kind", column_value="val",
                              n_jobs=0)
    six.assertCountEqual(self, parallel.columns, serial.columns)
    for column in parallel.columns:
        np.testing.assert_array_almost_equal(parallel[column],
                                             serial[column])
def test_extract_features_with_and_without_parallelization(self):
    """Running with and without worker processes gives identical features."""
    sample = self.create_test_data_sample()
    with_workers = extract_features(sample, column_id="id",
                                    column_sort="sort", column_kind="kind",
                                    column_value="val", n_jobs=self.n_jobs)
    without_workers = extract_features(sample, column_id="id",
                                       column_sort="sort", column_kind="kind",
                                       column_value="val", n_jobs=0)
    six.assertCountEqual(self, with_workers.columns, without_workers.columns)
    for name in with_workers.columns:
        np.testing.assert_array_almost_equal(with_workers[name],
                                             without_workers[name])
def test_from_columns(self):
    """`from_columns` parses feature names back into a settings dict.

    Checks error handling for malformed inputs, parsing of parameter-less,
    parameterized and "apply"-style feature names, and that the inferred
    settings reproduce the original feature matrix.
    """
    tsn = "TEST_TIME_SERIES"

    # Malformed inputs must raise.
    self.assertRaises(TypeError, from_columns, 42)
    self.assertRaises(ValueError, from_columns, ["This is not a column name"])
    self.assertRaises(ValueError, from_columns, ["This__neither"])
    self.assertRaises(ValueError, from_columns, ["This__also__not"])

    # Aggregate functions
    feature_names = [tsn + '__sum_values', tsn + "__median",
                     tsn + "__length", tsn + "__sample_entropy"]
    # Aggregate functions with params
    feature_names += [tsn + '__quantile__q_10', tsn + '__quantile__q_70',
                      tsn + '__number_peaks__n_30',
                      tsn + '__value_count__value_inf',
                      tsn + '__value_count__value_-inf',
                      tsn + '__value_count__value_nan']
    # Apply functions
    feature_names += [tsn + '__ar_coefficient__k_20__coeff_4',
                      tsn + '__ar_coefficient__coeff_10__k_-1']

    kind_to_fc_parameters = from_columns(feature_names)
    six.assertCountEqual(self, list(kind_to_fc_parameters[tsn].keys()),
                         ["sum_values", "median", "length", "sample_entropy",
                          "quantile", "number_peaks", "ar_coefficient",
                          "value_count"])
    self.assertEqual(kind_to_fc_parameters[tsn]["sum_values"], None)
    self.assertEqual(kind_to_fc_parameters[tsn]["ar_coefficient"],
                     [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}])
    # np.PINF / np.NINF / np.NaN were removed in NumPy 2.0; use the
    # canonical np.inf / -np.inf / np.nan spellings instead.
    self.assertEqual(kind_to_fc_parameters[tsn]["value_count"],
                     [{"value": np.inf}, {"value": -np.inf},
                      {"value": np.nan}])

    # Round-trip: settings inferred from extracted columns reproduce the
    # original matrix for all comprehensive feature calculators.
    fset = ComprehensiveFCParameters()
    X_org = extract_features(pd.DataFrame({"value": [1, 2, 3],
                                           "id": [1, 1, 1]}),
                             default_fc_parameters=fset,
                             column_id="id", column_value="value", n_jobs=0)
    inferred_fset = from_columns(X_org)
    X_new = extract_features(pd.DataFrame({"value": [1, 2, 3],
                                           "id": [1, 1, 1]}),
                             kind_to_fc_parameters=inferred_fset,
                             column_id="id", column_value="value", n_jobs=0)
    assert_frame_equal(X_org.sort_index(), X_new.sort_index())
def test_from_columns(self):
    """`from_columns` must reconstruct settings from feature column names.

    Covers invalid-input errors, parsing of plain and parameterized feature
    names (including inf/-inf/nan parameter values), and a full round-trip
    against extract_features.
    """
    tsn = "TEST_TIME_SERIES"

    # Invalid inputs raise TypeError / ValueError.
    self.assertRaises(TypeError, from_columns, 42)
    self.assertRaises(ValueError, from_columns, ["This is not a column name"])
    self.assertRaises(ValueError, from_columns, ["This__neither"])
    self.assertRaises(ValueError, from_columns, ["This__also__not"])

    # Aggregate functions
    feature_names = [tsn + '__sum_values', tsn + "__median",
                     tsn + "__length", tsn + "__sample_entropy"]
    # Aggregate functions with params
    feature_names += [tsn + '__quantile__q_10', tsn + '__quantile__q_70',
                      tsn + '__number_peaks__n_30',
                      tsn + '__value_count__value_inf',
                      tsn + '__value_count__value_-inf',
                      tsn + '__value_count__value_nan']
    # Apply functions
    feature_names += [tsn + '__ar_coefficient__k_20__coeff_4',
                      tsn + '__ar_coefficient__coeff_10__k_-1']

    kind_to_fc_parameters = from_columns(feature_names)
    six.assertCountEqual(self, list(kind_to_fc_parameters[tsn].keys()),
                         ["sum_values", "median", "length", "sample_entropy",
                          "quantile", "number_peaks", "ar_coefficient",
                          "value_count"])
    self.assertEqual(kind_to_fc_parameters[tsn]["sum_values"], None)
    self.assertEqual(kind_to_fc_parameters[tsn]["ar_coefficient"],
                     [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}])
    # NumPy 2.0 removed the np.PINF / np.NINF / np.NaN aliases.
    self.assertEqual(kind_to_fc_parameters[tsn]["value_count"],
                     [{"value": np.inf}, {"value": -np.inf},
                      {"value": np.nan}])

    # Round-trip check over all comprehensive calculators.
    fset = ComprehensiveFCParameters()
    X_org = extract_features(pd.DataFrame({"value": [1, 2, 3],
                                           "id": [1, 1, 1]}),
                             default_fc_parameters=fset,
                             column_id="id", column_value="value", n_jobs=0)
    inferred_fset = from_columns(X_org)
    X_new = extract_features(pd.DataFrame({"value": [1, 2, 3],
                                           "id": [1, 1, 1]}),
                             kind_to_fc_parameters=inferred_fset,
                             column_id="id", column_value="value", n_jobs=0)
    assert_frame_equal(X_org.sort_index(), X_new.sort_index())
def test_extract_features_for_index_based_functions(self):
    """Time-index-aware calculators run, and the result is indexed by id."""
    df = self.create_test_data_sample_with_time_index()
    settings = {
        'linear_trend_timewise': [{"attr": "slope"}],
        'linear_trend': [{"attr": "slope"}],
    }
    features = extract_features(df, default_fc_parameters=settings,
                                column_value="val", column_id="id",
                                column_kind="kind", column_sort="sort")
    self.assertIsInstance(features, pd.DataFrame)

    slope_a = features['a__linear_trend_timewise__attr_"slope"'].values
    slope_b = features['b__linear_trend_timewise__attr_"slope"'].values
    self.assertAlmostEqual(slope_a[0], -0.001347117)
    self.assertAlmostEqual(slope_a[1], 0.052036340)
    self.assertAlmostEqual(slope_b[0], 0.021898496)
    self.assertAlmostEqual(slope_b[1], -0.012312)

    # The returned frame is indexed by the sample id, not by the timestamp.
    self.assertTrue(features.index.dtype != df.index.dtype)
    self.assertTrue(features.index.dtype == df['id'].dtype)
    self.assertEqual(sorted(features.index.unique().tolist()),
                     sorted(df['id'].unique().tolist()))
def test_extract_features_without_settings(self):
    """Without kind/value columns, every value column yields features."""
    frame = pd.DataFrame(data={"id": np.repeat([1, 2], 10),
                               "value1": np.random.normal(0, 1, 20),
                               "value2": np.random.normal(0, 1, 20)})
    X = extract_features(frame, column_id="id")
    self.assertIn("value1__maximum", list(X.columns))
    self.assertIn("value2__maximum", list(X.columns))
def test_extract_features(self):
    """Verify a set of known feature values using positional arguments."""
    # todo: implement more methods and test more aspects
    df = self.create_test_data_sample()
    features = extract_features(df, self.settings, "id", "sort", "kind", "val")
    self.assertIsInstance(features, pd.DataFrame)
    expected = [
        ("a__maximum", [71, 77]),
        ("a__sum_values", [691, 1017]),
        ("a__abs_energy", [32211, 63167]),
        ("b__sum_values", [757, 695]),
        ("b__minimum", [3, 1]),
        ("b__abs_energy", [36619, 35483]),
        ("b__mean", [37.85, 34.75]),
        ("b__median", [39.5, 28.0]),
    ]
    for column, values in expected:
        self.assertTrue(np.all(features[column] == np.array(values)))
def test_extract_features_after_randomisation(self):
    """Shuffled input rows must yield exactly the same features."""
    df = self.create_test_data_sample()
    shuffled = df.copy().sample(frac=1)
    features = extract_features(df, column_id="id", column_sort="sort",
                                column_kind="kind", column_value="val",
                                n_jobs=self.n_jobs).sort_index()
    features_shuffled = extract_features(shuffled, column_id="id",
                                         column_sort="sort",
                                         column_kind="kind",
                                         column_value="val",
                                         n_jobs=self.n_jobs).sort_index()
    six.assertCountEqual(self, features.columns, features_shuffled.columns)
    for column in features:
        self.assertIsNone(
            np.testing.assert_array_almost_equal(features[column],
                                                 features_shuffled[column]))
def test_extract_index_preservation(self):
    """The input index values reappear as the index of the result frame."""
    df = self.create_test_data_nearly_numerical_indices()
    features = extract_features(df, column_id="id", column_sort="sort",
                                column_kind="kind", column_value="val",
                                n_jobs=self.n_jobs)
    self.assertIsInstance(features, pd.DataFrame)
    self.assertTrue(set(df.index) == set(features.index))
def test_extract_features_without_settings(self):
    """Default settings produce features for every non-id column."""
    frame = pd.DataFrame(data={"id": np.repeat([1, 2], 10),
                               "value1": np.random.normal(0, 1, 20),
                               "value2": np.random.normal(0, 1, 20)})
    X = extract_features(frame, column_id="id", n_jobs=self.n_jobs)
    self.assertIn("value1__maximum", list(X.columns))
    self.assertIn("value2__maximum", list(X.columns))
def test_extract_index_preservation(self):
    """The set of ids in the input becomes the index of the result."""
    df = self.create_test_data_nearly_numerical_indices()
    features = extract_features(df, column_id="id", column_sort="sort",
                                column_kind="kind", column_value="val",
                                n_jobs=self.n_jobs)
    self.assertIsInstance(features, pd.DataFrame)
    self.assertEqual(set(df["id"]), set(features.index))
def test_profiling_file_written_out(self):
    """Enabling profiling must write the profile file to disk.

    Cleanup is done in a finally block so a failed assertion does not leave
    a stale profile file behind.
    """
    df = pd.DataFrame(data={"id": np.repeat([1, 2], 10),
                            "val": np.random.normal(0, 1, 20)})
    profiling_filename = os.path.join(self.directory, "test_profiling.txt")
    try:
        extract_features(df, column_id="id", column_value="val",
                         n_jobs=self.n_jobs, profile=True,
                         profiling_filename=profiling_filename)
        self.assertTrue(os.path.isfile(profiling_filename))
    finally:
        # Always remove the file, even when the assertion fails.
        if os.path.isfile(profiling_filename):
            os.remove(profiling_filename)
def test_extraction_runs_through(self):
    """Extraction with EfficientFCParameters completes on a tiny frame."""
    rfs = EfficientFCParameters()
    data = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]],
                        columns=["id", "time", "kind", "value"])
    features = extract_features(data, default_fc_parameters=rfs,
                                column_kind="kind", column_value="value",
                                column_sort="time", column_id="id")
    six.assertCountEqual(self, features.index, [0, 1])
def test_calculate_ts_features_after_randomisation(self):
    """Row order of the input must not influence the extracted features."""
    df = self.create_test_data_sample()
    df_random = df.copy().sample(frac=1)
    extracted_features = extract_features(
        df, self.settings, "id", "sort", "kind", "val").sort_index()
    extracted_features_from_random = extract_features(
        df_random, self.settings, "id", "sort", "kind", "val").sort_index()
    # assertItemsEqual exists only on Python 2; it was renamed to
    # assertCountEqual in Python 3.
    self.assertCountEqual(extracted_features.columns,
                          extracted_features_from_random.columns)
    for col in extracted_features:
        self.assertIsNone(
            np.testing.assert_array_almost_equal(
                extracted_features[col],
                extracted_features_from_random[col]))
def test_extraction_runs_through(self):
    """A minimal two-row frame is processed without errors."""
    efficient_settings = EfficientFCParameters()
    frame = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]],
                         columns=["id", "time", "kind", "value"])
    result = extract_features(frame, default_fc_parameters=efficient_settings,
                              column_kind="kind", column_value="value",
                              column_sort="time", column_id="id")
    six.assertCountEqual(self, result.index, [0, 1])
def test_profiling_file_written_out(self):
    """Profiling writes the profile file; the file is removed afterwards.

    The removal runs in a finally block so repeated test runs never see a
    leftover profile file from a failed assertion.
    """
    df = pd.DataFrame(data={"id": np.repeat([1, 2], 10),
                            "val": np.random.normal(0, 1, 20)})
    profiling_filename = os.path.join(self.directory, "test_profiling.txt")
    try:
        extract_features(df, column_id="id", column_value="val",
                         n_jobs=self.n_jobs, profile=True,
                         profiling_filename=profiling_filename)
        self.assertTrue(os.path.isfile(profiling_filename))
    finally:
        if os.path.isfile(profiling_filename):
            os.remove(profiling_filename)
def test_profiling_cumulative_file_written_out(self):
    """'per_sample' extraction with cumulative profiling writes the file.

    The profile file is removed in a finally block so a failing assertion
    does not leave it behind in the working directory.
    """
    profiling_filename = "test_profiling_cumulative.txt"
    df = pd.DataFrame(data={
        "id": np.repeat([1, 2], 10),
        "val": np.random.normal(0, 1, 20),
    })
    try:
        extract_features(df, column_id="id", column_value="val",
                         parallelization='per_sample',
                         n_processes=self.n_processes, profile=True,
                         profiling_filename=profiling_filename,
                         profiling_sorting="cumulative")
        self.assertTrue(os.path.isfile(profiling_filename))
    finally:
        # Clean up regardless of the test outcome.
        if os.path.isfile(profiling_filename):
            os.remove(profiling_filename)
def test_extraction_runs_through(self):
    """Legacy settings object drives extraction on a minimal frame."""
    settings = ReasonableFeatureExtractionSettings()
    frame = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]],
                         columns=["id", "time", "kind", "value"])
    result = extract_features(frame, feature_extraction_settings=settings,
                              column_kind="kind", column_value="value",
                              column_sort="time", column_id="id")
    six.assertCountEqual(self, result.index, [0, 1])
def test_profiling_file_written_out(self):
    """Enabling profiling via the settings object writes the profile file.

    Cleanup is in a finally block so a failed assertion does not leak the
    file into the working directory.
    """
    fes = FeatureExtractionSettings()
    fes.PROFILING = True
    fes.PROFILING_FILENAME = "test_profiling.txt"
    df = pd.DataFrame(data={"id": np.repeat([1, 2], 10),
                            "value": np.random.normal(0, 1, 20)})
    try:
        extract_features(df, column_id="id", column_value="value",
                         feature_extraction_settings=fes)
        self.assertTrue(os.path.isfile(fes.PROFILING_FILENAME))
    finally:
        # Remove the file even when extraction or the assertion fails.
        if os.path.isfile(fes.PROFILING_FILENAME):
            os.remove(fes.PROFILING_FILENAME)
def test_extraction_runs_through(self):
    """MinimalFCParameters yields exactly the expected minimal feature set."""
    minimal_settings = MinimalFCParameters()
    frame = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]],
                         columns=["id", "time", "kind", "value"])
    result = extract_features(frame, default_fc_parameters=minimal_settings,
                              column_kind="kind", column_value="value",
                              column_sort="time", column_id="id")
    expected_columns = ["0__median", "0__standard_deviation",
                        "0__sum_values", "0__maximum", "0__variance",
                        "0__minimum", "0__mean", "0__length",
                        "0__root_mean_square", "0__absolute_maximum"]
    self.assertCountEqual(result.columns, expected_columns)
    self.assertCountEqual(result.index, [0, 1])
def test_extraction_runs_through(self):
    """The minimal legacy settings produce exactly the eight basic features."""
    minimal_settings = MinimalFeatureExtractionSettings()
    frame = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]],
                         columns=["id", "time", "kind", "value"])
    result = extract_features(frame,
                              feature_extraction_settings=minimal_settings,
                              column_kind="kind", column_value="value",
                              column_sort="time", column_id="id")
    expected_columns = ["0__median", "0__standard_deviation",
                        "0__sum_values", "0__maximum", "0__variance",
                        "0__minimum", "0__mean", "0__length"]
    six.assertCountEqual(self, result.columns, expected_columns)
    six.assertCountEqual(self, result.index, [0, 1])
def test_extract_features_alphabetically_sorted(self):
    """The parameter chunks of every feature name must be sorted."""
    df = self.create_test_data_sample()
    features = extract_features(df, column_id="id", column_sort="sort",
                                column_kind="kind", column_value="val")
    for column_name in features.columns:
        # Drop the kind prefix and calculator name; only the parameter
        # chunks of the configuration have to be alphabetically ordered.
        param_chunks = column_name.split("__")[2:]
        self.assertEqual(param_chunks, sorted(param_chunks))
def test_distributor_map_reduce_and_close_are_called(self):
    """extract_features must delegate to the distributor and close it."""
    df = self.create_test_data_sample()
    distributor_mock = Mock(spec=DistributorBaseClass)
    distributor_mock.close.return_value = None
    distributor_mock.map_reduce.return_value = []
    X = extract_features(timeseries_container=df, column_id="id",
                         column_sort="sort", column_kind="kind",
                         column_value="val",
                         default_fc_parameters=self.name_to_param,
                         distributor=distributor_mock)
    self.assertTrue(distributor_mock.close.called)
    self.assertTrue(distributor_mock.map_reduce.called)
def test_extraction_runs_through(self):
    """MinimalFCParameters produces exactly the eight basic features."""
    minimal_settings = MinimalFCParameters()
    frame = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]],
                         columns=["id", "time", "kind", "value"])
    result = extract_features(frame, default_fc_parameters=minimal_settings,
                              column_kind="kind", column_value="value",
                              column_sort="time", column_id="id")
    expected_columns = ["0__median", "0__standard_deviation",
                        "0__sum_values", "0__maximum", "0__variance",
                        "0__minimum", "0__mean", "0__length"]
    six.assertCountEqual(self, result.columns, expected_columns)
    six.assertCountEqual(self, result.index, [0, 1])
def test_distributor_map_reduce_and_close_are_called(self):
    """A supplied distributor is used for map_reduce and closed afterwards."""
    sample = self.create_test_data_sample()
    fake_distributor = Mock(spec=DistributorBaseClass)
    fake_distributor.close.return_value = None
    fake_distributor.map_reduce.return_value = []
    X = extract_features(timeseries_container=sample, column_id="id",
                         column_sort="sort", column_kind="kind",
                         column_value="val",
                         default_fc_parameters=self.name_to_param,
                         distributor=fake_distributor)
    self.assertTrue(fake_distributor.map_reduce.called)
    self.assertTrue(fake_distributor.close.called)
def test_extract_features_uses_only_kind_to_fc_settings(self):
    """Only the kinds listed in kind_to_fc_parameters are extracted."""
    df = self.create_test_data_sample()
    features = extract_features(
        df, column_id="id", column_sort="sort", column_kind="kind",
        column_value="val", n_jobs=self.n_jobs,
        kind_to_fc_parameters={"a": {"maximum": None, "minimum": None}})
    # Two ids in the sample data -> two rows in the result.
    assert len(features) == 2
def test_extract_features_per_sample_equals_per_kind(self):
    """All three parallelization strategies must produce identical results."""
    df = self.create_test_data_sample()
    per_sample = extract_features(df, column_id="id", column_sort="sort",
                                  column_kind="kind", column_value="val",
                                  parallelization='per_sample',
                                  n_processes=self.n_processes)
    per_kind = extract_features(df, column_id="id", column_sort="sort",
                                column_kind="kind", column_value="val",
                                parallelization='per_kind',
                                n_processes=self.n_processes)
    serial = extract_features(df, column_id="id", column_sort="sort",
                              column_kind="kind", column_value="val",
                              parallelization='serial')
    six.assertCountEqual(self, per_sample.columns, per_kind.columns)
    six.assertCountEqual(self, per_sample.columns, serial.columns)
    for column in per_sample.columns:
        self.assertIsNone(
            np.testing.assert_array_almost_equal(per_sample[column],
                                                 per_kind[column]))
        self.assertIsNone(
            np.testing.assert_array_almost_equal(per_sample[column],
                                                 serial[column]))
def test_distributor_close_is_called(self):
    """extract_features must close the distributor it was given."""
    sample = self.create_test_data_sample()
    distributor = MapDistributor()
    # Only close() is mocked; the real map_reduce still runs.
    distributor.close = Mock()
    distributor.close.return_value = None
    X = extract_features(timeseries_container=sample, column_id="id",
                         column_sort="sort", column_kind="kind",
                         column_value="val",
                         default_fc_parameters=self.name_to_param,
                         distributor=distributor)
    self.assertTrue(distributor.close.called)
def test_profiling_file_written_out(self):
    """'per_kind' extraction with profiling enabled writes the profile file.

    The file is removed in a finally block so a failing assertion does not
    leave it behind in the working directory.
    """
    df = pd.DataFrame(data={
        "id": np.repeat([1, 2], 10),
        "val": np.random.normal(0, 1, 20),
    })
    profiling_filename = "test_profiling.txt"
    try:
        extract_features(df, column_id="id", column_value="val",
                         parallelization='per_kind',
                         n_processes=self.n_processes, profile=True,
                         profiling_filename=profiling_filename)
        self.assertTrue(os.path.isfile(profiling_filename))
    finally:
        # Clean up regardless of the test outcome.
        if os.path.isfile(profiling_filename):
            os.remove(profiling_filename)
def test_extract_features_custom_function(self):
    """User-provided feature calculators run next to the built-in ones."""
    df = self.create_test_data_sample()

    def custom_function(x, p):
        # Dummy calculator: series length shifted by the parameter p.
        # The function name becomes part of the output column name.
        return len(x) + p

    settings = PickeableSettings({
        'mean': None,
        custom_function: [{"p": 1}, {"p": -1}],
    })
    features = extract_features(df, default_fc_parameters=settings,
                                column_value="val", column_id="id",
                                column_kind="kind", column_sort="sort")
    self.assertIsInstance(features, pd.DataFrame)

    mean_a = features['a__mean'].values
    plus_one = features['a__custom_function__p_1'].values
    minus_one = features['a__custom_function__p_-1'].values
    self.assertAlmostEqual(mean_a[0], 34.55)
    self.assertAlmostEqual(mean_a[1], 50.85)
    self.assertAlmostEqual(plus_one[0], 21)
    self.assertAlmostEqual(plus_one[1], 21)
    self.assertAlmostEqual(minus_one[0], 19)
    self.assertAlmostEqual(minus_one[1], 19)