def extractFeatures(dataSetToExtractFrom, feature_settings="minimal"):
    """Extract tsfresh features from the given dataset and return them
    as a new dataframe of features only.

    Keyword arguments:
    dataSetToExtractFrom -- Dataset (type: pandas.core.frame.DataFrame)
    feature_settings -- Feature extraction parameter
        (type: string, options: 'minimal', 'maximal', 'findBest')

    Returns: pandas.core.frame.DataFrame
    """
    # Meta columns must not be fed into the feature calculators.
    signal_data = dataSetToExtractFrom.drop(
        columns=['label', 'hand', 'annotator'])

    # Map the user-facing option names onto tsfresh parameter sets;
    # anything unrecognized falls back to the minimal set with a warning.
    if feature_settings == "maximal":
        fc_parameters = ComprehensiveFCParameters()
    elif feature_settings == "findBest":
        fc_parameters = EfficientFCParameters()
    else:
        if feature_settings != "minimal":
            print('Given value for feature_parameter not valid! Minimal feature set is used instead.')
        fc_parameters = MinimalFCParameters()

    return extract_features(signal_data,
                            column_id="punch_id",
                            column_sort="timestamp",
                            impute_function=impute,
                            default_fc_parameters=fc_parameters)
def test_from_columns_correct_for_different_kind_datatypes(self):
    """The `settings.from_columns()` function is supposed to save the
    feature extraction / selection results so it can be reused later.
    It works by parsing the column names of the extracted dataframes.
    An unfortunate side effect of this is that when used with the 'long'
    format time series input, the typing information about the 'kind'
    column is lost. For example, even if the 'kind' values are in int32,
    in the resulting settings dict, the type of the top level keys
    (representing different kind values) will be str.
    """
    # One id, two int-typed kinds, two time points each.
    df = pd.DataFrame({
        'id': [1, 1, 1, 1],
        'time': [1, 1, 2, 2],
        'kind': [1, 2, 1, 2],
        'value': [1, 2, 3, 4]
    })
    features = extract_features(
        df, column_id='id', column_sort='time',
        column_kind='kind', column_value='value',
        default_fc_parameters=MinimalFCParameters())
    # Round-trip the extracted column names back into a settings dict.
    sample_settings = from_columns(features)
    # Re-extraction with the parsed settings must still work even though
    # the kind keys became strings.
    X = extract_features(df, column_id='id', column_sort='time',
                         column_kind='kind', column_value='value',
                         kind_to_fc_parameters=sample_settings)
    # One row (single id), minimal features for each of the two kinds.
    assert X.shape == (1, 2 * len(MinimalFCParameters()))
def extract_sub_window(df_x, y, window, start_index, lag, fc_parameters="min", n_jobs=-1):
    """Extract relevant tsfresh features for one rolling sub-window.

    Returns a (features, y) tuple where y is restricted to the window ids
    that survived the rolling-window construction.
    """
    from tsfresh import extract_relevant_features
    from tsfresh.feature_extraction.settings import MinimalFCParameters

    # The string sentinel avoids a mutable default argument.
    if fc_parameters == "min":
        fc_parameters = MinimalFCParameters()

    window_start, window_end = window
    rolled = get_rolling_timeseries(df_x, start_index, lag,
                                    window_start, window_end)

    worker_count = multiprocessing.cpu_count() if n_jobs == -1 else n_jobs

    # Keep only targets whose window actually exists after rolling.
    y = y[y.index.isin(rolled.window_id)]

    feats = extract_relevant_features(rolled, y,
                                      column_id="window_id",
                                      column_sort="timestamp",
                                      column_value=None,
                                      default_fc_parameters=fc_parameters,
                                      n_jobs=worker_count)
    # Tag columns with the window bounds so merged frames stay distinguishable.
    feats = feats.add_suffix(f"_{window_start}_{window_end}")
    return (feats, y)
def test_make_forecasting_frame_feature_extraction(self):
    """Smoke test: a frame produced by make_forecasting_frame can be fed
    straight into extract_relevant_features without raising."""
    # Four hourly samples to roll into forecasting windows.
    t_index = pd.date_range('1/1/2011', periods=4, freq='H')
    df, y = dataframe_functions.make_forecasting_frame(x=pd.Series(data=range(4),
                                                                   index=t_index),
                                                       kind="test",
                                                       max_timeshift=1,
                                                       rolling_direction=1)
    # No assertion on the result: an exception here fails the test.
    extract_relevant_features(df, y,
                              column_id="id", column_sort="time",
                              column_value="value",
                              default_fc_parameters=MinimalFCParameters())
def get_ts_features(X: Union[np.ndarray, torch.Tensor],
                    y: Union[None, np.ndarray, torch.Tensor] = None,
                    features: Union[str, dict] = 'min',
                    n_jobs: Optional[int] = None,
                    **kwargs):
    """
    Extract tsfresh features for every sample in X, optionally appending targets.

    Args:
        X: np.array or torch.Tensor of shape [samples, dimensions, timesteps].
        y: Not required for unlabeled data. Otherwise, you need to pass it.
        features: 'min', 'efficient', 'all', or a dictionary. Be aware that
            'efficient' and 'all' may require substantial memory and time.
        n_jobs: worker count for tsfresh; defaults to all available cpus.
        **kwargs: forwarded to tsfresh.extract_features; may carry an explicit
            'default_fc_parameters' which then takes precedence over `features`.

    Returns:
        pandas.DataFrame with one row per sample; 'target' column(s)
        appended when y is given.
    """
    df = to_tsfresh_df(X)
    n_jobs = ifnone(n_jobs, defaults.cpus)

    if 'default_fc_parameters' in kwargs:
        # BUG FIX: this branch previously executed
        # `default_fc_parameters = default_fc_parameters`, reading an
        # undefined local (NameError) and leaving the key in kwargs so it
        # would have been passed twice below. Pop it instead.
        default_fc_parameters = kwargs.pop('default_fc_parameters')
    elif features == 'min':
        default_fc_parameters = MinimalFCParameters()
    elif features == 'efficient':
        default_fc_parameters = EfficientFCParameters()
    elif features == 'all':
        default_fc_parameters = ComprehensiveFCParameters()
    else:
        default_fc_parameters = None

    df = tsfresh.extract_features(df, column_id="id", n_jobs=n_jobs,
                                  default_fc_parameters=default_fc_parameters,
                                  **kwargs)
    if y is not None:
        # Normalize to 2-D so single- and multi-target cases share one loop.
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        for i in range(y.shape[-1]):
            df['target' if y.shape[-1] == 1 else f'target_{i}'] = y[:, i]
    return df
def _get_extraction_params(self):
    """Helper function to set default parameters from tsfresh.

    Returns
    -------
    dict
        Keyword arguments ready to pass to tsfresh's feature extraction,
        with user-set attributes overriding the tsfresh defaults.
    """
    # make n_jobs compatible with scikit-learn
    self.n_jobs = check_n_jobs(self.n_jobs)

    # lazy imports to avoid hard dependency
    from tsfresh.defaults import CHUNKSIZE
    from tsfresh.defaults import DISABLE_PROGRESSBAR
    from tsfresh.defaults import N_PROCESSES
    from tsfresh.defaults import PROFILING
    from tsfresh.defaults import PROFILING_FILENAME
    from tsfresh.defaults import PROFILING_SORTING
    from tsfresh.defaults import SHOW_WARNINGS
    from tsfresh.feature_extraction.settings import ComprehensiveFCParameters
    from tsfresh.feature_extraction.settings import EfficientFCParameters
    from tsfresh.feature_extraction.settings import MinimalFCParameters
    from tsfresh.utilities.dataframe_functions import impute

    # Set defaults from tsfresh
    extraction_params = {
        "kind_to_fc_parameters": self.kind_to_fc_parameters,
        "n_jobs": N_PROCESSES,
        "chunksize": CHUNKSIZE,
        "show_warnings": SHOW_WARNINGS,
        "disable_progressbar": DISABLE_PROGRESSBAR,
        "impute_function": impute,
        "profiling_sorting": PROFILING_SORTING,
        "profiling_filename": PROFILING_FILENAME,
        "profile": PROFILING,
    }

    # Replace defaults with user defined parameters; None means "keep default".
    for name in extraction_params.keys():
        if hasattr(self, name):
            value = getattr(self, name)
            if value is not None:
                extraction_params[name] = value

    # Convert convenience string arguments to tsfresh parameter objects.
    fc_param_lookup = {
        "minimal": MinimalFCParameters(),
        "efficient": EfficientFCParameters(),
        "comprehensive": ComprehensiveFCParameters(),
    }
    if isinstance(self.default_fc_parameters, str):
        if self.default_fc_parameters not in fc_param_lookup:
            # BUG FIX: the message used to interpolate dict_keys(...);
            # render the valid options as a plain list instead.
            raise ValueError(
                f"If `default_fc_parameters` is passed as a string, it must "
                f"be one of {list(fc_param_lookup)}, but found: "
                f"{self.default_fc_parameters}"
            )
        fc_parameters = fc_param_lookup[self.default_fc_parameters]
    else:
        # Already a tsfresh parameter object (or None): pass through as-is.
        fc_parameters = self.default_fc_parameters
    extraction_params["default_fc_parameters"] = fc_parameters

    return extraction_params
def test_all_minimal_features_in(self):
    """Every expected calculator must be present in the minimal set."""
    minimal = MinimalFCParameters()
    expected = ("mean", "median", "minimum", "maximum",
                "length", "sum_values", "standard_deviation", "variance")
    for calculator_name in expected:
        self.assertIn(calculator_name, minimal)
def test_param_sim_summ():
    """End-to-end check of the chunked dask graph: parameter sampling,
    simulation, summary statistics, fixed-data mean and distances."""
    # Latin-hypercube design over the parameter bounds; 10 design points.
    lhd = LatinHypercube(dmin, dmax)
    n_points = 10
    lhd.generate_array(n_points)
    # Summary statistics: tsfresh minimal feature set per trajectory.
    summ = lambda x: generate_tsfresh_features(x, MinimalFCParameters())
    graph_dict = core.get_graph_chunked(param_func=lhd.draw,
                                        sim_func=simulator2,
                                        summaries_func=summ,
                                        batch_size=n_points,
                                        chunk_size=2)
    # 10 points in chunks of 2 -> 5 delayed chunks per collection.
    assert len(
        graph_dict["parameters"]) == 5, "Core test failed, dimensions mismatch"
    assert len(graph_dict["trajectories"]
               ) == 5, "Core test failed, dimensions mismatch"
    assert len(
        graph_dict["summarystats"]) == 5, "Core test failed, expected None"
    params, sim, summaries = dask.compute(graph_dict["parameters"],
                                          graph_dict["trajectories"],
                                          graph_dict["summarystats"])
    sim = np.asarray(sim)
    params = np.asarray(params)
    summaries = np.asarray(summaries)
    # Leading axes follow the chunk layout: (n_chunks, chunk_size, ...).
    assert params.shape == (5, 2, 5), "Core test failed, dimensions mismatch"
    assert sim.shape == (5, 2, 1, 2,
                         101), "Core test failed, dimensions mismatch"
    assert summaries.shape == (5, 2, 1,
                               16), "Core test failed, dimensions mismatch"
    # Fixed/observed data: 10 simulator runs at the reference parameters.
    fixed_data = np.asarray([simulator2(bound) for p in range(10)])
    print(fixed_data.shape)
    fixed_data = fixed_data.reshape(10, 2, 101)
    fixed_mean = core.get_fixed_mean(fixed_data, summ, chunk_size=2)
    m, = dask.compute(fixed_mean)
    m = np.asarray(m)
    assert m.shape == (1, 16), "Core test failed, dimensions mismatch"
    # Distance of every summarized chunk to the fixed-data mean.
    dist_class = ns.NaiveSquaredDistance()
    dist_func = lambda x: dist_class.compute(x, m)
    dist = core.get_distance(dist_func, graph_dict["summarystats"])
    assert len(dist) == 5, "Core test failed, dimesnion mismatch"
    dist_res, = dask.compute(dist)
    dist_res = np.asarray(dist_res)
    assert dist_res.shape == (5, 2, 1,
                              16), "Core test failed, dimension mismatch"
def test_extraction_runs_through(self):
    """Minimal parameters extract exactly the expected columns per kind."""
    minimal_settings = MinimalFCParameters()
    frame = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]],
                         columns=["id", "time", "kind", "value"])
    result = extract_features(frame,
                              default_fc_parameters=minimal_settings,
                              column_kind="kind", column_value="value",
                              column_sort="time", column_id="id")
    # One column per minimal calculator, all for kind "0".
    expected_columns = ["0__{}".format(calc) for calc in
                        ("median", "standard_deviation", "sum_values",
                         "maximum", "variance", "minimum", "mean", "length")]
    six.assertCountEqual(self, result.columns, expected_columns)
    # One row per id.
    six.assertCountEqual(self, result.index, [0, 1])
def test_feature_extraction(self):
    """Chunked dask extraction can be pivoted back into a wide table."""
    raw = pd.DataFrame({"my_id": [1, 1, 1, 2, 2, 2],
                        "my_kind": ["a"] * 6,
                        "my_value": [1, 2, 3, 4, 5, 6]})
    ddf = dd.from_pandas(raw, chunksize=3)
    grouped = ddf.groupby(["my_id", "my_kind"])
    long_features = dask_feature_extraction_on_chunk(
        grouped,
        column_id="my_id",
        column_kind="my_kind",
        column_value="my_value",
        column_sort=None,
        default_fc_parameters=MinimalFCParameters())
    # pivot_table needs a categorical 'variable' column in dask.
    long_features = long_features.categorize(columns=["variable"])
    long_features = long_features.reset_index(drop=True)
    wide = long_features.pivot_table(index="my_id", columns="variable",
                                     values="value", aggfunc="sum")
    wide = wide.compute()
    # One column per minimal feature, one row per id.
    self.assertEqual(len(wide.columns), len(MinimalFCParameters()))
    self.assertEqual(len(wide), 2)
def test_feature_extraction(self):
    """Long-format result: one (my_id, variable, value) row per feature."""
    source = pd.DataFrame({"my_id": [1, 1, 1, 2, 2, 2],
                           "my_kind": ["a"] * 6,
                           "my_value": [1, 2, 3, 4, 5, 6]})
    grouped = dd.from_pandas(source, chunksize=3).groupby(["my_id", "my_kind"])
    result = dask_feature_extraction_on_chunk(
        grouped,
        column_id="my_id",
        column_kind="my_kind",
        column_value="my_value",
        column_sort=None,
        default_fc_parameters=MinimalFCParameters())
    result = result.compute()
    self.assertEqual(list(sorted(result.columns)),
                     ["my_id", "value", "variable"])
    # Two ids, one row per minimal feature each.
    self.assertEqual(len(result), 2 * len(MinimalFCParameters()))
def extract_sub_window(df_x, y, window, start_index, lag,
                       fc_parameters=None, n_jobs=-1):
    """Extract relevant tsfresh features for one rolling sub-window.

    Keyword arguments:
    fc_parameters -- tsfresh parameter set; defaults to MinimalFCParameters()
    n_jobs -- worker count; -1 means all available cpus

    Returns: pandas.DataFrame of features suffixed with the window bounds.
    """
    from tsfresh import extract_relevant_features

    # BUG FIX: the default was `fc_parameters=MinimalFCParameters()`, a
    # mutable default argument evaluated once at import time and shared
    # between all calls. Use the None sentinel instead.
    if fc_parameters is None:
        fc_parameters = MinimalFCParameters()

    window_start, window_end = window
    sub_df_x = get_rolling_timeseries(df_x, start_index, lag,
                                      window_end - window_start)
    if n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()

    print('Remove non target values...')
    # Drop targets that have no complete window yet.
    y = y.iloc[start_index + lag:]

    print('Extracting features...')
    features = extract_relevant_features(sub_df_x, y,
                                         column_id="window_id",
                                         column_sort="timestamp",
                                         column_value=None,
                                         default_fc_parameters=fc_parameters,
                                         n_jobs=n_jobs)
    # Suffix columns with the window bounds so multi-window merges stay unique.
    features = features.add_suffix(f"_{window_start}_{window_end}")
    return features
def extract_sub_windows(df_x, df_y, window_array, lag,
                        fc_parameters=None, n_jobs=-1):
    """Extract features for several sub-windows and merge them on the index.

    Keyword arguments:
    window_array -- window specs as "start-end" strings
    fc_parameters -- tsfresh parameter set; defaults to MinimalFCParameters()

    Returns: pandas.DataFrame of all windows' features, inner-joined on index.
    """
    # BUG FIX: the default was `fc_parameters=MinimalFCParameters()`, a
    # mutable default argument shared between all calls.
    if fc_parameters is None:
        fc_parameters = MinimalFCParameters()

    # Re-index time with a simple running counter.
    df_x['timestamp'] = list(range(len(df_x)))

    # Parse "start-end" specs into an (n_windows, 2) int array.
    windows = np.array([list(map(int, spec.split("-")))
                        for spec in window_array])
    max_end = max(windows[:, 1])

    # Targets only exist once the largest window plus lag has passed.
    y = df_y.iloc[max_end + lag:]
    y = y.reset_index(drop=True)
    y.index.name = 'window_id'

    features = [
        extract_sub_window(df_x.copy(), y.copy(), window,
                           max_end - (window[1] - window[0]),
                           lag, fc_parameters, n_jobs)
        for window in windows]

    features = reduce(lambda left, right: pd.merge(left, right,
                                                   left_index=True,
                                                   right_index=True,
                                                   how='inner'),
                      features)
    # BUG FIX: the merged frame was computed but never returned.
    return features
def _extract_features(self, devices, trial_id):
    """Extract (and cache on disk) tsfresh features for one trial.

    Parameters:
    devices -- sequence of device dataframes; devices[0] is the wrist device
    trial_id -- identifier used in the cache file name

    Returns: pandas.DataFrame of extracted (imputed) features.
    """
    # Cache file name encodes trial, window size, feature set and motion mode.
    # NOTE: self.motion may be True, False, or the string 'only', so the
    # explicit `== True` comparison is intentional.
    if self.motion == True:
        pickle_path = FEATURE_CACHE + 'X{}-{}-{}.pickle'.format(
            trial_id, self.window_size, self.feature_type)
    elif self.motion == 'only':
        pickle_path = FEATURE_CACHE + 'X{}-{}-{}-motion-only.pickle'.format(
            trial_id, self.window_size, self.feature_type)
    else:
        pickle_path = FEATURE_CACHE + 'X{}-{}-{}-no-motion.pickle'.format(
            trial_id, self.window_size, self.feature_type)

    # Cache hit: return the previously extracted features.
    if os.path.isfile(pickle_path):
        # FIX: context manager closes the handle; the previous
        # pickle.load(open(...)) leaked the file object.
        with open(pickle_path, "rb") as fh:
            return pickle.load(fh)

    wrist_device = devices[0]
    if self.motion == True:
        input_columns = ['red', 'ir', 'gyro', 'accel']
    elif self.motion == 'only':
        input_columns = ['gyro', 'accel']
    else:
        input_columns = ['red', 'ir']
    X_raw = wrist_device[input_columns]
    X_windowed = self._windowize_tsfresh(X_raw)

    if self.feature_type == 'efficient':
        features = EfficientFCParameters()
    elif self.feature_type == 'comprehensive':
        features = ComprehensiveFCParameters()
    elif self.feature_type == 'minimal':
        features = MinimalFCParameters()
    else:
        raise RuntimeError("Invalid feature type")

    print("Extracting features for trial " + str(trial_id))
    X = extract_features(X_windowed, column_id='id', column_sort='time',
                         n_jobs=N_JOBS, default_fc_parameters=features)
    impute(X)  # in-place: replaces NaN/inf produced by extraction

    # FIX: context manager for the cache write as well.
    with open(pickle_path, "wb") as fh:
        pickle.dump(X, fh)
    return X
def __init__(self, features='minimal', corrcoef=False, use_logger=False):
    """Initialize the tsfresh-based summary-statistics object.

    Keyword arguments:
    features -- 'minimal', 'full', or a tsfresh parameter dict/object
    corrcoef -- whether to also compute correlation coefficients
    use_logger -- forwarded to the base class
    """
    self.name = 'SummariesTSFRESH'
    super(SummariesTSFRESH, self).__init__(self.name, use_logger=use_logger)
    # Idiom fix: isinstance instead of `type(features) is str` (also
    # accepts str subclasses).
    if isinstance(features, str):
        allowed_str = ['minimal', 'full']
        assert features in allowed_str, "{0} is not recognized, supported sets are 'minimal' and 'full'".format(
            features)
        if features == 'minimal':
            self.features = MinimalFCParameters()
            # 'length' is constant for equal-length trajectories -> drop it.
            self.features.pop('length')
        else:
            self.features = EfficientFCParameters()
    else:
        # Caller supplied a tsfresh parameter set directly.
        self.features = features
    self.corrcoef = corrcoef
    self.summaries_names = _get_tsfresh_features_names(self.features)
import os

import h5py
import numpy as np
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction.settings import MinimalFCParameters

if __name__ == "__main__":
    # BUG FIX: numpy and pandas were used below but never imported,
    # which raised NameError at runtime. Also fixed the "chunck" typos.
    savefile = "tsfresh_features.h5"
    datafile = "simSeriesData.h5"
    chunk_length = 100  # series processed per pass (bounds peak memory)

    with h5py.File(datafile, "r") as f:
        n = f["deltas"].shape[0] // chunk_length
        for i in range(n):
            print(f"Chunk: {i}")
            # Repeat the shared time axis once per series in the chunk.
            ts = np.tile(f["t"], chunk_length)
            # Flatten this chunk's series into one long value column.
            deltas = np.reshape(
                f["deltas"][i * chunk_length:(i + 1) * chunk_length], (-1,))
            # One id per series, repeated for every time point.
            ids = np.repeat(range(i * chunk_length, (i + 1) * chunk_length),
                            f["deltas"].shape[1])
            data = pd.DataFrame({'id': ids, 'time': ts, 'y': deltas})
            features = extract_features(data, column_id="id",
                                        column_sort="time",
                                        default_fc_parameters=MinimalFCParameters(),
                                        n_jobs=15)
            # Append each chunk's features to the same HDF5 table.
            features.to_hdf(savefile, 'table', mode='a', complevel=9,
                            complib='zlib', format='table', append=True)
import seaborn as sns
import pandas as pd
from tsfresh.feature_extraction import extract_features, feature_calculators
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters
from tsfresh.feature_extraction.settings import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from sklearn.ensemble import AdaBoostRegressor
from tsfresh.utilities.dataframe_functions import impute
import warnings

# NOTE(review): silencing all warnings globally also hides tsfresh/sklearn
# deprecation notices -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Minimal feature set keeps extraction fast; swap for
# ComprehensiveFCParameters() to get the full set.
settings_time = MinimalFCParameters()


def feature_extraction(df):
    """Extract tsfresh features from a long-format frame.

    Keyword arguments:
    df -- frame with 'id' and 'value' columns as expected by tsfresh

    Returns: pandas.DataFrame of extracted features, one row per id.
    """
    # Cleanup: removed the redundant function-local `import pandas as pd`
    # (already imported at module level) and dead commented-out code.
    X_tsfresh = extract_features(df, column_id="id", column_value="value",
                                 default_fc_parameters=settings_time)
    return X_tsfresh
# Prior: uniform over [0.5 * theta_true, 2.0 * theta_true].
true_params = np.array(bound)
dmin = true_params * 0.5
dmax = true_params * 2.0
uni_prior = uniform_prior.UniformPrior(dmin, dmax)
# "Observed" data: 100 stochastic trajectories of the toggle-switch model.
fixed_data = toggle_model.run(solver=NumPySSASolver,
                              number_of_trajectories=100,
                              show_labels=False)
# reshape data to (N,S,T)
fixed_data = np.asarray([x.T for x in fixed_data])
# and remove timepoints
fixed_data = fixed_data[:, 1:, :]
# Summary statistics: tsfresh minimal feature set per trajectory.
summ_func = lambda x: fe.generate_tsfresh_features(x, MinimalFCParameters())
ns = naive_squared.NaiveSquaredDistance()


def test_abc_functional():
    """End-to-end smoke test of the ABC inference pipeline."""
    abc = ABC(fixed_data, sim=simulator2, prior_function=uni_prior,
              summaries_function=summ_func, distance_function=ns)
    # Pre-compute summary mean of the fixed data, chunked by 2.
    abc.compute_fixed_mean(chunk_size=2)
    # run in multiprocessing mode
    res = abc.infer(num_samples=30, batch_size=10, chunk_size=2)