def _get_extraction_params(self):
    """Assemble the keyword arguments handed to tsfresh feature extraction.

    Starts from tsfresh's documented defaults, overrides each entry with the
    matching non-None attribute set on this estimator, and resolves the
    ``default_fc_parameters`` convenience strings ('minimal', 'efficient',
    'comprehensive') to tsfresh settings objects.

    Returns
    -------
    dict
        Keyword arguments suitable for tsfresh's ``extract_features``.
    """
    # make n_jobs compatible with scikit-learn
    self.n_jobs = check_n_jobs(self.n_jobs)

    # tsfresh is imported lazily so it remains a soft dependency
    from tsfresh.defaults import CHUNKSIZE
    from tsfresh.defaults import DISABLE_PROGRESSBAR
    from tsfresh.defaults import N_PROCESSES
    from tsfresh.defaults import PROFILING
    from tsfresh.defaults import PROFILING_FILENAME
    from tsfresh.defaults import PROFILING_SORTING
    from tsfresh.defaults import SHOW_WARNINGS
    from tsfresh.feature_extraction.settings import ComprehensiveFCParameters
    from tsfresh.feature_extraction.settings import EfficientFCParameters
    from tsfresh.feature_extraction.settings import MinimalFCParameters
    from tsfresh.utilities.dataframe_functions import impute

    # tsfresh defaults first ...
    extraction_params = {
        "kind_to_fc_parameters": self.kind_to_fc_parameters,
        "n_jobs": N_PROCESSES,
        "chunksize": CHUNKSIZE,
        "show_warnings": SHOW_WARNINGS,
        "disable_progressbar": DISABLE_PROGRESSBAR,
        "impute_function": impute,
        "profiling_sorting": PROFILING_SORTING,
        "profiling_filename": PROFILING_FILENAME,
        "profile": PROFILING,
    }

    # ... then any non-None estimator attribute of the same name wins
    extraction_params.update({
        key: getattr(self, key)
        for key in extraction_params
        if getattr(self, key, None) is not None
    })

    # map the convenience strings onto tsfresh parameter objects
    fc_param_lookup = {
        "minimal": MinimalFCParameters(),
        "efficient": EfficientFCParameters(),
        "comprehensive": ComprehensiveFCParameters(),
    }
    fc_parameters = self.default_fc_parameters
    if isinstance(fc_parameters, str):
        if fc_parameters not in fc_param_lookup:
            raise ValueError(
                f"If `default_fc_parameters` is passed as a string, "
                f"it must be one of {fc_param_lookup.keys()}, but found: "
                f"{self.default_fc_parameters}"
            )
        fc_parameters = fc_param_lookup[fc_parameters]
    extraction_params["default_fc_parameters"] = fc_parameters
    return extraction_params
def extractFeatures(dataSetToExtractFrom, feature_settings="minimal"):
    """
    Extracts features of the given dataset and returns a new dataset of features only.

    Keyword arguments:
    dataSetToExtractFrom -- Dataset (type: pandas.core.frame.DataFrame)
    feature_settings -- Feature extraction parameter
                        (type: string, options: 'minimal', 'maximal', 'findBest');
                        any other value falls back to the minimal set with a warning.

    Returns: pandas.core.frame.DataFrame
    """
    # meta columns are not sensor signals, so they are excluded from extraction
    dataset_for_extraction = dataSetToExtractFrom.drop(
        columns=['label', 'hand', 'annotator'])

    settings_lookup = {
        "minimal": MinimalFCParameters,
        "maximal": ComprehensiveFCParameters,
        "findBest": EfficientFCParameters,
    }
    if feature_settings in settings_lookup:
        extractedFeatures = settings_lookup[feature_settings]()
    else:
        # unknown option: warn and use the cheapest feature set
        extractedFeatures = MinimalFCParameters()
        print('Given value for feature_parameter not valid! Minimal feature set is used instead.')

    return extract_features(dataset_for_extraction,
                            column_id="punch_id",
                            column_sort="timestamp",
                            impute_function=impute,
                            default_fc_parameters=extractedFeatures)
def get_ts_features(X: Union[np.ndarray, torch.Tensor],
                    y: Union[None, np.ndarray, torch.Tensor] = None,
                    features: Union[str, dict] = 'min',
                    n_jobs: Optional[int] = None,
                    **kwargs):
    """
    Compute tsfresh features for a batch of time series.

    Args:
        X: np.array or torch.Tensor of shape [samples, dimensions, timesteps].
        y: Not required for unlabeled data. Otherwise, you need to pass it.
        features: 'min', 'efficient', 'all', or a dictionary. Be aware that
            'efficient' and 'all' may require substantial memory and time.
        n_jobs: number of parallel workers; defaults to the library's CPU default.
        **kwargs: forwarded to ``tsfresh.extract_features``; may include
            ``default_fc_parameters`` to override ``features`` entirely.

    Returns:
        DataFrame of extracted features, with 'target'/'target_i' columns
        appended when ``y`` is given.
    """
    df = to_tsfresh_df(X)
    n_jobs = ifnone(n_jobs, defaults.cpus)
    if 'default_fc_parameters' in kwargs:
        # BUG FIX: the original did `default_fc_parameters = default_fc_parameters`,
        # which reads an unbound local (NameError), and it also left the key inside
        # kwargs so extract_features would receive it twice. Pop it out instead.
        default_fc_parameters = kwargs.pop('default_fc_parameters')
    elif features == 'min':
        default_fc_parameters = MinimalFCParameters()
    elif features == 'efficient':
        default_fc_parameters = EfficientFCParameters()
    elif features == 'all':
        default_fc_parameters = ComprehensiveFCParameters()
    else:
        # unknown string: let tsfresh use its own default parameter set
        default_fc_parameters = None
    df = tsfresh.extract_features(df, column_id="id", n_jobs=n_jobs,
                                  default_fc_parameters=default_fc_parameters,
                                  **kwargs)
    if y is not None:
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        for i in range(y.shape[-1]):
            # single target keeps the plain 'target' name for backward compatibility
            df['target' if y.shape[-1] == 1 else f'target_{i}'] = y[:, i]
    return df
def test_extraction_runs_through(self):
    """Smoke test: extraction with EfficientFCParameters yields one row per id."""
    params = EfficientFCParameters()
    frame = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]],
                         columns=["id", "time", "kind", "value"])
    result = extract_features(frame,
                              default_fc_parameters=params,
                              column_kind="kind",
                              column_value="value",
                              column_sort="time",
                              column_id="id")
    # both ids must appear in the index, order irrelevant
    six.assertCountEqual(self, result.index, [0, 1])
def __init__(self, features='minimal', corrcoef=False, use_logger=False):
    """Configure the TSFRESH summary-statistics computer.

    Parameters
    ----------
    features : str or dict
        'minimal' or 'full' to pick a predefined TSFRESH parameter set,
        or a tsfresh-style parameter mapping used as-is.
    corrcoef : bool
        Whether pairwise correlation summaries should be computed as well.
    use_logger : bool
        Forwarded to the SummaryBase constructor.
    """
    self.name = 'SummariesTSFRESH'
    super(SummariesTSFRESH, self).__init__(self.name, use_logger=use_logger)
    # FIX: isinstance instead of `type(features) is str`, so str subclasses
    # are accepted too (idiomatic and backward compatible).
    if isinstance(features, str):
        allowed_str = ['minimal', 'full']
        assert features in allowed_str, "{0} is not recognized, supported sets are 'minimal' and 'full'".format(
            features)
        if features == 'minimal':
            self.features = MinimalFCParameters()
            # 'length' is removed from the minimal set
            self.features.pop('length')
        else:
            self.features = EfficientFCParameters()
    else:
        # assumed to be a tsfresh parameter mapping -- used unchanged
        self.features = features
    self.corrcoef = corrcoef
    self.summaries_names = _get_tsfresh_features_names(self.features)
def test_contains_all_non_high_comp_cost_features(self):
    """
    Test that by default a EfficientFCParameters object should be set up
    to calculate all features defined in
    tsfresh.feature_extraction.feature_calculators that do not have the
    attribute "high_comp_cost"
    """
    params = EfficientFCParameters()
    expected_calculators = [
        name
        for name, func in feature_calculators.__dict__.items()
        if hasattr(func, "fctype") and not hasattr(func, "high_comp_cost")
    ]
    for name in expected_calculators:
        self.assertIn(
            name, params,
            msg='Default EfficientFCParameters object does not setup calculation of {}'
            .format(name))
def get_potential_auxiliary_tasks(x, y, n_jobs=4):
    """Fit a tsfresh RelevantFeatureAugmenter on the given series.

    Builds the tsfresh long-format container and the transform frame from
    ``x``, fits the augmenter against ``y``, and returns the fitted augmenter.
    """
    timeseries_df = collect_tsfresh_dataframe(x)
    transform_df = make_tsfresh_transform_dataframe(x)
    augmenter = RelevantFeatureAugmenter(
        column_id="id",
        column_value="value",
        n_jobs=n_jobs,
        filter_only_tsfresh_features=True,
        show_warnings=False,
        # EfficientFCParameters is chosen for computational performance;
        # ComprehensiveFCParameters may be used when performance is not
        # critical (it was used in "Distantly Supervised Multitask Learning
        # in Critical Care").
        default_fc_parameters=EfficientFCParameters())
    augmenter.set_timeseries_container(timeseries_df)
    augmenter.fit(X=transform_df, y=y)
    return augmenter
def _extract_features(self, devices, trial_id):
    """Extract (and pickle-cache) tsfresh features for one trial.

    Parameters
    ----------
    devices : sequence
        Per-device data; only the first entry (wrist device) is used.
    trial_id
        Identifier embedded in the cache file name.

    Returns
    -------
    DataFrame of extracted, imputed features (loaded from cache when present).
    """
    # cache file name encodes trial, window size, feature set and motion mode;
    # self.motion may be True, False or the string 'only', hence `== True`
    if self.motion == True:  # noqa: E712
        pickle_path = FEATURE_CACHE + 'X{}-{}-{}.pickle'.format(
            trial_id, self.window_size, self.feature_type)
    elif self.motion == 'only':
        pickle_path = FEATURE_CACHE + 'X{}-{}-{}-motion-only.pickle'.format(
            trial_id, self.window_size, self.feature_type)
    else:
        pickle_path = FEATURE_CACHE + 'X{}-{}-{}-no-motion.pickle'.format(
            trial_id, self.window_size, self.feature_type)

    if os.path.isfile(pickle_path):
        # FIX: close the cache file deterministically; the original
        # `pickle.load(open(...))` leaked the file handle.
        # NOTE(review): unpickling is only safe here because the cache file is
        # produced by this method below -- do not point it at untrusted data.
        with open(pickle_path, "rb") as cache_file:
            return pickle.load(cache_file)

    wrist_device = devices[0]
    if self.motion == True:  # noqa: E712
        input_columns = ['red', 'ir', 'gyro', 'accel']
    elif self.motion == 'only':
        input_columns = ['gyro', 'accel']
    else:
        input_columns = ['red', 'ir']
    X_raw = wrist_device[input_columns]
    X_windowed = self._windowize_tsfresh(X_raw)

    if self.feature_type == 'efficient':
        features = EfficientFCParameters()
    elif self.feature_type == 'comprehensive':
        features = ComprehensiveFCParameters()
    elif self.feature_type == 'minimal':
        features = MinimalFCParameters()
    else:
        raise RuntimeError("Invalid feature type")

    print("Extracting features for trial " + str(trial_id))
    X = extract_features(X_windowed, column_id='id', column_sort='time',
                         n_jobs=N_JOBS, default_fc_parameters=features)
    impute(X)  # tsfresh in-place imputation of NaN/inf feature values
    # FIX: write the cache through a context manager; the original
    # `pickle.dump(X, open(..., "wb"))` leaked the file handle.
    with open(pickle_path, "wb") as cache_file:
        pickle.dump(X, cache_file)
    return X
def split_into_train_test_out_tsfresh(data, in_num):
    """Build a tsfresh forecasting split from a 1-D price series.

    Rolls the series by one step, builds tsfresh's forecasting frame with a
    window of ``in_num``, extracts features, keeps a fixed hand-picked subset
    of feature columns, and splits off the last row as the test sample.

    Returns
    -------
    (x_train, y_train, x_test)
    """
    rolled = np.roll(data, -1)  # roll the data once

    # build the (id, time, value) forecasting frame via tsfresh
    frame, y_train = make_forecasting_frame(rolled,
                                            kind="price",
                                            max_timeshift=in_num,
                                            rolling_direction=1)

    # extract the full efficient feature set for every window
    extracted = extract_features(frame,
                                 column_id="id",
                                 column_sort="time",
                                 column_value="value",
                                 impute_function=impute,
                                 show_warnings=False,
                                 disable_progressbar=False,
                                 n_jobs=5,
                                 chunksize=1,
                                 default_fc_parameters=EfficientFCParameters())

    # fixed subset of tsfresh output columns kept for modelling
    selected_columns = [
        'value__agg_autocorrelation__f_agg_"mean"',
        'value__agg_autocorrelation__f_agg_"median"',
        'value__agg_autocorrelation__f_agg_"var"',
        'value__autocorrelation__lag_0',
        'value__autocorrelation__lag_1',
        'value__autocorrelation__lag_2',
        'value__binned_entropy__max_bins_10',
        'value__fft_aggregated__aggtype_"variance"',
        'value__fft_coefficient__coeff_0__attr_"abs"',
        'value__fft_coefficient__coeff_0__attr_"real"',
        'value__fft_coefficient__coeff_1__attr_"abs"',
        'value__fft_coefficient__coeff_1__attr_"angle"',
        'value__fft_coefficient__coeff_1__attr_"imag"',
        'value__fft_coefficient__coeff_1__attr_"real"',
        'value__first_location_of_maximum',
        'value__large_standard_deviation__r_0.05',
        'value__large_standard_deviation__r_0.1',
        'value__large_standard_deviation__r_0.15000000000000002',
        'value__large_standard_deviation__r_0.2',
        'value__large_standard_deviation__r_0.25',
        'value__linear_trend__attr_"intercept"',
        'value__linear_trend__attr_"pvalue"',
        'value__linear_trend__attr_"rvalue"',
        'value__linear_trend__attr_"slope"',
        'value__longest_strike_above_mean',
        'value__longest_strike_below_mean',
        'value__max_langevin_fixed_point__m_3__r_30',
        'value__maximum',
        'value__mean',
        'value__mean_abs_change',
        'value__mean_change',
        'value__median',
        'value__minimum',
        'value__number_cwt_peaks__n_5',
        'value__partial_autocorrelation__lag_0',
        'value__partial_autocorrelation__lag_1',
        'value__partial_autocorrelation__lag_2',
        'value__standard_deviation',
        'value__sum_values',
        'value__variance'
    ]
    selected = extracted[selected_columns]

    # last row is the out-of-sample test point, the rest is training data
    x_train = selected[:-1]
    x_test = selected[-1:]
    y_train = y_train[:-1]
    return x_train, y_train, x_test
def test_generate_tsfresh_features():
    """Smoke test for generate_tsfresh_features on 2-D input."""
    # NOTE(review): no assertion here -- this only verifies the call completes
    # without raising; the expected output shape is not pinned for 2-D input.
    samples = np.random.randn(2, 100)
    params = EfficientFCParameters()
    result = generate_tsfresh_features(samples, params)
import tensorflow as tf device_name = tf.test.gpu_device_name() if device_name != '/device:GPU:0': raise SystemError('GPU device not found') print('Found GPU at: {}'.format(device_name)) def extract_product_features(df,fc_parameter,destination): features_product = [] extraction_method = fc_parameter.__class__.__name__ for p in df.sitc_id.unique(): product = df[df.sitc_id==p] p_features = extract_features( product[["export_val","year","country"]], column_id="country", column_sort="year", column_value=None,column_kind=None, chunksize=None, default_fc_parameters=fc_parameter ) features_product.append(p_features) p_features.to_csv(f"{p}_{extraction_method}_expval.csv") print(f'Extracted features for {p}: \n {features_product}') product_features = pd.concat(features_product) return p_features %timeit destination_1 =f'{PATH}/efficient_parameters' destination_2 = f'{PATH}/comprehensive_parameters' fc_parameters=[EfficientFCParameters(),ComprehensiveFCParameters()] extract_product_features(trade_dframe,fc_parameters[0],destination_1) extract_product_features(trade_dframe,fc_parameters[1],destination_2)
features_data = np.column_stack((lab_ver_features, lab_hor_features)) # Create TSFresh features for each projected dimension, # and stack both dimensions horizontally: lab_ver_for_tsf = ts_fresh.convert_signals_for_ts_fresh( sub_lab_ver_proj, "ver") lab_ver_tsf_features = extract_features( lab_ver_for_tsf, default_fc_parameters=ComprehensiveFCParameters(), column_id="signal_id", column_sort="time") lab_hor_for_tsf = ts_fresh.convert_signals_for_ts_fresh( sub_lab_hor_proj, "hor") lab_hor_tsf_features = extract_features( lab_hor_for_tsf, default_fc_parameters=EfficientFCParameters(), column_id="signal_id", column_sort="time") features_data = pd.concat([ lab_ver_tsf_features, lab_hor_tsf_features, pd.DataFrame(lab_ver_features), pd.DataFrame(lab_hor_features) ], axis=1) ''' Prepare the data for the classification process: ''' def create_labels(symptom_name, tags_data, condition_vector, binarize=True): if symptom_name == 'tremor':
class SummariesTSFRESH(SummaryBase):
    """
    Class for computing features/statistics on time series data.
    An ensemble of different statistics from TSFRESH are supported.
    """

    def __init__(self, features='minimal', corrcoef=False, use_logger=False):
        """Configure which TSFRESH feature set (and correlations) to compute.

        Parameters
        ----------
        features : str or dict
            'minimal' or 'full' selects a predefined TSFRESH set; a
            tsfresh-style parameter mapping is used as-is.
        corrcoef : bool
            If True, pairwise Pearson correlations between species are
            appended to the summaries.
        use_logger : bool
            Forwarded to SummaryBase.
        """
        self.name = 'SummariesTSFRESH'
        super(SummariesTSFRESH, self).__init__(self.name, use_logger=use_logger)
        # FIX: isinstance instead of `type(features) is str` (idiomatic,
        # accepts str subclasses, behavior otherwise unchanged)
        if isinstance(features, str):
            allowed_str = ['minimal', 'full']
            assert features in allowed_str, "{0} is not recognized, supported sets are 'minimal' and 'full'".format(
                features)
            if features == 'minimal':
                self.features = MinimalFCParameters()
                # 'length' is removed from the minimal set
                self.features.pop('length')
            else:
                self.features = EfficientFCParameters()
        else:
            self.features = features
        self.corrcoef = corrcoef
        self.summaries_names = _get_tsfresh_features_names(self.features)

    def _compute_tsfresh(self, point):
        """
        Computes features for one point (time series).

        Parameters
        ----------
        point : ndarray
            trajectory of shape n_timepoints x 1

        Returns
        -------
        list
            list of generated features
        """
        return generate_tsfresh_features(point, features=self.features)

    def _compute_corrcoef(self, x, y):
        """
        Computes the Pearson correlation coefficient between two trajectories.

        Parameters
        ----------
        x : ndarray
            Trajectory of shape n_timepoints x 1
        y : ndarray
            Trajectory of shape n_timepoints x 1

        Returns
        -------
        list
            single-element list with the correlation coefficient
        """
        return [np.corrcoef(x, y)[0, 1]]

    def compute(self, point):
        """
        Compute mean TSFRESH summaries over a set of trajectories.

        Parameters
        ----------
        point : ndarray
            Array of shape (n_points, n_species, n_timepoints).

        Returns
        -------
        ndarray
            Row vector (1 x n_features) of features averaged over points;
            when ``corrcoef`` is enabled, mean pairwise species correlations
            are horizontally appended.
        """
        point = np.asarray(point)
        assert len(
            point.shape
        ) == 3, "required input shape is (n_points, n_species, n_timepoints)"
        tsfresh_summaries = self._compute_tsfresh(point)
        tsfresh_summaries = np.asarray(tsfresh_summaries)
        # average the per-point features into one summary row
        tsfresh_summaries = np.mean(tsfresh_summaries, axis=0, keepdims=True)
        if self.corrcoef:
            assert point.shape[
                1] > 1, "corrcoef = True can only be used if the n_species > 1"
            corrcoef_summaries = []
            n_species = range(point.shape[1])
            for n in point:
                corr = []
                # correlation for every unordered pair of species
                for s in combinations(n_species, 2):
                    x = n[s[0]]
                    y = n[s[1]]
                    corr.append(self._compute_corrcoef(x, y)[0])
                corrcoef_summaries.append(corr)
            corrcoef_summaries = np.asarray(corrcoef_summaries)
            corrcoef_summaries = np.mean(corrcoef_summaries,
                                         axis=0,
                                         keepdims=True)
            tot = np.hstack((tsfresh_summaries, corrcoef_summaries))
            return tot
        else:
            return tsfresh_summaries
def test_generate_tsfresh_features():
    """Two 2-species trajectories of 100 steps yield 2 rows of 1500 features."""
    trajectories = np.random.randn(2, 2, 100)
    params = EfficientFCParameters()
    result = generate_tsfresh_features(trajectories, params)
    assert result.shape == (2, 1500)