def get_feature_dict(feature_set=None):
    full_feature_dict = ComprehensiveFCParameters()
    simple_baseline_features = {
        key: None
        for key in [
            'abs_energy',
            'mean',
            'median',
            'minimum',
            'maximum',
            'standard_deviation',
        ]
    }
    distribution_features_dict = utils.distribution_features_tsfresh_dict()
    temporal_feature_dict = {
        key: full_feature_dict[key]
        for key in set(full_feature_dict) - set(distribution_features_dict)
    }
    no_entropy_features_dict = {
        key: value
        for key, value in full_feature_dict.items()
        if 'entropy' not in key
    }
    feature_dict = {
        'simple_baseline': simple_baseline_features,
        'distribution_features': distribution_features_dict,
        'temporal_features': temporal_feature_dict,
        'no_entropy': no_entropy_features_dict,
    }
    return feature_dict.get(feature_set, full_feature_dict)
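
# A minimal usage sketch for get_feature_dict above. It assumes the
# project-level `utils` module the function references is importable;
# the toy frame here is hypothetical.
import numpy as np
import pandas as pd
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute

toy = pd.DataFrame({
    'id': np.repeat([0, 1], 10),        # two series of ten points each
    'time': np.tile(np.arange(10), 2),
    'value': np.random.randn(20),
})
fc_params = get_feature_dict('simple_baseline')  # unknown names fall back to the full set
X_toy = extract_features(toy, column_id='id', column_sort='time',
                         default_fc_parameters=fc_params, impute_function=impute)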
def get_tsfresh(data):
    dataset = Dataset(data_array=data, data_labels=data, BATCH_SIZE=BATCH_SIZE)
    # Alternatives: EfficientFCParameters() or MinimalFCParameters()
    extraction_settings = ComprehensiveFCParameters()
    features_to_return = []
    start_time = time.time()
    eval_not_finished = 1
    while eval_not_finished != 0:
        data_batch, _ = dataset.get_batch_eval()
        batch_df = get_data_as_df(data_batch)
        X = extract_features(batch_df,
                             column_id='ids',
                             column_sort='time',
                             default_fc_parameters=extraction_settings,
                             impute_function=impute,
                             n_jobs=10)
        impute(X)
        features_batch = X.values
        features_to_return.append(features_batch)
        eval_not_finished = dataset.BATCH_COUNTER_EVAL
        if dataset.BATCH_COUNTER_EVAL % 100 == 0:
            time_usage = str(
                datetime.timedelta(seconds=int(round(time.time() - start_time))))
            print("it %i Time usage: %s" % (dataset.BATCH_COUNTER_EVAL, time_usage),
                  flush=True)
    features_to_return = np.concatenate(features_to_return)
    time_usage = str(
        datetime.timedelta(seconds=int(round(time.time() - start_time))))
    print("Total Time usage: %s\n" % time_usage, flush=True)
    return features_to_return
def _extract_tsfresh_features(self, X):
    X_df = self._convert_to_df(X)
    X_df_no_nans = X_df.dropna()
    if self.extraction_type == "minimal":
        extraction_setting = MinimalFCParameters()
    elif self.extraction_type == "efficient":
        extraction_setting = EfficientFCParameters()
    elif self.extraction_type == "all":
        extraction_setting = ComprehensiveFCParameters()
    else:
        raise ValueError(
            f"{self.extraction_type} is not a supported feature extraction option. "
            f"Please choose one of the following options: [minimal, efficient, all]."
        )
    # Extract time series features from the dataframe.
    # Replace any NaNs and infs in the extracted features with
    # median/extreme values for that column.
    tsfresh_features = extract_features(
        X_df_no_nans,
        default_fc_parameters=extraction_setting,
        column_id="id",
        column_sort="time",
        impute_function=impute,
    )
    # If X_df.dropna() dropped some observations entirely (i.e., all values
    # were NaN), impute each tsfresh feature for those observations with the
    # median of that feature's column.
    tsfresh_features_imputed = impute(
        tsfresh_features.reindex(pd.RangeIndex(X_df["id"].max() + 1)))
    return tsfresh_features_imputed, X_df
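
# The reindex-then-impute trick above in isolation. tsfresh's impute replaces
# NaN with the column median and +/-inf with the column max/min, so a row that
# dropna() removed entirely comes back filled with per-column medians.
import pandas as pd
from tsfresh.utilities.dataframe_functions import impute as tsfresh_impute

feats = pd.DataFrame({'f': [1.0, 3.0]}, index=[0, 2])    # id 1 was dropped as all-NaN
feats = tsfresh_impute(feats.reindex(pd.RangeIndex(3)))  # row 1 reappears as the median (2.0)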
def reshape_data_tsfresh(seq_dataset, n_classes, n_steps, settings):
    """Transform a dataset of sequences into a dataset of tsfresh features."""
    len_data = seq_dataset.shape[0]
    data_divided = []
    for i in range(n_classes):
        data_divided.append(seq_dataset[:, :, i].reshape(-1))
    to_extract = []
    for i in range(n_classes):
        ids = np.arange(len_data).repeat(n_steps)
        tmp = np.vstack((ids, data_divided[i])).T
        to_extract.append(pd.DataFrame(data=tmp, columns=["id", "value"]))
    tfs = []
    # Parameters of the tsfresh feature extraction
    if settings == "complete":
        settings = ComprehensiveFCParameters()
    elif settings == "efficient":
        settings = EfficientFCParameters()
    elif settings == "minimal":
        settings = MinimalFCParameters()
    for i in range(n_classes):
        tf = tsfresh.extract_features(
            to_extract[i], column_id="id", default_fc_parameters=settings
        )
        tfs.append(tf)
    data_feat = pd.concat(
        [tfs[i].reindex(tfs[0].index) for i in range(n_classes)], axis=1
    )
    print(data_feat.shape)
    data_feat.fillna(0, inplace=True)
    data_feat.replace([np.inf, -np.inf], 0, inplace=True)
    data_tensor = torch.from_numpy(data_feat.values).float()
    return data_tensor
def test_gen_global_feature_multi_id(self):
    dates = pd.date_range('1/1/2019', periods=8)
    data = np.random.randn(8, 3)
    df = pd.DataFrame({"datetime": dates,
                       "values": data[:, 0],
                       "A": data[:, 1],
                       "B": data[:, 2],
                       "id": ["00"] * 4 + ["01"] * 4})
    from tsfresh.feature_extraction import ComprehensiveFCParameters
    from tsfresh.feature_extraction import MinimalFCParameters
    from tsfresh.feature_extraction import EfficientFCParameters
    for params in [ComprehensiveFCParameters(),
                   MinimalFCParameters(),
                   EfficientFCParameters()]:
        output_df, _ = generate_global_features(input_df=df,
                                                column_id="id",
                                                column_sort="datetime",
                                                default_fc_parameters=params)
        assert "datetime" in output_df.columns
        assert "values" in output_df.columns
        assert "A" in output_df.columns
        assert "B" in output_df.columns
        assert "id" in output_df.columns
        for col in output_df.columns:
            if col in ["datetime", "values", "A", "B", "id"]:
                continue
            # Each generated global feature is constant within an id
            # and contains no NaNs.
            assert len(set(output_df[output_df["id"] == "00"][col])) == 1
            assert len(set(output_df[output_df["id"] == "01"][col])) == 1
            assert output_df[output_df["id"] == "00"][col].isna().sum() == 0
            assert output_df[output_df["id"] == "01"][col].isna().sum() == 0
def requires(self):
    settings = ComprehensiveFCParameters()
    for job in [0, 1, 4]:
        for time_series_length in [100, 500, 1000, 5000]:
            yield FullTimingTask(time_series_length=time_series_length,
                                 n_jobs=job,
                                 num_ids=10,
                                 random_seed=42)
            yield FullTimingTask(time_series_length=time_series_length,
                                 n_jobs=job,
                                 num_ids=100,
                                 random_seed=42)
            for feature_name in settings:
                yield TimingTask(
                    feature_parameter={feature_name: settings[feature_name]},
                    time_series_length=time_series_length,
                    n_jobs=job,
                    num_ids=100,
                    try_number=0,
                    random_seed=42
                )
                for try_number in range(3):
                    yield TimingTask(
                        feature_parameter={feature_name: settings[feature_name]},
                        n_jobs=job,
                        try_number=try_number,
                        num_ids=10,
                        time_series_length=time_series_length,
                        random_seed=42
                    )
def add_tsfresh_day(new_data, data, tsfresh_features, columns):
    # The dictionary containing the features that we want to extract
    # and the settings for those features
    if tsfresh_features == 'minimal':
        settings = MinimalFCParameters()
    elif tsfresh_features == 'efficient':
        settings = EfficientFCParameters()
    elif tsfresh_features == 'comprehensive':
        settings = ComprehensiveFCParameters()
    else:
        settings = MinimalFCParameters()
    for participant in range(len(data)):
        all_days = []
        for day in range(len(data[participant])):
            # We only take the columns that we are interested in
            sub_data = data[participant][day].loc[
                data[participant][day]['variable'].isin(columns)]
            # Drop all NaN values
            sub_data = sub_data.dropna(axis=0)
            # If a column is missing (or contained only NaNs), add a row
            # with that column name and a value of 0
            for col in columns:
                if col not in sub_data['variable'].values:
                    new_row = sub_data.iloc[0].copy(deep=True)
                    new_row['variable'] = col
                    new_row['value'] = 0
                    sub_data = sub_data.append(new_row)
            # Extract features for every variable still left in the dataframe
            extracted = extract_features(sub_data,
                                         default_fc_parameters=settings,
                                         column_id='variable',
                                         column_sort='time_seconds',
                                         column_value='value')
            # We do not want multiple rows in the case of multiple variables,
            # so we stack the frame into a single row and rename the columns
            # so that we know what kind of features they are
            extracted = extracted.stack()
            extracted.index = extracted.index.map('{0[1]}_{0[0]}_day'.format)
            extracted = extracted.to_frame().T
            # Add the extracted features to a list
            all_days.append(extracted)
        # Concat the days into a new dataframe and reset the index
        # to prevent conflicts
        all_days = pd.concat(all_days, axis=0).reset_index(drop=True)
        # Add the new features to the data
        new_data[participant] = pd.concat([new_data[participant], all_days], axis=1)
    return new_data
def tsfresh_extraction(X, y, config):
    n_jobs = config['SVM-config']['n_jobs']
    extraction_settings = ComprehensiveFCParameters()
    return extract_relevant_features(X, y,
                                     n_jobs=n_jobs,
                                     fdr_level=0.01,
                                     show_warnings=False,
                                     column_id='id',
                                     column_sort='time',
                                     default_fc_parameters=extraction_settings)
def testLocalTSFresh(self):
    robot_execution_failures.download_robot_execution_failures()
    df, y = robot_execution_failures.load_robot_execution_failures()
    dist = MarsDistributor()
    df = df.iloc[:200]
    extraction_settings = ComprehensiveFCParameters()
    extract_features(df, column_id='id', column_sort='time',
                     default_fc_parameters=extraction_settings,
                     # we impute = remove all NaN features automatically
                     impute_function=impute,
                     distributor=dist)
def get_features(y, relevant_features, data):
    sensor_data_list = dict_as_list(data)
    df = pd.DataFrame(sensor_data_list,
                      columns=['id', 'time', 'accx', 'accy', 'accz',
                               'gyrox', 'gyroy', 'gyroz'])
    extraction_settings = ComprehensiveFCParameters()
    if relevant_features:
        X = extract_relevant_features(df, y,
                                      column_id='id',
                                      column_sort='time',
                                      default_fc_parameters=extraction_settings)
    else:
        X = extract_features(df,
                             column_id='id',
                             column_sort='time',
                             default_fc_parameters=extraction_settings,
                             impute_function=impute)
    return X
def test_distributed_ts_fresh(setup):
    robot_execution_failures.download_robot_execution_failures()
    df, y = robot_execution_failures.load_robot_execution_failures()
    default_session = get_default_session()
    sync_session = new_session(default_session.address)
    dist = MarsDistributor(session=sync_session)
    df = df.iloc[:200].copy()
    extraction_settings = ComprehensiveFCParameters()
    extract_features(df, column_id='id', column_sort='time',
                     default_fc_parameters=extraction_settings,
                     # we impute = remove all NaN features automatically
                     impute_function=impute,
                     distributor=dist)
def gen_global_feature(self, settings="comprehensive", full_settings=None):
    '''
    Generate per-time-series features for each time series.
    This method is implemented with tsfresh.

    :param settings: str or dict. If a string is set, it must be one of
           "comprehensive", "minimal" and "efficient". If a dict is set,
           it should follow the instructions for default_fc_parameters
           in tsfresh. Defaults to "comprehensive".
    :param full_settings: dict. It should follow the instructions for
           kind_to_fc_parameters in tsfresh. Defaults to None.

    :return: the tsdataset instance.
    '''
    from tsfresh.feature_extraction import ComprehensiveFCParameters,\
        MinimalFCParameters, EfficientFCParameters
    default_params = {
        "comprehensive": ComprehensiveFCParameters(),
        "minimal": MinimalFCParameters(),
        "efficient": EfficientFCParameters()
    }
    if full_settings is not None:
        # generate_global_features returns (df, additional_feature); the
        # original code dropped the second element here, so the new feature
        # names were never recorded.
        self.df, additional_feature = generate_global_features(
            input_df=self.df,
            column_id=self.id_col,
            column_sort=self.dt_col,
            kind_to_fc_parameters=full_settings)
        self.feature_col += additional_feature
        return self
    if isinstance(settings, str):
        assert settings in ["comprehensive", "minimal", "efficient"], \
            f"settings str should be one of \"comprehensive\", \"minimal\", " \
            f"\"efficient\", but found {settings}."
        default_fc_parameters = default_params[settings]
    else:
        default_fc_parameters = settings
    self.df, additional_feature = generate_global_features(
        input_df=self.df,
        column_id=self.id_col,
        column_sort=self.dt_col,
        default_fc_parameters=default_fc_parameters)
    self.feature_col += additional_feature
    return self
def extract_features(self, ts, column_id='id', impute_function=impute,
                     default_fc_parameters=ComprehensiveFCParameters(),
                     show_warnings=False, profile=False):
    '''Extract all possible features from ts using tsfresh's
    extract_features method.'''
    return extract_features(ts,
                            column_id=column_id,
                            impute_function=impute_function,
                            default_fc_parameters=default_fc_parameters,
                            n_jobs=self.n_jobs,
                            show_warnings=show_warnings,
                            profile=profile)
def testDistributedTSFresh(self):
    robot_execution_failures.download_robot_execution_failures()
    df, y = robot_execution_failures.load_robot_execution_failures()
    service_ep = 'http://127.0.0.1:' + self.web_port
    with new_session(service_ep) as sess:
        dist = MarsDistributor(sess)
        df = df.iloc[:200]
        extraction_settings = ComprehensiveFCParameters()
        extract_features(df, column_id='id', column_sort='time',
                         default_fc_parameters=extraction_settings,
                         # we impute = remove all NaN features automatically
                         impute_function=impute,
                         distributor=dist)
def add_tsfresh_participant(data, tsfresh_features, columns, k):
    # The dictionary containing the features that we want to extract
    # and the settings for those features
    if tsfresh_features == 'minimal':
        settings = MinimalFCParameters()
    elif tsfresh_features == 'efficient':
        settings = EfficientFCParameters()
    elif tsfresh_features == 'comprehensive':
        settings = ComprehensiveFCParameters()
    else:
        settings = MinimalFCParameters()
    for participant in range(len(data)):
        # First we add the necessary columns
        data[participant]['id'] = 0
        data[participant]['index'] = data[participant].index
        # Create the rolled time series, which also creates new ids. Note
        # that setting max_timeshift to None would make the windows as long
        # as possible; here we cap them at k.
        rolled_series = roll_time_series(data[participant],
                                         column_id='id',
                                         column_sort='index',
                                         max_timeshift=k)
        all_features = []
        for column in columns:
            # Extract features for every element of the time series. This
            # returns a dataframe with the same number of rows as the
            # original dataframe but a different number of columns.
            extracted = extract_features(rolled_series,
                                         default_fc_parameters=settings,
                                         column_id='id',
                                         column_sort='index',
                                         column_value=column)
            # Reset the indexes, as they have been changed, and add the
            # result to our list of features
            all_features.append(extracted.reset_index(drop=True))
        # Add all the features together
        extracted = pd.concat(all_features, axis=1)
        # Drop the helper columns we created, because we do not want them in
        # the data (df.drop would work here as well)
        del data[participant]['id']
        del data[participant]['index']
        data[participant] = pd.concat([data[participant], extracted], axis=1)
    return data
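
# Toy illustration of roll_time_series as used above (output shape per the
# tsfresh documentation; in recent tsfresh versions each window is labelled
# with a composite id of the form (original_id, last_timestamp)).
import pandas as pd
from tsfresh.utilities.dataframe_functions import roll_time_series

toy = pd.DataFrame({'id': [0, 0, 0], 'index': [0, 1, 2], 'x': [1.0, 2.0, 3.0]})
rolled = roll_time_series(toy, column_id='id', column_sort='index', max_timeshift=1)
# One sub-series per original timestamp, each at most max_timeshift + 1 rows
# long, so extract_features(rolled, ...) yields one feature row per timestamp.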
def extractFeatures(rawData):
    print("\nSetting extraction settings")
    extraction_settings = ComprehensiveFCParameters()
    print("Before extracting features")
    X = extract_features(rawData,
                         column_id='id',
                         column_value=None,
                         column_kind=None,
                         impute_function=impute,
                         default_fc_parameters=extraction_settings)
    print("After extracting features")
    print("Number of extracted features: {}.".format(X.shape[1]))
    print("\nShape of X: ")
    print(X.shape)
    return X
def extract_features(data_windows: DataFrame,
                     features: List[Feature]) -> Dict[Feature, DataFrame]:
    # Build the settings once instead of instantiating
    # ComprehensiveFCParameters per key
    full_parameters = ComprehensiveFCParameters()
    settings = {
        str(feature.value).lower(): full_parameters[str(feature.value).lower()]
        for feature in features
    }
    extracted: DataFrame = tsfresh.extract_features(data_windows,
                                                    column_id="id",
                                                    default_fc_parameters=settings,
                                                    disable_progressbar=True)
    result = {}
    for feature_index, feature in enumerate(features):
        result[feature] = extracted.iloc[:, [feature_index]]
    return result
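
# The per-calculator settings trick above, in isolation: FCParameters objects
# are dict-like, mapping calculator names to parameter lists (or None for
# parameterless calculators), so a plain dict subset works anywhere
# default_fc_parameters is accepted.
from tsfresh.feature_extraction import ComprehensiveFCParameters

full_parameters = ComprehensiveFCParameters()
subset = {name: full_parameters[name] for name in ('mean', 'maximum')}
# Passing `subset` as default_fc_parameters restricts extraction to these two calculators.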
def extract_game_tot_feature(game_info_df, game_ratio_info_df):
    game_ratio_info_data = game_ratio_info_df[get_conf_item(
        'data', 'game_ratio_info_clean', is_eval=True)]
    game_ratio_info_data.drop(['odds_grail', 'guest_ratio'], axis=1, inplace=True)
    y = game_info_df[['game_id', 'game_rst_two_cls']]
    y = pd.Series(y['game_rst_two_cls'].map(lambda x: x == 1).values,
                  index=y.game_id)
    settings = ComprehensiveFCParameters()
    game_ratio_info_model = extract_relevant_features(
        game_ratio_info_data, y,
        fdr_level=0.1,
        default_fc_parameters=settings,
        column_id='game_id',
        column_sort='position_tm')
    game_ratio_info_model.to_csv('game_ratio_info_model.csv', index=True)
def __init__(self, train_X, train_y, test_X, test_y, train_ids, test_ids):
    super().__init__(test_X, test_y)
    self.train_y = train_y
    self.extraction_settings = ComprehensiveFCParameters()
    X = self.generate_features(pd.concat([train_X, test_X]))
    new_train_X = X.loc[train_ids]
    new_test_X = X.loc[test_ids]
    relevant_features = self.select_features(new_train_X, self.train_y)
    print("Selected Features: {}/{}".format(len(relevant_features), X.shape[1]))
    if len(relevant_features) > 10:
        self.train_X = new_train_X[relevant_features]
        self.test_X = new_test_X[relevant_features]
    else:
        self.train_X = new_train_X
        self.test_X = new_test_X
    self.model_names = ["Random Forest", "XGBoost"]
    self.best_model_name = None
def gen_rolling_feature(self,
                        window_size,
                        settings="comprehensive",
                        full_settings=None,
                        n_jobs=1):
    '''
    Generate aggregation features for each sample through rolling windows.
    This method is implemented with tsfresh. Make sure that the specified
    column names do not contain '__'.

    TODO: relationship with scale should be figured out.

    :param window_size: int. Features are generated according to the
           rolling result.
    :param settings: str or dict. If a string is set, it must be one of
           "comprehensive", "minimal" and "efficient". If a dict is set,
           it should follow the instructions for default_fc_parameters
           in tsfresh. Defaults to "comprehensive".
    :param full_settings: dict. It should follow the instructions for
           kind_to_fc_parameters in tsfresh. Defaults to None.
    :param n_jobs: int. The number of processes to use for parallelization.

    :return: the tsdataset instance.
    '''
    from tsfresh.utilities.dataframe_functions import roll_time_series
    from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
    from tsfresh import extract_features
    from tsfresh.feature_extraction import ComprehensiveFCParameters, \
        MinimalFCParameters, EfficientFCParameters
    DEFAULT_PARAMS = {
        "comprehensive": ComprehensiveFCParameters(),
        "minimal": MinimalFCParameters(),
        "efficient": EfficientFCParameters()
    }
    assert not self._has_generate_agg_feature, \
        "Only one of gen_global_feature and gen_rolling_feature should be called."
    if isinstance(settings, str):
        assert settings in ['comprehensive', 'minimal', 'efficient'], \
            "settings str should be one of 'comprehensive', 'minimal', " \
            f"'efficient', but found {settings}."
        default_fc_parameters = DEFAULT_PARAMS[settings]
    else:
        default_fc_parameters = settings
    assert window_size < self.df.groupby(self.id_col).size().min() + 1, \
        "gen_rolling_feature should have a window_size smaller than the " \
        "shortest time series length."
    df_rolled = roll_time_series(self.df,
                                 column_id=self.id_col,
                                 column_sort=self.dt_col,
                                 max_timeshift=window_size - 1,
                                 min_timeshift=window_size - 1,
                                 n_jobs=n_jobs)
    if not full_settings:
        self.roll_feature_df = extract_features(
            df_rolled,
            column_id=self.id_col,
            column_sort=self.dt_col,
            default_fc_parameters=default_fc_parameters,
            n_jobs=n_jobs)
    else:
        self.roll_feature_df = extract_features(
            df_rolled,
            column_id=self.id_col,
            column_sort=self.dt_col,
            kind_to_fc_parameters=full_settings,
            n_jobs=n_jobs)
    impute_tsfresh(self.roll_feature_df)
    self.feature_col += list(self.roll_feature_df.columns)
    self.roll_additional_feature = list(self.roll_feature_df.columns)
    self._has_generate_agg_feature = True
    return self
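
# Hedged usage sketch for gen_rolling_feature (the TSDataset constructor call
# and column names below are illustrative, not taken from this code):
#
#     tsdata = TSDataset.from_pandas(df, id_col="id", dt_col="datetime",
#                                    target_col="value")
#     tsdata.gen_rolling_feature(window_size=24, settings="minimal", n_jobs=1)
#     # tsdata.roll_feature_df now holds one row of "minimal" tsfresh features
#     # per window of 24 consecutive points.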
import functools

from zoo.chronos.data.utils.feature import generate_dt_features, generate_global_features
from zoo.chronos.data.utils.impute import impute_timeseries_dataframe
from zoo.chronos.data.utils.deduplicate import deduplicate_timeseries_dataframe
from zoo.chronos.data.utils.roll import roll_timeseries_dataframe
from zoo.chronos.data.utils.scale import unscale_timeseries_numpy
from zoo.chronos.data.utils.resample import resample_timeseries_dataframe
from zoo.chronos.data.utils.split import split_timeseries_dataframe

from tsfresh.utilities.dataframe_functions import roll_time_series
from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters,\
    MinimalFCParameters, EfficientFCParameters

DEFAULT_PARAMS = {"comprehensive": ComprehensiveFCParameters(),
                  "minimal": MinimalFCParameters(),
                  "efficient": EfficientFCParameters()}

_DEFAULT_ID_COL_NAME = "id"
_DEFAULT_ID_PLACEHOLDER = "0"


class TSDataset:
    def __init__(self, data, **schema):
        '''
        TSDataset is an abstraction of a time series dataset.
        Cascade calls are supported for most of the transform methods.
        '''
        self.df = data
        self.id_col = schema["id_col"]
feat_multi = []
for i in range(len(twindows)):
    # Assemble one long-format frame per time window
    # (renamed from `dict` to avoid shadowing the builtin)
    feat_input = {
        'id': run_id[i],
        'kind': kind[i],
        'time': times[i],
        'eta': g[i]
    }
    feat_tmp = extract_features(pd.DataFrame(feat_input),
                                column_id='id',
                                column_sort='time',
                                column_kind='kind',
                                column_value='eta',
                                default_fc_parameters=ComprehensiveFCParameters(),
                                impute_function=impute)
    # Drop constant features
    feat_tmp = feat_tmp.loc[:, feat_tmp.apply(pd.Series.nunique) != 1]
    feat_multi.append(feat_tmp)

# Create model targets
max_5_29 = max_eta_npy(eta_all, -2, multi_runs_used)
max_5_38 = max_eta_npy(eta_all, -1, multi_runs_used)

# Define the model; change cache size as needed.
rmodel = GridSearchCV(SVR(kernel='rbf', gamma='scale', cache_size=1000),
                      param_grid={"C": [1e-2, 5e-1, 1e-1, 1e0, 1e1, 5e1, 1e2],
                                  "gamma": np.logspace(-5, 0, 21)})
# In[66]:

df.time.unique()

# In[44]:

df[df.id == 3][['time', 'F_x']].plot(x='time',
                                     title='Success example (id 3)',
                                     figsize=(12, 6))
df[df.id == 20][['time', 'F_x']].plot(x='time',
                                      title='Failure example (id 20)',
                                      figsize=(12, 6))

# In[45]:

extraction_settings = ComprehensiveFCParameters()

# In[46]:

X = extract_features(df,
                     column_id='id',
                     column_sort='time',
                     default_fc_parameters=extraction_settings,
                     impute_function=impute)

# In[47]:

X.head()

# In[48]:
tsr.stack_series(eta, t, 702, runnos, 901, 0.5, threshold, tsteps[i])
eta_g702.append(eta_tmp)
run_id.append(run_id_tmp)
runs_used.append(runs_used_tmp)
times.append(times_tmp)
tstart.append(tstart_tmp)

# Featurize using tsfresh
feat702 = []
for i in range(len(eta_g702)):
    # (renamed from `dict` to avoid shadowing the builtin)
    feat_input = {'id': run_id[i], 'time': times[i], 'eta': eta_g702[i]}
    feat_temp = extract_features(pd.DataFrame(feat_input),
                                 column_id='id',
                                 column_sort='time',
                                 default_fc_parameters=ComprehensiveFCParameters(),
                                 impute_function=impute)
    # Drop constant features
    feat702.append(feat_temp.loc[:, feat_temp.apply(pd.Series.nunique) != 1])

# Extract feature settings for model prediction
params_30min = settings.from_columns(feat702[0])
params_60min = settings.from_columns(feat702[1])

# Create targets for train/test
g901max = tsr.max_eta(eta, 901, runnos)
g911max = tsr.max_eta(eta, 911, runnos)

# Specify the model
rmodel = RandomForestRegressor(n_estimators=100)
def gen_global_feature(self, settings="comprehensive", full_settings=None, n_jobs=1):
    '''
    Generate per-time-series features for each time series.
    This method is implemented with tsfresh. Make sure that the specified
    column names do not contain '__'.

    TODO: relationship with scale should be figured out.

    :param settings: str or dict. If a string is set, it must be one of
           "comprehensive", "minimal" and "efficient". If a dict is set,
           it should follow the instructions for default_fc_parameters
           in tsfresh. Defaults to "comprehensive".
    :param full_settings: dict. It should follow the instructions for
           kind_to_fc_parameters in tsfresh. Defaults to None.
    :param n_jobs: int. The number of processes to use for parallelization.

    :return: the tsdataset instance.
    '''
    from tsfresh.feature_extraction import ComprehensiveFCParameters, \
        MinimalFCParameters, EfficientFCParameters
    DEFAULT_PARAMS = {
        "comprehensive": ComprehensiveFCParameters(),
        "minimal": MinimalFCParameters(),
        "efficient": EfficientFCParameters()
    }
    assert not self._has_generate_agg_feature, \
        "Only one of gen_global_feature and gen_rolling_feature should be called."
    if full_settings is not None:
        self.df, additional_feature = generate_global_features(
            input_df=self.df,
            column_id=self.id_col,
            column_sort=self.dt_col,
            kind_to_fc_parameters=full_settings,
            n_jobs=n_jobs)
        self.feature_col += additional_feature
        return self
    if isinstance(settings, str):
        assert settings in ['comprehensive', 'minimal', 'efficient'], \
            "settings str should be one of 'comprehensive', 'minimal', " \
            f"'efficient', but found {settings}."
        default_fc_parameters = DEFAULT_PARAMS[settings]
    else:
        default_fc_parameters = settings
    self.df, additional_feature = generate_global_features(
        input_df=self.df,
        column_id=self.id_col,
        column_sort=self.dt_col,
        default_fc_parameters=default_fc_parameters,
        n_jobs=n_jobs)
    self.feature_col += additional_feature
    self._has_generate_agg_feature = True
    return self
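
# Hedged usage sketch for gen_global_feature (constructor call and column
# names illustrative, as above):
#
#     tsdata = TSDataset.from_pandas(df, id_col="id", dt_col="datetime",
#                                    target_col="value")
#     tsdata.gen_global_feature(settings="minimal")
#     # Every row of a given id now carries the same per-series feature values,
#     # which is exactly what test_gen_global_feature_multi_id asserts above.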
# antenna_order = [7, 9, 6, 4] if device_v else [6, 7, 4, 9]
# antenna_order = [8, 10, 5, 3] if device_v else [5, 8, 3, 10]
# va_list = ['VA_{}'.format(i) for i in [8, 10, 7, 9, 6, 4, 5, 3]]
# va_list = ['VA_{}'.format(i) for i in [7, 9, 6, 4]]
# va_list = ['VA_{}'.format(i) for i in [8, 10, 5, 3]]
df_x = pd.DataFrame(data, columns=['id', 'time'] + va_list)
df_x = df_x.astype({'id': int, 'time': int})
df_y = pd.DataFrame(label)
X = pd.DataFrame(index=df_y.index)

settings = ComprehensiveFCParameters()
# settings = MinimalFCParameters()
# settings = {
#     "length": None,
#     "large_standard_deviation": [{"r": 0.05}, {"r": 0.1}]
# }
extracted_features = extract_features(df_x,
                                      column_id="id",
                                      column_sort="time",
                                      impute_function=impute,
                                      default_fc_parameters=settings)
# extracted_features = pd.merge(extracted_features, svd_feature,
#                               left_index=True, right_index=True)
# extracted_features = extract_features(df_x, column_id="id",
#                                       column_sort="time",
#                                       impute_function=impute)
# print(extracted_features)
def extract_data(data_folder: str, columns: list, overlap=False,
                 all: bool = True, est_events: bool = False,
                 event: str = None, event_type: str = None):
    '''
    This function uses tsfresh to extract relevant features for multiple
    machine learning tasks. If a csv file of features to use already exists
    (as features.csv), those features will be used instead of finding
    relevant features from scratch (this speeds up computing time).

    Inputs:
    data_folder: a string containing the location of the directory in which
                 dataset.pkl is saved. This dataset is created using
                 data_preperation.py.
    columns:     a list of strings containing the columns of the dataset from
                 which the user wishes to extract features. This includes:
                 id, time, ax_l, ay_l, az_l, ax_r, ay_r, az_r, ax_diff,
                 ay_diff, az_diff, a_res_l, a_res_r, a_res_diff.
                 NOTE: if id or time are not included in this list, they will
                 be added automatically, as they are necessary.
    all:         a boolean. If True, feature extraction will be run using all
                 the data; if False, feature extraction will be run using the
                 first trial and the result will be used on all the data.
    est_events:  a boolean. If True, features will be extracted to estimate
                 whether an event occurred within a 100 ms time frame. If
                 False, features will be extracted to estimate vertical GRF
                 for the entire timeseries.
    event:       a string containing either FS or FO, indicating which event
                 the user wants to predict.
                 NOTE: only necessary as an input if est_events is True.
    event_type:  a string containing either binary or time, indicating which
                 type of output the user wants.
                 NOTE: only necessary as an input if est_events is True.

    Outputs:
    This function does not return anything. However, it saves *.csv files in
    appropriate folders (based on the columns chosen) which can be used to
    fit either a classification or regression model (depending on what task
    is required) - see model_fitting.py.

    Alex Woodall
    Auckland Bioengineering Institute
    08/04/2020
    '''
    from tsfresh import extract_features, extract_relevant_features, select_features
    from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters
    from tsfresh.feature_extraction.settings import from_columns
    from tsfresh.utilities.dataframe_functions import impute

    import pickle
    import numpy as np
    import pandas as pd
    import os

    # Load data
    try:
        if overlap:
            dataset = pickle.load(
                open(data_folder + "dataset_overlap.pkl", "rb"))
        else:
            dataset = pickle.load(
                open(data_folder + "dataset_no_overlap.pkl", "rb"))
    except FileNotFoundError:
        dataset = pickle.load(open(data_folder + "dataset_200.pkl", "rb"))

    # Number the columns which the user chose to use for feature extraction
    columns, columns_num = selected_columns(columns)

    # Create directories for saving
    new_directory = "{}{}\\".format(data_folder, ("_".join(map(str, columns_num))))
    save_dir = create_directories(new_directory, event, event_type, est_events)

    # Attempt to load features from the save directory.
    try:
        # DataFrame containing the features we want
        X_features = pd.read_csv(
            "{}features.csv".format(save_dir), index_col=0)
        features_string = X_features.columns
        # These are the features that we will be using
        extraction_settings = from_columns(features_string)
        pre_extracted = True
    except FileNotFoundError:
        # File does not exist
        pre_extracted = False

    # List to append the last uid of each key
    # (used when using all trials to extract features)
    uid_last = []

    # Iterate through all the trials in the dataset
    for key in dataset.keys():
        # Create the timeseries based on the user input columns
        for col in columns_num:
            if col == 0:
                timeseries = (dataset[key]['X'])[:, col]  # Only true accelerations
            else:
                timeseries = np.vstack((timeseries, dataset[key]['X'][:, col]))

        # dataset[key].keys() = ['X', 'force', 'y_FS_binary', 'y_FO_binary',
        #                        'y_FS_time_to', 'y_FO_time_to']
        # Create y (real data output)
        if est_events:  # If estimating events
            try:
                if event_type == 'binary':
                    y = dataset[key]['y_{}_binary'.format(event)]
                    # Convert to boolean (will remain boolean if already)
                    y = (y == 1.0)
                elif event_type == 'time':
                    y = dataset[key]['y_{}_time_to_next'.format(event)]
                else:
                    print('Event type must either be binary or time')
                    return
            except KeyError:
                print("Event must equal either 'FS' or 'FO'.")
                return
        else:  # Estimating forces
            # possible force = ['Fx', 'Fy', 'Fz']; assuming the z direction is vertical
            y = dataset[key]['y'][:, 2]

        # Convert to pandas DataFrame/Series
        if type(timeseries) is np.ndarray:  # Needs to be a pandas dataframe
            timeseries = pd.DataFrame(timeseries.T, columns=columns)
            # Convert the ID column into integers
            timeseries = timeseries.astype({'id': int})

        if est_events:
            if event_type == 'binary':
                y = pd.Series(data=y, dtype=bool, name='events')
            elif event_type == 'time':
                y = pd.Series(data=y, dtype=float, name='events')
        else:
            # Change the ID column to fit the regression method
            ID = (np.arange(0, len(timeseries))).astype(int)
            timeseries['id'] = ID
            y = pd.Series(data=y, dtype=float, name='Fz')

        # Save the full X dataset
        timeseries.to_csv("{}{}_timeseries.csv".format(save_dir, key),
                          index=True, header=True)

        # If all == False, extract features from the first trial and use
        # those for the rest
        if not all:
            # Extract features using tsfresh
            if not pre_extracted:
                print('Finding relevant features using {}'.format(key))
                X_filtered = extract_relevant_features(
                    timeseries, y,
                    column_id="id",
                    column_sort="time",
                    default_fc_parameters=ComprehensiveFCParameters())

                # Save filtered features
                X_filtered.to_csv("{}features.csv".format(save_dir), header=True)
                features_string = X_filtered.columns
                # These are the features that we will be using
                extraction_settings = from_columns(features_string)
                pre_extracted = True

            if pre_extracted:
                print('Using pre-extracted features for event = {}'.format(event))
                print(str(key))
                X_filtered = extract_features(
                    timeseries,
                    column_id="id",
                    column_sort="time",
                    kind_to_fc_parameters=extraction_settings)

            # Add start_time and mass columns to the dataframe
            if est_events:
                start_time = dataset[key]['X_starting_time']
                mass = dataset[key]['X_mass_sample']
                X_filtered.insert(0, "start_time", start_time, True)
                X_filtered.insert(1, "mass", mass, True)
            else:
                mass = dataset[key]['X_mass_all']
                X_filtered.insert(0, "mass", mass, True)

            # Save dataframes
            X_filtered.to_csv("{}{}_X.csv".format(save_dir, key),
                              index=True, header=True)
            y.to_csv("{}{}_y.csv".format(save_dir, key),
                     index=True, header=True)
        else:
            try:
                uid_change = timeseries_temp['id'].iloc[-1]
                uid_last.append(uid_change)
                timeseries['id'] = timeseries['id'] + uid_change + 1
                timeseries_temp = timeseries_temp.append(timeseries)
                y_temp = y_temp.append(y, ignore_index=True)
            except NameError:
                # The *_temp DataFrames do not exist yet
                timeseries_temp = timeseries
                y_temp = y

    if all:
        print('Using all data to extract relevant features')
        # First remove any NaN values in y; these should only be at the end
        print('Extracting all features')
        # Initialised empty so the saving loop below also works when
        # est_events is False (the original code left it undefined there)
        remove_idx = np.array([], dtype=int)
        if est_events:
            X = extract_features(
                timeseries_temp,
                column_id="id",
                column_sort="time",
                default_fc_parameters=ComprehensiveFCParameters(),
                impute_function=impute)
            y = y_temp

            # Remove NaN indices from X and y
            remove_idx = pd.isnull(y.to_numpy()).nonzero()[0]
            y = y.drop(remove_idx)
            X = X.drop(remove_idx)

            print('Selecting relevant features')
            X_filtered = select_features(X, y)
        else:
            X_filtered = extract_relevant_features(
                timeseries_temp, y_temp,
                column_id="id",
                column_sort="time",
                default_fc_parameters=ComprehensiveFCParameters())
            # Keep the concatenated targets for the per-trial saving below
            y = y_temp

        X_filtered.to_csv("{}features.csv".format(save_dir), header=True)

        # Now save the individual datasets
        # Reload the DataFrame
        X_features = pd.read_csv("{}features.csv".format(save_dir), index_col=0)

        # Index values
        names = X_features.index.values

        # Saving individual trials
        print('Saving features for each trial')
        start = 0
        i = 0
        for key in dataset.keys():
            try:
                end_temp = uid_last[i]  # Name of the row
            except IndexError:  # Last key
                end_temp = X_features.iloc[-1].name
            end = end_temp

            # Find the new end index, accounting for removed values
            removed = True
            while removed:
                if end in remove_idx:
                    end -= 1
                else:
                    removed = False

            # end = the name of the row (NOT the index) which is the last in the trial
            end_idx = np.where(names == end)[0][0]
            X_save = X_features.iloc[start:end_idx + 1]
            X_save = X_save.reset_index(drop=True)
            y_save = y.iloc[start:end_idx + 1]
            y_save = y_save.reset_index(drop=True)

            start = end_idx + 1
            i += 1

            # Add start_time and mass columns to the dataframe
            if est_events:
                start_time = dataset[key]['X_starting_time']
                mass = dataset[key]['X_mass_sample']

                # Remove entries dropped due to NaNs
                start_time_new = start_time[:len(X_save)]
                mass_new = mass[:len(X_save)]

                X_save.insert(0, "start_time", start_time_new, True)
                X_save.insert(1, "mass", mass_new, True)
            else:
                mass = dataset[key]['X_mass_all']

                # Remove entries dropped due to NaNs (should be zero for GRF estimation)
                mass_new = mass[:len(X_save)]
                X_save.insert(0, "mass", mass_new, True)

            # Save
            X_save.to_csv("{}{}_X.csv".format(save_dir, key),
                          index=True, header=True)
            y_save.to_csv("{}{}_y.csv".format(save_dir, key),
                          index=True, header=True)

    return
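
# Hedged example call for extract_data (the folder path and column choices
# are illustrative only):
#
#     extract_data(data_folder='./data/',
#                  columns=['id', 'time', 'a_res_l', 'a_res_r'],
#                  overlap=False, all=True,
#                  est_events=True, event='FS', event_type='binary')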