Example #1
 def get_feature_dict(feature_set=None):
     full_feature_dict = ComprehensiveFCParameters()
     simple_baseline_features = {
         key: None
         for key in [
             'abs_energy',
             'mean',
             'median',
             'minimum',
             'maximum',
             'standard_deviation',
         ]
     }
     distribution_features_dict = utils.distribution_features_tsfresh_dict()
     temporal_feature_dict = {
         key: full_feature_dict[key]
         for key in set(full_feature_dict) - set(distribution_features_dict)
     }
     no_entropy_features_dict = {
         key: value
         for key, value in full_feature_dict.items() if 'entropy' not in key
     }
     feature_dict = {
         'simple_baseline': simple_baseline_features,
         'distribution_features': distribution_features_dict,
         'temporal_features': temporal_feature_dict,
         'no_entropy': no_entropy_features_dict,
     }
     return feature_dict.get(feature_set, full_feature_dict)
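
A minimal usage sketch (hypothetical data; it assumes get_feature_dict above and its utils helper are importable). The returned dict is passed straight to tsfresh as default_fc_parameters:

import pandas as pd
from tsfresh import extract_features

# two toy series in tsfresh's long format
df = pd.DataFrame({"id": [0, 0, 0, 1, 1, 1],
                   "time": [0, 1, 2, 0, 1, 2],
                   "value": [1.0, 2.0, 3.0, 2.0, 1.0, 0.0]})

# 'simple_baseline' selects the six parameterless calculators above; an unknown
# set name falls back to the full ComprehensiveFCParameters
fc_params = get_feature_dict("simple_baseline")
X = extract_features(df, column_id="id", column_sort="time",
                     default_fc_parameters=fc_params)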
Example #2
def get_tsfresh(data):
    dataset = Dataset(data_array=data, data_labels=data, BATCH_SIZE=BATCH_SIZE)
    # ComprehensiveFCParameters is the most complete (and slowest) option;
    # EfficientFCParameters or MinimalFCParameters are faster alternatives.
    extraction_settings = ComprehensiveFCParameters()
    features_to_return = []
    start_time = time.time()
    eval_not_finished = 1
    while eval_not_finished != 0:
        # time_checked = check_times(times[i])
        data_batch, _ = dataset.get_batch_eval()
        batch_df = get_data_as_df(data_batch)
        X = extract_features(batch_df,
                             column_id='ids',
                             column_sort='time',
                             default_fc_parameters=extraction_settings,
                             impute_function=impute,
                             n_jobs=10)
        impute(X)  # defensive; X was already imputed via impute_function above
        features_batch = X.values
        features_to_return.append(features_batch)
        # BATCH_COUNTER_EVAL is assumed to reach 0 once the final batch is served
        eval_not_finished = dataset.BATCH_COUNTER_EVAL
        if dataset.BATCH_COUNTER_EVAL % 100 == 0:
            time_usage = str(
                datetime.timedelta(seconds=int(round(time.time() -
                                                     start_time))))
            print("it %i Time usage: %s" %
                  (dataset.BATCH_COUNTER_EVAL, str(time_usage)),
                  flush=True)
    features_to_return = np.concatenate(features_to_return)
    time_usage = str(
        datetime.timedelta(seconds=int(round(time.time() - start_time))))
    print("Total Time usage: %s\n" % (str(time_usage)), flush=True)
    return features_to_return
Example #3
 def _extract_tsfresh_features(self, X):
     X_df = self._convert_to_df(X)
     X_df_no_nans = X_df.dropna()
     if self.extraction_type == "minimal":
         extraction_setting = MinimalFCParameters()
     elif self.extraction_type == "efficient":
         extraction_setting = EfficientFCParameters()
     elif self.extraction_type == "all":
         extraction_setting = ComprehensiveFCParameters()
     else:
         raise ValueError(
             f"{self.extraction_type} is not a supported feature extraction option. Please choose one from "
             f"the following options: [minimal, efficient, all]."
         )
     # Extract time series features from the dataframe
     # Replace any ``NaNs`` and ``infs`` in the extracted features with median/extreme values for that column
     tsfresh_features = extract_features(
         X_df_no_nans,
         default_fc_parameters=extraction_setting,
         column_id="id",
         column_sort="time",
         impute_function=impute,
     )
     # If X_df.dropna() dropped some observations entirely (i.e., due to all NaNs),
     # impute each tsfresh feature for those observations with the median of that tsfresh feature
     tsfresh_features_imputed = impute(tsfresh_features.reindex(pd.RangeIndex(X_df["id"].max() + 1)))
     return tsfresh_features_imputed, X_df
Example #4
def reshape_data_tsfresh(seq_dataset, n_classes, n_steps, settings):
    """
    Transform sequences dataset into dataset of features
    """
    len_data = seq_dataset.shape[0]
    data_divided = []
    for i in range(n_classes):
        data_divided.append(seq_dataset[:, :, i].reshape(-1))
    to_extract = []
    for i in range(n_classes):
        ids = np.arange(len_data).repeat(n_steps)
        tmp = np.vstack((ids, data_divided[i]))
        tmp = tmp.T
        to_extract.append(pd.DataFrame(data=tmp, columns=["id", "value"]))
    tfs = []
    # parameters of tsfresh features extraction
    if settings == "complete":
        settings = ComprehensiveFCParameters()
    elif settings == "efficient":
        settings = EfficientFCParameters()
    elif settings == "minimal":
        settings = MinimalFCParameters()
    for i in range(n_classes):
        tf = tsfresh.extract_features(
            to_extract[i], column_id="id", default_fc_parameters=settings
        )
        tfs.append(tf)
    data_feat = pd.concat(
        [tfs[i].reindex(tfs[0].index) for i in range(n_classes)], axis=1
    )
    print(data_feat.shape)
    data_feat.fillna(0, inplace=True)
    data_feat.replace([np.inf, -np.inf], 0, inplace=True)
    data_tensor = torch.from_numpy(data_feat.values).float()
    return data_tensor
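
A hypothetical call (names invented here), assuming seq_dataset is a float array shaped (n_samples, n_steps, n_classes) and torch is installed:

import numpy as np

seq = np.random.randn(32, 50, 2)  # 32 samples, 50 steps, 2 class channels
feats = reshape_data_tsfresh(seq, n_classes=2, n_steps=50, settings="minimal")
print(feats.shape)  # (32, features_per_channel * 2)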
Example #5
    def test_gen_global_feature_multi_id(self):
        dates = pd.date_range('1/1/2019', periods=8)
        data = np.random.randn(8, 3)
        df = pd.DataFrame({"datetime": dates, "values": data[:, 0],
                           "A": data[:, 1], "B": data[:, 2],
                           "id": ["00"]*4+["01"]*4})
        from tsfresh.feature_extraction import ComprehensiveFCParameters
        from tsfresh.feature_extraction import MinimalFCParameters
        from tsfresh.feature_extraction import EfficientFCParameters
        for params in [ComprehensiveFCParameters(),
                       MinimalFCParameters(),
                       EfficientFCParameters()]:
            output_df, _ = generate_global_features(input_df=df,
                                                    column_id="id",
                                                    column_sort="datetime",
                                                    default_fc_parameters=params)

            assert "datetime" in output_df.columns
            assert "values" in output_df.columns
            assert "A" in output_df.columns
            assert "B" in output_df.columns
            assert "id" in output_df.columns

            for col in output_df.columns:
                if col in ["datetime", "values", "A", "B", "id"]:
                    continue
                assert len(set(output_df[output_df["id"] == "00"][col])) == 1
                assert len(set(output_df[output_df["id"] == "01"][col])) == 1
                assert output_df[output_df["id"] == "00"][col].isna().sum() == 0
                assert output_df[output_df["id"] == "01"][col].isna().sum() == 0
Example #6
    def requires(self):
        settings = ComprehensiveFCParameters()
        for job in [0, 1, 4]:
            for time_series_length in [100, 500, 1000, 5000]:
                yield FullTimingTask(time_series_length=time_series_length,
                                     n_jobs=job,
                                     num_ids=10,
                                     random_seed=42)
                yield FullTimingTask(time_series_length=time_series_length,
                                     n_jobs=job,
                                     num_ids=100,
                                     random_seed=42)

                for feature_name in settings:
                    yield TimingTask(
                        feature_parameter={feature_name: settings[feature_name]},
                        time_series_length=time_series_length,
                        n_jobs=job,
                        num_ids=100,
                        try_number=0,
                        random_seed=42
                    )

                    for try_number in range(3):
                        yield TimingTask(
                            feature_parameter={feature_name: settings[feature_name]},
                            n_jobs=job,
                            try_number=try_number,
                            num_ids=10,
                            time_series_length=time_series_length,
                            random_seed=42
                        )
Example #7
def add_tsfresh_day(new_data, data, tsfresh_features, columns):

    # The dictionary containing the features that we want to extract and the setting for those features
    if tsfresh_features == 'minimal':
        settings = MinimalFCParameters()
    elif tsfresh_features == 'efficient':
        settings = EfficientFCParameters()
    elif tsfresh_features == 'comprehensive':
        settings = ComprehensiveFCParameters()
    else:
        settings = MinimalFCParameters()

    for participant in range(len(data)):

        all_days = []
        for day in range(len(data[participant])):

            # We only take the columns that we are interested in
            sub_data = data[participant][day].loc[data[participant][day]
                                                  ['variable'].isin(columns)]

            # Drop all nan values
            sub_data = sub_data.dropna(axis=0)

            # If a column is missing entirely, or contained only NaN values that
            # were just dropped, add a placeholder row with that variable and a 0
            # so tsfresh still produces features for it
            for col in columns:
                if col not in sub_data['variable'].values:
                    new_row = sub_data.iloc[0].copy(deep=True)
                    new_row['variable'] = col
                    new_row['value'] = 0
                    sub_data = sub_data.append(new_row, ignore_index=True)

            # Extract features for every variable still left in the dataframe
            extracted = extract_features(sub_data,
                                         default_fc_parameters=settings,
                                         column_id='variable',
                                         column_sort='time_seconds',
                                         column_value='value')

            # With multiple variables extract_features returns one row per
            # variable; flatten to a single row and rename the columns so each
            # feature name records which variable it came from
            extracted = extracted.stack()
            extracted.index = extracted.index.map('{0[1]}_{0[0]}_day'.format)
            extracted = extracted.to_frame().T

            # Add the extracted features to a list
            all_days.append(extracted)

        # Concat the days to make a new dataframe and reset the index to prevent conflicts
        all_days = pd.concat(all_days, axis=0).reset_index(drop=True)

        # Add the new features to the data
        new_data[participant] = pd.concat([new_data[participant], all_days],
                                          axis=1)

    return new_data
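
The nested layout this helper expects is easiest to see on a toy input; a minimal sketch with invented variable names, not the original pipeline:

import pandas as pd

day = pd.DataFrame({"variable": ["mood", "mood", "screen"],
                    "time_seconds": [0, 60, 30],
                    "value": [6.0, 7.0, 120.0]})
data = [[day, day.copy()]]                 # data[participant][day]
new_data = [pd.DataFrame(index=range(2))]  # one row per day; features are appended
result = add_tsfresh_day(new_data, data, 'minimal', ['mood', 'screen'])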
Example #8
def tsfresh_extraction(X, y, config):
    n_jobs = config['SVM-config']['n_jobs']
    extraction_settings = ComprehensiveFCParameters()
    return extract_relevant_features(X,
                                     y,
                                     n_jobs=n_jobs,
                                     fdr_level=0.01,
                                     show_warnings=False,
                                     column_id='id',
                                     column_sort='time',
                                     default_fc_parameters=extraction_settings)
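
A sketch of the inputs this wrapper expects, assumed from the tsfresh API: X in long format with 'id' and 'time' columns, y a Series indexed by id, and a config dict carrying n_jobs:

import pandas as pd

X = pd.DataFrame({"id":   [0, 0, 1, 1, 2, 2, 3, 3],
                  "time": [0, 1, 0, 1, 0, 1, 0, 1],
                  "value": [0.1, 0.3, 0.2, 0.4, 1.1, 1.3, 1.2, 1.4]})
y = pd.Series([0, 0, 1, 1], index=[0, 1, 2, 3])  # one label per id
config = {"SVM-config": {"n_jobs": 1}}           # layout assumed from the code above

# only features passing the fdr_level=0.01 significance test survive
features = tsfresh_extraction(X, y, config)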
Example #9
    def testLocalTSFresh(self):
        robot_execution_failures.download_robot_execution_failures()
        df, y = robot_execution_failures.load_robot_execution_failures()

        dist = MarsDistributor()

        df = df.iloc[:200]

        extraction_settings = ComprehensiveFCParameters()
        extract_features(df, column_id='id', column_sort='time',
                         default_fc_parameters=extraction_settings,
                         # impute: automatically replace all NaN feature values
                         impute_function=impute, distributor=dist)
Example #10
def get_features(y, relevant_features, data):
    sensor_data_list = dict_as_list(data)
    df = pd.DataFrame(sensor_data_list,
        columns=['id', 'time', 'accx', 'accy','accz', 'gyrox', 'gyroy', 'gyroz'])

    extraction_settings = ComprehensiveFCParameters()
    if relevant_features:
        X = extract_relevant_features(df, y, column_id = 'id', column_sort = 'time',
            default_fc_parameters = extraction_settings)
    else:
        X = extract_features(df, column_id = 'id', column_sort = 'time',
            default_fc_parameters = extraction_settings, impute_function = impute)

    return X
Example #11
def test_distributed_ts_fresh(setup):
    robot_execution_failures.download_robot_execution_failures()
    df, y = robot_execution_failures.load_robot_execution_failures()
    default_session = get_default_session()
    sync_session = new_session(default_session.address)
    dist = MarsDistributor(session=sync_session)

    df = df.iloc[:200].copy()

    extraction_settings = ComprehensiveFCParameters()
    extract_features(df, column_id='id', column_sort='time',
                     default_fc_parameters=extraction_settings,
                     # impute: automatically replace all NaN feature values
                     impute_function=impute, distributor=dist)
Example #12
    def gen_global_feature(self, settings="comprehensive", full_settings=None):
        '''
        Generate per-time-series features for each time series.
        This method is implemented with tsfresh.

        :param settings: str or dict. If a string is set, it must be one of "comprehensive",
               "minimal" and "efficient". If a dict is set, it should follow the instructions
               for default_fc_parameters in tsfresh. The value defaults to "comprehensive".
        :param full_settings: dict. It should follow the instructions for kind_to_fc_parameters in
               tsfresh. The value defaults to None.

        :return: the tsdataset instance.

        '''
        if full_settings is not None:
            # generate_global_features returns (df, feature_names), as in the
            # default_fc_parameters branch below
            self.df, additional_feature = generate_global_features(
                input_df=self.df,
                column_id=self.id_col,
                column_sort=self.dt_col,
                kind_to_fc_parameters=full_settings)
            self.feature_col += additional_feature
            return self

        from tsfresh.feature_extraction import ComprehensiveFCParameters,\
            MinimalFCParameters, EfficientFCParameters
        default_params = {
            "comprehensive": ComprehensiveFCParameters(),
            "minimal": MinimalFCParameters(),
            "efficient": EfficientFCParameters()
        }

        if isinstance(settings, str):
            assert settings in ["comprehensive", "minimal", "efficient"], \
                f"settings str should be one of 'comprehensive', 'minimal', " \
                f"'efficient', but found {settings}."
            default_fc_parameters = default_params[settings]
        else:
            default_fc_parameters = settings

        self.df, additional_feature = generate_global_features(
            input_df=self.df,
            column_id=self.id_col,
            column_sort=self.dt_col,
            default_fc_parameters=default_fc_parameters)

        self.feature_col += additional_feature

        return self
Example #13
 def extract_features(self,
                      ts,
                      column_id='id',
                      impute_function=impute,
                      default_fc_parameters=ComprehensiveFCParameters(),
                      show_warnings=False,
                      profile=False):
     '''Extract all possible features from ts using tsfresh's extract_features method'''
     return extract_features(ts,
                             column_id=column_id,
                             impute_function=impute_function,
                             default_fc_parameters=default_fc_parameters,
                             n_jobs=self.n_jobs,
                             show_warnings=show_warnings,
                             profile=profile)
Example #14
    def testDistributedTSFresh(self):
        robot_execution_failures.download_robot_execution_failures()
        df, y = robot_execution_failures.load_robot_execution_failures()

        service_ep = 'http://127.0.0.1:' + self.web_port
        with new_session(service_ep) as sess:
            dist = MarsDistributor(sess)

            df = df.iloc[:200]

            extraction_settings = ComprehensiveFCParameters()
            extract_features(df, column_id='id', column_sort='time',
                             default_fc_parameters=extraction_settings,
                             # impute: automatically replace all NaN feature values
                             impute_function=impute, distributor=dist)
Example #15
def add_tsfresh_participant(data, tsfresh_features, columns, k):

    # The dictionary containing the features that we want to extract and the setting for those features
    if tsfresh_features == 'minimal':
        settings = MinimalFCParameters()
    elif tsfresh_features == 'efficient':
        settings = EfficientFCParameters()
    elif tsfresh_features == 'comprehensive':
        settings = ComprehensiveFCParameters()
    else:
        settings = MinimalFCParameters()

    for participant in range(len(data)):

        # First we add the necessary columns
        data[participant]['id'] = 0
        data[participant]['index'] = data[participant].index

        # Create the rolled time series (this also creates new ids); note that
        # setting max_timeshift to None would use the maximal possible length
        rolled_series = roll_time_series(data[participant],
                                         column_id='id',
                                         column_sort='index',
                                         max_timeshift=k)

        all_features = []
        for column in columns:
            # Extract the features for every element of the time series; this
            # returns a dataframe with the same number of rows as the original
            # but a different number of columns
            extracted = extract_features(rolled_series,
                                         default_fc_parameters=settings,
                                         column_id='id',
                                         column_sort='index',
                                         column_value=column)

            # Reset the index (extract_features changed it) and add the result to our list of features
            all_features.append(extracted.reset_index(drop=True))

        # Add all the features together
        extracted = pd.concat(all_features, axis=1)

        # We drop the columns that we previously created because we do not want them in the data
        del data[participant]['id']  # note that you can also use df.drop here
        del data[participant]['index']

        data[participant] = pd.concat([data[participant], extracted], axis=1)

    return data
Example #16
def extractFeatures(rawData):
    print("\nSetting extraction settings")
    extraction_settings = ComprehensiveFCParameters()
    print("Before extracting features")
    X = extract_features(rawData,
                         column_id='id',
                         column_value=None,
                         column_kind=None,
                         impute_function=impute,
                         default_fc_parameters=extraction_settings)
    print("After extracting features")
    print("Number of extracted features: {}.".format(X.shape[1]))
    print("\nShape of X: ")
    print(X.shape)

    return X
Example #17
def extract_features(data_windows: DataFrame,
                     features: List[Feature]) -> Dict[Feature, DataFrame]:
    settings = {
        key: ComprehensiveFCParameters()[key]
        for key in [str(feature.value).lower() for feature in features]
    }
    extracted: DataFrame = tsfresh.extract_features(
        data_windows,
        column_id="id",
        default_fc_parameters=settings,
        disable_progressbar=True)
    result = {}
    # Assumes each requested feature yields exactly one output column, in the
    # same order as `features` (true for parameterless calculators like "mean")
    for feature_index in range(len(features)):
        feature = features[feature_index]
        result[feature] = extracted.iloc[:, [feature_index]]
    return result
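
The Feature enum is not shown above; a minimal stand-in (an assumption, not the original type) only needs values that lowercase to parameterless tsfresh calculator names:

from enum import Enum

import pandas as pd

class Feature(Enum):
    MEAN = "mean"
    MAXIMUM = "maximum"

windows = pd.DataFrame({"id": [0, 0, 1, 1],
                        "value": [1.0, 3.0, 2.0, 4.0]})
per_feature = extract_features(windows, [Feature.MEAN, Feature.MAXIMUM])
# per_feature[Feature.MEAN] is a one-column DataFrame of per-window means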
Example #18
def extract_game_tot_feature(game_info_df, game_ratio_info_df):
    game_ratio_info_data = game_ratio_info_df[get_conf_item(
        'data', 'game_ratio_info_clean', is_eval=True)]
    # drop on a copy so we do not mutate a slice of the original frame
    game_ratio_info_data = game_ratio_info_data.drop(
        ['odds_grail', 'guest_ratio'], axis=1)
    y = game_info_df[['game_id', 'game_rst_two_cls']]
    y = pd.Series(y['game_rst_two_cls'].map(lambda x: x == 1).values,
                  index=y.game_id)
    settings = ComprehensiveFCParameters()
    game_ratio_info_model = extract_relevant_features(
        game_ratio_info_data,
        y,
        fdr_level=0.1,
        default_fc_parameters=settings,
        column_id='game_id',
        column_sort='position_tm')
    game_ratio_info_model.to_csv('game_ratio_info_model.csv', index=True)
Example #19
    def __init__(self, train_X, train_y, test_X, test_y, train_ids, test_ids):
        super().__init__(test_X, test_y)
        self.train_y = train_y

        self.extraction_settings = ComprehensiveFCParameters()
        X = self.generate_features(pd.concat([train_X, test_X]))

        new_train_X = X.loc[train_ids]
        new_test_X = X.loc[test_ids]

        relevant_features = self.select_features(new_train_X, self.train_y)
        print("Selected Features: {}/{}".format(len(relevant_features),
                                                X.shape[1]))

        if len(relevant_features) > 10:
            self.train_X = new_train_X[relevant_features]
            self.test_X = new_test_X[relevant_features]
        else:
            self.train_X = new_train_X
            self.test_X = new_test_X

        self.model_names = ["Random Forest", "XGBoost"]
        self.best_model_name = None
Example #20
    def gen_rolling_feature(self,
                            window_size,
                            settings="comprehensive",
                            full_settings=None,
                            n_jobs=1):
        '''
        Generate aggregation features for each sample.
        This method is implemented with tsfresh.
        Make sure that the specified column name does not contain '__'.

        TODO: relationship with scale should be figured out.

        :param window_size: int, generate feature according to the rolling result.
        :param settings: str or dict. If a string is set, it must be one of "comprehensive",
               "minimal" and "efficient". If a dict is set, it should follow the instructions
               for default_fc_parameters in tsfresh. The value defaults to "comprehensive".
        :param full_settings: dict. It should follow the instructions for kind_to_fc_parameters in
               tsfresh. The value defaults to None.
        :param n_jobs: int. The number of processes to use for parallelization.

        :return: the tsdataset instance.
        '''
        from tsfresh.utilities.dataframe_functions import roll_time_series
        from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
        from tsfresh import extract_features
        from tsfresh.feature_extraction import ComprehensiveFCParameters, \
            MinimalFCParameters, EfficientFCParameters

        DEFAULT_PARAMS = {
            "comprehensive": ComprehensiveFCParameters(),
            "minimal": MinimalFCParameters(),
            "efficient": EfficientFCParameters()
        }

        assert not self._has_generate_agg_feature,\
            "Only one of gen_global_feature and gen_rolling_feature should be called."
        if isinstance(settings, str):
            assert settings in ['comprehensive', 'minimal', 'efficient'], \
                "settings str should be one of 'comprehensive', 'minimal', 'efficient'"\
                f", but found {settings}."
            default_fc_parameters = DEFAULT_PARAMS[settings]
        else:
            default_fc_parameters = settings

        assert window_size < self.df.groupby(self.id_col).size().min() + 1, "gen_rolling_feature "\
            "should have a window_size no larger than the shortest time series length."
        df_rolled = roll_time_series(self.df,
                                     column_id=self.id_col,
                                     column_sort=self.dt_col,
                                     max_timeshift=window_size - 1,
                                     min_timeshift=window_size - 1,
                                     n_jobs=n_jobs)
        if not full_settings:
            self.roll_feature_df = extract_features(
                df_rolled,
                column_id=self.id_col,
                column_sort=self.dt_col,
                default_fc_parameters=default_fc_parameters,
                n_jobs=n_jobs)
        else:
            self.roll_feature_df = extract_features(
                df_rolled,
                column_id=self.id_col,
                column_sort=self.dt_col,
                kind_to_fc_parameters=full_settings,
                n_jobs=n_jobs)
        impute_tsfresh(self.roll_feature_df)

        self.feature_col += list(self.roll_feature_df.columns)
        self.roll_additional_feature = list(self.roll_feature_df.columns)
        self._has_generate_agg_feature = True
        return self
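
For reference, the same roll-then-extract pattern in standalone form against the plain tsfresh API (toy data, not Chronos-specific):

import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute, roll_time_series

df = pd.DataFrame({"id": ["a"] * 6, "time": range(6),
                   "value": [1.0, 2.0, 4.0, 8.0, 16.0, 32.0]})

window_size = 3
rolled = roll_time_series(df, column_id="id", column_sort="time",
                          max_timeshift=window_size - 1,
                          min_timeshift=window_size - 1)
feats = extract_features(rolled, column_id="id", column_sort="time",
                         default_fc_parameters=MinimalFCParameters())
impute(feats)  # replaces NaN/inf in the extracted features in place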
Example #21
import functools

from zoo.chronos.data.utils.feature import generate_dt_features, generate_global_features
from zoo.chronos.data.utils.impute import impute_timeseries_dataframe
from zoo.chronos.data.utils.deduplicate import deduplicate_timeseries_dataframe
from zoo.chronos.data.utils.roll import roll_timeseries_dataframe
from zoo.chronos.data.utils.scale import unscale_timeseries_numpy
from zoo.chronos.data.utils.resample import resample_timeseries_dataframe
from zoo.chronos.data.utils.split import split_timeseries_dataframe

from tsfresh.utilities.dataframe_functions import roll_time_series
from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters,\
    MinimalFCParameters, EfficientFCParameters
DEFAULT_PARAMS = {"comprehensive": ComprehensiveFCParameters(),
                  "minimal": MinimalFCParameters(),
                  "efficient": EfficientFCParameters()}

_DEFAULT_ID_COL_NAME = "id"
_DEFAULT_ID_PLACEHOLDER = "0"


class TSDataset:
    def __init__(self, data, **schema):
        '''
        TSDataset is an abstraction of a time series dataset.
        Cascaded calls are supported for most of the transform methods.
        '''
        self.df = data
        self.id_col = schema["id_col"]
Example #22
    feat_multi = []
    for i in range(len(twindows)):
        frame_dict = {  # renamed to avoid shadowing the builtin dict
            'id': run_id[i],
            'kind': kind[i],
            'time': times[i],
            'eta': g[i]
        }

        feat_tmp = extract_features(
            pd.DataFrame(frame_dict),
            column_id='id',
            column_sort='time',
            column_kind='kind',
            column_value='eta',
            default_fc_parameters=ComprehensiveFCParameters(),
            impute_function=impute)

        # drop constant features
        feat_tmp = feat_tmp.loc[:, feat_tmp.apply(pd.Series.nunique) != 1]

        feat_multi.append(feat_tmp)

    # create model targets
    max_5_29 = max_eta_npy(eta_all, -2, multi_runs_used)
    max_5_38 = max_eta_npy(eta_all, -1, multi_runs_used)

    # define model, change cache size as needed.
    rmodel = GridSearchCV(SVR(kernel='rbf', gamma='scale', cache_size=1000),\
                          param_grid={"C": [1e-2,5e-1,1e-1,1e0, 1e1, 5e1, 1e2],\
                                      "gamma": np.logspace(-5, 0, 21)})
Example #23
from zoo.chronos.data.utils.feature import generate_dt_features, generate_global_features
from zoo.chronos.data.utils.impute import impute_timeseries_dataframe
from zoo.chronos.data.utils.deduplicate import deduplicate_timeseries_dataframe
from zoo.chronos.data.utils.roll import roll_timeseries_dataframe
from zoo.chronos.data.utils.scale import unscale_timeseries_numpy
from zoo.chronos.data.utils.resample import resample_timeseries_dataframe
from zoo.chronos.data.utils.split import split_timeseries_dataframe

from tsfresh.utilities.dataframe_functions import roll_time_series
from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters,\
    MinimalFCParameters, EfficientFCParameters
DEFAULT_PARAMS = {
    "comprehensive": ComprehensiveFCParameters(),
    "minimal": MinimalFCParameters(),
    "efficient": EfficientFCParameters()
}

_DEFAULT_ID_COL_NAME = "id"
_DEFAULT_ID_PLACEHOLDER = "0"


class TSDataset:
    def __init__(self, data, **schema):
        '''
        TSDataset is an abstraction of a time series dataset.
        Cascaded calls are supported for most of the transform methods.
        '''
        self.df = data
Example #24
# In[66]:

df.time.unique()

# In[44]:

df[df.id == 3][['time', 'F_x']].plot(x='time',
                                     title='Success example (id 3)',
                                     figsize=(12, 6))
df[df.id == 20][['time', 'F_x']].plot(x='time',
                                      title='Failure example (id 20)',
                                      figsize=(12, 6))

# In[45]:

extraction_settings = ComprehensiveFCParameters()

# In[46]:

X = extract_features(df,
                     column_id='id',
                     column_sort='time',
                     default_fc_parameters=extraction_settings,
                     impute_function=impute)

# In[47]:

X.head()

# In[48]:
Example #25
           tsr.stack_series(eta, t, 702, runnos, 901, 0.5, threshold, tsteps[i])

        eta_g702.append(eta_tmp)
        run_id.append(run_id_tmp)
        runs_used.append(runs_used_tmp)
        times.append(times_tmp)
        tstart.append(tstart_tmp)
    
    # Featurize using tsfresh
    feat702 = []

    for i in range(len(eta_g702)):
        frame_dict = {'id': run_id[i], 'time': times[i], 'eta': eta_g702[i]}  # avoid shadowing dict

        feat_temp = extract_features(pd.DataFrame(frame_dict), column_id='id', column_sort='time',
                                     default_fc_parameters=ComprehensiveFCParameters(), impute_function=impute)

        # drop constant features
        feat702.append(feat_temp.loc[:, feat_temp.apply(pd.Series.nunique) != 1]) 
    
    # Extract feature settings for model prediction
    params_30min = settings.from_columns(feat702[0])
    params_60min = settings.from_columns(feat702[1])
    
    # Create targets for train/test
    g901max = tsr.max_eta(eta,901,runnos)
    g911max = tsr.max_eta(eta,911,runnos)
    
    # Specify the model
    rmodel = RandomForestRegressor(n_estimators=100)
Example #26
    def gen_global_feature(self,
                           settings="comprehensive",
                           full_settings=None,
                           n_jobs=1):
        '''
        Generate per-time-series features for each time series.
        This method is implemented with tsfresh.
        Make sure that the specified column name does not contain '__'.

        TODO: relationship with scale should be figured out.

        :param settings: str or dict. If a string is set, it must be one of "comprehensive",
               "minimal" and "efficient". If a dict is set, it should follow the instructions
               for default_fc_parameters in tsfresh. The value defaults to "comprehensive".
        :param full_settings: dict. It should follow the instructions for kind_to_fc_parameters in
               tsfresh. The value defaults to None.
        :param n_jobs: int. The number of processes to use for parallelization.

        :return: the tsdataset instance.
        '''
        from tsfresh import extract_features
        from tsfresh.feature_extraction import ComprehensiveFCParameters, \
            MinimalFCParameters, EfficientFCParameters

        DEFAULT_PARAMS = {
            "comprehensive": ComprehensiveFCParameters(),
            "minimal": MinimalFCParameters(),
            "efficient": EfficientFCParameters()
        }

        assert not self._has_generate_agg_feature, \
            "Only one of gen_global_feature and gen_rolling_feature should be called."
        if full_settings is not None:
            self.df, additional_feature = generate_global_features(
                input_df=self.df,
                column_id=self.id_col,
                column_sort=self.dt_col,
                kind_to_fc_parameters=full_settings,
                n_jobs=n_jobs)
            self.feature_col += additional_feature
            return self

        if isinstance(settings, str):
            assert settings in ['comprehensive', 'minimal', 'efficient'], \
                "settings str should be one of 'comprehensive', 'minimal', 'efficient'"\
                f", but found {settings}."
            default_fc_parameters = DEFAULT_PARAMS[settings]
        else:
            default_fc_parameters = settings

        self.df, additional_feature = generate_global_features(
            input_df=self.df,
            column_id=self.id_col,
            column_sort=self.dt_col,
            default_fc_parameters=default_fc_parameters,
            n_jobs=n_jobs)

        self.feature_col += additional_feature
        self._has_generate_agg_feature = True
        return self
Example #27
    # antenna_order = [7, 9, 6, 4] if device_v else [6, 7, 4, 9]
    # antenna_order = [8, 10, 5, 3] if device_v else [5, 8, 3, 10]

    # va_list = ['VA_{}'.format(i) for i in [8, 10, 7, 9, 6, 4, 5, 3]]
    # va_list = ['VA_{}'.format(i) for i in [7, 9, 6, 4]]
    # va_list = ['VA_{}'.format(i) for i in [8, 10, 5, 3]]

    df_x = pd.DataFrame(data, columns=['id', 'time'] + va_list)

    df_x = df_x.astype({'id': int, 'time': int})

    df_y = pd.DataFrame(label)

    X = pd.DataFrame(index=df_y.index)

    settings = ComprehensiveFCParameters()
    # settings = MinimalFCParameters()

    # settings = {
    #     "length": None,
    #     "large_standard_deviation": [{"r": 0.05}, {"r": 0.1}]
    # }

    extracted_features = extract_features(df_x, column_id="id", column_sort="time", impute_function=impute,
                                          default_fc_parameters=settings)

    # extracted_features = pd.merge(extracted_features, svd_feature, left_index=True, right_index=True)

    # extracted_features = extract_features(df_x, column_id="id", column_sort="time", impute_function=impute)
    # print(extracted_features)
Example #28
def extract_data(data_folder: str,
                 columns: list,
                 overlap=False,
                 all: bool = True,
                 est_events: bool = False,
                 event: str = None,
                 event_type: str = None):
    '''
	This function uses tsFRESH to extract relevant features for multiple machine learning tasks.
	If a csv file of features to use already exists (as features.csv), then those features will
	be used instead of finding relevant features from scratch (speeds up computing time).

	Inputs:

	data_folder: a string containing the location of the directory in which dataset.pkl is saved.
				 This dataset is created using data_preperation.py.

	columns: a list of strings containing the columns from the dataset which the user wishes to extract 
			 features from. This includes: id, time, ax_l, ay_l, az_l, ax_r, ay_r, az_r,
		 	 ax_diff, ay_diff, az_diff, a_res_l, a_res_r, a_res_diff.
			 NOTE: if id or time are not included in this list, they will be automatically added as they
			 are necessary.

	overlap: a boolean (either True or False). If True, the overlapping dataset (dataset_overlap.pkl)
			 is loaded; otherwise the non-overlapping one is used.

	all: a boolean (either True or False). If True, feature extraction will be run using all the data; if
		 False, feature extraction will be run using the first trial only, and those features will then be
		 used on all the data.

	est_events: a boolean (either True or False). If True, features will be extracted to estimate whether
				an event occurred within a 100 ms time frame. If False, features will be extracted
				to estimate vertical GRF for the entire timeseries.

	event: A string containing either FS or FO. This will indicate which event the user wants to predict on.
		NOTE: this is only necessary as an input if est_events is True.

	event_type: A string containing either binary or time. This will indicate which type of output the user wants.
		NOTE: this is only necessary as an input if est_events is True.
	
	Outputs:
	This function does not return anything. However, it saves *.csv files in appropriate folders (based on
	the columns chosen) which can be used to fit either a classification or regression model (depending on
	the task required) - see model_fitting.py

	Alex Woodall

	Auckland Bioengineering Institute

	08/04/2020

	'''

    from tsfresh import extract_features, extract_relevant_features, select_features
    from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters
    from tsfresh.feature_extraction.settings import from_columns
    from tsfresh.utilities.dataframe_functions import impute

    import pickle
    import numpy as np
    import pandas as pd
    import os

    # Load data
    try:
        if overlap:
            dataset = pickle.load(
                open(data_folder + "dataset_overlap.pkl", "rb"))

        else:
            dataset = pickle.load(
                open(data_folder + "dataset_no_overlap.pkl", "rb"))

    except FileNotFoundError:
        dataset = pickle.load(open(data_folder + "dataset_200.pkl", "rb"))

    # Number the selected columns which the user chose to use for feature extraction
    columns, columns_num = selected_columns(columns)

    # Create directories for saving
    new_directory = "{}{}\\".format(data_folder,
                                    ("_".join(map(str, columns_num))))
    save_dir = create_directories(new_directory, event, event_type, est_events)

    # Attempt to load features from the save directory.
    try:
        X_features = pd.read_csv(
            "{}features.csv".format(save_dir),
            index_col=0)  # DataFrame containing the features we want
        features_string = X_features.columns
        extraction_settings = from_columns(
            features_string)  # These are the features that we will be using

        pre_extracted = True

    except FileNotFoundError:  # File does not exist
        pre_extracted = False

    # List of the last uids from each key (used when using all trials to extract features)
    uid_last = []

    # Iterate through all the trials in the dataset
    for key in dataset.keys():

        # Create the timeseries based on the user input columns
        for col in columns_num:
            if col == 0:
                timeseries = (
                    dataset[key]['X'])[:, col]  # Only true accelerations
            else:
                timeseries = np.vstack((timeseries, dataset[key]['X'][:, col]))

        # dataset[key].keys() = ['X', 'force', 'y_FS_binary', 'y_FO_binary', 'y_FS_time_to', 'y_FO_time_to']

        # Create y (real data output)
        if est_events:  # If estimating events

            try:
                if event_type == 'binary':
                    y = dataset[key]['y_{}_binary'.format(event)]

                    # Convert to boolean (will remain boolean if already)
                    y = (y == 1.0)

                elif event_type == 'time':
                    y = dataset[key]['y_{}_time_to_next'.format(event)]

                else:
                    print('Event type must either be binary or time')

                    return

            except KeyError:
                print("Event must equal either 'FS' or 'FO'.")

                return

        else:  # Estimating forces
            # possible force = ['Fx', 'Fy', 'Fz'] Assuming z direction is vertical
            y = dataset[key]['y'][:, 2]

        # Convert to pandas DataFrame/Series
        if type(timeseries) is np.ndarray:
            # Needs to be a pandas dataframe
            timeseries = pd.DataFrame(timeseries.T, columns=columns)

            # Convert ID column into integers
            timeseries = timeseries.astype({'id': int})

            if est_events:
                if event_type == 'binary':
                    y = pd.Series(data=y, dtype=bool, name='events')
                elif event_type == 'time':
                    y = pd.Series(data=y, dtype=float, name='events')
            else:
                # Change ID column to fit for regression method
                ID = (np.arange(0, len(timeseries))).astype(int)

                timeseries['id'] = ID

                y = pd.Series(data=y, dtype=float, name='Fz')

        # Save X full dataset
        timeseries.to_csv("{}{}_timeseries.csv".format(save_dir, key),
                          index=True,
                          header=True)

        # Extract features from the first trial and use those for the rest if all == True
        if not all:
            # Extract features using tsFRESH
            if not pre_extracted:
                print('Finding relevant features using {}'.format(key))
                X_filtered = extract_relevant_features(
                    timeseries,
                    y,
                    column_id="id",
                    column_sort="time",
                    default_fc_parameters=ComprehensiveFCParameters())

                # Save filtered features
                X_filtered.to_csv("{}features.csv".format(save_dir),
                                  header=True)

                features_string = X_filtered.columns
                extraction_settings = from_columns(
                    features_string
                )  # These are the features that we will be using

                pre_extracted = True

            if pre_extracted:
                print('Using pre-extracted features for event = {}'.format(
                    event))
                print(str(key))
                X_filtered = extract_features(
                    timeseries,
                    column_id="id",
                    column_sort="time",
                    kind_to_fc_parameters=extraction_settings)

            # Add start_time and mass column to dataframe
            if est_events:
                start_time = dataset[key]['X_starting_time']
                mass = dataset[key]['X_mass_sample']

                X_filtered.insert(0, "start_time", start_time, True)
                X_filtered.insert(1, "mass", mass, True)

            else:
                mass = dataset[key]['X_mass_all']
                X_filtered.insert(0, "mass", mass, True)

            # Save dataframes
            X_filtered.to_csv("{}{}_X.csv".format(save_dir, key),
                              index=True,
                              header=True)
            y.to_csv("{}{}_y.csv".format(save_dir, key),
                     index=True,
                     header=True)

        else:
            try:
                uid_change = timeseries_temp['id'].iloc[-1]

                uid_last.append(uid_change)

                timeseries['id'] = timeseries['id'] + uid_change + 1
                timeseries_temp = timeseries_temp.append(timeseries)
                y_temp = y_temp.append(y, ignore_index=True)

            except NameError:  # *_temp DataFrames do not exist yet
                timeseries_temp = timeseries
                y_temp = y

    if all:
        print('Using all data to extract relevant features')

        # First remove any NaN values in y, this should only be at the end
        print('Extracting all features')

        if est_events:
            X = extract_features(
                timeseries_temp,
                column_id="id",
                column_sort="time",
                default_fc_parameters=ComprehensiveFCParameters(),
                impute_function=impute)

            y = y_temp

            # Remove NaN indices from X and y
            remove_idx = pd.isnull(y.to_numpy()).nonzero()[0]
            y = y.drop(remove_idx)
            X = X.drop(remove_idx)

            print('Selecting relevant features')
            X_filtered = select_features(X, y)

        else:
            X_filtered = extract_relevant_features(
                timeseries_temp,
                y_temp,
                column_id="id",
                column_sort="time",
                default_fc_parameters=ComprehensiveFCParameters())

            # Nothing was dropped in this branch: keep y and remove_idx defined
            # for the per-trial saving loop below
            y = y_temp
            remove_idx = np.array([], dtype=int)

        X_filtered.to_csv("{}features.csv".format(save_dir), header=True)

        # Now save individual datasets
        # Reload DataFrame
        X_features = pd.read_csv("{}features.csv".format(save_dir),
                                 index_col=0)

        # Index values
        names = X_features.index.values

        # Saving individual trials
        print('Saving features for each trial')
        start = 0
        i = 0
        for key in dataset.keys():
            try:
                end_temp = uid_last[i]  # Name of the row

            except IndexError:
                # Last key
                end_temp = X_features.iloc[-1].name

            end = end_temp

            # Find the new end index accounting for removed values
            removed = True

            while removed:
                if end in remove_idx:
                    end -= 1
                else:
                    removed = False

            # end = the name of the row (NOT index) which is the last in the trial
            end_idx = np.where(names == end)[0][0]

            X_save = X_features.iloc[start:end_idx + 1]
            X_save = X_save.reset_index(drop=True)

            y_save = y.iloc[start:end_idx + 1]
            y_save = y_save.reset_index(drop=True)

            start = end_idx + 1
            i += 1

            # Add start_time and mass column to dataframe
            if est_events:
                start_time = dataset[key]['X_starting_time']
                mass = dataset[key]['X_mass_sample']

                # Remove those due to NaNs
                start_time_new = start_time[:len(X_save)]
                mass_new = mass[:len(X_save)]

                X_save.insert(0, "start_time", start_time_new, True)
                X_save.insert(1, "mass", mass_new, True)

            else:
                mass = dataset[key]['X_mass_all']

                # Remove those due to NaNs (should be zero for GRF estimation)
                mass_new = mass[:len(X_save)]
                X_save.insert(0, "mass", mass_new, True)

            # Save
            X_save.to_csv("{}{}_X.csv".format(save_dir, key),
                          index=True,
                          header=True)
            y_save.to_csv("{}{}_y.csv".format(save_dir, key),
                          index=True,
                          header=True)

    return
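
A hypothetical invocation following the docstring (the path and column names are invented):

extract_data(data_folder="C:\\gait_data\\",
             columns=['id', 'time', 'ax_l', 'ay_l', 'az_l'],
             overlap=False,
             all=True,
             est_events=True,
             event='FS',
             event_type='binary')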