def test_extraction_null_as_column_name(self):
        """
        Feature extraction must cope with the integer 0 used as a column label.

        The same three series (values, a two-valued id, a sort key) are laid out
        under different integer labels so that 0 labels the value column, then the
        id column, and finally the value column again with the id and sort labels
        swapped. Two distinct ids go in, so two feature rows must come out.
        """
        values = range(10)
        ids = np.repeat([0, 1], 5)
        order = np.repeat([0, 1, 2, 3, 4], 2)

        # (input frame, id column label, sort column label)
        cases = [
            (pd.DataFrame(data={0: values, 1: ids, 2: order}), 1, 2),
            (pd.DataFrame(data={1: values, 0: ids, 2: order}), 0, 2),
            (pd.DataFrame(data={0: values, 2: ids, 1: order}), 2, 1),
        ]

        for frame, id_label, sort_label in cases:
            features = extract_features(frame, column_id=id_label, column_sort=sort_label)
            self.assertEqual(len(features), 2)
    def test_functional_equality(self):
        """
        `extract_relevant_features` must match the manual two-step pipeline.

        The two-step pipeline is `extract_features` with imputation followed by
        `select_features`. Both routes have to pick the same set of columns and
        produce identical values for them.
        :return:
        """
        df, y = self.create_test_data_sample_with_target()
        column_kwargs = dict(column_id='id', column_value='val', column_kind='kind', column_sort='sort')

        # One-shot route.
        relevant_features = extract_relevant_features(df, y, **column_kwargs)

        # Manual route: extract (with imputation), then select.
        extracted_features = extract_features(df, impute_function=impute, **column_kwargs)
        selected_features = select_features(extracted_features, y)

        column_mismatch_msg = "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format(
            relevant_features.columns, selected_features.columns)
        self.assertEqual(set(relevant_features.columns), set(selected_features.columns), column_mismatch_msg)

        # Align rows and columns of the manual result before comparing values.
        aligned = selected_features.loc[relevant_features.index][relevant_features.columns]
        self.assertTrue(relevant_features.equals(aligned), "Should calculate the same feature values")
示例#3
0
 def test_extract_feature(self):
     """Extracting from a frame with two ids ('a' and 'b') yields two feature rows."""
     ids = np.array(['a', 'a', 'a', 'b', 'b', 'b'])
     times = np.array([0, 1, 2, 0, 1, 2])
     values = np.array([3, 4, 5, 7, 8, 10])
     ts = pd.DataFrame({'id': ids, 'time': times, 'x': values})

     extracted_features = extract_features(ts, column_id='id', column_sort='time')

     self.assertEqual(2, len(extracted_features))
def ts_extract(df, features):
    """
    Extract the requested tsfresh features, using each row's index value as its id.

    :param df: input frame; it is copied, so the caller's frame is not modified.
               Must contain a "time" column (used as the sort column).
    :param features: feature-calculator settings passed as ``default_fc_parameters``.
    :return: the extracted feature frame after in-place imputation.
    """
    # Silence everything below CRITICAL from this logger.
    perf_logger = logging.getLogger('distributed.utils_perf')
    perf_logger.setLevel(logging.CRITICAL)

    frame = df.copy()
    # Every distinct index value becomes its own series id.
    frame['id'] = frame.index

    result = extract_features(
        frame,
        column_id="id",
        column_sort="time",
        default_fc_parameters=features,
        distributor=MapDistributor(),
    )
    impute(result)  # modifies `result` in place
    return result
def ts_feature_extraction(dataframe):
    """
    Extract tsfresh features from a long-format frame and reduce them.

    The frame is grouped by 'period', ordered by 'date', and features are computed
    on 'value'. The raw features are passed through `rm_const_cols` and then
    `pca_transformation` (both defined elsewhere).

    :param dataframe: A pandas dataframe with 'period', 'date' and 'value' columns
    :return: whatever `pca_transformation` returns (presumably a pandas dataframe
             - verify against that helper)
    """
    extraction_kwargs = {
        'column_id': 'period',
        'column_sort': 'date',
        'column_value': 'value',
    }
    raw_features = extract_features(dataframe, **extraction_kwargs)
    return pca_transformation(rm_const_cols(raw_features))
示例#6
0
def generate_global_features(input_df,
                             column_id,
                             column_sort,
                             default_fc_parameters=None,
                             kind_to_fc_parameters=None):
    '''
    Generate per-series ("global") tsfresh features and broadcast them onto every row.

    :param input_df: input dataframe.
    :param column_id: id column name
    :param column_sort: time column name
    :param default_fc_parameters: same as tsfresh. Only used when
        ``kind_to_fc_parameters`` is None.
    :param kind_to_fc_parameters: same as tsfresh. Takes precedence over
        ``default_fc_parameters`` when given.

    :return : a tuple ``(res_df, additional_feature)``: a copy of ``input_df`` with one
        extra column per kept feature, and the list of those column names.
    '''
    if kind_to_fc_parameters is not None:
        fc_kwargs = {"kind_to_fc_parameters": kind_to_fc_parameters}
    else:
        fc_kwargs = {"default_fc_parameters": default_fc_parameters}
    global_feature = extract_features(input_df,
                                      column_id=column_id,
                                      column_sort=column_sort,
                                      **fc_kwargs)

    res_df = input_df.copy()
    # Use the caller-supplied id column; a hard-coded "id" only works when the
    # id column happens to have that name.
    series_ids = input_df[column_id]
    id_list = list(np.unique(series_ids))
    additional_feature = []
    for col_name in global_feature.columns:
        feature_values = global_feature[col_name]
        # any feature that can not be extracted for some id will be dropped
        if feature_values.isna().any():
            continue
        # the per-series constant is written to every row of that series
        for id_name in id_list:
            res_df.loc[series_ids == id_name, col_name] = feature_values.loc[id_name]
        additional_feature.append(col_name)
    return res_df, additional_feature
示例#7
0
def make_extra_ts_featurs(train, meta_train):
    """
    Build three tsfresh feature sets per chunk of object ids and join them column-wise.

    Object ids from ``meta_train`` are processed in chunks of 1000. For each chunk,
    the matching rows of ``train`` are run through three extractions:

    * 'flux' with ``EXTRA_FLUX_PARS``;
    * 'flux' split by 'passband' with ``FC_PASSBAND_V2``;
    * 'flux_by_flux_ratio_sq' split by 'passband' with tsfresh's default settings.

    The third set's columns get the prefix 'flux_by_flux_ratio_sq'.

    :param train: long-format frame of observations.
    :param meta_train: frame exposing an ``object_id`` attribute/column.
    :return: a frame with the three feature sets concatenated along axis 1.
    """
    flux_parts = []
    passband_parts = []
    ratio_parts = []

    for chk in tqdm_chunks(meta_train.object_id.unique(), 1000):
        slc = train[get_membership_mask(train[OBJECT_ID], set(chk))]

        flux_parts.append(
            extract_features(slc,
                             EXTRA_FLUX_PARS,
                             column_value='flux',
                             disable_progressbar=True,
                             **TSKW))

        passband_parts.append(
            extract_features(slc,
                             FC_PASSBAND_V2,
                             column_value='flux',
                             column_kind='passband',
                             disable_progressbar=True,
                             **TSKW))

        ratio_parts.append(
            extract_features(slc,
                             column_value='flux_by_flux_ratio_sq',
                             column_kind='passband',
                             disable_progressbar=True,
                             **TSKW))

    # Stack the chunks of each feature set, then put the sets side by side.
    flux_df = pd.concat(flux_parts)
    passband_df = pd.concat(passband_parts)
    ratio_df = pd.concat(ratio_parts).add_prefix('flux_by_flux_ratio_sq')

    return pd.concat([flux_df, passband_df, ratio_df], axis=1)
示例#8
0
    def test_pandas_no_pivot(self):
        """
        With ``pivot=False`` the result is a long list of (id, variable, value) entries.

        Four input layouts are checked: explicit kind and value columns, kind column
        only, neither (kind column dropped), and neither with the sort column dropped
        as well. Each must contain the expected mean feature, give the expected mean
        for id "5", and have the expected long-format shape.
        """
        df = self.df

        # (input frame, extra column arguments, expected feature name, expected shape)
        scenarios = [
            (df,
             {"column_sort": "time", "column_kind": "dimension", "column_value": "value"},
             "1__mean", (100*20, 3)),
            (df,
             {"column_sort": "time", "column_kind": "dimension"},
             "1__mean", (100*20, 3)),
            (df.drop(columns=["dimension"]),
             {"column_sort": "time"},
             "value__mean", (100*10, 3)),
            (df.drop(columns=["dimension", "time"]),
             {},
             "value__mean", (100*10, 3)),
        ]

        for frame, column_args, feature_name, expected_shape in scenarios:
            X = extract_features(frame, column_id="my_id",
                                 pivot=False,
                                 default_fc_parameters=MinimalFCParameters(),
                                 **column_args)
            X = pd.DataFrame(X, columns=["my_id", "variable", "value"])

            self.assertIn(feature_name, X["variable"].values)
            row_mask = (X["my_id"] == "5") & (X["variable"] == feature_name)
            self.assertAlmostEqual(X[row_mask]["value"].iloc[0], 5.516e-05, 4)
            self.assertEqual(X.shape, expected_shape)
示例#9
0
def tsfresh_extract_features():
    """
    Extract tsfresh features for all train and test CSV files and save them.

    Every ``.csv`` file under ``train_path`` and ``test_path`` is read and
    concatenated, 'time' is parsed with the format '%m%d %H:%M:%S', and features
    are extracted per '渔船ID' using ``fc_parameters_v1``. The result is split by
    position into train and test parts, the train part gets a 'type' label column,
    and both are written to ``./feature/train.csv`` and ``./feature/test.csv``.

    NOTE(review): the positional split takes the first ``len(train_df_list)`` rows
    as training rows. That presumably relies on one vessel id per training file and
    on training ids sorting before test ids in the extracted index - confirm.

    :return: tuple ``(train_df, test_df)`` of extracted feature frames.
    """
    train_df_list = [
        pd.read_csv(os.path.join(train_path, file_name))
        for file_name in os.listdir(train_path)
        if file_name.endswith('.csv')
    ]
    test_df_list = [
        pd.read_csv(os.path.join(test_path, file_name))
        for file_name in os.listdir(test_path)
        if file_name.endswith('.csv')
    ]

    train_df = pd.concat(train_df_list)
    test_df = pd.concat(test_df_list)

    train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
    test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

    all_df = pd.concat([train_df, test_df], sort=False)

    # The label column must not be treated as a time series value.
    df = all_df.drop(columns=['type'])

    extracted_df = extract_features(df, column_id='渔船ID', column_sort='time',
                                    n_jobs=8, kind_to_fc_parameters=fc_parameters_v1)

    n_train = len(train_df_list)
    # Copy the training slice: a column is added to it below, and assigning onto
    # a slice of `extracted_df` raises SettingWithCopyWarning.
    train_df = extracted_df.iloc[:n_train].copy()
    test_df = extracted_df.iloc[n_train:]

    # One label per id: the 'type' of the first row of each group.
    y = [group.iloc[0]['type'] for _, group in all_df.groupby('渔船ID')]

    y_train = y[:train_df.shape[0]]
    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)

    train_df['type'] = le.inverse_transform(y_train)

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs('./feature', exist_ok=True)
    train_df.to_csv('./feature/train.csv')
    test_df.to_csv('./feature/test.csv')

    return train_df, test_df
def ts_feature_extraction(dataframe, num_jobs=0):
    """
    Extract tsfresh features from a long-format frame and reduce them.

    The frame is grouped by 'period', ordered by 'date', and features are computed
    on 'value'. The raw features are passed through `rm_const_cols` and then
    `pca_transformation` (both defined elsewhere).

    :param dataframe: A pandas dataframe with 'period', 'date' and 'value' columns
    :param num_jobs: integer, forwarded to tsfresh as ``n_jobs``
    :return: whatever `pca_transformation` returns (presumably a pandas dataframe
             - verify against that helper)
    """
    extraction_kwargs = {
        'column_id': 'period',
        'column_sort': 'date',
        'column_value': 'value',
        'n_jobs': num_jobs,
    }
    raw_features = extract_features(dataframe, **extraction_kwargs)
    return pca_transformation(rm_const_cols(raw_features))
示例#11
0
def pd_ts_tsfresh_features(df: pd.DataFrame,
                           cols: list = None,
                           pars: dict = None):
    """
    Compute tsfresh features for the selected columns of ``df``.

    A running 'time' column (0..n-1 in row order) is added as the sort column.
    When ``pars`` carries no 'id_col' entry, a constant 'id' column is added so
    the whole frame is treated as a single series.

    :param df: input frame; it is not modified.
    :param cols: columns to use; ``None`` means all columns of ``df``.
    :param pars: optional settings; only the 'id_col' key is read here. When
        given, that column must be among the selected columns.
    :return: tuple ``(X_feat, column_names)`` with the extracted feature frame and
        the list of its column names.
    """
    from tsfresh import extract_features

    # The None defaults used to crash on `pars.get` / `df[None]`.
    pars = {} if pars is None else pars
    selected = df if cols is None else df[cols]

    # Work on a copy so the helper columns never touch (a slice of) the caller's frame.
    single_row_df = selected.copy()
    single_row_df["time"] = range(0, len(single_row_df.index))

    id_col = pars.get("id_col", "id")
    if "id_col" not in pars:
        single_row_df["id"] = 1

    X_feat = extract_features(single_row_df,
                              column_id=id_col,
                              column_sort='time')
    return X_feat, X_feat.columns.to_list()
示例#12
0
def test_features_on_btc():
    """Rolling a 4-row and a 2-row series yields six windows, hence six feature rows."""
    columns = {
        "id": [1, 1, 1, 1, 2, 2],
        "time": [1, 2, 3, 4, 8, 9],
        "x": [1, 2, 3, 4, 10, 11],
        "y": [5, 6, 7, 8, 12, 13],
    }
    df = pd.DataFrame(columns)

    df_rolled = roll_time_series(df, column_id="id", column_sort="time")
    rolled_id_count = df_rolled['id'].nunique()
    assert rolled_id_count == 6

    df_features = extract_features(df_rolled, column_id="id", column_sort="time")
    n_feature_rows = df_features.shape[0]
    assert n_feature_rows == 6
示例#13
0
 def extract_features(self,
                      ts,
                      column_id='id',
                      impute_function=impute,
                      default_fc_parameters=ComprehensiveFCParameters(),
                      show_warnings=False,
                      profile=False):
     '''
     Forward to the module-level ``extract_features`` using this object's ``n_jobs``.

     Inside a method body the bare name ``extract_features`` resolves to the
     module-level function, not to this method, so the call does not recurse.
     '''
     options = dict(
         column_id=column_id,
         impute_function=impute_function,
         default_fc_parameters=default_fc_parameters,
         n_jobs=self.n_jobs,
         show_warnings=show_warnings,
         profile=profile,
     )
     return extract_features(ts, **options)
def get_features(file_name, count):
    """
    Read a CSV file, extract tsfresh features from it, and save them to a CSV file.

    The last column of the file is dropped before extraction. Output goes to
    'tsfresh_extractedFeatures<count>.csv' in the current working directory.

    :param file_name: path of the input CSV; needs "id" and "time" columns.
    :param count: value embedded in the output file name and the progress message.
    """
    csv_data = pd.read_csv(file_name)
    # Everything except the last column is treated as time series data.
    timeseries = csv_data.iloc[:, :-1]

    print('start getfeatures...')
    # All features (tsfresh default settings).
    extracted_features = extract_features(timeseries, column_id="id", column_sort="time")
    impute(extracted_features)  # in place

    print('start save ...')
    output_name = 'tsfresh_extractedFeatures{!s}.csv'.format(count)
    extracted_features.to_csv(output_name)

    print('{!s}  end'.format(count))
示例#15
0
def tsfresh_calculator(timeseries, column_id, column_sort, cleanup, y=None):
    """
    Extract tsfresh features, optionally keeping only those relevant to a target.

    :param timeseries: time series container passed to tsfresh.
    :param column_id: name of the id column.
    :param column_sort: name of the sort (time) column.
    :param cleanup: the string "Yes" selects ``extract_relevant_features``; any
        other value selects plain ``extract_features``.
    :param y: target passed to ``extract_relevant_features``; required when
        ``cleanup == "Yes"`` and ignored otherwise.
    :return: the extracted feature frame.
    :raises TypeError: if ``cleanup == "Yes"`` and no ``y`` was supplied.
    """
    if cleanup == "Yes":
        # extract_relevant_features cannot run without a target. Calling it
        # without `y` always failed with a less helpful TypeError.
        if y is None:
            raise TypeError(
                "tsfresh_calculator: cleanup == 'Yes' requires the target argument "
                "'y' for extract_relevant_features")

        from tsfresh import extract_relevant_features

        return extract_relevant_features(timeseries,
                                         y,
                                         column_id=column_id,
                                         column_sort=column_sort)

    from tsfresh import extract_features

    return extract_features(timeseries,
                            column_id=column_id,
                            column_sort=column_sort)
示例#16
0
    def _extract_features(self, data_frame):
        """
        Roll the input into windows and extract minimal plus index-based features.

        The frame is rolled with ``max_timeshift=self.memory``. Features are then
        computed twice on the rolled data, once with ``MinimalFCParameters`` and
        once with ``IndexBasedFCParameters``, and joined column-wise. NaN and
        infinite entries are replaced by 0.0.

        :param data_frame: frame containing the ``self.column_id`` and
            ``self.time_stamp`` columns.
        :return: the combined feature frame.
        """
        id_and_sort = dict(column_id=self.column_id, column_sort=self.time_stamp)

        df_rolled = roll_time_series(data_frame, max_timeshift=self.memory, **id_and_sort)

        parameter_sets = (
            tsfresh.feature_extraction.MinimalFCParameters(),
            tsfresh.feature_extraction.settings.IndexBasedFCParameters(),
        )
        parts = [
            tsfresh.extract_features(df_rolled, default_fc_parameters=fc_parameters, **id_and_sort)
            for fc_parameters in parameter_sets
        ]

        extracted_features = pd.concat(parts, axis=1)

        # Drop the intermediate frames before collecting garbage.
        del parts
        gc.collect()

        nan_mask = np.isnan(extracted_features)
        extracted_features[nan_mask] = 0.0

        inf_mask = np.isinf(extracted_features)
        extracted_features[inf_mask] = 0.0

        return extracted_features
def add_tsfresh_participant(data, tsfresh_features, columns, k):
    """
    Append rolling-window tsfresh features to each participant's frame.

    :param data: indexable collection of frames, addressed as ``data[0]`` ..
        ``data[len(data) - 1]``; its entries are replaced in place.
    :param tsfresh_features: 'minimal', 'efficient' or 'comprehensive'; any other
        value falls back to the minimal settings.
    :param columns: value columns to compute features for.
    :param k: ``max_timeshift`` passed to ``roll_time_series``.
    :return: ``data``, with every entry widened by the extracted feature columns.
    """
    # Pick the feature-calculator settings; unknown names mean "minimal".
    if tsfresh_features == 'efficient':
        settings = EfficientFCParameters()
    elif tsfresh_features == 'comprehensive':
        settings = ComprehensiveFCParameters()
    else:
        settings = MinimalFCParameters()

    for participant in range(len(data)):
        frame = data[participant]

        # Helper columns for tsfresh: a single id and the row index as sort key.
        frame['id'] = 0
        frame['index'] = frame.index

        # Rolling also assigns new ids, one per window.
        rolled_series = roll_time_series(frame,
                                         column_id='id',
                                         column_sort='index',
                                         max_timeshift=k)

        # One feature frame per value column, re-indexed from 0 so they can be
        # concatenated side by side with the participant's frame.
        all_features = [
            extract_features(rolled_series,
                             default_fc_parameters=settings,
                             column_id='id',
                             column_sort='index',
                             column_value=column).reset_index(drop=True)
            for column in columns
        ]
        extracted = pd.concat(all_features, axis=1)

        # The helper columns must not leak into the returned data.
        del frame['id']
        del frame['index']

        data[participant] = pd.concat([frame, extracted], axis=1)

    return data
示例#18
0
def get_features(file_name, count):
    """
    Read a CSV file, extract tsfresh features from it, and save them to a CSV file.

    The last column and the 'Unnamed: 0' column are excluded from the time series
    data. The ('id', 'y') columns are passed through `handle_y` and printed, but
    are not used for the extraction. Output goes to
    'tsfresh_extractedFeatures<count>.csv' in the current working directory.

    :param file_name: path of the input CSV.
    :param count: value embedded in the output file name and the progress message.
    """
    csv_data = pd.read_csv(file_name)

    timeseries = csv_data.iloc[:, :-1].drop(columns='Unnamed: 0')
    y = handle_y(csv_data[['id', 'y']])

    print(timeseries)
    print(y)

    print('start getfeatures...')
    # All features (tsfresh default settings).
    extracted_features = extract_features(timeseries, column_id="id", column_sort="time")
    impute(extracted_features)  # in place

    output_name = 'tsfresh_extractedFeatures{!s}.csv'.format(count)
    extracted_features.to_csv(output_name)
    print('{!s}  end'.format(count))
示例#19
0
 def extract_select_inference_features(self, data, args=None):
     """
     Extract only a given set of features and impute the result.

     Used at inference time to compute the same features that were selected during
     training; see
     https://stackoverflow.com/questions/50426458/retrieve-specific-features-by-using-tsfresh-in-python

     :param data: pandas.DataFrame
     :param args: indexable with at least four entries, read as
         (column_id, n_jobs, chunksize, kind_to_fc_parameters). The default of
         None is not usable.
     :return: the return value of ``impute`` applied to the extracted features
     """
     column_id = args[0]
     n_jobs = args[1]
     chunksize = args[2]
     kind_to_fc_parameters = args[3]

     features = extract_features(data,
                                 column_id=column_id,
                                 n_jobs=n_jobs,
                                 chunksize=chunksize,
                                 kind_to_fc_parameters=kind_to_fc_parameters)
     return impute(features)
示例#20
0
 def extract_features(self, data, args=None):
     """
     Extract features with tsfresh's default settings and impute the result.

     The first three entries of ``args`` are printed before the extraction runs.
     The bare name ``extract_features`` in the body refers to the module-level
     function, not to this method.

     :param data: pandas.DataFrame
     :param args: indexable with at least three entries, read as
         (column_id, n_jobs, chunksize). The default of None is not usable.
     :return: the return value of ``impute`` applied to the extracted features
     """
     for position in range(3):
         print(args[position])

     options = {
         'column_id': args[0],
         'n_jobs': args[1],
         'chunksize': args[2],
     }
     return impute(extract_features(data, **options))
示例#21
0
 def predict_gamma_for_timeseries(self, timeseries_df):
     '''
     Predict the best gamma based on time series properties.

     tsfresh features are extracted from the series, restricted to
     ``selected_features``, cleaned of NaN and infinite values (replaced by 0),
     and fed to ``self.metadata.estimator``. This is a best-effort prediction:
     if any step fails, gamma falls back to 1.0.

     :param timeseries_df: frame holding the series; a constant 'dummy_col' column
         is added to it in place.
     :return: the predicted gamma, or 1.0 when prediction was not possible.
     '''
     # The library requires an id column, but all ids are the same for our
     # timeseries, so a dummy id column is added.
     timeseries_df['dummy_col'] = 'dummy'
     try:
         renamed = timeseries_df.rename(columns={self.id_col: 'value'})
         features_df = extract_features(renamed,
                                        column_id='dummy_col', column_sort=self.timestamp_col,
                                        disable_progressbar=True)[selected_features].fillna(0)
         features_df = features_df.replace(np.inf, 0)
         features_df = features_df.replace(-np.inf, 0)
         gamma = self.metadata.estimator.predict(features_df)[0]
     except Exception:
         # Deliberate fallback. Catch Exception rather than everything so that
         # KeyboardInterrupt and SystemExit still propagate.
         gamma = 1.0
     return gamma
    def extract(window):
        """
        Extract tsfresh features from the rows whose timestamp lies in [0, window).

        Rows of ``insheet`` are filtered by 'timestamp', features are computed per
        'mrn_csn_pair' and 'measure' on 'value', all-NaN feature columns are
        dropped, and the remainder is imputed in place.

        :param window: exclusive upper bound on 'timestamp'.
        :return: feature frame with 'mrn_csn_pair' as its first column and a fresh
            0..n-1 index.
        """
        in_window = (insheet['timestamp'] < window) & (insheet['timestamp'] >= 0)
        first = insheet[in_window]
        first = first.sort_values('mrn_csn_pair').reset_index(drop=True)

        extracted_flowsheet = tsfresh.extract_features(first, column_id='mrn_csn_pair', column_sort='timestamp', column_kind='measure', column_value='value', n_jobs=8)
        # Drop features that are only NaN
        extracted_flowsheet = extracted_flowsheet.dropna(axis=1, how='all')

        tsfresh.utilities.dataframe_functions.impute(extracted_flowsheet)
        # Add back the mrn_csn_pair. Take the ids from the extracted frame's own
        # index rather than from the unfiltered sheet: a patient with no rows inside
        # the window has no feature row, and inserting the full id list would then
        # fail with a length mismatch.
        extracted_flowsheet.insert(0, 'mrn_csn_pair', extracted_flowsheet.index.to_numpy())

        return extracted_flowsheet.reset_index(drop=True)
示例#23
0
def tsfresh_features(signal_df, channels):
    '''
    Calculate features of sensor signal using TSFresh package.

    All rows are treated as a single series: a constant 'id' column is added
    before extraction.

    :param signal_df: dataframe housing sensor signals; it is not modified
    :param channels: list of channels (columns) of the sensor signal to calculate
        TSFresh features for
    :return: dataframe of calculated features for each sensor channel, with a fresh
        0..n-1 index
    '''

    # `assign` builds a new frame. Writing the id with `.loc` onto the column
    # selection raises SettingWithCopyWarning.
    single_series_df = signal_df[channels].assign(id=1)
    tsfresh_df = tsf.extract_features(single_series_df,
                                      column_id='id',
                                      disable_progressbar=True)

    return tsfresh_df.reset_index(drop=True)
示例#24
0
def extract_features(data_windows: DataFrame,
                     features: List[Feature]) -> Dict[Feature, DataFrame]:
    """
    Compute the requested tsfresh features and return one single-column frame each.

    Every feature's ``value`` is lower-cased and looked up in
    ``ComprehensiveFCParameters`` to build the extraction settings. The i-th
    requested feature is mapped to the i-th column of the extracted frame.

    NOTE(review): the positional mapping presumably relies on tsfresh emitting
    exactly one column per feature, in request order - confirm.

    :param data_windows: frame with an "id" column identifying each window.
    :param features: the features to compute.
    :return: mapping from each feature to a one-column frame of its values.
    """
    calculator_names = [str(feature.value).lower() for feature in features]
    settings = {name: ComprehensiveFCParameters()[name] for name in calculator_names}

    extracted: DataFrame = tsfresh.extract_features(
        data_windows,
        column_id="id",
        default_fc_parameters=settings,
        disable_progressbar=True)

    return {
        feature: extracted.iloc[:, [position]]
        for position, feature in enumerate(features)
    }
示例#25
0
def extractFeatures(rawData):
    """
    Extract the comprehensive tsfresh feature set, imputing during extraction.

    Progress messages and the resulting shape are printed along the way.

    :param rawData: frame with an 'id' column; no explicit kind or value column
        is given to tsfresh.
    :return: the extracted (and imputed) feature frame.
    """
    print("\nSetting extraction settings")
    extraction_settings = ComprehensiveFCParameters()

    print("Before extracting features")
    options = dict(
        column_id='id',
        column_value=None,
        column_kind=None,
        impute_function=impute,
        default_fc_parameters=extraction_settings,
    )
    X = extract_features(rawData, **options)
    print("After extracting features")

    n_features = X.shape[1]
    print("Number of extracted features: {}.".format(n_features))
    print("\nShape of X: ")
    print(X.shape)

    return X
示例#26
0
    def transform(self, X, y=None):
        """Extract tsfresh features from a nested DataFrame.

        Parameters
        ----------
        X : pd.DataFrame
            nested pandas DataFrame of shape [n_samples, n_columns]
        y : pd.Series, optional (default=None)
            Not used by this method.

        Returns
        -------
        Xt : pandas DataFrame
          Extracted features, re-indexed to the index of the (possibly
          re-indexed) input
        """
        # Validate state and input.
        self.check_is_fitted()
        X = check_X(X, coerce_to_pandas=True)

        # tsfresh groups by index value, so duplicated index values would be
        # merged; fall back to a fresh 0..n-1 index in that case.
        n_rows = X.shape[0]
        if X.index.nunique() < n_rows:
            warn(
                "tsfresh requires a unique index, but found "
                "non-unique. To avoid this warning, please make sure the index of X "
                "contains only unique values."
            )
            X = X.reset_index(drop=True)

        long_format = from_nested_to_long(X)

        # Imported here so tsfresh stays an optional dependency.
        from tsfresh import extract_features

        extraction_params = self._get_extraction_params()
        features = extract_features(
            long_format,
            column_id="index",
            column_value="value",
            column_kind="column",
            column_sort="time_index",
            **extraction_params,
        )

        # tsfresh seems to sort the index for long-format input; restore the
        # row order of the input data.
        return features.reindex(X.index)
    def transform(self, X, y=None):
        """
        Extract tsfresh features, sanitise column names, and re-index the result.

        :param X: input frame containing ``self.column_id`` and ``self.column_sort``.
        :param y: not used by this method.
        :return: feature frame whose index is the second element of each original
            index entry.
        """
        options = dict(
            column_id=self.column_id,
            column_sort=self.column_sort,
            chunksize=self.chunk_size,
            default_fc_parameters=self.default_fc_parameters,
            n_jobs=self.n_jobs,
        )
        feats = extract_features(X, **options)

        # LightGBM doesn't like "-" or "." in feature names: collapse every run of
        # characters outside [A-Za-z0-9_] into a single underscore.
        def sanitise(column_name):
            return re.sub("[^A-Za-z0-9_]+", "_", column_name)

        feats = feats.rename(columns=sanitise)

        # tsfresh returned tuple index entries here; keep only the second element
        # (the datetime part).
        second_elements = feats.index.map(lambda entry: entry[1])
        return feats.set_index(second_elements, drop=True)
示例#28
0
def compute_tsfresh_features(x, save_path, nb_splits=8, which_set='training'):
    """
    Extract tsfresh features from ``x`` in row chunks and stream them to a CSV file.

    The rows are cut into ``nb_splits`` consecutive chunks; the last chunk also
    takes the remainder. The first chunk creates/overwrites ``save_path`` with a
    header, later chunks are appended without one.

    :param x: frame with a 'neuron_id' column; must be accepted by ``TSFormatting``.
    :param save_path: output CSV path.
    :param nb_splits: number of chunks.
    :param which_set: label used only in the progress message.
    """
    print('Processing %s set...' % (which_set))
    n = x.shape[0]
    split_breaks = [int(n / nb_splits) * i for i in range(nb_splits)] + [n]

    chunk_bounds = zip(split_breaks[:-1], split_breaks[1:])
    for i, (start, stop) in enumerate(chunk_bounds):
        print('Number of rows being processed:', stop - start)
        chunk = x.iloc[start:stop]

        features = extract_features(TSFormatting().transform(chunk),
                                    column_id='id', column_sort='time',
                                    default_fc_parameters=EfficientFCParameters())
        # Assignment aligns on the index of `features`.
        features['neuron_id'] = chunk['neuron_id']

        is_first = (i == 0)
        features.to_csv(save_path,
                        mode='w' if is_first else 'a',
                        header=is_first,
                        index=True)
        del features
示例#29
0
    def run(self):
        """
        Melt the validation sales table to long format and dump tsfresh features.

        The wide table is unpivoted so that each row holds one ('d', 'sales') pair
        per item; features are then extracted per 'id', ordered by 'd', and handed
        to ``self.dump``.
        """
        raw: RawData = self.load("raw")

        identifier_columns = [
            "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
        ]
        long_sales = pd.melt(
            raw.sales_train_validation,
            id_vars=identifier_columns,
            var_name="d",
            value_name="sales",
        )

        # Only the id, the ordering column and the value are given to tsfresh.
        tsfresh_input = long_sales[["id", "d", "sales"]]
        tsfresh_df = extract_features(tsfresh_input, column_id="id", column_sort="d")

        self.dump(tsfresh_df)
示例#30
0
def time_series_analyis(data):
    '''
    Extract tsfresh features per animal from the provided dataset.

    The 'stopped' column is left out before extraction because it holds nominal
    values. Features are computed per 'animal_id', ordered by 'time', and imputed
    in place.

    :param data: pandas DataFrame with 'animal_id' and 'time' columns.
    :return: the imputed feature frame.
    '''
    excluded_columns = ['stopped']
    kept_columns = data.columns.difference(excluded_columns)
    df = data[kept_columns]

    extracted_features = extract_features(df, column_id='animal_id', column_sort='time')
    impute(extracted_features)

    return extracted_features
示例#31
0
def jacky_feature(path = './data/data_odiginal/test', file_format='xls', num_files=40, Trainable=True):
    """
    Load signal spreadsheets, extract tsfresh features, and keep a fixed feature subset.

    Each ``*.xls`` file under ``path`` is read without a header. When ``Trainable``
    is true, the first cell of the last row is taken as the file's answer and that
    row is removed. All files are stacked, cast to float32, min-max scaled, and
    labelled with an 'id' (file number) and 'time' (row number within the file).

    NOTE(review): the id/time construction assumes exactly ``num_files`` files of
    7500 rows each. The glob pattern is hard-coded to ``*.xls`` while the file name
    listing filters on ``file_format`` - confirm both are intended.

    :param path: directory holding the spreadsheets.
    :param file_format: suffix used to filter the returned file name list.
    :param num_files: number of files expected.
    :param Trainable: whether the last row of each file carries the answer.
    :return: tuple ``(df_feature, f, ans)``: the selected features, the matching
        file names in ``path``, and the collected answers (empty if not Trainable).
    """
    signal_columns = ['1st', '2nd', '3rd', '4th']
    rows_per_file = 7500

    ans = []
    file_list = []
    for file in glob.glob(path + "/*.xls"):
        print(file)
        df = pd.read_excel(file, header=None)
        if Trainable:
            # The last row holds the answer; it must not reach the signal data.
            ans.append(df.iloc[-1, 0])
            df = df[:-1]
        file_list.append(df)

    df_con = pd.concat(file_list, ignore_index=True).astype('float32')
    df_con.columns = list(signal_columns)
    df_con['id'] = pd.Series(np.repeat(np.arange(num_files), rows_per_file), index=df_con.index)
    df_con['time'] = pd.Series(np.tile(np.arange(rows_per_file), num_files), index=df_con.index)

    f = []
    for file in os.listdir(path):
        if not file.endswith(file_format):
            continue
        print(file)
        f.append(file)

    min_max_scaler = MinMaxScaler()
    df_con[signal_columns] = min_max_scaler.fit_transform(df_con[signal_columns])

    print('feature extract.')
    df_feature = extract_features(df_con, column_id='id', column_sort='time')
    features_filtered = [
        'id',
        '2nd__fft_coefficient__coeff_76__attr_"imag"',
        '3rd__fft_coefficient__coeff_24__attr_"real"',
        '2nd__fft_coefficient__coeff_94__attr_"real"',
        '2nd__partial_autocorrelation__lag_3',
        '3rd__fft_coefficient__coeff_62__attr_"abs"',
        '2nd__fft_coefficient__coeff_99__attr_"imag"',
        '3rd__fft_coefficient__coeff_57__attr_"real"',
        '2nd__fft_coefficient__coeff_96__attr_"real"',
        '1st__energy_ratio_by_chunks__num_segments_10__segment_focus_1',
        '2nd__fft_coefficient__coeff_11__attr_"angle"',
        '3rd__fft_coefficient__coeff_73__attr_"imag"',
        '1st__fft_coefficient__coeff_41__attr_"imag"',
        '2nd__fft_coefficient__coeff_81__attr_"real"',
    ]
    print('feature select.')
    # Select the fixed subset, then drop its first column ('id').
    df_feature = df_feature[features_filtered].iloc[:, 1:]

    print(f)

    return df_feature, f, ans
示例#32
0
    def extract(self, data):
        """
        Build the design matrix of tsfresh features for the given sensor data.

        Features are computed per 'batch_id' and 'metric_id' on 'sensor_value',
        ordered by 'end_time_stamp', using ``self._fc_parameters`` and
        ``self._num_of_cores_to_use`` jobs. The result is imputed in place and the
        elapsed time is logged at debug level.

        :param data: pandas DataFrame without missing values.
        :return: the imputed design matrix.
        """
        assert isinstance(data, pd.DataFrame)
        # Missing values are not accepted in the input.
        has_missing = pd.isnull(data).values.any()
        assert not has_missing, 'data should not contain missing values.'

        log.debug('Running Global feature extractor ..')
        gfe_start_time = time.time()

        extraction_options = dict(
            default_fc_parameters=self._fc_parameters,
            column_id='batch_id',
            column_sort='end_time_stamp',
            column_kind='metric_id',
            column_value='sensor_value',
            n_jobs=self._num_of_cores_to_use,
        )
        design_matrix = extract_features(data, **extraction_options)

        # Built-in tsfresh imputation, applied column by column and in place:
        # NaN -> median, -inf -> min, +inf -> max. A column with no finite values
        # at all is filled with zeros, and all columns end up as np.float64.
        # (Passing impute_function=impute to extract_features() would do the same.)
        impute(design_matrix)
        # TODO: assert that none of the columns was filled with zeros

        # TODO: think about feature selection as well (extract_relevant_features), see:
        # https://github.com/blue-yonder/tsfresh/blob/master/notebooks/robot_failure_example.ipynb
        # note though that this may be problematic for real-time ts anomaly detection

        gfe_end_time = time.time()
        elapsed_seconds = gfe_end_time - gfe_start_time
        gfe_duration = round(elapsed_seconds / 60, 2)

        log.debug(
            'Done running Global feature extractor [Total time: {} mins.].'.
            format(gfe_duration))

        return design_matrix
示例#33
0
    def test_local_dask_cluster_extraction(self):
        """Extraction through a one-worker local dask distributor gives the known values."""
        Distributor = LocalDaskDistributor(n_workers=1)

        df = self.create_test_data_sample()
        extracted_features = extract_features(df, column_id="id", column_sort="sort", column_kind="kind",
                                              column_value="val",
                                              distributor=Distributor)

        self.assertIsInstance(extracted_features, pd.DataFrame)

        # Expected per-id values for a handful of feature columns.
        expected_values = [
            ("a__maximum", [71, 77]),
            ("a__sum_values", [691, 1017]),
            ("a__abs_energy", [32211, 63167]),
            ("b__sum_values", [757, 695]),
            ("b__minimum", [3, 1]),
            ("b__abs_energy", [36619, 35483]),
            ("b__mean", [37.85, 34.75]),
            ("b__median", [39.5, 28.0]),
        ]
        for feature_name, expected in expected_values:
            actual = getattr(extracted_features, feature_name)
            self.assertTrue(np.all(actual == np.array(expected)))
示例#34
0
def ts_all_features(data):
    """
    Run the tsfresh feature extraction on the record data.

    The column 'stopped' is dropped first as it has nominal values. The remaining
    columns are handed to ``tsfresh.extract_features`` with 'animal_id' as id column
    and 'time' as sort column, and the resulting feature matrix is passed to
    tsfresh's ``impute`` before it is returned.

    :param data: pandas DataFrame, containing preprocessed movement records and features.
    :return: pandas DataFrame, containing the extracted time series features.
    """
    excluded_columns = ['stopped']
    kept_columns = data.columns.difference(excluded_columns)
    records = data[kept_columns]

    feature_matrix = tsfresh.extract_features(records, column_id='animal_id', column_sort='time')
    tsfresh.utilities.dataframe_functions.impute(feature_matrix)

    return feature_matrix
示例#35
0
文件: yac.py 项目: dangom/ica-yac
    def fit(self, data, labels):
        """
        Extract features from ``data``, keep those that tsfresh selects as relevant for
        ``labels`` and train a random forest classifier on them.

        The outcome is stored on the instance: ``relevant_features`` (selected column
        names), ``settings`` (extraction settings derived from the selected columns),
        ``classifier`` (the fitted forest) and ``trained`` (set to True).

        :param data: time series container handed to ``tsfresh.extract_features`` with
            'level_0' as id column and 'level_1' as sort column.
        :param labels: target values used for the feature selection and the training.
        """
        all_features = tsfresh.extract_features(
            data,
            column_id='level_0', column_sort='level_1',
            default_fc_parameters=self.def_settings,
            distributor=self.distributor)

        # Remove NaNs, if any
        tsfresh.utilities.dataframe_functions.impute(all_features)

        selected = tsfresh.select_features(all_features, labels, fdr_level=1e-15)
        self.relevant_features = selected.columns
        self.settings = tsfresh.feature_extraction.settings.from_columns(selected)

        forest = RandomForestClassifier(n_estimators=40)
        forest.fit(selected, labels)

        self.classifier = forest
        self.trained = True
示例#36
0
文件: yac.py 项目: dangom/ica-yac
    def predict(self, data):
        """
        Predict labels for ``data`` with the fitted or a pre-trained classifier.

        If ``self.trained`` is truthy, the classifier, settings and relevant feature
        names stored on the instance are used. Otherwise they are loaded from the
        pickle file ``classifiers/<architecture>.pkl`` located next to this module.

        :param data: time series container handed to ``tsfresh.extract_features`` with
            'level_0' as id column and 'level_1' as sort column.
        :return: the result of the classifier's ``predict`` on the relevant feature columns.
        :raises AssertionError: if no fit was performed and ``self.architecture`` is None.
        """
        if not self.trained:
            if self.architecture is None:
                # Raised explicitly instead of using ``assert`` so the check is not
                # stripped when Python runs with -O.
                raise AssertionError('No classifier selected and no fit performed.')
            filename = os.path.join(os.path.dirname(__file__),
                                    'classifiers', self.architecture + '.pkl')
            # NOTE(review): pickle.load can execute arbitrary code from the file; this
            # is only acceptable as long as the bundled classifier files are trusted.
            with open(filename, 'rb') as f:
                arch = pickle.load(f)
            clf = arch['clf']
            settings = arch['settings']
            relevant_features = arch['relevant_features']
        else:
            clf = self.classifier
            settings = self.settings
            relevant_features = self.relevant_features

        # NOTE(review): the features are not passed through impute() here, whereas the
        # training path of this snippet's class does so -- confirm NaNs cannot occur.
        features = tsfresh.extract_features(data,
                                            column_id='level_0',
                                            column_sort='level_1',
                                            default_fc_parameters=settings['0'])

        return clf.predict(features[relevant_features])
    def test_extraction_runs_through(self):
        """
        Extract features for the rows of ``self.X`` with an id below 3 and check that the
        result is non-empty and indexed by exactly the ids 1 and 2.
        """
        subset = self.X[self.X.id < 3]
        features = extract_features(subset, column_id="id", column_sort="time")

        six.assertCountEqual(self, features.index.values, [1, 2])
        self.assertGreater(len(features), 0)
示例#38
0
def main(console_args=None):
    """
    Command line entry point: read time series from a whitespace-separated CSV file,
    extract features from them and write the features to another CSV file.

    Without ``--csv-with-headers`` the file is read without a header row and passed
    through ``_preprocess`` before the extraction; the id/sort/value columns are then
    expected to be named "id", "time" and "value". With the flag, the first row is
    used as header and the column names come from the ``--column-*`` options.

    :param console_args: list of argument strings to parse; ``None`` lets argparse
        fall back to ``sys.argv``.
    :raises AttributeError: if any ``--column-*`` option is passed without
        ``--csv-with-headers``.
    """
    parser = argparse.ArgumentParser(description="Extract features from time series stored in a CSV file and "
                                                 "write them back into another CSV file. The time series in the CSV "
                                                 "file should either have one of the dataframe-formats described in "
                                                 "http://tsfresh.readthedocs.io/en/latest/text/data_formats.html, "
                                                 "which means you have to supply the --csv-with-headers flag "
                                                 "or should be in the form "
                                                 "[time series 1 values ..., time series 2 values ...] "
                                                 "where you should not add the --csv-with-headers flag. "
                                                 "The CSV is expected to be space-separated.")
    parser.add_argument("input_file_name", help="File name of the input CSV file to read in.")
    parser.add_argument("--output-file-name", help="File name of the output CSV file to write to. "
                                                   "Defaults to input_file_name.features.csv",
                        default=None)

    parser.add_argument("--column-sort", help="Column name to be used to sort the rows. "
                                              "Only available when --csv-with-headers is enabled.",
                        default=None)
    parser.add_argument("--column-kind", help="Column name where the kind column can be found. "
                                              "Only available when --csv-with-headers is enabled.",
                        default=None)
    parser.add_argument("--column-value", help="Column name where the values can be found. "
                                               "Only available when --csv-with-headers is enabled.",
                        default=None)
    parser.add_argument("--column-id", help="Column name where the ids can be found. "
                                            "Only available when --csv-with-headers is enabled.",
                        default=None)

    parser.add_argument('--csv-with-headers', action='store_true',
                        help="Treat the first row of the CSV file as column names. "
                             "Required for the --column-* options.")
    args = parser.parse_args(console_args)

    if (args.column_id or args.column_kind or args.column_sort or args.column_value) and (not args.csv_with_headers):
        raise AttributeError("You can only pass in column-value, column-kind, column-id or column-sort if "
                             "--csv-with-headers is enabled.")

    if args.csv_with_headers:
        column_kind = args.column_kind
        column_sort = args.column_sort
        column_value = args.column_value
        column_id = args.column_id
        header = 0
    else:
        # Header-less input: these are the fixed column names used together with the
        # frame returned by _preprocess below.
        column_kind = None
        column_sort = "time"
        column_value = "value"
        column_id = "id"
        header = None

    # Read in CSV file; sep=r"\s+" is the non-deprecated spelling of delim_whitespace=True
    input_file_name = args.input_file_name
    df = pd.read_csv(input_file_name, sep=r"\s+", header=header)

    if not args.csv_with_headers:
        df = _preprocess(df)

    df_features = extract_features(df, column_kind=column_kind,
                                   column_sort=column_sort, column_value=column_value,
                                   column_id=column_id)

    # re-cast index from float to int
    df_features.index = df_features.index.astype('int')

    # write to disk
    default_out_file_name = os.path.splitext(input_file_name)[0] + '.features.csv'
    output_file_name = args.output_file_name or default_out_file_name
    df_features.to_csv(output_file_name)