def extract_relevant_dataset_features(dataset, all_features_train,
                                      target_column_train, all_features_test,
                                      target_column_test):
    """Select tsfresh-relevant features for train and test sets, keep only
    the columns both sets share, prepend the target column, and pickle the
    results under ../Pickle/RelevantFeatures/{Train,Test}/<dataset>.pkl.

    :param dataset: name used for the output pickle files
    :param all_features_train: DataFrame of extracted features (train)
    :param target_column_train: target vector for the train set
    :param all_features_test: DataFrame of extracted features (test)
    :param target_column_test: target vector for the test set
    """

    # Keep only features statistically relevant to each split's target.
    relevant_features_train = select_features(all_features_train,
                                              target_column_train)
    relevant_features_test = select_features(all_features_test,
                                             target_column_test)
    logger.info(
        'Relevant features train set: {0}, Relevant features test set: {1}'.
        format(relevant_features_train.shape, relevant_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)
    '''
    # Deleting columns (features) with NaN value
    relevant_features_train = relevant_features_train.dropna(axis = 1)
    relevant_features_test = relevant_features_test.dropna(axis = 1)
    logger.info('Relevant features [WITHOUT NaN values] train set: {0}, Relevant features [WITHOUT NaN values] test set: {1}'.format(relevant_features_train.shape, relevant_features_test.shape), extra = LOGGER_EXTRA_OBJECT)
    '''

    # Selecting common features
    relevant_common_features = relevant_features_train.columns.intersection(
        relevant_features_test.columns)

    # For each row selects only common features
    relevant_features_train = relevant_features_train.loc[:,
                                                          relevant_common_features]
    relevant_features_test = relevant_features_test.loc[:,
                                                        relevant_common_features]
    logger.info(
        'Relevant common features train set: {0}, Relevant common features test set: {1}'
        .format(relevant_features_train.shape, relevant_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)
    # Prepend the target as a 'target' column. axis=1 concat aligns on the
    # index -- assumes the target and feature frames share a RangeIndex
    # (TODO confirm against the callers).
    relevant_features_train = pd.concat([
        pd.DataFrame(target_column_train, columns=['target']),
        relevant_features_train
    ],
                                        axis=1)
    relevant_features_test = pd.concat([
        pd.DataFrame(target_column_test, columns=['target']),
        relevant_features_test
    ],
                                       axis=1)
    logger.info(
        'Relevant common features (including target column) train set: {0}, Relevant common features (including target column) test set: {1}'
        .format(relevant_features_train.shape, relevant_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)

    relevant_features_train.to_pickle(
        '../Pickle/RelevantFeatures/Train/{0}.pkl'.format(dataset))
    relevant_features_test.to_pickle(
        '../Pickle/RelevantFeatures/Test/{0}.pkl'.format(dataset))
示例#2
0
    def extract_select_training_features(self, data, args=None):
        """
        Extract features with tsfresh and keep only those statistically
        relevant for classification.

        :param data: pandas.DataFrame of raw time series
        :param args: positional settings, indexed as:
            args[0] -- column identifying each series (column_id)
            args[1] -- number of parallel jobs (n_jobs)
            args[2] -- chunk size for parallelisation (chunksize)
            args[3] -- target vector y used for selection
            args[4] -- FDR level for the selection test (fdr_level)
        :return: pandas.DataFrame of selected features
        """
        # NOTE(review): `args` defaults to None but is indexed
        # unconditionally -- calling without args raises TypeError.
        X = extract_features(data,
                             column_id=args[0],
                             n_jobs=args[1],
                             chunksize=args[2])
        # select_features requires a NaN/inf-free matrix.
        X = impute(X)

        y = args[3]
        X_selected = select_features(X,
                                     y,
                                     ml_task='classification',
                                     n_jobs=args[1],
                                     chunksize=args[2],
                                     fdr_level=args[4])

        return X_selected
示例#3
0
def main():
    """CLI entry point: read features/labels CSVs, run tsfresh feature
    selection, and write the surviving columns to features_selected.csv.

    Usage: ./rfe.py features.csv labels.csv num_features
    """
    if len(sys.argv) < 4:
        print('Usage: ./rfe.py features.csv labels.csv num_features')
        exit(1)

    features = pd.read_csv(sys.argv[1],
                           index_col=None,
                           header=0,
                           thousands=',')
    labels = pd.read_csv(sys.argv[2],
                         index_col=None,
                         header=None,
                         squeeze=True)

    # Keep only features statistically relevant to the labels.
    features = select_features(features, labels)

    # Parsed for CLI validation; the RFE step that consumed it was
    # unreachable dead code (it sat after an unconditional `return`:
    # rfe(features, labels, num_features) -> features_rfe.csv) and has
    # been removed.
    num_features = int(sys.argv[3])

    features.to_csv('features_selected.csv', index=False, header=True)
    def test_functional_equality(self):
        """
        `extract_relevant_features` should be equivalent to running first `extract_features` with impute and
        `select_features` afterwards.
        Meaning it should produce the same relevant features and the values of these features should be identical.
        :return:
        """
        df, y = self.create_test_data_sample_with_target()

        # One-shot convenience API: extract + impute + select in one call.
        relevant_features = extract_relevant_features(df,
                                                      y,
                                                      column_id='id',
                                                      column_value='val',
                                                      column_kind='kind',
                                                      column_sort='sort')

        # Manual pipeline: extraction with the same impute step, then
        # explicit selection.
        extraction_settings = FeatureExtractionSettings()
        extraction_settings.IMPUTE = impute
        extracted_features = extract_features(
            df,
            feature_extraction_settings=extraction_settings,
            column_id='id',
            column_value='val',
            column_kind='kind',
            column_sort='sort')
        selected_features = select_features(extracted_features, y)

        # Same columns selected...
        self.assertEqual(
            set(relevant_features.columns), set(selected_features.columns),
            "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format(
                relevant_features.columns, selected_features.columns))
        # ...and element-wise identical values.
        self.assertTrue(
            (relevant_features.values == selected_features.values).all().all(),
            "Should calculate the same feature values")
    def compute_tsfresh_features(self):
        """Calculate the features using `tsfresh`."""
        value = self.df[self.ts_col]
        # Build a rolling forecasting frame: each id holds up to
        # max_timeshift past values; y is the next value to predict.
        df_shift, y = make_forecasting_frame(value,
                                             kind="kind",
                                             max_timeshift=self.max_timeshift,
                                             rolling_direction=1)

        extract_start = time.time()
        X_gen_raw = extract_features(df_shift,
                                     column_id="id",
                                     column_sort="time",
                                     column_value="value",
                                     impute_function=impute,
                                     n_jobs=8,
                                     show_warnings=False)
        extract_end = time.time()
        tqdm.write("Extraction time: {}".format(extract_end - extract_start))

        # Drop constant columns: a single unique value carries no signal
        # for the relevance tests.
        non_const_idx = X_gen_raw.apply(pd.Series.nunique) != 1
        X_gen_raw_non_const = X_gen_raw.loc[:, non_const_idx]
        select_start = time.time()
        X_gen = select_features(
            X_gen_raw_non_const, y, ml_task='regression')
        select_end = time.time()

        tqdm.write("Filtering time: {}".format(select_end - select_start))
        tqdm.write("Raw features: {}".format(X_gen_raw.shape[1]))
        tqdm.write(
            "Non-constant features: {}".format(X_gen_raw_non_const.shape[1]))
        tqdm.write("Final filtered features: {}".format(X_gen.shape[1]))

        return X_gen
    def test_functional_equality(self):
        """
        `extract_relevant_features` should be equivalent to running first `extract_features` with impute and
        `select_features` afterwards.
        Meaning it should produce the same relevant features and the values of these features should be identical.
        :return:
        """
        df, y = self.create_test_data_sample_with_target()

        # One-shot convenience API.
        relevant_features = extract_relevant_features(df,
                                                      y,
                                                      column_id='id',
                                                      column_value='val',
                                                      column_kind='kind',
                                                      column_sort='sort')

        # Manual two-step pipeline with the same impute function.
        extracted_features = extract_features(df,
                                              column_id='id',
                                              column_value='val',
                                              column_kind='kind',
                                              column_sort='sort',
                                              impute_function=impute)
        selected_features = select_features(extracted_features, y)

        self.assertEqual(
            set(relevant_features.columns), set(selected_features.columns),
            "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format(
                relevant_features.columns, selected_features.columns))

        # Align rows and columns of the two-step result to the one-shot
        # result before comparing values exactly.
        relevant_columns = relevant_features.columns
        relevant_index = relevant_features.index
        self.assertTrue(
            relevant_features.equals(
                selected_features.loc[relevant_index][relevant_columns]),
            "Should calculate the same feature values")
def create_features_by_tsfresh(path, dataset, years, features):
    """Roll the sensor series into 7-day windows, extract tsfresh
    features, and keep those relevant to the AQI target.

    :param path: base directory for intermediate CSVs and raw AQI data
    :param dataset: input DataFrame with 'time' and 'id' columns
    :param years: years of raw AQI data to load
    :param features: list of sensor column names to keep
    :return: selected-feature DataFrame with a fresh RangeIndex
    """
    data = dataset.copy()
    data = data[features + ['time', 'id']]
    # 7 * 24 hourly steps -> one-week rolling windows per id.
    data_rolled = roll_time_series(data,
                                   column_id="id",
                                   column_sort="time",
                                   max_timeshift=7 * 24,
                                   n_jobs=8)
    # NOTE: rebinds the `features` parameter from a column list to the
    # extracted feature matrix.
    features = extract_features(data_rolled,
                                column_id="id",
                                column_sort="time",
                                n_jobs=8)
    impute(features)
    print(features.shape)
    # NOTE(review): this path ends with '/' (a directory, no file name);
    # DataFrame.to_csv would fail here -- confirm the intended file name.
    features.to_csv(path + '/modified_data_after_feature_extraction/')

    AQI = get_raw_AQI_data(path, years)
    # Assumes raw AQI rows line up 1:1 with features.index -- TODO confirm.
    AQI_data = pd.Series(data=AQI['AQI'].values,
                         index=features.index,
                         name='AQI')
    print(AQI_data.shape)
    selected_features = select_features(features, AQI_data)
    print(selected_features.shape)
    # features.drop('ID', axis=1, inplace=True)
    selected_features.index = range(selected_features.shape[0])
    return selected_features
示例#8
0
 def generate_features(self):
     """Extract tsfresh features per window, select the ones relevant to
     the entity labels, and pickle both features and labels.
     """
     start = time.time()
     # FIX: DataFrame.astype returns a NEW frame; the original discarded
     # the result, making the cast a silent no-op. Assign it back so the
     # declared dtypes actually take effect.
     self.flatWindowDF = self.flatWindowDF.astype({
         'windowID': int,
         'timeID': int,
         'entityType': str,
         'Velocity': float,
         'Altitude': float,
         'Heading': float
     })
     xDataDF = self.flatWindowDF[[
         'windowID', 'timeID', 'Velocity', 'Altitude', 'Heading'
     ]]
     yDataDuplicateDF = self.flatWindowDF[['windowID', 'entityType']]
     extractedFeaturesDF = extract_features(xDataDF,
                                            column_id='windowID',
                                            column_sort="timeID",
                                            column_kind=None,
                                            column_value=None)
     # In-place NaN/inf imputation required before select_features.
     impute(extractedFeaturesDF)
     # One label per window (first occurrence wins).
     self.labelData = (yDataDuplicateDF.drop_duplicates(
         subset='windowID'))['entityType']
     self.featureData = select_features(extractedFeaturesDF,
                                        self.labelData.to_numpy())
     self.app_metrics.featureExtractionPerf = time.time() - start
     self.featureData.to_pickle(self.featuresPickle)
     self.labelData.to_pickle(self.labelPickle)
示例#9
0
def main():
    """CLI: ./predict.py features.csv all_coins.csv

    For every coin column, binarise the price series into up/down moves,
    select relevant features, and print prediction accuracy per coin.
    """
    if len(sys.argv) < 3:
        print('Usage: ./predict.py features.csv all_coins.csv')
        exit(1)

    all_features = pd.read_csv(sys.argv[1], index_col=None, header=0)
    coins = pd.read_csv(sys.argv[2], index_col=None, header=0)

    print('Finished reading data')

    for coin in coins.columns:
        labels = coins[coin]
        # Walk backwards so each comparison still sees the original
        # previous-row price; labels[0] is left untouched.
        # NOTE(review): this mutates the `coins` frame through a view.
        for i in range(len(labels) - 1, 0, -1):
            if labels[i] > labels[i - 1]:
                labels[i] = 1
            else:
                labels[i] = 0

        # Drop the first 30 rows -- presumably a feature warm-up window;
        # TODO confirm against the feature builder.
        labels = labels[30:].reset_index(drop=True)

        features = select_features(all_features, labels)
        if len(features.columns) <= 0:
            print('Skipped %s' % coin)
            continue
        accuracy = predict(features, labels, 0.9)
        print(coin, '\t', accuracy)
    def extract(self, use_features=None):
        """Extract tsfresh features and wrap each row in a Timeseries.

        :param use_features: optional iterable of feature names to keep.
            When None or empty, relevant features are selected
            automatically with select_features. (FIX: previously
            defaulted to a shared mutable list `[]` -- the classic
            Python mutable-default pitfall -- now a None sentinel;
            existing callers passing a list are unaffected.)
        :return: (list of Timeseries, the feature names that were used)
        """
        x = self.__x_data_frame()
        y = self.__y_series()

        settings = ReasonableFeatureExtractionSettings()
        extracted_features = extract_features(
            x, column_id='id', feature_extraction_settings=settings)
        if use_features is None or len(use_features) == 0:
            # select_features needs an imputed (NaN-free) matrix.
            impute(extracted_features)
            features_filtered = select_features(extracted_features, y)
            use_features = features_filtered.keys()
        else:
            features_filtered = extracted_features[use_features]

        keys = features_filtered.keys()
        timeseries = []
        for index, row in features_filtered.iterrows():
            values = []
            for key in keys:
                # 'id' is an identifier, not a feature value.
                if key == 'id':
                    continue

                value = row[key]
                values.append(value)

            timeseries.append(Timeseries([values]))

        return timeseries, use_features
示例#11
0
def tsfresh_extract_cutoff_feature(data,
                                   seed,
                                   istest=False,
                                   feature_setting={}):
    """Extract tsfresh features per engine at a cutoff flight and, for
    training data, pre-select the features relevant to RUL regression.

    :param data: flight records with 'Engine' and 'FlightNo' columns
    :param seed: RNG seed used to draw random training cutoffs
    :param istest: test mode cuts off at each engine's last flight
    :param feature_setting: tsfresh feature-calculation settings
        (NOTE: mutable default -- safe only while callers never mutate it)
    :return: feature DataFrame with 'Engine' restored as a column
    """
    if istest:
        # Test: cut off at the final observed flight of each engine.
        ct = data.groupby('Engine').FlightNo.max().rename(
            'CutoffFlight').reset_index()
    else:
        # Train: draw random cutoffs, then truncate data accordingly.
        ct = make_cutoff_flights(data.copy(), seed)
        data = make_cutoff_data(ct, data)

    feat = _extract_features(data, feature_setting)
    feat = impute(feat)
    feat.index.name = 'Engine'
    feat.reset_index(inplace=True)
    # Attach cutoff info (and RUL in training) per engine.
    feat = feat.merge(ct, on='Engine', how='left')
    feat.set_index('Engine', inplace=True)
    feat_cols = [f for f in feat.columns if f not in CONST.EX_COLS]

    if not istest:
        print("Extracted Feature Shape =", feat.shape)
        print("First Step Selection...")
        _feat = select_features(feat[feat_cols],
                                feat['RUL'],
                                ml_task='regression')
        print("Selected Feature Shape =", _feat.shape)
        # Re-attach the regression target alongside the selected features.
        feat = pd.concat([_feat, feat['RUL']], axis=1)

    feat.reset_index(inplace=True)
    return feat
def get_features(file_name):
    """Read a labelled time-series CSV, extract all tsfresh features and
    a filtered relevant subset, writing both to CSV files.

    :param file_name: CSV whose last column is 'y' and which contains
        'id', 'time' and a stray 'Unnamed: 0' index column
    """
    csv_data = pd.read_csv(file_name)
    timeseries = csv_data.iloc[:, :-1]
    # Drop the index column produced by a previous to_csv round-trip.
    del timeseries['Unnamed: 0']
    y = csv_data[['id', 'y']]
    y = handle_y(y)

    print(timeseries)
    print(y)

    print('start getfeatures...')
    # All features
    extracted_features = extract_features(timeseries,
                                          column_id="id",
                                          column_sort="time")
    impute(extracted_features)
    extracted_features.to_csv('tsfresh_extractedFeatures.csv')
    print('all features end')
    # Keep only the more relevant features
    # optional parameter: fdr_level = 0.05 ?
    features_filtered = select_features(extracted_features,
                                        y,
                                        ml_task='classification',
                                        n_jobs=1,
                                        fdr_level=0.05)
    features_filtered.to_csv('tsfresh_filteredFeatures.csv')
    def test_functional_equality(self):
        """
        `extract_relevant_features` should be equivalent to running first `extract_features` with impute and
        `select_features` afterwards.
        Meaning it should produce the same relevant features and the values of these features should be identical.
        :return:
        """
        df, y = self.create_test_data_sample_with_target()

        # One-shot convenience API.
        relevant_features = extract_relevant_features(df, y, column_id='id', column_value='val', column_kind='kind',
                                                      column_sort='sort')

        # Manual two-step pipeline with the same impute function.
        extracted_features = extract_features(df, column_id='id',
                                              column_value='val', column_kind='kind', column_sort='sort',
                                              impute_function=impute)
        selected_features = select_features(extracted_features, y)

        self.assertEqual(
            set(relevant_features.columns), set(selected_features.columns),
            "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format(relevant_features.columns,
                                                                          selected_features.columns))

        # Align rows/columns of the two-step result before comparing values.
        relevant_columns = relevant_features.columns
        relevant_index = relevant_features.index
        self.assertTrue(
            relevant_features.equals(selected_features.loc[relevant_index][relevant_columns]),
            "Should calculate the same feature values")
def filter_features_dataset(dataset):
    """Keep only accelerometer/gyroscope columns that tsfresh deems
    relevant to `dataset.target`.

    :param dataset: DataFrame with sensor columns and a `target` column
    :return: (selected features with target appended, surviving columns)
    """
    target = dataset.target
    sensor_columns = [c for c in dataset.columns
                      if "Acc" in c or "Gyro" in c]
    candidates = dataset[sensor_columns]
    selected = select_features(candidates, target)
    combined = pd.concat((selected, target), axis=1)
    return combined, selected.columns
 def fit(self, X, y):
     """Run tsfresh feature selection and remember the surviving
     column names on self.selected_features.

     Returns self to allow sklearn-style chaining.
     """
     significant = select_features(X, y, n_jobs=self.n_jobs)
     self.selected_features = significant.columns
     print(
         f"{len(self.selected_features)} features found significant out of {X.shape[1]} possible."
     )
     return self
def create_agg_tsfresh(x_train, y_train, x_val, y_val, input_path, size=None):
    """Build (or load cached) tsfresh aggregate features for train/val.

    :param x_train: 3-D array of training series
    :param y_train: one-hot training labels (converted to class ids below)
    :param x_val: 3-D array of validation series
    :param y_val: one-hot validation labels (converted to class ids below)
    :param input_path: directory holding/receiving agg_*.csv caches
    :param size: when not None, forces recomputation despite caches
    :return: (x_train_filtered, y_train, x_val_filtered, y_val)
    """

    # One-hot -> integer class labels.
    y_train = pd.DataFrame(y_train).idxmax(axis=1)
    y_val = pd.DataFrame(y_val).idxmax(axis=1)
    if os.path.exists(input_path + 'agg_train.csv') and os.path.exists(
            input_path + 'agg_val.csv') and size is None:

        # Cached path: load both feature sets from disk.
        x_train_filtered = pd.read_csv(input_path + 'agg_train.csv',
                                       index_col=0)
        x_val_filtered = pd.read_csv(input_path + 'agg_val.csv', index_col=0)

        # Drop zero-variance columns; val mirrors train's columns.
        x_train_filtered = x_train_filtered.loc[:, x_train_filtered.var() != 0]
        x_val_filtered = x_val_filtered[x_train_filtered.columns]

    else:
        x_train_df = df_from_3d_np(x_train)
        x_val_df = df_from_3d_np(x_val)

        x_train_df = x_train_df.fillna(0)
        x_val_df = x_val_df.fillna(0)
        # start_time = time.time()
        x_train_extracted = extract_features(
            x_train_df,
            column_id='index',
            column_sort='time',
            default_fc_parameters=EfficientFCParameters())
        # duration = time.time() - start_time
        # print(f'feature extraction {duration}')
        if 'mts_archive' in input_path:
            x_train_sel = select_features(x_train_extracted, y_train, n_jobs=0)

            # if not enough features, take larger set
            if x_train_sel.shape[1] < 300:
                # Fall back to the 300 best columns by ANOVA F-score.
                X_best = SelectKBest(f_classif,
                                     k='all').fit(x_train_extracted, y_train)
                ufs_scores = pd.DataFrame(X_best.scores_,
                                          index=x_train_extracted.columns,
                                          columns=['score']).sort_values(
                                              by=['score'], ascending=False)
                x_train_sel = x_train_extracted[ufs_scores.iloc[:300].index]

            x_train_extracted = x_train_sel

        x_train_extracted = x_train_extracted.dropna(axis='columns')

        x_train_extracted.to_csv(input_path + f'agg_train.csv')
        y_train.to_csv(input_path + f'y_train.csv')

        # NOTE(review): agg_val.csv is read here but nothing in this
        # branch writes it -- a first run without that file raises
        # FileNotFoundError; confirm who produces it.
        x_val_filtered = pd.read_csv(input_path + 'agg_val.csv', index_col=0)

        x_train_filtered = x_train_extracted.loc[:,
                                                 x_train_extracted.var() != 0]
        x_val_filtered = x_val_filtered[x_train_filtered.columns]

        y_val.to_csv(input_path + 'y_test.csv')

    return x_train_filtered, y_train, x_val_filtered, y_val
def long_extract_features(df):
    """Extract tsfresh features from a long-format frame and keep only
    those relevant to the per-label target.

    :param df: long-format DataFrame with 'label', 't' and 'target' columns
    :return: DataFrame of selected features
    """
    # Build the target Series keyed by series label, one entry per label.
    y = pd.Series(df['target'])
    y.index = df["label"]
    y = get_unique_indexes(y)
    # Remove the target before extraction so it cannot leak into features.
    df = df[df.columns.drop(['target'])]
    extracted_features = extract_features(df,
                                          column_id="label",
                                          column_sort="t")
    impute(extracted_features)
    features_filtered = select_features(extracted_features, y)
    return features_filtered
def split_train_and_test(features, labels):
    """Select relevant features, then split chronologically into train
    and test partitions at TRAIN_TEST_RATIO.

    :param features: DataFrame with a default 0..n-1 RangeIndex
    :param labels: label Series at least as long as `features`
    :return: (features_train, labels_train, features_test, labels_test)
    """
    split_point = int(len(features) * TRAIN_TEST_RATIO)

    # Align labels to the feature rows and force integer dtype.
    labels = labels[:len(features)].astype(int)
    features = select_features(features, labels)

    # FIX: .loc label slicing is inclusive on BOTH ends, so the original
    # code put row `split_point` in train AND test. Start the test
    # partition one row later to keep the sets disjoint.
    features_train = features.loc[:split_point]
    labels_train = labels.loc[:split_point]
    features_test = features.loc[split_point + 1:].reset_index(drop=True)
    labels_test = labels.loc[split_point + 1:].reset_index(drop=True)

    return features_train, labels_train, features_test, labels_test
def tsfresh_extract_features(timeSeries, idCol, timeCol, y=None):
    """Extract tsfresh features relevant to the target `y`.

    :param timeSeries: long-format DataFrame of time series
    :param idCol: name of the column identifying each series
    :param timeCol: name of the column to sort by
    :param y: target Series indexed by series id. Required:
        extract_relevant_features and select_features both need it.
        FIX: the original omitted the target argument and referenced an
        undefined global `y`, so every call failed (TypeError/NameError);
        it is now an explicit parameter, added with a default to keep the
        signature backward-compatible.
    :return: DataFrame of the selected relevant features
    """
    from tsfresh import extract_relevant_features
    from tsfresh import select_features
    from tsfresh.utilities.dataframe_functions import impute

    if y is None:
        raise ValueError('a target vector `y` is required for feature selection')

    extracted_features = extract_relevant_features(timeSeries,
                                                   y,
                                                   column_id=idCol,
                                                   column_sort=timeCol)

    impute(extracted_features)
    features_filtered = select_features(extracted_features, y)
    return features_filtered
def engineer_features(df: pd.DataFrame, labels: pd.DataFrame, target='SpO2', select=True):
    """Automatic feature engineering (with optional selection) for timeseries dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe of rgb and/or ppg timeseries for all subjects.
    labels : pd.DataFrame
        A dataframe matching sample IDs to ground truth (e.g. SpO2)
    target : str, optional
        The name of the column you're trying to predict, by default 'SpO2'
    select : bool, optional
        Whether to automatically filter down to statistically significant features, by default True

    Returns
    -------
    pd.DataFrame
        A dataframe with new features added, including the target feature.
    """

    _df = df.copy()
    _labels = labels.copy()

    # if 'sample_id' not in _labels.columns:
    #     _labels = _attach_sample_id_to_ground_truth(_df, _labels)
    _df = _df.select_dtypes(np.number)

    ids = _df['sample_id'].unique()

    # Keep only labels for subjects that actually have data, indexed by id.
    _labels = _labels[_labels['sample_id'].isin(ids)]
    # FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `float` is the documented replacement.
    y = _labels.set_index('sample_id')[target].astype(float)

    if 'sample_source' in _df.columns:
        _df.drop('sample_source', axis=1, inplace=True)

    extracted_features = extract_features(
        _df,
        column_id="sample_id",
        column_sort="frame",
    )

    # tsfresh selection requires an imputed (NaN/inf-free) matrix.
    impute(extracted_features)
    features = extracted_features

    if select:
        features_filtered = select_features(
            extracted_features, y, ml_task='regression',)
        print(extracted_features.shape, features_filtered.shape)
        features = features_filtered

    # Left join keeps every (selected) feature row, attaching the target.
    out_df = features.join(y, how='left')

    return out_df
def relevant_features(X, y):
    """Union of tsfresh-relevant feature names across all one-vs-rest
    binarisations of the multi-class target `y`.

    :param X: DataFrame of extracted features
    :param y: multi-class target Series
    :return: set of relevant feature names
    """
    union_of_features = set()
    for label in y.unique():
        one_vs_rest = numpy.array(y == label)
        # tsfresh emits copious warnings for constant/degenerate
        # features; silence them for the duration of the selection.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_filtered = select_features(X, one_vs_rest)
            print("Number of relevant features for class {}: {}/{}".format(
                label, X_filtered.shape[1], X.shape[1]))
            union_of_features |= set(X_filtered.columns)
    return union_of_features
示例#22
0
    def extract_tsfresh_relevant_features(extracted_features, classifications):
        """
        Return only relevant features.

        :param extracted_features:  A dataframe from the tsfresh rwe function
        above.
        :param classifications:  A list of ordered classifications for the features.
        :return:  A DataFrame of relevant features.
        """
        # select_features requires a NaN-free matrix; impute is in-place.
        impute(extracted_features)
        return select_features(extracted_features, classifications)
def _select_features(extracted_features):
    """Select classification-relevant features from the given matrix and
    write them to tsfresh_filteredFeatures.csv.

    :param extracted_features: DataFrame of extracted tsfresh features
    """
    y=get_y()

    # del extracted_features['Unnamed: 0']
    print(extracted_features)
    print('select start...')

    # Keep only the more relevant features
    # optional parameter: fdr_level = 0.05 ?
    features_filtered = select_features(extracted_features, y ,n_jobs=1,fdr_level =0.0001,ml_task='classification')
    print(features_filtered)
    features_filtered.to_csv('tsfresh_filteredFeatures.csv')
    print('select end')
def test_select_features_VarianceThreshold(extracted_features_name='test_sklearn_VarianceThreshold.csv'):
    """Run tsfresh feature selection on a variance-thresholded feature
    CSV and write the result to select_features_VarianceThreshold.csv.

    :param extracted_features_name: CSV produced by a previous sklearn
        VarianceThreshold step (contains an 'Unnamed: 0' index column)
    """
    y=get_y()

    # All features
    extracted_features = pd.read_csv(extracted_features_name)
    del extracted_features['Unnamed: 0']

    print(extracted_features)
    print('select start...')

    # Keep only the more relevant features
    # optional parameter: fdr_level = 0.05 ?
    features_filtered = select_features(extracted_features, y, n_jobs=1, fdr_level=0.01,ml_task='classification')
    print(features_filtered)
    features_filtered.to_csv('select_features_VarianceThreshold.csv')
    print('select end')
示例#25
0
    def select_features(self, data, args=None):
        """
        Filter `data` down to features statistically relevant for
        classification, delegating to tsfresh's module-level
        select_features.

        :param data: pandas.DataFrame of extracted features
        :param args: positional settings: [y, n_jobs, chunksize, fdr_level]
        :return: pandas.DataFrame of selected features
        """
        target, n_jobs, chunksize, fdr = args[0], args[1], args[2], args[3]
        return select_features(data,
                               target,
                               ml_task='classification',
                               n_jobs=n_jobs,
                               chunksize=chunksize,
                               fdr_level=fdr)
def _select_features(extracted_features_name='multiclass_60s_features.csv'):
    """Load an extracted-features CSV from base_path, select relevant
    classification features, and write them back under base_path.

    :param extracted_features_name: file name RELATIVE to base_path.
        FIX: the original default already contained base_path, which was
        then prepended AGAIN on read, producing a doubled path that could
        never exist; the default is now the bare file name. Callers that
        passed bare names explicitly are unaffected.
    """
    y = get_y()

    # All features
    extracted_features = pd.read_csv(base_path + extracted_features_name)

    # del extracted_features['Unnamed: 0']
    print(extracted_features)
    print('select start...')

    # Keep only the more relevant features.
    # NOTE(review): fdr_level=6 is outside tsfresh's documented (0, 1)
    # range -- possibly a typo for 0.6; confirm before relying on it.
    features_filtered = select_features(extracted_features, y, n_jobs=1, fdr_level=6, ml_task='classification')
    print(features_filtered)
    features_filtered.to_csv(base_path + 'tsfresh_filteredFeatures.csv')
    print('select end')
示例#27
0
    def _select_features(self, data_frame, target):
        """Select relevant features via tsfresh, then keep the
        self.num_features columns most correlated (by |Pearson r|) with
        the target.
        """
        print("Selecting the best out of " + str(len(data_frame.columns)) +
              " features...")

        df_selected = tsfresh.select_features(data_frame, target)

        colnames = np.asarray(df_selected.columns)

        # pearsonr returns (r, p); abs is applied to the pair and [0]
        # keeps |r| for each column.
        correlations = np.asarray([
            np.abs(pearsonr(target, df_selected[col]))[0] for col in colnames
        ])

        # [::-1] is somewhat unintuitive syntax,
        # but it reverses the entire column.
        self.selected_features = colnames[np.argsort(
            correlations)][::-1][:self.num_features]

        return df_selected[self.selected_features]
示例#28
0
def tsfresh_extract_features():
    """Extract tsfresh features for the fishing-vessel competition data
    and write selected train/testB feature CSVs.

    Concatenates round-1 train and testB records, extracts features per
    vessel, then runs feature selection on the labelled train slice.

    :return: (filtered_train_df, filtered_test_df)
    """
    train_path = '../data/hy_round1_train_20200102'
    test_path = '../data/hy_round1_testB_20200221'

    train_df_list = []
    for file_name in os.listdir(train_path):
        df = pd.read_csv(os.path.join(train_path, file_name))
        train_df_list.append(df)

    test_df_list = []
    for file_name in os.listdir(test_path):
        df = pd.read_csv(os.path.join(test_path, file_name))
        test_df_list.append(df)

    train_df = pd.concat(train_df_list)
    test_df = pd.concat(test_df_list)

    train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
    test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

    # Extract over train+test together so both share one feature space.
    all_df = pd.concat([train_df, test_df], sort=False)

    df = all_df.drop(columns=['type'])
    y = all_df['type']

    extracted_df = extract_features(df, column_id='渔船ID', column_sort='time')

    # Hard-coded split: first 7000 rows are the train vessels -- TODO
    # confirm this matches the actual train set size.
    train_df = extracted_df.iloc[:7000]
    test_df = extracted_df.iloc[7000:]

    y_train = y[:7000]
    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)

    impute(train_df)
    filtered_train_df = select_features(train_df, y_train)
    # Test set keeps exactly the columns selected on train.
    filtered_test_df = test_df[filtered_train_df.columns]

    filtered_train_df['type'] = le.inverse_transform(y_train)

    filtered_train_df.to_csv('../feature/train.csv')
    filtered_test_df.to_csv('../feature/testB.csv')

    return filtered_train_df, filtered_test_df
示例#29
0
File: yac.py  Project: dangom/ica-yac
    def fit(self, data, labels):
        """Extract tsfresh features, keep only those passing a very
        strict relevance test, and fit a random forest on them.

        :param data: long-format DataFrame ('level_0' = series id,
            'level_1' = time step)
        :param labels: class labels, one per series
        """
        feats = tsfresh.extract_features(data,
                                         column_id='level_0',
                                         column_sort='level_1',
                                         default_fc_parameters=self.def_settings,
                                         distributor=self.distributor)

        tsfresh.utilities.dataframe_functions.impute(feats) # Remove NaNs, if any
        # Extremely strict FDR keeps only the most robust features.
        relevant_feats = tsfresh.select_features(feats,
                                                 labels,
                                                 fdr_level=1e-15)

        # Remember the surviving columns and the settings needed to
        # recompute exactly these features at prediction time.
        self.relevant_features = relevant_feats.columns
        self.settings = tsfresh.feature_extraction.settings.from_columns(relevant_feats)

        clf = RandomForestClassifier(n_estimators=40)
        clf.fit(relevant_feats, labels)
        self.classifier = clf
        self.trained = True
示例#30
0
def tsfresh_extract_cutoff_regime_feature(data, seed, istest=False):
    """Extract per-regime tsfresh features at engine cutoff flights and,
    in training mode, pre-select the features relevant to RUL.

    Features are computed separately for each of the six FlightRegime
    values, suffixed `_Regime{r}`, and concatenated column-wise.

    :param data: flight records with 'Engine', 'FlightNo', 'FlightRegime'
    :param seed: RNG seed for the random training cutoffs
    :param istest: test mode cuts off at each engine's last flight
    :return: feature DataFrame with 'Engine' restored as a column
    """
    feat = pd.DataFrame()
    if istest:
        print("Test feature processing...")
        ct = data.groupby('Engine').FlightNo.max().rename(
            'CutoffFlight').reset_index()
    else:
        print("Train feature processing...")
        ct = make_cutoff_flights(data.copy(), seed)
        data = make_cutoff_data(ct, data)

    feat_cols = [f for f in data.columns if f not in ['FlightRegime']]
    for r in [1, 2, 3, 4, 5, 6]:
        print(f"Regime {r}")
        tmp = data[data.FlightRegime == r][feat_cols].reset_index(
            drop=True).copy()
        tmp_gb = tmp.groupby('Engine')
        # Engines with <= 1 row in this regime cannot yield time-series
        # features; drop them for this regime only.
        remove_engines = tmp_gb.size()[tmp_gb.size() <= 1].index.values
        print("Remove Engines", remove_engines)
        _feat = _extract_features(tmp[~tmp.Engine.isin(remove_engines)], {})
        _feat = impute(_feat)
        if not istest:
            _feat.index.name = 'Engine'
            _feat.reset_index(inplace=True)
            _feat = _feat.merge(ct[['Engine', 'RUL']], on='Engine', how='left')
            _feat.set_index('Engine', inplace=True)
            print("Extracted Feature Shape =", _feat.shape)
            print("First Step Selection...")
            _feat_cols = [f for f in _feat.columns if f not in CONST.EX_COLS]
            _feat = select_features(_feat[_feat_cols],
                                    _feat['RUL'],
                                    ml_task='regression')
            print("Selected Feature Shape =", _feat.shape)
        # Tag columns with the regime and align on the engine index;
        # engines missing from a regime get NaNs, imputed right after.
        _feat.columns = [c + f'_Regime{r}' for c in _feat.columns]
        feat = pd.concat([feat, _feat], axis=1, sort=True)
        feat = impute(feat)

    feat.index.name = 'Engine'
    feat.reset_index(inplace=True)
    feat = feat.merge(ct, on='Engine', how='left')

    return feat
示例#31
0
def select(path_result_selected_features, path_data, subjects):
    """For every subject/sensor CSV of extracted features, select the
    features relevant to the subject's corrected activity labels and
    write them to <sensor>_SELECTED.csv.

    :param path_result_selected_features: output root directory
    :param path_data: root directory of per-subject feature CSVs
    :param subjects: iterable of subject identifiers
    """

    for subject in subjects:
        senzori = os.listdir(path_data + os.sep + str(subject))
        if not os.path.exists(path_result_selected_features + os.sep +
                              str(subject)):
            os.mkdir(path_result_selected_features + os.sep + str(subject))
        for senzor in senzori:
            extracted_features = pd.read_csv(path_data + os.sep +
                                             str(subject) + os.sep + senzor)
            # select_features needs a NaN-free matrix (in-place impute).
            impute(extracted_features)
            pom = pd.read_csv("Sliding_Window_Data" + os.sep + "Labels" +
                              os.sep + "Subject_" + str(subject) + os.sep +
                              "Subject_" + str(subject) +
                              "_corrected_labels.csv")
            y = pom['Tag']
            features_filtered = select_features(extracted_features, y)
            features_filtered.to_csv(path_result_selected_features + os.sep +
                                     str(subject) + os.sep +
                                     senzor.split(".")[0] + "_SELECTED.csv",
                                     index=False)
def select():
    """Select relevant features per signal component and assemble
    combined train/test CSVs.

    Reads each per-component feature file from features_dir, selects
    features on the chronological train split only, and concatenates
    the per-component selections column-wise.
    """
    features_files = [
        f for f in listdir(features_dir) if isfile(join(features_dir, f))
    ]

    # Select features individually from each of the signal components
    train = pd.DataFrame()
    test = pd.DataFrame()
    for f_file in features_files:  # One file for each signal component

        print(f"loading {f_file}")
        features = pd.read_csv(features_dir + f_file)

        train_x = features.iloc[:validation_split_i].drop('y', axis=1)
        test_x = features.iloc[validation_split_i:].drop('y', axis=1)
        train_y = features.iloc[:validation_split_i].y
        test_y = features.iloc[validation_split_i:].y

        # Feature selection must be always done from the train set!
        print("selecting features...")
        train_features_selected = select_features(train_x,
                                                  train_y,
                                                  fdr_level=fdr_level)

        print(f"selected {len( train_features_selected.columns )} features.")

        comp_train = train_features_selected.copy()
        # Test keeps exactly the columns chosen on train.
        comp_test = test_x[train_features_selected.columns].copy()

        train = pd.concat([train, comp_train], axis=1)
        test = pd.concat([test, comp_test], axis=1)

    # NOTE(review): train_y/test_y leak out of the loop -- the labels
    # written here come from the LAST component file. Correct only if
    # every component shares the same y column; confirm.
    train['y'] = train_y
    test['y'] = test_y

    print(f"saving {train_file}")
    train.to_csv(train_file, index=None)

    print(f"saving {test_file}")
    test.to_csv(test_file, index=None)