Example #1
    def mrmr(self):

        if self.x.shape[2] == 32:
            mean_x = np.mean(self.x, axis=3, keepdims=True)
            mean_x = mean_x.reshape(mean_x.shape[0] * mean_x.shape[1],
                                    mean_x.shape[2])
            cols = [
                "Fp1", "AF3", "F3", "F7", "FC5", "FC1", "C3", "T7", "CP5",
                "CP1", "P3", "P7", "PO3", "O1", "Oz", "Pz", "Fp2", "AF4", "Fz",
                "F4", "F8", "Fc6", "Fc2", "Cz", "C4", "T8", "CP6", "CP2", "P4",
                "P8", "PO4", "O2"
            ]  # names of the 32 channels

            df = pd.DataFrame(mean_x, columns=cols)

            self.target = self.target.reshape(
                self.target.shape[0] * self.target.shape[1], 1)

            df["Class"] = self.target
            df["Class"] = df["Class"].astype(int)
            col = df.columns.tolist()
            # put Class (target) in the first column; the rest are features (channels)
            col = col[-1:] + col[:-1]
            df = df[col]

            return pymrmr.mRMR(df, self.scheme, self.channel)  # scheme is "MID" or "MIQ"

        elif self.x.shape[2] == 62:
            mean_x = np.mean(self.x, axis=3, keepdims=True)
            mean_x = mean_x.reshape(mean_x.shape[0] * mean_x.shape[1],
                                    mean_x.shape[2])
            cols = [
                "FP1", "FPZ", "FP2", "AF3", "AF4", "F7", "F5", "F3", "F1",
                "FZ", "F2", "F4", "F6", "F8", "FT7", "FC5", "FC3", "FC1",
                "FCZ", "FC2", "FC4", "FC6", "FT8", "T7", "C5", "C3", "C1",
                "CZ", "C2", "C4", "C6", "T8", "TP7", "CP5", "CP3", "CP1",
                "CPZ", "CP2", "CP4", "CP6", "TP8", "P7", "P5", "P3", "P1",
                "PZ", "P2", "P4", "P6", "P8", "PO7", "PO5", "PO3", "POZ",
                "PO4", "PO6", "PO8", "CB1", "O1", "OZ", "O2", "CB2"
            ]  # names of the 62 channels

            df = pd.DataFrame(mean_x, columns=cols)

            self.target = self.target.reshape(
                self.target.shape[0] * self.target.shape[1], 1)

            df["Class"] = self.target
            df["Class"] = df["Class"].astype(int)
            col = df.columns.tolist()
            # put Class (target) in the first column; the rest are features (channels)
            col = col[-1:] + col[:-1]
            df = df[col]

            return pymrmr.mRMR(df, self.scheme, self.channel)  # scheme is "MID" or "MIQ"
Example #2
    def processing_mrmr(df, n_components, mrmr_method='MIQ'):

        top_features = pymrmr.mRMR(df, mrmr_method, n_components)

        if 'DX' in top_features:
            # the target column 'DX' was selected as a feature: request one
            # extra feature and drop 'DX' from the ranking
            print('Issue with mRMR - need next feature')
            top_features = pymrmr.mRMR(df, mrmr_method, n_components + 1)
            top_features = [f for f in top_features if f != 'DX']

        top_features.append('DX')  # keep the target column in the returned frame

        return df[top_features], top_features
Example #3
 def mRMR(self, X_train, X_test, y_train, feat_names, **kwargs):
     outliers = kwargs["outliers"]
     n_bins = kwargs["n_bins"]
     method = kwargs["method"]
     retain_ratio = kwargs["retain_ratio"]
     top_n = (int(retain_ratio * len(feat_names))
              if retain_ratio is not None else X_train.shape[1])
     if y_train.dtype != int:
         le = LabelEncoder()
         y_train = le.fit_transform(y_train).astype(int)
     feat_names = list(feat_names)
     df = pd.DataFrame(np.hstack((y_train[:, np.newaxis], X_train)),
                       columns=["label"] + feat_names)
     df_bin = df.copy()
     for f in feat_names:
         series = df[f]
         if outliers:
             # clip outliers to roughly the 1st and 99th percentiles before binning
             if not np.all(series.values == series.values[0]):
                 # qcut needs at least two distinct values, otherwise it crashes
                 _, bins = pd.qcut(series + self.jitter(series),
                                   np.linspace(0, 1, 100),
                                   retbins=True)
                 first_perc, ninetyninth_perc = bins[1], bins[-2]
                 series = np.maximum(series, first_perc)
                 series = np.minimum(series, ninetyninth_perc)
         df_bin[f] = pd.cut(series,
                            bins=n_bins,
                            labels=np.arange(0, n_bins))
     which_features = pymrmr.mRMR(df_bin, method, top_n)
     return df[which_features]
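
pymrmr works on discrete values, which is why this snippet bins every feature with pd.cut before calling it. A stripped-down standalone version of just that binning step (the function name and defaults are illustrative):

import numpy as np
import pandas as pd

def bin_features(df, label_col="label", n_bins=5):
    # discretise each feature column into n_bins integer-labelled bins;
    # the label column is left untouched
    out = df.copy()
    for col in out.columns:
        if col != label_col:
            out[col] = pd.cut(out[col], bins=n_bins, labels=np.arange(n_bins))
    return out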
Example #4
    def FV_mRMR(self):
        print("\nrunning mRMR algorithm for feature selection")
        ae = AutoEncoder('fv_gmm', 0)

        with smart_open(os.path.join(ae.save_dir, 'model_list.txt'),
                        'r',
                        encoding='utf-8') as model_path:
            for line_no, line in enumerate(model_path):
                line = line.rstrip('\n')  # each line is a model directory path
                print(line_no, '\t', line[65:])

                train_file = os.path.join(
                    line, 'fisher_vector_train_%d.npy' % self.kernel)
                dev_file = os.path.join(
                    line, 'fisher_vector_dev_%d.npy' % self.kernel)
                if os.path.isfile(train_file) and os.path.isfile(dev_file):
                    X_train = np.load(train_file)
                    X_dev = np.load(dev_file)
                    y_train = np.load(os.path.join(line, 'label_train.npy'))
                    y_dev = np.load(os.path.join(line, 'label_dev.npy'))
                    X_train = np.reshape(X_train,
                                         (-1, np.prod(X_train.shape[1:])))
                    X_dev = np.reshape(X_dev, (-1, np.prod(X_dev.shape[1:])))
                    X_train = np.nan_to_num(X_train)
                    X_dev = np.nan_to_num(X_dev)

                    df = pd.DataFrame(np.vstack((X_train, X_dev)))
                    df.columns = [
                        'feature_%d' % i for i in range(len(X_train[0]))
                    ]
                    df.insert(0, 'label', np.hstack((y_train, y_dev)).T)
                    print(df.head())

                    feature_list = pymrmr.mRMR(df, 'MIQ', 50)
                    np.save(os.path.join(line, 'feature_list'), feature_list)

                    X_train_df = pd.DataFrame(X_train)
                    X_train_df.columns = [
                        'feature_%d' % i for i in range(len(X_train[0]))
                    ]
                    X_train = X_train_df.loc[:, feature_list]

                    X_dev_df = pd.DataFrame(X_dev)
                    X_dev_df.columns = [
                        'feature_%d' % i for i in range(len(X_dev[0]))
                    ]
                    X_dev = X_dev_df.loc[:, feature_list]

                    print(X_train.head())
                    print(X_dev.head())

                    np.save(os.path.join(line, 'X_train_mrmr'), X_train)
                    np.save(os.path.join(line, 'X_dev_mrmr'), X_dev)
                    print("\nfeature selection done and data saved.")
Example #5
    def execute(data, cols):
        max_features = len(cols)
        print("====== mRMR Feature Ranking =====")
        ranking = pymrmr.mRMR(data, 'MID', max_features)

        #return ranking
        return '-- Not working --'
Example #6
    def get_filtered_data_frame_columns(df: pd.DataFrame,
                                        mrmr=False,
                                        features_left_cnt=10):
        if features_left_cnt >= len(df.columns) - 1:
            return df.columns

        if mrmr and len(df.columns) - features_left_cnt < 10:
            import pymrmr
            return [df.columns.values[0]] + pymrmr.mRMR(df, 'MID', features_left_cnt)
        else:
            data = df.to_numpy()
            correlations = feature_selection.mutual_info_regression(
                data[:, 1:], data[:, 0])
            threshold = sorted(correlations, reverse=True)[features_left_cnt]

            columns = []
            # first pass: features strictly above the threshold
            for i, col in enumerate(df.columns[1:]):
                if len(columns) < features_left_cnt and correlations[i] > threshold:
                    columns.append(col)
            # second pass: fill remaining slots with features at the threshold
            for i, col in enumerate(df.columns[1:]):
                if len(columns) < features_left_cnt and correlations[i] == threshold:
                    columns.append(col)

            return [df.columns.values[0]] + columns
Example #7
    def GetSelectedFeatureIndex(self, data_container):
        data = data_container.GetArray()
        data /= np.linalg.norm(data, ord=2, axis=0)
        label = data_container.GetLabel()

        if data.shape[1] < self.GetSelectedFeatureNumber():
            print('mRMR: the number of features {:d} in the data container is '
                  'smaller than the required number {:d}'.format(
                      data.shape[1], self.GetSelectedFeatureNumber()))
            self.SetSelectedFeatureNumber(data.shape[1])

        feature_list = ['class'] + data_container.GetFeatureName()
        feature_index = []
        pd_label = pd.DataFrame(label)
        pd_data = pd.DataFrame(data)
        mRMR_input = pd.concat([pd_label, pd_data], axis=1)
        mRMR_input.columns = feature_list
        parameter_list = self.LoadFeatureSelectorParameterList(
            relative_path=r'HyperParameters\FeatureSelector')
        feature_name = pymrmr.mRMR(mRMR_input,
                                   parameter_list[0]['mutual_information'],
                                   self.GetSelectedFeatureNumber())
        feature_list.remove('class')

        rank = []
        for index, item in enumerate(feature_name):
            feature_index.append(feature_list.index(item))
            rank.append(index)
        return feature_index, rank, feature_name
Example #8
def select_features(X, y, modality, method, n_feats):

    if method == 'mrmr':
        if modality == 'gene' or modality == 'meth':  # pre-filter these with a t-test
            init_feats = reduce(X, y, 2000)
            X = X.loc[:, init_feats]
        elif modality == 'CNV':
            init_feats = chi(X, y, 2000)
            X = X.loc[:, init_feats]

        # helper discretizes features; 4th param is the number of std devs from
        # the mean used as the discretisation threshold
        X, y = discretize(X, y, modality, .5)
        # combine response and features into one dataframe [y, X]
        z = pd.concat([y, X], axis=1)

        # call the mRMR function
        feat_selected = pymrmr.mRMR(z, 'MIQ', n_feats)
    elif method == 'ttest':
        feat_selected = reduce(X, y, n_feats)
    elif method == 'chi-squared':
        feat_selected = chi(X, y, n_feats)
    elif method == 'minfo':
        if modality == 'miRNA':
            X, y = discretize(X, y, modality, 2)

        if modality == 'gene' or modality == 'meth':
            X, y = discretize(X, y, modality, 1)
            init_feats = chi(X, y, 5000)
            X = X.loc[:, init_feats]

        if modality == 'CNV':
            init_feats = chi(X, y, 5000)
            X = X.loc[:, init_feats]

        feat_selected = minfo(X, y, n_feats)

    return feat_selected
Example #9
    def mRMR(self):

        df_ = self.data.copy()
        cols = list(df_.columns)[:-1] + ['class']
        df_.columns = cols

        if self.type == CLASSIFICATION:
            features_ = pymrmr.mRMR(df_, 'MID', self.num_top_features)
            self.report_feature_importance(features_,
                                           self.num_top_features,
                                           label="mRMR - MID")

            features_ = pymrmr.mRMR(df_, 'MIQ', self.num_top_features)
            self.report_feature_importance(features_,
                                           self.num_top_features,
                                           label="mRMR - MIQ")
        else:
            print("mRMR is designed for classification, not regression")
Example #10
def mRMR(x_train, y_train, n_features):
    # work on a copy so the caller's frame isn't mutated by insert()
    df = x_train.copy()
    df.insert(loc=0, column='class', value=y_train)
    features = pymrmr.mRMR(df, 'MIQ', n_features)

    # map the selected names back to column indices in the original x_train
    column_name = x_train.columns.tolist()
    results = [column_name.index(f) for f in features]

    return results
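
A hedged usage sketch; X_df and y_labels are placeholders for a feature DataFrame and its label vector:

selected_idx = mRMR(X_df, y_labels, 10)
X_selected = X_df.iloc[:, selected_idx]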
Example #11
    def fit(self, X, y):
        n, d = X.shape

        # Creating a dataFrame
        vectors = np.concatenate([y[:, None], X], axis=1)
        columns = ["label"] + [str(x) for x in range(d)]
        df = pd.DataFrame(vectors, columns=columns)

        with silence():
            output = pymrmr.mRMR(df, 'MIQ', self.k)

        self.index = np.array([int(x) for x in output])

        return self
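
The silence() context manager used here isn't shown in the snippet. pymrmr prints its ranking table to standard output partly from native code, so swapping sys.stdout may not be enough; a minimal sketch of such a helper using a file-descriptor-level redirect:

import contextlib
import os
import sys

@contextlib.contextmanager
def silence():
    # redirect the process-level stdout (fd 1) to /dev/null so output
    # printed by C extensions is suppressed too
    sys.stdout.flush()
    saved = os.dup(1)
    devnull = os.open(os.devnull, os.O_WRONLY)
    try:
        os.dup2(devnull, 1)
        yield
    finally:
        os.dup2(saved, 1)
        os.close(devnull)
        os.close(saved)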
Example #12
def dim_reduction_mRMR(df, k):
    import pymrmr
    s = datetime.now()
    reduced = pymrmr.mRMR(df, 'MIQ', k)
    func.execution_time(s)
    func.beeep()  # beep when finished
    # Example of preparing a frame for mRMR (class first, named features):
    #   ordinal_df = pd.DataFrame(np.column_stack([y, X_pad]))
    #   ordinal_df.rename(columns={0: 'class'}, inplace=True)
    #   for i in range(1, 1001):
    #       ordinal_df.rename(columns={i: 'Feat' + str(i)}, inplace=True)
    #   reduced_ordf = prp.dim_reduction_mRMR(ordinal_df, k)
    return reduced
Example #13
 def fit(self, X, y):
     X_frame = pd.DataFrame(X,
                            index=list(y.index),
                            columns=[str(i) for i in range(X.shape[1])])
     self.selected_mask = []
     data_frame = pd.concat([y, X_frame], axis=1)
     all_features = X_frame.columns.tolist()
     selected_features = pymrmr.mRMR(data_frame, self.selection_method,
                                     self.selected_num)
     for i in range(len(all_features)):
         if all_features[i] in selected_features:
             self.selected_mask.append(True)
         else:
             self.selected_mask.append(False)
     return self
Example #14
def mrmr_feature(csv_path, feature_number_list):

    df = pd.read_csv(csv_path)

    for feature_number in feature_number_list:
        result = pymrmr.mRMR(df, 'MIQ', feature_number)
        book = xlwt.Workbook()
        sheet1 = book.add_sheet(u'sheet1', cell_overwrite_ok=True)
        for i, name in enumerate(result):
            sheet1.write(0, i, name)  # header row: selected feature name
            for j in range(len(df[name])):
                sheet1.write(j + 1, i, float(df[name][j]))
        # the original never persists the workbook; the file name here is illustrative
        book.save('mrmr_features_%d.xls' % feature_number)
Example #15
  def _mRMR(self, n, method='MIQ', is_discrete=True, nscale=1):
    ''' minimum Redundancy Maximum Relevance algorithm '''

    sX = self.X.copy()

    if not is_discrete:
      log.info(f'Discretising X using scale = scale * {nscale}')
      sX = discretise(sX, nscale)

    sX.insert(0, self.y.columns[0], self.y.iloc[:, 0])

    log.info(f'Starting mRMR ({method}, n={n})')
    feats = pymrmr.mRMR(sX, method, n)

    log.info(f'Updating dataset, {len(feats)} features')
    self.X = self.X[feats]
Example #16
    def findBestFeaturesMRMR(self):
        feature_set = dict()
        '''
        Find feature subsets of every size from 1 to len(self.features)
        '''
        for i in range(0, len(self.features)):
            feature_set[i] = pymrmr.mRMR(self.data, 'MID', i + 1)

        print(len(feature_set))

        index_feature_set = dict()
        for key, value in feature_set.items():
            index_feature_set[key] = list()
            for v in value:
                index_feature_set[key].append(list(self.data.columns).index(v))
        '''
        Cross-validation to find the best set of features
        '''
        loss = 100
        index = 0
        for i in range(0, len(feature_set)):
            model = self.model
            kf = KFold(n_splits=4)
            total_loss = 0
            for train_index, test_index in kf.split(self.data.iloc[:, 1:]):
                train_X = self.data.iloc[train_index, index_feature_set[i]]
                test_X = self.data.iloc[test_index, index_feature_set[i]]
                train_y = self.data.iloc[train_index, 0]
                test_y = self.data.iloc[test_index, 0]
                model.fit(train_X.values, list(train_y.values))
                y_pred = model.predict_proba(test_X.values)
                total_loss += log_loss(list(test_y.values), y_pred)
            if (total_loss / 4) < loss:
                loss = total_loss / 4  # store the mean fold loss for comparison
                index = i

        final_features = list()
        for x in index_feature_set[index]:
            final_features.append(self.data.columns[x])

        return final_features
Example #17
def mRMR_sel(X_tr, X_te, y_tr, k, feat_name):

    X_tr, X_te, feat_name = select_fs_alg('anova', X_tr, X_te, y_tr, 500,
                                          feat_name)
    if X_tr.shape[1] < k:
        # fewer than k features after ANOVA filtering: return the sets unchanged
        return X_tr, X_te, feat_name

    data = np.concatenate([np.expand_dims(y_tr, 1), X_tr], axis=1)
    fin_name = np.hstack((np.array('tar'), feat_name))
    df = pd.DataFrame(data, columns=fin_name)
    df_te = pd.DataFrame(X_te, columns=feat_name)
    mr_feat = pymrmr.mRMR(df, 'MIQ', k)
    X_t = np.array(df[mr_feat])
    X_te = np.array(df_te[mr_feat])

    return X_t, X_te, mr_feat
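
Pre-filtering with ANOVA down to 500 features before running mRMR is a pragmatic speed-up: pymrmr evaluates mutual information across many feature pairs, which gets expensive as the column count grows. A hedged usage sketch (array shapes and names are illustrative):

# hypothetical call: numpy arrays plus an array of feature names
X_tr_sel, X_te_sel, kept_names = mRMR_sel(X_train, X_test, y_train, 30, feat_names)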
Example #18
def select_features(X, y, modality, method, n_feats):

    if method == 'mrmr':
        if modality == 'gene' or modality == 'meth':  # pre-filter these with a t-test
            init_feats = reduce(X, y, 2000)
            X = X.loc[:, init_feats]
        elif modality == 'CNV':
            X, y = discretize(X, y, modality, 1)
            init_feats = chi(X, y, 2000)
            X = X.loc[:, init_feats]

        # helper discretizes features; 4th param is the number of std devs from
        # the mean used as the discretisation threshold
        X, y = discretize(X, y, modality, .5)
        z = pd.concat([y, X], axis=1)

        # calling mRMR function
        feat_selected = pymrmr.mRMR(z, 'MIQ', n_feats)
    elif method == 'ttest':
        feat_selected = reduce(X, y, n_feats)
    elif method == 'chi-squared':
        X, y = discretize(X, y, modality, .3)
        feat_selected = chi(X, y, n_feats)
    elif method == 'minfo':
        if modality == 'miRNA':
            X, y = discretize(X, y, modality, 2)
        elif modality == 'gene' or modality == 'meth':
            X, y = discretize(X, y, modality, 1)
            init_feats = chi(X, y, 5000)
            X = X.loc[:, init_feats]
        elif modality == 'CNV':
            X, y = discretize(X, y, modality, 1)
            init_feats = chi(X, y, 1000)
            X = X.loc[:, init_feats]

        feat_selected = minfo(X, y, n_feats)

    return feat_selected
Example #19
def select_features(X, y, selection_algorithm="mRMR", num_of_features=10):
    selection_algorithm = selection_algorithm.lower()
    assert selection_algorithm in ("mrmr", "select_k_best",
                                   "rrelief"), "Invalid selection algorithm."
    print(f"Selecting features with {selection_algorithm} selection algorithm....")
    if selection_algorithm == "mrmr":
        features = mRMR(X, 'MIQ', num_of_features)
        X_selected_features = X[features]
    elif selection_algorithm == "select_k_best":
        X_selected_features = SelectKBest(chi2, k=num_of_features).fit_transform(X, y)
    else:
        # rrelief branch raises KeyError - debug required
        # Relief runs by default on all processors concurrently
        r = relief.Relief(n_features=num_of_features)
        X_selected_features = r.fit_transform(X, y)
    print("Feature selection finished....")
    return X_selected_features
Example #20
 def _choose_fea(self, filename, num):
     myfilename = filename + '.txt'
     data = pd.read_csv(myfilename, sep=' ')  # load the space-separated feature table
     self.val = pymrmr.mRMR(data, 'MID', num)
     print(self.val)
     numFeat = len(open(myfilename).readline().split(' '))
     dataset = []
     fr = open(myfilename)
     for j, line in enumerate(fr.readlines()):
         curline = line.strip().split(' ')
         if j != 0:  # skip the header row
             xi = [curline[0]]
             for i in range(1, numFeat):
                 if self.val.count('%d' % i) > 0:  # keep only mRMR-selected columns
                     xi.append(float(curline[i]))
             dataset.append(xi)
     chfilename = filename + 'ch'
     self.saveData(chfilename, np.array(dataset))
Example #21
    def fit(self, X, y):
        print('***** Fitting *****')
        # Check if DataFrame
        X = self.check_df(X)
        y = self.check_df(y)

        # Compose new DataFrame
        feat_cols = [f'feat_{i}' for i in range(X.shape[1])]
        X_df = pd.DataFrame(data=X, columns=feat_cols)
        target = pd.Series(y, name='target')
        X_df = X_df.join(target)  # Append labels to dataframe

        # Re-arrange the DataFrame so 'target' is the first column
        ordered_cols = ['target'] + feat_cols
        X_df = X_df[ordered_cols]

        # Perform the feature selection using mRMR
        self.selected_features = pymrmr.mRMR(X_df, self.method, self.k_features)
        self.selected_indexes = [X_df.drop('target', axis='columns').columns.tolist().index(i) for i in self.selected_features]

        return self
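
The snippet stops at fit(); a companion transform() consistent with the stored indexes might look like this sketch (it assumes check_df returns a numpy array, which the indexing below relies on):

    def transform(self, X):
        # keep only the mRMR-selected feature columns, in ranked order
        X = self.check_df(X)
        return X[:, self.selected_indexes]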
Example #22
def select_n_genes_mRMR(df, num_genes):
    return pymrmr.mRMR(df, 'MIQ', num_genes)
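
A hedged usage sketch for this one-liner; the frame layout and gene names are illustrative:

import pandas as pd

# illustrative frame: phenotype label first, then discretised expression values
df = pd.DataFrame({
    'class': [0, 1, 0, 1, 1, 0],
    'BRCA1': [1, 3, 1, 2, 3, 1],
    'TP53':  [2, 2, 3, 1, 1, 3],
    'EGFR':  [0, 1, 0, 2, 1, 0],
})
top_genes = select_n_genes_mRMR(df, 2)  # e.g. ['BRCA1', 'TP53']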
Example #23
# import lightgbm as lgb
import pandas as pd
import pymrmr  # note: this script was run under Python 3.6
import warnings

warnings.filterwarnings("ignore")

train = pd.read_csv('../data/feature_filter_60.csv', nrows=50)
y = pd.read_csv('../data/label_5.csv', nrows=50)

train_x = train
train_y = y

test_x = pd.read_csv('../test/feature_filter_60.csv', nrows=50)
test_y = pd.read_csv("../test/label_5_all.csv", nrows=50)
print(train_x.shape, test_x.shape)
data = pd.DataFrame(
    pd.concat((train_x, test_x)).drop(['o_x.26', 'o_y.26', 'o_z.26', 'yaw.26', 'o_w.26'], axis=1)).astype('int32')

# data = data[0:len(data)].astype("int32")  # .astype(str)
label = pd.concat((train_y, test_y), axis=0)
print(data.shape, label.shape)
df = pd.concat((label, data), axis=1)
print(df.shape)

res = pymrmr.mRMR(df, 'MIQ', 500)  # use the label-first frame built above

print(len(res))  # mRMR returns a plain list of column names
Example #24
# fragment of a longer script; the imports below are assumed for this excerpt
import numpy as np
import mifs
import pymrmr
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression

y[1::2] = 0  # relabel every second sample as class 0

# split set into training & test sets
X, X_test, y, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)

nfeat_v = [10, 25, 50, 75, 100, 200]

# #############################################################################
# Classification and ROC analysis

print('%6s\t %6s\t %6s\t %6s\t %6s' % ('dset', 'nfeat', 'mean', 'stdev', 'pval'))

feat_selector = mifs.MutualInformationFeatureSelector()

for nfeat in nfeat_v:
    ind = pymrmr.mRMR(X, 'MIQ', nfeat)  # select nfeat features on this pass
    X_new = X[:, ind]

    # Run classifier with cross-validation and plot ROC curves
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)
    classifier = LogisticRegression(C=1e15)

    tprs = []
    aucs = []
    aucs_tr = []
    mean_fpr = np.linspace(0, 1, 100)

    i = 0
    for train, test in cv.split(X_new, y):
        probas_ = classifier.fit(X_new[train], y[train]).predict_proba(X_new[test])
        # Compute ROC curve and area under the curve
Example #25
# Randomly shuffle the rows of the combined dataset
outlierless_dt = outlierless_dt.sample(frac=1, random_state=1).reset_index(drop=True)
outlierless_dt.shape

"""**Feature Selection**
* Select features using MrMr algorithm
"""

nsp = outlierless_dt['NSP']  # keep the target as a Series for insert() below
feature_dt = outlierless_dt.drop(['NSP'], axis=1)

# pymrmr expects the target as the first column of the dataframe
feature_dt.insert(0, 'NSP', nsp)
feature_dt.head()
feature_dt.shape

mrmr_features = pymrmr.mRMR(feature_dt, 'MIQ', 10)
type(mrmr_features)
print(mrmr_features)
final_dt = outlierless_dt[mrmr_features]
final_dt.insert(0, 'NSP', nsp)
final_dt.shape
# final_dt contains the feature-selected data

"""**Feature Engineering**
1.   combine similar features
2.   Extract PCA features

**Start Analysis**

**Support Vector Machine**
"""
Example #26
def get_maxrel_feature(dataframe, num_features, mode="MIQ"):
    feature_index = pymrmr.mRMR(dataframe, mode, num_features)
    important_feature_index = [int(x) for x in feature_index]
    return important_feature_index
Example #27
    def bench(self, X, X_norm, y, n=2):
        num_feats = 20
        output_data = {'method': list(), 'features': list(), 'time': list(), self.test_att: list(), 'supervised': list()}

        # ----------------------------------------------------------------
        # CFS
        # start = time.perf_counter()
        # idx = cfs(X_norm.to_numpy(), y.to_numpy())[0]
        # print(idx)
        # selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        # output_data['method'].append('CFS')
        # output_data['time'].append(time.perf_counter() - start)
        # output_data['features'].append(selected_features)
        # output_data[self.test_att].append(self.train_real_data(selected_features, X))

        # LA: Laplacian Score
        start = time.perf_counter()
        kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
        W = construct_W.construct_W(X_norm.to_numpy(), **kwargs_W)
        score = lap_score.lap_score(X_norm.to_numpy(), W=W)
        idx = lap_score.feature_ranking(score)
        selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        output_data['method'].append('Laplacian Score')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(selected_features)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(selected_features, X))
        print(output_data)

        # FCBF: Fast Correlation-Based Filter
        # start = time.perf_counter()
        # idx = fcbf(X_norm.to_numpy(), y.to_numpy(), n_selected_features=num_feats)[0]
        # selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        # output_data['method'].append('FCBF')
        # output_data['time'].append(time.perf_counter() - start)
        # output_data['features'].append(selected_features)
        # output_data['supervised'].append(True)
        # output_data[self.test_att].append(self.train_real_data(selected_features, X))
        # print(output_data)
        # output_data['method'].append('FCBF')
        # output_data['time'].append(9999999)
        # output_data['features'].append([])
        # output_data['supervised'].append(True)
        # output_data[self.test_att].append(0.0)

        # UDFS: Unsupervised Discriminative Feature Selection
        start = time.perf_counter()
        Weight = udfs(X_norm.to_numpy(), gamma=0.1, n_clusters=n)
        idx = feature_ranking(Weight)
        selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        output_data['method'].append('UDFS')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(selected_features)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(selected_features, X))
        print(output_data)

        # SPEC: Spectral Feature Selection
        start = time.perf_counter()
        score = spec(X_norm.to_numpy())
        idx = feature_ranking_spec(score)
        selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        output_data['method'].append('SPEC')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(selected_features)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(selected_features, X))
        print(output_data)

        # mRMR: minimum redundancy maximum relevance
        # note: pymrmr treats the first column of the frame as the class label
        start = time.perf_counter()
        mrmr = pymrmr.mRMR(X_norm, 'MIQ', num_feats)
        output_data['method'].append('MRMR(MIQ)')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(mrmr)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(mrmr, X))
        print(output_data)

        # mRMR: minimum redundancy maximum relevance (MID scheme)
        start = time.perf_counter()
        mrmr = pymrmr.mRMR(X_norm, 'MID', num_feats)
        output_data['method'].append('MRMR(MID)')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(mrmr)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(mrmr, X))
        print(output_data)

        # Recursive Feature Elimination (RFE):

        from sklearn.feature_selection import RFE
        from sklearn.linear_model import LogisticRegression
        rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_feats, step=10, verbose=5)
        start = time.perf_counter()
        rfe_selector.fit(X_norm, y)
        rfe_support = rfe_selector.get_support()
        rfe_feature = X_norm.loc[:, rfe_support].columns.tolist()
        output_data['method'].append('RFE')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(rfe_feature)
        output_data['supervised'].append(True)
        output_data[self.test_att].append(self.train_real_data(rfe_feature, X))
        print(output_data)

        # ----------------------------------------------------------------
        # Lasso: SelectFromModel:

        from sklearn.feature_selection import SelectFromModel
        from sklearn.linear_model import LogisticRegression

        embeded_lr_selector = SelectFromModel(
            LogisticRegression(penalty="l1", solver="liblinear"),
            max_features=num_feats)
        start = time.perf_counter()
        embeded_lr_selector.fit(X_norm, y)

        embeded_lr_support = embeded_lr_selector.get_support()
        embeded_lr_feature = X_norm.loc[:, embeded_lr_support].columns.tolist()
        output_data['method'].append('Lasso')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(embeded_lr_feature)
        output_data['supervised'].append(True)
        output_data[self.test_att].append(self.train_real_data(embeded_lr_feature, X))
        print(output_data)
        print(str(len(embeded_lr_feature)), 'selected features')

        # -----------------------------------------------------------------------------
        # Tree - based: SelectFromModel:

        from sklearn.feature_selection import SelectFromModel
        from sklearn.ensemble import RandomForestClassifier

        embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
        start = time.perf_counter()
        embeded_rf_selector.fit(X_norm, y)

        embeded_rf_support = embeded_rf_selector.get_support()
        embeded_rf_feature = X_norm.loc[:, embeded_rf_support].columns.tolist()
        output_data['method'].append('Tree_Based_RF')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(embeded_rf_feature)
        output_data['supervised'].append(True)
        output_data[self.test_att].append(self.train_real_data(embeded_rf_feature, X))
        print(output_data)
        print(str(len(embeded_rf_feature)), 'selected features')

        # -------------------------------------------------------------------------------
        # also tree based:

        from sklearn.feature_selection import SelectFromModel
        from lightgbm import LGBMClassifier

        lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
                              reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

        embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
        start = time.perf_counter()
        embeded_lgb_selector.fit(X_norm, y)

        embeded_lgb_support = embeded_lgb_selector.get_support()
        embeded_lgb_feature = X_norm.loc[:, embeded_lgb_support].columns.tolist()
        output_data['method'].append('Tree_Based_lightGBM')
        output_data['time'].append(time.perf_counter() - start)
        output_data['supervised'].append(True)
        output_data['features'].append(embeded_lgb_feature)
        output_data[self.test_att].append(self.train_real_data(embeded_lgb_feature, X))
        print(output_data)
        print(str(len(embeded_lgb_feature)), 'selected features')

        return output_data
Example #28
#x.remove(y)
##GBR
#from h2o.estimators.gbm import H2OGradientBoostingEstimator
#gbm = H2OGradientBoostingEstimator()
#gbm.train(x=x, y=y, training_frame=train, validation_frame=valid)
#y_pred=gbm.predict(test_data)
##gbm.cross_validation_models()
##gbm.cross_validation_metrics_summary()
#gbm.varimp_plot()
#gbm.varimp()
#gbm.mse(train=True, valid=True, xval=False)
#gbm.r2(train=True, valid=True, xval=False)

result = pd.concat([y_train, train_data], axis=1)

columns = pymrmr.mRMR(data, 'MIQ', 10)

print(columns)

new_data = pd.DataFrame()

new_data_test = pd.DataFrame()

for i in columns:
    new_data = pd.concat([new_data, data[i]], axis=1)
    new_data_test = pd.concat([new_data_test, data[i]], axis=1)
Example #29
    plt.ioff()
    bin_cutoff[feat] = np.histogram(featMatAll2[feat], bins='fd')[1]

#use these to create bins for the cutting up the data - can input into pandas cut
cat = pd.DataFrame()
for feat in featMatAll2.columns:
    cat[feat] = pd.cut(featMatAll2[feat], bins=bin_cutoff[feat],
                       labels=np.arange(1, len(bin_cutoff[feat])),
                       include_lowest=True)

# convert bin labels to ints
cat2 = pd.DataFrame(data=np.array(cat.values), dtype=int, columns=cat.columns)
#add in info about rows
cat.insert(0, column = 'drug', value = featMatAll['drug'], allow_duplicates=True)

#select 150 features using mRMR
mrFeatsA = pymrmr.mRMR(cat2, 'MID', 150)

# export these features as a txt file
out = open(os.path.join(directoryA[:-7], 'mRMR_featsAgar.txt'), 'w')
out.writelines(["%s\n" % item for item in mrFeatsA])
out.close()

#so this is the mRMR selected feature set.
mrFeatMatAll = pd.concat([featMatAll[mrFeatsA], featMatAll.iloc[:,-3:]], axis=1)


#%%
# We want to show that this feature set performs better than a random set of features
    #so use sss split again to pick out 150 features randomly, and do LDA (10CV'd)
        #need several loops to do this one
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 10 10:04:56 2019

@author: Arif Shahriar 15201002
"""
import numpy as np
import pandas as pd
import pymrmr
df=pd.read_csv("Pymrmr_data.csv")
df=df.drop(columns=['Timestamp','Rehab','ID'])
print(df.head())
featureName = pymrmr.mRMR(df, 'MID', 40)
print("Number of features is", len(featureName))
print(featureName)

#from sklearn.datasets import make_classification
#
#from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"
#
#X, y = make_classification(n_samples=10000,
#                           n_features=6,
#                           n_informative=3,
#                           n_classes=2,
#                           random_state=0,
#                           shuffle=False)
#
## Creating a dataFrame
#df = pd.DataFrame({'Feature 1':X[:,0],
#                                  'Feature 2':X[:,1],