def fit(self, X, y):
        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                     random_state=42)
        idxs = []
        hsics = []
        for train_index, test_index in list(sss.split(X, y)):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X[train_index], y[train_index])
            hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)
            hsics.append(hsic_lasso2)

            # not just best features - get their neighbors (similar features) too
            all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            # range() is fixed at loop entry, so growing all_ft_idx below is safe
            for i in range(len(all_ft_idx)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10),
                               dtype=int)
                # neighbor scores are continuous; casting them to int would
                # round them down and break the threshold test below
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10),
                                 dtype=float)
                idx = idx[score > self.neighbor_threshold]
                all_ft_idx = np.concatenate((all_ft_idx, idx))
            all_ft_idx = np.unique(all_ft_idx)

            idxs.append(all_ft_idx)
            # keep only features selected in every split so far (running intersection)
            if len(idxs) == 1:
                self.hsic_idx_ = idxs[0]
            else:
                self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
        print("HSIC done.", len(self.hsic_idx_))

        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
                # one extra resampling pass per remaining minority class
                for i in range(len(np.unique(y)) - 1):
                    sX, sy = sm.fit_resample(sX, sy)
            except Exception:
                # ADASYN can fail when a minority class has too few samples;
                # fall back to the unresampled data
                pass
            print("ADASYN done. Starting clf")

        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self
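A note on context: this fit method references several hyperparameters (self.hsic_splits, self.B, self.M, self.n_features, self.neighbor_threshold, self.adasyn_neighbors) whose definitions are not shown. Below is a minimal sketch of what the host class's constructor might look like; the class name and every default value are assumptions, not the original author's choices:

class HSICStabilityClassifier:  # hypothetical name for the host class
    def __init__(self, n_features=50, B=20, M=3, hsic_splits=3,
                 neighbor_threshold=0.5, adasyn_neighbors=5):
        # assumed defaults - only the attribute names come from fit() above
        self.n_features = n_features        # features requested per HSIC Lasso run
        self.B = B                          # pyHSICLasso block size (0 = vanilla)
        self.M = M                          # number of block permutations
        self.hsic_splits = hsic_splits      # StratifiedShuffleSplit n_splits
        self.neighbor_threshold = neighbor_threshold  # min neighbor score to keep
        self.adasyn_neighbors = adasyn_neighbors      # ADASYN n_neighbors (0 disables)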
Example #2
def HSIC_lasso():
    hsic = HSICLasso()
    hsic.input(data, labels)
    before = datetime.datetime.now()
    hsic.classification(num_feat=treshold, B=0, M=1, max_neighbors=10, discrete_x=False)
    # B and M enable block-wise (incremental) computation for large datasets:
    # B splits the samples into blocks; for the vanilla algorithm use B=0, M=1
    after = datetime.datetime.now()
    print("HSIC Lasso")
    selected = hsic.get_index()
    print(len(selected))
    print("cas: " + str(after - before))
    print('\n')
    if len(selected) < len(header):
        transform_and_save(selected, "HSIC_Lasso")
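As the translated comment notes, B and M control pyHSICLasso's block-wise approximation. A small self-contained sketch of the two modes (the random data and all parameter values are illustrative):

import numpy as np
from pyHSICLasso import HSICLasso

data = np.random.randn(60, 100)            # 60 samples, 100 features
labels = np.random.randint(0, 2, size=60)  # binary labels

hsic = HSICLasso()
hsic.input(data, labels)

# vanilla HSIC Lasso: a single block over all samples
hsic.classification(num_feat=10, B=0, M=1)

# block-wise variant: blocks of 20 samples, 3 permutations; an approximation
# that is much lighter on memory for large datasets (B should divide n)
hsic.classification(num_feat=10, B=20, M=3)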
Example #3
def main():
    hsic_lasso = HSICLasso()
    #out_list = ['c'+str(i) for i in range(1,51)]
    #print (out_list)
    hsic_lasso.input("./user_data_new.csv",
                     output_list=[
                         'c1', 'c2', 'c3', 'c4', 'c5,', 'c6', 'c7', 'c8', 'c9',
                         'c10'
                     ])
    # ,'c11', 'c12', 'c13', 'c14', 'c15,', 'c16', 'c17', 'c18', 'c19', 'c20','c21', 'c22', 'c23', 'c24', 'c25,', 'c26', 'c27', 'c28', 'c29', 'c30'])
    hsic_lasso.regression(100, B=50)
    hsic_lasso.dump()
    select_index = hsic_lasso.get_index()
    print(select_index)
    print(hsic_lasso.get_index_score())
    #hsic_lasso.plot_path()
    print(hsic_lasso.get_features())
    X_select = hsic_lasso.X_in[select_index, :]
    np.savetxt('X_select.txt', X_select, fmt=str('%.5f'), encoding='utf-8')
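Note that pyHSICLasso stores its input transposed: X_in has shape (n_features, n_samples), which is why the snippet above indexes rows to select features. Continuing that example, a one-line sketch to recover the usual scikit-learn orientation:

# transpose back to (n_samples, n_selected_features) for scikit-learn use
X_sklearn = hsic_lasso.X_in[select_index, :].T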
Example #4
from pyHSICLasso import HSICLasso as HLasso
import numpy as np


class HSICLasso:
    def __init__(self, k=10):
        # wrap pyHSICLasso (aliased HLasso to avoid the name clash)
        # in a scikit-learn style fit/transform interface
        self.model = HLasso()
        self.k = k
        self.modelname = "HSICLasso_{}".format(k)
    
    def fit(self, X, y):
        self.model.input(X, y)
        self.model.classification(self.k)

        self.index = np.array(self.model.get_index())

        return self
        
    def transform(self, X):
        return X[:, self.index]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)
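A minimal usage sketch for the wrapper above; the random data is illustrative:

X = np.random.randn(100, 200)         # 100 samples, 200 features
y = np.random.randint(0, 2, 100)      # binary labels

selector = HSICLasso(k=10)
X_k = selector.fit_transform(X, y)    # keeps only the selected columns
print(selector.modelname, X_k.shape)  # e.g. HSICLasso_10 (100, 10)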
Example #5
    def fit(self, X, y):
        if X.shape[1] > 10000:
            #clf = RandomForestClassifier(n_estimators=1000,n_jobs=-1).fit(X,y)
            clf = LGBMClassifier(n_estimators=1000, n_jobs=-1).fit(X, y)
            ftimp = clf.feature_importances_
            relevant = np.where(ftimp > 0)[0]
            print("relevant ft:", len(relevant), "/", X.shape[1])
        else:
            relevant = np.arange(X.shape[1])

        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                     random_state=42)
        idxs = []
        hsics = []
        for train_index, test_index in list(sss.split(X, y)):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X[:, relevant][train_index], y[train_index])
            hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)
            hsics.append(hsic_lasso2)

            # not just best features - get their neighbors (similar features) too
            all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            # range() is fixed at loop entry, so growing all_ft_idx below is safe
            for i in range(len(all_ft_idx)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10),
                               dtype=int)
                # neighbor scores are continuous; casting them to int would
                # round them down and break the threshold test below
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10),
                                 dtype=float)
                idx = idx[score > self.neighbor_threshold]
                all_ft_idx = np.concatenate((all_ft_idx, idx))
            all_ft_idx = np.unique(all_ft_idx)

            idxs.append(relevant[all_ft_idx])
            #if len(idxs) == 1:
            #    self.hsic_idx_ = idxs[0]
            #else:
            #    self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
        self.hsic_idx_ = []

        # stability selection: keep features chosen in enough splits,
        # relaxing the vote threshold until at least one feature survives
        stability_concession = 0
        while len(self.hsic_idx_) == 0:
            featurecandidates = np.unique(np.concatenate(idxs))
            for candidate in featurecandidates:
                occurrences = np.sum(
                    [1 if candidate in idx else 0 for idx in idxs])
                if occurrences > self.stability_minimum_across_splits - stability_concession:
                    self.hsic_idx_.append(candidate)
            if len(self.hsic_idx_) > 1:
                break
            else:
                # no commonly occurring features found - reduce the threshold
                stability_concession += 1
        print("HSIC done.", len(self.hsic_idx_), "(out of ",
              len(featurecandidates), " candidates)")

        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
                # one extra resampling pass per remaining minority class
                for i in range(len(np.unique(y)) - 1):
                    sX, sy = sm.fit_resample(sX, sy)
            except Exception:
                # ADASYN can fail when a minority class has too few samples;
                # fall back to the unresampled data
                pass
            print("ADASYN done. Starting clf")

        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self
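The stability-selection step above can be read in isolation. Here is a standalone sketch of the same voting logic; the function name and arguments are illustrative, not part of the original class:

import numpy as np

def stable_features(idxs, min_votes):
    """Keep features selected in more than min_votes splits, relaxing
    the threshold until at least one feature survives."""
    candidates = np.unique(np.concatenate(idxs))
    concession = 0
    while True:
        kept = [c for c in candidates
                if np.sum([c in idx for idx in idxs]) > min_votes - concession]
        if kept:
            return np.array(kept)
        concession += 1  # nothing was stable enough - relax and retry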
Example #6
    def hsic_lasso_matric(self,
                          data,
                          n_jobs=2,
                          n_sample=False,
                          frac_sample=False):
        '''Compute an HSIC Lasso matrix (pairwise dependence between explanatory variables).
        The resulting matrix is not symmetric, so read it row-wise:
        row 0 holds the dependence between variable 0 and every other variable,
        row 1 the dependence between variable 1 and every other variable, and so on.

        data : (numpy or pandas) A data frame containing all explanatory and objective variables.
        n_jobs : (int) Number of cores to use; -1 for GPU.
        n_sample : (int) Number of rows to sample at random; False to disable.
        frac_sample : (float in [0, 1]) Fraction of rows to sample. Not used together with n_sample.
        '''
        data = copy(data)
        data = pd.DataFrame(data).dropna()
        # Optional random row sampling; n_sample and frac_sample are mutually exclusive
        if not n_sample:
            if frac_sample:
                # n_sample=False, frac_sample=float
                data = data.sample(frac=frac_sample, replace=True)
        else:
            if not frac_sample:
                # n_sample=int, frac_sample=False
                data = data.sample(n=n_sample, replace=True)
            else:
                # n_sample=int, frac_sample=float
                raise ValueError(
                    'Please enter a value for `frac` OR `n`, not both')

        data = check_array(data, accept_sparse="csc",
                           dtype=float)  # Convert to numpy.ndarray
        n_col = data.shape[1]
        hsic_array = np.empty((0, n_col - 1), float)
        for i in range(n_col):
            X = np.delete(data, obj=i, axis=1)
            y = data[:, i]

            # Calculation of hsic_lasso
            hsic_lasso = HSICLasso()
            hsic_lasso.input(X, y)
            hsic_lasso.regression(num_feat=X.shape[1],
                                  discrete_x=False,
                                  n_jobs=n_jobs)
            # get_index() returns features in descending score order, so re-sort by index
            hsic_ = np.array(
                [hsic_lasso.get_index(),
                 hsic_lasso.get_index_score()])
            hsic_ = hsic_.T  # transpose to one (index, score) pair per row
            # features that received no score are filled in with 0.0
            lack_set = set(range(X.shape[1])) - set(hsic_[:, 0])
            for lack in lack_set:
                lack_list = np.array([[lack, 0.0]])
                hsic_ = np.append(hsic_, lack_list, axis=0)
            hsic_ = hsic_[np.argsort(hsic_[:, 0])]  # sort by feature index
            hsic_array = np.append(hsic_array,
                                   hsic_[:, 1].reshape(1, -1),
                                   axis=0)
        # each row lacks its self-dependence entry, so insert 1.0 on the diagonal
        n_row = hsic_array.shape[0]
        for i in range(n_row):
            insert_i = (n_row + 1) * i
            hsic_array = np.insert(hsic_array, insert_i, 1.0)
        self.hsic_lasso = hsic_array.reshape(n_row, -1)
        return self.hsic_lasso
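This method assumes a few module-level imports that the snippet does not show (copy, pandas, scikit-learn's check_array, and pyHSICLasso). A hedged sketch of those imports plus a call, where FeatureAnalyzer is a stand-in for whatever class actually hosts the method:

from copy import copy
import numpy as np
import pandas as pd
from sklearn.utils import check_array
from pyHSICLasso import HSICLasso

# illustrative data: 50 rows, 4 variables
df = pd.DataFrame(np.random.randn(50, 4), columns=list("abcd"))
# m = FeatureAnalyzer().hsic_lasso_matric(df, n_jobs=2)
# m.shape == (4, 4): 1.0 on the diagonal, dependence read row-wise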
Example #7
def hsic(train, test, K):
    hsic_lasso = HSICLasso()
    hsic_lasso.input(train[0], train[1])
    hsic_lasso.classification(K, n_jobs=-1)
    indices = hsic_lasso.get_index()
    return indices
Example #8
def featureSelection(X, y, method='lasso', select=500):
    
    t0 = time.time()
    
    # sparse (15 seconds)
    if method == 'lasso':
        from sklearn import linear_model
        
        a = 0.861 if select == 500 else 0.0755
        lasso = linear_model.Lasso(alpha=a)
        lasso.fit(X, y)
        XSelected = X[:, lasso.coef_ != 0]
        indices = np.where(lasso.coef_ != 0)[0]
        if len(indices) > select:
            # rank by |coef| so strong negative weights are kept too
            indices = np.argsort(-np.abs(lasso.coef_))[:select]
    
    # non-sparse (157 seconds)
    if method == 'rf':
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel

        t = ExtraTreesRegressor(n_estimators=50)
        t.fit(X, y)
        model = SelectFromModel(t, prefit=True,
                                max_features=select)
        XSelected = model.transform(X)
        indices = np.where(model.get_support())[0]
    
    # non-sparse (8.5 seconds)
    if method == 'svm':
        from sklearn.svm import SVR
        from sklearn.feature_selection import SelectFromModel
        
        SVMReg = SVR(kernel='linear',
                     gamma='scale', C=1.0, epsilon=0.2)
        SVMReg.fit(X, y)
        model = SelectFromModel(SVMReg, prefit=True,
                                max_features=select)
        XSelected = model.transform(X)
        indices = np.where(model.get_support())[0]
    
    # wrapper model (preset number of features) (1000 seconds / 5000 seconds)
    if method == 'hsiclasso':
        from pyHSICLasso import HSICLasso
        
        hsic_lasso = HSICLasso()
        hsic_lasso.input(X, y)
        hsic_lasso.regression(select)
        indices = hsic_lasso.get_index()
        XSelected = X[:, indices]

    # dimensionality reduction
        # PCA
        # MDS
        # PLS
        # DWT
        
#    f = h5py.File('selected/' + str(select) + '/X_' + method + '.hdf5', "w")
#    f.create_dataset('X', data=XSelected)
#    f.create_dataset('indices', data=indices)
#    f.close()

    # return indices
    np.savetxt('selected/' + str(select) + '/X_' + method + '.dat', indices)
    
    # np.savetxt('selected/' + str(select) + '/X_' + method + '.dat', XSelected)

    print("--- %s seconds ---" % (time.time() - t0))