Пример #1
0
def main():
    hsic_lasso = HSICLasso()
    hsic_lasso.input("../tests/test_data/csv_data_mv.csv",
                     output_list=['output1', 'output2'])
    hsic_lasso.regression(5)
    hsic_lasso.dump()
    hsic_lasso.plot_path()
Пример #2
0
def main():
    hsic_lasso = HSICLasso()
    hsic_lasso.input("../tests/test_data/matlab_data.mat")

    #Single core processing
    hsic_lasso.regression(5, n_jobs=1)

    #Multi-core processing. Use all available cores (default)
    hsic_lasso.regression(5, n_jobs=-1)
Пример #3
0
def hsic(num_features, hsic_data, method='regression'):
    hsic_lasso = HSICLasso()
    hsic_lasso.input(hsic_data)

    if method == 'regression':
        hsic_lasso.regression(num_features)
    else:
        hsic_lasso.classification(num_features)

    return hsic_lasso.get_features()
Пример #4
0
def hsic_sel(csv, no_features, method='classification'):
    hsic_lasso = HSICLasso()
    hsic_lasso.input(csv)

    if method == 'regression':
        hsic_lasso.regression(no_features)
    else:
        hsic_lasso.classification(no_features)

    return hsic_lasso.get_features()
Пример #5
0
def main():
    hsic_lasso = HSICLasso()
    hsic_lasso.input("../tests/test_data/matlab_data.mat")

    #max_neighbors=0 means that we only use the HSIC Lasso features to plot heatmap
    hsic_lasso.regression(5, max_neighbors=0)

    #Compute linkage
    hsic_lasso.linkage()

    #Run Hierarchical clustering
    # Features are clustered by using HSIC scores
    # Samples are clusterd by using Euclid distance
    hsic_lasso.plot_heatmap()
Пример #6
0
def main():

    #Numpy array input example
    hsic_lasso = HSICLasso()
    data = sio.loadmat("../tests/test_data/matlab_data.mat")
    X = data['X'].transpose()
    Y = data['Y'][0]
    featname = ['Feat%d' % x for x in range(1, X.shape[1] + 1)]

    hsic_lasso.input(X, Y, featname=featname)
    hsic_lasso.regression(5)
    hsic_lasso.dump()
    hsic_lasso.plot_path()

    #Save parameters
    hsic_lasso.save_param()
Пример #7
0
def main():
    hsic_lasso = HSICLasso()
    #out_list = ['c'+str(i) for i in range(1,51)]
    #print (out_list)
    hsic_lasso.input("./user_data_new.csv",
                     output_list=[
                         'c1', 'c2', 'c3', 'c4', 'c5,', 'c6', 'c7', 'c8', 'c9',
                         'c10'
                     ])
    # ,'c11', 'c12', 'c13', 'c14', 'c15,', 'c16', 'c17', 'c18', 'c19', 'c20','c21', 'c22', 'c23', 'c24', 'c25,', 'c26', 'c27', 'c28', 'c29', 'c30'])
    hsic_lasso.regression(100, B=50)
    hsic_lasso.dump()
    select_index = hsic_lasso.get_index()
    print(select_index)
    print(hsic_lasso.get_index_score())
    #hsic_lasso.plot_path()
    print(hsic_lasso.get_features())
    X_select = hsic_lasso.X_in[select_index, :]
    np.savetxt('X_select.txt', X_select, fmt=str('%.5f'), encoding='utf-8')
Пример #8
0
    def HSICLasso(self):

        df_ = self.data.copy()
        cols = list(df_.columns)[:-1] + ['class']
        df_.columns = cols

        hsic_lasso = HSICLasso()
        hsic_lasso.input(self.X_train.values, self.Y_train.values)

        if self.type == CLASSIFICATION:
            hsic_lasso.classification(self.num_top_features)
        elif self.type == REGRESSION:
            hsic_lasso.regression(self.num_top_features)

        feats = [
            df_.columns[int(val) - 1] for val in hsic_lasso.get_features()
        ]

        for feat, imp in zip(feats, hsic_lasso.get_index_score()):
            features_[feat] = imp
        self.report_feature_importance(features_,
                                       self.num_top_features,
                                       label="HSICLasso")
Пример #9
0
    def hsic_lasso_matric(self,
                          data,
                          n_jobs=2,
                          n_sample=False,
                          frac_sample=False):
        '''Calculate hsic lasso (subtract correlation between explanatory variables).
        Since the correlation coefficient matrix is not symmetric, it is viewed in the row direction.
        The correlation between variable 0 and the other variable is stored as the component on the 0th row,
        and the correlation between variable 1 and the other variable is stored as the component on the first row.
        
        n_jobs : (int) Indicates the number of cores to be calculated. -1 for GPU.
        data: (numpy or pandas) A data frame that contains all explanatory and objective variables
        n_sample : (int) How much random sampling to do. False if not.
        If a numerical value is entered, sampling is performed using that number of rows.
        frac_sample: [0 ~ 1] (float) Sampled as a percentage of the number of rows. Not used at the same time as n_sample.
        '''
        data = copy(data)
        data = pd.DataFrame(data).dropna()
        # Sampling when n_sample contains a numerical value
        if not n_sample:
            if not frac_sample:
                # n_sample=False, frac_sample=False
                pass
            else:
                # n_sample=False, frac_sample=int
                data = data.sample(frac=frac_sample, replace=True)
        else:

            if not frac_sample:
                # n_sample=int, frac_sample=False
                data = data.sample(n=n_sample, replace=True)
            else:
                # n_sample=int, frac_sample=int
                raise ValueError(
                    'Please enter a value for `frac` OR `n`, not both')

        data = check_array(data, accept_sparse="csc",
                           dtype=float)  # Convert to numpy.ndarray
        n_col = data.shape[1]
        hsic_array = np.empty((0, n_col - 1), float)
        for i in range(n_col):
            X = np.delete(data, obj=i, axis=1)
            y = data[:, i]

            # Calculation of hsic_lasso
            hsic_lasso = HSICLasso()
            hsic_lasso.input(X, y)
            hsic_lasso.regression(num_feat=X.shape[1],
                                  discrete_x=False,
                                  n_jobs=n_jobs)
            # hsic_lasso only appears in descending order of score, so sort
            hsic_ = np.array(
                [hsic_lasso.get_index(),
                 hsic_lasso.get_index_score()])
            hsic_ = hsic_.T  # Transpose because it is difficult to use
            # Since there are not enough scores that came out, add 0.0 to the index to complement
            lack_set = set([x for x in range(X.shape[1])]) - set(hsic_[:, 0])
            for lack in lack_set:
                lack_list = np.array([[lack, 0.0]])
                hsic_ = np.append(hsic_, lack_list, axis=0)
            hsic_ = hsic_[np.argsort(hsic_[:, 0])]  # Sort by index
            hsic_array = np.append(hsic_array,
                                   hsic_[:, 1].reshape(1, -1),
                                   axis=0)
        # Since it does not include the correlation component with itself, add 1.0
        n_row = hsic_array.shape[0]
        for i in range(n_row):
            insert_i = (n_row + 1) * i
            hsic_array = np.insert(hsic_array, insert_i, 1.0)
        self.hsic_lasso = hsic_array.reshape(n_row, -1)
        return self.hsic_lasso
Пример #10
0
def main():
    hsic_lasso = HSICLasso()
    hsic_lasso.input("../tests/test_data/matlab_data.mat")
    hsic_lasso.regression(5)
    hsic_lasso.dump()
    hsic_lasso.plot_path()
Пример #11
0
class RegressionTest(unittest.TestCase):
    def setUp(self):
        self.hsic_lasso = HSICLasso()

    def test_regression(self):

        np.random.seed(0)

        with self.assertRaises(UnboundLocalError):
            self.hsic_lasso.regression()

        self.hsic_lasso.input("./tests/test_data/matlab_data.mat")
        self.hsic_lasso.regression(5, n_jobs = 1)
        self.assertEqual(self.hsic_lasso.A, [1099, 99, 199, 1299, 299])

        self.hsic_lasso.input("./tests/test_data/matlab_data.mat")
        self.hsic_lasso.regression(10, n_jobs = 1)
        self.assertEqual(self.hsic_lasso.A, [1099, 99, 199, 1299, 1477,
                                             1405, 1073, 299,1596, 358])

        # Blocks
        self.hsic_lasso.input("./tests/test_data/matlab_data.mat")
        B = int(self.hsic_lasso.X_in.shape[1]/2)
        self.hsic_lasso.regression(5, B, 10)
        self.assertEqual(self.hsic_lasso.A, [1099, 99, 199, 299, 1299])

        self.hsic_lasso.input("./tests/test_data/matlab_data.mat")
        B = int(self.hsic_lasso.X_in.shape[1]/2)
        self.hsic_lasso.regression(10, B, 10)
        self.assertEqual(self.hsic_lasso.A, [1099, 99, 199, 1477, 299,
                                             1299, 1073, 1405, 358, 1596])

        # use non-divisor as block size
        with warnings.catch_warnings(record=True) as w:
        
            self.hsic_lasso.input("./tests/test_data/csv_data.csv")
            B = int(self.hsic_lasso.X_in.shape[1]/2) - 1
            n = self.hsic_lasso.X_in.shape[1]
            numblocks = n / B
            
            self.hsic_lasso.regression(10, B, 10)
            self.assertEqual(self.hsic_lasso.A, [1422, 248, 512, 1581, 1670,
                                                 764, 1771, 896, 779, 398])
            self.assertEqual(len(w), 1)
            self.assertEqual(w[-1].category, RuntimeWarning)
            self.assertEqual(str(w[-1].message), "B {} must be an exact divisor of the \
number of samples {}. Number of blocks {} will be approximated to {}.".format(B, n, numblocks, int(numblocks)))
Пример #12
0
def featureSelection(X, y, method = 'lasso', select = 500):
    
    t0 = time.time()
    
    # sparse (15 seconds)
    if method == 'lasso':
        from sklearn import linear_model
        
        a = 0.861 if select == 500 else 0.0755
        lasso = linear_model.Lasso(alpha = a)
        lasso.fit(X,y)
        XSelected = X[:,lasso.coef_ != 0]
        indices = np.where(lasso.coef_ != 0)
        if indices > select:
            indices = np.argsort(-lasso.coef_)[:select]
    
    # non-sparse (157 seconds)
    if method == 'rf':
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.datasets import load_iris
        from sklearn.feature_selection import SelectFromModel
        
        t = ExtraTreesRegressor(n_estimators=50)
        t.fit(X, y)
        model = SelectFromModel(t, prefit=True,
                                max_features = select)
        XSelected = model.transform(X)
        indices = np.where(model.get_support)
    
    # non-sparse (8.5 seconds)
    if method == 'svm':
        from sklearn.svm import SVR
        from sklearn.feature_selection import SelectFromModel
        
        SVMReg = SVR(kernel = 'linear',
                     gamma='scale', C=1.0, epsilon=0.2)
        SVMReg.fit(X, y)
        model = SelectFromModel(SVMReg, prefit=True, 
                                max_features = select)
        XSelected = model.transform(X)
        indices = np.where(model.get_support())
    
    # wrapper model (preset number of features) (1000 seconds / 5000 seconds)
    if method == 'hsiclasso':
        from pyHSICLasso import HSICLasso
        
        hsic_lasso = HSICLasso()
        hsic_lasso.input(X,y)
        hsic_lasso.regression(select)
        XSelected = X[:,hsic_lasso.get_index()]
        indices = hsic_lasso.get_index()

    # dimensionality reduction
        # PCA
        # MDS
        # PLS
        # DWT
        
#    f = h5py.File('selected/' + str(select) + '/X_' + method + '.hdf5', "w")
#    f.create_dataset('X', data=XSelected)
#    f.create_dataset('indices', data=indices)
#    f.close()

    # return indices
    np.savetxt('selected/' + str(select) + '/X_' + method + '.dat', indices)
    
    # np.savetxt('selected/' + str(select) + '/X_' + method + '.dat', XSelected)

    print("--- %s seconds ---" % (time.time() - t0))