def main():
    """Run HSIC Lasso regression on the multi-output CSV sample data.

    Loads ``csv_data_mv.csv`` with ``output1`` and ``output2`` as the
    output columns, asks for 5 features, then calls ``dump()`` and
    ``plot_path()`` on the fitted object.
    """
    csv_path = "../tests/test_data/csv_data_mv.csv"
    targets = ['output1', 'output2']

    selector = HSICLasso()
    selector.input(csv_path, output_list=targets)
    selector.regression(5)

    # Report the outcome of the run.
    selector.dump()
    selector.plot_path()
def main():
    """Run the same 5-feature regression single-core, then multi-core."""
    selector = HSICLasso()
    selector.input("../tests/test_data/matlab_data.mat")

    # n_jobs=1  -> single core processing
    # n_jobs=-1 -> multi-core processing, use all available cores (default)
    for workers in (1, -1):
        selector.regression(5, n_jobs=workers)
def hsic(num_features, hsic_data, method='regression'):
    """Select features from ``hsic_data`` with HSIC Lasso.

    ``method == 'regression'`` runs ``regression``; any other value runs
    ``classification``. Returns whatever ``get_features()`` yields after
    the fit.
    """
    selector = HSICLasso()
    selector.input(hsic_data)

    # Pick the fitting routine first, then invoke it once.
    if method == 'regression':
        fit = selector.regression
    else:
        fit = selector.classification
    fit(num_features)

    return selector.get_features()
def hsic_sel(csv, no_features, method='classification'):
    """Select ``no_features`` features from the input ``csv`` with HSIC Lasso.

    Only the exact value ``'regression'`` selects the regression solver;
    every other ``method`` value falls back to classification. Returns the
    result of ``get_features()``.
    """
    solver_name = 'regression' if method == 'regression' else 'classification'

    model = HSICLasso()
    model.input(csv)
    getattr(model, solver_name)(no_features)
    return model.get_features()
def main():
    """Fit HSIC Lasso on the MATLAB sample data and draw a clustered heatmap."""
    model = HSICLasso()
    model.input("../tests/test_data/matlab_data.mat")

    # With max_neighbors=0 only the HSIC Lasso features themselves are
    # used when plotting the heatmap.
    model.regression(5, max_neighbors=0)

    # Linkage must be computed before the heatmap is drawn.
    model.linkage()

    # Hierarchical clustering: features are clustered by HSIC scores,
    # samples are clustered by Euclidean distance.
    model.plot_heatmap()
def main():
    """Numpy-array input example: fit, report, plot, and save parameters."""
    model = HSICLasso()

    mat = sio.loadmat("../tests/test_data/matlab_data.mat")
    X = mat['X'].T       # transposed before being handed to input()
    Y = mat['Y'][0]      # first row of the stored 'Y' entry

    # One label per column of X: Feat1, Feat2, ...
    n_features = X.shape[1]
    featname = ['Feat%d' % (col + 1) for col in range(n_features)]

    model.input(X, Y, featname=featname)
    model.regression(5)
    model.dump()
    model.plot_path()

    # Save parameters
    model.save_param()
def main():
    """Select features from ``user_data_new.csv`` and save the chosen rows.

    Uses columns ``c1`` .. ``c10`` as outputs, runs a block HSIC Lasso
    regression asking for 100 features with ``B=50``, prints the selected
    indices, scores and feature names, and writes the corresponding rows
    of ``X_in`` to ``X_select.txt``.
    """
    hsic_lasso = HSICLasso()

    # Build the output column names instead of typing them by hand: the
    # hand-written list contained 'c5,' (a comma inside the quotes), which
    # is not one of the c1..cN names. Widen the range to use more outputs.
    output_columns = ['c' + str(i) for i in range(1, 11)]
    hsic_lasso.input("./user_data_new.csv", output_list=output_columns)

    hsic_lasso.regression(100, B=50)
    hsic_lasso.dump()

    select_index = hsic_lasso.get_index()
    print(select_index)
    print(hsic_lasso.get_index_score())
    # hsic_lasso.plot_path()
    print(hsic_lasso.get_features())

    # Rows of X_in are indexed by the selected feature indices
    # (presumably X_in is features x samples -- confirm against the library).
    X_select = hsic_lasso.X_in[select_index, :]
    np.savetxt('X_select.txt', X_select, fmt=str('%.5f'), encoding='utf-8')
def HSICLasso(self):
    """Rank features with HSIC Lasso and report their importance scores.

    Fits on ``self.X_train`` / ``self.Y_train`` using classification or
    regression depending on ``self.type``, maps every selected feature
    back to a column name of ``self.data``, and passes the resulting
    ``{column name: score}`` mapping to ``report_feature_importance``.
    """
    df_ = self.data.copy()
    # Rename the last column to 'class'; the other names are kept.
    cols = list(df_.columns)[:-1] + ['class']
    df_.columns = cols

    # NOTE: inside a method body the bare name resolves to the module-level
    # HSICLasso (the library class), not to this method.
    hsic_lasso = HSICLasso()
    hsic_lasso.input(self.X_train.values, self.Y_train.values)

    if self.type == CLASSIFICATION:
        hsic_lasso.classification(self.num_top_features)
    elif self.type == REGRESSION:
        hsic_lasso.regression(self.num_top_features)

    # get_features() values are parsed as 1-based positions here
    # (presumably the default names given to unnamed array input -- verify).
    feats = [
        df_.columns[int(val) - 1] for val in hsic_lasso.get_features()
    ]

    # Previously `features_` was written to without ever being created in
    # this method, which raises NameError unless a global happened to exist.
    features_ = {}
    for feat, imp in zip(feats, hsic_lasso.get_index_score()):
        features_[feat] = imp

    self.report_feature_importance(features_,
                                   self.num_top_features,
                                   label="HSICLasso")
def hsic_lasso_matric(self, data, n_jobs=2, n_sample=False, frac_sample=False):
    '''Calculate an HSIC Lasso score matrix between all variables.

    Each column in turn is treated as the objective variable and all
    remaining columns as explanatory variables. The matrix is not
    symmetric and must be read row-wise: row ``i`` holds the scores of
    every other variable when variable ``i`` is the objective. The
    diagonal is set to 1.0 and variables that HSIC Lasso did not select
    get 0.0.

    data : (numpy or pandas)
        Table containing all explanatory and objective variables. Rows
        with missing values are dropped.
    n_jobs : (int)
        Number of cores passed to HSIC Lasso; -1 uses all available cores.
    n_sample : (int)
        Number of rows to draw (with replacement). False disables it.
    frac_sample : (float, 0 ~ 1)
        Fraction of rows to draw (with replacement). False disables it.
        Must not be combined with ``n_sample``.

    Returns the (n_columns, n_columns) score matrix, which is also stored
    in ``self.hsic_lasso``. Raises ValueError when both ``n_sample`` and
    ``frac_sample`` are given.
    '''
    data = copy(data)
    data = pd.DataFrame(data).dropna()

    # Optional bootstrap-style resampling of the rows.
    if n_sample and frac_sample:
        raise ValueError(
            'Please enter a value for `frac` OR `n`, not both')
    if n_sample:
        data = data.sample(n=n_sample, replace=True)
    elif frac_sample:
        data = data.sample(frac=frac_sample, replace=True)

    # Convert to numpy.ndarray
    data = check_array(data, accept_sparse="csc", dtype=float)

    n_col = data.shape[1]
    # Start from the identity: the self-score on the diagonal is 1.0 and
    # every variable HSIC Lasso leaves out keeps the score 0.0.
    hsic_matrix = np.eye(n_col, dtype=float)
    all_columns = np.arange(n_col)

    for i in range(n_col):
        X = np.delete(data, obj=i, axis=1)
        y = data[:, i]

        hsic_lasso = HSICLasso()
        hsic_lasso.input(X, y)
        hsic_lasso.regression(num_feat=X.shape[1],
                              discrete_x=False,
                              n_jobs=n_jobs)

        # Selected indices refer to the columns of X (column i removed),
        # so translate them back to columns of the full table.
        selected = np.asarray(hsic_lasso.get_index(), dtype=int)
        scores = np.asarray(hsic_lasso.get_index_score(), dtype=float)
        other_columns = np.delete(all_columns, i)
        hsic_matrix[i, other_columns[selected]] = scores

    self.hsic_lasso = hsic_matrix
    return self.hsic_lasso
def main():
    """Basic example: 5-feature HSIC Lasso regression on the MATLAB sample."""
    mat_path = "../tests/test_data/matlab_data.mat"

    selector = HSICLasso()
    selector.input(mat_path)
    selector.regression(5)

    # Report the outcome of the run.
    selector.dump()
    selector.plot_path()
class RegressionTest(unittest.TestCase):
    """Regression tests pinning the feature indices HSICLasso.regression selects."""

    def setUp(self):
        # Fresh, input-less estimator for every test.
        self.hsic_lasso = HSICLasso()

    def test_regression(self):
        """Check selected indices for plain, block, and non-divisor block runs."""
        # Fixed seed so the pinned index lists below are reproducible.
        np.random.seed(0)

        # Calling regression() before any input() is expected to fail.
        with self.assertRaises(UnboundLocalError):
            self.hsic_lasso.regression()

        # Single-job runs asking for 5 and then 10 features.
        self.hsic_lasso.input("./tests/test_data/matlab_data.mat")
        self.hsic_lasso.regression(5, n_jobs = 1)
        self.assertEqual(self.hsic_lasso.A, [1099, 99, 199, 1299, 299])

        self.hsic_lasso.input("./tests/test_data/matlab_data.mat")
        self.hsic_lasso.regression(10, n_jobs = 1)
        self.assertEqual(self.hsic_lasso.A, [1099, 99, 199, 1299, 1477,
                                            1405, 1073, 299,1596, 358])

        # Blocks
        # B is half of X_in.shape[1] (the sample count, per the warning text
        # checked at the end of this test); the third positional argument is
        # presumably the number of permutations -- confirm against the API.
        self.hsic_lasso.input("./tests/test_data/matlab_data.mat")
        B = int(self.hsic_lasso.X_in.shape[1]/2)
        self.hsic_lasso.regression(5, B, 10)
        self.assertEqual(self.hsic_lasso.A, [1099, 99, 199, 299, 1299])

        self.hsic_lasso.input("./tests/test_data/matlab_data.mat")
        B = int(self.hsic_lasso.X_in.shape[1]/2)
        self.hsic_lasso.regression(10, B, 10)
        self.assertEqual(self.hsic_lasso.A, [1099, 99, 199, 1477, 299,
                                            1299, 1073, 1405, 358, 1596])

        # use non-divisor as block size
        # Exactly one RuntimeWarning with the message below is expected.
        with warnings.catch_warnings(record=True) as w:

            self.hsic_lasso.input("./tests/test_data/csv_data.csv")
            # One less than half the sample count, so B does not divide n.
            B = int(self.hsic_lasso.X_in.shape[1]/2) - 1
            n = self.hsic_lasso.X_in.shape[1]
            numblocks = n / B

            self.hsic_lasso.regression(10, B, 10)
            self.assertEqual(self.hsic_lasso.A, [1422, 248, 512, 1581, 1670,
                                                764, 1771, 896, 779, 398])
            self.assertEqual(len(w), 1)
            self.assertEqual(w[-1].category, RuntimeWarning)
            # The backslash continues the string literal; the next line must
            # start at column 0 so no extra spaces enter the expected text.
            self.assertEqual(str(w[-1].message), "B {} must be an exact divisor of the \
number of samples {}. Number of blocks {} will be approximated to {}.".format(B, n, numblocks, int(numblocks)))
def featureSelection(X, y, method = 'lasso', select = 500):
    """Select feature indices with the chosen method and save them to disk.

    Parameters
    ----------
    X, y : array-like
        Training data and target passed to the underlying estimator.
    method : {'lasso', 'rf', 'svm', 'hsiclasso'}
        Selection strategy.
    select : int
        Maximum number of features to keep. It is also part of the output
        path ``selected/<select>/X_<method>.dat``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.

    The selected indices are written with ``np.savetxt`` and the elapsed
    time is printed. Nothing is returned.
    """
    # Fail early and clearly instead of hitting an unbound `indices` later.
    if method not in ('lasso', 'rf', 'svm', 'hsiclasso'):
        raise ValueError("Unknown feature selection method: %r" % (method,))

    t0 = time.time()

    # XSelected is only needed by the alternative "save the selected data"
    # output; it is kept so that variant stays a one-line change.

    # sparse (15 seconds)
    if method == 'lasso':
        from sklearn import linear_model
        a = 0.861 if select == 500 else 0.0755
        lasso = linear_model.Lasso(alpha = a)
        lasso.fit(X, y)
        nonzero = lasso.coef_ != 0
        XSelected = X[:, nonzero]
        indices = np.where(nonzero)
        # np.where returns a tuple of arrays, so count the hits explicitly
        # (comparing the tuple itself with an int is a TypeError).
        if len(indices[0]) > select:
            # Rank by magnitude: sorting the signed values would place
            # zero coefficients ahead of strong negative ones.
            indices = np.argsort(-np.abs(lasso.coef_))[:select]
            XSelected = X[:, indices]

    # non-sparse (157 seconds)
    elif method == 'rf':
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel
        t = ExtraTreesRegressor(n_estimators=50)
        t.fit(X, y)
        model = SelectFromModel(t, prefit=True, max_features = select)
        XSelected = model.transform(X)
        # get_support must be *called*; passing the bound method to
        # np.where yields a meaningless single index.
        indices = np.where(model.get_support())

    # non-sparse (8.5 seconds)
    elif method == 'svm':
        from sklearn.svm import SVR
        from sklearn.feature_selection import SelectFromModel
        SVMReg = SVR(kernel = 'linear', gamma='scale', C=1.0, epsilon=0.2)
        SVMReg.fit(X, y)
        model = SelectFromModel(SVMReg, prefit=True, max_features = select)
        XSelected = model.transform(X)
        indices = np.where(model.get_support())

    # wrapper model (preset number of features) (1000 seconds / 5000 seconds)
    else:  # method == 'hsiclasso'
        from pyHSICLasso import HSICLasso
        hsic_lasso = HSICLasso()
        hsic_lasso.input(X, y)
        hsic_lasso.regression(select)
        indices = hsic_lasso.get_index()
        XSelected = X[:, indices]

    # Not implemented yet: dimensionality reduction (PCA, MDS, PLS, DWT).

    np.savetxt('selected/' + str(select) + '/X_' + method + '.dat', indices)
    # Alternative output: save XSelected instead of the indices.

    print("--- %s seconds ---" % (time.time() - t0))