def select(self, params=None):
    """Fit a PCA on the training gene data and print a variance report.

    This is a diagnostic: nothing is stored on the instance and nothing is
    returned. It prints the shape of the matrix handed to PCA, the singular
    values, the first principal axis with its largest entry, and the number
    of leading components needed to exceed 90% and 99% of the variance.

    Args:
        params: Optional mapping. When it is non-empty and contains the key
            ``'feature_file'``, the names loaded from that file through
            ``utils.load_string_data`` are used to filter ``X`` by row label
            before the PCA is fitted.
    """
    X, _ = self.data.for_train().gene_data()
    if bool(params) and 'feature_file' in params:
        feature_file = params['feature_file']
        features = utils.load_string_data(feature_file)
        X = X.filter(items=features, axis=0)

    # X is filtered by row label above, so the transpose turns those labels
    # into the columns PCA sees (presumably samples x genes - TODO confirm).
    data = X.values.T
    print('Gene data size {}'.format(data.shape))

    from sklearn.decomposition import PCA
    pca = PCA(svd_solver='full')
    pca.fit(data)
    print(pca.singular_values_)
    print(pca.components_[0], np.max(pca.components_[0]))

    # Variance is proportional to the *squared* singular values, so the share
    # of variance must come from explained_variance_ratio_, not from the raw
    # singular values. The cumulative sum also covers the last component,
    # which keeps the count defined even when only one component exists.
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    for percent, threshold in ((90, .9), (99, .99)):
        reached = np.flatnonzero(cumulative > threshold)
        # Fall back to "all components" if rounding keeps the total below
        # the threshold.
        count = int(reached[0]) + 1 if reached.size else len(cumulative)
        print('{} percent variance captured by {} vectors'.format(percent, count))
def select(self, params=None):
    """Populate ``self.features`` and ``self.current_features``.

    Args:
        params: Optional mapping. When it is non-empty and contains the key
            ``'feature_file'``, the features are loaded from that file with
            ``utils.load_string_data``; otherwise every name returned by
            ``self.data.genes()`` is used. Defaults to ``None`` (rather than
            a shared mutable ``{}``), which behaves exactly like an empty
            mapping here.
    """
    if bool(params) and 'feature_file' in params:
        feature_file = params['feature_file']
        self.features = utils.load_string_data(feature_file)
    else:
        self.features = list(self.data.genes())
    # Independent copy, so changes to the working list do not touch the
    # full feature list.
    self.current_features = list(self.features)
def select(self, params=None):
    """Build ``self.features`` from a feature file or from all genes.

    Args:
        params: Optional mapping. If it is non-empty and has a
            ``'feature_file'`` entry, the names are read from that file via
            ``utils.load_string_data``; in every other case the names come
            from ``self.data.genes()``.

    The chosen names are passed through ``self._create_featurelist`` and the
    result is stored on ``self.features``.
    """
    from_file = bool(params) and 'feature_file' in params
    if not from_file:
        names = list(self.data.genes())
    else:
        names = utils.load_string_data(params['feature_file'])
    self.features = self._create_featurelist(names)
def compare_genes(dir, prefix):
    """Print a PCA variance report for the genes listed in one feature file.

    Only the single file ``os.path.join(dir, prefix)`` is read; despite the
    parameter name, no prefix matching over the directory takes place. The
    gene data is loaded from the hard-coded ``'../data/'`` dataset, filtered
    by row label to the loaded names, and fed to a full-SVD PCA. The function
    prints the number of distinct names, the first 20 singular values, and
    the number of leading components needed to exceed 90% and 99% of the
    variance. Nothing is returned.

    Args:
        dir: Directory containing the feature file.
        prefix: File name of the feature file inside ``dir``.
    """
    value = utils.load_string_data(os.path.join(dir, prefix))

    # A 0-d array (a file holding a single name) cannot be iterated, so it
    # has to be checked before any attempt to turn the value into a set.
    genes = set()
    if len(value.shape) > 0:
        genes.update(value)
    else:
        genes.add(str(value))
    print(len(genes))

    ds = Dataset('../data/')
    ds.load_gene_data()
    X, _ = ds.for_train().gene_data()
    X = X.filter(items=list(genes), axis=0)
    # Transposed so the filtered row labels become the columns PCA sees
    # (presumably samples x genes - TODO confirm).
    data = X.values.T

    from sklearn.decomposition import PCA
    pca = PCA(svd_solver='full')
    pca.fit(data)
    print(pca.singular_values_[0:20])

    # Variance is proportional to the *squared* singular values, so the share
    # of variance must come from explained_variance_ratio_, not from the raw
    # singular values. The cumulative sum also covers the last component,
    # which keeps the count defined even when only one component exists.
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    for percent, threshold in ((90, .9), (99, .99)):
        reached = np.flatnonzero(cumulative > threshold)
        # Fall back to "all components" if rounding keeps the total below
        # the threshold.
        count = int(reached[0]) + 1 if reached.size else len(cumulative)
        print('{} percent variance captured by {} vectors'.format(percent, count))
def check_cov(dir, prefix):
    """Print covariance diagnostics for the genes named in a set of files.

    Every file in ``dir`` whose name starts with ``prefix`` is loaded with
    ``utils.load_string_data`` and the names are merged into one set. The
    gene data from the hard-coded ``'../data/'`` dataset is filtered by row
    label to that set, its covariance matrix is computed, and the function
    prints: the number of distinct names, the total number of names read,
    the filtered shape, the covariance shape, the sorted variances, and the
    genes ordered by the summed absolute covariance with all other genes
    (indices, sorted sums, then names). Nothing is returned.

    Note: this changes NumPy's global print options so arrays are printed
    without truncation or line wrapping.

    Args:
        dir: Directory to scan for feature files.
        prefix: File-name prefix selecting which files to load.
    """
    import sys

    genes = set()
    total = 0
    for fname in os.listdir(dir):
        if not fname.startswith(prefix):
            continue
        value = utils.load_string_data(os.path.join(dir, fname))
        if len(value.shape) > 0:
            genes.update(value)
            total += value.shape[0]
        else:
            # 0-d array: the file held a single name.
            genes.add(str(value))
            total += 1
    print(len(genes))
    print('Total', total)

    # NaN is rejected as a threshold by current NumPy; sys.maxsize is the
    # documented way to request untruncated output.
    np.set_printoptions(threshold=sys.maxsize, linewidth=sys.maxsize)

    ds = Dataset('../data/')
    ds.load_gene_data()
    X, _ = ds.for_train().gene_data()
    fil = X.filter(items=list(genes), axis=0)
    print(fil.shape)

    # np.cov returns a 0-d array for a single row; keep it 2-D so the
    # diagonal/shape handling below works for one gene as well.
    cov = np.atleast_2d(np.cov(fil.values))
    print(cov.shape)
    print(np.sort(np.diag(cov)))

    # Drop the variances so only the covariance with *other* genes counts.
    np.fill_diagonal(cov, 0)
    coupling = np.sum(np.abs(cov), axis=1)
    sort_ind = np.argsort(coupling)
    print(sort_ind)
    print(np.sort(coupling))

    # Take the names from the filtered frame itself: its rows are what the
    # covariance was computed on. The raw set may contain names that are
    # missing from the data or be iterated in a different order.
    names = np.array(fil.index)
    print(names[sort_ind])
def combine_data(dir, prefix, saveto):
    """Merge the entries of several string-data files into one file.

    Every file in ``dir`` whose name starts with ``prefix`` is loaded with
    ``utils.load_string_data``. The loaded values are flattened by at most
    two levels (a list/array of lists/arrays) and de-duplicated through a
    set, and the result is written with ``utils.save_string_data`` as a
    NumPy array. The order of the saved entries is the set's iteration
    order.

    Args:
        dir: Directory to scan for input files.
        prefix: File-name prefix selecting which files to merge.
        saveto: Destination passed to ``utils.save_string_data``.
    """
    combined = set()
    for fname in os.listdir(dir):
        if not fname.startswith(prefix):
            continue
        value = utils.load_string_data(os.path.join(dir, fname))
        if isinstance(value, np.ndarray) and value.ndim == 0:
            # A 0-d array is an ndarray but cannot be iterated; unwrap the
            # single entry it holds instead of crashing in the loop below.
            combined.add(value.item())
        elif isinstance(value, (list, np.ndarray)):
            for entry in value:
                if isinstance(entry, (list, np.ndarray)):
                    combined.update(entry)
                else:
                    combined.add(entry)
        else:
            combined.add(value)
    utils.save_string_data(saveto, np.array(list(combined)))
def clean_genes(ds):
    """Replace every non-``cg`` entry of the random feature sets with a random gene.

    Loads ``'random_selected_features_0'`` with ``utils.load_string_data``
    and processes it row by row: entries starting with ``'cg'`` are kept,
    and the remaining slots are refilled with names drawn at random from
    ``ds.genes()`` that are not already kept, so each row keeps its original
    length. The result is saved to ``'random_selected_features_0-clean'``.
    The number of distinct genes in ``ds`` is printed first.

    Args:
        ds: Object whose ``genes()`` method yields the available gene names.
    """
    genes = set(ds.genes())
    print(len(genes))
    data = utils.load_string_data('random_selected_features_0')

    cleaned = []
    for row in data:
        kept = [name for name in row if name.startswith('cg')]
        candidates = list(genes.difference(kept))
        missing = row.shape[0] - len(kept)
        # Draw without replacement so a row never receives the same gene
        # twice. Sampling with replacement is kept only as a fallback for
        # when there are fewer candidates than slots to fill.
        picks = np.random.choice(
            len(candidates), missing, replace=missing > len(candidates))
        kept.extend(candidates[i] for i in picks)
        cleaned.append(kept)

    utils.save_string_data('random_selected_features_0-clean', np.array(cleaned))