def __call__(self): # z-scoring if self.treedata is None: raise SortingError("No examples for similarity measure available!") zs = ZScore(self.data, replace_inf=True) data_zs = zs.normalize(self.data) # z-scored mean value of pca procjected treedata mu = zs.normalize(self.treedata) data_zs, mu = filter_nans(data_zs, mu) mu = mu.mean(axis=0) distsq = [np.power((x - mu), 2).sum() for x in data_zs] return np.sqrt(distsq)
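# A minimal sketch of what the measure above computes: the Euclidean
# distance between each z-scored sample and the z-scored mean of the
# training examples. The array names and sizes below are illustrative
# only and not part of this module.
def _euclidean_distance_example():
    import numpy as np

    data = np.random.rand(5, 3)       # 5 samples, 3 features
    treedata = np.random.rand(2, 3)   # training examples

    # z-score with the statistics of `data`, as ZScore does
    mean, std = data.mean(axis=0), data.std(axis=0)
    data_zs = (data - mean) / std
    mu = ((treedata - mean) / std).mean(axis=0)

    # distance of each sample to the mean training example
    return np.sqrt(((data_zs - mu) ** 2).sum(axis=1))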
def __call__(self): # z-scoring if self.treedata is None: raise SortingError("No examples for similarity measure available!") if self.treedata.shape[1] < 2: raise SortingError(("CosineSimilarity needs at least 2 " "features for sorting")) zs = ZScore(self.data, replace_inf=True) data_zs = zs.normalize(self.data) # z-scored mean value of pca procjected treedata mu = zs.normalize(self.treedata) data_zs, mu = filter_nans(data_zs, mu) mu = mu.mean(axis=0) denom = np.sqrt((mu**2).sum()) * np.sqrt((data_zs**2).sum(axis=1)) s_cos = np.sum(mu * data_zs, axis=1) / denom return -1.0 * s_cos
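# A minimal sketch of the negated cosine similarity computed above, on
# toy arrays (names and sizes are illustrative, not part of this module).
# Negating the similarity means np.argsort ranks the most similar
# samples first.
def _cosine_similarity_example():
    import numpy as np

    data_zs = np.random.randn(5, 3)   # z-scored samples
    mu = np.random.randn(3)           # mean of z-scored training examples

    # cosine similarity of each row of data_zs with mu
    denom = np.sqrt((mu ** 2).sum()) * np.sqrt((data_zs ** 2).sum(axis=1))
    s_cos = (data_zs * mu).sum(axis=1) / denom

    # most similar samples get the smallest (most negative) score
    return np.argsort(-1.0 * s_cos)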
class PreProcessor(object):
    """PreProcessor normalizes the data and removes columns that contain
    NaN's or have zero variance. If index is None, all features are taken
    into account; otherwise only those specified by index. Index can be an
    integer or a list of integers.
    """

    def __init__(self, data, index=None, pca=False, min_std=10e-9):
        self.data = data
        self._pca = None
        self._zs = ZScore(data, replace_inf=True)
        data = self._zs.normalize(data)

        # remove columns that contain nan's or have zero variance
        mask_nan = np.invert(np.isnan(data.sum(axis=0)))
        self._mask = np.ones(mask_nan.shape, dtype=bool)
        if index is not None:
            self._mask[:] = False
            self._mask[index] = True
        self._mask &= mask_nan
        data = self.filter(data)

        if pca:
            # data is already z-scored; keep only the principal
            # components selected by minfrac
            self._pca = PCA(data, minfrac=0.05)
            self.traindata = self._pca.project(data)
        else:
            self.traindata = data

    @property
    def std(self):
        return self.data.std(axis=0)

    @property
    def mean(self):
        return self.data.mean(axis=0)

    @property
    def mask(self):
        return self._mask

    @property
    def nfeatures(self):
        return self.traindata.shape[1]

    @property
    def nsamples(self):
        return self.data.shape[0]

    def normalize(self, data):
        return self._zs.normalize(data)

    def filter(self, data):
        return data[:, self._mask]

    def __call__(self, data):
        data1 = self.filter(self.normalize(data))
        if self._pca is None:
            return data1
        else:
            return self._pca.project(data1)
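# A possible usage pattern for PreProcessor, assuming the ZScore and PCA
# classes referenced above are available in this module. The data shapes
# and the feature index are illustrative only.
def _preprocessor_example():
    import numpy as np

    train = np.random.rand(100, 8)
    new = np.random.rand(10, 8)

    # fit on the training data, restricted to features 0-3, with PCA
    pp = PreProcessor(train, index=[0, 1, 2, 3], pca=True)

    # new samples are normalized, filtered and projected consistently
    return pp(new)   # shape: (10, pp.nfeatures)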