def __init__(self, data, index=None, pca=False, min_std=10e-9):
    """Z-score `data`, build the column mask and store the training data.

    Args:
        data: feature array; stored unchanged as ``self.data`` and used to
            build the ``ZScore`` instance kept in ``self._zs``.
        index: optional column selection. When given, only these columns
            stay enabled in ``self._mask``; otherwise all columns start
            enabled.
        pca: when truthy, a ``PCA`` (``minfrac=0.05``) is built from the
            filtered data and ``self.traindata`` holds its projection.
            Otherwise ``self.traindata`` is the filtered data itself.
        min_std: accepted for interface compatibility; not read anywhere
            in this method.
    """
    self.data = data
    self._pca = None

    self._zs = ZScore(data, replace_inf=True)
    zscored = self._zs.normalize(data)

    # A column whose sum is NaN contains at least one NaN after z-scoring
    # (presumably this also covers zero-variance columns -- confirm against
    # ZScore); such columns are switched off in the mask.
    finite_columns = ~np.isnan(zscored.sum(axis=0))

    self._mask = np.ones(finite_columns.shape, dtype=bool)
    if index is not None:
        # restrict the mask to the explicitly requested columns
        self._mask[:] = False
        self._mask[index] = True
    self._mask &= finite_columns

    # self.filter is defined elsewhere; presumably it applies self._mask
    filtered = self.filter(zscored)

    if not pca:
        self.traindata = filtered
    else:
        self._pca = PCA(filtered, minfrac=0.05)
        self.traindata = self._pca.project(filtered)
def __call__(self):
    """Return the Euclidean distance of every sample to the example mean.

    ``self.data`` and ``self.treedata`` are both normalized with a
    ``ZScore`` built from ``self.data``, passed through ``filter_nans``
    and then each entry along the first axis of the data is compared with
    the mean (``axis=0``) of the examples.

    Returns:
        numpy array holding one distance per entry along the first axis of
        the filtered data.

    Raises:
        SortingError: if ``self.treedata`` is None.
    """
    if self.treedata is None:
        raise SortingError("No examples for similarity measure available!")

    # z-scoring: samples and examples share the statistics of self.data
    zs = ZScore(self.data, replace_inf=True)
    data_zs = zs.normalize(self.data)
    # z-scored mean value of the treedata
    mu = zs.normalize(self.treedata)
    # filter_nans is defined elsewhere; presumably it drops features that
    # are NaN in either array -- confirm against its definition
    data_zs, mu = filter_nans(data_zs, mu)
    mu = mu.mean(axis=0)

    # Vectorized replacement of the former per-row Python loop: broadcast
    # the mean against all samples and reduce over every axis except the
    # first, which is what summing each row separately did.
    diff = np.asarray(data_zs - mu)
    per_sample_axes = tuple(range(1, diff.ndim))
    return np.sqrt(np.square(diff).sum(axis=per_sample_axes))
def __call__(self):
    """Return the negated cosine similarity of every sample to the example mean.

    ``self.data`` and ``self.treedata`` are both normalized with a
    ``ZScore`` built from ``self.data`` and passed through ``filter_nans``.
    The cosine between each row of the data and the mean (``axis=0``) of
    the examples is computed and multiplied by -1.0 (presumably so that an
    ascending sort lists the most similar samples first -- confirm with
    the caller).

    NOTE: there is no guard against a zero norm; in that case the division
    yields nan/inf values.

    Returns:
        numpy array with one value per row of the filtered data.

    Raises:
        SortingError: if ``self.treedata`` is None or has fewer than two
            columns.
    """
    if self.treedata is None:
        raise SortingError("No examples for similarity measure available!")
    if self.treedata.shape[1] < 2:
        raise SortingError("CosineSimilarity needs at least 2 "
                           "features for sorting")

    # z-scoring: samples and examples share the statistics of self.data
    scaler = ZScore(self.data, replace_inf=True)
    samples = scaler.normalize(self.data)
    examples = scaler.normalize(self.treedata)
    # filter_nans is defined elsewhere; presumably it drops features that
    # are NaN in either array -- confirm against its definition
    samples, examples = filter_nans(samples, examples)

    # z-scored mean value of the examples
    centre = examples.mean(axis=0)

    centre_norm = np.sqrt((centre**2).sum())
    sample_norms = np.sqrt((samples**2).sum(axis=1))
    cosine = np.sum(centre * samples, axis=1) / (centre_norm * sample_norms)
    return -1.0 * cosine