def setUpClass(cls):
    """Fit a FactorAnalysis on a 10-row slice of the data and cache the
    transformed values, model attributes, and an sklearn reference fit
    for the test methods."""
    super().setUpClass()
    cls.log("FA")

    # small data slice shared by all assertions
    cls.X_lo = cls.X()[:10, :]
    frame = pandas.DataFrame(data=cls.X_lo, columns=cls.features())
    cls._spark_lo = TestDimredAPI.spark().createDataFrame(frame)

    # one-shot fit_transform
    cls.fa = FactorAnalysis(cls.spark(), 2, cls.features(), max_iter=5)
    cls.trans = cls.fa.fit_transform(cls._spark_lo)
    cls.trans = (
        split_vector(cls.trans.data.select(FEATURES__), FEATURES__)
        .toPandas()
        .values
    )

    model = cls.fa.model
    cls.W = model.loadings
    cls.ll = model.loglikelihood
    cls.psi = model.error_vcov

    # separate fit followed by transform, to compare against fit_transform
    cls.fa.fit(cls._spark_lo)
    cls.fittransform_data = cls.fa.transform(cls._spark_lo)
    cls.fittransform_data = (
        split_vector(cls.fittransform_data.data.select(FEATURES__), FEATURES__)
        .toPandas()
        .values
    )

    # sklearn reference implementation on the same slice
    cls.sk_fa = sklearn.decomposition.FactorAnalysis(
        n_components=2, max_iter=5, random_state=23
    )
    cls.sk_fit = cls.sk_fa.fit(cls.X_lo)
    cls.sk_trans = cls.sk_fit.transform(cls.X_lo)
def setUpClass(cls):
    """Fit an ICA on a scaled 10-row slice of the data and cache the
    transformed values, model attributes, and an sklearn FastICA
    reference fit for the test methods."""
    super().setUpClass()
    cls.log("ICA")

    # small, scaled data slice shared by all assertions
    cls.X_lo = cls.X()[:10, :]
    cls.X_lo = scale(cls.X_lo)
    frame = pandas.DataFrame(data=cls.X_lo, columns=cls.features())
    cls._spark_lo = TestDimredAPI.spark().createDataFrame(frame)

    # one-shot fit_transform
    cls.ica = ICA(cls.spark(), 2, cls.features())
    cls.trans = cls.ica.fit_transform(cls._spark_lo)
    cls.trans = (
        split_vector(cls.trans.data.select(FEATURES__), FEATURES__)
        .toPandas()
        .values
    )

    model = cls.ica.model
    cls.compo = model.loadings
    cls.W = model.unmixing
    cls.K = model.whitening

    # separate fit followed by transform, to compare against fit_transform
    cls.ica.fit(cls._spark_lo)
    cls.fittransform_data = cls.ica.transform(cls._spark_lo)
    cls.fittransform_data = (
        split_vector(cls.fittransform_data.data.select(FEATURES__), FEATURES__)
        .toPandas()
        .values
    )

    # sklearn reference implementation; whitening is disabled after the
    # fit so the transform matches the un-whitened Spark implementation
    cls.sk_ica = sklearn.decomposition.FastICA(
        n_components=2, algorithm="deflation", fun="exp",
        max_iter=5, random_state=23
    )
    cls.sk_fit = cls.sk_ica.fit(cls.X_lo)
    cls.sk_fit.whiten = False
    cls.sk_trans = cls.sk_fit.transform(cls.X_lo)
def write_clusters(self, outpath, suff="", sort_me=True):
    """Write the mixture components to ``<outpath>-components<suff>``.

    :param outpath: base output path; ``-components`` plus the suffix
        is appended to form the target directory
    :param suff: suffix appended to the directory name (stringified)
    :param sort_me: forwarded to ``_write_clusters``
    """
    outpath = outpath + "-components" + str(suff)
    # lazy %-style logging args, consistent with the sibling
    # write_clusters implementation in this file
    logger.info("Writing components to: %s", outpath)
    mkdir(outpath)
    # expand both vector columns into scalar columns before writing
    data = split_vector(self.data, FEATURES__)
    data = split_vector(data, RESPONSIBILITIES__)
    self._write_clusters(data, outpath, sort_me)
def setUpClass(cls):
    """Fit a PCA on a scaled 10-row slice of the data and cache the
    transformed values, model attributes, and an sklearn PCA reference
    fit for the test methods.

    Fix: dropped the dangling, unused local ``k = 2`` that trailed the
    original implementation.
    """
    super().setUpClass()
    cls.log("PCA")

    cls.X_lo = cls.X()[:10, :]
    # we need to scale this here, because sklearn does not do
    # the scaling for transformations
    cls.X_lo = scale(cls.X_lo)
    df = pandas.DataFrame(data=cls.X_lo, columns=cls.features())
    cls._spark_lo = TestDimredAPI.spark().createDataFrame(df)

    # one-shot fit_transform
    cls.pca = PCA(cls.spark(), 2, cls.features())
    cls.trans = cls.pca.fit_transform(cls._spark_lo)
    cls.trans_panda = split_vector(
        cls.trans.data.select(FEATURES__), FEATURES__).toPandas()
    cls.trans = cls.trans_panda.values

    model = cls.pca.model
    cls.loadings = model.loadings
    cls.sds = model.sds

    # separate fit followed by transform, to compare against fit_transform
    cls.pca.fit(cls._spark_lo)
    cls.fittransform_trans = cls.pca.transform(cls._spark_lo)
    cls.fittransform_trans = split_vector(
        cls.fittransform_trans.data.select(FEATURES__),
        FEATURES__).toPandas().values

    # sklearn reference implementation on the same slice
    cls.sk_pca = sklearn.decomposition.PCA(n_components=2)
    cls.sk_pca_trans = cls.sk_pca.fit(cls.X_lo).transform(cls.X_lo)
def _plot(self, outfile):
    """Plot a 10k subsample of the factors: one scatter plot of the
    first two factors and a histogram per factor (at most ten), each
    written in png, pdf, svg and eps."""
    logger.info("Plotting")
    subsamp = as_pandas(split_vector(sample(self.data, 10000), FEATURES__))
    # factor column names are hoisted out of the format loop
    factor_cols = ["f_" + str(idx) for idx in range(min(10, self.n_factors))]
    for suf in ["png", "pdf", "svg", "eps"]:
        scatter(outfile + "-scatter_plot." + suf, subsamp,
                "f_0", "f_1", "Factor 1", "Factor 2")
        for col in factor_cols:
            histogram(outfile + "-histogram_{}.".format(col) + suf,
                      subsamp[col].values, col)
def write(self, outpath):
    """
    Write a transformed data set to tsv.

    :param outpath: the path to where the files are written.
    """
    outpath = outpath + "-predicted"
    # drop the raw vector columns, then expand probabilities into scalars
    stripped = drop(self.data, FEATURES__, RAW_PREDICTION__)
    expanded = split_vector(stripped, PROBABILITY__)
    write_tsv(expanded, outpath)
def test_kpca_fourier(self):
    """The Fourier features computed in Spark should match the sklearn
    RBF-sampler features column-wise up to sign (hence the sort and the
    absolute-value comparison)."""
    X = self.kpca._preprocess_data(self._spark_lo)
    X = fourier_transform(X,
                          self.kpca.model.fourier_coefficients,
                          self.kpca.model.fourier_offset)
    frame = self.spark().createDataFrame(X.rows.map(lambda row: (row,)))
    values = split_vector(frame, "_1").toPandas().values
    for col in range(5):
        lhs = sorted(values[:, col])
        rhs = sorted(self._X_transformed[:, col])
        assert numpy.allclose(numpy.absolute(lhs),
                              numpy.absolute(rhs),
                              atol=1e-01)
def setUpClass(cls):
    """Fit an LDA on the full data and cache the projection, the
    transformed values, and an sklearn reference fit for the tests."""
    super().setUpClass()
    cls.log("LDA")

    # sklearn reference implementation
    cls.sk_lda = LinearDiscriminantAnalysis(n_components=2, solver="eigen")
    cls.sk_lda_trans = cls.sk_lda.fit(cls.X(), cls.y()).transform(cls.X())

    cls.lda = LDA(cls.spark(), 2, cls.features(), cls.response())
    cls.trans = cls.lda.fit_transform(cls.spark_df())
    model = cls.lda.model
    cls.evec = model.projection

    # NOTE(review): fit_transform is run a second time here — looks
    # intentional for comparing repeated fits, but confirm
    cls.fit_tran = cls.lda.fit_transform(cls.spark_df())
    cls.fittransform_data = split_vector(
        cls.fit_tran.data.select(FEATURES__),
        FEATURES__).toPandas().values
def setUpClass(cls):
    """Fit a KPCA on a scaled 10-row slice and cache the transforms,
    the model's Fourier parameters, and an sklearn PCA reference built
    on RBF-sampled features."""
    super().setUpClass()
    cls.log("KPCA")

    cls.X_lo = cls.X()[:10, :]
    cls.X_lo = scale(cls.X_lo)
    frame = pandas.DataFrame(data=cls.X_lo, columns=cls.features())
    cls._spark_lo = TestDimredAPI.spark().createDataFrame(frame)

    # sklearn RBF feature map + PCA as the reference pipeline
    cls.sbf_feature = sklearn.kernel_approximation.RBFSampler(
        random_state=23, n_components=5)
    cls._X_transformed = cls.sbf_feature.fit_transform(cls.X_lo)
    cls.sk_pca = PCA(n_components=2).fit(cls._X_transformed)

    # one-shot fit_transform
    cls.kpca = KPCA(cls.spark(), 2, cls.features(), 5, 1.)
    cls.trans = cls.kpca.fit_transform(cls._spark_lo)
    cls.trans = (
        split_vector(cls.trans.data.select(FEATURES__), FEATURES__)
        .toPandas()
        .values
    )

    model = cls.kpca.model
    cls.evals = model.loadings
    cls.sds = model.sds
    cls.w = model.fourier_coefficients
    cls.b = model.fourier_offset

    # separate fit followed by transform, to compare against fit_transform
    cls.kpca.fit(cls._spark_lo)
    cls.fittransform_trans = cls.kpca.transform(cls._spark_lo)
    cls.fittransform_trans = (
        split_vector(cls.fittransform_trans.data.select(FEATURES__),
                     FEATURES__)
        .toPandas()
        .values
    )

    # The sklearn PCA would subtract the mean here.
    # We don't want that to happen, but want to work with the Fourier
    # matrix directly; setting the mean to None does the trick.
    cls.sk_pca.mean_ = None
    cls.sk_pca.components_ = cls.evals
    cls.sk_pca_trans = cls.sk_pca.transform(cls._X_transformed)
def fit(self, data, outpath=None):
    """Fit the KMeans model on *data* and return ``self``.

    :param data: Spark DataFrame containing a ``FEATURES__`` vector column
    :param outpath: optional path forwarded to ``tot_var`` and ``_fit``
    :return: self, with ``self.model`` set to the fitted model
    """
    n_rows, n_cols = dimension(data)
    features = data.select(FEATURES__)
    total_variance = self.tot_var(split_vector(features, FEATURES__), outpath)
    self.model = self._fit(
        KMeansFitProfile(), outpath, features, n_rows, n_cols, total_variance)
    return self
def test_transform_forest_binomial(self):
    """The probability vector is expanded into one column per class."""
    frame = split_vector(self.transform_bin.data, PROBABILITY__).toPandas()
    columns = frame.columns.values
    assert "p_0" in columns
    assert "p_1" in columns
def write_tsv(self, outfolder):
    """Expand the feature vector into scalar columns and write as tsv."""
    expanded = split_vector(self.data, FEATURES__)
    write_tsv(expanded, outfolder)
def write_clusters(self, outpath, suff="", sort_me=True):
    """Write the fitted clusters to ``<outpath>-clusters<suff>``.

    :param outpath: base output path; ``-clusters`` plus the suffix is
        appended to form the target directory
    :param suff: suffix appended to the directory name (stringified)
    :param sort_me: forwarded to ``_write_clusters``
    """
    target = outpath + "-clusters" + str(suff)
    logger.info("Writing clusters to: %s", target)
    mkdir(target)
    expanded = split_vector(self.data, FEATURES__)
    self._write_clusters(expanded, target, sort_me)
def test_transform_glm_binomial(self):
    """A prediction column is present after the transform.

    NOTE(review): unlike the forest test this only checks "prediction",
    not the split p_0/p_1 columns — confirm that is intentional.
    """
    frame = split_vector(self.transform_bin.data, PROBABILITY__).toPandas()
    assert "prediction" in frame.columns.values