Exemplo n.º 1
0
    def run(self, data1: Data, data2: Data) -> ClusterMatcherResult:
        """Rename the clusters of ``data2`` after the clusters of ``data1``.

        Both inputs are deep-copied, so the caller's objects are not
        modified. The copies are restricted to the index labels that occur
        in both data frames. Each cluster of the second copy is then renamed
        to the cluster number that is most frequent in the first copy among
        the rows belonging to that cluster.

        Args:
            data1: Data object whose cluster numbers serve as reference.
            data2: Data object whose clusters are renamed.

        Returns:
            :class:`ClusterMatcherResult` holding the two restricted copies
            and the dictionary that was used for renaming
            (``{old cluster of data2: new cluster}``).
        """
        # todo [perf, low effort, med prio]: for speedup: only use pd.Series of
        #   clusters
        ndata1 = data1.copy(deep=True)
        ndata2 = data2.copy(deep=True)

        # 1. Throw out everything that is not contained in both data frames.
        # A pandas Index is used as indexer (not a set): pandas >= 2.0
        # raises a TypeError for sets passed to .loc. ``unique`` mirrors the
        # de-duplication the former set-based implementation performed.
        shared_index = ndata1.df.index.intersection(ndata2.df.index).unique()
        ndata1.df = ndata1.df.loc[shared_index]
        ndata2.df = ndata2.df.loc[shared_index]

        # 2. Rename clusters
        column1 = ndata1.df[self.cluster_column]
        column2 = ndata2.df[self.cluster_column]
        dct = {}
        for cluster2 in set(column2):
            mask = column2 == cluster2
            # np.bincount counts occurrences of non-negative integers, so
            # the cluster labels of data1 are assumed to be such integers.
            # np.argmax picks the most frequent one (the smallest on ties).
            dct[cluster2] = np.argmax(np.bincount(column1[mask]))

        ndata2.df[self.cluster_column] = column2.map(dct)

        return ClusterMatcherResult(data1=ndata1, data2=ndata2, rename_dct=dct)
Exemplo n.º 2
0
    def run(self, data1: Data, data2: Data) -> ClusterMatcherResult:
        """Rename the clusters of ``data2`` by pairing up cluster orderings.

        The ordering of the clusters is obtained for both data objects from
        ``self._get_order_of_clusters``. A cluster of ``data2`` is renamed to
        the cluster of ``data1`` that has the same value in that ordering.
        Only a deep copy of ``data2`` is modified.

        Args:
            data1: Data object whose cluster names serve as reference.
            data2: Data object whose clusters are renamed.

        Returns:
            :class:`ClusterMatcherResult` holding deep copies of both inputs
            (the second one with renamed clusters) and the renaming
            dictionary.

        Raises:
            ValueError: If the two data objects do not contain the same
                number of distinct clusters.
        """
        ndata1 = data1.copy(deep=True)
        ndata2 = data2.copy(deep=True)

        column = self.cluster_column
        n_distinct1 = len(data1.df[column].unique())
        n_distinct2 = len(data2.df[column].unique())
        if n_distinct1 != n_distinct2:
            raise ValueError("Cluster numbers don't match")

        order1 = self._get_order_of_clusters(data1)
        order2 = self._get_order_of_clusters(data2)

        # Look up the data1 cluster by its order value ...
        cluster1_by_order = {
            position: name for name, position in order1.items()
        }
        # ... and assign it to the data2 cluster with the same order value.
        rename_dct = {
            name: cluster1_by_order[order2[name]] for name in order2
        }

        ndata2.df[column] = ndata2.df[column].map(rename_dct)
        return ClusterMatcherResult(
            data1=ndata1, data2=ndata2, rename_dct=rename_dct
        )
Exemplo n.º 3
0
    def run(
        self,
        data: Data,
        cluster: Cluster,
        benchmark: Optional[AbstractBenchmark] = None,
    ) -> SubSampleStabilityTesterResult:
        """ Cluster random subsamples and compare them to the full sample.

        A deep copy of ``data`` is clustered once as reference. Afterwards,
        ``self._repeat`` random subsamples are drawn from ``data``, clustered
        (and benchmarked, if a benchmark is given) and every configured
        figure of merit is evaluated on the pair (reference, subsample).

        Args:
            data: :class:`~clusterking.data.Data` object
            cluster: Pre-configured :class:`~clusterking.cluster.Cluster`
                object
            benchmark: Optional: benchmark object that is run on every
                subsample after clustering. If given and the sampling
                configuration has no ``bpoints`` entry, ``bpoints=True`` is
                passed to the sampling.

        Returns:
            :class:`SubSampleStabilityTesterResult` object wrapping a data
            frame with one column per figure of merit and one row per
            repetition. A value of ``-1`` marks a figure of merit whose
            evaluation raised a ``ValueError``.

        Raises:
            ValueError: If sampling has not been configured.
        """
        if not self._sample_kwargs:
            raise ValueError(
                "You need to configure sampling with set_sampling before "
                "you can run this method."
            )

        # Reference clustering on an independent copy of the full data
        reference = data.copy(deep=True)
        cluster.run(reference).write()

        rounds = range(self._repeat)
        if self._progress_bar:
            rounds = tqdm.auto.tqdm(rounds)

        fom_results = collections.defaultdict(list)

        # Work on a copy so that the stored configuration stays untouched
        sample_kwargs = copy.deepcopy(self._sample_kwargs)
        if benchmark is not None:
            sample_kwargs.setdefault("bpoints", True)

        for _ in rounds:
            subsample = data.sample_param_random(**sample_kwargs)
            cluster.run(subsample).write()
            if benchmark is not None:
                benchmark.run(subsample).write()
            for fom_name, fom in self._foms.items():
                try:
                    value = fom.run(reference, subsample).fom
                except ValueError:
                    value = -1
                fom_results[fom_name].append(value)

        return SubSampleStabilityTesterResult(df=pd.DataFrame(fom_results))
Exemplo n.º 4
0
class TestData(MyTestCase):
    """Tests of the accessors and quick plots of ``Data`` on ``test.sql``."""

    def setUp(self):
        """Load the fixture and store the bin contents expected from it."""
        fixture = Path(__file__).parent / "data" / "test.sql"
        self.data = [[100, 200], [400, 500]]
        self.d = Data(fixture)

    def nd(self):
        """Return an independent deep copy of the loaded data object."""
        return self.d.copy(deep=True)

    # ----------------------------------------------------------------------
    # Property shortcuts
    # ----------------------------------------------------------------------

    def test_bin_cols(self):
        expected = ["bin0", "bin1"]
        self.assertEqual(self.d.bin_cols, expected)

    def test_par_cols(self):
        expected = ["CVL_bctaunutau", "CT_bctaunutau", "CSL_bctaunutau"]
        self.assertEqual(self.d.par_cols, expected)

    def test_n(self):
        self.assertEqual(self.d.n, 2)

    def test_nbins(self):
        self.assertEqual(self.d.nbins, 2)

    def test_npars(self):
        self.assertEqual(self.d.npars, 3)

    def test__dist_xrange(self):
        self.assertEqual(self.d._dist_xrange, (0, 20))

    # ----------------------------------------------------------------------
    # Returning things
    # ----------------------------------------------------------------------

    def test_data(self):
        self.assertAllClose(self.d.data(), self.data)

    def test_norms(self):
        expected = [300, 900]
        self.assertAllClose(self.d.norms(), expected)

    def test_clusters(self):
        default_clusters = self.d.clusters()
        self.assertEqual(default_clusters, [0])
        other_clusters = self.d.clusters(cluster_column="other_cluster")
        self.assertEqual(other_clusters, [0, 1])

    def test_get_param_values(self):
        names = ["CVL_bctaunutau", "CT_bctaunutau", "CSL_bctaunutau"]
        self.assertEqual(
            sorted(self.d.get_param_values().keys()),
            sorted(names),
        )
        cvl_values = self.d.get_param_values("CVL_bctaunutau")
        self.assertAlmostEqual(cvl_values[0], -1.0)
        ct_values = self.d.get_param_values("CT_bctaunutau")
        self.assertAlmostEqual(ct_values[1], 0.0)

    def test_data_normed(self):
        expected = [[1 / 3, 2 / 3], [4 / 9, 5 / 9]]
        self.assertAllClose(self.d.data(normalize=True), expected)

    # ----------------------------------------------------------------------
    # Subsample
    # ----------------------------------------------------------------------

    # see next class

    # ----------------------------------------------------------------------
    # Quick plots
    # ----------------------------------------------------------------------
    # Smoke tests only: the plot methods must run without raising.

    def test_plot_dist(self):
        self.d.plot_dist()

    def test_plot_dist_minmax(self):
        self.d.plot_dist_minmax()

    def test_plot_dist_box(self):
        self.d.plot_dist_box()

    def test_plot_clusters_scatter(self):
        self.d.plot_clusters_scatter()
        columns = ["CVL_bctaunutau", "CT_bctaunutau", "CSL_bctaunutau"]
        # Three, two and finally one parameter column
        for ncols in (3, 2, 1):
            self.d.plot_clusters_scatter(columns[:ncols])

    def test_plot_clusters_fill(self):
        columns = ["CVL_bctaunutau", "CT_bctaunutau"]
        self.d.plot_clusters_fill(columns)
Exemplo n.º 5
0
class TestSubSample(MyTestCase):
    """Tests of the subsampling methods of ``Data`` on ``test_longer.sql``.

    Each test lists the keyword arguments of a call together with the
    number of points (``.n``) the resulting subsample must contain.
    """

    def setUp(self):
        """Load the fixture shared by all tests of this class."""
        fixture = Path(__file__).parent / "data" / "test_longer.sql"
        self.d = Data(fixture)

    def nd(self):
        """Return an independent deep copy of the loaded data object."""
        return self.d.copy(deep=True)

    def test_only_bpoints(self):
        self.assertEqual(self.d.only_bpoints().n, 1)
        for column, expected_n in (("bpoint1", 2), ("bpoint2", 3)):
            subset = self.d.only_bpoints(bpoint_column=column)
            self.assertEqual(subset.n, expected_n)

    def test_fix_param(self):
        # (keyword arguments, expected n, expected remaining parameter values)
        # The expected values show that a requested value is matched to a
        # value that exists in the data (e.g. 2.3 -> 2.0, -100 -> 0.0).
        cases = [
            (dict(a=0), 16, {"a": [0.0]}),
            (dict(a=-100), 16, {"a": [0.0]}),
            (dict(a=2.3), 16, {"a": [2.0]}),
            (dict(a=[0, 2.3]), 32, {"a": [0.0, 2.0]}),
            (dict(a=[0, 2.3], b=0), 8, {"a": [0.0, 2.0], "b": [0.0]}),
            (
                dict(a=[0, 2.3], b=0, c=0.0),
                2,
                {"a": [0.0, 2.0], "b": [0.0], "c": [0.0]},
            ),
        ]
        for kwargs, expected_n, expected_values in cases:
            subset = self.d.fix_param(**kwargs)
            self.assertEqual(subset.n, expected_n)
            for name, values in expected_values.items():
                self.assertAllClose(subset.get_param_values(name), values)

    def test_fix_param_bpoints(self):
        cases = [
            (dict(a=[], bpoints=True), 1),
            (dict(a=[], bpoints=True, bpoint_column="bpoint1"), 2),
            (dict(a=0.0, bpoints=True, bpoint_column="bpoint1"), 16),
            (dict(c=0.0, bpoints=True, bpoint_column="bpoint1"), 17),
            (
                dict(a=0.0, b=0.0, c=0.0, bpoints=True,
                     bpoint_column="bpoint1"),
                2,
            ),
        ]
        for kwargs, expected_n in cases:
            subset = self.d.fix_param(**kwargs)
            self.assertEqual(subset.n, expected_n)

    def test_fix_param_bpoint_slices(self):
        cases = [
            (dict(a=[], bpoint_slices=True), 16),
            (dict(c=[], bpoint_slices=True, bpoint_column="bpoint2"), 3 * 16),
            (
                dict(a=[], b=[], c=[], bpoint_slices=True,
                     bpoint_column="bpoint2"),
                3,
            ),
        ]
        for kwargs, expected_n in cases:
            subset = self.d.fix_param(**kwargs)
            self.assertEqual(subset.n, expected_n)

    def test_sample_param(self):
        # Expected sizes are written as products of the number of values
        # kept for a, b and c.
        cases = [
            (dict(a=0), 0),
            (dict(a=3), 3 * 4 * 4),
            (dict(a=4), 4 * 4 * 4),
            (dict(a=10), 4 * 4 * 4),
            (dict(a=3, b=3, c=3), 3 * 3 * 3),
            (dict(a=(0, 0.4, 3)), 1 * 4 * 4),
            (dict(a=(0, 1, 3)), 2 * 4 * 4),
            (dict(a=(0, 1, 3), b=2, c=2), 2 * 2 * 2),
            (dict(a=(0, 1, 3), b=(0, 1, 3), c=2), 2 * 2 * 2),
        ]
        for kwargs, expected_n in cases:
            subset = self.d.sample_param(**kwargs)
            self.assertEqual(subset.n, expected_n)

    def test_sample_param_bpoints(self):
        subset = self.d.sample_param(a=0, bpoints=True)
        self.assertEqual(subset.n, 1)

        subset = self.d.sample_param(
            a=0, bpoints=True, bpoint_column="bpoint2"
        )
        self.assertEqual(subset.n, 3)

    def test_sample_param_bpoint_slices(self):
        subset = self.d.sample_param(a=0, bpoint_slices=True)
        self.assertEqual(subset.n, 16)

        subset = self.d.sample_param(
            a=0, bpoint_slices=True, bpoint_column="bpoint2"
        )
        self.assertEqual(subset.n, 16)

    def test_sample_param_random(self):
        subset = self.d.sample_param_random(n=5)
        self.assertEqual(subset.n, 5)

    def test_find_closest_spoints(self):
        def closest(point, n):
            # Parameter values of the n sample points closest to ``point``
            found = self.d.find_closest_spoints(point=point, n=n)
            return found.df[["a", "b", "c"]].values

        self.assertAllClose(
            closest(dict(a=0, b=0, c=0), 1),
            np.array([0, 0, 0]),
        )
        # For more than one point the order of the result is not checked,
        # hence the sorting.
        self.assertAllClose(
            sorted(closest(dict(a=0, b=0, c=0), 4).tolist()),
            [[0, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0]],
        )
        self.assertAllClose(
            sorted(closest(dict(a=0, b=1, c=0), 5).tolist()),
            [[0, 0, 0], [0, 1, 0], [0, 1, 1], [0, 2, 0], [1, 1, 0]],
        )
class TestHierarchyCluster(MyTestCase):
    """Tests of ``HierarchyCluster`` on the ``1d.sql`` fixture."""

    def setUp(self):
        """Load the fixture shared by all tests of this class."""
        self.ddir = Path(__file__).parent / "data"
        self.dname = "1d.sql"
        self.d = Data(self.ddir / self.dname)

    def test_cluster(self):
        """Cluster with two different ``max_d`` and check cluster counts."""
        d = self.d.copy()
        c = HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(0.75)
        c.run(d).write()
        c.set_max_d(1.5)
        c.run(d).write(cluster_column="cluster15")
        # The minimal distance between our distributions is 1, so they all
        # end up in different clusters
        self.assertEqual(len(d.clusters()), self.d.n)
        # This is a bit unfortunate, since we have so many distribution pairs
        # with equal distance (so it's up to the implementation of the
        # algorithm which clusters develop), but this is what happened so far:
        self.assertEqual(len(d.clusters(cluster_column="cluster15")), 6)

    def test_reuse_hierarchy(self):
        """Reusing the hierarchy of a result reproduces its clustering."""
        d = self.d.copy()
        c = HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(1.5)
        r = c.run(d)
        r.write()
        r2 = c.run(d, reuse_hierarchy_from=r)
        r2.write(cluster_column="reused")
        self.assertListEqual(d.df["cluster"].tolist(), d.df["reused"].tolist())

    def test_reuse_hierarchy_fail_different_data(self):
        """Reusing a hierarchy with another data object must raise."""
        d = self.d.copy()
        e = self.d.copy()
        c = HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(1.5)
        r = c.run(d)
        r.write()
        with self.assertRaises(ValueError) as ex:
            c.run(e, reuse_hierarchy_from=r)
        self.assertIn("different data object", str(ex.exception))

    def test_reuse_hierarchy_fail_different_cluster(self):
        """Reusing a hierarchy with another cluster object must raise."""
        d = self.d.copy()
        c = HierarchyCluster()
        c2 = HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(1.5)
        c2.set_metric("euclidean")
        c2.set_max_d(1.5)
        r = c.run(d)
        r.write()
        with self.assertRaises(ValueError) as ex:
            # Run on the *same* data object, so that only the differing
            # HierarchyCluster instance can trigger the error. (The context
            # manager must not be passed here: it is not a data object.)
            c2.run(d, reuse_hierarchy_from=r)
        self.assertIn("different HierarchyCluster object", str(ex.exception))

    def test_hierarchy_cluster_no_max_d(self):
        """Running without having called ``set_max_d`` must raise."""
        d = self.d.copy()
        c = HierarchyCluster()
        with self.assertRaises(ValueError) as ex:
            c.run(d)
        self.assertIn("set_max_d", str(ex.exception))

    def test_dendrogram_plot(self):
        """Smoke test: the dendrogram plot must run without raising."""
        c = HierarchyCluster()
        c.set_metric()
        c.set_max_d(0.2)
        r = c.run(self.d)
        r.dendrogram()