Пример #1
0
    def run(self, data1: Data, data2: Data) -> ClusterMatcherResult:
        """Rename the clusters of ``data2`` so that they match ``data1``.

        Both inputs are deep-copied. Rows whose index label is not present in
        both data frames are dropped from the copies. Every cluster of
        ``data2`` is then renamed to the cluster label that occurs most often
        in ``data1`` among the rows belonging to that cluster.

        Args:
            data1: Reference data object.
            data2: Data object whose clusters get renamed.

        Returns:
            :class:`ClusterMatcherResult` holding the two restricted copies
            and the renaming dictionary (old label of ``data2`` -> new label).
        """
        # todo [perf, low effort, med prio]: for speedup: only use pd.Series of
        #   clusters
        ndata1 = data1.copy(deep=True)
        ndata2 = data2.copy(deep=True)

        # 1. Throw out points that are not contained in both data objects.
        # An Index (not a set) is used as indexer because recent pandas
        # versions refuse sets in ``.loc``.
        common_index = ndata1.df.index.intersection(ndata2.df.index)
        ndata1.df = ndata1.df.loc[common_index]
        ndata2.df = ndata2.df.loc[common_index]

        # 2. Rename clusters
        clusters1 = ndata1.df[self.cluster_column]
        clusters2 = ndata2.df[self.cluster_column]
        dct = {}
        for cluster2 in set(clusters2):
            mask = clusters2 == cluster2
            # np.bincount only works for non-negative integer labels
            most_likely = np.argmax(np.bincount(clusters1[mask]))
            dct[cluster2] = most_likely

        ndata2.df[self.cluster_column] = clusters2.map(dct)

        return ClusterMatcherResult(data1=ndata1, data2=ndata2, rename_dct=dct)
Пример #2
0
 def setUp(self):
     """Create four data objects with hand-written ``cluster`` columns."""
     cluster_columns = [
         ("d1", [1, 1, 2, 2, 3]),
         ("d2", [2, 2, 3, 3, 1]),
         ("d3", [2, 1, 2, 2, 3]),
         ("d4", [4, 1, 2, 2, 3]),
     ]
     # First create all (empty) data objects, then fill their data frames
     for attribute, _ in cluster_columns:
         setattr(self, attribute, Data())
     for attribute, clusters in cluster_columns:
         getattr(self, attribute).df = pd.DataFrame({"cluster": clusters})
Пример #3
0
 def test_run_identity(self):
     """Scan with ``func_identity`` and check columns, values and writing."""
     scanner = Scanner()
     data = Data()
     scanner.set_spoints_equidist({"a": (0, 1, 2)})
     scanner.set_dfunction(func_identity)
     result = scanner.run(data)
     result.write()

     column_names = sorted(list(data.df.columns))
     self.assertEqual(column_names, ["a", "bin0"])

     expected_values = np.array([[0.0, 0.0], [1.0, 1.0]])
     self.assertAllClose(data.df.values, expected_values)

     # Writing to the temporary directory must not raise
     output_path = Path(self.tmpdir.name) / "test.sql"
     data.write(output_path)
Пример #4
0
def test_dress_rehearsal(tmp_path):
    """Run a scan, write and reload it, then cluster and benchmark it."""
    # Can remove str casting once we remove py3.5 support
    output_file = str(tmp_path / "dress_rehearsal.sql")

    # --- Scan ---
    scanner = WilsonScanner(scale=5, eft="WET", basis="flavio")
    scanner.set_dfunction(
        random_kinematics,
        sampling=np.linspace(0.0, 1.0, 10),
        normalize=True,
    )
    scanner.set_no_workers(no_workers=1)
    scan_ranges = {
        "CVL_bctaunutau": (-0.5, 0.5, 3),
        "CSL_bctaunutau": (-0.5, 0.5, 3),
        "CT_bctaunutau": (-0.1, 0.1, 3),
    }
    scanner.set_spoints_equidist(scan_ranges)

    scanned = Data()
    scan_result = scanner.run(scanned)
    scan_result.write()
    scanned.write(output_file, overwrite="overwrite")

    # --- Reload with errors ---
    with_errors = DataWithErrors(output_file)
    with_errors.add_rel_err_uncorr(0.01)
    with_errors.add_err_poisson(1000)

    # --- Cluster and benchmark ---
    clusterer = HierarchyCluster()
    clusterer.set_metric(chi2_metric)
    benchmarker = Benchmark()
    benchmarker.set_metric(chi2_metric)

    clusterer.set_max_d(1)
    clusterer.run(with_errors).write()
    benchmarker.run(with_errors).write()
Пример #5
0
 def test_run_simple_bins_singlecore(self):
     """Scan with one worker and a binned function; check the output."""
     scanner = Scanner()
     data = Data()
     scanner.set_spoints_equidist({"a": (0, 1, 2)})
     scanner.set_dfunction(func_zero_bins, binning=[0, 1, 2])
     scanner.set_no_workers(1)
     result = scanner.run(data)
     result.write()

     column_names = sorted(list(data.df.columns))
     self.assertEqual(column_names, ["a", "bin0", "bin1"])

     expected_values = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])
     self.assertAllClose(data.df.values, expected_values)

     # Writing to the temporary directory must not raise
     output_path = Path(self.tmpdir.name) / "test.sql"
     data.write(output_path)
Пример #6
0
    def run(
        self,
        data: Data,
        cluster: Cluster,
        benchmark: Optional[AbstractBenchmark] = None,
    ) -> SubSampleStabilityTesterResult:
        """ Run test.

        A deep copy of ``data`` is clustered once and serves as reference.
        Afterwards, random subsamples of ``data`` are clustered (and
        optionally benchmarked) and every configured figure of merit is
        evaluated on the pair (reference, subsample).

        Args:
            data: :class:`~clusterking.data.Data` object
            cluster: Pre-configured :class:`~clusterking.cluster.Cluster`
                object
            benchmark: Optional: Benchmark object that is run on every
                subsample. If given and the sampling options do not contain
                ``bpoints``, ``bpoints=True`` is added to them.

        Returns:
            :class:`SubSampleStabilityTesterResult` object

        Raises:
            ValueError: If the sampling has not been configured.
        """
        if not self._sample_kwargs:
            raise ValueError(
                "You need to configure sampling with set_sampling before "
                "you can run this method."
            )

        reference = data.copy(deep=True)
        cluster.run(reference).write()

        repetitions = range(self._repeat)
        if self._progress_bar:
            repetitions = tqdm.auto.tqdm(repetitions)

        fom_results = collections.defaultdict(list)

        # Work on a copy so that the stored configuration stays untouched
        sample_kwargs = copy.deepcopy(self._sample_kwargs)
        add_bpoints = (
            benchmark is not None and "bpoints" not in self._sample_kwargs
        )
        if add_bpoints:
            sample_kwargs["bpoints"] = True

        for _ in repetitions:
            subsample = data.sample_param_random(**sample_kwargs)
            cluster.run(subsample).write()
            if benchmark is not None:
                benchmark.run(subsample).write()
            for name, fom_worker in self._foms.items():
                try:
                    value = fom_worker.run(reference, subsample).fom
                except ValueError:
                    # A failed figure of merit is recorded as -1
                    value = -1
                fom_results[name].append(value)

        return SubSampleStabilityTesterResult(df=pd.DataFrame(fom_results))
    def load(cls,
             directory: Union[str, PurePath],
             loader: Optional[Callable] = None) -> "NoisySampleResult":
        """Load from output directory

        All files matching ``data_*.sql`` in the directory are loaded in
        sorted order of their paths.

        Args:
            directory: Path to directory to load from
            loader: Function used to load data (optional). It is called with
                the path of each file; if not given, ``Data(path)`` is used.

        Returns:
            :class:`NoisySampleResult` containing the loaded samples

        Raises:
            FileNotFoundError: If ``directory`` is not an existing directory

        Example:

        .. code-block:: python

            def loader(path):
                d = clusterking.DataWithErrors(path)
                d.add_rel_err_uncorr(0.01)
                return d

            nsr = NoisySampleResult.load("/path/to/dir/", loader=loader)

        """
        directory = Path(directory)
        if not directory.is_dir():
            msg = "{} does not exist or is not a directory".format(directory)
            raise FileNotFoundError(msg)
        load_one = Data if loader is None else loader
        sample_paths = sorted(directory.glob("data_*.sql"))
        samples = [load_one(sample_path) for sample_path in sample_paths]
        return NoisySampleResult(samples=samples)
Пример #8
0
 def setUp(self):
     """Prepare a configured Wilson scanner and an empty data object."""
     self.s = WilsonScanner(scale=5, eft="WET", basis="flavio")
     coefficients = ("CVL_bctaunutau", "CSL_bctaunutau", "CT_bctaunutau")
     scan_ranges = {coefficient: (-1, 1, 2) for coefficient in coefficients}
     self.s.set_spoints_equidist(scan_ranges)
     self.s.set_dfunction(simple_func, binning=[0, 1, 2], normalize=True)
     self.d = Data()
Пример #9
0
 def test_run_simple_bins_sample(self):
     """Scan with a sampled function; check columns, values and writing."""
     scanner = Scanner()
     data = Data()
     scanner.set_spoints_equidist({"a": (0, 2, 3)})
     scanner.set_dfunction(func_sum_indentity_x, sampling=[0, 1, 2])
     result = scanner.run(data)
     result.write()

     column_names = sorted(list(data.df.columns))
     self.assertEqual(column_names, ["a", "bin0", "bin1", "bin2"])

     print(data.df.values)
     expected_rows = [
         [0.0, 0.0, 0.0, 0.0],
         [1.0, 0.0, 1.0, 2.0],
         [2.0, 0.0, 2.0, 4.0],
     ]
     self.assertAllClose(data.df.values, np.array(expected_rows))

     # Writing to the temporary directory must not raise
     output_path = Path(self.tmpdir.name) / "test.sql"
     data.write(output_path)
Пример #10
0
    def run(self, data1: Data, data2: Data) -> ClusterMatcherResult:
        """Rename the clusters of ``data2`` based on the order of clusters.

        The order of the clusters of both data objects is determined with
        ``_get_order_of_clusters``; a cluster of ``data2`` is renamed to the
        cluster of ``data1`` that has the same order value.

        Args:
            data1: Reference data object.
            data2: Data object whose clusters get renamed.

        Returns:
            :class:`ClusterMatcherResult` with deep copies of both data
            objects (clusters of the second one renamed) and the renaming
            dictionary.

        Raises:
            ValueError: If the numbers of distinct cluster labels differ.
        """
        ndata1 = data1.copy(deep=True)
        ndata2 = data2.copy(deep=True)

        column = self.cluster_column
        n_labels1 = len(data1.df[column].unique())
        n_labels2 = len(data2.df[column].unique())
        if n_labels1 != n_labels2:
            raise ValueError("Cluster numbers don't match")

        order1 = self._get_order_of_clusters(data1)
        order2 = self._get_order_of_clusters(data2)
        # Lookup: order value -> cluster label of data1
        label_by_order = {}
        for label, order in order1.items():
            label_by_order[order] = label
        rename_dct = {
            cluster: label_by_order[order2[cluster]] for cluster in order2
        }

        ndata2.df[column] = ndata2.df[column].map(rename_dct)
        return ClusterMatcherResult(
            data1=ndata1, data2=ndata2, rename_dct=rename_dct
        )
Пример #11
0
 def test(self):
     """Check the renaming dictionary of the trivial cluster matcher."""
     reference = Data()
     relabeled = Data()
     reference.df = pd.DataFrame({"cluster": [1, 1, 2, 2, 3]})
     relabeled.df = pd.DataFrame({"cluster": [2, 2, 3, 3, 1]})
     matcher = TrivialClusterMatcher()
     result = matcher.run(reference, relabeled)
     expected = {2: 1, 3: 2, 1: 3}
     self.assertDictEqual(result.rename_dct, expected)
 def test_sss(self):
     """Check that the sub sample stability tester runs without raising."""
     data = Data()
     scanner = Scanner()
     scanner.set_no_workers(1)
     scanner.set_spoints_equidist({"a": (0, 1, 4)})
     scanner.set_dfunction(func_one)
     scan_result = scanner.run(data)
     scan_result.write()

     kmeans = KmeansCluster()
     kmeans.set_kmeans_options(n_clusters=2)

     tester = SubSampleStabilityTester()
     tester.set_sampling(frac=0.95)
     tester.set_repeat(2)
     tester.run(data=data, cluster=kmeans)
    def test_noisy_sample(self):
        """Generate noisy samples, write and reload them, then test them."""
        data = Data()
        scanner = Scanner()
        scanner.set_no_workers(1)
        scanner.set_spoints_equidist({"a": (0, 1, 2)})
        scanner.set_dfunction(func_zero)

        noisy_sample = NoisySample()
        noisy_sample.set_repeat(1)
        noisy_sample.set_noise("gauss", mean=0.0, sigma=1 / 30 / 4)
        nsr = noisy_sample.run(scanner=scanner, data=data)
        self.assertEqual(len(nsr.samples), 2)

        # Round trip: what we load must equal what we wrote
        nsr.write(self.tmpdir.name, non_empty="raise")
        nsr_loaded = NoisySampleResult.load(self.tmpdir.name)
        for index in range(2):
            written = nsr.samples[index].df.to_dict()
            loaded = nsr_loaded.samples[index].df.to_dict()
            self.assertDictEqual(written, loaded)

        kmeans = KmeansCluster()
        kmeans.set_kmeans_options(n_clusters=2)
        tester = NoisySampleStabilityTester()
        tester.run(nsr, cluster=kmeans)
class TestHierarchyCluster(MyTestCase):
    """Tests for :class:`HierarchyCluster` based on the ``1d.sql`` fixture."""

    def setUp(self):
        """Load the data file from the ``data`` directory beside this file."""
        self.ddir = Path(__file__).parent / "data"
        self.dname = "1d.sql"
        self.d = Data(self.ddir / self.dname)

    def test_cluster(self):
        """Cluster with two different cut-offs and check the cluster counts."""
        d = self.d.copy()
        c = HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(0.75)
        c.run(d).write()
        c.set_max_d(1.5)
        c.run(d).write(cluster_column="cluster15")
        # The minimal distance between our distributions is 1, so they all
        # end up in different clusters
        self.assertEqual(len(d.clusters()), self.d.n)
        # This is a bit unfortunate, since we have so many distribution pairs
        # with equal distance (so it's up to the implementation of the algorithm
        # , which clusters develop) but this is what happened so far:
        self.assertEqual(len(d.clusters(cluster_column="cluster15")), 6)

    def test_reuse_hierarchy(self):
        """Reusing the hierarchy of a previous run gives the same clusters."""
        d = self.d.copy()
        c = HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(1.5)
        r = c.run(d)
        r.write()
        r2 = c.run(d, reuse_hierarchy_from=r)
        r2.write(cluster_column="reused")
        self.assertListEqual(d.df["cluster"].tolist(), d.df["reused"].tolist())

    def test_reuse_hierarchy_fail_different_data(self):
        """Reusing a hierarchy with another data object raises ValueError."""
        d = self.d.copy()
        e = self.d.copy()
        c = HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(1.5)
        r = c.run(d)
        r.write()
        with self.assertRaises(ValueError) as ex:
            c.run(e, reuse_hierarchy_from=r)
        self.assertIn("different data object", str(ex.exception))

    def test_reuse_hierarchy_fail_different_cluster(self):
        """Reusing a hierarchy with another worker raises ValueError."""
        d = self.d.copy()
        c = HierarchyCluster()
        c2 = HierarchyCluster()
        c.set_metric("euclidean")
        c.set_max_d(1.5)
        c2.set_metric("euclidean")
        c2.set_max_d(1.5)
        r = c.run(d)
        r.write()
        with self.assertRaises(ValueError) as ex:
            # Pass the very same data object that ``r`` was produced from, so
            # that only the HierarchyCluster object differs.
            c2.run(d, reuse_hierarchy_from=r)
        self.assertIn("different HierarchyCluster object", str(ex.exception))

    def test_hierarchy_cluster_no_max_d(self):
        """Running without a configured ``max_d`` raises ValueError."""
        d = self.d.copy()
        c = HierarchyCluster()
        with self.assertRaises(ValueError) as ex:
            c.run(d)
        self.assertIn("set_max_d", str(ex.exception))

    def test_dendrogram_plot(self):
        """Check that plotting the dendrogram runs without raising."""
        c = HierarchyCluster()
        c.set_metric()
        c.set_max_d(0.2)
        r = c.run(self.d)
        r.dendrogram()
def _data():
    """Return the data object loaded from ``data/1d.sql`` beside this file."""
    data_path = Path(__file__).parent / "data" / "1d.sql"
    return Data(data_path)
Пример #16
0
 def setUp(self):
     """Load ``1d_clustered.sql`` from the ``data`` directory beside this file."""
     here = Path(__file__).parent
     self.ddir = here / "data"
     self.dname = "1d_clustered.sql"
     data_path = self.ddir / self.dname
     self.d = Data(data_path)
Пример #17
0
 def setUp(self):
     """Load ``test.sql`` and store the values the tests compare against."""
     fixture_dir = Path(__file__).parent / "data"
     self.data = [[100, 200], [400, 500]]
     self.d = Data(fixture_dir / "test.sql")
Пример #18
0
class TestData(MyTestCase):
    """Tests for :class:`Data` based on the ``test.sql`` fixture."""

    def setUp(self):
        """Load ``test.sql`` and store the values the tests compare against."""
        fixture_dir = Path(__file__).parent / "data"
        self.data = [[100, 200], [400, 500]]
        self.d = Data(fixture_dir / "test.sql")

    def nd(self):
        """Return a deep copy of the loaded data object."""
        fresh = self.d.copy(deep=True)
        return fresh

    # **************************************************************************
    # Property shortcuts
    # **************************************************************************

    def test_bin_cols(self):
        expected = ["bin0", "bin1"]
        self.assertEqual(self.d.bin_cols, expected)

    def test_par_cols(self):
        expected = ["CVL_bctaunutau", "CT_bctaunutau", "CSL_bctaunutau"]
        self.assertEqual(self.d.par_cols, expected)

    def test_n(self):
        n_points = self.d.n
        self.assertEqual(n_points, 2)

    def test_nbins(self):
        n_bins = self.d.nbins
        self.assertEqual(n_bins, 2)

    def test_npars(self):
        n_parameters = self.d.npars
        self.assertEqual(n_parameters, 3)

    def test__dist_xrange(self):
        x_range = self.d._dist_xrange
        self.assertEqual(x_range, (0, 20))

    # **************************************************************************
    # Returning things
    # **************************************************************************

    def test_data(self):
        values = self.d.data()
        self.assertAllClose(values, self.data)

    def test_norms(self):
        norms = self.d.norms()
        self.assertAllClose(norms, [300, 900])

    def test_clusters(self):
        default_clusters = self.d.clusters()
        self.assertEqual(default_clusters, [0])
        other_clusters = self.d.clusters(cluster_column="other_cluster")
        self.assertEqual(other_clusters, [0, 1])

    def test_get_param_values(self):
        parameter_names = sorted(self.d.get_param_values().keys())
        expected_names = sorted(
            ["CVL_bctaunutau", "CT_bctaunutau", "CSL_bctaunutau"]
        )
        self.assertEqual(parameter_names, expected_names)

        cvl_values = self.d.get_param_values("CVL_bctaunutau")
        self.assertAlmostEqual(cvl_values[0], -1.0)
        ct_values = self.d.get_param_values("CT_bctaunutau")
        self.assertAlmostEqual(ct_values[1], 0.0)

    def test_data_normed(self):
        normed = self.d.data(normalize=True)
        expected = [[1 / 3, 2 / 3], [4 / 9, 5 / 9]]
        self.assertAllClose(normed, expected)

    # **************************************************************************
    # Subsample
    # **************************************************************************

    # see next class

    # **************************************************************************
    # Quick plots
    # **************************************************************************
    # We just check that they run without throwing.

    def test_plot_dist(self):
        data = self.d
        data.plot_dist()

    def test_plot_dist_minmax(self):
        data = self.d
        data.plot_dist_minmax()

    def test_plot_dist_box(self):
        data = self.d
        data.plot_dist_box()

    def test_plot_clusters_scatter(self):
        self.d.plot_clusters_scatter()
        parameters = ["CVL_bctaunutau", "CT_bctaunutau", "CSL_bctaunutau"]
        # Plot with three, two and one parameter(s)
        for n_parameters in (3, 2, 1):
            self.d.plot_clusters_scatter(parameters[:n_parameters])

    def test_plot_clusters_fill(self):
        parameters = ["CVL_bctaunutau", "CT_bctaunutau"]
        self.d.plot_clusters_fill(parameters)
Пример #19
0
 def setUp(self):
     """Load ``test_longer.sql`` from the ``data`` directory beside this file."""
     fixture_dir = Path(__file__).parent / "data"
     self.d = Data(fixture_dir / "test_longer.sql")
Пример #20
0
class TestSubSample(MyTestCase):
    """Tests for the subsampling methods of :class:`Data`.

    All tests use the ``test_longer.sql`` fixture.
    """

    def setUp(self):
        """Load ``test_longer.sql`` from the ``data`` directory beside this file."""
        fixture_dir = Path(__file__).parent / "data"
        self.d = Data(fixture_dir / "test_longer.sql")

    def nd(self):
        """Return a deep copy of the loaded data object."""
        fresh = self.d.copy(deep=True)
        return fresh

    def test_only_bpoints(self):
        default_column = self.d.only_bpoints()
        self.assertEqual(default_column.n, 1)
        # (benchmark point column, expected number of points)
        for column, expected_n in (("bpoint1", 2), ("bpoint2", 3)):
            selected = self.d.only_bpoints(bpoint_column=column)
            self.assertEqual(selected.n, expected_n)

    def test_fix_param(self):
        # (arguments, expected number of points, expected parameter values)
        cases = [
            (dict(a=0), 16, [("a", [0.0])]),
            (dict(a=-100), 16, [("a", [0.0])]),
            (dict(a=2.3), 16, [("a", [2.0])]),
            (dict(a=[0, 2.3]), 32, [("a", [0.0, 2.0])]),
            (
                dict(a=[0, 2.3], b=0),
                8,
                [("a", [0.0, 2.0]), ("b", [0.0])],
            ),
            (
                dict(a=[0, 2.3], b=0, c=0.0),
                2,
                [("a", [0.0, 2.0]), ("b", [0.0]), ("c", [0.0])],
            ),
        ]
        for arguments, expected_n, expected_values in cases:
            subset = self.d.fix_param(**arguments)
            self.assertEqual(subset.n, expected_n)
            for parameter, values in expected_values:
                self.assertAllClose(subset.get_param_values(parameter), values)

    def test_fix_param_bpoints(self):
        # (arguments, expected number of points)
        cases = [
            (dict(a=[], bpoints=True), 1),
            (dict(a=[], bpoints=True, bpoint_column="bpoint1"), 2),
            (dict(a=0.0, bpoints=True, bpoint_column="bpoint1"), 16),
            (dict(c=0.0, bpoints=True, bpoint_column="bpoint1"), 17),
            (
                dict(
                    a=0.0,
                    b=0.0,
                    c=0.0,
                    bpoints=True,
                    bpoint_column="bpoint1",
                ),
                2,
            ),
        ]
        for arguments, expected_n in cases:
            subset = self.d.fix_param(**arguments)
            self.assertEqual(subset.n, expected_n)

    def test_fix_param_bpoint_slices(self):
        # (arguments, expected number of points)
        cases = [
            (dict(a=[], bpoint_slices=True), 16),
            (dict(c=[], bpoint_slices=True, bpoint_column="bpoint2"), 3 * 16),
            (
                dict(
                    a=[],
                    b=[],
                    c=[],
                    bpoint_slices=True,
                    bpoint_column="bpoint2",
                ),
                3,
            ),
        ]
        for arguments, expected_n in cases:
            subset = self.d.fix_param(**arguments)
            self.assertEqual(subset.n, expected_n)

    def test_sample_param(self):
        # (arguments, expected number of points)
        cases = [
            (dict(a=0), 0),
            (dict(a=3), 3 * 4 * 4),
            (dict(a=4), 4 * 4 * 4),
            (dict(a=10), 4 * 4 * 4),
            (dict(a=3, b=3, c=3), 3 * 3 * 3),
            (dict(a=(0, 0.4, 3)), 1 * 4 * 4),
            (dict(a=(0, 1, 3)), 2 * 4 * 4),
            (dict(a=(0, 1, 3), b=2, c=2), 2 * 2 * 2),
            (dict(a=(0, 1, 3), b=(0, 1, 3), c=2), 2 * 2 * 2),
        ]
        for arguments, expected_n in cases:
            subset = self.d.sample_param(**arguments)
            self.assertEqual(subset.n, expected_n)

    def test_sample_param_bpoints(self):
        with_default_column = self.d.sample_param(a=0, bpoints=True)
        self.assertEqual(with_default_column.n, 1)

        with_bpoint2 = self.d.sample_param(
            a=0, bpoints=True, bpoint_column="bpoint2"
        )
        self.assertEqual(with_bpoint2.n, 3)

    def test_sample_param_bpoint_slices(self):
        with_default_column = self.d.sample_param(a=0, bpoint_slices=True)
        self.assertEqual(with_default_column.n, 16)

        with_bpoint2 = self.d.sample_param(
            a=0, bpoint_slices=True, bpoint_column="bpoint2"
        )
        self.assertEqual(with_bpoint2.n, 16)

    def test_sample_param_random(self):
        subset = self.d.sample_param_random(n=5)
        self.assertEqual(subset.n, 5)

    def test_find_closest_spoints(self):
        # Single closest point
        closest = self.d.find_closest_spoints(point=dict(a=0, b=0, c=0), n=1)
        found = closest.df[["a", "b", "c"]].values
        self.assertAllClose(found, np.array([0, 0, 0]))

        # Several closest points: compare after sorting the found points
        closest = self.d.find_closest_spoints(point=dict(a=0, b=0, c=0), n=4)
        found = sorted(closest.df[["a", "b", "c"]].values.tolist())
        self.assertAllClose(
            found, [[0, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0]]
        )

        closest = self.d.find_closest_spoints(point=dict(a=0, b=1, c=0), n=5)
        found = sorted(closest.df[["a", "b", "c"]].values.tolist())
        self.assertAllClose(
            found, [[0, 0, 0], [0, 1, 0], [0, 1, 1], [0, 2, 0], [1, 1, 0]]
        )