示例#1
0
def optimize_cover(X=None,
                   r=30,
                   g=0.67,
                   scale_r=False,
                   scale_g=False,
                   ndim=2,
                   scale_limits=False):
    """ Build a kmapper ``Cover`` from a resolution and an overlap setting.

    Parameters
    ----------
    X : sized object, optional
        When given, ``len(X)`` and ``ndim`` produce a scale factor
        ``(len(X) / 1000) * (2 / ndim)`` that may be applied to r and/or g.
    r : number
        Resolution; becomes the number of cubes (at least 1, truncated to int).
    g : number
        Overlap. Values below 1 are used as an overlap fraction; values of 1
        or more are treated as a gain and converted with ``(g - 1) / g``.
    scale_r, scale_g : bool
        Multiply r / the gain form of g by the scale factor (needs X).
    ndim : int
        Number of lens dimensions; used in the scale factor and for limits.
    scale_limits : bool
        When exactly True, pass limits of ``[-offset, 1 + offset]`` per
        dimension (offset = overlap / n_cubes) and add 2 cubes.

    Returns
    -------
    Cover
        The configured kmapper cover.

    Notes
    -----
    - Requires kmapper

    """
    from kmapper.cover import Cover

    resolution, overlap = r, g

    # Data-driven heuristic: scale by sample count and dimensionality
    if X is not None:
        factor = (len(X) / 1000.) * (2. / ndim)

        if scale_r:
            resolution = resolution * factor

        if scale_g:
            # Scaling is applied to the gain, so convert a fraction first
            gain = 1. / (1. - overlap) if overlap < 1 else overlap
            overlap = gain * factor

    # Anything still expressed as a gain is turned back into a fraction
    if overlap >= 1:
        overlap = (overlap - 1) / float(overlap)

    # Final (rounded) cover parameters
    n_cubes = int(max(1, resolution))
    p_overlap = np.round(float(overlap), 2)

    # Optional padded limits
    if scale_limits is True:
        offset = p_overlap / float(n_cubes)
        limits = [[-offset, 1 + offset] for _ in range(ndim)]
        n_cubes += 2  #* ndim
    else:
        limits = None

    try:
        return Cover(n_cubes, p_overlap, limits=limits)
    except Exception as e:
        # Retry without limits, probably using older version
        fallback = Cover(n_cubes, p_overlap)
        print("[warning]", e)
        return fallback
示例#2
0
    def test_equal_entries(self):
        """Neighbouring cubes over uniform data should share half their points.

        The first two neighbouring pairs are skipped, as in the original test.
        """
        n_cubes, perc_overlap = 10, 0.5

        # uniform data as a single lens column, prefixed with a row-ID column
        values = np.arange(0, 100)[:, np.newaxis]
        row_ids = np.arange(values.shape[0])
        lens = np.c_[row_ids, values]

        cov = Cover(n_cubes, perc_overlap)
        bins = list(cov.fit(lens))  # materialise in case fit returns a generator

        assert len(bins) == n_cubes

        cube_entries = [cov.transform_single(lens, cube) for cube in bins]
        neighbours = list(zip(cube_entries, cube_entries[1:]))

        for left, right in neighbours[2:]:
            left_ids, right_ids = left[:, 0], right[:, 0]  # indices only
            shared = set(left_ids) & set(right_ids)

            calced_overlap = len(shared) / max(len(left_ids), len(right_ids))
            assert calced_overlap == pytest.approx(0.5)
示例#3
0
    def test_cube_dim(self):
        """Every bin defined for a 3-column input should have length 2."""
        cover = Cover(n_cubes=10)
        lens = np.arange(30).reshape(10, 3)

        for cube in cover.define_bins(lens):
            assert len(cube) == 2
示例#4
0
    def test_chunk_dist(self):
        """Every entry of ``cover.chunk_dist`` should be (approximately) 1.8.

        The second column of ``data`` runs from 1 to 19; presumably 1.8 is that
        range of 18 split over 10 cubes -- TODO confirm against Cover.
        """
        data = np.arange(20).reshape(10, 2)

        cover = Cover(n_cubes=10)
        _ = cover.define_bins(data)
        chunks = list(cover.chunk_dist)
        # TODO: this test is really fragile and has a magic number, fix.
        # Compare with a tolerance: exact float equality breaks on any
        # change in how the chunk width is computed.
        assert all(np.isclose(i, 1.8) for i in chunks)
示例#5
0
    def test_entries_even(self):
        """Each cube defined over the 20-row input should hold at least 2 entries."""
        cover = Cover(n_cubes=10)
        data = np.arange(40).reshape(20, 2)

        for cube in cover.define_bins(data):
            found = cover.find_entries(data, cube)
            assert len(found) >= 2
示例#6
0
    def test_bounds(self):
        """Check ``cover.bounds_`` after fitting with partly infinite limits.

        The first lens dimension is given infinite limits and is expected to
        end up bounded by the data's own min/max (0 and 38); the second is
        given explicit limits (-10, 100), which are expected unchanged.
        """
        data_vals = np.arange(40).reshape(20, 2)
        data = np.zeros((20, 3))
        data[:, 0] = np.arange(20, dtype=int)  # Index row
        data[:, 1:3] = data_vals

        # np.inf replaces np.float("inf"): the np.float alias was deprecated
        # in NumPy 1.20 and removed in 1.24.
        limits = np.array([[np.inf, np.inf], [-10, 100]])
        cover = Cover(n_cubes=10, limits=limits)
        cover.fit(data)

        assert np.array_equal(cover.bounds_, np.array([[0, -10], [38, 100]]))
示例#7
0
    def test_entries_in_correct_cubes(self):
        """Rows 2*i and 2*i + 1 of the input should both land in cube i."""
        # TODO: this test is a little hacky

        data = np.arange(40).reshape(20, 2)

        cover = Cover(n_cubes=10)
        cubes = list(cover.define_bins(data))
        entries = [cover.find_entries(data, cube) for cube in cubes]

        # Only the interior of each cube is checked. Sometimes the edges
        # don't line up.
        for idx in range(10):
            members = entries[idx]
            first_row = data[2 * idx]
            second_row = data[2 * idx + 1]
            assert first_row in members
            assert second_row in members
示例#8
0
    def test_cubes_overlap(self):
        """Collect the first value of every entry per cube and compare neighbours."""
        data = np.arange(40).reshape(20, 2)

        cover = Cover(n_cubes=10)

        # one list per cube, holding the first element of each entry found in it
        entries = [
            [row[0] for row in cover.find_entries(data, cube)]
            for cube in cover.define_bins(data)
        ]

        for i in range(9):
            # NOTE(review): a union is truthy as soon as either cube is
            # non-empty, so this does not actually verify that neighbouring
            # cubes share points -- confirm whether an intersection was meant.
            assert set(entries[i]) | set(entries[i + 1])
示例#9
0
    def test_radius_dist(self):
        """Check ``cover.radius_[0]`` for several cube counts, ranges and overlaps."""

        # (n_cubes, feature range, perc_overlap, expected radius)
        cases = [
            (1, [0, 4], 0.4, 10.0 / 3),
            (1, [0, 4], 0.9, 20.0),
            (2, [-4, 4], 0.5, 4.0),
            (3, [-4, 4], 0.5, 2.666666666),
            (10, [-4, 4], 0.5, 0.8),
            (10, [-4, 4], 1.0, np.inf),
        ]

        for n_cubes, feature_range, perc_overlap, expected_radius in cases:
            # rescale the same raw grid into the range under test
            scaler = preprocessing.MinMaxScaler(feature_range=feature_range)
            data = scaler.fit_transform(np.arange(20).reshape(10, 2))

            cover = Cover(n_cubes=n_cubes, perc_overlap=perc_overlap)
            cover.fit(data)

            assert cover.radius_[0] == pytest.approx(expected_radius)
示例#10
0
    def test_bounds(self):
        """Check ``cover.d`` / ``cover.end`` after defining bins with limits.

        The first lens dimension is given infinite limits and is expected to
        end up bounded by the data's own min/max (0 and 38); the second is
        given explicit limits (-10, 100), which are expected unchanged.
        """
        data_vals = np.arange(40).reshape(20, 2)
        data = np.zeros((20, 3))
        data[:, 0] = np.arange(20, dtype=int)  # Index row
        data[:, 1:3] = data_vals

        # np.inf replaces np.float('inf'): the np.float alias was deprecated
        # in NumPy 1.20 and removed in 1.24.
        limits = np.array([[np.inf, np.inf], [-10, 100]])
        cover = Cover(n_cubes=10, limits=limits)
        cover.define_bins(data)

        start = cover.d
        end = cover.end
        assert np.array_equal(np.array([start, end]),
                              np.array([[0, -10], [38, 100]]))
示例#11
0
    def test_entries_in_correct_cubes(self):
        """Rows 2*i and 2*i + 1 should both be found in cube i (20% overlap)."""
        # TODO: this test is a little hacky

        # column 0 is the row index, column 1 the lens value; both run 0..19
        ramp = np.arange(20, dtype=float)
        data = np.column_stack((ramp, ramp))

        cover = Cover(n_cubes=10, perc_overlap=0.2)
        cubes = list(cover.fit(data))
        entries = [cover.transform_single(data, cube) for cube in cubes]

        # Only the interior of each cube is checked. Sometimes the edges
        # don't line up.
        for idx in range(10):
            members = entries[idx]
            first_row = data[2 * idx]
            second_row = data[2 * idx + 1]
            assert first_row in members
            assert second_row in members
示例#12
0
    def test_perc_overlap(self, CoverClass):
        """
        2 cubes with 50% overlap and a range of [0,1] should lead to two cubes with intervals:
            [0, .75]
            [.25, 1]

        ``CoverClass`` is the cover implementation under test (presumably
        supplied by a pytest fixture or parametrization -- confirm in the
        enclosing test module).
        """

        data = np.array([[0, 0], [1, 0.25], [2, 0.5], [3, 0.75], [4, 1]])

        # Build the cover from the injected class; the parameter was
        # previously ignored in favour of a hard-coded Cover.
        cover = CoverClass(n_cubes=2, perc_overlap=0.5)
        cubes = cover.fit(data)
        cubes = list(cubes)
        entries = [cover.transform_single(data, cube) for cube in cubes]

        # first cube: lens values 0 .. 0.75
        for i in (0, 1, 2, 3):
            assert data[i] in entries[0]
        # second cube: lens values 0.25 .. 1
        for i in (1, 2, 3, 4):
            assert data[i] in entries[1]
示例#13
0
    def __init__(self,
                 projection=None,
                 scaler=None,
                 cover=None,
                 clusterer=None,
                 remove_duplicate_nodes=False,
                 memory='dyneusr_cache',
                 verbose=1):
        """ Wraps KeplerMapper

        Usage
        -----
            mapper = KMapperWrapper(projection=PCA(3), cover=dict(r=10, g=2))
            l = mapper.fit(X)
            g = mapper.map(l, X)

            # or
            g = mapper.fit_map(X)

        Parameters
        ----------
        projection : optional
            Stored as ``self.projection``; ``PCA(2)`` is used when None.
        scaler : optional
            Stored as ``self.scaler`` unchanged (may be None).
        cover : optional
            Stored as ``self.cover``; ``Cover(10, 0.5)`` is used when falsy.
        clusterer : optional
            Stored as ``self.clusterer``; ``DBSCAN(eps=1, min_samples=2)`` is
            used when falsy.
        remove_duplicate_nodes : bool
            Stored as ``self.remove_duplicate_nodes``.
        memory :
            First argument passed to ``Memory`` (presumably a cache
            location -- confirm against the ``Memory`` import).
        verbose : int
            Stored as ``self.verbose`` and forwarded to ``Memory``.

        Raises
        ------
        ImportError
            If kmapper cannot be imported.
        """
        try:
            from kmapper import KeplerMapper
            from kmapper.cover import Cover
        except ImportError as e:
            # Both names are local to this function, so carrying on after a
            # failed import would only surface later as a confusing
            # UnboundLocalError. Emit the warning, then re-raise the real cause.
            print("[warning]", e)
            raise

        # init mapper
        self.mapper = KeplerMapper()
        self.verbose = verbose

        # [1] fit params
        self.projection = projection if projection is not None else PCA(2)
        self.scaler = scaler  #or MinMaxScaler()

        # [2] map params
        self.clusterer = clusterer or DBSCAN(eps=1, min_samples=2)
        self.cover = cover or Cover(10, 0.5)
        self.remove_duplicate_nodes = remove_duplicate_nodes

        # setup memory
        self.memory = Memory(memory, verbose=verbose)
示例#14
0
    def test_125_replication(self):
        """All neighbouring cube pairs over uniform data should share equally many points."""
        # uniform data as a single lens column, prefixed with a row-ID column
        values = np.arange(0, 100)[:, np.newaxis]
        row_ids = np.arange(values.shape[0])
        lens = np.c_[row_ids, values]

        cov = Cover(10, 0.5)
        bins = cov.fit(lens)

        cube_entries = [cov.transform_single(lens, cube) for cube in bins]

        # number of shared row IDs for each pair of neighbouring cubes
        overlaps = []
        for left, right in zip(cube_entries, cube_entries[1:]):
            shared = set(left[:, 0]) & set(right[:, 0])
            overlaps.append(len(shared))

        distinct_counts = set(overlaps)
        assert len(distinct_counts) == 1, \
            "Each overlap should have the same number of entries. "
示例#15
0
 def test_diff_overlap_per_dim(self):
     """Constructing a Cover with a list for ``overlap_perc`` should not raise."""
     # The random sample is generated but never handed to the cover.
     _sample = np.random.rand(100, 10)
     Cover(overlap_perc=[2, 10])
示例#16
0
 def test_define_diff_bins_per_dim(self):
     """Per-dimension cube counts of 5 and 10 should give 5 * 10 cubes in total."""
     cover = Cover(n_cubes=[5, 10])
     lens = np.arange(30).reshape(10, 3)

     fitted = list(cover.fit(lens))
     assert len(fitted) == 5 * 10
示例#17
0
 def test_diff_overlap_per_dim(self):
     """Fitting with a per-dimension ``perc_overlap`` list should not raise."""
     sample = np.random.rand(100, 3)
     cover = Cover(perc_overlap=[0.4, 0.2])
     cover.fit(sample)
示例#18
0
    def test_cube_count(self):
        """A 3-column input with n_cubes=10 should yield 10**2 cubes."""
        cover = Cover(n_cubes=10)
        lens = np.arange(30).reshape(10, 3)

        n_found = len(list(cover.define_bins(lens)))
        assert n_found == 10**2, "idx column is ignored"
示例#19
0
 def test_bound_is_min(self):
     """The first row of ``bounds_`` after fit should read 1, 2, ... in order."""
     cov = Cover(n_cubes=10)
     cov.fit(np.arange(30).reshape(10, 3))

     # pair each bound with its expected value; zip stops at the shorter side
     for lower, expected in zip(cov.bounds_[0], range(1, 10)):
         assert lower == expected
示例#20
0
    def test_single_dim(self):
        """Every bin defined for a 2-column input should have length 1."""
        cover = Cover(n_cubes=10)
        lens = np.arange(20).reshape(10, 2)

        for cube in cover.define_bins(lens):
            assert len(cube) == 1
示例#21
0
    def test_nr_dimensions(self):
        """After define_bins on a 3-column input, ``nr_dimensions`` should be 2."""
        cover = Cover(n_cubes=10)
        cover.define_bins(np.arange(30).reshape(10, 3))

        assert cover.nr_dimensions == 2
示例#22
0
 def test_transform_runs_with_diff_bins(self):
     """transform_single should run when cube counts differ per dimension."""
     cover = Cover(n_cubes=[5, 10])
     lens = np.arange(30).reshape(10, 3)

     first_cube = list(cover.fit(lens))[0]
     cover.transform_single(lens, first_cube)
import numpy as np
from kmapper.cover import Cover

# Uniform data: the integers 0..999 as a single column.
data = np.arange(1000).reshape(-1, 1)
lens = data
cov = Cover(10, 0.5, verbose=0)


def overlap(c1, c2):
    """Return the share of distinct common values relative to the larger input.

    Parameters
    ----------
    c1, c2 : sized iterables of hashable values

    Returns
    -------
    float
        ``len(set(c1) & set(c2)) / max(len(c1), len(c2))``, or 0.0 when both
        inputs are empty.
    """
    largest = max(len(c1), len(c2))
    if largest == 0:
        # Two empty collections share nothing; avoid dividing by zero.
        return 0.0
    ints = set(c1).intersection(c2)
    return len(ints) / largest


# Prepend a row-ID column to the lens
ids = np.arange(lens.shape[0])
lens = np.c_[ids, lens]

# Fit the cover, then run the transform over the fitted bins
bins = cov.fit(lens)
cube_entries = cov.transform(lens, bins)

# Report the size of every hypercube
for i, hypercube in enumerate(cube_entries):
    print(
        "There are %s points in cube %s/%s"
        % (hypercube.shape[0], i, len(cube_entries))
    )

print()

# Report the overlap (on the ID column) of each pair of neighbouring cubes
for i, (c1, c2) in enumerate(zip(cube_entries, cube_entries[1:])):
    print("Overlap %s" % overlap(c1[:, 0], c2[:, 0]))
示例#24
0
 def test_find_entries_runs_with_diff_bins(self):
     """find_entries should run when cube counts differ per dimension."""
     cover = Cover(n_cubes=[5, 10])
     lens = np.arange(30).reshape(10, 3)

     first_cube = list(cover.define_bins(lens))[0]
     cover.find_entries(lens, first_cube)