def test_random_upsample(sampleset): samples = [('pos', 1), ('pos', 1), ('neg', 0)] stratified = sorted(util.upsample(samples, 1, rand=StableRandom(0))) assert stratified == [('neg', 0), ('neg', 0), ('pos', 1), ('pos', 1)] stratified1 = util.upsample(sampleset, 0, rand=StableRandom(0)) _, labelcnts = util.group_samples(stratified1, 0) assert labelcnts == {0: 50, 1: 50} stratified2 = util.upsample(sampleset, 0, rand=StableRandom(1)) assert stratified1 != stratified2, 'Order should be random'
def test_Shuffle(): data = list(range(50)) assert data >> Shuffle(100) >> Collect() != data assert data >> Shuffle(100) >> Collect(set) == set(data) assert data >> Shuffle(20) >> Collect() != data assert data >> Shuffle(20) >> Collect(set) == set(data) assert data >> Shuffle(1) >> Collect() == data shuffled1 = data >> Shuffle(10, rand=StableRandom(0)) >> Collect() shuffled2 = data >> Shuffle(10, rand=StableRandom(0)) >> Collect() assert shuffled1 == shuffled2
def test_Stratify(): samples = [('pos', 1), ('pos', 1), ('neg', 0)] stratify = Stratify(1, mode='up', rand=StableRandom(0)) stratified = samples >> stratify >> Sort() assert stratified == [('neg', 0), ('neg', 0), ('pos', 1), ('pos', 1)] samples = [('pos', 1), ('pos', 1), ('pos', 1), ('neg1', 0), ('neg2', 0)] stratify = Stratify(1, mode='downrnd', rand=StableRandom(0)) stratified = samples >> stratify >> Sort() assert stratified == [('neg1', 0), ('neg2', 0), ('pos', 1), ('pos', 1)] with pytest.raises(ValueError) as ex: samples >> Stratify(1, mode='invalid') >> Collect() assert str(ex.value).startswith('Unknown mode')
def test_SplitRandom_constraint(): same_letter = lambda t: t[0] data = zip('aabbccddee', range(10)) train, val = data >> SplitRandom(rand=StableRandom(0), ratio=0.6, constraint=same_letter) >> Collect() print(train) print(val) assert train == [('a', 1), ('a', 0), ('d', 7), ('b', 2), ('d', 6), ('b', 3)] assert val == [('c', 5), ('e', 8), ('e', 9), ('c', 4)]
def test_Pick(): assert Range(5) >> Pick(1) >> Collect() == [0, 1, 2, 3, 4] assert Range(5) >> Pick(2) >> Collect() == [0, 2, 4] with pytest.raises(ValueError) as ex: [1, 2, 3] >> Pick(-1) >> Consume() assert str(ex.value).startswith('p_n must not be negative') assert Range(5) >> Pick(0.5, StableRandom(1)) >> Collect() == [0, 4] assert Range(5) >> Pick(0.7, StableRandom(0)) >> Collect() == [0, 1, 4] assert Range(10) >> Pick(1.0) >> Count() == 10 assert Range(10) >> Pick(0.0) >> Count() == 0 assert (Range(100) >> Pick(0.3) >> Collect(set)).issubset(set(range(100))) with pytest.raises(ValueError) as ex: [1, 2, 3] >> Pick(1.1) >> Consume() assert str(ex.value).startswith('Probability must be in [0, 1]') with pytest.raises(ValueError) as ex: [1, 2, 3] >> Pick(-0.1) >> Consume() assert str(ex.value).startswith('Probability must be in [0, 1]')
def test_Stratify(): samples = [('pos', 1)] * 1000 + [('neg', 0)] * 100 dist = samples >> CountValues(1) stratify = Stratify(1, dist, rand=StableRandom(0)) stratified1 = samples >> stratify >> Collect() stratified2 = samples >> stratify >> Collect() assert stratified1 != stratified2 dist1 = stratified1 >> Get(1) >> CountValues() print(dist1) assert dist1[0] == 100 assert 90 < dist1[1] < 110 dist2 = stratified2 >> Get(1) >> CountValues() print(dist2) assert dist1[0] == 100 assert 90 < dist1[1] < 110
def test_SplitRandom_seed(): split1 = range(10) >> SplitRandom(rand=StableRandom(0)) split2 = range(10) >> SplitRandom(rand=StableRandom(0)) split3 = range(10) >> SplitRandom(rand=StableRandom(1)) assert split1 == split2 assert split1 != split3
def test_StableRandom(): rnd = StableRandom(1) assert rnd.randint(1, 10) == 5 assert rnd.uniform(-10, 10) == approx(9.943696167306904) assert rnd.random() == approx(0.7203244894557457) assert rnd.sample(range(10), 3) == [9, 0, 1] lst = [1, 2, 3, 4, 5] rnd.shuffle(lst) assert lst == [5, 3, 1, 4, 2] rnd = StableRandom() assert max(rnd.random() for _ in range(1000)) < 1.0 assert min(rnd.random() for _ in range(1000)) >= 0.0 rnd1, rnd2 = StableRandom(0), StableRandom(0) for _ in range(100): assert rnd1.random() == rnd2.random() rnd1, rnd2 = StableRandom(0), StableRandom(0) rnd2.jumpahead(10) for _ in range(100): assert rnd1.random() != rnd2.random() rnd2.setstate(rnd1.getstate()) for _ in range(100): assert rnd1.random() == rnd2.random() rnd1, rnd2 = StableRandom(0), StableRandom(1) for _ in range(100): assert rnd1.random() != rnd2.random() rnd1 = StableRandom() sleep(0.5) # seed is based on system time. rnd2 = StableRandom() for _ in range(100): assert rnd1.random() != rnd2.random() rnd = StableRandom() numbers = [rnd._randbelow(10) for _ in range(1000)] assert max(numbers) < 10 assert min(numbers) >= 0 rnd = StableRandom() numbers = [rnd.gauss_next() for _ in range(10000)] my, std = numbers >> MeanStd() assert 0.0 == approx(my, abs=0.1) assert 1.0 == approx(std, abs=0.1)
def SplitRandom(iterable, ratio=0.7, constraint=None, rand=None): """ Randomly split iterable into partitions. For the same input data the same split is created every time and is stable across different Python version 2.x or 3.x. A random number generator can be provided to create varying splits. >>> train, val = range(10) >> SplitRandom(ratio=0.7) >>> train, val ([6, 3, 1, 7, 0, 2, 4], [5, 9, 8]) >>> range(10) >> SplitRandom(ratio=0.7) # Same split again [[6, 3, 1, 7, 0, 2, 4], [5, 9, 8]] >>> train, val, test = range(10) >> SplitRandom(ratio=(0.6, 0.3, 0.1)) >>> train, val, test ([6, 1, 4, 0, 3, 2], [8, 7, 9], [5]) >>> data = zip('aabbccddee', range(10)) >>> same_letter = lambda t: t[0] >>> train, val = data >> SplitRandom(ratio=0.6, constraint=same_letter) >>> train [('a', 1), ('a', 0), ('d', 7), ('b', 2), ('d', 6), ('b', 3)] >>> val [('c', 5), ('e', 8), ('e', 9), ('c', 4)] :param iterable iterable: Iterable over anything. Will be consumed! :param float|tuple ratio: Ratio of two partition e.g. a ratio of 0.7 means 70%, 30% split. Alternatively a list or ratios can be provided, e.g. ratio=(0.6, 0.3, 0.1). Note that ratios must sum up to one. :param function|None constraint: Function that returns key the elements of the iterable are grouped by before partitioning. Useful to ensure that a partition contains related elements, e.g. left and right eye images are not scattered across partitions. Note that constrains have precedence over ratios. :param Random|None rand: Random number generator. The default None ensures that the same split is created every time SplitRandom is called. This is important when continuing an interrupted training session or running the same training on machines with different Python versions. Note that Python's random.Random(0) generates different number for Python 2.x and 3.x! :return: partitions of iterable with sizes according to provided ratios. :rtype: (list, list, ...) """ rand = StableRandom(0) if rand is None else rand samples = list(iterable) if hasattr(ratio, '__iter__'): ratios = tuple(ratio) if abs(sum(ratios) - 1.0) > 1e-6: raise ValueError('Ratios must sum up to one: ' + str(ratios)) else: ratios = (ratio, 1.0 - ratio) ns = [int(len(samples) * r) for r in ratios] if constraint is None: groups = [[s] for s in samples] else: # sort to make stable across python 2.x, 3.x groups = sorted(group_by(samples, constraint).values()) rand.shuffle(groups) groups = iter(groups) splits = [] def append(split): rand.shuffle(split) splits.append(split) for n in ns[:-1]: split = [] for group in groups: split.extend(group) if len(split) >= n: append(split) break append([e for g in groups for e in g]) # append remaining groups return splits
def test_shuffle_sublists(): sublists = [[1, 2, 3], [4, 5, 6, 7]] util.shuffle_sublists(sublists, StableRandom(0)) assert sublists == [[1, 3, 2], [4, 5, 7, 6]]