def generate_block(*args, overlap):
    '''
    Generate a pair of conjunctions whose per-attribute constraints overlap
    in the requested way.

    Each positional argument adds constraints on its own attribute, named
    'x0', 'x1', ... by position:

    - 'num': an integer range (Ge/Le) on the attribute.
    - 'dt': the same kind of range, with the bounds converted through
      datetime.fromordinal.
    - 'arg': a set membership (In) constraint.

    overlap (keyword only) must be one of {'partial', 'none', 'superset'}
    and controls how the second expression's constraints relate to the
    first's: shifted so the ranges/sets partially overlap, shifted so they
    do not overlap at all, or widened so the second contains the first.

    Returns a tuple (And(clauses1), And(clauses2)).

    Raises ValueError for an unrecognised overlap or positional argument.
    '''
    # Fail early and clearly: an unknown overlap would otherwise leave the
    # shifts as None and only blow up later inside integer arithmetic (or
    # silently build In(attr, None) for 'arg').
    if overlap not in ('partial', 'none', 'superset'):
        raise ValueError(overlap)

    clauses1 = []
    clauses2 = []
    for i, arg in enumerate(args):
        attr = Attribute('x{}'.format(i))
        if arg == 'dt' or arg == 'num':
            # The first range is [start_dt, start_dt + 1000].
            start_dt = random.randint(2100, 3000)
            end_dt = start_dt + 1000
            # The start shift is always drawn before the end shift, so the
            # sequence of random draws is stable for a given seed.
            if overlap == 'partial':
                start_shift = random.randint(200, 500)
                end_shift = random.randint(200, 500)
            elif overlap == 'none':
                # Shifts exceed the range width (1000), so no overlap.
                start_shift = random.randint(2000, 2500)
                end_shift = random.randint(2000, 2500)
            else:
                # 'superset': start moves down, end moves up.
                start_shift = random.randint(-500, -200)
                end_shift = random.randint(200, 500)
            bounds1 = (start_dt, end_dt)
            bounds2 = (start_dt + start_shift, end_dt + end_shift)
            if arg == 'dt':
                bounds1 = tuple(datetime.fromordinal(b) for b in bounds1)
                bounds2 = tuple(datetime.fromordinal(b) for b in bounds2)
            clauses1.extend([Ge(attr, bounds1[0]), Le(attr, bounds1[1])])
            clauses2.extend([Ge(attr, bounds2[0]), Le(attr, bounds2[1])])
        elif arg == 'arg':
            # First set: four values in [0, 10]. Values drawn from [20, 30]
            # can never collide with them.
            valueset = [random.randint(0, 10) for _ in range(4)]
            clauses1.append(In(attr, valueset))
            if overlap == 'partial':
                other = valueset[:2] + [
                    random.randint(20, 30) for _ in range(2)]
            elif overlap == 'none':
                other = [random.randint(20, 30) for _ in range(4)]
            else:
                # 'superset': everything in the first set plus extras.
                other = valueset + [
                    random.randint(20, 30) for _ in range(2)]
            clauses2.append(In(attr, other))
        else:
            raise ValueError(arg)
    return And(clauses1), And(clauses2)
def bench(path, name):
    '''
    Time DNF expansion for every step of a path.

    path is an iterable of 3-item records; the first item is ignored and the
    other two are the current and cached expressions. For each record one
    dict is yielded containing the given name, the record index, the number
    of clauses in the intersection expression, and the time taken by a
    single run of expand_dnf_simplify on the intersection
    (current AND cached) and on the difference (current AND NOT cached).
    '''
    for index, record in enumerate(path):
        _, current, cached = record
        both = And([current, cached])
        only_current = And([current, Not(cached)])

        n_clauses = len(get_clauses(both))
        # One repeat of one run each, so min() just unwraps the single
        # timing returned by timeit.repeat.
        time_both = min(timeit.repeat(
            lambda: expand_dnf_simplify(both), number=1, repeat=1))
        time_difference = min(timeit.repeat(
            lambda: expand_dnf_simplify(only_current), number=1, repeat=1))

        yield {
            'name': name,
            'ind': index,
            'clauses': n_clauses,
            'intersection': time_both,
            'difference': time_difference,
        }
def test_simplify_flat_and_fuzz(clauses):
    '''
    Fuzz check for simplify_flat_and.

    For now this only verifies that simplification runs without error and
    never produces more clauses than it was given. It should eventually
    validate the algorithm's guarantees by checking against a data set.
    '''
    simplified = simplify_flat_and(And(clauses))
    n_input = len(clauses)
    # Anything that is not exactly an And counts as a single clause.
    if type(simplified) is And:
        n_output = len(simplified.clauses)
    else:
        n_output = 1
    assert n_output <= n_input
    # Record which kind of outcome this example produced.
    if simplified is False:
        event('Simplified False')
    elif n_output < n_input:
        event('Shortened')
]), # Exact repetition. (None, [ (Le(X, 3), Le(X, 3)), (Le(X, 3), None), ]), # Following queries are subsets. (None, [ (Le(X, 3), Le(X, 3)), (Le(X, 2), None), (Le(X, 1), None), ]), # Following queries overlap partially. (None, [ (Le(X, 1), Le(X, 1)), (Le(X, 3), And([Gt(X, 1), Le(X, 3)])), ]), (None, [ (Le(X, 2), Le(X, 2)), (Ge(X, 1), Gt(X, 2)), ]), # Multiple cache records stored. (None, [ (Le(X, 1), Le(X, 1)), (Ge(X, 3), Ge(X, 3)), (Le(X, 0), None), (Ge(X, 4), None), ]), # Assembling a result from partial cached queries. (None, [ (Le(X, 2), Le(X, 2)),
def remainder(expr1, expr2):
    ''' Return the simplified form of (expr1 AND NOT expr2). '''
    return simplify(And([expr1, Not(expr2)]))
def remains(expr1, expr2):
    '''
    Return whether anything is left of expr1 once expr2 is excluded, i.e.
    whether (expr1 AND NOT expr2) simplifies to something other than False.
    '''
    leftover = simplify(And([expr1, Not(expr2)]))
    return leftover is not False
def intersection(expr1, expr2):
    ''' Return the simplified form of (expr1 AND expr2). '''
    return simplify(And([expr1, expr2]))
def intersects(expr1, expr2):
    '''
    Return whether expr1 intersects with expr2, i.e. whether
    (expr1 AND expr2) simplifies to something other than False.
    '''
    common = simplify(And([expr1, expr2]))
    return common is not False
'''
Profiling a large-ish DNF expansion shows a couple of things:

- A few operations on boolean logic can be short-circuited.
- There is a lot of hashing done. For example, any creation of an And/Or
  adds expressions to a set, and relations are stored in a dict to calculate
  the truth table. Both structures are hash table backed, but the first is
  likely unnecessary and the second usually has very small tables.

Hashing seems to be faster than comparison, so the structure is probably
still the correct one; we just need to minimise the number of lookups.

Run with:

    python -m cProfile -o profile.out testcase.py && cprofilev -f profile.out
'''
import json

from split_query.core import object_hook, And, Not
from split_query.core.expand import expand_dnf_simplify

# Decode the stored records, rebuilding expression objects via object_hook.
with open('path_persistent.json') as infile:
    records = json.load(infile, object_hook=object_hook)

# Profile the record at index 3: its first item is ignored, the other two
# are the expressions to combine.
_, e1, e2 = records[3]
expand_dnf_simplify(And([e1, Not(e2)]))
def test_filter_between(dataset):
    '''
    Return dataset filtered with x.between(1, 3), paired with the expression
    And([x >= 1, x <= 3]).

    NOTE(review): presumably whatever consumes this pair compares the
    filter against the expression; that code is not visible here.
    '''
    filtered = dataset[dataset.x.between(1, 3)]
    x = Attribute('x')
    expression = And([Ge(x, 1), Le(x, 3)])
    return filtered, expression
def test_filter_and(dataset):
    '''
    Return dataset filtered with (y < 2) & (x > 5), paired with the
    expression And([y < 2, x > 5]).

    NOTE(review): presumably whatever consumes this pair compares the
    filter against the expression; that code is not visible here.
    '''
    mask = (dataset.y < 2) & (dataset.x > 5)
    filtered = dataset[mask]
    expression = And([Lt(Attribute('y'), 2), Gt(Attribute('x'), 5)])
    return filtered, expression
def test_filter_chained(dataset):
    '''
    Return dataset filtered by x <= 1 and then, chained on that result, by
    z >= 0, paired with the expression And([x <= 1, z >= 0]).

    NOTE(review): presumably whatever consumes this pair compares the
    filter against the expression; that code is not visible here.
    '''
    # Apply the first filter before building the second condition, then
    # index the already-filtered result with it.
    first_pass = dataset[dataset.x <= 1]
    filtered = first_pass[dataset.z >= 0]
    expression = And([Le(Attribute('x'), 1), Ge(Attribute('z'), 0)])
    return filtered, expression
(x.isin([1, 2, 3]) & x.isin([2, 3, 4]), x.isin([2, 3])), (~x.isin([1, 2, 3]) & ~x.isin([2, 3, 4]), ~x.isin([1, 2, 3, 4])), (x.isin([1, 2, 3]) & ~x.isin([2, 3, 4]), (x == 1)), (x.isin([1, 2, 3]) & x.isin([4, 5, 6]), False), (x.isin([1, 2, 3]) & ~x.isin([1, 2, 3, 4]), False), # Combined bounds + sets. ((x == 1) & (x >= 0), (x == 1)), ((x == 1) & (x == 2), False), (x.isin([0, 1, 2]) & (x < 2), x.isin([0, 1])), (x.isin([1, 2, 3]) & (x <= 2), x.isin([1, 2])), (x.isin([1, 2, 3]) & (x > 3), False), # Edge cases (~(x == 0) & x.isin([0]), False), (~(x == 0) & (x == 0), False), (~(~(x == 0)) & (y == 0), (x == 0) & (y == 0)), (And([True]), True), (And([False]), False), # Found in cache tests. ((x >= 2014) & (x < 2015) & ~(x == 2015), (x >= 2014) & (x < 2015)), ((x >= 2015) & (x <= 2015) & ~(x == 2015), False), ] @pytest.mark.parametrize('expression, simplified', TESTCASES) def test_simplify_flat_and(expression, simplified): ''' Obviously simplifiable cases to define algorithm behaviour. Should be reducible to a simpler set. ''' if type(expression) is ExpressionContainer: expression = expression.wrapped if type(simplified) is ExpressionContainer: simplified = simplified.wrapped
import pytest from split_query.core import Attribute, And, Or, Not, In, Ge, Le, Lt, Gt, Eq from split_query.extract import extract_parameters, split_parameters XVAR = Attribute('x') YVAR = Attribute('y') TESTCASES_EXTRACT = [ (In(XVAR, [1, 2, 3]), [dict(attr='x', type='tag', key='xtags', single=False)], [(In(XVAR, [1, 2, 3]), dict(xtags={1, 2, 3}))]), (And([In(XVAR, [1, 2, 3]), Ge(YVAR, 2), Le(YVAR, 4)]), [ dict(attr='x', type='tag', key='xtags', single=False), dict(attr='y', type='range', key_lower='from_y', key_upper='to_y') ], [(And([In(XVAR, [1, 2, 3]), Ge(YVAR, 2), Le(YVAR, 4)]), dict(xtags={1, 2, 3}, from_y=2, to_y=4))]), (And([In(XVAR, [1, 2, 3]), In(YVAR, [4, 5, 6])]), [ dict(attr='x', type='tag', key='xtag', single=True), dict(attr='y', type='tag', key='ytags', single=False) ], [ (And([In(XVAR, [1]), In(YVAR, [4, 5, 6])]), dict(xtag=1, ytags={4, 5, 6})), (And([In(XVAR, [2]), In(YVAR, [4, 5, 6])]), dict(xtag=2, ytags={4, 5, 6})),