def groupcountdistinctvalues(table, key, value): """Group by the `key` field then count the number of distinct values in the `value` field.""" s1 = cut(table, key, value) s2 = distinct(s1) s3 = aggregate(s2, key, len) return s3
def test_key_distinct_2(): # test for https://github.com/alimanfoo/petl/issues/318 tbl = (('a', 'b'), ('x', '1'), ('x', '3'), ('y', '1'), (None, None)) result = distinct(tbl, key='b') expect = (('a', 'b'), (None, None), ('x', '1'), ('x', '3')) ieq(expect, result)
def test_distinct_count(): table = (('foo', 'bar', 'baz'), ('A', 1, 2), ('B', '2', '3.4'), ('B', '2', '3.4'), ('D', 4, 12.3)) result = distinct(table, count='count') expect = (('foo', 'bar', 'baz', 'count'), ('A', 1, 2, 1), ('B', '2', '3.4', 2), ('D', 4, 12.3, 1)) ieq(expect, result)
def test_distinct(): table = (('foo', 'bar', 'baz'), ('A', 1, 2), ('B', '2', '3.4'), ('B', '2', '3.4'), ('D', 4, 12.3)) result = distinct(table) expect = (('foo', 'bar', 'baz'), ('A', 1, 2), ('B', '2', '3.4'), ('D', 4, 12.3)) ieq(expect, result)
def test_key_distinct(): table = (('foo', 'bar', 'baz'), (None, None, None), ('A', 1, 2), ('B', '2', '3.4'), ('B', '2', '5'), ('D', 4, 12.3)) result = distinct(table, key='foo') expect = (('foo', 'bar', 'baz'), (None, None, None), ('A', 1, 2), ('B', '2', '3.4'), ('D', 4, 12.3)) ieq(expect, result)
def test_key_distinct_2(): # test for https://github.com/alimanfoo/petl/issues/318 tbl = (('a', 'b'), ('x', '1'), ('x', '3'), ('y', '1')) result = distinct(tbl, key='b') expect = (('a', 'b'), ('x', '1'), ('x', '3')) ieq(expect, result)
def test_key_distinct_count(): table = (('foo', 'bar', 'baz'), ('A', 1, 2), ('B', '2', '3.4'), ('B', '2', '5'), ('D', 4, 12.3)) result = distinct(table, key='foo', count='count') expect = (('foo', 'bar', 'baz', 'count'), ('A', 1, 2, 1), ('B', '2', '3.4', 2), ('D', 4, 12.3, 1)) ieq(expect, result)
def unjoin(table, value, key=None, autoincrement=(1, 1), presorted=False, buffersize=None, tempdir=None, cache=True): """ Split a table into two tables by reversing an inner join. E.g., if the join key is present in the table:: >>> from petl import look, unjoin >>> look(table1) +-------+-------+----------+ | 'foo' | 'bar' | 'baz' | +=======+=======+==========+ | 'A' | 1 | 'apple' | +-------+-------+----------+ | 'B' | 1 | 'apple' | +-------+-------+----------+ | 'C' | 2 | 'orange' | +-------+-------+----------+ >>> table2, table3 = unjoin(table1, 'baz', key='bar') >>> look(table2) +-------+-------+ | 'foo' | 'bar' | +=======+=======+ | 'A' | 1 | +-------+-------+ | 'B' | 1 | +-------+-------+ | 'C' | 2 | +-------+-------+ >>> look(table3) +-------+----------+ | 'bar' | 'baz' | +=======+==========+ | 1 | 'apple' | +-------+----------+ | 2 | 'orange' | +-------+----------+ An integer join key can also be reconstructed, e.g.:: >>> look(table4) +-------+----------+ | 'foo' | 'bar' | +=======+==========+ | 'A' | 'apple' | +-------+----------+ | 'B' | 'apple' | +-------+----------+ | 'C' | 'orange' | +-------+----------+ >>> table5, table6 = unjoin(table4, 'bar') >>> look(table5) +-------+----------+ | 'foo' | 'bar_id' | +=======+==========+ | 'A' | 1 | +-------+----------+ | 'B' | 1 | +-------+----------+ | 'C' | 2 | +-------+----------+ >>> look(table6) +------+----------+ | 'id' | 'bar' | +======+==========+ | 1 | 'apple' | +------+----------+ | 2 | 'orange' | +------+----------+ .. versionadded:: 0.12 """ if key is None: # first sort the table by the value field if presorted: tbl_sorted = table else: tbl_sorted = sort(table, value, buffersize=buffersize, tempdir=tempdir, cache=cache) # on the left, return the original table but with the value field # replaced by an incrementing integer left = ConvertToIncrementingCounterView(tbl_sorted, value, autoincrement) # on the right, return a new table with distinct values from the # given field right = EnumerateDistinctView(tbl_sorted, value, autoincrement) else: # on the left, return distinct rows from the original table # with the value field cut out left = distinct(cutout(table, value)) # on the right, return distinct rows from the original table # with all fields but the key and value cut out right = distinct(cut(table, key, value)) return left, right
def unjoin(table, value, key=None, autoincrement=(1, 1), presorted=False, buffersize=None, tempdir=None, cache=True): """ Split a table into two tables by reversing an inner join. E.g.:: >>> import petl as etl >>> # join key is present in the table ... table1 = (('foo', 'bar', 'baz'), ... ('A', 1, 'apple'), ... ('B', 1, 'apple'), ... ('C', 2, 'orange')) >>> table2, table3 = etl.unjoin(table1, 'baz', key='bar') >>> table2 +-----+-----+ | foo | bar | +=====+=====+ | 'A' | 1 | +-----+-----+ | 'B' | 1 | +-----+-----+ | 'C' | 2 | +-----+-----+ >>> table3 +-----+----------+ | bar | baz | +=====+==========+ | 1 | 'apple' | +-----+----------+ | 2 | 'orange' | +-----+----------+ >>> # an integer join key can also be reconstructed ... table4 = (('foo', 'bar'), ... ('A', 'apple'), ... ('B', 'apple'), ... ('C', 'orange')) >>> table5, table6 = etl.unjoin(table4, 'bar') >>> table5 +-----+--------+ | foo | bar_id | +=====+========+ | 'A' | 1 | +-----+--------+ | 'B' | 1 | +-----+--------+ | 'C' | 2 | +-----+--------+ >>> table6 +----+----------+ | id | bar | +====+==========+ | 1 | 'apple' | +----+----------+ | 2 | 'orange' | +----+----------+ The `autoincrement` parameter controls how an integer join key is reconstructed, and should be a tuple of (`start`, `step`). """ if key is None: # first sort the table by the value field if presorted: tbl_sorted = table else: tbl_sorted = sort(table, value, buffersize=buffersize, tempdir=tempdir, cache=cache) # on the left, return the original table but with the value field # replaced by an incrementing integer left = ConvertToIncrementingCounterView(tbl_sorted, value, autoincrement) # on the right, return a new table with distinct values from the # given field right = EnumerateDistinctView(tbl_sorted, value, autoincrement) else: # on the left, return distinct rows from the original table # with the value field cut out left = distinct(cutout(table, value)) # on the right, return distinct rows from the original table # with all fields but the key and value cut out right = distinct(cut(table, key, value)) return left, right