def test_values():
    """Check ``values`` for single fields, field pairs and a short row."""

    table = (('foo', 'bar', 'baz'),
             ('a', 1, True),
             ('b', 2),
             ('b', 7, False))

    def check(expected, observed):
        # every comparison is deliberately made twice against the same result
        # (presumably to confirm it can be iterated repeatedly -- verify)
        ieq(expected, observed)
        ieq(expected, observed)

    check(('a', 'b', 'b'), values(table, 'foo'))
    check((1, 2, 7), values(table, 'bar'))

    pairs = (('a', 1), ('b', 2), ('b', 7))

    # old style signature for multiple fields, still supported
    check(pairs, values(table, ('foo', 'bar')))

    # as of 0.24 new style signature for multiple fields
    check(pairs, values(table, 'foo', 'bar'))

    # the row ('b', 2) has no 'baz' value and is expected to yield None
    check((True, None, False), values(table, 'baz'))
def typecounter(table, field):
    """
    Count the number of values found for each Python type.

        >>> import petl as etl
        >>> table = [['foo', 'bar', 'baz'],
        ...          ['A', 1, 2],
        ...          ['B', u'2', '3.4'],
        ...          [u'B', u'3', u'7.8', True],
        ...          ['D', u'xyz', 9.0],
        ...          ['E', 42]]
        >>> etl.typecounter(table, 'foo')
        Counter({'str': 5})
        >>> etl.typecounter(table, 'bar')
        Counter({'str': 3, 'int': 2})
        >>> etl.typecounter(table, 'baz')
        Counter({'str': 2, 'int': 1, 'NoneType': 1, 'float': 1})

    The `field` argument can be a field name or index (starting from zero).

    Returns a ``Counter`` mapping type names to the number of values of that
    type. As the ``'baz'`` example shows, a row too short to hold the field
    is counted under ``'NoneType'``.

    """

    counter = Counter()
    for v in values(table, field):
        # No IndexError guard here: incrementing a Counter entry cannot raise
        # it, and short rows already arrive as None (see the example above).
        counter[v.__class__.__name__] += 1
    return counter
def valuecounter(table, *field, **kwargs):
    """
    Find distinct values for the given field and count the number of
    occurrences. Returns a ``Counter`` (a :class:`dict` subclass) mapping
    values to counts. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar'],
        ...          ['a', True],
        ...          ['b'],
        ...          ['b', True],
        ...          ['c', False]]
        >>> etl.valuecounter(table, 'foo')
        Counter({'b': 2, 'c': 1, 'a': 1})

    The `field` argument can be a single field name or index (starting from
    zero) or a tuple of field names and/or indexes.

    The only keyword argument read is `missing` (default ``None``), which is
    forwarded to ``values``.

    """

    missing = kwargs.get('missing', None)
    counter = Counter()
    for v in values(table, field, missing=missing):
        # No IndexError guard here: incrementing a Counter entry cannot raise
        # it, and the iteration itself was never covered by the old guard.
        counter[v] += 1
    return counter
def valuecount(table, field, value, missing=None):
    """
    Count the number of occurrences of `value` under the given field. Returns
    the absolute count and relative frequency as a pair. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar'],
        ...          ['a', 1],
        ...          ['b', 2],
        ...          ['b', 7]]
        >>> etl.valuecount(table, 'foo', 'b')
        (2, 0.6666666666666666)

    The `field` argument can be a single field name or index (starting from
    zero) or a tuple of field names and/or indexes.

    The `missing` argument is forwarded to ``values``.

    If the field yields no values at all (e.g. a table with no data rows),
    ``(0, 0.0)`` is returned rather than dividing by zero.

    """

    total = 0
    vs = 0
    for v in values(table, field, missing=missing):
        total += 1
        if v == value:
            vs += 1
    if not total:
        # nothing was iterated, so the relative frequency has no denominator
        return 0, 0.0
    return vs, float(vs) / total
def typeset(table, field):
    """
    Return a set containing the names of all Python types found for values in
    the given field. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar', 'baz'],
        ...          ['A', 1, '2'],
        ...          ['B', u'2', '3.4'],
        ...          [u'B', u'3', '7.8', True],
        ...          ['D', u'xyz', 9.0],
        ...          ['E', 42]]
        >>> sorted(etl.typeset(table, 'foo'))
        ['str']
        >>> sorted(etl.typeset(table, 'bar'))
        ['int', 'str']
        >>> sorted(etl.typeset(table, 'baz'))
        ['NoneType', 'float', 'str']

    The `field` argument can be a field name or index (starting from zero).

    As the ``'baz'`` example shows, a row too short to hold the field
    contributes ``'NoneType'``.

    """

    # No IndexError guard here: adding a type name to a set cannot raise it,
    # and short rows already arrive as None (see the example above).
    return set(type(v).__name__ for v in values(table, field))
def limits(table, field):
    """
    Find minimum and maximum values under the given field. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar'], ['a', 1], ['b', 2], ['b', 3]]
        >>> minv, maxv = etl.limits(table, 'bar')
        >>> minv
        1
        >>> maxv
        3

    The `field` argument can be a field name or index (starting from zero).

    Returns a ``(minimum, maximum)`` pair, or ``(None, None)`` when the field
    yields no values.

    """

    remaining = iter(values(table, field))
    try:
        first = next(remaining)
    except StopIteration:
        # nothing to compare
        return None, None

    # seed both extremes with the first value, then scan the rest once
    lowest = highest = first
    for candidate in remaining:
        if candidate < lowest:
            lowest = candidate
        if candidate > highest:
            highest = candidate
    return lowest, highest
def __init__(self, *args, **kwargs):
    """
    Initialise from either two or three positional arguments.

    * ``(input, period)`` -- the first argument is stored as-is in
      ``self.input``.
    * ``(table, field, period)`` -- ``self.input`` is set to
      ``values(table, field)``.

    In both forms the last positional argument is stored in ``self.period``.
    The optional keyword argument `missing` (default ``None``) is stored in
    ``self.missing``; other keyword arguments are ignored.

    Raises ``AssertionError`` for any other number of positional arguments.

    """
    if len(args) == 2:
        self.input = args[0]
        self.period = args[1]
    elif len(args) == 3:
        self.input = values(args[0], args[1])
        self.period = args[2]
    else:
        # Raised explicitly (not via ``assert``) so the check still runs
        # under ``python -O``; type and message are unchanged for callers.
        raise AssertionError('invalid arguments')
    self.missing = kwargs.get('missing', None)
def diffvalues(t1, t2, f):
    """
    Return the difference between the values under the given field in the two
    tables, e.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 3]]
        >>> table2 = [['bar', 'foo'],
        ...           [1, 'a'],
        ...           [3, 'c']]
        >>> add, sub = etl.diffvalues(table1, table2, 'foo')
        >>> add
        {'c'}
        >>> sub
        {'b'}

    The result is a pair of sets: values found only in `t2`, followed by
    values found only in `t1`.

    """

    first_seen = set(values(t1, f))
    second_seen = set(values(t2, f))
    only_in_second = second_seen - first_seen
    only_in_first = first_seen - second_seen
    return only_in_second, only_in_first
def stringpatterncounter(table, field):
    """
    Profile string patterns in the given field, returning a :class:`dict`
    mapping patterns to counts.

    Each value is converted with ``str`` and then translated so that ASCII
    upper-case letters become ``A``, lower-case letters become ``a`` and
    digits become ``9``; the translated string is the pattern that is counted.

    """

    to_pattern = maketrans(
        'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
        'AAAAAAAAAAAAAAAAAAAAAAAAAAaaaaaaaaaaaaaaaaaaaaaaaaaa9999999999'
    )
    return Counter(
        str(item).translate(to_pattern) for item in values(table, field)
    )
def parsecounter(table, field, parsers=(('int', int), ('float', float))):
    """
    Count the number of `str` or `unicode` values under the given fields that
    can be parsed as ints, floats or via custom parser functions. Return a
    pair of `Counter` objects, the first mapping parser names to the number of
    strings successfully parsed, the second mapping parser names to the
    number of errors. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar', 'baz'],
        ...          ['A', 'aaa', 2],
        ...          ['B', u'2', '3.4'],
        ...          [u'B', u'3', u'7.8', True],
        ...          ['D', '3.7', 9.0],
        ...          ['E', 42]]
        >>> counter, errors = etl.parsecounter(table, 'bar')
        >>> counter
        Counter({'float': 3, 'int': 2})
        >>> errors
        Counter({'int': 2, 'float': 1})

    The `field` argument can be a field name or index (starting from zero).

    The `parsers` argument is either a mapping of names to callables or a
    list/tuple of ``(name, callable)`` pairs. A parser counts as failed when
    it raises any ``Exception``; ``KeyboardInterrupt`` and ``SystemExit`` are
    not swallowed. Non-string values are skipped entirely.

    """

    if isinstance(parsers, (list, tuple)):
        parsers = dict(parsers)
    counter, errors = Counter(), Counter()
    # need to initialise so every parser name is present even with zero hits
    for name in parsers:
        counter[name] = 0
        errors[name] = 0
    for v in values(table, field):
        if isinstance(v, string_types):
            for name, parser in parsers.items():
                try:
                    parser(v)
                except Exception:
                    # arbitrary user parsers may raise anything; count it as a
                    # parse failure, but let interpreter-exit signals through
                    errors[name] += 1
                else:
                    counter[name] += 1
    return counter, errors
def collapsedintervals(table, start='start', stop='stop', key=None):
    """
    Utility function to collapse intervals in a table.

    If no facet `key` is given, returns an iterator over `(start, stop)`
    tuples.

    If facet `key` is given, returns an iterator over `(key, start, stop)`
    tuples.

    """

    if key is not None:
        # faceted: order by (key, start), then collapse each key's group
        ordered = sort(table, key=(key, start))
        grouped = rowgroupby(ordered, key=key, value=(start, stop))
        for facet_value, group in grouped:
            for interval in _collapse(group):
                yield (facet_value,) + interval
    else:
        # unfaceted: order by start and collapse the (start, stop) pairs
        ordered = sort(table, key=start)
        for interval in _collapse(values(ordered, (start, stop))):
            yield interval
def facet(table, key):
    """
    Return a dictionary mapping field values to tables. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['a', 4, 9.3],
        ...           ['a', 2, 88.2],
        ...           ['b', 1, 23.3],
        ...           ['c', 8, 42.0],
        ...           ['d', 7, 100.9],
        ...           ['c', 2]]
        >>> foo = etl.facet(table1, 'foo')
        >>> sorted(foo.keys())
        ['a', 'b', 'c', 'd']
        >>> foo['a']
        +-----+-----+------+
        | foo | bar | baz  |
        +=====+=====+======+
        | 'a' |   4 |  9.3 |
        +-----+-----+------+
        | 'a' |   2 | 88.2 |
        +-----+-----+------+

        >>> foo['c']
        +-----+-----+------+
        | foo | bar | baz  |
        +=====+=====+======+
        | 'c' |   8 | 42.0 |
        +-----+-----+------+
        | 'c' |   2 |      |
        +-----+-----+------+

    See also :func:`petl.util.materialise.facetcolumns`.

    """

    # one ``selecteq`` selection per distinct value found under `key`
    distinct = set(values(table, key))
    return dict((found, selecteq(table, key, found)) for found in distinct)
def stats(table, field):
    """
    Calculate basic descriptive statistics on a given field. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar', 'baz'],
        ...          ['A', 1, 2],
        ...          ['B', '2', '3.4'],
        ...          [u'B', u'3', u'7.8', True],
        ...          ['D', 'xyz', 9.0],
        ...          ['E', None]]
        >>> etl.stats(table, 'bar')
        stats(count=3, errors=2, sum=6.0, min=1.0, max=3.0, mean=2.0, pvariance=0.6666666666666666, pstdev=0.816496580927726)

    The `field` argument can be a field name or index (starting from zero).

    Every value is passed through ``float``; values that raise ``ValueError``
    or ``TypeError`` are tallied as errors and excluded from the statistics.

    """

    lowest = None
    highest = None
    total = 0
    mean = 0
    variance = 0
    n_ok = 0
    n_bad = 0

    for raw in values(table, field):
        try:
            number = float(raw)
        except (ValueError, TypeError):
            n_bad += 1
            continue

        n_ok += 1
        if lowest is None or number < lowest:
            lowest = number
        if highest is None or number > highest:
            highest = number
        total += number
        # running mean/variance are updated one observation at a time
        mean, variance = onlinestats(number, n_ok, mean=mean,
                                     variance=variance)

    deviation = variance ** .5
    return _stats(n_ok, n_bad, total, lowest, highest, mean, variance,
                  deviation)