Example #1
def itermultiaggregate(source, key, aggregation):
    aggregation = OrderedDict(aggregation.items())  # take a copy
    it = iter(source)
    hdr = next(it)
    # push back header to ensure we iterate only once
    it = itertools.chain([hdr], it)

    # normalise aggregators
    for outfld in aggregation:
        agg = aggregation[outfld]
        if callable(agg):
            aggregation[outfld] = None, agg
        elif isinstance(agg, string_types):
            aggregation[outfld] = agg, list  # list is default
        elif len(agg) == 1 and isinstance(agg[0], string_types):
            aggregation[outfld] = agg[0], list  # list is default
        elif len(agg) == 1 and callable(agg[0]):
            aggregation[outfld] = None, agg[0]  # aggregate whole rows
        elif len(agg) == 2:
            pass  # no need to normalise
        else:
            raise ArgumentError('invalid aggregation: %r, %r' % (outfld, agg))

    # determine output header
    if isinstance(key, (list, tuple)):
        outhdr = list(key)
    elif callable(key):
        outhdr = ['key']
    else:
        outhdr = [key]
    for outfld in aggregation:
        outhdr.append(outfld)
    yield tuple(outhdr)
    
    # generate data
    for k, rows in rowgroupby(it, key):
        rows = list(rows)  # may need to iterate over these more than once
        # handle compound key
        if isinstance(key, (list, tuple)):
            outrow = list(k)
        else:
            outrow = [k]
        for outfld in aggregation:
            srcfld, aggfun = aggregation[outfld]
            if srcfld is None:
                aggval = aggfun(rows)
                outrow.append(aggval)
            elif isinstance(srcfld, (list, tuple)):
                idxs = [hdr.index(f) for f in srcfld]
                valgetter = operator.itemgetter(*idxs)
                vals = (valgetter(row) for row in rows)
                aggval = aggfun(vals)
                outrow.append(aggval)
            else:
                idx = hdr.index(srcfld)
                # try using generator comprehension
                vals = (row[idx] for row in rows)
                aggval = aggfun(vals)
                outrow.append(aggval)
        yield tuple(outrow)
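The normalisation loop above accepts several forms of aggregation spec. A minimal usage sketch (not from the source, assuming this generator backs the public petl.aggregate wrapper), exercising each accepted form:

# Sketch only: the aggregation spec forms handled by the normalisation above.
from collections import OrderedDict
import petl as etl

table = [['foo', 'bar'], ['a', 3], ['a', 7], ['b', 2]]
aggregation = OrderedDict()
aggregation['count'] = len           # bare callable: aggregates whole rows
aggregation['bars'] = 'bar'          # bare field name: default aggregation is list
aggregation['maxbar'] = 'bar', max   # (field, aggregator) pair passes through
aggregation['rows'] = (list,)        # 1-tuple callable: also aggregates whole rows
print(etl.look(etl.aggregate(table, 'foo', aggregation)))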
Example #2
def __init__(self,
             source,
             key,
             aggregation=None,
             presorted=False,
             buffersize=None,
             tempdir=None,
             cache=True):
    if presorted:
        self.source = source
    else:
        self.source = sort(source,
                           key,
                           buffersize=buffersize,
                           tempdir=tempdir,
                           cache=cache)
    self.key = key
    if aggregation is None:
        self.aggregation = OrderedDict()
    elif isinstance(aggregation, (list, tuple)):
        self.aggregation = OrderedDict()
        for t in aggregation:
            self.aggregation[t[0]] = t[1:]
    elif isinstance(aggregation, dict):
        self.aggregation = aggregation
    else:
        raise ArgumentError(
            'expected aggregation is None, list, tuple or dict, found %r' %
            aggregation)
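For illustration (a sketch, not part of the original source), the list/tuple branch above folds a sequence of aggregation tuples into the OrderedDict keyed by output field name:

from collections import OrderedDict

spec = [('minbar', 'bar', min), ('count', len)]
agg = OrderedDict()
for t in spec:
    # output field name maps to the remainder of the tuple
    agg[t[0]] = t[1:]
# agg == OrderedDict([('minbar', ('bar', min)), ('count', (len,))])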
Example #3
File: random.py  Project: Mgutjahr/petl
def __init__(self, numrows=100, fields=None, wait=0, seed=None):
    self.numrows = numrows
    self.wait = wait
    if fields is None:
        self.fields = OrderedDict()
    else:
        self.fields = OrderedDict(fields)
    if seed is None:
        self.seed = datetime.datetime.now()
    else:
        self.seed = seed
Example #4
File: test_json.py  Project: zhatrix/petl
def test_fromdicts_ordered():

    from petl.compat import OrderedDict
    data = [
        OrderedDict([('foo', 'a'), ('bar', 1)]),
        OrderedDict([('foo', 'b')]),
        OrderedDict([('foo', 'c'), ('bar', 2), ('baz', True)])
    ]
    actual = fromdicts(data)
    # N.B., fields come out in original order
    expect = (('foo', 'bar', 'baz'), ('a', 1, None), ('b', None, None),
              ('c', 2, True))
    ieq(expect, actual)
Example #5
def test_rangeaggregate_multifield():

    table1 = (('foo', 'bar'), ('a', 3), ('a', 7), ('b', 2), ('b', 1), ('b', 9),
              ('c', 4), ('d', 3))

    # dict arg

    aggregators = OrderedDict()
    aggregators['foocount'] = len
    aggregators['foojoin'] = 'foo', strjoin('')
    aggregators['foolist'] = 'foo'  # default is list

    table2 = rangeaggregate(table1, 'bar', 2, aggregators)
    expect2 = (('bar', 'foocount', 'foojoin', 'foolist'),
               ((1, 3), 2, 'bb', ['b', 'b']),
               ((3, 5), 3, 'adc', ['a', 'd', 'c']),
               ((5, 7), 0, '', []),
               ((7, 9), 1, 'a', ['a']),
               ((9, 11), 1, 'b', ['b']))
    ieq(expect2, table2)

    # suffix notation

    table3 = rangeaggregate(table1, 'bar', 2)
    table3['foocount'] = len
    table3['foojoin'] = 'foo', strjoin('')
    table3['foolist'] = 'foo'  # default is list
    ieq(expect2, table3)

    # list arg

    aggregators = [('foocount', len), ('foojoin', 'foo', strjoin('')),
                   ('foolist', 'foo', list)]
    table4 = rangeaggregate(table1, 'bar', 2, aggregators)
    ieq(expect2, table4)
Example #6
def test_aggregate_more():

    table1 = (('foo', 'bar'), ('aa', 3), ('aa', 7), ('bb', 2), ('bb', 1),
              ('bb', 9), ('cc', 4), ('dd', 3))

    aggregators = OrderedDict()
    aggregators['minbar'] = 'bar', min
    aggregators['maxbar'] = 'bar', max
    aggregators['sumbar'] = 'bar', sum
    aggregators['listbar'] = 'bar'  # default aggregation is list
    aggregators['bars'] = 'bar', strjoin(', ')

    table2 = aggregate(table1, 'foo', aggregators)
    expect2 = (('foo', 'minbar', 'maxbar', 'sumbar', 'listbar', 'bars'),
               ('aa', 3, 7, 10, [3, 7], '3, 7'),
               ('bb', 1, 9, 12, [2, 1, 9], '2, 1, 9'),
               ('cc', 4, 4, 4, [4], '4'),
               ('dd', 3, 3, 3, [3], '3'))
    ieq(expect2, table2)
    ieq(expect2, table2)  # check can iterate twice

    table3 = aggregate(table1, 'foo')
    table3['minbar'] = 'bar', min
    table3['maxbar'] = 'bar', max
    table3['sumbar'] = 'bar', sum
    table3['listbar'] = 'bar'  # default aggregation is list
    table3['bars'] = 'bar', strjoin(', ')
    ieq(expect2, table3)
Example #7
def columns(table, missing=None):
    """
    Construct a :class:`dict` mapping field names to lists of values. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar'], ['a', 1], ['b', 2], ['b', 3]]
        >>> cols = etl.columns(table)
        >>> cols['foo']
        ['a', 'b', 'b']
        >>> cols['bar']
        [1, 2, 3]

    See also :func:`petl.util.materialise.facetcolumns`.

    """

    cols = OrderedDict()
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    for f in flds:
        cols[f] = list()
    for row in it:
        for f, v in izip_longest(flds, row, fillvalue=missing):
            if f in cols:
                cols[f].append(v)
    return cols
Example #8
def test_fieldmap_empty():
    table = (('foo', 'bar'), )
    expect = (('foo', 'baz'), )
    mappings = OrderedDict()
    mappings['foo'] = 'foo'
    mappings['baz'] = 'bar', lambda v: v * 2
    actual = fieldmap(table, mappings)
    ieq(expect, actual)
Example #9
def itermultirangeaggregate(source, key, width, aggregation, minv, maxv):
    aggregation = OrderedDict(aggregation.items()) # take a copy
    it = iter(source)
    srcflds = it.next()
    # push back header to ensure we iterate only once
    it = itertools.chain([srcflds], it)

    # normalise aggregators
    for outfld in aggregation:
        agg = aggregation[outfld]
        if callable(agg):
            aggregation[outfld] = None, agg
        elif isinstance(agg, basestring):
            aggregation[outfld] = agg, list # list is default
        elif len(agg) == 1 and isinstance(agg[0], basestring):
            aggregation[outfld] = agg[0], list # list is default 
        elif len(agg) == 1 and callable(agg[0]):
            aggregation[outfld] = None, agg[0] # aggregate whole rows
        elif len(agg) == 2:
            pass # no need to normalise
        else:
            raise Exception('invalid aggregation: %r, %r' % (outfld, agg))
        
    outflds = [key]
    for outfld in aggregation:
        outflds.append(outfld)
    yield tuple(outflds)
    
    for k, rows in rowgroupbybin(it, key, width, minv=minv, maxv=maxv):
        outrow = [k]
        for outfld in aggregation:
            srcfld, aggfun = aggregation[outfld]
            if srcfld is None:
                aggval = aggfun(rows)
                outrow.append(aggval)
            else:
                idx = srcflds.index(srcfld)
                # try using generator comprehension
                vals = (row[idx] for row in rows)
                aggval = aggfun(vals)
                outrow.append(aggval)
        yield tuple(outrow)
Example #10
def test_aggregate_empty():

    table = (('foo', 'bar'), )

    aggregators = OrderedDict()
    aggregators['minbar'] = 'bar', min
    aggregators['maxbar'] = 'bar', max
    aggregators['sumbar'] = 'bar', sum

    actual = aggregate(table, 'foo', aggregators)
    expect = (('foo', 'minbar', 'maxbar', 'sumbar'), )
    ieq(expect, actual)
Example #11
def __init__(self,
             source,
             mappings=None,
             failonerror=False,
             errorvalue=None):
    self.source = source
    if mappings is None:
        self.mappings = OrderedDict()
    else:
        self.mappings = mappings
    self.failonerror = failonerror
    self.errorvalue = errorvalue
Example #12
def test_fieldmap():

    table = (('id', 'sex', 'age', 'height', 'weight'),
             (1, 'male', 16, 1.45, 62.0),
             (2, 'female', 19, 1.34, 55.4),
             (3, 'female', 17, 1.78, 74.4),
             (4, 'male', 21, 1.33, 45.2),
             (5, '-', 25, 1.65, 51.9))

    mappings = OrderedDict()
    mappings['subject_id'] = 'id'
    mappings['gender'] = 'sex', {'male': 'M', 'female': 'F'}
    mappings['age_months'] = 'age', lambda v: v * 12
    mappings['bmi'] = lambda rec: rec['weight'] / rec['height']**2
    actual = fieldmap(table, mappings)
    expect = (('subject_id', 'gender', 'age_months', 'bmi'),
              (1, 'M', 16*12, 62.0/1.45**2),
              (2, 'F', 19*12, 55.4/1.34**2),
              (3, 'F', 17*12, 74.4/1.78**2),
              (4, 'M', 21*12, 45.2/1.33**2),
              (5, '-', 25*12, 51.9/1.65**2))
    ieq(expect, actual)
    ieq(expect, actual)  # can iterate twice?

    # do it with suffix
    actual = fieldmap(table)
    actual['subject_id'] = 'id'
    actual['gender'] = 'sex', {'male': 'M', 'female': 'F'}
    actual['age_months'] = 'age', lambda v: v * 12
    actual['bmi'] = '{weight} / {height}**2'
    ieq(expect, actual)

    # test short rows
    table2 = (('id', 'sex', 'age', 'height', 'weight'),
              (1, 'male', 16, 1.45, 62.0),
              (2, 'female', 19, 1.34, 55.4),
              (3, 'female', 17, 1.78, 74.4),
              (4, 'male', 21, 1.33, 45.2),
              (5, '-', 25, 1.65))
    expect = (('subject_id', 'gender', 'age_months', 'bmi'),
              (1, 'M', 16*12, 62.0/1.45**2),
              (2, 'F', 19*12, 55.4/1.34**2),
              (3, 'F', 17*12, 74.4/1.78**2),
              (4, 'M', 21*12, 45.2/1.33**2),
              (5, '-', 25*12, None))
    actual = fieldmap(table2, mappings)
    ieq(expect, actual)
Example #13
def test_aggregate_multifield():

    table1 = (('foo', 'bar'), ('a', 3), ('a', 7), ('b', 2), ('b', 1), ('b', 9),
              ('c', 4))

    # dict arg

    aggregators = OrderedDict()
    aggregators['count'] = len
    aggregators['minbar'] = 'bar', min
    aggregators['maxbar'] = 'bar', max
    aggregators['sumbar'] = 'bar', sum
    aggregators['listbar'] = 'bar', list
    aggregators['bars'] = 'bar', strjoin(', ')

    table2 = aggregate(table1, 'foo', aggregators)
    expect2 = (('foo', 'count', 'minbar', 'maxbar', 'sumbar', 'listbar',
                'bars'),
               ('a', 2, 3, 7, 10, [3, 7], '3, 7'),
               ('b', 3, 1, 9, 12, [2, 1, 9], '2, 1, 9'),
               ('c', 1, 4, 4, 4, [4], '4'))
    ieq(expect2, table2)
    ieq(expect2, table2)  # check can iterate twice

    # use suffix notation

    table3 = aggregate(table1, 'foo')
    table3['count'] = len
    table3['minbar'] = 'bar', min
    table3['maxbar'] = 'bar', max
    table3['sumbar'] = 'bar', sum
    table3['listbar'] = 'bar'  # default aggregation is list
    table3['bars'] = 'bar', strjoin(', ')
    ieq(expect2, table3)

    # list arg

    aggregators = [('count', len), ('minbar', 'bar', min),
                   ('maxbar', 'bar', max), ('sumbar', 'bar', sum),
                   ('listbar', 'bar', list), ('bars', 'bar', strjoin(', '))]

    table4 = aggregate(table1, 'foo', aggregators)
    ieq(expect2, table4)
    ieq(expect2, table4)  # check can iterate twice
Example #14
def test_fieldmap_record_access():
    table = (('id', 'sex', 'age', 'height', 'weight'),
             (1, 'male', 16, 1.45, 62.0),
             (2, 'female', 19, 1.34, 55.4),
             (3, 'female', 17, 1.78, 74.4),
             (4, 'male', 21, 1.33, 45.2),
             (5, '-', 25, 1.65, 51.9))

    mappings = OrderedDict()
    mappings['subject_id'] = 'id'
    mappings['gender'] = 'sex', {'male': 'M', 'female': 'F'}
    mappings['age_months'] = 'age', lambda v: v * 12
    mappings['bmi'] = lambda rec: rec.weight / rec.height**2
    actual = fieldmap(table, mappings)
    expect = (('subject_id', 'gender', 'age_months', 'bmi'),
              (1, 'M', 16 * 12, 62.0 / 1.45**2),
              (2, 'F', 19 * 12, 55.4 / 1.34**2),
              (3, 'F', 17 * 12, 74.4 / 1.78**2),
              (4, 'M', 21 * 12, 45.2 / 1.33**2),
              (5, '-', 25 * 12, 51.9 / 1.65**2))
    ieq(expect, actual)
    ieq(expect, actual)  # can iterate twice?
Example #15
File: random.py  Project: Mgutjahr/petl
class DummyTable(Table):

    def __init__(self, numrows=100, fields=None, wait=0, seed=None):
        self.numrows = numrows
        self.wait = wait
        if fields is None:
            self.fields = OrderedDict()
        else:
            self.fields = OrderedDict(fields)
        if seed is None:
            self.seed = datetime.datetime.now()
        else:
            self.seed = seed

    def __setitem__(self, item, value):
        self.fields[str(item)] = value

    def __iter__(self):
        nr = self.numrows
        seed = self.seed
        fields = self.fields.copy()

        # N.B., we want this to be stable, i.e., same data each time
        random.seed(seed)

        # construct header row
        hdr = tuple(str(f) for f in fields.keys())
        yield hdr

        # construct data rows
        for _ in xrange(nr):
            # artificial delay
            if self.wait:
                time.sleep(self.wait)
            yield tuple(fields[f]() for f in fields)

    def reseed(self):
        self.seed = datetime.datetime.now()
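A minimal usage sketch (an assumption, not shown in the source): petl exposes this class through the dummytable() factory, and each field value is a zero-argument callable producing one cell per row:

# Sketch assuming petl.dummytable wraps the DummyTable class above.
import random
from functools import partial
import petl as etl

tbl = etl.dummytable(numrows=5,
                     fields=[('foo', partial(random.randint, 0, 9)),
                             ('bar', random.random)])
print(etl.look(tbl))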
Example #16
class DummyTable(Table):
    def __init__(self, numrows=100, fields=None, wait=0, seed=None):
        self.numrows = numrows
        self.wait = wait
        if fields is None:
            self.fields = OrderedDict()
        else:
            self.fields = OrderedDict(fields)
        if seed is None:
            self.seed = datetime.datetime.now()
        else:
            self.seed = seed

    def __setitem__(self, item, value):
        self.fields[text_type(item)] = value

    def __iter__(self):
        nr = self.numrows
        seed = self.seed
        fields = self.fields.copy()

        # N.B., we want this to be stable, i.e., same data each time
        random.seed(seed)

        # construct header row
        hdr = tuple(text_type(f) for f in fields.keys())
        yield hdr

        # construct data rows
        for _ in xrange(nr):
            # artificial delay
            if self.wait:
                time.sleep(self.wait)
            yield tuple(fields[f]() for f in fields)

    def reseed(self):
        self.seed = datetime.datetime.now()
Example #17
def ordereddict(self):
    return OrderedDict(self)
Example #18
def rangefacet(table,
               field,
               width,
               minv=None,
               maxv=None,
               presorted=False,
               buffersize=None,
               tempdir=None,
               cache=True):
    """
    Return a dictionary mapping ranges to tables. E.g.::

        >>> from petl import rangefacet, look
        >>> look(table1)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | 3     |
        +-------+-------+
        | 'a'   | 7     |
        +-------+-------+
        | 'b'   | 2     |
        +-------+-------+
        | 'b'   | 1     |
        +-------+-------+
        | 'b'   | 9     |
        +-------+-------+
        | 'c'   | 4     |
        +-------+-------+
        | 'd'   | 3     |
        +-------+-------+

        >>> rf = rangefacet(table1, 'bar', 2)
        >>> rf.keys()
        [(1, 3), (3, 5), (5, 7), (7, 9)]
        >>> look(rf[(1, 3)])
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'b'   | 2     |
        +-------+-------+
        | 'b'   | 1     |
        +-------+-------+

        >>> look(rf[(7, 9)])
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | 7     |
        +-------+-------+
        | 'b'   | 9     |
        +-------+-------+

    Note that the last bin includes both edges.

    """

    # determine minimum and maximum values
    if minv is None and maxv is None:
        minv, maxv = limits(table, field)
    elif minv is None:
        minv = min(itervalues(table, field))
    elif maxv is None:
        maxv = max(itervalues(table, field))

    fct = OrderedDict()
    for binminv in xrange(minv, maxv, width):
        binmaxv = binminv + width
        if binmaxv >= maxv:  # final bin
            binmaxv = maxv
            # final bin includes right edge
            fct[(binminv, binmaxv)] = selectrangeopen(table, field, binminv,
                                                      binmaxv)
        else:
            fct[(binminv,
                 binmaxv)] = selectrangeopenleft(table, field, binminv,
                                                 binmaxv)

    return fct