Exemplo n.º 1
0
def iterhashantijoin(left, right, lkey, rkey):
    """Yield the header of `left` followed by its rows that have no key
    match in `right`.

    All key values of `right` are collected into a set first, so `right` is
    read completely before any data row of `left` is emitted. Key values
    must therefore be hashable.

    `lkey` / `rkey` are resolved to column positions through `asindices`
    (defined elsewhere; presumably accepts field names or positions).
    """
    left_rows = iter(left)
    right_rows = iter(right)

    left_header = next(left_rows)
    right_header = next(right_rows)
    yield tuple(left_header)

    # resolve key columns for both sides before building the extractors
    left_positions = asindices(left_header, lkey)
    right_positions = asindices(right_header, rkey)
    left_key_of = operator.itemgetter(*left_positions)
    right_key_of = operator.itemgetter(*right_positions)

    # every key present on the right-hand side
    right_keys = set()
    for right_row in right_rows:
        right_keys.add(right_key_of(right_row))

    # keep only left rows whose key never occurs on the right
    for left_row in left_rows:
        if left_key_of(left_row) in right_keys:
            continue
        yield tuple(left_row)
Exemplo n.º 2
0
def test_records():
    """records() rows support item access, attribute access and get()."""
    table = (('foo', 'bar'), ('a', 1), ('b', 2), ('c', 3))
    actual = records(table)
    first_two = (('a', 1), ('b', 2))

    # access items
    rows = iter(actual)
    for foo, bar in first_two:
        rec = next(rows)
        eq_(foo, rec['foo'])
        eq_(bar, rec['bar'])

    # access attributes (fresh iterator over the same container)
    rows = iter(actual)
    for foo, bar in first_two:
        rec = next(rows)
        eq_(foo, rec.foo)
        eq_(bar, rec.bar)

    # access with get() method, including an unknown field
    rec = next(iter(actual))
    eq_('a', rec.get('foo'))
    eq_(1, rec.get('bar'))
    eq_(None, rec.get('baz'))
    eq_('qux', rec.get('baz', default='qux'))
Exemplo n.º 3
0
def test_fromdb_mkcursor():
    """fromdb() given a cursor factory is re-iterable and its iterators are
    independent of each other.

    The in-memory SQLite connection is closed in a ``finally`` clause so the
    test does not leak it (newer Python versions emit a ResourceWarning for
    connections that are never closed explicitly).
    """

    connection = sqlite3.connect(':memory:')
    try:
        # initial data
        data = (('a', 1), ('b', 2), ('c', 2.0))
        c = connection.cursor()
        c.execute('create table foobar (foo, bar)')
        for row in data:
            c.execute('insert into foobar values (?, ?)', row)
        connection.commit()
        c.close()

        # test the function
        mkcursor = lambda: connection.cursor()
        actual = fromdb(mkcursor, 'select * from foobar')
        expect = (('foo', 'bar'), ('a', 1), ('b', 2), ('c', 2.0))
        ieq(expect, actual)
        ieq(expect, actual)  # verify can iterate twice

        # test iterators are isolated: advancing i1 must not move i2
        i1 = iter(actual)
        i2 = iter(actual)
        eq_(('foo', 'bar'), next(i1))
        eq_(('a', 1), next(i1))
        eq_(('foo', 'bar'), next(i2))
        eq_(('b', 2), next(i1))
    finally:
        connection.close()
Exemplo n.º 4
0
def test_records():
    """records() rows support item access, attribute access and get()."""
    table = (('foo', 'bar'), ('a', 1), ('b', 2), ('c', 3))
    actual = records(table)
    first_two = (('a', 1), ('b', 2))

    # access items
    rows = iter(actual)
    for foo, bar in first_two:
        rec = next(rows)
        eq_(foo, rec['foo'])
        eq_(bar, rec['bar'])

    # access attributes (fresh iterator over the same container)
    rows = iter(actual)
    for foo, bar in first_two:
        rec = next(rows)
        eq_(foo, rec.foo)
        eq_(bar, rec.bar)

    # access with get() method, including an unknown field
    rec = next(iter(actual))
    eq_('a', rec.get('foo'))
    eq_(1, rec.get('bar'))
    eq_(None, rec.get('baz'))
    eq_('qux', rec.get('baz', default='qux'))
Exemplo n.º 5
0
def test_fromdb_mkcursor():
    """fromdb() given a cursor factory is re-iterable and its iterators are
    independent of each other.

    The in-memory SQLite connection is closed in a ``finally`` clause so the
    test does not leak it (newer Python versions emit a ResourceWarning for
    connections that are never closed explicitly).
    """

    connection = sqlite3.connect(":memory:")
    try:
        # initial data
        data = (("a", 1), ("b", 2), ("c", 2.0))
        c = connection.cursor()
        c.execute("create table foobar (foo, bar)")
        for row in data:
            c.execute("insert into foobar values (?, ?)", row)
        connection.commit()
        c.close()

        # test the function
        mkcursor = lambda: connection.cursor()
        actual = fromdb(mkcursor, "select * from foobar")
        expect = (("foo", "bar"), ("a", 1), ("b", 2), ("c", 2.0))
        ieq(expect, actual)
        ieq(expect, actual)  # verify can iterate twice

        # test iterators are isolated: advancing i1 must not move i2
        i1 = iter(actual)
        i2 = iter(actual)
        eq_(("foo", "bar"), next(i1))
        eq_(("a", 1), next(i1))
        eq_(("foo", "bar"), next(i2))
        eq_(("b", 2), next(i1))
    finally:
        connection.close()
Exemplo n.º 6
0
def iterhashantijoin(left, right, lkey, rkey):
    """Yield the header of `left` followed by its rows that have no key
    match in `right`.

    All key values of `right` are collected into a set first, so `right` is
    read completely before any data row of `left` is emitted. Key values
    must therefore be hashable.

    `lkey` / `rkey` are resolved to column positions through `asindices`
    (defined elsewhere; presumably accepts field names or positions).
    """
    left_rows = iter(left)
    right_rows = iter(right)

    left_header = next(left_rows)
    right_header = next(right_rows)
    yield tuple(left_header)

    # resolve key columns for both sides before building the extractors
    left_positions = asindices(left_header, lkey)
    right_positions = asindices(right_header, rkey)
    left_key_of = operator.itemgetter(*left_positions)
    right_key_of = operator.itemgetter(*right_positions)

    # every key present on the right-hand side
    right_keys = set()
    for right_row in right_rows:
        right_keys.add(right_key_of(right_row))

    # keep only left rows whose key never occurs on the right
    for left_row in left_rows:
        if left_key_of(left_row) in right_keys:
            continue
        yield tuple(left_row)
Exemplo n.º 7
0
def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
                      rprefix):
    """Right outer join driven by a pre-built lookup of left rows.

    Only the header of `left` is read here; left data rows come from
    `llookup`, a mapping from key value to the left rows carrying that key.
    For each right row, one output row is produced per matching left row;
    if there is no match, the left part is filled with `missing` except for
    the key columns, which take the right row's key values.

    The output header is the left header followed by the non-key fields of
    the right header, each side optionally prefixed with `lprefix` /
    `rprefix` (applied via ``str``).
    """
    left_iter = iter(left)
    right_iter = iter(right)

    left_fields = next(left_iter)
    right_fields = next(right_iter)

    # positions of the join key on each side
    left_key_idx = asindices(left_fields, lkey)
    right_key_idx = asindices(right_fields, rkey)

    # extracts the key value(s) from a right row
    right_key_of = operator.itemgetter(*right_key_idx)

    # non-key columns of the right table; key columns are emitted once,
    # in the left-hand part of the output row
    right_val_idx = [i for i, _ in enumerate(right_fields)
                     if i not in right_key_idx]
    right_vals_of = rowgetter(*right_val_idx)

    # build the output header
    if lprefix is not None:
        out_fields = [(str(lprefix) + str(f)) for f in left_fields]
    else:
        out_fields = list(left_fields)
    if rprefix is not None:
        out_fields.extend([(str(rprefix) + str(f))
                           for f in right_vals_of(right_fields)])
    else:
        out_fields.extend(right_vals_of(right_fields))
    yield tuple(out_fields)

    left_width = len(left_fields)
    key_pairs = list(zip(left_key_idx, right_key_idx))

    for right_row in right_iter:
        key = right_key_of(right_row)
        if key in llookup:
            # one output row per matching left row
            for left_row in llookup[key]:
                joined = list(left_row)
                joined.extend(right_vals_of(right_row))
                yield tuple(joined)
        else:
            # no left match: pad the left part, but keep the key values
            padded = [missing] * left_width
            for left_pos, right_pos in key_pairs:
                padded[left_pos] = right_row[right_pos]
            padded.extend(right_vals_of(right_row))
            yield tuple(padded)
Exemplo n.º 8
0
def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
                      rprefix):
    """Right outer join driven by a pre-built lookup of left rows.

    Only the header of `left` is read here; left data rows come from
    `llookup`, a mapping from key value to the left rows carrying that key.
    For each right row, one output row is produced per matching left row;
    if there is no match, the left part is filled with `missing` except for
    the key columns, which take the right row's key values.

    The output header is the left header followed by the non-key fields of
    the right header, each side optionally prefixed with `lprefix` /
    `rprefix` (applied via ``text_type``).
    """
    left_iter = iter(left)
    right_iter = iter(right)

    left_fields = next(left_iter)
    right_fields = next(right_iter)

    # positions of the join key on each side
    left_key_idx = asindices(left_fields, lkey)
    right_key_idx = asindices(right_fields, rkey)

    # extracts the key value(s) from a right row
    right_key_of = operator.itemgetter(*right_key_idx)

    # non-key columns of the right table; key columns are emitted once,
    # in the left-hand part of the output row
    right_val_idx = [i for i, _ in enumerate(right_fields)
                     if i not in right_key_idx]
    right_vals_of = rowgetter(*right_val_idx)

    # build the output header
    if lprefix is not None:
        out_fields = [(text_type(lprefix) + text_type(f))
                      for f in left_fields]
    else:
        out_fields = list(left_fields)
    if rprefix is not None:
        out_fields.extend([(text_type(rprefix) + text_type(f))
                           for f in right_vals_of(right_fields)])
    else:
        out_fields.extend(right_vals_of(right_fields))
    yield tuple(out_fields)

    left_width = len(left_fields)
    key_pairs = list(zip(left_key_idx, right_key_idx))

    for right_row in right_iter:
        key = right_key_of(right_row)
        if key in llookup:
            # one output row per matching left row
            for left_row in llookup[key]:
                joined = list(left_row)
                joined.extend(right_vals_of(right_row))
                yield tuple(joined)
        else:
            # no left match: pad the left part, but keep the key values
            padded = [missing] * left_width
            for left_pos, right_pos in key_pairs:
                padded[left_pos] = right_row[right_pos]
            padded.extend(right_vals_of(right_row))
            yield tuple(padded)
Exemplo n.º 9
0
def test_namedtuples():
    """namedtuples() exposes each data row's values as attributes named
    after the header fields."""
    table = (('foo', 'bar'), ('a', 1), ('b', 2))
    rows = iter(namedtuples(table))
    for foo, bar in table[1:]:
        row = next(rows)
        eq_(foo, row.foo)
        eq_(bar, row.bar)
Exemplo n.º 10
0
def test_namedtuples_unevenrows():
    """namedtuples() copes with rows longer or shorter than the header:
    the surplus value is not checked here, and a missing value reads as
    None."""
    table = (('foo', 'bar'), ('a', 1, True), ('b',))
    rows = iter(namedtuples(table))

    long_row = next(rows)
    eq_('a', long_row.foo)
    eq_(1, long_row.bar)

    short_row = next(rows)
    eq_('b', short_row.foo)
    eq_(None, short_row.bar)
Exemplo n.º 11
0
def test_namedtuples_unevenrows():
    """namedtuples() copes with rows longer or shorter than the header:
    the surplus value is not checked here, and a missing value reads as
    None."""
    table = (('foo', 'bar'), ('a', 1, True), ('b', ))
    rows = iter(namedtuples(table))

    long_row = next(rows)
    eq_('a', long_row.foo)
    eq_(1, long_row.bar)

    short_row = next(rows)
    eq_('b', short_row.foo)
    eq_(None, short_row.bar)
Exemplo n.º 12
0
def test_namedtuples():
    """namedtuples() exposes each data row's values as attributes named
    after the header fields."""
    table = (('foo', 'bar'), ('a', 1), ('b', 2))
    rows = iter(namedtuples(table))
    for foo, bar in table[1:]:
        row = next(rows)
        eq_(foo, row.foo)
        eq_(bar, row.bar)
Exemplo n.º 13
0
    def push(self, ta, tb, limit=None):
        """Diff two tables row by row and broadcast the differences.

        Rows found in both tables go to the default connections; rows only
        in `ta` go to the connections keyed ``'-'``; rows only in `tb` go
        to the connections keyed ``'+'``. Connections are obtained from
        ``self._connect_receivers`` using the (stringified) header of `ta`;
        the header of `tb` is read and ignored.

        The comparison is a single forward merge using ``<`` and ``==`` on
        row tuples. NOTE(review): this presumably requires both tables to
        be sorted the same way - confirm against callers.

        `limit` is accepted for interface compatibility but is not used.

        Unlike the previous implementation, the merge keeps going after one
        table is exhausted, so the remaining rows of the other table are
        still broadcast as additions/subtractions. All rows are passed on
        as tuples.
        """
        ita = iter(ta)
        itb = iter(tb)
        aflds = [str(f) for f in next(ita)]
        next(itb)  # ignore b fields

        default_connections, keyed_connections = self._connect_receivers(aflds)

        def _broadcast(*args):
            # one argument: row for the default channel;
            # two arguments: (key, row) for a keyed channel
            if len(args) == 1:
                for c in default_connections:
                    c.accept(args[0])
            else:
                key, row = args
                if key in keyed_connections:
                    for c in keyed_connections[key]:
                        c.accept(row)

        def _advance(it):
            # next row as a tuple, or None once the iterator is exhausted
            try:
                return tuple(next(it))
            except StopIteration:
                return None

        a = _advance(ita)
        b = _advance(itb)
        while a is not None or b is not None:
            if b is None or (a is not None and a < b):
                # row only in a
                _broadcast('-', a)
                a = _advance(ita)
            elif a == b:
                _broadcast(a)  # default channel
                a = _advance(ita)
                b = _advance(itb)
            else:
                # row only in b (also reached when a is exhausted)
                _broadcast('+', b)
                b = _advance(itb)
Exemplo n.º 14
0
    def push(self, ta, tb, limit=None):
        """Diff two tables row by row and broadcast the differences.

        Rows found in both tables go to the default connections; rows only
        in `ta` go to the connections keyed ``'-'``; rows only in `tb` go
        to the connections keyed ``'+'``. Connections are obtained from
        ``self._connect_receivers`` using the (stringified) header of `ta`;
        the header of `tb` is read and ignored.

        The comparison is a single forward merge using ``<`` and ``==`` on
        row tuples. NOTE(review): this presumably requires both tables to
        be sorted the same way - confirm against callers.

        `limit` is accepted for interface compatibility but is not used.

        Unlike the previous implementation, the merge keeps going after one
        table is exhausted, so the remaining rows of the other table are
        still broadcast as additions/subtractions. All rows are passed on
        as tuples.
        """
        ita = iter(ta)
        itb = iter(tb)
        aflds = [str(f) for f in next(ita)]
        next(itb)  # ignore b fields

        default_connections, keyed_connections = self._connect_receivers(aflds)

        def _broadcast(*args):
            # one argument: row for the default channel;
            # two arguments: (key, row) for a keyed channel
            if len(args) == 1:
                for c in default_connections:
                    c.accept(args[0])
            else:
                key, row = args
                if key in keyed_connections:
                    for c in keyed_connections[key]:
                        c.accept(row)

        def _advance(it):
            # next row as a tuple, or None once the iterator is exhausted
            try:
                return tuple(next(it))
            except StopIteration:
                return None

        a = _advance(ita)
        b = _advance(itb)
        while a is not None or b is not None:
            if b is None or (a is not None and a < b):
                # row only in a
                _broadcast('-', a)
                a = _advance(ita)
            elif a == b:
                _broadcast(a)  # default channel
                a = _advance(ita)
                b = _advance(itb)
            else:
                # row only in b (also reached when a is exhausted)
                _broadcast('+', b)
                b = _advance(itb)
Exemplo n.º 15
0
def iterfilldown(table, fillfields, missing, where, anchorfields, until):
    """Yield `table` with `missing` values replaced by the most recent
    value seen in the same column.

    Parameters, as used by the code below:

    - `fillfields`: fields to fill; falsy means every field of the header.
    - `missing`: the value that marks a cell as empty.
    - `where`: optional row condition (string turned into a callable via
      `expr`, or a callable taking a `Record`); a missing cell is filled
      only when it returns true. Defaults to always true.
    - `anchorfields`: optional fields whose values must equal those of the
      previously emitted row for filling to take place.
    - `until`: optional row condition (string or callable); when it is
      true for a row, the fill values are restarted from the row that
      follows. Defaults to always false.

    The first data row is emitted unchanged and seeds the fill values.
    An empty table yields nothing, and a header-only table yields just the
    header (previously both cases leaked ``StopIteration`` out of the
    generator, which Python 3.7+ reports as ``RuntimeError``).
    """
    # prepare where function
    if isinstance(where, string_types):
        where = expr(where)
    elif where is not None:
        assert callable(where), 'expected callable for "where" argument, found %r' % where
    else:
        where = lambda r: True  # default where callable returns True
    # prepare until function
    if isinstance(until, string_types):
        until = expr(until)
    elif until is not None:
        assert callable(until), 'expected callable for "until" argument, found %r' % until
    else:
        until = lambda r: False  # default until callable returns False
    # normal iter function
    it = iter(table)
    try:
        hdr = next(it)
    except StopIteration:
        return  # completely empty table
    flds = list(map(text_type, hdr))
    yield tuple(hdr)
    if not fillfields:  # fill down all fields
        fillfields = hdr
    fillindices = asindices(hdr, fillfields)
    if anchorfields:
        anchorindices = asindices(hdr, anchorfields)
    try:
        fill = list(next(it))  # fill values
    except StopIteration:
        return  # header only, nothing to fill
    prev = fill
    untilfunctiontriggered = False
    yield tuple(fill)
    for row in it:
        outrow = list(row)
        if untilfunctiontriggered:
            # previous row matched `until`: restart fill values from here
            fill = outrow
            untilfunctiontriggered = False  # reset
        if anchorfields:
            row_values = [row[i] for i in anchorindices]
            prev_values = [prev[i] for i in anchorindices]
            check_anchor = row_values == prev_values
        else:
            check_anchor = True
        # loop through fill-down fields
        for idx in fillindices:
            if row[idx] == missing and where(Record(row, flds)) and check_anchor:
                outrow[idx] = fill[idx]  # fill down
            elif row[idx] == missing and check_anchor:
                # missing, but `where` rejected the row: leave the cell and
                # the remembered fill value untouched
                pass
            else:
                fill[idx] = row[idx]  # new fill value
        prev = outrow
        yield tuple(outrow)
        # found stop point, reset fill with next row's contents
        if until(Record(row, flds)):
            untilfunctiontriggered = True
Exemplo n.º 16
0
def iterhashintersection(a, b):
    """Yield the header of `a` followed by the rows of `a` that also occur
    in `b`, respecting multiplicity.

    The rows of `b` are counted first (so they must be hashable once
    converted to tuples); each row of `a` is emitted only while `b` still
    has an unmatched copy of it. The header of `b` is discarded and assumed
    to be the same as that of `a`.

    An empty `a` yields nothing, and an empty `b` yields only the header of
    `a`. Previously either case leaked ``StopIteration`` out of the
    generator, which Python 3.7+ turns into ``RuntimeError`` (PEP 479).
    """
    ita = iter(a)
    try:
        ahdr = next(ita)
    except StopIteration:
        return  # no header at all: nothing to emit
    yield tuple(ahdr)

    itb = iter(b)
    try:
        next(itb)  # discard b header, assume same as a
    except StopIteration:
        pass  # b is empty: the counter below stays empty

    # N.B., need to account for possibility of duplicate rows
    bcnt = Counter(tuple(row) for row in itb)
    for ar in ita:
        t = tuple(ar)
        if bcnt[t] > 0:
            yield t
            bcnt[t] -= 1
Exemplo n.º 17
0
def issorted(table, key=None, reverse=False, strict=False):
    """
    Return True if the table is ordered (i.e., sorted) by the given key. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['a', 1, True],
        ...           ['b', 3, True],
        ...           ['b', 2]]
        >>> etl.issorted(table1, key='foo')
        True
        >>> etl.issorted(table1, key='bar')
        False
        >>> etl.issorted(table1, key='foo', strict=True)
        False
        >>> etl.issorted(table1, key='foo', reverse=True)
        False

    If `key` is None, whole rows are compared. With `reverse` the table
    must be in descending order; with `strict` equal neighbours count as
    unsorted. A table with a header but no data rows is considered sorted
    (previously this raised ``StopIteration``).

    """

    # determine the operator to use when comparing rows
    if reverse and strict:
        op = operator.lt
    elif reverse and not strict:
        op = operator.le
    elif strict:
        op = operator.gt
    else:
        op = operator.ge

    it = iter(table)
    flds = [text_type(f) for f in next(it)]
    if key is None:
        try:
            prev = next(it)
        except StopIteration:
            return True  # no data rows: trivially sorted
        for curr in it:
            if not op(curr, prev):
                return False
            prev = curr
    else:
        # resolve the key first so an invalid key is reported even when
        # the table has no data rows
        getkey = comparable_itemgetter(*asindices(flds, key))
        try:
            prev = next(it)
        except StopIteration:
            return True  # no data rows: trivially sorted
        prevkey = getkey(prev)
        for curr in it:
            currkey = getkey(curr)
            if not op(currkey, prevkey):
                return False
            prevkey = currkey
    return True
Exemplo n.º 18
0
def merge(
    source: any, name: str, from_names: list, sep: str = "-", preserve: bool = True
) -> Iterator:
    """Append a field `name` holding the values of `from_names` joined
    with `sep`.

    The joined values are taken in row (column) order. When `preserve` is
    false the source fields are dropped from both header and rows. The
    values are passed straight to ``sep.join``, so they must be strings.
    """
    rows = iter(source)

    header = next(rows)
    names = list(map(text_type, header))

    # work out the output header and which columns feed the merged value
    out_header = list(names)
    merge_positions = []
    for wanted in from_names:
        position = names.index(wanted)
        if not preserve:
            out_header.remove(wanted)
        merge_positions.append(position)
    out_header.append(name)
    yield tuple(out_header)

    # emit the data rows
    for row in rows:
        parts = []
        remainder = []
        for position, cell in enumerate(row):
            if position in merge_positions:
                parts.append(cell)
            else:
                remainder.append(cell)
        out_row = list(row) if preserve else remainder
        out_row.append(sep.join(parts))
        yield tuple(out_row)
Exemplo n.º 19
0
def iterhashcomplement(a, b, strict):
    """Yield the header of `a` followed by the rows of `a` not found in `b`.

    The rows of `b` are counted first (so they must be hashable once
    converted to tuples). With `strict` false, every occurrence of a row in
    `b` cancels exactly one occurrence in `a`; with `strict` true, any row
    present in `b` is removed from the output no matter how often it occurs
    in `a`. The header of `b` is discarded and assumed to match `a`.

    An empty `a` yields nothing, and an empty `b` leaves `a` unchanged.
    Previously either case leaked ``StopIteration`` out of the generator,
    which Python 3.7+ turns into ``RuntimeError`` (PEP 479).
    """
    ita = iter(a)
    try:
        ahdr = next(ita)
    except StopIteration:
        return  # no header at all: nothing to emit
    yield tuple(ahdr)

    itb = iter(b)
    try:
        next(itb)  # discard b header, assume same as a
    except StopIteration:
        pass  # b is empty: nothing will be subtracted

    # N.B., need to account for possibility of duplicate rows
    bcnt = Counter(tuple(row) for row in itb)
    for ar in ita:
        t = tuple(ar)
        if bcnt[t] > 0:
            if not strict:
                bcnt[t] -= 1  # consume one matching copy from b
        else:
            yield t
Exemplo n.º 20
0
def itersplit(source, field, pattern, newfields, include_original, maxsplit,
              flags):
    """Split the values of one field with a regular expression and append
    the pieces as new columns.

    `field` is either an integer position (must be below the header
    length) or a field name; anything else raises ``ArgumentError``.
    `pattern` and `flags` are compiled once with ``re.compile`` and each
    value is split with at most `maxsplit` splits. `newfields` (if any) are
    appended to the header. When `include_original` is false the split
    field is dropped from the header and from each row.

    The original column is removed from the header by position rather than
    by name, so an integer `field` also works when header names are not
    strings or are duplicated.
    """

    it = iter(source)
    prog = re.compile(pattern, flags)

    hdr = next(it)
    flds = list(map(text_type, hdr))
    if isinstance(field, int) and field < len(hdr):
        field_index = field
    elif field in flds:
        field_index = flds.index(field)
    else:
        raise ArgumentError('field invalid: must be either field name or index')

    # determine output fields
    outhdr = list(flds)
    if not include_original:
        del outhdr[field_index]
    if newfields:
        outhdr.extend(newfields)
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = row[field_index]
        if include_original:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i != field_index]
        out_row.extend(prog.split(value, maxsplit))
        yield tuple(out_row)
Exemplo n.º 21
0
def iterselectusingcontext(table, query):
    """Yield the header and those rows for which
    ``query(prv, cur, nxt)`` is true.

    Each data row is wrapped in a `Record` and `query` is called with the
    previous, current and next record; `prv` is None for the first row and
    `nxt` is None for the last.

    An empty table yields nothing and a header-only table yields just the
    header. Previously both leaked ``StopIteration`` out of the generator,
    which Python 3.7+ turns into ``RuntimeError`` (PEP 479).
    """
    it = iter(table)
    try:
        hdr = tuple(next(it))
    except StopIteration:
        return  # completely empty table
    flds = list(map(text_type, hdr))
    yield hdr
    it = (Record(row, flds) for row in it)
    prv = None
    try:
        cur = next(it)
    except StopIteration:
        return  # header only: no rows to select from
    for nxt in it:
        if query(prv, cur, nxt):
            yield cur
        prv = cur
        cur = nxt
    # handle last row
    if query(prv, cur, None):
        yield cur
Exemplo n.º 22
0
 def __iter__(self):
     """Yield the header of ``self.table`` with ``self.suffix`` appended to
     every field name, then the data rows unchanged."""
     rows = iter(self.table)
     fields = next(rows)
     yield tuple(
         (text_type(name) + text_type(self.suffix)) for name in fields
     )
     for record in rows:
         yield record
Exemplo n.º 23
0
def iteraddfieldusingcontext(table, field, query):
    """Yield `table` with an extra trailing `field` computed from each
    row's neighbours.

    Each data row is wrapped in a `Record` and the new value is
    ``query(prv, cur, nxt)``; `prv` is None for the first row and `nxt` is
    None for the last.

    An empty table yields nothing and a header-only table yields just the
    extended header. Previously both leaked ``StopIteration`` out of the
    generator, which Python 3.7+ turns into ``RuntimeError`` (PEP 479).
    """
    it = iter(table)
    try:
        hdr = tuple(next(it))
    except StopIteration:
        return  # completely empty table
    flds = list(map(text_type, hdr))
    yield hdr + (field, )
    it = (Record(row, flds) for row in it)
    prv = None
    try:
        cur = next(it)
    except StopIteration:
        return  # header only: no rows to extend
    for nxt in it:
        v = query(prv, cur, nxt)
        yield tuple(cur) + (v, )
        prv = cur
        cur = nxt
    # handle last row
    v = query(prv, cur, None)
    yield tuple(cur) + (v, )
Exemplo n.º 24
0
def iterpackdict(source: any,
                 name: str,
                 from_names: list,
                 preserve: bool = False) -> Iterator:
    """Combine several columns into one dict-valued column (e.g. for
    emitting a JSON object).

    A field `name` is appended whose value, per row, maps each name in
    `from_names` to that row's value in the column of the same name.
    Columns a short row does not reach are left out of the dict. When
    `preserve` is false the source fields are dropped from header and
    rows.

    Fix: keys used to be looked up as ``from_names[i - 1]`` with `i` the
    column position, which produced wrong keys (or ``IndexError``) unless
    the packed columns happened to be columns 1..n in `from_names` order.
    Keys are now taken from an explicit position -> name mapping.
    """
    it = iter(source)

    hdr = next(it)
    field_indexes = list()
    flds = list(map(text_type, hdr))

    # determine output fields
    outhdr = list(flds)
    for field in from_names:
        field_index = flds.index(field)
        if not preserve:
            outhdr.remove(field)
        field_indexes.append(field_index)
    outhdr.extend([name])
    yield tuple(outhdr)

    # column position -> key to use in the packed dict
    name_at = dict(zip(field_indexes, from_names))

    # construct the output data
    for row in it:
        value = dict((name_at[i], v) for i, v in enumerate(row)
                     if i in name_at)
        if preserve:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i not in name_at]
        out_row.extend([value])
        yield tuple(out_row)
Exemplo n.º 25
0
def itercat(sources, missing, header):
    """Concatenate tables, aligning columns by field name.

    If `header` is None the output fields are the fields of the first
    source followed by any new fields of later sources, in order of first
    appearance; otherwise `header` gives the output fields. For each data
    row, a field the source does not have, or that a short row does not
    reach, is filled with `missing`.

    A source with no rows at all (not even a header) is treated as having
    no fields. Previously it leaked ``StopIteration`` out of the generator,
    which Python 3.7+ turns into ``RuntimeError`` (PEP 479).
    """
    its = [iter(t) for t in sources]
    hdrs = []
    for it in its:
        try:
            hdrs.append(list(next(it)))
        except StopIteration:
            hdrs.append([])  # empty source: no fields, no rows

    if header is None:
        # determine output fields by gathering all fields found in the sources
        outhdr = list(hdrs[0])
        for hdr in hdrs[1:]:
            for h in hdr:
                if h not in outhdr:
                    # add any new fields as we find them
                    outhdr.append(h)
    else:
        # predetermined output fields
        outhdr = list(header)
    yield tuple(outhdr)

    # output data rows
    for hdr, it in zip(hdrs, its):

        # resolve each output field to a column of this source once,
        # instead of searching the header again for every cell
        positions = []
        for h in outhdr:
            try:
                positions.append(hdr.index(h))
            except ValueError:
                # field not in table
                positions.append(None)

        # now construct and yield the data rows
        for row in it:
            outrow = list()
            for pos in positions:
                val = missing
                if pos is not None:
                    try:
                        val = row[pos]
                    except IndexError:
                        # short row
                        pass
                outrow.append(val)
            yield tuple(outrow)
Exemplo n.º 26
0
def iteraddfield(source, field, value, index):
    """Yield `source` with a new `field` inserted at position `index`
    (appended when `index` is None).

    If `value` is callable it is called with each row wrapped in a
    `Record` and its result becomes the cell value; otherwise `value`
    itself is used for every row.
    """
    rows = iter(source)
    header = next(rows)
    names = list(map(text_type, header))

    # default position: after the last existing field
    position = len(header) if index is None else index

    # emit the extended header
    out_header = list(header)
    out_header.insert(position, field)
    yield tuple(out_header)

    if not callable(value):
        # constant value for every row
        for row in rows:
            out_row = list(row)
            out_row.insert(position, value)
            yield tuple(out_row)
    else:
        # calculated value: hand each row to the callable as a record
        for row in rows:
            record = Record(row, names)
            out_row = list(record)
            out_row.insert(position, value(record))
            yield tuple(out_row)
Exemplo n.º 27
0
def iteraddfieldusingcontext(table, field, query):
    """Yield `table` with an extra trailing `field` computed from each
    row's neighbours.

    Each data row is wrapped in a `Record` and the new value is
    ``query(prv, cur, nxt)``; `prv` is None for the first row and `nxt` is
    None for the last.

    An empty table yields nothing and a header-only table yields just the
    extended header. Previously both leaked ``StopIteration`` out of the
    generator, which Python 3.7+ turns into ``RuntimeError`` (PEP 479).
    """
    it = iter(table)
    try:
        hdr = tuple(next(it))
    except StopIteration:
        return  # completely empty table
    flds = list(map(text_type, hdr))
    yield hdr + (field,)
    it = (Record(row, flds) for row in it)
    prv = None
    try:
        cur = next(it)
    except StopIteration:
        return  # header only: no rows to extend
    for nxt in it:
        v = query(prv, cur, nxt)
        yield tuple(cur) + (v,)
        prv = cur
        cur = nxt
    # handle last row
    v = query(prv, cur, None)
    yield tuple(cur) + (v,)
Exemplo n.º 28
0
def iterpack(source: any,
             name: str,
             from_names: list,
             preserve: bool = False) -> Iterator:
    """Combine several columns into one list-valued column.

    A field `name` is appended whose value, per row, is the list of that
    row's values in the `from_names` columns, in row (column) order. When
    `preserve` is false the source fields are dropped from header and
    rows.

    Code partially referenced from https://github.com/petl-developers/petl/blob/master/petl/transform/unpacks.py#L64
    """
    rows = iter(source)

    header = next(rows)
    names = list(map(text_type, header))

    # work out the output header and which columns get packed
    out_header = list(names)
    pack_positions = []
    for wanted in from_names:
        position = names.index(wanted)
        if not preserve:
            out_header.remove(wanted)
        pack_positions.append(position)
    out_header.append(name)
    yield tuple(out_header)

    # emit the data rows
    for row in rows:
        packed = []
        remainder = []
        for position, cell in enumerate(row):
            if position in pack_positions:
                packed.append(cell)
            else:
                remainder.append(cell)
        out_row = list(row) if preserve else remainder
        out_row.append(packed)
        yield tuple(out_row)
Exemplo n.º 29
0
def iterselectusingcontext(table, query):
    """Yield the header and those rows for which
    ``query(prv, cur, nxt)`` is true.

    Each data row is wrapped in a `Record` and `query` is called with the
    previous, current and next record; `prv` is None for the first row and
    `nxt` is None for the last.

    An empty table yields nothing and a header-only table yields just the
    header. Previously both leaked ``StopIteration`` out of the generator,
    which Python 3.7+ turns into ``RuntimeError`` (PEP 479).
    """
    it = iter(table)
    try:
        hdr = tuple(next(it))
    except StopIteration:
        return  # completely empty table
    flds = list(map(text_type, hdr))
    yield hdr
    it = (Record(row, flds) for row in it)
    prv = None
    try:
        cur = next(it)
    except StopIteration:
        return  # header only: no rows to select from
    for nxt in it:
        if query(prv, cur, nxt):
            yield cur
        prv = cur
        cur = nxt
    # handle last row
    if query(prv, cur, None):
        yield cur
Exemplo n.º 30
0
def iterhashcomplement(a, b, strict):
    """Yield the header of `a` followed by the rows of `a` not found in `b`.

    The rows of `b` are counted first (so they must be hashable once
    converted to tuples). With `strict` false, every occurrence of a row in
    `b` cancels exactly one occurrence in `a`; with `strict` true, any row
    present in `b` is removed from the output no matter how often it occurs
    in `a`. The header of `b` is discarded and assumed to match `a`.

    An empty `a` yields nothing, and an empty `b` leaves `a` unchanged.
    Previously either case leaked ``StopIteration`` out of the generator,
    which Python 3.7+ turns into ``RuntimeError`` (PEP 479).
    """
    ita = iter(a)
    try:
        ahdr = next(ita)
    except StopIteration:
        return  # no header at all: nothing to emit
    yield tuple(ahdr)

    itb = iter(b)
    try:
        next(itb)  # discard b header, assume same as a
    except StopIteration:
        pass  # b is empty: nothing will be subtracted

    # N.B., need to account for possibility of duplicate rows
    bcnt = Counter(tuple(row) for row in itb)
    for ar in ita:
        t = tuple(ar)
        if bcnt[t] > 0:
            if not strict:
                bcnt[t] -= 1  # consume one matching copy from b
        else:
            yield t
Exemplo n.º 31
0
 def push(self, source, limit=None):
     """Feed the rows of `source` into a connection obtained from
     ``self.connect``.

     The first row of `source` is passed to ``self.connect`` as the field
     list; up to `limit` following rows (all of them when `limit` is None)
     are handed to the connection as tuples, after which the connection is
     closed.
     """
     rows = iter(source)
     header = next(rows)
     connection = self.connect(header)
     for record in islice(rows, limit):
         connection.accept(tuple(record))
     connection.close()
Exemplo n.º 32
0
def groupselectfirst(table, key, presorted=False, buffersize=None,
                     tempdir=None, cache=True):
    """Group by the `key` field then return the first row within each group.

    Implemented as a `rowreduce` whose reducer takes the first item of each
    group's row iterator; the remaining arguments are passed through to
    `rowreduce` unchanged.
    """

    def _reducer(k, rows):
        # only the head of the group is needed; the key itself is unused
        return next(rows)

    return rowreduce(
        table,
        key,
        reducer=_reducer,
        presorted=presorted,
        buffersize=buffersize,
        tempdir=tempdir,
        cache=cache,
    )
Exemplo n.º 33
0
 def __iter__(self):
     """Yield the header of ``self.table`` with ``self.prefix`` prepended
     to every field name, then the data rows unchanged."""
     rows = iter(self.table)
     fields = next(rows)
     yield tuple(
         (text_type(self.prefix) + text_type(name)) for name in fields
     )
     for record in rows:
         yield record
Exemplo n.º 34
0
def itersearch(table, pattern, field, flags, complement):
    """Yield the header and the rows that match (or, with `complement`,
    do not match) a regular expression.

    `pattern` is compiled once with `flags`. If `field` is None a row
    matches when any of its values matches; otherwise only the selected
    field(s) are searched. Values are converted with ``text_type`` before
    searching.
    """
    regex = re.compile(pattern, flags)
    rows = iter(table)
    header = next(rows)
    yield tuple(header)

    if field is None:
        # search whole row
        def matches(r):
            return any(regex.search(text_type(v)) for v in r)
    else:
        positions = asindices(header, field)
        if len(positions) == 1:
            position = positions[0]

            def matches(r):
                return regex.search(text_type(r[position]))
        else:
            pick = operator.itemgetter(*positions)

            def matches(r):
                return any(regex.search(text_type(v)) for v in pick(r))

    # a row is emitted when "did it match" differs from "want non-matches"
    want_nonmatching = bool(complement)
    for row in rows:
        if bool(matches(row)) != want_nonmatching:
            yield tuple(row)
Exemplo n.º 35
0
 def __iter__(self):
     """Yield the header of ``self.table`` with ``self.prefix`` prepended
     to every field name, then the data rows unchanged."""
     rows = iter(self.table)
     fields = next(rows)
     yield tuple((str(self.prefix) + str(name)) for name in fields)
     for record in rows:
         yield record
Exemplo n.º 36
0
def itermultiaggregate(source, key, aggregation):
    """Group `source` by `key` and yield one row per group holding the key
    followed by one aggregated value per entry of `aggregation`.

    `aggregation` maps output field names to one of:

    - a callable: applied to the whole list of rows of the group;
    - a field name: values of that field are collected with ``list``;
    - a 1-item sequence holding either of the above;
    - a 2-item sequence ``(source field(s), function)``; a list/tuple of
      source fields makes the function receive tuples of values.

    Anything else raises ``ArgumentError``. The header is the key field(s)
    (or ``'key'`` when `key` is callable) followed by the output names.
    """
    aggregation = OrderedDict(aggregation.items())  # take a copy
    rows_iter = iter(source)
    hdr = next(rows_iter)
    # push back header to ensure we iterate only once
    rows_iter = itertools.chain([hdr], rows_iter)

    # normalise aggregators to (source field(s) or None, function)
    for outfld, agg in list(aggregation.items()):
        if callable(agg):
            spec = (None, agg)
        elif isinstance(agg, string_types):
            spec = (agg, list)  # list is default
        elif len(agg) == 1 and isinstance(agg[0], string_types):
            spec = (agg[0], list)  # list is default
        elif len(agg) == 1 and callable(agg[0]):
            spec = (None, agg[0])  # aggregate whole rows
        elif len(agg) == 2:
            continue  # already in normal form
        else:
            raise ArgumentError('invalid aggregation: %r, %r' % (outfld, agg))
        aggregation[outfld] = spec

    compound_key = isinstance(key, (list, tuple))

    # determine output header
    if compound_key:
        outhdr = list(key)
    elif callable(key):
        outhdr = ['key']
    else:
        outhdr = [key]
    outhdr.extend(aggregation)
    yield tuple(outhdr)

    # generate data
    for k, group in rowgroupby(rows_iter, key):
        group = list(group)  # may need to iterate over these more than once
        # handle compound key
        outrow = list(k) if compound_key else [k]
        for outfld in aggregation:
            srcfld, aggfun = aggregation[outfld]
            if srcfld is None:
                # aggregate whole rows
                vals = group
            elif isinstance(srcfld, (list, tuple)):
                # several source fields: aggregate tuples of their values
                getter = operator.itemgetter(*[hdr.index(f) for f in srcfld])
                vals = (getter(row) for row in group)
            else:
                # single source field
                idx = hdr.index(srcfld)
                vals = (row[idx] for row in group)
            outrow.append(aggfun(vals))
        yield tuple(outrow)
Exemplo n.º 37
0
def iteraddfield(source, field, value, index):
    """Yield `source` with a new `field` inserted at position `index`
    (appended when `index` is None).

    If `value` is callable it is called with each row wrapped in a
    `Record` and its result becomes the cell value; otherwise `value`
    itself is used for every row.
    """
    rows = iter(source)
    header = next(rows)
    names = list(map(text_type, header))

    # default position: after the last existing field
    position = len(header) if index is None else index

    # emit the extended header
    out_header = list(header)
    out_header.insert(position, field)
    yield tuple(out_header)

    if not callable(value):
        # constant value for every row
        for row in rows:
            out_row = list(row)
            out_row.insert(position, value)
            yield tuple(out_row)
    else:
        # calculated value: hand each row to the callable as a record
        for row in rows:
            record = Record(row, names)
            out_row = list(record)
            out_row.insert(position, value(record))
            yield tuple(out_row)
Exemplo n.º 38
0
Arquivo: xls.py Projeto: DeanWay/petl
def toxls(tbl, filename, sheet, encoding=None, style_compression=0,
          styles=None):
    """
    Write a table to a new Excel .xls file.

    `sheet` is the name of the worksheet to create. `encoding` defaults to
    the locale's preferred encoding. `styles`, if given, maps field names
    to xlwt styles applied to the data cells of that column; fields that
    are absent or mapped to None use xlwt's default style. The header row
    is written without an explicit style.

    The `styles` mapping passed by the caller is not modified (it used to
    be filled in with default styles as a side effect), and cells beyond
    the header width are written with the default style instead of
    ``style=None``.

    """

    import xlwt
    if encoding is None:
        encoding = locale.getpreferredencoding()
    wb = xlwt.Workbook(encoding=encoding, style_compression=style_compression)
    ws = wb.add_sheet(sheet)

    if styles is None:
        # simple version, don't worry about styles
        for r, row in enumerate(tbl):
            for c, v in enumerate(row):
                ws.write(r, c, label=v)
    else:
        # handle styles
        default_style = xlwt.Style.default_style
        it = iter(tbl)
        hdr = next(it)
        flds = list(map(str, hdr))
        for c, f in enumerate(flds):
            ws.write(0, c, label=f)
        # one style per column, in header order, for easy zipping
        col_styles = [
            styles[f] if f in styles and styles[f] is not None
            else default_style
            for f in flds
        ]
        for r, row in enumerate(it, 1):
            for c, (v, style) in enumerate(izip_longest(row, col_styles,
                                                        fillvalue=None)):
                if style is None:
                    # row is longer than the header: no column style exists
                    style = default_style
                ws.write(r, c, label=v, style=style)

    wb.save(filename)
Exemplo n.º 39
0
def itercat(sources, missing, header):
    """
    Yield the concatenation of several tables as one table.

    The first item of each table in `sources` is taken as its header. When
    `header` is None the output header is the first table's header followed
    by any fields of later tables not already present, in order of first
    appearance; otherwise `header` is used as given. Each data row is then
    re-arranged to the output header; fields a source does not have, and
    values missing from short rows, are filled with `missing`.

    A source that is completely empty (no header row) is treated as having
    no fields and no rows. With no sources and no `header`, only an empty
    header is yielded.

    """
    its = [iter(t) for t in sources]
    hdrs = []
    for it in its:
        try:
            hdrs.append(list(next(it)))
        except StopIteration:
            # empty source; must not let StopIteration escape the
            # generator (PEP 479 turns that into a RuntimeError)
            hdrs.append([])

    if header is None:
        # determine output fields by gathering all fields found in the sources
        outhdr = list(hdrs[0]) if hdrs else []
        for hdr in hdrs[1:]:
            for h in hdr:
                if h not in outhdr:
                    # add any new fields as we find them
                    outhdr.append(h)
    else:
        # predetermined output fields
        outhdr = header
    yield tuple(outhdr)

    # output data rows
    for hdr, it in zip(hdrs, its):
        # resolve each output field to its position in this source once,
        # rather than searching the header again for every cell
        positions = [hdr.index(h) if h in hdr else None for h in outhdr]

        # now construct and yield the data rows
        for row in it:
            outrow = []
            for pos in positions:
                val = missing
                if pos is not None:
                    try:
                        val = row[pos]
                    except IndexError:
                        # short row
                        pass
                outrow.append(val)
            yield tuple(outrow)
Exemplo n.º 40
0
def columns(table, missing=None):
    """
    Construct a :class:`dict` mapping field names to lists of values. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar'], ['a', 1], ['b', 2], ['b', 3]]
        >>> cols = etl.columns(table)
        >>> cols['foo']
        ['a', 'b', 'b']
        >>> cols['bar']
        [1, 2, 3]

    Rows shorter than the header are padded with `missing`; values beyond
    the end of the header are ignored. An empty table (no header row) gives
    an empty mapping.

    See also :func:`petl.util.materialise.facetcolumns`.

    """

    cols = OrderedDict()
    it = iter(table)
    try:
        hdr = next(it)
    except StopIteration:
        # no header row, hence no columns
        return cols
    flds = list(map(text_type, hdr))
    for f in flds:
        cols[f] = list()
    # private padding marker: padding with `missing` itself would make a
    # surplus value indistinguishable from a field whose name equals
    # `missing`
    pad = object()
    for row in it:
        for f, v in izip_longest(flds, row, fillvalue=pad):
            if f is pad:
                # row is longer than the header, drop the surplus values
                break
            cols[f].append(missing if v is pad else v)
    return cols
Exemplo n.º 41
0
 def push(self, source, limit=None):
     """
     Feed the rows of `source` through the pipeline built by `connect`.

     The first item of `source` is passed to ``self.connect``; each of the
     following rows (at most `limit` of them when `limit` is not None) is
     handed to the returned object's ``accept`` as a tuple, after which the
     object is closed.

     """
     rows = iter(source)
     header = next(rows)
     sink = self.connect(header)
     for record in islice(rows, limit):
         sink.accept(tuple(record))
     sink.close()
Exemplo n.º 42
0
def itermultiaggregate(source, key, aggregation):
    """
    Yield one aggregated row per group of rows in `source`.

    `aggregation` maps output field names to aggregation specifications; a
    copy is taken, so the caller's mapping is not modified. Each
    specification is normalised to a ``(source field, function)`` pair:

    * a callable is applied to the whole rows of the group;
    * a field name (or a one-element sequence holding one) collects that
      field's values with `list`;
    * a one-element sequence holding a callable is applied to whole rows;
    * a two-element sequence is used as it is.

    Anything else raises `ArgumentError`.

    The output header is the key field(s) -- the literal ``'key'`` when
    `key` is callable -- followed by the output field names. Groups are
    obtained from ``rowgroupby(it, key)``. An empty `source` (no header
    row) yields nothing.

    """
    aggregation = OrderedDict(aggregation.items())  # take a copy
    it = iter(source)
    try:
        hdr = next(it)
    except StopIteration:
        # no header row: end quietly; letting StopIteration escape a
        # generator is turned into a RuntimeError by PEP 479
        return
    # push back header to ensure we iterate only once
    it = itertools.chain([hdr], it)

    # normalise aggregators
    for outfld in aggregation:
        agg = aggregation[outfld]
        if callable(agg):
            aggregation[outfld] = None, agg
        elif isinstance(agg, string_types):
            aggregation[outfld] = agg, list  # list is default
        elif len(agg) == 1 and isinstance(agg[0], string_types):
            aggregation[outfld] = agg[0], list  # list is default
        elif len(agg) == 1 and callable(agg[0]):
            aggregation[outfld] = None, agg[0]  # aggregate whole rows
        elif len(agg) == 2:
            pass  # no need to normalise
        else:
            raise ArgumentError('invalid aggregation: %r, %r' % (outfld, agg))

    # determine output header
    if isinstance(key, (list, tuple)):
        outhdr = list(key)
    elif callable(key):
        outhdr = ['key']
    else:
        outhdr = [key]
    for outfld in aggregation:
        outhdr.append(outfld)
    yield tuple(outhdr)

    # generate data
    for k, rows in rowgroupby(it, key):
        rows = list(rows)  # may need to iterate over these more than once
        # handle compound key
        if isinstance(key, (list, tuple)):
            outrow = list(k)
        else:
            outrow = [k]
        for outfld in aggregation:
            srcfld, aggfun = aggregation[outfld]
            if srcfld is None:
                # aggregate whole rows
                aggval = aggfun(rows)
                outrow.append(aggval)
            elif isinstance(srcfld, (list, tuple)):
                # aggregate over the values of several fields at once
                idxs = [hdr.index(f) for f in srcfld]
                valgetter = operator.itemgetter(*idxs)
                vals = (valgetter(row) for row in rows)
                aggval = aggfun(vals)
                outrow.append(aggval)
            else:
                # aggregate over the values of a single field
                idx = hdr.index(srcfld)
                # try using generator comprehension
                vals = (row[idx] for row in rows)
                aggval = aggfun(vals)
                outrow.append(aggval)
        yield tuple(outrow)
Exemplo n.º 43
0
def itervalues(table, field, **kwargs):
    """
    Yield the value(s) of the selected field(s) from each data row.

    The first item of `table` is taken as the header and `field` is
    resolved against it with `asindices`. With a single selected field the
    bare value is yielded; with several, a tuple of values. Where a row is
    too short, the keyword argument `missing` (default None) stands in for
    each value that is not there.

    An empty `table` (no header row) yields nothing.

    """

    missing = kwargs.get('missing', None)
    it = iter(table)
    try:
        hdr = next(it)
    except StopIteration:
        # no header row: end quietly; letting StopIteration escape a
        # generator is turned into a RuntimeError by PEP 479
        return

    indices = asindices(hdr, field)
    assert len(indices) > 0, 'no field selected'
    getvalue = operator.itemgetter(*indices)
    for row in it:
        # only the lookup is guarded, so an IndexError coming from the
        # consumer of this generator is never mistaken for a short row
        try:
            value = getvalue(row)
        except IndexError:
            if len(indices) > 1:
                # try one at a time
                value = tuple(row[i] if i < len(row) else missing
                              for i in indices)
            else:
                value = missing
        yield value
Exemplo n.º 44
0
def itervalues(table, field, **kwargs):
    """
    Yield the value(s) of the selected field(s) from each data row.

    The first item of `table` is taken as the header and `field` is
    resolved against it with `asindices`. With a single selected field the
    bare value is yielded; with several, a tuple of values. Where a row is
    too short, the keyword argument `missing` (default None) stands in for
    each value that is not there.

    An empty `table` (no header row) yields nothing.

    """

    missing = kwargs.get('missing', None)
    it = iter(table)
    try:
        hdr = next(it)
    except StopIteration:
        # no header row: end quietly; letting StopIteration escape a
        # generator is turned into a RuntimeError by PEP 479
        return

    indices = asindices(hdr, field)
    assert len(indices) > 0, 'no field selected'
    getvalue = operator.itemgetter(*indices)
    for row in it:
        # only the lookup is guarded, so an IndexError coming from the
        # consumer of this generator is never mistaken for a short row
        try:
            value = getvalue(row)
        except IndexError:
            if len(indices) > 1:
                # try one at a time
                value = tuple(row[i] if i < len(row) else missing
                              for i in indices)
            else:
                value = missing
        yield value
Exemplo n.º 45
0
def itersearch(table, pattern, field, flags, complement):
    """
    Yield the header of `table` followed by the rows selected by a regex.

    `pattern` is compiled with `flags` and searched for in the text form of
    the values: every value of the row when `field` is None, otherwise the
    value(s) of the field(s) that `asindices` resolves `field` to. A row is
    selected when any of those values matches; with a true `complement` the
    selection is inverted and only non-matching rows are yielded.

    An empty `table` (no header row) yields nothing.

    """
    prog = re.compile(pattern, flags)
    it = iter(table)
    try:
        hdr = next(it)
    except StopIteration:
        # no header row: end quietly; letting StopIteration escape a
        # generator is turned into a RuntimeError by PEP 479
        return
    yield tuple(hdr)

    if field is None:
        # search whole row
        def test(r):
            return any(prog.search(text_type(v)) for v in r)
    else:
        indices = asindices(hdr, field)
        if len(indices) == 1:
            index = indices[0]

            def test(r):
                return prog.search(text_type(r[index]))
        else:
            getvals = operator.itemgetter(*indices)

            def test(r):
                return any(prog.search(text_type(v)) for v in getvals(r))

    # complement==False keeps rows that match, complement==True keeps rows
    # that do not: a row is kept when the two truth values differ
    complement = bool(complement)
    for row in it:
        if bool(test(row)) != complement:
            yield tuple(row)
Exemplo n.º 46
0
 def __iter__(self):
     """
     Yield the rows of ``self.table`` with ``self.suffix`` added to each
     header field.

     The header is yielded as a tuple of ``str(field) + str(suffix)``; data
     rows are passed through unchanged. An empty table (no header row)
     yields nothing.

     """
     it = iter(self.table)
     try:
         hdr = next(it)
     except StopIteration:
         # no header row: end quietly; letting StopIteration escape a
         # generator is turned into a RuntimeError by PEP 479
         return
     suffix = str(self.suffix)  # same for every field, convert once
     yield tuple(str(f) + suffix for f in hdr)
     for row in it:
         yield row
Exemplo n.º 47
0
def itersplit(source, field, pattern, newfields, include_original, maxsplit,
              flags):
    """
    Yield the rows of `source` with one field split by a regular expression.

    `field` selects the field to split, either by position (an int within
    the range of the header, negative values counting from the end) or by
    name (compared with the text form of the header fields). Anything else
    raises `ArgumentError`. `pattern` is compiled with `flags` and each
    value of the field is split with at most `maxsplit` splits (0 meaning
    no limit, as for ``re.split``).

    The output header is the text form of the input header, without the
    split field unless `include_original` is true, followed by `newfields`.
    Each output row is the input row (likewise with or without the original
    value) followed by the pieces of the split.

    An empty `source` (no header row) yields nothing.

    """

    it = iter(source)
    prog = re.compile(pattern, flags)

    try:
        hdr = next(it)
    except StopIteration:
        # no header row: end quietly; letting StopIteration escape a
        # generator is turned into a RuntimeError by PEP 479
        return
    flds = list(map(text_type, hdr))
    if isinstance(field, int) and -len(hdr) <= field < len(hdr):
        # normalise to a non-negative position so that the comparison with
        # enumerate() positions below also works for negative input
        field_index = field % len(hdr)
    elif field in flds:
        field_index = flds.index(field)
    else:
        raise ArgumentError(
            'field invalid: must be either field name or index')

    # determine output fields
    outhdr = list(flds)
    if not include_original:
        # drop by position: removing by value fails when the header fields
        # are not already strings
        del outhdr[field_index]
    if newfields:
        outhdr.extend(newfields)
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = row[field_index]
        if include_original:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i != field_index]
        out_row.extend(prog.split(value, maxsplit))
        yield tuple(out_row)
Exemplo n.º 48
0
def iterpeek(it, n=1):
    """
    Look at the start of an iterable without losing any items.

    Returns a pair ``(peek, iterator)``. For ``n == 1`` the peek is the
    first item itself; for any other `n` it is a list of up to `n` leading
    items. The returned iterator yields every item of `it`, including the
    peeked ones.

    """
    stream = iter(it)  # works for iterables and iterators alike
    if n != 1:
        head = list(islice(stream, n))
        return head, chain(head, stream)
    first = next(stream)
    return first, chain([first], stream)
Exemplo n.º 49
0
def iterdicts(table, *sliceargs, **kwargs):
    """
    Yield ``asdict(hdr, row, missing)`` for each data row of `table`.

    The first item of `table` is taken as the header. `sliceargs`, if
    given, are passed to ``itertools.islice`` to select which data rows are
    visited. The keyword argument `missing` (default None) is handed on to
    `asdict`.

    An empty `table` (no header row) yields nothing.

    """
    missing = kwargs.get('missing', None)
    it = iter(table)
    try:
        hdr = next(it)
    except StopIteration:
        # no header row: end quietly; letting StopIteration escape a
        # generator is turned into a RuntimeError by PEP 479
        return
    if sliceargs:
        it = islice(it, *sliceargs)
    for row in it:
        yield asdict(hdr, row, missing)
Exemplo n.º 50
0
 def __getitem__(self, item):
     """
     Positional access implemented by iterating over `self`.

     A slice returns a lazy ``itertools.islice`` over `self`. An int
     returns the item at that position, raising `IndexError` when the
     iteration ends before reaching it. Any other kind of `item` falls
     through and gives None.

     """
     if isinstance(item, slice):
         return islice(self, item.start, item.stop, item.step)
     if isinstance(item, int):
         try:
             # a one-item window starting at the requested position
             window = islice(self, item, item + 1)
             return next(window)
         except StopIteration:
             raise IndexError('index out of range')
Exemplo n.º 51
0
def iterpeek(it, n=1):
    """
    Look at the start of an iterable without losing any items.

    Returns a pair ``(peek, iterator)``. For ``n == 1`` the peek is the
    first item itself; for any other `n` it is a list of up to `n` leading
    items. The returned iterator yields every item of `it`, including the
    peeked ones.

    """
    stream = iter(it)  # works for iterables and iterators alike
    if n != 1:
        head = list(islice(stream, n))
        return head, chain(head, stream)
    first = next(stream)
    return first, chain([first], stream)
Exemplo n.º 52
0
def iterdicts(table, *sliceargs, **kwargs):
    """
    Yield ``asdict(hdr, row, missing)`` for each data row of `table`.

    The first item of `table` is taken as the header. `sliceargs`, if
    given, are passed to ``itertools.islice`` to select which data rows are
    visited. The keyword argument `missing` (default None) is handed on to
    `asdict`.

    An empty `table` (no header row) yields nothing.

    """
    missing = kwargs.get('missing', None)
    it = iter(table)
    try:
        hdr = next(it)
    except StopIteration:
        # no header row: end quietly; letting StopIteration escape a
        # generator is turned into a RuntimeError by PEP 479
        return
    if sliceargs:
        it = islice(it, *sliceargs)
    for row in it:
        yield asdict(hdr, row, missing)
Exemplo n.º 53
0
 def __getitem__(self, item):
     """
     Positional access implemented by iterating over `self`.

     A slice returns a lazy ``itertools.islice`` over `self`. An int
     returns the item at that position, raising `IndexError` when the
     iteration ends before reaching it. Any other kind of `item` falls
     through and gives None.

     """
     if isinstance(item, slice):
         return islice(self, item.start, item.stop, item.step)
     if isinstance(item, int):
         try:
             # a one-item window starting at the requested position
             window = islice(self, item, item + 1)
             return next(window)
         except StopIteration:
             raise IndexError('index out of range')
Exemplo n.º 54
0
def iterextendheader(source, fields):
    """
    Yield the rows of `source` with `fields` appended to the header.

    Only the header (the first item of `source`) is extended; data rows are
    passed through as tuples without any padding. An empty `source` (no
    header row) yields nothing.

    """
    it = iter(source)
    try:
        hdr = next(it)
    except StopIteration:
        # no header row: end quietly; letting StopIteration escape a
        # generator is turned into a RuntimeError by PEP 479
        return
    outhdr = list(hdr)
    outhdr.extend(fields)
    yield tuple(outhdr)
    for row in it:
        yield tuple(row)
Exemplo n.º 55
0
def iterfilldown(table, fillfields, missing):
    """
    Yield the rows of `table` with missing values replaced from above.

    The first item of `table` is taken as the header. `fillfields` selects
    the fields to fill (all fields when it is empty or None) and is
    resolved with `asindices`. The first data row is passed through
    unchanged and provides the initial fill values; in every later row a
    selected value equal to `missing` is replaced by the most recent
    earlier value of that field that was not equal to `missing`.

    An empty `table` yields nothing; a table with only a header yields just
    the header.

    """
    it = iter(table)
    try:
        hdr = next(it)
    except StopIteration:
        # no header row: end quietly; letting StopIteration escape a
        # generator is turned into a RuntimeError by PEP 479
        return
    yield tuple(hdr)
    if not fillfields:  # fill down all fields
        fillfields = hdr
    fillindices = asindices(hdr, fillfields)
    try:
        fill = list(next(it))  # fill values
    except StopIteration:
        # header but no data rows: nothing to fill
        return
    yield tuple(fill)
    for row in it:
        outrow = list(row)
        for idx in fillindices:
            if row[idx] == missing:
                outrow[idx] = fill[idx]  # fill down
            else:
                fill[idx] = row[idx]  # new fill value
        yield tuple(outrow)
Exemplo n.º 56
0
def iterfilldown(table, fillfields, missing):
    """
    Yield the rows of `table` with missing values replaced from above.

    The first item of `table` is taken as the header. `fillfields` selects
    the fields to fill (all fields when it is empty or None) and is
    resolved with `asindices`. The first data row is passed through
    unchanged and provides the initial fill values; in every later row a
    selected value equal to `missing` is replaced by the most recent
    earlier value of that field that was not equal to `missing`.

    An empty `table` yields nothing; a table with only a header yields just
    the header.

    """
    it = iter(table)
    try:
        hdr = next(it)
    except StopIteration:
        # no header row: end quietly; letting StopIteration escape a
        # generator is turned into a RuntimeError by PEP 479
        return
    yield tuple(hdr)
    if not fillfields:  # fill down all fields
        fillfields = hdr
    fillindices = asindices(hdr, fillfields)
    try:
        fill = list(next(it))  # fill values
    except StopIteration:
        # header but no data rows: nothing to fill
        return
    yield tuple(fill)
    for row in it:
        outrow = list(row)
        for idx in fillindices:
            if row[idx] == missing:
                outrow[idx] = fill[idx]  # fill down
            else:
                fill[idx] = row[idx]  # new fill value
        yield tuple(outrow)
Exemplo n.º 57
0
def iterextendheader(source, fields):
    """
    Yield the rows of `source` with `fields` appended to the header.

    Only the header (the first item of `source`) is extended; data rows are
    passed through as tuples without any padding. An empty `source` (no
    header row) yields nothing.

    """
    it = iter(source)
    try:
        hdr = next(it)
    except StopIteration:
        # no header row: end quietly; letting StopIteration escape a
        # generator is turned into a RuntimeError by PEP 479
        return
    outhdr = list(hdr)
    outhdr.extend(fields)
    yield tuple(outhdr)
    for row in it:
        yield tuple(row)