def iterhashantijoin(left, right, lkey, rkey):
    lit = iter(left)
    rit = iter(right)

    lhdr = next(lit)
    rhdr = next(rit)
    yield tuple(lhdr)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from both tables
    lgetk = operator.itemgetter(*lkind)
    rgetk = operator.itemgetter(*rkind)

    rkeys = set()
    for rrow in rit:
        rk = rgetk(rrow)
        rkeys.add(rk)

    for lrow in lit:
        lk = lgetk(lrow)
        if lk not in rkeys:
            yield tuple(lrow)

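# Hedged usage sketch (added for illustration): assuming this generator backs
# petl's public hashantijoin wrapper, an anti-join keeps only the left rows
# whose key never appears in the right table.
def _demo_hashantijoin():
    import petl as etl
    left = [['id', 'colour'], [1, 'blue'], [2, 'red'], [3, 'purple']]
    right = [['id', 'shape'], [1, 'circle'], [3, 'square']]
    return etl.hashantijoin(left, right, key='id')  # -> only (2, 'red')
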
def test_records():
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2),
             ('c', 3))
    actual = records(table)

    # access items
    it = iter(actual)
    o = next(it)
    eq_('a', o['foo'])
    eq_(1, o['bar'])
    o = next(it)
    eq_('b', o['foo'])
    eq_(2, o['bar'])

    # access attributes
    it = iter(actual)
    o = next(it)
    eq_('a', o.foo)
    eq_(1, o.bar)
    o = next(it)
    eq_('b', o.foo)
    eq_(2, o.bar)

    # access with get() method
    it = iter(actual)
    o = next(it)
    eq_('a', o.get('foo'))
    eq_(1, o.get('bar'))
    eq_(None, o.get('baz'))
    eq_('qux', o.get('baz', default='qux'))

def test_fromdb_mkcursor():

    # initial data
    data = (('a', 1),
            ('b', 2),
            ('c', 2.0))
    connection = sqlite3.connect(':memory:')
    c = connection.cursor()
    c.execute('create table foobar (foo, bar)')
    for row in data:
        c.execute('insert into foobar values (?, ?)', row)
    connection.commit()
    c.close()

    # test the function
    mkcursor = lambda: connection.cursor()
    actual = fromdb(mkcursor, 'select * from foobar')
    expect = (('foo', 'bar'),
              ('a', 1),
              ('b', 2),
              ('c', 2.0))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice

    # test iterators are isolated
    i1 = iter(actual)
    i2 = iter(actual)
    eq_(('foo', 'bar'), next(i1))
    eq_(('a', 1), next(i1))
    eq_(('foo', 'bar'), next(i2))
    eq_(('b', 2), next(i1))

def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
                      rprefix):
    lit = iter(left)
    rit = iter(right)

    lhdr = next(lit)
    rhdr = next(rit)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct a function to extract key values from the right table
    rgetk = operator.itemgetter(*rkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(text_type(lprefix) + text_type(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(text_type(rprefix) + text_type(f))
                       for f in rgetv(rhdr)])
    yield tuple(outhdr)

    # define a function to join rows
    def joinrows(_rrow, _lrows):
        for lrow in _lrows:
            # start with the left row
            _outrow = list(lrow)
            # extend with non-key values from the right row
            _outrow.extend(rgetv(_rrow))
            yield tuple(_outrow)

    for rrow in rit:
        k = rgetk(rrow)
        if k in llookup:
            lrows = llookup[k]
            for outrow in joinrows(rrow, lrows):
                yield outrow
        else:
            # start with missing values in place of the left row
            outrow = [missing] * len(lhdr)
            # set key values
            for li, ri in zip(lkind, rkind):
                outrow[li] = rrow[ri]
            # extend with non-key values from the right row
            outrow.extend(rgetv(rrow))
            yield tuple(outrow)

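# Hedged usage sketch (added for illustration): petl's public hashrightjoin
# is assumed to build llookup from the left table and delegate to the
# generator above; unmatched right rows get the missing value on the left.
def _demo_hashrightjoin():
    import petl as etl
    left = [['id', 'colour'], [1, 'blue'], [2, 'red']]
    right = [['id', 'shape'], [1, 'circle'], [3, 'square']]
    return etl.hashrightjoin(left, right, key='id')
    # -> (1, 'blue', 'circle') and (3, None, 'square')
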
def test_namedtuples():
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2))
    actual = namedtuples(table)
    it = iter(actual)
    o = next(it)
    eq_('a', o.foo)
    eq_(1, o.bar)
    o = next(it)
    eq_('b', o.foo)
    eq_(2, o.bar)

def test_namedtuples_unevenrows():
    table = (('foo', 'bar'),
             ('a', 1, True),
             ('b',))
    actual = namedtuples(table)
    it = iter(actual)
    o = next(it)
    eq_('a', o.foo)
    eq_(1, o.bar)
    o = next(it)
    eq_('b', o.foo)
    eq_(None, o.bar)

def push(self, ta, tb, limit=None):
    ita = iter(ta)
    itb = iter(tb)
    aflds = [str(f) for f in next(ita)]
    next(itb)  # ignore b fields
    default_connections, keyed_connections = self._connect_receivers(aflds)

    def _broadcast(*args):
        if len(args) == 1:
            for c in default_connections:
                c.accept(args[0])
        else:
            key, row = args
            if key in keyed_connections:
                for c in keyed_connections[key]:
                    c.accept(row)

    try:
        a = tuple(next(ita))
    except StopIteration:
        # a is empty, everything in b is added
        for b in itb:
            _broadcast('+', b)
    else:
        try:
            b = tuple(next(itb))
        except StopIteration:
            # b is empty, everything in a is subtracted
            _broadcast('-', a)
            for a in ita:
                _broadcast('-', a)
        else:
            # iterate until *both* tables are exhausted, so trailing rows on
            # either side are still broadcast
            while a is not None or b is not None:
                if b is None or (a is not None and a < b):
                    _broadcast('-', a)
                    # advance a
                    try:
                        a = tuple(next(ita))
                    except StopIteration:
                        a = None
                elif a == b:
                    _broadcast(a)  # default channel
                    # advance both
                    try:
                        a = tuple(next(ita))
                    except StopIteration:
                        a = None
                    try:
                        b = tuple(next(itb))
                    except StopIteration:
                        b = None
                else:
                    _broadcast('+', b)
                    # advance b
                    try:
                        b = tuple(next(itb))
                    except StopIteration:
                        b = None

def iterfilldown(table, fillfields, missing, where, anchorfields, until):
    # prepare where function
    if isinstance(where, string_types):
        where = expr(where)
    elif where is not None:
        assert callable(where), \
            'expected callable for "where" argument, found %r' % where
    else:
        where = lambda r: True  # default where callable returns True
    # prepare until function
    if isinstance(until, string_types):
        until = expr(until)
    elif until is not None:
        assert callable(until), \
            'expected callable for "until" argument, found %r' % until
    else:
        until = lambda r: False  # default until callable returns False
    # normal iter function
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    yield tuple(hdr)
    if not fillfields:  # fill down all fields
        fillfields = hdr
    fillindices = asindices(hdr, fillfields)
    if anchorfields:
        anchorindices = asindices(hdr, anchorfields)
    fill = list(next(it))  # fill values
    prev = fill
    untilfunctiontriggered = False
    yield tuple(fill)
    for row in it:
        outrow = list(row)
        if untilfunctiontriggered:
            fill = outrow
            untilfunctiontriggered = False  # reset
        if anchorfields:
            row_values = [row[i] for i in anchorindices]
            prev_values = [prev[i] for i in anchorindices]
            check_anchor = row_values == prev_values
        else:
            check_anchor = True
        # loop through fill-down fields
        for idx in fillindices:
            if row[idx] == missing and where(Record(row, flds)) \
                    and check_anchor:
                outrow[idx] = fill[idx]  # fill down
            elif row[idx] == missing and check_anchor:
                pass
            else:
                fill[idx] = row[idx]  # new fill value
        prev = outrow
        yield tuple(outrow)
        # found stop point, reset fill with next row's contents
        if until(Record(row, flds)):
            untilfunctiontriggered = True

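# Hedged usage sketch (added for illustration): petl's public filldown is
# assumed to delegate to a generator like the one above; note the where/
# anchorfields/until parameters may only exist in newer releases.
def _demo_filldown():
    import petl as etl
    table = [['foo', 'bar'],
             ['a', 1],
             [None, 2],
             [None, 3]]
    return etl.filldown(table, 'foo')  # both None gaps filled with 'a'
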
def iterhashintersection(a, b):
    ita = iter(a)
    ahdr = next(ita)
    yield tuple(ahdr)

    itb = iter(b)
    next(itb)  # discard b header, assume same as a

    # N.B., need to account for possibility of duplicate rows
    bcnt = Counter(tuple(row) for row in itb)
    for ar in ita:
        t = tuple(ar)
        if bcnt[t] > 0:
            yield t
            bcnt[t] -= 1

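# Hedged usage sketch (added for illustration): assuming this generator backs
# petl's public hashintersection, duplicates are kept multiset-style, so a row
# appears min(count in a, count in b) times.
def _demo_hashintersection():
    import petl as etl
    a = [['foo', 'bar'], ['A', 1], ['B', 2], ['B', 2], ['C', 7]]
    b = [['foo', 'bar'], ['B', 2], ['B', 2], ['B', 2], ['D', 9]]
    return etl.hashintersection(a, b)  # -> ('B', 2) twice, nothing else
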
def issorted(table, key=None, reverse=False, strict=False):
    """
    Return True if the table is ordered (i.e., sorted) by the given key. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['a', 1, True],
        ...           ['b', 3, True],
        ...           ['b', 2]]
        >>> etl.issorted(table1, key='foo')
        True
        >>> etl.issorted(table1, key='bar')
        False
        >>> etl.issorted(table1, key='foo', strict=True)
        False
        >>> etl.issorted(table1, key='foo', reverse=True)
        False

    """

    # determine the operator to use when comparing rows
    if reverse and strict:
        op = operator.lt
    elif reverse and not strict:
        op = operator.le
    elif strict:
        op = operator.gt
    else:
        op = operator.ge

    it = iter(table)
    flds = [text_type(f) for f in next(it)]
    if key is None:
        prev = next(it)
        for curr in it:
            if not op(curr, prev):
                return False
            prev = curr
    else:
        getkey = comparable_itemgetter(*asindices(flds, key))
        prev = next(it)
        prevkey = getkey(prev)
        for curr in it:
            currkey = getkey(curr)
            if not op(currkey, prevkey):
                return False
            prevkey = currkey
    return True

def merge(source: Any, name: str, from_names: list, sep: str = "-",
          preserve: bool = True) -> Iterator:
    it = iter(source)
    hdr = next(it)
    field_indexes = list()
    flds = list(map(text_type, hdr))

    # determine output fields
    outhdr = list(flds)
    for field in from_names:
        field_index = flds.index(field)
        if not preserve:
            outhdr.remove(field)
        field_indexes.append(field_index)
    outhdr.extend([name])
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = [v for i, v in enumerate(row) if i in field_indexes]
        if preserve:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i not in field_indexes]
        out_row.extend([sep.join(value)])
        yield tuple(out_row)

def iterhashcomplement(a, b, strict):
    ita = iter(a)
    ahdr = next(ita)
    yield tuple(ahdr)

    itb = iter(b)
    next(itb)  # discard b header, assume same as a

    # N.B., need to account for possibility of duplicate rows
    bcnt = Counter(tuple(row) for row in itb)
    for ar in ita:
        t = tuple(ar)
        if bcnt[t] > 0:
            if not strict:
                bcnt[t] -= 1
        else:
            yield t

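# Hedged usage sketch (added for illustration): assuming this generator backs
# petl's public hashcomplement, by default duplicates are counted separately,
# so each b row cancels at most one matching a row.
def _demo_hashcomplement():
    import petl as etl
    a = [['foo', 'bar'], ['A', 1], ['B', 2], ['B', 2], ['C', 7]]
    b = [['foo', 'bar'], ['B', 2]]
    return etl.hashcomplement(a, b)  # -> ('A', 1), ('B', 2), ('C', 7)
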
def itersplit(source, field, pattern, newfields, include_original, maxsplit,
              flags):

    it = iter(source)
    prog = re.compile(pattern, flags)

    hdr = next(it)
    flds = list(map(text_type, hdr))
    if isinstance(field, int) and field < len(hdr):
        field_index = field
        field = hdr[field_index]
    elif field in flds:
        field_index = flds.index(field)
    else:
        raise ArgumentError('field invalid: must be either field name or '
                            'index')

    # determine output fields
    outhdr = list(flds)
    if not include_original:
        outhdr.remove(field)
    if newfields:
        outhdr.extend(newfields)
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = row[field_index]
        if include_original:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i != field_index]
        out_row.extend(prog.split(value, maxsplit))
        yield tuple(out_row)

def iterselectusingcontext(table, query):
    it = iter(table)
    hdr = tuple(next(it))
    flds = list(map(text_type, hdr))
    yield hdr
    it = (Record(row, flds) for row in it)
    prv = None
    cur = next(it)
    for nxt in it:
        if query(prv, cur, nxt):
            yield cur
        prv = cur
        cur = nxt
    # handle last row
    if query(prv, cur, None):
        yield cur

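# Hedged usage sketch (added for illustration): assuming this generator backs
# petl's public selectusingcontext, the query sees the previous, current and
# next rows, so it can select e.g. local minima.
def _demo_selectusingcontext():
    import petl as etl
    table = [['foo', 'bar'], ['A', 1], ['B', 4], ['C', 2], ['D', 3]]

    def query(prv, cur, nxt):
        return ((prv is not None and cur.bar < prv.bar)
                and (nxt is not None and cur.bar < nxt.bar))

    return etl.selectusingcontext(table, query)  # -> ('C', 2)
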
def __iter__(self):
    it = iter(self.table)
    hdr = next(it)
    outhdr = tuple((text_type(f) + text_type(self.suffix)) for f in hdr)
    yield outhdr
    for row in it:
        yield row

def iteraddfieldusingcontext(table, field, query):
    it = iter(table)
    hdr = tuple(next(it))
    flds = list(map(text_type, hdr))
    yield hdr + (field,)
    it = (Record(row, flds) for row in it)
    prv = None
    cur = next(it)
    for nxt in it:
        v = query(prv, cur, nxt)
        yield tuple(cur) + (v,)
        prv = cur
        cur = nxt
    # handle last row
    v = query(prv, cur, None)
    yield tuple(cur) + (v,)

def iterpackdict(source: Any, name: str, from_names: list,
                 preserve: bool = False) -> Iterator:
    """Combines multiple columns as JSON Object"""
    it = iter(source)
    hdr = next(it)
    field_indexes = list()
    flds = list(map(text_type, hdr))

    # determine output fields
    outhdr = list(flds)
    for field in from_names:
        field_index = flds.index(field)
        if not preserve:
            outhdr.remove(field)
        field_indexes.append(field_index)
    outhdr.extend([name])
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        # pair each source field name with its value; indexing from_names by
        # row position would misalign whenever the packed columns are not
        # contiguous starting at index 1
        value = dict(zip(from_names, (row[i] for i in field_indexes)))
        if preserve:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i not in field_indexes]
        out_row.extend([value])
        yield tuple(out_row)

def itercat(sources, missing, header):
    its = [iter(t) for t in sources]
    hdrs = [list(next(it)) for it in its]

    if header is None:
        # determine output fields by gathering all fields found in the sources
        outhdr = list(hdrs[0])
        for hdr in hdrs[1:]:
            for h in hdr:
                if h not in outhdr:
                    # add any new fields as we find them
                    outhdr.append(h)
    else:
        # predetermined output fields
        outhdr = header
    yield tuple(outhdr)

    # output data rows
    for hdr, it in zip(hdrs, its):
        # now construct and yield the data rows
        for row in it:
            outrow = list()
            for h in outhdr:
                val = missing
                try:
                    val = row[hdr.index(h)]
                except IndexError:
                    # short row
                    pass
                except ValueError:
                    # field not in table
                    pass
                outrow.append(val)
            yield tuple(outrow)

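# Hedged usage sketch (added for illustration): assuming this generator backs
# petl's public cat, tables with different headers are concatenated under the
# union of their fields, with gaps padded by the missing value.
def _demo_cat():
    import petl as etl
    t1 = [['foo', 'bar'], [1, 'A'], [2, 'B']]
    t2 = [['bar', 'baz'], ['C', True]]
    return etl.cat(t1, t2)  # header (foo, bar, baz); absent cells -> None
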
def iteraddfield(source, field, value, index):
    it = iter(source)
    hdr = next(it)
    flds = list(map(text_type, hdr))

    # determine index of new field
    if index is None:
        index = len(hdr)

    # construct output fields
    outhdr = list(hdr)
    outhdr.insert(index, field)
    yield tuple(outhdr)

    if callable(value):
        # wrap rows as records if using calculated value
        it = (Record(row, flds) for row in it)
        for row in it:
            outrow = list(row)
            v = value(row)
            outrow.insert(index, v)
            yield tuple(outrow)
    else:
        for row in it:
            outrow = list(row)
            outrow.insert(index, value)
            yield tuple(outrow)

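# Hedged usage sketch (added for illustration): assuming this generator backs
# petl's public addfield, the new value may be a constant or a function of
# the row.
def _demo_addfield():
    import petl as etl
    table = [['foo', 'bar'], ['M', 12], ['F', 34]]
    return etl.addfield(table, 'baz', lambda row: row.bar * 2)
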
def iterpack(source: Any, name: str, from_names: list,
             preserve: bool = False) -> Iterator:
    """Combines multiple columns as array

    Code partially referenced from
    https://github.com/petl-developers/petl/blob/master/petl/transform/unpacks.py#L64
    """
    it = iter(source)
    hdr = next(it)
    field_indexes = list()
    flds = list(map(text_type, hdr))

    # determine output fields
    outhdr = list(flds)
    for field in from_names:
        field_index = flds.index(field)
        if not preserve:
            outhdr.remove(field)
        field_indexes.append(field_index)
    outhdr.extend([name])
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = [v for i, v in enumerate(row) if i in field_indexes]
        if preserve:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i not in field_indexes]
        out_row.extend([value])
        yield tuple(out_row)

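# Hedged usage sketch (added for illustration): calling the generator above
# directly; text_type is assumed imported from petl.compat at module level.
def _demo_iterpack():
    table = [['foo', 'bar', 'baz'], [1, 'A', True], [2, 'B', False]]
    return list(iterpack(table, 'pair', ['bar', 'baz']))
    # -> [('foo', 'pair'), (1, ['A', True]), (2, ['B', False])]
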
def push(self, source, limit=None):
    it = iter(source)
    fields = next(it)
    c = self.connect(fields)
    for row in islice(it, limit):
        c.accept(tuple(row))
    c.close()

def groupselectfirst(table, key, presorted=False, buffersize=None,
                     tempdir=None, cache=True):
    """Group by the `key` field then return the first row within each
    group."""
    _reducer = lambda k, rows: next(rows)
    return rowreduce(table, key, reducer=_reducer, presorted=presorted,
                     buffersize=buffersize, tempdir=tempdir, cache=cache)

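# Hedged usage sketch (added for illustration): groupselectfirst is exposed in
# petl's public API and delegates to rowreduce as above.
def _demo_groupselectfirst():
    import petl as etl
    table = [['foo', 'bar'], ['A', 1], ['A', 9], ['B', 2], ['B', 3]]
    return etl.groupselectfirst(table, key='foo')  # -> ('A', 1), ('B', 2)
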
def __iter__(self):
    it = iter(self.table)
    hdr = next(it)
    outhdr = tuple((text_type(self.prefix) + text_type(f)) for f in hdr)
    yield outhdr
    for row in it:
        yield row

def itersearch(table, pattern, field, flags, complement):
    prog = re.compile(pattern, flags)
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    yield tuple(hdr)

    if field is None:
        # search whole row
        test = lambda r: any(prog.search(text_type(v)) for v in r)
    else:
        indices = asindices(hdr, field)
        if len(indices) == 1:
            index = indices[0]
            test = lambda r: prog.search(text_type(r[index]))
        else:
            getvals = operator.itemgetter(*indices)
            test = lambda r: any(prog.search(text_type(v))
                                 for v in getvals(r))

    # complement==False, return rows that match
    if not complement:
        for row in it:
            if test(row):
                yield tuple(row)
    # complement==True, return rows that do not match
    else:
        for row in it:
            if not test(row):
                yield tuple(row)

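# Hedged usage sketch (added for illustration): assuming this generator backs
# petl's public search/searchcomplement wrappers.
def _demo_search():
    import petl as etl
    table = [['foo', 'bar'], ['orange', 12], ['banana', 74], ['cherry', 1]]
    return etl.search(table, 'an')  # rows with any value matching the regex
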
def itermultiaggregate(source, key, aggregation):
    aggregation = OrderedDict(aggregation.items())  # take a copy
    it = iter(source)
    hdr = next(it)
    # push back header to ensure we iterate only once
    it = itertools.chain([hdr], it)

    # normalise aggregators
    for outfld in aggregation:
        agg = aggregation[outfld]
        if callable(agg):
            aggregation[outfld] = None, agg
        elif isinstance(agg, string_types):
            aggregation[outfld] = agg, list  # list is default
        elif len(agg) == 1 and isinstance(agg[0], string_types):
            aggregation[outfld] = agg[0], list  # list is default
        elif len(agg) == 1 and callable(agg[0]):
            aggregation[outfld] = None, agg[0]  # aggregate whole rows
        elif len(agg) == 2:
            pass  # no need to normalise
        else:
            raise ArgumentError('invalid aggregation: %r, %r'
                                % (outfld, agg))

    # determine output header
    if isinstance(key, (list, tuple)):
        outhdr = list(key)
    elif callable(key):
        outhdr = ['key']
    else:
        outhdr = [key]
    for outfld in aggregation:
        outhdr.append(outfld)
    yield tuple(outhdr)

    # generate data
    for k, rows in rowgroupby(it, key):
        rows = list(rows)  # may need to iterate over these more than once
        # handle compound key
        if isinstance(key, (list, tuple)):
            outrow = list(k)
        else:
            outrow = [k]
        for outfld in aggregation:
            srcfld, aggfun = aggregation[outfld]
            if srcfld is None:
                aggval = aggfun(rows)
                outrow.append(aggval)
            elif isinstance(srcfld, (list, tuple)):
                idxs = [hdr.index(f) for f in srcfld]
                valgetter = operator.itemgetter(*idxs)
                vals = (valgetter(row) for row in rows)
                aggval = aggfun(vals)
                outrow.append(aggval)
            else:
                idx = hdr.index(srcfld)
                # try using generator comprehension
                vals = (row[idx] for row in rows)
                aggval = aggfun(vals)
                outrow.append(aggval)
        yield tuple(outrow)

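# Hedged usage sketch (added for illustration): assuming this generator backs
# petl's public aggregate when given a mapping of output field to aggregator;
# a bare callable receives whole rows, a (field, reducer) pair receives the
# values of that field, matching the normalisation above.
def _demo_aggregate():
    import petl as etl
    from collections import OrderedDict
    table = [['foo', 'bar'], ['a', 3], ['a', 7], ['b', 2]]
    aggregation = OrderedDict()
    aggregation['count'] = len          # whole-row aggregator
    aggregation['sumbar'] = 'bar', sum  # source field + reducer
    return etl.aggregate(table, 'foo', aggregation)
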
def toxls(tbl, filename, sheet, encoding=None, style_compression=0,
          styles=None):
    """
    Write a table to a new Excel .xls file.

    """
    import xlwt
    if encoding is None:
        encoding = locale.getpreferredencoding()
    wb = xlwt.Workbook(encoding=encoding, style_compression=style_compression)
    ws = wb.add_sheet(sheet)

    if styles is None:
        # simple version, don't worry about styles
        for r, row in enumerate(tbl):
            for c, v in enumerate(row):
                ws.write(r, c, label=v)
    else:
        # handle styles
        it = iter(tbl)
        hdr = next(it)
        flds = list(map(str, hdr))
        for c, f in enumerate(flds):
            ws.write(0, c, label=f)
            if f not in styles or styles[f] is None:
                styles[f] = xlwt.Style.default_style
        # convert to list for easy zipping
        styles = [styles[f] for f in flds]
        for r, row in enumerate(it):
            for c, (v, style) in enumerate(izip_longest(row, styles,
                                                        fillvalue=None)):
                ws.write(r+1, c, label=v, style=style)

    wb.save(filename)

def columns(table, missing=None):
    """
    Construct a :class:`dict` mapping field names to lists of values. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar'], ['a', 1], ['b', 2], ['b', 3]]
        >>> cols = etl.columns(table)
        >>> cols['foo']
        ['a', 'b', 'b']
        >>> cols['bar']
        [1, 2, 3]

    See also :func:`petl.util.materialise.facetcolumns`.

    """
    cols = OrderedDict()
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    for f in flds:
        cols[f] = list()
    for row in it:
        for f, v in izip_longest(flds, row, fillvalue=missing):
            if f in cols:
                cols[f].append(v)
    return cols

def itervalues(table, field, **kwargs):
    missing = kwargs.get('missing', None)
    it = iter(table)
    hdr = next(it)
    indices = asindices(hdr, field)
    assert len(indices) > 0, 'no field selected'
    getvalue = operator.itemgetter(*indices)
    for row in it:
        try:
            value = getvalue(row)
            yield value
        except IndexError:
            if len(indices) > 1:
                # try one at a time
                value = list()
                for i in indices:
                    if i < len(row):
                        value.append(row[i])
                    else:
                        value.append(missing)
                yield tuple(value)
            else:
                yield missing

def iterpeek(it, n=1):
    it = iter(it)  # make sure it's an iterator
    if n == 1:
        peek = next(it)
        return peek, chain([peek], it)
    else:
        peek = list(islice(it, n))
        return peek, chain(peek, it)

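# Hedged usage sketch (added for illustration): iterpeek lets you inspect the
# first n items without consuming the iterator it hands back.
def _demo_iterpeek():
    first, it = iterpeek(iter('abc'))           # peek a single item
    head, it2 = iterpeek(iter(range(5)), n=3)   # peek several items
    return first, list(it), head, list(it2)
    # -> ('a', ['a', 'b', 'c'], [0, 1, 2], [0, 1, 2, 3, 4])
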
def iterdicts(table, *sliceargs, **kwargs):
    missing = kwargs.get('missing', None)
    it = iter(table)
    hdr = next(it)
    if sliceargs:
        it = islice(it, *sliceargs)
    for row in it:
        yield asdict(hdr, row, missing)

def __getitem__(self, item):
    if isinstance(item, int):
        try:
            return next(islice(self, item, item + 1))
        except StopIteration:
            raise IndexError('index out of range')
    elif isinstance(item, slice):
        return islice(self, item.start, item.stop, item.step)

def iterextendheader(source, fields):
    it = iter(source)
    hdr = next(it)
    outhdr = list(hdr)
    outhdr.extend(fields)
    yield tuple(outhdr)
    for row in it:
        yield tuple(row)

def iterfilldown(table, fillfields, missing):
    it = iter(table)
    hdr = next(it)
    yield tuple(hdr)
    if not fillfields:  # fill down all fields
        fillfields = hdr
    fillindices = asindices(hdr, fillfields)
    fill = list(next(it))  # fill values
    yield tuple(fill)
    for row in it:
        outrow = list(row)
        for idx in fillindices:
            if row[idx] == missing:
                outrow[idx] = fill[idx]  # fill down
            else:
                fill[idx] = row[idx]  # new fill value
        yield tuple(outrow)