def iterantijoin(left, right, lkey, rkey):
    """Low-level antijoin generator: yield the left table's header, then
    every left row whose key value has no match in the right table.

    N.B., both inputs are merged via :func:`itertools.groupby`, so this
    assumes both tables are already sorted by their respective keys —
    TODO confirm callers guarantee this.
    """
    lit = iter(left)
    rit = iter(right)

    # consume headers; only the left header appears in the output
    lhdr = next(lit)
    rhdr = next(rit)
    yield tuple(lhdr)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from both tables
    lgetk = comparable_itemgetter(*lkind)
    rgetk = comparable_itemgetter(*rkind)

    # construct group iterators for both tables
    lgit = itertools.groupby(lit, key=lgetk)
    rgit = itertools.groupby(rit, key=rgetk)
    lrowgrp = []

    # loop until *either* of the iterators is exhausted
    # (Comparable(None) initial keys handle empty inputs gracefully)
    lkval, rkval = Comparable(None), Comparable(None)
    try:
        # pick off initial row groups
        lkval, lrowgrp = next(lgit)
        rkval, _ = next(rgit)
        while True:
            if lkval < rkval:
                # left key unmatched on the right -> emit the group
                for row in lrowgrp:
                    yield tuple(row)
                # advance left
                lkval, lrowgrp = next(lgit)
            elif lkval > rkval:
                # advance right
                rkval, _ = next(rgit)
            else:
                # keys match -> suppress this left group; advance both
                lkval, lrowgrp = next(lgit)
                rkval, _ = next(rgit)
    except StopIteration:
        pass

    # any left over?
    if lkval > rkval:
        # right side exhausted with a left group still pending and its key
        # beyond the last right key -> it cannot match, so emit it
        # yield anything that got left hanging
        for row in lrowgrp:
            yield tuple(row)
    # and the rest...
    for lkval, lrowgrp in lgit:
        for row in lrowgrp:
            yield tuple(row)
def rowgroupby(table, key, value=None):
    """Convenient adapter for :func:`itertools.groupby`. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['a', 1, True],
        ...           ['b', 3, True],
        ...           ['b', 2]]
        >>> # group entire rows
        ... for key, group in etl.rowgroupby(table1, 'foo'):
        ...     print(key, list(group))
        ...
        a [('a', 1, True)]
        b [('b', 3, True), ('b', 2)]
        >>> # group specific values
        ... for key, group in etl.rowgroupby(table1, 'foo', 'bar'):
        ...     print(key, list(group))
        ...
        a [1]
        b [3, 2]

    N.B., assumes the input table is already sorted by the given key.

    """
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    # present data rows as Records so a callable key can address fields
    # by name
    it = (Record(row, flds) for row in it)

    # build the grouping key function; a callable key is used as-is and
    # yields "native" (unwrapped) group keys
    native_key = callable(key)
    if native_key:
        getkey = key
    else:
        getkey = comparable_itemgetter(*asindices(hdr, key))

    git = groupby(it, key=getkey)

    # build the value extractor, if grouping over specific values
    if value is None:
        getval = None
    elif callable(value):
        getval = value
    else:
        getval = operator.itemgetter(*asindices(hdr, value))

    if getval is None:
        # group entire rows; unwrap comparable keys back to their inner
        # values unless the caller supplied a native key function
        if native_key:
            return git
        return ((k.inner, vals) for (k, vals) in git)

    # group extracted values
    if native_key:
        return ((k, (getval(v) for v in vals)) for (k, vals) in git)
    return ((k.inner, (getval(v) for v in vals)) for (k, vals) in git)
def itermergesort(sources, key, header, missing, reverse):
    """Low-level generator merging multiple already-sorted sources into a
    single sorted stream, standardising headers across sources first.

    Short rows are padded with `missing`; `header`, if given, fixes the
    output fields instead of unioning the source headers.
    """
    # first need to standardise headers of all input tables
    # borrow this from itercat - TODO remove code smells
    its = [iter(t) for t in sources]
    src_hdrs = [next(it) for it in its]

    if header is None:
        # determine output fields by gathering all fields found in the
        # sources
        outhdr = list()
        for hdr in src_hdrs:
            for f in list(map(text_type, hdr)):
                if f not in outhdr:
                    # add any new fields as we find them
                    outhdr.append(f)
    else:
        # predetermined output fields
        outhdr = header
    yield tuple(outhdr)

    def _standardisedata(it, hdr, ofs):
        # re-map each source row onto the output fields `ofs`, filling
        # absent fields with `missing`
        flds = list(map(text_type, hdr))
        # now construct and yield the data rows
        for _row in it:
            try:
                # should be quickest to do this way
                yield tuple(_row[flds.index(fo)] if fo in flds else missing
                            for fo in ofs)
            except IndexError:
                # handle short rows
                outrow = [missing] * len(ofs)
                for i, fi in enumerate(flds):
                    try:
                        outrow[ofs.index(fi)] = _row[i]
                    except IndexError:
                        pass  # be relaxed about short rows
                yield tuple(outrow)

    # wrap all iterators to standardise fields
    sits = [_standardisedata(it, hdr, outhdr)
            for hdr, it in zip(src_hdrs, its)]

    # now determine key function
    getkey = None
    if key is not None:
        # convert field selection into field indices
        indices = asindices(outhdr, key)
        # now use field indices to construct a _getkey function
        # N.B., this will probably raise an exception on short rows
        getkey = comparable_itemgetter(*indices)

    # OK, do the merge sort
    for row in _shortlistmergesorted(getkey, reverse, *sits):
        yield row
def issorted(table, key=None, reverse=False, strict=False):
    """
    Return True if the table is ordered (i.e., sorted) by the given key.
    E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['a', 1, True],
        ...           ['b', 3, True],
        ...           ['b', 2]]
        >>> etl.issorted(table1, key='foo')
        True
        >>> etl.issorted(table1, key='bar')
        False
        >>> etl.issorted(table1, key='foo', strict=True)
        False
        >>> etl.issorted(table1, key='foo', reverse=True)
        False

    A table with no data rows (or no rows at all) is trivially sorted and
    returns True.

    """

    # determine the operator to use when comparing rows
    if reverse and strict:
        op = operator.lt
    elif reverse and not strict:
        op = operator.le
    elif strict:
        op = operator.gt
    else:
        op = operator.ge

    it = iter(table)
    # robustness fix: previously a missing header or an empty body caused a
    # bare next(it) to raise StopIteration out of this function; treat such
    # tables as trivially sorted instead
    _end = object()  # sentinel; rows themselves could legitimately be falsy
    hdr = next(it, _end)
    if hdr is _end:
        return True
    flds = [text_type(f) for f in hdr]

    if key is None:
        prev = next(it, _end)
        if prev is _end:
            return True
        for curr in it:
            if not op(curr, prev):
                return False
            prev = curr
    else:
        getkey = comparable_itemgetter(*asindices(flds, key))
        prev = next(it, _end)
        if prev is _end:
            return True
        prevkey = getkey(prev)
        for curr in it:
            currkey = getkey(curr)
            if not op(currkey, prevkey):
                return False
            prevkey = currkey
    return True
def __init__(self, default_connections, keyed_connections, fields, key,
             reverse, buffersize):
    """Set up sort state on top of the base connection."""
    super(SortConnection, self).__init__(default_connections,
                                         keyed_connections, fields)
    if key is None:
        self.getkey = None
    else:
        # convert field selection into field indices and build a key
        # function usable with mixed-type values
        # N.B., this will probably raise an exception on short rows
        self.getkey = comparable_itemgetter(*asindices(fields, key))
    self.reverse = reverse
    # fall back to the library-wide default buffer size if none given
    self.buffersize = (petl.config.sort_buffersize
                       if buffersize is None
                       else buffersize)
    self.cache = []
    self.chunkfiles = []
def rowitemgetter(hdr, spec):
    """Return a key function extracting from each row the field(s) selected
    by `spec`, resolved against the header `hdr`."""
    return comparable_itemgetter(*asindices(hdr, spec))
def _iternocache(self, source, key, reverse):
    """Sort `source` by `key` without reading from cache: sort in memory if
    the data fits within ``self.buffersize``, otherwise spill sorted chunks
    to temporary pickle files and merge them.

    Yields the header first, then sorted data rows as tuples. Populates the
    instance caches (``_hdrcache``/``_memcache``/``_filecache``/``_getkey``)
    when ``self.cache`` is set.
    """
    debug('iterate without cache')
    self.clearcache()
    it = iter(source)
    hdr = next(it)
    yield tuple(hdr)
    if key is not None:
        # convert field selection into field indices
        indices = asindices(hdr, key)
    else:
        # no key -> sort on all fields, left to right
        indices = range(len(hdr))
    # now use field indices to construct a _getkey function
    # TODO check if this raises an exception on short rows
    getkey = comparable_itemgetter(*indices)
    # TODO support native comparison

    # initialise the first chunk
    rows = list(itertools.islice(it, 0, self.buffersize))
    rows.sort(key=getkey, reverse=reverse)

    # have we exhausted the source iterator?
    if self.buffersize is None or len(rows) < self.buffersize:
        # yes, table fits within sort buffer
        if self.cache:
            debug('caching mem')
            self._hdrcache = hdr
            self._memcache = rows
            # actually not needed to iterate from memcache
            self._getkey = getkey
        for row in rows:
            yield tuple(row)
    else:
        # no, table is too big, need to sort in chunks
        chunkfiles = []
        while rows:
            # dump the chunk
            with NamedTemporaryFile(dir=self.tempdir, delete=False,
                                    mode='wb') as f:
                # N.B., we **don't** want the file to be deleted on close,
                # but we **do** want the file to be deleted when self
                # is garbage collected, or when the program exits. When
                # all references to the wrapper are gone, the file should
                # get deleted.
                wrapper = _NamedTempFileDeleteOnGC(f.name)
                debug('created temporary chunk file %s' % f.name)
                for row in rows:
                    pickle.dump(row, f, protocol=-1)
                f.flush()
                chunkfiles.append(wrapper)
            # grab the next chunk
            rows = list(itertools.islice(it, 0, self.buffersize))
            rows.sort(key=getkey, reverse=reverse)
        if self.cache:
            debug('caching files')
            self._hdrcache = hdr
            self._filecache = chunkfiles
            self._getkey = getkey
        # k-way merge of the sorted chunk files
        chunkiters = [_iterchunk(f.name) for f in chunkfiles]
        for row in _mergesorted(getkey, reverse, *chunkiters):
            yield tuple(row)
def iterjoin(left, right, lkey, rkey, leftouter=False, rightouter=False,
             missing=None, lprefix=None, rprefix=None):
    """Low-level merge-join generator over two tables. Inner join by
    default; `leftouter`/`rightouter` emit unmatched rows padded with
    `missing`. `lprefix`/`rprefix` optionally prefix output field names.

    N.B., merged via :func:`itertools.groupby`, so this assumes both
    inputs are already sorted by their respective keys — TODO confirm
    callers guarantee this.
    """
    lit = iter(left)
    rit = iter(right)

    lhdr = next(lit)
    rhdr = next(rit)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from both tables
    lgetk = comparable_itemgetter(*lkind)
    rgetk = comparable_itemgetter(*rkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(text_type(lprefix) + text_type(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(text_type(rprefix) + text_type(f))
                       for f in rgetv(rhdr)])
    yield tuple(outhdr)

    # define a function to join two groups of rows
    def joinrows(_lrowgrp, _rrowgrp):
        if _rrowgrp is None:
            # left-outer case: no matching right group
            for lrow in _lrowgrp:
                outrow = list(lrow)  # start with the left row
                # extend with missing values in place of the right row
                outrow.extend([missing] * len(rvind))
                yield tuple(outrow)
        elif _lrowgrp is None:
            # right-outer case: no matching left group
            for rrow in _rrowgrp:
                # start with missing values in place of the left row
                outrow = [missing] * len(lhdr)
                # set key values
                for li, ri in zip(lkind, rkind):
                    outrow[li] = rrow[ri]
                # extend with non-key values from the right row
                outrow.extend(rgetv(rrow))
                yield tuple(outrow)
        else:
            # matching groups: cartesian product of left and right rows
            _rrowgrp = list(_rrowgrp)  # may need to iterate more than once
            for lrow in _lrowgrp:
                for rrow in _rrowgrp:
                    # start with the left row
                    outrow = list(lrow)
                    # extend with non-key values from the right row
                    outrow.extend(rgetv(rrow))
                    yield tuple(outrow)

    # construct group iterators for both tables
    lgit = itertools.groupby(lit, key=lgetk)
    rgit = itertools.groupby(rit, key=rgetk)
    lrowgrp = []
    rrowgrp = []

    # loop until *either* of the iterators is exhausted
    # initialise here to handle empty tables
    lkval, rkval = Comparable(None), Comparable(None)
    try:
        # pick off initial row groups
        lkval, lrowgrp = next(lgit)
        rkval, rrowgrp = next(rgit)
        while True:
            if lkval < rkval:
                if leftouter:
                    for row in joinrows(lrowgrp, None):
                        yield tuple(row)
                # advance left
                lkval, lrowgrp = next(lgit)
            elif lkval > rkval:
                if rightouter:
                    for row in joinrows(None, rrowgrp):
                        yield tuple(row)
                # advance right
                rkval, rrowgrp = next(rgit)
            else:
                for row in joinrows(lrowgrp, rrowgrp):
                    yield tuple(row)
                # advance both
                lkval, lrowgrp = next(lgit)
                rkval, rrowgrp = next(rgit)
    except StopIteration:
        pass

    # make sure any left rows remaining are yielded
    if leftouter:
        if lkval > rkval:
            # yield anything that got left hanging
            for row in joinrows(lrowgrp, None):
                yield tuple(row)
        # yield the rest
        for lkval, lrowgrp in lgit:
            for row in joinrows(lrowgrp, None):
                yield tuple(row)

    # make sure any right rows remaining are yielded
    if rightouter:
        if lkval < rkval:
            # yield anything that got left hanging
            for row in joinrows(None, rrowgrp):
                yield tuple(row)
        # yield the rest
        for rkval, rrowgrp in rgit:
            for row in joinrows(None, rrowgrp):
                yield tuple(row)
def iterrecast(source, key, variablefield, valuefield, samplesize, reducers,
               missing):
    """Low-level recast (long-to-wide pivot) generator: distinct values of
    the variable field(s) become output fields, filled from the value field,
    grouped by the key field(s). Multiple values per cell are combined via
    `reducers` (falling back to a plain list); empty cells get `missing`.
    """
    # TODO only make one pass through the data

    it = iter(source)
    hdr = next(it)
    flds = list(map(text_type, hdr))

    # normalise some stuff
    keyfields = key
    variablefields = variablefield  # N.B., could be more than one

    # normalise key fields
    if keyfields and not isinstance(keyfields, (list, tuple)):
        keyfields = (keyfields,)

    # normalise variable fields
    if variablefields:
        if isinstance(variablefields, dict):
            pass  # handle this later
        elif not isinstance(variablefields, (list, tuple)):
            variablefields = (variablefields,)

    # infer key fields
    if not keyfields:
        # assume keyfields is fields not in variables
        keyfields = [f for f in flds
                     if f not in variablefields and f != valuefield]

    # infer key fields
    if not variablefields:
        # assume variables are fields not in keyfields
        variablefields = [f for f in flds
                          if f not in keyfields and f != valuefield]

    # sanity checks
    assert valuefield in flds, 'invalid value field: %s' % valuefield
    assert valuefield not in keyfields, 'value field cannot be keyfields'
    assert valuefield not in variablefields, \
        'value field cannot be variable field'
    for f in keyfields:
        assert f in flds, 'invalid keyfields field: %s' % f
    for f in variablefields:
        assert f in flds, 'invalid variable field: %s' % f

    # we'll need these later
    valueindex = flds.index(valuefield)
    keyindices = [flds.index(f) for f in keyfields]
    variableindices = [flds.index(f) for f in variablefields]

    # determine the actual variable names to be cast as fields
    if isinstance(variablefields, dict):
        # user supplied dictionary
        variables = variablefields
    else:
        variables = collections.defaultdict(set)
        # sample the data to discover variables to be cast as fields
        for row in itertools.islice(it, 0, samplesize):
            for i, f in zip(variableindices, variablefields):
                variables[f].add(row[i])
        for f in variables:
            # turn from sets to sorted lists
            variables[f] = sorted(variables[f])

    # finished the first pass

    # determine the output fields
    outhdr = list(keyfields)
    for f in variablefields:
        outhdr.extend(variables[f])
    yield tuple(outhdr)

    # output data

    # second pass: re-read the source sorted by key so groupby works
    source = sort(source, key=keyfields)
    it = itertools.islice(source, 1, None)  # skip header row

    getsortablekey = comparable_itemgetter(*keyindices)
    getactualkey = operator.itemgetter(*keyindices)

    # process sorted data in newfields
    groups = itertools.groupby(it, key=getsortablekey)
    for _, group in groups:
        # may need to iterate over the group more than once
        group = list(group)
        # N.B., key returned by groupby may be wrapped as SortableItem, we
        # want to output the actual key value, get it from the first row in
        # the group
        key_value = getactualkey(group[0])
        if len(keyfields) > 1:
            out_row = list(key_value)
        else:
            out_row = [key_value]
        for f, i in zip(variablefields, variableindices):
            for variable in variables[f]:
                # collect all values for the current variable
                vals = [r[valueindex] for r in group if r[i] == variable]
                if len(vals) == 0:
                    val = missing
                elif len(vals) == 1:
                    val = vals[0]
                else:
                    if variable in reducers:
                        redu = reducers[variable]
                    else:
                        redu = list  # list all values
                    val = redu(vals)
                out_row.append(val)
        yield tuple(out_row)
def iterjoin(left, right, lkey, rkey, leftouter=False, rightouter=False,
             missing=None, lprefix=None, rprefix=None):
    """Low-level merge-join generator over two tables. Inner join by
    default; `leftouter`/`rightouter` emit unmatched rows padded with
    `missing`. `lprefix`/`rprefix` optionally prefix output field names.

    N.B., merged via :func:`itertools.groupby`, so this assumes both
    inputs are already sorted by their respective keys — TODO confirm
    callers guarantee this.

    NOTE(review): this copy builds prefixed field names with ``str`` while
    a sibling implementation uses ``text_type`` — confirm whether the
    difference is intentional (Python 2 unicode handling).
    """
    lit = iter(left)
    rit = iter(right)

    lhdr = next(lit)
    rhdr = next(rit)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from both tables
    lgetk = comparable_itemgetter(*lkind)
    rgetk = comparable_itemgetter(*rkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(str(lprefix) + str(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(str(rprefix) + str(f)) for f in rgetv(rhdr)])
    yield tuple(outhdr)

    # define a function to join two groups of rows
    def joinrows(_lrowgrp, _rrowgrp):
        if _rrowgrp is None:
            # left-outer case: no matching right group
            for lrow in _lrowgrp:
                outrow = list(lrow)  # start with the left row
                # extend with missing values in place of the right row
                outrow.extend([missing] * len(rvind))
                yield tuple(outrow)
        elif _lrowgrp is None:
            # right-outer case: no matching left group
            for rrow in _rrowgrp:
                # start with missing values in place of the left row
                outrow = [missing] * len(lhdr)
                # set key values
                for li, ri in zip(lkind, rkind):
                    outrow[li] = rrow[ri]
                # extend with non-key values from the right row
                outrow.extend(rgetv(rrow))
                yield tuple(outrow)
        else:
            # matching groups: cartesian product of left and right rows
            _rrowgrp = list(_rrowgrp)  # may need to iterate more than once
            for lrow in _lrowgrp:
                for rrow in _rrowgrp:
                    # start with the left row
                    outrow = list(lrow)
                    # extend with non-key values from the right row
                    outrow.extend(rgetv(rrow))
                    yield tuple(outrow)

    # construct group iterators for both tables
    lgit = itertools.groupby(lit, key=lgetk)
    rgit = itertools.groupby(rit, key=rgetk)
    lrowgrp = []
    rrowgrp = []

    # loop until *either* of the iterators is exhausted
    # initialise here to handle empty tables
    lkval, rkval = Comparable(None), Comparable(None)
    try:
        # pick off initial row groups
        lkval, lrowgrp = next(lgit)
        rkval, rrowgrp = next(rgit)
        while True:
            if lkval < rkval:
                if leftouter:
                    for row in joinrows(lrowgrp, None):
                        yield tuple(row)
                # advance left
                lkval, lrowgrp = next(lgit)
            elif lkval > rkval:
                if rightouter:
                    for row in joinrows(None, rrowgrp):
                        yield tuple(row)
                # advance right
                rkval, rrowgrp = next(rgit)
            else:
                for row in joinrows(lrowgrp, rrowgrp):
                    yield tuple(row)
                # advance both
                lkval, lrowgrp = next(lgit)
                rkval, rrowgrp = next(rgit)
    except StopIteration:
        pass

    # make sure any left rows remaining are yielded
    if leftouter:
        if lkval > rkval:
            # yield anything that got left hanging
            for row in joinrows(lrowgrp, None):
                yield tuple(row)
        # yield the rest
        for lkval, lrowgrp in lgit:
            for row in joinrows(lrowgrp, None):
                yield tuple(row)

    # make sure any right rows remaining are yielded
    if rightouter:
        if lkval < rkval:
            # yield anything that got left hanging
            for row in joinrows(None, rrowgrp):
                yield tuple(row)
        # yield the rest
        for rkval, rrowgrp in rgit:
            for row in joinrows(None, rrowgrp):
                yield tuple(row)