def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
                      rprefix):
    """Generate the rows of a hash-based right outer join.

    The first item of ``left`` and of ``right`` is taken as the header row.
    Every remaining row of ``right`` is emitted at least once: joined to each
    row stored under its key in ``llookup``, or padded with ``missing`` when
    its key is not in ``llookup``.

    :param left: left table; only its header row is read here
    :param right: right table; its data rows drive the output
    :param lkey: key field selection for the left table (given to asindices)
    :param rkey: key field selection for the right table (given to asindices)
    :param missing: filler for left-hand fields of unmatched right rows
    :param llookup: container supporting ``in`` and ``[]`` that maps a key
        value to an iterable of left rows
    :param lprefix: if not None, prepended (as text) to every left field name
    :param rprefix: if not None, prepended (as text) to every right non-key
        field name
    """
    left_iter = iter(left)
    right_iter = iter(right)
    left_header = next(left_iter)
    right_header = next(right_iter)

    # positions of the key fields in each table
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractor for rows of the *right* table
    right_key_of = operator.itemgetter(*right_key_pos)

    # right-hand key fields are not repeated in the output, so only the
    # remaining right-hand positions are carried over
    right_value_pos = [pos for pos in range(len(right_header))
                       if pos not in right_key_pos]
    right_values_of = rowgetter(*right_value_pos)

    # output header: left fields followed by right non-key fields
    if lprefix is not None:
        header = [(text_type(lprefix) + text_type(name))
                  for name in left_header]
    else:
        header = list(left_header)
    right_names = right_values_of(right_header)
    if rprefix is not None:
        right_names = [(text_type(rprefix) + text_type(name))
                       for name in right_names]
    header.extend(right_names)
    yield tuple(header)

    left_width = len(left_header)
    for right_row in right_iter:
        key_value = right_key_of(right_row)
        if key_value in llookup:
            # one output row per matching left row
            for left_row in llookup[key_value]:
                yield tuple(left_row) + tuple(right_values_of(right_row))
        else:
            # no match: blank left side, but keep the key values visible
            # in the left-hand key columns
            padded = [missing] * left_width
            for left_pos, right_pos in zip(left_key_pos, right_key_pos):
                padded[left_pos] = right_row[right_pos]
            padded.extend(right_values_of(right_row))
            yield tuple(padded)
def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
                      rprefix):
    """Generate the rows of a hash-based right outer join.

    The first item of ``left`` and of ``right`` is taken as the header row.
    Every remaining row of ``right`` is emitted at least once: joined to each
    row stored under its key in ``llookup``, or padded with ``missing`` when
    its key is not in ``llookup``.

    :param left: left table; only its header row is read here
    :param right: right table; its data rows drive the output
    :param lkey: key field selection for the left table (given to asindices)
    :param rkey: key field selection for the right table (given to asindices)
    :param missing: filler for left-hand fields of unmatched right rows
    :param llookup: container supporting ``in`` and ``[]`` that maps a key
        value to an iterable of left rows
    :param lprefix: if not None, prepended (via ``str``) to every left field
        name
    :param rprefix: if not None, prepended (via ``str``) to every right
        non-key field name
    """
    left_iter = iter(left)
    right_iter = iter(right)
    left_header = next(left_iter)
    right_header = next(right_iter)

    # positions of the key fields in each table
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractor for rows of the *right* table
    right_key_of = operator.itemgetter(*right_key_pos)

    # right-hand key fields are not repeated in the output, so only the
    # remaining right-hand positions are carried over
    right_value_pos = [pos for pos in range(len(right_header))
                       if pos not in right_key_pos]
    right_values_of = rowgetter(*right_value_pos)

    # output header: left fields followed by right non-key fields
    if lprefix is not None:
        header = [(str(lprefix) + str(name)) for name in left_header]
    else:
        header = list(left_header)
    right_names = right_values_of(right_header)
    if rprefix is not None:
        right_names = [(str(rprefix) + str(name)) for name in right_names]
    header.extend(right_names)
    yield tuple(header)

    left_width = len(left_header)
    for right_row in right_iter:
        key_value = right_key_of(right_row)
        if key_value in llookup:
            # one output row per matching left row
            for left_row in llookup[key_value]:
                yield tuple(left_row) + tuple(right_values_of(right_row))
        else:
            # no match: blank left side, but keep the key values visible
            # in the left-hand key columns
            padded = [missing] * left_width
            for left_pos, right_pos in zip(left_key_pos, right_key_pos):
                padded[left_pos] = right_row[right_pos]
            padded.extend(right_values_of(right_row))
            yield tuple(padded)
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    """Generate the rows of a hash-based left join using one right row per key.

    The right table is loaded into a lookup via ``lookupone(...,
    strict=False)``. Each left data row is then emitted exactly once,
    extended either with the non-key values of the right row stored under
    its key or with ``missing`` fillers when the key is absent.

    :param left: left table; first item is the header row
    :param right: right table; its header is obtained through ``iterpeek``
    :param lkey: key field selection for the left table (given to asindices)
    :param rkey: key field selection for the right table
    :param missing: filler for right-hand fields of unmatched left rows
    :param lprefix: if not None, prepended (via ``str``) to every left field
        name
    :param rprefix: if not None, prepended (via ``str``) to every right
        non-key field name
    """
    left_iter = iter(left)
    left_header = next(left_iter)

    # peek at the right header but keep an iterator over the whole table,
    # because the lookup builder is handed the complete table
    right_header, right_iter = iterpeek(right)
    right_by_key = lookupone(right_iter, rkey, strict=False)

    # positions of the key fields in each table
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractor for rows of the left table
    left_key_of = operator.itemgetter(*left_key_pos)

    # right-hand key fields are not repeated in the output
    right_value_pos = [pos for pos in range(len(right_header))
                       if pos not in right_key_pos]
    right_values_of = rowgetter(*right_value_pos)

    # output header: left fields followed by right non-key fields
    if lprefix is not None:
        header = [(str(lprefix) + str(name)) for name in left_header]
    else:
        header = list(left_header)
    right_names = right_values_of(right_header)
    if rprefix is not None:
        right_names = [(str(rprefix) + str(name)) for name in right_names]
    header.extend(right_names)
    yield tuple(header)

    # filler used in place of the right-hand values of unmatched rows
    filler = (missing,) * len(right_value_pos)

    for left_row in left_iter:
        key_value = left_key_of(left_row)
        if key_value in right_by_key:
            matched = right_by_key[key_value]
            yield tuple(left_row) + tuple(right_values_of(matched))
        else:
            yield tuple(left_row) + filler
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    """Generate the rows of a hash-based left join using one right row per key.

    The right table is loaded into a lookup via ``lookupone(...,
    strict=False)``. Each left data row is then emitted exactly once,
    extended either with the non-key values of the right row stored under
    its key or with ``missing`` fillers when the key is absent.

    :param left: left table; first item is the header row
    :param right: right table; its header is obtained through ``iterpeek``
    :param lkey: key field selection for the left table (given to asindices)
    :param rkey: key field selection for the right table
    :param missing: filler for right-hand fields of unmatched left rows
    :param lprefix: if not None, prepended (as text) to every left field name
    :param rprefix: if not None, prepended (as text) to every right non-key
        field name
    """
    left_iter = iter(left)
    left_header = next(left_iter)

    # peek at the right header but keep an iterator over the whole table,
    # because the lookup builder is handed the complete table
    right_header, right_iter = iterpeek(right)
    right_by_key = lookupone(right_iter, rkey, strict=False)

    # positions of the key fields in each table
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractor for rows of the left table
    left_key_of = operator.itemgetter(*left_key_pos)

    # right-hand key fields are not repeated in the output
    right_value_pos = [pos for pos in range(len(right_header))
                       if pos not in right_key_pos]
    right_values_of = rowgetter(*right_value_pos)

    # output header: left fields followed by right non-key fields
    if lprefix is not None:
        header = [(text_type(lprefix) + text_type(name))
                  for name in left_header]
    else:
        header = list(left_header)
    right_names = right_values_of(right_header)
    if rprefix is not None:
        right_names = [(text_type(rprefix) + text_type(name))
                       for name in right_names]
    header.extend(right_names)
    yield tuple(header)

    # filler used in place of the right-hand values of unmatched rows
    filler = (missing,) * len(right_value_pos)

    for left_row in left_iter:
        key_value = left_key_of(left_row)
        if key_value in right_by_key:
            matched = right_by_key[key_value]
            yield tuple(left_row) + tuple(right_values_of(matched))
        else:
            yield tuple(left_row) + filler
def iterhashjoin(left, right, lkey, rkey, rlookup, lprefix, rprefix):
    """Generate the rows of a hash-based inner join.

    The first item of ``left`` and of ``right`` is taken as the header row.
    For each left data row whose key is present in ``rlookup``, one output
    row is produced per right row stored under that key; left rows whose key
    is absent produce nothing.

    :param left: left table; its data rows drive the output
    :param right: right table; only its header row is read here
    :param lkey: key field selection for the left table (given to asindices)
    :param rkey: key field selection for the right table (given to asindices)
    :param rlookup: container supporting ``in`` and ``[]`` that maps a key
        value to an iterable of right rows
    :param lprefix: if not None, prepended (via ``str``) to every left field
        name
    :param rprefix: if not None, prepended (via ``str``) to every right
        non-key field name
    """
    left_iter = iter(left)
    right_iter = iter(right)
    left_header = next(left_iter)
    right_header = next(right_iter)

    # positions of the key fields in each table
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractor for rows of the left table
    left_key_of = operator.itemgetter(*left_key_pos)

    # right-hand key fields are not repeated in the output
    right_value_pos = [pos for pos in range(len(right_header))
                       if pos not in right_key_pos]
    right_values_of = rowgetter(*right_value_pos)

    # output header: left fields followed by right non-key fields
    if lprefix is not None:
        header = [(str(lprefix) + str(name)) for name in left_header]
    else:
        header = list(left_header)
    right_names = right_values_of(right_header)
    if rprefix is not None:
        right_names = [(str(rprefix) + str(name)) for name in right_names]
    header.extend(right_names)
    yield tuple(header)

    for left_row in left_iter:
        key_value = left_key_of(left_row)
        if key_value in rlookup:
            # one output row per matching right row
            for right_row in rlookup[key_value]:
                yield tuple(left_row) + tuple(right_values_of(right_row))
def itermelt(source, key, variables, variablefield, valuefield):
    """Generate the rows of a melted (wide-to-long) view of ``source``.

    After the header, each input row produces one output row per variable
    field, consisting of the row's key values followed by the variable's
    name and the row's value for it. Variables whose position lies beyond
    the end of a short row are skipped for that row.

    :param source: table; first item is the header row
    :param key: key field selection, or None to use every field that is not
        a variable
    :param variables: variable field selection (a single non-list/tuple
        value is wrapped in a tuple), or None to use every non-key field
    :param variablefield: output header name for the variable-name column
    :param valuefield: output header name for the value column
    :raises ValueError: if both ``key`` and ``variables`` are None
    """
    if key is None and variables is None:
        raise ValueError('either key or variables must be specified')

    rows = iter(source)
    header = next(rows)

    # resolve whichever selections were given; the other is its complement
    key_pos = None if key is None else asindices(header, key)
    if variables is None:
        # key must have been given (checked above)
        var_pos = [pos for pos in range(len(header)) if pos not in key_pos]
        variables = [header[pos] for pos in var_pos]
    else:
        if not isinstance(variables, (list, tuple)):
            variables = (variables,)
        var_pos = asindices(header, variables)
        if key_pos is None:
            key_pos = [pos for pos in range(len(header))
                       if pos not in var_pos]

    key_of = rowgetter(*key_pos)

    # output header: key fields, then the two melt columns
    yield tuple([header[pos] for pos in key_pos]
                + [variablefield, valuefield])

    for row in rows:
        key_values = key_of(row)
        for name, pos in zip(variables, var_pos):
            try:
                yield tuple(key_values) + (name, row[pos])
            except IndexError:
                # short row: no value for this variable, so emit nothing
                pass
def __iter__(self):
    """Yield the rows of ``self.table`` with columns ordered by sorted header.

    The header is sorted with ``sorted`` and yielded first; every data row
    is then rearranged to match. Rows too short for the rearrangement are
    padded with ``self.missing``.
    """
    rows = iter(self.table)
    header = next(rows)

    sorted_header = sorted(header)
    positions = asindices(header, sorted_header)
    reorder = rowgetter(*positions)

    yield tuple(sorted_header)

    fill = self.missing

    def padded(short_row):
        # substitute the fill value for positions the row does not reach
        return tuple(short_row[pos] if pos < len(short_row) else fill
                     for pos in positions)

    for row in rows:
        try:
            yield reorder(row)
        except IndexError:
            yield padded(row)
def itermelt(source, key, variables, variablefield, valuefield):
    """Generate the rows of a melted (wide-to-long) view of ``source``.

    After the header, each input row produces one output row per variable
    field, consisting of the row's key values followed by the variable's
    name and the row's value for it. Variables whose position lies beyond
    the end of a short row are skipped for that row.

    Field names are matched against the text form of the header fields.

    :param source: table; first item is the header row
    :param key: key field name(s); a single non-list/tuple value is wrapped
        in a tuple; if falsy, every field not in ``variables`` is used
    :param variables: variable field name(s), normalised like ``key``; if
        falsy, every field not in ``key`` is used
    :param variablefield: output header name for the variable-name column
    :param valuefield: output header name for the value column
    :raises ValueError: if both ``key`` and ``variables`` are None, or (from
        ``list.index``) if a requested field is not in the header
    """
    # Without this guard the complement computation below evaluates
    # ``f not in None`` and fails with an unhelpful TypeError.
    if key is None and variables is None:
        raise ValueError('either key or variables must be specified')

    it = iter(source)

    # normalise some stuff
    hdr = next(it)
    flds = list(map(text_type, hdr))
    if key and not isinstance(key, (list, tuple)):
        key = (key,)  # normalise to a tuple
    if variables and not isinstance(variables, (list, tuple)):
        variables = (variables,)  # normalise to a tuple
    if not key:
        # assume key is fields not in variables
        key = [f for f in flds if f not in variables]
    if not variables:
        # assume variables are fields not in key
        variables = [f for f in flds if f not in key]

    # determine the output fields
    outhdr = list(key)
    outhdr.append(variablefield)
    outhdr.append(valuefield)
    yield tuple(outhdr)

    key_indices = [flds.index(k) for k in key]
    getkey = rowgetter(*key_indices)
    variables_indices = [flds.index(v) for v in variables]

    # construct the output data
    for row in it:
        k = getkey(row)
        for v, i in zip(variables, variables_indices):
            try:
                o = list(k)  # populate with key values initially
                o.append(v)  # add variable
                o.append(row[i])  # add value
                yield tuple(o)
            except IndexError:
                # row is missing this value, and melt() should yield no row
                pass
def _setup_lookup(table, key, value):
    """Prepare the pieces needed to build a lookup over ``table``.

    Consumes the header row of ``table`` and builds two extractors from it.

    :param table: table; first item is the header row
    :param key: key field selection (given to asindices); must select at
        least one field
    :param value: value field selection, or None to use the complete row
    :returns: ``(rows, getkey, getvalue)`` where ``rows`` is the iterator
        positioned after the header and the other two are callables applied
        to a row
    """
    rows = iter(table)
    header = next(rows)

    # key extractor
    key_positions = asindices(header, key)
    assert len(key_positions) > 0, 'no key selected'
    getkey = operator.itemgetter(*key_positions)

    # value extractor
    if value is not None:
        value_positions = asindices(header, value)
        assert len(value_positions) > 0, 'no value selected'
        getvalue = operator.itemgetter(*value_positions)
    else:
        # no explicit value selection: take every header position
        getvalue = rowgetter(*range(len(header)))

    return rows, getkey, getvalue
def itercut(source, spec, missing=None):
    """Generate the rows of ``source`` restricted to the selected fields.

    The header is projected first, then every data row. Rows too short for
    the projection are padded with ``missing``.

    :param source: table; first item is the header row
    :param spec: field selection; copied into a tuple up front so later
        changes by the caller cannot affect iteration
    :param missing: filler for positions a short row does not reach
    """
    rows = iter(source)
    selection = tuple(spec)

    # turn the field selection into positions, and those into a projector
    header = next(rows)
    positions = asindices(header, selection)
    project = rowgetter(*positions)

    yield project(header)

    def padded(short_row):
        # substitute the filler for positions the row does not reach
        return tuple(short_row[pos] if pos < len(short_row) else missing
                     for pos in positions)

    for row in rows:
        try:
            yield project(row)
        except IndexError:
            yield padded(row)
def __iter__(self):
    """Yield the rows of ``self.table`` with ``self.field`` moved.

    The output header is the input header without ``self.field``, with
    ``self.field`` re-inserted at ``self.index`` (``list.insert``
    semantics). Data rows are rearranged to match; rows too short for the
    rearrangement are padded with ``self.missing``.
    """
    rows = iter(self.table)
    header = next(rows)

    # build the output header: drop the field, then put it back at index
    moved_header = [name for name in header if name != self.field]
    moved_header.insert(self.index, self.field)
    yield tuple(moved_header)

    # map the output field names (as str) back to input positions
    positions = asindices(header, [str(name) for name in moved_header])
    rearrange = rowgetter(*positions)

    for row in rows:
        try:
            yield rearrange(row)
        except IndexError:
            # short row: fill the positions it does not reach
            yield tuple(row[pos] if pos < len(row) else self.missing
                        for pos in positions)
def iterlookupjoin(left, right, lkey, rkey, missing=None, lprefix=None,
                   rprefix=None):
    """Generate the rows of a merge-based left join using one right row per key.

    Both tables are grouped by key with ``itertools.groupby`` and walked in
    parallel, comparing keys with ``<`` and ``>``. Every left row is emitted
    once: extended with the non-key values of the first right row of the
    matching group, or with ``missing`` fillers when there is no match.

    NOTE(review): the parallel walk presumably requires both inputs to be
    pre-sorted by their key - confirm against the caller.

    :param left: left table; first item is the header row
    :param right: right table; first item is the header row
    :param lkey: key field selection for the left table (given to asindices)
    :param rkey: key field selection for the right table (given to asindices)
    :param missing: filler for right-hand fields of unmatched left rows
    :param lprefix: if not None, prepended (as text) to every left field name
    :param rprefix: if not None, prepended (as text) to every right non-key
        field name
    """
    lit = iter(left)
    rit = iter(right)
    lhdr = next(lit)
    rhdr = next(rit)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from both tables
    lgetk = operator.itemgetter(*lkind)
    rgetk = operator.itemgetter(*rkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(text_type(lprefix) + text_type(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(text_type(rprefix) + text_type(f))
                       for f in rgetv(rhdr)])
    yield tuple(outhdr)

    # define a function to join two groups of rows
    def joinrows(_lrowgrp, _rrowgrp):
        if _rrowgrp is None:
            # extend with missing values in place of the right row
            padding = [missing] * len(rvind)
            for lrow in _lrowgrp:
                outrow = list(lrow)
                outrow.extend(padding)
                yield tuple(outrow)
        else:
            rrow = next(iter(_rrowgrp))  # pick first arbitrarily
            rvals = rgetv(rrow)
            for lrow in _lrowgrp:
                outrow = list(lrow)
                outrow.extend(rvals)
                yield tuple(outrow)

    # construct group iterators for both tables
    lgit = itertools.groupby(lit, key=lgetk)
    rgit = itertools.groupby(rit, key=rgetk)

    # Track explicitly whether the current left group still has to be
    # emitted. Comparing key values after the loop would fail when either
    # table has no data rows, because None is not orderable on Python 3.
    lrowgrp = None
    lpending = False

    # loop until *either* of the iterators is exhausted
    try:
        # pick off initial row groups
        lkval, lrowgrp = next(lgit)
        lpending = True
        rkval, rrowgrp = next(rgit)

        while True:
            if lkval < rkval:
                for row in joinrows(lrowgrp, None):
                    yield row
                lpending = False
                # advance left
                lkval, lrowgrp = next(lgit)
                lpending = True
            elif lkval > rkval:
                # advance right
                rkval, rrowgrp = next(rgit)
            else:
                for row in joinrows(lrowgrp, rrowgrp):
                    yield row
                lpending = False
                # advance both
                lkval, lrowgrp = next(lgit)
                lpending = True
                rkval, rrowgrp = next(rgit)

    except StopIteration:
        pass

    # the right side ran out while a left group was still waiting
    if lpending:
        for row in joinrows(lrowgrp, None):
            yield row

    # yield the rest of the left table, all unmatched
    for _, lrowgrp in lgit:
        for row in joinrows(lrowgrp, None):
            yield row
def iterlookupjoin(left, right, lkey, rkey, missing=None, lprefix=None,
                   rprefix=None):
    """Generate the rows of a merge-based left join using one right row per key.

    Both tables are grouped by key with ``itertools.groupby`` and walked in
    parallel, comparing keys with ``<`` and ``>``. Every left row is emitted
    once: extended with the non-key values of the first right row of the
    matching group, or with ``missing`` fillers when there is no match.

    NOTE(review): the parallel walk presumably requires both inputs to be
    pre-sorted by their key - confirm against the caller.

    :param left: left table; first item is the header row
    :param right: right table; first item is the header row
    :param lkey: key field selection for the left table (given to asindices)
    :param rkey: key field selection for the right table (given to asindices)
    :param missing: filler for right-hand fields of unmatched left rows
    :param lprefix: if not None, prepended (via ``str``) to every left field
        name
    :param rprefix: if not None, prepended (via ``str``) to every right
        non-key field name
    """
    lit = iter(left)
    rit = iter(right)
    lhdr = next(lit)
    rhdr = next(rit)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from both tables
    lgetk = operator.itemgetter(*lkind)
    rgetk = operator.itemgetter(*rkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(str(lprefix) + str(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(str(rprefix) + str(f)) for f in rgetv(rhdr)])
    yield tuple(outhdr)

    # define a function to join two groups of rows
    def joinrows(_lrowgrp, _rrowgrp):
        if _rrowgrp is None:
            # extend with missing values in place of the right row
            padding = [missing] * len(rvind)
            for lrow in _lrowgrp:
                outrow = list(lrow)
                outrow.extend(padding)
                yield tuple(outrow)
        else:
            rrow = next(iter(_rrowgrp))  # pick first arbitrarily
            rvals = rgetv(rrow)
            for lrow in _lrowgrp:
                outrow = list(lrow)
                outrow.extend(rvals)
                yield tuple(outrow)

    # construct group iterators for both tables
    lgit = itertools.groupby(lit, key=lgetk)
    rgit = itertools.groupby(rit, key=rgetk)

    # Track explicitly whether the current left group still has to be
    # emitted. Comparing key values after the loop would fail when either
    # table has no data rows, because None is not orderable on Python 3.
    lrowgrp = None
    lpending = False

    # loop until *either* of the iterators is exhausted
    try:
        # pick off initial row groups
        lkval, lrowgrp = next(lgit)
        lpending = True
        rkval, rrowgrp = next(rgit)

        while True:
            if lkval < rkval:
                for row in joinrows(lrowgrp, None):
                    yield row
                lpending = False
                # advance left
                lkval, lrowgrp = next(lgit)
                lpending = True
            elif lkval > rkval:
                # advance right
                rkval, rrowgrp = next(rgit)
            else:
                for row in joinrows(lrowgrp, rrowgrp):
                    yield row
                lpending = False
                # advance both
                lkval, lrowgrp = next(lgit)
                lpending = True
                rkval, rrowgrp = next(rgit)

    except StopIteration:
        pass

    # the right side ran out while a left group was still waiting
    if lpending:
        for row in joinrows(lrowgrp, None):
            yield row

    # yield the rest of the left table, all unmatched
    for _, lrowgrp in lgit:
        for row in joinrows(lrowgrp, None):
            yield row