import operator

# N.B., asindices, heapqmergesorted, shortlistmergesorted and iterchunk are
# helper functions defined elsewhere in this module.


def itermergesort(sources, key, header, missing, reverse):
    # first need to standardise headers of all input tables
    # borrow this from itercat - TODO remove code smells

    its = [iter(t) for t in sources]
    source_flds_lists = [next(it) for it in its]

    if header is None:
        # determine output fields by gathering all fields found in the sources
        outflds = list()
        for flds in source_flds_lists:
            for f in flds:
                if f not in outflds:
                    # add any new fields as we find them
                    outflds.append(f)
    else:
        # predetermined output fields
        outflds = header
    yield tuple(outflds)

    def _standardisedata(it, flds, outflds):
        # now construct and yield the data rows
        for row in it:
            try:
                # should be quickest to do this way
                yield tuple(row[flds.index(f)] if f in flds else missing
                            for f in outflds)
            except IndexError:
                # handle short rows
                outrow = [missing] * len(outflds)
                for i, f in enumerate(flds):
                    try:
                        outrow[outflds.index(f)] = row[i]
                    except IndexError:
                        pass  # be relaxed about short rows
                yield tuple(outrow)

    # wrap all iterators to standardise fields
    sits = [_standardisedata(it, flds, outflds)
            for flds, it in zip(source_flds_lists, its)]

    # now determine key function
    getkey = None
    if key is not None:
        # convert field selection into field indices
        indices = asindices(outflds, key)
        # now use field indices to construct a _getkey function
        # N.B., this will probably raise an exception on short rows
        getkey = operator.itemgetter(*indices)

    # OK, do the merge sort
    for row in shortlistmergesorted(getkey, reverse, *sits):
        yield row
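# A minimal usage sketch for itermergesort (hypothetical data; assumes the
# module-level helpers asindices and shortlistmergesorted are available).
# Each source table is expected to be pre-sorted on the merge key:


def _example_itermergesort():
    table1 = [('foo', 'bar'), ('A', 6), ('C', 3)]
    table2 = [('foo', 'bar'), ('B', 9), ('D', 10)]
    # yields the combined header first, then data rows merged in key order:
    # ('foo', 'bar'), ('A', 6), ('B', 9), ('C', 3), ('D', 10)
    for row in itermergesort([table1, table2], key='foo', header=None,
                             missing=None, reverse=False):
        print(row)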
def _mergesorted(key=None, reverse=False, *iterables):
    # N.B., I've used heapq.merge for the normal merge sort and the shortlist
    # merge sort for reverse merge sorting, on the assumption that heapq.merge
    # is faster and so preferable, but it doesn't support reverse sorting.
    # Some casual profiling suggests there isn't much between the two in terms
    # of speed, but it might be worth profiling more carefully.
    if reverse:
        return shortlistmergesorted(key, True, *iterables)
    else:
        return heapqmergesorted(key, *iterables)
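# The two merge strategies chosen between above are defined elsewhere in the
# module. Below is a minimal sketch of each (not the actual implementations),
# assuming every input iterable is already sorted on the key:

import heapq


def heapqmergesorted(key=None, *iterables):
    # sketch: merge sorted iterables via heapq.merge; when a key function is
    # given, decorate each row as (key(row), row) so the heap compares keys
    # (ties fall back to comparing the rows themselves)
    if key is None:
        for row in heapq.merge(*iterables):
            yield row
    else:
        decorated = [((key(row), row) for row in it) for it in iterables]
        for _, row in heapq.merge(*decorated):
            yield row


def shortlistmergesorted(key=None, reverse=False, *iterables):
    # sketch: keep a 'shortlist' of the current head row from each iterator
    # and repeatedly emit the smallest head (or largest, for reverse)
    choose = max if reverse else min
    iterators = []
    shortlist = []
    for it in iterables:
        it = iter(it)
        try:
            shortlist.append(next(it))
            iterators.append(it)
        except StopIteration:
            pass
    while shortlist:
        best = choose(shortlist, key=key)
        i = shortlist.index(best)
        yield best
        try:
            # refill the emitted slot from the same iterator
            shortlist[i] = next(iterators[i])
        except StopIteration:
            # that iterator is exhausted, drop its slot
            del shortlist[i]
            del iterators[i]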
def close(self):
    # sort anything remaining in the cache
    self.cache.sort(key=self.getkey, reverse=self.reverse)
    if self.chunkfiles:
        chunkiters = [iterchunk(f) for f in self.chunkfiles]
        # make sure any rows left in the cache are included
        chunkiters.append(self.cache)
        for row in shortlistmergesorted(self.getkey, self.reverse,
                                        *chunkiters):
            self.broadcast(row)
    else:
        for row in self.cache:
            self.broadcast(row)
    super(SortConnection, self).close()
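# For context, a minimal sketch of the chunk-file machinery that close()
# depends on (hypothetical; the real iterchunk and the code populating
# self.chunkfiles live elsewhere in this module). The idea is that the accept
# side buffers rows in self.cache and, whenever the buffer fills, sorts it
# and spills it to a temporary pickle file; close() then merge-sorts the
# spilled chunks together with whatever is left in the cache:

import pickle
import tempfile


def writechunk(rows):
    # spill an already-sorted sequence of rows to a temporary file, one
    # pickled row at a time, and hand back the open file object
    f = tempfile.TemporaryFile()
    for row in rows:
        pickle.dump(row, f, protocol=-1)
    f.flush()
    return f


def iterchunk(f):
    # replay the pickled rows from a chunk file in their original order
    f.seek(0)
    try:
        while True:
            yield pickle.load(f)
    except EOFError:
        pass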