예제 #1
0
def iterhashantijoin(left, right, lkey, rkey):
    """
    Yield the header of `left`, then each left row whose key does not occur
    in `right`.

    The key fields are selected by `lkey` / `rkey`. All key values of the
    right table are gathered into a set before the left rows are scanned.

    """
    left_rows = iter(left)
    right_rows = iter(right)

    left_header = next(left_rows)
    right_header = next(right_rows)
    yield tuple(left_header)

    # positions of the key fields in each table
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractors for each table
    left_key_of = operator.itemgetter(*left_key_pos)
    right_key_of = operator.itemgetter(*right_key_pos)

    # every key value present on the right
    right_keys = set(right_key_of(row) for row in right_rows)

    # keep only left rows without a partner on the right
    for row in left_rows:
        if left_key_of(row) not in right_keys:
            yield tuple(row)
예제 #2
0
def tupletrees(table, facet, start='start', stop='stop', value=None):
    """
    Build one interval tree per facet key from the rows of `table`.

    Returns a dict mapping each facet key to a
    `bx.intervals.intersection.IntervalTree`. Every data row is added to the
    tree for its key using the row's `start` and `stop` values. The stored
    payload is the whole row as a tuple when `value` is None, otherwise the
    selected value field(s).

    Raises `UnsatisfiedDependency` if `bx.intervals` cannot be imported.

    """

    try:
        import bx.intervals
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    rows = iter(table)
    header = next(rows)
    assert start in header, 'start field not recognised'
    assert stop in header, 'stop field not recognised'
    start_of = itemgetter(header.index(start))
    stop_of = itemgetter(header.index(stop))

    if value is not None:
        value_pos = asindices(header, value)
        assert len(value_pos) > 0, 'invalid value field specification'
        payload_of = itemgetter(*value_pos)
    else:
        # no value selection: store the whole row
        payload_of = tuple

    key_pos = asindices(header, facet)
    assert len(key_pos) > 0, 'invalid key'
    key_of = itemgetter(*key_pos)

    forest = {}
    for row in rows:
        key = key_of(row)
        try:
            tree = forest[key]
        except KeyError:
            # first row seen for this facet key
            tree = forest[key] = bx.intervals.intersection.IntervalTree()
        tree.add(start_of(row), stop_of(row), payload_of(row))
    return forest
예제 #3
0
파일: push.py 프로젝트: pombredanne/petl
    def __init__(self, default_connections, keyed_connections, fields, key):
        """
        Prepare key extraction and the duplicate-tracking state.

        `default_connections`, `keyed_connections` and `fields` are forwarded
        to the parent initialiser; `fields` and `key` are resolved through
        `asindices` into the positions used by `self.getkey`.

        """
        super(DuplicatesConnection, self).__init__(default_connections, keyed_connections, fields)

        # convert field selection into field indices
        indices = asindices(fields, key)

        # now use field indices to construct a _getkey function
        # N.B., this may raise an exception on short rows, depending on
        # the field selection
        self.getkey = itemgetter(*indices)

        # initial state: no row seen yet, nothing flagged as a duplicate
        self.previous = None
        self.previous_is_duplicate = False
예제 #4
0
def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
                      rprefix):
    """
    Right outer join driven by a pre-built lookup of the left table.

    Yields the output header, then for every right row either one joined row
    per matching left row found in `llookup`, or a single row in which the
    left-hand columns hold `missing` except for the key columns, which are
    copied from the right row.

    Output columns are all left fields followed by the non-key right fields;
    `lprefix` / `rprefix`, when not None, are prepended to the respective
    field names.

    """
    left_rows = iter(left)
    right_rows = iter(right)

    left_header = next(left_rows)
    right_header = next(right_rows)

    # positions of the key fields in each table
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractor for the right table
    right_key_of = operator.itemgetter(*right_key_pos)

    # only the non-key right fields are emitted, so that key columns are
    # not duplicated in the output
    right_value_pos = [i for i in range(len(right_header))
                       if i not in right_key_pos]
    right_values_of = rowgetter(*right_value_pos)

    # output header
    if lprefix is not None:
        header = [(str(lprefix) + str(f)) for f in left_header]
    else:
        header = list(left_header)
    if rprefix is not None:
        header.extend([(str(rprefix) + str(f))
                       for f in right_values_of(right_header)])
    else:
        header.extend(right_values_of(right_header))
    yield tuple(header)

    for right_row in right_rows:
        key = right_key_of(right_row)
        if key in llookup:
            # one output row per matching left row
            for left_row in llookup[key]:
                joined = list(left_row)
                joined.extend(right_values_of(right_row))
                yield tuple(joined)
        else:
            # no left partner: pad the left side with the missing value...
            padded = [missing] * len(left_header)
            # ...but carry the key values over from the right row
            for left_pos, right_pos in zip(left_key_pos, right_key_pos):
                padded[left_pos] = right_row[right_pos]
            padded.extend(right_values_of(right_row))
            yield tuple(padded)
예제 #5
0
파일: joins.py 프로젝트: podpearson/petl
def iterantijoin(left, right, lkey, rkey):
    """
    Yield the header of `left`, then every left row whose key has no match
    in `right`, by stepping through both tables together.

    Rows are grouped with `itertools.groupby` and group keys are compared
    with ``<`` / ``>``; this only lines the tables up if both inputs already
    arrive ordered by their key fields (nothing here sorts them).

    """
    left_rows = iter(left)
    right_rows = iter(right)

    left_header = next(left_rows)
    right_header = next(right_rows)
    yield tuple(left_header)

    # positions of the key fields in each table
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractors for each table
    left_key_of = operator.itemgetter(*left_key_pos)
    right_key_of = operator.itemgetter(*right_key_pos)

    # iterate over runs of rows sharing a key
    left_groups = itertools.groupby(left_rows, key=left_key_of)
    right_groups = itertools.groupby(right_rows, key=right_key_of)

    # start from None so the comparison after the loop is defined even when
    # a table has no data rows
    left_key = right_key = None
    try:

        left_key, left_group = next(left_groups)
        right_key, _ = next(right_groups)

        # runs until one of the group iterators is exhausted
        while True:
            if left_key < right_key:
                # nothing on the right for this key: emit the left group
                for row in left_group:
                    yield tuple(row)
                left_key, left_group = next(left_groups)
            elif left_key > right_key:
                right_key, _ = next(right_groups)
            else:
                # matched: drop the left group and move both sides on
                left_key, left_group = next(left_groups)
                right_key, _ = next(right_groups)

    except StopIteration:
        pass

    # a left group may have been fetched but not yet emitted
    if left_key > right_key:
        for row in left_group:
            yield tuple(row)
    # whatever is still unread on the left has no partner
    for left_key, left_group in left_groups:
        for row in left_group:
            yield tuple(row)
예제 #6
0
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    """
    Left join in which each left row is paired with at most one right row.

    The right table is loaded into a lookup via `lookupone(..., strict=False)`.
    Yields the output header, then one row per left row: the left values
    followed by the non-key values of the looked-up right row, or by
    `missing` in each of those positions when the key is not in the lookup.

    `lprefix` / `rprefix`, when not None, are prepended to the respective
    field names in the header.

    """
    left_rows = iter(left)
    left_header = next(left_rows)

    # the complete right table (header included) goes to the lookup builder
    right_header, right_table = iterpeek(right)
    from petl.util import lookupone
    right_by_key = lookupone(right_table, rkey, strict=False)

    # positions of the key fields in each table
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractor for the left table
    left_key_of = operator.itemgetter(*left_key_pos)

    # only the non-key right fields are emitted, so that key columns are
    # not duplicated in the output
    right_value_pos = [i for i in range(len(right_header))
                       if i not in right_key_pos]
    right_values_of = rowgetter(*right_value_pos)

    # output header
    if lprefix is not None:
        header = [(str(lprefix) + str(f)) for f in left_header]
    else:
        header = list(left_header)
    if rprefix is not None:
        header.extend([(str(rprefix) + str(f))
                       for f in right_values_of(right_header)])
    else:
        header.extend(right_values_of(right_header))
    yield tuple(header)

    for left_row in left_rows:
        key = left_key_of(left_row)
        joined = list(left_row)
        if key in right_by_key:
            # append the non-key values of the looked-up right row
            joined.extend(right_values_of(right_by_key[key]))
        else:
            # no partner: fill the right-hand columns with the missing value
            joined.extend([missing] * len(right_value_pos))
        yield tuple(joined)
예제 #7
0
def itersimplemultirangeaggregate(table, keys, widths, aggregation, value,
                                      mins, maxs):
    """
    Yield ``('key', 'value')`` followed by one ``(bindef, aggregate)`` pair
    for each item produced by `_recursive_bin`.

    `value` selects what is aggregated: None keeps the whole row, a callable
    is applied as given, anything else is treated as a field selection.
    Passing the built-in `len` as `aggregation` is supported by counting the
    items of the iterable instead.

    """

    def _count(group):
        # len() does not work on a plain iterable, so count by consuming it
        return sum(1 for _ in group)

    def _whole_row(row):
        return row

    if aggregation == len:
        aggregation = _count
    yield ('key', 'value')

    # the grouping itself is delegated to a recursive helper so that any
    # number of key fields can be handled

    source = iter(table)
    header = next(source)

    # wrap rows
    wrapped = hybridrows(header, source)

    # decide how the value to aggregate is obtained from a row
    if value is None:
        extract = _whole_row
    elif callable(value):
        extract = value
    else:
        extract = operator.itemgetter(*asindices(header, value))

    for bindef, vals in _recursive_bin(wrapped, 0, [], header, keys, widths,
                                       extract, mins, maxs):
        yield bindef, aggregation(vals)
예제 #8
0
파일: regex.py 프로젝트: podpearson/petl
def itersearch(table, pattern, field, flags, complement):
    """
    Yield the header (field names converted with `str`), then the rows
    selected by a regular-expression search.

    `field` chooses what is searched: None searches every value of the row,
    a string searches that single field, anything else is treated as a
    selection of fields. Values are converted with `str` before searching.
    Matching rows are yielded when `complement == False`; otherwise the
    non-matching rows are yielded.

    """
    regex = re.compile(pattern, flags)
    rows = iter(table)
    header = [str(name) for name in next(rows)]
    yield tuple(header)

    if field is None:
        # search whole row
        def matches(row):
            return any(regex.search(str(v)) for v in row)
    elif isinstance(field, basestring):
        # search single field
        position = header.index(field)

        def matches(row):
            return regex.search(str(row[position]))
    else:
        # search selection of fields
        positions = asindices(header, field)
        pick = operator.itemgetter(*positions)

        def matches(row):
            return any(regex.search(str(v)) for v in pick(row))

    # Deliberately an equality test against False rather than `not`: only a
    # value equal to False selects the matching rows, any other value
    # (including None) selects the complement.
    want_match = bool(complement == False)
    for row in rows:
        if bool(matches(row)) == want_match:
            yield tuple(row)
예제 #9
0
파일: dedup.py 프로젝트: pombredanne/petl
def iterunique(source, key):
    """
    Yield the header, then each row whose key differs from the key of both
    the row before it and the row after it.

    Only neighbouring rows are compared, so equal keys must be adjacent in
    `source` for them to be recognised as repeats. `key` selects the key
    fields; None means every field. All rows are yielded as tuples. A table
    with a header but no data rows yields just the header.

    """
    it = iter(source)

    flds = next(it)
    yield tuple(flds)

    # convert field selection into field indices
    if key is None:
        indices = range(len(flds))
    else:
        indices = asindices(flds, key)

    # now use field indices to construct a _getkey function
    # N.B., this may raise an exception on short rows, depending on
    # the field selection
    getkey = operator.itemgetter(*indices)

    try:
        prev = next(it)
    except StopIteration:
        # no data rows; end explicitly rather than letting StopIteration
        # escape from the generator body
        return
    prev_key = getkey(prev)
    # whether `prev` differs from the row before it (vacuously true for
    # the first row)
    prev_comp_ne = True

    for curr in it:
        curr_key = getkey(curr)
        curr_comp_ne = (curr_key != prev_key)
        # `prev` is unique only if it differs from both neighbours
        if prev_comp_ne and curr_comp_ne:
            yield tuple(prev)
        prev = curr
        prev_key = curr_key
        prev_comp_ne = curr_comp_ne

    # the last row has no successor, so only its predecessor matters
    if prev_comp_ne:
        yield tuple(prev)
예제 #10
0
파일: selects.py 프로젝트: pombredanne/petl
def iterfieldselect(source, field, where, complement):
    """
    Yield the header, then each row for which the truth value of
    ``where(value)`` differs from the truth value of `complement`.

    `value` is the content of the selected `field` (a tuple of values when
    several fields are selected). Both sides of the comparison are reduced
    to booleans first, so a predicate returning e.g. None, 0 or a match
    object is treated by its truthiness.

    """
    it = iter(source)
    flds = next(it)
    yield tuple(flds)
    indices = asindices(flds, field)
    getv = operator.itemgetter(*indices)
    # normalise once so the XOR below compares truth values, not raw objects
    complement = bool(complement)
    for row in it:
        v = getv(row)
        if bool(where(v)) != complement:  # XOR
            yield tuple(row)
예제 #11
0
파일: dedup.py 프로젝트: pombredanne/petl
def iterconflicts(source, key, missing, exclude, include):
    """
    Yield the header, then rows that share a key with the preceding row but
    disagree with it in at least one compared field.

    Two values conflict when they differ and neither equals `missing`.
    `exclude` / `include` (a single field name or a collection of names)
    restrict which fields are compared; if both are given, `exclude` wins.
    Only neighbouring rows are compared.

    """

    # a bare string is taken to mean a single field name
    if isinstance(exclude, basestring):
        exclude = (exclude,)
    if isinstance(include, basestring):
        include = (include,)

    # exclude overrides include
    if include and exclude:
        include = None

    def _compared(name):
        # does field `name` take part in the conflict check?
        if exclude and name not in exclude:
            return True
        if include and name in include:
            return True
        return not exclude and not include

    rows = iter(source)
    header = next(rows)
    yield tuple(header)

    # key extractor; may raise on short rows, depending on the selection
    key_of = operator.itemgetter(*asindices(header, key))

    previous = None
    previous_yielded = False

    for row in rows:
        if previous is None:
            previous = row
            continue

        if key_of(previous) == key_of(row):
            conflict = any(
                _compared(name) and missing not in (x, y) and x != y
                for x, y, name in zip(previous, row, header)
            )
            if conflict:
                if not previous_yielded:
                    yield tuple(previous)
                    previous_yielded = True
                yield tuple(row)
        else:
            # new key: forget what was emitted for the old one
            previous_yielded = False
        previous = row
예제 #12
0
파일: sorts.py 프로젝트: pombredanne/petl
def itermergesort(sources, key, header, missing, reverse):
    """
    Merge several tables into one stream via `shortlistmergesorted`.

    Yields the output header, then the merged rows. The output header is
    `header` if given, otherwise the union of all source headers in order of
    first appearance. Each source row is rearranged to the output header,
    with `missing` used for fields the source lacks and for values absent
    from short rows. `key` selects the fields handed to the merge as the
    sort key (None means no key function).

    """

    # first need to standardise headers of all input tables
    # borrow this from itercat - TODO remove code smells

    its = [iter(t) for t in sources]
    source_flds_lists = [next(it) for it in its]

    if header is None:
        # determine output fields by gathering all fields found in the sources
        outflds = list()
        for flds in source_flds_lists:
            for f in flds:
                if f not in outflds:
                    # add any new fields as we find them
                    outflds.append(f)
    else:
        # predetermined output fields
        outflds = header
    yield tuple(outflds)

    def _standardisedata(it, flds, outflds):
        # now construct and yield the data rows
        for row in it:
            try:
                # should be quickest to do this way
                yield tuple(row[flds.index(f)] if f in flds else missing for f in outflds)
            except IndexError:
                # handle short rows
                outrow = [missing] * len(outflds)
                for i, f in enumerate(flds):
                    if f not in outflds:
                        # a predetermined header may leave source fields
                        # out; skip them as the fast path above does
                        continue
                    try:
                        outrow[outflds.index(f)] = row[i]
                    except IndexError:
                        pass # be relaxed about short rows
                yield tuple(outrow)

    # wrap all iterators to standardise fields
    sits = [_standardisedata(it, flds, outflds) for flds, it in zip(source_flds_lists, its)]

    # now determine key function
    getkey = None
    if key is not None:
        # convert field selection into field indices
        indices = asindices(outflds, key)
        # now use field indices to construct a _getkey function
        # N.B., this will probably raise an exception on short rows
        getkey = operator.itemgetter(*indices)

    # OK, do the merge sort
    for row in shortlistmergesorted(getkey, reverse, *sits):
        yield row
예제 #13
0
파일: fills.py 프로젝트: pombredanne/petl
def iterfilldown(table, fillfields, missing):
    """
    Yield the header, then the rows with values equal to `missing` replaced
    by the most recent earlier value in the same field.

    `fillfields` selects the fields to fill; a false value means all fields.
    The first data row is emitted as is and provides the initial fill
    values.

    """
    rows = iter(table)
    header = next(rows)
    yield tuple(header)

    # nothing selected means fill down every field
    if not fillfields:
        fillfields = header
    positions = asindices(header, fillfields)

    # current fill values, seeded from the first data row
    carried = list(next(rows))
    yield tuple(carried)

    for row in rows:
        filled = list(row)
        for pos in positions:
            if row[pos] == missing:
                # fill down
                filled[pos] = carried[pos]
            else:
                # remember the new value for later rows
                carried[pos] = row[pos]
        yield tuple(filled)
예제 #14
0
파일: dedup.py 프로젝트: rs/petl
    def __iter__(self):
        """
        Yield the header, then one row for each run of adjacent rows that
        share a key.

        `self.key` selects the key fields (None means every field). When
        `self.count` is set it names an extra column: the first row of each
        run is yielded with the run length appended. Otherwise the first
        row of each run is yielded unchanged. Only neighbouring rows are
        compared. A table without data rows yields just the header.

        """
        it = iter(self.table)
        flds = next(it)

        # convert field selection into field indices
        if self.key is None:
            indices = range(len(flds))
        else:
            indices = asindices(flds, self.key)

        # now use field indices to construct a _getkey function
        # N.B., this may raise an exception on short rows, depending on
        # the field selection
        getkey = operator.itemgetter(*indices)

        if self.count:
            flds = tuple(flds) + (self.count,)
            yield flds
            previous = None
            n_dup = 1
            for row in it:
                if previous is None:
                    previous = row
                else:
                    kprev = getkey(previous)
                    kcurr = getkey(row)
                    if kprev == kcurr:
                        n_dup += 1
                    else:
                        yield tuple(previous) + (n_dup,)
                        n_dup = 1
                        previous = row
            # deal with last row (there is none if the table had no data)
            if previous is not None:
                yield tuple(previous) + (n_dup,)
        else:
            yield tuple(flds)
            # private sentinel so that a genuine key of None on the first
            # row is not mistaken for "no previous row"
            unset = object()
            previous_keys = unset
            for row in it:
                keys = getkey(row)
                if previous_keys is unset or keys != previous_keys:
                    yield tuple(row)
                previous_keys = keys
예제 #15
0
파일: push.py 프로젝트: pombredanne/petl
    def __init__(self, default_connections, keyed_connections, fields, key, reverse, buffersize):
        """
        Store the sort settings and start with empty buffers.

        `self.getkey` is None when `key` is None, otherwise an extractor for
        the selected fields. A `buffersize` of None falls back to
        `petl.transform.sorts.defaultbuffersize`.

        """
        super(SortConnection, self).__init__(default_connections, keyed_connections, fields)

        self.getkey = None
        if key is not None:
            # resolve the field selection to positions and build the key
            # extractor (likely to raise on short rows)
            self.getkey = itemgetter(*asindices(fields, key))

        self.reverse = reverse

        self.buffersize = (petl.transform.sorts.defaultbuffersize
                           if buffersize is None else buffersize)

        self.cache = []
        self.chunkfiles = []
예제 #16
0
파일: basics.py 프로젝트: podpearson/petl
    def __iter__(self):
        """
        Yield the table with `self.field` moved to position `self.index`.

        The header is yielded first, then each row rearranged to the new
        field order. Rows too short for the new order are padded with
        `self.missing`.

        """
        rows = iter(self.table)

        # new field order: take the field out, then put it back at the
        # requested position
        old_header = list(next(rows))
        new_header = [name for name in old_header if name != self.field]
        new_header.insert(self.index, self.field)
        yield tuple(new_header)

        # positions in the source row of each output field
        positions = asindices(old_header, new_header)
        rearrange = rowgetter(*positions)

        def _padded(short_row):
            # row is short, let's be kind and fill in any missing fields
            return tuple(short_row[p] if p < len(short_row) else self.missing
                         for p in positions)

        for row in rows:
            try:
                yield rearrange(row)
            except IndexError:
                yield _padded(row)
예제 #17
0
def itersearch(table, pattern, field, flags):
    """
    Yield the header (field names converted with `str`), then every row in
    which the regular expression `pattern` is found.

    `field` chooses what is searched: None searches every value of the row,
    a string searches that single field, anything else is treated as a
    selection of fields. Values are converted with `str` before searching.

    """
    regex = re.compile(pattern, flags)
    rows = iter(table)
    header = [str(name) for name in next(rows)]
    yield tuple(header)

    if field is None:
        # search whole row
        def matches(row):
            return any(regex.search(str(v)) for v in row)
    elif isinstance(field, basestring):
        # search single field
        position = header.index(field)

        def matches(row):
            return regex.search(str(row[position]))
    else:
        # search selection of fields
        positions = asindices(header, field)
        pick = operator.itemgetter(*positions)

        def matches(row):
            return any(regex.search(str(v)) for v in pick(row))

    for row in rows:
        if matches(row):
            yield tuple(row)
예제 #18
0
파일: basics.py 프로젝트: podpearson/petl
def itercut(source, spec, missing=None):
    """
    Yield the table reduced to the fields selected by `spec`, in the order
    given by the selection.

    The header is yielded first, then each data row. Rows too short for the
    selection are padded with `missing`.

    """
    rows = iter(source)
    # take a private copy so the selection cannot change while iterating
    spec = tuple(spec)

    # resolve the selection against the header
    header = next(rows)
    positions = asindices(header, spec)

    # callable that picks the selected positions out of a row
    project = rowgetter(*positions)

    def _padded(short_row):
        # row is short, let's be kind and fill in any missing fields
        return tuple(short_row[p] if p < len(short_row) else missing
                     for p in positions)

    # the header goes through the same projection as the data
    yield project(header)

    for row in rows:
        try:
            yield project(row)
        except IndexError:
            yield _padded(row)
예제 #19
0
def itercutout(source, spec, missing=None):
    """
    Yield the table with the fields selected by `spec` removed.

    The header is yielded first, then each data row, both reduced to the
    remaining fields in their original order. Rows too short for the
    remaining positions are padded with `missing`.

    """
    rows = iter(source)
    # take a private copy so the selection cannot change while iterating
    spec = tuple(spec)

    # resolve the selection, then keep every position that is NOT selected
    header = next(rows)
    dropped = asindices(header, spec)
    positions = [i for i in range(len(header)) if i not in dropped]

    # callable that picks the kept positions out of a row
    project = rowgetter(*positions)

    def _padded(short_row):
        # row is short, let's be kind and fill in any missing fields
        return tuple(short_row[p] if p < len(short_row) else missing
                     for p in positions)

    # the header goes through the same projection as the data
    yield project(header)

    for row in rows:
        try:
            yield project(row)
        except IndexError:
            yield _padded(row)
예제 #20
0
파일: dedup.py 프로젝트: pombredanne/petl
def iterduplicates(source, key):
    """
    Yield the header, then every row that shares its key with a
    neighbouring row.

    Only adjacent rows are compared, so rows with equal keys must be next
    to each other in `source` to be reported. `key` selects the key fields;
    None means every field.

    """
    rows = iter(source)

    header = next(rows)
    yield tuple(header)

    # resolve the key selection to positions
    if key is not None:
        positions = asindices(header, key)
    else:
        positions = range(len(header))

    # key extractor; may raise on short rows, depending on the selection
    key_of = operator.itemgetter(*positions)

    previous = None
    previous_yielded = False

    for row in rows:
        if previous is None:
            # first data row: nothing to compare against yet
            previous = row
            continue

        if key_of(previous) == key_of(row):
            # emit the row that opened this run once, then the current row
            if not previous_yielded:
                yield tuple(previous)
                previous_yielded = True
            yield tuple(row)
        else:
            # a new key starts: reset
            previous_yielded = False
        previous = row
예제 #21
0
def iterduplicates(source, key):
    """
    Yield the header, then every row that shares its key with a
    neighbouring row.

    Only adjacent rows are compared, so rows with equal keys must be next
    to each other in `source` to be reported. `key` selects the key fields;
    None means every field.

    """
    rows = iter(source)

    header = next(rows)
    yield tuple(header)

    # resolve the key selection to positions
    if key is not None:
        positions = asindices(header, key)
    else:
        positions = range(len(header))

    # key extractor; may raise on short rows, depending on the selection
    key_of = operator.itemgetter(*positions)

    previous = None
    previous_yielded = False

    for row in rows:
        if previous is None:
            # first data row: nothing to compare against yet
            previous = row
            continue

        if key_of(previous) == key_of(row):
            # emit the row that opened this run once, then the current row
            if not previous_yielded:
                yield tuple(previous)
                previous_yielded = True
            yield tuple(row)
        else:
            # a new key starts: reset
            previous_yielded = False
        previous = row
예제 #22
0
파일: push.py 프로젝트: talwai/petl
 def __init__(self, default_connections, keyed_connections, fields, discriminator):
     """
     Store the discriminator used by this connection.

     A callable `discriminator` is kept as given; anything else is treated
     as a field selection and turned into an extractor over `fields`.
     """
     super(PartitionConnection, self).__init__(default_connections, keyed_connections, fields)
     if not callable(discriminator):
         # assume field or fields
         discriminator = itemgetter(*asindices(fields, discriminator))
     self.discriminator = discriminator
예제 #23
0
파일: push.py 프로젝트: pombredanne/petl
 def __init__(self, default_connections, keyed_connections, fields, discriminator):
     """
     Store the discriminator used by this connection.

     A callable `discriminator` is kept as given; anything else is treated
     as a field selection and turned into an extractor over `fields`.
     """
     super(PartitionConnection, self).__init__(default_connections, keyed_connections, fields)
     if not callable(discriminator):
         # assume field or fields
         discriminator = itemgetter(*asindices(fields, discriminator))
     self.discriminator = discriminator
예제 #24
0
파일: sorts.py 프로젝트: pombredanne/petl
    def _iternocache(self, source, key, reverse):
        """
        Sort `source` from scratch and yield the header followed by the
        sorted data rows as tuples.

        Any existing cache is cleared first. Rows are ordered by the fields
        selected by `key` (all fields when `key` is None), in reverse order
        if `reverse` is set. Rows are read in chunks of `self.buffersize`:
        if the first chunk already holds every row, the sort happens in
        memory; otherwise each sorted chunk is pickled to a temporary file
        under `self.tempdir` and the chunks are combined with
        `_mergesorted`. When `self.cache` is set, the header, the key
        function and the sorted rows (or the chunk files) are stored on the
        instance.

        """
        debug('iterate without cache')
        self._clearcache()
        it = iter(source)

        flds = it.next()
        yield tuple(flds)

        if key is not None:
            # convert field selection into field indices
            indices = asindices(flds, key)
        else:
            # no key given: sort on every field
            indices = range(len(flds))
        # now use field indices to construct a _getkey function
        # N.B., this will probably raise an exception on short rows
        getkey = sortable_itemgetter(*indices)

        # initialise the first chunk
        rows = list(itertools.islice(it, 0, self.buffersize))
        rows.sort(key=getkey, reverse=reverse)

        # have we exhausted the source iterator?
        # (a buffersize of None makes islice read everything; a chunk
        # shorter than the buffer means nothing was left to read)
        if self.buffersize is None or len(rows) < self.buffersize:

            if self.cache:
                debug('caching mem')
                self._fldcache = flds
                self._memcache = rows
                self._getkey = getkey # actually not needed to iterate from memcache

            for row in rows:
                yield tuple(row)

        else:

            chunkfiles = []

            # one temporary file per sorted chunk, until the source runs dry
            while rows:

                # dump the chunk
                f = NamedTemporaryFile(dir=self.tempdir)
                for row in rows:
                    pickle.dump(row, f, protocol=-1)
                f.flush()
                # N.B., do not close the file! Closing will delete
                # the file, and we might want to keep it around
                # if it can be cached. We'll let garbage collection
                # deal with this, i.e., when no references to the
                # chunk files exist any more, garbage collection
                # should be an implicit close, which will cause file
                # deletion.
                chunkfiles.append(f)

                # grab the next chunk
                rows = list(itertools.islice(it, 0, self.buffersize))
                rows.sort(key=getkey, reverse=reverse)

            if self.cache:
                debug('caching files %r', chunkfiles)
                self._fldcache = flds
                self._filecache = chunkfiles
                self._getkey = getkey

            # presumably iterchunk reads the pickled rows back from each
            # file - defined elsewhere, confirm there
            chunkiters = [iterchunk(f) for f in chunkfiles]
            for row in _mergesorted(getkey, reverse, *chunkiters):
                yield tuple(row)
예제 #25
0
def iterlookupjoin(left, right, lkey, rkey, missing=None, lprefix=None,
                   rprefix=None):
    """
    Left join in which each left row is paired with at most one right row,
    done by stepping through both tables together.

    Yields the output header, then one row per left row: the left values
    followed by the non-key values of the first right row with the same
    key, or by `missing` in each of those positions when there is none.
    `lprefix` / `rprefix`, when not None, are prepended to the respective
    field names in the header.

    Rows are grouped with `itertools.groupby` and group keys are compared
    with ``<`` / ``>``; this only lines the tables up if both inputs already
    arrive ordered by their key fields (nothing here sorts them).

    """
    left_rows = iter(left)
    right_rows = iter(right)

    left_header = next(left_rows)
    right_header = next(right_rows)

    # positions of the key fields in each table
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractors for each table
    left_key_of = operator.itemgetter(*left_key_pos)
    right_key_of = operator.itemgetter(*right_key_pos)

    # only the non-key right fields are emitted, so that key columns are
    # not duplicated in the output
    right_value_pos = [i for i in range(len(right_header))
                       if i not in right_key_pos]
    right_values_of = rowgetter(*right_value_pos)

    # output header
    if lprefix is not None:
        header = [(str(lprefix) + str(f)) for f in left_header]
    else:
        header = list(left_header)
    if rprefix is not None:
        header.extend([(str(rprefix) + str(f))
                       for f in right_values_of(right_header)])
    else:
        header.extend(right_values_of(right_header))
    yield tuple(header)

    def _emit(left_group, right_group):
        # join a group of left rows with a group of right rows (or None)
        if right_group is None:
            # no partner: fill the right-hand columns with the missing value
            padding = [missing] * len(right_value_pos)
            for left_row in left_group:
                yield tuple(list(left_row) + padding)
        else:
            # pick first arbitrarily
            partner = next(iter(right_group))
            for left_row in left_group:
                joined = list(left_row)
                joined.extend(right_values_of(partner))
                yield tuple(joined)

    # iterate over runs of rows sharing a key
    left_groups = itertools.groupby(left_rows, key=left_key_of)
    right_groups = itertools.groupby(right_rows, key=right_key_of)

    # start from None so the comparison after the loop is defined even when
    # a table has no data rows
    left_key = right_key = None
    try:

        left_key, left_group = next(left_groups)
        right_key, right_group = next(right_groups)

        # runs until one of the group iterators is exhausted
        while True:
            if left_key < right_key:
                for row in _emit(left_group, None):
                    yield tuple(row)
                left_key, left_group = next(left_groups)
            elif left_key > right_key:
                right_key, right_group = next(right_groups)
            else:
                for row in _emit(left_group, right_group):
                    yield tuple(row)
                left_key, left_group = next(left_groups)
                right_key, right_group = next(right_groups)

    except StopIteration:
        pass

    # a left group may have been fetched but not yet emitted
    if left_key > right_key:
        for row in _emit(left_group, None):
            yield tuple(row)
    # whatever is still unread on the left has no partner
    for left_key, left_group in left_groups:
        for row in _emit(left_group, None):
            yield tuple(row)
예제 #26
0
파일: interval.py 프로젝트: ianfiske/petlx
def iterintervaljoin(left, right, lstart, lstop, rstart, rstop, lfacet,
                     rfacet, proximity, missing, lprefix, rprefix, leftouter,
                     anti=False):
    """
    Join two tables on overlapping intervals.

    Yields the output header, then the joined rows. For each left row the
    right rows found for its start/stop interval are looked up (within the
    matching facet when `rfacet` is given):

    - if right rows are found and `anti` is false, one output row per right
      row is yielded (left values followed by right values);
    - if none are found and `leftouter` is true, the left row is yielded,
      followed by `missing` for every right field unless `anti` is true.

    With `anti` true the header and rows contain left fields only.
    `lprefix` and `rprefix` are applied independently of each other: each
    one, when not None, is prepended to the field names of its own table.

    Field selections are validated with `asindices`, which raises if a
    selection is not valid.

    """

    # create iterators and obtain fields
    lit = iter(left)
    lfields = next(lit)
    rit = iter(right)
    rfields = next(rit)

    # check fields via petl.util.asindices (raises FieldSelectionError if spec
    # is not valid)
    asindices(lfields, lstart)
    asindices(lfields, lstop)
    if lfacet is not None:
        asindices(lfields, lfacet)
    asindices(rfields, rstart)
    asindices(rfields, rstop)
    if rfacet is not None:
        asindices(rfields, rfacet)

    # determine output fields; the two prefixes are handled separately so
    # that giving only one of them neither fails nor gets ignored
    if lprefix is None:
        outfields = list(lfields)
    else:
        outfields = [lprefix + f for f in lfields]
    if not anti:
        if rprefix is None:
            outfields.extend(rfields)
        else:
            outfields.extend(rprefix + f for f in rfields)
    yield tuple(outfields)

    # create getters for start and stop positions
    getlstart = itemgetter(lfields.index(lstart))
    getlstop = itemgetter(lfields.index(lstop))

    def joinrows(lrow, rrows):
        # output rows for one left row, given the right rows found for it
        if rrows:
            if not anti:
                for rrow in rrows:
                    outrow = list(lrow)
                    outrow.extend(rrow)
                    yield tuple(outrow)
        elif leftouter:
            outrow = list(lrow)
            if not anti:
                outrow.extend([missing] * len(rfields))
            yield tuple(outrow)

    if rfacet is None:
        # build interval lookup for right table
        lookup = intervallookup(right, rstart, rstop, proximity=proximity)
        find = lookup.find
        # main loop
        for lrow in lit:
            rrows = find(getlstart(lrow), getlstop(lrow))
            for outrow in joinrows(lrow, rrows):
                yield outrow

    else:
        # build interval lookup for right table
        lookup = facetintervallookup(right, facet=rfacet, start=rstart,
                                     stop=rstop, proximity=proximity)
        find = dict()
        for f in lookup:
            find[f] = lookup[f].find
        # getter for facet key values in left table
        getlkey = itemgetter(*asindices(lfields, lfacet))
        # main loop
        for lrow in lit:
            lkey = getlkey(lrow)
            start = getlstart(lrow)
            stop = getlstop(lrow)

            try:
                rrows = find[lkey](start, stop)
            except (KeyError, AttributeError):
                # no lookup for this facet value (or the lookup could not
                # be queried): treat as no matching right rows
                rrows = None

            for outrow in joinrows(lrow, rrows):
                yield outrow
예제 #27
0
def iterintervalsubtract(left, right, lstart, lstop, rstart, rstop, lfacet, rfacet,
                         proximity):
    """
    Yield the rows of `left` after subtracting matching intervals of `right`.

    The first item yielded is the header of `left` as a tuple. For every
    following left row the right-hand intervals found for its (start, stop)
    are collected; if there are none the row is yielded unchanged, otherwise
    one copy of the row is yielded per (start, stop) pair produced by
    `_subtract`, with the start and stop fields overwritten by that pair.

    `lstart`/`lstop` and `rstart`/`rstop` name the interval fields in the left
    and right headers. `lfacet`/`rfacet` optionally name a facet field; when
    `rfacet` is not None the lookup is performed per facet value. `proximity`
    is passed through unchanged to the interval lookup constructor.

    If either table is empty (has no header row) nothing is yielded. An
    AssertionError is raised if a named field is missing from its header.

    """

    # create iterators and obtain fields; use the builtin next() (works on
    # Python 2.6+ and 3) and finish quietly on an empty table rather than
    # leaking StopIteration out of the generator
    lit = iter(left)
    try:
        lfields = next(lit)
    except StopIteration:
        return
    # NOTE: asserts are stripped under -O; kept as-is so the exception type
    # seen by existing callers does not change
    assert lstart in lfields, 'field not found: %s' % lstart
    assert lstop in lfields, 'field not found: %s' % lstop
    if lfacet is not None:
        assert lfacet in lfields, 'field not found: %s' % lfacet
    rit = iter(right)
    try:
        rfields = next(rit)
    except StopIteration:
        return
    assert rstart in rfields, 'field not found: %s' % rstart
    assert rstop in rfields, 'field not found: %s' % rstop
    if rfacet is not None:
        assert rfacet in rfields, 'field not found: %s' % rfacet

    # output fields are exactly the left fields
    yield tuple(lfields)

    # create getters for start and stop positions
    lstartidx = lfields.index(lstart)
    lstopidx = lfields.index(lstop)
    getlcoords = itemgetter(lstartidx, lstopidx)
    getrcoords = itemgetter(rfields.index(rstart), rfields.index(rstop))

    if rfacet is None:
        # build a single interval lookup for the whole right table
        find = intervallookup(right, rstart, rstop, proximity=proximity).find

        def findrows(lrow, start, stop):
            return find(start, stop)

    else:
        # build one interval lookup per facet value of the right table
        lookup = facetintervallookup(right, facet=rfacet, start=rstart, stop=rstop,
                                     proximity=proximity)
        # getter for facet key values in left table
        getlkey = itemgetter(*asindices(lfields, lfacet))

        def findrows(lrow, start, stop):
            lkey = getlkey(lrow)
            try:
                return lookup[lkey].find(start, stop)
            except (KeyError, AttributeError):
                # nothing to search for this facet value
                return None

    # main loop (shared by the faceted and unfaceted cases)
    for lrow in lit:
        start, stop = getlcoords(lrow)
        rrows = findrows(lrow, start, stop)
        if not rrows:
            yield tuple(lrow)
            continue
        # right-hand (start, stop) pairs, sorted by start
        rivs = sorted([getrcoords(rrow) for rrow in rrows], key=itemgetter(0))
        for x, y in _subtract(start, stop, rivs):
            out = list(lrow)
            out[lstartidx] = x
            out[lstopidx] = y
            yield tuple(out)
예제 #28
0
def iterintervalleftjoin(left, right, lstart, lstop, rstart, rstop, lfacet, rfacet,
                         proximity, missing):
    """
    Yield the rows of an interval left join of `left` with `right`.

    The first item yielded is the combined header (left fields followed by
    right fields). For every following left row, one output row is yielded per
    right row found for its (start, stop); if none are found a single row is
    yielded with the right-hand side filled with `missing`.

    `lstart`/`lstop` and `rstart`/`rstop` name the interval fields in the left
    and right headers. `lfacet`/`rfacet` optionally name a facet field; when
    `rfacet` is not None the lookup is performed per facet value. `proximity`
    is passed through unchanged to the interval lookup constructor.

    If either table is empty (has no header row) nothing is yielded. An
    AssertionError is raised if a named field is missing from its header.

    """

    # create iterators and obtain fields; use the builtin next() (works on
    # Python 2.6+ and 3) and finish quietly on an empty table rather than
    # leaking StopIteration out of the generator
    lit = iter(left)
    try:
        lfields = next(lit)
    except StopIteration:
        return
    # NOTE: asserts are stripped under -O; kept as-is so the exception type
    # seen by existing callers does not change
    assert lstart in lfields, 'field not found: %s' % lstart
    assert lstop in lfields, 'field not found: %s' % lstop
    if lfacet is not None:
        assert lfacet in lfields, 'field not found: %s' % lfacet
    rit = iter(right)
    try:
        rfields = next(rit)
    except StopIteration:
        return
    assert rstart in rfields, 'field not found: %s' % rstart
    assert rstop in rfields, 'field not found: %s' % rstop
    if rfacet is not None:
        assert rfacet in rfields, 'field not found: %s' % rfacet

    # determine output fields
    yield tuple(lfields) + tuple(rfields)

    # create getters for start and stop positions
    getlstart = itemgetter(lfields.index(lstart))
    getlstop = itemgetter(lfields.index(lstop))

    # right-hand filler for unmatched left rows (loop invariant)
    padding = (missing,) * len(rfields)

    if rfacet is None:
        # build a single interval lookup for the whole right table
        find = intervallookup(right, rstart, rstop, proximity=proximity).find

        def findrows(lrow, start, stop):
            return find(start, stop)

    else:
        # build one interval lookup per facet value of the right table and
        # keep only the bound find methods
        lookup = facetintervallookup(right, facet=rfacet, start=rstart, stop=rstop,
                                     proximity=proximity)
        finders = dict((f, lookup[f].find) for f in lookup)
        # getter for facet key values in left table
        getlkey = itemgetter(*asindices(lfields, lfacet))

        def findrows(lrow, start, stop):
            lkey = getlkey(lrow)
            try:
                return finders[lkey](start, stop)
            except (KeyError, AttributeError):
                # nothing to search for this facet value
                return None

    # main loop (shared by the faceted and unfaceted cases)
    for lrow in lit:
        start = getlstart(lrow)
        stop = getlstop(lrow)
        rrows = findrows(lrow, start, stop)
        if rrows:
            for rrow in rrows:
                yield tuple(lrow) + tuple(rrow)
        else:
            yield tuple(lrow) + padding
예제 #29
0
def iterintervaljoin(left,
                     right,
                     lstart,
                     lstop,
                     rstart,
                     rstop,
                     lfacet,
                     rfacet,
                     proximity,
                     missing,
                     lprefix,
                     rprefix,
                     leftouter,
                     anti=False):
    """
    Yield the rows of an interval join (inner, left outer or anti) of `left`
    with `right`.

    The first item yielded is the output header: the left fields, followed by
    the right fields unless `anti` is true. `lprefix` and `rprefix`, when not
    None, are prepended to the left and right field names respectively; each
    prefix is applied independently of the other.

    For every following left row the right rows found for its (start, stop)
    are looked up:

    - if matches exist and `anti` is false, one combined row is yielded per
      match; if `anti` is true the left row is dropped;
    - if no matches exist and `leftouter` is true, the left row is yielded,
      padded with `missing` for every right field unless `anti` is true;
    - otherwise nothing is yielded for that row.

    `lfacet`/`rfacet` optionally name a facet field; when `rfacet` is not None
    the lookup is performed per facet value. `proximity` is passed through
    unchanged to the interval lookup constructor. Field specifications are
    validated with `asindices` before the header is yielded.

    If either table is empty (has no header row) nothing is yielded.

    """

    # create iterators and obtain fields; use the builtin next() (works on
    # Python 2.6+ and 3) and finish quietly on an empty table rather than
    # leaking StopIteration out of the generator
    lit = iter(left)
    try:
        lfields = next(lit)
    except StopIteration:
        return
    rit = iter(right)
    try:
        rfields = next(rit)
    except StopIteration:
        return

    # check fields via petl.util.asindices (raises FieldSelectionError if spec
    # is not valid)
    asindices(lfields, lstart)
    asindices(lfields, lstop)
    if lfacet is not None:
        asindices(lfields, lfacet)
    asindices(rfields, rstart)
    asindices(rfields, rstop)
    if rfacet is not None:
        asindices(rfields, rfacet)

    # determine output fields; each prefix is handled on its own so that a
    # lone rprefix is honoured and a lone lprefix does not hit `None + f`
    if lprefix is None:
        outfields = list(lfields)
    else:
        outfields = [lprefix + f for f in lfields]
    if not anti:
        if rprefix is None:
            outfields.extend(rfields)
        else:
            outfields.extend(rprefix + f for f in rfields)
    yield tuple(outfields)

    # create getters for start and stop positions
    getlstart = itemgetter(lfields.index(lstart))
    getlstop = itemgetter(lfields.index(lstop))

    # right-hand filler for unmatched left rows (loop invariant)
    padding = (missing,) * len(rfields)

    if rfacet is None:
        # build a single interval lookup for the whole right table
        find = intervallookup(right, rstart, rstop, proximity=proximity).find

        def findrows(lrow, start, stop):
            return find(start, stop)

    else:
        # build one interval lookup per facet value of the right table and
        # keep only the bound find methods
        lookup = facetintervallookup(right,
                                     facet=rfacet,
                                     start=rstart,
                                     stop=rstop,
                                     proximity=proximity)
        finders = dict((f, lookup[f].find) for f in lookup)
        # getter for facet key values in left table
        getlkey = itemgetter(*asindices(lfields, lfacet))

        def findrows(lrow, start, stop):
            lkey = getlkey(lrow)
            try:
                return finders[lkey](start, stop)
            except (KeyError, AttributeError):
                # nothing to search for this facet value
                return None

    # main loop (shared by the faceted and unfaceted cases)
    for lrow in lit:
        start = getlstart(lrow)
        stop = getlstop(lrow)
        rrows = findrows(lrow, start, stop)
        if rrows:
            if not anti:
                for rrow in rrows:
                    yield tuple(lrow) + tuple(rrow)
        elif leftouter:
            if anti:
                yield tuple(lrow)
            else:
                yield tuple(lrow) + padding
예제 #30
0
def iterintervalsubtract(left, right, lstart, lstop, rstart, rstop, lfacet,
                         rfacet, proximity):
    """
    Yield the rows of `left` after subtracting matching intervals of `right`.

    The header of `left` is yielded first, as a tuple. Each subsequent left
    row is checked against the right-hand intervals found for its
    (start, stop): with no matches the row is yielded unchanged; otherwise
    the row is yielded once per (start, stop) pair produced by `_subtract`,
    with its start and stop fields replaced by that pair.

    `lstart`/`lstop` and `rstart`/`rstop` name the interval fields in the left
    and right headers. `lfacet`/`rfacet` optionally name a facet field; when
    `rfacet` is not None the lookup is performed per facet value. `proximity`
    is passed through unchanged to the interval lookup constructor.

    If either table is empty (has no header row) nothing is yielded. An
    AssertionError is raised if a named field is missing from its header.

    """

    # create iterators and obtain fields; the builtin next() works on
    # Python 2.6+ and 3, and an empty table simply ends the generator instead
    # of leaking StopIteration out of it
    lit = iter(left)
    try:
        lfields = next(lit)
    except StopIteration:
        return
    # NOTE: asserts are stripped under -O; kept as-is so the exception type
    # seen by existing callers does not change
    assert lstart in lfields, 'field not found: %s' % lstart
    assert lstop in lfields, 'field not found: %s' % lstop
    if lfacet is not None:
        assert lfacet in lfields, 'field not found: %s' % lfacet
    rit = iter(right)
    try:
        rfields = next(rit)
    except StopIteration:
        return
    assert rstart in rfields, 'field not found: %s' % rstart
    assert rstop in rfields, 'field not found: %s' % rstop
    if rfacet is not None:
        assert rfacet in rfields, 'field not found: %s' % rfacet

    # output fields are exactly the left fields
    yield tuple(lfields)

    # create getters for start and stop positions
    lstartidx = lfields.index(lstart)
    lstopidx = lfields.index(lstop)
    getlcoords = itemgetter(lstartidx, lstopidx)
    getrcoords = itemgetter(rfields.index(rstart), rfields.index(rstop))

    if rfacet is None:
        # build a single interval lookup for the whole right table
        find = intervallookup(right, rstart, rstop, proximity=proximity).find

        def findrows(lrow, start, stop):
            return find(start, stop)

    else:
        # build one interval lookup per facet value of the right table
        lookup = facetintervallookup(right,
                                     facet=rfacet,
                                     start=rstart,
                                     stop=rstop,
                                     proximity=proximity)
        # getter for facet key values in left table
        getlkey = itemgetter(*asindices(lfields, lfacet))

        def findrows(lrow, start, stop):
            lkey = getlkey(lrow)
            try:
                return lookup[lkey].find(start, stop)
            except (KeyError, AttributeError):
                # nothing to search for this facet value
                return None

    # main loop (shared by the faceted and unfaceted cases)
    for lrow in lit:
        start, stop = getlcoords(lrow)
        rrows = findrows(lrow, start, stop)
        if not rrows:
            yield tuple(lrow)
            continue
        # right-hand (start, stop) pairs, sorted by start
        rivs = sorted([getrcoords(rrow) for rrow in rrows],
                      key=itemgetter(0))
        for x, y in _subtract(start, stop, rivs):
            out = list(lrow)
            out[lstartidx] = x
            out[lstopidx] = y
            yield tuple(out)