예제 #1
0
class Join(MiniEngine):
    '''
    The Join mini-engine combines the records of two streams in such a way
    that the result is the Cartesian product between two corresponding
    record partitions, one from each stream.

    Partitions are delimited by StopWord items. The i-th partition of the
    first stream is joined with the i-th partition of the second stream as
    soon as both have been received completely, and the joined records are
    followed by a single StopWord on the output stream.
    '''
    class PartitionBuffer(object):
        '''
        This class represents a partition buffer for a given endpoint. Each
        partition received from the endpoint is stored in a separate
        buffer.
        '''
        def __init__(self):
            # One list per partition, in the order the partitions arrived.
            self._b = []

        def append(self, r):
            '''
            Adds r to the partition that is currently being filled, creating
            the first partition on demand.
            '''
            if not self._b:
                self._b.append([])

            self._b[-1].append(r)

        def current(self):
            '''
            Returns the index of the partition currently being filled
            (-1 if nothing has been buffered yet).
            '''
            return len(self._b) - 1

        def next(self):
            '''
            Starts a new, empty partition.
            '''
            self._b.append([])

        def get(self, i):
            '''
            Returns the list of items of partition i (None once the
            partition has been removed).
            '''
            assert i >= 0 and i < len(self._b)
            return self._b[i]

        def finished(self, i):
            '''
            Returns True if partition i is complete, i.e. a later partition
            has already been started.
            '''
            return i < len(self._b) - 1

        def remove(self, i):
            '''
            Releases the items of partition i. The slot itself is kept (set
            to None) so that the indices of later partitions stay valid.
            '''
            assert i >= 0 and i < len(self._b)
            self._b[i] = None

    def __init__(self, first, second):
        '''
        Connects to the two input streams and creates the output stream.

        first, second -- the input streams; the output schema consists of
                         the attributes of first followed by those of
                         second.
        '''
        MiniEngine.__init__(self)

        self._first = first
        self._second = second

        # Construct the schema of the output stream.
        self._schema = Schema()
        for a in self._first.schema() + self._second.schema():
            self._schema.append(a)

        # Both endpoints announce themselves on this queue; run() takes the
        # endpoints from it one at a time.
        self._queue = Queue(100)

        self._first_ep = self._first.connect()
        self._first_ep.notify(self._queue)

        self._second_ep = self._second.connect()
        self._second_ep.notify(self._queue)

        self._output = Stream(self._schema, SortOrder(), 'Join')

        # Maps each endpoint back to the stream it belongs to.
        self._m = {
            self._first_ep: self._first,
            self._second_ep: self._second,
        }

        # Number of merged partitions whose second-stream side was empty.
        self._empty = 0

    def output(self):
        '''
        Returns the output stream of the join.
        '''
        return self._output

    def _merge(self, buffers, i):
        '''
        Generator yielding the Cartesian product of partition i of the
        first and the second stream.

        Both partitions must be finished. Their buffers are released only
        after the generator has been consumed completely.
        '''
        assert buffers[self._first_ep].finished(i)
        assert buffers[self._second_ep].finished(i)

        b1 = buffers[self._first_ep].get(i)
        b2 = buffers[self._second_ep].get(i)

        # A partition holding nothing but its terminating StopWord is empty.
        if len(b2) == 1:
            self._empty += 1

        # The last item of each partition is the StopWord that closed it,
        # so it is excluded from the product.
        for r1 in b1[:-1]:
            for r2 in b2[:-1]:
                yield r1 + r2

        buffers[self._first_ep].remove(i)
        buffers[self._second_ep].remove(i)

    def run(self):
        '''
        Main loop: drains the endpoints announced on the queue, buffers
        their records per partition and emits the joined partitions until
        both endpoints are closed and the queue is empty.
        '''
        done = False
        buffers = {
            self._first_ep: self.PartitionBuffer(),
            self._second_ep: self.PartitionBuffer(),
        }
        while not done or not self._queue.empty():
            e = self._queue.get()

            if e not in buffers:
                print('ERROR: no buffer for endpoint')
                # Every get() needs a matching task_done(), also on this
                # error path; otherwise a join() on the queue never returns.
                self._queue.task_done()
                continue

            # Drain everything that is currently available on the endpoint.
            while True:
                try:
                    r = e.receive(False)
                    buffers[e].append(r)
                    if type(r) is StopWord:
                        current = buffers[e].current()
                        # Advance this buffer's partition by 1
                        buffers[e].next()
                        # Only merge if all buffers have completed this
                        # partition.
                        if all(b.finished(current)
                               for b in buffers.values()):
                            for x in self._merge(buffers, current):
                                self._output.send(x)
                            self._output.send(StopWord())
                    e.processed()
                except (StreamClosedException, Empty):
                    # Either the endpoint was closed or nothing more is
                    # pending right now; any other exception propagates.
                    break

            # The engine is finished once every endpoint has been closed.
            done = all(o.closed() for o in buffers)
            self._queue.task_done()
        self._output.close()
        print('Join done. %d empty buffers.' % (self._empty))
예제 #2
0
class Join(MiniEngine):
    '''
    The Join mini-engine combines the records of two streams in such a way
    that the result is the Cartesian product between two corresponding
    record partitions, one from each stream.

    Partitions are delimited by StopWord items. The i-th partition of the
    first stream is joined with the i-th partition of the second stream as
    soon as both have been received completely, and the joined records are
    followed by a single StopWord on the output stream.
    '''
    class PartitionBuffer(object):
        '''
        This class represents a partition buffer for a given endpoint. Each
        partition received from the endpoint is stored in a separate
        buffer.
        '''
        def __init__(self):
            # One list per partition, in the order the partitions arrived.
            self._b = []

        def append(self, r):
            '''
            Adds r to the partition that is currently being filled, creating
            the first partition on demand.
            '''
            if not self._b:
                self._b.append([])

            self._b[-1].append(r)

        def current(self):
            '''
            Returns the index of the partition currently being filled
            (-1 if nothing has been buffered yet).
            '''
            return len(self._b) - 1

        def next(self):
            '''
            Starts a new, empty partition.
            '''
            self._b.append([])

        def get(self, i):
            '''
            Returns the list of items of partition i (None once the
            partition has been removed).
            '''
            assert i >= 0 and i < len(self._b)
            return self._b[i]

        def finished(self, i):
            '''
            Returns True if partition i is complete, i.e. a later partition
            has already been started.
            '''
            return i < len(self._b) - 1

        def remove(self, i):
            '''
            Releases the items of partition i. The slot itself is kept (set
            to None) so that the indices of later partitions stay valid.
            '''
            assert i >= 0 and i < len(self._b)
            self._b[i] = None

    def __init__(self, first, second):
        '''
        Connects to the two input streams and creates the output stream.

        first, second -- the input streams; the output schema consists of
                         the attributes of first followed by those of
                         second.
        '''
        MiniEngine.__init__(self)

        self._first = first
        self._second = second

        # Construct the schema of the output stream.
        self._schema = Schema()
        for a in self._first.schema() + self._second.schema():
            self._schema.append(a)

        # Both endpoints announce themselves on this queue; run() takes the
        # endpoints from it one at a time.
        self._queue = Queue(100)

        self._first_ep = self._first.connect()
        self._first_ep.notify(self._queue)

        self._second_ep = self._second.connect()
        self._second_ep.notify(self._queue)

        self._output = Stream(
            self._schema,
            SortOrder(),
            'Join'
        )

        # Maps each endpoint back to the stream it belongs to.
        self._m = {
            self._first_ep: self._first,
            self._second_ep: self._second,
        }

        # Number of merged partitions whose second-stream side was empty.
        self._empty = 0

    def output(self):
        '''
        Returns the output stream of the join.
        '''
        return self._output

    def _merge(self, buffers, i):
        '''
        Generator yielding the Cartesian product of partition i of the
        first and the second stream.

        Both partitions must be finished. Their buffers are released only
        after the generator has been consumed completely.
        '''
        assert buffers[self._first_ep].finished(i)
        assert buffers[self._second_ep].finished(i)

        b1 = buffers[self._first_ep].get(i)
        b2 = buffers[self._second_ep].get(i)

        # A partition holding nothing but its terminating StopWord is empty.
        if len(b2) == 1:
            self._empty += 1

        # The last item of each partition is the StopWord that closed it,
        # so it is excluded from the product.
        for r1 in b1[:-1]:
            for r2 in b2[:-1]:
                yield r1 + r2

        buffers[self._first_ep].remove(i)
        buffers[self._second_ep].remove(i)

    def run(self):
        '''
        Main loop: drains the endpoints announced on the queue, buffers
        their records per partition and emits the joined partitions until
        both endpoints are closed and the queue is empty.
        '''
        done = False
        buffers = {
            self._first_ep: self.PartitionBuffer(),
            self._second_ep: self.PartitionBuffer(),
        }
        while not done or not self._queue.empty():
            e = self._queue.get()

            if e not in buffers:
                print('ERROR: no buffer for endpoint')
                # Every get() needs a matching task_done(), also on this
                # error path; otherwise a join() on the queue never returns.
                self._queue.task_done()
                continue

            # Drain everything that is currently available on the endpoint.
            while True:
                try:
                    r = e.receive(False)
                    buffers[e].append(r)
                    if type(r) is StopWord:
                        current = buffers[e].current()
                        # Advance this buffer's partition by 1
                        buffers[e].next()
                        # Only merge if all buffers have completed this
                        # partition.
                        if all(b.finished(current)
                               for b in buffers.values()):
                            for x in self._merge(buffers, current):
                                self._output.send(x)
                            self._output.send(StopWord())
                    e.processed()
                except (StreamClosedException, Empty):
                    # Either the endpoint was closed or nothing more is
                    # pending right now; any other exception propagates.
                    break

            # The engine is finished once every endpoint has been closed.
            done = all(o.closed() for o in buffers)
            self._queue.task_done()
        self._output.close()
        print('Join done. %d empty buffers.' % (self._empty))
예제 #3
0
class Rtree(DataSource):
    '''
    Data source that serves (oid, geometry) records from an R-tree index.

    Three files are used, all derived from the given file name: the R-tree
    itself, '<filename>.data' holding the serialized geometries back to
    back, and '<filename>.data.idx' holding one native unsigned long per
    object with the offset of that object inside the data file.
    '''
    def __init__(self, filename, name):
        '''
        Opens and memory-maps the data and data index files and opens the
        R-tree.

        filename -- base name of the R-tree files
        name     -- name of the geometry attribute in the schema
        '''
        # Name of the Rtree file
        self._filename = filename
        # Name of the geometry attribute
        self._name = name

        # Construct the schema for the R-tree data source. It consists of a
        # unique OID which was generated during index creation and the
        # geometry object with the specified attribute name.
        self._schema = Schema()
        self._schema.append(Attribute('oid', int, True))
        self._schema.append(Attribute(name, Geometry))

        # Construct the file name of the data and data index file.
        self._data_filename = self._filename + '.data'
        self._index_filename = self._filename + '.data.idx'

        # Open the data and data index files. Both hold binary data, so
        # they are opened in binary mode.
        self._data_file = open(self._data_filename, 'r+b')
        self._index_file = open(self._index_filename, 'r+b')

        # Determine the length of the data and the data index files.
        self._index_file.seek(0, os.SEEK_END)
        self._index_length = self._index_file.tell()
        self._index_file.seek(0)
        self._data_file.seek(0, os.SEEK_END)
        self._data_length = self._data_file.tell()
        self._data_file.seek(0)

        # Compute size of a long int and the number of index entries.
        # Floor division keeps the entry count an integer regardless of
        # the division semantics in effect.
        self._long_size = struct.calcsize('L')
        self._index_size = self._index_length // self._long_size

        # Memory-map data and data index
        self._data = mmap.mmap(self._data_file.fileno(), 0)
        self._index = mmap.mmap(self._index_file.fileno(), 0)

        # Open the R-tree
        self._tree = _Rtree(self._filename)

    def schema(self):
        '''
        Returns the schema of the data source: 'oid' plus the geometry
        attribute.
        '''
        return self._schema

    def _get_by_oid(self, oid):
        '''
        Returns the record (oid, Geometry) for the given object ID.

        Raises KeyError if the ID lies outside the index.
        '''
        if oid < 0 or oid >= self._index_size:
            raise KeyError('Object with ID [%d] does not exist.' % (oid))
        # Compute address of object pointer.
        a = oid * self._long_size
        # Compute address of following object pointer.
        b = a + 2 * self._long_size
        # Unpack pointer to the address in the datafile.
        if b > self._index_length:
            # If the object pointer is the last one and thus there is
            # no following record, the length of the object is
            # restricted to the data file's size.
            first, = struct.unpack(
                'L',
                self._index[a:self._index_length]
            )
            second = self._data_length
        else:
            # Otherwise simply compute the object's size from the
            # difference between its address and the address of the
            # following object.
            first, second = struct.unpack('LL', self._index[a:b])
        return (oid, Geometry(self._data[first:second]))

    def __getitem__(self, key):
        '''
        Returns the record whose object ID is key['oid'].
        '''
        return self._get_by_oid(key['oid'])

    def __iter__(self):
        '''
        Iterates over all records in object ID order.
        '''
        return self._intersect_oid((0, self._index_size))

    def _intersect_geom(self, geom):
        '''
        Returns records for which their geometry portion intersects with
        the given geometry.

        Nothing is returned for an invalid or zero-area query geometry.
        '''
        shape = geom.geom()
        if not shape.is_valid or shape.area == 0.0:
            return

        # The bounding-box query only yields candidates; each candidate is
        # checked against the exact query geometry before it is returned.
        for oid, g in self._intersect_box(shape.bounds):
            if g.geom().intersects(shape):
                yield (oid, g)

    def _intersect_box(self, box):
        '''
        Returns the records whose IDs the R-tree reports for the given box.
        '''
        for oid in self._tree.intersection(box):
            yield self._get_by_oid(oid)

    def _intersect_oid(self, r):
        '''
        Returns the records with object IDs in the half-open range
        [r[0], r[1]), trimmed to the IDs that actually exist.
        '''
        # Trim the range to the size of the file. max()/min() are used
        # instead of the "cond and a or b" idiom, which turns an upper
        # bound of 0 into the full index size.
        low = max(r[0], 0)
        high = min(r[1], self._index_size)
        for oid in range(low, high):
            yield self._get_by_oid(oid)

    def intersect(self, ranges):
        '''
        Returns the records matching the given restriction.

        ranges must contain exactly one of the geometry attribute name
        and 'oid':
          - a Geometry selects the records intersecting that geometry,
          - a tuple under the geometry attribute is used as a bounding
            box,
          - None under the geometry attribute selects nothing,
          - a tuple under 'oid' selects a range of object IDs.

        Raises Exception for any other argument.
        '''
        has_geom = self._name in ranges
        has_oid = 'oid' in ranges

        if has_geom and not has_oid:
            restriction = ranges[self._name]
            if isinstance(restriction, Geometry):
                return self._intersect_geom(restriction)
            if isinstance(restriction, tuple):
                return self._intersect_box(restriction)
            if restriction is None:
                return []
        elif has_oid and not has_geom:
            restriction = ranges['oid']
            if isinstance(restriction, tuple):
                return self._intersect_oid(restriction)

        raise Exception('Invalid argument to intersect() method.')

    def __del__(self):
        '''
        Releases the memory maps and the underlying files.
        '''
        # __init__ may have failed before all handles were created, so each
        # one is looked up defensively. The maps are closed before the
        # files they are backed by.
        for attr in ('_data', '_index', '_data_file', '_index_file'):
            handle = getattr(self, attr, None)
            if handle is not None:
                handle.close()