class Join(MiniEngine):
    '''
    The Join mini-engine combines the records of two streams in such a way
    that the result is the Cartesian product between two corresponding
    record partitions, one from each stream.
    '''

    class PartitionBuffer(object):
        '''
        This class represents a partition buffer for a given endpoint. Each
        partition received from the endpoint is stored in a separate buffer.
        '''

        def __init__(self):
            # One list of records per partition, in arrival order.
            self._b = []

        def append(self, r):
            # Lazily create the first partition on the first record.
            if len(self._b) == 0:
                self._b.append([])
            self._b[-1].append(r)

        def current(self):
            # Index of the partition currently receiving records.
            return len(self._b) - 1

        def next(self):
            # Start a new, empty partition.
            self._b.append([])

        def get(self, i):
            assert i >= 0 and i < len(self._b)
            return self._b[i]

        def finished(self, i):
            # Partition i is complete once a later partition has been
            # started.
            return i < len(self._b) - 1

        def remove(self, i):
            # Drop the records of partition i but keep the slot so that
            # later partition indexes remain stable.
            assert i >= 0 and i < len(self._b)
            self._b[i] = None

    def __init__(self, first, second):
        '''
        first, second: the two input streams whose corresponding partitions
        are joined pairwise.
        '''
        MiniEngine.__init__(self)
        self._first = first
        self._second = second

        # The output schema is the concatenation of both input schemas.
        self._schema = Schema()
        for a in self._first.schema() + self._second.schema():
            self._schema.append(a)

        # Both endpoints signal record arrival through this shared queue.
        self._queue = Queue(100)

        self._first_ep = self._first.connect()
        self._first_ep.notify(self._queue)

        self._second_ep = self._second.connect()
        self._second_ep.notify(self._queue)

        self._output = Stream(self._schema, SortOrder(), 'Join')

        # Endpoint -> stream mapping (kept for lookup/debugging).
        self._m = {
            self._first_ep: self._first,
            self._second_ep: self._second,
        }

        # Number of empty partitions observed on the second input.
        self._empty = 0

    def output(self):
        '''Return the output stream of the join.'''
        return self._output

    def _merge(self, buffers, i):
        '''
        Yield the Cartesian product of partition i of both inputs, then
        release both partition buffers.
        '''
        assert buffers[self._first_ep].finished(i)
        assert buffers[self._second_ep].finished(i)

        b1 = buffers[self._first_ep].get(i)
        b2 = buffers[self._second_ep].get(i)

        # A partition holding only its trailing StopWord is empty.
        if len(b2) == 1:
            self._empty += 1

        # The last element of each partition is its StopWord marker;
        # exclude it from the product.
        for r1 in b1[:-1]:
            for r2 in b2[:-1]:
                yield r1 + r2

        buffers[self._first_ep].remove(i)
        buffers[self._second_ep].remove(i)

    def run(self):
        '''
        Consume records from both endpoints and emit one joined partition
        whenever both sides have completed it.
        '''
        done = False
        buffers = {
            self._first_ep: self.PartitionBuffer(),
            self._second_ep: self.PartitionBuffer(),
        }
        while not done or not self._queue.empty():
            e = self._queue.get()
            if e not in buffers:
                # BUG FIX: this message contained a raw line break in the
                # original source, which is not a valid string literal.
                print('ERROR: no buffer for endpoint')
                continue

            # Drain this endpoint until it is exhausted or closed.
            valid = True
            closed = False
            while valid and not closed:
                try:
                    r = e.receive(False)
                    buffers[e].append(r)
                    if type(r) is StopWord:
                        current = buffers[e].current()
                        buffers[e].next()

                        # Only merge if all buffers have completed this
                        # partition.
                        merge = True
                        for o in buffers:
                            merge &= buffers[o].finished(current)

                        if merge:
                            for x in self._merge(buffers, current):
                                self._output.send(x)
                            self._output.send(StopWord())
                    # Acknowledge the record with the endpoint.
                    e.processed()
                except StreamClosedException:
                    closed = True
                except Empty:
                    valid = False

            # Re-check for termination once this endpoint is drained: the
            # join is done when every endpoint has been closed.
            done = True
            for o in buffers:
                done &= o.closed()
            self._queue.task_done()
        self._output.close()
        print('Join done. %d empty buffers.' % (self._empty))
# NOTE(review): this class is defined a second time in this file; the later
# definition rebinds the name `Join` at import time. Consider removing one
# of the two copies.
class Join(MiniEngine):
    '''
    The Join mini-engine combines the records of two streams in such a way
    that the result is the Cartesian product between two corresponding
    record partitions, one from each stream.
    '''

    class PartitionBuffer(object):
        '''
        This class represents a partition buffer for a given endpoint. Each
        partition received from the endpoint is stored in a separate buffer.
        '''

        def __init__(self):
            # One list of records per partition, in arrival order.
            self._b = []

        def append(self, r):
            # Lazily create the first partition on the first record.
            if len(self._b) == 0:
                self._b.append([])
            self._b[-1].append(r)

        def current(self):
            # Index of the partition currently receiving records.
            return len(self._b) - 1

        def next(self):
            # Start a new, empty partition.
            self._b.append([])

        def get(self, i):
            assert i >= 0 and i < len(self._b)
            return self._b[i]

        def finished(self, i):
            # Partition i is complete once a later partition has been
            # started.
            return i < len(self._b) - 1

        def remove(self, i):
            # Drop the records of partition i but keep the slot so that
            # later partition indexes remain stable.
            assert i >= 0 and i < len(self._b)
            self._b[i] = None

    def __init__(self, first, second):
        '''
        first, second: the two input streams whose corresponding partitions
        are joined pairwise.
        '''
        MiniEngine.__init__(self)
        self._first = first
        self._second = second

        # The output schema is the concatenation of both input schemas.
        self._schema = Schema()
        for a in self._first.schema() + self._second.schema():
            self._schema.append(a)

        # Both endpoints signal record arrival through this shared queue.
        self._queue = Queue(100)

        self._first_ep = self._first.connect()
        self._first_ep.notify(self._queue)

        self._second_ep = self._second.connect()
        self._second_ep.notify(self._queue)

        self._output = Stream(self._schema, SortOrder(), 'Join')

        # Endpoint -> stream mapping (kept for lookup/debugging).
        self._m = {
            self._first_ep: self._first,
            self._second_ep: self._second,
        }

        # Number of empty partitions observed on the second input.
        self._empty = 0

    def output(self):
        '''Return the output stream of the join.'''
        return self._output

    def _merge(self, buffers, i):
        '''
        Yield the Cartesian product of partition i of both inputs, then
        release both partition buffers.
        '''
        assert buffers[self._first_ep].finished(i)
        assert buffers[self._second_ep].finished(i)

        b1 = buffers[self._first_ep].get(i)
        b2 = buffers[self._second_ep].get(i)

        # A partition holding only its trailing StopWord is empty.
        if len(b2) == 1:
            self._empty += 1

        # The last element of each partition is its StopWord marker;
        # exclude it from the product.
        for r1 in b1[:-1]:
            for r2 in b2[:-1]:
                yield r1 + r2

        buffers[self._first_ep].remove(i)
        buffers[self._second_ep].remove(i)

    def run(self):
        '''
        Consume records from both endpoints and emit one joined partition
        whenever both sides have completed it.
        '''
        done = False
        buffers = {
            self._first_ep: self.PartitionBuffer(),
            self._second_ep: self.PartitionBuffer(),
        }
        while not done or not self._queue.empty():
            e = self._queue.get()
            if e not in buffers:
                # BUG FIX: this message contained a raw line break in the
                # original source, which is not a valid string literal.
                print('ERROR: no buffer for endpoint')
                continue

            # Drain this endpoint until it is exhausted or closed.
            valid = True
            closed = False
            while valid and not closed:
                try:
                    r = e.receive(False)
                    buffers[e].append(r)
                    if type(r) is StopWord:
                        current = buffers[e].current()
                        buffers[e].next()

                        # Only merge if all buffers have completed this
                        # partition.
                        merge = True
                        for o in buffers:
                            merge &= buffers[o].finished(current)

                        if merge:
                            for x in self._merge(buffers, current):
                                self._output.send(x)
                            self._output.send(StopWord())
                    # Acknowledge the record with the endpoint.
                    e.processed()
                except StreamClosedException:
                    closed = True
                except Empty:
                    valid = False

            # Re-check for termination once this endpoint is drained: the
            # join is done when every endpoint has been closed.
            done = True
            for o in buffers:
                done &= o.closed()
            self._queue.task_done()
        self._output.close()
        print('Join done. %d empty buffers.' % (self._empty))
class Rtree(DataSource):
    '''
    Data source backed by an R-tree index over a file of serialized
    geometries. Each record is an (oid, Geometry) pair; oids are dense
    integers in [0, index_size).
    '''

    def __init__(self, filename, name):
        '''
        filename: base name of the R-tree files ('.data' and '.data.idx'
                  suffixes are appended for the data and index files).
        name:     attribute name under which the geometry is exposed.
        '''
        # Name of the Rtree file
        self._filename = filename
        # Name of the geometry attribute
        self._name = name

        # Construct the schema for the R-tree data source. It consists of a
        # unique OID which was generated during index creation and the
        # geometry object with the specified attribute name.
        self._schema = Schema()
        self._schema.append(Attribute('oid', int, True))
        self._schema.append(Attribute(name, Geometry))

        # Construct the file name of the data and data index file.
        self._data_filename = self._filename + '.data'
        self._index_filename = self._filename + '.data.idx'

        # Open the data and data index files.
        # BUG FIX: open in binary mode -- the files hold raw bytes and
        # mmap requires a binary file object.
        self._data_file = open(self._data_filename, 'r+b')
        self._index_file = open(self._index_filename, 'r+b')

        # Determine the length of the data and the data index files.
        self._index_file.seek(0, os.SEEK_END)
        self._index_length = self._index_file.tell()
        self._index_file.seek(0)

        self._data_file.seek(0, os.SEEK_END)
        self._data_length = self._data_file.tell()
        self._data_file.seek(0)

        # The index file is an array of native unsigned longs, one per
        # object. BUG FIX: use floor division so the object count stays an
        # integer under true division.
        self._long_size = struct.calcsize('L')
        self._index_size = self._index_length // self._long_size

        # Memory-map data and data index.
        self._data = mmap.mmap(self._data_file.fileno(), 0)
        self._index = mmap.mmap(self._index_file.fileno(), 0)

        # Open the R-tree.
        self._tree = _Rtree(self._filename)

    def schema(self):
        '''Return the schema of this data source.'''
        return self._schema

    def _get_by_oid(self, oid):
        '''
        Return the (oid, Geometry) record stored at the given oid.

        Raises KeyError if oid is outside [0, index_size).
        '''
        if oid < 0 or oid >= self._index_size:
            raise KeyError('Object with ID [%d] does not exist.' % (oid))

        # Compute address of object pointer.
        a = oid * self._long_size
        # Compute address of following object pointer.
        b = a + 2 * self._long_size

        # Unpack pointer to the address in the data file.
        if b > self._index_length:
            # If the object pointer is the last one and thus there is no
            # following record, the length of the object is restricted to
            # the data file's size.
            first, = struct.unpack(
                'L', self._index[a:self._index_length]
            )
            second = self._data_length
        else:
            # Otherwise simply compute the object's size from the
            # difference between its address and the address of the
            # following object.
            first, second = struct.unpack('LL', self._index[a:b])

        return (oid, Geometry(self._data[first:second]))

    def __getitem__(self, key):
        return self._get_by_oid(key['oid'])

    def __iter__(self):
        # Iterating the source yields every stored record in oid order.
        return self._intersect_oid((0, self._index_size))

    def _intersect_geom(self, geom):
        '''
        Yield records whose geometry portion intersects with the given
        geometry.
        '''
        # Nothing can intersect an invalid or zero-area geometry.
        if not geom.geom().is_valid or geom.geom().area == 0.0:
            return

        # Pre-filter with the bounding box, then verify exact intersection.
        query = geom.geom().bounds
        for id, g in self._intersect_box(query):
            if g.geom().intersects(geom.geom()):
                yield (id, g)

    def _intersect_box(self, box):
        '''Yield records whose bounding box intersects the given box.'''
        for id in self._tree.intersection(box):
            yield self._get_by_oid(id)

    def _intersect_oid(self, r):
        '''Yield records with oids in the half-open range r = (low, high).'''
        # Trim the range to the size of the file. BUG FIX: the previous
        # 'cond and x or y' idiom returned the wrong bound whenever the
        # requested bound was 0 (0 is falsy).
        low = max(r[0], 0)
        high = min(r[1], self._index_size)
        for id in range(low, high):
            yield self._get_by_oid(id)

    def intersect(self, ranges):
        '''
        Dispatch an intersection query. Exactly one of the geometry
        attribute or 'oid' must be present in ranges:
          - geometry attribute -> Geometry (exact), tuple (bounding box),
            or None (empty result);
          - 'oid' -> (low, high) tuple of object ids.
        '''
        if self._name in ranges and 'oid' not in ranges:
            if isinstance(ranges[self._name], Geometry):
                return self._intersect_geom(ranges[self._name])
            elif type(ranges[self._name]) is tuple:
                return self._intersect_box(ranges[self._name])
            elif ranges[self._name] is None:
                return []
            else:
                raise Exception('Invalid argument to intersect() method.')
        elif 'oid' in ranges and self._name not in ranges:
            r = ranges['oid']
            if isinstance(r, tuple):
                return self._intersect_oid(r)
            else:
                raise Exception('Invalid argument to intersect() method.')
        else:
            raise Exception('Invalid argument to intersect() method.')

    def __del__(self):
        # Release the memory maps and close the underlying files.
        # BUG FIX: the file objects were previously leaked; also guard
        # against partially constructed instances.
        for attr in ('_data', '_index', '_data_file', '_index_file'):
            obj = getattr(self, attr, None)
            if obj is not None:
                obj.close()