def __init__(self, ctx, path, fields=None, taskMemory=None):
    """Assemble this RDD from one ``TabularFileRDD`` per file under ``path``.

    ``path`` is either a single string or an iterable of strings; each one
    is expanded through ``self._get_files``.  Every split of every per-file
    RDD is wrapped in a ``TabularSplit`` numbered consecutively from 0.

    ``fields`` is forwarded unchanged to each ``TabularFileRDD``.
    ``taskMemory``, when truthy, is stored as ``self.mem``.
    """
    RDD.__init__(self, ctx)
    if taskMemory:
        self.mem = taskMemory

    single_path = isinstance(path, six.string_types)
    files = (self._get_files(path) if single_path
             else chain(self._get_files(p) for p in path))
    rdds = [TabularFileRDD(ctx, f, fields) for f in files]

    # Lazy pairing keeps the original interleaving: an rdd's ``splits`` is
    # only read once all earlier splits have been wrapped.
    rdd_split_pairs = ((rdd, sp) for rdd in rdds for sp in rdd.splits)
    self._splits = [TabularSplit(index, rdd, sp)
                    for index, (rdd, sp) in enumerate(rdd_split_pairs)]

    self._dependencies = [OneToOneDependency(rdd) for rdd in rdds]

    # Only the first underlying RDD (if any) is spelled out in the name.
    first_rdd_text = ','.join(map(str, rdds[:1]))
    self.repr_name = '<%s %d %s...>' % (
        self.__class__.__name__, len(rdds), first_rdd_text)

    # Cache, per wrapped split, the locations its underlying RDD prefers.
    locations = self._preferred_locs = {}
    for wrapped in self._splits:
        locations[wrapped] = wrapped.rdd.preferredLocations(wrapped.split)
def compute(self, split):
    """Write the rows of ``split`` into one striped, column-oriented file.

    The output file is ``<self.path>/<split.index as %04d>.dt`` and is
    written through ``atomic_file``.  Layout, in write order:

    * zero or more stripes, each consisting of
        - a ``struct`` ``'I'`` length prefix for the header,
        - the compressed, marshalled header (a tuple with the compressed
          size of every column block),
        - one compressed, marshalled block per field,
        - zero padding up to ``STRIPE_SIZE`` (omitted for the final stripe);
    * a footer: the zlib-compressed pickled indices, the compressed
      marshalled field list, and a ``struct`` ``'II'`` pair holding
      ``(len(footer_fields), len(footer_indices))``.

    Yields:
        The path of the written file (a single item).

    Raises:
        RuntimeError: if one marshalled row exceeds ``STRIPE_DATA_SIZE``.
    """
    # One column buffer per field; rows are stored column-wise.
    buffers = [list() for _ in self.fields]
    # Budget left in the current stripe.  It starts as an estimate based on
    # *uncompressed* row sizes and is corrected from real compressed sizes
    # whenever the estimate runs out.
    remain_size = STRIPE_DATA_SIZE
    path = os.path.join(self.path, '%04d.dt' % split.index)
    indices = dict((i, AdaptiveIndex()) for i in self.indices)

    def write_stripe(f, compressed, header, padding=True):
        """Write one stripe: length prefix, header, column blocks, padding."""
        h = compress(marshal.dumps(header))
        assert len(h) < STRIPE_HEADER_SIZE
        f.write(struct.pack('I', len(h)))
        f.write(h)
        # The 4 accounts for the length prefix written above.
        padding_size = STRIPE_SIZE - len(h) - 4
        for c in compressed:
            f.write(c)
            padding_size -= len(c)

        if padding:
            # Bytes literal: ``f`` also receives struct.pack() output, so it
            # is a binary stream; a text literal would fail on Python 3.
            f.write(b'\0' * padding_size)

    with atomic_file(path) as f:
        stripe_id = 0
        for it in chain(self.prev.iterator(sp) for sp in split.splits):
            # Keep only as many leading values as there are fields.
            row = it[:len(self.fields)]
            size = len(marshal.dumps(tuple(row)))
            if size > STRIPE_DATA_SIZE:
                raise RuntimeError('Row too big')

            if size > remain_size:
                # The estimate says the stripe is full: compress what we
                # have and measure how much room is really left.
                compressed = [compress(marshal.dumps(tuple(b)))
                              for b in buffers]
                _sizes = tuple(map(len, compressed))
                _remain_size = STRIPE_DATA_SIZE - sum(_sizes)
                if size > _remain_size:
                    # Really full: flush the stripe and start a new one.
                    write_stripe(f, compressed, _sizes)
                    buffers = [list() for _ in self.fields]
                    remain_size = STRIPE_DATA_SIZE
                    stripe_id += 1
                else:
                    # Compression left room; continue in the same stripe.
                    remain_size = _remain_size

            remain_size -= size
            for i, value in enumerate(row):
                buffers[i].append(value)
                field = self.fields[i]
                if field in self.indices:
                    # Record which stripe holds this value of the field.
                    indices[field].add(value, stripe_id)

        if any(buffers):
            # Final, partially filled stripe is written without padding.
            compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
            _sizes = tuple(map(len, compressed))
            write_stripe(f, compressed, _sizes, False)

        footer_indices = zlib.compress(cPickle.dumps(indices, -1))
        footer_fields = compress(marshal.dumps(self.fields))
        f.write(footer_indices)
        f.write(footer_fields)
        # Trailer lets a reader locate both footer sections from the end.
        f.write(struct.pack('II', len(footer_fields), len(footer_indices)))

    yield path
def __init__(self, ctx, path, fields=None, taskMemory=None):
    """Create one ``TabularFileRDD`` per file under ``path`` and expose
    all of their splits through this RDD.

    ``path`` may be a single string (``basestring``, the Python 2 builtin)
    or an iterable of strings; each is expanded via ``self._get_files``.
    ``fields`` is passed straight to every ``TabularFileRDD``.
    ``taskMemory``, when truthy, is stored as ``self.mem``.
    """
    RDD.__init__(self, ctx)
    if taskMemory:
        self.mem = taskMemory

    if not isinstance(path, basestring):
        files = chain(self._get_files(p) for p in path)
    else:
        files = self._get_files(path)

    self.rdds = [TabularFileRDD(ctx, f, fields) for f in files]

    # Flatten: one TabularSplit per (underlying rdd, underlying split).
    wrapped_splits = []
    for rdd in self.rdds:
        for sp in rdd.splits:
            wrapped_splits.append(TabularSplit(rdd, sp))
    self._splits = wrapped_splits

    self.dependencies = [OneToOneDependency(rdd) for rdd in self.rdds]
def __init__(self, ctx, path, fields=None, taskMemory=None):
    """Create one ``TabularFileRDD`` per file under ``path`` and wrap each
    of their splits in a ``TabularSplit`` numbered consecutively from 0.

    ``path`` may be a single string (``basestring``, the Python 2 builtin)
    or an iterable of strings; each is expanded via ``self._get_files``.
    ``fields`` is passed straight to every ``TabularFileRDD``.
    ``taskMemory``, when truthy, is stored as ``self.mem``.
    """
    RDD.__init__(self, ctx)
    if taskMemory:
        self.mem = taskMemory

    if not isinstance(path, basestring):
        files = chain(self._get_files(p) for p in path)
    else:
        files = self._get_files(path)

    self.rdds = [TabularFileRDD(ctx, f, fields) for f in files]

    # The generator keeps evaluation lazy, so each rdd's ``splits`` is read
    # only after the previous rdd's splits have been wrapped.
    rdd_split_pairs = ((rdd, sp) for rdd in self.rdds for sp in rdd.splits)
    self._splits = [TabularSplit(index, rdd, sp)
                    for index, (rdd, sp) in enumerate(rdd_split_pairs)]

    self.dependencies = [OneToOneDependency(rdd) for rdd in self.rdds]
def __init__(self, ctx, path, fields=None, taskMemory=None):
    """Build this RDD on top of one ``TabularFileRDD`` per discovered file.

    ``path`` may be a single string (``basestring``, the Python 2 builtin)
    or an iterable of strings; each is expanded via ``self._get_files``.
    Splits of the per-file RDDs are wrapped in ``TabularSplit`` objects
    numbered consecutively from 0, and the preferred locations of every
    wrapped split are looked up once and stored in ``self._preferred_locs``.

    ``fields`` is passed straight to every ``TabularFileRDD``.
    ``taskMemory``, when truthy, is stored as ``self.mem``.
    """
    RDD.__init__(self, ctx)
    if taskMemory:
        self.mem = taskMemory

    if not isinstance(path, basestring):
        files = chain(self._get_files(p) for p in path)
    else:
        files = self._get_files(path)
    rdds = [TabularFileRDD(ctx, f, fields) for f in files]

    # Running counter gives each wrapped split a file-spanning index.
    self._splits = []
    next_index = 0
    for rdd in rdds:
        for sp in rdd.splits:
            self._splits.append(TabularSplit(next_index, rdd, sp))
            next_index += 1

    self._dependencies = [OneToOneDependency(rdd) for rdd in rdds]

    # Only the first underlying RDD (if any) appears in the display name.
    class_name = self.__class__.__name__
    first_rdd_text = ','.join(str(rdd) for rdd in rdds[:1])
    self.repr_name = '<%s %d %s...>' % (class_name, len(rdds), first_rdd_text)

    self._preferred_locs = {}
    for wrapped in self._splits:
        underlying_rdd = wrapped.rdd
        self._preferred_locs[wrapped] = underlying_rdd.preferredLocations(
            wrapped.split)
def filter(self, fun):
    """Select positions for every indexed key accepted by ``fun``.

    When ``self.index_type`` equals ``BITMAP_INDEX``, returns
    ``chain(...)`` over ``positions()`` of each entry in ``self.index``
    whose key makes ``fun(key)`` truthy.  For any other index type the
    method returns ``None``.
    """
    if self.index_type != BITMAP_INDEX:
        return None

    matching_positions = (entry.positions()
                          for key, entry in self.index.items()
                          if fun(key))
    return chain(matching_positions)