def __init__(self, rdd, path, field_names, indices=None, numSplits=None):
    """Initialise on top of `rdd`, creating a fresh directory at `path`.

    Args:
        rdd: Parent RDD. Its context, memory setting and splits are reused.
        path: Directory to create; it must not exist yet.
        field_names: Field names, given either as a sequence or as a single
            string separated by commas and/or whitespace. Must be unique.
        indices: Optional index field names, in the same formats as
            `field_names`. Each one has to be among the field names.
        numSplits: Optional split count. It is capped at the number of
            parent splits, which is also the default.

    Raises:
        RuntimeError: If `path` already exists.
        ValueError: If a field name is duplicated, or an index field is
            not in the field list.
    """
    RDD.__init__(self, rdd.ctx)
    self.prev = rdd
    self.mem = rdd.mem + 600
    self.path = path

    # Refuse to reuse an existing location; otherwise create it right away.
    if os.path.exists(path):
        raise RuntimeError('path already exists: %s' % path)
    os.makedirs(path)

    # Accept "a,b c" style strings as well as sequences of names.
    if isinstance(field_names, basestring):
        field_names = field_names.replace(',', ' ').split()
    if len(field_names) != len(set(field_names)):
        raise ValueError('duplicated field names')
    self.fields = map(str, field_names)

    if isinstance(indices, types.StringTypes):
        indices = indices.replace(',', ' ').split()
    self.indices = set()
    for name in indices or ():
        name = str(name)
        if name not in self.fields:
            raise ValueError('index field %s not in field list' % name)
        self.indices.add(name)

    # Group the parent's splits into at most `numSplits` contiguous ranges.
    parent_count = len(rdd)
    numSplits = min(numSplits or parent_count, parent_count)
    self.numSplits = min(numSplits, parent_count)
    bounds = [int(round(1.0 * parent_count / numSplits * k))
              for k in xrange(numSplits + 1)]
    self._splits = [MultiSplit(k, rdd.splits[lo:hi])
                    for k, (lo, hi) in enumerate(zip(bounds, bounds[1:]))]
    self.dependencies = [
        OneToRangeDependency(rdd, int(parent_count / numSplits), parent_count)
    ]
def __init__(self, rdd, filters):
    """Store `rdd` and `filters`, then derive memory, dependency and splits.

    Args:
        rdd: Parent RDD; provides the context and the one-to-one dependency.
        filters: Kept unchanged on ``self.filters``.
    """
    parent = rdd
    RDD.__init__(self, parent.ctx)
    self.rdd, self.filters = parent, filters
    # Use whichever memory setting is larger: our own or the parent's.
    self.mem = max(self.mem, parent.mem)
    self.dependencies = [OneToOneDependency(parent)]
    # Computed last, once self.rdd and self.filters are in place.
    self._splits = self._get_splits()
def __init__(self, ctx, path, fields=None, taskMemory=None):
    """Build the RDD from one ``TabularFileRDD`` per file found under `path`.

    Args:
        ctx: Context handed to ``RDD.__init__`` and to every per-file RDD.
        path: Either a single path string or an iterable of path strings.
            Each path is expanded through ``self._get_files``.
        fields: Forwarded unchanged to every ``TabularFileRDD``.
        taskMemory: When truthy, replaces ``self.mem``.
    """
    RDD.__init__(self, ctx)
    if taskMemory:
        self.mem = taskMemory

    if isinstance(path, six.string_types):
        files = self._get_files(path)
    else:
        # Flatten the per-path results into one stream of files. A plain
        # ``chain(<generator>)`` would yield each ``_get_files`` result as
        # a single item instead of the files it contains.
        files = chain.from_iterable(self._get_files(p) for p in path)

    rdds = [TabularFileRDD(ctx, f, fields) for f in files]

    # Number the splits consecutively across all per-file RDDs.
    pairs = ((rdd, sp) for rdd in rdds for sp in rdd.splits)
    self._splits = [TabularSplit(i, rdd, sp)
                    for i, (rdd, sp) in enumerate(pairs)]
    self._dependencies = [OneToOneDependency(rdd) for rdd in rdds]

    # Only the first per-file RDD is spelled out in the name.
    self.repr_name = '<%s %d %s...>' % (
        self.__class__.__name__,
        len(rdds),
        ','.join(str(rdd) for rdd in rdds[:1]),
    )

    # Each split's preferred locations come from the RDD that owns it.
    self._preferred_locs = {
        split: split.rdd.preferredLocations(split.split)
        for split in self._splits
    }
def __init__(self, rdd, filters):
    """Store `rdd` and `filters`, then set up the derived attributes.

    Args:
        rdd: Parent RDD; provides the context, the one-to-one dependency
            and the preferred locations of every split.
        filters: Kept unchanged on ``self.filters``.
    """
    parent = rdd
    RDD.__init__(self, parent.ctx)
    self.rdd, self.filters = parent, filters
    # Use whichever memory setting is larger: our own or the parent's.
    self.mem = max(self.mem, parent.mem)
    self._dependencies = [OneToOneDependency(parent)]
    # Computed once self.rdd and self.filters are in place.
    self._splits = self._get_splits()
    self.repr_name = '<%s %s>' % (self.__class__.__name__, parent)

    # Preferred locations are looked up on the parent, split by split.
    locations = self._preferred_locs = {}
    for split in self._splits:
        locations[split] = parent.preferredLocations(split)
def __init__(self, ctx, path, fields=None, taskMemory=None):
    """Build the RDD from one ``TabularFileRDD`` per file found under `path`.

    Args:
        ctx: Context handed to ``RDD.__init__`` and to every per-file RDD.
        path: Either a single path string or an iterable of path strings.
            Each path is expanded through ``self._get_files``.
        fields: Forwarded unchanged to every ``TabularFileRDD``.
        taskMemory: When truthy, replaces ``self.mem``.
    """
    RDD.__init__(self, ctx)
    if taskMemory:
        self.mem = taskMemory

    if isinstance(path, basestring):
        files = self._get_files(path)
    else:
        # Flatten the per-path results into one stream of files. A plain
        # ``chain(<generator>)`` would yield each ``_get_files`` result as
        # a single item instead of the files it contains.
        files = chain.from_iterable(self._get_files(p) for p in path)

    self.rdds = [TabularFileRDD(ctx, f, fields) for f in files]
    # One TabularSplit per (per-file RDD, split of that RDD) pair.
    self._splits = [TabularSplit(rdd, sp)
                    for rdd in self.rdds
                    for sp in rdd.splits]
    self.dependencies = [OneToOneDependency(rdd) for rdd in self.rdds]
def __init__(self, ctx, path, fields=None, taskMemory=None):
    """Build the RDD from one ``TabularFileRDD`` per file found under `path`.

    Args:
        ctx: Context handed to ``RDD.__init__`` and to every per-file RDD.
        path: Either a single path string or an iterable of path strings.
            Each path is expanded through ``self._get_files``.
        fields: Forwarded unchanged to every ``TabularFileRDD``.
        taskMemory: When truthy, replaces ``self.mem``.
    """
    RDD.__init__(self, ctx)
    if taskMemory:
        self.mem = taskMemory

    if isinstance(path, basestring):
        files = self._get_files(path)
    else:
        # Flatten the per-path results into one stream of files. A plain
        # ``chain(<generator>)`` would yield each ``_get_files`` result as
        # a single item instead of the files it contains.
        files = chain.from_iterable(self._get_files(p) for p in path)

    self.rdds = [TabularFileRDD(ctx, f, fields) for f in files]

    # Number the splits consecutively across all per-file RDDs.
    pairs = ((rdd, sp) for rdd in self.rdds for sp in rdd.splits)
    self._splits = [TabularSplit(i, rdd, sp)
                    for i, (rdd, sp) in enumerate(pairs)]
    self.dependencies = [OneToOneDependency(rdd) for rdd in self.rdds]
def __init__(self, rdd, path, field_names, indices=None, numSplits=None):
    """Initialise on top of `rdd`, creating a fresh directory at `path`.

    Args:
        rdd: Parent RDD. Its context, memory setting and splits are reused.
        path: Directory to create; it must not exist yet.
        field_names: Field names, given either as a sequence or as a single
            string separated by commas and/or whitespace. Must be unique.
        indices: Optional index field names, in the same formats as
            `field_names`. Each one has to be among the field names.
        numSplits: Optional split count. It is capped at the number of
            parent splits, which is also the default.

    Raises:
        RuntimeError: If `path` already exists.
        ValueError: If a field name is duplicated, or an index field is
            not in the field list.
    """
    RDD.__init__(self, rdd.ctx)
    self.prev = rdd
    self.mem = rdd.mem + 600
    self.path = path

    # Refuse to reuse an existing location; otherwise create it right away.
    if os.path.exists(path):
        raise RuntimeError('path already exists: %s' % path)
    os.makedirs(path)

    # Accept "a,b c" style strings as well as sequences of names.
    if isinstance(field_names, six.string_types):
        field_names = field_names.replace(',', ' ').split()
    if len(set(field_names)) != len(field_names):
        raise ValueError('duplicated field names')
    self.fields = [str(name) for name in field_names]

    # Same string test as for field_names, so every string type that six
    # recognises (including unicode on Python 2) is split into names
    # rather than being iterated character by character.
    if isinstance(indices, six.string_types):
        indices = indices.replace(',', ' ').split()
    self.indices = set()
    for name in indices or ():
        name = str(name)
        if name not in self.fields:
            raise ValueError('index field %s not in field list' % name)
        self.indices.add(name)

    # Group the parent's splits into at most `numSplits` contiguous ranges.
    # NOTE(review): a parent with zero splits gives numSplits == 0 and the
    # divisions below raise ZeroDivisionError - confirm whether empty
    # parents need to be supported.
    prev_splits = len(rdd)
    numSplits = min(numSplits or prev_splits, prev_splits)
    self.numSplits = numSplits
    bounds = [
        int(round(1.0 * prev_splits / numSplits * i))
        for i in range(numSplits + 1)
    ]
    self._splits = [
        MultiSplit(i, rdd.splits[lo:hi])
        for i, (lo, hi) in enumerate(zip(bounds, bounds[1:]))
    ]
    self._dependencies = [
        OneToRangeDependency(rdd, int(prev_splits / numSplits), prev_splits)
    ]
    self.repr_name = '<OutputTabularRDD %s %s>' % (path, rdd)
def __init__(self, ctx, path, fields=None, taskMemory=None):
    """Build the RDD from one ``TabularFileRDD`` per file found under `path`.

    Args:
        ctx: Context handed to ``RDD.__init__`` and to every per-file RDD.
        path: Either a single path string or an iterable of path strings.
            Each path is expanded through ``self._get_files``.
        fields: Forwarded unchanged to every ``TabularFileRDD``.
        taskMemory: When truthy, replaces ``self.mem``.
    """
    RDD.__init__(self, ctx)
    if taskMemory:
        self.mem = taskMemory

    if isinstance(path, basestring):
        files = self._get_files(path)
    else:
        # Flatten the per-path results into one stream of files. A plain
        # ``chain(<generator>)`` would yield each ``_get_files`` result as
        # a single item instead of the files it contains.
        files = chain.from_iterable(self._get_files(p) for p in path)

    rdds = [TabularFileRDD(ctx, f, fields) for f in files]

    # Number the splits consecutively across all per-file RDDs.
    pairs = ((rdd, sp) for rdd in rdds for sp in rdd.splits)
    self._splits = [TabularSplit(i, rdd, sp)
                    for i, (rdd, sp) in enumerate(pairs)]
    self._dependencies = [OneToOneDependency(rdd) for rdd in rdds]

    # Only the first per-file RDD is spelled out in the name.
    self.repr_name = '<%s %d %s...>' % (
        self.__class__.__name__,
        len(rdds),
        ','.join(str(rdd) for rdd in rdds[:1]),
    )

    # Each split's preferred locations come from the RDD that owns it.
    self._preferred_locs = {
        split: split.rdd.preferredLocations(split.split)
        for split in self._splits
    }
def __getstate__(self):
    """Return the pickling state as a ``(state, serialized_filters)`` pair.

    The base-class state is taken first and its ``'filters'`` entry removed;
    ``self.filters`` is passed through ``dumps`` and returned alongside.
    """
    state = RDD.__getstate__(self)
    # Raises KeyError if the base state has no 'filters' entry.
    state.pop('filters')
    serialized_filters = dumps(self.filters)
    return state, serialized_filters
def _clear_dependencies(self):
    """Run the base-class cleanup, then drop the reference in ``self.rdd``."""
    RDD._clear_dependencies(self)
    # Release our own handle on the parent as well.
    self.rdd = None