def __init__(self, rdd, filters):
    """Set up this RDD as a one-to-one child of ``rdd``.

    :param rdd: parent RDD; its ``ctx`` is handed to ``RDD.__init__`` and
        its ``mem`` is used as a lower bound for this RDD's ``mem``.
    :param filters: stored unchanged on ``self.filters``.
    """
    RDD.__init__(self, rdd.ctx)

    self.rdd = rdd
    self.filters = filters

    # Never request less memory than the parent RDD does.
    own_mem = self.mem
    parent_mem = rdd.mem
    self.mem = max(own_mem, parent_mem)

    parent_link = OneToOneDependency(rdd)
    self.dependencies = [parent_link]

    # Computed last, after the attributes above have been assigned.
    self._splits = self._get_splits()
def __init__(self, ctx, path, fields=None, taskMemory=None):
    """Build an RDD over one path or an iterable of paths.

    Every file produced by ``self._get_files`` becomes one
    ``TabularFileRDD``; each split of those RDDs is wrapped in a
    ``TabularSplit`` that is numbered consecutively across all files.

    :param ctx: context passed to ``RDD.__init__`` and to every
        ``TabularFileRDD``.
    :param path: a single path string, or an iterable of path strings.
    :param fields: passed through unchanged to ``TabularFileRDD``.
    :param taskMemory: when truthy, replaces ``self.mem``.
    """
    RDD.__init__(self, ctx)
    if taskMemory:
        self.mem = taskMemory

    if isinstance(path, six.string_types):
        files = self._get_files(path)
    else:
        # Flatten the per-path results so this branch yields individual
        # files, exactly like the single-path branch above. A bare
        # ``chain(<generator>)`` would instead yield each ``_get_files``
        # result as one item.
        files = chain.from_iterable(self._get_files(p) for p in path)

    rdds = [TabularFileRDD(ctx, f, fields) for f in files]

    # Split indices run continuously across all per-file RDDs.
    rdd_split_pairs = ((rdd, sp) for rdd in rdds for sp in rdd.splits)
    self._splits = [
        TabularSplit(index, rdd, sp)
        for index, (rdd, sp) in enumerate(rdd_split_pairs)
    ]

    self._dependencies = [OneToOneDependency(rdd) for rdd in rdds]

    # Only the first underlying RDD is spelled out in the repr.
    self.repr_name = '<%s %d %s...>' % (
        self.__class__.__name__,
        len(rdds),
        ','.join(str(rdd) for rdd in rdds[:1]),
    )

    # Each wrapper split asks the RDD it came from, using the inner split.
    self._preferred_locs = {
        split: split.rdd.preferredLocations(split.split)
        for split in self._splits
    }
def __init__(self, rdd, filters):
    """Set up this RDD as a one-to-one child of ``rdd``.

    :param rdd: parent RDD; supplies ``ctx``, a lower bound for ``mem``,
        and the preferred locations recorded for every split.
    :param filters: stored unchanged on ``self.filters``.
    """
    RDD.__init__(self, rdd.ctx)

    self.rdd = rdd
    self.filters = filters

    # Never request less memory than the parent RDD does.
    own_mem = self.mem
    parent_mem = rdd.mem
    self.mem = max(own_mem, parent_mem)

    parent_link = OneToOneDependency(rdd)
    self._dependencies = [parent_link]
    self._splits = self._get_splits()

    class_name = self.__class__.__name__
    self.repr_name = '<%s %s>' % (class_name, rdd)

    # Record the parent's preferred locations for each of our splits.
    self._preferred_locs = {
        part: rdd.preferredLocations(part) for part in self._splits
    }
def __init__(self, ctx, path, fields=None, taskMemory=None):
    """Build an RDD over one path or an iterable of paths.

    Every file produced by ``self._get_files`` becomes one
    ``TabularFileRDD`` (kept on ``self.rdds``); each split of those RDDs is
    wrapped in a ``TabularSplit`` numbered consecutively across all files.

    :param ctx: context passed to ``RDD.__init__`` and to every
        ``TabularFileRDD``.
    :param path: a single path string, or an iterable of path strings.
    :param fields: passed through unchanged to ``TabularFileRDD``.
    :param taskMemory: when truthy, replaces ``self.mem``.
    """
    RDD.__init__(self, ctx)
    if taskMemory:
        self.mem = taskMemory

    # ``basestring`` is not a builtin on Python 3; fall back to ``str``
    # there, and keep using ``basestring`` wherever it is defined.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if isinstance(path, string_types):
        files = self._get_files(path)
    else:
        # Flatten the per-path results so this branch yields individual
        # files, exactly like the single-path branch above. A bare
        # ``chain(<generator>)`` would instead yield each ``_get_files``
        # result as one item.
        files = chain.from_iterable(self._get_files(p) for p in path)

    self.rdds = [TabularFileRDD(ctx, f, fields) for f in files]

    # Split indices run continuously across all per-file RDDs.
    rdd_split_pairs = ((rdd, sp) for rdd in self.rdds for sp in rdd.splits)
    self._splits = [
        TabularSplit(index, rdd, sp)
        for index, (rdd, sp) in enumerate(rdd_split_pairs)
    ]

    self.dependencies = [OneToOneDependency(rdd) for rdd in self.rdds]