예제 #1
0
파일: tabular.py 프로젝트: zofuthan/dpark
 def __init__(self, rdd, filters):
     """Set up an RDD that wraps ``rdd`` together with ``filters``.

     The base ``RDD`` is initialised with the parent's context, the parent
     and the filters are stored on the instance, and the splits are taken
     from ``self._get_splits()``.

     Args:
         rdd: parent RDD; its ``ctx`` and ``mem`` attributes are read here.
         filters: stored as ``self.filters`` without modification.
     """
     RDD.__init__(self, rdd.ctx)
     self.rdd, self.filters = rdd, filters

     # Keep whichever memory figure is larger: our own or the parent's.
     required_mem = max(self.mem, rdd.mem)
     self.mem = required_mem

     # Single one-to-one dependency on the wrapped RDD.
     parent_dependency = OneToOneDependency(rdd)
     self.dependencies = [parent_dependency]

     # Computed last, after self.rdd and self.filters have been stored.
     self._splits = self._get_splits()
예제 #2
0
파일: tabular.py 프로젝트: zouzias/dpark
    def __init__(self, ctx, path, fields=None, taskMemory=None):
        """Build an RDD over every tabular file found under ``path``.

        One ``TabularFileRDD`` is created per file; their splits are wrapped
        in consecutively numbered ``TabularSplit`` objects.

        Args:
            ctx: context handed to ``RDD.__init__`` and to each
                ``TabularFileRDD``.
            path: a single path string, or an iterable of path strings.
                Each path is expanded with ``self._get_files``.
            fields: forwarded unchanged to each ``TabularFileRDD``.
            taskMemory: when truthy, replaces ``self.mem``.
        """
        RDD.__init__(self, ctx)
        if taskMemory:
            self.mem = taskMemory

        if isinstance(path, six.string_types):
            files = self._get_files(path)
        else:
            # chain(<generator>) would iterate the generator itself and yield
            # each per-path result as a whole; from_iterable flattens those
            # results into one stream of files, matching the single-path case.
            # NOTE(review): assumes `chain` is itertools.chain -- confirm.
            files = chain.from_iterable(self._get_files(p) for p in path)

        rdds = [TabularFileRDD(ctx, f, fields) for f in files]

        # Number the splits consecutively across all per-file RDDs.
        pairs = ((rdd, sp) for rdd in rdds for sp in rdd.splits)
        self._splits = [TabularSplit(i, rdd, sp)
                        for i, (rdd, sp) in enumerate(pairs)]

        self._dependencies = [OneToOneDependency(rdd) for rdd in rdds]
        # Only the first underlying RDD is shown in the repr.
        self.repr_name = '<%s %d %s...>' % (self.__class__.__name__, len(rdds),
                                            ','.join(
                                                str(rdd) for rdd in rdds[:1]))
        # Each wrapped split uses the locations its own file RDD reports.
        self._preferred_locs = {}
        for split in self._splits:
            self._preferred_locs[split] = split.rdd.preferredLocations(
                split.split)
예제 #3
0
 def __init__(self, rdd, filters):
     """Set up an RDD that wraps ``rdd`` together with ``filters``.

     Besides storing the parent and the filters, this records a one-to-one
     dependency on the parent, takes the splits from ``self._get_splits()``,
     builds ``repr_name`` and stores the parent's preferred locations for
     each split.

     Args:
         rdd: parent RDD; its ``ctx``, ``mem`` and ``preferredLocations``
             are used here.
         filters: stored as ``self.filters`` without modification.
     """
     RDD.__init__(self, rdd.ctx)
     self.rdd, self.filters = rdd, filters

     # Keep whichever memory figure is larger: our own or the parent's.
     required_mem = max(self.mem, rdd.mem)
     self.mem = required_mem

     parent_dependency = OneToOneDependency(rdd)
     self._dependencies = [parent_dependency]
     self._splits = self._get_splits()

     class_name = self.__class__.__name__
     self.repr_name = '<%s %s>' % (class_name, rdd)

     # Ask the parent where each of our splits would prefer to run.
     self._preferred_locs = {
         part: rdd.preferredLocations(part) for part in self._splits
     }
예제 #4
0
파일: tabular.py 프로젝트: zofuthan/dpark
    def __init__(self, ctx, path, fields=None, taskMemory=None):
        """Build an RDD over every tabular file found under ``path``.

        One ``TabularFileRDD`` is created per file (kept in ``self.rdds``);
        their splits are wrapped in consecutively numbered ``TabularSplit``
        objects.

        Args:
            ctx: context handed to ``RDD.__init__`` and to each
                ``TabularFileRDD``.
            path: a single path string, or an iterable of path strings.
                Each path is expanded with ``self._get_files``.
            fields: forwarded unchanged to each ``TabularFileRDD``.
            taskMemory: when truthy, replaces ``self.mem``.
        """
        RDD.__init__(self, ctx)
        if taskMemory:
            self.mem = taskMemory

        if isinstance(path, basestring):
            files = self._get_files(path)
        else:
            # chain(<generator>) would iterate the generator itself and yield
            # each per-path result as a whole; from_iterable flattens those
            # results into one stream of files, matching the single-path case.
            # NOTE(review): assumes `chain` is itertools.chain -- confirm.
            files = chain.from_iterable(self._get_files(p) for p in path)

        self.rdds = [TabularFileRDD(ctx, f, fields) for f in files]

        # Number the splits consecutively across all per-file RDDs.
        pairs = ((rdd, sp) for rdd in self.rdds for sp in rdd.splits)
        self._splits = [TabularSplit(i, rdd, sp)
                        for i, (rdd, sp) in enumerate(pairs)]

        self.dependencies = [OneToOneDependency(rdd) for rdd in self.rdds]