Example #1
    def parallelize(self, c, numSlices=None):
        """
        Distribute a local Python collection to form an RDD.

        >>> sc.parallelize(range(5), 5).glom().collect()
        [[0], [1], [2], [3], [4]]
        """
        numSlices = numSlices or self.defaultParallelism
        # Calling the Java parallelize() method with an ArrayList is too slow,
        # because it sends O(n) Py4J commands.  As an alternative, serialized
        # objects are written to a file and loaded through textFile().
        tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
        # Make sure we distribute data evenly if it's smaller than self.batchSize
        if "__len__" not in dir(c):
            c = list(c)    # Make it a list so we can compute its length
        batchSize = min(len(c) // numSlices, self._batchSize)
        if batchSize > 1:
            serializer = BatchedSerializer(self._unbatched_serializer,
                                           batchSize)
        else:
            serializer = self._unbatched_serializer
        serializer.dump_stream(c, tempFile)
        tempFile.close()
        readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile
        jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices)
        return RDD(jrdd, self, serializer)
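
The snippet above funnels the whole collection through a BatchedSerializer before it reaches the JVM. As a minimal standalone sketch (not part of the snippet above; it assumes a PySpark installation where pyspark.serializers still exposes PickleSerializer, which newer releases rename to CPickleSerializer), this is the dump_stream/load_stream round trip performed on the temp file:

# Group a local collection into pickled batches and read it back flat.
from io import BytesIO
from pyspark.serializers import BatchedSerializer, PickleSerializer

data = list(range(10))
ser = BatchedSerializer(PickleSerializer(), 3)   # batches of up to 3 objects

buf = BytesIO()                  # stands in for the temp file used above
ser.dump_stream(data, buf)       # what parallelize() writes into tempFile
buf.seek(0)
assert list(ser.load_stream(buf)) == data   # load_stream flattens the batches again
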
Example #2
 def __init__(self, vertex_jrdd, edge_jrdd,
              partition_strategy=PartitionStrategy.EdgePartition1D):
     self._vertex_jrdd = VertexRDD(vertex_jrdd, vertex_jrdd.context,
                                   BatchedSerializer(PickleSerializer()))
     self._edge_jrdd = EdgeRDD(edge_jrdd, edge_jrdd.context,
                               BatchedSerializer(PickleSerializer()))
     self._partition_strategy = partition_strategy
     self._jsc = vertex_jrdd.context
Example #3
 def _open_file(self):
     dirs = _get_local_dirs("objects")
     d = dirs[id(self) % len(dirs)]
     if not os.path.exists(d):
         os.makedirs(d)
     p = os.path.join(d, str(id(self)))
     self._file = open(p, "w+b", 65536)
     self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024)
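     # The path is unlinked right below; the open file object keeps the data
     # readable and writable, and the OS reclaims the space once the file is closed.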
     os.unlink(p)
Example #4
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        ser = ArrowStreamPandasSerializer(timezone)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)
    udfs = {}
    call_udf = []
    mapper_str = ""
    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # Create function like this:
        #   lambda a: f([a[0]], [a[0], a[1]])

        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, udf = read_single_udf(
            pickleSer, infile, eval_type, runner_conf, udf_index=0)
        udfs['f'] = udf
        split_offset = arg_offsets[0] + 1
        arg0 = ["a[%d]" % o for o in arg_offsets[1: split_offset]]
        arg1 = ["a[%d]" % o for o in arg_offsets[split_offset:]]
        mapper_str = "lambda a: f([%s], [%s])" % (", ".join(arg0), ", ".join(arg1))
    else:
        # Create function like this:
        #   lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3]))
        # In the special case of a single UDF this will return a single result rather
        # than a tuple of results; this is the format that the JVM side expects.
        for i in range(num_udfs):
            arg_offsets, udf = read_single_udf(
                pickleSer, infile, eval_type, runner_conf, udf_index=i)
            udfs['f%d' % i] = udf
            args = ["a[%d]" % o for o in arg_offsets]
            call_udf.append("f%d(%s)" % (i, ", ".join(args)))
        mapper_str = "lambda a: (%s)" % (", ".join(call_udf))

    mapper = eval(mapper_str, udfs)
    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
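
The mapper that read_udfs() returns is just a generated lambda string that routes row columns to each UDF by its argument offsets and is then compiled with eval. A small pure-Python sketch of that composition step (the UDFs and offsets below are made up for illustration, standing in for what read_single_udf() would return):

# Hypothetical UDFs and argument offsets, illustrating the generated-mapper idea.
udfs = {'f0': lambda x: x + 1, 'f1': lambda x, y: x * y}
arg_offsets = {0: [0], 1: [1, 2]}

call_udf = []
for i in range(2):
    args = ", ".join("a[%d]" % o for o in arg_offsets[i])
    call_udf.append("f%d(%s)" % (i, args))
mapper_str = "lambda a: (%s)" % ", ".join(call_udf)
# mapper_str == "lambda a: (f0(a[0]), f1(a[1], a[2]))"
mapper = eval(mapper_str, udfs)

row = (10, 3, 4)
assert mapper(row) == (11, 12)   # f0(10) == 11, f1(3, 4) == 12
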
Example #5
    def parallelize(self, c, numSlices=None):
        """
        Distribute a local Python collection to form an RDD. Using xrange
        is recommended if the input represents a range for performance.

        >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
        [[0], [2], [3], [4], [6]]
        >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
        [[], [0], [], [2], [4]]
        """
        numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism
        if isinstance(c, xrange):
            size = len(c)
            if size == 0:
                return self.parallelize([], numSlices)
            step = c[1] - c[0] if size > 1 else 1
            start0 = c[0]

            def getStart(split):
                return start0 + int((split * size / numSlices)) * step

            def f(split, iterator):
                return xrange(getStart(split), getStart(split + 1), step)

            return self.parallelize([], numSlices).mapPartitionsWithIndex(f)

        # Make sure we distribute data evenly if it's smaller than self.batchSize
        if "__len__" not in dir(c):
            c = list(c)    # Make it a list so we can compute its length
        batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024))
        serializer = BatchedSerializer(self._unbatched_serializer, batchSize)
        jrdd = self._serialize_to_jvm(c, numSlices, serializer)
        return RDD(jrdd, self, serializer)
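
In the xrange branch above no elements are shipped at all: an empty RDD is parallelized and each partition rebuilds its own sub-range from the size, step and partition index. A standalone check of that arithmetic (plain Python 3, with range standing in for xrange) against the doctest output in the docstring:

# Recompute the per-partition ranges that parallelize(xrange(0, 6, 2), 5) produces.
c = range(0, 6, 2)
numSlices = 5
size, step, start0 = len(c), c[1] - c[0], c[0]

def getStart(split):
    return start0 + int(split * size / numSlices) * step

parts = [list(range(getStart(s), getStart(s + 1), step)) for s in range(numSlices)]
assert parts == [[], [0], [], [2], [4]]   # matches the doctest above
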
Example #6
 def test_zip_with_different_serializers(self):
     a = self.sc.parallelize(range(5))
     b = self.sc.parallelize(range(100, 105))
     self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
     a = a._reserialize(BatchedSerializer(PickleSerializer(), 2))
     b = b._reserialize(MarshalSerializer())
     self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
Example #7
    def newAPIHadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConverter=None,
                         valueConverter=None, conf=None, batchSize=None):
        """
        Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS,
        a local file system (available on all nodes), or any Hadoop-supported file system URI.
        The mechanism is the same as for sc.sequenceFile.

        A Hadoop configuration can be passed in as a Python dict. This will be converted into a
        Configuration in Java.

        @param path: path to Hadoop file
        @param inputFormatClass: fully qualified classname of Hadoop InputFormat
               (e.g. "org.apache.hadoop.mapreduce.lib.input.TextInputFormat")
        @param keyClass: fully qualified classname of key Writable class
               (e.g. "org.apache.hadoop.io.Text")
        @param valueClass: fully qualified classname of value Writable class
               (e.g. "org.apache.hadoop.io.LongWritable")
        @param keyConverter: (None by default)
        @param valueConverter: (None by default)
        @param conf: Hadoop configuration, passed in as a dict
               (None by default)
        @param batchSize: The number of Python objects represented as a single
               Java object. (default sc._default_batch_size_for_serialized_input)
        """
        jconf = self._dictToJavaMap(conf)
        batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input)
        ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer()
        jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, path, inputFormatClass, keyClass,
                                                    valueClass, keyConverter, valueConverter,
                                                    jconf, batchSize)
        return RDD(jrdd, self, ser)
Example #8
    def hadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=None,
                  valueConverter=None, conf=None, batchSize=None):
        """
        Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
        Hadoop configuration, which is passed in as a Python dict.
        This will be converted into a Configuration in Java.
        The mechanism is the same as for sc.sequenceFile.

        @param inputFormatClass: fully qualified classname of Hadoop InputFormat
               (e.g. "org.apache.hadoop.mapred.TextInputFormat")
        @param keyClass: fully qualified classname of key Writable class
               (e.g. "org.apache.hadoop.io.Text")
        @param valueClass: fully qualified classname of value Writable class
               (e.g. "org.apache.hadoop.io.LongWritable")
        @param keyConverter: (None by default)
        @param valueConverter: (None by default)
        @param conf: Hadoop configuration, passed in as a dict
               (None by default)
        @param batchSize: The number of Python objects represented as a single
               Java object. (default sc._default_batch_size_for_serialized_input)
        """
        jconf = self._dictToJavaMap(conf)
        batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input)
        ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer()
        jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputFormatClass, keyClass,
                                             valueClass, keyConverter, valueConverter,
                                             jconf, batchSize)
        return RDD(jrdd, self, ser)
Example #9
 def __init__(self,
              aggregator,
              memory_limit=512,
              serializer=None,
              localdirs=None,
              scale=1,
              partitions=59,
              batch=1000):
     Merger.__init__(self, aggregator)
     self.memory_limit = memory_limit
     # default serializer is only used for tests
     self.serializer = serializer or \
         BatchedSerializer(PickleSerializer(), 1024)
     self.localdirs = localdirs or _get_local_dirs(str(id(self)))
     # number of partitions used when spilling data to disk
     self.partitions = partitions
     # check memory usage after this many items have been merged
     self.batch = batch
     # scale is used to scale down the hash of key for recursive hash map
     self.scale = scale
     # unpartitioned merged data
     self.data = {}
     # partitioned merged data, list of dicts
     self.pdata = []
     # number of chunks dumped into disks
     self.spills = 0
     # randomize the hash of key, id(o) is the address of o (aligned by 8)
     self._seed = id(self) + 7
Example #10
    def sequenceFile(self, path, keyClass=None, valueClass=None, keyConverter=None,
                     valueConverter=None, minSplits=None, batchSize=None):
        """
        Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS,
        a local file system (available on all nodes), or any Hadoop-supported file system URI.
        The mechanism is as follows:
            1. A Java RDD is created from the SequenceFile or other InputFormat, and the key
               and value Writable classes
            2. Serialization is attempted via Pyrolite pickling
            3. If this fails, the fallback is to call 'toString' on each key and value
            4. C{PickleSerializer} is used to deserialize pickled objects on the Python side

        @param path: path to sequence file
        @param keyClass: fully qualified classname of key Writable class
               (e.g. "org.apache.hadoop.io.Text")
        @param valueClass: fully qualified classname of value Writable class
               (e.g. "org.apache.hadoop.io.LongWritable")
        @param keyConverter:
        @param valueConverter:
        @param minSplits: minimum splits in dataset
               (default min(2, sc.defaultParallelism))
        @param batchSize: The number of Python objects represented as a single
               Java object. (default sc._default_batch_size_for_serialized_input)
        """
        minSplits = minSplits or min(self.defaultParallelism, 2)
        batchSize = max(1, batchSize or self._default_batch_size_for_serialized_input)
        ser = BatchedSerializer(PickleSerializer()) if (batchSize > 1) else PickleSerializer()
        jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, path, keyClass, valueClass,
                                                keyConverter, valueConverter, minSplits, batchSize)
        return RDD(jrdd, self, ser)
Example #11
    def predict(self, x):
        """
        Predict the label of one or more examples.

        :param x:  Data point (feature vector),
                   or an RDD of data points (feature vectors).
        """
        SerDe = self._sc._jvm.SerDe
        ser = PickleSerializer()
        if isinstance(x, RDD):
            # Bulk prediction
            first = x.take(1)
            if not first:
                return self._sc.parallelize([])
            if not isinstance(first[0], Vector):
                x = x.map(_convert_to_vector)
            jPred = self._java_model.predict(x._to_java_object_rdd()).toJavaRDD()
            jpyrdd = self._sc._jvm.PythonRDD.javaToPython(jPred)
            return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))

        else:
            # Assume x is a single data point.
            bytes = bytearray(ser.dumps(_convert_to_vector(x)))
            vec = self._sc._jvm.SerDe.loads(bytes)
            return self._java_model.predict(vec)
Example #12
 def rdd(self):
     """Returns the content as an :class:`pyspark.RDD` of :class:`Row`.
     """
     if self._lazy_rdd is None:
         jrdd = self._jdf.javaToPython()
         self._lazy_rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer()))
     return self._lazy_rdd
Example #13
def read_udfs(pickleSer, infile, eval_type):
    num_udfs = read_int(infile)
    udfs = {}
    call_udf = []
    for i in range(num_udfs):
        arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type)
        udfs['f%d' % i] = udf
        args = ["a[%d]" % o for o in arg_offsets]
        call_udf.append("f%d(%s)" % (i, ", ".join(args)))
    # Create function like this:
    #   lambda a: (f0(a0), f1(a1, a2), f2(a3))
    # In the special case of a single UDF this will return a single result rather
    # than a tuple of results; this is the format that the JVM side expects.
    mapper_str = "lambda a: (%s)" % (", ".join(call_udf))
    mapper = eval(mapper_str, udfs)

    func = lambda _, it: map(mapper, it)

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF):
        timezone = utf8_deserializer.loads(infile)
        ser = ArrowStreamPandasSerializer(timezone)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    # profiling is not supported for UDF
    return func, None, ser, ser
Example #14
    def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Load labeled points saved using RDD.saveAsTextFile.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param minPartitions: min number of partitions
        :return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                        LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        >>> type(loaded[0]) == LabeledPoint
        True
        >>> print examples[0]
        (1.1,(3,[0,2],[-1.23,4.56e-07]))
        >>> type(examples[1]) == LabeledPoint
        True
        >>> print examples[1]
        (0.0,[1.01,2.02,3.03])
        """
        minPartitions = minPartitions or min(sc.defaultParallelism, 2)
        jrdd = sc._jvm.PythonMLLibAPI().loadLabeledPoints(
            sc._jsc, path, minPartitions)
        jpyrdd = sc._jvm.PythonRDD.javaToPython(jrdd)
        return RDD(jpyrdd, sc, BatchedSerializer(PickleSerializer()))
Example #15
    def partitionBy(self, numPartitions, partitionFunc=hash):
        """
        Return a copy of the RDD partitioned using the specified partitioner.

        >>> pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda x: (x, x))
        >>> sets = pairs.partitionBy(2).glom().collect()
        >>> set(sets[0]).intersection(set(sets[1]))
        set([])
        """
        if numPartitions is None:
            numPartitions = self.ctx.defaultParallelism
        # Transferring O(n) objects to Java is too expensive.  Instead, we'll
        # form the hash buckets in Python, transferring O(numPartitions) objects
        # to Java.  Each object is a (splitNumber, [objects]) pair.
        outputSerializer = self.ctx._unbatched_serializer
        def add_shuffle_key(split, iterator):

            buckets = defaultdict(list)

            for (k, v) in iterator:
                buckets[partitionFunc(k) % numPartitions].append((k, v))
            for (split, items) in buckets.iteritems():
                yield pack_long(split)
                yield outputSerializer.dumps(items)
        keyed = PipelinedRDD(self, add_shuffle_key)
        keyed._bypass_serializer = True
        pairRDD = self.ctx._jvm.PairwiseRDD(keyed._jrdd.rdd()).asJavaPairRDD()
        partitioner = self.ctx._jvm.PythonPartitioner(numPartitions,
                                                     id(partitionFunc))
        jrdd = pairRDD.partitionBy(partitioner).values()
        rdd = RDD(jrdd, self.ctx, BatchedSerializer(outputSerializer))
        # This is required so that id(partitionFunc) remains unique, even if
        # partitionFunc is a lambda:
        rdd._partitionFunc = partitionFunc
        return rdd
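
As the comment in partitionBy() explains, the point is to pre-bucket records in Python so that only O(numPartitions) serialized objects cross the Py4J boundary. A plain-Python sketch of just that bucketing step (no Spark involved), using the same pairs as the doctest:

from collections import defaultdict

def bucket(pairs, numPartitions, partitionFunc=hash):
    # Group (k, v) pairs into numPartitions hash buckets, as add_shuffle_key() does.
    buckets = defaultdict(list)
    for k, v in pairs:
        buckets[partitionFunc(k) % numPartitions].append((k, v))
    return buckets

pairs = [(x, x) for x in [1, 2, 3, 4, 2, 4, 1]]
buckets = bucket(pairs, 2)
# Each key lands in exactly one bucket, so the two partitions never share keys.
keys0 = {k for k, _ in buckets.get(0, [])}
keys1 = {k for k, _ in buckets.get(1, [])}
assert keys0.isdisjoint(keys1)
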
Example #16
def transformToRDD(cursor, sc, parallelism=1):
    """
    Transform a StellarCursor to a Python RDD object

    :param cursor: StellarCursor
    :param sc: SparkContext
    :param parallelism: Parallelism of RDD
    """
    # Get all data from cursor
    data = cursor.fetchall()

    # Set parallelism
    parallelism = max(1, parallelism)

    def reader_func(temp_filename):
        return sc._jvm.PythonRDD.readRDDFromFile(sc._jsc, temp_filename,
                                                 parallelism)

    def createRDDServer():
        return sc._jvm.PythonParallelizeServer(sc._jsc.sc(), parallelism)

    batchSize = max(1, min(len(data) // parallelism, 1024))
    serializer = BatchedSerializer(sc._unbatched_serializer, batchSize)

    jrdd = sc._serialize_to_jvm(data, serializer, reader_func, createRDDServer)

    return RDD(jrdd, sc, serializer)
Example #17
    def parallelize(self, c, numSlices=None):
        """
        Distribute a local Python collection to form an RDD. Using xrange
        is recommended if the input represents a range for performance.

        >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
        [[0], [2], [3], [4], [6]]
        >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
        [[], [0], [], [2], [4]]
        """
        numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism
        if isinstance(c, xrange):
            size = len(c)
            if size == 0:
                return self.parallelize([], numSlices)
            step = c[1] - c[0] if size > 1 else 1
            start0 = c[0]

            def getStart(split):
                return start0 + int((split * size / numSlices)) * step

            def f(split, iterator):
                return xrange(getStart(split), getStart(split + 1), step)

            return self.parallelize([], numSlices).mapPartitionsWithIndex(f)
        # Calling the Java parallelize() method with an ArrayList is too slow,
        # because it sends O(n) Py4J commands.  As an alternative, serialized
        # objects are written to a file and loaded through textFile().
        tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
        try:
            # Make sure we distribute data evenly if it's smaller than self.batchSize
            if "__len__" not in dir(c):
                c = list(c)    # Make it a list so we can compute its length
            batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024))  # compute the batch size
            serializer = BatchedSerializer(self._unbatched_serializer, batchSize)  # wrap the serializer so objects are written in batches
            serializer.dump_stream(c, tempFile)  # write the serialized batches to the temp file
            tempFile.close()  # close the temp file
            readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile
            jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices)
        finally:
            # readRDDFromFile eagerly reads the file, so we can delete it right after.
            os.unlink(tempFile.name)  # delete the temp file (os.unlink raises an error if the path is a directory)
        return RDD(jrdd, self, serializer)
Example #18
    def parallelize(self, c, numSlices=None):
        """
        Distribute a local Python collection to form an RDD. Using xrange
        is recommended if the input represents a range for performance.

        >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
        [[0], [2], [3], [4], [6]]
        >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
        [[], [0], [], [2], [4]]
        """
        numSlices = int(
            numSlices) if numSlices is not None else self.defaultParallelism
        if isinstance(c, xrange):
            size = len(c)
            if size == 0:
                return self.parallelize([], numSlices)
            step = c[1] - c[0] if size > 1 else 1
            start0 = c[0]

            def getStart(split):
                return start0 + int((split * size / numSlices)) * step

            def f(split, iterator):
                return xrange(getStart(split), getStart(split + 1), step)

            return self.parallelize([], numSlices).mapPartitionsWithIndex(f)
        # Calling the Java parallelize() method with an ArrayList is too slow,
        # because it sends O(n) Py4J commands.  As an alternative, serialized
        # objects are written to a file and loaded through textFile().
        tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
        try:
            # Make sure we distribute data evenly if it's smaller than self.batchSize
            if "__len__" not in dir(c):
                c = list(c)  # Make it a list so we can compute its length
            batchSize = max(1,
                            min(len(c) // numSlices, self._batchSize or 1024))
            serializer = BatchedSerializer(self._unbatched_serializer,
                                           batchSize)
            serializer.dump_stream(c, tempFile)
            tempFile.close()
            readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile
            jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices)
        finally:
            # readRDDFromFile eagerly reads the file, so we can delete it right after.
            os.unlink(tempFile.name)
        return RDD(jrdd, self, serializer)
Example #19
    def collect(self):
        """Returns all the records as a list of :class:`Row`.

        >>> df.collect()
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        """
        with SCCallSiteSync(self._sc) as css:
            port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
        return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
Example #20
 def _open_file(self):
     dirs = _get_local_dirs("objects")
     d = dirs[id(self) % len(dirs)]
     if not os.path.exists(d):
         os.makedirs(d)
     p = os.path.join(d, str(id(self)))
     self._file = open(p, "wb+", 65536)
     self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024)
     os.unlink(p)
Example #21
 def _toPython(self):
     # We have to import the Row class explicitly, so that the reference the Pickler
     # holds is pyspark.sql.Row instead of __main__.Row
     from pyspark.sql import Row
     jrdd = self._jschema_rdd.javaToPython()
     # TODO: This is inefficient, we should construct the Python Row object
     # in Java land in the javaToPython function. May require a custom
     # pickle serializer in Pyrolite
     return RDD(jrdd, self._sc,
                BatchedSerializer(PickleSerializer())).map(lambda d: Row(d))
Example #22
    def take(self, num):
        """Returns the first ``num`` rows as a :class:`list` of :class:`Row`.

        >>> df.take(2)
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        """
        with SCCallSiteSync(self._sc) as css:
            port = self._sc._jvm.org.apache.spark.sql.execution.EvaluatePython.takeAndServe(
                self._jdf, num)
        return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
Example #23
    def parallelize(self, c, numSlices=None):
        """
        Distribute a local Python collection to form an RDD. Using xrange
        is recommended if the input represents a range for performance.

        >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
        [[0], [2], [3], [4], [6]]
        >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
        [[], [0], [], [2], [4]]
        """
        numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism
        if isinstance(c, xrange):
            size = len(c)
            if size == 0:
                return self.parallelize([], numSlices)
            step = c[1] - c[0] if size > 1 else 1
            start0 = c[0]

            def getStart(split):
                return start0 + int((split * size / numSlices)) * step

            def f(split, iterator):
                return xrange(getStart(split), getStart(split + 1), step)

            return self.parallelize([], numSlices).mapPartitionsWithIndex(f)
        # Calling the Java parallelize() method with an ArrayList is too slow,
        # because it sends O(n) Py4J commands.  As an alternative, serialized
        # objects are written to a file and loaded through textFile().
        tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
        try:
            # Make sure we distribute data evenly if it's smaller than self.batchSize
            if "__len__" not in dir(c):
                c = list(c)    # Make it a list so we can compute its length
            batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024))
            serializer = BatchedSerializer(self._unbatched_serializer, batchSize)
            serializer.dump_stream(c, tempFile)
            tempFile.close()
            readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile
            jrdd = readRDDFromFile(self._jsc, tempFile.name, numSlices)
        finally:
            # readRDDFromFile eagerly reads the file, so we can delete it right after.
            os.unlink(tempFile.name)
        return RDD(jrdd, self, serializer)
Example #24
 def test_hash_serializer(self):
     hash(NoOpSerializer())
     hash(UTF8Deserializer())
     hash(CPickleSerializer())
     hash(MarshalSerializer())
     hash(AutoSerializer())
     hash(BatchedSerializer(CPickleSerializer()))
     hash(AutoBatchedSerializer(MarshalSerializer()))
     hash(PairDeserializer(NoOpSerializer(), UTF8Deserializer()))
     hash(CartesianDeserializer(NoOpSerializer(), UTF8Deserializer()))
     hash(CompressedSerializer(CPickleSerializer()))
     hash(FlattenedValuesSerializer(CPickleSerializer()))
Example #25
    def parallelize(self, c, numSlices=None):
        """
        Distribute a local Python collection to form an RDD. Using xrange
        is recommended if the input represents a range for performance.

        >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
        [[0], [2], [3], [4], [6]]
        >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
        [[], [0], [], [2], [4]]
        """
        numSlices = int(
            numSlices) if numSlices is not None else self.defaultParallelism
        if isinstance(c, xrange):
            size = len(c)
            if size == 0:
                return self.parallelize([], numSlices)
            step = c[1] - c[0] if size > 1 else 1
            start0 = c[0]

            def getStart(split):
                return start0 + int((split * size / numSlices)) * step

            def f(split, iterator):
                # it's an empty iterator here but we need this line for triggering the
                # logic of signal handling in FramedSerializer.load_stream, for instance,
                # SpecialLengths.END_OF_DATA_SECTION in _read_with_length. Since
                # FramedSerializer.load_stream produces a generator, the control should
                # at least be in that function once. Here we do it by explicitly converting
                # the empty iterator to a list, thus make sure worker reuse takes effect.
                # See more details in SPARK-26549.
                assert len(list(iterator)) == 0
                return xrange(getStart(split), getStart(split + 1), step)

            return self.parallelize([], numSlices).mapPartitionsWithIndex(f)

        # Make sure we distribute data evenly if it's smaller than self.batchSize
        if "__len__" not in dir(c):
            c = list(c)  # Make it a list so we can compute its length
        batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024))
        serializer = BatchedSerializer(self._unbatched_serializer, batchSize)

        def reader_func(temp_filename):
            return self._jvm.PythonRDD.readRDDFromFile(self._jsc,
                                                       temp_filename,
                                                       numSlices)

        def createRDDServer():
            return self._jvm.PythonParallelizeServer(self._jsc.sc(), numSlices)

        jrdd = self._serialize_to_jvm(c, serializer, reader_func,
                                      createRDDServer)
        return RDD(jrdd, self, serializer)
Example #26
    def cassandraTable(self, keyspace, table):
        """Returns all the Rows in a Cassandra keyspace and table as an RDD.

        @param keyspace: Cassandra keyspace / schema name
        @param table: Cassandra table / column family name
        """
        # Unsure right now if we need CassandraSerializer, but likely do since
        # we'll get generic CassandraRow instances back that we'll need to
        # inspect?
        # return RDD(self._jcsc.cassandraTable(keyspace, table), self,
        #            CassandraSerializer())
        return RDD(self._jcsc.cassandraTable(keyspace, table), self,
                   BatchedSerializer(PickleSerializer()))
Example #27
    def pickleFile(self, name, minPartitions=None):
        """
        Load an RDD previously saved using L{RDD.saveAsPickleFile} method.

        >>> tmpFile = NamedTemporaryFile(delete=True)
        >>> tmpFile.close()
        >>> sc.parallelize(range(10)).saveAsPickleFile(tmpFile.name, 5)
        >>> sorted(sc.pickleFile(tmpFile.name, 3).collect())
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        """
        minPartitions = minPartitions or self.defaultMinPartitions
        return RDD(self._jsc.objectFile(name, minPartitions), self,
                   BatchedSerializer(PickleSerializer()))
Example #28
    def collect(self):
        """Return a list that contains all of the rows.

        Each object in the list is a Row, the fields can be accessed as
        attributes.

        >>> df.collect()
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        """
        with SCCallSiteSync(self._sc) as css:
            port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
        rs = list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
        cls = _create_cls(self.schema)
        return [cls(r) for r in rs]
Example #29
    def rdd(self):
        """Returns the content as an :class:`pyspark.RDD` of :class:`Row`.
        """
        if self._lazy_rdd is None:
            jrdd = self._jdf.javaToPython()
            rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer()))
            schema = self.schema

            def applySchema(it):
                cls = _create_cls(schema)
                return map(cls, it)

            self._lazy_rdd = rdd.mapPartitions(applySchema)

        return self._lazy_rdd
Example #30
 def test_zip_with_different_serializers(self):
     a = self.sc.parallelize(range(5))
     b = self.sc.parallelize(range(100, 105))
     self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
     a = a._reserialize(BatchedSerializer(CPickleSerializer(), 2))
     b = b._reserialize(MarshalSerializer())
     self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
     # regression test for SPARK-4841
     path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt")
     t = self.sc.textFile(path)
     cnt = t.count()
     self.assertEqual(cnt, t.zip(t).count())
     rdd = t.map(str)
     self.assertEqual(cnt, t.zip(rdd).count())
     # regression test for bug in _reserializer()
     self.assertEqual(cnt, t.zip(rdd).count())
Example #31
    def rdd(self):
        """
        Return the content of the :class:`DataFrame` as an :class:`RDD`
        of :class:`Row` s.
        """
        if not hasattr(self, '_lazy_rdd'):
            jrdd = self._jdf.javaToPython()
            rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer()))
            schema = self.schema

            def applySchema(it):
                cls = _create_cls(schema)
                return itertools.imap(cls, it)

            self._lazy_rdd = rdd.mapPartitions(applySchema)

        return self._lazy_rdd
Example #32
def read_udfs(pickleSer, infile):
    num_udfs = read_int(infile)
    udfs = {}
    call_udf = []
    for i in range(num_udfs):
        arg_offsets, udf = read_single_udf(pickleSer, infile)
        udfs['f%d' % i] = udf
        args = ["a[%d]" % o for o in arg_offsets]
        call_udf.append("f%d(%s)" % (i, ", ".join(args)))
    # Create function like this:
    #   lambda a: (f0(a0), f1(a1, a2), f2(a3))
    # In the special case of a single UDF this will return a single result rather
    # than a tuple of results; this is the format that the JVM side expects.
    mapper_str = "lambda a: (%s)" % (", ".join(call_udf))
    mapper = eval(mapper_str, udfs)

    func = lambda _, it: map(mapper, it)
    ser = BatchedSerializer(PickleSerializer(), 100)
    # profiling is not supported for UDF
    return func, None, ser, ser
Example #33
class ExternalList(object):
    """
    ExternalList can hold many items which cannot all be kept in memory
    at the same time.

    >>> l = ExternalList(list(range(100)))
    >>> len(l)
    100
    >>> l.append(10)
    >>> len(l)
    101
    >>> for i in range(20240):
    ...     l.append(i)
    >>> len(l)
    20341
    >>> import pickle
    >>> l2 = pickle.loads(pickle.dumps(l))
    >>> len(l2)
    20341
    >>> list(l2)[100]
    10
    """
    LIMIT = 10240

    def __init__(self, values):
        self.values = values
        self.count = len(values)
        self._file = None
        self._ser = None

    def __getstate__(self):
        if self._file is not None:
            self._file.flush()
            with os.fdopen(os.dup(self._file.fileno()), "rb") as f:
                f.seek(0)
                serialized = f.read()
        else:
            serialized = b''
        return self.values, self.count, serialized

    def __setstate__(self, item):
        self.values, self.count, serialized = item
        if serialized:
            self._open_file()
            self._file.write(serialized)
        else:
            self._file = None
            self._ser = None

    def __iter__(self):
        if self._file is not None:
            self._file.flush()
            # read all items from disks first
            with os.fdopen(os.dup(self._file.fileno()), 'rb') as f:
                f.seek(0)
                for v in self._ser.load_stream(f):
                    yield v

        for v in self.values:
            yield v

    def __len__(self):
        return self.count

    def append(self, value):
        self.values.append(value)
        self.count += 1
        # dump the values to disk once too many have accumulated
        if len(self.values) >= self.LIMIT:
            self._spill()

    def _open_file(self):
        dirs = _get_local_dirs("objects")
        d = dirs[id(self) % len(dirs)]
        if not os.path.exists(d):
            os.makedirs(d)
        p = os.path.join(d, str(id(self)))
        self._file = open(p, "wb+", 65536)
        self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024)
        os.unlink(p)

    def __del__(self):
        if self._file:
            self._file.close()
            self._file = None

    def _spill(self):
        """ dump the values into disk """
        global MemoryBytesSpilled, DiskBytesSpilled
        if self._file is None:
            self._open_file()

        used_memory = get_used_memory()
        pos = self._file.tell()
        self._ser.dump_stream(self.values, self._file)
        self.values = []
        gc.collect()
        DiskBytesSpilled += self._file.tell() - pos
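        # get_used_memory() reports megabytes, so the delta is shifted left by 20 to record bytes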
        MemoryBytesSpilled += (used_memory - get_used_memory()) << 20