Example #1
    def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Load labeled points saved using RDD.saveAsTextFile.

        @param sc: Spark context
        @param path: file or directory path in any Hadoop-supported file
                     system URI
        @param minPartitions: min number of partitions
        @return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                        LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        >>> type(loaded[0]) == LabeledPoint
        True
        >>> print examples[0]
        (1.1,(3,[0,2],[-1.23,4.56e-07]))
        >>> type(examples[1]) == LabeledPoint
        True
        >>> print examples[1]
        (0.0,[1.01,2.02,3.03])
        """
        minPartitions = minPartitions or min(sc.defaultParallelism, 2)
        jSerialized = sc._jvm.PythonMLLibAPI().loadLabeledPoints(
            sc._jsc, path, minPartitions)
        serialized = RDD(jSerialized, sc, NoOpSerializer())
        return serialized.map(
            lambda bytes: _deserialize_labeled_point(bytearray(bytes)))
Example #2
 def calculate_value_to_index(self, rdd: RDD):
     column_name = self.column
     if isinstance(column_name, str):
         column = rdd.map(lambda x: getattr(x, column_name))
     else:
         column = rdd.map(lambda x: x[column_name])
     self._log.info("Collecting the list of distinct sorted values (%s)", column_name)
     values = column.distinct()
     if self.explained:
         self._log.info("toDebugString():\n%s", values.toDebugString().decode())
     values = values.collect()
     values.sort()  # We do not expect an extraordinary number of distinct values
     self._log.info("%d distinct values", len(values))
     if len(values) == 0:
         raise RuntimeError("Number of distinct values is zero.")
     self._value_to_index = {d: i for i, d in enumerate(values)}
Example #3
    def __call__(self, rdd: RDD):
        column_name = self.column
        if self._value_to_index is None:
            self.calculate_value_to_index(rdd)
        column2id = rdd.context.broadcast(self._value_to_index)

        def index_column(row):
            """
            Map column_id column to its index value stored in column2id.
            WARNING: per the pyspark documentation
            (http://spark.apache.org/docs/latest/rdd-programming-guide.html#passing-functions-to-spark),
            do not use self inside this function: it will be suboptimal and will probably fail to run.
            Please contact me if you have troubles: [email protected]
            """
            if isinstance(column_name, str):
                assert isinstance(row, Row)
                row_dict = row.asDict()
                row_dict[column_name] = column2id.value[row_dict[column_name]]
                return Row(**row_dict)
            return row[:column_name] + (
                column2id.value[row[column_name]], ) + row[column_name + 1:]

        indexed_rdd = rdd.map(index_column)
        column2id.unpersist(blocking=True)
        return indexed_rdd
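
The core pattern in Examples #2 and #3 is building a small value-to-index dict on the driver, broadcasting it, and applying it inside map. A minimal standalone sketch of that pattern follows; the names and data are illustrative assumptions, not part of the original class:

    # Standalone sketch of the broadcast-lookup pattern from Example #3.
    # All names and data here are illustrative assumptions.
    from pyspark import SparkContext
    from pyspark.sql import Row

    sc = SparkContext.getOrCreate()
    rows = sc.parallelize([Row(lang="py", n=1), Row(lang="go", n=2), Row(lang="py", n=3)])

    # Driver-side dict of distinct sorted values to indices, as in calculate_value_to_index.
    distinct = sorted(rows.map(lambda r: r.lang).distinct().collect())
    lookup = sc.broadcast({v: i for i, v in enumerate(distinct)})

    def index_lang(row):
        d = row.asDict()
        d["lang"] = lookup.value[d["lang"]]
        return Row(**d)

    indexed = rows.map(index_lang)
    print(indexed.collect())  # lang replaced by its index, e.g. go -> 0, py -> 1
    lookup.unpersist(blocking=True)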
Example #4
    def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Load labeled points saved using RDD.saveAsTextFile.

        @param sc: Spark context
        @param path: file or directory path in any Hadoop-supported file
                     system URI
        @param minPartitions: min number of partitions
        @return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                        LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        >>> type(loaded[0]) == LabeledPoint
        True
        >>> print examples[0]
        (1.1,(3,[0,2],[-1.23,4.56e-07]))
        >>> type(examples[1]) == LabeledPoint
        True
        >>> print examples[1]
        (0.0,[1.01,2.02,3.03])
        """
        minPartitions = minPartitions or min(sc.defaultParallelism, 2)
        jSerialized = sc._jvm.PythonMLLibAPI().loadLabeledPoints(sc._jsc, path, minPartitions)
        serialized = RDD(jSerialized, sc, NoOpSerializer())
        return serialized.map(lambda bytes: _deserialize_labeled_point(bytearray(bytes)))
Example #5
    def poissonVectorRDD(sc,
                         mean,
                         numRows,
                         numCols,
                         numPartitions=None,
                         seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the Poisson distribution with the input mean.

        >>> import numpy as np
        >>> mean = 100.0
        >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
        >>> mat = np.mat(rdd.collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - mean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(mat.std() - sqrt(mean)) < 0.5
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI() \
            .poissonVectorRDD(sc._jsc, mean, numRows, numCols, numPartitions, seed)
        poisson = RDD(jrdd, sc, NoOpSerializer())
        return poisson.map(
            lambda bytes: _deserialize_double_vector(bytearray(bytes)))
Example #6
    def uniformRDD(sc, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the
        uniform distribution U(0.0, 1.0).

        To transform the distribution in the generated RDD from U(0.0, 1.0)
        to U(a, b), use
        C{RandomRDDs.uniformRDD(sc, n, p, seed)\
          .map(lambda v: a + (b - a) * v)}

        >>> x = RandomRDDs.uniformRDD(sc, 100).collect()
        >>> len(x)
        100
        >>> max(x) <= 1.0 and min(x) >= 0.0
        True
        >>> RandomRDDs.uniformRDD(sc, 100, 4).getNumPartitions()
        4
        >>> parts = RandomRDDs.uniformRDD(sc, 100, seed=4).getNumPartitions()
        >>> parts == sc.defaultParallelism
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI().uniformRDD(sc._jsc, size,
                                                   numPartitions, seed)
        uniform = RDD(jrdd, sc, NoOpSerializer())
        return uniform.map(lambda bytes: _deserialize_double(bytearray(bytes)))
Example #7
def rank_using_reverted_index(index: RDD):
    """
    Rank the keys of a reverted index by the number of items they map to.

    :param index: pair RDD mapping a key (e.g. language) to a collection of items (e.g. articles)
    :return: list of (key, count) tuples, sorted by count in descending order
    """
    return index.map(lambda lang_article: (lang_article[0], len(lang_article[1]))) \
        .sortBy(lambda k: k[1], ascending=False) \
        .collect()
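
A quick way to exercise rank_using_reverted_index; the data below is illustrative, and the function only assumes a pair RDD whose values have a length:

    from pyspark import SparkContext

    sc = SparkContext.getOrCreate()
    index = sc.parallelize([("python", ["a", "b", "c"]), ("go", ["d"])])
    print(rank_using_reverted_index(index))  # [('python', 3), ('go', 1)]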
Example #8
        def summary_cluster(cluster_result_: RDD) -> dict:
            """
            Get the summary information of the discard set/ compression set
            :param cluster_result_: pair RDD, key is point_id, value is (cluster_id, feature vector)
            :return: a dictionary containing the summary information
            """
            # Per-cluster, per-feature sums of the feature values (sums of squares
            # are computed along the way but only the plain sums are kept).
            cluster_sum = (
                cluster_result_
                # Drop the point_id key, keeping (cluster_id, feature vector) pairs.
                .values()
                # Explode each vector into (cluster_id, (feature_index, (x, x ** 2))).
                .flatMapValues(lambda features: [(feature_index, (feature, feature ** 2))
                                                 for feature_index, feature in enumerate(features)])
                # Re-key by (cluster_id, feature_index) and sum x and x ** 2 per key.
                .map(lambda pair: ((pair[0], pair[1][0]), pair[1][1]))
                .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
                # Regroup by cluster_id, order features by index, and keep only the sums.
                .map(lambda pair: (pair[0][0], (pair[0][1], (pair[1][0], pair[1][1]))))
                .groupByKey()
                .mapValues(sorted)
                .mapValues(lambda values: [value for feature_index, value in values])
                .mapValues(lambda values: [value for value, _ in values])
                .persist())
            cluster_sum = dict(cluster_sum.collect())

            cluster_assignments = dict(
                cluster_result_.map(lambda pair: (pair[1][0], pair[0])).
                groupByKey().mapValues(set).collect())
            cluster_point_index = set(cluster_result_.keys().collect())
            cluster_count = dict(
                cluster_result_.map(lambda pair: (pair[1][0], 1)).reduceByKey(
                    lambda x, y: x + y).collect())
            cluster_centroids = dict()
            for cid in cluster_sum.keys():
                cluster_size = cluster_count[cid]
                cluster_centroids[cid] = [
                    Sum / cluster_size for Sum in cluster_sum[cid]
                ]

            return {
                "assignments": cluster_assignments,
                "point_index": cluster_point_index,
                "count": cluster_count,
                "sum": cluster_sum,
                "centroids": cluster_centroids
            }
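
Because summary_cluster only assumes a pair RDD of point_id -> (cluster_id, feature vector), it can be exercised with a tiny hand-built RDD. The data below is illustrative, and the function is treated here as if it were callable at module level:

    from pyspark import SparkContext

    sc = SparkContext.getOrCreate()
    cluster_result = sc.parallelize([
        (0, (0, [1.0, 2.0])),
        (1, (0, [3.0, 4.0])),
        (2, (1, [5.0, 6.0])),
    ])
    summary = summary_cluster(cluster_result)
    print(summary["count"])      # {0: 2, 1: 1}
    print(summary["centroids"])  # {0: [2.0, 3.0], 1: [5.0, 6.0]}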
Example #9
    def log_likelihood(self, rdd: RDD) -> float:
        """Returns the log-likelihood that this model generated the given data distribution.

        Parameters
        ----------
        rdd: RDD
            Elements are assumed to be floats.

        Returns
        -------
        float
            The log-likelihood that this model generated the given data distribution.
        """
        return rdd.map(lambda x: math.log(self.pdf(x))).sum()  # action
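
log_likelihood only needs an object with a pdf method; a minimal illustrative model shows the pattern end to end. The GaussianModel class below is an assumption, not part of the original code:

    import math

    from pyspark import SparkContext

    class GaussianModel:
        def __init__(self, mu, sigma):
            self.mu, self.sigma = mu, sigma

        def pdf(self, x):
            z = (x - self.mu) / self.sigma
            return math.exp(-0.5 * z * z) / (self.sigma * math.sqrt(2.0 * math.pi))

        def log_likelihood(self, rdd):
            # Same pattern as above: sum of log-densities over the RDD (an action).
            return rdd.map(lambda x: math.log(self.pdf(x))).sum()

    sc = SparkContext.getOrCreate()
    print(GaussianModel(0.0, 1.0).log_likelihood(sc.parallelize([0.1, -0.2, 0.3])))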
Example #10
 def fit(self, data: RDD):
     # data.cache()
     # Initialize the cluster centers
     clusters = self._init_clusters(data)
     for i in range(self.max_iter):
         # Compute the distance from each point to every cluster center
         dists = data.map(lambda x:
                          (x, [cal_distance(x, c) for c in clusters]))
         # Assign each point to its nearest cluster center
         tmp = dists.map(lambda x: (np.argmin(x[1]), x[0]))
         # Recompute each cluster center as the mean of its assigned points
         clusters = tmp.mapValues(lambda x: (x, 1)).reduceByKey(
             lambda x, y: (x[0] + y[0], x[1] + y[1])).map(
                 lambda x: x[1][0] / x[1][1]).collect()
     self.clusters = clusters
     return self
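
fit relies on a cal_distance helper and an _init_clusters method that are not shown in the snippet. One plausible shape for them, stated here purely as assumptions (squared Euclidean distance, random initial centers, and a hypothetical k attribute on the estimator):

    import numpy as np

    def cal_distance(x, center):
        # Assumed helper: squared Euclidean distance between a point and a center.
        return float(np.sum((np.asarray(x) - np.asarray(center)) ** 2))

    def _init_clusters(self, data):
        # Assumed method on the estimator: sample k distinct points as initial centers.
        return data.takeSample(False, self.k, seed=42)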
Example #11
File: random.py  Project: hcook/spark
    def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the standard normal distribution.

        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - 0.0) < 0.1
        True
        >>> abs(mat.std() - 1.0) < 0.1
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI().normalVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
        normal = RDD(jrdd, sc, NoOpSerializer())
        return normal.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
Example #12
File: random.py  Project: hcook/spark
    def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the uniform distribution U(0.0, 1.0).

        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
        >>> mat.shape
        (10, 10)
        >>> mat.max() <= 1.0 and mat.min() >= 0.0
        True
        >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
        4
        """
        jrdd = sc._jvm.PythonMLLibAPI().uniformVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
        uniform = RDD(jrdd, sc, NoOpSerializer())
        return uniform.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
Example #13
    def createRDD(sc,
                  kafkaParams,
                  offsetRanges,
                  leaders=None,
                  keyDecoder=utf8_decoder,
                  valueDecoder=utf8_decoder):
        """
        .. note:: Experimental

        Create an RDD from Kafka using offset ranges for each topic and partition.

        :param sc:  SparkContext object
        :param kafkaParams: Additional params for Kafka
        :param offsetRanges:  list of offsetRange to specify topic:partition:[start, end) to consume
        :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges.  May be an empty
            map, in which case leaders will be looked up on the driver.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :return: An RDD object
        """
        if leaders is None:
            leaders = dict()
        if not isinstance(kafkaParams, dict):
            raise TypeError("kafkaParams should be dict")
        if not isinstance(offsetRanges, list):
            raise TypeError("offsetRanges should be list")

        try:
            helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
            helper = helperClass.newInstance()
            joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges]
            jleaders = dict([(k._jTopicAndPartition(helper),
                              v._jBroker(helper))
                             for (k, v) in leaders.items()])
            jrdd = helper.createRDD(sc._jsc, kafkaParams, joffsetRanges,
                                    jleaders)
        except Py4JJavaError as e:
            if 'ClassNotFoundException' in str(e.java_exception):
                KafkaUtils._printErrorMsg(sc)
            raise e

        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        rdd = RDD(jrdd, sc, ser)
        return rdd.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
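
This snippet targets the old spark-streaming-kafka-0-8 module (removed in Spark 3.x). Under that assumption, and with an illustrative broker and topic, a call might look like:

    from pyspark import SparkContext
    from pyspark.streaming.kafka import KafkaUtils, OffsetRange

    sc = SparkContext.getOrCreate()
    kafkaParams = {"metadata.broker.list": "localhost:9092"}  # illustrative broker
    offsetRanges = [OffsetRange("my_topic", 0, 0, 100)]       # topic, partition, [start, end)
    rdd = KafkaUtils.createRDD(sc, kafkaParams, offsetRanges)
    print(rdd.take(5))  # (key, value) pairs decoded with utf8_decoder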
Example #14
    def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the uniform distribution U(0.0, 1.0).

        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
        >>> mat.shape
        (10, 10)
        >>> mat.max() <= 1.0 and mat.min() >= 0.0
        True
        >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
        4
        """
        jrdd = sc._jvm.PythonMLLibAPI() \
            .uniformVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
        uniform = RDD(jrdd, sc, NoOpSerializer())
        return uniform.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
Example #15
    def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the standard normal distribution.

        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - 0.0) < 0.1
        True
        >>> abs(mat.std() - 1.0) < 0.1
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI() \
            .normalVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
        normal = RDD(jrdd, sc, NoOpSerializer())
        return normal.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
Example #16
    def poissonRDD(sc, mean, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the Poisson
        distribution with the input mean.

        >>> mean = 100.0
        >>> x = RandomRDDGenerators.poissonRDD(sc, mean, 1000, seed=1L)
        >>> stats = x.stats()
        >>> stats.count()
        1000L
        >>> abs(stats.mean() - mean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(stats.stdev() - sqrt(mean)) < 0.5
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI().poissonRDD(sc._jsc, mean, size, numPartitions, seed)
        poisson = RDD(jrdd, sc, NoOpSerializer())
        return poisson.map(lambda bytes: _deserialize_double(bytearray(bytes)))
Example #17
    def poissonRDD(sc, mean, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the Poisson
        distribution with the input mean.

        >>> mean = 100.0
        >>> x = RandomRDDs.poissonRDD(sc, mean, 1000, seed=1L)
        >>> stats = x.stats()
        >>> stats.count()
        1000L
        >>> abs(stats.mean() - mean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(stats.stdev() - sqrt(mean)) < 0.5
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI().poissonRDD(sc._jsc, mean, size, numPartitions, seed)
        poisson = RDD(jrdd, sc, NoOpSerializer())
        return poisson.map(lambda bytes: _deserialize_double(bytearray(bytes)))
Example #18
File: random.py  Project: hcook/spark
    def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the Poisson distribution with the input mean.

        >>> import numpy as np
        >>> mean = 100.0
        >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
        >>> mat = np.mat(rdd.collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - mean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(mat.std() - sqrt(mean)) < 0.5
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI().poissonVectorRDD(sc._jsc, mean, numRows, numCols, numPartitions, seed)
        poisson = RDD(jrdd, sc, NoOpSerializer())
        return poisson.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
Example #19
    def normalRDD(sc, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the standard normal
        distribution.

        To transform the distribution in the generated RDD from standard normal
        to some other normal N(mean, sigma), use
        C{RandomRDDGenerators.normal(sc, n, p, seed)\
          .map(lambda v: mean + sigma * v)}

        >>> x = RandomRDDGenerators.normalRDD(sc, 1000, seed=1L)
        >>> stats = x.stats()
        >>> stats.count()
        1000L
        >>> abs(stats.mean() - 0.0) < 0.1
        True
        >>> abs(stats.stdev() - 1.0) < 0.1
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI().normalRDD(sc._jsc, size, numPartitions, seed)
        normal = RDD(jrdd, sc, NoOpSerializer())
        return normal.map(lambda bytes: _deserialize_double(bytearray(bytes)))
Example #20
    def normalRDD(sc, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the standard normal
        distribution.

        To transform the distribution in the generated RDD from standard normal
        to some other normal N(mean, sigma^2), use
        C{RandomRDDs.normal(sc, n, p, seed)\
          .map(lambda v: mean + sigma * v)}

        >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1L)
        >>> stats = x.stats()
        >>> stats.count()
        1000L
        >>> abs(stats.mean() - 0.0) < 0.1
        True
        >>> abs(stats.stdev() - 1.0) < 0.1
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI().normalRDD(sc._jsc, size, numPartitions, seed)
        normal = RDD(jrdd, sc, NoOpSerializer())
        return normal.map(lambda bytes: _deserialize_double(bytearray(bytes)))
Example #21
File: kafka.py  Project: 308306362/spark
    def createRDD(sc, kafkaParams, offsetRanges, leaders={},
                  keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
        """
        .. note:: Experimental

        Create an RDD from Kafka using offset ranges for each topic and partition.

        :param sc:  SparkContext object
        :param kafkaParams: Additional params for Kafka
        :param offsetRanges:  list of offsetRange to specify topic:partition:[start, end) to consume
        :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges.  May be an empty
            map, in which case leaders will be looked up on the driver.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :return: An RDD object
        """
        if not isinstance(kafkaParams, dict):
            raise TypeError("kafkaParams should be dict")
        if not isinstance(offsetRanges, list):
            raise TypeError("offsetRanges should be list")

        try:
            helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
            helper = helperClass.newInstance()
            joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges]
            jleaders = dict([(k._jTopicAndPartition(helper),
                              v._jBroker(helper)) for (k, v) in leaders.items()])
            jrdd = helper.createRDD(sc._jsc, kafkaParams, joffsetRanges, jleaders)
        except Py4JJavaError as e:
            if 'ClassNotFoundException' in str(e.java_exception):
                KafkaUtils._printErrorMsg(sc)
            raise e

        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        rdd = RDD(jrdd, sc, ser)
        return rdd.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
Example #22
    def uniformRDD(sc, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the
        uniform distribution on [0.0, 1.0].

        To transform the distribution in the generated RDD from U[0.0, 1.0]
        to U[a, b], use
        C{RandomRDDGenerators.uniformRDD(sc, n, p, seed)\
          .map(lambda v: a + (b - a) * v)}

        >>> x = RandomRDDGenerators.uniformRDD(sc, 100).collect()
        >>> len(x)
        100
        >>> max(x) <= 1.0 and min(x) >= 0.0
        True
        >>> RandomRDDGenerators.uniformRDD(sc, 100, 4).getNumPartitions()
        4
        >>> parts = RandomRDDGenerators.uniformRDD(sc, 100, seed=4).getNumPartitions()
        >>> parts == sc.defaultParallelism
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI().uniformRDD(sc._jsc, size, numPartitions, seed)
        uniform = RDD(jrdd, sc, NoOpSerializer())
        return uniform.map(lambda bytes: _deserialize_double(bytearray(bytes)))
Example #23
 def predict(self, data: RDD):
     # Distance from each point to every learned cluster center
     dists = data.map(lambda x:
                      (x, [cal_distance(x, c) for c in self.clusters]))
     # Index of the nearest cluster center for each point
     tmp = dists.map(lambda x: np.argmin(x[1]))
     return tmp.collect()
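
Examples #10 and #23 appear to come from the same estimator class; a short driver, with an assumed class name KMeans, an assumed constructor, and illustrative data, might tie them together:

    import numpy as np
    from pyspark import SparkContext

    sc = SparkContext.getOrCreate()
    data = sc.parallelize([np.array(p) for p in
                           [[0.0, 0.0], [0.1, 0.2], [9.0, 9.0], [9.1, 8.9]]])

    model = KMeans(max_iter=10)  # assumed class name and constructor
    model.fit(data)              # Example #10
    print(model.predict(data))   # Example #23: one cluster index per point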