Пример #1
0
    def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Read back labeled points that were written with RDD.saveAsTextFile.

        @param sc: Spark context
        @param path: file or directory path in any Hadoop-supported file
                     system URI
        @param minPartitions: min number of partitions; when falsy,
                              min(sc.defaultParallelism, 2) is used
        @return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                        LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        >>> type(loaded[0]) == LabeledPoint
        True
        >>> print examples[0]
        (1.1,(3,[0,2],[-1.23,4.56e-07]))
        >>> type(examples[1]) == LabeledPoint
        True
        >>> print examples[1]
        (0.0,[1.01,2.02,3.03])
        """
        if not minPartitions:
            minPartitions = min(sc.defaultParallelism, 2)

        api = sc._jvm.PythonMLLibAPI()
        javaPoints = api.loadLabeledPoints(sc._jsc, path, minPartitions)
        # The JVM hands back raw byte records, so no Python-side
        # deserializer is applied until the map below.
        rawPoints = RDD(javaPoints, sc, NoOpSerializer())

        def toLabeledPoint(raw):
            return _deserialize_labeled_point(bytearray(raw))

        return rawPoints.map(toLabeledPoint)
Пример #2
0
	def __init__(self, ctx, resource_read=None, query=None, **kwargs):
		"""
		Build the RDD from the JSON RDD returned by the JVM-side helper.

		The keyword arguments are merged with ``resource_read`` and ``query``
		by ``make_es_config`` and converted to a Java object before being
		handed to ``esJsonRDD``.
		"""
		es_config = make_es_config(kwargs, resource_read=resource_read, query=query)
		java_config = as_java_object(ctx._gateway, es_config)
		raw = RDD(helper(ctx).esJsonRDD(ctx._jsc, java_config), ctx, NoOpSerializer())

		def pairwise(iterable):
			# Records arrive as a flat stream (first key, then value / doc).
			# Zipping one iterator with itself consumes them two at a time.
			source = iter(iterable)
			return izip(source, source)

		paired = raw.mapPartitions(pairwise, True)

		super(EsRDD, self).__init__(paired._jrdd, ctx)
Пример #3
0
    def rdd(self):
        """Return the content as an :class:`pyspark.RDD` of :class:`Row`.

        The RDD is built on first access and cached in ``self._lazy_rdd``.
        """
        if self._lazy_rdd is not None:
            return self._lazy_rdd

        pickled = RDD(self._jdf.javaToPython(), self.sql_ctx._sc,
                      BatchedSerializer(PickleSerializer()))
        schema = self.schema

        def applySchema(it):
            # Build the row class once per partition, then wrap every record.
            row_cls = _create_cls(schema)
            return map(row_cls, it)

        self._lazy_rdd = pickled.mapPartitions(applySchema)
        return self._lazy_rdd
Пример #4
0
    def rdd(self):
        """
        Return the content of the :class:`DataFrame` as an :class:`RDD`
        of :class:`Row` s.

        The RDD is created on first access and stored on the instance as
        ``_lazy_rdd``.
        """
        if hasattr(self, '_lazy_rdd'):
            return self._lazy_rdd

        pickled = RDD(self._jdf.javaToPython(), self.sql_ctx._sc,
                      BatchedSerializer(PickleSerializer()))
        schema = self.schema

        def applySchema(it):
            # Build the row class once per partition and wrap records lazily.
            row_cls = _create_cls(schema)
            return itertools.imap(row_cls, it)

        self._lazy_rdd = pickled.mapPartitions(applySchema)
        return self._lazy_rdd
Пример #5
0
    def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
        """
        Build an RDD of vectors whose entries are i.i.d. samples drawn from
        the standard normal distribution.

        All arguments after ``sc`` are forwarded unchanged to the JVM-side
        ``PythonMLLibAPI.normalVectorRDD``.

        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - 0.0) < 0.1
        True
        >>> abs(mat.std() - 1.0) < 0.1
        True
        """
        api = sc._jvm.PythonMLLibAPI()
        javaVectors = api.normalVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
        serialized = RDD(javaVectors, sc, NoOpSerializer())
        return serialized.map(lambda raw: _deserialize_double_vector(bytearray(raw)))
Пример #6
0
    def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
        """
        Build an RDD of vectors whose entries are i.i.d. samples drawn from
        the uniform distribution U(0.0, 1.0).

        All arguments after ``sc`` are forwarded unchanged to the JVM-side
        ``PythonMLLibAPI.uniformVectorRDD``.

        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
        >>> mat.shape
        (10, 10)
        >>> mat.max() <= 1.0 and mat.min() >= 0.0
        True
        >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
        4
        """
        api = sc._jvm.PythonMLLibAPI()
        javaVectors = api.uniformVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
        serialized = RDD(javaVectors, sc, NoOpSerializer())
        return serialized.map(lambda raw: _deserialize_double_vector(bytearray(raw)))
Пример #7
0
    def poissonRDD(sc, mean, size, numPartitions=None, seed=None):
        """
        Build an RDD of i.i.d. samples from the Poisson distribution with
        the given mean.

        All arguments after ``sc`` are forwarded unchanged to the JVM-side
        ``PythonMLLibAPI.poissonRDD``.

        >>> mean = 100.0
        >>> x = RandomRDDGenerators.poissonRDD(sc, mean, 1000, seed=1L)
        >>> stats = x.stats()
        >>> stats.count()
        1000L
        >>> abs(stats.mean() - mean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(stats.stdev() - sqrt(mean)) < 0.5
        True
        """
        api = sc._jvm.PythonMLLibAPI()
        javaSamples = api.poissonRDD(sc._jsc, mean, size, numPartitions, seed)
        serialized = RDD(javaSamples, sc, NoOpSerializer())
        return serialized.map(lambda raw: _deserialize_double(bytearray(raw)))
Пример #8
0
    def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
        """
        Build an RDD of vectors whose entries are i.i.d. samples drawn from
        the Poisson distribution with the given mean.

        All arguments after ``sc`` are forwarded unchanged to the JVM-side
        ``PythonMLLibAPI.poissonVectorRDD``.

        >>> import numpy as np
        >>> mean = 100.0
        >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
        >>> mat = np.mat(rdd.collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - mean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(mat.std() - sqrt(mean)) < 0.5
        True
        """
        api = sc._jvm.PythonMLLibAPI()
        javaVectors = api.poissonVectorRDD(sc._jsc, mean, numRows, numCols, numPartitions, seed)
        serialized = RDD(javaVectors, sc, NoOpSerializer())
        return serialized.map(lambda raw: _deserialize_double_vector(bytearray(raw)))
Пример #9
0
    def normalRDD(sc, size, numPartitions=None, seed=None):
        """
        Build an RDD of i.i.d. samples from the standard normal
        distribution.

        To turn the standard normal samples into samples from some other
        normal N(mean, sigma), use
        C{RandomRDDGenerators.normalRDD(sc, n, p, seed)\
          .map(lambda v: mean + sigma * v)}

        >>> x = RandomRDDGenerators.normalRDD(sc, 1000, seed=1L)
        >>> stats = x.stats()
        >>> stats.count()
        1000L
        >>> abs(stats.mean() - 0.0) < 0.1
        True
        >>> abs(stats.stdev() - 1.0) < 0.1
        True
        """
        api = sc._jvm.PythonMLLibAPI()
        javaSamples = api.normalRDD(sc._jsc, size, numPartitions, seed)
        serialized = RDD(javaSamples, sc, NoOpSerializer())
        return serialized.map(lambda raw: _deserialize_double(bytearray(raw)))
Пример #10
0
	def asDataFrames(self, *index_by):
		'''
			Reads the spanned rows as DataFrames if pandas is available, or as
			a dict of numpy arrays if only numpy is available or as a dict with
			primitives and objects otherwise.

			@param index_by If pandas is available, the dataframes will be
			indexed by the given columns.
			@raise ValueError if one of the index_by columns is a column by
			which the rows are spanned.
		'''
		for c in index_by:
			if c in self.columns:
				# Interpolate the offending column; the message used to be
				# raised with a literal '%s' in it.
				raise ValueError('column %s cannot be used as index in the data'
					'frames as it is a column by which the rows are spanned.' % (c,))

		columns = as_java_array(self.ctx._gateway, "String", (str(c) for c in self.columns))
		jrdd = self._helper.spanBy(self._cjrdd, columns)
		rdd = RDD(jrdd, self.ctx)

		# pd is a module-level name; it is only read here.
		if index_by and pd:
			# Pass the index columns as ONE list. Unpacking them positionally
			# would feed the 2nd/3rd column into set_index's drop/append
			# parameters instead of using them as index keys.
			index_columns = [str(c) for c in index_by]
			return rdd.mapValues(lambda frame: frame.set_index(index_columns))
		return rdd
Пример #11
0
    def createRDD(sc, kafkaParams, offsetRanges, leaders=None,
                  keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
        """
        .. note:: Experimental

        Create a RDD from Kafka using offset ranges for each topic and partition.

        :param sc:  SparkContext object
        :param kafkaParams: Additional params for Kafka
        :param offsetRanges:  list of offsetRange to specify topic:partition:[start, end) to consume
        :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges.  May be an empty
            map (or None, the default), in which case leaders will be looked up on the driver.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :return: A RDD object
        :raises TypeError: if kafkaParams is not a dict or offsetRanges is not a list
        """
        # None replaces the former shared mutable default ``{}``.
        if leaders is None:
            leaders = {}
        if not isinstance(kafkaParams, dict):
            raise TypeError("kafkaParams should be dict")
        if not isinstance(offsetRanges, list):
            raise TypeError("offsetRanges should be list")

        try:
            helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
            helper = helperClass.newInstance()
            joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges]
            jleaders = dict((k._jTopicAndPartition(helper), v._jBroker(helper))
                            for (k, v) in leaders.items())
            jrdd = helper.createRDD(sc._jsc, kafkaParams, joffsetRanges, jleaders)
        except Py4JJavaError as e:
            # A missing helper class gets an extra hint printed before the
            # error propagates.
            if 'ClassNotFoundException' in str(e.java_exception):
                KafkaUtils._printErrorMsg(sc)
            # Bare raise keeps the original traceback.
            raise

        # Keys and values arrive as raw bytes; decoding happens in the map.
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        rdd = RDD(jrdd, sc, ser)
        return rdd.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
Пример #12
0
    def uniformRDD(sc, size, numPartitions=None, seed=None):
        """
        Build an RDD of i.i.d. samples from the uniform distribution on
        [0.0, 1.0].

        To turn the U[0.0, 1.0] samples into samples from U[a, b], use
        C{RandomRDDGenerators.uniformRDD(sc, n, p, seed)\
          .map(lambda v: a + (b - a) * v)}

        >>> x = RandomRDDGenerators.uniformRDD(sc, 100).collect()
        >>> len(x)
        100
        >>> max(x) <= 1.0 and min(x) >= 0.0
        True
        >>> RandomRDDGenerators.uniformRDD(sc, 100, 4).getNumPartitions()
        4
        >>> parts = RandomRDDGenerators.uniformRDD(sc, 100, seed=4).getNumPartitions()
        >>> parts == sc.defaultParallelism
        True
        """
        api = sc._jvm.PythonMLLibAPI()
        javaSamples = api.uniformRDD(sc._jsc, size, numPartitions, seed)
        serialized = RDD(javaSamples, sc, NoOpSerializer())
        return serialized.map(lambda raw: _deserialize_double(bytearray(raw)))
Пример #13
0
    def sequenceFile(self,
                     path,
                     keyClass=None,
                     valueClass=None,
                     keyConverter=None,
                     valueConverter=None,
                     minSplits=None,
                     batchSize=0):
        """
        Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS,
        a local file system (available on all nodes), or any Hadoop-supported file system URI.
        The mechanism is as follows:

            1. A Java RDD is created from the SequenceFile or other InputFormat, and the key
               and value Writable classes
            2. Serialization is attempted via Pyrolite pickling
            3. If this fails, the fallback is to call 'toString' on each key and value
            4. C{PickleSerializer} is used to deserialize pickled objects on the Python side

        :param path: path to the sequence file
        :param keyClass: fully qualified classname of key Writable class
               (e.g. "org.apache.hadoop.io.Text")
        :param valueClass: fully qualified classname of value Writable class
               (e.g. "org.apache.hadoop.io.LongWritable")
        :param keyConverter: forwarded unchanged to the JVM reader
        :param valueConverter: forwarded unchanged to the JVM reader
        :param minSplits: minimum splits in dataset
               (default min(2, sc.defaultParallelism))
        :param batchSize: The number of Python objects represented as a single
               Java object. (default 0, choose batchSize automatically)
        """
        if not minSplits:
            minSplits = min(self.defaultParallelism, 2)
        readSequenceFile = self._jvm.PythonRDD.sequenceFile
        javaRdd = readSequenceFile(self._jsc, path, keyClass, valueClass,
                                   keyConverter, valueConverter,
                                   minSplits, batchSize)
        return RDD(javaRdd, self)
Пример #14
0
    def hadoopFile(self,
                   path,
                   inputFormatClass,
                   keyClass,
                   valueClass,
                   keyConverter=None,
                   valueConverter=None,
                   conf=None,
                   batchSize=0):
        """
        Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS,
        a local file system (available on all nodes), or any Hadoop-supported file system URI.
        The mechanism is the same as for sc.sequenceFile.

        A Hadoop configuration can be passed in as a Python dict. It is
        converted with ``self._dictToJavaMap`` before being sent to Java.

        :param path: path to Hadoop file
        :param inputFormatClass: fully qualified classname of Hadoop InputFormat
               (e.g. "org.apache.hadoop.mapred.TextInputFormat")
        :param keyClass: fully qualified classname of key Writable class
               (e.g. "org.apache.hadoop.io.Text")
        :param valueClass: fully qualified classname of value Writable class
               (e.g. "org.apache.hadoop.io.LongWritable")
        :param keyConverter: (None by default)
        :param valueConverter: (None by default)
        :param conf: Hadoop configuration, passed in as a dict
               (None by default)
        :param batchSize: The number of Python objects represented as a single
               Java object. (default 0, choose batchSize automatically)
        """
        javaConf = self._dictToJavaMap(conf)
        readHadoopFile = self._jvm.PythonRDD.hadoopFile
        javaRdd = readHadoopFile(self._jsc, path, inputFormatClass, keyClass,
                                 valueClass, keyConverter, valueConverter,
                                 javaConf, batchSize)
        return RDD(javaRdd, self)
Пример #15
0
    def union(self, rdds):
        """
        Build the union of a list of RDDs.

        RDDs with different serialized formats may be combined; when any
        deserializer differs from the first one, every RDD is reserialized
        with the default serializer first:

        >>> path = os.path.join(tempdir, "union-text.txt")
        >>> with open(path, "w") as testFile:
        ...    _ = testFile.write("Hello")
        >>> textFile = sc.textFile(path)
        >>> textFile.collect()
        [u'Hello']
        >>> parallelized = sc.parallelize(["World!"])
        >>> sorted(sc.union([textFile, parallelized]).collect())
        [u'Hello', 'World!']
        """
        head_deserializer = rdds[0]._jrdd_deserializer
        for candidate in rdds:
            if candidate._jrdd_deserializer != head_deserializer:
                # One mismatch is enough: bring all inputs to a common format.
                rdds = [each._reserialize() for each in rdds]
                break
        java_rdds = [each._jrdd for each in rdds]
        merged = self._jsc.union(java_rdds[0], java_rdds[1:])
        return RDD(merged, self, rdds[0]._jrdd_deserializer)
Пример #16
0
    def parallelize(self, c, numSlices=None):
        """
        Distribute a local Python collection to form an RDD.

        :param c: the collection (or any iterable) to distribute; iterables
               without a length are materialized into a list first
        :param numSlices: number of slices; when falsy,
               self.defaultParallelism is used
        :return: an RDD read back from a temporary pickle file

        >>> sc.parallelize(range(5), 5).glom().collect()
        [[0], [1], [2], [3], [4]]
        """
        numSlices = numSlices or self.defaultParallelism
        # Calling the Java parallelize() method with an ArrayList is too slow,
        # because it sends O(n) Py4J commands.  As an alternative, serialized
        # objects are written to a file and loaded through textFile().
        tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
        try:
            # Make sure we distribute data evenly if it's smaller than self.batchSize
            if not hasattr(c, "__len__"):
                c = list(c)  # Make it a list so we can compute its length
            batchSize = min(len(c) // numSlices, self.batchSize)
            if batchSize > 1:
                c = batched(c, batchSize)
            for x in c:
                write_with_length(dump_pickle(x), tempFile)
        finally:
            # Close the handle even when batching or pickling fails, so the
            # file descriptor is not leaked.
            tempFile.close()
        readRDDFromPickleFile = self._jvm.PythonRDD.readRDDFromPickleFile
        jrdd = readRDDFromPickleFile(self._jsc, tempFile.name, numSlices)
        return RDD(jrdd, self)
Пример #17
0
 def productFeatures(self):
     """Return the model's product features as a Python RDD.

     The Java-side result is converted with ``SerDe.fromTuple2RDD`` and
     exposed through ``PythonRDD.javaToPython`` with a pickle-based
     auto-batched serializer.
     """
     ctx = self._context
     tupleFeatures = self._java_model.productFeatures()
     javaFeatures = ctx._jvm.SerDe.fromTuple2RDD(tupleFeatures).toJavaRDD()
     pickled = ctx._jvm.PythonRDD.javaToPython(javaFeatures)
     return RDD(pickled, ctx, AutoBatchedSerializer(PickleSerializer()))
from pyspark import SparkContext
from pyspark import SparkConf
import sys
from pyspark.rdd import RDD

def update_dictionary(map):
        """Set the key ``test`` to ``"alex"`` in ``map`` and return the same object."""
        map["test"] = "alex"
        return map

if __name__ == "__main__":
        # Smoke-test driver: load every Aleph2 RDD input, tag each record via
        # update_dictionary and emit the result. The "hereN" prints are
        # progress markers.
        print("here1")
        conf = SparkConf()
        sc = SparkContext(appName="alex_test_app")
        print("here2")
        # %-formatting instead of "+": conf.get() returns None for a missing
        # key, and str + None would raise TypeError.
        print("here2b: %s" % conf.get("spark.aleph2_job_config"))
        # The helper class is loaded through the JVM context class loader.
        # sys.argv[1] is handed to getAleph2 as-is.
        class_loader = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader()
        utils_class = class_loader.loadClass(
                "com.ikanow.aleph2.analytics.spark.utils.SparkPyTechnologyUtils")
        aleph2 = utils_class.newInstance().getAleph2(sc._jsc, sys.argv[1])
        print("here3")
        # Function-call form so the line parses on Python 2 and 3 alike.
        print(aleph2.getRddInputNames())
        print("here4")
        #print RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()), sc).count()
        print("here5")
        all_inputs = RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()), sc)
        to_output = all_inputs.map(update_dictionary)
        aleph2.emitRdd(to_output._to_java_object_rdd())
Пример #19
0
 def _checkpointFile(self, name, input_deserializer):
     """Wrap the Java checkpoint file ``name`` in an RDD read with ``input_deserializer``."""
     return RDD(self._jsc.checkpointFile(name), self, input_deserializer)
Пример #20
0
 def _checkpointFile(self, name):
     """Wrap the Java checkpoint file ``name`` in an RDD using RDD's default deserializer."""
     return RDD(self._jsc.checkpointFile(name), self)
Пример #21
0
 def __init__(self, jrdd, ctx, jrdd_deserializer):
     """Emit the Kafka 0.8 deprecation warning, then initialise the RDD base."""
     message = (
         "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
         "See SPARK-21893."
     )
     warnings.warn(message, DeprecationWarning)
     RDD.__init__(self, jrdd, ctx, jrdd_deserializer)
Пример #22
0
def import_csv(path, delimiter=",", header=False, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns
                   (unless a schema is provided), and not be included in the data.  The default value is false.
    :param schema: (Optional(list[tuple(str, type)] or list[str])) There are different options for specifying a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type)
    * Provide the column names as a list of strings.  Column data types will be inferred, based on the data.  The column names specified will override column names that are found in the header row.
    * None, where the schema is automatically inferred based on the data.  Columns are named based on the header, or will be named generically ("C0", "C1", "C2", etc).

    :param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
                        specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
    :return: (Frame) Frame that contains the data from the csv file

    Raises TypeError when the schema is not a list, when a schema column uses an unsupported type, or when a
    loaded column has an unsupported data type.  Raises ValueError when the number of columns in a custom schema
    does not match the number of columns loaded from the csv file.

    Examples
    --------

    Load a frame from a csv file by specifying the path to the file, delimiter

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]

    The schema parameter can be used to specify a custom schema (column names and data types) or column names (and the
    data types are inferred based on the data).  Here, we will specify the column names, which will override the
    header from the csv file.

        >>> column_names = ["Rank", "City", "2013", "2010", "Percent_Change", "County"]
        >>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=column_names)
        -etc-

        >>> frame.schema
        [('Rank', <type 'int'>), ('City', <type 'str'>), ('2013', <type 'int'>), ('2010', <type 'int'>), ('Percent_Change', <type 'str'>), ('County', <type 'str'>)]

        <hide>
        >>> file_path = "../datasets/unicode.csv"
        >>> schema = [("a", unicode),("b", unicode),("c",unicode)]
        >>> frame = tc.frame.import_csv(file_path, schema=schema, header=False)
        -etc-

        >>> print unicode(frame.get_inspect()).encode('utf-8')  # because this file is UT8 and this docstring is str
        [#]  a  b  c
        ============
        [0]  à  ë  ñ
        [1]  ã  ê  ü

        </hide>

    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")
    require_type(str, datetime_format, "datetime_format")

    infer_schema = True
    column_names = []   # custom column names

    if schema is not None:
        if not isinstance(schema, list):
            raise TypeError("Unsupported type %s for schema parameter." % type(schema))
        elif all(isinstance(item, basestring) for item in schema):
            # schema is just column names
            column_names = schema
            schema = None
        else:
            infer_schema = False   # if a custom schema is provided, don't waste time inferring the schema during load
            sparktk_schema.validate(schema)

    # The csv reader options are passed as lower-case strings.
    header_str = str(header).lower()
    infer_schema_str = str(infer_schema).lower()
    pyspark_schema = None

    if schema is not None:
        # Translate the (name, type) schema into a pyspark StructType.
        fields = []
        for column in schema:
            if column[1] in dtypes._data_type_to_pyspark_type_table:
                fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True))
            else:
                raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0]))
        pyspark_schema = StructType(fields)

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=header_str,
            dateformat=datetime_format,
            inferschema=infer_schema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        # Build the frame schema from what was loaded; custom column names
        # (if any) take precedence over the loaded names, position by position.
        for i, column in enumerate(df.schema.fields):
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
            except ValueError:
                raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
            column_name = column_names[i] if (i < len(column_names)) else column.name
            df_schema.append((column_name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if df_column_count != custom_column_count:
            # Trailing space after "the" keeps the two literals from fusing
            # into "thenumber".
            raise ValueError("Bad schema value.  The number of columns in the custom schema ({0}) must match the "
                             "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame uses datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = df.rdd.map(cast_datetime)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
Пример #23
0
 def predictAll(self, usersProducts):
     """Run the Java model's ``predict`` on ``usersProducts`` and return an RDD read with ``RatingDeserializer``."""
     serializedPairs = _get_unmangled_rdd(usersProducts, _serialize_tuple)
     javaPredictions = self._java_model.predict(serializedPairs._jrdd)
     return RDD(javaPredictions, self._context, RatingDeserializer())
Пример #24
0
 def call(self, jrdd, time):
     """Wrap the JavaRDD in a Python RDD and pass it, with ``time``, to the user function."""
     wrapped = RDD(jrdd, self.ctx, self.deserializer)
     # The return value of the user-defined function is ignored.
     self.func(wrapped, time)
Пример #25
0
    def pcy_for_rdd(baskets: RDD, support_threshold_total=support_threshold_total) -> list:
        """
        Collect the frequent itemsets of every size from ``baskets``, level by level.

        Single items are counted first. Then, for size 2, 3, ..., candidates
        come from ``generate_combination_with_filter`` and are kept when
        their count reaches ``support_threshold_total``. The loop stops at
        the first size that yields no frequent itemset.

        NOTE: an earlier hash-bucket / bitmap pass for pairs was present only
        as commented-out code and is not performed here.

        :param baskets: RDD whose elements are iterables of items.
            NOTE(review): an original comment states baskets are already
            sorted -- confirm against the caller.
        :param support_threshold_total: minimum count for an itemset to be kept.
        :return: list of lists; entry 0 holds the sorted frequent single
            items, each later entry the sorted frequent itemsets (tuples) of
            the next size.
        """

        def check_all_subsets_frequent(itemset: list, frequent_itemsets_dict: dict) -> bool:
            '''
            For example, given a triple ['2', '1', '8'], check if all its subsets ['2', '1'], ['2', '8'], ['1', '8']
            are frequent items.

            NOTE(review): this helper is not called anywhere inside pcy_for_rdd.

            :param itemset: list of items
            :param frequent_itemsets_dict: dict keyed by the frequent itemsets found so far
            :return: True when every subset obtained by dropping one item is a key of the dict
            '''
            for i in range(len(itemset)):
                subset = itemset.copy()
                subset.pop(i)
                try:
                    # No need to sort the subset: the basket is already sorted.
                    _ = frequent_itemsets_dict[tuple(subset)]
                except KeyError:
                    return False
            return True

        singleton_counts = (
            baskets
            .flatMap(lambda basket: [(item, 1) for item in basket])
            .reduceByKey(lambda x, y: x + y)
            .filter(lambda pair: pair[1] >= support_threshold_total)
        )
        frequent_itemsets_dict = dict(singleton_counts.collect())
        frequent_itemsets_list = [sorted(frequent_itemsets_dict)]
        del singleton_counts
        gc.collect()

        current_itemset_size = 2
        while True:
            # The RDD is consumed by exactly one action below, so it is not
            # persisted: a cached copy would never be reused nor released.
            frequent_itemsets = (
                baskets
                .flatMap(lambda basket: generate_combination_with_filter(
                    basket, frequent_itemsets_dict, current_itemset_size))
                .map(lambda itemset: (tuple(itemset), 1))
                .reduceByKey(lambda x, y: x + y)
                .filter(lambda pair: pair[1] >= support_threshold_total)
            )
            current_size_frequent_itemsets = sorted(frequent_itemsets.keys().collect())
            if not current_size_frequent_itemsets:
                break

            frequent_itemsets_list.append(current_size_frequent_itemsets)
            # Only membership matters for the next level, so the new itemsets
            # are stored with None values.
            frequent_itemsets_dict.update(dict.fromkeys(current_size_frequent_itemsets))
            current_itemset_size += 1
            del frequent_itemsets
            del current_size_frequent_itemsets
            gc.collect()

        gc.collect()
        return frequent_itemsets_list
Пример #26
0
	def loadAsAvro(self, name, deserializer, minParitions = None):
		"""Wrap ``parquetFileAsAvro(name)`` in an RDD read with ``deserializer``.

		``minParitions`` is accepted but not used by this method.
		"""
		avro_records = self._pc.parquetFileAsAvro(name)
		return RDD(avro_records, self._sc, deserializer)
Пример #27
0
	def loadAsJson(self, name, minPartitions = None):
		"""Load ``parquetFileAsJSON(name)`` and parse every record with ``json.loads``.

		``minPartitions`` is accepted but not used by this method.
		"""
		json_text = RDD(self._pc.parquetFileAsJSON(name), self._sc, MUTF8Deserializer())
		return json_text.map(lambda record: json.loads(record))
Пример #28
0
	def loadADAMRecords(self, path):
		"""Wrap ``loadADAMRecords(path)`` in an RDD decoded with the ADAM record Avro schema."""
		java_records = self._pc.loadADAMRecords(path)
		return RDD(java_records, self._sc, AvroDeserializer(self._adam_record_schema))
Пример #29
0
def remove_stop_words(rdd_in: RDD) -> RDD:
    """
    Drop every element of ``rdd_in`` that appears in the stop-word file.

    The file named by ``STOP_WORDS_FILENAME`` is read once; each
    newline-separated line is one stop word.

    :param rdd_in: RDD of words (elements must be hashable, e.g. strings)
    :return: RDD without the stop words
    """
    # The with-block closes the file; the handle was previously left open.
    with open(STOP_WORDS_FILENAME, "r") as stop_words_file:
        # A set gives O(1) membership tests in the per-element filter.
        stop_words = frozenset(stop_words_file.read().split("\n"))
    return rdd_in.filter(lambda word: word not in stop_words)
Пример #30
0
def into_words(rdd_in: RDD) -> RDD:
    """
    Split each element on single spaces, keep tokens longer than one
    character, and strip commas and parentheses from what remains.

    :param rdd_in: RDD of strings
    :return: RDD of cleaned tokens
    """
    tokens: RDD = rdd_in.flatMap(lambda line: line.split(' ')).filter(
        lambda token: len(token) > 1)
    # The former pattern ',()' is a comma followed by an empty group, so it
    # removed commas only. A character class strips all three characters.
    # NOTE(review): inferred intent -- confirm parentheses should be removed.
    return tokens.map(lambda token: re.sub(r"[,()]", "", token))
Пример #31
0
 def func(sc, *a, **kw):
     """Call the wrapped ``f`` and expose its Java RDD result as a pickled Python RDD."""
     javaResult = f(sc, *a, **kw)
     pickled = sc._jvm.SerDe.javaToPython(javaResult)
     return RDD(pickled, sc, BatchedSerializer(PickleSerializer(), 1024))
Пример #32
0
 def __init__(self, jrdd, ctx, jrdd_deserializer):
     """Initialise the RDD base class with the given Java RDD, context and deserializer."""
     RDD.__init__(
         self,
         jrdd,
         ctx,
         jrdd_deserializer,
     )
Пример #33
0
 def _rdd(self):
     """Return the pickled Python RDD for this object, creating and caching it on first use."""
     cached = self._lazy_rdd
     if cached is None:
         cached = RDD(self._jdf.javaToPython(), self._sc,
                      BatchedSerializer(PickleSerializer()))
         self._lazy_rdd = cached
     return cached
Пример #34
0
 def getRddInput(self, sc, name):
     """Fetch the Aleph2 RDD input called ``name`` and wrap it as a Python RDD."""
     javaInput = sc._jvm.SerDe.javaToPython(self.aleph2.getRddInput(name))
     return RDD(javaInput, sc)
Пример #35
0
def read_to_geometry_rdd(sc, input_path):
    """
    Wrap the result of GeoSpark's ``ShapefileReader.normalrdd`` in an RDD.

    NOTE(review): ``input_path`` is never used - only ``sc._jsc`` is passed
    to the JVM call. Confirm whether the path should be forwarded.

    :param sc: Spark context providing ``_jvm`` and ``_jsc``.
    :param input_path: currently unused.
    :return: RDD bound to ``sc``.
    """
    shapefile_reader = sc._jvm.org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileReader
    return RDD(shapefile_reader.normalrdd(sc._jsc), sc)
Пример #36
0
def train_distributed(
    rdd: RDD,
    torch_obj: str,
    iters: int = 10,
    partition_shuffles: int = 1,
    verbose: int = 1,
    mini_batch: int = -1,
    validation_pct: float = 0.0,
    world_size: int = 2,
    device: str = 'cpu',
    early_stop_patience: int = -1
) -> Dict:
    """
    Entry point to train the model in distributed fashion.

    A separate driver process running ``handle_model`` is started first.
    ``handle_model`` is then run over every partition of ``rdd`` once per
    round, and the first collected result of the last round is returned.
    The driver process is always terminated and joined before returning.
    The master url is obtained internally from ``retrieve_url()``.

    :param rdd: The rdd of data to run on the model.
    :param torch_obj: The torch object as a string that includes the model and param shapes.
    :param iters: Number of iterations for training.
    :param partition_shuffles: Number of training rounds; the rdd is repartitioned
        (to the same number of partitions) between consecutive rounds. Must be at least 1.
    :param verbose: Verbosity of logs
    :param mini_batch: Mini batch for each iteration of training.
    :param validation_pct: How many items to validate
    :param world_size: number of partitions.
    :param device: pytorch device
    :param early_stop_patience: Early-stopping setting forwarded to ``handle_model``.

    :return: The train dict.
    :raises ValueError: If ``partition_shuffles`` is less than 1.
    """
    # Without at least one round there is no result to return; fail before
    # spawning the driver process instead of subscripting None afterwards.
    if partition_shuffles < 1:
        raise ValueError(
            "partition_shuffles must be at least 1, but is {0}.".format(partition_shuffles)
        )

    master_url = retrieve_url()

    torch_loaded, params = load_base_torch(torch_obj)

    # Start the driver process.
    p = Process(
        target=handle_model,
        args=(-1, None, params, master_url, iters, world_size, early_stop_patience)
    )
    p.start()

    try:
        state_dict = None
        for shuffle_round in range(partition_shuffles):

            # Run model with barrier execution mode (per the original author;
            # the mapPartitionsWithIndex helper is defined elsewhere).
            # NOTE(review): the "+ 0" in early_stop_patience looks redundant; kept as-is.
            state_dict = mapPartitionsWithIndex(
                rdd, lambda index, partition: handle_model(
                    index,
                    partition,
                    torch_obj=torch_loaded,
                    master_url=master_url,
                    iters=iters,
                    verbose=verbose,
                    mini_batch=mini_batch,
                    validation_pct=validation_pct,
                    world_size=world_size,
                    device=device,
                    early_stop_patience=int(early_stop_patience+0)
                )
            ).collect()

            # Reshuffle the data between rounds, but not after the last one.
            if shuffle_round < partition_shuffles - 1:
                num_partitions = rdd.getNumPartitions()
                rdd = rdd.repartition(num_partitions)

        return state_dict[0]

    finally:
        p.terminate()
        p.join()
Пример #37
0
 def getAllRddInputs(self, sc):
     """
     Return ``self.aleph2.getAllRddInputs()`` as a Python RDD.

     The Java-side result is passed through ``sc._jvm.SerDe.javaToPython``
     and wrapped in an RDD bound to ``sc``.
     """
     java_inputs = self.aleph2.getAllRddInputs()
     python_jrdd = sc._jvm.SerDe.javaToPython(java_inputs)
     return RDD(python_jrdd, sc)
Пример #38
0
from pyspark import SparkContext
from pyspark import SparkConf
import sys
from pyspark.rdd import RDD


def update_dictionary(map):
    """Set the key "test" to "alex" on the given mapping in place and return that same mapping."""
    map["test"] = "alex"
    return map


if __name__ == "__main__":
    # Test driver: reads every Aleph2 RDD input, tags each record via
    # update_dictionary and emits the result back through Aleph2.
    # The "hereN" prints are progress markers. Every print uses the
    # single-argument call form so the script parses on Python 2 and 3.
    print("here1")
    conf = SparkConf()
    sc = SparkContext(appName="alex_test_app")
    print("here2")
    # %-formatting keeps this trace line from raising TypeError when the key
    # is unset and conf.get() returns None.
    print("here2b: %s" % conf.get("spark.aleph2_job_config"))
    # Instantiate the Java helper through the context class loader and build
    # the Aleph2 handle from the first command-line argument.
    aleph2 = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader(
    ).loadClass(
        "com.ikanow.aleph2.analytics.spark.utils.SparkPyTechnologyUtils"
    ).newInstance().getAleph2(sc._jsc, sys.argv[1])
    print("here3")
    print(aleph2.getRddInputNames())
    print("here4")
    print("here5")
    to_output = RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()),
                    sc).map(lambda m: update_dictionary(m))
    aleph2.emitRdd(to_output._to_java_object_rdd())
Пример #39
0
 def m_o(ctx: SparkContext, data: pr.RDD) -> None:
     """Assert that ``ctx`` is a SparkContext and that ``data`` collects to exactly one element."""
     assert isinstance(ctx, SparkContext)
     collected = data.collect()
     assert len(collected) == 1
Пример #40
0
 def emptyRDD(self):
     """
     Return an RDD with no partitions and no elements.

     The RDD wraps ``self._jsc.emptyRDD()`` and uses a ``NoOpSerializer``.
     """
     empty_jrdd = self._jsc.emptyRDD()
     return RDD(empty_jrdd, self, NoOpSerializer())
Пример #41
0
def import_csv(path,
               delimiter=",",
               header=False,
               infer_schema=True,
               schema=None,
               tc=TkContext.implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns,
                   and not be included in the data.  The default value is false.
    :param infer_schema: (Optional[bool]) Boolean value indicating if the column types will be automatically inferred.
                       It requires one extra pass over the data and is true by default.  It is forced to false
                       when a schema is provided.
    :param schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset.  Number of
                    columns specified in the schema must match the number of columns in the csv file provided.
    :param tc: (Optional[TkContext]) Context used to read the file and build the frame; it is checked with
               TkContext.validate.
    :return: (Frame) Frame that contains the data from the csv file
    :raises ValueError: if header or infer_schema is not a boolean, or if the number of columns in the custom
                        schema does not match the number of columns read from the file
    :raises TypeError: if the custom schema, or the schema read from the file, contains an unsupported type

    Examples
    --------
    Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that
    there is a header and to infer the schema based on the data.

        >>> file_path = "../integration-tests/datasets/cities.csv"

        >>> frame = tc.load_frame_from_csv(file_path, "|", header=True, infer_schema=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', int),
         ('city', str),
         ('population_2013', int),
         ('population_2010', int),
         ('change', str),
         ('county', str)]
    """

    if schema is not None:
        infer_schema = False  # if a custom schema is provided, don't waste time inferring the schema during load
    if not isinstance(header, bool):
        raise ValueError(
            "header parameter must be a boolean, but is {0}.".format(
                type(header)))
    if not isinstance(infer_schema, bool):
        raise ValueError(
            "infer_schema parameter must be a boolean, but is {0}.".format(
                type(infer_schema)))
    TkContext.validate(tc)

    # The csv reader options are passed as lower-case strings ("true"/"false").
    header_str = str(header).lower()
    infer_schema_str = str(infer_schema).lower()
    pyspark_schema = None

    # Translate the custom schema, if any, into a pyspark StructType.
    # (infer_schema is already False whenever schema is not None.)
    if schema is not None:
        fields = []
        for column in schema:
            if column[1] in dtypes._data_type_to_pyspark_type_table:
                fields.append(
                    StructField(
                        column[0],
                        dtypes._data_type_to_pyspark_type_table[column[1]],
                        True))
            else:
                raise TypeError(
                    "Unsupported type {0} in schema for column {1}.".format(
                        column[1], column[0]))
        pyspark_schema = StructType(fields)

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=header_str,
            dateformat="yyyy-MM-dd'T'HH:mm:ss.SSSX",
            inferschema=infer_schema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        # No custom schema: derive the frame schema from the data frame's own column types.
        for column in df.schema.fields:
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(
                    type(column.dataType))
            except ValueError:
                raise TypeError(
                    "Unsupported data type ({0}) for column {1}.".format(
                        str(column.dataType), column.name))
            df_schema.append((column.name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if df_column_count != custom_column_count:
            # The trailing space after "the" keeps the two string pieces from running together.
            raise ValueError(
                "Bad schema value.  The number of columns in the custom schema ({0}) must match the "
                "number of columns in the csv file data ({1}).".format(
                    custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame uses datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index, column in enumerate(df_schema):
            value = row[column_index]
            if column[1] == dtypes.datetime and isinstance(value, datetime):
                data.append(long(dtypes.datetime_to_ms(value)))
            else:
                data.append(value)
        return data

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = df.rdd.map(cast_datetime)
    else:
        # Only build the Java-to-Python RDD when it is actually the one returned.
        jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(
            df._jdf.rdd())
        rdd = RDD(jrdd, tc.sc)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
Пример #42
0
    def son(baskets: RDD, support_threshold_total=support_threshold_total) -> list:
        """
        Find frequent itemsets in two stages.

        Stage one mines each partition locally with a proportionally scaled
        support threshold, and the union of the local results becomes the
        candidate set. Stage two counts those candidates over all baskets and
        keeps the ones that reach ``support_threshold_total``.

        The helpers ``generate_combination``, ``generate_combination_with_filter``,
        ``hash_pair`` and ``qualified_as_candidate_pair`` are defined elsewhere.

        :param baskets: RDD of baskets, each an iterable of hashable items.
        :param support_threshold_total: minimum count over all baskets.
        :return: list of sorted lists; index 0 holds the frequent single items,
                 each following entry holds the frequent itemsets (as tuples)
                 of the next larger size.
        """

        def pcy_for_list(partition: list, support_threshold_total=support_threshold_total) -> dict:
            """
            Mine one partition and return a dict whose keys are its locally
            frequent itemsets (single items as-is, larger itemsets as tuples).
            """
            num_baskets_chunk = len(partition)
            if num_baskets_chunk == 0:
                # An empty partition yields no candidates. Returning here gives
                # the same result as running the passes below and also avoids
                # dividing by zero when the whole RDD is empty.
                return {}
            # Scale the global threshold by this partition's share of the baskets.
            support_threshold = math.ceil(support_threshold_total * num_baskets_chunk / num_baskets)

            # first pass: count single items and hashed pair buckets
            singleton_counts = {}
            bucket_counts = {}
            for basket in partition:
                for item in basket:
                    singleton_counts[item] = singleton_counts.get(item, 0) + 1

                pairs = generate_combination(basket, size=2)
                for pair in pairs:
                    key = hash_pair(pair)
                    bucket_counts[key] = bucket_counts.get(key, 0) + 1

            # Turn the bucket counts into a 0/1 bitmap in place (keys are unchanged).
            for key, value in bucket_counts.items():
                bucket_counts[key] = 1 if value >= support_threshold else 0

            frequent_itemsets = {}
            for key, value in singleton_counts.items():
                if value >= support_threshold:
                    frequent_itemsets[key] = None  # store all frequent singletons
            del singleton_counts
            gc.collect()

            # second pass: count only the pairs accepted by qualified_as_candidate_pair
            itemset_counts = {}
            for basket in partition:
                pairs = generate_combination(basket, size=2)
                for pair in pairs:
                    if qualified_as_candidate_pair(pair, frequent_itemsets, bitmap=bucket_counts):
                        key = tuple(pair)
                        itemset_counts[key] = itemset_counts.get(key, 0) + 1

            for key, value in itemset_counts.items():
                if value >= support_threshold:
                    frequent_itemsets[key] = None  # store all frequent pairs
            del itemset_counts
            gc.collect()

            # more passes for larger-size itemsets, until a pass adds nothing new
            size = 3
            num_frequent_itemsets = len(frequent_itemsets)
            while True:
                itemset_counts = {}
                for basket in partition:
                    itemsets = generate_combination_with_filter(basket, frequent_itemsets, size)
                    for itemset in itemsets:
                        key = tuple(itemset)
                        itemset_counts[key] = itemset_counts.get(key, 0) + 1

                for key, value in itemset_counts.items():
                    if value >= support_threshold:
                        frequent_itemsets[key] = None  # store all frequent itemsets of this size
                del itemset_counts
                gc.collect()

                current_num_frequent_itemsets = len(frequent_itemsets)
                if current_num_frequent_itemsets == num_frequent_itemsets:  # no more new frequent itemsets
                    break

                num_frequent_itemsets = current_num_frequent_itemsets
                size += 1

            return frequent_itemsets

        # First stage: union of the locally frequent itemsets of every partition.
        # mapPartitions iterates the returned dict, so only its keys are emitted.
        num_baskets = baskets.count()
        candidate_itemsets = dict.fromkeys(baskets.mapPartitions(lambda _: pcy_for_list(list(_), support_threshold_total)).distinct().collect(), 0)

        # Second stage: count the candidates over all baskets.
        def qualified_as_candidate_itemset(itemset):
            """Return True when ``itemset`` is one of the stage-one candidates."""
            try:
                return itemset in candidate_itemsets
            except TypeError:
                # An unhashable value cannot be a candidate key.
                return False

        singleton_counts = baskets.\
            flatMap(lambda basket: basket).\
            filter(lambda item: qualified_as_candidate_itemset(item)).\
            map(lambda _: (_, 1)).\
            reduceByKey(lambda x,y: x+y).\
            filter(lambda pair: pair[1] >= support_threshold_total).keys().collect()
        frequent_itemsets = [sorted(singleton_counts)]
        del singleton_counts
        gc.collect()

        # Grow the itemset size until a size yields no frequent itemsets.
        size = 2
        while True:
            frequent_itemsets_for_particular_size = baskets.\
                flatMap(lambda _: generate_combination_with_filter(_, candidate_itemsets, size)).\
                filter(lambda _: qualified_as_candidate_itemset(tuple(_))).\
                map(lambda _: (tuple(_), 1)).\
                reduceByKey(lambda x,y: x+y).\
                filter(lambda pair: pair[1] >= support_threshold_total).keys().collect()
            if not frequent_itemsets_for_particular_size:
                break

            frequent_itemsets.append(sorted(frequent_itemsets_for_particular_size))
            size += 1

            del frequent_itemsets_for_particular_size
            gc.collect()

        return frequent_itemsets
Пример #43
0
 def readNonPartitionTable(self, project, table, numPartitions, cols=None, bytesCols=None, batchSize=1):
     """
     Read a table through ``self._api.readTable`` and return it as an RDD.

     :param project: forwarded to ``readTable``
     :param table: forwarded to ``readTable``
     :param numPartitions: forwarded to ``readTable``
     :param cols: column names, converted with ``self._to_java_array``;
                  ``None`` (the default) is treated as an empty list
     :param bytesCols: handled the same way as ``cols``
     :param batchSize: forwarded to ``readTable`` (default 1)
     :return: RDD on ``self._sc`` using a ``PickleSerializer``
     """
     # None sentinels replace the former shared mutable list defaults.
     jcols = self._to_java_array([] if cols is None else cols)
     jbytesCols = self._to_java_array([] if bytesCols is None else bytesCols)
     jrdd = self._api.readTable(project, table, jcols, jbytesCols, batchSize, numPartitions)
     return RDD(jrdd, self._sc, PickleSerializer())