def loadLabeledPoints(sc, path, minPartitions=None):
    """
    Load labeled points that were saved with RDD.saveAsTextFile.

    @param sc: Spark context
    @param path: file or directory path in any Hadoop-supported file
                 system URI
    @param minPartitions: min number of partitions; a falsy value is
                          replaced by min(sc.defaultParallelism, 2)
    @return: labeled data stored as an RDD of LabeledPoint

    >>> from tempfile import NamedTemporaryFile
    >>> from pyspark.mllib.util import MLUtils
    >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                    LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.close()
    >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
    >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
    >>> type(loaded[0]) == LabeledPoint
    True
    >>> print examples[0]
    (1.1,(3,[0,2],[-1.23,4.56e-07]))
    >>> type(examples[1]) == LabeledPoint
    True
    >>> print examples[1]
    (0.0,[1.01,2.02,3.03])
    """
    if not minPartitions:
        minPartitions = min(sc.defaultParallelism, 2)
    api = sc._jvm.PythonMLLibAPI()
    jSerialized = api.loadLabeledPoints(sc._jsc, path, minPartitions)
    # The JVM side hands back raw byte records, hence the no-op serializer;
    # each record is decoded on the Python side.
    raw_points = RDD(jSerialized, sc, NoOpSerializer())
    return raw_points.map(
        lambda raw: _deserialize_labeled_point(bytearray(raw)))
def __init__(self, ctx, resource_read=None, query=None, **kwargs):
    """
    Build the RDD from the JSON RDD returned by ``helper(ctx).esJsonRDD``.

    The extra keyword arguments, together with ``resource_read`` and
    ``query``, are turned into a configuration by ``make_es_config`` and
    converted to a Java object before being handed to the helper.
    """
    es_config = make_es_config(kwargs, resource_read=resource_read, query=query)
    java_config = as_java_object(ctx._gateway, es_config)
    json_jrdd = helper(ctx).esJsonRDD(ctx._jsc, java_config)
    flat_rdd = RDD(json_jrdd, ctx, NoOpSerializer())

    def pairwise(iterable):
        # Read in batches of two (first key, then value / doc): zipping one
        # shared iterator with itself consumes two consecutive elements
        # per emitted tuple.
        shared = iter(iterable)
        return izip(shared, shared)

    paired_rdd = flat_rdd.mapPartitions(pairwise, True)
    super(EsRDD, self).__init__(paired_rdd._jrdd, ctx)
def rdd(self):
    """Returns the content as an :class:`pyspark.RDD` of :class:`Row`.

    The RDD is built on first access and cached in ``self._lazy_rdd``;
    later calls return the cached object.
    """
    if self._lazy_rdd is not None:
        return self._lazy_rdd

    pickled_jrdd = self._jdf.javaToPython()
    pickled = RDD(pickled_jrdd, self.sql_ctx._sc,
                  BatchedSerializer(PickleSerializer()))
    schema = self.schema

    def applySchema(it):
        # Build the row class once per partition, then wrap every record.
        row_cls = _create_cls(schema)
        return map(row_cls, it)

    self._lazy_rdd = pickled.mapPartitions(applySchema)
    return self._lazy_rdd
def rdd(self):
    """
    Return the content of the :class:`DataFrame` as an :class:`RDD`
    of :class:`Row` s.

    The result is stored on the instance as ``_lazy_rdd`` the first time
    it is computed; the attribute's presence marks it as already built.
    """
    if hasattr(self, '_lazy_rdd'):
        return self._lazy_rdd

    pickled_jrdd = self._jdf.javaToPython()
    pickled = RDD(pickled_jrdd, self.sql_ctx._sc,
                  BatchedSerializer(PickleSerializer()))
    schema = self.schema

    def applySchema(it):
        # Build the row class once per partition, then wrap lazily.
        row_cls = _create_cls(schema)
        return itertools.imap(row_cls, it)

    self._lazy_rdd = pickled.mapPartitions(applySchema)
    return self._lazy_rdd
def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of vectors containing i.i.d. samples drawn
    from the standard normal distribution.

    >>> import numpy as np
    >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect())
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - 0.0) < 0.1
    True
    >>> abs(mat.std() - 1.0) < 0.1
    True
    """
    api = sc._jvm.PythonMLLibAPI()
    jrdd = api.normalVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
    # Records arrive as raw bytes; decode each into a vector on the Python side.
    raw_vectors = RDD(jrdd, sc, NoOpSerializer())
    return raw_vectors.map(
        lambda raw: _deserialize_double_vector(bytearray(raw)))
def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of vectors containing i.i.d. samples drawn
    from the uniform distribution U(0.0, 1.0).

    >>> import numpy as np
    >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
    >>> mat.shape
    (10, 10)
    >>> mat.max() <= 1.0 and mat.min() >= 0.0
    True
    >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
    4
    """
    api = sc._jvm.PythonMLLibAPI()
    jrdd = api.uniformVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
    # Records arrive as raw bytes; decode each into a vector on the Python side.
    raw_vectors = RDD(jrdd, sc, NoOpSerializer())
    return raw_vectors.map(
        lambda raw: _deserialize_double_vector(bytearray(raw)))
def poissonRDD(sc, mean, size, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of i.i.d samples from the Poisson
    distribution with the input mean.

    >>> mean = 100.0
    >>> x = RandomRDDGenerators.poissonRDD(sc, mean, 1000, seed=1L)
    >>> stats = x.stats()
    >>> stats.count()
    1000L
    >>> abs(stats.mean() - mean) < 0.5
    True
    >>> from math import sqrt
    >>> abs(stats.stdev() - sqrt(mean)) < 0.5
    True
    """
    api = sc._jvm.PythonMLLibAPI()
    jrdd = api.poissonRDD(sc._jsc, mean, size, numPartitions, seed)
    # Records arrive as raw bytes; decode each into a double on the Python side.
    raw_samples = RDD(jrdd, sc, NoOpSerializer())
    return raw_samples.map(lambda raw: _deserialize_double(bytearray(raw)))
def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of vectors containing i.i.d. samples drawn
    from the Poisson distribution with the input mean.

    >>> import numpy as np
    >>> mean = 100.0
    >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
    >>> mat = np.mat(rdd.collect())
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - mean) < 0.5
    True
    >>> from math import sqrt
    >>> abs(mat.std() - sqrt(mean)) < 0.5
    True
    """
    api = sc._jvm.PythonMLLibAPI()
    jrdd = api.poissonVectorRDD(sc._jsc, mean, numRows, numCols, numPartitions, seed)
    # Records arrive as raw bytes; decode each into a vector on the Python side.
    raw_vectors = RDD(jrdd, sc, NoOpSerializer())
    return raw_vectors.map(
        lambda raw: _deserialize_double_vector(bytearray(raw)))
def normalRDD(sc, size, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of i.i.d samples from the standard normal
    distribution.

    To transform the distribution in the generated RDD from standard normal
    to some other normal N(mean, sigma), use
    C{RandomRDDGenerators.normal(sc, n, p, seed)\
      .map(lambda v: mean + sigma * v)}

    >>> x = RandomRDDGenerators.normalRDD(sc, 1000, seed=1L)
    >>> stats = x.stats()
    >>> stats.count()
    1000L
    >>> abs(stats.mean() - 0.0) < 0.1
    True
    >>> abs(stats.stdev() - 1.0) < 0.1
    True
    """
    api = sc._jvm.PythonMLLibAPI()
    jrdd = api.normalRDD(sc._jsc, size, numPartitions, seed)
    # Records arrive as raw bytes; decode each into a double on the Python side.
    raw_samples = RDD(jrdd, sc, NoOpSerializer())
    return raw_samples.map(lambda raw: _deserialize_double(bytearray(raw)))
def asDataFrames(self, *index_by):
    '''
    Reads the spanned rows as DataFrames if pandas is available, or as
    a dict of numpy arrays if only numpy is available or as a dict with
    primitives and objects otherwise.

    @param index_by If pandas is available, the dataframes will be
        indexed by the given columns.
    @raise ValueError If one of the index columns is also one of the
        columns by which the rows are spanned (``self.columns``).
    '''
    for c in index_by:
        if c in self.columns:
            # Interpolate the offending column; (c,) keeps tuple-valued
            # column identifiers from being unpacked by the % operator.
            raise ValueError('column %s cannot be used as index in the data'
                             'frames as it is a column by which the rows are spanned.'
                             % (c,))

    columns = as_java_array(self.ctx._gateway, "String", (str(c) for c in self.columns))
    jrdd = self._helper.spanBy(self._cjrdd, columns)
    rdd = RDD(jrdd, self.ctx)

    global pd
    if index_by and pd:
        # Pass the index columns as ONE list: unpacking them positionally
        # would feed the second and later names into set_index's `drop` /
        # `append` / ... parameters instead of using them as keys.
        index_columns = [str(c) for c in index_by]
        return rdd.mapValues(lambda frame: frame.set_index(index_columns))

    return rdd
def createRDD(sc, kafkaParams, offsetRanges, leaders=None,
              keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
    """
    .. note:: Experimental

    Create a RDD from Kafka using offset ranges for each topic and partition.

    :param sc:  SparkContext object
    :param kafkaParams: Additional params for Kafka
    :param offsetRanges:  list of offsetRange to specify topic:partition:[start, end) to consume
    :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges.  May be an empty
        map (or None, the default), in which case leaders will be looked up on the driver.
    :param keyDecoder:  A function used to decode key (default is utf8_decoder)
    :param valueDecoder:  A function used to decode value (default is utf8_decoder)
    :return: A RDD object
    :raises TypeError: if kafkaParams is not a dict or offsetRanges is not a list
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if leaders is None:
        leaders = {}
    if not isinstance(kafkaParams, dict):
        raise TypeError("kafkaParams should be dict")
    if not isinstance(offsetRanges, list):
        raise TypeError("offsetRanges should be list")

    try:
        helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
        helper = helperClass.newInstance()
        joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges]
        jleaders = dict([(k._jTopicAndPartition(helper),
                          v._jBroker(helper)) for (k, v) in leaders.items()])
        jrdd = helper.createRDD(sc._jsc, kafkaParams, joffsetRanges, jleaders)
    except Py4JJavaError as e:
        if 'ClassNotFoundException' in str(e.java_exception):
            KafkaUtils._printErrorMsg(sc)
        # Bare raise re-raises the same exception with its original traceback.
        raise

    # Keys and values both arrive as raw bytes and are decoded by the
    # caller-supplied decoders.
    ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
    rdd = RDD(jrdd, sc, ser)
    return rdd.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
def uniformRDD(sc, size, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of i.i.d. samples from the
    uniform distribution on [0.0, 1.0].

    To transform the distribution in the generated RDD from U[0.0, 1.0]
    to U[a, b], use
    C{RandomRDDGenerators.uniformRDD(sc, n, p, seed)\
      .map(lambda v: a + (b - a) * v)}

    >>> x = RandomRDDGenerators.uniformRDD(sc, 100).collect()
    >>> len(x)
    100
    >>> max(x) <= 1.0 and min(x) >= 0.0
    True
    >>> RandomRDDGenerators.uniformRDD(sc, 100, 4).getNumPartitions()
    4
    >>> parts = RandomRDDGenerators.uniformRDD(sc, 100, seed=4).getNumPartitions()
    >>> parts == sc.defaultParallelism
    True
    """
    api = sc._jvm.PythonMLLibAPI()
    jrdd = api.uniformRDD(sc._jsc, size, numPartitions, seed)
    # Records arrive as raw bytes; decode each into a double on the Python side.
    raw_samples = RDD(jrdd, sc, NoOpSerializer())
    return raw_samples.map(lambda raw: _deserialize_double(bytearray(raw)))
def sequenceFile(self, path, keyClass=None, valueClass=None, keyConverter=None,
                 valueConverter=None, minSplits=None, batchSize=0):
    """
    Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS,
    a local file system (available on all nodes), or any Hadoop-supported file system URI.
    The mechanism is as follows:

        1. A Java RDD is created from the SequenceFile or other InputFormat, and the key
           and value Writable classes
        2. Serialization is attempted via Pyrolite pickling
        3. If this fails, the fallback is to call 'toString' on each key and value
        4. C{PickleSerializer} is used to deserialize pickled objects on the Python side

    :param path: path to sequencefile
    :param keyClass: fully qualified classname of key Writable class
           (e.g. "org.apache.hadoop.io.Text")
    :param valueClass: fully qualified classname of value Writable class
           (e.g. "org.apache.hadoop.io.LongWritable")
    :param keyConverter:
    :param valueConverter:
    :param minSplits: minimum splits in dataset
           (default min(2, sc.defaultParallelism))
    :param batchSize: The number of Python objects represented as a single
           Java object. (default 0, choose batchSize automatically)
    """
    if not minSplits:
        minSplits = min(self.defaultParallelism, 2)
    read_sequence_file = self._jvm.PythonRDD.sequenceFile
    jrdd = read_sequence_file(self._jsc, path, keyClass, valueClass,
                              keyConverter, valueConverter, minSplits, batchSize)
    return RDD(jrdd, self)
def hadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConverter=None,
               valueConverter=None, conf=None, batchSize=0):
    """
    Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS,
    a local file system (available on all nodes), or any Hadoop-supported file system URI.
    The mechanism is the same as for sc.sequenceFile.

    A Hadoop configuration can be passed in as a Python dict. This will be converted into a
    Configuration in Java.

    :param path: path to Hadoop file
    :param inputFormatClass: fully qualified classname of Hadoop InputFormat
           (e.g. "org.apache.hadoop.mapred.TextInputFormat")
    :param keyClass: fully qualified classname of key Writable class
           (e.g. "org.apache.hadoop.io.Text")
    :param valueClass: fully qualified classname of value Writable class
           (e.g. "org.apache.hadoop.io.LongWritable")
    :param keyConverter: (None by default)
    :param valueConverter: (None by default)
    :param conf: Hadoop configuration, passed in as a dict
           (None by default)
    :param batchSize: The number of Python objects represented as a single
           Java object. (default 0, choose batchSize automatically)
    """
    java_conf = self._dictToJavaMap(conf)
    read_hadoop_file = self._jvm.PythonRDD.hadoopFile
    jrdd = read_hadoop_file(self._jsc, path, inputFormatClass, keyClass,
                            valueClass, keyConverter, valueConverter,
                            java_conf, batchSize)
    return RDD(jrdd, self)
def union(self, rdds):
    """
    Build the union of a list of RDDs.

    This supports unions() of RDDs with different serialized formats,
    although this forces them to be reserialized using the default
    serializer:

    >>> path = os.path.join(tempdir, "union-text.txt")
    >>> with open(path, "w") as testFile:
    ...    _ = testFile.write("Hello")
    >>> textFile = sc.textFile(path)
    >>> textFile.collect()
    [u'Hello']
    >>> parallelized = sc.parallelize(["World!"])
    >>> sorted(sc.union([textFile, parallelized]).collect())
    [u'Hello', 'World!']
    """
    expected_deserializer = rdds[0]._jrdd_deserializer
    needs_reserialize = False
    for candidate in rdds:
        if candidate._jrdd_deserializer != expected_deserializer:
            needs_reserialize = True
            break
    if needs_reserialize:
        # Mixed formats: bring every input to a common serialization first.
        rdds = [candidate._reserialize() for candidate in rdds]

    head = rdds[0]._jrdd
    tail = [other._jrdd for other in rdds[1:]]
    merged = self._jsc.union(head, tail)
    return RDD(merged, self, rdds[0]._jrdd_deserializer)
def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD.

    :param c: the local collection (or any iterable) to distribute
    :param numSlices: number of slices; a falsy value is replaced by
           ``self.defaultParallelism``
    :return: an RDD built from the pickled contents of ``c``

    >>> sc.parallelize(range(5), 5).glom().collect()
    [[0], [1], [2], [3], [4]]
    """
    numSlices = numSlices or self.defaultParallelism
    # Calling the Java parallelize() method with an ArrayList is too slow,
    # because it sends O(n) Py4J commands.  As an alternative, serialized
    # objects are written to a file and loaded through textFile().
    tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
    try:
        # Make sure we distribute data evenly if it's smaller than self.batchSize
        if "__len__" not in dir(c):
            c = list(c)    # Make it a list so we can compute its length
        batchSize = min(len(c) // numSlices, self.batchSize)
        if batchSize > 1:
            c = batched(c, batchSize)
        for x in c:
            write_with_length(dump_pickle(x), tempFile)
    finally:
        # Close the handle even when pickling/writing fails, so the file
        # descriptor is not leaked.
        tempFile.close()
    readRDDFromPickleFile = self._jvm.PythonRDD.readRDDFromPickleFile
    jrdd = readRDDFromPickleFile(self._jsc, tempFile.name, numSlices)
    return RDD(jrdd, self)
def productFeatures(self):
    """
    Return the Java model's product features as a Python RDD.

    The Java-side result is passed through ``SerDe.fromTuple2RDD`` and
    ``PythonRDD.javaToPython`` and is read back with an auto-batched
    pickle serializer.
    """
    sc = self._context
    java_features = self._java_model.productFeatures()
    java_rdd = sc._jvm.SerDe.fromTuple2RDD(java_features).toJavaRDD()
    python_rdd = sc._jvm.PythonRDD.javaToPython(java_rdd)
    return RDD(python_rdd, sc, AutoBatchedSerializer(PickleSerializer()))
from pyspark import SparkContext
from pyspark import SparkConf
import sys
from pyspark.rdd import RDD


def update_dictionary(map):
    """Add the entry ``test="alex"`` to the given dict in place and return it."""
    map.update(test="alex")
    return map


if __name__ == "__main__":
    # Test driver: reads every Aleph2 RDD input, tags each record and emits it.
    # The "hereN" prints are progress markers.
    print("here1")
    conf = SparkConf()
    sc = SparkContext(appName="alex_test_app")
    print("here2")
    print("here2b: " + conf.get("spark.aleph2_job_config"))
    # Load the helper through the thread's context class loader, then ask it
    # for the Aleph2 object using the first command-line argument.
    aleph2 = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader().loadClass(
        "com.ikanow.aleph2.analytics.spark.utils.SparkPyTechnologyUtils"
    ).newInstance().getAleph2(sc._jsc, sys.argv[1])
    print("here3")
    # Function-call form so this line parses on Python 3 as well as Python 2,
    # matching every other print in this script.
    print(aleph2.getRddInputNames())
    print("here4")
    #print RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()), sc).count()
    print("here5")
    to_output = RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()), sc).map(lambda m: update_dictionary(m))
    aleph2.emitRdd(to_output._to_java_object_rdd())
def _checkpointFile(self, name, input_deserializer):
    """
    Wrap the Java RDD returned by ``self._jsc.checkpointFile(name)`` in a
    Python RDD that reads its records with ``input_deserializer``.
    """
    checkpointed = self._jsc.checkpointFile(name)
    python_rdd = RDD(checkpointed, self, input_deserializer)
    return python_rdd
def _checkpointFile(self, name):
    """
    Wrap the Java RDD returned by ``self._jsc.checkpointFile(name)`` in a
    Python RDD bound to this context.
    """
    checkpointed = self._jsc.checkpointFile(name)
    python_rdd = RDD(checkpointed, self)
    return python_rdd
def __init__(self, jrdd, ctx, jrdd_deserializer):
    """
    Emit a DeprecationWarning about Kafka 0.8 support, then initialise
    the underlying RDD with the given Java RDD, context and deserializer.
    """
    deprecation_message = (
        "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
        "See SPARK-21893.")
    warnings.warn(deprecation_message, DeprecationWarning)
    RDD.__init__(self, jrdd, ctx, jrdd_deserializer)
def import_csv(path, delimiter=",", header=False, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns
                   (unless a schema is provided), and not be included in the data.  The default value is false.
    :param schema: (Optional(list[tuple(str, type)] or list[str])) The are different options for specifying a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type)
    * Provide the column names as a list of strings.  Column data types will be inferred, based on the data.  The
      column names specified will override column names that are found in the header row.
    * None, where the schema is automatically inferred based on the data.  Columns are named based on the header,
      or will be named generically ("C0", "C1", "C2", etc).

    :param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
                        specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
    :return: (Frame) Frame that contains the data from the csv file

    Raises TypeError for an unsupported schema parameter type or column data type, and ValueError when
    the number of columns in a full custom schema does not match the csv data.

    Examples
    --------

    Load a frame from a csv file by specifying the path to the file, delimiter

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]

    The schema parameter can be used to specify a custom schema (column names and data types) or column names (and the
    data types are inferred based on the data).  Here, we will specify the column names, which will override the
    header from the csv file.

        >>> column_names = ["Rank", "City", "2013", "2010", "Percent_Change", "County"]
        >>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=column_names)
        -etc-

        >>> frame.schema
        [('Rank', <type 'int'>), ('City', <type 'str'>), ('2013', <type 'int'>), ('2010', <type 'int'>), ('Percent_Change', <type 'str'>), ('County', <type 'str'>)]

        <hide>
        >>> file_path = "../datasets/unicode.csv"
        >>> schema = [("a", unicode),("b", unicode),("c",unicode)]
        >>> frame = tc.frame.import_csv(file_path, schema=schema, header=False)
        -etc-

        >>> print unicode(frame.get_inspect()).encode('utf-8')  # because this file is UT8 and this docstring is str
        [#]  a  b  c
        ============
        [0]  à  ë  ñ
        [1]  ã  ê  ü

        </hide>

    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")
    require_type(str, datetime_format, "datetime_format")

    infer_schema = True
    column_names = []   # custom column names

    if schema is not None:
        if not isinstance(schema, list):
            raise TypeError("Unsupported type %s for schema parameter." % type(schema))
        elif all(isinstance(item, basestring) for item in schema):
            # schema is just column names
            column_names = schema
            schema = None
        else:
            infer_schema = False   # if a custom schema is provided, don't waste time inferring the schema during load
            sparktk_schema.validate(schema)

    # The reader options are passed as lower-case "true"/"false" strings.
    header_str = str(header).lower()
    infer_schema_str = str(infer_schema).lower()
    pyspark_schema = None

    if schema is not None:
        # Translate the full custom schema into a pyspark StructType.
        type_table = dtypes._data_type_to_pyspark_type_table
        fields = []
        for column in schema:
            if column[1] in type_table:
                fields.append(StructField(column[0], type_table[column[1]], True))
            else:
                raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0]))
        pyspark_schema = StructType(fields)

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=header_str,
            dateformat=datetime_format,
            inferschema=infer_schema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        # Derive the frame schema from the loaded data frame; custom column
        # names (when given) override the data frame's names position by position.
        for i, column in enumerate(df.schema.fields):
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
            except ValueError:
                raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
            column_name = column_names[i] if (i < len(column_names)) else column.name
            df_schema.append((column_name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            # Note the trailing space after "the": the two literals are concatenated.
            raise ValueError("Bad schema value.  The number of columns in the custom schema ({0}) must match the "
                             "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame uses datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = df.rdd.map(cast_datetime)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
def predictAll(self, usersProducts):
    """
    Run the Java model's ``predict`` on the given user/product RDD and
    return the result as an RDD read with ``RatingDeserializer``.

    The input is first converted with ``_get_unmangled_rdd`` using
    ``_serialize_tuple``.
    """
    serialized_pairs = _get_unmangled_rdd(usersProducts, _serialize_tuple)
    predictions = self._java_model.predict(serialized_pairs._jrdd)
    return RDD(predictions, self._context, RatingDeserializer())
def call(self, jrdd, time):
    """
    Wrap the given JavaRDD in a Python RDD (using this object's context
    and deserializer) and pass it, with ``time``, to the user function.
    """
    wrapped = RDD(jrdd, self.ctx, self.deserializer)
    user_function = self.func
    user_function(wrapped, time)
def pcy_for_rdd(baskets: RDD, support_threshold_total=support_threshold_total) -> list:
    """
    Find frequent itemsets of every size in an RDD of baskets.

    :param baskets: RDD whose elements are baskets (iterables of items).
        An earlier comment in this code states that baskets are already
        sorted, so itemsets are not re-sorted here.
    :param support_threshold_total: minimum number of occurrences for an
        itemset to count as frequent.
    :return: a list with one entry per itemset size: first the sorted
        frequent single items, then, for each size k >= 2, the sorted list
        of frequent k-tuples.  Stops at the first size with no frequent
        itemsets.
    """
    # Size 1: count every item and keep those meeting the threshold.
    singleton_counts = baskets.\
        flatMap(lambda basket: [(item, 1) for item in basket]).\
        reduceByKey(lambda x, y: x + y).\
        filter(lambda pair: pair[1] >= support_threshold_total)
    frequent_itemsets_dict = dict(singleton_counts.collect())
    frequent_itemsets_list = [sorted(frequent_itemsets_dict)]
    del singleton_counts
    gc.collect()

    current_itemset_size = 2
    while True:
        # generate_combination_with_filter is defined elsewhere; presumably it
        # yields only size-k candidates built from already-frequent itemsets
        # -- TODO confirm against its definition.
        # The result is consumed by exactly one action below, so it is not
        # persisted: caching it would only hold executor memory that was
        # never released.
        frequent_itemsets = baskets.\
            flatMap(lambda basket: generate_combination_with_filter(
                basket, frequent_itemsets_dict, current_itemset_size)).\
            map(lambda itemset: (tuple(itemset), 1)).\
            reduceByKey(lambda x, y: x + y).\
            filter(lambda pair: pair[1] >= support_threshold_total)

        current_size_frequent_itemsets = sorted(frequent_itemsets.keys().collect())
        if current_size_frequent_itemsets == []:
            break

        frequent_itemsets_list.append(current_size_frequent_itemsets)
        # Only membership matters for the next round, so the values are None.
        frequent_itemsets_dict.update(dict.fromkeys(current_size_frequent_itemsets))

        current_itemset_size += 1
        del frequent_itemsets
        del current_size_frequent_itemsets
        gc.collect()

    gc.collect()
    return frequent_itemsets_list
def loadAsAvro(self, name, deserializer, minParitions = None):
    """
    Wrap the result of ``self._pc.parquetFileAsAvro(name)`` in an RDD that
    reads its records with the given deserializer.

    ``minParitions`` is accepted but not used by this method.
    """
    avro_jrdd = self._pc.parquetFileAsAvro(name)
    avro_rdd = RDD(avro_jrdd, self._sc, deserializer)
    return avro_rdd
def loadAsJson(self, name, minPartitions = None):
    """
    Wrap the result of ``self._pc.parquetFileAsJSON(name)`` in an RDD of
    strings and parse every record with ``json.loads``.

    ``minPartitions`` is accepted but not used by this method.
    """
    json_jrdd = self._pc.parquetFileAsJSON(name)
    json_text = RDD(json_jrdd, self._sc, MUTF8Deserializer())
    return json_text.map(lambda record: json.loads(record))
def loadADAMRecords(self, path):
    """
    Wrap the result of ``self._pc.loadADAMRecords(path)`` in an RDD whose
    records are read with an Avro deserializer built from
    ``self._adam_record_schema``.
    """
    records_jrdd = self._pc.loadADAMRecords(path)
    record_deserializer = AvroDeserializer(self._adam_record_schema)
    return RDD(records_jrdd, self._sc, record_deserializer)
def remove_stop_words(rdd_in: RDD) -> RDD:
    """
    Drop every element of ``rdd_in`` that appears in the stop-word file.

    The stop words are read from ``STOP_WORDS_FILENAME``, one per line
    (the file content is split on "\\n").

    :param rdd_in: RDD of words -- assumed to be hashable values such as
        ``str``; verify against callers.
    :return: a new RDD without the stop words.
    """
    # The context manager closes the file handle; previously it was left open.
    with open(STOP_WORDS_FILENAME, "r") as stop_words_file:
        # A frozenset gives O(1) membership tests for each RDD element
        # instead of scanning a list every time.
        stop_words = frozenset(stop_words_file.read().split("\n"))
    return rdd_in.filter(lambda word: word not in stop_words)
def into_words(rdd_in: RDD) -> RDD:
    """
    Split every element of ``rdd_in`` on single spaces into words.

    Tokens of length 0 or 1 are dropped, then commas and parentheses are
    stripped from the remaining tokens.

    :param rdd_in: RDD of strings.
    :return: RDD of cleaned words.
    """
    words: RDD = rdd_in.flatMap(lambda line: line.split(' ')).filter(
        lambda token: len(token) > 1)
    # The previous pattern ',()' was a comma followed by an EMPTY group, so
    # only commas were removed; a character class removes ',', '(' and ')'.
    return words.map(lambda token: re.sub(r'[,()]', "", token))
def func(sc, *a, **kw):
    """
    Call the wrapped function ``f`` and return its Java RDD result as a
    Python RDD read with a batched (1024) pickle serializer.
    """
    java_rdd = f(sc, *a, **kw)
    python_rdd = sc._jvm.SerDe.javaToPython(java_rdd)
    serializer = BatchedSerializer(PickleSerializer(), 1024)
    return RDD(python_rdd, sc, serializer)
def __init__(self, jrdd, ctx, jrdd_deserializer):
    """Initialise by delegating all three arguments to ``RDD.__init__``."""
    base_init = RDD.__init__
    base_init(self, jrdd, ctx, jrdd_deserializer)
def _rdd(self):
    """
    Return the pickled-Python RDD for ``self._jdf``, building it on first
    use and caching it in ``self._lazy_rdd``.
    """
    if self._lazy_rdd is not None:
        return self._lazy_rdd
    pickled_jrdd = self._jdf.javaToPython()
    self._lazy_rdd = RDD(pickled_jrdd, self._sc,
                         BatchedSerializer(PickleSerializer()))
    return self._lazy_rdd
def getRddInput(self, sc, name):
    """
    Fetch the Aleph2 RDD input called ``name`` and return it as a Python
    RDD (converted through ``SerDe.javaToPython``).
    """
    java_input = self.aleph2.getRddInput(name)
    python_input = sc._jvm.SerDe.javaToPython(java_input)
    return RDD(python_input, sc)
def read_to_geometry_rdd(sc, input_path):
    """
    Wrap the result of GeoSpark's ``ShapefileReader.normalrdd`` in a
    Python RDD.

    NOTE(review): ``input_path`` is accepted but never passed to the JVM
    call -- confirm whether ``normalrdd`` should receive it.
    """
    shapefile_reader = (
        sc._jvm.org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileReader)
    geometry_jrdd = shapefile_reader.normalrdd(sc._jsc)
    return RDD(geometry_jrdd, sc)
def train_distributed(
    rdd: RDD,
    torch_obj: str,
    iters: int = 10,
    partition_shuffles: int = 1,
    verbose: int = 1,
    mini_batch: int = -1,
    validation_pct: float = 0.0,
    world_size: int = 2,
    device: str = 'cpu',
    early_stop_patience: int = -1
) -> Dict:
    """
    Entry point to train the model in distributed fashion.

    The master url is obtained internally via ``retrieve_url()``; a driver
    process running ``handle_model`` is started first and is always
    terminated and joined before this function returns or raises.

    :param rdd: The rdd of data to run on the model.
    :param torch_obj: The torch object as a string that includes the model and param shapes.
    :param iters: Number of iterations for training.
    :param partition_shuffles: Number of partition shuffles (Need to implement).
        Must be at least 1; the rdd is repartitioned between rounds.
    :param verbose: Verbosity of logs
    :param mini_batch: Mini batch for each iteration of training.
    :param validation_pct: How many items to validate
    :param world_size: number of partitions.
    :param device: pytorch device
    :param early_stop_patience: forwarded unchanged to ``handle_model``
        (presumably the early-stopping patience; -1 by default -- confirm
        semantics against ``handle_model``).
    :return: The train dict (first element of the last round's collected results).
    :raises ValueError: if ``partition_shuffles`` is less than 1.
    """
    # Without at least one round there is no result to return; fail before
    # the driver process is started instead of with `None[0]` afterwards.
    if partition_shuffles < 1:
        raise ValueError(
            "partition_shuffles must be at least 1, got {0}".format(partition_shuffles))

    master_url = retrieve_url()

    torch_loaded, params = load_base_torch(torch_obj)

    # Start the driver process.
    p = Process(
        target=handle_model,
        args=(-1, None, params, master_url, iters, world_size, early_stop_patience)
    )
    p.start()

    try:
        state_dict = None
        for i in range(partition_shuffles):
            # Run model with barrier execution mode.
            state_dict = mapPartitionsWithIndex(
                rdd, lambda index, partition: handle_model(
                    index,
                    partition,
                    torch_obj=torch_loaded,
                    master_url=master_url,
                    iters=iters,
                    verbose=verbose,
                    mini_batch=mini_batch,
                    validation_pct=validation_pct,
                    world_size=world_size,
                    device=device,
                    early_stop_patience=int(early_stop_patience+0)
                )
            ).collect()

            # Reshuffle the data between rounds, but not after the last one.
            if partition_shuffles - i > 1:
                num_partitions = rdd.getNumPartitions()
                rdd = rdd.repartition(num_partitions)

        return state_dict[0]

    finally:
        p.terminate()
        p.join()
def getAllRddInputs(self, sc):
    """
    Fetch all Aleph2 RDD inputs and return them as a single Python RDD
    (converted through ``SerDe.javaToPython``).
    """
    java_inputs = self.aleph2.getAllRddInputs()
    python_inputs = sc._jvm.SerDe.javaToPython(java_inputs)
    return RDD(python_inputs, sc)
from pyspark import SparkContext
from pyspark import SparkConf
import sys
from pyspark.rdd import RDD


def update_dictionary(map):
    """Add the entry ``test="alex"`` to the given dict in place and return it."""
    map.update(test="alex")
    return map


if __name__ == "__main__":
    # Test driver: reads every Aleph2 RDD input, tags each record and emits it.
    # The "hereN" prints are progress markers.
    print("here1")
    conf = SparkConf()
    sc = SparkContext(appName="alex_test_app")
    print("here2")
    print("here2b: " + conf.get("spark.aleph2_job_config"))
    # Load the helper through the thread's context class loader, then ask it
    # for the Aleph2 object using the first command-line argument.
    aleph2 = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader(
    ).loadClass(
        "com.ikanow.aleph2.analytics.spark.utils.SparkPyTechnologyUtils"
    ).newInstance().getAleph2(sc._jsc, sys.argv[1])
    print("here3")
    # Function-call form so this line parses on Python 3 as well as Python 2,
    # matching every other print in this script.
    print(aleph2.getRddInputNames())
    print("here4")
    #print RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()), sc).count()
    print("here5")
    to_output = RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()),
                    sc).map(lambda m: update_dictionary(m))
    aleph2.emitRdd(to_output._to_java_object_rdd())
def m_o(ctx: SparkContext, data: pr.RDD) -> None:
    """
    Assert that ``ctx`` is a SparkContext and that collecting ``data``
    yields exactly one element.
    """
    assert isinstance(ctx, SparkContext)
    collected = data.collect()
    assert 1 == len(collected)
def emptyRDD(self):
    """
    Create an RDD that has no partitions or elements.
    """
    empty_jrdd = self._jsc.emptyRDD()
    return RDD(empty_jrdd, self, NoOpSerializer())
def import_csv(path, delimiter=",", header=False, infer_schema=True, schema=None, tc=TkContext.implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns,
                   and not be included in the data.  The default value is false.
    :param infer_schema: (Optional[bool]) Boolean value indicating if the column types will be automatically inferred.
                         It requires one extra pass over the data and is true by default.  It is forced to false
                         whenever a custom schema is provided.
    :param schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset.  Number of
                   columns specified in the schema must match the number of columns in the csv file provided.
    :return: (Frame) Frame that contains the data from the csv file

    Raises ValueError when header or infer_schema is not a boolean or when the custom schema's column count
    does not match the csv data, and TypeError for an unsupported column data type.

    Examples
    --------
    Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that
    there is a header and to infer the schema based on the data.

        >>> file_path = "../integration-tests/datasets/cities.csv"

        >>> frame = tc.load_frame_from_csv(file_path, "|", header=True, infer_schema=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', int), ('city', str), ('population_2013', int), ('population_2010', int), ('change', str), ('county', str)]
    """
    if schema is not None:
        infer_schema = False  # if a custom schema is provided, don't waste time inferring the schema during load
    if not isinstance(header, bool):
        raise ValueError(
            "header parameter must be a boolean, but is {0}.".format(
                type(header)))
    if not isinstance(infer_schema, bool):
        raise ValueError(
            "infer_schema parameter must be a boolean, but is {0}.".format(
                type(infer_schema)))
    TkContext.validate(tc)

    # The reader options are passed as lower-case "true"/"false" strings.
    header_str = str(header).lower()
    infer_schema_str = str(infer_schema).lower()
    pyspark_schema = None

    if (not infer_schema) and (schema is not None):
        # Translate the custom schema into a pyspark StructType.
        type_table = dtypes._data_type_to_pyspark_type_table
        fields = []
        for column in schema:
            if column[1] in type_table:
                fields.append(
                    StructField(column[0], type_table[column[1]], True))
            else:
                raise TypeError(
                    "Unsupported type {0} in schema for column {1}.".format(
                        column[1], column[0]))
        pyspark_schema = StructType(fields)

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=header_str,
            dateformat="yyyy-MM-dd'T'HH:mm:ss.SSSX",
            inferschema=infer_schema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        # Derive the frame schema from the loaded data frame.
        for column in df.schema.fields:
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(
                    type(column.dataType))
            except ValueError:
                raise TypeError(
                    "Unsupported data type ({0}) for column {1}.".format(
                        str(column.dataType), column.name))
            df_schema.append((column.name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            # Note the trailing space after "the": the two literals are concatenated.
            raise ValueError(
                "Bad schema value.  The number of columns in the custom schema ({0}) must match the "
                "number of columns in the csv file data ({1}).".format(
                    custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame uses datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(
                    row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(
        df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = df.rdd.map(cast_datetime)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
def son(baskets: RDD, support_threshold_total=support_threshold_total) -> list:
    """Find frequent itemsets in ``baskets`` with a two-stage (SON-style) search.

    Stage one runs a PCY-style multi-pass search inside every partition, using
    a support threshold scaled down to that partition's share of the baskets.
    The distinct union of the per-partition results is the candidate set.
    Stage two counts every candidate over the whole RDD and keeps those whose
    global count reaches ``support_threshold_total``.

    :param baskets: RDD whose elements are baskets, i.e. iterables of hashable
        items.
    :param support_threshold_total: minimum global count an itemset needs to
        be reported as frequent.
    :return: list of lists. Element 0 is the sorted list of frequent single
        items; each following element is the sorted list of frequent itemsets
        (as tuples) of the next larger size. Element 0 is always present, even
        when it is empty.
    """

    def pcy_for_list(partition: list, support_threshold_total=support_threshold_total) -> dict:
        """Return the locally frequent itemsets of one partition as dict keys."""
        num_baskets_chunk = len(partition)
        if num_baskets_chunk == 0:
            # Nothing can be frequent in an empty partition.  Returning early
            # also avoids dividing by num_baskets == 0 when the RDD is empty.
            return {}

        # Scale the global threshold to this partition's share of the data.
        support_threshold = math.ceil(support_threshold_total * num_baskets_chunk / num_baskets)

        def promote_frequent(counts: dict) -> None:
            # Record every key whose count reaches the local threshold.
            for key, count in counts.items():
                if count >= support_threshold:
                    frequent_itemsets[key] = None

        # First pass: count single items and hash every pair into a bucket.
        singleton_counts = {}
        bucket_counts = {}
        for basket in partition:
            for item in basket:
                singleton_counts[item] = singleton_counts.get(item, 0) + 1
            for pair in generate_combination(basket, size=2):
                key = hash_pair(pair)
                bucket_counts[key] = bucket_counts.get(key, 0) + 1

        # Collapse bucket counts into a 0/1 bitmap (values only are rewritten,
        # so mutating during iteration is safe).
        for key, count in bucket_counts.items():
            bucket_counts[key] = 1 if count >= support_threshold else 0

        frequent_itemsets = {}
        promote_frequent(singleton_counts)  # store all frequent singletons
        del singleton_counts
        gc.collect()

        # Second pass: count only the pairs accepted by the candidate check
        # (presumably: both items frequent and the pair's bucket set in the
        # bitmap -- see qualified_as_candidate_pair).
        itemset_counts = {}
        for basket in partition:
            for pair in generate_combination(basket, size=2):
                if qualified_as_candidate_pair(pair, frequent_itemsets, bitmap=bucket_counts):
                    key = tuple(pair)
                    itemset_counts[key] = itemset_counts.get(key, 0) + 1
        promote_frequent(itemset_counts)  # store all frequent pairs
        del itemset_counts
        gc.collect()

        # Further passes: grow the itemset size until a pass adds nothing new.
        size = 3
        num_frequent_itemsets = len(frequent_itemsets)
        while True:
            itemset_counts = {}
            for basket in partition:
                for itemset in generate_combination_with_filter(basket, frequent_itemsets, size):
                    key = tuple(itemset)
                    itemset_counts[key] = itemset_counts.get(key, 0) + 1
            promote_frequent(itemset_counts)  # store all frequent itemsets of this size
            del itemset_counts
            gc.collect()

            current_num_frequent_itemsets = len(frequent_itemsets)
            if current_num_frequent_itemsets == num_frequent_itemsets:
                # No new frequent itemsets were found at this size.
                break
            num_frequent_itemsets = current_num_frequent_itemsets
            size += 1

        return frequent_itemsets

    # First stage: union of the locally frequent itemsets of every partition.
    # num_baskets is read by pcy_for_list through its closure.
    num_baskets = baskets.count()
    local_candidates = (
        baskets
        .mapPartitions(lambda chunk: pcy_for_list(list(chunk), support_threshold_total))
        .distinct()
        .collect()
    )
    candidate_itemsets = dict.fromkeys(local_candidates, 0)

    # Second stage: count the candidates over the full data set.
    def qualified_as_candidate_itemset(itemset):
        # Plain membership test; items were already used as dict keys in the
        # first stage, so they are hashable.
        return itemset in candidate_itemsets

    singleton_counts = (
        baskets
        .flatMap(lambda basket: basket)
        .filter(qualified_as_candidate_itemset)
        .map(lambda item: (item, 1))
        .reduceByKey(lambda x, y: x + y)
        .filter(lambda pair: pair[1] >= support_threshold_total)
        .keys()
        .collect()
    )
    frequent_itemsets = [sorted(singleton_counts)]
    del singleton_counts
    gc.collect()

    size = 2
    while True:
        # The job runs immediately via collect(), so the lambdas see the
        # current value of ``size``.
        frequent_itemsets_for_particular_size = (
            baskets
            .flatMap(lambda basket: generate_combination_with_filter(basket, candidate_itemsets, size))
            .filter(lambda itemset: qualified_as_candidate_itemset(tuple(itemset)))
            .map(lambda itemset: (tuple(itemset), 1))
            .reduceByKey(lambda x, y: x + y)
            .filter(lambda pair: pair[1] >= support_threshold_total)
            .keys()
            .collect()
        )
        if not frequent_itemsets_for_particular_size:
            # No frequent itemset of this size, so none of a larger size either.
            break
        frequent_itemsets.append(sorted(frequent_itemsets_for_particular_size))
        size += 1
        del frequent_itemsets_for_particular_size
        gc.collect()

    return frequent_itemsets
def readNonPartitionTable(self, project, table, numPartitions, cols=None, bytesCols=None, batchSize=1):
    """Read a non-partitioned table and wrap the result as a PySpark RDD.

    The arguments are forwarded to ``self._api.readTable``; their exact
    semantics are defined by that backend API.

    :param project: project identifier passed to the backend.
    :param table: table identifier passed to the backend.
    :param numPartitions: partition count passed to the backend.
    :param cols: optional list converted with ``self._to_java_array``;
        ``None`` (the default) is treated as an empty list.
    :param bytesCols: optional list converted with ``self._to_java_array``;
        ``None`` (the default) is treated as an empty list.
    :param batchSize: batch size passed to the backend (default 1).
    :return: an ``RDD`` over the backend result, deserialized with
        ``PickleSerializer``.
    """
    # A None sentinel replaces the former mutable ``[]`` defaults so that no
    # list object is shared between calls.
    jcols = self._to_java_array([] if cols is None else cols)
    jbytesCols = self._to_java_array([] if bytesCols is None else bytesCols)
    # Note the backend's argument order: batchSize comes before numPartitions.
    jrdd = self._api.readTable(project, table, jcols, jbytesCols, batchSize, numPartitions)
    return RDD(jrdd, self._sc, PickleSerializer())