def test_multiple_python_java_RDD_conversions(self): # Regression test for SPARK-5361 data = [("1", {"director": "David Lean"}), ("2", {"director": "Andrew Dominik"})] data_rdd = self.sc.parallelize(data) data_java_rdd = data_rdd._to_java_object_rdd() data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd) converted_rdd = RDD(data_python_rdd, self.sc) self.assertEqual(2, converted_rdd.count()) # conversion between python and java RDD threw exceptions data_java_rdd = converted_rdd._to_java_object_rdd() data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd) converted_rdd = RDD(data_python_rdd, self.sc) self.assertEqual(2, converted_rdd.count())
def train_model(data: RDD, l=1.0) -> MLNaiveBayesModel:
    aggregated = data \
        .flatMap(lambda x: [(label, x['features']) for label in x['labels']]) \
        .combineByKey(lambda v: (1, v),
                      lambda c, v: (c[0] + 1, c[1] + v),
                      lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1])) \
        .sortBy(lambda x: x[0]) \
        .collect()
    num_labels = len(aggregated)
    num_documents = data.count()
    num_features = aggregated[0][1][1].size
    labels = np.zeros(num_labels)
    # log priors are floats, so pi must not be an integer array
    pi = np.zeros(num_labels)
    theta = np.zeros((num_labels, num_features))
    pi_log_denom = math.log(num_documents + num_labels * l)
    for i, (label, (n, sum_term_freq)) in enumerate(aggregated):
        labels[i] = label
        pi[i] = math.log(n + l) - pi_log_denom
        sum_term_freq_dense = sum_term_freq.toarray()
        theta_log_denom = math.log(sum_term_freq.sum() + num_features * l)
        theta[i, :] = np.log(sum_term_freq_dense + l) - theta_log_denom
    return MLNaiveBayesModel(labels, pi, theta)
def __call__(self, head: RDD):
    if self.distinct and not self.approximate:
        head = head.distinct()
    if self.explained:
        self._log.info("toDebugString():\n%s", head.toDebugString().decode())
    if not self.approximate or not self.distinct:
        return head.count()
    return head.countApproxDistinct()
def test_multiple_python_java_RDD_conversions(self):
    # Regression test for SPARK-5361
    data = [
        (u'1', {u'director': u'David Lean'}),
        (u'2', {u'director': u'Andrew Dominik'})
    ]
    data_rdd = self.sc.parallelize(data)
    data_java_rdd = data_rdd._to_java_object_rdd()
    data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
    converted_rdd = RDD(data_python_rdd, self.sc)
    self.assertEqual(2, converted_rdd.count())
    # conversion between python and java RDD threw exceptions
    data_java_rdd = converted_rdd._to_java_object_rdd()
    data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
    converted_rdd = RDD(data_python_rdd, self.sc)
    self.assertEqual(2, converted_rdd.count())
def evaluate(self, labels_and_predictions: RDD):
    tp = labels_and_predictions \
        .map(lambda x: (set(x[0]), set(p for p, w in x[1][:self._pred_n]))) \
        .filter(lambda x: len(x[0].intersection(x[1])) > self._intersect_n)
    accuracy = 100.0 * tp.count() / labels_and_predictions.count()
    if self._verbose:
        print('accuracy: ', accuracy)
    self._results.append(accuracy)
    return accuracy
def __blocking_matrix(self, train: RDD = None, test: RDD = None,
                      similarity=None) -> RDD:
    """
    Divide a matrix into blocks to reduce the number of keys.

    :param train: RDD<(Hashable, Hashable, float)> = RDD<bucket, item, rating>
    :param test: RDD<(Hashable, Hashable)> = RDD<bucket, item>
    :param similarity: RDD<(Hashable, Hashable, float)> = RDD<bucket, bucket, similarity>
    :return: RDD<(int, int), (Hashable, Hashable, float)> =
        RDD<(bucket_block, item_block), (bucket, item, rating)>
        or RDD<(bucket_block, bucket_block), (bucket, bucket, similarity)>
    """
    seed = self._seed
    n_bucket_block = self._n_bucket_block
    n_item_block = self._n_item_block
    n_cross_block = self._n_cross_block
    if train is not None:
        train = train.map(lambda u: (
            (hash2int(u[0], max_value=n_cross_block, seed=seed),
             hash2int(u[1], max_value=n_item_block, seed=seed)),
            u)).cache()
        train.count()
        return train
    if test is not None:
        test = test.map(lambda u: (
            (hash2int(u[0], max_value=n_bucket_block, seed=seed),
             hash2int(u[1], max_value=n_item_block, seed=seed)),
            u)).cache()
        test.count()
        return test
    if similarity is not None:
        similarity = similarity \
            .flatMap(lambda u: [(u[0], u[1], u[2]), (u[1], u[0], u[2])]) \
            .map(lambda u: (
                (hash2int(u[0], max_value=n_bucket_block, seed=seed),
                 hash2int(u[1], max_value=n_cross_block, seed=seed)),
                u)).cache()
        similarity.count()
        return similarity
def evaluate(self, labels_and_predictions: RDD) -> float:
    tp = labels_and_predictions \
        .map(lambda x: (set(x[0]),
                        set(features for features, weights in x[1][:self._pred_n]))) \
        .filter(lambda x: len(x[0].intersection(x[1])) >= self._intersect_n)
    accuracy = 100.0 * tp.count() / labels_and_predictions.count()
    if self._verbose:
        print('accuracy: ', accuracy)
    self._results.append(accuracy)
    return accuracy
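# Hedged usage sketch (not part of the original module) showing the element shape that
# evaluate() above expects: each record is (true_labels, ranked_predictions), where
# ranked_predictions is a list of (prediction, weight) pairs. `evaluator` stands in for
# an instance of the surrounding class with _pred_n, _intersect_n, _verbose and
# _results already set; the data values are illustrative only.
def _example_evaluate(sc, evaluator):
    labels_and_predictions = sc.parallelize([
        ({'spark', 'python'}, [('spark', 0.9), ('java', 0.4)]),
        ({'numpy'}, [('scipy', 0.7), ('numpy', 0.6)]),
    ])
    return evaluator.evaluate(labels_and_predictions)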
def run(self, rdd: RDD) -> RDD:  # type: ignore
    rdd = rdd.cache()
    n_points = rdd.count()
    m = n_points / self.n_partitions
    optimal_p = math.log(n_points * self.n_partitions) / m
    rdd = self.assign_buckets(  # type: ignore
        rdd, p=optimal_p, key_func=_label_first_coord_and_type
    )
    rdd = self.sort_and_assign_labels(rdd)  # type: ignore
    return rdd
def run(
    self,
    rdd: RDD,
    key_func: Callable[[Tuple[Any]], Tuple[Any]] = lambda x: x
) -> RDD:  # type: ignore
    rdd = rdd.cache()
    n_points = rdd.count()
    m = n_points / self.n_partitions
    optimal_p = math.log(n_points * self.n_partitions) / m
    rdd = self.assign_buckets(rdd, p=optimal_p, key_func=key_func)  # type: ignore
    rdd = self.sort(rdd, key_func=key_func)  # type: ignore
    return rdd
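# Illustrative helper (not part of the original module) that just reproduces the
# bucket-probability arithmetic used by both run() variants above, so the formula can
# be checked in isolation: with the assumed values n_points = 1_000_000 and
# n_partitions = 8, m = 125_000 points per partition and optimal_p ≈ 1.27e-4.
def _example_optimal_p(n_points: int = 1_000_000, n_partitions: int = 8) -> float:
    import math
    m = n_points / n_partitions                     # expected points per partition
    return math.log(n_points * n_partitions) / m    # ≈ 1.27e-4 for the defaults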
def partition_per_row(rdd: RDD) -> RDD:
    """Place each row in an RDD into a separate partition.

    Only useful if that row represents something large to be computed over,
    perhaps an external resource such as a multi-gb training dataset. The
    spark part of the dataset is expected to be tiny and easily fit in a
    single partition.
    """
    num_rows = rdd.count()
    # Help out mypy. Also don't use `identity`, as it somehow fails serialization
    partition_fn = cast(Callable[[int], int], lambda x: x)
    return (
        # bring everything together and assign each row a partition id
        rdd.repartition(1).mapPartitions(lambda rows: enumerate(rows))
        # Partition by the new partition_id
        .partitionBy(num_rows, partition_fn)
        # Drop the partition id, giving back the original shape
        .map(lambda pair: pair[1]))
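# Hedged usage sketch (not part of the original module): spread a handful of "task
# descriptor" rows across one partition each, so a later mapPartitions call handles
# exactly one row per task. `sc` is assumed to be an existing SparkContext and the
# string values are illustrative only.
def _example_partition_per_row(sc):
    tasks = sc.parallelize(['dataset_a', 'dataset_b', 'dataset_c'])
    one_per_partition = partition_per_row(tasks)
    assert one_per_partition.getNumPartitions() == 3
    # each partition now holds exactly one row
    return one_per_partition.mapPartitions(lambda rows: [sum(1 for _ in rows)]).collect()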
def mean(rdd: RDD) -> float:
    return rdd.sum() / float(rdd.count())
def kurtosis(rdd: RDD, mean: float, stdev: float) -> float:
    return rdd.map(lambda x: pow(x - mean, 4)).sum() / (pow(stdev, 4) * rdd.count())
def skewness(rdd: RDD, mean: float, stdev: float) -> float:
    return rdd.map(lambda x: pow(x - mean, 3)).sum() / (pow(stdev, 3) * rdd.count())
def stdev(rdd: RDD, mean: float) -> float:
    return sqrt(rdd.map(lambda x: pow(x - mean, 2)).sum() / rdd.count())
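# Hedged usage sketch (not part of the original module) tying the moment helpers above
# together: compute mean, standard deviation, skewness and kurtosis of a numeric RDD.
# `sc` is assumed to be an existing SparkContext; note that stdev here is the
# population standard deviation (division by n, not n - 1).
def _example_moments(sc):
    values = sc.parallelize([1.0, 2.0, 2.0, 3.0, 7.0])
    m = mean(values)
    s = stdev(values, m)
    return m, s, skewness(values, m, s), kurtosis(values, m, s)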