def test_save_load_simple_estimator(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()

    # test save/load of CrossValidator
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
    self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

    # test save/load of CrossValidatorModel
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
def test_expose_sub_models(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                               collectSubModels=True)
    tvsModel = tvs.fit(dataset)
    self.assertEqual(len(tvsModel.subModels), len(grid))

    # Test that the default value of the "persistSubModels" option is "true"
    testSubPath = temp_path + "/testTrainValidationSplitSubModels"
    savingPathWithSubModels = testSubPath + "cvModel3"
    tvsModel.save(savingPathWithSubModels)
    tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
    self.assertEqual(len(tvsModel3.subModels), len(grid))
    tvsModel4 = tvsModel3.copy()
    self.assertEqual(len(tvsModel4.subModels), len(grid))

    savingPathWithoutSubModels = testSubPath + "cvModel2"
    tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
    tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
    self.assertEqual(tvsModel2.subModels, None)

    for i in range(len(grid)):
        self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid)
def test_persistence(self):
    # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
    df = self.spark.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    lda_path = path + "/lda"
    dist_model_path = path + "/distLDAModel"
    local_model_path = path + "/localLDAModel"
    # Test LDA
    lda.save(lda_path)
    lda2 = LDA.load(lda_path)
    self._compare(lda, lda2)
    # Test DistributedLDAModel
    distributedModel.save(dist_model_path)
    distributedModel2 = DistributedLDAModel.load(dist_model_path)
    self._compare(distributedModel, distributedModel2)
    # Test LocalLDAModel
    localModel.save(local_model_path)
    localModel2 = LocalLDAModel.load(local_model_path)
    self._compare(localModel, localModel2)
    # Clean up
    try:
        rmtree(path)
    except OSError:
        pass
def test_java_object_gets_detached(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal",
                          weightCol="weight", fitIntercept=False)

    model = lr.fit(df)
    summary = model.summary

    self.assertIsInstance(model, JavaWrapper)
    self.assertIsInstance(summary, JavaWrapper)
    self.assertIsInstance(model, JavaParams)
    self.assertNotIsInstance(summary, JavaParams)

    error_no_object = 'Target Object ID does not exist for this gateway'

    self.assertIn("LinearRegression_", model._java_obj.toString())
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

    model.__del__()

    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

    try:
        summary.__del__()
    except:
        pass

    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        summary._java_obj.toString()
def test_save_load_simple_estimator(self):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)

    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
    self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def test_equals(self):
    indices = [1, 2, 4]
    values = [1., 3., 2.]
    self.assertTrue(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.]))
    self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.]))
    self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 0., 2.]))
    self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.]))
def test_output_columns(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr, parallelism=1)
    model = ovr.fit(df)
    output = model.transform(df)
    self.assertEqual(output.columns, ["label", "features", "rawPrediction", "prediction"])
def test_copy(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    ovr1 = ovr.copy({lr.maxIter: 10})
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
    model = ovr.fit(df)
    model1 = model.copy({model.predictionCol: "indexed"})
    self.assertEqual(model1.getPredictionCol(), "indexed")
def test_offset(self):
    df = self.spark.createDataFrame(
        [(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
         (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)),
         (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)),
         (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0))],
        ["label", "weight", "offset", "features"])

    glr = GeneralizedLinearRegression(family="poisson", weightCol="weight",
                                      offsetCol="offset")
    model = glr.fit(df)
    self.assertTrue(np.allclose(model.coefficients.toArray(), [0.664647, -0.3192581],
                                atol=1E-4))
    self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1E-4))
def test_support_for_weightCol(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8), 1.0),
                                     (1.0, Vectors.sparse(2, [], []), 1.0),
                                     (2.0, Vectors.dense(0.5, 0.5), 1.0)],
                                    ["label", "features", "weight"])
    # classifier inherits hasWeightCol
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr, weightCol="weight")
    self.assertIsNotNone(ovr.fit(df))
    # classifier doesn't inherit hasWeightCol
    dt = DecisionTreeClassifier()
    ovr2 = OneVsRest(classifier=dt, weightCol="weight")
    self.assertIsNotNone(ovr2.fit(df))
def test_parallelism_doesnt_change_output(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=1)
    modelPar1 = ovrPar1.fit(df)
    ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=2)
    modelPar2 = ovrPar2.fit(df)
    for i, model in enumerate(modelPar1.models):
        self.assertTrue(np.allclose(model.coefficients.toArray(),
                                    modelPar2.models[i].coefficients.toArray(), atol=1E-4))
        self.assertTrue(np.allclose(model.intercept, modelPar2.models[i].intercept, atol=1E-4))
def test_kmeans_summary(self):
    data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
            (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 1)
def test_bisecting_kmeans_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    bkm = BisectingKMeans(k=2)
    model = bkm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 20)
def ztest_toPandas(self):
    data = [(Vectors.dense([0.1, 0.2]),),
            (Vectors.sparse(2, {0: 0.3, 1: 0.4}),),
            (Vectors.sparse(2, {0: 0.5, 1: 0.6}),)]
    df = self.sql.createDataFrame(data, ["features"])
    self.assertEqual(df.count(), 3)
    pd = self.converter.toPandas(df)
    self.assertEqual(len(pd), 3)
    self.assertTrue(isinstance(pd.features[0], csr_matrix),
                    "Expected pd.features[0] to be csr_matrix but found: %s"
                    % type(pd.features[0]))
    self.assertEqual(pd.features[0].shape[0], 3)
    self.assertEqual(pd.features[0].shape[1], 2)
    self.assertEqual(pd.features[0][0, 0], 0.1)
    self.assertEqual(pd.features[0][0, 1], 0.2)
def test_binomial_logistic_regression_with_bound(self):
    df = self.spark.createDataFrame(
        [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
         (0.0, 2.0, Vectors.dense(1.0, 2.0)),
         (1.0, 3.0, Vectors.dense(2.0, 1.0)),
         (0.0, 4.0, Vectors.dense(3.0, 3.0))],
        ["label", "weight", "features"])

    lor = LogisticRegression(regParam=0.01, weightCol="weight",
                             lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
                             upperBoundsOnIntercepts=Vectors.dense(0.0))
    model = lor.fit(df)
    self.assertTrue(
        np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
    self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
def reduce(inputpath, alg, k):
    n_data = 0
    n_features = 0
    result = "successful!"
    inputdir = os.path.dirname(inputpath)
    print("inputdir: " + inputdir + result)

    # Count the rows and the number of space-separated values per row in the input file.
    inputfile = open(inputpath, 'r')
    for line in inputfile:
        input_n = len(line.split(" "))
        n_data += 1
    inputfile.close()

    if int(k) >= input_n:
        print("reduced features must be smaller than input features.")
        result = "reduced features must be smaller than input features."
    else:
        # Load the space-separated data into a DataFrame of (labels, features).
        lines = sc.textFile(inputpath).map(lambda x: x.split(" "))
        lines = lines.map(lambda x: (x[0], [float(y) for y in x[1:]]))
        df = lines.map(lambda x: Row(labels=x[0], features=Vectors.dense(x[1]))).toDF()

        if alg == "pca":
            output_data = pca(inputdir, df, alg, k)

        # Inspect the reduced output file and summarize its dimensions and size.
        output_data = inputdir + "/" + alg + str(k) + "_Data"
        inputfile = open(output_data, 'r')
        file_size = str(os.stat(output_data).st_size)
        counter = 0
        n_features = '0'
        for line in inputfile:
            input_n = len(line.split(" "))
            n_features = str(input_n)
            counter += 1
        inputfile.close()
        n_data = str(counter)

        result = "File: " + os.path.basename(output_data) + '</br>'
        result += "Path: " + os.path.dirname(output_data) + '/' + alg + str(k) + "_Features/" + '</br>'
        result += "Dimension: " + n_data + " x " + n_features + "</br>"
        result += "Size: " + file_size + ' bytes'
        print(result)
        print("Dimension reduction finished!")

    context = {'n_data': n_data, 'n_features': n_features, 'result': result}
    return context
def test_vector_size_hint(self):
    df = self.spark.createDataFrame(
        [(0, Vectors.dense([0.0, 10.0, 0.5])),
         (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
         (2, Vectors.dense([2.0, 12.0]))],
        ["id", "vector"])

    sizeHint = VectorSizeHint(inputCol="vector", handleInvalid="skip")
    sizeHint.setSize(3)
    self.assertEqual(sizeHint.getSize(), 3)

    output = sizeHint.transform(df).head().vector
    expected = DenseVector([0.0, 10.0, 0.5])
    self.assertEqual(output, expected)
def test_tweedie_distribution(self):
    df = self.spark.createDataFrame(
        [(1.0, Vectors.dense(0.0, 0.0)),
         (1.0, Vectors.dense(1.0, 2.0)),
         (2.0, Vectors.dense(0.0, 0.0)),
         (2.0, Vectors.dense(1.0, 1.0))],
        ["label", "features"])

    glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)
    model = glr.fit(df)
    self.assertTrue(np.allclose(model.coefficients.toArray(), [-0.4645, 0.3402], atol=1E-4))
    self.assertTrue(np.isclose(model.intercept, 0.7841, atol=1E-4))

    model2 = glr.setLinkPower(-1.0).fit(df)
    self.assertTrue(np.allclose(model2.coefficients.toArray(), [-0.6667, 0.5], atol=1E-4))
    self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1E-4))
def test_linear_regression_pmml_basic(self):
    # Most of the validation is done on the Scala side; here we just check
    # that we output text rather than parquet (e.g. that the format flag
    # was respected).
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1)
    model = lr.fit(df)
    path = tempfile.mkdtemp()
    lr_path = path + "/lr-pmml"
    model.write().format("pmml").save(lr_path)
    pmml_text_list = self.sc.textFile(lr_path).collect()
    pmml_text = "\n".join(pmml_text_list)
    self.assertIn("Apache Spark", pmml_text)
    self.assertIn("PMML", pmml_text)
def test_kmean_pmml_basic(self):
    # Most of the validation is done on the Scala side; here we just check
    # that we output text rather than parquet (e.g. that the format flag
    # was respected).
    data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
            (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    path = tempfile.mkdtemp()
    km_path = path + "/km-pmml"
    model.write().format("pmml").save(km_path)
    pmml_text_list = self.sc.textFile(km_path).collect()
    pmml_text = "\n".join(pmml_text_list)
    self.assertIn("Apache Spark", pmml_text)
    self.assertIn("PMML", pmml_text)
def test_clustering_evaluator_with_cosine_distance(self):
    featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]),
                                [([1.0, 1.0], 1.0), ([10.0, 10.0], 1.0),
                                 ([1.0, 0.5], 2.0), ([10.0, 4.4], 2.0),
                                 ([-1.0, 1.0], 3.0), ([-100.0, 90.0], 3.0)])
    dataset = self.spark.createDataFrame(featureAndPredictions, ["features", "prediction"])
    evaluator = ClusteringEvaluator(predictionCol="prediction", distanceMeasure="cosine")
    self.assertEqual(evaluator.getDistanceMeasure(), "cosine")
    self.assertTrue(np.isclose(evaluator.evaluate(dataset), 0.992671213, atol=1e-5))
def convert_to_flat_by_sparkpy_v3(df):
    # Flatten the per-(key, subkey) reference arrays into one dense feature vector per key.
    vectorize = udf(lambda vs: Vectors.dense(list(chain.from_iterable(vs))), VectorUDT())
    spark_df = df.orderBy("key", "subkey")
    spark_df = spark_df.groupBy("key").agg(first(col("parameter")).alias("label"),
                                           collect_list("reference").alias("features"))
    spark_df = spark_df.withColumn('features', vectorize('features'))
    spark_df = spark_df.select("label", "features")
    return spark_df
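# A minimal usage sketch for convert_to_flat_by_sparkpy_v3 above. The input schema
# ("key", "subkey", "parameter", "reference") is inferred from the columns the
# function references; the sample rows and values are illustrative assumptions.
from itertools import chain
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, first, collect_list

spark = SparkSession.builder.getOrCreate()
long_df = spark.createDataFrame(
    [(1, 1, 0.5, [0.1, 0.2]), (1, 2, 0.5, [0.3, 0.4])],
    ["key", "subkey", "parameter", "reference"])
convert_to_flat_by_sparkpy_v3(long_df).show()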
def test_gaussian_mixture_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    gmm = GaussianMixture(k=2)
    model = gmm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertTrue(isinstance(s.probability, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 3)
def test_onevsrest(self):
    temp_path = tempfile.mkdtemp()
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)
    ovrPath = temp_path + "/ovr"
    ovr.save(ovrPath)
    loadedOvr = OneVsRest.load(ovrPath)
    self._compare_pipelines(ovr, loadedOvr)
    modelPath = temp_path + "/ovrModel"
    model.save(modelPath)
    loadedModel = OneVsRestModel.load(modelPath)
    self._compare_pipelines(model, loadedModel)
def test_parallel_evaluation(self):
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvs.setParallelism(1)
    tvsSerialModel = tvs.fit(dataset)
    tvs.setParallelism(2)
    tvsParallelModel = tvs.fit(dataset)
    self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics)
def mldemo():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])

    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))
    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
def test_type_error(self):
    df = self.spark.createDataFrame([("a", 0), ("b", 0)]).toDF("features", "key")
    keyedPCA = KeyedEstimator(sklearnEstimator=PCA())
    self.assertRaises(TypeError, keyedPCA.fit, df)

    df = self.spark.createDataFrame([(Vectors.dense([i]), [i], 0) for i in range(10)])
    df = df.toDF("features", "y", "key")
    keyedLR = KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y")
    self.assertRaises(TypeError, keyedLR.fit, df)
def test_expose_sub_models(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    numFolds = 3
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                        numFolds=numFolds, collectSubModels=True)

    def checkSubModels(subModels):
        self.assertEqual(len(subModels), numFolds)
        for i in range(numFolds):
            self.assertEqual(len(subModels[i]), len(grid))

    cvModel = cv.fit(dataset)
    checkSubModels(cvModel.subModels)

    # Test that the default value of the "persistSubModels" option is "true"
    testSubPath = temp_path + "/testCrossValidatorSubModels"
    savingPathWithSubModels = testSubPath + "cvModel3"
    cvModel.save(savingPathWithSubModels)
    cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
    checkSubModels(cvModel3.subModels)
    cvModel4 = cvModel3.copy()
    checkSubModels(cvModel4.subModels)

    savingPathWithoutSubModels = testSubPath + "cvModel2"
    cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
    cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
    self.assertEqual(cvModel2.subModels, None)

    for i in range(numFolds):
        for j in range(len(grid)):
            self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid)
def test_parallel_evaluation(self):
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
    evaluator = BinaryClassificationEvaluator()

    # check that parallel evaluation produces the same metrics as serial evaluation
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cv.setParallelism(1)
    cvSerialModel = cv.fit(dataset)
    cv.setParallelism(2)
    cvParallelModel = cv.fit(dataset)
    self.assertEqual(cvSerialModel.avgMetrics, cvParallelModel.avgMetrics)
def setUp(self):
    super(MLlibTestCase, self).setUp()
    self.sc = self.spark.sparkContext
    self.sql = self.spark
    self.X = np.array([[1, 2, 3],
                       [-1, 2, 3],
                       [1, -2, 3],
                       [1, 2, -3],
                       [-1, -2, 3],
                       [1, -2, -3],
                       [-1, 2, -3],
                       [-1, -2, -3]])
    self.y = np.array([1, 0, 1, 1, 0, 1, 0, 0])
    data = [(float(self.y[i]), Vectors.dense(self.X[i])) for i in range(len(self.y))]
    self.df = self.sql.createDataFrame(data, ["label", "features"])
from __future__ import print_function

# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorAssemblerExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])

    assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                                outputCol="features")

    output = assembler.transform(dataset)
    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(truncate=False)
    # $example off$

    spark.stop()
for content_series in content_series_iter:
    yield featurize_series(model, content_series)

# Pandas UDFs on large records (e.g., very large images) can run into Out Of Memory (OOM) errors.
# If you hit such errors in the cell below, try reducing the Arrow batch size via `maxRecordsPerBatch`.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")

# We can now run featurization on our entire Spark DataFrame.
# NOTE: This can take a long time (about 10 minutes) since it applies a large model to the full dataset.
features_df = images_df.repartition(16).select(
    col("path"), featurize_udf("content").alias("features"))

# MLlib needs some post-processing of the features column format
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
features_df = features_df.select(
    col("path"), list_to_vector_udf(features_df["features"]).alias("features"))

# OMITTED HERE
# You need to add the labels to your dataset based on the path of your images

# Splitting into training, validation and test sets
df_train_split, df_validate_split, df_test_split = features_df.randomSplit(
    [0.6, 0.3, 0.1], 42)

# Here we start to train the tail of the model.
# This concatenates all feature columns into a single feature vector in a new column "featuresModel".
vectorAssembler = VectorAssembler(inputCols=['features'],
spark = SparkSession.builder \
    .master("local") \
    .appName("regression") \
    .getOrCreate()

features = []
labels = []
for x in range(40):
    features.append(x)
    labels.append(2 * x + 1)

merged = list(zip(features, labels))
random.shuffle(merged)
features[:], labels[:] = zip(*merged)

data = [(labels[x], Vectors.dense([features[x]])) for x in range(40)]
train = spark.createDataFrame(data[:30], ["label", "features"])
test = spark.createDataFrame(data[30:], ["label", "features"])

# Defaults: maxIter = 100, regParam = 0.0, tol = 1e-6, fitIntercept = True
# Manually tuned parameters below
maxIter_param = 70
regParam_param = 0.01
tol_param = 1e-29
fitIntercept_param = True
lr = LinearRegression(maxIter=maxIter_param, regParam=regParam_param,
                      tol=tol_param, fitIntercept=fitIntercept_param)
if __name__ == "__main__": if "SPARK_HOME" in os.environ.keys(): print("SPARK_HOME: ", os.environ['SPARK_HOME']) else: raise ValueError( "Environment variable SPARK_HOME needs to be specified," " and make sure spark-iforest.jar is added into your lib path ($SPARK_HOME/jars" ) spark = SparkSession \ .builder.master("local[*]") \ .appName("IForestExample") \ .getOrCreate() data = [(Vectors.dense([0.0, 0.0]), ), (Vectors.dense([7.0, 9.0]), ), (Vectors.dense([9.0, 8.0]), ), (Vectors.dense([8.0, 9.0]), )] df = spark.createDataFrame(data, ["features"]) from pyspark_iforest.ml.iforest import * iforest = IForest(contamination=0.3, maxDepth=2) model = iforest.fit(df) model.hasSummary summary = model.summary summary.numAnomalies
# accuracy = evaluator.evaluate(predictions)
# print("Test Error = %g" % (1.0 - accuracy))
# print("Test Area under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})))

stages = []
# Note: labelCol must reference the label column, not the features column.
rf = RandomForestClassifier(numTrees=100, maxDepth=5, maxBins=5,
                            labelCol="label", featuresCol="features", seed=42)
stages += [rf]

trainingData = file_df.rdd.map(lambda x: (Vectors.dense(x[0:-1]), x[-1])).toDF(
    ["features", "label"])
trainingData.show()

pipeline = Pipeline(stages=stages)
lr = LogisticRegression().setFeaturesCol("features")
params = ParamGridBuilder().build()
# params = ParamGridBuilder().addGrid(lr.maxIter, [500]).addGrid(lr.regParam, [0]).addGrid(lr.elasticNetParam, [1]).build()
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=params,
                    evaluator=evaluator, numFolds=5)
# Fit on the (features, label) DataFrame; fitting on file_df fails because it has
# no "features" column (IllegalArgumentException) and fitting on output_data fails
# with an illegal type.
cvModel = cv.fit(trainingData)
def majority_vote(neighbors):
    # Each neighbor is a (feature vector, label) pair; return the most common label
    # among the neighbors as a one-element dense vector.
    return Vectors.dense(
        collections.Counter([x[1] for x in neighbors]).most_common()[0][0])
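# Hedged usage sketch for majority_vote above: the (vector, label) neighbor tuples
# below are assumptions about the input shape, inferred from x[1] being used as the label.
import collections
from pyspark.ml.linalg import Vectors

neighbors = [(Vectors.dense([1.0, 2.0]), 0.0),
             (Vectors.dense([1.1, 2.1]), 1.0),
             (Vectors.dense([0.9, 1.9]), 1.0)]
print(majority_vote(neighbors))  # DenseVector([1.0])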
model2.numFeatures
model2.save('E:/kaggle/titanic/dt_model10')

# -----------------------------------------------------------------------
# Step 8 for deployment --- save and load model for operationalization purposes
from pyspark.ml.classification import DecisionTreeClassificationModel

model3 = DecisionTreeClassificationModel.load('E:/kaggle/titanic/dt_model10')
model3.depth
model3.numFeatures

from pyspark.ml.linalg import Vectors

predict_df = spark.createDataFrame(
    [(1, Vectors.dense(1.0, 0.0, 1.0, 0.0, 1.0, 0.0))], ['index', 'Features'])
predict_df.show()
model3.transform(predict_df).select('prediction').first()[0]

# ------------------------------------------------------------------------
training1 = model2.transform(training)
training1.show(5)
PredictionandLabels = training1.select(training1.prediction, training1.Survived).rdd
PredictionandLabels.collect()

from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
# metrics1 = BinaryClassificationMetrics(PredictionandLabels)
# (train score / train accuracy --- )
    SiteCategoryEncoder, AppCategoryEncoder, DeviceTypeEncoder, DeviceConnTypeEncoder,
    C15Encoder, C16Encoder, C18Encoder, C19Encoder, C21Encoder, FeatureAssembler
])
modelTmp = pipelineTmp.fit(schemaClick)
tmp = modelTmp.transform(schemaClick).select("click", "VectoredFeatures")
tmp.registerTempTable("CLICK")

# Select click and VectoredFeatures from table "CLICK" and create a new DataFrame as results
results = sqlContext.sql("SELECT click, VectoredFeatures from CLICK")
results.show()

# Create labeled points from the click and VectoredFeatures columns
click_transformed = results.select('click', 'VectoredFeatures').rdd.map(
    lambda row: LabeledPoint(float(row.click),
                             Vectors.dense((row.VectoredFeatures).toArray())))
click_transformed.take(2)

# Divide the data into training and test sets
weights = [.8, .2]
seed = 15
ClickTrain, ClickTest = click_transformed.randomSplit(weights, seed)

# Train a logistic regression model with stochastic gradient descent on the training set
modelGD = LogisticRegressionWithSGD.train(ClickTrain, iterations=15, step=1,
                                          miniBatchFraction=1, regType=None,
                                          validateData=False)
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Spark MLlib") \
    .config("spark.master", "local") \
    .getOrCreate()

######################################################
# Example 1 - Dense & Sparse Vectors
######################################################
from pyspark.ml.linalg import Vectors

denseVec = Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)

size = 12
idx = [1, 2, 10, 11]  # locations of non-zero elements in vector
values = [12.0, 32.0, 110.0, 27.0]
sparseVec = Vectors.sparse(size, idx, values)

print("denseVec: ", denseVec)
print("sparseVec: ", sparseVec)

spark.stop()
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("EstimatorTransformerParamExample")\
        .getOrCreate()

    # $example on$
    # Prepare training data from a list of (label, features) tuples.
    training = spark.createDataFrame([
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a transformer produced by an Estimator),
    # we can view the parameters it used during fit().
    # This prints the parameter (name: value) pairs, where names are unique IDs for this
def add_features(feat, *other):
    # Append extra scalar values to an existing ml Vector and return a new dense vector.
    # list() is needed in Python 3, where map() returns an iterator that np.append cannot consume.
    raw = feat.toArray()
    return Vectors.dense(np.append(raw, list(map(float, other))))
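# Hedged usage sketch for add_features above: the sample vector and extra scalars
# are illustrative assumptions, not values from the original code.
import numpy as np
from pyspark.ml.linalg import Vectors

v = Vectors.dense([1.0, 2.0])
print(add_features(v, 3, 4.5))  # DenseVector([1.0, 2.0, 3.0, 4.5])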
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Spark MLlib") \
    .config("spark.master", "local") \
    .getOrCreate()
sc = spark.sparkContext

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

bdf = sc.parallelize([
    Row(label=1.0, weight=1.0, features=Vectors.dense(0.0, 5.0)),
    Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),
    Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),
    Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))
]).toDF()

blor = LogisticRegression(regParam=0.01, weightCol="weight")
blorModel = blor.fit(bdf)
blorModel.coefficients
blorModel.intercept

test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
blorModel.transform(test1).head().prediction

save_path = "C:\\PySpark\\spark_ml\\saved_models\\logistic_regression_example_1\\"
estimator_path = save_path + "lr"
from collections import Counter
from functools import partial
from functools import reduce

import pandas as pd
from pyspark import SparkContext
from pyspark.ml.feature import PCAModel
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import DataFrame
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, FloatType

get_cluster_size = udf(lambda x: len(x), IntegerType())
transform_to_vec = udf(lambda x: Vectors.dense(x), VectorUDT())
get_component1 = udf(lambda x: x.toArray().tolist()[0], FloatType())
get_component2 = udf(lambda x: x.toArray().tolist()[1], FloatType())


def unionAll(dfs):
    return reduce(DataFrame.unionAll, dfs)


def get_cluster_purity(images_p, b_mapping):
    total_c = list()
    for img in images_p:
        img_c = b_mapping.value[img.split("/")[-1]]
        total_c += img_c
    N = len(images_p)
# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

gbtModel = model.stages[1]
print(gbtModel)  # summary only

from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors

training = spark.createDataFrame([
    (1.218, 1.0, Vectors.dense(1.560, -0.605)),
    (2.949, 0.0, Vectors.dense(0.346, 2.158)),
    (3.627, 0.0, Vectors.dense(1.380, 0.231)),
    (0.273, 1.0, Vectors.dense(0.520, 1.151)),
    (4.199, 0.0, Vectors.dense(0.795, -0.226))], ["label", "censor", "features"])
quantileProbabilities = [0.3, 0.6]
aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities,
                            quantilesCol="quantiles")
model = aft.fit(training)

# Print the coefficients, intercept and scale parameter for AFT survival regression
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))
print("Scale: " + str(model.scale))
model.transform(training).show(truncate=False)
def transData(data):
    # Pack all columns except the last into a dense feature vector; the last column is the label.
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1]), r[-1]]).toDF(
        ['features', 'label'])
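# Hedged usage sketch for the transData helper above: the toy DataFrame below is an
# assumption, chosen only to show the (features, label) layout the helper produces.
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame([(1.0, 2.0, 0.0), (3.0, 4.0, 1.0)], ["x1", "x2", "y"])
transData(raw).show()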
## wine_data_label_indexed = model.transform(wine_data)
## wine_data_label_indexed.show(5)
## # wine_data = wine_data.withColumn('string_quality', wine_data.quality.cast('string'))
##
## from pyspark.ml.feature import StringIndexer
## indexer = StringIndexer(inputCol="label", outputCol="indexed_label")
## model = indexer.fit(ml_wine_data)
## wine_data_label_indexed = model.transform(ml_wine_data)
## wine_data_label_indexed.show(5)

# convert data into featuresCol and labelCol structure
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

ml_wine_data = wine_data.rdd.map(lambda r: [Vectors.dense(r[:-1]), r[-1]]).toDF(
    ['featuresCol', 'label'])
ml_wine_data.show(5)

## from pyspark.ml.feature import VectorIndexer
## indexer = VectorIndexer(maxCategories=4, inputCol='featuresCol', outputCol='indexed_features')
## model = indexer.fit(wine_data_label_indexed)
## wine_data_feature_indexed = model.transform(wine_data_label_indexed)

## splitting data into training and test sets
training, test = ml_wine_data.randomSplit(weights=[0.7, 0.3], seed=123)
training.show(5)

## naive bayes classifier
def dataset_multinomial(spark_session):
    return spark_session.createDataFrame(
        [(1.0, Vectors.dense(1.0)),
         (0.0, Vectors.sparse(1, [], [])),
         (2.0, Vectors.dense(0.5))] * 100,
        ["label", "features"],
    ).cache()
Run with:
  bin/spark-submit examples/src/main/python/ml/correlation_example.py
"""
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("CorrelationExample") \
        .getOrCreate()

    # $example on$
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])

    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
    # $example off$

    spark.stop()
from pyspark.sql import Row
from pyspark.sql.functions import array, col, udf
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.ml.linalg import Vectors, VectorUDT
from sparkaid import flatten

df0 = sample_300_df2.select(["_id", "chordRatio", "chordRatioMinHash"])
df0_flat = flatten(df0)
columns_list1 = df0_flat.columns[1:-1]
array_df = df0_flat.select('_id', 'chordRatioMinHash',
                           array(columns_list1).alias('chordRatioJS'))

# fill NaNs with zeros in the array column
df2_flat = df0_flat.na.fill(float(0))
columns_list2 = df2_flat.columns[1:-1]
array_df2 = df2_flat.select('_id', 'chordRatioMinHash',
                            array(columns_list2).alias('chordRatioJS_no_Nulls'))

to_vector = udf(lambda a: Vectors.dense(a), VectorUDT())
data = array_df2.select('_id', 'chordRatioMinHash', "chordRatioJS_no_Nulls",
                        to_vector("chordRatioJS_no_Nulls").alias("chordRatioWJS"))
data.show(1, truncate=False)

import scipy.sparse
from pyspark.ml.linalg import _convert_to_vector


# convert a dense vector to a sparse vector
def dense_to_sparse(vector):
    return _convert_to_vector(scipy.sparse.csc_matrix(vector.toArray()).T)


to_sparse = udf(dense_to_sparse, VectorUDT())
data_sparse = data.withColumn("sparseChordRatioJS", to_sparse(col("chordRatioWJS")))
# data_sparse2 = data_sparse.select('_id', 'chordRatio_for_minHash', 'sparseChordRatioJS')

indices_udf = udf(lambda vector: vector.indices.tolist(), ArrayType(IntegerType()))
def toSparseVector(index, values):
    # Sort the (day index, quantity) pairs by day index, then build a sparse vector.
    day_list_index, qty_list_values = zip(*sorted(zip(index, values)))
    # 367 would be needed for a leap year (1 to 366 + 1)
    return Vectors.sparse(366, day_list_index, qty_list_values)
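# Hedged usage sketch for toSparseVector above: the day-of-year indices and
# quantities below are illustrative assumptions.
from pyspark.ml.linalg import Vectors

print(toSparseVector([5, 2, 9], [1.0, 3.0, 2.0]))
# SparseVector(366, {2: 3.0, 5: 1.0, 9: 2.0})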
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
         (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
         (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]
dfA = spark.createDataFrame(dataA, ["id", "features"])

dataB = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
         (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
         (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]
parts = lines.map(lambda row: row.value.split(","))
df = spark.createDataFrame(parts, [
    "id", "accommodates", "bathrooms", "bedrooms", "beds", "price",
    "Shared room", "Entire home/apt", "Private room", "Queens", "Brooklyn",
    "Staten Island", "Manhattan", "Bronx", "testTrain", "listId"
])

# Put the columns in the same order
testTrainData = df.select("accommodates", "bathrooms", "bedrooms", "beds", "price",
                          "Shared room", "Entire home/apt", "Private room", "Queens",
                          "Brooklyn", "Staten Island", "Manhattan", "Bronx",
                          "testTrain", "listId")

# Convert the DataFrame into (features, label) form for MLlib
datak = testTrainData.rdd.map(lambda x: (Vectors.dense(x[0:-2]), x[-1])).toDF(
    ["features", "label"])

# Train a k-means model.
kmeans = KMeans().setK(10).setSeed(123)
model = kmeans.fit(datak)

# Make predictions
predictions = model.transform(datak)

# Clustered dataframe
testTrainData = testTrainData.select('listId', 'testTrain')
testTrainData = testTrainData.withColumnRenamed('listId', 'newId')
OuterCluster = dataset.join(predictions, dataset.listId == predictions.label)
OuterCluster = OuterCluster.drop('features').drop('label')
OuterCluster = OuterCluster.join(testTrainData,
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 8 15:08:11 2018

@author: luogan
"""

from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession\
    .builder \
    .appName("dataFrame") \
    .getOrCreate()

df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]),),
                            (Vectors.dense([0.0, 0.0]),),
                            (Vectors.dense([3.0, -1.0]),)], ["features"])

polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
polyDF = polyExpansion.transform(df)
polyDF.show(truncate=False)
spark = SparkSession.builder.getOrCreate()

# Principal Component Analysis (PCA)
# PCA is a statistical procedure that uses an orthogonal transformation
# to convert a set of observations of possibly correlated variables
# into a set of values of linearly uncorrelated variables called
# principal components.
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]

df = spark.createDataFrame(data, ["features"])
df.show()

# The example below shows how to project 5-dimensional feature vectors into
# 3-dimensional principal components.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# $example on$
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ElementwiseProductExample")\
        .getOrCreate()

    # $example on$
    # Create some vector data; also works for sparse vectors
    data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
    df = spark.createDataFrame(data, ["vector"])
    transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                     inputCol="vector", outputCol="transformedVector")
    # Batch transform the vectors to create new column:
    transformer.transform(df).show()
    # $example off$

    spark.stop()
path = "./resources/" angry_df = ImageSchema.readImages(path + "0/").withColumn("label", lit(0)) happy_df = ImageSchema.readImages(path + "3/").withColumn("label", lit(1)) sad_df = ImageSchema.readImages(path + "4/").withColumn("label", lit(2)) sc = spark.sparkContext log4jLogger = sc._jvm.org.apache.log4j log = log4jLogger.Logger.getLogger(__name__) log.info("pyspark script logger initialized") df1 = angry_df.union(happy_df).union(sad_df) parse_ = udf(lambda a: Vectors.dense(a), VectorUDT()) df = df1.withColumn("features", parse_(df1["image.data"])) train, test, _ = df.randomSplit([0.1, 0.05, 0.85]) lr = LogisticRegression(maxIter=100, regParam=0.05, elasticNetParam=0.3, featuresCol="features", labelCol="label") train.cache() p = Pipeline(stages=[lr]) p_model = p.fit(train) predictions = p_model.transform(test)
spark = SparkSession.builder.config("spark.sql.warehouse.dir",
                                    "file:///C:/temp").appName("LinearRegression").getOrCreate()

inputLines = spark.sparkContext.textFile("file:///Users/lesli/BigData/TA3/College.csv")


def parseLine(line):
    fields = line.split(',')
    SchoolType = fields[1]
    SF_Ratio = float(fields[15])
    Grad_Rate = float(fields[18])
    return (SchoolType, SF_Ratio, Grad_Rate)


parsedLines = inputLines.map(parseLine)
Private = parsedLines.filter(lambda x: "Yes" in x[0])
data = Private.map(lambda x: (Vectors.dense(float(x[1])), float(x[2]))).cache()
# data = inputLines.map(lambda x: x.split(",")).map(lambda x: (Vectors.dense(float(x[15])), float(x[18]))).cache()
# Column [18] is Grad_Rate (the Y variable); column [15] is the predictor SF.Ratio

# Convert this RDD to a DataFrame
colNames = ['SF_Ratio', 'Grad_Rate']  # Grad_Rate is the label (Y)
df = data.toDF(colNames)

# Let's split our data into training data and testing data
trainTest = df.randomSplit([0.8, 0.2])
trainingDF = trainTest[0]
testDF = trainTest[1]
wholeDF = df
def transData(data):
    # Return a DataFrame with the customer id and an RFM feature vector built
    # from the remaining columns.
    return data.rdd.map(lambda r: [r[0], Vectors.dense(r[1:])]).toDF(['CustomerID', 'rfm'])
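# Hedged usage sketch for this transData variant: the toy RFM table below
# (CustomerID, recency, frequency, monetary) is an assumption for illustration.
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
rfm_raw = spark.createDataFrame([(1, 10.0, 3.0, 250.0), (2, 40.0, 1.0, 35.0)],
                                ["CustomerID", "recency", "frequency", "monetary"])
transData(rfm_raw).show()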
def zero_features(feat):
    # Zero out the entries listed in `idxs` (captured from the enclosing scope)
    # and return the result as a dense vector.
    raw = feat.toArray()
    for idx in idxs:
        raw[idx] = 0.
    return Vectors.dense(raw)
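# Hedged usage sketch for zero_features above: `idxs` is assumed to be defined in the
# enclosing scope, as the function relies on; the sample values are illustrative.
from pyspark.ml.linalg import Vectors

idxs = [1, 3]
print(zero_features(Vectors.dense([1.0, 2.0, 3.0, 4.0])))  # DenseVector([1.0, 0.0, 3.0, 0.0])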