def test_save_load_simple_estimator(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()

    # test save/load of CrossValidator
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
    self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

    # test save/load of CrossValidatorModel
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
def test_save_load_simple_estimator(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()

    # test save/load of TrainValidationSplit
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
    self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

    # test save/load of TrainValidationSplitModel
    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def test_expose_sub_models(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid,
                               evaluator=evaluator, collectSubModels=True)
    tvsModel = tvs.fit(dataset)
    self.assertEqual(len(tvsModel.subModels), len(grid))

    # Test that the default value for the "persistSubModels" option is "true"
    testSubPath = temp_path + "/testTrainValidationSplitSubModels"
    savingPathWithSubModels = testSubPath + "cvModel3"
    tvsModel.save(savingPathWithSubModels)
    tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
    self.assertEqual(len(tvsModel3.subModels), len(grid))
    tvsModel4 = tvsModel3.copy()
    self.assertEqual(len(tvsModel4.subModels), len(grid))

    savingPathWithoutSubModels = testSubPath + "cvModel2"
    tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
    tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
    self.assertEqual(tvsModel2.subModels, None)

    for i in range(len(grid)):
        self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid)
def test_output_columns(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr, parallelism=1)
    model = ovr.fit(df)
    output = model.transform(df)
    self.assertEqual(output.columns, ["label", "features", "rawPrediction", "prediction"])
def test_support_for_weightCol(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8), 1.0),
                                     (1.0, Vectors.sparse(2, [], []), 1.0),
                                     (2.0, Vectors.dense(0.5, 0.5), 1.0)],
                                    ["label", "features", "weight"])
    # classifier inherits hasWeightCol
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr, weightCol="weight")
    self.assertIsNotNone(ovr.fit(df))
    # classifier doesn't inherit hasWeightCol
    dt = DecisionTreeClassifier()
    ovr2 = OneVsRest(classifier=dt, weightCol="weight")
    self.assertIsNotNone(ovr2.fit(df))
def test_parallelism_doesnt_change_output(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=1)
    modelPar1 = ovrPar1.fit(df)
    ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=2)
    modelPar2 = ovrPar2.fit(df)
    for i, model in enumerate(modelPar1.models):
        self.assertTrue(np.allclose(model.coefficients.toArray(),
                                    modelPar2.models[i].coefficients.toArray(), atol=1E-4))
        self.assertTrue(np.allclose(model.intercept, modelPar2.models[i].intercept, atol=1E-4))
def test_offset(self):
    df = self.spark.createDataFrame(
        [(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
         (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)),
         (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)),
         (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0))],
        ["label", "weight", "offset", "features"])
    glr = GeneralizedLinearRegression(family="poisson", weightCol="weight", offsetCol="offset")
    model = glr.fit(df)
    self.assertTrue(np.allclose(model.coefficients.toArray(), [0.664647, -0.3192581],
                                atol=1E-4))
    self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1E-4))
def test_copy(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    ovr1 = ovr.copy({lr.maxIter: 10})
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
    model = ovr.fit(df)
    model1 = model.copy({model.predictionCol: "indexed"})
    self.assertEqual(model1.getPredictionCol(), "indexed")
def test_binomial_logistic_regression_with_bound(self):
    df = self.spark.createDataFrame(
        [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
         (0.0, 2.0, Vectors.dense(1.0, 2.0)),
         (1.0, 3.0, Vectors.dense(2.0, 1.0)),
         (0.0, 4.0, Vectors.dense(3.0, 3.0))],
        ["label", "weight", "features"])
    lor = LogisticRegression(regParam=0.01, weightCol="weight",
                             lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
                             upperBoundsOnIntercepts=Vectors.dense(0.0))
    model = lor.fit(df)
    self.assertTrue(
        np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
    self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
def test_bisecting_kmeans_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    bkm = BisectingKMeans(k=2)
    model = bkm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 20)
def test_kmeans_summary(self):
    data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
            (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 1)
def reduce(inputpath, alg, k):
    n_data = 0
    n_features = 0
    result = "successful!"
    inputdir = os.path.dirname(inputpath)
    print("inputdir: " + inputdir + result)

    # Count the input rows and record the per-row feature count.
    inputfile = open(inputpath, 'r')
    for line in inputfile:
        input_n = len(line.split(" "))
        n_data += 1
    inputfile.close()

    if int(k) >= input_n:
        print("reduced features must be smaller than input features.")
        result = "reduced features must be smaller than input features."
    else:
        # `sc` is a SparkContext created by the surrounding application.
        lines = sc.textFile(inputpath).map(lambda x: x.split(" "))
        lines = lines.map(lambda x: (x[0], [float(y) for y in x[1:]]))
        df = lines.map(lambda x: Row(labels=x[0], features=Vectors.dense(x[1]))).toDF()
        if alg == "pca":
            output_data = pca(inputdir, df, alg, k)

        output_data = inputdir + "/" + alg + str(k) + "_Data"
        inputfile = open(output_data, 'r')
        file_size = str(os.stat(output_data).st_size)
        counter = 0
        n_features = '0'
        for line in inputfile:
            input_n = len(line.split(" "))
            n_features = str(input_n)
            counter += 1
        inputfile.close()
        n_data = str(counter)

        result = "File: " + os.path.basename(output_data) + '</br>'
        result += "Path: " + os.path.dirname(output_data) + '/' + alg + str(k) + "_Features/" + '</br>'
        result += "Dimension: " + n_data + " x " + n_features + "</br>"
        result += "Size: " + file_size + ' bytes'
        print(result)
        print("Dimension reduction finished!")

    context = {'n_data': n_data, 'n_features': n_features, 'result': result}
    return context
def test_persistence(self):
    # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
    df = self.spark.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    lda_path = path + "/lda"
    dist_model_path = path + "/distLDAModel"
    local_model_path = path + "/localLDAModel"
    # Test LDA
    lda.save(lda_path)
    lda2 = LDA.load(lda_path)
    self._compare(lda, lda2)
    # Test DistributedLDAModel
    distributedModel.save(dist_model_path)
    distributedModel2 = DistributedLDAModel.load(dist_model_path)
    self._compare(distributedModel, distributedModel2)
    # Test LocalLDAModel
    localModel.save(local_model_path)
    localModel2 = LocalLDAModel.load(local_model_path)
    self._compare(localModel, localModel2)
    # Clean up
    try:
        rmtree(path)
    except OSError:
        pass
def test_tweedie_distribution(self):
    df = self.spark.createDataFrame(
        [(1.0, Vectors.dense(0.0, 0.0)),
         (1.0, Vectors.dense(1.0, 2.0)),
         (2.0, Vectors.dense(0.0, 0.0)),
         (2.0, Vectors.dense(1.0, 1.0))],
        ["label", "features"])
    glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)
    model = glr.fit(df)
    self.assertTrue(np.allclose(model.coefficients.toArray(), [-0.4645, 0.3402], atol=1E-4))
    self.assertTrue(np.isclose(model.intercept, 0.7841, atol=1E-4))

    model2 = glr.setLinkPower(-1.0).fit(df)
    self.assertTrue(np.allclose(model2.coefficients.toArray(), [-0.6667, 0.5], atol=1E-4))
    self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1E-4))
def test_java_object_gets_detached(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal",
                          weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    summary = model.summary

    self.assertIsInstance(model, JavaWrapper)
    self.assertIsInstance(summary, JavaWrapper)
    self.assertIsInstance(model, JavaParams)
    self.assertNotIsInstance(summary, JavaParams)

    error_no_object = 'Target Object ID does not exist for this gateway'

    self.assertIn("LinearRegression_", model._java_obj.toString())
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

    model.__del__()

    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

    try:
        summary.__del__()
    except:
        pass

    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        summary._java_obj.toString()
def test_kmean_pmml_basic(self):
    # Most of the validation is done on the Scala side; here we just check
    # that we output text rather than parquet (e.g. that the format flag
    # was respected).
    data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
            (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    path = tempfile.mkdtemp()
    km_path = path + "/km-pmml"
    model.write().format("pmml").save(km_path)
    pmml_text_list = self.sc.textFile(km_path).collect()
    pmml_text = "\n".join(pmml_text_list)
    self.assertIn("Apache Spark", pmml_text)
    self.assertIn("PMML", pmml_text)
def test_vector_size_hint(self):
    df = self.spark.createDataFrame(
        [(0, Vectors.dense([0.0, 10.0, 0.5])),
         (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
         (2, Vectors.dense([2.0, 12.0]))],
        ["id", "vector"])
    sizeHint = VectorSizeHint(inputCol="vector", handleInvalid="skip")
    sizeHint.setSize(3)
    self.assertEqual(sizeHint.getSize(), 3)
    output = sizeHint.transform(df).head().vector
    expected = DenseVector([0.0, 10.0, 0.5])
    self.assertEqual(output, expected)
def test_parallel_evaluation(self):
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvs.setParallelism(1)
    tvsSerialModel = tvs.fit(dataset)
    tvs.setParallelism(2)
    tvsParallelModel = tvs.fit(dataset)
    self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics)
def test_clustering_evaluator_with_cosine_distance(self):
    featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]),
                                [([1.0, 1.0], 1.0), ([10.0, 10.0], 1.0),
                                 ([1.0, 0.5], 2.0), ([10.0, 4.4], 2.0),
                                 ([-1.0, 1.0], 3.0), ([-100.0, 90.0], 3.0)])
    dataset = self.spark.createDataFrame(featureAndPredictions, ["features", "prediction"])
    evaluator = ClusteringEvaluator(predictionCol="prediction", distanceMeasure="cosine")
    self.assertEqual(evaluator.getDistanceMeasure(), "cosine")
    self.assertTrue(np.isclose(evaluator.evaluate(dataset), 0.992671213, atol=1e-5))
from itertools import chain

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import col, collect_list, first, udf


def convert_to_flat_by_sparkpy_v3(df):
    vectorize = udf(lambda vs: Vectors.dense(list(chain.from_iterable(vs))), VectorUDT())
    spark_df = df.orderBy("key", "subkey")
    spark_df = spark_df.groupBy("key").agg(first(col("parameter")).alias("label"),
                                           collect_list("reference").alias("features"))
    spark_df = spark_df.withColumn('features', vectorize('features'))
    spark_df = spark_df.select("label", "features")
    return spark_df
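# A minimal usage sketch for convert_to_flat_by_sparkpy_v3, assuming a toy
# DataFrame with the key/subkey/parameter/reference columns the function
# expects (this data and the `spark` session are illustrative, not from the
# original source).
df = spark.createDataFrame(
    [(1, 1, 0.0, [0.1, 0.2]),
     (1, 2, 0.0, [0.3, 0.4]),
     (2, 1, 1.0, [0.5, 0.6])],
    ["key", "subkey", "parameter", "reference"])
# Each key collapses to one row: a label plus a flattened dense vector,
# e.g. key 1 -> (0.0, [0.1, 0.2, 0.3, 0.4]).
convert_to_flat_by_sparkpy_v3(df).show()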
def test_gaussian_mixture_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    gmm = GaussianMixture(k=2)
    model = gmm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertTrue(isinstance(s.probability, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 3)
def test_onevsrest(self):
    temp_path = tempfile.mkdtemp()
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)
    ovrPath = temp_path + "/ovr"
    ovr.save(ovrPath)
    loadedOvr = OneVsRest.load(ovrPath)
    self._compare_pipelines(ovr, loadedOvr)
    modelPath = temp_path + "/ovrModel"
    model.save(modelPath)
    loadedModel = OneVsRestModel.load(modelPath)
    self._compare_pipelines(model, loadedModel)
def test_type_error(self):
    df = self.spark.createDataFrame([("a", 0), ("b", 0)]).toDF("features", "key")
    keyedPCA = KeyedEstimator(sklearnEstimator=PCA())
    self.assertRaises(TypeError, keyedPCA.fit, df)

    df = self.spark.createDataFrame([(Vectors.dense([i]), [i], 0) for i in range(10)])
    df = df.toDF("features", "y", "key")
    keyedLR = KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y")
    self.assertRaises(TypeError, keyedLR.fit, df)
def mldemo():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])

    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
def test_expose_sub_models(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    numFolds = 3
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                        numFolds=numFolds, collectSubModels=True)

    def checkSubModels(subModels):
        self.assertEqual(len(subModels), numFolds)
        for i in range(numFolds):
            self.assertEqual(len(subModels[i]), len(grid))

    cvModel = cv.fit(dataset)
    checkSubModels(cvModel.subModels)

    # Test that the default value for the "persistSubModels" option is "true"
    testSubPath = temp_path + "/testCrossValidatorSubModels"
    savingPathWithSubModels = testSubPath + "cvModel3"
    cvModel.save(savingPathWithSubModels)
    cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
    checkSubModels(cvModel3.subModels)
    cvModel4 = cvModel3.copy()
    checkSubModels(cvModel4.subModels)

    savingPathWithoutSubModels = testSubPath + "cvModel2"
    cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
    cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
    self.assertEqual(cvModel2.subModels, None)

    for i in range(numFolds):
        for j in range(len(grid)):
            self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid)
def applyEstimator(estimator, x):
    # `oneDimensional`, `shouldPredict`, and `cast` are captured from the
    # enclosing scope; `estimator` is a fitted scikit-learn object.
    if not estimator:
        return None
    if oneDimensional:
        x = [[x]]
    else:
        x = x.toArray().reshape(1, -1)
    if shouldPredict:
        return cast(estimator.predict(x)[0])
    else:
        return Vectors.dense(estimator.transform(x)[0])
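# A sketch of the enclosing scope this helper assumes; the values below are
# illustrative assumptions, not from the original source.
import numpy as np
from sklearn.decomposition import PCA
from pyspark.ml.linalg import Vectors

oneDimensional = False   # features arrive as Spark vectors, not bare scalars
shouldPredict = False    # take the transform() branch
cast = float             # only used on the predict() branch

est = PCA(n_components=1).fit(np.array([[0.0, 0.0], [1.0, 2.0], [2.0, 4.0]]))
print(applyEstimator(est, Vectors.dense([1.0, 2.0])))  # one-component dense vector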
def setUp(self):
    super(MLlibTestCase, self).setUp()
    self.sc = self.spark.sparkContext
    self.sql = self.spark
    self.X = np.array([[1, 2, 3],
                       [-1, 2, 3],
                       [1, -2, 3],
                       [1, 2, -3],
                       [-1, -2, 3],
                       [1, -2, -3],
                       [-1, 2, -3],
                       [-1, -2, -3]])
    self.y = np.array([1, 0, 1, 1, 0, 1, 0, 0])
    data = [(float(self.y[i]), Vectors.dense(self.X[i])) for i in range(len(self.y))]
    self.df = self.sql.createDataFrame(data, ["label", "features"])
def test_parallel_evaluation(self):
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
    evaluator = BinaryClassificationEvaluator()

    # check that a serial fit and a parallel fit produce the same metrics
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cv.setParallelism(1)
    cvSerialModel = cv.fit(dataset)
    cv.setParallelism(2)
    cvParallelModel = cv.fit(dataset)
    self.assertEqual(cvSerialModel.avgMetrics, cvParallelModel.avgMetrics)
def test_apply_binary_term_freqs(self):
    df = self.spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
    n = 10
    hashingTF = HashingTF()
    hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
    output = hashingTF.transform(df)
    features = output.select("features").first().features.toArray()
    expected = Vectors.dense([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]).toArray()
    for i in range(0, n):
        self.assertAlmostEqual(features[i], expected[i], 14,
                               "Error at " + str(i) + ": expected " + str(expected[i]) +
                               ", got " + str(features[i]))
def test_save_load_nested_estimator(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    ova = OneVsRest(classifier=LogisticRegression())
    lr1 = LogisticRegression().setMaxIter(100)
    lr2 = LogisticRegression().setMaxIter(150)
    grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
    evaluator = MulticlassClassificationEvaluator()

    # test save/load of CrossValidator
    cv = CrossValidator(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)

    originalParamMap = cv.getEstimatorParamMaps()
    loadedParamMap = loadedCV.getEstimatorParamMaps()
    for i, param in enumerate(loadedParamMap):
        for p in param:
            if p.name == "classifier":
                self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
            else:
                self.assertEqual(param[p], originalParamMap[i][p])

    # test save/load of CrossValidatorModel
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
import os

import numpy as np
from pyspark.ml.linalg import Vectors

npz_files_path = "/home/lmtruong1512/Codes/BTL_CSDLDPT/extracted_files/extracted_SIFT100"
file_names = os.listdir(npz_files_path)[0:10]
np_arrs = [np.load(os.path.join(npz_files_path, file_name))['arr_0']
           for file_name in file_names]
# Use a list comprehension so printing shows the vectors themselves
# (in Python 3, map() returns a lazy iterator).
dataset = [Vectors.dense(x) for x in np_arrs]
print(dataset)
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 20 19:58:59 2019

@author: amitabh.gunjan
"""
import tempfile

from pyspark.sql import Row, SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import NaiveBayes

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

df = spark.createDataFrame([Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
                            Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
                            Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))])
nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight")
model = nb.fit(df)
model.pi
model.theta

test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF()
result = model.transform(test0).head()
result.prediction
result.probability
result.rawPrediction

test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
model.transform(test1).head().prediction

temp_path = tempfile.mkdtemp()
nb_path = temp_path + "/nb"
nb.save(nb_path)
nb2 = NaiveBayes.load(nb_path)
print('***** Creating opcodes list for each document *********************************************')
# >> (opcode, ((docid, hash, label), cnt))
rdd_opcode = rdd_opcode_cnt.map(lambda x: (x[0][1], (x[0][0], x[1])))\
    .leftOuterJoin(rdd_opcode_distinct)\
    .map(lambda x: (x[1][0][0], (x[1][1], x[1][0][1])))\
    .groupByKey().map(lambda x: (x[0], list(x[1])))

print('***** Creating opcodes list with document information *************************************')
# >> (docid, hash, label, vector.dense(opcode))
opcode = rdd_train.map(lambda x: (x[1], (x[0], x[2]))).leftOuterJoin(rdd_opcode)\
    .map(lambda x: (x[1][0][0], x[0], x[1][0][1], list(numpy_cartesian(x[1][1], N))))\
    .map(lambda x: (x[0], x[1], x[2], Vectors.dense(x[3])))

print('***** RF feature selection ****************************************************************')
opcode_imp = RF_features_select(opcode)
# >> (index, feature_importance)
rdd_opcode_imp = sc.parallelize(opcode_imp)
# opcode_r >> (docid, hash, label, vectors.dense(opcode))
# rdd_opcode_distinct_r >> (opcode, index_r)
opcode_r, rdd_opcode_distinct_r, N_r = feature_filter(
    rdd_opcode_imp, rdd_opcode_distinct, rdd_opcode_cnt, rdd_train)

print('***** Transforming RDD into DataFrame *****************************************************')
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.appName('data').getOrCreate()

df = spark.read.csv('hdfs:///user/maria_dev/MachineLearning/fake_customers.csv',
                    inferSchema=True, header=True)
df.show()

df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"),
                            (3, "a"), (4, "a"), (5, "c")],
                           ["user_id", "category"])
df.show()

indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(df).transform(df)
indexed.show()

dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])
dataset.show()

assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                            outputCol="features")
output = assembler.transform(dataset)
print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
output.select("features", "clicked").show()
from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

training = spark.createDataFrame([(1.218, 1.0, Vectors.dense(1.560, -0.605)),
                                  (2.949, 0.0, Vectors.dense(0.346, 2.158)),
                                  (3.627, 0.0, Vectors.dense(1.380, 0.231)),
                                  (0.273, 1.0, Vectors.dense(0.520, 1.151)),
                                  (4.199, 0.0, Vectors.dense(0.795, -0.226))],
                                 ["label", "censor", "features"])
quantileProbabilities = [0.3, 0.6]
aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities,
                            quantilesCol="quantiles")
model = aft.fit(training)

print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))
print("Scale: " + str(model.scale))
model.transform(training).show(truncate=False)

spark.stop()
def f(x, y):
    ret = {}
    ret['features'] = Vectors.dense(float(x[0]), float(x[1]), float(x[2]), float(x[3]))
    ret['label'] = str(y)
    return ret
def join_vec(term_len, tf_title, tf_desc):
    return Vectors.dense([int(term_len), int(tf_title), int(tf_desc)])
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("EstimatorTransformerParamExample")\
        .getOrCreate()

    # $example on$
    # Prepare training data from a list of (label, features) tuples.
    training = spark.createDataFrame([
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a transformer produced by an Estimator),
    # we can view the parameters it used during fit().
    # This prints the parameter (name: value) pairs, where names are unique
    # IDs for this LogisticRegression instance.
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("MinMaxScalerExample")\
        .getOrCreate()

    # # $example on$
    # dataFrame = spark.createDataFrame([
    #     (0, Vectors.dense([1.0, 0.1, -1.0]),),
    #     (1, Vectors.dense([2.0, 1.1, 1.0]),),
    #     (2, Vectors.dense([3.0, 10.1, 3.0]),)
    # ], ["id", "features"])
    dataFrame = spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.1, -8.0]),),
        (1, Vectors.dense([2.0, 1.0, -4.0]),),
        (2, Vectors.dense([4.0, 10.0, 8.0]),)
    ], ["id", "features"])
    dataFrame.show()

    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics and generate MinMaxScalerModel
    scalerModel = scaler.fit(dataFrame)

    # Rescale each feature to range [min, max].
    scaledData = scalerModel.transform(dataFrame)
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import FValueTest
# $example off$

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("FValueTestExample") \
        .getOrCreate()

    # $example on$
    data = [(4.6, Vectors.dense(6.0, 7.0, 0.0, 7.0, 6.0, 0.0)),
            (6.6, Vectors.dense(0.0, 9.0, 6.0, 0.0, 5.0, 9.0)),
            (5.1, Vectors.dense(0.0, 9.0, 3.0, 0.0, 5.0, 5.0)),
            (7.6, Vectors.dense(0.0, 9.0, 8.0, 5.0, 6.0, 4.0)),
            (9.0, Vectors.dense(8.0, 9.0, 6.0, 5.0, 4.0, 4.0)),
            (9.0, Vectors.dense(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))]
    df = spark.createDataFrame(data, ["label", "features"])

    ftest = FValueTest.test(df, "features", "label").head()
    print("pValues: " + str(ftest.pValues))
    print("degreesOfFreedom: " + str(ftest.degreesOfFreedom))
    print("fvalue: " + str(ftest.fValues))
    # $example off$

    spark.stop()
import numpy as np
from math import sqrt
from operator import add

from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import *
from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel, BisectingKMeansSummary
#from pyspark.mllib.clustering import KMeans, KMeansModel

df = spark.read.parquet("regex_table.parquet")
df1 = df.rdd.map(lambda x: (x[0], x[1], Vectors.dense(x[2])))
df1 = df1.toDF() \
    .withColumnRenamed('_1', 'table') \
    .withColumnRenamed('_2', 'column') \
    .withColumnRenamed('_3', 'features')

#df = spark.createDataFrame([["a", "a1", Vectors.dense([0.5, 0.5, 0.0, 0.0])],
#                            ["a", "a2", Vectors.dense([0.1, 0.2, 0.3, 0.4])],
#                            ["a", "a3", Vectors.dense([0.2, 0.1, 0.3, 0.4])],
#                            ["b", "b1", Vectors.dense([0.3, 0.1, 0.2, 0.4])],
#                            ["b", "b2", Vectors.dense([0.4, 0.1, 0.2, 0.3])],
#                            ["b", "b3", Vectors.dense([0.5, 0.5, 0.0, 0.0])]],
#                           ["table", "column", "features"])
#vso = df.rdd.map(lambda x: np.array((x[0], x[1]), x[2]))

def model_list():
    clist = []
    df2 = df1.select('features')
    df2.cache()
    df1.cache()
    for i in range(2, 20):
        kmeans = BisectingKMeans(k=i, minDivisibleClusterSize=1.0)
        model = kmeans.fit(df2)
        WSSSE = model.computeCost(df1)
def vectorize(data):
    return data.rdd.map(lambda r: [r[0], Vectors.dense(r[1:])]).toDF(['label', 'features'])
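# A minimal usage sketch, assuming a DataFrame whose first column is the
# label and the remaining columns are numeric features (toy data and `spark`
# session are illustrative, not from the original source).
df = spark.createDataFrame([(0.0, 1.0, 2.0), (1.0, 3.0, 4.0)], ["y", "x1", "x2"])
vectorize(df).show()  # two columns: label, features=[x1, x2]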
from pyspark.ml.classification import LinearSVCModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark_session = SparkSession \
        .builder \
        .appName("Spark ML SVM") \
        .getOrCreate()

    model = LinearSVCModel.load("SVMModel")
    print("Model loaded")

    test = spark_session.createDataFrame([
        (0, Vectors.dense([1.0, 1.2])),
        (1, Vectors.dense([5.3, 2.4])),
        (2, Vectors.dense([1.2, 1.3])),
        (3, Vectors.dense([5.1, 2.3]))], ["label", "features"]) \
        .cache()

    for row in test.collect():
        print(row)

    prediction = model.transform(test)
    prediction.printSchema()
    prediction.show()

    selected = prediction.select("label", "prediction")
    selected.printSchema()
from __future__ import print_function

from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

if __name__ == "__main__":

    # Create a SparkSession (Note, the config section is only for Windows!)
    spark = SparkSession.builder.appName("LinearRegression").getOrCreate()

    # Load up our data and convert it to the format MLlib expects.
    inputLines = spark.sparkContext.textFile("data/regression.txt")
    data = inputLines.map(lambda x: x.split(",")).map(
        lambda x: (float(x[0]), Vectors.dense(float(x[1]))))

    # Convert this RDD to a DataFrame
    colNames = ["label", "features"]
    df = data.toDF(colNames)

    # Note, there are lots of cases where you can avoid going from an RDD to a
    # DataFrame. Perhaps you're importing data from a real database. Or you are
    # using structured streaming to get your data.

    # Let's split our data into training data and testing data
    trainTest = df.randomSplit([0.5, 0.5])
    trainingDF = trainTest[0]
    testDF = trainTest[1]

    # Now create our linear regression model
# Copyright 2017 Mario Juez. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Example of prediction using the linear regression model.
# Must be run from PySpark, in the cluster's SSH console.
#
# Author: Mario Juez <*****@*****.**>

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegressionModel

test = spark.sparkContext.parallelize(
    [Row(features=Vectors.dense(27, 30, 41, 63, 9))]).toDF()
model = LinearRegressionModel.load(
    "gs://seminario-gcp/ml-models/pyspark-natality-lr-model")
result = model.transform(test).head()
print(result.prediction)
def concat_rf_feats(line):
    all_feature = list(line["chi"]) + list(line["as"]) + list(line["geo"]) + list(line["mob"])
    return Row(feats=Vectors.dense(all_feature), label=line["label"])
sc.parallelize([("Preprocessing", preprocess_time), ("Training time", training_time), ("Testing time", testing_time), ("Total time", total_timetaken)], 1).saveAsTextFile(sys.argv[2] + "_Time_taken") data1 = crimes.where(crimes.Latitude.isNotNull() & crimes.Longitude.isNotNull() & crimes.ID.isNotNull()) # Choosing latitude and longitude after removing rows with null values and outliers for better results data_frame = data1 \ .rdd \ .filter(lambda x: (40.0<float(x[19])<42.0))\ .filter(lambda x: (-88.0<float(x[20])<-86.0))\ .map(lambda x: (x[0], Vectors.dense(float(x[19]), float(x[20])))).toDF(["ID", "features"]) (trainingData1, testData1) = data_frame.randomSplit([0.7, 0.3], seed=100) # Number of cluster choosen k = 6 kmeans = KMeans().setK(k).setSeed(1) kmeans_model = kmeans.fit(trainingData1) # Evaluate clustering by computing Within Set Sum of Squared Errors. wssse = kmeans_model.computeCost(trainingData1) print("Within Set Sum of Squared Errors = " + str(wssse)) # cluster center reults centers = kmeans_model.clusterCenters()
def parsePoint(line):
    return (line[-1], Vectors.dense(line[:-1]))
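# A minimal usage sketch for parsePoint, assuming each input record is a
# numeric list whose last element is the label (toy data; `sc` is an
# existing SparkContext).
rdd = sc.parallelize([[1.0, 2.0, 0.0], [3.0, 4.0, 1.0]])
df = rdd.map(parsePoint).toDF(["label", "features"])
df.show()  # label 0.0 with features [1.0, 2.0]; label 1.0 with [3.0, 4.0]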
float(p[17]), float(p[18]), float(p[19]), float(p[20]), float(p[21])

# In[338]:

# Create a DataFrame
lending_df = spark.createDataFrame(lend_RDD)
lending_df.show(10)

# In[339]:

# Convert feature type to vector
lending_df_vectors = lending_df.rdd.map(lambda row: Row(
    label=row["lable"],
    features=Vectors.dense(row["featuresList"])
)).toDF()

# In[340]:

lending_df_vectors

# In[341]:

# Scale the data
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)
def fill_nan(vec: np.ndarray, num=0):
    # np.nan_to_num's second positional argument is `copy`, so the fill
    # value must be passed via the `nan` keyword (NumPy >= 1.17).
    return Vectors.dense(np.nan_to_num(vec, nan=num))
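# Quick checks of the intended behavior (illustrative, not from the source):
import numpy as np

fill_nan(np.array([1.0, np.nan, 3.0]))           # DenseVector([1.0, 0.0, 3.0])
fill_nan(np.array([1.0, np.nan, 3.0]), num=-1)   # DenseVector([1.0, -1.0, 3.0])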
def predict(features_tab, tab_out, model_path, veh):
    # 1. Configuration
    spark = SparkSession \
        .builder \
        .master("yarn") \
        .appName("tianzw_vol_fading_predict_second_versions") \
        .config("spark.sql.warehouse.dir", "hdfs://neicluster/user/hive/warehouse") \
        .enableHiveSupport() \
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    # 2. Prepare the data (`vin_tail` is expected to be defined in the enclosing scope)
    sql = """
        SELECT vin
            , sta_time
            , mils_1000km
            , sta_soc
            , charge_c
            , hours
            , temp
            , days
            , mils_dif
            , cnt_cha
            , vol_cha
            , vol_avg_cha
            , hou_cha
            , c_avg
            , sta_soc_avg_cha
            , end_soc_avg_cha
            , dep_soc_avg_cha
            , sta_soc_mid_cha
            , end_soc_mid_cha
            , dep_soc_mid_cha
            , cnt_tem
            , tem_mid_yea
            , tem_avg_yea
            , tem_dif_yea
            , tem_var_yea
        FROM """ + features_tab + """
        WHERE veh_head = SUBSTR('""" + veh + """',0,1)
            AND veh = '""" + veh + """'
            AND SUBSTR(vin,-1,1) = '""" + vin_tail + """'
    """
    rdd_origin = spark.sql(sql).rdd
    features_rdd = rdd_origin.map(lambda x: (
        x.vin,
        x.sta_time,
        Vectors.dense([
            x.mils_1000km, x.sta_soc, x.charge_c, x.hours, x.temp, x.days,
            x.mils_dif, x.cnt_cha, x.vol_cha, x.vol_avg_cha, x.hou_cha,
            x.c_avg, x.sta_soc_avg_cha, x.end_soc_avg_cha, x.dep_soc_avg_cha,
            x.sta_soc_mid_cha, x.end_soc_mid_cha, x.dep_soc_mid_cha,
            x.cnt_tem, x.tem_mid_yea, x.tem_avg_yea, x.tem_dif_yea,
            x.tem_var_yea
        ]),
    ))
    features_list = features_rdd.collect()
    print("Data extraction succeeded")
    spark_df = spark.createDataFrame(features_list, ["vin", "sta_time", "features"])

    # 3. Model prediction
    # model = GBTRegressor.load(model_path)
    model = GBTRegressionModel.load(model_path)
    print("Model loaded successfully")
    predictions = model.transform(spark_df)
    print("Computation succeeded")
    new_list = [(x.vin, x.sta_time, x.prediction) for x in predictions.collect()]
    result_df = spark.createDataFrame(new_list, ["vin", "sta_time", "vol_fading"])
    result_df = result_df.repartition(1)
    result_df.createOrReplaceTempView("table_temp")

    # Write the data into the Hive table
    # createSQL = """
    #     CREATE TABLE IF NOT EXISTS """ + tab_out + """
    #     (
    #         vin STRING COMMENT 'VIN',
    #         sta_time BIGINT COMMENT 'charge start time (s)',
    #         vol_fading DOUBLE COMMENT 'predicted capacity fading percentage (2 decimal places)'
    #     )
    #     PARTITIONED BY(veh STRING COMMENT 'vehicle model',
    #                    vin_tail STRING COMMENT 'VIN tail digit')
    #     ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    # """
    insertSql = """
        INSERT OVERWRITE TABLE """ + tab_out + """
        PARTITION(veh = '""" + veh + """', vin_tail = '""" + vin_tail + """')
        SELECT vin, sta_time, ROUND(vol_fading,2) AS vol_fading
        FROM table_temp
    """
    # spark.sql("DROP TABLE IF EXISTS " + tab_out)
    # spark.sql(createSQL)
    spark.sql(insertSql)
    print(tab_out + " written successfully")
def cluster(inputpath, alg, k):
    n_data = 0
    n_features = 0
    result = "successful!"
    inputdir = os.path.dirname(inputpath)
    print("inputdir: " + inputdir + result)

    # Count the input rows and record the per-row feature count.
    inputfile = open(inputpath, 'r')
    for line in inputfile:
        input_n = len(line.split(" "))
        n_data += 1
    inputfile.close()

    if int(k) == 1:
        print("k should be greater than 1")
        result = "k should be greater than 1"
    else:
        # `sc` is a SparkContext created by the surrounding application.
        lines = sc.textFile(inputpath).map(lambda x: x.split(" "))
        lines = lines.map(lambda x: (x[0], [float(y) for y in x[1:]]))
        df = lines.map(lambda x: Row(labels=x[0], features=Vectors.dense(x[1]))).toDF()
        if alg == "kmeans":
            output_data = kmeans(inputdir, df, alg, k)

        output_data = inputdir + "/" + alg + str(k) + "_Data"
        inputfile = open(output_data, 'r')
        file_size = str(os.stat(output_data).st_size)
        counter = 0
        n_features = '0'
        for line in inputfile:
            input_n = len(line.split(" "))
            n_features = str(input_n)
            counter += 1
        inputfile.close()
        n_data = str(counter)

        result = "File: " + os.path.basename(output_data) + '</br>'
        result += "Path: " + os.path.dirname(output_data) + '/' + alg + str(k) + "_Features/" + '</br>'
        result += "Dimension: " + n_data + " x " + n_features + "</br>"
        result += "Size: " + file_size + ' bytes'
        print(result)
        print("Clustering finished!")

    context = {'n_data': n_data, 'n_features': n_features, 'result': result}
    return context
def tmpDouble2vec(x):
    return Vectors.dense(x)
def func(x):
    # `feature_indexs` and `label_index` are captured from the enclosing scope.
    features_data = []
    for feature in feature_indexs:
        features_data.append(x[feature])
    return Row(label=x[label_index], features=Vectors.dense(features_data))
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

data = [(Vectors.dense([21.0, 110, 3.90]),),
        (Vectors.dense([22.8, 93, 3.85]),),
        (Vectors.dense([18.1, 105, 2.76]),)]
df = spark.createDataFrame(data, ['features'])

r1 = Correlation.corr(df, 'features', 'pearson').head()
r2 = Correlation.corr(df, 'features', 'spearman').head()

print('Data:')
df.show()  # show() prints the frame itself and returns None
print('Pearson Correlation:')
print(str(r1[0]))
print('Spearman Correlation:')
print(str(r2[0]))

spark.stop()
def vector_from_inputs(r):
    return (r["weight_pounds"],
            Vectors.dense(float(r["mother_age"]),
                          float(r["father_age"]),
                          float(r["gestation_weeks"]),
                          float(r["weight_gain_pounds"]),
                          float(r["apgar_5min"])))
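# A minimal usage sketch: the function only needs `[]`-style access, so a
# plain dict stands in here for the Row/record the original code presumably
# passes (values are illustrative).
row = {"weight_pounds": 7.5, "mother_age": "29", "father_age": "31",
       "gestation_weeks": "38", "weight_gain_pounds": "30", "apgar_5min": "9"}
label, features = vector_from_inputs(row)  # (7.5, DenseVector of 5 features)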
def get_vectors(row):
    """Impute the feature set depending on which fields are present in the raw data."""
    if row['pe_feature_12']:
        pe_feature_12_risk = row['pe_feature_12']['risk']
    else:
        pe_feature_12_risk = 0.
    if row['pe_feature_13']:
        pe_feature_13_risk = row['pe_feature_13']['risk']
    else:
        pe_feature_13_risk = 0.

    if row['feature_26']:
        return (Vectors.dense([
            row['pe_feature_1'], row['pe_feature_2'], row['pe_feature_3'],
            row['pe_feature_4'], row['pe_feature_5'], row['pe_feature_6'],
            row['pe_feature_7'], row['pe_feature_8'], row['pe_feature_9'],
            row['pe_feature_10'], row['pe_feature_11'],
            pe_feature_12_risk, pe_feature_13_risk,
            row['feature_1']['r'], row['feature_2']['c'], row['feature_3']['z'],
            row['feature_4']['k'], row['feature_5']['l'], row['feature_6']['z'],
            row['feature_7']['z'], row['feature_8']['y'], row['feature_9']['n'],
            row['feature_10']['z'], row['feature_11']['z'], row['feature_12']['z'],
            row['feature_13']['z'], row['feature_14']['z'], row['feature_15']['r'],
            row['feature_16']['r'], row['feature_17']['r'], row['feature_18']['z'],
            row['feature_19']['z'], row['feature_20']['g'], row['feature_21']['y'],
            row['feature_22']['z'], row['feature_23']['y'], row['feature_24']['z'],
            row['feature_26']['a_1'], row['feature_26']['a_4'], row['feature_26']['a_5'],
            row['feature_26']['a_6'], row['feature_26']['b_1'], row['feature_26']['b_2'],
            row['feature_26']['b_3'], row['feature_26']['c_1'], row['feature_26']['d_2'],
        ]), row['bbd_qyxx_id'], row['company_name'])
    elif row['feature_1']:
        return (Vectors.dense([
            row['pe_feature_1'], row['pe_feature_2'], row['pe_feature_3'],
            row['pe_feature_4'], row['pe_feature_5'], row['pe_feature_6'],
            row['pe_feature_7'], row['pe_feature_8'], row['pe_feature_9'],
            row['pe_feature_10'], row['pe_feature_11'],
            pe_feature_12_risk, pe_feature_13_risk,
            row['feature_1']['r'], row['feature_2']['c'], row['feature_3']['z'],
            row['feature_4']['k'], row['feature_5']['l'], row['feature_6']['z'],
            row['feature_7']['z'], row['feature_8']['y'], row['feature_9']['n'],
            row['feature_10']['z'], row['feature_11']['z'], row['feature_12']['z'],
            row['feature_13']['z'], row['feature_14']['z'], row['feature_15']['r'],
            row['feature_16']['r'], row['feature_17']['r'], row['feature_18']['z'],
            row['feature_19']['z'], row['feature_20']['g'], row['feature_21']['y'],
            row['feature_22']['z'], row['feature_23']['y'], row['feature_24']['z'],
            # feature_26 is absent: pad its nine slots with zeros
            0., 0., 0., 0., 0., 0., 0., 0., 0.,
        ]), row['bbd_qyxx_id'], row['company_name'])
    else:
        return (Vectors.dense([
            row['pe_feature_1'], row['pe_feature_2'], row['pe_feature_3'],
            row['pe_feature_4'], row['pe_feature_5'], row['pe_feature_6'],
            row['pe_feature_7'], row['pe_feature_8'], row['pe_feature_9'],
            row['pe_feature_10'], row['pe_feature_11'],
            pe_feature_12_risk, pe_feature_13_risk,
            # feature_1 .. feature_26 are all absent: pad the remaining 33 slots
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        ]), row['bbd_qyxx_id'], row['company_name'])
StructField("pdef", DoubleType(), True), StructField("pbeau", DoubleType(), True), StructField("pnum", IntegerType(), True), StructField("s_term_score", DoubleType(), True), StructField("sumclick", LongType(), True), StructField("sumshow", LongType(), True), StructField("uid", LongType(), True) ]) print "begin to map input" train_set = spark.read.csv( "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/train_feature_test/part-*", schema=fieldSchema) train_set_r = train_set.rdd.map( lambda p: Row(label=p.label, features=Vectors.dense(p.ctr, p.pnum, p.pdef, p.pbeau, p. s_term_score, p.sumclick, p.sumshow))) print train_set_r.take(5) print "finish map input" train_set_d = spark.createDataFrame(train_set_r) (training, test) = train_set_d.randomSplit([0.9, 0.1]) #train lr = LogisticRegression(maxIter=10, regParam=0.3) lrModel = lr.fit(training) print "coefficients" print lrModel.coefficients print "intercept" print lrModel.intercept #summary # $example on$ # Extract the summary from the returned LogisticRegressionModel instance trained
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("e").getOrCreate()

training = spark.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
                                  (0.0, Vectors.dense([2.0, 1.0, -1.0])),
                                  (0.0, Vectors.dense([2.0, 1.3, 1.0])),
                                  (1.0, Vectors.dense([0.0, 1.2, -0.5]))],
                                 ["label", "features"])
lr = LogisticRegression(maxIter=10, regParam=0.01)
model_1 = lr.fit(training)

param_map = dict()
param_map[lr.maxIter] = 30
param_map.update({lr.regParam: 0.1, lr.threshold: 0.55})
param_map_new = {lr.probabilityCol: "my_probability"}
param_map_combined = param_map.copy()
param_map_combined.update(param_map_new)
model_2 = lr.fit(training, params=param_map_combined)

test = spark.createDataFrame([(1.0, Vectors.dense([-1.0, 1.5, 1.3])),
                              (0.0, Vectors.dense([3.0, 2.0, -0.1])),
                              (1.0, Vectors.dense([0.0, 2.2, -1.5]))],
                             ["label", "features"])
predict = model_2.transform(test)
import pyspark
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import mlflow
import mlflow.pyspark.ml

if __name__ == '__main__':
    print("MLflow version: {}".format(mlflow.__version__))
    spark = pyspark.sql.SparkSession.builder.appName("BestParams") \
        .getOrCreate()

    dataset = spark.createDataFrame([(Vectors.dense([0.0]), 0.0),
                                     (Vectors.dense([0.4]), 1.0),
                                     (Vectors.dense([0.5]), 0.0),
                                     (Vectors.dense([0.6]), 1.0),
                                     (Vectors.dense([1.0]), 1.0)] * 10,
                                    ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                        evaluator=evaluator, parallelism=2)

    mlflow.pyspark.ml.autolog()
    cvModel = cv.fit(dataset)

    print("Average Metric: {}".format(cvModel.avgMetrics[0]))
    print("Number of folds: {}".format(cvModel.getNumFolds()))
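# A hedged follow-up sketch (not part of the original script): recover the
# best parameter map from the fitted CrossValidator by pairing `avgMetrics`
# with the parameter grid. `cv` and `cvModel` are the names defined above;
# BinaryClassificationEvaluator's metric is larger-is-better, hence argmax.
import numpy as np

best_index = int(np.argmax(cvModel.avgMetrics))
best_params = cv.getEstimatorParamMaps()[best_index]
print({p.name: v for p, v in best_params.items()})  # e.g. {'maxIter': 1}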