Example #1
    def test_save_load_simple_estimator(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
        self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
Example #2
    def test_expose_sub_models(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                                   collectSubModels=True)
        tvsModel = tvs.fit(dataset)
        self.assertEqual(len(tvsModel.subModels), len(grid))

        # Test the default value for option "persistSubModel" to be "true"
        testSubPath = temp_path + "/testTrainValidationSplitSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        tvsModel.save(savingPathWithSubModels)
        tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
        self.assertEqual(len(tvsModel3.subModels), len(grid))
        tvsModel4 = tvsModel3.copy()
        self.assertEqual(len(tvsModel4.subModels), len(grid))

        savingPathWithoutSubModels = testSubPath + "cvModel2"
        tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
        tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
        self.assertEqual(tvsModel2.subModels, None)

        for i in range(len(grid)):
            self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid)
Example #3
 def test_persistence(self):
     # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
     df = self.spark.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # Fit model
     lda = LDA(k=2, seed=1, optimizer="em")
     distributedModel = lda.fit(df)
     self.assertTrue(distributedModel.isDistributed())
     localModel = distributedModel.toLocal()
     self.assertFalse(localModel.isDistributed())
     # Define paths
     path = tempfile.mkdtemp()
     lda_path = path + "/lda"
     dist_model_path = path + "/distLDAModel"
     local_model_path = path + "/localLDAModel"
     # Test LDA
     lda.save(lda_path)
     lda2 = LDA.load(lda_path)
     self._compare(lda, lda2)
     # Test DistributedLDAModel
     distributedModel.save(dist_model_path)
     distributedModel2 = DistributedLDAModel.load(dist_model_path)
     self._compare(distributedModel, distributedModel2)
     # Test LocalLDAModel
     localModel.save(local_model_path)
     localModel2 = LocalLDAModel.load(local_model_path)
     self._compare(localModel, localModel2)
     # Clean up
     try:
         rmtree(path)
     except OSError:
         pass
Example #4
    def test_java_object_gets_detached(self):
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight",
                              fitIntercept=False)

        model = lr.fit(df)
        summary = model.summary

        self.assertIsInstance(model, JavaWrapper)
        self.assertIsInstance(summary, JavaWrapper)
        self.assertIsInstance(model, JavaParams)
        self.assertNotIsInstance(summary, JavaParams)

        error_no_object = 'Target Object ID does not exist for this gateway'

        self.assertIn("LinearRegression_", model._java_obj.toString())
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        model.__del__()

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        try:
            summary.__del__()
        except:
            pass

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            summary._java_obj.toString()
Example #5
    def test_save_load_simple_estimator(self):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)

        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
        self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
        self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
Example #6
 def test_equals(self):
     indices = [1, 2, 4]
     values = [1., 3., 2.]
     self.assertTrue(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.]))
     self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.]))
     self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 0., 2.]))
     self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.]))
Example #7
 def test_output_columns(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr, parallelism=1)
     model = ovr.fit(df)
     output = model.transform(df)
     self.assertEqual(output.columns, ["label", "features", "rawPrediction", "prediction"])
Example #8
 def test_copy(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     ovr1 = ovr.copy({lr.maxIter: 10})
     self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
     self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
     model = ovr.fit(df)
     model1 = model.copy({model.predictionCol: "indexed"})
     self.assertEqual(model1.getPredictionCol(), "indexed")
Example #9
    def test_offset(self):

        df = self.spark.createDataFrame(
            [(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
             (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)),
             (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)),
             (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0))], ["label", "weight", "offset", "features"])

        glr = GeneralizedLinearRegression(family="poisson", weightCol="weight", offsetCol="offset")
        model = glr.fit(df)
        self.assertTrue(np.allclose(model.coefficients.toArray(), [0.664647, -0.3192581],
                                    atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1E-4))
Example #10
 def test_support_for_weightCol(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8), 1.0),
                                      (1.0, Vectors.sparse(2, [], []), 1.0),
                                      (2.0, Vectors.dense(0.5, 0.5), 1.0)],
                                     ["label", "features", "weight"])
     # classifier inherits hasWeightCol
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr, weightCol="weight")
     self.assertIsNotNone(ovr.fit(df))
     # classifier doesn't inherit hasWeightCol
     dt = DecisionTreeClassifier()
     ovr2 = OneVsRest(classifier=dt, weightCol="weight")
     self.assertIsNotNone(ovr2.fit(df))
Example #11
 def test_parallelism_doesnt_change_output(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=1)
     modelPar1 = ovrPar1.fit(df)
     ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=2)
     modelPar2 = ovrPar2.fit(df)
     for i, model in enumerate(modelPar1.models):
         self.assertTrue(np.allclose(model.coefficients.toArray(),
                                     modelPar2.models[i].coefficients.toArray(), atol=1E-4))
         self.assertTrue(np.allclose(model.intercept, modelPar2.models[i].intercept, atol=1E-4))
Example #12
 def test_kmeans_summary(self):
     data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
             (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=2, seed=1)
     model = kmeans.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 1)
Example #13
 def test_bisecting_kmeans_summary(self):
     data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
             (Vectors.sparse(1, [], []),)]
     df = self.spark.createDataFrame(data, ["features"])
     bkm = BisectingKMeans(k=2)
     model = bkm.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 20)
Example #14
 def ztest_toPandas(self):
     data = [(Vectors.dense([0.1, 0.2]),),
             (Vectors.sparse(2, {0:0.3, 1:0.4}),),
             (Vectors.sparse(2, {0:0.5, 1:0.6}),)]
     df = self.sql.createDataFrame(data, ["features"])
     self.assertEqual(df.count(), 3)
     pd = self.converter.toPandas(df)
     self.assertEqual(len(pd), 3)
     self.assertTrue(isinstance(pd.features[0], csr_matrix),
                     "Expected pd.features[0] to be csr_matrix but found: %s" %
                     type(pd.features[0]))
     self.assertEqual(pd.features[0].shape[0], 3)
     self.assertEqual(pd.features[0].shape[1], 2)
     self.assertEqual(pd.features[0][0,0], 0.1)
     self.assertEqual(pd.features[0][0,1], 0.2)
Example #15
    def test_binomial_logistic_regression_with_bound(self):

        df = self.spark.createDataFrame(
            [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
             (0.0, 2.0, Vectors.dense(1.0, 2.0)),
             (1.0, 3.0, Vectors.dense(2.0, 1.0)),
             (0.0, 4.0, Vectors.dense(3.0, 3.0)), ], ["label", "weight", "features"])

        lor = LogisticRegression(regParam=0.01, weightCol="weight",
                                 lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
                                 upperBoundsOnIntercepts=Vectors.dense(0.0))
        model = lor.fit(df)
        self.assertTrue(
            np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
Example #16
File: views.py  Project: eason001/imPro
def reduce(inputpath, alg, k):
    n_data = 0
    n_features = 0
    result = "successful!"
    inputdir = os.path.dirname(inputpath)
    print("inputdir: " + inputdir + result)

    # Count the rows and the number of space-separated fields in the input file.
    input_n = 0
    with open(inputpath, 'r') as inputfile:
        for line in inputfile:
            input_n = len(line.split(" "))
            n_data += 1

    if int(k) >= input_n:
        print("reduced features must be smaller than input features.")
        result = "reduced features must be smaller than input features."
    else:
        # sc (SparkContext) is assumed to be created elsewhere in this module.
        lines = sc.textFile(inputpath).map(lambda x: x.split(" "))
        lines = lines.map(lambda x: (x[0], [float(y) for y in x[1:]]))
        df = lines.map(lambda x: Row(labels=x[0], features=Vectors.dense(x[1]))).toDF()

        if alg == "pca":
            output_data = pca(inputdir, df, alg, k)

        # Inspect the reduced data set written by the algorithm above.
        output_data = inputdir + "/" + alg + str(k) + "_Data"
        file_size = str(os.stat(output_data).st_size)
        counter = 0
        n_features = '0'
        with open(output_data, 'r') as inputfile:
            for line in inputfile:
                n_features = str(len(line.split(" ")))
                counter += 1
        n_data = str(counter)

        result = "File: " + os.path.basename(output_data) + '</br>'
        result += "Path: " + os.path.dirname(output_data) + '/' + alg + str(k) + "_Features/" + '</br>'
        result += "Dimension: " + n_data + " x " + n_features + "</br>"
        result += "Size: " + file_size + ' bytes'
        print(result)

    print("Dimension reduction finished!")

    context = {'n_data': n_data, 'n_features': n_features, 'result': result}
    return context
Example #17
    def test_vector_size_hint(self):
        df = self.spark.createDataFrame(
            [(0, Vectors.dense([0.0, 10.0, 0.5])),
             (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
             (2, Vectors.dense([2.0, 12.0]))],
            ["id", "vector"])

        sizeHint = VectorSizeHint(
            inputCol="vector",
            handleInvalid="skip")
        sizeHint.setSize(3)
        self.assertEqual(sizeHint.getSize(), 3)

        output = sizeHint.transform(df).head().vector
        expected = DenseVector([0.0, 10.0, 0.5])
        self.assertEqual(output, expected)
Example #18
    def test_tweedie_distribution(self):

        df = self.spark.createDataFrame(
            [(1.0, Vectors.dense(0.0, 0.0)),
             (1.0, Vectors.dense(1.0, 2.0)),
             (2.0, Vectors.dense(0.0, 0.0)),
             (2.0, Vectors.dense(1.0, 1.0)), ], ["label", "features"])

        glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)
        model = glr.fit(df)
        self.assertTrue(np.allclose(model.coefficients.toArray(), [-0.4645, 0.3402], atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, 0.7841, atol=1E-4))

        model2 = glr.setLinkPower(-1.0).fit(df)
        self.assertTrue(np.allclose(model2.coefficients.toArray(), [-0.6667, 0.5], atol=1E-4))
        self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1E-4))
Example #19
 def test_linear_regression_pmml_basic(self):
     # Most of the validation is done in the Scala side, here we just check
     # that we output text rather than parquet (e.g. that the format flag
     # was respected).
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LinearRegression(maxIter=1)
     model = lr.fit(df)
     path = tempfile.mkdtemp()
     lr_path = path + "/lr-pmml"
     model.write().format("pmml").save(lr_path)
     pmml_text_list = self.sc.textFile(lr_path).collect()
     pmml_text = "\n".join(pmml_text_list)
     self.assertIn("Apache Spark", pmml_text)
     self.assertIn("PMML", pmml_text)
Example #20
 def test_kmean_pmml_basic(self):
     # Most of the validation is done in the Scala side, here we just check
     # that we output text rather than parquet (e.g. that the format flag
     # was respected).
     data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
             (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=2, seed=1)
     model = kmeans.fit(df)
     path = tempfile.mkdtemp()
     km_path = path + "/km-pmml"
     model.write().format("pmml").save(km_path)
     pmml_text_list = self.sc.textFile(km_path).collect()
     pmml_text = "\n".join(pmml_text_list)
     self.assertIn("Apache Spark", pmml_text)
     self.assertIn("PMML", pmml_text)
Example #21
 def test_clustering_evaluator_with_cosine_distance(self):
     featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]),
                                 [([1.0, 1.0], 1.0), ([10.0, 10.0], 1.0), ([1.0, 0.5], 2.0),
                                  ([10.0, 4.4], 2.0), ([-1.0, 1.0], 3.0), ([-100.0, 90.0], 3.0)])
     dataset = self.spark.createDataFrame(featureAndPredictions, ["features", "prediction"])
     evaluator = ClusteringEvaluator(predictionCol="prediction", distanceMeasure="cosine")
     self.assertEqual(evaluator.getDistanceMeasure(), "cosine")
     self.assertTrue(np.isclose(evaluator.evaluate(dataset),  0.992671213, atol=1e-5))
Example #22
def convert_to_flat_by_sparkpy_v3(df):
    vectorize = udf(lambda vs: Vectors.dense(list(chain.from_iterable(vs))), VectorUDT())
    spark_df = df
    spark_df = df.orderBy("key", "subkey")
    spark_df = spark_df.groupBy("key").agg(first(col("parameter")).alias("label"), collect_list("reference").alias("features"))
    spark_df = spark_df.withColumn('features', vectorize('features'))
    spark_df = spark_df.select("label", "features")
    return spark_df
Example #23
 def test_gaussian_mixture_summary(self):
     data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
             (Vectors.sparse(1, [], []),)]
     df = self.spark.createDataFrame(data, ["features"])
     gmm = GaussianMixture(k=2)
     model = gmm.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertTrue(isinstance(s.probability, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 3)
Example #24
 def test_onevsrest(self):
     temp_path = tempfile.mkdtemp()
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     ovrPath = temp_path + "/ovr"
     ovr.save(ovrPath)
     loadedOvr = OneVsRest.load(ovrPath)
     self._compare_pipelines(ovr, loadedOvr)
     modelPath = temp_path + "/ovrModel"
     model.save(modelPath)
     loadedModel = OneVsRestModel.load(modelPath)
     self._compare_pipelines(model, loadedModel)
Example #25
 def test_parallel_evaluation(self):
     dataset = self.spark.createDataFrame(
         [(Vectors.dense([0.0]), 0.0),
          (Vectors.dense([0.4]), 1.0),
          (Vectors.dense([0.5]), 0.0),
          (Vectors.dense([0.6]), 1.0),
          (Vectors.dense([1.0]), 1.0)] * 10,
         ["features", "label"])
     lr = LogisticRegression()
     grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
     evaluator = BinaryClassificationEvaluator()
     tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
     tvs.setParallelism(1)
     tvsSerialModel = tvs.fit(dataset)
     tvs.setParallelism(2)
     tvsParallelModel = tvs.fit(dataset)
     self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics)
Example #26
def mldemo():

    spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])
    
    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))
    
    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
Example #27
    def test_type_error(self):
        df = self.spark.createDataFrame([("a", 0), ("b", 0)]).toDF("features", "key")
        keyedPCA = KeyedEstimator(sklearnEstimator=PCA())
        self.assertRaises(TypeError, keyedPCA.fit, df)

        df = self.spark.createDataFrame([(Vectors.dense([i]), [i], 0) for i in range(10)])
        df = df.toDF("features", "y", "key")
        keyedLR = KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y")
        self.assertRaises(TypeError, keyedLR.fit, df)
Example #28
    def test_expose_sub_models(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        numFolds = 3
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                            numFolds=numFolds, collectSubModels=True)

        def checkSubModels(subModels):
            self.assertEqual(len(subModels), numFolds)
            for i in range(numFolds):
                self.assertEqual(len(subModels[i]), len(grid))

        cvModel = cv.fit(dataset)
        checkSubModels(cvModel.subModels)

        # Test the default value for option "persistSubModel" to be "true"
        testSubPath = temp_path + "/testCrossValidatorSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        cvModel.save(savingPathWithSubModels)
        cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
        checkSubModels(cvModel3.subModels)
        cvModel4 = cvModel3.copy()
        checkSubModels(cvModel4.subModels)

        savingPathWithoutSubModels = testSubPath + "cvModel2"
        cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
        cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
        self.assertEqual(cvModel2.subModels, None)

        for i in range(numFolds):
            for j in range(len(grid)):
                self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid)
Example #29
    def test_parallel_evaluation(self):
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
        evaluator = BinaryClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cv.setParallelism(1)
        cvSerialModel = cv.fit(dataset)
        cv.setParallelism(2)
        cvParallelModel = cv.fit(dataset)
        self.assertEqual(cvSerialModel.avgMetrics, cvParallelModel.avgMetrics)
Example #30
 def setUp(self):
     super(MLlibTestCase, self).setUp()
     self.sc = self.spark.sparkContext
     self.sql = self.spark
     self.X = np.array([[1,2,3],
                        [-1,2,3], [1,-2,3], [1,2,-3],
                        [-1,-2,3], [1,-2,-3], [-1,2,-3],
                        [-1,-2,-3]])
     self.y = np.array([1, 0, 1, 1, 0, 1, 0, 0])
     data = [(float(self.y[i]), Vectors.dense(self.X[i])) for i in range(len(self.y))]
     self.df = self.sql.createDataFrame(data, ["label", "features"])
Example #31
from __future__ import print_function

# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorAssemblerExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])

    assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                                outputCol="features")

    output = assembler.transform(dataset)
    print(
        "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'"
    )
    output.select("features", "clicked").show(truncate=False)
    # $example off$

    spark.stop()
Example #32
    for content_series in content_series_iter:
        yield featurize_series(model, content_series)


# Pandas UDFs on large records (e.g., very large images) can run into Out Of Memory (OOM) errors.
# If you hit such errors in the cell below, try reducing the Arrow batch size via `maxRecordsPerBatch`.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")

# We can now run featurization on our entire Spark DataFrame.
# NOTE: This can take a long time (about 10 minutes) since it applies a large model to the full dataset.
features_df = images_df.repartition(16).select(
    col("path"),
    featurize_udf("content").alias("features"))

# MLlib needs some post-processing of the features column format
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
features_df = features_df.select(
    col("path"),
    list_to_vector_udf(features_df["features"]).alias("features"))

# OMITTED HERE
# You need to add the labels to your dataset based on the path of your images
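# A hypothetical sketch of that omitted labeling step (not from the original
# notebook): it assumes each image's class is encoded as the integer name of
# its parent directory, e.g. ".../3/img_001.jpg" -> label 3.
from pyspark.sql.types import IntegerType
label_from_path_udf = udf(lambda p: int(p.split("/")[-2]), IntegerType())
features_df = features_df.withColumn("label", label_from_path_udf(col("path")))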

# Split into training, validation and test sets
df_train_split, df_validate_split, df_test_split = features_df.randomSplit(
    [0.6, 0.3, 0.1], 42)

# Here we start to train the tail of the model

# This concatenates all feature columns into a single feature vector in a new column "featuresModel".
vectorAssembler = VectorAssembler(inputCols=['features'],
                                  outputCol='featuresModel')
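# A hedged sketch of one way the tail could then be trained (not part of the
# original snippet): logistic regression on the assembled "featuresModel"
# column, assuming the hypothetical "label" column added above exists.
from pyspark.ml.classification import LogisticRegression
lr_tail = LogisticRegression(featuresCol="featuresModel", labelCol="label", maxIter=10)
lr_tail_model = lr_tail.fit(vectorAssembler.transform(df_train_split))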
Example #33
    spark = SparkSession.builder \
        .master("local") \
        .appName("regression") \
        .getOrCreate()

    features = []
    labels = []
    for x in range(40):
        features.append(x)
        labels.append(2 * x + 1)

    merged = list(zip(features, labels))
    random.shuffle(merged)
    features[:], labels[:] = zip(*merged)

    data = [(labels[x], Vectors.dense([features[x]])) for x in range(40)]

    train = spark.createDataFrame(data[:30], ["label", "features"])

    test = spark.createDataFrame(data[30:], ["label", "features"])

    # maxIter = 100, regParam = 0.0,  tol = 1e-6, fitIntercept = True
    # manually tuned parameters below
    maxIter_param = 70
    regParam_param = 0.01
    tol_param = 1e-29
    fitIntercept_param = True
    lr = LinearRegression(maxIter=maxIter_param,
                          regParam=regParam_param,
                          tol=tol_param,
                          fitIntercept=fitIntercept_param)
Example #34
if __name__ == "__main__":

    if "SPARK_HOME" in os.environ.keys():
        print("SPARK_HOME: ", os.environ['SPARK_HOME'])
    else:
        raise ValueError(
            "Environment variable SPARK_HOME needs to be specified,"
            " and make sure spark-iforest.jar is added into your lib path ($SPARK_HOME/jars"
        )

    spark = SparkSession \
        .builder.master("local[*]") \
        .appName("IForestExample") \
        .getOrCreate()

    data = [(Vectors.dense([0.0, 0.0]), ), (Vectors.dense([7.0, 9.0]), ),
            (Vectors.dense([9.0, 8.0]), ), (Vectors.dense([8.0, 9.0]), )]

    df = spark.createDataFrame(data, ["features"])

    from pyspark_iforest.ml.iforest import *

    iforest = IForest(contamination=0.3, maxDepth=2)
    model = iforest.fit(df)

    model.hasSummary

    summary = model.summary

    summary.numAnomalies
Example #35
#accuracy = evaluator.evaluate(predictions)
#accuracy = evaluator.evaluate(predictions.predictions)
#accuracy = evaluator.evaluate(prediction) Prediction is not defined
#print("Test Error = %g" %(1.0 - accuracy))
#print("Test Area under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})))
stages = []
rf = RandomForestClassifier(numTrees=100,
                            maxDepth=5,
                            maxBins=5,
                            labelCol="features",
                            featuresCol="features",
                            seed=42)
#rf=assembler
stages += [rf]
#trainingData=temp_df.rdd.map(lambda x:(Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
trainingData = file_df.rdd.map(lambda x: (Vectors.dense(x[0:-1]), x[-1])).toDF(
    ["features", "label"])
trainingData.show()
#params =
pipeline = Pipeline(stages=stages)
lr = LogisticRegression().setFeaturesCol("features")
params = ParamGridBuilder().build()
#params=ParamGridBuilder().addGrid(lr.maxIter, [500]).addGrid(lr.regParam, [0]).addGrid(lr.elasticNetParam, [1]).build()
#cvModel
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=params,
                    evaluator=evaluator,
                    numFolds=5)

#cvModel = cv.fit(file_df) #IllegalArgumentException features does not exist
#cvModel=cv.fit(output_data) #Illegal type
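# A hedged sketch of a fit that matches the columns actually present in
# trainingData ("features", "label"); it assumes the evaluator referenced
# above was defined earlier in the (truncated) script.
cvModel = cv.fit(trainingData)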
Example #36
 def majority_vote(neighbors):
     return Vectors.dense(
         collections.Counter([x[1] for x in neighbors]).most_common()[0][0])
Example #37
model2.numFeatures

model2.save('E:/kaggle/titanic/dt_model10')

# -----------------------------------------------------------------------
# Step 8 for deployment --- save and load model for operationalization purpose -
from pyspark.ml.classification import DecisionTreeClassificationModel
model3 = DecisionTreeClassificationModel()
model3 = model3.load('E:/kaggle/titanic/dt_model10')

model3.depth
model3.numFeatures

from pyspark.ml.linalg import Vectors
predict_df = spark.createDataFrame(
    [(1, Vectors.dense(1.0, 0.0, 1.0, 0.0, 1.0, 0.0))], ['index', 'Features'])
predict_df.show()
model3.transform(predict_df).select('prediction').first()[0]

# ------------------------------------------------------------------------

training1 = model2.transform(training)
training1.show(5)

PredictionandLabels = training1.select(training1.prediction,
                                       training1.Survived).rdd
PredictionandLabels.collect()

from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
# metrics1 = BinaryClassificationMetrics(PredictionandLabels)
# (train score/train accuracy   --- )
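# A hedged sketch of the accuracy computation hinted at above, assuming both
# 'prediction' and 'Survived' are numeric columns.
metrics = MulticlassMetrics(
    PredictionandLabels.map(lambda row: (float(row.prediction), float(row.Survived))))
print("Train accuracy:", metrics.accuracy)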
Example #38
    SiteCategoryEncoder, AppCategoryEncoder, DeviceTypeEncoder,
    DeviceConnTypeEncoder, C15Encoder, C16Encoder, C18Encoder, C19Encoder,
    C21Encoder, FeatureAssembler
])
modelTmp = pipelineTmp.fit(schemaClick)
tmp = modelTmp.transform(schemaClick).select("click", "VectoredFeatures")
tmp.registerTempTable("CLICK")

# Selecting click and VectoredFeatures from Table "CLICK" and creating new dataFrame as results
results = sqlContext.sql("SELECT click, VectoredFeatures from CLICK")
results.show()

# Creating label points for attributes click and VectoredFeatures
click_transformed = results.select(
    'click', 'VectoredFeatures').rdd.map(lambda row: LabeledPoint(
        float(row.click), Vectors.dense((row.VectoredFeatures).toArray())))
click_transformed.take(2)

# Divide the data into training and test sets
weights = [.8, .2]
seed = 15

ClickTrain, ClickTest = click_transformed.randomSplit(weights, seed)

# Train a logistic regression model with stochastic gradient descent on the training set
modelGD = LogisticRegressionWithSGD.train(ClickTrain,
                                          iterations=15,
                                          step=1,
                                          miniBatchFraction=1,
                                          regType=None,
                                          validateData=False)
Example #39
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("Spark MLlib") \
        .config("spark.master", "local") \
        .getOrCreate()

######################################################
#   Example 1 - Dense & Sparse Vectors
######################################################

from pyspark.ml.linalg import Vectors
denseVec = Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)
size = 12
idx = [1, 2, 10, 11]  # locations of non-zero elements in vector
values = [12.0, 32.0, 110.0, 27.0]
sparseVec = Vectors.sparse(size, idx, values)

print("denseVec: ", denseVec)
print("sparseVec: ", sparseVec)

spark.stop()
Example #40
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("EstimatorTransformerParamExample")\
        .getOrCreate()

    # $example on$
    # Prepare training data from a list of (label, features) tuples.
    training = spark.createDataFrame([
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a transformer produced by an Estimator),
    # we can view the parameters it used during fit().
    # This prints the parameter (name: value) pairs, where names are unique IDs for this
    # LogisticRegression instance.
Example #41
 def add_features(feat, *other):
     raw = feat.toArray()
     # list(...) is needed on Python 3, where map() returns a lazy iterator
     return Vectors.dense(np.append(raw, list(map(float, other))))
Example #42
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("Spark MLlib") \
        .config("spark.master", "local") \
        .getOrCreate()

sc = spark.sparkContext

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

bdf = sc.parallelize([
    Row(label=1.0, weight=1.0, features=Vectors.dense(0.0, 5.0)),
    Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),
    Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),
    Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))
]).toDF()

blor = LogisticRegression(regParam=0.01, weightCol="weight")
blorModel = blor.fit(bdf)
blorModel.coefficients
blorModel.intercept

test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
blorModel.transform(test1).head().prediction

save_path = "C:\\PySpark\\spark_ml\\saved_models\\logistic_regression_example_1\\"
estimator_path = save_path + "lr"
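# A hedged sketch of a plausible continuation (not shown in the snippet):
# persist the estimator and the fitted model, then load them back.
from pyspark.ml.classification import LogisticRegressionModel
blor.save(estimator_path)
blor2 = LogisticRegression.load(estimator_path)
model_path = save_path + "lr_model"   # hypothetical path, not from the original
blorModel.save(model_path)
blorModel2 = LogisticRegressionModel.load(model_path)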
Example #43
from collections import Counter
from functools import partial
from functools import reduce

import pandas as pd
from pyspark import SparkContext
from pyspark.ml.feature import PCAModel
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import DataFrame
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, FloatType

get_cluster_size = udf(lambda x: len(x), IntegerType())
transform_to_vec = udf(lambda x: Vectors.dense(x), VectorUDT())
get_component1 = udf(lambda x: x.toArray().tolist()[0], FloatType())
get_component2 = udf(lambda x: x.toArray().tolist()[1], FloatType())


def unionAll(dfs):
    return reduce(DataFrame.unionAll, dfs)


def get_cluster_purity(images_p, b_mapping):
    total_c = list()
    for img in images_p:
        img_c = b_mapping.value[img.split("/")[-1]]
        total_c += img_c

    N = len(images_p)
Example #44
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("EstimatorTransformerParamExample")\
        .getOrCreate()

    # $example on$
    # Prepare training data from a list of (label, features) tuples.
    training = spark.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
                                      (0.0, Vectors.dense([2.0, 1.0, -1.0])),
                                      (0.0, Vectors.dense([2.0, 1.3, 1.0])),
                                      (1.0, Vectors.dense([0.0, 1.2, -0.5]))],
                                     ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a transformer produced by an Estimator),
    # we can view the parameters it used during fit().
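    # A minimal sketch of that inspection step (the original snippet is cut off
    # here): extractParamMap() returns the (param -> value) pairs model1 used.
    print("Model 1 was fit using parameters: ")
    print(model1.extractParamMap())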
Example #45
# Make predictions.
predictions = model.transform(testData)
# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
gbtModel = model.stages[1]
print(gbtModel)  # summary only

from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors
training = spark.createDataFrame([(1.218, 1.0, Vectors.dense(1.560, -0.605)),
                                  (2.949, 0.0, Vectors.dense(0.346, 2.158)),
                                  (3.627, 0.0, Vectors.dense(1.380, 0.231)),
                                  (0.273, 1.0, Vectors.dense(0.520, 1.151)),
                                  (4.199, 0.0, Vectors.dense(0.795, -0.226))],
                                 ["label", "censor", "features"])
quantileProbabilities = [0.3, 0.6]
aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities,
                            quantilesCol="quantiles")
model = aft.fit(training)
# Print the coefficients, intercept and scale parameter for AFT survival regression
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))
print("Scale: " + str(model.scale))
model.transform(training).show(truncate=False)
Example #46
def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1]), r[-1]]).toDF(
        ['features', 'label'])
Example #47
## wine_data_label_indexed = model.transform(wine_data)
## wine_data_label_indexed.show(5)
## #wine_data = wine_data.withColumn('string_quality', wine_data.quality.cast('string'))
## 
## 
## from pyspark.ml.feature import StringIndexer
## indexer = StringIndexer(inputCol="label", outputCol="indexed_label")
## model = indexer.fit(ml_wine_data)
## wine_data_label_indexed = model.transform(ml_wine_data)
## wine_data_label_indexed.show(5)


# convert data into featuresCol and labelCol structre
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
ml_wine_data = wine_data.rdd.map(lambda r: [Vectors.dense(r[:-1]), r[-1]]).toDF(['featuresCol', 'label'])
ml_wine_data.show(5)



## from pyspark.ml.feature import VectorIndexer
## indexer = VectorIndexer(maxCategories=4, inputCol='featuresCol', outputCol='indexed_features')
## model = indexer.fit(wine_data_label_indexed)
## wine_data_feature_indexed  = model.transform(wine_data_label_indexed)

## splitting data into training and test sets
training, test = ml_wine_data.randomSplit(weights=[0.7, 0.3], seed=123)
training.show(5)


## naive bayes classifier
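## A hedged sketch of the announced Naive Bayes step (not shown above); note
## that the feature column in this DataFrame is literally named 'featuresCol'.
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(featuresCol='featuresCol', labelCol='label')
nb_model = nb.fit(training)
nb_model.transform(test).select('label', 'prediction').show(5)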
Example #48
def dataset_multinomial(spark_session):
    return spark_session.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], [])),
         (2.0, Vectors.dense(0.5))] * 100,
        ["label", "features"],
    ).cache()
Example #49
Run with:
  bin/spark-submit examples/src/main/python/ml/correlation_example.py
"""
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("CorrelationExample") \
        .getOrCreate()

    # $example on$
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), ),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]), ),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]), ),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), )]
    df = spark.createDataFrame(data, ["features"])

    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
    # $example off$

    spark.stop()
Example #50
df0=sample_300_df2.select(["_id", "chordRatio", "chordRatioMinHash"])

from pyspark.sql import Row
from pyspark.sql.functions import array, col, udf
from pyspark.ml.linalg import Vectors, VectorUDT
from sparkaid import flatten
df0_flat=flatten(df0)
columns_list1=df0_flat.columns[1:-1]
array_df=df0_flat.select('_id', 'chordRatioMinHash',array(columns_list1).alias('chordRatioJS'))

#fill NaNs with zeros in the array column
df2_flat=df0_flat.na.fill(float(0))
columns_list2=df2_flat.columns[1:-1]
array_df2=df2_flat.select('_id', 'chordRatioMinHash',array(columns_list2).alias('chordRatioJS_no_Nulls'))

###
to_vector = udf(lambda a: Vectors.dense(a), VectorUDT())
data = array_df2.select('_id', 'chordRatioMinHash', "chordRatioJS_no_Nulls", to_vector("chordRatioJS_no_Nulls").alias("chordRatioWJS"))
data.show(1, truncate=False)

import scipy.sparse
from pyspark.ml.linalg import Vectors, _convert_to_vector, VectorUDT
from pyspark.sql.functions import udf, col
## from dense to sparse array
def dense_to_sparse(vector):
    return _convert_to_vector(scipy.sparse.csc_matrix(vector.toArray()).T)

to_sparse = udf(dense_to_sparse, VectorUDT())
data_sparse=data.withColumn("sparseChordRatioJS", to_sparse(col("chordRatioWJS")))
#data_sparse2=data_sparse.select('_id', 'chordRatio_for_minHash', 'sparseChordRatioJS')

indices_udf = udf(lambda vector: vector.indices.tolist(), ArrayType(IntegerType()))
Example #51
def toSparseVector(index, values):
    day_list_index, qty_list_values = zip(*sorted(zip(index, values)))
    # 367 slots so that 1-based day-of-year indices (up to 366 in leap years) fit
    return Vectors.sparse(367, day_list_index, qty_list_values)
Example #52
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

dataA = [(
    0,
    Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),
), (
    1,
    Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),
), (
    2,
    Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),
)]

dfA = spark.createDataFrame(dataA, ["id", "features"])

dataB = [(
    3,
    Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),
), (
    4,
    Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),
), (
    5,
    Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),
)]
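# A hedged sketch of the usual continuation (cut off above): build dfB, fit a
# MinHashLSH model on dfA, and join the two sets on Jaccard distance.
dfB = spark.createDataFrame(dataB, ["id", "features"])
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(dfA)
model.approxSimilarityJoin(dfA, dfB, 0.6, distCol="JaccardDistance") \
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("JaccardDistance")).show()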
Example #53
parts = lines.map(lambda row: row.value.split(","))
df = spark.createDataFrame(parts, [
    "id", "accommodates", "bathrooms", "bedrooms", "beds", "price",
    "Shared room", "Entire home/apt", "Private room", "Queens", "Brooklyn",
    "Staten Island", "Manhattan", "Bronx", "testTrain", "listId"
])

#Put in the same order
testTrainData = df.select("accommodates", "bathrooms", "bedrooms", "beds",
                          "price", "Shared room", "Entire home/apt",
                          "Private room", "Queens", "Brooklyn",
                          "Staten Island", "Manhattan", "Bronx", "testTrain",
                          "listId")

# Make dataframe into LIBSVM format
datak = testTrainData.rdd.map(lambda x: (Vectors.dense(x[0:-2]), x[-1])).toDF(
    ["features", "label"])

# Trains a k-means model.
kmeans = KMeans().setK(10).setSeed(123)
model = kmeans.fit(datak)

# Make predictions
predictions = model.transform(datak)

# Clustered dataframe
testTrainData = testTrainData.select('listId', 'testTrain')
testTrainData = testTrainData.withColumnRenamed('listId', 'newId')
OuterCluster = dataset.join(predictions, dataset.listId == predictions.label)
OuterCluster = OuterCluster.drop('features').drop('label')
OuterCluster = OuterCluster.join(testTrainData,
Example #54
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun  8 15:08:11 2018

@author: luogan
"""

from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

from pyspark.sql import SparkSession
spark= SparkSession\
                .builder \
                .appName("dataFrame") \
                .getOrCreate()
df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
                            (Vectors.dense([0.0, 0.0]), ),
                            (Vectors.dense([3.0, -1.0]), )], ["features"])

polyExpansion = PolynomialExpansion(degree=3,
                                    inputCol="features",
                                    outputCol="polyFeatures")
polyDF = polyExpansion.transform(df)

polyDF.show(truncate=False)
Example #55
spark = SparkSession.builder.getOrCreate()

# Principal Component Analysis (PCA)

# In[3]:

# PCA is a statistical procedure that uses an orthogonal transformation
# to convert a set of observations of possibly correlated variables
# into a set of values of linearly uncorrelated variables called
# principal components.
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

# In[4]:

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]

# In[5]:

df = spark.createDataFrame(data, ["features"])

# In[6]:

df.show()

# In[7]:

# The example below shows how to project 5-dimensional feature vectors into
# 3-dimensional principal components.
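# A minimal sketch of the projection described above (the original snippet is
# cut off here): fit PCA with k=3 and transform the 5-dimensional vectors.
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)
model.transform(df).select("pcaFeatures").show(truncate=False)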
Example #56

# $example on$
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ElementwiseProductExample")\
        .getOrCreate()

    # $example on$
    # Create some vector data; also works for sparse vectors
    data = [(Vectors.dense([1.0, 2.0, 3.0]), ),
            (Vectors.dense([4.0, 5.0, 6.0]), )]
    df = spark.createDataFrame(data, ["vector"])
    transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                     inputCol="vector",
                                     outputCol="transformedVector")
    # Batch transform the vectors to create new column:
    transformer.transform(df).show()
    # $example off$

    spark.stop()
Example #57
path = "./resources/"
angry_df = ImageSchema.readImages(path + "0/").withColumn("label", lit(0))
happy_df = ImageSchema.readImages(path + "3/").withColumn("label", lit(1))
sad_df = ImageSchema.readImages(path + "4/").withColumn("label", lit(2))

sc = spark.sparkContext

log4jLogger = sc._jvm.org.apache.log4j
log = log4jLogger.Logger.getLogger(__name__)

log.info("pyspark script logger initialized")

df1 = angry_df.union(happy_df).union(sad_df)

parse_ = udf(lambda a: Vectors.dense(a), VectorUDT())
df = df1.withColumn("features", parse_(df1["image.data"]))

train, test, _ = df.randomSplit([0.1, 0.05, 0.85])

lr = LogisticRegression(maxIter=100,
                        regParam=0.05,
                        elasticNetParam=0.3,
                        featuresCol="features",
                        labelCol="label")
train.cache()

p = Pipeline(stages=[lr])
p_model = p.fit(train)

predictions = p_model.transform(test)
Example #58
spark = SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/temp").appName("LinearRegression").getOrCreate()

inputLines = spark.sparkContext.textFile("file:///Users/lesli/BigData/TA3/College.csv")

def parseLine(line):
    fields = line.split(',')
    SchoolType = fields[1]
    SF_Ratio = float(fields[15])
    Grad_Rate = float(fields[18])
    return (SchoolType, SF_Ratio, Grad_Rate)

parsedLines = inputLines.map(parseLine)

Private = parsedLines.filter(lambda x: "Yes" in x[0]) 

data = Private.map(lambda x: (Vectors.dense(float(x[1])),float(x[2]))).cache()

#data = inputLines.map(lambda x: x.split(",")).map(lambda x: (Vectors.dense(float(x[15])),float(x[18]))).cache()
#[16] is Grad_Rate (Y_variable), [13] is predictor -SF.Ratio 
                                                    

# Convert this RDD to a DataFrame
colNames = ['SF_Ratio','Grad_Rate'] #Y is Grad.rate
df = data.toDF(colNames)

# Let's split our data into training data and testing data
trainTest = df.randomSplit([0.8, 0.2])
trainingDF = trainTest[0]
testDF = trainTest[1]
wholeDF = df
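# A hedged sketch of the modeling step the snippet stops short of: fit a
# linear regression of Grad_Rate on the (vector-valued) SF_Ratio column.
from pyspark.ml.regression import LinearRegression
lir = LinearRegression(featuresCol="SF_Ratio", labelCol="Grad_Rate")
lir_model = lir.fit(trainingDF)
lir_model.transform(testDF).select("Grad_Rate", "prediction").show(5)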
Example #59
def transData(data):
    return data.rdd.map(lambda r: [r[0],Vectors.dense(r[1:])]).toDF(['CustomerID','rfm']) #Return a new RDD by applying a function to each element of this RDD.
Example #60
 def zero_features(feat):
     raw = feat.toArray()
     for idx in idxs:
         raw[idx] = 0.
     return Vectors.dense(raw)