Example #1
    def test_save_load_simple_estimator(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
        self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
Example #2
    def test_expose_sub_models(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                                   collectSubModels=True)
        tvsModel = tvs.fit(dataset)
        self.assertEqual(len(tvsModel.subModels), len(grid))

        # Test the default value for option "persistSubModel" to be "true"
        testSubPath = temp_path + "/testTrainValidationSplitSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        tvsModel.save(savingPathWithSubModels)
        tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
        self.assertEqual(len(tvsModel3.subModels), len(grid))
        tvsModel4 = tvsModel3.copy()
        self.assertEqual(len(tvsModel4.subModels), len(grid))

        savingPathWithoutSubModels = testSubPath + "cvModel2"
        tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
        tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
        self.assertEqual(tvsModel2.subModels, None)

        for i in range(len(grid)):
            self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid)
Example #3
 def test_persistence(self):
     # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
     df = self.spark.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # Fit model
     lda = LDA(k=2, seed=1, optimizer="em")
     distributedModel = lda.fit(df)
     self.assertTrue(distributedModel.isDistributed())
     localModel = distributedModel.toLocal()
     self.assertFalse(localModel.isDistributed())
     # Define paths
     path = tempfile.mkdtemp()
     lda_path = path + "/lda"
     dist_model_path = path + "/distLDAModel"
     local_model_path = path + "/localLDAModel"
     # Test LDA
     lda.save(lda_path)
     lda2 = LDA.load(lda_path)
     self._compare(lda, lda2)
     # Test DistributedLDAModel
     distributedModel.save(dist_model_path)
     distributedModel2 = DistributedLDAModel.load(dist_model_path)
     self._compare(distributedModel, distributedModel2)
     # Test LocalLDAModel
     localModel.save(local_model_path)
     localModel2 = LocalLDAModel.load(local_model_path)
     self._compare(localModel, localModel2)
     # Clean up
     try:
         rmtree(path)
     except OSError:
         pass
Example #4
    def test_java_object_gets_detached(self):
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight",
                              fitIntercept=False)

        model = lr.fit(df)
        summary = model.summary

        self.assertIsInstance(model, JavaWrapper)
        self.assertIsInstance(summary, JavaWrapper)
        self.assertIsInstance(model, JavaParams)
        self.assertNotIsInstance(summary, JavaParams)

        error_no_object = 'Target Object ID does not exist for this gateway'

        self.assertIn("LinearRegression_", model._java_obj.toString())
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        model.__del__()

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        try:
            summary.__del__()
        except:
            pass

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            summary._java_obj.toString()
Example #5
    def test_save_load_simple_estimator(self):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)

        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
        self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
        self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
Example #6
 def test_equals(self):
     indices = [1, 2, 4]
     values = [1., 3., 2.]
     self.assertTrue(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.]))
     self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.]))
     self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 0., 2.]))
     self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.]))
Example #7
 def test_output_columns(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr, parallelism=1)
     model = ovr.fit(df)
     output = model.transform(df)
     self.assertEqual(output.columns, ["label", "features", "rawPrediction", "prediction"])
Example #8
 def test_copy(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     ovr1 = ovr.copy({lr.maxIter: 10})
     self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
     self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
     model = ovr.fit(df)
     model1 = model.copy({model.predictionCol: "indexed"})
     self.assertEqual(model1.getPredictionCol(), "indexed")
Example #9
    def test_offset(self):

        df = self.spark.createDataFrame(
            [(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
             (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)),
             (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)),
             (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0))], ["label", "weight", "offset", "features"])

        glr = GeneralizedLinearRegression(family="poisson", weightCol="weight", offsetCol="offset")
        model = glr.fit(df)
        self.assertTrue(np.allclose(model.coefficients.toArray(), [0.664647, -0.3192581],
                                    atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1E-4))
Example #10
 def test_support_for_weightCol(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8), 1.0),
                                      (1.0, Vectors.sparse(2, [], []), 1.0),
                                      (2.0, Vectors.dense(0.5, 0.5), 1.0)],
                                     ["label", "features", "weight"])
     # classifier inherits hasWeightCol
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr, weightCol="weight")
     self.assertIsNotNone(ovr.fit(df))
     # classifier doesn't inherit hasWeightCol
     dt = DecisionTreeClassifier()
     ovr2 = OneVsRest(classifier=dt, weightCol="weight")
     self.assertIsNotNone(ovr2.fit(df))
Example #11
 def test_parallelism_doesnt_change_output(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=1)
     modelPar1 = ovrPar1.fit(df)
     ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=2)
     modelPar2 = ovrPar2.fit(df)
     for i, model in enumerate(modelPar1.models):
         self.assertTrue(np.allclose(model.coefficients.toArray(),
                                     modelPar2.models[i].coefficients.toArray(), atol=1E-4))
         self.assertTrue(np.allclose(model.intercept, modelPar2.models[i].intercept, atol=1E-4))
Example #12
 def test_kmeans_summary(self):
     data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
             (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=2, seed=1)
     model = kmeans.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 1)
Example #13
 def test_bisecting_kmeans_summary(self):
     data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
             (Vectors.sparse(1, [], []),)]
     df = self.spark.createDataFrame(data, ["features"])
     bkm = BisectingKMeans(k=2)
     model = bkm.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 20)
Example #14
 def ztest_toPandas(self):
     data = [(Vectors.dense([0.1, 0.2]),),
             (Vectors.sparse(2, {0:0.3, 1:0.4}),),
             (Vectors.sparse(2, {0:0.5, 1:0.6}),)]
     df = self.sql.createDataFrame(data, ["features"])
     self.assertEqual(df.count(), 3)
     pd = self.converter.toPandas(df)
     self.assertEqual(len(pd), 3)
     self.assertTrue(isinstance(pd.features[0], csr_matrix),
                     "Expected pd.features[0] to be csr_matrix but found: %s" %
                     type(pd.features[0]))
     self.assertEqual(pd.features[0].shape[0], 3)
     self.assertEqual(pd.features[0].shape[1], 2)
     self.assertEqual(pd.features[0][0,0], 0.1)
     self.assertEqual(pd.features[0][0,1], 0.2)
Example #15
    def test_binomial_logistic_regression_with_bound(self):

        df = self.spark.createDataFrame(
            [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
             (0.0, 2.0, Vectors.dense(1.0, 2.0)),
             (1.0, 3.0, Vectors.dense(2.0, 1.0)),
             (0.0, 4.0, Vectors.dense(3.0, 3.0)), ], ["label", "weight", "features"])

        lor = LogisticRegression(regParam=0.01, weightCol="weight",
                                 lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
                                 upperBoundsOnIntercepts=Vectors.dense(0.0))
        model = lor.fit(df)
        self.assertTrue(
            np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
Example #16
def reduce(inputpath, alg, k):
    # Count the rows/features of the input file, run the selected dimensionality-
    # reduction algorithm, and return a summary dict. Assumes an existing
    # SparkContext `sc` and a `pca()` helper defined elsewhere in the project.
    n_data = 0
    n_features = 0
    input_n = 0
    result = "successful!"
    inputdir = os.path.dirname(inputpath)
    print("inputdir: " + inputdir + result)

    with open(inputpath, 'r') as inputfile:
        for line in inputfile:
            input_n = len(line.split(" "))
            n_data += 1

    if int(k) >= input_n:
        print("reduced features must be smaller than input features.")
        result = "reduced features must be smaller than input features."
    else:
        lines = sc.textFile(inputpath).map(lambda x: x.split(" "))
        lines = lines.map(lambda x: (x[0], [float(y) for y in x[1:]]))
        df = lines.map(lambda x: Row(labels=x[0], features=Vectors.dense(x[1]))).toDF()

        if alg == "pca":
            output_data = pca(inputdir, df, alg, k)

        output_data = inputdir + "/" + alg + str(k) + "_Data"
        file_size = str(os.stat(output_data).st_size)
        counter = 0
        n_features = '0'
        with open(output_data, 'r') as inputfile:
            for line in inputfile:
                input_n = len(line.split(" "))
                n_features = str(input_n)
                counter += 1
        n_data = str(counter)

        result = "File: " + os.path.basename(output_data) + '</br>'
        result += "Path: " + os.path.dirname(output_data) + '/' + alg + str(k) + "_Features/" + '</br>'
        result += "Dimension: " + n_data + " x " + n_features + "</br>"
        result += "Size: " + file_size + ' bytes'
        print(result)

    print("Dimension reduction finished!")

    context = {'n_data': n_data, 'n_features': n_features, 'result': result}
    return context
Example #17
    def test_vector_size_hint(self):
        df = self.spark.createDataFrame(
            [(0, Vectors.dense([0.0, 10.0, 0.5])),
             (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
             (2, Vectors.dense([2.0, 12.0]))],
            ["id", "vector"])

        sizeHint = VectorSizeHint(
            inputCol="vector",
            handleInvalid="skip")
        sizeHint.setSize(3)
        self.assertEqual(sizeHint.getSize(), 3)

        output = sizeHint.transform(df).head().vector
        expected = DenseVector([0.0, 10.0, 0.5])
        self.assertEqual(output, expected)
Example #18
    def test_tweedie_distribution(self):

        df = self.spark.createDataFrame(
            [(1.0, Vectors.dense(0.0, 0.0)),
             (1.0, Vectors.dense(1.0, 2.0)),
             (2.0, Vectors.dense(0.0, 0.0)),
             (2.0, Vectors.dense(1.0, 1.0)), ], ["label", "features"])

        glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)
        model = glr.fit(df)
        self.assertTrue(np.allclose(model.coefficients.toArray(), [-0.4645, 0.3402], atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, 0.7841, atol=1E-4))

        model2 = glr.setLinkPower(-1.0).fit(df)
        self.assertTrue(np.allclose(model2.coefficients.toArray(), [-0.6667, 0.5], atol=1E-4))
        self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1E-4))
Example #19
 def test_linear_regression_pmml_basic(self):
     # Most of the validation is done in the Scala side, here we just check
     # that we output text rather than parquet (e.g. that the format flag
     # was respected).
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LinearRegression(maxIter=1)
     model = lr.fit(df)
     path = tempfile.mkdtemp()
     lr_path = path + "/lr-pmml"
     model.write().format("pmml").save(lr_path)
     pmml_text_list = self.sc.textFile(lr_path).collect()
     pmml_text = "\n".join(pmml_text_list)
     self.assertIn("Apache Spark", pmml_text)
     self.assertIn("PMML", pmml_text)
Example #20
 def test_kmean_pmml_basic(self):
     # Most of the validation is done in the Scala side, here we just check
     # that we output text rather than parquet (e.g. that the format flag
     # was respected).
     data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
             (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=2, seed=1)
     model = kmeans.fit(df)
     path = tempfile.mkdtemp()
     km_path = path + "/km-pmml"
     model.write().format("pmml").save(km_path)
     pmml_text_list = self.sc.textFile(km_path).collect()
     pmml_text = "\n".join(pmml_text_list)
     self.assertIn("Apache Spark", pmml_text)
     self.assertIn("PMML", pmml_text)
Example #21
 def test_clustering_evaluator_with_cosine_distance(self):
     featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]),
                                 [([1.0, 1.0], 1.0), ([10.0, 10.0], 1.0), ([1.0, 0.5], 2.0),
                                  ([10.0, 4.4], 2.0), ([-1.0, 1.0], 3.0), ([-100.0, 90.0], 3.0)])
     dataset = self.spark.createDataFrame(featureAndPredictions, ["features", "prediction"])
     evaluator = ClusteringEvaluator(predictionCol="prediction", distanceMeasure="cosine")
     self.assertEqual(evaluator.getDistanceMeasure(), "cosine")
     self.assertTrue(np.isclose(evaluator.evaluate(dataset),  0.992671213, atol=1e-5))
Example #22
from itertools import chain
from pyspark.sql.functions import col, collect_list, first, udf
from pyspark.ml.linalg import Vectors, VectorUDT


def convert_to_flat_by_sparkpy_v3(df):
    vectorize = udf(lambda vs: Vectors.dense(list(chain.from_iterable(vs))), VectorUDT())
    spark_df = df.orderBy("key", "subkey")
    spark_df = spark_df.groupBy("key").agg(first(col("parameter")).alias("label"),
                                           collect_list("reference").alias("features"))
    spark_df = spark_df.withColumn('features', vectorize('features'))
    spark_df = spark_df.select("label", "features")
    return spark_df
Example #23
 def test_gaussian_mixture_summary(self):
     data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
             (Vectors.sparse(1, [], []),)]
     df = self.spark.createDataFrame(data, ["features"])
     gmm = GaussianMixture(k=2)
     model = gmm.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertTrue(isinstance(s.probability, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 3)
Example #24
 def test_onevsrest(self):
     temp_path = tempfile.mkdtemp()
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     ovrPath = temp_path + "/ovr"
     ovr.save(ovrPath)
     loadedOvr = OneVsRest.load(ovrPath)
     self._compare_pipelines(ovr, loadedOvr)
     modelPath = temp_path + "/ovrModel"
     model.save(modelPath)
     loadedModel = OneVsRestModel.load(modelPath)
     self._compare_pipelines(model, loadedModel)
Example #25
 def test_parallel_evaluation(self):
     dataset = self.spark.createDataFrame(
         [(Vectors.dense([0.0]), 0.0),
          (Vectors.dense([0.4]), 1.0),
          (Vectors.dense([0.5]), 0.0),
          (Vectors.dense([0.6]), 1.0),
          (Vectors.dense([1.0]), 1.0)] * 10,
         ["features", "label"])
     lr = LogisticRegression()
     grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
     evaluator = BinaryClassificationEvaluator()
     tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
     tvs.setParallelism(1)
     tvsSerialModel = tvs.fit(dataset)
     tvs.setParallelism(2)
     tvsParallelModel = tvs.fit(dataset)
     self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics)
Example #26
def mldemo():

    spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])
    
    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))
    
    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
Example #27
    def test_type_error(self):
        df = self.spark.createDataFrame([("a", 0), ("b", 0)]).toDF("features", "key")
        keyedPCA = KeyedEstimator(sklearnEstimator=PCA())
        self.assertRaises(TypeError, keyedPCA.fit, df)

        df = self.spark.createDataFrame([(Vectors.dense([i]), [i], 0) for i in range(10)])
        df = df.toDF("features", "y", "key")
        keyedLR = KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y")
        self.assertRaises(TypeError, keyedLR.fit, df)
Example #28
    def test_expose_sub_models(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        numFolds = 3
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                            numFolds=numFolds, collectSubModels=True)

        def checkSubModels(subModels):
            self.assertEqual(len(subModels), numFolds)
            for i in range(numFolds):
                self.assertEqual(len(subModels[i]), len(grid))

        cvModel = cv.fit(dataset)
        checkSubModels(cvModel.subModels)

        # Test the default value for option "persistSubModel" to be "true"
        testSubPath = temp_path + "/testCrossValidatorSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        cvModel.save(savingPathWithSubModels)
        cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
        checkSubModels(cvModel3.subModels)
        cvModel4 = cvModel3.copy()
        checkSubModels(cvModel4.subModels)

        savingPathWithoutSubModels = testSubPath + "cvModel2"
        cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
        cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
        self.assertEqual(cvModel2.subModels, None)

        for i in range(numFolds):
            for j in range(len(grid)):
                self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid)
Example #29
    def test_parallel_evaluation(self):
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
        evaluator = BinaryClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cv.setParallelism(1)
        cvSerialModel = cv.fit(dataset)
        cv.setParallelism(2)
        cvParallelModel = cv.fit(dataset)
        self.assertEqual(cvSerialModel.avgMetrics, cvParallelModel.avgMetrics)
Example #30
 def setUp(self):
     super(MLlibTestCase, self).setUp()
     self.sc = self.spark.sparkContext
     self.sql = self.spark
     self.X = np.array([[1,2,3],
                        [-1,2,3], [1,-2,3], [1,2,-3],
                        [-1,-2,3], [1,-2,-3], [-1,2,-3],
                        [-1,-2,-3]])
     self.y = np.array([1, 0, 1, 1, 0, 1, 0, 0])
     data = [(float(self.y[i]), Vectors.dense(self.X[i])) for i in range(len(self.y))]
     self.df = self.sql.createDataFrame(data, ["label", "features"])
Example #31
from __future__ import print_function

# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorAssemblerExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])

    assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                                outputCol="features")

    output = assembler.transform(dataset)
    print(
        "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'"
    )
    output.select("features", "clicked").show(truncate=False)
    # $example off$

    spark.stop()
Example #32
    for content_series in content_series_iter:
        yield featurize_series(model, content_series)


# Pandas UDFs on large records (e.g., very large images) can run into Out Of Memory (OOM) errors.
# If you hit such errors in the cell below, try reducing the Arrow batch size via `maxRecordsPerBatch`.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")

# We can now run featurization on our entire Spark DataFrame.
# NOTE: This can take a long time (about 10 minutes) since it applies a large model to the full dataset.
features_df = images_df.repartition(16).select(
    col("path"),
    featurize_udf("content").alias("features"))

# MLlib needs some post-processing of the features column format
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
features_df = features_df.select(
    col("path"),
    list_to_vector_udf(features_df["features"]).alias("features"))

# OMITTED HERE
# You need to add the labels to your dataset based on the path of your images
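# A hedged sketch of one way to derive the labels, assuming the class name is the
# image's parent directory (the regex, column names, and indexing step here are
# illustrative, not part of the original):
from pyspark.sql.functions import regexp_extract
from pyspark.ml.feature import StringIndexer
features_df = features_df.withColumn(
    "class", regexp_extract(col("path"), r"/([^/]+)/[^/]+$", 1))
features_df = StringIndexer(inputCol="class", outputCol="label") \
    .fit(features_df).transform(features_df)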

# Split into training, validation, and test sets
df_train_split, df_validate_split, df_test_split = features_df.randomSplit(
    [0.6, 0.3, 0.1], 42)

#Here we start to train the tail of the model

# This concatenates all feature columns into a single feature vector in a new column "featuresModel".
vectorAssembler = VectorAssembler(inputCols=['features'],
                                  outputCol='featuresModel')
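# A hedged sketch of how the "tail" training could continue from here, assuming a
# numeric "label" column has been added (see the labelling step above); the
# LogisticRegression parameters are illustrative, not tuned values from the original:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol="featuresModel", labelCol="label",
                        maxIter=20, regParam=0.05)
lr_model = lr.fit(vectorAssembler.transform(df_train_split))
predictions = lr_model.transform(vectorAssembler.transform(df_validate_split))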
Example #33
    spark = SparkSession.builder \
        .master("local") \
        .appName("regression") \
        .getOrCreate()

    features = []
    labels = []
    for x in range(40):
        features.append(x)
        labels.append(2 * x + 1)

    merged = list(zip(features, labels))
    random.shuffle(merged)
    features[:], labels[:] = zip(*merged)

    data = [(labels[x], Vectors.dense([features[x]])) for x in range(40)]

    train = spark.createDataFrame(data[:30], ["label", "features"])

    test = spark.createDataFrame(data[30:], ["label", "features"])

    # defaults: maxIter = 100, regParam = 0.0, tol = 1e-6, fitIntercept = True
    # manually tuned parameters below
    maxIter_param = 70
    regParam_param = 0.01
    tol_param = 1e-29
    fitIntercept_param = True
    lr = LinearRegression(maxIter=maxIter_param,
                          regParam=regParam_param,
                          tol=tol_param,
                          fitIntercept=fitIntercept_param)
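    # A hedged sketch of fitting and checking the manually tuned model above
    # (RegressionEvaluator/RMSE are standard pyspark.ml APIs, not part of the original snippet):
    from pyspark.ml.evaluation import RegressionEvaluator
    model = lr.fit(train)
    predictions = model.transform(test)
    rmse = RegressionEvaluator(metricName="rmse").evaluate(predictions)
    print("RMSE on test data: %g" % rmse)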
Example #34
if __name__ == "__main__":

    if "SPARK_HOME" in os.environ.keys():
        print("SPARK_HOME: ", os.environ['SPARK_HOME'])
    else:
        raise ValueError(
            "Environment variable SPARK_HOME needs to be specified,"
            " and make sure spark-iforest.jar is added into your lib path ($SPARK_HOME/jars"
        )

    spark = SparkSession \
        .builder.master("local[*]") \
        .appName("IForestExample") \
        .getOrCreate()

    data = [(Vectors.dense([0.0, 0.0]), ), (Vectors.dense([7.0, 9.0]), ),
            (Vectors.dense([9.0, 8.0]), ), (Vectors.dense([8.0, 9.0]), )]

    df = spark.createDataFrame(data, ["features"])

    from pyspark_iforest.ml.iforest import *

    iforest = IForest(contamination=0.3, maxDepth=2)
    model = iforest.fit(df)

    print(model.hasSummary)

    summary = model.summary

    print(summary.numAnomalies)
Example #35
#accuracy = evaluator.evaluate(predictions)
#accuracy = evaluator.evaluate(predictions.predictions)
#accuracy = evaluator.evaluate(prediction) Prediction is not defined
#print("Test Error = %g" %(1.0 - accuracy))
#print("Test Area under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})))
stages = []
rf = RandomForestClassifier(numTrees=100,
                            maxDepth=5,
                            maxBins=5,
                            labelCol="label",
                            featuresCol="features",
                            seed=42)
#rf=assembler
stages += [rf]
#trainingData=temp_df.rdd.map(lambda x:(Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
trainingData = file_df.rdd.map(lambda x: (Vectors.dense(x[0:-1]), x[-1])).toDF(
    ["features", "label"])
trainingData.show()
#params =
pipeline = Pipeline(stages=stages)
lr = LogisticRegression().setFeaturesCol("features")
params = ParamGridBuilder().build()
#params=ParamGridBuilder().addGrid(lr.maxIter, [500]).addGrid(lr.regParam, [0]).addGrid(lr.elasticNetParam, [1]).build()
#cvModel
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=params,
                    evaluator=evaluator,
                    numFolds=5)

#cvModel = cv.fit(file_df) #IllegalArgumentException features does not exist
#cvModel=cv.fit(output_data) #Illegal type
Example #36
 def majority_vote(neighbors):
     return Vectors.dense(
         collections.Counter([x[1] for x in neighbors]).most_common()[0][0])
Example #37
model2.numFeatures

model2.save('E:/kaggle/titanic/dt_model10')

# -----------------------------------------------------------------------
# Step 8 for deployment --- save and load model for operationalization purpose -
from pyspark.ml.classification import DecisionTreeClassificationModel
model3 = DecisionTreeClassificationModel.load('E:/kaggle/titanic/dt_model10')

model3.depth
model3.numFeatures

from pyspark.ml.linalg import Vectors
predict_df = spark.createDataFrame(
    [(1, Vectors.dense(1.0, 0.0, 1.0, 0.0, 1.0, 0.0))], ['index', 'Features'])
predict_df.show()
model3.transform(predict_df).select('prediction').first()[0]

# ------------------------------------------------------------------------

training1 = model2.transform(training)
training1.show(5)

PredictionandLabels = training1.select(training1.prediction,
                                       training1.Survived).rdd
PredictionandLabels.collect()

from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
# metrics1 = BinaryClassificationMetrics(PredictionandLabels)
# (train score/train accuracy   --- )
Example #38
    SiteCategoryEncoder, AppCategoryEncoder, DeviceTypeEncoder,
    DeviceConnTypeEncoder, C15Encoder, C16Encoder, C18Encoder, C19Encoder,
    C21Encoder, FeatureAssembler
])
modelTmp = pipelineTmp.fit(schemaClick)
tmp = modelTmp.transform(schemaClick).select("click", "VectoredFeatures")
tmp.registerTempTable("CLICK")

# Selecting click and VectoredFeatures from Table "CLICK" and creating new dataFrame as results
results = sqlContext.sql("SELECT click, VectoredFeatures from CLICK")
results.show()

# Creating label points for attributes click and VectoredFeatures
click_transformed = results.select(
    'click', 'VectoredFeatures').rdd.map(lambda row: LabeledPoint(
        float(row.click), Vectors.dense((row.VectoredFeatures).toArray())))
click_transformed.take(2)

#Divide the data into training and test sets
weights = [.8, .2]
seed = 15

ClickTrain, ClickTest = click_transformed.randomSplit(weights, seed)

# Train the training data set for the Gradient Decent
modelGD = LogisticRegressionWithSGD.train(ClickTrain,
                                          iterations=15,
                                          step=1,
                                          miniBatchFraction=1,
                                          regType=None,
                                          validateData=False)
Example #39
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("Spark MLlib") \
        .config("spark.master", "local") \
        .getOrCreate()

######################################################
#   Example 1 - Dense & Sparse Vectors
######################################################

from pyspark.ml.linalg import Vectors
denseVec = Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)
size = 12
idx = [1, 2, 10, 11]  # locations of non-zero elements in vector
values = [12.0, 32.0, 110.0, 27.0]
sparseVec = Vectors.sparse(size, idx, values)

print("denseVec: ", denseVec)
print("sparseVec: ", sparseVec)

spark.stop()
Example #40
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("EstimatorTransformerParamExample")\
        .getOrCreate()

    # $example on$
    # Prepare training data from a list of (label, features) tuples.
    training = spark.createDataFrame([
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a transformer produced by an Estimator),
    # we can view the parameters it used during fit(). This prints the parameter
    # (name: value) pairs, where names are unique IDs for this LogisticRegression instance.
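    # A minimal sketch of that inspection, using the standard extractParamMap() API
    # (the original snippet is cut off before this point):
    print("Model 1 was fit using parameters: ")
    print(model1.extractParamMap())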
Example #41
 def add_features(feat, *other):
     raw = feat.toArray()
     return Vectors.dense(np.append(raw, list(map(float, other))))
Example #42
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("Spark MLlib") \
        .config("spark.master", "local") \
        .getOrCreate()

sc = spark.sparkContext

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

bdf = sc.parallelize([
    Row(label=1.0, weight=1.0, features=Vectors.dense(0.0, 5.0)),
    Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),
    Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),
    Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))
]).toDF()

blor = LogisticRegression(regParam=0.01, weightCol="weight")
blorModel = blor.fit(bdf)
blorModel.coefficients
blorModel.intercept

test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
blorModel.transform(test1).head().prediction

save_path = "C:\\PySpark\\spark_ml\\saved_models\\logistic_regression_example_1\\"
estimator_path = save_path + "lr"
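# A hedged sketch of the save/load round trip these paths appear to set up, using the
# standard MLWritable/MLReadable APIs (the model path name below is illustrative):
from pyspark.ml.classification import LogisticRegressionModel
blor.save(estimator_path)
blor2 = LogisticRegression.load(estimator_path)
model_path = save_path + "lr_model"
blorModel.save(model_path)
blorModel2 = LogisticRegressionModel.load(model_path)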
Example #43
from collections import Counter
from functools import partial
from functools import reduce

import pandas as pd
from pyspark import SparkContext
from pyspark.ml.feature import PCAModel
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import DataFrame
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, FloatType

get_cluster_size = udf(lambda x: len(x), IntegerType())
transform_to_vec = udf(lambda x: Vectors.dense(x), VectorUDT())
get_component1 = udf(lambda x: x.toArray().tolist()[0], FloatType())
get_component2 = udf(lambda x: x.toArray().tolist()[1], FloatType())


def unionAll(dfs):
    return reduce(DataFrame.unionAll, dfs)


def get_cluster_purity(images_p, b_mapping):
    total_c = list()
    for img in images_p:
        img_c = b_mapping.value[img.split("/")[-1]]
        total_c += img_c

    N = len(images_p)
Example #44
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("EstimatorTransformerParamExample")\
        .getOrCreate()

    # $example on$
    # Prepare training data from a list of (label, features) tuples.
    training = spark.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
                                      (0.0, Vectors.dense([2.0, 1.0, -1.0])),
                                      (0.0, Vectors.dense([2.0, 1.3, 1.0])),
                                      (1.0, Vectors.dense([0.0, 1.2, -0.5]))],
                                     ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a transformer produced by an Estimator),
    # we can view the parameters it used during fit().
Example #45
# Make predictions.
predictions = model.transform(testData)
# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
gbtModel = model.stages[1]
print(gbtModel)  # summary only

from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors
training = spark.createDataFrame([(1.218, 1.0, Vectors.dense(1.560, -0.605)),
                                  (2.949, 0.0, Vectors.dense(0.346, 2.158)),
                                  (3.627, 0.0, Vectors.dense(1.380, 0.231)),
                                  (0.273, 1.0, Vectors.dense(0.520, 1.151)),
                                  (4.199, 0.0, Vectors.dense(0.795, -0.226))],
                                 ["label", "censor", "features"])
quantileProbabilities = [0.3, 0.6]
aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities,
                            quantilesCol="quantiles")
model = aft.fit(training)
# Print the coefficients, intercept and scale parameter for AFT survival regression
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))
print("Scale: " + str(model.scale))
model.transform(training).show(truncate=False)
Example #46
def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1]), r[-1]]).toDF(
        ['features', 'label'])
Example #47
## wine_data_label_indexed = model.transform(wine_data)
## wine_data_label_indexed.show(5)
## #wine_data = wine_data.withColumn('string_quality', wine_data.quality.cast('string'))
## 
## 
## from pyspark.ml.feature import StringIndexer
## indexer = StringIndexer(inputCol="label", outputCol="indexed_label")
## model = indexer.fit(ml_wine_data)
## wine_data_label_indexed = model.transform(ml_wine_data)
## wine_data_label_indexed.show(5)


# convert data into featuresCol and labelCol structure
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
ml_wine_data = wine_data.rdd.map(lambda r: [Vectors.dense(r[:-1]), r[-1]]).toDF(['featuresCol', 'label'])
ml_wine_data.show(5)



## from pyspark.ml.feature import VectorIndexer
## indexer = VectorIndexer(maxCategories=4, inputCol='featuresCol', outputCol='indexed_features')
## model = indexer.fit(wine_data_label_indexed)
## wine_data_feature_indexed  = model.transform(wine_data_label_indexed)

## splitting data into training and test sets
training, test = ml_wine_data.randomSplit(weights=[0.7, 0.3], seed=123)
training.show(5)


## naive bayes classifier
Example #48
def dataset_multinomial(spark_session):
    return spark_session.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], [])),
         (2.0, Vectors.dense(0.5))] * 100,
        ["label", "features"],
    ).cache()
Example #49
Run with:
  bin/spark-submit examples/src/main/python/ml/correlation_example.py
"""
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("CorrelationExample") \
        .getOrCreate()

    # $example on$
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), ),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]), ),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]), ),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), )]
    df = spark.createDataFrame(data, ["features"])

    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
    # $example off$

    spark.stop()
Example #50
df0=sample_300_df2.select(["_id", "chordRatio", "chordRatioMinHash"])

from pyspark.sql import Row
from pyspark.sql.functions import array, col, udf
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.ml.linalg import Vectors, VectorUDT
from sparkaid import flatten
df0_flat=flatten(df0)
columns_list1=df0_flat.columns[1:-1]
array_df=df0_flat.select('_id', 'chordRatioMinHash',array(columns_list1).alias('chordRatioJS'))

#fill NaNs with zeros in the array column
df2_flat=df0_flat.na.fill(float(0))
columns_list2=df2_flat.columns[1:-1]
array_df2=df2_flat.select('_id', 'chordRatioMinHash',array(columns_list2).alias('chordRatioJS_no_Nulls'))

###
to_vector = udf(lambda a: Vectors.dense(a), VectorUDT())
data = array_df2.select('_id', 'chordRatioMinHash', "chordRatioJS_no_Nulls", to_vector("chordRatioJS_no_Nulls").alias("chordRatioWJS"))
data.show(1, truncate=False)

import scipy.sparse
from pyspark.ml.linalg import Vectors, _convert_to_vector, VectorUDT
from pyspark.sql.functions import udf, col
## from dense to sparse array
def dense_to_sparse(vector):
    return _convert_to_vector(scipy.sparse.csc_matrix(vector.toArray()).T)

to_sparse = udf(dense_to_sparse, VectorUDT())
data_sparse=data.withColumn("sparseChordRatioJS", to_sparse(col("chordRatioWJS")))
#data_sparse2=data_sparse.select('_id', 'chordRatio_for_minHash', 'sparseChordRatioJS')

indices_udf = udf(lambda vector: vector.indices.tolist(), ArrayType(IntegerType()))
Example #51
def toSparseVector(index, values):
    day_list_index, qty_list_values = zip(*sorted(zip(index, values)))
    # 367 for a leap year (days 1 to 366, +1)
    return Vectors.sparse(366, day_list_index, qty_list_values)
Example #52
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

dataA = [(
    0,
    Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),
), (
    1,
    Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),
), (
    2,
    Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),
)]

dfA = spark.createDataFrame(dataA, ["id", "features"])

dataB = [(
    3,
    Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),
), (
    4,
    Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),
), (
    5,
    Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),
)]
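# A hedged sketch of how this MinHashLSH example typically continues: fit on dfA, then
# an approximate Jaccard-distance join against dfB (the threshold and numHashTables
# values here are illustrative):
dfB = spark.createDataFrame(dataB, ["id", "features"])
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(dfA)
model.approxSimilarityJoin(dfA, dfB, 0.6, distCol="JaccardDistance") \
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            "JaccardDistance").show()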
Example #53
parts = lines.map(lambda row: row.value.split(","))
df = spark.createDataFrame(parts, [
    "id", "accommodates", "bathrooms", "bedrooms", "beds", "price",
    "Shared room", "Entire home/apt", "Private room", "Queens", "Brooklyn",
    "Staten Island", "Manhattan", "Bronx", "testTrain", "listId"
])

#Put in the same order
testTrainData = df.select("accommodates", "bathrooms", "bedrooms", "beds",
                          "price", "Shared room", "Entire home/apt",
                          "Private room", "Queens", "Brooklyn",
                          "Staten Island", "Manhattan", "Bronx", "testTrain",
                          "listId")

# Make dataframe into LIBSVM format
datak = testTrainData.rdd.map(lambda x: (Vectors.dense(x[0:-2]), x[-1])).toDF(
    ["features", "label"])

# Trains a k-means model.
kmeans = KMeans().setK(10).setSeed(123)
model = kmeans.fit(datak)

# Make predictions
predictions = model.transform(datak)

# Clustered dataframe
testTrainData = testTrainData.select('listId', 'testTrain')
testTrainData = testTrainData.withColumnRenamed('listId', 'newId')
OuterCluster = dataset.join(predictions, dataset.listId == predictions.label)
OuterCluster = OuterCluster.drop('features').drop('label')
OuterCluster = OuterCluster.join(testTrainData,
Example #54
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun  8 15:08:11 2018

@author: luogan
"""

from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

from pyspark.sql import SparkSession
spark= SparkSession\
                .builder \
                .appName("dataFrame") \
                .getOrCreate()
df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
                            (Vectors.dense([0.0, 0.0]), ),
                            (Vectors.dense([3.0, -1.0]), )], ["features"])

polyExpansion = PolynomialExpansion(degree=3,
                                    inputCol="features",
                                    outputCol="polyFeatures")
polyDF = polyExpansion.transform(df)

polyDF.show(truncate=False)
Example #55
spark = SparkSession.builder.getOrCreate()

# Principal Component Analysis (PCA)

# In[3]:

# PCA is a statistical procedure that uses an orthogonal transformation
# to convert a set of observations of possibly correlated variables
# into a set of values of linearly uncorrelated variables called
# principal components.
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

# In[4]:

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]

# In[5]:

df = spark.createDataFrame(data, ["features"])

# In[6]:

df.show()

# In[7]:

# The example below shows how to project 5-dimensional feature vectors into
# 3-dimensional principal components.
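# A minimal sketch of that projection, using the standard pyspark.ml.feature.PCA API
# (k=3 and the output column name are illustrative):
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
pca_model = pca.fit(df)
pca_model.transform(df).select("pcaFeatures").show(truncate=False)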
Example #56
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# $example on$
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ElementwiseProductExample")\
        .getOrCreate()

    # $example on$
    # Create some vector data; also works for sparse vectors
    data = [(Vectors.dense([1.0, 2.0, 3.0]), ),
            (Vectors.dense([4.0, 5.0, 6.0]), )]
    df = spark.createDataFrame(data, ["vector"])
    transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                     inputCol="vector",
                                     outputCol="transformedVector")
    # Batch transform the vectors to create new column:
    transformer.transform(df).show()
    # $example off$

    spark.stop()
Example #57
path = "./resources/"
angry_df = ImageSchema.readImages(path + "0/").withColumn("label", lit(0))
happy_df = ImageSchema.readImages(path + "3/").withColumn("label", lit(1))
sad_df = ImageSchema.readImages(path + "4/").withColumn("label", lit(2))

sc = spark.sparkContext

log4jLogger = sc._jvm.org.apache.log4j
log = log4jLogger.Logger.getLogger(__name__)

log.info("pyspark script logger initialized")

df1 = angry_df.union(happy_df).union(sad_df)

parse_ = udf(lambda a: Vectors.dense(a), VectorUDT())
df = df1.withColumn("features", parse_(df1["image.data"]))

train, test, _ = df.randomSplit([0.1, 0.05, 0.85])

lr = LogisticRegression(maxIter=100,
                        regParam=0.05,
                        elasticNetParam=0.3,
                        featuresCol="features",
                        labelCol="label")
train.cache()

p = Pipeline(stages=[lr])
p_model = p.fit(train)

predictions = p_model.transform(test)
Example #58
spark = SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/temp").appName("LinearRegression").getOrCreate()

inputLines = spark.sparkContext.textFile("file:///Users/lesli/BigData/TA3/College.csv")

def parseLine(line):
    fields = line.split(',')
    SchoolType = fields[1]
    SF_Ratio = float(fields[15])
    Grad_Rate = float(fields[18])
    return (SchoolType, SF_Ratio, Grad_Rate)

parsedLines = inputLines.map(parseLine)

Private = parsedLines.filter(lambda x: "Yes" in x[0]) 

data = Private.map(lambda x: (Vectors.dense(float(x[1])),float(x[2]))).cache()

#data = inputLines.map(lambda x: x.split(",")).map(lambda x: (Vectors.dense(float(x[15])),float(x[18]))).cache()
#[16] is Grad_Rate (Y_variable), [13] is predictor -SF.Ratio 
                                                    

# Convert this RDD to a DataFrame
colNames = ['SF_Ratio','Grad_Rate'] #Y is Grad.rate
df = data.toDF(colNames)

# Let's split our data into training data and testing data
trainTest = df.randomSplit([0.8, 0.2])
trainingDF = trainTest[0]
testDF = trainTest[1]
wholeDF = df
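# A hedged sketch of the fit this split appears to lead up to; note the non-default
# column names (featuresCol/labelCol must point at SF_Ratio/Grad_Rate here):
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="SF_Ratio", labelCol="Grad_Rate")
lrModel = lr.fit(trainingDF)
lrModel.transform(testDF).select("Grad_Rate", "prediction").show(5)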
Example #59
def transData(data):
    # Map each row to [CustomerID, dense RFM feature vector] and convert to a DataFrame.
    return data.rdd.map(lambda r: [r[0], Vectors.dense(r[1:])]).toDF(['CustomerID', 'rfm'])
Example #60
 def zero_features(feat):
     raw = feat.toArray()
     for idx in idxs:
         raw[idx] = 0.
     return Vectors.dense(raw)