def ztest_toPandas(self):
    data = [(Vectors.dense([0.1, 0.2]),),
            (Vectors.sparse(2, {0: 0.3, 1: 0.4}),),
            (Vectors.sparse(2, {0: 0.5, 1: 0.6}),)]
    df = self.sql.createDataFrame(data, ["features"])
    self.assertEqual(df.count(), 3)
    pd = self.converter.toPandas(df)
    self.assertEqual(len(pd), 3)
    self.assertTrue(isinstance(pd.features[0], csr_matrix),
                    "Expected pd.features[0] to be csr_matrix but found: %s"
                    % type(pd.features[0]))
    self.assertEqual(pd.features[0].shape[0], 3)
    self.assertEqual(pd.features[0].shape[1], 2)
    self.assertEqual(pd.features[0][0, 0], 0.1)
    self.assertEqual(pd.features[0][0, 1], 0.2)
def test_java_object_gets_detached(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal",
                          weightCol="weight", fitIntercept=False)

    model = lr.fit(df)
    summary = model.summary

    self.assertIsInstance(model, JavaWrapper)
    self.assertIsInstance(summary, JavaWrapper)
    self.assertIsInstance(model, JavaParams)
    self.assertNotIsInstance(summary, JavaParams)

    error_no_object = 'Target Object ID does not exist for this gateway'

    self.assertIn("LinearRegression_", model._java_obj.toString())
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

    model.__del__()

    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

    try:
        summary.__del__()
    except:
        pass

    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        summary._java_obj.toString()
def test_persistence(self):
    # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
    df = self.spark.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    lda_path = path + "/lda"
    dist_model_path = path + "/distLDAModel"
    local_model_path = path + "/localLDAModel"
    # Test LDA
    lda.save(lda_path)
    lda2 = LDA.load(lda_path)
    self._compare(lda, lda2)
    # Test DistributedLDAModel
    distributedModel.save(dist_model_path)
    distributedModel2 = DistributedLDAModel.load(dist_model_path)
    self._compare(distributedModel, distributedModel2)
    # Test LocalLDAModel
    localModel.save(local_model_path)
    localModel2 = LocalLDAModel.load(local_model_path)
    self._compare(localModel, localModel2)
    # Clean up
    try:
        rmtree(path)
    except OSError:
        pass
def mldemo():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])

    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
def test_output_columns(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr, parallelism=1)
    model = ovr.fit(df)
    output = model.transform(df)
    self.assertEqual(output.columns, ["label", "features", "rawPrediction", "prediction"])
def test_copy(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    ovr1 = ovr.copy({lr.maxIter: 10})
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
    model = ovr.fit(df)
    model1 = model.copy({model.predictionCol: "indexed"})
    self.assertEqual(model1.getPredictionCol(), "indexed")
def test_parallelism_doesnt_change_output(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=1)
    modelPar1 = ovrPar1.fit(df)
    ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=2)
    modelPar2 = ovrPar2.fit(df)
    for i, model in enumerate(modelPar1.models):
        self.assertTrue(np.allclose(model.coefficients.toArray(),
                                    modelPar2.models[i].coefficients.toArray(),
                                    atol=1E-4))
        self.assertTrue(np.allclose(model.intercept, modelPar2.models[i].intercept, atol=1E-4))
def test_support_for_weightCol(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8), 1.0),
                                     (1.0, Vectors.sparse(2, [], []), 1.0),
                                     (2.0, Vectors.dense(0.5, 0.5), 1.0)],
                                    ["label", "features", "weight"])
    # classifier inherits hasWeightCol
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr, weightCol="weight")
    self.assertIsNotNone(ovr.fit(df))
    # classifier doesn't inherit hasWeightCol
    dt = DecisionTreeClassifier()
    ovr2 = OneVsRest(classifier=dt, weightCol="weight")
    self.assertIsNotNone(ovr2.fit(df))
def test_bisecting_kmeans_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    bkm = BisectingKMeans(k=2)
    model = bkm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 20)
def test_linear_regression_pmml_basic(self):
    # Most of the validation is done on the Scala side; here we just check
    # that we output text rather than parquet (e.g. that the format flag
    # was respected).
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1)
    model = lr.fit(df)
    path = tempfile.mkdtemp()
    lr_path = path + "/lr-pmml"
    model.write().format("pmml").save(lr_path)
    pmml_text_list = self.sc.textFile(lr_path).collect()
    pmml_text = "\n".join(pmml_text_list)
    self.assertIn("Apache Spark", pmml_text)
    self.assertIn("PMML", pmml_text)
def parse(line):
    obj = json.loads(line)
    fc = obj[featureCol]
    if "size" not in fc and "type" not in fc:
        feature_size = len(fc)
        dic = [(i, a) for i, a in enumerate(fc)]
        sv = SparseVector(len(fc), dic)
    elif "size" not in fc and "type" in fc and fc["type"] == 1:
        values = fc["values"]
        feature_size = len(values)
        dic = [(i, a) for i, a in enumerate(values)]
        sv = SparseVector(len(values), dic)
    else:
        feature_size = fc["size"]
        sv = Vectors.sparse(fc["size"], list(zip(fc["indices"], fc["values"])))
    return sv
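# For context, a minimal sketch of the three feature encodings the parse helper above
# appears to accept; the sample payloads and the featureCol name are illustrative
# assumptions, not taken from the original pipeline.
import json

featureCol = "features"  # assumed column name for this sketch

# 1) a plain list of values (no "size"/"type" keys) -> enumerated into a SparseVector
line_plain = json.dumps({"features": [0.1, 0.0, 0.3]})
# 2) a dense-vector JSON object with "type": 1 and a "values" list
line_dense = json.dumps({"features": {"type": 1, "values": [0.1, 0.0, 0.3]}})
# 3) a sparse-vector JSON object with "size", "indices" and "values"
line_sparse = json.dumps({"features": {"size": 3, "indices": [0, 2], "values": [0.1, 0.3]}})

for line in (line_plain, line_dense, line_sparse):
    print(parse(line))  # each branch returns a pyspark.ml.linalg SparseVector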
def test_gaussian_mixture_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    gmm = GaussianMixture(k=2)
    model = gmm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertTrue(isinstance(s.probability, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 3)
def test_onevsrest(self):
    temp_path = tempfile.mkdtemp()
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)
    ovrPath = temp_path + "/ovr"
    ovr.save(ovrPath)
    loadedOvr = OneVsRest.load(ovrPath)
    self._compare_pipelines(ovr, loadedOvr)
    modelPath = temp_path + "/ovrModel"
    model.save(modelPath)
    loadedModel = OneVsRestModel.load(modelPath)
    self._compare_pipelines(model, loadedModel)
def test_binary_logistic_regression_summary(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.labels, list))
    self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.precisionByLabel, list))
    self.assertTrue(isinstance(s.recallByLabel, list))
    self.assertTrue(isinstance(s.fMeasureByLabel(), list))
    self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
    self.assertTrue(isinstance(s.roc, DataFrame))
    self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
    self.assertTrue(isinstance(s.pr, DataFrame))
    self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
    self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
    self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
    self.assertAlmostEqual(s.accuracy, 1.0, 2)
    self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
    self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
    self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
    self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
    self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
    self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def test_linear_regression_summary(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
                          fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.predictionCol, "prediction")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
    self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
    self.assertAlmostEqual(s.meanSquaredError, 0.0)
    self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
    self.assertAlmostEqual(s.r2, 1.0, 2)
    self.assertAlmostEqual(s.r2adj, 1.0, 2)
    self.assertTrue(isinstance(s.residuals, DataFrame))
    self.assertEqual(s.numInstances, 2)
    self.assertEqual(s.degreesOfFreedom, 1)
    devResiduals = s.devianceResiduals
    self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))
    coefStdErr = s.coefficientStandardErrors
    self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
    tValues = s.tValues
    self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
    pValues = s.pValues
    self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned
    # The child class LinearRegressionTrainingSummary runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)
def test_glr_summary(self):
    from pyspark.ml.linalg import Vectors
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
                                      fitIntercept=False)
    model = glr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.predictionCol, "prediction")
    self.assertEqual(s.numInstances, 2)
    self.assertTrue(isinstance(s.residuals(), DataFrame))
    self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
    coefStdErr = s.coefficientStandardErrors
    self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
    tValues = s.tValues
    self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
    pValues = s.pValues
    self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
    self.assertEqual(s.degreesOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedomNull, 2)
    self.assertEqual(s.rank, 1)
    self.assertTrue(isinstance(s.solver, basestring))
    self.assertTrue(isinstance(s.aic, float))
    self.assertTrue(isinstance(s.deviance, float))
    self.assertTrue(isinstance(s.nullDeviance, float))
    self.assertTrue(isinstance(s.dispersion, float))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned
    # The child class GeneralizedLinearRegressionTrainingSummary runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.deviance, s.deviance)
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

prediction_features = [
    'overall', 'doctor', 'specialty', 'procedure', 'priority'
]

change_to_month_func = udf(
    lambda record: int(
        datetime.strftime(datetime.strptime(record, '%d/%m/%Y'), '%Y%m')),
    IntegerType())
change_to_date_func = udf(
    lambda record: datetime.strptime(str(record), '%Y%m'), DateType())
change_date_to_month = udf(
    lambda record: datetime(record.year, record.month, 1), DateType())
to_vector = udf(lambda record: Vectors.dense(record), VectorUDT())
to_vectors = udf(lambda col_a, col_b: Vectors.sparse(col_a, col_b))

# Creating Spark Context and Spark Session
scobj = SparkContext.getOrCreate()
spark = SparkSession(scobj).builder.config('spark.sql.crossJoin.enabled',
                                           'true').getOrCreate()


def perform_prediction(csv, predict_by='Overall', predict_period=3):
    """
    The function is the entry point to the prediction module.
    :param csv: --string: path to the csv file containing the data
    :param predict_by: -- string: The choices should be 'Overall', 'Doctor',
        'Specialty', 'Procedure', and 'Priority'
    :param predict_period: -- integer: The choices should be 3, 6, 12, 24, 36
    :return:
    Returns a dataframe with the corresponding cluster for each data point.
    Input:
        - model
        - a dataframe matching the training input
    Output: a dataframe with the columns features and pca_features.
    '''
    transformed = model.transform(df)
    return transformed


# =============================================================================
# Test and examples
# =============================================================================
print()
print("Data========================")
data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]
df = spark.createDataFrame(data, ["features"])

data2 = [(Vectors.sparse(5, [(4, 10.0), (3, 7.0)]), ),
         (Vectors.dense([20.0, 8.0, 0.3, 400.0, 5.0]), ),
         (Vectors.dense([40.0, 10.0, 20.0, 600.0, 700.0]), ),
         (Vectors.dense([4.0, 10.0, 200.0, 600.0, 700.0]), ),
         (Vectors.dense([3.0, 100.0, 0.0, 6000.0, 7000.0]), )]
df2 = spark.createDataFrame(data2, ["features"])

print()
print('data')
print(df.show())
print('data2')
                matrix[cnt].setdefault(shingle, shingles.get(shingle))
            else:
                shingles.setdefault(shingle, sh_count)
                matrix[cnt].setdefault(shingle, sh_count)
                sh_count += 1
        line = fp.readline().split(" ")
        cnt += 1

size = len(list(shingles))
cnt = 0
for key, value in tqdm(matrix.items()):
    aux = []
    for index, sh in value.items():
        aux.append(sh)
    data.append(
        (key, Vectors.sparse(size, sorted(list(aux)), np.ones(len(list(aux))))))

next_prime = sieve_of_eratosthenes(size * 2, size)

sc = spark.sparkContext
distData = sc.parallelize(data)
# df = spark.createDataFrame(data, ["id", "features"])
df = spark.createDataFrame(distData, ["id", "features"])
key = Vectors.dense([1.0, 0.0])

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5, seed=next_prime)
model = mh.fit(df)
dft = model.transform(df)
    .cols.unnest(["col_int"])\
    .table()

# ### Splits in 3 parts

df\
    .cols.unnest(["two strings"], splits=3, mark="-")\
    .table()

# ### Unnest a Vector

# +
from pyspark.ml.linalg import Vectors

df1 = op.sc.parallelize([("assert", Vectors.dense([1, 2, 3])),
                         ("require", Vectors.sparse(3, {1: 2}))
                         ]).toDF(["word", "vector"])
# -

df1\
    .cols.unnest(["vector"])\
    .table()

df = df.cols.append("new_col_1", 1)

# ## Impute

# ### Fill missing data

# +
df_fill = op.spark.createDataFrame([(1.0, float("nan")), (2.0, float("nan")),
def build(self):
    vec = Vectors.sparse(self.size, (self.indices, self.data))
    return vec  # return the built vector; the original fragment only assigned it
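# Side note: pyspark's Vectors.sparse accepts several argument forms; a minimal sketch
# of the common ones (the size, indices and values below are made up for illustration).
from pyspark.ml.linalg import Vectors

v1 = Vectors.sparse(4, [0, 3], [1.0, 5.5])      # separate index and value lists
v2 = Vectors.sparse(4, [(0, 1.0), (3, 5.5)])    # a list of (index, value) pairs
v3 = Vectors.sparse(4, {0: 1.0, 3: 5.5})        # a dict mapping index -> value
assert v1 == v2 == v3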
    if not hasattr(os, "mlsql_models"):
        setattr(os, "mlsql_models", {})
    if modelPath not in os.mlsql_models:
        print("Load sklearn model %s" % modelPath)
        os.mlsql_models[modelPath] = pickle.load(open(modelPath, "rb"))

    model = os.mlsql_models[modelPath]
    rawVector = pickle.loads(items[0])
    feature = VectorUDT().deserialize(rawVector)
    y = model.predict([feature.toArray()])
    return [VectorUDT().serialize(Vectors.dense(y))]


if run_for_test:
    import json

    model_path = '/tmp/__mlsql__/3242514c-4113-4105-bdc5-9987b28f9764/0'
    data_path = '/Users/allwefantasy/Downloads/data1/part-00000-03769d42-1948-499b-8d8f-4914562bcfc8-c000.json'

    with open(file=data_path) as f:
        for line in f.readlines():
            s = []
            wow = json.loads(line)['features']
            feature = Vectors.sparse(wow["size"], list(zip(wow["indices"], wow["values"])))
            s.insert(0, pickle.dumps(VectorUDT().serialize(feature)))
            s.insert(1, pickle.dumps([model_path]))
            print(VectorUDT().deserialize(predict(1, s)[0]))

python_fun.udf(predict)
def sparse_vec(r, count):
    # build a binary sparse vector over the sorted, de-duplicated indices in r[1]
    indices = sorted(set(r[1]))  # renamed to avoid shadowing the built-in `list`
    ones = [1.0 for _ in range(len(indices))]
    return r[0], Vectors.sparse(count, indices, ones)
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# datetime:2019/3/1 14:28
from mmlspark import LightGBMRegressor
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

# spark = SparkSession.builder \
#     .appName("normalizer") \
#     .master("local[2]") \
#     .getOrCreate()

svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})
dvec = Vectors.dense([3.0, -4.0])
print(svec)
print(dvec)
countTokens = udf(lambda words: len(words), IntegerType())

# Note: each transform adds new columns; it does not drop the previous ones.
tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

##################################################################################
# Extract the main features with principal component analysis (PCA)
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]
df = spark.createDataFrame(data, ["features"])
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)
result = model.transform(df).select("pcaFeatures")
result.show(truncate=False)

# Polynomial expansion features
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
from pyspark.ml.linalg import Vectors

denseVec = Vectors.dense(1.0, 2.0, 3.0)
size = 3
idx = [1, 2]  # locations of non-zero elements in vector
values = [2.0, 3.0]
sparseVec = Vectors.sparse(size, idx, values)


# COMMAND ----------

df = spark.read.json("/data/simple-ml")
df.orderBy("value2").show()


# COMMAND ----------

from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")


# COMMAND ----------

fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()


# COMMAND ----------

train, test = preparedDF.randomSplit([0.7, 0.3])
def data_describe(self):
    print('start to read data for rdd:')
    rawRdd_nlp = self.read_rdd('track2_title.txt').map(lambda line: eval(line))
    # print(rawRdd_nlp.take(10))

    # Convert to a DataFrame; without an explicit schema it would be inferred automatically.
    sqlContext = SQLContext(self.sc)
    labels = [
        ('item_id', typ.IntegerType()),
        ('title_features', typ.MapType(typ.StringType(), typ.IntegerType()))]
    Schema = typ.StructType([typ.StructField(e[0], e[1], True) for e in labels])
    df = sqlContext.createDataFrame(rawRdd_nlp, Schema)
    # df.show(10)
    # df.printSchema()

    print("count the number of unique words in each title, and the title length")
    gdf = df.select("item_id", fn.explode(fn.col("title_features"))).groupBy("item_id")
    df2 = gdf.agg(fn.count("key").alias("title_words_unique"))
    df3 = gdf.agg(fn.sum("value").alias("title_length"))
    df = df.join(df2, "item_id", "left") \
           .join(df3, "item_id", "left")
    df = df.drop("title_features")
    df.printSchema()

    print('start to deal with the title_features col,and compute the title topic')
    tokens = df.rdd.map(lambda d: d[1]).map(lambda d: list(d.keys()))  # the tokens of each title
    local_tokens = tokens.flatMap(lambda d: [int(token) for token in d]).distinct()
    print('largest value in local_tokens')
    print(local_tokens.top(1))
    vocab_size = max(local_tokens.top(1)) + 1

    # Convert the title_features column into a vector
    toInt = udf(lambda counts: {int(token): float(counts[token]) for token in counts},
                typ.StringType())
    df = df.withColumn("title_features_1", toInt(df.title_features))
    toVector = udf(lambda vs: Vectors.sparse(vocab_size, vs), VectorUDT())
    rescaledData = df.withColumn("features", toVector(df.title_features_1)).select("item_id", "features")
    df = df.drop("title_features_1")
    # del df
    # gc.collect()
    rescaledData.cache()

    lda = LDA(k=50, maxIter=200)
    # lda = LDA(k=2, maxIter=5)
    ldaModel = lda.fit(rescaledData)
    transformed = ldaModel.transform(rescaledData)  # .select("topicDistribution")
    # The result shows each document's weight for every topic (see transformed's columns).
    # Convert the topic-distribution vector into a topic id.
    # transformed.show(truncate=False)

    def to_array(col):
        def to_array_(v):
            return v.toArray().tolist()
        return psf.udf(to_array_, typ.ArrayType(typ.DoubleType()))(col)

    df_topic = transformed.withColumn("topic", to_array(psf.col("topicDistribution"))) \
        .select(["item_id"] + [psf.col("topic")[i] for i in range(50)])

    topicCol = df_topic.columns
    topicCol.remove("item_id")
    print('inspect the column names')
    print(topicCol)

    def getTopicID(p):
        # Use a key-value dict and find the key with the largest value.
        d = {}
        for c in topicCol:  # build the dict
            d[c] = p[c]
        z = list(d.keys())[list(d.values()).index(max(d.values()))]
        return int(z.replace("topic[", '').replace("]", ''))

    df_topic1 = df_topic.rdd.map(lambda p: (p.item_id, getTopicID(p)))
    labels = [
        ('item_id', typ.IntegerType()),
        ('title_topic', typ.IntegerType())]
    Schema = typ.StructType([typ.StructField(e[0], e[1], True) for e in labels])
    df_topic2 = sqlContext.createDataFrame(df_topic1, Schema)
    # df_topic2 = df_topic1.toDF(['item_id', 'topic'])
    # print('check that topic has the expected format; it is stored in df_topic2')
    df_topic2.show(5)

    df_nlp = df.join(df_topic2, "item_id", "left")
    # UnboundLocalError: local variable 'df' referenced before assignment
    df_nlp.printSchema()
    # item_id|title_features|title_words_unique|title_length|title_features1|title_topic|

    print('-------5. Save the preprocessing results-------')
    file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'nlp_topic_feature2'
    os.system("hadoop fs -rm -r {}".format(file_path))
    df_nlp.rdd.map(tuple).saveAsPickleFile(file_path)
    print('finished saving the data')

    print('start to read act data only for uid and item_id :')
    rawRdd_train = self.read_rdd('final_track2_train.txt').map(lambda line: line.split('\t'))
    rawRdd_test = self.read_rdd('final_track2_test_no_anwser.txt').map(lambda line: line.split('\t'))
    actionLogRdd_train = rawRdd_train.map(lambda x: (int(x[0]), int(x[2])))
    # total = actionLogRdd_train.count()
    # print('total: ' + str(total))
    actionLogRdd_test = rawRdd_test.map(lambda x: (int(x[0]), int(x[2])))

    sqlContext = SQLContext(self.sc)
    labels = [('uid', typ.IntegerType()),
              ('item_id', typ.IntegerType())]
    actionLogSchema = typ.StructType([typ.StructField(e[0], e[1], True) for e in labels])

    dfactionLog_train = sqlContext.createDataFrame(actionLogRdd_train, actionLogSchema)
    dfactionLog_test = sqlContext.createDataFrame(actionLogRdd_test, actionLogSchema)

    # Join on item_id
    # item_id|title_features||title_words_unique|title_length|title_features_1|title_topic
    df_nlp = df_nlp.select(["item_id", "title_words_unique", "title_length"])
    df_uid_nlp_test = dfactionLog_test.select(["uid", "item_id"]).join(df_nlp, 'item_id', 'left').drop("item_id")
    df_uid_nlp_train = dfactionLog_train.select(["uid", "item_id"]).join(df_nlp, 'item_id', 'left').drop("item_id")
    del dfactionLog_test
    del dfactionLog_train
    gc.collect()

    # Process: aggregate per uid
    gdf = df_uid_nlp_train.groupby("uid")
    df1 = gdf.agg(fn.max("title_words_unique").alias("uid_max_title_words_unique"),
                  fn.avg("title_words_unique").alias("uid_avg_title_words_unique"),
                  fn.max("title_length").alias("uid_max_title_length"),
                  fn.avg("title_length").alias("uid_avg_title_length"))
    df1.show(1, truncate=False)
    df_uid_train = df_uid_nlp_train.join(df1, 'uid', 'left').drop("title_words_unique").drop("title_length")
    df_uid_test = df_uid_nlp_test.join(df1, 'uid', 'left').drop("title_words_unique").drop("title_length")
    print("in theory only uid, uid_max_beauty, uid_avg_beauty, uid_male_ratio should remain")
    df_uid_train.printSchema()
    df_uid_test.printSchema()

    print('-------Save the df_uid_nlp data-------')
    file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'df_uid_nlp_train'
    os.system("hadoop fs -rm -r {}".format(file_path))  # os.system(command): command is the shell command to run
    df_uid_train.rdd.map(tuple).saveAsPickleFile(file_path)
    file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'df_uid_nlp_test'
    os.system("hadoop fs -rm -r {}".format(file_path))  # os.system(command): command is the shell command to run
    df_uid_test.rdd.map(tuple).saveAsPickleFile(file_path)
    print('finished saving the data')

    # Check the parameters used for the LDA model above:
    # larger log-likelihood (ll) is better, smaller log-perplexity (lp) is better.
    '''
    ll = ldaModel.logLikelihood(rescaledData)
    lp = ldaModel.logPerplexity(rescaledData)
    print(ll)
    print(lp)
    '''

    # Save ldaModel so it can be loaded directly when transforming the training set;
    # saving df_topic would be enough for now, so saving the model is optional.
    print("start saving the model")
    distributed_model_path = self.parser.get("hdfs_path", "hdfs_data_path") + "lda_distributed_model"
    ldaModel.save(distributed_model_path)
    print("finished saving the model")

    # How to load it back:
    print("loading the model")
    sameLdaModel = DistributedLDAModel.load(distributed_model_path)
    print("finished loading the model")

    # ---------------------------------3 Model and description------------------------------
    # The model is described via describeTopics and topicsMatrix.
    '''
    topicIndices = ldaModel.describeTopics(maxTermsPerTopic=5)
    topicIndices.show(truncate=False)
    # * topic | indices of the most important words in the topic | weight of each word
    '''
    '''
def to_sparse_vector(indices, values):
    indices, values = zip(*sorted(zip(indices, values)))
    return Vectors.sparse(max_id, indices, values)
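# A minimal sketch of how a helper like to_sparse_vector above is typically applied to
# build a features column; the SparkSession, the sample rows and the max_id value are
# assumptions for illustration, not taken from the original code.
from pyspark.ml.linalg import VectorUDT
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

spark = SparkSession.builder.getOrCreate()
max_id = 5  # assumed vector size used by to_sparse_vector

# wrap the helper as a UDF that returns an ML vector
to_sparse_udf = udf(to_sparse_vector, VectorUDT())

rows = [(1, [3, 0], [7.0, 1.0]), (2, [2], [4.0])]  # hypothetical (id, indices, values) rows
df = spark.createDataFrame(rows, ["id", "indices", "values"])
df.withColumn("features", to_sparse_udf("indices", "values")).show(truncate=False)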
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorSlicerExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3}),),
        Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]),)])

    slicer = VectorSlicer(inputCol="userFeatures", outputCol="features", indices=[1])

    output = slicer.transform(df)
    output.select("userFeatures", "features").show()
    # $example off$

    spark.stop()


"""
An example demonstrating MinHashLSH.
Run with:
  bin/spark-submit examples/src/main/python/ml/min_hash_lsh_example.py
"""

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("MinHashLSHExample") \
        .getOrCreate()

    # $example on$
    dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
             (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
             (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]
    dfA = spark.createDataFrame(dataA, ["id", "features"])

    dataB = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
             (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),
Run with:
  bin/spark-submit examples/src/main/python/ml/correlation_example.py
"""

# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("CorrelationExample") \
        .getOrCreate()

    # $example on$
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])

    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
    # $example off$

    spark.stop()
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation, Summarizer
from pyspark.mllib.stat import Statistics
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark_session = SparkSession \
        .builder \
        .getOrCreate()

    logger = spark_session._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.WARN)

    data_list = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
                 (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
                 (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
                 (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    data_frame = spark_session.createDataFrame(data_list, ["features"])
    data_frame.printSchema()
    data_frame.show()

    r1 = Correlation.corr(data_frame, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(data_frame, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))

    rdd_data = data_frame.rdd
    print(rdd_data.collect())
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Spark MLlib") \
    .config("spark.master", "local") \
    .getOrCreate()

######################################################
# Example 1 - Dense & Sparse Vectors
######################################################
from pyspark.ml.linalg import Vectors

denseVec = Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)

size = 12
idx = [1, 2, 10, 11]  # locations of non-zero elements in vector
values = [12.0, 32.0, 110.0, 27.0]
sparseVec = Vectors.sparse(size, idx, values)

print("denseVec: ", denseVec)
print("sparseVec: ", sparseVec)

spark.stop()
def make_click_pattern_vector(features, size):
    vec = Vectors.sparse(size, features)
    return vec  # return the built vector; the original fragment only assigned it
Covariance matrix:
PCA wants the values after projection to be as spread out as possible (covariance).
"""
try:
    from pyspark.ml.feature import PCA
    from pyspark.ml.linalg import Vectors
    from pyspark.sql import SparkSession
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

spark = SparkSession.builder.appName("PACExample").getOrCreate()

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]
print(data)

df = spark.createDataFrame(data, ["features"])
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")

# Fit the PCA model on the data df.
model = pca.fit(df)

# Once the model is trained, transform can be used to reduce the dimensionality of new input data.
result = model.transform(df).select("pcaFeatures")
result.show(truncate=False)

spark.stop()
def toSparseVector(index, values):
    day_list_index, qty_list_values = zip(*sorted(zip(index, values)))  # 367 for a bissextile (leap) year (1 to 366, +1)
    return Vectors.sparse(366, day_list_index, qty_list_values)
def array2vec(genreIndexes, indexSize):
    # multi-hot encode the genre indexes as a sparse vector of the given size
    genreIndexes.sort()
    fill_list = [1.0 for _ in range(len(genreIndexes))]
    return Vectors.sparse(indexSize, genreIndexes, fill_list)
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

bdf = sc.parallelize([
    Row(label=1.0, weight=1.0, features=Vectors.dense(0.0, 5.0)),
    Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),
    Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),
    Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))
]).toDF()

blor = LogisticRegression(regParam=0.01, weightCol="weight")
blorModel = blor.fit(bdf)
blorModel.coefficients
blorModel.intercept

test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
blorModel.transform(test1).head().prediction

save_path = "C:\\PySpark\\spark_ml\\saved_models\\logistic_regression_example_1\\"
estimator_path = save_path + "lr"

# Save the estimator
blor.save(estimator_path)
lr2 = LogisticRegression.load(estimator_path)
lr2.getRegParam()

# save the model
model_path = save_path + "lr_model"
blorModel.save(model_path)

from pyspark.ml.classification import LogisticRegressionModel
model2 = LogisticRegressionModel.load(model_path)
def sparseify(users_num, user_index, ratings):
    feature = Vectors.sparse(users_num, user_index, ratings)
    return feature
# Task 1: Correlation between fields
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# How to represent vectors in Spark
from pyspark.ml.linalg import Vectors

# 1. Dense vector
# create a vector of 4 features
Vectors.dense([1, 2, 3, 4])

# 2. Sparse vector
# create a vector of 4 features
Vectors.sparse(4, [(0, 1), (2, 3)])

# Read data
# df = spark.read.csv("/home/s_kante/spark/data/developers_survey_training.csv", header='true')
df = spark.read.csv(
    "/home/student/jac_spark/lecture2/data/Task1_2_3/developers_survey_training.csv",
    header='true')

# Replace IsDeveloper value with integer 1 or 0
# Approach 1
df.createOrReplaceTempView("inputData")
df1 = spark.sql(
    "SELECT CASE IsDeveloper WHEN 'Yes' THEN 1 ELSE 0 END AS IsDeveloper, CAST(YearsOfExp AS FLOAT) AS YearsOfExp, CAST(Salary AS FLOAT) AS Salary FROM inputData "
)

# Approach 2
def trans2sparse(line):
    indices = line["chi"]["indices"]
    values = line["chi"]["values"]
    vec = DenseVector(Vectors.sparse(2000, indices, values).toArray())
    return Row(chi=vec, window=line["window"])
def dataset_multinomial(spark_session):
    return spark_session.createDataFrame(
        [(1.0, Vectors.dense(1.0)),
         (0.0, Vectors.sparse(1, [], [])),
         (2.0, Vectors.dense(0.5))] * 100,
        ["label", "features"],
    ).cache()
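# For context, a minimal sketch of a pytest-style test that could consume a fixture like
# dataset_multinomial; the test body and the expected class count are illustrative
# assumptions, not taken from the original suite.
from pyspark.ml.classification import LogisticRegression


def test_multinomial_logreg_fits(dataset_multinomial):
    # fit a multinomial model on the fixture data
    lr = LogisticRegression(maxIter=5, family="multinomial")
    model = lr.fit(dataset_multinomial)
    # labels 0.0, 1.0 and 2.0 -> three classes
    assert model.numClasses == 3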
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark import SparkConf

spark = SparkSession.builder.appName(
    "CorrelationExample").getOrCreate()

data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),  # each (c1, c2, c3, ...) tuple holds the columns of a row
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]

# df = spark.createDataFrame(data, ["features"])  # every row is a single "features" column
df = spark.createDataFrame(data, ['features'])
print(df.collect())

r1 = Correlation.corr(df, "features").head()
print("Pearson correlation matrix:\n" + str(r1[0]))

r2 = Correlation.corr(df, "features", "spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))
# $example off$
spark.stop()
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PCAExample")\
        .getOrCreate()

    # $example on$
    data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
            (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
            (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
    df = spark.createDataFrame(data, ["features"])

    pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(df)

    result = model.transform(df).select("pcaFeatures")
    result.show(truncate=False)
    # $example off$

    spark.stop()
def data_describe(self):
    print('start to read data for rdd:')
    # rawRdd_nlp = self.read_rdd('track2_title.txt').map(lambda line: eval(line))
    # rawRdd_nlp = self.read_rdd('track2_title_500.txt').map(lambda line: eval(line))
    rawRdd_nlp = self.sc.textFile(
        '/user/hadoop/icmechallenge2019/track2/test/track2_title_500.txt'
    ).map(lambda line: eval(line))
    # print(rawRdd_nlp.take(10))

    sqlContext = SQLContext(self.sc)
    labels = [('item_id', typ.IntegerType()),
              ('title_features', typ.MapType(typ.StringType(), typ.IntegerType()))]
    Schema = typ.StructType(
        [typ.StructField(e[0], e[1], True) for e in labels])
    df = sqlContext.createDataFrame(rawRdd_nlp, Schema)
    df.show(5)
    # df.printSchema()

    print(
        'start to deal with the title_features col,and compute the title topic'
    )
    tokens = df.rdd.map(lambda d: d[1]).map(lambda d: list(d.keys()))
    local_tokens = tokens.flatMap(
        lambda d: [int(token) for token in d]).distinct()
    print(local_tokens.top(1))
    vocab_size = max(local_tokens.top(1)) + 1

    toInt = udf(
        lambda counts: {int(token): float(counts[token]) for token in counts},
        typ.StringType())
    df = df.withColumn("title_features_1", toInt(df.title_features))
    toVector = udf(lambda vs: Vectors.sparse(vocab_size, vs), VectorUDT())
    rescaledData = df.withColumn("features",
                                 toVector(df.title_features_1)).select(
                                     "item_id", "features")
    rescaledData.cache()

    # lda = LDA(k=50, maxIter=200)
    lda = LDA(k=2, maxIter=5)
    ldaModel = lda.fit(rescaledData)

    print("begin save model")
    distributed_model_path = "/user/hadoop/icmechallenge2019/track2/test/" + "lda_distributed_model_pyspark"
    ldaModel.write().overwrite().save(distributed_model_path)
    print("model saved")

    print("load model")
    sameLdaModel = LocalLDAModel.load(distributed_model_path)
    print("model loaded")

    transformed = sameLdaModel.transform(
        rescaledData)  # .select("topicDistribution")
    transformed.show(truncate=False)
data1 = sc.parallelize([
    Row(label=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
    Row(label=0.0, features=Vectors.dense(1.0, 2.0, 3.0)),
    Row(label=1.0, features=Vectors.dense(2.0, 2.0, 3.0)),
    Row(label=0.0, features=Vectors.dense(4.0, 2.0, 3.0))
]).toDF()

data2 = sc.parallelize([
    Row(label=1.0, weight=1.0, features=Vectors.dense(0.0, 5.0)),
    Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),
    Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),
    Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))
]).toDF()

data3 = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                               (0.0, Vectors.sparse(1, [], []))],
                              ["label", "features"])


def svc_classifier(df, conf):
    max_iter = conf["params"].get("maxIter")
    reg_param = conf["params"].get("regParam")
    svm = LinearSVC(maxIter=max_iter, regParam=reg_param)
    if conf["tuning"].get("crossval"):
        grid = ParamGridBuilder().addGrid(svm.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        cv = CrossValidator(estimator=svm,
                            estimatorParamMaps=grid,
                            evaluator=evaluator)
        model = cv.fit(df)
    else:
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

df = spark.createDataFrame([
    Row(userFeatures=Vectors.sparse(3, {
        0: -2.0,
        1: 2.3
    })),
    Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]))
])

slicer = VectorSlicer(inputCol="userFeatures", outputCol="features", indices=[1])
output = slicer.transform(df)
output.select("userFeatures", "features").show()

spark.stop()
# $example on$
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("MinHashLSHExample") \
        .getOrCreate()

    # $example on$
    dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
             (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
             (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]
    dfA = spark.createDataFrame(dataA, ["id", "features"])

    dataB = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
             (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
             (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]
    dfB = spark.createDataFrame(dataB, ["id", "features"])

    key = Vectors.sparse(6, [1, 3], [1.0, 1.0])

    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
    model = mh.fit(dfA)

    # Feature Transformation
def main(sc):
    sqlContext = SQLContext(sc)

    # In[1]:
    input_path = ''
    model_path = ''
    model_info_path = model_path + ''
    model_scaler_path = model_path + ''
    model_train_set_path = model_path + ''

    # Import the table of features and labels into dataframes
    df_data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(input_path)

    # Convert all features to double type except for ID and Label, which remain as strings
    # This is done because the Random Forest Algorithm requires features to be numbers
    df_data = df_data.select(
        *(col(c).cast("double").alias(c) for c in df_data.columns[1:-1]),
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'))

    # Defines that the first column is the unique ID, the last one contains the labels
    # and all the ones in between are the given features
    df_master = df_data.rdd.map(lambda r: Row(
        cust_id=r[-2], label=r[-1], features=Vectors.dense(r[:-2]))).toDF()

    # Randomly split the data into a test and train set
    (df_master_train, df_master_test) = df_master.randomSplit([0.5, 0.5], seed=123)

    # Set the Random Forest input to the training set
    rf_init_data = df_master_train

    # Indexing labels for Random Forest Algorithm
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed_label")
    model = labelIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Indexing features for Random Forest Algorithm
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexed_features",
                                   maxCategories=2)
    model = featureIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Configures inbuilt Random Forest Classifier function with 500 trees,
    # max depth = 8 and 32 bins
    rf_init = RandomForestClassifier(labelCol="indexed_label",
                                     featuresCol="indexed_features",
                                     numTrees=500,
                                     impurity="gini",
                                     maxDepth=8,
                                     maxBins=32)

    rf_init_data.persist()  # Cache the data set
    rf_init_model = rf_init.fit(rf_init_data)  # Run the Random Forest Algorithm
    rf_init_data.unpersist()

    # Extract a list of feature importances from the output of the Random Forest
    # Algorithm with each element corresponding to a feature
    rf_init_varimp = np.sqrt(rf_init_model.featureImportances.toArray())

    # Creates a list containing the 6 most important features to be used later
    # to subset our entire data from 146 features to just 6!
    # Create a list containing the names of all features
    column_names = df_data.columns[:-2]

    # Creating a dictionary mapping feature names to their respective importances
    NameToImp = dict()
    for i in range(len(column_names)):
        key = column_names[i]
        value = rf_init_varimp[i]
        NameToImp[key] = value

    # Sorted list in reverse order according to the variable importances
    sorted_varimp = sorted(NameToImp.values(), reverse=True)

    # Collect importances of 6 most important features
    sorted_top_varimp = sorted_varimp[:6]

    # Sorted list of column names in reverse order according to varimp
    sorted_colnames = sorted(NameToImp, key=NameToImp.get, reverse=True)

    # Collect colnames of 6 most important features
    col_names = sorted_colnames[:6]

    # Pulling data for the 6 most important features
    df_data_new = df_data.select(
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'),
        *(col(c).cast("double").alias(c) for c in col_names))

    # Defines that the first column is the unique ID, the last one contains the labels
    # and all the ones in between are the given features
    df_master_new = df_data_new.rdd.map(lambda r: Row(
        cust_id=r[0], label=r[1], features=Vectors.dense(r[2:]))).toDF()

    # Scale and normalize the features so that all features can be compared
    # and create a new column for the features
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaled_features",
                            withStd=True,
                            withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(df_master_new)

    # Normalize each feature to have unit standard deviation.
    df_master_new = scalerModel.transform(df_master_new)

    # The old features have been replaced with their scaled versions and thus
    # we no longer care about the old, unbalanced features
    df_master_new = df_master_new.drop('features')

    # Randomly split the data into a test and train set
    (df_master_train, df_master_test) = df_master_new.randomSplit([0.5, 0.5], seed=123)

    test_all = df_master_test

    sqlContext.registerDataFrameAsTable(df_master_train, "df_master_train_table")

    # Remove the negative labels as only the positive ones are important
    train_all = sqlContext.sql(
        'select * from df_master_train_table where label = 1')

    # Multiply feature values with corresponding importances
    m = ElementwiseProduct(scalingVec=Vectors.dense(sorted_top_varimp),
                           inputCol="scaled_features",
                           outputCol="scaled_weighted_features")

    train_all = m.transform(train_all)
    test_all = m.transform(test_all)

    sqlContext.dropTempTable("df_master_train_table")

    # Create a list of tasks containing tuples of number of neighbours and
    # cutoff frequencies to be passed to KNN algorithm
    number_of_neighbours = [250, 550, 750, 1000]
    popshared = 0.30
    num_indices = int(popshared * (test_all.count()))
    tasks = []
    for num_neighbour in number_of_neighbours:
        tasks = tasks + [(num_neighbour, num_indices)]

    # Partitioning the tasks for parallel processing
    tasksRDD = sc.parallelize(tasks, numSlices=len(tasks))
    tasksRDD.collect()

    train_pd = train_all.toPandas()
    test_pd = test_all.toPandas()

    train_pd['indices'] = train_pd.index
    test_pd['indices'] = test_pd.index

    # Converting features into SparseVector format
    l_train = list()
    for k in train_pd.scaled_weighted_features:
        l_train.append(
            Vectors.sparse(len(k),
                           [(i, j) for i, j in enumerate(k) if j != 0]))

    l_test = list()
    for k in test_pd.scaled_weighted_features:
        l_test.append(
            Vectors.sparse(len(k),
                           [(i, j) for i, j in enumerate(k) if j != 0]))

    # Converting to a numpy array
    knn_train = np.asarray(l_train)
    knn_test = np.asarray(l_test)

    # Broadcasting the training and test sets to all partitions
    train_broadcast = sc.broadcast(knn_train)
    test_broadcast = sc.broadcast(knn_test)

    # Calling K Nearest Neighbour search on each partition
    tree_type = "kd_tree"
    resultsRDD = tasksRDD.map(lambda nc: findNearestNeighbour(
        train_broadcast, test_broadcast, nc[0], nc[1], test_pd, tree_type))
    resultsRDD.cache()
    resultsRDD.count()

    resultsPD = resultsRDD.toDF().toPandas()

    resultsPD["popshared"] = popshared
    resultsPD = resultsPD.rename(columns={'_1': 'Recall'})
    resultsPD = resultsPD.rename(columns={'_2': 'Number of Neighbors'})

    bestResult = (resultsPD.sort_values(by=["Recall"], ascending=[0])).iloc[0]
    bestNN = int(bestResult["Number of Neighbors"])
    bestRecall = bestResult["Recall"]

    # Saving the model info - varimp, recall, NN, col_names - to model_path
    column_names = [i for i in col_names]
    model_info = sc.parallelize([{
        "varimp": sorted_top_varimp,
        "recall": bestRecall,
        "NN": bestNN,
        "col_names": column_names
    }])
    model_info.saveAsPickleFile(path=model_info_path)

    # Saving the scaler model to model_path
    scalerModel.write().overwrite().save(model_scaler_path)

    # Saving the train set to model_path
    df_master_new.rdd.saveAsPickleFile(path=model_train_set_path)


"""

from __future__ import print_function

# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("CorrelationExample") \
        .getOrCreate()

    # $example on$
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])

    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
    # $example off$

    spark.stop()
import os
import sys

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

from ml2rt import save_sparkml
from ml2rt import utils

executable = sys.executable
os.environ["SPARK_HOME"] = pyspark.__path__[0]
os.environ["PYSPARK_PYTHON"] = executable
os.environ["PYSPARK_DRIVER_PYTHON"] = executable

spark = SparkSession.builder.appName("redisai_trial").getOrCreate()

data = spark.createDataFrame(
    [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
     (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
     (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )],
    ["features"])

pca = PCA(k=2, inputCol="features", outputCol="pca_features")
model = pca.fit(data)

feature_count = data.first()[0].size
N = data.count()
featurestype = utils.guess_onnx_tensortype(node_name='features',
                                           dtype='float32',
                                           shape=(N, feature_count))
save_sparkml(model, 'spark.onnx', initial_types=[featurestype],