def spark_ml(train, test):
    # train/test were implicit globals in the original; pass them explicitly.
    # Product_IDs that appear in test but not in train:
    diff_cat_in_train_test = test.select('Product_ID').subtract(train.select('Product_ID'))
    print(diff_cat_in_train_test.distinct().count())

    from pyspark.ml.feature import StringIndexer
    # Index the Product_ID strings; use a distinct output name so it cannot
    # clash with the input column when spark.sql.caseSensitive=false.
    plan_indexer = StringIndexer(inputCol='Product_ID', outputCol='product_id_index')
    labeller = plan_indexer.fit(train)
    Train1 = labeller.transform(train)
    Test1 = labeller.transform(test)
    Train1.show()

    from pyspark.ml.feature import RFormula
    formula = RFormula(
        formula="Purchase ~ Age + Occupation + City_Category + Stay_In_Current_City_Years"
                " + Product_Category_1 + Product_Category_2 + Gender",
        featuresCol="features",
        labelCol="label")
    t1 = formula.fit(Train1)
    train1 = t1.transform(Train1)
    test1 = t1.transform(Test1)
    train1.show()
    train1.select('features').show()
    train1.select('label').show()

    from pyspark.ml.regression import RandomForestRegressor
    rf = RandomForestRegressor()
    # Hold out 30% of the training data for validation
    (train_cv, test_cv) = train1.randomSplit([0.7, 0.3])
    model1 = rf.fit(train_cv)
    predictions = model1.transform(test_cv)

    from pyspark.ml.evaluation import RegressionEvaluator
    evaluator = RegressionEvaluator()
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    import numpy as np
    print(np.sqrt(mse), mse)

    # Refit on the full training set and score the held-out test set
    model = rf.fit(train1)
    predictions1 = model.transform(test1)
    df = predictions1.selectExpr("User_ID", "Product_ID", "prediction as Purchase")
    df.toPandas().to_csv('submission.csv')
def Chi_sqr(dataset_add, feature_colm, label_colm):
    from pyspark.ml.feature import RFormula, ChiSqSelector
    from pyspark.ml.stat import ChiSquareTest

    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)
    dataset.show()

    # Use RFormula for indexing, encoding and vectorising.
    # label_colm is a list; the last entry is used as the response.
    label = ''
    for y in label_colm:
        label = y
    print(label)
    f = label + " ~ " + " + ".join(feature_colm)
    formula = RFormula(formula=f, featuresCol="features", labelCol="label")
    length = len(feature_colm)
    output = formula.fit(dataset).transform(dataset)
    output.select("features", "label").show()

    # Chi-square feature selector: keep all features, ranked by chi2 score
    selector = ChiSqSelector(numTopFeatures=length, featuresCol="features",
                             outputCol="selected_features", labelCol="label")
    result = selector.fit(output).transform(output)
    print("chi2 output with top %d features selected" % selector.getNumTopFeatures())
    result.show()

    # Run the chi-square test on the selected features
    r = ChiSquareTest.test(result, "selected_features", "label").head()
    print("pValues: " + str(r.pValues))
    p_values = str(r.pValues)
    print("degreesOfFreedom: " + str(r.degreesOfFreedom))
    print("statistics: " + str(r.statistics))
    json_response = {'pvalues': p_values}
    return json_response

# Chi_sqr(dataset_add, features_colm, label_colm)
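# Usage sketch (assumptions: a running SparkSession bound to `spark`, and a
# hypothetical CSV with the columns named below — neither comes from the
# original snippet):
if __name__ == "__main__":
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName("ChiSqrDemo").getOrCreate()
    response = Chi_sqr("data/bank.csv",                    # hypothetical path
                       feature_colm=["age", "job", "balance"],
                       label_colm=["deposit"])
    print(response["pvalues"])
    spark.stop()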
def feature_vector(df, idcol, colname, regressors):
    from pyspark.ml.feature import RFormula
    # Build "<label> ~ <r1>+<r2>+..." and let RFormula assemble the
    # dense feature vector and label column.
    formula = RFormula(formula=colname + ' ~ ' + '+'.join(regressors),
                       labelCol='label', featuresCol='features')
    df_features = formula.fit(df).transform(df).select(idcol, 'features', 'label')
    return df_features
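# Usage sketch on a toy DataFrame (the SparkSession, column names, and data
# below are illustrative assumptions, not part of the original snippet):
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("feature_vector_demo").getOrCreate()
toy = spark.createDataFrame(
    [(1, 2.0, 3.0, 10.0), (2, 1.5, 4.0, 12.0)],
    ["id", "x1", "x2", "y"])
feature_vector(toy, "id", "y", ["x1", "x2"]).show()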
def main():
    from pyspark.sql import SparkSession
    from pyspark.ml.feature import RFormula
    from pyspark.ml.classification import LogisticRegression

    spark = SparkSession.builder.appName("titanic").getOrCreate()

    # Gather data
    df = spark.read.format("csv").option("inferschema", "true").option(
        "header", "true").load("titanic.csv")
    # df.show()
    df.printSchema()
    # Drop rows with any null value; nulls would otherwise raise an error
    # during feature engineering.
    df = df.na.drop("any")

    # Feature engineering -- change the formula and check the result
    supervised = RFormula(formula="Survived ~ Sex:Age + Pclass:Cabin + SibSp + Embarked")
    fittedRF = supervised.fit(df)
    preparedDF = fittedRF.transform(df)
    preparedDF.show()

    # Split data into train and validation sets
    train, test = preparedDF.randomSplit([0.7, 0.3])

    # Configure and train the classifier
    lr = LogisticRegression(featuresCol="features", labelCol="label")
    fittedLR = lr.fit(train)

    # Check the result
    result = fittedLR.transform(test)
    print("Coefficients:" + str(fittedLR.coefficients))
    result.show(100)

    truePositive = float(result.filter("prediction = 1.0 and label = 1.0").count())
    falsePositive = float(result.filter("prediction = 1.0 and label = 0.0").count())
    falseNegative = float(result.filter("prediction = 0.0 and label = 1.0").count())
    trueNegative = float(result.filter("prediction = 0.0 and label = 0.0").count())
    print("True Positive :" + str(truePositive))
    print("True Negative :" + str(trueNegative))
    print("False Positive :" + str(falsePositive))
    print("False Negative :" + str(falseNegative))

    sensitivityOrRecall = truePositive / (truePositive + falseNegative)
    # Specificity is TN / (TN + FP); the original repeated the precision formula here.
    specificity = trueNegative / (trueNegative + falsePositive)
    precision = truePositive / (truePositive + falsePositive)
    accuracy = (truePositive + trueNegative) / (
        truePositive + trueNegative + falsePositive + falseNegative)
    print("sensitivityOrRecall :" + str(sensitivityOrRecall))
    print("specificity :" + str(specificity))
    print("precision :" + str(precision))
    print("accuracy :" + str(accuracy))
    spark.stop()
def test_rformula_string_indexer_order_type(self):
    df = self.spark.createDataFrame(
        [(1.0, 1.0, "a"), (0.0, 2.0, "b"), (1.0, 0.0, "a")], ["y", "x", "s"]
    )
    rf = RFormula(formula="y ~ x + s", stringIndexerOrderType="alphabetDesc")
    self.assertEqual(rf.getStringIndexerOrderType(), "alphabetDesc")
    transformedDF = rf.fit(df).transform(df)
    observed = transformedDF.select("features").collect()
    expected = [[1.0, 0.0], [2.0, 1.0], [0.0, 0.0]]
    for i in range(0, len(expected)):
        self.assertTrue(all(observed[i]["features"].toArray() == expected[i]))
def test_rformula_force_index_label(self):
    df = self.spark.createDataFrame(
        [(1.0, 1.0, "a"), (0.0, 2.0, "b"), (1.0, 0.0, "a")], ["y", "x", "s"])
    # Does not index the label by default since it's a numeric type.
    rf = RFormula(formula="y ~ x + s")
    model = rf.fit(df)
    transformedDF = model.transform(df)
    self.assertEqual(transformedDF.head().label, 1.0)
    # Force label indexing.
    rf2 = RFormula(formula="y ~ x + s").setForceIndexLabel(True)
    model2 = rf2.fit(df)
    transformedDF2 = model2.transform(df)
    self.assertEqual(transformedDF2.head().label, 0.0)
def testWorkflow(self):
    df = self.sqlContext.read.csv(
        os.path.join(os.path.dirname(__file__), "resources/Iris.csv"),
        header=True, inferSchema=True)
    formula = RFormula(formula="Species ~ .")
    classifier = DecisionTreeClassifier()
    pipeline = Pipeline(stages=[formula, classifier])
    pipelineModel = pipeline.fit(df)
    pmmlBuilder = PMMLBuilder(self.sc, df, pipelineModel) \
        .verify(df.sample(False, 0.1))
    pmml = pmmlBuilder.build()
    self.assertIsInstance(pmml, JavaObject)
    pmmlByteArray = pmmlBuilder.buildByteArray()
    self.assertIsInstance(pmmlByteArray, bytes)
    pmmlString = pmmlByteArray.decode("UTF-8")
    self.assertTrue("<PMML xmlns=\"http://www.dmg.org/PMML-4_3\" xmlns:data=\"http://jpmml.org/jpmml-model/InlineTable\" version=\"4.3\">" in pmmlString)
    self.assertTrue("<VerificationFields>" in pmmlString)
    pmmlBuilder = pmmlBuilder.putOption(classifier, "compact", False)
    nonCompactFile = tempfile.NamedTemporaryFile(prefix="pyspark2pmml-", suffix=".pmml")
    nonCompactPmmlPath = pmmlBuilder.buildFile(nonCompactFile.name)
    pmmlBuilder = pmmlBuilder.putOption(classifier, "compact", True)
    compactFile = tempfile.NamedTemporaryFile(prefix="pyspark2pmml-", suffix=".pmml")
    compactPmmlPath = pmmlBuilder.buildFile(compactFile.name)
    # The compact representation should be noticeably smaller on disk.
    self.assertGreater(os.path.getsize(nonCompactPmmlPath),
                       os.path.getsize(compactPmmlPath) + 100)
def rFormula():
    # "price ~ ." uses every other column as a feature; handleInvalid="skip"
    # drops rows whose categorical values were unseen at fit time.
    rFormula = RFormula(formula="price ~ .", featuresCol="features",
                        labelCol="price", handleInvalid="skip")
    lr = LinearRegression(labelCol="price", featuresCol="features")
    return Pipeline(stages=[rFormula, lr])
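# Usage sketch on a toy DataFrame (the SparkSession, data, and these imports
# are illustrative assumptions; the factory above expects RFormula,
# LinearRegression, and Pipeline to be imported at module level):
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import RFormula
from pyspark.ml.regression import LinearRegression

spark = SparkSession.builder.appName("rFormulaPipelineDemo").getOrCreate()
toy = spark.createDataFrame(
    [(100.0, 2, "Oakland"), (250.0, 4, "SF"), (180.0, 3, "SF")],
    ["price", "bedrooms", "neighbourhood"])
model = rFormula().fit(toy)
model.transform(toy).select("price", "prediction").show()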
def dsi_regression(df: DataFrame, dsi: str, trt: str, ps: str, cov_list: list,
                   regParam: float = 1e-2):
    from pyspark.ml.regression import LinearRegression
    from pyspark.ml.feature import RFormula
    from pyspark.sql.functions import col, count, lit, mean

    # Right-hand side: treatment, optional propensity score, and covariates
    if ps:
        rhs_ls = [trt, ps] + cov_list
    else:
        rhs_ls = [trt] + cov_list

    dsi_formula = RFormula(
        formula='%s ~ %s' % (dsi, ' + '.join(rhs_ls)),
        featuresCol="features",
        labelCol="label"
    )
    dsi_df = dsi_formula\
        .fit(df)\
        .transform(df.select(['customer_id', dsi] + rhs_ls))\
        .select('customer_id', 'features', 'label')

    # Summary statistics over the treated population (note: the dosage
    # column 'Treated_F1M' is hard-coded here rather than taken from `trt`)
    df_stats = df.filter(col(trt) > 0).select(
        mean(col('Treated_F1M')).alias('mean_dosage'),
        count(lit(1)).alias('total_treated')
    ).collect()[0].asDict()

    lr = LinearRegression(
        featuresCol='features',
        labelCol='label',
        tol=1e-4,
        regParam=regParam,
        elasticNetParam=0.5
    )
    lrm = lr.fit(dsi_df)
    return lrm.coefficients, df_stats
def testWorkflow(self):
    df = self.sqlContext.read.csv(irisCsvFile, header=True, inferSchema=True)
    formula = RFormula(formula="Species ~ .")
    classifier = DecisionTreeClassifier()
    pipeline = Pipeline(stages=[formula, classifier])
    pipelineModel = pipeline.fit(df)
    pmmlBytes = toPMMLBytes(self.sc, df, pipelineModel)
    pmmlString = pmmlBytes.decode("UTF-8")
    self.assertTrue(pmmlString.find("<PMML xmlns=\"http://www.dmg.org/PMML-4_3\" version=\"4.3\">") > -1)
def data_preparation(df, avg_age, feat_name="features", lab_name='label'):
    df = df.fillna(avg_age, subset=['Age'])
    """
    ## unnecessary when using RFormula
    df = df.replace(['male','female'],['-1','1'],'Sex')
    df = df.withColumn('Sex',df.Sex.cast('int'))
    df = df.replace(['S','Q','C'],['-1','0','1'],'Embarked')
    df = df.withColumn('Embarked',df.Embarked.cast('int'))
    df.printSchema()
    """
    # RFormula automatically encodes the categorical columns (Sex and
    # Embarked) as numerical data.
    formula = RFormula(
        formula="Survived ~ Sex + Age + Pclass + Fare + SibSp + Parch",
        featuresCol=feat_name,
        labelCol=lab_name)
    df = formula.fit(df).transform(df)
    df.show(truncate=False)
    return df
def __init__(self, formula="tip_amount ~ passenger_count + \
            fare_amount + vendor_index + ratecode_index + \
            trip_duration_m + store_and_fwd_flag_index + \
            trip_type + pu_location_id + do_location_id + \
            trip_distance"):
    self.reg_formula = RFormula(formula=formula)
    # Treat low-cardinality feature columns (< 270 distinct values) as
    # categorical; invalid/unseen values are kept in an extra bucket.
    self.feature_indexer = VectorIndexer(inputCol="features",
                                         outputCol="indexed_features",
                                         handleInvalid="keep",
                                         maxCategories=270)
    self.indexers = [self.reg_formula, self.feature_indexer]
    self.form_encoder_model = Pipeline(stages=self.indexers)
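# Usage sketch (the class name `TaxiFeatureEncoder` and the DataFrame
# `trips_df` are illustrative assumptions, not from the original snippet):
# encoder = TaxiFeatureEncoder()
# encoded = encoder.form_encoder_model.fit(trips_df).transform(trips_df)
# encoded.select("indexed_features", "label").show(5)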
def testWorkflow(self):
    df = self.sqlContext.read.csv(
        os.path.join(os.path.dirname(__file__), "resources/Iris.csv"),
        header=True, inferSchema=True)
    formula = RFormula(formula="Species ~ .")
    classifier = DecisionTreeClassifier()
    pipeline = Pipeline(stages=[formula, classifier])
    pipelineModel = pipeline.fit(df)
    pmmlBuilder = PMMLBuilder(self.sc, df, pipelineModel) \
        .putOption(classifier, "compact", True)
    pmmlBytes = pmmlBuilder.buildByteArray()
    pmmlString = pmmlBytes.decode("UTF-8")
    self.assertTrue(pmmlString.find(
        "<PMML xmlns=\"http://www.dmg.org/PMML-4_3\" version=\"4.3\">") > -1)
def add_propensity(df: DataFrame, lhs: str, rhs: list, ps_col: str,
                   regParam: float = 1e-2):
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.feature import RFormula

    PS_formula = RFormula(
        formula='%s ~ %s' % (lhs, ' + '.join(rhs)),
        featuresCol="features",
        labelCol="label"
    )
    PS_df = PS_formula\
        .fit(df)\
        .transform(df.select(['customer_id', lhs] + rhs))\
        .select('customer_id', 'features', 'label')

    lr = LogisticRegression(
        featuresCol='features',
        labelCol='label',
        tol=1e-4,
        regParam=regParam,
        elasticNetParam=0.3
    )
    # split2_udf (defined elsewhere) extracts the positive-class probability
    # from the model's probability vector.
    preds = lr\
        .fit(PS_df).transform(PS_df)\
        .select(['customer_id', 'probability', 'label'])\
        .withColumn(ps_col, split2_udf('probability'))\
        .drop('probability')

    # Mean propensity score and counts per treatment group (plus grand total)
    mean_PS = preds.rollup('label').mean(ps_col).alias('mean_ps').collect()
    mean_PS = [x.asDict() for x in mean_PS]
    mean_trt = preds.rollup('label').count().collect()
    mean_trt = [x.asDict() for x in mean_trt]

    df = df.join(preds.drop('label'), on=['customer_id'], how='inner')
    return df, mean_PS, mean_trt
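# The snippet assumes a `split2_udf` defined elsewhere; a minimal sketch that
# pulls the positive-class probability out of the ML probability vector could
# look like this (the implementation is an assumption, not from the source):
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

split2_udf = udf(lambda v: float(v[1]), DoubleType())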
from pyspark.ml.linalg import Vectors

size = 3
idx = [1, 2]  # locations of non-zero elements in vector
values = [2.0, 3.0]
sparseVec = Vectors.sparse(size, idx, values)


# COMMAND ----------

df = spark.read.json("/data/simple-ml")
df.orderBy("value2").show()


# COMMAND ----------

from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")


# COMMAND ----------

fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()


# COMMAND ----------

train, test = preparedDF.randomSplit([0.7, 0.3])


# COMMAND ----------
from __future__ import print_function

# $example on$
from pyspark.ml.feature import RFormula
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("RFormulaExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(7, "US", 18, 1.0),
         (8, "CA", 12, 0.0),
         (9, "NZ", 15, 0.0)],
        ["id", "country", "hour", "clicked"])

    formula = RFormula(
        formula="clicked ~ country + hour",
        featuresCol="features",
        labelCol="label")

    output = formula.fit(dataset).transform(dataset)
    output.select("features", "label").show()
    # $example off$

    spark.stop()
def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
    model_name = 'Distr_GBTClassifier'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict, 'supervision')
    name_dict, options, task_id, job_id, train_result_dir,\
        names_str, names_num, names_show, Y_names, dir_of_inputdata,\
        dir_of_outputdata, open_pca, train_size, test_size, normalized_type = bag
    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (str(task_id)+'_'+str(job_id)+'_'+model_name)
    dir_of_storeModel = train_result_dir + '/%s_model' % (str(task_id)+'_'+str(job_id)+'_'+model_name)

    # Configure the Spark client
    sess = SparkSession\
        .builder\
        .master("local[4]")\
        .appName("GBTClassifier_spark")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    sc = sess.sparkContext
    sc.setLogLevel("ERROR")

    if options == 'train':
        time_start = time()
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # For testing
        #dataset = dataset[0:1000]
        # Cap the size of the majority class
        #dataset = too.CalcMostLabel(dataset, Y_names)
        Y_datavec = dataset[Y_names].values
        # Print the count of each label
        print('Counter:original y', Counter(Y_datavec))
        print('----------------------------------------------')
        # Split string and numeric fields, then merge them back together
        X_datavec, X_columns, vocabset, datavec_show_list = too.Merge_form(
            dataset, names_str, names_num, names_show, 'vocabset', 'open')
        # Normalize the data
        X_datavec = too.Data_process(X_datavec, normalized_type)
        # Handle class imbalance
        #X, Y = mlp.KMeans_unbalanced(X_datavec, Y_datavec, X_columns, Y_names)
        #X, Y = mlp.Sample_unbalanced(X_datavec, Y_datavec)
        X, Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # Optional PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num, ret = mlp.GS_PCA(X)
            print('PCA Information:', pca_num, ret)
            print('----------------------------------------------')
            ret_num = ret['99%']
            X = mlp.Model_PCA(X, ret_num)
        # Persist the vocabset list and ret_num
        too.StorePara(dir_of_storePara, vocabset, ret_num)
        print('--------------Train data shape----------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('Y.shape:', Y.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)
        features = pd.DataFrame(X,)
        targets = pd.DataFrame(Y, columns=['Y'])
        # Concatenate the matrices
        merged = pd.concat([features, targets], axis=1)
        # Create a Spark DataFrame
        raw_df = sess.createDataFrame(merged)
        # Extract features and target
        formula = RFormula(formula='Y ~ .', featuresCol="features", labelCol="label")
        raw_df = formula.fit(raw_df).transform(raw_df)
        # Split into training and test sets
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size], seed=666)
        # Fit the model
        clf_model = dmp.Distr_GBTClassifier(xy_train, xy_test)
        # Save the model parameters
        clf_model.write().overwrite().save(dir_of_storeModel)
        print('----------------------------------------------')
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time()-time_start)
        print('Total run time: %s' % duration)

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara, 'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # Split string and numeric fields, then merge them back together
        X_datavec, datavec_show_list = too.Merge_form(
            dataset, names_str, names_num, names_show, vocabset, 'close')
        # Normalize the data
        X = too.Data_process(X_datavec, normalized_type)
        # Optional PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X, ret_num)
        print('-------------Predict data shape---------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)
        features = pd.DataFrame(X,)
        # Create a Spark DataFrame
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns,
                                outputCol='features').transform(raw_features)
        clf_model = GBTClassificationModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time()-time_start)
        print('Total run time: %s' % duration)
    StructField('Cabin', StringType(), True),
    StructField('Embarked', StringType(), True)
])

rawTraining = spark.read.csv(trainingFilePart, header=True, schema=customSchema)
selectedTraining = rawTraining.select(
    col('Survived').alias('label'), 'PClass', 'Sex', 'Age', 'Fare')
# Flag rows with missing Age/Fare so missingness itself becomes a feature
addingColTraining = selectedTraining.withColumn(
    'Missing_Age', selectedTraining['Age'].isNull()).withColumn(
    'Missing_Fare', selectedTraining['Fare'].isNull())

'''build pipeline'''
imputer = Imputer(inputCols=['Age', 'Fare'], outputCols=['Out_Age', 'Out_Fare'])
rformula = RFormula(
    formula='~ Sex + Out_Age + Missing_Age + Out_Fare + Missing_Fare',
    featuresCol='features')
lr = LogisticRegression(family='binomial')
pipeline = Pipeline(stages=[imputer, rformula, lr])

'''build validation'''
evaluator = BinaryClassificationEvaluator()
grid = ParamGridBuilder().addGrid(lr.maxIter, [10, 50, 100])\
    .addGrid(lr.regParam, [0.0, 0.01, 0.03, 0.1, 0.3])\
    .addGrid(lr.elasticNetParam, [0.0, 0.01, 0.03])\
    .build()
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=5)
model = cv.fit(addingColTraining)
filePath = "/databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet"
airbnbDF = spark.read.parquet(filePath)
(trainDF, testDF) = airbnbDF.randomSplit([.8, .2], seed=42)

# COMMAND ----------

from pyspark.sql.functions import col, log
from pyspark.ml import Pipeline
from pyspark.ml.feature import RFormula
from pyspark.ml.regression import LinearRegression

logTrainDF = trainDF.withColumn("log_price", log(col("price")))
logTestDF = testDF.withColumn("log_price", log(col("price")))

# "log_price ~ . - price" regresses on every column except the raw price
rFormula = RFormula(formula="log_price ~ . - price",
                    featuresCol="features",
                    labelCol="log_price",
                    handleInvalid="skip")

lr = LinearRegression(labelCol="log_price", predictionCol="log_pred")
pipeline = Pipeline(stages=[rFormula, lr])
pipelineModel = pipeline.fit(logTrainDF)
predDF = pipelineModel.transform(logTestDF)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Exponentiate
# MAGIC
# MAGIC In order to interpret our RMSE, we need to convert our predictions back
# MAGIC from the logarithmic scale.

# COMMAND ----------
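# The exponentiation cell itself is not part of this excerpt; a minimal sketch
# of what it would likely contain (column names follow the pipeline above, but
# the exact code is an assumption):
from pyspark.sql.functions import col, exp
from pyspark.ml.evaluation import RegressionEvaluator

expDF = predDF.withColumn("prediction", exp(col("log_pred")))
rmse = RegressionEvaluator(labelCol="price", predictionCol="prediction",
                           metricName="rmse").evaluate(expDF)
print("RMSE on the original price scale: %f" % rmse)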
from pyspark.ml.feature import StringIndexer

plan_indexer = StringIndexer(inputCol='Product_ID', outputCol='product_ID1')
labeller = plan_indexer.fit(train)

#%%
Train1 = labeller.transform(train)
Test1 = labeller.transform(test)
Train1.show()

#%%
from pyspark.ml.feature import RFormula

formula = RFormula(
    formula="Purchase ~ Age + Occupation + City_Category + Stay_In_Current_City_Years"
            " + Product_Category_1 + Product_Category_2 + Gender",
    featuresCol="features",
    labelCol="label")
t1 = formula.fit(Train1)

#%%
train1 = t1.transform(Train1)
test1 = t1.transform(Test1)
train1.show()
train1.select('features').show()
train1.select('label').show()

#%%
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import StringIndexer

## Index labels, adding metadata to the label column.
## Fit on whole dataset to include all labels in index.
data = StringIndexer(inputCol="click", outputCol="label").fit(data).transform(data)
data.show()
## .transform(data) may target a different DataFrame; it does not have to be
## the same one the indexer was fitted on.
#labelIndexer ===> data

# RFormula
from pyspark.ml.feature import RFormula

## RFormula: string input columns will be one-hot encoded, and numeric
## columns will be cast to doubles.
## Adjust the feature set by editing the formula string.
formula = RFormula(
    formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + device_type + device_conn_type",
    #formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + C14 + C17 + C18 + C19 + C21",  #0.707636
    #formula="label ~ banner_pos + site_id + site_domain + C14 + C17 + C21",  #0.7
    featuresCol="features",
    labelCol="label")
formula_data = formula.fit(data).transform(data)
formula_data.select("features", "label").show()

# Split the data into training and test sets (30% held out for testing);
# the seed makes the split reproducible.
(training, test) = formula_data.randomSplit([0.7, 0.3], seed=12345)
training.show()

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
# Load the CSV data into a DataFrame
adDF = spark.read.csv("dataset/Advertising.csv", inferSchema=True, header=True)

# Print the first five rows
adDF.show(5)

# How many rows in total?
adDF.count()
adDF.printSchema()

from pyspark.ml.feature import RFormula
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors

# Vectorizing with the RFormula transformer
dataModel = RFormula().setFormula("Sales ~ .").setFeaturesCol("features").setLabelCol("label")
model_fit = dataModel.fit(adDF).transform(adDF)
model_fit.show()
model_fit.printSchema()

model_fit_select = model_fit.select(["features", "label"])
model_fit_select.show()
model_fit_select.printSchema()

# Vectorizing by hand with the Vectors helper
adV = adDF.rdd.map(lambda x: [Vectors.dense(x[0:3]), x[-1]]).toDF(['features', 'label'])
adV.show()
adV.printSchema()
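# LinearRegression and RegressionEvaluator are imported above but unused in
# this excerpt; a minimal continuation, as an assumption of where the snippet
# was headed:
lrModel = LinearRegression(featuresCol="features", labelCol="label").fit(model_fit_select)
predictions = lrModel.transform(model_fit_select)
print(RegressionEvaluator(metricName="r2").evaluate(predictions))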
                         inferSchema=True)
dataset.show()

abc = dataset.schema.fields
featuresCol = []
for x in abc:
    # print(type(x.dataType))
    # Collect the names of all string-typed columns
    if isinstance(x.dataType, StringType):
        print(x.name + " " + str(x.dataType))
        # dataset.select(x.name)
        featuresCol.append(x.name)

# Build "ACCELERATION ~ col1+col2+..." from the collected columns
f = "ACCELERATION" + " ~ " + "+".join(featuresCol)
print(f)

formula = RFormula(formula=f, featuresCol="features", labelCol="label")
output = formula.fit(dataset).transform(dataset)
output.show(truncate=False)
# Not using Price (label) or address in features
columns.remove('date')
columns.remove('open')
columns.remove('high')
columns.remove('low')
columns.remove('close')
columns.remove('Name')
#columns.remove('market_value')
columns.remove('avgof7D_lag3D')
columns.remove('avgof14D_lag0D')
columns.remove('avgof28D_lag14D')
columns.remove('id')

formula = "{} ~ {}".format("avgof14D_lag0D", " + ".join(columns))
print("Formula : {}".format(formula))

rformula = RFormula(formula=formula)
lr = LinearRegression()
pipeline = Pipeline(stages=[rformula, lr])

# Parameter grid
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.01, .04])\
    .build()

cv = CrossValidator()\
    .setEstimator(pipeline)\
    .setEvaluator(RegressionEvaluator()\
        .setMetricName("r2"))\
    .setEstimatorParamMaps(paramGrid)\
    .setNumFolds(3)

cvModel = cv.fit(train_data)
def dr_regression(df: DataFrame, dsi: str, trt: str, ps: str, cov_list: list,
                  regParam: float = 1e-2, max_iptw: float = 1e-4):
    from pyspark.ml.regression import LinearRegression
    from pyspark.ml.feature import RFormula
    from pyspark.sql.functions import col, count, expr, mean, stddev

    if not dsi or not trt or not ps:
        return
    # if df.count() < 1000:
    #     return

    # drop_const_cols is a project-specific DataFrame extension
    # (see the stand-in sketch below)
    _, df = df.custom.drop_const_cols(cov_list[0], cov_list[1])
    if cov_list:
        flat_cov_list = [x for sublist in cov_list for x in sublist]
        flat_cov_list = [c for c in flat_cov_list if c in df.schema.names]

    # Inverse-probability-of-treatment weights, with max_iptw as the
    # stabilising epsilon (the original hard-coded 1e-4 here)
    df = (
        df
        .withColumn('iptw', 1./((col(trt)*col(ps)+(1-col(trt))*(1-col(ps)))+max_iptw))
        .withColumn('ipt0', 1./(1-col(ps)+max_iptw))
        .cache()
    )
    rhs_ls = [trt, 'iptw'] + flat_cov_list
    transformer = RFormula(
        formula='%s ~ %s' % (dsi, ' + '.join(rhs_ls)),
        featuresCol="features",
        labelCol="label"
    ).fit(df)
    dsi_df = (
        transformer
        .transform(df.select(['customer_id', dsi] + rhs_ls))
        .select('customer_id', 'features', 'label')
    )
    lr = LinearRegression(
        featuresCol='features',
        labelCol='label',
        tol=1e-4,
        regParam=regParam,
        elasticNetParam=0.5
    )
    lrm = lr.fit(dsi_df)

    # Predicted outcome for treated units as observed...
    dsi_1 = lrm.transform(
        transformer
        .transform(
            df
            .filter(col(trt) > 0.)
            .select(['customer_id', dsi] + rhs_ls)
        ).select('customer_id', 'features', 'label')
    ).select('customer_id', col('prediction').alias('dsi_1'))

    # ...and under the counterfactual (treatment flipped, control weights)
    dsi_0 = lrm.transform(
        transformer
        .transform(
            df
            .filter(col(trt) > 0.)
            .withColumn(trt, 1.-col(trt))
            .drop('iptw').withColumnRenamed('ipt0', 'iptw')
            .select(['customer_id', dsi] + rhs_ls)
        ).select('customer_id', 'features', 'label')
    ).select('customer_id', col('prediction').alias('dsi_0'))

    # ATT summary statistics over the treated population
    estimates = (
        dsi_1
        .join(dsi_0, on=['customer_id'], how='inner')
        .withColumn('effect', col('dsi_1') - col('dsi_0'))
        .agg(
            mean(col('effect')).alias('att_mean'),
            expr('percentile(effect, array(0.5))')[0].alias('att_median'),
            stddev(col('effect')).alias('att_std'),
            count(col('customer_id')).alias('total_trt')
        ).collect()
    )
    return estimates
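# `df.custom.drop_const_cols` above is a project-specific extension not shown
# in this excerpt; a rough stand-in, assuming it returns the dropped column
# names plus the DataFrame without constant (zero-variance) columns:
from pyspark.sql import DataFrame
from pyspark.sql.functions import countDistinct


def drop_const_cols(df: DataFrame, *col_groups):
    cols = [c for group in col_groups for c in group]
    counts = df.agg(*[countDistinct(c).alias(c) for c in cols]).first().asDict()
    const_cols = [c for c, n in counts.items() if n <= 1]
    return const_cols, df.drop(*const_cols)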
StructField("C18", DoubleType(), False), StructField("C19", DoubleType(), False), StructField("C20", DoubleType(), False), StructField("C21", DoubleType(), False)]) # Get file df = sqlContext.read.format("com.databricks.spark.csv").options(header= 'true').schema(customSchema).load("file:///home/bigdatas16/Downloads/train100K.csv") # Displays the content of the DataFrame to stdout df.show() from pyspark.ml.feature import StringIndexer data = StringIndexer(inputCol="click", outputCol="label").fit(df).transform(df) data.show() # RFormula from pyspark.ml.feature import RFormula formula = RFormula(formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + device_model + C14 + C17 + C18 + C19 + C21 ", featuresCol="features", labelCol="label") output = formula.fit(data).transform(data) data1 = output.select("label", "features") data1.show() # Split training and test data. #(training, test) = data1.randomSplit([0.7, 0.3], seed = 12345) training, test = data1.randomSplit([0.7, 0.3], seed = 12345) training.show() # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and rf (random forest). from pyspark.ml.classification import LogisticRegression from pyspark.ml.param import Param, Params from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.sql import Row from pyspark.ml import Pipeline
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)


# COMMAND ----------

from pyspark.ml.feature import StandardScaler

sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()


# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

basicTransformation = SQLTransformer()\
    .setStatement("""
        SELECT sum(Quantity), count(*), CustomerID
        FROM __THIS__
        GROUP BY CustomerID
    """)

basicTransformation.transform(sales).show()
categorical = df_train.columns
categorical.remove('label')
print(categorical)

# Interact C14 with C15 on top of the main effects
cat_inter = ['C14', 'C15']
concat = '+'.join(categorical)
interaction = ':'.join(cat_inter)
formula = "label ~ " + concat + '+' + interaction
print(formula)

from pyspark.ml.feature import RFormula

interactor = RFormula(
    formula=formula,
    featuresCol="features",
    labelCol="label").setHandleInvalid("keep")

interactor.fit(df_train).transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

classifier = LogisticRegression(maxIter=20, regParam=0.000, elasticNetParam=0.000)

stages = [interactor, classifier]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)
from pyspark.ml.linalg import Vectors

denseVec = Vectors.dense(1.0, 2.0, 3.0)
size = 3
idx = [1, 2]  # locations of non-zero elements in vector
values = [2.0, 3.0]
sparseVec = Vectors.sparse(size, idx, values)
print(sparseVec)


# COMMAND ----------

df = spark.read.json("/databricks-datasets/definitive-guide/data/simple-ml")
df.orderBy("value2").show()


# COMMAND ----------

from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")


# COMMAND ----------

fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()


# COMMAND ----------

train, test = preparedDF.randomSplit([0.7, 0.3])


# COMMAND ----------

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features")
# We have more work to do!

# ## Exercises

# (1) Import the
# [RFormula](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.RFormula)
# class from the `pyspark.ml.feature` module.

from pyspark.ml.feature import RFormula

# (2) Create an instance of the `RFormula` class with the R formula
# `star_rating ~ reviewed + vehicle_year + vehicle_color`.

rformula = RFormula(formula="star_rating ~ reviewed + vehicle_year + vehicle_color")

# (3) Specify a pipeline consisting of the `filterer`, `extractor`, and the
# RFormula instance specified above.

pipeline = Pipeline(stages=[filterer, extractor, rformula])

# (4) Fit the pipeline on the `train` DataFrame.

pipeline_model = pipeline.fit(train)

# (5) Use the `save` method to save the pipeline model to the
# `models/pipeline_model` directory in HDFS.

pipeline_model.write().overwrite().save("models/pipeline_model")
sales = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load("/data/retail-data/by-day/*.csv")\
    .coalesce(5)\
    .where("Description IS NOT NULL")
fakeIntDF = spark.read.parquet("/data/simple-ml-integers")
simpleDF = spark.read.json("/data/simple-ml")
scaleDF = spark.read.parquet("/data/simple-ml-scaling")


# COMMAND ----------

from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()


# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

basicTransformation = SQLTransformer()\
    .setStatement("""
        SELECT sum(Quantity), count(*), CustomerID
        FROM __THIS__
        GROUP BY CustomerID
    """)

basicTransformation.transform(sales).show()
StructField("device_model", StringType(), True), StructField("device_type", DoubleType(), False), StructField("device_conn_type", DoubleType(), False), StructField("C14", DoubleType(), False), StructField("C15", DoubleType(), False), StructField("C16", DoubleType(), False), StructField("C17", DoubleType(), False), StructField("C18", DoubleType(), False), StructField("C19", DoubleType(), False), StructField("C20", DoubleType(), False), StructField("C21", DoubleType(), False)]) df = sqlContext.read.format("com.databricks.spark.csv").options(header= 'true').schema(customSchema).load("file:///home/bigdatas16/Downloads/train100K.csv") data = StringIndexer(inputCol="click", outputCol="label").fit(df).transform(df) formula = RFormula(formula="label ~ C1 + banner_pos + site_category + app_category +device_type + device_conn_type + C15 + C16 + C18 + C19", featuresCol="features", labelCol="label") output = formula.fit(data).transform(data) data1 = output.select("label", "features") (training, test) = data1.randomSplit([0.8, 0.2], seed = 12345) #gbt = GBTClassifier(numTrees = 10, maxDepth = 3, maxBins = 64) gbt = GBTClassifier(maxIter = 30, maxDepth = 2, impurityType = gini) #gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10) ##rf = RandomForestClassifier(numTrees = 25, maxDepth = 4, maxBins = 64) pipeline = Pipeline(stages=[gbt]) pipelineModel = pipeline.fit(training) testPredictions = pipelineModel.transform(test) testPredictions.select("prediction", "label", "features").show(5)
    .getOrCreate()

data_simple_ml = "C:\\PySpark\\data\\simple-ml"
data_simple_ml_persist = "C:\\PySpark\\data\\simple-ml\\persisted-models"

df = spark.read.json(data_simple_ml)
df.printSchema()
df.orderBy("value2").show()

train, test = df.randomSplit([0.7, 0.3])

from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression

rForm = RFormula()
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")

from pyspark.ml import Pipeline

stages = [rForm, lr]
pipeline = Pipeline().setStages(stages)

from pyspark.ml.tuning import ParamGridBuilder

# Vary the formula itself alongside the regularization parameters
params = ParamGridBuilder()\
    .addGrid(rForm.formula, [
        "lab ~ . + color:value1",
        "lab ~ . + color:value1 + color:value2"])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .addGrid(lr.regParam, [0.1, 2.0])\
    .build()
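# The excerpt stops at the parameter grid; a minimal continuation showing how
# such a grid is typically consumed (the evaluator and split settings here
# are assumptions, not part of the original snippet):
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit

evaluator = BinaryClassificationEvaluator()
tvs = TrainValidationSplit(trainRatio=0.75, estimatorParamMaps=params,
                           estimator=pipeline, evaluator=evaluator)
tvsFitted = tvs.fit(train)
print(evaluator.evaluate(tvsFitted.transform(test)))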