def sql_transformer_usecase():
    """
    Transform data features by way of a SQL statement.
    "__THIS__" is a placeholder for the dataset backing the input data.
    """
    spark = getSparkSession()
    df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    sqlTrans.transform(df).show(truncate=False)
def main_A(inputs):
    data = spark.read.option('encoding', 'UTF-8').csv(inputs, schema=tmax_schema)

    ################ FEATURE ENGINEERING: add yesterday tmax #####################
    if USE_YTD_TEMP_FEATURE:
        syntax = """SELECT today.latitude, today.longitude, today.elevation, today.date,
                           today.tmax, yesterday.tmax AS yesterday_tmax
                    FROM __THIS__ as today
                    INNER JOIN __THIS__ as yesterday
                        ON date_sub(today.date, 1) = yesterday.date
                        AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        data = sql_trans.transform(data)
    #############################################################################

    df = data.withColumn('day_of_year', fn.dayofyear('date'))
    df = df.withColumn('year', fn.year('date'))
    df_long_lat = df[['station', 'longitude', 'latitude', 'tmax', 'year']].toPandas()
    count_year = df_long_lat['year'].value_counts().to_dict()

    # SELECT YEAR and DURATION
    YEAR_SELECTED = 2000
    YEAR_DURATION = 20
    df_long_lat = df_long_lat.loc[(df_long_lat['year'] > YEAR_SELECTED) &
                                  (df_long_lat['year'] < YEAR_SELECTED + YEAR_DURATION)]

    # UNCLUSTER plot by finding avg temperature (group by same station and year)
    df_long_lat['avg_temp'] = df_long_lat.groupby(['station', 'year'])['tmax'].transform('mean')
    df_long_lat.drop_duplicates(subset=['station', 'year'], inplace=True)
    print(df_long_lat)

    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    geometry = [
        Point(xy) for xy in zip(df_long_lat['longitude'], df_long_lat['latitude'])
    ]
    df_long_lat = df_long_lat.drop(['longitude', 'latitude'], axis=1)
    crs = {'init': 'epsg:4326'}
    gdf = GeoDataFrame(df_long_lat, crs=crs, geometry=geometry)

    base = world.plot(color='white', edgecolor='black', figsize=(20, 12))
    gdf.plot(column='avg_temp',
             ax=base,
             marker='o',
             cmap='jet',
             markersize=15,
             legend=True,
             legend_kwds={
                 'label': "Temperature in Celsius",
                 'orientation': "horizontal"
             })
    plt.title('Distribution of Temperature between ' + str(YEAR_SELECTED) +
              " and " + str(YEAR_SELECTED + YEAR_DURATION))
    plt.savefig(inputs + "_" + str(YEAR_SELECTED) + "-" +
                str(YEAR_SELECTED + YEAR_DURATION))
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])  # use seed here
    train = train.cache()
    validation = validation.cache()

    # create a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'], outputCol="features")
    word_indexer = StringIndexer(inputCol="word",
                                 outputCol="target",
                                 handleInvalid="error",
                                 stringOrderType="frequencyDesc")
    classifier = MultilayerPerceptronClassifier(featuresCol="features",
                                                labelCol="target",
                                                layers=[3, 25, 25])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # create an evaluator and score the validation data
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="target")
    rgb_validation = rgb_model.transform(validation)
    score = evaluator.evaluate(rgb_validation, {evaluator.metricName: "accuracy"})
    print('Validation score for RGB model: %g' % (score, ))
    plot_predictions(rgb_model, 'RGB', labelCol='target')

    # create a pipeline RGB colours -> LAB colours -> word; train and evaluate
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    ltrain, lvalidation = data.randomSplit([0.75, 0.25])
    lrgb_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'], outputCol="LAB")
    lword_indexer = StringIndexer(inputCol="word",
                                  outputCol="labTarget",
                                  handleInvalid="error",
                                  stringOrderType="frequencyDesc")
    lclassifier = MultilayerPerceptronClassifier(featuresCol="LAB",
                                                 labelCol="labTarget",
                                                 layers=[3, 25, 25])
    lrgb_pipeline = Pipeline(
        stages=[sqlTrans, lrgb_assembler, lword_indexer, lclassifier])
    lrgb_model = lrgb_pipeline.fit(ltrain)
    lrgb_validation = lrgb_model.transform(lvalidation)
    lrgb_validation.show()
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="labTarget")
    lscore = evaluator.evaluate(lrgb_validation, {evaluator.metricName: "accuracy"})
    print('Validation score for LAB model: %g' % (lscore, ))
    plot_predictions(lrgb_model, 'LAB', labelCol='word')
def query(self, sql_expression):
    """
    Implements the transformations which are defined by a SQL statement.

    Currently we only support SQL syntax like "SELECT ... FROM __THIS__ ...",
    where "__THIS__" represents the underlying table of the input dataframe.

    :param self: Spark DataFrame
    :param sql_expression: SQL expression.
    :return: DataFrame with columns changed by the SQL statement.
    """
    sql_transformer = SQLTransformer(statement=sql_expression)
    return sql_transformer.transform(self)
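# A minimal usage sketch for the helper above. Because its first parameter is named
# `self`, it is presumably meant to be attached to pyspark.sql.DataFrame as a method;
# the attribute name `query` and the example columns below are illustrative assumptions,
# not part of the original source.
from pyspark.ml.feature import SQLTransformer
from pyspark.sql import DataFrame, SparkSession

DataFrame.query = query  # monkey-patch the helper onto DataFrame (assumption)

spark = SparkSession.builder.getOrCreate()
example_df = spark.createDataFrame([(1, 2.0), (2, 3.0)], ["id", "v"])
example_df.query("SELECT *, (v * 2) AS v_doubled FROM __THIS__").show()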
def main(inputs, model_file):
    data = spark.read.option('encoding', 'UTF-8').csv(inputs, schema=tmax_schema)

    ################ FEATURE ENGINEERING: add yesterday tmax #####################
    if USE_YTD_TEMP_FEATURE:
        syntax = """SELECT today.latitude, today.longitude, today.elevation, today.date,
                           today.tmax, yesterday.tmax AS yesterday_tmax
                    FROM __THIS__ as today
                    INNER JOIN __THIS__ as yesterday
                        ON date_sub(today.date, 1) = yesterday.date
                        AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        data = sql_trans.transform(data)
    #############################################################################

    data = data.withColumn('day_of_year', fn.dayofyear('date'))
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    if USE_YTD_TEMP_FEATURE:
        train_feature_assembler = VectorAssembler(inputCols=[
            'yesterday_tmax', 'day_of_year', 'latitude', 'longitude', 'elevation'
        ], outputCol='features')
    else:
        train_feature_assembler = VectorAssembler(
            inputCols=['day_of_year', 'latitude', 'longitude', 'elevation'],
            outputCol='features')

    ############# DIFFERENT ML ALGORITHMS TO BE USED ####################
    # classifier = GeneralizedLinearRegression(featuresCol='features', labelCol='tmax')
    # classifier = GBTRegressor(maxDepth=5, featuresCol='features', labelCol='tmax')
    classifier = RandomForestRegressor(numTrees=7,
                                       maxDepth=8,
                                       featuresCol='features',
                                       labelCol='tmax')
    #####################################################################

    train_pipeline = Pipeline(stages=[train_feature_assembler, classifier])
    weather_model = train_pipeline.fit(train)

    prediction = weather_model.transform(validation)
    # prediction.show()
    evaluator = RegressionEvaluator(predictionCol="prediction",
                                    labelCol='tmax',
                                    metricName='r2')  # or 'rmse'
    score = evaluator.evaluate(prediction)
    print('Validation score for weather model: %g' % (score, ))

    weather_model.write().overwrite().save(model_file)
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # create a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'], outputCol='features')
    word_indexer = StringIndexer(inputCol='word', outputCol='new_word')
    classifier = MultilayerPerceptronClassifier(labelCol="new_word", layers=[3, 30, 11])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # create an evaluator and score the validation data
    rgb_validation = rgb_model.transform(validation)
    # rgb_validation.show()
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    vali_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                       labelCol='new_word')
    score = vali_evaluator.evaluate(rgb_validation)
    print('Validation score for RGB model: %g' % (score, ))

    # create a pipeline RGB colours -> LAB colours -> word; train and evaluate
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sql_transformer = SQLTransformer(statement=rgb_to_lab_query)
    new_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                    outputCol='features')
    new_pipeline = Pipeline(
        stages=[sql_transformer, new_assembler, word_indexer, classifier])
    new_model = new_pipeline.fit(train)
    new_validation = new_model.transform(validation)
    # new_validation.show()
    new_vali_evaluator = MulticlassClassificationEvaluator(
        predictionCol='prediction', labelCol='new_word')
    new_score = new_vali_evaluator.evaluate(new_validation)
    print('Validation score for LAB model:', new_score)
    plot_predictions(new_model, 'LAB', labelCol="word")
def deriveNewMethod(df):
    from pyspark.ml.feature import SQLTransformer

    # drop rows with null values
    df = df.filter(df['area'].isNotNull())
    df = df.filter(df['price'].isNotNull())
    df = df.filter(df['room_num'].isNotNull())
    # df = df.filter(df['area'] != 'NULL')
    # df = df.filter(df['price'] != 'NULL')
    # df = df.filter(df['room_num'] != 'NULL')

    # drop rows whose value is 0
    df = df.filter(df['area'] != 0)
    df = df.filter(df['room_num'] != 0)

    # cast the string columns to float
    df = df.select('*', df['area'].cast('Float').alias('tmp_name')).drop('area')
    df = df.withColumnRenamed('tmp_name', 'area')
    df = df.select('*', df['price'].cast('Float').alias('tmp_name')).drop('price')
    df = df.withColumnRenamed('tmp_name', 'price')
    df = df.select('*', df['room_num'].cast('Float').alias('tmp_name')).drop('room_num')
    df = df.withColumnRenamed('tmp_name', 'room_num')
    print(df.dtypes)

    # derive new features: area per room and price per unit area
    sqlTransform = SQLTransformer(
        statement='SELECT *, (area/room_num) AS one_room_area, '
                  '(price/area) AS one_area_price FROM __THIS__')
    df = sqlTransform.transform(df)
    # spark.stop()
    return df
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# $example on$
from pyspark.ml.feature import SQLTransformer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("SQLTransformerExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)],
                               ["id", "v1", "v2"])
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    sqlTrans.transform(df).show()
    # $example off$

    spark.stop()
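# For reference, the show() call in the example above should print something close to
# the table below (v3 = v1 + v2, v4 = v1 * v2); the exact column widths may differ.
#
# +---+---+---+---+----+
# | id| v1| v2| v3|  v4|
# +---+---+---+---+----+
# |  0|1.0|3.0|4.0| 3.0|
# |  2|2.0|5.0|7.0|10.0|
# +---+---+---+---+----+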
supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()


# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

basicTransformation = SQLTransformer()\
    .setStatement("""
        SELECT sum(Quantity), count(*), CustomerID
        FROM __THIS__
        GROUP BY CustomerID
    """)

basicTransformation.transform(sales).show()


# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

va = VectorAssembler().setInputCols(["int1", "int2", "int3"])
va.transform(fakeIntDF).show()


# COMMAND ----------

contDF = spark.range(20).selectExpr("cast(id as double)")


# COMMAND ----------

from pyspark.ml.feature import Bucketizer
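# The notebook cell above ends right after importing Bucketizer. A minimal sketch of
# how it would typically be applied to contDF follows; the split points and the
# output column name are illustrative assumptions, not taken from the original cell.
bucketBorders = [-1.0, 5.0, 10.0, 250.0]
bucketer = Bucketizer()\
    .setSplits(bucketBorders)\
    .setInputCol("id")\
    .setOutputCol("bucket")
bucketer.transform(contDF).show()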
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.ml.feature import SQLTransformer
# $example off$
from pyspark.sql import SQLContext

if __name__ == "__main__":
    sc = SparkContext(appName="SQLTransformerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    df = sqlContext.createDataFrame([
        (0, 1.0, 3.0),
        (2, 2.0, 5.0)
    ], ["id", "v1", "v2"])
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    sqlTrans.transform(df).show()
    # $example off$

    sc.stop()
# ## Load the data

# Read the enhanced (joined) ride data from HDFS:
rides = spark.read.parquet("/duocar/joined/")


# ## Preprocess the modeling data

# A cancelled ride does not have a star rating.  Use the
# [SQLTransformer](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.SQLTransformer)
# to filter out the cancelled rides:
from pyspark.ml.feature import SQLTransformer
filterer = SQLTransformer(statement="SELECT * FROM __THIS__ WHERE cancelled == 0")
filtered = filterer.transform(rides)

# **Note:** `__THIS__` is a placeholder for the DataFrame passed into the
# `transform` method.


# ## Generate label

# We can treat `star_rating` as a continuous numerical label or an ordered
# categorical label:
filtered.groupBy("star_rating").count().orderBy("star_rating").show()

# Rather than try to predict each value, let us see if we can distinguish
# between five-star and non-five-star ratings.  We can use the
# [Binarizer](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer)
# to create our binary label:
from pyspark.ml.feature import Binarizer
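# The original excerpt stops at the Binarizer import; a plausible continuation is
# sketched below.  The threshold of 4.5, the output column name `five_star_rating`,
# and the cast to double (Binarizer requires a double input column) are assumptions
# chosen to separate five-star ratings from everything else.
converted = filtered.withColumn("star_rating", filtered.star_rating.cast("double"))
binarizer = Binarizer(inputCol="star_rating", outputCol="five_star_rating", threshold=4.5)
labeled = binarizer.transform(converted)
labeled.groupBy("five_star_rating").count().show()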
data["trainVA"] = trainPath data["testVA"] = testPath data["currentTrain"] = trainPath data["currentTest"] = testPath elif config["transformerType"] == "sql": train, test = spark.read.parquet(data["currentTrain"]), spark.read.parquet(data["currentTest"]) train.cache() test.cache() df = train.unionByName(test) sqlTrans = SQLTransformer(statement=config["statement"]) train = sqlTrans.transform(train) test = sqlTrans.transform(test) trainPath = data['scheme'] + "://" + data['save'] + "/trainSQL/" testPath = data['scheme'] + "://" + data['save'] + "/testSQL/" if "partitionCol" in data and data['partitionCol'] in train.schema.names: train.write.partitionBy(data['partitionCol']).format("parquet").save(trainPath) test.write.partitionBy(data['partitionCol']).format("parquet").save(testPath) else: train.write.format("parquet").mode("overwrite").save(trainPath) test.write.format("parquet").mode("overwrite").save(testPath) spark.stop() data["trainSQL"] = trainPath data["testSQL"] = testPath
def test_model(model_file, inputs):
    # get the data
    test_tmax = spark.read.csv(inputs, schema=tmax_schema)

    #########################################################################
    if USE_YTD_TEMP_FEATURE:
        syntax = """SELECT today.latitude, today.longitude, today.elevation, today.date,
                           today.tmax, yesterday.tmax AS yesterday_tmax
                    FROM __THIS__ as today
                    INNER JOIN __THIS__ as yesterday
                        ON date_sub(today.date, 1) = yesterday.date
                        AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        test_tmax = sql_trans.transform(test_tmax)
    #########################################################################

    test_tmax = test_tmax.withColumn('day_of_year', fn.dayofyear('date'))

    # load the model
    model = PipelineModel.load(model_file)

    # -------------------------------------------------------------------------------------------------
    '''#################################################################################'''
    '''########## B1 plot the Temperature Heatmap from trained model ####################'''
    '''#################################################################################'''
    lats, lons = np.meshgrid(np.arange(-90, 90, .5), np.arange(-180, 180, .5))
    elevs = [
        eg.get_elevations(np.array([late, lone]).T)
        for late, lone in zip(lats, lons)
    ]

    num_row = lats.shape[0]
    num_col = lats.shape[1]
    total_pixel = num_row * num_col

    # Col = 3 because of 'latitude, longitude, elevation'
    grid_lats_lons_elev = np.zeros(shape=(total_pixel, 3))
    print(grid_lats_lons_elev.shape)

    index_row_grid = 0
    for i in range(num_row):
        for j in range(num_col):
            grid_lats_lons_elev[index_row_grid] = np.array(
                [lats[i][j], lons[i][j], elevs[i][j]])
            index_row_grid += 1

    df_lats_lons_elev = pd.DataFrame(
        grid_lats_lons_elev, columns=['latitude', 'longitude', 'elevation'])

    # Assume the simulated data comes from today
    df_date = pd.DataFrame(np.arange(total_pixel), columns=['date'])
    df_date['date'] = date.today()
    df_final = pd.concat([df_date, df_lats_lons_elev], axis=1)
    print(df_final)

    simulated_tmax_schema = types.StructType([
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType())
    ])
    df_simulated_tmax = spark.createDataFrame(df_final, schema=simulated_tmax_schema)
    df_simulated_tmax = df_simulated_tmax.withColumn('day_of_year',
                                                     fn.dayofyear('date'))

    predictions = model.transform(df_simulated_tmax)
    predictions.show()
    df_predictions = predictions.toPandas()

    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    geometry = [
        Point(xy) for xy in zip(df_predictions['longitude'], df_predictions['latitude'])
    ]
    df_predictions = df_predictions.drop(['longitude', 'latitude'], axis=1)
    crs = {'init': 'epsg:4326'}
    gdf = GeoDataFrame(df_predictions, crs=crs, geometry=geometry)

    base = gdf.plot(column='prediction',
                    marker='o',
                    cmap='jet',
                    markersize=5,
                    legend=True,
                    legend_kwds={
                        'label': "Temperature in Celsius",
                        'orientation': "horizontal"
                    })
    world.boundary.plot(ax=base, edgecolor='black')
    plt.title('Predicted Temperature of Jan 2020')
    # plt.show()
    plt.savefig("heatmap")
    plt.close()
    ''' ####################---- END of B1 ----###################################### '''
    ''' ############################################################################# '''

    # ---------------------------------------------------------------------------------------------------------------
    '''#################################################################################'''
    '''########## B2 plot the Error Distribution of Temperature ########################'''
    '''#################################################################################'''
    # use the model to make predictions
    predictions = model.transform(test_tmax)
    predictions = predictions.withColumn(
        'error', predictions['prediction'] - predictions['tmax'])
    df_long_lat = predictions.toPandas()
    predictions.show()

    geometry = [
        Point(xy) for xy in zip(df_long_lat['longitude'], df_long_lat['latitude'])
    ]
    df_long_lat = df_long_lat.drop(['longitude', 'latitude'], axis=1)
    crs = {'init': 'epsg:4326'}
    gdf = GeoDataFrame(df_long_lat, crs=crs, geometry=geometry)

    base = world.plot(color='white', edgecolor='black', figsize=(20, 12))
    gdf.plot(column='error',
             ax=base,
             marker='o',
             cmap='jet',
             markersize=15,
             legend=True,
             legend_kwds={
                 'label': "Error of Temperature in Celsius",
                 'orientation': "horizontal"
             })
    plt.title('Distribution of Temperature Prediction Error')
    # plt.show()
    plt.savefig('dist_temp_error')
    ''' ####################---- END of B2 ----###################################### '''
def pre_processing(df):
    '''
    Create a transformer object and apply it to df to generate another df.
    '''
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    transformed = sqlTrans.transform(df)
    transformed.show()
    return transformed
def calculate_classification_metrics(model_name, df_test_for_model, output=True):
    '''
    INPUT:
        model_name - (string) classification model name
        df_test_for_model - (pyspark dataframe) transformed test dataframe
                            including prediction and label
        output - (bool) whether to print metrics to stdout
    OUTPUT:
        metrics - (dictionary) dictionary storing TP, TN, FP, FN,
                  Precision, Recall, and F1
    DESCRIPTION:
        Print out and return TP, TN, FP, FN, Precision, Recall and F1
    '''
    # Count True Positive, True Negative, False Positive, False Negative
    # in the test data result
    sqlTrans = SQLTransformer(statement="""
        SELECT
            SUM(CASE WHEN label = 1 AND prediction = 1 THEN 1 ELSE 0 END) AS TP,
            SUM(CASE WHEN label = 0 AND prediction = 0 THEN 1 ELSE 0 END) AS TN,
            SUM(CASE WHEN label = 0 AND prediction = 1 THEN 1 ELSE 0 END) AS FP,
            SUM(CASE WHEN label = 1 AND prediction = 0 THEN 1 ELSE 0 END) AS FN
        FROM __THIS__""")
    counts = sqlTrans.transform(df_test_for_model).collect()

    # calculate precision, recall and f1 score by definition
    TP, TN, FP, FN = counts[0].TP, counts[0].TN, counts[0].FP, counts[0].FN
    if (TP + FP) > 0:
        Precision = TP / (TP + FP)
    else:
        Precision = 0
        print('[INFO: TP + FP = 0, and Precision is set to 0.]')
    if (TP + FN) > 0:
        Recall = TP / (TP + FN)
    else:
        Recall = 0
        print('[INFO: TP + FN = 0, and Recall is set to 0.]')
    if (Recall + Precision) > 0:
        F1_score = 2 * Recall * Precision / (Recall + Precision)
    else:
        F1_score = 0
        print('[INFO: Recall + Precision = 0, and F1 is set to 0.]')

    if output:
        print(model_name)
        print('precision:{:.4f}, recall:{:.4f}, f1:{:.4f}'.format(
            Precision, Recall, F1_score))
        print('(TP:{}, TN:{}, FP:{}, FN:{})'.format(TP, TN, FP, FN))

    metrics = {
        'TP': TP,
        'TN': TN,
        'FP': FP,
        'FN': FN,
        'Precision': Precision,
        'Recall': Recall,
        'F1': F1_score
    }
    return metrics
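# A minimal usage sketch for the function above, assuming an active SparkSession named
# `spark` and the SQLTransformer import used earlier.  The toy label/prediction values
# are made up purely to exercise the metric calculation.
toy_predictions = spark.createDataFrame(
    [(1.0, 1.0), (1.0, 0.0), (0.0, 0.0), (0.0, 1.0), (1.0, 1.0)],
    ["label", "prediction"])
metrics = calculate_classification_metrics("toy-model", toy_predictions)
print(metrics['Precision'], metrics['Recall'], metrics['F1'])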