def sql_transformer_usecase():
    """
    Transform data features by way of a SQL statement.
    "__THIS__" is a placeholder for the dataset backing the input data.
    """
    spark = getSparkSession()
    df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    sqlTrans.transform(df).show(truncate=False)
def main_A(inputs):
    data = spark.read.option('encoding', 'UTF-8').csv(inputs, schema=tmax_schema)

    ################ FEATURE ENGINEERING: add yesterday tmax #####################
    if USE_YTD_TEMP_FEATURE:
        syntax = """SELECT today.latitude, today.longitude, today.elevation, today.date,
                           today.tmax, yesterday.tmax AS yesterday_tmax
                    FROM __THIS__ as today
                    INNER JOIN __THIS__ as yesterday
                        ON date_sub(today.date, 1) = yesterday.date
                        AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        data = sql_trans.transform(data)
    #############################################################################

    df = data.withColumn('day_of_year', fn.dayofyear('date'))
    df = df.withColumn('year', fn.year('date'))
    df_long_lat = df[['station', 'longitude', 'latitude', 'tmax', 'year']].toPandas()
    count_year = df_long_lat['year'].value_counts().to_dict()

    # SELECT YEAR and DURATION
    YEAR_SELECTED = 2000
    YEAR_DURATION = 20
    df_long_lat = df_long_lat.loc[(df_long_lat['year'] > YEAR_SELECTED) &
                                  (df_long_lat['year'] < YEAR_SELECTED + YEAR_DURATION)]

    # UNCLUSTER plot by finding avg temperature (group by same station and year)
    df_long_lat['avg_temp'] = df_long_lat.groupby(['station', 'year'])['tmax'].transform('mean')
    df_long_lat.drop_duplicates(subset=['station', 'year'], inplace=True)
    print(df_long_lat)

    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    geometry = [
        Point(xy) for xy in zip(df_long_lat['longitude'], df_long_lat['latitude'])
    ]
    df_long_lat = df_long_lat.drop(['longitude', 'latitude'], axis=1)
    crs = {'init': 'epsg:4326'}
    gdf = GeoDataFrame(df_long_lat, crs=crs, geometry=geometry)

    base = world.plot(color='white', edgecolor='black', figsize=(20, 12))
    gdf.plot(column='avg_temp',
             ax=base,
             marker='o',
             cmap='jet',
             markersize=15,
             legend=True,
             legend_kwds={
                 'label': "Temperature in Celsius",
                 'orientation': "horizontal"
             })
    plt.title('Distribution of Temperature between ' + str(YEAR_SELECTED) +
              " and " + str(YEAR_SELECTED + YEAR_DURATION))
    plt.savefig(inputs + "_" + str(YEAR_SELECTED) + "-" +
                str(YEAR_SELECTED + YEAR_DURATION))
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])  # use seed here
    train = train.cache()
    validation = validation.cache()

    # create a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'], outputCol="features")
    word_indexer = StringIndexer(inputCol="word",
                                 outputCol="target",
                                 handleInvalid="error",
                                 stringOrderType="frequencyDesc")
    classifier = MultilayerPerceptronClassifier(featuresCol="features",
                                                labelCol="target",
                                                layers=[3, 25, 25])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # create an evaluator and score the validation data
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="target")
    rgb_validation = rgb_model.transform(validation)
    score = evaluator.evaluate(rgb_validation, {evaluator.metricName: "accuracy"})
    print('Validation score for RGB model: %g' % (score, ))
    plot_predictions(rgb_model, 'RGB', labelCol='target')

    # create a pipeline RGB colours -> LAB colours -> word; train and evaluate
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    ltrain, lvalidation = data.randomSplit([0.75, 0.25])
    lrgb_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'], outputCol="LAB")
    lword_indexer = StringIndexer(inputCol="word",
                                  outputCol="labTarget",
                                  handleInvalid="error",
                                  stringOrderType="frequencyDesc")
    lclassifier = MultilayerPerceptronClassifier(featuresCol="LAB",
                                                 labelCol="labTarget",
                                                 layers=[3, 25, 25])
    lrgb_pipeline = Pipeline(
        stages=[sqlTrans, lrgb_assembler, lword_indexer, lclassifier])
    lrgb_model = lrgb_pipeline.fit(ltrain)
    lrgb_validation = lrgb_model.transform(lvalidation)
    lrgb_validation.show()
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="labTarget")
    lscore = evaluator.evaluate(lrgb_validation, {evaluator.metricName: "accuracy"})
    print('Validation score for LAB model: %g' % (lscore, ))
    plot_predictions(lrgb_model, 'LAB', labelCol='word')
def query(self, sql_expression):
    """
    Implements the transformations which are defined by a SQL statement.

    Currently we only support SQL syntax like "SELECT ... FROM __THIS__ ...",
    where "__THIS__" represents the underlying table of the input dataframe.

    :param self: Spark DataFrame
    :param sql_expression: SQL expression.
    :return: DataFrame with columns changed by the SQL statement.
    """
    sql_transformer = SQLTransformer(statement=sql_expression)
    return sql_transformer.transform(self)
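# A minimal usage sketch for the helper above. Because its first parameter is named
# `self`, it is presumably meant to be attached to pyspark.sql.DataFrame as a method;
# the attribute name `query` and the example columns below are illustrative assumptions,
# not part of the original source.
from pyspark.ml.feature import SQLTransformer
from pyspark.sql import DataFrame, SparkSession

DataFrame.query = query  # monkey-patch the helper onto DataFrame (assumption)

spark = SparkSession.builder.getOrCreate()
example_df = spark.createDataFrame([(1, 2.0), (2, 3.0)], ["id", "v"])
example_df.query("SELECT *, (v * 2) AS v_doubled FROM __THIS__").show()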
def main(inputs, model_file):
    data = spark.read.option('encoding', 'UTF-8').csv(inputs, schema=tmax_schema)

    ################ FEATURE ENGINEERING: add yesterday tmax #####################
    if USE_YTD_TEMP_FEATURE:
        syntax = """SELECT today.latitude, today.longitude, today.elevation, today.date,
                           today.tmax, yesterday.tmax AS yesterday_tmax
                    FROM __THIS__ as today
                    INNER JOIN __THIS__ as yesterday
                        ON date_sub(today.date, 1) = yesterday.date
                        AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        data = sql_trans.transform(data)
    #############################################################################

    data = data.withColumn('day_of_year', fn.dayofyear('date'))
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    if USE_YTD_TEMP_FEATURE:
        train_feature_assembler = VectorAssembler(inputCols=[
            'yesterday_tmax', 'day_of_year', 'latitude', 'longitude', 'elevation'
        ], outputCol='features')
    else:
        train_feature_assembler = VectorAssembler(
            inputCols=['day_of_year', 'latitude', 'longitude', 'elevation'],
            outputCol='features')

    ############# DIFFERENT ML ALGORITHMS TO BE USED ####################
    # classifier = GeneralizedLinearRegression(featuresCol='features', labelCol='tmax')
    # classifier = GBTRegressor(maxDepth=5, featuresCol='features', labelCol='tmax')
    classifier = RandomForestRegressor(numTrees=7,
                                       maxDepth=8,
                                       featuresCol='features',
                                       labelCol='tmax')
    #####################################################################

    train_pipeline = Pipeline(stages=[train_feature_assembler, classifier])
    weather_model = train_pipeline.fit(train)

    prediction = weather_model.transform(validation)
    # prediction.show()
    evaluator = RegressionEvaluator(predictionCol="prediction",
                                    labelCol='tmax',
                                    metricName='r2')  # or 'rmse'
    score = evaluator.evaluate(prediction)
    print('Validation score for weather model: %g' % (score, ))

    weather_model.write().overwrite().save(model_file)
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # create a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'], outputCol='features')
    word_indexer = StringIndexer(inputCol='word', outputCol='new_word')
    classifier = MultilayerPerceptronClassifier(labelCol="new_word", layers=[3, 30, 11])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # create an evaluator and score the validation data
    rgb_validation = rgb_model.transform(validation)
    # rgb_validation.show()
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    vali_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                       labelCol='new_word')
    score = vali_evaluator.evaluate(rgb_validation)
    print('Validation score for RGB model: %g' % (score, ))

    # create a pipeline RGB colours -> LAB colours -> word; train and evaluate
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sql_transformer = SQLTransformer(statement=rgb_to_lab_query)
    new_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                    outputCol='features')
    new_pipeline = Pipeline(
        stages=[sql_transformer, new_assembler, word_indexer, classifier])
    new_model = new_pipeline.fit(train)
    new_validation = new_model.transform(validation)
    # new_validation.show()
    new_vali_evaluator = MulticlassClassificationEvaluator(
        predictionCol='prediction', labelCol='new_word')
    new_score = new_vali_evaluator.evaluate(new_validation)
    print('Validation score for LAB model:', new_score)
    plot_predictions(new_model, 'LAB', labelCol="word")
def deriveNewMethod(df):
    from pyspark.ml.feature import SQLTransformer

    # drop rows with null values
    df = df.filter(df['area'].isNotNull())
    df = df.filter(df['price'].isNotNull())
    df = df.filter(df['room_num'].isNotNull())
    # df = df.filter(df['area'] != 'NULL')
    # df = df.filter(df['price'] != 'NULL')
    # df = df.filter(df['room_num'] != 'NULL')

    # drop rows whose value is 0
    df = df.filter(df['area'] != 0)
    df = df.filter(df['room_num'] != 0)

    # cast the string columns to float
    df = df.select('*', df['area'].cast('Float').alias('tmp_name')).drop('area')
    df = df.withColumnRenamed('tmp_name', 'area')
    df = df.select('*', df['price'].cast('Float').alias('tmp_name')).drop('price')
    df = df.withColumnRenamed('tmp_name', 'price')
    df = df.select('*', df['room_num'].cast('Float').alias('tmp_name')).drop('room_num')
    df = df.withColumnRenamed('tmp_name', 'room_num')
    print(df.dtypes)

    # derive new features: area per room and price per unit area
    sqlTransform = SQLTransformer(
        statement='SELECT *, (area/room_num) AS one_room_area, '
                  '(price/area) AS one_area_price FROM __THIS__')
    df = sqlTransform.transform(df)
    # spark.stop()
    return df
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# $example on$
from pyspark.ml.feature import SQLTransformer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("SQLTransformerExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)],
                               ["id", "v1", "v2"])
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    sqlTrans.transform(df).show()
    # $example off$

    spark.stop()
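# For reference, the show() call in the example above should print something close to
# the table below (v3 = v1 + v2, v4 = v1 * v2); the exact column widths may differ.
#
# +---+---+---+---+----+
# | id| v1| v2| v3|  v4|
# +---+---+---+---+----+
# |  0|1.0|3.0|4.0| 3.0|
# |  2|2.0|5.0|7.0|10.0|
# +---+---+---+---+----+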
supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()


# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

basicTransformation = SQLTransformer()\
    .setStatement("""
        SELECT sum(Quantity), count(*), CustomerID
        FROM __THIS__
        GROUP BY CustomerID
    """)

basicTransformation.transform(sales).show()


# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

va = VectorAssembler().setInputCols(["int1", "int2", "int3"])
va.transform(fakeIntDF).show()


# COMMAND ----------

contDF = spark.range(20).selectExpr("cast(id as double)")


# COMMAND ----------

from pyspark.ml.feature import Bucketizer
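# The notebook cell above ends right after importing Bucketizer. A minimal sketch of
# how it would typically be applied to contDF follows; the split points and the
# output column name are illustrative assumptions, not taken from the original cell.
bucketBorders = [-1.0, 5.0, 10.0, 250.0]
bucketer = Bucketizer()\
    .setSplits(bucketBorders)\
    .setInputCol("id")\
    .setOutputCol("bucket")
bucketer.transform(contDF).show()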
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.ml.feature import SQLTransformer
# $example off$
from pyspark.sql import SQLContext

if __name__ == "__main__":
    sc = SparkContext(appName="SQLTransformerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    df = sqlContext.createDataFrame([
        (0, 1.0, 3.0),
        (2, 2.0, 5.0)
    ], ["id", "v1", "v2"])
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    sqlTrans.transform(df).show()
    # $example off$

    sc.stop()
# ## Load the data

# Read the enhanced (joined) ride data from HDFS:
rides = spark.read.parquet("/duocar/joined/")


# ## Preprocess the modeling data

# A cancelled ride does not have a star rating.  Use the
# [SQLTransformer](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.SQLTransformer)
# to filter out the cancelled rides:
from pyspark.ml.feature import SQLTransformer
filterer = SQLTransformer(statement="SELECT * FROM __THIS__ WHERE cancelled == 0")
filtered = filterer.transform(rides)

# **Note:** `__THIS__` is a placeholder for the DataFrame passed into the
# `transform` method.


# ## Generate label

# We can treat `star_rating` as a continuous numerical label or an ordered
# categorical label:
filtered.groupBy("star_rating").count().orderBy("star_rating").show()

# Rather than try to predict each value, let us see if we can distinguish
# between five-star and non-five-star ratings.  We can use the
# [Binarizer](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer)
# to create our binary label:
from pyspark.ml.feature import Binarizer
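# The original excerpt stops at the Binarizer import; a plausible continuation is
# sketched below.  The threshold of 4.5, the output column name `five_star_rating`,
# and the cast to double (Binarizer requires a double input column) are assumptions
# chosen to separate five-star ratings from everything else.
converted = filtered.withColumn("star_rating", filtered.star_rating.cast("double"))
binarizer = Binarizer(inputCol="star_rating", outputCol="five_star_rating", threshold=4.5)
labeled = binarizer.transform(converted)
labeled.groupBy("five_star_rating").count().show()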
data["trainVA"] = trainPath data["testVA"] = testPath data["currentTrain"] = trainPath data["currentTest"] = testPath elif config["transformerType"] == "sql": train, test = spark.read.parquet(data["currentTrain"]), spark.read.parquet(data["currentTest"]) train.cache() test.cache() df = train.unionByName(test) sqlTrans = SQLTransformer(statement=config["statement"]) train = sqlTrans.transform(train) test = sqlTrans.transform(test) trainPath = data['scheme'] + "://" + data['save'] + "/trainSQL/" testPath = data['scheme'] + "://" + data['save'] + "/testSQL/" if "partitionCol" in data and data['partitionCol'] in train.schema.names: train.write.partitionBy(data['partitionCol']).format("parquet").save(trainPath) test.write.partitionBy(data['partitionCol']).format("parquet").save(testPath) else: train.write.format("parquet").mode("overwrite").save(trainPath) test.write.format("parquet").mode("overwrite").save(testPath) spark.stop() data["trainSQL"] = trainPath data["testSQL"] = testPath
def test_model(model_file, inputs):
    # get the data
    test_tmax = spark.read.csv(inputs, schema=tmax_schema)

    #########################################################################
    if USE_YTD_TEMP_FEATURE:
        syntax = """SELECT today.latitude, today.longitude, today.elevation, today.date,
                           today.tmax, yesterday.tmax AS yesterday_tmax
                    FROM __THIS__ as today
                    INNER JOIN __THIS__ as yesterday
                        ON date_sub(today.date, 1) = yesterday.date
                        AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        test_tmax = sql_trans.transform(test_tmax)
    #########################################################################

    test_tmax = test_tmax.withColumn('day_of_year', fn.dayofyear('date'))

    # load the model
    model = PipelineModel.load(model_file)

    # -------------------------------------------------------------------------------------------------
    '''#################################################################################'''
    '''########## B1 plot the Temperature Heatmap from trained model ####################'''
    '''#################################################################################'''
    lats, lons = np.meshgrid(np.arange(-90, 90, .5), np.arange(-180, 180, .5))
    elevs = [
        eg.get_elevations(np.array([late, lone]).T)
        for late, lone in zip(lats, lons)
    ]

    num_row = lats.shape[0]
    num_col = lats.shape[1]
    total_pixel = num_row * num_col

    # Col = 3 because of 'latitude, longitude, elevation'
    grid_lats_lons_elev = np.zeros(shape=(total_pixel, 3))
    print(grid_lats_lons_elev.shape)

    index_row_grid = 0
    for i in range(num_row):
        for j in range(num_col):
            grid_lats_lons_elev[index_row_grid] = np.array(
                [lats[i][j], lons[i][j], elevs[i][j]])
            index_row_grid += 1

    df_lats_lons_elev = pd.DataFrame(
        grid_lats_lons_elev, columns=['latitude', 'longitude', 'elevation'])

    # Assume the simulated data comes from today
    df_date = pd.DataFrame(np.arange(total_pixel), columns=['date'])
    df_date['date'] = date.today()
    df_final = pd.concat([df_date, df_lats_lons_elev], axis=1)
    print(df_final)

    simulated_tmax_schema = types.StructType([
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType())
    ])
    df_simulated_tmax = spark.createDataFrame(df_final, schema=simulated_tmax_schema)
    df_simulated_tmax = df_simulated_tmax.withColumn('day_of_year',
                                                     fn.dayofyear('date'))

    predictions = model.transform(df_simulated_tmax)
    predictions.show()
    df_predictions = predictions.toPandas()

    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    geometry = [
        Point(xy) for xy in zip(df_predictions['longitude'], df_predictions['latitude'])
    ]
    df_predictions = df_predictions.drop(['longitude', 'latitude'], axis=1)
    crs = {'init': 'epsg:4326'}
    gdf = GeoDataFrame(df_predictions, crs=crs, geometry=geometry)

    base = gdf.plot(column='prediction',
                    marker='o',
                    cmap='jet',
                    markersize=5,
                    legend=True,
                    legend_kwds={
                        'label': "Temperature in Celsius",
                        'orientation': "horizontal"
                    })
    world.boundary.plot(ax=base, edgecolor='black')
    plt.title('Predicted Temperature of Jan 2020')
    # plt.show()
    plt.savefig("heatmap")
    plt.close()
    ''' ####################---- END of B1 ----###################################### '''
    ''' ############################################################################# '''

    # ---------------------------------------------------------------------------------------------------------------
    '''#################################################################################'''
    '''########## B2 plot the Error Distribution of Temperature ########################'''
    '''#################################################################################'''
    # use the model to make predictions
    predictions = model.transform(test_tmax)
    predictions = predictions.withColumn(
        'error', predictions['prediction'] - predictions['tmax'])
    df_long_lat = predictions.toPandas()
    predictions.show()

    geometry = [
        Point(xy) for xy in zip(df_long_lat['longitude'], df_long_lat['latitude'])
    ]
    df_long_lat = df_long_lat.drop(['longitude', 'latitude'], axis=1)
    crs = {'init': 'epsg:4326'}
    gdf = GeoDataFrame(df_long_lat, crs=crs, geometry=geometry)

    base = world.plot(color='white', edgecolor='black', figsize=(20, 12))
    gdf.plot(column='error',
             ax=base,
             marker='o',
             cmap='jet',
             markersize=15,
             legend=True,
             legend_kwds={
                 'label': "Error of Temperature in Celsius",
                 'orientation': "horizontal"
             })
    plt.title('Distribution of Temperature Prediction Error')
    # plt.show()
    plt.savefig('dist_temp_error')
    ''' ####################---- END of B2 ----###################################### '''
def pre_processing(df):
    '''
    Create a transformer object and apply it to df to generate another df.
    '''
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    transformed = sqlTrans.transform(df)
    transformed.show()
    return transformed
def calculate_classification_metrics(model_name, df_test_for_model, output=True):
    '''
    INPUT:
        model_name - (string) classification model name
        df_test_for_model - (pyspark dataframe) transformed test dataframe
                            including prediction and label
        output - (bool) whether to print metrics to stdout
    OUTPUT:
        metrics - (dictionary) dictionary storing TP, TN, FP, FN,
                  Precision, Recall, and F1
    DESCRIPTION:
        Print out and return TP, TN, FP, FN, Precision, Recall and F1
    '''
    # Count True Positive, True Negative, False Positive, False Negative
    # in the test data result
    sqlTrans = SQLTransformer(statement="""
        SELECT
            SUM(CASE WHEN label = 1 AND prediction = 1 THEN 1 ELSE 0 END) AS TP,
            SUM(CASE WHEN label = 0 AND prediction = 0 THEN 1 ELSE 0 END) AS TN,
            SUM(CASE WHEN label = 0 AND prediction = 1 THEN 1 ELSE 0 END) AS FP,
            SUM(CASE WHEN label = 1 AND prediction = 0 THEN 1 ELSE 0 END) AS FN
        FROM __THIS__""")
    counts = sqlTrans.transform(df_test_for_model).collect()

    # calculate precision, recall and f1 score by definition
    TP, TN, FP, FN = counts[0].TP, counts[0].TN, counts[0].FP, counts[0].FN
    if (TP + FP) > 0:
        Precision = TP / (TP + FP)
    else:
        Precision = 0
        print('[INFO: TP + FP = 0, and Precision is set to 0.]')
    if (TP + FN) > 0:
        Recall = TP / (TP + FN)
    else:
        Recall = 0
        print('[INFO: TP + FN = 0, and Recall is set to 0.]')
    if (Recall + Precision) > 0:
        F1_score = 2 * Recall * Precision / (Recall + Precision)
    else:
        F1_score = 0
        print('[INFO: Recall + Precision = 0, and F1 is set to 0.]')

    if output:
        print(model_name)
        print('precision:{:.4f}, recall:{:.4f}, f1:{:.4f}'.format(
            Precision, Recall, F1_score))
        print('(TP:{}, TN:{}, FP:{}, FN:{})'.format(TP, TN, FP, FN))

    metrics = {
        'TP': TP,
        'TN': TN,
        'FP': FP,
        'FN': FN,
        'Precision': Precision,
        'Recall': Recall,
        'F1': F1_score
    }
    return metrics
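# A minimal usage sketch for the function above, assuming an active SparkSession named
# `spark` and the SQLTransformer import used earlier.  The toy label/prediction values
# are made up purely to exercise the metric calculation.
toy_predictions = spark.createDataFrame(
    [(1.0, 1.0), (1.0, 0.0), (0.0, 0.0), (0.0, 1.0), (1.0, 1.0)],
    ["label", "prediction"])
metrics = calculate_classification_metrics("toy-model", toy_predictions)
print(metrics['Precision'], metrics['Recall'], metrics['F1'])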