Example #1
    def exercise_in_machine_learning(self):
        self.static_data_frame.printSchema()

        prepped_data_frame = self.static_data_frame.na.fill(0). \
            withColumn("day_of_week", functions.date_format(functions.col("InvoiceDate"), "EEEE")).coalesce(5)

        train_data_frame = prepped_data_frame.where(
            "InvoiceDate < '2011-03-01'")
        test_data_frame = prepped_data_frame.where(
            "InvoiceDate >= '2011-03-01'")

        print(f"TRAINING items: {train_data_frame.count()}")
        print(f"TEST DATA items: {test_data_frame.count()}")

        transformation_pipeline = Pipeline().setStages([
            feature.StringIndexer().setInputCol("day_of_week").setOutputCol(
                "day_of_week_index"),
            feature.OneHotEncoder().setInputCol(
                "day_of_week_index").setOutputCol("day_of_week_encoded"),
            feature.VectorAssembler().setInputCols(
                ["UnitPrice", "Quantity",
                 "day_of_week_encoded"]).setOutputCol("features"),
        ])

        fitted_pipeline = transformation_pipeline.fit(train_data_frame)
        transformed_training = fitted_pipeline.transform(train_data_frame)
        # transformed_training.cache()

        kmeans = clustering.KMeans().setK(2).setSeed(2)
        km_model = kmeans.fit(transformed_training)
        print(f"Training cost: {km_model.summary.trainingCost}")

        transformed_test = fitted_pipeline.transform(test_data_frame)
        transformed_test.summary().show()
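A possible follow-up, shown only as a hedged sketch (it is not part of the original example): score the clustering on the held-out split with ClusteringEvaluator.

        # Sketch only (assumes Spark >= 2.3 and the km_model / transformed_test above).
        from pyspark.ml.evaluation import ClusteringEvaluator

        predictions = km_model.transform(transformed_test)
        silhouette = ClusteringEvaluator(featuresCol="features").evaluate(predictions)
        print(f"Test silhouette: {silhouette}")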
Example #2
def classification_ml():
	if False:
		spark = SparkSession.builder.appName('classification-ml') \
			.config('spark.jars.packages', 'org.xerial:sqlite-jdbc:3.23.1') \
			.getOrCreate()

		df = spark.read \
			.format('jdbc') \
			.option('url', 'jdbc:sqlite:iris.db') \
			.option('driver', 'org.sqlite.JDBC') \
			.option('dbtable', 'iris') \
			.load()
	else:
		spark = SparkSession.builder.appName('classification-ml').getOrCreate()
		df = spark.read.option('header', 'true').option('inferSchema', 'true').format('csv').load('dataset/iris.csv')
	spark.sparkContext.setLogLevel('WARN')
	df.show()

	labels = [
		('index', types.IntegerType()),
		('a1', types.FloatType()),
		('a2', types.FloatType()),
		('a3', types.FloatType()),
		('a4', types.FloatType()),
		('id', types.StringType()),
		('label', types.StringType())
	]

	stringIndexer = ml_feature.StringIndexer(inputCol='label', outputCol='label_int')
	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[1:5]], outputCol='features')

	# Create a model.
	logistic = ml_classification.LogisticRegression(featuresCol=featuresCreator.getOutputCol(), labelCol=stringIndexer.getOutputCol(), maxIter=10, regParam=0.01)

	# Create a pipeline.
	pipeline = Pipeline(stages=[stringIndexer, featuresCreator, logistic])

	# Split the dataset into training and testing datasets.
	df_train, df_test = df.randomSplit([0.7, 0.3], seed=666)

	# Run the pipeline and estimate the model.
	model = pipeline.fit(df_train)
	test_result = model.transform(df_test)  # Dataframe.

	#print(test_result.take(1))
	#test_result.show(5, truncate=True, vertical=False)
	test_result.show(truncate=False)

	# Save and load.
	lr_path = './lr'
	logistic.write().overwrite().save(lr_path)
	lr2 = ml_classification.LogisticRegression.load(lr_path)
	print('Param =', lr2.getRegParam())

	model_path = './lr_model'
	model.write().overwrite().save(model_path)
	model2 = PipelineModel.load(model_path)
	print('Stages =', model.stages)
	print(model.stages[2].coefficientMatrix == model2.stages[2].coefficientMatrix)
	print(model.stages[2].interceptVector == model2.stages[2].interceptVector)
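A possible follow-up, added as a hedged sketch rather than part of the original: compute test accuracy from test_result, assuming pyspark.ml.evaluation is imported as ml_evaluation alongside the other ml_* aliases.

	# Sketch only: multiclass accuracy of the fitted pipeline on the test split.
	from pyspark.ml import evaluation as ml_evaluation

	evaluator = ml_evaluation.MulticlassClassificationEvaluator(
		labelCol='label_int', predictionCol='prediction', metricName='accuracy')
	print('Test accuracy =', evaluator.evaluate(test_result))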
Example #3
    def buildModel(self, save_pipe_path=None):
        df = self.getModelData()

        label_index = fea.StringIndexer(inputCol='user_type', outputCol='label')
        reTokenizer = fea.RegexTokenizer(inputCol='appnames', outputCol='appname_token', pattern=',')
        cnt_vector = fea.CountVectorizer(inputCol='appname_token', outputCol='appname_vector')
        vecAssembler = fea.VectorAssembler(inputCols=['appname_vector'], outputCol="feature")
        scaler = fea.StandardScaler(inputCol='feature', outputCol='features')

        if not save_pipe_path:
            lr = LogisticRegression()
            grid = ParamGridBuilder().addGrid(lr.elasticNetParam, [0, 1]).build()
            evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR")

            pipeline = Pipeline(stages=[label_index, reTokenizer, cnt_vector, vecAssembler, scaler])
            pipe = pipeline.fit(df)
            pipe_out = pipe.transform(df)

            cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
            model = cv.fit(pipe_out)

            print(evaluator.evaluate(model.transform(pipe_out)))
            print('Best param (elasticNetParam):', model.bestModel._java_obj.getElasticNetParam())

            predict_result = model.transform(pipe_out).select('probability', 'label').toPandas()
            predict_result.to_csv('/home/chenchen/data/predict_result1.csv', index=False)
        else:
            lr = LogisticRegression(elasticNetParam=1.0)

            pipeline = Pipeline(stages=[label_index, reTokenizer, cnt_vector, vecAssembler, scaler, lr])
            model = pipeline.fit(df)

            model.save(save_pipe_path)
            print('pipe saved')
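For completeness, a hedged sketch (not in the original method) of reloading the saved pipeline; it assumes PipelineModel is imported from pyspark.ml.

            # Sketch only: reload the persisted PipelineModel and score the same data.
            loaded = PipelineModel.load(save_pipe_path)
            loaded.transform(df).select('probability', 'prediction').show(5)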
Example #4
def train_evaluate(train_data, test_data):
    # Convert the text category feature to numeric indices
    stringIndexer = ft.StringIndexer(inputCol='alchemy_category',
                                     outputCol="alchemy_category_Index")

    encoder = ft.OneHotEncoder(dropLast=False,
                               inputCol='alchemy_category_Index',
                               outputCol="alchemy_category_IndexVec")

    assemblerInputs = ['alchemy_category_IndexVec'] + train_data.columns[4:-1]
    assembler = ft.VectorAssembler(inputCols=assemblerInputs,
                                   outputCol="features")

    # dt = cl.DecisionTreeClassifier(labelCol="label",
    #                             featuresCol="features")
    rf = cl.RandomForestClassifier(labelCol="label", featuresCol="features")

    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability",
        labelCol='label',
        metricName='areaUnderROC')

    grid_search = tune.ParamGridBuilder()\
        .addGrid(rf.impurity, [ "gini","entropy"])\
        .addGrid(rf.maxDepth, [ 5,10,15])\
        .addGrid(rf.maxBins, [10, 15,20])\
        .addGrid(rf.numTrees, [10, 20,30])\
        .build()

    rf_cv = tune.CrossValidator(estimator=rf,
                                estimatorParamMaps=grid_search,
                                evaluator=evaluator,
                                numFolds=5)

    # rf_tvs = tune.TrainValidationSplit(
    #     estimator=rf,
    #     estimatorParamMaps=grid_search,
    #     evaluator=evaluator,
    #     trainRatio=0.7
    # )
    pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rf_cv])
    cv_pipeline_model = pipeline.fit(train_data)

    best_model = cv_pipeline_model.stages[-1]
    best_parm = get_best_param(best_model)

    AUC, AP = evaluate_model(cv_pipeline_model, test_data)

    return AUC, AP, best_parm, cv_pipeline_model
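The helpers get_best_param and evaluate_model are not shown in this example; the sketches below are assumptions about what they might look like, reusing the ev alias for pyspark.ml.evaluation.

def get_best_param(cv_model):
    # Hypothetical helper: parameter map of the model chosen by the CrossValidator.
    best = cv_model.bestModel
    return {p.name: v for p, v in best.extractParamMap().items()}


def evaluate_model(pipeline_model, df):
    # Hypothetical helper: AUC and average precision of the fitted pipeline on df.
    scored = pipeline_model.transform(df)
    auc = ev.BinaryClassificationEvaluator(rawPredictionCol="probability",
                                           labelCol="label",
                                           metricName="areaUnderROC").evaluate(scored)
    ap = ev.BinaryClassificationEvaluator(rawPredictionCol="probability",
                                          labelCol="label",
                                          metricName="areaUnderPR").evaluate(scored)
    return auc, ap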
Example #5
def fill_empty_string(string_in, fill_value='unknown'):
    if not isinstance(string_in, str):
        return fill_value
    elif not string_in:
        return fill_value
    else:
        return string_in


na_handler = ssf.udf(fill_empty_string, sst.StringType())

indexers = {}
for cat_col in cat_cols:
    merged = merged.withColumn(cat_col, na_handler(cat_col))
    indexer = smf.StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}Inx")
    indexer = indexer.fit(merged)
    merged = indexer.transform(merged)
    merged = merged.drop(cat_col).withColumnRenamed(f"{cat_col}Inx", cat_col)
    indexers[cat_col] = indexer

# merged.write.parquet('data/cached/indexed.parquet')

encoder = smf.OneHotEncoder(inputCols=cat_cols,
                            outputCols=[f"{x}Vec" for x in cat_cols])
encoder = encoder.fit(merged)
encoded = encoder.transform(merged)
encoded = encoded.drop(*cat_cols)

encoded = encoded.persist()
Example #6
    model_data.show()
    # Create is_late
    model_data = model_data.withColumn("is_late", model_data.arr_delay > 0)
    # Convert to an integer
    model_data = model_data.withColumn("label",
                                       model_data.is_late.cast("integer"))
    # Remove missing values
    model_data = model_data.filter(
        "arr_delay is not NULL and dep_delay is not NULL and air_time is not NULL and plane_year is not NULL"
    )

    model_data.show()

    # Create a StringIndexer
    carr_indexer = features.StringIndexer(inputCol="carrier",
                                          outputCol="carrier_index")
    # Create a OneHotEncoder
    carr_encoder = features.OneHotEncoder(inputCol="carrier_index",
                                          outputCol="carrier_fact")

    # Create a StringIndexer
    dest_indexer = features.StringIndexer(inputCol="dest",
                                          outputCol="dest_index")
    # Create a OneHotEncoder
    dest_encoder = features.OneHotEncoder(inputCol="dest_index",
                                          outputCol="dest_fact")

    # Make a VectorAssembler
    vec_assembler = features.VectorAssembler(inputCols=[
        "month", "air_time", "carrier_fact", "dest_fact", "plane_age"
    ],
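The example is cut off mid-call above. An assumed completion (not the original code; it presumes Pipeline is imported from pyspark.ml) would close the assembler and chain the stages:

    # Sketch only: assumed completion of the truncated snippet.
    vec_assembler = features.VectorAssembler(inputCols=[
        "month", "air_time", "carrier_fact", "dest_fact", "plane_age"
    ], outputCol="features")
    flights_pipe = Pipeline(stages=[dest_indexer, dest_encoder,
                                    carr_indexer, carr_encoder, vec_assembler])
    piped_data = flights_pipe.fit(model_data).transform(model_data)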
Example #7
chn = []
for col in df_train.columns:
    count = df_train.filter('{} is null'.format(col)).count()
    if count > 0:
        chn.append({col:count})
print('columns with null values: {}'.format(chn))

# handle missing data
df_train = df_train.na.fill(value=df_train.groupby().avg('Age').take(1)[0][0], subset=['Age'])

df_train = df_train.dropna(subset = ['Age', 'Embarked'])

# convert categorical data to numeric
convert_cols = ['Sex', 'Embarked']
for col in convert_cols:
    label_indexer = feature.StringIndexer(inputCol=col, outputCol=col + 'C').fit(df_train)
    df_train = label_indexer.transform(df_train).drop(col)
 

# create optimal model with backward elimination
from sklearn.preprocessing import StandardScaler 
import statsmodels.api as sm
df_p = df_train.toPandas()
#sc = StandardScaler()
X = df_p.iloc[:,1:].values
#X = sc.fit_transform(X)
#X = np.append(arr = np.ones((712,1)).astype(int), values=X, axis=1)
y = df_p.iloc[:,0].values

X_Opt = X[:,[0,1,2,3,4,5,6]]
regressor_OLS = sm.OLS(endog=y, exog=X_Opt).fit()
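A small assumed next step for the backward-elimination loop (not in the original): inspect the fitted model's p-values before deciding which column to drop.

# Sketch only: the OLS summary includes the coefficient p-values used for elimination.
print(regressor_OLS.summary())
print(regressor_OLS.pvalues)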
Example #8
# Apart from district_id these rows are entirely null, so they can be dropped; 6052 rows remain
data_sample = data_sample.dropna()
print('Count of rows: {0}'.format(data_sample.count()))

# Inspect the schema and save the data
data_sample.printSchema()

####################### Create a transformer #######################

## Transform the six variables 'land_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'position' and 'y'
import pyspark.ml.feature as ft

# For the five multi-class variables 'land_condition', 'foundation_type', 'roof_type', 'ground_floor_type' and 'position',
# first convert the type with StringIndexer, then one-hot encode with OneHotEncoderEstimator
indexer1 = ft.StringIndexer(inputCol="land_condition",
                            outputCol="land_condition_index")
data_sample = indexer1.fit(data_sample).transform(data_sample)
indexer2 = ft.StringIndexer(inputCol="foundation_type",
                            outputCol="foundation_type_index")
data_sample = indexer2.fit(data_sample).transform(data_sample)
indexer3 = ft.StringIndexer(inputCol="roof_type", outputCol="roof_type_index")
data_sample = indexer3.fit(data_sample).transform(data_sample)
indexer4 = ft.StringIndexer(inputCol="ground_floor_type",
                            outputCol="ground_floor_type_index")
data_sample = indexer4.fit(data_sample).transform(data_sample)
indexer5 = ft.StringIndexer(inputCol="position", outputCol="position_index")
data_sample = indexer5.fit(data_sample).transform(data_sample)

encoder = ft.OneHotEncoderEstimator( \
            inputCols=['land_condition_index', 'foundation_type_index', 'roof_type_index', 'ground_floor_type_index', 'position_index'], \
            outputCols=['land_condition_vec', 'foundation_type_vec', 'roof_type_vec', 'ground_floor_type_vec', 'position_vec'])
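A likely next step, added here as an assumption rather than part of the original snippet: fit the estimator and materialize the one-hot columns.

# Sketch only: fit the OneHotEncoderEstimator and append the *_vec columns.
data_sample = encoder.fit(data_sample).transform(data_sample)
data_sample.printSchema()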
Example #9
    logger.info("Starting Spark Context")

    spark = sparknlp.start()
    conf = (pyspark.SparkConf().set("spark.ui.showConsoleProgress", "true"))
    sc = pyspark.SparkContext.getOrCreate(conf=conf)
    sqlcontext = pyspark.SQLContext(sc)
    training_set = (sqlcontext.read.format("parquet").option(
        "header", True).load(data_dir))
    # TF
    cv = sf.CountVectorizer(inputCol=features, outputCol="tf_features")

    # IDF
    idf = sf.IDF(inputCol="tf_features", outputCol="features")

    # StringIndexer
    label_string = sf.StringIndexer(inputCol=label, outputCol="label")

    # Logistic regression
    lr = LogisticRegression(maxIter=10, family="multinomial")
    pipeline = Pipeline(stages=[cv, idf, label_string, lr])

    paramGrid = (ParamGridBuilder().addGrid(cv.vocabSize,
                                            [500, 1000, 1500]).addGrid(
                                                lr.regParam,
                                                [0.1, 0.01, 0.001]).build())

    logger.info("Pipeline created ...")
    logger.info("Starts grid search ...")
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=MulticlassClassificationEvaluator(),
Example #10
def to_index(df, col):
    outcol = col + "_idx"
    indexer = mlf.StringIndexer(inputCol=col, outputCol=outcol)
    # print(indexer.params)
    return indexer.fit(df).transform(df).drop(col)
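A short usage sketch with a hypothetical session and column name (not from the original):

# Sketch only: index a string column and drop the original.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a",), ("b",), ("a",)], ["category"])
to_index(df, "category").show()  # the result carries 'category_idx' instead of 'category'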
Example #11
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__='zhangyuwei37'

import pyspark.ml.feature as ft
from pyspark.ml import Pipeline

# Feature preprocessing: one-hot encode the categorical variables, scale the numeric variables,
# then assemble all features and output the PCA-reduced result
# one-hot
indexers = [
    ft.StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in nomial_features
]
encoders = [
    ft.OneHotEncoder(inputCol=indexer.getOutputCol(),
                     outputCol="{0}_encoded".format(indexer.getOutputCol()))
    for indexer in indexers
]
assembler_onehot = ft.VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders],
    outputCol="onehot_features")

#scaler
assembler_numeric = ft.VectorAssembler(inputCols=numeric_features,
                                       outputCol="numeric_features")
std_scaler = ft.StandardScaler(inputCol="numeric_features",
                               outputCol="numeric_features_scaled")

assembler_final = ft.VectorAssembler(
    inputCols=['onehot_features', 'numeric_features_scaled'],
    outputCol="final_features")
Example #12
    logger.info("Starting Spark Context")

    conf = (pyspark.SparkConf().set("spark.ui.showConsoleProgress", "true"))
    sc = pyspark.SparkContext.getOrCreate(conf=conf)
    sqlcontext = pyspark.SQLContext(sc)
    training_set = (sqlcontext.read.format("parquet").option(
        "header", True).load(data_dir))

    # TF
    cv = sf.CountVectorizer(inputCol="text",
                            outputCol="tf_features",
                            vocabSize=input_dim)
    # IDF
    idf = sf.IDF(inputCol="tf_features", outputCol="features")
    label_string = sf.StringIndexer(inputCol="first_label", outputCol="label")
    pipeline_dl = Pipeline(stages=[cv, idf, label_string])
    df = pipeline_dl.fit(training_set).transform(training_set)
    df = df.rdd.map(lambda x: (LabeledPoint(x[
        'label'], MLLibVectors.fromML(x['features']))))
    logger.info("Pipeline created ...")
    logger.info("Transforms the text into tf idf RDD ...")
    model = create_keras_model(input_dim, output_dim)

    logger.info("Starts Training ...")
    spark_model = SparkMLlibModel(model=model,
                                  frequency='epoch',
                                  mode='asynchronous',
                                  parameter_server_mode='socket')
    spark_model.fit(df,
                    epochs=epochs,
Example #13
    .builder \
    .appName("pass") \
    .getOrCreate()

# TRAIN
# reading data_train
data_train = spark.read.csv("train.csv", header=True, inferSchema=True)

# preprocessing data
data_train = data_train.drop("PassengerId", "Name", "SibSp", "Parch", "Ticket",
                             "Cabin", "Embarked")
data_train.count()
#data_train = data_train.na.fill(value=data_train.groupby().avg('Age').take(1)[0][0], subset=['Age'])

# convert sex
label_indexer = feature.StringIndexer(inputCol='Sex',
                                      outputCol='Sex_num').fit(data_train)
data_train = label_indexer.transform(data_train).drop('Sex')

# choose feature columns
feature_cols = data_train.columns[1:]
assembler = feature.VectorAssembler(inputCols=feature_cols,
                                    outputCol='features')
data_train = assembler.setHandleInvalid("skip").transform(data_train)
data_train = data_train.withColumnRenamed('Survived', 'label')
data_train = data_train.select('features', 'label')

# TEST
# reading data_test
data_test = spark.read.csv("test.csv", header=True, inferSchema=True)
# preprocessing data
data_test = data_test.drop("PassengerId", "Name", "SibSp", "Parch", "Ticket",
Example #14
def main(spark):
    n = len(sys.argv) - 1
    if n < 1:
        print('\nParameters are needed!!\n')
        sys.exit()
    else:
        result_type = sys.argv[1]
        sku_type = sys.argv[2]
        end_date = sys.argv[3]
        end_date_1w = sys.argv[4]
        end_date_2w = sys.argv[5]
        input_train_data_table = sys.argv[6]
        input_predict_data_table = sys.argv[7]
        output_predict_result_table = sys.argv[8]
        predict_date = sys.argv[9]

    spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    spark.sql("set spark.sql.hive.mergeFiles=true")
    spark.sql("set hive.exec.orc.split.strategy=BI")
    spark.sql("set mapred.job.priority = HIGH")
    spark.sql("set hive.default.fileformat=Orc")
    spark.sql("set hive.exec.parallel=true")
    spark.sql("set hive.auto.convert.join=true")
    spark.sql("set hive.merge.mapfiles = true")
    spark.sql("set hive.merge.mapredfiles = true")
    spark.sql("set hive.merge.size.per.task = 256000000")
    spark.sql("set hive.merge.smallfiles.avgsize=128000000")
    spark.sql("set hive.merge.orcfile.stripe.level=false")
    spark.sql("set hive.exec.dynamic.partition=true")
    spark.sql("set hive.exec.max.dynamic.partitions=1000000")
    spark.sql("set hive.exec.max.dynamic.partitions.pernode=1000000")
    spark.sql("set hive.exec.max.created.files=1000000")
    spark.sql("set mapreduce.job.counters.limit=10000")
    spark.sql("set mapred.output.compress=true")
    spark.sql("set hive.exec.compress.output=true")
    spark.sql("set spark.shuffle.service.enabled = true")
    spark.sql("set spark.sql.broadcastTimeout = 10000")

    print('end_date = {}\n'.format(end_date))
    print('sku_type = {}\n'.format(sku_type))
    print('result_type = {}\n'.format(result_type))

    ### Build the training and prediction samples

    # Determine the data-selection scope
    if sku_type == 'old':
        sku_type_sql = ' and otc_days >= 60'
    elif sku_type == 'new':
        sku_type_sql = ' and otc_days < 60'
    else:
        sku_type_sql = ''

    # Positive samples for the current week
    data_now = spark.sql("""
          select 
              t1.*
          from 
              (
              select * 
              from """ + input_train_data_table + """ 
              where end_date = '""" + end_date + """' and label > 0""" +
                         sku_type_sql + """
              )t1
          join
              (
              select 
                  item_third_cate_cd
              from 
                  app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
              where 
                  dt = '""" + predict_date + """'
                  and app_id = 4
                  and scene_id = 1
                  and status = 3
              group by 
                  item_third_cate_cd
              )t2
           on t1.item_third_cate_cd = t2.item_third_cate_cd
    """)

    # Positive samples that only appear 1 week earlier
    data_1w = spark.sql("""
                select 
                    a.*
                from 
                    (
                    select 
                        t1.*
                    from 
                        (
                        select * 
                        from """ + input_train_data_table + """ 
                        where end_date = '""" + end_date_1w +
                        """' and label > 0""" + sku_type_sql + """
                        )t1
                    join
                        (
                        select 
                            item_third_cate_cd
                        from 
                            app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
                        where 
                            dt = '""" + predict_date + """'
                            and app_id = 4
                            and scene_id = 1
                            and status = 3
                        group by 
                            item_third_cate_cd
                        )t2
                    on t1.item_third_cate_cd = t2.item_third_cate_cd
                    )a
                left join 
                    (
                    select 
                        item_sku_id,1 as index
                    from 
                        """ + input_train_data_table + """ 
                    where 
                        end_date = '""" + end_date + """' and label > 0""" +
                        sku_type_sql + """
                    )b 
                on 
                    a.item_sku_id=b.item_sku_id
                where 
                    index is null or index = ''
                """)

    # Positive samples that only appear 2 weeks earlier
    data_2w = spark.sql("""
                select 
                    a.*
                from 
                    (
                    select 
                        t1.*
                    from 
                        (
                        select * 
                        from """ + input_train_data_table + """ 
                        where end_date = '""" + end_date_2w +
                        """' and label > 0""" + sku_type_sql + """
                        )t1
                    join
                        (
                        select 
                            item_third_cate_cd
                        from 
                            app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
                        where 
                            dt = '""" + predict_date + """'
                            and app_id = 4
                            and scene_id = 1
                            and status = 3
                        group by 
                            item_third_cate_cd
                        )t2
                    on t1.item_third_cate_cd = t2.item_third_cate_cd
                    )a
                left join 
                    (
                    select 
                        item_sku_id,1 as index
                    from 
                        """ + input_train_data_table + """ 
                    where 
                        end_date = '""" + end_date + """' and label > 0""" +
                        sku_type_sql + """
                    )b 
                on 
                    a.item_sku_id=b.item_sku_id
                where 
                    index is null or index = ''
                """)

    # Merge the positive samples
    data = data_now.union(data_1w).union(data_2w)
    data_filter = data.filter("otc_days >= 0").filter("sku_status_cd = 3001")
    data_filter.cache()
    data_count = data_filter.count()
    print('positive data count = {}\n'.format(data_count))

    # Supplement with negative samples
    data_neg = spark.sql("""
          select 
              t1.*
          from 
              (
              select * 
              from """ + input_train_data_table + """ 
              where end_date = '""" + end_date + """' and label = 0""" +
                         sku_type_sql + """
              and otc_days >= 0 and sku_status_cd = 3001
              )t1
          join
              (
              select 
                  item_third_cate_cd
              from 
                  app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
              where 
                  dt = '""" + predict_date + """'
                  and app_id = 4
                  and scene_id = 1
                  and status = 3
              group by 
                  item_third_cate_cd
              )t2
           on t1.item_third_cate_cd = t2.item_third_cate_cd
              """)
    data_neg.cache()
    data_neg_count = data_neg.count()
    neg_sample_ratio = min(data_count /
                           data_neg_count, 1.0) if data_neg_count > 0 else 0.0
    data_neg_sample = data_neg.sample(neg_sample_ratio, seed=66)

    # Merge positive and negative samples
    if result_type == 'ucvr':
        data_union = data_filter.union(data_neg_sample).orderBy(func.rand(seed=66)).filter("item_first_cate_cd is not null")\
                          .withColumn('data_type_int', func.col('data_type').cast(IntegerType())).drop('data_type').withColumnRenamed('data_type_int','data_type')\
                          .withColumn('label_adjust',func.when(func.col('label') > 1,1).otherwise(func.col('label')))\
                          .drop('label').withColumnRenamed('label_adjust','label')
    else:
        data_union = data_filter.union(data_neg_sample).orderBy(func.rand(seed=66)).filter("item_first_cate_cd is not null")\
                          .withColumn('data_type_int', func.col('data_type').cast(IntegerType())).drop('data_type').withColumnRenamed('data_type_int','data_type')\
                          .withColumn('label_binary',func.when(func.col('label') > 0,1).otherwise(0))\
                          .drop('label').withColumnRenamed('label_binary','label')

    # Join the sku embedding features
    predict_date_str = ''.join(predict_date.split('-'))
    sku_vec = spark.sql(
        "select * from tmp.tmp_qzl_sink_search_08_sku2vec_features_{0}".format(
            predict_date_str))
    vec_size = len(sku_vec.columns) - 1
    data_union_sku2vec = data_union.join(sku_vec, on='item_sku_id', how='left')

    ### Train the model

    # Feature grouping
    # Non-feature columns
    features_useless = [
        'item_first_cate_name', 'item_second_cate_cd', 'item_second_cate_name',
        'item_third_cate_cd', 'item_third_cate_name', 'barndname_full',
        'sku_name', 'item_sku_id', 'uv_value_label', 'first_into_otc_tm',
        'end_date', 'sku_status_cd', 'red_price', 'red_price_level_rank'
    ]
    # Categorical features
    features_category = ['item_first_cate_cd']
    # Embedding features
    features_embedding = ['sku_vec_' + str(i) for i in range(vec_size)]
    # Numerical features
    features_numerical = [
        f for f in data_union_sku2vec.columns if f not in ['label'] +
        features_useless + features_category + features_embedding
    ]

    # Missing-value statistics for each feature
    feature_na = data_union_sku2vec.agg(
        *[(1 - (func.count(c) / func.count('*'))).alias(c)
          for c in data_union_sku2vec.columns])
    feature_na_DF = sqlDF2pandasDF(feature_na).T
    feature_na_DF = feature_na_DF.reset_index()
    feature_na_DF.columns = ['features', 'na_rate']
    for i, row in feature_na_DF.iterrows():
        print('{}: {}'.format(row['features'], row['na_rate']))

    # Handle missing values
    fillna_value = {c: -1 for c in features_numerical}
    fillna_value.update({c: -10 for c in features_embedding})
    data_union_sku2vec_fillna = data_union_sku2vec.fillna(fillna_value)

    # Data preprocessing
    stringIndexer_cd1 = ft.StringIndexer(inputCol="item_first_cate_cd",
                                         outputCol="item_first_cate_cd_index")
    encoder_cd1 = ft.OneHotEncoder(inputCol='item_first_cate_cd_index',
                                   outputCol='item_first_cate_cd_vec')
    featuresCreator = ft.VectorAssembler(inputCols=features_numerical +
                                         [encoder_cd1.getOutputCol()] +
                                         features_embedding,
                                         outputCol='features')
    pipeline = Pipeline(
        stages=[stringIndexer_cd1, encoder_cd1, featuresCreator])
    data_transformer = pipeline.fit(data_union_sku2vec_fillna)
    data_transformed = data_transformer.transform(data_union_sku2vec_fillna)
    data_transformed.cache()
    data_union_count = data_transformed.count()
    print('data_union_count = {}\n'.format(data_union_count))
    data_filter.unpersist()
    data_neg.unpersist()

    p_num = get_best_partition(data_union_count)
    data_transformed = data_transformed.repartition(p_num)

    # Start training
    best_depth = 12  # get_best_depth(data_union_count)
    best_iter = 150  # get_best_iter(data_union_count)
    f = '1.0'  # '0.8'
    s = 1.0  # 0.8

    if result_type == 'ucvr':
        gbdt = GBTRegressor(featuresCol='features',labelCol='label',predictionCol='prediction',lossType='squared',seed=66,maxMemoryInMB=2048,cacheNodeIds=True, \
                             maxDepth=best_depth,maxIter=best_iter,featureSubsetStrategy=f,subsamplingRate=s,stepSize=0.01)
    else:
        gbdt = GBTClassifier(featuresCol='features',labelCol='label',predictionCol='prediction',lossType='logistic',seed=66,maxMemoryInMB=2048,cacheNodeIds=True,\
                             maxDepth=best_depth,maxIter=best_iter,featureSubsetStrategy=f,subsamplingRate=s,stepSize=0.01)

    gbdt_model = gbdt.fit(data_transformed)

    ### Predict results for the candidate items

    # Build the samples to be predicted
    if sku_type == 'old':
        sku_type_sql_2 = ' where otc_days >= 60'
    elif sku_type == 'new':
        sku_type_sql_2 = ' where otc_days < 60'
    else:
        sku_type_sql_2 = ''

    data_test = spark.sql("select * from " + input_predict_data_table + "" +
                          sku_type_sql_2 + "")
    data_test = data_test.withColumn(
        'data_type_int',
        func.col('data_type').cast(
            IntegerType())).drop('data_type').withColumnRenamed(
                'data_type_int', 'data_type')
    data_test.cache()
    data_test_count = data_test.count()
    print('data_test_count = {}\n'.format(data_test_count))
    data_test = data_test.repartition(get_best_partition(data_test_count))

    # Preprocess the prediction samples
    data_test_sku2vec = data_test.join(sku_vec, on='item_sku_id', how='left')
    fillna_value_test = {c: -1 for c in features_numerical}
    fillna_value_test.update({c: -10 for c in features_embedding})
    data_test_fillna = data_test_sku2vec.fillna(fillna_value_test)
    # Note: this refits the indexer/encoder on the prediction set; reusing the
    # fitted data_transformer above would keep category encodings consistent.
    data_transformer_test = pipeline.fit(data_test_fillna)
    data_transformed_test = data_transformer_test.transform(data_test_fillna)
    data_transformed_test.cache()
    data_test.unpersist()

    # Produce and write out the prediction results for the candidate item pool
    gbdt_pred_test = gbdt_model.transform(data_transformed_test)
    features_result = [
        'item_third_cate_cd', 'item_sku_id', 'prediction', 'red_price',
        'red_price_level_rank', 'otc_days'
    ]

    if result_type == 'binary_prob':
        gbdt_pred_test = gbdt_pred_test.select(['item_third_cate_cd','item_sku_id','probability','red_price','red_price_level_rank','otc_days'])\
                         .rdd.map(lambda row:(row['item_third_cate_cd'],row['item_sku_id'],float(row['probability'][1]),row['red_price'],row['red_price_level_rank'],row['otc_days'])).toDF(features_result)
    else:
        gbdt_pred_test = gbdt_pred_test.withColumn('prediction_adjust',func.when(func.col('prediction') > 1,1).when(func.col('prediction') < 0,0).otherwise(func.col('prediction')))\
                          .drop('prediction').withColumnRenamed('prediction_adjust','prediction')

    result = gbdt_pred_test.select(features_result).withColumn(
        'new_old',
        func.when(func.col('otc_days') < 90, 'new').otherwise('old'))
    result.createOrReplaceTempView("result_df")
    spark.sql("""
             insert overwrite table """ + output_predict_result_table + """ 
             partition(dt='""" + predict_date + """',sku_type='""" + sku_type +
              """',result_type='""" + result_type + """') 
             select * from result_df
    """)

    data_transformed.unpersist()
    data_transformed_test.unpersist()
Example #15
#convert them to vectors
df_conv01 = convDf(df01)

#prepare for ml
df_prepped01 = prep(df_conv01)
df_prepped02 = df02.select("name").distinct()

#function to apply labels
df_labeled = get_labels(df_prepped01, df_prepped02)
df_labeled = df_labeled.na.drop().drop("version_idx")
cols_for_ml = df_prepped01.drop("name").drop("version_idx").schema.names

# pipeline stages
#index the label
labelIndexer = mlf.StringIndexer(inputCol="Label", outputCol="Label_idx")
#vectorise the input
toVec = mlf.VectorAssembler(inputCols=cols_for_ml, outputCol="Features")
#classify
classifier = DecisionTreeClassifier(labelCol="Label_idx",
                                    featuresCol="Features",
                                    maxDepth=10,
                                    maxBins=200)

# create a pipeline of the stages and use it to train and test
pipeline = ml.Pipeline(stages=[labelIndexer, toVec, classifier])
train, test = df_labeled.randomSplit([0.7, 0.3], seed=12345)
df_pip = pipeline.fit(train)
predicted = df_pip.transform(test)
#print result
predicted.select("name", "Label_idx", "prediction", "rawPrediction",
Example #16
# Use city as the label and the pollutant concentrations, time fields and air-quality level as features
# Split the original dataset into training and test sets and predict the value of city
# Finally evaluate with multiclass accuracy: 0.4085
data = spark.sql(
    "select PM25,PM10,NO2,SO2,O3_1,O3_8h,CO,AQI,level,year,month,date,hour,city from init_df"
)
# Assemble the feature columns into a single vector
vector_assembler = ft.VectorAssembler(inputCols=[
    "PM25", "PM10", "NO2", "SO2", "O3_1", "O3_8h", "CO", "AQI", "level",
    "year", "month", "date", "hour"
],
                                      outputCol="features")
data = vector_assembler.transform(data)
# data.show()
# Convert city into a numeric index
label_indexer = ft.StringIndexer(inputCol="city",
                                 outputCol="city_int").fit(data)
label_converter = ft.IndexToString(inputCol="pred_int",
                                   outputCol="pred",
                                   labels=label_indexer.labels)

train, test = data.randomSplit([0.7, 0.3])
# Define the random forest classifier
classifier = RandomForestClassifier(labelCol="city_int",
                                    featuresCol="features",
                                    predictionCol="pred_int",
                                    maxDepth=8,
                                    maxBins=128,
                                    maxMemoryInMB=512,
                                    numTrees=50)

# Model training and prediction
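A sketch of what that training and prediction step might look like, under the assumption that the example simply continues with the objects defined above (the snippet itself ends here):

# Sketch only: index the label on both splits, fit the forest, map predictions
# back to city names, and report multiclass accuracy.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

train = label_indexer.transform(train)
test = label_indexer.transform(test)
model = classifier.fit(train)
pred = label_converter.transform(model.transform(test))
accuracy = MulticlassClassificationEvaluator(labelCol="city_int",
                                             predictionCol="pred_int",
                                             metricName="accuracy").evaluate(pred)
print("Accuracy:", accuracy)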
Example #17
    .builder \
    .appName("HW-5-1") \
    .getOrCreate()

# reading data_train
data_train = spark.read.csv("iris_train.csv", header=True, inferSchema=True)

# vectorize all numerical columns into a single feature column
feature_cols = data_train.columns[:-1]
assembler = feature.VectorAssembler(inputCols=feature_cols,
                                    outputCol='features')
data_train = assembler.transform(data_train)

# convert text labels into indices
data_train = data_train.select(['features', 'class'])
label_indexer = feature.StringIndexer(inputCol='class',
                                      outputCol='label').fit(data_train)
data_train = label_indexer.transform(data_train)

# reading data_test
data_test = spark.read.csv("iris_test.csv", header=True, inferSchema=True)

# vectorize all numerical columns into a single feature column
feature_cols = data_test.columns[:-1]
assembler = feature.VectorAssembler(inputCols=feature_cols,
                                    outputCol='features')
data_test = assembler.transform(data_test)

# convert text labels into indices
data_test = data_test.select(['features', 'Species'])
label_indexer = feature.StringIndexer(inputCol='Species',
                                      outputCol='label').fit(data_test)
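The snippet ends here; one hedged way to finish it (the choice of LogisticRegression is an assumption, and note that refitting the indexer on the test frame can assign different label indices than the training one):

# Sketch only: apply the indexer, train a classifier, and measure test accuracy.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

data_test = label_indexer.transform(data_test)
lr_model = LogisticRegression(featuresCol='features', labelCol='label').fit(data_train)
predictions = lr_model.transform(data_test)
accuracy = MulticlassClassificationEvaluator(metricName='accuracy').evaluate(predictions)
print('Test accuracy:', accuracy)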