def exercise_in_machine_learning(self):
    self.static_data_frame.printSchema()

    prepped_data_frame = self.static_data_frame.na.fill(0) \
        .withColumn("day_of_week",
                    functions.date_format(functions.col("InvoiceDate"), "EEEE")) \
        .coalesce(5)

    train_data_frame = prepped_data_frame.where("InvoiceDate < '2011-03-01'")
    test_data_frame = prepped_data_frame.where("InvoiceDate >= '2011-03-01'")

    print(f"TRAINING items: {train_data_frame.count()}")
    print(f"TEST DATA items: {test_data_frame.count()}")

    transformation_pipeline = Pipeline().setStages([
        feature.StringIndexer().setInputCol("day_of_week").setOutputCol("day_of_week_index"),
        feature.OneHotEncoder().setInputCol("day_of_week_index").setOutputCol("day_of_week_encoded"),
        feature.VectorAssembler().setInputCols(
            ["UnitPrice", "Quantity", "day_of_week_encoded"]).setOutputCol("features"),
    ])

    fitted_pipeline = transformation_pipeline.fit(train_data_frame)
    transformed_training = fitted_pipeline.transform(train_data_frame)
    # transformed_training.cache()

    kmeans = clustering.KMeans().setK(2).setSeed(2)
    km_model = kmeans.fit(transformed_training)
    print(f"Training cost: {km_model.summary.trainingCost}")

    transformed_test = fitted_pipeline.transform(test_data_frame)
    transformed_test.summary().show()
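    # A minimal sketch (not part of the original listing) of scoring the fitted
    # k-means model on the held-out split. ClusteringEvaluator is standard
    # pyspark.ml, but adding this step here is an assumption.
    from pyspark.ml.evaluation import ClusteringEvaluator

    test_predictions = km_model.transform(transformed_test)
    silhouette = ClusteringEvaluator(featuresCol="features").evaluate(test_predictions)
    print(f"Test silhouette: {silhouette}")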
def train_evaluate(train_data, test_data):
    # Convert the textual categorical feature to numeric indices
    stringIndexer = ft.StringIndexer(inputCol='alchemy_category',
                                     outputCol="alchemy_category_Index")
    encoder = ft.OneHotEncoder(dropLast=False,
                               inputCol='alchemy_category_Index',
                               outputCol="alchemy_category_IndexVec")
    assemblerInputs = ['alchemy_category_IndexVec'] + train_data.columns[4:-1]
    assembler = ft.VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    # dt = cl.DecisionTreeClassifier(labelCol="label", featuresCol="features")
    rf = cl.RandomForestClassifier(labelCol="label", featuresCol="features")
    evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol="probability",
                                                 labelCol='label',
                                                 metricName='areaUnderROC')
    grid_search = tune.ParamGridBuilder() \
        .addGrid(rf.impurity, ["gini", "entropy"]) \
        .addGrid(rf.maxDepth, [5, 10, 15]) \
        .addGrid(rf.maxBins, [10, 15, 20]) \
        .addGrid(rf.numTrees, [10, 20, 30]) \
        .build()
    rf_cv = tune.CrossValidator(estimator=rf,
                                estimatorParamMaps=grid_search,
                                evaluator=evaluator,
                                numFolds=5)
    # rf_tvs = tune.TrainValidationSplit(
    #     estimator=rf,
    #     estimatorParamMaps=grid_search,
    #     evaluator=evaluator,
    #     trainRatio=0.7
    # )
    pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rf_cv])
    cv_pipeline_model = pipeline.fit(train_data)
    best_model = cv_pipeline_model.stages[-1]
    best_parm = get_best_param(best_model)
    AUC, AP = evaluate_model(cv_pipeline_model, test_data)
    return AUC, AP, best_parm, cv_pipeline_model
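# The two helpers called above are not defined in this snippet; a minimal sketch
# follows. Their names come from the call sites, but the bodies and return
# shapes are assumptions, not the original implementation.
def get_best_param(cv_model):
    # Pair each candidate parameter map with its cross-validated metric and
    # return the best map as a plain dict.
    best_idx = max(range(len(cv_model.avgMetrics)),
                   key=lambda i: cv_model.avgMetrics[i])
    return {p.name: v for p, v in cv_model.getEstimatorParamMaps()[best_idx].items()}


def evaluate_model(pipeline_model, test_data):
    # Score the fitted pipeline on held-out data and report AUC-ROC and AUC-PR.
    predictions = pipeline_model.transform(test_data)
    auc = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability", labelCol="label",
        metricName="areaUnderROC").evaluate(predictions)
    ap = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability", labelCol="label",
        metricName="areaUnderPR").evaluate(predictions)
    return auc, ap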
stringCSVRDD = spark.sparkContext.parallelize([
    ("123", "Katie", 19, "brown"),
    ("123", "Kkk", 19, "red"),
    ("234", "Michael", 22, "green"),
    ("345", "Simone", 23, "blue")
])
schema = StructType([
    StructField("id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True)
])
swimmers = spark.createDataFrame(stringCSVRDD, schema)
swimmers = swimmers.withColumn('id_int', swimmers['id'].cast(typ.IntegerType()))

# In Spark 3.x OneHotEncoder is an estimator, so fit it before transforming
encoder = ft.OneHotEncoder(inputCol='id_int', outputCol='idvec')
swimmers = encoder.fit(swimmers).transform(swimmers)

swimmers.createOrReplaceTempView("swimmers")
swimmers.select("idvec").show()

from pyspark.sql import SparkSession
import pyspark.sql.types as typ
import pyspark.ml.feature as ft

labels = [
    ('INFANT_ALIVE_AT_REPORT', typ.IntegerType()),
    ('BIRTH_PLACE', typ.IntegerType()),
    ('MOTHER_AGE_YEARS', typ.IntegerType()),
    ('FATHER_COMBINED_AGE', typ.IntegerType()),
    ('CIG_BEFORE', typ.IntegerType()),
    ('CIG_1_TRI', typ.IntegerType()),
    ('CIG_2_TRI', typ.IntegerType()),
    ('CIG_3_TRI', typ.IntegerType()),
    ('MOTHER_HEIGHT_IN', typ.IntegerType()),
    ('MOTHER_PRE_WEIGHT', typ.IntegerType()),
    ('MOTHER_DELIVERY_WEIGHT', typ.IntegerType()),
    ('MOTHER_WEIGHT_GAIN', typ.IntegerType()),
    ('DIABETES_PRE', typ.IntegerType()),
    ('DIABETES_GEST', typ.IntegerType()),
    ('HYP_TENS_PRE', typ.IntegerType()),
    ('HYP_TENS_GEST', typ.IntegerType()),
    ('PREV_BIRTH_PRETERM', typ.IntegerType())
]
def fill_empty_string(string_in, fill_value='missing'):
    # NOTE: the signature is reconstructed from the udf registration below;
    # the default fill value is an assumption.
    if not isinstance(string_in, str):
        return fill_value
    elif not string_in:
        return fill_value
    else:
        return string_in


na_handler = ssf.udf(fill_empty_string, sst.StringType())

indexers = {}
for cat_col in cat_cols:
    merged = merged.withColumn(cat_col, na_handler(cat_col))
    indexer = smf.StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}Inx")
    indexer = indexer.fit(merged)
    merged = indexer.transform(merged)
    merged = merged.drop(cat_col).withColumnRenamed(f"{cat_col}Inx", cat_col)
    indexers[cat_col] = indexer

# merged.write.parquet('data/cached/indexed.parquet')

encoder = smf.OneHotEncoder(inputCols=cat_cols,
                            outputCols=[f"{x}Vec" for x in cat_cols])
encoder = encoder.fit(merged)
encoded = encoder.transform(merged)
encoded = encoded.drop(*cat_cols)
encoded = encoded.persist()
encoded.write.parquet("data/cached/encoded.parquet", mode='overwrite')
model_data = model_data.withColumn("is_late", model_data.arr_delay > 0)

# Convert to an integer
model_data = model_data.withColumn("label", model_data.is_late.cast("integer"))

# Remove missing values
model_data = model_data.filter(
    "arr_delay is not NULL and dep_delay is not NULL "
    "and air_time is not NULL and plane_year is not NULL"
)
model_data.show()

# Create a StringIndexer
carr_indexer = features.StringIndexer(inputCol="carrier", outputCol="carrier_index")

# Create a OneHotEncoder
carr_encoder = features.OneHotEncoder(inputCol="carrier_index", outputCol="carrier_fact")

# Create a StringIndexer
dest_indexer = features.StringIndexer(inputCol="dest", outputCol="dest_index")

# Create a OneHotEncoder
dest_encoder = features.OneHotEncoder(inputCol="dest_index", outputCol="dest_fact")

# Make a VectorAssembler
vec_assembler = features.VectorAssembler(
    inputCols=["month", "air_time", "carrier_fact", "dest_fact", "plane_age"],
    outputCol="features")

# Make the pipeline (sketched below)
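# A minimal sketch of the pipeline the comment above calls for, assuming the
# stages defined above and that `features` aliases pyspark.ml.feature; the
# stage order and variable names are assumptions, not the original continuation.
from pyspark.ml import Pipeline

flights_pipe = Pipeline(stages=[dest_indexer, dest_encoder,
                                carr_indexer, carr_encoder,
                                vec_assembler])
piped_data = flights_pipe.fit(model_data).transform(model_data)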
births = spark.read.csv('./data/births_transformed.csv.gz',
                        header=True,
                        schema=schema)

# In[3]:

import pyspark.ml.feature as ft

births = births \
    .withColumn('BIRTH_PLACE_INT',
                births['BIRTH_PLACE'].cast(typ.IntegerType()))

# In[4]:

encoder = ft.OneHotEncoder(inputCol='BIRTH_PLACE_INT',
                           outputCol='BIRTH_PLACE_VEC')

# In[5]:

featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
    outputCol='features'
)

# In[6]:
data_matrix = topic_distributions.join(past_complaint_counts_df,
                                       on='grid_square', how='inner')
# So far, data_matrix contains Row(date, grid_square, topic_distribution, complaint_count).

# Get weekday from date.
get_weekday_udf = functions.udf(lambda d: d.weekday(),
                                returnType=types.IntegerType())
data_matrix = data_matrix.withColumn('weekday',
                                     get_weekday_udf(data_matrix['date']))

# Assemble the feature vectors.
weekday_one_hot_encoder = feature.OneHotEncoder(inputCol='weekday',
                                                outputCol='weekday_vector')
feature_vector_assembler = feature.VectorAssembler(
    inputCols=['weekday_vector', 'topic_distribution'],
    outputCol='final_feature_vector')
feature_assembly_pipeline = (ml.Pipeline(
    stages=[weekday_one_hot_encoder, feature_vector_assembler]).fit(data_matrix))
data_matrix = (feature_assembly_pipeline.transform(data_matrix).select(
    'date', 'grid_square', 'final_feature_vector', 'complaint_count'))

LOGGER.debug(str(data_matrix.count()) + " rows like " + str(data_matrix.take(1)))

#logistic_regression = classification.LogisticRegression(
#    maxIter=10, regParam=0.3, elasticNetParam=0.8,
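# A hedged sketch of one way the assembled features could feed a model: the
# commented-out block above points at LogisticRegression, but complaint_count is
# a count, so a plain regression is shown instead. The choice of LinearRegression
# and its hyperparameters here are assumptions, not the author's final model.
from pyspark.ml import regression

linear_regression = regression.LinearRegression(
    featuresCol='final_feature_vector',
    labelCol='complaint_count',
    maxIter=10, regParam=0.3, elasticNetParam=0.8)
regression_model = linear_regression.fit(data_matrix)
LOGGER.debug("Training RMSE: " +
             str(regression_model.summary.rootMeanSquaredError))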
# def main():
if __name__ == "__main__":
    # specify schema structure of the df
    schema = typ.StructType(
        [typ.StructField(e[0], e[1], False) for e in labels])

    births = spark.read.csv("../data/births_transformed.csv.gz",
                            header=True,
                            schema=schema)
    births = births.withColumn("BIRTH_PLACE_INT",
                               births["BIRTH_PLACE"].cast(typ.IntegerType()))

    encoder = ft.OneHotEncoder(inputCol="BIRTH_PLACE_INT",
                               outputCol="BIRTH_PLACE_VEC")

    # column with all features collected together
    features_creator = ft.VectorAssembler(
        inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
        outputCol="features")

    logistic = cl.LogisticRegression(maxIter=10,
                                     regParam=0.01,
                                     labelCol="INFANT_ALIVE_AT_REPORT")

    pipe = Pipeline(stages=[encoder, features_creator, logistic])

    # train and test
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)
    model = pipe.fit(births_train)
    model_test = model.transform(births_test)
    print(model_test.take(1))
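    # A minimal sketch of evaluating the fitted pipeline on the held-out split;
    # the `ev` alias import is an addition and not part of the original script.
    import pyspark.ml.evaluation as ev

    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability", labelCol="INFANT_ALIVE_AT_REPORT")
    print(evaluator.evaluate(model_test, {evaluator.metricName: "areaUnderROC"}))
    print(evaluator.evaluate(model_test, {evaluator.metricName: "areaUnderPR"}))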
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__='zhangyuwei37'

import pyspark.ml.feature as ft
from pyspark.ml import Pipeline

# Feature preprocessing: one-hot encode the categorical variables, scale the
# numeric variables, then assemble everything and output the PCA-reduced result.

# one-hot
indexers = [
    ft.StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in nomial_features
]
encoders = [
    ft.OneHotEncoder(inputCol=indexer.getOutputCol(),
                     outputCol="{0}_encoded".format(indexer.getOutputCol()))
    for indexer in indexers
]
assembler_onehot = ft.VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders],
    outputCol="onehot_features")

# scaler
assembler_numeric = ft.VectorAssembler(inputCols=numeric_features,
                                       outputCol="numeric_features")
std_scaler = ft.StandardScaler(inputCol="numeric_features",
                               outputCol="numeric_features_scaled")

assembler_final = ft.VectorAssembler(
    inputCols=['onehot_features', 'numeric_features_scaled'],
    outputCol="final_features")
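# A minimal sketch of chaining the stages above into the PCA output promised by
# the header comment; `k=10` and the `input_df` DataFrame are assumptions.
pca = ft.PCA(k=10, inputCol="final_features", outputCol="pca_features")
pipeline = Pipeline(stages=indexers + encoders +
                    [assembler_onehot, assembler_numeric, std_scaler,
                     assembler_final, pca])
pca_df = pipeline.fit(input_df).transform(input_df)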
def hyper_parameter_optimization_ml():
    spark = SparkSession.builder.appName('hyper-parameter-optimization-ml').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
        ('MOTHER_AGE_YEARS', types.IntegerType()),
        ('FATHER_COMBINED_AGE', types.IntegerType()),
        ('CIG_BEFORE', types.IntegerType()),
        ('CIG_1_TRI', types.IntegerType()),
        ('CIG_2_TRI', types.IntegerType()),
        ('CIG_3_TRI', types.IntegerType()),
        ('MOTHER_HEIGHT_IN', types.IntegerType()),
        ('MOTHER_PRE_WEIGHT', types.IntegerType()),
        ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
        ('MOTHER_WEIGHT_GAIN', types.IntegerType()),
        ('DIABETES_PRE', types.IntegerType()),
        ('DIABETES_GEST', types.IntegerType()),
        ('HYP_TENS_PRE', types.IntegerType()),
        ('HYP_TENS_GEST', types.IntegerType()),
        ('PREV_BIRTH_PRETERM', types.IntegerType())
    ]
    schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
    births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

    # Create transformers.
    births = births.withColumn('BIRTH_PLACE_INT',
                               births['BIRTH_PLACE'].cast(types.IntegerType()))
    # Encode the BIRTH_PLACE column using the OneHotEncoder method.
    encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')
    featuresCreator = ml_feature.VectorAssembler(
        inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
        outputCol='features')

    # Split the dataset into training and testing datasets.
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Create a purely transforming Pipeline.
    pipeline = Pipeline(stages=[encoder, featuresCreator])
    data_transformer = pipeline.fit(births_train)

    # Specify our model and the list of parameters we want to loop through.
    logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT')
    grid = tune.ParamGridBuilder() \
        .addGrid(logistic.maxIter, [2, 10, 50]) \
        .addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
        .build()

    # Define a way of comparing the models.
    evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                                      labelCol='INFANT_ALIVE_AT_REPORT')

    # Create the logic that will do the validation work.
    cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(data_transformer.transform(births_train))

    # See if cvModel performed better than our previous model.
    data_test = data_transformer.transform(births_test)
    results = cvModel.transform(data_test)
    print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))

    # Parameters of the best model.
    results = [
        (
            [{key.name: paramValue} for key, paramValue in zip(params.keys(), params.values())],
            metric
        )
        for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
    ]
    print(sorted(results, key=lambda el: el[1], reverse=True)[0])
def train_validation_splitting_ml():
    spark = SparkSession.builder.appName('train-validation-splitting-ml').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
        ('MOTHER_AGE_YEARS', types.IntegerType()),
        ('FATHER_COMBINED_AGE', types.IntegerType()),
        ('CIG_BEFORE', types.IntegerType()),
        ('CIG_1_TRI', types.IntegerType()),
        ('CIG_2_TRI', types.IntegerType()),
        ('CIG_3_TRI', types.IntegerType()),
        ('MOTHER_HEIGHT_IN', types.IntegerType()),
        ('MOTHER_PRE_WEIGHT', types.IntegerType()),
        ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
        ('MOTHER_WEIGHT_GAIN', types.IntegerType()),
        ('DIABETES_PRE', types.IntegerType()),
        ('DIABETES_GEST', types.IntegerType()),
        ('HYP_TENS_PRE', types.IntegerType()),
        ('HYP_TENS_GEST', types.IntegerType()),
        ('PREV_BIRTH_PRETERM', types.IntegerType())
    ]
    schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
    births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

    # Create transformers.
    births = births.withColumn('BIRTH_PLACE_INT',
                               births['BIRTH_PLACE'].cast(types.IntegerType()))
    # Encode the BIRTH_PLACE column using the OneHotEncoder method.
    encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')
    featuresCreator = ml_feature.VectorAssembler(
        inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
        outputCol='features')

    # Split the dataset into training and testing datasets.
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Select only the top five features.
    selector = ml_feature.ChiSqSelector(
        numTopFeatures=5,
        featuresCol=featuresCreator.getOutputCol(),
        outputCol='selectedFeatures',
        labelCol='INFANT_ALIVE_AT_REPORT'
    )

    # Create a purely transforming Pipeline.
    pipeline = Pipeline(stages=[encoder, featuresCreator, selector])
    data_transformer = pipeline.fit(births_train)

    # Create the LogisticRegression estimator and the parameter grid.
    logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT',
                                                    featuresCol='selectedFeatures')
    grid = tune.ParamGridBuilder() \
        .addGrid(logistic.maxIter, [2, 10, 50]) \
        .addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
        .build()

    # Define a way of comparing the models.
    evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                                      labelCol='INFANT_ALIVE_AT_REPORT')

    # Create a TrainValidationSplit object.
    tvs = tune.TrainValidationSplit(estimator=logistic, estimatorParamMaps=grid,
                                    evaluator=evaluator)

    # Fit our data to the model.
    tvsModel = tvs.fit(data_transformer.transform(births_train))
    data_test = data_transformer.transform(births_test)

    # Calculate results.
    results = tvsModel.transform(data_test)
    print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))
def infant_survival_ml():
    spark = SparkSession.builder.appName('infant-survival-ml').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
        ('MOTHER_AGE_YEARS', types.IntegerType()),
        ('FATHER_COMBINED_AGE', types.IntegerType()),
        ('CIG_BEFORE', types.IntegerType()),
        ('CIG_1_TRI', types.IntegerType()),
        ('CIG_2_TRI', types.IntegerType()),
        ('CIG_3_TRI', types.IntegerType()),
        ('MOTHER_HEIGHT_IN', types.IntegerType()),
        ('MOTHER_PRE_WEIGHT', types.IntegerType()),
        ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
        ('MOTHER_WEIGHT_GAIN', types.IntegerType()),
        ('DIABETES_PRE', types.IntegerType()),
        ('DIABETES_GEST', types.IntegerType()),
        ('HYP_TENS_PRE', types.IntegerType()),
        ('HYP_TENS_GEST', types.IntegerType()),
        ('PREV_BIRTH_PRETERM', types.IntegerType())
    ]
    schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
    births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

    # Create transformers.
    births = births.withColumn('BIRTH_PLACE_INT',
                               births['BIRTH_PLACE'].cast(types.IntegerType()))
    # Encode the BIRTH_PLACE column using the OneHotEncoder method.
    encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')
    featuresCreator = ml_feature.VectorAssembler(
        inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
        outputCol='features')

    # Create a model.
    logistic = ml_classification.LogisticRegression(maxIter=10, regParam=0.01,
                                                    labelCol='INFANT_ALIVE_AT_REPORT')

    # Create a pipeline.
    pipeline = Pipeline(stages=[encoder, featuresCreator, logistic])

    # Split the dataset into training and testing datasets.
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Run the pipeline and estimate the model.
    model = pipeline.fit(births_train)
    test_model = model.transform(births_test)
    print(test_model.take(1))

    # Evaluate the performance of the model.
    evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                                      labelCol='INFANT_ALIVE_AT_REPORT')
    print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

    # Save the Pipeline definition.
    pipelinePath = './infant_oneHotEncoder_Logistic_Pipeline'
    pipeline.write().overwrite().save(pipelinePath)

    # Load the Pipeline definition.
    loadedPipeline = Pipeline.load(pipelinePath)
    loadedPipeline.fit(births_train).transform(births_test).take(1)

    # Save the PipelineModel.
    modelPath = './infant_oneHotEncoder_Logistic_PipelineModel'
    model.write().overwrite().save(modelPath)

    # Load the PipelineModel.
    loadedPipelineModel = PipelineModel.load(modelPath)
    test_reloadedModel = loadedPipelineModel.transform(births_test)
    print(test_reloadedModel.take(1))
def main(spark):
    n = len(sys.argv) - 1
    if n < 9:
        print('\nParameters are needed!!\n')
        sys.exit()
    else:
        result_type = sys.argv[1]
        sku_type = sys.argv[2]
        end_date = sys.argv[3]
        end_date_1w = sys.argv[4]
        end_date_2w = sys.argv[5]
        input_train_data_table = sys.argv[6]
        input_predict_data_table = sys.argv[7]
        output_predict_result_table = sys.argv[8]
        predict_date = sys.argv[9]

    spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    spark.sql("set spark.sql.hive.mergeFiles=true")
    spark.sql("set hive.exec.orc.split.strategy=BI")
    spark.sql("set mapred.job.priority = HIGH")
    spark.sql("set hive.default.fileformat=Orc")
    spark.sql("set hive.exec.parallel=true")
    spark.sql("set hive.auto.convert.join=true")
    spark.sql("set hive.merge.mapfiles = true")
    spark.sql("set hive.merge.mapredfiles = true")
    spark.sql("set hive.merge.size.per.task = 256000000")
    spark.sql("set hive.merge.smallfiles.avgsize=128000000")
    spark.sql("set hive.merge.orcfile.stripe.level=false")
    spark.sql("set hive.exec.dynamic.partition=true")
    spark.sql("set hive.exec.max.dynamic.partitions=1000000")
    spark.sql("set hive.exec.max.dynamic.partitions.pernode=1000000")
    spark.sql("set hive.exec.max.created.files=1000000")
    spark.sql("set mapreduce.job.counters.limit=10000")
    spark.sql("set mapred.output.compress=true")
    spark.sql("set hive.exec.compress.output=true")
    spark.sql("set spark.shuffle.service.enabled = true")
    spark.sql("set spark.sql.broadcastTimeout = 10000")

    print('end_date = {}\n'.format(end_date))
    print('sku_type = {}\n'.format(sku_type))
    print('result_type = {}\n'.format(result_type))

    ### Build the training and prediction samples
    # Determine the data-selection scope
    if sku_type == 'old':
        sku_type_sql = ' and otc_days >= 60'
    elif sku_type == 'new':
        sku_type_sql = ' and otc_days < 60'
    else:
        sku_type_sql = ''

    # Positive samples from the current week
    data_now = spark.sql("""
        select t1.*
        from
        (
            select *
            from """ + input_train_data_table + """
            where end_date = '""" + end_date + """' and label > 0""" + sku_type_sql + """
        )t1
        join
        (
            select item_third_cate_cd
            from app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
            where dt = '""" + predict_date + """' and app_id = 4 and scene_id = 1 and status = 3
            group by item_third_cate_cd
        )t2
        on t1.item_third_cate_cd = t2.item_third_cate_cd
    """)

    # Positive samples found only one week earlier
    data_1w = spark.sql("""
        select a.*
        from
        (
            select t1.*
            from
            (
                select *
                from """ + input_train_data_table + """
                where end_date = '""" + end_date_1w + """' and label > 0""" + sku_type_sql + """
            )t1
            join
            (
                select item_third_cate_cd
                from app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
                where dt = '""" + predict_date + """' and app_id = 4 and scene_id = 1 and status = 3
                group by item_third_cate_cd
            )t2
            on t1.item_third_cate_cd = t2.item_third_cate_cd
        )a
        left join
        (
            select item_sku_id,1 as index
            from """ + input_train_data_table + """
            where end_date = '""" + end_date + """' and label > 0""" + sku_type_sql + """
        )b
        on a.item_sku_id=b.item_sku_id
        where index is null or index = ''
    """)

    # Positive samples found only two weeks earlier
    data_2w = spark.sql("""
        select a.*
        from
        (
            select t1.*
            from
            (
                select *
                from """ + input_train_data_table + """
                where end_date = '""" + end_date_2w + """' and label > 0""" + sku_type_sql + """
            )t1
            join
            (
                select item_third_cate_cd
                from app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
                where dt = '""" + predict_date + """' and app_id = 4 and scene_id = 1 and status = 3
                group by item_third_cate_cd
            )t2
            on t1.item_third_cate_cd = t2.item_third_cate_cd
        )a
        left join
        (
            select item_sku_id,1 as index
            from """ + input_train_data_table + """
            where end_date = '""" + end_date + """' and label > 0""" + sku_type_sql + """
        )b
        on a.item_sku_id=b.item_sku_id
        where index is null or index = ''
""") # 合并正样本 data = data_now.union(data_1w).union(data_2w) data_filter = data.filter("otc_days >= 0").filter("sku_status_cd = 3001") data_filter.cache() data_count = data_filter.count() print('positive data count = {}\n'.format(data_count)) # 补充负样本 data_neg = spark.sql(""" select t1.* from ( select * from """ + input_train_data_table + """ where end_date = '""" + end_date + """' and label = 0""" + sku_type_sql + """ and otc_days >= 0 and sku_status_cd = 3001 )t1 join ( select item_third_cate_cd from app.app_vdp_ai_sink_dept3_cate3_scope_mid_da where dt = '""" + predict_date + """' and app_id = 4 and scene_id = 1 and status = 3 group by item_third_cate_cd )t2 on t1.item_third_cate_cd = t2.item_third_cate_cd """) data_neg.cache() data_neg_count = data_neg.count() neg_sample_ratio = min(data_count / data_neg_count, 1.0) if data_neg_count > 0 else 0.0 data_neg_sample = data_neg.sample(neg_sample_ratio, seed=66) # 合并正负样本 if result_type == 'ucvr': data_union = data_filter.union(data_neg_sample).orderBy(func.rand(seed=66)).filter("item_first_cate_cd is not null")\ .withColumn('data_type_int', func.col('data_type').cast(IntegerType())).drop('data_type').withColumnRenamed('data_type_int','data_type')\ .withColumn('label_adjust',func.when(func.col('label') > 1,1).otherwise(func.col('label')))\ .drop('label').withColumnRenamed('label_adjust','label') else: data_union = data_filter.union(data_neg_sample).orderBy(func.rand(seed=66)).filter("item_first_cate_cd is not null")\ .withColumn('data_type_int', func.col('data_type').cast(IntegerType())).drop('data_type').withColumnRenamed('data_type_int','data_type')\ .withColumn('label_binary',func.when(func.col('label') > 0,1).otherwise(0))\ .drop('label').withColumnRenamed('label_binary','label') # 合并sku embedding特征 predict_date_str = ''.join(predict_date.split('-')) sku_vec = spark.sql( "select * from tmp.tmp_qzl_sink_search_08_sku2vec_features_{0}".format( predict_date_str)) vec_size = len(sku_vec.columns) - 1 data_union_sku2vec = data_union.join(sku_vec, on='item_sku_id', how='left') ### 训练模型 # 特征分类 # 非特征 features_useless = [ 'item_first_cate_name', 'item_second_cate_cd', 'item_second_cate_name', 'item_third_cate_cd', 'item_third_cate_name', 'barndname_full', 'sku_name', 'item_sku_id', 'uv_value_label', 'first_into_otc_tm', 'end_date', 'sku_status_cd', 'red_price', 'red_price_level_rank' ] # 类别型特征 features_catagory = ['item_first_cate_cd'] # embedding特征 features_embedding = ['sku_vec_' + str(i) for i in range(vec_size)] # 数值型特征 features_numerical = [ f for f in data_union_sku2vec.columns if f not in ['label'] + features_useless + features_catagory + features_embedding ] # 特征缺失值统计 feature_na = data_union_sku2vec.agg( *[(1 - (func.count(c) / func.count('*'))).alias(c) for c in data_union_sku2vec.columns]) feature_na_DF = sqlDF2pandasDF(feature_na).T feature_na_DF = feature_na_DF.reset_index() feature_na_DF.columns = ['features', 'na_rate'] for i, row in feature_na_DF.iterrows(): print('{}: {}'.format(row['features'], row['na_rate'])) # 处理缺失值 fillna_value = {c: -1 for c in features_numerical} fillna_value.update({c: -10 for c in features_embedding}) data_union_sku2vec_fillna = data_union_sku2vec.fillna(fillna_value) # 数据预处理 stringIndexer_cd1 = ft.StringIndexer(inputCol="item_first_cate_cd", outputCol="item_first_cate_cd_index") encoder_cd1 = ft.OneHotEncoder(inputCol='item_first_cate_cd_index', outputCol='item_first_cate_cd_vec') featuresCreator = ft.VectorAssembler(inputCols=features_numerical + [encoder_cd1.getOutputCol()] + features_embedding, 
        outputCol='features')

    pipeline = Pipeline(stages=[stringIndexer_cd1, encoder_cd1, featuresCreator])
    data_transformer = pipeline.fit(data_union_sku2vec_fillna)
    data_transformed = data_transformer.transform(data_union_sku2vec_fillna)
    data_transformed.cache()
    data_union_count = data_transformed.count()
    print('data_union_count = {}\n'.format(data_union_count))
    data_filter.unpersist()
    data_neg.unpersist()

    p_num = get_best_partition(data_union_count)
    data_transformed = data_transformed.repartition(p_num)

    # Start training
    best_depth = 12   # get_best_depth(data_union_count)
    best_iter = 150   # get_best_iter(data_union_count)
    f = '1.0'         # '0.8'
    s = 1.0           # 0.8
    if result_type == 'ucvr':
        gbdt = GBTRegressor(featuresCol='features', labelCol='label', predictionCol='prediction',
                            lossType='squared', seed=66, maxMemoryInMB=2048, cacheNodeIds=True,
                            maxDepth=best_depth, maxIter=best_iter,
                            featureSubsetStrategy=f, subsamplingRate=s, stepSize=0.01)
    else:
        gbdt = GBTClassifier(featuresCol='features', labelCol='label', predictionCol='prediction',
                             lossType='logistic', seed=66, maxMemoryInMB=2048, cacheNodeIds=True,
                             maxDepth=best_depth, maxIter=best_iter,
                             featureSubsetStrategy=f, subsamplingRate=s, stepSize=0.01)
    gbdt_model = gbdt.fit(data_transformed)

    ### Predict on the candidate items
    # Build the samples to score
    if sku_type == 'old':
        sku_type_sql_2 = ' where otc_days >= 60'
    elif sku_type == 'new':
        sku_type_sql_2 = ' where otc_days < 60'
    else:
        sku_type_sql_2 = ''
    data_test = spark.sql("select * from " + input_predict_data_table + sku_type_sql_2)
    data_test = data_test.withColumn('data_type_int',
                                     func.col('data_type').cast(IntegerType())) \
        .drop('data_type').withColumnRenamed('data_type_int', 'data_type')
    data_test.cache()
    data_test_count = data_test.count()
    print('data_test_count = {}\n'.format(data_test_count))
    data_test = data_test.repartition(get_best_partition(data_test_count))

    # Prepare the prediction samples
    data_test_sku2vec = data_test.join(sku_vec, on='item_sku_id', how='left')
    fillna_value_test = {c: -1 for c in features_numerical}
    fillna_value_test.update({c: -10 for c in features_embedding})
    data_test_fillna = data_test_sku2vec.fillna(fillna_value_test)
    data_transformer_test = pipeline.fit(data_test_fillna)
    data_transformed_test = data_transformer_test.transform(data_test_fillna)
    data_transformed_test.cache()
    data_test.unpersist()

    # Produce and write the predictions for the candidate item pool
    gbdt_pred_test = gbdt_model.transform(data_transformed_test)
    features_result = [
        'item_third_cate_cd', 'item_sku_id', 'prediction', 'red_price',
        'red_price_level_rank', 'otc_days'
    ]
    if result_type == 'binary_prob':
        gbdt_pred_test = gbdt_pred_test.select(
            ['item_third_cate_cd', 'item_sku_id', 'probability', 'red_price',
             'red_price_level_rank', 'otc_days']) \
            .rdd.map(lambda row: (row['item_third_cate_cd'], row['item_sku_id'],
                                  float(row['probability'][1]), row['red_price'],
                                  row['red_price_level_rank'], row['otc_days'])) \
            .toDF(features_result)
    else:
        gbdt_pred_test = gbdt_pred_test.withColumn(
            'prediction_adjust',
            func.when(func.col('prediction') > 1, 1)
                .when(func.col('prediction') < 0, 0)
                .otherwise(func.col('prediction'))) \
            .drop('prediction').withColumnRenamed('prediction_adjust', 'prediction')
    result = gbdt_pred_test.select(features_result).withColumn(
        'new_old', func.when(func.col('otc_days') < 90, 'new').otherwise('old'))
    result.createOrReplaceTempView("result_df")
    spark.sql("""
        insert overwrite table """ + output_predict_result_table + """
        partition(dt='""" + predict_date + """',sku_type='""" + sku_type + """',result_type='""" + result_type + """')
        select * from result_df
    """)

    data_transformed.unpersist()
    data_transformed_test.unpersist()
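# Two helpers used above (get_best_partition, sqlDF2pandasDF) are not defined in
# this snippet; hedged sketches follow. The partition-sizing constants are
# assumptions, not the original author's values.
import math

def get_best_partition(row_count, rows_per_partition=200000, max_partitions=2000):
    # Roughly one partition per `rows_per_partition` rows, capped at `max_partitions`.
    return max(1, min(max_partitions, math.ceil(row_count / rows_per_partition)))

def sqlDF2pandasDF(spark_df):
    # Collect a (small) Spark DataFrame to the driver as a pandas DataFrame.
    return spark_df.toPandas()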