def buildModel(self, save_pipe_path=None):
    df = self.getModelData()
    # Feature stages: index the label, tokenize the app-name string, vectorize and scale.
    label_index = fea.StringIndexer(inputCol='user_type', outputCol='label')
    reTokenizer = fea.RegexTokenizer(inputCol='appnames', outputCol='appname_token', pattern=',')
    cnt_vector = fea.CountVectorizer(inputCol='appname_token', outputCol='appname_vector')
    vecAssembler = fea.VectorAssembler(inputCols=['appname_vector'], outputCol="feature")
    scaler = fea.StandardScaler(inputCol='feature', outputCol='features')

    if not save_pipe_path:
        # Cross-validate a logistic regression over the elastic-net mixing parameter.
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.elasticNetParam, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR")

        pipeline = Pipeline(stages=[label_index, reTokenizer, cnt_vector, vecAssembler, scaler])
        pipe = pipeline.fit(df)
        pipe_out = pipe.transform(df)

        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        model = cv.fit(pipe_out)

        print(evaluator.evaluate(model.transform(pipe_out)))
        print('Best Param (elasticNetParam): ', model.bestModel._java_obj.getElasticNetParam())

        predict_result = model.transform(pipe_out).select('probability', 'label').toPandas()
        predict_result.to_csv('/home/chenchen/data/predict_result1.csv', index=False)
    else:
        # Fit the full pipeline (features + model) and persist it.
        lr = LogisticRegression(elasticNetParam=1.0)
        pipeline = Pipeline(stages=[label_index, reTokenizer, cnt_vector, vecAssembler, scaler, lr])
        model = pipeline.fit(df)
        model.save(save_pipe_path)
        print('pipe saved')
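# Illustrative sketch (not part of the original code): loading the pipeline persisted by
# buildModel(save_pipe_path=...) and scoring new data. The path and the input DataFrame
# `new_df` are hypothetical placeholders; new_df is assumed to carry the 'appnames' and
# 'user_type' columns used above.
from pyspark.ml import PipelineModel

loaded_pipe = PipelineModel.load('/path/to/saved_pipe')  # placeholder path
scored = loaded_pipe.transform(new_df)                   # new_df: hypothetical input DataFrame
scored.select('probability', 'prediction').show(5)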
def classification_ml():
    if False:
        spark = SparkSession.builder.appName('classification-ml') \
            .config('spark.jars.packages', 'org.xerial:sqlite-jdbc:3.23.1') \
            .getOrCreate()
        df = spark.read \
            .format('jdbc') \
            .option('url', 'jdbc:sqlite:iris.db') \
            .option('driver', 'org.sqlite.JDBC') \
            .option('dbtable', 'iris') \
            .load()
    else:
        spark = SparkSession.builder.appName('classification-ml').getOrCreate()
        df = spark.read.option('header', 'true').option('inferSchema', 'true').format('csv').load('dataset/iris.csv')
    spark.sparkContext.setLogLevel('WARN')
    df.show()

    labels = [
        ('index', types.IntegerType()),
        ('a1', types.FloatType()),
        ('a2', types.FloatType()),
        ('a3', types.FloatType()),
        ('a4', types.FloatType()),
        ('id', types.StringType()),
        ('label', types.StringType())
    ]

    stringIndexer = ml_feature.StringIndexer(inputCol='label', outputCol='label_int')
    featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[1:5]],
                                                 outputCol='features')

    # Create a model.
    logistic = ml_classification.LogisticRegression(featuresCol=featuresCreator.getOutputCol(),
                                                    labelCol=stringIndexer.getOutputCol(),
                                                    maxIter=10, regParam=0.01)

    # Create a pipeline.
    pipeline = Pipeline(stages=[stringIndexer, featuresCreator, logistic])

    # Split the dataset into training and testing datasets.
    df_train, df_test = df.randomSplit([0.7, 0.3], seed=666)

    # Run the pipeline and estimate the model.
    model = pipeline.fit(df_train)
    test_result = model.transform(df_test)  # DataFrame.
    # print(test_result.take(1))
    # test_result.show(5, truncate=True, vertical=False)
    test_result.show(truncate=False)

    # Save and load.
    lr_path = './lr'
    logistic.write().overwrite().save(lr_path)
    lr2 = ml_classification.LogisticRegression.load(lr_path)
    print('Param =', lr2.getRegParam())

    model_path = './lr_model'
    model.write().overwrite().save(model_path)
    model2 = PipelineModel.load(model_path)
    print('Stages =', model.stages)
    print(model.stages[2].coefficientMatrix == model2.stages[2].coefficientMatrix)
    print(model.stages[2].interceptVector == model2.stages[2].interceptVector)
def exercise_in_machine_learning(self):
    self.static_data_frame.printSchema()

    prepped_data_frame = self.static_data_frame.na.fill(0). \
        withColumn("day_of_week", functions.date_format(functions.col("InvoiceDate"), "EEEE")).coalesce(5)

    train_data_frame = prepped_data_frame.where("InvoiceDate < '2011-03-01'")
    test_data_frame = prepped_data_frame.where("InvoiceDate >= '2011-03-01'")
    print(f"TRAINING items: {train_data_frame.count()}")
    print(f"TEST DATA items: {test_data_frame.count()}")

    transformation_pipeline = Pipeline().setStages([
        feature.StringIndexer().setInputCol("day_of_week").setOutputCol("day_of_week_index"),
        feature.OneHotEncoder().setInputCol("day_of_week_index").setOutputCol("day_of_week_encoded"),
        feature.VectorAssembler().setInputCols(
            ["UnitPrice", "Quantity", "day_of_week_encoded"]).setOutputCol("features"),
    ])
    fitted_pipeline = transformation_pipeline.fit(train_data_frame)
    transformed_training = fitted_pipeline.transform(train_data_frame)
    # transformed_training.cache()

    kmeans = clustering.KMeans().setK(2).setSeed(2)
    km_model = kmeans.fit(transformed_training)
    print(f"Training cost: {km_model.summary.trainingCost}")

    transformed_test = fitted_pipeline.transform(test_data_frame)
    transformed_test.summary().show()
def feature_extract(train_t):
    stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')

    sw_remover1 = spark_ft.StopWordsRemover(inputCol='ntokens1', outputCol='clean_tokens1', stopWords=stopWords)
    text2vec1 = spark_ft.Word2Vec(vectorSize=50, minCount=1, seed=123, inputCol='ntokens1', outputCol='text_vec1',
                                  windowSize=1, maxSentenceLength=100)
    assembler1 = spark_ft.VectorAssembler(inputCols=['text_vec1'], outputCol='features1')

    sw_remover2 = spark_ft.StopWordsRemover(inputCol='ntokens2', outputCol='clean_tokens2', stopWords=stopWords)
    text2vec2 = spark_ft.Word2Vec(vectorSize=50, minCount=1, seed=123, inputCol='ntokens2', outputCol='text_vec2',
                                  windowSize=1, maxSentenceLength=100)
    assembler2 = spark_ft.VectorAssembler(inputCols=['text_vec2'], outputCol='features2')

    feature_pipeline = Pipeline(stages=[
        sw_remover1, text2vec1, assembler1, sw_remover2, text2vec2, assembler2
    ])

    feature_model = feature_pipeline.fit(train_t)
    train_featurized = feature_model.transform(train_t).persist()

    tA = train_featurized.select('text_vec1').collect()
    tA_array = np.array(tA)
    tB = train_featurized.select('text_vec2').collect()
    tB_array = np.array(tB)
    return tA_array, tB_array
def compute_corr(df, columns, method="pearson"):
    # Assemble the requested columns into a single vector, compute the correlation
    # matrix with pyspark.ml.stat.Correlation, and return it as a pandas DataFrame.
    assembler = feature.VectorAssembler(inputCols=columns, outputCol="featuresCorrelation")
    corr_featurized_df = assembler.transform(df)
    corr_df = stat.Correlation.corr(corr_featurized_df, "featuresCorrelation", method)
    corr_matrix = corr_df.first()[0].toArray()
    corr_pddf = pd.DataFrame(corr_matrix, columns=columns, index=columns)
    return corr_pddf
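# Illustrative usage sketch (not from the original source) for compute_corr. It assumes an
# active SparkSession named `spark` and the same `feature`/`stat`/`pd` aliases imported above;
# the toy DataFrame and column names are made up for the example.
toy_df = spark.createDataFrame(
    [(1.0, 2.0, 0.5), (2.0, 4.1, 0.4), (3.0, 6.2, 0.1)],
    ['x1', 'x2', 'x3'])

corr_pddf = compute_corr(toy_df, ['x1', 'x2', 'x3'], method='pearson')
print(corr_pddf)  # 3x3 pandas DataFrame holding the correlation matrix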
def df(spark):
    df = spark.read.parquet('df')
    pred_cols = [
        x for x in df.columns if x not in ['features', 'label', 'response']
    ]
    assembler = mlf.VectorAssembler(inputCols=pred_cols, outputCol='features')
    df = assembler.transform(df)  # type: pyspark.sql.DataFrame
    df.cache()
    return df
def test_stackedml_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]). \
        toDF()
    pl = feature.Tokenizer().setInputCol('sentence') | feature.CountVectorizer()
    ml = pl | (classification.LogisticRegression(),) | feature.VectorAssembler() | \
        classification.RandomForestClassifier()
    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
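# Note: the `|` composition above comes from a pipeline-building helper that is not part of
# stock pyspark.ml. The sketch below (not from the original source) shows a simplified,
# non-stacked equivalent using only the standard Pipeline API; the intermediate column names
# are assumptions, since the `|` helper wires them automatically.
from pyspark.ml import Pipeline
import pyspark.ml.feature as ml_feature_mod
import pyspark.ml.classification as ml_classification_mod

tokenizer = ml_feature_mod.Tokenizer(inputCol='sentence', outputCol='words')
count_vec = ml_feature_mod.CountVectorizer(inputCol='words', outputCol='features')
rf = ml_classification_mod.RandomForestClassifier(featuresCol='features', labelCol='label')

plain_model = Pipeline(stages=[tokenizer, count_vec, rf]).fit(df)
assert plain_model.transform(df).count() == 2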
def train_evaluate(train_data, test_data):
    # Convert the categorical text feature into a numeric index, then one-hot encode it.
    stringIndexer = ft.StringIndexer(inputCol='alchemy_category', outputCol="alchemy_category_Index")
    encoder = ft.OneHotEncoder(dropLast=False,
                               inputCol='alchemy_category_Index',
                               outputCol="alchemy_category_IndexVec")

    assemblerInputs = ['alchemy_category_IndexVec'] + train_data.columns[4:-1]
    assembler = ft.VectorAssembler(inputCols=assemblerInputs, outputCol="features")

    # dt = cl.DecisionTreeClassifier(labelCol="label", featuresCol="features")
    rf = cl.RandomForestClassifier(labelCol="label", featuresCol="features")

    evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol="probability",
                                                 labelCol='label',
                                                 metricName='areaUnderROC')

    grid_search = tune.ParamGridBuilder() \
        .addGrid(rf.impurity, ["gini", "entropy"]) \
        .addGrid(rf.maxDepth, [5, 10, 15]) \
        .addGrid(rf.maxBins, [10, 15, 20]) \
        .addGrid(rf.numTrees, [10, 20, 30]) \
        .build()

    rf_cv = tune.CrossValidator(estimator=rf,
                                estimatorParamMaps=grid_search,
                                evaluator=evaluator,
                                numFolds=5)
    # rf_tvs = tune.TrainValidationSplit(
    #     estimator=rf,
    #     estimatorParamMaps=grid_search,
    #     evaluator=evaluator,
    #     trainRatio=0.7
    # )

    pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rf_cv])
    cv_pipeline_model = pipeline.fit(train_data)

    best_model = cv_pipeline_model.stages[-1]
    best_parm = get_best_param(best_model)

    AUC, AP = evaluate_model(cv_pipeline_model, test_data)
    return AUC, AP, best_parm, cv_pipeline_model
def test_multi_model_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]). \
        toDF()
    pl = feature.Tokenizer().setInputCol('sentence') | feature.CountVectorizer()
    models = (classification.LogisticRegression(),
              classification.RandomForestClassifier(),
              classification.LogisticRegression().setElasticNetParam(0.2),
              classification.GBTClassifier())
    ml = pl | models | feature.VectorAssembler().setOutputCol('final_features') | \
        classification.LogisticRegression()
    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
def n_gram(df, input_col, n=2):
    """
    Converts the input array of strings inside of a Spark DF into an array of n-grams.
    :param df: PySpark DataFrame to analyze.
    :param input_col: Column to analyze.
    :param n: number of elements per n-gram, >= 1.
    :return: Spark DataFrame with n-grams calculated.
    """
    is_dataframe(df)

    tokenizer = feature.Tokenizer().setInputCol(input_col) | feature.StopWordsRemover()
    count = feature.CountVectorizer()
    gram = feature.NGram(n=n) | feature.CountVectorizer()
    tf = tokenizer | (count, gram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    df_model = tfidf_model.transform(df)
    return df_model, tfidf_model
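# Illustrative usage sketch for n_gram (not from the original source). It assumes an active
# SparkSession `spark` and the same pipe-composition helper that overloads `|` for the stages
# above, so it only runs where that helper is importable. The sample sentences are made up.
sample_df = spark.createDataFrame(
    [('this is the best sentence ever',),
     ('this is however the worst sentence available',)],
    ['sentence'])

df_model, tfidf_model = n_gram(sample_df, 'sentence', n=2)
df_model.select('sentence', 'features').show(truncate=False)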
def test_unigram_and_bigram():
    df = SPARK_SESSION.sparkContext. \
        parallelize([['this is the best sentence ever'],
                     ['this is however the worst sentence available']]). \
        toDF(schema=types.StructType().add('sentence', types.StringType()))

    import requests
    stop_words = requests.get(
        'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'
    ).text.split()

    tokenizer = feature.Tokenizer().setInputCol('sentence') | feature.StopWordsRemover(stopWords=stop_words)
    unigram = feature.CountVectorizer()
    bigram = feature.NGram() | feature.CountVectorizer()
    trigram = feature.NGram(n=3) | feature.CountVectorizer()
    tf = tokenizer | (unigram, bigram, trigram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    assert_equal(
        tfidf_model.transform(df).select('sentence', 'features').count(), 2)
def main(path_data, path_parameters, dir_models):
    logger = logging.getLogger(__name__)

    spark = (
        pyspark.sql.SparkSession
        .builder
        .appName("Python Spark Random Forest model training")
        .enableHiveSupport()
        .getOrCreate()
    )

    logger.info("Reading parquet data and splitting into test and train datasets")
    data_df = spark.read.parquet(path_data)
    splits = data_df.randomSplit([0.7, 0.3])
    training_df = splits[0]
    validation_df = splits[1]

    logger.info("Constructing pipeline for prediction model")
    with open(path_parameters) as json_file:
        parameters = json.load(json_file)
    feature_columns = parameters['feature_columns']
    rf_params = parameters['rf_params']

    assembler = feature.VectorAssembler(inputCols=feature_columns, outputCol="features")
    rf = classification.RandomForestClassifier(labelCol="churn", **rf_params)
    rf_pipeline = pipeline.Pipeline(stages=[assembler, rf])

    logger.info("Training prediction model")
    pipeline_model = rf_pipeline.fit(training_df)

    logger.info("Calculating model metrics")
    train_predictions_df = pipeline_model.transform(training_df)
    validation_predictions_df = pipeline_model.transform(validation_df)

    accuracy_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="accuracy", labelCol="churn", predictionCol="prediction")
    precision_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedPrecision", labelCol="churn", predictionCol="prediction")
    recall_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedRecall", labelCol="churn", predictionCol="prediction")
    f1_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="f1", labelCol="churn", predictionCol="prediction")
    auroc_evaluator = evaluation.BinaryClassificationEvaluator(
        metricName='areaUnderROC', labelCol="churn")

    logger.info("Saving model and metrics data")
    train_metrics = {
        "accuracy": accuracy_evaluator.evaluate(train_predictions_df),
        "precision": precision_evaluator.evaluate(train_predictions_df),
        "recall": recall_evaluator.evaluate(train_predictions_df),
        "f1": f1_evaluator.evaluate(train_predictions_df),
        "auroc": auroc_evaluator.evaluate(train_predictions_df)
    }
    validation_metrics = {
        "accuracy": accuracy_evaluator.evaluate(validation_predictions_df),
        "precision": precision_evaluator.evaluate(validation_predictions_df),
        "recall": recall_evaluator.evaluate(validation_predictions_df),
        "f1": f1_evaluator.evaluate(validation_predictions_df),
        "auroc": auroc_evaluator.evaluate(validation_predictions_df)
    }

    rf_model = pipeline_model.stages[-1]
    model_params = rf_model.extractParamMap()
    model_description = {
        "name": "Random Forest",
        "params": {param.name: value for param, value in model_params.items()},
    }

    dir_model = pathlib.Path(dir_models)
    dir_model.mkdir(parents=True, exist_ok=True)
    path_pipeline_model = pathlib.Path(dir_model).joinpath("pipeline_model")
    path_train_metrics = pathlib.Path(dir_model).joinpath("metrics_train.json")
    path_validation_metrics = pathlib.Path(dir_model).joinpath("metrics_validation.json")
    path_model_description = pathlib.Path(dir_model).joinpath("model_description.json")

    pipeline_model.save(str(path_pipeline_model))
    with open(path_train_metrics, "w") as f:
        json.dump(train_metrics, f)
    with open(path_validation_metrics, "w") as f:
        json.dump(validation_metrics, f)
    with open(path_model_description, "w") as f:
        json.dump(model_description, f)
def _transform_data(sparse_features, continuous_features, data: DataFrame,
                    feature_dict_path: str, sc: SparkContext):
    """
    Transform the data into the TFRecord layout expected by deepFM.
    The output looks like: trace_id, feature_index, feature_values
    :param sparse_features:
    :param continuous_features:
    :param data:
    :param feature_dict_path:
    :param sc:
    :return:
    """

    def _get_feature_value_index_udf(broadcast_feature_dict, feature_name):
        feature_dict = broadcast_feature_dict.value

        def _get_feature_value_index_wrapper(feature_value):
            # Sparse (categorical) features return the index of the value.
            if feature_name in sparse_features:
                if str(feature_value) in feature_dict[feature_name].keys():
                    return int(feature_dict[feature_name][str(feature_value)])
                else:
                    return int(feature_dict[feature_name][str(UNKNOWN_VALUE_KEY)])
            # Continuous features have a single index.
            else:
                return feature_dict[feature_name]

        return F.udf(lambda x: _get_feature_value_index_wrapper(x))

    broadcast_feature_dict = \
        sc.broadcast(json.loads(utils.read_from_hdfs(sc, feature_dict_path)))
    # The feature list is taken once from the broadcast dict.
    features = broadcast_feature_dict.value.keys()

    for col in features:
        data = data \
            .withColumn("feature_index_" + col,
                        _get_feature_value_index_udf(broadcast_feature_dict, col)(data[col]).cast("float"))
        if col in continuous_features:
            data = data.withColumn("feature_value_" + col, data[col].cast("float"))
        else:
            data = data.withColumn("feature_value_" + col, F.lit(1).cast("float"))

    data = data.cache()
    print("[INFO] transformed features: ")
    data.show(5, False)

    # VectorAssembler: pack the per-feature indices and values into two vector columns.
    feature_index_vector_assembler = feature.VectorAssembler(
        inputCols=["feature_index_" + f for f in features],
        outputCol="feature_index")
    feature_value_vector_assembler = feature.VectorAssembler(
        inputCols=["feature_value_" + f for f in features],
        outputCol="feature_value")
    data = feature_index_vector_assembler.transform(data)
    data = feature_value_vector_assembler.transform(data)
    data = data.select("trace_id", "feature_index", "feature_value", "label")
    return features, data
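# Illustrative call sketch for _transform_data (not from the original source). The feature
# lists, dictionary path, and the input DataFrame `raw_df` (assumed to carry trace_id, label,
# and the listed feature columns) are hypothetical placeholders.
sparse_features = ['city', 'device_type']          # hypothetical categorical features
continuous_features = ['age', 'price']             # hypothetical continuous features
feature_dict_path = '/path/to/feature_dict.json'   # placeholder HDFS path

features, tfrecord_df = _transform_data(
    sparse_features, continuous_features, data=raw_df,
    feature_dict_path=feature_dict_path, sc=spark.sparkContext)
tfrecord_df.show(5, False)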
df_conv01 = convDf(df01)

# prepare for ml
df_prepped01 = prep(df_conv01)
df_prepped02 = df02.select("name").distinct()

# function to apply labels
df_labeled = get_labels(df_prepped01, df_prepped02)
df_labeled = df_labeled.na.drop().drop("version_idx")

cols_for_ml = df_prepped01.drop("name").drop("version_idx").schema.names

# pipeline stages
# index the label
labelIndexer = mlf.StringIndexer(inputCol="Label", outputCol="Label_idx")
# vectorise the input
toVec = mlf.VectorAssembler(inputCols=cols_for_ml, outputCol="Features")
# classify
classifier = DecisionTreeClassifier(labelCol="Label_idx", featuresCol="Features", maxDepth=10, maxBins=200)

# create a pipeline of the stages and use it to train and test
pipeline = ml.Pipeline(stages=[labelIndexer, toVec, classifier])
train, test = df_labeled.randomSplit([0.7, 0.3], seed=12345)
df_pip = pipeline.fit(train)
predicted = df_pip.transform(test)

# print result
predicted.select("name", "Label_idx", "prediction", "rawPrediction", "probability").show(30, False)
def main(spark):
    n = len(sys.argv) - 1
    if n < 1:
        print('\nParameters are needed!!\n')
        sys.exit()
    else:
        i = sys.argv[1]
        batch = sys.argv[2]
        end_date = sys.argv[3]
        end_date_1w = sys.argv[4]
        end_date_2w = sys.argv[5]
        input_train_data_table = sys.argv[6]
        input_predict_data_table = sys.argv[7]
        output_cd3_score_table = sys.argv[8]
        output_train_result_table = sys.argv[9]
        output_predict_result_table = sys.argv[10]
        predict_date = sys.argv[11]

    spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    spark.sql("set spark.sql.hive.mergeFiles=true")
    spark.sql("set hive.exec.orc.split.strategy=BI")
    spark.sql("set mapred.job.priority = HIGH")
    spark.sql("set hive.default.fileformat=Orc")
    spark.sql("set hive.exec.parallel=true")
    spark.sql("set hive.auto.convert.join=true")
    spark.sql("set hive.merge.mapfiles = true")
    spark.sql("set hive.merge.mapredfiles = true")
    spark.sql("set hive.merge.size.per.task = 256000000")
    spark.sql("set hive.merge.smallfiles.avgsize=128000000")
    spark.sql("set hive.merge.orcfile.stripe.level=false")
    spark.sql("set hive.exec.dynamic.partition=true")
    spark.sql("set hive.exec.max.dynamic.partitions=1000000")
    spark.sql("set hive.exec.max.dynamic.partitions.pernode=1000000")
    spark.sql("set hive.exec.max.created.files=1000000")
    spark.sql("set mapreduce.job.counters.limit=10000")
    spark.sql("set mapred.output.compress=true")
    spark.sql("set hive.exec.compress.output=true")
    spark.sql("set spark.shuffle.service.enabled = true")
    spark.sql("set spark.sql.broadcastTimeout = 10000")

    # All third-level categories (cd3) to compute.
    cd3_list_df = spark.sql("""
        select
            item_third_cate_cd
        from
            """ + input_predict_data_table + """
        group by
            item_third_cate_cd
        order by
            split(item_third_cate_cd,'')[length(item_third_cate_cd)-1],
            split(item_third_cate_cd,'')[length(item_third_cate_cd)-2],
            item_third_cate_cd
        """)

    # Alternative: categories that have not been computed yet.
    # cd3_list_df = spark.sql("""
    #     select
    #         item_third_cate_cd
    #     from
    #     (
    #         select
    #             a.item_third_cate_cd, label
    #         from
    #         (
    #             select
    #                 item_third_cate_cd
    #             from
    #                 """ + input_predict_data_table + """
    #             group by
    #                 item_third_cate_cd
    #         )a
    #         left JOIN
    #         (select item_third_cate_cd, 1 as label from """ + output_cd3_score_table + """ group by item_third_cate_cd)b
    #         on
    #             a.item_third_cate_cd=b.item_third_cate_cd
    #     )t
    #     where
    #         label is null
    #     order by
    #         split(item_third_cate_cd,'')[length(item_third_cate_cd)-1],
    #         split(item_third_cate_cd,'')[length(item_third_cate_cd)-2],
    #         item_third_cate_cd
    #     """)

    # Alternative: categories that should have produced results but have not.
    # cd3_list_df = spark.sql("""
    #     select
    #         t1.item_third_cate_cd
    #     from
    #     (
    #         select
    #             item_third_cate_cd
    #         from
    #             app.app_vdp_ai_sink_search_old_model_cd3_score
    #         where
    #             sku_count > 0
    #     )t1
    #     left join
    #     (select item_third_cate_cd, 1 as index from app.app_vdp_ai_sink_search_old_model_predict_result group by item_third_cate_cd)t2
    #     on t1.item_third_cate_cd=t2.item_third_cate_cd
    #     where index is null or index=''
    #     order by t1.item_third_cate_cd
    #     """)

    cd3_list = cd3_list_df.rdd.map(lambda row: row[0]).collect()
    cd3_list_batch = get_scope_id_batch(int(i), int(batch), cd3_list)

    for cd3 in cd3_list_batch:
        print('\ncd3 = {} starting computation\n'.format(cd3))
        try:
            ### Check whether this category has already been processed.
            if_finish = spark.sql("select * from " + output_cd3_score_table +
                                  " where item_third_cate_cd = '" + cd3 + "'")
            if if_finish.count() > 0:
                print('already finished')
                continue

            ### Build the training and prediction samples.
            # Positive samples from the current week.
            data_now = spark.sql("""
                select * from """ + input_train_data_table + """
                where end_date = '""" + end_date + """' and label > 0
                  and item_third_cate_cd = '""" + cd3 + """'
                """)

            # Positive samples unique to one week earlier.
            data_1w = spark.sql("""
                select a.*
                from
                (
                    select * from """ + input_train_data_table + """
                    where end_date = '""" + end_date_1w + """' and label > 0
                      and item_third_cate_cd = '""" + cd3 + """'
                )a
                left join
                (
                    select item_sku_id, 1 as index from """ + input_train_data_table + """
                    where end_date = '""" + end_date + """' and label > 0
                      and item_third_cate_cd = '""" + cd3 + """'
                )b
                on a.item_sku_id=b.item_sku_id
                where index is null or index = ''
                """)

            # Positive samples unique to two weeks earlier.
            data_2w = spark.sql("""
                select a.*
                from
                (
                    select * from """ + input_train_data_table + """
                    where end_date = '""" + end_date_2w + """' and label > 0
                      and item_third_cate_cd = '""" + cd3 + """'
                )a
                left join
                (
                    select item_sku_id, 1 as index from """ + input_train_data_table + """
                    where end_date = '""" + end_date + """' and label > 0
                      and item_third_cate_cd = '""" + cd3 + """'
                )b
                on a.item_sku_id=b.item_sku_id
                where index is null or index = ''
                """)

            # Merge the positive samples.
            data = data_now.union(data_1w).union(data_2w)
            data_filter = data.filter("otc_days >= 0").filter("sku_status_cd = 3001").filter("label <= 1")
            data_filter.cache()
            data_count = data_filter.count()

            # Build the samples to be predicted.
            data_test = spark.sql("select * from " + input_predict_data_table +
                                  " where item_third_cate_cd = '" + cd3 + "'")
            data_test.cache()
            data_test_count = data_test.count()
            data_test = data_test.repartition(get_best_partition(data_test_count))

            # Skip the category if either positive training samples or prediction samples are missing.
            if data_count == 0 or data_test_count == 0:
                print('No train data or no predict data')
                spark.sql("""
                    insert overwrite table """ + output_cd3_score_table + """
                    partition(dt='""" + predict_date + """',item_third_cate_cd='""" + cd3 + """')
                    values ({0},{1},{2},{3},{4},{5})
                    """.format(0, -1, -1, -1.0, -1.0, -1.0))
                continue

            # Add negative samples.
            data_neg = spark.sql("""
                select * from """ + input_train_data_table + """
                where end_date = '""" + end_date_1w + """' and label = 0
                  and item_third_cate_cd = '""" + cd3 + """'
                """)
            data_neg.cache()
            data_neg_count = data_neg.count()
            neg_sample_ratio = min(data_count / data_neg_count, 1.0) if data_neg_count > 0 else 0.0
            data_neg_sample = data_neg.sample(neg_sample_ratio, seed=66)

            # Merge positive and negative samples.
            data_union = data_filter.union(data_neg_sample).orderBy(func.rand(seed=66))

            # Join the sku embedding features.
            sku_vec = spark.sql("select * from tmp.tmp_qzl_sink_search_08_sku2vec_features")
            vec_size = len(sku_vec.columns) - 1
            data_union_sku2vec = data_union.join(sku_vec, on='item_sku_id', how='left')

            ### Train the model.
            # Feature groups.
            # Columns that are not features.
            features_useless = [
                'item_first_cate_name', 'item_second_cate_cd', 'item_second_cate_name',
                'item_third_cate_cd', 'item_third_cate_name', 'barndname_full', 'sku_name',
                'item_sku_id', 'uv_value_label', 'first_into_otc_tm', 'end_date',
                'sku_status_cd', 'red_price', 'red_price_level_rank'
            ]
            # Categorical features.
            features_catagory = ['item_first_cate_cd']
            # Embedding features.
            features_embedding = ['sku_vec_' + str(i) for i in range(vec_size)]
            # Numerical features.
            features_numerical = [
                f for f in data_union_sku2vec.columns
                if f not in ['label'] + features_useless + features_catagory + features_embedding
            ]

            # Fill missing values.
            fillna_value = {c: -1 for c in features_numerical}
            fillna_value.update({c: -10 for c in features_embedding})
            data_union_sku2vec_fillna = data_union_sku2vec.fillna(fillna_value)

            # Preprocessing: assemble the feature vector.
            featuresCreator = ft.VectorAssembler(inputCols=features_numerical + features_embedding,
                                                 outputCol='features')
            pipeline = Pipeline(stages=[featuresCreator])
            data_transformer = pipeline.fit(data_union_sku2vec_fillna)
            data_transformed = data_transformer.transform(data_union_sku2vec_fillna)
            data_transformed.cache()
            data_union_count = data_transformed.count()
            data_filter.unpersist()
            data_neg.unpersist()
            p_num = get_best_partition(data_union_count)
            data_transformed = data_transformed.repartition(p_num)

            # Start training.
            best_depth = get_best_depth(data_union_count)
            best_iter = get_best_iter(data_union_count)
            gbdt = GBTRegressor(featuresCol='features', labelCol='label', predictionCol='prediction',
                                lossType='squared', seed=66, maxMemoryInMB=2048, cacheNodeIds=True,
                                maxDepth=best_depth, maxIter=best_iter, featureSubsetStrategy='0.8',
                                subsamplingRate=0.8, stepSize=0.01)
            gbdt_model = gbdt.fit(data_transformed)

            # Model evaluation.
            evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName='rmse')
            gbdt_pred = gbdt_model.transform(data_transformed)
            train_rmse = evaluator.evaluate(gbdt_pred, {evaluator.metricName: 'rmse'})  # training-set RMSE

            # Correlation between label and prediction on the training set.
            corr_result = gbdt_pred.corr('label', 'prediction')
            train_corr = corr_result if np.isnan(corr_result) == False else 1.0

            # Overlap ratio between the top 50% by label and the top 50% by prediction on the training set.
            data_pred_df = gbdt_pred.select(['item_sku_id', 'label', 'prediction']).toPandas()
            top_n = max(int(data_union_count * 0.5), 1)
            sku_label_top = data_pred_df.sort_values(
                by=['label'], ascending=False)['item_sku_id'].values.tolist()[:top_n]
            sku_pred_top = data_pred_df.sort_values(
                by=['prediction'], ascending=False)['item_sku_id'].values.tolist()[:top_n]
            top_cover_ratio = len(set(sku_label_top) & set(sku_pred_top)) / top_n

            ### Predict the conversion rate of candidate items.
            # Prepare the prediction samples.
            data_test_sku2vec = data_test.join(sku_vec, on='item_sku_id', how='left')
            fillna_value_test = {c: -1 for c in features_numerical}
            fillna_value_test.update({c: -10 for c in features_embedding})
            data_test_fillna = data_test_sku2vec.fillna(fillna_value_test)
            data_transformer_test = pipeline.fit(data_test_fillna)
            data_transformed_test = data_transformer_test.transform(data_test_fillna)
            data_transformed_test.cache()
            data_test.unpersist()

            # Generate and write out predictions for the candidate item pool.
            gbdt_pred_test = gbdt_model.transform(data_transformed_test)
            features_result = ['item_sku_id', 'prediction', 'red_price', 'red_price_level_rank', 'otc_days']
            result = gbdt_pred_test.select(features_result).withColumn(
                'new_old', func.when(func.col('otc_days') < 90, 'new').otherwise('old'))
            result.createOrReplaceTempView("result_df")
            spark.sql("""
                insert overwrite table """ + output_predict_result_table + """
                partition(dt='""" + predict_date + """',item_third_cate_cd='""" + cd3 + """')
                select * from result_df
                """)

            # Write out predictions for the training samples.
            features_result_train = ['item_sku_id', 'label', 'prediction']
            train_result = gbdt_pred.select(features_result_train)
            train_result.createOrReplaceTempView("train_result_df")
            spark.sql("""
                insert overwrite table """ + output_train_result_table + """
                partition(dt='""" + predict_date + """',item_third_cate_cd='""" + cd3 + """')
                select * from train_result_df
                """)

            # Write out the validation metrics of this category's model.
            spark.sql("""
                insert overwrite table """ + output_cd3_score_table + """
                partition(dt='""" + predict_date + """',item_third_cate_cd='""" + cd3 + """')
                values ({0},{1},{2},{3},{4},{5})
                """.format(data_union_count, best_depth, best_iter, train_rmse, train_corr, top_cover_ratio))

            data_transformed.unpersist()
            data_transformed_test.unpersist()

        except Exception as e:
            print('Error:', e)
            continue
# miss_pool.toPandas().to_csv('/tmp/xieyulong/miss_{}.csv'.format(time.time()), index=False)

## static_variance: per-column mean and standard deviation via MLlib column statistics.
data_rdd = data.rdd.map(lambda row: [x for x in row])
mllib_st = st.Statistics.colStats(data_rdd)
for col, m, v in zip(data.columns, mllib_st.mean(), mllib_st.variance()):
    print('{0}: \t{1:.2f} \t{2:.2f}'.format(col, m, np.sqrt(v)))

## static_corr:

## train_model:
fea_pool = data.columns
fea_pool.remove('y')

## featureCreator:
featureCreator = ft.VectorAssembler(inputCols=fea_pool, outputCol='features')

## weightCol: upweight the positive class.
data = data.withColumn('weight', fn.when(data['y'] == 1, 1.0).otherwise(0.02))

train, test = data.randomSplit([0.7, 0.3], seed=1234)  # 42

lr_model = cl.LogisticRegression(
    # maxIter=10,
    # regParam=0.01,
    elasticNetParam=0,
    family='binomial',
    threshold=0.5,
    weightCol='weight',
    labelCol='y')
# COMMAND ----------

test.count()

# COMMAND ----------

# MAGIC %md
# MAGIC 2 Vector Assembler Pipeline stages
# MAGIC 1 - With all features
# MAGIC 2 - With just the intercept

# COMMAND ----------

vecScaled = feature.VectorAssembler(inputCols=[
    'ERTPREAT', 'ERTSEAT', 'EUDIETSODA', 'EUEXERCISE', 'TEAGE', 'EEINCOME1',
    'EUEXFREQ', 'EUFASTFD', 'EUFFYDAY', 'EUFDSIT', 'EUGENHTH', 'EUGROSHP',
    'EUMEAT', 'EUPRPMEL', 'TUACTIVITY_N', 'tuactdur24', 'tewhere', 'TESEX'
], outputCol='features')

# COMMAND ----------

vecIntercept = feature.VectorAssembler(inputCols=[], outputCol='emptyFeatures')

# COMMAND ----------

# MAGIC %md
# MAGIC Scaling stage to scale features from Vector Assembler

# COMMAND ----------

scaled = feature.StandardScaler(inputCol='features',
carr_indexer = features.StringIndexer(inputCol="carrier", outputCol="carrier_index")

# Create a OneHotEncoder
carr_encoder = features.OneHotEncoder(inputCol="carrier_index", outputCol="carrier_fact")

# Create a StringIndexer
dest_indexer = features.StringIndexer(inputCol="dest", outputCol="dest_index")

# Create a OneHotEncoder
dest_encoder = features.OneHotEncoder(inputCol="dest_index", outputCol="dest_fact")

# Make a VectorAssembler
vec_assembler = features.VectorAssembler(inputCols=[
    "month", "air_time", "carrier_fact", "dest_fact", "plane_age"
], outputCol="features")

# Make the pipeline
flights_pipe = Pipeline(stages=[
    dest_indexer, dest_encoder, carr_indexer, carr_encoder, vec_assembler
])

# Fit and transform the data
piped_data = flights_pipe.fit(model_data).transform(model_data)

# Split the data into training and test sets
training, test = piped_data.randomSplit([.6, .4])

# Create a LogisticRegression Estimator
lr = LogisticRegression()

# Create a BinaryClassificationEvaluator
evaluator = evals.BinaryClassificationEvaluator(metricName="areaUnderROC")
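# The snippet above stops after creating the estimator and the evaluator. The sketch below
# (not from the original source) shows one typical way to combine them with a parameter grid
# and cross-validation; the grid values, fold count, and `best_lr` name are illustrative.
import pyspark.ml.tuning as tune

grid = tune.ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1]) \
    .addGrid(lr.elasticNetParam, [0.0, 1.0]) \
    .build()

cv = tune.CrossValidator(estimator=lr,
                         estimatorParamMaps=grid,
                         evaluator=evaluator,
                         numFolds=3)
best_lr = cv.fit(training).bestModel
print(evaluator.evaluate(best_lr.transform(test)))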
# COMMAND ----------

trainingdf = store_num_data_ind_enc.filter(store_num_data_ind_enc.Date_Date < '2014-04-28')

# COMMAND ----------

validationdf = store_num_data_ind_enc.filter(store_num_data_ind_enc.Date_Date >= "2014-04-28") \
    .filter(store_num_data_ind_enc.Date_Date <= "2014-10-31")

# COMMAND ----------

testdf = store_num_data_ind_enc.filter(store_num_data_ind_enc.Date_Date >= "2014-10-31")

# COMMAND ----------

va = feature.VectorAssembler(inputCols=['typeNumVec', 'ClusterVec', 'StoreNumVec', "ItemFamilyNumVec",
                                        "MonthVec", "DayVec", "DOWNum"],
                             outputCol='features')
lr = regression.LinearRegression(featuresCol='features', labelCol='sum(Units)',
                                 regParam=0.5, elasticNetParam=0.3, fitIntercept=True)
pipe = Pipeline(stages=[va, lr])
model = pipe.fit(trainingdf)

# COMMAND ----------

# MAGIC %md ***Calculating RMSE***

# COMMAND ----------

rmse = (fn.avg((fn.col('sum(Units)') - fn.col('prediction'))**2))**.5

# COMMAND ----------

rmse1 = (fn.avg((fn.col('sum(sum(Units))') - fn.col('sum(prediction)'))**2))**.5
]

schema = typ.StructType([
    typ.StructField(e[0], e[1], False) for e in labels
])

data = spark.read.csv("file:///home/hadoop/zhcao/workspace/spark_test/watermelon.csv",
                      header=True, schema=schema)
data.createOrReplaceTempView("data_temp")
data.printSchema()
data.cache()
data.show()

#
featuresCreator = ft.VectorAssembler(
    inputCols=['VIB1', 'VIB2'],
    outputCol='features'
)

pca = ft.PCA(
    k=1,
    inputCol='features',
    outputCol='pca_features'
)

pipeline = Pipeline(stages=[
    featuresCreator,
    pca
])

model = pipeline.fit(data)
result = model.transform(data)
def skl_predict(spark):
    print(1111)

    # One positive row followed by 14 identical negative rows: 30 feature values
    # plus the label column 'INFANT_ALIVE_AT_REPORT'.
    row_features = [-0.7016797, 1.22524766, -0.7123829] + [-0.6565101] * 27
    data = [row_features + [1]] + [row_features + [0] for _ in range(14)]

    labels = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10',
              '_11', '_12', '_13', '_14', '_15', '_16', '_17', '_18', '_19', '_20',
              '_21', '_22', '_23', '_24', '_25', '_26', '_27', '_28', '_29', '_30',
              'INFANT_ALIVE_AT_REPORT']

    df = spark.createDataFrame(data, schema=labels)
    # df = df.withColumn("age", df['age'] + 1)
    df.show()
    # df.select("age").distinct().show()
    # df.count()

    # Concatenate several columns into one.
    from pyspark.sql.functions import split, explode, concat, concat_ws
    df_concat = df.withColumn("_concat", concat(df['_1'], df['_2'], df['_3'], df['_4']))
    print('df_concat>>>>>>>>>>>>>>>>>>>')
    df_concat.show()

    # Assemble all the columns into a single feature vector.
    # Note: inputCols here includes the label column as well.
    featuresCreator = ft.VectorAssembler(
        inputCols=[col for col in labels],
        outputCol='features'
    )

    # Create the estimator.
    import pyspark.ml.classification as cl
    logistic = cl.LogisticRegression(
        maxIter=10,
        regParam=0.01,
        labelCol='INFANT_ALIVE_AT_REPORT')
    print('logistic:', logistic)

    # Create a pipeline.
    from pyspark.ml import Pipeline
    pipeline = Pipeline(stages=[
        featuresCreator,
        logistic
    ])

    # Split for fitting.
    births_train, births_test = df.randomSplit([0.7, 0.3], seed=666)
    print('births_train', births_train)
    print('births_test', births_test)

    # Run the pipeline and evaluate the model.
    model = pipeline.fit(births_train)
    test_model = model.transform(births_test)
    print('test_model:', test_model)
    test_model.take(1)
    print('test_model.take(1):', test_model.take(1))
display(reviewAnalysisDF.select('review_stars').distinct())


# In[44]:


get_ipython().run_line_magic('fs', 'ls tmp/reviewAnalysisDf')


# ### 3: Vector Assembly
# Once we are through with the encoder creation step, it is time to assemble the encoders and all the
# input and output columns to form a final vector_generator that will be passed as input to the machine
# learning pipeline.

# In[46]:


import pyspark.ml.feature as ft

featuresCreator = ft.VectorAssembler(inputCols=[
    'cool', 'funny', 'useful', 'is_open', 'business_review_count',
    'business_stars', 'average_stars', 'fans', 'user_review_count',
    'sentiment_score'
], outputCol='features')


# ### 4: Estimator Creation
# This is the step where we select the machine learning model that we wish to utilize. Here, we create an
# Estimator object that contains the machine learning model along with all the hyper-optimization
# parameters that need to be passed to it. Here, we are using LogisticRegression.

# In[48]:


import pyspark.ml.classification as cl

logistic_regression_model = cl.LogisticRegression(maxIter=10,
                                                  regParam=0.01,
                                                  labelCol='review_stars',
                                                  family='multinomial')
print(type(logistic_regression_model))
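# The notebook stops after creating the assembler and the estimator. A sketch of the usual
# next step follows (not from the original notebook): combine them into a Pipeline and fit on
# a split of reviewAnalysisDF. The split ratio, seed, and variable names are illustrative.
from pyspark.ml import Pipeline

review_train, review_test = reviewAnalysisDF.randomSplit([0.8, 0.2], seed=42)
review_pipeline = Pipeline(stages=[featuresCreator, logistic_regression_model])

review_model = review_pipeline.fit(review_train)
review_model.transform(review_test).select('review_stars', 'prediction').show(5)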
import os
import sys

from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.ml.feature as mlf
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

sys.path.append(os.path.abspath('../'))
import importlib

spark = SparkSession.builder.master("local[2]").getOrCreate()

df = spark.read.parquet('testing/df')
non_pred_cols = ['label', 'response', 'features']
pred_cols = [x for x in df.columns if x not in non_pred_cols]
assembler = mlf.VectorAssembler(inputCols=pred_cols, outputCol='features')
df = assembler.transform(df)
df.cache()

sys.path.append(os.path.abspath('../'))
import importlib
import propensity_matching
importlib.reload(propensity_matching)
from propensity_matching.estimator import PropensityEstimator

estimator = PropensityEstimator()
model, df2 = estimator.fit(df)
df3, match_info = model.transform(df2)
                                 how='inner')
# So far, data_matrix contains Row(date, grid_square, topic_distributions, complaint_count).

# Get weekday from date.
get_weekday_udf = functions.udf(lambda d: d.weekday(), returnType=types.IntegerType())
data_matrix = data_matrix.withColumn('weekday', get_weekday_udf(data_matrix['date']))

# Assemble the feature vectors.
weekday_one_hot_encoder = feature.OneHotEncoder(inputCol='weekday', outputCol='weekday_vector')
feature_vector_assembler = feature.VectorAssembler(
    inputCols=['weekday_vector', 'topic_distribution'],
    outputCol='final_feature_vector')
feature_assembly_pipeline = (ml.Pipeline(
    stages=[weekday_one_hot_encoder, feature_vector_assembler]).fit(data_matrix))
data_matrix = (feature_assembly_pipeline.transform(data_matrix).select(
    'date', 'grid_square', 'final_feature_vector', 'complaint_count'))

LOGGER.debug(str(data_matrix.count()) + " rows like " + str(data_matrix.take(1)))

# logistic_regression = classification.LogisticRegression(
#     maxIter=10, regParam=0.3, elasticNetParam=0.8,
#     featuresCol='final_feature_vector', labelCol='complaint_count',
#     probabilityCol='predicted_probability')
          ('CIG_2_TRI', typ.IntegerType()),
          ('CIG_3_TRI', typ.IntegerType()),
          ('MOTHER_HEIGHT_IN', typ.IntegerType()),
          ('MOTHER_PRE_WEIGHT', typ.IntegerType()),
          ('MOTHER_DELIVERY_WEIGHT', typ.IntegerType()),
          ('MOTHER_WEIGHT_GAIN', typ.IntegerType()),
          ('DIABETES_PRE', typ.IntegerType()),
          ('DIABETES_GEST', typ.IntegerType()),
          ('HYP_TENS_PRE', typ.IntegerType()),
          ('HYP_TENS_GEST', typ.IntegerType()),
          ('PREV_BIRTH_PRETERM', typ.IntegerType())]

births_transformed = "file:///home/yuty/yangzz/births_transformed.csv"
schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])
births = spark.read.csv(births_transformed, header=True, schema=schema)

featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels[1:]],
    outputCol='features').transform(births).select('features').collect()

from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import BisectingKMeans

data = [(Vectors.dense([10, 10]), ),
        (Vectors.dense([3.0, 5.0]), ),
        (Vectors.dense([0.0, 0.0]), ),
        (Vectors.dense([1.0, 1.0]), ),
        (Vectors.dense([9.0, 8.0]), ),
        (Vectors.dense([8.0, 9.0]), )]
df = spark.createDataFrame(data, ["features"])

bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)
model = bkm.fit(df)
centers = model.clusterCenters()
len(centers)
model.computeCost(df)
model.hasSummary
def _prep_data(self, df: DataFrame):
    r"""
    Remove highly collinear features, bin the features, and reduce the
    dimensionality if necessary, in that order.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    self.fit_data_prep_args : dict
        arguments around preparing the data to be fit.
        default args are
        default_fit_data_prep_args = {
            'class_balance': 1,
            'train_prop': .8,
            'bin_features': True,
            'remove_redundant_features': True,
        }
        'class_balance' is the ratio of control_candidates : treatment to train the model on.
        'train_prop' is the proportion of the population (post-rebalance) that is in the training set.
        'bin_features' can be bool, dict, or absent. If you do not want to bin the features here,
        they MUST be binned prior; unbinned features will undermine the validity of the outcome.
        If bin_features is absent or True, bin_features will be run with default args. If it is a
        dict, it will be passed as kwargs to bin_features. See utils.bin_features for arg details.
        'remove_redundant_features' can be bool, dict, or absent. True or absent will run
        remove_redundant_features with default args; a dict will be passed as kwargs instead.
        See utils.remove_redundant_features for arg details.

    Returns
    -------
    df : pyspark.sql.DataFrame
        prepared dataframe

    Raises
    ------
    UncaughtExceptions

    See Also
    --------
    remove_redundant_features
    bin_features
    reduce_dimensionality
    """
    features_col = self.probability_estimator.getOrDefault('featuresCol')
    label_col = self.probability_estimator.getOrDefault('labelCol')

    if ('remove_redundant_features' not in self.fit_data_prep_args) | (
            self.fit_data_prep_args['remove_redundant_features'] is True):
        logging.getLogger(__name__).info("removing redundant features with default args")
        df, pred_cols = remove_redundant_features(df=df, features_col=features_col)
    elif isinstance(self.fit_data_prep_args['remove_redundant_features'], dict):
        logging.getLogger(__name__).info("removing redundant features with specified args")
        df, pred_cols = remove_redundant_features(
            df=df, **self.fit_data_prep_args['remove_redundant_features'])
    elif self.fit_data_prep_args['remove_redundant_features'] is False:
        logging.getLogger(__name__).info("not removing redundant features")
    else:
        logging.getLogger(__name__).critical("illegal arg for remove_redundant_features")
        raise ValueError(
            'illegal argument for "remove_redundant_features" in fit_data_prep_args')

    if ('bin_features' not in self.fit_data_prep_args) | (
            self.fit_data_prep_args['bin_features'] is True):
        logging.getLogger(__name__).info("binning features with default args")
        df, pred_cols = bin_features(df=df, features_col=features_col)
    elif isinstance(self.fit_data_prep_args['bin_features'], dict):
        logging.getLogger(__name__).info("binning features with specified args")
        df, pred_cols = bin_features(df=df, **self.fit_data_prep_args['bin_features'])
    elif self.fit_data_prep_args['bin_features'] is False:
        logging.getLogger(__name__).info("not binning features")
    else:
        logging.getLogger(__name__).critical("illegal arg for bin_features")
        raise ValueError('illegal argument for "bin_features" in fit_data_prep_args')

    # Leakage note: evaluation of informativeness of predictors includes the test set.
    # Not ideal, but minimal impact and expedient for the architecture right now.

    # num cols is limited by the size of the training set. To get it we must first
    # rebalance, then multiply by train_prop.
    # Reduce dim on the whole-population df, then apply the same transform to the rebalanced df.
    self.df = df
    self._rebalance_df()
    ncols = int(
        (self.rebalanced_df.where(F.col(label_col) == 1).count() *
         self.fit_data_prep_args['train_prop']) // SAMPLES_PER_FEATURE)
    red_dim_args = {
        'df': self.df,
        'label_col': label_col,
        'binned_features_col': features_col,
        'ncols': ncols
    }
    logging.getLogger(__name__).info("reducing dimensionality of df")
    self.df, pred_cols = reduce_dimensionality(args=red_dim_args)

    assembler = mlf.VectorAssembler(inputCols=pred_cols, outputCol=features_col)
    self.rebalanced_df = assembler.transform(self.rebalanced_df.drop(features_col))
    return True
births = births.withColumn('BIRTH_PLACE_INT',
                           births['BIRTH_PLACE'].cast(typ.IntegerType()))


# In[4]:


encoder = ft.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')


# In[5]:


featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
    outputCol='features'
)


# In[6]:


import pyspark.ml.classification as cl


# In[7]:


logistic = cl.LogisticRegression(maxIter=10,
                                 regParam=0.01,
                                 labelCol='INFANT_ALIVE_AT_REPORT')


# In[8]:
    inputCols=['land_condition_index', 'foundation_type_index', 'roof_type_index',
               'ground_floor_type_index', 'position_index'],
    outputCols=['land_condition_vec', 'foundation_type_vec', 'roof_type_vec',
                'ground_floor_type_vec', 'position_vec'])
data_sample = encoder.fit(data_sample).transform(data_sample)

# Use Binarizer to turn the target y (4 classes) into a binary label:
# classes 0, 1 and 2 are merged into class 0 ("no rebuilding needed"),
# and class 3 becomes class 1 ("rebuilding needed").
data_sample = data_sample.withColumn('y_double', data_sample['y'].cast(typ.DoubleType()))
binarizer = ft.Binarizer(threshold=2, inputCol="y_double", outputCol="label")
data_sample = binarizer.transform(data_sample)
data_sample.take(1)

# Build the feature vector with VectorAssembler.
featuresCreator = ft.VectorAssembler(
    inputCols=['floors_before', 'floors_after', 'age', 'area', 'height_before', 'height_after',
               'land_condition_vec', 'foundation_type_vec', 'roof_type_vec',
               'ground_floor_type_vec', 'position_vec'],
    outputCol='features'
)

# Use VectorIndexer to detect categorical features automatically, with at most 5 categories.
indexer = ft.VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=5)

# Split into training and test sets.
data_train, data_test = data_sample.randomSplit([0.8, 0.2], seed=42)

############################### Descriptive statistics ###############################
data_sample.printSchema()  # inspect the schema
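# Illustrative sketch (not from the original script) of how the assembler and VectorIndexer
# above are typically combined with a classifier and fit on data_train. The choice of
# DecisionTreeClassifier, its parameters, and the evaluation step are assumptions.
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

dt = DecisionTreeClassifier(labelCol='label', featuresCol='indexed', maxDepth=5)
dt_pipeline = Pipeline(stages=[featuresCreator, indexer, dt])
dt_model = dt_pipeline.fit(data_train)

dt_eval = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol='rawPrediction')
print(dt_eval.evaluate(dt_model.transform(data_test)))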
from pyspark.sql import SQLContext

sqlCtx = SQLContext(sc)
airlineCleanDF = sqlCtx.createDataFrame(airlineCleanDFP)


# In[10]:


training, validation, testing = airlineCleanDF.randomSplit([0.6, 0.3, 0.1], seed=0)


# In[9]:


# 2. Linear regression with avg_overall
vaAvgOverall = feature.VectorAssembler(inputCols=[
    'overall_ratingf', 'seat_comfort_ratingf', 'cabin_staff_ratingf',
    'food_beverages_ratingf', 'inflight_entertainment_ratingf',
    'ground_service_ratingf', 'wifi_connectivity_ratingf', 'value_money_ratingf'
], outputCol='features')
lrAvgOverall = regression.LinearRegression(featuresCol='features', labelCol='recommendedi')
pipelineAvgOverall = Pipeline(stages=[vaAvgOverall, lrAvgOverall])
pipeline_modelAvgOverall = pipelineAvgOverall.fit(training)
pipeline_modelAvgOverall.transform(validation).select(
    fn.avg((fn.col('prediction') - fn.col('recommendedi'))**2).alias('MSE_Avg_Overall')).show()


# In[12]:


# 2. Linear regression with avg_overall
vaAvgOverall = feature.VectorAssembler(inputCols=[
regressor_OLS.summary()

X_Opt = X[:, [1, 4, 5, 6]]
regressor_OLS = sm.OLS(endog=y, exog=X_Opt).fit()
regressor_OLS.summary()

X_Opt = X[:, [1, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_Opt).fit()
regressor_OLS.summary()

# choose feature columns
feature_cols = [df_train.columns[1], df_train.columns[2], df_train.columns[3], df_train.columns[4]]
# feature_cols = df.columns[1:]
assembler = feature.VectorAssembler(inputCols=feature_cols, outputCol='features')
df_train = assembler.setHandleInvalid("skip").transform(df_train)
df_train = df_train.withColumnRenamed('Survived', 'label')
df_train = df_train.select('features', 'label')

# scaling
scaler = feature.StandardScaler(inputCol="features", outputCol="scaledFeatures",
                                withStd=True, withMean=False)
scaler = scaler.fit(df_train)
df_train = scaler.transform(df_train)
df_train = df_train.drop('features').withColumnRenamed('scaledFeatures', 'features')

# TEST
# reading data_train