def compute_rho_transitions(vertices, edges, pub_decay_time, data_decay_time):
    """Compute the initial distribution rho and the transitions."""
    distribution = fn.when(
        fn.col('type') == 'data',
        fn.exp(-fn.col('age') / fn.lit(data_decay_time))) \
        .otherwise(fn.exp(-fn.col('age') / fn.lit(pub_decay_time)))
    rho = vertices.select('i', distribution.alias('value'))
    transitions = edges.groupBy('i').count().join(edges, 'i') \
        .selectExpr('j as i', 'i as j', '1/count as value')
    return rho, transitions
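# Usage sketch (not from the original source): the vertex/edge schemas below
# ('i', 'type', 'age', 'j') and the `fn` alias for pyspark.sql.functions are
# assumptions inferred from the columns the helper references.
import pyspark.sql.functions as fn
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Hypothetical toy graph: node id 'i', node 'type', node 'age'.
vertices = spark.createDataFrame(
    [(0, 'data', 10.0), (1, 'pub', 365.0)], ('i', 'type', 'age'))
edges = spark.createDataFrame([(0, 1), (1, 0)], ('i', 'j'))  # directed i -> j

rho, transitions = compute_rho_transitions(
    vertices, edges, pub_decay_time=730.0, data_decay_time=90.0)
rho.show()          # rho value = exp(-age / decay_time) per node
transitions.show()  # each reversed edge weighted by 1/out-degree of its source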
def evaluate_agg_prob(self):
    import pyspark
    from pyspark.sql.functions import col
    # terminal_outcome.show()
    from pyspark.sql.functions import udf, log, sum, exp
    from pyspark.ml.evaluation import BinaryClassificationEvaluator

    udf_prob = udf(lambda x: x.toArray().tolist()[1])
    cur_terminal_df = self.get_terminal_df()
    self.flatten_terminal_outcome()
    for cur_of in [self.target_disch_col]:
        self.logger.info(cur_of)
        try:
            cur_training_df = self.spark.read.parquet(
                self.training_result_dest_template.format(cur_of)).select(
                    "ID", "TIME_SPAN",
                    udf_prob("Probability").cast("double").alias("probability"),
                    col("{0}_label".format(cur_of)).alias("label"))
            cur_testing_df = self.spark.read.parquet(
                self.testing_result_dest_template.format(cur_of)).select(
                    "ID", "TIME_SPAN",
                    udf_prob("Probability").cast("double").alias("probability"),
                    col("{0}_label".format(cur_of)).alias("label"))
        except pyspark.sql.utils.AnalysisException as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            self.logger.info(message)
            self.logger.info("PROCESS")
            self.logger.debug("{0} does not exist".format(cur_of))
            continue
        # Aggregate per-window probabilities into one per-ID probability:
        # 1 - prod(1 - p), computed as 1 - exp(sum(log(1 - p))).
        cur_tr_agg = cur_training_df.groupBy("ID").agg(
            sum(log(1.0 - col("probability"))).alias("agg_prob")).select(
                "ID", (1.0 - exp("agg_prob")).cast("double").alias("agg_prob"))
        cur_te_agg = cur_testing_df.groupBy("ID").agg(
            sum(log(1.0 - col("probability"))).alias("agg_prob")).select(
                "ID", (1.0 - exp("agg_prob")).cast("double").alias("agg_prob"))
        # TODO: terminal_df is the flattened terminal DX for now. Need to merge
        # with other DFs carrying the ALI, AKI, ALF, AHF columns separately.
        cur_tr_agg = cur_tr_agg.join(self.target_terminal_outcome_table, "ID")
        cur_te_agg = cur_te_agg.join(self.target_terminal_outcome_table, "ID")
        # cur_tr_agg.show()
        # cur_te_agg.show()
        from pyspark.sql.functions import count
        # cur_te_agg.select(cur_of).groupBy(cur_of).agg(count("*")).show()
        return cur_tr_agg, cur_te_agg
def test(
    keras_model,
    working_dir: FlyteDirectory,
    test_df: pyspark.sql.DataFrame,
    hp: Hyperparameters,
) -> FlyteDirectory:
    print("================")
    print("Final prediction")
    print("================")

    pred_df = keras_model.transform(test_df)
    pred_df.printSchema()
    pred_df.show(5)
    # Convert from the log domain back to real Sales numbers.
    pred_df = pred_df.withColumn("Sales_pred", F.exp(pred_df.Sales_output))

    submission_df = pred_df.select(
        pred_df.Id.cast(T.IntegerType()), pred_df.Sales_pred).toPandas()
    submission_df.sort_values(by=["Id"]).to_csv(
        os.path.join(working_dir, hp.local_submission_csv), index=False)
    # Predictions are saved to a CSV file.
    print("Saved predictions to %s" % hp.local_submission_csv)

    return working_dir
def output(self, scores, thresh=1.5, mode="best-guess"):
    """Standard output of the algorithm.

    De-anonymisation has two modes: entropic (keeps the full distribution)
    or best-guess (matching with a threshold).
    """
    if mode == "best-guess":
        return self.matching_set(scores, thresh)
    elif mode == "entropic":
        # (custId_1, std)
        sigma = scores.groupBy('custId_1').agg(
            F.stddev(scores.value).alias('std'))
        # (custId_1, custId_2, probas_raw)
        probas_raw = scores\
            .join(sigma, ['custId_1'])\
            .withColumn("probas_raw", F.exp(F.col('value') / F.col('std')))\
            .select(['custId_1', 'custId_2', 'probas_raw', 'std'])
        # (custId_1, probas_z)
        probas_z = probas_raw.groupBy('custId_1').agg(
            F.sum(probas_raw.probas_raw).alias('probas_z'))
        # (custId_1, custId_2, probas)
        return scores\
            .join(probas_raw, ['custId_1', 'custId_2'])\
            .join(probas_z, ['custId_1'])\
            .withColumn("probas", F.col('probas_raw') / F.col('probas_z'))\
            .select(['custId_1', 'custId_2', 'probas', 'value', 'std'])
    else:
        raise ValueError("Mode '{}' is invalid.".format(mode))
def correct_thermal_factor(df, input_cols, T, replace=False):
    new_df = df
    kBT = 8.6173303e-5 * T * 1000  # unit: meV
    for col_name in input_cols:
        new_df = new_df.withColumn(
            "corrected_" + col_name,
            col(col_name) * (func.exp(df.E / kBT) - 1.0))
    if replace:
        print("Replace", input_cols, "with corrected values...")
        new_df = new_df.drop(*input_cols)
        for col_name in input_cols:
            new_df = new_df.withColumnRenamed("corrected_" + col_name,
                                              col_name)
    else:
        output_cols = ["corrected_" + col_name for col_name in input_cols]
        print("Add corrected intensity data as new column(s): ", output_cols)
    return new_df
def dataframe_operation():
    spark = SparkSession.builder.appName('dataframe-operation').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    # Add rows.
    df1 = spark.range(3)
    df2 = spark.range(5)
    df3 = df1.union(df2)
    df3.show()

    # Add columns.
    df1 = spark.createDataFrame([(1, 'a', 23.0), (3, 'B', -23.0)],
                                ('x1', 'x2', 'x3'))
    df2 = df1.withColumn('x4', func.lit(0))
    df2.show()
    df3 = df2.withColumn('x5', func.exp('x3'))
    df3.show(truncate=False)

    df4 = spark.createDataFrame([(1, 'foo'), (2, 'bar')], ('k', 'v'))
    df5 = df3 \
        .join(df4, func.col('x1') == func.col('k'), 'leftouter') \
        .drop('k') \
        .withColumnRenamed('v', 'x6')
    df5.show(truncate=False)
def dedup_records(data: DataFrame, key_columns) -> DataFrame:
    data = data.dropDuplicates(key_columns)
    # exp("10") reads the column named "10"; the string is a column name,
    # not a numeric literal.
    data_with_new_feature = data.withColumn("10_exp", exp("10"))
    return data_with_new_feature
def similarity(r):
    # Exponential kernels on the rating, recency, and average-movie-rating
    # gaps; r0, d0, and avgr0 are scale constants from the enclosing scope.
    D_1 = F.exp(-(F.abs(r['rating_1'] - r['rating_2']) / r0))
    D_2 = F.exp(-(F.abs(r['days_1'] - r['days_2']) / d0))
    D_3 = F.exp(-(F.abs(r['avgMovieRating_1'] - r['avgMovieRating_2']) / avgr0))
    return D_1 + D_2 + D_3
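# Usage sketch (not from the original source): `similarity` returns a Column
# expression, so it can feed withColumn directly. The scale constants live in
# the source's enclosing scope; the values here are hypothetical.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
r0, d0, avgr0 = 1.0, 30.0, 0.5  # assumed decay scales

# Toy DataFrame of candidate pairs with the column suffixes used above.
pairs = spark.createDataFrame(
    [(4.0, 5.0, 10.0, 12.0, 3.9, 4.1)],
    ('rating_1', 'rating_2', 'days_1', 'days_2',
     'avgMovieRating_1', 'avgMovieRating_2'))
pairs.withColumn('sim', similarity(pairs)).show()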
print('====> Parsing local arguments')
parser = argparse.ArgumentParser()
parser.add_argument('--query_month', type=str, help='The format should be YYYYmm')
parser.add_argument('--mode', type=str, choices=['train', 'eval', 'test'], default='train')
parser.add_argument('--save_model', action='store_true', default=False)
args = parser.parse_args()

print('====> Start computation')
dataset = spark.read.csv('/user/ronghui_safe/hgy/nid/datasets/{}_{}'.format(args.query_month, args.mode),
                         header=True, inferSchema=True)
dataset = dataset.withColumn('source', F.when(F.col('source') == '__HIVE_DEFAULT_PARTITION__', 'null').otherwise(F.col('source')))
dataset = dataset.withColumn('source', F.when(F.col('source') == 'cm_mail', 'null').otherwise(F.col('source')))
if args.mode != 'test':
    dataset = dataset.withColumn('duration', F.when(F.col('duration') == 0, 1e-6).otherwise(F.col('duration')))
    dataset = dataset.withColumn('duration', F.log(F.lit(1e-6)) / F.col('duration'))
    dataset = dataset.withColumn('duration', F.exp(F.col('duration')))

stringIndex_model = None
if args.mode == 'train':
    stringIndexer = StringIndexer(inputCol='source', outputCol='source_index')
    stringIndex_model = stringIndexer.fit(dataset)
    stringIndex_model.save('/user/ronghui_safe/hgy/nid/edw/stringIndex_model_v2')
else:
    stringIndex_model = StringIndexerModel.load('/user/ronghui_safe/hgy/nid/edw/stringIndex_model_v2')
dataset = stringIndex_model.transform(dataset)

encoder_model = None
if args.mode == 'train':
    encoder = OneHotEncoder(inputCol='source_index', outputCol='source_vec')
    encoder_model = encoder.fit(dataset)
    encoder_model.save('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
else:
    encoder_model = OneHotEncoderModel.load('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
def get_and_enrich_spark(raw_data: spark.DataFrame, column_name: str):
    raw_data.show()
    data_with_new_feature = raw_data.withColumn(column_name + "_exp",
                                                exp(column_name))
    return data_with_new_feature
def run_RF(self, tr_inst, te_inst, model_of=[]):
    from pyspark.sql.functions import col
    if type(self) == data_run_experiment:
        raise NotImplementedError(
            "Method needs to be called in a sub-class but is currently "
            "called in the base class")
    if model_of == []:
        model_of = self.target_disch_col
    if type(model_of) == str:
        model_of = [model_of]
    self.logger.info("TARGET_OF:")
    self.logger.info(model_of)

    from pyspark.ml.classification import GBTClassifier as cur_model_selection
    cur_classifier = cur_model_selection(featuresCol="features",
                                         checkpointInterval=5)
    from pyspark.ml import Pipeline
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

    if self.eval_performance_criteria == "AUPRC":
        target_metric = "areaUnderPR"
    elif self.eval_performance_criteria == "AUROC":
        target_metric = "areaUnderROC"
    else:
        raise Exception("eval_metric should be either 'AUPRC' or 'AUROC'")
    evaluator = BinaryClassificationEvaluator(metricName=target_metric)
    paramGrid = self.get_param_grid(cur_model_selection)

    if self.eval_cv_or_tvt == "CV":
        pipeline = Pipeline(stages=[cur_classifier])
        orig_tr_inst = tr_inst
        orig_te_inst = te_inst
        self.logger.info("ORIGINAL_INSTANCES")
        from pyspark.sql.functions import count, datediff
        from pyspark.sql.functions import udf, log, sum, exp, max
        udf_prob = udf(lambda x: x.toArray().tolist()[1])
        from pyspark.sql.functions import corr, udf, isnan
        for cur_of in model_of:
            self.logger.debug(cur_of)
            # should move this to the back
            te_inst = orig_te_inst.withColumn(
                "label", col("{0}_label".format(cur_of)).cast("double"))
            self.logger.info("TE_POP")
            te_inst.groupBy("label").agg(count("*")).show()
            tr_inst.printSchema()
            tr_val_pts_dict = self.get_target_tr_val_id()
            orig_tr_inst = tr_inst
            tr_pts = tr_val_pts_dict["TR"]
            val_pts = tr_val_pts_dict["VAL"]
            self.logger.info(tr_pts)
            self.logger.info(val_pts)
            all_training_ids = tr_pts + val_pts
            from random import shuffle
            shuffle(all_training_ids)
            import numpy as np
            print(len(all_training_ids))
            cv_id_list_full = np.array(all_training_ids)
            perform_dict = dict()
            for cur_cv_stage in range(self.cur_cv_fold):
                tr_pts = cv_id_list_full[
                    np.linspace(0, cv_id_list_full.shape[0] - 1,
                                cv_id_list_full.shape[0])
                    % self.cur_cv_fold != cur_cv_stage].tolist()
                val_pts = cv_id_list_full[
                    np.linspace(0, cv_id_list_full.shape[0] - 1,
                                cv_id_list_full.shape[0])
                    % self.cur_cv_fold == cur_cv_stage].tolist()
                print(np.linspace(0, cv_id_list_full.shape[0] - 1,
                                  cv_id_list_full.shape[0])
                      % self.cur_cv_fold == cur_cv_stage)
                print(cv_id_list_full[
                    np.linspace(0, cv_id_list_full.shape[0] - 1,
                                cv_id_list_full.shape[0])
                    % self.cur_cv_fold == cur_cv_stage])
                print("VAL_ROUND_{0}_TARGET IDS:{1}".format(
                    cur_cv_stage, val_pts))
                tr_inst = orig_tr_inst.where(col("ID").isin(tr_pts))  # .persist()
                val_inst = orig_tr_inst.where(col("ID").isin(val_pts))  # .persist()
                self.logger.info("Excluded instances for training:{0}".format(
                    tr_inst.where(
                        col("{0}_excl".format(cur_of)) == 1).count()))
                tr_inst = tr_inst.where(
                    col("{0}_excl".format(cur_of)) == 0).withColumn(
                        "label", col("{0}_label".format(cur_of)).cast("double"))
                val_inst = val_inst.withColumn(
                    "label", col("{0}_label".format(cur_of)).cast("double"))
                self.logger.info("TR_POP")
                tr_inst.groupBy("label").agg(count("*")).show()
                pipeline_models = pipeline.fit(tr_inst, params=paramGrid)
                for cur_model in pipeline_models:
                    val_pred = cur_model.transform(val_inst)
                    # Per-ID probability: 1 - prod(1 - p), via log-sum-exp.
                    agg_prob_val = val_pred.groupBy("ID").agg(
                        max("label").alias("label"),
                        sum(log(1.0 - udf_prob("Probability"))).alias(
                            "inverse_log_sum")) \
                        .select("label",
                                (1.0 - exp(col("inverse_log_sum"))).alias(
                                    "rawPrediction"))
                    agg_prob_val.show(300, truncate=False)
                    cur_pr = BinaryClassificationEvaluator(
                        rawPredictionCol="rawPrediction",
                        labelCol="label",
                        metricName=target_metric).evaluate(agg_prob_val)
                    param_key = str(cur_model.stages[-1].extractParamMap())
                    if param_key not in perform_dict:
                        perform_dict[param_key] = dict()
                        perform_dict[param_key]["PERF"] = list()
                        perform_dict[param_key]["PARAM"] = \
                            cur_model.stages[-1].extractParamMap()
                    perform_dict[param_key]["PERF"].append(cur_pr)
            best_pf_measure = -1
            best_pf_param = None
            for key in perform_dict:
                import numpy as np
                test_array = np.array(perform_dict[key]["PERF"])
                print(key, test_array.mean(), test_array.std())
                if best_pf_measure < test_array.mean():
                    best_pf_measure = test_array.mean()
                    best_pf_param = perform_dict[key]["PARAM"]
            print("retrain model based on best hp from CV")
            print("PERF:{0}".format(best_pf_measure))
            print("HP:{0}".format(best_pf_param))
            tr_inst = orig_tr_inst.where(
                col("{0}_excl".format(cur_of)) == 0).withColumn(
                    "label", col("{0}_label".format(cur_of)).cast("double"))
            bestModel = pipeline.fit(
                tr_inst.where(col("ID").isin(cv_id_list_full.tolist())),
                params=[best_pf_param])[0]
            bestModel.save(self.model_dir_template.format(cur_of,
                                                          best_pf_measure))
            prediction = bestModel.transform(te_inst)
            prediction.show()
            prediction.write.save(
                self.testing_result_dest_template.format(cur_of),
                mode="overwrite")
            tr_result = bestModel.transform(tr_inst).withColumn(
                "Prob", udf_prob("Probability"))
            tr_result.write.save(
                self.training_result_dest_template.format(cur_of),
                mode="overwrite")
    elif self.eval_cv_or_tvt == "TVT":
        pipeline = Pipeline(stages=[cur_classifier])
        orig_tr_inst = tr_inst
        orig_te_inst = te_inst
        self.logger.info("ORIGINAL_INSTANCES")
        # of pop_overview
        from pyspark.sql.functions import count, datediff
        from pyspark.sql.functions import udf, log, sum, exp, max
        udf_prob = udf(lambda x: x.toArray().tolist()[1])
        from pyspark.sql.functions import corr, udf, isnan
        for cur_of in model_of:
            self.logger.debug(cur_of)
            if "{0}_excl".format(cur_of) not in orig_tr_inst.columns:
                self.logger.info("NO TARGET {0} is in pts".format(cur_of))
                continue
            tr_inst = orig_tr_inst.where(
                col("{0}_excl".format(cur_of)) == 0).withColumn(
                    "label", col("{0}_label".format(cur_of)).cast("double")
                ).repartition(500).checkpoint()
            self.logger.info("Excluded instances for training:{0}".format(
                orig_tr_inst.where(
                    col("{0}_excl".format(cur_of)) == 1).count()))
            self.logger.info("TR_POP")
            tr_inst.groupBy("label").agg(count("*")).show()
            te_inst = orig_te_inst.withColumn(
                "label", col("{0}_label".format(cur_of)).cast("double"))
            self.logger.info("TE_POP")
            te_inst.groupBy("label").agg(count("*")).show()
            tr_inst.printSchema()
            tr_val_pts_dict = self.get_target_tr_val_id()
            tr_pts = tr_val_pts_dict["TR"]
            val_pts = tr_val_pts_dict["VAL"]
            self.logger.info(tr_pts)
            self.logger.info(val_pts)
            orig_tr_inst = tr_inst
            tr_inst = orig_tr_inst.where(col("ID").isin(tr_pts))  # .persist()
            val_inst = orig_tr_inst.where(col("ID").isin(val_pts))  # .persist()
            tr_inst.show()
            val_inst.show()
            self.logger.info("tr_inst_count:{0}//val_inst_count:{1}".format(
                tr_inst.count(), val_inst.count()))
            te_inst.printSchema()
            pipeline_models = pipeline.fit(tr_inst, params=paramGrid)
            max_pr = -1.0
            bestModel = None
            for cur_model in pipeline_models:
                val_pred = cur_model.transform(val_inst)
                agg_prob_val = val_pred.groupBy("ID").agg(
                    max("label").alias("label"),
                    sum(log(1.0 - udf_prob("Probability"))).alias(
                        "inverse_log_sum")) \
                    .select("label",
                            (1.0 - exp(col("inverse_log_sum"))).alias(
                                "rawPrediction"))
                agg_prob_val.show(300, truncate=False)
                cur_pr = BinaryClassificationEvaluator(
                    rawPredictionCol="rawPrediction",
                    labelCol="label",
                    metricName=target_metric).evaluate(agg_prob_val)
                self.logger.info(cur_pr)
                if max_pr < cur_pr:
                    max_pr = cur_pr
                    bestModel = cur_model
            if not bestModel:
                self.logger.info("NO MODEL")
                return
            self.logger.debug(bestModel)
            self.logger.debug(max_pr)
            udf_prob = udf(lambda x: float(x.toArray().tolist()[1]))
            prediction = bestModel.transform(te_inst)
            prediction.show()
            prediction.write.save(
                self.testing_result_dest_template.format(cur_of),
                mode="overwrite")
            tr_result = bestModel.transform(tr_inst).withColumn(
                "Prob", udf_prob("Probability"))
            tr_result.write.save(
                self.training_result_dest_template.format(cur_of),
                mode="overwrite")
            # tr_inst.show_corr_result(tr_result)
            from pyspark.mllib.evaluation import BinaryClassificationMetrics
            self.logger.info("MAX_PRC_VAL:{0}".format(max_pr))
            bestModel.save(self.model_dir_template.format(cur_of, max_pr))
            tr_inst.unpersist()
            val_inst.unpersist()
forecastInDF.createOrReplaceTempView('forecast_df')

dfsql = """select period_end_date, rent_space_code,
       'PROP' || substr(rent_space_code, 2, 4) as property_code,
       iteration, quarter, forecast_period_end_date,
       qtr_number + quarter as cal_qtr_number,
       net_income, erv, capital_value,
       lead(net_income) over (partition by rent_space_code, iteration
                              order by quarter) as net_income_1
from forecast_df"""
df = sess.sql(dfsql).orderBy('period_end_date', 'rent_space_code',
                             'iteration', 'forecast_period_end_date')
df.createOrReplaceTempView('dash_df')

dxsql = """select period_end_date, property_code, iteration,
       forecast_period_end_date, cal_qtr_number,
       sum(net_income) as net_income,
       sum(net_income_1) as net_income1,
       sum(erv) as erv,
       sum(capital_value) as capital_value,
       (sum(net_income_1) - sum(net_income)) as net_income_diff,
       (sum(net_income_1) - sum(net_income)) / sum(net_income) as net_income_pq,
       ln(1 + ((sum(net_income_1) - sum(net_income)) / sum(net_income))) as ln_net_income_pq
from dash_df
where property_code IN $modifier
group by period_end_date, property_code, iteration,
         forecast_period_end_date, cal_qtr_number"""
dx = sess.sql(dxsql).orderBy('period_end_date', 'property_code',
                             'forecast_period_end_date', 'iteration')

# Rolling windows of 4, 12, and 20 quarters (1, 3, and 5 years).
pWindow1y = W.partitionBy('period_end_date', 'property_code', 'iteration') \
    .orderBy('cal_qtr_number').rangeBetween(0, 3)
pWindow3y = W.partitionBy('period_end_date', 'property_code', 'iteration') \
    .orderBy('cal_qtr_number').rangeBetween(0, 11)
pWindow5y = W.partitionBy('period_end_date', 'property_code', 'iteration') \
    .orderBy('cal_qtr_number').rangeBetween(0, 19)

# Annualise by summing log growth over each window and exponentiating back.
prop = dx.withColumn('net_income_1y_pa',
                     (F.exp(F.sum('ln_net_income_pq').over(pWindow1y))) - 1) \
    .withColumn('net_income_3y_pa',
                (F.exp(F.sum('ln_net_income_pq').over(pWindow3y) * 4 / 12)) - 1) \
    .withColumn('net_income_5y_pa',
                (F.exp(F.sum('ln_net_income_pq').over(pWindow5y) * 4 / 20)) - 1) \
    .orderBy('period_end_date', 'property_code', 'iteration',
             'forecast_period_end_date')

tot = prop.groupBy('period_end_date', 'property_code',
                   'forecast_period_end_date') \
    .agg(F.mean('net_income_pq').alias('avg_net_income_pq'),
         F.stddev('net_income_pq').alias('std_net_income_pq')) \
    .orderBy('period_end_date', 'property_code', 'forecast_period_end_date')

# prop.show(10)
# tot.show(10)
# maprdd = tot.rdd.groupBy(lambda x: x[0]).map(lambda x: (x[0], {y[1]: y[2] for y in x[1]}))
# result_dict = dict(maprdd.collect())
data = tot.toPandas()
# ## Generate Test Data

from pyspark.sql.functions import rand

n = 10000000
df1 = spark.range(n).withColumn("x", rand(seed=12345))

# ## Built-in Functions

from pyspark.sql.functions import log, exp, sum

df2 = df1.withColumn("y", log(col("x") / (1.0 - col("x"))))
df3 = df2.withColumn("z", 1.0 / (1.0 + exp(-col("y"))))
df3.show()

%time df3.select(sum("x"), sum("z")).show()

# ## Scalar Python UDFs

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def logit(x):
    from math import log
    return log(x / (1.0 - x))
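# The snippet above stops right after defining logit(). A minimal continuation,
# reusing the df1 built earlier, registers it as a scalar UDF and applies it
# (one Python call per row, so much slower than the built-in log/exp above):
logit_udf = udf(logit, DoubleType())
df2_udf = df1.withColumn("y", logit_udf(col("x")))
df2_udf.show(5)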
# literal zero. We use lit(), a literal function
df_with_x4 = df.withColumn("x4", lit(0))
df_with_x4.show()
# +---+---+----+---+
# | x1| x2|  x3| x4|
# +---+---+----+---+
# |100|  a| 3.0|  0|
# |300|  b| 5.0|  0|
# +---+---+----+---+

# Spark enables us to transform an existing column
# (using its column values) into a new column. The
# following example computes the exponential of
# column "x3" as a new column "x5":

# creates a new column "x5" initialized to exp("x3")
df_with_x5 = df_with_x4.withColumn("x5", exp("x3"))
df_with_x5.show()
# +---+---+---+---+------------------+
# | x1| x2| x3| x4|                x5|
# +---+---+---+---+------------------+
# |100|  a|3.0|  0|20.085536923187668|
# |300|  b|5.0|  0| 148.4131591025766|
# +---+---+---+---+------------------+

# You may perform a `join()` operation between two
# DataFrames to add new columns. The following example
# joins two DataFrames (named `df_with_x5` and `other_df`)
# and creates a new DataFrame, `df_with_x6`.
other_data = [(100, "foo1"), (100, "foo2"), (200, "foo")]
other_df = spark.createDataFrame(other_data, ("k", "v"))
def convert_grade_back_to_normal(self, df):
    # type: (DataFrame) -> DataFrame
    for col in self.grade_cols:
        df = df.withColumn(col, F.lit(1) - F.exp(F.col(col)))
    return df
        StructType([
            StructField('_1', StringType()),
            StructField('_2', DoubleType())
        ])))

if __name__ == '__main__':
    spark = SparkSession.builder.appName('data_frame_creation').getOrCreate()

    data = [(100, "a", 3.0), (300, "b", 5.0)]
    col = ("x1", "x2", "x3")
    df = spark.createDataFrame(data, col)
    df.show()

    # ---------------------------
    # 1 - Adding columns
    # ---------------------------
    ## 1.1 Simple addition
    df_with = df.withColumn("x4", lit(0)).withColumn("x5", exp("x3"))
    df_with.show()

    ## 1.2 Left join + rename | rand()
    other_data = [(100, "foo1"), (100, "foo2"), (200, "foo")]
    other_df = spark.createDataFrame(other_data, ("k", "v"))
    df_with_x6 = df_with.join(other_df, df_with.x1 == other_df.k, 'leftouter')\
        .drop('k').withColumnRenamed('v', 'x6')
    df_with_x6.show()
    df_with_x6.withColumn("x8", rand()).show()

    # ---------------------------
    # 2 - Aggregating multiple columns
    # (useful for association rules)
    # ---------------------------
    df = spark.sparkContext.parallelize([("mary", "lemon", 2.00),
                                         ("adam", "grape", 1.22),
""" There are many ways that you can use to create a column in a PySpark Dataframe. 1. Using Spark Native Functions {withColumn} 2. Using Spark UDFs 3. Using RDDs 4. Using Pandas UDF """ """ 1. Using Spark Native Functions {withColumn} : """ casesWithNewConfirmed = cases.withColumn("NewConfirmed", 100 + func.col("confirmed")) # casesWithNewConfirmed.show() casesWithExpConfirmed = cases.withColumn("ExpConfirmed", func.exp("confirmed")) # casesWithExpConfirmed.show() """ 2. Using Spark UDFs : Sometimes we want to do complicated things to a column or multiple columns.This could be thought of as a map operation on a PySpark Dataframe to a single column or multiple columns. While Spark SQL functions do solve many use cases when it comes to column creation, I use Spark UDF whenever I need more matured Python functionality. """ def casesHighLow(confirmed): if confirmed < 50: return "Low" else: return "High"
def run(self, rawCountsSparkDF, columnBatchSize, outFileGroupedByGene=None):
    '''
    Arguments:
        rawCountsSparkDF:
            a dataframe with columns: the Name column from the salmon
            quant.sf files; for each sample there is a column containing
            the NumReads column of the salmon quant.sf file. The column
            name == the sample name.

        columnBatchSize: an integer
            The GTEx training data set has 10409 numeric columns. This
            causes a java.lang.StackOverflowError because the DAG is too
            big; increasing spark driver memory does not help. The
            workaround is to sum smaller batches of columns and cache the
            result of each batch.

        outFileGroupedByGene: optional file path
            if defined, the groupedByGene dataframe will be saved. This is
            a workaround: we are having trouble calculating the row sums
            needed to compute the estimated scaling factors (OOM
            exceptions). Using the grouped-by-gene counts matrix you can
            have DESeq calculate the scaling factors without having to
            process all the salmon quant.sf files.

    returns: (scalingFactorsDF, countDF)
        scalingFactorsDF:
            a spark data frame with columns 'sampleName' and 'scalingFactor'

        countDF:
            contains the integer counts of the transcripts grouped by
            geneId. The first column name will be 'geneId'; the following
            column names will be the sample names.
    '''
    self.logger.warn("run BEGIN")
    self.logger.warn("run rawCountsSparkDF numRows:{} numCols:{}".format(
        rawCountsSparkDF.count(), len(rawCountsSparkDF.columns)))

    # pass transients to enable unit testing
    rawCountsSparkDF.createOrReplaceTempView("rawCounts")
    countsSparkDF = self._groupByGeneAndSum(rawCountsSparkDF)
    retIntSparkDF = self._convertToLong(countsSparkDF)
    if outFileGroupedByGene:
        self.logger.warn("saving integer grouped-by counts to :{}".format(
            outFileGroupedByGene))
        retIntSparkDF.coalesce(1).write.csv(outFileGroupedByGene,
                                            mode='overwrite', header=True)
        self.logger.warn(
            "finished writing integer grouped-by counts to :{}".format(
                outFileGroupedByGene))
    countsSparkDF = None

    # 6.a) skip the first column, i.e. gene_id
    columnNames = retIntSparkDF.columns[1:]
    logCountsSparkDF = self._calculateLogs(retIntSparkDF, columnNames)

    # 6.b) filter out genes with one or more nulls
    # i) removes genes with a zero in one or more samples, i.e. genes that
    #    are type specific. We want to focus on the housekeeping genes:
    #    genes transcribed at similar levels regardless of tissue type.
    filteredDF = logCountsSparkDF.na.drop()
    filteredDF.checkpoint()
    logCountsSparkDF = None

    # 6.c) calculate the mean of the row sum (skip the gene_id column)
    columns = filteredDF.columns[1:]
    rowSumsDF = self.rowSums(filteredDF, columns, columnBatchSize)
    rowSumsDF.checkpoint()
    n = len(rowSumsDF.columns) - 2  # do not count geneId or rowSum columns
    rowMeansDF = rowSumsDF.withColumn("rowMean", (rowSumsDF.rowSum / n))
    rowMeansDF.checkpoint()
    filteredDF = None

    # 6.d) subtract the average log values from the log(counts)
    # i) this equals log(numRead_x / average numRead_x)
    # skip the first and last 2 columns, i.e. geneId, rowSum, rowMean
    columnNames = rowMeansDF.columns[1:-2]
    ratioDF = self._subtractRowMeanFromLogCounts(rowMeansDF, columnNames)
    ratioDF.checkpoint()
    rowMeansDF = None

    # 6.e) calculate the median of the ratio for each sample
    # i) the median is robust
    # skip geneId
    columnNames = ratioDF.columns[1:]
    logMedianDF = self.median(ratioDF, columnNames)
    logMedianDF.checkpoint()
    ratioDF = None

    newColNames = [self.getSampleNames(c) for c in logMedianDF.columns]
    logScalingFactorsDF = logMedianDF.toDF(*newColNames)
    logScalingFactorsDF.checkpoint()

    # 6.f) convert the medians back to linear scale
    scalingFactorsDF = logScalingFactorsDF.select(
        *(exp(c) for c in logScalingFactorsDF.columns))

    # fix the column names: change 'EXP(ctrl_1)' to 'ctrl_1', then transpose
    # into two columns, 'sampleName' and 'scalingFactor'
    retScalingFactorsDF = self._fixScalingFactors(scalingFactorsDF)

    # fix the column names: change 'sum(kras)' to 'kras'
    retIntCountSparkDF = self._fixSumColNames(retIntSparkDF)

    self.logger.warn("run END\n")
    return (retScalingFactorsDF, retIntCountSparkDF)
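# The rowSums helper called in step 6.c is not shown above. A minimal sketch
# of the batching idea the docstring describes (sum a slice of columns at a
# time and cache between batches so the DAG stays shallow) might look like
# this; it is written as a free function, and the body is an assumption, not
# the source's implementation.
from functools import reduce
from pyspark.sql.functions import col, lit

def rowSums(df, columnNames, columnBatchSize):
    # Accumulate the row sum batch by batch to keep the query plan small.
    df = df.withColumn("rowSum", lit(0.0))
    for start in range(0, len(columnNames), columnBatchSize):
        batch = columnNames[start:start + columnBatchSize]
        batchSum = reduce(lambda a, b: a + b, [col(c) for c in batch])
        # Cache after each batch so the next pass does not rebuild the
        # whole lineage.
        df = df.withColumn("rowSum", col("rowSum") + batchSum).cache()
    return df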
def expit_pandas_udf(x):
    from numpy import exp
    return 1.0 / (1.0 + exp(-x))
def expit_udf(x):
    from math import exp
    return 1.0 / (1.0 + exp(-x))
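# Neither expit function above is registered in the snippets themselves. A
# minimal sketch of the usual wiring (the DataFrame `df` and the column name
# "y" are assumptions):
from pyspark.sql.functions import pandas_udf, udf, col
from pyspark.sql.types import DoubleType

# Vectorized: receives a whole pandas Series, so numpy's exp applies elementwise.
expit_vec = pandas_udf(expit_pandas_udf, returnType=DoubleType())
# Scalar: called once per row with a single float, hence math.exp.
expit_scalar = udf(expit_udf, DoubleType())

result = df.select(expit_vec(col("y")).alias("p_vec"),
                   expit_scalar(col("y")).alias("p_scalar"))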
rFormula = RFormula(formula="log_price ~ . - price",
                    featuresCol="features",
                    labelCol="log_price",
                    handleInvalid="skip")

lr = LinearRegression(labelCol="log_price", predictionCol="log_pred")
pipeline = Pipeline(stages=[rFormula, lr])
pipelineModel = pipeline.fit(logTrainDF)
predDF = pipelineModel.transform(logTestDF)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Exponentiate
# MAGIC
# MAGIC In order to interpret our RMSE, we need to convert our predictions back from the logarithmic scale.

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, exp

expDF = predDF.withColumn("prediction", exp(col("log_pred")))

regressionEvaluator = RegressionEvaluator(labelCol="price",
                                          predictionCol="prediction")
rmse = regressionEvaluator.setMetricName("rmse").evaluate(expDF)
r2 = regressionEvaluator.setMetricName("r2").evaluate(expDF)
print(f"RMSE is {rmse}")
print(f"R2 is {r2}")
from pyspark.ml.regression import LinearRegression

linreg = LinearRegression(maxIter=500, regParam=0.0)
lm = linreg.fit(train_df)
print("Intercept ", lm.intercept)
print("Coefficients ", lm.coefficients)

y_pred = lm.transform(test_df)
y_pred.select('features', 'label', 'prediction').show(5)

from pyspark.sql.functions import exp

y_pred = y_pred.withColumn("y_pred", exp('prediction'))
y_pred.show(5)

from pyspark.ml.evaluation import RegressionEvaluator

rmse_evaluator = RegressionEvaluator(labelCol="price",
                                     predictionCol="y_pred",
                                     metricName="rmse")
lm_rmse = rmse_evaluator.evaluate(y_pred)
print("Root mean square ", lm_rmse)
rsquare_evaluator = RegressionEvaluator(labelCol="price",
                                        predictionCol="y_pred",
def compile_exp(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.exp(src_column)
    parallelism=args.num_workers)
model = model_selection.fit(train_df).set_output_columns(['Sales'])

history = model.get_history()
best_val_rmspe = min(history['val_exp_rmspe'])
print('Best RMSPE: %f' % best_val_rmspe)

# Save the trained model.
model.keras().save(args.local_checkpoint_file)
print('Written checkpoint to %s' % args.local_checkpoint_file)

# =================== #
# 3. FINAL PREDICTION #
# =================== #

print('================')
print('Final prediction')
print('================')

pred_df = model.transform(test_df)
# Convert from log domain to real Sales numbers
pred_df = pred_df.withColumn('Sales', F.exp(pred_df.Sales))

submission_df = pred_df.select(pred_df.Id.cast(T.IntegerType()),
                               pred_df.Sales).toPandas()
submission_df.sort_values(by=['Id']).to_csv(args.local_submission_csv,
                                            index=False)
print('Saved predictions to %s' % args.local_submission_csv)

spark.stop()
def tocolumns(df, expr):
    import pyspark.sql.functions as fcns

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)

    elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]

    elif isinstance(expr, histbook.expr.Call):
        if expr.fcn == "abs" or expr.fcn == "fabs":
            return fcns.abs(tocolumns(df, expr.args[0]))
        elif expr.fcn == "max" or expr.fcn == "fmax":
            return fcns.greatest(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "min" or expr.fcn == "fmin":
            return fcns.least(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "arccos":
            return fcns.acos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arccosh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arcsin":
            return fcns.asin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arcsinh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arctan2":
            return fcns.atan2(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "arctan":
            return fcns.atan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arctanh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "ceil":
            return fcns.ceil(tocolumns(df, expr.args[0]))
        elif expr.fcn == "copysign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "cos":
            return fcns.cos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "cosh":
            return fcns.cosh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "rad2deg":
            return tocolumns(df, expr.args[0]) * (180.0 / math.pi)
        elif expr.fcn == "erfc":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "erf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp":
            return fcns.exp(tocolumns(df, expr.args[0]))
        elif expr.fcn == "expm1":
            return fcns.expm1(tocolumns(df, expr.args[0]))
        elif expr.fcn == "factorial":
            return fcns.factorial(tocolumns(df, expr.args[0]))
        elif expr.fcn == "floor":
            return fcns.floor(tocolumns(df, expr.args[0]))
        elif expr.fcn == "fmod":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "gamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "hypot":
            return fcns.hypot(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "isinf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isnan":
            return fcns.isnan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "lgamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "log10":
            return fcns.log10(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log1p":
            return fcns.log1p(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log":
            return fcns.log(tocolumns(df, expr.args[0]))
        elif expr.fcn == "pow":
            return fcns.pow(tocolumns(df, expr.args[0]),
                            tocolumns(df, expr.args[1]))
        elif expr.fcn == "deg2rad":
            return tocolumns(df, expr.args[0]) * (math.pi / 180.0)
        elif expr.fcn == "sinh":
            return fcns.sinh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sin":
            return fcns.sin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sqrt":
            return fcns.sqrt(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tanh":
            return fcns.tanh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tan":
            return fcns.tan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "trunc":
            raise NotImplementedError(expr.fcn)  # FIXME (fcns.trunc is for dates)
        elif expr.fcn == "xor":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "conjugate":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "heaviside":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isfinite":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "left_shift" and isinstance(expr.args[1],
                                                     histbook.expr.Const):
            return fcns.shiftLeft(tocolumns(df, expr.args[0]),
                                  expr.args[1].value)
        elif expr.fcn == "log2":
            return fcns.log2(tocolumns(df, expr.args[0]))
        elif expr.fcn == "logaddexp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "logaddexp":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "mod" or expr.fcn == "fmod":
            return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1])
        elif expr.fcn == "right_shift" and isinstance(expr.args[1],
                                                      histbook.expr.Const):
            return fcns.shiftRight(tocolumns(df, expr.args[0]),
                                   expr.args[1].value)
        elif expr.fcn == "rint":
            return fcns.rint(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "where":
            return fcns.when(tocolumns(df, expr.args[0]),
                             tocolumns(df, expr.args[1])).otherwise(
                                 tocolumns(df, expr.args[2]))
        elif expr.fcn == "numpy.equal":
            return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.not_equal":
            return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less":
            return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less_equal":
            return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.isin":
            return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_not":
            return ~tocolumns(df, expr.args[0])
        elif expr.fcn == "numpy.add":
            return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.subtract":
            return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.multiply":
            return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.true_divide":
            return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_or":
            return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_and":
            return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1])
        else:
            raise NotImplementedError(expr.fcn)

    else:
        raise AssertionError(expr)
history = keras_model.getHistory()
best_val_rmspe = min(history['val_exp_rmspe'])
print('Best RMSPE: %f' % best_val_rmspe)

# Save the trained model.
keras_model.save(args.local_checkpoint_file)
print('Written checkpoint to %s' % args.local_checkpoint_file)

# ================ #
# FINAL PREDICTION #
# ================ #

print('================')
print('Final prediction')
print('================')

pred_df = keras_model.transform(test_df)
pred_df.printSchema()
pred_df.show(5)

# Convert from log domain to real Sales numbers
pred_df = pred_df.withColumn('Sales_pred', F.exp(pred_df.Sales_output))

submission_df = pred_df.select(pred_df.Id.cast(T.IntegerType()),
                               pred_df.Sales_pred).toPandas()
submission_df.sort_values(by=['Id']).to_csv(args.local_submission_csv,
                                            index=False)
print('Saved predictions to %s' % args.local_submission_csv)

spark.stop()