def _recency_aggregation(self, save_results, eval_path):
    """Aggregate prediction quality by days remaining until deactivation.

    Rows of ``self.pred_df`` with ``recency_x`` in [0, 365] are kept,
    ``days_until_deact = 365 - recency_x`` and a per-row log loss are
    derived, and the rows are grouped by ``days_until_deact``. The sorted
    result is collected to pandas and stored in ``self.recency_agg``.

    Args:
        save_results: when truthy, also write the table as a TSV file.
        eval_path: path prefix; the file is written to
            ``<eval_path>model=<self.name>/recency_aggregation.tsv``.
    """
    print('[ '+str(datetime.utcnow())+' ] : Calculating recency aggregations for predictions by model = '+self.name)

    within_year = (F.col('recency_x') >= 0) & (F.col('recency_x') <= 365)
    days_left = (F.lit(365) - F.col('recency_x')).cast(IntegerType())
    # Binary log loss: -log(p) for deactivated rows, -log(1 - p) otherwise.
    row_log_loss = F.when(
        F.col('deactivated') == 1, -F.log(F.col('prob_deact'))
    ).otherwise(-F.log(F.lit(1.0) - F.col('prob_deact')))
    # Accuracy uses "prediction" column, which assigns consumers to class
    # based on cutoff point of 0.50
    is_correct = (F.col('deactivated') == F.col('prediction')).cast(IntegerType())

    scored = (
        self.pred_df
        .filter(within_year)
        .withColumn('days_until_deact', days_left)
        .withColumn('log_loss', row_log_loss)
    )
    per_day = scored.groupBy('days_until_deact').agg(
        F.count('consumer_id').alias('count_users'),
        F.sum('deactivated').alias('deacts_actual'),
        # Expected number of deactivations, truncated to an integer.
        F.sum('prob_deact').cast(IntegerType()).alias('deacts_pred'),
        F.avg('log_loss').alias('avg_log_loss'),
        F.avg('prob_deact').alias('avg_prob_deact'),
        F.avg(is_correct).alias('accuracy'),
    )
    with_ratios = (
        per_day
        .withColumn('pct_deact_actual', F.col('deacts_actual') / F.col('count_users'))
        .withColumn('pct_deact_pred', F.col('deacts_pred') / F.col('count_users'))
        .withColumn('pred_over_actual_deacts', F.col('deacts_pred') / F.col('deacts_actual'))
        .withColumn('diff_pct_deact', F.col('pct_deact_pred') - F.col('pct_deact_actual'))
    )
    summary = with_ratios.sort('days_until_deact').toPandas()

    self.recency_agg = summary
    if save_results:
        out_path = eval_path + 'model=' + self.name + '/recency_aggregation.tsv'
        summary.to_csv(out_path, sep='\t', index=False)
def create_KMeans_features(df, original=True):
    """Append a ``KMeans_features`` vector column to ``df``.

    With ``original=True`` two log-scaled columns are derived first
    (``non_passive_events`` and ``public_repos_gists``) and assembled;
    otherwise the existing ``frequency`` and ``recency`` columns are
    assembled directly. Rows the assembler considers invalid are skipped.

    Returns:
        The transformed DataFrame.
    """
    if original:
        # NOTE(review): the "+ 1" sits inside the subtracted sum, so this is
        # log(frequency - passive - 1), not log(frequency - passive + 1) --
        # confirm that is intended.
        passive_plus_one = (df.DeleteEvent_count + df.GollumEvent_count
                            + df.IssueCommentEvent_count
                            + df.MemberEvent_count
                            + df.WatchEvent_count + 1)
        df = df.withColumn('non_passive_events',
                           F.log(df.frequency - passive_plus_one))
        df = df.withColumn(
            'public_repos_gists',
            F.log(df.public_repos_count + df.public_gists_count + 1))
        input_cols = ['non_passive_events', 'public_repos_gists']
    else:
        input_cols = ['frequency', 'recency']

    # Assemble pipeline
    assembler = VectorAssembler(
        inputCols=input_cols,
        outputCol="KMeans_features").setHandleInvalid("skip")
    fitted = Pipeline(stages=[assembler]).fit(df)
    return fitted.transform(df)
def prepare_data():
    """Load the data and build the scaled feature vector for k-means.

    Returns:
        A tuple ``(scaled_data, features)``: the DataFrame with a
        standardised ``features`` vector column, and the list of input
        column names that went into it.
    """
    # Read data from files.
    frame = load_data()

    # These features are not normally distributed, so log-scaled copies are
    # added to bring them closer to normal, which suits k-means better.
    for target, source in (('log_age', 'age'),
                           ('log_avg_buy', 'avg_buy'),
                           ('log_min_buy', 'min_buy'),
                           ('log_max_buy', 'max_buy')):
        frame = frame.withColumn(target, F.log(source))

    # Every column from position 4 onwards is used as a k-means feature.
    features = frame.columns[4:]
    assembled = VectorAssembler(
        inputCols=features, outputCol='features_unscaled').transform(frame)

    # Standard scaling: mean-centred, unit standard deviation.
    scaler = StandardScaler(inputCol='features_unscaled', outputCol='features',
                            withStd=True, withMean=True)
    scaled_data = scaler.fit(assembled).transform(assembled)
    return scaled_data, features
def popularity_based_metrics(ratings, tips):
    """Compute Adamic-Adar style popularity scores for user pairs.

    User pairs are formed from users that share a ``business_id`` in
    ``ratings``. For every pair, ``1 / log(n)`` is summed over the shared
    businesses, once with ``n`` = number of ratings of the business
    (``aa_pop_ratings``) and once with ``n`` = number of tips
    (``aa_pop_tips``).

    Returns:
        The outer join of both score tables on ``user_id``/``user_id_2``.
    """
    # Unordered pairs of users sharing a business. `ren` presumably renames
    # the non-key columns with a "_2" suffix -- verify against its definition.
    user_pairs = ratings.join(
        ren(ratings, ["business_id"]),
        "business_id").filter(col("user_id") < col("user_id_2"))

    def adamic_adar(per_business_counts, count_col, score_col):
        weighted = user_pairs.join(per_business_counts, "business_id")
        return weighted.groupBy("user_id", "user_id_2").agg(
            F.sum(1 / F.log(count_col)).cast("float").alias(score_col))

    total_reviews = ratings.groupBy("business_id").agg(
        F.count(F.lit(1)).alias("total_reviews"))
    adamic_ratings = adamic_adar(total_reviews, "total_reviews",
                                 "aa_pop_ratings")

    # Right join against the distinct users present in `ratings`.
    fold_tips = tips.join(
        ratings.select("user_id").distinct(), "user_id", "right")
    total_tips = fold_tips.groupBy("business_id").agg(
        F.count(F.lit(1)).alias("total_tips"))
    adamic_tips = adamic_adar(total_tips, "total_tips", "aa_pop_tips")

    return adamic_ratings.join(adamic_tips, ["user_id", "user_id_2"], "outer")
def recency_aggregation(self):
    """Aggregate prediction quality by days remaining until deactivation.

    The probability column is ``masked_prob`` when ``self.masked`` is set,
    else ``prob_deact_cal`` when ``self.calibrated`` is set, else
    ``prob_deact``. Rows of ``self.prediction_df`` with ``recency_x`` in
    [0, 365] are grouped by ``days_until_deact = 365 - recency_x``; the
    sorted pandas result is stored in ``self.recency_df``, optionally
    written to ``<self.eval_path>recency_aggregation.tsv``, and
    ``self._recency_plots()`` is then called.
    """
    print('[ {0} ] : Calculating aggregations by recency for model predictions'.format(datetime.utcnow()))

    if self.masked:
        prob_col = 'masked_prob'
    elif self.calibrated:
        prob_col = 'prob_deact_cal'
    else:
        prob_col = 'prob_deact'

    within_year = (F.col('recency_x') >= 0) & (F.col('recency_x') <= 365)
    days_left = (F.lit(365) - F.col('recency_x')).cast(IntegerType())
    # Binary log loss: -log(p) for deactivated rows, -log(1 - p) otherwise.
    row_log_loss = F.when(
        F.col('deactivated') == 1, -F.log(F.col(prob_col))
    ).otherwise(-F.log(F.lit(1.0) - F.col(prob_col)))
    # A row counts as correct when the rounded probability equals the label.
    is_correct = (
        F.col('deactivated') == F.round(F.col(prob_col), 0)
    ).cast(IntegerType())

    scored = (
        self.prediction_df
        .filter(within_year)
        .withColumn('days_until_deact', days_left)
        .withColumn('log_loss', row_log_loss)
    )
    per_day = scored.groupBy('days_until_deact').agg(
        F.count('consumer_id').alias('count_users'),
        F.sum('deactivated').alias('deacts_actual'),
        # Expected number of deactivations, truncated to an integer.
        F.sum(prob_col).cast(IntegerType()).alias('deacts_pred'),
        F.avg('log_loss').alias('avg_log_loss'),
        F.avg(prob_col).alias('avg_prob_deact'),
        F.avg(is_correct).alias('accuracy'),
    )
    with_ratios = (
        per_day
        .withColumn('pct_deact_actual', F.col('deacts_actual') / F.col('count_users'))
        .withColumn('pct_deact_pred', F.col('deacts_pred') / F.col('count_users'))
        .withColumn('pred_over_actual_deacts', F.col('deacts_pred') / F.col('deacts_actual'))
        .withColumn('diff_pct_deact', F.col('pct_deact_pred') - F.col('pct_deact_actual'))
    )
    summary = with_ratios.sort('days_until_deact').toPandas()

    self.recency_df = summary
    if self.save_results:
        summary.to_csv(self.eval_path + 'recency_aggregation.tsv', sep='\t', index=False)
    self._recency_plots()
def calculate_volatility(rolling_windows=20):
    """Compute rolling close-to-close and Parkinson volatility per ISIN.

    Args:
        rolling_windows: number of preceding rows (the current row is
            excluded) in each rolling window.

    Returns:
        A pandas DataFrame with columns ``Date``, ``ISIN``, ``volatility``
        and ``Parkinsonvol``.
    """
    spark = SparkSession.builder.master('local[*]').appName('Volatility').getOrCreate()
    reader = spark.read.format('csv').option('header', 'true')
    df = reader.load('/media/guolewen/research_data/compustats/*.csv')

    # adjust price with stock/dividend split ratio
    for adjusted, raw in (('adjprccd', 'prccd'),
                          ('adjprchd', 'prchd'),
                          ('adjprcld', 'prcld')):
        df = df.withColumn(adjusted, df[raw] / df['ajexdi'])

    # One time-ordered window per security, plus the trailing rolling window
    # that ends on the previous row.
    win_spec = Window.partitionBy('isin').orderBy('datadate')
    win_rolling = Window.partitionBy('isin').orderBy('datadate').rowsBetween(-rolling_windows, -1)

    # Squared daily log return: ln(close_t / close_{t-1}) ** 2.
    df = df.withColumn('ladjprccd', lag('adjprccd').over(win_spec))
    log_return = log(df['adjprccd'] / df['ladjprccd'])
    df = df.withColumn('retsq', pow(log_return, 2))

    # traditional volatility approach: square root of the average squared
    # daily log returns in the rolling window
    df = df.withColumn('volatility', sqrt(avg('retsq').over(win_rolling)))

    # Squared daily log high/low range; nulls are filled with 0 (this is for
    # the case if no trading during the day).
    log_range = log(df['adjprchd'] / df['adjprcld'])
    df = df.withColumn('loghlsq', pow(log_range, 2)).fillna(0, subset=['loghlsq'])

    # Parkinson's extreme value method: sqrt(1 / (4 ln 2) * mean(loghlsq)).
    parkinson_scale = 1/(4*np.log(2))
    df = df.withColumn('Parkinsonvol', sqrt(parkinson_scale * avg('loghlsq').over(win_rolling)))

    result = df.selectExpr('datadate as Date', 'isin as ISIN', 'volatility', 'Parkinsonvol')
    return result.toPandas()
def _cross_entropy(y_true, y_prob, df=None, normalize=False): """Function to calculate cross entropy If y_true or y_prob is of shape (num_samples, ), the labels are assumed to be binary """ eps = 1e-15 if df is None: # Pre-processing if y_prob.ndim == 1: y_prob = np.vstack((1 - y_prob, y_prob)).T if y_true.ndim == 1: y_true = np.vstack((1 - y_true, y_true)).T y_prob = np.clip(y_prob, eps, 1 - eps) # Re-normalize and calculate entropy y_prob /= y_prob.sum(axis=1)[:, np.newaxis] entropy_arr = -(y_true * np.log(y_prob)).sum(axis=1) return entropy_arr.mean() if normalize else entropy_arr.sum() else: df = df.withColumn(y_prob, F.when(F.col(y_prob) < eps, eps) .when(F.col(y_prob) > (1 - eps), 1 - eps) .otherwise(F.col(y_prob))) df = df.withColumn('entropy', -F.col(y_true) * F.log(F.col(y_prob)) - (1 - F.col(y_true)) * F.log(1 - F.col(y_prob))) if normalize: return df.agg(F.avg('entropy').alias('loss')).select('loss') else: return df.agg(F.sum('entropy').alias('loss')).select('loss')
def log_loss(df):
    """Return the mean binary log loss of ``df``.

    ``df`` must have a ``label`` column and an ``outcome`` column holding the
    predicted probability. Outcomes of exactly 1.0 and 0.0 are replaced by
    ``1 - epsilon`` and ``epsilon`` so the logarithm stays defined.
    """
    epsilon = 1e-12

    # Pull exact ones just below 1 ...
    capped = df.select(
        "label",
        when(df.outcome == 1.0, 1.0-epsilon).otherwise(df.outcome).alias("p"))
    # ... and exact zeros just above 0.
    clipped = capped.select(
        "label",
        when(capped.p == .0, epsilon).otherwise(capped.p).alias("p"))

    row_loss = when(clipped.label == 1, -log(clipped.p)).otherwise(-log(1-clipped.p))
    scored = clipped.select("p", "label", row_loss.alias("log_loss"))
    return scored.selectExpr("mean(log_loss)").first()[0]
def evaluate_agg_prob(self):
    """Combine per-row probabilities into one probability per ``ID``.

    For ``self.target_disch_col`` the saved training and testing result
    parquet files are read and, for every ``ID``, the row probabilities
    ``p`` are combined as ``1 - exp(sum(log(1 - p)))``, i.e.
    ``1 - prod(1 - p)``. Both aggregates are joined with
    ``self.target_terminal_outcome_table`` on ``ID``.

    Returns:
        ``(training_aggregate, testing_aggregate)``, or ``None`` when the
        result files cannot be read (the AnalysisException is logged and
        swallowed).
    """
    import pyspark
    from pyspark.sql.functions import col
    from pyspark.sql.functions import udf, log, sum, exp
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.sql.functions import count

    # Second entry of the probability vector. The UDF declares no return
    # type, so the value is cast to double where it is used.
    udf_prob = udf(lambda x: x.toArray().tolist()[1])

    def read_scores(path_template, outcome):
        raw = self.spark.read.parquet(path_template.format(outcome))
        return raw.select(
            "ID", "TIME_SPAN",
            udf_prob("Probability").cast("double").alias("probability"),
            col("{0}_label".format(outcome)).alias("label"))

    def aggregate_per_id(scores):
        log_none = sum(log(1.0 - col("probability"))).alias("agg_prob")
        return scores.groupBy("ID").agg(log_none).select(
            "ID", (1.0 - exp("agg_prob")).alias("agg_prob").cast("double"))

    cur_terminal_df = self.get_terminal_df()
    self.flatten_terminal_outcome()

    for cur_of in [self.target_disch_col]:
        self.logger.info(cur_of)
        try:
            cur_training_df = read_scores(
                self.training_result_dest_template, cur_of)
            cur_testing_df = read_scores(
                self.testing_result_dest_template, cur_of)
        except pyspark.sql.utils.AnalysisException as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            self.logger.info(message)
            self.logger.info("PROCESS")
            self.logger.debug("{0} Not exists".format(cur_of))
            continue

        cur_tr_agg = aggregate_per_id(cur_training_df)
        cur_te_agg = aggregate_per_id(cur_testing_df)

        # TODO terminal_df is flattened terminal DX for now. Need to merge
        # with other DF with ALI,AKI,ALF,AHF column separately.
        cur_tr_agg = cur_tr_agg.join(self.target_terminal_outcome_table, "ID")
        cur_te_agg = cur_te_agg.join(self.target_terminal_outcome_table, "ID")
        return cur_tr_agg, cur_te_agg
def logloss(model, df, probabilities_col='probability'):
    """Return the mean binary log loss of ``model`` on ``df``.

    Args:
        model: fitted transformer whose ``transform`` adds the probability
            vector column.
        df: DataFrame with a numeric ``label`` column (0/1).
        probabilities_col: name of the probability vector column; entry 1 is
            taken as the positive-class probability.

    Returns:
        The mean of ``-y*log(p) - (1-y)*log(1-p)`` over all rows.
    """
    eps = 1e-15
    df = model.transform(df)
    df = df.withColumn(
        'proba',
        F.udf(lambda v: float(v[1]), FloatType())(F.col(probabilities_col)))

    # Clip to [eps, 1 - eps]. Spark's log() yields NULL for non-positive
    # input, which would make the whole row loss NULL and silently drop the
    # row from the mean; the float32 'proba' column makes exact 0/1 common.
    # Computed in double because 1 - eps is not representable in float32.
    proba = F.col('proba').cast('double')
    clipped = (F.when(proba < eps, eps)
               .when(proba > (1. - eps), 1. - eps)
               .otherwise(proba))

    df = df.withColumn(
        'logloss',
        -F.col('label') * F.log(clipped)
        - (1. - F.col('label')) * F.log(1. - clipped))
    return df.agg(F.mean('logloss')).first()[0]
def logloss(model, df, probabilities_col='probability', negative_downsampling_rate=1.0):
    """Return the mean binary log loss of ``model`` on ``df``.

    The positive-class probability column ``proba`` is produced by
    ``df_with_proba_column``, which also receives
    ``negative_downsampling_rate``.

    Args:
        model: fitted model passed on to ``df_with_proba_column``.
        df: DataFrame with a numeric ``label`` column (0/1).
        probabilities_col: name of the probability vector column.
        negative_downsampling_rate: passed on to ``df_with_proba_column``.

    Returns:
        The mean of ``-y*log(p) - (1-y)*log(1-p)`` over all rows.
    """
    eps = 1e-15
    df = df_with_proba_column(model, df, probabilities_col,
                              negative_downsampling_rate)

    # Clip to [eps, 1 - eps]. Spark's log() yields NULL for non-positive
    # input, which would make the whole row loss NULL and silently drop the
    # row from the mean.
    proba = F.col('proba').cast('double')
    clipped = (F.when(proba < eps, eps)
               .when(proba > (1. - eps), 1. - eps)
               .otherwise(proba))

    df = df.withColumn(
        'logloss',
        -F.col('label') * F.log(clipped)
        - (1. - F.col('label')) * F.log(1. - clipped))
    return df.agg(F.mean('logloss')).first()[0]
def preprocess(df, logger):
    """Turn raw flow records into the numeric feature table used for training.

    Steps: keep and cast the populated columns, drop rows with nulls,
    log-transform ``td``, ``ipkt`` and ``ibyt`` (``log(x + 1)``), and expand
    the protocol and flag columns into 5 and 6 one-hot columns.

    Args:
        df: Spark DataFrame with columns ``td``, ``sp``, ``dp``, ``pr``,
            ``flg``, ``ipkt`` and ``ibyt``.
        logger: logger used for progress and error messages.

    Returns:
        DataFrame with ``td``, ``flag_onehot0..5``, ``proto_onehot0..4``,
        ``ipkt`` and ``ibyt``.

    Raises:
        SystemExit: with status 1 when ``df`` is empty.
    """
    # Check if database is empty
    if df.rdd.isEmpty():
        logger.error("Couldn't read data")
        # Non-zero status so callers and schedulers can see the failure.
        sys.exit(1)

    # Select interesting columns (the only ones containing data)
    logger.info('Selecting interesting columns')
    df = df.select(df.td.cast('float'), df.sp.cast('int'), df.dp.cast('int'),
                   df.pr, df.flg, df.ipkt.cast('int'), df.ibyt.cast('int'))

    # Remove rows with NaN values
    df = df.dropna()

    # Apply a logarithmic transformation to the td, ipkt and ibyt columns
    logger.info('Applying logarithmic transorm to columns')
    for name in ('td', 'ipkt', 'ibyt'):
        df = df.withColumn(name, F.log(df[name] + 1))

    proto_transform = F.udf(transform_protocol, ArrayType(IntegerType()))
    flag_transform = F.udf(process_flag, ArrayType(IntegerType()))

    # Transform protocol column into one-hot encoding
    logger.info('Transforming protocol into one-hot encoding')
    df = df.withColumn('proto_onehot', proto_transform(df.pr))
    proto_cols = ['proto_onehot%d' % i for i in range(5)]
    for i, name in enumerate(proto_cols):
        df = df.withColumn(name, df.proto_onehot[i])

    # Decode flag column and transform it into one-hot encoding
    logger.info('Transforming flag into one-hot encoding')
    df = df.withColumn('flag_onehot', flag_transform(df.flg))
    flag_cols = ['flag_onehot%d' % i for i in range(6)]
    for i, name in enumerate(flag_cols):
        df = df.withColumn(name, df.flag_onehot[i])

    # Select final columns for training the algorithms
    final_cols = ([df.td.cast('float')]
                  + [df[name] for name in flag_cols]
                  + [df[name] for name in proto_cols]
                  + [df.ipkt.cast('float'), df.ibyt.cast('float')])
    return df.select(*final_cols)
def main(spark, train_file, val_file, test_file, ext_type):
    """Fit an implicit-feedback ALS model and print validation/test MAP.

    Args:
        spark: active SparkSession.
        train_file, val_file, test_file: parquet paths of the three splits.
        ext_type: transformation applied to the training ``count`` column:
            ``"log"`` (ln(1+c)), ``"square"``, ``"cube"`` or ``"log2"``
            (log2(1+c)); any other value leaves the counts unchanged.
    """
    train_df = spark.read.parquet(train_file)
    print("Loaded train file")
    val_df = spark.read.parquet(val_file)
    print("Loaded val file")
    test_df = spark.read.parquet(test_file)
    print("Loaded test file")

    val_df.createOrReplaceTempView("val_df")
    users_val = spark.sql("SELECT DISTINCT userIndex FROM val_df")
    print("Created val users list")
    test_df.createOrReplaceTempView("test_df")
    users_test = spark.sql("SELECT DISTINCT userIndex FROM test_df")
    print("Created test users list")

    # Build the replacement expression first, then apply it in one place.
    rescaled = None
    if ext_type == "log":
        rescaled = F.log(1 + train_df["count"])
    elif ext_type == "square":
        rescaled = train_df["count"] * train_df["count"]
    elif ext_type == "cube":
        rescaled = train_df["count"] * train_df["count"] * train_df["count"]
    elif ext_type == "log2":
        rescaled = F.log(1 + train_df["count"]) / math.log(2)
    if rescaled is not None:
        train_df = train_df.withColumn("count", rescaled)
    print("Transformed counts")

    reg, rank, alpha = 10, 100, 40.0
    als = ALS(maxIter=10, regParam=reg, rank=rank, alpha=alpha,
              implicitPrefs=True, userCol="userIndex", itemCol="trackIndex",
              ratingCol="count")
    model = als.fit(train_df)
    print("Fitted ALS model")

    map_val = compute_MAP(model, users_val, val_df)
    print('Validation: RegParam:{} | Rank:{} | Alpha:{} | MAP:{}'.format(
        reg, rank, alpha, map_val))
    map_test = compute_MAP(model, users_test, test_df)
    print('Test: RegParam:{} | Rank:{} | Alpha:{} | MAP:{}'.format(
        reg, rank, alpha, map_test))
def log_loss_from_prediction(predictions):
    """Compute the average binary log loss of a ``model.transform`` result.

    ``predictions`` must have a ``label`` column and a ``probability``
    column; the second item of ``probability`` is used as the
    positive-class probability. A small epsilon is added inside each
    logarithm.

    Returns:
        The result of ``take(1)`` on the aggregated frame, i.e. a list with
        one Row holding the average log loss.
    """
    epsilon = 1e-16

    # Extract the second item of the probability column as a plain float.
    second_item = udf(lambda value: value[1].item(), FloatType())
    scored = predictions.select('*', second_item('probability').alias('prob'))

    positive_loss = 0. - log(scored.prob + epsilon)
    negative_loss = 0. - log(1. - scored.prob + epsilon)
    row_loss = when(scored.label == 1, positive_loss).otherwise(negative_loss)

    with_loss = scored.select("*", row_loss.alias('log_loss'))
    loss = with_loss.agg({'log_loss': 'avg'}).take(1)
    return loss
def socialBasedMetrics(ratings):
    """Compute friend-graph similarity metrics for pairs of users.

    Uses the module-level ``user_friend`` table, right-joined against the
    distinct users in ``ratings``. For every pair of users sharing at least
    one friend it derives ``adamic_adar_graph``, ``jaccard_graph``,
    ``cosine_graph`` and ``preferential_attachment``, and keeps pairs where
    one of the first three is positive.

    Returns:
        DataFrame with ``user_id``, ``user_id_2`` and the four metrics.
    """
    fold_users = ratings.select("user_id").distinct()
    fold_user_friend = user_friend.join(fold_users, "user_id", "right")

    # Attach to every (user, friend) row the friend's own `nf` value.
    friend_sizes = fold_user_friend.select(
        col("user_id").alias("friend_id"),
        col("nf").alias("nf_friend")).distinct()
    fu_with_friendsize = fold_user_friend.join(friend_sizes, "friend_id") \
        .select("user_id", "nf", "friend_id", "nf_friend")

    # Self-join on the shared friend; `ren` presumably suffixes the other
    # columns with "_2" -- verify against its definition.
    shared_friend = fu_with_friendsize.join(
        ren(fu_with_friendsize, ["friend_id"]),
        "friend_id").filter(col("user_id") < col("user_id_2"))

    intersection = shared_friend.groupBy(
        "user_id", "user_id_2", "nf", "nf_2").agg(
            count(lit(1)).alias("intersection"),
            sum_sql(1 / log("nf_friend")).cast("float").alias(
                "adamic_adar_graph"))

    union_size = col("nf") + col("nf_2") - col("intersection")
    jaccard = (col("intersection") / union_size).cast("float")
    cosine = (col("intersection") / (sqrt(col("nf") * col("nf_2")))).cast(
        "float")

    with_metrics = intersection \
        .withColumn("jaccard_graph", jaccard) \
        .withColumn("cosine_graph", cosine) \
        .withColumn("preferential_attachment", col("nf") * col("nf_2"))

    graph = with_metrics.select(
        "user_id", "user_id_2", "adamic_adar_graph", "jaccard_graph",
        "cosine_graph", "preferential_attachment").filter(
            (col("adamic_adar_graph") > 0) | (col("jaccard_graph") > 0)
            | (col("cosine_graph") > 0))
    return graph
def Trainer(spark, df_train, rank, regParam, alpha, K=500):
    """Train and save an implicit ALS model unless one is already stored.

    The ``count`` column is replaced by its natural logarithm. The model is
    written to ``ALSModel_<rank>_<regParam>__<alpha>``; if ``hadoop fs -test
    -e`` reports that this path exists, training is skipped.

    Args:
        spark: unused here.
        df_train: training DataFrame with ``user_id_numeric``,
            ``track_id_numeric`` and ``count`` columns.
        rank, regParam, alpha: ALS hyper-parameters.
        K: unused here.
    """
    df_train = df_train.withColumn('count', F.log('count'))  # takes log
    output_file = 'ALSModel_%s_%s__%s' % (str(rank), str(regParam), str(alpha))

    # Exit status 0 means the path already exists.
    FileExistFlag = os.system('hadoop fs -test -e %s' % output_file)
    if FileExistFlag == 0:
        print('ALSModel_%s_%s__%s Already Exist.' % (str(rank), str(regParam), str(alpha)))
        return

    beg = time()
    als = ALS(rank=rank, maxIter=10, regParam=regParam, alpha=alpha,
              implicitPrefs=True,
              userCol="user_id_numeric", itemCol="track_id_numeric",
              ratingCol="count", coldStartStrategy="drop")
    model = als.fit(df_train)
    print('Train Finished')
    model.write().overwrite().save(output_file)
    end = time()
    print('ALSModel_%s_%s__%s Saved. Took %f s' % (str(rank), str(regParam), str(alpha), end - beg))
    return
def log_transform(dataset, features):
    """Replace each listed column by its natural logarithm.

    Used on quantitative features to bring them closer to a normal
    distribution.

    Args:
        dataset: Spark DataFrame.
        features: iterable of column names to transform in place.

    Returns:
        The DataFrame with every listed column overwritten by ``log(col)``.
    """
    transformed = dataset
    for column_name in features:
        transformed = transformed.withColumn(column_name, F.log(column_name))
    return transformed
def __init__(self):
    """Register the aggregation: average of log(reply_length_bytes + 1)."""
    super(FeaturePayloadSizeLogAverage, self).__init__()
    # The +1 keeps the logarithm defined for zero-length replies.
    mean_log_reply_length = F.avg(
        F.log(F.col('reply_length_bytes') + 1.)).cast('float')
    self.group_by_aggs = {'reply_length_log': mean_log_reply_length}
def insuranceFrame(hc, spark):
    """Load the insurance CSV as an H2OFrame with an ``Offset`` column.

    ``Offset`` is ``log(Holders)``; ``Group`` and ``Age`` are converted to
    factors.
    """
    csv_url = "file://" + unit_test_utils.locate("smalldata/insurance.csv")
    insurance = spark.read.csv(csv_url, header=True, inferSchema=True)
    insurance = insurance.withColumn("Offset", log(col("Holders")))

    frame = hc.asH2OFrame(insurance)
    for categorical in ("Group", "Age"):
        frame[categorical] = frame[categorical].asfactor()
    return frame
def insuranceFrame(hc, spark, insuranceDatasetPath):
    """Load the insurance CSV as an H2OFrame with an ``Offset`` column.

    Args:
        hc: context providing ``asH2OFrame``.
        spark: active SparkSession.
        insuranceDatasetPath: location of the insurance CSV file.

    Returns:
        H2OFrame where ``Offset`` is ``log(Holders)`` and ``Group`` and
        ``Age`` are factors.
    """
    insurance = spark.read.csv(insuranceDatasetPath, header=True,
                               inferSchema=True)
    insurance = insurance.withColumn("Offset", log(col("Holders")))

    frame = hc.asH2OFrame(insurance)
    for categorical in ("Group", "Age"):
        frame[categorical] = frame[categorical].asfactor()
    return frame
def get_log_of_grades(self, df):
    # type: (dataframe) -> dataframe
    """Replace every grade column ``g`` by ``log(1 - g)``.

    Wherever that expression is null, ``self.log_value_for_ones`` is used
    instead (``coalesce``).
    """
    result = df
    for grade_col in self.grade_cols:
        log_complement = F.log(F.lit(1) - F.col(grade_col))
        result = result.withColumn(
            grade_col,
            F.coalesce(log_complement, F.lit(self.log_value_for_ones)))
    return result
def cumprod(scol):
    """Return ``sum(log(scol))`` after checking the values are positive.

    The check runs inside a pandas UDF: every non-null value must be greater
    than 0, otherwise an AssertionError is raised.
    """
    value_type = self._kdf._internal.spark_type_for(self.name)

    @pandas_udf(returnType=value_type)
    def negative_check(s):
        all_positive_or_null = len(s) == 0 or ((s > 0) | (s.isnull())).all()
        assert all_positive_or_null, "values should be bigger than 0: %s" % s
        return s

    checked = negative_check(scol)
    return F.sum(F.log(checked))
def _mutual_info_todo(when, then, df):
    """
    Returns what (columns, as in spark columns) to compute to get the results requested by
    the parameters.

    With c_xy the count of a (when, then) pair, c_x and c_y the marginal
    counts and N the total, the single output column ``mutual_info`` is
    ``sum(c_xy*log c_xy)/N + log(N)*sum(c_xy)/N - sum(c_xy*log c_x)/N
    - sum(c_xy*log c_y)/N``. Rows where either column is null are ignored.

    :param when: first column of the pair
    :type when: str/int
    :param then: second column of the pair
    :type then: str/int
    :param df: input data
    :type df: DataFrame
    :return: One-row DataFrame with a single column named ``mutual_info``.
    """
    # group on the pair of columns, count occurrences
    pairs_table = df.groupBy([when, then]).agg(count("*").alias("_pairs_count"))
    # ignore nulls
    pairs_table = pairs_table.filter((~col(when).isNull()) & (~col(then).isNull()))
    pairs_table.cache()

    # marginal counts of each column
    when_table = pairs_table.groupBy(col(when).alias("wt")).agg(
        sum("_pairs_count").alias("_when_count"))
    then_table = pairs_table.groupBy(col(then).alias("tt")).agg(
        sum("_pairs_count").alias("_then_count"))

    final_table = pairs_table.join(
        when_table, pairs_table[when].eqNullSafe(when_table["wt"]))
    final_table = final_table.join(
        then_table, final_table[then].eqNullSafe(then_table["tt"]))

    # prepare 4 subformulas of MI to later sum, plus the total
    todo = final_table.select(
        sum(col("_pairs_count") * log(col("_pairs_count"))).alias("_s1"),  # c_xy * logc_xy
        sum(col("_pairs_count")).alias("_s2"),  # c_xy
        sum(col("_pairs_count") * log(col("_when_count"))).alias("_s3"),  # c_xy * logc_x
        sum(col("_pairs_count") * log(col("_then_count"))).alias("_s4"),  # c_xy * logc_y
        sum(col("_pairs_count")).alias("_total")  # total
    )

    # The whole expression is parenthesised before aliasing: a method call
    # binds tighter than "-", so an alias written after the last term would
    # name only that term and leave the result column unnamed.
    mutual_info = (
        (col("_s1") / col("_total"))
        + (log(col("_total")) * (col("_s2") / col("_total")))
        - (col("_s3") / col("_total"))
        - (col("_s4") / col("_total"))
    )
    return todo.select(mutual_info.alias("mutual_info"))
def compile_log(t, expr, scope, timecontext, **kwargs):
    """Translate a logarithm expression into ``F.log(base, column)``.

    The operand ``op.arg`` is translated to a Spark column and ``op.base`` is
    translated in raw mode and converted to ``float``.
    """
    log_op = expr.op()
    operand = t.translate(log_op.arg, scope, timecontext)
    # Spark log method only takes float
    base = float(t.translate(log_op.base, scope, timecontext, raw=True))
    return F.log(base, operand)
def main(spark):
    """Grid-search an implicit ALS model on log counts and save the best one.

    Reads the train/validation parquet files, adds ``logcount = log(count)``,
    fits one indexer+ALS pipeline per (rank, alpha, reg) combination, scores
    each with RMSE on the validation set, prints the best combination and
    saves the corresponding fitted pipeline to ``anshul_project/log_model``.

    Args:
        spark: active SparkSession.
    """
    train_data = spark.read.parquet('anshul_project/train_index.parquet')
    val_data = spark.read.parquet('anshul_project/val_index.parquet')

    train_data_log = train_data.withColumn("logcount", log(train_data["count"]))
    val_data_log = val_data.withColumn("logcount", log(val_data["count"]))

    uid_indexer = StringIndexer(inputCol="user_id", outputCol="user_num", handleInvalid="skip")
    tid_indexer = StringIndexer(inputCol="track_id", outputCol="track_num", handleInvalid="skip")

    ranks = [4]
    regs = [1]
    alphas = [0.5]

    best_rmse = None
    best_rank = None
    best_alpha = None
    best_reg = None
    best_model = None

    # NOTE(review): the model is trained on "logcount" but scored against the
    # raw "count" column -- confirm that this is the intended label.
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="count", predictionCol="prediction")

    for rank in ranks:
        for alpha in alphas:
            for reg in regs:
                als = ALS(maxIter=3, regParam=reg, userCol="user_num",
                          itemCol="track_num", ratingCol="logcount",
                          implicitPrefs=True, coldStartStrategy="drop",
                          alpha=alpha, rank=rank)
                pipeline = Pipeline(stages=[uid_indexer, tid_indexer, als])
                als_model = pipeline.fit(train_data_log)
                predictions = als_model.transform(val_data_log)
                rmse = evaluator.evaluate(predictions)
                if best_rmse is None or best_rmse > rmse:
                    best_rmse = rmse
                    best_rank = rank
                    best_alpha = alpha
                    best_reg = reg
                    # Keep the winner so it, not the last fit, gets saved.
                    best_model = als_model

    print('The best hyper parameters: Rank: {}, Reg: {}, Alpha: {}, RMSE: {}'.format(best_rank, best_reg, best_alpha, best_rmse))
    best_model.save('anshul_project/log_model')
def cumprod(scol):
    """Return ``sum(log(scol))`` after checking the values are positive.

    The check runs inside a pandas UDF: every non-null value must be greater
    than 0, otherwise an AssertionError is raised.
    """
    value_type = self._kdf._sdf.schema[self.name].dataType

    @pandas_udf(returnType=value_type)
    def negative_check(s):
        all_positive_or_null = len(s) == 0 or ((s > 0) | (s.isnull())).all()
        assert all_positive_or_null, "values should be bigger than 0: %s" % s
        return s

    checked = negative_check(scol)
    return F.sum(F.log(checked))
def main(spark, data_file, model_file, user_file, track_file, model_formulation=None):
    """Train an implicit ALS model on play counts and save it with its indexers.

    Args:
        spark: active SparkSession.
        data_file: parquet path of the training data.
        model_file: destination of the trained ALS model.
        user_file: destination of the user StringIndexer.
        track_file: destination of the track StringIndexer.
        model_formulation: ``'log'`` replaces ``count`` by ``log(count)``;
            ``'ct1'``/``'ct2'`` keep only rows with ``count`` greater than
            1/2; anything else leaves the data unchanged.
    """
    df = spark.read.parquet(data_file)

    if model_formulation == 'log':
        # log compression on training
        df = df.withColumn('count', F.log(F.col('count')))
        print("log")
    elif model_formulation == 'ct1' or model_formulation == 'ct2':
        # subsetting all train counts greater than 1 (ct1) or 2 (ct2)
        if model_formulation == 'ct1':
            query = 'SELECT * FROM df WHERE count > 1'
        else:
            query = 'SELECT * FROM df WHERE count > 2'
        df.createOrReplaceTempView('df')
        df = spark.sql(query)
        print(model_formulation)
    else:
        # If no model formulation is specified, pass
        print("default")

    user_indexer = StringIndexer(inputCol="user_id", outputCol="user_idx", handleInvalid="keep")
    track_indexer = StringIndexer(inputCol="track_id", outputCol="track_idx", handleInvalid="keep")

    mapping = Pipeline(stages=[user_indexer, track_indexer]).fit(df)
    df = mapping.transform(df)

    # create + fit an ALS model
    als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True, ratingCol="count",
              userCol="user_idx", itemCol="track_idx")
    als_model = als.fit(df)

    # save trained ALS model
    als_model.write().overwrite().save(model_file)
    print("Model sucessfully saved to HFS")

    # save string indexers
    # NOTE(review): these are the unfitted estimators, not the fitted models
    # held in `mapping.stages`, so the learned label mapping is not saved --
    # confirm what the loading side expects before changing this.
    user_indexer.write().overwrite().save(user_file)
    track_indexer.write().overwrite().save(track_file)
    print("String Indexers sucessfully saved to HFS")
def main(spark, df_test, model_file):
    """Evaluate a saved pipeline on test data and print its MAP.

    Args:
        spark: active SparkSession.
        df_test: parquet path of the test data.
        model_file: path of the saved PipelineModel; its last stage must
            provide ``recommendForUserSubset``.
    """
    # import model
    model = PipelineModel.load(model_file)
    print("imported model")

    # import test data
    test = spark.read.parquet(df_test)
    print("imported test data")

    # log transform test data
    test = test.withColumn("count", log(test["count"] + 1))
    print("log-transformed test data")

    # predict on test data, keeping only the columns needed below
    testdf = model.transform(test)
    wanted = ["user_index", "count", "track_index"]
    testdf = testdf.select([c for c in testdf.columns if c in wanted])

    # make labels
    testdf.createOrReplaceTempView('testdf')
    Labels = spark.sql(
        'SELECT user_index, collect_list(track_index) AS label FROM testdf GROUP BY user_index'
    )
    Labels.createOrReplaceTempView('Labels')
    print("created ground truth labels")

    # generate top 500 track recommendations for each user in the test set
    user_subset = testdf.select("user_index").distinct()
    recommender = model.stages[-1]
    userRecs = recommender.recommendForUserSubset(user_subset, 500)
    userRecs.createOrReplaceTempView("userRecs")
    print("made user recommendations")

    # explode recommendations in long format
    exploded = userRecs.select("user_index",
                               explode("recommendations").alias("pred"))
    Recs = exploded.select("user_index", "pred.*")
    Recs.createOrReplaceTempView("Recs")

    # make predictions
    Preds = spark.sql(
        'SELECT user_index, collect_list(track_index) AS prediction FROM Recs GROUP BY user_index'
    )
    Preds.createOrReplaceTempView("Preds")

    # make label pairs
    Preds_labels = spark.sql(
        'SELECT Preds.prediction AS prediction, Labels.label as label FROM Preds INNER JOIN Labels ON Preds.user_index = Labels.user_index'
    )
    print("inner join preds & labels")

    # calculate MAP
    MAPrecommendationsAndTruth = Preds_labels.select("prediction", "label")
    metrics = RankingMetrics(MAPrecommendationsAndTruth.rdd)
    MAP = metrics.meanAveragePrecision
    print("MAP = %s" % MAP)
def feature_scaling(df):
    '''Log transform all numeric cols.

    Applies ``log(x + 1)`` to every column in the module-level
    ``count_columns`` and ``scale_columns`` lists and to ``recency``.
    Accepts either a Spark DataFrame (a new frame is returned) or a
    pandas-style frame (columns are assigned in place and the same object is
    returned).
    '''
    if isinstance(df, DataFrame):
        def spark_log1p(frame, name):
            return frame.withColumn(
                name, F.log(frame[name].cast(DoubleType()) + 1))

        # scale remaining cols
        for name in count_columns:
            df = spark_log1p(df, name)
        for name in scale_columns:
            df = spark_log1p(df, name)
        # Scale recency
        df = spark_log1p(df, 'recency')
    else:
        def log_plus_one(x):
            return np.log(x + 1)

        df[count_columns] = df[count_columns].apply(log_plus_one)
        df[scale_columns] = df[scale_columns].apply(log_plus_one)
        df['recency'] = df['recency'].apply(log_plus_one)
    return df
def logLoss(predDF):
    """Return the mean binary log loss of a prediction DataFrame.

    ``predDF`` must have a ``Label`` column and a ``probability`` column; the
    element at index 1 of ``probability`` is used as the probability of
    class one, clamped to ``[epsilon, 1 - epsilon]``.
    """
    def clamp(n):
        # Restrict the probability to be strictly between 0 and 1.
        epsilon = .000000000000001
        lower = 0 + epsilon
        upper = 1 - epsilon
        return max(min(upper, n), lower)

    # No return type is declared on the UDF (the original note says
    # FloatType was removed "for epsilon").
    firstelement = udf(lambda v: clamp(float(v[1])))

    # Probability of class one for every row
    predict_df = predDF.withColumn('prob_one', firstelement(predDF.probability))

    # Per-row log loss
    label = f.col('Label')
    prob_one = f.col('prob_one')
    row_logloss = predict_df.withColumn(
        'logloss',
        -label*f.log(prob_one) - (1.-label)*f.log(1.-prob_one))

    logloss = row_logloss.agg(f.mean('logloss').alias('ll')).collect()[0]['ll']
    return logloss
def get_vfr_index():
    """Compute a 0-5 foot-traffic score per retailer and write it to HBase.

    Retailers without foot-traffic data get ``avg_vfr`` filled in by
    ``fillWithKNN``. Per city, ``avg_vfr`` is capped at
    ``mean + 3 * stddev_pop``, log-transformed (``log(x + 1)``), divided by
    the city's maximum log value and multiplied by 5. The result is written
    as column ``people_count`` through ``write_hbase1``. Any exception is
    caught and its traceback printed.
    """
    try:
        print(f"{str(dt.now())} 零售户周边人流指数")

        # Retailers that have foot-traffic data
        vfr = get_around_vfr(spark)
        vfr.cache()

        # Retailers
        co_cust = get_co_cust(spark).select("cust_id")

        # Retailers with longitude/latitude
        located = get_cust_lng_lat(spark) \
            .select("city", "cust_id", "lng", "lat")
        cust_lng_lat = located.join(co_cust, "cust_id")
        cust_lng_lat.cache()

        # Retailers with no foot-traffic data around them
        missing_ids = cust_lng_lat.select("cust_id") \
            .exceptAll(vfr.select("cust_id"))
        not_df = missing_ids.join(cust_lng_lat, "cust_id")

        exist_df = vfr.join(cust_lng_lat, ["city", "cust_id"])

        if not_df.count()>0:
            # KNN fill
            fill_df = fillWithKNN(exist_df.toPandas(), not_df.toPandas(), "avg_vfr")
            all_df = spark.createDataFrame(fill_df) \
                .unionByName(exist_df)
        else:
            all_df = exist_df

        # Threshold: per-city mean plus three population standard deviations
        upper_bound = f.mean("avg_vfr") + 3 * f.stddev_pop("avg_vfr")
        threshold = all_df.groupBy("city") \
            .agg(upper_bound.alias("threshold"))

        capped = f.when(col("avg_vfr") > col("threshold"), col("threshold")) \
            .otherwise(col("avg_vfr"))
        truncate_df = all_df.join(threshold, "city") \
            .withColumn("avg_vfr", capped)

        log_df = truncate_df.withColumn("log", f.log(col("avg_vfr") + 1))
        log_max = log_df.groupBy("city").agg(f.max("log").alias("log_max"))

        colName = "people_count"
        scored = log_df.join(log_max, "city") \
            .withColumn(colName, col("log") / col("log_max") * 5)
        scored.foreachPartition(lambda x: write_hbase1(x, [colName], hbase))

        vfr.unpersist()
        cust_lng_lat.unpersist()
    except Exception:
        tb.print_exc()
# MAGIC Now, let's visualize the results from the last example. We can use the built-in `display()` function to show a bar chart of the count for each response code. After running this cell, select the bar graph option, and then use "Plot Options..." and drag `status` to the key entry field and drag `count` to the value entry field. See the diagram, below, for an example. # MAGIC # MAGIC <img src="http://spark-mooc.github.io/web-assets/images/cs105x/plot_options_1.png" style="float: right; margin-right: 30px; border: 1px solid #999999"/> # COMMAND ---------- display(status_to_count_df) # COMMAND ---------- # MAGIC %md # MAGIC You can see that this is not a very effective plot. Due to the large number of '200' codes, it is very hard to see the relative number of the others. We can alleviate this by taking the logarithm of the count, adding that as a column to our DataFrame and displaying the result. # COMMAND ---------- log_status_to_count_df = status_to_count_df.withColumn('log(count)', sqlFunctions.log(status_to_count_df['count'])) display(log_status_to_count_df) # COMMAND ---------- # MAGIC %md # MAGIC While this graph is an improvement, we might want to make more adjustments. The [`matplotlib` library](http://matplotlib.org/) can give us more control in our plot and is also useful outside the Databricks environment. In this case, we're essentially just reproducing the Databricks graph using `matplotlib`. However, `matplotlib` exposes far more controls than the Databricks graph, allowing you to change colors, label the axes, and more. We're using a set of helper functions from the [`spark_notebook_helpers`](https://pypi.python.org/pypi/spark_notebook_helpers/1.0.1) library. # COMMAND ---------- # np is just an alias for numpy. # cm and plt are aliases for matplotlib.cm (for "color map") and matplotlib.pyplot, respectively. # prepareSubplot is a helper. from spark_notebook_helpers import prepareSubplot, np, plt, cm