def runDateFunctions(spark):
    # In Python, a DataFrame can also be created from a list of one-element tuples, as below
    df1 = spark.createDataFrame([(1.512,), (2.234,), (3.42,)], ['value'])
    df2 = spark.createDataFrame([(25.0,), (9.0,), (10.0,)], ['value'])
    df1.select(round(df1["value"], 1)).show()
    df2.select(functions.sqrt('value')).show()
def compute(day):
    # We want days day-30 through day-1
    sums = wikipediadata.where(
        (wikipediadata.day >= day-30) & (wikipediadata.day <= day-1))
    # Test subset
    #sums = sums.where((sums.page == 'Cadillac_Brougham') | ((sums.page == 'Roald_Dahl') & (sums.projectcode == 'fr')))
    # Sum the hourly counts of each day
    sums = sums.groupby('projectcode', 'page', 'day').sum('views')
    # Cache for later use
    sums.cache()
    # Define a window := the previous day
    window_spec = Window.partitionBy(sums.projectcode, sums.page) \
        .orderBy(sums.day.asc()).rowsBetween(-1, -1)
    # Compute the difference views(d) - views(d-1)
    diffs = sums.withColumn('diff',
                            sums.views - F.sum(sums.views).over(window_spec))
    # Compute the weighting coefficient to apply to each day
    coefs = pd.DataFrame({'day': range(day-30, day)})
    coefs['coef'] = 1. / (day - coefs.day)
    coefs = hc.createDataFrame(coefs)
    diffs = diffs.join(coefs, 'day')
    # Compute the score of each day
    diffs = diffs.withColumn('sub_score', diffs.diff * diffs.coef)
    totals = diffs.groupby('projectcode', 'page').sum('views', 'sub_score')
    # Normalise by the square root of the sum of views
    totals = totals.withColumn('score',
                               totals['SUM(sub_score)'] / F.sqrt(totals['SUM(views)'])) \
        .orderBy(F.desc('score')) \
        .withColumnRenamed('SUM(views)', 'total_views') \
        .limit(10)
    views = sums.select('projectcode', 'page', 'day', 'views') \
        .join(totals.select('projectcode', 'page', 'total_views', 'score'),
              (totals.projectcode == sums.projectcode) & (totals.page == sums.page),
              'right_outer')
    df = totals.select('projectcode', 'page', 'total_views', 'score').toPandas()
    df2 = views.toPandas()
    df2 = df2.iloc[:, 2:]
    df2 = df2.pivot_table(values='views', columns=['day'],
                          index=['projectcode', 'page'], fill_value=0)
    df = df.merge(df2, left_on=['projectcode', 'page'], right_index=True)
    df.to_csv(filename(day), index=False)
    # Clear the cache
    hc.clearCache()
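# A minimal, self-contained sketch of the rowsBetween(-1, -1) trick used above to fetch the
# previous day's value with a window function. The toy DataFrame and the local SparkSession
# are illustrative assumptions, not part of the original pipeline.
from pyspark.sql import SparkSession, functions as F, Window

spark = SparkSession.builder.master('local[*]').getOrCreate()
toy = spark.createDataFrame(
    [('fr', 'Roald_Dahl', 1, 10), ('fr', 'Roald_Dahl', 2, 14), ('fr', 'Roald_Dahl', 3, 11)],
    ['projectcode', 'page', 'day', 'views'])
prev_window = (Window.partitionBy('projectcode', 'page')
               .orderBy(F.col('day').asc())
               .rowsBetween(-1, -1))
# views minus the previous day's views; the first day has no predecessor, so diff is null
toy.withColumn('diff', F.col('views') - F.sum('views').over(prev_window)).show()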
def geodistance(df, target_name, lng1, lat1, lng2, lat2):
    result = df.withColumn("dlon", radians(col(lng1)) - radians(col(lng2))) \
        .withColumn("dlat", radians(col(lat1)) - radians(col(lat2))) \
        .withColumn(target_name,
                    asin(sqrt(sin(col("dlat") / 2) ** 2 +
                              cos(radians(col(lat2))) * cos(radians(col(lat1))) *
                              sin(col("dlon") / 2) ** 2)) * 2 * 3963 * 5280) \
        .drop("dlon", "dlat")
    return result
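# Hedged usage sketch for geodistance(): the column names, sample coordinates, and the
# SparkSession below are assumptions for illustration only. The constant 3963 * 5280
# appears to convert the haversine result to feet (Earth radius in miles times feet per mile).
from pyspark.sql import SparkSession
from pyspark.sql.functions import radians, col, asin, sqrt, sin, cos

spark = SparkSession.builder.getOrCreate()
points = spark.createDataFrame(
    [(-73.99, 40.73, -73.98, 40.75)],
    ['pickup_lng', 'pickup_lat', 'dropoff_lng', 'dropoff_lat'])
with_dist = geodistance(points, 'trip_feet',
                        'pickup_lng', 'pickup_lat', 'dropoff_lng', 'dropoff_lat')
with_dist.show()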
def processing_loop(spark_master, input_queue, output_queue, wikieod_file): """Create a model and process requests for new predictions. This function is the heart of the application. It accepts a URL to a Spark master, multiprocessing input and output Queue objects, and the location of the end of day stock data in parquet format. With this information it will load the end of day data and create a model to base future predictions upon. After creating the model, it will enter a blocking loop waiting for new prediction requests to arrive. After receiving a new request, it will simulate the requested stock predictions and place the results into the output queue. It is important to note that this function will run as a separate process started by the main function. This is done to isolate the Spark processing components from the thread of execution that is running the Flask web server. In this manner the application will be reactive to incoming input without blocking on the processing activity. """ # import these here to allow the debug mode to function properly in the # absence of spark import pyspark from pyspark import sql as pysql from pyspark.sql import functions as pyfuncs spark = pysql.SparkSession.builder.master(spark_master).getOrCreate() sc = spark.sparkContext output_queue.put('ready') df = spark.read.load(wikieod_file) ddf = df.select('ticker', 'date', 'close').withColumn( 'change', (pyfuncs.col('close') / pyfuncs.lag('close', 1).over( pysql.Window.partitionBy('ticker').orderBy(df['date'])) - 1.0) * 100) mv = ddf.groupBy('ticker').agg( pyfuncs.avg('change').alias('mean'), pyfuncs.sqrt(pyfuncs.variance('change')).alias('stddev')) dist_map = mv.rdd.map(lambda r: (r[0], (r[1], r[2]))).collectAsMap() priceDF = ddf.orderBy('date', ascending=False).groupBy('ticker').agg( pyfuncs.first('close').alias('price'), pyfuncs.first('date').alias('date')) prices = priceDF.rdd.map(lambda r: (r[0], r[1])).collectAsMap() while True: req = input_queue.get() portfolio = {} for stock in req['stocks']: portfolio[stock['symbol']] = (prices[stock['symbol']] * stock['quantity']) seed_rdd = sc.parallelize(seeds(10000)) bparams = sc.broadcast(dist_map) bpf = sc.broadcast(portfolio) initial_value = portfolio_value(portfolio) results = seed_rdd.map(lambda s: portfolio_value( simulate(s, bpf.value, bparams.value, req['days'])) - initial_value ) simulated_results = list(zip(results.collect(), seed_rdd.collect())) simulated_values = [v for (v, _) in simulated_results] simulated_values.sort() num_samples = req['simulations'] if req['simulations'] < 100 else 100 prediction = [ simulated_values[int(len(simulated_values) * i / num_samples)] for i in range(num_samples) ] percentage_var = 0.05 fivepercent = '{:0.2f}'.format(simulated_values[int( len(simulated_values) * percentage_var)]) req.update({ 'status': 'ready', 'fivepercent': fivepercent, 'prediction': prediction }) output_queue.put(req)
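# Small illustration of how the "fivepercent" figure above is read off the Monte Carlo
# results: the simulated portfolio changes are sorted and the value at the 5th-percentile
# index is taken. The numbers here are made up for the example.
simulated_values = sorted([-120.0, -80.0, -15.0, 4.0, 22.0, 35.0, 60.0, 75.0, 90.0, 140.0])
percentage_var = 0.05
fivepercent = simulated_values[int(len(simulated_values) * percentage_var)]
# With 10 samples this picks index 0, i.e. the worst simulated loss
print('{:0.2f}'.format(fivepercent))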
def compile_sqrt(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.sqrt(src_column)
def select_acc_intervals(df, ts_name, interval, window, incl_vect=False, incl_acc=False):
    """
    Filter DataFrame with a new epoch duration.

    :param df: Spark DataFrame object with timestamp data
    :param ts_name: column with timestamp data
    :param interval: initial epoch duration (in seconds)
    :param window: new epoch duration (in seconds)
    :param incl_vect: if true, calculate vector magnitude and include it in the DataFrame
    :param incl_acc: if true, all raw accelerometer data are included in the DataFrame
    :return: Spark DataFrame object with timestamp data
    """
    ## the window must be larger than a single epoch
    assert interval <= 60, "Epoch larger than 1 minute."
    assert window >= interval, "Window smaller than epoch."
    cols = df.columns
    selected_cols = ['axis1', 'axis2', 'axis3', 'steps']
    # TODO: add eeAccumulator
    minp = df.select(F.min(ts_name).cast('long')).first()[0]
    if interval < window:
        df2 = df.withColumn('tmp', F.row_number().over(Window.orderBy(ts_name)) - 1)
        df2 = df2.withColumn('total_sec', F.col(ts_name).cast('long')).cache()
        df2 = df2.checkpoint()
        df2.count()
        for col in selected_cols:
            df2 = df2.withColumn(col,
                                 F.when(((F.col('total_sec') - minp) % window == 0),
                                        F.sum(col).over(Window.orderBy('total_sec')
                                                        .rangeBetween(0, window - interval))
                                        ).otherwise(0))
        df2 = df2.withColumn('duration',
                             F.col(ts_name).cast(IntegerType()) -
                             F.lag(F.col(ts_name).cast(IntegerType()), 1, minp)
                             .over(Window.orderBy(ts_name))
                             ).drop('total_sec')
        df2 = df2.withColumn('tmp', (F.col('tmp') * F.col('duration')) % window).drop('duration').orderBy(ts_name)
        df2 = df2.filter(F.col('tmp') == 0).drop('tmp').orderBy(ts_name)
    else:
        df2 = df
    if incl_vect:
        df2 = df2.withColumn('vectMag', F.sqrt(F.col('axis1') ** 2 + F.col('axis2') ** 2 + F.col('axis3') ** 2))
        cols.insert(1, 'vectMag')
        df2 = df2.select(cols).orderBy(ts_name)
    if not incl_acc:
        df2 = df2.select(ts_name, cols[1])
    return df2
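# Minimal sketch of the forward-looking rangeBetween() aggregation used above: summing a
# column over the current row plus the next `window - interval` seconds. The toy data and
# the 10-second window / 5-second interval are assumptions for illustration only.
from pyspark.sql import SparkSession, functions as F, Window

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(0, 1), (5, 2), (10, 4), (20, 8)], ['total_sec', 'axis1'])
w = Window.orderBy('total_sec').rangeBetween(0, 10 - 5)   # current row plus the next 5 seconds
toy.withColumn('axis1_window_sum', F.sum('axis1').over(w)).show()
# total_sec=0 sums the rows at 0 and 5 -> 3; total_sec=10 has no row within 5s after it -> 4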
# explodes all array columns
d1 = d.withColumn("new", F.arrays_zip("heart_rate", "timestamp", "latitude", "longitude", "lat2", "long2", "time2"))\
      .withColumn("new", F.explode("new"))\
      .select("userId", "id",
              F.col("new.heart_rate").alias("heart_rate"),
              F.col("new.timestamp").alias("timestamp"),
              F.col("new.latitude").alias("lat"),
              F.col("new.longitude").alias("long"),
              F.col("new.lat2").alias("lat2"),
              F.col("new.long2").alias("long2"),
              F.col("new.time2").alias("time2"))

# haversine formula: calculates distance and speed between two points
d2 = d1.withColumn("distance",
                   3956 * (2 * F.asin(F.sqrt(F.sin((F.radians("lat") - F.radians("lat2"))/2)**2 +
                                             F.cos(F.radians("lat")) * F.cos(F.radians("lat2")) *
                                             F.sin((F.radians("long") - F.radians("long2"))/2)**2))))\
       .withColumn("speed", F.col("distance")/((F.col("timestamp") - F.col("time2"))/3600))
d2 = d2.fillna({"speed": "0"})

# aggregations that compute metrics related to an individual bike trip
query = d2.groupBy("id", "userid").agg(
    F.round(F.mean("speed"), 2).alias("avgspeed"),
    F.round(F.max("speed"), 2).alias("max_speed"),
    F.round(F.mean("heart_rate")).cast("integer").alias("avg_heart_rate"),
    F.max("heart_rate").alias("max_heart_rate"),
    F.round(F.sum("distance"), 2).alias("distance"),
    conv_sec_udf(F.last("timestamp") - F.first("timestamp")).alias("duration"),
    conv_sec_udf((F.last("timestamp") - F.first("timestamp")) /
f_section={"U_sequence_square":'sum', "U_sequence":'sum',"Interval_int":'count',"avg(PRICE)":'mean'} df_section=df8.groupby(["SYMBOL","DATE"]).agg(f_section) df_section=df_section.withColumnRenamed('avg(avg(PRICE))', 'daily_average_price') df_section=df_section.withColumnRenamed('count(Interval_int)', 'n') df_section=df_section.withColumnRenamed('sum(U_sequence)', 'U_sum') df_section=df_section.withColumnRenamed('sum(U_sequence_square)', 'U_squre_sum') df_section=df_section.withColumn("Section_volatility",sqrt(col('U_squre_sum')/col('n')) - col('U_sum')*col('U_sum')/(col('n')*(col('n')-1))) df_section=df_section.withColumnRenamed('U_squre_sum', 'Realized_volatility') import pandas as pd import matplotlib.pyplot as plt import matplotlib.pylab as pylab company_list=df_section.select(df_section.SYMBOL).distinct().collect() company_list_panda=pd.DataFrame(company_list) company_plot=dict() date_plot=dict()
def distance(CLat, CLon, data, col_name):
    # Haversine distance in metres (Earth radius 6371 km * 1000)
    return data.withColumn('CLon', f.lit(CLon)) \
        .withColumn('CLat', f.lit(CLat)) \
        .withColumn("dlon", f.radians(f.col("CLon")) - f.radians(f.col("longitude"))) \
        .withColumn("dlat", f.radians(f.col("CLat")) - f.radians(f.col("latitude"))) \
        .withColumn(col_name,
                    f.asin(f.sqrt(
                        f.sin(f.col("dlat") / 2) ** 2 +
                        f.cos(f.radians(f.col("latitude"))) * f.cos(f.radians(f.col("CLat"))) *
                        f.sin(f.col("dlon") / 2) ** 2
                    )) * 2 * 6371 * 1000) \
        .drop("dlon", "dlat", 'CLon', 'CLat')
df_join = df_join.withColumn(
    'longitude_distance',
    functions.radians(over_station_coord['near_longitude']) -
    functions.radians(short_station_coord['start_longitude']))
df_join = df_join.withColumn(
    'a',
    (pow(functions.sin('latitude_distance'), 2) +
     functions.cos(functions.radians(short_station_coord['start_latitude'])) *
     functions.cos(functions.radians(over_station_coord['near_latitude'])) *
     (pow(functions.sin('longitude_distance'), 2))))
df_join = df_join.withColumn(
    'distance',
    6373 * 2 * functions.atan2(sqrt(df_join['a']), sqrt(1 - df_join['a'])))
# distance less than 3 km
#df_join = df_join.filter(df_join['distance'] < 3)
df_join = df_join.select('date', 'hour', 'start_station_name', 'near_station_name', 'distance')
df_join = df_join.dropDuplicates(
    ['date', 'hour', 'start_station_name', 'near_station_name'])
df_join = df_join.orderBy('date', 'hour', 'distance').select('date', 'hour',
                                                             'start_station_name',
                                                             'near_station_name')
def ratingBasedMetrics(ratings): ratings_quad = ratings.select( "user_id", "business_id", "stars").withColumn( "stars_quad", col("stars") * col("stars")).alias("user_business_rating") sum_stars = ratings_quad.groupBy("user_id").agg( sum_sql("stars_quad").alias("sum_quad_stars"), count(lit(1)).alias("nr") ) \ .alias("user_business_stars_quad") ratings_sum = ratings_quad.join(sum_stars, "user_id").select("business_id", "user_id", "stars", "stars_quad", "sum_quad_stars", "nr") all_pairs = ratings_sum.join( ren(ratings_sum, ["business_id"]), "business_id").filter(col("user_id") < col("user_id_2")) cosine_data = all_pairs.groupBy( "user_id", "user_id_2", "sum_quad_stars", "sum_quad_stars_2").agg( sum_sql("stars").alias("sum_stars"), sum_sql("stars_2").alias("sum_stars_2"), sum_sql(col("stars") * col("stars_2")).alias("sum_xy"), sum_sql((col("stars") - col("stars_2")) * (col("stars") - col("stars_2"))).alias("sumxy_diff_quad")) cosine_rating = cosine_data.withColumn( "cosine_rating", ((col("sum_xy")) / (sqrt("sum_quad_stars") * sqrt("sum_quad_stars_2")) ).cast("float")).select( "user_id", "user_id_2", "cosine_rating").filter(col("cosine_rating") > 0) item_count = ratings.select("business_id").distinct().count() item_count_sqrt = math.sqrt(item_count) dfDiff = all_pairs.withColumn("diff", (col("stars") - col("stars_2")) * (col("stars") - col("stars_2")) - col("stars_quad") - col("stars_quad_2")) euclidean = dfDiff.groupBy( "user_id", "user_id_2", "sum_quad_stars", "sum_quad_stars_2").agg(sum_sql("diff").alias("sum_diff")).withColumn( "diff_quad", col("sum_diff") + col("sum_quad_stars") + col("sum_quad_stars_2")) euclidean_rating = euclidean.withColumn( "euclidean_rating", (1 / (1 + sqrt("diff_quad") / item_count_sqrt)).cast("float")).select( "user_id", "user_id_2", "euclidean_rating").filter(col("euclidean_rating") > 0) intersection = all_pairs.groupBy("user_id", "user_id_2", "nr", "nr_2").agg( count(lit(1)).alias("intersection")) jaccard_rating = intersection.withColumn("jaccard_rating", ( col("intersection") / (col("nr") + col("nr_2") - col("intersection"))).cast("float")).select( "user_id", "user_id_2", "jaccard_rating").filter(col("jaccard_rating") > 0) mean_ratings = ratings_quad.groupBy("user_id").agg( mean("stars").alias("mean_stars")).alias("mean_ratings") centered_stars = ratings_quad.join(mean_ratings, "user_id").withColumn( "centered_stars", col("stars") - col("mean_stars")).withColumn( "centered_quad_stars", col("centered_stars") * col("centered_stars")) centered_stars_sums = centered_stars.groupBy("user_id").agg(sum_sql("centered_stars").alias("sum_centered_stars"), sum_sql("centered_quad_stars").alias( "sum_centered_quad_stars")) \ .alias("centered_stars_sums") centered_stars = centered_stars.join(centered_stars_sums, "user_id") centered_stars = centered_stars.join( ren(centered_stars, ["business_id"]), "business_id").filter(col("user_id") < col("user_id_2")) centered_grouped = centered_stars.groupBy("user_id", "user_id_2", "sum_centered_quad_stars", "sum_centered_quad_stars_2").agg( sum_sql(col("centered_stars") * col("centered_stars_2")).alias("sum_xy_centered")) \ .alias("centered_sum_quad") pearson_rating = centered_grouped.withColumn( "pearson_rating", ((col("sum_xy_centered")) / (sqrt("sum_centered_quad_stars") * sqrt("sum_centered_quad_stars_2"))).cast("float")).select( "user_id", "user_id_2", "pearson_rating").filter(col("pearson_rating") > 0) return cosine_rating.join(jaccard_rating, ["user_id", "user_id_2"], "outer").join( euclidean_rating, ["user_id", "user_id_2"], 
"outer").join(pearson_rating, ["user_id", "user_id_2"], "outer")
def linearReg(self, dataset_add, feature_colm, label_colm, relation_list, relation, userId, locationAddress): try: dataset = spark.read.parquet(dataset_add) dataset.show() label = '' for val in label_colm: label = val #ETL part Schema = dataset.schema stringFeatures = [] numericalFeatures = [] for x in Schema: if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType' or str(x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType' or str(x.dataType) == 'BinaryType'): for y in feature_colm: if x.name == y: dataset = dataset.withColumn( y, dataset[y].cast(StringType())) stringFeatures.append(x.name) else: for y in feature_colm: if x.name == y: numericalFeatures.append(x.name) if relation == 'linear': dataset = dataset if relation == 'non_linear': dataset = Relationship(dataset, relation_list) categoryColmList = [] categoryColmListFinal = [] categoryColmListDict = {} countOfCategoricalColmList = [] for value in stringFeatures: categoryColm = value listValue = value listValue = [] categoryColm = dataset.groupby(value).count() countOfCategoricalColmList.append(categoryColm.count()) categoryColmJson = categoryColm.toJSON() for row in categoryColmJson.collect(): categoryColmSummary = json.loads(row) listValue.append(categoryColmSummary) categoryColmListDict[value] = listValue if not stringFeatures: maxCategories = 5 else: maxCategories = max(countOfCategoricalColmList) for x in Schema: if (str(x.dataType) == "StringType" and x.name == label): for labelkey in label_colm: label_indexer = StringIndexer( inputCol=label, outputCol='indexed_' + label, handleInvalid="skip").fit(dataset) dataset = label_indexer.transform(dataset) label = 'indexed_' + label else: label = label indexed_features = [] # encodedFeatures = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) '''from pyspark.ml.feature import OneHotEncoderEstimator oneHotEncodedFeaturesList = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) oneHotEncodedFeaturesList.append('OneHotEncoded_' + colm) oneHotEncoder=OneHotEncoderEstimator(inputCols=indexed_features, outputCols=oneHotEncodedFeaturesList) oneHotEncoderFit=oneHotEncoder.fit(dataset) oneHotEncoderFeaturesDataset=oneHotEncoderFit.transform(dataset)''' featureAssembler = VectorAssembler(inputCols=indexed_features + numericalFeatures, outputCol='features', handleInvalid="skip") dataset = featureAssembler.transform(dataset) vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures', maxCategories=maxCategories, handleInvalid="skip").fit(dataset) dataset = vectorIndexer.transform(dataset) trainDataRatioTransformed = self.trainDataRatio testDataRatio = 1 - trainDataRatioTransformed train_data, test_data = dataset.randomSplit( [trainDataRatioTransformed, testDataRatio], seed=40) lr = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label) regressor = lr.fit(train_data) # locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/' print("coefficient : " + str(regressor.coefficients)) coefficient_t = str(regressor.coefficients) print("intercept : " + str(regressor.intercept)) intercept_t = str(regressor.intercept) featurePredictedLabel = feature_colm featurePredictedLabel.append('prediction') 
featurePredictedLabel.append(label) # testDataEvaluation = regressor.evaluate(test_data) # testDataPrediction = testDataEvaluation.predictions # testDataPrediction.select(featurePredictedLabel).show() prediction = regressor.evaluate(test_data) prediction_val = prediction.predictions testDataPrediction = prediction_val.select(featurePredictedLabel) # storing test predicted value to the dataset prediction_val_pand = prediction_val.select( label, "prediction").toPandas() prediction_val_pand = prediction_val_pand.assign( residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"]) prediction_val_pand_residual = prediction_val_pand["residual_vall"] prediction_val_pand_label = prediction_val_pand[label] prediction_val_pand_predict = prediction_val_pand["prediction"] lr_prediction = regressor.transform(test_data) lr_prediction.groupBy(label, "prediction").count().show() lr_prediction_quantile = lr_prediction.select(label, "prediction") training_summary = regressor.summary print("numof_Iterations...%d\n" % training_summary.totalIterations) print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory)) print("RMSE...%f\n" % training_summary.rootMeanSquaredError) RMSE = training_summary.rootMeanSquaredError print("MSE....%f\n" % training_summary.meanSquaredError) MSE = training_summary.meanSquaredError print("r**2(r-square)....::%f\n" % training_summary.r2) r_square = training_summary.r2 print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj) adjsted_r_square = training_summary.r2adj print("deviance residuals %s" % str(training_summary.devianceResiduals)) training_summary.residuals.show() residual_graph = training_summary.residuals residual_graph_pandas = residual_graph.toPandas() print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors)) coefficientStdError = str( training_summary.coefficientStandardErrors) print(" Tvalues :\n" + str(training_summary.tValues)) T_values = str(training_summary.tValues) tValuesList = training_summary.tValues print(" p values :\n" + str(training_summary.pValues)) P_values = str(training_summary.pValues) coefficientList = list(regressor.coefficients) #summaryData import pyspark.sql.functions as F import builtins round = getattr(builtins, 'round') print(coefficientList) coefficientListRounded = [] for value in coefficientList: coefficientListRounded.append(round(value, 4)) # print(coefficientListRounded) # print(intercept_t) interceptRounded = round(float(intercept_t), 4) # print(interceptRounded) # print(RMSE) RMSERounded = round(RMSE, 4) # print(RMSERounded) MSERounded = round(MSE, 4) rSquareRounded = round(r_square, 4) adjustedrSquareRounded = round(adjsted_r_square, 4) coefficientStdError = training_summary.coefficientStandardErrors coefficientStdErrorRounded = [] for value in coefficientStdError: coefficientStdErrorRounded.append(round(float(value), 4)) print(coefficientStdErrorRounded) tValuesListRounded = [] for value in tValuesList: tValuesListRounded.append(round(value, 4)) print(tValuesListRounded) pValuesListRounded = [] PValuesList = training_summary.pValues for value in PValuesList: pValuesListRounded.append(round(value, 4)) print(pValuesListRounded) # regression equation intercept_t = float(intercept_t) coefficientList = list(regressor.coefficients) equation = label, '=', interceptRounded, '+' for feature, coeff in zip(feature_colm, coefficientListRounded): coeffFeature = coeff, '*', feature, '+' equation += coeffFeature equation = equation[:-1] print(equation) equationAsList = 
list(equation) '''# statTable function def summaryTable(self,featuresName,featuresStat): statTable={} for name, stat in zip(featuresName.values(), featuresStat.values()): print(name, ": ", stat) statTable[name]=stat return statTable ''' # significance value PValuesList = training_summary.pValues significanceObject = {} for pValue in pValuesListRounded: if (0 <= pValue < 0.001): significanceObject[pValue] = '***' if (0.001 <= pValue < 0.01): significanceObject[pValue] = '**' if (0.01 <= pValue < 0.05): significanceObject[pValue] = '*' if (0.05 <= pValue < 0.1): significanceObject[pValue] = '.' if (0.1 <= pValue < 1): significanceObject[pValue] = '-' print(significanceObject) # storing test predicted value to the dataset predictionData = 'prediction.parquet' predictionDataStoring = locationAddress + userId + predictionData testDataPrediction.write.parquet(predictionDataStoring, mode='overwrite') # residual vs predicted value prediction_data = regressor.summary.predictions prediction_data.show() prediction_data.select(['prediction']).show() predicted = prediction_data.select(['prediction']) regressor.summary.residuals.show() residuals = regressor.summary.residuals pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id()) res_d = residuals.withColumn('row_index', f.monotonically_increasing_id()) pred_residuals = pred_d.join( res_d, on=['row_index']).sort('row_index').drop('row_index') pred_residuals.show() QQPlot = 'QQPlot.parquet' # locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/' # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67' QQPlotAddress = locationAddress + userId + QQPlot pred_residuals.write.parquet(QQPlotAddress, mode='overwrite') # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet', # mode='overwrite') #################################################################################3 # scale location plot from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev df_label = prediction_data.select( label, 'prediction', sqrt(ab(prediction_data[label])).alias("sqrt_label")) df_label.show() df_sqrt_label_index = df_label.withColumn( 'row_index', f.monotonically_increasing_id()) df_sqrt_label_index.show() res_d.show() sqrt_label_residual_join = df_sqrt_label_index.join( res_d, on=['row_index']).sort('row_index').drop('row_index') sqrt_label_residual_join.show() std_resid = sqrt_label_residual_join.select( 'sqrt_label', 'prediction', (sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias('std_res')) std_resid.show() sqrt_std_res = std_resid.select( "std_res", 'prediction', sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid")) sqrt_std_res.show() sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid') scaleLocationPlot = 'scaleLocation.parquet' scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress, mode='overwrite') # sqrt_std_res_fitted.write.parquet( # 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet', # mode='overwrite') ########### #QQplot # QUANTILE from scipy.stats import norm import statistics import math res_d.show() sorted_res = res_d.sort('residuals') sorted_res.show() # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'), # meann(col('residuals')).alias('mean')) # stdev_ress.show() # mean_residual = stdev_ress.select(['mean']).toPandas() # l = mean_residual.values.tolist() # print(l) 
# stddev_residual = stdev_ress.select(['std_dev']).toPandas() # length of the sorted std residuals count = sorted_res.groupBy().count().toPandas() countList = count.values.tolist() tuple1 = () for k in countList: tuple1 = k for tu in tuple1: lengthResiduals = tu print(lengthResiduals) quantileList = [] for x in range(0, lengthResiduals): quantileList.append((x - 0.5) / (lengthResiduals)) print(quantileList) # Z-score on theoritical quantile zTheoriticalTrain = [] for x in quantileList: zTheoriticalTrain.append(norm.ppf(abs(x))) print(zTheoriticalTrain) sortedResidualPDF = sorted_res.select('residuals').toPandas() sortedResidualPDF = sortedResidualPDF['residuals'] stdevResidualTrain = statistics.stdev(sortedResidualPDF) meanResidualTrain = statistics.mean(sortedResidualPDF) zPracticalTrain = [] for x in sortedResidualPDF: zPracticalTrain.append( (x - meanResidualTrain) / stdevResidualTrain) ########## target = dataset.select(label) pred = prediction_data.select(['prediction']) pred_d = pred.withColumn('row_index', f.monotonically_increasing_id()) target_d = target.withColumn('row_index', f.monotonically_increasing_id()) pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index') pred_target.show() dataset.show() pred_target_data_update = dataset.join(pred_target, on=[label]) pred_target_data_update.show(100) ##########3 # table_response = { # # "Intercept": intercept_t, # "Coefficients": coefficient_t, # "RMSE": RMSE, # "MSE": MSE, # "R_square": r_square, # "Adj_R_square": adjsted_r_square, # "coefficientStdError": coefficientStdError, # "T_value": T_values, # "P_value": P_values # # } y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_label = lr_prediction_quantile.approxQuantile( label, x, 0.01) quantile_prediction = lr_prediction_quantile.approxQuantile( "prediction", x, 0.01) Q_label_pred = '' print(len(quantile_label)) length = len(quantile_label) for i in range(0, len(quantile_label)): Q_label_pred += str(quantile_label[i]) + 't' + str( quantile_prediction[i]) + 'n' import math fitted_residual = '' print(len(prediction_val_pand_residual)) length = len(prediction_val_pand_residual) for i in range(0, len(prediction_val_pand_residual)): fitted_residual += str( prediction_val_pand_predict[i]) + 't' + str( prediction_val_pand_residual[i]) + 'n' ## scale location graph data prediction_val_pand_residual prediction_val_pand_predict prediction_val_pand_residual_abs = prediction_val_pand_residual.abs( ) import math sqrt_residual = [] for x in prediction_val_pand_residual_abs: sqrt_residual.append(math.sqrt(x)) # print ("____________________ ",x) sqrt_residual # calculating std deviation import statistics print(statistics.stdev(prediction_val_pand_residual)) stdev_ = statistics.stdev(prediction_val_pand_residual) # calcuate stnd residuals std_res = [] for x in prediction_val_pand_residual: std_res.append(x / stdev_) print(std_res) # calculating the square root of std_res import math sqr_std_res = [] for x in std_res: sqr_std_res.append(math.sqrt(abs(x))) print(sqr_std_res) scale_predict_residual = '' for pre, res in zip(prediction_val_pand_predict, sqr_std_res): scale_predict_residual += str(pre) + 't' + str(res) + 'n' print(scale_predict_residual) # QUANTILE y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_std_res = spark.createDataFrame(std_res, FloatType()) quantile_std_res.show() quantile_std_res_t = quantile_std_res.approxQuantile( 'value', x, 0.01) print(quantile_std_res_t) print(x) # calculating the z_score from 
scipy.stats import norm ## sort the list sorted_std_res = sorted(std_res) mean = statistics.mean(sorted_std_res) stdev = statistics.stdev(sorted_std_res) # print(mean) quantile = [] n = len(std_res) print(n) for x in range(0, n): quantile.append((x - 0.5) / (n)) print(quantile) # z_score theoratical z_theory = [] for x in quantile: z_theory.append(norm.ppf(abs(x))) # z score for real val z_pract = [] for x in sorted_std_res: z_pract.append((x - mean) / stdev) Q_label_pred = '' for quant, val in zip(z_theory, z_pract): Q_label_pred += str(quant) + 't' + str(val) + 'n' graph_response = { "Q_Q_plot": Q_label_pred, "residual_fitted": fitted_residual, "scale_location": scale_predict_residual } tableContent = \ { 'coefficientValuesKey': coefficientListRounded, 'tValuesKey': tValuesListRounded, 'pValuesKey': pValuesListRounded, 'significanceValuesKey': significanceObject, 'interceptValuesKey': interceptRounded, "RMSE": RMSERounded, "RSquare": rSquareRounded, "AdjRSquare": adjustedrSquareRounded, "CoefficientStdError": coefficientStdErrorRounded, 'equationKey': equation } json_response = { 'table_data': tableContent, 'graph_data': graph_response } print(json_response) return (json_response) except Exception as e: print('exception is =' + str(e))
df_invoices_temp1 = df_inner_joined.withColumnRenamed('price', 'price_per_unit')
df_invoices_temp1.show(2)

df_invoices_temp2 = df_invoices_temp1.withColumn(
    'total_price', df_invoices_temp1.price_per_unit * df_invoices_temp1.quantity_ordered)
df_invoices_temp2.select('customer_name', 'product_name', 'price_per_unit',
                         'quantity_ordered', 'total_price').show(2)

from pyspark.sql.functions import sqrt, pow

df_invoices_temp3 = df_invoices_temp2.withColumn(
    'shipping_distance', sqrt(pow(df_invoices_temp2.geolocation_x -
                                  df_invoices_temp2.warehouse_x, 2) +
                              pow(df_invoices_temp2.geolocation_y -
                                  df_invoices_temp2.warehouse_y, 2)))
df_invoices_temp3.select('customer_name', 'geolocation_x', 'geolocation_y',
                         'shipping_distance').show(2)

df_invoices_temp4 = df_invoices_temp3.withColumn(
    'shipping_costs', df_invoices_temp3.shipping_distance * 10)
df_invoices_temp4.select('customer_name', 'geolocation_x', 'geolocation_y',
                         'shipping_distance', 'shipping_costs').show(2)

df_invoices_final = df_invoices_temp4
df_invoices_final.cache()
df_invoices_final.select('customer_id', 'customer_name', 'address',
        when(
            substring(trim(df.state), 1, 10).contains('n') == False,
            df["position"].getItem(1)).otherwise(0))

# Look up the values (state, x/y position) of the next row and add them to the current row
df = df.withColumn("nxt_state", lead(df.state).over(my_window))
df = df.withColumn("nxt_x", lead(df.x).over(my_window))
df = df.withColumn("nxt_y", lead(df.y).over(my_window))

# Identify the frames that correspond to goals, using the following criteria:
# 1. A ball is detected in the current frame but disappears in the next frame
# 2. The ball position lies in the goal area of the table-football table (this zone is not very precise for now)
df = df.withColumn(
    "goal",
    when((substring(trim(df.state), 1, 10).contains('n') == False) &
         (substring(trim(df.nxt_state), 1, 10).contains('n')) &
         (df.y > 150) & (df.y < 300) & ((df.x < 100) | (df.x > 500)), 1).otherwise(0))
df = df.withColumn("goal", lead(df.goal).over(my_window))
df.groupBy('goal').count().show()
df = df.filter(df.goal.isNotNull())

# Compute the speed from the distance between coordinates and a conversion coefficient
# (an experimental coefficient is used for this test)
df = df.withColumn("vitesse", sqrt(pow(df.x - df.nxt_x, 2) + pow(df.y - df.nxt_y, 2)))
df = df.withColumn("vitesse", round(df["vitesse"] * 30 / 500, 2))
df = df.withColumn("total_goal", sum(df.goal).over(my_window))
df.show()

# Save the result to a new JSON file
df.write.json("file:///home/ymo/babyfoot.json", mode='overwrite')
exit()
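# Small sketch of the lead()-based "compare the current frame with the next frame" pattern
# used above, on toy data. The frame numbers and the window specification are assumptions
# for the example; the real code relies on an externally defined my_window.
from pyspark.sql import SparkSession, functions as F, Window

spark = SparkSession.builder.getOrCreate()
frames = spark.createDataFrame(
    [(1, 'ball', 80), (2, 'ball', 90), (3, 'none', None)], ['frame', 'state', 'x'])
w = Window.orderBy('frame')
(frames
 .withColumn('nxt_state', F.lead('state').over(w))
 .withColumn('ball_disappears',
             F.when((F.col('state') == 'ball') & (F.col('nxt_state') == 'none'), 1).otherwise(0))
 .show())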
        (col('Start_Longitude') > -80) &\
        (col('Start_Longitude') < -70) &\
        (col('Start_Latitude') > 40) &\
        (col('Start_Latitude') < 46) &\
        (col('End_Longitude') > -80) &\
        (col('End_Longitude') < -70) &\
        (col('End_Latitude') > 40) &\
        (col('End_Latitude') < 46) &\
        (col('Cost') > 0))

# Haversine distance; the coordinates are in degrees, so convert to radians before the trig terms
yellow_tripdata_1m = yellow_tripdata_1m.withColumn("Duration", ((unix_timestamp(col("End_Datetime")) - unix_timestamp(col("Start_Datetime")))/60))\
    .withColumn("Diff_Longitude", F.radians(col("End_Longitude")) - F.radians(col("Start_Longitude")))\
    .withColumn("Diff_Latitude", F.radians(col("End_Latitude")) - F.radians(col("Start_Latitude")))\
    .withColumn("a", F.pow(F.sin(col("Diff_Latitude")/2), 2) +
                F.cos(F.radians(col("Start_Latitude"))) * F.cos(F.radians(col("End_Latitude"))) * F.pow(F.sin(col("Diff_Longitude")/2), 2))\
    .withColumn("Distance", 2 * 6371 * F.atan2(F.sqrt(col("a")), F.sqrt(1.0 - col("a"))))\
    .drop("Diff_Longitude").drop("Diff_Latitude").drop("Start_Datetime")\
    .drop("End_Datetime").drop("Start_Longitude").drop("Start_Latitude")\
    .drop("End_Longitude").drop("End_Latitude").drop("a").drop("Cost")

yellow_trip_joined = yellow_tripdata_1m.join(yellow_tripvendors_1m, "ID", "inner").drop("ID")
yellow_trip_joined.createOrReplaceTempView("yellow_trip_joined")

window = Window.partitionBy("Vendor")
res = yellow_trip_joined.withColumn("Max_Distance", F.max("Distance").over(window))\
    .where(col("Distance") == col("Max_Distance"))\
    .drop("Max_Distance").select(["Vendor", "Distance", "Duration"])
res.show()
print("Time of Q2 using SQL with parquet is: %s seconds" % (time.time() - start_time_parquet))
"dupeCount = 1").drop('dupeCount') # 2 # get the POI list POI_df = sqlContext.read.format('csv').options( header='true', inferschema='true').load('../../tmp/data/POIList.csv') # cross join dataframes to have reference to POI longitude and latitude in the same row for calculation df_2 = df.crossJoin(POI_df.select('*')) # belated realizing that calculating geographical distance is supposed to be done with Haversine formula # I used the formula for distance between points: sqrt((x2-x1)^2 + (y2-y1)^2) # Using the Haversine formula, I would have to create a UDF which takes in both geographical points and returns the distance df_2 = df_2.withColumn( 'dist', f.sqrt(((df_2["lat"] - df_2[" Latitude"])**2) + ((df_2["long"] - df_2["Longitude"])**2))) # get the minimum distance for each unique request, then join with the original table to get the closest POIID df_3 = df_2.groupby(['time', 'lat', 'long']).min('dist') df_3 = df_3.join(df_2, (df_3['time'] == df_2['time']) & (df_3['lat'] == df_2['lat']) & (df_3['long'] == df_2['long']) & (df_3['min(dist)'] == df_2['dist'])).drop(df_2['lat']).drop( df_2['long']).drop(df_2['time']) df_3 = df_3.drop(df_3[' Latitude']).drop(df_3['Longitude']) # 3 # aggregation for average and standard deviation df_5 = df_3.groupby('POIID').agg(f.avg('dist'), f.stddev('dist')) # using max(dist) as the radius - the furthest point from the POI should be the radius to use to draw the circle df_6 = df_3.groupby('POIID').agg(f.max('dist'), f.count('dist'))
def train(self, df_train, top_N=None, user_column_name="user",
          item_column_name="item", rating_column_name="rating"):
    """ Calculate cosine similarities between user pairs

    Parameters
    ----------
    df_train : DataFrame
        Ratings of items in the following format: [user, item, rating]
    top_N : int or None
        Number of top similarities for a given user pair that will compose a
        similarity matrix. It is used in the train phase
    rating_column_name : str
    user_column_name : str
    item_column_name : str
    """
    top_N = int(top_N) if top_N else self.top_N_similarities
    user_column_name = str(user_column_name)
    item_column_name = str(item_column_name)
    rating_column_name = str(rating_column_name)
    clmn_names = [
        F.col(user_column_name).alias("user"),
        F.col(item_column_name).alias("item"),
        F.col(rating_column_name).alias("rating")
    ]
    df_train = df_train.select(clmn_names)
    left_clmn_names = [
        F.col("item").alias("p"),
        F.col("user").alias("u1"),
        F.col("rating").alias("v1")
    ]
    right_clmn_names = [
        F.col("item").alias("p"),
        F.col("user").alias("u2"),
        F.col("rating").alias("v2")
    ]
    # Step 1. Compute dot products between user rating vectors
    df_dot = df_train.select(left_clmn_names)\
        .join(df_train.select(right_clmn_names), on="p")\
        .where(F.col("u1") != F.col("u2"))\
        .groupBy([F.col("u1"), F.col("u2")])\
        .agg(F.sum(F.col("v1") * F.col("v2")).alias("dot"))
    # Step 2. Calculate norms
    df_norm = df_train.select(left_clmn_names)\
        .groupBy(F.col("u1"))\
        .agg(F.sqrt(F.sum(F.col("v1") * F.col("v1"))).alias("norm"))
    similarity_clmns = [
        F.col("u1"),
        F.col("u2"),
        (F.col("dot") / F.col("n1") / F.col("n2")).alias("sim")
    ]
    # Step 3. Calculate similarities
    df_similarity = df_dot.join(df_norm.select(F.col("u1"), F.col("norm").alias("n1")), on="u1")\
        .join(df_norm.select(F.col("u1").alias("u2"), F.col("norm").alias("n2")), on="u2")\
        .select(similarity_clmns)
    window = Window.partitionBy(F.col("u1"), F.col("u2"))
    # Step 4. Truncate similarities
    df_similarity_N = df_similarity.select("*", F.count("sim").over(window).alias("rank"))\
        .filter(F.col("rank") <= top_N)
    # Step 5. Save data
    self.top_N_similarities = top_N
    self.df_similarity = df_similarity_N.select("u1", "u2", "sim").persist()
    self.df_train = df_train.persist()
    # Force the persists by calling count()
    self.df_similarity.count()
    self.df_train.count()
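# Hedged usage sketch for train(): the class name Recommender, its constructor argument, and
# the toy ratings below are assumptions for illustration -- only the train() signature above
# is taken from the code.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
ratings = spark.createDataFrame(
    [('u1', 'i1', 5.0), ('u1', 'i2', 3.0), ('u2', 'i1', 4.0), ('u2', 'i3', 2.0)],
    ['user', 'item', 'rating'])
model = Recommender(top_N_similarities=10)   # hypothetical class holding the train() method above
model.train(ratings, top_N=10,
            user_column_name='user', item_column_name='item', rating_column_name='rating')
model.df_similarity.show()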
    'x', F.cos(data['latitude'] * g2r) * F.cos(data['longitude'] * g2r))
data = data.withColumn(
    'y', F.cos(data['latitude'] * g2r) * F.sin(data['longitude'] * g2r))
data = data.withColumn('z', F.sin(data['latitude'] * g2r))

# c holds the geometric centre of each engine's positions.
# TODO: replace it with the home-base location computed from the patterns.
c = data.groupBy('engine_serial_number').agg(
    F.mean('x').alias('x'), F.mean('y').alias('y'), F.mean('z').alias('z'))
c = c.withColumn('clon', F.atan(c['y'] / c['x']) / g2r + 180)
c = c.withColumn(
    'clat', F.atan(c['z'] / F.sqrt(c['x'] * c['x'] + c['y'] * c['y'])) / g2r)
c = c.drop('x', 'y', 'z')
# TODO: use the home-base location directly instead

# Snap the odometer reading to 100-metre buckets
data = data.drop('x', 'y', 'z')
data = data.withColumn('dis', (data["high_resolution_total_vehicle_distance"] / 100).cast(IntegerType()) * 100)
data = data.withColumn(
    'time', F.unix_timestamp('occurrence_date_time').cast(IntegerType()))
temp = data.groupBy('dis', 'engine_serial_number').agg(
    (F.max(data['time']) - F.min(data['time'])).alias('staydur'))
data = data.withColumn(
# XXX:
laplacian = sys.argv[1]
if laplacian == 'unnormalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], x[1]))
    D = CoordinateMatrix(entries, numCols=N, numRows=N)
    L = D.toBlockMatrix().subtract(W.toBlockMatrix()).toCoordinateMatrix()
elif laplacian == 'normalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / x[1]))
    D_inv = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N, numRows=N).toBlockMatrix()
    L = I.subtract(D_inv.multiply(W.toBlockMatrix())).toCoordinateMatrix()
elif laplacian == 'symmetric':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / sqrt(x[1])))
    D_invsq = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    # build the identity the same way as in the 'normalized' branch
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N, numRows=N).toBlockMatrix()
    tmp = D_invsq.multiply(W.toBlockMatrix()).multiply(D_invsq)
    L = I.subtract(tmp).toCoordinateMatrix()
else:
    raise ValueError('Unknown type of Laplacian.')

## SVD, and transform from dense matrix to dataframe.
svd = L.toRowMatrix().computeSVD(k=K, computeU=False)
V = svd.V.toArray().tolist()
VV = spark.createDataFrame(V)
kmeans = KMeans().setK(K).setSeed(1)
vecAssembler = VectorAssembler(inputCols=VV.schema.names, outputCol='features')
VV = vecAssembler.transform(VV)
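# A minimal continuation sketch, reusing `kmeans` and `VV` from the snippet above (where VV
# already carries a 'features' vector column of the top-K eigenvectors): fit the KMeans model
# and attach cluster labels. This is an illustrative next step, not part of the original code.
model = kmeans.fit(VV)
clusters = model.transform(VV)   # adds a 'prediction' column with the cluster id
clusters.select('features', 'prediction').show(5)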
def _fit(self, ratings_df): ''' Fit ALS model using reviews as training data. Parameters ========== ratings_df (pyspark.sql.DataFrame) Data used to train recommender model. Columns are 'user', 'item', and 'rating'. Values of user and item must be numeric. Values of rating range from 1 to 5. Returns ======= self ''' # avg_rating_df = ( # ratings_df # .groupBy() # .avg(self.getRatingCol()) # .withColumnRenamed('avg({})'.format(self.getRatingCol()), # 'avg_rating') # ) # print('Fit starting!') start_time = time.monotonic() # print('ratings_df') # ratings_df.show() rating_stats_df = ( ratings_df .agg( F.avg(self.getRatingCol()).alias('avg_rating'), F.stddev_samp(self.getRatingCol()).alias('stddev_rating') ) ) # print('ratings_stats_df:') # rating_stats_df.show() # if not self.getUseALS(): # self.setLambda_1(0.0) # self.setLambda_2(0.0) item_bias_df = ( ratings_df .crossJoin(rating_stats_df) .withColumn( 'diffs_item_rating', F.col(self.getRatingCol()) - F.col('avg_rating') ) .groupBy(self.getItemCol()) .agg( F.avg(F.col('diffs_item_rating')).alias('avg_diffs_item_rating'), F.nanvl( F.stddev_samp(F.col('diffs_item_rating')), F.lit(2.147483647E9) ).alias('stddev_diffs_item_rating'), F.count("*").alias('count_item_rating') ) .withColumn( 'stderr_diffs_item_rating', (self.getLambda_1() + F.col('stddev_diffs_item_rating')) / F.sqrt('count_item_rating') ) .withColumn( 'item_bias', F.col('avg_diffs_item_rating') / (1 + F.col('stderr_diffs_item_rating')) ) .select( self.getItemCol(), 'item_bias', 'avg_diffs_item_rating', 'stderr_diffs_item_rating', 'stddev_diffs_item_rating', 'count_item_rating' ) ) # print('item_bias_df:') # item_bias_df.show(5) # item_bias_df.printSchema() # print('item_bias_df NaN') # item_bias_df.where(F.isnan("item_bias")).show() user_bias_df = ( ratings_df .crossJoin(rating_stats_df) .join(item_bias_df, on=self.getItemCol()) .withColumn( 'diffs_user_rating', F.col(self.getRatingCol()) - F.col('avg_rating') - F.col('item_bias') ) .groupBy(self.getUserCol()) .agg( F.avg(F.col('diffs_user_rating')).alias('avg_diffs_user_rating'), F.nanvl( F.stddev_samp(F.col('diffs_user_rating')), F.lit(2.147483647E9) ).alias('stddev_diffs_user_rating'), F.count("*").alias('count_user_rating') ) .withColumn( 'stderr_diffs_user_rating', (self.getLambda_2() + F.col('stddev_diffs_user_rating')) / F.sqrt('count_user_rating') ) .withColumn( 'user_bias', F.col('avg_diffs_user_rating') / (1 + F.col('stderr_diffs_user_rating')) ) .select( self.getUserCol(), 'user_bias', 'avg_diffs_user_rating', 'stderr_diffs_user_rating', 'stddev_diffs_user_rating', 'count_user_rating' ) ) # print('user_bias_df:') # user_bias_df.show(5) # print('user_bias_df NaN') # user_bias_df.where(F.isnan("user_bias")).show() if self.getUseALS(): if self.getUseBias(): residual_df = ( ratings_df .crossJoin(rating_stats_df) .join(user_bias_df, on=self.getUserCol()) .join(item_bias_df, on=self.getItemCol()) .withColumn( self.getRatingCol(), F.col(self.getRatingCol()) - F.col('avg_rating') - F.col('user_bias') - F.col('item_bias') ) .select( self.getUserCol(), self.getItemCol(), self.getRatingCol() ) ) else: residual_df = ratings_df # self.setColdStartStrategy('drop') residual_stats_df = ( residual_df .agg( F.avg(F.col(self.getRatingCol())).alias('avg_residual'), F.stddev(F.col(self.getRatingCol())).alias('stddev_residual') ) ) # print('residual_df') # residual_df.show() # print('residual_df NaN') # residual_df.where(F.isnan("rating")).show() # print('residual_stats_df') # residual_stats_df.show() als_model = ALS( rank=self.getRank(), 
maxIter=self.getMaxIter(), regParam=self.getRegParam(), numUserBlocks=self.getNumUserBlocks(), numItemBlocks=self.getNumItemBlocks(), implicitPrefs=self.getImplicitPrefs(), alpha=self.getAlpha(), userCol=self.getUserCol(), itemCol=self.getItemCol(), ratingCol=self.getRatingCol(), nonnegative=self.getNonnegative(), checkpointInterval=self.getCheckpointInterval(), intermediateStorageLevel=self.getIntermediateStorageLevel(), finalStorageLevel=self.getFinalStorageLevel() ) recommender = als_model.fit(residual_df) else: recommender = None residual_stats_df = None print('Fit done in {} seconds'.format(time.monotonic() - start_time)) return( RecommenderModel( self.getUseALS(), self.getUseBias(), self.getLambda_3(), # self.getColdStartStrategy(), recommender, rating_stats_df, residual_stats_df, user_bias_df, item_bias_df ) )
# In[66]:
get_ipython().magic('matplotlib inline')

# In[67]:
import seaborn

# In[69]:
sentiment_pd = best_model.transform(airportCleanDF).\
    groupby('airport_name').\
    agg(fn.avg('prediction').alias('prediction'),
        (2*fn.stddev('prediction')/fn.sqrt(fn.count('*'))).alias('err')).\
    toPandas()

# In[70]:
for window in [1, 5, 10, 20, 80]:
    df = df.withColumn(
        "squared_error_Window_{}".format(window),
        pow((col("Close_Actual_Window_{}".format(window)) - col("close_ma")), 2))
    df = df.withColumn(
        "s_abs_percentage_error_Window_{}".format(window),
        (abs(col("close_ma") - col("Close_Actual_Window_{}".format(window))) /
         ((col("Close_Actual_Window_{}".format(window)) + col("close_ma")) / 2)) * 100)

df.show()
df = df.withColumn("rank", percent_rank().over(Window.partitionBy().orderBy("date")))
train_data = df.where("rank <= .9").drop("rank")
test_data = df.where("rank > .9").drop("rank")

for window in [1, 5, 10, 20, 80]:
    total_rmse.append(
        test_data.select(
            sqrt(mean(col("squared_error_Window_{}".format(window))))).collect())
    total_smape.append(
        test_data.select(
            mean(col("s_abs_percentage_error_Window_{}".format(window)))).collect())

print(total_mape)
print(total_rmse)
print(total_smape)
def get_sd(col):
    # Population standard deviation via sqrt(E[X^2] - E[X]^2)
    return func.sqrt(func.avg(col * col) - func.avg(col) * func.avg(col))
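# Quick check of the identity behind get_sd(): E[X^2] - E[X]^2 is the *population* variance,
# so get_sd() should agree with F.stddev_pop rather than the sample stddev. The SparkSession
# and toy column are assumptions for the demo; `func` aliases pyspark.sql.functions as above.
from pyspark.sql import SparkSession, functions as func

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(1.0,), (2.0,), (3.0,), (4.0,)], ['x'])
toy.select(get_sd(func.col('x')).alias('sd_manual'),
           func.stddev_pop('x').alias('sd_pop')).show()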
def main(): # Args args = parse_args() # args.in_ld_folder = 'input_data/ld_each_variant' # args.in_manifest = 'input_data/190625/ld_analysis_input.tsv' # args.in_top_loci = 'input_data/190625/toploci.parquet' # args.out = 'output/ld_w_crediblesets.parquet' # args.min_r2 = 0.5 # Make spark session global spark spark = (pyspark.sql.SparkSession.builder.config("spark.master", "local[*]").getOrCreate()) print('Spark version: ', spark.version) # # Load data --------------------------------------------------------------- # # Load LD ld = ( load_ld(args.in_ld_folder).withColumn( 'index_variant_id', regexp_replace(col('index_variant_id'), ':', '_')).withColumn( 'tag_variant_id', regexp_replace(col('tag_variant_id'), ':', '_')) # .limit(10000) # Debug ) # Load manifest manifest = (load_manifest(args.in_manifest).withColumnRenamed( 'variant_id', 'index_variant_id')) # # Weight correlations by study population --------------------------------- # # Join LD to manifest data = manifest.join(ld, on='index_variant_id') # Replace R fields for coln in ['R_AFR', 'R_AMR', 'R_EAS', 'R_EUR', 'R_SAS']: data = ( data # Replace all R values == 1 with 0.9999995, otherwise we get error # This is reverted later by rounding to 6 dp .withColumn(coln, when(col(coln) == 1, 0.9999995).otherwise(col(coln)) ) # Fill nulls with 0 .withColumn(coln, when(col(coln).isNull(), 0).otherwise(col(coln)) ) ) # Fisher transform correlations to z-scores for coln in ['R_AFR', 'R_AMR', 'R_EAS', 'R_EUR', 'R_SAS']: data = data.withColumn(coln.replace('R_', 'Z_'), arctanh(col(coln))) # Compute weighted average across populations data = data.withColumn( 'Z_overall', ((col('AFR_prop') * col('Z_AFR')) + (col('AMR_prop') * col('Z_AMR')) + (col('EAS_prop') * col('Z_EAS')) + (col('EUR_prop') * col('Z_EUR')) + (col('SAS_prop') * col('Z_SAS')))) # Inverse Fisher transform weigthed z-score back to correlation data = data.withColumn('R_overall', tanh(col('Z_overall'))) # Round R_overall to 6 dp data = data.withColumn('R_overall', round6dp(col('R_overall'))) # Convert R to R2 data = data.withColumn('R2_overall', pow(col('R_overall'), 2)) # Drop rows where R2 is null data = data.filter(col('R2_overall').isNotNull()) # Filter based on overall R2 data = data.filter(col('R2_overall') >= args.min_r2) # Drop unneeded columns data = data.drop(*[ 'Z_overall', 'R_overall', 'R_AFR', 'R_AMR', 'R_EAS', 'R_EUR', 'R_SAS', 'Z_AFR', 'Z_AMR', 'Z_EAS', 'Z_EUR', 'Z_SAS', 'index_variant_id' ]) # Denormalise variant IDs data = (data.withColumnRenamed('chrom', 'lead_chrom').withColumnRenamed( 'pos', 'lead_pos').withColumnRenamed( 'ref', 'lead_ref').withColumnRenamed('alt', 'lead_alt').withColumn( 'tag_split', split(col('tag_variant_id'), '_')).withColumn( 'tag_chrom', col('tag_split').getItem(0)).withColumn( 'tag_pos', col('tag_split').getItem(1).cast('int')).withColumn( 'tag_ref', col('tag_split').getItem(2)).withColumn( 'tag_alt', col('tag_split').getItem(3)).drop( 'tag_split', 'tag_variant_id')) # # Conduct credible set analysis using PICS adjustment --------------------- # ''' Probabilistic Identification of Causal SNPs (PICS) from Farh (2014): https://www.nature.com/articles/nature13835 Adjusts the p-values for tag SNPs based on the p-value of the lead SNP and it's LD. ''' # Empiric constant that can be adjusted to fit the curve, 6.4 recommended. 
k = 6.4 # Load toploci toploci = spark.read.parquet(args.in_top_loci) # Join negative log pvalue from toploci onto data toploci = (toploci.withColumn( 'neglog_p', -1 * (log10(col('pval_mantissa')) + col('pval_exponent'))).withColumnRenamed( 'chrom', 'lead_chrom').withColumnRenamed( 'pos', 'lead_pos').withColumnRenamed( 'ref', 'lead_ref').withColumnRenamed( 'alt', 'lead_alt').select('study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt', 'neglog_p')) data = data.join( toploci, on=['study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt']) # Calculate PICS statistics data = (data.withColumn('pics_mu', col('R2_overall') * col('neglog_p')).withColumn( 'pics_std', sqrt(1 - sqrt(col('R2_overall'))**k) * sqrt(col('neglog_p')) / 2).withColumn( 'pics_relative_prob', when(col('pics_std') == 0, 1.0).otherwise( norm_sf(col('pics_mu'), col('pics_std'), col('neglog_p'))))) # Calculate the sum of the posterior probabilities at each locus pics_prob_sums = (data.groupby( 'study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt').agg( sum('pics_relative_prob').alias('pics_relative_prob_sum'))) # Merge back onto data data = data.join( pics_prob_sums, on=['study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt']) # Calculate posterior probability at each locus data = (data.withColumn( 'pics_postprob', col('pics_relative_prob') / col('pics_relative_prob_sum')).drop( 'pics_relative_prob_sum', 'neglog_p')) # Calculate cumulative sum per locus window_spec = (Window.partitionBy('study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt').orderBy( desc('pics_postprob')).rowsBetween( Window.unboundedPreceding, Window.currentRow)) data = (data.withColumn('pics_postprob_cumsum', sum('pics_postprob').over(window_spec))) # Label whether each row is in the 95 and 99% credible sets window_spec = (Window.partitionBy( 'study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt').orderBy('pics_postprob_cumsum')) data = (data.withColumn( 'pics_95perc_credset', when(lag('pics_postprob_cumsum', 1).over(window_spec) >= 0.95, False).otherwise(True)).withColumn( 'pics_99perc_credset', when( lag('pics_postprob_cumsum', 1).over(window_spec) >= 0.99, False).otherwise(True))) # # Write output ------------------------------------------------------------ # # Rename columns and format data = (data.withColumnRenamed( 'AFR_prop', 'AFR_1000G_prop').withColumnRenamed( 'AMR_prop', 'AMR_1000G_prop').withColumnRenamed( 'EAS_prop', 'EAS_1000G_prop').withColumnRenamed( 'EUR_prop', 'EUR_1000G_prop').withColumnRenamed( 'SAS_prop', 'SAS_1000G_prop').withColumnRenamed( 'R2_overall', 'overall_r2').select( 'study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt', 'tag_chrom', 'tag_pos', 'tag_ref', 'tag_alt', 'overall_r2', 'pics_mu', 'pics_postprob', 'pics_95perc_credset', 'pics_99perc_credset', 'AFR_1000G_prop', 'AMR_1000G_prop', 'EAS_1000G_prop', 'EUR_1000G_prop', 'SAS_1000G_prop')) # Save output (data.repartitionByRange('study_id', 'lead_chrom', 'lead_pos').write.parquet(args.out, mode='overwrite')) return 0
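# Plain-Python transcription of the PICS columns computed above, for a single tag variant,
# to make the formula easier to read. The input numbers are illustrative; k = 6.4 as in the code.
import math

def pics_mu_std(r2_overall, neglog_p, k=6.4):
    """mu = r2 * (-log10 p_lead); std = sqrt(1 - sqrt(r2)**k) * sqrt(-log10 p_lead) / 2."""
    mu = r2_overall * neglog_p
    std = math.sqrt(1 - math.sqrt(r2_overall) ** k) * math.sqrt(neglog_p) / 2
    return mu, std

print(pics_mu_std(r2_overall=0.8, neglog_p=12.0))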
lat1 = 0.9345569159727344
lon1 = -1.9806997123424743
lat2 = 0.7945023069213337
lon2 = -1.2839693364011688
lat3 = 0.7893221871547071
lon3 = -1.1036193160713015

dlat1 = lat1 - lat0
dlon1 = lon1 - lon0
dlat2 = lat2 - lat0
dlon2 = lon2 - lon0
dlat3 = lat3 - lat0
dlon3 = lon3 - lon0

a1 = F.sin(dlat1 / 2)**2 + F.cos(lat0) * F.cos(lat0) * F.sin(dlon1 / 2)**2
a2 = F.sin(dlat2 / 2)**2 + F.cos(lat0) * F.cos(lat0) * F.sin(dlon2 / 2)**2
a3 = F.sin(dlat3 / 2)**2 + F.cos(lat0) * F.cos(lat0) * F.sin(dlon3 / 2)**2

c1 = F.lit(2) * F.asin(F.sqrt(a1))
c2 = F.lit(2) * F.asin(F.sqrt(a2))
c3 = F.lit(2) * F.asin(F.sqrt(a3))
r = F.lit(6371)

dist1 = (c1 * r).alias('dist1')
dist2 = (c2 * r).alias('dist2')
dist3 = (c3 * r).alias('dist3')

distances = clean2.select("_ID", "TimeSt", "City", "Province", "Latitude", "Longitude",
                          dist1, dist2, dist3)
distances.registerTempTable("dist0")

# POI assignation and minimal distance to poi
query = """SELECT _ID, TimeSt, City, Province, dist1, dist2, dist3,
CASE WHEN (dist1 < dist2) AND (dist1 < dist3) THEN "POI1 - EDMONTON"
def lassoRegression(self, dataset_add, feature_colm, label_colm, relation_list, relation, userId): try: dataset = spark.read.parquet(dataset_add) dataset.show() Rsqr_list = [] Rsqr_regPara = {} print(self.xt) # print(data_add) label = '' for val in label_colm: label = val #ETL part Schema = dataset.schema stringFeatures = [] numericalFeatures = [] for x in Schema: if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType' or str(x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType' or str(x.dataType) == 'BinaryType'): for y in feature_colm: if x.name == y: dataset = dataset.withColumn( y, dataset[y].cast(StringType())) stringFeatures.append(x.name) else: for y in feature_colm: if x.name == y: numericalFeatures.append(x.name) if relation == 'linear': dataset = dataset if relation == 'non_linear': dataset = Relationship(dataset, relation_list) categoryColmList = [] categoryColmListFinal = [] categoryColmListDict = {} countOfCategoricalColmList = [] for value in stringFeatures: categoryColm = value listValue = value listValue = [] categoryColm = dataset.groupby(value).count() countOfCategoricalColmList.append(categoryColm.count()) categoryColmJson = categoryColm.toJSON() for row in categoryColmJson.collect(): categoryColmSummary = json.loads(row) listValue.append(categoryColmSummary) categoryColmListDict[value] = listValue if not stringFeatures: maxCategories = 5 else: maxCategories = max(countOfCategoricalColmList) for x in Schema: if (str(x.dataType) == "StringType" and x.name == label): for labelkey in label_colm: label_indexer = StringIndexer( inputCol=label, outputCol='indexed_' + label, handleInvalid="skip").fit(dataset) dataset = label_indexer.transform(dataset) label = 'indexed_' + label else: label = label indexed_features = [] encodedFeatures = [] for colm in stringFeatures: indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset) indexed_features.append('indexed_' + colm) dataset = indexer.transform(dataset) featureAssembler = VectorAssembler(inputCols=indexed_features + numericalFeatures, outputCol='features', handleInvalid="skip") dataset = featureAssembler.transform(dataset) vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures', maxCategories=maxCategories, handleInvalid="skip").fit(dataset) dataset = vectorIndexer.transform(dataset) trainDataRatioTransformed = self.trainDataRatio testDataRatio = 1 - trainDataRatioTransformed train_data, test_data = dataset.randomSplit( [trainDataRatioTransformed, testDataRatio], seed=40) ######################################################################33 # lasso final for t in self.xt: lr1 = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label, elasticNetParam=1, regParam=t) regressor1 = lr1.fit(train_data) print(t) print("coefficient : " + str(regressor1.coefficients)) reg_sum = regressor1.summary r2 = reg_sum.r2 Rsqr_list.append(r2) Rsqr_regPara[r2] = t print(r2) print(Rsqr_list) print(max(Rsqr_list)) maximum_rsqr = max(Rsqr_list) print(Rsqr_regPara) final_regPara = [] for key, val in Rsqr_regPara.items(): if (key == maximum_rsqr): print(val) final_regPara.append(val) for reg in final_regPara: lr_lasso = LinearRegression( featuresCol="vectorIndexedFeatures", labelCol=label, elasticNetParam=1, regParam=reg) regressor = lr_lasso.fit(train_data) training_summary = regressor.summary r2 = training_summary.r2 print(r2) print("coefficient : " + str(regressor.coefficients)) coefficient_t = str(regressor.coefficients) 
print("intercept : " + str(regressor.intercept)) intercept_t = str(regressor.intercept) prediction = regressor.evaluate(test_data) prediction_val = prediction.predictions prediction_val.show() prediction_val_pand = prediction_val.select( label, "prediction").toPandas() prediction_val_pand = prediction_val_pand.assign( residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"]) prediction_val_pand_residual = prediction_val_pand["residual_vall"] prediction_val_pand_label = prediction_val_pand[label] prediction_val_pand_predict = prediction_val_pand["prediction"] lr_prediction = regressor.transform(test_data) lr_prediction.groupBy(label, "prediction").count().show() lr_prediction_quantile = lr_prediction.select(label, "prediction") lr_prediction_onlypred = lr_prediction.select('prediction') # lr_prediction_quantile.show() # training_summary = regressor.summary print("numof_Iterations...%d\n" % training_summary.totalIterations) print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory)) print("RMSE...%f\n" % training_summary.rootMeanSquaredError) RMSE = training_summary.rootMeanSquaredError print("MSE....%f\n" % training_summary.meanSquaredError) MSE = training_summary.meanSquaredError print("r**2(r-square)....::%f\n" % training_summary.r2) r_square = training_summary.r2 print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj) adjsted_r_square = training_summary.r2adj print("deviance residuals %s" % str(training_summary.devianceResiduals)) training_summary.residuals.show() # residual_graph = training_summary.residuals # test = (residual_graph, lr_prediction_onlypred) # residual_graph.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append' ) # print(test) # test.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode= 'append') # residual_graph_pandas = residual_graph.toPandas() # print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors)) # coefficient_error = str(training_summary.coefficientStandardErrors) # print(" Tvalues :\n" + str(training_summary.tValues)) # T_values = str(training_summary.tValues) # print(" p values :\n" + str(training_summary.pValues)) # P_values = str(training_summary.pValues) ####################################################################################################### table_response = { "Intercept": intercept_t, "Coefficients": coefficient_t, "RMSE": RMSE, "MSE": MSE, "R_square": r_square, "Adj_R_square": adjsted_r_square } ####################################################################################################### # residual vs predicted value prediction_data = regressor.summary.predictions prediction_data.show() prediction_data.select(['prediction']).show() predicted = prediction_data.select(['prediction']) regressor.summary.residuals.show() residuals = regressor.summary.residuals pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id()) res_d = residuals.withColumn('row_index', f.monotonically_increasing_id()) pred_residuals = pred_d.join( res_d, on=['row_index']).sort('row_index').drop('row_index') pred_residuals.show() QQPlot = 'QQPlot.parquet' locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/' # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67' QQPlotAddress = locationAddress + userId + QQPlot pred_residuals.write.parquet(QQPlotAddress, mode='overwrite') # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet', # 
mode='overwrite') #################################################################################3 # scale location plot from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev df_label = prediction_data.select( label, 'prediction', sqrt(ab(prediction_data[label])).alias("sqrt_label")) df_label.show() df_sqrt_label_index = df_label.withColumn( 'row_index', f.monotonically_increasing_id()) df_sqrt_label_index.show() res_d.show() sqrt_label_residual_join = df_sqrt_label_index.join( res_d, on=['row_index']).sort('row_index').drop('row_index') sqrt_label_residual_join.show() std_resid = sqrt_label_residual_join.select( 'sqrt_label', 'prediction', (sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias('std_res')) std_resid.show() sqrt_std_res = std_resid.select( "std_res", 'prediction', sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid")) sqrt_std_res.show() sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid') scaleLocationPlot = 'scaleLocation.parquet' scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress, mode='overwrite') # sqrt_std_res_fitted.write.parquet( # 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet', # mode='overwrite') ########### #QQplot # QUANTILE from scipy.stats import norm import statistics import math res_d.show() sorted_res = res_d.sort('residuals') sorted_res.show() # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'), # meann(col('residuals')).alias('mean')) # stdev_ress.show() # mean_residual = stdev_ress.select(['mean']).toPandas() # l = mean_residual.values.tolist() # print(l) # stddev_residual = stdev_ress.select(['std_dev']).toPandas() # length of the sorted std residuals count = sorted_res.groupBy().count().toPandas() countList = count.values.tolist() tuple1 = () for k in countList: tuple1 = k for tu in tuple1: lengthResiduals = tu print(lengthResiduals) quantileList = [] for x in range(0, lengthResiduals): quantileList.append((x - 0.5) / (lengthResiduals)) print(quantileList) # Z-score on theoritical quantile zTheoriticalTrain = [] for x in quantileList: zTheoriticalTrain.append(norm.ppf(abs(x))) print(zTheoriticalTrain) sortedResidualPDF = sorted_res.select('residuals').toPandas() sortedResidualPDF = sortedResidualPDF['residuals'] stdevResidualTrain = statistics.stdev(sortedResidualPDF) meanResidualTrain = statistics.mean(sortedResidualPDF) zPracticalTrain = [] for x in sortedResidualPDF: zPracticalTrain.append( (x - meanResidualTrain) / stdevResidualTrain) ########## target = dataset.select(label) pred = prediction_data.select(['prediction']) pred_d = pred.withColumn('row_index', f.monotonically_increasing_id()) target_d = target.withColumn('row_index', f.monotonically_increasing_id()) pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index') pred_target.show() dataset.show() pred_target_data_update = dataset.join(pred_target, on=[label]) pred_target_data_update.show(100) ########################################################################################## # scale location plot # for scale location plot # from pyspark.sql.functions import udf # # def std_res(x): # res_list = [] # res_list.append(x) # # std_residuals = udf(lambda y: std_res(y), FloatType()) # # residuals_std = residuals.withColumn('residuals', std_residuals(col('residuals').cast(FloatType()))) # # import statistics # import numpy as np # residuals_panda = 
residuals.toPandas() # # residuals_panda.residuals = range(residuals_panda.shape[1]) # residuals_panda = residuals_panda.values # print(residuals_panda) # stdev_training = statistics.stdev(residuals_panda) # print(stdev_training) ############################################################################################################ # creating the dictionary for storing the result # json_response = coefficient_t # print(json_response) # json_response = {"adjusted r**2 value" : training_summary.r2adj} # DATA VISUALIZATION PART # finding the quantile in the dataset(Q_Q plot) import matplotlib.pyplot as plt y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_label = lr_prediction_quantile.approxQuantile( label, x, 0.01) quantile_prediction = lr_prediction_quantile.approxQuantile( "prediction", x, 0.01) Q_label_pred = '' print(len(quantile_label)) length = len(quantile_label) for i in range(0, len(quantile_label)): Q_label_pred += str(quantile_label[i]) + 't' + str( quantile_prediction[i]) + 'n' import math fitted_residual = '' print(len(prediction_val_pand_residual)) length = len(prediction_val_pand_residual) for i in range(0, len(prediction_val_pand_residual)): fitted_residual += str( prediction_val_pand_predict[i]) + 't' + str( prediction_val_pand_residual[i]) + 'n' ## scale location graph data prediction_val_pand_residual prediction_val_pand_predict prediction_val_pand_residual_abs = prediction_val_pand_residual.abs( ) import math sqrt_residual = [] for x in prediction_val_pand_residual_abs: sqrt_residual.append(math.sqrt(x)) # print ("____________________ ",x) sqrt_residual # calculating std deviation import statistics print(statistics.stdev(prediction_val_pand_residual)) stdev_ = statistics.stdev(prediction_val_pand_residual) # calcuate stnd residuals std_res = [] for x in prediction_val_pand_residual: std_res.append(x / stdev_) print(std_res) # calculating the square root of std_res import math sqr_std_res = [] for x in std_res: sqr_std_res.append(math.sqrt(abs(x))) print(sqr_std_res) scale_predict_residual = '' for pre, res in zip(prediction_val_pand_predict, sqr_std_res): scale_predict_residual += str(pre) + 't' + str(res) + 'n' print(scale_predict_residual) # QUANTILE y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_std_res = spark.createDataFrame(std_res, FloatType()) quantile_std_res.show() quantile_std_res_t = quantile_std_res.approxQuantile( 'value', x, 0.01) print(quantile_std_res_t) print(x) # calculating the z_score from scipy.stats import norm ## sort the list sorted_std_res = sorted(std_res) mean = statistics.mean(sorted_std_res) stdev = statistics.stdev(sorted_std_res) # print(mean) quantile = [] n = len(std_res) print(n) for x in range(0, n): quantile.append((x - 0.5) / (n)) print(quantile) # z_score theoratical z_theory = [] for x in quantile: z_theory.append(norm.ppf(abs(x))) # z score for real val z_pract = [] for x in sorted_std_res: z_pract.append((x - mean) / stdev) Q_label_pred = '' for quant, val in zip(z_theory, z_pract): Q_label_pred += str(quant) + 't' + str(val) + 'n' graph_response = { "Q_Q_plot": Q_label_pred, "residual_fitted": fitted_residual, "scale_location": scale_predict_residual } json_response = { 'table_data': table_response, 'graph_data': graph_response } return json_response except Exception as e: print('exception is =' + str(e))
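# The core of lassoRegression above is a sweep over candidate regularization strengths
# with elasticNetParam=1, keeping the regParam whose fit has the highest training R^2.
# A condensed sketch of that selection step (column names follow the function above;
# this is an illustration, not a drop-in replacement):
from pyspark.ml.regression import LinearRegression

def best_lasso_reg_param(train_data, reg_params,
                         features_col="vectorIndexedFeatures", label_col="label"):
    # Fit one lasso model per candidate regParam and keep the value with the best
    # training R^2, mirroring the Rsqr_list / Rsqr_regPara bookkeeping above.
    scores = {}
    for reg in reg_params:
        model = LinearRegression(featuresCol=features_col, labelCol=label_col,
                                 elasticNetParam=1, regParam=reg).fit(train_data)
        scores[reg] = model.summary.r2
    return max(scores, key=scores.get)

# Note: ranking by training R^2 tends to favour the weakest penalty; scoring on a
# held-out split (or pyspark.ml.tuning.CrossValidator) is usually preferable.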
def ridge(self, dataset_add, feature_colm, label_colm, relation_list, relation, userId): try: dataset = spark.read.csv(dataset_add, header=True, inferSchema=True) dataset.show() Rsqr_list = [] Rsqr_regPara = {} print(self.xt) # print(data_add) # data = spark.read.csv('/home/fidel/mltest/BI.csv', header=True, inferSchema=True) # data.show() # f_data = data.select('Sub Total', 'Tax Amount', 'Freight', 'Profit') # f_data.show() # class A(): # def __init__(self, feature='sahil', label='fcuk'): # self.feature = feature # # feature = 'sahil' # self.label = label # # self.test # self.name = 'bro' # # def linear_c(self): # print(self.feature, '\n', self.label) # print(self.name) # # # a = A(feature='test', label='f_t') # A(feature='test', label='f_t').linear_c() # renaming the colm # print(label_colm) # dataset.withColumnRenamed(label_colm, "label") # print(label_colm) # dataset.show() label = '' for y in label_colm: label = y print(label) # relationship if relation == 'linear': print('linear relationship') if relation == 'non_linear': dataset = Relationship(dataset, relation_list) dataset.show() # implementing the vector assembler featureassembler = VectorAssembler( inputCols=feature_colm, outputCol="Independent_features") output = featureassembler.transform(dataset) output.show() output.select("Independent_features").show() finalized_data = output.select("Independent_features", label) finalized_data.show() # splitting the dataset into taining and testing train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40) ######################################################################33 # lasso final for t in self.xt: lr1 = LinearRegression(featuresCol="Independent_features", labelCol=label, elasticNetParam=0, regParam=t) regressor1 = lr1.fit(train_data) print(t) print("coefficient : " + str(regressor1.coefficients)) reg_sum = regressor1.summary r2 = reg_sum.r2 Rsqr_list.append(r2) Rsqr_regPara[r2] = t print(r2) print(Rsqr_list) print(max(Rsqr_list)) maximum_rsqr = max(Rsqr_list) print(Rsqr_regPara) final_regPara = [] for key, val in Rsqr_regPara.items(): if (key == maximum_rsqr): print(val) final_regPara.append(val) for reg in final_regPara: lr_lasso = LinearRegression(featuresCol="Independent_features", labelCol=label, elasticNetParam=0, regParam=reg) regressor = lr_lasso.fit(train_data) training_summary = regressor.summary r2 = training_summary.r2 print(r2) print("coefficient : " + str(regressor.coefficients)) coefficient_t = str(regressor.coefficients) print("intercept : " + str(regressor.intercept)) intercept_t = str(regressor.intercept) prediction = regressor.evaluate(test_data) prediction_val = prediction.predictions prediction_val.show() prediction_val_pand = prediction_val.select( label, "prediction").toPandas() prediction_val_pand = prediction_val_pand.assign( residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"]) prediction_val_pand_residual = prediction_val_pand["residual_vall"] prediction_val_pand_label = prediction_val_pand[label] # print prediction_val_pand_residual prediction_val_pand_predict = prediction_val_pand["prediction"] # print prediction_val_pand_predict # test_summary = prediction.summary # for test data lr_prediction = regressor.transform(test_data) lr_prediction.groupBy(label, "prediction").count().show() lr_prediction_quantile = lr_prediction.select(label, "prediction") lr_prediction_onlypred = lr_prediction.select('prediction') # lr_prediction_quantile.show() # training_summary = regressor.summary print("numof_Iterations...%d\n" % 
training_summary.totalIterations) print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory)) print("RMSE...%f\n" % training_summary.rootMeanSquaredError) RMSE = training_summary.rootMeanSquaredError print("MSE....%f\n" % training_summary.meanSquaredError) MSE = training_summary.meanSquaredError print("r**2(r-square)....::%f\n" % training_summary.r2) r_square = training_summary.r2 print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj) adjsted_r_square = training_summary.r2adj print("deviance residuals %s" % str(training_summary.devianceResiduals)) training_summary.residuals.show() # residual_graph = training_summary.residuals # test = (residual_graph, lr_prediction_onlypred) # residual_graph.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append' ) # print(test) # test.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode= 'append') # residual_graph_pandas = residual_graph.toPandas() print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors)) coefficient_error = str(training_summary.coefficientStandardErrors) print(" Tvalues :\n" + str(training_summary.tValues)) T_values = str(training_summary.tValues) print(" p values :\n" + str(training_summary.pValues)) P_values = str(training_summary.pValues) ####################################################################################################### table_response = { "Intercept": intercept_t, "Coefficients": coefficient_t, "RMSE": RMSE, "MSE": MSE, "R_square": r_square, "Adj_R_square": adjsted_r_square, "Coefficient_error": coefficient_error, "T_value": T_values, "P_value": P_values } ####################################################################################################### # residual vs fitted graph prediction_data = regressor.summary.predictions prediction_data.show() prediction_data.select(['prediction']).show() predicted = prediction_data.select(['prediction']) regressor.summary.residuals.show() residuals = regressor.summary.residuals pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id()) res_d = residuals.withColumn('row_index', f.monotonically_increasing_id()) pred_residuals = pred_d.join( res_d, on=['row_index']).sort('row_index').drop('row_index') pred_residuals.show() pred_residuals.write.parquet( 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/residual_fitted_train.parquet', mode='overwrite') ###################################################################################### # scale location plot training data from pyspark.sql.functions import sqrt from pyspark.sql.functions import abs as ab df_label = prediction_data.select( label, 'prediction', sqrt(ab(prediction_data[label])).alias("sqrt_label")) df_label.show() df_sqrt_label_index = df_label.withColumn( 'row_index', f.monotonically_increasing_id()) df_sqrt_label_index.show() # df_residual_index = df_residual.withColumn('row_index', f.monotonically_increasing_id()) # df_residual_index.show() res_d.show() sqrt_label_residual_join = df_sqrt_label_index.join( res_d, on=['row_index']).sort('row_index').drop('row_index') sqrt_label_residual_join.show() std_resid = sqrt_label_residual_join.select( 'sqrt_label', 'prediction', (sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias('std_res')) std_resid.show() # std_resid_std_res = std_resid.select("std_res") sqrt_std_res = std_resid.select( "std_res", 'prediction', sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid")) # sqrt_std_res = 
sqrt(abs(std_resid_std_res["std_res"])) sqrt_std_res.show() sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid') sqrt_std_res_fitted.write.parquet( 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet', mode='overwrite') ###################################################################################### # QUANTILE ''' from pyspark.sql.functions import * res_d.show() sorted_res = res_d.sort('residuals') sorted_res.show() stdev_ress = sorted_res.select(stddev(col('residuals')).alias('std_dev'),mean(col('residuals')).alias('mean')) stdev_ress.show() mean_residual = stdev_ress.select(['mean']).toPandas() stddev_residual = stdev_ress.select(['std_dev']).toPandas() for x in range(0, 5): print(x/mean_residual) ''' #################################################################################### # appending predicted value to the dataset target = dataset.select(label) pred = prediction_data.select(['prediction']) pred_d = pred.withColumn('row_index', f.monotonically_increasing_id()) target_d = target.withColumn('row_index', f.monotonically_increasing_id()) pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index') pred_target.show() dataset.show() pred_target_data_update = dataset.join(pred_target, on=[label]) pred_target_data_update.show(100) ########################################################### import matplotlib.pyplot as plt y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) # # for z in x: # print ("~~~~~ ",z) # quantile_label = lr_prediction_quantile.approxQuantile( label, x, 0.01) # print quantile_label quantile_prediction = lr_prediction_quantile.approxQuantile( "prediction", x, 0.01) # creating the csv file and writitng into it fitted_residual = '' print(len(prediction_val_pand_residual)) length = len(prediction_val_pand_residual) for i in range(0, len(prediction_val_pand_residual)): fitted_residual += str( prediction_val_pand_predict[i]) + 't' + str( prediction_val_pand_residual[i]) + 'n' with open('residual_vs_fitted.csv', 'w') as r_f: writer_r_f = csv.writer(r_f) writer_r_f.writerows((prediction_val_pand_predict, prediction_val_pand_residual)) # parquet file writing ## residual vs leverage graph data prediction_val_pand_residual # extreme value in the predictor colm prediction_col_extremeval = lr_prediction_quantile.agg( {"prediction": "max"}) # prediction_col_extremeval.show() # plt.plot(prediction_col_extremeval, prediction_val_pand_residual) # plt.show() ## scale location graph data import math prediction_val_pand_residual prediction_val_pand_predict prediction_val_pand_residual_abs = prediction_val_pand_residual.abs( ) sqrt_residual = [] for x in prediction_val_pand_residual_abs: sqrt_residual.append(math.sqrt(x)) # print ("____________________ ",x) sqrt_residual # plt.scatter(sqrt_residual, prediction_val_pand_predict) ####################################################################################3 # calculating std deviation import statistics print(statistics.stdev(prediction_val_pand_residual)) stdev_pred = statistics.stdev(prediction_val_pand_residual) # mean = statistics.mean(prediction_val_pand_residual) # calcuate stnd residuals std_res = [] for x in prediction_val_pand_residual: std_res.append(x / stdev_pred) print(std_res) # calculating the square root of std_res import math sqr_std_res = [] for x in std_res: sqr_std_res.append(math.sqrt(abs(x))) print(sqr_std_res) #######################################################################################3 # QUANTILE ## sort 
the list sorted_std_res = sorted(std_res) print(sorted_std_res) # mean = statistics.mean(sorted_std_res) stdev = statistics.stdev(sorted_std_res) print(mean) quantile = [] n = len(sorted_std_res) print(n) for x in range(0, n): quantile.append((x - 0.5) / (n)) print(quantile) # # z_score theoritical from scipy.stats import norm z_theory = [] for x in quantile: z_theory.append((norm.ppf(abs(x)))) print(z_theory) # z score for real val z_pract = [] for x in sorted_std_res: z_pract.append((x - mean) / stdev) # y = 0.1 x = [] for i in range(0, 90): x.append(y) y = round(y + 0.01, 2) quantile_std_res = spark.createDataFrame(std_res, FloatType()) quantile_std_res.show() quantile_std_res_t = quantile_std_res.approxQuantile( 'value', x, 0.01) print(quantile_std_res_t) print(x) Q_label_pred = '' print(len(quantile_label)) length = len(quantile_label) for quant, val in zip(z_theory, z_pract): Q_label_pred += str(val) + 't' + str(quant) + 'n' plt.scatter(z_theory, z_pract) plt.savefig('q_q') #################################################### # creating the std residuals # square root of label sqrt_label = [] for x in prediction_val_pand_label: sqrt_label.append(math.sqrt(abs(x))) sqrt_label prediction_val_pand_residual std_residual = [] for sqr, resid in zip(sqrt_label, prediction_val_pand_residual): std_residual.append(resid / sqr) # print(std_sqrt_residual) # creating the std sqr root sqrt_std_residuals = [] for x in std_residual: # print(math.sqrt(abs(x))) sqrt_std_residuals.append(math.sqrt(abs(x))) print(sqrt_std_residuals) # print(std_sqrt_residual) scale_predict_residual = '' for pre, res in zip(prediction_val_pand_predict, sqrt_std_residuals): scale_predict_residual += str(pre) + 't' + str(res) + 'n' print(scale_predict_residual) ########################################################################## """ pred_residuals.show() pred_residuals_pandas = pred_residuals.toPandas() print(pred_residuals_pandas) res_pandas = pred_residuals_pandas['residuals'] pred_pandas = pred_residuals_pandas['prediction'] label_list = [] # for res, pred in zip(res_pandas, pred_pandas): # label_list.append(res+pred) label_pand = prediction_data.select([label]).toPandas() labe_panda = label_pand[label] # sqrt of label column sqrt_lab = [] for lab in labe_panda: sqrt_lab.append(math.sqrt(abs(lab))) print(res_pandas) stdev_res = statistics.stdev(res_pandas) std_res_list = [] for valr, labe in zip(res_pandas,sqrt_lab): std_res_list.append(valr/labe) print(std_res_list) """ ########################################################################## ########################################################################## # import math # sqrt_stdres = [] # for x in std_sqrt_residual: # sqrt_stdres.append(math.sqrt(x)) # # scale_predict_residual = '' # for pre, res in zip(prediction_val_pand_predict, sqrt_stdres): # scale_predict_residual += str(pre) + 't' + str(res) + 'n' # print(scale_predict_residual) ###################################3 # plt.show() # scale_predict_residual='' # # print(len(sqrt_residual)) # length = len(sqrt_residual) # # for i in range(0, len(std_sqrt_residual)): # scale_predict_residual += str(prediction_val_pand_predict[i]) + '|' + str(std_sqrt_residual[i]) + '\n' # with open('scale_location_plot.csv', 'w') as s_l: # writer_s_l = csv.writer(s_l) # writer_s_l.writerows((prediction_val_pand_predict, sqrt_residual)) # writing to the parquet # prediction_val_pand_predict_tospark = spark.createDataFrame(prediction_val_pand_predict, FloatType()) # prediction_val_pand_predict_tospark = 
prediction_val_pand_predict_tospark.withColumnRenamed("value", # "prediction") # # sqrt_residual_tospark= spark.createDataFrame(sqrt_residual, FloatType()) # sqrt_residual_tospark = sqrt_residual_tospark.withColumnRenamed("value", # "sqrt_residual") # # pred_spark = prediction_val_pand_predict_tospark.withColumn('row_index', f.monotonically_increasing_id()) # res_spark = sqrt_residual_tospark.withColumn('row_index', f.monotonically_increasing_id()) # # final_scale_fitted = pred_spark.join(res_spark,on=['row_index']) \ # .sort('row_index').drop('row_index') # # final_scale_fitted.show() # # final_scale_fitted.write.parquet( # 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/SCALE_LOCATION_PLOT.parquet', # mode='overwrite') # # dumping the dictionary into json object # json_response = {'run_status': 'success', 'PredictiveResponse': resultdf} graph_response = { "Q_Q_plot": Q_label_pred, "residual_fitted": fitted_residual, "scale_location": scale_predict_residual } json_response = { 'table_data': table_response, 'graph_data': graph_response } return json_response except Exception as e: print('exception is =' + str(e))
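# Both regression paths above end by pairing theoretical normal quantiles with
# standardized residuals to build Q-Q plot data. A compact sketch of that step, using
# the conventional (i + 0.5) / n plotting positions (the code above uses (i - 0.5) / n
# together with abs(), which makes the first position negative):
import statistics
from scipy.stats import norm

def qq_points(residuals):
    # Pair theoretical normal quantiles with standardized, sorted residuals.
    sorted_res = sorted(residuals)
    mean = statistics.mean(sorted_res)
    stdev = statistics.stdev(sorted_res)
    n = len(sorted_res)
    theoretical = [norm.ppf((i + 0.5) / n) for i in range(n)]
    sample = [(x - mean) / stdev for x in sorted_res]
    return list(zip(theoretical, sample))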
def tocolumns(df, expr): import pyspark.sql.functions as fcns if isinstance(expr, histbook.expr.Const): return fcns.lit(expr.value) elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)): return df[expr.value] elif isinstance(expr, histbook.expr.Call): if expr.fcn == "abs" or expr.fcn == "fabs": return fcns.abs(tocolumns(df, expr.args[0])) elif expr.fcn == "max" or expr.fcn == "fmax": return fcns.greatest(*[tocolumns(df, x) for x in expr.args]) elif expr.fcn == "min" or expr.fcn == "fmin": return fcns.least(*[tocolumns(df, x) for x in expr.args]) elif expr.fcn == "arccos": return fcns.acos(tocolumns(df, expr.args[0])) elif expr.fcn == "arccosh": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "arcsin": return fcns.asin(tocolumns(df, expr.args[0])) elif expr.fcn == "arcsinh": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "arctan2": return fcns.atan2(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1])) elif expr.fcn == "arctan": return fcns.atan(tocolumns(df, expr.args[0])) elif expr.fcn == "arctanh": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "ceil": return fcns.ceil(tocolumns(df, expr.args[0])) elif expr.fcn == "copysign": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "cos": return fcns.cos(tocolumns(df, expr.args[0])) elif expr.fcn == "cosh": return fcns.cosh(tocolumns(df, expr.args[0])) elif expr.fcn == "rad2deg": return tocolumns(df, expr.args[0]) * (180.0 / math.pi) elif expr.fcn == "erfc": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "erf": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "exp": return fcns.exp(tocolumns(df, expr.args[0])) elif expr.fcn == "expm1": return fcns.expm1(tocolumns(df, expr.args[0])) elif expr.fcn == "factorial": return fcns.factorial(tocolumns(df, expr.args[0])) elif expr.fcn == "floor": return fcns.floor(tocolumns(df, expr.args[0])) elif expr.fcn == "fmod": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "gamma": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "hypot": return fcns.hypot(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1])) elif expr.fcn == "isinf": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "isnan": return fcns.isnan(tocolumns(df, expr.args[0])) elif expr.fcn == "lgamma": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "log10": return fcns.log10(tocolumns(df, expr.args[0])) elif expr.fcn == "log1p": return fcns.log1p(tocolumns(df, expr.args[0])) elif expr.fcn == "log": return fcns.log(tocolumns(df, expr.args[0])) elif expr.fcn == "pow": return fcns.pow(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1])) elif expr.fcn == "deg2rad": return tocolumns(df, expr.args[0]) * (math.pi / 180.0) elif expr.fcn == "sinh": return fcns.sinh(tocolumns(df, expr.args[0])) elif expr.fcn == "sin": return fcns.sin(tocolumns(df, expr.args[0])) elif expr.fcn == "sqrt": return fcns.sqrt(tocolumns(df, expr.args[0])) elif expr.fcn == "tanh": return fcns.tanh(tocolumns(df, expr.args[0])) elif expr.fcn == "tan": return fcns.tan(tocolumns(df, expr.args[0])) elif expr.fcn == "trunc": raise NotImplementedError( expr.fcn) # FIXME (fcns.trunc is for dates) elif expr.fcn == "xor": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "conjugate": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "exp2": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "heaviside": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "isfinite": raise NotImplementedError(expr.fcn) 
# FIXME elif expr.fcn == "left_shift" and isinstance(expr.args[1], histbook.expr.Const): return fcns.shiftLeft(tocolumns(df, expr.args[0]), expr.args[1].value) elif expr.fcn == "log2": return fcns.log2(tocolumns(df, expr.args[0])) elif expr.fcn == "logaddexp2": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "logaddexp": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "mod" or expr.fcn == "fmod": return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1]) elif expr.fcn == "right_shift" and isinstance(expr.args[1], histbook.expr.Const): return fcns.shiftRight(tocolumns(df, expr.args[0]), expr.args[1].value) elif expr.fcn == "rint": return fcns.rint(tocolumns(df, expr.args[0])) elif expr.fcn == "sign": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "where": return fcns.when(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1])).otherwise( tocolumns(df, expr.args[2])) elif expr.fcn == "numpy.equal": return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.not_equal": return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.less": return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.less_equal": return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.isin": return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.logical_not": return ~tocolumns(df, expr.args[0]) elif expr.fcn == "numpy.add": return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.subtract": return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.multiply": return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.true_divide": return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.logical_or": return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.logical_and": return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1]) else: raise NotImplementedError(expr.fcn) else: raise AssertionError(expr)
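# The unary branches of tocolumns map one-argument math functions onto their
# pyspark.sql.functions equivalents; the same dispatch can be table-driven. A minimal
# sketch of that idea, independent of histbook (the mapping covers only functions
# Spark supports directly). Note also that the "numpy.isin" branch above relies on
# Python's `in` operator, which Spark columns do not support; Column.isin(...) is the
# usual replacement.
import pyspark.sql.functions as fcns

_UNARY_FCNS = {
    "abs": fcns.abs, "arccos": fcns.acos, "arcsin": fcns.asin, "arctan": fcns.atan,
    "ceil": fcns.ceil, "cos": fcns.cos, "cosh": fcns.cosh, "exp": fcns.exp,
    "expm1": fcns.expm1, "floor": fcns.floor, "isnan": fcns.isnan, "log": fcns.log,
    "log10": fcns.log10, "log1p": fcns.log1p, "sin": fcns.sin, "sinh": fcns.sinh,
    "sqrt": fcns.sqrt, "tan": fcns.tan, "tanh": fcns.tanh,
}

def apply_unary(fcn_name, column):
    # Apply a supported unary function to a Spark column, mirroring the elif chain above.
    try:
        return _UNARY_FCNS[fcn_name](column)
    except KeyError:
        raise NotImplementedError(fcn_name)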
# count = sum of daily counts
feat + 'count' + dd: f.sum(f.col(feat + 'count_0d')).over(window),
# A few more complicated examples:
# mean = weighted mean of daily means
feat + 'mean' + dd: f.sum(f.col(feat + 'mean_0d') * f.col(feat + 'count_0d')).over(window)
                    / f.sum(f.col(feat + 'count_0d')).over(window),
# stddev = sqrt(weighted mean of daily variances)
feat + 'stddev' + dd: f.sqrt(
    f.sum(f.col(feat + 'count_0d') * f.col(feat + 'stddev_0d')**2).over(window)
    / f.sum(f.col(feat + 'count_0d')).over(window)),
}

# Loop through the dictionary of new columns and add them to the aggregated
# dataframe
for col_name, col_obj in new_cols.items():
    add = SparkWithColumn(name='add_' + col_name, read_key='df_agg', store_key='df_agg',
                          new_col_name=col_name, new_col=col_obj)
    lookback_chain.add(add)

# STEP 5: Save the results
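# The dictionary entries above re-aggregate daily statistics over a look-back window:
# counts are summed, means are count-weighted, and the rolling stddev is the square
# root of the count-weighted mean of daily variances. A self-contained sketch of the
# weighted-mean case (frame, column and key names here are hypothetical, not taken
# from the source):
from pyspark.sql import functions as f
from pyspark.sql.window import Window

# 30 days of daily aggregates per entity: the current row plus the 29 preceding rows.
window = Window.partitionBy("entity_id").orderBy("date").rowsBetween(-29, 0)

weighted_mean_30d = (
    f.sum(f.col("mean_0d") * f.col("count_0d")).over(window)
    / f.sum(f.col("count_0d")).over(window)
)
df_agg = df_agg.withColumn("mean_30d", weighted_mean_30d)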
def skewness_custom(column, mean, count):
    # Skewness built from Spark column expressions:
    #   sqrt(n) * sum((x - mean)^3) / (sum((x - mean)^2))^(3/2)
    # df_sum, df_pow and sqrt are assumed to be aliases of the corresponding
    # pyspark.sql.functions (sum, pow, sqrt); np.sqrt handles the scalar count.
    return ((np.sqrt(count) * df_sum(df_pow(column - mean, int(3)))) /
            df_pow(sqrt(df_sum(df_pow(column - mean, int(2)))), 3))
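# For a sanity check, Spark ships a skewness aggregate that should closely match the
# formula above (the DataFrame and column name here are illustrative):
from pyspark.sql import functions as F

df.select(F.skewness("value").alias("skew_builtin")).show()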
histData.groupBy("User", "device").agg(*expr).show()

# COMMAND ----------

# Question 2 - Using the dataset “activity-data”, create a stream that outputs in one
# table the total number of meters user g travels per activity in time intervals of
# resp. 15 minutes and 30 minutes. Order the table by the most distance travelled per
# activity. Hint: you can use the columns x, y, z to calculate the distance travelled

# COMMAND ----------

staticDF.show(4)

# COMMAND ----------

from pyspark.sql.functions import sqrt, pow

# Euclidean norm of the x, y and z components as the distance travelled.
totalDist = streamingDF.select("User", "x", "y", "z") \
    .withColumn("Distance",
                sqrt(pow(streamingDF['x'], 2) +
                     pow(streamingDF['y'], 2) +
                     pow(streamingDF['z'], 2)))

user_g_dist = totalDist \
    .cube("User").sum("Distance") \
    .where("User == 'g'") \
    .writeStream \
    .queryName("user_g_distance") \
    .format("memory") \
    .outputMode("complete") \
    .start()

# COMMAND ----------

from time import sleep

for x in range(5):
    spark.sql("select * from user_g_distance").show(3)
    sleep(1)  # give the stream time to update between reads
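# The stream above only totals the distance for user g; the question also asks for
# per-activity totals in 15-minute (resp. 30-minute) intervals, ordered by distance.
# A sketch under the usual activity-data schema ('gt' holds the activity label and
# 'Creation_Time' is a nanosecond epoch); swap "15 minutes" for "30 minutes" for the
# second table:
from pyspark.sql.functions import col, pow, sqrt, window, sum as _sum

dist_15min = streamingDF \
    .withColumn("Distance", sqrt(pow(col("x"), 2) + pow(col("y"), 2) + pow(col("z"), 2))) \
    .withColumn("event_time", (col("Creation_Time") / 1e9).cast("timestamp")) \
    .where("User = 'g'") \
    .groupBy("gt", window("event_time", "15 minutes")) \
    .agg(_sum("Distance").alias("total_distance")) \
    .orderBy(col("total_distance").desc())

# Sorting a streaming aggregation requires the complete output mode.
q = dist_15min.writeStream \
    .queryName("user_g_distance_15min") \
    .format("memory") \
    .outputMode("complete") \
    .start()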
spark = SparkSession.builder.appName("RLSRateSourceOLS").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

# OLS problem, states to be estimated are a, b and c
# z = a*x + b*y + c + w, where w ~ N(0, 1)
a = 0.5
b = 0.2
c = 1.2
noise_param = 1

label_expression = F.col("x") * a + F.col("y") * b + c + F.col("w")

input_df = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\
    .withColumn("mod", F.col("value") % num_states)\
    .withColumn("stateKey", F.col("mod").cast("String"))\
    .withColumn("x", (F.col("value") / num_states).cast("Integer").cast("Double"))\
    .withColumn("y", F.sqrt("x"))\
    .withColumn("bias", F.lit(1.0))\
    .withColumn("w", F.randn(0) * noise_param)\
    .withColumn("label", label_expression)

rls = RecursiveLeastSquaresFilter(3)\
    .setStateKeyCol("stateKey")\
    .setRegularizationMatrixFactor(10E6)\
    .setForgettingFactor(0.99)

assembler = VectorAssembler(inputCols=["x", "y", "bias"], outputCol="features")
measurements = assembler.transform(input_df)

query = rls.transform(measurements)\
    .writeStream\
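# A streaming query like the one being built above is typically completed with a sink;
# for illustration only (the console sink and append output mode are assumptions, not
# taken from the source):
query = rls.transform(measurements) \
    .writeStream \
    .format("console") \
    .outputMode("append") \
    .start()

query.awaitTermination()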