def test_window_functions_cumulative_sum(self):
    """Check sums over row frames, including frame bounds beyond the JVM long range.

    Three frames are exercised on a two-row DataFrame: a running total up to
    the current row, the same frame with a start bound one below
    ``Window.unboundedPreceding``, and a frame from the current row to one
    above ``Window.unboundedFollowing``.
    """
    from pyspark.sql import functions as F

    df = self.spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"])

    def assert_windowed_sum(frame_start, frame_end, expected):
        # Sum `value` over the requested row frame and compare the sorted
        # result rows against the expected tuples.
        frame = Window.rowsBetween(frame_start, frame_end)
        selected = df.select(df.key, F.sum(df.value).over(frame))
        for row, wanted in zip(sorted(selected.collect()), expected):
            self.assertEqual(tuple(row), wanted[:len(row)])

    # Test cumulative sum
    assert_windowed_sum(Window.unboundedPreceding, 0, [("one", 1), ("two", 3)])

    # Test boundary values less than JVM's Long.MinValue and make sure we don't overflow
    assert_windowed_sum(Window.unboundedPreceding - 1, 0, [("one", 1), ("two", 3)])

    # Test boundary values greater than JVM's Long.MaxValue and make sure we don't overflow
    assert_windowed_sum(
        Window.currentRow, Window.unboundedFollowing + 1, [("one", 3), ("two", 2)])
def PercentOut(self, groupCol):
    """Build one summary dict per cluster with the share of each feature value.

    ``self.dataframe`` is grouped by "Features_Joined" and pivoted on
    ``groupCol``; for every cluster index the non-null counts are turned into
    a percentage of that cluster's total.

    Parameters
    ----------
    groupCol : str
        Name of the column holding the cluster id. The pivoted columns are
        looked up as "0", "1", ..., so the ids are expected to be the
        integers 0..N-1 where N is the number of distinct values.

    Returns
    -------
    list of dict
        One dict per cluster with keys 'name' ("Cluster_<i>"), 'children'
        (one entry per feature value) and 'size' (number of children).
    """
    num_clusters = self.dataframe.select(groupCol).distinct().count()
    # Frame covering every row, so the windowed sum is the column total.
    whole_frame = Window.rowsBetween(Window.unboundedPreceding,
                                     Window.unboundedFollowing)
    pivoted = self.dataframe.groupBy("Features_Joined").pivot(groupCol).count()

    dataout = []
    for i in range(num_clusters):
        cluster_col = str(i)
        # NOTE: `sum` is used as a Spark aggregate here (it is applied to a
        # Column and followed by .over), not as the Python builtin.
        cluster_df = (
            pivoted.where(col(cluster_col).isNotNull())
            .select("Features_Joined", cluster_col)
            .withColumn('total', sum(col(cluster_col)).over(whole_frame))
            .withColumn('Percent', col(cluster_col) * 100 / col('total'))
            .drop(col('total'))
        )

        # Collect once: separate collects per column re-run the job and are
        # not guaranteed to return rows in the same order, which would pair
        # names with the wrong counts/percentages.
        rows = cluster_df.collect()

        children = [
            {
                "display": True,
                "name": row["Features_Joined"],
                'subname': 'Number of clicks: ' + str(row[cluster_col]),
                'key': 'Percent: ' + str(round(row['Percent'], 2)),
                "size": 1,
            }
            for row in rows
        ]
        dataout.append({
            'name': 'Cluster_' + cluster_col,
            'children': children,
            'size': len(rows),
        })
    return dataout
def _reverse_enumeration(column: Column, window: Window) -> Column:
    """Reverse the enumeration held in a column within a window.

    Each value ``v`` is mapped to ``(max + 1) - v``, where ``max`` is the
    maximum of the column over the whole window.

    Parameters
    ----------
    column: pyspark.sql.column.Column
        Column containing enumerated values.
    window: pyspark.sql.Window
        Window spec for which the reversing should take place.

    Returns
    -------
    reverse: pyspark.sql.column.Column
        Column expression holding the reversed enumeration.
    """
    # Widen the frame to the entire window so the maximum is the same for every row.
    full_frame = window.rowsBetween(Window.unboundedPreceding,
                                    Window.unboundedFollowing)
    offset = F.max(column).over(full_frame) + 1
    return (column - offset) * -1
def temporal_differencing(df, diff_factor):
    """Subtract from every non-timestamp column the first value of its row frame.

    The frame is ``Window.rowsBetween(diff_factor, 0)``; each column other
    than "timestamp" is overwritten in place with the difference.

    Parameters
    ----------
    df : DataFrame
        Input frame; the "timestamp" column, if present, is left untouched.
    diff_factor : int
        Start offset of the row frame relative to the current row.

    Returns
    -------
    DataFrame
        The frame with the differenced columns.
    """
    frame = Window.rowsBetween(diff_factor, 0)
    value_columns = [name for name in df.columns if name != "timestamp"]
    for name in value_columns:
        first_in_frame = F.first(df[name]).over(frame)
        df = df.withColumn(name, df[name] - first_in_frame)
    return df
def compute(cls, base: DataFrame, parameters: Dict[str, Any] = None) -> Column:
    """Return a column counting ``StudentPerformance.STUDENT_ID`` over all rows.

    The count is evaluated over an unbounded row frame, so every row receives
    the same total. ``base`` and ``parameters`` are not used by this
    implementation.
    """
    student_count = F.count(StudentPerformance.STUDENT_ID)
    entire_frame = Window.rowsBetween(Window.unboundedPreceding,
                                      Window.unboundedFollowing)
    return student_count.over(entire_frame)
def get_eda_plots(df, only_categorical=False, only_numerical=False,
                  hspace=0.5, wspace=0.5, numerical_figsize=(15, 15),
                  categorical_figsize=(15, 25), bins=25):
    """
    Draw exploratory subplots for every column of a pyspark dataframe.

    Columns whose dtype does not start with 'string' are treated as numerical
    and get a histogram each; string columns are treated as categorical and
    get a bar chart of the percentage of rows per category. Subplots are laid
    out two per row; when a group has an odd number of columns the last
    column is plotted twice to fill the grid.

    Parameters
    ----------
    df : pyspark dataframe to plot.
    only_categorical : if truthy, skip the numerical plots.
    only_numerical : if truthy, skip the categorical plots.
    hspace, wspace : spacing passed to ``fig.subplots_adjust``.
    numerical_figsize, categorical_figsize : figure sizes for the two figures.
    bins : number of histogram bins for numerical columns.
    """
    if not only_categorical:
        numerical_labels = [
            name for name, dtype in df.dtypes if not dtype.startswith('string')
        ]
        if len(numerical_labels) % 2:
            # Pad to an even count so the two-column grid is full.
            numerical_labels.append(numerical_labels[-1])
            print(
                "Numerical columns has Odd number of features\n hence last subplot will be repeated"
            )
        # The label count is even here, so this is exact; round(n / 2 + 0.5)
        # would add an empty row for n = 6, 10, ... (banker's rounding).
        n_rows = len(numerical_labels) // 2

        fig = plt.figure(figsize=numerical_figsize)
        fig.subplots_adjust(hspace=hspace, wspace=wspace)
        print("Plotting numerical columns...")
        for i, column in enumerate(tqdm(numerical_labels), start=1):
            ax = fig.add_subplot(n_rows, 2, i)
            hist(ax, x=df.select(column), bins=bins)
            ax.set_title(column)
            ax.legend()

    if not only_numerical:
        categorical_labels = [
            name for name, dtype in df.dtypes if dtype.startswith('string')
        ]
        if len(categorical_labels) % 2:
            categorical_labels.append(categorical_labels[-1])
            print(
                "Categorical labels has Odd number of features\n hence last subplot will be repeated"
            )
        n_rows = len(categorical_labels) // 2

        fig = plt.figure(figsize=categorical_figsize)
        fig.subplots_adjust(hspace=hspace, wspace=wspace)
        print("Plotting categorical columns...")

        # Frame covering every row, so the windowed sum is the grand total.
        # It does not depend on the column, so build it once.
        whole_frame = Window.rowsBetween(Window.unboundedPreceding,
                                         Window.unboundedFollowing)
        for i, column in enumerate(tqdm(categorical_labels), start=1):
            tab = (
                df.select([column])
                .groupBy(column)
                .agg(F.count(column).alias('num'))
                .withColumn('total', F.sum(F.col('num')).over(whole_frame))
                .withColumn('percent', F.col('num') * 100 / F.col('total'))
                .drop(F.col('total'))
            )
            # Collect once: two collects run the job twice and may return
            # rows in different orders, mispairing categories and percentages.
            rows = tab.collect()
            categories = [row[column] for row in rows]
            category_percentage = [row.percent for row in rows]

            ax = fig.add_subplot(n_rows, 2, i)
            ax.bar(categories, category_percentage, label="percentage")
            plt.xticks(rotation=45)
            ax.set_title(column)
            ax.legend()
df_news = imputer.fit(df_news).transform(df_news) df_news = df_news.dropna()""" #df_news = df_news.select([col for col in df_news.columns if ("_cleaned" in col) or (col in time_cols)]) df_news = rescaledData.select("news_features") #NEWS SENTIMENT DATA (FROM KAGGLE AND VADER) lags = 10 df_news_sent = sqlContext.sql("SELECT Date, Subjectivity, Objectivity, Positive, Neutral, Negative FROM news_sentiment") #GENERATE LAGS (News might take time to take effect) for lag_num in range(1,lags+1): df_news_sent = df_news_sent.withColumn("Subjectivity_Lag_{}".format(lag_num),avg(col("Subjectivity")).over(Window.rowsBetween(-lag_num,-lag_num))) df_news_sent = df_news_sent.withColumn("Objectivity_Lag_{}".format(lag_num),avg(col("Objectivity")).over(Window.rowsBetween(-lag_num,-lag_num))) df_news_sent = df_news_sent.withColumn("Positive_Lag_{}".format(lag_num),avg(col("Positive")).over(Window.rowsBetween(-lag_num,-lag_num))) df_news_sent = df_news_sent.withColumn("Neutral_Lag_{}".format(lag_num),avg(col("Neutral")).over(Window.rowsBetween(-lag_num,-lag_num))) df_news_sent = df_news_sent.withColumn("Negative_Lag_{}".format(lag_num),avg(col("Negative")).over(Window.rowsBetween(-lag_num,-lag_num))) drop_list = ["Subjectivity","Objectivity","Positive","Neutral","Negative"] df_news_sent = df_news_sent.select([column for column in df_news_sent.columns if column not in drop_list]) """imputer = Imputer( inputCols=[column for column in df_news_sent.columns if column not in ["Date"]], outputCols=["{}_clean".format(c) for c in [column for column in df_news_sent.columns if column not in ["Date"]]], strategy = "median" ) df_news_sent = imputer.fit(df_news_sent).transform(df_news_sent) """
ma_order = 1 #Metrics total_mape = [] total_rmse = [] total_smape = [] #read in data sqlContext = SQLContext(sc) df = sqlContext.sql( "SELECT * FROM aapl WHERE YEAR(aapl.date) >= 2009 and YEAR(aapl.date) <= 2015" ) #forecast using ma df = df.withColumn("close_ma", avg(col("close")).over(Window.rowsBetween(-ma_order, -1))) for window in [1, 5, 10, 20, 80]: df = df.withColumn( "Close_Actual_Window_{}".format(window), lead("close", window - 1, None).over(Window.orderBy("Date"))) for window in [1, 5, 10, 20, 80]: df = df.withColumn( "squared_error_Window_{}".format(window), pow((col("Close_Actual_Window_{}".format(window)) - col("close_ma")), 2)) df = df.withColumn( "s_abs_percentage_error_Window_{}".format(window), (abs(col("close_ma") - col("Close_Actual_Window_{}".format(window))) / ((col("Close_Actual_Window_{}".format(window)) + col("close_ma")) /
# Plotting / notebook helpers
from IPython import display
import matplotlib.pyplot as plt
import seaborn as sns

# Numeric stack
import numpy as np
import pandas as pd

# Spark
from pyspark import *
import pyspark.sql.functions as f
from pyspark.sql import functions as f
from pyspark.sql import Window
from pyspark.sql.functions import rank, sum, col

# Row frame spanning every row of the window.
window = Window.rowsBetween(
    Window.unboundedPreceding,
    Window.unboundedFollowing,
)

# Load the Yelp user, review and business datasets from S3 (all JSON).
usr = spark.read.json(
    "s3://my-little-pony/yelp/yelp_academic_dataset_user.json")
view = spark.read.load(
    's3://my-little-pony/yelp/reviews.json',
    format='json',
)
business = spark.read.json(
    "s3://my-little-pony/yelp/yelp_academic_dataset_business.json")

# Expose the users as a SQL view so they can be queried below.
usr.createOrReplaceTempView("usr")

# One output row per comma-separated entry of `elite`, alongside the raw value.
query = """ SELECT name, user_id as ID, explode(split(elite, ',')), elite FROM usr"""
def _transform(self, dataset):
    """Engineer time-series features and the prediction label.

    Steps, in order:
      1. Replace each "Volume_<tick>" column with the log of its value.
      2. Add calendar columns derived from "Date".
      3. Add lagged copies of every input column.
      4. Add trailing mean, variance and z-score columns for every window
         length in ``self.ma_windows``.
      5. Add open/close ratios per ticker and lag.
      6. Add lag-to-lag differences, percent changes, their sign and a
         rolling sum of the signs.
      7. Drop the raw input columns (except "Close_aapl"), replace
         "Close_aapl" by its value ``self.pred_window - 1`` rows ahead,
         rename it to "label" and drop every row containing a null.

    Reads ``self.tickers``, ``self.lags``, ``self.inputCols``,
    ``self.ma_windows`` and ``self.pred_window``.

    Parameters
    ----------
    dataset : DataFrame
        Input frame with a "Date" column, the columns named in
        ``self.inputCols`` and a "Volume_<tick>" column per ticker.

    Returns
    -------
    DataFrame
        The engineered frame with a "label" column and no null rows.
    """
    # All lag/lead/rolling features must see the rows in chronological
    # order; a row frame without an ordering is evaluated in arbitrary order.
    date_order = Window.orderBy("Date")

    # LOG VOLUME (log because the raw values are large)
    for tick in self.tickers:
        volume = "Volume_{}".format(tick)
        volume_tmp = "Volume_{}_float".format(tick)
        dataset = dataset.withColumn(
            volume_tmp, log(col(volume).cast(DoubleType())))
        dataset = dataset.drop(volume)
        dataset = dataset.withColumnRenamed(volume_tmp, volume)

    # GENERATE TIME COLUMNS
    dataset = dataset.withColumn("Day_Of_Week", dayofweek("Date"))
    dataset = dataset.withColumn("Month", month("Date"))
    dataset = dataset.withColumn("Quarter", quarter(col("Date")))
    dataset = dataset.withColumn("Week_Of_Year", weekofyear(col("Date")))
    dataset = dataset.withColumn("Day_Of_Month", dayofmonth(col("Date")))

    # GENERATE LAG COLUMNS
    for lag_num in range(1, self.lags + 1):
        for feature in self.inputCols:
            dataset = dataset.withColumn(
                "{}_Lag_{}".format(feature, lag_num),
                lag(feature, lag_num, None).over(date_order))

    # SIMPLE MOVING AVERAGES, MOVING VARIANCE AND Z SCORE
    for ma_length in self.ma_windows:
        # The previous `ma_length` rows, excluding the current one.
        trailing = date_order.rowsBetween(-ma_length, -1)
        for feature in self.inputCols:
            ma_name = "{}_ma_{}".format(feature, ma_length)
            var_name = "{}_var_{}".format(feature, ma_length)
            dataset = dataset.withColumn(
                ma_name, avg(col(feature)).over(trailing))
            dataset = dataset.withColumn(
                var_name, variance(col(feature)).over(trailing))
            # A z-score divides by the standard deviation, i.e. the square
            # root of the variance, not by the variance itself.
            dataset = dataset.withColumn(
                "{}_Z_{}".format(feature, ma_length),
                (col("{}_Lag_1".format(feature)) - col(ma_name))
                / (col(var_name) ** 0.5))

    # OPEN/CLOSE RATIO
    for lag_num in range(1, self.lags + 1):
        for tick in self.tickers:
            dataset = dataset.withColumn(
                "Open_Close_{}_Ratio_Lag_{}".format(tick, lag_num),
                col("Open_{}_Lag_{}".format(tick, lag_num))
                / col("Close_{}_Lag_{}".format(tick, lag_num)))

    # DIFFERENCING, PERCENT CHANGE, SIGN AND ROLLING SUM
    for lag_num in range(1, int(self.lags / 3)):
        rolling = date_order.rowsBetween(-(lag_num + 1), -1)
        for feature in self.inputCols:
            newer = col("{}_Lag_{}".format(feature, lag_num))
            older = col("{}_Lag_{}".format(feature, lag_num + 1))
            diff = newer - older
            diff_percent = diff / older
            diff_name = "{}_Diff_{}".format(feature, lag_num)
            sign_name = "{}_Diff_{}_Sign".format(feature, lag_num)

            # Nulls (missing lags, division by zero) are replaced by 0.
            dataset = dataset.withColumn(
                diff_name, when(isnull(diff), 0).otherwise(diff))
            dataset = dataset.withColumn(
                "{}_Diff_Percent_{}".format(feature, lag_num),
                when(isnull(diff_percent), 0).otherwise(diff_percent))
            dataset = dataset.withColumn(
                sign_name,
                when(col(diff_name) > 0, 1.0).otherwise(-1.0))
            # NOTE: `sum` is used as a Spark aggregate here (applied to a
            # Column and followed by .over), not as the Python builtin.
            dataset = dataset.withColumn(
                "{}_Rolling_Sign_{}".format(feature, lag_num),
                sum(col(sign_name)).over(rolling))

    # DROP RAW INPUT COLUMNS (the close price is still needed for the label)
    dropped = {name for name in self.inputCols if name != "Close_aapl"}
    dataset = dataset.select(
        [name for name in dataset.columns if name not in dropped])

    # PREDICTION WINDOW: the label is the close price
    # `self.pred_window - 1` rows ahead of the current row.
    dataset = dataset.withColumn(
        "Close_aapl_Window",
        lead("Close_aapl", self.pred_window - 1, None).over(date_order))
    dataset = dataset.drop("Close_aapl")
    dataset = dataset.withColumnRenamed("Close_aapl_Window", "label")

    # DROP NULLS CREATED BY THE LAG / LEAD / ROLLING FEATURES
    return dataset.dropna()