Example No. 1
    def test_window_functions_cumulative_sum(self):
        df = self.spark.createDataFrame([("one", 1), ("two", 2)],
                                        ["key", "value"])
        from pyspark.sql import functions as F
        from pyspark.sql.window import Window

        # Test cumulative sum
        sel = df.select(
            df.key,
            F.sum(df.value).over(
                Window.rowsBetween(Window.unboundedPreceding, 0)))
        rs = sorted(sel.collect())
        expected = [("one", 1), ("two", 3)]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])

        # Test boundary values less than JVM's Long.MinValue and make sure we don't overflow
        sel = df.select(
            df.key,
            F.sum(df.value).over(
                Window.rowsBetween(Window.unboundedPreceding - 1, 0)))
        rs = sorted(sel.collect())
        expected = [("one", 1), ("two", 3)]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])

        # Test boundary values greater than JVM's Long.MaxValue and make sure we don't overflow
        frame_end = Window.unboundedFollowing + 1
        sel = df.select(
            df.key,
            F.sum(df.value).over(
                Window.rowsBetween(Window.currentRow, frame_end)))
        rs = sorted(sel.collect())
        expected = [("one", 3), ("two", 2)]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])
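A minimal standalone sketch of the same cumulative-sum frame outside the test harness; the SparkSession and the explicit ordering below are additions for illustration (the test's window is unordered):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"])

# Running total: all rows from the start of the frame up to the current row.
w = Window.orderBy("key").rowsBetween(Window.unboundedPreceding, Window.currentRow)
df.select("key", F.sum("value").over(w).alias("running_total")).show()
# "one" -> 1, "two" -> 3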
Example No. 2
    def PercentOut(self, groupCol):
        Num_clusters = self.dataframe.select(groupCol).distinct().count()
        window = Window.rowsBetween(Window.unboundedPreceding,
                                    Window.unboundedFollowing)
        df = self.dataframe.groupBy("Features_Joined").pivot(groupCol).count()
        total = df.groupBy().sum().collect()[0]
        dataout = []
        out = {}
        for i in range(0, Num_clusters):
            df1 = df.where(col(str(i)).isNotNull()).select(
                "Features_Joined", str(i)).withColumn(
                    'total',
                    sum(col(str(i))).over(window)).withColumn(
                        'Percent',
                        col(str(i)) * 100 / col('total')).drop(col('total'))

            ClustData = {}
            ClustData['name'] = 'Cluster_' + str(i)
            ClustData['children'] = []
            out[i] = {}
            #df1 = df.where(col(str(i)).isNotNull()).select("Features_Joined",str(i)).withColumn('Percent',col(str(i))*100/total[i])
            for colname in ["Features_Joined", str(i), 'Percent']:
                out[i][colname] = df1.rdd.map(lambda x: x[colname]).collect()

            ClustData['size'] = len(out[i]['Features_Joined'])

            for childnum in range(0, ClustData['size']):
                child = {}
                child["display"] = True
                child["name"] = out[i]['Features_Joined'][childnum]
                child['subname'] = 'Number of clicks: ' + str(
                    out[i][str(i)][childnum])
                child['key'] = 'Percent: ' + str(
                    round(out[i]['Percent'][childnum], 2))
                child["size"] = 1
                ClustData['children'].append(child)
            dataout.append(ClustData)
        return dataout
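The core pattern in PercentOut, percent of a column's total via an unbounded window, as a small self-contained sketch (the data and the clicks column name here are made up):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 2), ("b", 6)], ["Features_Joined", "clicks"])

w = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
(df.withColumn("total", F.sum("clicks").over(w))
   .withColumn("Percent", F.col("clicks") * 100 / F.col("total"))
   .drop("total")
   .show())  # "a" -> 25.0, "b" -> 75.0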
Example No. 3
    def _reverse_enumeration(column: Column, window: Window) -> Column:
        """Helper function to reverse enumeration of given column.

        Parameters
        ----------
        column: pyspark.sql.column.Column
            Column containing enumerated values.
        window: pyspark.sql.Window
            Window spec for which the reversing should take place.

        Returns
        -------
        reverse: pyspark.sql.column.Column

        """

        window_max = window.rowsBetween(Window.unboundedPreceding,
                                        Window.unboundedFollowing)

        diff_score = F.max(column).over(window_max) + 1
        reverse = (column - diff_score) * -1

        return reverse
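A usage sketch for _reverse_enumeration, called here as a plain function for illustration; the DataFrame, the row_number enumeration and the partition/order columns are assumptions:

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 10), ("a", 20), ("b", 5)], ["grp", "score"])

w = Window.partitionBy("grp").orderBy("score")
df = df.withColumn("idx", F.row_number().over(w))
# In group "a" (max idx = 2): idx 1 -> 2 and idx 2 -> 1.
df = df.withColumn("idx_reversed", _reverse_enumeration(F.col("idx"), w))
df.show()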
Example No. 4
def temporal_differencing(df, diff_factor):
    lagWindow = Window.rowsBetween(diff_factor, 0)
    for col in df.columns:
        if col != "timestamp":
            df = df.withColumn(col, df[col] - F.first(df[col]).over(lagWindow))
    return df
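A usage sketch for temporal_differencing; the imports below are also what the function itself relies on. The window inside the function is unordered, so this assumes the rows already arrive in timestamp order:

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
ts = spark.createDataFrame([(1, 10.0), (2, 12.0), (3, 15.0)],
                           ["timestamp", "value"])

# diff_factor=-1 subtracts the previous row's value; the first row becomes 0.0.
temporal_differencing(ts, -1).show()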
Example No. 5
    def compute(cls,
                base: DataFrame,
                parameters: Dict[str, Any] = None) -> Column:
        return F.count(StudentPerformance.STUDENT_ID).over(
            Window.rowsBetween(Window.unboundedPreceding,
                               Window.unboundedFollowing))
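The same pattern as a standalone sketch: counting every row of the unbounded frame attaches the total row count to each row (the DataFrame and column name are illustrative):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (2,), (3,)], ["student_id"])

w = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
df.withColumn("n_rows", F.count("student_id").over(w)).show()  # n_rows == 3 on every row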
Example No. 6
def get_eda_plots(df, only_categorical=False, only_numerical=False,
                  hspace=0.5, wspace=0.5, numerical_figsize=(15, 15),
                  categorical_figsize=(15, 25), bins=25):
    """
    Takes a PySpark DataFrame and draws subplots for its numerical and
    categorical columns.
    For numerical columns it plots a histogram of the values.
    For categorical columns it plots the percentage of each category.
    """

    if only_categorical != True:
        numerical_labels = [
            item[0] for item in df.dtypes if not item[1].startswith('string')
        ]
        # print (numerical_labels)

        if (len(numerical_labels) % 2) == 0:
            numerical_labels2 = numerical_labels

        else:
            numerical_labels2 = numerical_labels
            numerical_labels2.append(numerical_labels[-1])

            print(
                "Odd number of numerical columns; the last subplot will be repeated"
            )

        fig = plt.figure(figsize=numerical_figsize)
        fig.subplots_adjust(hspace=hspace, wspace=wspace)
        print("Plotting numerical columns...")
        for i, column in enumerate(tqdm(numerical_labels2), start=1):

            ax = fig.add_subplot(round((len(numerical_labels2) / 2) + 0.5), 2,
                                 i)
            hist(ax, x=df.select(column), bins=bins)
            ax.set_title(column)
            ax.legend()

    if only_numerical != True:
        categorical_labels = [
            item[0] for item in df.dtypes if item[1].startswith('string')
        ]
        # print (categorical_labels)

        if (len(categorical_labels) % 2) == 0:
            categorical_labels2 = categorical_labels
        else:
            categorical_labels2 = categorical_labels
            categorical_labels2.append(categorical_labels[-1])

            print(
                "Odd number of categorical columns; the last subplot will be repeated"
            )

        fig = plt.figure(figsize=categorical_figsize)
        fig.subplots_adjust(hspace=hspace, wspace=wspace)
        # plt.xticks(rotation=45)

        print("Plotting categorical columns...")
        for i, column in enumerate(tqdm(categorical_labels2), start=1):

            window = Window.rowsBetween(Window.unboundedPreceding,
                                        Window.unboundedFollowing)
            tab = df.select([column]).\
              groupBy(column).\
              agg(F.count(column).alias('num')).\
              withColumn('total', F.sum(F.col('num')).over(window)).\
              withColumn('percent', F.col('num') * 100 / F.col('total')).\
              drop(F.col('total'))

            rows = tab.collect()
            categories = [row[column] for row in rows]
            category_percentage = [row.percent for row in rows]

            ax = fig.add_subplot(round((len(categorical_labels2) / 2) + 0.5),
                                 2, i)
            ax.bar(categories, category_percentage, label="percentage")
            plt.xticks(rotation=45)
            ax.set_title(column)
            ax.legend()
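A usage sketch; this assumes get_eda_plots is used with matplotlib, tqdm and a hist function that accepts a Spark column selection (for example pyspark_dist_explore's hist), none of which are imported in the snippet itself, and that sdf is an existing Spark DataFrame:

get_eda_plots(sdf, hspace=0.6, wspace=0.4, bins=30)

# Or plot only one kind of column:
get_eda_plots(sdf, only_categorical=True)   # categorical bar charts only
get_eda_plots(sdf, only_numerical=True)     # numerical histograms only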
Example No. 7
"""df_news = imputer.fit(df_news).transform(df_news)
df_news = df_news.dropna()"""

#df_news  = df_news.select([col for col in df_news.columns if ("_cleaned" in col) or (col in time_cols)])

df_news = rescaledData.select("news_features")

#NEWS SENTIMENT DATA (FROM KAGGLE AND VADER)
lags = 10

df_news_sent = sqlContext.sql("SELECT Date, Subjectivity, Objectivity, Positive, Neutral, Negative FROM news_sentiment")

#GENERATE LAGS (News might take time to take effect)
for lag_num in range(1, lags + 1):
    for sentiment in ["Subjectivity", "Objectivity", "Positive", "Neutral", "Negative"]:
        df_news_sent = df_news_sent.withColumn(
            "{}_Lag_{}".format(sentiment, lag_num),
            avg(col(sentiment)).over(Window.rowsBetween(-lag_num, -lag_num)))

drop_list = ["Subjectivity","Objectivity","Positive","Neutral","Negative"]
df_news_sent  = df_news_sent.select([column for column in df_news_sent.columns if column not in drop_list])

"""imputer = Imputer(
    inputCols=[column for column in df_news_sent.columns if column not in ["Date"]], 
    outputCols=["{}_clean".format(c) for c in [column for column in df_news_sent.columns if column not in ["Date"]]],
    strategy = "median"
)
df_news_sent = imputer.fit(df_news_sent).transform(df_news_sent)
"""
Example No. 8
ma_order = 1

#Metrics
total_mape = []
total_rmse = []
total_smape = []

#read in data
sqlContext = SQLContext(sc)
df = sqlContext.sql(
    "SELECT * FROM aapl WHERE YEAR(aapl.date) >= 2009 and YEAR(aapl.date) <= 2015"
)

#forecast using ma
df = df.withColumn("close_ma",
                   avg(col("close")).over(Window.rowsBetween(-ma_order, -1)))

for window in [1, 5, 10, 20, 80]:
    df = df.withColumn(
        "Close_Actual_Window_{}".format(window),
        lead("close", window - 1, None).over(Window.orderBy("Date")))

for window in [1, 5, 10, 20, 80]:
    df = df.withColumn(
        "squared_error_Window_{}".format(window),
        pow((col("Close_Actual_Window_{}".format(window)) - col("close_ma")),
            2))
    df = df.withColumn(
        "s_abs_percentage_error_Window_{}".format(window),
        (abs(col("close_ma") - col("Close_Actual_Window_{}".format(window))) /
         ((col("Close_Actual_Window_{}".format(window)) + col("close_ma")) / 2)))

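The total_rmse / total_smape lists declared above are never filled in the snippet as extracted; a sketch of how the per-row error columns could be aggregated for one horizon (this aggregation is an assumption, not part of the original):

from pyspark.sql.functions import avg, col, sqrt

horizon = 5
row = df.select(
    sqrt(avg(col("squared_error_Window_{}".format(horizon)))).alias("rmse"),
    (avg(col("s_abs_percentage_error_Window_{}".format(horizon))) * 100).alias("smape")
).collect()[0]
total_rmse.append(row["rmse"])
total_smape.append(row["smape"])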
Example No. 9
from IPython import display
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd
from pyspark import *
import pyspark.sql.functions as f
from pyspark.sql import functions as f
from pyspark.sql.functions import rank, sum, col 



from pyspark.sql import Window 
window = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

usr = spark.read.json("s3://my-little-pony/yelp/yelp_academic_dataset_user.json")
view = spark.read.load('s3://my-little-pony/yelp/reviews.json', format='json')
business = spark.read.json("s3://my-little-pony/yelp/yelp_academic_dataset_business.json")



usr.createOrReplaceTempView("usr")
query = """
SELECT 
name,
user_id as ID,
explode(split(elite, ',')),
elite
FROM usr"""
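The query is only defined here; a sketch of actually running it (the result name is illustrative):

elite_years = spark.sql(query)
elite_years.show(5, truncate=False)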
Example No. 10
    def _transform(self, dataset):

        #LOG VOLUME
        for tick in self.tickers:
            dataset = dataset.withColumn(
                "Volume_{}_float".format(tick),
                log(col("Volume_{}".format(tick)).cast(
                    DoubleType())))  #log because large value
            dataset = dataset.drop("Volume_{}".format(tick))
            dataset = dataset.withColumnRenamed("Volume_{}_float".format(tick),
                                                "Volume_{}".format(tick))

        #GENERATE TIME COLUMNS
        dataset = dataset.withColumn("Day_Of_Week", dayofweek("Date"))
        dataset = dataset.withColumn("Month", month("Date"))
        dataset = dataset.withColumn("Quarter", quarter(col("Date")))
        dataset = dataset.withColumn("Week_Of_Year", weekofyear(col("Date")))
        #dataset = dataset.withColumn("Day_Of_Year",dayofyear(col("Date")))
        dataset = dataset.withColumn("Day_Of_Month", dayofmonth(col("Date")))
        #dataset = dataset.withColumn("Year",year(col("Date")))

        time_cols = [
            "Date", "Day_Of_Week", "Month", "Quarter", "Week_Of_Year",
            "Day_Of_Month"
        ]

        #GENERATE LAG COLUMNS
        for lag_num in range(1, self.lags + 1):
            for feature in self.inputCols:
                dataset = dataset.withColumn(
                    "{}_Lag_{}".format(feature, lag_num),
                    lag(feature, lag_num, None).over(Window.orderBy("Date")))

        #SIMPLE MOVING AVERAGES, MOVING VARIANCE AND Z SCORE
        for ma_length in self.ma_windows:
            for feature in self.inputCols:
                dataset = dataset.withColumn(
                    "{}_ma_{}".format(feature, ma_length),
                    avg(col(feature)).over(Window.rowsBetween(-ma_length, -1)))
                dataset = dataset.withColumn(
                    "{}_var_{}".format(feature, ma_length),
                    variance(col(feature)).over(
                        Window.rowsBetween(-ma_length, -1)))
                # Z-like score: (lag-1 value - moving average) scaled by the
                # moving variance (note: variance, not standard deviation).
                dataset = dataset.withColumn(
                    "{}_Z_{}".format(feature, ma_length),
                    (col("{}_Lag_1".format(feature)) -
                     col("{}_ma_{}".format(feature, ma_length))) /
                    (col("{}_var_{}".format(feature, ma_length))))

        #OPEN/CLOSE RATIO
        for lag_num in range(1, self.lags + 1):
            for tick in self.tickers:
                dataset = dataset.withColumn(
                    "Open_Close_{}_Ratio_Lag_{}".format(tick, lag_num),
                    col("Open_{}_Lag_{}".format(tick, lag_num)) /
                    col("Close_{}_Lag_{}".format(tick, lag_num)))

        #DIFFERENCING, PERCENT CHANGE, SIGN AND ROLLING SUM
        for lag_num in range(1, int((self.lags) / 3)):
            for feature in self.inputCols:
                dataset = dataset.withColumn(
                    "{}_Diff_{}".format(feature, lag_num),
                    when(
                        isnull(
                            col("{}_Lag_{}".format(feature, lag_num)) -
                            col("{}_Lag_{}".format(feature, lag_num + 1))),
                        0).otherwise(
                            col("{}_Lag_{}".format(feature, lag_num)) -
                            col("{}_Lag_{}".format(feature, lag_num + 1))))
                dataset = dataset.withColumn(
                    "{}_Diff_Percent_{}".format(feature, lag_num),
                    when(
                        isnull(
                            (col("{}_Lag_{}".format(feature, lag_num)) -
                             col("{}_Lag_{}".format(feature, lag_num + 1))) /
                            col("{}_Lag_{}".format(feature, lag_num + 1))),
                        0).otherwise(
                            (col("{}_Lag_{}".format(feature, lag_num)) -
                             col("{}_Lag_{}".format(feature, lag_num + 1))) /
                            col("{}_Lag_{}".format(feature, lag_num + 1))))
                dataset = dataset.withColumn(
                    "{}_Diff_{}_Sign".format(feature, lag_num),
                    when(col("{}_Diff_{}".format(feature, lag_num)) > 0,
                         1.0).otherwise(-1.0))
                dataset = dataset.withColumn(
                    "{}_Rolling_Sign_{}".format(feature, lag_num),
                    sum(col("{}_Diff_{}_Sign".format(feature, lag_num))).over(
                        Window.rowsBetween(-(lag_num + 1), -1)))

        #IMPUTE VALUES
        """imputer = Imputer(inputCols=[column for column in dataset.columns if column not in time_cols], 
                           outputCols=["{}_imputed".format(c) for c in [column for column in dataset.columns if column not in time_cols]],
                           strategy = "median")
        dataset = imputer.fit(dataset).transform(dataset)
        dataset = dataset.dropna()"""

        #Drop Columns
        drop_list_fil = [col for col in self.inputCols if col != "Close_aapl"]
        drop_list = [col for col in drop_list_fil]

        dataset = dataset.select(
            [col for col in dataset.columns if col not in drop_list])

        #PREDICTION WINDOW
        dataset = dataset.withColumn(
            "Close_aapl_Window",
            lead("Close_aapl", self.pred_window - 1,
                 None).over(Window.orderBy("Date")))
        dataset = dataset.drop("Close_aapl")
        dataset = dataset.withColumnRenamed("Close_aapl_Window", "Close_aapl")
        dataset = dataset.withColumnRenamed("Close_aapl", "label")

        #DROP NULLS CREATED BY THE LAG/WINDOW COLUMNS
        dataset = dataset.dropna()

        #REMOVE NON IMPUTED COLUMNS
        #dataset  = dataset.select([col for col in dataset.columns if ("_imputed" in col) or (col in time_cols)])

        #self.outputCols = dataset.columns

        return dataset