Example No. 1
def get_last_user_event_value(
    target_column: str, user_column: str = "user_id"
) -> Column:
    # Note: the window below has no orderBy, so F.last returns whichever row
    # Spark happens to place last within each user's partition.
    return F.last(F.col(target_column)).over(
        Window()
        .partitionBy(user_column)
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    )
def add_paths(df: DataFrame) -> DataFrame:
    # Collect each user's ordered state indices into a path, then keep only
    # the rows at that user's latest event time.
    df = index_state(df)
    user_window = Window().partitionBy(USER_ID)
    w = user_window.orderBy(F.col(EVENT_TIME).asc())
    df = df.withColumn(PATH, F.collect_list(STATE_INDEX).over(w))
    last_date_col = "last_date"
    df = df.withColumn(last_date_col, F.max(EVENT_TIME).over(user_window))
    df = df.where(df[EVENT_TIME] == df[last_date_col])
    return df
 def filter_events(self, input_data: DataFrame) -> DataFrame:
     self._log_info("filtering events")
     user_window = Window().partitionBy("user_id")
     article_not_null = F.col("article_id").isNotNull()
     wiki_not_null = F.col("wiki_id").isNotNull()
     more_than_one_visit = F.col("visits") > 1
     return (input_data
             .where(article_not_null & wiki_not_null)
             .withColumn("visits", func.count_user_events())
             .where(more_than_one_visit)
             .drop("visits"))
Example No. 4
def sql_window_api(spark):

    print("Start running Window and WindowSpec API")

    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    # orderBy, partitionBy, rowsBetween, rangeBetween
    df = spark.createDataFrame([("Alice", 2, 50), ("Alice", 3, 50),
                                ("Alice", 2, 60), ("Alice", 3, 60),
                                ("Alice", 2, 70), ("Bob", 3, 50),
                                ("Bob", 3, 60), ("Bob", 4, 50)],
                               ["name", "age", "height"])
    window = Window().partitionBy("name")
    df.withColumn("mean", mean("height").over(window)).show()
    window = Window().partitionBy("name").orderBy("height").rangeBetween(-4, 0)
    df.withColumn("mean", mean("height").over(window)).show()
    window = Window().partitionBy("name").orderBy("height").rowsBetween(
        Window.currentRow, 1)
    df.withColumn("mean", mean("height").over(window)).show()

    print("Finish running Window and WindowSpec API")
 def find_edge_events(self,
                      input_data: DataFrame,
                      user_column: str = "user_id") -> DataFrame:
     # Order by timestamp so that first()/last() below are deterministic.
     user_window = (Window().partitionBy(user_column)
                    .orderBy("timestamp")
                    .rowsBetween(Window.unboundedPreceding,
                                 Window.unboundedFollowing))
     return (input_data.orderBy("timestamp", ascending=True).select(
         "*",
         F.array(F.col("article_id"),
                 F.col("wiki_id")).alias("article_wiki_ids"),
     ).select(
         "user_id",
         F.first("article_wiki_ids").over(user_window).alias("first_event"),
         F.last("article_wiki_ids").over(user_window).alias("last_event"),
     ))
Example No. 6
import pyspark.sql.functions as f
from pyspark.sql import SparkSession, Window


def quarterly_asset_change(w):
    # Fractional change in TOTAL ASSETS versus the previous reporting period.
    prev_assets = f.lag('TOTAL ASSETS', 1).over(w)
    return (f.col('TOTAL ASSETS') - prev_assets) / prev_assets


if __name__ == '__main__':
    spark = SparkSession.builder \
                        .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.2.2') \
                        .getOrCreate()

    banks_df = spark.read.format('com.mongodb.spark.sql.DefaultSource').option('uri',
            'mongodb://127.0.0.1:27017/precision_lender.banks').load()

    banks_df = banks_df.select('IDRSSD', 'Financial Institution Name').distinct()

    balance_sheet_df = spark.read.format('com.mongodb.spark.sql.DefaultSource').option('uri',
            'mongodb://127.0.0.1:27017/precision_lender.balance_sheet').load()
    w = Window().partitionBy('IDRSSD').orderBy('Reporting Period')

    average_quarterly_asset_growth = balance_sheet_df \
        .withColumn('Quarterly Percent Change',
                    quarterly_asset_change(w)) \
        .groupBy('IDRSSD').agg(f.mean('Quarterly Percent Change').alias('Mean Quarterly Percent Change'))

    df = banks_df.join(average_quarterly_asset_growth, on=['IDRSSD']) \
                 .select('Financial Institution Name', 'Mean Quarterly Percent Change') \
                 .sort(f.col('Mean Quarterly Percent Change').desc())
    df.show(truncate=False)
Example No. 7
def count_user_events(user_column: str = "user_id") -> Column:
    user_window = Window().partitionBy(user_column)
    return F.count("*").over(user_window)
def process_log_data(spark, input_data, output_data):
    """ Reads log dataset from S3 and transfroms it into users, time & songplays tables,
        finally these tables are written back to S3.
    
    Args:
    spark : Spark Session object
    input_data (str): input S3 bucket path
    output_data (str): output S3 bucket path
    """
    # Get filepath to log data file
    log_data_path = os.path.join(input_data, 'log-data/*/*/*.json')
    print("{}: start processing {}".format(log_prefix, log_data_path))

    # Define the schema for log data files
    schema_log = StructType([
        StructField('artist', StringType(), True),
        StructField('auth', StringType(), True),
        StructField('firstName', StringType(), True),
        StructField('gender', StringType(), True),
        StructField('itemInSession', LongType(), True),
        StructField('lastName', StringType(), True),
        StructField('length', DoubleType(), True),
        StructField('level', StringType(), True),
        StructField('location', StringType(), True),
        StructField('method', StringType(), True),
        StructField('page', StringType(), True),
        StructField('registration', StringType(), True),
        StructField('sessionId', LongType(), True),
        StructField('song', StringType(), True),
        StructField('status', LongType(), True),
        StructField('ts', LongType(), True),
        StructField('userAgent', StringType(), True),
        StructField('userId', StringType(), True)
    ])

    # Read log data file
    df_log = spark.read.json(log_data_path, schema=schema_log)

    # Repartition
    df_log = df_log.repartition(num_partitions)

    # Persist logs dataframe for reuse
    df_log.persist(StorageLevel.MEMORY_AND_DISK)

    # Filter by actions for song plays
    df_log_nextSong = df_log.filter(
        df_log.userId.isNotNull()).filter(df_log.page == 'NextSong')

    # Extract columns for users table
    users_latest_state = df_log_nextSong.groupBy('userId').max('ts') \
                         .select("userId", col("max(ts)").alias("ts"))

    users_table = df_log_nextSong.join(users_latest_state, on = ['userId','ts']) \
                             .select(col('userId').alias('user_id'), \
                                     col('firstName').alias('first_name'), \
                                     col('lastName').alias('last_name'), \
                                     'gender', 'level')

    # Write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'))
    print("{}: users table is written into S3".format(log_prefix))

    # Create datetime column from original timestamp column
    convert_ms_to_s = udf(lambda x: x // 1000, LongType())
    df_timestamp = df_log_nextSong.select(
        col('ts').alias('start_time')).dropDuplicates()
    df_timestamp = df_timestamp.withColumn(
        "datetime", from_unixtime(convert_ms_to_s(df_timestamp.start_time)))

    # Extract columns to create time table
    time_table = df_timestamp.withColumn("hour", hour("datetime")) \
                         .withColumn("day", dayofmonth("datetime")) \
                         .withColumn("week", weekofyear("datetime")) \
                         .withColumn("month", month("datetime")) \
                         .withColumn("year", year("datetime")) \
                         .withColumn("weekday", dayofweek("datetime")) \
                         .drop('datetime')

    # Write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time'))
    print("{}: time table is written into S3".format(log_prefix))

    # Read in songs & artists tables to use for songplays table
    songs_table = spark.read.parquet(os.path.join(output_data, 'songs'))
    artists_table = spark.read.parquet(os.path.join(output_data, 'artists'))

    # Extract columns from joined song and log datasets to create songplays table
    songplays_table = df_log_nextSong.join(songs_table, df_log_nextSong.song == songs_table.title) \
                                 .join(artists_table, df_log_nextSong.artist == artists_table.name) \
                                 .join(time_table, df_log_nextSong.ts == time_table.start_time) \
                                 .select(df_log_nextSong.ts.alias('start_time'), \
                                         df_log_nextSong.userId.alias('user_id'), \
                                         df_log_nextSong.level, \
                                         songs_table.song_id, \
                                         artists_table.artist_id, \
                                         df_log_nextSong.sessionId.alias('session_id'), \
                                         df_log_nextSong.location, \
                                         df_log_nextSong.userAgent.alias('user_agent'), \
                                         time_table.year, \
                                         time_table.month ) \
                                  .withColumn('songplay_id', row_number().over(Window().orderBy('song_id')))

    # Write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays'))

    # Unpersist song dataframe
    df_log.unpersist()
    print("{}: logs dataset processing is finished".format(log_prefix))
def process_log_data(spark, input_data, output_data, song_df):
    """
    Description: This function processes log-data files from the given input
    path, transforms the data from JSON files into users, time and songplays
    Spark tables, and writes these tables to the given output path as parquet tables.

    Arguments:
        spark: SparkSession object.
        input_data: Path to the input JSON files.
        output_data: Path to the output directory that stores output parquet tables.
        song_df: Song data dataframe.

    Returns:
        None.
    """
    # get filepath to log data file
    log_data = input_data + 'log-data/2018/11'

    # define schema for log data file
    log_schema = t.StructType([
        t.StructField("artist", t.StringType(), True),
        t.StructField("auth", t.StringType(), True),
        t.StructField("firstName", t.StringType(), True),
        t.StructField("gender", t.StringType(), True),
        t.StructField("itemInSession", t.IntegerType(), True),
        t.StructField("lastName", t.StringType(), True),
        t.StructField("length", t.DecimalType(12, 7), True),
        t.StructField("level", t.StringType(), True),
        t.StructField("location", t.StringType(), True),
        t.StructField("method", t.StringType(), True),
        t.StructField("page", t.StringType(), True),
        t.StructField("registration", t.DecimalType(16, 2), True),
        t.StructField("sessionId", t.IntegerType(), True),
        t.StructField("song", t.StringType(), True),
        t.StructField("status", t.IntegerType(), True),
        t.StructField("ts", t.LongType(), True),
        t.StructField("userAgent", t.StringType(), True),
        t.StructField("userId", t.StringType(), True)
    ])

    # read log data file using schema
    df = spark \
        .read \
        .format("json") \
        .schema(log_schema) \
        .load(log_data)

    # filter by actions for song plays
    df = df \
        .filter('page = "NextSong"')

    # group by userId for unique users
    users_list = df \
        .groupBy('userId') \
        .agg(f.max('ts').alias('ts'))

    # extract columns to create users table
    users_table = df \
        .join(users_list, ['userId', 'ts'], 'inner') \
        .select([df.userId.cast(t.IntegerType()).alias('user_id'), col('firstName').alias('first_name'), col('lastName').alias('last_name'), 'gender', 'level']) \
        .dropDuplicates()

    # write users table to parquet files
    users_output = output_data + 'users'

    users_table \
        .write \
        .option("path", users_output) \
        .saveAsTable('users', format='parquet')

    # create timestamp column from original timestamp column
    df = df \
        .withColumn('timestamp', f.from_utc_timestamp((df.ts/1000.0).cast('timestamp'), 'UTC'))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0),
                       t.TimestampType())
    df = df.withColumn('datetime', get_datetime('ts'))

    # extract columns to create time table
    time_table = df \
        .select([col('datetime').alias('start_time'), dayofmonth(col('datetime')).alias('day'), weekofyear(col('datetime')).alias('week'), month(col('datetime')).alias('month'), year(col('datetime')).alias('year'), dayofweek(col('datetime')).alias('weekday')]) \
        .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_output = output_data + 'time'

    time_table \
        .write \
        .partitionBy('year', 'month') \
        .option("path", time_output) \
        .saveAsTable('time', format='parquet')

    # join and extract columns from song and log datasets to create songplays table
    cond = [
        df.artist == song_df.artist_name, df.song == song_df.title,
        df.length == song_df.duration
    ]
    songplays_df = df.join(song_df, cond, 'left')

    songplays_df = songplays_df \
        .select(df.datetime.alias('start_time'), df.userId.alias('user_id'), df.level.alias('level'), song_df.song_id.alias('song_id'), song_df.artist_id.alias('artist_id'), df.sessionId.alias('session_id'), df.location.alias('location'), df.userAgent.alias('user_agent'), year(df.datetime).alias('year'), month(df.datetime).alias('month'))
    w = Window().orderBy(f.lit('A'))
    songplays_table = songplays_df.withColumn('songplay_id',
                                              f.row_number().over(w))

    # write songplays table to parquet files partitioned by year and month
    songplays_output = output_data + 'songplays'

    songplays_table \
        .select(['songplay_id', 'start_time', 'user_id', 'level', 'song_id', 'artist_id', 'session_id', 'location', 'user_agent', 'year', 'month'])\
        .write \
        .partitionBy('year', 'month') \
        .option("path", songplays_output) \
        .saveAsTable('songplays', format='parquet')
Example No. 10
# Split the title field into two separate fields – smartphone_brand and smartphone_model.
split_col = F.split(df['title'], ' ')
df1 = df.withColumn('smartphone_brand', split_col.getItem(0))
df1 = df1.withColumn('smartphone_brand1', split_col.getItem(F.size(split_col) - 1))
df1 = df1.withColumn('smartphone_brand2', split_col.getItem(F.size(split_col) - 2))
df1 = df1.drop('seller', 'ram', 'memory')
df1 = df1.withColumn(
    'smartphone_model',
    F.expr('substring(title, length(smartphone_brand)+2, '
           'length(title)-length(smartphone_brand)-length(smartphone_brand1)-length(smartphone_brand2)-3)')
).drop('smartphone_brand2', 'smartphone_brand1')
# I could not find another way to do this.


# Find the average price of each smartphone model.
df1 = df1.withColumn("price", df1["price"].cast(IntegerType()))
df1 = df1.groupby('smartphone_brand','smartphone_model').agg(F.avg('price').alias('smartphone_price'))

# Assign each unique record in the dataframe its own sequential smartphone_id, starting from 1.
df1= df1.withColumn("new_column",F.lit("ABC"))
w = Window().partitionBy('new_column').orderBy(F.lit('A'))
df1 = df1.withColumn("smartphone_id", F.row_number().over(w)).drop("new_column")
df1 = df1.select('smartphone_id', 'smartphone_brand', 'smartphone_model', 'smartphone_price')
df1.show(10,False)




# In[11]:


df1.write.mode('overwrite').parquet("phones.parquet")


# In[33]:
# ## Data Enrichment & Additional Transformations (Continued...)

# In[8]:

# Filter - Keep where Playtype in ['Run','Pass']
nfldata2 = nfldata2.filter((nfldata2.PlayType == "Run")
                           | (nfldata2.PlayType == "Pass"))

# Derive Date var(s)
nfldata2 = nfldata2.withColumn(
    "month_day",
    concat(nfldata2["Date"].substr(6, 2),
           nfldata2["Date"].substr(9, 2)).cast("int"))

# Lag (Get previous PlayType)
w = Window().partitionBy('GameID', 'Drive').orderBy('GameID', 'Drive',
                                                    col('TimeSecs').desc())
nfldata2 = (nfldata2
            .withColumn("PlayType_lag", lag("PlayType").over(w))
            .withColumn("PlayType_lag",
                        when(isnull('PlayType_lag'), 'FirstPlay')
                        .otherwise(col('PlayType_lag')))
            .orderBy('GameID', 'Drive', col('TimeSecs').desc()))

# Print Results
#nfldata2.select(["GameID","Drive","qtr","down","TimeSecs","PlayType","PlayType_lag","yrdline100","posteam","month_day"]).show(50,False)

# Split into "Run" and "Pass" (I want to build two models)
nfldata2_run = nfldata2.filter(col('PlayType') == 'Run')
nfldata2_pass = nfldata2.filter(col('PlayType') == 'Pass')
Example No. 12
                                    .reduceByKey(lambda a, b: a + b)

    # convert the RDD into a DataFrame temporarily used to further convert into a Pandas DataFrame
    columns = ["Word", "Frequency"]
    df = word_and_frequency_pairs.toDF(columns)

    # convert the dataFrame into a Pandas dataframe for easy sorting
    pandasDataframe = df.toPandas()
    pandasDataframeSorted = pandasDataframe.sort_values(
        by=['Frequency', 'Word'], ascending=[False, True])

    # convert back to a PySpark DataFrame
    pySparkDataFrame = spark.createDataFrame(pandasDataframeSorted)

    # Add Rank (row index) column
    w = Window().orderBy(F.col("Frequency").desc(), F.col("Word").asc())
    pySparkDataFrame = pySparkDataFrame.withColumn("Rank",
                                                   F.row_number().over(w))

    categoriseAndPrintEntities(distinct_words, pySparkDataFrame, 'words')

    print('----------------------\n')

    # extract letters from each word - the words are in the RDD created before
    letters = words.flatMap(lambda word: [character for character in word]) \
                   .filter(lambda letter: letter != '-')

    distinct_letters = letters.distinct()

    # print totals
    printTotalsInformation(letters, distinct_letters, 'letters')
Example No. 13
def analyze(spark: SparkSession,
            input_file='emissions.parquet',
            output_file='rasters.parquet',
            time_granularity=600,
            num_cols=100,
            cell_size=None,
            use_type=False):
    prt_high("""
            Running compute emissions.
            ##################################
            Parameters
             - Input file: {}
             - Output file: {}
             - Time granularity: {}
             - Cell size: {}
             - Number of raster columns: {}
             - Rasters by type: {}
            ##################################

            """.format(input_file, output_file, time_granularity, cell_size,
                       num_cols, use_type))

    cell_size_meters = None
    # Process parameters
    if cell_size is not None:
        prt_high("Info: Cell size set in parameters, using it")
        if cell_size[-1] == 'm':  # Meters
            cell_size_meters = int(cell_size[:-1])
            cell_size = meters_to_deg(cell_size_meters)
            # All except the last char
        else:
            cell_size = int(cell_size)
    else:
        num_cols = int(num_cols)
    time_granularity = int(time_granularity)

    # TODO: FILL NUM_VARS automatically
    prt_warn("WARNING: Number of variables is manually set to 10")
    num_vars = 10

    df = spark.read.parquet(input_file)

    min_max_lat_lon = df.agg(F.min(df.latitude), F.max(df.latitude),
                             F.min(df.longitude), F.max(df.longitude))
    min_max_time = df.agg(F.min(df.time), F.max(df.time))

    min_max_row = min_max_lat_lon.first()
    min_lat = min_max_row[0]
    max_lat = min_max_row[1]
    min_lon = min_max_row[2]
    max_lon = min_max_row[3]

    min_max_time = min_max_time.first()
    min_time = min_max_time[0]
    max_time = min_max_time[1]

    if cell_size is None:
        # Calculate cell dimension and number of rows using the number of
        # columns defined
        cell_size = (max_lon - min_lon) / num_cols
    else:
        num_cols = int(np.ceil((max_lon - min_lon) / cell_size))

    num_rows = int(np.ceil((max_lat - min_lat) / cell_size))
    prt_info("NUM COLS: " + str(num_cols))
    prt_info("NUM ROWS: " + str(num_rows))
    prt_info("CELL DIM: " + str(cell_size))

    # Create metadata file
    prt_info("Building metadata")
    meta = [(num_cols, num_rows, num_vars, min_lat, max_lat, min_lon, max_lon,
             cell_size, cell_size_meters, min_time, max_time, time_granularity)
            ]
    rdd = spark.sparkContext.parallelize(meta)
    metarow = rdd.map(lambda x: Row(num_cols=int(x[0]),
                                    num_rows=int(x[1]),
                                    num_vars=int(x[2]),
                                    min_lat=float(x[3]),
                                    max_lat=float(x[4]),
                                    min_lon=float(x[5]),
                                    max_lon=float(x[6]),
                                    cell_size=float(x[7]),
                                    cell_size_meters=float(x[8]),
                                    min_time=int(x[9]),
                                    max_time=int(x[10]),
                                    time_granularity=int(x[11])))
    metadf = spark.createDataFrame(metarow)

    # Create rasters
    prt_info("Building rasters")
    # TODO: Implement a way to define cell size
    to_cell = partial(lat_lon_to_cell, min_lon, min_lat, num_cols, num_rows,
                      cell_size)
    convertToCellFunc = udf(to_cell, IntegerType())

    to_hours = partial(to_time_resolution, time_granularity)
    convertToHours = udf(to_hours, IntegerType())

    df_cell = df.withColumn('cell',
                            convertToCellFunc(df['longitude'], df['latitude']))
    df_cell = df_cell.withColumn('hour', convertToHours(df['time']))

    # use desc order and first to get the last value
    # https://stackoverflow.com/questions/43114445/how-to-use-first-and-last-function-in-pyspark
    w = Window().partitionBy('imo', 'hour').orderBy(df_cell.time.desc())

    # lower than any possible sin/cos value so max only gets valid values

    df_cell = df_cell.withColumn('last_time', first("time").over(w))
    # PLACEHOLDER = -9999.0
    # df_cell = df_cell.withColumn('last_amp_v',
    #                             when(df_cell['last_time'] == df_cell['time'],
    #                                  df_cell['amp_v']).otherwise(PLACEHOLDER))
    # df_cell = df_cell.withColumn('last_cos',
    #                             when(df_cell['last_time'] == df_cell['time'],
    #                                  df_cell['cos_v']).otherwise(PLACEHOLDER))
    # df_cell = df_cell.withColumn('last_sin',
    #                             when(df_cell['last_time'] == df_cell['time'],
    #                                  df_cell['sin_v']).otherwise(PLACEHOLDER))

    # potential bug: max(last_amp_v), max('last_cos'), max('last_sin'). Each
    # may be from different ships if they happen to be in the same raster cell
    if use_type:
        raster = df_cell.groupBy("cell", "hour", "type")
    else:
        raster = df_cell.groupBy("cell", "hour")

    raster = raster.agg(
        F.sum("sox_me").alias("sox_me"),
        F.sum("sox_ae").alias("sox_ae"),
        F.sum('co2_me').alias('co2_me'),
        F.sum("co2_ae").alias("co2_ae"),
        F.sum("nox_me").alias("nox_me"),
        F.sum('nox_ae').alias('nox_ae'),
        # F.max('last_amp_v').alias('last_amp_v'),
        # F.max('last_cos').alias('last_cos'),
        # F.max('last_sin').alias('last_sin'),
        F.count('*').alias('sample_count'))
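    # A possible alternative (sketch only, assuming the amp_v/cos_v/sin_v
    # columns from the commented-out block above): take the max of a struct
    # keyed by time, so the amp/cos/sin values all come from the single latest
    # row per cell rather than from independent max() calls.
    # last_vals = df_cell.groupBy("cell", "hour").agg(
    #     F.max(F.struct("time", "amp_v", "cos_v", "sin_v")).alias("last"))
    # last_vals = last_vals.select(
    #     "cell", "hour",
    #     F.col("last.amp_v").alias("last_amp_v"),
    #     F.col("last.cos_v").alias("last_cos"),
    #     F.col("last.sin_v").alias("last_sin"))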

    # Lower limit for these attributes
    # raster = raster.withColumn(
    #    'last_amp_v', when(raster['last_amp_v'] <= (PLACEHOLDER + 1), 0)
    #    .otherwise(raster['last_amp_v']))
    # raster = raster.withColumn(
    #    'last_cos', when(raster['last_cos'] <= (PLACEHOLDER + 1), 0)
    #    .otherwise(raster['last_cos']))
    # raster = raster.withColumn(
    #    'last_sin', when(raster['last_sin'] <= (PLACEHOLDER + 1), 0)
    #    .otherwise(raster['last_sin']))

    raster = change_column_type(raster,
                                'sample_count',
                                IntegerType(),
                                force=True)
    raster = change_column_type(raster, 'sox_me', FloatType(), force=True)
    raster = change_column_type(raster, 'sox_ae', FloatType(), force=True)
    raster = change_column_type(raster, 'co2_me', FloatType(), force=True)
    raster = change_column_type(raster, 'co2_ae', FloatType(), force=True)
    raster = change_column_type(raster, 'nox_me', FloatType(), force=True)
    raster = change_column_type(raster, 'nox_ae', FloatType(), force=True)
    # raster = change_column_type(raster, 'last_amp_v', FloatType(),
    #                             force=True)
    # raster = change_column_type(raster, 'last_cos', FloatType(), force=True)
    # raster = change_column_type(raster, 'last_sin', FloatType(), force=True)

    raster = ensure_columns_type(raster)

    # Write rasters
    raster.write.mode('overwrite').parquet(output_file)

    # Write metadata
    metadf.write.mode('overwrite').parquet(output_file + '.meta')
    return
Example No. 14
## Extracting weekday from nav_date column
data = data.withColumn('weekday', f.date_format('nav_date', 'E'))

## Convert weekday from string to numerical, e.g. Monday as 0, and so on
data = data.withColumn('weekday', weekday(data.weekday))

## Using window function to calculate cumulative sum
windowval = Window.partitionBy('nav_date').orderBy('Bin').rangeBetween(
    Window.unboundedPreceding, 0)
data = data.withColumn('cumsum', f.sum('count').over(windowval))
df_bin_sorted = data.withColumn('cumsum', data.cumsum.cast(IntegerType()))

###

w = Window().partitionBy('nav_date')
df_bin_sorted = df_bin_sorted.select(
    "*",
    f.collect_list("cumsum").over(w).alias("group_cumsum"))


def cumsumpct_calculate(num, maximum):
    return num / float(np.max(maximum))


udf_cum_pct = f.udf(cumsumpct_calculate, FloatType())

df_bin_sorted = df_bin_sorted.withColumn(
    'cumsumpct',
    f.format_number(
        udf_cum_pct(df_bin_sorted.cumsum, df_bin_sorted.group_cumsum), 4))
Example No. 15
'''
Created on 27-Jun-2020

Window Functions or Windowed Aggregates:
Window (also windowing or windowed) functions perform a calculation over a set of rows.
    They are an important tool for doing statistics, and most databases support them.
    Spark has supported Window functions since version 1.4. Spark Window functions have the following traits:
        a. perform a calculation over a group of rows, called the Frame.
        b. each row has a frame corresponding to it.
        c. return a new value for each row via an aggregate/window function.
        d. can be used through SQL grammar or the DataFrame API.

@author: kasho
'''

from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.sql.functions import *

conf = BaseConfUtils()
sparkContxt = conf.createSparkContext("Windows Fun")
sqlContxt = conf.createSQLContext(sparkContxt)


@udf("long")
def median_udf(s):
    # Return the middle element; assumes s is already sorted (for an
    # even-length list this is the upper of the two middle elements).
    index = int(len(s) / 2)
    return s[index]
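
A minimal usage sketch for median_udf, reusing the sqlContxt created above; the sample data and the depName/salary column names are illustrative assumptions, not part of the original snippet.

# Sample data (illustrative only).
sample_df = sqlContxt.createDataFrame(
    [("sales", 3000), ("sales", 4600), ("sales", 4100),
     ("personnel", 3500), ("personnel", 3900)],
    ["depName", "salary"])

# Collect each department's salaries into a sorted array over a full-partition
# window, then let median_udf pick the middle element.
full_window = (Window.partitionBy("depName")
               .rowsBetween(Window.unboundedPreceding,
                            Window.unboundedFollowing))

sample_df.withColumn(
    "median_salary",
    median_udf(sort_array(collect_list("salary").over(full_window)))
).show()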
Example No. 16
    "RespondsToMailOffers", "OptOutMailings", "NonUSTravel", "OwnsComputer",
    "HasCreditCard", "NewCellphoneUser", "MadeCallToRetentionTeam",
    "CreditRating", "PrizmCode", "Occupation", "MaritalStatus"
}

# Cast given columns to type double
churn = casting(churn, string_columns)

# Create boolean label column from the Churn column
churn = churn.withColumn("label", col("Churn") == "Yes")

# Cast output to type int
churn = churn.withColumn("label", col("label").cast('int'))

# Generate a window over the whole dataset, ordered by a constant literal
w = Window().orderBy(lit('A'))

# Create column to produce row numbers
churn = churn.withColumn("row_num", row_number().over(w))

########################
#---Machine Learning---#
########################

# Create machine learning pipeline
piped = ml_pipeline(churn)

# Split dataset into training and testing sets
training, testing = piped.randomSplit([.75, .25])

# Create evaluator