Exemplo n.º 1
    def dateTransform(self, columns, currentFormat, outputFormat):
        """
        :param  columns         Name of the date column(s) to be transformed.
        :param  currentFormat   currentFormat is the current string date format of the columns specified. Of course,
                                all columns specified must have the same format. Otherwise the function is going
                                to return tons of null values because the transformations in the columns with
                                different formats will fail.
        :param  outputFormat    output date string format to be expected.
        """
        # Check if currentFormat argument is a string datatype:
        self.__assertTypeStr(currentFormat, "currentFormat")
        # Check if outputFormat argument is a string datatype:
        self.__assertTypeStr(outputFormat, "outputFormat")
        # Check if columns argument is a string or list datatype:
        self.__assertTypeStrOrList(columns, "columns")

        if isinstance(columns, str): columns = [columns]

        # Check if columns to be processed are in the dataframe
        self.__assertColsInDF(columnsProvided=columns, columnsDF=self.__df.columns)

        exprs = [date_format(unix_timestamp(c, currentFormat).cast("timestamp"), outputFormat).alias(
            c) if c in columns else c for c in self.__df.columns]

        self.__df = self.__df.select(*exprs)

        self.__addTransformation()  # checkpoint in case

        return self
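A minimal standalone sketch of the same parse-then-format pattern outside the class, assuming a SparkSession named spark; the DataFrame, column name and formats are illustrative:

from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format, unix_timestamp

spark = SparkSession.builder.appName("date-transform-sketch").getOrCreate()
df = spark.createDataFrame([("25/12/2020",), ("01/01/2021",)], ["signup"])

# parse with the current format, cast to timestamp, then render in the output format
df = df.withColumn(
    "signup",
    date_format(unix_timestamp("signup", "dd/MM/yyyy").cast("timestamp"), "yyyy-MM-dd"))
df.show()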
Exemplo n.º 2
    def ageCalculate(self, column, dateFormat, nameColAge):
        """
        This method computes the age of clients based on their birth dates.
        :param  column      Name of the column that holds the birth dates.
        :param  dateFormat  String date format of the column provided.
        :param  nameColAge  Name of the new column; the new column holds the resulting ages.

        """
        # Check if column argument is a string datatype:
        self.__assertTypeStr(column, "column")

        # Check if dateFormat argument is a string datatype:
        self.__assertTypeStr(dateFormat, "dateFormat")

        # Asserting that column is in dataFrame:
        assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame"

        # Output format date
        Format = "yyyy-MM-dd"  # Some SimpleDateFormat string

        exprs = format_number(
            mag(
                months_between(date_format(
                    unix_timestamp(column, dateFormat).cast("timestamp"), Format), current_date()) / 12), 4).alias(
            nameColAge)

        self.__df = self.__df.withColumn(nameColAge, exprs)

        self.__addTransformation()  # checkpoint in case

        return self
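A hedged sketch of the same age computation on a plain DataFrame; `mag` above appears to be an absolute-value helper, so this sketch substitutes pyspark's built-in abs, and the DataFrame df with its "born" column in "dd/MM/yyyy" format is an illustrative assumption:

from pyspark.sql.functions import (abs as sql_abs, current_date, date_format,
                                   format_number, months_between, unix_timestamp)

# age in years = |months between the birth date and today| / 12, rendered with 4 decimals
ages = df.withColumn(
    "age",
    format_number(
        sql_abs(months_between(
            date_format(unix_timestamp("born", "dd/MM/yyyy").cast("timestamp"), "yyyy-MM-dd"),
            current_date())) / 12, 4))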
Exemplo n.º 3
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*/*/*.json')

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(col('page') == 'NextSong')

    # extract columns for users table, columns user_id, first_name, last_name, gender, level
    users_table = df.select(
        ['userId', 'firstName', 'lastName', 'gender',
         'level']).drop_duplicates(subset=['userId']).dropna(subset=['userId'])

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'user'), 'overwrite')
    print('users table count:', users_table.count())

    # create timestamp column from the original epoch-millisecond column;
    # return an int so the value matches the declared IntegerType
    get_timestamp = udf(lambda x: int(x / 1000), IntegerType())
    df = df.withColumn('start_time', get_timestamp('ts'))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000),
                       TimestampType())
    df = df.withColumn('datetime', get_datetime('ts'))

    # extract columns to create time table, columns = start_time, hour, day, week, month, year, weekday
    time_table = df.select(
        col('datetime').alias('start_time'),
        hour('datetime').alias('hour'),
        dayofmonth('datetime').alias('day'),
        weekofyear('datetime').alias('week'),
        month('datetime').alias('month'),
        year('datetime').alias('year'),
        date_format('datetime', 'EEEE').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy(['year', 'month']).parquet(
        os.path.join(output_data, 'time'), 'overwrite')
    print('time table row counts:', time_table.count())

    # read in song and artist data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, 'song'))
    artist_df = spark.read.parquet(os.path.join(output_data, 'artist'))

    # create temp view before join and extracted
    song_df.createOrReplaceTempView('song')
    df.createOrReplaceTempView('log')
    artist_df.createOrReplaceTempView('artist')

    # extract columns from joined song and log datasets to create songplays table
    # songplays table columns: songplay_id, start_time, user_id, level, song_id,
    # artist_id, session_id, location, user_agent
    songplays_table = spark.sql("""
                                SELECT l.datetime as start_time, l.userId as user_id, l.level, s.song_id, a.artist_id,
                                        l.sessionId as session_id, l.location, l.userAgent as user_agent, 
                                        year(l.datetime) as year, month(l.datetime) as month
                                FROM log as l 
                                LEFT JOIN song as s ON (l.song = s.title)
                                LEFT JOIN artist as a ON (l.artist = a.name) AND (s.artist_id = a.artist_id)
                                """)

    # add a primary key column
    from pyspark.sql.functions import monotonically_increasing_id
    songplays_table = songplays_table.withColumn('songplays_id',
                                                 monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy(['year', 'month']).parquet(
        os.path.join(output_data, 'songplay'), 'overwrite')
    print('song play row counts:', songplays_table.count())
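The two Python UDFs above can usually be replaced with built-in casts, which keeps the work inside the JVM; a minimal sketch, assuming ts holds epoch milliseconds as in the log data:

from pyspark.sql.functions import col

# epoch milliseconds -> seconds -> native timestamp, no Python UDF round trip
df = df.withColumn('datetime', (col('ts') / 1000).cast('timestamp'))
df = df.withColumn('start_time', (col('ts') / 1000).cast('long'))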
Exemplo n.º 4
    StructField("locality", StringType(), True),
    StructField("town_city", StringType(), True),
    StructField("district", StringType(), True),
    StructField("county", StringType(), True),
    StructField("ppd_category_type", StringType(), True),
    StructField("record_status", StringType(), True)
])

log("Loading file...")

pricePaid = df.option("header", "false").csv("/Users/manamohanpanda/Downloads/pp-complete.csv", enforceSchema=True, schema=schema)

# filter greater london data
filterGreaterLondon = pricePaid.filter(pricePaid["county"] == "GREATER LONDON")
# filter district and county and their sale date as well as transform date in MM/yyyy format
formatteddf = filterGreaterLondon.select("district","county",date_format(unix_timestamp("date_of_transfer", "yyyy-MM-dd").cast("timestamp"),"MM/yyyy").alias("dot"))

log("---1--")
# group data by date of transfer and district, count each district sale
sorteddf = formatteddf.groupBy("dot","district").count()

log("--2---")

# current_year = 2018
months = ["01","02","03","04","05","06","07","08","09","10","11","12"]

# iterate data for all years from 1995 to 2018, then sort and write top 5 data to month files
# using coalesce(1) only for test purposes; it doesn't have to be used, and avoiding it will make this much faster
for y in range(1995,2019):

    for x in months:
Exemplo n.º 5
    def do_post_preprocessing(self, preprocessed):
        return preprocessed.withColumn(
            "hour", func.hour("datetime")).withColumn(
                "week_day_number",
                func.date_format("datetime", "u").cast(IntegerType()))
Exemplo n.º 6
def main():
    # Get command line arguments
    BUCKET_NAME = sys.argv[1]
    DATASET_NAME = sys.argv[2]

    # Create a SparkSession under the name "setup"
    spark = SparkSession.builder.appName("setup").getOrCreate()

    spark.conf.set("temporaryGcsBucket", BUCKET_NAME)

    create_bigquery_dataset(DATASET_NAME)

    # Whether we are running the job as a test
    test = False

    # Check whether or not the job is running as a test
    if "--test" in sys.argv:
        test = True
        print("A subset of the whole dataset will be uploaded to BigQuery")
    else:
        print("Results will be uploaded to BigQuery")

    # Ingest External Datasets
    for table_name, data in EXTERNAL_TABLES.items():
        df = spark.createDataFrame(pd.read_csv(data["url"]),
                                   schema=data["schema"])

        write_to_bigquery(df, table_name, DATASET_NAME)

    # Check if table exists
    try:
        df = spark.read.format("bigquery").option("table", TABLE).load()
        # if we are running a test, perform computations on a subset of the data
        if test:
            df = df.sample(False, 0.00001)
    except Py4JJavaError:
        print(f"{TABLE} does not exist. ")
        return

    # Declare dictionary with keys column names and values user defined
    # functions and return types
    udf_map = {
        "tripduration": (trip_duration, StringType()),
        "start_station_name": (station_name, StringType()),
        "start_station_latitude": (convert_angle, StringType()),
        "start_station_longitude": (convert_angle, StringType()),
        "end_station_name": (station_name, StringType()),
        "end_station_latitude": (convert_angle, StringType()),
        "end_station_longitude": (convert_angle, StringType()),
        "usertype": (user_type, StringType()),
        "gender": (gender, StringType()),
    }

    # Declare which columns to set some values to null randomly
    null_columns = [
        "tripduration",
        "starttime",
        "stoptime",
        "start_station_latitude",
        "start_station_longitude",
        "end_station_latitude",
        "end_station_longitude",
    ]

    # Dirty the columns
    for name, udf in udf_map.items():
        df = df.withColumn(name, UserDefinedFunction(*udf)(name))

    # Format the datetimes correctly
    for name in ["starttime", "stoptime"]:
        df = df.withColumn(name, date_format(name, "yyyy-MM-dd'T'HH:mm:ss"))

    # Randomly set about 5% of the values in some columns to null
    for name in null_columns:
        df = df.withColumn(
            name,
            when(expr("rand() < 0.05"), None).otherwise(df[name]))

    # Duplicate about 0.01% of the rows
    dup_df = df.sample(True, 0.0001)

    # Create final dirty dataframe
    df = df.union(dup_df)

    print("Uploading citibike dataset...")
    write_to_bigquery(df, CITIBIKE_TABLE_NAME, DATASET_NAME)
Exemplo n.º 7
# COMMAND ----------

# MAGIC %md <h4>Create Time Dimension</h4>

# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.functions import concat, col, lit

time = money.select("IssuedAt") \
             .withColumn("year",year(money['IssuedAt'])) \
             .withColumn("day",dayofmonth(money['IssuedAt'])) \
             .withColumn("month",month(money['IssuedAt'])) \
             .withColumn("hour",hour(money['IssuedAt'])) \
             .withColumn("minute",minute(money['IssuedAt'])) \
             .withColumn("dayofweek",date_format(money['IssuedAt'],'EEEE')) \
             .withColumn("weekofyear",weekofyear(money['IssuedAt'])) \
             .withColumn("time_id",concat(col("year"), col("month"), col("day"),col("hour"),col("minute"))) \
             .drop('IssuedAt') \
             .distinct()
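Because concat() does not zero-pad the month, day, hour and minute parts, two different timestamps can collide on the same time_id (e.g. month 1 / day 11 vs month 11 / day 1). A hedged alternative is to build the key with a fixed-width date_format pattern; time_id_alt is an illustrative name:

time_id_alt = money.select("IssuedAt") \
                   .withColumn("time_id", date_format(money['IssuedAt'], 'yyyyMMddHHmm'))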

# COMMAND ----------

# MAGIC %md <h4>Create a Product Dimension</h4>

# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.functions import concat, col, lit

product = money.select('ticketType','Price')
Exemplo n.º 8
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import sys
import pandas as pd

sc = SparkSession \
    .builder \
    .appName("temperatures") \
    .getOrCreate()

# Please enter a set of cities as the command-line arguments
cities = sys.argv[1:]  # for example: Kyiv Paris

data_file = "C:/Users/eugen/Documents/temperatures/temperatures.csv"
df = sc.read.csv(data_file, header=True, sep=",").cache()
df2 = df.withColumn('Temperature', df['Temperature'].cast('double'))
# lowercase 'yyyy' is the calendar year; uppercase 'YYYY' would be the week-based year
dfByMonth = df2.withColumn('Date', F.date_format('Date', 'yyyy-MM'))

# 4 Max, Min, Avg Temperature per month for a given N of cities
df_cities = dfByMonth.filter(dfByMonth.City.isin(cities))

df_result = df_cities.groupBy("City", "Date").agg(
    max(col("Temperature")).alias("MaxTemp"),
    min(col("Temperature")).alias("MinTem"),
    round(mean(col("Temperature")), 1).alias("AvgTemp"))

df_result.toPandas().to_csv("C:/Users/eugen/Documents/temperatures/task-4.csv")
Exemplo n.º 9
def prep_travelers_data(config):
    """
    Read travelers data in from SAS files into Spark and export to CSV
    """
    # Initiate spark connection
    spark = SparkSession.builder.config("spark.jars.packages","saurfang:spark-sas7bdat:2.0.0-s_2.11")\
        .enableHiveSupport().getOrCreate()

    # Read data file into spark dataframe
    i94_df = spark.read.format('com.github.saurfang.sas.spark').load(
        config['INPUT']['TRAVELERS'])

    # Rename columns
    travel_data = i94_df.selectExpr("i94port as iata_code",
                                    "arrdate as arrival_date", "i94bir as age",
                                    "i94visa as visa",
                                    "biryear as year_of_birth", "gender")

    # Filter out any non-existent airport codes
    travel_data = travel_data.filter(travel_data.iata_code != 'XXX')

    # Convert the SAS date to a regular date type
    start_date = datetime.datetime(1960, 1, 1)
    convert_sas_date = F.udf(
        lambda x: start_date + datetime.timedelta(days=int(x))
        if x is not None else None, T.DateType())
    travel_data_clean = travel_data.withColumn(
        'arrival_date', convert_sas_date('arrival_date'))

    # Extract the arrival year, month, and day into separate columns
    travel_data_clean = travel_data_clean.withColumn(
        "arrival_year", F.date_format(F.col("arrival_date"), "y"))
    travel_data_clean = travel_data_clean.withColumn(
        "arrival_month", F.date_format(F.col("arrival_date"), "M"))
    travel_data_clean = travel_data_clean.withColumn(
        "arrival_day", F.date_format(F.col("arrival_date"), "d"))

    # Drop additional column and filter out nulls from gender
    travel_data_clean = travel_data_clean.drop(F.col('arrival_date'))
    travel_data_clean = travel_data_clean.filter(
        travel_data_clean.gender.isNotNull())

    # Cast datatypes to the appropriate column types
    travel_data_final = travel_data_clean.selectExpr(
        "iata_code", "cast(age as int) as age", "cast(visa as int) as visa",
        "gender", "cast(year_of_birth as int) as year_of_birth",
        "cast(arrival_year as int) as arrival_year",
        "cast(arrival_month as int) as arrival_month",
        "cast(arrival_day as int) as arrival_day")

    # Export the dataframe to csv format
    travel_data_final.write.mode("overwrite").csv(
        config['OUTPUT']['FOLDER'] + '/' + config['OUTPUT']['TRAVELERS'])

    # Remove files that are not necessary for import to redshift
    for f in os.listdir(config['OUTPUT']['FOLDER'] + '/' +
                        config['OUTPUT']['TRAVELERS']):
        if f.endswith('crc') or f.startswith('_'):
            os.remove(
                f"{config['OUTPUT']['FOLDER'] + '/' + config['OUTPUT']['TRAVELERS']}/{f}"
            )
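The SAS-date UDF above could also be written with built-in date arithmetic; a hedged sketch, assuming arrival_date still holds the SAS day offset (days since 1960-01-01) at this point:

from pyspark.sql import functions as F

# date_add handles nulls the same way the UDF does (null in -> null out)
travel_data_clean = travel_data.withColumn(
    'arrival_date',
    F.expr("date_add(to_date('1960-01-01'), cast(arrival_date as int))"))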
Exemplo n.º 10
gps_df = gps_df.withColumn(
    "Datetime", F.to_timestamp(F.substring(F.col("Datetime"), 0, 19)))

# Import train_hire_stats.csv as dataframe using the defined schema
schema = StructType([
    StructField("Zone_ID", ByteType(), False),
    StructField("Date", TimestampType(), False),
    StructField("Hour_slot", ByteType(), False),
    StructField("Hire_count", ShortType(), False)
])
train_df = spark.read.format("csv").option("header", "true").option(
    "delimiter", ",").schema(schema).load("data/train_hire_stats.csv")

train_df = train_df.withColumn(
    'Day_of_the_week',
    (F.date_format(train_df["Date"], "u").cast(IntegerType())))

train_df = train_df.withColumn(
    'Month', (F.date_format(train_df["Date"], "M").cast(IntegerType())))

# Import test_hire_stats.csv as dataframe using the defined schema
schema = StructType([
    StructField("Test_ID", ShortType(), False),
    StructField("Zone_ID", ByteType(), False),
    StructField("Date", TimestampType(), False),
    StructField("Hour_slot", ByteType(), False),
    StructField("Hire_count", ByteType(), False)
])
test_df = spark.read.format("csv").option("header", "true").option(
    "delimiter", ",").schema(schema).load("data/test_hire_stats.csv")
Exemplo n.º 11
# COMMAND ----------

train_df = spark.sql("SELECT * FROM traindf where StoreState ='Pichincha'")

# COMMAND ----------

train_df.show()

# COMMAND ----------

train_df = train_df.withColumnRenamed("Date","Date_Date")

# COMMAND ----------

from pyspark.sql.functions import date_format
df3 = train_df.select('Date_Date', date_format('Date_Date', 'u').alias('dow_number'), date_format('Date_Date', 'E').alias('dow_string'))
df3 = df3.distinct()
df3 = df3.withColumnRenamed("Date_Date","Date2")

# COMMAND ----------

store_dept_data = train_df.groupBy("StoreState","ItemFamily", "Date_Date").sum("Units").orderBy("Date_Date").join(df3,df3.Date2 == train_df.Date_Date)

# COMMAND ----------

store_dept_data.show(10)

# COMMAND ----------

store_dept_data = store_dept_data.join(stores, (stores.state == store_dept_data.StoreState), "left")
    "cell_250m_lon",
    llc_lon + nyc["x_250m_cell"] * x_grid_step + 0.5 * x_grid_step)

nyc = nyc.withColumn("y_250m_cell",
                     ((nyc["lat"] - llc_lat) / y_grid_step).cast('integer'))
nyc = nyc.withColumn(
    "cell_250m_lat",
    llc_lat + nyc["y_250m_cell"] * y_grid_step + 0.5 * y_grid_step)

nyc = nyc.withColumn('cell_index',
                     concat(col("x_250m_cell"), lit(";"), col("y_250m_cell")))

# create hour column
nyc = nyc.withColumn(
    "hour",
    date_format(col("timestamp").cast("timestamp"), "yyyy-MM-dd HH:00"))

# count cell aggregations and save to file
hourly_counts = nyc.groupby("hour", "cell_index",
                            "class").agg(countDistinct("ad_id_upper"))

hourly_counts.write \
.format("com.databricks.spark.csv") \
.mode("overwrite") \
.save("/user/bjb417/covid/output/nyc/nyc_land_use/nyc_250mGrid_landUse_uniqueDev_hourlyCounts_active14days.csv")

# save 250m x 250m grid information
grid = nyc.select("cell_index", "x_250m_cell", "y_250m_cell", "cell_250m_lon", "cell_250m_lat") \
 .drop_duplicates(subset=['cell_index'])

grid.write \
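The hourly bucket built earlier with date_format(..., 'yyyy-MM-dd HH:00') could also be produced with date_trunc, which keeps a real timestamp instead of a string; a minimal sketch, with hour_ts as an illustrative column name:

from pyspark.sql.functions import col, date_trunc

# truncate to the start of the hour; the result is a TimestampType column
nyc = nyc.withColumn("hour_ts", date_trunc("hour", col("timestamp").cast("timestamp")))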
Exemplo n.º 13
def join_function(path_nbr, path_nlu):
    '''
    returns joined spark df 
    
    '''

    #nlu

    # path = '/Users/amirdavidoff/Desktop/data/enriched_data/nlu'
    spark_nlu = sqlContext.read.parquet(path_nlu)

    spark_nlu = spark_nlu.filter((F.to_date("date") >= F.lit("2019-07-01")))

    #nbr

    # path = '/Users/amirdavidoff/Desktop/data/enriched_data/nbr'
    spark_nbr = sqlContext.read.parquet(path_nbr)

    spark_nbr = spark_nbr.filter((F.to_date("date") >= F.lit("2019-07-01")))

    spark_nbr = spark_nbr.withColumn('source', F.lit('nbr'))
    spark_nlu = spark_nlu.withColumn('source', F.lit('nlu'))

    #changed column names

    for c in spark_nbr.columns:

        spark_nbr = spark_nbr.withColumnRenamed(c, "nbr_{}".format(c))

    for c in spark_nlu.columns:

        spark_nlu = spark_nlu.withColumnRenamed(c, "nlu_{}".format(c))

    nbr_cols = [
        'nbr_sender_id', 'nbr_retailer_id', 'nbr_timestamp',
        'nbr_ts_plus_response', 'nbr_conv', 'nbr_ack_text',
        'nbr_response_code', 'nbr_possible_values', 'nbr_source', 'nbr_date'
    ]

    nlu_cols = [
        'nlu_sender_id', 'nlu_retailer_id', 'nlu_gender', 'nlu_age_group',
        'nlu_text', 'nlu_timestamp', 'nlu_intents_list', 'nlu_subvertical',
        'nlu_positive_aspects', 'nlu_positive_product_type',
        'nlu_positive_brands', 'nlu_negative_aspects',
        'nlu_negative_product_type', 'nlu_conv', 'nlu_source', 'nlu_date'
    ]

    spark_nbr2 = spark_nbr.select(nbr_cols)
    spark_nlu2 = spark_nlu.select(nlu_cols)

    jnd = spark_nlu2.join(spark_nbr2,
                          spark_nbr2.nbr_source == spark_nlu2.nlu_source,
                          how='full_outer')

    #jnd.count()

    collect_values_udf = F.udf(collect_values, ArrayType(StringType()))

    jnd = jnd.withColumn('nbr_possible_answers',
                         collect_values_udf(F.col('nbr_possible_values')))

    jnd = jnd.withColumn(
        'jnd_sender_id',
        F.when(F.col('nlu_sender_id').isNull(),
               F.col('nbr_sender_id')).otherwise(F.col('nlu_sender_id')))
    jnd = jnd.withColumn(
        'jnd_ts',
        F.when(F.col('nbr_ts_plus_response').isNull(),
               F.col('nlu_timestamp')).otherwise(
                   F.col('nbr_ts_plus_response')))
    jnd = jnd.withColumn(
        'jnd_retailer',
        F.when(F.col('nlu_retailer_id').isNull(),
               F.col('nbr_retailer_id')).otherwise(F.col('nlu_retailer_id')))

    # function that marks q's as answered
    ''' could make this function better with first/last (ignoring nulls) instead of taking lead and lag2 '''
    def check_isin(lead_nlu_text, lead_nlu_text2, possible_values,
                   question_code, lag_positive_aspects, lead_positive_aspects,
                   lag_subvertical, lead_subvertical, lag_pos_product_type,
                   lead_pos_product_type, lead_pos_product_type2):

        try:

            # check if response value is in quick replies
            if (lead_nlu_text in possible_values) or (lead_nlu_text2
                                                      in possible_values):
                return 1

            if (question_code == 'color_question') and (
                    'color' not in lag_positive_aspects) and (
                        'color' in lead_positive_aspects):
                return 1

            if (question_code == 'subvertical_selection'
                    or question_code == 'subvertical_selection_second') and (
                        lag_subvertical is None) and (lead_subvertical
                                                      is not None):
                return 1

            if (question_code == 'product_type_selection') and (
                    lag_pos_product_type is None) and (
                        (lead_pos_product_type is not None) or
                        (lead_pos_product_type2 is not None)):
                return 1

            else:
                return 0

        except Exception:
            return None

    check_isin_udf = F.udf(check_isin, IntegerType())

    window = Window.partitionBy("jnd_sender_id").orderBy(["jnd_ts"])

    jnd = jnd.withColumn(
        'is_answered',
        check_isin_udf(
            F.lead('nlu_text').over(window),
            F.lead('nlu_text', 2).over(window), F.col('nbr_possible_answers'),
            F.col('nbr_response_code'),
            F.lag('nlu_positive_aspects').over(window),
            F.lead('nlu_positive_aspects').over(window),
            F.lag('nlu_subvertical').over(window),
            F.lead('nlu_subvertical').over(window),
            F.lag('nlu_positive_product_type').over(window),
            F.lead('nlu_positive_product_type').over(window),
            F.lead('nlu_positive_product_type', 2).over(window)))

    jnd = jnd.fillna({'is_answered': 0})

    # fix ids  l22y83vocf, 00fma5y5xgf
    ''' data set features '''
    ''' DONT FORGET THAT YOUVE ADDED RESPONSE TIME TO TS MIGHT BE A HUGE BIAS '''

    jnd = jnd.withColumn(
        'question_rank',
        F.sum(F.when(F.col('nbr_response_code').isNotNull(),
                     1).otherwise(0)).over(window))

    jnd = jnd.withColumn('time_from_start',
                         F.col('jnd_ts') - F.min('jnd_ts').over(window))

    jnd = jnd.withColumn('sum_answer',
                         F.sum(F.lag('is_answered').over(window)).over(window))

    jnd = jnd.withColumn('num_quick_replies', F.size('nbr_possible_answers'))

    jnd = jnd.withColumn('hour', F.hour('nbr_date'))

    jnd = jnd.withColumn('day_of_week', F.date_format('nbr_date', 'u'))

    jnd = jnd.withColumn(
        "last_nbr_code",
        F.last(F.lag("nbr_response_code").over(window), True).over(window))

    nlu_cols = [
        'nlu_intents_list', 'nlu_age_group', 'nlu_gender', 'nlu_subvertical',
        'nlu_positive_aspects', 'nlu_positive_product_type',
        'nlu_positive_brands', 'nlu_negative_aspects',
        'nlu_negative_product_type'
    ]

    for c in nlu_cols:

        jnd = jnd.withColumn("last_{}".format(c), F.last(c, True).over(window))

    return jnd
Exemplo n.º 14
bins_y=np.array(y_cells).tolist()[0]
# get the bound value
interval_lon = (r_lon+lon_det-(l_lon-lon_det))/x_n
interval_lat = (t_lat+lat_det-(b_lat-lat_det))/y_n
min_lon = l_lon-lon_det
min_lat = b_lat-lat_det

# Get the coordinates of each position


# read the trackestimate table, which contains each location
trackestimate_table = "birds.trackestimate"
trackestimate = hc.read.table(trackestimate_table)
track_subset = trackestimate.persist()
# transform the time format to drop those half seconds
trackestimate_subset = track_subset.withColumn('dt', F.date_format('timestamp', 'yyyy-MM-dd HH:mm'))
# define function to get the coordinates
# udf_x = UserDefinedFunction(lambda x: str(loads(x,hex=True).__geo_interface__['coordinates'][0]), StringType())
# udf_y = UserDefinedFunction(lambda x: str(loads(x,hex=True).__geo_interface__['coordinates'][1]), StringType())
def do_something_to_cell(geo_string):
    return [cell.split(' ') for cell in str(geo_string[9:-1]).split(' ')]

udf_x = UserDefinedFunction(lambda x: do_something_to_cell(x)[0][0], StringType())
udf_y = UserDefinedFunction(lambda x: do_something_to_cell(x)[1][0], StringType())
# transform the coordinates and the datatype
trackestimate_subset_coord=trackestimate_subset.withColumn('position_x', udf_x(F.col('st_astext')).astype('float'))
trackestimate_subset_coord=trackestimate_subset_coord.withColumn('position_y',udf_y(F.col('st_astext')).astype('float'))


# Assign the coordinates into cells
Exemplo n.º 15
def process_log_data(spark, input_data, output_data):
    '''
    Loads the log data from the S3 location, builds the users, time and song_plays tables, and writes them back to the S3 location as parquet

            Parameters:
                    spark -- spark session
                    input_data -- input s3 location
                    output_data -- output s3 location
                    
            Returns:
                    None
    '''
    # get filepath to log data file
    log_data = os.path.join(input_data, "log-data/2018/11/*.json")

    # read log data file
    dflogs = spark.read.json(log_data)

    # filter by actions for song plays
    filterDF = dflogs.where("page=='NextSong'")

    # extract columns for users table
    #artists_table =
    dfUserWithSchema = dflogs.select([
        c for c in dflogs.columns
        if c in ['userId', 'firstName', 'lastName', 'gender', 'level']
    ])
    dfUserWithSchema.createOrReplaceTempView("users")

    # write users table to parquet files
    dfUserWithSchema.write.mode('overwrite').parquet(
        "s3a://udacity-demo-1-1/users.parquet")
    print("complete users file")
    spark.sql("SELECT count(*) FROM users").show()

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    timestDF = filterDF.withColumn("timestamp", get_timestamp(filterDF.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d'))
    datetimeDF = filterDF.withColumn("datetime", get_datetime(filterDF.ts))

    # extract columns to create time table
    get_time_val = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%H-%M-%S'))
    timeallDF = timestDF.withColumn('starttime', get_time_val(filterDF.ts))
    timeallDF = timeallDF.withColumn('hour', hour('timestamp'))
    timeallDF = timeallDF.withColumn('day', dayofmonth('timestamp'))
    timeallDF = timeallDF.withColumn('week', weekofyear('timestamp'))
    timeallDF = timeallDF.withColumn('month', month('timestamp'))
    timeallDF = timeallDF.withColumn('year', year('timestamp'))
    timeallDF = timeallDF.withColumn('weekday', date_format('timestamp', 'E'))
    timeDF = timeallDF.select([
        c for c in timeallDF.columns if c in
        ['starttime', 'hour', 'day', 'week', 'month', 'year', 'weekday']
    ])
    parqtimeDF = timeDF.write.partitionBy("year", "month").mode(
        'overwrite').parquet("s3a://udacity-demo-1-1/time.parquet")
    parquettimeDF = spark.read.parquet("s3a://udacity-demo-1-1/time.parquet")
    parquettimeDF.createOrReplaceTempView("time")
    print("complete time file")
    spark.sql("SELECT count(*) FROM time").show()

    # read in song data to use for songplays table
    song_artist_df = spark.sql(
        "SELECT song_id, a.artist_id,title,artist_name,duration FROM songs a inner join artists b where a.artist_id=b.artist_id"
    )
    song_artist_df.createOrReplaceTempView("song_artist")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = filterDF.join(song_artist_df, (filterDF.song == song_artist_df.title) & (filterDF.artist == song_artist_df.artist_name), 'left_outer')\
        .select(
            get_time_val(filterDF.ts).alias('starttime'),
            col("userId").alias('user_id'),
            filterDF.level,
            song_artist_df.song_id,
            song_artist_df.artist_id,
            col("sessionId").alias("session_id"),
            filterDF.location,
            col("useragent").alias("user_agent"),
            year(get_datetime(filterDF.ts)).alias('year'),
            month(get_datetime(filterDF.ts)).alias('month')

        )
    songplays_table = songplays_table.withColumn('songplay_id',
                                                 monotonically_increasing_id())
    songplays_table.createOrReplaceTempView("song_plays")

    # write songplays table to parquet files partitioned by year and month
    parqsongplaysDF = songplays_table.write.partitionBy('year', 'month').mode(
        'overwrite').parquet("s3a://udacity-demo-1-1/songplay.parquet")
    print("complete song_plays file")
    spark.sql("SELECT count(*) FROM song_plays").show()
Exemplo n.º 16
# MAGIC %md
# MAGIC Next, we'll use the time functions to convert our timestamp into Central European Summer Time (CEST).

# COMMAND ----------

filtered.select('timestamp').show(5)

# COMMAND ----------

# MAGIC %md
# MAGIC Let's try applying `date_format` to see how it operates.

# COMMAND ----------

(filtered
 .select('timestamp', func.date_format('timestamp', 'MM/dd/yyyy').alias('date'))
 .show(5))

# COMMAND ----------

withDate = filtered.withColumn('date', func.date_format('timestamp', 'MM/dd/yyyy'))
withDate.printSchema()
withDate.select('title', 'timestamp', 'date').show(3)

# COMMAND ----------

# MAGIC %md
# MAGIC It seems like we want a different function for time zone manipulation and to store the object as a timestamp rather than a string.  Let's use `from_utc_timestamp` to get a timestamp object back with the correct time zone.

# COMMAND ----------
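The cell that applies it is not part of this excerpt; a minimal sketch of from_utc_timestamp on the withDate frame built above, where the 'Europe/Paris' zone id is an illustrative choice for CEST and the new names are hypothetical:

withCEST = withDate.withColumn('cestTime', func.from_utc_timestamp('timestamp', 'Europe/Paris'))
withCEST.printSchema()
withCEST.select('title', 'timestamp', 'cestTime').show(3)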
Exemplo n.º 17
def getFeature(hourlyfeaturedf, scoreBegin):
    featureeddf = hourlyfeaturedf
    print(hourlyfeaturedf.columns)
    hourlyfeaturedf.show(5)
    scoreBegin, scoreEnd, featureBegin, scoreEndDateTime, featureBeginDateTime, featureEndDateTime = getScoreTime(
        scoreBegin)

    featureeddf = featureeddf.filter(
        featureeddf.StartHour >= lit(scoreBegin).cast(TimestampType())).filter(
            featureeddf.StartHour < lit(scoreEnd).cast(TimestampType()))

    # Extract some time features from "SessionStartHourTime" column
    featureeddf = featureeddf.withColumn('year',
                                         year(featureeddf['StartHour']))
    featureeddf = featureeddf.withColumn('month',
                                         month(featureeddf['StartHour']))
    featureeddf = featureeddf.withColumn('hourofday',
                                         hour(featureeddf['StartHour']))

    featureeddf = featureeddf.withColumn('weekofyear',
                                         weekofyear(featureeddf['StartHour']))
    dayofweek = F.date_format(featureeddf['StartHour'], 'EEEE')
    featureeddf = featureeddf.withColumn('dayofweek', dayofweek)
    featureeddf = featureeddf.withColumn('dayofmonth',
                                         dayofmonth(featureeddf['StartHour']))

    import datetime
    trainBeginTimestamp = int(
        datetime.datetime.strftime(
            datetime.datetime.strptime(trainBegin, "%Y-%m-%d %H:%M:%S"), "%s"))

    def linearTrend(x):
        if x is None:
            return 0
        # return # of hour since the start of the training period
        return (x - trainBeginTimestamp) / 3600 / 24 / 365.25

    # linearTrend returns a float (fraction of a year), so declare DoubleType
    linearTrendUdf = udf(linearTrend, DoubleType())
    featureeddf = featureeddf.withColumn(
        'linearTrend', linearTrendUdf(F.unix_timestamp('StartHour')))
    cal = USFederalHolidayCalendar()
    holidays_datetime = cal.holidays(start=holidayBegin,
                                     end=holidayEnd).to_pydatetime()
    holidays = [t.strftime("%Y-%m-%d") for t in holidays_datetime]

    def isHoliday(x):
        if x is None:
            return 0
        if x in holidays:
            return 1
        else:
            return 0

    isHolidayUdf = udf(isHoliday, IntegerType())
    featureeddf = featureeddf.withColumn(
        'date', date_format(col('StartHour'), 'yyyy-MM-dd'))
    featureeddf = featureeddf.withColumn("Holiday", isHolidayUdf('date'))

    def isBusinessHour(x):
        if x is None:
            return 0
        if x >= 8 and x <= 18:
            return 1
        else:
            return 0

    isBusinessHourUdf = udf(isBusinessHour, IntegerType())
    featureeddf = featureeddf.withColumn("BusinessHour",
                                         isBusinessHourUdf('hourofday'))

    def isMorning(x):
        if x is None:
            return 0
        if x >= 6 and x <= 9:
            return 1
        else:
            return 0

    isMorningUdf = udf(isMorning, IntegerType())
    featureeddf = featureeddf.withColumn("Morning", isMorningUdf('hourofday'))

    featureeddf.persist()

    return featureeddf
Exemplo n.º 18
from __future__ import print_function
import pyspark
from pyspark.sql import functions as F
import drpyspark


drpyspark.enable_debug_output()
with pyspark.SparkContext() as sc:
    sqlContext = pyspark.sql.SQLContext(sc)
    logs = sc.parallelize([
        {'timestamp': 1470663000, 'url': 'http://example.com/', 'ip': '192.168.1.1'},
        {'timestamp': 1470663163, 'url': 'http://example.com/', 'ip': '192.168.1.1'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article1', 'ip': '192.168.1.2'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article2', 'ip': '192.168.1.2'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article3', 'ip': '192.168.1.2'},
    ])
    logs = logs.map(lambda l: pyspark.sql.Row(**l))
    logs = (sqlContext.createDataFrame(logs)
            # keep the time component (to_date would truncate it to midnight)
            .withColumn('timestamp', F.from_unixtime('timestamp').cast('timestamp'))
            .withColumn('minute', F.date_format('timestamp', "yyyy-MM-dd'T'HH")))
    (logs
     .groupBy(['minute', 'url'])
     .count()
     .show())
Exemplo n.º 19
    def ingest(self,
               src_resource=None,
               src_path=None,
               src_provider=None,
               dest_resource=None,
               dest_path=None,
               dest_provider=None,
               eventsourcing=False):

        logger = logging.getLogger()

        #### constants:
        now = datetime.now()
        reserved_cols = ['_ingested', '_date', '_state']

        #### Source metadata:
        md_src = data.metadata(src_resource, src_path, src_provider)
        if not md_src:
            logger.error("No metadata")
            return

        # filter settings from src (provider and resource)
        filter_params = utils.merge(
            md_src['provider'].get('read', {}).get('filter', {}),
            md_src['resource'].get('read', {}).get('filter', {}))

        #### Target metadata:

        # default path for destination is src path
        if (not dest_resource) and (not dest_path) and dest_provider:
            dest_path = md_src['resource']['path']

        md_dest = data.metadata(dest_resource, dest_path, dest_provider)
        if not md_dest:
            return

        if 'read' not in md_dest['resource']:
            md_dest['resource']['read'] = {}

        # match filter with the one from source resource
        md_dest['resource']['read']['filter'] = filter_params

        #### Read source resource
        try:
            df_src = self._read(md_src)
        except Exception as e:
            logger.exception(e)
            return

        #### Read destination schema info
        try:
            schema_path = '{}/schema'.format(md_dest['resource']['path'])
            md = data.metadata(path=schema_path, provider=dest_provider)
            df_schema = self._read(md)
            schema_date_str = df_schema.sort(
                desc("date")).limit(1).collect()[0]['id']
        except Exception as e:
            # logger.warning('source schema does not exist yet.')
            schema_date_str = now.strftime('%Y%m%dT%H%M%S')

        # destination path - append schema date
        dest_path = '{}/{}'.format(md_dest['resource']['path'],
                                   schema_date_str)
        md_dest['resource']['path'] = dest_path
        md_dest['url'] = data._url(md_dest)

        # if schema not present or schema change detected
        schema_changed = True

        try:
            df_dest = self._read(md_dest)

            # compare schemas
            df_src_cols = [x for x in df_src.columns if x not in reserved_cols]
            df_dest_cols = [
                x for x in df_dest.columns if x not in reserved_cols
            ]
            schema_changed = df_src[df_src_cols].schema.json(
            ) != df_dest[df_dest_cols].schema.json()
        except Exception as e:
            # logger.warning('schema does not exist yet.')
            df_dest = df_src.filter("False")

        if schema_changed:
            # Different schema, update schema table with new entry
            schema_entry = (schema_date_str, now, df_src.schema.json())
            df_schema = self.context().createDataFrame(
                [schema_entry], ['id', 'date', 'schema'])

            # write the schema to destination provider
            md = data.metadata(path=schema_path,
                               provider=md_dest['resource']['provider'])
            self._write(df_schema, md, mode='append')

        # partitions
        partition_cols = ['_ingested']

        #init df_diff to empty dest dataframe
        df_diff = df_dest.filter("False")

        if not eventsourcing:
            if filter_params.get('policy') == 'date' and filter_params.get(
                    'column'):
                df_diff = dataframe_update(df_src,
                                           df_dest,
                                           updated_col='_ingested',
                                           eventsourcing=eventsourcing)
                df_diff = df_diff.withColumn(
                    '_date',
                    date_format(
                        from_utc_timestamp(filter_params['column'], 'GMT+7'),
                        'yyyy-MM-dd'))
                partition_cols += ['_date']
                ingest_mode = 'append'
                options = {'mode': ingest_mode, 'partitionBy': partition_cols}
            else:
                df_diff = dataframe_update(df_src,
                                           df_dest.filter("False"),
                                           updated_col='_ingested',
                                           eventsourcing=eventsourcing)
                ingest_mode = 'overwrite'
                options = {'mode': ingest_mode, 'partitionBy': partition_cols}
        else:
            # to do
            logger.fatal('event sourcing not implemented yet')

        records_add = df_diff.filter("_state = 0").count()
        records_del = df_diff.filter("_state = 1").count()

        if records_add or records_del or schema_changed:
            md = data.metadata(path=dest_path,
                               provider=md_dest['resource']['provider'])
            self._write(df_diff, md, **options)

        end = datetime.now()
        time_diff = end - now

        logdata = {
            'src_url': md_src['url'],
            'src_table': md_src['resource']['path'],
            'source_option': filter_params,
            'schema_change': schema_changed,
            'target': dest_path,
            'upserts': records_add,
            'deletes': records_del,
            'diff_time': time_diff.total_seconds()
        }

        logtype = {
            'dlf_type': '{}.{}'.format(self.__class__.__name__, func_name())
        }
        logger.info(logdata, extra=logtype)
Exemplo n.º 20
        db_analytical_temp +
        ".Euw_aggregated_cust_zip_temp b on  a.CONCAT_AGMNT_NO==b.concat_agmnt_no and a.CUSTOMER_ID==b.customer_id and a.ACCOUNT_SEQ==b.account_seq and a.AGREEMENT_SEQ==b.agreement_seq"
    )
    AMInonAMI_Zip = AMInonAMI_Zip.withColumnRenamed("TOWN_CODE", "ZIP_CODE")
    AMInonAMI_Zip = AMInonAMI_Zip.withColumn(
        'USAGE_VALUE',
        F.col('USAGE_VALUE').cast(DoubleType()))
    fileLog("reading AMInonAMI_Zip data")
    ### reading weather_dcast data
    Wthr_Dcast = spark.sql("select * from " + db_analytical_temp +
                           ".Euw_weather_data_temp")
    date_format_function = udf(lambda x: dtm.strptime(x, '%Y-%m-%d'),
                               DateType())
    Wthr_Dcast = Wthr_Dcast.withColumn(
        "WEATHER_DATE",
        date_format_function(date_format(col("WEATHER_DATE"), "yyyy-MM-dd")))
    fileLog("reading weather_dcast data")
    # take the unique set out of it
    usage_set = AMInonAMI_Zip.select('CONCAT_AGMNT_NO', 'CUSTOMER_ID',
                                     'ACCOUNT_SEQ', 'AGREEMENT_SEQ', 'SITE_ID',
                                     'SERVICE_SEQ', 'ZIP_CODE').distinct()
    ##populate additional future dates wrt usage_date and weather dates ###
    fileLog(
        "reading unique set of agreements and populating the future dates wrt usage_date and weather dates"
    )
    last_usage_date = AMInonAMI_Zip.agg(
        max('USAGE_DATE').alias('max_usage')).first()[0]  #+timedelta(1)
    last_weather_date = Wthr_Dcast.agg(
        max('WEATHER_DATE').alias('max_weather')).first()[0]

    my_udf = lambda domain: [
Exemplo n.º 21
def process_log_data(spark, input_data, output_data):
    '''Creates time, users and songplays tables in S3
    Args:
        spark: the spark session created by the create_spark_sessioin function
        input_data: the location of the song data in S3
        output_data: the location of the time, users and songplays tables in S3
    '''
    # get filepath to log data file
    log_data = f'{input_data}/log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(col('page').isin('NextSong'))

    # extract columns for users table
    users_table = df.selectExpr('userId AS user_id', 'firstName AS first_name',
                                'lastName AS last_name', 'gender',
                                'level').distinct()

    # write users table to parquet files
    users_table.write.parquet(path=f'{output_data}/users_table/',
                              mode='overwrite')

    # create timestamp column from original timestamp column
    df = df.withColumn(
        'start_time',
        date_format(to_timestamp(col('ts') / 1000),
                    format='yyyy-MM-dd HH:mm:ss'))

    # extract columns to create time table
    time_table = (df.select('start_time').withColumn(
        'year', year(col('start_time'))).withColumn(
            'month', month(col('start_time'))).withColumn(
                'dayofmonth', dayofmonth(col('start_time'))).withColumn(
                    'hour', hour(col('start_time'))).withColumn(
                        'weekofyear',
                        weekofyear(col('start_time'))).distinct())

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(path=f'{output_data}/time_table/',
                             mode='overwrite',
                             partitionBy=['year', 'month'])

    # read in song data to use for songplays table
    dfSong = spark.read.format("json").load(song_data)

    # extract columns from joined song and log datasets to create songplays table
    cond = [
        dfSong.title == df.song, dfSong.artist_name == df.artist,
        dfSong.duration == df.length
    ]
    dfJoined = df.join(dfSong, cond, how='inner')
    songplays_table = dfJoined.selectExpr(
        'ts AS start_time', 'userId AS user_id', 'level', 'song_id',
        'artist_id', 'sessionId AS session_id', 'artist_location AS location',
        'userAgent AS user_agent').distinct()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(path=f'{output_data}/songplay_table/',
                                  mode='overwrite')
Exemplo n.º 22
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(servers=args.servers,
                          topic=args.topic,
                          startingoffsets=args.startingoffsets_stream,
                          failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    df_decoded = df.select([
        "timestamp", "topic",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))\
        .withColumn("hour", date_format("timestamp", "HH"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("topic", "year", "month", "day", "hour")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
        ui_refresh = args.tinterval
    else:
        countquery = countquery_tmp.start()
        # Update the UI every 2 seconds to place less load on the browser.
        ui_refresh = 2

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, ui_refresh, colnames, args.finkwebpath,
                           "live_raw.csv", "live")

    monitor_progress_webui(countquery, ui_refresh, colnames, args.finkwebpath,
                           "history.csv", "history")

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
Exemplo n.º 23
def process_log_data(spark, input_data, output_data, songs_data):
    """
    Transforms expected input data JSON file into 3 analytics dataframes, and writes them out to the output location
    given.
    
    Parameters:
    -- spark - spark session object
    -- input_data - string ['LOCAL' OR 'REMOTE'] specifying which path to read from config file
    -- output_data - string ['LOCAL' OR 'REMOTE'] specifying which path to read from config file
    -- songs_data - songs dataframe used to build the songplays table
    """
    # get filepath to log data file
    log_data = config[input_data]['LOG_DATA']

    # Set output filepath
    output_location = config[output_data]['OUTPUT_PATH']

    # read log data file
    log_df = spark.read.format('json').load(log_data)

    # filter by actions for song plays
    log_df = log_df.filter(col('page') == 'NextSong')

    # extract columns for users table
    users_table = log_df.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'), col('gender'),
        col('level')).distinct()

    # write users table to parquet files
    users_table.write.partitionBy('gender').parquet(
        os.path.join(output_location + "/users", "users_table"), "overwrite")

    # create datetime column from original timestamp column
    log_df = log_df.withColumn('timestamp',
                               from_unixtime(col('ts') / 1000)).drop('ts')

    # extract columns to create time table
    time_table = log_df.select(
        date_format('timestamp', 'HH:mm:ss').alias('start_time'),
        hour('timestamp').alias('hour'),
        dayofmonth('timestamp').alias('day'),
        weekofyear('timestamp').alias('week'),
        month('timestamp').alias('month'),
        year('timestamp').alias('year'),
        date_format('timestamp', 'u').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_location + "/time", "time_table"), "overwrite")

    # read in song data to use for songplays table
    song_df = songs_data.distinct()

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = log_df.join(song_df, (log_df.song == song_df.title) & (log_df.artist == song_df.artist_name))\
    .withColumn('songplay_id', monotonically_increasing_id())\
    .withColumn('month', month('timestamp'))\
    .select(col('songplay_id')
            , date_format('timestamp', 'HH:mm:ss').alias('start_time')
            , col('userId').alias('user_id')
            , col('level')
            , col('song_id')
            , col('artist_id')
            , col('sessionId').alias('session_id')
            , col('location')
            , col('userAgent').alias('user_agent')
            , col('year')
            , col('month'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_location + "/songplays", "songplays_table"),
        "overwrite")
Exemplo n.º 24
def process_log_data(spark, input_data, output_data):
    """
    This function process log data and extracts 3 tables in parquet format - users, time, and songplay.
    The timestamp in log data is broken down into hour, day, week, month, year, and weekday.
    Args:
        - spark: A Spark Session
        - input_data: S3 link to Log data
        - output_data: S3 link to drop extracted tables and grab the songs data that was created in func 'process_song_data'
    Returns: None
    """
    # read log data file
    log_df = spark.read.json(input_data)

    # filter by actions for song plays
    log_filtered_df = log_df.where(log_df.page == "NextSong")

    # extract columns for users table
    users_table = log_filtered_df.select('userId', 'firstName', 'lastName',
                                         'gender',
                                         'level').dropDuplicates(['userId'])

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data +
                                                '/users/users.parquet')

    # create timestamp column from original timestamp column
    # Columns ts is in milliseconds. Divide by 1000 to get the results in seconds and convert to Timestamp Type.
    log_filtered_df = log_filtered_df.withColumn(
        'tsconvert', (col('ts') / 1000).cast(TimestampType()))
    log_filtered_df.createOrReplaceTempView("log_staging")

    # extract columns to create time table
    time_table = log_filtered_df.select(
        col('tsconvert').alias('start_time'),
        hour('tsconvert').alias('hour'),
        dayofmonth('tsconvert').alias('day'),
        weekofyear('tsconvert').alias('week'),
        month('tsconvert').alias('month'),
        year('tsconvert').alias('year'),
        date_format('tsconvert', 'EEEE').alias('weekday')).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy(
        'year',
        'month').mode('overwrite').parquet(output_data + '/time/time.parquet')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "/song_data/songs.parquet")
    song_df.createOrReplaceTempView("songs_staging")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql(
        """SELECT DISTINCT logs.tsconvert AS starttime, 
                                          logs.userId, 
                                          logs.level, 
                                          songs.song_id, 
                                          songs.artist_id, 
                                          logs.sessionId, logs.location, 
                                          logs.userAgent, 
                                          year(logs.tsconvert) as year, 
                                          month(logs.tsconvert) as month
        FROM songs_staging AS songs 
        INNER JOIN log_staging AS logs ON logs.song = songs.title""")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy(
        'year',
        'month').mode('overwrite').parquet(output_data +
                                           '/songplays/songplays.parquet')
Exemplo n.º 25
def process_log_data(spark, input_data, output_data):
    """
    Processes log data and writes the users, the time, and the songplays table into specified S3 bucket in parquet format.
    
    Parameters
    -------
        spark: object
            Spark Session object to handle the Spark Processes
        
        input_data: str
            The location of the files to read from S3 Bucket
        
        output_data: str
            The location of the files to write into S3 Bucket
    """

    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*events.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_cols = [
        "userId as user_id", "firstName as first_name",
        "lastName as last_name", "gender", "level"
    ]

    users_table = df.selectExpr(users_cols).drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users/")

    # create datetime column from original timestamp column
    get_datetime = F.udf(lambda x: D.fromtimestamp(int(x / 1000)),
                         T.TimestampType())
    df = df.withColumn("start_time", get_datetime("ts"))

    # extract columns to create time table
    time_table = df.select("start_time") \
                   .withColumn("hour", F.hour("start_time")) \
                   .withColumn("day", F.dayofmonth("start_time")) \
                   .withColumn("week", F.weekofyear("start_time")) \
                   .withColumn("month", F.month("start_time")) \
                   .withColumn("year", F.year("start_time")) \
                   .withColumn("weekday", F.date_format('start_time', 'EEEE')).drop_duplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year",
                                 "month").parquet(output_data + 'time/')

    # read in song data to use for songplays table
    songs_table = spark.read.parquet(output_data + "songs/")
    songs_table = songs_table.selectExpr(
        ["song_id", "artist_id as s_artist_id", "title"])
    artists_table = spark.read.parquet(output_data + "artists/")
    artists_table = artists_table.select(["artist_id", "location", "name"])

    song_df = songs_table.join(
        artists_table, songs_table.s_artist_id == artists_table.artist_id,
        "inner")

    cols = [
        "start_time", "userId as user_id", "level", "sessionId as session_id",
        "userAgent as user_agent", "song", "artist"
    ]
    df = df.selectExpr(cols) \
           .withColumn("songplay_id",  F.monotonically_increasing_id()) \
           .withColumn("month", F.month("start_time")) \
           .withColumn("year", F.year("start_time"))

    df = df.join(song_df,
                 (df.song == song_df.title) & (df.artist == song_df.name),
                 "left")

    # extract columns from joined song and log datasets to create songplays table
    songplays_cols = [
        "songplay_id", "start_time", "user_id", "level", "song_id",
        "artist_id", "session_id", "location", "user_agent", "year", "month"
    ]
    songplays_table = df.select(songplays_cols)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(output_data +
                                                               'songplays/')
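
Because the song/artist lookup above is a left join, log events with no catalogue match keep null song_id and artist_id. A quick hedged sanity check, continuing inside the same function with the DataFrames defined above:

    # count songplays that resolved to a song_id versus those that did not (illustrative check)
    songplays_table.groupBy(songplays_table.song_id.isNotNull().alias("matched")).count().show()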
Exemplo n.º 26
0
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import format_string, date_format

if __name__ == "__main__":
    spark = SparkSession.builder.getOrCreate()
    # reading into DF
    parkingDF = spark.read.format('csv').options(header = 'true', inferschema = 'true').load(sys.argv[1])
    openDF = spark.read.format('csv').options(header = 'true', inferschema = 'true').load(sys.argv[2])
    # creating SQL temp view from DF
    parkingDF.createOrReplaceTempView("parking")
    openDF.createOrReplaceTempView("open")
    # using subtract to get (parking - open)
    parkingDF.select('summons_number').subtract(openDF.select('summons_number')).createOrReplaceTempView("temp1")
    query = """
select parking.summons_number, plate_id, violation_precinct, violation_code, issue_date
from parking join temp1 using(summons_number)
order by parking.summons_number
    """
    result = spark.sql(query)
    # formatting and saving the result
    result.select(
        format_string('%d\t%s, %d, %d, %s',
                      result.summons_number,
                      result.plate_id,
                      result.violation_precinct,
                      result.violation_code,
                      date_format(result.issue_date, 'yyyy-MM-dd'))
    ).write.save("task1-sql.out", format="text")
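
For reference, the format_string pattern above emits one text line per ticket in the shape summons_number<TAB>plate_id, precinct, code, issue_date. The line below is a purely hypothetical illustration, not real data:

    # illustrative output line (hypothetical values): 1234567890\tGXX1234, 19, 21, 2015-06-13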
def generate_dim_date(spark, start_year=1901, number_years_out_from_start=300):
    """Create `dim_date` table containing various date feature columns.

    Args:
        spark (SparkSession): Instantiated SparkSession
        start_year (int): starting year for dim_date table.
        number_years_out_from_start (int): number out from `start_year` to increment.

    Returns:
        Spark DataFrame.
    """
    years = [start_year + i for i in range(number_years_out_from_start + 1)]
    months = [i for i in range(1, 13)]
    days = [i for i in range(1, 32)]

    years_df = spark.createDataFrame(
        pd.DataFrame({
            'year': years,
            'temp_join_key': '1'
        }))
    months_df = spark.createDataFrame(
        pd.DataFrame({
            'month': months,
            'temp_join_key': '1'
        }))
    days_df = spark.createDataFrame(
        pd.DataFrame({
            'day_of_month': days,
            'temp_join_key': '1'
        }))

    years_months_df = (years_df.join(months_df, ['temp_join_key'],
                                     how='inner'))

    years_month_days_df = (years_months_df.join(days_df, ['temp_join_key'],
                                                how='inner'))

    date_keys = (
        years_month_days_df.withColumn(
            'date',
            to_date(
                concat(col('year'), lpad(col('month'), 2, '0'),
                       lpad(col('day_of_month'), 2, '0')), 'yyyyMMdd'))
        # remove invalid dates
        .filter("date IS NOT NULL").withColumn(
            'date_key',
            regexp_replace(col('date').cast('string'), '-',
                           '').cast('integer')))
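    # e.g. a row for 2024-03-05 gets date_key 20240305: the ISO date string with its dashes removed, cast to int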

    date_features = (date_keys
                     # get `week` and `quarter`
                     .withColumn('week', weekofyear(col('date')))
                     .withColumn('quarter', quarter(col('date')))
                     # get `day_name` and `month_name`
                     .withColumn('day_name', date_format(col('date'), 'EEEE'))
                     .withColumn('month_name', date_format(col('date'), 'MMMM'))
                     # get `date_year`, `date_quarter`, `date_month`, `date_week`
                     .withColumn('date_week', expr("MIN(date) OVER(PARTITION BY week, year)"))
                     .withColumn('date_month', date_format(col('date'), 'yyyy-MM-01'))
                     .withColumn('date_quarter', expr("MIN(date) OVER(PARTITION BY quarter, year)"))
                     .withColumn('date_year', date_format(col('date'), 'yyyy-01-01'))
                     # get `day_of_week`, `day_of_quarter`, `day_of_year`
                     .withColumn('day_of_week', dayofweek(col('date')))
                     .withColumn('day_of_quarter', datediff(col('date'), col('date_quarter')) + lit(1))
                     .withColumn('day_of_year', dayofyear(col('date')))
                     # get `weekend_flag`, `us_holiday_flag`, `business_day_flag`, `leap_year_flag`,
                     # `month_start_flag`, `month_end_flag`
                     .withColumn('weekend_flag', when(col('day_of_week').isin([7, 1]), 'Y').otherwise('N'))
                     .withColumn('us_holiday_flag', pd_is_holiday_usa(col('date').cast('timestamp')))
                     .withColumn('us_biz_day_flag', when((col('weekend_flag') == lit('Y')) |
                                                         (col('us_holiday_flag') == lit('Y')), 'Y').otherwise('N'))
                     .withColumn('leap_year_flag',
                                 when(dayofmonth(last_day(concat(col('year'), lit('-02-01')).cast('date'))) == 29, 'Y')
                                 .otherwise('N'))
                     .withColumn('month_start_flag', when(col('day_of_month') == lit(1), 'Y').otherwise('N'))
                     .withColumn('month_end_flag', when(col('date') == last_day(col('date')), 'Y').otherwise('N'))
                     # get `pct_into_month`, `pct_into_quarter`, `pct_into_year`
                     .withColumn('pct_into_month',
                                 (col('day_of_month') / dayofmonth(last_day(col('date')))).cast('decimal(7, 6)'))
                     .withColumn('date_quarter_end',
                                 when(col('quarter') == lit(1), concat(col('year'), lit('-03-31')))
                                 .when(col('quarter') == lit(2), concat(col('year'), lit('-06-30')))
                                 .when(col('quarter') == lit(3), concat(col('year'), lit('-09-30')))
                                 .when(col('quarter') == lit(4), concat(col('year'), lit('-12-31')))
                                 .otherwise(None)
                                 .cast('date'))
                     .withColumn('days_in_quarter', datediff(col('date_quarter_end'), col('date_quarter')) + lit(1))
                     .withColumn('pct_into_quarter',
                                 (col('day_of_quarter') / col('days_in_quarter')).cast('decimal(7, 6)'))
                     .withColumn('pct_into_year',
                                 (col('day_of_year') / when(col('leap_year_flag') == lit('Y'), 366.0).otherwise(365.0))
                                 .cast('decimal(7, 6)'))
                     # get seasons
                     .withColumn('season_northern',
                                 when(col('month').isin(12, 1, 2), 'Winter')
                                 .when(col('month').isin(3, 4, 5), 'Spring')
                                 .when(col('month').isin(6, 7, 8), 'Summer')
                                 .when(col('month').isin(9, 10, 11), 'Fall')
                                 .otherwise('UNKNOWN'))
                     .withColumn('season_southern',
                                 when(col('month').isin(6, 7, 8), 'Winter')
                                 .when(col('month').isin(9, 10, 11), 'Spring')
                                 .when(col('month').isin(12, 1, 2), 'Summer')
                                 .when(col('month').isin(3, 4, 5), 'Fall')
                                 .otherwise('UNKNOWN')))

    dim_date = (date_features.sort('date').select([
        'date_key', 'date', 'date_week', 'date_month', 'date_quarter',
        'date_year', 'day_of_week', 'day_of_month', 'day_of_quarter',
        'day_of_year', 'week', 'month', 'quarter', 'year', 'days_in_quarter',
        'day_name', 'month_name', 'season_northern', 'season_southern',
        'weekend_flag', 'us_holiday_flag', 'us_biz_day_flag',
        'month_start_flag', 'month_end_flag', 'leap_year_flag',
        'pct_into_month', 'pct_into_quarter', 'pct_into_year'
    ]))
    return dim_date
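
A minimal hedged usage sketch for the function above (assuming an active SparkSession named spark and that the referenced pyspark.sql.functions helpers and the pd_is_holiday_usa UDF are available in scope):

dim_date = generate_dim_date(spark, start_year=2000, number_years_out_from_start=30)
dim_date.where("leap_year_flag = 'Y' AND month_end_flag = 'Y'").select('date_key', 'date', 'day_name').show(5)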
Exemplo n.º 28
0
def process_log_data(spark, input_data, output_data):
    """
    Fetches log data from S3, processes it, and extracts the users, time and songplays tables from it.
    The resulting data frames are written back to S3 under output_data in parquet format.
        
    Parameters:
        spark       : Spark Session
        input_data  : Input json files location in S3 bucket
        output_data : S3 location where the output parquet files are stored

    """

    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    actions_df = df.filter(df.page == 'NextSong')
    actions_df.printSchema()

    # extract columns for users table
    users_table = actions_df.select(actions_df.userId, actions_df.firstName,
                                    actions_df.lastName, actions_df.gender,
                                    actions_df.level).dropDuplicates()

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + "users.parquet")

    # # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    actions_df = actions_df.withColumn("timestamp",
                                       get_timestamp(actions_df.ts))
    print("creating timestamp column...")
    actions_df.printSchema()

    # # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000)))
    actions_df = actions_df.withColumn("datetime", get_datetime(actions_df.ts))
    print("creating datetime column...")
    actions_df.printSchema()

    # extract columns to create time table
    time_table = actions_df.select(
        col('datetime').alias('start_time'),
        hour(col('datetime')).alias('hour'),
        dayofmonth(col('datetime')).alias('day'),
        weekofyear(col('datetime')).alias('week'),
        month(col('datetime')).alias('month'),
        year(col('datetime')).alias('year'),
        date_format(col('datetime'), "u").alias('weekday')).dropDuplicates()

    print("creating time_table...")
    time_table.printSchema()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy(
        "year", "month").mode('overwrite').parquet(output_data + "time_table.parquet")

    # read in song data to use for songplays table
    song_data = input_data + 'song_data/*/*/*/*.json'
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    complete_df = song_df.join(actions_df, song_df.title == actions_df.song,
                               "inner")
    songplays_table = complete_df.select(
        col('datetime').alias('start_time'),
        col('userId').alias('userId'),
        col('level').alias('level'),
        col('song_id').alias('songId'),
        col('artist_id').alias('artistId'),
        col('sessionId').alias('sessionId'),
        col('location').alias('location'),
        col('userAgent').alias('user_agent'),
        year(col('datetime')).alias('year'),
        month(col('datetime')).alias('month'),
    ).withColumn('songplay_id', monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy(
        "year", "month").mode('overwrite').parquet(output_data + "songplays_table.parquet")
Exemplo n.º 29
0
def process_log_data(spark, input_data, output_data):
    """
    Retrieves log data from S3, processes it, and writes the resulting tables back to S3.
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_columns = [
        "userId as user_id", "firstName as first_name",
        "lastName as last_name", "gender", "level"
    ]
    users_table = df.selectExpr(users_columns).dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ts: datetime.fromtimestamp(int(ts) / 1000), TimestampType())
    df = df.withColumn("start_time", get_timestamp('ts'))

    #songs_table.write.partitionBy("year", "artist_id").parquet(output_data + 'songs/')
    # extract columns to create time table
    time_table = df.select("start_time").dropDuplicates() \
        .withColumn("hour", hour(col("start_time"))).withColumn("day", dayofmonth(col("start_time"))) \
        .withColumn("week", weekofyear(col("start_time"))).withColumn("month", month(col("start_time"))) \
        .withColumn("year", year(col("start_time"))).withColumn("weekday", date_format(col("start_time"), 'E'))

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "time/")

    # read in song data to use for songplays table
    df_song = spark.read.parquet(output_data + 'songs/*/*/*')
    df_artists = spark.read.parquet(output_data + 'artists/*')

    songs_logs = df.join(df_song, (df.song == df_song.title))
    artists_songs_logs = songs_logs.join(
        df_artists, (songs_logs.artist == df_artists.name))

    # extract columns from joined song and log datasets to create songplays table
    songplays = artists_songs_logs.join(
        time_table, artists_songs_logs.start_time == time_table.start_time,
        'left').drop(artists_songs_logs.year).drop(time_table.start_time)
    songplays_table = songplays.select(
        col('start_time').alias('start_time'),
        col('userId').alias('user_id'),
        col('level').alias('level'),
        col('song_id').alias('song_id'),
        col('artist_id').alias('artist_id'),
        col('sessionId').alias('session_id'),
        col('location').alias('location'),
        col('userAgent').alias('user_agent'),
        col('year').alias('year'),
        col('month').alias('month'),
    ).repartition("year", "month")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(output_data +
                                                               'songplays/')
Exemplo n.º 30
0
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    kerberos = 'public2.alerts.ztf' in args.servers
    df = connect_to_kafka(servers=args.servers,
                          topic=args.topic,
                          startingoffsets=args.startingoffsets_stream,
                          failondataloss=False,
                          kerberos=kerberos)

    # Get Schema of alerts
    alert_schema, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    if '134.158.' in args.servers or 'localhost' in args.servers:
        # using custom from_avro (not available for Spark 2.4.x)
        # it will be available from Spark 3.0 though
        df_decoded = df.select(
            [from_avro(df["value"], alert_schema_json).alias("decoded")])
    elif 'public2.alerts.ztf' in args.servers:
        # Decode on-the-fly using fastavro
        f = udf(lambda x: fastavro.reader(io.BytesIO(x)).next(), alert_schema)
        df_decoded = df.select([f(df['value']).alias("decoded")])
    else:
        msg = "Data source {} is not known - a decoder must be set".format(
            args.servers)
        logger.warn(msg)
        spark.stop()

    # Flatten the data columns to match the incoming alert data schema
    cnames = df_decoded.columns
    cnames[cnames.index('decoded')] = 'decoded.*'
    df_decoded = df_decoded.selectExpr(cnames)

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("timestamp", jd_to_datetime(df_decoded['candidate.jd']))\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("year", "month", "day")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
    else:
        countquery = countquery_tmp.start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
Exemplo n.º 31
0
def process_sdf(sdf_drive, sdf_vehicle):
  sdf_join_drive_vehicle = sdf_drive.alias("drive").join(sdf_vehicle.alias("vehicle"), ["vehicle_id"])
  sdf_join_drive_vehicle_fillna = sdf_join_drive_vehicle.fillna(0)
  sdf_drive_start_of_week = sdf_join_drive_vehicle_fillna.withColumn("week_start_date", \
                                                                     (F.date_sub(F.next_day(
                                                                       F.from_utc_timestamp(F.col("datetime"),
                                                                                            "America/New_York"),
                                                                       'monday'), 7)))
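  # date_sub(next_day(d, 'monday'), 7) yields the Monday of the week containing d:
  # next_day returns the first Monday strictly after d, so stepping back 7 days lands on or before d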

  sdf_Active_horsepower =  sdf_drive_start_of_week.withColumn("Active_horsepower" ,  (F.col("eng_load") / 255) \
                                                              * (F.col("max_torque") * F.col("rpm"))  / 5252)

  # Horsepower utilization – Active horsepower / Max Horsepower
  sdf_Horsepower_utilization = sdf_Active_horsepower.withColumn("Horsepower_utilization", F.col("Active_horsepower") / F.col("max_horsepower"))

  # # Torque Utilization - calculated as Engine load/ 255
  sdf_Torque_Utilization = sdf_Horsepower_utilization.withColumn("Torque_Utilization", F.col("eng_load") / 255)

  # # RPM Utilization – RPM / Maximum horsepower rpm
  sdf_RPM_Utilization = sdf_Torque_Utilization.withColumn("RPM_Utilization", F.col("rpm") / F.col("max_horsepower_rpm") )

  sdf_engine_features = sdf_RPM_Utilization.withColumn("ft_torque_util_60pct_s",
                                                       F.when((F.col("Torque_Utilization") >= 0.6) \
                                                              & (F.col("Torque_Utilization") < 0.7), \
                                                              F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_torque_util_70pct_s", F.when((F.col("Torque_Utilization") >= 0.7) \
                                                 & (F.col("Torque_Utilization") < 0.8), \
                                                 F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_torque_util_80pct_s", F.when((F.col("Torque_Utilization") >= 0.8) \
                                                 & (F.col("Torque_Utilization") < 0.9), \
                                                 F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_torque_util_90pct_s", F.when((F.col("Torque_Utilization") >= 0.9) \
                                                 & (F.col("Torque_Utilization") < 1), \
                                                 F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_horsepower_util_50pct_s", F.when((F.col("Horsepower_utilization") >= 0.5) \
                                                     & (F.col("Horsepower_utilization") < 0.6), \
                                                     F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_horsepower_util_60pct_s", F.when((F.col("Horsepower_utilization") >= 0.6) \
                                                     & (F.col("Horsepower_utilization") < 0.7), \
                                                     F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_horsepower_util_70pct_s", F.when((F.col("Horsepower_utilization") >= 0.7) \
                                                     & (F.col("Horsepower_utilization") < 0.8), \
                                                     F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_horsepower_util_80pct_s", F.when((F.col("Horsepower_utilization") >= 0.8) \
                                                     & (F.col("Horsepower_utilization") < 0.9), \
                                                     F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_rpm_util_50pct_s", F.when((F.col("RPM_Utilization") >= 0.5) \
                                              & (F.col("RPM_Utilization") < 0.6), \
                                              F.lit(1)).otherwise(F.lit(0))) \
    .withColumn("ft_rpm_util_60pct_s", F.when((F.col("RPM_Utilization") >= 0.6) \
                                              & (F.col("RPM_Utilization") < 0.7), \
                                              F.lit(1)).otherwise(F.lit(0)))

  sdf_engine_features_total = sdf_engine_features.select("vehicle_id", "week_start_date", "datetime",
                                                          "ft_torque_util_60pct_s", "ft_torque_util_70pct_s",
                                                          "ft_torque_util_80pct_s", "ft_torque_util_90pct_s",
                                                          "ft_horsepower_util_50pct_s",
                                                          "ft_horsepower_util_60pct_s",
                                                          "ft_horsepower_util_70pct_s",
                                                          "ft_horsepower_util_80pct_s",
                                                          "ft_rpm_util_50pct_s", "ft_rpm_util_60pct_s")

  sdf_sdf_engine_features_agg = sdf_engine_features_total.groupBy("vehicle_id", "week_start_date") \
    .agg(F.sum("ft_torque_util_60pct_s").alias("ft_torque_util_60pct_s"), \
         F.sum("ft_torque_util_70pct_s").alias("ft_torque_util_70pct_s"), \
         F.sum("ft_torque_util_80pct_s").alias("ft_torque_util_80pct_s"), \
         F.sum("ft_torque_util_90pct_s").alias("ft_torque_util_90pct_s"), \
         F.sum("ft_horsepower_util_50pct_s").alias("ft_horsepower_util_50pct_s"), \
         F.min("ft_horsepower_util_60pct_s").alias("ft_horsepower_util_60pct_s"), \
         F.min("ft_horsepower_util_70pct_s").alias("ft_horsepower_util_70pct_s"), \
         F.min("ft_horsepower_util_80pct_s").alias("ft_horsepower_util_80pct_s"), \
         F.min("ft_rpm_util_50pct_s").alias("ft_rpm_util_50pct_s"), \
         F.min("ft_rpm_util_60pct_s").alias("ft_rpm_util_60pct_s"), )

  sdf_sdf_engine_features_final = sdf_sdf_engine_features_agg.select("vehicle_id", "week_start_date", \
                                                                     "ft_torque_util_60pct_s", "ft_torque_util_70pct_s",
                                                                     "ft_torque_util_80pct_s", "ft_torque_util_90pct_s", \
                                                                     "ft_horsepower_util_50pct_s",
                                                                     "ft_horsepower_util_60pct_s",
                                                                     "ft_horsepower_util_70pct_s",
                                                                     "ft_horsepower_util_80pct_s", \
                                                                     "ft_rpm_util_50pct_s", "ft_rpm_util_60pct_s")

  sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.sort(F.col("vehicle_id"), F.col("week_start_date"))

  sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.withColumn("week_start_date",
                                                                           F.date_format(F.col("week_start_date"),
                                                                                         "yyyy-MM-dd"))

  sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.fillna(0)

  return sdf_sdf_engine_features_final
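
A minimal hedged usage sketch (sdf_drive and sdf_vehicle are assumed to be Spark DataFrames exposing the vehicle_id, datetime, eng_load, rpm and max_* columns referenced above):

weekly_engine_features = process_sdf(sdf_drive, sdf_vehicle)
weekly_engine_features.show(5, truncate=False)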
Exemplo n.º 32
0
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import date_format, unix_timestamp, row_number
from pyspark.sql.functions import monotonically_increasing_id
import pandas as pd

sc = SparkContext('local')
spark = SparkSession(sc)

#Original Data
df = spark.read.csv("data.csv", header=True, inferSchema=True)
df.show()

#Changing Date format and Renaming a column
df = df.select(
    'Input Data',
    date_format(
        unix_timestamp("Date", "yyyy-MM-dd").cast("timestamp"),
        "dd-MM-yyyy").alias('Date'), 'Type',
    'Value').withColumnRenamed('Input Data', 'Output Data')
df.show()

#Displaying selected columns
df1 = df.select('Output Data', 'Date', 'Type')
df1.show()

#Displaying selected columns
df2 = df.select('Output Data', 'Date', 'Value')
df2.show()

#Inserting a new column with the string value 'Type' in df1
df1 = df1.withColumn('Variable', F.lit('Type'))
df1.show()
Exemplo n.º 33
0
def process_log_data(spark, input_data, output_data):
    '''
        Processes log_data files from an S3 bucket, extracting the users, time and songplays tables. Outputs a compressed parquet file for each table.
        spark      : Spark session
        input_data : S3 path for log_data
        output_data: S3 bucket path where tables will be stored in parquet format
    '''
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'
    # log_data = input_data + 'log_data/'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong').filter(df.userId.isNotNull())

    # extract columns for users table
    users_table = df.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'), 'gender', 'level').distinct()

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users.parquet')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ts: str(int(int(ts) / 1000)))
    df = df.withColumn('timestamp', get_timestamp(col('ts')))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda dt: str(datetime.fromtimestamp(int(dt) / 1000)))
    df = df.withColumn('datetime', get_datetime(col('ts')))

    # extract columns to create time table
    time_table = df.select('timestamp',
                           hour('datetime').alias('hour'),
                           dayofmonth('datetime').alias('day'),
                           weekofyear('datetime').alias('week'),
                           month('datetime').alias('month'),
                           year('datetime').alias('year'),
                           date_format('datetime', 'E').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'time.parquet')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'songs.parquet')

    # extract columns from joined song and log datasets to create songplays table
    ts_Format = 'yyyy/MM/dd HH:mm:ss z'
    songplays_table = song_df.join(df, song_df.title == df.song)\
    .withColumn('songplay_id', monotonically_increasing_id())\
    .withColumn('start_time', to_timestamp(date_format((col('ts') / 1000)\
                                                       .cast(dataType = TimestampType()), ts_Format), ts_Format))\
    .select(
        'songplay_id',
        'start_time',
        'level',
        'song_id',
        'artist_id',
        'userAgent',
        'location',
        col('userId').alias('user_id'),
        col('sessionId').alias('session_id'),
        month(col('start_time')).alias('month'),
        year(col('start_time')).alias('year')
    )

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'songplays.parquet')
    print('Ok processing log_data')
    # initialize spark-session
    spark = initialize_spark_session()

    JDBC_URL = args.jdbc_uri
    TABLE_SINK = args.table_sink
    OUTPUT_PATH = args.output + "/dimension_date/"
    TMP_DIR = args.tmp

    df = spark.sql("""SELECT * FROM stag_immigration""")

    # get date range
    df = df.select("arrival_date").distinct().orderBy("arrival_date")

    # generate columns
    df = df.withColumn("year", F.date_format("arrival_date", "y")) \
        .withColumn("month", F.date_format("arrival_date", "M")) \
        .withColumn("day", F.date_format("arrival_date", "d")) \
        .withColumn("month_string", F.date_format("arrival_date", "MMM")) \
        .withColumn("day_string", F.date_format("arrival_date", "E")) \
        .withColumn("week", F.date_format("arrival_date", "w")) \
        .withColumn("day_of_year", F.dayofyear("arrival_date")) \
        .withColumn("day_of_week", F.dayofweek("arrival_date")) \
        .withColumn("quarter", F.quarter("arrival_date"))

    # create unique identifier
    df = df.withColumn("id", F.monotonically_increasing_id() + 1)

    # select relevant columns
    df = df.select("id", "arrival_date", "year", "month", "day",
                   "month_string", "day_string", "week", "day_of_year",
Exemplo n.º 35
0
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# datetime:2020/3/27 9:17
from delta.tables import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()

df=spark.range(5)\
     .withColumn("date",f.date_format(f.current_timestamp(),"yyyyMMdd HHmmss"))\
     .withColumn("value",f.lit("1"))

path="e://test//delta//test"


# create a Delta table
# df.write.format("delta").save(path)
# create a partitioned Delta table
df.write.format("delta").partitionBy("date").save(path)


# read the Delta table
spark.read.format("delta").load(path)
# a version or timestamp can be specified when reading; the latest version is used by default
spark.read.format("delta").option("timestampAsOf", '2020-03-27').load(path)
spark.read.format("delta").option("versionAsOf", 1).load(path)

# the table's version history can be inspected through the Delta API
DeltaTable.forPath(spark, path).history().show()
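
For completeness, a minimal hedged sketch of appending further rows to the same partitioned Delta table (reusing the df and path defined above; the new value is illustrative):

# append additional rows; Delta enforces that the schema matches the existing table
df.withColumn("value", f.lit("2")).write.format("delta").mode("append").save(path)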
# COMMAND ----------

spark.sql("""
  SELECT *
  FROM customer_purchases
  ORDER BY `sum(total_cost)` DESC
  """)\
  .show(5)


# COMMAND ----------

from pyspark.sql.functions import date_format, col
preppedDataFrame = staticDataFrame\
  .na.fill(0)\
  .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))\
  .coalesce(5)


# COMMAND ----------

trainDataFrame = preppedDataFrame\
  .where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame\
  .where("InvoiceDate >= '2011-07-01'")


# COMMAND ----------

from pyspark.ml.feature import StringIndexer
indexer = StringIndexer()\
 logger = sc._jvm.org.apache.log4j
 logger.LogManager.getRootLogger().setLevel(logger.Level.FATAL)
 
 sqlContext = SQLContext(sc)
 u_employee_trans = udf( employee_trans )
 
 # Load the data
 url_String='jdbc:oracle:thin:apps/[email protected]:1524/TESTDEV'
 if (db_location=='PROD'):
     url_String='jdbc:oracle:thin:apps/[email protected]:1524/PROD'
     
 tblname="(select * from (select v.legal_entity_id as org_id, org.name as org_name, v.dept_id as dept_id, dept.name as dept_name, v.emp_id as emp_id, emp.emp_name, emp.employee_number as emp_number, v.sub_hours, f.ferial_name, v.leave_date from narl_leave_detail_info_v v, narl_leave_main m, narl_ferial_header f, narl_login_emp_info_hist_v emp, HR_ALL_ORGANIZATION_UNITS org, HR_ALL_ORGANIZATION_UNITS dept where v.leave_id=m.leave_id and m.ferial_code=f.ferial_code and v.emp_id=emp.employee_id and v.legal_entity_id=org.ORGANIZATION_ID and v.dept_id=dept.ORGANIZATION_ID and v.status in ('APPROVE','INPROCESS','PROCESSING','FREE') and TO_CHAR(v.leave_date,'YYYY')='%s') ORDER BY org_id, dept_id, emp_id) tmp" %strYear
 
 df= sqlContext.read.format('jdbc').options(url=url_String, dbtable=tblname).load() 
 # columns read back from Oracle are returned in upper case, so rename and cast them below
 df = df.select(df.ORG_ID.cast('int').alias('org_id'),df.ORG_NAME.alias('org_name'), df.DEPT_ID.cast('int').alias('dept_id'), df.DEPT_NAME.alias('dept_name'), df.EMP_ID.cast('int').alias('emp_id'), df.EMP_NAME.alias('emp_name'),df.EMP_NUMBER.alias('emp_number'),date_format(df.LEAVE_DATE, 'E').alias('name_day'),concat(lit('Day_'),date_format(df.LEAVE_DATE,'dd')).alias('day_month') ,df.FERIAL_NAME.alias('ferial_name'), df.SUB_HOURS.cast('int').alias('sub_hours'))
 
 df = df.withColumn( 'employee_num', u_employee_trans('emp_number') ).drop('emp_number')
 df = df.withColumnRenamed("employee_num", "emp_number")
 df.cache()
 
 #Load org by WEEK data--start
 print('start Load org by WEEK data>>', datetime.datetime.now())
 df_groupBy_org_name_day = df.select('org_id','org_name','name_day','ferial_name', 'sub_hours').groupBy('org_id','org_name','name_day').pivot("ferial_name",['特別休假','加班或假日出差轉補休','生理假','傷病假','婚假', '家庭照顧假','事假', '產檢假','陪產假','產假','喪假','國內公假','國外公假','公傷病假','安胎假']).sum('sub_hours')
 
 df_groupBy_org_name_day=df_groupBy_org_name_day.fillna(0)
 
 df_groupBy_org_name_day = df_groupBy_org_name_day.select('org_id','org_name','name_day', '特別休假','加班或假日出差轉補休','生理假','傷病假','婚假','家庭照顧假', '事假','產檢假', '陪產假','產假','喪假','國內公假','國外公假','公傷病假', '安胎假').groupBy('org_id','org_name').pivot("name_day", ['Mon', 'Tue', 'Wed','Thu', 'Fri', 'Sat','Sun']).sum('特別休假', '加班或假日出差轉補休','生理假','傷病假','婚假','家庭照顧假','事假','產檢假', '陪產假','產假','喪假','國內公假','國外公假','公傷病假', '安胎假')
 
 df_groupBy_org_name_day=df_groupBy_org_name_day.fillna(0)
 df_groupBy_org_name_day=df_groupBy_org_name_day.orderBy(df_groupBy_org_name_day.org_id)