def add_solar_features(df):
    return \
        (df
         .withColumn('declination_angle',
                     radians(-23.45
                             * cos(((2 * pi)/365) * (dayofyear('date') + 10))))
         .withColumn('diff_local_time_UTC', timezone_from_date('date'))
         .withColumn('d', (2 * pi * dayofyear('date')) / 365)
         .withColumn('equation_of_time',
                     -7.655 * sin(col('d'))
                     + 9.873 * sin(2 * col('d') + 3.588))
         .drop('d')
         .withColumn('time_correction',
                     4 * (col('loc_long') - (15 * col('diff_local_time_UTC')))
                     + col('equation_of_time'))
         .withColumn('local_solar_hour',
                     col('hour') + 0.5 + col('time_correction') / 60)
         .withColumn('hour_angle', 0.2618 * (col('local_solar_hour') - 12))
         .drop('diff_local_time_UTC', 'equation_of_time', 'time_correction',
               'local_solar_hour')
         .withColumn('solar_elevation',
                     degrees(asin(sin('declination_angle')
                                  * sin(radians('loc_lat'))
                                  + cos('declination_angle')
                                  * cos(radians('loc_lat'))
                                  * cos('hour_angle'))))
         .drop('declination_angle', 'hour_angle'))
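A minimal usage sketch for the snippet above. It assumes a frame with `date`, `hour`, `loc_lat` and `loc_long` columns and that the pyspark.sql.functions / math imports used by add_solar_features are in scope; `timezone_from_date` is a project-specific helper that is not shown, so the stub below is purely hypothetical (a fixed UTC offset).

from math import pi
from pyspark.sql import SparkSession
from pyspark.sql.functions import (radians, degrees, sin, cos, asin,
                                   col, dayofyear, lit)

spark = SparkSession.builder.getOrCreate()

def timezone_from_date(date_col):
    # Hypothetical stand-in for the real helper: pretend every row is UTC+1.
    return lit(1.0)

sample = spark.createDataFrame(
    [("2021-06-21", 12, 48.85, 2.35)],
    ["date", "hour", "loc_lat", "loc_long"])
add_solar_features(sample).select("solar_elevation").show()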
Example #2
def add_solar_features(df):
    return (df
            .withColumn("declination_angle",
                        radians(-23.45 * cos(((2 * pi) / 365) * (dayofyear("date") + 10))))
            .withColumn("diff_local_time_UTC", timezone_from_date("date"))
            .withColumn("d", (2 * pi * dayofyear("date")) / 365)
            .withColumn("equation_of_time",
                        -7.655 * sin(col("d")) + 9.873 * sin(2 * col("d") + 3.588))
            .drop("d")
            .withColumn("time_correction",
                        4 * (col("loc_long") - (15 * col("diff_local_time_UTC")))
                        + col("equation_of_time"))
            .withColumn("local_solar_hour",
                        col("hour") + 0.5 + col("time_correction") / 60)
            .withColumn("hour_angle", 0.2618 * (col("local_solar_hour") - 12))
            .drop("diff_local_time_UTC", "equation_of_time", "time_correction",
                  "local_solar_hour")
            .withColumn("solar_elevation",
                        degrees(asin(sin("declination_angle") * sin(radians("loc_lat"))
                                     + cos("declination_angle") * cos(radians("loc_lat"))
                                     * cos("hour_angle"))))
            .drop("declination_angle", "hour_angle"))
Example #3
def createDimDate(df):
    '''
    Creates date dimensional table from DateTime field in upstream dataframe
    :param df:
    :return: Date Dimensional Dataframe
    '''
    df = df.withColumn("rawKey", F.col('rawKey'))
    df = df.withColumn("year", F.year(F.col('DateTime')))
    df = df.withColumn("month", F.month(F.col('DateTime')))
    df = df.withColumn("dayofmonth", F.dayofmonth(F.col('DateTime')))
    df = df.withColumn("dayofweek", F.dayofweek(F.col('DateTime')))
    df = df.withColumn("dayofyear", F.dayofyear(F.col('DateTime')))
    df = df.withColumn("hour", F.hour(F.col('DateTime')))
    df = df.withColumn("minute", F.minute(F.col('DateTime')))
    df = df.withColumn("dateMinute", F.date_format(F.col("DateTime"), "yyyyMMddHHmm"))
    df = df.withColumn("quarter", F.quarter(F.col('DateTime')))
    df = df.withColumn("date", F.to_date(F.col('DateTime')))
    df.createOrReplaceTempView('tempDimDateTable')
    dimDateDF = spark.sql("""
        SELECT rawKey, dateMinute, dateTime, date, year, month, dayofmonth,
               dayofweek, dayofyear, hour, minute, quarter
        FROM tempDimDateTable
        GROUP BY rawKey, dateMinute, dateTime, date, year, month, dayofmonth,
                 dayofweek, dayofyear, hour, minute, quarter
        ORDER BY dateMinute ASC
    """)

    # Generating dateKey field
    dimDateDF = dimDateDF.withColumn('dateKey', F.monotonically_increasing_id() + 1)
    # Creating dataframe including date field which will help to generate Fact table
    factHelperDateDF = dimDateDF.select(F.col('rawKey'), F.col('dateKey'), F.col('dateMinute'))
    # Dropping unnecessary rawKey field
    dimDateDF = dimDateDF.drop(F.col('rawKey'))
    return dimDateDF, factHelperDateDF
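The GROUP BY in the query above performs no aggregation; it only deduplicates rows before the key is generated. A rough DataFrame-API sketch of the same step (same column names assumed, no temp view required):

dim_cols = ["rawKey", "dateMinute", "DateTime", "date", "year", "month",
            "dayofmonth", "dayofweek", "dayofyear", "hour", "minute", "quarter"]
dimDateDF = df.select(*dim_cols).dropDuplicates().orderBy("dateMinute")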
Example #4
    def _extract_dim_date(self, imgDF: DataFrame, output_path: str):
        print('Extracting dim_date from immigration data...')

        dateCols = [col for col in imgDF.columns if col.endswith('_date')]
        dates: DataFrame = None
        for dateCol in dateCols:
            dt = (imgDF
                  .where(F.col(dateCol).isNotNull())
                  .select(
                      F.col(dateCol).alias('date'),
                      F.year(F.col(dateCol)).cast('smallint').alias('year'),
                      F.quarter(F.col(dateCol)).cast('smallint').alias('quarter'),
                      F.month(F.col(dateCol)).cast('smallint').alias('month'),
                      F.dayofweek(F.col(dateCol)).cast('smallint').alias('day_of_week'),
                      F.dayofmonth(F.col(dateCol)).cast('smallint').alias('day_of_month'),
                      F.dayofyear(F.col(dateCol)).cast('smallint').alias('day_of_year'),
                      F.weekofyear(F.col(dateCol)).cast('smallint').alias('week_of_year')))
            if dates:
                dates = dates.union(dt).distinct()
            else:
                dates = dt

        print('Saving dim_date')
        self._write_data(dates, output_path, 'date')
        print('Finished saving dim_date')

        print('Finished extracting dim_date from immigration data.')
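The loop above unions one sub-frame per *_date column and deduplicates on every iteration. A compact alternative sketch (assuming dateCols is non-empty) reduces over the unions and derives the calendar parts once at the end:

from functools import reduce
import pyspark.sql.functions as F

per_col = [imgDF.where(F.col(c).isNotNull()).select(F.col(c).alias('date'))
           for c in dateCols]
dates = (reduce(lambda a, b: a.union(b), per_col)
         .distinct()
         .withColumn('year', F.year('date').cast('smallint'))
         .withColumn('quarter', F.quarter('date').cast('smallint'))
         .withColumn('month', F.month('date').cast('smallint'))
         .withColumn('day_of_year', F.dayofyear('date').cast('smallint')))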
Example #5
def process_dim_time(spark):
    """
    Creates the dimension time and stores it to stage 2.

    Creates a time sequence with a daily interval.

    spark: spark session
    """

    #create a daily time sequence starting at 2019-10-01 ending now()
    #src: https://stackoverflow.com/questions/43141671/
    #     sparksql-on-pyspark-how-to-generate-time-series
    df_dim_time = spark.sql("""SELECT sequence(
                                    to_date('2019-10-01'),
                                    now(), 
                            interval 1 day) as date""")\
                        .withColumn("date", explode(col("date")))

    #create additional columns e.g. weekofyear
    df_dim_time = df_dim_time.withColumn("month",F.month("date"))\
           .withColumn("year",F.year("date"))\
           .withColumn("dayofmonth",F.dayofmonth("date"))\
           .withColumn("dayofyear",F.dayofyear("date"))\
           .withColumn("weekofyear",F.weekofyear("date"))

    #write as dimTime to stage2
    df_dim_time.write.mode('overwrite').parquet(folder_s2 + 'dimTime.parquet')

    return df_dim_time
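The same daily sequence can be built without a SQL string. A sketch using the DataFrame API (assumes Spark 2.4+, where F.sequence is available, and uses current_date() instead of now() as the end bound):

from pyspark.sql import functions as F

df_dim_time = (spark.range(1)
                    .select(F.explode(F.sequence(F.to_date(F.lit('2019-10-01')),
                                                 F.current_date(),
                                                 F.expr('interval 1 day')))
                            .alias('date')))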
 def get_dayofyear_avg_v(self):
     data = self.data
     day = data.withColumn('day', functions.dayofyear(data['timestamp']))
     day_avg = day.groupby('day').avg('Volume BTC')
     result = day_avg.select('day', 'avg(Volume BTC)').orderBy('day')
     result.cache()
     return get_label_value(result.collect())
Example #7
def generate_calendar(start_date, end_date):
    try:

        import pandas as pd
        df = pd.DataFrame({'DATE': pd.date_range(start_date, end_date)})

        from pyspark.sql.functions import year, month, date_format, expr, dayofyear, dayofmonth

        df_spark = spark.createDataFrame(df)

        df_spark = df_spark.withColumn('DATE_ID', expr("REPLACE(DATE,'-','') DATE"))

        df_spark = df_spark.withColumn('DATE_ID', expr("CAST(REPLACE(CAST(DATE AS DATE),'-','') AS INTEGER)")) \
            .withColumn('DATE_COL', expr("CAST(DATE AS DATE)")) \
            .withColumn('YEAR', year('DATE')) \
            .withColumn('MONTH', month('DATE')) \
            .withColumn('DAY', dayofmonth('DATE')) \
            .withColumn('DAY_OF_MONTH', dayofmonth('DATE')) \
            .withColumn('DAY_OF_YEAR', dayofyear('DATE')) \
            .withColumn('DAY_NAME', date_format('DATE', 'E')) \
            .drop('DATE')

        return df_spark
    except Exception as e:
        logging.error('Failed to generate CALENDAR data')
        raise Exception(f'Failed to generate CALENDAR data,{e}')
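A hypothetical call for illustration; note the function references a module-level `spark` session rather than receiving one as an argument.

calendar_df = generate_calendar('2024-01-01', '2024-01-07')
calendar_df.show()
# One row per day with DATE_ID (e.g. 20240101), DATE_COL, YEAR, MONTH, DAY,
# DAY_OF_MONTH, DAY_OF_YEAR and DAY_NAME ('Mon', 'Tue', ...).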
Example #8
    def transform_i94_data(self):
        #Read the Immigration dataset in SAS format 
        i94_df = self.spark.read.format('com.github.saurfang.sas.spark').load(self.load_path + config['LOAD']['I94_DATA'] )
        
        #drop the columns that are not required for analysis
        i94_drop_cols =  ['count','visapost','occup','matflag','biryear','insnum']
        i94_df = i94_df.drop(*i94_drop_cols)
        
        #convert the numeric columns to Integer datatype
        int_cols = ['cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'arrdate', 'i94mode','depdate', 'i94bir', 'i94visa']
        i94_df = fn.cast_type(i94_df,dict(zip(int_cols,len(int_cols)*[IntegerType()])))
        
        #convert the SAS date format to String format
        date_cols = ['arrdate','depdate']
        for c in date_cols:
            i94_df = i94_df.withColumn(c,fn.sas_to_date_udf(i94_df[c]))

        #Create dates dataframe to be stored as a separate file
        arrdate  = i94_df.select('arrdate').distinct()
        depdate  = i94_df.select('depdate').distinct()
        date_df = arrdate.union(depdate)
        date_df = date_df.withColumnRenamed('arrdate','date')
        date_df = date_df.withColumn('year',F.year(date_df.date))   \
                         .withColumn('month',F.month(date_df.date)) \
                         .withColumn('day',F.dayofmonth(date_df.date))  \
                         .withColumn('dayofweek',F.dayofweek(date_df.date))  \
                         .withColumn('dayofyear',F.dayofyear(date_df.date))
        
        #Write the dates dataframe to a file in S3 in parquet format
        date_df.write.mode('overwrite').parquet(self.save_path + date_save_path)

        #Write the I94 dataframe to a file in S3 in parquet format partioned by year and month   
        i94_df.write.partitionBy('i94yr','i94mon').mode('overwrite').parquet(self.save_path + immigration_save_path)
Example #9
def main_A(inputs):
    data = spark.read.option('encoding', 'UTF-8').csv(inputs,
                                                      schema=tmax_schema)
    ################ FEATURE ENGINEERING: add yesterday tmax #####################
    if USE_YTD_TEMP_FEATURE:
        syntax = """SELECT today.latitude,today.longitude,today.elevation,today.date,
                           today.tmax, yesterday.tmax AS yesterday_tmax
                    FROM __THIS__ as today
                    INNER JOIN __THIS__ as yesterday
                    ON date_sub(today.date, 1) = yesterday.date
                       AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        df = sql_trans.transform(data)
    #############################################################################
    df = data.withColumn('day_of_year', fn.dayofyear('date'))
    df = df.withColumn('year', fn.year('date'))

    df_long_lat = df[['station', 'longitude', 'latitude', 'tmax',
                      'year']].toPandas()
    count_year = df_long_lat['year'].value_counts().to_dict()

    # SELECT YEAR and DURATION
    YEAR_SELECTED = 2000
    YEAR_DURATION = 20
    df_long_lat = df_long_lat.loc[(df_long_lat['year'] > YEAR_SELECTED) & (
        df_long_lat['year'] < YEAR_SELECTED + YEAR_DURATION)]

    # UNCLUSTER plot by finding avg temperature (groupby same station and year)
    df_long_lat['avg_temp'] = df_long_lat.groupby(['station', 'year'
                                                   ])['tmax'].transform('mean')
    df_long_lat.drop_duplicates(subset=['station', 'year'], inplace=True)
    print(df_long_lat)

    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    geometry = [
        Point(xy)
        for xy in zip(df_long_lat['longitude'], df_long_lat['latitude'])
    ]

    df_long_lat = df_long_lat.drop(['longitude', 'latitude'], axis=1)
    crs = {'init': 'epsg:4326'}
    gdf = GeoDataFrame(df_long_lat, crs=crs, geometry=geometry)

    base = world.plot(color='white', edgecolor='black', figsize=(20, 12))
    gdf.plot(column='avg_temp',
             ax=base,
             marker='o',
             cmap='jet',
             markersize=15,
             legend=True,
             legend_kwds={
                 'label': "Temperature in Celcius",
                 'orientation': "horizontal"
             })
    plt.title('Distribution of Temperature between ' + str(YEAR_SELECTED) +
              " and " + str(YEAR_SELECTED + YEAR_DURATION))
    plt.savefig(inputs + "_" + str(YEAR_SELECTED) + "-" +
                str(YEAR_SELECTED + YEAR_DURATION))
def process_trip_data(spark, input_data, output_data):
	trip_data = os.path.join(input_data, 'trip_data/*.csv')

	df = spark.read.csv(trip_data, header=True)

	df = df.where(col("start_station_id").isNotNull()) \
		.where(col("end_station_id").isNotNull()) \
		.select(['duration_sec', 'start_time', 'end_time', 'start_station_id', 'end_station_id', 'bike_id', 'user_type'])

	df.show(10)

	# Create start_time_table
	start_time_table = df.select(["start_time"]) \
            .withColumn("start_year", year("start_time")) \
            .withColumn("start_month", month("start_time")) \
            .withColumn("start_dayofyear", dayofyear("start_time")) \
            .withColumn("start_dayofmonth", dayofmonth("start_time")) \
            .withColumn("start_week", weekofyear("start_time")) \
            .withColumn("start_dayofweek", dayofweek("start_time")) \
            .withColumn("start_hour", hour("start_time"))

	start_time_table.show(5)

	# Create end_time_table
	end_time_table = df.select(["end_time"]) \
            .withColumn("end_year", year("end_time")) \
            .withColumn("end_month", month("end_time")) \
            .withColumn("end_dayofyear", dayofyear("end_time")) \
            .withColumn("end_dayofmonth", dayofmonth("end_time")) \
            .withColumn("end_week", weekofyear("end_time")) \
            .withColumn("end_dayofweek", dayofweek("end_time")) \
            .withColumn("end_hour", hour("end_time"))

	end_time_table.show(5)   
	
	# Write trip table
	df.write.partitionBy(['start_station_id']).parquet(os.path.join(output_data, 'trips'), 'overwrite')     

	# Write start time table
	start_time_table.write.partitionBy(['start_year', 'start_month']).parquet(os.path.join(output_data, 'start_time'), 'overwrite')

	# Write end time table
	end_time_table.write.partitionBy(['end_year', 'end_month']).parquet(os.path.join(output_data, 'end_time'), 'overwrite')
Example #11
def add_date_features(samples):
    samples = add_cyclic_feature(samples, dayofmonth("date"), "dayofmonth", 31)
    samples = add_cyclic_feature(samples, dayofyear("date"), "dayofyear", 366)

    samples = samples.withColumn("dayofweek", dayofweek("date"))
    samples = samples.withColumn("hour2", col("hour"))
    samples = add_cyclic_feature(samples, col("hour"), "hour", 24)
    samples = samples.withColumnRenamed("hour2", "hour")

    return samples
def add_date_features(samples):
    samples = add_cyclic_feature(samples, dayofmonth('date'), 'dayofmonth', 31)
    samples = add_cyclic_feature(samples, dayofyear('date'), 'dayofyear', 366)

    samples = samples.withColumn('dayofweek', dayofweek('date'))
    samples = samples.withColumn('hour2', col('hour'))
    samples = add_cyclic_feature(samples, col('hour'), 'hour', 24)
    samples = samples.withColumnRenamed('hour2', 'hour')

    return samples
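Both versions of add_date_features rely on an add_cyclic_feature helper that is not shown. A plausible sketch (an assumption, not the original implementation) encodes each value as a sine/cosine pair so that period boundaries such as day 366 -> day 1 stay adjacent in feature space:

import math
from pyspark.sql import functions as F

def add_cyclic_feature(samples, value_col, name, period):
    # Map the raw value onto the unit circle and emit sin/cos components.
    angle = 2 * math.pi * value_col / period
    return (samples
            .withColumn(f'{name}_sin', F.sin(angle))
            .withColumn(f'{name}_cos', F.cos(angle))
            .drop(name))  # drops the raw column if present (hence the hour2 backup above)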
Example #13
 def num_days(self) -> int:
   """
   Number of distinct days in the ezlink.
   """
   if self._num_days:
     return self._num_days
   
   self._num_days = (self.dataframe
                     .select(self.year,
                             F.dayofyear(self.tap_in_time).alias('day'))
                     .distinct()
                     .count())
   return self._num_days
Example #14
  def _transform(self, df):
    input = self.getInputCol()

    df = df.withColumn("dt_day", F.dayofmonth(input))
    df = df.withColumn("dt_hour", F.hour(input))
    df = df.withColumn("dt_minute", F.minute(input))
    df = df.withColumn("dt_second", F.second(input))

    df = df.withColumn("dt_dayofyear", F.dayofyear(input))
    df = df.withColumn("dt_dayofweek", F.dayofweek(input))
    df = df.withColumn("dt_weekofyear", F.weekofyear(input))

    return df
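The _transform above belongs to a custom pyspark.ml stage. A minimal sketch of how such a stage can be declared (the class name and default column below are hypothetical):

from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol
from pyspark.sql import functions as F

class DateTimePartsTransformer(Transformer, HasInputCol):
    def __init__(self, inputCol='timestamp'):
        super().__init__()
        self._set(inputCol=inputCol)

    def _transform(self, df):
        input_col = self.getInputCol()
        return (df.withColumn('dt_day', F.dayofmonth(input_col))
                  .withColumn('dt_dayofyear', F.dayofyear(input_col))
                  .withColumn('dt_weekofyear', F.weekofyear(input_col)))

# e.g. DateTimePartsTransformer(inputCol='DateTime').transform(df)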
Example #15
def main(inputs, model_file):
    data = spark.read.option('encoding', 'UTF-8').csv(inputs,
                                                      schema=tmax_schema)
    ################ FEATURE ENGINEERING: add yesterday tmax #####################
    if USE_YTD_TEMP_FEATURE:
        syntax = """SELECT today.latitude,today.longitude,today.elevation,today.date,
                           today.tmax, yesterday.tmax AS yesterday_tmax
                    FROM __THIS__ as today
                    INNER JOIN __THIS__ as yesterday
                    ON date_sub(today.date, 1) = yesterday.date
                       AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        data = sql_trans.transform(data)
    #############################################################################
    data = data.withColumn('day_of_year', fn.dayofyear('date'))
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    if USE_YTD_TEMP_FEATURE:
        train_feature_assembler = VectorAssembler(inputCols=[
            'yesterday_tmax', 'day_of_year', 'latitude', 'longitude',
            'elevation'
        ],
                                                  outputCol='features')
    else:
        train_feature_assembler = VectorAssembler(
            inputCols=['day_of_year', 'latitude', 'longitude', 'elevation'],
            outputCol='features')

    ############# DIFFERENT ML ALGORITHMS TO BE USED ####################
    # classifier = GeneralizedLinearRegression(featuresCol = 'features', labelCol='tmax' )
    # classifier = GBTRegressor( maxDepth=5,featuresCol = 'features', labelCol='tmax' )
    classifier = RandomForestRegressor(numTrees=7,
                                       maxDepth=8,
                                       featuresCol='features',
                                       labelCol='tmax')
    #####################################################################

    train_pipeline = Pipeline(stages=[train_feature_assembler, classifier])
    weather_model = train_pipeline.fit(train)

    prediction = weather_model.transform(validation)
    # print(prediction.show())
    evaluator = RegressionEvaluator(predictionCol="prediction",
                                    labelCol='tmax',
                                    metricName='r2')  #rmse
    score = evaluator.evaluate(prediction)
    print('Validation score for weather model: %g' % (score, ))

    weather_model.write().overwrite().save(model_file)
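A short sketch of loading the persisted model back for scoring (new_data is a hypothetical DataFrame carrying the same feature columns, including day_of_year):

from pyspark.ml import PipelineModel

weather_model = PipelineModel.load(model_file)
predictions = weather_model.transform(new_data)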
def extract_date(df):
    #df = df.na.fill('NORMAL',subset=['types'])
    df = df.dropna(how='all')
    df = (df
    .withColumn('Yearday', F.dayofyear(F.col("SalDate")))
    .withColumn('Month', F.month(F.col('SalDate')))
    .withColumn('DayofWeek', F.dayofweek(F.col('SalDate')))
    .withColumn('Year', F.year(F.col('SalDate')))
    .withColumn('Quarter', F.quarter(F.col('SalDate')))
    .withColumn('WeekOfYear', F.weekofyear(F.col('SalDate')))
    .withColumn('MonthQuarter', F.when((df['DayofMonth'] <= 8), 0)\
     .otherwise(F.when((df['DayofMonth'] <= 16), 1).otherwise(F.when((df['DayofMonth'] <= 24), 2)\
      .otherwise(3))))
    )
    df = df.cache()
    return df
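The nested when/otherwise above buckets DayofMonth into four groups (1-8, 9-16, 17-24, 25-31). An equivalent arithmetic sketch, assuming the same column name:

df = df.withColumn('MonthQuarter',
                   F.floor((F.col('DayofMonth') - 1) / 8).cast('int'))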
Example #17
    def compare_calories_across_week(self, start_date, days_through_week):
        """
        This method compares calories burned between the start day's week & the week before.
        :param start_date: %Y-%m-%d string, date to base comparison on.
        :param days_through_week: int, days into current week to compare against

        There will always be 7 days of data for the last week, but when the current week is
        in progress, we need to tell the method how many days worth of last week's data to
        compare against so that it'll be a 1:1 comparison.

        mon: 1, tues: 2, weds: 3, thurs:4, fri: 5, sat: 6
        0 is not a valid input, because there are no completed days of
        calories to compare against before a day in that week ends. i.e.,
        there is no total Sunday data to compare against last Sunday until
        it's Monday.

        :return: tuple containing the summed calories for the week of the start_date & the week preceding.
        """
        spark_session = create_spark_session()
        df = get_df_from_db(database=self.config['database'],
                            schema='activity',
                            table='calories',
                            spark_session=spark_session)

        df = df.withColumn('day_of_year', F.dayofyear(df.date))
        start_date_datetime = datetime.datetime.strptime(
            start_date, '%Y-%m-%d')
        first_week_df = df.where(
            df.date.between(
                start_date,
                start_date_datetime + timedelta(days_through_week - 1)))

        next_week_df = df.where(
            df.date.between(
                start_date_datetime + timedelta(days=7), start_date_datetime +
                timedelta(days=7) + timedelta(days=days_through_week - 1)))

        first_week_total = first_week_df.select(F.sum(
            first_week_df.total)).collect()[0][0]
        next_week_total = next_week_df.select(F.sum(
            next_week_df.total)).collect()[0][0]

        print(
            "You had burned {} calories at this point during the week of {}, compared to {} the following week."
            .format(first_week_total, start_date, next_week_total))

        return next_week_total, first_week_total
Example #18
def process_log_data(spark, input_data, output_data):
    """
    Description: This function processes the log data to build the tables we want and saves them to files.

    Arguments:
        spark: the spark session object. 
        input_data: input data path 
        output_data: output data path

    Returns:
    None
    """

    staging_logs = spark.read.json(input_data, schema=staging_events_schema)
    print("create tempTable staging_events")
    staging_logs.registerTempTable("staging_events")

    users_table = spark.sql(select_users)
    users_table.registerTempTable("users")
    print("create users", )
    users_table.write.parquet('./output/users/')
    print_show(users_table)

    time_ = spark.sql(select_time)
    time_table = time_.select('start_time',
                              hour('start_time').alias('hour'),
                              dayofyear('start_time').alias('day'),
                              weekofyear('start_time').alias('week'),
                              month('start_time').alias('month'),
                              year('start_time').alias('year'),
                              dayofweek('start_time').alias('weekday'))
    time_table.write.partitionBy(['year', 'month']).parquet('./output/time/')
    print("create time")
    print_show(time_table)

    songplays_table = spark.sql(select_songplays)
    songplays_table = songplays_table.withColumn(
        'year', year(songplays_table.start_time))
    songplays_table = songplays_table.withColumn(
        'month', month(songplays_table.start_time))

    songplays_table.write.partitionBy(['year',
                                       'month']).parquet('./output/songplays/')
    print("read song_plays")
    print_show(songplays_table)
Example #19
def log_emission_summary(emis):
    # Add ME and AE
    emis = emis\
            .withColumn('trans_p', col('trans_p_me') + col('trans_p_ae'))\
            .withColumn('nox', col('nox_me') + col('nox_ae'))\
            .withColumn('sox', col('sox_me') + col('sox_ae'))\
            .withColumn('co2', col('co2_me') + col('co2_ae'))

    # Log total
    all = emis.agg(
                sum(col('trans_p')).alias('total_trans_p'),
                sum(col('trans_p_me')).alias('total_trans_p_me'),
                sum(col('trans_p_ae')).alias('total_trans_p_ae'),
                sum(col('nox')).alias('total_nox'),
                sum(col('sox')).alias('total_sox'),
                sum(col('co2')).alias('total_co2'),
                sum(col('nox_me')).alias('total_nox_me'),
                sum(col('sox_me')).alias('total_sox_me'),
                sum(col('co2_me')).alias('total_co2_me'),
                sum(col('nox_ae')).alias('total_nox_ae'),
                sum(col('sox_ae')).alias('total_sox_ae'),
                sum(col('co2_ae')).alias('total_co2_ae')).toPandas()
    mlflow.log_metrics(all.iloc[0].to_dict())

    # Generate time features
    emis = emis.withColumn('time', emis.time.cast(dataType=TimestampType()))
    emis = emis\
        .withColumn('day', dayofyear(col('time')))\
        .withColumn('week', weekofyear(col('time')))\
        .withColumn('month', month(col('time')))\
        .withColumn('dayofweek', dayofweek(col('time'))).cache()

    day_df = group_emis(emis, 'day')
    week_df = group_emis(emis, 'week')
    month_df = group_emis(emis, 'month')
    dayofweek_df = group_emis(emis, 'dayofweek')

    prt_high("Generated summary. Logging it.")
    # Log everything
    log_dataframe_metric(day_df, 'day_day')
    log_dataframe_metric(week_df, 'week_week')
    log_dataframe_metric(month_df, 'month_month')
    log_dataframe_metric(dayofweek_df, 'dayofweek_dayofweek')
Example #20
def create_features_from_transaction_timestamp(data):
    """Extraction of transact timestamp.
        New features are created based on.

    Features:
        - Hour: hour time transaction
        - DayOfWeek: day of week transaction
        - DayOfYear: day of year transaction
        - WeekOfYear: week of year transaction

    Args:
        data (spark dataframe): input spark data frame.
    """
    utils.save_log('{0} :: {1}'.format(
        create_features_from_transaction_timestamp.__module__,
        create_features_from_transaction_timestamp.__name__))

    data = data.withColumn('TransactionHour',
                           hour(data[config.feature_column_timestamp]))
    data = data.withColumn('TransactionDayOfWeek',
                           dayofweek(data[config.feature_column_timestamp]))
    data = data.withColumn('TransactionDayOfYear',
                           dayofyear(data[config.feature_column_timestamp]))
    data = data.withColumn('TransactionWeekOfYear',
                           weekofyear(data[config.feature_column_timestamp]))

    data = data.withColumn(
        'WeekAction',
        when(col('TransactionWeekOfYear').between(50, 52), 1).otherwise(0))

    update_list_features("numerical", [
        'TransactionHour', 'TransactionDayOfWeek', 'TransactionDayOfYear',
        'TransactionWeekOfYear', 'WeekAction'
    ])

    return data
Example #21
    def add_date(
            df: DataFrame,
            date: str
    ) -> DataFrame:
        """
        Creates four columns 'year', 'month', 'day' and 'dayofyear'.
        'month', 'day' and 'dayofyear' are zero-based, so 'month' is
            0  for January,
            1  for February,
            2  for March,
            3  for April,
            4  for May,
            5  for June,
            6  for July,
            7  for August,
            8  for September,
            9  for October,
            10 for November,
            11 for December

        Attributes:
        -----------
        df: pyspark.sql.DataFrame
            DataFrame of interest
        date: String
            Column with DateType

        :return: pyspark.sql.DataFrame

        """
        return df.select(
            df["*"],
            year(date).alias('year'),
            (month(date)-1).alias('month'),
            (dayofmonth(date)-1).alias('day'),
            (dayofyear(date)-1).alias('dayofyear')
        )
Example #22
def test_dayofyear(data_gen):
    assert_gpu_and_cpu_are_equal_collect(lambda spark: unary_op_df(
        spark, data_gen).select(f.dayofyear(f.col('a'))))
Example #23
        -9,
        0)  # timestamp of each interval is the upper bound, per the guidelines
df_time_ma = df_time.select("asset", "variable", "timestamp",
                            avg("value").over(win).alias("ma")).cache()

# COMMAND ----------

# visualize that moving average performed well
import matplotlib.pyplot as plt
from pyspark.sql.functions import dayofyear, hour, minute, second

the_asset = df_time_ma.select('asset').distinct().take(1)[0].asset
the_variable = df_time_ma.select('variable').distinct().take(1)[0].variable
the_day = (sorted([
    x.dayofyear for x in (df_time_ma.select(
        dayofyear('timestamp').alias('dayofyear')).distinct().take(5))
]))[0]  # min doesn't work on javalist
the_hours = 12
the_title = ("Data for asset {}, variable {} for day {}, {} hours".format(
    the_asset, the_variable, the_day, the_hours))

# currently exports to pandas for visualization and export in CSV format, later on the pyspark dataframe is exported in CSV
test_df = df_time_ma.filter(df_time_ma.asset == the_asset).filter(
    df_time_ma.variable == the_variable).filter(
        dayofyear('timestamp') == the_day).filter(
            hour('timestamp') <= the_hours).cache()
test_df_1s = test_df.toPandas()
test_df_60s = test_df.filter(second(df_time_ma.timestamp) == 0).toPandas()
test_df_10m = test_df.filter(minute(df_time_ma.timestamp) % 10 == 0).filter(
    second(df_time_ma.timestamp) == 0).toPandas()
Example #24
def process_log_data(spark, input_data, output_data):
    '''
    Transform log data to songs and artists tables
    '''
    
    # get filepath to log data file
    log_data = f'{input_data}log_data/*/*/*.json'
    # log_data = f'{input_data}log_data/2018/11/*.json' # subset
    
    # read log data file
    df = spark.read.json(log_data) 
    print(f'Read log data {log_data}')
    df.printSchema()

    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong') 

    # extract columns for users table    
    users_table = df.select('ts', 'userId', 'gender', 'level', 
                            col('firstName').alias('first_name'), 
                            col('lastName').alias('last_name')).drop_duplicates(subset=['userId'])
    
    # write users table to parquet files
    print(f'Save {output_data}users_table')
    users_table.show(5)
    users_table.write.parquet(f'{output_data}users_table', mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(int(x) / 1000), TimestampType())
    time_df = df.select('ts').drop_duplicates().withColumn('timestamp', get_timestamp(df.ts))

    # extract columns to create time table
    time_table = time_df.select('ts', 'timestamp').drop_duplicates() \
        .withColumn('hour', hour(time_df.timestamp)) \
        .withColumn('day', dayofyear(time_df.timestamp)) \
        .withColumn('week', weekofyear(time_df.timestamp)) \
        .withColumn('month', month(time_df.timestamp)) \
        .withColumn('year', year(time_df.timestamp)) \
        .withColumn('weekday', dayofweek(time_df.timestamp)) \
        .withColumn('start_time', time_df.timestamp) \
        .drop_duplicates(subset=['start_time'])
    
    # write time table to parquet files partitioned by year and month
    print(f'Save {output_data}time_table')
    time_table.show(5)
    time_table.write.parquet(f'{output_data}time_table', mode='overwrite', partitionBy=["year", "month"])
    
    # read in song data to use for songplays table
    song_data = f'{input_data}song_data/*/*/*/*.json'
    # song_data = f'{input_data}song_data/A/A/A/*.json' # subset
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = df.join(song_df, [df.song == song_df.title, df.artist == song_df.artist_name, df.length == song_df.duration], how='inner') \
        .select(monotonically_increasing_id().alias("songplay_id"), "ts", "level", "song_id", "artist_id", "location", 
                col("sessionId").alias("session_id"), col("userId").alias("user_id"), col("userAgent").alias("user_agent")) \
        .join(time_table, df.ts == time_table.ts, how="inner") \
        .select("songplay_id", "start_time", "user_id", "level", "song_id", "artist_id", "session_id", "location", "user_agent", "year", "month") \
        .drop_duplicates()

    # write songplays table to parquet files partitioned by year and month
    print(f'Save {output_data}songplays_table')
    songplays_table.show(5)
    songplays_table.write.parquet(f'{output_data}songplays_table', mode='overwrite', partitionBy=["year", "month"])
Example #25
from pyspark.ml.linalg import Vectors, MatrixUDT, VectorUDT, DenseMatrix, DenseVector
import time
import math
import pandas as pd


#df = sqlContext.read.csv("/datasets/district11.csv", header='true')
df = sqlContext.read.csv("/datasets/crimes.csv", header='true')
start = time.time()
#Define date derivatives
df = (df
       .withColumn('Timestamps', F.to_timestamp("Date", 'MM/dd/yyyy hh:mm:ss a'))
       .withColumn('Day', F.to_date("Date", 'MM/dd/yyyy hh:mm:ss a'))
       .withColumn("Month", F.month("Day"))
       .withColumn("Hour", F.hour("Timestamps"))
       .withColumn("DayOfYear", F.dayofyear("Day"))
       .withColumn("DayOfWeek", F.dayofweek("Day"))

       
      )

cols = ["ID","Day","Year","Month","Hour","DayOfYear","DayOfWeek","District","Primary Type"]

df = df.select(*cols)

# Rename Primary Types with less than a 1% share to OTHER CRIMES
def least_frequent_columns(df,threshold=0.01):
    res = df.groupBy("Primary Type").count()\
                            .withColumn('tot',F.lit(df.count()))\
                            .withColumn('frac',F.expr('count/tot'))\
                            .filter('frac<'+str(threshold))\
Example #26
# Pickups/Dropoffs in entire NYC
taxi_nyc_df = taxi_df.groupby(taxi_df.Time).agg(*sum_aggregations('Nyc')).cache()

taxi_nyc_1h_df = get_agg_taxi_df(taxi_nyc_df, 1, 'Time', sum_aggregations('Nyc', 1))
taxi_nyc_4h_df = get_agg_taxi_df(taxi_nyc_df, 4, 'Time', sum_aggregations('Nyc', 4))


# Time features
date_df = taxi_df.select(taxi_df.Time).distinct()

weekday_udf = udf(lambda date_time: date_time.weekday(), IntegerType())
is_holiday_udf = udf(lambda date_time: date_time.date() in holidays.UnitedStates(), BooleanType())

date_df = date_df.withColumn('Hour', func.hour(date_df.Time))
date_df = date_df.withColumn('Day_Of_Week', weekday_udf(date_df.Time))
date_df = date_df.withColumn('Day_Of_Year', func.dayofyear(date_df.Time))
date_df = date_df.withColumn('Is_Holiday', is_holiday_udf(date_df.Time))


# Aggregate events happening in last and next 3 hours for each hour
event_3h_df = event_df.withColumnRenamed('Venues', 'Venues_0h')
for i in range(-3, 4):
    if i != 0:
        add_hours_udf = udf(lambda date_time: date_time + datetime.timedelta(hours=i), TimestampType())
        event_3h_df = event_3h_df.join(event_df.withColumn('Time', add_hours_udf(event_df.Time)).withColumnRenamed('Venues', 'Venues_%sh' % str(i)), 'Time')


# Join single feature groups
features_df = taxi_df.select(index_columns + [taxi_df.Pickup_Count]) \
                     .join(taxi_dis_1h_df, index_columns) \
                     .join(taxi_dis_4h_df, index_columns) \
Example #27
 def _transform(self, df):
     self.check_input_type(df.schema)
     return df.withColumn(self.outputCol, F.dayofyear(df[self.inputCol]))
Example #28
 def extract_datetime_info(self, datetime_col, info_to_extract):
     self._data_frame = self._data_frame.withColumn(
         datetime_col + '_temp', self.to_date_(datetime_col))
     timestamped = datetime_col + "_timestamped"
     # self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.count())
     uniqueVals = self._data_frame.select(
         datetime_col + '_temp').distinct().na.drop().limit(10).collect()
     try:
         date_format = self._metaHelperInstance.get_datetime_format(
             uniqueVals)
         to_date_udf = udf(
             lambda x: datetime.strptime(x, date_format)
             if x != None else x, DateType())
         self._data_frame = self._data_frame.withColumn(
             datetime_col + '_temp',
             to_date_udf(self._data_frame[datetime_col +
                                          '_temp']).alias(datetime_col +
                                                          '_temp'))
         if info_to_extract == "year":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_year",
                 year(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "month_of_year":
             dict = {
                 1: "January",
                 2: "February",
                 3: "March",
                 4: "April",
                 5: "May",
                 6: "June",
                 7: "July",
                 8: "August",
                 9: "September",
                 10: "October",
                 11: "November",
                 12: "December"
             }
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_month",
                 month(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_etf_month_of_year",
                 self.month_to_string(dict)(col(datetime_col + "_month")))
         if info_to_extract == "day_of_month":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_day_of_month",
                 dayofmonth(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "day_of_year":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_day_of_year",
                 dayofyear(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "day_of_week":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_etf_day_of_week",
                 dayofweek(datetime_col + '_temp'))
         if info_to_extract == "week_of_year":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_week_of_year",
                 weekofyear(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "hour":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_hour",
                 hour(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "minute":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_minute",
                 minute(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "date":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_date",
                 to_timestamp(self._data_frame[datetime_col + '_temp'],
                              "dd/MM/yyyy").cast("date"))
         else:
             pass
     except TypeError:
         if info_to_extract == "year":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_year",
                 year(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "month_of_year":
             dict = {
                 1: "January",
                 2: "February",
                 3: "March",
                 4: "April",
                 5: "May",
                 6: "June",
                 7: "July",
                 8: "August",
                 9: "September",
                 10: "October",
                 11: "November",
                 12: "December"
             }
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_month",
                 month(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_etf_month_of_year",
                 self.month_to_string(dict)(col(datetime_col + "_month")))
         if info_to_extract == "day_of_month":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_day_of_month",
                 dayofmonth(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "day_of_year":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_day_of_year",
                 dayofyear(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "day_of_week":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_etf_day_of_week",
                 dayofweek(datetime_col + '_temp'))
         if info_to_extract == "week_of_year":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_week_of_year",
                 weekofyear(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "hour":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_hour",
                 hour(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "minute":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_minute",
                 minute(
                     to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
         if info_to_extract == "date":
             self._data_frame = self._data_frame.withColumn(
                 datetime_col + "_date",
                 to_timestamp(self._data_frame[datetime_col + '_temp'],
                              "dd/MM/yyyy").cast("date"))
         else:
             pass
     self._data_frame = self._data_frame.drop(datetime_col + '_temp')
     # self._data_frame = self._data_frame.withColumn(datetime_col, to_timestamp(self._data_frame[datetime_col +'_temp'], "dd/MM/yyyy"))
     # self._data_frame = self._data_frame.withColumn(datetime_col, F.from_unixtime(F.unix_timestamp(self._data_frame[datetime_col +'_temp']), "dd/MM/yyyy"))
     return self._data_frame
Example #29
from pyspark.sql.functions import countDistinct, avg, stddev
df.select(countDistinct("Sales")).show() #count distinct
df.select(countDistinct("Sales").alias("Distinct Sales")).show() #change alias name
df.select(avg('Sales')).show() #average
df.select(stddev("Sales")).show() #stdev


## DATETIME
#--------------------------------------------------------
from pyspark.sql.functions import (format_number, dayofmonth, hour, 
                                    dayofyear, month, year, 
                                    weekofyear, date_format)

df.select(dayofmonth(df['Date'])).show() #date of month
df.select(hour(df['Date'])).show() #hour
df.select(dayofyear(df['Date'])).show() #day of year
df.select(month(df['Date'])).show() #month
df.select(year(df['Date'])).show() #year







## 2) USING RDD (Resilient Distributed Dataset)
    # Spark is slowly transitioning to DataFrames, but it's still good to learn the original RDD parsing,
    # especially when data is non-dataframe type
#--------------------------------------------------------
from pyspark import SparkConf, SparkContext
Example #30
# get sourcefile name from input_file_name()
df = df.withColumn("path", fun.input_file_name())
regex_str = r"[\/]([^\/]+[^\/]+)$"  # regex to extract the file name after the last /
df = df.withColumn("sourcefile", fun.regexp_extract("path", regex_str, 1))
df.show()

#######################################################################
# handle dates and times
df = df.withColumn('timestamp', fun.to_date("timestamp"))
df.show(2)

# now we should be able to convert or extract date features from timestamp
df.withColumn('dayofmonth', fun.dayofmonth("timestamp")).show(2)
df.withColumn('month', fun.month("timestamp")).show(2)
df.withColumn('year', fun.year("timestamp")).show(2)
df.withColumn('dayofyear', fun.dayofyear("timestamp")).show(2)

# calculate the difference from the current date ('days_ago')
df.withColumn('days_ago', fun.datediff(fun.current_date(), "timestamp")).show()

########################################################################
#group_by
# summarize within group data
df.groupBy("sourcefile").count().show(99)
df.groupBy("sourcefile").min('open').show(99)
df.groupBy("sourcefile").mean('open').show(99)
df.groupBy("sourcefile").max('open', 'close').show(99)

########################################################################
#window functions
from pyspark.sql.window import Window
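A hypothetical continuation showing the kind of window function the import enables: a per-sourcefile lag over the timestamp to compute the change in close.

w = Window.partitionBy("sourcefile").orderBy("timestamp")
df.withColumn("prev_close", fun.lag("close").over(w)) \
  .withColumn("close_change", fun.col("close") - fun.col("prev_close")) \
  .show(5)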
Example #31
def process_log_data(spark, input_data, output_data):
    """
    This function reads the log data in the filepath (bucket/log_data)
    and uses it to populate the users, time and songplays tables.
    
    Args:
    ------------------------------
        spark:       the cursor object
        input_data:  the path to the bucket containing the log and song data
        output_data: the path where the parquet files stored
        
    Returns:
        None
    
    """

    # get filepath to log data file
    log_data = f'{input_data}/log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)
    print("Success of reading log_data from S3")

    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong')

    # extract columns for users table
    # users table: user_id, first_name, last_name, gender, level
    user_table = df.select('userId', 'firstName', 'lastName', 'gender',
                           'level').dropDuplicates()

    # write users table to parquet files
    user_table.write.parquet(f'{output_data}/user_table', mode='overwrite')
    print('Success of writing user_table to parquet')

    # convert ts column to timestamp
    df = df.withColumn('start_time', F.from_unixtime(F.col('ts') / 1000))

    # extract columns to create time table
    # time table: start_time, hour, day, week, month, year, weekday
    time_table = df.select('ts', 'start_time') \
                   .withColumn('year', F.year('start_time')) \
                   .withColumn('month', F.month('start_time')) \
                   .withColumn('week', F.weekofyear('start_time')) \
                   .withColumn('weekday', F.dayofweek('start_time')) \
                   .withColumn('day', F.dayofyear('start_time')) \
                   .withColumn('hour', F.hour('start_time')).dropDuplicates()
    print('Success of extracting time column')

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(f'{output_data}/time_table',
                             mode='overwrite',
                             partitionBy=['year', 'month'])
    print('Success of writing time_table to parquet')

    # read in song data to use for songplays table
    song_data = f'{input_data}/song_data/A/A/A/*.json'
    song_dataset = spark.read.json(song_data)
    print('Success of reading song_dataset from S3')

    # create temporary view of song_dataset, time_table and log_dataset
    song_dataset.createOrReplaceTempView('song_dataset')
    time_table.createOrReplaceTempView('time_table')
    df.createOrReplaceTempView('log_dataset')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql("""SELECT DISTINCT
                                       t.start_time,
                                       l.userId    AS user_id,
                                       l.level,
                                       s.song_id,
                                       s.artist_id,
                                       l.sessionId AS session_id,
                                       l.location,
                                       l.userAgent AS user_agent,
                                       t.year,
                                       t.month
                                   FROM song_dataset s
                                   JOIN log_dataset l
                                   ON s.title = l.song
                                       AND s.duration = l.length
                                       AND s.artist_name = l.artist
                                   JOIN time_table t
                                   ON t.ts = l.ts""").dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(f'{output_data}/songplays_table',
                                  mode='overwrite',
                                  partitionBy=['year', 'month'])