from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import functions as F, types as T
from pyspark.sql.functions import col, unix_timestamp

conf = SparkConf().setAppName("pyspark")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

#filter source IPs that touch 5 or more destination addresses or 3 or more destination ports

#get data
data = sqlContext.read.parquet("CTU-Flows_main/Flows.parquet/_yyyymd=2018-3-7")

#keep TCP flows with DstAddr inside the university network (147.32.0.0/16) and SrcAddr outside of it
df = data.filter(data.Proto=="tcp").filter(data.DstAddr.startswith("147.32.")).filter(~data.SrcAddr.startswith("147.32.")).select("DstAddr", "Dport", "State", "StartTime", "SrcAddr")

touchAddrLimit = 5
touchPortLimit = 3

#convert IP and port column types

#cast the port to int and derive day and timestamp columns from StartTime
df = df.withColumn('Dport', df["Dport"].cast(T.IntegerType()))
df = df.withColumn('day', unix_timestamp('StartTime', 'yyyy/MM/dd').cast(T.TimestampType()))
df = df.withColumn('timestamp', unix_timestamp('StartTime', 'yyyy/MM/dd HH:mm:ss.SSSSSS').cast(T.TimestampType()))

srcAddrs = df.select('SrcAddr','DstAddr','Dport','day').distinct().groupBy('SrcAddr', 'day').agg(F.countDistinct('DstAddr').alias('addrCount'), F.countDistinct('Dport').alias('portCount')).filter((col('addrCount')>=touchAddrLimit) | (col('portCount')>=touchPortLimit)).select('SrcAddr',"day")
df = df.join(srcAddrs, ['SrcAddr','day'], 'leftsemi')
#distinct SrcAddr and DSTport
res = df.select("SrcAddr", "day", "Dport").distinct().groupBy("day", "Dport").count().selectExpr("day", "Dport", "count as connectionCount")
result = res.groupBy("Dport").agg(F.avg('connectionCount').alias("average_count")).select("Dport", "average_count").sort(col('average_count').desc()).head(100)
print("********RESULTS*************")
for row in result:
    print("{},{}".format(row["Dport"],row["average_count"]))
print("********RESULTS END*************")
df_convert_result = df_base \
    .withColumn("today", F.lit("today"))


# In[ ]:


"""
Convert DataFrame column types
"""
# Date
df_convert_result1 = df_base \
    .withColumn("tdate", F.lit(str_yyyymmdd_to_date(target_date))) \
    .withColumn("tdate", F.lit(F.col("tdate").cast("date")))
    
# Timestamp
df_wakati_result2 = df_base \
    .withColumn("created_at", df_wakati_base.created_at.cast(T.TimestampType()))


# In[ ]:


# -------------- BigQuery Connector -------------- 


# In[ ]:


"""
In the environment created by dataproc-mecab-init-shell, the BigQuery connector is installed,
so BigQuery can be read from and written to.
https://cloud.google.com/dataproc/docs/tutorials/bigquery-connector-spark-example?hl=ja
"""
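
# A hedged sketch (not taken from the linked tutorial) of reading and writing
# BigQuery with the spark-bigquery connector; it assumes a SparkSession named
# `spark`, and the project, dataset, table and bucket names are placeholders.
df_bq = (
    spark.read.format("bigquery")
    .option("table", "my-project.my_dataset.my_table")
    .load()
)

(
    df_bq.write.format("bigquery")
    .option("table", "my-project.my_dataset.my_output_table")
    .option("temporaryGcsBucket", "my-temp-bucket")  # staging bucket for indirect writes
    .mode("overwrite")
    .save()
)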
Example #3
def get_spark_testing_client(data_directory):
    global _spark_testing_client

    if _spark_testing_client is not None:
        return _spark_testing_client

    pytest.importorskip('pyspark')
    import pyspark.sql.types as pt

    _spark_testing_client = ibis.spark.connect()
    s = _spark_testing_client._session

    df_functional_alltypes = s.read.csv(
        path=str(data_directory / 'functional_alltypes.csv'),
        schema=pt.StructType([
            pt.StructField('index', pt.IntegerType(), True),
            pt.StructField('Unnamed: 0', pt.IntegerType(), True),
            pt.StructField('id', pt.IntegerType(), True),
            # cast below, Spark can't read 0/1 as bool
            pt.StructField('bool_col', pt.ByteType(), True),
            pt.StructField('tinyint_col', pt.ByteType(), True),
            pt.StructField('smallint_col', pt.ShortType(), True),
            pt.StructField('int_col', pt.IntegerType(), True),
            pt.StructField('bigint_col', pt.LongType(), True),
            pt.StructField('float_col', pt.FloatType(), True),
            pt.StructField('double_col', pt.DoubleType(), True),
            pt.StructField('date_string_col', pt.StringType(), True),
            pt.StructField('string_col', pt.StringType(), True),
            pt.StructField('timestamp_col', pt.TimestampType(), True),
            pt.StructField('year', pt.IntegerType(), True),
            pt.StructField('month', pt.IntegerType(), True),
        ]),
        mode='FAILFAST',
        header=True,
    )
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = s.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    )
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = s.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    )
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'), )], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = s.createDataFrame([(
        [1, 2],
        [[3, 4], [5, 6]],
        {
            'a': [[2, 4], [3, 5]]
        },
    )], [
        'list_of_ints', 'list_of_list_of_ints',
        'map_string_list_of_list_of_ints'
    ])
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame([({
        (1, 3): [[2, 4], [3, 5]]
    }, )], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame([
        ('a', 1, 4.0, 'a'),
        ('b', 2, 5.0, 'a'),
        ('c', 3, 6.0, 'b'),
    ], ['a', 'b', 'c', 'key'])
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(10, dtype=float),
            'b': [3.0, np.NaN] * 5,
            'key': list('ddeefffggh'),
        }))
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i])
         for i in range(10)], ['a', 'b', 'key'])
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame({
            'a':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'b':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'key':
            list('ddeefff'),
        }))
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
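
# A hedged usage sketch for the testing client above; the data directory path is
# illustrative, and it assumes ibis and the CSV fixtures referenced above are available.
from pathlib import Path

client = get_spark_testing_client(Path("ci/ibis-testing-data"))

# the temp views registered above are queryable through ibis expressions
alltypes = client.table("functional_alltypes")
expr = alltypes.group_by("string_col").aggregate(n=alltypes.count())
print(expr.execute())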
Example #4
def process_log_data(spark, input_data, output_data):
    """Function that processes log data into tables
    Args:
        spark (SparkSession): SparkSession 
        input_data(str): path to input storage bucket
        output_data(str): path to output storage bucket
    Returns:
        None
    """

    # get filepath to log data file
    # log_data = input_data + 'log_data/*/*/*.json'
    log_data = input_data + 'log_data/2018/11/2018-11*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df[df['page'] == 'NextSong']

    #df.printSchema()
    df.createOrReplaceTempView("temp_data")

    # extract columns for users table
    users_table = spark.sql('''
    SELECT 
        DISTINCT userId,
        firstName,
        lastName,
        gender, 
        level
    FROM
        temp_data
    WHERE 
        userId not IN ('')
    ''')

    # write users table to parquet files
    users_table.limit(5).write.parquet(os.path.join(output_data, 'users'))

    df = df.filter(df.ts.isNotNull())

    # create timestamp column from original timestamp column
    get_timestamp = udf(
        lambda x: datetime.fromtimestamp((x / 1000.0)) if x != '' else '',
        t.TimestampType())
    df = df.withColumn('start_time', get_timestamp(df.ts))

    # extract columns to create time table
    df=df.dropDuplicates()\
        .withColumn("hour", hour("start_time"))\
        .withColumn("day", dayofmonth("start_time"))\
        .withColumn("week", weekofyear("start_time"))\
        .withColumn("month", month("start_time"))\
        .withColumn("year", year("start_time"))\
        .withColumn("weekday", dayofweek("start_time"))

    time_table = df.select(
        ['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday'])

    # write time table to parquet files partitioned by year and month
    time_table.limit(5).write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time'))

    # read in song data to use for songplays table
    songs_parquet = output_data + 'songs'
    songs_df = spark.read.parquet(songs_parquet)
    artists_parquet = output_data + 'artists/*.parquet'
    artists_df = spark.read.parquet(artists_parquet)

    songs_df = songs_df.join(artists_df, ['artist_id'])
    songs_df = songs_df.drop('year')

    df = df.join(songs_df, ((df.song == songs_df.song_id) &
                            (df.artist == songs_df.artist_name)),
                 how='left')

    df.createOrReplaceTempView("temp_data")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql('''
    SELECT 
        year, 
        month,
        start_time,
        userID, 
        level,
        song_id, 
        artist_id, 
        sessionId,
        location,
        userAgent
    FROM
        temp_data  
    ''')

    # sort by ts and use row number for songplay_id
    window = Window.orderBy(col('start_time'))
    songplays_table = songplays_table.withColumn('songplay_id',
                                                 row_number().over(window))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.limit(10).write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'song_plays'))
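
# A minimal driver sketch for an ETL function with this signature (not part of the
# original script); the app name and S3 prefixes below are illustrative placeholders.
from pyspark.sql import SparkSession


def main():
    spark = SparkSession.builder.appName("sparkify-etl").getOrCreate()
    input_data = "s3a://udacity-dend/"       # assumed input bucket
    output_data = "s3a://my-sparkify-lake/"  # assumed output bucket
    process_log_data(spark, input_data, output_data)
    spark.stop()


if __name__ == "__main__":
    main()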
Example #5
KAFKA_BROKER_URL = os.environ.get("KAFKA_BROKER_URL", "localhost:9092")
KAFKA_TOPIC = "udacity.project.spark-streaming.police"

schema = pst.StructType([
    pst.StructField("crime_id", pst.StringType()),  # : "183653763",
    pst.StructField("original_crime_type_name",
                    pst.StringType()),  # : "Traffic Stop",
    pst.StructField("report_date",
                    pst.DateType()),  # : "2018-12-31T00:00:00.000",
    pst.StructField("call_date",
                    pst.DateType()),  # : "2018-12-31T00:00:00.000",
    pst.StructField("offense_date",
                    pst.DateType()),  # : "2018-12-31T00:00:00.000",
    pst.StructField("call_time", pst.StringType()),  # : "23:57",
    pst.StructField("call_date_time",
                    pst.TimestampType()),  # : "2018-12-31T23:57:00.000",
    pst.StructField("disposition", pst.StringType()),  # : "ADM",
    pst.StructField("address",
                    pst.StringType()),  # : "Geary Bl/divisadero St",
    pst.StructField("city", pst.StringType()),  # : "San Francisco",
    pst.StructField("state", pst.StringType()),  # : "CA",
    pst.StructField("agency_id", pst.StringType()),  # : "1",
    pst.StructField("address_type", pst.StringType()),  # : "Intersection",
    pst.StructField("common_location", pst.StringType()),  # : ""
])
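
# A hedged sketch (assuming the constants above and a SparkSession passed in as
# `spark`) of how a schema like this is typically applied to the Kafka value payload.
import pyspark.sql.functions as psf


def parse_police_calls(spark):
    raw_df = (
        spark.readStream.format("kafka")
        .option("kafka.bootstrap.servers", KAFKA_BROKER_URL)
        .option("subscribe", KAFKA_TOPIC)
        .option("startingOffsets", "earliest")
        .load()
    )
    # cast the binary Kafka value to string and parse it with the schema defined above
    return (
        raw_df.selectExpr("CAST(value AS STRING) AS value")
        .select(psf.from_json(psf.col("value"), schema).alias("data"))
        .select("data.*")
    )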


def run_spark_job(spark):

    df = (
        spark.readStream.format("kafka").option(
Example #6
def process_log_data(spark, input_data, output_data):
    """
    Processes log data and writes the users, the time, and the songplays table into specified S3 bucket in parquet format.
    
    Parameters
    -------
        spark: object
            Spark Session object to handle the Spark Processes
        
        input_data: str
            The location of the files to read from S3 Bucket
        
        output_data: str
            The location of the files to write into S3 Bucket
    """

    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*events.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_cols = [
        "userId as user_id", "firstName as first_name",
        "lastName as last_name", "gender", "level"
    ]

    users_table = df.selectExpr(users_cols).drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users/")

    # create datetime column from original timestamp column
    get_datetime = F.udf(lambda x: D.fromtimestamp(int(x / 1000)),
                         T.TimestampType())
    df = df.withColumn("start_time", get_datetime("ts"))

    # extract columns to create time table
    time_table = df.select("start_time") \
                   .withColumn("hour", F.hour("start_time")) \
                   .withColumn("day", F.dayofmonth("start_time")) \
                   .withColumn("week", F.weekofyear("start_time")) \
                   .withColumn("month", F.month("start_time")) \
                   .withColumn("year", F.year("start_time")) \
                   .withColumn("weekday", F.date_format('start_time', 'EEEE')).drop_duplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year",
                                 "month").parquet(output_data + 'time/')

    # read in song data to use for songplays table
    songs_table = spark.read.parquet(output_data + "songs/")
    songs_table = songs_table.selectExpr(
        ["song_id", "artist_id as s_artist_id", "title"])
    artists_table = spark.read.parquet(output_data + "artists/")
    artists_table = artists_table.select(["artist_id", "location", "name"])

    song_df = songs_table.join(
        artists_table, songs_table.s_artist_id == artists_table.artist_id,
        "inner")

    cols = [
        "start_time", "userId as user_id", "level", "sessionId as session_id",
        "userAgent as user_agent", "song", "artist"
    ]
    df = df.selectExpr(cols) \
           .withColumn("songplay_id",  F.monotonically_increasing_id()) \
           .withColumn("month", F.month("start_time")) \
           .withColumn("year", F.year("start_time"))

    df = df.join(song_df,
                 (df.song == song_df.title) & (df.artist == song_df.name),
                 "left")

    # extract columns from joined song and log datasets to create songplays table
    songplays_cols = [
        "songplay_id", "start_time", "user_id", "level", "song_id",
        "artist_id", "session_id", "location", "user_agent", "year", "month"
    ]
    songplays_table = df.select(songplays_cols)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(output_data +
                                                               'songplays/')
Example #7
File: etl.py  Project: arneme/DataLake
def process_log_data(spark, input_data, output_data):
    """
    Process log data, i.e. read the log files into Spark and 
    transform into OLAP like user, time (dimensions) tables
    and songplay (fact) table. Store the fact and dimension tables
    into Spark parquet files.

    Parameters
    ----------
    spark : 
        The Spark session to use
    input_data : string
        The path to the input log files (in S3)
    output_data : string
        The path to where the generated parquet files will be stored
    """
    # get filepath to log data file
    song_data_path = input_data + 'song_data' + '/*/*/*/*.json'
    log_data = input_data + 'log_data' + '/*/*/*.json'

    event_df = spark.read.json(log_data)
    
    # filter by actions for song plays
    event_df = event_df.filter(event_df.page == 'NextSong')

    # extract columns for users table    
    user_table = event_df.select('userId', 'firstName', 'lastName', \
                                 'gender', 'level', ) \
                         .dropDuplicates()
    
    # write users table to parquet files
    user_table.write.mode('overwrite').parquet(output_data + '/user_table')

    # create timestamp column from original timestamp column
    def get_ts (ts):
        return datetime.fromtimestamp(ts / 1000.0)

    get_datetime = udf(lambda z: get_ts(z), t.TimestampType())

    datetime_df = event_df.select('ts').dropDuplicates() \
                          .withColumn('datetime', get_datetime('ts'))
    
    # extract columns to create time table
    time_table = datetime_df.select(col('ts').alias('starttime'),
                   hour('datetime').alias('hour'), 
                   dayofmonth('datetime').alias('day'), 
                   weekofyear('datetime').alias('week'),
                   month('datetime').alias('month'),
                   year('datetime').alias('year'),
                   dayofweek('datetime').alias('weekday'))
    
    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite") \
                    .partitionBy("year", "month") \
                    .parquet('s3://data-lake-sparkify/time_table')

    # read in song data again to use for creating songplays table
    # Would probably have been a good idea to keep the dataframe
    # created in process_song_data, but the template seems to
    # dictate that it should be read again...

    song_df = spark.read.json(song_data_path)

    # extract columns from joined song and log datasets to create
    # songplays table 
    songplays_table = event_df.join(song_df, \
                                    event_df.artist == song_df.artist_name) \
                              .select('ts', 'userId', 'song_id', \
                                      'artist_id', 'level', 'sessionId',\
                                      'location', 'userAgent') \
                              .dropDuplicates()

    # write songplays table to parquet files
    # partitioned by year and month (which does not make sense since it is
    # not in table, I will use song_id and artist_id instead).

    songplays_table.write.mode("overwrite") \
                        .partitionBy("song_id", "artist_id") \
                        .parquet('s3://data-lake-sparkify/songplay_table')
Example #8
def process_data(data_path, database, table):
    '''
        Purpose: To Process the API results using SPARK and store in HDFS
        '''
    try:
        logger('INFO', "LOADING the Data in Spark for Processing ")

        logger(
            'INFO',
            "In case of java.lang.OutOfMemoryError Tune Spark Parameters in conf/proprties.yml"
        )

        df = myspark.read.format("json").options(
            inferSchema=True,
            dateFormat="yyyy-MM-dd",
            timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSSZZ",
            ignoreLeadingWhiteSpace=True,
            ignoreTrailingWhiteSpace=True,
            path="/tmp/tempfiles/").load()

        logger('INFO', "Changing the Data Type to Timestamp for few Columns")

        df = df.withColumn(
            "created",
            F.from_utc_timestamp(
                df.created, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast(
                    T.TimestampType())).withColumn(
                        "lastModified",
                        F.from_utc_timestamp(
                            df.lastModified,
                            "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast(
                                T.TimestampType())).withColumn(
                                    "lastAccessed",
                                    F.from_utc_timestamp(
                                        df.lastAccessed,
                                        "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast(
                                            T.TimestampType()))

        df = df.withColumn(
            "created",
            F.date_format(df.created, "yyyy-MM-dd HH:mm:ss")).withColumn(
                "lastModified",
                F.date_format(df.lastModified,
                              "yyyy-MM-dd HH:mm:ss")).withColumn(
                                  "lastAccessed",
                                  F.date_format(df.lastAccessed,
                                                "yyyy-MM-dd HH:mm:ss"))

        logger('INFO', "Choosing the 11 required Columns ")

        finaldf = df.select("owner", "parentPath", "originalName", "created",
                            "lastModified", "lastAccessed", "size",
                            "sourceType", "permissions", "group", "identity")

        logger(
            'INFO', "%s Records will be LOADED into %s.%s Table " %
            (finaldf.count(), database, table))

        logger('INFO', "Create Table Command Prepared:")

        cmd = "CREATE TABLE IF NOT EXISTS %s.%s (owner STRING,parentPath STRING,originalName STRING,created timestamp,lastModified timestamp,lastAccessed timestamp,size BIGINT,sourceType STRING,permissions STRING,group STRING,identity BIGINT)STORED AS PARQUET" % (
            database, table)

        logger('INFO', cmd)

        logger('INFO',
               "Creating the Table %s.%s if NOT EXISTS" % (database, table))

        myspark.sql(cmd)

        finaldf.createOrReplaceTempView("mytable")

        logger('INFO', "INSERT INTO TABLE Table Command Prepared:")

        cmd = """INSERT INTO TABLE {database}.{table} SELECT owner,parentPath,originalName,created,lastModified,lastAccessed,size,sourceType,permissions,group,identity FROM mytable""".format(
            database=database, table=table)

        logger('INFO', cmd)

        logger('INFO', "Inserting the Data in %s.%s Table" % (database, table))

        myspark.sql(cmd)

        logger(
            'INFO', "%s Records Inserted successfully in %s.%s" %
            (finaldf.count(), database, table))

    except Exception as e:
        logger('ERROR', "Job Failed with below Details !!")
        os.system("rm -f " + data_path + "/../pids/*.pid")
        logger('ERROR', "%s" % e)
        logger('ERROR', "Exiting !!")
        sys.exit()
Example #9
def process_log_data(spark, input_data_ld, input_data_sd, output_data, \
                    run_start_time):
    """
    Load JSON input data (log_data) from input_data path,
        process the data to extract users_table, time_table,
        songplays_table, and store the queried data to parquet files.
    """

    print("Start processing log_data JSON files...")
    # get filepath to log data file
    log_data = input_data_ld

    # read log data file
    print("Reading log_data files from {}...".format(log_data))
    df_ld = spark.read.json(log_data).dropDuplicates()

    print("...finished reading log_data...")

    # filter by actions for song plays

    df_ld_filtered = df_ld.filter(df_ld.page == 'NextSong')

    # extract columns for users table
    df_ld_filtered.createOrReplaceTempView("users_table_DF")
    users_table = spark.sql("""
        SELECT  DISTINCT userId    AS user_id,
                         firstName AS first_name,
                         lastName  AS last_name,
                         gender,
                         level
        FROM users_table_DF
        ORDER BY last_name
    """)
    print("Users_table schema:")
    users_table.printSchema()
    print("Users_table examples:")
    users_table.show(5)

    # write users table to parquet files
    users_table_path = output_data + "users_table.parquet" + "_" \
                        + run_start_time
    print("Writing users_table parquet files to {}..."\
            .format(users_table_path))
    users_table.write.mode("overwrite").parquet(users_table_path)

    print("...finished writing users_table...")

    # create timestamp column from original timestamp column

    print("Creating timestamp column...")

    @udf(t.TimestampType())
    def get_timestamp(ts):
        return datetime.fromtimestamp(ts / 1000.0)

    df_ld_filtered = df_ld_filtered.withColumn("timestamp", \
                        get_timestamp("ts"))
    df_ld_filtered.printSchema()
    df_ld_filtered.show(5)

    # create datetime column from original timestamp column
    print("Creating datetime column...")

    @udf(t.StringType())
    def get_datetime(ts):
        return datetime.fromtimestamp(ts / 1000.0)\
                       .strftime('%Y-%m-%d %H:%M:%S')

    df_ld_filtered = df_ld_filtered.withColumn("datetime", \
                        get_datetime("ts"))
    print("Log_data + timestamp + datetime columns schema:")
    df_ld_filtered.printSchema()
    print("Log_data + timestamp + datetime columns examples:")
    df_ld_filtered.show(5)

    # extract columns to create time table
    df_ld_filtered.createOrReplaceTempView("time_table_DF")
    time_table = spark.sql("""
        SELECT  DISTINCT datetime AS start_time,
                         hour(timestamp) AS hour,
                         day(timestamp)  AS day,
                         weekofyear(timestamp) AS week,
                         month(timestamp) AS month,
                         year(timestamp) AS year,
                         dayofweek(timestamp) AS weekday
        FROM time_table_DF
        ORDER BY start_time
    """)
    print("Time_table schema:")
    time_table.printSchema()
    print("Time_table examples:")
    time_table.show(5)

    # write time table to parquet files partitioned by year and month
    time_table_path = output_data + "time_table.parquet" + "_" \
                    + run_start_time
    print("Writing time_table parquet files to {}..."\
            .format(time_table_path))
    time_table.write.mode("overwrite").partitionBy("year", "month")\
            .parquet(time_table_path)

    print("...finished writing time_table...")

    # read in song data to use for songplays table

    song_data = input_data_sd
    print("Reading song_data files from {}...".format(song_data))
    df_sd = spark.read.json(song_data)

    # Join log_data and song_data DFs
    print("Joining log_data and song_data DFs...")
    df_ld_sd_joined = df_ld_filtered\
        .join(df_sd, (df_ld_filtered.artist == df_sd.artist_name) & \
                     (df_ld_filtered.song == df_sd.title))
    print("...finished joining song_data and log_data DFs.")
    print("Joined song_data + log_data schema:")
    df_ld_sd_joined.printSchema()
    print("Joined song_data + log_data examples:")
    df_ld_sd_joined.show(5)

    # extract columns from joined song and log datasets
    # to create songplays table
    print("Extracting columns from joined DF...")
    df_ld_sd_joined = df_ld_sd_joined.withColumn("songplay_id", \
                        monotonically_increasing_id())
    df_ld_sd_joined.createOrReplaceTempView("songplays_table_DF")
    songplays_table = spark.sql("""
        SELECT  songplay_id AS songplay_id,
                timestamp   AS start_time,
                userId      AS user_id,
                level       AS level,
                song_id     AS song_id,
                artist_id   AS artist_id,
                sessionId   AS session_id,
                location    AS location,
                userAgent   AS user_agent,
                year(timestamp)  AS year,
                month(timestamp) AS month
        FROM songplays_table_DF
        ORDER BY (user_id, session_id)
    """)

    print("Songplays_table schema:")
    songplays_table.printSchema()
    print("Songplays_table examples:")
    songplays_table.show(5, truncate=False)

    # write songplays table to parquet files partitioned by year and month
    songplays_table_path = output_data + "songplays_table.parquet" + "_" \
                            + run_start_time

    print("Writing songplays_table parquet files to {}..."\
            .format(songplays_table_path))
    songplays_table.write.mode("overwrite").partitionBy("year", "month")\
            .parquet(songplays_table_path)

    print("...finished writing songplays_table...")

    return users_table, time_table, songplays_table
Example #10
def process_log_data(spark, input_data, output_data):
    """
    The function to process song data
    
    Parameters:
        spark  : The Spark session that will be used to execute commands.
        input_data : The input data to be processed.
        output_data : The location where to store the parquet tables.
    """
    # get filepath to log data file
    log_data = input_data

    # read log data file
    df_log = spark.read.json(input_data)

    # filter by actions for song plays
    df_log = df_log.filter(F.col("page") == "NextSong")

    # Extract columns for users table (these bare names are column-name constants
    # assumed to be defined at module level in the original script, e.g. user_id = "userId")
    users_cols = [user_id, first_name, last_name, gender, level]

    # remove duplicate rows
    users_table_df = df_log.select(users_cols).dropDuplicates()

    # write users table to parquet files
    users_table_df.write.parquet(output_data + 'users_table', mode='Overwrite')

    # define functions for extracting time components from ts field
    get_timestamp = F.udf(lambda x: datetime.fromtimestamp((x / 1000.0)),
                          T.TimestampType())
    get_hour = F.udf(lambda x: x.hour, T.IntegerType())
    get_day = F.udf(lambda x: x.day, T.IntegerType())
    get_week = F.udf(lambda x: x.isocalendar()[1], T.IntegerType())
    get_month = F.udf(lambda x: x.month, T.IntegerType())
    get_year = F.udf(lambda x: x.year, T.IntegerType())
    get_weekday = F.udf(lambda x: x.weekday(), T.IntegerType())

    # create timestamp column from original timestamp column
    df_log = df_log.withColumn("timestamp", get_timestamp(df_log.ts))
    df_log = df_log.withColumn("hour", get_hour(df_log.timestamp))
    df_log = df_log.withColumn("day", get_day(df_log.timestamp))
    df_log = df_log.withColumn("week", get_week(df_log.timestamp))
    df_log = df_log.withColumn("month", get_month(df_log.timestamp))
    df_log = df_log.withColumn("year", get_year(df_log.timestamp))
    df_log = df_log.withColumn("weekday", get_weekday(df_log.timestamp))

    # extract columns to create time table
    time_cols = [start_time, hour, day, week, month, year, weekday]
    time_table_df = df_log.select(time_cols)

    # write time table to parquet files partitioned by year and month
    time_table_df.write.parquet(output_data + 'time_table',
                                partitionBy=['year', 'month'],
                                mode='Overwrite')

    # read in song data to use for songplays table
    # read the partitioned data
    df_artists_read = spark.read.option(
        "mergeSchema", "true").parquet(output_data + "artists_table")
    df_songs_read = spark.read.option(
        "mergeSchema", "true").parquet(output_data + "songs_table")

    # extract columns from joined song and log datasets to create songplays table
    songplay_cols = [
        start_time, user_id, song_id, artist_id, session_id, locationSP,
        user_agent, level, month, year
    ]

    # join artists and songs so that we can join this table in the next step
    df_joined_songs_artists = df_songs_read.join(
        df_artists_read, 'artist_id').select("artist_id", "song_id", "title",
                                             "artist_name")
    # join df_log with the earlier joined artist and songs table
    songplay_table_df = df_log.join(
        df_joined_songs_artists, df_log.artist ==
        df_joined_songs_artists.artist_name).select(songplay_cols)

    # create songplay_id
    songplay_table_df = songplay_table_df.withColumn(
        "songplay_id", F.monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplay_table_df.write.parquet(output_data + 'songplays_table',
                                    partitionBy=['year', 'month'],
                                    mode='Overwrite')
Example #11
def main(base_path):

    spark = SparkSession.builder.config("spark.default.parallelism",
                                        1).appName(APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Messages look like:
    #

    # {
    #   "Carrier": "DL",
    #   "DayOfMonth": 25,
    #   "DayOfWeek": 4,
    #   "DayOfYear": 359,
    #   "DepDelay": 10.0,
    #   "Dest": "LAX",
    #   "Distance": 2475.0,
    #   "FlightDate": "2015-12-25",
    #   "FlightNum": null,
    #   "Origin": "JFK",
    #   "Timestamp": "2019-10-31T00:19:47.633280",
    #   "UUID": "af74b096-ecc7-4493-a79a-ebcdff699385"
    # }

    #
    # Process Prediction Requests from Kafka
    #
    message_df = spark \
      .readStream \
      .format("kafka") \
      .option("kafka.bootstrap.servers", BROKERS) \
      .option("subscribe", PREDICTION_TOPIC) \
      .load()

    # Create a DataFrame out of the one-hot encoded RDD
    schema = T.StructType([
        T.StructField("Carrier", T.StringType()),
        T.StructField("DayOfMonth", T.IntegerType()),
        T.StructField("DayOfWeek", T.IntegerType()),
        T.StructField("DayOfYear", T.IntegerType()),
        T.StructField("DepDelay", T.FloatType()),
        T.StructField("Dest", T.StringType()),
        T.StructField("Distance", T.FloatType()),
        T.StructField("FlightDate", T.StringType()),
        T.StructField("FlightNum", T.StringType()),
        T.StructField("Origin", T.StringType()),
        T.StructField("Timestamp", T.TimestampType()),
        T.StructField("UUID", T.StringType()),
    ])

    prediction_requests_df = message_df.select(
        F.from_json(F.col("value").cast("string"),
                    schema).alias("data")).select("data.*")

    #
    # Add a Route variable to replace FlightNum
    #
    prediction_requests_with_route = prediction_requests_df.withColumn(
        'Route',
        F.concat(prediction_requests_df.Origin, F.lit('-'),
                 prediction_requests_df.Dest))

    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the individual index columns
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Store the results to MongoDB
    class MongoWriter:
        def open(self, partition_id, epoch_id):
            print(f"Opened partition id: {partition_id}, epoch: {epoch_id}")

            self.mongo_client = pymongo.MongoClient()
            print(f"Opened MongoClient: {self.mongo_client}")

            return True

        def process(self, row):
            print(f"Processing row: {row}")

            as_dict = row.asDict()
            print(f"Inserting row.asDict(): {as_dict}")

            id = self.mongo_client.agile_data_science.flight_delay_classification_response.insert_one(
                as_dict)
            print(f"Inserted row, got ID: {id.inserted_id}")

            return True

        def close(self, error):
            # close the client once per partition rather than after every row
            self.mongo_client.close()
            print("Closed with error: %s" % str(error))

            return True

    query = final_predictions.writeStream.foreach(MongoWriter()).start()

    query.awaitTermination()
Example #12
listOfCompanies = [
    'pixel', 'iphone', 'samsung', 'huawei', 'xiaomi', 'vivo', 'oppo',
    'motorola', 'realme', 'sony', 'oneplus'
]

fields = ("videoId", "channelId", "date", "mobileCompany", "views", "comments",
          "likes", "dislikes")
video = namedtuple("video", fields)

channels = spark.read.csv("users.csv", header=True,
                          inferSchema=True).drop('_c0')

video_schema = st.StructType([
    st.StructField("videoId", st.StringType(), True),
    st.StructField("channelId", st.StringType(), True),
    st.StructField("creationDate", st.TimestampType(), True),
    st.StructField("mobileCompany", st.StringType(), True),
    st.StructField("views", st.IntegerType(), True),
    st.StructField("comments", st.IntegerType(), True),
    st.StructField("likes", st.IntegerType(), True),
    st.StructField("dislikes", st.IntegerType(), True)
])


def getCompany(t):
    t = t.lower()
    t = re.sub('[^a-z ]+', '', t)
    t = t.split()
    t = list(filter(lambda word: word in listOfCompanies, t))
    if len(t) > 0:
        return t[0]
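
# A hedged sketch of applying getCompany as a UDF; it assumes `st` is
# pyspark.sql.types as used above and a SparkSession named `spark`, and the
# sample titles below are illustrative.
import pyspark.sql.functions as sf

getCompany_udf = sf.udf(getCompany, st.StringType())

titles = spark.createDataFrame(
    [("vid1", "Samsung Galaxy S10 review"), ("vid2", "cooking pasta at home")],
    ["videoId", "title"])
# returns the first known company mentioned in the title, or null if none matches
titles.withColumn("mobileCompany", getCompany_udf(sf.col("title"))).show()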
Example #13
def process_log_data(spark, input_data, output_data):
    """Process log data and creates users table, time table and songplays table"""

    # get filepath to log data file
    log_data = input_data + "log_data"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    # note: song plays needs all 'NextSong' results
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df[['userId', 'firstName', 'lastName', 'gender', 'level']]

    # write users table to parquet files
    users_table.dropDuplicates().write.parquet(output_data + "user.parquet")

    # create timestamp column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp((x / 1000.0)),
                       T.TimestampType())
    df = df.withColumn("start_time", get_datetime(df.ts))

    # create table to extract columns
    df.createOrReplaceTempView("log_staging_table")

    # create columns for time table
    time_table = spark.sql('''
        SELECT start_time,
        EXTRACT(hour from start_time) as hour,
        EXTRACT(day from start_time) as day,
        EXTRACT(week from start_time) as week,
        EXTRACT(month from start_time) as month,
        EXTRACT(year from start_time) as year,
        DAYOFWEEK(start_time) as weekday
        
        from log_staging_table
    ''').collect()

    # write time table to parquet files partitioned by year and month
    time_table_dataframe = spark.createDataFrame(time_table)
    time_table_dataframe.dropDuplicates().write.partitionBy(
        "year", "month").parquet(output_data + "time.parquet")

    # read in song data to use for songplays table
    song_data = input_data + "song_data"
    song_df = spark.read.json(song_data + "/*/*/*")
    song_df.createOrReplaceTempView("songs_staging_table")

    # extract columns from joined song and log datasets to create songplays table
    temp_table = spark.sql('''
        SELECT 
        
            a.start_time, a.userId, a.level, b.song_id,
            b.artist_id, a.sessionId, a.location, a.userAgent,
            EXTRACT(month from a.start_time) as month,
            EXTRACT(year from a.start_time) as year
        from log_staging_table a
        inner join songs_staging_table b on a.song = b.title
    ''').collect()

    songplays_table = spark.createDataFrame(temp_table)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.dropDuplicates().write.partitionBy(
        "year", "month").parquet(output_data + "songplays.parquet")
Example #14
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = '{}log_data/*.json'.format(input_data)

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(col("page") == "NextSong")

    # extract columns for users table
    users_table = df \
        .selectExpr( \
            "userId as user_id", "firstName as first_name", \
            "lastName as last_name", "gender", "level") \
        .dropDuplicates()

    # write users table to parquet files
    users_table \
        .write \
        .parquet(os.path.join(output_data, 'users'), 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf( \
        lambda x: datetime.fromtimestamp( (x/1000.0) ), T.TimestampType())
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    # create datetime columns from original timestamp column
    df = df \
        .withColumn("hour", hour("timestamp")) \
        .withColumn("day", dayofmonth("timestamp")) \
        .withColumn("week", weekofyear("timestamp")) \
        .withColumn("month", month("timestamp")) \
        .withColumn("year", year("timestamp")) \
        .withColumn("weekday", date_format("timestamp", 'EEEE'))

    # extract columns to create time table
    time_table = df \
        .selectExpr( \
            "ts as start_time", "hour", "day", \
            "week", "month", "year", "weekday") \
        .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table \
        .write \
        .partitionBy("year", "month") \
        .parquet(os.path.join(output_data, 'time'), 'overwrite')

    # read in song data to use for songplays table
    song_df = spark \
        .read \
        .parquet('{}songs/*/*/*.parquet'.format(output_data))

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.alias("df") \
        .join(song_df.alias("song_df"), df.song == song_df.title) \
        .selectExpr("df.ts AS start_time", "df.userId AS user_id", "df.level", \
            "song_df.song_id", "song_df.artist_id", "df.sessionId AS session_id", \
            "df.location", "df.userAgent AS user_agent", "df.month", "df.year") \
        .dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table \
        .write \
        .partitionBy("year", "month") \
        .parquet(os.path.join(output_data, 'songplays'), 'overwrite')
Example #15
def as_spark_type(
    tpe: Union[str, type, Dtype], *, raise_error: bool = True, prefer_timestamp_ntz: bool = False
) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    if isinstance(tpe, np.dtype) and tpe == np.dtype("object"):
        pass
    # ArrayType
    elif tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):  # type: ignore
        element_type = as_spark_type(tpe.__args__[0], raise_error=raise_error)  # type: ignore
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool_, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: considering about the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType or TimestampNTZType if timezone is not specified.
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampNTZType() if prefer_timestamp_ntz else types.TimestampType()

    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str) and tpe == "category"):
        return types.LongType()

    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str) and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str) and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str) and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str) and tpe == "Int64"):
            return types.LongType()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str) and tpe == "boolean"):
                return types.BooleanType()
            # StringType
            elif isinstance(tpe, StringDtype) or (isinstance(tpe, str) and tpe == "string"):
                return types.StringType()

        if extension_float_dtypes_available:
            # FractionalType
            if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str) and tpe == "Float32"):
                return types.FloatType()
            elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str) and tpe == "Float64"):
                return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
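
# A short usage sketch of as_spark_type under the branches above (illustrative;
# assumes numpy is available and pyspark.sql.types is imported as `types` as in
# the original module).
import datetime
from typing import List

import numpy as np

as_spark_type(int)                # LongType()
as_spark_type(np.float32)         # FloatType()
as_spark_type(datetime.date)      # DateType()
as_spark_type(datetime.datetime)  # TimestampType() (TimestampNTZType if prefer_timestamp_ntz=True)
as_spark_type(List[str])          # ArrayType(StringType(), True)
as_spark_type(complex, raise_error=False)  # None, since complex has no mapping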
Example #16
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = 's3a://udacity-dend/log_data/*/*/*/*.json'

    # read log data file
    df_log = spark.read.json(log_data)

    # filter by actions for song plays
    df_log = df_log.filter(df_log['page'] == 'NextSong')

    # extract columns for users table
    users_table = df_log.select("userId", "firstName", "lastName", "gender",
                                "level")

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + "users")

    # create timestamp column from original timestamp column
    get_timestamp = F.udf(lambda x: datetime.fromtimestamp((x / 1000.0)),
                          T.TimestampType())
    df_log = df_log.withColumn("timestamp", get_timestamp(df_log.ts))

    # create datetime column from original timestamp column
    df_log = df_log.withColumn(
        'date_time',
        from_unixtime(df_log.ts / 1000).cast(dataType=T.TimestampType()))

    # extract columns to create time table
    #start_time, hour, day, week, month, year, weekday
    time_table = df_log.select(
        col("date_time").alias("start_time"),
        year(col('date_time')).alias('year'),
        month(col('date_time')).alias('month'),
        dayofmonth(col('date_time')).alias('day'),
        hour(col('date_time')).alias('hour'),
        weekofyear(col('date_time')).alias('week'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + "time")

    # extract columns from joined song and log datasets to create songplays table

    #must change dataframe to table for SQL functions
    df_log.createOrReplaceTempView("log_df_table")
    # df_songs is assumed to be the songs dataframe created elsewhere in the original script
    df_songs.createOrReplaceTempView("song_df_table")
    time_table.createOrReplaceTempView("time_table_table")

    songplays_table = spark.sql("""
                                SELECT DISTINCT log_df_table.userId, log_df_table.level, log_df_table.location,
                                log_df_table.userAgent, log_df_table.sessionId, log_df_table.date_time, 
                                song_df_table.artist_id, song_df_table.song_id, 
                                time_table_table.month, time_table_table.year 
                                FROM log_df_table
                                JOIN song_df_table
                                ON song_df_table.artist_name = log_df_table.artist
                                JOIN time_table_table
                                ON time_table_table.start_time = log_df_table.date_time
                                """)
    #monotonically_increasing_id assigns unique id number to each row
    songplays_table = songplays_table.withColumn("songplay_id",
                                                 monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(output_data +
                                                               "songplays")
Example #17
import sys, re, math, datetime, json
from pyspark import SparkConf, SparkContext
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
from pyspark.sql import SQLContext, Row, SparkSession, functions as f, types as t

nasa_schema = t.StructType([
    t.StructField('hostname', t.StringType(), False),
    t.StructField('path', t.StringType(), False),
    t.StructField('size', t.FloatType(), False),
    t.StructField('timestamp', t.TimestampType(), False)
])


def make_rdd(line):
    return Row(hostname=line[1],
               timestamp=datetime.datetime.strptime(line[2],
                                                    '%d/%b/%Y:%H:%M:%S'),
               path=line[3],
               size=float(line[4]))


def main(inputs):
    text = sc.textFile(inputs)
    linere = re.compile(
        "^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$"
    )
    nasa_rdd = text.map(lambda x: linere.split(x)).filter(
        lambda line: len(line) > 4).map(make_rdd)
    nasa_df = sqlContext.createDataFrame(nasa_rdd, nasa_schema)
    agg_df = nasa_df.groupBy(nasa_df.hostname).agg(f.count('*').alias('x') \
                                                ,f.sum(nasa_df.size).alias('y')) \
Example #18
import sys, re
from datetime import datetime
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
from pyspark.sql import SparkSession, functions, types

cluster_seeds = ['199.60.17.32', '199.60.17.65']
spark = SparkSession.builder.appName('Spark Cassandra example') \
    .config('spark.cassandra.connection.host', ','.join(cluster_seeds)).getOrCreate()
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext

line_re = re.compile(
    r'^(\S+) - - \[(\S+ [+-]\d+)\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$')
schema = types.StructType([
    #types.StructField('id', types.StringType()),
    types.StructField('host', types.StringType()),
    types.StructField('datetime', types.TimestampType()),
    types.StructField('path', types.StringType()),
    types.StructField('bytes', types.IntegerType())
])


def read_line(line):
    m = line_re.match(line)
    if m is None:
        return None
    return (m.group(1), datetime.strptime(m.group(2), '%d/%b/%Y:%H:%M:%S %z'),
            m.group(3), int(m.group(4)))


def main(input_dir, keyspace, table):
    text = sc.textFile(input_dir).repartition(10)
Example #19
def process_log_data(spark, input_data, output_data):
    """
    This function reads song_data and log_data from S3 (public folder), 
    makes user, artists and songplay tables and uploads them back on S3
    
    Parameters: 
    spark : SparkSession
    input_data : public S3 path where input data is scored
    output_data : Our S3 path where output data is scored
  
    Returns: 
    user, artists, songplay tables are saved as parquet files on a personal folder on S3
    """
    # get filepath to log data file
    log_data = f'{input_data}/log_data/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    user_table = df.select(
        ["userId", "firstname", "lastname", "gender",
         "level"]).where(df["userId"].isNotNull())

    # write users table to parquet files
    user_data_out = f'{output_data}/user_table/user_table.parquet'
    user_table.write.mode('overwrite').parquet(user_data_out)

    # create timestamp column from original timestamp column
    get_timestamp = F.udf(lambda x: datetime.fromtimestamp((x / 1000.0)),
                          T.TimestampType())
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    # extract columns to create time table
    time_table = df.select(['timestamp']).dropDuplicates()
    time_table = time_table.withColumn("hour", hour(time_table["timestamp"]))
    time_table = time_table.withColumn("day",
                                       dayofyear(time_table["timestamp"]))
    time_table = time_table.withColumn("week",
                                       weekofyear(time_table["timestamp"]))
    time_table = time_table.withColumn("month", month(time_table["timestamp"]))
    time_table = time_table.withColumn("year", year(time_table["timestamp"]))
    time_table = time_table.withColumn("weekday",
                                       dayofweek(time_table["timestamp"]))

    # write time table to parquet files partitioned by year and month
    time_data_out = f'{output_data}/time_table/time_table.parquet'
    time_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(time_data_out)

    # read in song data to use for songplays table
    song_data = f'{input_data}/song_data/*/*/*/*.json'
    sdf = spark.read.json(song_data)
    sdf.createOrReplaceTempView("song_df_table")

    # Adding month and year column to log data read and preparing log data table
    df = df.withColumn("month", month(df["timestamp"]))
    df = df.withColumn("year", year(df["timestamp"]))
    df.createOrReplaceTempView("log_df_table")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql("""
    SELECT ldf.timestamp as start_time,
        ldf.userid as user_id,
        ldf.level,
        sdf.song_id,
        sdf.artist_id,
        ldf.sessionid as session_id,
        ldf.location,
        ldf.useragent as user_agent,
        ldf.month,
        ldf.year
    FROM log_df_table ldf
    JOIN song_df_table sdf
    ON (ldf.song = sdf.title) AND (ldf.artist = sdf.artist_name) AND (ldf.length = sdf.duration)
    WHERE ldf.page = 'NextSong' and ldf.userid is not null
    """)

    # adding the songplay_id column
    window = Window.orderBy(F.col('start_time'))
    songplays_table = songplays_table.withColumn('songplay_id',
                                                 F.row_number().over(window))
    songplays_table.select('songplay_id', 'start_time', 'user_id', 'level',
                           'song_id', 'artist_id', 'session_id', 'location',
                           'user_agent', 'month', 'year').show()

    # write songplays table to parquet files partitioned by year and month
    songplays_data_out = f'{output_data}/songplays_table/songplays_table.parquet'
    songplays_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(songplays_data_out)
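
# A minimal invocation sketch (not from the original source); the bucket paths
# below are placeholders, not the author's actual locations:
#
#   spark = SparkSession.builder.appName("sparkify_etl").getOrCreate()
#   process_log_data(spark,
#                    input_data="s3a://<input-bucket>",
#                    output_data="s3a://<output-bucket>")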
Example #20
    #types.BooleanType : bool,
    types.LongType: int,
    types.IntegerType: int,
    types.DoubleType: float,
    types.DecimalType: float,
    types.StringType: str,
    types.TimestampType: datetime
}

# map data type to pyspark sql type
_data_type_to_pyspark_type_table = {
    int: types.IntegerType(),
    long: types.LongType(),
    float: types.DoubleType(),
    str: types.StringType(),
    datetime: types.TimestampType()
}

# build reverse map string -> type
_primitive_str_to_type_table = dict([
    (s, t) for t, s in _primitive_type_to_str_table.iteritems()
])

_primitive_alias_type_to_type_table = {
    float: float64,
    int: int32,
    long: int64,
    str: unicode,
    list: vector,
}
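
# Note (not in the original source): this snippet assumes Python 2 -- `long`,
# `unicode` and dict.iteritems() no longer exist in Python 3 -- and the names
# float64, int32, int64, vector and _primitive_type_to_str_table are defined
# elsewhere in the original module. Under those assumptions, usage looks like:
#
#   _data_type_to_pyspark_type_table[datetime]  # -> types.TimestampType()
#   _data_type_to_pyspark_type_table[float]     # -> types.DoubleType()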
Example #21
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.sql.functions import struct, window, col, lit

from utils.spark import kafka_source

import config

a3_struct_common = T.StructType([
    T.StructField("timetamp_start", T.TimestampType()),
    T.StructField("timetamp_end", T.TimestampType()),
    T.StructField("country_name", T.StringType()),
    T.StructField("topic_name_exp", T.StringType()),
    T.StructField("topic_sum", T.IntegerType()),
])


def task_a_3_step_0(json_parsed_df):
    result = json_parsed_df.withColumn('topic_name_exp', F.explode('topic_name')) \
        .withWatermark("timestamp", "1 minute").groupBy(
        F.window("timestamp", "1 hour", "1 hour"),
        'country_name',
        'topic_name_exp'
    ).agg(
        F.count('topic_name_exp').alias('topic_count')
    ).select(
        F.struct(
            col('window.end').alias("datetime_end"),
            col('country_name'),
            col('topic_name_exp'),
Example #22
 def _apply_dateutil_parse(column):
     assert len(column.columns) == 1, "Expected DataFrame with 1 column"
     col_name = column.columns[0]
     _udf = udf(parse, sparktypes.TimestampType())
     return column.withColumn(col_name, _udf(col_name))
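
# A minimal usage sketch (not from the original source): the helper above takes a
# single-column DataFrame of date strings and returns it with that column parsed
# to timestamps. It relies on imports the excerpt does not show -- presumably
# dateutil's parse plus Spark's udf and types modules.
from dateutil.parser import parse
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pyspark.sql.types as sparktypes

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame([("2020-08-10 14:30:00",)], ["event_time"])
parsed = _apply_dateutil_parse(raw)  # "event_time" becomes TimestampType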
Example #23
# Read in raw data
impressionFields = [
    T.StructField('advertiserID', T.IntegerType(), False),
    T.StructField('domain', T.StringType(), False),
    T.StructField('viewable', T.BooleanType(), False),
    T.StructField('city', T.StringType(), False),
    T.StructField('mobileDevice', T.StringType(), False),
    T.StructField('country', T.StringType(), False),
    T.StructField('sellerPrice', T.IntegerType(), False),
    T.StructField('userID', T.IntegerType(), False),
    T.StructField('impressionID', T.IntegerType(), False),
    T.StructField('postalCode', T.StringType(), False),
    T.StructField('carrier', T.StringType(), False),
    T.StructField('eventType', T.StringType(), False),
    T.StructField('lineItemID', T.IntegerType(), False),
    T.StructField('time', T.TimestampType(), False),
    T.StructField('duration', T.IntegerType(), False),
    T.StructField('browser', T.StringType(), False),
    T.StructField('os', T.StringType(), False),
    T.StructField('audienceSegmentID', T.IntegerType(), False)
]
impressionSchema = T.StructType(impressionFields)

currentDate = dt.now().strftime('%Y-%m-%d')
impressionsFiles = os.path.join('gs://sgupta_doubleclick', 'staging', '*',
                                currentDate, 'impressions.csv')

rawImpressions = spark.read.load(impressionsFiles, format="csv", header=True, schema=impressionSchema)\
  .withColumn('filename', F.input_file_name())\
  .withColumn('clientID',F.regexp_extract('filename','.*staging/([0-9]*)/.*',1).cast('int'))\
  .withColumn('date',F.regexp_extract('filename','.*staging/[0-9]*/([0123456789-]*)/.*',1).cast('date'))\
Example #24
from pyspark.sql import types

# base type
DType = types.DataType
# individual types
String = types.StringType()
Date = types.DateType()
Datetime = types.TimestampType()
# numeric types
Float = types.FloatType()
Double = types.DoubleType()
Byte = types.ByteType()
Short = types.ShortType()
Integer = types.IntegerType()
Long = types.LongType()
# groups
Floats = (Float, Double)
Integers = (Byte, Short, Integer, Long)
Numerics = Floats + Integers
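
# A small usage sketch (not from the original source), using only the aliases
# defined above:
example_schema = types.StructType([
    types.StructField("name", String),
    types.StructField("created", Datetime),
    types.StructField("score", Double),
])
numeric_fields = [f.name for f in example_schema.fields if f.dataType in Numerics]
# -> ['score']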
Example #25
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LinearSVC
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.appName('commuter').getOrCreate()
assert spark.version >= '2.4'  # make sure we have Spark 2.4+
spark.sparkContext.setLogLevel('WARN')
#sc = spark.sparkContext

amenity_schema = types.StructType([
    types.StructField('lat', types.DoubleType(), nullable=False),
    types.StructField('lon', types.DoubleType(), nullable=False),
    types.StructField('timestamp', types.TimestampType(), nullable=False),
    types.StructField('amenity', types.StringType(), nullable=False),
    types.StructField('name', types.StringType(), nullable=True),
    types.StructField('tags',
                      types.MapType(types.StringType(), types.StringType()),
                      nullable=False),
])


def main(inputs):
    poi = spark.read.json(inputs, schema=amenity_schema)
    # poi.show()
    stage1 = VectorAssembler(inputCols=['lon', 'lat'], outputCol='features')
    stage2 = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    stage3 = KMeans().setK(7).setFeaturesCol(
        "scaledFeatures").setPredictionCol('prediction')
Example #26
        return _Unknown(tpe)
    else:
        return _Scalar(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, "str", "string"],
    types.BinaryType(): [bytes],
    types.ByteType(): [np.int8, "int8", "byte"],
    types.ShortType(): [np.int16, "int16", "short"],
    types.IntegerType(): [int, "int", np.int, np.int32],
    types.LongType(): [np.int64, "int64", "long", "bigint"],
    types.FloatType(): [float, "float", np.float],
    types.DoubleType(): [np.float64, "float64", "double"],
    types.TimestampType(): [datetime.datetime, np.datetime64],
    types.DateType(): [datetime.date],
    types.BooleanType(): [bool, "boolean", "bool", np.bool],
    types.ArrayType(types.StringType()): [],
}


def _build_type_dict():
    return dict([(other_type, spark_type) for (spark_type, l) in _base.items()
                 for other_type in l] + [(spark_type, spark_type)
                                         for (spark_type, _) in _base.items()])


def _build_py_type_dict():
    return dict([(spark_type, l[0]) for (spark_type, l) in _base.items()
                 if len(l) > 0])
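
# A small usage sketch (not from the original source): the derived dictionaries
# map assorted aliases to Spark types, and Spark types back to Python types.
_to_spark = _build_type_dict()
_to_py = _build_py_type_dict()
assert _to_spark["long"] == types.LongType()
assert _to_spark[np.int8] == types.ByteType()
assert _to_py[types.TimestampType()] is datetime.datetime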
Example #27
def process_log_data(spark, input_data, output_data):
    
    """ 
        It creates time, user dimensions and songplays fact 
        by processing source log dataset
        and writes these result data as parquet files in S3. 
    
        Parameters: 
           spark: spark session
           input_data: root path of source data
           output_data: root path of target 
    
    """
    # turn Python function into Pyspark function    
    timestamp_udf = F.udf(get_timestamp, T.TimestampType())
    
    # get filepath to log data file
    log_data_path = input_data + "log_data/*/*/*.json"
    print(log_data_path)
    
    # read log data file
    log_data_df = spark.read.json(log_data_path)
        
    # filter by actions for song plays
    log_data_df = log_data_df.filter(log_data_df.page == 'NextSong')
    
    #convert unixtimestamp "ts" column to timestamp
    log_data_df= log_data_df.withColumn('timestamp', timestamp_udf("ts"))
    
   
    # extract columns for users table
    
    # Log data has duplicate userIds, but we want only one row per user,
    # so we keep each user's most recent record using a row_number window.
    
    user_columns= ['userId', 'firstName', 'lastName', 'gender', 'level', 'timestamp']
    
    ## remove duplicates using row_number.
    user_df_rn = log_data_df.select(*user_columns)\
            .withColumn('row_num', F.row_number().over(Window.partitionBy("userId").orderBy(F.desc("timestamp"))))
    
    users_table_df = user_df_rn.filter((user_df_rn.row_num)==1).select(*user_columns[0:-1])

    
    # write users table to parquet files
    users_table_df.coalesce(5).write.parquet(output_data+'users_table.parquet', mode='overwrite')

    
    # extract columns to create time table
    time_table_df= log_data_df.select( log_data_df.timestamp.alias('start_time'), 
                                      F.hour(log_data_df.timestamp).alias('hour'),
                                      F.dayofmonth(log_data_df.timestamp).alias('day'),
                                      F.weekofyear(log_data_df.timestamp).alias('week'),
                                      F.month(log_data_df.timestamp).alias('month') , 
                                      F.year(log_data_df.timestamp).alias('year'), 
                                      F.dayofweek(log_data_df.timestamp).alias('weekday')).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table_df.write.partitionBy("year","month").parquet(output_data+'time_table.parquet', mode='overwrite')

    #get full path 
    song_data=input_data + 'song_data/*/*/*/*.json'    

    # read song data file
    song_data_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table 
    
    songplays_table_df= log_data_df.join\
                        (song_data_df, (log_data_df.artist == song_data_df.artist_name) 
                                         & (log_data_df.song== song_data_df.title)
                                         & ( log_data_df.length== song_data_df.duration), how='inner')\
                        .select(log_data_df.timestamp , log_data_df.userId, 
                                log_data_df.level, song_data_df.song_id, song_data_df.artist_id, 
                                log_data_df.sessionId, log_data_df.location, 
                                log_data_df.userAgent  )\
                        .withColumn('year',F.year(log_data_df.timestamp))\
                        .withColumn('month',F.month(log_data_df.timestamp))

    # write songplays table to parquet files partitioned by year and month
    songplays_table_df.write.partitionBy("year", "month").parquet(output_data+'songplays_table.parquet', mode='overwrite')
    
    print(users_table_df.count())
    print(time_table_df.count())
    print(songplays_table_df.count())
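
# The excerpt above references get_timestamp without showing its definition; a
# hedged sketch of what such a helper might look like (assuming the "ts" field
# holds milliseconds since the epoch):
from datetime import datetime

def get_timestamp(ts_ms):
    return datetime.fromtimestamp(ts_ms / 1000.0)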
Example #28
import datetime
from pyspark.sql import functions as F
from pyspark.sql import types as T

complex_event_expression = (F.when(
    F.col("nested.input_key_1").isNotNull(),
    F.col("nested.input_key_1") / 1000).otherwise(
        F.col("nested.input_key_2") / 1000).cast(T.TimestampType()).cast(
            T.DateType()))

# fmt: off
fixtures_for_spark_sql_object = [
    # input_value_1           # input_value_2        # mapper function                   # expected_value
    ("place_holder", "place_holder", F.current_date(), datetime.date.today()),
    ("place_holder", "place_holder", F.current_timestamp(),
     datetime.datetime.now()),
    ("some string", "place_holder", F.col("nested.input_key_1"),
     "some string"),
    ("some string to count", "place_holder", F.length("nested.input_key_1"),
     20),
    ("some string", None, F.coalesce("nested.input_key_1",
                                     "nested.input_key_2"), "some string"),
    (None, "some other string",
     F.coalesce("nested.input_key_1",
                "nested.input_key_2"), "some other string"),
    (1597069446, "placeholder",
     (F.col("nested.input_key_1").cast(T.TimestampType()).cast(T.DateType())),
     datetime.date(2020, 8, 10)),
    (1597069446000, None, complex_event_expression, datetime.date(2020, 8,
                                                                  10)),
    (None, 1597069446000, complex_event_expression, datetime.date(2020, 8,
Example #29
#READ STRATEGIES
strategies = sqlContext.read.json(sys.argv[2])
#filter strategy
strategies = strategies.filter(
    col("rationality") == float(sys.argv[3])).filter(
        col("num_of_hp") == float(sys.argv[4]))
#add unique name
strategies_full = strategies.withColumn(
    "strategyID",
    concat(col("date"), lit("_"), col("num_of_hp"), lit("_"),
           col("rationality")))

#cast date to timestamp
strategies = strategies_full.withColumn(
    'date',
    unix_timestamp('date', 'yyyy-MM-dd').cast(T.TimestampType()))
#unfold the structs
strategies = strategies.select("date", "strategyID",
                               explode("stg")).select(
                                   "date", "strategyID", "col.port",
                                   "col.prob")
#add new column with date when the strategy should be applied
strategies = strategies.withColumn('application_date',
                                   F.date_add(strategies['date'], 1)).select(
                                       "application_date", "strategyID",
                                       "port", "prob")
strategies = strategies.withColumn(
    'application_date',
    unix_timestamp('application_date',
                   'yyyy-MM-dd hh:mm:ss').cast(T.TimestampType()))
def main():
    # main logic starts here

    #Read bike_trips_data
    data = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='transaction_data2', keyspace="bike_share_analytics").load()
    data = data.dropna()

    #read bike station data
    df_station = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='station_data', keyspace="bike_share_analytics").load()
    df_station = df_station.select("id", "weather_station_id")

    #join data
    cond = [data['start station id'] == df_station.id]
    df_combined = data.join(df_station, cond, how='inner')

    df_combined = df_combined.select(
        'tripduration', 'starttime', 'stoptime', 'start station id',
        'start station name', 'start station latitude',
        'start station longitude', 'end station id', 'end station name',
        'end station latitude', 'end station longitude', 'bikeid', 'usertype',
        'birth year', 'gender', 'weather_station_id')

    #Create new columns
    df_combined = df_combined.withColumn(
        'startyear', functions.year(df_combined['starttime']))
    df_combined = df_combined.withColumn(
        'dayofyear', functions.dayofyear(df_combined['starttime']))

    ##Adding weather data to trips data
    df_weather = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='weather_data1', keyspace="bike_share_analytics").load()

    df_weather = df_weather.withColumn(
        "timestamp",
        functions.unix_timestamp('time', "yyyy-MM-dd HH:mm:ss").cast(
            types.TimestampType()))

    df_weather = df_weather.withColumn('hour_weather',
                                       functions.hour(df_weather['timestamp']))
    df_weather = df_weather.withColumn(
        'month_weather', functions.month(df_weather['timestamp']))
    df_weather = df_weather.withColumn('year_weather',
                                       functions.year(df_weather['timestamp']))
    df_weather = df_weather.withColumn(
        'dayofyear_weather', functions.dayofyear(df_weather['timestamp']))
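
    # NOTE (not in the original source): starthour, and the age/startmonth/
    # stophour/stopmonth/weekday/weekend columns selected further below, are not
    # derived in this excerpt; they are assumed to already exist in the source
    # Cassandra table.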

    cond1 = [
        df_combined.weather_station_id == df_weather.id,
        df_combined.startyear == df_weather.year_weather,
        df_combined.dayofyear == df_weather.dayofyear_weather,
        df_combined.starthour == df_weather.hour_weather
    ]

    df_combined_new = df_combined.join(df_weather, cond1, how="inner")

    df_combined_new = df_combined_new.select(
        'tripduration', 'starttime', 'stoptime', 'start station id',
        'start station name', 'start station latitude',
        'start station longitude', 'end station id', 'end station name',
        'end station latitude', 'end station longitude', 'bikeid', 'usertype',
        'birth year', 'gender', 'age', 'dayofyear', 'starthour', 'startmonth',
        'startyear', 'stophour', 'stopmonth', 'weekday', 'weekend',
        'temperature', 'precipitation', 'humidity', 'dewpoint', 'windspeed')

    #pushing data to cassandra
    df_combined_new.write.format("org.apache.spark.sql.cassandra") \
    .options(table='transaction_data5', keyspace='bike_share_analytics').save()