Example #1
else:
    data_to_proc = arg

# for running Spark stand-alone (not the interactive shell)
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

conf = (
    SparkConf()
    # .setMaster("local")
    # .set("spark.executor.memory", "1g")
    .setAppName("mysparkprocingjob"))
sc = SparkContext(conf=conf)
sc.addPyFile('../envir_vars.py')  # ship the project-local envir_vars module to the executors
sc_sql = SQLContext(sc)

data_path_in_hdfs = 'waze_data/topics/*/*'
# data_path_in_hdfs = 'waze/topics/*/*'
# hdfs_data_path is a '{}' template for the full HDFS URI, defined in the
# truncated part of the source (e.g. something like 'hdfs://<namenode>:9000/{}')
hdfs_in_path = hdfs_data_path.format(data_path_in_hdfs)
df = sc_sql.read.load(hdfs_in_path)  # .load() reads parquet by default

# What each cleaned-up row looks like:
# [Row(city=u'atlanta', datetime=u'2016-01-25 08:35', lat=33.764181, lng=-84.371954,
#      numOfThumbsUp=1, subtype=u'POLICE_VISIBLE', time_stamp=1453710945, type=u'POLICE',
#      weekday=u'Monday')]

# alternatively, do the time-based extraction the "Spark way", with the
# built-in SQL functions:
# from pyspark.sql import functions as sql_funcs
# hours = df.select(sql_funcs.hour(df.datetime))
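
# A minimal sketch of that "Spark way" (an assumption built on the commented
# lines above, not part of the original snippet): hour() is a built-in Spark
# SQL function, and these transformations are lazy, so no job runs until an
# action such as show() or collect() is called.
from pyspark.sql import functions as sql_funcs
counts_by_hour = (df
                  .select(sql_funcs.hour(df.datetime).alias('hour'))
                  .groupBy('hour')
                  .count())
# counts_by_hour.show()   # action: triggers the actual computation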


def make_data_for_cassandra(row):
    # split 'YYYY-MM-DD HH:MM' into date and time parts, then pull out the hour
    year_month_day, hour_minute = row.datetime.split(' ')
    hour = int(hour_minute.split(':')[0])
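
# (the function body is truncated in the source; presumably the rows are then
#  mapped with it, along these hypothetical lines)
# cassandra_rows = df.map(make_data_for_cassandra)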
Example #2
# the opening of this snippet is truncated in the source; the setup below is
# reconstructed to match Example #1
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
import envir_vars  # project-local module defining cities_lat_and_long

conf = SparkConf().setAppName("mysparkprocingjob")
sc = SparkContext(conf=conf)
sc.addPyFile('../envir_vars.py')
sc_sql = SQLContext(sc)


cities = envir_vars.cities_lat_and_long.keys()
# cities = ['san-fran-small-wo-newline']    # for testing/debugging
# topic = cities[0]

for city in cities:
    topic = city

    # For example:  camus/topics/san_fransisco/hourly/2016/01/21/10
    data_path_in_hdfs = 'camus/topics/{}/*/*/*/*/*'.format(topic)
    # data_path_in_hdfs = 'testing/{}/part-r-00184-f5234aaf-93dc-412a-8cea-ca6354e1f72f.gz.parquet'.format(topic)

    hdfs_in_path = hdfs_data_path.format(data_path_in_hdfs)

    json_data = sc_sql.read.json(hdfs_in_path)
    # json_data.printSchema()


    def get_alerts(row):
        alerts = []
        for alert in row.alerts:
            # the cleaned-up rows are saved back to HDFS below and read back
            # in later, after this preprocessing

            # convert to a proper datetime string for Cassandra:
            # https://docs.datastax.com/en/cql/3.0/cql/cql_reference/timestamp_type_r.html
            # yyyy-mm-dd HH:mm:ssZ
            # https://pymotw.com/2/time/#working-with-time-zones
            data_timezone = time_zones[city]  # time_zones: city -> tz name, defined in the truncated part of the source
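
# A hedged sketch of the conversion those links describe: turning an epoch
# timestamp into Cassandra's 'yyyy-mm-dd HH:mm:ssZ' form in a city's local
# time zone. pytz and the zone name below are illustrative assumptions, not
# part of the original (truncated) snippet.
import datetime
import pytz

def to_cassandra_timestamp(epoch_seconds, tz_name):
    tz = pytz.timezone(tz_name)                                   # e.g. 'America/New_York'
    local_dt = datetime.datetime.fromtimestamp(epoch_seconds, tz)
    return local_dt.strftime('%Y-%m-%d %H:%M:%S%z')               # e.g. '2016-01-25 08:35:45-0500'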