else:
    data_to_proc = arg

# for running spark stand-alone (not spark interactive)
conf = (SparkConf()
        # .setMaster("local")
        # .set("spark.executor.memory", "1g")
        .setAppName("mysparkprocingjob"))
sc = SparkContext(conf=conf)
sc.addPyFile('../envir_vars.py')
sc_sql = SQLContext(sc)

data_path_in_hdfs = 'waze_data/topics/*/*'
# data_path_in_hdfs = 'waze/topics/*/*'
hdfs_in_path = hdfs_data_path.format(data_path_in_hdfs)
df = sc_sql.read.load(hdfs_in_path)

# What each cleaned up row looks like:
# [Row(city=u'atlanta', datetime=u'2016-01-25 08:35', lat=33.764181, lng=-84.371954,
#      numOfThumbsUp=1, subtype=u'POLICE_VISIBLE', time_stamp=1453710945, type=u'POLICE',
#      weekday=u'Monday')]

# do time based stuff the "spark way":
# from pyspark.sql import functions as sql_funcs
# hours = df.select(sql_funcs.hour(df.datetime))


def make_data_for_cassandra(row):
    year_month_day, hour_minute = row.datetime.split(' ')
    hour = int(hour_minute.split(':')[0])
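
# A runnable sketch of the commented-out "spark way" above, assuming the
# datetime strings (e.g. '2016-01-25 08:35') cast cleanly to timestamps so
# Spark's built-in hour() applies; the groupBy is just an illustrative use
# of the result, not part of the original script.
from pyspark.sql import functions as sql_funcs

hours = df.select(sql_funcs.hour(df.datetime).alias('hour'))
alerts_per_hour = hours.groupBy('hour').count()
# alerts_per_hour.show()  # e.g. how many alerts fall in each hour of the day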
sc = SparkContext(conf=conf)
sc.addPyFile('../envir_vars.py')
sc_sql = SQLContext(sc)

cities = envir_vars.cities_lat_and_long.keys()
# cities = ['san-fran-small-wo-newline']  # for testing/debugging
# topic = cities[0]

for city in cities:
    topic = city
    # For example: camus/topics/san_fransisco/hourly/2016/01/21/10
    data_path_in_hdfs = 'camus/topics/{}/*/*/*/*/*'.format(topic)
    # data_path_in_hdfs = 'testing/{}/part-r-00184-f5234aaf-93dc-412a-8cea-ca6354e1f72f.gz.parquet'.format(topic)
    hdfs_in_path = hdfs_data_path.format(data_path_in_hdfs)
    json_data = sc_sql.read.json(hdfs_in_path)
    # json_data.printSchema()

    def get_alerts(row):
        alerts = []
        for alert in row.alerts:
            # save back to hdfs below, then read those hdfs files back in
            # later, after preprocessing; convert to the proper datetime
            # format for cassandra:
            # https://docs.datastax.com/en/cql/3.0/cql/cql_reference/timestamp_type_r.html
            # yyyy-mm-dd HH:mm:ssZ
            # https://pymotw.com/2/time/#working-with-time-zones
            data_timezone = time_zones[city]
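
# A minimal, self-contained sketch of the timezone conversion the links above
# describe, assuming pytz is installed. The helper name and the
# epoch-milliseconds input are illustrative assumptions, not part of the
# original script.
import pytz
from datetime import datetime


def to_cassandra_timestamp(epoch_millis, tz_name):
    # Cassandra accepts timestamps formatted as yyyy-mm-dd HH:mm:ssZ.
    utc_dt = datetime.utcfromtimestamp(epoch_millis / 1000.0).replace(tzinfo=pytz.utc)
    local_dt = utc_dt.astimezone(pytz.timezone(tz_name))
    return local_dt.strftime('%Y-%m-%d %H:%M:%S%z')

# Example: to_cassandra_timestamp(1453710945000, 'US/Pacific')
# -> '2016-01-25 00:35:45-0800'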