from pyspark import SparkContext
from pyspark.sql import SQLContext

from parsers.parse_noaa import parse_line


def get_weather_stations(path):
    """Return an RDD of parsed NOAA records for US stations with a known state."""
    # add_state is assumed to be defined elsewhere; it presumably enriches each
    # record with the 'country' and 'us_state' fields used in the filters below.
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    rdd = sc.textFile('s3a://{path}'.format(path=path))\
        .map(lambda x: parse_line(x))\
        .map(lambda x: add_state(x))\
        .filter(lambda x: x.get('country') == 'US')\
        .filter(lambda x: x.get('us_state') is not None)
    return rdd
from pyspark import SparkContext

from parsers.parse_noaa import parse_line


def count_stations(path):
    """Print the mean of the values produced by count_states_non and return the RDD."""
    # add_state and count_states_non are assumed to be defined elsewhere;
    # count_states_non presumably maps each record to a number.
    sc = SparkContext('local')
    rdd = sc.textFile(path)\
        .map(lambda x: parse_line(x))\
        .map(lambda x: add_state(x))\
        .map(count_states_non)
    rdd.persist()
    # sum of the mapped values divided by their count, i.e. their mean
    print(rdd.reduce(lambda accum, n: accum + n) / rdd.count())
    return rdd
from pyspark import SparkContext
from pyspark.sql import SQLContext

from parsers.parse_noaa import parse_line


def to_df(path):
    """Return the distinct USAF station ids observed inside the contiguous US bounding box."""
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    # keep only observations with coordinates roughly inside the contiguous US
    # (longitude between -125 and -67, latitude between 25 and 48)
    rdd = sc.textFile('s3a://{path}'.format(path=path))\
        .map(lambda x: parse_line(x))\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') is not None)\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') < -67)\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') > -125)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') is not None)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') < 48)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') > 25)
    df = rdd.toDF()
    df.registerTempTable('my_table')
    df2 = sqlContext.sql("""
        select distinct fixed_weather_station_usaf_master_station_catalog_identifier
        from my_table
        """)
    return df2
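A minimal, hypothetical driver showing how to_df might be invoked; the bucket path below is a placeholder, not one from the original code.

if __name__ == '__main__':
    # placeholder S3 path -- substitute a real bucket/prefix
    stations = to_df('my-bucket/noaa/data/1990/')
    stations.show()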
import pprint
from io import BytesIO
from gzip import GzipFile

import boto3

from parsers.parse_noaa import parse_line
from parsers.stations_us_dict import d as us_stations_dict

pp = pprint.PrettyPrinter(indent=4)

s3 = boto3.resource('s3')
bucket = 'paulhtremblay'
my_bucket = s3.Bucket(bucket)

# Scan the gzipped 1990 NOAA files and keep the S3 keys whose first record
# comes from a US station, then write those keys to a local file.
us_paths = []
for i in my_bucket.objects.filter(Prefix='noaa/data/1990'):
    content = s3.Object(bucket, i.key).get()
    bytestream = BytesIO(content['Body'].read())
    lines = GzipFile(None, 'rb', fileobj=bytestream).read().decode('utf-8').split('\n')
    the_dict = parse_line(lines[0])
    station_id = the_dict.get(
        'fixed_weather_station_usaf_master_station_catalog_identifier')
    if us_stations_dict.get(station_id, {}).get('ctry') == "US":
        us_paths.append(i.key)

with open('../data/us_stations_keys.txt', 'w') as write_obj:
    for i in us_paths:
        write_obj.write('{p}\n'.format(p=i))
#!/usr/bin/env python
"""mapper.py

Hadoop Streaming mapper: emit "state<TAB>temperature" for every usable
observation from a continental US station (Alaska and Hawaii are excluded).
"""
import sys
import pprint

from parsers.parse_noaa import parse_line
from parsers.stations_us_dict import d as us_stations_dict

pp = pprint.PrettyPrinter(indent=4)

for line in sys.stdin:
    line = line.strip()
    if line == '':
        continue
    the_dict = parse_line(line)
    temp = the_dict['air_temperature_observation_air_temperature']
    state = us_stations_dict.get(
        str(the_dict[
            'fixed_weather_station_usaf_master_station_catalog_identifier']),
        {}).get('st')
    if state in ('', None, 'AK', 'HI') or temp is None:
        continue
    print('{state}\t{temp}'.format(state=state, temp=temp))
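The reducer side of this streaming job is not shown here. Below is a minimal sketch, assuming the goal is simply the maximum temperature per state; the file name reducer.py and that aggregation are assumptions, not part of the original code.

#!/usr/bin/env python
"""reducer.py -- hypothetical companion to mapper.py (assumed, not from the original)."""
import sys

# Hadoop Streaming delivers mapper output sorted by key, so all lines for one
# state arrive together; track the running maximum and emit it on key change.
current_state = None
current_max = None

for line in sys.stdin:
    line = line.strip()
    if line == '':
        continue
    state, temp = line.split('\t', 1)
    try:
        temp = float(temp)
    except ValueError:
        continue
    if state == current_state:
        if temp > current_max:
            current_max = temp
    else:
        if current_state is not None:
            print('{state}\t{temp}'.format(state=current_state, temp=current_max))
        current_state = state
        current_max = temp

if current_state is not None:
    print('{state}\t{temp}'.format(state=current_state, temp=current_max))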
import datetime

from pyspark import SparkConf, SparkContext

from parsers.parse_noaa import parse_line
from parsers import stations_us_dict


def find_consecutive_temps(x):
    """Given (state, grouped (temp, timestamp) pairs), return (state, longest run length)."""
    # Sorting the grouped observations chronologically is an assumption here,
    # so that "consecutive" means consecutive in time.
    l = sorted(x[1], key=lambda pair: pair[1])
    current = None
    counter = 0
    the_max = 0
    for i in l:
        if i[0] is None:
            continue
        counter += 1
        # reset the run whenever the temperature rises above the previous reading
        if not (current is None or float(i[0]) <= current):
            if counter > the_max:
                the_max = counter
            counter = 0
        current = float(i[0])
    return (x[0], the_max)


conf = SparkConf().setAppName('Summer_Course').setMaster('local')
path = '/home/paul/Documents/projects/big_data_course/workspace/us_stations_90_sample_small.txt'
sc = SparkContext(conf=conf)

# get_us_state is assumed to be defined elsewhere; it keeps only records that
# map to a US station with a state (see the sketch after this block).
rdd = sc.textFile('file://{path}'.format(path=path))\
    .map(lambda x: parse_line(x))\
    .filter(get_us_state)\
    .filter(lambda x: x.get('point_observation_date_time') > datetime.datetime(1990, 1, 1))\
    .filter(lambda x: x.get('point_observation_date_time') < datetime.datetime(1990, 2, 1))\
    .map(lambda x: (
        stations_us_dict.d.get(x.get('fixed_weather_station_usaf_master_station_catalog_identifier')).get('st'),
        (x.get('air_temperature_observation_air_temperature'),
         x.get('point_observation_date_time'))))\
    .groupByKey()\
    .map(find_consecutive_temps)\
    .saveAsTextFile('file:////home/paul/Documents/projects/big_data_course/workspace/consecutive_temps_out')
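get_us_state is not defined in this snippet. One plausible sketch, assuming the stations dictionary carries the same 'ctry' and 'st' fields used in the earlier scripts, could look like this; it is an assumption, not the original helper.

def get_us_state(record):
    # hypothetical helper (not from the original code): keep records whose
    # station id maps to a US entry with a state in us_stations_dict
    station = stations_us_dict.d.get(
        record.get('fixed_weather_station_usaf_master_station_catalog_identifier'), {})
    return station.get('ctry') == 'US' and bool(station.get('st'))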