Example #1
from pyspark import SparkContext

from parsers.parse_noaa import parse_line


def get_weather_stations(path):
    # Read raw NOAA records from S3, parse each line into a dict, and keep
    # only US stations that resolve to a state. add_state is assumed to be
    # defined alongside parse_line.
    sc = SparkContext('local')
    rdd = sc.textFile('s3a://{path}'.format(path=path))\
        .map(parse_line)\
        .map(add_state)\
        .filter(lambda x: x.get('country') == 'US')\
        .filter(lambda x: x.get('us_state') is not None)
    return rdd
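A quick usage sketch for get_weather_stations; the bucket and prefix below are taken from the S3 listing later on this page, but treating them as a readable s3a path is an assumption.

# Hypothetical invocation: the argument is '<bucket>/<key or prefix>' under s3a://.
us_stations = get_weather_stations('paulhtremblay/noaa/data/1990')
print(us_stations.take(5))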
Example #2
from pyspark import SparkContext
from pyspark.sql import SQLContext

from parsers.parse_noaa import parse_line


def count_stations(path):
    # count_states_non is assumed to be defined elsewhere and to return a
    # number per record; the print below is the mean of those values.
    sc = SparkContext('local')
    rdd = sc.textFile(path)\
        .map(parse_line)\
        .map(add_state)\
        .map(count_states_non)
    rdd.persist()
    print(rdd.reduce(lambda accum, n: accum + n) / rdd.count())
    return rdd
def to_df(path):
    # Keep parsed records whose coordinates fall inside the continental-US
    # bounding box, then pull the distinct station identifiers with Spark SQL.
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    rdd = sc.textFile('s3a://{path}'.format(path=path))\
        .map(parse_line)\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') is not None)\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') < -67)\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') > -125)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') is not None)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') < 48)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') > 25)
    df = rdd.toDF()
    df.registerTempTable('my_table')
    df2 = sqlContext.sql("""
        select distinct
            fixed_weather_station_usaf_master_station_catalog_identifier
        from my_table
    """)
    return df2
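One way to inspect what to_df returns; the path is the same placeholder as above, and show() simply prints a sample of the distinct station identifiers. Note that each of these helpers builds its own SparkContext('local'), so run them in separate sessions.

# Hypothetical usage: peek at distinct station IDs inside the continental-US box.
station_ids = to_df('paulhtremblay/noaa/data/1990')
station_ids.show(10)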
import pprint
from io import BytesIO
from gzip import GzipFile

import boto3

from parsers.parse_noaa import parse_line
from parsers.stations_us_dict import d as us_stations_dict

pp = pprint.PrettyPrinter(indent=4)

s3 = boto3.resource('s3')
bucket = 'paulhtremblay'
my_bucket = s3.Bucket(bucket)

us_paths = []

for i in my_bucket.objects.filter(Prefix='noaa/data/1990'):
    # Each object is a gzipped file of NOAA records; parse its first line to
    # find the station identifier and keep the key if the station is in the US.
    content = s3.Object(bucket, i.key).get()
    bytestream = BytesIO(content['Body'].read())
    lines = GzipFile(None, 'rb',
                     fileobj=bytestream).read().decode('utf-8').split('\n')
    the_dict = parse_line(lines[0])
    station_id = the_dict.get(
        'fixed_weather_station_usaf_master_station_catalog_identifier')
    if us_stations_dict.get(station_id, {}).get('ctry') == "US":
        us_paths.append(i.key)
with open('../data/us_stations_keys.txt', 'w') as write_obj:
    for i in us_paths:
        write_obj.write('{p}\n'.format(p=i))
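The saved key list can later feed the Spark helpers above; a sketch of one way to do that, assuming each saved key is relative to the 'paulhtremblay' bucket, as in the loop that produced it.

# Read the saved S3 keys back and hand one of them to get_weather_stations.
with open('../data/us_stations_keys.txt') as read_obj:
    us_keys = [line.strip() for line in read_obj if line.strip()]
# e.g. get_weather_stations('paulhtremblay/' + us_keys[0])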
Example #5
#!/usr/bin/env python
"""mapper.py"""

import sys
import pprint
pp = pprint.PrettyPrinter(indent=4)

from parsers.parse_noaa import parse_line
from parsers.stations_us_dict import d as us_stations_dict

for line in sys.stdin:
    line = line.strip()
    if line == '':
        continue
    the_dict = parse_line(line)
    temp = the_dict['air_temperature_observation_air_temperature']
    state = us_stations_dict.get(
        str(the_dict[
            'fixed_weather_station_usaf_master_station_catalog_identifier']),
        {}).get('st')
    if not state or state in ('AK', 'HI') or temp is None:
        continue
    print('{state}\t{temp}'.format(state=state, temp=temp))
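The reduce side is not part of the original snippet. The sketch below shows one plausible reducer for the mapper's <state>\t<temperature> stream; the per-state mean is an assumption about the intended aggregation, and it relies on Hadoop Streaming delivering the mapper output grouped by key.

#!/usr/bin/env python
"""reducer.py -- illustrative sketch only, not from the original."""

import sys

current_state = None
total = 0.0
count = 0

for line in sys.stdin:
    line = line.strip()
    if line == '':
        continue
    state, temp = line.split('\t')
    if current_state is not None and state != current_state:
        # Keys arrive grouped, so a new state means the previous one is complete.
        print('{state}\t{mean}'.format(state=current_state, mean=total / count))
        total = 0.0
        count = 0
    current_state = state
    total += float(temp)
    count += 1

if current_state is not None and count:
    print('{state}\t{mean}'.format(state=current_state, mean=total / count))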
Example #6
def find_consecutive_temps(x):
    # x is one (state, iterable of (temperature, observation datetime)) pair
    # produced by groupByKey below; the readings are assumed to arrive in the
    # order they should be compared.
    l = list(x[1])
    current = None
    counter = 0
    the_max = 0
    for i in l:
        if i[0] is None:
            continue
        counter += 1
        if not (current is None or float(i[0]) <= current):
            if counter > the_max:
                the_max = counter
            counter = 0
        current = float(i[0])
    return (x[0], the_max)


import datetime

from pyspark import SparkConf, SparkContext

from parsers import stations_us_dict
from parsers.parse_noaa import parse_line

# get_us_state is assumed to be defined earlier in the script; a sketch
# follows the job below.
conf = SparkConf().setAppName('Summer_Course').setMaster('local')
path = '/home/paul/Documents/projects/big_data_course/workspace/us_stations_90_sample_small.txt'
sc = SparkContext(conf=conf)
rdd = sc.textFile('file://{path}'.format(path=path))\
    .map(parse_line)\
    .filter(get_us_state)\
    .filter(lambda x: x.get('point_observation_date_time') > datetime.datetime(1990, 1, 1))\
    .filter(lambda x: x.get('point_observation_date_time') < datetime.datetime(1990, 2, 1))\
    .map(lambda x: (stations_us_dict.d.get(x.get('fixed_weather_station_usaf_master_station_catalog_identifier')).get('st'),
                    (x.get('air_temperature_observation_air_temperature'),
                     x.get('point_observation_date_time'))))\
    .groupByKey()\
    .map(find_consecutive_temps)\
    .saveAsTextFile('file:///home/paul/Documents/projects/big_data_course/workspace/consecutive_temps_out')
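get_us_state is referenced by the job above but never defined in the snippet; a minimal sketch of what such a filter might look like, reusing the stations_us_dict lookup and the 'ctry'/'st' fields seen in the earlier examples (the exact check is an assumption).

def get_us_state(record):
    # Hypothetical filter: keep parsed records whose station maps to a US
    # entry with a state code in stations_us_dict.
    station = stations_us_dict.d.get(
        record.get('fixed_weather_station_usaf_master_station_catalog_identifier'), {})
    return station.get('ctry') == 'US' and bool(station.get('st'))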