Example #1
def lambda_handler(event, context):
    """
    lambda_handler(event, context):

    This function creates .json, .pickle, and .png files from a netCDF file. The netCDF file
    comes from an OPeNDAP URL (contained within the event parameter object).
    -----------------------------------------------------------------------
    Inputs:

    event: AWS Lambda uses this parameter to pass in event data to the handler. 
    This parameter is usually of the Python dict type. It can also be list, str, int, float, or NoneType type.
    
    context: AWS Lambda uses this parameter to provide runtime information to your handler. 
    This parameter is of the LambdaContext type.
    -----------------------------------------------------------------------
    Output: .json, .pickle, and .png files are saved to S3
    -----------------------------------------------------------------------
    Author: Michael Christensen
    Date Modified: 10/08/2018
    """

    AWS_BUCKET_NAME = 'oceanmapper-data-storage'
    TOP_LEVEL_FOLDER = 'GFS_DATA'
    SUB_RESOURCE = 'wind_speed'
    DATA_PREFIX = 'gfs_winds'

    # unpack event data
    url = event['url']
    model_field_time = datetime.datetime.strptime(event['forecast_time'],
                                                  '%Y%m%dT%H:%M')
    model_field_indx = event['forecast_indx']
    level = '10m'

    file = get_opendapp_netcdf(url)
    formatted_folder_date = datetime.datetime.strftime(model_field_time,
                                                       '%Y%m%d_%H')

    output_json_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date + '/' +
                        SUB_RESOURCE + '/' + level + '/json/' + DATA_PREFIX +
                        '_' + formatted_folder_date + '.json')

    output_pickle_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                          '/' + SUB_RESOURCE + '/' + level + '/pickle/' +
                          DATA_PREFIX + '_' + formatted_folder_date +
                          '.pickle')

    output_tile_scalar_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                               '/' + SUB_RESOURCE + '/' + level +
                               '/tiles/scalar/')

    output_tile_data_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                             '/' + SUB_RESOURCE + '/' + level + '/tiles/data/')

    output_info_path = TOP_LEVEL_FOLDER + '/' + formatted_folder_date + '/info.json'

    # get model origin time
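    # note: 'time' is fractional days since 0001-01-01, so the one-day subtraction
    # below aligns the dataset's day numbering with Python's date ordinal (the same
    # correction is spelled out in the forecast-info helpers of Example #3).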
    init_time = file.variables['time'][0]
    basetime_int = int(init_time)
    extra_days = init_time - basetime_int
    time_origin = (datetime.datetime.fromordinal(basetime_int) +
                   datetime.timedelta(days=extra_days) -
                   datetime.timedelta(days=1))

    lat = file.variables['lat'][:]
    lon = file.variables['lon'][:]

    u_data_raw = file.variables['ugrd10m'][
        model_field_indx, :, :]  #[time,lat,lon]
    v_data_raw = file.variables['vgrd10m'][model_field_indx, :, :]

    # ordered lat array
    lat_sort_indices = np.argsort(lat)
    lat_ordered = lat[lat_sort_indices]

    # remap and sort to -180 to 180 grid
    lon_translate = np.where(lon > 180, lon - 360.0, lon)
    lon_sort_indices = np.argsort(lon_translate)

    # ordered longitude arrays
    lon_ordered = lon_translate[lon_sort_indices]

    # rebuild u/v data with correct longitude sorting (monotonic increasing)
    u_data_cleaned = u_data_raw[lat_sort_indices, :][:, lon_sort_indices]
    v_data_cleaned = v_data_raw[lat_sort_indices, :][:, lon_sort_indices]
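    # worked example of the remap above (values illustrative): lon = [0, 90, 180, 270]
    # -> lon_translate = [0, 90, 180, -90] -> lon_sort_indices = [3, 0, 1, 2]
    # -> lon_ordered = [-90, 0, 90, 180]; applying the same column permutation to the
    # u/v arrays keeps each data column aligned with its remapped longitude.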

    # assign the raw data to a variable so we can pickle it for use with other scripts
    raw_data = {
        'lat': lat_ordered,
        'lon': lon_ordered,
        'u_vel': u_data_cleaned,
        'v_vel': v_data_cleaned,
        'datetime': formatted_folder_date,
        'level': level,
        'time_origin': time_origin
    }
    raw_data_pickle = pickle.dumps(raw_data)

    # create interpolation functions
    u_interp_func = interpolate.interp2d(lon_ordered,
                                         lat_ordered,
                                         u_data_cleaned,
                                         kind='cubic')
    v_interp_func = interpolate.interp2d(lon_ordered,
                                         lat_ordered,
                                         v_data_cleaned,
                                         kind='cubic')

    output_lat_array = np.arange(
        int(min(lat)),
        int(max(lat)) + 0.5,
        0.5)  # last point is excluded with arange (80 to -80)
    output_lon_array = np.arange(
        -180, 180.5, 0.5)  # last point is excluded with arange (-180 to 180)

    u_data_interp = u_interp_func(output_lon_array, output_lat_array)
    v_data_interp = v_interp_func(output_lon_array, output_lat_array)

    minLat = np.min(output_lat_array)
    maxLat = np.max(output_lat_array)
    minLon = np.min(output_lon_array)
    maxLon = np.max(output_lon_array)

    dx = np.diff(output_lon_array)[0]
    dy = np.diff(output_lat_array)[0]

    output_data = [
        {
            'header': {
                'parameterUnit': "m.s-1",
                'parameterNumber': 2,
                'dx': dx,
                'dy': dy,
                'parameterNumberName': "Eastward wind",
                'la1': maxLat,
                'la2': minLat,
                'parameterCategory': 2,
                'lo1': minLon,
                'lo2': maxLon,
                'nx': len(output_lon_array),
                'ny': len(output_lat_array),
                'refTime': datetime.datetime.strftime(model_field_time,
                                                      '%Y-%m-%d %H:%M:%S'),
                'timeOrigin': datetime.datetime.strftime(time_origin,
                                                         '%Y-%m-%d %H:%M:%S'),
            },
            'data': [
                float('{:.3f}'.format(el)) if np.abs(el) > 0.0001 else 0
                for el in u_data_interp[::-1].flatten().tolist()
            ]
        },
        {
            'header': {
                'parameterUnit': "m.s-1",
                'parameterNumber': 3,
                'dx': dx,
                'dy': dy,
                'parameterNumberName': "Northward wind",
                'la1': maxLat,
                'la2': minLat,
                'parameterCategory': 2,
                'lo1': minLon,
                'lo2': maxLon,
                'nx': len(output_lon_array),
                'ny': len(output_lat_array),
                'refTime': datetime.datetime.strftime(model_field_time,
                                                      '%Y-%m-%d %H:%M:%S'),
                'timeOrigin': datetime.datetime.strftime(time_origin,
                                                         '%Y-%m-%d %H:%M:%S'),
            },
            'data': [
                float('{:.3f}'.format(el)) if np.abs(el) > 0.0001 else 0
                for el in v_data_interp[::-1].flatten().tolist()
            ]
        },
    ]
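    # note: the interpolated grids are flipped ([::-1]) before flattening so the first
    # data row corresponds to la1/maxLat (the northern edge) declared in the header;
    # this header/data layout is assumed to match a grib2json-style consumer.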

    client = boto3.client('s3')
    client.put_object(Body=json.dumps(output_data),
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_json_path)
    client.put_object(Body=raw_data_pickle,
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_pickle_path)

    # save an info file for enhanced performance (get_model_field_api.py)
    client.put_object(Body=json.dumps({
        'time_origin':
        datetime.datetime.strftime(time_origin, '%Y-%m-%d %H:%M:%S')
    }),
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_info_path)

    # call an intermediate function to distribute pickling workload (subsetting data by tile)
    data_zoom_level = datasets[TOP_LEVEL_FOLDER]['sub_resource'][SUB_RESOURCE][
        'data_tiles_zoom_level']
    pickle_task_distributor(output_pickle_path, AWS_BUCKET_NAME,
                            output_tile_data_path, data_zoom_level)

    file.close()
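# Illustrative smoke-test sketch for the handler above: the event keys ('url',
# 'forecast_time', 'forecast_indx') are the ones the function unpacks, but the
# OPeNDAP URL is only a hypothetical placeholder and would normally come from
# get_gfs_forecast_info (Example #3).
sample_event = {
    'url': 'https://nomads.ncep.noaa.gov:9090/dods/gfs_0p25/gfs20181008/gfs_0p25_00z',  # placeholder
    'forecast_time': '20181008T00:00',  # parsed with '%Y%m%dT%H:%M'
    'forecast_indx': 0,                 # first forecast field in the file
}
# lambda_handler(sample_event, None)    # context is not used by this handler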
def lambda_handler(event, context):
    """
    lambda_handler(event, context):

    This function reads and parses a netCDF file from a provided OPeNDAP URL (contained
    within the event parameter object) and saves the results to S3 as .pickle files.
    -----------------------------------------------------------------------
    Inputs:

    event: AWS Lambda uses this parameter to pass in event data to the handler. 
    This parameter is usually of the Python dict type. It can also be list, str, int, float, or NoneType type.
    
    context: AWS Lambda uses this parameter to provide runtime information to your handler. 
    This parameter is of the LambdaContext type.
    -----------------------------------------------------------------------
    Output: .pickle files are saved to S3
    -----------------------------------------------------------------------
    Author: Michael Christensen
    Date Modified: 10/08/2018
    """

    AWS_BUCKET_NAME = 'oceanmapper-data-storage'
    TOP_LEVEL_FOLDER = 'WW3_DATA'
    SUB_RESOURCE_HTSGWSFC = 'sig_wave_height'
    SUB_RESOURCE_DIRPWSFC = 'primary_wave_dir'
    SUB_RESOURCE_PERPWSFC = 'primary_wave_period'

    # unpack event data
    url = event['url']
    model_field_time = datetime.datetime.strptime(event['forecast_time'],
                                                  '%Y%m%dT%H:%M')
    model_field_indx = event['forecast_indx']

    file = get_opendapp_netcdf(url)
    formatted_folder_date = datetime.datetime.strftime(model_field_time,
                                                       '%Y%m%d_%H')

    output_pickle_path_htsgwsfc = (TOP_LEVEL_FOLDER + '/' +
                                   formatted_folder_date + '/' +
                                   SUB_RESOURCE_HTSGWSFC + '/pickle/' +
                                   'ww3_htsgwsfc_' + formatted_folder_date +
                                   '.pickle')

    output_tile_scalar_path_htsgwsfc = (TOP_LEVEL_FOLDER + '/' +
                                        formatted_folder_date + '/' +
                                        SUB_RESOURCE_HTSGWSFC +
                                        '/tiles/scalar/')

    output_tile_data_path_htsgwsfc = (TOP_LEVEL_FOLDER + '/' +
                                      formatted_folder_date + '/' +
                                      SUB_RESOURCE_HTSGWSFC + '/tiles/data/')

    output_pickle_path_dirpwsfc = (TOP_LEVEL_FOLDER + '/' +
                                   formatted_folder_date + '/' +
                                   SUB_RESOURCE_DIRPWSFC + '/pickle/' +
                                   'ww3_dirpwsfc_' + formatted_folder_date +
                                   '.pickle')

    output_tile_vector_path_dirpwsfc = (TOP_LEVEL_FOLDER + '/' +
                                        formatted_folder_date + '/' +
                                        SUB_RESOURCE_DIRPWSFC +
                                        '/tiles/vector/')

    output_tile_data_path_dirpwsfc = (TOP_LEVEL_FOLDER + '/' +
                                      formatted_folder_date + '/' +
                                      SUB_RESOURCE_DIRPWSFC + '/tiles/data/')

    output_pickle_path_perpwsfc = (TOP_LEVEL_FOLDER + '/' +
                                   formatted_folder_date + '/' +
                                   SUB_RESOURCE_PERPWSFC + '/pickle/' +
                                   'ww3_perpwsfc_' + formatted_folder_date +
                                   '.pickle')

    output_tile_scalar_path_perpwsfc = (TOP_LEVEL_FOLDER + '/' +
                                        formatted_folder_date + '/' +
                                        SUB_RESOURCE_PERPWSFC +
                                        '/tiles/scalar/')

    output_tile_data_path_perpwsfc = (TOP_LEVEL_FOLDER + '/' +
                                      formatted_folder_date + '/' +
                                      SUB_RESOURCE_PERPWSFC + '/tiles/data/')

    output_info_path = TOP_LEVEL_FOLDER + '/' + formatted_folder_date + '/info.json'

    # get model origin time
    init_time = file.variables['time'][0]
    basetime_int = int(init_time)
    extra_days = init_time - basetime_int
    time_origin = (datetime.datetime.fromordinal(basetime_int) +
                   datetime.timedelta(days=extra_days) -
                   datetime.timedelta(days=1))

    lat = file.variables['lat'][:]
    lon = file.variables['lon'][:]

    # significant height of combined wind waves and swell [m]
    height_raw = file.variables['htsgwsfc'][
        model_field_indx, :, :]  #[time,lat,lon]

    # primary wave direction [deg]
    primary_dir_raw = file.variables['dirpwsfc'][
        model_field_indx, :, :]  #[time,lat,lon]

    # primary wave mean period [s]
    primary_period_raw = file.variables['perpwsfc'][
        model_field_indx, :, :]  #[time,lat,lon]

    # ordered lat array
    lat_sort_indices = np.argsort(lat)
    lat_ordered = lat[lat_sort_indices]

    # remap and sort to -180 to 180 grid
    lon_translate = np.where(lon > 180, lon - 360.0, lon)
    lon_sort_indices = np.argsort(lon_translate)

    # ordered longitude arrays
    lon_ordered = lon_translate[lon_sort_indices]

    # rebuild sig wave height data with correct longitude sorting (monotonic increasing)
    height_data_cleaned = height_raw[lat_sort_indices, :][:, lon_sort_indices]

    # rebuild primary wave direction data with correct longitude sorting (monotonic increasing)
    direction_data_cleaned = primary_dir_raw[
        lat_sort_indices, :][:, lon_sort_indices]

    # rebuild primary wave period data with correct longitude sorting (monotonic increasing)
    period_data_cleaned = primary_period_raw[
        lat_sort_indices, :][:, lon_sort_indices]

    # assign the raw data to variables so we can pickle it for use with other scripts
    raw_data_htsgwsfc = {
        'lat': lat_ordered,
        'lon': lon_ordered,
        'sig_wave_height': height_data_cleaned,
        'time_origin': time_origin
    }
    raw_data_pickle_htsgwsfc = pickle.dumps(raw_data_htsgwsfc)

    raw_data_dirpwsfc = {
        'lat': lat_ordered,
        'lon': lon_ordered,
        'primary_wave_dir': direction_data_cleaned,
        'time_origin': time_origin
    }
    raw_data_pickle_dirpwsfc = pickle.dumps(raw_data_dirpwsfc)

    raw_data_perpwsfc = {
        'lat': lat_ordered,
        'lon': lon_ordered,
        'primary_wave_period': period_data_cleaned,
        'time_origin': time_origin
    }
    raw_data_pickle_perpwsfc = pickle.dumps(raw_data_perpwsfc)

    client = boto3.client('s3')
    client.put_object(Body=raw_data_pickle_htsgwsfc,
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_pickle_path_htsgwsfc)
    client.put_object(Body=raw_data_pickle_dirpwsfc,
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_pickle_path_dirpwsfc)
    client.put_object(Body=raw_data_pickle_perpwsfc,
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_pickle_path_perpwsfc)

    # save an info file for enhanced performance (get_model_field_api.py)
    client.put_object(Body=json.dumps({
        'time_origin':
        datetime.datetime.strftime(time_origin, '%Y-%m-%d %H:%M:%S')
    }),
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_info_path)

    # call an intermediate function to distribute pickling workload (subsetting data by tile)
    data_zoom_level_htsgwsfc = datasets[TOP_LEVEL_FOLDER]['sub_resource'][
        SUB_RESOURCE_HTSGWSFC]['data_tiles_zoom_level']
    pickle_task_distributor(output_pickle_path_htsgwsfc, AWS_BUCKET_NAME,
                            output_tile_data_path_htsgwsfc,
                            data_zoom_level_htsgwsfc)

    data_zoom_level_dirpwsfc = datasets[TOP_LEVEL_FOLDER]['sub_resource'][
        SUB_RESOURCE_DIRPWSFC]['data_tiles_zoom_level']
    pickle_task_distributor(output_pickle_path_dirpwsfc, AWS_BUCKET_NAME,
                            output_tile_data_path_dirpwsfc,
                            data_zoom_level_dirpwsfc)

    data_zoom_level_perpwsfc = datasets[TOP_LEVEL_FOLDER]['sub_resource'][
        SUB_RESOURCE_PERPWSFC]['data_tiles_zoom_level']
    pickle_task_distributor(output_pickle_path_perpwsfc, AWS_BUCKET_NAME,
                            output_tile_data_path_perpwsfc,
                            data_zoom_level_perpwsfc)

    file.close()
Example #3
def get_ww3_forecast_info(ww3_url):
    """
    get_ww3_forecast_info(ww3_url)

    This function assembles a dict containing the OPeNDAP URL of the latest model run
    and an array of tuples pairing each forecast field index with its datetime.
    This facilitates concurrent downloads of model data.
    -----------------------------------------------------------------------
    Input: {string} ww3_url - URL listing the available Wave Watch 3 forecast model runs

    i.e. https://nomads.ncep.noaa.gov:9090/dods/wave/nww3
    -----------------------------------------------------------------------
    Output: dict with this structure:

    forecast_info = {'url': opendapp_url, 'data': [(forecast_indx, forecast_field_datetime), ...]}
    -----------------------------------------------------------------------
    Author: Michael Christensen
    Date Modified: 02/06/2019
    """
    
    page = urllib.urlopen(ww3_url).read()
    soup = BeautifulSoup(page,'html.parser')
    soup.prettify()
    
    date_array = np.array([])
    for datetime_element in soup.findAll('b'):
        match = re.search(r'(\d{8})[/]:$', datetime_element.string)
    
        if match:
            unformatted_date = match.group(1)
            datetime_element = datetime.datetime.strptime(unformatted_date,'%Y%m%d')
            date_array = np.append(date_array, datetime_element)
    
    max_forecast_run_date = np.max(date_array)
    formatted_latest_date = datetime.datetime.strftime(max_forecast_run_date, '%Y%m%d')

    # find the latest run using bs4
    forecast_run_url = ww3_url +'/nww3' + formatted_latest_date
    page = urllib.urlopen(forecast_run_url).read()
    soup = BeautifulSoup(page,'html.parser')
    soup.prettify()

    forecast_run_array = {}
    for model_run in soup.findAll('b'):
        match = re.search(r'nww3\d{8}_(\d{2})z', model_run.string)
    
        if match:
            run_name = match.group(0)
            forecast_run_hour = match.group(1)
            forecast_run_array.setdefault(int(forecast_run_hour), run_name)

    # build forecast field datetime/indx array
    max_run = max(forecast_run_array.keys())
    opendapp_url = forecast_run_url + '/' + forecast_run_array[max_run]
    file = get_opendapp_netcdf(opendapp_url)
    product_times = file.variables['time'][:]
    file.close()

    forecast_info = {}
    forecast_info['url'] = opendapp_url
    forecast_info['data'] = []
    for forecast_indx, forecast_time in enumerate(product_times):
        basetime_int = int(forecast_time)
        extra_days = forecast_time - basetime_int

        # need to subtract 1 since WW3 is days since 0001-01-01 (yyyy-mm-dd)
        full_forecast_time = (datetime.datetime.fromordinal(basetime_int) + 
        datetime.timedelta(days = extra_days) - datetime.timedelta(days=1))
        forecast_info['data'].append((forecast_indx, full_forecast_time))

    return forecast_info
def get_gfs_forecast_info(gfs_url):
    """
    get_gfs_forecast_info(gfs_url)

    This function assembles a dict containing the OPeNDAP URL of the latest model run
    and an array of tuples pairing each forecast field index with its datetime.
    This facilitates concurrent downloads of model data.
    -----------------------------------------------------------------------
    Input: {string} gfs_url - URL listing the available GFS forecast model runs

    i.e. http://nomads.ncep.noaa.gov:9090/dods/gfs_0p25
    -----------------------------------------------------------------------
    Output: dict with this structure:

    forecast_info = {'url': opendapp_url, 'data': [(forecast_indx, forecast_field_datetime), ...]}

    """
    page = urllib.urlopen(gfs_url).read()
    soup = BeautifulSoup(page, 'html.parser')
    soup.prettify()

    date_array = np.array([])
    for anchor in soup.findAll('a', href=True):
        anchor_str = anchor['href']
        match = re.search(r'gfs(\d{8})$', anchor_str)

        if match:
            unformatted_date = match.group(1)
            datetime_element = datetime.datetime.strptime(
                unformatted_date, '%Y%m%d')
            date_array = np.append(date_array, datetime_element)

    max_forecast_run_date = np.max(date_array)
    formatted_latest_date = datetime.datetime.strftime(max_forecast_run_date,
                                                       '%Y%m%d')

    # find the latest run using bs4
    forecast_run_url = gfs_url + '/gfs' + formatted_latest_date
    page = urllib.urlopen(forecast_run_url).read()
    soup = BeautifulSoup(page, 'html.parser')
    soup.prettify()

    forecast_run_array = {}
    for model_run in soup.findAll('b'):
        match = re.search(r'(gfs_.*_(\d{2})z):$', model_run.string)

        if match:
            run_name = match.group(1)
            forecast_run_hour = match.group(2)
            forecast_run_array.setdefault(int(forecast_run_hour), run_name)

    # build forecast field datetime/indx array
    max_run = max(forecast_run_array.keys())
    opendapp_url = forecast_run_url + '/' + forecast_run_array[max_run]
    file = get_opendapp_netcdf(opendapp_url)
    product_times = file.variables['time'][:]
    file.close()

    forecast_info = {}
    forecast_info['url'] = opendapp_url
    forecast_info['data'] = []
    for forecast_indx, forecast_time in enumerate(product_times):
        basetime_int = int(forecast_time)
        extra_days = forecast_time - basetime_int

        # need to subtract 1 since GFS is days since 0001-01-01 (yyyy-mm-dd)
        full_forecast_time = (datetime.datetime.fromordinal(basetime_int) +
                              datetime.timedelta(days=extra_days) -
                              datetime.timedelta(days=1))
        forecast_info['data'].append((forecast_indx, full_forecast_time))

    return forecast_info
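# Sketch (under assumptions): fan the forecast fields returned above out into one
# event per Lambda invocation. The event keys mirror what the handlers in Examples
# #1 and #2 unpack; the function name 'gfs_winds_processor' and the async invoke
# are hypothetical wiring, not part of the original code.
def invoke_gfs_processors(gfs_url):
    forecast_info = get_gfs_forecast_info(gfs_url)
    lambda_client = boto3.client('lambda')
    for forecast_indx, forecast_field_datetime in forecast_info['data']:
        event = {
            'url': forecast_info['url'],
            'forecast_time': datetime.datetime.strftime(forecast_field_datetime,
                                                        '%Y%m%dT%H:%M'),
            'forecast_indx': forecast_indx,
        }
        lambda_client.invoke(FunctionName='gfs_winds_processor',  # hypothetical name
                             InvocationType='Event',              # fire-and-forget
                             Payload=json.dumps(event))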
def lambda_handler(event, context):
    """
    lambda_handler(event, context):

    This function reads and parses a netCDF file from a provided OPeNDAP URL (contained
    within the event parameter object) and saves a .json file and a .pickle file.
    -----------------------------------------------------------------------
    Inputs:

    event: AWS Lambda uses this parameter to pass in event data to the handler. 
    This parameter is usually of the Python dict type. It can also be list, str, int, float, or NoneType type.
    
    context: AWS Lambda uses this parameter to provide runtime information to your handler. 
    This parameter is of the LambdaContext type.
    -----------------------------------------------------------------------
    Output: A .json file and a .pickle file are saved to S3
    -----------------------------------------------------------------------
    Author: Michael Christensen
    Date Modified: 08/26/2018
    """

    AWS_BUCKET_NAME = 'oceanmapper-data-storage'
    TOP_LEVEL_FOLDER = 'RTOFS_OCEAN_CURRENTS_HIGHRES'

    # unpack event data
    url = event['url']
    model_field_time = datetime.datetime.strptime(event['forecast_time'],
                                                  '%Y%m%dT%H:%M')
    model_field_indx = event['forecast_indx']

    file = get_opendapp_netcdf(url)
    formatted_folder_date = datetime.datetime.strftime(model_field_time,
                                                       '%Y%m%d_%H')

    # update this when fetching 4D data (right now only the surface depth is used)
    output_json_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                        '/0m/json/' + 'rtofs_currents_' +
                        formatted_folder_date + '.json')

    output_pickle_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                          '/0m/pickle/' + 'rtofs_currents_' +
                          formatted_folder_date + '.pickle')

    output_tile_scalar_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                               '/0m/tiles/scalar/')

    lat = file.variables['lat'][:]
    lon = file.variables['lon'][:]

    # transform masked values to 0
    u_data_raw = file.variables['u_velocity'][model_field_indx,
                                              0, :, :]  #[time,level,lat,lon]
    v_data_raw = file.variables['v_velocity'][model_field_indx, 0, :, :]

    u_data_mask_applied = np.where(~u_data_raw.mask, u_data_raw, 0)
    v_data_mask_applied = np.where(~v_data_raw.mask, v_data_raw, 0)

    # rtofs longitudes go from 74.16 to 434.06227 -- remap and sort to -180 to 180 grid
    lon_translate = np.where(lon > 180, lon - 360.0, lon)
    lon_sort_indices = np.argsort(lon_translate)

    # ordered longitude arrays
    lon_ordered = lon_translate[lon_sort_indices]

    # rebuild u/v data with correct longitude sorting (monotonic increasing)
    u_data_cleaned = np.array(
        [lat_row[lon_sort_indices] for lat_row in u_data_mask_applied])
    v_data_cleaned = np.array(
        [lat_row[lon_sort_indices] for lat_row in v_data_mask_applied])

    # assign the raw data to a variable so we can pickle it for use with other scripts
    raw_data = {
        'lat': lat,
        'lon': lon_ordered,
        'u_vel': u_data_cleaned,
        'v_vel': v_data_cleaned
    }
    raw_data_pickle = pickle.dumps(raw_data)

    output_lat_array = np.arange(
        -90, 90.5, 0.5)  # last point is excluded with arange (90 to -90)
    output_lon_array = np.arange(
        -180, 180.5, 0.5)  # last point is excluded with arange (-180 to 180)

    u_interp_func = interpolate.interp2d(lon_ordered,
                                         lat,
                                         u_data_cleaned,
                                         kind='cubic')
    v_interp_func = interpolate.interp2d(lon_ordered,
                                         lat,
                                         v_data_cleaned,
                                         kind='cubic')

    u_data_interp = u_interp_func(output_lon_array, output_lat_array)
    v_data_interp = v_interp_func(output_lon_array, output_lat_array)

    minLat = np.min(output_lat_array)
    maxLat = np.max(output_lat_array)
    minLon = np.min(output_lon_array)
    maxLon = np.max(output_lon_array)

    dx = np.diff(output_lon_array)[0]
    dy = np.diff(output_lat_array)[0]

    output_data = [
        {
            'header': {
                'parameterUnit': "m.s-1",
                'parameterNumber': 2,
                'dx': dx,
                'dy': dy,
                'parameterNumberName': "Eastward current",
                'la1': maxLat,
                'la2': minLat,
                'parameterCategory': 2,
                'lo1': minLon,
                'lo2': maxLon,
                'nx': len(output_lon_array),
                'ny': len(output_lat_array),
                'refTime': datetime.datetime.strftime(model_field_time,
                                                      '%Y-%m-%d %H:%M:%S'),
            },
            'data': [
                float('{:.3f}'.format(el)) if np.abs(el) > 0.0001 else 0
                for el in u_data_interp[::-1].flatten().tolist()
            ]
        },
        {
            'header': {
                'parameterUnit': "m.s-1",
                'parameterNumber': 3,
                'dx': dx,
                'dy': dy,
                'parameterNumberName': "Northward current",
                'la1': maxLat,
                'la2': minLat,
                'parameterCategory': 2,
                'lo1': minLon,
                'lo2': maxLon,
                'nx': len(output_lon_array),
                'ny': len(output_lat_array),
                'refTime': datetime.datetime.strftime(model_field_time,
                                                      '%Y-%m-%d %H:%M:%S'),
            },
            'data': [
                float('{:.3f}'.format(el)) if np.abs(el) > 0.0001 else 0
                for el in v_data_interp[::-1].flatten().tolist()
            ]
        },
    ]

    client = boto3.client('s3')
    client.put_object(Body=json.dumps(output_data),
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_json_path)
    client.put_object(Body=raw_data_pickle,
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_pickle_path)

    # call an intermediate function to distribute tiling workload
    tile_task_distributor(output_pickle_path, 'current_speed', AWS_BUCKET_NAME,
                          output_tile_scalar_path, range(3, 5))

    file.close()
def get_hycom_forecast_info(hycom_url):
    """
    get_hycom_forecast_info(hycom_url)

    This function assembles an object containing the latest available forecast date that
    covers the full forecast extent (168 hrs), an array of the OPeNDAP urls (one for each
    timestep), the corresponding field datetimes, and an array of the depth levels
    supported by this model.

    Model Info: https://www.hycom.org/dataserver/gofs-3pt1/analysis
    -----------------------------------------------------------------------
    Input: {string} hycom_url - the HYCOM forecast data catalog url

    i.e. http://tds.hycom.org/thredds/catalog/datasets/GLBv0.08/expt_93.0/data/forecasts/catalog.html
    -----------------------------------------------------------------------
    Output: object with this structure:

    forecast_info = {'forecast': {'latest_date': 'yyyymmdd_HHMM', 'data_urls': [xxx, xxx, xxx],
                     'field_datetimes': [dt, dt, ...]}, 'levels': [0, 2, ...]}

    """
    
    page = urllib.urlopen(hycom_url).read()
    soup = BeautifulSoup(page,'html.parser')
    soup.prettify()

    forecast_dict = {}
    for anchor in soup.findAll('a', href=True):
        anchor_str = anchor['href']
        match = re.search(r'hycom_glbv_930_(\d{10})_t(\d{3})_uv3z.nc$', anchor_str)
    
        if match:
            unformatted_date = match.group(1)
            datetime_element = datetime.datetime.strptime(unformatted_date,'%Y%m%d%H')
            forecast_hour_extent = int(match.group(2))

            full_forecast_time = datetime_element + datetime.timedelta(hours = forecast_hour_extent)

            forecast_dict.setdefault(datetime_element, []).append({'forecast_date': full_forecast_time, 
                'forecast_hour_extent': forecast_hour_extent})

    # sort available unique forecast dates in reverse order so most recent is first
    unique_dates = sorted(forecast_dict.keys(),reverse=True)
    max_forecast_run_date = unique_dates[0]
    
    # use the forecast which gets full coverage (at this point in time its 168 hrs into the future)
    # deal with possibility of only 1 date available
    if len(unique_dates) > 1:
        previous_forecast_extent = forecast_dict[unique_dates[1]][-1]['forecast_hour_extent']
    else:
        previous_forecast_extent = 0 
    
    present_forecast_extent = forecast_dict[unique_dates[0]][-1]['forecast_hour_extent']

    if present_forecast_extent >= previous_forecast_extent:
        latest_date = unique_dates[0]
    else:
        latest_date = unique_dates[1]

    formatted_latest_date = datetime.datetime.strftime(latest_date, '%Y%m%d%H')
    base_opendapp_url = 'http://tds.hycom.org/thredds/dodsC/datasets/GLBv0.08/expt_93.0/data/forecasts/hycom_glbv_930_'

    data_urls = []
    field_datetimes=[]
    for forecast_field in forecast_dict[latest_date]:
        formatted_hour_extent = str(forecast_field['forecast_hour_extent']).zfill(3)
        output_url = base_opendapp_url + formatted_latest_date + '_t' + formatted_hour_extent + '_uv3z.nc'
        data_urls.append(output_url)
        field_datetimes.append(forecast_field['forecast_date'])

    forecast_info = {'forecast': {'latest_date': datetime.datetime.strftime(latest_date,'%Y%m%d_%H%M'), 
    'data_urls': data_urls, 'field_datetimes': field_datetimes}}

    # use the first data url to get the various depth levels (they are the same for each .nc file)
    file = get_opendapp_netcdf(data_urls[0])
    levels = file.variables['depth'][:]
    file.close()

    # add levels to output data structure
    forecast_info['levels'] = [int(lev) for lev in levels.tolist()]

    return forecast_info
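# Sketch (under assumptions): turn the HYCOM forecast info above into events for a
# per-timestep, per-depth handler with the same event shape as the handler below
# ('url', 'forecast_time', 'forecast_indx', plus a 'level' dict). forecast_indx is
# set to 0 on the assumption that each per-timestep file exposes a single time index.
def build_hycom_events(hycom_url):
    forecast_info = get_hycom_forecast_info(hycom_url)
    urls = forecast_info['forecast']['data_urls']
    times = forecast_info['forecast']['field_datetimes']
    events = []
    for data_url, field_time in zip(urls, times):
        for level_indx, level_depth in enumerate(forecast_info['levels']):
            events.append({
                'url': data_url,
                'forecast_time': datetime.datetime.strftime(field_time, '%Y%m%dT%H:%M'),
                'forecast_indx': 0,  # assumption: one timestep per file
                'level': {'level_indx': level_indx, 'level_depth': level_depth},
            })
    return events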
def lambda_handler(event, context):
    """
    lambda_handler(event, context):

    This function reads and parses a netCDF file from a provided OPeNDAP URL (contained
    within the event parameter object) and saves a .json file and a .pickle file.
    -----------------------------------------------------------------------
    Inputs:

    event: AWS Lambda uses this parameter to pass in event data to the handler. 
    This parameter is usually of the Python dict type. It can also be list, str, int, float, or NoneType type.
    
    context: AWS Lambda uses this parameter to provide runtime information to your handler. 
    This parameter is of the LambdaContext type.
    -----------------------------------------------------------------------
    Output: A .json file and a .pickle file are saved to S3
    -----------------------------------------------------------------------
    Author: Michael Christensen
    Date Modified: 12/20/2018
    """

    AWS_BUCKET_NAME = 'oceanmapper-data-storage'
    TOP_LEVEL_FOLDER = 'RTOFS_DATA'
    SUB_RESOURCE = 'ocean_current_speed'
    DATA_PREFIX = 'rtofs_currents'
        
    # unpack event data
    url = event['url']
    model_field_time = datetime.datetime.strptime(event['forecast_time'],'%Y%m%dT%H:%M')
    model_field_indx = event['forecast_indx']
    model_level_depth = event['level']['level_depth']
    model_level_indx = event['level']['level_indx']

    u_comp_url = url
    v_comp_url = url.replace('uvel','vvel')

    file_u = get_opendapp_netcdf(u_comp_url)
    file_v = get_opendapp_netcdf(v_comp_url)  

    formatted_folder_date = datetime.datetime.strftime(model_field_time,'%Y%m%d_%H')
    
    output_json_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date + '/' + SUB_RESOURCE + '/' +
        str(model_level_depth) + 'm/json/' + DATA_PREFIX + '_' + formatted_folder_date + '.json')

    output_pickle_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date + '/' + SUB_RESOURCE + '/' +
        str(model_level_depth) + 'm/pickle/' + DATA_PREFIX + '_' + formatted_folder_date + '.pickle')

    output_tile_scalar_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date + '/' + SUB_RESOURCE + '/' +
        str(model_level_depth) + 'm/tiles/scalar/')

    output_tile_data_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date + '/' + SUB_RESOURCE + '/' +
        str(model_level_depth) + 'm/tiles/data/')

    output_info_path = TOP_LEVEL_FOLDER + '/' + formatted_folder_date + '/info.json'

    # get model origin time
    if 'nowcast' in u_comp_url:
        time_orig_str = file_u.variables['time'].maximum
    else:
        time_orig_str = file_u.variables['time'].minimum
    time_origin = datetime.datetime.strptime(time_orig_str,'%Hz%d%b%Y')
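    # ('%Hz%d%b%Y' above parses time attribute strings of the form '00z20Dec2018')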

    lat  = file_u.variables['lat'][:]
    lon  = file_u.variables['lon'][:]

    # transform masked values to 0
    u_data_raw = file_u.variables['u'][model_field_indx,model_level_indx,:,:] #[time,level,lat,lon]
    v_data_raw = file_v.variables['v'][model_field_indx,model_level_indx,:,:]
	
    u_data_mask_applied = np.where(~u_data_raw.mask, u_data_raw, 0)
    v_data_mask_applied = np.where(~v_data_raw.mask, v_data_raw, 0)

    # ordered lat array
    lat_sort_indices = np.argsort(lat)
    lat_ordered = lat[lat_sort_indices]

    # rtofs longitudes go from 74.16 to 434.06227 -- remap and sort to -180 to 180 grid
    lon_translate = np.where(lon>180, lon-360.0, lon)
    lon_sort_indices = np.argsort(lon_translate)

    # ordered longitude arrays
    lon_ordered = lon_translate[lon_sort_indices]

    # rebuild u/v data with correct longitude sorting (monotonic increasing) 
    u_data_cleaned_filled = u_data_mask_applied[lat_sort_indices,:][:,lon_sort_indices]
    v_data_cleaned_filled = v_data_mask_applied[lat_sort_indices,:][:,lon_sort_indices]

    u_data_cleaned = u_data_raw[lat_sort_indices,:][:,lon_sort_indices]
    v_data_cleaned = v_data_raw[lat_sort_indices,:][:,lon_sort_indices]
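    # the zero-filled arrays above feed interp2d (which needs finite values), while the
    # still-masked copies are what get pickled, presumably so land cells stay masked for
    # the downstream tiling scripts.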

    # assign the raw data to a variable so we can pickle it for use with other scripts
    raw_data = {'lat': lat_ordered, 'lon': lon_ordered, 'u_vel': u_data_cleaned, 'v_vel': v_data_cleaned,
        'time_origin': time_origin}
    raw_data_pickle = pickle.dumps(raw_data)

    output_lat_array = np.arange(int(min(lat)),int(max(lat))+0.5,0.5) # last point is excluded with arange (90 to -90)
    output_lon_array = np.arange(-180,180.5,0.5) # last point is excluded with arange (-180 to 180)

    u_interp_func = interpolate.interp2d(lon_ordered, lat_ordered, u_data_cleaned_filled, kind='cubic')
    v_interp_func = interpolate.interp2d(lon_ordered, lat_ordered, v_data_cleaned_filled, kind='cubic')

    u_data_interp = u_interp_func(output_lon_array, output_lat_array)
    v_data_interp = v_interp_func(output_lon_array, output_lat_array)
	
    minLat = np.min(output_lat_array)
    maxLat = np.max(output_lat_array)
    minLon = np.min(output_lon_array)
    maxLon = np.max(output_lon_array)
    
    dx = np.diff(output_lon_array)[0]
    dy = np.diff(output_lat_array)[0]
    
    output_data = [
        {
            'header': {
                'parameterUnit': "m.s-1",
                'parameterNumber': 2,
                'dx': dx,
                'dy': dy,
                'parameterNumberName': "Eastward current",
                'la1': maxLat,
                'la2': minLat,
                'parameterCategory': 2,
                'lo1': minLon,
                'lo2': maxLon,
                'nx': len(output_lon_array),
                'ny': len(output_lat_array),
                'refTime': datetime.datetime.strftime(model_field_time, '%Y-%m-%d %H:%M:%S'),
                'timeOrigin': datetime.datetime.strftime(time_origin, '%Y-%m-%d %H:%M:%S'),
            },
            'data': [float('{:.3f}'.format(el)) if np.abs(el) > 0.0001 else 0
                     for el in u_data_interp[::-1].flatten().tolist()]
        },
        {
            'header': {
                'parameterUnit': "m.s-1",
                'parameterNumber': 3,
                'dx': dx,
                'dy': dy,
                'parameterNumberName': "Northward current",
                'la1': maxLat,
                'la2': minLat,
                'parameterCategory': 2,
                'lo1': minLon,
                'lo2': maxLon,
                'nx': len(output_lon_array),
                'ny': len(output_lat_array),
                'refTime': datetime.datetime.strftime(model_field_time, '%Y-%m-%d %H:%M:%S'),
                'timeOrigin': datetime.datetime.strftime(time_origin, '%Y-%m-%d %H:%M:%S'),
            },
            'data': [float('{:.3f}'.format(el)) if np.abs(el) > 0.0001 else 0
                     for el in v_data_interp[::-1].flatten().tolist()]
        },
    ]

    client = boto3.client('s3')
    client.put_object(Body=json.dumps(output_data), Bucket=AWS_BUCKET_NAME, Key=output_json_path)
    client.put_object(Body=raw_data_pickle, Bucket=AWS_BUCKET_NAME, Key=output_pickle_path)

    # save an info file for enhanced performance (get_model_field_api.py)
    client.put_object(Body=json.dumps({'time_origin': datetime.datetime.strftime(time_origin,'%Y-%m-%d %H:%M:%S')}), 
        Bucket=AWS_BUCKET_NAME, Key=output_info_path)

    # call an intermediate function to distribute pickling workload (subsetting data by tile)
    data_zoom_level = datasets[TOP_LEVEL_FOLDER]['sub_resource'][SUB_RESOURCE]['data_tiles_zoom_level']
    pickle_task_distributor(output_pickle_path, AWS_BUCKET_NAME, output_tile_data_path, data_zoom_level)

    file_u.close()
    file_v.close()