# Module-level imports assumed by the handlers and helpers in this section;
# the project-local helpers (get_opendapp_netcdf, pickle_task_distributor,
# tile_task_distributor) and the `datasets` configuration dict are imported
# from elsewhere in this repository.
import datetime
import json
import pickle
import re
import urllib

import boto3
import numpy as np
from bs4 import BeautifulSoup
from scipy import interpolate


def lambda_handler(event, context):
    """
    lambda_handler(event, context):

    This function creates .json, .pickle, and .png files from a netCDF file.
    The netCDF file comes from an OPeNDAP URL (contained within the event
    parameter object).
    -----------------------------------------------------------------------
    Inputs:
    event: AWS Lambda uses this parameter to pass in event data to the
        handler. This parameter is usually of the Python dict type. It can
        also be list, str, int, float, or NoneType type.
    context: AWS Lambda uses this parameter to provide runtime information
        to your handler. This parameter is of the LambdaContext type.
    -----------------------------------------------------------------------
    Output: .json, .pickle, and .png files are saved to S3
    -----------------------------------------------------------------------
    Author: Michael Christensen
    Date Modified: 10/08/2018
    """

    AWS_BUCKET_NAME = 'oceanmapper-data-storage'
    TOP_LEVEL_FOLDER = 'GFS_DATA'
    SUB_RESOURCE = 'wind_speed'
    DATA_PREFIX = 'gfs_winds'

    # unpack event data
    url = event['url']
    model_field_time = datetime.datetime.strptime(event['forecast_time'],
                                                  '%Y%m%dT%H:%M')
    model_field_indx = event['forecast_indx']
    level = '10m'

    file = get_opendapp_netcdf(url)
    formatted_folder_date = datetime.datetime.strftime(model_field_time,
                                                       '%Y%m%d_%H')

    output_json_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date + '/' +
                        SUB_RESOURCE + '/' + level + '/json/' +
                        DATA_PREFIX + '_' + formatted_folder_date + '.json')
    output_pickle_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date + '/' +
                          SUB_RESOURCE + '/' + level + '/pickle/' +
                          DATA_PREFIX + '_' + formatted_folder_date + '.pickle')
    output_tile_scalar_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                               '/' + SUB_RESOURCE + '/' + level +
                               '/tiles/scalar/')
    output_tile_data_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                             '/' + SUB_RESOURCE + '/' + level + '/tiles/data/')
    output_info_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                        '/info.json')

    # get model origin time (GFS time is days since 0001-01-01, hence the
    # one-day correction)
    init_time = file.variables['time'][0]
    basetime_int = int(init_time)
    extra_days = init_time - basetime_int
    time_origin = (datetime.datetime.fromordinal(basetime_int) +
                   datetime.timedelta(days=extra_days) -
                   datetime.timedelta(days=1))

    lat = file.variables['lat'][:]
    lon = file.variables['lon'][:]
    u_data_raw = file.variables['ugrd10m'][model_field_indx, :, :]  # [time,lat,lon]
    v_data_raw = file.variables['vgrd10m'][model_field_indx, :, :]

    # ordered lat array
    lat_sort_indices = np.argsort(lat)
    lat_ordered = lat[lat_sort_indices]

    # remap and sort to -180 to 180 grid
    lon_translate = np.where(lon > 180, lon - 360.0, lon)
    lon_sort_indices = np.argsort(lon_translate)

    # ordered longitude array
    lon_ordered = lon_translate[lon_sort_indices]

    # rebuild u/v data with correct latitude/longitude sorting (monotonically increasing)
    u_data_cleaned = u_data_raw[lat_sort_indices, :][:, lon_sort_indices]
    v_data_cleaned = v_data_raw[lat_sort_indices, :][:, lon_sort_indices]

    # assign the raw data to a variable so we can pickle it for use with other scripts
    raw_data = {
        'lat': lat_ordered,
        'lon': lon_ordered,
        'u_vel': u_data_cleaned,
        'v_vel': v_data_cleaned,
        'datetime': formatted_folder_date,
        'level': level,
        'time_origin': time_origin
    }
    raw_data_pickle = pickle.dumps(raw_data)

    # create interpolation functions
    u_interp_func = interpolate.interp2d(lon_ordered, lat_ordered,
                                         u_data_cleaned, kind='cubic')
    v_interp_func = interpolate.interp2d(lon_ordered, lat_ordered,
                                         v_data_cleaned, kind='cubic')

    # 0.5 degree output grid; np.arange excludes the stop value, so pad it by 0.5
    output_lat_array = np.arange(int(min(lat)), int(max(lat)) + 0.5, 0.5)
    output_lon_array = np.arange(-180, 180.5, 0.5)

    u_data_interp = u_interp_func(output_lon_array, output_lat_array)
    v_data_interp = v_interp_func(output_lon_array, output_lat_array)

    minLat = np.min(output_lat_array)
    maxLat = np.max(output_lat_array)
    minLon = np.min(output_lon_array)
    maxLon = np.max(output_lon_array)

    dx = np.diff(output_lon_array)[0]
    dy = np.diff(output_lat_array)[0]

    output_data = [
        {
            'header': {
                'parameterUnit': "m.s-1",
                'parameterNumber': 2,
                'dx': dx,
                'dy': dy,
                'parameterNumberName': "Eastward wind",
                'la1': maxLat,
                'la2': minLat,
                'parameterCategory': 2,
                'lo1': minLon,
                'lo2': maxLon,
                'nx': len(output_lon_array),
                'ny': len(output_lat_array),
                'refTime': datetime.datetime.strftime(model_field_time,
                                                      '%Y-%m-%d %H:%M:%S'),
                'timeOrigin': datetime.datetime.strftime(time_origin,
                                                         '%Y-%m-%d %H:%M:%S'),
            },
            'data': [
                float('{:.3f}'.format(el)) if np.abs(el) > 0.0001 else 0
                for el in u_data_interp[::-1].flatten().tolist()
            ]
        },
        {
            'header': {
                'parameterUnit': "m.s-1",
                'parameterNumber': 3,
                'dx': dx,
                'dy': dy,
                'parameterNumberName': "Northward wind",
                'la1': maxLat,
                'la2': minLat,
                'parameterCategory': 2,
                'lo1': minLon,
                'lo2': maxLon,
                'nx': len(output_lon_array),
                'ny': len(output_lat_array),
                'refTime': datetime.datetime.strftime(model_field_time,
                                                      '%Y-%m-%d %H:%M:%S'),
                'timeOrigin': datetime.datetime.strftime(time_origin,
                                                         '%Y-%m-%d %H:%M:%S'),
            },
            'data': [
                float('{:.3f}'.format(el)) if np.abs(el) > 0.0001 else 0
                for el in v_data_interp[::-1].flatten().tolist()
            ]
        },
    ]

    client = boto3.client('s3')
    client.put_object(Body=json.dumps(output_data),
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_json_path)
    client.put_object(Body=raw_data_pickle,
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_pickle_path)

    # save an info file for enhanced performance (get_model_field_api.py)
    client.put_object(Body=json.dumps({
        'time_origin': datetime.datetime.strftime(time_origin,
                                                  '%Y-%m-%d %H:%M:%S')
    }),
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_info_path)

    # call an intermediate function to distribute pickling workload (subsetting data by tile)
    data_zoom_level = datasets[TOP_LEVEL_FOLDER]['sub_resource'][SUB_RESOURCE][
        'data_tiles_zoom_level']
    pickle_task_distributor(output_pickle_path, AWS_BUCKET_NAME,
                            output_tile_data_path, data_zoom_level)

    file.close()
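
# ---------------------------------------------------------------------------
# Usage sketch (hypothetical values, not part of the deployed handler): the
# event consumed above is normally built from get_gfs_forecast_info() further
# down in this section. 'forecast_time' must match the '%Y%m%dT%H:%M' format
# parsed by the handler, and 'forecast_indx' is the position of that field on
# the file's time axis. The URL below is only illustrative.
if __name__ == '__main__':
    sample_event = {
        'url': ('http://nomads.ncep.noaa.gov:9090/dods/gfs_0p25/'
                'gfs20181008/gfs_0p25_00z'),  # illustrative run URL
        'forecast_time': '20181008T12:00',
        'forecast_indx': 4,
    }
    lambda_handler(sample_event, None)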
def lambda_handler(event, context):
    """
    lambda_handler(event, context):

    This function reads and parses a netCDF file from a provided OPeNDAP URL
    (contained within the event parameter object) and saves the extracted
    Wave Watch 3 fields as .pickle files.
    -----------------------------------------------------------------------
    Inputs:
    event: AWS Lambda uses this parameter to pass in event data to the
        handler. This parameter is usually of the Python dict type. It can
        also be list, str, int, float, or NoneType type.
    context: AWS Lambda uses this parameter to provide runtime information
        to your handler. This parameter is of the LambdaContext type.
    -----------------------------------------------------------------------
    Output: .pickle files and an info.json file are saved to S3
    -----------------------------------------------------------------------
    Author: Michael Christensen
    Date Modified: 10/08/2018
    """

    AWS_BUCKET_NAME = 'oceanmapper-data-storage'
    TOP_LEVEL_FOLDER = 'WW3_DATA'
    SUB_RESOURCE_HTSGWSFC = 'sig_wave_height'
    SUB_RESOURCE_DIRPWSFC = 'primary_wave_dir'
    SUB_RESOURCE_PERPWSFC = 'primary_wave_period'

    # unpack event data
    url = event['url']
    model_field_time = datetime.datetime.strptime(event['forecast_time'],
                                                  '%Y%m%dT%H:%M')
    model_field_indx = event['forecast_indx']

    file = get_opendapp_netcdf(url)
    formatted_folder_date = datetime.datetime.strftime(model_field_time,
                                                       '%Y%m%d_%H')

    output_pickle_path_htsgwsfc = (TOP_LEVEL_FOLDER + '/' +
                                   formatted_folder_date + '/' +
                                   SUB_RESOURCE_HTSGWSFC + '/pickle/' +
                                   'ww3_htsgwsfc_' + formatted_folder_date +
                                   '.pickle')
    output_tile_scalar_path_htsgwsfc = (TOP_LEVEL_FOLDER + '/' +
                                        formatted_folder_date + '/' +
                                        SUB_RESOURCE_HTSGWSFC +
                                        '/tiles/scalar/')
    output_tile_data_path_htsgwsfc = (TOP_LEVEL_FOLDER + '/' +
                                      formatted_folder_date + '/' +
                                      SUB_RESOURCE_HTSGWSFC + '/tiles/data/')

    output_pickle_path_dirpwsfc = (TOP_LEVEL_FOLDER + '/' +
                                   formatted_folder_date + '/' +
                                   SUB_RESOURCE_DIRPWSFC + '/pickle/' +
                                   'ww3_dirpwsfc_' + formatted_folder_date +
                                   '.pickle')
    output_tile_vector_path_dirpwsfc = (TOP_LEVEL_FOLDER + '/' +
                                        formatted_folder_date + '/' +
                                        SUB_RESOURCE_DIRPWSFC +
                                        '/tiles/vector/')
    output_tile_data_path_dirpwsfc = (TOP_LEVEL_FOLDER + '/' +
                                      formatted_folder_date + '/' +
                                      SUB_RESOURCE_DIRPWSFC + '/tiles/data/')

    output_pickle_path_perpwsfc = (TOP_LEVEL_FOLDER + '/' +
                                   formatted_folder_date + '/' +
                                   SUB_RESOURCE_PERPWSFC + '/pickle/' +
                                   'ww3_perpwsfc_' + formatted_folder_date +
                                   '.pickle')
    output_tile_scalar_path_perpwsfc = (TOP_LEVEL_FOLDER + '/' +
                                        formatted_folder_date + '/' +
                                        SUB_RESOURCE_PERPWSFC +
                                        '/tiles/scalar/')
    output_tile_data_path_perpwsfc = (TOP_LEVEL_FOLDER + '/' +
                                      formatted_folder_date + '/' +
                                      SUB_RESOURCE_PERPWSFC + '/tiles/data/')

    output_info_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                        '/info.json')

    # get model origin time (WW3 time is days since 0001-01-01, hence the
    # one-day correction)
    init_time = file.variables['time'][0]
    basetime_int = int(init_time)
    extra_days = init_time - basetime_int
    time_origin = (datetime.datetime.fromordinal(basetime_int) +
                   datetime.timedelta(days=extra_days) -
                   datetime.timedelta(days=1))

    lat = file.variables['lat'][:]
    lon = file.variables['lon'][:]

    # significant height of combined wind waves and swell [m]
    height_raw = file.variables['htsgwsfc'][model_field_indx, :, :]  # [time,lat,lon]

    # primary wave direction [deg]
    primary_dir_raw = file.variables['dirpwsfc'][model_field_indx, :, :]  # [time,lat,lon]

    # primary wave mean period [s]
    primary_period_raw = file.variables['perpwsfc'][model_field_indx, :, :]  # [time,lat,lon]

    # ordered lat array
    lat_sort_indices = np.argsort(lat)
    lat_ordered = lat[lat_sort_indices]

    # remap and sort to -180 to 180 grid
    lon_translate = np.where(lon > 180, lon - 360.0, lon)
    lon_sort_indices = np.argsort(lon_translate)

    # ordered longitude array
    lon_ordered = lon_translate[lon_sort_indices]

    # rebuild sig wave height data with correct longitude sorting (monotonically increasing)
    height_data_cleaned = height_raw[lat_sort_indices, :][:, lon_sort_indices]

    # rebuild primary wave direction data with correct longitude sorting (monotonically increasing)
    direction_data_cleaned = primary_dir_raw[lat_sort_indices, :][:, lon_sort_indices]

    # rebuild primary wave period data with correct longitude sorting (monotonically increasing)
    period_data_cleaned = primary_period_raw[lat_sort_indices, :][:, lon_sort_indices]

    # assign the raw data to variables so we can pickle it for use with other scripts
    raw_data_htsgwsfc = {
        'lat': lat_ordered,
        'lon': lon_ordered,
        'sig_wave_height': height_data_cleaned,
        'time_origin': time_origin
    }
    raw_data_pickle_htsgwsfc = pickle.dumps(raw_data_htsgwsfc)

    raw_data_dirpwsfc = {
        'lat': lat_ordered,
        'lon': lon_ordered,
        'primary_wave_dir': direction_data_cleaned,
        'time_origin': time_origin
    }
    raw_data_pickle_dirpwsfc = pickle.dumps(raw_data_dirpwsfc)

    raw_data_perpwsfc = {
        'lat': lat_ordered,
        'lon': lon_ordered,
        'primary_wave_period': period_data_cleaned,
        'time_origin': time_origin
    }
    raw_data_pickle_perpwsfc = pickle.dumps(raw_data_perpwsfc)

    client = boto3.client('s3')
    client.put_object(Body=raw_data_pickle_htsgwsfc,
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_pickle_path_htsgwsfc)
    client.put_object(Body=raw_data_pickle_dirpwsfc,
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_pickle_path_dirpwsfc)
    client.put_object(Body=raw_data_pickle_perpwsfc,
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_pickle_path_perpwsfc)

    # save an info file for enhanced performance (get_model_field_api.py)
    client.put_object(Body=json.dumps({
        'time_origin': datetime.datetime.strftime(time_origin,
                                                  '%Y-%m-%d %H:%M:%S')
    }),
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_info_path)

    # call an intermediate function to distribute pickling workload (subsetting data by tile)
    data_zoom_level_htsgwsfc = datasets[TOP_LEVEL_FOLDER]['sub_resource'][
        SUB_RESOURCE_HTSGWSFC]['data_tiles_zoom_level']
    pickle_task_distributor(output_pickle_path_htsgwsfc, AWS_BUCKET_NAME,
                            output_tile_data_path_htsgwsfc,
                            data_zoom_level_htsgwsfc)

    data_zoom_level_dirpwsfc = datasets[TOP_LEVEL_FOLDER]['sub_resource'][
        SUB_RESOURCE_DIRPWSFC]['data_tiles_zoom_level']
    pickle_task_distributor(output_pickle_path_dirpwsfc, AWS_BUCKET_NAME,
                            output_tile_data_path_dirpwsfc,
                            data_zoom_level_dirpwsfc)

    data_zoom_level_perpwsfc = datasets[TOP_LEVEL_FOLDER]['sub_resource'][
        SUB_RESOURCE_PERPWSFC]['data_tiles_zoom_level']
    pickle_task_distributor(output_pickle_path_perpwsfc, AWS_BUCKET_NAME,
                            output_tile_data_path_perpwsfc,
                            data_zoom_level_perpwsfc)

    file.close()
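
# ---------------------------------------------------------------------------
# Read-back sketch (assumes the bucket and key layout written above, plus S3
# read access in the calling environment): pulls one of the just-written WW3
# pickles and prints its keys and grid shape. The folder date argument is a
# hypothetical example value.
def _inspect_ww3_pickle(formatted_folder_date='20190206_00'):
    s3 = boto3.client('s3')
    key = ('WW3_DATA/' + formatted_folder_date + '/sig_wave_height/pickle/' +
           'ww3_htsgwsfc_' + formatted_folder_date + '.pickle')
    obj = s3.get_object(Bucket='oceanmapper-data-storage', Key=key)
    payload = pickle.loads(obj['Body'].read())
    # expected keys: lat, lon, sig_wave_height, time_origin
    print(sorted(payload.keys()), payload['sig_wave_height'].shape)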
def get_ww3_forecast_info(ww3_url):
    """
    get_ww3_forecast_info(ww3_url)

    This function assembles an array of tuples containing each model forecast
    field datetime along with the index of that forecast field. This
    facilitates concurrent downloads of model data.
    -----------------------------------------------------------------------
    Input: {string} ww3_url - displays available Wave Watch 3 forecast model
        runs, i.e. https://nomads.ncep.noaa.gov:9090/dods/wave/nww3
    -----------------------------------------------------------------------
    Output: dict containing the OPeNDAP URL of the latest run and an array of
        tuples with this structure:
        forecast_info = {'url': opendapp_url,
                         'data': [(forecast_indx, forecast_field_datetime), ...]}
    -----------------------------------------------------------------------
    Author: Michael Christensen
    Date Modified: 02/06/2019
    """

    page = urllib.urlopen(ww3_url).read()
    soup = BeautifulSoup(page, 'html.parser')
    soup.prettify()

    # scrape the available forecast run dates
    date_array = np.array([])
    for datetime_element in soup.findAll('b'):
        match = re.search(r'(\d{8})[/]:$', datetime_element.string)
        if match:
            unformatted_date = match.group(1)
            parsed_date = datetime.datetime.strptime(unformatted_date, '%Y%m%d')
            date_array = np.append(date_array, parsed_date)

    max_forecast_run_date = np.max(date_array)
    formatted_latest_date = datetime.datetime.strftime(max_forecast_run_date,
                                                       '%Y%m%d')

    # find the latest run using bs4
    forecast_run_url = ww3_url + '/nww3' + formatted_latest_date
    page = urllib.urlopen(forecast_run_url).read()
    soup = BeautifulSoup(page, 'html.parser')
    soup.prettify()

    forecast_run_array = {}
    for model_run in soup.findAll('b'):
        match = re.search(r'nww3\d{8}_(\d{2})z', model_run.string)
        if match:
            run_name = match.group(0)
            forecast_run_hour = match.group(1)
            forecast_run_array.setdefault(int(forecast_run_hour), run_name)

    # build forecast field datetime/indx array for the latest run hour
    max_run = max(forecast_run_array.keys())
    opendapp_url = forecast_run_url + '/' + forecast_run_array[max_run]
    file = get_opendapp_netcdf(opendapp_url)
    product_times = file.variables['time'][:]
    file.close()

    forecast_info = {}
    forecast_info['url'] = opendapp_url
    forecast_info['data'] = []
    for forecast_indx, forecast_time in enumerate(product_times):
        basetime_int = int(forecast_time)
        extra_days = forecast_time - basetime_int
        # need to subtract 1 since WW3 is days since 0001-01-01 (yyyy-mm-dd)
        full_forecast_time = (datetime.datetime.fromordinal(basetime_int) +
                              datetime.timedelta(days=extra_days) -
                              datetime.timedelta(days=1))
        forecast_info['data'].append((forecast_indx, full_forecast_time))

    return forecast_info
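
# ---------------------------------------------------------------------------
# Fan-out sketch (the Lambda function name 'ww3_ingest' is hypothetical):
# converts the structure returned above into one event per forecast field and
# fires the WW3 lambda_handler asynchronously. The event keys mirror what
# that handler unpacks ('url', 'forecast_time', 'forecast_indx').
def _dispatch_ww3_ingest(ww3_url='https://nomads.ncep.noaa.gov:9090/dods/wave/nww3'):
    forecast_info = get_ww3_forecast_info(ww3_url)
    lambda_client = boto3.client('lambda')
    for forecast_indx, forecast_time in forecast_info['data']:
        event = {
            'url': forecast_info['url'],
            'forecast_time': datetime.datetime.strftime(forecast_time,
                                                        '%Y%m%dT%H:%M'),
            'forecast_indx': forecast_indx,
        }
        lambda_client.invoke(FunctionName='ww3_ingest',  # hypothetical name
                             InvocationType='Event',
                             Payload=json.dumps(event))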
def get_gfs_forecast_info(gfs_url):
    """
    get_gfs_forecast_info(gfs_url)

    This function assembles an array of tuples containing each model forecast
    field datetime along with the index of that forecast field. This
    facilitates concurrent downloads of model data.
    -----------------------------------------------------------------------
    Input: {string} gfs_url - displays available GFS forecast model runs,
        i.e. http://nomads.ncep.noaa.gov:9090/dods/gfs_0p25
    -----------------------------------------------------------------------
    Output: dict containing the OPeNDAP URL of the latest run and an array of
        tuples with this structure:
        forecast_info = {'url': opendapp_url,
                         'data': [(forecast_indx, forecast_field_datetime), ...]}
    """

    page = urllib.urlopen(gfs_url).read()
    soup = BeautifulSoup(page, 'html.parser')
    soup.prettify()

    # scrape the available forecast run dates
    date_array = np.array([])
    for anchor in soup.findAll('a', href=True):
        anchor_str = anchor['href']
        match = re.search(r'gfs(\d{8})$', anchor_str)
        if match:
            unformatted_date = match.group(1)
            parsed_date = datetime.datetime.strptime(unformatted_date, '%Y%m%d')
            date_array = np.append(date_array, parsed_date)

    max_forecast_run_date = np.max(date_array)
    formatted_latest_date = datetime.datetime.strftime(max_forecast_run_date,
                                                       '%Y%m%d')

    # find the latest run using bs4
    forecast_run_url = gfs_url + '/gfs' + formatted_latest_date
    page = urllib.urlopen(forecast_run_url).read()
    soup = BeautifulSoup(page, 'html.parser')
    soup.prettify()

    forecast_run_array = {}
    for model_run in soup.findAll('b'):
        match = re.search(r'(gfs_.*_(\d{2})z):$', model_run.string)
        if match:
            run_name = match.group(1)
            forecast_run_hour = match.group(2)
            forecast_run_array.setdefault(int(forecast_run_hour), run_name)

    # build forecast field datetime/indx array for the latest run hour
    max_run = max(forecast_run_array.keys())
    opendapp_url = forecast_run_url + '/' + forecast_run_array[max_run]
    file = get_opendapp_netcdf(opendapp_url)
    product_times = file.variables['time'][:]
    file.close()

    forecast_info = {}
    forecast_info['url'] = opendapp_url
    forecast_info['data'] = []
    for forecast_indx, forecast_time in enumerate(product_times):
        basetime_int = int(forecast_time)
        extra_days = forecast_time - basetime_int
        # need to subtract 1 since GFS is days since 0001-01-01 (yyyy-mm-dd)
        full_forecast_time = (datetime.datetime.fromordinal(basetime_int) +
                              datetime.timedelta(days=extra_days) -
                              datetime.timedelta(days=1))
        forecast_info['data'].append((forecast_indx, full_forecast_time))

    return forecast_info
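
# ---------------------------------------------------------------------------
# Sketch: the same fan-out pattern as the WW3 helper above, reduced to the
# event-building step. Each tuple from get_gfs_forecast_info() maps directly
# onto the event schema expected by the GFS wind lambda_handler.
def _build_gfs_events(gfs_url='http://nomads.ncep.noaa.gov:9090/dods/gfs_0p25'):
    forecast_info = get_gfs_forecast_info(gfs_url)
    return [{
        'url': forecast_info['url'],
        'forecast_time': datetime.datetime.strftime(forecast_time,
                                                    '%Y%m%dT%H:%M'),
        'forecast_indx': forecast_indx,
    } for forecast_indx, forecast_time in forecast_info['data']]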
def lambda_handler(event, context):
    """
    lambda_handler(event, context):

    This function reads and parses a netCDF file from a provided OPeNDAP URL
    (contained within the event parameter object) and saves a .json and a
    .pickle file.
    -----------------------------------------------------------------------
    Inputs:
    event: AWS Lambda uses this parameter to pass in event data to the
        handler. This parameter is usually of the Python dict type. It can
        also be list, str, int, float, or NoneType type.
    context: AWS Lambda uses this parameter to provide runtime information
        to your handler. This parameter is of the LambdaContext type.
    -----------------------------------------------------------------------
    Output: A .json file and a .pickle file are saved to S3
    -----------------------------------------------------------------------
    Author: Michael Christensen
    Date Modified: 08/26/2018
    """

    AWS_BUCKET_NAME = 'oceanmapper-data-storage'
    TOP_LEVEL_FOLDER = 'RTOFS_OCEAN_CURRENTS_HIGHRES'

    # unpack event data
    url = event['url']
    model_field_time = datetime.datetime.strptime(event['forecast_time'],
                                                  '%Y%m%dT%H:%M')
    model_field_indx = event['forecast_indx']

    file = get_opendapp_netcdf(url)
    formatted_folder_date = datetime.datetime.strftime(model_field_time,
                                                       '%Y%m%d_%H')

    # update this when fetching 4d data (right now only the surface depth is used)
    model_level_depth = 0
    output_json_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                        '/0m/json/' + 'rtofs_currents_' +
                        formatted_folder_date + '.json')
    output_pickle_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                          '/0m/pickle/' + 'rtofs_currents_' +
                          formatted_folder_date + '.pickle')
    output_tile_scalar_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                               '/' + str(model_level_depth) +
                               'm/tiles/scalar/')

    lat = file.variables['lat'][:]
    lon = file.variables['lon'][:]

    # transform masked values to 0
    u_data_raw = file.variables['u_velocity'][model_field_indx, 0, :, :]  # [time,level,lat,lon]
    v_data_raw = file.variables['v_velocity'][model_field_indx, 0, :, :]
    u_data_mask_applied = np.where(~u_data_raw.mask, u_data_raw, 0)
    v_data_mask_applied = np.where(~v_data_raw.mask, v_data_raw, 0)

    # rtofs longitudes go from 74.16 to 434.06227 -- remap and sort to -180 to 180 grid
    lon_translate = np.where(lon > 180, lon - 360.0, lon)
    lon_sort_indices = np.argsort(lon_translate)

    # ordered longitude array
    lon_ordered = lon_translate[lon_sort_indices]

    # rebuild u/v data with correct longitude sorting (monotonically increasing)
    u_data_cleaned = np.array(
        [lat_row[lon_sort_indices] for lat_row in u_data_mask_applied])
    v_data_cleaned = np.array(
        [lat_row[lon_sort_indices] for lat_row in v_data_mask_applied])

    # assign the raw data to a variable so we can pickle it for use with other scripts
    raw_data = {
        'lat': lat,
        'lon': lon_ordered,
        'u_vel': u_data_cleaned,
        'v_vel': v_data_cleaned
    }
    raw_data_pickle = pickle.dumps(raw_data)

    # 0.5 degree output grid; np.arange excludes the stop value, so pad it by 0.5
    output_lat_array = np.arange(-90, 90.5, 0.5)
    output_lon_array = np.arange(-180, 180.5, 0.5)

    u_interp_func = interpolate.interp2d(lon_ordered, lat, u_data_cleaned,
                                         kind='cubic')
    v_interp_func = interpolate.interp2d(lon_ordered, lat, v_data_cleaned,
                                         kind='cubic')
    u_data_interp = u_interp_func(output_lon_array, output_lat_array)
    v_data_interp = v_interp_func(output_lon_array, output_lat_array)

    minLat = np.min(output_lat_array)
    maxLat = np.max(output_lat_array)
    minLon = np.min(output_lon_array)
    maxLon = np.max(output_lon_array)

    dx = np.diff(output_lon_array)[0]
    dy = np.diff(output_lat_array)[0]

    output_data = [
        {
            'header': {
                'parameterUnit': "m.s-1",
                'parameterNumber': 2,
                'dx': dx,
                'dy': dy,
                'parameterNumberName': "Eastward current",
                'la1': maxLat,
                'la2': minLat,
                'parameterCategory': 2,
                'lo1': minLon,
                'lo2': maxLon,
                'nx': len(output_lon_array),
                'ny': len(output_lat_array),
                'refTime': datetime.datetime.strftime(model_field_time,
                                                      '%Y-%m-%d %H:%M:%S'),
            },
            'data': [
                float('{:.3f}'.format(el)) if np.abs(el) > 0.0001 else 0
                for el in u_data_interp[::-1].flatten().tolist()
            ]
        },
        {
            'header': {
                'parameterUnit': "m.s-1",
                'parameterNumber': 3,
                'dx': dx,
                'dy': dy,
                'parameterNumberName': "Northward current",
                'la1': maxLat,
                'la2': minLat,
                'parameterCategory': 2,
                'lo1': minLon,
                'lo2': maxLon,
                'nx': len(output_lon_array),
                'ny': len(output_lat_array),
                'refTime': datetime.datetime.strftime(model_field_time,
                                                      '%Y-%m-%d %H:%M:%S'),
            },
            'data': [
                float('{:.3f}'.format(el)) if np.abs(el) > 0.0001 else 0
                for el in v_data_interp[::-1].flatten().tolist()
            ]
        },
    ]

    client = boto3.client('s3')
    client.put_object(Body=json.dumps(output_data),
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_json_path)
    client.put_object(Body=raw_data_pickle,
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_pickle_path)

    # call an intermediate function to distribute tiling workload
    tile_task_distributor(output_pickle_path, 'current_speed', AWS_BUCKET_NAME,
                          output_tile_scalar_path, range(3, 5))

    file.close()
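
# ---------------------------------------------------------------------------
# Sanity-check sketch for the JSON written above (a GRIB-style header plus a
# flattened data array per component): each component's 'data' list must hold
# exactly nx * ny values on a regular dx/dy grid. Standalone helper, not part
# of the handler.
def _validate_velocity_json(output_data):
    for component in output_data:
        header = component['header']
        expected = header['nx'] * header['ny']
        assert len(component['data']) == expected, \
            'grid/data mismatch: {} != {}'.format(len(component['data']), expected)
        assert header['dx'] > 0 and header['dy'] > 0
    return True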
def get_hycom_forecast_info(hycom_url):
    """
    get_hycom_forecast_info(hycom_url)

    This function assembles an object containing the latest available
    forecast date that covers the full forecast extent (168 hrs), an array of
    OPeNDAP URLs (one for each timestep), the corresponding field datetimes,
    and an array of the depth levels supported by this model.

    Model Info: https://www.hycom.org/dataserver/gofs-3pt1/analysis
    -----------------------------------------------------------------------
    Input: {string} hycom_url - the HYCOM forecast data catalog url, i.e.
        http://tds.hycom.org/thredds/catalog/datasets/GLBv0.08/expt_93.0/data/forecasts/catalog.html
    -----------------------------------------------------------------------
    Output: object with this structure:
        forecast_info = {'forecast': {'latest_date': 'yyyymmdd_HHMM',
                                      'data_urls': [...],
                                      'field_datetimes': [dt, dt, ...]},
                         'levels': [0, 2, ...]}
    """

    page = urllib.urlopen(hycom_url).read()
    soup = BeautifulSoup(page, 'html.parser')
    soup.prettify()

    # group the catalog entries by forecast run date
    forecast_dict = {}
    for anchor in soup.findAll('a', href=True):
        anchor_str = anchor['href']
        match = re.search(r'hycom_glbv_930_(\d{10})_t(\d{3})_uv3z.nc$',
                          anchor_str)
        if match:
            unformatted_date = match.group(1)
            datetime_element = datetime.datetime.strptime(unformatted_date,
                                                          '%Y%m%d%H')
            forecast_hour_extent = int(match.group(2))
            full_forecast_time = (datetime_element +
                                  datetime.timedelta(hours=forecast_hour_extent))
            forecast_dict.setdefault(datetime_element, []).append({
                'forecast_date': full_forecast_time,
                'forecast_hour_extent': forecast_hour_extent
            })

    # sort available unique forecast dates in reverse order so the most recent is first
    unique_dates = sorted(forecast_dict.keys(), reverse=True)
    max_forecast_run_date = unique_dates[0]

    # use the forecast run which gets full coverage (currently 168 hrs into
    # the future); deal with the possibility of only 1 date being available
    if len(unique_dates) > 1:
        previous_forecast_extent = forecast_dict[unique_dates[1]][-1]['forecast_hour_extent']
    else:
        previous_forecast_extent = 0
    present_forecast_extent = forecast_dict[unique_dates[0]][-1]['forecast_hour_extent']

    if present_forecast_extent >= previous_forecast_extent:
        latest_date = unique_dates[0]
    else:
        latest_date = unique_dates[1]

    formatted_latest_date = datetime.datetime.strftime(latest_date, '%Y%m%d%H')
    base_opendapp_url = ('http://tds.hycom.org/thredds/dodsC/datasets/GLBv0.08/'
                         'expt_93.0/data/forecasts/hycom_glbv_930_')

    data_urls = []
    field_datetimes = []
    for forecast_field in forecast_dict[latest_date]:
        formatted_hour_extent = str(forecast_field['forecast_hour_extent']).zfill(3)
        output_url = (base_opendapp_url + formatted_latest_date + '_t' +
                      formatted_hour_extent + '_uv3z.nc')
        data_urls.append(output_url)
        field_datetimes.append(forecast_field['forecast_date'])

    forecast_info = {
        'forecast': {
            'latest_date': datetime.datetime.strftime(latest_date, '%Y%m%d_%H%M'),
            'data_urls': data_urls,
            'field_datetimes': field_datetimes
        }
    }

    # use the first data url to get the depth levels (they are the same for each .nc file)
    file = get_opendapp_netcdf(data_urls[0])
    levels = file.variables['depth'][:]
    file.close()

    # add levels to the output data structure
    forecast_info['levels'] = [int(lev) for lev in levels.tolist()]

    return forecast_info
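
# ---------------------------------------------------------------------------
# Sketch: pairing the HYCOM URLs, field datetimes, and depth levels returned
# above into per-timestep, per-depth work items. The handler that consumes
# HYCOM fields is not shown in this section, so the item layout here is
# illustrative only; it simply demonstrates how the arrays in forecast_info
# line up with each other.
def _build_hycom_work_items(hycom_catalog_url):
    forecast_info = get_hycom_forecast_info(hycom_catalog_url)
    urls = forecast_info['forecast']['data_urls']
    times = forecast_info['forecast']['field_datetimes']
    items = []
    for level_indx, level_depth in enumerate(forecast_info['levels']):
        for url, field_time in zip(urls, times):
            items.append({
                'url': url,
                'forecast_time': datetime.datetime.strftime(field_time,
                                                            '%Y%m%dT%H:%M'),
                'level': {'level_depth': level_depth,
                          'level_indx': level_indx},
            })
    return items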
def lambda_handler(event, context):
    """
    lambda_handler(event, context):

    This function reads and parses a netCDF file from a provided OPeNDAP URL
    (contained within the event parameter object) and saves a .json and a
    .pickle file.
    -----------------------------------------------------------------------
    Inputs:
    event: AWS Lambda uses this parameter to pass in event data to the
        handler. This parameter is usually of the Python dict type. It can
        also be list, str, int, float, or NoneType type.
    context: AWS Lambda uses this parameter to provide runtime information
        to your handler. This parameter is of the LambdaContext type.
    -----------------------------------------------------------------------
    Output: A .json file and a .pickle file are saved to S3
    -----------------------------------------------------------------------
    Author: Michael Christensen
    Date Modified: 12/20/2018
    """

    AWS_BUCKET_NAME = 'oceanmapper-data-storage'
    TOP_LEVEL_FOLDER = 'RTOFS_DATA'
    SUB_RESOURCE = 'ocean_current_speed'
    DATA_PREFIX = 'rtofs_currents'

    # unpack event data
    url = event['url']
    model_field_time = datetime.datetime.strptime(event['forecast_time'],
                                                  '%Y%m%dT%H:%M')
    model_field_indx = event['forecast_indx']
    model_level_depth = event['level']['level_depth']
    model_level_indx = event['level']['level_indx']

    # the u- and v-component fields live in sibling datasets
    u_comp_url = url
    v_comp_url = url.replace('uvel', 'vvel')
    file_u = get_opendapp_netcdf(u_comp_url)
    file_v = get_opendapp_netcdf(v_comp_url)

    formatted_folder_date = datetime.datetime.strftime(model_field_time,
                                                       '%Y%m%d_%H')

    output_json_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date + '/' +
                        SUB_RESOURCE + '/' + str(model_level_depth) +
                        'm/json/' + DATA_PREFIX + '_' +
                        formatted_folder_date + '.json')
    output_pickle_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date + '/' +
                          SUB_RESOURCE + '/' + str(model_level_depth) +
                          'm/pickle/' + DATA_PREFIX + '_' +
                          formatted_folder_date + '.pickle')
    output_tile_scalar_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                               '/' + SUB_RESOURCE + '/' +
                               str(model_level_depth) + 'm/tiles/scalar/')
    output_tile_data_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                             '/' + SUB_RESOURCE + '/' +
                             str(model_level_depth) + 'm/tiles/data/')
    output_info_path = (TOP_LEVEL_FOLDER + '/' + formatted_folder_date +
                        '/info.json')

    # get model origin time
    if 'nowcast' in u_comp_url:
        time_orig_str = file_u.variables['time'].maximum
    else:
        time_orig_str = file_u.variables['time'].minimum
    time_origin = datetime.datetime.strptime(time_orig_str, '%Hz%d%b%Y')

    lat = file_u.variables['lat'][:]
    lon = file_u.variables['lon'][:]

    # transform masked values to 0
    u_data_raw = file_u.variables['u'][model_field_indx, model_level_indx, :, :]  # [time,level,lat,lon]
    v_data_raw = file_v.variables['v'][model_field_indx, model_level_indx, :, :]
    u_data_mask_applied = np.where(~u_data_raw.mask, u_data_raw, 0)
    v_data_mask_applied = np.where(~v_data_raw.mask, v_data_raw, 0)

    # ordered lat array
    lat_sort_indices = np.argsort(lat)
    lat_ordered = lat[lat_sort_indices]

    # rtofs longitudes go from 74.16 to 434.06227 -- remap and sort to -180 to 180 grid
    lon_translate = np.where(lon > 180, lon - 360.0, lon)
    lon_sort_indices = np.argsort(lon_translate)

    # ordered longitude array
    lon_ordered = lon_translate[lon_sort_indices]

    # rebuild u/v data with correct longitude sorting (monotonically increasing);
    # the *_filled arrays have masked values set to 0 for interpolation, while
    # the *_cleaned arrays keep the original mask for pickling
    u_data_cleaned_filled = u_data_mask_applied[lat_sort_indices, :][:, lon_sort_indices]
    v_data_cleaned_filled = v_data_mask_applied[lat_sort_indices, :][:, lon_sort_indices]
    u_data_cleaned = u_data_raw[lat_sort_indices, :][:, lon_sort_indices]
    v_data_cleaned = v_data_raw[lat_sort_indices, :][:, lon_sort_indices]

    # assign the raw data to a variable so we can pickle it for use with other scripts
    raw_data = {
        'lat': lat_ordered,
        'lon': lon_ordered,
        'u_vel': u_data_cleaned,
        'v_vel': v_data_cleaned,
        'time_origin': time_origin
    }
    raw_data_pickle = pickle.dumps(raw_data)

    # 0.5 degree output grid; np.arange excludes the stop value, so pad it by 0.5
    output_lat_array = np.arange(int(min(lat)), int(max(lat)) + 0.5, 0.5)
    output_lon_array = np.arange(-180, 180.5, 0.5)

    u_interp_func = interpolate.interp2d(lon_ordered, lat_ordered,
                                         u_data_cleaned_filled, kind='cubic')
    v_interp_func = interpolate.interp2d(lon_ordered, lat_ordered,
                                         v_data_cleaned_filled, kind='cubic')
    u_data_interp = u_interp_func(output_lon_array, output_lat_array)
    v_data_interp = v_interp_func(output_lon_array, output_lat_array)

    minLat = np.min(output_lat_array)
    maxLat = np.max(output_lat_array)
    minLon = np.min(output_lon_array)
    maxLon = np.max(output_lon_array)

    dx = np.diff(output_lon_array)[0]
    dy = np.diff(output_lat_array)[0]

    output_data = [
        {
            'header': {
                'parameterUnit': "m.s-1",
                'parameterNumber': 2,
                'dx': dx,
                'dy': dy,
                'parameterNumberName': "Eastward current",
                'la1': maxLat,
                'la2': minLat,
                'parameterCategory': 2,
                'lo1': minLon,
                'lo2': maxLon,
                'nx': len(output_lon_array),
                'ny': len(output_lat_array),
                'refTime': datetime.datetime.strftime(model_field_time,
                                                      '%Y-%m-%d %H:%M:%S'),
                'timeOrigin': datetime.datetime.strftime(time_origin,
                                                         '%Y-%m-%d %H:%M:%S'),
            },
            'data': [
                float('{:.3f}'.format(el)) if np.abs(el) > 0.0001 else 0
                for el in u_data_interp[::-1].flatten().tolist()
            ]
        },
        {
            'header': {
                'parameterUnit': "m.s-1",
                'parameterNumber': 3,
                'dx': dx,
                'dy': dy,
                'parameterNumberName': "Northward current",
                'la1': maxLat,
                'la2': minLat,
                'parameterCategory': 2,
                'lo1': minLon,
                'lo2': maxLon,
                'nx': len(output_lon_array),
                'ny': len(output_lat_array),
                'refTime': datetime.datetime.strftime(model_field_time,
                                                      '%Y-%m-%d %H:%M:%S'),
                'timeOrigin': datetime.datetime.strftime(time_origin,
                                                         '%Y-%m-%d %H:%M:%S'),
            },
            'data': [
                float('{:.3f}'.format(el)) if np.abs(el) > 0.0001 else 0
                for el in v_data_interp[::-1].flatten().tolist()
            ]
        },
    ]

    client = boto3.client('s3')
    client.put_object(Body=json.dumps(output_data),
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_json_path)
    client.put_object(Body=raw_data_pickle,
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_pickle_path)

    # save an info file for enhanced performance (get_model_field_api.py)
    client.put_object(Body=json.dumps({
        'time_origin': datetime.datetime.strftime(time_origin,
                                                  '%Y-%m-%d %H:%M:%S')
    }),
                      Bucket=AWS_BUCKET_NAME,
                      Key=output_info_path)

    # call an intermediate function to distribute pickling workload (subsetting data by tile)
    data_zoom_level = datasets[TOP_LEVEL_FOLDER]['sub_resource'][SUB_RESOURCE][
        'data_tiles_zoom_level']
    pickle_task_distributor(output_pickle_path, AWS_BUCKET_NAME,
                            output_tile_data_path, data_zoom_level)

    file_u.close()
    file_v.close()
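
# ---------------------------------------------------------------------------
# Usage sketch (hypothetical values): an event for the RTOFS handler above.
# 'url' must point at the u-velocity dataset, since the handler derives the
# v-velocity URL via url.replace('uvel', 'vvel'); 'level_indx' indexes the
# file's level axis, while 'level_depth' only labels the S3 paths. The URL is
# illustrative, not a verified dataset path.
def _example_rtofs_event():
    return {
        'url': ('https://nomads.ncep.noaa.gov:9090/dods/rtofs/'
                'rtofs_global20181220/rtofs_glo_3dz_forecast_daily_uvel'),  # illustrative
        'forecast_time': '20181220T00:00',
        'forecast_indx': 0,
        'level': {'level_depth': 0, 'level_indx': 0},
    }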