class MarineWeatherRealtimeLoader(BaseLoader):
    """Marine weather real-time loader"""

    def __init__(self, conn_config={}):
        """initializer"""

        BaseLoader.__init__(self)

        self.conn = ElasticsearchConnector(conn_config)
        self.filepath = None
        self.region_name_code = None
        self.language = None
        self.root = None
        self.area = {}
        self.items = []

        # create marine weather indices if they don't exist
        for item in MAPPINGS:
            SETTINGS['mappings']['properties']['properties'][
                'properties'] = MAPPINGS[item]
            self.conn.create(INDEX_NAME.format(item), SETTINGS)

    def parse_filename(self):
        """
        Parses a marine weather forecast XML filename to get the
        region name code and language.

        :return: `bool` of parse status
        """
        # parse filepath
        pattern = '{region_name_code}_{language}.xml'
        filename = self.filepath.name
        parsed_filename = parse(pattern, filename)

        # set class variables
        self.region_name_code = parsed_filename.named['region_name_code']
        self.language = parsed_filename.named['language']

        return True

    def create_datetime_dict(self, datetime_elems):
        """
        Parses a pair of dateTime elements from the XML. These elements
        contain the UTC and local time for various marine forecast sections
        (warnings, regular forecasts, extended forecasts). The first element
        contains UTC datetime info and the second local datetime info.

        :param datetime_elems: list of lxml `Element` objects representing
                               the dateTime nodes to parse.
        :returns: `dict` with "utc" and "local" keys containing respective
                  parsed datetime objects.
        """
        datetime_utc = datetime.strptime(
            datetime_elems[0].find('timeStamp').text, '%Y%m%d%H%M')

        local_offset = float(datetime_elems[1].attrib['UTCOffset'])
        datetime_local = datetime_utc + timedelta(hours=local_offset)
        datetime_local = datetime_local.replace(
            tzinfo=timezone(timedelta(hours=local_offset)))

        return {'utc': datetime_utc, 'local': datetime_local}

    def set_area_info(self):
        """
        Gets the area name from the marine weather XML document and looks up
        the equivalent meteocode forecast polygon feature ID to query the
        forecast_polygons_water ES index for the corresponding document.
        If a document is found, assigns the self.area class attribute
        containing the region name, subregion name, area name and the
        associated geometry.

        :return: `bool` representing successful setting of self.area
                 attribute
        """
        area_name = self.root.find('area').text

        with open(
            os.path.join(
                MSC_PYGEOAPI_BASEPATH,
                'lib/msc_pygeoapi/',
                'resources/meteocode_lookup.json',
            )) as json_file:
            meteocode_lookup = json.load(json_file)
            forecast_id = meteocode_lookup[self.region_name_code]

        try:
            result = self.conn.Elasticsearch.get(
                index='forecast_polygons_water_detail',
                id=forecast_id,
                _source=['geometry'],
            )
            self.area = {
                # get area element value
                **{'name': area_name},
                # get area element attribute values
                **{
                    key: self.root.find('area').attrib[key]
                    for key in ['countryCode', 'region', 'subRegion']
                },
                **result['_source'],
            }
            return True
        except exceptions.NotFoundError:
            LOGGER.warning("Could not get forecast polygon document with id: "
                           "{}".format(forecast_id))

    def generate_warnings(self):
        """
        Generates and yields a series of marine weather warnings for a given
        marine weather area. Warnings are returned as Elasticsearch bulk API
        upsert actions, with a single document for the marine weather region
        in GeoJSON to match the Elasticsearch index mappings.

        :returns: Generator of Elasticsearch actions to upsert the marine
                  weather warnings.
        """
        warnings = self.root.findall('warnings/')

        feature = {'type': 'Feature', 'geometry': {}, 'properties': {}}
        feature['geometry'] = self.area['geometry']
        feature['properties']['area_{}'.format(
            self.language)] = self.area['name']
        feature['properties']['region_{}'.format(
            self.language)] = self.area['region']
        feature['properties']['sub_region_{}'.format(
            self.language)] = self.area['subRegion']
        feature['properties']['warnings_{}'.format(self.language)] = []

        if len(warnings) > 0:
            for elem in warnings:
                datetimes = self.create_datetime_dict(
                    elem.findall('event/'
                                 'dateTime'))
                location = {
                    'location_{}'.format(self.language):
                        elem.attrib['name'],
                    'issued_datetime_utc_{}'.format(self.language):
                        strftime_rfc3339(datetimes['utc']),
                    'issued_datetime_local_{}'.format(self.language):
                        strftime_rfc3339(datetimes['local']),
                    'event_type_{}'.format(self.language):
                        elem.find('event').attrib['type'],
                    'event_category_{}'.format(self.language):
                        elem.find('event').attrib['category'],
                    'event_name_{}'.format(self.language):
                        elem.find('event').attrib['name'],
                    'event_status_{}'.format(self.language):
                        elem.find('event').attrib['status'],
                }
                feature['properties']['warnings_{}'.format(
                    self.language)].append(location)

        self.items.append(feature)

        action = {
            '_id': self.filepath.stem.split('_')[0],
            '_index': 'marine_weather_warnings',
            '_op_type': 'update',
            'doc': feature,
            'doc_as_upsert': True,
        }

        yield action

    def generate_regular_forecasts(self):
        """
        Generates and yields a series of marine weather regular forecasts for
        a given marine weather area. Regular forecasts are returned as
        Elasticsearch bulk API upsert actions, with documents in GeoJSON to
        match the Elasticsearch index mappings.

        :returns: Generator of Elasticsearch actions to upsert the marine
                  weather regular forecasts.
        """
        regular_forecasts = self.root.findall('regularForecast/')

        feature = {'type': 'Feature', 'geometry': {}, 'properties': {}}
        feature['geometry'] = self.area['geometry']
        feature['properties']['area_{}'.format(
            self.language)] = self.area['name']
        feature['properties']['region_{}'.format(
            self.language)] = self.area['region']
        feature['properties']['sub_region_{}'.format(
            self.language)] = self.area['subRegion']
        feature['properties']['forecasts_{}'.format(self.language)] = []

        if len(regular_forecasts) > 0:
            datetimes = self.create_datetime_dict([
                element for element in regular_forecasts
                if element.tag == 'dateTime'
            ])
            feature['properties']['issued_datetime_utc'] = strftime_rfc3339(
                datetimes['utc'])
            feature['properties']['issued_datetime_local'] = strftime_rfc3339(
                datetimes['local'])

            locations = [
                element for element in regular_forecasts
                if element.tag == 'location'
            ]
            for location in locations:
                location = {
                    'location_{}'.format(self.language):
                        location.attrib['name']
                        if 'name' in location.attrib else self.area['name'],
                    'period_of_coverage_{}'.format(self.language):
                        location.find('weatherCondition/periodOfCoverage').text
                        if location.find('weatherCondition/periodOfCoverage')
                        is not None else None,
                    'wind_{}'.format(self.language):
                        location.find('weatherCondition/wind').text
                        if location.find('weatherCondition/wind')
                        is not None else None,
                    'weather_visibility_{}'.format(self.language):
                        location.find('weatherCondition/weatherVisibility').text
                        if location.find('weatherCondition/weatherVisibility')
                        is not None else None,
                    'air_temperature_{}'.format(self.language):
                        location.find('weatherCondition/airTemperature').text
                        if location.find('weatherCondition/airTemperature')
                        is not None else None,
                    'freezing_spray_{}'.format(self.language):
                        location.find('weatherCondition/freezingSpray').text
                        if location.find('weatherCondition/freezingSpray')
                        is not None else None,
                    'status_statement_{}'.format(self.language):
                        location.find('statusStatement').text
                        if location.find('statusStatement')
                        is not None else None,
                }
                feature['properties']['forecasts_{}'.format(
                    self.language)].append(location)

        self.items.append(feature)

        action = {
            '_id': self.filepath.stem.split('_')[0],
            '_index': 'marine_weather_regular-forecasts',
            '_op_type': 'update',
            'doc': feature,
            'doc_as_upsert': True,
        }

        yield action

    def generate_extended_forecasts(self):
        """
        Generates and yields a series of marine weather extended forecasts
        for a given marine weather area. Extended forecasts are returned as
        Elasticsearch bulk API upsert actions, with documents in GeoJSON to
        match the Elasticsearch index mappings.

        :returns: Generator of Elasticsearch actions to upsert the marine
                  weather extended forecasts.
        """
        extended_forecasts = self.root.findall('extendedForecast/')

        feature = {'type': 'Feature', 'geometry': {}, 'properties': {}}
        feature['geometry'] = self.area['geometry']
        feature['properties']['area_{}'.format(
            self.language)] = self.area['name']
        feature['properties']['region_{}'.format(
            self.language)] = self.area['region']
        feature['properties']['sub_region_{}'.format(
            self.language)] = self.area['subRegion']
        feature['properties']['extended_forecasts_{}'.format(
            self.language)] = []

        if len(extended_forecasts) > 0:
            datetimes = self.create_datetime_dict([
                element for element in extended_forecasts
                if element.tag == 'dateTime'
            ])
            feature['properties']['issued_datetime_utc'] = strftime_rfc3339(
                datetimes['utc'])
            feature['properties']['issued_datetime_local'] = strftime_rfc3339(
                datetimes['local'])

            locations = [
                element for element in extended_forecasts
                if element.tag == 'location'
            ]
            for location in locations:
                location = {
                    'location_{}'.format(self.language):
                        location.attrib['name']
                        if 'name' in location.attrib else self.area['name'],
                    'forecast_periods_{}'.format(self.language): [{
                        'forecast_period_{}'.format(self.language):
                            forecast_period.attrib['name'],
                        'forecast_{}'.format(self.language):
                            forecast_period.text,
                    } for forecast_period in location.findall(
                        'weatherCondition/')
                        if location.findall('weatherCondition/') is not None],
                    'status_statement_{}'.format(self.language):
                        location.find('statusStatement').text
                        if location.find('statusStatement')
                        is not None else None,
                }
                feature['properties']['extended_forecasts_{}'.format(
                    self.language)].append(location)

        self.items.append(feature)

        action = {
            '_id': self.filepath.stem.split('_')[0],
            '_index': 'marine_weather_extended-forecasts',
            '_op_type': 'update',
            'doc': feature,
            'doc_as_upsert': True,
        }

        yield action

    def load_data(self, filepath):
        """
        loads data from event to target

        :returns: `bool` of status result
        """
        self.filepath = Path(filepath)
        self.parse_filename()

        LOGGER.debug('Received file {}'.format(self.filepath))

        self.root = etree.parse(str(self.filepath.resolve())).getroot()

        # set area info for both languages from XML
        self.set_area_info()

        warnings = self.generate_warnings()
        regular_forecasts = self.generate_regular_forecasts()
        extended_forecasts = self.generate_extended_forecasts()

        for package in [warnings, regular_forecasts, extended_forecasts]:
            self.conn.submit_elastic_package(package, request_size=80000)

        return True
class CapAlertsRealtimeLoader(BaseLoader): """Cap Alerts real-time loader""" def __init__(self, conn_config={}): """initializer""" BaseLoader.__init__(self) self.conn = ElasticsearchConnector(conn_config) self.conn.create(INDEX_NAME, mapping=SETTINGS) self.references_arr = [] def load_data(self, filepath): """ fonction from base to load the data in ES :param filepath: filepath for parsing the current condition file :returns: True/False """ data = self.weather_warning2geojson(filepath) try: self.bulk_data = [] for doc in data: op_dict = { 'index': { '_index': INDEX_NAME, '_type': '_doc' } } op_dict['index']['_id'] = doc['properties']['identifier'] self.bulk_data.append(op_dict) self.bulk_data.append(doc) r = self.conn.Elasticsearch.bulk( index=INDEX_NAME, body=self.bulk_data ) LOGGER.debug('Result: {}'.format(r)) previous_alerts = self.delete_references_alerts() click.echo('done importing in ES') if previous_alerts: LOGGER.debug('Deleted old warning') else: LOGGER.debug('New warning, no deletion') return True except Exception as err: LOGGER.warning('Error bulk indexing: {}'.format(err)) return False def delete_references_alerts(self): """Delete old alerts documents""" if self.references_arr and len(self.references_arr) != 0: click.echo('Deleting old alerts') query = { 'query': { 'terms': { 'properties.reference': self.references_arr } } } self.conn.Elasticsearch.delete_by_query( index=INDEX_NAME, body=query ) return True else: return False def weather_warning2geojson(self, filepath): """ Create GeoJSON that will be use to display weather alerts :param filepath: filepath to the cap-xml file :returns: xml as json object """ # we must define the variable that we'll need now = datetime.utcnow() french_alert = {} english_alert = {} english_alert_remove = [] timeformat = '%Y-%m-%dT%H:%M:%SZ' # we want to run a loop on every cap-xml in filepath and add them # in the geojson # we want to strat by the newest file in the directory LOGGER.info('Processing {} CAP documents'.format(len(filepath))) LOGGER.debug('Processing {}'.format(filepath)) # with the lxml library we parse the xml file try: tree = etree.parse(filepath) except Exception as err: LOGGER.warning('Cannot parse {}: {}'.format(filepath, err)) url = 'https://dd.weather.gc.ca/alerts/{}'.\ format(filepath.split('alerts')[1]) root = tree.getroot() b_xml = '{urn:oasis:names:tc:emergency:cap:1.2}' identifier = _get_element(root, '{}identifier'.format(b_xml)) references = _get_element(root, '{}references'.format(b_xml)) if references: for ref in references.split(' '): self.references_arr.append(ref.split(',')[1]) for grandchild in root.iter('{}info'.format(b_xml)): expires = _get_date_format(_get_element(grandchild, '{}expires'.format(b_xml)))\ .strftime(timeformat) status_alert = _get_element(grandchild, '{}parameter[last()-4]/' '{}value'.format(b_xml, b_xml)) if _get_date_format(expires) > now: language = _get_element(grandchild, '{}language'.format(b_xml)) if language == 'fr-CA': headline = _get_element(grandchild, '{}headline'.format(b_xml)) description_fr = '{}description'.format(b_xml) descript = _get_element(grandchild, description_fr) descript = descript.replace("\n", " ").strip() for i in grandchild.iter('{}area'.format(b_xml)): tag = _get_element(i, '{}polygon'.format(b_xml)) name = _get_element(i, '{}areaDesc'.format(b_xml)) for j in grandchild.iter('{}geocode'.format(b_xml)): str_value_name = '{}valueName'.format(b_xml) valueName = _get_element(j, str_value_name) if valueName == 'layer:EC-MSC-SMC:1.0:CLC': geocode_value = 
'{}value'.format(b_xml) geocode = _get_element(j, geocode_value) id_warning = '{}_{}'.format(identifier, geocode) if id_warning not in french_alert: french_alert[id_warning] = (id_warning, name, headline, descript) else: headline = _get_element(grandchild, '{}headline'.format(b_xml)) description = '{}description'.format(b_xml) descript = _get_element(grandchild, description) descript = descript.replace("\n", " ").strip() effective_date =\ _get_element(grandchild, '{}effective'.format(b_xml)) effective = _get_date_format(effective_date) effective = effective.strftime(timeformat) warning = _get_element(grandchild, '{}parameter[1]/' '{}value'.format(b_xml, b_xml)) # There can be many <area> cobvered by one # <info> so we have to loop through the info for i in grandchild.iter('{}area'.format(b_xml)): tag = _get_element(i, '{}polygon'.format(b_xml)) name = _get_element(i, '{}areaDesc'.format(b_xml)) for j in grandchild.iter('{}geocode'.format(b_xml)): valueName = \ _get_element(j, '{}valueName'.format(b_xml)) if valueName == 'layer:EC-MSC-SMC:1.0:CLC': geocode = \ _get_element(j, '{}value'.format(b_xml)) split_tag = re.split(' |,', tag) id_warning = '{}_{}'.format(identifier, geocode) if id_warning not in english_alert: english_alert[id_warning] = (split_tag, name, headline, effective, expires, warning, status_alert, id_warning, descript, url) LOGGER.info('Done processing') for j in english_alert: if _get_date_format(english_alert[j][4]) < now: english_alert_remove.append(j) # We can't remove a element of a dictionary while looping in it # So we remove the warning in another step for key in english_alert_remove: del english_alert[key] del french_alert[key] # To keep going we want to have the same number of warning # in english and in french if len(french_alert) == len(english_alert): LOGGER.info('Creating %d features', len(english_alert)) data = [] for num_poly in english_alert: poly = [] for el in list(reversed(range(0, len(english_alert[num_poly][0]), 2))): if len(english_alert[num_poly][0]) > 1: poly.append([float(english_alert[num_poly][0][el + 1]), float(english_alert[num_poly][0][el]), 0.0]) # for temporary care of the duplicate neighbors coordinate # poly = [k for k, g in groupby(poly)] no_dup_poly = [] for k in poly: if k not in no_dup_poly: no_dup_poly.append(k) no_dup_poly.append(poly[-1]) id_ = english_alert[num_poly][7] AlertLocation = { 'type': "Feature", 'properties': { 'identifier': id_, 'area': english_alert[num_poly][1], 'reference': identifier, 'zone': french_alert[num_poly][1], 'headline': english_alert[num_poly][2], 'titre': french_alert[num_poly][2], 'descrip_en': english_alert[num_poly][8], 'descrip_fr': french_alert[num_poly][3], 'effective': english_alert[num_poly][3], 'expires': english_alert[num_poly][4], 'alert_type': english_alert[num_poly][5], 'status': english_alert[num_poly][6], 'references': self.references_arr, 'url': english_alert[num_poly][9] }, 'geometry': { 'type': "Polygon", 'coordinates': [no_dup_poly] } } data.append(AlertLocation) return data
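

# Illustrative usage sketch (not part of the original module): indexing a
# single CAP alert file with the loader above. The path is hypothetical; it
# only needs to contain 'alerts' so the public dd.weather.gc.ca URL can be
# derived from it, and it must be a parseable CAP 1.2 XML document.
def _example_load_cap_alert(conn_config):
    loader = CapAlertsRealtimeLoader(conn_config)
    # builds English/French GeoJSON features, bulk indexes them and removes
    # any previously indexed alerts referenced by this one
    return loader.load_data('/data/alerts/cap/20210101/CWUL/12/alert.cap')
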
class ForecastPolygonsLoader(BaseLoader): """Forecast polygons (land/water) loader""" def __init__(self, conn_config={}): """initializer""" BaseLoader.__init__(self) self.conn = ElasticsearchConnector(conn_config) self.filepath = None self.version = None self.zone = None self.items = [] # create forecast polygon indices if they don't exist for index in INDICES: zone = index.split('_')[2] SETTINGS['mappings']['properties']['properties'][ 'properties'] = FILE_PROPERTIES[zone] self.conn.create(index, SETTINGS) def parse_filename(self): """ Parses a meteocode filename in order to get the version, zone (land/water) and type (proj, unproj, kmz, etc.) :return: `bool` of parse status """ # parse filepath pattern = 'MSC_Geography_Pkg_V{version:w}_{zone}_{type}.zip' filename = self.filepath.name parsed_filename = parse(pattern, filename) # set class variables self.version = parsed_filename.named['version'].replace('_', '.') self.zone = parsed_filename.named['zone'] return True def generate_geojson_features(self, shapefile_name): """ Generates and yields a series of meteocode geodata features, one for each feature in <self.filepath/self.filepath.stem/ shapefile_name>. Features are returned as Elasticsearch bulk API upsert actions, with documents in GeoJSON to match the Elasticsearch index mappings. :returns: Generator of Elasticsearch actions to upsert the forecast polygons for given shapefile in zip archive """ filepath = str( (self.filepath / self.filepath.stem / shapefile_name).resolve()) data = ogr.Open(r'/vsizip/{}'.format(filepath)) lyr = data.GetLayer() for feature in lyr: feature_json = feature.ExportToJson(as_object=True, options=['RFC7946=YES']) feature_json['properties']['version'] = self.version _id = feature_json['properties']['FEATURE_ID'] self.items.append(feature_json) action = { '_id': '{}'.format(_id), '_index': INDEX_NAME.format(self.zone.lower(), shapefile_name.split('_')[2]), '_op_type': 'update', 'doc': feature_json, 'doc_as_upsert': True } yield action def load_data(self, filepath): """ loads data from event to target :returns: `bool` of status result """ self.filepath = Path(filepath) # set class variables from filename self.parse_filename() LOGGER.debug('Received file {}'.format(self.filepath)) for shapefile in SHAPEFILES_TO_LOAD[self.filepath.stem]: # generate geojson features package = self.generate_geojson_features(shapefile) self.conn.submit_elastic_package(package, request_size=80000) return True
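

# Illustrative usage sketch (not part of the original module): loading one
# meteocode geography package. The zip name is hypothetical; it simply has to
# match the 'MSC_Geography_Pkg_V{version}_{zone}_{type}.zip' pattern parsed
# above and have a corresponding SHAPEFILES_TO_LOAD entry for its stem.
def _example_load_forecast_polygons(conn_config):
    loader = ForecastPolygonsLoader(conn_config)
    # reads each shapefile inside the zip via GDAL's /vsizip/ handler and
    # upserts one document per polygon feature
    return loader.load_data(
        '/data/geography/MSC_Geography_Pkg_V6_Water_Unproj.zip')
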
class CitypageweatherRealtimeLoader(BaseLoader): """Current conditions real-time loader""" def __init__(self, conn_config={}): """initializer""" BaseLoader.__init__(self) self.conn = ElasticsearchConnector(conn_config) self.conn.create(INDEX_NAME, mapping=SETTINGS) def load_data(self, filepath): """ fonction from base to load the data in ES :param filepath: filepath for parsing the current condition file :returns: True/False """ with open( os.path.join(MSC_PYGEOAPI_BASEPATH, 'resources/wxo_lookup.json')) as json_file: wxo_lookup = json.load(json_file) data = self.xml2json_cpw(wxo_lookup, filepath) try: r = self.conn.Elasticsearch.index( index=INDEX_NAME, id=data['properties']['identifier'], body=data) LOGGER.debug('Result: {}'.format(r)) return True except Exception as err: LOGGER.warning('Error indexing: {}'.format(err)) return False def _get_element(self, node, path, attrib=None): """ Convenience function to resolve lxml.etree.Element handling :param node: xml node :param path: path in the xml node :param attrib: attribute to get in the node returns: attribute as text or None """ val = node.find(path) if attrib is not None and val is not None: return val.attrib.get(attrib) if hasattr(val, 'text') and val.text not in [None, '']: return val.text return None def if_none(self, type_, value): """ Convenience fonction to avoid errors when converting to int or float :param type_: f for float and i for int :param value: value to convert to float/int :returns: converted variable """ if type_ == 'f': variable = float(value) if value else 'null' elif type_ == 'i': variable = int(value) if value else 'null' return variable def xml2json_cpw(self, wxo_lookup, xml): """ main for generating weather data :param wxo_lookup: json file to have the city id :param xml: xml file to parse and convert to json :returns: xml as json object """ feature = {} row = {} LOGGER.debug('Processing XML: {}'.format(xml)) LOGGER.debug('Fetching English elements') try: root = etree.parse(xml).getroot() except Exception as err: LOGGER.error('ERROR: cannot process data: {}'.format(err)) if root.findall("currentConditions/"): sitecode = os.path.basename(xml)[:-6] try: citycode = wxo_lookup[sitecode]['citycode'] except KeyError as err: LOGGER.error('ERROR: cannot find sitecode {} : ' 'err: {}'.format(sitecode, err)) location_name = root.find('location/name') x = float(location_name.attrib.get('lon')[:-1]) y = float(location_name.attrib.get('lat')[:-1]) if location_name.attrib.get('lat')[-1] == 'S': y *= -1 # south means negative latitude elif location_name.attrib.get('lon')[-1] in ['W', 'O']: x *= -1 # west means negative longitude feature['geom'] = [x, y, 0.0] icon = self._get_element(root, 'currentConditions/iconCode') if icon: row['icon'] = 'https://weather.gc.ca/' \ 'weathericons/{}.gif'.format(icon) else: row['icon'] = None for dates in root.findall("currentConditions/dateTime" "[@zone='UTC'][@name='observation']"): timestamp = dates.find('timeStamp') if timestamp is not None: dt2 = datetime.strptime(timestamp.text, '%Y%m%d%H%M%S') row['timestamp'] = dt2.strftime('%Y-%m-%dT%H:%M:%SZ') row['rel_hum'] = self._get_element( root, 'currentConditions/relativeHumidity') row['speed'] = self._get_element(root, 'currentConditions/wind/speed') row['gust'] = self._get_element(root, 'currentConditions/wind/gust') row['direction'] = self._get_element( root, 'currentConditions/wind/direction') row['bearing'] = self._get_element( root, 'currentConditions/wind/bearing') row['temp'] = self._get_element(root, 'currentConditions/temperature') 
row['dewpoint'] = self._get_element(root, 'currentConditions/dewpoint') row['windchill'] = self._get_element( root, 'currentConditions/windChill') if xml.endswith('e.xml'): row['name'] = self._get_element(root, 'location/name') row['station_en'] = self._get_element( root, 'currentConditions/station') row['cond_en'] = self._get_element( root, 'currentConditions/condition') row['pres_en'] = self._get_element( root, 'currentConditions/pressure') row['prestnd_en'] = self._get_element( root, 'currentConditions/pressure', 'tendency') row['url_en'] = 'https://weather.gc.ca/city/pages/' \ '{}_metric_e.html'.format(citycode) row['national'] = 0 if row['name'] in NATIONAL_CITIES: row['national'] = 1 LOGGER.debug('Adding feature') LOGGER.debug('Setting geometry') conditions = { 'type': "Feature", 'properties': { 'identifier': citycode, 'name': row['name'], 'station_en': row['station_en'], 'icon': row['icon'], 'cond_en': row['cond_en'], 'temp': self.if_none('f', row['temp']), 'dewpoint': self.if_none('f', row['dewpoint']), 'windchill': self.if_none('i', row['windchill']), 'pres_en': self.if_none('f', row['pres_en']), 'prestnd_en': row['prestnd_en'], 'rel_hum': self.if_none('i', row['rel_hum']), 'speed': self.if_none('i', row['speed']), 'gust': self.if_none('i', row['gust']), 'direction': row['direction'], 'bearing': self.if_none('f', row['bearing']), 'timestamp': row['timestamp'], 'url_en': row['url_en'], 'national': int(row['national']) }, 'geometry': { 'type': "Point", 'coordinates': feature['geom'] } } elif xml.endswith('f.xml'): LOGGER.debug('Processing {}'.format(xml)) row['nom'] = self._get_element(root, 'location/name') row['station_fr'] = self._get_element( root, 'currentConditions/station') row['cond_fr'] = self._get_element( root, 'currentConditions/condition') row['pres_fr'] = self._get_element( root, 'currentConditions/pressure') row['prestnd_fr'] = self._get_element( root, 'currentConditions/pressure', 'tendency') row['url_fr'] = 'https://meteo.gc.ca/city/pages/' \ '{}_metric_f.html'.format(citycode) row['national'] = 0 if row['nom'] in NATIONAL_CITIES: row['national'] = 1 LOGGER.debug('Adding feature') LOGGER.debug('Setting geometry') conditions = { 'type': "Feature", 'properties': { 'identifier': citycode, 'nom': row['nom'], 'station_fr': row['station_fr'], 'icon': row['icon'], 'cond_fr': row['cond_fr'], 'temp': self.if_none('f', row['temp']), 'dewpoint': self.if_none('f', row['dewpoint']), 'windchill': self.if_none('i', row['windchill']), 'pres_fr': self.if_none('f', row['pres_fr']), 'prestnd_fr': row['prestnd_fr'], 'rel_hum': self.if_none('i', row['rel_hum']), 'speed': self.if_none('i', row['speed']), 'gust': self.if_none('i', row['gust']), 'direction': row['direction'], 'bearing': self.if_none('f', row['bearing']), 'timestamp': row['timestamp'], 'url_fr': row['url_fr'], 'national': int(row['national']) }, 'geometry': { 'type': "Point", 'coordinates': feature['geom'] } } conditions['properties'] = { key: val for key, val in conditions['properties'].items() if val != 'null' } # noqa return conditions
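

# Illustrative usage sketch (not part of the original module): indexing one
# citypage weather file. The filename is hypothetical but follows the
# '<sitecode>_e.xml' / '<sitecode>_f.xml' convention relied on above, with
# the sitecode resolved to a city code through resources/wxo_lookup.json.
def _example_load_citypage(conn_config):
    loader = CitypageweatherRealtimeLoader(conn_config)
    # parses the XML, builds a GeoJSON point feature of current conditions
    # and indexes it under the city code identifier
    return loader.load_data('/data/citypage_weather/xml/s0000430_e.xml')
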
class HurricanesRealtimeLoader(BaseLoader): """Hurricanes Real-time loader""" def __init__(self, conn_config={}): """initializer""" BaseLoader.__init__(self) self.conn = ElasticsearchConnector(conn_config) self.filepath = None self.date_ = None self.fh = None self.storm_name = None self.storm_variable = None self.items = [] # create storm variable indices if it don't exist for item in FILE_PROPERTIES: SETTINGS['mappings']['properties']['properties'][ 'properties' ] = FILE_PROPERTIES[item] self.conn.create(INDEX_NAME.format(item), SETTINGS) def parse_filename(self): """ Parses a hurricane filename in order to get the date, forecast issued time, storm name, and storm variable. :return: `bool` of parse status """ # parse filepath pattern = '{date_}_{fh}_{storm_name}.{storm_variable}.' \ '{file_extension}' filename = self.filepath.name parsed_filename = parse(pattern, filename) # set class variables self.date_ = datetime.strptime(parsed_filename.named['date_'], '%Y%m%d') self.fh = parsed_filename.named['fh'] self.storm_name = parsed_filename.named['storm_name'] self.storm_variable = parsed_filename.named['storm_variable'] return True def check_shapefile_deps(self): """ Check that all shapefile dependencies are available :return: `bool` of check result """ dependencies = ['.shp', '.shx', '.dbf', '.prj'] return all([self.filepath.with_suffix(suffix).exists() for suffix in dependencies]) # TODO: Remove once upstream data is patched @staticmethod def clean_consecutive_coordinates(coordinates): """ Temporary fix for issues with upstream data. Removes consecutive coordinate points from GeoJSON coordinates :param coordinates: list of GeoJSON coordinates :return: """ return [[k for k, g in groupby(coordinate)] for coordinate in coordinates] def deactivate_old_forecasts(self): """ Deactivates previously added forecasts for a specific storm name. :return: `bool` of deactivation status """ query = { "script": "ctx._source.properties.active=false", "query": { "bool": { "must": [ {"match": {"properties.STORMNAME": self.storm_name}}, {"match": {"properties.active": True}}, ] } } } try: self.conn.Elasticsearch.update_by_query(index=INDEX_NAME.format( self.storm_variable), body=query) except ConflictError: LOGGER.warning("Conflict error detected. Refreshing index and " "retrying update by query.") self.conn.Elasticsearch.indices.refresh(index=INDEX_NAME.format( self.storm_variable)) self.conn.Elasticsearch.update_by_query(index=INDEX_NAME.format( self.storm_variable), body=query) return True def generate_geojson_features(self): """ Generates and yields a series of storm forecasts, one for each feature in <self.filepath>. Observations are returned as Elasticsearch bulk API upsert actions, with documents in GeoJSON to match the Elasticsearch index mappings. 
:returns: Generator of Elasticsearch actions to upsert the storm forecasts """ driver = ogr.GetDriverByName('ESRI Shapefile') filepath = str(self.filepath.resolve()) data = driver.Open(filepath, 0) lyr = data.GetLayer(0) file_datetime_str = strftime_rfc3339(self.date_) for feature in lyr: feature_json = feature.ExportToJson(as_object=True) feature_json['properties']['active'] = True feature_json['properties'][ 'filename'] = self.filepath.stem feature_json['properties'][ 'filedate'] = file_datetime_str # noqa # TODO: Remove once upstream data is patched # clean rad consecutive coordinates in geometry (temporary fix) if self.storm_variable == 'rad': feature_json['geometry'][ 'coordinates'] = self.clean_consecutive_coordinates( feature_json['geometry']['coordinates']) # format pts ADVDATE if self.storm_variable == 'pts': feature_json['properties']['ADVDATE'] = \ strftime_rfc3339( datetime.strptime( feature_json['properties']['ADVDATE'], '%y%m%d/%H%M' ) ) self.items.append(feature_json) action = { '_id': '{}-{}-{}-{}-{}'.format(self.storm_name, self.storm_variable, file_datetime_str, self.fh, feature_json['id']), '_index': INDEX_NAME.format(self.storm_variable), '_op_type': 'update', 'doc': feature_json, 'doc_as_upsert': True } yield action def load_data(self, filepath): """ loads data from event to target :returns: `bool` of status result """ self.filepath = Path(filepath) # set class variables from filename self.parse_filename() LOGGER.debug('Received file {}'.format(self.filepath)) # check for shapefile dependencies if self.check_shapefile_deps(): # deactivate old forecasts for current storm name self.deactivate_old_forecasts() # generate geojson features package = self.generate_geojson_features() self.conn.submit_elastic_package(package, request_size=80000) return True else: LOGGER.debug("All Shapefile dependencies not found. Ignoring " "file...") return False
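

# Illustrative usage sketch (not part of the original module): loading one
# storm forecast shapefile. The filename is hypothetical; it follows the
# '{date}_{fh}_{storm_name}.{storm_variable}.{extension}' pattern parsed
# above, and the sibling .shx/.dbf/.prj files must exist alongside the .shp
# or load_data() returns False.
def _example_load_hurricane(conn_config):
    loader = HurricanesRealtimeLoader(conn_config)
    # deactivates older forecasts for the storm, then upserts one document
    # per feature in the shapefile
    return loader.load_data('/data/hurricanes/20200914_18Z_teddy.pts.shp')
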
class ClimateArchiveLoader(BaseLoader): """Climat Archive Loader""" def __init__(self, db_conn_string, conn_config={}): """initializer""" super().__init__() self.conn = ElasticsearchConnector(conn_config) # setup DB connection try: self.db_conn = cx_Oracle.connect(db_conn_string) except Exception as err: msg = f'Could not connect to Oracle: {err}' LOGGER.critical(msg) raise click.ClickException(msg) self.cur = self.db_conn.cursor() def create_index(self, index): """ Creates the Elasticsearch index at path. If the index already exists, it is deleted and re-created. The mappings for the two types are also created. :param index: the index to be created. """ if index == 'stations': mapping = { "settings": {"number_of_shards": 1, "number_of_replicas": 0}, "mappings": { "_meta": {"geomfields": {"geometry": "POINT"}}, "properties": { "type": {"type": "text"}, "properties": { "properties": { "PROV_STATE_TERR_CODE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STN_ID": {"type": "integer"}, "STATION_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "ENG_PROV_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "FRE_PROV_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "COUNTRY": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "LATITUDE": {"type": "integer"}, "LONGITUDE": {"type": "integer"}, "TIMEZONE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "ELEVATION": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "CLIMATE_IDENTIFIER": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "TC_IDENTIFIER": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "WMO_IDENTIFIER": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STATION_TYPE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "NORMAL_CODE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "PUBLICATION_CODE": {"type": "integer"}, "DISPLAY_CODE": {"type": "integer"}, "ENG_STN_OPERATOR_ACRONYM": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "FRE_STN_OPERATOR_ACRONYM": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "ENG_STN_OPERATOR_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "FRE_STN_OPERATOR_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "HAS_MONTHLY_SUMMARY": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "HAS_NORMALS_DATA": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "DLY_FIRST_DATE": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, "DLY_LAST_DATE": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, "FIRST_DATE": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, "LAST_DATE": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, } }, "geometry": {"type": "geo_shape"}, }, }, } index_name = 'climate_station_information' self.conn.create(index_name, mapping, overwrite=True) if index == 'normals': mapping = { "settings": {"number_of_shards": 1, "number_of_replicas": 0}, "mappings": { "_meta": {"geomfields": {"geometry": "POINT"}}, "properties": { "type": {"type": "text"}, "properties": { "properties": { "STN_ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MONTH": {"type": "integer"}, "VALUE": {"type": "integer"}, "OCCURRENCE_COUNT": {"type": "integer"}, "PUBLICATION_CODE": {"type": "integer"}, "CLIMATE_IDENTIFIER": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "NORMAL_CODE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, 
"NORMAL_ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "PROVINCE_CODE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "E_NORMAL_ELEMENT_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "F_NORMAL_ELEMENT_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "PERIOD": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "PERIOD_BEGIN": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "PERIOD_END": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STATION_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "YEAR_COUNT_NORMAL_PERIOD": { "type": "integer" }, "MAX_DURATION_MISSING_YEARS": { "type": "integer" }, "FIRST_YEAR_NORMAL_PERIOD": { "type": "integer" }, "LAST_YEAR_NORMAL_PERIOD": {"type": "integer"}, "FIRST_YEAR": {"type": "integer"}, "LAST_YEAR": {"type": "integer"}, "TOTAL_OBS_COUNT": {"type": "integer"}, "PERCENT_OF_POSSIBLE_OBS": {"type": "integer"}, "CURRENT_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "FIRST_OCCURRENCE_DATE": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, "DATE_CALCULATED": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, } }, "geometry": {"type": "geo_shape"}, }, }, } index_name = 'climate_normals_data' self.conn.create(index_name, mapping, overwrite=True) if index == 'monthly_summary': mapping = { "settings": {"number_of_shards": 1, "number_of_replicas": 0}, "mappings": { "_meta": {"geomfields": {"geometry": "POINT"}}, "properties": { "type": {"type": "text"}, "properties": { "properties": { "CLIMATE_IDENTIFIER": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STN_ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STATION_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "PROVINCE_CODE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "LATITUDE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "LONGITUDE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MEAN_TEMPERATURE": {"type": "float"}, "NORMAL_MEAN_TEMPERATURE": {"type": "float"}, "MAX_TEMPERATURE": {"type": "float"}, "MIN_TEMPERATURE": {"type": "float"}, "TOTAL_SNOWFALL": {"type": "float"}, "NORMAL_SNOWFALL": {"type": "float"}, "TOTAL_PRECIPITATION": {"type": "float"}, "NORMAL_PRECIPITATION": {"type": "float"}, "BRIGHT_SUNSHINE": {"type": "float"}, "NORMAL_SUNSHINE": {"type": "float"}, "SNOW_ON_GROUND_LAST_DAY": {"type": "float"}, "DAYS_WITH_VALID_MIN_TEMP": { "type": "integer" }, "DAYS_WITH_VALID_MEAN_TEMP": { "type": "integer" }, "DAYS_WITH_VALID_MAX_TEMP": { "type": "integer" }, "DAYS_WITH_VALID_SNOWFALL": { "type": "integer" }, "DAYS_WITH_VALID_PRECIP": {"type": "integer"}, "DAYS_WITH_VALID_SUNSHINE": { "type": "integer" }, "DAYS_WITH_PRECIP_GE_1MM": {"type": "integer"}, "HEATING_DEGREE_DAYS": {"type": "integer"}, "COOLING_DEGREE_DAYS": {"type": "integer"}, "LOCAL_YEAR": {"type": "integer"}, "LOCAL_MONTH": {"type": "integer"}, "LAST_UPDATED": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, "LOCAL_DATE": { "type": "date", "format": "yyyy-MM", }, } }, "geometry": {"type": "geo_shape"}, }, }, } index_name = 'climate_public_climate_summary' self.conn.create(index_name, mapping, overwrite=True) if index == 'daily_summary': mapping = { "settings": {"number_of_shards": 1, "number_of_replicas": 0}, "mappings": { "_meta": {"geomfields": 
{"geometry": "POINT"}}, "properties": { "type": {"type": "text"}, "properties": { "properties": { "CLIMATE_IDENTIFIER": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STN_ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STATION_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "SOURCE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MAX_TEMPERATURE_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MIN_TEMPERATURE_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MEAN_TEMPERATURE_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "PROVINCE_CODE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MAX_REL_HUMIDITY_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MIN_REL_HUMIDITY_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "TOTAL_RAIN_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "TOTAL_SNOW_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "TOTAL_PRECIPITATION_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "SNOW_ON_GROUND_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "DIRECTION_MAX_GUST_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "SPEED_MAX_GUST_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "HEATING_DEGREE_DAYS_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "COOLING_DEGREE_DAYS_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MEAN_TEMPERATURE": {"type": "float"}, "TOTAL_RAIN": {"type": "float"}, "MAX_TEMPERATURE": {"type": "float"}, "MIN_TEMPERATURE": {"type": "float"}, "MAX_REL_HUMIDITY": {"type": "float"}, "MIN_REL_HUMIDITY": {"type": "float"}, "TOTAL_SNOW": {"type": "float"}, "SNOW_ON_GROUND": {"type": "float"}, "TOTAL_PRECIPITATION": {"type": "float"}, "DIRECTION_MAX_GUST": {"type": "float"}, "SPEED_MAX_GUST": {"type": "float"}, "HEATING_DEGREE_DAYS": {"type": "integer"}, "COOLING_DEGREE_DAYS": {"type": "integer"}, "LOCAL_YEAR": {"type": "integer"}, "LOCAL_MONTH": {"type": "integer"}, "LOCAL_DAY": {"type": "integer"}, "LOCAL_DATE": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, } }, "geometry": {"type": "geo_shape"}, }, }, } index_name = 'climate_public_daily_data' self.conn.create(index_name, mapping, overwrite=True) def generate_stations(self): """ Queries stations data from the db, and reformats data so it can be inserted into Elasticsearch. Returns a generator of dictionaries that represent upsert actions into Elasticsearch's bulk API. :param cur: oracle cursor to perform queries against. :returns: generator of bulk API upsert actions. """ try: self.cur.execute('select * from CCCS_PORTAL.STATION_INFORMATION') except Exception as err: LOGGER.error( f'Could not fetch records from oracle due to: {str(err)}.' ) for row in self.cur: insert_dict = dict(zip([x[0] for x in self.cur.description], row)) for key in insert_dict: # This is a quick fix for trailing spaces and should not be # here. Data should be fixed on db side. try: insert_dict[key] = insert_dict[key].strip() except Exception as err: LOGGER.debug( f'Could not strip value {insert_dict[key]} due to ' f'{str(err)}, skipping' ) # Transform Date fields from datetime to string. 
if 'DATE' in key: insert_dict[key] = ( str(insert_dict[key]) if insert_dict[key] is not None else insert_dict[key] ) coords = [ float(insert_dict['LONGITUDE_DECIMAL_DEGREES']), float(insert_dict['LATITUDE_DECIMAL_DEGREES']), ] del insert_dict['LONGITUDE_DECIMAL_DEGREES'] del insert_dict['LATITUDE_DECIMAL_DEGREES'] climate_identifier = insert_dict['CLIMATE_IDENTIFIER'] wrapper = { 'type': 'Feature', 'properties': insert_dict, 'geometry': {'type': 'Point', 'coordinates': coords}, } action = { '_id': climate_identifier, '_index': 'climate_station_information', '_op_type': 'update', 'doc': wrapper, 'doc_as_upsert': True, } yield action def generate_normals(self, stn_dict, normals_dict, periods_dict): """ Queries normals data from the db, and reformats data so it can be inserted into Elasticsearch. Returns a generator of dictionaries that represent upsert actions into Elasticsearch's bulk API. :param cur: oracle cursor to perform queries against. :param stn_dict: mapping of station IDs to station information. :param normals_dict: mapping of normal IDs to normals information. :param periods_dict: mapping of normal period IDs to normal period information. :returns: generator of bulk API upsert actions. """ try: self.cur.execute('select * from CCCS_PORTAL.NORMALS_DATA') except Exception as err: LOGGER.error( f'Could not fetch records from oracle due to: {str(err)}.' ) for row in self.cur: insert_dict = dict(zip([x[0] for x in self.cur.description], row)) for key in insert_dict: # Transform Date fields from datetime to string. if 'DATE' in key: insert_dict[key] = ( str(insert_dict[key]) if insert_dict[key] is not None else insert_dict[key] ) insert_dict['ID'] = '{}.{}.{}'.format( insert_dict['STN_ID'], insert_dict['NORMAL_ID'], insert_dict['MONTH'], ) if insert_dict['STN_ID'] in stn_dict: coords = stn_dict[insert_dict['STN_ID']]['coordinates'] insert_dict['STATION_NAME'] = stn_dict[insert_dict['STN_ID']][ 'STATION_NAME' ] insert_dict['PROVINCE_CODE'] = stn_dict[insert_dict['STN_ID']][ 'PROVINCE_CODE' ] insert_dict['E_NORMAL_ELEMENT_NAME'] = normals_dict[ insert_dict['NORMAL_ID'] ]['E_NORMAL_ELEMENT_NAME'] insert_dict['F_NORMAL_ELEMENT_NAME'] = normals_dict[ insert_dict['NORMAL_ID'] ]['F_NORMAL_ELEMENT_NAME'] insert_dict['PERIOD'] = normals_dict[insert_dict['NORMAL_ID']][ 'PERIOD' ] insert_dict['PERIOD_BEGIN'] = periods_dict[ insert_dict['NORMAL_PERIOD_ID'] ]['PERIOD_BEGIN'] insert_dict['PERIOD_END'] = periods_dict[ insert_dict['NORMAL_PERIOD_ID'] ]['PERIOD_END'] insert_dict['CLIMATE_IDENTIFIER'] = stn_dict[ insert_dict['STN_ID'] ]['CLIMATE_IDENTIFIER'] del insert_dict['NORMAL_PERIOD_ID'] wrapper = { 'type': 'Feature', 'properties': insert_dict, 'geometry': {'type': 'Point', 'coordinates': coords}, } action = { '_id': insert_dict['ID'], '_index': 'climate_normals_data', '_op_type': 'update', 'doc': wrapper, 'doc_as_upsert': True, } yield action else: LOGGER.error( f"Bad STN ID: {insert_dict['STN_ID']}, skipping" f" records for this station" ) def generate_monthly_data(self, stn_dict, date=None): """ Queries monthly data from the db, and reformats data so it can be inserted into Elasticsearch. Returns a generator of dictionaries that represent upsert actions into Elasticsearch's bulk API. :param cur: oracle cursor to perform queries against. :param stn_dict: mapping of station IDs to station information. :param date: date to start fetching data from. :returns: generator of bulk API upsert actions. 
""" if not date: try: self.cur.execute( 'select * from CCCS_PORTAL.PUBLIC_CLIMATE_SUMMARY' ) except Exception as err: LOGGER.error( f'Could not fetch records from oracle due to: {str(err)}.' ) else: try: self.cur.execute( ( f"select * from CCCS_PORTAL.PUBLIC_CLIMATE_SUMMARY " f"WHERE LAST_UPDATED > TO_TIMESTAMP(" f"'{date} 00:00:00', 'YYYY-MM-DD HH24:MI:SS')" ) ) except Exception as err: LOGGER.error( f'Could not fetch records from oracle due to: {str(err)}.' ) for row in self.cur: insert_dict = dict(zip([x[0] for x in self.cur.description], row)) # Transform Date fields from datetime to string. insert_dict['LAST_UPDATED'] = ( str(insert_dict['LAST_UPDATED']) if insert_dict['LAST_UPDATED'] is not None else insert_dict['LAST_UPDATED'] ) insert_dict['ID'] = '{}.{}.{}'.format( insert_dict['STN_ID'], insert_dict['LOCAL_YEAR'], insert_dict['LOCAL_MONTH'], ) if insert_dict['STN_ID'] in stn_dict: coords = stn_dict[insert_dict['STN_ID']]['coordinates'] insert_dict['PROVINCE_CODE'] = stn_dict[insert_dict['STN_ID']][ 'PROVINCE_CODE' ] wrapper = { 'type': 'Feature', 'properties': insert_dict, 'geometry': {'type': 'Point', 'coordinates': coords}, } action = { '_id': insert_dict['ID'], '_index': 'climate_public_climate_summary', '_op_type': 'update', 'doc': wrapper, 'doc_as_upsert': True, } yield action else: LOGGER.error( f"Bad STN ID: {insert_dict['STN_ID']}, skipping" f" records for this station" ) def generate_daily_data(self, stn_dict, date=None): """ Queries daily data from the db, and reformats data so it can be inserted into Elasticsearch. Returns a generator of dictionaries that represent upsert actions into Elasticsearch's bulk API. :param cur: oracle cursor to perform queries against. :param stn_dict: mapping of station IDs to station information. :param date: date to start fetching data from. :returns: generator of bulk API upsert actions. """ for station in stn_dict: if not date: try: self.cur.execute( f'select * from CCCS_PORTAL.PUBLIC_DAILY_DATA ' f'where STN_ID={station}' ) except Exception as err: LOGGER.error( f'Could not fetch records from oracle due to:' f' {str(err)}.' ) else: try: self.cur.execute( ( f"select * from CCCS_PORTAL.PUBLIC_DAILY_DATA " f"where STN_ID={station} and " f"LOCAL_DATE > TO_TIMESTAMP('{date} 00:00:00', " f"'YYYY-MM-DD HH24:MI:SS')" ) ) except Exception as err: LOGGER.error( f'Could not fetch records from oracle due to:' f' {str(err)}.' ) for row in self.cur: insert_dict = dict( zip([x[0] for x in self.cur.description], row) ) # Transform Date fields from datetime to string. 
insert_dict['LOCAL_DATE'] = ( str(insert_dict['LOCAL_DATE']) if insert_dict['LOCAL_DATE'] is not None else insert_dict['LOCAL_DATE'] ) insert_dict['ID'] = '{}.{}.{}.{}'.format( insert_dict['CLIMATE_IDENTIFIER'], insert_dict['LOCAL_YEAR'], insert_dict['LOCAL_MONTH'], insert_dict['LOCAL_DAY'], ) if insert_dict['STN_ID'] in stn_dict: coords = stn_dict[insert_dict['STN_ID']]['coordinates'] insert_dict['PROVINCE_CODE'] = stn_dict[ insert_dict['STN_ID'] ]['PROVINCE_CODE'] insert_dict['STATION_NAME'] = stn_dict[ insert_dict['STN_ID'] ]['STATION_NAME'] wrapper = { 'type': 'Feature', 'properties': insert_dict, 'geometry': {'type': 'Point', 'coordinates': coords}, } action = { '_id': insert_dict['ID'], '_index': 'climate_public_daily_data', '_op_type': 'update', 'doc': wrapper, 'doc_as_upsert': True, } yield action else: LOGGER.error( f"Bad STN ID: {insert_dict['STN_ID']}, skipping" f" records for this station" ) def get_station_data(self, station, starting_from): """ Creates a mapping of station ID to station coordinates and province name. :param cur: oracle cursor to perform queries against. :returns: A dictionary of dictionaries containing station coordinates and province name keyed by station ID. """ stn_dict = collections.OrderedDict() try: if station: if starting_from: self.cur.execute( ( f'select STN_ID, LONGITUDE_DECIMAL_DEGREES, ' f'LATITUDE_DECIMAL_DEGREES, ENG_PROV_NAME, ' f'FRE_PROV_NAME, PROV_STATE_TERR_CODE, ' f'STATION_NAME, CLIMATE_IDENTIFIER ' f'from CCCS_PORTAL.STATION_INFORMATION ' f'where STN_ID >= {station} ' f'order by STN_ID' ) ) else: self.cur.execute( ( f'select STN_ID, LONGITUDE_DECIMAL_DEGREES, ' f'LATITUDE_DECIMAL_DEGREES, ENG_PROV_NAME, ' f'FRE_PROV_NAME, PROV_STATE_TERR_CODE, ' f'STATION_NAME, CLIMATE_IDENTIFIER ' f'from CCCS_PORTAL.STATION_INFORMATION ' f'where STN_ID = {station} ' f'order by STN_ID' ) ) else: self.cur.execute( ( 'select STN_ID, LONGITUDE_DECIMAL_DEGREES, ' 'LATITUDE_DECIMAL_DEGREES, ENG_PROV_NAME, ' 'FRE_PROV_NAME, PROV_STATE_TERR_CODE, ' 'STATION_NAME, CLIMATE_IDENTIFIER ' 'from CCCS_PORTAL.STATION_INFORMATION ' 'order by STN_ID' ) ) except Exception as err: LOGGER.error( f'Could not fetch records from oracle due to: {str(err)}.' ) for row in self.cur: stn_dict[row[0]] = { 'coordinates': [row[1], row[2]], 'ENG_PROV_NAME': row[3], 'FRE_PROV_NAME': row[4], 'PROVINCE_CODE': row[5].strip(), # remove the strip 'STATION_NAME': row[6], 'CLIMATE_IDENTIFIER': row[7].strip(), } return stn_dict def get_normals_data(self): """ Creates a mapping of normal ID to pub_name and period. :param cur: oracle cursor to perform queries against. :returns: A dictionary of dictionaries containing pub_name and period keyed by normal ID. """ normals_dict = {} try: self.cur.execute( ( 'select NORMAL_ID, E_NORMAL_ELEMENT_NAME, ' 'F_NORMAL_ELEMENT_NAME, PERIOD ' 'from CCCS_PORTAL.VALID_NORMALS_ELEMENTS' ) ) except Exception as err: LOGGER.error( f'Could not fetch records from oracle due to: {str(err)}.' ) for row in self.cur: normals_dict[row[0]] = { 'E_NORMAL_ELEMENT_NAME': row[1], 'F_NORMAL_ELEMENT_NAME': row[2], 'PERIOD': row[3], } return normals_dict def get_normals_periods(self): """ Creates a mapping of normal period ID to period begin and end. :param cur: oracle cursor to perform queries against. :returns: A dictionary of dictionaries containing period begin and end keyed by normal period ID. 
""" period_dict = {} try: self.cur.execute( ( 'select NORMAL_PERIOD_ID, PERIOD_BEGIN, PERIOD_END ' 'from CCCS_PORTAL.NORMAL_PERIODS' ) ) except Exception as err: LOGGER.error( f'Could not fetch records from oracle due to: {str(err)}.' ) for row in self.cur: period_dict[row[0]] = { 'PERIOD_BEGIN': row[1], 'PERIOD_END': row[2], } return period_dict
class LtceLoader(BaseLoader): """LTCE data loader""" def __init__(self, db_string=None, conn_config={}): """initializer""" BaseLoader.__init__(self) self.conn = ElasticsearchConnector(conn_config) self.db_conn = None # setup DB connection if db_string is not None: try: self.db_conn = cx_Oracle.connect(db_string) self.cur = self.db_conn.cursor() except Exception as err: msg = 'Could not connect to Oracle: {}'.format(err) LOGGER.critical(msg) raise click.ClickException(msg) else: LOGGER.debug("No DB connection string passed. Indexing disabled.") self.db_conn = self.cur = None for item in MAPPINGS: SETTINGS['mappings']['properties']['properties'][ 'properties' ] = MAPPINGS[item] self.conn.create(INDEX_NAME.format(item), SETTINGS) def get_stations_info(self, element_name, station_id): """ Queries LTCE station data for a given element name (DAILY MINIMUM TEMPERATURE, DAILY MAXIMUM TEMPERATURE, etc.), and virtual station ID. Returns the ealiest start date of all returned stations and the end date climate identifier, and coordinates of the most recently threaded station. :param element_name: `str` of element name :param station_id: `str` of virtual climate station id :return: `dict` of stations information """ query = { "query": { "bool": { "filter": { "bool": { "must": [ { "term": { "properties.VIRTUAL_CLIMATE_ID.raw": station_id # noqa } }, { "term": { "properties.ELEMENT_NAME_E.raw": element_name # noqa } }, ] } } } } } results = self.conn.Elasticsearch.search( body=query, index='ltce_stations', _source=[ 'properties.CLIMATE_IDENTIFIER', 'properties.ENG_STN_NAME', 'properties.FRE_STN_NAME', 'properties.START_DATE', 'properties.END_DATE', 'properties.PROVINCE_CODE', 'geometry.coordinates', ], ) results = [result['_source'] for result in results['hits']['hits']] oldest_station = None most_recent_station = None for index, station in enumerate(results): # retrieve station start and end date dates = ( station['properties']['START_DATE'], station['properties']['END_DATE'], ) # convert station dates to datetime objects ( station['properties']['START_DATE'], station['properties']['END_DATE'], ) = (start_date, end_date) = [ datetime.strptime(date, DATETIME_RFC3339_FMT) if date is not None else None for date in dates ] # assign first station as oldest and most recent if index == 0: oldest_station = station most_recent_station = station continue # then compare all remaining stations and replace as necessary if start_date < oldest_station['properties']['START_DATE']: oldest_station = station if most_recent_station['properties']['END_DATE'] is not None and ( end_date is None or end_date > most_recent_station['properties']['END_DATE'] ): most_recent_station = station stations_info = { 'record_begin': strftime_rfc3339( oldest_station['properties']['START_DATE'] ), 'record_end': strftime_rfc3339( most_recent_station['properties']['END_DATE'] ) if most_recent_station['properties']['END_DATE'] else None, 'climate_identifier': most_recent_station['properties'][ 'CLIMATE_IDENTIFIER' ], 'eng_stn_name': most_recent_station['properties']['ENG_STN_NAME'], 'fre_stn_name': most_recent_station['properties']['FRE_STN_NAME'], 'coords': [ most_recent_station['geometry']['coordinates'][0], most_recent_station['geometry']['coordinates'][1], ], 'province_code': most_recent_station['properties'][ 'PROVINCE_CODE' ], } return stations_info def generate_stations(self): """ Queries stations data from the db, and reformats data so it can be inserted into Elasticsearch. 
Returns a generator of dictionaries that represent upsert actions into Elasticsearch's bulk API. :param cur: oracle cursor to perform queries against. :returns: generator of bulk API upsert actions. """ try: self.cur.execute( ( "SELECT ARKEON2DWH.VIRTUAL_STATION_INFO_F_MVW.*," "ARKEON2DWH.STATION_INFORMATION.ENG_STN_NAME," "ARKEON2DWH.STATION_INFORMATION.FRE_STN_NAME," "ARKEON2DWH.WXO_CITY_INFORMATION_MVW.LAT," "ARKEON2DWH.WXO_CITY_INFORMATION_MVW.LON," "ARKEON2DWH.WXO_CITY_INFORMATION_MVW.PROVINCECODE " "FROM ARKEON2DWH.VIRTUAL_STATION_INFO_F_MVW " "LEFT JOIN ARKEON2DWH.STATION_INFORMATION " "ON ARKEON2DWH.VIRTUAL_STATION_INFO_F_MVW.STN_ID = " "ARKEON2DWH.STATION_INFORMATION.STN_ID " "LEFT JOIN ARKEON2DWH.WXO_CITY_INFORMATION_MVW " "ON ARKEON2DWH.VIRTUAL_STATION_INFO_F_MVW.WXO_CITY_CODE = " "ARKEON2DWH.WXO_CITY_INFORMATION_MVW.CITYCODE " "WHERE " "ARKEON2DWH.VIRTUAL_STATION_INFO_F_MVW.ELEMENT_NAME_E IN " "('DAILY MINIMUM TEMPERATURE', 'DAILY MAXIMUM TEMPERATURE'," # noqa "'DAILY TOTAL PRECIPITATION', 'DAILY TOTAL SNOWFALL')" ) ) except Exception as err: LOGGER.error( 'Could not fetch records from oracle due to: {}.'.format( str(err) ) ) for row in self.cur: insert_dict = dict(zip([x[0] for x in self.cur.description], row)) for key in insert_dict: if key in ['START_DATE', 'END_DATE']: insert_dict[key] = ( strftime_rfc3339(insert_dict[key]) if insert_dict[key] is not None else insert_dict[key] ) es_id = slugify( '{}-{}-{}-{}-{}'.format( insert_dict['VIRTUAL_CLIMATE_ID'], insert_dict["ELEMENT_NAME_E"], insert_dict["CLIMATE_IDENTIFIER"], insert_dict["START_DATE"], insert_dict["END_DATE"], ) ) coords = [ float(insert_dict['LON']), float(insert_dict['LAT']), ] # rename PROVINCECODE field to PROVINCE_CODE insert_dict['PROVINCE_CODE'] = insert_dict['PROVINCECODE'] # cleanup unwanted fields retained from SQL join fields_to_delete = [ 'STN_ID', 'ENG_PROV_NAME', 'FRE_PROV_NAME', 'REGION_CODE', 'CRITERIA', 'NOTES', 'VIRTUAL_STN_INFO_UPDATE_ID', 'CURRENT_FLAG', 'LON', 'LAT', 'PROVINCECODE', ] for field in fields_to_delete: insert_dict.pop(field) # set properties.IDENTIFIER insert_dict['IDENTIFIER'] = es_id wrapper = { 'id': es_id, 'type': 'Feature', 'properties': insert_dict, 'geometry': {'type': 'Point', 'coordinates': coords}, } action = { '_id': es_id, '_index': 'ltce_stations', '_op_type': 'update', 'doc': wrapper, 'doc_as_upsert': True, } yield action def generate_daily_temp_extremes(self): """ Queries stations data from the db, and reformats data so it can be inserted into Elasticsearch. Returns a generator of dictionaries that represent upsert actions into Elasticsearch's bulk API. :param cur: oracle cursor to perform queries against. :returns: generator of bulk API upsert actions. 
""" try: self.cur.execute( ( "SELECT t1.*, t2.*, t3.*, t4.*, t5.*, t6.*, t7.*, t8.* " "FROM ARKEON2DWH.RECORD_HIGH_VIRTUAL_MAX_TEMP t1 " "JOIN ARKEON2DWH.RECORD_LOW_VIRTUAL_MAX_TEMP t2 " "ON t1.VIRTUAL_CLIMATE_ID = t2.VIRTUAL_CLIMATE_ID " "AND t1.LOCAL_MONTH = t2.LOCAL_MONTH " "AND t1.LOCAL_DAY = t2.LOCAL_DAY " "JOIN ARKEON2DWH.RECORD_LOW_VIRTUAL_MIN_TEMP t3 " "ON t1.VIRTUAL_CLIMATE_ID = t3.VIRTUAL_CLIMATE_ID " "AND t1.LOCAL_MONTH = t3.LOCAL_MONTH " "AND t1.LOCAL_DAY = t3.LOCAL_DAY " "JOIN ARKEON2DWH.RECORD_HIGH_VIRTUAL_MIN_TEMP t4 " "ON t1.VIRTUAL_CLIMATE_ID = t4.VIRTUAL_CLIMATE_ID " "AND t1.LOCAL_MONTH = t4.LOCAL_MONTH " "AND t1.LOCAL_DAY = t4.LOCAL_DAY " "JOIN ARKEON2DWH.EXTREME_HIGH_VIRTUAL_MAX_TEMP t5 " "ON t1.VIRTUAL_CLIMATE_ID = t5.VIRTUAL_CLIMATE_ID " "AND t1.LOCAL_MONTH = t5.LOCAL_MONTH " "AND t1.LOCAL_DAY = t5.LOCAL_DAY " "JOIN ARKEON2DWH.EXTREME_LOW_VIRTUAL_MAX_TEMP t6 " "ON t1.VIRTUAL_CLIMATE_ID = t6.VIRTUAL_CLIMATE_ID " "AND t1.LOCAL_MONTH = t6.LOCAL_MONTH " "AND t1.LOCAL_DAY = t6.LOCAL_DAY " "JOIN ARKEON2DWH.EXTREME_HIGH_VIRTUAL_MIN_TEMP t7 " "ON t1.VIRTUAL_CLIMATE_ID = t7.VIRTUAL_CLIMATE_ID " "AND t1.LOCAL_MONTH = t7.LOCAL_MONTH " "AND t1.LOCAL_DAY = t7.LOCAL_DAY " "JOIN ARKEON2DWH.EXTREME_LOW_VIRTUAL_MIN_TEMP t8 " "ON t1.VIRTUAL_CLIMATE_ID = t8.VIRTUAL_CLIMATE_ID " "AND t1.LOCAL_MONTH = t8.LOCAL_MONTH " "AND t1.LOCAL_DAY = t8.LOCAL_DAY " ) ) except Exception as err: LOGGER.error( 'Could not fetch records from oracle due to: {}.'.format( str(err) ) ) # dictionnary to store stations information once retrieved stations_dict = {} for row in self.cur: insert_dict = dict(zip([x[0] for x in self.cur.description], row)) for key in insert_dict: if key in ['LAST_UPDATED']: insert_dict[key] = ( strftime_rfc3339(insert_dict[key]) if insert_dict[key] is not None else insert_dict[key] ) virtual_climate_id = insert_dict['VIRTUAL_CLIMATE_ID'] es_id = '{}-{}-{}'.format( insert_dict['VIRTUAL_CLIMATE_ID'], insert_dict["LOCAL_MONTH"], insert_dict["LOCAL_DAY"], ) # check if we have station IDs record begin and end. If not # retrieve the information and store in stations_dict if virtual_climate_id not in stations_dict: stations_dict[virtual_climate_id] = {} stations_dict[virtual_climate_id][ 'MIN' ] = self.get_stations_info( 'DAILY MINIMUM TEMPERATURE', virtual_climate_id ) stations_dict[virtual_climate_id][ 'MAX' ] = self.get_stations_info( 'DAILY MAXIMUM TEMPERATURE', virtual_climate_id ) # check if TEMEPERATURE MIN/MAX for most recent threaded station # have same climate identifier value min_climate_identifier = stations_dict[virtual_climate_id]['MIN'][ 'climate_identifier' ] max_climate_identifier = stations_dict[virtual_climate_id]['MAX'][ 'climate_identifier' ] if min_climate_identifier == max_climate_identifier: insert_dict['CLIMATE_IDENTIFIER'] = stations_dict[ virtual_climate_id ]['MAX']['climate_identifier'] insert_dict['ENG_STN_NAME'] = stations_dict[ virtual_climate_id ]['MAX']['eng_stn_name'] insert_dict['FRE_STN_NAME'] = stations_dict[ virtual_climate_id ]['MAX']['fre_stn_name'] insert_dict['PROVINCE_CODE'] = stations_dict[ virtual_climate_id ]['MAX']['province_code'] else: LOGGER.error( f'Currently threaded station climate identifier value ' f'does not match between DAILY MINIMUM TEMPERATURE' f'({min_climate_identifier}) and DAILY MAXIMUM ' f'TEMPERATURE({max_climate_identifier}) station threads ' f'for virtual climate ID {virtual_climate_id}.' 
) continue # set new fields for level in ['MIN', 'MAX']: # set new insert_dict keys insert_dict[ '{}_TEMP_RECORD_BEGIN'.format(level) ] = stations_dict[virtual_climate_id][level]['record_begin'] insert_dict[ '{}_TEMP_RECORD_END'.format(level) ] = stations_dict[virtual_climate_id][level]['record_end'] # cleanup unwanted fields retained from SQL join fields_to_delete = [ 'LOCAL_TIME', 'VIRTUAL_MEAS_DISPLAY_CODE', 'ENG_STN_NAME', 'FRE_STN_NAME', 'CLIMATE_IDENTIFIER', ] for field in fields_to_delete: insert_dict.pop(field) # set properties.IDENTIFIER insert_dict['IDENTIFIER'] = es_id wrapper = { 'id': es_id, 'type': 'Feature', 'properties': insert_dict, 'geometry': { 'type': 'Point', 'coordinates': stations_dict[virtual_climate_id]['MAX'][ 'coords' ], }, } action = { '_id': es_id, '_index': 'ltce_temp_extremes', '_op_type': 'update', 'doc': wrapper, 'doc_as_upsert': True, } yield action def generate_daily_precip_extremes(self): """ Queries stations data from the db, and reformats data so it can be inserted into Elasticsearch. Returns a generator of dictionaries that represent upsert actions into Elasticsearch's bulk API. :param cur: oracle cursor to perform queries against. :returns: generator of bulk API upsert actions. """ try: self.cur.execute( ( "SELECT t1.*, t2.* " "FROM ARKEON2DWH.RECORD_VIRTUAL_PRECIPITATION t1 " "JOIN ARKEON2DWH.EXTREME_VIRTUAL_PRECIPITATION t2 " "ON t1.VIRTUAL_CLIMATE_ID = t2.VIRTUAL_CLIMATE_ID " "AND t1.LOCAL_MONTH = t2.LOCAL_MONTH " "AND t1.LOCAL_DAY = t2.LOCAL_DAY " ) ) except Exception as err: LOGGER.error( 'Could not fetch records from oracle due to: {}.'.format( str(err) ) ) stations_dict = {} for row in self.cur: insert_dict = dict(zip([x[0] for x in self.cur.description], row)) for key in insert_dict: if key in ['LAST_UPDATED']: insert_dict[key] = ( strftime_rfc3339(insert_dict[key]) if insert_dict[key] is not None else insert_dict[key] ) virtual_climate_id = insert_dict['VIRTUAL_CLIMATE_ID'] es_id = '{}-{}-{}'.format( insert_dict['VIRTUAL_CLIMATE_ID'], insert_dict["LOCAL_MONTH"], insert_dict["LOCAL_DAY"], ) # check if we have station IDs record begin and end if not retrieve if virtual_climate_id not in stations_dict: stations_dict[virtual_climate_id] = self.get_stations_info( 'DAILY TOTAL PRECIPITATION', virtual_climate_id ) insert_dict['RECORD_BEGIN'] = stations_dict[virtual_climate_id][ 'record_begin' ] insert_dict['RECORD_END'] = stations_dict[virtual_climate_id][ 'record_end' ] insert_dict['CLIMATE_IDENTIFIER'] = stations_dict[ virtual_climate_id ]['climate_identifier'] insert_dict['ENG_STN_NAME'] = stations_dict[virtual_climate_id][ 'eng_stn_name' ] insert_dict['FRE_STN_NAME'] = stations_dict[virtual_climate_id][ 'fre_stn_name' ] insert_dict['PROVINCE_CODE'] = stations_dict[virtual_climate_id][ 'province_code' ] # cleanup unwanted fields retained from SQL join fields_to_delete = [ 'LOCAL_TIME', 'VIRTUAL_MEAS_DISPLAY_CODE', 'ENG_STN_NAME', 'FRE_STN_NAME', 'CLIMATE_IDENTIFIER', 'LAST_UPDATED', ] for field in fields_to_delete: insert_dict.pop(field) # set properties.IDENTIFIER insert_dict['IDENTIFIER'] = es_id wrapper = { 'id': es_id, 'type': 'Feature', 'properties': insert_dict, 'geometry': { 'type': 'Point', 'coordinates': stations_dict[virtual_climate_id]['coords'], }, } action = { '_id': es_id, '_index': 'ltce_precip_extremes', '_op_type': 'update', 'doc': wrapper, 'doc_as_upsert': True, } yield action def generate_daily_snow_extremes(self): """ Queries stations data from the db, and reformats data so it can be inserted into Elasticsearch. 
Returns a generator of dictionaries that represent upsert actions into Elasticsearch's bulk API. :param cur: oracle cursor to perform queries against. :returns: generator of bulk API upsert actions. """ try: self.cur.execute( ( "SELECT t1.*, t2.* " "FROM ARKEON2DWH.RECORD_VIRTUAL_SNOWFALL t1 " "JOIN ARKEON2DWH.EXTREME_VIRTUAL_SNOWFALL t2 " "ON t1.VIRTUAL_CLIMATE_ID = t2.VIRTUAL_CLIMATE_ID " "AND t1.LOCAL_MONTH = t2.LOCAL_MONTH " "AND t1.LOCAL_DAY = t2.LOCAL_DAY " ) ) except Exception as err: LOGGER.error( 'Could not fetch records from oracle due to: {}.'.format( str(err) ) ) stations_dict = {} for row in self.cur: insert_dict = dict(zip([x[0] for x in self.cur.description], row)) for key in insert_dict: if key in ['LAST_UPDATED']: insert_dict[key] = ( strftime_rfc3339(insert_dict[key]) if insert_dict[key] is not None else insert_dict[key] ) virtual_climate_id = insert_dict['VIRTUAL_CLIMATE_ID'] es_id = '{}-{}-{}'.format( insert_dict['VIRTUAL_CLIMATE_ID'], insert_dict["LOCAL_MONTH"], insert_dict["LOCAL_DAY"], ) # check if we have station IDs record begin and end if not retrieve if virtual_climate_id not in stations_dict: stations_dict[virtual_climate_id] = self.get_stations_info( 'DAILY TOTAL SNOWFALL', virtual_climate_id ) insert_dict['RECORD_BEGIN'] = stations_dict[virtual_climate_id][ 'record_begin' ] insert_dict['RECORD_END'] = stations_dict[virtual_climate_id][ 'record_end' ] insert_dict['CLIMATE_IDENTIFIER'] = stations_dict[ virtual_climate_id ]['climate_identifier'] insert_dict['ENG_STN_NAME'] = stations_dict[virtual_climate_id][ 'eng_stn_name' ] insert_dict['FRE_STN_NAME'] = stations_dict[virtual_climate_id][ 'fre_stn_name' ] insert_dict['PROVINCE_CODE'] = stations_dict[virtual_climate_id][ 'province_code' ] # cleanup unwanted fields retained from SQL join fields_to_delete = [ 'LOCAL_TIME', 'VIRTUAL_MEAS_DISPLAY_CODE', 'ENG_STN_NAME', 'FRE_STN_NAME', 'CLIMATE_IDENTIFIER', 'LAST_UPDATED', ] for field in fields_to_delete: insert_dict.pop(field) # set properties.IDENTIFIER insert_dict['IDENTIFIER'] = es_id wrapper = { 'id': es_id, 'type': 'Feature', 'properties': insert_dict, 'geometry': { 'type': 'Point', 'coordinates': stations_dict[virtual_climate_id]['coords'], }, } action = { '_id': es_id, '_index': 'ltce_snow_extremes', '_op_type': 'update', 'doc': wrapper, 'doc_as_upsert': True, } yield action
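# The start/end-date comparison in get_stations_info() above can be read in
# isolation. The sketch below mirrors that selection logic against two
# hypothetical, already-parsed station records (identifiers and dates are
# made up for illustration).
from datetime import datetime


def select_stations(stations):
    """Return (oldest_station, most_recent_station) for a station thread."""
    oldest = most_recent = stations[0]
    for station in stations[1:]:
        # the earliest START_DATE supplies the thread's record_begin
        if station['START_DATE'] < oldest['START_DATE']:
            oldest = station
        # an open-ended station (END_DATE of None) or a later END_DATE
        # becomes the most recently threaded station (record_end)
        if most_recent['END_DATE'] is not None and (
            station['END_DATE'] is None
            or station['END_DATE'] > most_recent['END_DATE']
        ):
            most_recent = station
    return oldest, most_recent


stations = [
    {'CLIMATE_IDENTIFIER': '5051290',
     'START_DATE': datetime(1893, 1, 1), 'END_DATE': datetime(1938, 12, 31)},
    {'CLIMATE_IDENTIFIER': '5051294',
     'START_DATE': datetime(1939, 1, 1), 'END_DATE': None},
]
oldest, most_recent = select_stations(stations)
assert oldest['CLIMATE_IDENTIFIER'] == '5051290'
assert most_recent['CLIMATE_IDENTIFIER'] == '5051294'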
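# The generate_*() methods above only yield bulk actions; a driver has to
# submit them. The sketch below is one minimal way to do that with the
# elasticsearch-py bulk helper, reusing the loader's own client. This wiring
# is an assumption for illustration; the actual CLI may submit actions
# differently.
from elasticsearch import helpers


def load_ltce(db_string, conn_config={}):
    """Index LTCE stations first, then the daily extreme tables."""
    loader = LtceLoader(db_string, conn_config)
    es = loader.conn.Elasticsearch

    # stations must be searchable before the extremes generators run,
    # since get_stations_info() looks them up in the ltce_stations index
    helpers.bulk(es, loader.generate_stations(), raise_on_error=False)
    es.indices.refresh(index='ltce_stations')

    for generator in (
        loader.generate_daily_temp_extremes(),
        loader.generate_daily_precip_extremes(),
        loader.generate_daily_snow_extremes(),
    ):
        ok, errors = helpers.bulk(es, generator, raise_on_error=False)
        LOGGER.info('Indexed {} documents ({} errors)'.format(ok, len(errors)))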
class AhccdLoader(BaseLoader): """AHCCD Loader""" def __init__(self, conn_config={}): """initializer""" super().__init__() self.conn = ElasticsearchConnector(conn_config) def create_index(self, index): """ Creates the Elasticsearch index at self.conn. If the index already exists, it is deleted and re-created. The mappings for the two types are also created. :param index: Identifier for the index to be created. """ if index == 'annual': mapping = { "settings": {"number_of_shards": 1, "number_of_replicas": 0}, "mappings": { "_meta": {"geomfields": {"geometry": "POINT"}}, "properties": { "type": {"type": "text"}, "properties": { "properties": { "identifier__identifiant": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "lat__lat": {"type": "float"}, "lon__long": {"type": "float"}, "province__province": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "period_group__groupe_periode": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "period_value__valeur_periode": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "pressure_sea_level__pression_niveau_mer": { "type": "float" }, "pressure_sea_level_units__pression_niveau_mer_unite": { # noqa "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "pressure_station__pression_station": { "type": "float" }, "pressure_station_units__pression_station_unites": { # noqa "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "rain__pluie": {"type": "float"}, "rain_units__pluie_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "snow__neige": {"type": "float"}, "snow_units__neige_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "station_id__id_station": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "temp_max__temp_max": {"type": "float"}, "temp_max_units__temp_max_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "temp_mean__temp_moyenne": {"type": "float"}, "temp_mean_units__temp_moyenne_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "temp_min__temp_min": {"type": "float"}, "temp_min_units__temp_min_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "total_precip__precip_totale": { "type": "float" }, "total_precip_units__precip_totale_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "wind_speed__vitesse_vent": {"type": "float"}, "wind_speed_units__vitesse_vent_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "year__annee": {"type": "integer"}, } }, "geometry": {"type": "geo_shape"}, }, }, } self.conn.create('ahccd_annual', mapping=mapping, overwrite=True) if index == 'monthly': mapping = { "settings": {"number_of_shards": 1, "number_of_replicas": 0}, "mappings": { "_meta": {"geomfields": {"geometry": "POINT"}}, "properties": { "type": {"type": "text"}, "properties": { "properties": { "date": { "type": "date", "format": "yyyy-MM||yyyy", }, "identifier__identifiant": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "lat__lat": {"type": "float"}, "lon__long": {"type": "float"}, "province__province": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "period_group__groupe_periode": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "period_value__valeur_periode": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "pressure_sea_level__pression_niveau_mer": { "type": "float" }, "pressure_sea_level_units__pression_niveau_mer_unite": { # noqa "type": "text", "fields": {"raw": {"type": "keyword"}}, }, 
"pressure_station__pression_station": { "type": "float" }, "pressure_station_units__pression_station_unites": { # noqa "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "rain__pluie": {"type": "float"}, "rain_units__pluie_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "snow__neige": {"type": "float"}, "snow_units__neige_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "station_id__id_station": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "temp_max__temp_max": {"type": "float"}, "temp_max_units__temp_max_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "temp_mean__temp_moyenne": {"type": "float"}, "temp_mean_units__temp_moyenne_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "temp_min__temp_min": {"type": "float"}, "temp_min_units__temp_min_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "total_precip__precip_totale": { "type": "float" }, "total_precip_units__precip_totale_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "wind_speed__vitesse_vent": {"type": "float"}, "wind_speed_units__vitesse_vent_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "year__annee": {"type": "integer"}, } }, "geometry": {"type": "geo_shape"}, }, }, } self.conn.create('ahccd_monthly', mapping=mapping, overwrite=True) if index == 'seasonal': mapping = { "settings": {"number_of_shards": 1, "number_of_replicas": 0}, "mappings": { "_meta": {"geomfields": {"geometry": "POINT"}}, "properties": { "type": {"type": "text"}, "properties": { "properties": { "identifier__identifiant": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "lat__lat": {"type": "float"}, "lon__long": {"type": "float"}, "province__province": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "period_group__groupe_periode": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "period_value__valeur_periode": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "pressure_sea_level__pression_niveau_mer": { "type": "float" }, "pressure_sea_level_units__pression_niveau_mer_unite": { # noqa "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "pressure_station__pression_station": { "type": "float" }, "pressure_station_units__pression_station_unites": { # noqa "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "rain__pluie": {"type": "float"}, "rain_units__pluie_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "snow__neige": {"type": "float"}, "snow_units__neige_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "station_id__id_station": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "temp_max__temp_max": {"type": "float"}, "temp_max_units__temp_max_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "temp_mean__temp_moyenne": {"type": "float"}, "temp_mean_units__temp_moyenne_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "temp_min__temp_min": {"type": "float"}, "temp_min_units__temp_min_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "total_precip__precip_totale": { "type": "float" }, "total_precip_units__precip_totale_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "wind_speed__vitesse_vent": {"type": "float"}, "wind_speed_units__vitesse_vent_unites": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "year__annee": {"type": "integer"}, } }, "geometry": {"type": "geo_shape"}, }, }, } 
self.conn.create('ahccd_seasonal', mapping=mapping, overwrite=True) if index == 'stations': mapping = { "settings": {"number_of_shards": 1, "number_of_replicas": 0}, "mappings": { "_meta": {"geomfields": {"geometry": "POINT"}}, "properties": { "type": {"type": "text"}, "properties": { "properties": { "identifier__identifiant": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "station_id__id_station": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "station_name__nom_station": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "measurement_type__type_mesure": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "period__periode": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "trend_value__valeur_tendance": { "type": "float" }, "elevation__elevation": {"type": "float"}, "province__province": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "joined__rejoint": {"type": "integer"}, "year_range__annees": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, } }, "geometry": {"type": "geo_shape"}, }, }, } self.conn.create('ahccd_stations', mapping=mapping, overwrite=True) if index == 'trends': mapping = { "settings": {"number_of_shards": 1, "number_of_replicas": 0}, "mappings": { "_meta": {"geomfields": {"geometry": "POINT"}}, "properties": { "type": {"type": "text"}, "properties": { "properties": { "identifier__identifiant": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "station_id__id_station": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "station_name__nom_station": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "measurement_type__type_mesure": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "period__periode": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "trend_value__valeur_tendance": { "type": "float" }, "elevation__elevation": {"type": "float"}, "province__province": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "joined__rejoint": {"type": "integer"}, "year_range__annees": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, } }, "geometry": {"type": "geo_shape"}, }, }, } self.conn.create('ahccd_trends', mapping=mapping, overwrite=True) def generate_docs(self, fp, index): """ Reads AHCCD and CMIP5 data from file(s) at fp and reformats them so they can be inserted into Elasticsearch. Returns a generator of dictionaries that represent upsert actions into Elasticsearch's bulk API. :param fp: the location of the raw data file(s) to load. :param index: name of index to load. :returns: generator of bulk API upsert actions. 
""" if index not in [ 'stations', 'monthly', 'annual', 'seasonal', 'trends', ]: LOGGER.error('Unrecognized AHCCD data type {}'.format(index)) return try: with open(fp, 'r') as f: json_source = f.read() contents = json.loads(json_source) except Exception as err: LOGGER.error(f'Could not open JSON file due to: {str(err)}.') return for record in contents['features']: if index == 'annual': index_name = 'ahccd_annual' elif index == 'seasonal': index_name = 'ahccd_seasonal' elif index == 'stations': index_name = 'ahccd_stations' stn_id = record['properties']['station_id__id_station'] record['properties']['identifier__identifiant'] = stn_id elif index == 'monthly': index_name = 'ahccd_monthly' record['properties']['date'] = '{}-{}'.format( record['properties']['identifier__identifiant'].split('.')[ 1 ], record['properties']['identifier__identifiant'].split('.')[ 2 ], ) del record['properties']['year__annee'] elif index == 'trends': index_name = 'ahccd_trends' identifier = '{}.{}.{}'.format( record['properties']['station_id__id_station'], record['properties']['period__periode'], record['properties']['measurement_type__type_mesure'], ) record['properties']['identifier__identifiant'] = identifier action = { '_id': record['properties']['identifier__identifiant'], '_index': index_name, '_op_type': 'update', 'doc': record, 'doc_as_upsert': True, } yield action