def __init__(self, db_string=None, conn_config=None):
    """
    Initialize the loader: connect to Elasticsearch, optionally connect
    to Oracle, and create one index per entry in MAPPINGS.

    :param db_string: Oracle connection string; when `None`, DB-backed
                      indexing is disabled
    :param conn_config: `dict` of Elasticsearch connection configuration
    """
    BaseLoader.__init__(self)

    # avoid a shared mutable default argument; {} preserves the
    # original default behavior
    if conn_config is None:
        conn_config = {}

    self.conn = ElasticsearchConnector(conn_config)
    self.db_conn = None

    # setup DB connection
    if db_string is not None:
        try:
            self.db_conn = cx_Oracle.connect(db_string)
            self.cur = self.db_conn.cursor()
        except Exception as err:
            msg = 'Could not connect to Oracle: {}'.format(err)
            LOGGER.critical(msg)
            raise click.ClickException(msg)
    else:
        LOGGER.debug("No DB connection string passed. Indexing disabled.")
        self.db_conn = self.cur = None

    for item in MAPPINGS:
        # NOTE: SETTINGS is a shared module-level dict mutated in place
        # before each index is created
        SETTINGS['mappings']['properties']['properties'][
            'properties'
        ] = MAPPINGS[item]
        self.conn.create(INDEX_NAME.format(item), SETTINGS)
def delete_index(ctx, es, username, password, ignore_certs):
    """Delete cap-alerts realtime index"""
    # build the ES connection from the CLI options, then drop the index
    config = configure_es_connection(es, username, password, ignore_certs)
    ElasticsearchConnector(config).delete(INDEX_NAME)
def __init__(self, conn_config=None):
    """
    Initialize the loader: connect to Elasticsearch and ensure the
    index exists.

    :param conn_config: `dict` of Elasticsearch connection configuration
    """
    BaseLoader.__init__(self)

    # avoid a shared mutable default argument; {} preserves the
    # original default behavior
    if conn_config is None:
        conn_config = {}

    self.conn = ElasticsearchConnector(conn_config)
    self.conn.create(INDEX_NAME, mapping=SETTINGS)
def __init__(self, conn_config=None):
    """
    Initialize the loader: connect to Elasticsearch and create the
    index template.

    :param conn_config: `dict` of Elasticsearch connection configuration
    """
    BaseLoader.__init__(self)

    # avoid a shared mutable default argument; {} preserves the
    # original default behavior
    if conn_config is None:
        conn_config = {}

    # NOTE(review): TLS certificate verification is disabled here —
    # confirm this is intended for this deployment
    self.conn = ElasticsearchConnector(conn_config, verify_certs=False)
    self.items = []
    self.conn.create_template(INDEX_BASENAME, SETTINGS)
def __init__(self, filepath, conn_config=None):
    """
    Initialize the bulletins loader: connect to Elasticsearch and
    create the index template.

    :param filepath: filepath (not used at init time; kept for
                     interface compatibility — TODO confirm)
    :param conn_config: `dict` of Elasticsearch connection configuration
    """
    BaseLoader.__init__(self)

    # avoid a shared mutable default argument; {} preserves the
    # original default behavior
    if conn_config is None:
        conn_config = {}

    self.DD_URL = 'https://dd.weather.gc.ca/bulletins/alphanumeric'
    self.conn = ElasticsearchConnector(conn_config)
    self.conn.create_template(INDEX_BASENAME, SETTINGS)
def __init__(self, conn_config=None):
    """
    Initialize the loader: connect to Elasticsearch, create the index
    template and load the stations list.

    :param conn_config: `dict` of Elasticsearch connection configuration
    """
    BaseLoader.__init__(self)

    # avoid a shared mutable default argument; {} preserves the
    # original default behavior
    if conn_config is None:
        conn_config = {}

    self.conn = ElasticsearchConnector(conn_config)
    self.conn.create_template(INDEX_BASENAME, SETTINGS)

    # station metadata cache populated by read_stations_list()
    self.stations = {}
    self.read_stations_list()
def delete_indexes(ctx, es, username, password, ignore_certs):
    """Delete all SWOB realtime indexes"""
    config = configure_es_connection(es, username, password, ignore_certs)
    connector = ElasticsearchConnector(config)

    # wildcard pattern matching every SWOB realtime index
    pattern = '{}*'.format(INDEX_BASENAME)
    click.echo('Deleting indexes {}'.format(pattern))
    connector.delete(pattern)
    click.echo('Done')
def clean_indexes(ctx, days, es, username, password, ignore_certs):
    """Clean bulletins indexes older than n number of days"""
    config = configure_es_connection(es, username, password, ignore_certs)
    connector = ElasticsearchConnector(config)

    # fetch every bulletins index, then keep only those old enough
    found = connector.get('{}*'.format(INDEX_BASENAME))
    if found:
        stale = check_es_indexes_to_delete(found, days)
        if stale:
            click.echo('Deleting indexes {}'.format(stale))
            connector.delete(','.join(stale))

    click.echo('Done')
def __init__(self, db_conn_string, conn_config=None):
    """
    Initialize the loader: connect to Elasticsearch and to Oracle.

    :param db_conn_string: Oracle connection string
    :param conn_config: `dict` of Elasticsearch connection configuration

    :raises click.ClickException: when the Oracle connection fails
    """
    super().__init__()

    # avoid a shared mutable default argument; {} preserves the
    # original default behavior
    if conn_config is None:
        conn_config = {}

    self.conn = ElasticsearchConnector(conn_config)

    # setup DB connection
    try:
        self.db_conn = cx_Oracle.connect(db_conn_string)
    except Exception as err:
        msg = f'Could not connect to Oracle: {err}'
        LOGGER.critical(msg)
        raise click.ClickException(msg)

    self.cur = self.db_conn.cursor()
def __init__(self, conn_config=None):
    """
    Initialize the loader: connect to Elasticsearch and create the
    forecast polygon indices.

    :param conn_config: `dict` of Elasticsearch connection configuration
    """
    BaseLoader.__init__(self)

    # avoid a shared mutable default argument; {} preserves the
    # original default behavior
    if conn_config is None:
        conn_config = {}

    self.conn = ElasticsearchConnector(conn_config)
    self.filepath = None
    self.version = None
    self.zone = None
    self.items = []

    # create forecast polygon indices if they don't exist
    for index in INDICES:
        # index names encode the zone as the third '_'-separated token
        zone = index.split('_')[2]
        # NOTE: SETTINGS is a shared module-level dict mutated in place
        # before each index is created
        SETTINGS['mappings']['properties']['properties'][
            'properties'] = FILE_PROPERTIES[zone]
        self.conn.create(index, SETTINGS)
def __init__(self, conn_config=None):
    """
    Initialize the loader: connect to Elasticsearch and create one
    index per storm variable.

    :param conn_config: `dict` of Elasticsearch connection configuration
    """
    BaseLoader.__init__(self)

    # avoid a shared mutable default argument; {} preserves the
    # original default behavior
    if conn_config is None:
        conn_config = {}

    self.conn = ElasticsearchConnector(conn_config)
    self.filepath = None
    self.date_ = None
    self.fh = None
    self.storm_name = None
    self.storm_variable = None
    self.items = []

    # create storm variable indices if they don't exist
    for item in FILE_PROPERTIES:
        # NOTE: SETTINGS is a shared module-level dict mutated in place
        # before each index is created
        SETTINGS['mappings']['properties']['properties'][
            'properties'] = FILE_PROPERTIES[item]
        self.conn.create(INDEX_NAME.format(item), SETTINGS)
def __init__(self, conn_config=None):
    """
    Initialize the loader: connect to Elasticsearch and create the
    marine weather indices.

    :param conn_config: `dict` of Elasticsearch connection configuration
    """
    BaseLoader.__init__(self)

    # avoid a shared mutable default argument; {} preserves the
    # original default behavior
    if conn_config is None:
        conn_config = {}

    self.conn = ElasticsearchConnector(conn_config)
    self.filepath = None
    self.region_name_code = None
    self.language = None
    self.root = None
    self.area = {}
    self.items = []

    # create marine weather indices if they don't exist
    for item in MAPPINGS:
        # NOTE: SETTINGS is a shared module-level dict mutated in place
        # before each index is created
        SETTINGS['mappings']['properties']['properties'][
            'properties'] = MAPPINGS[item]
        self.conn.create(INDEX_NAME.format(item), SETTINGS)
def clean_indexes(ctx, days, dataset, es, username, password, ignore_certs):
    """Delete old AQHI realtime indexes older than n days"""
    conn_config = configure_es_connection(es, username, password, ignore_certs)
    conn = ElasticsearchConnector(conn_config)

    if dataset == 'all':
        indexes_to_fetch = '{}*'.format(INDEX_BASENAME.format('*'))
    else:
        indexes_to_fetch = '{}*'.format(INDEX_BASENAME.format(dataset))

    indexes = conn.get(indexes_to_fetch)

    if indexes:
        indexes_to_delete = check_es_indexes_to_delete(indexes, days)
        if indexes_to_delete:
            click.echo('Deleting indexes {}'.format(indexes_to_delete))
            # BUG FIX: previously this joined `indexes` (every fetched
            # index), deleting indexes younger than `days` as well; only
            # the ones selected by check_es_indexes_to_delete() may go
            conn.delete(','.join(indexes_to_delete))

    click.echo('Done')
def delete_index(ctx, index_name, es, username, password, ignore_certs):
    """
    Delete a particular ES index with a given name as argument or all if no
    argument is passed
    """
    config = configure_es_connection(es, username, password, ignore_certs)
    connector = ElasticsearchConnector(config)

    if index_name:
        # abort=True raises click.Abort when the user refuses
        prompt = 'Are you sure you want to delete ES index named: {}?'.format(
            click.style(index_name, fg='red'))
        if click.confirm(prompt, abort=True):
            LOGGER.info('Deleting ES index {}'.format(index_name))
            connector.delete(index_name)
            return True
    else:
        prompt = (
            'Are you sure you want to delete {} marine forecast'
            ' indices ({})?'.format(
                click.style('ALL', fg='red'),
                click.style(", ".join(INDICES), fg='red'),
            )
        )
        if click.confirm(prompt, abort=True):
            connector.delete(",".join(INDICES))
            return True
def clean_records(ctx, days, es, username, password, ignore_certs):
    """Delete old citypageweather documents"""
    config = configure_es_connection(es, username, password, ignore_certs)
    connector = ElasticsearchConnector(config)

    # cutoff timestamp: anything at or before this moment is removed
    cutoff = (datetime.now() - timedelta(days=days)).strftime(
        '%Y-%m-%d %H:%M')

    click.echo('Deleting documents older than {} ({} days)'.format(
        cutoff, days))

    query = {'query': {'range': {'properties.datetime': {'lte': cutoff}}}}
    connector.Elasticsearch.delete_by_query(index=INDEX_NAME, body=query)
def __init__(self, conn_config=None):
    """
    Initialize the AQHI loader: connect to Elasticsearch and create the
    forecasts and observations index templates.

    :param conn_config: `dict` of Elasticsearch connection configuration
    """
    BaseLoader.__init__(self)

    # avoid a shared mutable default argument; {} preserves the
    # original default behavior
    if conn_config is None:
        conn_config = {}

    self.conn = ElasticsearchConnector(conn_config)
    self.filepath = None
    self.type = None
    self.region = None
    self.date_ = None
    self.items = []

    # only create index templates with forecasts and observations mappings
    template_mappings = {
        k: MAPPINGS[k] for k in ('forecasts', 'observations')
    }

    for aqhi_type in template_mappings:
        template_name = INDEX_BASENAME.format(aqhi_type)
        # NOTE: SETTINGS is a shared module-level dict mutated in place
        # before each template is created
        SETTINGS['index_patterns'] = ['{}*'.format(template_name)]
        SETTINGS['mappings'] = MAPPINGS[aqhi_type]
        self.conn.create_template(template_name, SETTINGS)
def deactivate(ctx, days, es, username, password, ignore_certs):
    """deactivate hurricane forecasts older than N days"""
    config = configure_es_connection(es, username, password, ignore_certs)
    connector = ElasticsearchConnector(config)

    # same update-by-query body for every index: flip `active` to false
    # on documents whose filedate is at least `days` days old
    query = {
        "script": "ctx._source.properties.active=false",
        "query": {
            "range": {
                "properties.filedate": {
                    "lte": "now-{}d".format(days)
                }
            }
        }
    }

    for index in INDICES:
        connector.Elasticsearch.update_by_query(index=index, body=query)

    return True
def delete_indexes(ctx, dataset, es, username, password, ignore_certs,
                   index_template):
    """Delete all AQHI realtime indexes"""
    config = configure_es_connection(es, username, password, ignore_certs)
    connector = ElasticsearchConnector(config)

    # either every AQHI index, or only the requested dataset's indexes
    if dataset == 'all':
        pattern = 'aqhi_*'
    else:
        pattern = '{}*'.format(INDEX_BASENAME.format(dataset))

    click.echo('Deleting indexes {}'.format(pattern))
    connector.delete(pattern)

    if index_template:
        click.echo('Deleting index template {}'.format(INDEX_BASENAME))
        connector.delete_template(INDEX_BASENAME)

    click.echo('Done')
class CapAlertsRealtimeLoader(BaseLoader):
    """Cap Alerts real-time loader"""

    def __init__(self, conn_config=None):
        """
        Initialize the loader: connect to Elasticsearch, ensure the
        cap-alerts index exists and prepare the alert references list.

        :param conn_config: `dict` of Elasticsearch connection
                            configuration
        """
        BaseLoader.__init__(self)

        # avoid a shared mutable default argument; {} preserves the
        # original default behavior
        if conn_config is None:
            conn_config = {}

        self.conn = ElasticsearchConnector(conn_config)
        self.conn.create(INDEX_NAME, mapping=SETTINGS)

        # identifiers of alerts referenced by the alerts processed by
        # this loader (used to delete superseded documents)
        self.references_arr = []

    def load_data(self, filepath):
        """
        Load a CAP alert file into Elasticsearch via the bulk API.

        :param filepath: filepath for parsing the current condition file

        :returns: `bool` — True on successful bulk indexing,
                  False otherwise
        """
        data = self.weather_warning2geojson(filepath)

        try:
            self.bulk_data = []
            for doc in data:
                op_dict = {
                    'index': {
                        '_index': INDEX_NAME,
                        # NOTE(review): '_type' is deprecated in newer
                        # Elasticsearch versions — confirm target version
                        '_type': '_doc'
                    }
                }
                op_dict['index']['_id'] = doc['properties']['identifier']
                self.bulk_data.append(op_dict)
                self.bulk_data.append(doc)
            r = self.conn.Elasticsearch.bulk(
                index=INDEX_NAME, body=self.bulk_data
            )
            LOGGER.debug('Result: {}'.format(r))
            previous_alerts = self.delete_references_alerts()
            click.echo('done importing in ES')
            if previous_alerts:
                LOGGER.debug('Deleted old warning')
            else:
                LOGGER.debug('New warning, no deletion')
            return True
        except Exception as err:
            LOGGER.warning('Error bulk indexing: {}'.format(err))
            return False

    def delete_references_alerts(self):
        """
        Delete alert documents referenced by the newly loaded alerts.

        :returns: `bool` — True when a deletion was issued, False when
                  there was nothing to delete
        """
        if self.references_arr and len(self.references_arr) != 0:
            click.echo('Deleting old alerts')
            query = {
                'query': {
                    'terms': {
                        'properties.reference': self.references_arr
                    }
                }
            }
            self.conn.Elasticsearch.delete_by_query(
                index=INDEX_NAME, body=query
            )
            return True
        else:
            return False

    def weather_warning2geojson(self, filepath):
        """
        Create GeoJSON features used to display weather alerts.

        :param filepath: filepath to the cap-xml file

        :returns: `list` of GeoJSON features (only when the English and
                  French alert counts match)
        """
        # working state for this file
        now = datetime.utcnow()
        french_alert = {}
        english_alert = {}
        english_alert_remove = []

        timeformat = '%Y-%m-%dT%H:%M:%SZ'

        LOGGER.info('Processing {} CAP documents'.format(len(filepath)))
        LOGGER.debug('Processing {}'.format(filepath))

        # parse the XML file with lxml
        try:
            tree = etree.parse(filepath)
        except Exception as err:
            # NOTE(review): `tree` stays unbound on parse failure, so
            # tree.getroot() below raises NameError — confirm intended
            LOGGER.warning('Cannot parse {}: {}'.format(filepath, err))

        url = 'https://dd.weather.gc.ca/alerts/{}'.\
            format(filepath.split('alerts')[1])

        root = tree.getroot()

        # CAP 1.2 XML namespace prefix used for every element lookup
        b_xml = '{urn:oasis:names:tc:emergency:cap:1.2}'

        identifier = _get_element(root, '{}identifier'.format(b_xml))

        references = _get_element(root, '{}references'.format(b_xml))
        if references:
            # each space-separated reference is a comma-separated triple;
            # the second field is the referenced alert's identifier
            for ref in references.split(' '):
                self.references_arr.append(ref.split(',')[1])

        for grandchild in root.iter('{}info'.format(b_xml)):
            expires = _get_date_format(
                _get_element(grandchild, '{}expires'.format(b_xml))
            ).strftime(timeformat)

            status_alert = _get_element(grandchild,
                                        '{}parameter[last()-4]/'
                                        '{}value'.format(b_xml, b_xml))

            # only keep alerts that have not expired yet
            if _get_date_format(expires) > now:
                language = _get_element(grandchild,
                                        '{}language'.format(b_xml))
                if language == 'fr-CA':
                    headline = _get_element(grandchild,
                                            '{}headline'.format(b_xml))

                    description_fr = '{}description'.format(b_xml)
                    descript = _get_element(grandchild, description_fr)
                    descript = descript.replace("\n", " ").strip()

                    for i in grandchild.iter('{}area'.format(b_xml)):
                        tag = _get_element(i, '{}polygon'.format(b_xml))
                        name = _get_element(i, '{}areaDesc'.format(b_xml))

                        for j in grandchild.iter('{}geocode'.format(b_xml)):
                            str_value_name = '{}valueName'.format(b_xml)
                            valueName = _get_element(j, str_value_name)

                            if valueName == 'layer:EC-MSC-SMC:1.0:CLC':
                                geocode_value = '{}value'.format(b_xml)
                                geocode = _get_element(j, geocode_value)
                                id_warning = '{}_{}'.format(identifier,
                                                            geocode)

                                if id_warning not in french_alert:
                                    french_alert[id_warning] = (id_warning,
                                                                name,
                                                                headline,
                                                                descript)
                else:
                    headline = _get_element(grandchild,
                                            '{}headline'.format(b_xml))

                    description = '{}description'.format(b_xml)
                    descript = _get_element(grandchild, description)
                    descript = descript.replace("\n", " ").strip()

                    effective_date = \
                        _get_element(grandchild,
                                     '{}effective'.format(b_xml))
                    effective = _get_date_format(effective_date)
                    effective = effective.strftime(timeformat)

                    warning = _get_element(grandchild,
                                           '{}parameter[1]/'
                                           '{}value'.format(b_xml, b_xml))

                    # There can be many <area> covered by one <info>
                    # so we have to loop through the info
                    for i in grandchild.iter('{}area'.format(b_xml)):
                        tag = _get_element(i, '{}polygon'.format(b_xml))
                        name = _get_element(i, '{}areaDesc'.format(b_xml))

                        for j in grandchild.iter('{}geocode'.format(b_xml)):
                            valueName = \
                                _get_element(j, '{}valueName'.format(b_xml))

                            if valueName == 'layer:EC-MSC-SMC:1.0:CLC':
                                geocode = \
                                    _get_element(j, '{}value'.format(b_xml))
                                split_tag = re.split(' |,', tag)

                                id_warning = '{}_{}'.format(identifier,
                                                            geocode)

                                if id_warning not in english_alert:
                                    english_alert[id_warning] = (split_tag,
                                                                 name,
                                                                 headline,
                                                                 effective,
                                                                 expires,
                                                                 warning,
                                                                 status_alert,
                                                                 id_warning,
                                                                 descript,
                                                                 url)

        LOGGER.info('Done processing')

        # collect English alerts that have already expired
        for j in english_alert:
            if _get_date_format(english_alert[j][4]) < now:
                english_alert_remove.append(j)

        # We can't remove an element of a dictionary while looping in it,
        # so we remove the warnings in another step.
        # NOTE(review): assumes every removed English key also exists in
        # french_alert, otherwise KeyError — confirm
        for key in english_alert_remove:
            del english_alert[key]
            del french_alert[key]

        # To keep going we want to have the same number of warnings
        # in English and in French
        if len(french_alert) == len(english_alert):
            LOGGER.info('Creating %d features', len(english_alert))

            data = []
            for num_poly in english_alert:
                poly = []
                # the polygon tokens are consumed two at a time, swapping
                # each pair into [x, y, 0.0] — presumably "lat lon" input
                # reordered to GeoJSON lon/lat; TODO confirm
                for el in list(reversed(range(
                        0, len(english_alert[num_poly][0]), 2))):
                    if len(english_alert[num_poly][0]) > 1:
                        poly.append([
                            float(english_alert[num_poly][0][el + 1]),
                            float(english_alert[num_poly][0][el]),
                            0.0
                        ])

                # for temporary care of the duplicate neighbors coordinate
                # poly = [k for k, g in groupby(poly)]
                no_dup_poly = []
                for k in poly:
                    if k not in no_dup_poly:
                        no_dup_poly.append(k)
                # re-append the original last vertex (presumably to close
                # the ring) — TODO confirm; IndexError if poly is empty
                no_dup_poly.append(poly[-1])

                id_ = english_alert[num_poly][7]

                AlertLocation = {
                    'type': "Feature",
                    'properties': {
                        'identifier': id_,
                        'area': english_alert[num_poly][1],
                        'reference': identifier,
                        'zone': french_alert[num_poly][1],
                        'headline': english_alert[num_poly][2],
                        'titre': french_alert[num_poly][2],
                        'descrip_en': english_alert[num_poly][8],
                        'descrip_fr': french_alert[num_poly][3],
                        'effective': english_alert[num_poly][3],
                        'expires': english_alert[num_poly][4],
                        'alert_type': english_alert[num_poly][5],
                        'status': english_alert[num_poly][6],
                        'references': self.references_arr,
                        'url': english_alert[num_poly][9]
                    },
                    'geometry': {
                        'type': "Polygon",
                        'coordinates': [no_dup_poly]
                    }
                }

                data.append(AlertLocation)

            return data
class BulletinsRealtimeLoader(BaseLoader):
    """Bulletins real-time loader"""

    def __init__(self, filepath, conn_config=None):
        """
        Initialize the loader: connect to Elasticsearch and create the
        bulletins index template.

        :param filepath: filepath (not used at init time; kept for
                         interface compatibility — TODO confirm)
        :param conn_config: `dict` of Elasticsearch connection
                            configuration
        """
        BaseLoader.__init__(self)

        # avoid a shared mutable default argument; {} preserves the
        # original default behavior
        if conn_config is None:
            conn_config = {}

        self.DD_URL = 'https://dd.weather.gc.ca/bulletins/alphanumeric'
        self.conn = ElasticsearchConnector(conn_config)
        self.conn.create_template(INDEX_BASENAME, SETTINGS)

    def load_data(self, filepath):
        """
        loads data from event to target

        :param filepath: filepath to data on disk

        :returns: `bool` of status result
        """
        LOGGER.debug(filepath)

        data = self.bulletin2dict(filepath)

        # bulletins go into a daily index named after the bulletin date
        b_dt = datetime.strptime(data['properties']['datetime'],
                                 '%Y-%m-%dT%H:%M')
        b_dt2 = b_dt.strftime('%Y-%m-%d')
        es_index = '{}{}'.format(INDEX_BASENAME, b_dt2)

        try:
            r = self.conn.Elasticsearch.index(index=es_index,
                                              id=data['id'],
                                              body=data)
            LOGGER.debug('Result: {}'.format(r))
            return True
        except Exception as err:
            LOGGER.warning('Error indexing: {}'.format(err))
            return False

    def bulletin2dict(self, filepath):
        """
        convert a bulletin into a GeoJSON object

        :param filepath: path to filename

        :returns: `dict` of GeoJSON

        :raises RuntimeError: when `filepath` contains no
                              '/alphanumeric/' segment
        """
        dict_ = {
            'type': 'Feature',
            'geometry': None,
            'properties': {}
        }

        try:
            bulletin_path = filepath.split('/alphanumeric/')[1]
        except IndexError as err:
            LOGGER.warning('no bulletin path: {}'.format(err))
            raise RuntimeError(err)

        identifier = bulletin_path.replace('/', '.')

        issuer_name = None
        issuer_country = None

        dict_['id'] = dict_['properties']['identifier'] = identifier

        # path tokens used below: [0]=YYYYMMDD, [1]=type,
        # [2]=issuer code, [3]=HH, [-1]=filename
        tokens = bulletin_path.split('/')

        yyyymmdd = tokens[0]
        hh = tokens[3]
        filename = tokens[-1]

        yyyy = yyyymmdd[0:4]
        mm = yyyymmdd[4:6]
        dd = yyyymmdd[6:8]
        # minutes are the last two characters of the third '_' token
        min_ = filename.split('_')[2][-2:]

        datetime_ = '{}-{}-{}T{}:{}'.format(yyyy, mm, dd, hh, min_)

        # TODO: use real coordinates
        dict_['properties']['datetime'] = datetime_
        dict_['properties']['type'] = tokens[1]
        dict_['properties']['issuer_code'] = tokens[2]
        dict_['properties']['issuer_name'] = issuer_name
        dict_['properties']['issuer_country'] = issuer_country
        dict_['properties']['issuing_office'] = tokens[2][2:]
        dict_['properties']['url'] = '{}/{}'.format(self.DD_URL,
                                                    bulletin_path)

        return dict_
class ClimateArchiveLoader(BaseLoader): """Climat Archive Loader""" def __init__(self, db_conn_string, conn_config={}): """initializer""" super().__init__() self.conn = ElasticsearchConnector(conn_config) # setup DB connection try: self.db_conn = cx_Oracle.connect(db_conn_string) except Exception as err: msg = f'Could not connect to Oracle: {err}' LOGGER.critical(msg) raise click.ClickException(msg) self.cur = self.db_conn.cursor() def create_index(self, index): """ Creates the Elasticsearch index at path. If the index already exists, it is deleted and re-created. The mappings for the two types are also created. :param index: the index to be created. """ if index == 'stations': mapping = { "settings": {"number_of_shards": 1, "number_of_replicas": 0}, "mappings": { "_meta": {"geomfields": {"geometry": "POINT"}}, "properties": { "type": {"type": "text"}, "properties": { "properties": { "PROV_STATE_TERR_CODE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STN_ID": {"type": "integer"}, "STATION_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "ENG_PROV_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "FRE_PROV_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "COUNTRY": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "LATITUDE": {"type": "integer"}, "LONGITUDE": {"type": "integer"}, "TIMEZONE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "ELEVATION": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "CLIMATE_IDENTIFIER": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "TC_IDENTIFIER": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "WMO_IDENTIFIER": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STATION_TYPE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "NORMAL_CODE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "PUBLICATION_CODE": {"type": "integer"}, "DISPLAY_CODE": {"type": 
"integer"}, "ENG_STN_OPERATOR_ACRONYM": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "FRE_STN_OPERATOR_ACRONYM": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "ENG_STN_OPERATOR_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "FRE_STN_OPERATOR_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "HAS_MONTHLY_SUMMARY": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "HAS_NORMALS_DATA": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "DLY_FIRST_DATE": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, "DLY_LAST_DATE": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, "FIRST_DATE": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, "LAST_DATE": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, } }, "geometry": {"type": "geo_shape"}, }, }, } index_name = 'climate_station_information' self.conn.create(index_name, mapping, overwrite=True) if index == 'normals': mapping = { "settings": {"number_of_shards": 1, "number_of_replicas": 0}, "mappings": { "_meta": {"geomfields": {"geometry": "POINT"}}, "properties": { "type": {"type": "text"}, "properties": { "properties": { "STN_ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MONTH": {"type": "integer"}, "VALUE": {"type": "integer"}, "OCCURRENCE_COUNT": {"type": "integer"}, "PUBLICATION_CODE": {"type": "integer"}, "CLIMATE_IDENTIFIER": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "NORMAL_CODE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "NORMAL_ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "PROVINCE_CODE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "E_NORMAL_ELEMENT_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "F_NORMAL_ELEMENT_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "PERIOD": { "type": "text", "fields": {"raw": 
{"type": "keyword"}}, }, "PERIOD_BEGIN": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "PERIOD_END": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STATION_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "YEAR_COUNT_NORMAL_PERIOD": { "type": "integer" }, "MAX_DURATION_MISSING_YEARS": { "type": "integer" }, "FIRST_YEAR_NORMAL_PERIOD": { "type": "integer" }, "LAST_YEAR_NORMAL_PERIOD": {"type": "integer"}, "FIRST_YEAR": {"type": "integer"}, "LAST_YEAR": {"type": "integer"}, "TOTAL_OBS_COUNT": {"type": "integer"}, "PERCENT_OF_POSSIBLE_OBS": {"type": "integer"}, "CURRENT_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "FIRST_OCCURRENCE_DATE": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, "DATE_CALCULATED": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, } }, "geometry": {"type": "geo_shape"}, }, }, } index_name = 'climate_normals_data' self.conn.create(index_name, mapping, overwrite=True) if index == 'monthly_summary': mapping = { "settings": {"number_of_shards": 1, "number_of_replicas": 0}, "mappings": { "_meta": {"geomfields": {"geometry": "POINT"}}, "properties": { "type": {"type": "text"}, "properties": { "properties": { "CLIMATE_IDENTIFIER": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STN_ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STATION_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "PROVINCE_CODE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "LATITUDE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "LONGITUDE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MEAN_TEMPERATURE": {"type": "float"}, "NORMAL_MEAN_TEMPERATURE": {"type": "float"}, "MAX_TEMPERATURE": {"type": "float"}, "MIN_TEMPERATURE": {"type": "float"}, "TOTAL_SNOWFALL": {"type": "float"}, "NORMAL_SNOWFALL": {"type": "float"}, 
"TOTAL_PRECIPITATION": {"type": "float"}, "NORMAL_PRECIPITATION": {"type": "float"}, "BRIGHT_SUNSHINE": {"type": "float"}, "NORMAL_SUNSHINE": {"type": "float"}, "SNOW_ON_GROUND_LAST_DAY": {"type": "float"}, "DAYS_WITH_VALID_MIN_TEMP": { "type": "integer" }, "DAYS_WITH_VALID_MEAN_TEMP": { "type": "integer" }, "DAYS_WITH_VALID_MAX_TEMP": { "type": "integer" }, "DAYS_WITH_VALID_SNOWFALL": { "type": "integer" }, "DAYS_WITH_VALID_PRECIP": {"type": "integer"}, "DAYS_WITH_VALID_SUNSHINE": { "type": "integer" }, "DAYS_WITH_PRECIP_GE_1MM": {"type": "integer"}, "HEATING_DEGREE_DAYS": {"type": "integer"}, "COOLING_DEGREE_DAYS": {"type": "integer"}, "LOCAL_YEAR": {"type": "integer"}, "LOCAL_MONTH": {"type": "integer"}, "LAST_UPDATED": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, "LOCAL_DATE": { "type": "date", "format": "yyyy-MM", }, } }, "geometry": {"type": "geo_shape"}, }, }, } index_name = 'climate_public_climate_summary' self.conn.create(index_name, mapping, overwrite=True) if index == 'daily_summary': mapping = { "settings": {"number_of_shards": 1, "number_of_replicas": 0}, "mappings": { "_meta": {"geomfields": {"geometry": "POINT"}}, "properties": { "type": {"type": "text"}, "properties": { "properties": { "CLIMATE_IDENTIFIER": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STN_ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "STATION_NAME": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "SOURCE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "ID": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MAX_TEMPERATURE_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MIN_TEMPERATURE_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MEAN_TEMPERATURE_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "PROVINCE_CODE": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MAX_REL_HUMIDITY_FLAG": { "type": "text", "fields": {"raw": 
{"type": "keyword"}}, }, "MIN_REL_HUMIDITY_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "TOTAL_RAIN_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "TOTAL_SNOW_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "TOTAL_PRECIPITATION_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "SNOW_ON_GROUND_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "DIRECTION_MAX_GUST_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "SPEED_MAX_GUST_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "HEATING_DEGREE_DAYS_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "COOLING_DEGREE_DAYS_FLAG": { "type": "text", "fields": {"raw": {"type": "keyword"}}, }, "MEAN_TEMPERATURE": {"type": "float"}, "TOTAL_RAIN": {"type": "float"}, "MAX_TEMPERATURE": {"type": "float"}, "MIN_TEMPERATURE": {"type": "float"}, "MAX_REL_HUMIDITY": {"type": "float"}, "MIN_REL_HUMIDITY": {"type": "float"}, "TOTAL_SNOW": {"type": "float"}, "SNOW_ON_GROUND": {"type": "float"}, "TOTAL_PRECIPITATION": {"type": "float"}, "DIRECTION_MAX_GUST": {"type": "float"}, "SPEED_MAX_GUST": {"type": "float"}, "HEATING_DEGREE_DAYS": {"type": "integer"}, "COOLING_DEGREE_DAYS": {"type": "integer"}, "LOCAL_YEAR": {"type": "integer"}, "LOCAL_MONTH": {"type": "integer"}, "LOCAL_DAY": {"type": "integer"}, "LOCAL_DATE": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss", }, } }, "geometry": {"type": "geo_shape"}, }, }, } index_name = 'climate_public_daily_data' self.conn.create(index_name, mapping, overwrite=True) def generate_stations(self): """ Queries stations data from the db, and reformats data so it can be inserted into Elasticsearch. Returns a generator of dictionaries that represent upsert actions into Elasticsearch's bulk API. :param cur: oracle cursor to perform queries against. :returns: generator of bulk API upsert actions. 
""" try: self.cur.execute('select * from CCCS_PORTAL.STATION_INFORMATION') except Exception as err: LOGGER.error( f'Could not fetch records from oracle due to: {str(err)}.' ) for row in self.cur: insert_dict = dict(zip([x[0] for x in self.cur.description], row)) for key in insert_dict: # This is a quick fix for trailing spaces and should not be # here. Data should be fixed on db side. try: insert_dict[key] = insert_dict[key].strip() except Exception as err: LOGGER.debug( f'Could not strip value {insert_dict[key]} due to ' f'{str(err)}, skipping' ) # Transform Date fields from datetime to string. if 'DATE' in key: insert_dict[key] = ( str(insert_dict[key]) if insert_dict[key] is not None else insert_dict[key] ) coords = [ float(insert_dict['LONGITUDE_DECIMAL_DEGREES']), float(insert_dict['LATITUDE_DECIMAL_DEGREES']), ] del insert_dict['LONGITUDE_DECIMAL_DEGREES'] del insert_dict['LATITUDE_DECIMAL_DEGREES'] climate_identifier = insert_dict['CLIMATE_IDENTIFIER'] wrapper = { 'type': 'Feature', 'properties': insert_dict, 'geometry': {'type': 'Point', 'coordinates': coords}, } action = { '_id': climate_identifier, '_index': 'climate_station_information', '_op_type': 'update', 'doc': wrapper, 'doc_as_upsert': True, } yield action def generate_normals(self, stn_dict, normals_dict, periods_dict): """ Queries normals data from the db, and reformats data so it can be inserted into Elasticsearch. Returns a generator of dictionaries that represent upsert actions into Elasticsearch's bulk API. :param cur: oracle cursor to perform queries against. :param stn_dict: mapping of station IDs to station information. :param normals_dict: mapping of normal IDs to normals information. :param periods_dict: mapping of normal period IDs to normal period information. :returns: generator of bulk API upsert actions. """ try: self.cur.execute('select * from CCCS_PORTAL.NORMALS_DATA') except Exception as err: LOGGER.error( f'Could not fetch records from oracle due to: {str(err)}.' 
                )

        # Stream rows for the current cursor result set and emit one
        # bulk-API upsert action per normals record.
        for row in self.cur:
            insert_dict = dict(zip([x[0] for x in self.cur.description], row))
            for key in insert_dict:
                # Transform Date fields from datetime to string.
                if 'DATE' in key:
                    insert_dict[key] = (
                        str(insert_dict[key])
                        if insert_dict[key] is not None
                        else insert_dict[key]
                    )
            # Document id: station + normal element + month.
            insert_dict['ID'] = '{}.{}.{}'.format(
                insert_dict['STN_ID'],
                insert_dict['NORMAL_ID'],
                insert_dict['MONTH'],
            )
            if insert_dict['STN_ID'] in stn_dict:
                # Denormalize station, element-name and period metadata
                # into the record before wrapping it as GeoJSON.
                coords = stn_dict[insert_dict['STN_ID']]['coordinates']
                insert_dict['STATION_NAME'] = stn_dict[insert_dict['STN_ID']][
                    'STATION_NAME'
                ]
                insert_dict['PROVINCE_CODE'] = stn_dict[insert_dict['STN_ID']][
                    'PROVINCE_CODE'
                ]
                insert_dict['E_NORMAL_ELEMENT_NAME'] = normals_dict[
                    insert_dict['NORMAL_ID']
                ]['E_NORMAL_ELEMENT_NAME']
                insert_dict['F_NORMAL_ELEMENT_NAME'] = normals_dict[
                    insert_dict['NORMAL_ID']
                ]['F_NORMAL_ELEMENT_NAME']
                insert_dict['PERIOD'] = normals_dict[insert_dict['NORMAL_ID']][
                    'PERIOD'
                ]
                insert_dict['PERIOD_BEGIN'] = periods_dict[
                    insert_dict['NORMAL_PERIOD_ID']
                ]['PERIOD_BEGIN']
                insert_dict['PERIOD_END'] = periods_dict[
                    insert_dict['NORMAL_PERIOD_ID']
                ]['PERIOD_END']
                insert_dict['CLIMATE_IDENTIFIER'] = stn_dict[
                    insert_dict['STN_ID']
                ]['CLIMATE_IDENTIFIER']
                # Internal join key; not part of the published document.
                del insert_dict['NORMAL_PERIOD_ID']
                wrapper = {
                    'type': 'Feature',
                    'properties': insert_dict,
                    'geometry': {'type': 'Point', 'coordinates': coords},
                }
                action = {
                    '_id': insert_dict['ID'],
                    '_index': 'climate_normals_data',
                    '_op_type': 'update',
                    'doc': wrapper,
                    'doc_as_upsert': True,
                }
                yield action
            else:
                LOGGER.error(
                    f"Bad STN ID: {insert_dict['STN_ID']}, skipping"
                    f" records for this station"
                )

    def generate_monthly_data(self, stn_dict, date=None):
        """
        Queries monthly data from the db, and reformats data so it can
        be inserted into Elasticsearch.

        Returns a generator of dictionaries that represent upsert actions
        into Elasticsearch's bulk API.

        :param stn_dict: mapping of station IDs to station information
                         (queried via ``self.cur``).
        :param date: date to start fetching data from (``YYYY-MM-DD``);
                     when ``None`` the full table is fetched.
        :returns: generator of bulk API upsert actions.
        """
        if not date:
            try:
                self.cur.execute(
                    'select * from CCCS_PORTAL.PUBLIC_CLIMATE_SUMMARY'
                )
            except Exception as err:
                LOGGER.error(
                    f'Could not fetch records from oracle due to: {str(err)}.'
                )
        else:
            try:
                # NOTE(review): `date` is interpolated directly into the SQL
                # string; assumed to come from a trusted CLI argument —
                # consider cx_Oracle bind variables. TODO confirm.
                self.cur.execute(
                    (
                        f"select * from CCCS_PORTAL.PUBLIC_CLIMATE_SUMMARY "
                        f"WHERE LAST_UPDATED > TO_TIMESTAMP("
                        f"'{date} 00:00:00', 'YYYY-MM-DD HH24:MI:SS')"
                    )
                )
            except Exception as err:
                LOGGER.error(
                    f'Could not fetch records from oracle due to: {str(err)}.'
                )

        for row in self.cur:
            insert_dict = dict(zip([x[0] for x in self.cur.description], row))
            # Transform Date fields from datetime to string.
            insert_dict['LAST_UPDATED'] = (
                str(insert_dict['LAST_UPDATED'])
                if insert_dict['LAST_UPDATED'] is not None
                else insert_dict['LAST_UPDATED']
            )
            # Document id: station + year + month.
            insert_dict['ID'] = '{}.{}.{}'.format(
                insert_dict['STN_ID'],
                insert_dict['LOCAL_YEAR'],
                insert_dict['LOCAL_MONTH'],
            )
            if insert_dict['STN_ID'] in stn_dict:
                coords = stn_dict[insert_dict['STN_ID']]['coordinates']
                insert_dict['PROVINCE_CODE'] = stn_dict[insert_dict['STN_ID']][
                    'PROVINCE_CODE'
                ]
                wrapper = {
                    'type': 'Feature',
                    'properties': insert_dict,
                    'geometry': {'type': 'Point', 'coordinates': coords},
                }
                action = {
                    '_id': insert_dict['ID'],
                    '_index': 'climate_public_climate_summary',
                    '_op_type': 'update',
                    'doc': wrapper,
                    'doc_as_upsert': True,
                }
                yield action
            else:
                LOGGER.error(
                    f"Bad STN ID: {insert_dict['STN_ID']}, skipping"
                    f" records for this station"
                )

    def generate_daily_data(self, stn_dict, date=None):
        """
        Queries daily data from the db, and reformats data so it can be
        inserted into Elasticsearch.

        Returns a generator of dictionaries that represent upsert actions
        into Elasticsearch's bulk API. One query is issued per station in
        ``stn_dict``.

        :param stn_dict: mapping of station IDs to station information
                         (queried via ``self.cur``).
        :param date: date to start fetching data from (``YYYY-MM-DD``);
                     when ``None`` the station's full history is fetched.
        :returns: generator of bulk API upsert actions.
        """
        for station in stn_dict:
            if not date:
                try:
                    self.cur.execute(
                        f'select * from CCCS_PORTAL.PUBLIC_DAILY_DATA '
                        f'where STN_ID={station}'
                    )
                except Exception as err:
                    LOGGER.error(
                        f'Could not fetch records from oracle due to:'
                        f' {str(err)}.'
                    )
            else:
                try:
                    # NOTE(review): `station` and `date` interpolated into
                    # SQL; assumed trusted — TODO confirm / use binds.
                    self.cur.execute(
                        (
                            f"select * from CCCS_PORTAL.PUBLIC_DAILY_DATA "
                            f"where STN_ID={station} and "
                            f"LOCAL_DATE > TO_TIMESTAMP('{date} 00:00:00', "
                            f"'YYYY-MM-DD HH24:MI:SS')"
                        )
                    )
                except Exception as err:
                    LOGGER.error(
                        f'Could not fetch records from oracle due to:'
                        f' {str(err)}.'
                    )
            for row in self.cur:
                insert_dict = dict(
                    zip([x[0] for x in self.cur.description], row)
                )
                # Transform Date fields from datetime to string.
                insert_dict['LOCAL_DATE'] = (
                    str(insert_dict['LOCAL_DATE'])
                    if insert_dict['LOCAL_DATE'] is not None
                    else insert_dict['LOCAL_DATE']
                )
                # Document id: climate identifier + year + month + day.
                insert_dict['ID'] = '{}.{}.{}.{}'.format(
                    insert_dict['CLIMATE_IDENTIFIER'],
                    insert_dict['LOCAL_YEAR'],
                    insert_dict['LOCAL_MONTH'],
                    insert_dict['LOCAL_DAY'],
                )
                if insert_dict['STN_ID'] in stn_dict:
                    coords = stn_dict[insert_dict['STN_ID']]['coordinates']
                    insert_dict['PROVINCE_CODE'] = stn_dict[
                        insert_dict['STN_ID']
                    ]['PROVINCE_CODE']
                    insert_dict['STATION_NAME'] = stn_dict[
                        insert_dict['STN_ID']
                    ]['STATION_NAME']
                    wrapper = {
                        'type': 'Feature',
                        'properties': insert_dict,
                        'geometry': {'type': 'Point', 'coordinates': coords},
                    }
                    action = {
                        '_id': insert_dict['ID'],
                        '_index': 'climate_public_daily_data',
                        '_op_type': 'update',
                        'doc': wrapper,
                        'doc_as_upsert': True,
                    }
                    yield action
                else:
                    LOGGER.error(
                        f"Bad STN ID: {insert_dict['STN_ID']}, skipping"
                        f" records for this station"
                    )

    def get_station_data(self, station, starting_from):
        """
        Creates a mapping of station ID to station coordinates and
        province name.

        :param station: optional station ID to restrict the query to.
        :param starting_from: when truthy and ``station`` is given, select
                              all stations with ``STN_ID >= station``
                              instead of only that station.
        :returns: A dictionary of dictionaries containing station
                  coordinates and province name keyed by station ID.
        """
        stn_dict = collections.OrderedDict()
        try:
            if station:
                if starting_from:
                    self.cur.execute(
                        (
                            f'select STN_ID, LONGITUDE_DECIMAL_DEGREES, '
                            f'LATITUDE_DECIMAL_DEGREES, ENG_PROV_NAME, '
                            f'FRE_PROV_NAME, PROV_STATE_TERR_CODE, '
                            f'STATION_NAME, CLIMATE_IDENTIFIER '
                            f'from CCCS_PORTAL.STATION_INFORMATION '
                            f'where STN_ID >= {station} '
                            f'order by STN_ID'
                        )
                    )
                else:
                    self.cur.execute(
                        (
                            f'select STN_ID, LONGITUDE_DECIMAL_DEGREES, '
                            f'LATITUDE_DECIMAL_DEGREES, ENG_PROV_NAME, '
                            f'FRE_PROV_NAME, PROV_STATE_TERR_CODE, '
                            f'STATION_NAME, CLIMATE_IDENTIFIER '
                            f'from CCCS_PORTAL.STATION_INFORMATION '
                            f'where STN_ID = {station} '
                            f'order by STN_ID'
                        )
                    )
            else:
                self.cur.execute(
                    (
                        'select STN_ID, LONGITUDE_DECIMAL_DEGREES, '
                        'LATITUDE_DECIMAL_DEGREES, ENG_PROV_NAME, '
                        'FRE_PROV_NAME, PROV_STATE_TERR_CODE, '
                        'STATION_NAME, CLIMATE_IDENTIFIER '
                        'from CCCS_PORTAL.STATION_INFORMATION '
                        'order by STN_ID'
                    )
                )
        except Exception as err:
            LOGGER.error(
                f'Could not fetch records from oracle due to: {str(err)}.'
            )

        for row in self.cur:
            stn_dict[row[0]] = {
                'coordinates': [row[1], row[2]],
                'ENG_PROV_NAME': row[3],
                'FRE_PROV_NAME': row[4],
                'PROVINCE_CODE': row[5].strip(),  # remove the strip
                'STATION_NAME': row[6],
                'CLIMATE_IDENTIFIER': row[7].strip(),
            }
        return stn_dict

    def get_normals_data(self):
        """
        Creates a mapping of normal ID to pub_name and period.

        :returns: A dictionary of dictionaries containing pub_name and
                  period keyed by normal ID.
        """
        normals_dict = {}
        try:
            self.cur.execute(
                (
                    'select NORMAL_ID, E_NORMAL_ELEMENT_NAME, '
                    'F_NORMAL_ELEMENT_NAME, PERIOD '
                    'from CCCS_PORTAL.VALID_NORMALS_ELEMENTS'
                )
            )
        except Exception as err:
            LOGGER.error(
                f'Could not fetch records from oracle due to: {str(err)}.'
            )
        for row in self.cur:
            normals_dict[row[0]] = {
                'E_NORMAL_ELEMENT_NAME': row[1],
                'F_NORMAL_ELEMENT_NAME': row[2],
                'PERIOD': row[3],
            }
        return normals_dict

    def get_normals_periods(self):
        """
        Creates a mapping of normal period ID to period begin and end.

        :returns: A dictionary of dictionaries containing period begin and
                  end keyed by normal period ID.
        """
        period_dict = {}
        try:
            self.cur.execute(
                (
                    'select NORMAL_PERIOD_ID, PERIOD_BEGIN, PERIOD_END '
                    'from CCCS_PORTAL.NORMAL_PERIODS'
                )
            )
        except Exception as err:
            LOGGER.error(
                f'Could not fetch records from oracle due to: {str(err)}.'
            )
        for row in self.cur:
            period_dict[row[0]] = {
                'PERIOD_BEGIN': row[1],
                'PERIOD_END': row[2],
            }
        return period_dict
class CitypageweatherRealtimeLoader(BaseLoader):
    """
    Current conditions real-time loader.

    Parses Citypage Weather XML (English/French variants) into a GeoJSON
    feature and indexes it into Elasticsearch.
    """

    def __init__(self, conn_config={}):
        """
        initializer

        :param conn_config: Elasticsearch connection configuration `dict`
        """

        BaseLoader.__init__(self)

        self.conn = ElasticsearchConnector(conn_config)
        self.conn.create(INDEX_NAME, mapping=SETTINGS)

    def load_data(self, filepath):
        """
        function from base to load the data in ES

        :param filepath: filepath for parsing the current condition file

        :returns: `bool` — True on successful indexing, False otherwise
        """

        # Lookup table mapping site codes to WXO city codes.
        with open(
            os.path.join(MSC_PYGEOAPI_BASEPATH,
                         'resources/wxo_lookup.json')) as json_file:
            wxo_lookup = json.load(json_file)

        data = self.xml2json_cpw(wxo_lookup, filepath)

        try:
            r = self.conn.Elasticsearch.index(
                index=INDEX_NAME,
                id=data['properties']['identifier'],
                body=data)
            LOGGER.debug('Result: {}'.format(r))
            return True
        except Exception as err:
            LOGGER.warning('Error indexing: {}'.format(err))
            return False

    def _get_element(self, node, path, attrib=None):
        """
        Convenience function to resolve lxml.etree.Element handling

        :param node: xml node
        :param path: path in the xml node
        :param attrib: attribute to get in the node

        :returns: attribute or text content as string, or None when the
                  node/attribute is absent or empty
        """

        val = node.find(path)
        if attrib is not None and val is not None:
            return val.attrib.get(attrib)
        if hasattr(val, 'text') and val.text not in [None, '']:
            return val.text
        return None

    def if_none(self, type_, value):
        """
        Convenience function to avoid errors when converting to int or float

        :param type_: 'f' for float and 'i' for int
        :param value: value to convert to float/int

        :returns: converted value, or the string 'null' for falsy input
                  (filtered out of the final properties in xml2json_cpw)
        """

        # NOTE(review): any type_ other than 'f'/'i' leaves `variable`
        # unassigned (UnboundLocalError) — callers only pass 'f'/'i'.
        if type_ == 'f':
            variable = float(value) if value else 'null'
        elif type_ == 'i':
            variable = int(value) if value else 'null'

        return variable

    def xml2json_cpw(self, wxo_lookup, xml):
        """
        main for generating weather data

        :param wxo_lookup: dict mapping site codes to city codes
        :param xml: xml file to parse and convert to json

        :returns: GeoJSON feature dict of the current conditions
        """

        feature = {}
        row = {}

        LOGGER.debug('Processing XML: {}'.format(xml))
        LOGGER.debug('Fetching English elements')

        try:
            root = etree.parse(xml).getroot()
        except Exception as err:
            # NOTE(review): on parse failure `root` stays unbound and the
            # next line raises NameError — confirm intended behavior.
            LOGGER.error('ERROR: cannot process data: {}'.format(err))

        if root.findall("currentConditions/"):
            # Site code is the filename minus the '_x.xml' suffix.
            sitecode = os.path.basename(xml)[:-6]
            try:
                citycode = wxo_lookup[sitecode]['citycode']
            except KeyError as err:
                LOGGER.error('ERROR: cannot find sitecode {} : '
                             'err: {}'.format(sitecode, err))

            location_name = root.find('location/name')
            # Coordinates carry a trailing hemisphere letter (e.g. '45.5N').
            x = float(location_name.attrib.get('lon')[:-1])
            y = float(location_name.attrib.get('lat')[:-1])

            # NOTE(review): `elif` means a southern-hemisphere point never
            # gets its longitude negated — harmless for Canadian data
            # (always 'N') but looks like it should be two separate ifs.
            if location_name.attrib.get('lat')[-1] == 'S':
                y *= -1  # south means negative latitude
            elif location_name.attrib.get('lon')[-1] in ['W', 'O']:
                x *= -1  # west means negative longitude

            feature['geom'] = [x, y, 0.0]

            icon = self._get_element(root, 'currentConditions/iconCode')

            if icon:
                row['icon'] = 'https://weather.gc.ca/' \
                              'weathericons/{}.gif'.format(icon)
            else:
                row['icon'] = None

            # Observation timestamp (UTC); NOTE(review): if no matching
            # dateTime node exists, row['timestamp'] stays unset and the
            # feature assembly below raises KeyError — confirm feeds.
            for dates in root.findall("currentConditions/dateTime"
                                      "[@zone='UTC'][@name='observation']"):
                timestamp = dates.find('timeStamp')
                if timestamp is not None:
                    dt2 = datetime.strptime(timestamp.text, '%Y%m%d%H%M%S')
                    row['timestamp'] = dt2.strftime('%Y-%m-%dT%H:%M:%SZ')

            # Language-neutral observation elements.
            row['rel_hum'] = self._get_element(
                root, 'currentConditions/relativeHumidity')
            row['speed'] = self._get_element(root,
                                             'currentConditions/wind/speed')
            row['gust'] = self._get_element(root,
                                            'currentConditions/wind/gust')
            row['direction'] = self._get_element(
                root, 'currentConditions/wind/direction')
            row['bearing'] = self._get_element(
                root, 'currentConditions/wind/bearing')
            row['temp'] = self._get_element(root,
                                            'currentConditions/temperature')
            row['dewpoint'] = self._get_element(root,
                                                'currentConditions/dewpoint')
            row['windchill'] = self._get_element(
                root, 'currentConditions/windChill')

            # English product.
            if xml.endswith('e.xml'):
                row['name'] = self._get_element(root, 'location/name')
                row['station_en'] = self._get_element(
                    root, 'currentConditions/station')
                row['cond_en'] = self._get_element(
                    root, 'currentConditions/condition')
                row['pres_en'] = self._get_element(
                    root, 'currentConditions/pressure')
                row['prestnd_en'] = self._get_element(
                    root, 'currentConditions/pressure', 'tendency')
                row['url_en'] = 'https://weather.gc.ca/city/pages/' \
                                '{}_metric_e.html'.format(citycode)

                row['national'] = 0
                if row['name'] in NATIONAL_CITIES:
                    row['national'] = 1

                LOGGER.debug('Adding feature')
                LOGGER.debug('Setting geometry')

                conditions = {
                    'type': "Feature",
                    'properties': {
                        'identifier': citycode,
                        'name': row['name'],
                        'station_en': row['station_en'],
                        'icon': row['icon'],
                        'cond_en': row['cond_en'],
                        'temp': self.if_none('f', row['temp']),
                        'dewpoint': self.if_none('f', row['dewpoint']),
                        'windchill': self.if_none('i', row['windchill']),
                        'pres_en': self.if_none('f', row['pres_en']),
                        'prestnd_en': row['prestnd_en'],
                        'rel_hum': self.if_none('i', row['rel_hum']),
                        'speed': self.if_none('i', row['speed']),
                        'gust': self.if_none('i', row['gust']),
                        'direction': row['direction'],
                        'bearing': self.if_none('f', row['bearing']),
                        'timestamp': row['timestamp'],
                        'url_en': row['url_en'],
                        'national': int(row['national'])
                    },
                    'geometry': {
                        'type': "Point",
                        'coordinates': feature['geom']
                    }
                }

            # French product.
            elif xml.endswith('f.xml'):
                LOGGER.debug('Processing {}'.format(xml))
                row['nom'] = self._get_element(root, 'location/name')
                row['station_fr'] = self._get_element(
                    root, 'currentConditions/station')
                row['cond_fr'] = self._get_element(
                    root, 'currentConditions/condition')
                row['pres_fr'] = self._get_element(
                    root, 'currentConditions/pressure')
                row['prestnd_fr'] = self._get_element(
                    root, 'currentConditions/pressure', 'tendency')
                row['url_fr'] = 'https://meteo.gc.ca/city/pages/' \
                                '{}_metric_f.html'.format(citycode)

                row['national'] = 0
                if row['nom'] in NATIONAL_CITIES:
                    row['national'] = 1

                LOGGER.debug('Adding feature')
                LOGGER.debug('Setting geometry')

                conditions = {
                    'type': "Feature",
                    'properties': {
                        'identifier': citycode,
                        'nom': row['nom'],
                        'station_fr': row['station_fr'],
                        'icon': row['icon'],
                        'cond_fr': row['cond_fr'],
                        'temp': self.if_none('f', row['temp']),
                        'dewpoint': self.if_none('f', row['dewpoint']),
                        'windchill': self.if_none('i', row['windchill']),
                        'pres_fr': self.if_none('f', row['pres_fr']),
                        'prestnd_fr': row['prestnd_fr'],
                        'rel_hum': self.if_none('i', row['rel_hum']),
                        'speed': self.if_none('i', row['speed']),
                        'gust': self.if_none('i', row['gust']),
                        'direction': row['direction'],
                        'bearing': self.if_none('f', row['bearing']),
                        'timestamp': row['timestamp'],
                        'url_fr': row['url_fr'],
                        'national': int(row['national'])
                    },
                    'geometry': {
                        'type': "Point",
                        'coordinates': feature['geom']
                    }
                }

            # Drop sentinel 'null' values produced by if_none().
            conditions['properties'] = {
                key: val
                for key, val in conditions['properties'].items()
                if val != 'null'
            }  # noqa

        # NOTE(review): `conditions` is unbound when the XML has no
        # currentConditions children — confirm callers only pass files
        # with current conditions.
        return conditions
class AQHIRealtimeLoader(BaseLoader):
    """
    AQHI Real-time loader.

    NOTE(review): a second class with this exact name is defined later in
    this module (with a different filename pattern and a latest-status
    update); that later definition shadows this one — confirm whether this
    version should be removed.
    """

    def __init__(self, conn_config={}):
        """
        initializer

        :param conn_config: Elasticsearch connection configuration `dict`
        """

        BaseLoader.__init__(self)

        self.conn = ElasticsearchConnector(conn_config)

        self.filepath = None   # Path of file currently being loaded
        self.type = None       # 'forecasts' or 'observations'
        self.region = None     # AQHI region code from the filename
        self.date_ = None      # datetime parsed from the filename
        self.items = []        # features collected during generation

        # only create index templates with forecasts and observations mappings
        template_mappings = {
            k: MAPPINGS[k] for k in ('forecasts', 'observations')
        }

        # NOTE(review): mutates the module-level SETTINGS dict in place for
        # each template — fine while construction is single-threaded.
        for aqhi_type in template_mappings:
            template_name = INDEX_BASENAME.format(aqhi_type)
            SETTINGS['index_patterns'] = ['{}*'.format(template_name)]
            SETTINGS['mappings'] = MAPPINGS[aqhi_type]
            self.conn.create_template(template_name, SETTINGS)

    def parse_filename(self):
        """
        Parses a aqhi filename in order to get the date, forecast issued
        time, and region name.

        :return: `bool` of parse status
        """
        # parse filepath
        pattern = 'AQ_{type}_{region}_{date_}.json'
        filename = self.filepath.name
        parsed_filename = parse(pattern, filename)

        # set class attributes
        type_ = parsed_filename.named['type']
        if type_ == 'FCST':
            self.type = 'forecasts'
        if type_ == 'OBS':
            self.type = 'observations'

        self.region = parsed_filename.named['region']
        self.date_ = datetime.strptime(parsed_filename.named['date_'],
                                       '%Y%m%d%H%M')

        return True

    def generate_geojson_features(self):
        """
        Generates and yields a series of aqhi forecasts or observations.
        Forecasts and observations are returned as Elasticsearch bulk API
        upsert actions, with documents in GeoJSON to match the
        Elasticsearch index mappings.

        :returns: Generator of Elasticsearch actions to upsert the AQHI
                  forecasts/observations
        """
        with open(self.filepath.resolve()) as f:
            data = json.load(f)

        # Forecast files are FeatureCollections; observation files are a
        # single feature document.
        if self.type == "forecasts":
            features = data['features']
        elif self.type == "observations":
            features = [data]

        for feature in features:
            # set document id and clean out unnecessery properties
            feature['id'] = feature.pop('ID', None)

            # set ES index name for feature (daily index per file date)
            es_index = '{}{}'.format(
                INDEX_BASENAME.format(self.type),
                self.date_.strftime('%Y-%m-%d'),
            )

            self.items.append(feature)

            action = {
                '_id': feature['id'],
                '_index': es_index,
                '_op_type': 'update',
                'doc': feature,
                'doc_as_upsert': True,
            }

            yield action

    def load_data(self, filepath):
        """
        loads data from event to target

        :param filepath: filepath to data on disk

        :returns: `bool` of status result
        """
        self.filepath = Path(filepath)

        # set class variables from filename
        self.parse_filename()

        LOGGER.debug('Received file {}'.format(self.filepath))

        # generate geojson features
        package = self.generate_geojson_features()
        self.conn.submit_elastic_package(package, request_size=80000)

        return True
class AQHIRealtimeLoader(BaseLoader):
    """
    AQHI Real-time loader.

    NOTE(review): this shadows an identically-named class defined earlier
    in this module; this version handles the newer
    ``{date}_MSC_AQHI-...`` filenames and maintains a ``latest`` flag on
    observations — confirm the earlier definition is obsolete.
    """

    def __init__(self, conn_config={}):
        """
        initializer

        :param conn_config: Elasticsearch connection configuration `dict`
        """

        BaseLoader.__init__(self)

        self.conn = ElasticsearchConnector(conn_config)

        self.filepath = None   # Path of file currently being loaded
        self.type = None       # 'forecasts' or 'observations'
        self.region = None     # AQHI region/location id from the filename
        self.date_ = None      # datetime parsed from the filename
        self.items = []        # features collected during generation

        # only create index templates with forecasts and observations mappings
        template_mappings = {
            k: MAPPINGS[k] for k in ('forecasts', 'observations')
        }

        # NOTE(review): mutates the module-level SETTINGS dict in place for
        # each template — fine while construction is single-threaded.
        for aqhi_type in template_mappings:
            template_name = INDEX_BASENAME.format(aqhi_type)
            SETTINGS['index_patterns'] = ['{}*'.format(template_name)]
            SETTINGS['mappings'] = MAPPINGS[aqhi_type]
            self.conn.create_template(template_name, SETTINGS)

    def parse_filename(self):
        """
        Parses a aqhi filename in order to get the date, forecast issued
        time, and region name.

        :return: `bool` of parse status
        """
        # parse filepath
        pattern = '{date_}_MSC_AQHI-{type}_{region}.json'
        filename = self.filepath.name
        parsed_filename = parse(pattern, filename)

        # set class attributes
        type_ = parsed_filename.named['type']
        if type_ == 'Forecasts':
            self.type = 'forecasts'
        if type_ == 'Observation':
            self.type = 'observations'

        self.region = parsed_filename.named['region']
        self.date_ = datetime.strptime(
            parsed_filename.named['date_'], '%Y%m%dT%H%MZ'
        )

        return True

    def generate_geojson_features(self):
        """
        Generates and yields a series of aqhi forecasts or observations.
        Forecasts and observations are returned as Elasticsearch bulk API
        upsert actions, with documents in GeoJSON to match the
        Elasticsearch index mappings.

        :returns: Generator of Elasticsearch actions to upsert the AQHI
                  forecasts/observations
        """
        with open(self.filepath.resolve()) as f:
            data = json.load(f)

        # Forecast files are FeatureCollections; observation files are a
        # single feature document.
        if self.type == "forecasts":
            features = data['features']
        elif self.type == "observations":
            features = [data]

        for feature in features:
            # set ES index name for feature (daily index per file date)
            es_index = '{}{}'.format(
                INDEX_BASENAME.format(self.type),
                self.date_.strftime('%Y-%m-%d'),
            )

            # Newest observation is flagged latest; older documents are
            # cleared afterwards by update_latest_status().
            if self.type == 'observations':
                feature['properties']['latest'] = True

            self.items.append(feature)

            action = {
                '_id': feature['id'],
                '_index': es_index,
                '_op_type': 'update',
                'doc': feature,
                'doc_as_upsert': True,
            }

            yield action

    def update_latest_status(self):
        """
        update old observation AQHI status to False

        :return: `bool` of update status
        """
        lt_date = self.date_.strftime('%Y-%m-%dT%H:%M:%SZ')
        # Painless update-by-query: clear `latest` on all older
        # observations for this region.
        query = {
            "script": {
                "source": "ctx._source.properties.latest=false",
                "lang": "painless"
            },
            "query": {
                "bool": {
                    "must": [{
                        "match": {
                            "properties.location_id": self.region
                        }
                    }, {
                        "range": {
                            "properties.observation_datetime": {
                                "lt": lt_date,
                            }
                        }
                    }]
                }
            }
        }

        # create list of today and yesterday index
        index_ = '{}*'.format(INDEX_BASENAME.format(self.type))

        try:
            self.conn.update_by_query(query, index_)
        except Exception as err:
            # Best-effort: a failed flag update must not fail the load.
            LOGGER.warning('{}: failed to update ES index'.format(err))

        return True

    def load_data(self, filepath):
        """
        loads data from event to target

        :param filepath: filepath to data on disk

        :returns: `bool` of status result
        """
        self.filepath = Path(filepath)

        # set class variables from filename
        self.parse_filename()

        LOGGER.debug('Received file {}'.format(self.filepath))

        # generate geojson features
        package = self.generate_geojson_features()
        self.conn.submit_elastic_package(package, request_size=80000)

        # flip `latest` off on superseded observations
        if self.type == 'observations':
            LOGGER.debug('Updating Observation status')
            self.update_latest_status()

        return True
class HydrometricRealtimeLoader(BaseLoader):
    """Hydrometric Real-time loader"""

    def __init__(self, conn_config={}):
        """
        initializer

        :param conn_config: Elasticsearch connection configuration `dict`
        """

        BaseLoader.__init__(self)

        self.conn = ElasticsearchConnector(conn_config)
        self.conn.create_template(INDEX_BASENAME, SETTINGS)

        # station id -> station metadata, populated from the cached list
        self.stations = {}
        self.read_stations_list()

    def read_stations_list(self):
        """
        Parses the local copy of the hydrometric stations list, creating
        a dictionary of station IDs to station info and putting it in
        <self.stations>.

        :returns: void
        :raises EOFError: if the cached stations file is empty
        """
        if not os.path.exists(STATIONS_CACHE):
            download_stations()

        with open(STATIONS_CACHE) as stations_file:
            reader = csv.reader(stations_file)

            try:
                # Discard one row of headers
                next(reader)
            except StopIteration:
                raise EOFError(
                    'Stations file at {} is empty'.format(STATIONS_CACHE))

            self.stations.clear()
            for row in reader:
                # Expected columns: id, name, lat, lon, province, timezone.
                if len(row) > 6:
                    LOGGER.warning('Station list row has too many values: {}'
                                   ' (using first 6)'.format(row))
                elif len(row) < 6:
                    LOGGER.error('Station list row has too few values: {}'
                                 ' (skipping)'.format(row))
                    continue

                stn_id, name, lat, lon, province, timezone = row[:6]

                try:
                    lat = float(lat)
                    lon = float(lon)
                except ValueError:
                    LOGGER.error('Cannot interpret coordinates ({}, {}) for'
                                 ' station {} (skipping)'.format(
                                     lon, lat, stn_id))
                    continue

                # assumes timezone looks like 'UTC-05:00' — offset starts
                # at index 4. TODO confirm against the stations file.
                utcoffset = timezone[4:]
                if utcoffset.strip() == '':
                    LOGGER.error('Cannot interpret UTC offset {} for station'
                                 ' {} (skipping)'.format(timezone, stn_id))
                    continue

                LOGGER.debug('Station {}: name={}, province/territory={},'
                             ' coordinates={}, utcoffset={}'.format(
                                 stn_id, name, province, (lon, lat),
                                 utcoffset))

                stn_info = {
                    'STATION_NAME': name,
                    'PROV_TERR_STATE_LOC': province,
                    'UTCOFFSET': utcoffset,
                    'coordinates': (lon, lat)
                }

                self.stations[stn_id] = stn_info

        LOGGER.debug(
            'Collected stations information: loaded {} stations'.format(
                len(self.stations)))

    def generate_observations(self, filepath):
        """
        Generates and yields a series of observations, one for each row in
        <filepath>. Observations are returned as Elasticsearch bulk API
        upsert actions, with documents in GeoJSON to match the
        Elasticsearch index mappings.

        :param filepath: Path to a data file of realtime hydrometric
        :returns: Generator of Elasticsearch actions to upsert the
                  observations
        """
        # Retention windows: hourly files cover the last 2 days; daily
        # files cover DAYS_TO_KEEP days before today.
        today = datetime.utcnow()
        today_start = datetime(year=today.year, month=today.month,
                               day=today.day)
        hourly_domain_start = today_start - timedelta(days=2)
        daily_domain_start = today_start - timedelta(days=DAYS_TO_KEEP)

        with open(filepath) as ff:
            reader = csv.reader(ff)

            # Discard one row of headers
            next(reader)

            for row in reader:
                if len(row) > 10:
                    LOGGER.warning('Data row in {} has too many values:'
                                   ' {} (using only first 10)'.format(
                                       filepath, row))
                elif len(row) < 10:
                    LOGGER.error('Data row in {} has too few values: {}'
                                 ' (skipping)'.format(filepath, row))
                    continue

                station, date, level, _, level_symbol, _, \
                    discharge, _, discharge_symbol, _ = row

                if station in self.stations:
                    stn_info = self.stations[station]
                    LOGGER.debug('Found info for station {}'.format(station))
                else:
                    LOGGER.error(
                        'Cannot find info for station {} (skipping)'.format(
                            station))
                    continue

                try:
                    # Convert timestamp to UTC time.
                    utc_datetime = delocalize_date(date)
                    utc_datestamp = utc_datetime.strftime('%Y-%m-%d.%H:%M:%S')
                    # Generate an ID now that all fields are known.
                    observation_id = '{}.{}'.format(station, utc_datestamp)

                    utc_datestamp = utc_datestamp.replace('.', 'T')
                except Exception as err:
                    LOGGER.error('Cannot interpret datetime value {} in {}'
                                 ' due to: {} (skipping)'.format(
                                     date, filepath, str(err)))
                    continue

                # Daily rows inside the hourly window are redundant;
                # anything older than the retention period is dropped.
                if 'daily' in filepath and utc_datetime > hourly_domain_start:
                    LOGGER.debug('Daily observation {} overlaps hourly data'
                                 ' (skipping)'.format(observation_id))
                    continue
                elif utc_datetime < daily_domain_start:
                    LOGGER.debug('Daily observation {} precedes retention'
                                 ' period (skipping)'.format(observation_id))
                    continue

                LOGGER.debug('Generating observation {} from {}: datetime={},'
                             ' level={}, discharge={}'.format(
                                 observation_id, filepath, utc_datestamp,
                                 level, discharge))

                try:
                    level = float(level) if level.strip() else None
                except ValueError:
                    LOGGER.error('Cannot interpret level value {}'
                                 ' (setting null)'.format(level))

                try:
                    discharge = float(discharge) if discharge.strip() \
                        else None
                except ValueError:
                    LOGGER.error('Cannot interpret discharge value {}'
                                 ' (setting null)'.format(discharge))

                # NOTE(review): the *_symbol_en/_fr locals are only bound
                # when the symbol field is blank; a non-blank symbol raises
                # NameError below — looks like a symbol-lookup step is
                # missing here. TODO confirm.
                if level_symbol.strip() == '':
                    level_symbol_en = None
                    level_symbol_fr = None
                if discharge_symbol.strip() == '':
                    discharge_symbol_en = None
                    discharge_symbol_fr = None

                observation = {
                    'type': 'Feature',
                    'geometry': {
                        'type': 'Point',
                        'coordinates': stn_info['coordinates']
                    },
                    'properties': {
                        'IDENTIFIER': observation_id,
                        'STATION_NUMBER': station,
                        'STATION_NAME': stn_info['STATION_NAME'],
                        'PROV_TERR_STATE_LOC': stn_info['PROV_TERR_STATE_LOC'],
                        'DATETIME': utc_datestamp,
                        'DATETIME_LST': date,
                        'LEVEL': level,
                        'DISCHARGE': discharge,
                        'LEVEL_SYMBOL_EN': level_symbol_en,
                        'LEVEL_SYMBOL_FR': level_symbol_fr,
                        'DISCHARGE_SYMBOL_EN': discharge_symbol_en,
                        'DISCHARGE_SYMBOL_FR': discharge_symbol_fr
                    }
                }

                LOGGER.debug('Observation {} created successfully'.format(
                    observation_id))

                # Daily index keyed by the observation's UTC date.
                es_index = '{}{}'.format(INDEX_BASENAME,
                                         utc_datetime.strftime('%Y-%m-%d'))

                action = {
                    '_id': observation_id,
                    '_index': es_index,
                    '_op_type': 'update',
                    'doc': observation,
                    'doc_as_upsert': True
                }

                yield action

    def load_data(self, filepath):
        """
        loads data from event to target

        :param filepath: filepath to data on disk

        :returns: `bool` of status result
        """

        # The station-list file is handled by read_stations_list(), not here.
        if filepath.endswith('hydrometric_StationList.csv'):
            return True

        LOGGER.debug('Received file {}'.format(filepath))
        package = self.generate_observations(filepath)

        self.conn.submit_elastic_package(package, request_size=80000)

        return True
class SWOBRealtimeLoader(BaseLoader):
    """SWOB Real-time loader"""

    def __init__(self, conn_config=None):
        """
        initializer

        :param conn_config: Elasticsearch connection configuration `dict`
                            (optional; defaults to an empty configuration)
        """

        BaseLoader.__init__(self)

        # Use a None sentinel instead of a mutable default argument.
        if conn_config is None:
            conn_config = {}

        self.conn = ElasticsearchConnector(conn_config, verify_certs=False)
        # features collected during generation (inspected by callers)
        self.items = []
        self.conn.create_template(INDEX_BASENAME, SETTINGS)

    def generate_observations(self, filepath):
        """
        Generates and yields a series of observations, one for each row in
        <filepath>. Observations are returned as Elasticsearch bulk API
        upsert actions, with documents in GeoJSON to match the
        Elasticsearch index mappings.

        :param filepath: Path to a data file of realtime SWOB
        :returns: Generator of Elasticsearch actions to upsert the
                  observations
        """

        observation = swob2geojson(filepath)
        observation_id = observation['id']

        LOGGER.debug(
            'Observation {} created successfully'.format(observation_id))

        # Daily index keyed by the observation's date.
        obs_dt = datetime.strptime(
            observation['properties']['date_tm-value'],
            DATETIME_RFC3339_MILLIS_FMT,
        )
        obs_dt2 = obs_dt.strftime('%Y-%m-%d')
        es_index = '{}{}'.format(INDEX_BASENAME, obs_dt2)

        action = {
            '_id': observation_id,
            '_index': es_index,
            '_op_type': 'update',
            'doc': observation,
            'doc_as_upsert': True,
        }

        self.items.append(observation)

        yield action

    def load_data(self, filepath):
        """
        loads data from event to target

        :param filepath: filepath to data on disk

        :returns: `bool` of status result
        """

        LOGGER.debug('Received file {}'.format(filepath))
        chunk_size = 80000

        package = self.generate_observations(filepath)
        self.conn.submit_elastic_package(package, request_size=chunk_size)
        return True
class MarineWeatherRealtimeLoader(BaseLoader):
    """Marine weather real-time loader"""

    def __init__(self, conn_config={}):
        """
        initializer

        :param conn_config: Elasticsearch connection configuration `dict`
        """

        BaseLoader.__init__(self)

        self.conn = ElasticsearchConnector(conn_config)

        self.filepath = None          # Path of file currently being loaded
        self.region_name_code = None  # meteocode region from filename
        self.language = None          # 'en'/'fr' suffix from filename
        self.root = None              # parsed XML root element
        self.area = {}                # area metadata + geometry (set_area_info)
        self.items = []               # features collected during generation

        # create marine weather indices if it don't exist
        # NOTE(review): mutates the module-level SETTINGS dict in place.
        for item in MAPPINGS:
            SETTINGS['mappings']['properties']['properties'][
                'properties'] = MAPPINGS[item]
            self.conn.create(INDEX_NAME.format(item), SETTINGS)

    def parse_filename(self):
        """
        Parses a marine weather forecast XML filename to get the region
        name code and language.

        :return: `bool` of parse status
        """
        # parse filepath
        pattern = '{region_name_code}_{language}.xml'
        filename = self.filepath.name
        parsed_filename = parse(pattern, filename)
        # set class variables
        self.region_name_code = parsed_filename.named['region_name_code']
        self.language = parsed_filename.named['language']

        return True

    def create_datetime_dict(self, datetime_elems):
        """
        Used to pass a pair of timeStamp elements from the XML. These
        elements contain the UTC and local time for various marine forecast
        sections (warnings, regular forecasts, extended forecasts). The
        first element contains UTC datetime info and the second local
        datetime info.

        :param datetime_elems: list of lxml `Element` objects representing
                               the dateTime nodes to parse.
        :returns: `dict` with "utc" and "local" keys containing respective
                  parsed datetime objects.
        """
        datetime_utc = datetime.strptime(
            datetime_elems[0].find('timeStamp').text, '%Y%m%d%H%M')
        # Local time = UTC + offset; carries an explicit tzinfo.
        local_offset = float(datetime_elems[1].attrib['UTCOffset'])
        datetime_local = datetime_utc + timedelta(hours=local_offset)
        datetime_local = datetime_local.replace(
            tzinfo=timezone(timedelta(hours=local_offset)))

        return {'utc': datetime_utc, 'local': datetime_local}

    def set_area_info(self):
        """
        Gets the area name from the marine weather XML document and looks
        up the equivalent meteocode forecast polygon feature ID to query
        the forecast_polygons_water ES index for the corresponding
        document. If document is found, assigns the self.area class
        attribute that contains region name, subregion name, area name and
        the associated geometry.

        :return: `bool` representing successful setting of self.area
                 attribute (None when the polygon document is not found —
                 self.area is then left empty and the generators below
                 will fail on self.area['geometry'])
        """
        area_name = self.root.find('area').text
        with open(
                os.path.join(
                    MSC_PYGEOAPI_BASEPATH,
                    'lib/msc_pygeoapi/',
                    'resources/meteocode_lookup.json',
                )) as json_file:
            meteocode_lookup = json.load(json_file)
            forecast_id = meteocode_lookup[self.region_name_code]

        try:
            result = self.conn.Elasticsearch.get(
                index='forecast_polygons_water_detail',
                id=forecast_id,
                _source=['geometry'],
            )
            self.area = {
                # get area element value
                **{
                    'name': area_name
                },
                # get area element attribute values
                **{
                    key: self.root.find('area').attrib[key]
                    for key in ['countryCode', 'region', 'subRegion']
                },
                **result['_source'],
            }
            return True
        except exceptions.NotFoundError:
            LOGGER.warning("Could not get forecast polygon document with id: "
                           "{}".format(forecast_id))

    def generate_warnings(self):
        """
        Generates and yields a series of marine weather warnings for a
        given marine weather area. Warnings are returned as Elasticsearch
        bulk API upsert actions, with a single document for the marine
        weather region in GeoJSON to match the Elasticsearch index
        mappings.

        :returns: Generator of Elasticsearch actions to upsert the marine
                  weather warnings.
        """
        warnings = self.root.findall('warnings/')
        feature = {'type': 'Feature', 'geometry': {}, 'properties': {}}
        feature['geometry'] = self.area['geometry']
        feature['properties']['area_{}'.format(
            self.language)] = self.area['name']
        feature['properties']['region_{}'.format(
            self.language)] = self.area['region']
        feature['properties']['sub_region_{}'.format(
            self.language)] = self.area['subRegion']
        feature['properties']['warnings_{}'.format(self.language)] = []

        if len(warnings) > 0:
            for elem in warnings:
                datetimes = self.create_datetime_dict(
                    elem.findall('event/'
                                 'dateTime'))
                location = {
                    'location_{}'.format(self.language):
                        elem.attrib['name'],
                    'issued_datetime_utc_{}'.format(self.language):
                        strftime_rfc3339(datetimes['utc']),
                    'issued_datetime_local_{}'.format(self.language):
                        strftime_rfc3339(datetimes['local']),
                    'event_type_{}'.format(self.language):
                        elem.find('event').attrib['type'],
                    'event_category_{}'.format(self.language):
                        elem.find('event').attrib['category'],
                    'event_name_{}'.format(self.language):
                        elem.find('event').attrib['name'],
                    'event_status_{}'.format(self.language):
                        elem.find('event').attrib['status'],
                }
                feature['properties']['warnings_{}'.format(
                    self.language)].append(location)

        self.items.append(feature)

        # Document id is the region code (filename stem before '_').
        action = {
            '_id': self.filepath.stem.split('_')[0],
            '_index': 'marine_weather_warnings',
            '_op_type': 'update',
            'doc': feature,
            'doc_as_upsert': True,
        }

        yield action

    def generate_regular_forecasts(self):
        """
        Generates and yields a series of marine weather regular forecasts
        for a given marine weather area. Each regular forecast is returned
        as Elasticsearch bulk API upsert actions, with documents in
        GeoJSON to match the Elasticsearch index mappings.

        :returns: Generator of Elasticsearch actions to upsert the marine
                  weather regular forecast.
        """
        regular_forecasts = self.root.findall('regularForecast/')
        feature = {'type': 'Feature', 'geometry': {}, 'properties': {}}
        feature['geometry'] = self.area['geometry']
        feature['properties']['area_{}'.format(
            self.language)] = self.area['name']
        feature['properties']['region_{}'.format(
            self.language)] = self.area['region']
        feature['properties']['sub_region_{}'.format(
            self.language)] = self.area['subRegion']
        feature['properties']['forecasts_{}'.format(self.language)] = []

        if len(regular_forecasts) > 0:
            datetimes = self.create_datetime_dict([
                element for element in regular_forecasts
                if element.tag == 'dateTime'
            ])
            feature['properties']['issued_datetime_utc'] = strftime_rfc3339(
                datetimes['utc'])
            feature['properties']['issued_datetime_local'] = strftime_rfc3339(
                datetimes['local'])

            locations = [
                element for element in regular_forecasts
                if element.tag == 'location'
            ]
            for location in locations:
                # Each optional sub-element falls back to None when absent.
                location = {
                    'location_{}'.format(self.language):
                        location.attrib['name']
                        if 'name' in location.attrib else self.area['name'],
                    'period_of_coverage_{}'.format(self.language):
                        location.find(
                            'weatherCondition/periodOfCoverage').text
                        if location.find('weatherCondition/periodOfCoverage')
                        is not None else None,
                    'wind_{}'.format(self.language):
                        location.find('weatherCondition/wind').text
                        if location.find('weatherCondition/wind')
                        is not None else None,
                    'weather_visibility_{}'.format(self.language):
                        location.find(
                            'weatherCondition/weatherVisibility').text
                        if location.find('weatherCondition/weatherVisibility')
                        is not None else None,
                    'air_temperature_{}'.format(self.language):
                        location.find('weatherCondition/airTemperature').text
                        if location.find('weatherCondition/airTemperature')
                        is not None else None,
                    'freezing_spray_{}'.format(self.language):
                        location.find('weatherCondition/freezingSpray').text
                        if location.find('weatherCondition/freezingSpray')
                        is not None else None,
                    'status_statement_{}'.format(self.language):
                        location.find('statusStatement').text
                        if location.find('statusStatement')
                        is not None else None,
                }
                feature['properties']['forecasts_{}'.format(
                    self.language)].append(location)

        self.items.append(feature)

        action = {
            '_id': self.filepath.stem.split('_')[0],
            '_index': 'marine_weather_regular-forecasts',
            '_op_type': 'update',
            'doc': feature,
            'doc_as_upsert': True,
        }

        yield action

    def generate_extended_forecasts(self):
        """
        Generates and yields a series of marine weather extended forecasts
        for a given marine weather area. Each extended forecast is
        returned as Elasticsearch bulk API upsert actions, with documents
        in GeoJSON to match the Elasticsearch index mappings.

        :returns: Generator of Elasticsearch actions to upsert the marine
                  weather extended forecast.
        """
        extended_forecasts = self.root.findall('extendedForecast/')
        feature = {'type': 'Feature', 'geometry': {}, 'properties': {}}
        feature['geometry'] = self.area['geometry']
        feature['properties']['area_{}'.format(
            self.language)] = self.area['name']
        feature['properties']['region_{}'.format(
            self.language)] = self.area['region']
        feature['properties']['sub_region_{}'.format(
            self.language)] = self.area['subRegion']
        feature['properties']['extended_forecasts_{}'.format(
            self.language)] = []

        if len(extended_forecasts) > 0:
            datetimes = self.create_datetime_dict([
                element for element in extended_forecasts
                if element.tag == 'dateTime'
            ])
            feature['properties']['issued_datetime_utc'] = strftime_rfc3339(
                datetimes['utc'])
            feature['properties']['issued_datetime_local'] = strftime_rfc3339(
                datetimes['local'])

            locations = [
                element for element in extended_forecasts
                if element.tag == 'location'
            ]
            for location in locations:
                location = {
                    'location_{}'.format(self.language):
                        location.attrib['name']
                        if 'name' in location.attrib else self.area['name'],
                    'forecast_periods_{}'.format(self.language): [{
                        'forecast_period_{}'.format(self.language):
                            forecast_period.attrib['name'],
                        'forecast_{}'.format(self.language):
                            forecast_period.text,
                    } for forecast_period in location.findall(
                        'weatherCondition/')
                        # NOTE(review): findall() returns a list, never
                        # None, so this guard is always true.
                        if location.findall('weatherCondition/')
                        is not None],
                    'status_statement_{}'.format(self.language):
                        location.find('statusStatement').text
                        if location.find('statusStatement')
                        is not None else None,
                }
                feature['properties']['extended_forecasts_{}'.format(
                    self.language)].append(location)

        self.items.append(feature)

        action = {
            '_id': self.filepath.stem.split('_')[0],
            '_index': 'marine_weather_extended-forecasts',
            '_op_type': 'update',
            'doc': feature,
            'doc_as_upsert': True,
        }

        yield action

    def load_data(self, filepath):
        """
        loads data from event to target

        :param filepath: filepath to data on disk

        :returns: `bool` of status result
        """
        self.filepath = Path(filepath)
        self.parse_filename()

        LOGGER.debug('Received file {}'.format(self.filepath))
        self.root = etree.parse(str(self.filepath.resolve())).getroot()
        # set area info for both languages from XML
        self.set_area_info()

        warnings = self.generate_warnings()
        regular_forecasts = self.generate_regular_forecasts()
        extended_forecasts = self.generate_extended_forecasts()
        for package in [warnings, regular_forecasts, extended_forecasts]:
            self.conn.submit_elastic_package(package, request_size=80000)

        return True
class HurricanesRealtimeLoader(BaseLoader):
    """Hurricanes Real-time loader"""

    def __init__(self, conn_config=None):
        """
        initializer

        :param conn_config: `dict` of Elasticsearch connection configuration
                            (defaults to an empty configuration)
        """
        BaseLoader.__init__(self)

        # avoid a shared mutable default argument; an empty dict default
        # would be shared across all instantiations
        if conn_config is None:
            conn_config = {}

        self.conn = ElasticsearchConnector(conn_config)
        self.filepath = None
        self.date_ = None
        self.fh = None
        self.storm_name = None
        self.storm_variable = None
        self.items = []

        # create storm variable indices if they don't exist
        for item in FILE_PROPERTIES:
            SETTINGS['mappings']['properties']['properties'][
                'properties'
            ] = FILE_PROPERTIES[item]
            self.conn.create(INDEX_NAME.format(item), SETTINGS)

    def parse_filename(self):
        """
        Parses a hurricane filename in order to get the date, forecast issued
        time, storm name, and storm variable.

        :return: `bool` of parse status
        """
        # parse filepath
        pattern = '{date_}_{fh}_{storm_name}.{storm_variable}.' \
                  '{file_extension}'
        filename = self.filepath.name
        parsed_filename = parse(pattern, filename)

        # set class variables
        # NOTE(review): parse() returns None on a non-matching filename,
        # which would raise AttributeError below — assumes upstream
        # filenames always match the pattern; confirm with callers
        self.date_ = datetime.strptime(parsed_filename.named['date_'],
                                       '%Y%m%d')
        self.fh = parsed_filename.named['fh']
        self.storm_name = parsed_filename.named['storm_name']
        self.storm_variable = parsed_filename.named['storm_variable']

        return True

    def check_shapefile_deps(self):
        """
        Check that all shapefile dependencies are available

        :return: `bool` of check result
        """
        dependencies = ['.shp', '.shx', '.dbf', '.prj']
        # generator expression avoids building an intermediate list
        return all(self.filepath.with_suffix(suffix).exists()
                   for suffix in dependencies)

    # TODO: Remove once upstream data is patched
    @staticmethod
    def clean_consecutive_coordinates(coordinates):
        """
        Temporary fix for issues with upstream data. Removes consecutive
        coordinate points from GeoJSON coordinates

        :param coordinates: list of GeoJSON coordinates
        :return: list of coordinates with consecutive duplicates removed
        """
        # groupby collapses runs of identical points into a single point
        return [[k for k, g in groupby(coordinate)]
                for coordinate in coordinates]

    def deactivate_old_forecasts(self):
        """
        Deactivates previously added forecasts for a specific storm name.

        :return: `bool` of deactivation status
        """
        # painless script flips active to false on every doc matching
        # this storm name that is still marked active
        query = {
            "script": "ctx._source.properties.active=false",
            "query": {
                "bool": {
                    "must": [
                        {"match": {"properties.STORMNAME": self.storm_name}},
                        {"match": {"properties.active": True}},
                    ]
                }
            }
        }

        try:
            self.conn.Elasticsearch.update_by_query(index=INDEX_NAME.format(
                self.storm_variable), body=query)
        except ConflictError:
            # version conflict: refresh the index so update_by_query sees
            # the latest doc versions, then retry once
            LOGGER.warning("Conflict error detected. Refreshing index and "
                           "retrying update by query.")
            self.conn.Elasticsearch.indices.refresh(index=INDEX_NAME.format(
                self.storm_variable))
            self.conn.Elasticsearch.update_by_query(index=INDEX_NAME.format(
                self.storm_variable), body=query)

        return True

    def generate_geojson_features(self):
        """
        Generates and yields a series of storm forecasts,
        one for each feature in <self.filepath>. Observations are returned as
        Elasticsearch bulk API upsert actions, with documents in GeoJSON to
        match the Elasticsearch index mappings.

        :returns: Generator of Elasticsearch actions to upsert the storm
                  forecasts
        """
        driver = ogr.GetDriverByName('ESRI Shapefile')
        filepath = str(self.filepath.resolve())
        data = driver.Open(filepath, 0)
        lyr = data.GetLayer(0)
        file_datetime_str = strftime_rfc3339(self.date_)

        for feature in lyr:
            feature_json = feature.ExportToJson(as_object=True)
            feature_json['properties']['active'] = True
            feature_json['properties'][
                'filename'] = self.filepath.stem
            feature_json['properties'][
                'filedate'] = file_datetime_str  # noqa

            # TODO: Remove once upstream data is patched
            # clean rad consecutive coordinates in geometry (temporary fix)
            if self.storm_variable == 'rad':
                feature_json['geometry'][
                    'coordinates'] = self.clean_consecutive_coordinates(
                        feature_json['geometry']['coordinates'])

            # format pts ADVDATE from 'yymmdd/HHMM' to RFC 3339
            if self.storm_variable == 'pts':
                feature_json['properties']['ADVDATE'] = \
                    strftime_rfc3339(
                        datetime.strptime(
                            feature_json['properties']['ADVDATE'],
                            '%y%m%d/%H%M'
                        )
                    )

            self.items.append(feature_json)

            action = {
                '_id': '{}-{}-{}-{}-{}'.format(self.storm_name,
                                               self.storm_variable,
                                               file_datetime_str,
                                               self.fh,
                                               feature_json['id']),
                '_index': INDEX_NAME.format(self.storm_variable),
                '_op_type': 'update',
                'doc': feature_json,
                'doc_as_upsert': True
            }

            yield action

    def load_data(self, filepath):
        """
        loads data from event to target

        :param filepath: path to the incoming storm forecast shapefile
        :returns: `bool` of status result
        """
        self.filepath = Path(filepath)

        # set class variables from filename
        self.parse_filename()

        LOGGER.debug('Received file {}'.format(self.filepath))

        # check for shapefile dependencies
        if self.check_shapefile_deps():
            # deactivate old forecasts for current storm name
            self.deactivate_old_forecasts()
            # generate geojson features
            package = self.generate_geojson_features()
            self.conn.submit_elastic_package(package, request_size=80000)
            return True
        else:
            LOGGER.debug("All Shapefile dependencies not found. Ignoring "
                         "file...")
            return False
class ForecastPolygonsLoader(BaseLoader):
    """Forecast polygons (land/water) loader"""

    def __init__(self, conn_config=None):
        """
        initializer

        :param conn_config: `dict` of Elasticsearch connection configuration
                            (defaults to an empty configuration)
        """
        BaseLoader.__init__(self)

        # avoid a shared mutable default argument; an empty dict default
        # would be shared across all instantiations
        if conn_config is None:
            conn_config = {}

        self.conn = ElasticsearchConnector(conn_config)
        self.filepath = None
        self.version = None
        self.zone = None
        self.items = []

        # create forecast polygon indices if they don't exist
        for index in INDICES:
            # zone (land/water) is the third underscore-separated token of
            # the index name
            zone = index.split('_')[2]
            SETTINGS['mappings']['properties']['properties'][
                'properties'] = FILE_PROPERTIES[zone]
            self.conn.create(index, SETTINGS)

    def parse_filename(self):
        """
        Parses a meteocode filename in order to get the version, zone
        (land/water) and type (proj, unproj, kmz, etc.)

        :return: `bool` of parse status
        """
        # parse filepath
        pattern = 'MSC_Geography_Pkg_V{version:w}_{zone}_{type}.zip'
        filename = self.filepath.name
        parsed_filename = parse(pattern, filename)

        # set class variables ('V1_2_3' -> version '1.2.3')
        self.version = parsed_filename.named['version'].replace('_', '.')
        self.zone = parsed_filename.named['zone']

        return True

    def generate_geojson_features(self, shapefile_name):
        """
        Generates and yields a series of meteocode geodata features, one
        for each feature in <self.filepath/self.filepath.stem/
        shapefile_name>. Features are returned as Elasticsearch bulk API
        upsert actions, with documents in GeoJSON to match the Elasticsearch
        index mappings.

        :param shapefile_name: name of the shapefile inside the zip archive
        :returns: Generator of Elasticsearch actions to upsert the
                  forecast polygons for given shapefile in zip archive
        """
        filepath = str(
            (self.filepath / self.filepath.stem / shapefile_name).resolve())
        # /vsizip/ lets OGR read the shapefile directly out of the archive
        data = ogr.Open(r'/vsizip/{}'.format(filepath))
        lyr = data.GetLayer()

        for feature in lyr:
            feature_json = feature.ExportToJson(as_object=True,
                                                options=['RFC7946=YES'])
            feature_json['properties']['version'] = self.version
            _id = feature_json['properties']['FEATURE_ID']

            self.items.append(feature_json)

            action = {
                '_id': '{}'.format(_id),
                '_index': INDEX_NAME.format(self.zone.lower(),
                                            shapefile_name.split('_')[2]),
                '_op_type': 'update',
                'doc': feature_json,
                'doc_as_upsert': True
            }

            yield action

    def load_data(self, filepath):
        """
        loads data from event to target

        :param filepath: path to the incoming geography package zip file
        :returns: `bool` of status result
        """
        self.filepath = Path(filepath)

        # set class variables from filename
        self.parse_filename()

        LOGGER.debug('Received file {}'.format(self.filepath))

        for shapefile in SHAPEFILES_TO_LOAD[self.filepath.stem]:
            # generate geojson features
            package = self.generate_geojson_features(shapefile)
            self.conn.submit_elastic_package(package, request_size=80000)
        return True
class LtceLoader(BaseLoader):
    """LTCE data loader"""

    def __init__(self, db_string=None, conn_config=None):
        """
        initializer

        :param db_string: Oracle connection string; indexing from the DB is
                          disabled when None
        :param conn_config: `dict` of Elasticsearch connection configuration
                            (defaults to an empty configuration)
        """
        BaseLoader.__init__(self)

        # avoid a shared mutable default argument; an empty dict default
        # would be shared across all instantiations
        if conn_config is None:
            conn_config = {}

        self.conn = ElasticsearchConnector(conn_config)
        self.db_conn = None

        # setup DB connection
        if db_string is not None:
            try:
                self.db_conn = cx_Oracle.connect(db_string)
                self.cur = self.db_conn.cursor()
            except Exception as err:
                msg = 'Could not connect to Oracle: {}'.format(err)
                LOGGER.critical(msg)
                raise click.ClickException(msg)
        else:
            LOGGER.debug("No DB connection string passed. Indexing disabled.")
            self.db_conn = self.cur = None

        # create one index per LTCE mapping if it doesn't exist
        for item in MAPPINGS:
            SETTINGS['mappings']['properties']['properties'][
                'properties'
            ] = MAPPINGS[item]
            self.conn.create(INDEX_NAME.format(item), SETTINGS)

    def get_stations_info(self, element_name, station_id):
        """
        Queries LTCE station data for a given element name (DAILY MINIMUM
        TEMPERATURE, DAILY MAXIMUM TEMPERATURE, etc.), and virtual station
        ID. Returns the earliest start date of all returned stations and the
        end date, climate identifier, and coordinates of the most recently
        threaded station.

        :param element_name: `str` of element name
        :param station_id: `str` of virtual climate station id
        :return: `dict` of stations information
        """
        query = {
            "query": {
                "bool": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "term": {
                                        "properties.VIRTUAL_CLIMATE_ID.raw": station_id  # noqa
                                    }
                                },
                                {
                                    "term": {
                                        "properties.ELEMENT_NAME_E.raw": element_name  # noqa
                                    }
                                },
                            ]
                        }
                    }
                }
            }
        }

        results = self.conn.Elasticsearch.search(
            body=query,
            index='ltce_stations',
            _source=[
                'properties.CLIMATE_IDENTIFIER',
                'properties.ENG_STN_NAME',
                'properties.FRE_STN_NAME',
                'properties.START_DATE',
                'properties.END_DATE',
                'properties.PROVINCE_CODE',
                'geometry.coordinates',
            ],
        )

        results = [result['_source'] for result in results['hits']['hits']]

        oldest_station = None
        most_recent_station = None

        for index, station in enumerate(results):
            # retrieve station start and end date
            dates = (
                station['properties']['START_DATE'],
                station['properties']['END_DATE'],
            )

            # convert station dates to datetime objects
            (
                station['properties']['START_DATE'],
                station['properties']['END_DATE'],
            ) = (start_date, end_date) = [
                datetime.strptime(date, DATETIME_RFC3339_FMT)
                if date is not None
                else None
                for date in dates
            ]

            # assign first station as oldest and most recent
            if index == 0:
                oldest_station = station
                most_recent_station = station
                continue

            # then compare all remaining stations and replace as necessary
            # NOTE(review): assumes START_DATE is never None (a None
            # start_date would raise TypeError here) — confirm against the
            # ltce_stations data
            if start_date < oldest_station['properties']['START_DATE']:
                oldest_station = station

            # NOTE(review): a None END_DATE on the current most recent
            # station (still-active thread) blocks all replacements — this
            # looks intentional (an open-ended thread is the most recent),
            # but verify
            if most_recent_station['properties']['END_DATE'] is not None and (
                end_date is None
                or end_date > most_recent_station['properties']['END_DATE']
            ):
                most_recent_station = station

        stations_info = {
            'record_begin': strftime_rfc3339(
                oldest_station['properties']['START_DATE']
            ),
            'record_end': strftime_rfc3339(
                most_recent_station['properties']['END_DATE']
            )
            if most_recent_station['properties']['END_DATE']
            else None,
            'climate_identifier': most_recent_station['properties'][
                'CLIMATE_IDENTIFIER'
            ],
            'eng_stn_name':
                most_recent_station['properties']['ENG_STN_NAME'],
            'fre_stn_name':
                most_recent_station['properties']['FRE_STN_NAME'],
            'coords': [
                most_recent_station['geometry']['coordinates'][0],
                most_recent_station['geometry']['coordinates'][1],
            ],
            'province_code': most_recent_station['properties'][
                'PROVINCE_CODE'
            ],
        }

        return stations_info

    def generate_stations(self):
        """
        Queries stations data from the db, and reformats
        data so it can be inserted into Elasticsearch.

        Returns a generator of dictionaries that represent upsert actions
        into Elasticsearch's bulk API.

        :returns: generator of bulk API upsert actions.
        """
        try:
            self.cur.execute(
                (
                    "SELECT ARKEON2DWH.VIRTUAL_STATION_INFO_F_MVW.*,"
                    "ARKEON2DWH.STATION_INFORMATION.ENG_STN_NAME,"
                    "ARKEON2DWH.STATION_INFORMATION.FRE_STN_NAME,"
                    "ARKEON2DWH.WXO_CITY_INFORMATION_MVW.LAT,"
                    "ARKEON2DWH.WXO_CITY_INFORMATION_MVW.LON,"
                    "ARKEON2DWH.WXO_CITY_INFORMATION_MVW.PROVINCECODE "
                    "FROM ARKEON2DWH.VIRTUAL_STATION_INFO_F_MVW "
                    "LEFT JOIN ARKEON2DWH.STATION_INFORMATION "
                    "ON ARKEON2DWH.VIRTUAL_STATION_INFO_F_MVW.STN_ID = "
                    "ARKEON2DWH.STATION_INFORMATION.STN_ID "
                    "LEFT JOIN ARKEON2DWH.WXO_CITY_INFORMATION_MVW "
                    "ON ARKEON2DWH.VIRTUAL_STATION_INFO_F_MVW.WXO_CITY_CODE = "
                    "ARKEON2DWH.WXO_CITY_INFORMATION_MVW.CITYCODE "
                    "WHERE "
                    "ARKEON2DWH.VIRTUAL_STATION_INFO_F_MVW.ELEMENT_NAME_E IN "
                    "('DAILY MINIMUM TEMPERATURE', 'DAILY MAXIMUM TEMPERATURE',"  # noqa
                    "'DAILY TOTAL PRECIPITATION', 'DAILY TOTAL SNOWFALL')"
                )
            )
        except Exception as err:
            # NOTE(review): the error is logged but not re-raised; iteration
            # below then runs against a cursor in an undefined state —
            # deliberate best-effort pattern used by all generators here
            LOGGER.error(
                'Could not fetch records from oracle due to: {}.'.format(
                    str(err)
                )
            )

        for row in self.cur:
            insert_dict = dict(zip([x[0] for x in self.cur.description], row))
            for key in insert_dict:
                if key in ['START_DATE', 'END_DATE']:
                    insert_dict[key] = (
                        strftime_rfc3339(insert_dict[key])
                        if insert_dict[key] is not None
                        else insert_dict[key]
                    )

            es_id = slugify(
                '{}-{}-{}-{}-{}'.format(
                    insert_dict['VIRTUAL_CLIMATE_ID'],
                    insert_dict["ELEMENT_NAME_E"],
                    insert_dict["CLIMATE_IDENTIFIER"],
                    insert_dict["START_DATE"],
                    insert_dict["END_DATE"],
                )
            )

            coords = [
                float(insert_dict['LON']),
                float(insert_dict['LAT']),
            ]

            # rename PROVINCECODE field to PROVINCE_CODE
            insert_dict['PROVINCE_CODE'] = insert_dict['PROVINCECODE']

            # cleanup unwanted fields retained from SQL join
            fields_to_delete = [
                'STN_ID',
                'ENG_PROV_NAME',
                'FRE_PROV_NAME',
                'REGION_CODE',
                'CRITERIA',
                'NOTES',
                'VIRTUAL_STN_INFO_UPDATE_ID',
                'CURRENT_FLAG',
                'LON',
                'LAT',
                'PROVINCECODE',
            ]
            for field in fields_to_delete:
                insert_dict.pop(field)

            # set properties.IDENTIFIER
            insert_dict['IDENTIFIER'] = es_id

            wrapper = {
                'id': es_id,
                'type': 'Feature',
                'properties': insert_dict,
                'geometry': {'type': 'Point', 'coordinates': coords},
            }

            action = {
                '_id': es_id,
                '_index': 'ltce_stations',
                '_op_type': 'update',
                'doc': wrapper,
                'doc_as_upsert': True,
            }

            yield action

    def generate_daily_temp_extremes(self):
        """
        Queries stations data from the db, and reformats
        data so it can be inserted into Elasticsearch.

        Returns a generator of dictionaries that represent upsert actions
        into Elasticsearch's bulk API.

        :returns: generator of bulk API upsert actions.
        """
        try:
            # join record/extreme high/low tables for both max and min
            # temperature on (virtual climate id, month, day)
            self.cur.execute(
                (
                    "SELECT t1.*, t2.*, t3.*, t4.*, t5.*, t6.*, t7.*, t8.* "
                    "FROM ARKEON2DWH.RECORD_HIGH_VIRTUAL_MAX_TEMP t1 "
                    "JOIN ARKEON2DWH.RECORD_LOW_VIRTUAL_MAX_TEMP t2 "
                    "ON t1.VIRTUAL_CLIMATE_ID = t2.VIRTUAL_CLIMATE_ID "
                    "AND t1.LOCAL_MONTH = t2.LOCAL_MONTH "
                    "AND t1.LOCAL_DAY = t2.LOCAL_DAY "
                    "JOIN ARKEON2DWH.RECORD_LOW_VIRTUAL_MIN_TEMP t3 "
                    "ON t1.VIRTUAL_CLIMATE_ID = t3.VIRTUAL_CLIMATE_ID "
                    "AND t1.LOCAL_MONTH = t3.LOCAL_MONTH "
                    "AND t1.LOCAL_DAY = t3.LOCAL_DAY "
                    "JOIN ARKEON2DWH.RECORD_HIGH_VIRTUAL_MIN_TEMP t4 "
                    "ON t1.VIRTUAL_CLIMATE_ID = t4.VIRTUAL_CLIMATE_ID "
                    "AND t1.LOCAL_MONTH = t4.LOCAL_MONTH "
                    "AND t1.LOCAL_DAY = t4.LOCAL_DAY "
                    "JOIN ARKEON2DWH.EXTREME_HIGH_VIRTUAL_MAX_TEMP t5 "
                    "ON t1.VIRTUAL_CLIMATE_ID = t5.VIRTUAL_CLIMATE_ID "
                    "AND t1.LOCAL_MONTH = t5.LOCAL_MONTH "
                    "AND t1.LOCAL_DAY = t5.LOCAL_DAY "
                    "JOIN ARKEON2DWH.EXTREME_LOW_VIRTUAL_MAX_TEMP t6 "
                    "ON t1.VIRTUAL_CLIMATE_ID = t6.VIRTUAL_CLIMATE_ID "
                    "AND t1.LOCAL_MONTH = t6.LOCAL_MONTH "
                    "AND t1.LOCAL_DAY = t6.LOCAL_DAY "
                    "JOIN ARKEON2DWH.EXTREME_HIGH_VIRTUAL_MIN_TEMP t7 "
                    "ON t1.VIRTUAL_CLIMATE_ID = t7.VIRTUAL_CLIMATE_ID "
                    "AND t1.LOCAL_MONTH = t7.LOCAL_MONTH "
                    "AND t1.LOCAL_DAY = t7.LOCAL_DAY "
                    "JOIN ARKEON2DWH.EXTREME_LOW_VIRTUAL_MIN_TEMP t8 "
                    "ON t1.VIRTUAL_CLIMATE_ID = t8.VIRTUAL_CLIMATE_ID "
                    "AND t1.LOCAL_MONTH = t8.LOCAL_MONTH "
                    "AND t1.LOCAL_DAY = t8.LOCAL_DAY "
                )
            )
        except Exception as err:
            LOGGER.error(
                'Could not fetch records from oracle due to: {}.'.format(
                    str(err)
                )
            )

        # dictionary to store stations information once retrieved
        stations_dict = {}

        for row in self.cur:
            insert_dict = dict(zip([x[0] for x in self.cur.description], row))
            for key in insert_dict:
                if key in ['LAST_UPDATED']:
                    insert_dict[key] = (
                        strftime_rfc3339(insert_dict[key])
                        if insert_dict[key] is not None
                        else insert_dict[key]
                    )

            virtual_climate_id = insert_dict['VIRTUAL_CLIMATE_ID']

            es_id = '{}-{}-{}'.format(
                insert_dict['VIRTUAL_CLIMATE_ID'],
                insert_dict["LOCAL_MONTH"],
                insert_dict["LOCAL_DAY"],
            )

            # check if we have station IDs record begin and end. If not
            # retrieve the information and store in stations_dict
            if virtual_climate_id not in stations_dict:
                stations_dict[virtual_climate_id] = {}
                stations_dict[virtual_climate_id][
                    'MIN'
                ] = self.get_stations_info(
                    'DAILY MINIMUM TEMPERATURE', virtual_climate_id
                )
                stations_dict[virtual_climate_id][
                    'MAX'
                ] = self.get_stations_info(
                    'DAILY MAXIMUM TEMPERATURE', virtual_climate_id
                )

            # check if TEMPERATURE MIN/MAX for most recent threaded station
            # have same climate identifier value
            min_climate_identifier = stations_dict[virtual_climate_id]['MIN'][
                'climate_identifier'
            ]
            max_climate_identifier = stations_dict[virtual_climate_id]['MAX'][
                'climate_identifier'
            ]
            if min_climate_identifier == max_climate_identifier:
                insert_dict['CLIMATE_IDENTIFIER'] = stations_dict[
                    virtual_climate_id
                ]['MAX']['climate_identifier']
                insert_dict['ENG_STN_NAME'] = stations_dict[
                    virtual_climate_id
                ]['MAX']['eng_stn_name']
                insert_dict['FRE_STN_NAME'] = stations_dict[
                    virtual_climate_id
                ]['MAX']['fre_stn_name']
                insert_dict['PROVINCE_CODE'] = stations_dict[
                    virtual_climate_id
                ]['MAX']['province_code']
            else:
                # mismatched threads: skip the row entirely
                LOGGER.error(
                    f'Currently threaded station climate identifier value '
                    f'does not match between DAILY MINIMUM TEMPERATURE'
                    f'({min_climate_identifier}) and DAILY MAXIMUM '
                    f'TEMPERATURE({max_climate_identifier}) station threads '
                    f'for virtual climate ID {virtual_climate_id}.'
                )
                continue

            # set new fields
            for level in ['MIN', 'MAX']:
                # set new insert_dict keys
                insert_dict[
                    '{}_TEMP_RECORD_BEGIN'.format(level)
                ] = stations_dict[virtual_climate_id][level]['record_begin']
                insert_dict[
                    '{}_TEMP_RECORD_END'.format(level)
                ] = stations_dict[virtual_climate_id][level]['record_end']

            # cleanup unwanted fields retained from SQL join
            # NOTE(review): ENG_STN_NAME/FRE_STN_NAME/CLIMATE_IDENTIFIER are
            # set above and immediately removed here, so they never reach
            # the index — confirm this is intended
            fields_to_delete = [
                'LOCAL_TIME',
                'VIRTUAL_MEAS_DISPLAY_CODE',
                'ENG_STN_NAME',
                'FRE_STN_NAME',
                'CLIMATE_IDENTIFIER',
            ]
            for field in fields_to_delete:
                insert_dict.pop(field)

            # set properties.IDENTIFIER
            insert_dict['IDENTIFIER'] = es_id

            wrapper = {
                'id': es_id,
                'type': 'Feature',
                'properties': insert_dict,
                'geometry': {
                    'type': 'Point',
                    'coordinates': stations_dict[virtual_climate_id]['MAX'][
                        'coords'
                    ],
                },
            }

            action = {
                '_id': es_id,
                '_index': 'ltce_temp_extremes',
                '_op_type': 'update',
                'doc': wrapper,
                'doc_as_upsert': True,
            }

            yield action

    def generate_daily_precip_extremes(self):
        """
        Queries stations data from the db, and reformats
        data so it can be inserted into Elasticsearch.

        Returns a generator of dictionaries that represent upsert actions
        into Elasticsearch's bulk API.

        :returns: generator of bulk API upsert actions.
        """
        try:
            self.cur.execute(
                (
                    "SELECT t1.*, t2.* "
                    "FROM ARKEON2DWH.RECORD_VIRTUAL_PRECIPITATION t1 "
                    "JOIN ARKEON2DWH.EXTREME_VIRTUAL_PRECIPITATION t2 "
                    "ON t1.VIRTUAL_CLIMATE_ID = t2.VIRTUAL_CLIMATE_ID "
                    "AND t1.LOCAL_MONTH = t2.LOCAL_MONTH "
                    "AND t1.LOCAL_DAY = t2.LOCAL_DAY "
                )
            )
        except Exception as err:
            LOGGER.error(
                'Could not fetch records from oracle due to: {}.'.format(
                    str(err)
                )
            )

        stations_dict = {}

        for row in self.cur:
            insert_dict = dict(zip([x[0] for x in self.cur.description], row))
            for key in insert_dict:
                if key in ['LAST_UPDATED']:
                    insert_dict[key] = (
                        strftime_rfc3339(insert_dict[key])
                        if insert_dict[key] is not None
                        else insert_dict[key]
                    )

            virtual_climate_id = insert_dict['VIRTUAL_CLIMATE_ID']

            es_id = '{}-{}-{}'.format(
                insert_dict['VIRTUAL_CLIMATE_ID'],
                insert_dict["LOCAL_MONTH"],
                insert_dict["LOCAL_DAY"],
            )

            # check if we have station IDs record begin and end if not
            # retrieve
            if virtual_climate_id not in stations_dict:
                stations_dict[virtual_climate_id] = self.get_stations_info(
                    'DAILY TOTAL PRECIPITATION', virtual_climate_id
                )

            insert_dict['RECORD_BEGIN'] = stations_dict[virtual_climate_id][
                'record_begin'
            ]
            insert_dict['RECORD_END'] = stations_dict[virtual_climate_id][
                'record_end'
            ]
            insert_dict['CLIMATE_IDENTIFIER'] = stations_dict[
                virtual_climate_id
            ]['climate_identifier']
            insert_dict['ENG_STN_NAME'] = stations_dict[virtual_climate_id][
                'eng_stn_name'
            ]
            insert_dict['FRE_STN_NAME'] = stations_dict[virtual_climate_id][
                'fre_stn_name'
            ]
            insert_dict['PROVINCE_CODE'] = stations_dict[virtual_climate_id][
                'province_code'
            ]

            # cleanup unwanted fields retained from SQL join
            fields_to_delete = [
                'LOCAL_TIME',
                'VIRTUAL_MEAS_DISPLAY_CODE',
                'ENG_STN_NAME',
                'FRE_STN_NAME',
                'CLIMATE_IDENTIFIER',
                'LAST_UPDATED',
            ]
            for field in fields_to_delete:
                insert_dict.pop(field)

            # set properties.IDENTIFIER
            insert_dict['IDENTIFIER'] = es_id

            wrapper = {
                'id': es_id,
                'type': 'Feature',
                'properties': insert_dict,
                'geometry': {
                    'type': 'Point',
                    'coordinates':
                        stations_dict[virtual_climate_id]['coords'],
                },
            }

            action = {
                '_id': es_id,
                '_index': 'ltce_precip_extremes',
                '_op_type': 'update',
                'doc': wrapper,
                'doc_as_upsert': True,
            }

            yield action

    def generate_daily_snow_extremes(self):
        """
        Queries stations data from the db, and reformats
        data so it can be inserted into Elasticsearch.

        Returns a generator of dictionaries that represent upsert actions
        into Elasticsearch's bulk API.

        :returns: generator of bulk API upsert actions.
        """
        try:
            self.cur.execute(
                (
                    "SELECT t1.*, t2.* "
                    "FROM ARKEON2DWH.RECORD_VIRTUAL_SNOWFALL t1 "
                    "JOIN ARKEON2DWH.EXTREME_VIRTUAL_SNOWFALL t2 "
                    "ON t1.VIRTUAL_CLIMATE_ID = t2.VIRTUAL_CLIMATE_ID "
                    "AND t1.LOCAL_MONTH = t2.LOCAL_MONTH "
                    "AND t1.LOCAL_DAY = t2.LOCAL_DAY "
                )
            )
        except Exception as err:
            LOGGER.error(
                'Could not fetch records from oracle due to: {}.'.format(
                    str(err)
                )
            )

        stations_dict = {}

        for row in self.cur:
            insert_dict = dict(zip([x[0] for x in self.cur.description], row))
            for key in insert_dict:
                if key in ['LAST_UPDATED']:
                    insert_dict[key] = (
                        strftime_rfc3339(insert_dict[key])
                        if insert_dict[key] is not None
                        else insert_dict[key]
                    )

            virtual_climate_id = insert_dict['VIRTUAL_CLIMATE_ID']

            es_id = '{}-{}-{}'.format(
                insert_dict['VIRTUAL_CLIMATE_ID'],
                insert_dict["LOCAL_MONTH"],
                insert_dict["LOCAL_DAY"],
            )

            # check if we have station IDs record begin and end if not
            # retrieve
            if virtual_climate_id not in stations_dict:
                stations_dict[virtual_climate_id] = self.get_stations_info(
                    'DAILY TOTAL SNOWFALL', virtual_climate_id
                )

            insert_dict['RECORD_BEGIN'] = stations_dict[virtual_climate_id][
                'record_begin'
            ]
            insert_dict['RECORD_END'] = stations_dict[virtual_climate_id][
                'record_end'
            ]
            insert_dict['CLIMATE_IDENTIFIER'] = stations_dict[
                virtual_climate_id
            ]['climate_identifier']
            insert_dict['ENG_STN_NAME'] = stations_dict[virtual_climate_id][
                'eng_stn_name'
            ]
            insert_dict['FRE_STN_NAME'] = stations_dict[virtual_climate_id][
                'fre_stn_name'
            ]
            insert_dict['PROVINCE_CODE'] = stations_dict[virtual_climate_id][
                'province_code'
            ]

            # cleanup unwanted fields retained from SQL join
            fields_to_delete = [
                'LOCAL_TIME',
                'VIRTUAL_MEAS_DISPLAY_CODE',
                'ENG_STN_NAME',
                'FRE_STN_NAME',
                'CLIMATE_IDENTIFIER',
                'LAST_UPDATED',
            ]
            for field in fields_to_delete:
                insert_dict.pop(field)

            # set properties.IDENTIFIER
            insert_dict['IDENTIFIER'] = es_id

            wrapper = {
                'id': es_id,
                'type': 'Feature',
                'properties': insert_dict,
                'geometry': {
                    'type': 'Point',
                    'coordinates': stations_dict[virtual_climate_id]['coords'],
                },
            }

            action = {
                '_id': es_id,
                '_index': 'ltce_snow_extremes',
                '_op_type': 'update',
                'doc': wrapper,
                'doc_as_upsert': True,
            }

            yield action