def test_get_history_with_single_match(test_session):
    """A known network/station id resolves to an existing history record,
    so no new Station or History rows are created by the lookup."""
    observation = Row(
        time=datetime.now(),
        val=123,
        variable_name="relative_humidity",
        unit="percent",
        network_name="MoTIe",
        station_id="11091",
        lat=None,
        lon=None,
    )
    matched = get_history(
        test_session,
        observation.network_name,
        observation.station_id,
        observation.lat,
        observation.lon,
    )
    assert matched is not None
    # Fixture counts are unchanged: the lookup matched, it didn't insert.
    assert test_session.query(Station).count() == 6
    assert test_session.query(History).count() == 8
def test_get_history_with_no_matches(test_session):
    """An unknown station causes get_history to create new Station and
    History records (one more of each than the fixtures provide)."""
    # this observation will not match any station in the test session
    unmatched = Row(
        time=datetime.now(),
        val=123,
        variable_name="relative_humidity",
        unit="percent",
        network_name="FLNRO-WMB",
        station_id="666",
        lat=None,
        lon=None,
    )
    result = get_history(
        test_session,
        unmatched.network_name,
        unmatched.station_id,
        unmatched.lat,
        unmatched.lon,
    )
    assert result is not None
    # One new Station and one new History relative to the fixture data.
    assert test_session.query(Station).count() == 7
    assert test_session.query(History).count() == 9
def test_diagnostic(crmp_session, diag, count):
    """Parametrized: `infer` should create `count` test_var* variables,
    depending on whether diagnostic mode is on."""
    now = datetime.now()
    raw_rows = [
        (now, 0, "test_var_a", "degrees", "MoTIe", "noname", 40, -120),
        (now, 0, "test_var_a", "degrees", "MoTIe", "noname", 40, -120),
        (now, 0, "test_var_b", "mm", "MoTIe", "noname", 40, -120),
    ]
    crmp_session.add(Network(name="MoTIe"))
    infer(crmp_session, (Row(*fields) for fields in raw_rows), diag)
    matches = crmp_session.query(Variable).filter(Variable.name.like("test_var%"))
    assert matches.count() == count
def test_get_history_with_multiple_matches_and_location(test_session):
    """When several history records share a station id, the provided
    lat/lon pair disambiguates to exactly one of them."""
    located = Row(
        time=datetime.now(),
        val=123,
        variable_name="relative_humidity",
        unit="percent",
        network_name="EC_raw",
        station_id="1047172",
        lat=49.45,
        lon=-123.7,
    )
    match = get_history(
        test_session,
        located.network_name,
        located.station_id,
        located.lat,
        located.lon,
    )
    assert match.id == 20
def normalize(stream):
    """Yield one Row per (record, variable) pair from a CRD JSON feed."""
    log.info("Starting CRD data normalization")
    pacific = pytz.timezone("Canada/Pacific")
    payload = json.load(stream)
    units = payload["HEADER"]["_units"]
    # Variable names are the unit keys with the "Unit" suffix stripped.
    var_names = [key.replace("Unit", "") for key in units]
    log.debug("Found variables %s", var_names)
    for record in payload["DATA"]:
        # Timezone information isn't provided by CRD, but the
        # observations appear to be in local time. The max time
        # value found in a request is the most recent hour local
        # time. Hopefully assuming this will suffice.
        naive = datetime.strptime(record["DateTimeString"], "%Y%m%d%H%M%S")
        obs_time = pacific.localize(naive).astimezone(pytz.utc)
        for name in var_names:
            value = record[name]
            # CRD uses -9999 and null for missing values. Skip these.
            # See page 2 here: https://tinyurl.com/quczs93
            if value is None or value == -9999:
                continue
            yield Row(
                time=obs_time,
                val=value,
                variable_name=name,
                unit=units[f"{name}Unit"],
                network_name="CRD",
                station_id=record["StationName"],
                lat=None,
                lon=None,
            )
variable = get_variable(test_session, network_name, variable_name) check_val = unit_check(val, unit, variable.unit) assert check_val == expected @pytest.mark.parametrize( ("obs_tuple", "expected_hid", "expected_time", "expeceted_vid", "expected_datum"), [ # use match_station_with_active to match ( Row( time=datetime(2012, 9, 26, 18), val=123, variable_name="precipitation", unit="mm", network_name="EC_raw", station_id="1047172", lat=None, lon=None, ), 21, datetime(2012, 9, 26, 18), 2, 123, ), # use unit_db_check to convert units ( Row( time=datetime(2012, 9, 26, 18), val=10, variable_name="precipitation",
def normalize(file_stream):
    """Normalize a WMB CSV byte stream into Row tuples.

    The first line of the stream names the variables; each subsequent
    line holds a station id, an observation date (local time, 1-24 hour
    clock), and one value per variable. Yields one Row per non-empty,
    numeric value; rows with unparseable dates or values are logged and
    skipped.
    """
    log.info("Starting WMB data normalization")

    def clean_row(row):
        return row.strip().replace('"', "").split(",")

    # set variable names using first row in file stream
    lines = iter(file_stream)
    header = next(lines, None)
    if header is None:
        # empty stream: nothing to normalize
        return
    var_names = clean_row(header.decode("utf-8"))

    # Timezone information isn't provided by WMB, but the
    # observations appear to be in local time. The max time
    # value found in a request is the most recent hour local
    # time. Hopefully assuming this will suffice.
    # (Loop-invariant, so construct the tz object once.)
    tz = pytz.timezone("Canada/Pacific")

    for row in lines:
        row = row.decode("utf-8")
        # assign variable name to value
        data = list(zip(var_names, clean_row(row)))
        # extract station_id and weather_date from list
        _, station_id = data.pop(0)
        _, weather_date = data.pop(0)
        # The date's provided are in 1-24 hour format *roll*; shift to
        # 0-23 and zero-pad so strptime always sees a fixed-width,
        # unambiguous "%Y%m%d%H" string.
        hour = int(weather_date[-2:]) - 1
        weather_date = weather_date[:-2] + f"{hour:02d}"
        try:
            date = datetime.strptime(weather_date, "%Y%m%d%H")
            date = tz.localize(date).astimezone(pytz.utc)
        except ValueError:
            log.error("Unable to convert date", extra={"date": weather_date})
            continue
        for var_name, value in data:
            # skip if value string is empty
            if not value:
                continue
            try:
                value = float(value)
            except ValueError:
                log.error("Unable to convert val to float", extra={"value": value})
                continue
            yield Row(
                time=date,
                val=value,
                variable_name=var_name,
                unit=None,
                network_name="FLNRO-WMB",
                station_id=station_id,
                lat=None,
                lon=None,
            )
def normalize(file_stream):
    """Normalize a WAMR (BC ENV-AQN) CSV byte stream into Row tuples.

    Tolerates the inconsistent column names BC ENV has used over time
    (UNIT/UNITS, EMS_ID/STATION_NAME, REPORTED_VALUE/RAW_VALUE) and
    re-routes a known set of Metro Vancouver stations to the MVan
    network. Yields one Row per usable CSV record; records with missing
    values or unparseable dates/numbers are logged and skipped.
    """
    log.info("Starting WAMR data normalization")
    string_stream = io.StringIO(file_stream.read().decode("utf-8"))
    reader = csv.DictReader(string_stream)

    # All loop-invariant setup is hoisted out of the per-row loop.
    keys_of_interest = (
        "DATE_PST",
        "EMS_ID",
        "STATION_NAME",
        "UNIT",
        "UNITS",
        "PARAMETER",
        "REPORTED_VALUE",
        "RAW_VALUE",
        "LONGITUDE",
        "LATITUDE",
    )

    # Unit spellings to canonicalize. The original comparison was exact
    # equality, so a plain dict lookup replaces the re.sub calls.
    unit_substitutions = {"% RH": "%", "\u00b0C": "celsius", "mb": "millibar"}

    # There is a set of Metro Vancouver's stations that are being
    # delivered to us by WAMR, but it is desired that they are re-
    # associated with the correct network. Attempting this by altering the
    # normalization to the correct station_id and the correct network
    # name. Issue here is that the metrovan variables need to match the ENV-AQN
    # variables. Will work on that in the database.
    # Load the substitution table once, not once per CSV row.
    with resource_stream("crmprtd", "wamr/station_substitutions.yaml") as f:
        station_substitutions = yaml.safe_load(f)

    tz = pytz.timezone("Canada/Pacific")

    for row in reader:
        (
            time,
            ems_id,
            station_name,
            unit,
            units,
            variable_name,
            rep_val,
            raw_val,
            lon,
            lat,
        ) = (row.get(k) for k in keys_of_interest)

        # Circa 2020, BC ENV is presenting inconsistent names for
        # several of their columns (UNIT/UNITS, EMS_ID/STATION_NAME,
        # REPORTED_VALUE/RAW_VALUE/ROUNDED_VALUE. Ensure that we have
        # at least one of these sets.
        unit = get_one_of((unit, units))
        reported_station_id = get_one_of((ems_id, station_name))
        try:
            val = get_one_of((rep_val, raw_val))
        except ValueError:
            # skip over empty values
            continue

        try:
            value = float(val)
        except ValueError:
            log.error("Unable to convert val to float", extra={"value": val})
            continue

        try:
            # Timezone information is not available from the text
            # string provided. However, the date field in WAMR's feed
            # is always titled "DATE_PST" (even during times of
            # DST). There's not really enough information available
            # from the network, so we'll have to assume that this
            # covers it.
            dt = tz.localize(parse(time)).astimezone(pytz.utc)
        except ValueError:
            log.error("Unable to convert date string to datetime", extra={"time": time})
            continue

        unit = unit_substitutions.get(unit, unit)

        if reported_station_id in station_substitutions:
            station_id = station_substitutions[reported_station_id]
            network_name = "MVan"
        else:
            station_id = reported_station_id
            network_name = "ENV-AQN"

        yield Row(
            time=dt,
            val=value,
            variable_name=variable_name,
            unit=unit,
            network_name=network_name,
            station_id=station_id,
            lat=lat,
            lon=lon,
        )
def normalize(file_stream):
    """Normalize a MoTI XML stream into Row tuples.

    Parses the XML, applies the ``transform`` stylesheet, then walks
    each //observation-series element, yielding one Row per numeric
    observation value. Series or observations missing a station id,
    valid time, or parseable value are logged and skipped.
    """
    log.info("Starting MOTI data normalization")
    et = xmlparse(file_stream)
    et = transform(et)
    obs_series = et.xpath("//observation-series")
    for series in obs_series:
        # A series with no child elements has nothing to process.
        if not len(series):
            log.warning("Empty observation series: xpath search " "'//observation-series' return no results")
            continue
        try:
            # The station id is carried on the series' origin element.
            stn_id = series.xpath(
                "./origin/id[@type='client']")[0].text.strip()
        except IndexError as e:
            log.error(
                "Could not detect the station id: xpath search "
                "'//observation-series/origin/id[@type='client']' "
                "return no results",
                extra={"exception": e},
            )
            continue
        members = series.xpath("./observation", namespaces=ns)
        for member in members:
            # get time and convert to datetime
            time = member.get("valid-time")
            if not time:
                log.warning("Could not find a valid-time attribute for this " "observation")
                continue
            try:
                # MoTI gives us an ISO formatted time string with
                # timezone info attached so it should be sufficient to
                # simply parse it and display it as UTC.
                date = dateparse(time).astimezone(pytz.utc)
            except ValueError as e:
                log.warning("Unable to convert value to datetime", extra={"time": time})
                continue
            for obs in member.iterchildren():
                variable_name = obs.get("type")
                # Child elements without a 'type' attribute are not
                # observations; skip them silently.
                if variable_name is None:
                    continue
                try:
                    value_element = obs.xpath("./value")[0]
                except IndexError as e:
                    log.warning(
                        "Could not find the actual value for "
                        "observation. xpath search './value' "
                        "returned no results",
                        extra={"variable_name": variable_name},
                    )
                    continue
                try:
                    value = float(value_element.text)
                except ValueError:
                    log.error(
                        "Could not convert value to a number. "
                        "Skipping this observation.",
                        extra={"value": value_element},
                    )
                    continue
                yield Row(
                    time=date,
                    val=value,
                    variable_name=variable_name,
                    unit=value_element.get("units"),
                    network_name="MoTIe",
                    station_id=stn_id,
                    lat=None,
                    lon=None,
                )
def normalize_xml(
    file_stream,
    network_name,
    station_id_attr="climate_station_number",
    station_id_xform=identity,
):
    """Normalize an Environment Canada style OM-XML stream into Row tuples.

    For each om:member element, yields one Row per observed variable.
    ``station_id_attr`` names the identification element that carries
    the station id and ``station_id_xform`` post-processes its value.
    Members missing station metadata, or observations with unparseable
    values or times, are logged and skipped.
    """
    et = parse_xml(file_stream)
    members = et.xpath("//om:member", namespaces=ns)
    log.info("Starting %s data normalization", network_name)
    for member in members:
        om = OmMember(member)
        vars = om.observed_vars()
        for var in vars:
            try:
                # Locate the result element holding this variable's value.
                ele = om.member.xpath(
                    "./om:Observation/om:result//"
                    "{}[@name='{}']".format(no_ns_element("element"), var),
                    namespaces=ns,
                )[0]
                val = ele.get("value")
                # Ignore missing values. We don't record them.
                if val == "MSNG":
                    log.debug("Ignoring missing obs with value 'MSNG'")
                    continue
                val = float(val)
            # This shouldn't ever be empty based on our xpath for selecting
            # elements, however it could be non-numeric and
            # still be valid XML
            except ValueError as e:
                log.error("Unable to convert value", extra={"val": (ele.get("value"))})
                continue
            try:
                log.debug("Finding Station attributes")
                station_id = member.xpath(
                    ".//{}/{}[@name='{}']".format(
                        no_ns_element("identification-elements"),
                        no_ns_element("element"),
                        station_id_attr,
                    ),
                    namespaces=ns,
                )[0].get("value")
                station_id = station_id_xform(station_id)
                lat, lon = map(
                    float,
                    member.xpath(".//gml:pos", namespaces=ns)[0].text.split())
                obs_time = member.xpath(
                    "./om:Observation/om:samplingTime//gml:timePosition",
                    namespaces=ns)[0].text
                log.debug(
                    "Found station info",
                    extra={
                        "station_id": station_id,
                        "lon": lon,
                        "lat": lat,
                        "time": obs_time,
                    },
                )
            # An IndexError here means that the member has no station_name or
            # climate_station_number (or identification-elements), lat/lon,
            # or obs_time in which case we don't need to process this item
            except IndexError:
                log.warning("This member does not appear to be a station")
                continue
            try:
                date = dateparse(obs_time).astimezone(pytz.utc)
            except ValueError as e:
                log.error("Unable to parse date", extra={"exception": e})
                continue
            yield Row(
                time=date,
                val=val,
                variable_name=var,
                unit=om.member_unit(var),
                network_name=network_name,
                station_id=station_id,
                lat=lat,
                lon=lon,
            )