def main():
  # args and usage
  parser = argparse.ArgumentParser()
  parser.add_argument('apikey', action='store', type=str, default=None, help='API key')
  parser.add_argument('startweek', action='store', type=int, default=None, help='first week (ex: 201440)')
  parser.add_argument('endweek', action='store', type=int, default=None, help='last week (ex: 201520)')
  parser.add_argument('location', action='store', type=str, default=None, help='location (ex: US)')
  parser.add_argument('term', action='store', type=str, default=None, help='term/query/topic (ex: /m/0cycc)')
  args = parser.parse_args()

  # get the data
  ght = GHT(args.apikey)
  result = ght.get_data(args.startweek, args.endweek, args.location, args.term)
  values = result['values']

  # sanity check
  expected_weeks = result['num_weeks']
  received_weeks = len([v for v in values if v is not None and isinstance(v, float) and v >= 0])
  if expected_weeks != received_weeks:
    raise Exception('expected %d weeks, received %d' % (expected_weeks, received_weeks))

  # results
  epiweeks = [ew for ew in flu.range_epiweeks(args.startweek, args.endweek, inclusive=True)]
  for (epiweek, value) in zip(epiweeks, values):
    print('%6d: %.3f' % (epiweek, value))
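# A hypothetical invocation of the CLI above (the script name and API key are
# placeholders; the argument values come from the help strings):
#
#   python3 ght_update.py MY_API_KEY 201440 201520 US /m/0cycc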
def update(self, sensors, first_week, last_week): """ Compute sensor readings and store them in the database. """ # most recent issue if last_week is None: last_issue = get_most_recent_issue(self.epidata) last_week = flu.add_epiweeks(last_issue, +1) # connect with self.database as database: # update each sensor for (name, loc) in sensors: # update each location for location in get_location_list(loc): # timing ew1 = first_week if ew1 is None: ew1 = database.get_most_recent_epiweek(name, location) if ew1 is None: # If an existing sensor reading wasn't found in the database and # no start week was given, just assume that readings should start # at 2014w01. ew1 = 201401 print('%s-%s not found, starting at %d' % (name, location, ew1)) args = (name, location, ew1, last_week) print('Updating %s-%s from %d to %d.' % args) for test_week in flu.range_epiweeks(ew1, last_week, inclusive=True): self.update_single(database, test_week, name, location)
def _get_partial_trajectory(self, epiweek, valid=True):
  y, w = EW.split_epiweek(epiweek)
  if w < 30:
    y -= 1
  ew1 = EW.join_epiweek(y, 30)
  ew2 = epiweek
  limit = EW.add_epiweeks(ew2, -5)
  weeks = Epidata.range(ew1, ew2)
  stable = Epidata.check(Epidata.fluview(self.region, weeks))
  try:
    unstable = Epidata.check(Epidata.fluview(self.region, weeks, issues=ew2))
  except Exception:
    unstable = []
  wili = {}
  for row in stable:
    ew, value = row['epiweek'], row['wili']
    if not valid or ew < limit:
      wili[ew] = value
  for row in unstable:
    ew, value = row['epiweek'], row['wili']
    wili[ew] = value
  curve = []
  for ew in EW.range_epiweeks(ew1, ew2, inclusive=True):
    if ew not in wili:
      if valid:
        t = 'unstable'
      else:
        t = 'any'
      raise Exception('wILI (%s) not available for week %d' % (t, ew))
    curve.append(wili[ew])
  n1 = EW.delta_epiweeks(ew1, ew2) + 1
  n2 = len(curve)
  if n1 != n2:
    raise Exception('missing data (expected %d, found %d)' % (n1, n2))
  return curve
def update(ew1, ew2, test_mode=False):
  # init
  si = StateInfo()
  sql = '''
    INSERT INTO `state_ili_imputed` (`epiweek`, `state`, `ili`)
    VALUES (%s, %s, %s)
    ON DUPLICATE KEY UPDATE `ili` = %s
  '''

  # connect
  u, p = secrets.db.epi
  cnx = mysql.connector.connect(user=u, password=p, database='epidata')
  cur = cnx.cursor()

  # get state ILI on each week
  for ew in flu.range_epiweeks(ew1, ew2, inclusive=True):
    print('epiweek:', ew)
    result = si.get_ili(ew)
    for state in si.sta:
      ili = result[state]
      if not (0 <= ili < 25):
        raise Exception('ILI for %s is %+.3f?' % (state, ili))
      print(' %s %.3f' % (state, ili))

      # upload
      if not test_mode:
        args = (ew, state, ili, ili)
        cur.execute(sql, args)

  # disconnect
  cur.close()
  cnx.commit()
  cnx.close()
def update(sensors, first_week=None, last_week=None, valid=False, test_mode=False):
  # most recent issue
  last_issue = get_most_recent_issue()

  # location information
  loc_info = StateInfo()

  # connect
  u, p = secrets.db.epi
  cnx = mysql.connector.connect(user=u, password=p, database='epidata')
  cur = cnx.cursor()

  # update each sensor
  for (name, loc) in sensors:
    if loc == 'hhs':
      locations = loc_info.hhs
    elif loc == 'cen':
      locations = loc_info.cen
    elif loc == 'state' or loc == 'sta':
      locations = loc_info.sta
    else:
      locations = [loc]

    # update each location
    print(locations)
    for location in locations:

      # timing
      ew1, ew2 = first_week, last_week
      if ew1 is None:
        ew1 = get_last_update(cur, name, location)
      if ew2 is None:
        ew2 = flu.add_epiweeks(last_issue, +1)
      print('Updating %s-%s from %d to %d.' % (name, location, ew1, ew2))
      for test_week in flu.range_epiweeks(ew1, ew2, inclusive=True):
        train_week = flu.add_epiweeks(test_week, -1)
        try:
          value = {
            'gft': get_gft,
            'ght': get_ght,
            'ghtj': get_ghtj,
            'twtr': get_twtr,
            'wiki': get_wiki,
            'cdc': get_cdc,
            'epic': get_epic,
            'sar3': get_sar3,
            'arch': get_arch,
            'quid': get_quid,
          }[name](location, train_week, valid)
          print(' %4s %5s %d -> %.3f' % (name, location, test_week, value))
          # upload
          store_value(cur, name, location, test_week, value)
        except Exception as ex:
          print(' failed: %4s %5s %d' % (name, location, test_week), ex)
          #raise ex
        sys.stdout.flush()

  # disconnect
  cur.close()
  if not test_mode:
    cnx.commit()
  cnx.close()
def update(self, first_week, last_week):
  """Nowcast the given range of weeks and save the result to the database."""

  # update the week range if needed
  first_week, last_week = self.get_update_range(first_week, last_week)
  print('nowcasting %d--%d' % (first_week, last_week))

  # prefetch bulk data
  self.data_source.prefetch(last_week)

  # compute the nowcast(s)
  weeks = list(range_epiweeks(first_week, last_week, inclusive=True))
  nowcasts = Nowcast(self.data_source, DatasetnameLocationMapper).batch_nowcast(weeks)

  # save to database
  with self.database as db:

    # save each nowcast
    for week, nowcast in zip(weeks, nowcasts):
      for location, value, stdev in nowcast:
        db.insert(self.target, week, location, float(value), float(stdev))

    # update the timestamp
    db.set_last_update_time(self.target)
def __init__(self, region, target, use_weekly=True):
  self.region = region
  self.target = target
  self.stts = 0
  weeks = Epidata.range(201401, 202330)
  rx = Epidata.check(Epidata.paho_dengue(self.region, weeks))
  self.data = {}
  self.valid = {}
  self.ew2i, self.i2ew = {}, {}
  for ew in EW.range_epiweeks(weeks['from'], weeks['to'], inclusive=True):
    # if 200916 <= ew <= 201015:
    #   continue
    i = len(self.ew2i)
    self.ew2i[ew] = i
    self.i2ew[i] = ew
  epiweeks = list(map(lambda elt: elt['epiweek'], rx))
  values = list(map(lambda elt: elt[self.target], rx))
  data = {elt['epiweek']: elt[self.target] for elt in rx}
  w_data = cum_to_week(data)
  for i in range(len(rx)):
    ew, observation = epiweeks[i], w_data[epiweeks[i]]
    if ew not in self.ew2i:
      continue
    i = self.ew2i[ew]
    if i not in self.data:
      self.data[i] = {}
      self.valid[i] = {'stable': False}
    lag = 'stable'
    self.data[i][lag] = observation
    self.valid[i][lag] = True
  self.weeks = sorted(list(self.data.keys()))
  self.dds = DengueDataSource.new_instance(target)
def __init__(self, region, target):
  self.region = region
  self.target = target
  weeks = Epidata.range(199301, 202330)
  auth = secrets.api.datasetname_targets
  rx = mutate_rows_as_if_lagged(
      Epidata.check(
          Epidata.datasetname_targets(auth, self.target, self.region, weeks)),
      1000000)
  self.data = {}
  self.valid = {}
  self.ew2i, self.i2ew = {}, {}
  for ew in EW.range_epiweeks(weeks['from'], weeks['to'], inclusive=True):
    # if 200916 <= ew <= 201015:
    #   continue
    i = len(self.ew2i)
    self.ew2i[ew] = i
    self.i2ew[i] = ew
  for row in rx:
    ew, observation, lag = row['epiweek'], row['value'], row['lag']
    if ew not in self.ew2i:
      continue
    i = self.ew2i[ew]
    if i not in self.data:
      self.data[i] = {}
      self.valid[i] = {'stable': False}
    lag = 'stable'
    self.data[i][lag] = observation
    self.valid[i][lag] = True
  self.weeks = sorted(list(self.data.keys()))
  for i in self.weeks:
    if 'stable' not in self.data[i]:
      continue
def get_kcdc_data():
  issue = EpiDate.today().get_ew()
  last_season = issue // 100 + (1 if issue % 100 > 35 else 0)
  url = 'http://www.cdc.go.kr/npt/biz/npp/iss/influenzaListAjax.do'
  params = {
    'icdNm': 'influenza',
    'startYear': '2004',  # Started in 2004
    'endYear': str(last_season)
  }
  response = requests.post(url, params)
  datas = response.json()
  data = datas['data']
  ews = []
  ilis = []
  ew1 = 200436
  for year in range(2004, last_season):
    year_data = data[year - 2004]
    if year > 2004:
      ew1 = ews[-1] + 1
    ili_yr = year_data["VALUE"].split('`')
    ili_yr = [float(f) for f in ili_yr if f != '']
    ew2 = add_epiweeks(ew1, len(ili_yr))
    new_ews = list(range_epiweeks(ew1, ew2))
    for i in range(len(new_ews)):
      j = float(ili_yr[i])
      ilis.append(j)
      ews.append(new_ews[i])
  return ews, ilis
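# A quick check (hypothetical issues) of the season arithmetic used above:
# weeks after week 35 count toward the following season's end year. The
# helper name is illustrative only.
def last_season_of(issue):
  return issue // 100 + (1 if issue % 100 > 35 else 0)

assert last_season_of(202410) == 2024  # week 10 -> season ending this year
assert last_season_of(202440) == 2025  # week 40 -> season ending next year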
def test_update(self):
  """Compute and store a nowcast."""
  database = MagicMock()
  database.__enter__.return_value = database
  database.__exit__.return_value = None
  data_source = MagicMock(
      get_truth_locations=lambda *a: ['nat', 'vi'],
      get_sensor_locations=lambda *a: ['nat', 'vi'],
      get_missing_locations=lambda *a: (),
      get_sensors=lambda *a: ['epic', 'sar3'],
      get_most_recent_issue=lambda *a: 201813,
      get_weeks=lambda *a: list(range_epiweeks(201713, 201814)),
      get_truth_value=lambda *a: random.random(),
      get_sensor_value=lambda *a: random.random(),
      prefetch=lambda *a: None)

  NowcastUpdate(database, data_source).update(201812, 201813)

  self.assertEqual(database.set_last_update_time.call_count, 1)
  self.assertEqual(database.insert.call_count, 4)
  epiweek_location_pairs = set()
  for args, kwargs in database.insert.call_args_list:
    epiweek_location_pairs.add(args[:2])
  self.assertIn((201812, 'nat'), epiweek_location_pairs)
  self.assertIn((201813, 'nat'), epiweek_location_pairs)
  self.assertIn((201812, 'vi'), epiweek_location_pairs)
  self.assertIn((201813, 'vi'), epiweek_location_pairs)
def get_weeks(self):
  """Return a list of weeks on which truth and sensors are both available."""
  latest_week = EpiDate.today().get_ew()
  latest_week = add_epiweeks(latest_week, -1)
  week_range = range_epiweeks(self.FIRST_DATA_EPIWEEK, latest_week, inclusive=True)
  return list(week_range)
def extract(first_week=None, last_week=None, test_mode=False):
  # page title templates
  pages = [
    '%What You Should Know for the % Influenza Season%',
    '%What To Do If You Get Sick%',
    '%Flu Symptoms & Severity%',
    '%How Flu Spreads%',
    '%What You Should Know About Flu Antiviral Drugs%',
    '%Weekly US Map%',
    '%Basics%',
    '%Flu Activity & Surveillance%',
  ]

  # location information
  states = sorted(cdc_upload.STATES.values())

  # connect
  u, p = secrets.db.epi
  cnx = mysql.connector.connect(user=u, password=p, database='epidata')
  cur = cnx.cursor()

  # weeks to update
  if first_week is None:
    cur.execute('SELECT max(`epiweek`) FROM `cdc_extract`')
    for (first_week,) in cur:
      pass
  if last_week is None:
    cur.execute('SELECT max(`epiweek`) FROM `cdc_meta`')
    for (last_week,) in cur:
      pass
  print('extracting %d--%d' % (first_week, last_week))

  # update each epiweek
  for epiweek in flu.range_epiweeks(first_week, last_week, inclusive=True):
    # update each state
    for state in states:
      try:
        num1 = get_num_hits(cur, epiweek, state, pages[0])
        num2 = get_num_hits(cur, epiweek, state, pages[1])
        num3 = get_num_hits(cur, epiweek, state, pages[2])
        num4 = get_num_hits(cur, epiweek, state, pages[3])
        num5 = get_num_hits(cur, epiweek, state, pages[4])
        num6 = get_num_hits(cur, epiweek, state, pages[5])
        num7 = get_num_hits(cur, epiweek, state, pages[6])
        num8 = get_num_hits(cur, epiweek, state, pages[7])
        total = get_total_hits(cur, epiweek, state)
        store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total)
        print(' %d-%s: %d %d %d %d %d %d %d %d (%d)' % (epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total))
      except Exception as ex:
        print(' %d-%s: failed' % (epiweek, state), ex)
        #raise ex
      sys.stdout.flush()

  # disconnect
  cur.close()
  if not test_mode:
    cnx.commit()
  cnx.close()
def fetch(weeks):
  # Impute missing weeks with 0%
  # This is actually correct because twitter does not store rows with `num` =
  # 0. So weeks with 0 `num` (and `percent`) are missing from the response.
  res = Epidata.twitter(secrets.api.twitter, location, epiweeks=weeks)
  if 'epidata' in res:
    epiweeks = set([r['epiweek'] for r in res['epidata']])
    first, last = 201149, weeks['to']
    for ew in flu.range_epiweeks(first, last, inclusive=True):
      if ew not in epiweeks:
        res['epidata'].append({'epiweek': ew, 'percent': 0.})
  return res
def get_dengue_data(first_week, last_week):
  # Check week order
  if first_week > last_week:
    first_week, last_week = last_week, first_week

  # Bounds check
  if first_week < 200301 or last_week < 200301:
    raise Exception('week out of range')

  # Initialize data by week and location (zeroes are not reported)
  data = {}
  for week in range_epiweeks(first_week, add_epiweeks(last_week, 1)):
    data[week] = {}
    for location in NIDSS.LOCATION_TO_REGION.keys():
      data[week][location] = 0

  # Download CSV
  response = requests.get(NIDSS.DENGUE_URL)
  if response.status_code != 200:
    raise Exception('export Dengue failed [%d]' % response.status_code)
  csv = response.content.decode('big5-tw')

  # Parse the data
  lines = [l.strip() for l in csv.split('\n')[1:] if l.strip() != '']
  for line in lines:
    fields = line.split(',')
    location_b64 = base64.b64encode(fields[3].encode('utf-8'))
    location = NIDSS._TRANSLATED[location_b64]
    region = NIDSS.LOCATION_TO_REGION[location]
    imported_b64 = base64.b64encode(fields[6].encode('utf-8'))
    imported = imported_b64 == b'5piv'
    sex = fields[5]
    age = fields[7]
    count = int(fields[8])
    year = int(fields[1])
    week = int(fields[2])

    # Week 53 was reported each year in 2003-2007
    if year < 2008 and year != 2003 and week > 52:
      week = 52

    # Epiweek system change in 2009
    # See also: http://research.undefinedx.com/forum/index.php?topic=300.0
    if year == 2009:
      week -= 1
      if week == 0:
        year, week = 2008, 53

    epiweek = year * 100 + week
    if epiweek < first_week or epiweek > last_week:
      # Outside of the requested range
      continue
    if epiweek not in data or location not in data[epiweek]:
      # Not a valid epiweek
      raise Exception('data missing %d-%s' % (epiweek, location))

    # Add the counts to the location on this epiweek
    data[epiweek][location] += count

  # Return results indexed by week and location
  return data
def _train(self, region):
  if region in self.bf_var:
    # already trained
    return
  if len(region) == 2:
    # TODO: this is a hack for state ILI
    # assume backfill of region 4
    print('FIXME: setting backfill for %s as hhs4' % region)
    self.bf_var[region] = self.bf_var['hhs4']
    self.emp_mean[region] = self.emp_mean['hhs4']
    self.emp_var[region] = self.emp_var['hhs4']
    self.emp_curves[region] = self.emp_curves['hhs4']
    return
  stable = self._get_stable(region)
  start_weeks = [flu.get_season(ew)[0] for ew in stable.keys()]
  curves = []
  seasons = set(
      [flu.split_epiweek(ew)[0] for ew in start_weeks if ew is not None])
  for s in seasons:
    ew1 = flu.join_epiweek(s + 0, 40)
    if self.forecast_type == ForecastType.WILI:
      ew2 = flu.add_epiweeks(ew1, 37)
    else:
      ew2 = flu.add_epiweeks(ew1, 29)
    # print("stable: ", stable)
    # print("range_epiweeks: ", [i for i in flu.range_epiweeks(ew1, ew2)])
    curve = [stable[ew] for ew in flu.range_epiweeks(ew1, ew2)]
    curves.append(curve)
  self.emp_mean[region] = np.mean(curves, axis=0)
  self.emp_var[region] = np.var(curves, axis=0, ddof=1)
  self.emp_curves[region] = curves
  if self.backfill_weeks is None:
    self.bf_var[region] = [0]
  else:
    self.bf_var[region] = []
    for lag in range(self.backfill_weeks):
      unstable = self._get_unstable(region, lag)
      changes = [
          stable[ew] - unstable[ew] for ew in stable.keys() & unstable.keys()
      ]
      if len(changes) < 2:
        raise Exception('not enough data')
      self.bf_var[region].append(np.var(changes, ddof=1))
  print(' %5s: %s' %
        (region, ' '.join(['%.3f' % (b**0.5) for b in self.bf_var[region]])))
def prefetch(self, epiweek):
  """
  Fetch all data in all locations up to the given epiweek.

  Requests are batched. This is significantly more efficient (and faster)
  than querying each sensor/location/epiweek data point individually.
  """

  def extract(response):
    if response['result'] == -2:
      return []
    return self.epidata.check(response)

  weeks = Epidata.range(self.FIRST_DATA_EPIWEEK, epiweek)
  sensor_locations = set(self.get_sensor_locations())

  # loop over locations to avoid hitting the limit of ~3.5k rows
  for loc in self.get_truth_locations():
    print('fetching %s...' % loc)

    # default to None to prevent cache misses on missing values
    for week in range_epiweeks(self.FIRST_DATA_EPIWEEK, epiweek, inclusive=True):
      for name in ['datasetname_targets'] + self.get_sensors():
        self.add_to_cache(name, self.target, loc, week, None)

    # ground truth
    auth = secrets.api.datasetname_targets
    datasetnameData = self.epidata.check(
        self.epidata.datasetname_targets(auth, self.target, loc, weeks))
    for row in datasetnameData:
      self.add_to_cache('datasetname_targets', self.target, loc, row['epiweek'], row['value'])

    # sensor readings
    if loc not in sensor_locations:
      # skip withheld locations (i.e. a retrospective experiment)
      continue
    for sen in self.get_sensors():
      response = self.epidata.datasetname_sensors(
          secrets.api.datasetname_sensors, self.target, sen, loc, weeks)
      for row in extract(response):
        self.add_to_cache(sen, self.target, loc, row['epiweek'], row['value'])
def get_heatmap_data(self):
  w0s, w1s = [], []
  for sensor in FluDataSource.SENSORS:
    x = self.get_sensor(sensor, 'nat')
    w0s.append(min(x))
    w1s.append(max(x))
  w0, w1 = min(w0s), max(w1s)
  weeks = list(Epiweek.range_epiweeks(w0, w1, inclusive=True))
  data = np.ones((len(FluDataSource.SENSORS), len(weeks))) * -1
  for i, sensor in enumerate(FluDataSource.SENSORS):
    x = self.get_sensor(sensor, 'nat')
    for j, ew in enumerate(weeks):
      if ew in x:
        data[i, j] = x[ew]
  return data, FluDataSource.SENSORS, weeks
def update(ew1, ew2, test_mode=False, epidata_cache=None):
  # database setup
  u, p = secrets.db.epi
  cnx = mysql.connector.connect(user=u, password=p, database='epidata')
  cur = cnx.cursor()
  sql = """
    INSERT INTO `nowcasts` (`epiweek`, `location`, `value`, `std`)
    VALUES (%s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE value = %s, std = %s
  """

  for ew in flu.range_epiweeks(ew1, ew2, inclusive=True):
    try:
      print(ew)
      locations, values, stds = nowcast(ew, epidata_cache)
      print(' ', locations[0], values[0], stds[0])
      for (l, v, s) in zip(locations, values, stds):
        cur.execute(sql, (ew, l, v, s, v, s))
    except Exception as ex:
      print('failed: ', ew, ex)
      #raise ex
    sys.stdout.flush()

  # the Ugliest Hack Ever Written lies below. turn back now, cannot be unseen.
  # please fix me
  # store the unix timestamp in a meta row representing the last update time
  # the key to this row is `epiweek`=0, `location`='updated'
  # the timestamp is stored across the `value` and `std` fields
  # these are 32-bit floats, so precision is limited (hence, using both fields)
  t = round(time.time())
  a, b = t // 100000, t % 100000
  cur.execute(sql, (0, 'updated', a, b, a, b))
  # /hack

  # database cleanup
  cur.close()
  if test_mode:
    print('test mode - nowcasts not saved')
  else:
    cnx.commit()
  cnx.close()
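# A minimal sketch (not part of the original module) showing how a reader of
# the `nowcasts` table could reassemble the timestamp that the hack above
# splits across the `value` and `std` fields; the helper name is hypothetical.
def decode_last_update_time(value, std):
  # `value` holds t // 100000 and `std` holds t % 100000, so the original
  # unix timestamp is recovered exactly despite the 32-bit float storage.
  return int(value) * 100000 + int(std)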
def __init__(self, region, target):
  self.region = region
  self.target = target
  weeks = Epidata.range(199301, 202330)
  auth = secrets.api.datasetname_targets
  # r0 = Epidata.check(Epidata.fluview(self.region, weeks, lag=0, auth=auth))
  # r1 = Epidata.check(Epidata.fluview(self.region, weeks, lag=1, auth=auth))
  # r2 = Epidata.check(Epidata.fluview(self.region, weeks, lag=2, auth=auth))
  # rx = Epidata.check(Epidata.fluview(self.region, weeks, auth=auth))
  r0 = mutate_rows_as_if_lagged(
      Epidata.check(
          Epidata.datasetname_targets(auth, self.target, self.region, weeks)), 0)
  r1 = mutate_rows_as_if_lagged(
      Epidata.check(
          Epidata.datasetname_targets(auth, self.target, self.region, weeks)), 1)
  r2 = mutate_rows_as_if_lagged(
      Epidata.check(
          Epidata.datasetname_targets(auth, self.target, self.region, weeks)), 2)
  rx = mutate_rows_as_if_lagged(
      Epidata.check(
          Epidata.datasetname_targets(auth, self.target, self.region, weeks)), 1000000)
  self.data = {}
  self.valid = {}
  self.ew2i, self.i2ew = {}, {}
  for ew in EW.range_epiweeks(weeks['from'], weeks['to'], inclusive=True):
    # if 200916 <= ew <= 201015:
    #   continue
    i = len(self.ew2i)
    self.ew2i[ew] = i
    self.i2ew[i] = ew
  for row in r0 + r1 + r2 + rx:
    ew, observation, lag = row['epiweek'], row['value'], row['lag']
    if ew not in self.ew2i:
      continue
    i = self.ew2i[ew]
    if i not in self.data:
      self.data[i] = {}
      self.valid[i] = {0: False, 1: False, 2: False, 'stable': False}
    if not (0 <= lag <= 2):
      lag = 'stable'
    self.data[i][lag] = observation
    self.valid[i][lag] = True
  self.weeks = sorted(list(self.data.keys()))
  for i in self.weeks:
    if 'stable' not in self.data[i]:
      continue
    for lag in range(3):
      if lag not in self.data[i]:
        self.data[i][lag] = self.data[i]['stable']
def __init__(self, region):
  self.region = region
  weeks = Epidata.range(200330, 202330)
  auth = secrets.api.fluview
  r0 = Epidata.check(
      Epidata.fluview(self.region, weeks, lag=0, auth=auth))
  r1 = Epidata.check(
      Epidata.fluview(self.region, weeks, lag=1, auth=auth))
  r2 = Epidata.check(
      Epidata.fluview(self.region, weeks, lag=2, auth=auth))
  rx = Epidata.check(Epidata.fluview(self.region, weeks, auth=auth))
  self.data = {}
  self.valid = {}
  self.ew2i, self.i2ew = {}, {}
  for ew in EW.range_epiweeks(weeks['from'], weeks['to'], inclusive=True):
    if 200916 <= ew <= 201015:
      continue
    i = len(self.ew2i)
    self.ew2i[ew] = i
    self.i2ew[i] = ew
  for row in r0 + r1 + r2 + rx:
    ew, wili, lag = row['epiweek'], row['wili'], row['lag']
    if ew not in self.ew2i:
      continue
    i = self.ew2i[ew]
    if i not in self.data:
      self.data[i] = {}
      self.valid[i] = {0: False, 1: False, 2: False, 'stable': False}
    if not (0 <= lag <= 2):
      lag = 'stable'
    self.data[i][lag] = wili
    self.valid[i][lag] = True
  self.weeks = sorted(list(self.data.keys()))
  for i in self.weeks:
    if 'stable' not in self.data[i]:
      continue
    for lag in range(3):
      if lag not in self.data[i]:
        self.data[i][lag] = self.data[i]['stable']
def update(self, sensors, first_week, last_week):
  """
  Compute sensor readings and store them in the database.
  """

  # most recent issue
  if last_week is None:
    # last_issue = get_most_recent_issue(self.epidata)
    # last_week = flu.add_epiweeks(last_issue, +1)
    raise Exception(
        "last_week must be provided for now --- todo select based on current "
        "time (rather than on the ground truth data set since the ground "
        "truth here is currently static, not streaming)")

  # connect
  with self.database as database:

    # update each sensor
    for (name, loc) in sensors:

      # update each location
      for location in get_location_list(loc):

        # timing
        ew1 = first_week
        if ew1 is None:
          ew1 = database.get_most_recent_epiweek(name, location)
          if ew1 is None:
            # If an existing sensor reading wasn't found in the database and
            # no start week was given, just assume that readings should start
            # at 2010w40.
            ew1 = 201040
            print('%s-%s not found, starting at %d' % (name, location, ew1))

        args = (name, location, ew1, last_week)
        print('Updating %s-%s from %d to %d.' % args)
        for test_week in flu.range_epiweeks(ew1, last_week, inclusive=True):
          self.update_single(database, test_week, name, location)
def _forecast(self, region, epiweek):
  ew1 = flu.join_epiweek(self.test_season + 0, 40)
  ew2 = flu.join_epiweek(self.test_season + 1, 24)
  num_weeks = flu.delta_epiweeks(ew1, ew2)
  print('fetching past data until week %d' % (epiweek))
  observed = self._get_current(region, epiweek, self.forecast_type)
  mean, var = self.emp_mean[region].copy(), self.emp_var[region].copy()
  for ew in flu.range_epiweeks(ew1, flu.add_epiweeks(epiweek, 1)):
    i = flu.delta_epiweeks(ew1, ew)
    lag = flu.delta_epiweeks(ew1, epiweek) - i
    lag = min(lag, len(self.bf_var[region]) - 1)
    mean[i] = observed[i]
    var[i] = self.bf_var[region][lag]
  curves = Forecaster.Utils.sample_normal_var(mean, var, self.num_samples)
  if not self.do_sampling:
    offset = flu.delta_epiweeks(ew1, epiweek) + 1
    for (i, curve) in enumerate(curves):
      index = i % len(self.emp_curves[region])
      curve[offset:] = self.emp_curves[region][index][offset:]
  return curves
def cum_to_week(data):
  epiweeks = list(data.keys())
  all_epiweeks = list(
      EW.range_epiweeks(min(epiweeks), max(epiweeks), inclusive=True))
  result = np.zeros((len(all_epiweeks)))
  last_valid = (-1, 0)  # (idx, value)
  for i in range(len(result)):
    ew = all_epiweeks[i]
    if ew in data:
      if data[all_epiweeks[i]] is not None:
        # Evenly distribute missing counts
        result[last_valid[0] + 1:i + 1] = (
            data[ew] - last_valid[1]) / float(i - last_valid[0])
        last_valid = (i, data[ew])
    yr, wk = EW.split_epiweek(all_epiweeks[i])
    if EW.get_num_weeks(yr) == wk:
      # Fill rest of year with 0s, not getting this information
      result[last_valid[0] + 1:i + 1] = 0
      # Start new year at 0
      last_valid = (i, 0)
  return {all_epiweeks[i]: result[i] for i in range(len(all_epiweeks))}
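# A minimal usage sketch for cum_to_week with hypothetical values: cumulative
# counts at 201501 and 201503 become weekly increments, and the missing week
# 201502 receives an even share of the two-week gap.
counts = {201501: 10, 201502: None, 201503: 16}
weekly = cum_to_week(counts)
# weekly == {201501: 10.0, 201502: 3.0, 201503: 3.0}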
def test_update(self):
  """Compute and store a nowcast."""
  database = MagicMock()
  database.__enter__.return_value = database
  database.__exit__.return_value = None
  data_source = MagicMock(
      ALL_LOCATIONS=['pa', 'va'],
      ATOMIC_LOCATIONS=['pa', 'va'],
      get_truth_locations=lambda *a: ['pa', 'va'],
      get_sensor_locations=lambda *a: ['pa', 'va'],
      get_missing_locations=lambda *a: (),
      get_sensors=lambda *a: ['ght', 'isch'],
      get_most_recent_issue=lambda *a: 201513,
      get_weeks=lambda *a: list(range_epiweeks(201413, 201514)),
      get_truth_value=lambda *a: random.random(),
      get_sensor_value=lambda *a: random.random(),
      prefetch=lambda *a: None)
  target = 'datasetname_rate'

  NowcastUpdate(database, data_source, target).update(201512, 201513)

  self.assertEqual(database.set_last_update_time.call_count, 1)
  self.assertEqual(database.insert.call_count, 4)
  target_epiweek_location_triplets = set()
  for args, kwargs in database.insert.call_args_list:
    target_epiweek_location_triplets.add(args[:3])
  self.assertIn(('datasetname_rate', 201512, 'pa'), target_epiweek_location_triplets)
  self.assertIn(('datasetname_rate', 201513, 'pa'), target_epiweek_location_triplets)
  self.assertIn(('datasetname_rate', 201512, 'va'), target_epiweek_location_triplets)
  self.assertIn(('datasetname_rate', 201513, 'va'), target_epiweek_location_triplets)
def __init__(self, region, target):
  self.region = region
  self.target = target
  weeks = Epidata.range(201401, 202330)
  r0 = Epidata.check(Epidata.paho_dengue(self.region, weeks, lag=0))
  r1 = Epidata.check(Epidata.paho_dengue(self.region, weeks, lag=1))
  r2 = Epidata.check(Epidata.paho_dengue(self.region, weeks, lag=2))
  rx = Epidata.check(Epidata.paho_dengue(self.region, weeks))
  self.data = {}
  self.valid = {}
  self.ew2i, self.i2ew = {}, {}
  for ew in EW.range_epiweeks(weeks['from'], weeks['to'], inclusive=True):
    # if 200916 <= ew <= 201015:
    #   continue
    i = len(self.ew2i)
    self.ew2i[ew] = i
    self.i2ew[i] = ew
  for row in r0 + r1 + r2 + rx:
    ew, observation, lag = row['epiweek'], row[self.target], row['lag']
    if ew not in self.ew2i:
      continue
    i = self.ew2i[ew]
    if i not in self.data:
      self.data[i] = {}
      self.valid[i] = {0: False, 1: False, 2: False, 'stable': False}
    if not (0 <= lag <= 2):
      lag = 'stable'
    self.data[i][lag] = observation
    self.valid[i][lag] = True
  self.weeks = sorted(list(self.data.keys()))
  for i in self.weeks:
    if 'stable' not in self.data[i]:
      continue
    for lag in range(3):
      if lag not in self.data[i]:
        self.data[i][lag] = self.data[i]['stable']
def nowcast(epiweek, epidata_cache=None):
  si = StateInfo()

  # all sensors and locations
  all_names, all_loc = get_all_sensors()

  # get sensors available on the target week
  rows = Epidata.check(
      Epidata.sensors(secrets.api.sensors, all_names, all_loc, epiweek))
  present = {}
  for row in rows:
    name, loc, value = row['name'], row['location'], row['value']
    if name not in present:
      present[name] = {}
    if loc not in present[name]:
      present[name][loc] = value

  # get the history of each available sensor (6 sec)
  past = {}
  sensor_locs = set()
  missing = set()
  past_weeks = Epidata.range(FIRST_DATA_EPIWEEK, flu.add_epiweeks(epiweek, -1))
  all_epiweeks = [
      w for w in flu.range_epiweeks(
          past_weeks['from'], past_weeks['to'], inclusive=True)
  ]
  num_obs = len(all_epiweeks)
  for name in present.keys():
    past[name] = {}
    for loc in present[name].keys():
      past[name][loc] = {}
      sensor_locs |= set([loc])
      #print(name, loc)
      try:
        if epidata_cache is not None:
          rows = epidata_cache.sensors(name, loc, past_weeks)
        else:
          rows = Epidata.check(
              Epidata.sensors(secrets.api.sensors, name, loc, past_weeks))
        if len(rows) < 2:
          raise Exception()
        for row in rows:
          past[name][loc][row['epiweek']] = row['value']
      except Exception:
        missing |= set([(name, loc)])

  # remove sensors with zero past data
  for (n, l) in missing:
    del present[n][l]
    if len(present[n]) == 0:
      del present[n]
    del past[n][l]
    if len(past[n]) == 0:
      del past[n]
    #print(n, l, 'is missing')

  # inventory
  all_sensors = []
  for n in all_names:
    for l in si.nat + si.hhs + si.cen + si.sta:
      if n in past and l in past[n]:
        all_sensors.append((n, l))
  #print(all_sensors)
  num_sensors = len(all_sensors)

  # get historical ground truth for each sensor (4 sec)
  truth = {}
  auth = secrets.api.fluview
  for loc in sensor_locs:
    truth[loc] = {}
    if epidata_cache is not None:
      srows = epidata_cache.fluview(loc, past_weeks)
    else:
      srows = Epidata.check(Epidata.fluview(loc, past_weeks, auth=auth))
    sdata = dict([(r['epiweek'], r) for r in srows])
    udata = {}
    try:
      i = past_weeks['to']
      result = Epidata.fluview(loc, past_weeks, issues=i, auth=auth)
      urows = Epidata.check(result)
      udata = dict([(r['epiweek'], r) for r in urows])
    except Exception:
      pass
    rows = []
    for ew in all_epiweeks:
      if ew in udata:
        rows.append(udata[ew])
      else:
        rows.append(sdata[ew])
    for row in rows:
      truth[loc][row['epiweek']] = row['wili']

  # rows are epiweeks, cols are sensors
  X = np.zeros((num_obs, num_sensors)) * np.nan
  for (r, ew) in enumerate(all_epiweeks):
    for (c, (name, loc)) in enumerate(all_sensors):
      if (name in past and loc in past[name] and ew in past[name][loc]
          and loc in truth and ew in truth[loc]):
        X[r, c] = past[name][loc][ew] - truth[loc][ew]

  # sparse precision matrix
  Ri = Fusion.precision(X, mean=np.zeros((1, num_sensors)), b=0.25)

  # prepare for sensor fusion
  inputs = all_sensors
  state = si.sta
  outputs = si.nat + si.hhs + si.cen + si.sta
  num_i, num_s, num_o = len(inputs), len(state), len(outputs)
  # input  (z): [ num_i x 1 ]
  # state  (x): [ num_s x 1 ]
  # output (y): [ num_o x 1 ]
  # S->I   (H): [ num_i x num_s ]
  # S->O   (W): [ num_o x num_s ]
  z = np.array([present[n][l] for (n, l) in inputs]).reshape((num_i, 1))
  H = np.zeros((num_i, num_s))
  W = np.zeros((num_o, num_s))

  # populate H, given input signals
  for (row, (name, location)) in enumerate(inputs):
    for (col, loc) in enumerate(state):
      if loc in si.within[location]:
        H[row, col] = si.weight[location][loc]
  if np.linalg.matrix_rank(np.dot(H.T, H)) != num_s:
    raise Exception('H is singular')
  if not np.allclose(np.sum(H, axis=1), 1):
    raise Exception('H rows do not sum to 1')

  # populate W, given output locations
  for (row, location) in enumerate(outputs):
    for (col, loc) in enumerate(state):
      if loc in si.within[location]:
        W[row, col] = si.weight[location][loc]
  if not np.allclose(np.sum(W, axis=1), 1):
    raise Exception('W rows do not sum to 1')

  # sensor fusion
  x, P = Fusion.fuse(z, Ri, H)
  y, S = Fusion.extract(x, P, W)
  print(num_obs, num_i, num_s, num_o)
  pt = [float(v) for v in y.flatten()]
  std = [float(v) for v in np.sqrt(S).flatten()]
  return (outputs, pt, std)
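# A toy sketch of the H construction above, with hypothetical locations and
# weights standing in for StateInfo's `within` and `weight` tables: each
# sensor row spreads its weight over the atomic state columns it covers, so
# H @ x maps a state vector to the expected sensor inputs.
import numpy as np

state = ['pa', 'va']                       # atomic locations (columns)
inputs = [('ght', 'pa'), ('sar3', 'reg')]  # (sensor, location) rows
within = {'pa': ['pa'], 'reg': ['pa', 'va']}
weight = {'pa': {'pa': 1.0}, 'reg': {'pa': 0.6, 'va': 0.4}}

H = np.zeros((len(inputs), len(state)))
for row, (name, location) in enumerate(inputs):
  for col, loc in enumerate(state):
    if loc in within[location]:
      H[row, col] = weight[location][loc]
assert np.allclose(H.sum(axis=1), 1)  # each row is a convex combination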
def test_implemented_methods(self):
  # sample data
  locations = ['ar', 'tx']
  sensors = ['epic', 'sar3']
  epiweek = 201812

  # helper that mimics an Epidata API response
  def fake_api(value=1, result=1, num_providers=1):
    return {
      'result': result,
      'epidata': [{
        'value': value,
        'wili': value,
        'num_providers': num_providers
      }]
    }

  # fake implementation of epidata.fluview
  def get_fluview(loc, week, auth):
    if loc == 'X':
      return fake_api(num_providers=0)
    if loc in locations:
      return fake_api()
    return fake_api(result=-2)

  # fake implementation of epidata.sensors
  def get_sensors(auth, name, loc, week):
    if name in sensors:
      return fake_api()
    return fake_api(result=-2)

  # create data source
  epidata = MagicMock(fluview=get_fluview, sensors=get_sensors)
  data_source = FluDataSource(epidata, sensors, Locations.region_list)
  data_source.get_most_recent_issue = lambda: epiweek

  # expected values
  expected_locations = set(Locations.region_list)
  expected_missing = set(Locations.atom_list) - set(locations)
  expected_sensors = set(sensors)
  expected_weeks = set(
      range_epiweeks(
          FluDataSource.FIRST_DATA_EPIWEEK, epiweek, inclusive=True))

  # actual values
  actual_locations = set(data_source.get_truth_locations())
  actual_missing = set(data_source.get_missing_locations(None))
  actual_sensors = set(data_source.get_sensors())
  actual_weeks = set(data_source.get_weeks())

  # compare values
  self.assertEqual(actual_locations, expected_locations)
  self.assertEqual(actual_missing, expected_missing)
  self.assertEqual(actual_sensors, expected_sensors)
  self.assertEqual(actual_weeks, expected_weeks)

  # don't have data
  self.assertIsNone(data_source.get_truth_value(None, None))
  self.assertIsNone(data_source.get_sensor_value(None, None, None))

  # have data, but location had no reporting providers
  self.assertIsNone(data_source.get_truth_value(None, 'X'))

  # have data
  self.assertIsNotNone(data_source.get_truth_value(None, 'tx'))
  self.assertIsNotNone(data_source.get_sensor_value(None, None, 'epic'))
def get_weeks(self):
  """Return a list of weeks on which truth and sensors are both available."""
  latest_week = self.get_most_recent_issue()
  week_range = range_epiweeks(
      FluDataSource.FIRST_DATA_EPIWEEK, latest_week, inclusive=True)
  return list(week_range)
def get_weeks(self):
  """Return a list of weeks on which truth and sensors are both available."""
  week_range = range_epiweeks(
      self.FIRST_DATA_EPIWEEK, self.LAST_DATA_EPIWEEK, inclusive=True)
  return list(week_range)