Example #1
 def _get_partial_trajectory(self, epiweek, valid=True):
   y, w = EW.split_epiweek(epiweek)
   if w < 30:
     y -= 1
   ew1 = EW.join_epiweek(y, 30)
   ew2 = epiweek
   limit = EW.add_epiweeks(ew2, -5)
   weeks = Epidata.range(ew1, ew2)
   stable = Epidata.check(Epidata.fluview(self.region, weeks))
   try:
     unstable = Epidata.check(Epidata.fluview(self.region, weeks, issues=ew2))
   except Exception:
     unstable = []
   wili = {}
   for row in stable:
     ew, value = row['epiweek'], row['wili']
     if not valid or ew < limit:
       wili[ew] = value
   for row in unstable:
     ew, value = row['epiweek'], row['wili']
     wili[ew] = value
   curve = []
   for ew in EW.range_epiweeks(ew1, ew2, inclusive=True):
     if ew not in wili:
       if valid:
         t = 'unstable'
       else:
         t = 'any'
       raise Exception('wILI (%s) not available for week %d' % (t, ew))
     curve.append(wili[ew])
   n1 = EW.delta_epiweeks(ew1, ew2) + 1
   n2 = len(curve)
   if n1 != n2:
     raise Exception('missing data (expected %d, found %d)' % (n1, n2))
   return curve
Example #2
 def __init__(self, region):
   self.region = region
   weeks = Epidata.range(200330, 202330)
   rows = Epidata.check(Epidata.fluview(self.region, weeks))
   self.seasons = {}
   for row in rows:
     ew, wili = row['epiweek'], row['wili']
     y, w = EW.split_epiweek(ew)
     if w < 30:
       y -= 1
     i = EW.delta_epiweeks(EW.join_epiweek(y, 30), ew)
     if y not in self.seasons:
       self.seasons[y] = {}
     if 0 <= i < 52:
       self.seasons[y][i] = wili
   years = sorted(list(self.seasons.keys()))
   for year in years:
     if len(self.seasons[year]) != 52:
       del self.seasons[year]
   if 2008 in self.seasons and 2009 in self.seasons:
     for i in range(40, 52):
       self.seasons[2008][i] = self.seasons[2009][i]
     del self.seasons[2009]
   curve = lambda y: [self.seasons[y][i] for i in range(52)]
   self.years = sorted(list(self.seasons.keys()))
   self.curves = dict([(y, curve(y)) for y in self.years])
Example #3
    def _forecast(self, ageGroup, epiweek):
        # season setup and sanity check
        ew1 = flu.join_epiweek(self.test_season, 40)
        ew2 = flu.join_epiweek(self.test_season + 1, 17)
        print("test season:", self.test_season, "ew1:", ew1, "epiweek:",
              epiweek)
        if not ew1 <= epiweek <= ew2:
            raise Exception('`epiweek` outside of `test_season`')

        # get past values (left half) from the Epidata API
        response = Epidata.flusurv('network_all',
                                   Epidata.range(ew1, epiweek),
                                   issues=epiweek)
        epidata = Forecaster.Utils.decode(response)

        pinned = [row[ageGroup] for row in epidata]

        if len(pinned) != flu.delta_epiweeks(ew1, epiweek) + 1:
            raise Exception('missing FluSurv data')
        # get the user submissions (right half) from the database
        print("ageGroup", ageGroup, "epiweek", epiweek)
        submissions = self.fetch_submissions(ageGroup, epiweek)
        self._num_users = len(submissions)
        if self.verbose:
            print(' [EC] %d users found for %s on %d' %
                  (len(submissions), ageGroup, epiweek))
        # concatenate observed data and user submissions
        return [pinned + sub for sub in submissions]
Example #4
def get_training_set(location, epiweek, signal, valid):
  ew1, ew2, ew3, weeks0, weeks1 = get_weeks(epiweek)
  auth = secrets.api.fluview
  try:
    result = Epidata.fluview(location, weeks0, issues=ew2, auth=auth)
    rows = Epidata.check(result)
    unstable = extract(rows, ['wili'])
  except Exception:
    unstable = {}
  rows = Epidata.check(Epidata.fluview(location, weeks0, auth=auth))
  stable = extract(rows, ['wili'])
  data = {}
  num_dropped = 0
  for ew in signal.keys():
    if ew == ew3:
      continue
    sig = signal[ew]
    if ew not in unstable:
      if valid and flu.delta_epiweeks(ew, ew3) <= 5:
        raise Exception('unstable wILI is not available on %d' % ew)
      if ew not in stable:
        num_dropped += 1
        continue
      wili = stable[ew]
    else:
      wili = unstable[ew]
    data[ew] = {'x': sig, 'y': wili}
  if num_dropped:
    msg = 'warning: dropped %d/%d signal weeks because (w)ILI was unavailable'
    print(msg % (num_dropped, len(signal)))
  return get_training_set_data(data)
Example #5
def get_lag_and_ili(issue, epiweek, num_ili, num_patients):
    """
  Compute and return reporting lag and percent ILI from imputed ILINet data.
  """
    lag = delta_epiweeks(epiweek, issue)
    ili = 100.0 * (0 if num_patients == 0 else num_ili / num_patients)
    return lag, ili
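A minimal worked example of the arithmetic above, with made-up numbers; here `delta_epiweeks(epiweek, issue)` would return 2, since the issue is two weeks after the data week:

# hypothetical values, for illustration only
issue, epiweek = 202010, 202008
num_ili, num_patients = 130, 5200

lag = 2  # what delta_epiweeks(epiweek, issue) returns for these two weeks
ili = 100.0 * (0 if num_patients == 0 else num_ili / num_patients)
print(lag, ili)  # 2 2.5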
Example #6
    def test_find_csv_files(self):
        """Recursively explore and find CSV files."""

        path_prefix = 'prefix/to/the/data/'
        glob_paths = [
            # valid weekly
            path_prefix + 'fb_survey/weekly_202015_county_cli.csv',
            # valid daily
            path_prefix + 'ght/20200408_state_rawsearch.csv',
            # valid national
            path_prefix + 'valid/20200408_nation_sig.csv',
            # valid hhs
            path_prefix + 'valid/20200408_hhs_sig.csv',
            # invalid
            path_prefix + 'invalid/hello_world.csv',
            # invalid day
            path_prefix + 'invalid/22222222_b_c.csv',
            # invalid week
            path_prefix + 'invalid/weekly_222222_b_c.csv',
            # invalid geography
            path_prefix + 'invalid/20200418_province_c.csv',
            # ignored
            path_prefix + 'ignored/README.md',
        ]
        mock_glob = MagicMock()
        mock_glob.glob.return_value = glob_paths

        found = set(CsvImporter.find_csv_files(path_prefix, glob=mock_glob))

        expected_issue_day = int(date.today().strftime("%Y%m%d"))
        expected_issue_week = int(str(epi.Week.fromdate(date.today())))
        time_value_day = 20200408
        expected = set([
            (glob_paths[0], ('fb_survey', 'cli', 'week', 'county', 202015,
                             expected_issue_week,
                             delta_epiweeks(202015, expected_issue_week))),
            (glob_paths[1],
             ('ght', 'rawsearch', 'day', 'state', time_value_day,
              expected_issue_day,
              (date.today() - date(year=time_value_day // 10000,
                                   month=(time_value_day // 100) % 100,
                                   day=time_value_day % 100)).days)),
            (glob_paths[2],
             ('valid', 'sig', 'day', 'nation', time_value_day,
              expected_issue_day,
              (date.today() - date(year=time_value_day // 10000,
                                   month=(time_value_day // 100) % 100,
                                   day=time_value_day % 100)).days)),
            (glob_paths[3],
             ('valid', 'sig', 'day', 'hhs', time_value_day, expected_issue_day,
              (date.today() - date(year=time_value_day // 10000,
                                   month=(time_value_day // 100) % 100,
                                   day=time_value_day % 100)).days)),
            (glob_paths[4], None),
            (glob_paths[5], None),
            (glob_paths[6], None),
            (glob_paths[7], None),
        ])
        self.assertEqual(found, expected)
Example #7
def update_from_file_clinical(issue, date, filename, test_mode=False):
    """
  Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database.
  """

    # database connection
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    rows1 = get_rows(cnx, CL_TABLE)
    print('rows before: %d' % (rows1))
    insert = cnx.cursor()

    # load the data, ignoring empty rows
    print('loading data from %s as issued on %d' % (filename, issue))
    rows = load_zipped_csv(filename, CL_SHEET)
    print(' loaded %d rows' % len(rows))
    data = [get_clinical_data(row) for row in rows]
    entries = [obj for obj in data if obj]
    print(' found %d entries' % len(entries))

    sql = '''
  INSERT INTO
    `fluview_clinical` (`release_date`, `issue`, `epiweek`, `region`, `lag`, 
    `total_specimens`, `total_a`, `total_b`, `percent_positive`, `percent_a`, 
    `percent_b`)
  VALUES
    (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
  ON DUPLICATE KEY UPDATE
  `release_date` = least(`release_date`, %s),
  `total_specimens` = %s,
  `total_a` = %s,
  `total_b` = %s,
  `percent_positive` = %s,
  `percent_a` = %s,
  `percent_b` = %s
  '''

    # insert each row
    insert = cnx.cursor()
    for row in entries:
        lag = delta_epiweeks(row['epiweek'], issue)
        args = [
            row['total_specimens'], row['total_a'], row['total_b'],
            row['percent_positive'], row['percent_a'], row['percent_b']
        ]
        ins_args = [date, issue, row['epiweek'], row['location'], lag] + args
        upd_args = [date] + args
        insert.execute(sql, ins_args + upd_args)

    # cleanup
    insert.close()
    if test_mode:
        print('test mode, not committing')
        rows2 = rows1
    else:
        cnx.commit()
        rows2 = get_rows(cnx, CL_TABLE)
    print('rows after: %d (added %d)' % (rows2, rows2 - rows1))
    cnx.close()
Example #8
 def fetch_submissions(self, region, epiweek_now):
     final_week = flu.join_epiweek(self.test_season + 1, 20)
     self.cur = self.cnx.cursor()
     self.cur.execute(
         """
 SELECT
   u.`id` `user_id`, f.`epiweek`, f.`wili`
 FROM (
   SELECT
     u.*
   FROM
     `ec_fluv_users_mturk_2019` u
   JOIN
     `ec_fluv_defaults` d
   ON
     TRUE
   LEFT JOIN
     `ec_fluv_user_preferences_mturk` p
   ON
     p.`user_id` = u.`id` AND p.`name` = d.`name`
   WHERE
     d.`name` = '_debug' AND coalesce(p.`value`, d.`value`) = '0'
   ) u
 JOIN
   `ec_fluv_submissions_mturk` s
 ON
   s.`user_id` = u.`id`
 JOIN
   `ec_fluv_forecast_mturk` f
 ON
   f.`user_id` = u.`id` AND f.`region_id` = s.`region_id` AND f.`epiweek_now` = s.`epiweek_now`
 JOIN
   `ec_fluv_regions` r
 ON
   r.`id` = s.`region_id`
 WHERE
   r.`fluview_name` = %s AND s.`epiweek_now` = %s AND f.`epiweek` <= %s AND f.`wili` > 0
 ORDER BY
   u.`id` ASC, f.`epiweek` ASC
 """, (region, epiweek_now, final_week))
     submissions = {}
     for (user, epiweek, wili) in self.cur:
         if self.users is not None and user not in self.users:
             continue
         if user not in submissions:
             submissions[user] = []
         submissions[user].append(wili)
     self.cur.close()
     curves = []
     expected_weeks = flu.delta_epiweeks(epiweek_now, final_week)
     for user in submissions:
         if len(submissions[user]) != expected_weeks:
             print(
                 ' [EC] warning: missing data in user submission [%d|%s|%d]'
                 % (user, region, epiweek_now))
         else:
             curves.append(submissions[user])
     return curves
Example #9
def update_from_file(issue, date, dir, test_mode=False):
    # Read ECDC data from CSVs and insert into (or update) the database.
    # database connection
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    rows1 = get_rows(cnx, 'ecdc_ili')
    print('rows before: %d' % (rows1))
    insert = cnx.cursor()

    # load the data, ignoring empty rows
    files = glob.glob(os.path.join(dir,"*.csv"))
    rows = []
    for filename in files:
        with open(filename, 'r') as f:
            # read (and discard) the header line
            header = [s.strip() for s in f.readline().split(',')]
            for line in f:
                data = [s.strip().replace('"', '') for s in line.split(',')]
                row = {}
                # the epiweek field is year and week separated by a single character; drop that separator
                row['epiweek'] = int(data[1][:4] + data[1][5:])
                row['region'] = data[4]
                row['incidence_rate'] = data[3]
                rows.append(row)
    print(' loaded %d rows' % len(rows))
    entries = [obj for obj in rows if obj]
    print(' found %d entries' % len(entries))

    sql = '''
    INSERT INTO
        `ecdc_ili` (`release_date`, `issue`, `epiweek`, `region`, `lag`,
        `incidence_rate`)
    VALUES
        ('%s', %s, %s, '%s', %s, %s)
    ON DUPLICATE KEY UPDATE
        `release_date` = least(`release_date`, '%s'),
        `incidence_rate` = %s
    '''

    for row in entries:
        lag = delta_epiweeks(row['epiweek'], issue)
        data_args = [row['incidence_rate']]

        insert_args = [date,issue,row['epiweek'],row['region'],lag] + data_args
        update_args = [date] + data_args
        try:
            insert.execute(sql % tuple(insert_args + update_args))
        except Exception:
            pass

    # cleanup
    insert.close()
    if test_mode:
        print('test mode, not committing')
        rows2 = rows1
    else:
        cnx.commit()
        rows2 = get_rows(cnx, 'ecdc_ili')
    print('rows after: %d (added %d)' % (rows2, rows2 - rows1))
    cnx.close()
Example #10
    def get_data(self,
                 start_week,
                 end_week,
                 location,
                 term,
                 resolution='week',
                 country='US'):
        start_date = GHT._ew2date(start_week)
        end_date = GHT._ew2date(end_week)
        num_weeks = flu.delta_epiweeks(start_week, end_week) + 1

        # getTimelinesForHealth parameters
        params = {
            'terms': term,
            'time_startDate': start_date,
            'time_endDate': end_date,
            'timelineResolution': resolution,
        }
        # We have a special check for the US for backwards compatibility.
        # i.e. if the country is 'US' AND the location is 'US', just put the geo-restriction for country.
        # In contrast, another country might have a sub-region with initials 'US' and we want the region restriction instead.
        if country == 'US':
            if location == 'US' or location == NO_LOCATION_STR:
                params['geoRestriction_country'] = 'US'
            else:
                params['geoRestriction_region'] = 'US-' + location
        else:
            if location == NO_LOCATION_STR:
                params['geoRestriction_country'] = country
            else:
                params['geoRestriction_region'] = country + '-' + location

        # make the API call
        data = self.service.getTimelinesForHealth(**params).execute()

        # extract the values
        try:
            values = [p['value'] for p in data['lines'][0]['points']]
        except Exception:
            values = None

        # throttle request rate
        time.sleep(self.delay)

        # return the results
        return {
            'start_week': start_week,
            'end_week': end_week,
            'num_weeks': num_weeks,
            'location': location,
            'country': country,
            'term': term,
            'resolution': resolution,
            'data': data,
            'values': values,
        }
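As a quick check of the geo-restriction logic above, that branch can be restated on its own; NO_LOCATION_STR is assumed here to be a sentinel such as 'none' (the real constant is defined elsewhere in the module):

NO_LOCATION_STR = 'none'  # assumed sentinel value, for illustration

def geo_restriction(country, location):
    """Return the single geoRestriction_* parameter that get_data would set."""
    if country == 'US':
        if location in ('US', NO_LOCATION_STR):
            return {'geoRestriction_country': 'US'}
        return {'geoRestriction_region': 'US-' + location}
    if location == NO_LOCATION_STR:
        return {'geoRestriction_country': country}
    return {'geoRestriction_region': country + '-' + location}

print(geo_restriction('US', 'US'))    # {'geoRestriction_country': 'US'}
print(geo_restriction('US', 'PA'))    # {'geoRestriction_region': 'US-PA'}
print(geo_restriction('AU', 'none'))  # {'geoRestriction_country': 'AU'}
print(geo_restriction('AU', 'NSW'))   # {'geoRestriction_region': 'AU-NSW'}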
Example #11
def drop_invalid_predictions(epiweek, user_predictions):
  # sanity check user inputs (copy key sets since we modify the dict in-place)
  expected_length = epiweek_lib.delta_epiweeks(epiweek, Constants.MAX_EPIWEEK)
  num_dropped = 0
  for location in list(user_predictions.keys()):
    for user in list(user_predictions[location].keys()):
      if len(user_predictions[location][user]) != expected_length:
        num_dropped += 1
        del user_predictions[location][user]
        if not user_predictions[location]:
          del user_predictions[location]
  if num_dropped:
    print('NOTE: dropped %d time-series with invalid length' % num_dropped)
Example #12
 def _forecast(self, region, epiweek):
   print('inside hybrid._forecast, region, epiweek:', region, epiweek)
   P = self.past._forecast(region, epiweek)
   F = self.future._forecast(region, epiweek)
   print('inside hybrid._forecast, len P, len F', len(P), len(F))
   i = flu.delta_epiweeks(flu.join_epiweek(self.test_season, 40), epiweek)
   curves = []
   for j in range(max(len(P), len(F))):
     p, f = P[j % len(P)], F[j % len(F)]
     curves.append(list(p[:i]) + list(f[i:]))
   if self._callback is not None:
     self._callback()
   return curves
Example #13
    def _forecast(self, region, epiweek):
        ew1 = flu.join_epiweek(self.test_season + 0, 40)
        ew2 = flu.join_epiweek(self.test_season + 1, 24)
        num_weeks = flu.delta_epiweeks(ew1, ew2)
        print('fetching past data until week %d' % (epiweek))
        observed = self._get_current(region, epiweek, self.forecast_type)

        mean, var = self.emp_mean[region].copy(), self.emp_var[region].copy()
        for ew in flu.range_epiweeks(ew1, flu.add_epiweeks(epiweek, 1)):
            i = flu.delta_epiweeks(ew1, ew)
            lag = flu.delta_epiweeks(ew1, epiweek) - i
            lag = min(lag, len(self.bf_var[region]) - 1)
            mean[i] = observed[i]
            var[i] = self.bf_var[region][lag]
        curves = Forecaster.Utils.sample_normal_var(mean, var,
                                                    self.num_samples)
        if not self.do_sampling:
            offset = flu.delta_epiweeks(ew1, epiweek) + 1
            for (i, curve) in enumerate(curves):
                index = i % len(self.emp_curves[region])
                curve[offset:] = self.emp_curves[region][index][offset:]
        return curves
Example #14
def get_weight(ew1, ew2):
  # I want something that:
  #   - drops sharply over the most recent ~3 weeks
  #   - falls off exponentially with time
  #   - puts extra emphasis on the past weeks at the same time of year
  #   - gives no week a weight of zero
  dw = flu.delta_epiweeks(ew1, ew2)
  yr = 52.2
  hl1, hl2, bw = yr, 1, 4
  a = 0.05
  #b = (np.cos(2 * np.pi * (dw / yr)) + 1) / 2
  b = np.exp(-((min(dw % yr, yr - dw % yr) / bw) ** 2))
  c = 2 ** -(dw / hl1)
  d = 1 - 2 ** -(dw / hl2)
  return (a + (1 - a) * b) * c * d
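For intuition, here is a standalone restatement of the same weighting that takes the week difference dw directly (so it does not need flu.delta_epiweeks), evaluated at a few hypothetical offsets:

import numpy as np

def weight_from_delta(dw):
  # same constants as get_weight above: ~1-year half-life, 1-week ramp, 4-week seasonal bandwidth
  yr, hl1, hl2, bw, a = 52.2, 52.2, 1, 4, 0.05
  b = np.exp(-((min(dw % yr, yr - dw % yr) / bw) ** 2))  # seasonal emphasis
  c = 2 ** -(dw / hl1)                                    # exponential decay with age
  d = 1 - 2 ** -(dw / hl2)                                # suppresses the most recent ~3 weeks
  return (a + (1 - a) * b) * c * d

for dw in (1, 3, 26, 52, 104):
  print(dw, round(weight_from_delta(dw), 4))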
Example #15
 def _forecast(self, region, epiweek):
   # season setup and sanity check
   ew1 = flu.join_epiweek(self.test_season, 40)
   ew2 = flu.join_epiweek(self.test_season + 1, 20)
   if not ew1 <= epiweek <= ew2:
     raise Exception('`epiweek` outside of `test_season`')
   # get past values (left half) from the Epidata API
   epidata = Forecaster.Utils.decode(Epidata.fluview(region, Epidata.range(ew1, epiweek), issues=epiweek))
   pinned = [row['wili'] for row in epidata]
   if len(pinned) != flu.delta_epiweeks(ew1, epiweek) + 1:
     raise Exception('missing ILINet data')
   # get the user submissions (right half) from the database
   submissions = self.fetch_submissions(region, epiweek)
   self._num_users = len(submissions)
   print(' [EC] %d users found for %s on %d' % (len(submissions), region, epiweek))
   # concatenate observed data and user submissions
   return [pinned + sub for sub in submissions]
Example #16
 def get_weight(ew1, ew2):
   """ This function gives the weight between two given
   epiweeks based on a function that:
     - drops sharply over the most recent ~3 weeks
     - falls off exponentially with time
     - puts extra emphasis on the past weeks at the
       same time of year (seasonality)
     - gives no week a weight of zero
   """
   dw = flu.delta_epiweeks(ew1, ew2)
   yr = 52.2
   hl1, hl2, bw = yr, 1, 4
   a = 0.05
   # b = (np.cos(2 * np.pi * (dw / yr)) + 1) / 2
   b = np.exp(-((min(dw % yr, yr - dw % yr) / bw) ** 2))
   c = 2 ** -(dw / hl1)
   d = 1 - 2 ** -(dw / hl2)
   return (a + (1 - a) * b) * c * d
Example #17
def get_model(ew2, epiweeks, X, Y):
  ne, nx1, nx2, ny = len(epiweeks), len(X), len(X[0]), len(Y)
  if ne != nx1 or nx1 != ny:
    raise Exception('length mismatch e=%d X=%d Y=%d' % (ne, nx1, ny))
  weights = np.diag([get_weight(ew1, ew2) for ew1 in epiweeks])
  X = np.array(X).reshape((nx1, nx2))
  Y = np.array(Y).reshape((ny, 1))
  bias0 = np.ones(Y.shape)
  if ne >= 26 and flu.delta_epiweeks(epiweeks[0], epiweeks[-1]) >= 52:
    # constant and periodic bias
    bias1 = np.array([get_periodic_bias(ew) for ew in epiweeks])
    X = np.hstack((X, bias0, bias1))
  else:
    # constant bias only
    X = np.hstack((X, bias0))
  XtXi = np.linalg.inv(dot(X.T, weights, X))
  XtY = dot(X.T, weights, Y)
  return np.dot(XtXi, XtY)
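Note: get_model relies on a chained `dot` helper that is not part of this excerpt (the same calling convention appears as ISCH.dot in Example #21). A minimal version consistent with how it is called would be a left-to-right matrix product; with the diagonal weight matrix W built from get_weight, the closing lines then compute the weighted least-squares estimate inv(X' W X) X' W Y.

from functools import reduce
import numpy as np

def dot(*matrices):
  # chained matrix product, evaluated left to right: dot(A, B, C) == A @ B @ C
  return reduce(np.dot, matrices)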
Example #18
    def _get_current(self, region, epiweek, forecast_type):
        ew1 = flu.join_epiweek(self.test_season + 0, 40)
        ew2 = flu.join_epiweek(self.test_season + 1, 20)
        weeks = Epidata.range(ew1, ew2)
        if self.forecast_type == ForecastType.WILI:
            print('fetching history data for:')
            print(region, epiweek, weeks)
            epidata = Forecaster.Utils.decode(
                Epidata.fluview(region, weeks, issues=epiweek))
            data = [row['wili'] for row in epidata]
            # print (data)
        else:
            epidata = Forecaster.Utils.decode(
                Epidata.flusurv('network_all', weeks, issues=epiweek))
            data = [row[region] for row in epidata]

        if len(data) != flu.delta_epiweeks(ew1, epiweek) + 1:
            raise Exception('missing data')
        return data
Example #19
def update_from_data(ews, ilis, date, issue, test_mode=False):
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    rows1 = get_rows(cnx)
    print('rows before: %d' % (rows1))
    insert = cnx.cursor()

    sql = '''
    INSERT INTO
        `kcdc_ili` (`release_date`, `issue`, `epiweek`, `region`, `lag`,
        `ili`)
    VALUES
        ('%s', %s, %s, '%s', %s, %s)
    ON DUPLICATE KEY UPDATE
        `release_date` = least(`release_date`, '%s'),
        `ili` = %s
    '''

    for i in range(len(ews)):
        ew = ews[i]
        ili = ilis[i]
        lag = delta_epiweeks(ew, issue)

        insert_args = [date, issue, ew, 'ROK', lag, ili]
        update_args = [date, ili]
        try:
            insert.execute(sql % tuple(insert_args + update_args))
        except Exception:
            pass

    # cleanup
    insert.close()
    if test_mode:
        print('test mode, not committing')
        rows2 = rows1
    else:
        cnx.commit()
        rows2 = get_rows(cnx)
    print('rows after: %d (added %d)' % (rows2, rows2 - rows1))
    cnx.close()
Example #20
  def get_data(self, start_week, end_week, location, term, resolution='week'):
    start_date = GHT._ew2date(start_week)
    end_date = GHT._ew2date(end_week)
    num_weeks = flu.delta_epiweeks(start_week, end_week) + 1

    # getTimelinesForHealth parameters
    params = {
      'terms': term,
      'time_startDate': start_date,
      'time_endDate': end_date,
      'timelineResolution': resolution,
    }
    if location == 'US':
      params['geoRestriction_country'] = location
    else:
      params['geoRestriction_region'] = 'US-' + location

    # make the API call
    data = self.service.getTimelinesForHealth(**params).execute()

    # extract the values
    try:
      values = [p['value'] for p in data['lines'][0]['points']]
    except Exception:
      values = None

    # throttle request rate
    time.sleep(self.delay)

    # return the results
    return {
      'start_week': start_week,
      'end_week': end_week,
      'num_weeks': num_weeks,
      'location': location,
      'term': term,
      'resolution': resolution,
      'data': data,
      'values': values,
    }
Example #21
 def train(self, epiweek):
     if epiweek not in self.ew2i:
         raise Exception('not predicting during this period')
     most_recent_issue = self.dds.get_most_recent_issue(self.region)
     i2 = min(self.ew2i[epiweek] - 5, self.ew2i[most_recent_issue] - 1)
     signal_to_truth_shift = max(
         0, EW.delta_epiweeks(most_recent_issue, epiweek))
     self.stts = signal_to_truth_shift
     i1 = self.weeks[2 + signal_to_truth_shift]
     ew1, ew2 = self.i2ew[i1], self.i2ew[i2]
     num_weeks = i2 - i1
     if num_weeks <= 0:
         raise Exception('not predicting during this period')
     feature_indices = self.feature_indices(
         epiweek, signal_to_truth_shift=signal_to_truth_shift, valid=False)
     X, Y = np.zeros((num_weeks, np.sum(feature_indices))), np.zeros(
         (num_weeks, 1))
     r = 0
     for i in range(i1, i2):
         try:
             newx = self._get_features(
                 self.i2ew[i],
                 signal_to_truth_shift=signal_to_truth_shift,
                 valid=False,
                 mask=feature_indices)
             newy = self.data[i + 1]['stable']
             if np.all(np.isfinite(newx)):
                 X[r, :] = newx
                 Y[r, 0] = newy
                 r += 1
         except Exception:
             pass
     X = X[:r, :]
     Y = Y[:r, :]
     Y = np.log(np.maximum(Y, 0.01))
     self.model = ISCH.dot(np.linalg.inv(ISCH.dot(X.T, X)), X.T, Y)
     self.training_week = epiweek
     return (X, Y, self.model)
Example #22
def update_from_file(issue, date, filename, test_mode=False):
    """
  Read ILINet data from a zipped CSV and insert into (or update) the database.
  """

    # database connection
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    rows1 = get_rows(cnx)
    print('rows before: %d' % (rows1))
    insert = cnx.cursor()

    # load the data, ignoring empty rows
    print('loading data from %s as issued on %d' % (filename, issue))
    rows = load_zipped_csv(filename)
    print(' loaded %d rows' % len(rows))
    data = [get_ilinet_data(row) for row in rows]
    entries = [obj for obj in data if obj]
    print(' found %d entries' % len(entries))

    sql = '''
  INSERT INTO
    `fluview` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `num_ili`,
    `num_patients`, `num_providers`, `wili`, `ili`, `num_age_0`, `num_age_1`,
    `num_age_2`, `num_age_3`, `num_age_4`, `num_age_5`)
  VALUES
    (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
  ON DUPLICATE KEY UPDATE
  `release_date` = least(`release_date`, %s),
  `num_ili` = %s,
  `num_patients` = %s,
  `num_providers` = %s,
  `wili` = %s,
  `ili` = %s,
  `num_age_0` = coalesce(%s, `num_age_0`),
  `num_age_1` = coalesce(%s, `num_age_1`),
  `num_age_2` = coalesce(%s, `num_age_2`),
  `num_age_3` = coalesce(%s, `num_age_3`),
  `num_age_4` = coalesce(%s, `num_age_4`),
  `num_age_5` = coalesce(%s, `num_age_5`)
  '''

    # insert each row
    insert = cnx.cursor()
    for row in entries:
        lag = delta_epiweeks(row['epiweek'], issue)
        args = [
            row['n_ili'], row['n_patients'], row['n_providers'], row['wili'],
            row['ili'], row['age0'], row['age1'], row['age2'], row['age3'],
            row['age4'], row['age5']
        ]
        ins_args = [date, issue, row['epiweek'], row['location'], lag] + args
        upd_args = [date] + args
        insert.execute(sql, ins_args + upd_args)

    # cleanup
    insert.close()
    if test_mode:
        print('test mode, not committing')
        rows2 = rows1
    else:
        cnx.commit()
        rows2 = get_rows(cnx)
    print('rows after: %d (added %d)' % (rows2, rows2 - rows1))
    cnx.close()
Example #23
 def get_xy(data):
     weeks = sorted(data)
     y = [data[w] for w in weeks]
     x = [Epiweek.delta_epiweeks(201030, w) for w in weeks]
     return list(map(np.array, [x, y]))
Example #24
def update(issue, location_name, test_mode=False):
  """Fetch and store the currently avialble weekly FluSurv dataset."""

  # fetch data
  location_code = flusurv.location_codes[location_name]
  print('fetching data for', location_name, location_code)
  data = flusurv.get_data(location_code)

  # metadata
  epiweeks = sorted(data.keys())
  location = location_name
  release_date = str(EpiDate.today())

  # connect to the database
  u, p = secrets.db.epi
  cnx = mysql.connector.connect(user=u, password=p, database='epidata')
  cur = cnx.cursor()
  rows1 = get_rows(cur)
  print('rows before: %d' % rows1)

  # SQL for insert/update
  sql = '''
  INSERT INTO `flusurv` (
    `release_date`, `issue`, `epiweek`, `location`, `lag`, `rate_age_0`,
    `rate_age_1`, `rate_age_2`, `rate_age_3`, `rate_age_4`, `rate_overall`,
    `rate_age_5`, `rate_age_6`, `rate_age_7`
  )
  VALUES (
    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
  )
  ON DUPLICATE KEY UPDATE
    `release_date` = least(`release_date`, %s),
    `rate_age_0` = coalesce(%s, `rate_age_0`),
    `rate_age_1` = coalesce(%s, `rate_age_1`),
    `rate_age_2` = coalesce(%s, `rate_age_2`),
    `rate_age_3` = coalesce(%s, `rate_age_3`),
    `rate_age_4` = coalesce(%s, `rate_age_4`),
    `rate_overall` = coalesce(%s, `rate_overall`),
    `rate_age_5` = coalesce(%s, `rate_age_5`),
    `rate_age_6` = coalesce(%s, `rate_age_6`),
    `rate_age_7` = coalesce(%s, `rate_age_7`)
  '''

  # insert/update each row of data (one per epiweek)
  for epiweek in epiweeks:
    lag = delta_epiweeks(epiweek, issue)
    if lag > 52:
      # Ignore values older than one year, as (1) they are assumed not to
      # change, and (2) it would adversely affect database performance if all
      # values (including duplicates) were stored on each run.
      continue
    args_meta = [release_date, issue, epiweek, location, lag]
    args_insert = data[epiweek]
    args_update = [release_date] + data[epiweek]
    cur.execute(sql, tuple(args_meta + args_insert + args_update))

  # commit and disconnect
  rows2 = get_rows(cur)
  print('rows after: %d (+%d)' % (rows2, rows2 - rows1))
  cur.close()
  if test_mode:
    print('test mode: not committing database changes')
  else:
    cnx.commit()
  cnx.close()
Example #25
def update_from_file(issue, date, filename, test_mode=False):
    # Read PAHO data from CSV and insert into (or update) the database.

    # Behavior with issue:
    # PAHO has drop down menu for week, and selecting a given week
    #   from that menu gives the data for that issue, not that EW
    # Unsure what revisions, if any, that data goes through
    # Current code ignores PAHO-given issue, is based on argument 'issue'

    # database connection
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    rows1 = get_rows(cnx, 'paho_dengue')
    print('rows before: %d' % (rows1))
    insert = cnx.cursor()

    # load the data, ignoring empty rows
    print('loading data from %s as issued on %d' % (filename, issue))
    with open(filename, 'r', encoding='utf-8') as f:
        c = f.read()
    rows = []
    for l in csv.reader(StringIO(c), delimiter=','):
        rows.append(get_paho_row(l))
    print(' loaded %d rows' % len(rows))
    entries = [obj for obj in rows if obj]
    print(' found %d entries' % len(entries))

    sql = '''
    INSERT INTO
        `paho_dengue` (`release_date`, `issue`, `epiweek`, `region`, `lag`,
        `total_pop`, `serotype`, `num_dengue`, `incidence_rate`,
        `num_severe`, `num_deaths`)
    VALUES
        ('%s', %s, %s, '%s', %s, %s, '%s', %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
        `release_date` = least(`release_date`, '%s'),
        `total_pop` = %s,
        `serotype` = '%s',
        `num_dengue` = %s,
        `incidence_rate` = %s,
        `num_severe` = %s,
        `num_deaths` = %s
    '''

    for row in entries:
        if row['issue'] > issue:  # Issued in a week that hasn't happened yet
            continue
        lag = delta_epiweeks(row['epiweek'], issue)
        data_args = [
            row['total_pop'], row['serotype'], row['num_dengue'],
            row['incidence_rate'], row['num_severe'], row['num_deaths']
        ]

        insert_args = [date, issue, row['epiweek'], row['region'], lag
                       ] + data_args
        update_args = [date] + data_args
        insert.execute(sql % tuple(insert_args + update_args))

    # cleanup
    insert.close()
    if test_mode:
        print('test mode, not committing')
        rows2 = rows1
    else:
        cnx.commit()
        rows2 = get_rows(cnx, 'paho_dengue')
    print('rows after: %d (added %d)' % (rows2, rows2 - rows1))
    cnx.close()
Example #26
    def forecast(self, epiweek):
        """
    `epiweek`: the most recent epiweek for which ILINet data is available
    """

        # sanity checks
        flu.check_epiweek(epiweek)
        season = flu.split_epiweek(flu.get_season(epiweek)[0])[0]
        week = flu.split_epiweek(epiweek)[1]
        first_epiweek = flu.join_epiweek(season, 40)
        offset = flu.delta_epiweeks(first_epiweek, epiweek)
        if season != self.test_season:
            raise Exception('unable to forecast season %d' % season)
        if 20 < week < 40:
            raise Exception('unable to forecast week %02d' % week)

        # initialize forecast
        forecast = Forecast(self.test_season, datetime.now(), self.name,
                            epiweek, self.forecast_type)

        # aliases for readability
        num_week_bins = forecast.season_length
        num_wili_bins = forecast.num_ili_bins
        wili_bin_size = forecast.ili_bin_size

        # if (forecast_type == ForecastType.HOSP):
        #     num_wili_bins = 601

        # uniform blending weights
        # include the `none` "bin"
        week_weight = self.min_week_prob * (num_week_bins + 1)
        wili_weight = self.min_wili_prob * num_wili_bins
        if week_weight > 1:
            raise Exception('`min_week_prob` is impossibly high')
        if wili_weight > 1:
            raise Exception('`min_wili_prob` is impossibly high')

        # forecast each region
        for region in self.locations:

            # draw sample curves
            curves = self._forecast(region, epiweek)

            # regional info
            if Locations.is_region(region):
                baseline = Targets.baselines[self.test_season][region]
            else:
                baseline = None

            # get all targets
            targets = [
                Targets.get_all_targets(c,
                                        baseline,
                                        offset,
                                        rule_season=self.test_season)
                for c in curves
            ]
            onsets = [t['onset'] for t in targets]
            peakweeks = [t['peakweek'] for t in targets]
            peaks = [t['peak'] for t in targets]
            x1s = [t['x1'] for t in targets]
            x2s = [t['x2'] for t in targets]
            x3s = [t['x3'] for t in targets]
            x4s = [t['x4'] for t in targets]

            # forecast each target
            allow_no_pw = self.test_season < 2016
            if Locations.is_region(region):
                # onset is only forecast for regions; skip it for states and hospitalization
                onset = self.forecast_weeks(first_epiweek, num_week_bins,
                                            onsets, week_weight,
                                            self.smooth_weeks_bw, True)

            peakweek = self.forecast_weeks(first_epiweek, num_week_bins,
                                           peakweeks, week_weight,
                                           self.smooth_weeks_bw, allow_no_pw)
            peak = self.forecast_wili(wili_bin_size, num_wili_bins, peaks,
                                      wili_weight, self.smooth_wili_bw)
            x1 = self.forecast_wili(wili_bin_size, num_wili_bins, x1s,
                                    wili_weight, self.smooth_wili_bw)
            x2 = self.forecast_wili(wili_bin_size, num_wili_bins, x2s,
                                    wili_weight, self.smooth_wili_bw)
            x3 = self.forecast_wili(wili_bin_size, num_wili_bins, x3s,
                                    wili_weight, self.smooth_wili_bw)
            x4 = self.forecast_wili(wili_bin_size, num_wili_bins, x4s,
                                    wili_weight, self.smooth_wili_bw)

            # fill in the forecast data
            fc = forecast.get_or_create_forecast(region)
            if Locations.is_region(region):
                fc.set_onset(*onset)
            fc.set_peakweek(*peakweek)
            fc.set_peak(*peak)
            fc.set_lookahead(1, *x1)
            fc.set_lookahead(2, *x2)
            fc.set_lookahead(3, *x3)
            fc.set_lookahead(4, *x4)

        # sanity check completed forecast
        forecast.sanity_check()
        return forecast
Example #27
    def plot(forecasts, prefix, fig_label=''):
        # timing
        epiweek = forecasts[0][0].epiweek
        ew0, ew1 = flu.get_season(epiweek)
        num_weeks = flu.delta_epiweeks(ew0, ew1) + 1
        year = flu.split_epiweek(ew0)[0]

        # plot settings
        x_ticks = [i for i in range(0, num_weeks, 3)]
        x_tick_labels = [
            '%02d' % ForecastIO.get_index_week(i) for i in x_ticks
        ]
        y_ticks = [i for i in range(0, 14, 2)]
        regions = ['nat'] + ['hhs%s' % i for i in range(1, 11)]

        # TODO: avoid hardcoding these values everywhere
        baseline_values_2019 = [
            2.4, 1.9, 3.2, 1.9, 2.4, 1.9, 3.8, 1.7, 2.7, 2.4, 1.5
        ]
        baselines = dict(
            (r, v) for (r, v) in zip(regions, baseline_values_2019))
        bin_size = forecasts[0][0].ili_bin_size

        # get the somewhat sorted list of all unique locations
        locations = []
        for info in forecasts:
            fc = info[0]
            for loc in fc.get_locations():
                if loc not in locations:
                    locations.append(loc)

        # plot each region
        for region in locations:

            # only consider forecasts that include this location
            region_forecasts = []
            for info in forecasts:
                if info[0].has_forecast(region):
                    region_forecasts.append(info)

            # center subplot
            plt.figure(figsize=(12, 12))
            ax2 = plt.subplot(3, 2, 3)
            if region in baselines:
                plt.axhline(baselines[region], color='#888888')
            weeks = [i for i in range(flu.delta_epiweeks(ew0, epiweek) + 1)]
            values = Plotter.get_unstable_wILI(region, ew0, epiweek)
            plt.plot(weeks, values, color='#000000', linewidth=2)
            weeks = [flu.delta_epiweeks(ew0, epiweek) + i for i in range(1, 5)]
            for (forecast, label, color) in region_forecasts:
                fc = forecast.get_forecast(region)
                values = [fc.get_lookahead(i)['point'] for i in range(1, 5)]
                plt.plot(weeks, values, color=color, linewidth=2)
            ax2.set_xbound(0, 33)
            ax2.set_ybound(0, 12)
            ax2.set_xticks(x_ticks)
            ax2.set_yticks(y_ticks)
            ax2.set_xticklabels(x_tick_labels)
            ax2.get_xaxis().set_tick_params(labelbottom='on', labeltop='on')
            ax2.get_yaxis().set_tick_params(labelleft='on', labelright='on')

            # top subplot: peakweek
            top = Plotter.weekly_subplot(region_forecasts, region,
                                         plt.subplot(3, 2, 1), ax2, False)

            # bottom subplot: onset
            bottom = Plotter.weekly_subplot(region_forecasts, region,
                                            plt.subplot(3, 2, 5), ax2, True)

            # right subplot: peakheight
            right = Plotter.wili_subplot(region_forecasts, region,
                                         plt.subplot(3, 2, 4), ax2, bin_size)

            # top-right subplot: legend
            leg = plt.subplot(3, 2, 2)
            for (forecast, label, color) in forecasts:
                plt.plot([0], [0], color=color, label=label)
            plt.legend(loc='lower left')

            # other stuff
            top.set_ylabel('Pr(Peak Week)')
            top.get_yaxis().set_label_position('right')
            bottom.set_ylabel('Pr(Onset Week)')
            bottom.get_yaxis().set_label_position('right')
            right.set_xlabel('Pr(Peak Height)')
            right.get_xaxis().set_label_position('top')
            ax2.set_ylabel('%s %s' % (fig_label, region.upper()))
            ax2.get_yaxis().set_label_position('left')

            # show the finished figure
            if prefix is None:
                plt.show()
                break
            else:
                filename = '%s_%s.png' % (prefix, region)
                plt.savefig(filename, bbox_inches='tight')
                print('saved %s' % filename)
Example #28
    def find_csv_files(scan_dir,
                       issue=(date.today(), epi.Week.fromdate(date.today())),
                       glob=glob):
        """Recursively search for and yield covidcast-format CSV files.

    scan_dir: the directory to scan (recursively)

    The return value is a tuple of (path, details), where, if the path was
    valid, details is a tuple of (source, signal, time_type, geo_type,
    time_value, issue, lag) (otherwise None).
    """
        logger = get_structured_logger('find_csv_files')
        issue_day, issue_epiweek = issue
        issue_day_value = int(issue_day.strftime("%Y%m%d"))
        issue_epiweek_value = int(str(issue_epiweek))
        issue_value = -1
        lag_value = -1

        for path in sorted(glob.glob(os.path.join(scan_dir, '*', '*'))):

            if not path.lower().endswith('.csv'):
                # safe to ignore this file
                continue
            # match a daily or weekly naming pattern
            daily_match = CsvImporter.PATTERN_DAILY.match(path.lower())
            weekly_match = CsvImporter.PATTERN_WEEKLY.match(path.lower())
            if not daily_match and not weekly_match:
                logger.warning(event='invalid csv path/filename',
                               detail=path,
                               file=path)
                yield (path, None)
                continue

            # extract and validate time resolution
            if daily_match:
                time_type = 'day'
                time_value = int(daily_match.group(2))
                match = daily_match
                time_value_day = CsvImporter.is_sane_day(time_value)
                if not time_value_day:
                    logger.warning(event='invalid filename day',
                                   detail=time_value,
                                   file=path)
                    yield (path, None)
                    continue
                issue_value = issue_day_value
                lag_value = (issue_day - time_value_day).days
            else:
                time_type = 'week'
                time_value = int(weekly_match.group(2))
                match = weekly_match
                time_value_week = CsvImporter.is_sane_week(time_value)
                if not time_value_week:
                    logger.warning(event='invalid filename week',
                                   detail=time_value,
                                   file=path)
                    yield (path, None)
                    continue
                issue_value = issue_epiweek_value
                lag_value = delta_epiweeks(time_value_week,
                                           issue_epiweek_value)

            # extract and validate geographic resolution
            geo_type = match.group(3).lower()
            if geo_type not in CsvImporter.GEOGRAPHIC_RESOLUTIONS:
                logger.warning(event='invalid geo_type',
                               detail=geo_type,
                               file=path)
                yield (path, None)
                continue

            # extract additional values, lowercased for consistency
            source = match.group(1).lower()
            signal = match.group(4).lower()
            if len(signal) > 64:
                logger.warning(event='invalid signal name (64 char limit)',
                               detail=signal,
                               file=path)
                yield (path, None)
                continue

            yield (path, (source, signal, time_type, geo_type, time_value,
                          issue_value, lag_value))
Example #29
def get_periodic_bias(epiweek):
  weeks_per_year = 52.2
  offset = flu.delta_epiweeks(200001, epiweek) % weeks_per_year
  angle = np.pi * 2 * offset / weeks_per_year
  return [np.sin(angle), np.cos(angle)]
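These two features encode the time of year as a point on the unit circle (one full cycle per ~52.2 weeks), which is how the regression in Example #17 absorbs seasonality. A standalone sketch over plain week offsets (avoiding flu.delta_epiweeks) shows the phase:

import numpy as np

def periodic_bias_from_offset(offset, weeks_per_year=52.2):
  # offset: whole weeks since an arbitrary reference epiweek
  angle = 2 * np.pi * (offset % weeks_per_year) / weeks_per_year
  return [np.sin(angle), np.cos(angle)]

for offset in (0, 13, 26, 39, 52):
  s, c = periodic_bias_from_offset(offset)
  print(offset, round(s, 2), round(c, 2))
# roughly [0, 1] at the reference week, [1, 0] a quarter year later, [0, -1] at mid-year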