def main(args):
    """Generate data for the signal dashboard.

    `args`: parsed command-line arguments
    """
    log_file = None
    if args:
        log_file = args.log_file

    logger = get_structured_logger("signal_dash_data_generator",
                                   filename=log_file,
                                   log_exceptions=False)
    start_time = time.time()

    database = Database()

    signals_to_generate = database.get_enabled_signals()
    logger.info(
        "Starting generating dashboard data.",
        enabled_signals=[signal.name for signal in signals_to_generate])

    metadata = covidcast.metadata()

    signal_status_list: List[DashboardSignalStatus] = []
    coverage_list: List[DashboardSignalCoverage] = []

    for dashboard_signal in signals_to_generate:
        latest_issue = get_latest_issue_from_metadata(dashboard_signal,
                                                      metadata)
        latest_time_value = get_latest_time_value_from_metadata(
            dashboard_signal, metadata)
        latest_coverage = get_coverage(dashboard_signal, metadata)

        signal_status_list.append(
            DashboardSignalStatus(signal_id=dashboard_signal.db_id,
                                  date=datetime.date.today(),
                                  latest_issue=latest_issue,
                                  latest_time_value=latest_time_value))
        coverage_list.extend(latest_coverage)

    try:
        database.write_status(signal_status_list)
        logger.info("Wrote status.", rowcount=database.rowcount())
    except mysql.connector.Error as exception:
        logger.exception(exception)

    try:
        database.write_coverage(coverage_list)
        logger.info("Wrote coverage.", rowcount=database.rowcount())
    except mysql.connector.Error as exception:
        logger.exception(exception)

    logger.info("Generated signal dashboard data",
                total_runtime_in_seconds=round(time.time() - start_time, 2))
    return True
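
The keyword arguments on the logger calls above (e.g. rowcount=..., total_runtime_in_seconds=...) are structured log fields, which suggests a structlog-style logger behind get_structured_logger. A minimal standalone sketch of the same start-time/total-runtime pattern, using plain structlog as an assumed stand-in (the signal names are made up):

import time

import structlog  # assumption: a stand-in for whatever get_structured_logger wraps

logger = structlog.get_logger("signal_dash_data_generator")
start_time = time.time()

enabled_signals = ["hypothetical_signal_a", "hypothetical_signal_b"]
logger.info("Starting dashboard data generation.", enabled_signals=enabled_signals)

# ... generate and write the dashboard data here ...

logger.info("Generated signal dashboard data",
            total_runtime_in_seconds=round(time.time() - start_time, 2))
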
Example #2
def main(args, epidata_impl=Epidata, database_impl=Database):
    """Update the covidcast metadata cache.

    `args`: parsed command-line arguments
    """
    log_file = None
    if args:
        log_file = args.log_file

    logger = get_structured_logger("metadata_cache_updater", filename=log_file)
    start_time = time.time()
    database = database_impl()
    database.connect()

    # fetch metadata
    try:
        metadata_calculation_start_time = time.time()
        metadata = database.compute_covidcast_meta()
        metadata_calculation_interval_in_seconds = time.time() - metadata_calculation_start_time
    except:
        # clean up before failing
        database.disconnect(True)
        raise

    args = ("success", 1)
    if len(metadata) == 0:
        args = ("no results", -2)

    logger.info('covidcast_meta result: %s (code %d)' % args)

    if args[-1] != 1:
        logger.error('unable to cache epidata')
        return False

    # update the cache
    try:
        metadata_update_start_time = time.time()
        database.update_covidcast_meta_cache(metadata)
        metadata_update_interval_in_seconds = time.time() - metadata_update_start_time
        logger.info('successfully cached epidata')
    finally:
        # no catch block so that an exception above will cause the program to
        # fail after the following cleanup
        database.disconnect(True)

    logger.info("Generated and updated covidcast metadata",
                metadata_calculation_interval_in_seconds=round(
                    metadata_calculation_interval_in_seconds, 2),
                metadata_update_interval_in_seconds=round(
                    metadata_update_interval_in_seconds, 2),
                total_runtime_in_seconds=round(time.time() - start_time, 2))
    return True
Example #3
def collect_files(data_dir,
                  specific_issue_date,
                  csv_importer_impl=CsvImporter):
    """Fetch path and data profile details for each file to upload."""
    logger = get_structured_logger('collect_files')
    if specific_issue_date:
        results = list(
            csv_importer_impl.find_issue_specific_csv_files(data_dir))
    else:
        results = list(
            csv_importer_impl.find_csv_files(
                os.path.join(data_dir, 'receiving')))
    logger.info(f'found {len(results)} files')
    return results
Example #4
  def archive_file(
      path_src,
      path_dst,
      filename,
      compress,
      gzip=gzip,
      os=os,
      shutil=shutil,
      open_impl=open):
    """Archive a file and return the path and `stat` of the destination file.

    WARNING: This is a potentially destructive operation. See details below.

    path_src: the directory which contains the file to be archived
    path_dst: the directory into which the file should be moved
    filename: the name of the file within `path_src`
    compress: gzips the file if true, otherwise moves the file unmodified

    The destination directory will be created if necessary. If the destination
    file already exists, it will be overwritten.
    """

    logger = get_structured_logger("file_archiver")
    src = os.path.join(path_src, filename)
    dst = os.path.join(path_dst, filename)

    if compress:
      dst += '.gz'

    # make sure the destination directory exists
    os.makedirs(path_dst, exist_ok=True)

    if os.path.exists(dst):
      # warn that destination is about to be overwritten
      logger.warning(event='destination exists, will overwrite', file=dst)

    if compress:
      # make a compressed copy
      with open_impl(src, 'rb') as f_in:
        with gzip.open(dst, 'wb') as f_out:
          shutil.copyfileobj(f_in, f_out)

      # delete the original
      os.remove(src)
    else:
      # just move (i.e. rename) the original
      shutil.move(src, dst)

    # return filesystem information about the destination file
    return (dst, os.stat(dst))
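
The compress branch above is a standard-library recipe: stream-copy the source into gzip.open, then delete the original. A stripped-down sketch of just that recipe (the helper name and paths are illustrative, not part of the original module):

import gzip
import os
import shutil

def gzip_and_remove(src, dst_dir):
    """Compress src into dst_dir as '<name>.gz', then delete the original file."""
    os.makedirs(dst_dir, exist_ok=True)
    dst = os.path.join(dst_dir, os.path.basename(src) + '.gz')
    with open(src, 'rb') as f_in, gzip.open(dst, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.remove(src)
    # mirror archive_file's return value: destination path plus its stat
    return dst, os.stat(dst)
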
Example #5
def main(args,
         database_impl=Database,
         collect_files_impl=collect_files,
         upload_archive_impl=upload_archive):
    """Find, parse, and upload covidcast signals."""

    logger = get_structured_logger("csv_ingestion", filename=args.log_file)
    start_time = time.time()

    if args.is_wip_override and args.not_wip_override:
        logger.error(
            'conflicting overrides for forcing WIP option!  exiting...')
        return
    wip_override = None
    if args.is_wip_override:
        wip_override = True
    if args.not_wip_override:
        wip_override = False

    # shortcut escape without hitting db if nothing to do
    path_details = collect_files_impl(args.data_dir, args.specific_issue_date)
    if not path_details:
        logger.info('nothing to do; exiting...')
        return

    logger.info("Ingesting CSVs", csv_count=len(path_details))

    database = database_impl()
    database.connect()

    try:
        modified_row_count = upload_archive_impl(
            path_details,
            database,
            make_handlers(args.data_dir, args.specific_issue_date),
            logger,
            is_wip_override=wip_override)
        logger.info("Finished inserting database rows",
                    row_count=modified_row_count)
        # the following print statement serves the same function as the logger.info call above
        # print('inserted/updated %d rows' % modified_row_count)
    finally:
        # unconditionally commit database changes since CSVs have been archived
        database.disconnect(True)

    logger.info("Ingested CSVs into database",
                total_runtime_in_seconds=round(time.time() - start_time, 2))
Example #6
    def load_csv(filepath, geo_type, pandas=pandas):
        """Load, validate, and yield data as `RowValues` from a CSV file.

        filepath: the CSV file to be loaded
        geo_type: the geographic resolution (e.g. county)

        In case of a validation error, `None` is yielded for the offending row,
        including the header.
        """
        logger = get_structured_logger('load_csv')

        try:
            table = pandas.read_csv(filepath, dtype=CsvImporter.DTYPES)
        except ValueError as e:
            logger.warning(
                event='Failed to open CSV with specified dtypes, switching to str',
                detail=str(e),
                file=filepath)
            table = pandas.read_csv(filepath, dtype='str')

        if not CsvImporter.is_header_valid(table.columns):
            logger.warning(event='invalid header',
                           detail=table.columns,
                           file=filepath)
            yield None
            return

        table.rename(columns={
            "val": "value",
            "se": "stderr",
            "missing_val": "missing_value",
            "missing_se": "missing_stderr"
        }, inplace=True)

        for row in table.itertuples(index=False):
            row_values, error = CsvImporter.extract_and_check_row(
                row, geo_type, filepath)
            if error:
                logger.warning(event='invalid value for row',
                               detail=(str(row), error),
                               file=filepath)
                yield None
                continue
            yield row_values
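
The try/except around pandas.read_csv above is a dtype fallback: read with strict dtypes first, and reread everything as strings if any column fails to parse. A self-contained sketch of that pattern (the dtype map below is a simplified assumption, not the real CsvImporter.DTYPES):

import io

import pandas

CSV_TEXT = "geo_id,val,se\n01000,1.5,not_a_number\n"
DTYPES = {"geo_id": str, "val": float, "se": float}  # hypothetical subset

buf = io.StringIO(CSV_TEXT)
try:
    table = pandas.read_csv(buf, dtype=DTYPES)
except ValueError:
    buf.seek(0)
    table = pandas.read_csv(buf, dtype="str")  # fall back to plain strings

print(table.dtypes)
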
Example #7
    def validate_missing_code(row,
                              attr_quantity,
                              attr_name,
                              filepath=None,
                              logger=None):
        """Take a row and validate the missing code associated with
        a quantity (e.g., val, se, stderr).

        Returns either a nan code for assignment to the missing quantity
        or a None to signal an error with the missing code. We decline
        to infer missing codes except for very simple cases; the default
        is to produce an error so that the issue can be fixed in indicators.
        """
        logger = get_structured_logger(
            'load_csv') if logger is None else logger
        missing_entry = getattr(row, "missing_" + attr_name, None)

        try:
            missing_entry = CsvImporter.floaty_int(
                missing_entry)  # convert from string to float to int
        except (ValueError, TypeError):
            missing_entry = None

        if missing_entry is None and attr_quantity is not None:
            return Nans.NOT_MISSING.value
        if missing_entry is None and attr_quantity is None:
            return Nans.OTHER.value

        if missing_entry != Nans.NOT_MISSING.value and attr_quantity is not None:
            logger.warning(
                event=f"missing_{attr_name} column contradicting {attr_name} presence.",
                detail=str(row),
                file=filepath)
            return Nans.NOT_MISSING.value
        if missing_entry == Nans.NOT_MISSING.value and attr_quantity is None:
            logger.warning(
                event=f"missing_{attr_name} column contradicting {attr_name} presence.",
                detail=str(row),
                file=filepath)
            return Nans.OTHER.value

        return missing_entry
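
A compact illustration of the reconciliation logic above, using a stand-in enum because the real Nans values are not shown in this excerpt (both the enum and the helper below are hypothetical):

from enum import IntEnum

class Nans(IntEnum):  # hypothetical stand-in for the real Nans enum
    NOT_MISSING = 0
    OTHER = 5

def reconcile(missing_entry, quantity):
    """Mirror the four cases handled by validate_missing_code."""
    if missing_entry is None:
        return Nans.NOT_MISSING.value if quantity is not None else Nans.OTHER.value
    if missing_entry != Nans.NOT_MISSING.value and quantity is not None:
        return Nans.NOT_MISSING.value  # value present but flagged missing
    if missing_entry == Nans.NOT_MISSING.value and quantity is None:
        return Nans.OTHER.value  # value absent but flagged present
    return missing_entry

assert reconcile(None, 1.23) == Nans.NOT_MISSING.value
assert reconcile(Nans.NOT_MISSING.value, None) == Nans.OTHER.value
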
Example #8
 def find_issue_specific_csv_files(scan_dir, glob=glob):
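     """Recursively search for and yield covidcast-format CSV files from issue-specific directories."""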
     logger = get_structured_logger('find_issue_specific_csv_files')
     for path in sorted(glob.glob(os.path.join(scan_dir, '*'))):
         issuedir_match = CsvImporter.PATTERN_ISSUE_DIR.match(path.lower())
         if issuedir_match and os.path.isdir(path):
             issue_date_value = int(issuedir_match.group(2))
             issue_date = CsvImporter.is_sane_day(issue_date_value)
             if issue_date:
                 logger.info(event='processing csv files from issue',
                             detail=issue_date,
                             file=path)
                 yield from CsvImporter.find_csv_files(
                     path,
                     issue=(issue_date, epi.Week.fromdate(issue_date)),
                     glob=glob)
             else:
                 logger.warning(event='invalid issue directory day',
                                detail=issue_date_value,
                                file=path)
Example #9
def main(args):
    """Delete rows from covidcast."""

    logger = get_structured_logger("csv_deletion", filename=args.log_file)
    start_time = time.time()
    database = Database()
    database.connect()
    all_n = 0

    try:
        for deletion_file in sorted(
                glob.glob(os.path.join(args.deletion_dir, '*.csv'))):
            n = handle_file(deletion_file, database, logger)
            if n is not None:
                all_n += n
            else:
                all_n = "rowcount unsupported"
    finally:
        database.disconnect(True)

    logger.info("Deleted CSVs from database",
                total_runtime_in_seconds=round(time.time() - start_time, 2),
                row_count=all_n)
Example #10
    def find_csv_files(scan_dir,
                       issue=(date.today(), epi.Week.fromdate(date.today())),
                       glob=glob):
        """Recursively search for and yield covidcast-format CSV files.

        scan_dir: the directory to scan (recursively)

        Yields tuples of (path, details), where, if the path was valid, details
        is a tuple of (source, signal, time_type, geo_type, time_value, issue,
        lag) (otherwise None).
        """
        logger = get_structured_logger('find_csv_files')
        issue_day, issue_epiweek = issue
        issue_day_value = int(issue_day.strftime("%Y%m%d"))
        issue_epiweek_value = int(str(issue_epiweek))
        issue_value = -1
        lag_value = -1

        for path in sorted(glob.glob(os.path.join(scan_dir, '*', '*'))):

            if not path.lower().endswith('.csv'):
                # safe to ignore this file
                continue
            # match a daily or weekly naming pattern
            daily_match = CsvImporter.PATTERN_DAILY.match(path.lower())
            weekly_match = CsvImporter.PATTERN_WEEKLY.match(path.lower())
            if not daily_match and not weekly_match:
                logger.warning(event='invalid csv path/filename',
                               detail=path,
                               file=path)
                yield (path, None)
                continue

            # extract and validate time resolution
            if daily_match:
                time_type = 'day'
                time_value = int(daily_match.group(2))
                match = daily_match
                time_value_day = CsvImporter.is_sane_day(time_value)
                if not time_value_day:
                    logger.warning(event='invalid filename day',
                                   detail=time_value,
                                   file=path)
                    yield (path, None)
                    continue
                issue_value = issue_day_value
                lag_value = (issue_day - time_value_day).days
            else:
                time_type = 'week'
                time_value = int(weekly_match.group(2))
                match = weekly_match
                time_value_week = CsvImporter.is_sane_week(time_value)
                if not time_value_week:
                    logger.warning(event='invalid filename week',
                                   detail=time_value,
                                   file=path)
                    yield (path, None)
                    continue
                issue_value = issue_epiweek_value
                lag_value = delta_epiweeks(time_value_week,
                                           issue_epiweek_value)

            # extract and validate geographic resolution
            geo_type = match.group(3).lower()
            if geo_type not in CsvImporter.GEOGRAPHIC_RESOLUTIONS:
                logger.warning(event='invalid geo_type',
                               detail=geo_type,
                               file=path)
                yield (path, None)
                continue

            # extract additional values, lowercased for consistency
            source = match.group(1).lower()
            signal = match.group(4).lower()
            if len(signal) > 64:
                logger.warning(event='invalid signal name (64 char limit)',
                               detail=signal,
                               file=path)
                yield (path, None)
                continue

            yield (path, (source, signal, time_type, geo_type, time_value,
                          issue_value, lag_value))
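
The daily branch above pulls the date out of the filename and computes lag as the number of days between the issue date and the data date. A minimal sketch of that step with a made-up filename pattern (the real CsvImporter.PATTERN_DAILY is not shown in this excerpt):

import re
from datetime import date, datetime

# hypothetical daily pattern: .../<source>/<YYYYMMDD>_<geo_type>_<signal>.csv
daily_pattern = re.compile(r'^.*/([^/]*)/(\d{8})_(\w+?)_(\w+)\.csv$')

path = 'receiving/src-name/20200419_county_cases.csv'
match = daily_pattern.match(path.lower())
if match:
    time_value_day = datetime.strptime(match.group(2), '%Y%m%d').date()
    issue_day = date(2020, 4, 21)  # hypothetical issue date
    lag_value = (issue_day - time_value_day).days
    print(match.group(1), match.group(3), match.group(4), lag_value)  # src-name county cases 2
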
Example #11
    def compute_covidcast_meta(self, table_name='covidcast', use_index=True):
        """Compute and return metadata on all non-WIP COVIDcast signals."""
        logger = get_structured_logger("compute_covidcast_meta")
        index_hint = ""
        if use_index:
            index_hint = "USE INDEX (for_metadata)"

        # n_threads is also the number of concurrent db connections, which
        # should/could be at most ~90% of the cores available to the SQL server.
        # NOTE: this may present a small problem if this job runs on different
        #       hardware than the db, but we should not run into that issue in prod.
        n_threads = max(1, cpu_count() * 9 // 10)
        logger.info(f"using {n_threads} workers")

        srcsigs = Queue()  # multi-consumer threadsafe!

        sql = f'SELECT `source`, `signal` FROM `{table_name}` GROUP BY `source`, `signal` ORDER BY `source` ASC, `signal` ASC;'

        self._cursor.execute(sql)
        # materialize the cursor into a list so the cursor can be reused for
        # subsequent queries inside the loop
        for source, signal in list(self._cursor):
            sql = f"SELECT `is_wip` FROM `{table_name}` WHERE `source`=%s AND `signal`=%s LIMIT 1"
            self._cursor.execute(sql, (source, signal))
            # cast to int: the value comes back as a '0' or '1' bytearray,
            # and bool('0') == True :(
            is_wip = int(self._cursor.fetchone()[0])
            if not is_wip:
                srcsigs.put((source, signal))

        inner_sql = f'''
      SELECT
        `source` AS `data_source`,
        `signal`,
        `time_type`,
        `geo_type`,
        MIN(`time_value`) AS `min_time`,
        MAX(`time_value`) AS `max_time`,
        COUNT(DISTINCT `geo_value`) AS `num_locations`,
        MIN(`value`) AS `min_value`,
        MAX(`value`) AS `max_value`,
        ROUND(AVG(`value`),7) AS `mean_value`,
        ROUND(STD(`value`),7) AS `stdev_value`,
        MAX(`value_updated_timestamp`) AS `last_update`,
        MAX(`issue`) as `max_issue`,
        MIN(`lag`) as `min_lag`,
        MAX(`lag`) as `max_lag`
      FROM
        `{table_name}` {index_hint}
      WHERE
        `source` = %s AND
        `signal` = %s AND
        is_latest_issue = 1
      GROUP BY
        `time_type`,
        `geo_type`
      ORDER BY
        `time_type` ASC,
        `geo_type` ASC
      '''

        meta = []
        meta_lock = threading.Lock()

        def worker():
            name = threading.current_thread().name
            logger.info("starting thread", thread=name)
            #  set up new db connection for thread
            worker_dbc = Database()
            worker_dbc.connect(connector_impl=self._connector_impl)
            w_cursor = worker_dbc._cursor
            try:
                while True:
                    # get_nowait will throw the Empty exception caught below
                    (source, signal) = srcsigs.get_nowait()
                    logger.info("starting pair",
                                thread=name,
                                pair=f"({source}, {signal})")
                    w_cursor.execute(inner_sql, (source, signal))
                    with meta_lock:
                        meta.extend(
                            list(
                                dict(zip(w_cursor.column_names, x))
                                for x in w_cursor))
                    srcsigs.task_done()
            except Empty:
                logger.info("no jobs left, thread terminating", thread=name)
            finally:
                worker_dbc.disconnect(False)  # cleanup

        threads = []
        for n in range(n_threads):
            t = threading.Thread(target=worker,
                                 name='MetacacheThread-' + str(n))
            t.start()
            threads.append(t)

        srcsigs.join()
        logger.info("jobs complete")
        for t in threads:
            t.join()
        logger.info("all threads terminated")

        # sort the metadata, since the threaded workers finish in
        # nondeterministic order
        sorting_fields = "data_source signal time_type geo_type".split()
        sortable_fields_fn = lambda x: [(field, x[field]) for field in sorting_fields]
        prepended_sortables_fn = lambda x: sortable_fields_fn(x) + list(x.items())
        tuple_representation = list(map(prepended_sortables_fn, meta))
        tuple_representation.sort()
        meta = list(map(dict, tuple_representation))  # back to dict form

        return meta
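
The fan-out above follows a standard library pattern: preload a Queue, start N threads that call get_nowait() until Empty, and collect results under a lock. A stripped-down, self-contained sketch of just that skeleton (the job pairs and result rows are placeholders):

import threading
from queue import Empty, Queue

jobs = Queue()
for pair in [("src_a", "sig_1"), ("src_a", "sig_2"), ("src_b", "sig_1")]:  # hypothetical pairs
    jobs.put(pair)

results = []
results_lock = threading.Lock()

def worker():
    try:
        while True:
            source, signal = jobs.get_nowait()  # raises Empty once the queue is drained
            row = {"source": source, "signal": signal}  # stand-in for the per-pair query
            with results_lock:
                results.append(row)
            jobs.task_done()
    except Empty:
        pass  # no jobs left; thread terminates

threads = [threading.Thread(target=worker, name=f"MetacacheThread-{n}") for n in range(2)]
for t in threads:
    t.start()
jobs.join()
for t in threads:
    t.join()
print(sorted(results, key=lambda r: (r["source"], r["signal"])))
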
Example #12
def main(*, CLEAR_LATEST_BY_PARTITION=_CLEAR_LATEST_BY_PARTITION, FILTER_CONDITION=_FILTER_CONDITION):
  """Set the `is_latest_issue` flag on the most recent issue of each covidcast row."""
  logger = get_structured_logger("fill_is_latest_issue")

  u, p = secrets.db.epi
  connection = mysql.connector.connect(
    host=secrets.db.host,
    user=u,
    password=p,
    database='epidata')
  cursor = connection.cursor()

  set_latest_query = '''
    UPDATE
    (
      SELECT
        `source`,
        `signal`,
        `time_type`,
        `geo_type`,
        `geo_value`,
        `time_value`,
        MAX(`issue`) AS `issue`
      FROM `covidcast`
      WHERE
        %s
      GROUP BY
        `source`,
        `signal`,
        `time_type`,
        `geo_type`,
        `geo_value`,
        `time_value`
    ) b
    LEFT JOIN `covidcast` a
    USING (`source`, `signal`, `time_type`, `geo_type`, `geo_value`, `time_value`, `issue`)
    SET `is_latest_issue`=1
    '''

  clear_latest_query = '''
    UPDATE `covidcast`
    SET `is_latest_issue` = 0
    WHERE %s;
  '''

  commit = False
  partition_index = None
  try:
    if not CLEAR_LATEST_BY_PARTITION:
      cursor.execute(clear_latest_query % FILTER_CONDITION)
    for partition_index in range(len(PARTITION_SPLITS)+1):
      # constructing the partition condition from partition index
      ge_condition = 'TRUE' if partition_index == 0 else \
        f'`{PARTITION_VARIABLE}` >= {PARTITION_SPLITS[partition_index - 1]}'
      l_condition = 'TRUE' if partition_index == len(PARTITION_SPLITS) else \
        f'`{PARTITION_VARIABLE}` < {PARTITION_SPLITS[partition_index]}'
      partition_condition = f'({FILTER_CONDITION}) AND ({ge_condition}) AND ({l_condition})'

      if CLEAR_LATEST_BY_PARTITION:
        cursor.execute(clear_latest_query % partition_condition)
      cursor.execute(set_latest_query % partition_condition)

      commit = True
  except Exception:
    connection.rollback()
    # NOTE: avoid indexing PARTITION_SPLITS here; partition_index can equal
    # len(PARTITION_SPLITS) for the final (unbounded) partition
    logger.exception("exception raised at partition index #%s of column `%s`" %
                     (partition_index, PARTITION_VARIABLE))
    raise
  finally:
    cursor.close()
    if commit:
      connection.commit()
    connection.close()
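
The loop above derives one SQL condition per partition from PARTITION_SPLITS: the first partition is unbounded below and the last is unbounded above. A small sketch of just that condition-building step (the column name and split points are hypothetical):

PARTITION_VARIABLE = 'geo_value'       # assumption
PARTITION_SPLITS = ['20000', '40000']  # assumption: two split points => three partitions
FILTER_CONDITION = 'TRUE'

for partition_index in range(len(PARTITION_SPLITS) + 1):
    ge_condition = 'TRUE' if partition_index == 0 else \
        f'`{PARTITION_VARIABLE}` >= {PARTITION_SPLITS[partition_index - 1]}'
    lt_condition = 'TRUE' if partition_index == len(PARTITION_SPLITS) else \
        f'`{PARTITION_VARIABLE}` < {PARTITION_SPLITS[partition_index]}'
    print(f'({FILTER_CONDITION}) AND ({ge_condition}) AND ({lt_condition})')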