def main(args): """Generate data for the signal dashboard. `args`: parsed command-line arguments """ log_file = None if args: log_file = args.log_file logger = get_structured_logger("signal_dash_data_generator", filename=log_file, log_exceptions=False) start_time = time.time() database = Database() signals_to_generate = database.get_enabled_signals() logger.info( "Starting generating dashboard data.", enabled_signals=[signal.name for signal in signals_to_generate]) metadata = covidcast.metadata() signal_status_list: List[DashboardSignalStatus] = [] coverage_list: List[DashboardSignalCoverage] = [] for dashboard_signal in signals_to_generate: latest_issue = get_latest_issue_from_metadata(dashboard_signal, metadata) latest_time_value = get_latest_time_value_from_metadata( dashboard_signal, metadata) latest_coverage = get_coverage(dashboard_signal, metadata) signal_status_list.append( DashboardSignalStatus(signal_id=dashboard_signal.db_id, date=datetime.date.today(), latest_issue=latest_issue, latest_time_value=latest_time_value)) coverage_list.extend(latest_coverage) try: database.write_status(signal_status_list) logger.info("Wrote status.", rowcount=database.rowcount()) except mysql.connector.Error as exception: logger.exception(exception) try: database.write_coverage(coverage_list) logger.info("Wrote coverage.", rowcount=database.rowcount()) except mysql.connector.Error as exception: logger.exception(exception) logger.info("Generated signal dashboard data", total_runtime_in_seconds=round(time.time() - start_time, 2)) return True
def main(args, epidata_impl=Epidata, database_impl=Database):
    """Update the covidcast metadata cache.

    `args`: parsed command-line arguments
    """
    log_file = None
    if args:
        log_file = args.log_file

    logger = get_structured_logger("metadata_cache_updater", filename=log_file)
    start_time = time.time()
    database = database_impl()
    database.connect()

    # fetch metadata
    try:
        metadata_calculation_start_time = time.time()
        metadata = database.compute_covidcast_meta()
        metadata_calculation_interval_in_seconds = \
            time.time() - metadata_calculation_start_time
    except:
        # clean up before failing
        database.disconnect(True)
        raise

    args = ("success", 1)
    if len(metadata) == 0:
        args = ("no results", -2)

    logger.info('covidcast_meta result: %s (code %d)' % args)

    if args[-1] != 1:
        logger.error('unable to cache epidata')
        return False

    # update the cache
    try:
        metadata_update_start_time = time.time()
        database.update_covidcast_meta_cache(metadata)
        metadata_update_interval_in_seconds = \
            time.time() - metadata_update_start_time
        logger.info('successfully cached epidata')
    finally:
        # no catch block so that an exception above will cause the program to
        # fail after the following cleanup
        database.disconnect(True)

    logger.info(
        "Generated and updated covidcast metadata",
        metadata_calculation_interval_in_seconds=round(
            metadata_calculation_interval_in_seconds, 2),
        metadata_update_interval_in_seconds=round(
            metadata_update_interval_in_seconds, 2),
        total_runtime_in_seconds=round(time.time() - start_time, 2))
    return True

def collect_files(data_dir, specific_issue_date, csv_importer_impl=CsvImporter):
    """Fetch path and data profile details for each file to upload."""
    logger = get_structured_logger('collect_files')
    if specific_issue_date:
        results = list(csv_importer_impl.find_issue_specific_csv_files(data_dir))
    else:
        results = list(
            csv_importer_impl.find_csv_files(os.path.join(data_dir, 'receiving')))
    logger.info(f'found {len(results)} files')
    return results

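# Illustrative sketch (not part of the original module): inspecting the output
# of `collect_files` for a hypothetical data directory. Each entry is a
# (path, details) tuple; details is None for files that failed validation.
def _demo_collect_files(data_dir):
    path_details = collect_files(data_dir, specific_issue_date=False)
    valid = [pd for pd in path_details if pd[1] is not None]
    print(f'{len(valid)} of {len(path_details)} files are parseable')
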
def archive_file(
        path_src,
        path_dst,
        filename,
        compress,
        gzip=gzip,
        os=os,
        shutil=shutil,
        open_impl=open):
    """Archive a file and return the path and `stat` of the destination file.

    WARNING: This is a potentially destructive operation. See details below.

    path_src: the directory which contains the file to be archived
    path_dst: the directory into which the file should be moved
    filename: the name of the file within `path_src`
    compress: gzips the file if true, otherwise moves the file unmodified

    The destination directory will be created if necessary. If the destination
    file already exists, it will be overwritten.
    """
    logger = get_structured_logger("file_archiver")
    src = os.path.join(path_src, filename)
    dst = os.path.join(path_dst, filename)

    if compress:
        dst += '.gz'

    # make sure the destination directory exists
    os.makedirs(path_dst, exist_ok=True)

    if os.path.exists(dst):
        # warn that destination is about to be overwritten
        logger.warning(event='destination exists, will overwrite', file=dst)

    if compress:
        # make a compressed copy
        with open_impl(src, 'rb') as f_in:
            with gzip.open(dst, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        # delete the original
        os.remove(src)
    else:
        # just move (i.e. rename) the original
        shutil.move(src, dst)

    # return filesystem information about the destination file
    return (dst, os.stat(dst))

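# Illustrative sketch (not part of the original module): archiving a freshly
# created file into a hypothetical `successful` subdirectory. The paths and
# file contents here are examples only.
def _demo_archive_file(tmp_dir='/tmp/archive_demo'):
    os.makedirs(tmp_dir, exist_ok=True)
    with open(os.path.join(tmp_dir, 'example.csv'), 'w') as f:
        f.write('geo_id,val,se,sample_size\n')
    # gzip the file into the archive directory; the original is deleted
    dst_path, dst_stat = archive_file(
        tmp_dir, os.path.join(tmp_dir, 'successful'), 'example.csv', compress=True)
    print(dst_path, dst_stat.st_size)
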
def main(args, database_impl=Database, collect_files_impl=collect_files,
         upload_archive_impl=upload_archive):
    """Find, parse, and upload covidcast signals."""
    logger = get_structured_logger("csv_ingestion", filename=args.log_file)
    start_time = time.time()

    if args.is_wip_override and args.not_wip_override:
        logger.error('conflicting overrides for forcing WIP option! exiting...')
        return
    wip_override = None
    if args.is_wip_override:
        wip_override = True
    if args.not_wip_override:
        wip_override = False

    # shortcut escape without hitting db if nothing to do
    path_details = collect_files_impl(args.data_dir, args.specific_issue_date)
    if not path_details:
        logger.info('nothing to do; exiting...')
        return

    logger.info("Ingesting CSVs", csv_count=len(path_details))

    database = database_impl()
    database.connect()

    try:
        modified_row_count = upload_archive_impl(
            path_details,
            database,
            make_handlers(args.data_dir, args.specific_issue_date),
            logger,
            is_wip_override=wip_override)
        logger.info("Finished inserting database rows", row_count=modified_row_count)
        # the following print statement serves the same function as the logger.info call above
        # print('inserted/updated %d rows' % modified_row_count)
    finally:
        # unconditionally commit database changes since CSVs have been archived
        database.disconnect(True)

    logger.info(
        "Ingested CSVs into database",
        total_runtime_in_seconds=round(time.time() - start_time, 2))

def load_csv(filepath, geo_type, pandas=pandas):
    """Load, validate, and yield data as `RowValues` from a CSV file.

    filepath: the CSV file to be loaded
    geo_type: the geographic resolution (e.g. county)

    In case of a validation error, `None` is yielded for the offending row,
    including the header.
    """
    logger = get_structured_logger('load_csv')
    try:
        table = pandas.read_csv(filepath, dtype=CsvImporter.DTYPES)
    except ValueError as e:
        logger.warning(
            event='Failed to open CSV with specified dtypes, switching to str',
            detail=str(e),
            file=filepath)
        table = pandas.read_csv(filepath, dtype='str')

    if not CsvImporter.is_header_valid(table.columns):
        logger.warning(event='invalid header', detail=table.columns, file=filepath)
        yield None
        return

    table.rename(
        columns={
            "val": "value",
            "se": "stderr",
            "missing_val": "missing_value",
            "missing_se": "missing_stderr"
        },
        inplace=True)

    for row in table.itertuples(index=False):
        row_values, error = CsvImporter.extract_and_check_row(row, geo_type, filepath)
        if error:
            logger.warning(
                event='invalid value for row', detail=(str(row), error), file=filepath)
            yield None
            continue
        yield row_values

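# Illustrative sketch (not part of the original module): consuming `load_csv`
# (called here as defined above) and separating valid rows from validation
# failures. `filepath` and `geo_type` are caller-supplied examples.
def _demo_load_csv(filepath, geo_type='state'):
    rows, errors = [], 0
    for row_values in load_csv(filepath, geo_type):
        if row_values is None:
            errors += 1
        else:
            rows.append(row_values)
    print(f'loaded {len(rows)} rows with {errors} validation failures')
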
def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=None):
    """Take a row and validate the missing code associated with
    a quantity (e.g., val, se, stderr).

    Returns either a nan code for assignment to the missing quantity
    or a None to signal an error with the missing code. We decline to
    infer missing codes except for very simple cases; the default is
    to produce an error so that the issue can be fixed in indicators.
    """
    logger = get_structured_logger('load_csv') if logger is None else logger
    missing_entry = getattr(row, "missing_" + attr_name, None)

    try:
        # convert from string to float to int
        missing_entry = CsvImporter.floaty_int(missing_entry)
    except (ValueError, TypeError):
        missing_entry = None

    if missing_entry is None and attr_quantity is not None:
        return Nans.NOT_MISSING.value
    if missing_entry is None and attr_quantity is None:
        return Nans.OTHER.value

    if missing_entry != Nans.NOT_MISSING.value and attr_quantity is not None:
        logger.warning(
            event=f"missing_{attr_name} column contradicting {attr_name} presence.",
            detail=(str(row)),
            file=filepath)
        return Nans.NOT_MISSING.value
    if missing_entry == Nans.NOT_MISSING.value and attr_quantity is None:
        logger.warning(
            event=f"missing_{attr_name} column contradicting {attr_name} presence.",
            detail=(str(row)),
            file=filepath)
        return Nans.OTHER.value

    return missing_entry

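# Illustrative sketch (not part of the original module): how the missing-code
# validation behaves on a hypothetical row where `missing_value` claims the
# value is missing even though a value is present. The namedtuple stands in
# for a pandas itertuples row, and the helper is called as defined above.
def _demo_validate_missing_code():
    from collections import namedtuple
    Row = namedtuple('Row', ['value', 'missing_value'])
    row = Row(value=1.5, missing_value=str(Nans.OTHER.value))
    # the value is present but the missing code disagrees, so the code is
    # coerced back to NOT_MISSING (and a warning is logged)
    code = validate_missing_code(row, row.value, 'value')
    assert code == Nans.NOT_MISSING.value
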
def find_issue_specific_csv_files(scan_dir, glob=glob):
    """Yield covidcast-format CSV files from issue-specific subdirectories of `scan_dir`."""
    logger = get_structured_logger('find_issue_specific_csv_files')
    for path in sorted(glob.glob(os.path.join(scan_dir, '*'))):
        issuedir_match = CsvImporter.PATTERN_ISSUE_DIR.match(path.lower())
        if issuedir_match and os.path.isdir(path):
            issue_date_value = int(issuedir_match.group(2))
            issue_date = CsvImporter.is_sane_day(issue_date_value)
            if issue_date:
                logger.info(
                    event='processing csv files from issue',
                    detail=issue_date,
                    file=path)
                yield from CsvImporter.find_csv_files(
                    path,
                    issue=(issue_date, epi.Week.fromdate(issue_date)),
                    glob=glob)
            else:
                logger.warning(
                    event='invalid issue directory day',
                    detail=issue_date_value,
                    file=path)

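# Illustrative sketch (not part of the original module): a hypothetical layout
# for issue-specific ingestion. Each issue subdirectory (named per
# CsvImporter.PATTERN_ISSUE_DIR, e.g. something like issue_20200425) holds the
# usual source-name subdirectories of CSVs, and is scanned with its own issue
# date rather than today's date.
#
#   <scan_dir>/
#     issue_20200425/
#       src_name/
#         20200419_state_test_signal.csv
def _demo_find_issue_specific_csv_files(scan_dir):
    for path, details in find_issue_specific_csv_files(scan_dir):
        print(path, details)
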
def main(args): """Delete rows from covidcast.""" logger = get_structured_logger("csv_deletion", filename=args.log_file) start_time = time.time() database = Database() database.connect() all_n = 0 try: for deletion_file in sorted( glob.glob(os.path.join(args.deletion_dir, '*.csv'))): n = handle_file(deletion_file, database, logger) if n is not None: all_n += n else: all_n = "rowcount unsupported" finally: database.disconnect(True) logger.info("Deleted CSVs from database", total_runtime_in_seconds=round(time.time() - start_time, 2), row_count=all_n)
def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today())), glob=glob):
    """Recursively search for and yield covidcast-format CSV files.

    scan_dir: the directory to scan (recursively)

    Each yielded value is a tuple of (path, details), where details is a tuple
    of (source, signal, time_type, geo_type, time_value, issue, lag) if the
    path was valid, and None otherwise.
    """
    logger = get_structured_logger('find_csv_files')
    issue_day, issue_epiweek = issue
    issue_day_value = int(issue_day.strftime("%Y%m%d"))
    issue_epiweek_value = int(str(issue_epiweek))
    issue_value = -1
    lag_value = -1

    for path in sorted(glob.glob(os.path.join(scan_dir, '*', '*'))):

        if not path.lower().endswith('.csv'):
            # safe to ignore this file
            continue

        # match a daily or weekly naming pattern
        daily_match = CsvImporter.PATTERN_DAILY.match(path.lower())
        weekly_match = CsvImporter.PATTERN_WEEKLY.match(path.lower())
        if not daily_match and not weekly_match:
            logger.warning(event='invalid csv path/filename', detail=path, file=path)
            yield (path, None)
            continue

        # extract and validate time resolution
        if daily_match:
            time_type = 'day'
            time_value = int(daily_match.group(2))
            match = daily_match
            time_value_day = CsvImporter.is_sane_day(time_value)
            if not time_value_day:
                logger.warning(event='invalid filename day', detail=time_value, file=path)
                yield (path, None)
                continue
            issue_value = issue_day_value
            lag_value = (issue_day - time_value_day).days
        else:
            time_type = 'week'
            time_value = int(weekly_match.group(2))
            match = weekly_match
            time_value_week = CsvImporter.is_sane_week(time_value)
            if not time_value_week:
                logger.warning(event='invalid filename week', detail=time_value, file=path)
                yield (path, None)
                continue
            issue_value = issue_epiweek_value
            lag_value = delta_epiweeks(time_value_week, issue_epiweek_value)

        # extract and validate geographic resolution
        geo_type = match.group(3).lower()
        if geo_type not in CsvImporter.GEOGRAPHIC_RESOLUTIONS:
            logger.warning(event='invalid geo_type', detail=geo_type, file=path)
            yield (path, None)
            continue

        # extract additional values, lowercased for consistency
        source = match.group(1).lower()
        signal = match.group(4).lower()
        if len(signal) > 64:
            logger.warning(
                event='invalid signal name (64 char limit)', detail=signal, file=path)
            yield (path, None)
            continue

        yield (path, (source, signal, time_type, geo_type, time_value, issue_value, lag_value))

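# Illustrative sketch (not part of the original module): scanning a receiving
# directory and splitting results into parseable and rejected paths based on
# whether `details` is None. `scan_dir` is a caller-supplied example.
def _demo_find_csv_files(scan_dir):
    parseable, rejected = [], []
    for path, details in find_csv_files(scan_dir):
        (parseable if details is not None else rejected).append(path)
    print(f'{len(parseable)} parseable, {len(rejected)} rejected')
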
def compute_covidcast_meta(self, table_name='covidcast', use_index=True):
    """Compute and return metadata on all non-WIP COVIDcast signals."""
    logger = get_structured_logger("compute_covidcast_meta")

    index_hint = ""
    if use_index:
        index_hint = "USE INDEX (for_metadata)"

    # aka number of concurrent db connections, which [sh|c]ould be ~<= 90% of the #cores available to SQL server
    # NOTE: this may present a small problem if this job runs on different hardware than the db,
    # but we should not run into that issue in prod.
    n_threads = max(1, cpu_count() * 9 // 10)
    logger.info(f"using {n_threads} workers")

    srcsigs = Queue()  # multi-consumer threadsafe!

    sql = f'SELECT `source`, `signal` FROM `{table_name}` GROUP BY `source`, `signal` ORDER BY `source` ASC, `signal` ASC;'
    self._cursor.execute(sql)
    # self._cursor is a generator; this lets us use the cursor for subsequent queries inside the loop
    for source, signal in list(self._cursor):
        sql = f"SELECT `is_wip` FROM `{table_name}` WHERE `source`=%s AND `signal`=%s LIMIT 1"
        self._cursor.execute(sql, (source, signal))
        # casting to int as it comes out as a '0' or '1' bytearray; bool('0')==True :(
        is_wip = int(self._cursor.fetchone()[0])
        if not is_wip:
            srcsigs.put((source, signal))

    inner_sql = f'''
      SELECT
        `source` AS `data_source`,
        `signal`,
        `time_type`,
        `geo_type`,
        MIN(`time_value`) AS `min_time`,
        MAX(`time_value`) AS `max_time`,
        COUNT(DISTINCT `geo_value`) AS `num_locations`,
        MIN(`value`) AS `min_value`,
        MAX(`value`) AS `max_value`,
        ROUND(AVG(`value`),7) AS `mean_value`,
        ROUND(STD(`value`),7) AS `stdev_value`,
        MAX(`value_updated_timestamp`) AS `last_update`,
        MAX(`issue`) as `max_issue`,
        MIN(`lag`) as `min_lag`,
        MAX(`lag`) as `max_lag`
      FROM `{table_name}` {index_hint}
      WHERE
        `source` = %s AND
        `signal` = %s AND
        is_latest_issue = 1
      GROUP BY `time_type`, `geo_type`
      ORDER BY `time_type` ASC, `geo_type` ASC
    '''

    meta = []
    meta_lock = threading.Lock()

    def worker():
        name = threading.current_thread().name
        logger.info("starting thread", thread=name)
        # set up new db connection for thread
        worker_dbc = Database()
        worker_dbc.connect(connector_impl=self._connector_impl)
        w_cursor = worker_dbc._cursor
        try:
            while True:
                # this will throw the Empty caught below
                (source, signal) = srcsigs.get_nowait()
                logger.info("starting pair", thread=name, pair=f"({source}, {signal})")
                w_cursor.execute(inner_sql, (source, signal))
                with meta_lock:
                    meta.extend(list(
                        dict(zip(w_cursor.column_names, x)) for x in w_cursor))
                srcsigs.task_done()
        except Empty:
            logger.info("no jobs left, thread terminating", thread=name)
        finally:
            worker_dbc.disconnect(False)  # cleanup

    threads = []
    for n in range(n_threads):
        t = threading.Thread(target=worker, name='MetacacheThread-' + str(n))
        t.start()
        threads.append(t)

    srcsigs.join()
    logger.info("jobs complete")
    for t in threads:
        t.join()
    logger.info("all threads terminated")

    # sort the metadata because threaded workers dgaf
    sorting_fields = "data_source signal time_type geo_type".split()
    sortable_fields_fn = lambda x: [(field, x[field]) for field in sorting_fields]
    prepended_sortables_fn = lambda x: sortable_fields_fn(x) + list(x.items())
    tuple_representation = list(map(prepended_sortables_fn, meta))
    tuple_representation.sort()
    meta = list(map(dict, tuple_representation))  # back to dict form

    return meta

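# Illustrative sketch (not part of the original module): the sort at the end of
# compute_covidcast_meta prepends the sort-key fields as (name, value) pairs so
# plain tuple/list comparison orders the dicts, then converts back to dicts
# (the duplicated keys collapse harmlessly). The toy metadata below is
# hypothetical.
def _demo_meta_sort():
    meta = [
        {'data_source': 'src-b', 'signal': 'sig', 'time_type': 'day', 'geo_type': 'state', 'max_time': 20200501},
        {'data_source': 'src-a', 'signal': 'sig', 'time_type': 'day', 'geo_type': 'county', 'max_time': 20200430},
    ]
    sorting_fields = "data_source signal time_type geo_type".split()
    keyed = [[(f, m[f]) for f in sorting_fields] + list(m.items()) for m in meta]
    keyed.sort()
    ordered = list(map(dict, keyed))
    assert [m['data_source'] for m in ordered] == ['src-a', 'src-b']
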
def main(*, CLEAR_LATEST_BY_PARTITION=_CLEAR_LATEST_BY_PARTITION, FILTER_CONDITION=_FILTER_CONDITION):

    logger = get_structured_logger("fill_is_latest_issue")

    u, p = secrets.db.epi
    connection = mysql.connector.connect(
        host=secrets.db.host,
        user=u,
        password=p,
        database='epidata')
    cursor = connection.cursor()

    set_latest_query = '''
        UPDATE
        (
            SELECT
                `source`, `signal`, `time_type`, `geo_type`, `geo_value`, `time_value`, MAX(`issue`) AS `issue`
            FROM `covidcast`
            WHERE %s
            GROUP BY `source`, `signal`, `time_type`, `geo_type`, `geo_value`, `time_value`
        ) b
        LEFT JOIN `covidcast` a
        USING (`source`, `signal`, `time_type`, `geo_type`, `geo_value`, `time_value`, `issue`)
        SET `is_latest_issue`=1
    '''

    clear_latest_query = '''
        UPDATE `covidcast`
        SET `is_latest_issue` = 0
        WHERE %s;
    '''

    commit = False
    try:
        if not CLEAR_LATEST_BY_PARTITION:
            cursor.execute(clear_latest_query % FILTER_CONDITION)

        for partition_index in range(len(PARTITION_SPLITS) + 1):
            # constructing the partition condition from partition index
            ge_condition = 'TRUE' if partition_index == 0 else \
                f'`{PARTITION_VARIABLE}` >= {PARTITION_SPLITS[partition_index - 1]}'
            l_condition = 'TRUE' if partition_index == len(PARTITION_SPLITS) else \
                f'`{PARTITION_VARIABLE}` < {PARTITION_SPLITS[partition_index]}'
            partition_condition = f'({FILTER_CONDITION}) AND ({ge_condition}) AND ({l_condition})'

            if CLEAR_LATEST_BY_PARTITION:
                cursor.execute(clear_latest_query % partition_condition)
            cursor.execute(set_latest_query % partition_condition)

        commit = True
    except Exception as e:
        connection.rollback()
        logger.exception("exception raised at partition %s (partition index #%s) of column `%s`" %
                         (PARTITION_SPLITS[partition_index], partition_index, PARTITION_VARIABLE))
        raise e
    finally:
        cursor.close()
        if commit:
            connection.commit()
        connection.close()

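# Illustrative sketch (not part of the original module): the per-partition
# WHERE clauses that the loop above generates. The partition variable, splits,
# and filter condition below are hypothetical stand-ins for PARTITION_VARIABLE,
# PARTITION_SPLITS, and FILTER_CONDITION. The first and last partitions are
# open-ended, so N splits yield N + 1 conditions.
def _demo_partition_conditions(partition_variable='geo_value',
                               partition_splits=('20000', '40000'),
                               filter_condition='TRUE'):
    conditions = []
    for i in range(len(partition_splits) + 1):
        ge = 'TRUE' if i == 0 else f"`{partition_variable}` >= {partition_splits[i - 1]}"
        lt = 'TRUE' if i == len(partition_splits) else f"`{partition_variable}` < {partition_splits[i]}"
        conditions.append(f"({filter_condition}) AND ({ge}) AND ({lt})")
    return conditions
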