def create_metadata_dict() -> Dict[str, Any]:
    """
    Assemble the metadata dictionary describing all tables and data sources.

    Returns:
        A dictionary with two keys. "tables" holds one entry per pipeline returned by
        `get_pipelines()`, made of the table name and its CSV URL. "sources" holds one
        entry per item yielded by `iter_data_sources()`, made of the source config
        extended with its index, table name and UUID.
    """
    # Add each of the tables into the metadata file
    table_entries = []
    for pipeline in get_pipelines():
        fname = pipeline.table
        table_entries.append(
            {
                "name": fname,
                "csv_url": f"https://storage.googleapis.com/covid19-open-data/v3/{fname}.csv",
                # TODO: discover the generation ID of the file and add it to the metadata
            }
        )

    # Add all the data sources to the metadata file. The sources are fully enumerated
    # before any UUID is computed.
    enumerated_sources = list(enumerate(iter_data_sources()))
    source_entries = []
    for idx, (pipeline, src) in enumerated_sources:
        source_entries.append(
            dict(src.config, index=idx, table=pipeline.table, uuid=src.uuid(pipeline.table))
        )

    return {"tables": table_entries, "sources": source_entries}
def _test_make_main_table_helper(self, main_table_path: Path, column_adapter: Dict[str, str]):
    """
    Run the shared assertions against the main table stored at `main_table_path`.

    Arguments:
        main_table_path: Location of the main table file to verify.
        column_adapter: Mapping from source column names to the names expected in the main
            table. Schema columns without an entry are skipped; index table columns without
            an entry are looked up under their own name.
    """
    main_table = read_table(main_table_path, schema=SCHEMA)

    # Verify that all (adapted) columns from all tables exist
    for pipeline in get_pipelines():
        for schema_column in pipeline.schema.keys():
            column_name = column_adapter.get(schema_column)
            if column_name is None:
                continue
            self.assertTrue(
                column_name in main_table.columns,
                f"Column {column_name} missing from main table",
            )

    # Main table should follow a lexical sort (outside of header)
    main_table_records = list(read_lines(main_table_path))[1:]
    self.assertListEqual(main_table_records, sorted(main_table_records))

    # Make sure that all columns present in the index table are in the main table
    main_table_columns = set(get_table_columns(main_table_path))
    index_table_columns = set(get_table_columns(SRC / "test" / "data" / "index.csv"))
    for index_column in index_table_columns:
        column = column_adapter.get(index_column, index_column)
        self.assertTrue(column in main_table_columns, f"{column} not in main")

    # Make the main table easier to deal with since we optimize for memory usage
    if "location_key" in main_table.columns:
        location_key = "location_key"
    else:
        location_key = "key"
    main_table.set_index(location_key, inplace=True)
    main_table["date"] = main_table["date"].astype(str)

    # Define sets of columns to check: those whose first underscore-separated token is one
    # of the known prefixes and whose name does not contain "age"
    column_prefixes = ("new", "total", "cumulative")

    def column_filter(col):
        return col.split("_")[0] in column_prefixes and "age" not in col

    columns = [col for col in main_table.columns if column_filter(col)]
    distinct_prefixes = {col.split("_")[0] for col in columns}
    self.assertGreaterEqual(len(distinct_prefixes), 2)
    main_table = main_table[["date"] + columns]

    # Spot checks, in order: Country of Andorra, State of New South Wales, Alachua County
    for spot_check_key in ("AD", "AU_NSW", "US_FL_12001"):
        self._spot_check_subset(main_table, spot_check_key, "2020-09-01", "2020-12-31")
def test_config_metadata(self):
    """
    Check that every data source config declares all the required metadata fields,
    including the licensing information.
    """
    required_metadata = ("label", "website", "license", "license_url")

    for pipeline in get_pipelines():
        for data_source in pipeline.data_sources:
            # Keys declared by this data source's config
            declared_keys = data_source.config.keys()
            for meta in required_metadata:
                err_msg = f"{meta} missing in {data_source.name} ({pipeline.name})"
                self.assertIn(meta, declared_keys, err_msg)
def test_make_main_table(self):
    """
    Build the main table from the test data tables inside a temporary directory and verify
    its columns, its row ordering and a few location subsets.
    """
    with TemporaryDirectory() as tmp_dir_name:
        workdir = Path(tmp_dir_name)

        # Copy all test tables into the temporary directory
        copy_tables(SRC / "test" / "data", workdir)

        # Create the main table
        main_table_path = workdir / "main.csv"
        make_main_table(workdir, main_table_path)
        main_table = read_table(main_table_path, schema=SCHEMA)

        # Verify that all columns from all non-excluded tables exist
        for pipeline in get_pipelines():
            if pipeline.table not in EXCLUDE_FROM_MAIN_TABLE:
                for column_name in pipeline.schema.keys():
                    self.assertTrue(
                        column_name in main_table.columns,
                        f"Column {column_name} missing from main table",
                    )

        # Main table should follow a lexical sort (outside of header)
        main_table_records = list(read_lines(main_table_path))[1:]
        self.assertListEqual(main_table_records, sorted(main_table_records))

        # Make the main table easier to deal with since we optimize for memory usage
        main_table.set_index("key", inplace=True)
        main_table["date"] = main_table["date"].astype(str)

        # Define sets of columns to check
        epi_basic = ["new_confirmed", "total_confirmed", "new_deceased", "total_deceased"]

        # Spot checks as (location key, first date) pairs, all ending on the same date
        spot_checks = (
            ("AD", "2020-03-02"),  # Country of Andorra
            ("AU_NSW", "2020-01-25"),  # State of New South Wales
            ("US_FL_12001", "2020-03-10"),  # Alachua County
        )
        for spot_check_key, first_date in spot_checks:
            self._spot_check_subset(
                main_table, spot_check_key, epi_basic, first_date, "2020-09-01"
            )
def test_tables_config(self):
    """
    Guard against drastic changes in the configuration files, such as an empty config.yaml,
    by requiring a minimum number of data sources per table.

    The expected counts need to be periodically updated as data sources are added to or
    removed from each table's config.yaml file. Tables without an explicit expectation must
    have at least one data source.
    """
    expected_source_counts = {
        "epidemiology": 50,
        "hospitalizations": 20,
        "by-age": 10,
        "by-sex": 10,
    }

    for pipeline in get_pipelines():
        actual_count = len(pipeline.data_sources)
        minimum_count = expected_source_counts.get(pipeline.table, 1)
        self.assertGreaterEqual(actual_count, minimum_count)
def schedule_all_jobs(project_id: str, location_id: str, time_zone: str) -> None:
    """
    Clears all previously scheduled jobs and schedules all necessary jobs for the current
    configuration.

    Arguments:
        project_id: Forwarded to `clear_jobs` and to every `schedule_job` call.
        location_id: Forwarded to `clear_jobs` and to every `schedule_job` call.
        time_zone: Forwarded to every `schedule_job` call.
    """
    client = scheduler_v1.CloudSchedulerClient()

    # Bind the parameters shared by every job so only path and schedule vary per call
    submit_job = partial(
        schedule_job,
        client=client,
        project_id=project_id,
        location_id=location_id,
        time_zone=time_zone,
    )

    # Clear all pre-existing jobs
    clear_jobs(client=client, project_id=project_id, location_id=location_id)

    # Cache pull job runs hourly
    submit_job(schedule="0 * * * *", path="/cache_pull")

    # The job that publishes data into the prod bucket runs every 4 hours, offset by
    # 30 minutes to let other hourly tasks finish
    submit_job(path="/publish", schedule="30 */4 * * *")

    # Converting the outputs to JSON is less critical but also slow so it's run separately.
    # The task is split in two because otherwise it takes too long. Both halves are offset
    # by 30 minutes to run after publishing.
    for convert_path in ("/convert_json_1", "/convert_json_2"):
        submit_job(path=convert_path, schedule="0 1-23/4 * * *")

    # Get new errors once a day at midday.
    submit_job(path="/report_errors_to_github", schedule="0 12 * * *")

    # Keep track of the different job groups to only output them once
    scheduled_urls = set()
    for data_pipeline in get_pipelines():

        # The job that combines data sources into a table runs hourly, offset by 15 minutes
        # to let other hourly tasks finish
        submit_job(
            path=f"/combine_table?table={data_pipeline.table}",
            schedule="15 * * * *",
        )

        for idx, data_source in enumerate(data_pipeline.data_sources):
            automation_opts = data_source.config.get("automation", {})

            # The job to pull each individual data source runs hourly unless specified
            # otherwise
            job_sched = automation_opts.get("schedule", "0 * * * *")

            # Each data source has a job group. All data sources within the same job group
            # are run as part of the same job in series. The default job group is the index
            # of the data source.
            job_group = automation_opts.get("job_group", idx)
            job_url = f"/update_table?table={data_pipeline.table}&job_group={job_group}"

            if job_url in scheduled_urls:
                continue
            scheduled_urls.add(job_url)
            submit_job(path=job_url, schedule=job_sched)
def schedule_all_jobs(project_id: str, location_id: str, time_zone: str) -> None:
    """
    Clears all previously scheduled jobs and schedules all necessary jobs for the current
    configuration.

    Arguments:
        project_id: Forwarded to `clear_jobs` and to every `schedule_job` call.
        location_id: Forwarded to `clear_jobs` and to every `schedule_job` call.
        time_zone: Forwarded to every `schedule_job` call.
    """
    client = scheduler_v1.CloudSchedulerClient()

    # Bind the parameters shared by every job so only path and schedule vary per call
    submit_job = partial(
        schedule_job,
        client=client,
        project_id=project_id,
        location_id=location_id,
        time_zone=time_zone,
    )

    # Clear all pre-existing jobs
    clear_jobs(client=client, project_id=project_id, location_id=location_id)

    # Read the list of all known locations, since we will be splitting some jobs based on that
    location_keys = list(table_read_column(SRC / "data" / "metadata.csv", "key"))

    # Cache pull job runs hourly
    submit_job(schedule="0 * * * *", path="/cache_pull")

    # The job that publishes combined tables into the prod bucket runs every 2 hours, offset
    # by 30 minutes to let other hourly tasks finish
    submit_job(path="/publish_tables", schedule="30 */2 * * *")

    # The job that publishes aggregate outputs runs every 4 hours in a separate, preemptible
    # instance, offset by 60 minutes to let other hourly tasks finish
    submit_job(path="/deferred/publish_main_table", schedule="0 1-23/4 * * *")

    # The job that publishes breakdown outputs runs every 4 hours, offset by 90 minutes to
    # run after publishing
    submit_job(path="/deferred/publish_subset_tables", schedule="30 1-23/4 * * *")

    # Converting the outputs to JSON is less critical but also slow so it's run separately
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = (
            f"prod_folder=v2&location_key_from={subset[0]}&location_key_until={subset[-1]}"
        )
        # Offset by 120 minutes to run after subset tables are published
        submit_job(path=f"/deferred/publish_json?{job_params}", schedule="0 2-23/4 * * *")

    # Get new errors once a day at midday.
    submit_job(path="/report_errors_to_github", schedule="0 12 * * *")

    # Keep track of the different job groups to only output them once
    scheduled_urls = set()
    for data_pipeline in get_pipelines():

        # The job that combines data sources into a table runs hourly, offset by 15 minutes
        # to let other hourly tasks finish
        submit_job(
            path=f"/deferred/combine_table?table={data_pipeline.table}",
            schedule="15 * * * *",
        )

        for idx, data_source in enumerate(data_pipeline.data_sources):
            automation_opts = data_source.config.get("automation", {})

            # The job to pull each individual data source runs hourly unless specified
            # otherwise
            job_sched = automation_opts.get("schedule", "0 * * * *")

            # If the job is deferred, then prepend the token to the path
            if automation_opts.get("deferred"):
                job_prefix = "/deferred"
            else:
                job_prefix = ""

            # Each data source has a job group. All data sources within the same job group
            # are run as part of the same job in series. The default job group is the index
            # of the data source.
            job_group = automation_opts.get("job_group", idx)
            job_url = (
                f"{job_prefix}/update_table?table={data_pipeline.table}&job_group={job_group}"
            )

            if job_url in scheduled_urls:
                continue
            scheduled_urls.add(job_url)
            submit_job(path=job_url, schedule=job_sched)

    # V3 publish jobs start here

    # Publish the tables with all location keys every 2 hours, offset by 30 minutes to let
    # other hourly tasks finish
    submit_job(path="/deferred/publish_v3_global_tables", schedule="30 */2 * * *")

    # Publish the main aggregated table every 2 hours, offset by 60 minutes to let other
    # hourly tasks finish
    submit_job(path="/deferred/publish_v3_main_table", schedule="0 1-23/2 * * *")

    # Break down the outputs by location key every 2 hours, and execute the job in chunks
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"location_key_from={subset[0]}&location_key_until={subset[-1]}"
        # Offset by 60 minutes to let other hourly tasks finish
        submit_job(
            path=f"/deferred/publish_v3_location_subsets?{job_params}",
            schedule="0 1-23/2 * * *",
        )

    # Publish outputs in JSON format every 2 hours, and execute the job in chunks
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = (
            f"prod_folder=v3&location_key_from={subset[0]}&location_key_until={subset[-1]}"
        )
        # Offset by 90 minutes to let other hourly tasks finish
        submit_job(path=f"/deferred/publish_json?{job_params}", schedule="30 1-23/2 * * *")
min(dates), "last_date": max(dates), "location_keys": ",".join(sorted(set(table_read_column(output_path, "key")))), } except Exception as exc: print(exc, file=sys.stderr) return [] def get_source_outputs( data_pipelines: Iterable[DataPipeline]) -> Iterable[Dict]: """Map a list of pipeline names to their source configs.""" for data_pipeline in tqdm(list(data_pipelines)): # print(f"Processing {data_pipeline.name}") map_iter = data_pipeline.data_sources map_func = partial(read_source_output, data_pipeline) map_opts = dict(desc="Downloading data tables", leave=False) yield from thread_map(map_func, map_iter, **map_opts) if __name__ == "__main__": # To authenticate with Cloud locally, run the following commands: # > $env:GOOGLE_CLOUD_PROJECT = "github-open-covid-19" # > $env:GCS_SERVICE_ACCOUNT = "*****@*****.**" # > $env:GCP_TOKEN = $(gcloud auth application-default print-access-token) results = DataFrame(get_source_outputs(get_pipelines())) results.to_csv(index=False)