Example #1
def create_metadata_dict() -> Dict[str, Any]:
    meta: Dict[str, Any] = {"tables": [], "sources": []}

    # Add each of the tables into the metadata file
    for pipeline in get_pipelines():
        fname = pipeline.table
        meta["tables"].append({
            "name":
            fname,
            "csv_url":
            f"https://storage.googleapis.com/covid19-open-data/v3/{fname}.csv",
            # TODO: discover the generation ID of the file and add it to the metadata
        })

    # Add all the data sources to the metadata file
    sources = [(idx, pipeline, src)
               for idx, (pipeline, src) in enumerate(iter_data_sources())]
    meta["sources"] = [
        dict(src.config,
             index=idx,
             table=pipeline.table,
             uuid=src.uuid(pipeline.table)) for idx, pipeline, src in sources
    ]

    return meta
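
A minimal usage sketch (not part of the original source) showing how the resulting dictionary could be persisted; the helper name and the "metadata.json" file name are assumptions made only for illustration.

import json
from pathlib import Path


def write_metadata_file(output_path: Path) -> None:
    # Serialize the metadata dictionary produced by create_metadata_dict() above
    metadata = create_metadata_dict()
    with open(output_path, "w") as fh:
        json.dump(metadata, fh, indent=2)


# Example invocation (hypothetical output location):
# write_metadata_file(Path("metadata.json"))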
Example #2
    def _test_make_main_table_helper(self, main_table_path: Path,
                                     column_adapter: Dict[str, str]):
        main_table = read_table(main_table_path, schema=SCHEMA)

        # Verify that all columns from all tables exist
        for pipeline in get_pipelines():
            for column_name in pipeline.schema.keys():
                column_name = column_adapter.get(column_name)
                if column_name is not None:
                    self.assertTrue(
                        column_name in main_table.columns,
                        f"Column {column_name} missing from main table",
                    )

        # Main table should follow a lexical sort (excluding the header row)
        main_table_records = []
        for line in read_lines(main_table_path):
            main_table_records.append(line)
        main_table_records = main_table_records[1:]
        self.assertListEqual(main_table_records,
                             list(sorted(main_table_records)))

        # Make sure that all columns present in the index table are in the main table
        main_table_columns = set(get_table_columns(main_table_path))
        index_table_columns = set(
            get_table_columns(SRC / "test" / "data" / "index.csv"))
        for column in index_table_columns:
            column = column_adapter.get(column, column)
            self.assertTrue(column in main_table_columns,
                            f"{column} not in main")

        # Make the main table easier to deal with since we optimize for memory usage
        location_key = "location_key" if "location_key" in main_table.columns else "key"
        main_table.set_index(location_key, inplace=True)
        main_table["date"] = main_table["date"].astype(str)

        # Define sets of columns to check
        column_prefixes = ("new", "total", "cumulative")
        column_filter = lambda col: col.split("_")[
            0] in column_prefixes and "age" not in col
        columns = list(filter(column_filter, main_table.columns))
        self.assertGreaterEqual(len({col.split("_")[0] for col in columns}), 2)
        main_table = main_table[["date"] + columns]

        # Spot check: Country of Andorra
        self._spot_check_subset(main_table, "AD", "2020-09-01", "2020-12-31")

        # Spot check: State of New South Wales
        self._spot_check_subset(main_table, "AU_NSW", "2020-09-01",
                                "2020-12-31")

        # Spot check: Alachua County
        self._spot_check_subset(main_table, "US_FL_12001", "2020-09-01",
                                "2020-12-31")
Example #3
    def test_config_metadata(self):
        """
        This test verifies that all the required metadata is present in the data source config,
        including licensing information.
        """
        required_metadata = ["label", "website", "license", "license_url"]
        for pipeline in get_pipelines():
            for data_source in pipeline.data_sources:
                for meta in required_metadata:
                    err_msg = f"{meta} missing in {data_source.name} ({pipeline.name})"
                    self.assertIn(meta, data_source.config.keys(), err_msg)
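
Illustrative only: a data source config.yaml entry is expected to carry at least the metadata fields checked above. Expressed as the equivalent Python dictionary, with placeholder values that are not taken from any real config:

example_source_config = {
    "label": "Example Health Ministry",           # human-readable name of the source
    "website": "https://example.org/covid-data",  # landing page for the data
    "license": "CC BY 4.0",                       # license under which the data is published
    "license_url": "https://creativecommons.org/licenses/by/4.0/",
}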
Example #4
    def test_make_main_table(self):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            # Copy all test tables into the temporary directory
            copy_tables(SRC / "test" / "data", workdir)

            # Create the main table
            main_table_path = workdir / "main.csv"
            make_main_table(workdir, main_table_path)
            main_table = read_table(main_table_path, schema=SCHEMA)

            # Verify that all columns from all tables exist
            for pipeline in get_pipelines():
                if pipeline.table in EXCLUDE_FROM_MAIN_TABLE:
                    continue
                for column_name in pipeline.schema.keys():
                    self.assertTrue(
                        column_name in main_table.columns,
                        f"Column {column_name} missing from main table",
                    )

            # Main table should follow a lexical sort (excluding the header row)
            main_table_records = []
            for line in read_lines(main_table_path):
                main_table_records.append(line)
            main_table_records = main_table_records[1:]
            self.assertListEqual(main_table_records,
                                 list(sorted(main_table_records)))

            # Make the main table easier to deal with since we optimize for memory usage
            main_table.set_index("key", inplace=True)
            main_table["date"] = main_table["date"].astype(str)

            # Define sets of columns to check
            epi_basic = [
                "new_confirmed", "total_confirmed", "new_deceased",
                "total_deceased"
            ]

            # Spot check: Country of Andorra
            self._spot_check_subset(main_table, "AD", epi_basic, "2020-03-02",
                                    "2020-09-01")

            # Spot check: State of New South Wales
            self._spot_check_subset(main_table, "AU_NSW", epi_basic,
                                    "2020-01-25", "2020-09-01")

            # Spot check: Alachua County
            self._spot_check_subset(main_table, "US_FL_12001", epi_basic,
                                    "2020-03-10", "2020-09-01")
Example #5
    def test_tables_config(self):
        """
        This test needs to be periodically updated as we add data sources to or remove them from
        the tables' config.yaml files. Its main purpose is to ensure that there are no drastic
        changes in the configuration files, such as an empty config.yaml.
        """
        expected_source_counts = {
            "epidemiology": 50,
            "hospitalizations": 20,
            "by-age": 10,
            "by-sex": 10,
        }

        for pipeline in get_pipelines():
            data_sources = pipeline.data_sources
            expected_count = expected_source_counts.get(pipeline.table, 1)
            self.assertGreaterEqual(len(data_sources), expected_count)
Example #6
def schedule_all_jobs(project_id: str, location_id: str,
                      time_zone: str) -> None:
    """
    Clears all previously scheduled jobs and schedules all necessary jobs for the current
    configuration.
    """
    client = scheduler_v1.CloudSchedulerClient()

    # Create a custom method with our parameters for ease of use
    _schedule_job = partial(
        schedule_job,
        client=client,
        project_id=project_id,
        location_id=location_id,
        time_zone=time_zone,
    )

    # Clear all pre-existing jobs
    clear_jobs(client=client, project_id=project_id, location_id=location_id)

    # Cache pull job runs hourly
    _schedule_job(schedule="0 * * * *", path="/cache_pull")

    # The job that publishes data into the prod bucket runs every 4 hours
    _schedule_job(
        path="/publish",
        # Offset by 30 minutes to let other hourly tasks finish
        schedule="30 */4 * * *",
    )

    # Converting the outputs to JSON is less critical but also slow, so it's run separately
    _schedule_job(
        path="/convert_json_1",
        # Offset by 30 minutes to run after publishing
        schedule="0 1-23/4 * * *",
    )

    # The convert to JSON task is split in two because otherwise it takes too long
    _schedule_job(
        path="/convert_json_2",
        # Offset by 30 minutes to run after publishing
        schedule="0 1-23/4 * * *",
    )

    # Get new errors once a day at midday.
    _schedule_job(path="/report_errors_to_github", schedule="0 12 * * *")

    # Keep track of job URLs already seen so that each job group is only scheduled once
    job_urls_seen = set()

    for data_pipeline in get_pipelines():
        # The job that combines data sources into a table runs hourly
        _schedule_job(
            path=f"/combine_table?table={data_pipeline.table}",
            # Offset by 15 minutes to let other hourly tasks finish
            schedule="15 * * * *",
        )

        for idx, data_source in enumerate(data_pipeline.data_sources):
            # The job to pull each individual data source runs hourly unless specified otherwise
            job_sched = data_source.config.get("automation",
                                               {}).get("schedule", "0 * * * *")

            # Each data source has a job group. All data sources within the same job group are run
            # as part of the same job in series. The default job group is the index of the data
            # source.
            job_group = data_source.config.get("automation",
                                               {}).get("job_group", idx)
            job_url = f"/update_table?table={data_pipeline.table}&job_group={job_group}"

            if job_url not in job_urls_seen:
                job_urls_seen.add(job_url)
                _schedule_job(path=job_url, schedule=job_sched)
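
A hypothetical sketch of the schedule_job and clear_jobs helpers used above; the real implementations are not shown in these examples. It assumes the jobs target an App Engine service via AppEngineHttpTarget, which may differ from the actual setup.

import re
from google.cloud import scheduler_v1


def clear_jobs(client: scheduler_v1.CloudSchedulerClient, project_id: str,
               location_id: str) -> None:
    """Delete every job currently registered for the given project and location."""
    parent = f"projects/{project_id}/locations/{location_id}"
    for job in client.list_jobs(parent=parent):
        client.delete_job(name=job.name)


def schedule_job(path: str, schedule: str, client: scheduler_v1.CloudSchedulerClient,
                 project_id: str, location_id: str, time_zone: str) -> None:
    """Create a cron job that issues a request against <path> on the given schedule."""
    parent = f"projects/{project_id}/locations/{location_id}"
    # Job IDs may only contain letters, digits, hyphens and underscores
    job_id = re.sub(r"[^a-zA-Z0-9_-]", "_", path.strip("/"))
    job = scheduler_v1.Job(
        name=f"{parent}/jobs/{job_id}",
        schedule=schedule,
        time_zone=time_zone,
        # Assumption: requests are routed to the default App Engine service
        app_engine_http_target=scheduler_v1.AppEngineHttpTarget(relative_uri=path),
    )
    client.create_job(parent=parent, job=job)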
Example #7
def schedule_all_jobs(project_id: str, location_id: str,
                      time_zone: str) -> None:
    """
    Clears all previously scheduled jobs and schedules all necessary jobs for the current
    configuration.
    """
    client = scheduler_v1.CloudSchedulerClient()

    # Create a custom method with our parameters for ease of use
    _schedule_job = partial(
        schedule_job,
        client=client,
        project_id=project_id,
        location_id=location_id,
        time_zone=time_zone,
    )

    # Clear all pre-existing jobs
    clear_jobs(client=client, project_id=project_id, location_id=location_id)

    # Read the list of all known locations, since we will be splitting some jobs based on that
    location_keys = list(
        table_read_column(SRC / "data" / "metadata.csv", "key"))

    # Cache pull job runs hourly
    _schedule_job(schedule="0 * * * *", path="/cache_pull")

    # The job that publishes combined tables into the prod bucket runs every 2 hours
    _schedule_job(
        path="/publish_tables",
        # Offset by 30 minutes to let other hourly tasks finish
        schedule="30 */2 * * *",
    )

    # The job that publishes aggregate outputs runs every 4 hours
    _schedule_job(
        # Run in a separate, preemptible instance
        path="/deferred/publish_main_table",
        # Offset by 60 minutes to let other hourly tasks finish
        schedule="0 1-23/4 * * *",
    )

    # The job that publishes breakdown outputs runs every 4 hours
    _schedule_job(
        path="/deferred/publish_subset_tables",
        # Offset by 90 minutes to run after publishing
        schedule="30 1-23/4 * * *",
    )

    # Converting the outputs to JSON is less critical but also slow, so it's run separately
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"prod_folder=v2&location_key_from={subset[0]}&location_key_until={subset[-1]}"
        _schedule_job(
            path=f"/deferred/publish_json?{job_params}",
            # Offset by 120 minutes to run after subset tables are published
            schedule="0 2-23/4 * * *",
        )

    # Get new errors once a day at midday.
    _schedule_job(path="/report_errors_to_github", schedule="0 12 * * *")

    # Keep track of job URLs already seen so that each job group is only scheduled once
    job_urls_seen = set()

    for data_pipeline in get_pipelines():
        # The job that combines data sources into a table runs hourly
        _schedule_job(
            path=f"/deferred/combine_table?table={data_pipeline.table}",
            # Offset by 15 minutes to let other hourly tasks finish
            schedule="15 * * * *",
        )

        for idx, data_source in enumerate(data_pipeline.data_sources):
            automation_opts = data_source.config.get("automation", {})

            # The job to pull each individual data source runs hourly unless specified otherwise
            job_sched = automation_opts.get("schedule", "0 * * * *")

            # If the job is deferred, then prepend the token to the path
            job_prefix = "/deferred" if automation_opts.get("deferred") else ""

            # Each data source has a job group. All data sources within the same job group are run
            # as part of the same job in series. The default job group is the index of the data
            # source.
            job_group = automation_opts.get("job_group", idx)
            job_url = f"{job_prefix}/update_table?table={data_pipeline.table}&job_group={job_group}"

            if job_url not in job_urls_seen:
                job_urls_seen.add(job_url)
                _schedule_job(path=job_url, schedule=job_sched)

    # V3 publish jobs start here

    # Publish the tables with all location keys every 2 hours
    _schedule_job(
        path="/deferred/publish_v3_global_tables",
        # Offset by 30 minutes to let other hourly tasks finish
        schedule="30 */2 * * *",
    )

    # Publish the main aggregated table every 2 hours
    _schedule_job(
        path="/deferred/publish_v3_main_table",
        # Offset by 60 minutes to let other hourly tasks finish
        schedule="0 1-23/2 * * *",
    )

    # Break down the outputs by location key every 2 hours, and execute the job in chunks
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"location_key_from={subset[0]}&location_key_until={subset[-1]}"
        _schedule_job(
            path=f"/deferred/publish_v3_location_subsets?{job_params}",
            # Offset by 60 minutes to let other hourly tasks finish
            schedule="0 1-23/2 * * *",
        )

    # Publish outputs in JSON format every 2 hours, and execute the job in chunks
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"prod_folder=v3&location_key_from={subset[0]}&location_key_until={subset[-1]}"
        _schedule_job(
            path=f"/deferred/publish_json?{job_params}",
            # Offset by 90 minutes to let other hourly tasks finish
            schedule="30 1-23/2 * * *",
        )
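
A minimal sketch of the _split_into_subsets helper referenced above; the actual implementation is not included in this example. The assumption is that it partitions the sorted location keys into at most bin_count contiguous chunks, so that each deferred job handles the key range [subset[0], subset[-1]].

import math
from typing import Iterable, Iterator, List


def _split_into_subsets(location_keys: Iterable[str],
                        bin_count: int) -> Iterator[List[str]]:
    # Sort the keys so each chunk covers a contiguous, lexically ordered range
    keys = sorted(location_keys)
    bin_size = max(1, math.ceil(len(keys) / bin_count))
    for idx in range(0, len(keys), bin_size):
        yield keys[idx:idx + bin_size]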
Example #8
                min(dates),
                "last_date":
                max(dates),
                "location_keys":
                ",".join(sorted(set(table_read_column(output_path, "key")))),
            }
        except Exception as exc:
            print(exc, file=sys.stderr)
            return []


def get_source_outputs(
        data_pipelines: Iterable[DataPipeline]) -> Iterable[Dict]:
    """Map a list of pipeline names to their source configs."""

    for data_pipeline in tqdm(list(data_pipelines)):
        # print(f"Processing {data_pipeline.name}")
        map_iter = data_pipeline.data_sources
        map_func = partial(read_source_output, data_pipeline)
        map_opts = dict(desc="Downloading data tables", leave=False)
        yield from thread_map(map_func, map_iter, **map_opts)


if __name__ == "__main__":
    # To authenticate with Cloud locally, run the following commands:
    # > $env:GOOGLE_CLOUD_PROJECT = "github-open-covid-19"
    # > $env:GCS_SERVICE_ACCOUNT = "*****@*****.**"
    # > $env:GCP_TOKEN = $(gcloud auth application-default print-access-token)
    results = DataFrame(get_source_outputs(get_pipelines()))
    results.to_csv(sys.stdout, index=False)
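
For reference, an illustrative snippet (not part of the original script) showing how the same report could be produced for a single pipeline and written to standard output; the pipeline name "epidemiology" is assumed purely as an example.

selected = [pipeline for pipeline in get_pipelines() if pipeline.name == "epidemiology"]
DataFrame(get_source_outputs(selected)).to_csv(sys.stdout, index=False)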