Example #1
def read_source_output(data_pipeline: DataPipeline,
                       data_source: DataSource) -> Dict[str, Any]:
    with temporary_directory() as workdir:
        output_path = workdir / f"{data_source.uuid(data_pipeline.table)}.csv"
        try:
            download_file(GCS_BUCKET_TEST, f"intermediate/{output_path.name}",
                          output_path)
            columns = get_table_columns(output_path)
            dates = list(table_read_column(
                output_path, "date")) if "date" in columns else [None]
            return {
                "pipeline": data_pipeline.name,
                "data_source": f"{data_source.__module__}.{data_source.name}",
                "columns": ",".join(columns),
                "first_date": min(dates),
                "last_date": max(dates),
                "location_keys": ",".join(sorted(set(table_read_column(output_path, "key")))),
            }
        except Exception as exc:
            print(exc, file=sys.stderr)
            return {}
Example #2
def _latest_date_by_group(tables_folder: Path, group_by: str = "location_key") -> Dict[str, str]:
    """Return the most recent date seen for each `group_by` value across all CSV tables."""
    groups: Dict[str, str] = {}
    for table_file in tables_folder.glob("*.csv"):
        table_columns = get_table_columns(table_file)
        if "date" in table_columns:
            iter1 = table_read_column(table_file, "date")
            iter2 = table_read_column(table_file, group_by)
            for date, key in zip(iter1, iter2):
                groups[key] = max(groups.get(key, date), date)
    return groups
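A rough standalone illustration of the same idea, using only the standard library in place of the project's get_table_columns and table_read_column helpers (a hypothetical sketch, not the project's actual code; ISO-formatted dates compare correctly as plain strings):

import csv
from pathlib import Path
from typing import Dict

def latest_date_by_group_sketch(tables_folder: Path, group_by: str = "location_key") -> Dict[str, str]:
    # Hypothetical re-implementation: scan every CSV table and keep the maximum
    # date observed for each value of the `group_by` column.
    groups: Dict[str, str] = {}
    for table_file in tables_folder.glob("*.csv"):
        with open(table_file, newline="") as fd:
            reader = csv.DictReader(fd)
            fields = reader.fieldnames or []
            if "date" not in fields or group_by not in fields:
                continue
            for row in reader:
                key, date = row[group_by], row["date"]
                groups[key] = max(groups.get(key, date), date)
    return groups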
Example #3
def _make_location_key_and_date_table(index_table: Path,
                                      output_path: Path) -> None:
    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Make sure that there is an index table present
        assert index_table.exists(), "Index table not found"

        # Index table will determine if we use "key" or "location_key" as column name
        index_columns = get_table_columns(index_table)
        location_key = "location_key" if "location_key" in index_columns else "key"

        # Create a single-column table with only the keys
        keys_table_path = workdir / "location_keys.csv"
        with open(keys_table_path, "w") as fd:
            fd.write(f"{location_key}\n")
            fd.writelines(
                f"{value}\n"
                for value in table_read_column(index_table, location_key))

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_table_path = workdir / "dates.csv"
        with open(date_table_path, "w") as fd:
            fd.write("date\n")
            fd.writelines(f"{value}\n"
                          for value in date_range("2020-01-01", max_date))

        # Output all combinations of <key x date>
        table_cross_product(keys_table_path, date_table_path, output_path)
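table_cross_product is a project helper whose implementation is not shown here; assuming it simply writes every <key x date> combination, a hypothetical standalone equivalent could be:

import csv
import itertools
from pathlib import Path

def cross_product_sketch(left_path: Path, right_path: Path, output_path: Path) -> None:
    # Hypothetical stand-in for table_cross_product: write every combination of
    # data rows from the two single-column input tables under a combined header.
    with open(left_path, newline="") as left_fd, open(right_path, newline="") as right_fd:
        left_rows = list(csv.reader(left_fd))
        right_rows = list(csv.reader(right_fd))
    with open(output_path, "w", newline="") as out_fd:
        writer = csv.writer(out_fd)
        writer.writerow(left_rows[0] + right_rows[0])
        for left_row, right_row in itertools.product(left_rows[1:], right_rows[1:]):
            writer.writerow(left_row + right_row)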
Example #4
def publish_v3_main_table() -> Response:
    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Get a list of valid location keys
        location_keys = list(
            table_read_column(SRC / "data" / "metadata.csv", "key"))

        # Download all the location breakout tables into our local storage
        download_folder(GCS_BUCKET_PROD, "v3", input_folder,
                        lambda x: "location/" in str(x))
        logger.log_info(
            f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files"
        )

        # Create the aggregated table and put it in a compressed file
        agg_file_path = output_folder / "aggregated.csv.gz"
        with gzip.open(agg_file_path, "wt") as compressed_file:
            merge_location_breakout_tables(input_folder, compressed_file,
                                           location_keys)

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v3", output_folder)

    return Response("OK", status=200)
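merge_location_breakout_tables is likewise a project helper; a minimal sketch, assuming it concatenates the per-location CSV files under a single shared header, that every breakout file shares the same column layout, and that the location key can be inferred from the file name, might look like this:

import csv
from pathlib import Path
from typing import Iterable, Optional, TextIO

def merge_breakout_tables_sketch(input_folder: Path, output_file: TextIO,
                                 location_keys: Optional[Iterable[str]] = None) -> None:
    # Hypothetical stand-in: concatenate all per-location CSV files, writing the
    # header only once and skipping files whose name is not a requested key.
    keys = set(location_keys) if location_keys is not None else None
    writer = csv.writer(output_file)
    header_written = False
    for table_file in sorted(input_folder.glob("**/*.csv")):
        if keys is not None and table_file.stem not in keys:
            continue
        with open(table_file, newline="") as fd:
            reader = csv.reader(fd)
            header = next(reader, None)
            if header is None:
                continue
            if not header_written:
                writer.writerow(header)
                header_written = True
            writer.writerows(reader)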
Example #5
def main(output_folder: Path,
         tables_folder: Path,
         use_table_names: List[str] = None) -> None:
    """
    This script takes the processed outputs located in `tables_folder` and publishes them into the
    output folder by performing the following operations:

        1. Copy all the tables from `tables_folder` to `output_folder`, renaming fields if
           necessary.
        2. Create different slices of data, such as the latest known record for each region, files
           for the last day of data, files for each individual region.
        3. Produce a main table, created by iteratively performing left outer joins on all other
           tables for each slice of data (but not for the global tables).
    """
    # Wipe the output folder first
    for item in output_folder.glob("*"):
        if item.name.startswith("."):
            continue
        if item.is_file():
            item.unlink()
        else:
            shutil.rmtree(item)

    # Create the folder which will be published using a stable schema
    output_folder = output_folder / "v3"
    output_folder.mkdir(exist_ok=True, parents=True)

    # Publish the tables containing all location keys
    publish_global_tables(tables_folder, output_folder)

    # Create a temporary folder which will host all the location breakouts
    with temporary_directory() as breakout_folder:

        # Break out each table into separate folders based on the location key
        publish_location_breakouts(output_folder,
                                   breakout_folder,
                                   use_table_names=use_table_names)

        # Create a folder which will host all the location aggregates
        location_aggregates_folder = output_folder / "location"
        location_aggregates_folder.mkdir(exist_ok=True, parents=True)

        # Aggregate the tables for each location independently
        location_keys = table_read_column(output_folder / "index.csv",
                                          "location_key")
        publish_location_aggregates(
            breakout_folder,
            location_aggregates_folder,
            location_keys,
            use_table_names=use_table_names,
        )

    # Create the aggregated table and put it in a compressed file
    agg_file_path = output_folder / "aggregated.csv.gz"
    with gzip.open(agg_file_path, "wt") as compressed_file:
        merge_location_breakout_tables(location_aggregates_folder,
                                       compressed_file)

    # Convert all CSV files to JSON using values format
    convert_tables_to_json(output_folder, output_folder)
Example #6
def publish_json_locations(prod_folder: str = "v2",
                           location_key_from: str = None,
                           location_key_until: str = None) -> Response:
    prod_folder = _get_request_param("prod_folder", prod_folder)
    location_key_from = _get_request_param("location_key_from",
                                           location_key_from)
    location_key_until = _get_request_param("location_key_until",
                                            location_key_until)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Convert the tables to JSON for each location independently
        location_keys = list(
            table_read_column(SRC / "data" / "metadata.csv", "key"))
        if location_key_from is not None:
            location_keys = [
                key for key in location_keys if key >= location_key_from
            ]
        if location_key_until is not None:
            location_keys = [
                key for key in location_keys if key <= location_key_until
            ]
        logger.log_info(
            f"Converting {len(location_keys)} location subsets to JSON "
            f"from {location_keys[0]} until {location_keys[-1]}")

        # Download all the processed tables into our local storage
        def match_path(table_path: Path) -> bool:
            try:
                if prod_folder == "v2":
                    location_key, table_name = str(table_path).split("/", 1)
                    return table_name == "main.csv" and location_key in location_keys
                elif prod_folder == "v3":
                    location_path, location_key = table_path.parent.name, table_path.stem
                    return location_path == "location" and location_key in location_keys
                return False
            except Exception:
                return False

        download_folder(GCS_BUCKET_PROD, prod_folder, input_folder, match_path)
        logger.log_info(
            f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files"
        )

        # Convert all files to JSON
        convert_tables_to_json(input_folder, output_folder)
        converted_count = sum(1 for _ in output_folder.glob("**/*.json"))
        logger.log_info(f"Converted {converted_count} files to JSON")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, prod_folder, output_folder)

    return Response("OK", status=200)
Example #7
def publish_v3_location_subsets(
    location_key_from: str = None, location_key_until: str = None
) -> Response:
    location_key_from = _get_request_param("location_key_from", location_key_from)
    location_key_until = _get_request_param("location_key_until", location_key_until)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        intermediate_folder = workdir / "temp"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        location_keys = list(table_read_column(SRC / "data" / "metadata.csv", "key"))
        if location_key_from is not None:
            location_keys = [key for key in location_keys if key >= location_key_from]
        if location_key_until is not None:
            location_keys = [key for key in location_keys if key <= location_key_until]
        logger.log_info(
            f"Publishing {len(location_keys)} location subsets "
            f"from {location_keys[0]} until {location_keys[-1]}"
        )

        # Download all the global tables into our local storage
        forbid_tokens = ("/", "main.", "aggregated.")
        download_folder(
            GCS_BUCKET_PROD,
            "v3",
            input_folder,
            lambda x: x.suffix == ".csv" and all(token not in str(x) for token in forbid_tokens),
        )
        logger.log_info(f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files")

        # Break out each table into separate folders based on the location key
        publish_location_breakouts(input_folder, intermediate_folder, use_table_names=V3_TABLE_LIST)
        logger.log_info("Created all table location breakouts")

        # Create a folder which will host all the location aggregates
        location_aggregates_folder = output_folder / "location"
        location_aggregates_folder.mkdir(parents=True, exist_ok=True)

        # Aggregate the tables for each location independently
        publish_location_aggregates(
            intermediate_folder,
            location_aggregates_folder,
            location_keys,
            use_table_names=V3_TABLE_LIST,
        )
        logger.log_info("Aggregated all table breakouts by location")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v3", output_folder)

    return Response("OK", status=200)
Example #8
def publish_v3_location_subsets(location_key_from: str = None,
                                location_key_until: str = None) -> Response:
    location_key_from = _get_request_param("location_key_from",
                                           location_key_from)
    location_key_until = _get_request_param("location_key_until",
                                            location_key_until)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        intermediate_folder = workdir / "temp"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        location_keys = list(
            table_read_column(SRC / "data" / "metadata.csv", "key"))
        if location_key_from is not None:
            location_keys = [
                key for key in location_keys if key >= location_key_from
            ]
        if location_key_until is not None:
            location_keys = [
                key for key in location_keys if key <= location_key_until
            ]
        logger.log_info(f"Publishing {len(location_keys)} location subsets "
                        f"from {location_keys[0]} until {location_keys[-1]}")

        # Download all the global tables into our local storage
        download_folder(GCS_BUCKET_PROD, "v3", input_folder,
                        lambda x: "/" not in str(x))

        # Break out each table into separate folders based on the location key
        publish_location_breakouts(input_folder,
                                   intermediate_folder,
                                   use_table_names=V3_TABLE_LIST)

        # Aggregate the tables for each location independently
        publish_location_aggregates(intermediate_folder,
                                    output_folder,
                                    location_keys,
                                    use_table_names=V3_TABLE_LIST)

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v3", output_folder)

    return Response("OK", status=200)
Example #9
def schedule_all_jobs(project_id: str, location_id: str,
                      time_zone: str) -> None:
    """
    Clears all previously scheduled jobs and schedules all necessary jobs for the current
    configuration.
    """
    client = scheduler_v1.CloudSchedulerClient()

    # Create a custom method with our parameters for ease of use
    _schedule_job = partial(
        schedule_job,
        client=client,
        project_id=project_id,
        location_id=location_id,
        time_zone=time_zone,
    )

    # Clear all pre-existing jobs
    clear_jobs(client=client, project_id=project_id, location_id=location_id)

    # Read the list of all known locations, since we will be splitting some jobs based on that
    location_keys = list(
        table_read_column(SRC / "data" / "metadata.csv", "key"))

    # Cache pull job runs hourly
    _schedule_job(schedule="0 * * * *", path="/cache_pull")

    # The job that publishes combined tables into the prod bucket runs every 2 hours
    _schedule_job(
        path="/publish_tables",
        # Offset by 30 minutes to let other hourly tasks finish
        schedule="30 */2 * * *",
    )

    # The job that publishes aggregate outputs runs every 4 hours
    _schedule_job(
        # Run in a separate, preemptible instance
        path="/deferred/publish_main_table",
        # Offset by 60 minutes to let other hourly tasks finish
        schedule="0 1-23/4 * * *",
    )

    # The job that publishes breakdown outputs runs every 4 hours
    _schedule_job(
        path="/deferred/publish_subset_tables",
        # Offset by 90 minutes to run after publishing
        schedule="30 1-23/4 * * *",
    )

    # Converting the outputs to JSON is less critical but also slow so it's run separately
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"prod_folder=v2&location_key_from={subset[0]}&location_key_until={subset[-1]}"
        _schedule_job(
            path=f"/deferred/publish_json?{job_params}",
            # Offset by 120 minutes to run after subset tables are published
            schedule="0 2-23/4 * * *",
        )

    # Get new errors once a day at midday.
    _schedule_job(path="/report_errors_to_github", schedule="0 12 * * *")

    # Keep track of the different job groups to only output them once
    job_urls_seen = set()

    for data_pipeline in get_pipelines():
        # The job that combines data sources into a table runs hourly
        _schedule_job(
            path=f"/deferred/combine_table?table={data_pipeline.table}",
            # Offset by 15 minutes to let other hourly tasks finish
            schedule="15 * * * *",
        )

        for idx, data_source in enumerate(data_pipeline.data_sources):
            automation_opts = data_source.config.get("automation", {})

            # The job to pull each individual data source runs hourly unless specified otherwise
            job_sched = automation_opts.get("schedule", "0 * * * *")

            # If the job is deferred, then prepend the token to the path
            job_prefix = "/deferred" if automation_opts.get("deferred") else ""

            # Each data source has a job group. All data sources within the same job group are run
            # as part of the same job in series. The default job group is the index of the data
            # source.
            job_group = automation_opts.get("job_group", idx)
            job_url = f"{job_prefix}/update_table?table={data_pipeline.table}&job_group={job_group}"

            if job_url not in job_urls_seen:
                job_urls_seen.add(job_url)
                _schedule_job(path=job_url, schedule=job_sched)

    # V3 publish jobs start here

    # Publish the tables with all location keys every 2 hours
    _schedule_job(
        path="/deferred/publish_v3_global_tables",
        # Offset by 30 minutes to let other hourly tasks finish
        schedule="30 */2 * * *",
    )

    # Publish the main aggregated table every 2 hours
    _schedule_job(
        path="/deferred/publish_v3_main_table",
        # Offset by 60 minutes to let other hourly tasks finish
        schedule="0 1-23/2 * * *",
    )

    # Break down the outputs by location key every 2 hours, and execute the job in chunks
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"location_key_from={subset[0]}&location_key_until={subset[-1]}"
        _schedule_job(
            path=f"/deferred/publish_v3_location_subsets?{job_params}",
            # Offset by 60 minutes to let other hourly tasks finish
            schedule="0 1-23/2 * * *",
        )

    # Publish outputs in JSON format every 2 hours, and execute the job in chunks
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"prod_folder=v3&location_key_from={subset[0]}&location_key_until={subset[-1]}"
        _schedule_job(
            path=f"/deferred/publish_json?{job_params}",
            # Offset by 90 minutes to let other hourly tasks finish
            schedule="30 1-23/2 * * *",
        )
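The chunked jobs above rely on _split_into_subsets, which is not shown here; assuming it partitions the sorted key list into bin_count roughly equal, contiguous ranges, a hypothetical standalone version could be:

from typing import Iterable, List

def split_into_subsets_sketch(keys: Iterable[str], bin_count: int) -> List[List[str]]:
    # Hypothetical stand-in: sort the keys and slice them into contiguous chunks so
    # each job handles a bounded [first, last] key range; uneven sizes may produce
    # slightly fewer bins than requested.
    sorted_keys = sorted(keys)
    bin_size = max(1, -(-len(sorted_keys) // bin_count))  # ceiling division
    return [sorted_keys[i:i + bin_size] for i in range(0, len(sorted_keys), bin_size)]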
Example #10
def main(output_folder: Path,
         tables_folder: Path,
         use_table_names: List[str] = None) -> None:
    """
    This script takes the processed outputs located in `tables_folder` and publishes them into the
    output folder by performing the following operations:

        1. Copy all the tables from `tables_folder` to `output_folder`, renaming fields if
           necessary.
        2. Create different slices of data, such as the latest known record for each region, files
           for the last day of data, files for each individual region.
        3. Produce a main table, created by iteratively performing left outer joins on all other
           tables for each slice of data (but not for the global tables).
    """
    # Wipe the output folder first
    for item in output_folder.glob("*"):
        if item.name.startswith("."):
            continue
        if item.is_file():
            item.unlink()
        else:
            shutil.rmtree(item)

    # Create the folder which will be published using a stable schema
    v3_folder = output_folder / "v3"
    v3_folder.mkdir(exist_ok=True, parents=True)

    # Publish the tables containing all location keys
    publish_global_tables(tables_folder,
                          v3_folder,
                          use_table_names=use_table_names)

    # Break out each table into separate folders based on the location key
    publish_location_breakouts(v3_folder,
                               v3_folder,
                               use_table_names=use_table_names)

    # Aggregate the independent tables for each location
    location_keys = table_read_column(v3_folder / "index.csv", "location_key")
    publish_location_aggregates(v3_folder,
                                v3_folder,
                                location_keys,
                                use_table_names=use_table_names)

    # Create a single table aggregating outputs from all other tables
    main_file_name = "covid-19-open-data.csv"
    main_file_zip_path = v3_folder / f"{main_file_name}.zip"
    with ZipFile(main_file_zip_path, mode="w",
                 compression=ZIP_DEFLATED) as zip_archive:
        with zip_archive.open(main_file_name, "w") as output_file:
            merge_output_tables_sqlite(v3_folder,
                                       TextIOWrapper(output_file),
                                       use_table_names=use_table_names)

    # Convert all CSV files to JSON using values format
    global_tables = list(v3_folder.glob("*.csv"))
    location_tables = [
        table for table in v3_folder.glob("**/*.csv")
        if table not in global_tables
    ]
    convert_tables_to_json(location_tables, v3_folder)
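convert_tables_to_json is also a project helper; assuming the "values format" means a single JSON object with the column names listed once and each record as a plain list of values (an assumption about the output layout, not a confirmed spec), a per-file conversion sketch could be:

import csv
import json
from pathlib import Path

def convert_csv_to_json_sketch(csv_path: Path, json_path: Path) -> None:
    # Hypothetical illustration: store the header once under "columns" and every
    # data row as a list of raw string values under "data".
    with open(csv_path, newline="") as fd:
        reader = csv.reader(fd)
        columns = next(reader, [])
        data = [row for row in reader]
    with open(json_path, "w") as fd:
        json.dump({"columns": columns, "data": data}, fd)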