Example No. 1
# Copy all output files to the V2 folder
print("Copying files to public folder...")
for output_file in (ROOT / "output" / "tables").glob("*.csv"):
    shutil.copy(output_file, v2_folder / output_file.name)

# Merge all output files into a single master table
print("Creating master table...")
master = read_file(v2_folder / "index.csv")
for output_file in v2_folder.glob("*.csv"):
    if output_file.name not in ("index.csv", "master.csv"):
        master = master.merge(read_file(output_file, low_memory=False),
                              how="left")

# Drop rows with a null date before exporting the master table
export_csv(master.dropna(subset=["date"]), v2_folder / "master.csv")

# Create subsets with the last 30, 14 and 7 days of data
print("Creating last N days subsets...")
for n_days in (30, 14, 7):
    n_days_folder = v2_folder / str(n_days)
    n_days_folder.mkdir(exist_ok=True)
    for csv_file in v2_folder.glob("*.csv"):
        data = read_file(csv_file, low_memory=False)
        export_csv(subset_last_days(data, n_days),
                   n_days_folder / csv_file.name)

# Create a subset with the latest known day of data for each key
print("Creating the latest subset...")
latest_folder = v2_folder / "latest"
latest_folder.mkdir(exist_ok=True)
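
The snippet above relies on a `subset_last_days` helper that is not shown. Below is a minimal sketch of what such a helper might look like, assuming the tables carry an ISO-formatted `date` column; the actual implementation in the codebase may differ.

import datetime

from pandas import DataFrame


def subset_last_days(data: DataFrame, n_days: int) -> DataFrame:
    """Keep only the most recent `n_days` of records (sketch; assumes ISO "date" strings)."""
    if "date" not in data.columns:
        # Tables without a date column are written out unchanged
        return data
    dated = data.dropna(subset=["date"])
    last_date = datetime.date.fromisoformat(dated["date"].max())
    cutoff = (last_date - datetime.timedelta(days=n_days)).isoformat()
    # ISO dates compare correctly as plain strings
    return dated[dated["date"] > cutoff]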
Example No. 2
if args.profile:
    profiler.enable()

# A pipeline chain is any subfolder not starting with "_" in the pipelines folder
all_pipeline_chains = []
for item in (ROOT / "src" / "pipelines").iterdir():
    if not item.name.startswith("_") and not item.is_file():
        all_pipeline_chains.append(item.name)

# Run all the pipelines and place their outputs into the output folder
# The output name for each pipeline chain will be the name of the directory that the chain is in
for pipeline_name in all_pipeline_chains:
    table_name = pipeline_name.replace("_", "-")
    if args.only and table_name not in args.only.split(","):
        continue
    if args.exclude and table_name in args.exclude.split(","):
        continue
    pipeline_chain = DataPipeline.load(pipeline_name)
    show_progress = not args.no_progress
    pipeline_output = pipeline_chain.run(pipeline_name,
                                         verify=args.verify,
                                         process_count=args.process_count,
                                         progress=show_progress)
    export_csv(pipeline_output,
               ROOT / "output" / "tables" / f"{table_name}.csv")

if args.profile:
    stats = Stats(profiler)
    stats.strip_dirs()
    stats.sort_stats("cumtime")
    stats.print_stats(20)
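
This snippet references an `args` namespace and a `profiler` object that are defined elsewhere. A plausible sketch of the missing setup, assuming argparse and cProfile (the flag names below are inferred from the attribute accesses above and may differ from the real script):

import argparse
import cProfile
from multiprocessing import cpu_count
from pstats import Stats

parser = argparse.ArgumentParser(description="Run all data pipelines")
parser.add_argument("--only", type=str, default=None)
parser.add_argument("--exclude", type=str, default=None)
parser.add_argument("--verify", type=str, default=None)
parser.add_argument("--process-count", type=int, default=cpu_count())
parser.add_argument("--no-progress", action="store_true")
parser.add_argument("--profile", action="store_true")
args = parser.parse_args()

# The profiler is only enabled when --profile is passed (see the guard above)
profiler = cProfile.Profile()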
Example No. 3
def main(
        output_folder: Path,
        verify: str = None,
        only: List[str] = None,
        exclude: List[str] = None,
        process_count: int = cpu_count(),
        show_progress: bool = True,
) -> None:
    """
    Executes the data pipelines and places all outputs into `output_folder`. This is typically
    followed by publishing of the contents of the output folder to a server.

    Args:
        output_folder: Root folder where snapshot, intermediate and tables will be placed.
        verify: Run anomaly detection on the outputs using this strategy. Value must be one of:
            - None: (default) perform no anomaly detection
            - "simple": perform only fast anomaly detection
            - "full": perform exhaustive anomaly detection (can be very slow)
        only: If provided, only pipelines with a name appearing in this list will be run.
        exclude: If provided, pipelines with a name appearing in this list will not be run.
        process_count: Maximum number of processes to use during the data pipeline execution.
        show_progress: Display progress for the execution of individual DataSources within this
            pipeline.
    """

    assert not (only is not None and exclude is not None
                ), "--only and --exclude options cannot be used simultaneously"

    # Ensure that there is an output folder to put the data in
    (output_folder / "snapshot").mkdir(parents=True, exist_ok=True)
    (output_folder / "intermediate").mkdir(parents=True, exist_ok=True)
    (output_folder / "tables").mkdir(parents=True, exist_ok=True)

    # A pipeline chain is any subfolder not starting with "_" in the pipelines folder
    all_pipeline_names = []
    for item in (ROOT / "src" / "pipelines").iterdir():
        if not item.name.startswith("_") and not item.is_file():
            all_pipeline_names.append(item.name)

    # Verify that all of the provided pipeline names exist as pipelines
    for pipeline_name in (only or []) + (exclude or []):
        module_name = pipeline_name.replace("-", "_")
        assert module_name in all_pipeline_names, f'"{pipeline_name}" pipeline does not exist'

    # Run all the pipelines and place their outputs into the output folder
    # The output name for each pipeline chain will be the name of the directory that the chain is in
    for pipeline_name in all_pipeline_names:
        table_name = pipeline_name.replace("_", "-")
        # Skip if `exclude` was provided and this table is in it
        if exclude is not None and table_name in exclude:
            continue
        # Skip if `only` was provided and this table is not in it
        if only is not None and table_name not in only:
            continue
        data_pipeline = DataPipeline.load(pipeline_name)
        pipeline_output = data_pipeline.run(
            pipeline_name,
            output_folder,
            verify=verify,
            process_count=process_count,
            progress=show_progress,
        )
        export_csv(pipeline_output,
                   output_folder / "tables" / f"{table_name}.csv")
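
A minimal usage sketch for `main`; the folder path and pipeline names below are illustrative only and not taken from the source.

from pathlib import Path

# Run only two illustrative pipelines, with fast anomaly detection and no progress bars
main(
    output_folder=Path("output"),
    verify="simple",
    only=["index", "epidemiology"],
    process_count=2,
    show_progress=False,
)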
Example No. 4
def main(output_folder: Path,
         tables_folder: Path,
         show_progress: bool = True) -> None:
    """
    This script takes the processed outputs located in `tables_folder` and publishes them into the
    output folder by performing the following operations:

        1. Copy all the tables as-is from `tables_folder` to `output_folder`
        2. Produce a main table, created by iteratively performing left outer joins on all other
           tables (with a few exceptions)
        3. Create different slices of data, such as the latest known record for each region, files
           for the last N days of data, files for each individual region
    """
    # TODO: respect disable progress flag
    disable_progress = not show_progress

    # Wipe the output folder first
    for item in output_folder.glob("*"):
        if item.name.startswith("."):
            continue
        if item.is_file():
            item.unlink()
        else:
            shutil.rmtree(item)

    # Create the folder which will be published using a stable schema
    v2_folder = output_folder / "v2"
    v2_folder.mkdir(exist_ok=True, parents=True)

    # Copy all output files to the V2 folder
    for output_file in tqdm([*tables_folder.glob("*.csv")],
                            desc="Copy tables"):
        shutil.copy(output_file, v2_folder / output_file.name)

    # Merge all output files into a single table
    main_table = read_file(v2_folder / "index.csv")

    # Add a date to each region from index to allow iterative left joins
    max_date = (datetime.datetime.now() +
                datetime.timedelta(days=7)).date().isoformat()
    date_list = [
        date.date().isoformat() for date in date_range("2020-01-01", max_date)
    ]
    date_table = DataFrame(date_list, columns=["date"], dtype=str)
    main_table = table_cross_product(main_table, date_table)

    # Some tables are not included into the main table
    exclude_from_main_table = (
        "main.csv",
        "index.csv",
        "worldbank.csv",
        "worldpop.csv",
        "by-age.csv",
        "by-sex.csv",
    )

    non_dated_columns = set(main_table.columns)
    for output_file in tqdm([*v2_folder.glob("*.csv")], desc="Main table"):
        if output_file.name not in exclude_from_main_table:
            # Load the table and perform left outer join
            table = read_file(output_file, low_memory=False)
            main_table = main_table.merge(table, how="left")
            # Keep track of columns which are not indexed by date
            if not "date" in table.columns:
                non_dated_columns = non_dated_columns | set(table.columns)

    # There can only be one record per <key, date> pair
    main_table = main_table.groupby(["key", "date"]).first().reset_index()

    # Drop rows with null date or without a single dated record
    main_table = drop_na_records(main_table.dropna(subset=["date"]),
                                 non_dated_columns)
    export_csv(main_table, v2_folder / "main.csv")

    # Create subsets with the last 30, 14 and 7 days of data
    map_func = partial(subset_last_days, v2_folder)
    for _ in thread_map(map_func, (30, 14, 7), desc="Last N days subsets"):
        pass

    # Create a subset with the latest known day of data for each key
    map_func = partial(subset_latest, v2_folder)
    for _ in thread_map(map_func, [*(v2_folder).glob("*.csv")],
                        desc="Latest subset"):
        pass

    # Create subsets with each known key
    main_indexed = main_table.set_index("key")
    map_func = partial(subset_grouped_key, main_indexed, v2_folder)
    for _ in thread_map(map_func,
                        main_indexed.index.unique(),
                        desc="Grouped key subsets"):
        pass

    # Convert all CSV files to JSON using values format
    map_func = export_json_without_index
    for _ in thread_map(map_func, [*(v2_folder).glob("**/*.csv")],
                        desc="JSON conversion"):
        pass
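
The final step maps `export_json_without_index` over every CSV file. A rough sketch of such a helper, assuming the published format is a `{"columns": [...], "data": [...]}` document; the real helper may normalize values differently.

import json
from pathlib import Path

from pandas import read_csv


def export_json_without_index(csv_path: Path) -> None:
    """Write a JSON file next to `csv_path` using a columns/data layout (sketch only)."""
    table = read_csv(csv_path, low_memory=False)
    output = {"columns": list(table.columns), "data": table.values.tolist()}
    # Note: missing values are written as-is; the real helper may convert them to null
    with open(csv_path.with_suffix(".json"), "w") as fd:
        json.dump(output, fd)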
Example No. 5
def make_main_table(
    tables_folder: Path,
    output_path: Path,
    logger: ErrorLogger = ErrorLogger()) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all the CSV files exist.
        output_path: Output path for the resulting main table CSV file.
        logger: ErrorLogger instance used to report progress.
    """

    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        logger.log_info("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_list = date_range("2020-01-01", max_date)
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        logger.log_info("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        logger.log_info("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path,
                   tables_folder / "index.csv", ["key"],
                   main_table_path,
                   how="outer")
        logger.log_info("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")],
                                    desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path,
                           table_file_path,
                           join_on,
                           temp_file_path,
                           how="outer")
                shutil.move(temp_file_path, main_table_path)
                logger.log_info(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write it to the output location
        table_sort(main_table_path, output_path)
        logger.log_info("Sorted main table")
    def _test_table_merge(self, how_mem: str, how_pandas: str):
        test_data_1 = DataFrame.from_records([
            {"col1": "a", "col2": "1"},
            {"col1": "a", "col2": "2"},
            {"col1": "b", "col2": "3"},
            {"col1": "b", "col2": "4"},
            {"col1": "c", "col2": "5"},
            {"col1": "c", "col2": "6"},
        ])

        test_data_2 = DataFrame.from_records([
            {"col1": "a", "col3": "foo"},
            {"col1": "b", "col3": "bar"},
            {"col1": "c", "col3": "baz"},
        ])

        test_data_3 = DataFrame.from_records([
            {"col1": "a", "col4": "apple"},
            {"col1": "b", "col4": "banana"},
            {"col1": "c", "col4": "orange"},
        ])

        with temporary_directory() as workdir:

            test_file_1 = workdir / "test.1.csv"
            test_file_2 = workdir / "test.2.csv"
            test_file_3 = workdir / "test.3.csv"

            export_csv(test_data_1, test_file_1)
            export_csv(test_data_2, test_file_2)
            export_csv(test_data_3, test_file_3)

            output_file_1 = workdir / "output.1.csv"
            output_file_2 = workdir / "output.2.csv"

            expected = table_merge_pandas(
                [test_data_1, test_data_2, test_data_3],
                on=["col1"],
                how=how_pandas)
            export_csv(expected, path=output_file_1)

            table_merge_mem([test_file_1, test_file_2, test_file_3],
                            output_file_2,
                            on=["col1"],
                            how=how_mem)

            _compare_tables_equal(self, output_file_1, output_file_2)
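
The helper above takes separate `how_mem` and `how_pandas` arguments because the two merge implementations may name their join modes differently. Hypothetical concrete test methods in the same test class that drive the helper (the real suite may name or structure these differently) could look like:

    def test_table_merge_inner(self):
        # Inner join: keep only rows with matching keys in every table
        self._test_table_merge(how_mem="inner", how_pandas="inner")

    def test_table_merge_outer(self):
        # Outer join: keep all rows, filling missing values with blanks
        self._test_table_merge(how_mem="outer", how_pandas="outer")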