Example No. 1
    def test_table_grouped_tail_real_data(self):
        with temporary_directory() as workdir:

            for table_path in (SRC / "test" / "data").glob("*.csv"):
                table = read_table(table_path, schema=SCHEMA)
                test_output = workdir / f"latest_{table_path.name}"
                pandas_output = workdir / f"latest_pandas_{table_path.name}"

                # Create the latest slice of the given table
                table_grouped_tail(table_path, test_output, ["key"])

                # Create a latest slice using pandas grouping
                table = table.groupby("key").aggregate(
                    agg_last_not_null).reset_index()
                export_csv(table, path=pandas_output, schema=SCHEMA)

                # Converting to a CSV in memory sometimes produces out-of-order values
                with open_file_like(test_output) as fd1, open_file_like(
                        pandas_output) as fd2:
                    test_result_lines = list(sorted(fd1))
                    pandas_result_lines = list(sorted(fd2))

                self.assertEqual(len(test_result_lines),
                                 len(pandas_result_lines))
                for line1, line2 in zip(test_result_lines,
                                        pandas_result_lines):
                    self.assertEqual(line1, line2)
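The pandas reference path above relies on an agg_last_not_null aggregator that is not shown in this excerpt. A minimal sketch, assuming it simply keeps the last non-null value of each grouped series, could look like this:

# Hypothetical sketch of the aggregator used above; not the project's actual code.
import pandas as pd

def agg_last_not_null(series: pd.Series):
    """Return the last non-null value of the series, or None if all values are null."""
    non_null = series.dropna()
    return non_null.iloc[-1] if len(non_null) > 0 else None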
Example No. 2
    def fetch(
        self,
        output_folder: Path,
        cache: Dict[str, str],
        fetch_opts: List[Dict[str, Any]],
        skip_existing: bool = False,
    ) -> Dict[str, str]:
        # The URL is just a template which we'll use to download each state
        base_opts = dict(fetch_opts[0])
        url_tpl = base_opts.pop("url")

        # Some states cannot be found in the dataset
        states_banlist = ["AS", "GU", "MP", "PR", "VI"]

        states = read_table(SRC / "data" / "metadata.csv")
        states = states.loc[states["country_code"] == "US",
                            "subregion1_code"].dropna().unique()
        states = [state for state in states if state not in states_banlist]
        opts = [
            dict(**base_opts, name=code, url=url_tpl.format(state=code))
            for code in states
        ]
        return super().fetch(output_folder=output_folder,
                             cache=cache,
                             fetch_opts=opts,
                             skip_existing=skip_existing)
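For illustration, assuming a hypothetical url template in fetch_opts, the per-state expansion is a plain str.format call over the subregion codes:

# Hypothetical template and options; the real values come from the data source config.
url_tpl = "https://example.com/covid/{state}/daily.csv"
base_opts = {"opts": {"ext": "csv"}}

states = ["CA", "NY", "TX"]
opts = [
    dict(**base_opts, name=code, url=url_tpl.format(state=code))
    for code in states
]
# opts[0] == {"opts": {"ext": "csv"}, "name": "CA",
#             "url": "https://example.com/covid/CA/daily.csv"}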
Example No. 3
def load_combined_table(pipeline: DataPipeline, prod_folder: str) -> DataFrame:
    table_name = pipeline.table
    with temporary_directory() as workdir:
        output_path = workdir / f"{table_name}.csv"
        download_file(GCS_BUCKET_PROD, f"{prod_folder}/{table_name}.csv",
                      output_path)
        combined_table = read_table(output_path)
        has_date = "date" in combined_table.columns
        index_columns = (["date"] if has_date else []) + ["location_key"]
        return combined_table.set_index(index_columns)
Example No. 4
    def test_table_sort(self):
        test_csv = """col1,col2,col3
        a,1,foo
        d,4,bar
        c,3,foo
        b,2,bar
        """

        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)
            input_file = workdir / "in.csv"
            with open(input_file, "w") as fd:
                for line in test_csv.split("\n"):
                    if not line.isspace():
                        fd.write(f"{line.strip()}\n")

            # Sort using the default (first) column
            output_file_1 = workdir / "out.csv"
            table_sort(input_file, output_file_1)

            output_file_2 = workdir / "pandas.csv"
            read_table(input_file).sort_values(["col1"]).to_csv(output_file_2,
                                                                index=False)

            for line1, line2 in zip(read_lines(output_file_1),
                                    read_lines(output_file_2)):
                self.assertEqual(line1.strip(), line2.strip())

            # Sort by each column in order
            for sort_column in ("col1", "col2", "col3"):

                output_file_1 = workdir / "out.csv"
                table_sort(input_file, output_file_1, [sort_column])

                output_file_2 = workdir / "pandas.csv"
                read_table(input_file).sort_values([sort_column]).to_csv(
                    output_file_2, index=False)

                for line1, line2 in zip(read_lines(output_file_1),
                                        read_lines(output_file_2)):
                    self.assertEqual(line1.strip(), line2.strip())
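The line-by-line comparison depends on a read_lines helper that is not part of this excerpt. A minimal stand-in, assuming it lazily yields the lines of a text file, could be:

# Hypothetical stand-in for the read_lines helper used above.
from pathlib import Path
from typing import Iterator

def read_lines(path: Path) -> Iterator[str]:
    """Yield the lines of a text file one at a time."""
    with open(path, "r") as fd:
        yield from fd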
Example No. 5
    def _test_make_main_table_helper(self, main_table_path: Path,
                                     column_adapter: Dict[str, str]):
        main_table = read_table(main_table_path, schema=SCHEMA)

        # Verify that all columns from all tables exist
        for pipeline in get_pipelines():
            for column_name in pipeline.schema.keys():
                column_name = column_adapter.get(column_name)
                if column_name is not None:
                    self.assertTrue(
                        column_name in main_table.columns,
                        f"Column {column_name} missing from main table",
                    )

        # Main table should follow a lexical sort (outside of header)
        main_table_records = []
        for line in read_lines(main_table_path):
            main_table_records.append(line)
        main_table_records = main_table_records[1:]
        self.assertListEqual(main_table_records,
                             list(sorted(main_table_records)))

        # Make sure that all columns present in the index table are in the main table
        main_table_columns = set(get_table_columns(main_table_path))
        index_table_columns = set(
            get_table_columns(SRC / "test" / "data" / "index.csv"))
        for column in index_table_columns:
            column = column_adapter.get(column, column)
            self.assertTrue(column in main_table_columns,
                            f"{column} not in main")

        # Make the main table easier to deal with since we optimize for memory usage
        location_key = "location_key" if "location_key" in main_table.columns else "key"
        main_table.set_index(location_key, inplace=True)
        main_table["date"] = main_table["date"].astype(str)

        # Define sets of columns to check
        column_prefixes = ("new", "total", "cumulative")
        column_filter = lambda col: (col.split("_")[0] in column_prefixes
                                     and "age" not in col)
        columns = list(filter(column_filter, main_table.columns))
        self.assertGreaterEqual(len({col.split("_")[0] for col in columns}), 2)
        main_table = main_table[["date"] + columns]

        # Spot check: Country of Andorra
        self._spot_check_subset(main_table, "AD", "2020-09-01", "2020-12-31")

        # Spot check: State of New South Wales
        self._spot_check_subset(main_table, "AU_NSW", "2020-09-01",
                                "2020-12-31")

        # Spot check: Alachua County
        self._spot_check_subset(main_table, "US_FL_12001", "2020-09-01",
                                "2020-12-31")
Example No. 6
    def test_make_main_table(self):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            # Copy all test tables into the temporary directory
            copy_tables(SRC / "test" / "data", workdir)

            # Create the main table
            main_table_path = workdir / "main.csv"
            make_main_table(workdir, main_table_path)
            main_table = read_table(main_table_path, schema=SCHEMA)

            # Verify that all columns from all tables exist
            for pipeline in get_pipelines():
                if pipeline.table in EXCLUDE_FROM_MAIN_TABLE:
                    continue
                for column_name in pipeline.schema.keys():
                    self.assertTrue(
                        column_name in main_table.columns,
                        f"Column {column_name} missing from main table",
                    )

            # Main table should follow a lexical sort (outside of header)
            main_table_records = []
            for line in read_lines(main_table_path):
                main_table_records.append(line)
            main_table_records = main_table_records[1:]
            self.assertListEqual(main_table_records,
                                 list(sorted(main_table_records)))

            # Make the main table easier to deal with since we optimize for memory usage
            main_table.set_index("key", inplace=True)
            main_table["date"] = main_table["date"].astype(str)

            # Define sets of columns to check
            epi_basic = [
                "new_confirmed", "total_confirmed", "new_deceased",
                "total_deceased"
            ]

            # Spot check: Country of Andorra
            self._spot_check_subset(main_table, "AD", epi_basic, "2020-03-02",
                                    "2020-09-01")

            # Spot check: State of New South Wales
            self._spot_check_subset(main_table, "AU_NSW", epi_basic,
                                    "2020-01-25", "2020-09-01")

            # Spot check: Alachua County
            self._spot_check_subset(main_table, "US_FL_12001", epi_basic,
                                    "2020-03-10", "2020-09-01")
Example No. 7
    def test_table_sort(self):
        test_csv = _make_test_csv_file(
            """
            col1,col2,col3
            a,1,foo
            d,4,bar
            c,3,foo
            b,2,bar
            """
        )

        with temporary_directory() as workdir:

            # Sort using the default (first) column
            output_file_1 = workdir / "out.csv"
            table_sort(test_csv, output_file_1)

            test_csv.seek(0)
            output_file_2 = workdir / "pandas.csv"
            read_table(test_csv, file_type="csv").sort_values(["col1"]).to_csv(
                output_file_2, index=False
            )

            _compare_tables_equal(self, output_file_1, output_file_2)

            # Sort by each column in order
            for sort_column in ("col1", "col2", "col3"):

                output_file_1 = workdir / f"1.{sort_column}.csv"
                table_sort(test_csv, output_file_1, [sort_column])

                test_csv.seek(0)
                output_file_2 = workdir / f"2.{sort_column}.csv"
                read_table(test_csv, file_type="csv").sort_values([sort_column]).to_csv(
                    output_file_2, index=False
                )

                _compare_tables_equal(self, output_file_1, output_file_2)
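This variant relies on two helpers that are not shown here, _make_test_csv_file and _compare_tables_equal. Minimal stand-ins, assuming the former returns an in-memory CSV buffer and the latter compares the output files line by line, might look like this:

# Hypothetical stand-ins for the helpers used above.
from io import StringIO
from pathlib import Path
from textwrap import dedent

def _make_test_csv_file(content: str) -> StringIO:
    """Build an in-memory CSV file-like object from an indented string literal."""
    return StringIO(dedent(content).strip() + "\n")

def _compare_tables_equal(test_case, path1: Path, path2: Path) -> None:
    """Assert that two CSV files contain the same lines, ignoring trailing whitespace."""
    with open(path1) as fd1, open(path2) as fd2:
        test_case.assertEqual([l.rstrip() for l in fd1], [l.rstrip() for l in fd2])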
Example No. 8
def load_intermediate_tables(
        pipeline: DataPipeline, column_adapter: List[str],
        index_columns: List[str]) -> Iterable[Tuple[DataSource, DataFrame]]:
    with temporary_directory() as workdir:

        for data_source in tqdm(pipeline.data_sources,
                                desc="Downloading intermediate tables"):
            fname = data_source.uuid(pipeline.table) + ".csv"
            try:
                download_file(GCS_BUCKET_TEST, f"intermediate/{fname}",
                              workdir / fname)
                table = read_table(workdir / fname).rename(
                    columns=column_adapter)
                table = table.groupby(index_columns).last()
                yield (data_source, table)
            except Exception as exc:
                print(f"intermediate table not found: {fname} ({exc})",
                      file=sys.stderr)
Example No. 9
    def test_table_group_tail(self):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            for table_path in (SRC / "test" / "data").glob("*.csv"):
                table = read_table(table_path, schema=SCHEMA)
                test_output_path = workdir / f"latest_{table_path.name}"
                pandas_output_path = workdir / f"latest_pandas_{table_path.name}"

                # Create the latest slice of the given table
                table_group_tail(table_path, test_output_path)

                # Create a latest slice using pandas grouping
                table = table.groupby("key").aggregate(agg_last_not_null).reset_index()
                export_csv(table, path=pandas_output_path, schema=SCHEMA)

                # Converting to a CSV in memory sometimes produces out-of-order values
                test_result_lines = sorted(read_lines(test_output_path))
                pandas_result_lines = sorted(read_lines(pandas_output_path))

                for line1, line2 in zip(test_result_lines, pandas_result_lines):
                    self.assertEqual(line1, line2)