    def _get_data_reference_list(
        self, data_asset_name: Optional[str] = None
    ) -> List[str]:
        """List objects in the underlying data store to create a list of data_references.

        This method is used to refresh the cache.
        """
        path_list: List[str] = get_filesystem_one_level_directory_glob_path_list(
            base_directory_path=self.base_directory,
            glob_directive=self._glob_directive,
        )
        return sorted(path_list)

    def _get_data_reference_list_for_asset(self, asset: Optional[Asset]) -> List[str]:
        base_directory: str = self.base_directory
        glob_directive: str = self._glob_directive

        if asset is not None:
            if asset.base_directory:
                base_directory = normalize_directory_path(
                    dir_path=asset.base_directory, root_directory_path=base_directory
                )
            if asset.glob_directive:
                glob_directive = asset.glob_directive

        path_list: List[str] = get_filesystem_one_level_directory_glob_path_list(
            base_directory_path=base_directory, glob_directive=glob_directive
        )

        return sorted(path_list)
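For reference, here is a minimal sketch of the helper that both methods above delegate to. The file names are illustrative, and the import path (great_expectations.datasource.data_connector.util) plus the relative-path return behavior are assumptions, consistent with how the results are joined back onto base_directory later on this page.

import tempfile
from pathlib import Path

from great_expectations.datasource.data_connector.util import (
    get_filesystem_one_level_directory_glob_path_list,
)

with tempfile.TemporaryDirectory() as base_dir:
    # Create two example files one directory level below base_dir.
    for name in ("sample_2019-01.csv", "sample_2019-02.csv"):
        Path(base_dir, name).touch()

    # The helper matches the glob directive one level under the base directory
    # and (per its use above) returns paths relative to it.
    path_list = get_filesystem_one_level_directory_glob_path_list(
        base_directory_path=base_dir, glob_directive="*.csv"
    )
    print(sorted(path_list))  # ['sample_2019-01.csv', 'sample_2019-02.csv']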
Example #3
# Imports needed to make this fixture snippet self-contained (module paths as
# in Great Expectations 0.13+).
import os
import shutil
from typing import Dict, List

import numpy as np
import pandas as pd
import pytest

from great_expectations.data_context import DataContext
from great_expectations.data_context.util import file_relative_path
from great_expectations.datasource.data_connector.util import (
    get_filesystem_one_level_directory_glob_path_list,
)


@pytest.fixture
def bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000_data_context(
    tmp_path_factory,
    monkeypatch,
) -> DataContext:
    """
    This fixture generates three years' worth (36 months, i.e., 36 batches) of taxi trip data, with the number of rows
    in each batch sampled from a normal distribution with a mean of 5,000 rows and a standard deviation of 1,000 rows.
    """
    # Re-enable usage statistics: deleting the GE_USAGE_STATS environment
    # variable removes the override that disables them during tests.
    monkeypatch.delenv("GE_USAGE_STATS")

    project_path: str = str(tmp_path_factory.mktemp("taxi_data_context"))
    context_path: str = os.path.join(project_path, "great_expectations")
    os.makedirs(os.path.join(context_path, "expectations"), exist_ok=True)
    data_path: str = os.path.join(context_path, "..", "data")
    os.makedirs(data_path, exist_ok=True)
    shutil.copy(
        file_relative_path(
            __file__,
            os.path.join(
                "..",
                "integration",
                "fixtures",
                "yellow_trip_data_pandas_fixture",
                "great_expectations",
                "great_expectations.yml",
            ),
        ),
        os.path.join(context_path, "great_expectations.yml"),
    )
    base_directory: str = file_relative_path(
        __file__,
        os.path.join(
            "..",
            "test_sets",
            "taxi_yellow_trip_data_samples",
        ),
    )
    file_name_list: List[str] = get_filesystem_one_level_directory_glob_path_list(
        base_directory_path=base_directory, glob_directive="*.csv"
    )
    file_name_list = sorted(file_name_list)
    num_files: int = len(file_name_list)

    rnd_num_sample: np.float64
    output_file_lengths: List[int] = [
        round(rnd_num_sample)
        for rnd_num_sample in np.random.normal(loc=5.0e3, scale=1.0e3, size=num_files)
    ]

    idx: int
    file_name: str

    output_file_name_length_map: Dict[str, int] = {
        file_name: output_file_lengths[idx]
        for idx, file_name in enumerate(file_name_list)
    }

    csv_source_path: str
    df: pd.DataFrame
    for file_name in file_name_list:
        csv_source_path = os.path.join(base_directory, file_name)
        df = pd.read_csv(filepath_or_buffer=csv_source_path)
        # Downsample each monthly file to its target row count, without
        # replacement and seeded for reproducibility.
        df = df.sample(
            n=output_file_name_length_map[file_name], replace=False, random_state=1
        )
        df.to_csv(path_or_buf=os.path.join(data_path, file_name), index=False)

    context: DataContext = DataContext(context_root_dir=context_path)
    assert context.root_directory == context_path

    return context
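A hedged sketch of how a test might consume this fixture; the test name and body are illustrative, not from the source. pytest injects the fixture by parameter name, and the returned DataContext is rooted in the temporary project directory built above.

def test_taxi_data_context_smoke(
    bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000_data_context,
):
    context = (
        bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000_data_context
    )
    # The fixture already asserted this invariant; a consuming test can rely on it.
    assert context.root_directory.endswith("great_expectations")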