Exemplo n.º 1
0
    def validate_platform(cls, value: str, values: Dict[str, Any]) -> Optional[str]:
        """Resolve the platform name, inferring it from ``base_path`` when unset.

        A non-empty ``value`` is returned unchanged. When ``value`` is the empty
        string, the result is ``"s3"`` if ``is_s3_uri`` accepts
        ``values["base_path"]`` and ``"file"`` otherwise.
        """
        if value == "":
            # Nothing was supplied explicitly: derive the platform from the path.
            return "s3" if is_s3_uri(values["base_path"]) else "file"
        return value
Exemplo n.º 2
0
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        """Yield all work units for the configured base path, then report profiling stats.

        Work units come from ``get_workunits_s3`` when ``is_s3_uri`` accepts
        ``source_config.base_path`` and from ``get_workunits_local`` otherwise.
        Once they are exhausted and profiling is enabled, the elapsed time is
        logged and a ``data_lake_profiling_summary`` telemetry ping is sent.
        Every numeric value in that ping is reduced to a power of ten.
        """

        def _bucket(amount: float) -> int:
            # Keep only the order of magnitude: 10 ** truncated log10(amount + 1).
            return 10 ** int(log10(amount + 1))

        with PerfTimer() as timer:
            # Choose the producer according to whether the base path is an S3 URI.
            producer = (
                self.get_workunits_s3
                if is_s3_uri(self.source_config.base_path)
                else self.get_workunits_local
            )
            yield from producer()

            if self.source_config.profiling.enabled:
                total_time_taken = timer.elapsed_seconds()

                logger.info(
                    f"Profiling {len(self.profiling_times_taken)} table(s) finished in {total_time_taken:.3f} seconds"
                )

                # Per-table percentiles are only computed when something was profiled.
                time_percentiles: Dict[str, float] = {}
                if len(self.profiling_times_taken) > 0:
                    percentiles = [50, 75, 95, 99]
                    percentile_values = stats.calculate_percentiles(
                        self.profiling_times_taken, percentiles
                    )
                    for percentile in percentiles:
                        time_percentiles[f"table_time_taken_p{percentile}"] = _bucket(
                            percentile_values[percentile]
                        )

                summary = {
                    "total_time_taken": _bucket(total_time_taken),
                    "count": _bucket(len(self.profiling_times_taken)),
                    "platform": self.source_config.platform,
                    **time_percentiles,
                }
                telemetry.telemetry_instance.ping("data_lake_profiling_summary", summary)
Exemplo n.º 3
0
    def validate_path_spec(cls, values: Dict) -> Dict[str, Any]:
        """Validate a path spec's field values and populate its derived fields.

        Checks performed, in order:

        * ``include`` must not contain ``**``.
        * ``file_types`` defaults to ``SUPPORTED_FILE_TYPES``; explicitly given
          entries must all be members of it.
        * ``default_extension``, when set, must be in ``SUPPORTED_FILE_TYPES``.
        * The extension of ``include`` must be in ``file_types`` or be ``*``,
          unless a ``default_extension`` is set.
        * Every named field used in ``table_name`` must also be a named field
          of ``include``.
        * ``exclude`` entries must not contain named fields.

        On success the keys ``_parsable_include``, ``_compiled_include``,
        ``_glob_include`` and ``_is_s3`` are added to ``values``. ``table_name``
        is set to ``"{table}"`` when it was unset and ``include`` contains that
        placeholder.

        Args:
            values: Mapping of field name to value; mutated in place.

        Returns:
            The same ``values`` mapping with the derived keys populated.

        Raises:
            ValueError: If any of the checks above fails.
        """
        if "**" in values["include"]:
            raise ValueError("path_spec.include cannot contain '**'")

        if values.get("file_types") is None:
            values["file_types"] = SUPPORTED_FILE_TYPES
        else:
            for file_type in values["file_types"]:
                if file_type not in SUPPORTED_FILE_TYPES:
                    raise ValueError(
                        f"file type {file_type} not in supported file types. Please specify one from {SUPPORTED_FILE_TYPES}"
                    )

        # Read once with .get(): the key may be absent, and a missing key must
        # behave like "no default extension" instead of raising KeyError below.
        default_extension = values.get("default_extension")
        if (
            default_extension is not None
            and default_extension not in SUPPORTED_FILE_TYPES
        ):
            raise ValueError(
                f"default extension {default_extension} not in supported default file extension. Please specify one from {SUPPORTED_FILE_TYPES}"
            )

        include_ext = os.path.splitext(values["include"])[1].strip(".")
        if (
            include_ext not in values["file_types"]
            and include_ext != "*"
            and not default_extension
        ):
            raise ValueError(
                f"file type specified ({include_ext}) in path_spec.include is not in specified file "
                f'types. Please select one from {values.get("file_types")} or specify ".*" to allow all types'
            )

        values["_parsable_include"] = PathSpec.get_parsable_include(values["include"])
        logger.debug(f'Setting _parsable_include: {values.get("_parsable_include")}')
        compiled_include = parse.compile(values["_parsable_include"])
        values["_compiled_include"] = compiled_include
        logger.debug(f'Setting _compiled_include: {values["_compiled_include"]}')
        # Replace every "{...}" placeholder with "*" to obtain a glob pattern.
        values["_glob_include"] = re.sub(r"\{[^}]+\}", "*", values["include"])
        logger.debug(f'Setting _glob_include: {values.get("_glob_include")}')

        if values.get("table_name") is None:
            if "{table}" in values["include"]:
                values["table_name"] = "{table}"
        else:
            include_fields = compiled_include.named_fields
            table_name_fields = parse.compile(values["table_name"]).named_fields
            logger.debug(f"include fields: {include_fields}")
            logger.debug(f"table_name fields: {table_name_fields}")
            if not all(field in include_fields for field in table_name_fields):
                raise ValueError(
                    "Not all named variables used in path_spec.table_name are specified in "
                    "path_spec.include"
                )

        if values.get("exclude") is not None:
            for exclude_path in values["exclude"]:
                if len(parse.compile(exclude_path).named_fields) != 0:
                    raise ValueError(
                        "path_spec.exclude should not contain any named variables"
                    )

        values["_is_s3"] = is_s3_uri(values["include"])
        logger.debug(f'Setting _is_s3: {values.get("_is_s3")}')
        return values