Example #1
    def _transform(self, df: DataFrame) -> DataFrame:
        view: str = self.getView()
        path: Union[str, List[str], Path] = self.getFilePath()
        name: Optional[str] = self.getName()
        progress_logger: Optional[ProgressLogger] = self.getProgressLogger()
        merge_schema: bool = self.getMergeSchema()
        limit: int = self.getLimit()

        with ProgressLogMetric(name=f"{name or view}_table_loader",
                               progress_logger=progress_logger):
            try:
                if merge_schema is True:
                    final_df = (df.sql_ctx.read.option(
                        "mergeSchema",
                        "true").format("parquet").load(path=str(path)))
                else:
                    final_df = df.sql_ctx.read.format("parquet").load(
                        path=str(path))

                assert (
                    "_corrupt_record" not in final_df.columns
                ), f"Found _corrupt_record after reading the file: {path}. "
                if limit and limit > 0:
                    final_df = final_df.limit(limit)

                # store new data frame in the view
                final_df.createOrReplaceTempView(view)
            except AnalysisException as e:
                self.logger.error(
                    f"File load failed. Location: {path} may be empty")
                raise e
        return df
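The mergeSchema option asks Spark to union the schemas of all Parquet part files rather than trusting the first file it finds, which matters when columns were added to the dataset over time. A minimal standalone sketch of the same read, assuming a SparkSession named spark and a hypothetical directory path:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
path = "/tmp/my_table"  # hypothetical location

# merge the schemas of every part file instead of using only the first one
loaded_df = (
    spark.read.option("mergeSchema", "true").format("parquet").load(str(path))
)
assert "_corrupt_record" not in loaded_df.columns  # same sanity check as above
loaded_df.createOrReplaceTempView("my_view")       # expose the result to later SQL steps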
Example #2
    def _transform(self, df: DataFrame) -> DataFrame:
        sql_text: Optional[str] = self.getSql()
        name: Optional[str] = self.getName()
        view: Optional[str] = self.getView()
        progress_logger: Optional[ProgressLogger] = self.getProgressLogger()

        assert sql_text
        with ProgressLogMetric(name=name or view or "",
                               progress_logger=progress_logger):
            if progress_logger and name:
                # mlflow opens .txt files inline so we use that extension
                progress_logger.log_artifact(key=f"{name}.sql.txt",
                                             contents=sql_text)
                progress_logger.write_to_log(name=name, message=sql_text)
            try:
                df = df.sql_ctx.sql(sql_text)
            except Exception:
                self.logger.info(f"Error in {name}")
                self.logger.info(sql_text)
                raise

            if view:
                df.createOrReplaceTempView(view)
            self.logger.info(
                f"GenericSqlTransformer [{name}] finished running SQL")

        return df
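The SQL here runs against temporary views that earlier transformers registered, and the result is itself exposed as a view for downstream steps. A small hypothetical illustration of that flow:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# an earlier step would have registered the input view
spark.createDataFrame([(1, "a"), (2, "b")], ["id", "code"]).createOrReplaceTempView("source")

# the transformer's sql_text is ordinary Spark SQL over that view
result_df = spark.sql("SELECT id, upper(code) AS code FROM source")
result_df.createOrReplaceTempView("cleaned")  # later steps read this view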
Example #3
    def _transform(self, df: DataFrame) -> DataFrame:
        column_mapping: Dict[str, Any] = self.getColumnMapping()
        view: str = self.getView()
        progress_logger: Optional[ProgressLogger] = self.getProgressLogger()

        with ProgressLogMetric(name=f"{view}_fill_na",
                               progress_logger=progress_logger):
            self.logger.info(
                f"filling rows if any null values with replacement_value found for columns: {list(column_mapping.keys())}"
            )
            df_with_na: DataFrame = df.sql_ctx.table(view)
            df_with_filled_na = df_with_na
            data_types = get_dtype(df_with_na, list(column_mapping.keys()))

            value: Union[bool, int, float, str]
            for col, value in column_mapping.items():

                if data_types[col] != "string":
                    try:
                        value = float(value)
                    except Exception as e:
                        print(str(e))
                        print(
                            f"The data type of column: {col} is {data_types[col]}. Either cast the column as a StringType or change the type of the value you are feeding as the replacement value to a string type."
                        )

                df_with_filled_na = df_with_filled_na.na.fill(value=value,
                                                              subset=[col])

            df_with_filled_na.createOrReplaceTempView(view)
        return df_with_filled_na
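The float() cast above matters because DataFrame.na.fill() silently skips any column in the subset whose type does not match the replacement value's type. A tiny sketch of that behaviour, assuming a local SparkSession:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame([(None, None)], "age INT, city STRING")

sample.na.fill(value="unknown", subset=["age", "city"]).show()  # only city is filled
sample.na.fill(value=0.0, subset=["age", "city"]).show()        # only age is filled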
Example #4
    def _transform(self, df: DataFrame) -> DataFrame:
        view: Optional[str] = self.getView()
        name: Optional[str] = self.getName()
        format_: str = self.getFormat()
        progress_logger: Optional[ProgressLogger] = self.getProgressLogger()
        # limit: int = self.getLimit()

        with ProgressLogMetric(name=f"{name or view}_{format_}_exporter",
                               progress_logger=progress_logger):
            try:
                writer: DataFrameWriter
                if view:
                    writer = df.sql_ctx.table(view).write.format(format_)
                else:
                    writer = df.write.format(format_)

                writer = writer.mode(self.getMode())

                for k, v in self.getOptions().items():
                    writer = writer.option(k, v)

                writer.save()

            except AnalysisException as e:
                self.logger.error(f"Failed to write to {format_}")
                raise e
        return df
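The writer is assembled step by step: format, then save mode (append, overwrite, ignore or error), then any format-specific options. An equivalent one-off write, assuming a DataFrame df already in scope and a hypothetical CSV target:

(
    df.write.format("csv")
    .mode("overwrite")
    .option("header", "true")
    .option("sep", "|")
    .save("/tmp/export/my_table")  # hypothetical output location
)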
Example #5
    def _transform(self, df: DataFrame) -> DataFrame:
        progress_logger: Optional[ProgressLogger] = self.getProgressLogger()
        with ProgressLogMetric(name="db_query_runner",
                               progress_logger=progress_logger):
            connection: Connection = pymysql.connect(
                user=self.getUsername(),
                password=self.getPassword(),
                host=self.getHost(),
                port=self.getPort(),
                db=self.getDb(),
                client_flag=CLIENT.MULTI_STATEMENTS,
            )
            try:
                with connection.cursor() as cursor:
                    rows_affected: int = cursor.execute(self.getQuery())
                    self.logger.info(f"Rows Affected= {rows_affected}")
                connection.commit()

            except OperationalError as e:
                self.logger.error(f"Failed to run query {self.getQuery()}")
                raise e

            finally:
                connection.close()
        return df
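Opening the connection with CLIENT.MULTI_STATEMENTS is what lets a single execute() call run several semicolon-separated statements. A self-contained sketch with hypothetical credentials and statements:

import pymysql
from pymysql.constants import CLIENT

connection = pymysql.connect(
    user="etl", password="secret", host="localhost", port=3306,
    db="warehouse", client_flag=CLIENT.MULTI_STATEMENTS,
)
try:
    with connection.cursor() as cursor:
        # both statements run in one call because of MULTI_STATEMENTS
        cursor.execute(
            "CREATE TABLE IF NOT EXISTS audit_log (id INT PRIMARY KEY);"
            "DELETE FROM audit_log WHERE id < 100;"
        )
    connection.commit()
finally:
    connection.close()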
Example #6
    def _transform(self, df: DataFrame,
                   response: Dict[str, Any]) -> Dict[str, Any]:  # type: ignore
        param_key: str = self.getParamKey()
        urls: List[str] = self.getDownloadUrls() or response.get(
            param_key, [])  # type: ignore
        download_path: Optional[str] = self.getDownloadToPath()
        extract_zips: bool = self.getExtractZips()
        name: Optional[str] = self.getName()
        progress_logger: Optional[ProgressLogger] = self.getProgressLogger()

        assert urls
        assert download_path

        out: List[str] = []

        for url in urls:
            with ProgressLogMetric(name=name or url,
                                   progress_logger=progress_logger):
                downloader = FileDownloader(url=url,
                                            download_path=download_path,
                                            extract_archives=extract_zips)
                result = downloader.download_files_from_url()
                if result:
                    out.append(str(result))

        response["filenames"] = out

        return response
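FileDownloader is a helper from the same framework; its API beyond the constructor arguments shown here is not visible. A rough standard-library sketch of what such a helper presumably does (download a file, optionally unpack zip archives, return the local path):

import zipfile
from pathlib import Path
from urllib.request import urlretrieve

def download_and_maybe_extract(url: str, download_path: str, extract_archives: bool) -> str:
    target = Path(download_path) / Path(url).name      # crude filename guess
    target.parent.mkdir(parents=True, exist_ok=True)
    urlretrieve(url, str(target))                      # fetch the file
    if extract_archives and target.suffix == ".zip":
        with zipfile.ZipFile(target) as archive:
            archive.extractall(target.parent)          # unpack next to the download
    return str(target)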
Example #7
    def _transform(self, df: DataFrame) -> DataFrame:
        kafka_brokers: str = self.getKafkaBrokers()
        topic: str = self.getTopic()
        starting_offset: int = self.getStartingOffset()
        use_ssl: bool = self.getUseSsl()
        schema: StructType = self.getSchema()
        name: Optional[str] = self.getName()
        previous_checkpoint_view: Optional[str] = self.getPreviousCheckpointView()

        progress_logger: Optional[ProgressLogger] = self.getProgressLogger()

        with ProgressLogMetric(
            name=f"{name or topic}_kafka_reader", progress_logger=progress_logger
        ):
            try:
                if previous_checkpoint_view in df.sql_ctx.tableNames():
                    last_offset = (
                        df.sql_ctx.table(previous_checkpoint_view)
                        .groupBy()
                        .max("offset")
                        .collect()[0]
                        .asDict()["max(offset)"]
                    )
                    if last_offset:
                        starting_offset = last_offset + 1

                security_protocol = "PLAINTEXT"
                if use_ssl:
                    security_protocol = "SSL"

                starting_offset_text = f"""{{"{topic}":{{"0":{starting_offset}}}}}"""
                df = (
                    df.sql_ctx.read.format("kafka")
                    .option("kafka.bootstrap.servers", kafka_brokers)
                    .option("kafka.security.protocol", security_protocol)
                    .option("subscribe", f"{topic}")
                    .option("startingOffsets", starting_offset_text)
                    .load()
                )
                df = df.selectExpr("*", "CAST(value AS STRING) as event")
                df = df.withColumn("event", from_json(df.event, schema))
                df.createOrReplaceTempView(topic)
                if len(df.head(1)) == 0 and previous_checkpoint_view:
                    df.sql_ctx.table(previous_checkpoint_view).createOrReplaceTempView(
                        f"{topic}_watermark"
                    )
                else:
                    df.createOrReplaceTempView(f"{topic}_watermark")
            except AnalysisException as e:
                self.logger.error(
                    f"Failed to read from kafka topic: {topic} on {kafka_brokers}"
                )
                raise e
        return df
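The startingOffsets option expects a JSON document keyed by topic and then by partition number; the f-string above hard-codes partition "0". Building the same string for several partitions (hypothetical offsets) is easier with json.dumps, where -2 means "earliest" and -1 means "latest":

import json

topic = "admissions"  # hypothetical topic
starting_offsets = json.dumps({topic: {"0": 42, "1": 42, "2": -2}})
# => '{"admissions": {"0": 42, "1": 42, "2": -2}}'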
Example #8
    def _transform(self, df: DataFrame) -> DataFrame:
        columns_to_drop: List[str] = self.getColumnsToCheck()
        view: str = self.getView()
        progress_logger: Optional[ProgressLogger] = self.getProgressLogger()

        with ProgressLogMetric(
            name=f"{view}_drop_row", progress_logger=progress_logger
        ):
            self.logger.info(
                f"dropping rows if any null values found for columns: {columns_to_drop}"
            )
            df_with_rows: DataFrame = df.sql_ctx.table(view)
            df_with_dropped_rows = df_with_rows.dropna(subset=columns_to_drop)
            df_with_dropped_rows.createOrReplaceTempView(view)
        return df
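dropna(subset=...) removes a row as soon as any of the listed columns is null; nulls in other columns are left alone. A tiny illustration with a local SparkSession:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [(1, "a", None), (None, "b", "x")], "id INT, code STRING, note STRING"
)
sample.dropna(subset=["id", "code"]).show()  # keeps only the first row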
Example #9
    def _transform(self, df: DataFrame) -> DataFrame:
        view: str = self.getView()
        path: Union[str, Path] = self.getFilePath()
        name: Optional[str] = self.getName()
        progress_logger: Optional[ProgressLogger] = self.getProgressLogger()
        # limit: int = self.getLimit()

        with ProgressLogMetric(name=f"{name or view}_table_loader",
                               progress_logger=progress_logger):
            try:
                if view:
                    df.sql_ctx.table(view).write.parquet(path=str(path))
                else:
                    df.write.parquet(path=str(path))

            except AnalysisException as e:
                self.logger.error(f"File write failed to {path}")
                raise e
        return df
Example #10
    def _transform(self, df: DataFrame) -> DataFrame:
        try:
            # if steps are defined but not transformers then convert steps to transformers first
            if len(self.steps) > 0 and len(self.transformers) == 0:
                self.transformers = self.create_steps(self.steps)
            # get the logger to use
            logger = get_logger(__name__)
            count_of_transformers: int = len(self.transformers)
            i: int = 0
            pipeline_name: str = self.__class__.__name__
            self.progress_logger.log_event(
                event_name=pipeline_name,
                event_text=f"Starting Pipeline {pipeline_name}",
            )
            for transformer in self.transformers:
                assert isinstance(transformer, Transformer), type(transformer)
                try:
                    i += 1
                    logger.info(
                        f"---- Running pipeline [{pipeline_name}] transformer [{transformer}]  "
                        f"({i} of {count_of_transformers}) ----")

                    with ProgressLogMetric(
                            progress_logger=self.progress_logger,
                            name=str(transformer) or "unknown",
                    ):
                        self.progress_logger.log_event(
                            pipeline_name,
                            event_text=f"Running pipeline step {transformer}",
                        )
                        df = transformer.transform(dataset=df)
                except Exception as e:
                    if hasattr(transformer, "getName"):
                        # noinspection Mypy
                        stage_name = transformer.getName()  # type: ignore
                    else:
                        stage_name = transformer.__class__.__name__
                    logger.error(
                        f"!!!!!!!!!!!!! pipeline [{pipeline_name}] transformer [{stage_name}] threw exception !!!!!!!!!!!!!"
                    )
                    # use exception chaining to add stage name but keep original exception
                    friendly_spark_exception: FriendlySparkException = (
                        FriendlySparkException(exception=e,
                                               stage_name=stage_name))
                    error_messages: List[str] = (
                        friendly_spark_exception.message.split("\n")
                        if friendly_spark_exception.message else [])
                    for error_message in error_messages:
                        logger.error(msg=error_message)

                    if hasattr(transformer, "getSql"):
                        # noinspection Mypy
                        logger.error(transformer.getSql())  # type: ignore
                    logger.error(
                        "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
                    )
                    self.progress_logger.log_exception(
                        event_name=pipeline_name,
                        event_text=f"Exception in Stage={stage_name}",
                        ex=e,
                    )
                    raise friendly_spark_exception from e

            self.progress_logger.log_event(
                event_name=pipeline_name,
                event_text=f"Finished Pipeline {pipeline_name}",
            )
            return df
        finally:
            self._check_validation(df)
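The raise ... from e at the end of the error handler keeps the original traceback while prepending the stage context. A minimal illustration of the same chaining idiom, with hypothetical names:

class StageError(Exception):
    def __init__(self, stage_name: str, exception: Exception) -> None:
        super().__init__(f"Stage [{stage_name}] failed: {exception}")

def run_stage(stage_name: str) -> None:
    try:
        raise ValueError("bad column")          # the underlying failure
    except Exception as e:
        raise StageError(stage_name, e) from e  # __cause__ keeps the original error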