def _transform(self, df: DataFrame) -> DataFrame:
    view: str = self.getView()
    path: Union[str, List[str], Path] = self.getFilePath()
    name: Optional[str] = self.getName()
    progress_logger: Optional[ProgressLogger] = self.getProgressLogger()
    merge_schema: bool = self.getMergeSchema()
    limit: int = self.getLimit()

    with ProgressLogMetric(
        name=f"{name or view}_table_loader", progress_logger=progress_logger
    ):
        try:
            if merge_schema is True:
                final_df = (
                    df.sql_ctx.read.option("mergeSchema", "true")
                    .format("parquet")
                    .load(path=str(path))
                )
            else:
                final_df = df.sql_ctx.read.format("parquet").load(path=str(path))

            assert (
                "_corrupt_record" not in final_df.columns
            ), f"Found _corrupt_record after reading the file: {path}."

            if limit and limit > 0:
                final_df = final_df.limit(limit)

            # store new data frame in the view
            final_df.createOrReplaceTempView(view)
        except AnalysisException as e:
            self.logger.error(f"File load failed. Location: {path} may be empty")
            raise e
    return df
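
# Hedged usage sketch: the raw Spark calls the loader above wraps, minus the
# transformer plumbing. The path and view name are illustrative assumptions.
def _example_parquet_load(spark):  # spark: a live SparkSession
    loaded = (
        spark.read.option("mergeSchema", "true")
        .format("parquet")
        .load("/tmp/example.parquet")
    )
    # a _corrupt_record column means Spark could not parse some files
    assert "_corrupt_record" not in loaded.columns
    # expose the data to later SQL steps via a temp view
    loaded.createOrReplaceTempView("example_view")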

def _transform(self, df: DataFrame) -> DataFrame:
    sql_text: Optional[str] = self.getSql()
    name: Optional[str] = self.getName()
    view: Optional[str] = self.getView()
    progress_logger: Optional[ProgressLogger] = self.getProgressLogger()
    assert sql_text

    with ProgressLogMetric(
        name=name or view or "", progress_logger=progress_logger
    ):
        if progress_logger and name:
            # mlflow opens .txt files inline so we use that extension
            progress_logger.log_artifact(key=f"{name}.sql.txt", contents=sql_text)
            progress_logger.write_to_log(name=name, message=sql_text)
        try:
            df = df.sql_ctx.sql(sql_text)
        except Exception:
            self.logger.info(f"Error in {name}")
            self.logger.info(sql_text)
            raise

        if view:
            df.createOrReplaceTempView(view)
        self.logger.info(f"GenericSqlTransformer [{name}] finished running SQL")
    return df
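
# Hedged sketch of the core of the SQL transformer above: run SQL against
# previously registered temp views and expose the result as a new view.
# View names and the SQL text are illustrative assumptions.
def _example_sql_step(spark):
    result = spark.sql("SELECT id, name FROM example_view WHERE name IS NOT NULL")
    result.createOrReplaceTempView("filtered_view")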

def _transform(self, df: DataFrame) -> DataFrame:
    column_mapping: Dict[str, Any] = self.getColumnMapping()
    view: str = self.getView()
    progress_logger: Optional[ProgressLogger] = self.getProgressLogger()

    with ProgressLogMetric(name=f"{view}_fill_na", progress_logger=progress_logger):
        self.logger.info(
            f"filling null values with the replacement value for columns: {list(column_mapping.keys())}"
        )
        df_with_na: DataFrame = df.sql_ctx.table(view)
        df_with_filled_na = df_with_na
        data_types = get_dtype(df_with_na, list(column_mapping.keys()))
        value: Union[bool, int, float, str]
        for col, value in column_mapping.items():
            if data_types[col] != "string":
                try:
                    # non-string columns need a numeric replacement value
                    value = float(value)
                except Exception as e:
                    self.logger.error(str(e))
                    self.logger.error(
                        f"The data type of column {col} is {data_types[col]}. "
                        f"Either cast the column to StringType or provide a replacement value that matches the column's type."
                    )
            df_with_filled_na = df_with_filled_na.na.fill(value=value, subset=[col])
        df_with_filled_na.createOrReplaceTempView(view)
    return df_with_filled_na
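
# Hedged sketch of the null-filling logic above with plain DataFrame calls.
# na.fill silently skips columns whose type does not match the replacement
# value, which is why the method above coerces values for non-string columns.
# Column names and values are illustrative assumptions.
def _example_fill_na(spark):
    df = spark.table("example_view")
    filled = (
        df.na.fill(value=0.0, subset=["age"])       # numeric column, numeric fill
        .na.fill(value="unknown", subset=["name"])  # string column, string fill
    )
    filled.createOrReplaceTempView("example_view")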

def _transform(self, df: DataFrame) -> DataFrame:
    view: Optional[str] = self.getView()
    name: Optional[str] = self.getName()
    format_: str = self.getFormat()
    progress_logger: Optional[ProgressLogger] = self.getProgressLogger()
    # limit: int = self.getLimit()

    with ProgressLogMetric(
        name=f"{name or view}_{format_}_exporter", progress_logger=progress_logger
    ):
        try:
            writer: DataFrameWriter
            if view:
                writer = df.sql_ctx.table(view).write.format(format_)
            else:
                writer = df.write.format(format_)
            writer = writer.mode(self.getMode())
            for k, v in self.getOptions().items():
                writer = writer.option(k, v)
            writer.save()
        except AnalysisException as e:
            self.logger.error(f"Failed to write to {format_}")
            raise e
    return df
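
# Hedged sketch of the export path above. With no argument, save() falls back
# to the "path" option, which is presumably how the method above receives its
# output location through getOptions(). Format, mode, options, and path here
# are illustrative assumptions.
def _example_export(df):
    writer = df.write.format("csv").mode("overwrite")
    for key, value in {"header": "true", "path": "/tmp/example_csv"}.items():
        writer = writer.option(key, value)
    writer.save()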

def _transform(self, df: DataFrame) -> DataFrame:
    progress_logger: Optional[ProgressLogger] = self.getProgressLogger()

    with ProgressLogMetric(name="db_query_runner", progress_logger=progress_logger):
        connection: Connection = pymysql.connect(
            user=self.getUsername(),
            password=self.getPassword(),
            host=self.getHost(),
            port=self.getPort(),
            db=self.getDb(),
            client_flag=CLIENT.MULTI_STATEMENTS,
        )
        try:
            with connection.cursor() as cursor:
                rows_affected: int = cursor.execute(self.getQuery())
                self.logger.info(f"Rows Affected= {rows_affected}")
            connection.commit()
        except OperationalError as e:
            self.logger.error(f"Failed to run query {self.getQuery()}")
            raise e
        finally:
            connection.close()
    return df
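
# Hedged standalone sketch of the pymysql pattern above. Connection details
# and the SQL are illustrative; CLIENT.MULTI_STATEMENTS is what lets a single
# execute() call run several semicolon-separated statements.
def _example_run_query():
    import pymysql
    from pymysql.constants import CLIENT

    connection = pymysql.connect(
        user="example_user",
        password="example_password",
        host="localhost",
        port=3306,
        db="example_db",
        client_flag=CLIENT.MULTI_STATEMENTS,
    )
    try:
        with connection.cursor() as cursor:
            rows_affected = cursor.execute(
                "UPDATE jobs SET done = 1; DELETE FROM queue;"
            )
        connection.commit()
        return rows_affected  # row count of the first statement
    finally:
        connection.close()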

def _transform(  # type: ignore
    self, df: DataFrame, response: Dict[str, Any]
) -> Dict[str, Any]:
    param_key: str = self.getParamKey()
    urls: List[str] = self.getDownloadUrls() or response.get(param_key, [])  # type: ignore
    download_path: Optional[str] = self.getDownloadToPath()
    extract_zips: bool = self.getExtractZips()
    name: Optional[str] = self.getName()
    progress_logger: Optional[ProgressLogger] = self.getProgressLogger()
    assert urls
    assert download_path

    out: List[str] = []
    for url in urls:
        with ProgressLogMetric(name=name or url, progress_logger=progress_logger):
            downloader = FileDownloader(
                url=url, download_path=download_path, extract_archives=extract_zips
            )
            result = downloader.download_files_from_url()
            if result:
                out.append(str(result))
    response["filenames"] = out
    return response
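
# Hedged usage sketch of the download loop above. FileDownloader and
# download_files_from_url() are taken from the method itself; the URLs and
# download path are illustrative assumptions.
def _example_download(urls, download_path):
    filenames = []
    for url in urls:
        downloader = FileDownloader(
            url=url, download_path=download_path, extract_archives=True
        )
        result = downloader.download_files_from_url()
        if result:  # skip falsy results, mirroring the method above
            filenames.append(str(result))
    return filenames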

def _transform(self, df: DataFrame) -> DataFrame:
    kafka_brokers: str = self.getKafkaBrokers()
    topic: str = self.getTopic()
    starting_offset: int = self.getStartingOffset()
    use_ssl: bool = self.getUseSsl()
    schema: StructType = self.getSchema()
    name: Optional[str] = self.getName()
    previous_checkpoint_view: Optional[str] = self.getPreviousCheckpointView()
    progress_logger: Optional[ProgressLogger] = self.getProgressLogger()

    with ProgressLogMetric(
        name=f"{name or topic}_kafka_reader", progress_logger=progress_logger
    ):
        try:
            # resume from the highest offset recorded in the checkpoint view, if any
            if previous_checkpoint_view in df.sql_ctx.tableNames():
                last_offset = (
                    df.sql_ctx.table(previous_checkpoint_view)
                    .groupBy()
                    .max("offset")
                    .collect()[0]
                    .asDict()["max(offset)"]
                )
                if last_offset:
                    starting_offset = last_offset + 1

            security_protocol = "SSL" if use_ssl else "PLAINTEXT"
            starting_offset_text = f"""{{"{topic}":{{"0":{starting_offset}}}}}"""

            df = (
                df.sql_ctx.read.format("kafka")
                .option("kafka.bootstrap.servers", kafka_brokers)
                .option("kafka.security.protocol", security_protocol)
                .option("subscribe", topic)
                .option("startingOffsets", starting_offset_text)
                .load()
            )
            # Kafka values arrive as bytes: cast to string, then parse with the schema
            df = df.selectExpr("*", "CAST(value AS STRING) as event")
            df = df.withColumn("event", from_json(df.event, schema))
            df.createOrReplaceTempView(topic)

            if len(df.head(1)) == 0 and previous_checkpoint_view:
                # no new events: carry the previous checkpoint forward as the watermark
                df.sql_ctx.table(previous_checkpoint_view).createOrReplaceTempView(
                    f"{topic}_watermark"
                )
            else:
                df.createOrReplaceTempView(f"{topic}_watermark")
        except AnalysisException as e:
            self.logger.error(
                f"Failed to read from kafka topic: {topic} on {kafka_brokers}"
            )
            raise e
    return df
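
# Hedged sketch of the batch Kafka read above. It needs the
# spark-sql-kafka-0-10 package on the classpath; broker, topic, offsets, and
# schema are illustrative assumptions.
def _example_kafka_read(spark, schema):
    from pyspark.sql.functions import from_json

    raw = (
        spark.read.format("kafka")
        .option("kafka.bootstrap.servers", "localhost:9092")
        .option("kafka.security.protocol", "PLAINTEXT")
        .option("subscribe", "example_topic")
        # JSON keyed by topic, then partition -> starting offset
        .option("startingOffsets", """{"example_topic":{"0":0}}""")
        .load()
    )
    # values arrive as bytes: cast to string, then parse with the schema
    parsed = raw.selectExpr("*", "CAST(value AS STRING) as event")
    return parsed.withColumn("event", from_json(parsed.event, schema))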

def _transform(self, df: DataFrame) -> DataFrame:
    columns_to_check: List[str] = self.getColumnsToCheck()
    view: str = self.getView()
    progress_logger: Optional[ProgressLogger] = self.getProgressLogger()

    with ProgressLogMetric(name=f"{view}_drop_row", progress_logger=progress_logger):
        self.logger.info(
            f"dropping rows with null values in any of the columns: {columns_to_check}"
        )
        df_with_rows: DataFrame = df.sql_ctx.table(view)
        df_with_dropped_rows = df_with_rows.dropna(subset=columns_to_check)
        df_with_dropped_rows.createOrReplaceTempView(view)
    return df
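
# Hedged sketch of the row-dropping call above: dropna(subset=...) removes
# every row holding a null in any of the listed columns. The view and column
# names are illustrative assumptions.
def _example_drop_null_rows(spark):
    df = spark.table("example_view")
    df.dropna(subset=["id", "name"]).createOrReplaceTempView("example_view")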

def _transform(self, df: DataFrame) -> DataFrame:
    view: Optional[str] = self.getView()
    path: Union[str, Path] = self.getFilePath()
    name: Optional[str] = self.getName()
    progress_logger: Optional[ProgressLogger] = self.getProgressLogger()
    # limit: int = self.getLimit()

    with ProgressLogMetric(
        name=f"{name or view}_table_writer", progress_logger=progress_logger
    ):
        try:
            if view:
                df.sql_ctx.table(view).write.parquet(path=str(path))
            else:
                df.write.parquet(path=str(path))
        except AnalysisException as e:
            self.logger.error(f"File write failed to {path}")
            raise e
    return df
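
# Hedged sketch of the parquet write above; the output path is an
# illustrative assumption. Spark writes a directory of part-files.
def _example_parquet_write(df):
    df.write.parquet(path="/tmp/example_out.parquet")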

def _transform(self, df: DataFrame) -> DataFrame:
    try:
        # if steps are defined but not transformers then convert steps to transformers first
        if len(self.steps) > 0 and len(self.transformers) == 0:
            self.transformers = self.create_steps(self.steps)

        # get the logger to use
        logger = get_logger(__name__)

        count_of_transformers: int = len(self.transformers)
        i: int = 0
        pipeline_name: str = self.__class__.__name__
        self.progress_logger.log_event(
            event_name=pipeline_name,
            event_text=f"Starting Pipeline {pipeline_name}",
        )
        for transformer in self.transformers:
            assert isinstance(transformer, Transformer), type(transformer)
            try:
                i += 1
                logger.info(
                    f"---- Running pipeline [{pipeline_name}] transformer [{transformer}] "
                    f"({i} of {count_of_transformers}) ----"
                )
                with ProgressLogMetric(
                    progress_logger=self.progress_logger,
                    name=str(transformer) or "unknown",
                ):
                    self.progress_logger.log_event(
                        pipeline_name,
                        event_text=f"Running pipeline step {transformer}",
                    )
                    df = transformer.transform(dataset=df)
            except Exception as e:
                if hasattr(transformer, "getName"):
                    # noinspection Mypy
                    stage_name = transformer.getName()  # type: ignore
                else:
                    stage_name = transformer.__class__.__name__
                logger.error(
                    f"!!!!!!!!!!!!! pipeline [{pipeline_name}] transformer [{stage_name}] threw exception !!!!!!!!!!!!!"
                )
                # use exception chaining to add stage name but keep original exception
                friendly_spark_exception: FriendlySparkException = (
                    FriendlySparkException(exception=e, stage_name=stage_name)
                )
                error_messages: List[str] = (
                    friendly_spark_exception.message.split("\n")
                    if friendly_spark_exception.message
                    else []
                )
                for error_message in error_messages:
                    logger.error(msg=error_message)
                if hasattr(transformer, "getSql"):
                    # noinspection Mypy
                    logger.error(transformer.getSql())  # type: ignore
                logger.error(
                    "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
                )
                self.progress_logger.log_exception(
                    event_name=pipeline_name,
                    event_text=f"Exception in Stage={stage_name}",
                    ex=e,
                )
                raise friendly_spark_exception from e

        self.progress_logger.log_event(
            event_name=pipeline_name,
            event_text=f"Finished Pipeline {pipeline_name}",
        )
        return df
    finally:
        self._check_validation(df)
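
# Hedged sketch of the dispatch loop at the heart of the pipeline above,
# stripped of logging, metrics, and error handling. transform(dataset=...)
# matches the source; everything else is illustrative.
def _example_run_pipeline(df, transformers):
    for transformer in transformers:
        # each stage is a Spark ML Transformer returning a new DataFrame
        df = transformer.transform(dataset=df)
    return df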