from pyspark.sql import SparkSession
from com.rposam.schema.FileSchema import FileSchema
# Assumed module path for SparkConfiguration, mirroring the project's package layout
from com.rposam.conf.SparkConfiguration import SparkConfiguration
from com.rposam.util.logger import Log4j
import sys

if __name__ == "__main__":
    conf = SparkConfiguration.getSparkConf()

    driver = SparkSession. \
        builder. \
        config(conf=conf). \
        appName("ETL Pipeline using Airflow CSV To Parquet"). \
        getOrCreate()

    logger = Log4j(driver)

    logger.info("Fetching schema of source file")
    schema = FileSchema.empSchema()

    source = sys.argv[1]
    target = sys.argv[2]
    logger.info("Source is {0} and target is {1}".format(source, target))

    logger.info("Started reading data from source")
    empDF = driver.read. \
        format("csv"). \
        schema(schema=schema). \
        option("header", "false"). \
        load(source)  # the original snippet breaks off after option(); load(source) completes the read
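    # The original file ends mid-chain after the read; below is a minimal sketch of
    # the write side, assuming the Parquet target implied by the app name
    # "ETL Pipeline using Airflow CSV To Parquet" (mode("overwrite") is an assumption).
    logger.info("Writing data to target as parquet")
    empDF.write. \
        format("parquet"). \
        mode("overwrite"). \
        save(target)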
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.types import BooleanType, StringType
from com.rposam.schema.FileSchema import FileSchema
from com.rposam.util.logger import Log4j
# Assumed module paths: SparkConfiguration mirrors the project's package layout, and
# isValidEmail/parseGender are not defined in the original snippet (a sketch of them
# follows this file).
from com.rposam.conf.SparkConfiguration import SparkConfiguration
from com.rposam.util.udfs import isValidEmail, parseGender

# Build the record schema once at module load; renamed from the original
# "FileSchema = FileSchema()..." so the variable no longer shadows the class.
fileSchema = FileSchema()._20000recordsSchema()

if __name__ == "__main__":
    conf = SparkConfiguration().getSparkConf()
    warehouseLocation = "hdfs://localhost:8020/user/hive/warehouse/sparkdb.db"

    spark = SparkSession. \
        builder. \
        config(conf=conf). \
        appName("Custom UDF for Email and Gender validations and also a LTI Assignment"). \
        config("spark.sql.warehouse.dir", warehouseLocation). \
        config("hive.metastore.uris", "thrift://localhost:9083"). \
        enableHiveSupport(). \
        config("spark.sql.shuffle.partitions", 10). \
        getOrCreate()
    # Note: the original defined warehouseLocation but never applied it; wiring it
    # through spark.sql.warehouse.dir above is an assumption.

    logger = Log4j(spark)

    logger.info("Reading csv file with dropmalformed mode")
    # "csv" is the built-in source that superseded com.databricks.spark.csv
    df = spark.read.format("csv"). \
        option("header", "true"). \
        schema(schema=fileSchema). \
        option("mode", "DROPMALFORMED"). \
        load(r"csv\2000000_records.csv")
    df.printSchema()

    logger.info("Creating custom UDFs for email and gender")
    # Renamed so the UDF handles do not shadow the underlying Python functions
    is_valid_email = f.udf(isValidEmail, returnType=BooleanType())
    parse_gender = f.udf(parseGender, returnType=StringType())
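# ---------------------------------------------------------------------------
# Sketch: isValidEmail and parseGender are imported above from
# com.rposam.util.udfs, but neither that module nor the functions appear in the
# original snippet. A minimal assumed implementation (the regex and the gender
# mappings are guesses, not the project's actual validation rules):
# ---------------------------------------------------------------------------
import re

_EMAIL_PATTERN = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")


def isValidEmail(email):
    # Null-safe regex check; returns a Python bool for the BooleanType UDF
    if email is None:
        return False
    return _EMAIL_PATTERN.match(email) is not None


def parseGender(gender):
    # Normalize free-form gender strings to "Male"/"Female"/"Unknown"
    if gender is None:
        return "Unknown"
    g = gender.strip().lower()
    if g in ("m", "male"):
        return "Male"
    if g in ("f", "female"):
        return "Female"
    return "Unknown"


# Example application in the script above (the column names "email" and "gender"
# are assumptions about the CSV layout):
#
#   cleanDF = df. \
#       withColumn("valid_email", is_valid_email(f.col("email"))). \
#       withColumn("gender", parse_gender(f.col("gender")))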