Example #1
    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info(
                "Asmath --> the easiest usage is to pass a {column: value} dict when replacing nulls"
            )
            filePath = str(
                get_project_root()) + "/resources/data/small_zipcode.csv"
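            # header='true' makes the first CSV row the column names; inferSchema='true' lets Spark detect column types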
            df = self.spark.read.options(header='true', inferSchema='true') \
                .csv(filePath)
            logging.info("Drop nulls using df.na.drop()")
            df.na.drop().show(truncate=False)

            logging.info(
                "Asmath --> the easiest usage is to pass a {column: value} dict; the second form takes the value first and then the list of columns"
            )
            df.fillna({"city": "unknown", "type": ""}) \
                .show()

            df.printSchema()
            df.show(truncate=False)

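            # fillna() and na.fill() are aliases; a numeric value replaces nulls only in numeric columns,
            # and the optional subset restricts the replacement to the listed columns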
            df.fillna(value=0).show()
            df.fillna(value=0, subset=["population"]).show()
            df.na.fill(value=0).show()
            df.na.fill(value=0, subset=["population"]).show()

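            # an empty-string value replaces nulls only in string columns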
            df.fillna(value="").show()
            df.na.fill(value="").show()

            df.fillna("unknown", ["city"]) \
                .fillna("", ["type"]).show()

            df.fillna({"city": "unknown", "type": ""}) \
                .show()

            df.na.fill("unknown", ["city"]) \
                .na.fill("", ["type"]).show()

            df.na.fill({"city": "unknown", "type": ""}) \
                .show()

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return
Example #2
class Persist:
    logging.config.fileConfig(
        str(get_project_root()) + "/resources/configs/logging.conf")

    def __init__(self, spark, file_config):
        self.spark = spark
        self.file_config = file_config

    def persist_data(self, df):
        try:
            logger = logging.getLogger("Persist")
            logger.info('Persisting')
            #config = configparser.ConfigParser()
            #config.read('pipeline/resources/pipeline.ini')
            target_table = self.file_config.get('DB_CONFIGS',
                                                'TARGET_PG_TABLE')
            logger.info('PG Target table is ' + str(target_table))

            #df.coalesce(1).write.option("header", "true").csv("transformed_retailstore")

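            # write the DataFrame to Postgres over JDBC; mode("append") adds the rows to the existing table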
            df.write \
                .mode("append") \
                .format("jdbc") \
                .option("url", "jdbc:postgresql://localhost:5432/postgres") \
                .option("dbtable", target_table) \
                .option("user", "postgres") \
                .option("password", "admin") \
                .save()

        except Exception as exp:
            logger.error("An error occurred while persisting data > " +
                         str(exp))
            # store in database table
            # send an email notification
            raise Exception("An error occurred while persisting data > " + str(exp))

    def insert_into_pg(self):
        connection = psycopg2.connect(user='******',
                                      password='******',
                                      host='localhost',
                                      database='postgres')
        cursor = connection.cursor()
        insert_query = "INSERT INTO futurexschema.futurex_course_catalog (course_id, course_name, author_name, course_section, creation_date) VALUES (%s, %s, %s, %s,%s)"
        insert_tuple = (3, 'Machine Learning', 'FutureX', '{}', '2020-10-20')
        cursor.execute(insert_query, insert_tuple)
        connection.commit()
        cursor.close()
        connection.close()
Example #3
class Transform:
    logging.config.fileConfig(
        str(get_project_root()) + "/resources/configs/logging.conf")

    def __init__(self, spark, file_config):
        self.spark = spark
        self.file_config = file_config

    def transform_data(self, df):
        logger = logging.getLogger("Transform")
        logger.info("Transforming")
        logger.warning("Warning in Transformer")

        # drop all the rows having null values
        #df1 = df.na.drop()
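        # instead of dropping rows, replace the nulls with defaults so no records are lost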
        df1 = df.na.fill("Unknown", ["author_name"])
        df2 = df1.na.fill("0", ["no_of_reviews"])
        return df2
Example #4
 def verifyUsage(self, arguments):
     self.config_file = ''
     try:
         # accept -h (help) and -c <configfile>
         opts, args = getopt.getopt(arguments, "hc:")
     except getopt.GetoptError:
         logging.error('usage: test.py -c <configfile>')
         sys.exit(2)
     for opt, arg in opts:
         if opt == '-h':
             logging.info('usage: test.py -c <configfile>')
         elif opt == '-c':
             self.config_file = arg
             self.file_config = configparser.ConfigParser()
             self.file_config.read(
                 str(get_project_root()) + "/resources/pipeline.ini")
         else:
             logging.error('usage: test.py -c <configfile>')
             sys.exit()
     logging.info('Input file is ' + str(self.config_file))
Example #5
class Ingest:
    logging.config.fileConfig(
        str(get_project_root()) + "/resources/configs/logging.conf")

    def __init__(self, spark, file_config):
        self.spark = spark
        self.file_config = file_config

    def ingest_data(self):
        logger = logging.getLogger("Ingest")
        logger.info('Ingesting from csv')
        #customer_df = self.spark.read.csv("retailstore.csv",header=True)
        course_df = self.spark.sql("select * from fxxcoursedb.fx_course_table")
        logger.info('DataFrame created')
        logger.warning('DataFrame created with warning')
        return course_df

    def read_from_pg(self):
        connection = psycopg2.connect(user='******',
                                      password='******',
                                      host='localhost',
                                      database='postgres')
        sql_query = "select * from futurexschema.futurex_course_catalog"
        # read the query result into a pandas DataFrame, then convert it to a Spark DataFrame
        pdDF = sqlio.read_sql_query(sql_query, connection)
        connection.close()
        sparkDf = self.spark.createDataFrame(pdDF)
        sparkDf.show()

    def read_from_pg_using_jdbc_driver(self):

        jdbcDF = self.spark.read \
            .format("jdbc") \
            .option("url", "jdbc:postgresql://localhost:5432/postgres") \
            .option("dbtable", "futurexschema.futurex_course_catalog") \
            .option("user", "postgres") \
            .option("password", "admin") \
            .load()

        jdbcDF.show()
Example #6
class ExplodeMapArraysToRows:

    logging.config.fileConfig(
        str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info(
                'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/'
            )
            arrayData = [
                ('James', ['Java', 'Scala'], {'hair': 'black', 'eye': 'brown'}),
                ('Michael', ['Spark', 'Java', None], {'hair': 'brown', 'eye': None}),
                ('Robert', ['CSharp', ''], {'hair': 'red', 'eye': ''}),
                ('Washington', None, None),
                ('Jefferson', ['1', '2'], {})
            ]

            df = self.spark.createDataFrame(
                data=arrayData,
                schema=['name', 'knownLanguages', 'properties']).cache()
            df.printSchema()
            df.show()

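            # explode() on an array produces one row per element; rows whose array is null or empty are dropped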
            from pyspark.sql.functions import explode
            df2 = df.select(df.name, explode(df.knownLanguages))
            df2.printSchema()
            df2.show()
            df3 = df.withColumn("ExplodedColumn", explode(df.knownLanguages))
            df3.printSchema()
            df3.show()

            # Exploding map and Array
            from pyspark.sql.functions import explode

            logging.info(
                "Asmath --> Only one generator allowed per select clause but found 2: explode(knownLanguages), explode(properties);"
            )
            #Error: Only one generator allowed per select clause but found 2: explode(knownLanguages), explode(properties);
            #df5 = df.select(df.name, explode(df.knownLanguages), explode(df.properties))
            df5 = df.withColumn("ExplodedArrayColumn",
                                explode(df.knownLanguages))
            df5.printSchema()  # printSchema is a method: without the parentheses it is never called and nothing is printed
            df6 = df5.withColumn("ExplodedMapColumn", explode(
                df5.properties))  # pass df5 here not df
            df6.printSchema()
            df6.show()

            # Explode Array

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder\
            .appName(str(app_name))\
            .config("spark.driver.extraClassPath","pipeline/postgresql-42.2.18.jar")\
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql(
            "create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)"
        )
        #Treat empty strings as null
        self.spark.sql(
            "alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')"
        )

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            # accept -h (help) and -c <configfile>
            opts, args = getopt.getopt(arguments, "hc:")
        except getopt.GetoptError:
            logging.error('usage: test.py -c <configfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                logging.info('usage: test.py -c <configfile>')
            elif opt == '-c':
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(
                    str(get_project_root()) + "/resources/pipeline.ini")
            else:
                logging.error('usage: test.py -c <configfile>')
                sys.exit()
        logging.info('Input file is ' + str(self.config_file))
        logging.info('file config is ' + str(self.file_config))
Example #7
class GroupByExamples:

    logging.config.fileConfig(str(get_project_root())+"/resources/configs/logging.conf")
    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info('run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/')
            simpleData = [("James", "Sales", "NY", 90000, 34, 10000),
                          ("Michael", "Sales", "NY", 86000, 56, 20000),
                          ("Robert", "Sales", "CA", 81000, 30, 23000),
                          ("Maria", "Finance", "CA", 90000, 24, 23000),
                          ("Raman", "Finance", "CA", 99000, 40, 24000),
                          ("Scott", "Finance", "NY", 83000, 36, 19000),
                          ("Jen", "Finance", "NY", 79000, 53, 15000),
                          ("Jeff", "Marketing", "CA", 80000, 25, 18000),
                          ("Kumar", "Marketing", "NY", 91000, 50, 21000)
                          ]
            # agg() computes multiple aggregates in a single groupBy; the individual
            # aggregate methods (sum, min, max, ...) can also be used without agg()
            # The SQL HAVING clause maps to a where()/filter() applied after the aggregation
            # https://sparkbyexamples.com/pyspark/pyspark-groupby-explained-with-example/
            # Aggregate functions need a GROUP BY only when other columns are selected;
            # without other columns they aggregate over the whole DataFrame (Spark's default) and return no other columns
            # https://stackoverflow.com/questions/6467216/is-it-possible-to-use-aggregate-function-in-a-select-statment-without-using-grou/6467287
            schema = ["employee_name", "department", "state", "salary", "age", "bonus"]
            df = self.spark.createDataFrame(data=simpleData, schema=schema).cache()
            df.printSchema()
            df.show(truncate=False)

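            # GroupedData exposes sum/count/min/max/avg directly when only a single aggregate is needed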
            #Sum
            df.groupby(df.department).sum("salary").alias("sum_salary").show(truncate=False)  # GroupedData.sum() takes column name strings, not Column objects such as df.salary
            df.groupBy("department").sum("salary").show(truncate=False)
            df.groupBy(F.col("department")).sum("salary").show(truncate=False)  # the grouping column may be a Column object, but sum() still needs the name string
            #Count
            df.groupby(df.department).count().show(truncate=False)
            df.groupBy("department").count().show(truncate=False)
            df.groupBy(F.col("department")).count().show(truncate=False)

            # min
            df.groupby(df.department).min("salary").show(truncate=False)
            df.groupBy("department").min("salary").show(truncate=False)
            df.groupBy(F.col("department")).min("salary").show(truncate=False)

            # max
            df.groupby(df.department).max("salary").show(truncate=False)
            df.groupBy("department").max("salary").show(truncate=False)
            df.groupBy(F.col("department")).max("salary").show(truncate=False)

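            # for several aggregates in one pass, use agg() with the pyspark.sql.functions aggregates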
            # use the pyspark.sql.functions versions of sum/avg/max (they shadow the Python built-ins here)
            from pyspark.sql.functions import sum, avg, max
            df.groupBy("department") \
                .agg(sum("salary").alias("sum_salary"), \
                     avg("salary").alias("avg_salary"), \
                     sum("bonus").alias("sum_bonus"), \
                     max("bonus").alias("max_bonus") \
                     ) \
                .show(truncate=False)

            logging.info("agg() with a single aggregate; agg() is required when combining multiple aggregates")
            df.groupBy("department") \
                .agg(sum("salary").alias("sum_salary")
                     ) \
                .show(truncate=False)
            from pyspark.sql.functions import col
            df.groupBy("department") \
                .agg(sum("salary").alias("sum_salary"), \
                     avg("salary").alias("avg_salary"), \
                     sum("bonus").alias("sum_bonus"), \
                     max("bonus").alias("max_bonus")) \
                .where(col("sum_bonus") >= 50000) \
                .show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " + str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder\
            .appName(str(app_name))\
            .config("spark.driver.extraClassPath","pipeline/postgresql-42.2.18.jar")\
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql("create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)")
        #Treat empty strings as null
        self.spark.sql("alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')")

    def verifyUsage(self,arguments):
        self.config_file = ''
        self.file_config=None
        try:
            # accept -h (help) and -c <configfile>
            opts, args = getopt.getopt(arguments, "hc:")
        except getopt.GetoptError:
            logging.error('usage: test.py -c <configfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                logging.info('usage: test.py -c <configfile>')
            elif opt == '-c':
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(str(get_project_root()) + "/resources/pipeline.ini")
            else:
                logging.error('usage: test.py -c <configfile>')
                sys.exit()
        logging.info('Input file is '+str(self.config_file))
        logging.info('file config is '+str(self.file_config))
Example #8
class FilterColumns:

    logging.config.fileConfig(str(get_project_root())+"/resources/configs/logging.conf")
    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info('run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-withcolumn/')
            arrayStructureData = [
                (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
                (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
                (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F"),
                (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
                (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
                (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M")
            ]
            arrayStructureSchema = StructType([
                StructField('name', StructType([
                    StructField('firstname', StringType(), True),
                    StructField('middlename', StringType(), True),
                    StructField('lastname', StringType(), True)
                ])),
                StructField('languages', ArrayType(StringType()), True),
                StructField('state', StringType(), True),
                StructField('gender', StringType(), True)
            ])
            df = self.spark.createDataFrame(data=arrayStructureData, schema=arrayStructureSchema)
            df.printSchema()
            df.show(truncate=False)

            # filter dataframe where state= OH
            df.filter(df.state == "OH").show(truncate=False)
            df.filter(F.col("state")=="OH").show(truncate=False)

            # Multiple conditions: wrap each condition in parentheses and combine with & / |
            df.filter((df.state == "OH") & (df.gender == "M")).show(truncate=False)
            df.filter((F.col("state") == "OH") & (F.col("gender")  == "M")).show(truncate=False)

            # Filter on an array column with array_contains()
            df.filter(array_contains(df.languages, "Java") & (df.state == "OH") & (df.gender == "M")) \
                .show(truncate=False)

            df.filter((array_contains(F.col("languages"), "Java")) & (F.col("state") == "OH") & (F.col("gender") == "M")).show(truncate=False)
            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " + str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder\
            .appName(str(app_name))\
            .config("spark.driver.extraClassPath","pipeline/postgresql-42.2.18.jar")\
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql("create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)")
        #Treat empty strings as null
        self.spark.sql("alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')")

    def verifyUsage(self,arguments):
        self.config_file = ''
        self.file_config=None
        try:
            # accept -h (help) and -c <configfile>
            opts, args = getopt.getopt(arguments, "hc:")
        except getopt.GetoptError:
            logging.error('usage: test.py -c <configfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                logging.info('usage: test.py -c <configfile>')
            elif opt == '-c':
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(str(get_project_root()) + "/resources/pipeline.ini")
            else:
                logging.error('usage: test.py -c <configfile>')
                sys.exit()
        logging.info('Input file is '+str(self.config_file))
        logging.info('file config is '+str(self.file_config))
Example #9
class AggregateFunctions:

    logging.config.fileConfig(
        str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info(
                "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/"
            )
            # check collect_list and collect_set
            #collect_set() function returns all values from an input column with duplicate values eliminated.
            #collect_list() function returns all values from an input column with duplicates

            logging.info(
                'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/'
            )
            simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
                          ("Robert", "Sales", 4100),
                          ("Maria", "Finance", 3000), ("James", "Sales", 3000),
                          ("Scott", "Finance", 3300), ("Jen", "Finance", 3900),
                          ("Jeff", "Marketing", 3000),
                          ("Kumar", "Marketing", 2000),
                          ("Saif", "Sales", 4100)]
            schema = ["employee_name", "department", "salary"]

            df = self.spark.createDataFrame(data=simpleData,
                                            schema=schema).cache()
            df.show(truncate=False)

            from pyspark.sql.functions import approx_count_distinct, collect_list
            from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
            from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
            from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
            from pyspark.sql.functions import variance, var_samp, var_pop
            df.printSchema()
            df.show(truncate=False)

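            # approx_count_distinct() returns an approximate distinct count (HyperLogLog based), cheaper than countDistinct()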
            print("approx_count_distinct: " + \
                  str(df.select(approx_count_distinct("salary")).collect()[0][0]))

            print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

            df.select(collect_list("salary")).show(truncate=False)

            df.select(collect_set("salary")).show(truncate=False)

            df2 = df.select(countDistinct("department", "salary"))
            df2.show(truncate=False)
            print("Distinct Count of Department & Salary: " +
                  str(df2.collect()[0][0]))

            print("count: " + str(df.select(count("salary")).collect()[0][0]))
            dffirst = df.select(first("salary"))
            dffirst.show(truncate=False)
            df.select(last("salary")).show(truncate=False)
            df.select(kurtosis("salary")).show(truncate=False)
            df.select(max("salary")).show(truncate=False)
            df.select(min("salary")).show(truncate=False)
            df.select(mean("salary")).show(truncate=False)
            df.select(skewness("salary")).show(truncate=False)
            df.select(stddev("salary"), stddev_samp("salary"), \
                      stddev_pop("salary")).show(truncate=False)
            df.select(sum("salary")).show(truncate=False)
            df.select(sumDistinct("salary")).show(truncate=False)
            df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
                .show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder\
            .appName(str(app_name))\
            .config("spark.driver.extraClassPath","pipeline/postgresql-42.2.18.jar")\
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql(
            "create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)"
        )
        #Treat empty strings as null
        self.spark.sql(
            "alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')"
        )

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            # accept -h (help) and -c <configfile>
            opts, args = getopt.getopt(arguments, "hc:")
        except getopt.GetoptError:
            logging.error('usage: test.py -c <configfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                logging.info('usage: test.py -c <configfile>')
            elif opt == '-c':
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(
                    str(get_project_root()) + "/resources/pipeline.ini")
            else:
                logging.error('usage: test.py -c <configfile>')
                sys.exit()
        logging.info('Input file is ' + str(self.config_file))
        logging.info('file config is ' + str(self.file_config))
Example #10
class HandleNulls:

    logging.config.fileConfig(str(get_project_root())+"/resources/configs/logging.conf")
    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info("Asmath --> derive new column values with when()/otherwise() and SQL-style case when")
            data = [("James", "", "Smith", "36636", "M", 60000),
                    ("Michael", "Rose", "", "40288", "M", 70000),
                    ("Robert", "", "Williams", "42114", "", 400000),
                    ("Maria", "Anne", "Jones", "39192", "F", 500000),
                    ("Jen", "Mary", "Brown", "", "F", 0)]

            columns = ["first_name", "middle_name", "last_name", "dob", "gender", "salary"]
            df = self.spark.createDataFrame(data=data, schema=columns)
            df.printSchema()
            df.show(truncate=False)

            # Using when otherwise
            from pyspark.sql.functions import col, when
            df2 = df.withColumn("new_gender", when(col("gender") == "M", "Male")
                                .when(col("gender") == "F", "Female")
                                .otherwise("Unknown"))
            df2.show(truncate=False)

            df.select(col("*"), when(col("gender") == "M", "Male")
                      .when(col("gender") == "F", "Female")
                      .otherwise("Unknown").alias("new_gender")).show(truncate=False)

            # Using case when
            from pyspark.sql.functions import expr
            df3 = df.withColumn("new_gender", expr("case when gender = 'M' then 'Male' " +
                                                   "when gender = 'F' then 'Female' " +
                                                   "else 'Unknown' end"))
            df3.show(truncate=False)

            # Using case when
            df4 = df.select(col("*"), expr("case when gender = 'M' then 'Male' " +
                                           "when gender = 'F' then 'Female' " +
                                           "else 'Unknown' end").alias("new_gender"))
            df4.show(truncate=False)

            data2 = [(66, "a", "4"), (67, "a", "0"), (70, "b", "4"), (71, "d", "4")]
            df5 = self.spark.createDataFrame(data=data2, schema=["id", "code", "amt"])

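            # combine conditions with & / | and wrap each comparison in parentheses, since & and | bind more tightly than ==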
            df5.withColumn("new_column", when((col("code") == "a") | (col("code") == "d"), "A")
                           .when((col("code") == "b") & (col("amt") == "4"), "B")
                           .otherwise("A1")).show()

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " + str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder\
            .appName(str(app_name))\
            .config("spark.driver.extraClassPath","pipeline/postgresql-42.2.18.jar")\
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql("create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)")
        #Treat empty strings as null
        self.spark.sql("alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')")

    def verifyUsage(self,arguments):
        self.config_file = ''
        self.file_config=None
        try:
            # accept -h (help) and -c <configfile>
            opts, args = getopt.getopt(arguments, "hc:")
        except getopt.GetoptError:
            logging.error('usage: test.py -c <configfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                logging.info('usage: test.py -c <configfile>')
            elif opt == '-c':
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(str(get_project_root()) + "/resources/pipeline.ini")
            else:
                logging.error('usage: test.py -c <configfile>')
                sys.exit()
        logging.info('Input file is '+str(self.config_file))
        logging.info('file config is '+str(self.file_config))
Example #11
class CaseWhenOtherWise:

    logging.config.fileConfig(
        str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info(
                "Asmath --> the easiest usage is to pass a {column: value} dict when replacing nulls"
            )
            filePath = str(
                get_project_root()) + "/resources/data/small_zipcode.csv"
            df = self.spark.read.options(header='true', inferSchema='true') \
                .csv(filePath)
            logging.info("Drop nulls using df.na.drop()")
            df.na.drop().show(truncate=False)

            logging.info(
                "Asmath --> the easiest usage is to pass a {column: value} dict; the second form takes the value first and then the list of columns"
            )
            df.fillna({"city": "unknown", "type": ""}) \
                .show()

            df.printSchema()
            df.show(truncate=False)

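            # fillna() and na.fill() are aliases; a numeric value replaces nulls only in numeric columns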
            df.fillna(value=0).show()
            df.fillna(value=0, subset=["population"]).show()
            df.na.fill(value=0).show()
            df.na.fill(value=0, subset=["population"]).show()

            df.fillna(value="").show()
            df.na.fill(value="").show()

            df.fillna("unknown", ["city"]) \
                .fillna("", ["type"]).show()

            df.fillna({"city": "unknown", "type": ""}) \
                .show()

            df.na.fill("unknown", ["city"]) \
                .na.fill("", ["type"]).show()

            df.na.fill({"city": "unknown", "type": ""}) \
                .show()

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder\
            .appName(str(app_name))\
            .config("spark.driver.extraClassPath","pipeline/postgresql-42.2.18.jar")\
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql(
            "create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)"
        )
        #Treat empty strings as null
        self.spark.sql(
            "alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')"
        )

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            # accept -h (help) and -c <configfile>
            opts, args = getopt.getopt(arguments, "hc:")
        except getopt.GetoptError:
            logging.error('usage: test.py -c <configfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                logging.info('usage: test.py -c <configfile>')
            elif opt == '-c':
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(
                    str(get_project_root()) + "/resources/pipeline.ini")
            else:
                logging.error('usage: test.py -c <configfile>')
                sys.exit()
        logging.info('Input file is ' + str(self.config_file))
        logging.info('file config is ' + str(self.file_config))
Example #12
class SortOrderBy:

    logging.config.fileConfig(
        str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info(
                'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-orderby-and-sort-explained/'
            )
            simpleData = [("James", "Sales", "NY", 90000, 34, 10000), \
                          ("Michael", "Sales", "NY", 86000, 56, 20000), \
                          ("Robert", "Sales", "CA", 81000, 30, 23000), \
                          ("Maria", "Finance", "CA", 90000, 24, 23000), \
                          ("Raman", "Finance", "CA", 99000, 40, 24000), \
                          ("Scott", "Finance", "NY", 83000, 36, 19000), \
                          ("Jen", "Finance", "NY", 79000, 53, 15000), \
                          ("Jeff", "Marketing", "CA", 80000, 25, 18000), \
                          ("Kumar", "Marketing", "NY", 91000, 50, 21000) \
                          ]
            # sort by - sorting happens within each partition, so global order is not guaranteed
            # order by - total ordering of the result is guaranteed
            columns = [
                "employee_name", "department", "state", "salary", "age",
                "bonus"
            ]
            df = self.spark.createDataFrame(data=simpleData, schema=columns)
            df.printSchema()
            df.show(truncate=False)
            #Default is ascending order
            df.sort(df.department.asc(), df.state.asc()).show(truncate=False)
            df.sort("department", "state").show(truncate=False)
            df.sort(F.col("department").asc(),
                    F.col("state").desc()).show(truncate=False)

            df.orderBy(df.department.asc(),
                       df.state.desc()).show(truncate=False)
            df.orderBy("department", "state").show(truncate=False)
            df.orderBy(F.col("department").asc(),
                       F.col("state").desc()).show(truncate=False)

            # asc_nulls_first/desc_nulls_first (and the *_nulls_last variants) control where nulls appear in the ordering
            df.sort(df.department.asc_nulls_first(),
                    df.state.desc_nulls_first()).show(truncate=False)
            df.sort("department", "state").show(truncate=False)
            df.sort(
                F.col("department").asc_nulls_last(),
                F.col("state").desc_nulls_last()).show(truncate=False)

            df.orderBy(df.department.asc(),
                       df.state.desc()).show(truncate=False)
            df.orderBy("department", "state").show(truncate=False)
            df.orderBy(F.col("department").asc(),
                       F.col("state").desc()).show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder\
            .appName(str(app_name))\
            .config("spark.driver.extraClassPath","pipeline/postgresql-42.2.18.jar")\
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql(
            "create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)"
        )
        #Treat empty strings as null
        self.spark.sql(
            "alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')"
        )

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            # accept -h (help) and -c <configfile>
            opts, args = getopt.getopt(arguments, "hc:")
        except getopt.GetoptError:
            logging.error('usage: test.py -c <configfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                logging.info('usage: test.py -c <configfile>')
            elif opt == '-c':
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(
                    str(get_project_root()) + "/resources/pipeline.ini")
            else:
                logging.error('usage: test.py -c <configfile>')
                sys.exit()
        logging.info('Input file is ' + str(self.config_file))
        logging.info('file config is ' + str(self.file_config))
Example #13
 def create_spark_session(self):
     app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
     self.spark = SparkSession.builder\
         .appName(str(app_name))\
         .config("spark.driver.extraClassPath",str(get_project_root())+"/resources/postgresql-42.2.18.jar")\
         .enableHiveSupport().getOrCreate()
Example #14
class Pipeline:

    logging.config.fileConfig(
        str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info('run_pipeline method started')
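            # pipeline stages: ingest -> transform -> persist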
            ingest_process = ingest.Ingest(self.spark, self.file_config)
            #ingest_process.read_from_pg()
            #ingest_process.read_from_pg_using_jdbc_driver()
            df = ingest_process.ingest_data()
            df.show()
            transform_process = transform.Transform(self.spark,
                                                    self.file_config)
            transformed_df = transform_process.transform_data(df)
            transformed_df.show()
            persist_process = persist.Persist(self.spark, self.file_config)
            #persist_process.insert_into_pg()
            persist_process.persist_data(transformed_df)
            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder\
            .appName(str(app_name))\
            .config("spark.driver.extraClassPath",str(get_project_root())+"/resources/postgresql-42.2.18.jar")\
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql(
            "create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)"
        )
        #Treat empty strings as null
        self.spark.sql(
            "alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')"
        )

    def verifyUsage(self, arguments):
        self.config_file = ''
        try:
            # accept -h (help) and -c <configfile>
            opts, args = getopt.getopt(arguments, "hc:")
        except getopt.GetoptError:
            logging.error('usage: test.py -c <configfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                logging.info('usage: test.py -c <configfile>')
            elif opt == '-c':
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(
                    str(get_project_root()) + "/resources/pipeline.ini")
            else:
                logging.error('usage: test.py -c <configfile>')
                sys.exit()
        logging.info('Input file is ' + str(self.config_file))
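
# Driver sketch (assumed entry point) wiring the Pipeline methods above together:
if __name__ == "__main__":
    pipeline = Pipeline()
    pipeline.verifyUsage(sys.argv[1:])
    pipeline.create_spark_session()
    pipeline.create_hive_table()
    pipeline.run_pipeline()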
Example #15
class UpdateColumns:

    logging.config.fileConfig(
        str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info(
                'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-withcolumn/'
            )
            data = [('James', '', 'Smith', '1991-04-01', 'M', 3000),
                    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
                    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
                    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
                    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1)]

            columns = [
                "firstname", "middlename", "lastname", "dob", "gender",
                "salary"
            ]
            df = self.spark.createDataFrame(data=data, schema=columns).cache()
            #1. Update a column's value and change its data type
            df2 = df.withColumn("salary",
                                (F.col("salary") * 2).cast("Integer"))
            df2.printSchema()

            #2. Update the value of an existing column
            df3 = df.withColumn("salary", F.col("salary") * 100)
            df3.printSchema()

            #3.Create a new column from an existing
            df4 = df.withColumn("CopiedColumn", F.col("salary") * -1)
            df4.printSchema()

            #4. Add new constant column using lit
            df5 = df.withColumn("Country", F.lit("USA"))
            df5.printSchema()
            df5.show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder\
            .appName(str(app_name))\
            .config("spark.driver.extraClassPath","pipeline/postgresql-42.2.18.jar")\
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql(
            "create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)"
        )
        #Treat empty strings as null
        self.spark.sql(
            "alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')"
        )
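        # With serialization.null.format set to '', Hive treats empty strings in this
        # table (e.g. the blank author_name and no_of_reviews values) as NULL on read.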

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            # Accept -h (help) and -c <configfile>; getopt rejects anything else.
            opts, args = getopt.getopt(arguments, "hc:")
        except getopt.GetoptError:
            logging.error('usage: test.py -c <configfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                logging.info('usage: test.py -c <configfile>')
            elif opt == '-c':
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(
                    str(get_project_root()) + "/resources/pipeline.ini")
        logging.info('Input file is ' + str(self.config_file))
        logging.info('file config is ' + str(self.file_config))
Example #16
class Joins:

    logging.config.fileConfig(
        str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info(
                "https://github.com/khajaasmath786/pyspark-examples/blob/master/pyspark-join.py"
            )
            logging.info(
                'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/'
            )
            emp = [(1, "Smith", -1, "2018", "10", "M", 3000), \
                   (2, "Rose", 1, "2010", "20", "M", 4000), \
                   (3, "Williams", 1, "2010", "10", "M", 1000), \
                   (4, "Jones", 2, "2005", "10", "F", 2000), \
                   (5, "Brown", 2, "2010", "40", "", -1), \
                   (6, "Brown", 2, "2010", "50", "", -1) \
                   ]
            empColumns = ["emp_id", "name", "superior_emp_id", "year_joined", \
                          "emp_dept_id", "gender", "salary"]

            empDF = self.spark.createDataFrame(data=emp, schema=empColumns)
            empDF.printSchema()
            empDF.show(truncate=False)
            from pyspark.sql.functions import col
            dept = [("Finance", 10), \
                    ("Marketing", 20), \
                    ("Sales", 30), \
                    ("IT", 40) \
                    ]
            deptColumns = ["dept_name", "dept_id"]
            deptDF = self.spark.createDataFrame(data=dept, schema=deptColumns)
            deptDF.printSchema()
            deptDF.show(truncate=False)
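            # Self-join: alias the same DataFrame as emp1 and emp2 so each employee
            # row can be matched to the row of its superior.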
            df5=empDF.alias("emp1").join(empDF.alias("emp2"), \
                                     col("emp1.superior_emp_id") == col("emp2.emp_id"), "inner")
            df5.printSchema()

            empDF.alias("emp1").join(empDF.alias("emp2"), \
                                     col("emp1.superior_emp_id") == col("emp2.emp_id"), "inner") \
                .select(col("emp1.emp_id"), col("emp1.name"), \
                        col("emp2.emp_id").alias("superior_emp_id"), \
                        col("emp2.name").alias("superior_emp_name")) \
                .show(truncate=False)

            empDF.createOrReplaceTempView("EMP")
            deptDF.createOrReplaceTempView("DEPT")

            joinDF = self.spark.sql(
                "select * from EMP e, DEPT d where e.emp_dept_id == d.dept_id")
            joinDF.show(truncate=False)

            joinDF2 = self.spark.sql(
                "select * from EMP e INNER JOIN DEPT d ON e.emp_dept_id == d.dept_id")
            joinDF2.show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occured while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder\
            .appName(str(app_name))\
            .config("spark.driver.extraClassPath","pipeline/postgresql-42.2.18.jar")\
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql(
            "create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)"
        )
        self.spark.sql(
            "insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)"
        )
        #Treat empty strings as null
        self.spark.sql(
            "alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')"
        )

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            # Accept -h (help) and -c <configfile>; getopt rejects anything else.
            opts, args = getopt.getopt(arguments, "hc:")
        except getopt.GetoptError:
            logging.error('usage: test.py -c <configfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                logging.info('usage: test.py -c <configfile>')
            elif opt == '-c':
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(
                    str(get_project_root()) + "/resources/pipeline.ini")
        logging.info('Input file is ' + str(self.config_file))
        logging.info('file config is ' + str(self.file_config))