def getSparkDF(self, input_path):
    # Build a SparkSession from the project-specific SparkConfiguration helper.
    conf = SparkConfiguration().getSparkConf()
    spark = SparkSession.builder. \
        config(conf=conf). \
        appName("pyspark postgres performance test"). \
        getOrCreate()
    # Read the CSV with an explicit schema so Spark does not have to infer column types.
    schema = FileSchema().empSchema()
    df = spark.read.format("csv").load(input_path,
                                       schema=schema,
                                       inferSchema=False)
    print("No of Partitions:{0}".format(df.rdd.getNumPartitions()))
    return df
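
The method above relies on a repo-specific FileSchema().empSchema() helper that is not shown in this snippet. Below is a minimal sketch of what such a helper might return; the class shape, column names, and types are assumptions for illustration, not the actual com.rposam.schema.FileSchema definition.

from pyspark.sql.types import StructType, StructField, StringType, IntegerType


class FileSchema:
    # Hypothetical sketch only; the real empSchema() lives in com.rposam.schema.FileSchema.
    def empSchema(self):
        return StructType([
            StructField("emp_id", IntegerType(), True),
            StructField("first_name", StringType(), True),
            StructField("gender", StringType(), True),
            StructField("email", StringType(), True),
        ])
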
Example #2
import sys

from pyspark.sql import SparkSession
from com.rposam.config.SparkConf import SparkConfiguration
from com.rposam.schema.FileSchema import FileSchema
from com.rposam.util.logger import Log4j

if __name__ == "__main__":
    conf = SparkConfiguration.getSparkConf()

    Driver = SparkSession. \
        builder. \
        config(conf=conf). \
        appName("ETL Pipeline using Airflow CSV To Parquet"). \
        getOrCreate()

    logger = Log4j(Driver)

    logger.info("Fetching schema of source file")
    schema = FileSchema.empSchema()

    source = sys.argv[1]
    target = sys.argv[2]

    logger.info("Source is {0} and target is {1}".format(source, target))

    logger.info("Started reading data from sources")

    empDF = Driver.read. \
        format("csv"). \
        schema(schema=schema). \
        option("header", "false"). \
        load(source)
    empDF.show()
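
The app name promises a CSV-to-Parquet pipeline, but the snippet stops after show(). A minimal sketch of the write step that would typically follow is below, assuming target (sys.argv[2]) is a writable directory path; the overwrite save mode is an assumption, not the original author's choice.

    logger.info("Writing data to target as parquet")

    # Hypothetical continuation: persist the DataFrame as Parquet at the target path.
    empDF.write. \
        mode("overwrite"). \
        parquet(target)
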
Example #3

import re

from pyspark.sql import SparkSession
from com.rposam.config.SparkConf import SparkConfiguration
from com.rposam.schema.FileSchema import FileSchema
from com.rposam.util.logger import Log4j

# The snippet begins mid-function here; the def line and regex are reconstructed
# with a generic email pattern so the example is runnable.
regex = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+$"


def parseEmail(email):
    if re.search(regex, str(email)):
        return True
    else:
        return False


def parseGender(gender):
    if str(gender) in ["Male", "M", "male", "m", "MALE"]:
        return "M"
    elif str(gender) in ["Female", "F", "f", "female", "feMale", "FEMALE"]:
        return "F"
    else:
        return "N"


# Note: this rebinds the name FileSchema from the class to the schema instance it returns.
FileSchema = FileSchema()._20000recordsSchema()

if __name__ == "__main__":
    conf = SparkConfiguration().getSparkConf()
    warehouseLocation = "hdfs://localhost:8020/user/hive/warehouse/sparkdb.db"
    spark = SparkSession. \
        builder. \
        config(conf=conf). \
        appName("Custom UDF for Email and Gender validations and also a LTI Assignment"). \
        config("spark.sql.warehouse.dir", warehouseLocation). \
        config("hive.metastore.uris", "thrift://localhost:9083"). \
        enableHiveSupport(). \
        config("spark.sql.shuffle.partitions", 10). \
        getOrCreate()

    logger = Log4j(spark)
    logger.info("Reading csv file with dropmalformed mode")
Example #4
import os
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, explode
from pyspark.sql.types import StructType, StructField, ArrayType, StringType
from com.rposam.util.logger import Log4j
from com.rposam.schema.FileSchema import FileSchema
from com.rposam.config.SparkConf import SparkConfiguration

schema = FileSchema.randomuserapiSchema()

if __name__ == "__main__":
    conf = SparkConfiguration.getSparkConf()
    # for locally installed spark and hadoop
    # warehouseLocation = "hdfs://localhost:8020/user/hive/warehouse/sparkdb.db"
    # thriftServer ="thrift://localhost:9083"

    # for itversity cluster
    warehouseLocation = "hdfs://nn01.itversity.com:8020/user/rposam2021/warehouse/rposam2021_hivedb.db"
    thriftServer = "thrift://gw02.itversity.com:9083"
    os.environ["HADOOP_USER_NAME"] = "rposam2021"
    spark = SparkSession.builder. \
        config(conf=conf). \
        appName("Read json and write to local installed spark on ubuntu"). \
        config("spark.sql.warehouse.dir", warehouseLocation). \
        config("hive.metastore.uris", thriftServer). \
        enableHiveSupport(). \
        getOrCreate()
    logger = Log4j(spark)
    logger.info("Spark session created using enableHivesupport")
    df = spark.read.schema(schema=schema).option(