def _get_spark_session(self):
    """Create the Spark session, register custom UDFs and set up logging.

    Side effects: populates ``self.spark`` with the session and
    ``self.logger`` with a Log4j wrapper around it. Reads config
    key-value pairs from ``self.config``.
    """
    spark_builder = SparkSession.builder \
        .master('local[*]') \
        .appName("Sample etl app")

    # add spark config params
    for key, val in self.config.items():
        spark_builder.config(key, val)

    # init session
    self.spark = spark_builder.getOrCreate()

    # register custom udfs so they are usable from Spark SQL
    self.spark.udf.register("convertFeetToCm", convert_feet_to_cm, IntegerType())
    self.spark.udf.register("convertLbsToKg", convert_lbs_to_kg, DoubleType())

    # set log level (INFO, not DEBUG — raise to "DEBUG" if verbose
    # driver/executor logs are wanted)
    self.spark.sparkContext.setLogLevel("INFO")

    # init logger
    self.logger = logging.Log4j(self.spark)
    self.logger.info("spark session started")
def start_spark(app, files=None, pyfiles=None):
    """Start a local, Hive-enabled Spark session and dump its configuration.

    :param app: Name of the Spark application.
    :param files: Unused; reserved for files to ship to the cluster.
    :param pyfiles: Unused; reserved for a --py-files equivalent.
    :return: Tuple of (Spark session, SparkFiles root directory, logger).
    """
    spark_builder = SparkSession.builder.appName(app).master('local[*]')
    spark_builder.config(
        'spark.files',
        "SparkFinal/configs/etl_config.json,/usr/local/Cellar/hive/2.1.0/libexec/conf/hive-site.xml"
    )
    spark_sess = spark_builder.enableHiveSupport().getOrCreate()

    # dump the effective configuration for debugging
    for key, val in spark_sess.sparkContext.getConf().getAll():
        print(key + "=" + val)
    print("Spark WebURL= %s" % spark_sess.sparkContext.uiWebUrl)

    # files shipped via spark.files land in this per-application directory
    spark_files_dir = SparkFiles.getRootDirectory()
    print("spark_files_dir= %s" % spark_files_dir)
    print("file_in_Spark_dir= %s" % os.listdir(spark_files_dir))

    # show all configurable SQL properties and their values
    spark_sess.sql("SET -v").show()

    spark_logger = logging.Log4j(spark_sess)
    return spark_sess, spark_files_dir, spark_logger
def start_spark(app_name="spark-app", master="local[*]", jar_packages=None,
                files=None, spark_config=None):
    """Start a Spark session, get a Spark logger and load the job config.

    Only ``app_name`` applies when launched via spark-submit; the other
    arguments exist for running from an interactive console or debugger.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to the Spark cluster; the first
        entry is expected to be a JSON config file.
    :param spark_config: Dictionary of config key-value pairs.
    :return: Tuple of (Spark session, logger, config dict or None).
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # detect execution environment: interactive console or DEBUG env var
    flag_repl = not (hasattr(__main__, "__file__"))
    flag_debug = "DEBUG" in environ.keys()

    if not (flag_repl or flag_debug):
        # get Spark session factory (cluster defaults from spark-submit)
        spark_builder = SparkSession.builder.appName(app_name)
    else:
        # get Spark session factory (local/debug run)
        spark_builder = SparkSession.builder.master(master).appName(app_name)

    # create Spark JAR packages string
    spark_jars_packages = ",".join(list(jar_packages))
    spark_builder.config("spark.jars.packages", spark_jars_packages)

    spark_files = ",".join(list(files))
    spark_builder.config("spark.files", spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files; guard against an
    # empty list (original raised IndexError here)
    if files:
        with open(files[0], "r") as config_file:
            config_dict = json.load(config_file)
    else:
        config_dict = None

    return spark_sess, spark_logger, config_dict
def start_spark(app_name='sf_tree_distribution_job', master='local[*]', files=None,
                spark_config=None):
    """Start Spark session, get Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # avoid mutable default arguments (shared across calls)
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # detect execution environment: interactive console or DEBUG env var
    flag_repl = not (hasattr(__main__, '__file__'))
    flag_debug = 'DEBUG' in environ.keys()

    if not (flag_repl or flag_debug):
        # get Spark session factory (cluster defaults from spark-submit)
        spark_builder = (SparkSession.builder.appName(app_name))
    else:
        # get Spark session factory (local/debug run)
        spark_builder = (SparkSession.builder.master(master).appName(app_name))

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('config.json')
    ]

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        spark_logger.warn('no config file found')
        config_dict = None

    return spark_sess, spark_logger, config_dict
def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=None,
                files=None, spark_config=None):
    """Start Spark session, get Spark logger and load config files.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: Currently unused; reserved for Spark JAR packages.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # detect execution environment: interactive console or DEBUG env var
    flag_repl = not (hasattr(__main__, '__file__'))
    flag_debug = 'DEBUG' in environ.keys()

    if not (flag_repl or flag_debug):
        spark_builder = (
            SparkSession
            .builder
            .appName(app_name))
    else:
        spark_builder = (
            SparkSession
            .builder
            .master(master)
            .appName(app_name))

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files; the warns below
    # are deliberate debug tracing of the SparkFiles directory contents
    spark_files_dir = SparkFiles.getRootDirectory()
    spark_logger.warn('spark_files_dir' + str(spark_files_dir))
    for filename in listdir(spark_files_dir):
        spark_logger.warn('filename' + str(filename))
    config_f = SparkFiles.get('configs/etl_config.json')
    spark_logger.warn('config_f' + str(config_f))

    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]
    spark_logger.warn('config_files' + str(config_files))

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        spark_logger.warn('no config file found')
        config_dict = None

    return spark_sess, spark_logger, config_dict
def start_spark():
    """Build the Spark session described by ``Config`` plus a Log4j logger.

    :return: Tuple of (Spark session, logger).
    """
    builder = SparkSession.builder
    builder = builder.master(Config.master).appName(Config.app_name)
    # driver/executor memory come from the application-level Config object
    builder = builder.config("spark.driver.memory", Config.driver_memory)
    builder = builder.config("spark.executor.memory", Config.executor_memory)

    session = builder.getOrCreate()
    return session, logging.Log4j(session)
def start_spark(app_name='sars_cov2_analysis', master='local[*]', jar_packages=None,
                files=None, spark_config=None):
    """Start Spark session, SQLContext, logger and load the job config.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: Tuple of (Spark session, SQLContext, logger, config dict or None).
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    flag_repl = not (hasattr(__main__, '__file__'))
    # set debug manually if needed
    # (alternatively use a .env with 'DEBUG' in environ.keys())
    flag_debug = False

    if not (flag_repl or flag_debug):
        spark_builder = (
            SparkSession
            .builder
            .appName(app_name))
    else:
        spark_builder = (
            SparkSession
            .builder
            .master(master)
            .appName(app_name))

    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # NOTE: set last, so this fixed value overrides any
    # 'spark.executor.memory' passed in via spark_config
    spark_builder.config("spark.executor.memory", "2g")

    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('Config: ' + config_files[0])
    else:
        spark_logger.warn('Warning: No config found.')
        config_dict = None

    sql_context = SQLContext(spark_sess.sparkContext)
    return spark_sess, sql_context, spark_logger, config_dict
def create_spark_session(app_name='spark-application', master='local[*]', files=None,
                         spark_config=None, jar_packages=None):
    """Create a Hive-enabled Spark session, logger and load the job config.

    :param app_name: Name of the current application.
    :param master: Master node's connection details.
    :param files: Paths of files to be placed on each executor.
    :param spark_config: Dictionary of key-value pair config variables
        for the Spark session.
    :param jar_packages: List of Spark JAR package names.
    :return: Tuple of (Spark session, logger, config dict or None).
    """
    # avoid mutable default arguments (shared across calls)
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config
    jar_packages = [] if jar_packages is None else jar_packages

    spark_builder = (SparkSession
                     .builder
                     .master(master)
                     .enableHiveSupport()
                     .appName(app_name))

    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        spark_logger.warn('no config file found')
        config_dict = None

    return spark, spark_logger, config_dict
def start_spark(app_name='my_spark_app', master='local[*]', spark_config=None):
    """Start a Spark session, ship the ETL config file and parse it.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param spark_config: Dictionary of config key-value pairs.
    :return: Tuple of (Spark session, logger, config dict).
    """
    # avoid a mutable default argument (shared across calls)
    spark_config = {} if spark_config is None else spark_config

    spark_builder = (SparkSession.builder.master(master).appName(app_name))
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # ship the config file to every node, then read it back via SparkFiles
    spark_sess.sparkContext.addFile('configs/etl_config.json')
    with open(SparkFiles.get('etl_config.json'), 'r') as cf:
        config_dict = json.load(cf)

    return spark_sess, spark_logger, config_dict
def main():
    """Run the ETL pipeline: create test data, extract, transform, load.

    :return: None
    """
    spark = SparkSession.builder.getOrCreate()
    spark_logger = logging.Log4j(spark)

    # seed the environment with fixture data before running the job
    create_test_data(spark, None)

    spark_logger.warn('ETL_TEMPLATE - Extracting data ...')
    raw = extract_data(spark)

    spark_logger.warn('ETL_TEMPLATE - Transforming data ...')
    transformed = transform_data(raw)

    spark_logger.warn('ETL_TEMPLATE - Loading data ...')
    load_data(transformed)

    spark.stop()
    return None
def start_spark(self, app_name='my_spark_app', master='local[*]', jar_packages=None,
                files=None, spark_config=None):
    """Start a Spark session and stash session, logger and config on self.

    Side effects: sets ``self.spark``, ``self.log`` and ``self.config_data``.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: True on completion.
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # get Spark session factory
    # (fixed: original had 'SparkSession.buider' — AttributeError at runtime)
    spark_builder = (SparkSession.builder.master(master).appName(app_name))

    # create Spark JAR packages string
    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other configs params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get configs file if sent to cluster with --files
    # (fixed: original used dir(spark_files_dir), which lists the string's
    # attributes, not the directory's files — use listdir instead)
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('json')
    ]

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded configs from ' + config_files[0])
    else:
        spark_logger.warn('no configs file found')
        config_dict = None

    self.spark = spark_sess
    self.log = spark_logger
    self.config_data = config_dict
    return True
def start_spark(app_name='my_spark_etl', master='local[*]', jar_packages=None,
                files=None, spark_config=None):
    """Start Spark session, get Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. Note, that only the app_name argument
    will apply when this is called from a script sent to spark-submit.
    All other arguments exist solely for testing the script from within
    an interactive Python console.

    This function also looks for a file ending in 'config.json' that
    can be sent with the Spark job. If it is found, it is opened, the
    contents parsed (assuming it contains valid JSON for the ETL job
    configuration), into a dict of ETL job configuration parameters,
    which are returned as the last element in the tuple returned by
    this function. If the file cannot be found then a fallback local
    config path is tried before giving up and returning None for config.

    The function checks the enclosing environment to see if it is being
    run from inside an interactive console session or from an
    environment which has a `DEBUG` environment variable set (e.g.
    setting `DEBUG=1` as an environment variable as part of a debug
    configuration within an IDE such as Visual Studio Code or PyCharm).
    In this scenario, the function uses all available function arguments
    to start a PySpark driver from the local PySpark package as opposed
    to using the spark-submit and Spark cluster defaults. This will also
    use local module imports, as opposed to those in the zip archive
    sent to spark via the --py-files flag in spark-submit.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of (Spark session, logger, config dict or None,
        environment string).
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # detect execution environment
    flag_repl = not hasattr(__main__, '__file__')
    flag_debug = 'DEBUG' in environ.keys()

    warehouse_location = abspath('spark-warehouse')

    if not (flag_repl or flag_debug):
        # get Spark session factory
        spark_builder = (SparkSession.builder.appName(app_name))
        # Flag required for suppressing Hive table creation
        # (create_database_table()) and Impala REFRESH
        environment = 'local'
    else:
        # get Spark session factory
        spark_builder = (
            SparkSession.builder.master(master).appName(app_name).config(
                "spark.sql.warehouse.dir",
                warehouse_location).enableHiveSupport())
        # fixed: original left 'environment' unassigned on this branch,
        # raising NameError at the return statement below
        environment = 'debug'

    # create Spark JAR packages string
    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('config.json')
    ]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        # fall back to the local repo config when not running via --files
        try:
            with open('configs/etl_config.json', 'r') as config_file:
                config_json = config_file.read().replace('\n', '')
            config_dict = loads(config_json)
        except FileNotFoundError:
            config_dict = None

    return spark_sess, spark_logger, config_dict, environment
def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=None,
                files=None, spark_config=None):
    """Start Spark session, get the Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. NOTE - only the app_name argument
    will apply when this is called from a script sent to spark-submit
    (i.e. when __name__ = '__main__'). All other arguments exist solely
    for testing the script from within an interactive Python console.

    This function also looks for a file ending in 'config.json' that
    can be sent with the Spark job. If it is found, it is opened, the
    contents parsed (assuming it contains valid JSON for the ETL job
    configuration), into a dict of ETL job configuration parameters,
    which are returned as the last element in the tuple returned by
    this function. If the file cannot be found then the return tuple
    only contains the Spark session and Spark logger objects.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # NOTE(review): inside a function __name__ is the *module's* name, so
    # this branch is only taken when this module itself is executed as a
    # script — confirm this matches how the job is launched.
    if __name__ == '__main__':
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .appName(app_name))
    else:
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .master(master)
            .appName(app_name))

    # create Spark JAR packages string
    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        config_dict = None

    # build return tuple conditional on presence of config
    if config_dict is not None:
        return_tup = spark_sess, spark_logger, config_dict
    else:
        return_tup = spark_sess, spark_logger
    return return_tup
def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=None,
                files=None, spark_config=None, dependencies=None):
    """Start Spark session, get Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. Note, that only the app_name argument
    will apply when this is called from a script sent to spark-submit.
    All other arguments exist solely for testing the script from within
    an interactive Python console.

    This function also looks for files ending in 'config.json' that can
    be sent with the Spark job. Every one found is opened and its JSON
    contents merged into a dict of ETL job configuration parameters,
    which are returned as the last element in the tuple returned by
    this function. If no file is found then the config element is None.

    The function checks the enclosing environment to see if the `DEBUG`
    environment variable is set (e.g. setting `DEBUG=1` as part of a
    debug configuration within an IDE such as Visual Studio Code or
    PyCharm). In this scenario, the function uses all available function
    arguments to start a PySpark driver from the local PySpark package
    as opposed to using the spark-submit and Spark cluster defaults.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :param dependencies: Optional path to a packages zip with all
        dependencies, shipped via spark.submit.pyFiles.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # avoid mutable default arguments (shared across calls); copy `files`
    # so the insert/append below cannot mutate the caller's list or leak
    # entries between calls (the original mutated the shared default)
    jar_packages = [] if jar_packages is None else jar_packages
    files = list(files) if files is not None else []
    spark_config = {} if spark_config is None else spark_config

    # prepend default config file
    files.insert(0, "configs/default_config.json")

    # append packages zip file with all dependencies
    if dependencies:
        files.append(dependencies)

    # detect execution environment
    flag_debug = 'DEBUG' in environ.keys()

    config_dict = {}

    if flag_debug or master is None:
        # get Spark session factory
        spark_builder = (
            SparkSession.builder.enableHiveSupport().appName(app_name))
    else:
        # get Spark session factory
        spark_builder = (SparkSession.builder.master(
            "spark://{}".format(master)).enableHiveSupport().appName(app_name))

    # create Spark JAR packages string
    spark_builder.config('spark.jars.packages', ','.join(list(jar_packages)))

    # add packages.zip as a dependency (only when one was supplied;
    # the original passed None through to the config)
    if dependencies:
        spark_builder.config('spark.submit.pyFiles', dependencies)

    spark_builder.config('spark.files', ','.join(files))

    # add other config params BEFORE getOrCreate — the original applied
    # them after session creation, where they had no effect
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config files if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('config.json')
    ]

    if config_files:
        # merge every config file found, later files overriding earlier keys
        for config_name in config_files:
            path_to_config_file = path.join(spark_files_dir, config_name)
            with open(path_to_config_file, 'r') as config_file:
                config_dict.update(json.load(config_file))
            spark_logger.info('loaded config from ' + config_name)
    else:
        # use warn for consistency with the Log4j wrapper's API
        spark_logger.warn('no config file found')
        config_dict = None

    return spark_sess, spark_logger, config_dict
def start_spark(app_name='hello_fresh_etl_job', master='local[*]', jar_packages=None,
                files=None, spark_config=None):
    """Start Spark session, get Spark logger and load config files.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of (Spark session, logger, config dict or None,
        environment string).
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # detect execution environment
    flag_repl = not hasattr(__main__, '__file__')
    flag_debug = 'DEBUG' in environ.keys()

    if not (flag_repl or flag_debug):
        # get Spark session factory
        spark_builder = (SparkSession.builder.appName(app_name))
        environment = 'local'
    else:
        # get Spark session factory
        spark_builder = (SparkSession.builder.master(master).appName(app_name))
        # fixed: original left 'environment' unassigned on this branch,
        # raising NameError at the return statement below
        environment = 'debug'

    # create Spark JAR packages string
    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('config.json')
    ]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        # fall back to the local repo config when not running via --files
        try:
            with open('configs/etl_config.json', 'r') as config_file:
                config_json = config_file.read().replace('\n', '')
            config_dict = loads(config_json)
        except FileNotFoundError:
            config_dict = None

    return spark_sess, spark_logger, config_dict, environment