def _get_spark_session(self):
    """Create the Spark session, register custom UDFs and set up logging.

    Side effects: populates ``self.spark`` with the session and
    ``self.logger`` with a Log4j wrapper around it. Reads config
    key-value pairs from ``self.config``.
    """
    spark_builder = SparkSession.builder \
        .master('local[*]') \
        .appName("Sample etl app")

    # add spark config params
    for key, val in self.config.items():
        spark_builder.config(key, val)

    # init session
    self.spark = spark_builder.getOrCreate()

    # register custom udfs so they are usable from Spark SQL
    self.spark.udf.register("convertFeetToCm", convert_feet_to_cm, IntegerType())
    self.spark.udf.register("convertLbsToKg", convert_lbs_to_kg, DoubleType())

    # set log level (INFO, not DEBUG — raise to "DEBUG" if verbose
    # driver/executor logs are wanted)
    self.spark.sparkContext.setLogLevel("INFO")

    # init logger
    self.logger = logging.Log4j(self.spark)
    self.logger.info("spark session started")
def start_spark(app, files=None, pyfiles=None):
    """Start a local, Hive-enabled Spark session and dump its configuration.

    :param app: Name of the Spark application.
    :param files: Unused; reserved for files to ship to the cluster.
    :param pyfiles: Unused; reserved for a --py-files equivalent.
    :return: Tuple of (Spark session, SparkFiles root directory, logger).
    """
    spark_builder = SparkSession.builder.appName(app).master('local[*]')
    spark_builder.config(
        'spark.files',
        "SparkFinal/configs/etl_config.json,/usr/local/Cellar/hive/2.1.0/libexec/conf/hive-site.xml"
    )
    spark_sess = spark_builder.enableHiveSupport().getOrCreate()

    # dump the effective configuration for debugging
    for key, val in spark_sess.sparkContext.getConf().getAll():
        print(key + "=" + val)
    print("Spark WebURL= %s" % spark_sess.sparkContext.uiWebUrl)

    # files shipped via spark.files land in this per-application directory
    spark_files_dir = SparkFiles.getRootDirectory()
    print("spark_files_dir= %s" % spark_files_dir)
    print("file_in_Spark_dir= %s" % os.listdir(spark_files_dir))

    # show all configurable SQL properties and their values
    spark_sess.sql("SET -v").show()

    spark_logger = logging.Log4j(spark_sess)
    return spark_sess, spark_files_dir, spark_logger
def start_spark(app_name="spark-app", master="local[*]", jar_packages=None,
                files=None, spark_config=None):
    """Start a Spark session, get a Spark logger and load the job config.

    Only ``app_name`` applies when launched via spark-submit; the other
    arguments exist for running from an interactive console or debugger.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to the Spark cluster; the first
        entry is expected to be a JSON config file.
    :param spark_config: Dictionary of config key-value pairs.
    :return: Tuple of (Spark session, logger, config dict or None).
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # detect execution environment: interactive console or DEBUG env var
    flag_repl = not (hasattr(__main__, "__file__"))
    flag_debug = "DEBUG" in environ.keys()

    if not (flag_repl or flag_debug):
        # get Spark session factory (cluster defaults from spark-submit)
        spark_builder = SparkSession.builder.appName(app_name)
    else:
        # get Spark session factory (local/debug run)
        spark_builder = SparkSession.builder.master(master).appName(app_name)

    # create Spark JAR packages string
    spark_jars_packages = ",".join(list(jar_packages))
    spark_builder.config("spark.jars.packages", spark_jars_packages)

    spark_files = ",".join(list(files))
    spark_builder.config("spark.files", spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files; guard against an
    # empty list (original raised IndexError here)
    if files:
        with open(files[0], "r") as config_file:
            config_dict = json.load(config_file)
    else:
        config_dict = None

    return spark_sess, spark_logger, config_dict
def start_spark(app_name='sf_tree_distribution_job', master='local[*]', files=None,
                spark_config=None):
    """Start Spark session, get Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # avoid mutable default arguments (shared across calls)
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # detect execution environment: interactive console or DEBUG env var
    flag_repl = not (hasattr(__main__, '__file__'))
    flag_debug = 'DEBUG' in environ.keys()

    if not (flag_repl or flag_debug):
        # get Spark session factory (cluster defaults from spark-submit)
        spark_builder = (SparkSession.builder.appName(app_name))
    else:
        # get Spark session factory (local/debug run)
        spark_builder = (SparkSession.builder.master(master).appName(app_name))

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('config.json')
    ]

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        spark_logger.warn('no config file found')
        config_dict = None

    return spark_sess, spark_logger, config_dict
def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=None,
                files=None, spark_config=None):
    """Start Spark session, get Spark logger and load config files.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: Currently unused; reserved for Spark JAR packages.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # detect execution environment: interactive console or DEBUG env var
    flag_repl = not (hasattr(__main__, '__file__'))
    flag_debug = 'DEBUG' in environ.keys()

    if not (flag_repl or flag_debug):
        spark_builder = (
            SparkSession
            .builder
            .appName(app_name))
    else:
        spark_builder = (
            SparkSession
            .builder
            .master(master)
            .appName(app_name))

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files; the warns below
    # are deliberate debug tracing of the SparkFiles directory contents
    spark_files_dir = SparkFiles.getRootDirectory()
    spark_logger.warn('spark_files_dir' + str(spark_files_dir))
    for filename in listdir(spark_files_dir):
        spark_logger.warn('filename' + str(filename))
    config_f = SparkFiles.get('configs/etl_config.json')
    spark_logger.warn('config_f' + str(config_f))

    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]
    spark_logger.warn('config_files' + str(config_files))

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        spark_logger.warn('no config file found')
        config_dict = None

    return spark_sess, spark_logger, config_dict
def start_spark():
    """Build the Spark session described by ``Config`` plus a Log4j logger.

    :return: Tuple of (Spark session, logger).
    """
    builder = SparkSession.builder
    builder = builder.master(Config.master).appName(Config.app_name)
    # driver/executor memory come from the application-level Config object
    builder = builder.config("spark.driver.memory", Config.driver_memory)
    builder = builder.config("spark.executor.memory", Config.executor_memory)

    session = builder.getOrCreate()
    return session, logging.Log4j(session)
def start_spark(app_name='sars_cov2_analysis', master='local[*]', jar_packages=None,
                files=None, spark_config=None):
    """Start Spark session, SQLContext, logger and load the job config.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: Tuple of (Spark session, SQLContext, logger, config dict or None).
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    flag_repl = not (hasattr(__main__, '__file__'))
    # set debug manually if needed
    # (alternatively use a .env with 'DEBUG' in environ.keys())
    flag_debug = False

    if not (flag_repl or flag_debug):
        spark_builder = (
            SparkSession
            .builder
            .appName(app_name))
    else:
        spark_builder = (
            SparkSession
            .builder
            .master(master)
            .appName(app_name))

    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # NOTE: set last, so this fixed value overrides any
    # 'spark.executor.memory' passed in via spark_config
    spark_builder.config("spark.executor.memory", "2g")

    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('Config: ' + config_files[0])
    else:
        spark_logger.warn('Warning: No config found.')
        config_dict = None

    sql_context = SQLContext(spark_sess.sparkContext)
    return spark_sess, sql_context, spark_logger, config_dict
def create_spark_session(app_name='spark-application', master='local[*]', files=None,
                         spark_config=None, jar_packages=None):
    """Create a Hive-enabled Spark session, logger and load the job config.

    :param app_name: Name of the current application.
    :param master: Master node's connection details.
    :param files: Paths of files to be placed on each executor.
    :param spark_config: Dictionary of key-value pair config variables
        for the Spark session.
    :param jar_packages: List of Spark JAR package names.
    :return: Tuple of (Spark session, logger, config dict or None).
    """
    # avoid mutable default arguments (shared across calls)
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config
    jar_packages = [] if jar_packages is None else jar_packages

    spark_builder = (SparkSession
                     .builder
                     .master(master)
                     .enableHiveSupport()
                     .appName(app_name))

    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        spark_logger.warn('no config file found')
        config_dict = None

    return spark, spark_logger, config_dict
def start_spark(app_name='my_spark_app', master='local[*]', spark_config=None):
    """Start a Spark session, ship the ETL config file and parse it.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param spark_config: Dictionary of config key-value pairs.
    :return: Tuple of (Spark session, logger, config dict).
    """
    # avoid a mutable default argument (shared across calls)
    spark_config = {} if spark_config is None else spark_config

    spark_builder = (SparkSession.builder.master(master).appName(app_name))
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # ship the config file to every node, then read it back via SparkFiles
    spark_sess.sparkContext.addFile('configs/etl_config.json')
    with open(SparkFiles.get('etl_config.json'), 'r') as cf:
        config_dict = json.load(cf)

    return spark_sess, spark_logger, config_dict
def main():
    """Run the ETL pipeline: create test data, extract, transform, load.

    :return: None
    """
    spark = SparkSession.builder.getOrCreate()
    spark_logger = logging.Log4j(spark)

    # seed the environment with fixture data before running the job
    create_test_data(spark, None)

    spark_logger.warn('ETL_TEMPLATE - Extracting data ...')
    raw = extract_data(spark)

    spark_logger.warn('ETL_TEMPLATE - Transforming data ...')
    transformed = transform_data(raw)

    spark_logger.warn('ETL_TEMPLATE - Loading data ...')
    load_data(transformed)

    spark.stop()
    return None
def start_spark(self, app_name='my_spark_app', master='local[*]', jar_packages=None,
                files=None, spark_config=None):
    """Start a Spark session and stash session, logger and config on self.

    Side effects: sets ``self.spark``, ``self.log`` and ``self.config_data``.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: True on completion.
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # get Spark session factory
    # (fixed: original had 'SparkSession.buider' — AttributeError at runtime)
    spark_builder = (SparkSession.builder.master(master).appName(app_name))

    # create Spark JAR packages string
    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other configs params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get configs file if sent to cluster with --files
    # (fixed: original used dir(spark_files_dir), which lists the string's
    # attributes, not the directory's files — use listdir instead)
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('json')
    ]

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded configs from ' + config_files[0])
    else:
        spark_logger.warn('no configs file found')
        config_dict = None

    self.spark = spark_sess
    self.log = spark_logger
    self.config_data = config_dict
    return True
def start_spark(app_name='my_spark_etl', master='local[*]', jar_packages=None,
                files=None, spark_config=None):
    """Start Spark session, get Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. Note, that only the app_name argument
    will apply when this is called from a script sent to spark-submit.
    All other arguments exist solely for testing the script from within
    an interactive Python console.

    This function also looks for a file ending in 'config.json' that
    can be sent with the Spark job. If it is found, it is opened, the
    contents parsed (assuming it contains valid JSON for the ETL job
    configuration), into a dict of ETL job configuration parameters,
    which are returned as the last element in the tuple returned by
    this function. If the file cannot be found then a fallback local
    config path is tried before giving up and returning None for config.

    The function checks the enclosing environment to see if it is being
    run from inside an interactive console session or from an
    environment which has a `DEBUG` environment variable set (e.g.
    setting `DEBUG=1` as an environment variable as part of a debug
    configuration within an IDE such as Visual Studio Code or PyCharm).
    In this scenario, the function uses all available function arguments
    to start a PySpark driver from the local PySpark package as opposed
    to using the spark-submit and Spark cluster defaults. This will also
    use local module imports, as opposed to those in the zip archive
    sent to spark via the --py-files flag in spark-submit.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of (Spark session, logger, config dict or None,
        environment string).
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # detect execution environment
    flag_repl = not hasattr(__main__, '__file__')
    flag_debug = 'DEBUG' in environ.keys()

    warehouse_location = abspath('spark-warehouse')

    if not (flag_repl or flag_debug):
        # get Spark session factory
        spark_builder = (SparkSession.builder.appName(app_name))
        # Flag required for suppressing Hive table creation
        # (create_database_table()) and Impala REFRESH
        environment = 'local'
    else:
        # get Spark session factory
        spark_builder = (
            SparkSession.builder.master(master).appName(app_name).config(
                "spark.sql.warehouse.dir",
                warehouse_location).enableHiveSupport())
        # fixed: original left 'environment' unassigned on this branch,
        # raising NameError at the return statement below
        environment = 'debug'

    # create Spark JAR packages string
    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('config.json')
    ]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        # fall back to the local repo config when not running via --files
        try:
            with open('configs/etl_config.json', 'r') as config_file:
                config_json = config_file.read().replace('\n', '')
            config_dict = loads(config_json)
        except FileNotFoundError:
            config_dict = None

    return spark_sess, spark_logger, config_dict, environment
def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=None,
                files=None, spark_config=None):
    """Start Spark session, get the Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. NOTE - only the app_name argument
    will apply when this is called from a script sent to spark-submit
    (i.e. when __name__ = '__main__'). All other arguments exist solely
    for testing the script from within an interactive Python console.

    This function also looks for a file ending in 'config.json' that
    can be sent with the Spark job. If it is found, it is opened, the
    contents parsed (assuming it contains valid JSON for the ETL job
    configuration), into a dict of ETL job configuration parameters,
    which are returned as the last element in the tuple returned by
    this function. If the file cannot be found then the return tuple
    only contains the Spark session and Spark logger objects.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # NOTE(review): inside a function __name__ is the *module's* name, so
    # this branch is only taken when this module itself is executed as a
    # script — confirm this matches how the job is launched.
    if __name__ == '__main__':
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .appName(app_name))
    else:
        # get Spark session factory
        spark_builder = (
            SparkSession
            .builder
            .master(master)
            .appName(app_name))

    # create Spark JAR packages string
    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        config_dict = None

    # build return tuple conditional on presence of config
    if config_dict is not None:
        return_tup = spark_sess, spark_logger, config_dict
    else:
        return_tup = spark_sess, spark_logger
    return return_tup
def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=None,
                files=None, spark_config=None, dependencies=None):
    """Start Spark session, get Spark logger and load config files.

    Start a Spark session on the worker node and register the Spark
    application with the cluster. Note, that only the app_name argument
    will apply when this is called from a script sent to spark-submit.
    All other arguments exist solely for testing the script from within
    an interactive Python console.

    This function also looks for files ending in 'config.json' that can
    be sent with the Spark job. Every one found is opened and its JSON
    contents merged into a dict of ETL job configuration parameters,
    which are returned as the last element in the tuple returned by
    this function. If no file is found then the config element is None.

    The function checks the enclosing environment to see if the `DEBUG`
    environment variable is set (e.g. setting `DEBUG=1` as part of a
    debug configuration within an IDE such as Visual Studio Code or
    PyCharm). In this scenario, the function uses all available function
    arguments to start a PySpark driver from the local PySpark package
    as opposed to using the spark-submit and Spark cluster defaults.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :param dependencies: Optional path to a packages zip with all
        dependencies, shipped via spark.submit.pyFiles.
    :return: A tuple of references to the Spark session, logger and
        config dict (only if available).
    """
    # avoid mutable default arguments (shared across calls); copy `files`
    # so the insert/append below cannot mutate the caller's list or leak
    # entries between calls (the original mutated the shared default)
    jar_packages = [] if jar_packages is None else jar_packages
    files = list(files) if files is not None else []
    spark_config = {} if spark_config is None else spark_config

    # prepend default config file
    files.insert(0, "configs/default_config.json")

    # append packages zip file with all dependencies
    if dependencies:
        files.append(dependencies)

    # detect execution environment
    flag_debug = 'DEBUG' in environ.keys()

    config_dict = {}

    if flag_debug or master is None:
        # get Spark session factory
        spark_builder = (
            SparkSession.builder.enableHiveSupport().appName(app_name))
    else:
        # get Spark session factory
        spark_builder = (SparkSession.builder.master(
            "spark://{}".format(master)).enableHiveSupport().appName(app_name))

    # create Spark JAR packages string
    spark_builder.config('spark.jars.packages', ','.join(list(jar_packages)))

    # add packages.zip as a dependency (only when one was supplied;
    # the original passed None through to the config)
    if dependencies:
        spark_builder.config('spark.submit.pyFiles', dependencies)

    spark_builder.config('spark.files', ','.join(files))

    # add other config params BEFORE getOrCreate — the original applied
    # them after session creation, where they had no effect
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config files if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('config.json')
    ]

    if config_files:
        # merge every config file found, later files overriding earlier keys
        for config_name in config_files:
            path_to_config_file = path.join(spark_files_dir, config_name)
            with open(path_to_config_file, 'r') as config_file:
                config_dict.update(json.load(config_file))
            spark_logger.info('loaded config from ' + config_name)
    else:
        # use warn for consistency with the Log4j wrapper's API
        spark_logger.warn('no config file found')
        config_dict = None

    return spark_sess, spark_logger, config_dict
def start_spark(app_name='hello_fresh_etl_job', master='local[*]', jar_packages=None,
                files=None, spark_config=None):
    """Start Spark session, get Spark logger and load config files.

    :param app_name: Name of Spark app.
    :param master: Cluster connection details (defaults to local[*]).
    :param jar_packages: List of Spark JAR package names.
    :param files: List of files to send to Spark cluster (master and workers).
    :param spark_config: Dictionary of config key-value pairs.
    :return: A tuple of (Spark session, logger, config dict or None,
        environment string).
    """
    # avoid mutable default arguments (shared across calls)
    jar_packages = [] if jar_packages is None else jar_packages
    files = [] if files is None else files
    spark_config = {} if spark_config is None else spark_config

    # detect execution environment
    flag_repl = not hasattr(__main__, '__file__')
    flag_debug = 'DEBUG' in environ.keys()

    if not (flag_repl or flag_debug):
        # get Spark session factory
        spark_builder = (SparkSession.builder.appName(app_name))
        environment = 'local'
    else:
        # get Spark session factory
        spark_builder = (SparkSession.builder.master(master).appName(app_name))
        # fixed: original left 'environment' unassigned on this branch,
        # raising NameError at the return statement below
        environment = 'debug'

    # create Spark JAR packages string
    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [
        filename
        for filename in listdir(spark_files_dir)
        if filename.endswith('config.json')
    ]

    if len(config_files) != 0:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_json = config_file.read().replace('\n', '')
        config_dict = loads(config_json)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        # fall back to the local repo config when not running via --files
        try:
            with open('configs/etl_config.json', 'r') as config_file:
                config_json = config_file.read().replace('\n', '')
            config_dict = loads(config_json)
        except FileNotFoundError:
            config_dict = None

    return spark_sess, spark_logger, config_dict, environment