def ipy_imports(launch_spark=False):
  """Import common xutil helpers into the interactive namespace; optionally launch Spark."""
  import argparse, sys  # needed for the --master/--profile CLI handling below
  from xutil.helpers import (log, get_exception_message, get_error_str, get_profile)
  from xutil.database.base import get_conn
  from xutil.diskio import (write_csv, read_csv, read_file, write_file, get_hdfs)
  from jmespath import search
  from pathlib import Path
  from collections import Counter, namedtuple, OrderedDict
  import time, datetime

  if launch_spark:
    from xutil.database.spark import Spark

    parser = argparse.ArgumentParser(description='Spark IPython')
    parser.add_argument('--master', help='Master string for Spark Instance')
    parser.add_argument('--profile', help='Database profile name from PROFILE_YAML')
    args = parser.parse_args()

    dbs = get_profile(create_if_missing=True)['databases']
    if args.profile and args.profile in dbs and dbs[
        args.profile]['type'].lower() in ('hive', 'spark'):
      conn = get_conn(args.profile)
      globals()['sparko'] = conn.sparko
    elif args.profile:
      log(
        Exception('Profile {} not found or incompatible.'.format(args.profile)))
      sys.exit(1)
    else:
      globals()['sparko'] = Spark(master=args.master)

    globals()['sc'] = sparko.sc
    globals()['spark'] = sparko.spark

  # expose imported functions, classes and modules to the interactive session
  ldict = locals()
  for name in ldict:
    var = ldict[name]
    if callable(var) or isinstance(var, __builtins__.__class__):
      # is a function or class or module
      globals()[name] = var
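# Usage sketch (an assumption, not part of the original module): in an IPython
# startup script, ipy_imports() pulls the xutil helpers into the interactive
# namespace; with launch_spark=True it also parses --master/--profile and
# exposes `sparko`, `sc` and `spark` as globals.
if __name__ == '__main__':
  ipy_imports()                     # helpers only
  # ipy_imports(launch_spark=True)  # additionally attach to / launch Spark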
def create_profile():
  "Create profile.yaml if it does not exist"
  from xutil.helpers import get_profile
  get_profile(create_if_missing=True)
  log('+YAML Profile located @ {}'.format(os.environ['PROFILE_YAML']))
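# Usage sketch (assumption): PROFILE_YAML appears to point get_profile() at the
# profile file, so setting it before calling create_profile() would create the
# file at that location if it is missing. The path below is hypothetical.
import os
os.environ['PROFILE_YAML'] = '/tmp/profile.yaml'
create_profile()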
def __init__(self,
             app_name=None,
             master=None,
             conf={},
             spark_home=None,
             restart=False,
             hive_enabled=False,
             config_name=socket.gethostname().lower(),
             prog_handler=None,
             log=log):
  # restart = True if version != self.version else restart
  if os.getenv('KLOG'):
    os.system('bash $KLOG')  # kerb login

  spark_home = self.set_sparkenv(spark_home)

  from pyspark import SparkContext, SQLContext, SparkConf
  from pyspark.sql import SparkSession

  active_sc = SparkContext._active_spark_context
  if active_sc:
    log("Active SC ->> " + active_sc.appName)
    sc = active_sc
    spark = SparkSession(sc)
  else:
    sc = None
    spark = None

  if sc and restart:
    log('~Stopping Spark Instance ({})'.format(sc.appName))
    try:
      ps_data = {p.pid: p for p in psutil.process_iter() if p.cmdline()}
      child_pid = ps_data[os.getpid()].children()[0].pid
      if not hive_enabled:
        os.system('kill -9 ' + str(child_pid))
        SparkContext._gateway = None
    except:
      print(get_exception_message())
    sc.stop()
    sc = None
    # sc = sc.getOrCreate()

  profile = get_profile()
  if profile:
    conf_def = profile['spark-conf']
    if 'spark-conf-name' in profile:
      if config_name in profile['spark-conf-name']:
        # overwrite the default spark-conf
        for key in profile['spark-conf-name'][config_name]:
          conf_def[key] = profile['spark-conf-name'][config_name][key]
  else:
    conf_def = {
      "spark.master": "local[4]",
      "spark.driver.memory": "5g",
      "spark.driver.maxResultSize": "2g",
      "spark.driver.cores": "1",
      "spark.executor.instances": "4",
      "spark.executor.cores": "4",
      "spark.sql.broadcastTimeout": 900,
      # "spark.sql.tungsten.enabled": "true",
      "spark.io.compression.codec": "snappy",
      "spark.rdd.compress": "true",
      "spark.streaming.backpressure.enabled": "true",
      "spark.sql.parquet.compression.codec": "snappy",
    }

  # set extraClassPath
  conf_def["spark.driver.extraClassPath"] = self._get_jar_paths(profile)
  if 'SPARK_CLASSPATH' in os.environ and os.environ['SPARK_CLASSPATH']:
    conf_def["spark.driver.extraClassPath"] = conf_def[
      "spark.driver.extraClassPath"] + ':' + os.environ['SPARK_CLASSPATH']
    del os.environ['SPARK_CLASSPATH']

  if master:
    conf['spark.master'] = master

  if hive_enabled:
    conf["spark.sql.catalogImplementation"] = "hive"

  for c in conf_def:
    conf[c] = conf_def[c] if c not in conf else conf[c]

  # Launch Spark Instance
  version = self.get_spark_version(spark_home)
  app_name = app_name if app_name else 'Spark_{}_{}_{}'.format(
    str(version).replace('.', ''), os.getenv('USER'), os.getpid())

  if not sc:
    log('Starting Spark Instance ({}) with version {} / {}'.format(
      app_name, version, conf['spark.master']))
    sc, spark, proc = self.init_spark(app_name, spark_home, hive_enabled,
                                      conf, restart, prog_handler)
    self.proc = proc

  self.hive_enabled = hive_enabled
  self.version = version
  self.sc = sc
  self.uiWebUrl = sc.uiWebUrl
  self.local_uiWebUrl = 'http://{}:{}'.format(socket.gethostname(),
                                              sc.uiWebUrl.split(':')[-1])
  self.spark = spark
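# Usage sketch (an assumption based on the constructor signature above, not
# code from the original source): build a Spark wrapper with an explicit
# master and a conf override, then use the exposed SparkSession. The app name
# and conf values are hypothetical.
sparko = Spark(
  app_name='xutil_example',
  master='local[2]',                    # ends up in conf['spark.master']
  conf={'spark.driver.memory': '2g'},   # merged with the profile/default conf
  hive_enabled=False)
df = sparko.spark.range(10)             # SparkSession is exposed as .spark
print(df.count(), sparko.uiWebUrl)      # Spark UI URL is also exposed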
def get_conn(db,
             dbs=None,
             echo=True,
             reconnect=False,
             use_jdbc=False,
             conn_expire_min=10,
             spark_hive=False) -> DBConn:
  global conns

  dbs = dbs if dbs else get_databases()
  profile = get_profile()
  db_dict = struct(dbs[db])

  if db_dict.type.lower() == 'hive' and spark_hive:
    db_dict.type = 'spark'

  use_jdbc = True if (
    use_jdbc or ('use_jdbc' in db_dict and db_dict['use_jdbc'])) else use_jdbc

  # return cached connection if it has not expired
  if db in conns and not reconnect:
    if (now() - conns[db].last_connect).total_seconds() / 60 < conn_expire_min:
      return conns[db]

  if use_jdbc:
    log('*USING JDBC for ' + db)
    from .jdbc import JdbcConn
    conn = JdbcConn(db_dict, profile=profile)
  elif db_dict.type.lower() == 'oracle':
    from .oracle import OracleConn
    conn = OracleConn(db_dict, echo=echo)
  elif db_dict.type.lower() == 'spark':
    from .spark import SparkConn
    conn = SparkConn(db_dict, echo=echo)
  elif db_dict.type.lower() == 'hive':
    from .hive import HiveConn, Beeline
    if 'use_beeline' in db_dict and db_dict.use_beeline:
      conn = Beeline(db_dict, echo=echo)
    else:
      conn = HiveConn(db_dict, echo=echo)
  elif db_dict.type.lower() in ('postgresql', 'redshift'):
    from .postgresql import PostgreSQLConn
    conn = PostgreSQLConn(db_dict, echo=echo)
  elif db_dict.type.lower() == 'sqlserver':
    from .sqlserver import SQLServerConn
    conn = SQLServerConn(db_dict, echo=echo)
  elif db_dict.type.lower() == 'sqlite':
    from .sqlite import SQLiteConn
    conn = SQLiteConn(db_dict, echo=echo)
  else:
    raise Exception(f'Type {db_dict.type} not handled!')

  conns[db] = conn
  return conn
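# Usage sketch (assumption): get_conn() looks the name up in the profile's
# `databases` section and caches the connection in `conns` for
# conn_expire_min minutes. 'PG_DB' is a hypothetical profile entry.
conn = get_conn('PG_DB')                   # returns cached conn if still fresh
conn = get_conn('PG_DB', reconnect=True)   # force a new connection
conn = get_conn('PG_DB', use_jdbc=True)    # route through the JDBC wrapper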
WORKER_PREFIX = os.getenv('DBNET_WORKER_PREFIX', default='dbnet')
WEBAPP_HOST = os.getenv('DBNET_WEBAPP_HOST', default='0.0.0.0')
WEBAPP_PORT = int(os.getenv('DBNET_WEBAPP_PORT', default=5566))
DBNET_FOLDER = os.getenv('DBNET_FOLDER', default=get_home_path() + '/dbnet')
MAX_WORKER_PER_DB = int(os.getenv('DBNET_MAX_WORKER_PER_DB', default=3))
DBNET_DB_URL = os.getenv('DBNET_DB_URL')
os.makedirs(DBNET_FOLDER, exist_ok=True)

hostname = socket.gethostname()
workers = OrderedDict()
db_workers_map = OrderedDict()
conf_queue = Queue()
exit_queue = Queue()

profile = get_profile(
  create_if_missing=True, def_profl_path=f'{DBNET_FOLDER}/profile.yaml')
databases = get_databases(profile)
print(f'profile `{os.getenv("PROFILE_YAML")}` databases -> {list(databases)}')


def start_worker_webapp():
  """Starts the WebApp worker"""
  worker_name = '{}-webapp'.format(WORKER_PREFIX)
  worker = Worker(
    worker_name,
    'web-app',
    fn=webapp_worker.run,
    log=log,
    kill_if_running=True,
    args=(WEBAPP_HOST, WEBAPP_PORT),
    kwargs={'mon_worker': workers['mon']},