def _stop(self, spark_session=None):
    self.stopped = True
    try:
        sc_from_session = spark_session.sparkContext if spark_session else None
        sc_from_engine = self.context.sparkContext if self.context else None
        sc_from_module = pyspark.SparkContext._active_spark_context or None

        scs = [sc_from_session, sc_from_engine, sc_from_module]

        if self.context:
            self.context.stop()

        if spark_session:
            spark_session.stop()

        cls = pyspark.SparkContext
        for sc in scs:
            if sc:
                try:
                    sc.stop()
                    sc._gateway.shutdown()
                except Exception:
                    pass

        cls._active_spark_context = None
        cls._gateway = None
        cls._jvm = None
    except Exception as e:
        print(e)
        logging.warning(f'Could not fully stop the {self.engine_type} context')

def list(self, provider):
    if isinstance(provider, YamlDict):
        md = provider.to_dict()
    elif isinstance(provider, str):
        md = get_metadata(self._rootdir, self._metadata, None, provider)
    elif isinstance(provider, dict):
        md = provider
    else:
        logging.warning(f'{str(provider)} cannot be used to reference a provider')
        return []

    try:
        if md['service'] in ['local', 'file']:
            d = []
            for f in os.listdir(md['provider_path']):
                d.append(os.path.join(md['provider_path'], f))
            return d
        elif md['service'] == 'hdfs':
            sc = self._ctx._sc
            URI = sc._gateway.jvm.java.net.URI
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
            fs = FileSystem.get(URI(md['url']), sc._jsc.hadoopConfiguration())
            obj = fs.listStatus(Path(md['url']))
            tables = [obj[i].getPath().getName() for i in range(len(obj))]
            return tables
        elif md['format'] == 'jdbc':
            if md['service'] == 'mssql':
                query = "(SELECT table_name FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE') as query"
            elif md['service'] == 'oracle':
                # note: 'schema_name' is a hardcoded placeholder for the table owner
                query = "(SELECT table_name FROM all_tables WHERE owner='schema_name') as query"
            elif md['service'] == 'mysql':
                query = f"(SELECT table_name FROM information_schema.tables WHERE table_schema='{md['database']}') as query"
            elif md['service'] == 'pgsql':
                query = "(SELECT table_name FROM information_schema.tables) as query"
            else:
                # vanilla query ... for other databases
                query = "(SELECT table_name FROM information_schema.tables) as query"

            # load the table list from jdbc
            obj = self._ctx.read \
                .format('jdbc') \
                .option('url', md['url']) \
                .option('dbtable', query) \
                .option('driver', md['driver']) \
                .option('user', md['username']) \
                .option('password', md['password']) \
                .load()

            return [x.TABLE_NAME for x in obj.select('TABLE_NAME').collect()]
        else:
            logging.error({'md': md, 'error_msg': f'List resource on service "{md["service"]}" not implemented'})
            return []
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

def get_detected_submit_lists(self, detect=True):
    submit_types = ['jars', 'packages', 'py-files']

    submit_objs = dict()
    for submit_type in submit_types:
        submit_objs[submit_type] = []

    if not detect:
        return submit_objs

    # get hadoop version, and the services configured in the metadata
    hadoop_version = self._info['hadoop_version']
    providers = self._metadata['providers']
    services = {v['service'] for v in providers.values()}
    services = sorted(list(services))

    #### submit: jars
    jars = submit_objs['jars']

    if 'oracle' in services:
        jar = 'http://www.datanucleus.org/downloads/maven2/'
        jar += 'oracle/ojdbc6/11.2.0.3/ojdbc6-11.2.0.3.jar'
        jars.append(jar)

    #### submit: packages
    packages = submit_objs['packages']

    for v in services:
        if v == 'mysql':
            packages.append('mysql:mysql-connector-java:8.0.12')
        elif v == 'sqlite':
            packages.append('org.xerial:sqlite-jdbc:3.25.2')
        elif v == 'postgres':
            packages.append('org.postgresql:postgresql:42.2.5')
        elif v == 'mssql':
            packages.append('com.microsoft.sqlserver:mssql-jdbc:6.4.0.jre8')
        elif v == 'minio':
            if hadoop_version:
                packages.append(f'org.apache.hadoop:hadoop-aws:{hadoop_version}')
            else:
                logging.warning('Hadoop is not detected. '
                                'Could not load the hadoop-aws package.')

    #### submit: py-files
    pyfiles = submit_objs['py-files']

    #### print debug
    for submit_type in submit_types:
        if submit_objs[submit_type]:
            print(f'Loading detected {submit_type}:')
            for i in submit_objs[submit_type]:
                print(f'  - {i}')

    return submit_objs

def set_info(self):
    hadoop_version = None
    hadoop_detect_from = None
    try:
        session = pyspark.sql.SparkSession.builder.getOrCreate()
        hadoop_version = session.sparkContext._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion()
        hadoop_detect_from = 'spark'
        self.stop(session)
    except Exception as e:
        print(e)

    if hadoop_version is None:
        hadoop_version = get_hadoop_version_from_system()
        hadoop_detect_from = 'system'

    if hadoop_version is None:
        logging.warning('Could not find a valid hadoop install.')

    hadoop_home = get_tool_home('hadoop', 'HADOOP_HOME', 'bin')[0]
    spark_home = get_tool_home('spark-submit', 'SPARK_HOME', 'bin')[0]

    spark_dist_classpath = os.environ.get('SPARK_DIST_CLASSPATH')
    spark_dist_classpath_source = 'env'

    if not spark_dist_classpath:
        spark_dist_classpath_source = os.path.join(spark_home, 'conf/spark-env.sh')
        if os.path.isfile(spark_dist_classpath_source):
            with open(spark_dist_classpath_source) as s:
                for line in s:
                    pattern = 'SPARK_DIST_CLASSPATH='
                    pos = line.find(pattern)
                    if pos >= 0:
                        spark_dist_classpath = line[pos + len(pattern):].strip()
                        spark_dist_classpath = run_command(f'echo {spark_dist_classpath}')[0]

    if hadoop_detect_from == 'system' and (not spark_dist_classpath):
        logging.warning(textwrap.dedent("""
            SPARK_DIST_CLASSPATH not defined and spark installed without hadoop.
            Define SPARK_DIST_CLASSPATH in $SPARK_HOME/conf/spark-env.sh as follows:

                export SPARK_DIST_CLASSPATH=$(hadoop classpath)

            For more info refer to:
            https://spark.apache.org/docs/latest/hadoop-provided.html
            """))

    self._info['python_version'] = python_version()
    self._info['hadoop_version'] = hadoop_version
    self._info['hadoop_detect'] = hadoop_detect_from
    self._info['hadoop_home'] = hadoop_home
    self._info['spark_home'] = spark_home
    self._info['spark_classpath'] = spark_dist_classpath.split(':') if spark_dist_classpath else None
    self._info['spark_classpath_source'] = spark_dist_classpath_source

    return

def load_with_pandas(self, kargs):
    logging.warning("Fallback dataframe reader")

    # conversion of *some* pyspark arguments to pandas
    kargs.pop('inferSchema', None)
    kargs['header'] = 'infer' if kargs.get('header') else None
    kargs['prefix'] = '_c'

    return kargs

def save_with_pandas(self, md, kargs):
    if not self.is_spark_local():
        logging.warning("Fallback dataframe writer")

    if os.path.exists(md['url']) and os.path.isdir(md['url']):
        shutil.rmtree(md['url'])

    # conversion of *some* pyspark arguments to pandas
    kargs.pop('mode', None)
    kargs['index'] = False
    kargs['header'] = False if kargs.get('header') is None else kargs.get('header')

    return kargs

def load_csv(self, path=None, provider=None, *args, sep=None, header=None, **kwargs):
    obj = None

    md = Resource(path, provider, sep=sep, header=header, **kwargs)

    options = md['options']

    # after collecting from metadata, or method call, define csv defaults
    options['header'] = options.get('header') or True
    options['inferSchema'] = options.get('inferSchema') or True
    options['sep'] = options.get('sep') or ','

    local = self.is_spark_local()

    try:
        # three approaches: local, cluster, and service
        if md['service'] == 'file' and local:
            obj = self.context.read.options(**options).csv(md['url'])
        elif md['service'] == 'file':
            logging.warning(
                'local file + spark cluster: loading using pandas reader',
                extra={'md': to_dict(md)})
            df = pd.read_csv(md['url'], sep=options['sep'], header=options['header'])
            obj = self.context.createDataFrame(df)
        elif md['service'] in ['hdfs', 's3a']:
            obj = self.context.read.options(**options).csv(md['url'])
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})
        return obj
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error(e, extra={'md': md})

    return obj

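# Usage sketch (illustrative only): how load_csv is typically called.
# The engine instance name `engine` and the provider alias 'localfs' are
# assumptions for this example, not part of this module.
#
#   df = engine.load_csv('data/users.csv',        # path, resolved against the provider
#                        provider='localfs',      # provider alias from metadata
#                        sep=';', header=True)    # overrides the csv defaults above
#   df.printSchema()
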
def set_conf_kv(self, conf):
    # appname
    if self._metadata['engine']['jobname']:
        logging.warning('deprecated: metadata engine/jobname is generated')

    conf.setAppName(self._name)

    # set master
    master_url = self._metadata['engine']['master']
    conf.setMaster(master_url)

    # set kv conf from metadata
    conf_md = self._metadata['engine']['config']
    for k, v in conf_md.items():
        if isinstance(v, (bool, int, float, str)):
            conf.set(k, v)

def load_parquet(self, path=None, provider=None, *args, mergeSchema=None, **kwargs):
    obj = None

    md = Resource(path, provider, format='parquet', mergeSchema=mergeSchema, **kwargs)

    options = md['options']

    # after collecting from metadata, or method call, define parquet defaults
    options['mergeSchema'] = options.get('mergeSchema') or True

    local = self.is_spark_local()

    try:
        # three approaches: local, cluster, and service
        if md['service'] == 'file' and local:
            obj = self.context.read.options(**options).parquet(md['url'])
        elif md['service'] == 'file':
            logging.warning(
                'local file + spark cluster: loading using pandas reader',
                extra={'md': to_dict(md)})
            # fallback to the pandas reader, then convert to spark
            df = pd.read_parquet(md['url'])
            obj = self.context.createDataFrame(df)
        elif md['service'] in ['hdfs', 's3a']:
            obj = self.context.read.options(**options).parquet(md['url'])
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})
        return obj
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error(e, extra={'md': md})

    return obj

def directory_to_file(self, path, ext):
    if os.path.exists(path) and os.path.isfile(path):
        return

    dirname = os.path.dirname(path)
    basename = os.path.basename(path)

    filename = list(filter(lambda x: x.endswith(ext), os.listdir(path)))
    if len(filename) != 1:
        logging.warning('cannot convert if more than a partition present')
        return
    else:
        filename = filename[0]

    shutil.move(os.path.join(path, filename), dirname)
    if os.path.exists(path) and os.path.isdir(path):
        shutil.rmtree(path)

    shutil.move(os.path.join(dirname, filename), os.path.join(dirname, basename))
    return

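# Usage sketch (illustrative only): collapsing a single-partition Spark output
# directory into a plain file. The dataframe `df`, the `engine` instance, and
# the path 'out/report.csv' are assumptions for this example.
#
#   df.coalesce(1).write.csv('out/report.csv')    # spark writes a directory of part files
#   engine.directory_to_file('out/report.csv', 'csv')
#   # 'out/report.csv' is now a single csv file instead of a directory
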
def stop(self, spark_session=None):
    try:
        spark_session = spark_session or self._ctx
        sc = None
        if spark_session:
            sc = spark_session.sparkContext
            spark_session.stop()

        cls = pyspark.SparkContext
        sc = sc or cls._active_spark_context

        if sc:
            sc.stop()
            sc._gateway.shutdown()

        cls._active_spark_context = None
        cls._gateway = None
        cls._jvm = None
    except Exception as e:
        print(e)
        logging.warning('Could not fully stop the engine context')

def list(self, provider, path=''):
    df_schema = T.StructType([
        T.StructField('name', T.StringType(), True),
        T.StructField('type', T.StringType(), True)])

    df_empty = self._ctx.createDataFrame(data=(), schema=df_schema)

    if isinstance(provider, YamlDict):
        md = provider.to_dict()
    elif isinstance(provider, str):
        md = resource.metadata(self._rootdir, self._metadata, None, provider)
    elif isinstance(provider, dict):
        md = provider
    else:
        logging.warning(f'{str(provider)} cannot be used to reference a provider')
        return df_empty

    try:
        if md['service'] in ['local', 'file']:
            lst = []
            rootpath = os.path.join(md['provider_path'], path)
            for f in os.listdir(rootpath):
                fullpath = os.path.join(rootpath, f)
                if os.path.isfile(fullpath):
                    obj_type = 'FILE'
                elif os.path.isdir(fullpath):
                    obj_type = 'DIRECTORY'
                elif os.path.islink(fullpath):
                    obj_type = 'LINK'
                elif os.path.ismount(fullpath):
                    obj_type = 'MOUNT'
                else:
                    obj_type = 'UNDEFINED'

                obj_name = f
                lst += [(obj_name, obj_type)]

            return self._ctx.createDataFrame(lst, ['name', 'type']) if lst else df_empty

        elif md['service'] in ['hdfs', 'minio']:
            sc = self._ctx._sc
            URI = sc._gateway.jvm.java.net.URI
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
            fs = FileSystem.get(URI(md['url']), sc._jsc.hadoopConfiguration())

            provider_path = md['provider_path'] if md['service'] == 'hdfs' else '/'
            obj = fs.listStatus(Path(os.path.join(provider_path, path)))

            lst = []
            for i in range(len(obj)):
                if obj[i].isFile():
                    obj_type = 'FILE'
                elif obj[i].isDirectory():
                    obj_type = 'DIRECTORY'
                else:
                    obj_type = 'UNDEFINED'

                obj_name = obj[i].getPath().getName()
                lst += [(obj_name, obj_type)]

            return self._ctx.createDataFrame(lst, ['name', 'type']) if lst else df_empty

        elif md['format'] == 'jdbc':
            # remove options from database, if any
            database = md['database'].split('?')[0]
            schema = md['schema']
            if md['service'] == 'mssql':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM INFORMATION_SCHEMA.TABLES
                          WHERE table_schema='{schema}'
                        ) as query
                        """
            elif md['service'] == 'oracle':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM all_tables
                          WHERE table_schema='{schema}'
                        ) as query
                        """
            elif md['service'] == 'mysql':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                          WHERE table_schema='{schema}'
                        ) as query
                        """
            elif md['service'] == 'postgres':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                          WHERE table_schema = '{schema}'
                        ) as query
                        """
            else:
                # vanilla query ... for other databases
                query = """
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                        ) as query
                        """

            # load the table list from jdbc
            obj = self._ctx.read \
                .format('jdbc') \
                .option('url', md['url']) \
                .option('dbtable', query) \
                .option('driver', md['driver']) \
                .option('user', md['username']) \
                .option('password', md['password']) \
                .load()

            lst = [(x.TABLE_NAME, x.TABLE_TYPE)
                   for x in obj.select('TABLE_NAME', 'TABLE_TYPE').collect()]
            return self._ctx.createDataFrame(lst, ['name', 'type']) if lst else df_empty
        else:
            logging.error({'md': md, 'error_msg': f'List resource on service "{md["service"]}" not implemented'})
            return df_empty
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

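# Usage sketch (illustrative only): listing resources on a provider.
# The `engine` instance and the provider alias 'warehouse' are assumptions.
#
#   engine.list('warehouse', path='staging').show()
#   # returns a two-column dataframe: name (resource name) and type
#   # (FILE, DIRECTORY, LINK, MOUNT, UNDEFINED, or the jdbc table_type)
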
def detect_submit_params(self, services=None):
    assert isinstance(services, (type(None), str, list, set))

    services = [services] if isinstance(services, str) else services
    services = services or []

    # if a service is a string, make a resource out of it
    resources = [s if isinstance(s, dict) else Resource(service=s) for s in services]
    services = set([r['service'] for r in resources])

    submit_types = ['jars', 'packages', 'repositories', 'py-files', 'files', 'conf']

    submit_objs = dict()
    for submit_type in submit_types:
        submit_objs[submit_type] = []

    if not services:
        return submit_objs

    services = sorted(list(services))

    # get hadoop version, and configured metadata services
    hadoop_version = self.info['hadoop_version']

    #### submit: jars
    jars = submit_objs['jars']

    if 'oracle' in services:
        jar = 'http://www.datanucleus.org/downloads/maven2/'
        jar += 'oracle/ojdbc6/11.2.0.3/ojdbc6-11.2.0.3.jar'
        jars.append(jar)

    #### submit: packages
    packages = submit_objs['packages']

    for v in services:
        if v == 'mysql':
            packages.append('mysql:mysql-connector-java:8.0.12')
        elif v == 'sqlite':
            packages.append('org.xerial:sqlite-jdbc:3.25.2')
        elif v == 'postgres':
            packages.append('org.postgresql:postgresql:42.2.5')
        elif v == 'mssql':
            packages.append('com.microsoft.sqlserver:mssql-jdbc:6.4.0.jre8')
        elif v == 's3a':
            if hadoop_version:
                packages.append(f'org.apache.hadoop:hadoop-aws:{hadoop_version}')
            else:
                logging.warning('The Hadoop installation is not detected. '
                                'Could not load the hadoop-aws (s3a) package.')

    #### submit: conf
    conf = submit_objs['conf']

    for v in resources:
        if v['service'] == 's3a':
            service_url = 'http://{}:{}'.format(v['host'], v['port'])
            s3a = 'org.apache.hadoop.fs.s3a.S3AFileSystem'

            conf.append(('spark.hadoop.fs.s3a.endpoint', service_url))
            conf.append(('spark.hadoop.fs.s3a.access.key', v['user']))
            conf.append(('spark.hadoop.fs.s3a.secret.key', v['password']))
            conf.append(('spark.hadoop.fs.s3a.impl', s3a))
            conf.append(('spark.hadoop.fs.s3a.path.style.access', 'true'))
            break

    return submit_objs

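# Usage sketch (illustrative only): the shape of the returned submit lists for
# an s3a resource. The `engine` instance and the resolved host/port/credential
# values are assumptions for this example.
#
#   params = engine.detect_submit_params(['s3a'])
#   # params['packages'] -> ['org.apache.hadoop:hadoop-aws:<hadoop_version>']
#   # params['conf']     -> [('spark.hadoop.fs.s3a.endpoint', 'http://<host>:<port>'),
#   #                        ('spark.hadoop.fs.s3a.access.key', '<user>'), ...]
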