def get_SparkContext(app_name='tuixing-spark', **kwargs):
    """Build a SparkContext named ``app_name``.

    The shared ``COMMON_SC`` settings are applied first; every keyword
    argument is then set as an individual Spark property, so keyword
    arguments take precedence over the shared settings.
    """
    spark_conf = SparkConf()
    spark_conf.setAppName(app_name)
    spark_conf.setAll(COMMON_SC)
    for prop, value in kwargs.items():
        spark_conf.set(prop, value)
    return SparkContext(conf=spark_conf)
def create_streaming_context():
    """Create a checkpointed StreamingContext on a local 4-thread master.

    The batch interval comes from the module-level ``batch_secs`` and the
    checkpoint location from the module-level ``checkpointDirectory``.
    """
    settings = [
        ('spark.app.name', 'Process Stories Stream'),
        ('spark.master', 'local[4]'),
        ('spark.ui.port', '4040'),
    ]
    spark_conf = SparkConf()
    spark_conf.setAll(settings)

    streaming = StreamingContext(SparkContext(conf=spark_conf), batch_secs)
    # Register the checkpoint directory on the streaming context.
    streaming.checkpoint(checkpointDirectory)
    return streaming
def spark_session(app, cores=2, gpus=0, max_failures=1, *args):
    """Yield a SparkSession for tests and stop it when the caller is done.

    All Spark scratch state (local dir, SQL warehouse, Derby metastore) lives
    in a temporary directory that exists only while the session is in use.
    When ``gpus`` is positive, a GPU discovery script is generated and a
    single-worker ``local-cluster`` master is used instead of ``local``.

    NOTE(review): this is a generator; it is presumably wrapped by a
    context-manager decorator at the definition site - not visible here.
    """
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    with TemporaryDirectory() as tmpdir:
        metastore_path = os.path.join(tmpdir, 'metastore')

        # Start a single worker with the given cores when GPUs are present;
        # max_failures is ignored in that case.
        if gpus > 0:
            master = 'local-cluster[1,{},1024]'.format(cores)
        else:
            master = 'local[{},{}]'.format(cores, max_failures)

        base_settings = [
            ('spark.ui.showConsoleProgress', 'false'),
            ('spark.test.home', os.environ.get('SPARK_HOME')),
            ('spark.locality.wait', '0'),
            ('spark.unsafe.exceptionOnMemoryLeak', 'true'),
            ('spark.ui.enabled', 'false'),
            ('spark.local.dir', os.path.join(tmpdir, 'tmp')),
            ('spark.sql.warehouse.dir', os.path.join(tmpdir, 'warehouse')),
            ('javax.jdo.option.ConnectionURL',
             f'jdbc:derby:;databaseName={metastore_path};create=true'),
        ]
        conf = SparkConf().setAppName(app).setMaster(master)
        conf = conf.setAll(base_settings)

        with temppath() as temp_filename:
            if gpus > 0:
                # The discovery script echoes a JSON document listing one
                # address per requested GPU.
                with open(temp_filename, 'wb') as temp_file:
                    addresses = ', '.join('\\"{}\\"'.format(i) for i in range(gpus))
                    script = (b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": ['
                              + addresses.encode('ascii') + b']}')
                    temp_file.write(script)

                    mode = (stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP
                            | stat.S_IROTH | stat.S_IXOTH)
                    os.chmod(temp_file.name, mode)

                # The single worker takes all GPUs discovered and a single
                # executor gets them; each task on that executor gets one GPU.
                gpu_settings = [
                    ('spark.worker.resource.gpu.discoveryScript', temp_filename),
                    ('spark.worker.resource.gpu.amount', str(gpus)),
                    ('spark.task.resource.gpu.amount', '1'),
                    ('spark.executor.resource.gpu.amount', str(gpus)),
                ]
                conf = conf.setAll(gpu_settings)

            builder = SparkSession.builder.config(conf=conf)
            session = builder.getOrCreate()

            try:
                yield session
            finally:
                session.stop()
def get_spark_conf(self):
    """Return a SparkConf populated with this object's application settings.

    Full list of options: https://spark.apache.org/docs/latest/configuration.html
    """
    settings = (
        # Application name, taken from SPARK_APP_NAME.
        ("spark.app.name", self.SPARK_APP_NAME),
        # Memory requested per executor.
        ("spark.executor.memory", "2g"),
        # Address of the Spark master.
        ("spark.master", self.SPARK_URL),
        # Number of CPU cores used by each executor.
        ("spark.executor.cores", "2"),
        # Raise 'spark.sql.pivotMaxValues' (e.g. to '99999') when pivoting a
        # DataFrame with a very large number of distinct values.
    )

    spark_conf = SparkConf()
    spark_conf.setAll(settings)
    return spark_conf
def __init__(self, path_files: str, path_index: str, path_dict: str, file_name: str, num_partition: int):
    """Start the Spark session and load the dictionary, document and index.

    :param path_files: directory prefix of the document to process; it is
        concatenated directly with ``file_name``.
    :param path_index: location of the word-id/doc-id index (parquet).
    :param path_dict: location of the dictionary (parquet).
    :param file_name: name of the document inside ``path_files``.
    :param num_partition: number of partitions for the dictionary frame.
    """
    path = path_files + file_name

    self.__file_name = file_name

    conf = SparkConf()
    conf.setAll(
        [
            ('spark.app.name', 'Challenge Data Engineer'),
            ('spark.driver.cores', '4'),
            ('spark.executor.cores', '4'),
            ('spark.driver.maxResultSize', '10g'),
            ('spark.executor.memory', '4g'),
            # The key previously carried a trailing space, so the value was
            # stored under a name Spark never reads.
            ('spark.executor.memoryOverhead', '4g'),
            ('spark.driver.memory', '10g'),
            # NOTE(review): uses the module-level PATH_INDEX rather than the
            # ``path_index`` argument - confirm this is intended.
            ('spark.local.dir', PATH_INDEX),
            # NOTE(review): the Spark documentation says max heap size (-Xmx)
            # must not be set through this option; left unchanged here.
            ('spark.driver.extraJavaOptions', '-Xmx1024m'),
            ('spark.memory.offHeap.enabled', 'true'),
            ('spark.memory.offHeap.size', '20g')
        ]
    )

    self.__spark = SparkSession \
        .builder \
        .config(conf=conf) \
        .getOrCreate()

    self.__df_dict = self.__spark \
        .read \
        .parquet(path_dict) \
        .repartition(numPartitions=num_partition)

    self.__df_doc = self.__spark \
        .read \
        .text(path)

    # The index is repartitioned at RDD level and converted back to a
    # DataFrame afterwards.
    self.__df_wordid_docid = self.__spark \
        .read \
        .parquet(path_index) \
        .rdd \
        .unpersist() \
        .repartition(numPartitions=1000)

    self.__df_wordid_docid = self.__df_wordid_docid.toDF()

    logging.warning(f"Processing doc: {path}")
def create_spark_session(self):
    """Return a SparkSession configured from this object's SPARK_* attributes.

    Hive support is enabled on the builder when ENABLE_HIVE_SUPPORT is truthy.
    """
    settings = (
        ("spark.app.name", self.SPARK_APP_NAME),
        ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),
        ("spark.master", self.SPARK_MASTER),
        ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),
        ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
        ("spark.debug.maxToStringFields", "10000"),
    )
    spark_conf = SparkConf()
    spark_conf.setAll(settings)

    builder = SparkSession.builder.config(conf=spark_conf)
    if self.ENABLE_HIVE_SUPPORT:
        builder = builder.enableHiveSupport()
    return builder.getOrCreate()
def __init__(self, path_files: str, path_index: str, path_dict: str, file_name: str, num_partition: int):
    """Start the Spark session, load the inputs and print diagnostics.

    :param path_files: directory prefix of the document to process; it is
        concatenated directly with ``file_name``.
    :param path_index: location of the word-id/doc-id index (parquet).
    :param path_dict: location of the dictionary (parquet).
    :param file_name: name of the document inside ``path_files``.
    :param num_partition: number of partitions for the dictionary frame.
    """
    path = path_files + file_name

    self.__file_name = file_name

    conf = SparkConf()

    # Application Properties
    # http://spark.apache.org/docs/latest/configuration.html#spark-properties
    conf.setAll([('spark.app.name', 'Challenge Data Engineer'),
                 ('spark.driver.cores', '4'),
                 ('spark.executor.cores', '4'),
                 ('spark.driver.maxResultSize', '10g'),
                 ('spark.executor.memory', '10g'),
                 # The key previously carried a trailing space, so the value
                 # was stored under a name Spark never reads.
                 ('spark.executor.memoryOverhead', '10g'),
                 ('spark.driver.memory', '10g'),
                 # NOTE(review): uses the module-level PATH_INDEX rather than
                 # the ``path_index`` argument - confirm this is intended.
                 ('spark.local.dir', PATH_INDEX),
                 # NOTE(review): the Spark documentation says max heap size
                 # (-Xmx) must not be set through this option; left unchanged.
                 ('spark.driver.extraJavaOptions', '-Xmx1024m'),
                 ('spark.memory.offHeap.enabled', 'true'),
                 ('spark.memory.offHeap.size', '20g')])

    self.__spark = SparkSession \
        .builder \
        .config(conf=conf) \
        .getOrCreate()

    self.__df_dict = self.__spark \
        .read \
        .parquet(path_dict) \
        .repartition(numPartitions=num_partition)

    self.__df_doc = self.__spark \
        .read \
        .text(path)

    # The index is repartitioned at RDD level; it stays an RDD until the
    # diagnostics below have been printed.
    self.__df_wordid_docid = self.__spark \
        .read \
        .parquet(path_index) \
        .rdd \
        .unpersist() \
        .repartition(numPartitions=1000)

    # Diagnostics: storage level and partition count of the index RDD, the
    # effective SparkConf, and the SQL settings.
    print(self.__df_wordid_docid.getStorageLevel())
    print(self.__df_wordid_docid.getNumPartitions())
    print(self.__spark.sparkContext.getConf().getAll())
    self.__spark.sql("SET -v").show(n=200, truncate=False)

    self.__df_wordid_docid = self.__df_wordid_docid.toDF()

    self.__spark.sparkContext.setLogLevel("warn")
    logging.warning(f"Processing doc: {path}")
def _init_spark(self, appname):
    """Create the SparkContext for this workflow.

    Note: pyspark is imported inside this function so that the interface can
    be queried outside of pyspark.

    The Spark settings come from ``config_data["options"]["spark-config"]``;
    every value is converted to a string in place (booleans become
    'true'/'false'). A non-zero legacy 'corespertask' option is copied into
    'spark.task.cpus'.

    Raises:
        RuntimeError: if both 'corespertask' and a 'spark.task.cpus' other
            than '1' are configured.
    """
    # Currently using LZ4 compression: should not degrade runtime much but
    # helps operations like shuffling, especially with highly compressible
    # label volumes.
    # NOTE: objects > INT_MAX will cause problems for LZ4.
    worker_env = {}
    workflow_tmpdir = os.environ.get("DVIDSPARK_WORKFLOW_TMPDIR")
    if workflow_tmpdir:
        worker_env["DVIDSPARK_WORKFLOW_TMPDIR"] = workflow_tmpdir

    try:
        spark_config = self.config_data["options"]["spark-config"]
    except KeyError:
        # Old workflows haven't been updated to inherit the base Workflow schema.
        spark_config = {}

    # Normalize the values in place.
    for key in list(spark_config.keys()):
        text = str(spark_config[key])
        if text in ('True', 'False'):
            text = text.lower()
        spark_config[key] = text

    # Backwards compatibility:
    # if the 'corespertask' option exists, it overrides the spark config.
    options = self.config_data["options"]
    if "corespertask" in options and options["corespertask"] != 0:
        if spark_config.get("spark.task.cpus", '1') != '1':
            raise RuntimeError("Bad config: You can't set both 'corespertask' and 'spark.task.cpus'. Use 'spark.task.cpus'.")
        spark_config["spark.task.cpus"] = str(options["corespertask"])

    # Set the spark config.
    from pyspark import SparkContext, SparkConf
    conf = SparkConf()
    conf.setAppName(appname)
    conf.setAll(list(spark_config.items()))

    # To profile with pyspark_flame: import FlameProfiler, create
    # f'{self.config_dir}/flamegraphs', set "spark.python.profile.dump" to
    # that directory and "spark.python.profile" to "true", put
    # 'pyspark_flame.interval' (default 0.2 seconds) into worker_env, and pass
    # profiler_cls=FlameProfiler to SparkContext below.

    # The auto-batching heuristic doesn't work well with our auto-compressed
    # numpy array pickling scheme, so batching is disabled with batchSize=1.
    return SparkContext(conf=conf, batchSize=1, environment=worker_env)
def _create_spark_session(self):
    """Return a SparkSession built from this object's SPARK_* attributes.

    Hive support is enabled on the builder when ENABLE_HIVE_SUPPORT is truthy.
    """
    settings = (
        ('spark.app.name', self.SPARK_APP_NAME),
        ('spark.executor.memory', self.SPARK_EXECUTOR_MEMORY),
        ('spark.master', self.SPARK_URL),
        ('spark.executor.cores', self.SPARK_EXECUTOR_CORES),
        ('spark.executor.instances', self.SPARK_EXECUTOR_INSTANCES),
    )
    spark_conf = SparkConf()
    spark_conf.setAll(settings)

    builder = SparkSession.builder.config(conf=spark_conf)
    if self.ENABLE_HIVE_SUPPORT:
        builder = builder.enableHiveSupport()
    return builder.getOrCreate()
def spark_cluster(logfile, discovery_schedule, hosts, extra_conf=None):
    """Start a test Spark cluster and yield a SparkSession connected to it.

    ``extra_conf`` is an optional sequence of ``(property, value)`` pairs.
    Every property in it must have a default in
    ``conf.SPARK_CONF_DEFAULT_VALUES``, otherwise ``ValueError`` is raised
    before the cluster is started. The session is stopped and the cluster
    shut down when the generator is finalized.
    """
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    if extra_conf:
        unknown_keys = {prop for prop, _ in extra_conf} \
            .difference(conf.SPARK_CONF_DEFAULT_VALUES.keys())
    else:
        unknown_keys = None
    if unknown_keys:
        raise ValueError(
            'default values must be defined for these properties: {}'.format(
                unknown_keys))

    cluster = SparkClusterController(logfile, discovery_schedule, hosts, 1)
    try:
        cluster.start()

        fixed_settings = [
            # pyspark-shell JVM will OOM even with 1GB when all tests run in one process
            # SparkContext and pyspark-shell JVM gets reused even though we do SparkSession.stop()
            # pyspark-shell JVM memory footprint increases from test to test
            # when run with pytest --forked, set SPARK_DRIVER_MEM=512m env
            ('spark.driver.memory', os.environ.get('SPARK_DRIVER_MEM', '1500m')),
            # the minimum executor memory we can set
            ('spark.executor.memory', '512m'),
            # don't pollute the log with progress bar
            ('spark.ui.showConsoleProgress', 'false'),
        ]
        config = SparkConf().setAppName('elastic spark tests')
        config = config.setMaster(cluster.master_url())
        config = config.setAll(fixed_settings)

        # Config properties, once set, survive session.stop() and
        # SparkSession.config(conf=config).getOrCreate(), so every default is
        # written explicitly unless extra_conf overrides it.
        more_conf = conf.SPARK_CONF_DEFAULT_VALUES.copy()
        more_conf.update(extra_conf or [])
        config.setAll(more_conf.items())

        session = SparkSession.builder.config(conf=config).getOrCreate()
        try:
            yield session
        finally:
            session.stop()
    finally:
        cluster.shutdown()
def get_spark_config(
        predictrip_config: Mapping[str, Mapping[str, str]]) -> SparkConf:
    """
    Create an object representing the Spark configuration we want

    The AWS credentials from the 'AWS' section are passed to the S3A
    connector, and the optional 'executor_cores' / 'executor_memory' entries
    of the 'Spark' section are copied to the matching Spark properties.

    :type predictrip_config: mapping returned by load_config containing configuration options
    :return: pyspark.SparkConf instance
    """
    # NOTE: contrary to https://www.geomesa.org/documentation/user/spark/pyspark.html#using-geomesa-pyspark, use of
    # geomesa_pyspark.configure() no longer necessary since Spark 2.1, as long as you tell spark to include the
    # geomesa_pyspark python module some other way (e.g. spark.files)

    sc = SparkConf()
    sc = sc.setAppName('PredicTrip ' + path.basename(__file__))

    # Hadoop options only reach the Hadoop configuration when their SparkConf
    # key is prefixed with 'spark.hadoop.', and the S3A connector reads its
    # credentials from 'fs.s3a.access.key' / 'fs.s3a.secret.key'. The
    # previously used 'fs.s3a.awsAccessKeyId' / 'fs.s3a.awsSecretAccessKey'
    # satisfied neither condition and therefore had no effect.
    sc = sc.setAll([('spark.hadoop.fs.s3a.access.key', predictrip_config['AWS']['access_key_id']),
                    ('spark.hadoop.fs.s3a.secret.key', predictrip_config['AWS']['secret_access_key'])])

    # add to sc any spark options that might be set in predictrip_config
    spark_section = predictrip_config['Spark']
    if 'executor_cores' in spark_section:
        sc = sc.set('spark.executor.cores', spark_section['executor_cores'])
    if 'executor_memory' in spark_section:
        sc = sc.set('spark.executor.memory', spark_section['executor_memory'])

    return sc
def _create_spark_session(self):
    '''Create and initialise the SparkSession used by this Spark program.'''
    # 1. Build the configuration.
    settings = (
        # Application name, taken from SPARK_APP_NAME.
        ("spark.app.name", self.SPARK_APP_NAME),
        # Memory requested per executor.
        ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),
        # Address of the Spark master.
        ("spark.master", self.SPARK_URL),
        # Number of CPU cores used by each executor.
        ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),
        ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
    )
    spark_conf = SparkConf()
    spark_conf.setAll(settings)

    # 2. Initialise the session from the configuration.
    builder = SparkSession.builder.config(conf=spark_conf)
    if self.ENABLE_HIVE_SUPPORT:
        builder = builder.enableHiveSupport()
    return builder.getOrCreate()
def build_spark_session(
        app_name: str,
        spark_config: DefaultDict[str, str] = None,
        hadoop_config: DefaultDict[str, str] = None) -> SparkSession:
    """Create a SparkContext and return a SparkSession running on top of it.

    :param app_name: application name, used when neither ``spark_config``
        nor the environment (e.g. spark-submit) already provides
        ``spark.app.name``.
    :param spark_config: optional Spark properties applied before the
        context is created.
    :param hadoop_config: optional Hadoop properties written to the
        context's Hadoop configuration after it is created.
    :return: the SparkSession bound to the created context.
    """
    conf = SparkConf()

    if spark_config:
        conf.setAll(spark_config.items())

    # The name must be on the SparkConf *before* the context is created:
    # SparkContext refuses to start without 'spark.app.name'. Setting it only
    # on the session builder had no effect, because the builder was then
    # overwritten with the context's own configuration. setIfMissing keeps an
    # explicitly configured name.
    conf.setIfMissing('spark.app.name', app_name)

    sc = SparkContext(conf=conf)

    if hadoop_config:
        hadoop_conf = sc._jsc.hadoopConfiguration()
        for k, v in hadoop_config.items():
            hadoop_conf.set(k, v)

    return SparkSession.builder \
        .config(conf=sc.getConf()) \
        .getOrCreate()
def _create_spark_hbase(self):
    """Return a SparkSession whose configuration also carries the HBase
    ZooKeeper quorum and client port.

    Hive support is enabled on the builder when ENABLE_HIVE_SUPPORT is truthy.
    """
    settings = (
        # Application name, taken from SPARK_APP_NAME.
        ("spark.app.name", self.SPARK_APP_NAME),
        # Memory requested per executor.
        ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),
        # Address of the Spark master.
        ("spark.master", self.SPARK_URL),
        # Number of CPU cores used by each executor.
        ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),
        ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
        ("hbase.zookeeper.quorum", "192.168.19.137"),
        ("hbase.zookeeper.property.clientPort", "22181"),
    )
    spark_conf = SparkConf()
    spark_conf.setAll(settings)

    # Build the session from the configuration object.
    builder = SparkSession.builder.config(conf=spark_conf)
    if self.ENABLE_HIVE_SUPPORT:
        builder = builder.enableHiveSupport()
    return builder.getOrCreate()
def create_spark_conf():
    """Return a SparkConf pre-populated with the BigDL settings.

    On Spark 2.2 and later the BigDL classpath is added to the driver
    classpath. Anything listed in the PYSPARK_FILES environment variable is
    placed in front of ``spark.submit.pyFiles``.
    """
    sparkConf = SparkConf()
    sparkConf.setAll(get_bigdl_conf().items())

    if not is_spark_below_2_2():
        extend_spark_driver_cp(sparkConf, get_bigdl_classpath())

    # Add the content of PYSPARK_FILES to spark.submit.pyFiles.
    # This is a workaround for current Spark on k8s.
    python_lib = os.environ.get('PYSPARK_FILES', None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            merged = "%s,%s" % (python_lib, existing_py_files)
        else:
            merged = python_lib
        sparkConf.set(key="spark.submit.pyFiles", value=merged)

    return sparkConf
def _create_spark_session(self):
    """Return a SparkSession that uses a remote Hive metastore.

    The Hive flag and the applied settings are printed before the session is
    built. 'spark.master' and 'spark.sql.warehouse.dir' are deliberately not
    set here.
    """
    config = (
        ("spark.app.name", self.SPARK_APP_NAME),
        ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),
        ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),
        ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
        ("hive.metastore.uris", "thrift://172.18.0.2:9083"),
    )
    spark_conf = SparkConf()
    spark_conf.setAll(config)

    print(self.ENABLE_HIVE_SUPPORT, config)

    builder = SparkSession.builder.config(conf=spark_conf)
    if self.ENABLE_HIVE_SUPPORT:
        builder = builder.enableHiveSupport()
    return builder.getOrCreate()
def closure(*args, **kwargs):
    """Run the wrapped ``fn`` inside a freshly configured SparkContext.

    The Spark options are derived from the enclosing scope (``opts``,
    resource sizes, Mesos settings); option names are turned into
    ``spark.*`` properties by replacing underscores with dots, and options
    whose value is ``None`` or ``''`` are dropped. ``files`` and ``pyfiles``
    are shipped to the cluster before ``fn(sc, sql, *args, **kwargs)`` is
    called; its result is returned and the context is closed afterwards.
    """
    try:
        # Work on a copy so that repeated calls never mutate the enclosing
        # ``opts`` mapping.
        options = dict(opts)
        options.update({
            'sql_parquet_compression_codec': 'uncompressed',
            'mesos_role': role,
            'mesos_coarse': bool(coarse),
            'cores_max': int(coarse) or None,
            'executor_cores': int(executor_cores),
            'executor_memory': '{}m'.format(int(executor_memory / MiB)),
            'driver_memory': '{}m'.format(int(driver_memory / MiB)),
            'mesos_executor_memoryOverhead': int(
                (memory_overhead or (executor_cores * python_worker_memory +
                                     0.1 * executor_memory))
                / MiB),
            'python_worker_memory': int(python_worker_memory / MiB),
            'mesos_uris': ','.join(uris),
            'mesos_executor_docker_image': docker
        })
        options = {'spark.{}'.format(k.replace('_', '.')): str(v)
                   for k, v in options.items() if v not in (None, '')}
        environs = envs.items()
    except TypeError as e:
        # curry doesn't reraise TypeErrors:
        # https://github.com/pytoolz/toolz/issues/288
        raise Exception(repr(e))

    conf = SparkConf()
    conf.setMaster(str(master))
    conf.setAppName(str(name or fn.__name__))
    conf.setAll(pairs=options.items())
    conf.setExecutorEnv(pairs=environs)

    with SparkContext(conf=conf) as sc:
        sc.setLogLevel(str(log))
        # Explicit loops: on Python 3 ``map`` is lazy, so the previous
        # ``map(sc.addFile, files)`` never shipped anything.
        for extra_file in files:
            sc.addFile(extra_file)
        for py_file in pyfiles:
            sc.addPyFile(py_file)
        # TODO: use SparkSession
        sql = SQLContext(sc)
        return fn(sc, sql, *args, **kwargs)
def get_spark_config(path, dependencies) -> SparkConf:
    """Return the SparkConf used by unit tests (two local threads).

    The driver classpath is made of the project's ``target/classes`` and
    ``target/test-classes`` directories under ``path`` (resolved against the
    current working directory), followed by ``dependencies``.
    """
    master = 'local[2]'
    classpath_entries = [
        os.path.join(os.getcwd(), path, 'target', 'classes'),
        os.path.join(os.getcwd(), path, 'target', 'test-classes'),
        dependencies,
    ]
    driver_classpath = '{}'.format(':'.join(classpath_entries))

    spark_conf = SparkConf()
    spark_conf = spark_conf.setAppName('unit test')
    spark_conf = spark_conf.setMaster(master)
    return spark_conf.setAll([
        ('spark.ui.showConsoleProgress', 'false'),
        ('spark.test.home', os.environ.get('SPARK_HOME')),
        ('spark.locality.wait', '0'),
        ('spark.driver.extraClassPath', driver_classpath),
    ])
def spark_session(app, cores=2, gpus=0, max_failures=1, *args):
    """Yield a SparkSession for tests and stop it when the caller is done.

    With ``gpus > 0`` a ``local-cluster`` master is used and a generated GPU
    discovery script is registered, with one GPU per worker, executor and
    task; otherwise a plain ``local`` master is used.

    NOTE(review): this is a generator; it is presumably wrapped by a
    context-manager decorator at the definition site - not visible here.
    """
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    if gpus > 0:
        master = 'local-cluster[{},{},1024]'.format(cores, max_failures)
    else:
        master = 'local[{},{}]'.format(cores, max_failures)

    conf = SparkConf().setAppName(app).setMaster(master)
    conf = conf.setAll([
        ('spark.ui.showConsoleProgress', 'false'),
        ('spark.test.home', os.environ.get('SPARK_HOME')),
        ('spark.locality.wait', '0'),
    ])

    with temppath() as temp_filename:
        if gpus > 0:
            # The discovery script echoes a JSON document listing one address
            # per requested GPU.
            with open(temp_filename, 'wb') as temp_file:
                addresses = ', '.join('\\"{}\\"'.format(i) for i in range(gpus))
                script = (b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": ['
                          + addresses.encode('ascii') + b']}')
                temp_file.write(script)

                mode = (stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP
                        | stat.S_IROTH | stat.S_IXOTH)
                os.chmod(temp_file.name, mode)

            gpu_settings = [
                ('spark.worker.resource.gpu.discoveryScript', temp_filename),
                ('spark.worker.resource.gpu.amount', '1'),
                ('spark.task.resource.gpu.amount', '1'),
                ('spark.executor.resource.gpu.amount', '1')
            ]
            conf = conf.setAll(gpu_settings)

        builder = SparkSession.builder.config(conf=conf)
        session = builder.getOrCreate()

        try:
            yield session
        finally:
            session.stop()
def create_spark_conf():
    """Return a SparkConf pre-populated with the BigDL settings.

    When BIGDL_JARS is set and Spark is 2.2 or later, each ':'-separated jar
    is added to the driver classpath. Anything listed in the PYSPARK_FILES
    environment variable is placed in front of ``spark.submit.pyFiles``.
    """
    sparkConf = SparkConf()
    sparkConf.setAll(get_bigdl_conf().items())

    if os.environ.get("BIGDL_JARS", None) and not is_spark_below_2_2():
        for jar in os.environ["BIGDL_JARS"].split(":"):
            extend_spark_driver_cp(sparkConf, jar)

    # Add the content of PYSPARK_FILES to spark.submit.pyFiles.
    # This is a workaround for current Spark on k8s.
    python_lib = os.environ.get("PYSPARK_FILES", None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            merged = "%s,%s" % (python_lib, existing_py_files)
        else:
            merged = python_lib
        sparkConf.set(key="spark.submit.pyFiles", value=merged)

    return sparkConf
def create_spark_conf():
    """Return a SparkConf holding every setting from ``get_bigdl_conf()``."""
    sparkConf = SparkConf()
    sparkConf.setAll(get_bigdl_conf().items())
    return sparkConf
except msg2xlsx.ConvertingError: logger.error("converting failed, clean the previous work") xlsx_path = os.path.join(settings.SAVE_DIRECTORY, project_id) clean(project_id, dirs=[temp_file, xlsx_path]) return # processed ack channel.basic_ack(delivery_tag=method.delivery_tag) else: logger.error("illegal content-type: " + header.content_type) if __name__ == '__main__': # initialize spark conf = SparkConf().setMaster(settings.SPARK_MASTER_URL).setAppName(settings.SPARK_APP_NAME) conf.setAll([("spark.eventLog.enabled", "true"), ("spark.eventLog.dir", settings.LOG_DIRECTORY)]) sc = SparkContext(conf=conf) # initialize rabbitmq credentials = pika.PlainCredentials(settings.RABBITMQ_CONN_CONF['username'], settings.RABBITMQ_CONN_CONF['password']) conn_params = pika.ConnectionParameters(settings.RABBITMQ_CONN_CONF['host'], credentials=credentials) conn_broker = pika.BlockingConnection(conn_params) channel = conn_broker.channel() channel.exchange_declare(exchange=settings.RABBITMQ_SPARK['exchange'], type="direct", passive=False, durable=True, auto_delete=False) channel.queue_declare(queue=settings.RABBITMQ_SPARK['queue'])
from pyspark.streaming.kafka import *;
from pyspark.storagelevel import StorageLevel
import happybase

# Spark application settings.
appName = "KafkaStreams"
config = SparkConf().setAppName(appName)

# Extra properties applied to the SparkConf in one call below.
# NOTE(review): "spark.rememberDuration", "spark.batchDuration" and
# "spark.streaming.timeout" do not look like documented Spark properties -
# confirm they have any effect.
props = []
props.append(("spark.rememberDuration", "10"))
props.append(("spark.batchDuration", "10"))
props.append(("spark.eventLog.enabled", "true"))
props.append(("spark.streaming.timeout", "30"))
props.append(("spark.ui.enabled", "true"))
config = config.setAll(props)

# Streaming context with a batch interval of 5 (seconds, per the
# StreamingContext API).
sc = SparkContext(conf=config)
ssc = StreamingContext(sc, 5)

# Direct Kafka stream over topic "t1"; each batch is pretty-printed.
topics = ["t1"]
kafka_params = {
    "zookeeper.connect" : "localhost:5181/kafka"
    , "metadata.broker.list" : "localhost:9092"
    , "group.id" : "Kafka_MapR-Streams_to_HBase"}
raw = KafkaUtils.createDirectStream(ssc, topics, kafka_params)
raw.pprint()

# Target server and table name.
# NOTE(review): presumably consumed by happybase code later in the file -
# not visible here.
server = "localhost"
table_name = "/tables/stocks"
class SparkUtils:
    '''
    Helper around a SparkSession: starts the session with a set of default
    configurations, runs SQL, and re-partitions / persists the resulting
    DataFrames (locally, to HDFS, to S3 or to Hive).

    Job parameters are passed as a dict keyed by command-line style names
    ("--runEnv", "--saveDFAs", "-explainDF", ...).
    '''

    def __init__(self,
                 log: RootLogger = None,
                 parms: dict = None,
                 botoSession: bototSession = None,
                 appName: str = None):
        self.log = log
        self.__parms = parms or {}
        self.__runEnv = self.__parms.get("--runEnv", "local")
        if self.__runEnv == "aws":
            self.__boto = botoSession
            self.__s3 = S3(log, self.__boto)
        self.__initFlags()
        self.__setupSparkSession__(appName)
        # Default partition count: executors * cores per executor * 2.
        self.__dfltRDDParts = \
            int(self.__spark.conf.get("spark.executor.instances", "20")) * \
            int(self.__spark.conf.get("spark.executor.cores", "4")) * 2

    def __initFlags(self):
        '''
        Init the job level parameters needed by this class
        '''
        self.__parms["--saveDFAs"] = self.__parms.get("--saveDFAs", "NONE")
        self.__explainDF = "-explainDF" in self.__parms
        self.__printcount = "-printCount" in self.__parms
        self.__useHist = "-useHint" in self.__parms
        self.__saveDF = self.__parms["--saveDFAs"] != "NONE"
        self.__fileFmt = self.__parms.get("--fileFormat", "parquet")
        # The HDFS temp location is always needed: outside AWS the 'S3'
        # persist type falls back to HDFS (see storeDF).
        self.__tempHDFS = self.__parms.get("--tempHDFS", "hdfs:///temp")
        if self.__runEnv == "aws":
            self.__tempS3 = self.__parms.get("--tempS3", "hdfs:///temp/s3")
        else:
            self.log.warning(
                "For persist type 'S3', 'HDFS' will be used as the --runEnv != 'aws'"
            )

    def __setupSparkSession__(self, appName: str = None):
        '''
        Init the Spark environment with a few default configurations and
        start the spark session.
        '''
        self.__conf = SparkConf()
        hmConf = {
            # Was "spark.rps.askTimeout", which is not a Spark property.
            "spark.rpc.askTimeout": "1200",
            "spark.network.timeout": "1200",
            "spark.broadcast.blockSize": "16m",
            "spark.sql.broadcastTimeout": "1200",
            "spark.broadcast.compress": "true",
            "spark.rdd.compress": "true",
            "fs.s3.enableServerSideEncryption": "true",
            "spark.kryo.unsafe": "false",
            "spark.kryoserializer.buffer": "10240",
            "spark.kryoserializer.buffer.max": "2040m",
            "spark.io.compression.codec":
            "org.apache.spark.io.SnappyCompressionCodec",
            "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
            "mapreduce.fileoutputcommitter.algorithm.version": "2",
            "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version":
            "2",
        }
        # setAll expects (key, value) pairs, not a dict.
        self.__conf.setAll(hmConf.items())
        SparkContext.setSystemProperty("com.amazonaws.services.s3.enableV4",
                                       "true")
        SparkContext.setSystemProperty("com.amazonaws.services.s3.enforceV4",
                                       "true")
        self.__spark = SparkSession \
            .builder \
            .config(conf=self.__conf) \
            .appName(appName or "PySparkApp") \
            .enableHiveSupport() \
            .getOrCreate()
        self.__sc = self.__spark.sparkContext
        self.sqlC = SQLContext(self.__sc)
        self.__sc.setSystemProperty("com.amazonaws.services.s3.enableV4",
                                    "true")
        self.__sc.setSystemProperty("com.amazonaws.services.s3.enforceV4",
                                    "true")
        self.__sc.setLogLevel(self.__parms.get("--logLevel", "INFO"))

        # PySpark exposes the Hadoop configuration through the Java context;
        # it is a Java object with set(key, value) and no setAll.
        hdpCnf = self.__sc._jsc.hadoopConfiguration()
        region = self.__parms.get("--awsRegion", "us-east-1")
        hadoopDefaults = {
            "io.file.buffer.size": "65536",
            "mapreduce.fileoutputcommitter.algorithm.version": "2",
            "fs.s3a.endpoint": "s3.%s.amazonaws.com" % (region),
        }
        for key, value in hadoopDefaults.items():
            hdpCnf.set(key, value)

        if self.__runEnv == "aws":
            from botocore.credentials import InstanceMetadataProvider, InstanceMetadataFetcher
            provider = InstanceMetadataProvider(
                iam_role_fetcher=InstanceMetadataFetcher(timeout=1000,
                                                         num_attempts=2))
            # NOTE(review): load() may return None when no instance role is
            # available, and instance credentials normally also carry a
            # session token - confirm whether that needs to be handled.
            creds = provider.load()
            awsSettings = {
                "fs.s3a.access.key": creds.access_key,
                "fs.s3a.secret.key": creds.secret_key,
                "fs.s3a.server-side-encryption-algorithm": "SSE-KMS",
                "fs.s3.enableServerSideEncryption": "true",
                "fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
                "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
                "fs.s3a.endpoint": "s3.%s.amazonaws.com" % (region),
            }
            for key, value in awsSettings.items():
                hdpCnf.set(key, value)

    def sql(self,
            dfName: str,
            query: str,
            partitions: int = 0,
            persistType: str = None):
        '''
        Runs the input SQL, partitions the resulting Dataframe and persists
        the Dataframe if needed.

        Supported persistType: In addition to the pySpark native persist
        types, this function supports HIVE, HDFS, S3

        When persistType is None the partitioned Dataframe is returned as is.
        Otherwise it is stored through storeDF, registered as the temp view
        `dfName` (when given) and returned.
        '''
        if persistType is None:
            _df = self.__spark.sql(self.handleHints(query))
            if partitions == 0:
                df = _df
            elif _df.rdd.getNumPartitions() < partitions:
                df = _df.repartition(partitions)
            else:
                df = _df.coalesce(partitions)
            # NOTE(review): no temp view is registered on this path, as in
            # the original flow - confirm that is intended.
            return df

        df = self.storeDF(
            df=self.sql(dfName=None, query=query, partitions=partitions),
            dfName=dfName,
            persistType=persistType,
            partitions=partitions,
            partitionCols=self.getPartitionColumnsFromSQL(query))
        if dfName:
            df.createOrReplaceTempView(dfName)
        if self.__printcount:
            self.log.info("Number of Records in DF '%s' : %d " %
                          (dfName, df.count()))
        return df

    def storeDF(self, df: DataFrame, dfName: str, persistType: str,
                partitions: int, partitionCols: List[str]):
        '''
        Store the input dataframe, read the persisted dataframe and return
        the new one. If Memory/Disk persistence is requested, take(1) is run
        on the dataframe to force the persist.
        '''
        persistType = persistType.upper()
        if self.__explainDF or persistType not in ("NULL", "NONE"):
            self.log.info("Execution pland for building the DF '%s'" %
                          (dfName))
            df.explain()
            self.log.info("\n\n\n")

        # The job-level --saveDFAs override applies unless the caller asked
        # for HIVE or NULL explicitly.
        if self.__saveDF and persistType not in ("HIVE", "NULL"):
            saveType = self.__parms["--saveDFAs"].upper()
        else:
            saveType = persistType

        if saveType == "S3" and self.__runEnv != "aws":
            saveType = "HDFS"
            self.log.debug(
                "Resetting the persist type to 'HDFS' as the --runEnv != 'aws'"
            )

        # Only externally stored frames are re-partitioned first.
        if saveType in ("HDFS", "HIVE", "S3"):
            df1 = self.repartitionDF(dataFrame=df, partitions=partitions)
        else:
            df1 = df

        if saveType == "NULL" or saveType == "NONE":
            return df1
        elif saveType == "HDFS":
            return self.persistExternal(self.__tempHDFS, dfName, df1,
                                        partitionCols)
        elif saveType == "S3":
            return self.persistExternal(self.__tempS3, dfName, df1,
                                        partitionCols)
        elif saveType == "HIVE":
            return self.persist2Hive(dfName, df1, partitionCols)
        elif saveType == "CHECK_POINT":
            return df1.cache().checkpoint(eager=True)
        else:
            return self.persistLocal(dfName, df1, saveType)

    def persistExternal(self,
                        parentDirURI: str,
                        fileName: str,
                        df: DataFrame,
                        partitionCols: List[str] = None,
                        overwrite: bool = True,
                        fileFormat: str = None,
                        **kwargs):
        '''
        Write the dataframe below `parentDirURI` and read it back with the
        original schema. The format defaults to the job's --fileFormat.
        '''
        # Join with exactly one separator. A blanket replace("//", "/") must
        # not be used: it would also rewrite the "://" of the URI scheme.
        if parentDirURI.endswith("/"):
            fullPath = "%s%s" % (parentDirURI, fileName or "")
        else:
            fullPath = "%s/%s" % (parentDirURI, fileName or "")
        schma = df.schema  # a property, not a method
        fileFormat = fileFormat or self.__fileFmt
        self.write2ExtrFile(fullPath=fullPath,
                            fileFormat=fileFormat,
                            df=df,
                            partitionCols=partitionCols,
                            overwrite=overwrite,
                            **kwargs)
        df.unpersist()
        if fileFormat == "orc":
            return self.readOrc(uriString=fullPath, schema=schma, **kwargs)
        elif fileFormat == "csv":
            return self.readCSV(uriString=fullPath, schema=schma, **kwargs)
        else:
            # "parquet" and any unknown format are read as parquet.
            return self.readParquet(uriString=fullPath,
                                    schema=schma,
                                    **kwargs)

    def readParquet(self,
                    uriString: str,
                    schema: StructType = None,
                    mergeSchema: bool = False,
                    **kwargs):
        '''
        Read the parquet data at `uriString`, optionally merging schemas
        and/or enforcing `schema`.
        '''
        self.log.info("Reading the parquet file '%s'" % uriString)
        rdr = self.__spark.read.format("parquet")
        if mergeSchema:
            rdr = rdr.option("mergeSchema", "true")
        if schema:
            rdr = rdr.schema(schema)
        return rdr.load(uriString)

    def readOrc(self, uriString: str, schema: StructType, **kwargs):
        self.log.info("Reading the ORC file in '%s'" % uriString)
        pass  ##TODO

    def readCSV(self, uriString: str, schema: StructType, **kwargs):
        self.log.info("Reading the CSV file in '%s'" % uriString)
        pass  ##TODO

    def write2ExtrFile(self,
                       fileFormat: str,
                       fullPath: str,
                       df: DataFrame,
                       partitionCols: List[str] = None,
                       overwrite: bool = True,
                       **kwargs):
        if fullPath.startswith("s3"):
            self.__s3.waitForFile("%s/_SUCCESS" % (fullPath))
        #TODO:Yet to Implement

    def persist2Hive(self, table: str, df: DataFrame,
                     partitionCols: List[str]):
        pass
        #TODO:Yet to Implement

    def persistLocal(self, dfName: str, df: DataFrame, persistType: str):
        '''
        Persist the input Dataframe locally (memory/disk/none) and run
        `df.take(1)` to force the persist, unless a record count is going to
        be computed anyway. Returns the dataframe.
        '''
        lvl = self.getSparkPersistType(persistType.upper())
        if lvl:
            df.persist(lvl)
            if not self.__printcount:
                df.take(1)
        return df

    def getSparkPersistType(self, persistTypStr: str) -> StorageLevel:
        '''
        Converts the String representation to the StorageLevel Object.
        "NONE" and any unsupported string return None (no persistence); a
        warning is logged for unsupported strings.

        Supported,
            `StorageLevel.DISK_ONLY`
            `StorageLevel.DISK_ONLY_2`
            `StorageLevel.MEMORY_ONLY`
            `StorageLevel.MEMORY_ONLY_2`
            `StorageLevel.MEMORY_AND_DISK`
            `StorageLevel.MEMORY_AND_DISK_2`
            `StorageLevel.OFF_HEAP`
        '''
        if persistTypStr == "NONE":
            return None
        supported = ("DISK_ONLY", "DISK_ONLY_2", "MEMORY_ONLY",
                     "MEMORY_ONLY_2", "MEMORY_AND_DISK", "MEMORY_AND_DISK_2",
                     "OFF_HEAP")
        if persistTypStr in supported:
            return getattr(StorageLevel, persistTypStr)
        self.log.warning(
            "Invalid Persist Type %s received. Defaulting to NONE" %
            (persistTypStr))
        return None

    def repartitionDF(self, dataFrame: DataFrame, partitions: int = 0):
        '''
        Repartition the input dataframe
        parms: dataFrame -> dataframe
               partitions -> new partitions count. Defaulted to 0 i.e Don't partition
        logic,
            if partitions = 0 , Don't repartition
            if partitions = -1, Repartition to the default number (NumOfExecutors * ExecutorCores * 2)
            if partitions > 0 , Repartition/coalesce to the input number
            any other negative value is treated like 0
        '''
        curParts = dataFrame.rdd.getNumPartitions()
        if partitions == -1:
            finalParts = self.__dfltRDDParts
        elif partitions > 0 and partitions != curParts:
            finalParts = partitions
        else:
            finalParts = -1  # nothing to do
        self.log.debug("Current Partitions: %d , Requested: %d,  Final: %d " %
                       (curParts, partitions, finalParts))
        if finalParts == -1 or finalParts == curParts:
            return dataFrame
        elif curParts > finalParts:
            return dataFrame.coalesce(finalParts)
        else:
            return dataFrame.repartition(finalParts)

    def handleHints(self, query: str):
        '''
        Removes the SparkSQL hints if the -useHint parm is not set.

        Example:- If sql = 'select /* hints */ cols.. from ..'
            if -useHint is not set, return 'select  cols.. from ..'
            else return 'select /* hints */ cols.. from ..'
        '''
        if self.__useHist:
            return query
        # Non-greedy, so that text between two separate comments survives.
        return re.sub(r'/\*.*?\*/', '', query, flags=re.DOTALL)

    @staticmethod
    def getPartitionColumnsFromSQL(query):
        '''
        Return the (lower-cased) column names of the query's CLUSTER BY
        clause, or of its DISTRIBUTE BY clause up to an optional SORT BY.
        Returns an empty list when the query has neither clause.
        '''
        s = query.lower().strip().replace("\n", " ")
        # 12 == len(" cluster by "), 15 == len(" distribute by ")
        inx = s.find(" cluster ")
        if inx >= 0:
            cols = s[inx + 12:]
        else:
            frm = s.find(" distribute ")
            if frm < 0:
                return []
            to = s.find(" sort ", frm + 15)
            cols = s[frm + 15:to] if to > frm else s[frm + 15:]
        return [c.strip() for c in cols.split(",")]
# -*- coding: utf-8 -*- ''' ''' __author__ = 'Foxlora' __time__ = '2020/10/10 22:22' from pyspark import SparkConf from pyspark.sql import SparkSession from pyspark import SparkContext from pyspark.streaming import StreamingContext from pyspark.streaming.kafka import KafkaUtils from setting.default import DefaultConfig # 1、创建spark streaming context conf conf = SparkConf() conf.setAll(DefaultConfig.SPARK_ONLINE_CONFIG) sc = SparkContext(conf=conf) stream_sc = StreamingContext(sc, 60) # 2、配置与kafka读取的配置 similar_kafka = {"metadata.broker.list": DefaultConfig.KAFKA_SERVER, "group.id": 'similar'} SIMILAR_DS = KafkaUtils.createDirectStream(stream_sc, ['click-trace'], similar_kafka)
specific data Note that I use spark because there is currently no way to use SQL queries with dask """ from pyspark import SparkConf from pyspark import SparkContext from pyspark.sql import SQLContext, DataFrame # This could benefit from some tweaks especially if the database becomes larger conf = SparkConf() conf.set("spark.sql.autoBroadcastJoinThreshold", 1024 * 1024 * 100) conf.setAppName('Mnist_Spark_MLP').setMaster('local[8]') conf.setAll([('spark.executor.memory', '8g'), ('spark.executor.cores', '3'), ('spark.cores.max', '3'), ('spark.driver.memory', '8g')]) conf.set("spark.sql.caseSensitive", "true") # Global imports import glob import yaml import logging logging.getLogger().setLevel(logging.INFO) import os import textwrap import numpy as np import subprocess from datetime import datetime import copy import time import fnmatch
from __future__ import print_function, division
import os
import sys
import copy
import functools

from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Base configuration: run on YARN in the 'solution' queue.
conf = SparkConf().setMaster("yarn").setAppName("autofe").set(
    'spark.yarn.queue', 'solution')

# set app resources
configs = [('spark.driver.memory', '10g'), ('spark.executor.memory', '4g'),
           ('spark.executor.instances', '10'), ('spark.executor.cores', '2')]
conf.setAll(configs)
# conf = SparkConf().set('master', 'local')

sc = SparkContext.getOrCreate(conf=conf)
# sc = SparkContext.getOrCreate()
sql_context = HiveContext(sc)

# Preprocess the action table
# Load the data
path = "hdfs://m7-model-hdp01:8020/user/2-6-0-model-test/user_1/nodes/data-load-load-240240/out/20190717/DAG_36240/NODE_240240/SLOT_0/DataLoad/02150359716"
t = sql_context.read.parquet(path)

# Inspect the data
print("查看几行数据")
# DataFrame.show() prints the rows itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
t.show(5)
print("查看数据类型")
print(t.dtypes)
def __setupSparkSession__(
        self,
        jobConf: dict,
) -> SparkSession:
    '''
    Init the Spark environment with a few default configurations and start
    the Spark session.

    Args:
        jobConf: job configuration. The keys read here are ``sparkconfs``
            and ``hadoopconfs`` (mappings of overrides), ``name``, and
            ``appconfs`` (``runenv``, ``appdefaults`` and
            ``logging.sparkloglevel``).

    Returns:
        The SparkSession, with Hive support enabled, obtained through
        ``getOrCreate()``.
    '''
    conf = SparkConf()

    #
    # Setup Spark Specific configurations
    #
    hmConf = {
        "spark.executor.pyspark.memory": "512m",
        "spark.debug.maxToStringFields": "5000",
        # Was "spark.rps.askTimeout", which is not a Spark property.
        "spark.rpc.askTimeout": "1200",
        "spark.network.timeout": "1200",
        "spark.maxRemoteBlockSizeFetchToMem": "512m",
        "spark.broadcast.blockSize": "16m",
        "spark.broadcast.compress": "true",
        "spark.rdd.compress": "true",
        "spark.io.compression.codec": "org.apache.spark.io.SnappyCompressionCodec",
        "spark.kryo.unsafe": "true",
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.kryoserializer.buffer": "10240",
        "spark.kryoserializer.buffer.max": "2040m",
        "hive.exec.dynamic.partition": "true",
        "hive.exec.dynamic.partition.mode": "nonstrict",
        "hive.warehouse.data.skiptrash": "true",
        "spark.sql.hive.metastorePartitionPruning": "true",
        "spark.sql.broadcastTimeout": "1200",
        "spark.sql.sources.partitionOverwriteMode": "dynamic",
        "spark.sql.orc.filterPushdown": "true",
        "spark.sql.orc.splits.include.file.footer": "true",
        "spark.sql.orc.cache.stripe.details.size": "1000",
        "spark.hadoop.parquet.enable.summary-metadata": "false",
        "spark.sql.parquet.mergeSchema": "false",
        "spark.sql.parquet.filterPushdown": "true",
        "spark.sql.parquet.fs.optimized.committer.optimization-enabled": "true",
        "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2",
        "spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored": "true",
    }
    # Job-level values override the defaults above (a dict has no .set()).
    hmConf.update(jobConf['sparkconfs'])
    # SparkConf.setAll expects (key, value) pairs, not a mapping.
    conf.setAll(list(hmConf.items()))

    #
    # Setup Hadoop Specific configurations
    #
    # No SparkContext exists yet, so there is no live Hadoop Configuration to
    # mutate. The settings are collected here and handed to Spark below as
    # "spark.hadoop.*" properties, which Spark copies into the Hadoop
    # configuration when the context is created.
    hdpCnf = {
        'io.file.buffer.size': '65536',
        'mapreduce.fileoutputcommitter.algorithm.version': '2',
    }
    hdpCnf.update(jobConf['hadoopconfs'])

    #
    # Setup AWS Specific configurations
    #
    if jobConf['appconfs']['runenv'].upper() == 'AWS':
        SparkContext.setSystemProperty(
            'com.amazonaws.services.s3.enableV4', 'true')
        SparkContext.setSystemProperty(
            'com.amazonaws.services.s3.enforceV4', 'true')
        conf.set(
            "spark.sql.parquet.output.committer.class",
            "com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter"
        )
        # Best effort: pick up instance-profile credentials when available.
        # A failure is logged instead of being silently swallowed, and the
        # session is still created without explicit keys.
        try:
            from botocore.credentials import InstanceMetadataProvider, InstanceMetadataFetcher
            provider = InstanceMetadataProvider(
                iam_role_fetcher=InstanceMetadataFetcher(timeout=1000,
                                                         num_attempts=2))
            creds = provider.load()
            if creds is not None:
                # NOTE(review): instance-profile credentials are usually
                # temporary; confirm whether a session token must be
                # configured as well.
                hdpCnf['fs.s3a.access.key'] = creds.access_key
                # Previously written under 'fs.s3a.access.key' a second
                # time, which replaced the access key with the secret.
                hdpCnf['fs.s3a.secret.key'] = creds.secret_key
        except Exception as exc:
            log.warning(
                "Could not load instance metadata credentials: %s", exc)
        hdpCnf.update({
            'fs.s3a.server-side-encryption-algorithm': 'SSE-KMS',
            'fs.s3.enableServerSideEncryption': 'true',
            'fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
            'fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
            # NOTE(review): 'appdefaults' is formatted into the endpoint as
            # if it were a region name - confirm against the job config.
            'fs.s3a.endpoint': "s3.%s.amazonaws.com" %
                               (jobConf['appconfs']['appdefaults'] or 'us-east-1')
        })

    for (k, v) in hdpCnf.items():
        conf.set('spark.hadoop.' + k, v)

    spark = SparkSession \
        .builder \
        .config(conf=conf) \
        .appName(jobConf['name'] or 'PySparkApp') \
        .enableHiveSupport() \
        .getOrCreate()
    sc = spark.sparkContext

    logLevel = jobConf['appconfs']['logging']['sparkloglevel'] or 'INFO'
    sc.setLogLevel(logLevel)
    # The old test `level or 'INFO' == "DEBUG"` parsed as
    # `level or ('INFO' == "DEBUG")`, so it was true for every configured level.
    if logLevel == "DEBUG":
        msg = "".join(
            "\t%50s -> %s\n" % (k, v) for (k, v) in sc._conf.getAll())
        log.debug(
            "Initiated SparkSesion with below confs,\n{}".format(msg))
    return spark
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import *
from pyspark.storagelevel import StorageLevel

appName = "KafkaStreams"

# Properties applied to the configuration in addition to the app name.
props = [
    ("spark.rememberDuration", "10"),
    ("spark.batchDuration", "10"),
    ("spark.eventLog.enabled", "true"),
    ("spark.streaming.timeout", "30"),
    ("spark.ui.enabled", "true"),
]
config = SparkConf().setAppName(appName).setAll(props)

sc = SparkContext(conf=config)
ssc = StreamingContext(sc, 5)

# Direct stream over topic "t1"; each batch is printed with pprint().
topics = ["t1"]
kafka_params = {
    "zookeeper.connect": "localhost:5181/kafka",
    "metadata.broker.list": "localhost:9092",
    "group.id": "Kafka_MapR-Streams_to_HBase",
}
raw = KafkaUtils.createDirectStream(ssc, topics, kafka_params)
raw.pprint()

ssc.start()  # Start the computation
def spark_session(spark_id, executor_num, local_dir):
    """Create (or fetch) a Hive-enabled SparkSession running on YARN.

    Args:
        spark_id: used as the application name, as the log prefix and, when
            ``local_dir`` is empty, as the name of the job directory.
        executor_num: value for ``spark.executor.instances`` and
            ``spark.sql.shuffle.partitions``.
        local_dir: working directory holding ``tmp`` and ``metastore_db``.
            When falsy, ``JOB_ROOT_DIR.LOCAL_ROOT/<spark_id>`` is used and
            its ``tmp`` sub-directory is created.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    logger.info('[%s] init spark session', spark_id)
    # spark
    if 'SPARK_HOME' not in os.environ:
        os.environ['SPARK_HOME'] = SPARK_HOME
    os.environ['PYSPARK_PYTHON'] = WORKER_PYTHON
    os.environ['PYSPARK_DRIVER_PYTHON'] = DRIVER_PYTHON
    if not local_dir:
        local_dir = os.path.join(JOB_ROOT_DIR.LOCAL_ROOT, spark_id)
        # exist_ok: reusing a spark_id must not fail on the existing folder.
        os.makedirs(os.path.join(local_dir, 'tmp'), exist_ok=True)
        #os.makedirs(os.path.join(local_dir, 'metastore_db'))
    tmp_dir = os.path.join(local_dir, 'tmp')

    spark_conf = SparkConf()
    conf_details = [
        # ('spark.yarn.jars', ''),
        # ('spark.executorEnv.PATH', SPARK_CONFIG['WORKER_PATH']),
        # ('spark.eventLog.dir', 'hdfs://TS-CLICKH011:8020/spark/history'),
        # ('spark.yarn.historyServer.address', 'http://ts-clickh09:18080/'),
        # ('spark.executorEnv.PATH', './python3/bin/:$PATH'),
        # ('spark.appMasterEnv.PATH', SparkConfig['WORKER_PATH']),
        # ('spark.yarn.appMasterEnv.PYSPARK_PYTHON', './python3/bin/python3'),
        # ('spark.executorEnv.PYSPARK_PYTHON', './python3/bin/python3'),
        # ('spark.driver.host', '172.22.16.57'),
        # ('spark.pyspark.python', './python3/bin/python3'),
        # ('spark.pyspark.python', './python3/bin/python3'),
        # ('spark.pyspark.driver.python', '/data/anaconda3/bin/python'),
        ('spark.yarn.archive', HDFS_SPARK_JARS),
        ('spark.yarn.dist.archives', SPARK_CONFIG['SPARK_ARCHIVES']),
        ('spark.eventLog.enabled', 'true'),
        ('spark.eventLog.compress', 'true'),
        ('spark.driver.memory', '2G'),
        ('spark.driver.extraJavaOptions',
         f'-Duser.timezone=UTC+0800 -Djava.io.tmpdir={tmp_dir} -Dderby.system.home={os.path.abspath(local_dir)}'
         ),
        ('spark.executor.extraJavaOptions',
         '-Duser.timezone=UTC+0800 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps'
         ),
        ('spark.executor.instances', executor_num),
        ('spark.executor.memory', '8G'),
        ('spark.executor.cores', 4),
        ('spark.sql.shuffle.partitions', executor_num),
        ('spark.yarn.executor.memoryOverhead', '4G'),
        ('spark.sql.warehouse.dir', os.path.join(local_dir, 'metastore_db')),
        ('spark.local.dir', tmp_dir),
        ('spark.driver.extraClassPath',
         "/data/tool/env/hadoop-lzo/lib/hadoop-lzo-0.4.19.jar"),
        # The key used to end with a space, which made it a different
        # (unknown) property, so the native library path was never applied.
        ('spark.driver.extraLibraryPath',
         '/data/tool/env/hadoop-lzo/lib/native/'),
    ]
    for k, v in conf_details:
        print(f'[{spark_id}] spark config {k} = {v}')
    spark_conf.setAll(conf_details)
    spark_conf.setAppName(f'{spark_id}')
    spark_conf.setMaster('yarn')
    spark = SparkSession.builder.config(
        conf=spark_conf).enableHiveSupport().getOrCreate()
    return spark
def init_spark(config, app=None, use_session=False):
    """Prepare the PySpark environment and create a Spark entry point.

    Environment variables and ``PYSPARK_SUBMIT_ARGS`` are derived from
    ``config`` before pyspark is imported, then either a SparkSession or a
    SparkContext is created from the ``spark-prop`` section.

    Args:
        config: configuration object. It is indexed both with plain keys
            (``'spark-home'``, ``'jars'``, ``'spark-prop'``, ``'app'``) and
            with dotted paths (``'spark-prop.spark.master'``), so it must
            support both forms of lookup.
        app: application name; falls back to ``config['app']`` when falsy.
        use_session: when true return a ``SparkSession``, otherwise a
            ``SparkContext``.

    Returns:
        The created ``SparkSession`` or ``SparkContext``.

    Raises:
        KeyError: if ``SPARK_HOME`` is neither configured nor already set in
            the environment.
    """
    import os
    import sys
    from glob import glob

    if 'spark-home' in config:
        os.environ['SPARK_HOME'] = config['spark-home']
    if 'spark-conf-dir' in config:
        os.environ['SPARK_CONF_DIR'] = config['spark-conf-dir']
    if 'pyspark-python' in config:
        # Set python interpreter on both driver and workers
        os.environ['PYSPARK_PYTHON'] = config['pyspark-python']
    if 'yarn-conf-dir' in config:
        # Hadoop YARN configuration
        os.environ['YARN_CONF_DIR'] = config['yarn-conf-dir']
    if 'spark-classpath' in config:
        # can be used to use external folder with Hive configuration
        # e. g. spark-classpath='/etc/hive/conf.cloudera.hive1'
        os.environ['SPARK_CLASSPATH'] = config['spark-classpath']

    submit_args = []
    driver_mem = config.get('spark-prop.spark.driver.memory', None)
    if driver_mem is not None:
        submit_args.extend(["--driver-memory", driver_mem])
    driver_cp = config.get('spark-prop.spark.driver.extraClassPath', None)
    if driver_cp is not None:
        submit_args.extend(["--driver-class-path", driver_cp])
    driver_java_opt = config.get('spark-prop.spark.driver.extraJavaOptions', None)
    if driver_java_opt is not None:
        submit_args.extend(["--driver-java-options", driver_java_opt])
    jars = config.get('jars', None)
    if jars is not None:
        if isinstance(jars, str):
            jars = [jars]
        submit_args.extend(["--jars", ','.join(jars)])

    mode_yarn = config['spark-prop.spark.master'].startswith('yarn')
    if mode_yarn:
        # pyspark .zip distribution flag is set only if spark-submit have master=yarn in command-line arguments
        # see spark.yarn.isPython conf property setting code
        # in org.apache.spark.deploy.SparkSubmit#prepareSubmitEnvironment
        submit_args.extend(['--master', 'yarn'])
    # pyspark .zip distribution flag is set only if spark-submit have pyspark-shell or .py as positional argument
    # see spark.yarn.isPython conf property setting code
    # in org.apache.spark.deploy.SparkSubmit#prepareSubmitEnvironment
    submit_args.append('pyspark-shell')
    os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(submit_args)

    # Make the zipped libraries bundled with the Spark distribution importable.
    spark_home = os.environ['SPARK_HOME']
    spark_python = os.path.join(spark_home, 'python')
    pyspark_libs = glob(os.path.join(spark_python, 'lib', '*.zip'))
    sys.path.extend(pyspark_libs)

    virtualenv_reqs = config['spark-prop'].get('spark.pyspark.virtualenv.requirements', None)
    if use_session:
        from pyspark.sql import SparkSession

        builder = SparkSession.builder.appName(app or config['app'])
        if mode_yarn:
            builder = builder.enableHiveSupport()
        for k, v in prop_list(config['spark-prop']).items():
            builder = builder.config(k, v)
        ss = builder.getOrCreate()
        if virtualenv_reqs is not None:
            # addFile lives on the SparkContext; SparkSession has no such
            # method, so the previous ss.addFile(...) raised AttributeError.
            ss.sparkContext.addFile(virtualenv_reqs)
        return ss
    else:
        from pyspark import SparkConf, SparkContext

        conf = SparkConf()
        conf.setAppName(app or config['app'])
        props = [(k, str(v)) for k, v in prop_list(config['spark-prop']).items()]
        conf.setAll(props)
        sc = SparkContext(conf=conf)
        if virtualenv_reqs is not None:
            sc.addFile(virtualenv_reqs)
        return sc
def _create_spark_context():
    """Return a SparkContext that uses the Hive catalog implementation.

    The pairs returned by ``self._setup_options(additional_options)`` (both
    names come from the enclosing scope) are applied after the catalog
    setting, so they can override it.
    """
    conf = SparkConf().set('spark.sql.catalogImplementation', 'hive')
    extra_options = self._setup_options(additional_options)
    return SparkContext(conf=conf.setAll(extra_options))
# -*- coding: UTF-8 -*-
import happybase
from setting.default import DefaultConfig
import redis

# Shared HBase (Thrift) connection pool.
pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)

# Recall data.
# With decode_responses=True the values of stored key/value pairs are
# handled as str; without it they are bytes.
redis_client = redis.StrictRedis(
    host=DefaultConfig.REDIS_HOST,
    port=DefaultConfig.REDIS_PORT,
    db=10,
    decode_responses=True,
)

# Redis database used for caching.
# With decode_responses=True the values of stored key/value pairs are
# handled as str; without it they are bytes.
cache_client = redis.StrictRedis(
    host=DefaultConfig.REDIS_HOST,
    port=DefaultConfig.REDIS_PORT,
    db=8,
    decode_responses=True,
)

# Used by the ranking logic in sort_service.py
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Spark configuration
conf = SparkConf().setAll(DefaultConfig.SPARK_GRPC_CONFIG)
SORT_SPARK = SparkSession.builder.config(conf=conf).getOrCreate()