Example No. 1
def get_SparkContext(app_name='tuixing-spark', **kwargs):
    conf = SparkConf()
    conf.setAppName(app_name)
    conf.setAll(COMMON_SC)
    for key in kwargs:
        conf.set(key, kwargs[key])

    sc = SparkContext(conf=conf)
    return sc
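
A quick usage sketch for the helper above, assuming COMMON_SC is a module-level list of (key, value) defaults and that SparkConf/SparkContext come from pyspark; extra keyword arguments become per-job overrides:

from pyspark import SparkConf, SparkContext

COMMON_SC = [('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')]  # assumed defaults

# dotted config keys can still be passed as keyword arguments via ** unpacking
sc = get_SparkContext('my-job', **{'spark.executor.memory': '4g'})
print(sc.getConf().get('spark.executor.memory'))  # -> 4g
sc.stop()
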
def create_streaming_context():
    conf = SparkConf()
    pairs = [('spark.app.name', 'Process Stories Stream'),
             ('spark.master', 'local[4]'), ('spark.ui.port', '4040')]
    conf.setAll(pairs)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, batch_secs)
    ssc.checkpoint(checkpointDirectory)  # set checkpoint directory
    return ssc
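
The function above relies on two module-level names, batch_secs and checkpointDirectory, that are not shown. A hedged sketch of how the returned context would typically be driven, with assumed values for those names:

batch_secs = 10                      # assumed batch interval in seconds
checkpointDirectory = '/tmp/ckpt'    # assumed checkpoint location

ssc = create_streaming_context()
# attach DStream sources and transformations here, before starting the context
ssc.start()
ssc.awaitTermination()
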
Example No. 3
def spark_session(app, cores=2, gpus=0, max_failures=1, *args):
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    with TemporaryDirectory() as tmpdir:
        metastore_path = os.path.join(tmpdir, 'metastore')

        # start a single worker with the given cores when gpus are present;
        # max_failures is ignored in that case
        master = 'local-cluster[1,{},1024]'.format(cores) if gpus > 0 \
            else 'local[{},{}]'.format(cores, max_failures)
        conf = SparkConf().setAppName(app).setMaster(master)
        conf = conf.setAll([
            ('spark.ui.showConsoleProgress', 'false'),
            ('spark.test.home', os.environ.get('SPARK_HOME')),
            ('spark.locality.wait', '0'),
            ('spark.unsafe.exceptionOnMemoryLeak', 'true'),
            ('spark.ui.enabled', 'false'),
            ('spark.local.dir', os.path.join(tmpdir, 'tmp')),
            ('spark.sql.warehouse.dir', os.path.join(tmpdir, 'warehouse')),
            ('javax.jdo.option.ConnectionURL',
             f'jdbc:derby:;databaseName={metastore_path};create=true'),
        ])

        with temppath() as temp_filename:
            if gpus > 0:
                with open(temp_filename, 'wb') as temp_file:
                    addresses = ', '.join('\\"{}\\"'.format(i)
                                          for i in range(gpus))
                    temp_file.write(
                        b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": [' +
                        addresses.encode('ascii') + b']}')

                os.chmod(
                    temp_file.name, stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP
                    | stat.S_IROTH | stat.S_IXOTH)

                # the single worker takes all gpus discovered, and a single executor will get them
                # each task on that executor will get a single gpu
                conf = conf.setAll([
                    ('spark.worker.resource.gpu.discoveryScript',
                     temp_filename),
                    ('spark.worker.resource.gpu.amount', str(gpus)),
                    ('spark.task.resource.gpu.amount', '1'),
                    ('spark.executor.resource.gpu.amount', str(gpus)),
                ])

            session = SparkSession \
                .builder \
                .config(conf=conf) \
                .getOrCreate()

            try:
                yield session
            finally:
                session.stop()
 def get_spark_conf(self):
     conf = SparkConf()  # create the Spark config object
     config = (
         ("spark.app.name", self.SPARK_APP_NAME),  # app name; if not provided, Spark generates a random one
         ("spark.executor.memory", "2g"),  # memory used per executor when the app starts, default 1g
         ("spark.master", self.SPARK_URL),  # address of the Spark master
         ("spark.executor.cores", "2")  # number of CPU cores used per executor
         # ('spark.sql.pivotMaxValues', '99999'),  # raise when pivoting a DF with many distinct values, default 10000
     )
     # Full configuration reference: https://spark.apache.org/docs/latest/configuration.html
     conf.setAll(config)
     return conf
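
The method above only builds the SparkConf; a session still has to be created from it. A minimal sketch, assuming cfg is an instance of the surrounding (unnamed) class:

from pyspark.sql import SparkSession

conf = cfg.get_spark_conf()
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print(spark.sparkContext.getConf().get('spark.executor.cores'))  # expected: '2'
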
Example No. 5
    def __init__(self,
                 path_files: str,
                 path_index: str,
                 path_dict: str,
                 file_name: str,
                 num_partition: int):
        path = ''.join(path_files + file_name)
        self.__file_name = file_name

        conf = SparkConf()
        conf.setAll(
            [
                ('spark.app.name', 'Challenge Data Engineer'),
                ('spark.driver.cores', '4'),
                ('spark.executor.cores', '4'),
                ('spark.driver.maxResultSize', '10g'),
                ('spark.executor.memory', '4g'),
                ('spark.executor.memoryOverhead', '4g'),
                ('spark.driver.memory', '10g'),
                ('spark.local.dir', PATH_INDEX),
                ('spark.driver.extraJavaOptions', '-Xmx1024m'),
                ('spark.memory.offHeap.enabled', 'true'),
                ('spark.memory.offHeap.size', '20g')
            ]
        )

        self.__spark = SparkSession \
            .builder \
            .config(conf=conf) \
            .getOrCreate()

        self.__df_dict = self.__spark \
            .read \
            .parquet(path_dict) \
            .repartition(numPartitions=num_partition)

        self.__df_doc = self.__spark \
            .read \
            .text(path)

        self.__df_wordid_docid = self.__spark \
            .read \
            .parquet(path_index) \
            .rdd \
            .unpersist() \
            .repartition(numPartitions=1000)

        self.__df_wordid_docid = self.__df_wordid_docid.toDF()

        logging.warning(f"Processing doc: {path}")
Example No. 6
 def create_spark_session(self):
     conf = SparkConf()
     config = (("spark.app.name", self.SPARK_APP_NAME),
               ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),
               ("spark.master", self.SPARK_MASTER),
               ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),
               ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
               ("spark.debug.maxToStringFields", "10000"))
     conf.setAll(config)
     if self.ENABLE_HIVE_SUPPORT:
         return SparkSession.builder.config(
             conf=conf).enableHiveSupport().getOrCreate()
     else:
         return SparkSession.builder.config(conf=conf).getOrCreate()
    def __init__(self, path_files: str, path_index: str, path_dict: str,
                 file_name: str, num_partition: int):
        path = ''.join(path_files + file_name)
        self.__file_name = file_name

        conf = SparkConf()
        # Application Properties
        # http://spark.apache.org/docs/latest/configuration.html#spark-properties
        conf.setAll([('spark.app.name', 'Challenge Data Engineer'),
                     ('spark.driver.cores', '4'),
                     ('spark.executor.cores', '4'),
                     ('spark.driver.maxResultSize', '10g'),
                     ('spark.executor.memory', '10g'),
                     ('spark.executor.memoryOverhead', '10g'),
                     ('spark.driver.memory', '10g'),
                     ('spark.local.dir', PATH_INDEX),
                     ('spark.driver.extraJavaOptions', '-Xmx1024m'),
                     ('spark.memory.offHeap.enabled', 'true'),
                     ('spark.memory.offHeap.size', '20g')])

        self.__spark = SparkSession \
            .builder \
            .config(conf=conf) \
            .getOrCreate()

        self.__df_dict = self.__spark \
            .read \
            .parquet(path_dict) \
            .repartition(numPartitions=num_partition)

        self.__df_doc = self.__spark \
            .read \
            .text(path)

        self.__df_wordid_docid = self.__spark \
            .read \
            .parquet(path_index) \
            .rdd \
            .unpersist() \
            .repartition(numPartitions=1000)

        print(self.__df_wordid_docid.getStorageLevel())
        print(self.__df_wordid_docid.getNumPartitions())
        print(self.__spark.sparkContext.getConf().getAll())
        self.__spark.sql("SET -v").show(n=200, truncate=False)

        self.__df_wordid_docid = self.__df_wordid_docid.toDF()

        self.__spark.sparkContext.setLogLevel("warn")
        logging.warning(f"Processing doc: {path}")
Example No. 8
    def _init_spark(self, appname):
        """Internal function to setup spark context
        
        Note: only include spark modules here so that
        the interface can be queried outside of pyspark.

        """
        # currently using LZ4 compression: should not degrade runtime much
        # but will help with some operations like shuffling, especially when
        # dealing with objects like highly compressible label volumes
        # NOTE: objects > INT_MAX will cause problems for LZ4
        worker_env = {}
        if "DVIDSPARK_WORKFLOW_TMPDIR" in os.environ and os.environ["DVIDSPARK_WORKFLOW_TMPDIR"]:
            worker_env["DVIDSPARK_WORKFLOW_TMPDIR"] = os.environ["DVIDSPARK_WORKFLOW_TMPDIR"]
        
        try:
            spark_config = self.config_data["options"]["spark-config"]
        except KeyError:
            # Old workflows haven't been updated to inherit the base Workflow schema
            spark_config = {}
        
        for k in list(spark_config.keys()):
            spark_config[k] = str(spark_config[k])
            if spark_config[k] in ('True', 'False'):
                spark_config[k] = spark_config[k].lower()
            
        # Backwards compatibility:
        # if 'corespertask' option exists, override it in the spark config
        if "corespertask" in self.config_data["options"] and self.config_data["options"]["corespertask"] != 0:
            if "spark.task.cpus" in spark_config and spark_config["spark.task.cpus"] != '1':
                raise RuntimeError("Bad config: You can't set both 'corespertask' and 'spark.task.cpus'.  Use 'spark.task.cpus'.")
            spark_config["spark.task.cpus"] = str(self.config_data["options"]["corespertask"])

        # set spark config
        from pyspark import SparkContext, SparkConf
        conf = SparkConf()
        conf.setAppName(appname)
        conf.setAll(list(spark_config.items()))
        
#         from pyspark_flame import FlameProfiler
#         flamegraph_dir = f'{self.config_dir}/flamegraphs'
#         os.makedirs(flamegraph_dir, exist_ok=True)
#         conf.set("spark.python.profile.dump", flamegraph_dir)
#         conf.set("spark.python.profile", "true")
#         worker_env['pyspark_flame.interval'] = 0.25 # Default is 0.2 seconds
#         return SparkContext(conf=conf, batchSize=1, environment=worker_env, profiler_cls=FlameProfiler)

        # Auto-batching heuristic doesn't work well with our auto-compressed numpy array pickling scheme.
        # Therefore, disable batching with batchSize=1
        return SparkContext(conf=conf, batchSize=1, environment=worker_env)
Example No. 9
 def _create_spark_session(self):
     conf = SparkConf()
     config = (
         ('spark.app.name', self.SPARK_APP_NAME),
         ('spark.executor.memory', self.SPARK_EXECUTOR_MEMORY),
         ('spark.master', self.SPARK_URL),
         ('spark.executor.cores', self.SPARK_EXECUTOR_CORES),
         ('spark.executor.instances', self.SPARK_EXECUTOR_INSTANCES),
     )
     conf.setAll(config)
     if self.ENABLE_HIVE_SUPPORT:
         return SparkSession.builder.config(
             conf=conf).enableHiveSupport().getOrCreate()
     else:
         return SparkSession.builder.config(conf=conf).getOrCreate()
Example No. 10
def spark_cluster(logfile, discovery_schedule, hosts, extra_conf=None):
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    unknown_keys = set([prop for prop, _ in extra_conf]) \
        .difference(conf.SPARK_CONF_DEFAULT_VALUES.keys()) \
        if extra_conf else None
    if unknown_keys:
        raise ValueError(
            'default values must be defined for these properties: {}'.format(
                unknown_keys))

    cluster = SparkClusterController(logfile, discovery_schedule, hosts, 1)
    try:
        cluster.start()

        config = SparkConf().setAppName('elastic spark tests').setMaster(
            cluster.master_url())
        config = config.setAll([
            # pyspark-shell JVM will OOM even with 1GB when all tests run in one process
            # SparkContext and pyspark-shell JVM gets reused even though we do SparkSession.stop()
            # pyspark-shell JVM memory footprint increases from test to test
            # when run with pytest --forked, set SPARK_DRIVER_MEM=512m env
            ('spark.driver.memory', os.environ.get('SPARK_DRIVER_MEM',
                                                   '1500m')),
            # the minimum executor memory we can set
            ('spark.executor.memory', '512m'),
            # don't pollute the log with progress bar
            ('spark.ui.showConsoleProgress', 'false'),
        ])
        # config properties once set will survive session.stop() and
        # SparkSession.config(conf=config).getOrCreate(), so we have to make sure
        # we overwrite their value if not in extra_conf
        more_conf = conf.SPARK_CONF_DEFAULT_VALUES.copy()
        more_conf.update(extra_conf or [])
        config.setAll(more_conf.items())

        session = SparkSession \
            .builder \
            .config(conf=config) \
            .getOrCreate()

        try:
            yield session
        finally:
            session.stop()
    finally:
        cluster.shutdown()
Example No. 11
def get_spark_config(
        predictrip_config: Mapping[str, Mapping[str, str]]) -> SparkConf:
    """
    Create an object representing the Spark configuration we want

    :type predictrip_config: mapping returned by load_config containing configuration options
    :return: pyspark.SparkConf instance
    """
    # NOTE: contrary to https://www.geomesa.org/documentation/user/spark/pyspark.html#using-geomesa-pyspark, use of
    # geomesa_pyspark.configure() no longer necessary since Spark 2.1, as long as you tell spark to include the
    # geomesa_pyspark python module some other way (e.g. spark.files)

    sc = SparkConf()
    sc = sc.setAppName('PredicTrip ' + path.basename(__file__))
    # FIXME: the following doesn't seem to be effective
    sc = sc.setAll([('fs.s3a.awsAccessKeyId',
                     predictrip_config['AWS']['access_key_id']),
                    ('fs.s3a.awsSecretAccessKey',
                     predictrip_config['AWS']['secret_access_key'])])
    # add to sc any spark options that might be set in predictrip_config
    if 'executor_cores' in predictrip_config['Spark']:
        sc = sc.set('spark.executor.cores',
                    predictrip_config['Spark']['executor_cores'])
    if 'executor_memory' in predictrip_config['Spark']:
        sc = sc.set('spark.executor.memory',
                    predictrip_config['Spark']['executor_memory'])
    return sc
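
The FIXME above is consistent with the fact that bare fs.s3a.* keys on a SparkConf are not automatically propagated to the Hadoop configuration. A hedged alternative sketch (the spark.hadoop. prefix makes Spark copy the keys into the Hadoop configuration at context startup; credential values are placeholders):

from pyspark import SparkConf

predictrip_config = {'AWS': {'access_key_id': '...', 'secret_access_key': '...'}}  # placeholder

conf = SparkConf().setAppName('PredicTrip s3a sketch')
conf = conf.setAll([
    ('spark.hadoop.fs.s3a.access.key', predictrip_config['AWS']['access_key_id']),
    ('spark.hadoop.fs.s3a.secret.key', predictrip_config['AWS']['secret_access_key']),
])
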
Example No. 12
 def _create_spark_session(self):
     '''Create and initialize a SparkSession for the Spark program.'''
     # 1. Build the configuration
     conf = SparkConf()
     config = (
         ("spark.app.name", self.SPARK_APP_NAME),  # app name; if not provided, Spark generates a random one
         ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),  # memory used per executor when the app starts, default 2g
         ("spark.master", self.SPARK_URL),  # address of the Spark master
         ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),  # number of CPU cores per executor, default 1
         ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES)
     )
     conf.setAll(config)
     # 2. Initialize the session from the config
     if self.ENABLE_HIVE_SUPPORT:
         return SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()
     else:
         return SparkSession.builder.config(conf=conf).getOrCreate()
Example No. 13
def build_spark_session(
        app_name: str,
        spark_config: DefaultDict[str, str] = None,
        hadoop_config: DefaultDict[str, str] = None) -> SparkSession:
    conf = SparkConf()
    if spark_config:
        conf.setAll(spark_config.items())

    sc = SparkContext(conf=conf)

    if hadoop_config:
        for k, v in hadoop_config.items():
            sc._jsc.hadoopConfiguration().set(k, v)

    return SparkSession.builder \
        .appName(app_name) \
        .config(conf=sc.getConf()) \
        .getOrCreate()
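
A usage sketch for the helper above; the concrete property names and values are placeholders, not recommendations:

spark = build_spark_session(
    app_name='etl-job',
    spark_config={'spark.sql.shuffle.partitions': '64'},
    hadoop_config={'fs.s3a.endpoint': 's3.us-east-1.amazonaws.com'},
)
spark.range(10).show()
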
Example No. 14
    def _create_spark_hbase(self):
        conf = SparkConf()  # create the Spark config object
        config = (
            ("spark.app.name", self.SPARK_APP_NAME),  # app name; if not provided, Spark generates a random one
            ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),  # memory used per executor when the app starts, default 2g
            ("spark.master", self.SPARK_URL),  # address of the Spark master
            ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),  # number of CPU cores per executor, default 1
            ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
            ("hbase.zookeeper.quorum", "192.168.19.137"),
            ("hbase.zookeeper.property.clientPort", "22181")
        )

        conf.setAll(config)

        # create the SparkSession from the config object
        if self.ENABLE_HIVE_SUPPORT:
            return SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()
        else:
            return SparkSession.builder.config(conf=conf).getOrCreate()
Example No. 15
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    if not is_spark_below_2_2():
        extend_spark_driver_cp(sparkConf, get_bigdl_classpath())

    # add content in PYSPARK_FILES in spark.submit.pyFiles
    # This is a workaround for current Spark on k8s
    python_lib = os.environ.get('PYSPARK_FILES', None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            sparkConf.set(key="spark.submit.pyFiles",
                          value="%s,%s" % (python_lib, existing_py_files))
        else:
            sparkConf.set(key="spark.submit.pyFiles", value=python_lib)

    return sparkConf
Example No. 16
    def _create_spark_session(self):
        conf = SparkConf()

        config = (
            ("spark.app.name", self.SPARK_APP_NAME),
            ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),
            # ("spark.master", self.SPARK_URL),
            ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),
            ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
            # ("spark.sql.warehouse.dir", "/root/apache-hive-2.3.7-bin/warehouse"),
            ("hive.metastore.uris", "thrift://172.18.0.2:9083"))

        conf.setAll(config)
        print(self.ENABLE_HIVE_SUPPORT, config)

        if self.ENABLE_HIVE_SUPPORT:
            return SparkSession.builder.config(
                conf=conf).enableHiveSupport().getOrCreate()
        else:
            return SparkSession.builder.config(conf=conf).getOrCreate()
Example No. 17
File: spark.py Project: daskos/epos
    def closure(*args, **kwargs):
        try:
            options = opts
            options.update({
                'sql_parquet_compression_codec': 'uncompressed',
                'mesos_role': role,
                'mesos_coarse': bool(coarse),
                'cores_max': int(coarse) or None,
                'executor_cores': int(executor_cores),
                'executor_memory': '{}m'.format(int(executor_memory / MiB)),
                'driver_memory': '{}m'.format(int(driver_memory / MiB)),
                'mesos_executor_memoryOverhead': int(
                    (memory_overhead or (executor_cores * python_worker_memory +
                                         0.1 * executor_memory))
                    / MiB),
                'python_worker_memory': int(python_worker_memory / MiB),
                'mesos_uris': ','.join(uris),
                'mesos_executor_docker_image': docker
            })
            options = {'spark.{}'.format(k.replace('_', '.')): str(v)
                       for k, v in options.items() if v not in (None, '')}
            environs = envs.items()
        except TypeError as e:
            # curry doesn't reraise TypeErrors:
            # https://github.com/pytoolz/toolz/issues/288
            raise Exception(repr(e))

        conf = SparkConf()
        conf.setMaster(str(master))
        conf.setAppName(str(name or fn.__name__))
        conf.setAll(pairs=options.items())
        conf.setExecutorEnv(pairs=environs)

        with SparkContext(conf=conf) as sc:
            sc.setLogLevel(str(log))
            for f in files:
                sc.addFile(f)
            for f in pyfiles:
                sc.addPyFile(f)
            # TODO: use SparkSession
            sql = SQLContext(sc)
            return fn(sc, sql, *args, **kwargs)
Example No. 18
 def get_spark_config(path, dependencies) -> SparkConf:
     master = 'local[2]'
     conf = SparkConf().setAppName('unit test').setMaster(master)
     return conf.setAll([
         ('spark.ui.showConsoleProgress', 'false'),
         ('spark.test.home', os.environ.get('SPARK_HOME')),
         ('spark.locality.wait', '0'),
         ('spark.driver.extraClassPath', '{}'.format(':'.join([
             os.path.join(os.getcwd(), path, 'target', 'classes'),
             os.path.join(os.getcwd(), path, 'target', 'test-classes'),
             dependencies
         ]))),
     ])
Example No. 19
def spark_session(app, cores=2, gpus=0, max_failures=1, *args):
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    master = 'local-cluster[{},{},1024]'.format(cores, max_failures) if gpus > 0 \
        else 'local[{},{}]'.format(cores, max_failures)
    conf = SparkConf().setAppName(app).setMaster(master)
    conf = conf.setAll([
        ('spark.ui.showConsoleProgress', 'false'),
        ('spark.test.home', os.environ.get('SPARK_HOME')),
        ('spark.locality.wait', '0'),
    ])

    with temppath() as temp_filename:
        if gpus > 0:
            with open(temp_filename, 'wb') as temp_file:
                addresses = ', '.join('\\"{}\\"'.format(i) for i in range(gpus))
                temp_file.write(b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": [' +
                                addresses.encode('ascii') + b']}')

            os.chmod(temp_file.name, stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP |
                     stat.S_IROTH | stat.S_IXOTH)

            conf = conf.setAll([
                ('spark.worker.resource.gpu.discoveryScript', temp_filename),
                ('spark.worker.resource.gpu.amount', '1'),
                ('spark.task.resource.gpu.amount', '1'),
                ('spark.executor.resource.gpu.amount', '1')
            ])

        session = SparkSession \
            .builder \
            .config(conf=conf) \
            .getOrCreate()

        try:
            yield session
        finally:
            session.stop()
Example No. 20
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    if os.environ.get("BIGDL_JARS", None) and not is_spark_below_2_2():
        for jar in os.environ["BIGDL_JARS"].split(":"):
            extend_spark_driver_cp(sparkConf, jar)

    # add content in PYSPARK_FILES in spark.submit.pyFiles
    # This is a workaround for current Spark on k8s
    python_lib = os.environ.get("PYSPARK_FILES", None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            sparkConf.set(
                key="spark.submit.pyFiles",
                value="%s,%s" % (python_lib, existing_py_files),
            )
        else:
            sparkConf.set(key="spark.submit.pyFiles", value=python_lib)

    return sparkConf
Example No. 21
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    return sparkConf
Example No. 22
			except msg2xlsx.ConvertingError:
				logger.error("converting failed, clean the previous work")
				xlsx_path = os.path.join(settings.SAVE_DIRECTORY, project_id)
				clean(project_id, dirs=[temp_file, xlsx_path])
				return
				
			# processed ack
			channel.basic_ack(delivery_tag=method.delivery_tag)
	else:
		logger.error("illegal content-type: " + header.content_type)


if __name__ == '__main__':
	# initialize spark
	conf = SparkConf().setMaster(settings.SPARK_MASTER_URL).setAppName(settings.SPARK_APP_NAME)
	conf.setAll([("spark.eventLog.enabled", "true"), ("spark.eventLog.dir", settings.LOG_DIRECTORY)])
	sc = SparkContext(conf=conf)

	# initialize rabbitmq
	credentials = pika.PlainCredentials(settings.RABBITMQ_CONN_CONF['username'], settings.RABBITMQ_CONN_CONF['password'])
	conn_params = pika.ConnectionParameters(settings.RABBITMQ_CONN_CONF['host'], credentials=credentials)
	conn_broker = pika.BlockingConnection(conn_params)

	channel = conn_broker.channel()
	channel.exchange_declare(exchange=settings.RABBITMQ_SPARK['exchange'],
							 type="direct",
							 passive=False,
							 durable=True,
							 auto_delete=False)

	channel.queue_declare(queue=settings.RABBITMQ_SPARK['queue'])
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import *
from pyspark.storagelevel import StorageLevel
import happybase


appName = "KafkaStreams"
config = SparkConf().setAppName(appName)

props = []
props.append(("spark.rememberDuration", "10"))
props.append(("spark.batchDuration", "10"))
props.append(("spark.eventLog.enabled", "true"))
props.append(("spark.streaming.timeout", "30"))
props.append(("spark.ui.enabled", "true"))

config = config.setAll(props)

sc = SparkContext(conf=config)
ssc = StreamingContext(sc, 5)

topics = ["t1"]
kafka_params = {
   "zookeeper.connect" : "localhost:5181/kafka"
 , "metadata.broker.list" : "localhost:9092"
 , "group.id" : "Kafka_MapR-Streams_to_HBase"}

raw = KafkaUtils.createDirectStream(ssc, topics, kafka_params)
raw.pprint()

server = "localhost"
table_name = "/tables/stocks"
Example No. 24
class SparkUtils:
    def __init__(self,
                 log: RootLogger = None,
                 parms: dict = None,
                 botoSession: bototSession = None,
                 appName: str = None):
        self.log = log
        self.__parms = parms or {}
        self.__runEnv = self.__parms.get("--runEnv", "local")
        if (self.__runEnv == "aws"):
            self.__boto = botoSession
            self.__s3 = S3(log, self.__boto)

        self.__initFlags()
        self.__setupSparkSession__(appName)

        self.__dfltRDDParts = \
                int(self.__spark.conf.get("spark.executor.instances", "20")) * \
                int(self.__spark.conf.get("spark.executor.cores", "4")) * 2

    def __initFlags(self):
        '''
        Init the job level parameters needed by this class
        '''
        self.__parms["--saveDFAs"] = self.__parms.get("--saveDFAs", "NONE")

        self.__explainDF = True if "-explainDF" in self.__parms else False
        self.__printcount = True if "-printCount" in self.__parms else False
        self.__useHist = True if "-useHint" in self.__parms else False
        self.__saveDF = True if self.__parms["--saveDFAs"] != "NONE" else False

        self.__fileFmt = self.__parms.get("--fileFormat", "parquet")

        if (self.__runEnv == "aws"):
            self.__tempS3 = self.__parms.get("--tempS3", "hdfs:///temp/s3")
        if (self.__runEnv != "local"):
            self.__tempHDFS = self.__parms.get("--tempHDFS", "hdfs:///temp")
            self.log.warn(
                "For persist type 'S3', 'HDFS' will be used as the --runEnv != 'aws'"
            )

    def __setupSparkSession__(self, appName: str = None):
        '''
        Init the Spark environment with a few default configurations and start the Spark session.
        '''
        self.__conf = SparkConf()
        hmConf = {
            "spark.rps.askTimeout": "1200",
            "spark.network.timeout": "1200",
            "spark.broadcast.blockSize": "16m",
            "spark.sql.broadcastTimeout": "1200",
            "spark.broadcast.compress": "true",
            "spark.rdd.compress": "true",
            "fs.s3.enableServerSideEncryption": "true",
            "spark.kryo.unsafe": "false",
            "spark.kryoserializer.buffer": "10240",
            "spark.kryoserializer.buffer.max": "2040m",
            "spark.io.compression.codec": "org.apache.spark.io.SnappyCompressionCodec",
            "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
            "mapreduce.fileoutputcommitter.algorithm.version": "2",
            "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2",
        }
        # SparkConf.setAll expects an iterable of (key, value) pairs
        self.__conf.setAll(hmConf.items())
        SparkContext.setSystemProperty("com.amazonaws.services.s3.enableV4",
                                       "true")
        SparkContext.setSystemProperty("com.amazonaws.services.s3.enforceV4",
                                       "true")
        self.__spark = SparkSession \
                        .builder \
                        .config(conf=self.__conf) \
                        .appName(appName or "PySparkApp") \
                        .enableHiveSupport() \
                        .getOrCreate()
        self.__sc = self.__spark.sparkContext
        self.sqlC = SQLContext(self.__sc)
        self.__sc.setSystemProperty("com.amazonaws.services.s3.enableV4",
                                    "true")
        self.__sc.setSystemProperty("com.amazonaws.services.s3.enforceV4",
                                    "true")
        self.__sc.setLogLevel(self.__parms.get("--logLevel", "INFO"))

        # the Hadoop Configuration object has no setAll; set keys one by one
        hdpCnf = self.__sc._jsc.hadoopConfiguration()
        hadoop_conf = {
            "io.file.buffer.size": "65536",
            "mapreduce.fileoutputcommitter.algorithm.version": "2",
            "fs.s3a.endpoint": "%s.amazonaws.com" % (self.__parms.get("--awsRegion", 's3.us-east-1'))
        }
        for k, v in hadoop_conf.items():
            hdpCnf.set(k, v)
        if (self.__parms.get("--runEnv", "AWS") == "AWS"):
            from botocore.credentials import InstanceMetadataProvider, InstanceMetadataFetcher
            provider = InstanceMetadataProvider(
                iam_role_fetcher=InstanceMetadataFetcher(timeout=1000,
                                                         num_attempts=2))
            creds = provider.load()
            aws_conf = {
                "fs.s3a.access.key": creds.access_key,
                "fs.s3a.secret.key": creds.secret_key,
                "fs.s3a.server-side-encryption-algorithm": "SSE-KMS",
                "fs.s3.enableServerSideEncryption": "true",
                "fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
                "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
                "fs.s3a.endpoint": "s3.%s.amazonaws.com" % (self.__parms.get("--awsRegion", "us-east-1"))
            }
            for k, v in aws_conf.items():
                hdpCnf.set(k, v)

    def sql(self,
            dfName: str,
            query: str,
            partitions: int = 0,
            persistType: str = None):
        '''
        Runs the input SQL, partitions the resulting DataFrame and persists the DataFrame if needed.

        Supported persistType: In addition to the pySpark native persist types, this function supports
        HIVE, HDFS, S3

        '''
        if persistType is None:
            _df = self.__spark.sql(self.handleHints(query))
            if partitions == 0:
                df = _df
            elif _df.rdd.getNumPartitions() < partitions:
                df = _df.repartition(partitions)
            else:
                df = _df.coalesce(partitions)
            return df
        else:
            df = self.storeDF(
                df=self.sql(dfName=None, query=query, partitions=partitions),
                dfName=dfName,
                persistType=persistType,
                partitions=partitions,
                partitionCols=self.getPartitionColumnsFromSQL(query))

        if dfName:
            df.createOrReplaceTempView(dfName)

        if self.__printcount:
            self.log.info("Number of Records in DF '%s' : %d " %
                          (dfName, df.count()))

        return df

    def storeDF(self, df: DataFrame, dfName: str, persistType: str,
                partitions: int, partitionCols: List[str]):
        '''
        Store the input dataframe, read the persisted dataframe back and return the new one.
        If memory/disk persistence is requested, take(1) is run on the dataframe to force persistence.
        '''
        if self.__explainDF or \
                "NULL|NONE".find(persistType.upper()) < 0:
            self.log.info("Execution plan for building the DF '%s'" %
                          (dfName))
            df.explain()
            self.log.info("\n\n\n")

        saveType = self.__parms["--saveDFAs"] \
            if self.__saveDF and \
               "HIVE|NULL".find(persistType.upper()) < 0 \
            else \
                persistType.upper()

        if saveType == "S3" and self.__runEnv != "aws":
            saveType = "HDFS"
            self.log.debug(
                "Resetting the persist type to 'HDFS' as the --runEnv != 'aws'"
            )

        df1 = df if saveType != "HDFS" and \
                    saveType != "HIVE" and \
                    saveType != "S3" \
                 else self.repartitionDF(dataFrame= df, partitions = partitions)

        if saveType == "NULL" or saveType == "NONE":
            return df1
        elif saveType == "HDFS":
            return self.persistExternal(self.__tempHDFS, dfName, df,
                                        partitionCols)
        elif saveType == "S3":
            return self.persistExternal(self.__tempS3, dfName, df,
                                        partitionCols)
        elif saveType == "HIVE":
            return self.persist2Hive(dfName, df, partitionCols)
        elif saveType == "CHECK_POINT":
            return df.cache().checkpoint(eager=True)
        else:
            return self.persistLocal(dfName, df, persistType)

    def persistExternal(self,
                        parentDirURI: str,
                        fileName: str,
                        df: DataFrame,
                        partitionCols: List[str] = None,
                        overwrite: bool = True,
                        fileFormat: str = None,
                        **kwargs):

        fullPath = "%s%s"  % (parentDirURI,fileName or "") if parentDirURI.endswith("/") else \
                   "%s/%s" % (parentDirURI,fileName or "")
        fullPath = fullPath.replace("//", "/")
        schma = df.schema  # DataFrame.schema is a property, not a method
        fileFormat = fileFormat or self.__fileFmt
        self.write2ExtrFile(fullPath=fullPath,
                            fileFormat=fileFormat,
                            df=df,
                            partitionCols=partitionCols,
                            overwrite=overwrite,
                            **kwargs)
        df.unpersist()
        if fileFormat == "parquet":
            return self.readParquet(uriString=fullPath, schema=schma, **kwargs)
        elif fileFormat == "orc":
            return self.readOrc(uriString=fullPath, schema=schma, **kwargs)
        elif fileFormat == "csv":
            return self.readCSV(uriString=fullPath, schema=schma, **kwargs)
        else:
            return self.readParquet(uriString=fullPath, schema=schma, **kwargs)

    def readParquet(self,
                    uriString: str,
                    schema: StructType = None,
                    mergeSchema: bool = False,
                    **kwargs):
        self.log.info("Reading the parquet file '%s'" % uriString)
        rdr = self.__spark.read.format("parquet")
        if mergeSchema:
            rdr.option("mergeSchema", "true")
        if schema:
            rdr.schema(schema)
        return rdr.load(uriString)

    def readOrc(self, uriString: str, schema: StructType, **kwargs):
        self.log.info("Reading the ORC file in '%s'" % uriString)
        pass  ##TODO

    def readCSV(self, uriString: str, schema: StructType, **kwargs):
        self.log.info("Reading the CSV file in '%s'" % uriString)
        pass  ##TODO

    def write2ExtrFile(self,
                       fileFormat: str,
                       fullPath: str,
                       df: DataFrame,
                       partitionCols: List[str] = None,
                       overwrite: bool = True,
                       **kwargs):

        if fullPath.startswith("s3"):
            self.__s3.waitForFile("%s/_SUCCESS" % (fullPath))

        #TODO:Yet to Implement

    def persist2Hive(self, table: str, df: DataFrame,
                     partitionCols: List[str]):
        pass  #TODO:Yet to Implement

    def persistLocal(self, dfName: str, df: DataFrame, persistType: str):
        ''' Persist the input Datafrmae locally (memory/disk/none) and runs `df.take(1)` to force persist.
        '''
        lvl = self.getSparkPersistType(persistType.upper())
        if lvl:
            df.persist(lvl)

        if self.__printcount is None:
            df.take(1)

    def getSparkPersistType(self, persistTypStr: str) -> StorageLevel:
        '''
            Converts the String representation to the StorageLevel Object.
            If invalid string received, it will return the `StorageLevel.NONE`
            Supported,
                `StorageLevel.NONE`
                `StorageLevel.DISK_ONLY`
                `StorageLevel.DISK_ONLY_2`
                `StorageLevel.MEMORY_ONLY`
                `StorageLevel.MEMORY_ONLY_2`
                `StorageLevel.MEMORY_AND_DISK`
                `StorageLevel.MEMORY_AND_DISK_2`
                `StorageLevel.OFF_HEAP`
        '''

        if persistTypStr == "NONE": return None
        elif persistTypStr == "DISK_ONLY": return StorageLevel.DISK_ONLY
        elif persistTypStr == "DISK_ONLY_2": return StorageLevel.DISK_ONLY_2
        elif persistTypStr == "MEMORY_ONLY": return StorageLevel.MEMORY_ONLY
        elif persistTypStr == "MEMORY_ONLY_2":
            return StorageLevel.MEMORY_ONLY_2
        elif persistTypStr == "MEMORY_AND_DISK":
            return StorageLevel.MEMORY_AND_DISK
        elif persistTypStr == "MEMORY_AND_DISK_2":
            return StorageLevel.MEMORY_AND_DISK_2
        elif persistTypStr == "OFF_HEAP":
            return StorageLevel.OFF_HEAP
        else:
            self.log.warn(
                "Invalid Persist Type %s received. Defaulting to NONE" %
                (persistTypStr))
            return None

    def repartitionDF(self, dataFrame: DataFrame, partitions: int = 0):
        '''
            Repartition the input dataframe

            parms: df          -> dataframe
                   partitions  -> new partition count. Defaults to 0, i.e. don't repartition

            logic,
                if partitions = 0 , don't repartition
                if partitions = -1, repartition to the default number (NumOfExecutors * ExecutorCores * 2)
                if partitions > 0 , repartition/coalesce to the input number
        '''
        curParts = dataFrame.rdd.getNumPartitions()
        finalParts = min(curParts, partitions)

        if curParts == partitions or partitions == 0:
            finalParts = -1
        elif partitions == -1:
            finalParts = self.__dfltRDDParts
        elif partitions > 0:
            finalParts = partitions
        else:
            pass  #finalParts is pre-populated.

        self.log.debug("Current Partitions: %d , Requested: %d,  Final: %d " %
                       (curParts, partitions, finalParts))

        if finalParts == -1:
            return dataFrame
        elif curParts > finalParts:
            return dataFrame.coalesce(finalParts)
        else:
            return dataFrame.repartition(finalParts)

    def handleHints(self, query: str):
        '''
            Removes the SparkSQL hints if the -useHist parm is not set.

            Example:- If sql = 'select /* hists */ cols.. from ..'
               if -useHist is not set,
                  return 'select cols.. from ..'
               else
                  return 'select /* hists */ cols.. from ..'
        '''
        if self.__useHist:
            return query
        else:
            return re.sub(r'/\*+.*\*/', '', query)

    @staticmethod
    def getPartitionColumnsFromSQL(query):
        s = query.lower().strip().replace("\n", " ")
        inx = s.find(" cluster ")
        lst = []
        if inx > 0:
            lst.extend((map(lambda x: x.strip(), s[inx + 12:].split(","))))
        else:
            frm = s.find(" distribute ")
            to = s.find(" sort ", frm + 15) if frm > 0 else 0
            if to > frm:
                lst.extend((map(lambda x: x.strip(),
                                s[frm + 15:to].split(","))))
            else:
                lst.extend((map(lambda x: x.strip(), s[frm + 15:].split(","))))
        return lst
Example No. 25
# -*- coding: utf-8 -*-
'''

'''
__author__ = 'Foxlora'
__time__ = '2020/10/10 22:22'

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

from pyspark.streaming.kafka import KafkaUtils
from setting.default import DefaultConfig

# 1. Create the Spark Streaming context configuration
conf = SparkConf()
conf.setAll(DefaultConfig.SPARK_ONLINE_CONFIG)
sc = SparkContext(conf=conf)
stream_sc = StreamingContext(sc, 60)

# 2. Configuration for reading from Kafka
similar_kafka = {"metadata.broker.list": DefaultConfig.KAFKA_SERVER, "group.id": 'similar'}
SIMILAR_DS = KafkaUtils.createDirectStream(stream_sc, ['click-trace'], similar_kafka)



Example No. 26
specific data


Note that I use spark because there is currently no way to use SQL queries
with dask
"""

from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext, DataFrame

# This could benefit from some tweaks especially if the database becomes larger
conf = SparkConf()
conf.set("spark.sql.autoBroadcastJoinThreshold", 1024 * 1024 * 100)
conf.setAppName('Mnist_Spark_MLP').setMaster('local[8]')
conf.setAll([('spark.executor.memory', '8g'), ('spark.executor.cores', '3'),
             ('spark.cores.max', '3'), ('spark.driver.memory', '8g')])
conf.set("spark.sql.caseSensitive", "true")

# Global imports
import glob
import yaml
import logging
logging.getLogger().setLevel(logging.INFO)
import os
import textwrap
import numpy as np
import subprocess
from datetime import datetime
import copy
import time
import fnmatch
Example No. 27
from __future__ import print_function, division
import os
import sys
import copy
import functools

from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

conf = SparkConf().setMaster("yarn").setAppName("autofe").set(
    'spark.yarn.queue', 'solution')
# set app resources
configs = [('spark.driver.memory', '10g'), ('spark.executor.memory', '4g'),
           ('spark.executor.instances', '10'), ('spark.executor.cores', '2')]
conf.setAll(configs)
# conf = SparkConf().set('master', 'local')
sc = SparkContext.getOrCreate(conf=conf)
# sc = SparkContext.getOrCreate()
sql_context = HiveContext(sc)

# preprocess the action table
# load the data
path = "hdfs://m7-model-hdp01:8020/user/2-6-0-model-test/user_1/nodes/data-load-load-240240/out/20190717/DAG_36240/NODE_240240/SLOT_0/DataLoad/02150359716"
t = sql_context.read.parquet(path)

# inspect the data
print("Show a few rows of data")
print(t.show(5))
print("Show the column data types")
print(t.dtypes)
Example No. 28
    def __setupSparkSession__(
        self,
        jobConf: dict,
    ) -> SparkSession:
        '''
        Init the Spark environment with a few default configurations and start the Spark session.
        '''
        conf = SparkConf()
        #
        #Setup Spark Specific configurations
        #
        hmConf = {
            "spark.executor.pyspark.memory": "512m",
            "spark.debug.maxToStringFields": "5000",
            "spark.rps.askTimeout": "1200",
            "spark.network.timeout": "1200",
            "spark.maxRemoteBlockSizeFetchToMem": "512m",
            "spark.broadcast.blockSize": "16m",
            "spark.broadcast.compress": "true",
            "spark.rdd.compress": "true",
            "spark.io.compression.codec": "org.apache.spark.io.SnappyCompressionCodec",
            "spark.kryo.unsafe": "true",
            "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
            "spark.kryoserializer.buffer": "10240",
            "spark.kryoserializer.buffer.max": "2040m",
            "hive.exec.dynamic.partition": "true",
            "hive.exec.dynamic.partition.mode": "nonstrict",
            "hive.warehouse.data.skiptrash": "true",
            "spark.sql.hive.metastorePartitionPruning": "true",
            "spark.sql.broadcastTimeout": "1200",
            "spark.sql.sources.partitionOverwriteMode": "dynamic",
            "spark.sql.orc.filterPushdown": "true",
            "spark.sql.orc.splits.include.file.footer": "true",
            "spark.sql.orc.cache.stripe.details.size": "1000",
            "spark.hadoop.parquet.enable.summary-metadata": "false",
            "spark.sql.parquet.mergeSchema": "false",
            "spark.sql.parquet.filterPushdown": "true",
            "spark.sql.parquet.fs.optimized.committer.optimization-enabled": "true",
            "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2",
            "spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored": "true"
        }

        for (k, v) in jobConf['sparkconfs'].items():
            hmConf[k] = v

        # SparkConf.setAll expects an iterable of (key, value) pairs
        conf.setAll(hmConf.items())
        #
        #Setup Hadoop Specific configurations
        #
        hdpCnf = SparkContext._jsc.hadoopConfiguration()
        hdpCnf.set('io.file.buffer.size', '65536')
        hdpCnf.set('mapreduce.fileoutputcommitter.algorithm.version', '2')

        for (k, v) in jobConf['hadoopconfs'].items():
            hdpCnf.set(k, v)

        #
        # Setup AWS Specific configurations
        #
        if jobConf['appconfs']['runenv'].upper() == 'AWS':
            SparkContext.setSystemProperty(
                'com.amazonaws.services.s3.enableV4', 'true')
            SparkContext.setSystemProperty(
                'com.amazonaws.services.s3.enforceV4', 'true')
            conf.set(
                "spark.sql.parquet.output.committer.class",
                "com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter"
            )

            cred = None
            try:
                from botocore.credentials import InstanceMetadataProvider, InstanceMetadataFetcher
                provider = InstanceMetadataProvider(
                    iam_role_fetcher=InstanceMetadataFetcher(timeout=1000,
                                                             num_attempts=2))
                creds = provider.load()
                hdpCnf.setAll({
                    'fs.s3a.access.key': creds.access_key,
                    'fs.s3a.secret.key': creds.secret_key,
                })
            except:
                pass
            hdpCnf.setAll({
                'fs.s3a.server-side-encryption-algorithm': 'SSE-KMS',
                'fs.s3.enableServerSideEncryption': 'true',
                'fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
                'fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
                'fs.s3a.endpoint': "s3.%s.amazonaws.com" % (jobConf['appconfs']['appdefaults'] or 'us-east-1')
            })


        spark = SparkSession \
                .builder \
                .config(conf=conf) \
                .appName(jobConf['name'] or 'PySparkApp') \
                .enableHiveSupport() \
                .getOrCreate()

        sc = spark.sparkContext
        sc.setLogLevel(jobConf['appconfs']['logging']['sparkloglevel']
                       or 'INFO')
        if (jobConf['appconfs']['logging']['sparkloglevel'] or 'INFO') == "DEBUG":
            msg = ""
            for k in sc._conf.getAll():
                msg += "\t%50s -> %s\n" % (k[0], k[1])
            log.debug(
                "Initiated SparkSession with below confs,\n{}".format(msg))

        return spark
Example No. 29
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import *
from pyspark.storagelevel import StorageLevel

appName = "KafkaStreams"
config = SparkConf().setAppName(appName)

props = []
props.append(("spark.rememberDuration", "10"))
props.append(("spark.batchDuration", "10"))
props.append(("spark.eventLog.enabled", "true"))
props.append(("spark.streaming.timeout", "30"))
props.append(("spark.ui.enabled", "true"))

config = config.setAll(props)

sc = SparkContext(conf=config)
ssc = StreamingContext(sc, 5)

topics = ["t1"]
kafka_params = {
    "zookeeper.connect": "localhost:5181/kafka",
    "metadata.broker.list": "localhost:9092",
    "group.id": "Kafka_MapR-Streams_to_HBase"
}

raw = KafkaUtils.createDirectStream(ssc, topics, kafka_params)
raw.pprint()

ssc.start()  # Start the computation
Example No. 30
def spark_session(spark_id, executor_num, local_dir):
    logger.info('[%s] init spark session', spark_id)

    # spark
    if 'SPARK_HOME' not in os.environ:
        os.environ['SPARK_HOME'] = SPARK_HOME

    os.environ['PYSPARK_PYTHON'] = WORKER_PYTHON
    os.environ['PYSPARK_DRIVER_PYTHON'] = DRIVER_PYTHON

    if not local_dir:
        local_dir = os.path.join(JOB_ROOT_DIR.LOCAL_ROOT, spark_id)

    os.makedirs(os.path.join(local_dir, 'tmp'))
    #os.makedirs(os.path.join(local_dir, 'metastore_db'))
    spark_conf = SparkConf()
    conf_details = [
        # ('spark.yarn.jars', ''),
        # ('spark.executorEnv.PATH', SPARK_CONFIG['WORKER_PATH']),
        # ('spark.eventLog.dir', 'hdfs://TS-CLICKH011:8020/spark/history'),
        # ('spark.yarn.historyServer.address', 'http://ts-clickh09:18080/'),
        # ('spark.executorEnv.PATH', './python3/bin/:$PATH'),
        # ('spark.appMasterEnv.PATH', SparkConfig['WORKER_PATH']),
        # ('spark.yarn.appMasterEnv.PYSPARK_PYTHON', './python3/bin/python3'),
        # ('spark.executorEnv.PYSPARK_PYTHON', './python3/bin/python3'),
        # ('spark.driver.host', '172.22.16.57'),
        # ('spark.pyspark.python', './python3/bin/python3'),
        # ('spark.pyspark.python', './python3/bin/python3'),
        # ('spark.pyspark.driver.python', '/data/anaconda3/bin/python'),
        ('spark.yarn.archive', HDFS_SPARK_JARS),
        ('spark.yarn.dist.archives', SPARK_CONFIG['SPARK_ARCHIVES']),
        ('spark.eventLog.enabled', 'true'),
        ('spark.eventLog.compress', 'true'),
        ('spark.driver.memory', '2G'),
        ('spark.driver.extraJavaOptions',
         f'-Duser.timezone=UTC+0800 -Djava.io.tmpdir={os.path.join(local_dir, "tmp")} -Dderby.system.home={os.path.abspath(local_dir)}'
         ),
        ('spark.executor.extraJavaOptions',
         '-Duser.timezone=UTC+0800 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps'
         ),
        ('spark.executor.instances', executor_num),
        ('spark.executor.memory', '8G'),
        ('spark.executor.cores', 4),
        ('spark.sql.shuffle.partitions', executor_num),
        ('spark.yarn.executor.memoryOverhead', '4G'),
        ('spark.sql.warehouse.dir', os.path.join(local_dir, 'metastore_db')),
        ('spark.local.dir', os.path.join(local_dir, 'tmp')),
        ('spark.driver.extraClassPath',
         "/data/tool/env/hadoop-lzo/lib/hadoop-lzo-0.4.19.jar"),
        ('spark.driver.extraLibraryPath',
         '/data/tool/env/hadoop-lzo/lib/native/')
    ]

    for k, v in conf_details:
        print(f'[{spark_id}] spark config {k} = {v}')

    spark_conf.setAll(conf_details)
    spark_conf.setAppName(f'{spark_id}')
    spark_conf.setMaster('yarn')

    spark = SparkSession.builder.config(
        conf=spark_conf).enableHiveSupport().getOrCreate()
    return spark
Example No. 31
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    return sparkConf
Example No. 32
def init_spark(config, app=None, use_session=False):
    import os
    import sys
    from glob import glob

    if 'spark-home' in config:
        os.environ['SPARK_HOME'] = config['spark-home']

    if 'spark-conf-dir' in config:
        os.environ['SPARK_CONF_DIR'] = config['spark-conf-dir']

    if 'pyspark-python' in config:
        # Set python interpreter on both driver and workers
        os.environ['PYSPARK_PYTHON'] = config['pyspark-python']

    if 'yarn-conf-dir' in config:
        # Hadoop YARN configuration
        os.environ['YARN_CONF_DIR'] = config['yarn-conf-dir']

    if 'spark-classpath' in config:
        # can be used to use external folder with Hive configuration
        # e. g. spark-classpath='/etc/hive/conf.cloudera.hive1'
        os.environ['SPARK_CLASSPATH'] = config['spark-classpath']

    submit_args = []

    driver_mem = config.get('spark-prop.spark.driver.memory', None)
    if driver_mem is not None:
        submit_args.extend(["--driver-memory", driver_mem])

    driver_cp = config.get('spark-prop.spark.driver.extraClassPath', None)
    if driver_cp is not None:
        submit_args.extend(["--driver-class-path", driver_cp])

    driver_java_opt = config.get('spark-prop.spark.driver.extraJavaOptions', None)
    if driver_java_opt is not None:
        submit_args.extend(["--driver-java-options", driver_java_opt])

    jars = config.get('jars', None)
    if jars is not None:
        if isinstance(jars, str):
            jars = [jars]
        submit_args.extend(["--jars", ','.join(jars)])

    mode_yarn = config['spark-prop.spark.master'].startswith('yarn')

    if mode_yarn:
        # pyspark .zip distribution flag is set only if spark-submit have master=yarn in command-line arguments
        # see spark.yarn.isPython conf property setting code
        # in org.apache.spark.deploy.SparkSubmit#prepareSubmitEnvironment
        submit_args.extend(['--master', 'yarn'])

    # pyspark .zip distribution flag is set only if spark-submit have pyspark-shell or .py as positional argument
    # see spark.yarn.isPython conf property setting code
    # in org.apache.spark.deploy.SparkSubmit#prepareSubmitEnvironment
    submit_args.append('pyspark-shell')

    os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(submit_args)

    spark_home = os.environ['SPARK_HOME']
    spark_python = os.path.join(spark_home, 'python')
    pyspark_libs = glob(os.path.join(spark_python, 'lib', '*.zip'))
    sys.path.extend(pyspark_libs)

    virtualenv_reqs = config['spark-prop'].get('spark.pyspark.virtualenv.requirements', None)
    if use_session:
        from pyspark.sql import SparkSession

        builder = SparkSession.builder.appName(app or config['app'])

        if mode_yarn:
            builder = builder.enableHiveSupport()

        for k, v in prop_list(config['spark-prop']).items():
            builder = builder.config(k, v)

        ss = builder.getOrCreate()
        if virtualenv_reqs is not None:
            ss.addFile(virtualenv_reqs)
        return ss
    else:
        from pyspark import SparkConf, SparkContext
        conf = SparkConf()
        conf.setAppName(app or config['app'])
        props = [(k, str(v)) for k, v in prop_list(config['spark-prop']).items()]
        conf.setAll(props)
        sc = SparkContext(conf=conf)
        if virtualenv_reqs is not None:
            sc.addFile(virtualenv_reqs)
        return sc
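
A rough usage sketch for init_spark; the exact config schema and the prop_list() helper are project-specific, so the dict below is only an assumed shape:

config = {
    'app': 'example-app',
    'spark-home': '/opt/spark',
    'spark-prop.spark.master': 'local[2]',
    'spark-prop': {'spark.master': 'local[2]', 'spark.executor.memory': '2g'},
}

spark = init_spark(config, use_session=True)  # returns a SparkSession in this mode
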
Example No. 33
 def _create_spark_context():
     spark_conf = SparkConf()
     spark_conf.set('spark.sql.catalogImplementation', 'hive')
     spark_conf.setAll(self._setup_options(additional_options))
     return SparkContext(conf=spark_conf)
Example No. 34
# -*- coding: UTF-8 -*-

import happybase
from setting.default import DefaultConfig
import redis

pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)

# recall data
# With decode_responses=True, written values are returned as str; without it they come back as bytes.
redis_client = redis.StrictRedis(host=DefaultConfig.REDIS_HOST,
                                 port=DefaultConfig.REDIS_PORT,
                                 db=10,
                                 decode_responses=True)

# Redis database used for caching
# With decode_responses=True, written values are returned as str; without it they come back as bytes.
cache_client = redis.StrictRedis(host=DefaultConfig.REDIS_HOST,
                                 port=DefaultConfig.REDIS_PORT,
                                 db=8,
                                 decode_responses=True)

# Used by the ranking logic in sort_service.py
from pyspark import SparkConf
from pyspark.sql import SparkSession
# Spark configuration
conf = SparkConf()
conf.setAll(DefaultConfig.SPARK_GRPC_CONFIG)
SORT_SPARK = SparkSession.builder.config(conf=conf).getOrCreate()
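
DefaultConfig.SPARK_GRPC_CONFIG is not shown here; since SparkConf.setAll consumes an iterable of (key, value) pairs, it is presumably shaped like the sketch below (values are placeholders):

SPARK_GRPC_CONFIG = (
    ('spark.app.name', 'sort-service'),
    ('spark.master', 'local[2]'),
    ('spark.executor.memory', '2g'),
)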