Example No. 1
def get_SparkContext(app_name='tuixing-spark', **kwargs):
    conf = SparkConf()
    conf.setAppName(app_name)
    conf.setAll(COMMON_SC)
    for key in kwargs:
        conf.set(key, kwargs[key])

    sc = SparkContext(conf=conf)
    return sc
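
A quick usage sketch for the helper above, assuming COMMON_SC is a module-level list of (key, value) defaults and that SparkConf/SparkContext come from pyspark; extra keyword arguments become per-job overrides:

from pyspark import SparkConf, SparkContext

COMMON_SC = [('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')]  # assumed defaults

# dotted config keys can still be passed as keyword arguments via ** unpacking
sc = get_SparkContext('my-job', **{'spark.executor.memory': '4g'})
print(sc.getConf().get('spark.executor.memory'))  # -> 4g
sc.stop()
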
def create_streaming_context():
    conf = SparkConf()
    pairs = [('spark.app.name', 'Process Stories Stream'),
             ('spark.master', 'local[4]'), ('spark.ui.port', '4040')]
    conf.setAll(pairs)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, batch_secs)
    ssc.checkpoint(checkpointDirectory)  # set checkpoint directory
    return ssc
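
The function above relies on two module-level names, batch_secs and checkpointDirectory, that are not shown. A hedged sketch of how the returned context would typically be driven, with assumed values for those names:

batch_secs = 10                      # assumed batch interval in seconds
checkpointDirectory = '/tmp/ckpt'    # assumed checkpoint location

ssc = create_streaming_context()
# attach DStream sources and transformations here, before starting the context
ssc.start()
ssc.awaitTermination()
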
Example No. 3
def spark_session(app, cores=2, gpus=0, max_failures=1, *args):
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    with TemporaryDirectory() as tmpdir:
        metastore_path = os.path.join(tmpdir, 'metastore')

        # start a single worker with the given cores when gpus are present;
        # max_failures is ignored in that case
        master = 'local-cluster[1,{},1024]'.format(cores) if gpus > 0 \
            else 'local[{},{}]'.format(cores, max_failures)
        conf = SparkConf().setAppName(app).setMaster(master)
        conf = conf.setAll([
            ('spark.ui.showConsoleProgress', 'false'),
            ('spark.test.home', os.environ.get('SPARK_HOME')),
            ('spark.locality.wait', '0'),
            ('spark.unsafe.exceptionOnMemoryLeak', 'true'),
            ('spark.ui.enabled', 'false'),
            ('spark.local.dir', os.path.join(tmpdir, 'tmp')),
            ('spark.sql.warehouse.dir', os.path.join(tmpdir, 'warehouse')),
            ('javax.jdo.option.ConnectionURL',
             f'jdbc:derby:;databaseName={metastore_path};create=true'),
        ])

        with temppath() as temp_filename:
            if gpus > 0:
                with open(temp_filename, 'wb') as temp_file:
                    addresses = ', '.join('\\"{}\\"'.format(i)
                                          for i in range(gpus))
                    temp_file.write(
                        b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": [' +
                        addresses.encode('ascii') + b']}')

                os.chmod(
                    temp_file.name, stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP
                    | stat.S_IROTH | stat.S_IXOTH)

                # the single worker takes all gpus discovered, and a single executor will get them
                # each task on that executor will get a single gpu
                conf = conf.setAll([
                    ('spark.worker.resource.gpu.discoveryScript',
                     temp_filename),
                    ('spark.worker.resource.gpu.amount', str(gpus)),
                    ('spark.task.resource.gpu.amount', '1'),
                    ('spark.executor.resource.gpu.amount', str(gpus)),
                ])

            session = SparkSession \
                .builder \
                .config(conf=conf) \
                .getOrCreate()

            try:
                yield session
            finally:
                session.stop()
 def get_spark_conf(self):
     conf = SparkConf()  # create the Spark config object
     config = (
         ("spark.app.name", self.SPARK_APP_NAME),  # app name; if not provided, Spark generates a random one
         ("spark.executor.memory", "2g"),  # memory used per executor when the app starts, default 1g
         ("spark.master", self.SPARK_URL),  # address of the Spark master
         ("spark.executor.cores", "2")  # number of CPU cores used per executor
         # ('spark.sql.pivotMaxValues', '99999'),  # raise when pivoting a DF with many distinct values, default 10000
     )
     # Full configuration reference: https://spark.apache.org/docs/latest/configuration.html
     conf.setAll(config)
     return conf
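
The method above only builds the SparkConf; a session still has to be created from it. A minimal sketch, assuming cfg is an instance of the surrounding (unnamed) class:

from pyspark.sql import SparkSession

conf = cfg.get_spark_conf()
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print(spark.sparkContext.getConf().get('spark.executor.cores'))  # expected: '2'
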
Example No. 5
    def __init__(self,
                 path_files: str,
                 path_index: str,
                 path_dict: str,
                 file_name: str,
                 num_partition: int):
        path = ''.join(path_files + file_name)
        self.__file_name = file_name

        conf = SparkConf()
        conf.setAll(
            [
                ('spark.app.name', 'Challenge Data Engineer'),
                ('spark.driver.cores', '4'),
                ('spark.executor.cores', '4'),
                ('spark.driver.maxResultSize', '10g'),
                ('spark.executor.memory', '4g'),
                ('spark.executor.memoryOverhead', '4g'),
                ('spark.driver.memory', '10g'),
                ('spark.local.dir', PATH_INDEX),
                ('spark.driver.extraJavaOptions', '-Xmx1024m'),
                ('spark.memory.offHeap.enabled', 'true'),
                ('spark.memory.offHeap.size', '20g')
            ]
        )

        self.__spark = SparkSession \
            .builder \
            .config(conf=conf) \
            .getOrCreate()

        self.__df_dict = self.__spark \
            .read \
            .parquet(path_dict) \
            .repartition(numPartitions=num_partition)

        self.__df_doc = self.__spark \
            .read \
            .text(path)

        self.__df_wordid_docid = self.__spark \
            .read \
            .parquet(path_index) \
            .rdd \
            .unpersist() \
            .repartition(numPartitions=1000)

        self.__df_wordid_docid = self.__df_wordid_docid.toDF()

        logging.warning(f"Processing doc: {path}")
Example No. 6
 def create_spark_session(self):
     conf = SparkConf()
     config = (("spark.app.name", self.SPARK_APP_NAME),
               ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),
               ("spark.master", self.SPARK_MASTER),
               ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),
               ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
               ("spark.debug.maxToStringFields", "10000"))
     conf.setAll(config)
     if self.ENABLE_HIVE_SUPPORT:
         return SparkSession.builder.config(
             conf=conf).enableHiveSupport().getOrCreate()
     else:
         return SparkSession.builder.config(conf=conf).getOrCreate()
    def __init__(self, path_files: str, path_index: str, path_dict: str,
                 file_name: str, num_partition: int):
        path = ''.join(path_files + file_name)
        self.__file_name = file_name

        conf = SparkConf()
        # Application Properties
        # http://spark.apache.org/docs/latest/configuration.html#spark-properties
        conf.setAll([('spark.app.name', 'Challenge Data Engineer'),
                     ('spark.driver.cores', '4'),
                     ('spark.executor.cores', '4'),
                     ('spark.driver.maxResultSize', '10g'),
                     ('spark.executor.memory', '10g'),
                     ('spark.executor.memoryOverhead', '10g'),
                     ('spark.driver.memory', '10g'),
                     ('spark.local.dir', PATH_INDEX),
                     ('spark.driver.extraJavaOptions', '-Xmx1024m'),
                     ('spark.memory.offHeap.enabled', 'true'),
                     ('spark.memory.offHeap.size', '20g')])

        self.__spark = SparkSession \
            .builder \
            .config(conf=conf) \
            .getOrCreate()

        self.__df_dict = self.__spark \
            .read \
            .parquet(path_dict) \
            .repartition(numPartitions=num_partition)

        self.__df_doc = self.__spark \
            .read \
            .text(path)

        self.__df_wordid_docid = self.__spark \
            .read \
            .parquet(path_index) \
            .rdd \
            .unpersist() \
            .repartition(numPartitions=1000)

        print(self.__df_wordid_docid.getStorageLevel())
        print(self.__df_wordid_docid.getNumPartitions())
        print(self.__spark.sparkContext.getConf().getAll())
        self.__spark.sql("SET -v").show(n=200, truncate=False)

        self.__df_wordid_docid = self.__df_wordid_docid.toDF()

        self.__spark.sparkContext.setLogLevel("warn")
        logging.warning(f"Processing doc: {path}")
Example No. 8
    def _init_spark(self, appname):
        """Internal function to setup spark context
        
        Note: only include spark modules here so that
        the interface can be queried outside of pyspark.

        """
        # currently using LZ4 compression: should not degrade runtime much
        # but will help with some operations like shuffling, especially when
        # dealing with objects like highly compressible label volumes
        # NOTE: objects > INT_MAX will cause problems for LZ4
        worker_env = {}
        if "DVIDSPARK_WORKFLOW_TMPDIR" in os.environ and os.environ["DVIDSPARK_WORKFLOW_TMPDIR"]:
            worker_env["DVIDSPARK_WORKFLOW_TMPDIR"] = os.environ["DVIDSPARK_WORKFLOW_TMPDIR"]
        
        try:
            spark_config = self.config_data["options"]["spark-config"]
        except KeyError:
            # Old workflows haven't been updated to inherit the base Workflow schema
            spark_config = {}
        
        for k in list(spark_config.keys()):
            spark_config[k] = str(spark_config[k])
            if spark_config[k] in ('True', 'False'):
                spark_config[k] = spark_config[k].lower()
            
        # Backwards compatibility:
        # if 'corespertask' option exists, override it in the spark config
        if "corespertask" in self.config_data["options"] and self.config_data["options"]["corespertask"] != 0:
            if "spark.task.cpus" in spark_config and spark_config["spark.task.cpus"] != '1':
                raise RuntimeError("Bad config: You can't set both 'corespertask' and 'spark.task.cpus'.  Use 'spark.task.cpus'.")
            spark_config["spark.task.cpus"] = str(self.config_data["options"]["corespertask"])

        # set spark config
        from pyspark import SparkContext, SparkConf
        conf = SparkConf()
        conf.setAppName(appname)
        conf.setAll(list(spark_config.items()))
        
#         from pyspark_flame import FlameProfiler
#         flamegraph_dir = f'{self.config_dir}/flamegraphs'
#         os.makedirs(flamegraph_dir, exist_ok=True)
#         conf.set("spark.python.profile.dump", flamegraph_dir)
#         conf.set("spark.python.profile", "true")
#         worker_env['pyspark_flame.interval'] = 0.25 # Default is 0.2 seconds
#         return SparkContext(conf=conf, batchSize=1, environment=worker_env, profiler_cls=FlameProfiler)

        # Auto-batching heuristic doesn't work well with our auto-compressed numpy array pickling scheme.
        # Therefore, disable batching with batchSize=1
        return SparkContext(conf=conf, batchSize=1, environment=worker_env)
Example No. 9
 def _create_spark_session(self):
     conf = SparkConf()
     config = (
         ('spark.app.name', self.SPARK_APP_NAME),
         ('spark.executor.memory', self.SPARK_EXECUTOR_MEMORY),
         ('spark.master', self.SPARK_URL),
         ('spark.executor.cores', self.SPARK_EXECUTOR_CORES),
         ('spark.executor.instances', self.SPARK_EXECUTOR_INSTANCES),
     )
     conf.setAll(config)
     if self.ENABLE_HIVE_SUPPORT:
         return SparkSession.builder.config(
             conf=conf).enableHiveSupport().getOrCreate()
     else:
         return SparkSession.builder.config(conf=conf).getOrCreate()
Example No. 10
def spark_cluster(logfile, discovery_schedule, hosts, extra_conf=None):
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    unknown_keys = set([prop for prop, _ in extra_conf]) \
        .difference(conf.SPARK_CONF_DEFAULT_VALUES.keys()) \
        if extra_conf else None
    if unknown_keys:
        raise ValueError(
            'default values must be defined for these properties: {}'.format(
                unknown_keys))

    cluster = SparkClusterController(logfile, discovery_schedule, hosts, 1)
    try:
        cluster.start()

        config = SparkConf().setAppName('elastic spark tests').setMaster(
            cluster.master_url())
        config = config.setAll([
            # pyspark-shell JVM will OOM even with 1GB when all tests run in one process
            # SparkContext and pyspark-shell JVM gets reused even though we do SparkSession.stop()
            # pyspark-shell JVM memory footprint increases from test to test
            # when run with pytest --forked, set SPARK_DRIVER_MEM=512m env
            ('spark.driver.memory', os.environ.get('SPARK_DRIVER_MEM',
                                                   '1500m')),
            # the minimum executor memory we can set
            ('spark.executor.memory', '512m'),
            # don't pollute the log with progress bar
            ('spark.ui.showConsoleProgress', 'false'),
        ])
        # config properties once set will survive session.stop() and
        # SparkSession.config(conf=config).getOrCreate(), so we have to make sure
        # we overwrite their value if not in extra_conf
        more_conf = conf.SPARK_CONF_DEFAULT_VALUES.copy()
        more_conf.update(extra_conf or [])
        config.setAll(more_conf.items())

        session = SparkSession \
            .builder \
            .config(conf=config) \
            .getOrCreate()

        try:
            yield session
        finally:
            session.stop()
    finally:
        cluster.shutdown()
Example No. 11
def get_spark_config(
        predictrip_config: Mapping[str, Mapping[str, str]]) -> SparkConf:
    """
    Create an object representing the Spark configuration we want

    :type predictrip_config: mapping returned by load_config containing configuration options
    :return: pyspark.SparkConf instance
    """
    # NOTE: contrary to https://www.geomesa.org/documentation/user/spark/pyspark.html#using-geomesa-pyspark, use of
    # geomesa_pyspark.configure() no longer necessary since Spark 2.1, as long as you tell spark to include the
    # geomesa_pyspark python module some other way (e.g. spark.files)

    sc = SparkConf()
    sc = sc.setAppName('PredicTrip ' + path.basename(__file__))
    # FIXME: the following doesn't seem to be effective
    sc = sc.setAll([('fs.s3a.awsAccessKeyId',
                     predictrip_config['AWS']['access_key_id']),
                    ('fs.s3a.awsSecretAccessKey',
                     predictrip_config['AWS']['secret_access_key'])])
    # add to sc any spark options that might be set in predictrip_config
    if 'executor_cores' in predictrip_config['Spark']:
        sc = sc.set('spark.executor.cores',
                    predictrip_config['Spark']['executor_cores'])
    if 'executor_memory' in predictrip_config['Spark']:
        sc = sc.set('spark.executor.memory',
                    predictrip_config['Spark']['executor_memory'])
    return sc
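
The FIXME above is consistent with the fact that bare fs.s3a.* keys on a SparkConf are not automatically propagated to the Hadoop configuration. A hedged alternative sketch (the spark.hadoop. prefix makes Spark copy the keys into the Hadoop configuration at context startup; credential values are placeholders):

from pyspark import SparkConf

predictrip_config = {'AWS': {'access_key_id': '...', 'secret_access_key': '...'}}  # placeholder

conf = SparkConf().setAppName('PredicTrip s3a sketch')
conf = conf.setAll([
    ('spark.hadoop.fs.s3a.access.key', predictrip_config['AWS']['access_key_id']),
    ('spark.hadoop.fs.s3a.secret.key', predictrip_config['AWS']['secret_access_key']),
])
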
Example No. 12
 def _create_spark_session(self):
     '''Create and initialize a SparkSession for the Spark program.'''
     # 1. Build the configuration
     conf = SparkConf()
     config = (
         ("spark.app.name", self.SPARK_APP_NAME),  # app name; if not provided, Spark generates a random one
         ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),  # memory used per executor when the app starts, default 2g
         ("spark.master", self.SPARK_URL),  # address of the Spark master
         ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),  # number of CPU cores per executor, default 1
         ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES)
     )
     conf.setAll(config)
     # 2. Initialize the session from the config
     if self.ENABLE_HIVE_SUPPORT:
         return SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()
     else:
         return SparkSession.builder.config(conf=conf).getOrCreate()
Example No. 13
def build_spark_session(
        app_name: str,
        spark_config: DefaultDict[str, str] = None,
        hadoop_config: DefaultDict[str, str] = None) -> SparkSession:
    conf = SparkConf()
    if spark_config:
        conf.setAll(spark_config.items())

    sc = SparkContext(conf=conf)

    if hadoop_config:
        for k, v in hadoop_config.items():
            sc._jsc.hadoopConfiguration().set(k, v)

    return SparkSession.builder \
        .appName(app_name) \
        .config(conf=sc.getConf()) \
        .getOrCreate()
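
A usage sketch for the helper above; the concrete property names and values are placeholders, not recommendations:

spark = build_spark_session(
    app_name='etl-job',
    spark_config={'spark.sql.shuffle.partitions': '64'},
    hadoop_config={'fs.s3a.endpoint': 's3.us-east-1.amazonaws.com'},
)
spark.range(10).show()
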
Example No. 14
    def _create_spark_hbase(self):
        conf = SparkConf()  # create the Spark config object
        config = (
            ("spark.app.name", self.SPARK_APP_NAME),  # app name; if not provided, Spark generates a random one
            ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),  # memory used per executor when the app starts, default 2g
            ("spark.master", self.SPARK_URL),  # address of the Spark master
            ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),  # number of CPU cores per executor, default 1
            ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
            ("hbase.zookeeper.quorum", "192.168.19.137"),
            ("hbase.zookeeper.property.clientPort", "22181")
        )

        conf.setAll(config)

        # create the SparkSession from the config object
        if self.ENABLE_HIVE_SUPPORT:
            return SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()
        else:
            return SparkSession.builder.config(conf=conf).getOrCreate()
Example No. 15
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    if not is_spark_below_2_2():
        extend_spark_driver_cp(sparkConf, get_bigdl_classpath())

    # add content in PYSPARK_FILES in spark.submit.pyFiles
    # This is a workaround for current Spark on k8s
    python_lib = os.environ.get('PYSPARK_FILES', None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            sparkConf.set(key="spark.submit.pyFiles",
                          value="%s,%s" % (python_lib, existing_py_files))
        else:
            sparkConf.set(key="spark.submit.pyFiles", value=python_lib)

    return sparkConf
Example No. 16
    def _create_spark_session(self):
        conf = SparkConf()

        config = (
            ("spark.app.name", self.SPARK_APP_NAME),
            ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),
            # ("spark.master", self.SPARK_URL),
            ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),
            ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
            # ("spark.sql.warehouse.dir", "/root/apache-hive-2.3.7-bin/warehouse"),
            ("hive.metastore.uris", "thrift://172.18.0.2:9083"))

        conf.setAll(config)
        print(self.ENABLE_HIVE_SUPPORT, config)

        if self.ENABLE_HIVE_SUPPORT:
            return SparkSession.builder.config(
                conf=conf).enableHiveSupport().getOrCreate()
        else:
            return SparkSession.builder.config(conf=conf).getOrCreate()
Example No. 17
File: spark.py Project: daskos/epos
    def closure(*args, **kwargs):
        try:
            options = opts
            options.update({
                'sql_parquet_compression_codec': 'uncompressed',
                'mesos_role': role,
                'mesos_coarse': bool(coarse),
                'cores_max': int(coarse) or None,
                'executor_cores': int(executor_cores),
                'executor_memory': '{}m'.format(int(executor_memory / MiB)),
                'driver_memory': '{}m'.format(int(driver_memory / MiB)),
                'mesos_executor_memoryOverhead': int(
                    (memory_overhead or (executor_cores * python_worker_memory +
                                         0.1 * executor_memory))
                    / MiB),
                'python_worker_memory': int(python_worker_memory / MiB),
                'mesos_uris': ','.join(uris),
                'mesos_executor_docker_image': docker
            })
            options = {'spark.{}'.format(k.replace('_', '.')): str(v)
                       for k, v in options.items() if v not in (None, '')}
            environs = envs.items()
        except TypeError as e:
            # curry doesn't reraise TypeErrors:
            # https://github.com/pytoolz/toolz/issues/288
            raise Exception(repr(e))

        conf = SparkConf()
        conf.setMaster(str(master))
        conf.setAppName(str(name or fn.__name__))
        conf.setAll(pairs=options.items())
        conf.setExecutorEnv(pairs=environs)

        with SparkContext(conf=conf) as sc:
            sc.setLogLevel(str(log))
            for f in files:
                sc.addFile(f)
            for f in pyfiles:
                sc.addPyFile(f)
            # TODO: use SparkSession
            sql = SQLContext(sc)
            return fn(sc, sql, *args, **kwargs)
Example No. 18
 def get_spark_config(path, dependencies) -> SparkConf:
     master = 'local[2]'
     conf = SparkConf().setAppName('unit test').setMaster(master)
     return conf.setAll([
         ('spark.ui.showConsoleProgress', 'false'),
         ('spark.test.home', os.environ.get('SPARK_HOME')),
         ('spark.locality.wait', '0'),
         ('spark.driver.extraClassPath', '{}'.format(':'.join([
             os.path.join(os.getcwd(), path, 'target', 'classes'),
             os.path.join(os.getcwd(), path, 'target', 'test-classes'),
             dependencies
         ]))),
     ])
Example No. 19
def spark_session(app, cores=2, gpus=0, max_failures=1, *args):
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    master = 'local-cluster[{},{},1024]'.format(cores, max_failures) if gpus > 0 \
        else 'local[{},{}]'.format(cores, max_failures)
    conf = SparkConf().setAppName(app).setMaster(master)
    conf = conf.setAll([
        ('spark.ui.showConsoleProgress', 'false'),
        ('spark.test.home', os.environ.get('SPARK_HOME')),
        ('spark.locality.wait', '0'),
    ])

    with temppath() as temp_filename:
        if gpus > 0:
            with open(temp_filename, 'wb') as temp_file:
                addresses = ', '.join('\\"{}\\"'.format(i) for i in range(gpus))
                temp_file.write(b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": [' +
                                addresses.encode('ascii') + b']}')

            os.chmod(temp_file.name, stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP |
                     stat.S_IROTH | stat.S_IXOTH)

            conf = conf.setAll([
                ('spark.worker.resource.gpu.discoveryScript', temp_filename),
                ('spark.worker.resource.gpu.amount', '1'),
                ('spark.task.resource.gpu.amount', '1'),
                ('spark.executor.resource.gpu.amount', '1')
            ])

        session = SparkSession \
            .builder \
            .config(conf=conf) \
            .getOrCreate()

        try:
            yield session
        finally:
            session.stop()
Example No. 20
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    if os.environ.get("BIGDL_JARS", None) and not is_spark_below_2_2():
        for jar in os.environ["BIGDL_JARS"].split(":"):
            extend_spark_driver_cp(sparkConf, jar)

    # add content in PYSPARK_FILES in spark.submit.pyFiles
    # This is a workaround for current Spark on k8s
    python_lib = os.environ.get("PYSPARK_FILES", None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            sparkConf.set(
                key="spark.submit.pyFiles",
                value="%s,%s" % (python_lib, existing_py_files),
            )
        else:
            sparkConf.set(key="spark.submit.pyFiles", value=python_lib)

    return sparkConf
Example No. 21
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    return sparkConf
Example No. 22
			except msg2xlsx.ConvertingError:
				logger.error("converting failed, clean the previous work")
				xlsx_path = os.path.join(settings.SAVE_DIRECTORY, project_id)
				clean(project_id, dirs=[temp_file, xlsx_path])
				return
				
			# processed ack
			channel.basic_ack(delivery_tag=method.delivery_tag)
	else:
		logger.error("illegal content-type: " + header.content_type)


if __name__ == '__main__':
	# initialize spark
	conf = SparkConf().setMaster(settings.SPARK_MASTER_URL).setAppName(settings.SPARK_APP_NAME)
	conf.setAll([("spark.eventLog.enabled", "true"), ("spark.eventLog.dir", settings.LOG_DIRECTORY)])
	sc = SparkContext(conf=conf)

	# initialize rabbitmq
	credentials = pika.PlainCredentials(settings.RABBITMQ_CONN_CONF['username'], settings.RABBITMQ_CONN_CONF['password'])
	conn_params = pika.ConnectionParameters(settings.RABBITMQ_CONN_CONF['host'], credentials=credentials)
	conn_broker = pika.BlockingConnection(conn_params)

	channel = conn_broker.channel()
	channel.exchange_declare(exchange=settings.RABBITMQ_SPARK['exchange'],
							 type="direct",
							 passive=False,
							 durable=True,
							 auto_delete=False)

	channel.queue_declare(queue=settings.RABBITMQ_SPARK['queue'])
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import *
from pyspark.storagelevel import StorageLevel
import happybase


appName = "KafkaStreams"
config = SparkConf().setAppName(appName)

props = []
props.append(("spark.rememberDuration", "10"))
props.append(("spark.batchDuration", "10"))
props.append(("spark.eventLog.enabled", "true"))
props.append(("spark.streaming.timeout", "30"))
props.append(("spark.ui.enabled", "true"))

config = config.setAll(props)

sc = SparkContext(conf=config)
ssc = StreamingContext(sc, 5)

topics = ["t1"]
kafka_params = {
   "zookeeper.connect" : "localhost:5181/kafka"
 , "metadata.broker.list" : "localhost:9092"
 , "group.id" : "Kafka_MapR-Streams_to_HBase"}

raw = KafkaUtils.createDirectStream(ssc, topics, kafka_params)
raw.pprint()

server = "localhost"
table_name = "/tables/stocks"
Example No. 24
class SparkUtils:
    def __init__(self,
                 log: RootLogger = None,
                 parms: dict = None,
                 botoSession: bototSession = None,
                 appName: str = None):
        self.log = log
        self.__parms = parms or {}
        self.__runEnv = self.__parms.get("--runEnv", "local")
        if (self.__runEnv == "aws"):
            self.__boto = botoSession
            self.__s3 = S3(log, self.__boto)

        self.__initFlags()
        self.__setupSparkSession__(appName)

        self.__dfltRDDParts = \
                int(self.__spark.conf.get("spark.executor.instances", "20")) * \
                int(self.__spark.conf.get("spark.executor.cores", "4")) * 2

    def __initFlags(self):
        '''
        Init the job level parameters needed by this class
        '''
        self.__parms["--saveDFAs"] = self.__parms.get("--saveDFAs", "NONE")

        self.__explainDF = True if "-explainDF" in self.__parms else False
        self.__printcount = True if "-printCount" in self.__parms else False
        self.__useHist = True if "-useHint" in self.__parms else False
        self.__saveDF = True if self.__parms["--saveDFAs"] != "NONE" else False

        self.__fileFmt = self.__parms.get("--fileFormat", "parquet")

        if (self.__runEnv == "aws"):
            self.__tempS3 = self.__parms.get("--tempS3", "hdfs:///temp/s3")
        if (self.__runEnv != "local"):
            self.__tempHDFS = self.__parms.get("--tempHDFS", "hdfs:///temp")
            self.log.warn(
                "For persist type 'S3', 'HDFS' will be used as the --runEnv != 'aws'"
            )

    def __setupSparkSession__(self, appName: str = None):
        '''
        Init the Spark environment with a few default configurations and start the Spark session.
        '''
        self.__conf = SparkConf()
        hmConf = {
            "spark.rps.askTimeout": "1200",
            "spark.network.timeout": "1200",
            "spark.broadcast.blockSize": "16m",
            "spark.sql.broadcastTimeout": "1200",
            "spark.broadcast.compress": "true",
            "spark.rdd.compress": "true",
            "fs.s3.enableServerSideEncryption": "true",
            "spark.kryo.unsafe": "false",
            "spark.kryoserializer.buffer": "10240",
            "spark.kryoserializer.buffer.max": "2040m",
            "spark.io.compression.codec": "org.apache.spark.io.SnappyCompressionCodec",
            "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
            "mapreduce.fileoutputcommitter.algorithm.version": "2",
            "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2",
        }
        # SparkConf.setAll expects an iterable of (key, value) pairs
        self.__conf.setAll(hmConf.items())
        SparkContext.setSystemProperty("com.amazonaws.services.s3.enableV4",
                                       "true")
        SparkContext.setSystemProperty("com.amazonaws.services.s3.enforceV4",
                                       "true")
        self.__spark = SparkSession \
                        .builder \
                        .config(conf=self.__conf) \
                        .appName(appName or "PySparkApp") \
                        .enableHiveSupport() \
                        .getOrCreate()
        self.__sc = self.__spark.sparkContext
        self.sqlC = SQLContext(self.__sc)
        self.__sc.setSystemProperty("com.amazonaws.services.s3.enableV4",
                                    "true")
        self.__sc.setSystemProperty("com.amazonaws.services.s3.enforceV4",
                                    "true")
        self.__sc.setLogLevel(self.__parms.get("--logLevel", "INFO"))

        # the Hadoop Configuration object has no setAll; set keys one by one
        hdpCnf = self.__sc._jsc.hadoopConfiguration()
        hadoop_conf = {
            "io.file.buffer.size": "65536",
            "mapreduce.fileoutputcommitter.algorithm.version": "2",
            "fs.s3a.endpoint": "%s.amazonaws.com" % (self.__parms.get("--awsRegion", 's3.us-east-1'))
        }
        for k, v in hadoop_conf.items():
            hdpCnf.set(k, v)
        if (self.__parms.get("--runEnv", "AWS") == "AWS"):
            from botocore.credentials import InstanceMetadataProvider, InstanceMetadataFetcher
            provider = InstanceMetadataProvider(
                iam_role_fetcher=InstanceMetadataFetcher(timeout=1000,
                                                         num_attempts=2))
            creds = provider.load()
            aws_conf = {
                "fs.s3a.access.key": creds.access_key,
                "fs.s3a.secret.key": creds.secret_key,
                "fs.s3a.server-side-encryption-algorithm": "SSE-KMS",
                "fs.s3.enableServerSideEncryption": "true",
                "fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
                "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
                "fs.s3a.endpoint": "s3.%s.amazonaws.com" % (self.__parms.get("--awsRegion", "us-east-1"))
            }
            for k, v in aws_conf.items():
                hdpCnf.set(k, v)

    def sql(self,
            dfName: str,
            query: str,
            partitions: int = 0,
            persistType: str = None):
        '''
        Runs the input SQL, partitions the resulting DataFrame and persists the DataFrame if needed.

        Supported persistType: In addition to the pySpark native persist types, this function supports
        HIVE, HDFS, S3

        '''
        if persistType is None:
            _df = self.__spark.sql(self.handleHints(query))
            if partitions == 0:
                df = _df
            elif _df.rdd.getNumPartitions() < partitions:
                df = _df.repartition(partitions)
            else:
                df = _df.coalesce(partitions)
            return df
        else:
            df = self.storeDF(
                df=self.sql(dfName=None, query=query, partitions=partitions),
                dfName=dfName,
                persistType=persistType,
                partitions=partitions,
                partitionCols=self.getPartitionColumnsFromSQL(query))

        if dfName:
            df.createOrReplaceTempView(dfName)

        if self.__printcount:
            self.log.info("Number of Records in DF '%s' : %d " %
                          (dfName, df.count()))

        return df

    def storeDF(self, df: DataFrame, dfName: str, persistType: str,
                partitions: int, partitionCols: List[str]):
        '''
        Store the input dataframe, read the persisted dataframe back and return the new one.
        If memory/disk persistence is requested, take(1) is run on the dataframe to force persistence.
        '''
        if self.__explainDF or \
                "NULL|NONE".find(persistType.upper()) < 0:
            self.log.info("Execution plan for building the DF '%s'" %
                          (dfName))
            df.explain()
            self.log.info("\n\n\n")

        saveType = self.__parms["--saveDFAs"] \
            if self.__saveDF and \
               "HIVE|NULL".find(persistType.upper()) < 0 \
            else \
                persistType.upper()

        if saveType == "S3" and self.__runEnv != "aws":
            saveType = "HDFS"
            self.log.debug(
                "Resetting the persist type to 'HDFS' as the --runEnv != 'aws'"
            )

        df1 = df if saveType != "HDFS" and \
                    saveType != "HIVE" and \
                    saveType != "S3" \
                 else self.repartitionDF(dataFrame= df, partitions = partitions)

        if saveType == "NULL" or saveType == "NONE":
            return df1
        elif saveType == "HDFS":
            return self.persistExternal(self.__tempHDFS, dfName, df,
                                        partitionCols)
        elif saveType == "S3":
            return self.persistExternal(self.__tempS3, dfName, df,
                                        partitionCols)
        elif saveType == "HIVE":
            return self.persist2Hive(dfName, df, partitionCols)
        elif saveType == "CHECK_POINT":
            return df.cache().checkpoint(eager=True)
        else:
            return self.persistLocal(dfName, df, persistType)

    def persistExternal(self,
                        parentDirURI: str,
                        fileName: str,
                        df: DataFrame,
                        partitionCols: List[str] = None,
                        overwrite: bool = True,
                        fileFormat: str = None,
                        **kwargs):

        fullPath = "%s%s"  % (parentDirURI,fileName or "") if parentDirURI.endswith("/") else \
                   "%s/%s" % (parentDirURI,fileName or "")
        fullPath = fullPath.replace("//", "/")
        schma = df.schema  # DataFrame.schema is a property, not a method
        fileFormat = fileFormat or self.__fileFmt
        self.write2ExtrFile(fullPath=fullPath,
                            fileFormat=fileFormat,
                            df=df,
                            partitionCols=partitionCols,
                            overwrite=overwrite,
                            **kwargs)
        df.unpersist()
        if fileFormat == "parquet":
            return self.readParquet(uriString=fullPath, schema=schma, **kwargs)
        elif fileFormat == "orc":
            return self.readOrc(uriString=fullPath, schema=schma, **kwargs)
        elif fileFormat == "csv":
            return self.readCSV(uriString=fullPath, schema=schma, **kwargs)
        else:
            return self.readParquet(uriString=fullPath, schema=schma, **kwargs)

    def readParquet(self,
                    uriString: str,
                    schema: StructType = None,
                    mergeSchema: bool = False,
                    **kwargs):
        self.log.info("Reading the parquet file '%s'" % uriString)
        rdr = self.__spark.read.format("parquet")
        if mergeSchema:
            rdr.option("mergeSchema", "true")
        if schema:
            rdr.schema(schema)
        return rdr.load(uriString)

    def readOrc(self, uriString: str, schema: StructType, **kwargs):
        self.log.info("Reading the ORC file in '%s'" % uriString)
        pass  ##TODO

    def readCSV(self, uriString: str, schema: StructType, **kwargs):
        self.log.info("Reading the CSV file in '%s'" % uriString)
        pass  ##TODO

    def write2ExtrFile(self,
                       fileFormat: str,
                       fullPath: str,
                       df: DataFrame,
                       partitionCols: List[str] = None,
                       overwrite: bool = True,
                       **kwargs):

        if fullPath.startswith("s3"):
            self.__s3.waitForFile("%s/_SUCCESS" % (fullPath))

        #TODO:Yet to Implement

    def persist2Hive(self, table: str, df: DataFrame,
                     partitionCols: List[str]):
        pass  #TODO:Yet to Implement

    def persistLocal(self, dfName: str, df: DataFrame, persistType: str):
        ''' Persist the input Datafrmae locally (memory/disk/none) and runs `df.take(1)` to force persist.
        '''
        lvl = self.getSparkPersistType(persistType.upper())
        if lvl:
            df.persist(lvl)

        if self.__printcount is None:
            df.take(1)

    def getSparkPersistType(self, persistTypStr: str) -> StorageLevel:
        '''
            Converts the String representation to the StorageLevel Object.
            If invalid string received, it will return the `StorageLevel.NONE`
            Supported,
                `StorageLevel.NONE`
                `StorageLevel.DISK_ONLY`
                `StorageLevel.DISK_ONLY_2`
                `StorageLevel.MEMORY_ONLY`
                `StorageLevel.MEMORY_ONLY_2`
                `StorageLevel.MEMORY_AND_DISK`
                `StorageLevel.MEMORY_AND_DISK_2`
                `StorageLevel.OFF_HEAP`
        '''

        if persistTypStr == "NONE": return None
        elif persistTypStr == "DISK_ONLY": return StorageLevel.DISK_ONLY
        elif persistTypStr == "DISK_ONLY_2": return StorageLevel.DISK_ONLY_2
        elif persistTypStr == "MEMORY_ONLY": return StorageLevel.MEMORY_ONLY
        elif persistTypStr == "MEMORY_ONLY_2":
            return StorageLevel.MEMORY_ONLY_2
        elif persistTypStr == "MEMORY_AND_DISK":
            return StorageLevel.MEMORY_AND_DISK
        elif persistTypStr == "MEMORY_AND_DISK_2":
            return StorageLevel.MEMORY_AND_DISK_2
        elif persistTypStr == "OFF_HEAP":
            return StorageLevel.OFF_HEAP
        else:
            self.log.warn(
                "Invalid Persist Type %s received. Defaulting to NONE" %
                (persistTypStr))
            return None

    def repartitionDF(self, dataFrame: DataFrame, partitions: int = 0):
        '''
            Repartition the input dataframe

            parms: df          -> dataframe
                   partitions  -> new partition count. Defaults to 0, i.e. don't repartition

            logic,
                if partitions = 0 , don't repartition
                if partitions = -1, repartition to the default number (NumOfExecutors * ExecutorCores * 2)
                if partitions > 0 , repartition/coalesce to the input number
        '''
        curParts = dataFrame.rdd.getNumPartitions()
        finalParts = min(curParts, partitions)

        if curParts == partitions or partitions == 0:
            finalParts = -1
        elif partitions == -1:
            finalParts = self.__dfltRDDParts
        elif partitions > 0:
            finalParts = partitions
        else:
            pass  #finalParts is pre-populated.

        self.log.debug("Current Partitions: %d , Requested: %d,  Final: %d " %
                       (curParts, partitions, finalParts))

        if finalParts == -1:
            return dataFrame
        elif curParts > finalParts:
            return dataFrame.coalesce(finalParts)
        else:
            return dataFrame.repartition(finalParts)

    def handleHints(self, query: str):
        '''
            Removes the SparkSQL hints if the -useHist parm is not set.

            Example:- If sql = 'select /* hists */ cols.. from ..'
               if -useHist is not set,
                  return 'select cols.. from ..'
               else
                  return 'select /* hists */ cols.. from ..'
        '''
        if self.__useHist:
            return query
        else:
            return re.sub(r'/\*+.*\*/', '', query)

    @staticmethod
    def getPartitionColumnsFromSQL(query):
        s = query.lower().strip().replace("\n", " ")
        inx = s.find(" cluster ")
        lst = []
        if inx > 0:
            lst.extend((map(lambda x: x.strip(), s[inx + 12:].split(","))))
        else:
            frm = s.find(" distribute ")
            to = s.find(" sort ", frm + 15) if frm > 0 else 0
            if to > frm:
                lst.extend((map(lambda x: x.strip(),
                                s[frm + 15:to].split(","))))
            else:
                lst.extend((map(lambda x: x.strip(), s[frm + 15:].split(","))))
        return lst
Example No. 25
# -*- coding: utf-8 -*-
'''

'''
__author__ = 'Foxlora'
__time__ = '2020/10/10 22:22'

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

from pyspark.streaming.kafka import KafkaUtils
from setting.default import DefaultConfig

# 1. Create the Spark Streaming context configuration
conf = SparkConf()
conf.setAll(DefaultConfig.SPARK_ONLINE_CONFIG)
sc = SparkContext(conf=conf)
stream_sc = StreamingContext(sc, 60)

# 2. Configuration for reading from Kafka
similar_kafka = {"metadata.broker.list": DefaultConfig.KAFKA_SERVER, "group.id": 'similar'}
SIMILAR_DS = KafkaUtils.createDirectStream(stream_sc, ['click-trace'], similar_kafka)



Example No. 26
specific data


Note that I use spark because there is currently no way to use SQL queries
with dask
"""

from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext, DataFrame

# This could benefit from some tweaks especially if the database becomes larger
conf = SparkConf()
conf.set("spark.sql.autoBroadcastJoinThreshold", 1024 * 1024 * 100)
conf.setAppName('Mnist_Spark_MLP').setMaster('local[8]')
conf.setAll([('spark.executor.memory', '8g'), ('spark.executor.cores', '3'),
             ('spark.cores.max', '3'), ('spark.driver.memory', '8g')])
conf.set("spark.sql.caseSensitive", "true")

# Global imports
import glob
import yaml
import logging
logging.getLogger().setLevel(logging.INFO)
import os
import textwrap
import numpy as np
import subprocess
from datetime import datetime
import copy
import time
import fnmatch
Example No. 27
from __future__ import print_function, division
import os
import sys
import copy
import functools

from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

conf = SparkConf().setMaster("yarn").setAppName("autofe").set(
    'spark.yarn.queue', 'solution')
# set app resources
configs = [('spark.driver.memory', '10g'), ('spark.executor.memory', '4g'),
           ('spark.executor.instances', '10'), ('spark.executor.cores', '2')]
conf.setAll(configs)
# conf = SparkConf().set('master', 'local')
sc = SparkContext.getOrCreate(conf=conf)
# sc = SparkContext.getOrCreate()
sql_context = HiveContext(sc)

# preprocess the action table
# load the data
path = "hdfs://m7-model-hdp01:8020/user/2-6-0-model-test/user_1/nodes/data-load-load-240240/out/20190717/DAG_36240/NODE_240240/SLOT_0/DataLoad/02150359716"
t = sql_context.read.parquet(path)

# inspect the data
print("Show a few rows of data")
print(t.show(5))
print("Show the column data types")
print(t.dtypes)
Example No. 28
    def __setupSparkSession__(
        self,
        jobConf: dict,
    ) -> SparkSession:
        '''
        Init the Spark environment with a few default configurations and start the Spark session.
        '''
        conf = SparkConf()
        #
        #Setup Spark Specific configurations
        #
        hmConf = {
            "spark.executor.pyspark.memory": "512m",
            "spark.debug.maxToStringFields": "5000",
            "spark.rps.askTimeout": "1200",
            "spark.network.timeout": "1200",
            "spark.maxRemoteBlockSizeFetchToMem": "512m",
            "spark.broadcast.blockSize": "16m",
            "spark.broadcast.compress": "true",
            "spark.rdd.compress": "true",
            "spark.io.compression.codec": "org.apache.spark.io.SnappyCompressionCodec",
            "spark.kryo.unsafe": "true",
            "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
            "spark.kryoserializer.buffer": "10240",
            "spark.kryoserializer.buffer.max": "2040m",
            "hive.exec.dynamic.partition": "true",
            "hive.exec.dynamic.partition.mode": "nonstrict",
            "hive.warehouse.data.skiptrash": "true",
            "spark.sql.hive.metastorePartitionPruning": "true",
            "spark.sql.broadcastTimeout": "1200",
            "spark.sql.sources.partitionOverwriteMode": "dynamic",
            "spark.sql.orc.filterPushdown": "true",
            "spark.sql.orc.splits.include.file.footer": "true",
            "spark.sql.orc.cache.stripe.details.size": "1000",
            "spark.hadoop.parquet.enable.summary-metadata": "false",
            "spark.sql.parquet.mergeSchema": "false",
            "spark.sql.parquet.filterPushdown": "true",
            "spark.sql.parquet.fs.optimized.committer.optimization-enabled": "true",
            "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2",
            "spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored": "true"
        }

        for (k, v) in jobConf['sparkconfs'].items():
            hmConf[k] = v

        # SparkConf.setAll expects an iterable of (key, value) pairs
        conf.setAll(hmConf.items())
        #
        #Setup Hadoop Specific configurations
        #
        hdpCnf = SparkContext._jsc.hadoopConfiguration()
        hdpCnf.set('io.file.buffer.size', '65536')
        hdpCnf.set('mapreduce.fileoutputcommitter.algorithm.version', '2')

        for (k, v) in jobConf['hadoopconfs'].items():
            hdpCnf.set(k, v)

        #
        # Setup AWS Specific configurations
        #
        if jobConf['appconfs']['runenv'].upper() == 'AWS':
            SparkContext.setSystemProperty(
                'com.amazonaws.services.s3.enableV4', 'true')
            SparkContext.setSystemProperty(
                'com.amazonaws.services.s3.enforceV4', 'true')
            conf.set(
                "spark.sql.parquet.output.committer.class",
                "com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter"
            )

            cred = None
            try:
                from botocore.credentials import InstanceMetadataProvider, InstanceMetadataFetcher
                provider = InstanceMetadataProvider(
                    iam_role_fetcher=InstanceMetadataFetcher(timeout=1000,
                                                             num_attempts=2))
                creds = provider.load()
                hdpCnf.setAll({
                    'fs.s3a.access.key': creds.access_key,
                    'fs.s3a.secret.key': creds.secret_key,
                })
            except:
                pass
            hdpCnf.setAll({
                'fs.s3a.server-side-encryption-algorithm': 'SSE-KMS',
                'fs.s3.enableServerSideEncryption': 'true',
                'fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
                'fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
                'fs.s3a.endpoint': "s3.%s.amazonaws.com" % (jobConf['appconfs']['appdefaults'] or 'us-east-1')
            })


        spark = SparkSession \
                .builder \
                .config(conf=conf) \
                .appName(jobConf['name'] or 'PySparkApp') \
                .enableHiveSupport() \
                .getOrCreate()

        sc = spark.sparkContext
        sc.setLogLevel(jobConf['appconfs']['logging']['sparkloglevel']
                       or 'INFO')
        if (jobConf['appconfs']['logging']['sparkloglevel'] or 'INFO') == "DEBUG":
            msg = ""
            for k in sc._conf.getAll():
                msg += "\t%50s -> %s\n" % (k[0], k[1])
            log.debug(
                "Initiated SparkSession with below confs,\n{}".format(msg))

        return spark
Example No. 29
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import *
from pyspark.storagelevel import StorageLevel

appName = "KafkaStreams"
config = SparkConf().setAppName(appName)

props = []
props.append(("spark.rememberDuration", "10"))
props.append(("spark.batchDuration", "10"))
props.append(("spark.eventLog.enabled", "true"))
props.append(("spark.streaming.timeout", "30"))
props.append(("spark.ui.enabled", "true"))

config = config.setAll(props)

sc = SparkContext(conf=config)
ssc = StreamingContext(sc, 5)

topics = ["t1"]
kafka_params = {
    "zookeeper.connect": "localhost:5181/kafka",
    "metadata.broker.list": "localhost:9092",
    "group.id": "Kafka_MapR-Streams_to_HBase"
}

raw = KafkaUtils.createDirectStream(ssc, topics, kafka_params)
raw.pprint()

ssc.start()  # Start the computation
Example No. 30
def spark_session(spark_id, executor_num, local_dir):
    logger.info('[%s] init spark session', spark_id)

    # spark
    if 'SPARK_HOME' not in os.environ:
        os.environ['SPARK_HOME'] = SPARK_HOME

    os.environ['PYSPARK_PYTHON'] = WORKER_PYTHON
    os.environ['PYSPARK_DRIVER_PYTHON'] = DRIVER_PYTHON

    if not local_dir:
        local_dir = os.path.join(JOB_ROOT_DIR.LOCAL_ROOT, spark_id)

    os.makedirs(os.path.join(local_dir, 'tmp'))
    #os.makedirs(os.path.join(local_dir, 'metastore_db'))
    spark_conf = SparkConf()
    conf_details = [
        # ('spark.yarn.jars', ''),
        # ('spark.executorEnv.PATH', SPARK_CONFIG['WORKER_PATH']),
        # ('spark.eventLog.dir', 'hdfs://TS-CLICKH011:8020/spark/history'),
        # ('spark.yarn.historyServer.address', 'http://ts-clickh09:18080/'),
        # ('spark.executorEnv.PATH', './python3/bin/:$PATH'),
        # ('spark.appMasterEnv.PATH', SparkConfig['WORKER_PATH']),
        # ('spark.yarn.appMasterEnv.PYSPARK_PYTHON', './python3/bin/python3'),
        # ('spark.executorEnv.PYSPARK_PYTHON', './python3/bin/python3'),
        # ('spark.driver.host', '172.22.16.57'),
        # ('spark.pyspark.python', './python3/bin/python3'),
        # ('spark.pyspark.python', './python3/bin/python3'),
        # ('spark.pyspark.driver.python', '/data/anaconda3/bin/python'),
        ('spark.yarn.archive', HDFS_SPARK_JARS),
        ('spark.yarn.dist.archives', SPARK_CONFIG['SPARK_ARCHIVES']),
        ('spark.eventLog.enabled', 'true'),
        ('spark.eventLog.compress', 'true'),
        ('spark.driver.memory', '2G'),
        ('spark.driver.extraJavaOptions',
         f'-Duser.timezone=UTC+0800 -Djava.io.tmpdir={os.path.join(local_dir, "tmp")} -Dderby.system.home={os.path.abspath(local_dir)}'
         ),
        ('spark.executor.extraJavaOptions',
         '-Duser.timezone=UTC+0800 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps'
         ),
        ('spark.executor.instances', executor_num),
        ('spark.executor.memory', '8G'),
        ('spark.executor.cores', 4),
        ('spark.sql.shuffle.partitions', executor_num),
        ('spark.yarn.executor.memoryOverhead', '4G'),
        ('spark.sql.warehouse.dir', os.path.join(local_dir, 'metastore_db')),
        ('spark.local.dir', os.path.join(local_dir, 'tmp')),
        ('spark.driver.extraClassPath',
         "/data/tool/env/hadoop-lzo/lib/hadoop-lzo-0.4.19.jar"),
        ('spark.driver.extraLibraryPath',
         '/data/tool/env/hadoop-lzo/lib/native/')
    ]

    for k, v in conf_details:
        print(f'[{spark_id}] spark config {k} = {v}')

    spark_conf.setAll(conf_details)
    spark_conf.setAppName(f'{spark_id}')
    spark_conf.setMaster('yarn')

    spark = SparkSession.builder.config(
        conf=spark_conf).enableHiveSupport().getOrCreate()
    return spark
Example No. 31
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    return sparkConf
Example No. 32
def init_spark(config, app=None, use_session=False):
    import os
    import sys
    from glob import glob

    if 'spark-home' in config:
        os.environ['SPARK_HOME'] = config['spark-home']

    if 'spark-conf-dir' in config:
        os.environ['SPARK_CONF_DIR'] = config['spark-conf-dir']

    if 'pyspark-python' in config:
        # Set python interpreter on both driver and workers
        os.environ['PYSPARK_PYTHON'] = config['pyspark-python']

    if 'yarn-conf-dir' in config:
        # Hadoop YARN configuration
        os.environ['YARN_CONF_DIR'] = config['yarn-conf-dir']

    if 'spark-classpath' in config:
        # can be used to use external folder with Hive configuration
        # e. g. spark-classpath='/etc/hive/conf.cloudera.hive1'
        os.environ['SPARK_CLASSPATH'] = config['spark-classpath']

    submit_args = []

    driver_mem = config.get('spark-prop.spark.driver.memory', None)
    if driver_mem is not None:
        submit_args.extend(["--driver-memory", driver_mem])

    driver_cp = config.get('spark-prop.spark.driver.extraClassPath', None)
    if driver_cp is not None:
        submit_args.extend(["--driver-class-path", driver_cp])

    driver_java_opt = config.get('spark-prop.spark.driver.extraJavaOptions', None)
    if driver_java_opt is not None:
        submit_args.extend(["--driver-java-options", driver_java_opt])

    jars = config.get('jars', None)
    if jars is not None:
        if isinstance(jars, str):
            jars = [jars]
        submit_args.extend(["--jars", ','.join(jars)])

    mode_yarn = config['spark-prop.spark.master'].startswith('yarn')

    if mode_yarn:
        # pyspark .zip distribution flag is set only if spark-submit have master=yarn in command-line arguments
        # see spark.yarn.isPython conf property setting code
        # in org.apache.spark.deploy.SparkSubmit#prepareSubmitEnvironment
        submit_args.extend(['--master', 'yarn'])

    # pyspark .zip distribution flag is set only if spark-submit have pyspark-shell or .py as positional argument
    # see spark.yarn.isPython conf property setting code
    # in org.apache.spark.deploy.SparkSubmit#prepareSubmitEnvironment
    submit_args.append('pyspark-shell')

    os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(submit_args)

    spark_home = os.environ['SPARK_HOME']
    spark_python = os.path.join(spark_home, 'python')
    pyspark_libs = glob(os.path.join(spark_python, 'lib', '*.zip'))
    sys.path.extend(pyspark_libs)

    virtualenv_reqs = config['spark-prop'].get('spark.pyspark.virtualenv.requirements', None)
    if use_session:
        from pyspark.sql import SparkSession

        builder = SparkSession.builder.appName(app or config['app'])

        if mode_yarn:
            builder = builder.enableHiveSupport()

        for k, v in prop_list(config['spark-prop']).items():
            builder = builder.config(k, v)

        ss = builder.getOrCreate()
        if virtualenv_reqs is not None:
            ss.addFile(virtualenv_reqs)
        return ss
    else:
        from pyspark import SparkConf, SparkContext
        conf = SparkConf()
        conf.setAppName(app or config['app'])
        props = [(k, str(v)) for k, v in prop_list(config['spark-prop']).items()]
        conf.setAll(props)
        sc = SparkContext(conf=conf)
        if virtualenv_reqs is not None:
            sc.addFile(virtualenv_reqs)
        return sc
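
A rough usage sketch for init_spark; the exact config schema and the prop_list() helper are project-specific, so the dict below is only an assumed shape:

config = {
    'app': 'example-app',
    'spark-home': '/opt/spark',
    'spark-prop.spark.master': 'local[2]',
    'spark-prop': {'spark.master': 'local[2]', 'spark.executor.memory': '2g'},
}

spark = init_spark(config, use_session=True)  # returns a SparkSession in this mode
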
Example No. 33
 def _create_spark_context():
     spark_conf = SparkConf()
     spark_conf.set('spark.sql.catalogImplementation', 'hive')
     spark_conf.setAll(self._setup_options(additional_options))
     return SparkContext(conf=spark_conf)
Example No. 34
# -*- coding: UTF-8 -*-

import happybase
from setting.default import DefaultConfig
import redis

pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)

# recall data
# With decode_responses=True, written values are returned as str; without it they come back as bytes.
redis_client = redis.StrictRedis(host=DefaultConfig.REDIS_HOST,
                                 port=DefaultConfig.REDIS_PORT,
                                 db=10,
                                 decode_responses=True)

# Redis database used for caching
# With decode_responses=True, written values are returned as str; without it they come back as bytes.
cache_client = redis.StrictRedis(host=DefaultConfig.REDIS_HOST,
                                 port=DefaultConfig.REDIS_PORT,
                                 db=8,
                                 decode_responses=True)

# Used by the ranking logic in sort_service.py
from pyspark import SparkConf
from pyspark.sql import SparkSession
# Spark configuration
conf = SparkConf()
conf.setAll(DefaultConfig.SPARK_GRPC_CONFIG)
SORT_SPARK = SparkSession.builder.config(conf=conf).getOrCreate()
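
DefaultConfig.SPARK_GRPC_CONFIG is not shown here; since SparkConf.setAll consumes an iterable of (key, value) pairs, it is presumably shaped like the sketch below (values are placeholders):

SPARK_GRPC_CONFIG = (
    ('spark.app.name', 'sort-service'),
    ('spark.master', 'local[2]'),
    ('spark.executor.memory', '2g'),
)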