from pyspark import SparkConf
from pyspark.sql import SparkSession


def setup_spark(app_name="my_app", master="local[*]", spark_jars=None, spark_config=None, py_files=None):
    """Set up a Spark session using the configuration provided."""
    spark_jars = spark_jars or []
    spark_config = spark_config or {}
    py_files = py_files or []

    conf = SparkConf().setAppName(app_name).setMaster(master)
    conf = conf.set("spark.jars", ",".join(spark_jars))

    # update spark config
    for key, val in spark_config.items():
        conf.set(key, val)
    for i in conf.getAll():
        print(i[0], "-->", i[1])

    spark_session = SparkSession.builder.config(conf=conf).appName(app_name).getOrCreate()
    for pyf in py_files:
        spark_session.sparkContext.addPyFile(pyf)
    return spark_session

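# Usage sketch for setup_spark above. The jar path, helper module, and config
# key are placeholders chosen for illustration, not values from the original code.
spark = setup_spark(
    app_name="etl_job",
    master="local[4]",
    spark_jars=["/opt/jars/postgresql-42.2.5.jar"],       # hypothetical jar path
    spark_config={"spark.sql.shuffle.partitions": "8"},
    py_files=["helpers.py"],                              # hypothetical helper module
)
print(spark.sparkContext.getConf().getAll())
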
from pyspark import SparkConf, SparkContext


def create_sc(app_name='AppName', master='local[*]', executor_memory='4G', nb_cores='0',
              driver_memory='32G', max_result_size='10G'):
    sc_conf = SparkConf()
    sc_conf.setAppName(app_name)
    sc_conf.setMaster(master)
    sc_conf.set('spark.executor.memory', executor_memory)
    if nb_cores != '0':
        sc_conf.set('spark.executor.cores', nb_cores)
    sc_conf.set('spark.driver.memory', driver_memory)
    sc_conf.set('spark.cores.max', '32')
    sc_conf.set('spark.driver.maxResultSize', max_result_size)
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    sc = None
    try:
        sc.stop()
        sc = SparkContext(conf=sc_conf)
    except:
        sc = SparkContext(conf=sc_conf)

    return sc

class OWSparkContext(SharedSparkContext, widget.OWWidget):
    priority = 0
    name = "Context"
    description = "Create a shared Spark (sc) and Hive (hc) Contexts"
    icon = "../icons/spark.png"

    want_main_area = False
    resizing_enabled = True

    conf = None

    def __init__(self):
        super().__init__()

        # The main label of the Control's GUI.
        # gui.label(self.controlArea, self, "Spark Context")

        self.conf = SparkConf()
        all_prefedined = dict(self.conf.getAll())

        # Create parameters Box.
        box = gui.widgetBox(self.controlArea, "Spark Application", addSpace=True)

        self.gui_parameters = OrderedDict()

        main_parameters = OrderedDict()
        main_parameters['spark.app.name'] = 'OrangeSpark'
        main_parameters['spark.master'] = 'yarn-client'
        main_parameters["spark.executor.instances"] = "8"
        main_parameters["spark.executor.cores"] = "4"
        main_parameters["spark.executor.memory"] = "8g"
        main_parameters["spark.driver.cores"] = "4"
        main_parameters["spark.driver.memory"] = "2g"
        main_parameters["spark.logConf"] = "false"
        main_parameters["spark.app.id"] = "dummy"

        for k, v in main_parameters.items():
            default_value = all_prefedined.setdefault(k, v)
            self.gui_parameters[k] = GuiParam(parent_widget=box, label=k, default_value=v)
            all_prefedined.pop(k)

        for k, v in all_prefedined.items():
            self.gui_parameters[k] = GuiParam(parent_widget=box, label=k, default_value=str(v))

        action_box = gui.widgetBox(box)
        # Action Button
        self.create_sc_btn = gui.button(action_box, self, label='Submit', callback=self.create_context)

    def onDeleteWidget(self):
        if self.sc:
            self.sc.stop()

    def create_context(self):
        for key, parameter in self.gui_parameters.items():
            self.conf.set(key, parameter.get_value())

        self.sc = SparkContext(conf=self.conf)
        self.hc = HiveContext(self.sc)

from pyspark import SparkConf, SparkContext


def init_spark(app_name):
    sc_conf = SparkConf()
    sc_conf.setAppName(app_name)
    sc_conf.setMaster('local[*]')
    sc_conf.set('spark.executor.memory', '4g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.driver.memory', '32G')
    sc_conf.set('spark.cores.max', '32')
    sc_conf.set('spark.driver.maxResultSize', '10G')
    sc_conf.set('spark.logConf', True)
    sc_conf.getAll()

    sc = None
    try:
        sc.stop()
        sc = SparkContext(conf=sc_conf)
    except:
        sc = SparkContext(conf=sc_conf)

    return sc

from pyspark import SparkConf, SparkContext


def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    sc = None
    try:
        sc.stop()
        sc = SparkContext(conf=sc_conf)
    except:
        sc = SparkContext(conf=sc_conf)

    return sc

def create_context(self):
    sc_conf = SparkConf()
    sc_conf.setAppName(self.name)
    sc_conf.setMaster('local[*]')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    sc_conf.set('spark.debug.maxToStringFields', '100')
    print(sc_conf.getAll())

    sc = None
    try:
        sc.stop()
        sc = SparkContext(conf=sc_conf)
    except:
        sc = SparkContext(conf=sc_conf)

    return SQLContext(sc)

from pyspark import SparkConf, SparkContext


def create_sc(pyFiles):
    sc_conf = SparkConf()
    sc_conf.setAppName("Weather_PCA")
    sc_conf.set('spark.executor.memory', '3g')
    sc_conf.set('spark.executor.cores', '1')
    sc_conf.set('spark.cores.max', '4')
    sc_conf.set('spark.default.parallelism', '10')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    sc = SparkContext(conf=sc_conf, pyFiles=pyFiles)
    return sc

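# Example call for create_sc above; the .py and .zip file names are hypothetical
# and only illustrate what gets shipped to the executors via pyFiles.
sc = create_sc(pyFiles=["weather_utils.py", "deps.zip"])
print(sc.defaultParallelism)
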
def mainRun():
    try:
        hdfs_path = settings.dir_path["hdfs_path"]
        spark_url = settings.spark_config["spark_url"]
        spark_conf_settings = settings.spark_config["spark_conf_settings"]
    except Exception as e:
        logger.error(str(e))
        logger.error("Exit...")
        sys.exit(1)

    app_name = "xdrProject"
    spark_conf = SparkConf().setAppName(app_name).setMaster(str(spark_url))
    for conf_name, conf_sets in spark_conf_settings.items():
        spark_conf = spark_conf.set(conf_name, conf_sets)

    sc = SparkContext(conf=spark_conf)
    logger.info("Spark config: " + str(spark_conf.getAll()))

    xdrProject(sc, hdfs_path)

def main():
    # parse command line options
    (options, args) = parseOptions()
    if len(args) != 2:
        raise Exception("need an input file and an output path")

    # the SparkContext must exist before sc.defaultParallelism can be used,
    # so size the local master from the explicit option if one was given
    if options.numPartitions is None:
        master = "local[*]"
    else:
        master = "local[" + str(options.numPartitions) + "]"

    conf = SparkConf().setAppName("wordCount").setMaster(master)
    sc = SparkContext(conf=conf)

    # set number of file partitions/parallelism
    if options.numPartitions is None:
        # pick number of partitions based on the default amount of parallelism and file size;
        # default parallelism is related to the number of cores on the machine
        partFactor = 1  # how many times the default parallelism
        numPartitions = sc.defaultParallelism * partFactor
    else:
        numPartitions = options.numPartitions

    conf = sc.getConf()
    print("conf=" + str(conf.getAll()))
    print("defaultMinPartitions=" + str(sc.defaultMinPartitions))
    print("defaultParallelism=" + str(sc.defaultParallelism))

    inputFileName = args[0]
    outputFileName = args[1]

    timeStart = time.time()
    file = sc.textFile(inputFileName, minPartitions=numPartitions)
    counts = file.count()
    timeEnd = time.time()
    dtRead = timeEnd - timeStart  # time in seconds

    # write out to a file
    timeStart = time.time()
    file.saveAsTextFile(outputFileName)
    timeEnd = time.time()
    dtWrite = timeEnd - timeStart  # time in seconds

    print("read+count time=" + str(dtRead) + " s")
    print("write time=" + str(dtWrite) + " s")
    print("number of lines=" + str(counts))
    print("num Partitions=" + str(file.getNumPartitions()))

from pyspark import SparkConf, SparkContext


def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.set("spark.dynamicAllocation.enabled", "false")
    sc_conf.set("spark.driver.host", "172.31.85.37")
    sc_conf.set('spark.executor.memory', '1g')
    sc_conf.set('spark.executor.cores', '2')
    sc_conf.set('spark.cores.max', '4')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    sc = None
    try:
        sc.stop()
        sc = SparkContext(conf=sc_conf)
    except:
        sc = SparkContext(conf=sc_conf)

    return sc

import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.window import Window as W
from pyspark.ml.stat import Summarizer
from spacy.lang.en import English
import spacy
from collections import Counter
import pandas
import time

nlp = spacy.load("en_core_web_sm")

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6"

conf = SparkConf().setAppName('Spark DL Tabular Pipeline').setMaster('local[6]') \
    .set('spark.driver.memory', '16g').set('spark.executor.memory', '6g')
print(conf.getAll())

sc = SparkContext(conf=conf)
sql_context = SparkSession(sc)

# Load Data to Spark Dataframe
df = sql_context.read.csv('final/amazon_reviews.tsv', header=True, sep=r'\t', inferSchema=True)

tags = [
    'SYM', 'PUNCT', 'X', 'ADJ', 'CCONJ', 'NUM', 'DET', 'PRON', 'ADP', 'ADJ',
    'VERB', 'NOUN', 'PROPN', 'ADV', 'SPACE', 'PART', 'INTJ', 'AUX', 'SCONJ'
]

init_db()
session = get_session()

sc_conf = SparkConf()
sc_conf.setAppName("streamingkafka")
# spark.executor.memory: memory used by each executor on a worker node
sc_conf.set('spark.executor.memory', '2g')
# spark.executor.cores: as the name suggests, the number of CPU cores per executor;
# more cores means the executor can run more tasks concurrently
sc_conf.set("spark.executor.cores", '4')
# spark.cores.max: the maximum number of CPU cores allocated to this application;
# if unset it defaults to spark.deploy.defaultCores
sc_conf.set('spark.cores.max', 40)
# log the effective SparkConf as INFO when the SparkContext starts
sc_conf.set('spark.logConf', True)
print(sc_conf.getAll())

# the conf must be passed in when the context is created, otherwise it is ignored
sc = SparkContext(conf=sc_conf)
sc.setLogLevel("WARN")  # reduce logs from shell

window = 60
ssc = StreamingContext(sc, window)  # get messages of 1 min
brokers = 'localhost:9092'
topic = 'test'
tmp = init()

kafka_streaming_rdd = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
# kafka_streaming_rdd.pprint()
lines_rdd = kafka_streaming_rdd.map(lambda x: x[1]).map(split) \
    .foreachRDD(lambda rdd: process_stream(rdd, tmp))
ssc.start()

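# Note: ssc.start() returns immediately; a streaming driver normally blocks on
# awaitTermination() so the micro-batches keep running. A minimal sketch of the
# usual pattern (not part of the original snippet):
try:
    ssc.awaitTermination()
except KeyboardInterrupt:
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
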
class OWSparkContext(SparkEnvironment, widget.OWWidget):
    priority = 0
    name = "Spark Config"
    description = "Configure the shared contexts: SparkContext (sc), SqlContext (sqlContext) and HiveContext (hc)"
    icon = "../assets/Spark.svg"

    want_main_area = False
    resizing_enabled = True

    saved_gui_params = Setting(OrderedDict())

    conf = None

    def __init__(self):
        super().__init__()

        # The main label of the Control's GUI.
        # gui.label(self.controlArea, self, "Spark Context")

        self.conf = SparkConf()
        all_prefedined = dict(self.conf.getAll())

        # Create parameters Box.
        box = gui.widgetBox(self.controlArea, "Spark Application", addSpace=True)

        self.gui_parameters = OrderedDict()

        main_parameters = OrderedDict()
        main_parameters['spark.app.name'] = 'weta_workflow'
        main_parameters['spark.master'] = 'local'  # 'yarn'
        main_parameters["spark.executor.instances"] = "8"
        main_parameters["spark.executor.cores"] = "4"
        main_parameters["spark.executor.memory"] = "8g"
        main_parameters["spark.driver.cores"] = "4"
        main_parameters["spark.driver.memory"] = "2g"
        main_parameters["spark.logConf"] = "false"
        main_parameters["spark.app.id"] = "dummy"

        for k, v in self.saved_gui_params.items():
            main_parameters[k] = v

        for k, v in main_parameters.items():
            default_value = all_prefedined.setdefault(k, v)
            self.gui_parameters[k] = ParameterWidget(parent_widget=box, label=k, default_value=v)
            all_prefedined.pop(k)

        for k, v in all_prefedined.items():
            self.gui_parameters[k] = ParameterWidget(parent_widget=box, label=k, default_value=str(v))

        action_box = gui.widgetBox(box)
        # Action Button
        self.create_sc_btn = gui.button(action_box, self, label='Submit', callback=self.create_context)

    def onDeleteWidget(self):
        if self.sc:
            self.sc.stop()

    def create_context(self):
        if self.sc:
            self.sc.stop()

        for key, parameter in self.gui_parameters.items():
            self.conf.set(key, parameter.get_value())
            self.saved_gui_params[key] = parameter.get_value()

        self.sc = SparkContext(conf=self.conf)
        self.sqlContext = SQLContext(self.sc)
        self.hc = HiveContext(self.sc)
        self.hide()

class CommonSparkContext(object):
    __metaclass__ = Singleton

    def __init__(self):
        """
        Create a spark context.

        The spark configuration is taken from xframes/config.ini and from
        the values set in SparkInitContext.set() if this has been called.
        """

        # This is placed here because otherwise it causes an error when used in a spark slave.
        from pyspark import SparkConf, SparkContext, SQLContext, HiveContext

        # This reads from default.ini and then xframes/config.ini
        # if they exist.
        self._env = Environment.create()
        context = create_spark_config(self._env)
        verbose = self._env.get_config("xframes", "verbose", "false").lower() == "true"
        hdfs_user_name = self._env.get_config("webhdfs", "user", "hdfs")
        os.environ["HADOOP_USER_NAME"] = hdfs_user_name
        config_pairs = [(k, v) for k, v in context.iteritems()]
        self._config = SparkConf().setAll(config_pairs)
        if verbose:
            print "Spark Config: {}".format(config_pairs)

        self._sc = SparkContext(conf=self._config)
        self._sqlc = SQLContext(self._sc)
        self._hivec = HiveContext(self._sc)
        self.zip_path = []
        version = [int(n) for n in self._sc.version.split(".")]
        self.status_tracker = self._sc.statusTracker()
        if cmp(version, [1, 4, 1]) >= 0:
            self.application_id = self._sc.applicationId
        else:
            self.application_id = None

        if verbose:
            print "Spark Version: {}".format(self._sc.version)
            if self.application_id:
                print "Application Id: {}".format(self.application_id)

        if not context["spark.master"].startswith("local"):
            zip_path = self.build_zip(get_xframes_home())
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)

        trace_flag = self._env.get_config("xframes", "rdd-trace", "false").lower() == "true"
        XRdd.set_trace(trace_flag)
        atexit.register(self.close_context)

    def spark_add_files(self, dirs):
        """
        Adds python files in the given directory or directories.

        Parameters
        ----------
        dirs : str or list(str)
            If a str, the pathname to a directory containing a python module.
            If a list, then it is a list of such directories.

        The python files in each directory are compiled, packed into a zip, distributed
        to each spark slave, and placed in PYTHONPATH.

        This is only done if spark is deployed on a cluster.
        """
        props = self.config()
        if props.get("spark.master", "local").startswith("local"):
            return
        if isinstance(dirs, basestring):
            dirs = [dirs]
        for path in dirs:
            zip_path = self.build_zip(path)
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)

    def close_context(self):
        if self._sc:
            self._sc.stop()
            self._sc = None
            for zip_path in self.zip_path:
                os.remove(zip_path)

    def config(self):
        """
        Gets the configuration parameters used to initialize the spark context.

        Returns
        -------
        out : dict
            A dict of the properties used to initialize the spark context.
        """
        props = self._config.getAll()
        return {prop[0]: prop[1] for prop in props}

    def env(self):
        """
        Gets the config environment.

        Returns
        -------
        out : Environment
            The environment.  This contains all the values from the configuration file(s).
        """
        return self._env

    def sc(self):
        """
        Gets the spark context.

        Returns
        -------
        out : SparkContext
            The spark context.  There is a single spark context per process.
        """
        return self._sc

    def sqlc(self):
        """
        Gets the spark sql context.

        Returns
        -------
        out : sql.SqlContext
            The spark sql context.
        """
        return self._sqlc

    def hivec(self):
        """
        Gets the hive context.

        Returns
        -------
        out : sql.HiveContext
            The hive context.
        """
        return self._hivec

    def version(self):
        """
        Gets the spark version.

        Returns
        -------
        out : list[int]
            The spark version, as a list of integers.
        """
        return [int(n) for n in self._sc.version.split(".")]

    def jobs(self):
        """
        Get the spark job ID and info for the active jobs.

        This method would normally be called by another thread from the executing job.

        Returns
        -------
        out : map(job_id: job_info)
            A map of the active job IDs and their corresponding job info.
        """
        return {job_id: self.status_tracker.getJobInfo(job_id)
                for job_id in self.status_tracker.getActiveJobIds()}

    def cluster_mode(self):
        """
        Get the cluster mode of the spark cluster.

        Returns
        -------
        out : boolean
            True if spark is running in cluster mode.  Cluster mode means that spark is
            running on a platform separate from the program.  In practice, cluster mode
            means that file arguments must be located on a network filesystem such as
            HDFS or NFS.
        """
        return not self._config.get("spark.master").startswith("local")

    # noinspection PyBroadException
    @staticmethod
    def build_zip(module_dir):
        # This can fail at writepy if there is something wrong with the files
        # in xframes.  Go ahead anyway, but things will probably fail if this
        # job is distributed.
        try:
            tf = NamedTemporaryFile(suffix=".zip", delete=False)
            z = PyZipFile(tf, "w")
            z.writepy(module_dir)
            z.close()
            return tf.name
        except:
            logging.warn("Zip file distribution failed -- workers will not get xframes code.")
            logging.warn("Check for unexpected files in xframes directory.")
            return None

    @staticmethod
    def spark_context():
        """
        Returns the spark context.

        Returns
        -------
        out : pyspark.SparkContext
            The SparkContext object from spark.
        """
        return CommonSparkContext().sc()

    @staticmethod
    def spark_config():
        """
        Returns the spark config parameters.

        Returns
        -------
        out : list
            A list of the key-value pairs stored as tuples, used to initialize the spark context.
        """
        return CommonSparkContext().config()

    @staticmethod
    def spark_sql_context():
        """
        Returns the spark sql context.

        Returns
        -------
        out : pyspark.sql.SQLContext
            The SQLContext object from spark.
        """
        return CommonSparkContext().sqlc()

    @staticmethod
    def hive_context():
        """
        Returns the hive context.

        Returns
        -------
        out : pyspark.sql.HiveContext
            The Hive object from spark.
        """
        return CommonSparkContext().hivec()

    @staticmethod
    def spark_version():
        """
        Gets the spark version.

        Returns
        -------
        out : list[int]
            The spark version, as a list of integers.
        """
        return CommonSparkContext().version()

    @staticmethod
    def spark_cluster_mode():
        """
        Gets the cluster mode.

        Returns
        -------
        out : boolean
            True if spark is running in cluster mode.  Cluster mode means that spark is
            running on a platform separate from the program.  In practice, cluster mode
            means that file arguments must be located on a network filesystem such as
            HDFS or NFS.
        """
        env = Environment.create()
        config = create_spark_config(env)
        return not config.get("spark.master").startswith("local")

#!/usr/bin/env python
from pyspark import SparkConf, SparkContext

# Spark Options:
# https://spark.apache.org/docs/1.6.1/api/java/org/apache/spark/SparkConf.html
conf = SparkConf().setMaster("local").setAppName("MyApp")
sc = SparkContext(conf=conf)

print(conf.getAll())

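# Individual values can be read back from the conf as well; these keys are the
# ones set above with setMaster and setAppName:
print(conf.get("spark.master"))    # local
print(conf.get("spark.app.name"))  # MyApp
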
run from command line
spark-submit --master yarn-client --conf key=value --conf someotherkey=someothervalue you_code.py
"""
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row

conf = SparkConf().setAppName("hello-world").setMaster('yarn-client')
conf.set("spark.files.overwrite", "true")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# log
log4jLogger = sc._jvm.org.apache.log4j
LOG = log4jLogger.LogManager.getLogger("hello.world.spark")
LOG.info("Args = " + conf.getAll().__str__())

inputFile = conf.get("spark.input")
outputFile = conf.get("spark.output")

wordcount = sc.textFile(inputFile) \
    .map(lambda line: line.replace("\"", " ").replace("{", " ").replace("}", " ")
                          .replace(".", " ").replace(":", " ")) \
    .flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .map(lambda (k, v): (v, k)) \
    .sortByKey(ascending=False) \
    .map(lambda (k, v): (v, k))

df = wordcount.toDF(['word', 'count'])
df.save(path=outputFile, source='json', mode='overwrite')
