from pyspark import SparkConf, SparkContext


def setup():
    '''Set up the Spark context and get the logger.'''
    # Set the context
    conf = SparkConf()
    conf.setExecutorEnv(key='Auth', value='value', pairs=None)
    sc = SparkContext(conf=conf)
    logger = sc._jvm.org.apache.log4j.LogManager.getLogger(__name__)
    return sc, logger
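A minimal usage sketch for the helper above (not from the original source); the job body and log messages are illustrative placeholders.

# Hypothetical driver code: build the context once, then log through the
# JVM-side log4j logger returned by setup().
sc, logger = setup()
logger.info("started application %s" % sc.applicationId)
counts = sc.parallelize(range(100)).map(lambda x: x % 3).countByValue()
logger.info("remainder histogram: %s" % dict(counts))
sc.stop()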
def closure(*args, **kwargs):
    try:
        options = opts
        options.update({
            'sql_parquet_compression_codec': 'uncompressed',
            'mesos_role': role,
            'mesos_coarse': bool(coarse),
            'cores_max': int(coarse) or None,
            'executor_cores': int(executor_cores),
            'executor_memory': '{}m'.format(int(executor_memory / MiB)),
            'driver_memory': '{}m'.format(int(driver_memory / MiB)),
            'mesos_executor_memoryOverhead': int(
                (memory_overhead or
                 (executor_cores * python_worker_memory + 0.1 * executor_memory)) / MiB),
            'python_worker_memory': int(python_worker_memory / MiB),
            'mesos_uris': ','.join(uris),
            'mesos_executor_docker_image': docker,
        })
        options = {'spark.{}'.format(k.replace('_', '.')): str(v)
                   for k, v in options.items() if v not in (None, '')}
        environs = envs.items()
    except TypeError as e:
        # curry doesn't reraise TypeErrors:
        # https://github.com/pytoolz/toolz/issues/288
        raise Exception(repr(e))

    conf = SparkConf()
    conf.setMaster(str(master))
    conf.setAppName(str(name or fn.__name__))
    conf.setAll(pairs=options.items())
    conf.setExecutorEnv(pairs=environs)

    with SparkContext(conf=conf) as sc:
        sc.setLogLevel(str(log))
        # map() is lazy on Python 3, so force the side effects with plain loops
        for path in files:
            sc.addFile(path)
        for path in pyfiles:
            sc.addPyFile(path)
        # TODO: use SparkSession instead of the legacy SQLContext
        sql = SQLContext(sc)
        return fn(sc, sql, *args, **kwargs)
def init(self, name=None, master='yarn-client', config_parameters=None,
         interpreter_path=None, dependent_jars=None,
         pypath='pyspark.zip:py4j-0.10.6-src.zip'):
    from pyspark import SparkConf
    from pyspark.context import SparkContext

    if name is None:
        raise ValueError('Please specify a name for the spark application')
    name_str = " --name " + name.replace(" ", "")

    # Normalise optional arguments up front so they can be used safely below
    if config_parameters is None:
        config_parameters = {}

    all_env_var_str = ''
    if dependent_jars is None:
        dependent_jars = []
    else:
        for dep_jars in dependent_jars:
            self.add_jars(dep_jars)
        driver_jars = ":".join(self.get_jars())
        all_env_var_str = "--driver-class-path " + driver_jars

    if 'spark.driver.memory' in config_parameters:
        all_env_var_str += " --driver-memory " + \
            config_parameters['spark.driver.memory'].replace(" ", "")
    all_env_var_str = all_env_var_str + name_str + " " + self.shell_arg
    # Collapse accidental double spaces in the submit arguments
    os.environ["PYSPARK_SUBMIT_ARGS"] = all_env_var_str.replace("  ", " ")

    if interpreter_path is None:
        interpreter_path = sys.executable
    os.environ["PYSPARK_PYTHON"] = interpreter_path

    conf = SparkConf()
    conf.setMaster(master)
    conf.set('spark.jars', ",".join(self.get_jars()))
    conf.set('spark.jars.packages', ",".join(self.coordinates))
    conf.set(
        'spark.yarn.dist.files',
        'file:/usr/hdp/current/spark2-client/python/lib/pyspark.zip,'
        'file:/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip')
    conf.set('spark.sql.codegen.wholeStage', 'false')
    for option in config_parameters:
        conf.set(option, config_parameters[option].replace(" ", ""))

    if self.spark_version == 2:
        from pyspark.sql import SparkSession
        conf.setExecutorEnv('PYTHONPATH', pypath)
        spark = SparkSession \
            .builder \
            .enableHiveSupport() \
            .appName(name) \
            .config(conf=conf) \
            .getOrCreate()
        sc = spark.sparkContext
        sc.setLogLevel("ERROR")
        # sc.setLogLevel("info")
    else:
        conf.setAppName(name)
        sc = SparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("tmp")

    print("Application Name: ", sc.appName)
    print("Application ID: ", sc.applicationId)
    print("Tracking URL: http://{}name3:8088/cluster/app/{}/".format(
        os.uname()[1][:7], sc.applicationId))
    return spark if self.spark_version == 2 else sc
from pyspark import sql, SparkConf, SparkContext
import pyspark.sql.functions as f
import numpy as np

conf = SparkConf().setAppName('Benchmarks')
# Memory limits are Spark properties, not executor environment variables,
# so set them with conf.set() rather than setExecutorEnv()
conf.set('spark.executor.memory', '2g')
conf.set('spark.driver.memory', '30g')
sc = SparkContext(conf=conf)
sqlContext = sql.SQLContext(sc)


def read_file(df=None, data_path=None):
    return sqlContext.read.parquet(data_path)


def mean(df):
    return df.select(f.mean('fare_amount')).collect()


def standard_deviation(df):
    return df.select(f.stddev('fare_amount')).collect()


def sum_columns(df):
    return df.select(f.mean(df['fare_amount'] + df['trip_distance'])).collect()


def product_columns(df):
    return df.select(f.mean(df['fare_amount'] * df['trip_distance'])).collect()
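A hedged usage sketch for the benchmark helpers above; the parquet path is a placeholder, not part of the original code.

# Illustrative only: point read_file() at any parquet file that has
# 'fare_amount' and 'trip_distance' columns.
taxi = read_file(data_path='/tmp/yellow_tripdata.parquet')
print(mean(taxi))
print(standard_deviation(taxi))
print(sum_columns(taxi))
print(product_columns(taxi))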
import argparse

from pyspark import SparkConf, SparkContext

parser = argparse.ArgumentParser()
parser.add_argument('-a', '--access_key')
parser.add_argument('-s', '--secret_access_key')
parser.add_argument('-l', '--copy_local', action='store_true')
config = parser.parse_args()

download = False
spark_config = None

if config.access_key and config.secret_access_key:
    download = True
    spark_config = SparkConf()
    spark_config.setExecutorEnv('AWS_ACCESS_KEY_ID', config.access_key)
    # Note: no trailing space in the environment variable name
    spark_config.setExecutorEnv('AWS_SECRET_ACCESS_KEY', config.secret_access_key)

# Build up the context, using the master URL
sc = SparkContext('spark://ulex:7077', 'mean', conf=spark_config)

local_data_path = '/media/bitbucket/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'
data_path = local_data_path
data_url = 'https://nasanex.s3.amazonaws.com/NEX-DCP30/BCSD/rcp26/mon/atmos/pr/r1i1p1/v1.0/CONUS/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'

if download:
    data_path = data_url

# Download the file onto each node
if download or config.copy_local:
    sc.addFile(data_path)
import time

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType


def evaluate(evaluator):
    precise = evaluator.precision()
    coverage = evaluator.coverage()
    popularity = evaluator.popularity()
    recall = evaluator.recall()
    return precise, recall, coverage, popularity


if __name__ == "__main__":
    import os

    PYSPARK_PYTHON = "/usr/bin/python2.7"
    os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON

    conf = SparkConf().setAppName("amazonItemCF").setMaster("yarn-client")
    conf.set("spark.shuffle.file.buffer", "128k").set("spark.reducer.maxSizeInFlight", "96M")
    conf.set('spark.yarn.dist.files',
             'file:/root/hadoop-2.6/spark/python/lib/pyspark.zip,'
             'file:/root/hadoop-2.6/spark/python/lib/py4j-0.10.4-src.zip')
    conf.setExecutorEnv('PYTHONPATH', 'pyspark.zip:py4j-0.10.4-src.zip')
    conf.set('spark.executor.cores', '30')
    conf.set('spark.executor.memory', '95g')
    conf.set('spark.executor.instances', '4')

    spark = SparkSession.builder \
        .config(conf=conf) \
        .enableHiveSupport() \
        .getOrCreate()
    spark.sql('set spark.sql.broadcastTimeout=30000')
    sc = spark.sparkContext
    sc.setLogLevel('WARN')

    start = time.time()
    inputPath = "data/amazon/complete_csv"
    schema = StructType([
        StructField("user", StringType(), True),
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# delete the old file and put the new file
import os
os.system("hadoop fs -rm -r -skipTrash /data/ratings_small.csv")
os.system("hdfs dfs -put ratings_small.csv /data")

print("start")

conf = SparkConf()
conf.setMaster("spark://master:7077")
conf.setAppName("recommend_train")
# Memory limits are Spark properties, not executor environment variables,
# so set them with conf.set() rather than setExecutorEnv()
conf.set("spark.executor.memory", "3g")
conf.set("spark.driver.memory", "9g")
sc = SparkContext(conf=conf)
# sc = SparkContext("local")

text = sc.textFile("/data/ratings_small.csv")
text = text.filter(lambda x: "movieId" not in x)  # drop the CSV header
# keep (userId, movieId, rating) and cast the fields to numeric types for ALS
movieRatings = text.map(lambda x: x.split(",")[:3]) \
    .map(lambda x: (int(x[0]), int(x[1]), float(x[2])))

print("start counting")
from pyspark.mllib.recommendation import ALS
model = ALS.train(movieRatings, 10, 10, 0.01)
model.save(sc, "/data/model1")
print(model.recommendProducts(1, 5))
from pyspark import SparkConf


def create_spark_conf(coreNum, nodeNum):
    print("coreNum:%s, nodeNum: %s" % (coreNum, nodeNum))
    sparkConf = SparkConf()
    sparkConf.setExecutorEnv("DL_ENGINE_TYPE", "mklblas")
    sparkConf.setExecutorEnv("MKL_DISABLE_FAST_MM", "1")
    sparkConf.setExecutorEnv("KMP_BLOCKTIME", "0")
    sparkConf.setExecutorEnv("OMP_WAIT_POLICY", "passive")
    sparkConf.setExecutorEnv("OMP_NUM_THREADS", "1")
    sparkConf.setExecutorEnv("DL_CORE_NUMBER", str(coreNum))
    sparkConf.setExecutorEnv("DL_NODE_NUMBER", str(nodeNum))
    sparkConf.set("spark.shuffle.blockTransferService", "nio")
    sparkConf.set("spark.scheduler.minRegisteredResourcesRatio", "1.0")
    return sparkConf
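A minimal sketch of wiring the returned SparkConf into a SparkContext; the core/node counts and app name are illustrative, not from the original source.

from pyspark import SparkContext

# Illustrative values: 4 cores per executor across 2 nodes.
conf = create_spark_conf(coreNum=4, nodeNum=2).setAppName("dl-training")
sc = SparkContext(conf=conf)
# Executor environment variables live under the spark.executorEnv.* keys
print(sc.getConf().get("spark.executorEnv.DL_CORE_NUMBER"))  # "4"
sc.stop()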
import os

import swiftclient
from pyspark import SparkConf, SparkContext


def main(ST_AUTH, ST_USER, ST_KEY, TASKS, CORES, BLASTN, QUERY_FILE, MODE, OBJECT_STORES):
    '''
    Main function

    ST_AUTH       - Object storage auth string where fna containers are found
    ST_USER       - Object storage user token
    ST_KEY        - Object storage secret token
    TASKS         - Number of tasks to launch, db partition factor
    CORES         - Number of cores to devote to each task
    BLASTN        - Location of blastn executable
    QUERY_FILE    - fasta query file
    MODE          - operation mode, 1 = top search, 2 = most common genome
    OBJECT_STORES - list of source containers that built the blast db
    '''
    # Set the context
    conf = SparkConf()
    conf.setExecutorEnv(key='Auth', value='value', pairs=None)
    sc = SparkContext(conf=conf)

    # Quiet the logs
    sc.setLogLevel("WARN")

    N = 5  # number of top results to take

    # Set our spark database creation script and add all the files that need to be on the
    # remote hosts to the shell script
    ShellScript = "spark_blast.bash"
    sc.addFile(ShellScript)
    sc.addFile(QUERY_FILE)

    # Copy over blastn if it is local
    if os.path.dirname(BLASTN) == "." or os.path.dirname(BLASTN) == "":
        sc.addFile(BLASTN)

    # Get the file name part of QUERY_FILE
    Query_File = os.path.basename(QUERY_FILE)

    # This will be our root name for our DB names
    container = "blastdb_" + "-".join(sorted(OBJECT_STORES)) + "_" + str(TASKS)

    # Log into swift
    conn = swiftclient.Connection(user=ST_USER, key=ST_KEY, authurl=ST_AUTH)

    # Verify the container we need is present
    if container not in [t['name'] for t in conn.get_account()[1]]:
        print("No database partition created for %s partition factor %d" %
              ("+".join(sorted(OBJECT_STORES)), TASKS))
        exit()

    # Get the list of objects we are going to need
    dbs = {}
    print("Collecting DBs from " + container)
    for data in conn.get_container(container)[1]:
        base = data['name'].split('.', 1)[0]
        if base not in dbs:
            dbs[base] = []
        dbs[base].append(data['name'])

    # Assemble our task list
    files = []
    for db in dbs:
        files.append("%s %s %s %s" % (Query_File, container, db, " ".join(dbs[db])))

    # Distribute our data
    distData = sc.parallelize(files, TASKS)

    options = ""
    # TODO -- these didn't work when I used them, so I commented them out for now - Peter
    # Set our search options
    # if MODE == "1":
    #     options = "-max_target_seqs 1"
    # elif MODE == "2":
    #     options = ??

    # Pass our bash script our parameters; ideally we would like to pass the executor ID/task ID,
    # but this doesn't appear to be available in ver 2.1.1
    pipeRDD = distData.pipe(ShellScript, {
        'ST_AUTH': ST_AUTH,
        'ST_USER': ST_USER,
        'ST_KEY': ST_KEY,
        'THREADS': str(CORES),
        'OPTIONS': options,
        'BLASTN': BLASTN
    })

    # Now let the bash script do its work. This will run blast using our query file across all the
    # DB partitions searching for matching genomic reads.
    #
    #   Failing to fetch me at first keep encouraged,
    #   Missing me one place search another,
    #   I stop somewhere waiting for you.
    #       -- Walt Whitman - Leaves of Grass: Book 3, Song of Myself, Verse 52
    print("Search through all the DBs for matching sequence")
    if MODE == "1":
        query_count = pipeRDD.map(lambda x: (x.split(',')[0], (x.split(',')[2], x.split('|')[-1:][0]))) \
            .reduceByKey(lambda x, y: maxByIndex(x, y, 0)) \
            .sortByKey(True) \
            .map(lambda x: str(x[0]) + ", " + str(x[1][1]))
        for line in query_count.collect():
            print(line)
    elif MODE == "2":
        specie_count = pipeRDD.map(lambda x: (x.split(',')[11].split(' ', 1)[-1], x.split(',')[0])) \
            .distinct() \
            .map(lambda x: (x[0], 1)) \
            .reduceByKey(lambda x, y: x + y) \
            .sortByKey(False) \
            .map(lambda x: (str(x[0]) + ", " + str(x[1])))
        for line in specie_count.collect():
            print(line)
# -*- encoding: utf-8 -*-
from pyspark import SparkContext, SparkConf
from pyspark import StorageLevel
import random

conf = SparkConf()
conf.setExecutorEnv(
    'PYSPARK_PYTHON',
    '/home/classify/workspace/ENV_material_recommendation/bin/python')
conf.setExecutorEnv(
    'PYTHONPATH',
    '/home/classify/workspace/ENV_material_recommendation/lib/python2.7/site-packages:/home/classify/workspace/material_recommendation'
)
# conf.setExecutorEnv('PYSPARK_PYTHON', '/home/classify/workspace/ENV_qa-helper/bin/python')
# conf.setExecutorEnv('PYTHONPATH',
#                     '/home/classify/workspace/ENV_qa-helper/lib/python2.7/site-packages:/home/classify/workspace/qa-helper')
conf.set("spark.cores.max", "40")
conf.set("spark.scheduler.mode", "FAIR")
sc = SparkContext(conf=conf)
sc.setLogLevel('ERROR')


def custom_zip(rdd1, rdd2, npart=None):
    """
    see http://stackoverflow.com/questions/32084368/can-only-zip-with-rdd-which-has-the-same-number-of-partitions-error
    """
    def prepare(rdd, npart):
import itertools
import time
import subprocess
import sys
from commands import *

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import col
import optparse
import re
import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText

appName = "NetflowReplication:QA"
conf = SparkConf().setAppName(appName)
conf.setExecutorEnv(
    'PYTHONPATH',
    '/opt/spark/python:/opt/spark/python/lib/py4j-0.8.2.1-src.zip')
conf.set("spark.driver.maxResultSize", "2g")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

# three positional arguments are required, so argv must have at least 4 entries
if len(sys.argv) < 4:
    print "Usage: /opt/spark/bin/spark-submit " + sys.argv[0] + \
        " <netflow input path> <file with list of IP addresses to filter>" + \
        " <output filtered netflow text directory>"
    sys.exit()

path = sys.argv[1]
input_ip = sys.argv[2]
output = sys.argv[3]

list = []
    'INTERNAL_PROC_ERAB_SETUP': [12, 13, 19, 20],
    'INTERNAL_PROC_INITIAL_CTXT_SETUP': [12, 13, 20, 21],
    'INTERNAL_PROC_UE_CTXT_RELEASE': [17, 21, 22, 23],
    'INTERNAL_PROC_HO_PREP_S1_IN': [17, 18, 19],
    'INTERNAL_PROC_HO_PREP_X2_IN': [18, 19, 20],
    'INTERNAL_PROC_RRC_CONN_SETUP': [12, 13],
    'INTERNAL_PROC_S1_SIG_CONN_SETUP': [13]}

# os.environ['PYSPARK_PYTHON'] = '/usr/bin/python'
# py2.7 timedelta.total_seconds()
# NUM_PARTITIONS = 2000

path = '/user/mfoo/20160318tmp/seqFile.seq'

conf = SparkConf()
conf.set('spark.yarn.dist.files',
         'file:/home/wfoo/install/spark1.4/python/lib/pyspark.zip,'
         'file:/home/wfoo/install/spark1.4/python/lib/py4j-0.8.2.1-src.zip')
conf.setExecutorEnv('PYTHONPATH', 'pyspark.zip:py4j-0.8.2.1-src.zip')
# conf.set("dynamicAllocation.enabled", "true")
conf.set("spark.yarn.executor.memoryOverhead", 8192)
conf.set("spark.yarn.driver.memoryOverhead", 8192)
# conf.set("spark.executor.memory", "6g")
# conf.set("spark.driver.memory", "6g")
conf.set("spark.rdd.compress", "true")
conf.set("spark.storage.memoryFraction", 1)
conf.set("spark.core.connection.ack.wait.timeout", 600)
conf.set("spark.akka.frameSize", 50)
# conf.set("spark.local.dir", "/data1/hadoop")
# conf.set("spark.driver.maxResultSize", "32g")
# conf.setMaster("yarn-client")

sc = SparkContext(appName="hpa_stats", conf=conf)
evt = sc.broadcast(EVENT_NAME)
fld = sc.broadcast(eventFields[EVENT_NAME])
# sc = SparkContext("local[2]", "data analyse")
# conf = SparkConf().setAppName("data analyse").setMaster('spark://10.0.2.15:7077')
# sc = SparkContext(conf=conf)

# config for SparkContext
SPARK_HOME = os.environ['SPARK_HOME']
conf = SparkConf().setMaster('spark://10.0.2.15:7077').set(
    "spark.executor.memory", "2g").set("spark.cores.max", "65")
site_packages = os.path.abspath(os.path.join(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir),
    os.pardir)) + '/site-packages'
pyspark = SPARK_HOME + "/python" + ":" + SPARK_HOME + "/python/lib/py4j-0.9-src.zip" + \
    ":" + SPARK_HOME + "/python/lib/pyspark.zip"
conf.setExecutorEnv(
    "PYTHONPATH",
    "$PYTHONPATH:" + site_packages + ":" + pyspark + ":" + os.path.abspath(
        os.path.join(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir),
            os.pardir)))

ROOTDIR = '/home/gugugujiawei/projects/data'


@main.route('/')
def index():
    return render_template('index.html')


@main.route('/cls/logistic')
def logistic():
    return render_template('reactjs/index.html')
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Number of elements in RDD is 8
Computation succeeded!
"""
from pyspark import SparkConf
from pyspark import SparkContext

conf = SparkConf()
conf.setMaster('yarn')
conf.setAppName('spark-yarn')
conf.setExecutorEnv('HADOOP_CONF_DIR', '$HADOOP_HOME/etc/hadoop')
conf.setExecutorEnv('YARN_CONF_DIR', '$HADOOP_HOME/etc/hadoop')
sc = SparkContext(conf=conf)


def mod(x):
    import numpy as np
    return (x, np.mod(x, 2))


rdd = sc.parallelize(range(1000)).map(mod).take(10)
print(rdd)

"""
>>> SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/gupengxiang/.local/lib/python3.6/site-packages/pyspark/jars/slf4j-log4j12-1.7.16.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-5.11.0-1.cdh5.11.0.p0.34/jars/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]