def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']
    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the received Kafka payload
    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)
    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")
    wordcounts.pprint()  # show the distribution of the generated random numbers

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def __connected_yarn_spark_cluster(self, pilotcompute_description):
    number_cores = 1
    if "number_cores" in pilotcompute_description:
        number_cores = int(pilotcompute_description["number_cores"])

    number_of_processes = 1
    if "number_of_processes" in pilotcompute_description:
        number_of_processes = int(pilotcompute_description["number_of_processes"])

    executor_memory = "1g"
    if "physical_memory_per_process" in pilotcompute_description:
        executor_memory = pilotcompute_description["physical_memory_per_process"]

    conf = SparkConf()
    conf.set("spark.num.executors", str(number_of_processes))
    conf.set("spark.executor.instances", str(number_of_processes))
    conf.set("spark.executor.memory", executor_memory)
    conf.set("spark.executor.cores", number_cores)
    if pilotcompute_description is not None:
        for i in pilotcompute_description.keys():
            if i.startswith("spark"):
                conf.set(i, pilotcompute_description[i])
    conf.setAppName("Pilot-Spark")
    conf.setMaster("yarn-client")
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)
    pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
    return pilot
def setUp(self):
    conf = SparkConf().setAppName('testing').setMaster('local[2]').set('spark.driver.host', 'localhost')
    conf.set('spark.ui.showConsoleProgress', False)
    self.session = SparkSession.builder.config(conf=conf).getOrCreate()

    self.test_data = [
        ('Ricardo', 'engineering', 2),
        ('Tisa', 'sales', 3),
        ('Sheree', 'marketing', 4),
        ('Chantelle', 'engineering', 5),
        ('Kylee', 'finance', 2),
        ('Tamatha', 'marketing', 5),
        ('Trena', 'engineering', 2),
        ('Arica', 'engineering', 1),
        ('Santina', 'finance', 2),
        ('Daria', 'marketing', 1),
        ('Magnolia', 'sales', 2),
        ('Antonina', 'finance', 1),
        ('Sumiko', 'engineering', 1),
        ('Carmen', 'sales', 2),
        ('Delois', 'engineering', 1),
        ('Luetta', 'marketing', 3),
        ('Yessenia', 'sales', 1),
        ('Petra', 'engineering', 3),
        ('Charisse', 'engineering', 4),
        ('Lillian', 'engineering', 3),
        ('Wei', 'engineering', 2),
        ('Lahoma', 'sales', 2),
        ('Lucilla', 'marketing', 1),
        ('Stephaine', 'finance', 2),
    ]
def configureSpark():
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("Apache Spark Alarm Parser")
    conf.set("spark.executor.memory", "1g")
    sc = SparkContext(conf=conf)
    return sc
def setUpClass(cls):
    class_name = cls.__name__
    conf = SparkConf()
    conf.set('spark.app.name', class_name)

    # Read the spark configuration and update the spark conf
    test_spark_config = ConfigParser.ConfigParser()
    test_spark_config.read('test_config.cfg')
    test_spark_config.sections()
    configs = dict(test_spark_config.items('spark_conf_test_generic'))
    for k, v in configs.items():
        conf.set(k, v)
    cls.spark_test_configs = configs

    # Create the spark context
    cls.sc = SparkContext(conf=conf)

    if 'PYSPARK_DRIVER_PYTHON' in configs.keys():
        cls.sc.pythonExec = configs['PYSPARK_DRIVER_PYTHON']
    else:
        cls.sc.pythonExec = 'python2.7'

    logger = cls.sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s: %(message)s')
    cls.logger = logging.getLogger(__name__)
    cls.logger.setLevel(logging.DEBUG)
class SparkContextFactory:
    def __init__(self):
        # not sure why windows environment variable can't be read, I set it
        ##os.environ["SPARK_HOME"] = "C:\Spark"
        # not sure why windows environment variable can't be read, I set it
        ##os.environ["HADOOP_CONF_DIR"] = "C:\hdp\bin"
        ##sys.path.append("C:\Spark\python")
        ##sys.path.append("C:\Spark\bin")

        # specify spark home
        os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark"
        # specify pyspark path so its libraries can be accessed by this application
        sys.path.append("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark/python")

        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SQLContext

        self.conf = SparkConf().setMaster("yarn-client")
        self.conf.setAppName("MrT")
        self.conf.set("spark.executor.memory", "5g")
        self.conf.set("spark.driver.memory", "10g")

        self.sc = SparkContext(conf=self.conf,
                               pyFiles=["ComputeCovHistory.py", "go.py", "risk_DSconvert.py",
                                        "ewstats.py", "ewstatsRDD.py", "ewstatswrap.py"])

        """
        toDF method is a monkey patch executed inside SQLContext constructor
        so to be able to use it you have to create a SQLContext first
        """
        self.sqlContextInstance = SQLContext(self.sc)

    def disconnect(self):
        self.sc.stop()
def get_default_spark_conf():
    conf = SparkConf(). \
        setAppName("pyunit-test"). \
        setMaster("local-cluster[3,1,2048]"). \
        set("spark.ext.h2o.disable.ga", "true"). \
        set("spark.driver.memory", "2g"). \
        set("spark.executor.memory", "2g"). \
        set("spark.ext.h2o.client.log.level", "DEBUG"). \
        set("spark.ext.h2o.repl.enabled", "false"). \
        set("spark.task.maxFailures", "1"). \
        set("spark.rpc.numRetries", "1"). \
        set("spark.deploy.maxExecutorRetries", "1"). \
        set("spark.network.timeout", "360s"). \
        set("spark.worker.timeout", "360"). \
        set("spark.ext.h2o.backend.cluster.mode", ExternalClusterTestHelper.cluster_mode()). \
        set("spark.ext.h2o.cloud.name", ExternalClusterTestHelper.unique_cloud_name("test")). \
        set("spark.ext.h2o.external.start.mode", os.getenv("spark.ext.h2o.external.start.mode", "manual")). \
        set("spark.sql.warehouse.dir", "file:" + os.path.join(os.getcwd(), "spark-warehouse"))

    if ExternalClusterTestHelper.tests_in_external_mode():
        conf.set("spark.ext.h2o.client.ip", ExternalClusterTestHelper.local_ip())
        conf.set("spark.ext.h2o.external.cluster.num.h2o.nodes", "2")

    return conf
def getSparkContext(self, appName, master):
    print(appName)
    print(master)
    conf = SparkConf().setAppName(appName).setMaster(master)
    conf.set("spark.local.ip", "127.0.0.1")
    conf.set("spark.driver.host", "127.0.0.1")
    return SparkContext(conf=conf)
def main():
    """
    Main entry point of the application
    """
    # Create spark configuration and spark context
    include_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'preprocessing.py'))
    conf = SparkConf()
    conf.set('spark.executor.memory', '1500m')
    conf.setAppName("Generating predictions")
    sc = SparkContext(conf=conf, pyFiles=[include_path])

    # Set S3 configuration
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", os.environ['AWS_ACCESS_KEY'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", os.environ['AWS_SECRET_KEY'])

    # Single-pass predictions
    fast_predict(sc,
                 file_input="s3n://twitter-stream-data/twitter-*",
                 file_output="s3n://twitter-stream-predictions/final",
                 sports_model="PyTwitterNews/models/sports.model",
                 politics_model="PyTwitterNews/models/politics.model",
                 technology_model="PyTwitterNews/models/technology.model")

    # Stop application
    sc.stop()
def main():
    conf = SparkConf()
    conf.set("spark.default.parallelism", "24")
    sc = SparkContext(appName="PhoneLab Preprocessing", conf=conf)

    lines = sc.textFile(data_files, use_unicode=False)

    # Create LogLine objects and filter out empty lines
    logs = lines.flatMap(ll_mapper)

    # Save in an intermediate format
    logs.saveAsTextFile(out_dir, compressionCodecClass=codec)
    return
    # NOTE: the early return above skips the gap-detection stage below.

    # Gap detection
    keyed = logs.map(ll_gap_map)
    merged = keyed.groupByKey()

    # At this point we have ((boot_id, date), [line_num]) tuples. The last step
    # is to find all the gaps within each key/tuple.
    result = merged.flatMap(find_gaps)
    gaps = result.collect()

    fd = open("/spark/gaps.json", 'w')
    fd.write(json.dumps(gaps, indent=4))
def start_spark(self, spark_conf=None, executor_memory=None, profiling=False,
                graphframes_package='graphframes:graphframes:0.3.0-spark2.0-s_2.11',
                extra_conf=None):
    """Launch a SparkContext

    Parameters
    ----------
    spark_conf: path
        path to a spark configuration directory
    executor_memory: string
        executor memory in java memory string format, e.g. '4G'
        If `None`, `memory_per_executor` is used.
    profiling: boolean
        whether to turn on python profiling or not
    graphframes_package: string
        which graphframes to load - if it isn't found, spark will attempt to download it
    extra_conf: dict
        additional configuration options
    """
    os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages {graphframes_package} pyspark-shell"\
        .format(graphframes_package=graphframes_package)

    if spark_conf is None:
        spark_conf = os.path.join(os.environ['SPARK_HOME'], 'conf')

    os.environ['SPARK_CONF_DIR'] = os.path.realpath(spark_conf)
    os.environ['PYSPARK_PYTHON'] = sys.executable

    try:
        import findspark; findspark.init()
        from pyspark import SparkContext, SparkConf
    except ImportError:
        raise ImportError("Unable to find pyspark -- are you sure SPARK_HOME is set?")

    conf = SparkConf()
    conf.set('spark.driver.maxResultSize', '0')
    if executor_memory is None:
        executor_memory = '%dM' % self.memory_per_executor
    conf.set('spark.executor.memory', executor_memory)

    if profiling:
        conf.set('spark.python.profile', 'true')
    else:
        conf.set('spark.python.profile', 'false')

    if extra_conf is not None:
        for k, v in extra_conf.items():
            conf.set(k, v)

    sc = SparkContext(master=self.master_url(), conf=conf)
    return sc
def main():
    # Setting the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    # Creating an SQL context to perform SQL queries
    sqlContext = SQLContext(sc)

    # Define the data path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"
    json_file_path = os.path.join(curr_path + "/../Spark_Jobs/data/", json_name)

    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)

    # Read from parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")

    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']
    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the received Kafka payload
    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)
    print(wordcounts)

    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    wordcounts.pprint()  # show the distribution of the generated random numbers

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
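The snippet above calls storeOffsetRanges and printOffsetRanges without defining them; a minimal sketch of these helpers, modeled on the standard PySpark direct-Kafka offset-tracking example (assumed here, not taken from this snippet's source):

offsetRanges = []

def storeOffsetRanges(rdd):
    # Capture the Kafka offset ranges of each batch RDD so they can be inspected later.
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd

def printOffsetRanges(rdd):
    # Print topic / partition / offset boundaries for the batch just processed.
    for o in offsetRanges:
        print("%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset))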
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data").setMaster('local[*]')
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=spark_conf)
    GA.logInConsole(0, "input file read!")
    rdd = sc.textFile("/home/fatemeh/Data/saveData.txt", minPartitions=500, use_unicode=False)
    rdd.unpersist()
    # print('\nNumber of Partitions for this run: ', rdd.getNumPartitions())
    vectorRDD = rdd.map(lambda line: toVector(line, splitter=' '))
    GA.logInConsole(0, "Data Vectorized!")

    ss = list()
    GA.logInConsole(-1, 'Start the ensemble')
    GA.logInConsole(-10, "GA with range 3")
    ss.append(GA.parallel_GA_main(vectorRDD, sc, 5))
    # GA.logInConsole(-10, "GA with range 4")
    # ss.append(GA.parallel_GA_main(norm, sc, 4))
    # GA.logInConsole(-10, "GA with range 5")
    # ss.append(GA.parallel_GA_main(norm, sc, 5))
    # GA.logInConsole(-10, "GA with range 3 and Sampled data set")
    # sampleRDD = norm.sample(False, 0.6, seed=10)
    # ss.append(GA.parallel_GA_main(sampleRDD, sc, 3))
    print(ss)
    # selectedSS = voted_subsapces(ss)
    # SSD.outlierDetection(vectorRDD, ss)
    GA.logInConsole(100, "\nend of program")
    sc.stop()
def _test_broadcast_on_driver(self, *extra_confs):
    conf = SparkConf()
    for key, value in extra_confs:
        conf.set(key, value)
    conf.setMaster("local-cluster[2,1,1024]")
    self.sc = SparkContext(conf=conf)
    bs = self.sc.broadcast(value=5)
    self.assertEqual(5, bs.value)
class OWSparkContext(SharedSparkContext, widget.OWWidget):
    priority = 0
    name = "Context"
    description = "Create a shared Spark (sc) and Hive (hc) Contexts"
    icon = "../icons/spark.png"

    want_main_area = False
    resizing_enabled = True

    conf = None

    def __init__(self):
        super().__init__()

        # The main label of the Control's GUI.
        # gui.label(self.controlArea, self, "Spark Context")

        self.conf = SparkConf()
        all_prefedined = dict(self.conf.getAll())

        # Create parameters Box.
        box = gui.widgetBox(self.controlArea, "Spark Application", addSpace=True)

        self.gui_parameters = OrderedDict()

        main_parameters = OrderedDict()
        main_parameters['spark.app.name'] = 'OrangeSpark'
        main_parameters['spark.master'] = 'yarn-client'
        main_parameters["spark.executor.instances"] = "8"
        main_parameters["spark.executor.cores"] = "4"
        main_parameters["spark.executor.memory"] = "8g"
        main_parameters["spark.driver.cores"] = "4"
        main_parameters["spark.driver.memory"] = "2g"
        main_parameters["spark.logConf"] = "false"
        main_parameters["spark.app.id"] = "dummy"

        for k, v in main_parameters.items():
            default_value = all_prefedined.setdefault(k, v)
            self.gui_parameters[k] = GuiParam(parent_widget=box, label=k, default_value=v)
            all_prefedined.pop(k)

        for k, v in all_prefedined.items():
            self.gui_parameters[k] = GuiParam(parent_widget=box, label=k, default_value=str(v))

        action_box = gui.widgetBox(box)
        # Action Button
        self.create_sc_btn = gui.button(action_box, self, label='Submit', callback=self.create_context)

    def onDeleteWidget(self):
        if self.sc:
            self.sc.stop()

    def create_context(self):
        for key, parameter in self.gui_parameters.items():
            self.conf.set(key, parameter.get_value())

        self.sc = SparkContext(conf=self.conf)
        self.hc = HiveContext(self.sc)
def create_spark_context(app_name="Quiz Bowl", lm_memory=False, profile=False):
    spark_conf = SparkConf()
    if lm_memory:
        pass
        # spark_conf = spark_conf.set('spark.max.cores', 30).set('spark.executor.cores', 30)
    if profile:
        spark_conf = spark_conf.set('spark.python.profile', True)
    spark_conf = spark_conf.set('spark.akka.frameSize', 300)
    return SparkContext(appName=app_name, master=QB_SPARK_MASTER, conf=spark_conf)
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data")
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=spark_conf)
    rdd = load_data(sc)
    print(rdd.getNumPartitions())
    parallel_GA_main(sc, rdd, 5)
    sc.stop()
def init_spark_context():
    # load spark context
    conf = SparkConf().setAppName("ctr-server")
    conf.set('spark.kryoserializer.buffer', '512mb')
    conf.set('spark.kryoserializer.buffer.max', '512')
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['/home/ec2-user/engine.py', '/home/ec2-user/app.py'])
    return sc
def _configure_spark(self):
    logger.info('Configuring Spark')
    sconf = SparkConf()
    for prop, value in self.sm_config['spark'].iteritems():
        if prop.startswith('spark.'):
            sconf.set(prop, value)

    self.sc = SparkContext(master=self.sm_config['spark']['master'], conf=sconf, appName='SM engine')
    if not self.sm_config['spark']['master'].startswith('local'):
        self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))
def createSparkConf():
    from pyspark import SparkConf
    test_properties = conftest.test_properties()

    conf = SparkConf()
    conf.set("cloudant.host", test_properties["cloudanthost"])
    conf.set("cloudant.username", test_properties["cloudantusername"])
    conf.set("cloudant.password", test_properties["cloudantpassword"])

    return conf
def main():
    # Spark Configurations
    conf = SparkConf()
    conf.set("spark.master", "local[*]")
    conf = conf.setAppName('Learning PySpark')
    sc = SparkContext(conf=conf)

    df = sc\
        .textFile('IXQ_20170622080001.csv')\
        .map(lambda line: line.split(','))
    print(df.take(5))
def init_spark_context():
    # load spark context
    conf = SparkConf().setAppName("event-contour-server")
    conf.setMaster("local[4]")
    conf.setAppName("reduce")
    conf.set("spark.executor.memory", "4g")
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['app.py', 'contourGenerator.py', 'EventParallelize.py'])
    return sc
def home3(request):
    #spark_home = os.environ['SPARK_HOME'] = '/usr/local/spark-1.5.2-bin-2.7.1/' #'/usr/local/spark/'
    #sys.path.insert(0,os.path.join(spark_home,'python'))
    #sys.path.insert(0,os.path.join(spark_home,'python/lib/py4j-0.8.2.1-src.zip'))
    #from pyspark import SparkContext, SparkConf
    #sc = SparkContext()
    #data=[1,2,3,4,5]
    #distData = sc.parallelize(data)
    #first = distData.take(1)
    #sc.stop()

    prefs = ["worldnews", "politics", "Economics", "Libertarian"]

    scfg = SparkConf()
    scfg.set("spark.cores.max", 64)
    sc = SparkContext(master="spark://final-gateway:7077", appName="reddit-cf", conf=scfg)

    #data=[1,2,3,4,5]
    #distData = sc.parallelize(data)
    #first = distData.take(1)
    #sc.stop()

    try:
        # prep data
        raw_counts = sc.textFile("hdfs://final-gateway/w251_cf-user-site-total")
        parsed_counts = raw_counts.map(lambda st: eval(st))
        all_ratings = parsed_counts.map(tup_to_rating)

        # assign user-identified preferred subreddits
        raw_prefs = [(999, x, 100) for x in prefs]
        my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)

        # train model
        model_input = all_ratings.union(my_prefs)
        #model = ALS.trainImplicit(model_input, 10, 10, alpha=.01)

        # candidate prefs for prediction
        #my_prefs_ids = set([javahash(x) for x in prefs])
        #all_subreddit_ids = parsed_counts.map( lambda (a,b,c): (javahash(b),b) ).distinct().cache()
        #candidates = all_subreddit_ids.map(lambda (a,b): a ).filter( lambda r: r not in my_prefs_ids)
        #predictions = model.predictAll(candidates.map( lambda x: (999, x))).cache()
        #final = predictions.map(lambda (a,b,c): (b,c)).join(all_subreddit_ids).map(lambda (b,(c,d)): (c,d) ).sortByKey(False)
        #output = list( final.take(30) )

        sc.stop()
        #return output
        recommends = ["asfd"]  # output
    except Exception as e:
        print("App failed. Stopping gracefully")
        sc.stop()
        raise Exception(e)
def init(self):
    os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
    # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
    # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
    conf = SparkConf()
    conf.setMaster("local[10]")
    conf.setAppName("PySparkShell")
    conf.set("spark.executor.memory", "2g")
    conf.set("spark.driver.memory", "1g")
    self.sc = SparkContext(conf=conf)
    self.sqlContext = SQLContext(self.sc)
def _test_multiple_broadcasts(self, *extra_confs):
    """
    Test broadcast variables make it OK to the executors.
    Tests multiple broadcast variables, and also multiple jobs.
    """
    conf = SparkConf()
    for key, value in extra_confs:
        conf.set(key, value)
    conf.setMaster("local-cluster[2,1,1024]")
    self.sc = SparkContext(conf=conf)
    self._test_encryption_helper([5])
    self._test_encryption_helper([5, 10, 20])
def main():
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set("spark.dynamicAllocation.enabled", "true")

    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)  # Stream every 1 second
    ssc.checkpoint("checkpoint")

    # Clear the cassandra table
    init_cassandra().execute('TRUNCATE {}'.format(top_airports_table))

    stream_kafka(ssc)
def __connected_spark_cluster(self, resource_url, pilot_description=None):
    conf = SparkConf()
    conf.setAppName("Pilot-Spark")
    if pilot_description is not None:
        for i in pilot_description.keys():
            if i.startswith("spark"):
                conf.set(i, pilot_description[i])
    conf.setMaster(resource_url)
    print(conf.toDebugString())
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)
    pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
    return pilot
def create_spark_instance(master="local", conf=None):
    """
    master: default "local"
    conf: default 28 cores with 2g memory
    """
    if not conf:
        conf = SparkConf()
        conf.set("spark.executor.memory", "2g")
        conf.set("spark.cores.max", "28")
        conf.setAppName("spark ipython notebook")
    spark_context = SparkContext(master, conf=conf)
    return spark_context
class FireStarter():

    mappings = {
        'http_api': readers.HttpApi,
        'lighter': igniters.Lighter,
        'hdfs': writers.HadoopFileSystem
    }

    def __init__(self, config_file):
        self.config_file = config_file

    def read_config_file(self):
        with open(self.config_file, 'r+') as config_data:
            self.config_data = config_data.read()

    def parse_config_contents(self):
        self.config = json.loads(self.config_data)
        check_requirements = required_config - frozenset(self.config.keys())
        if check_requirements:
            raise ValueError('%s must contain %s' % (self.config_file, ', '.join(check_requirements)))

    def load_modules(self):
        """This loop initializes all of the readers, writers, and igniters
        then stores them in an array"""
        self.modules = OrderedDict()
        self.data = OrderedDict()
        for module in self.config['modules']:
            # Access the module via name, or by order
            new_module = self.modules[module['name']] = self.mappings[module['type']](**module['parameters'])
            self.data[module['name']] = new_module.data

    def create_spark_context(self):
        conf = self.config['spark_conf']
        self.spark_config = SparkConf()
        self.spark_config.setAppName(conf['app_name'])
        for attribute, value in conf['parameters'].items():
            self.spark_config.set(attribute, value)
        self.sc = SparkContext(conf=self.spark_config)

    def run_modules(self):
        for name, module in self.modules.items():
            module.execute()

    def execute(self):
        self.read_config_file()
        self.parse_config_contents()
        self.load_modules()
        self.run_modules()
#!/usr/bin/env python
import sys
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("OrphanPages")
conf.set("spark.driver.bindAddress", "127.0.0.1")
sc = SparkContext(conf=conf)

lines = sc.textFile(sys.argv[1], 1)

#TODO
def getPages(line):
    line = line.rstrip()
    lines = line.split(":")
    p = lines[0].strip('\t\r\n\0 ')
    c = lines[1].strip('\t\r\n\0').split(" ")
    for val in c[:]:  # iterate over a copy so removing items does not skip elements
        if not val.isdigit():
            c.remove(val)
    res = [int(p)] + list(map(int, c))
    res[0] = -res[0]
    return res

def getVal(page):
    if page < 0:
        return (abs(page), 1)  # possible orphan
    else:
        return (page, 0)  # child

orphans = lines.flatMap(lambda line: getPages(line)) \
    .map(lambda p: getVal(p)) \
    denom_ = math.sqrt(len(bus_features)) * math.sqrt(len(user_features))
    return num_ / denom_


if __name__ == "__main__":
    start = time.time()

    # input_fp = sys.argv[1]
    # model_fp = sys.argv[2]
    # output_fp = sys.argv[3]
    input_fp = "./data/test_review.json"
    model_fp = "./data/task2.model"
    output_fp = "./data/task2.predict"

    conf = SparkConf()
    conf.set("spark.executor.memory", "8g")
    conf.set("spark.driver.memory", "8g")
    sc = SparkContext(conf=conf)

    # Read in test data:
    test_rdd = sc.textFile(input_fp)
    user_business_rdd = test_rdd \
        .map(lambda x: json.loads(x)) \
        .map(lambda u_b: (u_b["user_id"], u_b["business_id"]))

    # Read in model components:
    model = sc.textFile(model_fp)
    business_profiles = model \
        .map(lambda x: json.loads(x)) \
        .map(lambda x: x["business"]) \
        .flatMap(lambda x: [(k["business_id"], k["feature_vector"]) for k in x]) \
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext  # needed for StreamingContext below (import assumed missing from the fragment)
import pickle
from imutils.face_utils import FaceAligner
from imutils.face_utils import rect_to_bb
import numpy as np
import imutils
import dlib
import cv2
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

conf = SparkConf().setAppName("drunk streaming v2").setMaster("yarn")
conf.set("spark.scheduler.mode", "FAIR")
conf.set("spark.scheduler.allocation.file", "/opt/spark-2.4.3-bin-hadoop2.7/conf/fairscheduler.xml")
sc = SparkContext(conf=conf)
sc.setLocalProperty("spark.scheduler.pool", "pool2")
ssc = StreamingContext(sc, 0.2)
sql_sc = SQLContext(sc)

input_topic = 'input'
output_topic = 'output2'
brokers = "G01-01:2181,G01-02:2181,G01-03:2181,G01-04:2181,G01-05:2181,G01-06:2181,G01-07:2181,G01-08:2181," \
          "G01-09:2181,G01-10:2181,G01-11:2181,G01-12:2181,G01-13:2181,G01-14:2181,G01-15:2181,G01-16:2181"


def my_decoder(s):
    return s
    d['count'] = 1
    d = d.groupby(col, as_index=False)['count'].sum()
    d = d[d['count'] >= 10]
    freqfeas[col] = set(d[col])

def logFun(x):
    #x = int(1000*x)
    x = int(x)
    if x < 2:
        return "sp" + str(x)
    else:
        return str(int(math.log(float(x))**2))

os.environ["SPARK_HOME"] = "/home/hadoop/spark-2.0.1-bin-hadoop2.7"  # KeyError: 'SPARK_HOME'
conf = SparkConf()
conf.set("spark.hadoop.validateOutputSpecs", "false")
conf.setMaster('spark://master:7077')
sc = SparkContext(appName='Tpai', conf=conf)
sc.setLogLevel('warn')
sqlContext = SQLContext(sc)

train = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', charset="utf-8") \
    .load('hdfs://192.168.1.118:9000/home/hadoop/dup/train_xgb113U.csv')
df = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', charset="utf-8") \
    .load('hdfs://192.168.1.118:9000/home/hadoop/dup/train_xgb113U_df.csv')

y = train[['label']]
import re
import sys
# imports needed by the code below (assumed missing from the fragment)
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram

def part2(context):
    file = context[0]
    words = re.sub('[^a-z0-9]+', ' ', context[1].lower()).split()
    file = file.split("/")[-1]
    return (file, words)

#configuring spark
conf = SparkConf()
conf.setAppName("part2_uni")
conf.set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)

#reading input
lines = sc.wholeTextFiles("/cosc6339_s17/books-longlist/")

#configuring SparkSession
spark = SparkSession(sc)
hasattr(lines, "toDF")

#tokenizing the words and converting into dataframes
tokenize = lines.map(part2).toDF(["bookname", "words"])

#converting into unigrams
unigram = NGram(n=1, inputCol="words", outputCol="unigrams")
unigramdataframe = unigram.transform(tokenize)
    return (new_vals0 + last_vals0,
            new_vals1 + last_vals1)

######
###### Main script #######
######

signal.signal(signal.SIGINT, signal_handler)

dynamo = dynamodb2.connect_to_region(AWS_REGION)
out_table = Table(DB_TABLE, connection=dynamo)

config = SparkConf()
config.set('spark.streaming.stopGracefullyOnShutdown', True)
#config.set('spark.yarn.executor.memoryOverhead', '2g')
sc = SparkContext(appName='g2ex1', conf=config, pyFiles=['flight.py'])
ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp/g2ex1')

lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
filtered = lines.map(lambda line: line.split(","))\
    .map(lambda fields: Flight(fields))\
    .filter(lambda fl: fl.Cancelled == 0)\
    .map(lambda fl: ((fl.Origin, fl.UniqueCarrier), (fl.DepDelay, 1)))\
    .updateStateByKey(updateFunction)

filtered.foreachRDD(lambda rdd: rdd.foreachPartition(save_partition))
    records = spark.sql(sql)
    return records


def transform_to_row(t):
    app_package, status = t[0].split('sweeroty')
    return Row(app_package=app_package, status=int(status), count=int(t[1]))


if __name__ == '__main__':
    print('====> Initializing Spark APP')
    localConf = RawConfigParser()
    localConf.read('../config')
    sparkConf = SparkConf()
    for t in localConf.items('spark-config'):
        sparkConf.set(t[0], t[1])
    spark = SparkSession.builder \
        .appName('RLab_Stats_Report___Prepare_APP_Installment_Stats') \
        .config(conf=sparkConf) \
        .enableHiveSupport() \
        .getOrCreate()
    sc = spark.sparkContext
    sc.setLogLevel('ERROR')

    print('====> Parsing local arguments')
    parser = argparse.ArgumentParser()
    parser.add_argument('--query_month', type=str)
    args = parser.parse_args()
    fr = args.query_month + '01'
    to = args.query_month + str(monthrange(int(args.query_month[:4]), int(args.query_month[4:]))[1])
from pyspark import SparkConf, SparkContext
from operator import add
from pyspark.sql import SQLContext
from nltk.corpus import stopwords
import re
import string
import nltk
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

conf = SparkConf()
conf.setAppName("Inverted Index")
conf.set("spark.ui.port", "4091")
conf.set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

path1 = spark.read.load(
    "/bigd45/out312/part-00000-16a6ace5-c303-44c5-aa2e-9b2a0f4259ac-c000.snappy.parquet"
)

removePunct = (lambda x: x not in string.punctuation)
path = "/cosc6339_hw2/gutenberg-500/"

finalWords = []
out = path1.collect()
for (count, word) in out:
    out1 = word
    finalWords.append(out1)

rdd = sc.wholeTextFiles(path)
    print(pairs_rdd.collect())

    frequencies_rdd = pairs_rdd.reduceByKey(lambda a, b: a + b)
    print(frequencies_rdd.collect())

    # filter out words with fewer than threshold occurrences
    filtered_rdd = frequencies_rdd.filter(lambda word_count: word_count[1] >= threshold)
    print(filtered_rdd.collect())


if __name__ == '__main__':
    conf = SparkConf()
    conf.setAppName("WordCount")
    conf.set('spark.executor.memory', '500M')
    conf.set('spark.cores.max', 4)

    try:
        sc = SparkContext(conf=conf)
    except:
        print("Failed to connect!")
        print(sys.exc_info()[0])

    # sys.argv[0] is the name of the script.
    # sys.argv[1] is the first parameter: filename
    # sys.argv[2] is the second parameter: threshold
    input_path = sys.argv[1]  # "file:///Users/mparsian/sample.txt"
    print("input_path: {}".format(input_path))

    # get threshold
    threshold = int(sys.argv[2])
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Create a spark configuration
conf = SparkConf()
# set master
conf.setMaster('local')
# set app name
conf.setAppName("Some spark")
# spark config
conf.set("spark.cores.max", "1")
# spark config
conf.set("spark.executor.memory", "1g")

# Create spark context
sc = SparkContext(conf=conf)

# Create a labeled point with a positive label and a dense feature vector.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])

# Create a labeled point with a negative label and a sparse feature vector.
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))

print(neg)
        outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, countVectorizer])
    model = pipeline.fit(df)
    documents = model.transform(df).select("features").rdd.map(
        lambda x: x.features).zipWithIndex().map(lambda x: [x[1], x[0]])
    return documents, model.stages[1].vocabulary


if __name__ == '__main__':
    start_time1 = time.time()
    args = sys.argv

    sconf = SparkConf()
    sconf.setAppName("lda")
    sconf.setMaster(args[1])
    sconf.set("spark.executor.memory", "6g")
    sconf.set("spark.driver.memory", "6g")
    sconf.set("spark.driver.maxResultSize", "6g")
    sconf.set("spark.yarn.executor.memoryOverhead", "2g")
    sconf.set("spark.yarn.driver.memoryOverhead", "2g")
    sconf.set("spark.eventLog.enabled", "true")
    sconf.set("spark.eventLog.dir", "hdfs://" + args[3] + "/user/" + args[4] + "/Logs/")
    sc = SparkContext(conf=sconf)

    dataset = "hdfs://" + args[3] + "/user/" + args[4] + "/In/" + args[2]
    corpus, vocabArray = preprocess(sc, path=dataset, vocabsize=50000, stopwordfile='')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: [email protected]
from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = '/export/conda3/envs/pyspark-exp/bin/python3'

if __name__ == "__main__":
    ## Configure Spark parameters
    conf = SparkConf()
    conf.set('spark.master', "local[*]")  # yarn
    conf.set('spark.app.name', "word_count")
    ## Create the SparkContext object
    sc = SparkContext(conf=conf)

    # word.txt contents:
    # hello spark
    # hello hadoop
    # hello flink
    file_rdd = sc.textFile("hdfs:///data/word.txt")
    ## Flatten into an RDD of words: ['hello', 'spark', 'hello', 'hadoop', 'hello', 'flink']
    words_rdd = file_rdd.flatMap(lambda line: line.split(" "))
    print(words_rdd.collect())
    ## Map each word to a (word, 1) pair: [('hello', 1), ('spark', 1), ('hello', 1), ('hadoop', 1), ('hello', 1), ('flink', 1)]
    map_rdd = words_rdd.map(lambda x: (x, 1))
    print(map_rdd.collect())
from pyspark import SparkConf
from pyspark.sql import SparkSession
from operator import add
import sys
from pyspark import SparkContext

conf = SparkConf()
conf.setAppName('Assignment3')
conf.set('spark.executor.memory', '2g')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)


def function1(param):
    a = param[0]
    b = param[1]
    w = a[1] * b[1]
    return (a[0], b[0], w)


def function2(param):
    list = []
    for i in range(len(param)):
        for j in range(i + 1, len(param)):
            bc = (param[i], param[j])
            list.append(bc)
    return list


df1 = spark.read.format("com.databricks.spark.avro").load(
# author(learning): Scc_hy
# original url: https://github.com/mahmoudparsian/pyspark-algorithms/blob/master/code/chap03/word_count.py
# create date: 2019-12-19
# function: word_count
# data:

import sys, os
from pyspark import SparkContext, SparkConf


def wordcount(sc: SparkContext, input_path: str) -> str:
    rdd = sc.textFile(input_path)
    word_rdd = rdd.flatMap(lambda l: l.split(' '))
    pair_rdd = word_rdd.map(lambda word: (word, 1))
    print(pair_rdd.reduceByKey(lambda a, b: a + b).collect())


if __name__ == '__main__':
    spk_conf = SparkConf()
    spk_conf.setAppName('WordCount').set('spark.executor.memory', '500M')
    spk_conf.set('spark.cores.max', 4)
    try:
        sc = SparkContext(conf=spk_conf)
        input_path = r'D:\My_Learn\pyspark\chap1\sample.txt'
    except:
        print("Failed to connect!")
        print(sys.exc_info()[0])

    # Execute word count
    wordcount(sc, input_path)
#imported the required packages
from pyspark import SparkContext, SparkConf
import numpy
from scipy import spatial

conf = SparkConf()
conf.setAppName('MovieRecommender')
conf.set("spark.executor.memory", "4g")
conf.set("spark.driver.memory", "4g")
conf.set("spark.eventLog.enabled", "true")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sc = SparkContext(conf=conf)

#created rdd on movies and ratings dataset
movies_data = sc.textFile("/home/hadoop/movies.csv").map(
    lambda x: x.split(",")).map(lambda x: (x[0], x[1]))
ratings_data = sc.textFile("/home/hadoop/ratings.csv").map(
    lambda x: x.split(",")).map(lambda x: (x[1], (x[0], x[2]))))

#got the userid,(movie title,rating) from two datasets
combineddata = movies_data.join(ratings_data)
requireddata = combineddata.map(lambda x: (x[1][1][0], (x[1][0], x[1][1][1]))).cache()

#used self join on the rdd
joinedRatings = requireddata.join(requireddata)

#to remove duplicates
def removeDuplicates(ratingvalues):
    ratings = ratingvalues[1]
max_request_threads = webconfig.getint("global", "server.max_simultaneous_requests")
log.info("Initializing request ThreadPool to %s" % max_request_threads)
request_thread_pool = ThreadPool(processes=max_request_threads)

spark_context = None
for clazzWrapper in NexusHandler.AVAILABLE_HANDLERS:
    if issubclass(clazzWrapper.clazz(), NexusHandler.SparkHandler):
        if spark_context is None:
            from pyspark import SparkContext, SparkConf

            # Configure Spark
            sp_conf = SparkConf()
            sp_conf.setAppName("nexus-analysis")
            sp_conf.set("spark.scheduler.mode", "FAIR")
            sp_conf.set("spark.executor.memory", "6g")
            spark_context = SparkContext(conf=sp_conf)

        handlers.append(
            (clazzWrapper.path(), ModularNexusHandlerWrapper,
             dict(clazz=clazzWrapper, algorithm_config=algorithm_config, sc=spark_context,
                  thread_pool=request_thread_pool)))
    else:
        handlers.append(
            (clazzWrapper.path(), ModularNexusHandlerWrapper,
             dict(clazz=clazzWrapper, algorithm_config=algorithm_config,
                  thread_pool=request_thread_pool)))


class VersionHandler(tornado.web.RequestHandler):
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import time
import os, sys
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

reload(sys)
sys.setdefaultencoding('utf-8')

conf = SparkConf().set('spark.driver.maxResultSize', '30g')
conf.set('spark.yarn.am.cores', 5)
conf.set('spark.executor.memory', '15g')
conf.set('spark.executor.instances', 30)
conf.set('spark.executor.cores', 8)
conf.set('spark.executor.extraJavaOptions', '-XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseG1GC')

spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .enableHiveSupport() \
    .getOrCreate()

from jg_info import vertex_table_info, edge_info

path_prefix = '/phoebus/_fileservice/users/slmp/shulianmingpin/midfile/open_phone'
from bisect import *
import sys
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark import StorageLevel

reload(sys)
sys.setdefaultencoding('utf-8')

if __name__ == "__main__":
    # $example on:init_session$
    conf = SparkConf().setAppName("CreditCardInfo")
    conf.set('spark.cores.max', 60)
    conf.set('spark.executor.memory', '5g')
    conf.set('spark.rpc.askTimeout', 240)
    conf.set('spark.driver.memory', '20g')
    conf.set('spark.dynamicAllocation.enabled', True)
    conf.set('spark.shuffle.service.enabled', True)
    conf.set('spark.task.maxFailures', 1)
    conf.set('spark.network.timeout', '600s')
    conf.set("yarn.nodemanager.vmem-check-enabled", "false")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    card_table = sqlContext.read.format("jdbc") \
        .option("url", "jdbc:mysql://localhost/card_db") \
        .option("driver", "com.mysql.jdbc.Driver") \
        .option("dbtable", "card_info") \
        .option("user", "root") \
        .option("password", **********) \
        .load()
    card_table.createOrReplaceTempView("card_table")
        config["stage1File"] = filepath

        with open("config_{0}.json".format(index), "w") as outfile:
            json.dump(config, outfile, indent=4)

        subprocess.call(["./mdm", "config_{0}.json".format(index)])
        subprocess.call([
            "mv", "Final_MDM_{0}.root".format(index),
            "/hdfs/user/hjayatissa/geant_mdm_csi/stage2"
        ])
        subprocess.call([
            "/usr/local/hadoop/bin/hdfs", "dfs", "-chown", "hjayatissa",
            "/user/hjayatissa/geant_mdm_csi/stage2/Final_MDM_{0}.root".format(index)
        ])


if __name__ == "__main__":
    sconf = SparkConf().setAppName("mdm-CsI-2")
    sconf.set("spark.executor.memory", "13g")
    sconf.set("spark.python.worker.reuse", "false")
    sc = SparkContext(conf=sconf)

    sc.addFile("mdm")
    sc.addFile("config/config_isobutane_22Ne_6Li_geant_oxford.json")
    sc.addFile("run_oxf.mac")

    file_name = "hdfs://gr-gmaster.tamu.edu:9000//user/hjayatissa/geant_mdm_csi/stage1/MDM_*.root"
    lines = sc.newAPIHadoopFile(file_name,
                                "edu.tamu.hadoop.RootInputFormat",
                                "org.apache.hadoop.io.IntWritable",
                                "org.apache.hadoop.io.Text")
    lines.foreach(lambda x: stage2(x))
optimizer = 'adagrad'
loss = 'categorical_crossentropy'

addition = 0
master_port = 5000
send_port = 8000
master_port += addition
send_port += addition
print master_port
print send_port

chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabdefghijlmnqrtuwxy"
width, height, n_len, n_class = 140, 44, 6, len(chars) + 1

conf = SparkConf()
conf.set("spark.app.name", application_name)
conf.set("spark.master", master)
conf.set("spark.submit.deployMode", deploymode)
conf.set("spark.executor.cores", str(num_cores))
conf.set("spark.executor.instances", str(num_executors))
conf.set("spark.sql.warehouse.dir", "hdfs://master:9000/user/hive/warehouse")

###############################################################################
#from pyspark.sql import SparkSession
#sc = SparkSession.builder.master(master).appName(application_name).enableHiveSupport().getOrCreate()
#sqlContext = SQLContext(sc)
################################################################################
sc = SparkContext(conf=conf)
################################################################################

# Define the CTC model and build the trainer
#!/usr/bin/env python
# encoding: utf-8
'''
@author: fanyuexiang
@software: pycharm
@file: StreamingInit.py
@time: 2020/2/23 2:41 PM
@desc: initialize Spark Streaming from Python and implement a word count
'''
from pyspark.streaming import StreamingContext
from pyspark import SparkConf, SparkContext
import os

PYSPARK_PYTHON = "/Library/Frameworks/Python.framework/Versions/3.6/bin/python3"
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON

conf = SparkConf()
conf.set("spark.app.name", "init-streaming")
conf.set("spark.master", "local[2]")
sc = SparkContext(conf=conf)
streamingSc = StreamingContext(sparkContext=sc, batchDuration=1)

lines = streamingSc.socketTextStream(hostname="localhost", port=7777)
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
count = pairs.reduceByKey(lambda x, y: x + y)
count.pprint()

streamingSc.start()
streamingSc.awaitTermination()
from pyspark import SparkConf  # needed for SparkConf below (import assumed missing from the fragment)
from pyspark.sql.types import *

import sys
import time
import signal
import itertools

import cassandra
from cassandra.cluster import Cluster
from cassandra.query import named_tuple_factory

from flight import Flight
from itertools import islice, chain

config = SparkConf()
config.set("spark.streaming.stopGracefullyOnShutdown", "true")

filtered = None
ssc = None


def grouper_it(n, iterable):
    it = iter(iterable)
    while True:
        chunk_it = itertools.islice(it, n)
        try:
            first_el = next(chunk_it)
        except StopIteration:
            return
        yield itertools.chain((first_el,), chunk_it)
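For reference, grouper_it yields the input iterable in chunks of size n, each chunk as an iterator; a small illustrative usage, not part of the original module:

# Batch range(7) into chunks of 3: prints [[0, 1, 2], [3, 4, 5], [6]]
chunks = [list(chunk) for chunk in grouper_it(3, range(7))]
print(chunks)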
def spark():
    try:
        import pyspark
        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SparkSession
        from pyspark.sql import types

        conf = SparkConf()
        conf.set("spark.sql.shuffle.partitions", "1")
        conf.set("spark.jars.ivy", "/home/jovyan/.ivy2/")
        conf.set("spark.driver.extraClassPath", "jars/scala-udf-similarity-0.0.6.jar")
        conf.set("spark.jars", "jars/scala-udf-similarity-0.0.6.jar")
        conf.set("spark.driver.memory", "4g")
        conf.set("spark.sql.shuffle.partitions", "24")

        sc = SparkContext.getOrCreate(conf=conf)
        spark = SparkSession(sc)

        udfs = [
            ("jaro_winkler_sim", "JaroWinklerSimilarity", types.DoubleType()),
            ("jaccard_sim", "JaccardSimilarity", types.DoubleType()),
            ("cosine_distance", "CosineDistance", types.DoubleType()),
            ("Dmetaphone", "DoubleMetaphone", types.StringType()),
            ("QgramTokeniser", "QgramTokeniser", types.StringType()),
            ("Q3gramTokeniser", "Q3gramTokeniser", types.StringType()),
            ("Q4gramTokeniser", "Q4gramTokeniser", types.StringType()),
            ("Q5gramTokeniser", "Q5gramTokeniser", types.StringType()),
        ]

        for a, b, c in udfs:
            spark.udf.registerJavaFunction(a, "uk.gov.moj.dash.linkage." + b, c)

        SPARK_EXISTS = True
    except:
        SPARK_EXISTS = False

    if SPARK_EXISTS:
        print("Spark exists, running spark tests")
        yield spark
    else:
        spark = None
        logger.error("Spark not available")
        print("Spark not available")
        yield spark
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return values[:-1]

def parseLabel(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return values[-1]

def error(point, model):
    center = model.centers[model.predict(point)]
    return math.sqrt(sum([x**2 for x in (point - center)]))

conf = SparkConf()
conf.set("spark.master", "local")
sc = SparkContext(conf=conf)

data = sc.textFile("practice6_train.csv")
trData = data.map(parseFeat)

data = sc.textFile("practice6_test.csv")
tsData = data.map(parseFeat)
tsLabel = data.map(parseLabel)

kmeans_list = []
for i in range(30):
    kmeans_list.append(KMeans.train(trData, k=10, maxIterations=100, seed=i))

obj_list = []
for i in range(30):
def __init__(self, sc=None, app_name="Hail", master=None, local='local[*]',
             log=None, quiet=False, append=False,
             min_block_size=1, branching_factor=50, tmp_dir=None,
             default_reference="GRCh37", idempotent=False,
             global_seed=6348563392232659379, optimizer_iterations=None,
             _backend=None):

    if Env._hc:
        if idempotent:
            return
        else:
            raise FatalError('Hail has already been initialized, restart session '
                             'or stop Hail to change configuration.')

    if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
        hail_jar_path = pkg_resources.resource_filename(__name__, "hail-all-spark.jar")
        assert os.path.exists(hail_jar_path), f'{hail_jar_path} does not exist'
        sys.stderr.write(f'using hail jar at {hail_jar_path}\n')
        conf = SparkConf()
        conf.set('spark.driver.extraClassPath', hail_jar_path)
        conf.set('spark.executor.extraClassPath', hail_jar_path)
        SparkContext._ensure_initialized(conf=conf)
    else:
        SparkContext._ensure_initialized()

    self._gateway = SparkContext._gateway
    self._jvm = SparkContext._jvm

    # hail package
    self._hail = getattr(self._jvm, 'is').hail

    self._warn_cols_order = True
    self._warn_entries_order = True

    Env._jvm = self._jvm
    Env._gateway = self._gateway

    jsc = sc._jsc.sc() if sc else None

    if _backend is None:
        apiserver_url = os.environ.get('HAIL_APISERVER_URL')
        if apiserver_url is not None:
            _backend = ServiceBackend(apiserver_url)
        else:
            _backend = SparkBackend()
    self._backend = _backend

    tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')

    optimizer_iterations = get_env_or_default(optimizer_iterations, 'HAIL_OPTIMIZER_ITERATIONS', 3)

    version = read_version_info()
    hail.__version__ = version

    if log is None:
        log = hail.utils.timestamp_path(os.path.join(os.getcwd(), 'hail'),
                                        suffix=f'-{version}.log')
    self._log = log

    # we always pass 'quiet' to the JVM because stderr output needs
    # to be routed through Python separately.
    if idempotent:
        self._jhc = self._hail.HailContext.getOrCreate(
            jsc, app_name, joption(master), local, log, True, append,
            min_block_size, branching_factor, tmp_dir, optimizer_iterations)
    else:
        self._jhc = self._hail.HailContext.apply(
            jsc, app_name, joption(master), local, log, True, append,
            min_block_size, branching_factor, tmp_dir, optimizer_iterations)

    self._jsc = self._jhc.sc()
    self.sc = sc if sc else SparkContext(gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jsql_context = self._jhc.sqlContext()
    self._sql_context = SQLContext(self.sc, jsqlContext=self._jsql_context)

    super(HailContext, self).__init__()

    # do this at the end in case something errors, so we don't raise the above error without a real HC
    Env._hc = self

    ReferenceGenome._from_config(_backend.get_reference('GRCh37'), True)
    ReferenceGenome._from_config(_backend.get_reference('GRCh38'), True)
    ReferenceGenome._from_config(_backend.get_reference('GRCm38'), True)

    if default_reference in ReferenceGenome._references:
        self._default_ref = ReferenceGenome._references[default_reference]
    else:
        self._default_ref = ReferenceGenome.read(default_reference)

    jar_version = self._jhc.version()

    if jar_version != version:
        raise RuntimeError(f"Hail version mismatch between JAR and Python library\n"
                           f"  JAR:    {jar_version}\n"
                           f"  Python: {version}")

    if not quiet:
        sys.stderr.write('Running on Apache Spark version {}\n'.format(self.sc.version))
        if self._jsc.uiWebUrl().isDefined():
            sys.stderr.write('SparkUI available at {}\n'.format(self._jsc.uiWebUrl().get()))

        connect_logger('localhost', 12888)

        self._hail.HailContext.startProgressBar(self._jsc)

        sys.stderr.write(
            'Welcome to\n'
            '     __  __     <>__\n'
            '    / /_/ /__  __/ /\n'
            '   / __  / _ `/ / /\n'
            '  /_/ /_/\\_,_/_/_/   version {}\n'.format(version))

        if version.startswith('devel'):
            sys.stderr.write('NOTE: This is a beta version. Interfaces may change\n'
                             '  during the beta period. We recommend pulling\n'
                             '  the latest changes weekly.\n')
        sys.stderr.write(f'LOGGING: writing to {log}\n')

    install_exception_handler()
    Env.set_seed(global_seed)
local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SQLContext, SparkConf, HiveContext
from pyspark import SparkContext

from ml import diff_feature_cls, diff_train_cls


def run(sc, sql_context, is_hive):
    diff_feature_cls.main(sc, sql_context, is_hive=True)
    diff_train_cls.main(sc, sql_context, is_hive=True)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "32g")
    sc = SparkContext(appName="bintrade_candidate", master="yarn-client", conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")
    sqlContext.sql("use fex")
    run(sc, sqlContext, is_hive=True)
from pyspark import SparkContext, SparkConf
import numpy as np

conf = SparkConf()
conf.set('master', 'spark://hadoop-maste:7077')
context = SparkContext(conf=conf)

acc = context.accumulator(0)
print(type(acc), acc.value)

rdd = context.parallelize(np.arange(101), 5)

def acc_add(a):
    acc.add(a)
    return a

rdd2 = rdd.map(acc_add)
print(rdd2.collect())
print(acc.value)

context.stop()
def getConf(self):
    conf = SparkConf()
    for k, v in DEFAULT_SPARK_CONFIG.items():
        conf.set(k, v)
    return conf
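DEFAULT_SPARK_CONFIG is referenced above but not defined in this snippet; a hypothetical mapping of the shape getConf expects, with purely illustrative keys and values:

# Hypothetical example only -- the real DEFAULT_SPARK_CONFIG lives elsewhere in the project.
DEFAULT_SPARK_CONFIG = {
    "spark.app.name": "example-app",
    "spark.master": "local[*]",
    "spark.executor.memory": "2g",
}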
def papers_citations(sc):
    # papers and number of citations per year
    citations = sc.textFile("/corpora/corpus-microsoft-academic-graph/data/PaperReferences.tsv.bz2")
    citations = citations.map(lambda line: line.split("\t")).map(lambda c: (c[0], c[1]))

    papers = papers_newer_than(sc, 2013)
    papers = papers.map(lambda p: (p[0], p[3]))

    # join
    papersMap = sc.broadcast(papers.collectAsMap())

    rowFunc1 = lambda x: (x[0], x[1], papersMap.value.get(x[1], -1))

    def mapFunc1(partition):
        for row in partition:
            yield rowFunc1(row)

    result = citations.mapPartitions(mapFunc1, preservesPartitioning=True)
    result = result.filter(lambda c: c[2] != -1).map(lambda x: (x[0], x[1]))

    result.saveAsHadoopFile('/user/bd-ss16-g3/data/citations_2014',
                            "org.apache.hadoop.mapred.TextOutputFormat",
                            compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")


if __name__ == "__main__":
    # Configure OPTIONS
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("yarn-client")
    conf = conf.set("spark.executor.memory", "25g").set("spark.driver.memory", "25g").set("spark.mesos.executor.memoryOverhead", "10000")
    sc = SparkContext(conf=conf)

    #papers_citations(sc)
    papers_with_citations(sc)
    save_results(toSave)


def calculateAverage(newVal, accumlativeAvg):
    # state tuple is (running total, count, average)
    if accumlativeAvg is None:
        accumlativeAvg = (0.0, 0, 0.0)
    total = sum(newVal, accumlativeAvg[0])
    count = accumlativeAvg[1] + len(newVal)
    avg = total / float(count)
    return (total, count, avg)


if __name__ == '__main__':
    conf = SparkConf()
    conf.setAppName("Problem_2-1")
    conf.set("spark.streaming.kafka.maxRatePerPartition", 50000)
    conf.set("spark.executor.memory", "2g")
    conf.set("spark.python.worker.memory", "1g")

    airports = ['CMI', 'BWI', 'MIA', 'LAX', 'IAH', 'SFO']

    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint("/tmp/streaming")

    brokers = "b-1.cs598-tast2.n69c9p.c2.kafka.us-east-1.amazonaws.com:9092,b-2.cs598-tast2.n69c9p.c2.kafka.us-east-1.amazonaws.com:9092"
    topic = "cs598-task2 "
    kafka_consumer_group = str(uuid.uuid4())
    kafka_client_params = {
        "metadata.broker.list": brokers,