from pyspark import SparkConf, SparkContext

def init_spark_context():
    conf = SparkConf().setAll([
        ("spark.app.name", "Spark_Processor"),
        ("spark.redis.port", "6379"),
        ("spark.jars",
         "spark-redis-branch-2.4/target/spark-redis_2.11-2.5.0-SNAPSHOT-jar-with-dependencies.jar")
    ])
    sc = SparkContext(conf=conf)
    return sc
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import RFormula

if __name__ == "__main__":
    # We can use Spark in either local mode or cluster mode. Below is the configuration for local mode.
    sc = SparkContext("local", "Hello World")
    sc.setLogLevel('ERROR')
    # start a Spark session from the context
    spark = SparkSession(sc)
    dataset = spark.createDataFrame(
        [(7, "US", 18, 1.0), (8, "CA", 12, 0.0), (9, "NZ", 15, 0.0)],
        ["id", "country", "hour", "clicked"])
    formula = RFormula(
        formula="clicked ~ country + hour",
        featuresCol="features",
        labelCol="label")
    output = formula.fit(dataset).transform(dataset)
    output.select("features", "label").show()
    sc.stop()
        globals()['table_created'] = True
        # insert data into table
        insert_table(word_counts_df.toPandas())
    except Exception as e:
        print('Error:', e)

if __name__ == "__main__":
    global table_created
    table_created = False
    # create spark context with the above configuration
    sc = SparkContext(appName='TwitterStream')
    sc.setLogLevel('ERROR')
    # create the Streaming Context from the above spark context with interval size 2 seconds
    ssc = StreamingContext(sc, 2)
    # read data from port
    with open('config.yaml', 'r') as stream:
        details = yaml.safe_load(stream)
    lines = ssc.socketTextStream(details['host'], details['port'])
    # split each tweet into words
    words = lines.flatMap(lambda line: line.split(' '))
    # do processing for each RDD generated in each interval
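    # Hedged continuation (assumption, not from the original snippet): count words
    # per batch, hand each RDD to the truncated processing function above (called
    # "process_rdd" here hypothetically), and start the streaming context.
    word_counts = words.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)
    word_counts.foreachRDD(process_rdd)
    ssc.start()
    ssc.awaitTermination()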
from pyspark import SparkConf, SparkContext

def spark_context(master):
    conf = SparkConf().setAppName('zhangxinyun-spark').setMaster(master)
    sc = SparkContext(conf=conf)
    return sc
import sys
import datetime

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Normally in Spark you'd use a Window.
# We cannot do this with our stream because it simply returns the last 150 trades.
def isWithin30Sec(time):
    return datetime.datetime.strptime(time, '%Y-%m-%d %H:%M:%S') >= \
        datetime.datetime.now() - datetime.timedelta(0, 30)

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: btcmonitor.py <hostname> <port>", file=sys.stderr)
        sys.exit(-1)
    sc = SparkContext(appName="BTCPriceMonitor")
    ssc = StreamingContext(sc, 1)
    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    prices = lines.filter(lambda line: len(line.split(',')) > 5) \
        .filter(lambda line: isWithin30Sec(line.split(',')[5]))
    # per-symbol sum and count of prices, joined to compute the average
    sums = prices.map(lambda line: (line.split(',')[0], float(line.split(',')[1]))) \
        .reduceByKey(lambda a, b: a + b)
    counts = prices.map(lambda line: (line.split(',')[0], 1)).reduceByKey(lambda a, b: a + b)
    sums = sums.join(counts)
    avg = sums.map(lambda k: (k[0], k[1][0] / k[1][1]))
    avg.pprint()
    ssc.start()
    ssc.awaitTermination()
# Spark
from pyspark import SparkContext
# Spark Streaming
from pyspark.streaming import StreamingContext
# Kafka
from pyspark.streaming.kafka import KafkaUtils
# json parsing
import json

sc = SparkContext(appName="PythonSparkStreamingKafka_RM_01")
sc.setLogLevel("WARN")
ssc = StreamingContext(sc, 60)

# Note: createStream expects a ZooKeeper quorum (host:port) as its second argument.
kafkaStream = KafkaUtils.createStream(ssc, "192.168.0.9:9092", "spark-streaming", {"jsontest1": 1})
parsed = kafkaStream.map(lambda v: json.loads(v[1]))
# print a sample of each parsed batch
parsed.pprint()

ssc.start()
ssc.awaitTermination()
def createCombiner(kw):
    return set([kw])

def mergeValue(acc, kw):
    acc.update([kw])
    return acc

def mergeCombiners(set0, set1):
    set0.update(set1)
    return set0

sc = SparkContext(appName="platform")
data = sc.textFile("/commit/regist/daichang/yixing.phonecheck.2017-02-08") \
    .map(lambda a: f(a)) \
    .filter(lambda a: a is not None).cache()
previous = sc.textFile("/user/lel/results/yixin/previous/*") \
    .map(lambda a: a.split("\t")).collectAsMap()
previous_b = sc.broadcast(previous)
previous_bv = previous_b.value
othersRDD = data.filter(lambda a: a[1] not in '拉卡拉') \
    .combineByKey(lambda a: createCombiner(a), lambda a, b: mergeValue(a, b), lambda a, b: mergeCombiners(a, b)) \
    .map(lambda a: distinct_pre(a, previous_bv)) \
    .filter(lambda a: a is not None)
othersRDD.coalesce(1) \
    .map(lambda a: a[0] + "\t" + ','.join(list(set(a[1])))) \
    .saveAsTextFile("/user/lel/results/yixin/except_lakala20170213")
others_dis = othersRDD.collectAsMap()
others_dis_b = sc.broadcast(others_dis)
import os
from pathlib import Path

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

STREAM_IN = 'stream-IN'
STREAM_OUT = 'stream-OUT'

# We first delete all files from the STREAM_IN folder
# before starting spark streaming.
# This way, all files are new
print("Deleting existing files in %s ..." % STREAM_IN)
p = Path('.') / STREAM_IN
for f in p.glob("*.ordtmp"):
    os.remove(f)
print("... done")

sc = SparkContext("local[*]", "CountAndVolumePerBatch")
sc.setLogLevel("WARN")  # Make sure warnings and errors observed by spark are printed.
ssc = StreamingContext(sc, 5)  # generate a mini-batch every 5 seconds
filestream = ssc.textFileStream(STREAM_IN)  # monitor new files in folder stream-IN

def parseOrder(line):
    '''parses a single line in the orders file'''
    s = line.split(",")
    try:
        if s[6] != "B" and s[6] != "S":
            raise Exception('Wrong format')
        return [{
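            # Hedged completion (assumption; the original cuts off here and the
            # real field names are unknown, so these are hypothetical):
            "time": s[0], "orderId": int(s[1]), "clientId": int(s[2]),
            "symbol": s[3], "amount": int(s[4]), "price": float(s[5]),
            "buy": s[6] == "B"
        }]
    except Exception:
        return []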
import sys
import json

from pyspark import SparkContext

def get_adid_terms(line):
    entry = json.loads(line.strip())
    ad_id = entry['adId']
    adid_terms = []
    # print(entry['keyWords'])
    # use a hashmap to dedupe adid_term
    for term in entry['keyWords']:
        val = str(ad_id) + "_" + term
        adid_terms.append(val)
    return adid_terms

def generate_json(items):
    result = {}
    result['term'] = items[0]
    result['doc_freq'] = items[1]
    return json.dumps(result)

if __name__ == "__main__":
    adfile = sys.argv[1]  # raw ads data
    sc = SparkContext(appName="DF_Features")
    # [1111_makeup, 2311_makeup, 2311_makeup, 987_makeup, 433_cosmetic, 867_cosmetic] =>
    # [1111_makeup, 2311_makeup, 987_makeup, 433_cosmetic, 867_cosmetic]
    # (makeup, 1), (makeup, 1), (makeup, 1), (cosmetic, 1), (cosmetic, 1)
    # note: get_term is defined elsewhere in the original project
    data = sc.textFile(adfile).flatMap(lambda line: get_adid_terms(line)) \
        .distinct() \
        .map(lambda w: (get_term(w), 1)) \
        .reduceByKey(lambda v1, v2: v1 + v2) \
        .map(generate_json)
    data.saveAsTextFile("/Users/jiayangan/project/SearchAds/data/log/DF13")
    sc.stop()
import numpy as np

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD

def transform_training_row_into_lp(row):
    features = Vectors.dense(row["x"])
    label = row["label"]
    return LabeledPoint(label, features)

def transform_test_row(row):
    return Vectors.dense(row["x"])

if __name__ == "__main__":
    # Create a local StreamingContext with one worker thread
    sc = SparkContext("local[1]", "Streaming Linear Regression")
    # 2nd argument is the batch duration (5 seconds)
    ssc = StreamingContext(sc, 5)
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, ["trendy-topic"], {"metadata.broker.list": "localhost:9092"})
    model = StreamingLinearRegressionWithSGD()
    # NUM_FEATURES and extract_data_rows_from_json are defined elsewhere in the original file
    model.setInitialWeights(np.random.rand(NUM_FEATURES))
    numStream = directKafkaStream.flatMap(extract_data_rows_from_json)
    trainingStream = numStream.filter(lambda row: row["known"]).map(transform_training_row_into_lp)
    testStream = numStream.filter(lambda row: not row["known"]).map(transform_test_row)
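    # Hedged completion (not in the original snippet): train on the labelled
    # stream, print predictions for the unlabelled one, then start streaming.
    model.trainOn(trainingStream)
    model.predictOn(testStream).pprint()
    ssc.start()
    ssc.awaitTermination()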
from pyspark import SparkContext

sc = SparkContext("local[2]", "First Spark App")
data = sc.textFile("data/UserPurchaseHistory.csv") \
    .map(lambda line: line.split(",")) \
    .map(lambda record: (record[0], record[1], record[2]))
numPurchases = data.count()
uniqueUsers = data.map(lambda record: record[0]).distinct().count()
totalRevenue = data.map(lambda record: float(record[2])).sum()
# record[1] is the product column
products = data.map(lambda record: (record[1], 1.0)).reduceByKey(lambda a, b: a + b).collect()
mostPopular = sorted(products, key=lambda x: x[1], reverse=True)[0]
print("Total purchases: %d" % numPurchases)
print("Unique users: %d" % uniqueUsers)
print("Total revenue: %2.2f" % totalRevenue)
print("Most popular product: %s with %d purchases" % (mostPopular[0], mostPopular[1]))
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.fpm import FPGrowth

sc = SparkContext('local')
spark = SparkSession(sc)

txt = sc.textFile("./output/*/*")
temp_var = txt.map(lambda k: (0, list(set(k.split(" ")))))
df = temp_var.toDF(["id", "words"])

fpGrowth = FPGrowth(itemsCol="words", minSupport=0.1, minConfidence=0.1)
model = fpGrowth.fit(df)

# Display frequent itemsets.
model.freqItemsets.show()
# Display generated association rules.
model.associationRules.show()
# transform examines the input items against all the association rules and summarizes the
# consequents as prediction
model.transform(df).show()
        # filename is the last integer value in the file name text
        filename = int(file.split("/")[-1])
        tuples = ()
        for word in word_set:
            if word != "":
                tuples += ((word, [filename]), )
        return tuples
    except:
        print('Filename is not an integer.')

if __name__ == "__main__":
    # create spark context
    spark_context = SparkContext(appName="inverted_index_search",
                                 conf=SparkConf().set("spark.driver.host", "localhost"))
    # turn inputs into RDDs
    inputRDDs = spark_context.wholeTextFiles("../input/*")
    # tokenize texts
    # set the words as keys and filenames as values
    processed = inputRDDs.map(tokenize).map(filename_value).flatMap(lambda x: x)
    # reduce the previous rdd so that word keys now have a list of all the filenames that contain that word
    reduced = processed.reduceByKey(lambda x, y: x + y)
    # assign an index to every key value pair in the previous rdd
    zipped = reduced.zipWithIndex()
    # create a dictionary with words as keys and indices as values
    dictionary = zipped.map(lambda x: (x[0][0], x[1])).collectAsMap()
    # output dictionary
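    # Hedged continuation (assumption; the original cuts off here and the file
    # name "dictionary.json" is hypothetical): persist the word -> index map.
    import json
    with open("dictionary.json", "w") as out_file:
        json.dump(dictionary, out_file)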
from pyspark import SparkContext

if __name__ == '__main__':
    sc = SparkContext('local', 'wordcount')
    lines = sc.textFile('/home/wangheng/Desktop/spark_test_data.txt', 1)
    words = lines.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda x: (x, 1))
    count = pairs.reduceByKey(lambda x, y: x + y)
    for x in count.collect():
        print(x[0], " appears ", x[1], " times!")
import json

from pyspark import SparkContext
from pyspark.sql.session import SparkSession

def map_ORCID(x):
    result = []
    author = dict(fullname='',
                  identifiers=[
                      dict(scheme='ORCID',
                           value="https://orcid.org/" + x['orcid'],
                           provenance='ORCID')
                  ],
                  affiliations=[],
                  given=x.get('firstname', ''),
                  family=x.get('lastname', ''))
    fullname = "%s %s" % (author['given'], author['family'])
    author['fullname'] = fullname.strip()
    for item in x['publications']:
        if item['doi'] is not None and len(item['doi']) > 0:
            result.append((item['doi'].lower(), [author]))
    return result

if __name__ == '__main__':
    sc = SparkContext(appName='generateORCIDDataFrame')
    spark = SparkSession(sc)
    # get_schema() is defined elsewhere in the original project
    sc.textFile('/data/orciddump.txt').map(json.loads) \
        .flatMap(map_ORCID) \
        .reduceByKey(lambda a, b: a + b) \
        .map(lambda x: dict(doi=x[0], authors=x[1])) \
        .toDF(get_schema()) \
        .write.save("/data/ORCID.parquet", format="parquet")
    # .saveAsTextFile(path="/data/ORCID_df", compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
#
# (c) Copyright 2016 Hewlett Packard Enterprise Development LP
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
from __future__ import print_function

from pyspark import SparkConf
from pyspark import SparkContext

if __name__ == "__main__":
    my_spark_conf = SparkConf().setAppName("Quicksum")
    spark_context = SparkContext(conf=my_spark_conf)

    data = [1, 2, 3, 4, 5]
    distData = spark_context.parallelize(data)
    total = distData.reduce(lambda a, b: a + b)
    print("Total is %s" % total)

    spark_context.stop()
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from kafka import KafkaProducer
import json
from json import dumps

if __name__ == "__main__":

    def to_kafka(rdd):
        cnt = rdd.count()
        if cnt > 0:
            data = rdd.take(1)
            producer.send('spark_event_out', value=data)

    sc = SparkContext(appName="StreamingKafkaEventAggregator")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 10)
    print('Listening to topic')
    data = 'Processed a batch now'
    producer = KafkaProducer(
        bootstrap_servers=['sandbox-hdp.hortonworks.com:6667'],
        value_serializer=lambda x: dumps(x).encode('utf-8'))
    kafkaStream = KafkaUtils.createStream(ssc, 'localhost:2181', 'spark-streaming', {'spark_event': 1})
    parsed = kafkaStream.map(lambda v: json.loads(v[1]))
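    # Hedged completion (not in the original snippet): forward each parsed batch
    # to Kafka via to_kafka, then start the streaming context.
    parsed.foreachRDD(to_kafka)
    ssc.start()
    ssc.awaitTermination()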
"""SimpleApp.py""" from pyspark import SparkContext logFile = "file:///usr/local/spark/README.md" sc = SparkContext("local", "Simple App") logData = sc.textFile(logFile).cache() numAs = logData.filter(lambda s: 'a' in s).count() numBs = logData.filter(lambda s: 'b' in s).count() print("Lines with a: %i, lines with b: %i" % (numAs, numBs)) sc.stop()
# Spark from pyspark import SparkContext # Spark Streaming from pyspark.streaming import StreamingContext # Kafka from pyspark.streaming.kafka import KafkaUtils # json parsing import json sc = SparkContext(appName="spark2") sc.setLogLevel("WARN") airVelocityKMPH = [12, 13, 15, 12, 11, 12, 11] parVelocityKMPH = sc.parallelize(airVelocityKMPH, 2) countValue = parVelocityKMPH.count() sumValue = parVelocityKMPH.sum() meanValue = parVelocityKMPH.mean() varianceValue = parVelocityKMPH.variance() sampleVarianceValue = parVelocityKMPH.sampleVariance() stdevValue = parVelocityKMPH.stdev() sampleStdevValue = parVelocityKMPH.sampleStdev() parVelocityKMPH.stats().asDict()
import sys
from pyspark import SparkContext
import numpy as np
from sklearn.svm import SVC
from sklearn import preprocessing

output_path = sys.argv[1]
input_train = sys.argv[2]
input_all = sys.argv[3]

sc = SparkContext(appName="train")
rdd_train = sc.textFile(input_train)
rdd_all = sc.textFile(input_all)

# common func =========================
def splitx(raw):
    items = raw.split(' ')
    mtr_x = []
    mtr_y = []
    mtr_t = []
    for v in items[1].split(';'):
        tmp = v.split(',')
        if len(tmp) != 3:
            continue
        mtr_x.append(float(tmp[0]))
        mtr_y.append(float(tmp[1]))
        mtr_t.append(float(tmp[2]))
    gtmp = items[2].split(',')
    goal = [float(gtmp[0]), float(gtmp[1])]
    if len(items) == 4:
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import print_function

import numpy as np

from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="CorrelationsExample")  # SparkContext

    # $example on$
    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
    # seriesY must have the same number of partitions and cardinality as seriesX
    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

    # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))

    data = sc.parallelize([
        np.array([1.0, 10.0, 100.0]),
        np.array([2.0, 20.0, 200.0]),
        np.array([5.0, 33.0, 366.0])
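        # Hedged completion (the original snippet cuts off here): close the RDD
        # of vectors and compute the correlation matrix, as in the standard
        # Spark correlations example.
    ])
    print(Statistics.corr(data, method="pearson"))
    # $example off$

    sc.stop()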
from pyspark.sql import SQLContext
from pyspark import SparkContext
import re
# other required imports here

if __name__ == "__main__":
    # create Spark context with necessary configuration
    spark = SparkContext("local", "Stock Returns")

    # read json data from the newdata directory
    # df = SQLContext(spark).read.option("multiLine", True) \
    #     .option("mode", "PERMISSIVE").json("./newsdata")

    schema = (
        'date STRING, open FLOAT, high FLOAT, low FLOAT, close FLOAT, volume INT, ticker STRING'
    )
    df = SQLContext(spark).read.csv('stock_prices.csv', schema=schema, header=False)
    # df.show(2)
    # lines = df.select("date","open","close")
    # sim = df.withColumn("percent", (df("close") - df("open"))*100/df("open"))
    sim = df.withColumn("return", (df["close"] - df["open"]) * 100 / df["open"])
    # sim.groupBy('date').avg('return').show()
    # sim.select("date","return").groupBy("date").avg()
    x = sim.groupBy("date").avg("return")
    x.collect()
    # sim = sim.select('date','return')
    # df.groupBy(df.date).avg(df.close - df.open).show()
    # vals = lines.map(lambda row: row[2]-row[1])
    interval.add(1)
    events.add(len(input))
    # If the model predicts True, warn the user in red
    if model.predict(features):
        print('\033[31m%s.%03dZ: Interval %d: Attention needed (%d sensor events in interval)\033[0m'
              % (strftime("%Y-%m-%dT%H:%M:%S", gmtime()), (time() * 1000) % 1000, interval.value, len(input)))
    else:
        print('%s.%03dZ: Interval %d: Everything is OK (%d sensor events in interval)'
              % (strftime("%Y-%m-%dT%H:%M:%S", gmtime()), (time() * 1000) % 1000, interval.value, len(input)))
    if last_batch:
        ssc.stop()

# Initialize features to <number of sensors>-length array, filled with neutral initial sensor value
features = np.zeros(n_sensors)
features.fill(0.5)

# Initialize streaming for specified reporting interval
sc = SparkContext(appName="iotstream_lr_mqtt")
interval = sc.accumulator(0)
empty_intervals = sc.accumulator(0)
events = sc.accumulator(0)
ssc = StreamingContext(sc, reporting_interval)
sensor_stream = MQTTUtils.createStream(ssc, mqtt_URL, mqtt_topic)

# Load pre-computed model
model = LogisticRegressionModel.load(sc, modelname)

# Run model on each batch
#sensor_stream.pprint(10)
sensor_stream.foreachRDD(run_model)

# Start reading streaming data
ssc.start()
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# for local mode, the level of parallelism must be greater than 1
sc = SparkContext("local[2]", "streaming")
# set the batch interval for each computation
ssc = StreamingContext(sc, 5)

lines = ssc.textFileStream('file:///home/zj/logs')
words = lines.flatMap(lambda l: l.split())
wordsPair = words.map(lambda x: (x, 1))
wordscount = wordsPair.reduceByKey(lambda a, b: a + b)
wordscount.pprint()

ssc.start()
ssc.awaitTermination()
from __future__ import print_function

import sys

from pyspark import SparkContext
from csv import reader
from operator import add

if __name__ == "__main__":
    sc = SparkContext()
    lines1 = sc.textFile(sys.argv[1], 1)
    lines1 = lines1.mapPartitions(lambda x: reader(x))
    vehicles = lines1.map(lambda x: ((x[14], x[16]), 1)).reduceByKey(add)
    vehicles = vehicles.sortBy(lambda x: x[0][0]).sortBy(lambda x: x[1], False)
    res = sc.parallelize(vehicles.take(20))
    res = res.map(
        lambda x: "{0:s}, {1:s}\t{2:d}".format(x[0][0], x[0][1], x[1]))
    res.saveAsTextFile("task6.out")
    sc.stop()
import sys
import logging

from pyspark import SparkContext

create_users_table_stmt = """CREATE TABLE IF NOT EXISTS users(
    id INT PRIMARY KEY,
    username VARCHAR(50) NOT NULL,
    firstname VARCHAR(100),
    lastname VARCHAR(100),
    picture TEXT
);
"""

add_user_stmt = """INSERT IGNORE INTO users(id, username, firstname, lastname, picture)
VALUES (%s, %s, %s, %s, %s);"""

if __name__ == "__main__":
    sc = SparkContext(appName="venmoApp-userinfo-mysql")
    # get_url, parse_user_info and sql_create_table are defined elsewhere in the original project
    data_location = get_url(sys.argv)
    if data_location is None:
        logging.error("not a valid data location.\nExiting the program")
        sys.exit(0)
    logging.info("Processing: " + data_location)
    data_rdd = sc.textFile(data_location)
    parsed_users = data_rdd.flatMap(parse_user_info). \
        filter(lambda data: data is not None)
    table_created = sql_create_table(create_users_table_stmt)
import sys

from pyspark import SparkConf, SparkContext

surrogateCoeff = 0.1  # denotes the cluster ratio in the surrogate model
tabuMaxLength = 10
tabuMaxIter = 100
maxNumCandidate = 10

core_num = int(sys.argv[1])
conf = SparkConf().setMaster("spark://noah007:7077") \
    .setAppName("SPC-POSM-PSO") \
    .set("spark.submit.deployMode", "client") \
    .set("spark.cores.max", core_num) \
    .set("spark.executor.cores", "10") \
    .set("spark.executor.memory", "20g") \
    .set("spark.driver.memory", "40g")
sc = SparkContext(conf=conf)

'''
experiment for accuracy on different dataset
'''
'''
instanceSet = ['nuoxi2G']  # , 'nuoxi3G', 'huawei2G', 'huawei3G']
'''
'''
instanceSet = [i for i in range(60)]
aveAns, aveRuntime, aveConverGen = [], [], []
for i in instanceSet:
    print i, 'th instance ...'
    # po is data containing information about PROVIDERS and CUSTOMERS
import sys
import time
import random
from itertools import combinations

from pyspark import SparkContext

time_start = time.time()

sc = SparkContext("local[*]", "Assignment 3 LSH Task 1")
dataFile = sc.textFile(sys.argv[1])
# dataFile = sc.textFile(sys.argv[1])
# num_chunk = 2  # 4
# lines = sc.textFile(sys.argv[1], num_chunk)  # use this statement if it runs too slowly
# dataFile = dataFile.repartition(2)

# remove duplicates
dataFile1 = dataFile.map(lambda x: x.split(","))
header = dataFile1.first()
dataFile1 = dataFile1.filter(lambda x: x != header)

unique_user_id = dataFile1.map(lambda row: row[0]).distinct().collect()
unique_user_id.sort()

# index_user_map = {}
user_index_map = {}
user_index = 0
for user_item in unique_user_id:
    # index_user_map[i] = user_item
    user_index_map[user_item] = user_index
import sys

from pyspark import SparkContext

# given the list of neighbors for a page and that page's rank, calculate
# what that page contributes to the rank of its neighbors
def computeContribs(neighbors, rank):
    for neighbor in neighbors:
        yield (neighbor, rank / len(neighbors))

# read in a file of page links (format: url1 url2)
linkfile = "pagelinks.txt"
sc = SparkContext(appName="pagerank")
links = sc.textFile(linkfile).map(lambda line: line.split()).map(
    lambda pages: (pages[0], pages[1])).distinct().groupByKey().persist()  # filter out duplicates
# groupByKey => adjacency list: (page3, [page1, page4]), (page4, [page1, page2])

# set initial page ranks to 1.0
ranks = links.map(lambda page_neighbors: (page_neighbors[0], 1.0))

# number of iterations
n = 10
d = 0.85

# for n iterations, calculate new page ranks based on neighbor contributions
for x in range(n):
    contribs = links.join(ranks).flatMap(
        lambda page_data: computeContribs(page_data[1][0], page_data[1][1]))  # e.g. (page1, 0.5)
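    # Hedged completion (assumption; the original cuts off here): sum the
    # contributions per page and apply the damping factor d.
    ranks = contribs.reduceByKey(lambda a, b: a + b).mapValues(
        lambda rank: (1 - d) + d * rank)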
from pyspark import SparkConf, SparkContext

# initialization of spark
# master is the machine / server allotting the resources
sc = SparkContext(master='local[2]')
print(sc)

# spark version
print(sc.version)
# python version
print(sc.pythonVer)
# master URL
print(sc.master)
print(str(sc.sparkHome))
print(str(sc.sparkUser()))

print(sc.appName)              # Return application name
print(sc.applicationId)        # Retrieve application ID
print(sc.defaultParallelism)   # Return default level of parallelism
print(sc.defaultMinPartitions) # Return default minimum number of partitions

config = (SparkConf().
          setMaster("local").
          setAppName("myapp").
          set("spark.executor.memory", "1g"))
# getting the configuration
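# Hedged continuation (assumption): read back the key/value pairs that were set.
print(config.getAll())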