from pyspark import SparkConf, SparkContext


def get_spark():
    conf = (SparkConf()
            .setAppName("read_pigstorage")
            .set("spark.authenticate.secret", "thisisasecret"))
    return SparkContext(conf=conf)
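# A minimal usage sketch (an assumption, not part of the original snippet):
# the helper above could be exercised like this. The input path
# "data/input.txt" is only an illustrative placeholder.
if __name__ == "__main__":
    sc = get_spark()
    lines = sc.textFile("data/input.txt")
    print("line count: %d" % lines.count())
    sc.stop()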
from pyspark import SparkConf, SparkContext

# We're going to find out how many words there are in a book/text file using Spark.
# Set the master to "local" when running on your own machine.
conf = SparkConf().setMaster("local").setAppName("WordCount")

# Initialize the SparkContext; it is the entry point to Spark Core.
sc = SparkContext(conf=conf)

# Read the book.txt file into an RDD.
input = sc.textFile("C:/Users/siraj/github/Spark-programs/data/book.txt")

# flatMap transforms the corpus of text into words by splitting on whitespace.
words = input.flatMap(lambda x: x.split())

# countByValue returns the number of occurrences of each word.
wordCounts = words.countByValue()

# Iterate through the result to get each word and its number of occurrences.
for word, count in wordCounts.items():
    cleanWord = word.encode('ascii', 'ignore')
    if cleanWord:
        print(cleanWord.decode() + " " + str(count))
        'calculate time:%s' % str(time.time() - start_time))
    send_msg_to_kafka(producer, result)
    producer.close()
    '''
    with topic_out.get_sync_producer() as producer:
        for query in msgs:
            # result = BM25_cores(query, Lave, bi)
            tmp = '%s' % query
            producer.produce(str(tmp).encode('utf-8'))
    '''


if __name__ == '__main__':
    conf = SparkConf().setMaster("spark://cdh-master-slave1:7077").set(
        "spark.executor.memory", "5G").set("spark.driver.memory", "3G").set(
            "spark.executor.cores", "2").set("spark.cores.max", "6")
    # sc = SparkContext(conf=conf)
    # conf = SparkConf().setAppName("bm25")
    sc = SparkContext(conf=conf)
    # sqlContext = SQLContext(sc)
    ssc = StreamingContext(sc, 0.5)

    mongo_client = pymongo.MongoClient('mongodb://192.168.10.219:49019/')
    bi = mongo_client.lawbot.bm25_inverted
    be = mongo_client.lawbot.bm25_extra
    # sc.broadcast(bi)
    extra_data = be.find_one()
    total_word = extra_data.get('total_word')
#!/usr/bin/env python
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark import SparkConf

conf = SparkConf().setMaster("local").setAppName("My app")
sc = SparkContext(conf=conf)

lines = sc.textFile("ch01.py")
inputRDD = lines.filter(lambda x: "sc" in x)
for line in inputRDD.take(10):
    print(line)

lines = sc.parallelize(["hello world", "hi"])
words = lines.flatMap(lambda line: line.split(" "))
print(words.first())

data = sc.parallelize([1, 2, 3, 4, 1, 3])
print(data.reduce(lambda x, y: x + y))


def printall(rdd):
    print("----------")
    for r in rdd.collect():
        print(r)


printall(data.distinct())

for d in data.distinct().collect():
    print(d)

d = sc.parallelize(["1, hello", "2, hi", "3, how are you"])
for _ in d.map(lambda x: (x.split(",")[0], x)).collect():
    print(_)
hashh = "" i = 0 while i < len(maximum): if i == (len(maximum) - 1): hashh = hashh + str(maximum[i][0]) else: hashh = hashh + str(maximum[i][0]) + "," i = i + 1 if hashh != "": print("%s" % (hashh)) wind_size = int(sys.argv[1]) batch_duration = int(sys.argv[2]) conf = SparkConf() conf.setAppName("BigData") sc = SparkContext(conf=conf) ssc = StreamingContext(sc, batch_duration) ssc.checkpoint("~/checkpoint_BIGDATA") dataStream = ssc.socketTextStream("localhost", 9009) tweet = dataStream.map(lambda w: (w.split(';')[7])) hashtag = tweet.flatMap(lambda w: (w.split(','))) hasht = hashtag.map(lambda w: (w, 1)) counts = hasht.filter(lambda x: x[0] != '') totalcount = counts.reduceByKeyAndWindow(
    try:
        firstRow = tweet_rdd.first()
        tweet_rdd = tweet_rdd.filter(lambda row: row != firstRow)
        if not tweet_rdd.isEmpty():
            sqlContext.createDataFrame(tweet_rdd, schema).write \
                .format("org.apache.spark.sql.cassandra") \
                .mode('append') \
                .options(table="sentiment", keyspace="w251twitter") \
                .save()
    except ValueError:
        print("The RDD was empty...continuing...")


if __name__ == "__main__":
    sparkConf = SparkConf().setAppName("TwitterSentimentAnalysis") \
        .set("spark.cassandra.connection.host", "cassandra1, cassandra2, cassandra3")
    sc = SparkContext(conf=sparkConf)
    session = SparkSession(sc)
    sqlContext = SQLContext(sc)
    ssc = StreamingContext(sc, 2)

    brokers, topic = sys.argv[1:]
    kvs = setup_kafka_stream()
    nlp = StanfordCoreNLP('http://localhost:9000')

    tweets = kvs.filter(lambda x: x is not None) \
        .filter(lambda x: x != '') \
        .map(lambda x: json.loads(x[1]))
    tweets.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint()

    sentiment_stream = tweets.map(lambda tweet: get_tweet_sentiment(tweet)) \
        .filter(lambda x: x is not None)
from pyspark import SparkContext, SparkConf
import numpy as np
import time

# =============SETUP SPARK==================
local = False
if local:
    spark = SparkContext("local[*]")
    spark.setLogLevel("ALL")
else:
    import os
    master = os.environ["SPARK_MASTER"]
    master = "spark://{}:7077".format(master)
    conf = SparkConf().setAppName("SpotTrawl").setMaster(master)
    spark = SparkContext(conf=conf)

# ===========DEFINE SAMPLING FUNCTION=======
numSamples = 10**7


def sample(p):
    x, y = np.random.random(), np.random.random()
    return 1 if x * x + y * y < 1 else 0


# ==========TAKE SAMPLES======================
count = spark.parallelize(range(0, numSamples)).map(sample) \
    .reduce(lambda a, b: a + b)
from pyspark import SparkConf, SparkContext conf = SparkConf().setMaster("local").setAppName("customerSpendings") sc = SparkContext(conf=conf) def getCustomerAndSpendings(line): row = line.split(',') custId = int(row[0]) amount = float(row[2]) return (custId, amount) lines = sc.textFile("./data/customer-orders.csv") parsedLines = lines.map(getCustomerAndSpendings) totalAmounts = parsedLines.reduceByKey(lambda v1, v2: v1 + v2) reversedTotalAmounts = totalAmounts.map(lambda entry: (entry[1], entry[0])) sortedTotalAmounts = reversedTotalAmounts.sortByKey( False) #false gives descending order results = sortedTotalAmounts.collect() print("\nTotal spendings by each customer: \n") for amount, customer in results: print("customer-" + str(customer) + "\t {:.2f}$".format(amount))
    print(s_list[0][0], s_list[1][0], s_list[2][0], s_list[3][0], s_list[4][0], sep=",")


def func2(line):
    hashtag = line.split(";")[7]
    if ',' in hashtag:
        return hashtag.split(",")
    return [hashtag]


conf1 = SparkConf()
conf1.setAppName("BigData")
sc1 = SparkContext(conf=conf1)
sscp = StreamingContext(sc1, int(sys.argv[2]))
sscp.checkpoint("/checkpoint_BIGDATA")

dataStream1 = sscp.socketTextStream("localhost", 9009)
hashtags = dataStream1.window(int(sys.argv[1]), 1).flatMap(func2).map(
    lambda h: (h, 1)).reduceByKey(lambda x, y: int(x) + int(y))
hashtags.foreachRDD(func)

sscp.start()
sscp.awaitTermination(60)
# coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_CLASSIFY_RESULT_TEMP').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
    ldaModel = LDA.train(corpus, k=topic_num)
    return ldaModel


if __name__ == '__main__':
    if len(sys.argv) != 6:
        print("Usage: %s <input> <model_path> <stopfile> topic_num name" % sys.argv[0],
              file=sys.stderr)
        exit(1)
    input_path = sys.argv[1]
    model_path = sys.argv[2]
    stopfile = sys.argv[3]
    topic_num = int(sys.argv[4])
    appname = sys.argv[5]

    conf = SparkConf().setAppName(appname)
    sc = SparkContext(conf=conf)

    rdd_lines = sc.textFile(input_path)
    parsed_data = segment(rdd_lines)
    if stopfile:
        parsed_data = filter_stopword(parsed_data, stopfile).cache()
    tf_data = vectorize(sc, parsed_data)
    ldaModel = lda(tf_data, topic_num)
    print("finished training model...")
    t = ldaModel.describeTopics(5)
    print(t)
def write_into_redis(s):
    redis_client = redis.StrictRedis(
        host='ec2-52-40-47-83.us-west-2.compute.amazonaws.com',
        port=6379,
        db=0,
        password='')
    pipe = redis_client.pipeline()
    for i in s:
        redis_client.delete(i[0])
        pipe.lpush(i[0], *i[1])
    pipe.execute()


appName = 'Similarity_APP'
master = 'spark://ec2-50-112-193-115.us-west-2.compute.amazonaws.com:7077'
conf = SparkConf().setAppName(appName).setMaster(master)
sc = SparkContext(conf=conf)

start_time = time.time()
list_1 = get_data_from_influx()
print("--- %s seconds ---" % (time.time() - start_time))

rdd = sc.parallelize(list_1)
tupls = rdd.map(split_string)
buckets = tupls.reduceByKey(lambda a, b: a + b)

write_into_redis.count = 0
buckets.foreachPartition(write_into_redis)
# course-word-count.py
# August 3, 2020
#
# Solution script provided by the course.
# Count the number of occurrences of each word in a text file.
# Uses the 'Book' text file.

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('word-count')
sc = SparkContext(conf=conf)

lines = sc.textFile('Book')
words = lines.flatMap(lambda x: x.split())
wordCounts = words.countByValue()

for word, count in wordCounts.items():
    cleanWord = word.encode('ascii', 'ignore')
    if cleanWord:
        print(cleanWord.decode(), count)

# print(wordCounts)
    map_count = fix_date.map(lambda x: ((x[0], x[1]), 1))
    map_count = map_count.reduceByKey(lambda x, y: x + y)
    return map_count


if __name__ == '__main__':
    # Get input/output files from user
    parser = argparse.ArgumentParser()
    parser.add_argument('commits', help='File to load GitHub commit data from')
    parser.add_argument('repos', help='File to load GitHub repository data from')
    # parser.add_argument('output', help='Directory to save DStream results to')
    args = parser.parse_args()

    # Setup Spark
    conf = SparkConf().setAppName("timezone")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    setup_table(sc, sqlContext, args.commits, args.repos)

    print("-" * 15 + " OUTPUT " + "-" * 15)

    langs = {
        "Python", "Java", "JavaScript", "Ruby", "SQL", "C#", "C++", "nodejs",
        "PHP", "C", "objective-c"
    }
    out = timezone(sc, sqlContext, langs)
    out.saveAsTextFile("/user/renukan2/timezone_github")

    print("-" * 30)
'''
Created on Jun 10, 2017

@author: SathishParthasarathy
'''
from pyspark import SparkConf, SparkContext
from hdfs3 import HDFileSystem

if __name__ == '__main__':
    conf = SparkConf().setAppName("Word Count - Python")
    spark = SparkContext(conf=conf)
    hdfs = HDFileSystem('hadoop.master.com', port=9000)
    if not hdfs.exists("/user/psathishcs/Output/Books/Science_Python"):
        text_file = spark.textFile(
            "hdfs://hadoop.master.com:9000/user/psathishcs/Input/Books/The_Outline_of_Science.txt")
        words = text_file.flatMap(lambda line: line.split())
        wordCounts = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
        wordCounts.saveAsTextFile(
            "hdfs://hadoop.master.com:9000/user/psathishcs/Output/Books/Science_Python")
from pyspark import SparkConf, SparkContext conf = SparkConf().setMaster("local").setAppName("PopularMovies") sc = SparkContext(conf=conf) lines = sc.textFile("c:///SparkCourse/ml-100k/u.data") movies = lines.map(lambda x: (int(x.split()[1]), 1)) movieCounts = movies.reduceByKey(lambda x, y: x + y) flipped = movieCounts.map(lambda xy: (xy[1], xy[0])) sortedMovies = flipped.sortByKey() results = sortedMovies.collect() for result in results: print(result)
from pyspark.sql import SparkSession
import sys
import csv
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import Row
from pyspark import SparkContext, SparkConf
import pyspark.sql.functions as func

file = sys.argv[1]
n = sys.argv[2]
s = sys.argv[3]
c = sys.argv[4]

conf = SparkConf().setAppName('Kia_bigdata_lab').setMaster('local')
sc = SparkContext(conf=conf)
spark = SparkSession.builder.appName("lab3").getOrCreate()

rd = sc.textFile(file).map(lambda x: Row(x.split(',')[0], x.split(',')[1:]))
df = rd.toDF(["items", "plant"]).withColumn("id", monotonically_increasing_id())
df = df[["id", "items", "plant"]]

fpGrowth = FPGrowth(itemsCol="plant", minSupport=float(s), minConfidence=float(c))
model = fpGrowth.fit(df)

# Display frequent itemsets.
ml = model.freqItemsets
ml.orderBy([func.size("items"), "freq"], ascending=[0, 0]).show(int(n))

# Display generated association rules.
# ml = model.associationRules.show(10)
# ml.orderBy([func.size("antecedent")], "confidence", ascending=[0, 0]).show(int(n))
import sys
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating


def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.item", encoding='ascii', errors="ignore") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


conf = SparkConf().setMaster("local[*]").setAppName("MovieRecommendationsALS")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint')

print("\nLoading movie names...")
nameDict = loadMovieNames()

data = sc.textFile("ml-100k/u.data")

ratings = data.map(lambda l: l.split()).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

# Build the recommendation model using Alternating Least Squares
print("\nTraining recommendation model...")
rank = 10
# Lowered numIterations to ensure it works on lower-end systems
numIterations = 6
def run():
    conf = SparkConf()
    # conf.set('spark.shuffle.blockTransferService', 'nio')
    conf.set('spark.files.fetchTimeout', '180')
    conf.set('spark.files.overwrite', 'yes')
    conf.set('spark.akka.timeout', '180')
    # conf.set('spark.task.maxFailures', '30000')
    conf.set('spark.akka.frameSize', '500')
    conf.set('spark.network.timeout', '180')

    myClassifierOnevsOne = pickle.load(open('myClassifierOnevsOne.p', 'rb'))
    dataSetMaker = DataSetMakerV2(n=200000)
    feed = FeedNewsFromGoogleFinance()

    def sendRecord(rdd):
        print('new try...')
        if not rdd.isEmpty():
            newsRDD = dataSetMaker.processKeepNews(rdd)
            res = newsRDD.map(
                lambda x: (x[0], myClassifierOnevsOne.predict(x[1].features)))
            print('for each result...')
            for result in res.collect():
                symbole = result[0].symbole
                r = requests.put('http://wtun.mooo.com:5000',
                                 data={
                                     'jdata': NewsPrediction(result[0], str(result[1])).json(),
                                     'symbole': symbole,
                                     'label': str(result[1])
                                 })
                print('send ok')
                print('receive %s' % str(r.text))
        else:
            print('empty!')

    sc = SparkContext(conf=conf)
    symbolesRDD = sc.parallelize([('NASDAQ:GOOGL', ['GOOG', 'GOOGL', 'GOOGLE']),
                                  ('NASDAQ:NVDA', ['NVIDIA']),
                                  ('VTX:SCMN', ['SWISSCOM'])])
    taskdt = 600
    running = True
    oldNewsRDD = None
    firstTime = True
    intersectRDD = None
    dataDirectory = 'hdfs://157.26.83.52/user/wdroz/stream2'
    cpt = 0
    while running:
        today = datetime.datetime.now()
        yesterday = today - datetime.timedelta(days=1)
        tomorrow = today + datetime.timedelta(days=1)
        newsRDD = symbolesRDD.flatMap(
            lambda x: feed.lookingAt(x[0], yesterday, tomorrow, x[1]))
        if firstTime:
            firstTime = False
            intersectRDD = newsRDD
        else:
            try:
                intersectRDD = oldNewsRDD.intersection(newsRDD)
            except:
                pass  # empty rdd
        oldNewsRDD = newsRDD
        try:
            sendRecord(intersectRDD)
            intersectRDD.saveAsPickleFile(
                dataDirectory + '/' +
                datetime.datetime.now().strftime('%Y-%m-%d--') + str(cpt))
            cpt += 1
        except:
            pass  # empty rdd
        time.sleep(taskdt)
        running = False  # TODO remove it
"com.databricks.spark.csv").option("header", "true").save('locked_data.csv') byDateUnlocked.repartition(1).write.format( "com.databricks.spark.csv").option("header", "true").save('unlocked_data.csv') # 5 over_18 byDate18 = df10.filter('over_18 == true').select( to_date(df10.created_utc.cast('timestamp')).alias('date'), df10.Positive, df10.Negative).groupBy('date').avg('Positive', 'Negative') byDate18.repartition(1).write.format("com.databricks.spark.csv").option( "header", "true").save('over18_data.csv') # final 4 dff4 = df10.groupBy('title').agg( avg('Positive').alias('avgPos'), avg('Negative').alias('avgNeg')) dff4.orderBy('avgPos', ascending=0).limit(10).show(truncate=False) dff4.orderBy('avgNeg', ascending=0).limit(10).show(truncate=False) if __name__ == "__main__": conf = SparkConf().setAppName("CS143 Project 2B") conf = conf.setMaster("local[*]") # conf = (conf.set('spark.executor.memory', '4G').set('spark.driver.memory', '4G')) sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) sc.addPyFile("cleantext.py") main(sqlContext)
from pyspark import SparkContext, SparkConf
import collections


def parseLine(line):
    cells = line.split(',')
    return (int(cells[0]), float(cells[2]))


conf = SparkConf().setMaster("local").setAppName("CustomerOrders")
sc = SparkContext(conf=conf)

rdd = sc.textFile("customer-orders.csv")
custOrders = rdd.map(parseLine)
custAmounts = custOrders.reduceByKey(lambda x, y: x + y).map(lambda x: (x[1], x[0]))
custAmountsSorted = custAmounts.sortByKey()

results = custAmountsSorted.collect()
for result in results:
    print(str(result[1]) + ": {:.2f}".format(result[0]))

'''
Output:
45: 3309.38
79: 3790.57
96: 3924.23
23: 4042.65
99: 4172.29
...
...
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

conf = SparkConf().setAppName('WriteAPIs').setMaster('local')
sc = SparkContext(conf=conf)

ss = SparkSession.builder.appName('WriteAPIs').master('local').getOrCreate()

# -----------------------------------------------------------------------------------------------
# Writing to files - RDD
# -----------------------------------------------------------------------------------------------

# car_file = '/Users/soumyadeepdey/HDD_Soumyadeep/TECHNICAL/Training/Intellipaat/PySparkCodes/sampledata/car_sales_data.csv'
# output_file = '/Users/soumyadeepdey/HDD_Soumyadeep/TECHNICAL/Training/Intellipaat/IntellipaatSpark/OutputFile/car_sales_data_out'
# num_partitions = 16

# rdd = sc.textFile(car_file)
# rdd = sc.textFile(car_file, num_partitions)
# print('Total no of partitions: ', rdd.getNumPartitions())

# rdd1 = rdd.map(lambda x: (x.split(',')[3], x.split(',')[5], x.split(',')[11]))
# rdd1.saveAsTextFile(output_file)
# rdd1.coalesce(1).saveAsTextFile(output_file)
# rdd1.repartition(1).saveAsTextFile(output_file)

# rdd2 = sc.textFile("/Users/soumyadeepdey/HDD_Soumyadeep/TECHNICAL/Training/Intellipaat/IntellipaatSpark/OutputFile/car_sales_data_out/part-*")
# print(rdd2.getNumPartitions())

# print(rdd1.count())
# print(rdd2.count())
#
# for i in rdd1.take(5):
#     print(i)
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark import SparkConf

sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

from src.serial import NaiveDBSCAN, MatrixDBSCAN
from src.utils import DataLoader, Evaluation, timeit
from src.settings import UNKNOWN, NOISE

import numpy as np

# broadcast variables
b_dataset = None
b_eps = None
b_min_pts = None


def load_data_label(path):
    pts = sc.textFile(path).map(lambda x: x.strip().split()[:-1]).map(
        lambda x: tuple([float(i) for i in x]))
    return pts.collect()


def load_data(path):
    pts = sc.textFile(path).map(lambda x: x.strip().split()).map(
        lambda x: tuple([float(i) for i in x]))
    return pts.collect()
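# A hedged illustration (not in the original file): the broadcast placeholders
# above would presumably be populated with SparkContext.broadcast() handles
# before clustering runs. The concrete path and parameter values below are
# assumptions for illustration only.
# b_dataset = sc.broadcast(load_data("data/points.txt"))
# b_eps = sc.broadcast(0.5)
# b_min_pts = sc.broadcast(5)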
sys.path.append('..')
import settings

logger = logging.getLogger()
logger.setLevel(settings.COSINE_SIMILARITY['LOG_LEVEL'])

try:
    from pyspark import SparkConf, SparkContext
except ImportError as e:
    logging.error("Cannot import Spark modules: %s", e)
    sys.exit(1)

logging.info("Successfully imported Spark modules")

conf = SparkConf().setMaster("local").setAppName("AggregatingMotionDeviceData")
sc = SparkContext(conf=conf)

# Script specific configurations
MINUTE_WINDOW = settings.COSINE_SIMILARITY['MINUTES_PER_WINDOW'] * 60 * 1000
BASE_TIME = settings.COSINE_SIMILARITY['BASE_TIME']
MAX_TIME = settings.COSINE_SIMILARITY['MAX_TIME']
INPUT_DIR = settings.COSINE_SIMILARITY['INPUT_DIR']
OUTPUT_DIR = '../front_end/motion_split_files_' + \
    str(settings.COSINE_SIMILARITY['MINUTES_PER_WINDOW']) + '_mins_window/'

if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)


# Actual Code
def parse_line(line):
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1

    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)

    score = (numerator / float(denominator)) if denominator else 0

    return (score, numPairs)


# Use Spark's built-in cluster manager to treat every core of the laptop as a node.
print("\nLoading movie names...")

# Build a SparkContext and create ratings: [userID, (movieID, rating)]
data = SparkContext(conf=SparkConf()).textFile("source/ratings.dat")
ratings = data.map(lambda l: l.split()).map(
    lambda l: (int(l[0]), (int(l[1]), float(l[2]))))

# Emit every movie rated together by the same user.
# Self-join to find every combination.
joinedRatings = ratings.join(ratings)
# [userID, ((movieID1, rating1), (movieID2, rating2))]

# Filter out duplicate pairs. filterDuplicates is a function that returns True or False.
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)

# Now key by movie pairs: [(movie1, movie2), (rating1, rating2)]
moviePairs = uniqueJoinedRatings.map(makePairs)

# We now have (movie1, movie2) => (rating1, rating2).
# Now collect all ratings for each movie pair and compute similarity.
moviePairRatings = moviePairs.groupByKey()
# [(movie1, movie2), ((rating1, rating2), (rating1, rating2), ...)]
import findspark
from pyspark import SparkContext, SparkConf

findspark.init(python_path='/Users/khwu/.virtualenvs/spark/bin/python3')

if __name__ == '__main__':
    conf = SparkConf().setAppName('join').setMaster('local[*]')
    sc = SparkContext(conf=conf)
    sc.setLogLevel('ERROR')

    ages = sc.parallelize([("Tom", 29), ("John", 22)]).persist()
    addresses = sc.parallelize([("James", "USA"), ("John", "UK")]).persist()

    ages.join(addresses) \
        .coalesce(1) \
        .saveAsTextFile('../../out/age_address_join.text')

    ages.leftOuterJoin(addresses) \
        .coalesce(1) \
        .saveAsTextFile('../../out/age_address_left_out_join.text')

    ages.rightOuterJoin(addresses) \
        .coalesce(1) \
        .saveAsTextFile('../../out/age_address_right_out_join.text')

    ages.fullOuterJoin(addresses) \
        .coalesce(1) \
        .saveAsTextFile('../../out/age_address_full_out_join.text')
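# For reference (an illustration of join semantics, not output captured from the
# original run): with the two pair RDDs above, the saved records are shaped like
#   join            -> ("John", (22, "UK"))
#   leftOuterJoin   -> ("Tom", (29, None)), ("John", (22, "UK"))
#   rightOuterJoin  -> ("James", (None, "USA")), ("John", (22, "UK"))
#   fullOuterJoin   -> all three keys, with None filling in the missing side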
def vec_sum(x, y):
    return [(x[i] + y[i]) for i in range(len(x))]


def generalized_error(y):
    key = y[0][0]
    value = [z[1] for z in y]
    gen_error = functools.reduce(vec_sum, value, [0, 0])
    gen_error = [y / num_points for y in gen_error]
    return key, gen_error


from pyspark import SparkConf, SparkContext

if len(sys.argv) != 2:
    print('Usage: ' + sys.argv[0] + ' <out>')
    sys.exit(1)
outputloc = sys.argv[1]

conf = SparkConf().setAppName('sim')
sc = SparkContext(conf=conf)

keys = sc.parallelize(par)
data = keys.map(get_data)
data = data.flatMap(lambda x: x)
error = data.map(classify)
gen_error = error.reduceByKey(generalized_error)
gen_error.saveAsTextFile(outputloc)
sc.stop()
# coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_RPT_SUN_INFO_DETAIL').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0
from pyspark import SparkConf, SparkContext


def init_spark():
    conf = SparkConf().setAppName("Music").setMaster("local")
    return SparkContext(conf=conf)
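# A possible usage sketch (assumed, not part of the original snippet): the factory
# above might be exercised against an in-memory collection, avoiding any file paths.
if __name__ == "__main__":
    sc = init_spark()
    plays = sc.parallelize([("song_a", 3), ("song_b", 1), ("song_a", 2)])
    totals = plays.reduceByKey(lambda a, b: a + b)
    print(totals.collect())
    sc.stop()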
# coding=utf-8
from pyspark import SparkContext, SparkConf
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Naive Bayes').setMaster('local[2]')
sc = SparkContext(conf=conf)

# Load and parse the data file.
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')

# Split data approximately into training and test sets.
training, test = data.randomSplit([0.6, 0.4])

# Train a naive Bayes model.
model = NaiveBayes.train(training, 1.0)

# Make predictions and test accuracy.
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(
    lambda vp: vp[0] == vp[1]).count() / test.count()
print('model accuracy: ' + format(accuracy))

# Save and load the model.
output_dir = '../model/myNaiveBayesModel'
# MLUtils.rmtree(output_dir, ignore_errors=True)
model.save(sc, output_dir)
sameModel = NaiveBayesModel.load(sc, output_dir)
predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))