    # ... tail of parse_vertex(line, numV); the code that splits `line` into
    # `title` and `outEdges` is cut off above this point
    return (title, Vertex(title, 1.0 / numV, outEdges, True))


def gen_compute(num, epsilon):
    def compute(self, messageSum, agg, superstep):
        # PageRank update: 0.15/num teleport term plus 0.85 times the summed
        # contributions received from in-neighbours in this superstep
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        # stop once the value has converged (checked from superstep 10 on),
        # or unconditionally after superstep 30
        terminate = (superstep >= 10 and abs(newValue - self.value) < epsilon) or superstep > 30
        # spread the new rank evenly over the outgoing edges
        outbox = [Message(edge.target_id, newValue / len(self.outEdges))
                  for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01
    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages, gen_compute(numVertex, epsilon))
    for v in result.filter(lambda x: x.value > threshold).collect():
        print v.id, v.value
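# The snippet above starts mid-file: the imports and the body of parse_vertex
# are cut off before its final return. A minimal sketch of the missing header,
# assuming each input line holds a page title followed by the titles it links
# to, separated by whitespace; the dpark.bagel import path and the Edge
# constructor are assumptions, not taken from the snippet itself.
import os

from dpark import DparkContext
from dpark.bagel import Bagel, Vertex, Edge, Message  # assumed import path


def parse_vertex(line, numV):
    # assumed line format: "<title> <linked-title> <linked-title> ..."
    fields = line.split()
    title, refs = fields[0], fields[1:]
    outEdges = [Edge(ref) for ref in refs]
    return (title, Vertex(title, 1.0 / numV, outEdges, True))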
import math
import random
import os, sys
from pprint import pprint

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dpark import DparkContext

ctx = DparkContext()

# range
nums = ctx.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = ctx.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print 'logging', log.count()
print 'error', log.filter(lambda line: 'error' in line).count()
for line in log.filter(lambda line: 'error' in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(
    lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(
    sorted(
        counts.filter(lambda (_, v): v > 20).map(
            lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))
    # ... tail of calculate_single(id, dptable, mode); its beginning, where
    # the `fs` client is set up, is cut off above this point
    data = fs.get('/song/small/%s.mp3' % id)
    binfile = open("output/%s.mp3" % (id), "wb")
    binfile.write(data)
    binfile.close()
    m = RetrievalMusic(dptable, mode)
    m.retrieving('output/%s.mp3' % id)
    if mode != 2:
        call("rm output/%s.mp3" % (id), shell=True)


def batchprocess(song_id, loaded, mode):
    # dpark = DparkContext()
    # dptable = dpark.broadcast(loaded)
    # dpark.parallelize(song_id, 80).foreach(lambda (id): calculate_single(id, dptable, mode))
    for id in song_id:
        calculate_single(id, loaded, mode)
    if mode == 2:
        rearrange()


if __name__ == '__main__':
    song_id = np.load("track_temp.npy")
    mode = 1  # 1 for save, 2 for filter, and 0 for regular work
    dpark = DparkContext()
    dpark.parallelize(song_id, 50).foreach(lambda (id): calculate_single(id, 0, mode))
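# This file also starts mid-function: the def line of calculate_single and the
# imports it relies on are cut off. A rough sketch of that missing header;
# `fs`, `RetrievalMusic` and `rearrange` come from project-specific modules the
# snippet never names, so those imports are left as hypothetical placeholders.
import numpy as np
from subprocess import call

from dpark import DparkContext
# from some_fs_client import fs                      # hypothetical
# from retrieval import RetrievalMusic, rearrange    # hypothetical


def calculate_single(id, dptable, mode):
    # body as shown in the snippet above: fetch the mp3 by id, write it under
    # output/, run RetrievalMusic over it, then remove the temporary file
    pass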
        # ... tail of compute() inside gen_compute(num, epsilon); the enclosing
        # def lines are cut off above this point
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [(edge.target_id, newValue / len(self.outEdges))
                  for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01
    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages, gen_compute(numVertex, epsilon))
    for id, v in result.filter(
            lambda id_v: id_v[1].value > threshold).collect():
        print(id, v)
from __future__ import print_function
import math
import random
import os, sys
from pprint import pprint
from six.moves import map
from six.moves import range
from six.moves import zip

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dpark import DparkContext

dpark = DparkContext()

# range
nums = dpark.parallelize(list(range(100)), 4)
print(nums.count())
print(nums.reduce(lambda x, y: x + y))

# text search
f = dpark.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print('logging', log.count())
print('error', log.filter(lambda line: 'error' in line).count())
for line in log.filter(lambda line: 'error' in line).collect():
    print(line)

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(
    lambda x, y: x + y).cache()
pprint(counts.filter(lambda kv: kv[1] > 50).collectAsMap())
pprint(sorted(counts.filter(lambda kv: kv[1] > 20).map(
    lambda kv: (kv[1], kv[0])).groupByKey().collect()))
import math
import random
import os, sys
from pprint import pprint

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dpark import DparkContext

ctx = DparkContext()

# range
nums = ctx.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = ctx.textFile("./", ext="py").map(lambda x: x.strip())
log = f.filter(lambda line: "logging" in line).cache()
print "logging", log.count()
print "error", log.filter(lambda line: "error" in line).count()
for line in log.filter(lambda line: "error" in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(sorted(counts.filter(lambda (_, v): v > 20).map(lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))

# Pi
import random
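# The demo breaks off right after the "# Pi" marker. A minimal sketch of how a
# Monte Carlo estimate of pi could continue from here, in the same style; the
# sample count N and the exact wiring are assumptions, not the original code.
N = 100000
hits = ctx.parallelize(range(N), 4).filter(
    lambda _: random.random() ** 2 + random.random() ** 2 < 1).count()
print 'pi is roughly', 4.0 * hits / N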
# coding: utf-8
from random import shuffle, random, sample
import traceback

from dpark import DparkContext

dp = DparkContext('mesos')

'''shuffle() shuffles in place and returns None, so no data comes back'''
rdd1 = dp.parallelize([[(1, 2), (3, 4), (5, 6)]]).map(lambda x: shuffle(x))
print 'rdd1:'
print rdd1.take(1)

# shuffle() on a tuple fails, since tuples do not support item assignment
rdd2 = dp.parallelize([((1, 2), (3, 4), (5, 6))]).map(lambda x: shuffle(x))
print 'rdd2:'
try:
    print rdd2.take(1)
except Exception, e:
    print traceback.print_exc()

'''O(N log N): shuffle by sorting on a random key'''
rdd3 = dp.parallelize([[(1, 2), (3, 4), (5, 6)]]).map(
    lambda x: sorted(x, key=lambda k: random()))
print 'rdd3:'
print rdd3.take(1)

'''O(N): sample(x, len(x)) returns a new shuffled list'''
rdd4 = dp.parallelize([[(1, 2), (3, 4), (5, 6)]]).map(lambda x: sample(x, len(x)))
print 'rdd4:'
print rdd4.take(1)

rdd5 = dp.parallelize([((1, 2), (3, 4), (5, 6))]).map(lambda x: sample(x, len(x)))
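# A small follow-up sketch (not part of the original script): wrapping the
# copy-then-shuffle pattern in a helper keeps shuffle()'s O(N) cost while
# actually returning a value, and works for tuples too; rdd6 is a hypothetical
# name introduced here for illustration.
def shuffled(seq):
    out = list(seq)   # copy into a list so immutable sequences work too
    shuffle(out)      # shuffle the copy in place
    return out

rdd6 = dp.parallelize([((1, 2), (3, 4), (5, 6))]).map(shuffled)
print 'rdd6:'
print rdd6.take(1)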