def get_rdd(self):
    dpark = DparkContext()
    return dpark.union(
        [dpark.textFile(path, splitSize=64 << 20) for path in self.paths]
    ).map(Weblog.from_line)
def word_count(file_path, word):
    # Talk to the specified Mesos master
    dpark = DparkContext()
    # Build an RDD from the distributed file, using 16 MB blocks
    f = dpark.textFile(file_path, splitSize=16 << 20)
    # map() produces a new RDD, filter() keeps the matching lines,
    # and count() computes the final result
    print(
        word, 'count:',
        f.map(lambda line: line.strip()).filter(
            lambda line: word in line).count())
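# Hypothetical usage of the helper above (the path and word are illustrative
# only, not part of the original source):
#   word_count('/mfs/log/access.log', 'error')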
def main(txt, infile, outfile):
    ctx = DparkContext()
    csvfilename = infile
    txtfilename = txt

    txt_rdd = ctx.textFile(txtfilename)
    txt_rdd = txt_rdd.map(divide_txt)
    # ('5988', ['2', 'CPM'])
    csv_rdd = ctx.textFile(csvfilename, splitSize=64 << 20)
    # print csv_rdd.take(100)
    csv_rdd = csv_rdd.filter(remove_some_bid_unitid)
    csv_rdd = csv_rdd.map(divide_csv)
    # ('6379', ['-1', '1236054964187470000', '6379', '77', '1', '1', '0'])
    record_rdd = txt_rdd.join(csv_rdd)
    # ('6370', (['2', 'COMPLEMENT'], ['-1', '8183016859528920000', '6370', '86', '3', '1', '0']))
    record_rdd = record_rdd.mapValue(join_element)
    # ('6370', ['2', 'COMPLEMENT', '-1', '8183016859528920000', '6370', '86', '3', '1', '0'])
    record_rdd = record_rdd.groupBy(
        lambda line: str(line[1]).split()[5] + str(line[1]).split()[1])
    # print record_rdd.take(1)
    record_rdd = record_rdd.map(map_unit_type)
    # print record_rdd.take(1)
    record_rdd = record_rdd.flatMap(flat_map_unit_type_priority)
    # print record_rdd.take(5)
    record_rdd = record_rdd.groupByKey()
    # print "*" * 50
    # print record_rdd.take(5)
    record_rdd = record_rdd.mapValue(map_value_unit_type_priority)
    # print "#" * 50
    # print record_rdd.take(5)
    record_rdd = record_rdd.map(map_unit_type_priority)
    # print "$" * 50
    # print record_rdd.take(5)
    # unit type priority cluster n_ad n_imp n_click ctr
    record_rdd.saveAsTextFile(outfile)
def main(infile, outfile):
    ctx = DparkContext()
    rdd = ctx.textFile(infile)
    rdd = rdd.map(map_unit_type_priority)
    rdd = rdd.reduceByKey(reduce_by_key)
    rdd = rdd.map(map_to_string)
    rdd.saveAsTextFile(outfile)
def DownLoad(file_path):
    dpark = DparkContext()
    file_block = dpark.textFile(file_path, splitSize=16 << 20)
    file_block.foreach(write_to_wav)
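# Hypothetical call for the helper above; write_to_wav() is assumed to be
# defined elsewhere, and the manifest path is illustrative only:
#   DownLoad('/mfs/audio/download_list.txt')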
import glob

from dpark import DparkContext

RATING_PATH = '/nfs/wuhong/offline_use/rating_new/'
TRAINING_PATH = '/nfs/wuhong/paracel/data/als_fm/train'
TEST_PATH = '/nfs/wuhong/paracel/data/als_fm/test'

dpark = DparkContext()


def local_filter1(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return False
    return True


def local_filter2(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return True
    return False


dpark.textFile(glob.glob(RATING_PATH)).filter(
    local_filter1
).saveAsTextFile(TRAINING_PATH)

dpark.textFile(glob.glob(RATING_PATH)).filter(
    local_filter2
).saveAsTextFile(TEST_PATH)
user_feature = dp.makeRDD([])


def _parse_list(line):
    uid, features = line.split('\t')
    features = [x.split(':') for x in features.split('|')]
    features = [(x[0], float(x[1])) for x in features]
    features = sorted(features, key=lambda x: x[1], reverse=True)
    return (uid, features)


for name in ['book_cluster', 'movie_cluster', 'group_cluster', 'text_cluster']:
    fn = '/home2/alg/user_profile/%s/%s' % (current_date, name)
    if not os.path.exists(fn):
        continue
    rdd = dp.textFile(fn, splitSize=16 << 20)\
        .filter(lambda x: x.split('\t', 1)[0] in user_list_b.value)\
        .map(_parse_list)\
        .mapValue(lambda x: [('cnt', len(x)), ('hot', sum([y[1] for y in x]))] + x[:2])\
        .mapValue((lambda name: lambda x: [('%s_concise/%s' % (name, k), v) for (k, v) in x])(name))
    user_feature = user_feature.union(rdd)

for name in ['gender', 'region']:
    fn = '/home2/alg/user_profile/%s/%s' % (current_date, name)
    if not os.path.exists(fn):
        continue
    rdd = dp.textFile(fn, splitSize=16 << 20)\
        .filter(lambda x: x.split('\t', 1)[0] in user_list_b.value)\
        .map(_parse_list)\
        .mapValue(lambda x: x[:2])\
        .mapValue((lambda name: lambda x: [('%s/%s' % (name, k), v) for (k, v) in x])(name))
    user_feature = user_feature.union(rdd)
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and
                     abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [(edge.target_id, newValue / len(self.outEdges))
                  for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01
    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
                       gen_compute(numVertex, epsilon))
    for id, v in result.filter(
            lambda id_v: id_v[1].value > threshold).collect():
        print(id, v)
    return (title, Vertex(title, 1.0 / numV, outEdges, True))


def gen_compute(num, epsilon):
    def compute(self, messageSum, agg, superstep):
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and
                     abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [Message(edge.target_id, newValue / len(self.outEdges))
                  for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01
    dpark = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), inputFile)
    input = dpark.textFile(path)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
                       gen_compute(numVertex, epsilon))
    for v in result.filter(lambda x: x.value > threshold).collect():
        print v.id, v.value
import sys
sys.path.append('../')
from dpark import DparkContext

dpark = DparkContext()

name = '/mfs/tmp/weblog-pre-20111019.csv'
name = '/mfs/tmp/weblog-20111019.csv'
name = '/tmp/weblog-20111019.csv.small'
# name = '/tmp/weblog-20111019.csv.medium'
name = 'resume_text_seg_data-2014-06-01.txt'

pv = dpark.textFile(name)
pv = pv.map(lambda x: x.split(',')).map(lambda l: (l[3], l[7]))
pv = pv.flatMap(lambda (i, u): (u.startswith('/movie') and [(i, 2)]
                                or u.startswith('/group') and [(i, 3)]
                                or []))
# print pv.take(50)
pv = pv.reduceByKey(lambda x, y: x * y)
# print pv.take(50)
print pv.filter(lambda (_, y): y % 2 == 0 and y % 3 == 0).count()

# movie = pv.filter(lambda (bid, url): url.startswith('/movie')).reduceByKey(lambda x, y: None)
# group = pv.filter(lambda (bid, url): url.startswith('/group')).reduceByKey(lambda x, y: None)
# print movie.join(group).count()
# print pv.map(lambda x: x.split(',')[2]).uniq().count()
# print pv.map(lambda x: (x.split(',')[2], None)).reduceByKey(lambda x, y: None).count()
# .filter(lambda uid: uid)
# print upv.count()
# print upv.reduceByKey(lambda x, y: x + y).count()
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex


if __name__ == '__main__':
    D = 4
    K = 3
    IT = 10
    MIN_DIST = 0.01
    centers = [Vector([random.random() for j in range(D)]) for i in range(K)]
    points = dpark.textFile('kmeans_data.txt').map(parseVector).cache()

    for it in range(IT):
        print('iteration', it)
        mappedPoints = points.map(lambda p: (closestCenter(p, centers), (p, 1)))
        ncenters = mappedPoints.reduceByKey(
            # tuple unpacking in lambdas is Python 2 only; combine the
            # (sum, count) pairs explicitly to match the Python 3 style used here
            lambda s1_c1, s2_c2: (s1_c1[0] + s2_c2[0], s1_c1[1] + s2_c2[1])
        ).map(
            lambda id_sum_count: (id_sum_count[0],
                                  id_sum_count[1][0] / id_sum_count[1][1])
        ).collectAsMap()

        updated = False
        for i in ncenters:
            if centers[i].dist(ncenters[i]) > MIN_DIST:
                centers[i] = ncenters[i]
                updated = True
from __future__ import absolute_import
from __future__ import print_function

import sys
sys.path.append('../')
from dpark import DparkContext

dpark = DparkContext()

name = '/mfs/tmp/weblog-pre-20111019.csv'
name = '/mfs/tmp/weblog-20111019.csv'
name = '/tmp/weblog-20111019.csv.small'
# name = '/tmp/weblog-20111019.csv.medium'

pv = dpark.textFile(name)
pv = pv.map(lambda x: x.split(',')).map(lambda l: (l[3], l[7]))
pv = pv.flatMap(lambda i_u: (i_u[1].startswith('/movie') and [(i_u[0], 2)]
                             or i_u[1].startswith('/group') and [(i_u[0], 3)]
                             or []))
# print pv.take(50)
pv = pv.reduceByKey(lambda x, y: x * y)
# print pv.take(50)
print(pv.filter(lambda __y: __y[1] % 2 == 0 and __y[1] % 3 == 0).count())

# movie = pv.filter(lambda (bid, url): url.startswith('/movie')).reduceByKey(lambda x, y: None)
# group = pv.filter(lambda (bid, url): url.startswith('/group')).reduceByKey(lambda x, y: None)
# print movie.join(group).count()
# print pv.map(lambda x: x.split(',')[2]).uniq().count()
# print pv.map(lambda x: (x.split(',')[2], None)).reduceByKey(lambda x, y: None).count()
# .filter(lambda uid: uid)
# print upv.count()
# print upv.reduceByKey(lambda x, y: x + y).count()
    bestIndex = 0
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
            bestIndex = i
    return bestIndex


if __name__ == '__main__':
    D = 4
    K = 3
    IT = 10
    MIN_DIST = 0.01
    centers = [Vector([random.random() for j in range(D)]) for i in range(K)]
    points = dpark.textFile('kmeans_data.txt').map(parseVector).cache()

    for it in range(IT):
        print 'iteration', it
        mappedPoints = points.map(lambda p: (closestCenter(p, centers), (p, 1)))
        ncenters = mappedPoints.reduceByKey(
            lambda (s1, c1), (s2, c2): (s1 + s2, c1 + c2)
        ).map(
            lambda (id, (sum, count)): (id, sum / count)
        ).collectAsMap()

        updated = False
        for i in ncenters:
            if centers[i].dist(ncenters[i]) > MIN_DIST:
                centers[i] = ncenters[i]
                updated = True
import glob

from dpark import DparkContext

RATING_PATH = '/nfs/wuhong/offline_use/rating_new/'
TRAINING_PATH = '/nfs/wuhong/paracel/data/als_fm/train'
TEST_PATH = '/nfs/wuhong/paracel/data/als_fm/test'

dpark = DparkContext()


def local_filter1(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return False
    return True


def local_filter2(line):
    tmp = line.strip().split(',')[1]
    if tmp.endswith('0') or tmp.endswith('1') or tmp.endswith('2'):
        return True
    return False


dpark.textFile(
    glob.glob(RATING_PATH)).filter(local_filter1).saveAsTextFile(TRAINING_PATH)
dpark.textFile(
    glob.glob(RATING_PATH)).filter(local_filter2).saveAsTextFile(TEST_PATH)
from dpark import DparkContext, optParser

dc = DparkContext()
options, args = optParser.parse_args()

infile = args[0]
outfile = args[1]
print("from {} to {}".format(infile, outfile))


def fm(x):
    for w in x.strip().split():
        yield (w, 1)


(dc.textFile(infile)
 .flatMap(fm)
 .reduceByKey(lambda x, y: x + y, numSplits=6)
 .map(lambda x: " ".join(list(map(str, x))))
 .saveAsTextFile(outfile, overwrite=False))
#!/usr/bin/env python
# encoding: utf-8
"""
@version: v1.0
@author: W_H_J
@license: Apache Licence
@contact: [email protected]
@site:
@software: PyCharm
@file: wordcount.py
@time: 2018/6/5 18:10
@describe: word count
"""
from dpark import DparkContext

ctx = DparkContext()
file = ctx.textFile("./words.txt")
words = file.flatMap(lambda x: x.split()).map(lambda x: (x, 1))
wc = words.reduceByKey(lambda x, y: x + y).collectAsMap()
print(wc)


# Count how many times a given word occurs
def word_count(file_path, word):
    # Talk to the specified Mesos master
    dpark = DparkContext()
    # Build an RDD from the distributed file, using 16 MB blocks
    f = dpark.textFile(file_path, splitSize=16 << 20)
    # map() produces a new RDD, filter() keeps the matching lines,
    # and count() computes the final result
    print(
        word, 'count:',
        f.map(lambda line: line.strip()).filter(
            lambda line: word in line).count())
    outEdges = [Edge(ref) for ref in refs]
    return (title, Vertex(title, 1.0 / numV, outEdges, True))


def gen_compute(num, epsilon):
    def compute(self, messageSum, agg, superstep):
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and
                     abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [Message(edge.target_id, newValue / len(self.outEdges))
                  for edge in self.outEdges] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01
    dpark = DparkContext()
    input = dpark.textFile(inputFile)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
                       gen_compute(numVertex, epsilon))
    for v in result.filter(lambda x: x.value > threshold).collect():
        print v.id, v.value
import sys
sys.path.append('../')
import logging
from dpark import DparkContext

dpark = DparkContext()

name = 'rating.txt'


def parse(line):
    sid, uid, r, f = line.split('\t')
    defaults = {'F': 4.5, 'P': 3.7, 'N': 4.0}
    if r == 'None':
        r = defaults[f]
    return (sid, (uid, float(r)))


rating = dpark.textFile(name, numSplits=2).map(parse).groupByKey(2)  # .cache()
# print 'us', rating.first()
print rating.count()


def reverse(it):
    s = {}
    for k, us in it:
        for u, r in us:
            s.setdefault(u, {})[k] = r
    return s


def vsum(a, b):
    # return 1
    if len(a) < len(b):
        a, b = b, a
    d = dict(a)
import os
import sys
from pprint import pprint

from six.moves import map
from six.moves import range
from six.moves import zip

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

dpark = DparkContext()

# range
nums = dpark.parallelize(list(range(100)), 4)
print(nums.count())
print(nums.reduce(lambda x, y: x + y))

# text search
f = dpark.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print('logging', log.count())
print('error', log.filter(lambda line: 'error' in line).count())
for line in log.filter(lambda line: 'error' in line).collect():
    print(line)

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda __v1: __v1[1] > 50).collectAsMap())
pprint(sorted(counts.filter(lambda __v: __v[1] > 20).map(lambda x_y: (x_y[1], x_y[0])).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))

# Pi
import random
import random
import os, sys
from pprint import pprint

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

ctx = DparkContext()

# range
nums = ctx.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = ctx.textFile("./", ext="py").map(lambda x: x.strip())
log = f.filter(lambda line: "logging" in line).cache()
print "logging", log.count()
print "error", log.filter(lambda line: "error" in line).count()
for line in log.filter(lambda line: "error" in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(sorted(counts.filter(lambda (_, v): v > 20).map(lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))

# Pi
import random
from dpark import DparkContext, optParser

dc = DparkContext()
options, args = optParser.parse_args()

infile = args[0]
outfile = args[1]
print("from {} to {}".format(infile, outfile))


def fm(x):
    for w in x.strip().split():
        yield (w, 1)


(dc.textFile(infile)
 .flatMap(fm)
 .reduceByKey(lambda x, y: x + y, numSplits=6)
 .map(lambda x: " ".join(list(map(str, x))))
 .saveAsTextFile(outfile, overwrite=False))
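# Illustrative invocation of the word-count script above (the script name and
# paths are assumptions, not part of the original source):
#   python wordcount.py /path/to/input.txt /path/to/output_dir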
def get_rdd(self):
    dpark = DparkContext()
    return dpark.union([
        dpark.textFile(path, splitSize=64 << 20)
        for path in self.paths
    ]).map(Weblog.from_line)
    outEdges = [SPEdge(tid, int(v)) for _, tid, v in lines]
    return (id, SPVertex(id, sys.maxint, outEdges, True))


def compute(self, vs, agg, superstep):
    newValue = min(self.value, vs[0]) if vs else self.value
    if newValue != self.value:
        outbox = [SPMessage(edge.target_id, newValue + edge.value)
                  for edge in self.outEdges]
    else:
        outbox = []
    return SPVertex(self.id, newValue, self.outEdges, False), outbox


if __name__ == '__main__':
    ctx = DparkContext()
    lines = ctx.textFile('graph.txt').map(lambda line: line.split(' '))
    vertices = lines.filter(lambda x: len(x) == 3).groupBy(
        lambda line: line[0]).map(to_vertex)
    messages = lines.filter(lambda x: len(x) == 2).map(
        lambda (vid, v): (vid, SPMessage(vid, int(v)))
    )
    print 'read', vertices.count(), 'vertices and ', messages.count(), 'messages.'

    result = Bagel.run(ctx, vertices, messages, compute, MinCombiner())
    startVertex = 0
    print 'Shortest path from %s to all vertices:' % startVertex
    for v in result.collect():
        if v.value == sys.maxint:
            v.value = 'inf'
        print v.id, v.value
def compute(self, vs, agg, superstep):
    newValue = min(self.value, vs[0]) if vs else self.value
    if newValue != self.value:
        outbox = [
            SPMessage(edge.target_id, newValue + edge.value)
            for edge in self.outEdges
        ]
    else:
        outbox = []
    return SPVertex(self.id, newValue, self.outEdges, False), outbox


if __name__ == '__main__':
    ctx = DparkContext()
    lines = ctx.textFile('graph.txt').map(lambda line: line.split(' '))
    vertices = lines.filter(lambda x: len(x) == 3).groupBy(
        lambda line: line[0]).map(to_vertex)
    messages = lines.filter(lambda x: len(x) == 2).map(
        lambda (vid, v): (vid, SPMessage(vid, int(v))))
    print 'read', vertices.count(), 'vertices and ', messages.count(), 'messages.'

    result = Bagel.run(ctx, vertices, messages, compute, MinCombiner())
    startVertex = 0
    print 'Shortest path from %s to all vertices:' % startVertex
    for v in result.collect():
        if v.value == sys.maxint:
            v.value = 'inf'
        print v.id, v.value
f_global = file(MU_PATH)
line = ''
for l in f_global:
    line = l
mu = float(line.strip().split('\t')[1])
f_global.close()
mu = dpark.broadcast(mu)


def local_mapper(line):
    iid, v, _ = line.strip().split('\t')
    return (iid, float(v))


ibias = {}
ibias = dpark.textFile(glob.glob(IBIAS_PATH)).map(
    local_mapper
).collectAsMap()
ibias = dpark.broadcast(ibias)


def local_mapper2(line):
    uid, iid, aid, v = line.strip().split('\t')
    return '%s,%s,%s\n' % (uid, iid, float(v) - mu - ibias[iid])


# generate new rating data
dpark.textFile(glob.glob(RATING_PATH)).filter(
    lambda line: ibias.get(line.strip().split('\t')[1])
).map(
    local_mapper2
).saveAsTextFile(NEW_RATING_PATH)


def local_mapper3(line):
    return bestIndex


def minDist(p, centers):
    bestDist = p.squaredDist(centers[0])
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
    return bestDist


if __name__ == '__main__':
    K = 100
    IT = 50
    MIN_DIST = 0.01
    PATH = 'tab/1558dee2ecfb7a0f9f63e27376675b6c.tab'
    points = dpark.textFile(PATH, numSplits=100)[:1].map(parseVector).cache()
    print points.count()

    centers = points.take(K)

    for it in range(IT):
        print 'iteration', it
        mappedPoints = points.map(lambda p: (closestCenter(p, centers), (p, 1)))
        ncenters = mappedPoints.reduceByKey(
            lambda (s1, c1), (s2, c2): (s1 + s2, c1 + c2)
        ).map(
            lambda (id, (sum, count)): (id, sum / count)
        ).collectAsMap()

        updated = False
        for i in ncenters:
            if centers[i].dist(ncenters[i]) > MIN_DIST:
def DownLoad(file_path):
    dpark = DparkContext()
    file_block = dpark.textFile(file_path, splitSize=16 << 20)
    file_block.foreach(write_to_wav)
from dpark import DparkContext

dpark = DparkContext()

name = 'rating.txt'


def parse(line):
    sid, uid, r, f = line.split('\t')
    defaults = {'F': 4.5, 'P': 3.7, 'N': 4.0}
    if r == 'None':
        r = defaults[f]
    return (sid, (uid, float(r)))


rating = dpark.textFile(name, numSplits=2).map(parse).groupByKey(2)  # .cache()
# print 'us', rating.first()
print rating.count()


def reverse(it):
    s = {}
    for k, us in it:
        for u, r in us:
            s.setdefault(u, {})[k] = r
    return s


def vsum(a, b):
    # return 1
    if len(a) < len(b):
    # sys.maxsize stands in for "infinity"; sys.maxint does not exist on
    # Python 3, which this snippet otherwise targets
    return (id, Vertex(id, sys.maxsize, outEdges, True))


def compute(self, vs, agg, superstep):
    newValue = min(self.value, vs[0]) if vs else self.value
    if newValue != self.value:
        outbox = [(edge.target_id, newValue + edge.value)
                  for edge in self.outEdges]
    else:
        outbox = []
    return Vertex(self.id, newValue, self.outEdges, False), outbox


if __name__ == '__main__':
    ctx = DparkContext()
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'graph.txt')
    lines = ctx.textFile(path).map(lambda line: line.split(' '))
    vertices = lines.groupBy(lambda line: line[0]).map(to_vertex)
    startVertex = str(0)
    messages = ctx.makeRDD([(startVertex, 0)])
    print('read', vertices.count(), 'vertices and ', messages.count(), 'messages.')

    result = Bagel.run(ctx, vertices, messages, compute,
                       BasicCombiner(min), numSplits=2)
    print('Shortest path from %s to all vertices:' % startVertex)
    for id, v in result.collect():
        if v.value == sys.maxsize:
            v.value = 'inf'
        print(v.id, v.value)
def minDist(p, centers):
    bestDist = p.squaredDist(centers[0])
    for i in range(1, len(centers)):
        d = p.squaredDist(centers[i])
        if d < bestDist:
            bestDist = d
    return bestDist


if __name__ == '__main__':
    K = 100
    IT = 50
    MIN_DIST = 0.01
    PATH = 'tab/1558dee2ecfb7a0f9f63e27376675b6c.tab'
    points = dpark.textFile(PATH, numSplits=100)[:1].map(parseVector).cache()
    print points.count()

    centers = points.take(K)

    for it in range(IT):
        print 'iteration', it
        mappedPoints = points.map(lambda p: (closestCenter(p, centers), (p, 1)))
        ncenters = mappedPoints.reduceByKey(
            lambda (s1, c1), (s2, c2): (s1 + s2, c1 + c2)).map(
                lambda (id, (sum, count)): (id, sum / count)).collectAsMap()

        updated = False
        for i in ncenters:
            if centers[i].dist(ncenters[i]) > MIN_DIST:
                centers[i] = ncenters[i]
import math
import random
import os, sys
from pprint import pprint

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dpark import DparkContext

ctx = DparkContext()

# range
nums = ctx.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = ctx.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print 'logging', log.count()
print 'error', log.filter(lambda line: 'error' in line).count()
for line in log.filter(lambda line: 'error' in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(
    lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(
    sorted(
        counts.filter(lambda (_, v): v > 20).map(
            lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))
        if messageSum and messageSum[0]:
            newValue = 0.15 / num + 0.85 * messageSum[0]
        else:
            newValue = self.value
        terminate = (superstep >= 10 and
                     abs(newValue - self.value) < epsilon) or superstep > 30
        outbox = [
            Message(edge.target_id, newValue / len(self.outEdges))
            for edge in self.outEdges
        ] if not terminate else []
        return Vertex(self.id, newValue, self.outEdges, not terminate), outbox
    return compute


if __name__ == '__main__':
    inputFile = 'wikipedia.txt'
    threshold = 0.01
    dpark = DparkContext()
    input = dpark.textFile(inputFile)
    numVertex = input.count()
    vertices = input.map(lambda line: parse_vertex(line, numVertex)).cache()
    epsilon = 0.01 / numVertex
    messages = dpark.parallelize([])
    result = Bagel.run(dpark, vertices, messages,
                       gen_compute(numVertex, epsilon))
    for v in result.filter(lambda x: x.value > threshold).collect():
        print v.id, v.value