# Shared imports for the helpers and Lambda handlers in this file.
import os
import re
import json
import time
import pickle
import random
import hashlib

import boto3
import pocket
from rediscluster import StrictRedisCluster


def pocket_write(p, jobid, iter, src_filename):
    # Issue `iter` sequential puts, each under a fresh randomized key.
    for i in range(iter):
        dst_filename = 'tmp' + str(random.randint(1, 100000000000000)) + '-' + str(i)
        r = pocket.put(p, src_filename, dst_filename, jobid)
        if r != 0:
            raise Exception("put failed: " + dst_filename)
def pocket_write(p, jobid, iter, src_filename, id):
    # Variant used by the sort benchmark: keys are derived from the
    # per-lambda id instead of a random suffix.
    for i in xrange(iter):
        dst_filename = '/tmp' + str(id) + '-' + str(i)
        print "write ", dst_filename
        r = pocket.put(p, src_filename, dst_filename, jobid)
        if r != 0:
            print "fail"
            raise Exception("put failed: " + dst_filename)
    print "finished writing"
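# Usage sketch for the pocket_write helper above (an illustrative example,
# not part of the benchmark): the endpoint 10.1.129.91:9070 and the jobid
# are placeholders borrowed from the microbenchmark further down.
def demo_pocket_write():
    src = '/tmp/payload'
    with open(src, 'w') as f:
        f.write('x' * 1024)  # 1 KB dummy payload
    p = pocket.connect("10.1.129.91", 9070)
    jobid = 'demo-job'
    if pocket.register_job(p, jobid) != 0:  # returns 0 on success
        raise Exception("job registration failed")
    pocket_write(p, jobid, 100, src, 0)  # 100 puts: /tmp0-0 .. /tmp0-99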
def lambda_handler(event, context):
    id = int(event['id'])
    n = num_workers = int(event['n'])
    bucket_name = str(event['bucket_name'])
    path = str(event['path'])
    n_tasks = n

    t0 = time.time()

    # [s3] read this worker's slice of the input: each input file holds
    # 100 MB of 100-byte records, and the 1000 files are divided evenly,
    # so every worker reads m = 1000 / n_tasks of them (e.g. 4 files when
    # n_tasks = 250).
    s3 = boto3.resource('s3')
    file_local = '/tmp/input_tmp'
    lines = []
    m = 1000 / n_tasks
    for i in xrange(id * m, (id + 1) * m):
        key = path + 'input' + str(i)
        s3.Bucket(bucket_name).download_file(key, file_local)
        with open(file_local, "r") as f:
            lines += f.readlines()  # each line is one 100-byte record
    os.remove(file_local)
    t1 = time.time()

    # Partition records by the first two characters of their key.
    # Printable key bytes span 32-126, i.e. 95 distinct values.
    p_list = [[] for x in xrange(n_tasks)]  # one list per partition
    scale = n_tasks / 95.0
    for line in lines:
        key1 = ord(line[0]) - 32
        key2 = ord(line[1]) - 32
        index = int(scale * (key1 + key2 / 95.0))
        p_list[index].append(line)
    t2 = time.time()

    # Write one shuffle file per reducer to crail:
    # shuffle<id>-0 .. shuffle<id>-<n_tasks-1>.
    p = pocket.connect("10.1.0.10", 9070)
    jobid = str(event['id'])
    file_tmp = file_local
    for i in xrange(n_tasks):
        with open(file_tmp, "w") as f:
            f.writelines(p_list[i])
        key = 'shuffle' + str(id) + '-' + str(i)
        r = pocket.put(p, file_tmp, '/' + key, jobid)
        if r != 0:
            raise Exception("put failed: /" + key)
    t3 = time.time()

    # Upload timing log to Redis.
    # t1-t0: s3 read, t2-t1: partition, t3-t2: shuffle write.
    startup_nodes = [{"host": "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com",
                      "port": "6379"}]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes,
                                      skip_full_coverage_check=True)
    log = {'id': id, 't0': t0, 't1': t1, 't2': t2, 't3': t3}
    key = '/map-log-100GB-' + str(n) + '-' + str(id)
    redis_client.set(key, pickle.dumps(log))
    print key + " logged"

    os.remove(file_tmp)
    r = 'map finished ' + str(id)
    print r
    return r
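# The reduce side of the sort is not part of this file; the sketch below
# shows how a reducer could fetch its partition, assuming pocket.get takes
# (conn, src_filename, dst_filename, jobid) like pocket.put does (that
# signature is an assumption here).
def sort_reduce_read(p, jobid, rid, num_mappers):
    lines = []
    for mid in xrange(num_mappers):
        src = '/shuffle' + str(mid) + '-' + str(rid)  # written by mapper mid
        dst = '/tmp/shuffle_part'
        if pocket.get(p, src, dst, jobid) != 0:
            raise Exception("get failed: " + src)
        with open(dst, 'r') as f:
            lines += f.readlines()
    lines.sort()  # records order by their two-character key prefix
    return lines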
def lambda_handler(event, context):
    # Utility toggle, normally disabled: flush the Redis cluster instead
    # of running the mapper.
    '''
    startup_nodes = [{"host": "rediscluster.a9ith3.clustercfg.usw2.cache.amazonaws.com",
                      "port": "6379"}]
    client = StrictRedisCluster(startup_nodes=startup_nodes, skip_full_coverage_check=True)
    client.flushall()
    print "Redis cluster flushed"
    return
    '''
    mid = int(event['mid'])
    n = num_reducer = int(event['num_reducer'])

    t0 = time.time()

    # [s3] read the input file for this mapper.
    s3 = boto3.resource('s3')
    bucket_name = 'wordcount-yawen'
    #key = 'input/input' + str(mid)
    key = 'input/input_5MB'
    file_tmp = '/tmp/input'
    s3.Bucket(bucket_name).download_file(key, file_tmp)
    with open(file_tmp, "r") as f:
        lines = f.readlines()
    t1 = time.time()

    # Tokenize and count word frequency.
    words = []
    for line in lines:
        words += re.split(r'[^\w]+', line)
    words = list(filter(None, words))
    word_count = {}  # word -> count
    for word in words:
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1

    # Partition the counts among reducers by hashing each word.
    shuffle = {}  # rid -> [(word, count), ...]
    for k, v in word_count.items():
        rid = int(hashlib.md5(k).hexdigest(), 16) % num_reducer
        if rid in shuffle:
            shuffle[rid].append((k, v))
        else:
            shuffle[rid] = [(k, v)]
    t2 = time.time()

    # Write one shuffle file per reducer to crail; the downloaded input
    # file is reused as scratch space for the serialized partitions.
    p = pocket.connect("10.1.129.91", 9070)
    jobid = ""
    for k, v in shuffle.items():
        key = 'shuffle-' + str(mid) + '-' + str(k)
        with open(file_tmp, 'w') as f:
            json.dump(v, f)
        r = pocket.put(p, file_tmp, '/' + key, jobid)
        if r != 0:
            raise Exception("put failed: /" + key)
    t3 = time.time()

    # Upload timing log to Redis.
    # t1-t0: s3 read, t2-t1: word counting & partition, t3-t2: shuffle write.
    startup_nodes = [{"host": "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com",
                      "port": "6379"}]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes,
                                      skip_full_coverage_check=True)
    log = {'id': mid, 't0': t0, 't1': t1, 't2': t2, 't3': t3}
    key = 'map-log-' + str(n) + '-' + str(mid)
    redis_client.set(key, pickle.dumps(log))

    os.remove(file_tmp)
    print key + ' logged'
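# Matching wordcount reduce sketch (again assuming a pocket.get that mirrors
# pocket.put's signature); it merges the (word, count) pairs each mapper
# routed to reducer rid.
def wordcount_reduce(p, jobid, rid, num_mappers):
    totals = {}
    for mid in xrange(num_mappers):
        dst = '/tmp/shuffle_in'
        # a mapper may have produced no pairs for this reducer
        if pocket.get(p, '/shuffle-' + str(mid) + '-' + str(rid), dst, jobid) != 0:
            continue
        with open(dst, 'r') as f:
            for word, count in json.load(f):
                totals[word] = totals.get(word, 0) + count
    return totals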
def pocket_write(p, jobid, iter, src_filename):
    # Microbenchmark variant: one fixed key per iteration index.
    for i in xrange(iter):
        dst_filename = '/tmp' + '-' + str(i)
        r = pocket.put(p, src_filename, dst_filename, jobid)
        if r != 0:
            raise Exception("put failed: " + dst_filename)
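# pocket_read and pocket_lookup are called by the microbenchmark below but
# not defined in this file. Sketches follow, under two assumptions: that
# pocket.get takes (conn, src, dst, jobid) like pocket.put, and that the
# client exposes a metadata-only pocket.lookup(conn, path, jobid); both
# signatures are assumptions, not confirmed API.
def pocket_read(p, jobid, iter, dst_filename):
    for i in xrange(iter):
        src_filename = '/tmp' + '-' + str(i)  # keys written by pocket_write
        r = pocket.get(p, src_filename, dst_filename, jobid)
        if r != 0:
            raise Exception("get failed: " + src_filename)


def pocket_lookup(p, jobid, iter):
    for i in xrange(iter):
        src_filename = '/tmp' + '-' + str(i)
        r = pocket.lookup(p, src_filename, jobid)  # metadata check only
        if r != 0:
            raise Exception("lookup failed: " + src_filename)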
def print_stats(op, iter, datasize, t0, t1):
    # Shared reporting for the write/read/lookup loops below; the Gb/s
    # figure is nominal for lookup, which moves no data.
    print "=========================================="
    print "Stats for " + str(iter) + " iter of " + str(datasize) + " bytes " + op + ":"
    throughput = iter * datasize * 8 / (t1 - t0) / 1e9
    print "throughput (Gb/s) = " + str(throughput)
    print "time (s) = " + str(t1 - t0)
    print "latency (us) = " + str((t1 - t0) / iter * 1e6)
    print "=========================================="


def lambda_handler(event, context):
    # Create a local file of (datasize) bytes to use as the put payload.
    iter = 10000
    datasize = 32  # bytes
    file_tmp = '/tmp/file_tmp'
    with open(file_tmp, 'w') as f:
        f.write('a' * datasize)

    # Connect to crail and register the job.
    # (debug probes, disabled: pocket.create_dir(p, 'new_dir', jobid),
    #  pocket.put(p, file_tmp, "test", jobid))
    p = pocket.connect("10.1.129.91", 9070)
    jobid = 'lambda3'
    r = pocket.register_job(p, jobid)  # returns 0 on success
    if r != 0:
        print "registration failed"
        return

    t0 = time.time()
    pocket_write(p, jobid, iter, file_tmp)
    t1 = time.time()
    print_stats("write", iter, datasize, t0, t1)

    t0 = time.time()
    pocket_read(p, jobid, iter, file_tmp)
    t1 = time.time()
    print_stats("read", iter, datasize, t0, t1)

    t0 = time.time()
    pocket_lookup(p, jobid, iter)
    t1 = time.time()
    print_stats("lookup", iter, datasize, t0, t1)

    os.remove(file_tmp)
    return
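# Local smoke test (sketch): the microbenchmark handler ignores its event,
# so an empty dict suffices, but the pocket endpoint hardcoded above must
# be reachable from wherever this runs.
if __name__ == '__main__':
    lambda_handler({}, None)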