def pocket_read(p, jobid, iter, src_filename):
    """Fetch src_filename from Pocket `iter` times, each into a randomized
    local temp name, raising on the first failed GET."""
    for idx in range(iter):
        # randomized suffix keeps every download's local name distinct
        suffix = str(random.randint(1, 100000000000000)) + '-' + str(idx)
        local_name = 'tmp' + suffix
        rc = pocket.get(p, local_name, src_filename, jobid)
        if rc != 0:
            raise Exception("get failed: " + local_name)
def lambda_handler(event, context):
    """Sort-benchmark reduce stage.

    Pulls this reducer's shuffle partitions from Pocket, sorts the
    fixed-width records by their 10-byte key, writes the sorted output to S3
    in `m` chunks, and logs timing checkpoints (t0..t3) to a Redis cluster.

    event keys read: 'id' (reducer index), 'n' (number of workers),
    'bucket_name' (S3 output bucket).  Returns a status string.
    """
    id = int(event['id'])  # NOTE(review): shadows the built-in `id`
    n = num_workers = int(event['n'])
    bucket_name = str(event['bucket_name'])
    n_tasks = n  # one shuffle partition per worker
    log_file = []  # per-key timing log (collection is currently commented out)
    t0 = time.time()
    # connect to crail
    #p = pocket.connect("10.1.12.156", 9070)
    p = pocket.connect("10.1.0.10", 9070)
    print "connected"
    jobid = ""
    #jobid = str(event['id'])
    #read from input file: shuffle<0 id> shuffle<1 id> ... shuffle<id num_workers-1>
    #'''
    file_tmp = '/tmp/tmp'  # single scratch file reused for every download
    all_lines = []
    for i in xrange(n_tasks):
        #key = 'shuffle' + str(id) +'-'+ str(i) # wrong one just for testing
        # partition addressed to this reducer: shuffle<mapper>-<reducer>
        key = 'shuffle' + str(i) + '-' + str(id)
        src_filename = key
        dst_filename = file_tmp
        #print src_filename
        r = pocket.get(p, src_filename, dst_filename, jobid)
        if r != 0:
            raise Exception("get failed: " + src_filename)
            return -1  # NOTE(review): unreachable — the raise above exits first
        #log_file.append((key, time.time()))
        with open(dst_filename, "r") as f:
            all_lines += f.readlines()
        #print src_filename + " read success"
        os.remove(file_tmp)
    #'''
    t1 = time.time()  # checkpoint: all shuffle input read
    #print "read all from pocket"
    #merge & sort
    # Records appear to be fixed-width lines: bytes 0-9 the sort key, bytes
    # 12+ the payload (presumably key + 2-byte separator — TODO confirm).
    for i in xrange(len(all_lines)):
        all_lines[i] = (all_lines[i][:10], all_lines[i][12:])
    all_lines.sort(key=lambda x: x[0])
    for i in xrange(len(all_lines)):
        all_lines[i] = all_lines[i][0] + " " + all_lines[i][1]
    t2 = time.time()  # checkpoint: sort done
    #[s3] write to output file: output<id>
    s3 = boto3.resource('s3')
    file_name = 'output/sorted_output'
    # NOTE(review): relies on Python 2 integer division — both quotients must
    # be ints for the slicing below; under Python 3 this would break.
    m = 1000 / n_tasks  # chunks per reducer (1000 output objects in total)
    size = len(all_lines) / m  # lines per chunk; any remainder is dropped
    for i in xrange(m):
        with open(file_tmp, "w+") as f:
            start = size * i
            end = start + size
            f.writelines(all_lines[start:end])
            f.seek(0)
            body = f.read()  # NOTE(review): `body` is never used — dead read?
        # chunk index is globally unique across reducers
        key = file_name + str(id * m + i)
        s3.Bucket(bucket_name).upload_file(file_tmp, key)
        os.remove(file_tmp)
    t3 = time.time()  # checkpoint: S3 upload done
    # upload log
    startup_nodes = [{
        "host": "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com",
        "port": "6379"
    }]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes, skip_full_coverage_check=True)
    log = {'id': id, 't0': t0, 't1': t1, 't2': t2, 't3': t3}
    log_str = pickle.dumps(log)
    key = '/reduce-log' + '-' + '100GB' + '-' + str(n) + '-' + str(id)
    redis_client.set(key, log_str)
    print key + " logged"
    '''
    log_file_str = pickle.dumps(log_file)
    key = '/reduce-log-time'+'-'+'100GB'+'-'+str(n)+'-'+str(id)
    redis_client.set(key, log_file_str)
    print key + " logged"
    '''
    #crail.close(socket, ticket, p)
    r = 'reduce finished ' + str(id)
    print r
    return r
def pocket_read(p, jobid, iter, src_filename, id):
    """Fetch src_filename from Pocket `iter` times, writing each copy to a
    distinct local path derived from `id` and the iteration index.

    Raises Exception on the first GET whose return code is non-zero.
    """
    for i in xrange(iter):
        dst_filename = '/tmp'+str(id)+'-'+str(i)
        r = pocket.get(p, dst_filename, src_filename, jobid)
        # Bug fix: the original assigned the return code to `r` and ignored
        # it, silently swallowing failed GETs; the sibling pocket_read
        # variants in this file raise on r != 0, so do the same here.
        if r != 0:
            raise Exception("get failed: " + dst_filename)
def lambda_handler(event, context): rid = int(event['rid']) n = num_mapper = int(event['num_mapper']) t0 = time.time() # read shuffle files # connect to crail p = pocket.connect("10.1.129.91", 9070) jobid = "" #jobid = str(event['id']) word_count_list = [] for i in xrange(num_mapper): #shuffle_file = 'shuffle/shuffle-' + str(i) + '-' + str(rid) #body = pickle.loads(redis_client.get(shuffle_file)) #word_count_list += body key = 'shuffle-' + str(i) + '-' + str(rid) src_filename = '/tmp/shuffle' dst_filename = '/' + key r = pocket.get(p, dst_filename, src_filename, jobid) if r != 0: raise Exception("get failed: " + dst_filename) return -1 with open(src_filename, 'r') as f: word_count_list += json.load(f) os.remove(src_filename) t1 = time.time() # add up word count word_count = {} for (word, count) in word_count_list: if word in word_count: word_count[word] += count else: word_count[word] = count t2 = time.time() # write output to s3 s3 = boto3.resource('s3') file_tmp = '/tmp/output' with open(file_tmp, "w+") as f: for k, v in word_count.items(): f.write(str(k) + ' ' + str(v) + '\n') key = 'output/output' + str(rid) bucket_name = 'wordcount-yawen' s3.Bucket(bucket_name).upload_file(file_tmp, key) os.remove(file_tmp) t3 = time.time() # upload log startup_nodes = [{ "host": "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com", "port": "6379" }] redis_client = StrictRedisCluster(startup_nodes=startup_nodes, skip_full_coverage_check=True) # t1-t0: intermediate read # t2-t1: adding word count # t3-t2: s3 write log = {'id': rid, 't0': t0, 't1': t1, 't2': t2, 't3': t3} key = 'reduce-log-' + str(n) + '-' + str(rid) redis_client.set(key, pickle.dumps(log)) #print "reducer"+str(rid)+' finished' print key + ' logged'
def pocket_read(p, jobid, iter, src_filename):
    """Fetch src_filename from Pocket `iter` times, one local copy per
    iteration, raising as soon as a GET reports a non-zero status."""
    for attempt in xrange(iter):
        local_path = '/tmp' + '-' + str(attempt)
        status = pocket.get(p, local_path, src_filename, jobid)
        if status != 0:
            raise Exception("get failed: " + local_path)