Пример #1
0
def pocket_write(p, jobid, iter, src_filename):
    """Put ``src_filename`` into pocket ``iter`` times under randomised names.

    Every destination name has the form ``'tmp<random int>-<index>'``: the
    random integer is drawn anew from ``[1, 100000000000000]`` on each pass
    and ``<index>`` is the zero-based pass number.

    Args:
        p: Handle forwarded as the first argument of ``pocket.put``
            (presumably the object returned by ``pocket.connect`` --
            confirm with the caller).
        jobid: Job identifier forwarded to ``pocket.put``.
        iter: Number of puts to perform.
        src_filename: Source file name forwarded to ``pocket.put``.

    Raises:
        Exception: As soon as one ``pocket.put`` call returns a non-zero
            status; the remaining puts are not attempted.
    """
    for index in range(iter):
        token = random.randint(1, 100000000000000)
        target = 'tmp{0}-{1}'.format(token, index)
        status = pocket.put(p, src_filename, target, jobid)
        if status == 0:
            continue
        raise Exception("put failed: " + target)
Пример #2
0
def pocket_write(p, jobid, iter, src_filename, id):
    """Put ``src_filename`` into pocket ``iter`` times, logging each write.

    Destination names have the form ``'/tmp<id>-<index>'`` where ``<index>``
    is the zero-based iteration number.

    Args:
        p: Handle forwarded as the first argument of ``pocket.put``
            (presumably the object returned by ``pocket.connect`` --
            confirm with the caller).
        jobid: Job identifier forwarded to ``pocket.put``.
        iter: Number of puts to perform.
        src_filename: Source file name forwarded to ``pocket.put``.
        id: Value embedded in every destination name; converted with
            ``str``.

    Raises:
        Exception: As soon as one ``pocket.put`` call returns a non-zero
            status ("fail" is printed first).
    """
    # Indentation is spaces only (the block used to mix tabs and spaces),
    # and printing uses the single-argument call form, which produces the
    # same output under Python 2 and also parses under Python 3.
    for i in range(iter):
        dst_filename = '/tmp' + str(id) + '-' + str(i)
        # Two spaces on purpose: that is exactly what the former statement
        # ``print "write ", dst_filename`` emitted.
        print("write  " + dst_filename)
        r = pocket.put(p, src_filename, dst_filename, jobid)
        if r != 0:
            print("fail")
            raise Exception("put failed: " + dst_filename)
    print("finished writing")
Пример #3
0
def lambda_handler(event, context):
    """Map stage of the sort job: range-partition input lines into pocket.

    Phases (each delimited by a timestamp that is logged at the end):
      * t0 -> t1: download this mapper's share of the input objects from S3
        and read all of their lines into memory.
      * t1 -> t2: assign every line to one of ``n`` partitions based on its
        first two characters.
      * t2 -> t3: write each partition to a local temp file and put it into
        pocket as ``/shuffle<id>-<partition>``.
    Finally the timestamps are pickled and stored in the logging Redis
    cluster under ``/map-log-100GB-<n>-<id>``.

    Args:
        event: Mapping with the keys ``'id'`` (index of this mapper),
            ``'n'`` (number of tasks, also the number of partitions),
            ``'bucket_name'`` (S3 bucket) and ``'path'`` (S3 key prefix).
        context: Unused.

    Returns:
        The string ``'map finished <id>'``.

    Raises:
        Exception: If a ``pocket.put`` call returns a non-zero status.
    """
    mapper_id = int(event['id'])
    n_tasks = int(event['n'])
    bucket_name = str(event['bucket_name'])
    path = str(event['path'])

    t0 = time.time()

    # [s3] read the input objects assigned to this mapper.
    s3 = boto3.resource('s3')
    file_local = '/tmp/input_tmp'
    lines = []
    # The total of 1000 input objects is hard-coded; each mapper takes a
    # contiguous run of them.  Floor division is spelled out so the result
    # stays an integer regardless of the interpreter's "/" semantics.
    files_per_task = 1000 // n_tasks
    first_file = mapper_id * files_per_task
    for i in range(first_file, first_file + files_per_task):
        key = path + 'input' + str(i)
        s3.Bucket(bucket_name).download_file(key, file_local)
        with open(file_local, "r") as f:
            lines += f.readlines()  # each line contains a 100b record
        os.remove(file_local)

    t1 = time.time()

    # Partition.  The first two characters are treated as base-95 digits
    # (character codes 32-126, i.e. 126 - 32 + 1 = 95 values) and scaled
    # onto the range [0, n_tasks).
    # NOTE(review): assumes both leading characters lie in 32-126; a lower
    # code can yield a negative index and a line shorter than two
    # characters raises IndexError -- confirm the input format.
    partitions = [[] for _ in range(n_tasks)]
    scale = n_tasks / 95.0  # loop-invariant, computed once
    for line in lines:
        key1 = ord(line[0]) - 32
        key2 = ord(line[1]) - 32
        index = int(scale * (key1 + key2 / 95.0))
        partitions[index].append(line)

    t2 = time.time()

    # Write one object per partition: shuffle<id>-0 ... shuffle<id>-<n-1>.
    conn = pocket.connect("10.1.0.10", 9070)
    jobid = str(event['id'])

    file_tmp = file_local  # the same scratch path is reused for output
    for i, records in enumerate(partitions):
        with open(file_tmp, "w") as f:
            f.writelines(records)
        dst_filename = '/shuffle' + str(mapper_id) + '-' + str(i)
        r = pocket.put(conn, file_tmp, dst_filename, jobid)
        if r != 0:
            raise Exception("put failed: " + dst_filename)
    t3 = time.time()

    # Upload the timing log.
    startup_nodes = [{
        "host": "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com",
        "port": "6379"
    }]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes,
                                      skip_full_coverage_check=True)

    log = {'id': mapper_id, 't0': t0, 't1': t1, 't2': t2, 't3': t3}
    log_str = pickle.dumps(log)
    key = '/map-log-100GB-' + str(n_tasks) + '-' + str(mapper_id)
    redis_client.set(key, log_str)
    print(key + " logged")

    os.remove(file_tmp)

    result = 'map finished ' + str(mapper_id)
    print(result)
    return result
Пример #4
0
def lambda_handler(event, context):
    """Map stage of the word-count job.

    Phases (each delimited by a timestamp that is logged at the end):
      * t0 -> t1: download the input object from S3 and read its lines.
      * t1 -> t2: count word frequencies and group the ``(word, count)``
        pairs by reducer, chosen as ``md5(word) % num_reducer``.
      * t2 -> t3: for every reducer that received at least one word, dump
        its pairs as JSON and put them into pocket as
        ``/shuffle-<mid>-<reducer>``.
    Finally the timestamps are pickled and stored in the logging Redis
    cluster under ``map-log-<num_reducer>-<mid>``.

    Args:
        event: Mapping with the keys ``'mid'`` (index of this mapper) and
            ``'num_reducer'`` (number of reducers).
        context: Unused.

    Returns:
        None.

    Raises:
        Exception: If a ``pocket.put`` call returns a non-zero status.
    """
    mid = int(event['mid'])
    num_reducer = int(event['num_reducer'])

    t0 = time.time()
    # [s3] read the input file.  The key is fixed, so every mapper reads
    # the same object; ``mid`` only appears in the output names.
    s3 = boto3.resource('s3')
    bucket_name = 'wordcount-yawen'
    key = 'input/input_5MB'
    file_tmp = '/tmp/input'
    s3.Bucket(bucket_name).download_file(key, file_tmp)
    with open(file_tmp, "r") as f:
        lines = f.readlines()

    t1 = time.time()

    # Count word frequency.  Splitting on runs of non-word characters can
    # yield empty strings at the line edges; those are skipped.
    word_count = {}  # word -> count
    for line in lines:
        for word in re.split(r'[^\w]+', line):
            if word:
                word_count[word] = word_count.get(word, 0) + 1

    # Partition among reducers.
    shuffle = {}  # rid -> [(word, count), ...]
    for word, count in word_count.items():
        rid = int(hashlib.md5(word).hexdigest(), 16) % num_reducer
        shuffle.setdefault(rid, []).append((word, count))

    t2 = time.time()

    p = pocket.connect("10.1.129.91", 9070)
    jobid = ""

    for rid, pairs in shuffle.items():
        dst_filename = '/shuffle-' + str(mid) + '-' + str(rid)
        # The downloaded input is no longer needed, so its path is reused
        # as the scratch file for each shuffle partition.
        with open(file_tmp, 'w') as f:
            json.dump(pairs, f)
        r = pocket.put(p, file_tmp, dst_filename, jobid)
        # A non-zero status means the partition was not stored; fail loudly
        # instead of letting the reducers work on incomplete data.
        if r != 0:
            raise Exception("put failed: " + dst_filename)

    t3 = time.time()

    # Upload the timing log.
    startup_nodes = [{
        "host": "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com",
        "port": "6379"
    }]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes,
                                      skip_full_coverage_check=True)
    # t1-t0: s3 read
    # t2-t1: word counting & partition
    # t3-t2: intermediate write
    log = {'id': mid, 't0': t0, 't1': t1, 't2': t2, 't3': t3}
    key = 'map-log-' + str(num_reducer) + '-' + str(mid)
    redis_client.set(key, pickle.dumps(log))

    os.remove(file_tmp)
    print(key + ' logged')
Пример #5
0
def pocket_write(p, jobid, iter, src_filename):
    """Put ``src_filename`` into pocket ``iter`` times as ``/tmp-<index>``.

    Args:
        p: Handle forwarded as the first argument of ``pocket.put``
            (presumably the object returned by ``pocket.connect`` --
            confirm with the caller).
        jobid: Job identifier forwarded to ``pocket.put``.
        iter: Number of puts to perform; ``<index>`` runs from 0 to
            ``iter - 1``.
        src_filename: Source file name forwarded to ``pocket.put``.

    Raises:
        Exception: As soon as one ``pocket.put`` call returns a non-zero
            status; the remaining puts are not attempted.
    """
    # ``range`` instead of the Python-2-only ``xrange`` so the helper does
    # not depend on the interpreter version.
    for i in range(iter):
        dst_filename = '/tmp-' + str(i)
        r = pocket.put(p, src_filename, dst_filename, jobid)
        if r != 0:
            raise Exception("put failed: " + dst_filename)
Пример #6
0
def pocket_write(p, jobid, iter, src_filename):
    """Put ``src_filename`` into pocket ``iter`` times as ``/tmp-<index>``.

    Args:
        p: Handle forwarded as the first argument of ``pocket.put``
            (presumably the object returned by ``pocket.connect`` --
            confirm with the caller).
        jobid: Job identifier forwarded to ``pocket.put``.
        iter: Number of puts to perform; ``<index>`` runs from 0 to
            ``iter - 1``.
        src_filename: Source file name forwarded to ``pocket.put``.

    Raises:
        Exception: As soon as one ``pocket.put`` call returns a non-zero
            status.  The status used to be stored and never inspected, so
            failed writes went unnoticed.
    """
    # ``range`` instead of the Python-2-only ``xrange`` so the helper does
    # not depend on the interpreter version.
    for i in range(iter):
        dst_filename = '/tmp-' + str(i)
        r = pocket.put(p, src_filename, dst_filename, jobid)
        if r != 0:
            raise Exception("put failed: " + dst_filename)
Пример #7
0
def lambda_handler(event, context):
    """Smoke-test pocket; the write/read/lookup benchmark is disabled.

    Creates a ``datasize``-byte scratch file, connects to pocket, prints
    the results of ``pocket.create_dir`` and ``pocket.put`` and then
    returns.  The benchmark that follows the early ``return`` (register the
    job, then time ``pocket_write``, ``pocket_read`` and ``pocket_lookup``
    over a fixed number of iterations) is currently unreachable.

    Args:
        event: Unused.
        context: Unused.

    Returns:
        None.
    """
    iterations = 10000
    datasize = 32  # bytes

    def report(operation, elapsed):
        """Print throughput/latency for ``iterations`` ops in ``elapsed`` s."""
        print("==========================================")
        print("Stats for " + str(iterations) + " iter of " + str(datasize) +
              " bytes " + operation + ":")
        throughput = iterations * datasize * 8 / elapsed / 1e9
        print("throughput (Gb/s) = " + str(throughput))
        print("time (s) = " + str(elapsed))
        print("latency (us) = " + str(elapsed / iterations * 1e6))
        print("==========================================")

    # Create a file of size (datasize) in bytes.
    file_tmp = '/tmp/file_tmp'
    with open(file_tmp, 'w') as f:
        f.write('a' * datasize)

    # The scratch file is removed on every exit path; previously both the
    # early return and the registration-failure return left it behind.
    try:
        p = pocket.connect("10.1.129.91", 9070)
        jobid = 'lambda3'
        print(pocket.create_dir(p, 'new_dir', jobid))
        print(pocket.put(p, file_tmp, "test", jobid))
        # NOTE(review): this unconditional return short-circuits the whole
        # benchmark below.  It looks like a debugging leftover -- either
        # delete it to re-enable the benchmark or delete the dead code.
        return

        r = pocket.register_job(p, jobid)  # works if return 0
        if r != 0:
            print("registration failed")
            return

        t0 = time.time()
        pocket_write(p, jobid, iterations, file_tmp)
        t1 = time.time()
        report("write", t1 - t0)

        t0 = time.time()
        pocket_read(p, jobid, iterations, file_tmp)
        t1 = time.time()
        report("read", t1 - t0)

        t0 = time.time()
        pocket_lookup(p, jobid, iterations)
        t1 = time.time()
        report("lookup", t1 - t0)
    finally:
        os.remove(file_tmp)
    return