Example #1
def handle_deregister_job(reader, writer):
    jobid_len = yield from reader.read(INT)
    jobid_len, = struct.Struct("!i").unpack(jobid_len)
    jobid = yield from reader.read(jobid_len)
    jobid, = struct.Struct("!" + str(jobid_len) + "s").unpack(jobid)
    jobid = jobid.decode('utf-8')

    print(
        "------------------------- DEREGISTER JOB --------------------------------",
        time.time())
    # clear weight of this job
    for (datanodeip_port, weight) in job_table.loc[jobid, 'wmask_str']:
        datanode_alloc.at[datanodeip_port,
                          'net'] = datanode_alloc.at[datanodeip_port,
                                                     'net'] - weight
    # delete job from table
    err = remove_job(jobid)
    if err == 0:
        # delete dir named jobid
        createdirsock = pocket.connect(NAMENODE_IP, NAMENODE_PORT)
        if createdirsock is None:
            return
        pocket.delete(createdirsock, None, "/" + jobid)
        #pocket.close(createdirsock)

    # reply to client with jobid int
    resp_packer = struct.Struct(RESP_STRUCT_FORMAT)
    resp = (RESP_LEN_BYTES + INT, TICKET, JOB_CMD, err, DEREGISTER_OPCODE)
    pkt = resp_packer.pack(*resp)
    writer.write(pkt)
    print(
        "------------------------- DEREGISTERED JOB --------------------------------"
    )
    return
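
The handler above expects a length-prefixed job id: a 4-byte big-endian length followed by that many bytes of UTF-8. A minimal client-side sketch of that framing, assuming the same struct formats (the ticket/command header that precedes it in the full protocol is not shown in this excerpt):

import struct

def pack_jobid(jobid):
    # 4-byte big-endian length, then the job id bytes, mirroring what
    # handle_deregister_job unpacks with "!i" and "!<n>s".
    raw = jobid.encode('utf-8')
    return struct.pack("!i", len(raw)) + struct.pack("!" + str(len(raw)) + "s", raw)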
Example #2
def handle_register_job(reader, writer):
    print(
        "-------------------------- REGISTER JOB --------------------------------",
        time.time())
    jobname_len = yield from reader.read(INT)
    jobname_len, = struct.Struct("!i").unpack(jobname_len)
    jobname = yield from reader.read(jobname_len + 3 * INT + SHORT)
    jobname, num_lambdas, jobGB, peakMbps, latency_sensitive = struct.Struct(
        "!" + str(jobname_len) + "siiih").unpack(jobname)
    jobname = jobname.decode('utf-8')

    # generate jobid
    if 'gg' in jobname:
        jobid = jobname + '-1234'
        jobid_int = 1234
    else:
        jobid_int = randint(0, 1000000)
        jobid = jobname + "-" + str(jobid_int)

    print("received hints ", jobid, num_lambdas, jobGB, peakMbps,
          latency_sensitive)
    # create dir named jobid
    createdirsock = pocket.connect(NAMENODE_IP, NAMENODE_PORT)
    if createdirsock is None:
        return
    pocket.create_dir(createdirsock, None, jobid)
    #pocket.close(createdirsock)

    if jobGB == 0 or peakMbps == 0:
        jobGB, peakMbps = compute_GB_Mbps_with_hints(num_lambdas, jobGB,
                                                     peakMbps,
                                                     latency_sensitive)

    # generate weightmask
    wmask, wmask_str = yield from generate_weightmask(jobid, jobGB, peakMbps,
                                                      latency_sensitive)
    # wmask = [(ioctlcmd.calculate_datanode_hash("10.1.88.82", 50030), 1)]

    # register job in table
    err = add_job(jobid, jobGB, peakMbps, wmask, wmask_str)

    # send wmask to metadata server
    ioctlsock = yield from ioctlcmd.connect(NAMENODE_IP, NAMENODE_PORT)
    if ioctlsock is None:
        return
    yield from ioctlcmd.send_weightmask(ioctlsock, jobid, wmask)

    # reply to client with jobid int
    resp_packer = struct.Struct(RESP_STRUCT_FORMAT + "i")
    resp = (RESP_LEN_BYTES + INT, TICKET, JOB_CMD, err, REGISTER_OPCODE,
            jobid_int)
    pkt = resp_packer.pack(*resp)
    writer.write(pkt)
    print(
        "-------------------------- REGISTERED JOB --------------------------------"
    )

    return
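
handle_register_job reads the job name length with "!i" and then a single "!<n>siiih" record carrying the name, num_lambdas, jobGB, peakMbps, and latency_sensitive. A hedged sketch of how a client might pack that payload (only the portion this handler parses; the surrounding ticket/command framing is not part of this excerpt):

import struct

def pack_register_payload(jobname, num_lambdas, jobGB, peakMbps, latency_sensitive):
    raw = jobname.encode('utf-8')
    return (struct.pack("!i", len(raw)) +
            struct.pack("!" + str(len(raw)) + "siiih",
                        raw, num_lambdas, jobGB, peakMbps, latency_sensitive))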
Example #3
def lambda_handler(event, context):
    # create a file of size (datasize) in bytes
    iter = 100
    datasize = 1024  #bytes
    jobid = "latency-test".join(
        random.sample(string.ascii_letters + string.digits, 6))
    namenode_ip = "10.1.47.178"

    file_tmp = '/tmp/file_tmp2'
    with open(file_tmp, 'w') as f:
        text = 'a' * datasize
        f.write(text)

    # connect to pocket
    p = pocket.connect(namenode_ip, 9070)

    # test read/write through buffer
    dir = jobid + "microbenchmark"
    pocket.create_dir(p, dir, "")
    jobid = dir

    t0 = time.time()
    pocket_write_buffer(p, jobid, iter, text, datasize)
    t1 = time.time()
    print("==========================================")
    print("Stats for " + str(iter) + " iter of " + str(datasize) +
          " bytes write_buffer:")
    throughput = iter * datasize * 8 / (t1 - t0) / 1e9
    print("throughput (Gb/s) = " + str(throughput))
    print("latency (us) = " + str((t1 - t0) / iter * 1e6))
    print("==========================================")

    text_back = " " * datasize
    t0 = time.time()
    pocket_read_buffer(p, jobid, iter, text_back, datasize)
    t1 = time.time()
    print("==========================================")
    print("Stats for " + str(iter) + " iter of " + str(datasize) +
          " bytes read_buffer:")
    throughput = iter * datasize * 8 / (t1 - t0) / 1e9
    print("throughput (Gb/s) = " + str(throughput))
    print("latency (us) = " + str((t1 - t0) / iter * 1e6))
    print("==========================================")

    t0 = time.time()
    pocket_lookup(p, jobid, iter)
    t1 = time.time()
    print("==========================================")
    print("Stats for " + str(iter) + " iter of " + str(datasize) +
          " bytes lookup (metadata RPC):")
    throughput = iter * datasize * 8 / (t1 - t0) / 1e9
    print("throughput (Gb/s) = " + str(throughput))
    print("latency (us) = " + str((t1 - t0) / iter * 1e6))
    print("==========================================")

    os.remove(file_tmp)
    return
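
pocket_write_buffer, pocket_read_buffer, and pocket_lookup are helpers that are not defined in this excerpt. A hypothetical sketch of the buffer helpers, reusing the put_buffer/get_buffer call shapes that appear in Examples #8 and #10 (the per-iteration key scheme and the `import pocket` are assumptions):

import pocket  # Pocket client module, used as in the examples above

def pocket_write_buffer(p, jobid, iters, text, datasize):
    for i in range(iters):
        key = jobid + "-" + str(i)  # hypothetical key naming
        pocket.put_buffer(p, text, datasize, key, jobid)

def pocket_read_buffer(p, jobid, iters, text_back, datasize):
    for i in range(iters):
        key = jobid + "-" + str(i)
        pocket.get_buffer(p, key, text_back, datasize, jobid)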
Example #4
def lambda_handler(event, context):
    # create a file of size (datasize) in bytes
    iter = 50000
    datasize = 1024  #bytes
    jobid = "latency-test"
    namenode_ip = "10.1.0.10"

    file_tmp = '/tmp/file_tmp'
    with open(file_tmp, 'w') as f:
        text = 'a' * datasize
        f.write(text)

    # connect to pocket
    p = pocket.connect(namenode_ip, 9070)

    # test read/write through buffer
    dir = jobid + "1"
    pocket.create_dir(p, dir, "")
    jobid = dir

    t0 = time.time()
    pocket_write_buffer(p, jobid, iter, text, datasize)
    t1 = time.time()
    print "=========================================="
    print "Stats for " + str(iter) + " iter of " + str(
        datasize) + " bytes write_buffer:"
    throughput = iter * datasize * 8 / (t1 - t0) / 1e9
    print "throughput (Gb/s) = " + str(throughput)
    print "latency (us) = " + str((t1 - t0) / iter * 1e6)
    print "=========================================="

    text_back = " " * datasize
    t0 = time.time()
    pocket_read_buffer(p, jobid, iter, text_back, datasize)
    t1 = time.time()
    print "=========================================="
    print "Stats for " + str(iter) + " iter of " + str(
        datasize) + " bytes read_buffer:"
    throughput = iter * datasize * 8 / (t1 - t0) / 1e9
    print "throughput (Gb/s) = " + str(throughput)
    print "latency (us) = " + str((t1 - t0) / iter * 1e6)
    print "=========================================="

    t0 = time.time()
    pocket_lookup(p, jobid, iter)
    t1 = time.time()
    print "=========================================="
    print "Stats for " + str(iter) + " iter of " + str(
        datasize) + " bytes lookup (metadata RPC):"
    throughput = iter * datasize * 8 / (t1 - t0) / 1e9
    print "throughput (Gb/s) = " + str(throughput)
    print "latency (us) = " + str((t1 - t0) / iter * 1e6)
    print "=========================================="

    os.remove(file_tmp)
    return
Example #5
def lambda_handler(event, context):
    id = int(event['id'])
    n = num_workers = int(event['n'])
    type = event['type']
    iter = int(event['iter'])
    datasize = int(event['datasize'])  #bytes
    namenode_ip = "10.1.22.136"

    # create a file of size (datasize) bytes
    file_tmp = '/tmp/file_tmp'
    with open(file_tmp, 'w') as f:
        text = 'a' * datasize
        f.write(text)

    # connect to pocket
    p = pocket.connect(namenode_ip, 9070)
    jobid = ""

    if type == "write":
        t1 = time.time()
        pocket_write(p, jobid, iter, file_tmp, id)
        t2 = time.time()
    elif type == "read":
        t1 = time.time()
        pocket_read(p, jobid, iter, file_tmp, id)
        t2 = time.time()
    elif type == "lookup":
        t1 = time.time()
        pocket_lookup(p, jobid, iter, id)
        t2 = time.time()
    elif type == "writebuffer":
        t1 = time.time()
        pocket_write_buffer(p, jobid, iter, text, datasize, id)
        t2 = time.time()
    elif type == "readbuffer":
        text_back = " " * datasize
        t1 = time.time()
        pocket_read_buffer(p, jobid, iter, text_back, datasize, id)
        t2 = time.time()
    else:
        return "Illegal type"

    # upload network data
    redis_host = "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com"
    startup_nodes = [{"host": redis_host, "port": "6379"}]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes,
                                      skip_full_coverage_check=True)
    log = {'t': t2 - t1, 't_start': t1}
    log_str = pickle.dumps(log)
    key = '/throughput-log' + '-' + str(n) + '-' + str(id)
    redis_client.set(key, log_str)
    print key + " logged"

    os.remove(file_tmp)
    return type + " finished"
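
The handler reads everything it needs from the invocation event. The fields it expects, with placeholder values (actually running it also requires a reachable Pocket namenode and the Redis log cluster):

event = {
    'id': '0',              # worker index
    'n': '1',               # total number of workers
    'type': 'writebuffer',  # one of: write, read, lookup, writebuffer, readbuffer
    'iter': '100',
    'datasize': '1024',     # bytes per operation
}
# lambda_handler(event, None)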
Example #6
def handle_deregister_job(reader, writer):
  jobid_len = yield from reader.read(INT)
  jobid_len, = struct.Struct("!i").unpack(jobid_len)
  jobid = yield from reader.read(jobid_len)
  jobid, = struct.Struct("!" + str(jobid_len) + "s").unpack(jobid)
  jobid = jobid.decode('utf-8')
  
  print("------------------------- DEREGISTER JOB --------------------------------", time.time())
  # clear weight of this job
  if jobid not in job_datanode_net_allocations:
    print("ERROR: job to deregister no net allocation recognized!\n")
    print(job_datanode_net_allocations)
  for (datanodeip_port, weight) in job_datanode_net_allocations[jobid]:
    datanode_alloc.at[datanodeip_port, 'net'] -= weight
    if datanode_alloc.at[datanodeip_port, 'net'] < 0.0: # could happen due to rounding
      datanode_alloc.at[datanodeip_port, 'net'] = 0.0
  if jobid in job_datanode_dramGB_allocations:
    for (datanodeip_port, weight) in job_datanode_dramGB_allocations[jobid]:
      datanode_alloc.at[datanodeip_port, 'DRAM_GB'] -= weight
      if datanode_alloc.at[datanodeip_port, 'DRAM_GB'] < 0.0: # could happen due to rounding
        datanode_alloc.at[datanodeip_port, 'DRAM_GB'] = 0.0
  elif jobid in job_datanode_flashGB_allocations:
    for (datanodeip_port, weight) in job_datanode_flashGB_allocations[jobid]:
      datanode_alloc.at[datanodeip_port, 'Flash_GB'] -= weight
      if datanode_alloc.at[datanodeip_port, 'Flash_GB'] < 0.0: # could happen due to rounding
        datanode_alloc.at[datanodeip_port, 'Flash_GB'] = 0.0
  else:
    print("ERROR: job to deregister no capacity allocation recognized!\n")
    print(job_datanode_dramGB_allocations)
    print(job_datanode_flashGB_allocations)
    
#  for (datanodeip_port, weight) in job_table.loc[jobid,'wmask_str']:
#    datanode_alloc.at[datanodeip_port, 'net'] =  datanode_alloc.at[datanodeip_port, 'net'] - weight
  # delete job from table
  err = remove_job(jobid)
  if err == 0:
    # delete dir named jobid
    # NOTE: this is blocking but we are not yielding
    createdirsock = pocket.connect(NAMENODE_IP, NAMENODE_PORT)
    if createdirsock is None:
      return
    pocket.delete(createdirsock, None, "/" + jobid)
    #pocket.close(createdirsock)
 
  print(datanode_alloc)
 
  # reply to client with jobid int
  resp_packer = struct.Struct(RESP_STRUCT_FORMAT)
  resp = (RESP_LEN_BYTES + INT, TICKET, JOB_CMD, err, DEREGISTER_OPCODE)
  pkt = resp_packer.pack(*resp)
  writer.write(pkt)
  print("------------------------- DEREGISTERED JOB --------------------------------")
  return
Example #7
def lambda_handler(event, context):
    id = int(event['id'])
    n = num_workers = int(event['n'])
    bucket_name = str(event['bucket_name'])
    path = str(event['path'])
    n_tasks = n

    t0 = time.time()

    #[s3] read from input file: input<id>
    s3 = boto3.resource('s3')
    file_local = '/tmp/input_tmp'
    lines = []
    # read 4 100MB files
    m = 1000 / n_tasks
    for i in xrange(m):
        i += id * m
        key = path + 'input' + str(i)
        s3.Bucket(bucket_name).download_file(key, file_local)
        with open(file_local, "r") as f:
            lines += f.readlines()  # each line is a 100-byte record
        os.remove(file_local)

    t1 = time.time()

    #partition
    p_list = [[] for x in xrange(n_tasks)]  #list of n partitions  #hardcode
    for line in lines:
        key1 = ord(line[0]) - 32  # key range 32-126
        key2 = ord(line[1]) - 32
        #126-32+1=95
        #p = n/95 # 2500/(126-32+1) ~ 26.3 = 26
        #index = int(26.3*(key1+key2/95.0))
        p = n_tasks / 95.0  # total of 250 tasks
        index = int(p * (key1 + key2 / 95.0))
        p_list[index].append(line)

    t2 = time.time()

    #write to output files: shuffle<id 0> shuffle<id 1> shuffle<id num_workers-1>
    # connect to crail
    p = pocket.connect("10.1.0.10", 9070)
    #jobid = ""
    jobid = str(event['id'])

    file_tmp = file_local
    for i in xrange(n_tasks):
        with open(file_tmp, "w") as f:
            f.writelines(p_list[i])
        key = 'shuffle' + str(id) + '-' + str(i)
        src_filename = file_tmp
        dst_filename = '/' + key
        r = pocket.put(p, src_filename, dst_filename, jobid)
        if r != 0:
            raise Exception("put failed: " + dst_filename)
    t3 = time.time()

    # upload log
    startup_nodes = [{
        "host": "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com",
        "port": "6379"
    }]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes,
                                      skip_full_coverage_check=True)

    log = {'id': id, 't0': t0, 't1': t1, 't2': t2, 't3': t3}
    log_str = pickle.dumps(log)
    key = '/map-log' + '-' + '100GB' + '-' + str(n) + '-' + str(id)
    redis_client.set(key, log_str)
    print key + " logged"

    os.remove(file_tmp)

    #crail.close(socket, ticket, p)

    r = 'map finished ' + str(id)
    print r
    return r
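
A worked example of the two-character partition-index formula above, for a record whose first two bytes are "Aa" and n_tasks = 250 (illustrative values):

key1 = ord('A') - 32                              # 33
key2 = ord('a') - 32                              # 65
index = int((250 / 95.0) * (key1 + key2 / 95.0))  # 88, so this record goes to partition 88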
Example #8
    def run_command(key):
        begin_of_function = time.time()
        logger = logging.getLogger(__name__)
        print("taskId = " + str(key['taskId']))
        print("number of inputs = " + str(key['inputs']))
        print("number of output partitions = " + str(key['parts']))
        # TODO: make the parameters configurable
        taskId = key['taskId']
        # 1T
        #totalInputs = 10000
        totalInputs = key['total_input']
        inputsPerTask = key['inputs']
        taskPerRound = key['taskPerRound']
        rounds = (inputsPerTask + taskPerRound - 1) / taskPerRound
        numPartitions = key['parts']
        bucketName = key['bucket']

        jobid_int = int(key['job_number'])
        pocket_job_name = "job" + str(jobid_int)
        print("Pocket job name " + pocket_job_name)
        #jobid = pocket.register_job(pocket_job_name, capacityGB=1)
        jobid = pocket_job_name
        print("(" + str(taskId) + ")" + "Finish registering job")
        pocket_namenode = pocket.connect("10.1.0.10", 9070)

        print("(" + str(taskId) + ")" + "Connecting namenode job")
        print("See the number of partitions: " + str(numPartitions))
        min_value = struct.unpack(">I", b"\x00\x00\x00\x00")[0]
        max_value = struct.unpack(">I", b"\xff\xff\xff\xff")[0]

        rangePerPart = int((max_value - min_value) / numPartitions)
        print("here 1 " + str(rangePerPart))

        keyType = np.dtype([('key', 'S4')])
        # 4 bytes good enough for partitioning
        recordType = np.dtype([('key', 'S4'), ('value', 'S96')])

        print("here 2")
        boundaries = []
        # (numPartitions-1) boundaries
        for i in range(1, numPartitions):
            # 4 bytes unsigned integers
            b = struct.pack('>I', rangePerPart * i)
            boundaries.append(b)

        client = boto3.client('s3', 'us-west-2')

        print("(" + str(taskId) + ")" + "Connected s3 client")
        [t1, t2, t3] = [time.time()] * 3
        [read_time, work_time, write_time] = [0] * 3
        # a total of 10 threads
        read_pool = ThreadPool(1)
        number_of_clients = 1
        write_pool = ThreadPool(number_of_clients)
        clients = []
        for client_id in range(number_of_clients):
            clients.append(boto3.client('s3', 'us-west-2'))
        write_pool_handler_container = []
        print("(" + str(taskId) + ")" + "rounds" + str(rounds))
        # manager = Manager()
        rounds = int(rounds)
        for roundIdx in range(rounds):
            inputs = []

            def read_work(read_key):
                inputId = read_key['inputId']
                keyname = "input/part-" + str(inputId)
                m = hashlib.md5()
                m.update(keyname.encode('utf-8'))
                randomized_keyname = "input/" + m.hexdigest(
                )[:8] + "-part-" + str(inputId)
                print("(" + str(taskId) + ")" + "fetching " +
                      randomized_keyname)
                obj = client.get_object(Bucket=bucketName,
                                        Key=randomized_keyname)
                print("(" + str(taskId) + ")" + "fetching " +
                      randomized_keyname + " done")
                fileobj = obj['Body']
                #data = np.fromstring(fileobj.read(), dtype=recordType)
                data = np.frombuffer(fileobj.read(), dtype=recordType)
                print("(" + str(taskId) + ")" + "conversion " +
                      randomized_keyname + " done")
                print("(" + str(taskId) + ")" + "size " + randomized_keyname +
                      "  " + str(len(data)))
                inputs.append(data)

            startId = taskId * inputsPerTask + roundIdx * taskPerRound
            endId = min(
                taskId * inputsPerTask + min(
                    (roundIdx + 1) * taskPerRound, inputsPerTask), totalInputs)
            inputIds = range(startId, endId)
            if len(inputIds) == 0:
                break

            print("(" + str(taskId) + ")" + "Range for round " +
                  str(roundIdx) + " is (" + str(startId) + "," + str(endId) +
                  ")")

            read_keylist = []
            for i in range(len(inputIds)):
                read_keylist.append({'inputId': inputIds[i], 'i': i})

            # before processing, make sure all data is read
            read_pool.map(read_work, read_keylist)
            print("(" + str(taskId) + ")" + "read call done ")
            print("(" + str(taskId) + ")" + "size of inputs" +
                  str(len(inputs)))

            records = np.concatenate(inputs)
            gc.collect()

            t1 = time.time()
            print("(" + str(taskId) + ")" + 'read time ' + str(t1 - t3))
            read_time = t1 - t3

            if numPartitions == 1:
                ps = [0] * len(records)
            else:
                ps = np.searchsorted(boundaries, records['key'])
            t2 = time.time()
            print("(" + str(taskId) + ")" + 'calculating partitions time: ' +
                  str(t2 - t1))
            # before processing the newly read data, make sure outputs are all written out
            if len(write_pool_handler_container) > 0:
                write_pool_handler = write_pool_handler_container.pop()
                twait_start = time.time()
                write_pool_handler.wait()
                twait_end = time.time()
                if twait_end - twait_start > 0.5:
                    print("(" + str(taskId) + ")" + 'write time = ' +
                          str(twait_end - t3) + " slower than read " +
                          str(t1 - t3))
                else:
                    print("(" + str(taskId) + ")" + 'write time < ' +
                          str(twait_end - t3) + " faster than read " +
                          str(t1 - t3))

            t2 = time.time()
            gc.collect()
            numPartitions = int(numPartitions)
            outputs = [[] for i in range(0, numPartitions)]
            for idx, record in enumerate(records):
                outputs[ps[idx]].append(record)
            t3 = time.time()
            print("(" + str(taskId) + ")" + 'paritioning time: ' +
                  str(t3 - t2))
            work_time = t3 - t1

            def write_work_client(writer_key):
                client_id = writer_key['i']
                mapId = rounds * taskId + writer_key['roundIdx']
                key_per_client = writer_key['key-per-client']

                key_per_client = int(key_per_client)
                client_id = int(client_id)
                numPartitions = int(writer_key['num_partitions'])
                print("(" + str(taskId) + ")" + "range" + str(key_per_client) +
                      " " + str(client_id) + " " + str(numPartitions))
                for i in range(
                        key_per_client * client_id,
                        min(key_per_client * (client_id + 1), numPartitions)):
                    keyname = "shuffle-part-" + str(mapId) + "-" + str(i)
                    m = hashlib.md5()
                    m.update(keyname.encode('utf-8'))
                    randomized_keyname = "shuffle-" + m.hexdigest(
                    )[:8] + "-part-" + str(mapId) + "-" + str(i)
                    print("The name of the key to write is: " +
                          randomized_keyname)
                    # serialize the records that fell into partition i
                    bytes_body = np.asarray(outputs[i]).tobytes()
                    print("Raw partition bytes: " + str(len(bytes_body)))
                    datasize = 1700000
                    #print(body)
                    #body = bytes_body.decode('ascii')
                    body = b64encode(bytes_body).decode('utf-8')
                    body = body.ljust(datasize, '=')
                    print("Byte to be written: " + str(len(body)))

                    print("Last ten bits after padding: " + body[-10:])
                    pocket.put_buffer(pocket_namenode, body, len(body),
                                      randomized_keyname, jobid)

            writer_keylist = []
            key_per_client = (numPartitions + number_of_clients -
                              1) / number_of_clients
            number_of_clients = int(number_of_clients)
            for i in range(number_of_clients):
                writer_keylist.append({
                    'roundIdx': roundIdx,
                    'i': i,
                    'key-per-client': key_per_client,
                    'num_partitions': numPartitions
                })

            for i in range(number_of_clients):
                write_work_client(writer_keylist[i])
            #write_pool_handler = write_pool.map_async(write_work_client, writer_keylist)
            #write_pool_handler_container.append(write_pool_handler)

        #pocket.deregister_job(jobid)
        if len(write_pool_handler_container) > 0:
            write_pool_handler = write_pool_handler_container.pop()
            write_pool_handler.wait()
            twait_end = time.time()
            print("(" + str(taskId) + ")" + 'last write time = ' +
                  str(twait_end - t3))
            write_time = twait_end - t3
        read_pool.close()
        write_pool.close()
        read_pool.join()
        write_pool.join()
        end_of_function = time.time()
        print("(" + str(taskId) + ")" + "Exciting this function")
        return begin_of_function, end_of_function, read_time, work_time, write_time
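
The mapper above and the reducer in Example #10 both derive the shuffle key the same way, so no key listing has to be exchanged between stages: the logical name is hashed and the first 8 hex digits of its MD5 are folded into the stored key. Extracted as a small helper:

import hashlib

def shuffle_key(mapId, partition):
    # same construction as in write_work_client above and read_work in Example #10
    keyname = "shuffle-part-" + str(mapId) + "-" + str(partition)
    digest = hashlib.md5(keyname.encode('utf-8')).hexdigest()[:8]
    return "shuffle-" + digest + "-part-" + str(mapId) + "-" + str(partition)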
Example #9
def lambda_handler(event, context):
    rid = int(event['rid'])
    n = num_mapper = int(event['num_mapper'])

    t0 = time.time()
    # read shuffle files
    # connect to crail
    p = pocket.connect("10.1.129.91", 9070)
    jobid = ""
    #jobid = str(event['id'])

    word_count_list = []
    for i in xrange(num_mapper):
        #shuffle_file = 'shuffle/shuffle-' + str(i) + '-' + str(rid)
        #body = pickle.loads(redis_client.get(shuffle_file))
        #word_count_list += body
        key = 'shuffle-' + str(i) + '-' + str(rid)
        src_filename = '/tmp/shuffle'
        dst_filename = '/' + key
        r = pocket.get(p, dst_filename, src_filename, jobid)
        if r != 0:
            raise Exception("get failed: " + dst_filename)
        with open(src_filename, 'r') as f:
            word_count_list += json.load(f)

    os.remove(src_filename)

    t1 = time.time()
    # add up word count
    word_count = {}
    for (word, count) in word_count_list:
        if word in word_count:
            word_count[word] += count
        else:
            word_count[word] = count

    t2 = time.time()
    # write output to s3
    s3 = boto3.resource('s3')
    file_tmp = '/tmp/output'
    with open(file_tmp, "w+") as f:
        for k, v in word_count.items():
            f.write(str(k) + ' ' + str(v) + '\n')
    key = 'output/output' + str(rid)
    bucket_name = 'wordcount-yawen'
    s3.Bucket(bucket_name).upload_file(file_tmp, key)

    os.remove(file_tmp)

    t3 = time.time()

    # upload log
    startup_nodes = [{
        "host": "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com",
        "port": "6379"
    }]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes,
                                      skip_full_coverage_check=True)
    # t1-t0: intermediate read
    # t2-t1: adding word count
    # t3-t2: s3 write
    log = {'id': rid, 't0': t0, 't1': t1, 't2': t2, 't3': t3}
    key = 'reduce-log-' + str(n) + '-' + str(rid)
    redis_client.set(key, pickle.dumps(log))

    #print "reducer"+str(rid)+' finished'
    print key + ' logged'
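
The word-count merge above is a plain dict accumulation; an equivalent sketch with collections.Counter, shown only as an alternative formulation:

from collections import Counter

def merge_counts(word_count_list):
    # word_count_list holds (word, count) pairs as read from the shuffle files
    word_count = Counter()
    for word, count in word_count_list:
        word_count[word] += count
    return word_count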
Example #10
    def run_command(key):
        global concat_time
        begin_of_function = time.time()
        logger = logging.getLogger(__name__)
        print("taskId = " + str(key['taskId']))
        #print("number of works = " + str(key['works']))
        #print("number of input partitions = " + str(key['parts']))

        bucketName = key['bucket']
        taskId = key['taskId']
        rounds = key['works']
        numPartitions = int(key['parts'])

        jobid_int = int(key['job_number'])
        pocket_job_name = "job" + str(jobid_int)
#        jobid = pocket.register_job(pocket_job_name, capacityGB=1)
        jobid = pocket_job_name
        pocket_namenode = pocket.connect("10.1.0.10", 9070)

        # 10 bytes for sorting
        recordType = np.dtype([('key', 'S10'), ('value', 'S90')])

        client = boto3.client('s3', 'us-west-2')
        # rs = []
        # for hostname in key['redis'].split(";"):
        #     r1 = StrictRedis(host=hostname, port=6379, db=0).pipeline()
        #     rs.append(r1)
        # nrs = len(rs)

        [t1, t2, t3] = [time.time()] * 3
        [read_time, work_time, write_time] = [0] * 3
        # a total of 10 threads
        write_pool = ThreadPool(1)
        number_of_clients = 1
        read_pool = ThreadPool(number_of_clients)
        clients = []
        number_of_clients = int(number_of_clients)
        for client_id in range(number_of_clients):
            clients.append(boto3.client('s3', 'us-west-2'))
        write_pool_handler_container = []
        rounds = int(rounds)
        for roundIdx in range(rounds):
            inputs = []

            def read_work(reader_key):
                client_id = reader_key['client_id']
                reduceId = rounds * taskId + reader_key['roundIdx']
                key_per_client = reader_key['key-per-client']
                key_per_client = int(key_per_client)
                client_id = int(client_id)
                objs = []
                for mapId in range(key_per_client * client_id, min(key_per_client * (client_id + 1), numPartitions)):
                    # for mapId in range(1):
                    keyname = "shuffle-part-" + str(mapId) + "-" + str(reduceId)
                    m = hashlib.md5()
                    m.update(keyname.encode('utf-8'))
                    randomized_keyname = "shuffle-" + m.hexdigest()[:8] + "-part-" + str(mapId) + "-" + str(reduceId)
                    print("The name of the key to read is: " + randomized_keyname)
                    try:
                        datasize = 17000000
                        textback = " "*datasize
                        pocket.get_buffer(pocket_namenode, randomized_keyname, textback, datasize, jobid)
                        print("Successfully read")
                        #pos = textback.find('.')
                        #print("Padding position: " + str(pos))
                        original_text = b64decode(textback.encode('utf-8'))
                        print("last ten bytes after padding: " + textback[-10:])

                        objs.append(original_text)
                    except Exception:
                        print("reading error key " + randomized_keyname)
                        raise

                data = [np.fromstring(obj, dtype=recordType) for obj in objs]
                [d.sort(order='key') for d in data]
                inputs.extend(data)


            reader_keylist = []
            key_per_client = (numPartitions + number_of_clients - 1) / number_of_clients
            number_of_clients = int(number_of_clients)
            for client_id in range(number_of_clients):
                reader_keylist.append({'roundIdx': roundIdx,
                                       'client_id': client_id,
                                       'key-per-client': key_per_client})

            for i in range(number_of_clients):
                read_work(reader_keylist[i])

            t1 = time.time()
            print('read time ' + str(t1 - t3))
            read_time = t1 - t3

            if len(write_pool_handler_container) > 0:
                write_pool_handler = write_pool_handler_container.pop()
                twait_start = time.time()
                write_pool_handler.wait()
                twait_end = time.time()
                if twait_end - twait_start > 0.5:
                    print('write time = ' + str(twait_end - t3) + " slower than read " + str(t1 - t3))
                else:
                    print('write time < ' + str(twait_end - t3) + " faster than read " + str(t1 - t3))

            t2 = time.time()
            records = np.concatenate(inputs)
            gc.collect()
            concat_time = len(records)

            records.sort(order='key', kind='mergesort')

            t3 = time.time()
            print('sort time: ' + str(t3 - t2))

            work_time = t3 - t2

            def write_work(reduceId):
                keyname = "output/part-" + str(reduceId)
                m = hashlib.md5()
                m.update(keyname.encode('utf-8'))
                randomized_keyname = "output/" + m.hexdigest()[:8] + "-part-" + str(reduceId)
                body = records.tobytes()
                client.put_object(Bucket=bucketName, Key=randomized_keyname, Body=body)

            write_pool_handler = write_pool.map_async(write_work, [taskId * rounds + roundIdx])
            write_pool_handler_container.append(write_pool_handler)

        if len(write_pool_handler_container) > 0:
            write_pool_handler = write_pool_handler_container.pop()
            write_pool_handler.wait()
            twait_end = time.time()
            print('last write time = ' + str(twait_end - t3))
            write_time = twait_end - t3
        read_pool.close()
        write_pool.close()
        read_pool.join()
        write_pool.join()

        end_of_function = time.time()
        return begin_of_function, end_of_function, read_time, work_time, write_time, concat_time
Example #11
def lambda_handler(event, context):
    '''
    startup_nodes = [{"host": "rediscluster.a9ith3.clustercfg.usw2.cache.amazonaws.com", "port": "6379"}]
    client = StrictRedisCluster(startup_nodes=startup_nodes, skip_full_coverage_check=True)
    client.flushall()
    print "Redis cluster flushed"
    return 
    '''

    mid = int(event['mid'])
    n = num_reducer = int(event['num_reducer'])  #100

    t0 = time.time()
    #[s3] read from input file: input<id>
    s3 = boto3.resource('s3')
    bucket_name = 'wordcount-yawen'
    #key = 'input/input' + str(mid)
    key = 'input/input_5MB'
    file_tmp = '/tmp/input'
    s3.Bucket(bucket_name).download_file(key, file_tmp)
    with open(file_tmp, "r") as f:
        lines = f.readlines()

    t1 = time.time()

    words = []
    for line in lines:
        words += re.split(r'[^\w]+', line)
    words = list(filter(None, words))

    # count word frequency
    word_count = {}  #(word, count)
    for word in words:
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1

    # partition among reducers
    shuffle = {}  #(rid, [(word,count), ...])
    for k, v in word_count.items():
        rid = int(hashlib.md5(k).hexdigest(), 16) % num_reducer
        if rid in shuffle:
            shuffle[rid].append((k, v))
        else:
            shuffle[rid] = [(k, v)]

    t2 = time.time()

    # connect to crail
    p = pocket.connect("10.1.129.91", 9070)
    jobid = ""
    #jobid = str(event['id'])

    for k, v in shuffle.items():
        #shuffle_file = 'shuffle/shuffle-'+str(mid)+'-'+str(k)
        #value = pickle.dumps(v)
        #redis_client.set(shuffle_file, value)
        key = 'shuffle-' + str(mid) + '-' + str(k)
        src_filename = file_tmp  #'/tmp/shuffle'
        dst_filename = '/' + key
        with open(src_filename, 'w') as f:
            json.dump(v, f)
        r = pocket.put(p, src_filename, dst_filename, jobid)

    t3 = time.time()

    # upload log
    startup_nodes = [{
        "host": "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com",
        "port": "6379"
    }]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes,
                                      skip_full_coverage_check=True)
    # t1-t0: s3 read
    # t2-t1: word counting & partition
    # t3-t2: intermediate write
    log = {'id': mid, 't0': t0, 't1': t1, 't2': t2, 't3': t3}
    key = 'map-log-' + str(n) + '-' + str(mid)
    redis_client.set(key, pickle.dumps(log))

    os.remove(file_tmp)
    #print "mapper"+str(mid)+' finished'
    print key + ' logged'
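
The mapper routes each word to a reducer by hashing it, so every occurrence of a word lands on the same reducer no matter which mapper saw it. A Python 3 sketch of the same routing (the example above is Python 2 and hashes the str directly):

import hashlib

def reducer_for(word, num_reducer):
    return int(hashlib.md5(word.encode('utf-8')).hexdigest(), 16) % num_reducer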
Example #12
def lambda_handler(event, context):
    id = int(event['id'])
    n = num_workers = int(event['n'])    
    type = event['type']
    iter = int(event['iter'])
    datasize = int(event['datasize']) #bytes
    namenode_ip = "10.1.22.136"


    LOGS_PATH = 'logs-'+str(n)
    STOP = threading.Event()

    class TimeLog:
        def __init__(self, enabled=True):
            self.enabled = enabled
            self.start = time.time()
            self.prev = self.start
            self.points = []
            self.sizes = []

        def add_point(self, title):
            if not self.enabled:
                return
            now = time.time()
            self.points += [(title, now - self.prev)]
            self.prev = now

    def upload_net_bytes(rclient, rxbytes_per_s, txbytes_per_s, cpu_util, timelogger, reqid):
        #rclient = redis.Redis(host=REDIS_HOSTADDR_PRIV, port=6379, db=0)  
        netstats = LOGS_PATH + '/netstats-' + reqid 
        rclient.set(netstats, str({'lambda': reqid,
             'started': timelogger.start,
             'rx': rxbytes_per_s,
             'tx': txbytes_per_s,
             'cpu': cpu_util}).encode('utf-8'))
        print "wrote netstats"
        return
        
    STOP.set()
    timelogger = TimeLog(enabled=True)


    # create a file of size (datasize) bytes
    file_tmp = '/tmp/file_tmp'
    with open(file_tmp, 'w') as f:
        text = 'a'*datasize 
        f.write(text)

    # connect to pocket
    p = pocket.connect(namenode_ip, 9070)
    jobid = ""

    t1=time.time()
    req_per_s = pocket_lookup(p, jobid, iter, id)
    t2=time.time()
     

    # upload network data
    redis_host = "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com"
    startup_nodes = [{"host": redis_host, "port": "6379"}]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes, skip_full_coverage_check=True)
    rclient = redis_client
    STOP.clear()
    place_holder = [1]*len(req_per_s)
    upload_net_bytes(rclient, req_per_s, place_holder, place_holder, timelogger, str(id))
    print "iops stats uploaded"


    # upload network data
    redis_host = "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com"
    startup_nodes = [{"host": redis_host, "port": "6379"}]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes, skip_full_coverage_check=True)
    log = {'t':t2-t1, 't_start':t1}    
    log_str = pickle.dumps(log)
    key = '/throughput-log'+'-'+str(n)+'-'+str(id)
    redis_client.set(key, log_str)
    print key + " logged" 

    os.remove(file_tmp)
    return type+" finished"
Example #13
def handle_register_job(reader, writer):
    print(
        "-------------------------- REGISTER JOB --------------------------------",
        time.time())
    jobname_len = yield from reader.read(INT)
    jobname_len, = struct.Struct("!i").unpack(jobname_len)
    jobname = yield from reader.read(jobname_len + 3 * INT + SHORT)
    jobname, num_lambdas, jobGB, peakMbps, latency_sensitive = struct.Struct(
        "!" + str(jobname_len) + "siiih").unpack(jobname)
    jobname = jobname.decode('utf-8')

    # generate jobid
    if 'gg' in jobname:
        jobid = jobname + '-1234'
        jobid_int = 1234
    else:
        jobid_int = randint(0, 1000000)
        jobid = jobname + "-" + str(jobid_int)

    print("received hints ", jobid, num_lambdas, jobGB, peakMbps,
          latency_sensitive)
    # create dir named jobid
    # NOTE: this is blocking but we are not yielding
    createdirsock = pocket.connect(NAMENODE_IP, NAMENODE_PORT)
    if createdirsock is None:
        return
    ret = pocket.create_dir(createdirsock, None, jobid)

    if jobGB == 0 or peakMbps == 0:
        jobGB, peakMbps = compute_GB_Mbps_with_hints(num_lambdas, jobGB,
                                                     peakMbps,
                                                     latency_sensitive)
    nvm_ip = []
    for each_ip in nvm_ip:
        print("Adding this nvme server" + each_ip)
        add_nvme_datanodes(each_ip)
    # generate weightmask
    print("Generating weightmask")
    wmask, wmask_str = yield from generate_weightmask(jobid, jobGB, peakMbps,
                                                      latency_sensitive)
    # wmask = [(ioctlcmd.calculate_datanode_hash("10.1.88.82", 50030), 1)]
    # register job in table
    #  nvm_ip = ['10.1.80.147', '10.1.71.111']
    #  for each_ip in nvm_ip:
    #    print("Adding this nvme server" + each_ip)
    #    add_nvme_datanodes(each_ip)
    #wmask[1] = (ioctlcmd.calculate_datanode_hash(nvm_ip, 1234), 0.25)
    #wmask_str[1] = (nvm_ip + ':1234', 0.24999999999999994)

    err = add_job(jobid, jobGB, peakMbps, wmask, wmask_str)

    # send wmask to metadata server
    ioctlsock = yield from ioctlcmd.connect(NAMENODE_IP, NAMENODE_PORT)
    if ioctlsock is None:
        return
    yield from ioctlcmd.send_weightmask(ioctlsock, jobid, wmask)

    # reply to client with jobid int
    resp_packer = struct.Struct(RESP_STRUCT_FORMAT + "i")
    resp = (RESP_LEN_BYTES + INT, TICKET, JOB_CMD, err, REGISTER_OPCODE,
            jobid_int)
    pkt = resp_packer.pack(*resp)
    writer.write(pkt)
    print(
        "-------------------------- REGISTERED JOB --------------------------------"
    )

    return
Example #14
    def run_command(key):
        """
        keylist.append({'taskId': i,
                        'job_number': job_number,
                        'total_input': numTasks,
                        'write_element_size': write_element_size,
                        'process_time': process_time,
                        'total_time': total_time})
        """
        pywren.wrenlogging.default_config('INFO')
        begin_of_function = time.time()
        logger = logging.getLogger(__name__)
        logger.info("taskId = " + str(key['taskId']))
        taskId = key['taskId']
        jobid_int = int(key['job_number'])
        pocket_job_name = key['pocket_job_name']
        write_element_size = int(key['write_element_size'])
        process_time = int(key['process_time'])
        total_time = int(key['total_time'])
        pocket_namenode = pocket.connect("10.1.0.10", 9070)

        [read_time, work_time, write_time] = [0] * 3
        start_time = time.time()

        # a total of 10 threads
        number_of_clients = 1
        write_pool = ThreadPool(number_of_clients)

        time.sleep(process_time)


        logger.info("Process finish here: " + str(time.time()))

        def write_work_client(writer_key):
            start_time = time.time()
            client_id = int(writer_key['client_id'])
            taskID = writer_key['taskId']
            jobID = writer_key['jobid']
            datasize = writer_key['write_element_size']
                #datasize = 1310720
            total_time = writer_key['total_time']
            body = b'a' * datasize
            client_id = int(client_id)
            count = 0
            throughput_step = 1
            throughput_count = 1
            throughput_total = 0
            throughput_nops = 0
            ret = []
            while time.time() < start_time + total_time:
                count = count + 1
                keyname = str(jobID) + "-" + str(taskID) + "-" + str(count)
                m = hashlib.md5()
                m.update(keyname.encode('utf-8'))
                randomized_keyname = str(jobID) + "-" + str(taskID) + '-' + m.hexdigest()[:8] + '-' + str(count)
                #logger.info("(" + str(taskId) + ")" + "The name of the key to write is: " + randomized_keyname)
                start = time.time()
                #logger.info("[POCKET] [" + str(jobID) + "] " + str(start) + " " + str(taskID) + " " + str(len(body)) + " write " + "S")
                r = pocket.put_buffer_bytes(pocket_namenode, body, len(body), randomized_keyname, pocket_job_name)
                end = time.time()
                #logger.info("[POCKET] [" + str(jobID) + "] " + str(end) + " " + str(taskID) + " " + str(len(body)) + " write " + "E " + str(r) )
                throughput_total += end - start
                throughput_nops += 1
                if end - start_time >= throughput_count:
                    throughput = throughput_nops / throughput_total
                    ret.append((end, throughput))
                    throughput_nops = 0
                    throughput_count += throughput_step
                    throughput_total = 0
            logger.info("Write finish here: " + str(time.time()))
            return ret

        writer_keylist = []
        number_of_clients = int(number_of_clients)
        for i in range(number_of_clients):
            writer_keylist.append({'client_id': i,
                                   'taskId': taskId,
                                   'jobid': jobid_int,
                                   'write_element_size': write_element_size,
                                   'total_time': total_time})

        start_time = time.time()
        write_pool_handler_container = []
        write_pool_handler = write_pool.map_async(write_work_client, writer_keylist)
        write_pool_handler_container.append(write_pool_handler)

        if len(write_pool_handler_container) > 0:
            write_pool_handler = write_pool_handler_container.pop()
            ret = write_pool_handler.get()
            twait_end = time.time()
            write_time = twait_end - start_time
        write_pool.close()
        write_pool.join()
        end_of_function = time.time()
        return begin_of_function, end_of_function, write_time, ret
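
run_command above returns ret, a list of per-second throughput samples; each entry is (timestamp, ops completed divided by the time spent inside put_buffer_bytes during that window). A small, purely illustrative sketch to reduce the samples to a mean:

def mean_throughput(samples):
    # samples: list of (timestamp, throughput) pairs as returned in ret above
    if not samples:
        return 0.0
    return sum(tp for _, tp in samples) / len(samples)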
Example #15
    def run_command(key):
        global concat_time
        pywren.wrenlogging.default_config('INFO')
        begin_of_function = time.time()
        logger = logging.getLogger(__name__)
        logger.info("taskId = " + str(key['taskId']))
        #logger.info("number of works = " + str(key['works']))
        #logger.info("number of input partitions = " + str(key['parts']))

        bucketName = key['bucket']
        taskId = key['taskId']
        rounds = key['works']
        numPartitions = int(key['parts'])

        jobid_int = int(key['job_number'])
        pocket_job_name = "job" + str(jobid_int)
#        jobid = pocket.register_job(pocket_job_name, capacityGB=1)
        jobid = pocket_job_name
        pocket_namenode = pocket.connect("10.1.0.10", 9070)

        # 10 bytes for sorting
        recordType = np.dtype([('key', 'S10'), ('value', 'S90')])

        client = boto3.client('s3', 'us-west-2')

        [t1, t2, t3] = [time.time()] * 3
        [read_time, work_time, write_time] = [0] * 3
        # a total of 10 threads
        write_pool = ThreadPool(1)
        number_of_clients = 1
        read_pool = ThreadPool(number_of_clients)
        clients = []
        number_of_clients = int(number_of_clients)
        for client_id in range(number_of_clients):
            clients.append(boto3.client('s3', 'us-west-2'))
        write_pool_handler_container = []
        rounds = int(rounds)
        for roundIdx in range(rounds):
            inputs = []

            def read_work(reader_key):
                client_id = reader_key['client_id']
                reduceId = rounds * taskId + reader_key['roundIdx']
                key_per_client = reader_key['key-per-client']
                key_per_client = int(key_per_client)
                client_id = int(client_id)
                taskID = int(reader_key['taskId'])
                jobID = int(reader_key['jobid'])
                objs = []
                for mapId in range(key_per_client * client_id, min(key_per_client * (client_id + 1), numPartitions)):
                    # for mapId in range(1):
                    keyname = "shuffle-part-" + str(mapId) + "-" + str(reduceId)
                    m = hashlib.md5()
                    m.update(keyname.encode('utf-8'))
                    randomized_keyname = "shuffle-" + m.hexdigest()[:8] + "-part-" + str(mapId) + "-" + str(reduceId)
                    #logger.info("The name of the key to read is: " + randomized_keyname)
                    try:
                        # FIXME Need to set this stuff
                        #datasize = 1300 * 1000
                        datasize = 1310720
                        logger.info("[POCKET] [" + str(jobID) +"] " + str(time.time_ns()) + " " + str(taskID) + " 0 read " + "S")
                        r = pocket.get_buffer_bytes(pocket_namenode, randomized_keyname, datasize, jobid, DELETE_AFTER_READ=False)
                        end_time = time.time_ns()
                        #logger.info("Successfully read")
                        #pos = textback.find('.')
                        #logger.info("Padding position: " + str(pos))
                        #logger.info("last ten bytes after padding: " + textback[-10:])
                        #original_text = b64decode(textback.encode('utf-8'))
                        logger.info("[POCKET] [" + str(jobID) + "] " + str(end_time) + " " + str(taskID) + " " + str(len(r)) + " read " + "E ")
                        #logger.info("Size of original text: " + str(len(original_text)))

                        objs.append(r[:1300000])
                    except Exception:
                        logger.info("reading error key " + randomized_keyname)
                        raise

                data = [np.fromstring(obj, dtype=recordType) for obj in objs]
                [d.sort(order='key') for d in data]
                inputs.extend(data)


            reader_keylist = []
            key_per_client = (numPartitions + number_of_clients - 1) / number_of_clients
            number_of_clients = int(number_of_clients)
            for client_id in range(number_of_clients):
                reader_keylist.append({'roundIdx': roundIdx,
                                       'client_id': client_id,
                                       'key-per-client': key_per_client,
                                       'taskId': taskId,
                                       'jobid': jobid_int})

            read_pool.map(read_work, reader_keylist)
            t1 = time.time()
            #logger.info('read time ' + str(t1 - t3))
            read_time = t1 - t3

            if len(write_pool_handler_container) > 0:
                write_pool_handler = write_pool_handler_container.pop()
                twait_start = time.time()
                write_pool_handler.wait()
                twait_end = time.time()
                if twait_end - twait_start > 0.5:
                    logger.info('write time = ' + str(twait_end - t3) + " slower than read " + str(t1 - t3))
                else:
                    logger.info('write time < ' + str(twait_end - t3) + " faster than read " + str(t1 - t3))

            t2 = time.time()
            records = np.concatenate(inputs)
            gc.collect()
            concat_time = len(records)

            records.sort(order='key', kind='mergesort')

            t3 = time.time()
            #logger.info('sort time: ' + str(t3 - t2))

            work_time = t3 - t2

            def write_work(reduceId):
                keyname = "output/part-" + str(reduceId)
                m = hashlib.md5()
                m.update(keyname.encode('utf-8'))
                randomized_keyname = "output/" + m.hexdigest()[:8] + "-part-" + str(reduceId)
                body = records.tobytes()
                client.put_object(Bucket=bucketName, Key=randomized_keyname, Body=body)

            write_pool_handler = write_pool.map_async(write_work, [taskId * rounds + roundIdx])
            write_pool_handler_container.append(write_pool_handler)

        if len(write_pool_handler_container) > 0:
            write_pool_handler = write_pool_handler_container.pop()
            write_pool_handler.wait()
            twait_end = time.time()
            #logger.info('last write time = ' + str(twait_end - t3))
            write_time = twait_end - t3
        read_pool.close()
        write_pool.close()
        read_pool.join()
        write_pool.join()

        end_of_function = time.time()
        return begin_of_function, end_of_function, read_time, work_time, write_time, concat_time
Example #16
def lambda_handler(event, context):
    id = int(event['id'])
    n = num_workers = int(event['n'])
    bucket_name = str(event['bucket_name'])
    n_tasks = n

    log_file = []

    t0 = time.time()

    # connect to crail
    #p = pocket.connect("10.1.12.156", 9070)
    p = pocket.connect("10.1.0.10", 9070)
    print "connected"

    jobid = ""
    #jobid = str(event['id'])

    #read from input file: shuffle<0 id> shuffle<1 id> ... shuffle<id num_workers-1>
    #'''
    file_tmp = '/tmp/tmp'
    all_lines = []
    for i in xrange(n_tasks):
        #key = 'shuffle' + str(id) +'-'+ str(i) # wrong one just for testing
        key = 'shuffle' + str(i) + '-' + str(id)

        src_filename = key
        dst_filename = file_tmp
        #print src_filename
        r = pocket.get(p, src_filename, dst_filename, jobid)
        if r != 0:
            raise Exception("get failed: " + src_filename)
        #log_file.append((key, time.time()))
        with open(dst_filename, "r") as f:
            all_lines += f.readlines()
        #print src_filename + " read success"
    os.remove(file_tmp)
    #'''

    t1 = time.time()
    #print "read all from pocket"

    #merge & sort
    for i in xrange(len(all_lines)):
        all_lines[i] = (all_lines[i][:10], all_lines[i][12:])
    all_lines.sort(key=lambda x: x[0])

    for i in xrange(len(all_lines)):
        all_lines[i] = all_lines[i][0] + "  " + all_lines[i][1]
    t2 = time.time()

    #[s3] write to output file: output<id>
    s3 = boto3.resource('s3')
    file_name = 'output/sorted_output'
    m = 1000 / n_tasks
    size = len(all_lines) / m
    for i in xrange(m):
        with open(file_tmp, "w+") as f:
            start = size * i
            end = start + size
            f.writelines(all_lines[start:end])
            f.seek(0)
            body = f.read()
        key = file_name + str(id * m + i)
        s3.Bucket(bucket_name).upload_file(file_tmp, key)

        os.remove(file_tmp)
    t3 = time.time()

    # upload log
    startup_nodes = [{
        "host": "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com",
        "port": "6379"
    }]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes,
                                      skip_full_coverage_check=True)

    log = {'id': id, 't0': t0, 't1': t1, 't2': t2, 't3': t3}
    log_str = pickle.dumps(log)
    key = '/reduce-log' + '-' + '100GB' + '-' + str(n) + '-' + str(id)
    redis_client.set(key, log_str)
    print key + " logged"
    '''
    log_file_str = pickle.dumps(log_file)
    key = '/reduce-log-time'+'-'+'100GB'+'-'+str(n)+'-'+str(id)
    redis_client.set(key, log_file_str)
    print key + " logged" 
    '''
    #crail.close(socket, ticket, p)

    r = 'reduce finished ' + str(id)
    print r
    return r
Example #17
def lambda_handler(event, context):
    # create a file of size (datasize) in bytes
    iter = 10000
    datasize = 32  #bytes

    file_tmp = '/tmp/file_tmp'
    with open(file_tmp, 'w') as f:
        text = 'a' * datasize
        f.write(text)

    # write to crail
    p = pocket.connect("10.1.129.91", 9070)
    jobid = 'lambda3'
    print pocket.create_dir(p, 'new_dir', jobid)
    print pocket.put(p, file_tmp, "test", jobid)
    return  # everything below this early return is unreachable as written

    r = pocket.register_job(p, jobid)  # returns 0 on success
    if r != 0:
        print "registration failed"
        return

    time_list = []
    t0 = time.time()
    pocket_write(p, jobid, iter, file_tmp)
    t1 = time.time()

    print "=========================================="
    #print np.percentile(time_list, 90)
    print "Stats for " + str(iter) + " iter of " + str(
        datasize) + " bytes write:"
    throughput = iter * datasize * 8 / (t1 - t0) / 1e9
    print "throughput (Gb/s) = " + str(throughput)
    print "time (s) = " + str(t1 - t0)
    print "latency (us) = " + str((t1 - t0) / iter * 1e6)
    print "=========================================="

    t0 = time.time()
    pocket_read(p, jobid, iter, file_tmp)
    t1 = time.time()

    print "=========================================="
    print "Stats for " + str(iter) + " iter of " + str(
        datasize) + " bytes read:"
    throughput = iter * datasize * 8 / (t1 - t0) / 1e9
    print "throughput (Gb/s) = " + str(throughput)
    print "time (s) = " + str(t1 - t0)
    print "latency (us) = " + str((t1 - t0) / iter * 1e6)
    print "=========================================="

    t0 = time.time()
    pocket_lookup(p, jobid, iter)
    t1 = time.time()

    print "=========================================="
    print "Stats for " + str(iter) + " iter of " + str(
        datasize) + " bytes lookup:"
    throughput = iter * datasize * 8 / (t1 - t0) / 1e9
    print "throughput (Gb/s) = " + str(throughput)
    print "time (s) = " + str(t1 - t0)
    print "latency (us) = " + str((t1 - t0) / iter * 1e6)
    print "=========================================="

    os.remove(file_tmp)
    return
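
Taken together, these examples exercise a small client surface: connect, create_dir, put/get for files, put_buffer/get_buffer for in-memory data, plus the job register/deregister RPCs. A minimal end-to-end sketch using only the file calls, with the namenode address from Example #17 and placeholder paths:

import pocket  # client module, imported as in the examples above

p = pocket.connect("10.1.129.91", 9070)
jobid = 'lambda3'
pocket.create_dir(p, 'new_dir', jobid)          # directory for this job's data
pocket.put(p, '/tmp/file_tmp', 'test', jobid)   # upload a local file under key "test"
pocket.get(p, 'test', '/tmp/file_back', jobid)  # read it back to a local path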