Example #1
def generate_weightmask(jobid, jobGB, jobMbps, latency_sensitive):
    print("generate weightmask for ", jobid, jobGB, jobMbps, latency_sensitive)
    wmask = []
    # Step 1: determine if capacity or throughput bound
    if latency_sensitive:
        num_nodes_for_capacity = math.ceil(jobGB / DRAM_NODE_GB)
    else:
        num_nodes_for_capacity = math.ceil(jobGB / FLASH_NODE_GB)

    num_nodes_for_throughput = math.ceil(jobMbps / NODE_Mbps)
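    # Illustration with hypothetical constants: if FLASH_NODE_GB = 2000 and NODE_Mbps = 8000,
    # a non-latency-sensitive job with jobGB = 500 and jobMbps = 24000 needs
    # ceil(500/2000) = 1 node for capacity but ceil(24000/8000) = 3 nodes for throughput,
    # so it is classified as throughput-bound below.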

    if num_nodes_for_throughput >= num_nodes_for_capacity:
        print("jobid {} is throughput-bound".format(jobid))
        throughput_bound = 1
    else:
        print("jobid {} is capacity-bound".format(jobid))
        throughput_bound = 0

    # Step 2: check available resources in cluster
    # If throughput bound, will allocate nodes based on CPU and network demand
    if throughput_bound:
        # find all nodes that have spare Mbps
        #print(datanode_alloc)
        spare_throughput = datanode_alloc.loc[(datanode_alloc['net'] < 1.0) & (
            datanode_alloc['blacklisted'] == 0)]
        candidate_nodes_net = spare_throughput.sort_values(
            by='net', ascending=False).loc[:, 'net']
        spare_net_weight_alloc = 0
        job_net_weight_req = jobMbps * 1.0 / NODE_Mbps
        #print(candidate_nodes_net)

        print("job net weight req {}, this is {} Mbps".format(
            job_net_weight_req, jobMbps))
        # smallest fit first algorithm: fill in smallest gap first
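        # (candidates are sorted by 'net' descending, so the node with the least spare
        # throughput is considered first; e.g., with spare fractions 0.2 and 0.7 and a
        # request of 0.6, the 0.2 is consumed fully and 0.4 is taken from the 0.7 node)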
        for node in candidate_nodes_net.index:
            net = candidate_nodes_net[node]
            #print("net for node {} is {}".format(node, net))
            if net == 1.0:
                continue
            # TODO: before deciding to use a node, also must check that weight*capacity satisfies the capacity constraint!
            if job_net_weight_req - spare_net_weight_alloc >= 1 - net:
                spare_net_weight_alloc += 1 - net
                print(
                    "setting datanode_alloc to 1 for datanode {}".format(node))
                datanode_alloc.at[node, 'net'] = 1.0
                wmask.append((node, 1 - net))
            elif job_net_weight_req - spare_net_weight_alloc < 1 - net:
                print("setting datanode_alloc to {} for datanode {}".format(
                    net + (job_net_weight_req - spare_net_weight_alloc), node))
                datanode_alloc.at[node,
                                  'net'] = float(net +
                                                 (job_net_weight_req -
                                                  spare_net_weight_alloc))
                wmask.append(
                    (node, job_net_weight_req - spare_net_weight_alloc))
                spare_net_weight_alloc += job_net_weight_req - spare_net_weight_alloc

            if spare_net_weight_alloc == job_net_weight_req:
                break

            if spare_net_weight_alloc > job_net_weight_req:
                print(
                    "ERROR: shouldn't be allocating more than job needs! something went wrong..."
                )
                break

        if spare_net_weight_alloc == job_net_weight_req:
            print("Satisfied job without needing to launch new nodes :)")
            #print(datanode_alloc)
        else:
            datanode_alloc_prelaunch = datanode_alloc.copy()
            extra_nodes_needed = (job_net_weight_req - spare_net_weight_alloc)
            last_weight = extra_nodes_needed - int(extra_nodes_needed)
            if last_weight == 0:
                new_node_weights = [
                    1.0 for i in range(0, int(extra_nodes_needed))
                ]
            else:
                new_node_weights = [
                    1.0 for i in range(0, int(extra_nodes_needed))
                ]
                new_node_weights.append(last_weight)
            parallelism = math.ceil(extra_nodes_needed)
            global waitnodes
            waitnodes = parallelism
            print("KUBERNETES: launch {} extra nodes, wait for them to come up and assing proper weights {}"\
                    .format(parallelism, new_node_weights))
            # decide which kind of nodes to launch
            if latency_sensitive and jobGB <= parallelism * DRAM_NODE_GB:
                yield from launch_dram_datanode(parallelism)
            elif latency_sensitive:  # but capacity doesn't fit in parallelism*DRAM nodes
                print(
                    "app is latency sensitive but high capacity, so we put {} in DRAM, rest in flash"
                    .format(FRAC_DRAM_ALLOCATION))
                print(
                    "WARNING: check logic. we should not reach this case since then app would be capacity bound!"
                )
                num_dram_nodes = int(
                    (jobGB * FRAC_DRAM_ALLOCATION) / DRAM_NODE_GB)
                num_flash_nodes = parallelism - num_dram_nodes
                yield from launch_dram_datanode(num_dram_nodes)
                yield from launch_flash_datanode(num_flash_nodes)
            else:
                yield from launch_flash_datanode(parallelism)

            # wait for new nodes to start sending stats and add themselves to the datanode_alloc table,
            # then assign them the proper weights
            new_datanodes = yield from wait_for_datanodes_to_join(
                datanode_alloc_prelaunch, parallelism)
            print("datanodes {} have joined!".format(new_datanodes))
            i = 0
            for n in new_datanodes:
                wmask.append((n, new_node_weights[i]))
                i = i + 1
            print("wmask:", wmask)

    # TODO: If capacity bound, will allocate nodes based on DRAM or Flash capacity (depending on latency sensitivity)
    else:
        # find all nodes that have spare capacity
        print("NOTICE: the app is capacity-bound. \
           TODO: need to implement weightmask generation for this case. Not yet supported.\n"
              )
        # should be similar to sizing based on capacity
        # but need to decide whether to use DRAM or Flash capacity based on latency sensitivity
        # skipping this for now since all our apps are throughput-bound

    # convert weightmask to proper format
    job_wmask = []
    for (datanodeip_port, weight) in wmask:
        datanode_ip = datanodeip_port.split(":")[0]
        datanode_port = int(datanodeip_port.split(":")[1])
        datanode_hash = ioctlcmd.calculate_datanode_hash(
            datanode_ip, datanode_port)
        job_wmask.append((datanode_hash, float("{0:.2f}".format(weight))))
        datanode_alloc.at[datanodeip_port, 'net'] = weight
        datanode_alloc.at[datanodeip_port, 'reserved'] = 0

    print("job_wmask is:", wmask)
    print(datanode_alloc)
    return job_wmask, wmask
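A minimal sketch of how this generator-style function could be driven. The constants, the datanode_alloc layout, the ioctlcmd stub, and the run() helper below are assumptions made purely so the example is self-contained; the real controller defines these elsewhere and yields to its event loop instead. In this scenario the existing nodes have enough spare throughput, so the launch_*/wait_for_* coroutines are never reached.

import math
import pandas as pd

# Hypothetical constants (illustrative values only)
DRAM_NODE_GB = 60
FLASH_NODE_GB = 2000
NODE_Mbps = 8000

class _IoctlStub:
    # stand-in for the real ioctlcmd module, just so the hash call works in this sketch
    @staticmethod
    def calculate_datanode_hash(ip, port):
        return hash((ip, port)) & 0xffffffff

ioctlcmd = _IoctlStub()

# two datanodes, one mostly busy (net = 0.75) and one half idle (net = 0.5)
datanode_alloc = pd.DataFrame(
    {'net': [0.75, 0.5], 'blacklisted': [0, 0], 'reserved': [0, 0]},
    index=['10.0.0.1:50030', '10.0.0.2:50030'])

def run(gen):
    # drive the generator to completion and return its value
    try:
        while True:
            next(gen)
    except StopIteration as e:
        return e.value

# 100 GB / 6000 Mbps, not latency sensitive -> throughput-bound, needs 0.75 of one node's Mbps
job_wmask, wmask = run(generate_weightmask(42, 100, 6000, False))
print(wmask)   # weights 0.25 and 0.5 of the two nodes' throughput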
Example #2
def generate_weightmask(jobid, jobGB, jobMbps, latency_sensitive):
  print("generate weightmask for ", jobid, jobGB, jobMbps, latency_sensitive)
  wmask = []
  # Step 1: determine if capacity or throughput bound
  if latency_sensitive:  
    num_nodes_for_capacity = jobGB / DRAM_NODE_GB
  else:
    num_nodes_for_capacity = jobGB / FLASH_NODE_GB
   
  num_nodes_for_throughput = jobMbps / NODE_Mbps

  if num_nodes_for_throughput >= num_nodes_for_capacity:
    print("jobid {} is throughput-bound".format(jobid))
    throughput_bound = 1
  else:
    print("jobid {} is capacity-bound".format(jobid))
    throughput_bound = 0
  
  # Step 2: check available resources in cluster
  # Note: only look at nodes that satisfy the capacity requirement on the right storage media
  #       for now, assume we use DRAM if latency sensitive, otherwise NVMe Flash
  #       more generally, for non latency sensitive jobs, find cheapest storage tier
  #       that satisfies capacity and throughput requirements of the job
  if latency_sensitive:
    jobGB_weight_req = jobGB * 1.0 / DRAM_NODE_GB
    NODE_CAPACITY = DRAM_NODE_GB
    spare_capacity = datanode_alloc.loc[(datanode_alloc['DRAM_GB'] < 1.0) & \
					(datanode_alloc['DRAM_GB'] >= 0.0) & \
					(datanode_alloc['blacklisted'] == 0)]
    candidate_nodes_capacity = spare_capacity.sort_values(by='DRAM_GB', ascending=False).loc[:, 'DRAM_GB']
  else:
    jobGB_weight_req = jobGB * 1.0 / FLASH_NODE_GB
    NODE_CAPACITY = FLASH_NODE_GB
    spare_capacity = datanode_alloc.loc[(datanode_alloc['Flash_GB'] < 1.0) & \
					(datanode_alloc['Flash_GB'] >= 0.0) & \
					(datanode_alloc['blacklisted'] == 0)]
    candidate_nodes_capacity = spare_capacity.sort_values(by='Flash_GB', ascending=False).loc[:, 'Flash_GB']

  # find all nodes that have spare Mbps 
  spare_throughput = datanode_alloc.loc[(datanode_alloc['net'] < 1.0) & (datanode_alloc['blacklisted'] == 0)]
  candidate_nodes_net = spare_throughput.sort_values(by='net', ascending=False).loc[:, 'net']

  #print("Candidate nodes net: ", candidate_nodes_net)
  #print("Candidate nodes capacity: ", candidate_nodes_capacity)

  # If throughput bound, will allocate nodes based on CPU and network demand 
  if throughput_bound:
    job_net_weight_allocated = 0  # as a fraction of NODE_Mbps
    job_net_weight_req = jobMbps * 1.0 / NODE_Mbps

    # smallest fit first algorithm: fill in smallest gap first
    # note: first set job weightmask such that each weight represents
    #       the fraction of that node's throughput that is allocated to this job
    #       later, we will scale job weights based on fraction of job's data/throughput
    #       that should go to each node 
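    # (e.g., a node with net = 0.85 can contribute a raw weight of at most 0.15 here;
    #  these raw weights are converted to per-job fractions in the final step below)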
    for node in candidate_nodes_net.index: 
      if node not in candidate_nodes_capacity.index:
        #print("Node candidate ", node, " with spare net does not have sufficient storage tier capacity required, so skip it.\n")
        continue
      net = candidate_nodes_net[node]
      capacity = candidate_nodes_capacity[node]
      #print("net for node {} is {}".format(node, net))
      if net == 1.0:
        continue
      if job_net_weight_req - job_net_weight_allocated >= 1 - net: 
        node_net_alloc = 1 - net
        corresponding_capacity_alloc = node_net_alloc * NODE_Mbps * jobGB / (jobMbps * NODE_CAPACITY)
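        # derivation: node_net_alloc * NODE_Mbps is the Mbps this node would serve for the job;
        # dividing by jobMbps gives the fraction of the job's (uniformly accessed) data placed
        # here, times jobGB gives GB, and dividing by NODE_CAPACITY gives a node capacity fraction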
        # check if enough capacity on this node (assume uniform data access, so all data is equally hot)
        capacity_avail = 1 - capacity
        if capacity_avail < corresponding_capacity_alloc:
          corresponding_net_alloc = capacity_avail * NODE_CAPACITY * jobMbps / (jobGB * NODE_Mbps)
          job_net_weight_allocated += corresponding_net_alloc
          wmask.append((node, corresponding_net_alloc)) 
          datanode_alloc.at[node,'net'] += corresponding_net_alloc 
          incr_datanode_alloc_capacity(node, capacity_avail, latency_sensitive)
        else:
          job_net_weight_allocated += node_net_alloc
          wmask.append((node, node_net_alloc))
          datanode_alloc.at[node,'net'] = 1.0
          incr_datanode_alloc_capacity(node, corresponding_capacity_alloc, latency_sensitive)
      elif job_net_weight_req - job_net_weight_allocated < 1 - net:
        node_net_alloc = (job_net_weight_req - job_net_weight_allocated)
        corresponding_capacity_alloc = node_net_alloc * NODE_Mbps * jobGB / (jobMbps * NODE_CAPACITY)
        capacity_avail = 1 - capacity
        if capacity_avail < corresponding_capacity_alloc:
          corresponding_net_alloc = capacity_avail * NODE_CAPACITY * jobMbps / (jobGB * NODE_Mbps)
          job_net_weight_allocated += corresponding_net_alloc
          wmask.append((node, corresponding_net_alloc)) 
          datanode_alloc.at[node,'net'] += corresponding_net_alloc
          incr_datanode_alloc_capacity(node, capacity_avail, latency_sensitive)
        else:
          job_net_weight_allocated += node_net_alloc 
          wmask.append((node, node_net_alloc))
          datanode_alloc.at[node,'net'] += node_net_alloc
          incr_datanode_alloc_capacity(node, corresponding_capacity_alloc, latency_sensitive)

      if job_net_weight_allocated == job_net_weight_req:
        break
    
      if job_net_weight_allocated > job_net_weight_req:
        print("ERROR: shouldn't be allocating more than job needs! something went wrong...")
        break
    
    if job_net_weight_allocated == job_net_weight_req:
      print("Satisfied job without needing to launch new nodes :)")
    else:
      datanode_alloc_prelaunch = datanode_alloc.copy()
      extra_nodes_needed = (job_net_weight_req - job_net_weight_allocated)
      last_weight = extra_nodes_needed - int(extra_nodes_needed)
      if last_weight == 0:
        new_node_weights = [1.0 for i in range(0, int(extra_nodes_needed))]
      else:
        new_node_weights = [1.0 for i in range(0, int(extra_nodes_needed))]
        new_node_weights.append(last_weight)
      parallelism = math.ceil(extra_nodes_needed)
      global waitnodes
      waitnodes = parallelism
      print("KUBERNETES: launch {} extra nodes, wait for them to come up and assing proper weights {}"\
              .format(parallelism, new_node_weights))
      # decide which kind of nodes to launch
      if latency_sensitive: #and jobGB <= parallelism*DRAM_NODE_GB:
        yield from launch_dram_datanode(parallelism)
      #elif latency_sensitive: # but capacity doesn't fit in parallelism*DRAM nodes
      #  print("app is latency sensitive but high capacity, so we put {} in DRAM, rest in flash".format(FRAC_DRAM_ALLOCATION))
      #  num_dram_nodes = int((jobGB * FRAC_DRAM_ALLOCATION)/ DRAM_NODE_GB)
      #  num_flash_nodes = parallelism - num_dram_nodes
      #  yield from launch_dram_datanode(num_dram_nodes)
      #  yield from launch_flash_datanode(num_flash_nodes)
      else:
        yield from launch_flash_datanode(parallelism)
      
      # wait for new nodes to start sending stats and add themselves to the datanode_alloc table,
      # then assign them the proper weights
      new_datanodes = yield from wait_for_datanodes_to_join(datanode_alloc_prelaunch, parallelism)
      print("datanodes {} have joined!".format(new_datanodes))
      i = 0
      for n in new_datanodes:
        wmask.append((n, new_node_weights[i]))
        i = i + 1
      print("wmask:", wmask)
 
  else: # capacity-bound
    jobGB_weight_allocated = 0  # as a fraction of NODE_GB

    # smallest fit first algorithm: fill in smallest gap first
    # note: first set job weightmask such that each weight represents
    #       the fraction of that node's capacity that is allocated to this job
    #       later, we will scale job weights based on fraction of job's data
    #       that should go to each node 
    for node in candidate_nodes_capacity.index: 
      if node not in candidate_nodes_net.index:
        #print("Node candidate ", node, " with spare capacity does not have sufficient throughput, so skip it.\n")
        continue
      net = candidate_nodes_net[node]
      capacity = candidate_nodes_capacity[node]
      if jobGB_weight_req - jobGB_weight_allocated >= 1 - capacity: 
        nodeGB_alloc = 1 - capacity
        corresponding_net_alloc = nodeGB_alloc * NODE_CAPACITY * jobMbps / (jobGB * NODE_Mbps)
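        # inverse of the throughput-bound conversion: the GB placed on this node implies a
        # matching share of the job's Mbps, expressed here as a fraction of NODE_Mbps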
        # check if enough throughput on this node (assume uniform data access, so all data is equally hot)
        net_avail = 1 - net
        if net_avail < corresponding_net_alloc:
          corresponding_capacity_alloc = net_avail * NODE_Mbps * jobGB / (jobMbps * NODE_CAPACITY)
          jobGB_weight_allocated += corresponding_capacity_alloc
          wmask.append((node, corresponding_capacity_alloc)) 
          datanode_alloc.at[node,'net'] += net_avail
          incr_datanode_alloc_capacity(node, corresponding_capacity_alloc, latency_sensitive) 
        else:
          jobGB_weight_allocated += nodeGB_alloc
          wmask.append((node, nodeGB_alloc)) 
          datanode_alloc.at[node,'net'] += corresponding_net_alloc
          incr_datanode_alloc_capacity(node, nodeGB_alloc, latency_sensitive)
      elif jobGB_weight_req - jobGB_weight_allocated < 1 - capacity:
        nodeGB_alloc = (jobGB_weight_req - jobGB_weight_allocated)
        corresponding_net_alloc = nodeGB_alloc * NODE_CAPACITY * jobMbps / (jobGB * NODE_Mbps)
        net_avail = 1 - net
        if net_avail < corresponding_net_alloc:
          corresponding_capacity_alloc = net_avail * NODE_Mbps * jobGB / (jobMbps * NODE_CAPACITY)
          jobGB_weight_allocated += corresponding_capacity_alloc
          wmask.append((node, corresponding_capacity_alloc)) 
          datanode_alloc.at[node,'net'] += net_avail 
          incr_datanode_alloc_capacity(node, corresponding_capacity_alloc, latency_sensitive)
        else:
          jobGB_weight_allocated += nodeGB_alloc 
          wmask.append((node, nodeGB_alloc))
          datanode_alloc.at[node,'net'] += corresponding_net_alloc
          incr_datanode_alloc_capacity(node, nodeGB_alloc, latency_sensitive)

      if jobGB_weight_allocated == jobGB_weight_req:
        break
    
      if jobGB_weight_allocated > jobGB_weight_req:
        print("ERROR: shouldn't be allocating more than job needs! something went wrong...")
        break
    
    if jobGB_weight_allocated == jobGB_weight_req:
      print("Satisfied job without needing to launch new nodes :)")
    else:
      datanode_alloc_prelaunch = datanode_alloc.copy()
      extra_nodes_needed = (jobGB_weight_req - jobGB_weight_allocated)
      last_weight = extra_nodes_needed - int(extra_nodes_needed)
      if last_weight == 0:
        new_node_weights = [1.0 for i in range(0, int(extra_nodes_needed))]
      else:
        new_node_weights = [1.0 for i in range(0, int(extra_nodes_needed))]
        new_node_weights.append(last_weight)
      parallelism = math.ceil(extra_nodes_needed)
      waitnodes = parallelism
      print("KUBERNETES: launch {} extra nodes, wait for them to come up and assing proper weights {}"\
              .format(parallelism, new_node_weights))
      if latency_sensitive:
        yield from launch_dram_datanode(parallelism)
      else:
        yield from launch_flash_datanode(parallelism)
      
      # wait for new nodes to start sending stats and add themselves to the datanode_alloc table,
      # then assign them the proper weights
      new_datanodes = yield from wait_for_datanodes_to_join(datanode_alloc_prelaunch, parallelism)
      print("datanodes {} have joined!".format(new_datanodes))
      i = 0
      for n in new_datanodes:
        wmask.append((n, new_node_weights[i]))
        i = i + 1
      print("wmask:", wmask)


  # convert weightmask to proper format
  # current job wmask contains weights in relation to NODE_Mbps
  # but now we need to make this in relation to the job requirements
  # e.g., if a throughput-bound job needs 7 Gb/s and all assigned to 1 node
  #       currently, wmask has a weight of 0.85, which is the fraction of NODE_Mbps the job consumes
  #       but wmask for the job should be 1 because all the data is going to one node.
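  #       (in that example weight_sum is 0.85, so the normalized per-job weight is 0.85/0.85 = 1.0)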
  # first save the datanode allocation weights to make it easier to deregister job later
  if throughput_bound:
    job_datanode_net_allocations[jobid] = wmask.copy() 
    corresponding_capacity_alloc_wmask = [ (x[0], float(x[1] * NODE_Mbps * jobGB / (jobMbps * NODE_CAPACITY))) for x in wmask]
    if latency_sensitive:
      job_datanode_dramGB_allocations[jobid] = corresponding_capacity_alloc_wmask 
    else:
      job_datanode_flashGB_allocations[jobid] = corresponding_capacity_alloc_wmask 

  else:
    corresponding_net_alloc_wmask = [ (x[0], float(x[1] * NODE_CAPACITY * jobMbps / (jobGB * NODE_Mbps))) for x in wmask]
    job_datanode_net_allocations[jobid] = corresponding_net_alloc_wmask 
    if latency_sensitive:
      job_datanode_dramGB_allocations[jobid] = wmask.copy() 
    else:
      job_datanode_flashGB_allocations[jobid] = wmask.copy() 

  job_wmask = []
  weight_sum = sum([x[1] for x in wmask]) 
  for idx, (datanodeip_port, weight) in enumerate(wmask):
    jobweight = weight / weight_sum # this is now the weight in relation to the total job req
    #### Only need to update datanode weights for new datanodes which have weight 0
    if datanode_alloc.at[datanodeip_port,'net'] == 0.0:
      if throughput_bound:
        datanode_alloc.at[datanodeip_port,'net'] += weight
        if latency_sensitive:
          datanode_alloc.at[datanodeip_port,'DRAM_GB'] += jobweight * jobGB / DRAM_NODE_GB
        else:
          datanode_alloc.at[datanodeip_port,'Flash_GB'] += jobweight * jobGB / FLASH_NODE_GB
      else: # capacity-bound
        datanode_alloc.at[datanodeip_port,'net'] += jobweight * jobMbps / NODE_Mbps
        if latency_sensitive:
          datanode_alloc.at[datanodeip_port,'DRAM_GB'] += weight
        else:
          datanode_alloc.at[datanodeip_port,'Flash_GB'] += weight
    
    datanode_alloc.at[datanodeip_port,'reserved'] = 0
    wmask[idx] = (datanodeip_port, jobweight)
    datanode_ip = datanodeip_port.split(":")[0]
    datanode_port = int(datanodeip_port.split(":")[1])
    datanode_hash = ioctlcmd.calculate_datanode_hash(datanode_ip, datanode_port) 
    job_wmask.append((datanode_hash, float("{0:.2f}".format(jobweight)))) 

  print("job_wmask is:", wmask) 
  #print(datanode_alloc)
  return job_wmask, wmask
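The helper incr_datanode_alloc_capacity is called throughout Example #2 but not shown. A plausible sketch, reconstructed only from its call sites and from the DRAM_GB/Flash_GB columns used elsewhere in the table; the actual implementation may differ:

def incr_datanode_alloc_capacity(node, capacity_fraction, latency_sensitive):
    # add the given fraction of a node's capacity to the allocation table,
    # on the storage tier implied by the job's latency sensitivity
    col = 'DRAM_GB' if latency_sensitive else 'Flash_GB'
    datanode_alloc.at[node, col] += capacity_fraction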