Example #1
        else:
            workspace.RunNet(self.model.net)


if __name__ == "__main__":
    ### import packages ###
    import sys
    import argparse

    import numpy as np

    # Caffe2 runtime, used below for device selection and net execution
    from caffe2.proto import caffe2_pb2
    from caffe2.python import core, workspace

    sys.path.append("..")
    # data generation
    from data_generator.dlrm_data_caffe2 import DLRMDataGenerator

    from utils.utils import cli

    args = cli()

    ### some basic setup ###
    np.random.seed(args.numpy_rand_seed)
    np.set_printoptions(precision=args.print_precision)

    use_gpu = args.use_gpu
    if use_gpu:
        device_opt = core.DeviceOption(workspace.GpuDeviceType, 0)
        ngpus = workspace.NumGpuDevices()  # 1
        print("Using {} GPU(s)...".format(ngpus))
    else:
        device_opt = core.DeviceOption(caffe2_pb2.CPU)
        print("Using CPU...")

    ### prepare training data ###
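
The excerpt stops at data preparation. As a minimal, hedged sketch (not part of the original source), the device_opt chosen in the branch above would typically be applied through a Caffe2 device scope so that operators created afterwards run on the selected device:

    # Illustrative continuation (assumption): scope subsequent operator
    # creation and execution to the selected device.
    with core.DeviceScope(device_opt):
        # model construction / net execution would go here, e.g.
        # workspace.RunNetOnce(init_net)
        pass
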
Example #2
from multiprocessing import Process, Queue

import numpy as np
import os
import sys

# NOTE: the project-level helpers used below (cli, loadGenerator,
# inferenceEngine, accelInferenceEngine) are assumed to come from the
# surrounding DeepRecSys code base and are not shown in this excerpt.


def DeepRecSys():
  print("Running DeepRecSys")

  # ######################################################################
  # Get and print command line arguments for this experiment
  # ######################################################################
  args = cli()

  arg_keys = [str(key) for key in vars(args)]
  print("============================================================")
  print("DeepRecSys configuration")
  for key in arg_keys:
    print(key, getattr(args, key))
  print("============================================================")

  if args.queue:

    # Add one extra engine slot for the accelerator inference engine
    if args.model_accel:
      args.inference_engines += 1

    print("[DeepRecSys] total inference engines ", args.inference_engines)

    # Set up a single request queue and one response queue per inference engine
    requestQueue    = Queue(maxsize=1024)   # CPU-bound requests from the load generator
    accelRequestQueue = Queue(maxsize=32)   # requests routed to the accelerator engine
    pidQueue        = Queue()               # running p95 tail-latencies fed back to the load generator
    responseQueues  = []                    # one response queue per inference engine (filled below)
    inferenceEngineReadyQueue = Queue()     # readiness hand-shake between the engines and the load generator

    for _ in range(args.inference_engines):
      responseQueues.append(Queue())

    # Create load generator to mimic per-server load
    loadGeneratorReturnQueue = Queue()
    DeepRecLoadGenerator = Process( target = loadGenerator,
                        args   = (args, requestQueue, loadGeneratorReturnQueue, inferenceEngineReadyQueue, pidQueue, accelRequestQueue)
                      )

    # Create backend inference engines that consume requests from load
    # generator
    DeepRecEngines = []
    for i in range(args.inference_engines):
      if (args.model_accel) and (i == (args.inference_engines - 1)):
        p = Process( target = accelInferenceEngine,
                     args   = (args, accelRequestQueue, i, responseQueues[i], inferenceEngineReadyQueue)
                   )
      else:
        p = Process( target = inferenceEngine,
                     args   = (args, requestQueue, i, responseQueues[i], inferenceEngineReadyQueue)
                   )
      p.daemon = True
      DeepRecEngines.append(p)

    # Start all processes
    for i in range(args.inference_engines):
      DeepRecEngines[i].start()

    DeepRecLoadGenerator.start()

    responses_list = []
    inference_engines_finished = 0

    response_sets = {}
    response_latencies = []
    final_response_latencies = []

    request_granularity = int(args.req_granularity)

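    # Drain responses until every inference engine has sent its None sentinel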
    while inference_engines_finished != args.inference_engines:
      for i in range(args.inference_engines):
        if not responseQueues[i].empty():
          response = responseQueues[i].get()

          # Process responses to determine the running tail-latency and send
          # a new batch size to the loadGenerator
          if response is None:
            inference_engines_finished += 1
            print("Joined ", inference_engines_finished, " inference engines")
            sys.stdout.flush()
          else:
            key = (response.epoch, response.batch_id, response.exp_packet)
            if key in response_sets:  # Merge this sub-batch into the existing entry
              curr_val = response_sets[key]

              val      = (response.arrival_time,
                          response.inference_end_time,
                          response.total_sub_batches)

              arr = min(curr_val[0], val[0])
              inf = max(curr_val[1], val[1])
              remain = curr_val[2] - 1
            else:  # First sub-batch seen for this request
              arr = response.arrival_time
              inf = response.inference_end_time
              remain = response.total_sub_batches - 1

            response_sets[key] = (arr, inf, remain)

            # If all sub-batches of this request have arrived, compute the
            # request latency in order to guide the batch scheduler
            if remain == 0:
              response_latencies.append( inf - arr )

              # Once the optimum batching and accelerator partitioning
              # thresholds have been found, log the latency of measurement
              # packets for the final tail-latency statistics
              if not response.exp_packet:
                final_response_latencies.append( inf - arr )

              if len(response_latencies) % request_granularity == 0:
                running_p95 = np.percentile(response_latencies[-request_granularity:], 95) * 1000.
                print("Running latency: ", running_p95)
                sys.stdout.flush()
                # Feed the running p95 tail-latency back to the load generator
                pidQueue.put(running_p95)

            # Add responses to final list
            responses_list.append(response.__dict__)

    print("Finished runing over the inference engines")
    sys.stdout.flush()

    log_dir = os.path.dirname(args.log_file)

    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)

    with open(args.log_file, "w") as f:
        for response in responses_list:
          f.write(str(response) + "\n")

    # Join/end all processes
    DeepRecLoadGenerator.join()
    total_requests = loadGeneratorReturnQueue.get()

    cpu_sub_requests = total_requests[0]
    cpu_requests     = total_requests[1]
    accel_requests   = total_requests[2]

    agg_requests     = cpu_sub_requests + accel_requests

    print("Exiting DeepRecSys after printing ", len(responses_list), "/" , agg_requests)

    print("CPU sub requests ", cpu_sub_requests, "/" , agg_requests)
    print("CPU requests ", cpu_requests)
    print("Accel requests ", accel_requests, "/" , agg_requests)

    meas_qps_responses = list(filter(lambda x: (not x['exp_packet']) and (x['sub_id'] == 0), responses_list))

    initial_time = meas_qps_responses[0]['inference_end_time']
    end_time     = meas_qps_responses[-1]['inference_end_time']

    print("Measured QPS: ",  (len(meas_qps_responses)) / (end_time - initial_time))
    print("Measured p95 tail-latency: ",  np.percentile(final_response_latencies, 95) * 1000., " ms")
    print("Measured p99 tail-latency: ",  np.percentile(final_response_latencies, 99) * 1000., " ms")

    sys.stdout.flush()

    for i in range(args.inference_engines):
      DeepRecEngines[i].terminate()

  else: # No queue, run DeepRecSys in standalone mode
    inferenceEngine(args)

  return
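
For completeness, a minimal entry point for the driver above; the guard is not shown in the excerpt, so this is an assumption about how the module is invoked:

if __name__ == "__main__":
  DeepRecSys()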