def test_extern_opt():
    def Optimize(mod):
        return relay.transform.FoldConstant()(mod)

    tvm.register_func("relay.ext.test_target.optimize", Optimize)

    x = relay.var("x", shape=(2, 2))
    y0 = relay.var("y0", shape=(2, 2))
    y1 = relay.var("y1", shape=(2, 2))
    yy0 = relay.annotation.compiler_begin(y0, "test_target")
    yy1 = relay.annotation.compiler_begin(y1, "test_target")
    z = yy0 + yy1
    end = relay.annotation.compiler_end(z, "test_target")
    f = relay.Function([x, y0, y1], end * x)
    c = np.ones(shape=(2, 2), dtype="float32")
    f = bind_params_by_name(f, {"y0": tvm.nd.array(c), "y1": tvm.nd.array(c)})
    mod = tvm.IRModule()
    mod["main"] = f
    mod = transform.InferType()(mod)
    mod = transform.PartitionGraph()(mod)

    try:
        t0 = mod["test_target_0"]
    except Exception as exc:
        raise KeyError("test_target_0 not found") from exc

    assert isinstance(t0.body, relay.Constant)
    expected = np.full((2, 2), 2.0)
    tvm.testing.assert_allclose(t0.body.data.asnumpy(), expected, rtol=1e-5, atol=1e-5)
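
As a side note, the hook registered above is reachable through TVM's global function registry. A minimal sketch, assuming `tvm` is imported and `m` is an already-built relay.IRModule (both names are assumptions here, not part of the test):

opt = tvm.get_global_func("relay.ext.test_target.optimize")
optimized_mod = opt(m)  # calls Optimize(m), i.e. FoldConstant over the module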
Example #2
    def mk_primitive_op(self, func: Expr, args, output_type) -> Expr:
        cc_key = compile_engine.CCacheKey(func, self.tgt)
        func_hash = tvm.ir.structural_hash(func)
        name = f"op_{func_hash}"
        if not get_global_func(name, allow_missing=True):
            jit_func = self.engine.jit(cc_key, self.tgt)
            register_func(name, jit_func)
        return PackedCall(name, args, [x.checked_type for x in args], output_type)
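
A standalone sketch of the check-then-register idiom above: lowered primitives are keyed by their structural hash, so an identical function is JIT-compiled and registered only once (`lowered_fn` is a hypothetical stand-in for `self.engine.jit(...)`):

import tvm

def get_or_register(func, lowered_fn):
    # Key the global registration by the function's structural hash.
    name = f"op_{tvm.ir.structural_hash(func)}"
    if not tvm.get_global_func(name, allow_missing=True):
        tvm.register_func(name, lowered_fn)
    return tvm.get_global_func(name)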
Example #3
def main_compute(code_only=False):
  tvm.register_func('tvm_callback_cuda_compile', compile_source, override=True)
  logging.getLogger('autotvm').setLevel(logging.DEBUG)
  logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

  default_tune_op = importlib.import_module('templates.' + os.environ.get('OP', 'auto.generic'))
  if verbose:
    print('  >> Backend = %s, Python PID = %s, Task = %s;' % (backend, os.getpid(), default_tune_op.__name__))

  task = autotvm.task.create("template_op", args=(), target=tvm_target)

  def json_to_config(json_dict, index=-1, code_hash=None):
    if not isinstance(json_dict, list):
      json_list = []
      for key in json_dict:
        json_list.append([key, 'ot' if type(json_dict[key]) is not list else ('sp' if json_dict[key][0:1] == [-1] else 're'), json_dict[key]])
      json_dict = json_list
    config = ConfigEntity.from_json_dict({"index": index, "time": "", "code_hash": code_hash, "entity": json_dict})
    # config = ConfigEntity.from_json_dict({"i": index, "t": "", "c": code_hash, "e": json_dict})
    return config

  def config_to_json(config):
    if config is None:
      return {}
    if isinstance(config, str):
      return json.loads(config)
    jobj = config.to_json_dict()['entity']
    # jobj = config.to_json_dict()['e']
    json_dict = dict()
    for i in range(len(jobj)):
      assert(jobj[i][1] in ['sp', 'ot', 're'])
      json_dict[jobj[i][0]] = jobj[i][2]
    return json_dict

  num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

  config = os.environ.get('CONFIG', '').strip()
  if config != '':
    best_config = config
  elif 'NNI_TRIAL_JOB_ID' in os.environ:
    if os.environ['NNI_TRIAL_JOB_ID'] == '@':
      search_space = get_search_space(task.config_space)
      json_space = json.dumps(search_space)
      dump_to_file = './search_space.json'
      print("\n>> Writing Search Space to '%s', Search Space = %s;" % (dump_to_file, json_space))
      with open(dump_to_file, "w") as fp:
        fp.write(json_space)
      sys.exit(0)

    try:
      import nni
      params_given = nni.get_next_parameter()
      if params_given is None:
        raise ValueError('NNI returned no parameters')
      local_dir_id = os.environ['NNI_TRIAL_JOB_ID']
    except Exception:
      params_given = default_tune_op.get_choice_example()
      local_dir_id = '_'
    t = run_config_entity(params_given, local_dir_id)
    gflops = compute_gflops(task.flop, t)
    print('[Antares-engine] Final entity result is: %g' % gflops)
    try:
      nni.report_final_result(gflops)
    except Exception:
      print('[Antares-engine] (not reporting final result to NNI.)')
    exit(0)

  elif num_trials > 0:
    dev_num = platform_config.get_execution_parallism()
    if dev_num <= 0:
        raise Exception("No valid device found for backend: %s." % backend)
    batch_size = int(os.environ.get('BATCH', '16'))

    from concurrent.futures import ThreadPoolExecutor
    try:
      concurrent_ok = platform_config.allow_concurrent_compile_execution()
    except Exception:
      concurrent_ok = True
    # Use the full batch as workers when concurrent compile+execution is allowed
    # (or the query is unavailable); otherwise serialize with a single worker.
    worker_size = batch_size if concurrent_ok else 1
    thread_pool = ThreadPoolExecutor(max_workers=worker_size)

    task.antares_helper = Mock()
    task.antares_helper.json_to_config = json_to_config
    task.antares_helper.config_to_json = config_to_json
    task.antares_helper.to_json_search_space = get_search_space

    tuner_type = os.environ.get('TUNER', '')
    if not tuner_type:
      comp = os.environ['COMPUTE_V1']
      if '=!' in comp and 'plan/' not in comp[comp.find(' ##') + 1:] and ';' not in comp and backend in ['c-rocm', 'c-cuda', 'c-hlsl', 'c-ocl']:
        tuner_type = 'AutoTVM2'
      else:
        tuner_type = 'XGBoost'
    print('  >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s' % (worker_size, batch_size, dev_num, tuner_type))

    auto_commit = os.environ.get('COMMIT', '')
    if auto_commit:
      saved_code = codehub_db(os.environ['COMPUTE_V1'])
      if saved_code is not None and auto_commit != 'force':
        raise Exception("Saved code has existed in codehub. Please try COMMIT=force to override it.")
      os.environ.pop('COMMIT')

    try:
      tuner = importlib.import_module('tuner.%s.main' % tuner_type)
      tuner = tuner.MainTuner(task)
    except Exception as exc:
      raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type) from exc

    if tuner is not None:
      AntaresGlobal.current_step = 0

      def measure_batch(inputs):
        results, futures = [], []
        best_slot = -1
        expected_timecost = tuner.task.best.timecost
        for i in range(len(inputs)):
          futures.append(thread_pool.submit(run_config_entity, config_to_json(inputs[i].config), AntaresGlobal.current_step + i + 1, expected_timecost, i % dev_num))
        for i in range(len(inputs)):
          t = futures[i].result()
          if t < tuner.task.best.timecost:
            best_slot = AntaresGlobal.current_step + i + 1
            tuner.task.best.timecost = t
            tuner.task.best.config = inputs[i].config
            tuner.task.best.occur = best_slot
          results.append(autotvm.measure.MeasureResult(costs=(t,), error_no=0, all_cost=i, timestamp=time.time()))
        AntaresGlobal.current_step += len(results)

        print('\nSTEP[%d / %d] Current Best Config = %s, Perf = %g Gflops, MemRatio = %g %%, Occur Step = %d;' % (
          AntaresGlobal.current_step,
          num_trials,
          json.dumps(config_to_json(tuner.task.best.config)),
          compute_gflops(tuner.task.flop, tuner.task.best.timecost),
          compute_mem_ratio(tuner.task.best.timecost),
          tuner.task.best.occur))

        if auto_commit and best_slot >= 0:
          with open(local_get_dir_file('my_kernel.cc', best_slot), 'r') as fp:
            device_source = fp.read()
          with open(local_get_dir_file('result.txt', best_slot), 'r') as fp:
            t = float(fp.read().split()[0])
          kernel_path = codehub_db(os.environ['COMPUTE_V1'], source_code=device_source + '\n// Saved Perf = %g sec / run; Step Produced = %d;' % (t, best_slot))
          print('  >> Update current code to codehub: %s' % kernel_path)
        return results

      tuner.task.best = Mock()
      tuner.task.best.timecost = float('inf')
      tuner.task.best.config = None
      tuner.task.best.occur = -1

      tuner.measure_batch = measure_batch
      tuner.measure_batch.n_parallel = batch_size
      callbacks = []

      history_log_for_transfer_learning = os.environ.get('RECORD', '')

      if history_log_for_transfer_learning:
        callbacks.append(autotvm.callback.log_to_file(history_log_for_transfer_learning))
        # Enable Transfer Learning for Incremental Task
        if os.path.exists(history_log_for_transfer_learning):
          print('  >>  Loading incremental history from log file: %s ..' % history_log_for_transfer_learning)
          tuner.load_history(autotvm.record.load_from_file(history_log_for_transfer_learning))

      tuner.tune(n_trial=num_trials, measure_option=autotvm.measure_option(
          builder=autotvm.LocalBuilder(n_parallel=batch_size),
          runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
      ), callbacks=callbacks)
      assert not math.isinf(tuner.task.best.timecost), "No valid config found in the whole tuning."
      best_config = json.dumps(config_to_json(tuner.task.best.config))

      if auto_commit:
        device_source = codehub_db(os.environ['COMPUTE_V1'])
        codehub_db(os.environ['COMPUTE_V1'], source_code=device_source + '\n// Antares Tuning Completed in %d steps.' % AntaresGlobal.current_step)

      print("\n[Best Config] CONFIG='%s'  ==>  Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec." % (
        best_config,
        compute_gflops(tuner.task.flop, tuner.task.best.timecost),
        tuner.task.best.occur,
        num_trials,
        tuner.task.best.timecost))

      if hasattr(tuner, 'cleanup'):
        tuner.cleanup()
    else:
      raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
    exit(0)
  else:
    if os.environ['OP'] == 'auto.generic':
      saved_code = codehub_db(os.environ['COMPUTE_V1'])
      if saved_code is not None:
        print("  >> Using Saved Code from Codehub:")
        print("===========================")
        print(saved_code)
        print("===========================")
        exit(0)
    best_config = ''

  assert isinstance(best_config, str)
  if verbose:
    print("====>> [Current Config Option]", best_config)
  if best_config.startswith('['):
    from tvm import auto_scheduler
    origin_cfg = json.loads(best_config)
    origin_cfg = {
      "i": [['["main_compute.<locals>.auto_template"]', 'cuda -keys=cuda,gpu -max_num_threads=%d -thread_warp_size=%d' % (
                device_properties().max_threads_per_block, device_properties().warp_size
             )], origin_cfg],
      "r": [[0], 0, 0, 0],
      "v": "v0.2",
    }
    origin_cfg_file = local_get_dir_file('my_kernel.cfg')
    with open(origin_cfg_file, 'w') as fp:
      fp.write(json.dumps(origin_cfg))
    origin_cfg = tvm.auto_scheduler.measure_record.load_records(origin_cfg_file)
 
    @auto_scheduler.register_workload
    def auto_template():
      _, arg_bufs = default_tune_op.get_template_op()
      return arg_bufs

    target = tvm.target.Target("cuda")
    auto_task = auto_scheduler.create_task(auto_template, (), target)
    for inp, res in origin_cfg:
      s, arg_bufs = auto_task.compute_dag.apply_steps_from_state(inp.state)
      break
  else:
    config = json_to_config(json.loads(best_config)) if best_config else task.config_space
    with ApplyConfig(config):
      with tvm.target.Target(tvm_target):
        s, arg_bufs = default_tune_op.get_template_op()

  device_source, kernel_path = get_target_source(s, arg_bufs)

  if code_only:
    return device_source

  if verbose:
    print("====================================")
    print(device_source)
    print("====================================\n")

  dev_id = int(os.environ.get('DEV_KEY', '0'))
  result = evaluate_perf(kernel_path, task.flop, dev_id)
  exit(0 if result is not None else 1)
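
For reference, a data-format sketch of what json_to_config/config_to_json shuttle back and forth (the knob names and values below are made-up examples):

# Plain dict form: knob name -> value.
plain = {"tile_x": [-1, 2, 8], "order": [0, 2, 1], "unroll": 1}
# Tagged entity form built by json_to_config: lists starting with -1 become
# splits ('sp'), other lists become reorders ('re'), scalars become options ('ot').
tagged = [["tile_x", "sp", [-1, 2, 8]],
          ["order",  "re", [0, 2, 1]],
          ["unroll", "ot", 1]]
# config_to_json(...) walks the entity records and rebuilds the plain dict.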
Example #4
    def create_op_call(self, op: Function, relay_args, py_args):
        """Lowers the passed primitive function, registers it in TVM's
        global compiler, and produces a call to the lowered function in
        the generated Python code."""

        # compile the function and register globally
        cc_key = compile_engine.CCacheKey(op, self.tgt)
        func_hash = tvm.ir.structural_hash(op)
        op_name = "_lowered_op_{}".format(func_hash)
        if not tvm.get_global_func(op_name, allow_missing=True):
            jitted = self.engine.jit(cc_key, self.tgt)
            tvm.register_func(op_name, jitted)

        def convert_input(py_input, arg_type):
            """Use the types of the function arguments to determine whether we expect
            a tensor or tuple (returns list of inputs to the lowered op call)"""
            # equivalent: input.data
            if isinstance(arg_type, relay.TensorType):
                return [py_input]
            assert isinstance(arg_type, relay.TupleType)
            # convert each input.fields[i]
            ret = []
            for i in range(len(arg_type.fields)):
                ret += convert_input(
                    ast.Subscript(py_input, ast.Index(Num(i)), Load()),
                    arg_type.fields[i])
            return ret

        def convert_output(ret_type):
            """Use the function return type to produce auxiliary variables to store outputs.
            Returns ([assignments of output vars], [extra arguments to pass to op call],
            expression collecting output)"""
            if isinstance(ret_type, relay.TensorType):
                output_var_name = self.generate_var_name("_out")
                output_var = Name(output_var_name, Load())
                shape = ast.Tuple(
                    [Num(dim) for dim in ret_type.concrete_shape], Load())
                # create a new NDArray of the right shape and dtype
                assign_output = Assign(
                    [Name(output_var_name, Store())],
                    self.create_call("nd.array", [
                        self.create_call("numpy.empty",
                                         [shape, Str(ret_type.dtype)])
                    ]),
                )
                return ([assign_output], [output_var], output_var)
            assert isinstance(ret_type, relay.TupleType)
            assignments = []
            extra_args = []
            fields = []
            for t in ret_type.fields:
                inner_assignments, inner_args, inner_output = convert_output(t)
                assignments += inner_assignments
                extra_args += inner_args
                fields.append(inner_output)
            fields = [ast.List(fields, Load())]
            return (assignments, extra_args,
                    self.create_call("_container.tuple_object", fields))

        # create a function to wrap the call of the lowered op and return
        # a call to that function
        wrap_name = self.generate_function_name("_{}_wrapper".format(op_name))
        wrap_args = [
            self.generate_var_name("_arg_{}".format(i))
            for i in range(len(py_args))
        ]

        inner_call_args = []
        for i in range(len(py_args)):
            inner_call_args += convert_input(Name(wrap_args[i], Load()),
                                             relay_args[i].checked_type)
        output_assignments, aux_args, output = convert_output(
            op.checked_type.ret_type)
        # equiv: _op = tvm.get_global_func(op_name)
        op_var = self.generate_var_name("_op")
        op_call = self.create_call("tvm.get_global_func", [Str(op_name)])
        op_assign = Assign([Name(op_var, Store())], op_call)
        # equiv: _op(args)
        inner_call = self.create_call(op_var, inner_call_args + aux_args)
        body = output_assignments + [
            op_assign, ast.Expr(inner_call),
            Return(output)
        ]
        wrap_def = self.create_def(wrap_name, wrap_args, body)
        return wrap_def, self.create_call(wrap_name, py_args)
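
A plain-Python analogue of the convert_input flattening above (a sketch, not part of the original converter): a TensorType argument stays a single input, while a TupleType is recursively expanded into its leaf tensors.

from tvm import relay

def flatten_arg(value, arg_type):
    # A tensor argument maps to exactly one runtime input.
    if isinstance(arg_type, relay.TensorType):
        return [value]
    # A tuple argument is expanded field by field, recursively.
    assert isinstance(arg_type, relay.TupleType)
    flat = []
    for i, field_ty in enumerate(arg_type.fields):
        flat += flatten_arg(value[i], field_ty)
    return flat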
Example #5
def main_compute(code_only=False):
    tvm_target = 'cuda'
    tvm.register_func('tvm_callback_cuda_compile',
                      compile_source,
                      override=True)
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    default_tune_op = importlib.import_module('templates.' + (
        os.environ['OP'] if 'OP' in os.environ else 'auto.generic'))
    print('  >> Backend = %s, Python PID = %s, Task = %s;' %
          (backend, os.getpid(), default_tune_op.__name__))

    task = autotvm.task.create("template_op", args=(), target=tvm_target)

    def json_to_config(json_dict, index=-1, code_hash=None):
        if not isinstance(json_dict, list):
            json_list = []
            for key in json_dict:
                json_list.append([
                    key, 'ot' if type(json_dict[key]) is not list else
                    ('sp' if json_dict[key][0:1] == [-1] else 're'),
                    json_dict[key]
                ])
            json_dict = json_list
        config = ConfigEntity.from_json_dict({
            "index": index,
            "time": "",
            "code_hash": code_hash,
            "entity": json_dict
        })
        # config = ConfigEntity.from_json_dict({"i": index, "t": "", "c": code_hash, "e": json_dict})
        return config

    def config_to_json(config):
        if config is None:
            return {}
        if isinstance(config, str):
            return json.loads(config)
        jobj = config.to_json_dict()['entity']
        # jobj = config.to_json_dict()['e']
        json_dict = dict()
        for i in range(len(jobj)):
            assert (jobj[i][1] in ['sp', 'ot', 're'])
            json_dict[jobj[i][0]] = jobj[i][2]
        return json_dict

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    config = os.environ.get('CONFIG', '').strip()
    if config != '':
        if config[0] != '[':
            params_given = json.loads(config)
            print("====>> [Current Config Option]", config)
            best_config = json_to_config(params_given)
        else:
            best_config = config

    elif 'NNI_TRIAL_JOB_ID' in os.environ:
        if os.environ['NNI_TRIAL_JOB_ID'] == '@':
            search_space = get_search_space(task.config_space)
            json_space = json.dumps(search_space)
            dump_to_file = './search_space.json'
            print("\n>> Writing Search Space to '%s', Search Space = %s;" %
                  (dump_to_file, json_space))
            with open("search_space.json", "w") as fp:
                fp.write(json_space)
            sys.exit(0)

        try:
            import nni
            params_given = nni.get_next_parameter()
            if params_given is None:
                raise ValueError('NNI returned no parameters')
            local_dir_id = os.environ['NNI_TRIAL_JOB_ID']
        except Exception:
            params_given = default_tune_op.get_choice_example()
            local_dir_id = '_'
        t = run_config_entity(params_given, local_dir_id)
        gflops = compute_gflops(task.flop, t)
        print('[Antares-engine] Final entity result is: %g' % gflops)
        try:
            nni.report_final_result(gflops)
        except Exception:
            print('[Antares-engine] (not reporting final result to NNI.)')
        exit(0)

    elif num_trials > 0:
        dev_num = platform_config.get_execution_parallism()
        if dev_num <= 0:
            raise Exception("No valid device found for backend: %s." % backend)
        batch_size = int(os.environ.get('BATCH', '16'))

        from concurrent.futures import ThreadPoolExecutor
        try:
            concurrent_ok = platform_config.allow_concurrent_compile_execution()
        except Exception:
            concurrent_ok = True
        # Use the full batch as workers when concurrent compile+execution is
        # allowed (or the query is unavailable); otherwise serialize with one.
        worker_size = batch_size if concurrent_ok else 1
        thread_pool = ThreadPoolExecutor(max_workers=worker_size)

        task.antares_helper = Mock()
        task.antares_helper.json_to_config = json_to_config
        task.antares_helper.config_to_json = config_to_json
        task.antares_helper.to_json_search_space = get_search_space

        tuner_type = os.environ.get('TUNER', 'XGBoost')
        print('  >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s' %
              (worker_size, batch_size, dev_num, tuner_type))

        auto_commit = os.environ.get('COMMIT', '')
        if auto_commit:
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None and auto_commit != 'force':
                raise Exception(
                    "Saved code already exists in codehub. Please try COMMIT=force to override it."
                )
            os.environ.pop('COMMIT')

        try:
            tuner = importlib.import_module('tuner.%s.main' % tuner_type)
            tuner = tuner.MainTuner(task)
        except Exception as exc:
            raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type) from exc

        if tuner is not None:

            def measure_batch(inputs):
                results, futures = [], []
                best_slot = -1
                expected_timecost = tuner.task.best.timecost
                for i in range(len(inputs)):
                    futures.append(
                        thread_pool.submit(run_config_entity,
                                           config_to_json(inputs[i].config), i,
                                           expected_timecost, i % dev_num))
                for i in range(len(inputs)):
                    t = futures[i].result()
                    if t < tuner.task.best.timecost:
                        best_slot = i
                        tuner.task.best.timecost = t
                        tuner.task.best.config = inputs[i].config
                        tuner.task.best.occur = tuner.task.best.curr_step + i + 1
                    results.append(
                        autotvm.measure.MeasureResult(costs=(t, ),
                                                      error_no=0,
                                                      all_cost=i,
                                                      timestamp=time.time()))
                tuner.task.best.curr_step += len(results)

                print(
                    '\nSTEP[%d / %d] Current Best Config = %s, Perf = %g Gflops, Occur Step = %d;'
                    %
                    (tuner.task.best.curr_step, num_trials,
                     json.dumps(config_to_json(tuner.task.best.config)),
                     compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                     tuner.task.best.occur))

                if auto_commit and best_slot >= 0:
                    with open(local_get_dir_file('my_kernel.cc', best_slot),
                              'r') as fp:
                        device_source = fp.read()
                    with open(local_get_dir_file('result.txt', best_slot),
                              'r') as fp:
                        t = float(fp.read().split()[0])
                    kernel_path = codehub_db(
                        os.environ['COMPUTE_V1'],
                        source_code=device_source +
                        '\n// Saved Perf = %g sec / run; Step Produced = %d;' %
                        (t, tuner.task.best.curr_step))
                    print('  >> Update current code to codehub: %s' %
                          kernel_path)
                return results

            tuner.task.best = Mock()
            tuner.task.best.timecost = float('inf')
            tuner.task.best.config = None
            tuner.task.best.occur = -1
            tuner.task.best.curr_step = 0

            tuner.measure_batch = measure_batch
            callbacks = []

            history_log_for_transfer_learning = os.environ.get('RECORD', '')

            if history_log_for_transfer_learning:
                callbacks.append(
                    autotvm.callback.log_to_file(
                        history_log_for_transfer_learning))
                # Enable Transfer Learning for Incremental Task
                if os.path.exists(history_log_for_transfer_learning):
                    print(
                        '  >>  Loading incremental history from log file: %s ..'
                        % history_log_for_transfer_learning)
                    tuner.load_history(
                        autotvm.record.load_from_file(
                            history_log_for_transfer_learning))

            tuner.tune(n_trial=num_trials,
                       measure_option=autotvm.measure_option(
                           builder=autotvm.LocalBuilder(n_parallel=batch_size),
                           runner=autotvm.LocalRunner(repeat=3,
                                                      min_repeat_ms=100,
                                                      timeout=4)),
                       callbacks=callbacks)
            assert not math.isinf(
                tuner.task.best.timecost
            ), "No valid config found in the whole tuning."
            best_config = tuner.task.best.config

            print(
                "\n[Best Config] CONFIG='%s'  ==>  Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec."
                %
                (json.dumps(config_to_json(best_config)),
                 compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                 tuner.task.best.occur, num_trials, tuner.task.best.timecost))

            if hasattr(tuner, 'cleanup'):
                tuner.cleanup()
        else:
            raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
        exit(0)
    else:
        if os.environ['OP'] == 'auto.generic':
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None:
                print("  >> Using Saved Code from Codehub:")
                print("===========================")
                print(saved_code)
                print("===========================")
                exit(0)
        best_config = task.config_space

    if isinstance(best_config, str):
        from tvm import auto_scheduler
        origin_cfg = json.loads(best_config)
        origin_cfg = {
            "i": [[
                '["main_compute.<locals>.auto_template"]',
                'cuda -keys=cuda,gpu -max_num_threads=%d -thread_warp_size=%d'
                % (device_properties().max_threads_per_block,
                   device_properties().warp_size)
            ], origin_cfg],
            "r": [[0], 0, 0, 0],
            "v":
            "v0.2",
        }
        origin_cfg_file = local_get_dir_file('my_kernel.cfg')
        with open(origin_cfg_file, 'w') as fp:
            fp.write(json.dumps(origin_cfg))
        origin_cfg = tvm.auto_scheduler.measure_record.load_records(
            origin_cfg_file)

        @auto_scheduler.register_workload
        def auto_template():
            _, arg_bufs = default_tune_op.get_template_op()
            return arg_bufs

        target = tvm.target.Target("cuda")
        auto_task = auto_scheduler.create_task(auto_template, (), target)
        for inp, res in origin_cfg:
            s, arg_bufs = auto_task.compute_dag.apply_steps_from_state(
                inp.state)
            break
    else:
        with ApplyConfig(best_config):
            with tvm.target.Target(tvm_target):
                s, arg_bufs = default_tune_op.get_template_op()

    if s is not None:
        lower_source = str(tvm.lower(s, arg_bufs, simple_mode=True))

        lower_file = local_get_dir_file('my_kernel.lower')
        with open(lower_file, 'w') as fp:
            fp.write(lower_source)

        # Verify the lowered code
        if len(('\n' + lower_source).split('\nprimfn(')) != 2:
            raise Exception('[Multiple un-fusable kernels are not supported]\n\n' +
                            lower_source)

        max_threads_per_block = device_properties().max_threads_per_block
        max_shared_memory_per_block = device_properties(
        ).max_shared_memory_per_block
        assert max_threads_per_block > 0 and max_shared_memory_per_block >= 0, '[Error] Invalid device properties, maybe device is not detected correctly.'

        lower_lines = lower_source.split('\n')
        thread_extents, allocate_shared = [], []
        for ll in lower_lines:
            if ll.strip().startswith(
                    'attr [IterVar(') and ll.find(' "thread_extent" = ') >= 0:
                thread_name = ll.split('attr [IterVar(')[-1].split(':')[0]
                thread_val = int(
                    ll.split(' "thread_extent" = ')[-1].split(';')
                    [0].strip().split(' ')[0])
                thread_extents.append((thread_name, thread_val))
            elif ll.strip().startswith('allocate(') and ll.find(
                    '.shared, ') >= 0 and ll.endswith(");"):
                parts = ll[:-2].split(', ')[1:]
                allocate_type = parts[0]
                allocate_val = int(np.prod(eval(parts[1])))
                allocate_shared.append((allocate_type, allocate_val))

        reserved_axes = dict()
        for thread_name, thread_val in thread_extents:
            if thread_name in reserved_axes:
                assert reserved_axes[
                    thread_name] == thread_val, "Invalid code: Multiple hints for thread extent conflict with each other: %d v.s. %d" % (
                        reserved_axes[thread_name], thread_val)
            else:
                reserved_axes[thread_name] = thread_val

        num_threads = 1
        for thread_name in ['threadIdx.x', 'threadIdx.y', 'threadIdx.z']:
            num_threads *= reserved_axes.get(thread_name, 1)
        assert num_threads <= max_threads_per_block, "Invalid kernel code: using num_threads(%d) > max_threads_per_block(%d)" % (
            num_threads, max_threads_per_block)

        shared_memory_in_bytes = 0
        for allocate_type, allocate_size in allocate_shared:
            if allocate_type.startswith('custom['):
                type_name = allocate_type[7:].split(']')[0]
                shared_memory_inc = int(
                    custom_dtypes[type_name][-1].split('@')[-1])
            else:
                shared_memory_inc = 8 * np.dtype(allocate_type).itemsize
            assert shared_memory_inc % 8 == 0, "Shared memory element size in bits is not a multiple of 8."
            shared_memory_in_bytes += shared_memory_inc // 8 * allocate_size

        if shared_memory_in_bytes > max_shared_memory_per_block:
            raise Exception(
                "Invalid kernel code: using shared_memory_in_bytes %d > max_shared_memory_per_block %d"
                % (shared_memory_in_bytes, max_shared_memory_per_block))

        # Compile Source Code
        def build_template():
            return tvm.build(s, arg_bufs, tvm_target, name='template_op')

        func = wait_for(build_template, 30)

    assert (len(func.imported_modules) == 1)
    device_source = translate_code(func.imported_modules[0].get_source())

    if code_only:
        return device_source

    print("====================================")
    print(device_source)
    print("====================================")

    print()
    try:
        eval_client = importlib.import_module('platforms.%s.evaluator.client' %
                                              backend)
    except ModuleNotFoundError:
        print('>> Evaluator for backend %s not found, skipping evaluation.' %
              backend)
        exit(0)
    except Exception:
        traceback.print_exc()
        exit(1)

    def handle_result(result):
        print('\n[EvalAgent] Results =', json.dumps(result))
        if 'RESULT' in os.environ:
            if abs(float(os.environ['RESULT']) / result['K/0'] - 1.0) > 1e-6:
                result['TPR'] = None

        t = result.get('TPR', None)
        if t is None:
            print("\n[Antares] Incorrect compute kernel from evaluator.")
        else:
            gflops = compute_gflops(task.flop, t)
            print("\n[Antares] Average time cost / run = %g sec, %g gflops." %
                  (t, gflops))
            with open(local_get_dir_file('result.txt'), 'w') as fp:
                fp.write(str(t) + '\n')
                if 'K/0' in result:
                    fp.write(str(result['K/0']) + '\n')
        if t is not None and os.environ['OP'] == 'auto.generic' and os.environ.get('COMMIT', ''):
            kernel_path = codehub_db(os.environ['COMPUTE_V1'],
                                     source_code=device_source +
                                     '\n// Saved Perf = %g sec / run' % t)
            print('  >> Update current code to codehub: %s' % kernel_path)

    tune_slot_id = int(os.environ.get(unified_slot_key, '0'))

    exec_fd, _ = system_lock([tune_slot_id])
    try:
        expected_timeout = None
        if 'EXPECTED_TIMEOUT' in os.environ and not math.isinf(
                float(os.environ['EXPECTED_TIMEOUT'])):
            expected_timeout = float(os.environ['EXPECTED_TIMEOUT'])
            expected_timeout = max(expected_timeout * 1.1,
                                   expected_timeout + 0.1)

        results = eval_client.eval(
            kernel_path=local_get_dir_file('my_kernel.cc'),
            expected_timeout=expected_timeout,
            func=func,
        )
    except Exception:
        traceback.print_exc()
        exit(1)

    handle_result(results)
    exec_fd()
    exit(0)
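
To illustrate the thread-extent parsing performed in the kernel-validation block above, here is the same string slicing applied to one fabricated line of lowered TIR text (the line itself is an assumption for illustration):

line = ('attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", '
        '"threadIdx.x")] "thread_extent" = 128;')
thread_name = line.split('attr [IterVar(')[-1].split(':')[0]    # 'threadIdx.x'
thread_val = int(line.split(' "thread_extent" = ')[-1]
                 .split(';')[0].strip().split(' ')[0])          # 128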
Example #6
    def server_init_callback():
        # pylint: disable=redefined-outer-name, reimported, import-outside-toplevel, import-self
        import tvm
        import vta.exec.rpc_server

        tvm.register_func("tvm.rpc.server.start", vta.exec.rpc_server.server_start, override=True)
Example #7
def main_compute(code_only=False):
    def compile_callback(code):
        return bytearray()

    tvm.register_func('tvm_callback_cuda_compile',
                      compile_callback,
                      override=True)

    default_tune_op = importlib.import_module('lang.generic')

    import logging
    from tvm import autotvm

    logging.getLogger('autotvm').setLevel(logging.ERROR)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))
    task = autotvm.task.create("template_op", args=(), target=tvm_target)

    AntaresGlobal.default_tune_op = default_tune_op
    AntaresGlobal.default_task = task

    if verbose:
        print('  >> Backend = %s, Python PID = %s, Task = %s;' %
              (backend, os.getpid(), default_tune_op.__name__))

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    config = os.environ.get('CONFIG', '').strip()
    if config != '':
        best_config = config
    elif num_trials > 0:
        dev_num = backend_config.get_execution_parallism()
        if dev_num <= 0:
            raise Exception("No valid device found for backend: %s." % backend)
        batch_size = int(os.environ.get('BATCH', '16'))

        from concurrent.futures import ThreadPoolExecutor
        worker_size = batch_size if batch_size < dev_num else dev_num
        thread_pool = ThreadPoolExecutor(max_workers=worker_size)

        tuner_type = os.environ.get('TUNER')
        if not tuner_type:
            explicit_ops = AntaresGlobal.attrs.explicit_ops
            tuner_type = 'OpEvo'
        print('  >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s\n' %
              (worker_size, batch_size, dev_num, tuner_type))

        auto_commit = os.environ.get('COMMIT', '')
        if auto_commit:
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None and auto_commit != 'force':
                raise Exception(
                    "Saved code already exists in codehub. Please try COMMIT=force to override it."
                )
            os.environ.pop('COMMIT')

        try:
            task.search_space_v2 = AntaresGlobal.attrs.auto_config.get_config_space()
            task.n_parallel = batch_size
            tuner = importlib.import_module('tuner.%s.main' % tuner_type)
            tuner = tuner.MainTuner(task)
        except Exception as exc:
            raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type) from exc

        if hasattr(tuner, 'cleanup'):
            AntaresGlobal.cleanup_funcs.append(tuner.cleanup)

        if tuner is not None:
            AntaresGlobal.current_step = 0
            eval_client.init(backend_root=backend_root)

            def measure_batch(inputs):
                results, futures = [], []
                target_sources, config_strs = [], []
                for i in range(len(inputs)):
                    dir_sid = AntaresGlobal.current_step + i + 1
                    config_str = (inputs[i].config
                                  if isinstance(inputs[i].config, str) else 'null')
                    config_strs.append(config_str)
                    try:
                        target_source = get_target_source(
                            config_strs[i], dir_sid)
                    except Exception:
                        # traceback.print_exc()
                        target_source = None
                    target_sources.append(target_source)

                expected_timecost = tuner.task.best.timecost
                for i in range(len(inputs)):
                    dir_sid = AntaresGlobal.current_step + i + 1
                    futures.append(
                        thread_pool.submit(run_config_entity,
                                           target_sources[i], config_strs[i],
                                           dir_sid, expected_timecost,
                                           i % dev_num))

                best_slot = -1
                for i in range(len(inputs)):
                    dir_sid = AntaresGlobal.current_step + i + 1
                    t = futures[i].result()
                    if t < tuner.task.best.timecost:
                        best_slot = dir_sid
                        tuner.task.best.timecost = t
                        tuner.task.best.config = inputs[i].config
                        tuner.task.best.occur = best_slot
                    results.append(
                        autotvm.measure.MeasureResult(costs=(t, ),
                                                      error_no=0,
                                                      all_cost=i,
                                                      timestamp=time.time()))
                AntaresGlobal.current_step += len(results)

                print(
                    '\nSTEP[%d / %d] Current Best Config = %s, Perf = %g sec / op (%g Gflops), MemRatio = %g %%, Occur Step = %d;\n'
                    %
                    (AntaresGlobal.current_step, num_trials,
                     tuner.task.best.config, tuner.task.best.timecost,
                     compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                     compute_mem_ratio(
                         tuner.task.best.timecost), tuner.task.best.occur))

                if auto_commit and best_slot >= 0:
                    with open(local_get_dir_file('my_kernel.cc', best_slot),
                              'r') as fp:
                        device_source = fp.read()
                    with open(local_get_dir_file('result.txt', best_slot),
                              'r') as fp:
                        t = float(fp.read().split()[0])
                    kernel_path = codehub_db(
                        os.environ['COMPUTE_V1'],
                        source_code=device_source + code_suffix(
                            tpr=t, step_prod=best_slot, step_plan=num_trials))
                    print('  >> Update current code to codehub: %s' %
                          kernel_path)
                return results

            tuner.task.best = Mock()
            tuner.task.best.timecost = float(
                os.environ.get('EXPECTED_TIMEOUT', 'inf'))
            tuner.task.best.config = None
            tuner.task.best.occur = -1

            tuner.measure_batch = measure_batch
            tuner.measure_batch.n_parallel = batch_size
            callbacks = []

            history_log_for_transfer_learning = os.environ.get('RECORD', '')

            if history_log_for_transfer_learning:
                callbacks.append(
                    autotvm.callback.log_to_file(
                        history_log_for_transfer_learning))
                # Enable Transfer Learning for Incremental Task
                if os.path.exists(history_log_for_transfer_learning):
                    print(
                        '  >>  Loading incremental history from log file: %s ..'
                        % history_log_for_transfer_learning)
                    tuner.load_history(
                        autotvm.record.load_from_file(
                            history_log_for_transfer_learning))

            tuner.tune(n_trial=num_trials,
                       callbacks=callbacks,
                       measure_option=None)
            if math.isinf(tuner.task.best.timecost):
                print(
                    f'[Error] No valid config found in the whole tuning. (Try tuner types other than `TUNER={tuner_type}`?)'
                )
                cleanup_on_exit(0, None)

            best_config = tuner.task.best.config

            if auto_commit:
                device_source = codehub_db(os.environ['COMPUTE_V1'])
                codehub_db(os.environ['COMPUTE_V1'],
                           source_code=device_source +
                           '\n// Antares Tuning Completed in %d steps.' %
                           AntaresGlobal.current_step)

            print(
                "\n[Best Config] CONFIG='%s'  ==>  Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec."
                %
                (best_config,
                 compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                 tuner.task.best.occur, num_trials, tuner.task.best.timecost))

            cleanup_on_exit(-1, None)
        else:
            raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
        exit(0)
    else:
        saved_code = codehub_db(os.environ['COMPUTE_V1'])
        if saved_code is not None:
            print("  >> Using Saved Code from Codehub:")
            print(
                "// ---------------------------------------------------------------------------"
            )
            print(saved_code)
            print(
                "// ---------------------------------------------------------------------------"
            )
            exit(0)
        best_config = ''

    assert isinstance(best_config, str)

    best_config = best_config if best_config else 'null'
    device_source, kernel_path = get_target_source(best_config)

    if code_only:
        return device_source

    if verbose:
        print()
        print(
            "// ---------------------------------------------------------------------------"
        )
        print(device_source)
        print(
            "// ---------------------------------------------------------------------------"
        )

    eval_client.init(backend_root=backend_root)
    dev_id = int(os.environ.get('DEV_ID', '0'))
    result = evaluate_perf(kernel_path, dev_id, device_source)
    exit(0 if result is not None and len(result) > 1 else 1)
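
The compute_gflops helper used throughout these snippets lives elsewhere in the Antares code base; a sketch consistent with how it is called here (this exact body is an assumption):

def compute_gflops(flop, timecost_sec):
    # FLOP count divided by measured time-per-run in seconds, scaled to GFLOPS.
    return flop / timecost_sec * 1e-9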
Example #8
def main_compute(code_only=False):
    def compile_callback(code):
        return bytearray()

    tvm.register_func('tvm_callback_cuda_compile',
                      compile_callback,
                      override=True)
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    default_tune_op = importlib.import_module('lang.generic')
    task = autotvm.task.create("template_op", args=(), target=tvm_target)

    def json_to_config(json_dict, index=-1, code_hash=None):
        if not isinstance(json_dict, list):
            json_list = []
            for key in json_dict:
                json_list.append([
                    key, 'ot' if type(json_dict[key]) is not list else
                    ('sp' if json_dict[key][0:1] == [-1] else 're'),
                    json_dict[key]
                ])
            json_dict = json_list
        config = ConfigEntity.from_json_dict({
            "index": index,
            "time": "",
            "code_hash": code_hash,
            "entity": json_dict
        })
        return config

    def config_to_json(config):
        if config is None:
            return {}
        if isinstance(config, str):
            return json.loads(config)
        jobj = config.to_json_dict()['entity']
        json_dict = dict()
        for i in range(len(jobj)):
            assert (jobj[i][1] in ['sp', 'ot', 're'])
            json_dict[jobj[i][0]] = jobj[i][2]
        return json_dict

    task.antares_helper = Mock()
    task.antares_helper.json_to_config = json_to_config
    task.antares_helper.config_to_json = config_to_json
    task.antares_helper.to_json_search_space = get_search_space

    AntaresGlobal.default_tune_op = default_tune_op
    AntaresGlobal.default_task = task

    if verbose:
        print('  >> Backend = %s, Python PID = %s, Task = %s;' %
              (backend, os.getpid(), default_tune_op.__name__))

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    config = os.environ.get('CONFIG', '').strip()
    if config != '':
        best_config = config
    elif os.environ.get('NNI_TRIAL_JOB_ID', '') == '@':
        search_space = get_search_space(task.config_space)
        json_space = json.dumps(search_space)
        print("\n>> Search Space: %s" % (json_space))
        exit(0)
    elif num_trials > 0:
        dev_num = platform_config.get_execution_parallism()
        if dev_num <= 0:
            raise Exception("No valid device found for backend: %s." % backend)
        batch_size = int(os.environ.get('BATCH', '16'))

        from concurrent.futures import ThreadPoolExecutor
        try:
            concurrent_ok = platform_config.allow_concurrent_compile_execution()
        except Exception:
            concurrent_ok = True
        # Use the full batch as workers when concurrent compile+execution is
        # allowed (or the query is unavailable); otherwise serialize with one.
        worker_size = batch_size if concurrent_ok else 1
        thread_pool = ThreadPoolExecutor(max_workers=worker_size)

        tuner_type = os.environ.get('TUNER', '')
        if not tuner_type:
            explicit_ops = AntaresGlobal.attrs.explicit_ops
            global_outs = get_global_arg_props()['_out']
            if ('|plan/' not in ('|' + '|'.join(AntaresGlobal.attrs.options))
                    and len(explicit_ops) == 1
                    and len(explicit_ops[-1].reduce_axis) > 0
                    and len(global_outs) == 1
                    and global_outs[0]['name'] == explicit_ops[-1].name
                    and backend in ['c-rocm', 'c-cuda', 'c-hlsl', 'c-ocl']):
                tuner_type = 'Ansor'
            else:
                tuner_type = 'XGBoost'
        print('  >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s' %
              (worker_size, batch_size, dev_num, tuner_type))

        auto_commit = os.environ.get('COMMIT', '')
        if auto_commit:
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None and auto_commit != 'force':
                raise Exception(
                    "Saved code already exists in codehub. Please try COMMIT=force to override it."
                )
            os.environ.pop('COMMIT')

        try:
            task.n_parallel = batch_size
            tuner = importlib.import_module('tuner.%s.main' % tuner_type)
            tuner = tuner.MainTuner(task)
        except Exception as exc:
            raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type) from exc

        if hasattr(tuner, 'cleanup'):
            AntaresGlobal.cleanup_funcs.append(tuner.cleanup)

        if tuner is not None:
            AntaresGlobal.current_step = 0

            def measure_batch(inputs):
                results, futures = [], []
                target_sources, config_strs = [], []
                for i in range(len(inputs)):
                    dir_sid = AntaresGlobal.current_step + i + 1
                    config_strs.append(
                        json.dumps(config_to_json(inputs[i].config)))
                    try:
                        target_source = get_target_source(
                            config_strs[i], dir_sid)
                    except Exception:
                        # traceback.print_exc()
                        target_source = None
                    target_sources.append(target_source)

                expected_timecost = tuner.task.best.timecost
                for i in range(len(inputs)):
                    dir_sid = AntaresGlobal.current_step + i + 1
                    futures.append(
                        thread_pool.submit(run_config_entity,
                                           target_sources[i], config_strs[i],
                                           dir_sid, expected_timecost,
                                           i % dev_num))

                best_slot = -1
                for i in range(len(inputs)):
                    dir_sid = AntaresGlobal.current_step + i + 1
                    t = futures[i].result()
                    if t < tuner.task.best.timecost:
                        best_slot = dir_sid
                        tuner.task.best.timecost = t
                        tuner.task.best.config = inputs[i].config
                        tuner.task.best.occur = best_slot
                    results.append(
                        autotvm.measure.MeasureResult(costs=(t, ),
                                                      error_no=0,
                                                      all_cost=i,
                                                      timestamp=time.time()))
                AntaresGlobal.current_step += len(results)

                print(
                    '\nSTEP[%d / %d] Current Best Config = %s, Perf = %g Gflops, MemRatio = %g %%, Occur Step = %d;'
                    %
                    (AntaresGlobal.current_step, num_trials,
                     json.dumps(config_to_json(tuner.task.best.config)),
                     compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                     compute_mem_ratio(
                         tuner.task.best.timecost), tuner.task.best.occur))

                if auto_commit and best_slot >= 0:
                    with open(local_get_dir_file('my_kernel.cc', best_slot),
                              'r') as fp:
                        device_source = fp.read()
                    with open(local_get_dir_file('result.txt', best_slot),
                              'r') as fp:
                        t = float(fp.read().split()[0])
                    kernel_path = codehub_db(
                        os.environ['COMPUTE_V1'],
                        source_code=device_source + code_suffix(
                            tpr=t, step_prod=best_slot, step_plan=num_trials))
                    print('  >> Update current code to codehub: %s' %
                          kernel_path)
                return results

            tuner.task.best = Mock()
            tuner.task.best.timecost = float('inf')
            tuner.task.best.config = None
            tuner.task.best.occur = -1

            tuner.measure_batch = measure_batch
            tuner.measure_batch.n_parallel = batch_size
            callbacks = []

            history_log_for_transfer_learning = os.environ.get('RECORD', '')

            if history_log_for_transfer_learning:
                callbacks.append(
                    autotvm.callback.log_to_file(
                        history_log_for_transfer_learning))
                # Enable Transfer Learning for Incremental Task
                if os.path.exists(history_log_for_transfer_learning):
                    print(
                        '  >>  Loading incremental history from log file: %s ..'
                        % history_log_for_transfer_learning)
                    tuner.load_history(
                        autotvm.record.load_from_file(
                            history_log_for_transfer_learning))

            tuner.tune(n_trial=num_trials,
                       callbacks=callbacks,
                       measure_option=None)
            if math.isinf(tuner.task.best.timecost):
                print(
                    f'[Error] No valid config found in the whole tuning. (Try tuner types other than `TUNER={tuner_type}`?)'
                )
                cleanup_on_exit(0, None)

            best_config = json.dumps(config_to_json(tuner.task.best.config))

            if auto_commit:
                device_source = codehub_db(os.environ['COMPUTE_V1'])
                codehub_db(os.environ['COMPUTE_V1'],
                           source_code=device_source +
                           '\n// Antares Tuning Completed in %d steps.' %
                           AntaresGlobal.current_step)

            print(
                "\n[Best Config] CONFIG='%s'  ==>  Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec."
                %
                (best_config,
                 compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                 tuner.task.best.occur, num_trials, tuner.task.best.timecost))

            cleanup_on_exit(-1, None)
        else:
            raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
        exit(0)
    else:
        saved_code = codehub_db(os.environ['COMPUTE_V1'])
        if saved_code is not None:
            print("  >> Using Saved Code from Codehub:")
            print("===========================")
            print(saved_code)
            print("===========================")
            exit(0)
        best_config = ''

    assert isinstance(best_config, str)

    best_config = best_config if best_config else task.config_space
    device_source, kernel_path, compile_args = get_target_source(best_config)

    if code_only:
        return device_source

    if verbose:
        print("====================================")
        print(device_source)
        print("====================================\n")

    do_compilation(compile_args)
    dev_id = int(os.environ.get('DEV_KEY', '0'))
    result = evaluate_perf(kernel_path, dev_id, device_source)
    exit(0 if result is not None and len(result) > 1 else 1)
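
Finally, every measure_batch in the examples above follows the same contract: take a batch of measurement inputs and return one autotvm.measure.MeasureResult per input. A minimal placeholder sketch (costs and all_cost are dummy values):

import time
from tvm import autotvm

def dummy_measure_batch(inputs):
    # One MeasureResult per input, mirroring the fields used in the snippets above.
    return [autotvm.measure.MeasureResult(costs=(1e-3,), error_no=0,
                                          all_cost=i, timestamp=time.time())
            for i, _ in enumerate(inputs)]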