Example No. 1
def search_op_config(code_only=False):
    tvm_target = 'cuda'
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    default_tune_op = importlib.import_module('templates.' +
                                              (os.environ['OP']))
    print('  >> Backend = %s, Python PID = %s, Task = %s;' %
          (backend, os.getpid(), default_tune_op.__name__))

    task = autotvm.task.create(default_tune_op.get_template_op,
                               args=(),
                               target=tvm_target)
    op_attributes = default_tune_op.op_attributes
    op_summary = '_'.join([k + str(op_attributes[k]) for k in op_attributes])

    def json_to_config(json_dict):
        config = ConfigEntity.from_json_dict({
            "i": -1,
            "t": "",
            "c": None,
            "e": json_dict
        })
        return config

    def config_to_json(config):
        jobj = config.to_json_dict()['e']
        json_dict = dict()
        for i in range(len(jobj)):
            assert (jobj[i][1] in ['sp', 'ot'])
            json_dict[jobj[i][0]] = jobj[i][2]
        return json_dict

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    if 'CONFIG' in os.environ:
        params_given = json.loads(os.environ['CONFIG'])
        print("====>> [Current Config Option]", os.environ['CONFIG'])

        trial_config = []
        for key in params_given:
            trial_config.append([
                key, "sp" if type(params_given[key]) is list else "ot",
                params_given[key]
            ])
        best_config = json_to_config(trial_config)

    elif 'NNI_TRIAL_JOB_ID' in os.environ:
        show_search_space(task.config_space,
                          os.environ['NNI_TRIAL_JOB_ID'] == '@')
        import nni
        params_given = nni.get_next_parameter()
        if params_given is None:
            raise RuntimeError('NNI returned no parameters for this trial.')
        local_dir_id = os.environ['NNI_TRIAL_JOB_ID']
        t = run_config_entity(params_given, local_dir_id)
        gflops = compute_gflops(task.flop, t)
        print('[TVM-engine] Final entity result is: %g' % gflops)
        try:
            nni.report_final_result(gflops)
        except:
            print('[TVM-engine] (not reporting final result to NNI.)')
        exit(0)

    elif num_trials > 0:
        n_parallel = 16 if 'BATCH' not in os.environ else int(
            os.environ['BATCH'])
        measure_option = autotvm.measure_option(
            builder=autotvm.LocalBuilder(n_parallel=n_parallel),
            runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4))
        # if DO_TUNING:
        tuner = autotvm.tuner.XGBTuner(task, num_threads=8)

        from concurrent.futures import ThreadPoolExecutor
        thread_pool = ThreadPoolExecutor(max_workers=n_parallel)

        dev_num = get_tuning_parallism()

        def parse_configs(task, configs):
            results = []
            futures = []
            expected_timecost = float('inf')
            for i in range(len(configs)):
                futures.append(
                    thread_pool.submit(run_config_entity,
                                       config_to_json(configs[i]), i,
                                       expected_timecost, i % dev_num))
            for i in range(len(configs)):
                t = futures[i].result()
                if t < tuner.task.best_config[0]:
                    tuner.task.best_config = (t, configs[i])
                results.append(
                    autotvm.measure.MeasureResult(costs=(t, ),
                                                  error_no=0,
                                                  all_cost=i,
                                                  timestamp=time.time()))
            return results

        # Track (best time, best config) on the task and route measurement
        # through the thread-pool based parse_configs defined above.
        tuner.task.best_config = (float('inf'), None)
        tuner.parse_configs = parse_configs

        tuner.tune(n_trial=num_trials,
                   measure_option=measure_option,
                   callbacks=[])
        assert (not math.isinf(tuner.task.best_config[0]))
        best_config = tuner.task.best_config[1]
        print('\n[Best Config]', json.dumps(config_to_json(best_config)))
    else:
        best_config = task.config_space

    with ApplyConfig(best_config):
        with tvm.target.create(tvm_target):
            s, arg_bufs = default_tune_op.get_template_op()
            lower_source = str(tvm.lower(s, arg_bufs, simple_mode=True))

            # Verify Source Code
            assert (len(('\n' + lower_source).split('\nproduce ')) == 2)
            lower_file = local_get_dir_file('my_kernel.lower')
            with open(lower_file, 'w') as fp:
                fp.write(lower_source)

            max_threads_per_block = tvm.ndarray.gpu(0).max_threads_per_block
            max_shared_memory_per_block = tvm.ndarray.gpu(
                0).max_shared_memory_per_block

            thread_extents = subprocess.getoutput(
                "cat '%s' | grep '^ *// attr.*iter_var.*thread_extent'" %
                (lower_file)).split('\n')
            reserved_axes = dict({
                'threadIdx.x': None,
                'threadIdx.y': None,
                'threadIdx.z': None,
                'blockIdx.x': None,
                'blockIdx.y': None,
                'blockIdx.z': None
            })
            for line in thread_extents:
                thread_name = line.split('[iter_var(')[-1].split(',')[0]
                if thread_name in reserved_axes:
                    thread_val = int(line.split('thread_extent = ')[-1])
                    if reserved_axes[thread_name] is not None:
                        if reserved_axes[thread_name] != thread_val:
                            assert (False)
                    else:
                        reserved_axes[thread_name] = thread_val
                else:
                    raise Exception("Invalid thread_axis name: %s" %
                                    thread_name)

            num_threads = 1
            for thread_name in ['threadIdx.x', 'threadIdx.y', 'threadIdx.z']:
                if reserved_axes[thread_name] is not None:
                    num_threads *= reserved_axes[thread_name]
            if num_threads > max_threads_per_block:
                raise Exception(
                    "Invalid kernel code: using num_threads %d > max_threads_per_block %d"
                    % (num_threads, max_threads_per_block))

            allocate_shared = subprocess.getoutput(
                "cat '%s' | grep 'allocate .*shared\[.*\]'" %
                (lower_file)).split('\n')
            shared_memory_in_bytes = 0
            for line in allocate_shared:
                if not line:
                    continue
                parts = line.split('[')
                assert (len(parts) == 2)
                parts = parts[1].split(' * ')
                assert (len(parts) == 2)
                assert (parts[1][-1] == ']')
                allocate_type = parts[0]
                allocate_val = int(parts[1][:-1])
                if allocate_type in ['float32']:
                    shared_memory_in_bytes += allocate_val * 4
                else:
                    raise Exception(
                        "Unrecognized shared memory data type: %s" %
                        allocate_type)
            if shared_memory_in_bytes > max_shared_memory_per_block:
                raise Exception(
                    "Invalid kernel code: using shared_memory_in_bytes %d > max_shared_memory_per_block %d"
                    % (shared_memory_in_bytes, max_shared_memory_per_block))

            func = tvm.build(s, arg_bufs, tvm_target, name='template_op')

    assert (len(func.imported_modules) == 1)
    device_source = translate_code(func.imported_modules[0].get_source())

    if code_only:
        return device_source

    if lower_source and device_source:
        tune_slot_id = 0 if 'CUDA_VISIBLE_DEVICES' not in os.environ else int(
            os.environ['CUDA_VISIBLE_DEVICES'])
        exec_fd, _ = system_lock([tune_slot_id])
        gpu_id = 0
        ctx = tvm.context(tvm_target, gpu_id)
        tensors, outs = [], []
        for arg in arg_bufs:
            shape = [int(x) for x in arg.shape]
            is_output = arg.op.__class__ != tvm.tensor.PlaceholderOp
            from tvm._ffi.ndarray import empty
            td = empty(shape, arg.dtype, ctx)
            if is_output:
                outs.append(td)
            tensors.append(td)

        def timeout_handler():
            print("Error: Timeout during Kernel warmup")
            os._exit(1)

        my_timer = Timer(10, timeout_handler, [])
        my_timer.start()
        # Warmup
        func(*tensors)
        tvm.ndarray.gpu(gpu_id).sync()
        # Estimate
        t_start = time.time()
        func(*tensors)
        tvm.ndarray.gpu(gpu_id).sync()
        t_diff = time.time() - t_start
        my_timer.cancel()
        del my_timer

        num_runs = max(3, min(100, math.floor(1.0 / t_diff)))
        timeout_seconds = math.ceil((num_runs + 5) * t_diff)
        my_timer = Timer(timeout_seconds, timeout_handler, [])
        my_timer.start()
        timer_f = func.time_evaluator(func.entry_name, ctx, number=num_runs)
        t = timer_f(*tensors).mean
        my_timer.cancel()
        exec_fd()

        gflops = compute_gflops(task.flop, t)
        print("[TVM-engine] Average time cost of %d runs = %g ms, %g gflops." %
              (num_runs, t * 1e3, gflops))

        with open(local_get_dir_file('result.txt'), 'w') as fp:
            fp.write(str(t))
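
search_op_config() takes no arguments besides code_only; everything else is read from environment variables (OP, STEP, CONFIG, NNI_TRIAL_JOB_ID, BATCH, CUDA_VISIBLE_DEVICES). Below is a minimal driver sketch, assuming the module above is already imported so that search_op_config and its helpers are in scope; the template name 'matmul' is hypothetical.

import os

os.environ['OP'] = 'matmul'    # illustrative name; must match a module under templates/
os.environ['STEP'] = '100'     # number of autotvm trials; 0 or unset skips tuning
os.environ['BATCH'] = '8'      # measurement parallelism, defaults to 16

# With code_only=True the function returns the generated CUDA source
# instead of benchmarking the built kernel on the local GPU.
cuda_source = search_op_config(code_only=True)
print(cuda_source)
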
Example No. 2
def main_compute(code_only=False):
    tvm_target = 'cuda'
    tvm.register_func('tvm_callback_cuda_compile',
                      compile_source,
                      override=True)
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    default_tune_op = importlib.import_module('templates.' + (
        os.environ['OP'] if 'OP' in os.environ else 'auto.generic'))
    print('  >> Backend = %s, Python PID = %s, Task = %s;' %
          (backend, os.getpid(), default_tune_op.__name__))

    task = autotvm.task.create("template_op", args=(), target=tvm_target)

    def json_to_config(json_dict, index=-1, code_hash=None):
        if not isinstance(json_dict, list):
            json_list = []
            for key in json_dict:
                json_list.append([
                    key, 'ot' if type(json_dict[key]) is not list else
                    ('sp' if json_dict[key][0:1] == [-1] else 're'),
                    json_dict[key]
                ])
            json_dict = json_list
        config = ConfigEntity.from_json_dict({
            "index": index,
            "time": "",
            "code_hash": code_hash,
            "entity": json_dict
        })
        # config = ConfigEntity.from_json_dict({"i": index, "t": "", "c": code_hash, "e": json_dict})
        return config

    def config_to_json(config):
        if config is None:
            return {}
        if isinstance(config, str):
            return json.loads(config)
        jobj = config.to_json_dict()['entity']
        # jobj = config.to_json_dict()['e']
        json_dict = dict()
        for i in range(len(jobj)):
            assert (jobj[i][1] in ['sp', 'ot', 're'])
            json_dict[jobj[i][0]] = jobj[i][2]
        return json_dict

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    config = os.environ.get('CONFIG', '').strip()
    if config != '':
        if config[0] != '[':  # a JSON object is a plain AutoTVM config; a JSON list is an auto-scheduler record
            params_given = json.loads(config)
            print("====>> [Current Config Option]", config)
            best_config = json_to_config(params_given)
        else:
            best_config = config

    elif 'NNI_TRIAL_JOB_ID' in os.environ:
        if os.environ['NNI_TRIAL_JOB_ID'] == '@':
            search_space = get_search_space(task.config_space)
            json_space = json.dumps(search_space)
            dump_to_file = './search_space.json'
            print("\n>> Writing Search Space to '%s', Search Space = %s;" %
                  (dump_to_file, json_space))
            with open("search_space.json", "w") as fp:
                fp.write(json_space)
            sys.exit(0)

        try:
            import nni
            params_given = nni.get_next_parameter()
            if params_given is None:
                raise
            local_dir_id = os.environ['NNI_TRIAL_JOB_ID']
        except:
            params_given = default_tune_op.get_choice_example()
            local_dir_id = '_'
        t = run_config_entity(params_given, local_dir_id)
        gflops = compute_gflops(task.flop, t)
        print('[Antares-engine] Final entity result is: %g' % gflops)
        try:
            nni.report_final_result(gflops)
        except:
            print('[Antares-engine] (not reporting final result to NNI.)')
        exit(0)

    elif num_trials > 0:
        dev_num = platform_config.get_execution_parallism()
        if dev_num <= 0:
            raise Exception("No valid device found for backend: %s." % backend)
        batch_size = int(os.environ.get('BATCH', '16'))

        from concurrent.futures import ThreadPoolExecutor
        try:
            if platform_config.allow_concurrent_compile_execution():
                raise Exception()
            worker_size = 1
        except:
            worker_size = batch_size
        thread_pool = ThreadPoolExecutor(max_workers=worker_size)

        task.antares_helper = Mock()
        task.antares_helper.json_to_config = json_to_config
        task.antares_helper.config_to_json = config_to_json
        task.antares_helper.to_json_search_space = get_search_space

        tuner_type = os.environ.get('TUNER', 'XGBoost')
        print('  >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s' %
              (worker_size, batch_size, dev_num, tuner_type))

        auto_commit = os.environ.get('COMMIT', '')
        if auto_commit:
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None and auto_commit != 'force':
                raise Exception(
                    "Saved code has existed in codehub. Please try COMMIT=force to overide it."
                )
            os.environ.pop('COMMIT')

        try:
            tuner = importlib.import_module('tuner.%s.main' % tuner_type)
            tuner = tuner.MainTuner(task)
        except:
            raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type)

        if tuner is not None:

            def measure_batch(inputs):
                results, futures = [], []
                best_slot = -1
                expected_timecost = tuner.task.best.timecost
                for i in range(len(inputs)):
                    futures.append(
                        thread_pool.submit(run_config_entity,
                                           config_to_json(inputs[i].config), i,
                                           expected_timecost, i % dev_num))
                for i in range(len(inputs)):
                    t = futures[i].result()
                    if t < tuner.task.best.timecost:
                        best_slot = i
                        tuner.task.best.timecost = t
                        tuner.task.best.config = inputs[i].config
                        tuner.task.best.occur = tuner.task.best.curr_step + i + 1
                    results.append(
                        autotvm.measure.MeasureResult(costs=(t, ),
                                                      error_no=0,
                                                      all_cost=i,
                                                      timestamp=time.time()))
                tuner.task.best.curr_step += len(results)

                print(
                    '\nSTEP[%d / %d] Current Best Config = %s, Perf = %g Gflops, Occur Step = %d;'
                    %
                    (tuner.task.best.curr_step, num_trials,
                     json.dumps(config_to_json(tuner.task.best.config)),
                     compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                     tuner.task.best.occur))

                if auto_commit and best_slot >= 0:
                    with open(local_get_dir_file('my_kernel.cc', best_slot),
                              'r') as fp:
                        device_source = fp.read()
                    with open(local_get_dir_file('result.txt', best_slot),
                              'r') as fp:
                        t = float(fp.read().split()[0])
                    kernel_path = codehub_db(
                        os.environ['COMPUTE_V1'],
                        source_code=device_source +
                        '\n// Saved Perf = %g sec / run; Step Produced = %d;' %
                        (t, tuner.task.best.curr_step))
                    print('  >> Update current code to codehub: %s' %
                          kernel_path)
                return results

            tuner.task.best = Mock()
            tuner.task.best.timecost = float('inf')
            tuner.task.best.config = None
            tuner.task.best.occur = -1
            tuner.task.best.curr_step = 0

            tuner.measure_batch = measure_batch
            callbacks = []

            history_log_for_transfer_learning = os.environ.get('RECORD', '')

            if history_log_for_transfer_learning:
                callbacks.append(
                    autotvm.callback.log_to_file(
                        history_log_for_transfer_learning))
                # Enable Transfer Learning for Incremental Task
                if os.path.exists(history_log_for_transfer_learning):
                    print(
                        '  >>  Loading incremental history from log file: %s ..'
                        % history_log_for_transfer_learning)
                    tuner.load_history(
                        autotvm.record.load_from_file(
                            history_log_for_transfer_learning))

            tuner.tune(n_trial=num_trials,
                       measure_option=autotvm.measure_option(
                           builder=autotvm.LocalBuilder(n_parallel=batch_size),
                           runner=autotvm.LocalRunner(repeat=3,
                                                      min_repeat_ms=100,
                                                      timeout=4)),
                       callbacks=callbacks)
            assert not math.isinf(
                tuner.task.best.timecost
            ), "Not valid config found in the whole tuning."
            best_config = tuner.task.best.config

            print(
                "\n[Best Config] CONFIG='%s'  ==>  Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec."
                %
                (json.dumps(config_to_json(best_config)),
                 compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                 tuner.task.best.occur, num_trials, tuner.task.best.timecost))

            if hasattr(tuner, 'cleanup'):
                tuner.cleanup()
        else:
            raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
        exit(0)
    else:
        if os.environ['OP'] == 'auto.generic':
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None:
                print("  >> Using Saved Code from Codehub:")
                print("===========================")
                print(saved_code)
                print("===========================")
                exit(0)
        best_config = task.config_space

    if isinstance(best_config, str):  # an auto-scheduler (Ansor) record was passed through CONFIG
        from tvm import auto_scheduler
        origin_cfg = json.loads(best_config)
        origin_cfg = {
            "i": [[
                '["main_compute.<locals>.auto_template"]',
                'cuda -keys=cuda,gpu -max_num_threads=%d -thread_warp_size=%d'
                % (device_properties().max_threads_per_block,
                   device_properties().warp_size)
            ], origin_cfg],
            "r": [[0], 0, 0, 0],
            "v":
            "v0.2",
        }
        origin_cfg_file = local_get_dir_file('my_kernel.cfg')
        with open(origin_cfg_file, 'w') as fp:
            fp.write(json.dumps(origin_cfg))
        origin_cfg = tvm.auto_scheduler.measure_record.load_records(
            origin_cfg_file)

        @auto_scheduler.register_workload
        def auto_template():
            _, arg_bufs = default_tune_op.get_template_op()
            return arg_bufs

        target = tvm.target.Target("cuda")
        auto_task = auto_scheduler.create_task(auto_template, (), target)
        for inp, res in origin_cfg:
            s, arg_bufs = auto_task.compute_dag.apply_steps_from_state(
                inp.state)
            break
    else:
        with ApplyConfig(best_config):
            with tvm.target.Target(tvm_target):
                s, arg_bufs = default_tune_op.get_template_op()

    if s is not None:
        lower_source = str(tvm.lower(s, arg_bufs, simple_mode=True))

        lower_file = local_get_dir_file('my_kernel.lower')
        with open(lower_file, 'w') as fp:
            fp.write(lower_source)

        # Verify Lower Code
        if len(('\n' + lower_source).split('\nprimfn(')) != 2:
            raise Exception('[Not Support Multi Unfuse-able kernels]\n\n' +
                            lower_source)

        max_threads_per_block = device_properties().max_threads_per_block
        max_shared_memory_per_block = device_properties(
        ).max_shared_memory_per_block
        assert max_threads_per_block > 0 and max_shared_memory_per_block >= 0, '[Error] Invalid device properties, maybe device is not detected correctly.'

        lower_lines = lower_source.split('\n')
        thread_extents, allocate_shared = [], []
        for ll in lower_lines:
            if ll.strip().startswith(
                    'attr [IterVar(') and ll.find(' "thread_extent" = ') >= 0:
                thread_name = ll.split('attr [IterVar(')[-1].split(':')[0]
                thread_val = int(
                    ll.split(' "thread_extent" = ')[-1].split(';')
                    [0].strip().split(' ')[0])
                thread_extents.append((thread_name, thread_val))
            elif ll.strip().startswith('allocate(') and ll.find(
                    '.shared, ') >= 0 and ll.endswith(");"):
                parts = ll[:-2].split(', ')[1:]
                allocate_type = parts[0]
                allocate_val = int(np.product(eval(parts[1])))
                allocate_shared.append((allocate_type, allocate_val))

        reserved_axes = dict()
        for thread_name, thread_val in thread_extents:
            if thread_name in reserved_axes:
                assert reserved_axes[
                    thread_name] == thread_val, "Invalid code: Multiple hints for thread extent conflict with each other: %d v.s. %d" % (
                        reserved_axes[thread_name], thread_val)
            else:
                reserved_axes[thread_name] = thread_val

        num_threads = 1
        for thread_name in ['threadIdx.x', 'threadIdx.y', 'threadIdx.z']:
            num_threads *= reserved_axes.get(thread_name, 1)
        assert num_threads <= max_threads_per_block, "Invalid kernel code: using num_threads(%d) > max_threads_per_block(%d)" % (
            num_threads, max_threads_per_block)

        shared_memory_in_bytes = 0
        for allocate_type, allocate_size in allocate_shared:
            if allocate_type.startswith('custom['):
                type_name = allocate_type[7:].split(']')[0]
                shared_memory_inc = int(
                    custom_dtypes[type_name][-1].split('@')[-1])
            else:
                shared_memory_inc = 8 * np.dtype(allocate_type).itemsize
            assert shared_memory_inc % 8 == 0, "The bits of shared_memory is not aligned with 8-bit bytes."
            shared_memory_in_bytes += shared_memory_inc // 8 * allocate_size

        if shared_memory_in_bytes > max_shared_memory_per_block:
            raise Exception(
                "Invalid kernel code: using shared_memory_in_bytes %d > max_shared_memory_per_block %d"
                % (shared_memory_in_bytes, max_shared_memory_per_block))

        # Compile Source Code
        def build_template():
            return tvm.build(s, arg_bufs, tvm_target, name='template_op')

        func = wait_for(build_template, 30)

    assert (len(func.imported_modules) == 1)
    device_source = translate_code(func.imported_modules[0].get_source())

    if code_only:
        return device_source

    print("====================================")
    print(device_source)
    print("====================================")

    print()
    try:
        eval_client = importlib.import_module('platforms.%s.evaluator.client' %
                                              backend)
    except ModuleNotFoundError:
        print('>> Evaluator for backend %s not found, skipping evaluation.' %
              backend)
        exit(0)
    except:
        traceback.print_exc()
        exit(1)

    def handle_result(result):
        print('\n[EvalAgent] Results =', json.dumps(result))
        if 'RESULT' in os.environ:
            if abs(float(os.environ['RESULT']) / result['K/0'] - 1.0) > 1e-6:
                result['TPR'] = None

        t = result.get('TPR', None)
        if t is None:
            print("\n[Antares] Incorrect compute kernel from evaluator.")
        else:
            gflops = compute_gflops(task.flop, t)
            print("\n[Antares] Average time cost / run = %g sec, %g gflops." %
                  (t, gflops))
            with open(local_get_dir_file('result.txt'), 'w') as fp:
                fp.write(str(t) + '\n')
                if 'K/0' in result:
                    fp.write(str(result['K/0']) + '\n')
        if os.environ['OP'] == 'auto.generic' and os.environ.get('COMMIT', ''):
            kernel_path = codehub_db(os.environ['COMPUTE_V1'],
                                     source_code=device_source +
                                     '\n// Saved Perf = %g sec / run' % t)
            print('  >> Update current code to codehub: %s' % kernel_path)

    tune_slot_id = int(os.environ.get(unified_slot_key, '0'))

    exec_fd, _ = system_lock([tune_slot_id])
    try:
        expected_timeout = None
        if 'EXPECTED_TIMEOUT' in os.environ and not math.isinf(
                float(os.environ['EXPECTED_TIMEOUT'])):
            expected_timeout = float(os.environ['EXPECTED_TIMEOUT'])
            expected_timeout = max(expected_timeout * 1.1,
                                   expected_timeout + 0.1)

        results = eval_client.eval(
            kernel_path=local_get_dir_file('my_kernel.cc'),
            expected_timeout=expected_timeout,
            func=func,
        )
    except:
        traceback.print_exc()
        exit(1)

    handle_result(results)
    exec_fd()
    exit(0)
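
The json_to_config / config_to_json pair above encodes every AutoTVM knob as a [name, kind, value] triple: a list value beginning with -1 marks a split ('sp') knob, any other list marks a reorder ('re') knob, and a scalar marks an 'ot' (other) knob. The standalone round-trip sketch below uses hypothetical knob names to illustrate the encoding.

params_given = {
    'tile_x': [-1, 4, 8, 2],   # 'sp': split one axis into 4 x 8 x 2 parts
    'reorder_0': [0, 2, 1],    # 're': axis permutation
    'unroll': 1,               # 'ot': plain scalar choice
}

# Forward direction, as in json_to_config():
entity = [[k, 'ot' if not isinstance(v, list) else ('sp' if v[:1] == [-1] else 're'), v]
          for k, v in params_given.items()]

# Reverse direction, as in config_to_json():
roundtrip = {name: value for name, kind, value in entity}
assert roundtrip == params_given
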
Example No. 3
def main_compute(code_only=False):
  tvm.register_func('tvm_callback_cuda_compile', compile_source, override=True)
  logging.getLogger('autotvm').setLevel(logging.DEBUG)
  logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

  default_tune_op = importlib.import_module('templates.' + (os.environ['OP'] if 'OP' in os.environ else 'auto.generic'))
  if verbose:
    print('  >> Backend = %s, Python PID = %s, Task = %s;' % (backend, os.getpid(), default_tune_op.__name__))

  task = autotvm.task.create("template_op", args=(), target=tvm_target)

  def json_to_config(json_dict, index=-1, code_hash=None):
    if not isinstance(json_dict, list):
      json_list = []
      for key in json_dict:
        json_list.append([key, 'ot' if type(json_dict[key]) is not list else ('sp' if json_dict[key][0:1] == [-1] else 're'), json_dict[key]])
      json_dict = json_list
    config = ConfigEntity.from_json_dict({"index": index, "time": "", "code_hash": code_hash, "entity": json_dict})
    # config = ConfigEntity.from_json_dict({"i": index, "t": "", "c": code_hash, "e": json_dict})
    return config

  def config_to_json(config):
    if config is None:
      return {}
    if isinstance(config, str):
      return json.loads(config)
    jobj = config.to_json_dict()['entity']
    # jobj = config.to_json_dict()['e']
    json_dict = dict()
    for i in range(len(jobj)):
      assert(jobj[i][1] in ['sp', 'ot', 're'])
      json_dict[jobj[i][0]] = jobj[i][2]
    return json_dict

  num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

  config = os.environ.get('CONFIG', '').strip()
  if config != '':
    best_config = config
  elif 'NNI_TRIAL_JOB_ID' in os.environ:
    if os.environ['NNI_TRIAL_JOB_ID'] == '@':
      search_space = get_search_space(task.config_space)
      json_space = json.dumps(search_space)
      dump_to_file = './search_space.json'
      print("\n>> Writing Search Space to '%s', Search Space = %s;" % (dump_to_file, json_space))
      with open(dump_to_file, "w") as fp:
        fp.write(json_space)
      sys.exit(0)

    try:
      import nni
      params_given = nni.get_next_parameter()
      if params_given is None:
        raise
      local_dir_id = os.environ['NNI_TRIAL_JOB_ID']
    except:
      params_given = default_tune_op.get_choice_example()
      local_dir_id = '_'
    t = run_config_entity(params_given, local_dir_id)
    gflops = compute_gflops(task.flop, t)
    print('[Antares-engine] Final entity result is: %g' % gflops)
    try:
      nni.report_final_result(gflops)
    except:
      print('[Antares-engine] (not reporting final result to NNI.)')
    exit(0)

  elif num_trials > 0:
    dev_num = platform_config.get_execution_parallism()
    if dev_num <= 0:
        raise Exception("No valid device found for backend: %s." % backend)
    batch_size = int(os.environ.get('BATCH', '16'))

    from concurrent.futures import ThreadPoolExecutor
    try:
      if platform_config.allow_concurrent_compile_execution():
        raise Exception()
      worker_size = 1
    except:
      worker_size = batch_size
    thread_pool = ThreadPoolExecutor(max_workers=worker_size)

    task.antares_helper = Mock()
    task.antares_helper.json_to_config = json_to_config
    task.antares_helper.config_to_json = config_to_json
    task.antares_helper.to_json_search_space = get_search_space

    tuner_type = os.environ.get('TUNER', '')
    if not tuner_type:
      comp = os.environ['COMPUTE_V1']
      if '=!' in comp and 'plan/' not in comp[comp.find(' ##') + 1:] and ';' not in comp and backend in ['c-rocm', 'c-cuda', 'c-hlsl', 'c-ocl']:
        tuner_type = 'AutoTVM2'
      else:
        tuner_type = 'XGBoost'
    print('  >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s' % (worker_size, batch_size, dev_num, tuner_type))

    auto_commit = os.environ.get('COMMIT', '')
    if auto_commit:
      saved_code = codehub_db(os.environ['COMPUTE_V1'])
      if saved_code is not None and auto_commit != 'force':
        raise Exception("Saved code has existed in codehub. Please try COMMIT=force to override it.")
      os.environ.pop('COMMIT')

    try:
      tuner = importlib.import_module('tuner.%s.main' % tuner_type)
      tuner = tuner.MainTuner(task)
    except:
      raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type)

    if tuner is not None:
      AntaresGlobal.current_step = 0

      def measure_batch(inputs):
        results, futures = [], []
        best_slot = -1
        expected_timecost = tuner.task.best.timecost
        for i in range(len(inputs)):
          futures.append(thread_pool.submit(run_config_entity, config_to_json(inputs[i].config), AntaresGlobal.current_step + i + 1, expected_timecost, i % dev_num))
        for i in range(len(inputs)):
          t = futures[i].result()
          if t < tuner.task.best.timecost:
            best_slot = AntaresGlobal.current_step + i + 1
            tuner.task.best.timecost = t
            tuner.task.best.config = inputs[i].config
            tuner.task.best.occur = best_slot
          results.append(autotvm.measure.MeasureResult(costs=(t,), error_no=0, all_cost=i, timestamp=time.time()))
        AntaresGlobal.current_step += len(results)

        print('\nSTEP[%d / %d] Current Best Config = %s, Perf = %g Gflops, MemRatio = %g %%, Occur Step = %d;' % (
          AntaresGlobal.current_step,
          num_trials,
          json.dumps(config_to_json(tuner.task.best.config)),
          compute_gflops(tuner.task.flop, tuner.task.best.timecost),
          compute_mem_ratio(tuner.task.best.timecost),
          tuner.task.best.occur))

        if auto_commit and best_slot >= 0:
          with open(local_get_dir_file('my_kernel.cc', best_slot), 'r') as fp:
            device_source = fp.read()
          with open(local_get_dir_file('result.txt', best_slot), 'r') as fp:
            t = float(fp.read().split()[0])
          kernel_path = codehub_db(os.environ['COMPUTE_V1'], source_code=device_source + '\n// Saved Perf = %g sec / run; Step Produced = %d;' % (t, best_slot))
          print('  >> Update current code to codehub: %s' % kernel_path)
        return results

      tuner.task.best = Mock()
      tuner.task.best.timecost = float('inf')
      tuner.task.best.config = None
      tuner.task.best.occur = -1

      tuner.measure_batch = measure_batch
      tuner.measure_batch.n_parallel = batch_size
      callbacks = []

      history_log_for_transfer_learning = os.environ.get('RECORD', '')

      if history_log_for_transfer_learning:
        callbacks.append(autotvm.callback.log_to_file(history_log_for_transfer_learning))
        # Enable Transfer Learning for Incremental Task
        if os.path.exists(history_log_for_transfer_learning):
          print('  >>  Loading incremental history from log file: %s ..' % history_log_for_transfer_learning)
          tuner.load_history(autotvm.record.load_from_file(history_log_for_transfer_learning))

      tuner.tune(n_trial=num_trials, measure_option=autotvm.measure_option(
          builder=autotvm.LocalBuilder(n_parallel=batch_size),
          runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
      ), callbacks=callbacks)
      assert not math.isinf(tuner.task.best.timecost), "No valid config found in the whole tuning."
      best_config = json.dumps(config_to_json(tuner.task.best.config))

      if auto_commit:
          device_source = codehub_db(os.environ['COMPUTE_V1'])
          codehub_db(os.environ['COMPUTE_V1'], source_code=device_source + '\n// Antares Tuning Completed in %d steps.' % AntaresGlobal.current_step)

      print("\n[Best Config] CONFIG='%s'  ==>  Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec." % (
        best_config,
        compute_gflops(tuner.task.flop, tuner.task.best.timecost),
        tuner.task.best.occur,
        num_trials,
        tuner.task.best.timecost))

      if hasattr(tuner, 'cleanup'):
        tuner.cleanup()
    else:
      raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
    exit(0)
  else:
    if os.environ['OP'] == 'auto.generic':
      saved_code = codehub_db(os.environ['COMPUTE_V1'])
      if saved_code is not None:
        print("  >> Using Saved Code from Codehub:")
        print("===========================")
        print(saved_code)
        print("===========================")
        exit(0)
    best_config = ''

  assert isinstance(best_config, str)
  if verbose:
    print("====>> [Current Config Option]", best_config)
  if best_config.startswith('['):  # a JSON list is an auto-scheduler (Ansor) record
    from tvm import auto_scheduler
    origin_cfg = json.loads(best_config)
    origin_cfg = {
      "i": [['["main_compute.<locals>.auto_template"]', 'cuda -keys=cuda,gpu -max_num_threads=%d -thread_warp_size=%d' % (
                device_properties().max_threads_per_block, device_properties().warp_size
             )], origin_cfg],
      "r": [[0], 0, 0, 0],
      "v": "v0.2",
    }
    origin_cfg_file = local_get_dir_file('my_kernel.cfg')
    with open(origin_cfg_file, 'w') as fp:
      fp.write(json.dumps(origin_cfg))
    origin_cfg = tvm.auto_scheduler.measure_record.load_records(origin_cfg_file)
 
    @auto_scheduler.register_workload
    def auto_template():
      _, arg_bufs = default_tune_op.get_template_op()
      return arg_bufs

    target = tvm.target.Target("cuda")
    auto_task = auto_scheduler.create_task(auto_template, (), target)
    for inp, res in origin_cfg:
      s, arg_bufs = auto_task.compute_dag.apply_steps_from_state(inp.state)
      break
  else:
    config = json_to_config(json.loads(best_config)) if best_config else task.config_space
    with ApplyConfig(config):
      with tvm.target.Target(tvm_target):
        s, arg_bufs = default_tune_op.get_template_op()

  device_source, kernel_path = get_target_source(s, arg_bufs)

  if code_only:
    return device_source

  if verbose:
    print("====================================")
    print(device_source)
    print("====================================\n")

  dev_id = int(os.environ.get('DEV_KEY', '0'))
  result = evaluate_perf(kernel_path, task.flop, dev_id)
  exit(0 if result is not None else 1)
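
main_compute() follows the same environment-variable protocol. After a tuning run prints its "[Best Config] CONFIG='...'" line, the same configuration can be replayed without searching by exporting it through CONFIG, as in the sketch below; the knob names and values are made up for illustration, and main_compute is assumed to be importable from the module above.

import json
import os

best = {'tile_x': [-1, 4, 8, 2], 'unroll': 1}   # illustrative config dict

os.environ['CONFIG'] = json.dumps(best)   # the CONFIG branch takes priority over STEP/NNI
device_source = main_compute(code_only=True)
print(device_source)
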
Example No. 4
def get_target_source(best_config, dir_sid=None):
    default_tune_op = AntaresGlobal.default_tune_op
    if not isinstance(best_config, str):
        # Default config
        with ApplyConfig(best_config):
            with tvm.target.Target(tvm_target):
                s, arg_bufs = default_tune_op.get_template_op()
    elif best_config.startswith('['):
        # Ansor config
        from tvm import auto_scheduler
        origin_cfg = json.loads(best_config)
        origin_cfg = {
            "i": [[
                '["main_compute.<locals>.auto_template"]',
                'cuda -keys=cuda,gpu -max_num_threads=%d -thread_warp_size=%d'
                % (device_properties().max_threads_per_block,
                   device_properties().warp_size)
            ], origin_cfg],
            "r": [[0], 0, 0, 0],
            "v":
            "v0.2",
        }
        origin_cfg_file = local_get_dir_file('my_kernel.cfg', dir_sid=dir_sid)
        with open(origin_cfg_file, 'w') as fp:
            fp.write(json.dumps(origin_cfg))
        origin_cfg = tvm.auto_scheduler.measure_record.load_records(
            origin_cfg_file)

        from tuner.Ansor.main import create_auto_task
        target = tvm.target.Target(tvm_target)
        auto_task = create_auto_task(target)

        for inp, res in origin_cfg:
            s, arg_bufs = auto_task.compute_dag.apply_steps_from_state(
                inp.state)
            break
    else:
        # Standard config
        json_to_config = AntaresGlobal.default_task.antares_helper.json_to_config
        config = json_to_config(json.loads(best_config))
        with ApplyConfig(config):
            with tvm.target.Target(tvm_target):
                s, arg_bufs = default_tune_op.get_template_op()

    if s is not None:
        lower_source = str(tvm.lower(s, arg_bufs, simple_mode=True))

        lower_file = local_get_dir_file('my_kernel.lower', dir_sid=dir_sid)
        with open(lower_file, 'w') as fp:
            fp.write(lower_source)

        # Verify Lower Code
        if len(('\n' + lower_source).split('\nprimfn(')) != 2:
            raise Exception('[Not Support Multi Unfuse-able kernels]\n\n' +
                            lower_source)

        max_threads_per_block = device_properties().max_threads_per_block
        max_shared_memory_per_block = device_properties(
        ).max_shared_memory_per_block
        assert max_threads_per_block > 0 and max_shared_memory_per_block >= 0, '[Error] Invalid device properties, maybe device is not detected correctly.'

        lower_lines = lower_source.split('\n')
        thread_extents, allocate_shared = [], []
        for ll in lower_lines:
            if ll.strip().startswith(
                    'attr [IterVar(') and ll.find(' "thread_extent" = ') >= 0:
                thread_name = ll.split('attr [IterVar(')[-1].split(':')[0]
                thread_val = int(
                    ll.split(' "thread_extent" = ')[-1].split(';')
                    [0].strip().split(' ')[0])
                thread_extents.append((thread_name, thread_val))
            elif ll.strip().startswith('allocate(') and ll.find(
                    '.shared, ') >= 0 and ll.endswith(");"):
                parts = ll[:-2].split(', ')[1:]
                allocate_type = parts[0]
                allocate_val = int(np.product(eval(parts[1])))
                allocate_shared.append((allocate_type, allocate_val))

        reserved_axes = dict()
        for thread_name, thread_val in thread_extents:
            if thread_name in reserved_axes:
                assert reserved_axes[
                    thread_name] == thread_val, "Invalid code: Multiple hints for thread extent conflict with each other: %d v.s. %d" % (
                        reserved_axes[thread_name], thread_val)
            else:
                reserved_axes[thread_name] = thread_val

        num_threads = 1
        for thread_name in ['threadIdx.x', 'threadIdx.y', 'threadIdx.z']:
            num_threads *= reserved_axes.get(thread_name, 1)
        assert num_threads <= max_threads_per_block, "Invalid kernel code: using num_threads(%d) > max_threads_per_block(%d)" % (
            num_threads, max_threads_per_block)

        shared_memory_in_bytes = 0
        for allocate_type, allocate_size in allocate_shared:
            if allocate_type.startswith('custom['):
                type_name = allocate_type[7:].split(']')[0]
            else:
                type_name = allocate_type
            shared_memory_in_bytes += get_type_size(type_name) * allocate_size

        if shared_memory_in_bytes > max_shared_memory_per_block:
            raise Exception(
                "Invalid kernel code: using shared_memory_in_bytes %d > max_shared_memory_per_block %d"
                % (shared_memory_in_bytes, max_shared_memory_per_block))

        # Compile Source Code
        def build_template():
            return tvm.build(s, arg_bufs, tvm_target, name='template_op')

        func = build_template()

    assert (len(func.imported_modules) == 1)
    device_source = translate_code(func.imported_modules[0].get_source(),
                                   best_config)
    kernel_path = local_get_dir_file('my_kernel.cc', dir_sid=dir_sid)
    with open(kernel_path, 'w') as fp:
        fp.write(device_source)

    kernel_out = local_get_dir_file('my_kernel.out', dir_sid=dir_sid)
    compile_args = platform_config.get_compile_kernel_args(
        kernel_path, kernel_out, device_properties())
    return device_source, kernel_path, compile_args
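
The resource checks in get_target_source() work by string-matching the lowered TIR text. The worked example below applies the same parsing to two hypothetical lowered lines; the exact text emitted by tvm.lower() varies across TVM versions, so both lines are illustrative only.

import numpy as np

attr_line = 'attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 256;'
alloc_line = 'allocate(A.shared, float32, [512]);'

# Thread-extent line: recover the axis name and its extent.
thread_name = attr_line.split('attr [IterVar(')[-1].split(':')[0]
thread_val = int(attr_line.split(' "thread_extent" = ')[-1].split(';')[0].strip().split(' ')[0])
assert (thread_name, thread_val) == ('blockIdx.x', 256)

# Shared-memory allocation line: recover the element type and element count.
parts = alloc_line[:-2].split(', ')[1:]                     # ['float32', '[512]']
allocate_type, allocate_val = parts[0], int(np.prod(eval(parts[1])))
assert (allocate_type, allocate_val) == ('float32', 512)
# With get_type_size('float32') == 4, this allocation contributes 2048 bytes
# toward max_shared_memory_per_block.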