Exemplo n.º 1
def is_local_optimum(parameters, template, sizes, context):
    """Check whether ``parameters`` is a local optimum for ``template``.

    The guess is benchmarked first. Every neighbour obtained by halving,
    keeping or doubling each swept parameter (all combinations) is then
    benchmarked, and the guess is rejected as soon as one neighbour runs in
    less than 98% of the reference time.

    Args:
        parameters: Sequence of template parameters to evaluate.
        template: Template class; tested with ``issubclass`` against the
            ``sc.templates`` families to pick which parameter indices are
            swept.
        sizes: Problem sizes forwarded to ``tools.tree_of``.
        context: Context forwarded to ``tools.tree_of``.

    Returns:
        ``False`` if the reference timing is infinite or a faster neighbour
        is found; ``True`` if the reference timing is below ``1e-5`` or no
        neighbour beats the guess by more than 2%.
    """
    tree, _ = tools.tree_of(template, sizes, context)
    genetic_infos = tools.genetic_infos_of(template)

    if issubclass(template, sc.templates.elementwise_1d):
        sweep_over = [0, 1, 2]
    elif issubclass(template, sc.templates.reduce_1d):
        sweep_over = [0, 1, 2]
    elif issubclass(template, sc.templates.elementwise_2d):
        sweep_over = [0, 1, 2, 3, 4]
    elif issubclass(template, sc.templates.reduce_2d):
        sweep_over = [0, 1, 2, 3, 4]
    elif issubclass(template, sc.templates.gemm):
        sweep_over = [0, 1, 2, 3, 4]
    # Evaluate the provided parameters guess
    reference = tools.benchmark(template(*parameters), tree)
    if reference == float('inf'):
        return False
    # Latency bound -- ignore
    if reference < 1e-5:
        return True
    # Determine if local minimum: swept parameters may be halved/doubled as
    # long as they stay within [1, 2**2**nbits]; the others are kept fixed.
    domain = [[v for v in [x/2, x, x*2] if 1 <= v <= 2**2**genetic_infos['nbits'][i]]
              if i in sweep_over else [x] for i, x in enumerate(parameters)]
    # product() yields tuples, so normalise the guess before comparing;
    # otherwise a list-typed ``parameters`` would never match itself.
    guess = tuple(parameters)
    for x in product(*domain):
        if x == guess:
            # The guess itself is the reference; do not re-benchmark it.
            continue
        timing = tools.benchmark(template(*x), tree)
        if timing / reference < .98:
            return False
    return True
Exemplo n.º 3
def isaacPool(ctx, stream, shapes, layouts):
    """Benchmark the generated ``pool_fprop`` kernel and return its throughput.

    Args:
        ctx: Driver context; ``ctx.device`` is used for code generation and
            for benchmarking.
        stream: Stream the kernel is enqueued on and synchronized against.
        shapes: ``(dtype, Npix, Nfilt)`` triple.
        layouts: Extra positional arguments forwarded to
            ``sc.templates.Pool``.

    Returns:
        ``M*P*Q*N*K*T*R*S / time * 1e-12`` where ``time`` is the value
        returned by ``benchmark``.
    """
    # Shapes
    dtype, Npix, Nfilt = shapes
    N, K, M, P, Q = 1, 1, 1, 1, Npix
    T, R, S = 1, 1, Nfilt
    dtype = sc.dtype(dtype)
    pad_d, pad_h, pad_w, stride_d, stride_h, stride_w = 0, 0, 0, 1, 1, 1
    D = M * stride_d + T - 1 - 2 * pad_d - stride_d + 1
    H = P * stride_h + R - 1 - 2 * pad_h - stride_h + 1
    W = Q * stride_w + S - 1 - 2 * pad_w - stride_w + 1
    # Kernel
    generator = sc.templates.Pool(sc.dtype(dtype), K, D, H, W, N, M, P, Q, T,
                                  R, S, pad_d, pad_h, pad_w, stride_d,
                                  stride_h, stride_w, *layouts)
    src = generator.dump(ctx.device, "pool_fprop")
    module = sc.driver.Module(ctx, src)
    kernel = sc.driver.Kernel(module, "pool_fprop")
    with lock:
        # Buffers
        O = sc.driver.Buffer(ctx, K * M * P * Q * N * sc.size_of(dtype))
        I = sc.driver.Buffer(ctx, K * D * H * W * N * sc.size_of(dtype))
        # Result: benchmark() must receive a callable it can invoke itself
        # (as the sibling GEMM/Conv helpers do); passing the already
        # evaluated tuple would launch the kernel once, outside of any
        # timing.
        time = benchmark(
            lambda: (generator.enqueue(kernel, stream, I, O),
                     stream.synchronize()),
            ctx.device, 1e-2)
    tflops = M * P * Q * N * K * T * R * S / time * 1e-12
    return tflops
Exemplo n.º 4
def isaacGemm(ctx, stream, shapes, layouts):
    """Time the generated ``gemm`` kernel and report ``2*M*N*K / time * 1e-12``.

    ``shapes`` unpacks to ``(dtype, AT, BT, M, N, K)``; ``layouts`` is
    forwarded as trailing positional arguments to ``sc.templates.GEMM``.
    """
    # Problem description
    dtype, AT, BT, M, N, K = shapes
    offa = offb = offc = 0
    dtype = sc.dtype(dtype)
    AT = sc.templates.op(AT)
    BT = sc.templates.op(BT)
    ldc = M
    if AT == sc.templates.OP_N:
        lda = M
    else:
        lda = K
    if BT == sc.templates.OP_N:
        ldb = K
    else:
        ldb = N
    # Code generation and kernel lookup
    kernel_name = "gemm"
    generator = sc.templates.GEMM(dtype, AT, BT, M, N, K,
                                  offa, lda, offb, ldb, offc, ldc,
                                  *layouts)
    source = generator.dump(ctx.device, kernel_name)
    kernel = sc.driver.Kernel(sc.driver.Module(ctx, source), kernel_name)
    with lock:
        # Buffers
        C = sc.driver.Buffer(ctx, M * N * sc.size_of(dtype))
        A = sc.driver.Buffer(ctx, M * K * sc.size_of(dtype))
        B = sc.driver.Buffer(ctx, K * N * sc.size_of(dtype))
        alpha = sc.Scalar(1., dtype)
        beta = sc.Scalar(0., dtype)

        def launch():
            # One enqueue followed by a stream synchronization.
            return (generator.enqueue(kernel, stream, alpha, A, B, beta, C),
                    stream.synchronize())

        # Result
        elapsed = benchmark(launch, ctx.device, 1e-2)
    return 2 * M * N * K / elapsed * 1e-12
Exemplo n.º 5
def isaacConv(ctx, stream, shapes, layouts):
    """Time the generated ``conv_fprop`` kernel and return the derived rate.

    ``shapes`` unpacks to ``(dtype, Npix, K, C, Nfilt)``; a ``dtype`` of 8
    selects ``sc.float64``, anything else ``sc.float32``. ``layouts`` is
    forwarded as trailing positional arguments to ``sc.templates.Conv``.
    Returns ``2*M*P*Q*N*K*C*T*R*S / time * 1e-12``.
    """
    def input_extent(out, stride, window, pad):
        # Input length along one axis from output length, stride,
        # filter window and padding.
        return out*stride + window - 1 - 2*pad - stride + 1

    # Shapes
    dtype, Npix, K, C, Nfilt = shapes
    N, M, P = 1, 1, 1
    Q = Npix
    T, R = 1, 1
    S = Nfilt
    if dtype == 8:
        dtype = sc.float64
    else:
        dtype = sc.float32
    pad_d = pad_h = pad_w = 0
    stride_d = stride_h = stride_w = 1
    upsample_d = upsample_h = upsample_w = 1
    D = input_extent(M, stride_d, T, pad_d)
    H = input_extent(P, stride_h, R, pad_h)
    W = input_extent(Q, stride_w, S, pad_w)
    # Kernel
    kernel_name = "conv_fprop"
    generator = sc.templates.Conv(
        dtype, dtype, C, D, H, W, N, K, M, P, Q, T, R, S,
        pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
        upsample_d, upsample_h, upsample_w,
        sc.templates.LINEAR, 1, sc.templates.NO_RESIDUAL, 0,
        1, 1, 1, 1, 1, 1, *layouts)

    source = generator.dump(ctx.device, kernel_name)
    kernel = sc.driver.Kernel(sc.driver.Module(ctx, source), kernel_name)
    with lock:
        # Buffers
        O = sc.driver.Buffer(ctx, K*M*P*Q*N*sc.size_of(dtype))
        I = sc.driver.Buffer(ctx, C*D*H*W*N*sc.size_of(dtype))
        F = sc.driver.Buffer(ctx, C*T*R*S*K*sc.size_of(dtype))
        # Constructed as in the original even though the enqueue call below
        # does not take them.
        alpha, beta = sc.Scalar(1., dtype), sc.Scalar(0., dtype)

        def launch():
            # One enqueue followed by a stream synchronization.
            return (generator.enqueue(kernel, stream, I, F, O, None,
                                      1., 1., 1., [1.], 1., None),
                    stream.synchronize())

        # Result
        elapsed = benchmark(launch, ctx.device, 1e-2)
    return 2*M*P*Q*N*K*C*T*R*S/elapsed*1e-12
Exemplo n.º 7
def test_munkres_benchmark():
    """Benchmark ``munkres_call`` over a list of values and store the timings.

    The value list depends on ``is_travis()``; the durations returned by
    ``benchmark`` are saved through ``mongodb_save`` under the key
    ``'test_munkres_benchmark'``.
    """
    # NOTE(review): the original text assigned ``vals`` twice inside the
    # ``if`` and left it undefined otherwise; the ``else`` below restores a
    # well-formed choice -- confirm which list belongs to which branch.
    if is_travis():
        vals = [10, 30, 70, 100, 300]
    else:
        vals = [10, 30]
    ts, res = benchmark(munkres_call, [vals], samples=5)
    mongodb_save('test_munkres_benchmark', {
        'values': vals,
        'durations': ts.tolist()
    })
Exemplo n.º 8
 def evaluate(genome):
     """Return the one-element tuple holding the timing of ``genome``.

     Timings are memoised in ``cache`` keyed by ``tuple(genome)``. An
     infinite timing is returned immediately, without being cached and
     without refreshing the progress bar.
     """
     key = tuple(genome)
     if key not in cache:
         elapsed = tools.benchmark(template(*decode(genome)), tree)
         if elapsed == float('inf'):
             return (elapsed,)
         cache[key] = elapsed
     # Report the fastest configuration seen so far.
     fastest_key = min(cache, key=cache.get)
     fastest_time = min(cache.values())
     self.progress_bar.update(max(len(cache), it), self.niter,
                              decode(fastest_key),
                              metric(sizes, fastest_time))
     return (cache[key],)
Exemplo n.º 9
def planner_comparison(seed):
    """Benchmark several planner configurations on one generated problem.

    Problem data is obtained from ``get_data_random`` (seeded with
    ``seed + 1``), paths are pre-computed into a cache file named after an
    MD5 digest of the map string, and a set of configuration dicts derived
    from ``generate_config()`` is handed to ``benchmark`` together with
    ``one_planner``.

    NOTE(review): several statements in this function are visibly truncated
    (unterminated calls and lists, a missing ``else`` branch), so it is not
    valid Python as it stands. The comments describe only what the visible
    code shows.

    Returns the ``(ts, ress)`` pair produced by ``benchmark``.
    """
    # NOTE(review): the argument list of this call is cut off -- the
    # remaining arguments and the closing parenthesis must be restored.
    params = get_data_random(seed + 1,
    agent_pos, grid, idle_goals, jobs = params
    mapstr = get_map_str(grid)
    # First 8 hex digits of the MD5 digest identify the map.
    maphash = str(hashlib.md5(mapstr.encode('utf-8')).hexdigest())[:8]

    fname = "planner/eval/cache/" + str(
        maphash) + '.pkl'  # unique filename based on map
    pre_calc_paths(jobs, idle_goals, grid, fname)

    # Base configuration shared by all variants below.
    config_opt = generate_config()
    config_opt['params'] = params
    config_opt['filename_pathsave'] = fname

    # Each variant is a shallow copy of the base with one switch changed.
    config_milp = config_opt.copy()
    config_milp['milp'] = 1

    config_cobra = config_opt.copy()
    config_cobra['cobra'] = 1

    config_greedy = config_opt.copy()
    config_greedy['greedy'] = 1

    config_nn = config_opt.copy()
    config_nn['number_nearest'] = 2

    # Derived from config_nn, so it also carries 'number_nearest'.
    config_col = config_nn.copy()
    config_col['all_collisions'] = True

    if is_cch():
        print("Configs: [config_opt, config_nn, config_milp]")
        configs = [config_opt, config_nn,
                   config_milp]  #, config_cobra, config_greedy, config_col]
        sizes = [1, 2]
    # NOTE(review): an ``else:`` / ``print(`` pair and the closing bracket of
    # the second ``configs`` list appear to be missing below.
            "Configs: [config_opt, config_nn, config_milp, config_cobra, config_greedy]"
        configs = [
            config_opt, config_nn, config_milp, config_cobra, config_greedy
        sizes = [2, 3, 4]
    # NOTE(review): this call is cut off after its second argument.
    ts, ress = benchmark(one_planner, [configs, sizes],

    return ts, ress
Exemplo n.º 10
def cudaGemm(ctx, stream, dtype, AT, BT, M, N, K):
    """Time ``sc.driver.cublasGemm`` and return ``2*M*N*K / time * 1e-12``.

    ``AT`` / ``BT`` equal to 1 select the ``'N'`` operation flag and the
    matching leading dimension; any other value selects ``'T'``.
    """
    if AT == 1:
        trans_a, lda = 'N', M
    else:
        trans_a, lda = 'T', K
    if BT == 1:
        trans_b, ldb = 'N', K
    else:
        trans_b, ldb = 'T', N
    ldc = M
    dtype = sc.dtype(dtype)
    # Device buffers for the output and the two operands
    C = sc.driver.Buffer(ctx, M*N*sc.size_of(dtype))
    A = sc.driver.Buffer(ctx, M*K*sc.size_of(dtype))
    B = sc.driver.Buffer(ctx, K*N*sc.size_of(dtype))
    alpha = sc.Scalar(1., dtype)
    beta = sc.Scalar(0., dtype)

    def launch():
        # One GEMM call followed by a stream synchronization.
        return (sc.driver.cublasGemm(dtype, ctx, stream, trans_a, trans_b,
                                     M, N, K, alpha, A, lda, B, ldb,
                                     beta, C, ldc),
                stream.synchronize())

    elapsed = benchmark(launch, ctx.device, 1e-2)
    return 2*M*N*K/elapsed*1e-12
Exemplo n.º 11
def cudaConv(ctx, stream, dtype, N, K, P, Q, C, R, S):
    """Time ``sc.driver.cudnnConv`` and return ``2*P*Q*K*N*C*R*S / time * 1e-12``.

    Padding is fixed to 0 and strides to 1; the input height and width are
    derived from the output extents ``P`` / ``Q`` and filter extents
    ``R`` / ``S``.
    """
    pad_h = pad_w = 0
    stride_h = stride_w = 1
    H = P*stride_h + R - 1 - 2*pad_h
    W = Q*stride_w + S - 1 - 2*pad_w
    dtype = sc.dtype(dtype)
    # Device buffers: output, input, filters
    O = sc.driver.Buffer(ctx, K*P*Q*N*sc.size_of(dtype))
    I = sc.driver.Buffer(ctx, C*H*W*N*sc.size_of(dtype))
    F = sc.driver.Buffer(ctx, C*R*S*K*sc.size_of(dtype))
    alpha = sc.Scalar(1., dtype)
    beta = sc.Scalar(0., dtype)

    def launch():
        # One convolution call followed by a stream synchronization.
        return (sc.driver.cudnnConv(dtype, ctx, stream, H, W, N, K, P, Q,
                                    C, R, S, pad_h, pad_w, stride_h,
                                    stride_w, alpha, I, F, beta, O),
                stream.synchronize())

    elapsed = benchmark(launch, ctx.device, 1e-2)
    return 2*P*Q*K*N*C*R*S/elapsed*1e-12
Exemplo n.º 12
def is_local_optimum(parameters, template, sizes, context):
    """Check whether ``parameters`` is a local optimum for ``template``.

    The guess is benchmarked first. Every neighbour obtained by halving,
    keeping or doubling each swept parameter (all combinations) is then
    benchmarked, and the guess is rejected as soon as one neighbour runs in
    less than 98% of the reference time. Neighbours that raise
    ``profile_execution_failure`` are skipped.

    Args:
        parameters: Sequence of template parameters to evaluate.
        template: Template class; tested with ``issubclass`` against the
            ``sc.templates`` families to pick which parameter indices are
            swept.
        sizes: Problem sizes forwarded to ``tools.tree_of``.
        context: Context forwarded to ``tools.tree_of``.

    Returns:
        ``False`` if the guess itself cannot be benchmarked or a faster
        neighbour is found; ``True`` if the reference timing is below
        ``1e-5`` or no neighbour beats the guess by more than 2%.
    """
    tree, _ = tools.tree_of(template, sizes, context)
    genetic_infos = tools.genetic_infos_of(template)
    if issubclass(template, sc.templates.elementwise_1d):
        sweep_over = [0,1,2]
    elif issubclass(template, sc.templates.reduce_1d):
        sweep_over = [0,1,2]
    elif issubclass(template, sc.templates.elementwise_2d):
        sweep_over = [0,1,2,3,4]
    elif issubclass(template, sc.templates.reduce_2d):
        sweep_over = [0,1,2,3,4]
    elif issubclass(template, sc.templates.matrix_product):
        sweep_over = [1,2,3,4,5,6,7]
    # Evaluate the provided parameters guess
    try:
        reference = tools.benchmark(template, parameters, tree)
    except profile_execution_failure:
        return False
    # Latency bound -- ignore
    if reference < 1e-5:
        return True
    # Swept parameters may be halved/doubled as long as they stay within
    # [1, 2**2**nbits]; the others are kept fixed.
    domain = [[v for v in [x/2, x, x*2] if 1 <= v <= 2**2**genetic_infos['nbits'][i]]
              if i in sweep_over else [x] for i, x in enumerate(parameters)]
    # product() yields tuples, so normalise the guess before comparing.
    guess = tuple(parameters)
    for x in product(*domain):
        if x == guess:
            # The guess itself is the reference; do not re-benchmark it.
            continue
        try:
            timing = tools.benchmark(template, x, tree)
        except profile_execution_failure:
            # Invalid neighbour: it cannot beat the guess, try the next one.
            continue
        if timing/reference < .98:
            return False
    return True
Exemplo n.º 13
def exhaustive(template, sizes, context):
    """Benchmark every parameter combination of ``template``; return the fastest.

    The search space is the Cartesian product of ``range(2**n)`` for each
    entry ``n`` of the template's ``'nbits'`` info. An index ``x`` is mapped
    to ``fetch_types[x]`` for positions listed in ``'categorical'`` and to
    ``2**x`` otherwise. Combinations that raise
    ``profile_execution_failure`` are skipped. Progress is written to
    ``stdout`` once a best candidate exists.

    Args:
        template: Template forwarded to ``tools.benchmark``.
        sizes: Problem sizes forwarded to ``tools.tree_of`` and to the metric.
        context: Context forwarded to ``tools.tree_of``.

    Returns:
        The parameter tuple with the smallest benchmark time.

    Raises:
        TypeError: if no combination could be benchmarked (``best`` is still
            ``None`` when indexed).
    """
    tree, _ = tools.tree_of(template, sizes, context)
    metric = tools.metric_of(template)
    # Look the genetic infos up once instead of once per field.
    genetic_infos = tools.genetic_infos_of(template)
    nbits = genetic_infos['nbits']
    categorical = genetic_infos['categorical']
    # Materialised so that len() is available for the progress percentage.
    ranges = list(product(*[range(2**x) for x in nbits]))
    best = None
    for idx, r in enumerate(ranges):
        parameters = tuple([fetch_types[x] if i in categorical else 2**x for i,x in enumerate(r)])
        try:
            time = tools.benchmark(template, parameters, tree)
            if not best or time < best[1]:
                best = parameters, time
        except profile_execution_failure:
            # Invalid configuration: ignore it but still report progress.
            pass
        if best:
            stdout.write('%.2f %% | Best %.2f [ for %s ]\r'%(float(idx*100)/len(ranges),metric(sizes, best[1]), best[0]))
    return best[0]
Exemplo n.º 14
def isaacPool(ctx, stream, shapes, layouts):
    """Time the generated ``pool_fprop`` kernel and return the derived rate.

    ``shapes`` unpacks to ``(dtype, Npix, Nfilt)``; a ``dtype`` of 8 selects
    ``sc.float64``, anything else ``sc.float32``. ``layouts`` is forwarded
    as trailing positional arguments to ``sc.templates.Pool``. Returns
    ``M*P*Q*N*K*T*R*S / time * 1e-12``.
    """
    def input_extent(out, stride, window, pad):
        # Input length along one axis from output length, stride,
        # pooling window and padding.
        return out*stride + window - 1 - 2*pad - stride + 1

    # Shapes
    dtype, Npix, Nfilt = shapes
    N, K, M, P = 1, 1, 1, 1
    Q = Npix
    T, R = 1, 1
    S = Nfilt
    if dtype == 8:
        dtype = sc.float64
    else:
        dtype = sc.float32
    pad_d = pad_h = pad_w = 0
    stride_d = stride_h = stride_w = 1
    D = input_extent(M, stride_d, T, pad_d)
    H = input_extent(P, stride_h, R, pad_h)
    W = input_extent(Q, stride_w, S, pad_w)
    # Kernel
    kernel_name = "pool_fprop"
    generator = sc.templates.Pool(
        dtype, dtype, sc.templates.MAX_POOL, K, D, H, W, N, M, P, Q,
        T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
        *layouts)
    source = generator.dump(ctx.device, kernel_name)
    kernel = sc.driver.Kernel(sc.driver.Module(ctx, source), kernel_name)
    with lock:
        # Buffers
        O = sc.driver.Buffer(ctx, K*M*P*Q*N*sc.size_of(dtype))
        I = sc.driver.Buffer(ctx, K*D*H*W*N*sc.size_of(dtype))

        def launch():
            # One enqueue followed by a stream synchronization.
            return (generator.enqueue(kernel, stream, I, O, 1., 1.),
                    stream.synchronize())

        # Result
        elapsed = benchmark(launch, ctx.device, 1e-2)
    return M*P*Q*N*K*T*R*S/elapsed*1e-12
Exemplo n.º 15
def isaacGemm(ctx, stream, shapes, layouts):
    """Time the generated ``gemm`` kernel and report ``2*M*N*K / time * 1e-12``.

    ``shapes`` unpacks to ``(dtype, AT, BT, M, N, K)``; a ``dtype`` of 8
    selects ``sc.float64``, anything else ``sc.float32``. ``layouts`` is
    forwarded as trailing positional arguments to ``sc.templates.GEMM``.
    """
    # Shapes
    dtype, AT, BT, M, N, K = shapes
    offa = offb = offc = 0
    if dtype == 8:
        dtype = sc.float64
    else:
        dtype = sc.float32
    AT = sc.templates.op(AT)
    BT = sc.templates.op(BT)
    ldc = M
    if AT == sc.templates.OP_N:
        lda = M
    else:
        lda = K
    if BT == sc.templates.OP_N:
        ldb = K
    else:
        ldb = N
    # Kernel
    kernel_name = "gemm"
    generator = sc.templates.GEMM(dtype, dtype, AT, BT, M, N, K,
                                  offa, lda, offb, ldb, offc, ldc,
                                  *layouts)
    source = generator.dump(ctx.device, kernel_name)
    kernel = sc.driver.Kernel(sc.driver.Module(ctx, source), kernel_name)
    with lock:
        # Buffers
        C = sc.driver.Buffer(ctx, M*N*sc.size_of(dtype))
        A = sc.driver.Buffer(ctx, M*K*sc.size_of(dtype))
        B = sc.driver.Buffer(ctx, K*N*sc.size_of(dtype))
        alpha = sc.Scalar(1., dtype)
        beta = sc.Scalar(0., dtype)

        def launch():
            # One enqueue followed by a stream synchronization.
            return (generator.enqueue(kernel, stream, alpha, A, B, beta, C,
                                      1., 1., 1., None),
                    stream.synchronize())

        # Result
        elapsed = benchmark(launch, ctx.device, 1e-2)
    return 2*M*N*K/elapsed*1e-12
Exemplo n.º 16
    def run(self, level='intermediate'):
        """Tune ``self.operation`` on ``self.device`` and export the result to JSON.

        Visible steps: pick training sizes from the operation family, restore
        previously saved ``X`` / ``Y`` / ``profiles`` CSV data, benchmark the
        known profiles for each size and re-tune with
        ``optimize.GeneticOptimizer`` when the best one is not a local
        optimum, evaluate external profiles, prune profiles that are never
        the best, and write profiles (plus a predictor when there is more
        than one profile) to a JSON file.

        ``level`` must be 'simple', 'intermediate' or 'full'; it is only
        validated here and not otherwise used in the visible code.

        NOTE(review): many statements in this method are visibly truncated
        (unterminated lists/calls, ``if``/``with`` headers without a body,
        missing ``else``), so it is not valid Python as it stands. The
        comments describe only what the visible code shows.
        """

        assert level in ['simple', 'intermediate', 'full']
        # Module-level state on ``tools`` is overwritten with this tuner's dtype.
        tools.dtype = self.dtype
        device = self.device
        operation = self.operation
        context = sc.driver.context(device)

        if self.logger:
            self.logger.info(operation.__name__.replace('_', '-').upper())

        #BLAS1 training sizes
        if operation in [sc.templates.elementwise_1d, sc.templates.reduce_1d]:
            sizes = [(10**x, ) for x in range(3, 8)]

        #BLAS2 training sizes
        # NOTE(review): the operation list below is not closed.
        if operation in [
                sc.templates.elementwise_2d, sc.templates.reduce_2d_rows,
            sizes = []
            for N in [896, 1280, 1760, 2560]:
                sizes += [(N, N)]
            for M in [16, 32, 64, 128, 512, 1024]:
                for N in [1024, 4096, 16384, 65536]:
                    sizes += [(M, N)]
            for N in [16, 32, 64, 128, 512, 1024]:
                for M in [1024, 4096, 16384, 65536]:
                    sizes += [(M, N)]

        #BLAS3 training sizes
        # NOTE(review): the operation list below is not closed.
        if operation in [
                sc.templates.gemm_nn, sc.templates.gemm_nt,
                sc.templates.gemm_tn, sc.templates.gemm_tt
            sizes = []
            for N in [896, 1760, 2048, 2560]:
                sizes += [(N, N, N)]
            for N in [896, 1760, 2048, 2560]:
                for K in [16, 32, 64, 128]:
                    sizes += [(N, N, K)]
            for N in [16, 32, 64, 128, 256]:
                for K in [16000, 32000, 64000, 128000]:
                    sizes += [(N, N, K)]
            for M in [1760, 2048, 2560, 4096]:
                for N in [16, 32, 64, 128, 7000]:
                    sizes += [(M, N, M)]
            for K in [1760, 2048, 2560, 4096]:
                for M, N in [(5124, 9124), (35, 8457)]:
                    sizes += [(M, N, K)]
            for M, K in [(7680, 2560), (3072, 1024)]:
                for N in [16, 32, 64, 128]:
                    sizes += [(M, N, K)]

        #Training data
        # X: problem sizes, Y: per-size performance of each profile.
        performance = tools.metric_of(operation)
        profiles, X, Y = [], [], []

        #Restore progress
        # NOTE(review): the join call is cut off, and the files are read
        # under "path does not exist" -- statements appear to be missing.
        savepath = os.path.join('save', tools.dtype.__name__,
        if not os.path.exists(savepath):
            with open(os.path.join(savepath, 'X.csv')) as f:
                X = [
                    tuple(map(int, row))
                    for row in csv.reader(f, delimiter=',')
            with open(os.path.join(savepath, 'profiles.csv')) as f:
                profiles = [
                    map(int, row) for v in row
                    for row in csv.reader(f, delimiter=',')
            with open(os.path.join(savepath, 'Y.csv')) as f:
                Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
            #Recompute Y
            #Y = []
            #for x in X:
            #    tree, _ = tools.tree_of(operation, x, context)
            #    Y.append([performance(x, tools.benchmark(operation(*best), tree)) for best in profiles])

    #Save data

        # NOTE(review): the body of the ``with`` in save() is missing.
        def save():
            for (fname, data) in zip(['X.csv', 'Y.csv', 'profiles.csv'],
                                     [X, Y, profiles]):
                with open(os.path.join(savepath, fname), 'wb') as f:

        for idx, x in enumerate(sizes):
            #Create new line on log
            # NOTE(review): the body of this ``if`` is missing.
            if idx > 0:
            self.progress_bar.set_prefix(', '.join(map(str, x)))
            #Skip if already saved
            # NOTE(review): no ``continue`` is visible after the update below.
            if x in X:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
            #Best existing profile for x
            tree, operands = tools.tree_of(operation, x, context)
            y = [
                performance(x, tools.benchmark(operation(*p), tree))
                for p in profiles
            best = profiles[np.argmax(y)] if y else None
            #Retune if necessary
            # Tune when there is no profile yet or the best one is not a
            # local optimum for this size.
            tune = not (best and optimize.is_local_optimum(
                best, operation, x, context))
            if tune:
                # NOTE(review): the optimizer's constructor arguments are cut off.
                optimizer = optimize.GeneticOptimizer(
                best = optimizer.run(operation, x, context, prior=best)[0]
                if best not in profiles:
                    # Extend every stored row of Y with the new profile's performance.
                    for xx, yy in zip(X, Y):
                        tree, _ = tools.tree_of(operation, xx, context)
                        time = tools.benchmark(operation(*best), tree)
                        yy.append(performance(xx, time))
            #Update dataset
            tree, operands = tools.tree_of(operation, x, context)
            y = [
                performance(x, tools.benchmark(operation(*prf), tree))
                for prf in profiles
            #Save data
            #print performance info in case no tuning was done
            if not tune:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
        #Adding external profiles
        # NOTE(review): ``perf`` is computed but not used in the visible code.
        for prof in tools.external_profiles(operation):
            for x, y in zip(X, Y):
                tree, operands = tools.tree_of(operation, x, context)
                perf = performance(x, tools.benchmark(prof, tree, operation))
        #Pruning of useless profiles
        X = np.array(X)
        Y = np.array(Y)
        if len(Y[0]) > 1:
            # Indices of profiles that are never the argmax of any row of Y.
            idx = np.where(
                np.bincount(np.argmax(Y, 1), minlength=len(profiles)) == 0)[0]
            profiles = [p for ip, p in enumerate(profiles) if ip not in idx]
            Y = np.delete(Y, idx, axis=1)
        #Exporting to JSON
        json_path = tools.sanitize(
            device.name) + '.json' if not self.json_path else self.json_path
        # NOTE(review): an ``else:`` appears to be missing before the reset
        # below; as written the loaded data is always discarded.
        if os.path.isfile(json_path):
            json_data = json.load(open(json_path, 'r'))
            json_data = {}
            json_data["version"] = "1.0"
        operation_name = operation.__name__
        if operation_name not in json_data:
            json_data[operation_name] = {}
        json_data[operation_name][tools.dtype.__name__] = {}
        D = json_data[operation_name][tools.dtype.__name__]
        if len(profiles) > 1:
            clf, nrmse = model.train(X, Y, profiles)
            # NOTE(review): the dictionary keys of the predictor entries
            # appear to have been lost; only one value expression remains.
            D['predictor'] = [{
                e.tree_.value[:, :, 0].astype('float64').tolist()
            } for e in clf.estimators_]
        D['profiles'] = [tools.convert(x) for x in profiles]
        json.dump(json_data, open(json_path, 'w'))
Exemplo n.º 17
    def run(self, level = 'intermediate'): 
        # Tune ``self.operation`` on ``self.device`` and export the result to JSON.
        #
        # Visible steps: pick training sizes from the operation family, restore
        # saved X / Y / profiles CSV data, benchmark the known profiles for each
        # size and re-tune with optimize.GeneticOptimizer when the best one is
        # not a local optimum, report external profiles that outperform the
        # tuned ones, prune profiles that are never the best, and write the
        # profiles (plus a predictor when there is more than one) to JSON.
        #
        # NOTE(review): several ``if``/``with`` headers have no body, an
        # ``else`` is missing, and indentation mixes tabs and spaces, so this
        # method is not valid Python as it stands. The Python 2 ``print``
        # statement below fixes the dialect.
        assert level in ['simple', 'intermediate', 'full']
        # Module-level state on ``tools`` is overwritten with this tuner's dtype.
        tools.dtype = self.dtype
        device = self.device
        operation = self.operation
        context = sc.driver.context(device)
        # NOTE(review): the body of this ``if`` is missing.
        if self.logger:

        #BLAS1 training sizes
        if operation in [sc.templates.elementwise_1d, sc.templates.reduce_1d]:
            sizes = [(10**x,) for x in range(3,8)]
        #BLAS2 training sizes
        if operation in [sc.templates.elementwise_2d, sc.templates.reduce_2d_rows, sc.templates.reduce_2d_cols]:
            sizes = []
            for N in [896, 1280, 1760, 2560]:
                sizes += [(N, N)]
            for M in [16, 32, 64, 128, 512, 1024]:
                for N in [1024, 4096, 16384, 65536]:
                    sizes += [(M, N)]
            for N in [16, 32, 64, 128, 512, 1024]:
                for M in [1024, 4096, 16384, 65536]:
                    sizes += [(M, N)]
        #BLAS3 training sizes
        if operation in [sc.templates.gemm_nn, sc.templates.gemm_nt, sc.templates.gemm_tn, sc.templates.gemm_tt]:
            sizes = []
            for N in [896, 1760, 2048, 2560]:
                sizes += [(N, N, N)]
            for N in [896, 1760, 2048, 2560]:
			   for K in [16, 32, 64, 128]:
				   sizes += [(N, N, K)]
            for N in [16, 32, 64, 128, 256]:
			   for K in [16000,32000,64000,128000]:
				   sizes += [(N, N, K)]
            for M in [1760, 2048, 2560, 4096]:
                for N in [16, 32, 64, 128, 7000]:
                    sizes += [(M, N, M)]
            for K in [1760, 2048, 2560, 4096]:
				for M, N in [(5124,9124),(35,8457)]:
					sizes += [(M, N, K)]
            for M, K in [(7680,2560),(3072,1024)]:
                for N in [16, 32, 64, 128]:
					sizes += [(M, N, K)]

        #Training data
        # X: problem sizes, Y: per-size performance of each profile.
        performance = tools.metric_of(operation)
        profiles, X, Y = [], [], []
        #Restore progress
        savepath = os.path.join('save', tools.dtype.__name__, operation.__name__)
        # NOTE(review): the files are read under "path does not exist" --
        # statements appear to be missing here.
        if not os.path.exists(savepath):
            with open(os.path.join(savepath, 'X.csv')) as f:
                X = [tuple(map(int, row)) for row in csv.reader(f, delimiter=',')]
            with open(os.path.join(savepath, 'profiles.csv')) as f:
                profiles = [map(int,row) for v in row for row in csv.reader(f, delimiter=',')]
            with open(os.path.join(savepath, 'Y.csv')) as f:
                Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
            #Recompute Y
            #Y = []
            #for x in X:
            #    tree, _ = tools.tree_of(operation, x, context)
            #    Y.append([performance(x, tools.benchmark(operation(*best), tree)) for best in profiles])
      	#Save data
        # NOTE(review): the body of the ``with`` in save() is missing.
        def save():
            for (fname, data) in zip(['X.csv',  'Y.csv', 'profiles.csv'], [X, Y, profiles]):
                with open(os.path.join(savepath, fname), 'wb') as f:
        for idx, x in enumerate(sizes):
            #Create new line on log
            # NOTE(review): the body of this ``if`` is missing.
            if idx>0:
            self.progress_bar.set_prefix(', '.join(map(str, x)))
            #Skip if already saved
            # NOTE(review): no ``continue`` is visible after the update below.
            if x in X:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
            #Best existing profile for x
            tree, operands = tools.tree_of(operation, x, context)
            y = [performance(x, tools.benchmark(operation(*p), tree)) for p in profiles]
            best = profiles[np.argmax(y)] if y else None            
            #Retune if necessary
            # Tune when there is no profile yet or the best one is not a
            # local optimum for this size.
            tune =  not (best and optimize.is_local_optimum(best, operation, x, context))
            if tune:
                optimizer = optimize.GeneticOptimizer(self.logger, naccept=1000, niter=1000, cxpb=.4, mutpb=.4, popsize=20, progress_bar = self.progress_bar)
                best = optimizer.run(operation, x, context, prior=best)[0]
                if best not in profiles:
                    # Extend every stored row of Y with the new profile's performance.
                    for xx,yy in zip(X, Y):
                        tree, _ = tools.tree_of(operation, xx, context)
                        time = tools.benchmark(operation(*best), tree)
                        yy.append(performance(xx, time))
            #Update dataset
            tree, operands = tools.tree_of(operation, x, context)
            y = [performance(x,tools.benchmark(operation(*prf), tree)) for prf in profiles]
            #Save data
            #print performance info in case no tuning was done
            if not tune:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
        #Adding external profiles
        for prof in tools.external_profiles(operation):
            for x, y in zip(X, Y):
                tree, operands = tools.tree_of(operation, x, context)
                perf = performance(x,tools.benchmark(prof, tree, operation))
                # Report external profiles that beat every tuned profile for x.
                if max(y) <  perf:
                    print x, '\t', prof.__class__.__name__, '\toutperform: \t', int(perf), tools.metric_name_of(operation)
        #Pruning of useless profiles
        X = np.array(X)
        Y = np.array(Y)
        if len(Y[0]) > 1:
            # Indices of profiles that are never the argmax of any row of Y.
            idx = np.where(np.bincount(np.argmax(Y, 1), minlength=len(profiles))==0)[0]
            profiles = [p for ip,p in enumerate(profiles) if ip not in idx]
            Y = np.delete(Y, idx, axis=1) 
        #Exporting to JSON
        json_path = tools.sanitize(device.name) + '.json' if not self.json_path else self.json_path
        # NOTE(review): an ``else:`` appears to be missing before the reset
        # below; as written the loaded data is always discarded.
        if os.path.isfile(json_path):
            json_data = json.load(open(json_path, 'r'))
            json_data = {}
            json_data["version"] = "1.0"
        operation_name = operation.__name__
        if operation_name not in json_data:
            json_data[operation_name] = {}
        json_data[operation_name][tools.dtype.__name__] = {}
        D = json_data[operation_name][tools.dtype.__name__]
        if len(profiles) > 1:
            clf, nrmse = model.train(X, Y, profiles)
            # One dict per estimator, holding the arrays of its fitted tree.
            D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
                                'children_right': e.tree_.children_right.tolist(),
                                'threshold': e.tree_.threshold.astype('float64').tolist(),
                                'feature': e.tree_.feature.astype('float64').tolist(),
                                'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]
        D['profiles'] = [tools.convert(x) for x in profiles]
        json.dump(json_data, open(json_path,'w'))
Exemplo n.º 18
    def run(self, level='intermediate'):
        """Tune ``self.operation`` on ``self.device`` and export the result to JSON.

        Visible steps: pick training sizes from the operation family and
        ``level``, filter them by ``tools.memory_footprint``, restore saved
        ``X`` / ``Y`` / ``profiles`` CSV data, and for each size predict a
        profile, re-tune with ``optimize.GeneticOptimizer`` when that
        prediction is not a local optimum, then benchmark the profiles and
        save. Finally, unused profiles are removed and the profiles (plus a
        predictor when there is more than one) are written to a JSON file
        under the ``'float32'`` key.

        ``level`` must be 'simple', 'intermediate' or 'full'.

        NOTE(review): many statements in this method are visibly truncated
        (unterminated lists/calls, missing ``try``/``else`` lines, headers
        without a body), so it is not valid Python as it stands. The comments
        describe only what the visible code shows.
        """

        assert level in ['simple', 'intermediate', 'full']

        device = self.device
        operation = self.operation
        context = sc.driver.context(device)

        if self.logger:
            self.logger.info(operation.__name__.replace('_', '-').upper())

        #BLAS1 training sizes
        # NOTE(review): an ``else:`` appears to be missing before the second
        # 'intermediate' assignment (presumably the 'full' branch -- confirm).
        if operation in [sc.templates.elementwise_1d, sc.templates.reduce_1d]:
            if level == 'simple':
                sizes = [(10000000, )]
            elif level == 'intermediate':
                sizes = [(x, ) for x in tools.expspace(1e3, 1e8, 10)]
                sizes = [(x, ) for x in tools.expspace(1e3, 1e8, 100)]

        #BLAS2 training sizes
        # NOTE(review): the operation list below is not closed.
        if operation in [
                sc.templates.elementwise_2d, sc.templates.reduce_2d_rows,
            if level == 'simple':
                sizes = [(1536, 1536)]
            elif level == 'intermediate':
                sizes = [(896, 896), (1536, 1536), (256, 256), (1024, 256),
                         (4096, 256), (16384, 256), (256, 1024), (256, 4096),
                         (256, 16384), (3025, 96)]
                sizes = product(pow2range(4, 17), pow2range(4, 17))

        #BLAS3 training sizes
        # NOTE(review): the operation list and the 'intermediate' size list
        # below are not closed.
        if operation in [
                sc.templates.matrix_product_nn, sc.templates.matrix_product_nt,
                sc.templates.matrix_product_tn, sc.templates.matrix_product_tt
            if level == 'simple':
                sizes = [(2560, 2560, 2560)]
            elif level == 'intermediate':
                sizes = [  #Square
                    (896, 896, 896),
                    (1536, 1536, 1536),
                    (2176, 2176, 2176),
                    #Rank-32 updates
                    (896, 896, 32),
                    (1536, 1536, 32),
                    (2176, 2176, 32),
                    (32, 32, 16000),
                    (64, 64, 64000),
                    (256, 256, 32000),
                    (3025, 64, 363),
                    (729, 192, 1200),
                    (169, 384, 1728),
                    (169, 256, 3456),
                    (169, 128, 2304),
                    (169, 2304, 256),
                    (169, 3456, 256),
                    (169, 1728, 384),
                    (729, 1600, 192),
                    (3025, 363, 64),
                    (2304, 256, 169),
                    (3456, 256, 169),
                    (1728, 384, 169),
                    (1600, 192, 729),
                    (363, 64, 3025)
            elif level == 'full':
                sizes = product(pow2range(5, 12), pow2range(5, 12),
                                pow2range(5, 17))

        #Remove duplicates and or too small/big tuples
        # Keeps sizes whose memory footprint lies within [1e-4, 2e-1]
        # (units are whatever tools.memory_footprint returns).
        sizes = [
            x for x in sizes
            if 1e-4 <= tools.memory_footprint(operation, x) <= 2e-1

        #Training data
        # X: problem sizes, Y: per-size performance of each profile.
        performance = tools.metric_of(operation)
        profiles, X, Y = [], [], []

        #Restore previous run
        savepath = os.path.join('save', operation.__name__)
        # NOTE(review): the files are read under "path does not exist" --
        # statements appear to be missing here.
        if not os.path.exists(savepath):

            with open(os.path.join(savepath, 'X.csv')) as f:
                X = [
                    tuple(map(int, row))
                    for row in csv.reader(f, delimiter=',')

            with open(os.path.join(savepath, 'Y.csv')) as f:
                Y = [map(float, row) for row in csv.reader(f, delimiter=',')]

            with open(os.path.join(savepath, 'profiles.csv')) as f:

                # Convert one CSV cell: the three fetch-type names map to
                # their enum values, everything else is parsed as int.
                def mmap(x):
                    if x == 'FETCH_FROM_LOCAL':
                        return sc.templates.fetch_type.FETCH_FROM_LOCAL
                    if x == 'FETCH_FROM_GLOBAL_CONTIGUOUS':
                        return sc.templates.fetch_type.FETCH_FROM_GLOBAL_CONTIGUOUS
                    if x == 'FETCH_FROM_GLOBAL_STRIDED':
                        return sc.templates.fetch_type.FETCH_FROM_GLOBAL_STRIDED
                    return int(x)

                profiles = [
                    map(mmap, row) for v in row
                    for row in csv.reader(f, delimiter=',')

        ##### Exploration #####
        for idx, x in enumerate(sizes):
            # NOTE(review): the body of this ``if`` is missing.
            if idx > 0:

            self.progress_bar.set_prefix(', '.join(map(str, x)))
            #Skip if saved
            # NOTE(review): no ``continue`` is visible after the update below.
            if x in X:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))

            #Check if the current best prediction is not a local optimum
            # Note: this rebinds the loop variable ``idx`` to the number of
            # sizes already stored.
            idx = len(X)
            nparams = len(profiles)
            tree, operands = tools.tree_of(operation, x, context)
            # NOTE(review): ``else`` / ``try`` lines and several closing
            # brackets appear to be missing in the branch below.
            if idx == 0:
                retune = True
                predicted = None
                if nparams == 1:
                    predicted = profiles[0]
                    clf = RandomForestRegressor(min(10, idx + 1),
                                                              idx + 1)).fit(
                                                                  X, Y)
                    #clf, nrmse = model.train(X, Y, profiles)
                    predperf = clf.predict(x)[0]
                    # Indices of the five profiles with the highest predicted performance.
                    best = (-predperf).argsort()[:5]
                    perf = []
                    for b in best:
                            perf += [
                                    tools.benchmark(operation, profiles[b],
                        except profile_execution_failure:
                    predicted = profiles[best[argmax(perf)]]
                retune = not optimize.is_local_optimum(predicted, operation, x,

            #Retune if necessary
            if retune:
                # NOTE(review): the optimizer's constructor arguments are cut off.
                optimizer = optimize.GeneticOptimizer(
                new = optimizer.run(operation, x, context, prior=predicted)[0]
                if new not in profiles:
                    if idx > 0:
                        # Extend every stored row of Y with the new profile's
                        # performance; failures and infinities count as 0.
                        for xx, yy in zip(X, Y):
                            _tree, _operands = tools.tree_of(
                                operation, xx, context)
                                time = tools.benchmark(operation, new, _tree)
                                perf = performance(xx, time)
                            except profile_execution_failure:
                                perf = 0
                            yy.append(0 if isinf(perf) else perf)

            ##### Training #####
            y = []
            fastest = max(predperf) if nparams > 1 else None
            for ip, p in enumerate(profiles):
                    # Profiles predicted below 10% of the fastest are scored 0
                    # without being benchmarked.
                    perf = 0 if fastest and ip < nparams and predperf[
                        ip] / fastest < .1 else performance(
                            x, tools.benchmark(operation, p, tree))
                except profile_execution_failure:
                    perf = 0
                y.append(0 if isinf(perf) else perf)

            #Save data
            # NOTE(review): the body of the ``with`` below is missing.
            for (fname, data) in zip(['X.csv', 'Y.csv', 'profiles.csv'],
                                     [X, Y, profiles]):
                with open(os.path.join(savepath, fname), 'wb') as f:

            #print performance info in case no tuning was done
            if not retune:
                row = Y[X.index(x)]
                self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))

        #Remove unused profiles
        if len(Y[0]) > 1:
            # Indices of profiles that are never the argmax of any row of Y.
            unused = np.where(np.bincount(np.argmax(Y, 1)) == 0)[0]
            profiles = [p for ip, p in enumerate(profiles) if ip not in unused]
            # NOTE(review): the np.delete call below is cut off.
            Y = np.delete(Y,
                          np.where(np.bincount(np.argmax(Y, 1)) == 0),

        ##### Exportation #####
        json_path = tools.sanitize(
            device.name) + '.json' if not self.json_path else self.json_path
        # NOTE(review): an ``else:`` appears to be missing before the reset
        # below; as written the loaded data is always discarded.
        if os.path.isfile(json_path):
            json_data = json.load(open(json_path, 'r'))
            json_data = {}
            json_data["version"] = "1.0"
        operation_name = operation.__name__
        if operation_name not in json_data:
            json_data[operation_name] = {}
        json_data[operation_name]['float32'] = {}
        D = json_data[operation_name]['float32']
        if len(profiles) > 1:
            clf, nrmse = model.train(X, Y, profiles)
            # NOTE(review): the dictionary keys of the predictor entries
            # appear to have been lost; only one value expression remains.
            D['predictor'] = [{
                e.tree_.value[:, :, 0].astype('float64').tolist()
            } for e in clf.estimators_]
        D['profiles'] = [map(int, x) for x in profiles]
        json.dump(json_data, open(json_path, 'w'))