Example #1
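# The snippets below assume the isaac Python module is available as `sc`
# (e.g. `import isaac as sc`), and rely on a module-level `lock` and a
# `benchmark` timing helper defined elsewhere in the surrounding script.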
def isaacGemm(ctx, stream, shapes, layouts):
    # Shapes
    offa, offb, offc = 0, 0, 0
    dtype, AT, BT, M, N, K = shapes
    dtype = sc.dtype(dtype)
    AT, BT = sc.templates.op(AT), sc.templates.op(BT)
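    # Leading dimensions for column-major (BLAS-style) storage: an
    # untransposed A is M x K (lda = M), an untransposed B is K x N (ldb = K).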
    ldc = M
    lda = M if AT == sc.templates.OP_N else K
    ldb = K if BT == sc.templates.OP_N else N
    # Kernel
    generator = sc.templates.GEMM(dtype, AT, BT, M, N, K, offa, lda, offb, ldb,
                                  offc, ldc, *layouts)
    src = generator.dump(ctx.device, "gemm")
    module = sc.driver.Module(ctx, src)
    kernel = sc.driver.Kernel(module, "gemm")
    with lock:
        # Buffers
        C = sc.driver.Buffer(ctx, M * N * sc.size_of(dtype))
        A = sc.driver.Buffer(ctx, M * K * sc.size_of(dtype))
        B = sc.driver.Buffer(ctx, K * N * sc.size_of(dtype))
        alpha, beta = sc.Scalar(1., dtype), sc.Scalar(0., dtype)
        # Result
        ts = benchmark(
            lambda: (generator.enqueue(kernel, stream, alpha, A, B, beta, C),
                     stream.synchronize()), ctx.device, 1e-2)
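    # A GEMM performs 2 * M * N * K floating-point operations
    # (one multiply and one add per accumulated term).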
    tflops = 2 * M * N * K / ts * 1e-12
    return tflops
Example #2
def isaacConv(ctx, stream, shapes, layouts):
    # Shapes
    dtype, Npix, K, C, Nfilt = shapes
    N, M, P, Q = 1, 1, 1, Npix
    T, R, S = 1, 1, Nfilt
    dtype = sc.dtype(dtype)
    pad_d, pad_h, pad_w, stride_d, stride_h, stride_w = 0, 0, 0, 1, 1, 1
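    # Derive input sizes D, H, W so the convolution produces an M x P x Q
    # output for the given padding and strides (this inverts the usual
    # output-size formula, e.g. P = (H + 2*pad_h - R) / stride_h + 1).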
    D = M * stride_d + T - 1 - 2 * pad_d - stride_d + 1
    H = P * stride_h + R - 1 - 2 * pad_h - stride_h + 1
    W = Q * stride_w + S - 1 - 2 * pad_w - stride_w + 1
    # Kernel
    generator = sc.templates.Conv(dtype, C, D, H, W, N, K, M, P, Q, T, R, S,
                                  pad_d, pad_h, pad_w, stride_d, stride_h,
                                  stride_w, *layouts)
    src = generator.dump(ctx.device, "conv_fprop")
    module = sc.driver.Module(ctx, src)
    kernel = sc.driver.Kernel(module, "conv_fprop")
    with lock:
        # Buffers
        O = sc.driver.Buffer(ctx, K * M * P * Q * N * sc.size_of(dtype))
        I = sc.driver.Buffer(ctx, C * D * H * W * N * sc.size_of(dtype))
        F = sc.driver.Buffer(ctx, C * T * R * S * K * sc.size_of(dtype))
        alpha, beta = sc.Scalar(1., dtype), sc.Scalar(0., dtype)
        # Result
        time = benchmark(
            lambda: (generator.enqueue(kernel, stream, alpha, I, F, beta, O),
                     stream.synchronize()), ctx.device, 1e-2)
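    # A forward convolution performs 2 * C * T * R * S flops per output
    # element, for M * P * Q * N * K output elements in total.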
    tflops = 2 * M * P * Q * N * K * C * T * R * S / time * 1e-12
    return tflops
Example #3
def cudaGemm(ctx, stream, dtype, AT, BT, M, N, K):
    ldc = M
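    # AT == 1 / BT == 1 denote non-transposed operands, matching the
    # 'N'/'T' mapping in the cublasGemm call below.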
    lda = M if AT==1 else K
    ldb = K if BT==1 else N
    dtype = sc.dtype(dtype)
    C = sc.driver.Buffer(ctx, M*N*sc.size_of(dtype))
    A = sc.driver.Buffer(ctx, M*K*sc.size_of(dtype))
    B = sc.driver.Buffer(ctx, K*N*sc.size_of(dtype))
    alpha, beta = sc.Scalar(1., dtype), sc.Scalar(0., dtype)
    time = benchmark(
        lambda: (sc.driver.cublasGemm(dtype, ctx, stream, 'N' if AT==1 else 'T',
                                      'N' if BT==1 else 'T', M, N, K, alpha, A,
                                      lda, B, ldb, beta, C, ldc),
                 stream.synchronize()), ctx.device, 1e-2)
    tflops = 2*M*N*K/time*1e-12
    return tflops
Example #4
def cudaConv(ctx, stream, dtype, N, K, P, Q, C, R, S):
    pad_h, pad_w, stride_h, stride_w = 0, 0, 1, 1
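    # Input height/width chosen so that the convolution yields a P x Q output
    # for the given padding and strides.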
    H = P*stride_h + R - 1 - 2*pad_h
    W = Q*stride_w + S - 1 - 2*pad_w
    dtype = sc.dtype(dtype)
    O = sc.driver.Buffer(ctx, K*P*Q*N*sc.size_of(dtype))
    I = sc.driver.Buffer(ctx, C*H*W*N*sc.size_of(dtype))
    F = sc.driver.Buffer(ctx, C*R*S*K*sc.size_of(dtype))
    alpha, beta = sc.Scalar(1., dtype), sc.Scalar(0., dtype)
    time = benchmark(
        lambda: (sc.driver.cudnnConv(dtype, ctx, stream, H, W, N, K, P, Q, C, R,
                                     S, pad_h, pad_w, stride_h, stride_w, alpha,
                                     I, F, beta, O),
                 stream.synchronize()), ctx.device, 1e-2)
    tflops = 2*P*Q*K*N*C*R*S/time*1e-12
    return tflops