print("input size: {0}".format(HIGH_BATCH_INPUT_SIZE))
    run_profile(
        nn.parallel.DataParallel(AsymmetricQuantizer(
            QuantizerConfig(QuantizationParams(bits=NBITS))).cuda(),
                                 device_ids=device_ids), HIGH_BATCH_INPUT_SIZE,
        'cuda', GPU_RUNS_HIGH_BATCH)

    # CUDA DataParallel high batch
    # wall time
    print()
    print("CUDA DataParallel high batch")
    print("------------------------------------------------")
    print("Pytorch Symmetric(cuda {0}) DataParallel impl:".format(device_ids))
    print("input size: {0}".format(HIGH_BATCH_INPUT_SIZE))
    # Wall-clock timing (run_wall) of the reference (pure-PyTorch) symmetric
    # quantizer, replicated across `device_ids` via DataParallel.
    run_wall(
        nn.parallel.DataParallel(ReferenceQuantize(NBITS).cuda(),
                                 device_ids=device_ids), HIGH_BATCH_INPUT_SIZE,
        'cuda', GPU_RUNS_HIGH_BATCH)

    print()
    print("Custom Symmetric (cuda {0}) DataParallel impl:".format(device_ids))
    print("input size: {0}".format(HIGH_BATCH_INPUT_SIZE))
    # Same wall-clock measurement for the custom symmetric quantizer,
    # configured here via QuantizerConfig(QuantizationParams(bits=NBITS)).
    run_wall(
        nn.parallel.DataParallel(SymmetricQuantizer(
            QuantizerConfig(QuantizationParams(bits=NBITS))).cuda(),
                                 device_ids=device_ids), HIGH_BATCH_INPUT_SIZE,
        'cuda', GPU_RUNS_HIGH_BATCH)

    print()
    # Fixed typo in the banner: "Assymetric" -> "Asymmetric" (matches the
    # spelling used by the later duplicate of this section).
    print("Custom Asymmetric (cuda {0}) DataParallel impl:".format(device_ids))
    print("input size: {0}".format(HIGH_BATCH_INPUT_SIZE))
    # Wall-clock timing of the custom asymmetric quantizer under DataParallel.
    # A stray `run_profile(` header had been spliced inside an unclosed
    # `run_wall(` call here (merge artifact), which made the file unparsable;
    # this section's siblings (ReferenceQuantize / SymmetricQuantizer above)
    # all use run_wall with the QuantizerConfig-based constructor, so that
    # form is restored.
    run_wall(
        nn.parallel.DataParallel(AsymmetricQuantizer(
            QuantizerConfig(QuantizationParams(bits=NBITS))).cuda(),
                                 device_ids=device_ids), HIGH_BATCH_INPUT_SIZE,
        'cuda', GPU_RUNS_HIGH_BATCH)

    # CUDA DataParallel high batch
    # wall time
    # NOTE(review): this group repeats the banner and the reference/symmetric
    # runs above, but constructs the custom quantizers via
    # DefaultedPTQuantizerSpec instead of QuantizerConfig — it looks like two
    # revisions of this script were merged; confirm which API is current and
    # whether both groups should run.
    print()
    print("CUDA DataParallel high batch")
    print("------------------------------------------------")
    print("Pytorch Symmetric(cuda {0}) DataParallel impl:".format(device_ids))
    print("input size: {0}".format(HIGH_BATCH_INPUT_SIZE))
    # Wall-clock timing of the reference symmetric quantizer under
    # DataParallel across `device_ids`.
    run_wall(
        nn.parallel.DataParallel(ReferenceQuantize(NBITS).cuda(),
                                 device_ids=device_ids), HIGH_BATCH_INPUT_SIZE,
        'cuda', GPU_RUNS_HIGH_BATCH)

    print()
    print("Custom Symmetric (cuda {0}) DataParallel impl:".format(device_ids))
    print("input size: {0}".format(HIGH_BATCH_INPUT_SIZE))
    # Custom symmetric quantizer built from a DefaultedPTQuantizerSpec with a
    # per-tensor scale shape.
    run_wall(
        nn.parallel.DataParallel(SymmetricQuantizer(
            DefaultedPTQuantizerSpec(
                num_bits=NBITS, scale_shape=per_tensor_scale_shape)).cuda(),
                                 device_ids=device_ids), HIGH_BATCH_INPUT_SIZE,
        'cuda', GPU_RUNS_HIGH_BATCH)

    print()
    # Header of the asymmetric wall-time run; the corresponding run_wall call
    # continues past this chunk of the file.
    print("Custom Asymmetric (cuda {0}) DataParallel impl:".format(device_ids))
    print("input size: {0}".format(HIGH_BATCH_INPUT_SIZE))