示例#1
0
def create_mb_source(image_height, image_width, num_channels, map_file):
    """Build a non-randomized minibatch source over the images in ``map_file``.

    Labels in the map file are ignored. The images are exposed through a
    stream named 'features' after being scaled to ``image_width`` x
    ``image_height`` with ``num_channels`` channels (linear interpolation).
    """
    scale_to_input = ImageDeserializer.scale(
        width=image_width,
        height=image_height,
        channels=num_channels,
        interpolations='linear',
    )

    deserializer = ImageDeserializer(map_file)
    deserializer.ignore_labels()
    deserializer.map_features('features', [scale_to_input])

    return MinibatchSource(deserializer, randomize=False)
示例#2
0
def create_image_mb_source(map_file, is_training, total_number_of_samples):
    """Create an image minibatch source with train/test specific cropping.

    Args:
        map_file: path to the image map file; its first column is read as
            'image' and its second as 'label'.
        is_training: when true, a random-side crop with 'uniratio' jitter is
            applied and the source is randomized; otherwise a center crop is
            used and the data is read in order.
        total_number_of_samples: passed to the source as ``epoch_size``.

    Returns:
        A ``MinibatchSource`` exposing the streams 'features' and 'labels'.

    Raises:
        RuntimeError: if ``map_file`` does not exist.

    Note:
        ``image_width``, ``image_height``, ``num_channels`` and ``num_classes``
        are read from the enclosing module scope.
    """
    if not os.path.exists(map_file):
        raise RuntimeError("File '%s' does not exist." %map_file)

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            # The side-ratio range is passed as a (min, max) tuple; the previous
            # `0.4375:0.875` spelling was not valid Python.
            xforms.crop(crop_type='randomside', side_ratio=(0.4375, 0.875), jitter_type='uniratio') # train uses jitter
        ]
    else:
        transforms += [
            xforms.crop(crop_type='center', side_ratio=0.5833333) # test has no jitter
        ]

    transforms += [
        xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
    ]

    # deserializer
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
            labels   = StreamDef(field='label', shape=num_classes))),   # and second as 'label'
        randomize = is_training,
        epoch_size=total_number_of_samples,
        multithreaded_deserializer = True)
示例#3
0
def create_mb_source(image_height, image_width, num_channels, map_file, mean_file, is_training):
    """Create a single-sweep image minibatch source exposing a 'features' stream.

    Training uses a random-side crop with 'uniratio' jitter; evaluation uses a
    center crop. Both use a side ratio of 0.875. Images are then scaled to the
    requested size, and a mean transform is appended unless ``mean_file`` is
    the empty string.

    Raises:
        RuntimeError: if ``map_file`` does not exist.
    """
    if not os.path.exists(map_file):
        raise RuntimeError("File '%s' does not exist." % (map_file))

    # Only the crop differs between training and evaluation.
    if is_training:
        crop = xforms.crop(crop_type='randomside', side_ratio=0.875, jitter_type='uniratio')
    else:
        crop = xforms.crop(crop_type='center', side_ratio=0.875)

    pipeline = [
        crop,
        xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
    ]
    if mean_file != '':
        pipeline.append(xforms.mean(mean_file))

    # The first column in the map file is referred to as 'image'.
    image_stream = StreamDef(field='image', transforms=pipeline)
    deserializer = ImageDeserializer(map_file, StreamDefs(features=image_stream))

    return MinibatchSource(
        deserializer,
        randomize=is_training,
        multithreaded_deserializer=True,
        max_sweeps=1)
示例#4
0
def create_reader(path,
                  vocab_dim,
                  entity_dim,
                  randomize,
                  rand_size=DEFAULT_RANDOMIZATION_WINDOW,
                  size=INFINITELY_REPEAT):
    """
    Create the CTF data reader for the model.

    Args:
      path: The data path
      vocab_dim: The dimension of the vocabulary (shape of the sparse 'C' and
        'Q' fields)
      entity_dim: The dimension of entities (shape of the sparse 'EID' field)
      randomize: Whether to shuffle the data before feeding it to the trainer
      rand_size: Accepted for interface compatibility; not used by this function
      size: Accepted for interface compatibility; not used by this function
    """
    stream_defs = StreamDefs(
        context=StreamDef(field='C', shape=vocab_dim, is_sparse=True),
        query=StreamDef(field='Q', shape=vocab_dim, is_sparse=True),
        entities=StreamDef(field='E', shape=1, is_sparse=False),
        label=StreamDef(field='L', shape=1, is_sparse=False),
        entity_ids=StreamDef(field='EID', shape=entity_dim, is_sparse=True),
    )
    deserializer = CTFDeserializer(path, stream_defs)
    return MinibatchSource(deserializer, randomize=randomize)
示例#5
0
def create_video_mb_source(map_files, num_channels, image_height, image_width,
                           num_classes):
    """Create one minibatch source combining an image deserializer per map file.

    The map files are ordered by the integer that follows 'Map_' in their name
    (up to the next '.'). Deserializer ``i`` exposes the streams
    'feature<i>' and 'label<i>'. The data is read once, without randomization.
    """
    def _map_index(name):
        # Integer between 'Map_' and the following '.' in the file name.
        return int(name.split('Map_')[1].split('.')[0])

    center_crop = xforms.crop(crop_type='center', crop_size=224)
    resize = xforms.scale(width=image_width,
                          height=image_height,
                          channels=num_channels,
                          interpolations='linear')
    transforms = [center_crop, resize]

    map_files = sorted(map_files, key=_map_index)
    print(map_files)

    # One image source per map file, each with its own indexed stream names.
    sources = [
        ImageDeserializer(
            map_file,
            StreamDefs(**{
                "feature" + str(i): StreamDef(field='image',
                                              transforms=transforms),
                "label" + str(i): StreamDef(field='label', shape=num_classes),
            }))
        for i, map_file in enumerate(map_files)
    ]

    return MinibatchSource(sources, max_sweeps=1, randomize=False)
示例#6
0
def create_mb_source(img_height, img_width, img_channels, n_classes, n_rois, data_path, data_set):
    """Create a composite reader over images, ROIs and ROI labels.

    ``data_path`` is resolved relative to the module-level ``abs_path``. The
    test map file is used when ``data_set`` is 'test', the train map file
    otherwise. Randomization is enabled only when ``data_set`` is "train".

    Raises:
        RuntimeError: if the map, ROI or label file is missing.
    """
    data_dir = os.path.normpath(os.path.join(abs_path, data_path))
    map_name = test_map_filename if data_set == 'test' else train_map_filename
    map_file = os.path.join(data_dir, map_name)
    roi_file = os.path.join(data_dir, data_set + rois_filename_postfix)
    label_file = os.path.join(data_dir, data_set + roilabels_filename_postfix)

    required_files = (map_file, roi_file, label_file)
    if not all(os.path.exists(p) for p in required_files):
        raise RuntimeError("File '%s', '%s' or '%s' does not exist. "
                           "Please run install_fastrcnn.py from Examples/Image/Detection/FastRCNN to fetch them" %
                           required_files)

    # Images: labels from the map file are ignored, images are padded and scaled.
    image_source = ImageDeserializer(map_file)
    image_source.ignore_labels()
    pad_and_scale = ImageDeserializer.scale(width=img_width, height=img_height, channels=img_channels,
                                            scale_mode="pad", pad_value=114, interpolations='linear')
    image_source.map_features(features_stream_name, [pad_and_scale])

    # ROIs: four values per ROI, read as a dense input.
    roi_source = CTFDeserializer(roi_file)
    roi_source.map_input(roi_stream_name, dim=4 * n_rois, format="dense")

    # ROI labels: n_classes values per ROI, read as a dense input.
    label_source = CTFDeserializer(label_file)
    label_source.map_input(label_stream_name, dim=n_classes * n_rois, format="dense")

    # Composite reader over the three deserializers.
    deserializers = [image_source, roi_source, label_source]
    return MinibatchSource(deserializers, epoch_size=sys.maxsize, randomize=data_set == "train")
def create_mb_source(data_set, img_height, img_width, n_classes, n_rois, data_path, randomize):
    """Create a composite image/ROI/label reader and report the image count.

    The input files are ``<data_set>.txt``, ``<data_set>.rois.txt`` and
    ``<data_set>.roilabels.txt`` under ``data_path``. Images are padded and
    scaled to ``img_width`` x ``img_height``; the channel count is fixed at 3.

    Returns:
        A tuple ``(minibatch_source, nrImages)`` where ``nrImages`` is the
        length of the table read from the map file.

    Raises:
        RuntimeError: if any of the three input files is missing.
    """
    map_file, roi_file, label_file = (
        join(data_path, data_set + suffix)
        for suffix in ('.txt', '.rois.txt', '.roilabels.txt')
    )
    input_files = (map_file, roi_file, label_file)
    if not all(os.path.exists(p) for p in input_files):
        raise RuntimeError("File '%s', '%s' or '%s' does not exist. " % input_files)

    # Images
    nrImages = len(readTable(map_file))
    pad_and_scale = scale(width=img_width, height=img_height, channels=3,
                          scale_mode="pad", pad_value=114, interpolations='linear')
    image_stream = StreamDef(field='image', transforms=[pad_and_scale])
    image_source = ImageDeserializer(map_file, StreamDefs(features=image_stream))

    # ROIs (four values each) and ROI labels (n_classes values each), both dense
    roi_stream = StreamDef(field='rois', shape=4 * n_rois, is_sparse=False)
    roi_source = CTFDeserializer(roi_file, StreamDefs(rois=roi_stream))
    label_stream = StreamDef(field='roiLabels', shape=n_classes * n_rois, is_sparse=False)
    label_source = CTFDeserializer(label_file, StreamDefs(roiLabels=label_stream))

    # Composite reader
    mb = MinibatchSource([image_source, roi_source, label_source],
                         epoch_size=sys.maxsize, randomize=randomize)
    return (mb, nrImages)
def create_reader(map_file, train, dimensions, classes,
                  total_number_of_samples):
    """Create an image reader with crop/color augmentation during training.

    Args:
        map_file: image map file with 'image' and 'label' columns.
        train: enables the random crop, the color jitter and randomization.
        dimensions: mapping providing 'width', 'height' and 'depth'.
        classes: collection whose length is used as the label shape.
        total_number_of_samples: passed to the source as ``max_samples``.
    """
    print(
        f"Reading map file: {map_file} with number of samples {total_number_of_samples}"
    )

    # Augmentation is applied only when training.
    augmentations = []
    if train:
        augmentations.append(
            xforms.crop(crop_type='randomside',
                        area_ratio=(0.08, 1.0),
                        aspect_ratio=(0.75, 1.3333),
                        jitter_type='uniratio'))
        augmentations.append(
            xforms.color(brightness_radius=0.4,
                         contrast_radius=0.4,
                         saturation_radius=0.4))

    resize = xforms.scale(width=dimensions['width'],
                          height=dimensions['height'],
                          channels=dimensions['depth'],
                          interpolations='linear')
    pipeline = augmentations + [resize]

    stream_defs = StreamDefs(
        features=StreamDef(field='image', transforms=pipeline),
        labels=StreamDef(field='label', shape=len(classes)))
    return MinibatchSource(ImageDeserializer(map_file, stream_defs),
                           randomize=train,
                           max_samples=total_number_of_samples,
                           multithreaded_deserializer=True)
示例#9
0
def create_reader(map_file, mean_file, train):
    """Create an image reader exposing 'features' and 'labels' streams.

    When ``train`` is true a random-side crop with 'uniratio' jitter is
    applied first; images are always scaled and mean-transformed.
    ``image_width``, ``image_height``, ``num_channels`` and ``num_classes``
    are read from module scope.

    Raises:
        RuntimeError: if ``map_file`` or ``mean_file`` does not exist.
    """
    if not (os.path.exists(map_file) and os.path.exists(mean_file)):
        raise RuntimeError(
            "File '%s' or '%s' does not exist. Please run install_cifar10.py from DataSets/CIFAR-10 to fetch them"
            % (map_file, mean_file))

    pipeline = []
    if train:
        # Jitter/crop is used only when training.
        pipeline.append(
            ImageDeserializer.crop(crop_type='randomside',
                                   side_ratio=0.8,
                                   jitter_type='uniratio'))
    pipeline.append(
        ImageDeserializer.scale(width=image_width,
                                height=image_height,
                                channels=num_channels,
                                interpolations='linear'))
    pipeline.append(ImageDeserializer.mean(mean_file))

    # The map file's first column is 'image', the second is 'label'.
    image_stream = StreamDef(field='image', transforms=pipeline)
    label_stream = StreamDef(field='label', shape=num_classes)
    deserializer = ImageDeserializer(
        map_file, StreamDefs(features=image_stream, labels=label_stream))
    return MinibatchSource(deserializer)
示例#10
0
def create_reader(map_file,
                  mean_file,
                  train,
                  image_height=64,
                  image_width=64,
                  num_channels=3,
                  num_classes=32):
    """Create an image reader exposing 'features' and 'labels' streams.

    When ``train`` is true a 'randomside' crop with ``side_ratio=0`` and no
    jitter is prepended; images are always scaled to the requested size and
    mean-transformed with ``mean_file``.

    Transform reference:
    https://docs.microsoft.com/en-us/python/api/cntk.io.transforms?view=cntk-py-2.2
    """
    pipeline = []
    if train:
        # NOTE(review): the original author annotated this crop as
        # "Horizontal flip enabled" - confirm against the CNTK docs.
        pipeline.append(
            transforms.crop(crop_type='randomside',
                            side_ratio=0,
                            jitter_type='none'))
    pipeline.extend([
        transforms.scale(width=image_width,
                         height=image_height,
                         channels=num_channels,
                         interpolations='linear'),
        transforms.mean(mean_file),
    ])

    # The map file's first column is 'image', the second is 'label'.
    stream_defs = StreamDefs(
        features=StreamDef(field='image', transforms=pipeline),
        labels=StreamDef(field='label', shape=num_classes))
    return MinibatchSource(ImageDeserializer(map_file, stream_defs))
示例#11
0
def test_large_minibatch(tmpdir):
    """Request a 1000-sample minibatch and check sample and sequence counts."""
    requested = 1000
    expected_sequences = requested // 7

    data_file = _write_data(tmpdir, MBDATA_DENSE_2)
    stream_defs = StreamDefs(features=StreamDef(field='S0', shape=1),
                             labels=StreamDef(field='S1', shape=1))
    mb_source = MinibatchSource(CTFDeserializer(data_file, stream_defs),
                                randomization_window_in_chunks=0)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(requested)
    features, labels = mb[features_si], mb[labels_si]

    # The minibatch actually spans multiple sweeps. It is unclear whether
    # that is an artificial situation; instead of a boolean flag it might be
    # better to report the largest sweep index the data was taken from.
    assert features.end_of_sweep
    assert labels.end_of_sweep

    assert features.num_samples == 7 * expected_sequences
    assert labels.num_samples == 5 * expected_sequences

    assert mb[features_si].num_sequences == expected_sequences
    assert mb[labels_si].num_sequences == expected_sequences
示例#12
0
def test_create_two_image_deserializers(tmpdir):
    """A MinibatchSource can be built from two image deserializers on one map file."""
    mbdata = r'''filename	0
filename2	0
'''

    map_file = str(tmpdir / 'mbdata.txt')
    with open(map_file, 'w') as out:
        out.write(mbdata)

    image_width, image_height, num_channels = 100, 200, 3

    transforms = [
        xforms.crop(crop_type='randomside',
                    side_ratio=0.5,
                    jitter_type='uniratio'),
        xforms.scale(width=image_width,
                     height=image_height,
                     channels=num_channels,
                     interpolations='linear'),
    ]

    # Two deserializers over the same map file, differing only in stream name.
    deserializers = [
        ImageDeserializer(
            map_file,
            StreamDefs(**{
                stream_name: StreamDef(field='image', transforms=transforms)
            }))
        for stream_name in ('f1', 'f2')
    ]

    mb_source = MinibatchSource(deserializers)
    assert isinstance(mb_source, MinibatchSource)
示例#13
0
def test_MinibatchData_and_Value_as_input(tmpdir):
    """``eval`` accepts minibatch data in several forms and doubles the input.

    The single sample (100) is fed as an explicit mapping, as the minibatch
    data object itself, as its ``.data``, and as the array from ``asarray()``.
    """
    mbdata = r'''0  |S0 100'''

    tmpfile = str(tmpdir / 'mbtest.txt')
    with open(tmpfile, 'w') as out:
        out.write(mbdata)

    stream_defs = StreamDefs(f1=StreamDef(field='S0', shape=1))
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, stream_defs),
                                randomize=False)
    f1_si = mb_source.stream_info('f1')
    mb = mb_source.next_minibatch(1)

    f1 = input(shape=(1, ), needs_gradient=True, name='f')
    res = f1 * 2
    expected = [[200]]

    # Explicit variable -> data mapping
    assert res.eval({f1: mb[f1_si]}) == expected
    # MinibatchData passed directly
    assert res.eval(mb[f1_si]) == expected
    # Value
    assert res.eval(mb[f1_si].data) == expected
    # NumPy (converted back from MinibatchData)
    assert res.eval(mb[f1_si].asarray()) == expected
    # Value again, after the NumPy conversion
    assert res.eval(mb[f1_si].data) == expected
示例#14
0
def test_multiple_mlf_files():
    """An HTK MLF deserializer configured with several MLF files can be read.

    The test runs inside ``data_path`` because the feature, label and mapping
    files are referenced by relative name. The working directory is restored
    to ``abs_path`` even when the test body fails, so a failure here cannot
    leak a changed working directory into later tests.
    """
    feature_dim = 33
    num_classes = 132
    context = 2

    test_mlf_path = "../../../../Tests/EndToEndTests/Speech/Data/glob_00001.mlf"

    features_file = "glob_0000.scp"
    label_files = ["glob_0000.mlf", test_mlf_path]
    label_mapping_file = "state.list"

    os.chdir(data_path)
    try:
        fd = HTKFeatureDeserializer(
            StreamDefs(amazing_features=StreamDef(
                shape=feature_dim, context=(context, context), scp=features_file)))

        ld = HTKMLFDeserializer(
            label_mapping_file,
            StreamDefs(
                awesome_labels=StreamDef(shape=num_classes, mlf=label_files)))

        # Make sure we can read at least one minibatch.
        mbsource = MinibatchSource([fd, ld])
        mbsource.next_minibatch(1)
    finally:
        # Always undo the chdir, including on failure.
        os.chdir(abs_path)
示例#15
0
def create_reader(map_file, mean_file, train, distributed_communicator=None):
    """Create an image reader, optionally bound to a distributed communicator.

    When ``train`` is true a 'Random' crop with 'uniRatio' jitter is applied
    first; images are always scaled and mean-transformed. ``image_width``,
    ``image_height``, ``num_channels`` and ``num_classes`` are read from
    module scope.

    Raises:
        RuntimeError: if ``map_file`` or ``mean_file`` does not exist. The
            message names the download/converter scripts for the running
            Python major version.
    """
    if not (os.path.exists(map_file) and os.path.exists(mean_file)):
        if sys.version_info.major < 3:
            cifar_py3 = ""
        else:
            cifar_py3 = "_py3"
        raise RuntimeError(
            "File '%s' or '%s' does not exist. Please run CifarDownload%s.py and CifarConverter%s.py from CIFAR-10 to fetch them"
            % (map_file, mean_file, cifar_py3, cifar_py3))

    pipeline = []
    if train:
        # Jitter/crop is used only when training.
        pipeline.append(
            ImageDeserializer.crop(crop_type='Random',
                                   ratio=0.8,
                                   jitter_type='uniRatio'))
    pipeline.append(
        ImageDeserializer.scale(width=image_width,
                                height=image_height,
                                channels=num_channels,
                                interpolations='linear'))
    pipeline.append(ImageDeserializer.mean(mean_file))

    # The map file's first column is 'image', the second is 'label'.
    stream_defs = StreamDefs(
        features=StreamDef(field='image', transforms=pipeline),
        labels=StreamDef(field='label', shape=num_classes))
    return MinibatchSource(
        ImageDeserializer(map_file, stream_defs),
        distributed_communicator=distributed_communicator)
示例#16
0
文件: io_tests.py 项目: Shzaidi/CNTK
def test_base64_image_deserializer(tmpdir):
    """Round-trip random PNG images through Base64ImageDeserializer.

    Ten random 5x7 RGB images are PNG-encoded, base64-encoded and written to
    a tab-separated file (sequence id, label, image). A companion CTF file
    maps each sequence id to the image's index. Both files are read through
    one MinibatchSource, and every decoded image is compared with the
    original after reversing its channel order.
    """
    import io, base64, uuid
    from PIL import Image
    images, b64_images = [], []

    # Fixed seed so the generated pixel data is reproducible.
    np.random.seed(1)
    for i in range(10):
        data = np.random.randint(0, 2**8, (5, 7, 3))
        image = Image.fromarray(data.astype('uint8'), "RGB")
        buf = io.BytesIO()
        image.save(buf, format='PNG')
        assert image.width == 7 and image.height == 5
        b64_images.append(base64.b64encode(buf.getvalue()))
        images.append(np.array(image))

    # Image file: one line per image, "<seq_id>\t<label>\t<base64 PNG>\n".
    image_data = str(tmpdir / 'mbdata1.txt')
    seq_ids = []
    uid = uuid.uuid1().int >> 64
    with open(image_data, 'wb') as f:
        for i, data in enumerate(b64_images):
            # XOR with the loop index gives each image a distinct sequence id.
            seq_id = uid ^ i
            seq_id = str(seq_id).encode('ascii')
            seq_ids.append(seq_id)
            line = seq_id + b'\t'
            label = str(i).encode('ascii')
            line += label + b'\t' + data + b'\n'
            f.write(line)

    # CTF file: same sequence ids, each carrying the image's index.
    ctf_data = str(tmpdir / 'mbdata2.txt')
    with open(ctf_data, 'wb') as f:
        for i, sid in enumerate(seq_ids):
            line = sid + b'\t' + b'|index ' + str(i).encode('ascii') + b'\n'
            f.write(line)

    # Scale to the images' own size (7 wide, 5 high, 3 channels).
    transforms = [xforms.scale(width=7, height=5, channels=3)]
    b64_deserializer = Base64ImageDeserializer(
        image_data,
        StreamDefs(images=StreamDef(field='image', transforms=transforms),
                   labels=StreamDef(field='label', shape=10)))

    ctf_deserializer = CTFDeserializer(
        ctf_data, StreamDefs(index=StreamDef(field='index', shape=1)))

    mb_source = MinibatchSource([ctf_deserializer, b64_deserializer])
    assert isinstance(mb_source, MinibatchSource)

    for j in range(100):
        mb = mb_source.next_minibatch(10)

        index_stream = mb_source.streams['index']
        index = mb[index_stream].asarray().flatten()
        image_stream = mb_source.streams['images']

        results = mb[image_stream].asarray()

        for i in range(10):
            # original images are RGB, openCV produces BGR images,
            # reverse the last dimension of the original images
            bgrImage = images[int(index[i])][:, :, ::-1]
            assert (bgrImage == results[i][0]).all()
示例#17
0
def create_reader(map_file, mean_file, train):
    """Create an image reader exposing 'features' and 'labels' streams.

    Images are scaled and mean-transformed. No crop or jitter is applied:
    ``train`` is accepted but currently has no effect on the pipeline.
    ``image_width``, ``image_height``, ``num_channels`` and ``num_classes``
    are read from module scope.
    """
    resize = transforms.scale(width=image_width,
                              height=image_height,
                              channels=num_channels,
                              interpolations='linear')
    subtract_mean = transforms.mean(mean_file)
    pipeline = [resize, subtract_mean]

    # The map file's first column is 'image', the second is 'label'.
    image_stream = StreamDef(field='image', transforms=pipeline)
    label_stream = StreamDef(field='label', shape=num_classes)
    deserializer = ImageDeserializer(
        map_file, StreamDefs(features=image_stream, labels=label_stream))
    return MinibatchSource(deserializer)
示例#18
0
def test_eval_sparse_dense(tmpdir, device_id):
    """Evaluating sparse reader data matches CSR and one-hot encoded input.

    Two sequences are written as sparse CTF data and read back through a
    MinibatchSource. Multiplying the input with an identity matrix must give
    the same result whether the input comes from the reader, from CSR
    matrices, or from ``Value.one_hot``.
    """
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import input, times

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as out:
        out.write(ctf_data)

    stream_defs = StreamDefs(
        features=StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True))
    mbs = MinibatchSource(CTFDeserializer(ctf_file, stream_defs),
                          randomize=False,
                          epoch_size=2)

    raw_input = sequence.input(shape=input_vocab_dim,
                               sequence_axis=Axis('inputAxis'),
                               name='raw_input',
                               is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
                                  input_map={raw_input: mbs.streams.features},
                                  device=cntk_device(device_id))

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # Indices of the S0 entries in ctf_data, one list per sequence.
    one_hot_data = [[3, 4, 5, 4, 7, 12, 1], [60, 61]]

    # CSR with the raw_input encoding in ctf_data
    identity = np.eye(input_vocab_dim, dtype=np.float32)
    csr_data = [csr(identity[indices]) for indices in one_hot_data]
    e_csr = z.eval({raw_input: csr_data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    one_hot_value = Value.one_hot(one_hot_data,
                                  num_classes=input_vocab_dim,
                                  device=cntk_device(device_id))
    e_hot = z.eval({raw_input: one_hot_value}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_hot)])
示例#19
0
def create_mb_source(map_file, image_width, image_height, num_channels, num_classes, randomize=True):
    """Create an image minibatch source with a random-side crop and scaling.

    The source exposes 'features' (the cropped, scaled images) and 'labels'
    (shape ``num_classes``) from ``map_file``.
    """
    pipeline = [
        xforms.crop(crop_type='randomside', side_ratio=0.8),
        xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
    ]
    stream_defs = StreamDefs(
        features=StreamDef(field='image', transforms=pipeline),
        labels=StreamDef(field='label', shape=num_classes))
    return MinibatchSource(ImageDeserializer(map_file, stream_defs), randomize=randomize)
def create_reader(path, is_training, input_dim, num_label_classes):
    """
    Read a CNTK-formatted (CTF) file with 'labels' and 'features' fields.

    Training randomizes the data and repeats it indefinitely; otherwise the
    data is read in order for a single sweep.
    """
    if is_training:
        sweeps = INFINITELY_REPEAT
    else:
        sweeps = 1

    stream_defs = StreamDefs(
        labels=StreamDef(field='labels', shape=num_label_classes),
        features=StreamDef(field='features', shape=input_dim))
    return MinibatchSource(CTFDeserializer(path, stream_defs),
                           randomize=is_training,
                           max_sweeps=sweeps)
def create_reader(path, randomize, input_vocab_dim, label_vocab_dim, size=INFINITELY_REPEAT):
    """Create a CTF reader with sparse 'features' (S0) and 'labels' (S1) streams.

    ``size`` is passed to the source as ``max_samples``.

    Raises:
        RuntimeError: if ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise RuntimeError("File '%s' does not exist." % (path))

    stream_defs = StreamDefs(
        features=StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True))
    deserializer = CTFDeserializer(path, stream_defs)
    return MinibatchSource(deserializer, randomize=randomize, max_samples=size)
def create_reader_raw(path, is_training, input_dim, num_label_classes):
    """
    Read the unstandardized values from the 'rawlabels' and 'rawfeatures' fields.

    Training randomizes the data and repeats it indefinitely; otherwise the
    data is read in order for a single sweep.
    """
    if is_training:
        sweeps = INFINITELY_REPEAT
    else:
        sweeps = 1

    stream_defs = StreamDefs(
        labels=StreamDef(field='rawlabels', shape=num_label_classes),
        features=StreamDef(field='rawfeatures', shape=input_dim))
    return MinibatchSource(CTFDeserializer(path, stream_defs),
                           randomize=is_training,
                           max_sweeps=sweeps)
示例#23
0
def create_reader(path, is_training, input_dim, label_dim):
    """Create a CTF reader with 'features' and 'labels' streams.

    Training randomizes the data and uses ``INFINITELY_REPEAT`` as the epoch
    size; otherwise the epoch size is ``FULL_DATA_SWEEP``.
    """
    if is_training:
        epoch_size = INFINITELY_REPEAT
    else:
        epoch_size = FULL_DATA_SWEEP

    stream_defs = StreamDefs(
        features=StreamDef(field='features', shape=input_dim),
        labels=StreamDef(field='labels', shape=label_dim))
    deserializer = CTFDeserializer(path, stream_defs)
    return MinibatchSource(deserializer,
                           randomize=is_training,
                           epoch_size=epoch_size)
示例#24
0
def test_htk_deserializers():
    """Smoke test: train three minibatches from HTK feature/MLF deserializers.

    The test only verifies that reading and training do not crash. It runs
    inside ``data_path`` because the feature, label and mapping files are
    referenced by relative name. The working directory is restored to
    ``abs_path`` even when the test body fails, so a failure here cannot leak
    a changed working directory into later tests.
    """
    mbsize = 640
    epoch_size = 1000 * mbsize
    lr = [0.001]

    feature_dim = 33
    num_classes = 132
    context = 2

    features_file = "glob_0000.scp"
    labels_file = "glob_0000.mlf"
    label_mapping_file = "state.list"

    os.chdir(data_path)
    try:
        fd = HTKFeatureDeserializer(
            StreamDefs(amazing_features=StreamDef(
                shape=feature_dim, context=(context, context), scp=features_file)))

        ld = HTKMLFDeserializer(
            label_mapping_file,
            StreamDefs(
                awesome_labels=StreamDef(shape=num_classes, mlf=labels_file)))

        reader = MinibatchSource([fd, ld])

        # Each input frame is the center frame plus `context` frames per side.
        features = C.input_variable(((2 * context + 1) * feature_dim))
        labels = C.input_variable((num_classes))

        model = Sequential(
            [For(range(3), lambda: Recurrence(LSTM(256))),
             Dense(num_classes)])
        z = model(features)
        ce = C.cross_entropy_with_softmax(z, labels)
        errs = C.classification_error(z, labels)

        learner = C.adam_sgd(z.parameters,
                             lr=C.learning_rate_schedule(lr, C.UnitType.sample,
                                                         epoch_size),
                             momentum=C.momentum_as_time_constant_schedule(1000),
                             low_memory=True,
                             gradient_clipping_threshold_per_sample=15,
                             gradient_clipping_with_truncation=True)
        trainer = C.Trainer(z, (ce, errs), learner)

        input_map = {
            features: reader.streams.amazing_features,
            labels: reader.streams.awesome_labels
        }

        pp = C.ProgressPrinter(freq=0)
        # just run and verify it doesn't crash
        for i in range(3):
            mb_data = reader.next_minibatch(mbsize, input_map=input_map)
            trainer.train_minibatch(mb_data)
            pp.update_with_trainer(trainer, with_metric=True)
    finally:
        # Always undo the chdir, including on failure.
        os.chdir(abs_path)
示例#25
0
def test_index_caching(tmpdir):
    """Check that reading with a cached index is faster than without one.

    Currently disabled: the unconditional ``pytest.skip`` below fires before
    any of the remaining body runs.

    For three freshly written data files of at least 64 MB each, the test
    times the first ``next_minibatch`` call with ``cacheIndex`` enabled and
    no cache file present, then times a second read after a cache file has
    appeared, and finally asserts the accumulated cached time is smaller.
    """
    pytest.skip("test_index_caching is disabled")
    import os, time, glob, uuid
    MB = 1 << 20
    data = MBDATA_DENSE_1
    # Double the payload until it reaches at least 64 MB.
    while(len(data) < 64 * MB):
        data += data

    timeWithoutCache, timeWithCache = 0, 0 

    cpu=C.device.cpu()
    streams = stream_defs[0]

    for _ in range(3):
        # A unique file name per iteration.
        tmpfile = _write_data(tmpdir, data, str(uuid.uuid4()))

        # Remove leftover cache files so the first read starts without a cache.
        cache_files = glob.glob(str(tmpdir + '/*.cache'))
        for cache_file in cache_files:
            os.remove(cache_file)

        config = CTFDeserializer(tmpfile, streams)
        config['cacheIndex'] = C.cntk_py.DictionaryValue(True)

        start = time.time()
        MinibatchSource(config, randomize=False).next_minibatch(1, device=cpu)
        end = time.time()

        timeWithoutCache += (end - start)

        # NOTE(review): presumably waits for the cache file to be written - confirm.
        time.sleep(5)
        
        cache_files = glob.glob(str(tmpdir + '/*.cache'))
        assert len(cache_files) == 1


        start = time.time()
        MinibatchSource(config, randomize=False).next_minibatch(1, device=cpu)
        end = time.time()

        os.remove(tmpfile)

        timeWithCache += (end - start)

    assert timeWithCache < timeWithoutCache
示例#26
0
    def compare_cbf_and_ctf(num_mbs, mb_size, randomize):
        """Assert that the CTF and CBF readers yield identical minibatches.

        Reads ``num_mbs`` minibatches of size ``mb_size`` from ``tmpfile``
        (CTF) and ``tmpfile + '.bin'`` (CBF) and compares every stream's
        counts, shape, flags, data and mask. ``tmpfile``, ``streams`` and
        ``device`` come from the enclosing scope.
        """
        ctf = MinibatchSource(CTFDeserializer(tmpfile, streams),
                              randomize=randomize)
        cbf = MinibatchSource(CBFDeserializer(tmpfile + '.bin', streams),
                              randomize=randomize)

        # Both sources must expose the same set of stream names.
        ctf_stream_names = sorted([x.m_name for x in ctf.stream_infos()])
        cbf_stream_names = sorted([x.m_name for x in cbf.stream_infos()])

        assert (ctf_stream_names == cbf_stream_names)
        for _ in range(num_mbs):
            ctf_mb = ctf.next_minibatch(mb_size, device=device)
            cbf_mb = cbf.next_minibatch(mb_size, device=device)

            for name in cbf_stream_names:
                ctf_data = ctf_mb[ctf[name]]
                cbf_data = cbf_mb[cbf[name]]

                assert ctf_data.num_samples == cbf_data.num_samples
                assert ctf_data.num_sequences == cbf_data.num_sequences
                assert ctf_data.shape == cbf_data.shape
                assert ctf_data.end_of_sweep == cbf_data.end_of_sweep
                assert ctf_data.is_sparse == cbf_data.is_sparse
                assert ctf_data.data.masked_count(
                ) == cbf_data.data.masked_count()

                # XXX:
                # assert(ctf_data.asarray() == cbf_data.asarray()).all()
                # is not used, because for sparse values asarray fails with
                # the exception "sum of the rank of the mask and Variable
                # rank does not equal the Value's rank".

                assert C.cntk_py.are_equal(ctf_data.data.data,
                                           cbf_data.data.data)

                # Compare masks only when something is masked (see below).
                if (ctf_data.data.masked_count() > 0):
                    assert (ctf_data.data.mask == cbf_data.data.mask).all()
                # XXX: if masked_count is zero, accessing data.mask fails with
                # "AttributeError: 'Value' object has no attribute 'mask'"!

                # XXX: without invoking erase, next_minibatch will fail with:
                # "Resize: Cannot resize the matrix because it is a view."
                ctf_data.data.erase()
                cbf_data.data.erase()
示例#27
0
def test_one_sweep(tmpdir):
    """Four equivalent ways of requesting one sweep each yield one minibatch.

    For every configuration the first minibatch holds all 4 samples and is
    flagged as end of sweep; the next request returns an empty result.
    """
    ctf = create_ctf_deserializer(tmpdir)
    one_sweep_configs = (
        dict(max_sweeps=1),
        dict(max_samples=FULL_DATA_SWEEP),
        dict(max_sweeps=1, max_samples=INFINITELY_REPEAT),
        dict(max_samples=FULL_DATA_SWEEP, max_sweeps=INFINITELY_REPEAT),
    )
    # All sources are created up front, before any of them is read.
    sources = [MinibatchSource(ctf, **config) for config in one_sweep_configs]

    for source in sources:
        input_map = {'features': source['features']}

        first = source.next_minibatch(100, input_map)
        assert 'features' in first
        assert first['features'].num_samples == 4
        assert first['features'].end_of_sweep

        second = source.next_minibatch(100, input_map)
        assert not second
示例#28
0
文件: io_tests.py 项目: s4sarath/CNTK
def test_minibatch(tmpdir):
    """Read two sequences with partially missing labels and check values and masks."""
    mbdata = r'''0	|S0 0   |S1 0
0	|S0 1 	|S1 1 
0	|S0 2 	
0	|S0 3 	|S1 3 
1	|S0 4 	
1	|S0 5 	|S1 1
1	|S0 6	|S1 2 
'''

    tmpfile = str(tmpdir/'mbtest.txt')
    with open(tmpfile, 'w') as out:
        out.write(mbdata)

    from cntk.io import CTFDeserializer, MinibatchSource, StreamDef, StreamDefs
    stream_defs = StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, stream_defs))

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)
    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    # (stream info, expected per-sequence values, expected mask)
    checks = [
        (features_si,
         [[[0], [1], [2], [3]],
          [[4], [5], [6]]],
         [[2, 1, 1, 1],
          [2, 1, 1, 0]]),
        (labels_si,
         [[[0], [1], [3]],
          [[1], [2]]],
         [[2, 1, 1],
          [2, 1, 0]]),
    ]

    for stream_info, expected_values, expected_mask in checks:
        stream = mb[stream_info]
        assert len(stream.value) == 2
        for res, exp in zip(stream.value, expected_values):
            assert np.allclose(res, exp)
        assert np.allclose(stream.mask, expected_mask)
示例#29
0
def create_reader(path, is_training, input_dim, output_dim):
    """Create a CTF reader with dense 'features' (attribs) and 'labels' (species).

    Training randomizes the data and repeats it indefinitely; otherwise the
    data is read in order for a single sweep.
    """
    if is_training:
        sweeps = INFINITELY_REPEAT
    else:
        sweeps = 1

    stream_defs = StreamDefs(
        features=StreamDef(field='attribs', shape=input_dim, is_sparse=False),
        labels=StreamDef(field='species', shape=output_dim, is_sparse=False))
    deserializer = CTFDeserializer(path, stream_defs)
    return MinibatchSource(deserializer,
                           randomize=is_training,
                           max_sweeps=sweeps)
示例#30
0
def create_reader(path, is_training):
    """Create a CTF reader with sparse 'features' (S0) and 'labels' (S1) streams.

    The stream shapes come from the module-level ``input_vocab_dim`` and
    ``label_vocab_dim``. Training randomizes the data and repeats it
    indefinitely; otherwise the data is read in order for a single sweep.
    """
    if is_training:
        sweeps = INFINITELY_REPEAT
    else:
        sweeps = 1

    stream_defs = StreamDefs(
        features=StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True))
    deserializer = CTFDeserializer(path, stream_defs)
    return MinibatchSource(deserializer,
                           randomize=is_training,
                           max_sweeps=sweeps)