Example #1
def test_multiple_mlf_files():
    os.chdir(data_path)

    feature_dim = 33
    num_classes = 132
    context = 2

    test_mlf_path = "../../../../Tests/EndToEndTests/Speech/Data/glob_00001.mlf"

    features_file = "glob_0000.scp"
    label_files = ["glob_0000.mlf", test_mlf_path]
    label_mapping_file = "state.list"

    fd = HTKFeatureDeserializer(
        StreamDefs(amazing_features=StreamDef(
            shape=feature_dim, context=(context, context), scp=features_file)))

    ld = HTKMLFDeserializer(
        label_mapping_file,
        StreamDefs(
            awesome_labels=StreamDef(shape=num_classes, mlf=label_files)))

    # Make sure we can read at least one minibatch.
    mbsource = MinibatchSource([fd, ld])
    mbsource.next_minibatch(1)

    os.chdir(abs_path)
Example #2
def test_text_format(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)

    input_dim = 1000
    num_output_classes = 5

    mb_source = MinibatchSource(CTFDeserializer(
        tmpfile,
        StreamDefs(features=StreamDef(field='x',
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='y',
                                    shape=num_output_classes,
                                    is_sparse=False))),
                                randomize=False)

    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(7)

    features = mb[features_si]
    # 2 samples, max seq len 4, 1000 dim
    assert features.shape == (2, 4, input_dim)
    assert features.end_of_sweep
    assert features.num_sequences == 2
    assert features.num_samples == 7
    assert features.is_sparse

    labels = mb[labels_si]
    # 2 samples, max seq len 1, 5 dim
    assert labels.shape == (2, 1, num_output_classes)
    assert labels.end_of_sweep
    assert labels.num_sequences == 2
    assert labels.num_samples == 2
    assert not labels.is_sparse

    label_data = labels.asarray()
    assert np.allclose(
        label_data, np.asarray([[[1., 0., 0., 0., 0.]], [[0., 1., 0., 0.,
                                                          0.]]]))

    mb = mb_source.next_minibatch(1)
    features = mb[features_si]
    labels = mb[labels_si]

    assert not features.end_of_sweep
    assert not labels.end_of_sweep
    assert features.num_samples < 7
    assert labels.num_samples == 1
Example #3
def test_max_samples(tmpdir):
    mb_source = MinibatchSource(create_ctf_deserializer(tmpdir), max_samples=1)

    input_map = {'features': mb_source['features']}
    mb = mb_source.next_minibatch(10, input_map)

    assert 'features' in mb
    assert mb['features'].num_samples == 1
    assert not mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(10, input_map)

    assert not mb
Example #4
def test_max_samples(tmpdir):
    mb_source = MinibatchSource(
        create_ctf_deserializer(tmpdir), max_samples=1)

    input_map = {'features': mb_source['features']}
    mb = mb_source.next_minibatch(10, input_map)

    assert 'features' in mb
    assert mb['features'].num_samples == 1
    assert not mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(10, input_map)

    assert not mb
Example #5
def test_text_format(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)

    input_dim = 1000
    num_output_classes = 5

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False)
    )), randomize=False)

    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(7)

    features = mb[features_si]
    # 2 samples, max seq len 4, 1000 dim
    assert features.shape == (2, 4, input_dim)
    assert features.end_of_sweep
    assert features.num_sequences == 2
    assert features.num_samples == 7
    assert features.is_sparse

    labels = mb[labels_si]
    # 2 samples, max seq len 1, 5 dim
    assert labels.shape == (2, 1, num_output_classes)
    assert labels.end_of_sweep
    assert labels.num_sequences == 2
    assert labels.num_samples == 2
    assert not labels.is_sparse

    label_data = labels.asarray()
    assert np.allclose(label_data,
                       np.asarray([
                           [[1.,  0.,  0.,  0.,  0.]],
                           [[0.,  1.,  0.,  0.,  0.]]
                       ]))

    mb = mb_source.next_minibatch(1)
    features = mb[features_si]
    labels = mb[labels_si]

    assert not features.end_of_sweep
    assert not labels.end_of_sweep
    assert features.num_samples < 7
    assert labels.num_samples == 1
Example #6
def test_large_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_2)

    mb_source = MinibatchSource(CTFDeserializer(
        tmpfile,
        StreamDefs(features=StreamDef(field='S0', shape=1),
                   labels=StreamDef(field='S1', shape=1))),
                                randomization_window_in_chunks=0)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)
    features = mb[features_si]
    labels = mb[labels_si]

    # Actually, the minibatch spans over multiple sweeps,
    # not sure if this is an artificial situation, but
    # maybe instead of a boolean flag we should indicate
    # the largest sweep index the data was taken from.
    assert features.end_of_sweep
    assert labels.end_of_sweep

    assert features.num_samples == 1000 - 1000 % 7
    assert labels.num_samples == 5 * (1000 // 7)

    assert mb[features_si].num_sequences == (1000 // 7)
    assert mb[labels_si].num_sequences == (1000 // 7)
Example #7
def test_MinibatchData_and_Value_as_input(tmpdir):

    mbdata = r'''0  |S0 100'''

    tmpfile = str(tmpdir / 'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    defs = StreamDefs(f1=StreamDef(field='S0', shape=1))
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, defs),
                                randomize=False)

    f1_si = mb_source.stream_info('f1')

    mb = mb_source.next_minibatch(1)

    f1 = input(shape=(1, ), needs_gradient=True, name='f')
    res = f1 * 2

    assert res.eval({f1: mb[f1_si]}) == [[200]]
    # Test MinibatchData
    assert res.eval(mb[f1_si]) == [[200]]
    # Test Value
    assert res.eval(mb[f1_si].data) == [[200]]
    # Test NumPy (converted back from MinibatchData)
    assert res.eval(mb[f1_si].asarray()) == [[200]]
    # Test Value
    assert res.eval(mb[f1_si].data) == [[200]]
Example #8
def test_large_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_2)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features  = StreamDef(field='S0', shape=1),
        labels    = StreamDef(field='S1', shape=1))),
        randomization_window_in_chunks=0)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)
    features = mb[features_si]
    labels = mb[labels_si]

    # Actually, the minibatch spans over multiple sweeps,
    # not sure if this is an artificial situation, but
    # maybe instead of a boolean flag we should indicate
    # the largest sweep index the data was taken from.
    assert features.end_of_sweep
    assert labels.end_of_sweep

    assert features.num_samples == 1000 - 1000 % 7
    assert labels.num_samples == 5 * (1000 // 7)

    assert mb[features_si].num_sequences == (1000 // 7)
    assert mb[labels_si].num_sequences == (1000 // 7)
Example #9
def test_base64_image_deserializer(tmpdir):
    import io, base64, uuid
    from PIL import Image
    images, b64_images = [], []

    np.random.seed(1)
    for i in range(10):
        data = np.random.randint(0, 2**8, (5, 7, 3))
        image = Image.fromarray(data.astype('uint8'), "RGB")
        buf = io.BytesIO()
        image.save(buf, format='PNG')
        assert image.width == 7 and image.height == 5
        b64_images.append(base64.b64encode(buf.getvalue()))
        images.append(np.array(image))

    image_data = str(tmpdir / 'mbdata1.txt')
    seq_ids = []
    uid = uuid.uuid1().int >> 64
    with open(image_data, 'wb') as f:
        for i, data in enumerate(b64_images):
            seq_id = uid ^ i
            seq_id = str(seq_id).encode('ascii')
            seq_ids.append(seq_id)
            line = seq_id + b'\t'
            label = str(i).encode('ascii')
            line += label + b'\t' + data + b'\n'
            f.write(line)

    ctf_data = str(tmpdir / 'mbdata2.txt')
    with open(ctf_data, 'wb') as f:
        for i, sid in enumerate(seq_ids):
            line = sid + b'\t' + b'|index ' + str(i).encode('ascii') + b'\n'
            f.write(line)

    transforms = [xforms.scale(width=7, height=5, channels=3)]
    b64_deserializer = Base64ImageDeserializer(
        image_data,
        StreamDefs(images=StreamDef(field='image', transforms=transforms),
                   labels=StreamDef(field='label', shape=10)))

    ctf_deserializer = CTFDeserializer(
        ctf_data, StreamDefs(index=StreamDef(field='index', shape=1)))

    mb_source = MinibatchSource([ctf_deserializer, b64_deserializer])
    assert isinstance(mb_source, MinibatchSource)

    for j in range(100):
        mb = mb_source.next_minibatch(10)

        index_stream = mb_source.streams['index']
        index = mb[index_stream].asarray().flatten()
        image_stream = mb_source.streams['images']

        results = mb[image_stream].asarray()

        for i in range(10):
            # original images are RGB, openCV produces BGR images,
            # reverse the last dimension of the original images
            bgrImage = images[int(index[i])][:, :, ::-1]
            # transpose to the CHW layout produced by the image reader
            bgrImage = np.transpose(bgrImage, (2, 0, 1))
            assert (bgrImage == results[i][0]).all()
Example #10
def test_MinibatchData_and_Value_as_input(tmpdir):

    mbdata = r'''0  |S0 100'''

    tmpfile = str(tmpdir/'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    defs = StreamDefs(f1 = StreamDef(field='S0', shape=1))
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, defs),
                                randomize=False)

    f1_si = mb_source.stream_info('f1')

    mb = mb_source.next_minibatch(1)

    f1 = input_variable(shape=(1,),
                       needs_gradient=True,
                       name='f')
    res = f1 * 2

    assert res.eval({f1: mb[f1_si]}) == [[200]]
    # Test MinibatchData
    assert res.eval(mb[f1_si]) == [[200]]
    # Test Value
    assert res.eval(mb[f1_si].data) == [[200]]
    # Test NumPy (converted back from MinibatchData)
    assert res.eval(mb[f1_si].value) == [[200]]
    # Test Value
    assert res.eval(mb[f1_si].data) == [[200]]
Example #11
def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import input, times

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_vocab_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=label_vocab_dim,
                                    is_sparse=True))),
                          randomize=False,
                          epoch_size=2)

    raw_input = sequence.input(shape=input_vocab_dim,
                               sequence_axis=Axis('inputAxis'),
                               name='raw_input',
                               is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
                                  input_map={raw_input: mbs.streams.features},
                                  device=cntk_device(device_id))

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [[3, 4, 5, 4, 7, 12, 1], [60, 61]]
    data = [
        csr(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in one_hot_data
    ]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = Value.one_hot(one_hot_data,
                         num_classes=input_vocab_dim,
                         device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_hot)])
Example #12
def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.device import cpu, gpu, set_default_device
    from cntk.ops import input_variable, times
    from scipy.sparse import csr_matrix

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=input_vocab_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=label_vocab_dim,  is_sparse=True)
    )), randomize=False, epoch_size = 2)

    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=input_vocab_dim, dynamic_axes=input_dynamic_axes,
        name='raw_input', is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100, 
            input_map={raw_input : mbs.streams.features})

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid)

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [
            [3, 4, 5, 4, 7, 12, 1], 
            [60, 61]
            ]
    data = [csr_matrix(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in
            one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = one_hot(one_hot_data, num_classes=input_vocab_dim)
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_hot)])
Example #13
def test_htk_deserializers():
    mbsize = 640
    epoch_size = 1000 * mbsize
    lr = [0.001]

    feature_dim = 33
    num_classes = 132
    context = 2

    os.chdir(data_path)

    features_file = "glob_0000.scp"
    labels_file = "glob_0000.mlf"
    label_mapping_file = "state.list"

    fd = HTKFeatureDeserializer(
        StreamDefs(amazing_features=StreamDef(
            shape=feature_dim, context=(context, context), scp=features_file)))

    ld = HTKMLFDeserializer(
        label_mapping_file,
        StreamDefs(
            awesome_labels=StreamDef(shape=num_classes, mlf=labels_file)))

    reader = MinibatchSource([fd, ld])

    features = C.input_variable(((2 * context + 1) * feature_dim))
    labels = C.input_variable((num_classes))

    model = Sequential(
        [For(range(3), lambda: Recurrence(LSTM(256))),
         Dense(num_classes)])
    z = model(features)
    ce = C.cross_entropy_with_softmax(z, labels)
    errs = C.classification_error(z, labels)

    learner = C.adam_sgd(z.parameters,
                         lr=C.learning_rate_schedule(lr, C.UnitType.sample,
                                                     epoch_size),
                         momentum=C.momentum_as_time_constant_schedule(1000),
                         low_memory=True,
                         gradient_clipping_threshold_per_sample=15,
                         gradient_clipping_with_truncation=True)
    trainer = C.Trainer(z, (ce, errs), learner)

    input_map = {
        features: reader.streams.amazing_features,
        labels: reader.streams.awesome_labels
    }

    pp = C.ProgressPrinter(freq=0)
    # just run and verify it doesn't crash
    for i in range(3):
        mb_data = reader.next_minibatch(mbsize, input_map=input_map)
        trainer.train_minibatch(mb_data)
        pp.update_with_trainer(trainer, with_metric=True)
    assert True
    os.chdir(abs_path)
Example #14
    def compare_cbf_and_ctf(num_mbs, mb_size, randomize):
        ctf = MinibatchSource(CTFDeserializer(tmpfile, streams),
                              randomize=randomize)
        cbf = MinibatchSource(CBFDeserializer(tmpfile + '.bin', streams),
                              randomize=randomize)

        ctf_stream_names = sorted([x.m_name for x in ctf.stream_infos()])
        cbf_stream_names = sorted([x.m_name for x in cbf.stream_infos()])

        assert (ctf_stream_names == cbf_stream_names)
        for _ in range(num_mbs):
            ctf_mb = ctf.next_minibatch(mb_size, device=device)
            cbf_mb = cbf.next_minibatch(mb_size, device=device)

            for name in cbf_stream_names:
                ctf_data = ctf_mb[ctf[name]]
                cbf_data = cbf_mb[cbf[name]]

                assert ctf_data.num_samples == cbf_data.num_samples
                assert ctf_data.num_sequences == cbf_data.num_sequences
                assert ctf_data.shape == cbf_data.shape
                assert ctf_data.end_of_sweep == cbf_data.end_of_sweep
                assert ctf_data.is_sparse == cbf_data.is_sparse
                assert ctf_data.data.masked_count() == cbf_data.data.masked_count()

                # XXX:
                # assert(ctf_data.asarray() == cbf_data.asarray()).all()
                # not using asarray because for sparse values it fails with
                # some strange exception "sum of the rank of the mask and Variable
                # rank does not equal the Value's rank".

                assert C.cntk_py.are_equal(ctf_data.data.data,
                                           cbf_data.data.data)

                if (ctf_data.data.masked_count() > 0):
                    assert (ctf_data.data.mask == cbf_data.data.mask).all()
                # XXX: if mask_count is zero, mb_data.data.mask fails with
                # "AttributeError: 'Value' object has no attribute 'mask'"!

                # XXX: without invoking erase, next_minibatch will fail with:
                # "Resize: Cannot resize the matrix because it is a view."
                ctf_data.data.erase()
                cbf_data.data.erase()
Example #15
def test_mlf_binary_files():
    os.chdir(data_path)

    feature_dim = 33
    num_classes = 132
    context = 2

    features_file = "glob_0000.scp"

    fd = HTKFeatureDeserializer(StreamDefs(
        amazing_features = StreamDef(shape=feature_dim, context=(context,context), scp=features_file)))

    ld = HTKMLFBinaryDeserializer(StreamDefs(awesome_labels = StreamDef(shape=num_classes, mlf=e2e_data_path + "mlf2.bin")))

    # Make sure we can read at least one minibatch.
    mbsource = MinibatchSource([fd,ld])
    mbsource.next_minibatch(1)

    os.chdir(abs_path)
Example #16
def test_minibatch(tmpdir):

    mbdata = r'''0	|S0 0   |S1 0
0	|S0 1 	|S1 1 
0	|S0 2 	
0	|S0 3 	|S1 3 
1	|S0 4 	
1	|S0 5 	|S1 1
1	|S0 6	|S1 2 
'''

    tmpfile = str(tmpdir/'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    from cntk.io import CTFDeserializer, MinibatchSource, StreamDef, StreamDefs
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features  = StreamDef(field='S0', shape=1),
        labels    = StreamDef(field='S1', shape=1))))
     
    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')
    
    mb = mb_source.next_minibatch(1000)
    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert len(features.value) == 2
    expected_features = \
            [
                [[0],[1],[2],[3]],
                [[4],[5],[6]]
            ]

    for res, exp in zip (features.value, expected_features):
        assert np.allclose(res, exp)

    assert np.allclose(features.mask, 
            [[2, 1, 1, 1],
             [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert len(labels.value) == 2
    expected_labels = \
            [
                [[0],[1],[3]], 
                [[1],[2]]
            ]
    for res, exp in zip (labels.value, expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.mask, 
            [[2, 1, 1],
             [2, 1, 0]])
Example #17
    def compare_cbf_and_ctf(num_mbs, mb_size, randomize):
        ctf = MinibatchSource(CTFDeserializer(tmpfile, streams), randomize=randomize)
        cbf = MinibatchSource(CBFDeserializer(tmpfile+'.bin', streams), randomize=randomize)

        ctf_stream_names = sorted([x.m_name for x in ctf.stream_infos()])
        cbf_stream_names = sorted([x.m_name for x in cbf.stream_infos()])

        assert(ctf_stream_names == cbf_stream_names)
        for _ in range(num_mbs):
            ctf_mb = ctf.next_minibatch(mb_size, device=device)
            cbf_mb = cbf.next_minibatch(mb_size, device=device)

            for name in cbf_stream_names:
                ctf_data = ctf_mb[ctf[name]]
                cbf_data = cbf_mb[cbf[name]]

                
                assert ctf_data.num_samples == cbf_data.num_samples
                assert ctf_data.num_sequences == cbf_data.num_sequences
                assert ctf_data.shape == cbf_data.shape
                assert ctf_data.end_of_sweep == cbf_data.end_of_sweep
                assert ctf_data.is_sparse == cbf_data.is_sparse
                assert ctf_data.data.masked_count() == cbf_data.data.masked_count()

                # XXX:
                # assert(ctf_data.asarray() == cbf_data.asarray()).all()
                # not using asarray because for sparse values it fails with
                # some strange exception "sum of the rank of the mask and Variable 
                # rank does not equal the Value's rank".

                assert C.cntk_py.are_equal(ctf_data.data.data, cbf_data.data.data)

                if (ctf_data.data.masked_count() > 0):
                    assert (ctf_data.data.mask == cbf_data.data.mask).all()
                # XXX: if mask_count is zero, mb_data.data.mask fails with 
                # "AttributeError: 'Value' object has no attribute 'mask'"!

                # XXX: without invoking erase, next_minibatch will fail with:
                # "Resize: Cannot resize the matrix because it is a view."
                ctf_data.data.erase()
                cbf_data.data.erase()
Example #18
def train():
	global sentences, vocabulary, reverse_vocabulary
	# function will create the trainer and train it for specified number of epochs
	# Print loss 50 times while training
	print_freqency = 50
	pp = ProgressPrinter(print_freqency)

	# get the trainer
	word_one_hot, context_one_hots, negative_one_hots, targets, trainer, word_negative_context_product, embedding_layer = create_trainer()
	
	# Create a CTF reader which reads the sparse inputs
	print("reader started")
	reader = CTFDeserializer(G.CTF_input_file)
	reader.map_input(G.word_input_field, dim=G.embedding_vocab_size, format="sparse")
	# context inputs
	for i in range(context_size):
		reader.map_input(G.context_input_field.format(i), dim=G.embedding_vocab_size, format="sparse")
	# negative inputs
	for i in range(G.negative):
		reader.map_input(G.negative_input_field.format(i), dim=G.embedding_vocab_size, format="sparse")
	# targets
	reader.map_input(G.target_input_field, dim=(G.negative + 1), format="dense")
	print("reader done")

	# Get minibatch source from reader
	is_training = True
	minibatch_source = MinibatchSource(reader, randomize=is_training, epoch_size=INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)
	minibatch_source.streams[targets] = minibatch_source.streams[G.target_input_field]
	del minibatch_source.streams[G.target_input_field]
	print("minibatch source done")
	
	total_minibatches = total_training_instances // G.minibatch_size
	print("training started")
	print("Total minibatches to train =", total_minibatches)
	for i in range(total_minibatches):
		# Collect minibatch
		# start_batch_collection = time.time()
		mb = minibatch_source.next_minibatch(G.minibatch_size, input_map=minibatch_source.streams)
		# end_batch_collection = time.time()
		# print("Batch collection time = %.6fsecs" % (end_batch_collection - start_batch_collection))
		# print("Time taken to collect one training_instance = %.6fsecs" % ((end_batch_collection - start_batch_collection)/G.minibatch_size))
		# Train minibatch
		# start_train = time.time()
		trainer.train_minibatch(mb)
		# end_train = time.time()
		# print("minibatch train time = %.6fsecs" % (end_train - start_train))
		# print("Time per training instance = %.6fsecs" % ((end_train - start_train)/G.minibatch_size))
		# Update progress printer
		pp.update_with_trainer(trainer)

		# start_batch_collection = time.time()
	print("Total training instances =", total_training_instances)
	return word_negative_context_product
Example #19
def test_mlf_binary_files():
    os.chdir(data_path)

    feature_dim = 33
    num_classes = 132
    context = 2

    features_file = "glob_0000.scp"

    fd = HTKFeatureDeserializer(
        StreamDefs(amazing_features=StreamDef(
            shape=feature_dim, context=(context, context), scp=features_file)))

    ld = HTKMLFBinaryDeserializer(
        StreamDefs(awesome_labels=StreamDef(shape=num_classes,
                                            mlf=e2e_data_path + "mlf2.bin")))

    # Make sure we can read at least one minibatch.
    mbsource = MinibatchSource([fd, ld])
    mbsource.next_minibatch(1)

    os.chdir(abs_path)
Example #20
def test_max_samples_over_several_sweeps(tmpdir):
    mb_source = MinibatchSource(
        create_ctf_deserializer(tmpdir), max_samples=11)

    input_map = {'features': mb_source['features']}

    for i in range(2):
        mb = mb_source.next_minibatch(5, input_map)

        assert 'features' in mb
        assert mb['features'].num_samples == 5
        assert mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(5, input_map)

    assert 'features' in mb
    assert mb['features'].num_samples == 1
    assert not mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(1, input_map)

    assert not mb
Example #21
def test_max_sweeps(tmpdir):
    # set max sweeps to 3 (12 samples altogether).
    mb_source = MinibatchSource(create_ctf_deserializer(tmpdir), max_sweeps=3)

    input_map = {'features': mb_source['features']}

    for i in range(2):
        mb = mb_source.next_minibatch(5, input_map)

        assert 'features' in mb
        assert mb['features'].num_samples == 5
        assert mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(5, input_map)

    assert 'features' in mb
    assert mb['features'].num_samples == 2
    assert mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(1, input_map)

    assert not mb
Example #22
def test_max_samples_over_several_sweeps(tmpdir):
    mb_source = MinibatchSource(create_ctf_deserializer(tmpdir),
                                max_samples=11)

    input_map = {'features': mb_source['features']}

    for i in range(2):
        mb = mb_source.next_minibatch(5, input_map)

        assert 'features' in mb
        assert mb['features'].num_samples == 5
        assert mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(5, input_map)

    assert 'features' in mb
    assert mb['features'].num_samples == 1
    assert not mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(1, input_map)

    assert not mb
Example #23
def test_text_format(tmpdir):
    from cntk.io import CTFDeserializer, MinibatchSource, StreamDef, StreamDefs

    mbdata = r'''0	|x 560:1	|y 1 0 0 0 0
0	|x 0:1
0	|x 0:1
1	|x 560:1	|y 0 1 0 0 0
1	|x 0:1
1	|x 0:1
1	|x 424:1
'''
    tmpfile = str(tmpdir/'mbdata.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    input_dim = 1000
    num_output_classes = 5

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
         features  = StreamDef(field='x', shape=input_dim, is_sparse=True),
         labels    = StreamDef(field='y', shape=num_output_classes, is_sparse=False)
       )))

    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(7)

    features = mb[features_si]
    # 2 samples, max seq len 4, 1000 dim
    assert features.shape == (2, 4, input_dim)
    assert features.is_sparse
    # TODO features is sparse and cannot be accessed right now:
    # *** RuntimeError: DataBuffer/WritableDataBuffer methods can only be called for NDArrayiew objects with dense storage format
    # 2 samples, max seq len 4, 1000 dim
    #assert features.data().shape().dimensions() == (2, 4, input_dim)
    #assert features.data().is_sparse()

    labels = mb[labels_si]
    # 2 samples, max seq len 1, 5 dim
    assert labels.shape == (2, 1, num_output_classes)
    assert not labels.is_sparse

    label_data = np.asarray(labels)
    assert np.allclose(label_data,
            np.asarray([
                [[ 1.,  0.,  0.,  0.,  0.]],
                [[ 0.,  1.,  0.,  0.,  0.]]
                ]))
Example #24
def test_max_sweeps(tmpdir):
    # set max sweeps to 3 (12 samples altogether).
    mb_source = MinibatchSource(
        create_ctf_deserializer(tmpdir), max_sweeps=3)

    input_map = {'features': mb_source['features']}

    for i in range(2):
        mb = mb_source.next_minibatch(5, input_map)

        assert 'features' in mb
        assert mb['features'].num_samples == 5
        assert mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(5, input_map)

    assert 'features' in mb
    assert mb['features'].num_samples == 2
    assert mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(1, input_map)

    assert not mb
Example #25
def test_prefetch_with_unpacking(tmpdir):
    data = r'''0  |S0 1 1 1 1   |S1 1000
1   |S0 2 2 2 2  |S1 100
2   |S0 3 3 3 3  |S1 100
3   |S0 1 1 1 1  |S1 10
4   |S0 2 2 2 2  |S1 1
5   |S0 3 3 3 3  |S1 2000
6   |S0 1 1 1 1  |S1 200
7   |S0 2 2 2 2  |S1 200
8   |S0 3 3 3 3  |S1 20
9   |S0 1 1 1 1  |S1 2
'''
    import time
    tmpfile = _write_data(tmpdir, data)

    input_dim = 4
    num_output_classes = 1

    mb_source = MinibatchSource(CTFDeserializer(
        tmpfile,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_dim,
                                      is_sparse=False),
                   labels=StreamDef(field='S1',
                                    shape=num_output_classes,
                                    is_sparse=False))),
                                randomize=False,
                                max_samples=FULL_DATA_SWEEP)

    input_map = {
        'S0': mb_source.streams.features,
        'S1': mb_source.streams.labels
    }
    empty = False
    mb_size = 3
    # On the last minibatch a resize will be called,
    # since 10 % 3 leaves 1 sample in the minibatch
    while not empty:
        mb = mb_source.next_minibatch(mb_size, input_map=input_map)
        time.sleep(1)  # make sure the prefetch kicks in
        if mb:
            # Force unpacking to check that we do
            # not break prefetch
            actual_size = mb['S0'].shape[0]
            assert (mb['S0'].asarray() == np.array(
                [[[1, 1, 1, 1]], [[2, 2, 2, 2]], [[3, 3, 3, 3]]],
                dtype=np.float32)[0:actual_size]).all()
        else:
            empty = True
Example #26
def test_htk_deserializers():
    mbsize = 640
    epoch_size = 1000 * mbsize
    lr = [0.001]

    feature_dim = 33
    num_classes = 132
    context = 2

    os.chdir(data_path)

    features_file = "glob_0000.scp"
    labels_file = "glob_0000.mlf"
    label_mapping_file = "state.list"

    fd = HTKFeatureDeserializer(StreamDefs(
        amazing_features = StreamDef(shape=feature_dim, context=(context,context), scp=features_file)))

    ld = HTKMLFDeserializer(label_mapping_file, StreamDefs(
        awesome_labels = StreamDef(shape=num_classes, mlf=labels_file)))

    reader = MinibatchSource([fd,ld])

    features = C.input_variable(((2*context+1)*feature_dim))
    labels = C.input_variable((num_classes))

    model = Sequential([For(range(3), lambda : Recurrence(LSTM(256))),
                        Dense(num_classes)])
    z = model(features)
    ce = C.cross_entropy_with_softmax(z, labels)
    errs = C.classification_error    (z, labels)

    learner = C.adam_sgd(z.parameters,
                    lr=C.learning_rate_schedule(lr, C.UnitType.sample, epoch_size),
                    momentum=C.momentum_as_time_constant_schedule(1000),
                    low_memory=True,
                    gradient_clipping_threshold_per_sample=15, gradient_clipping_with_truncation=True)
    trainer = C.Trainer(z, (ce, errs), learner)

    input_map={ features: reader.streams.amazing_features, labels: reader.streams.awesome_labels }

    pp = C.ProgressPrinter(freq=0)
    # just run and verify it doesn't crash
    for i in range(3):
        mb_data = reader.next_minibatch(mbsize, input_map=input_map)
        trainer.train_minibatch(mb_data)
        pp.update_with_trainer(trainer, with_metric=True)
    assert True
    os.chdir(abs_path)
Example #27
def test_multiple_mlf_files():
    os.chdir(data_path)

    feature_dim = 33
    num_classes = 132
    context = 2

    test_mlf_path = e2e_data_path+"glob_00001.mlf"

    features_file = "glob_0000.scp"
    label_files = [ "glob_0000.mlf", test_mlf_path]
    label_mapping_file = "state.list"

    fd = HTKFeatureDeserializer(StreamDefs(
        amazing_features = StreamDef(shape=feature_dim, context=(context,context), scp=features_file)))

    ld = HTKMLFDeserializer(label_mapping_file, StreamDefs(
        awesome_labels = StreamDef(shape=num_classes, mlf=label_files)))

    # Make sure we can read at least one minibatch.
    mbsource = MinibatchSource([fd,ld])
    mbsource.next_minibatch(1)

    os.chdir(abs_path)
Example #28
def test_crop_dimensionality(tmpdir):
    import io; from PIL import Image
    np.random.seed(1)

    file_mapping_path = str(tmpdir / 'file_mapping.txt')
    with open(file_mapping_path, 'w') as file_mapping:
        for i in range(5):
            data = np.random.randint(0, 2**8, (20, 40, 3))
            image = Image.fromarray(data.astype('uint8'), "RGB")
            buf = io.BytesIO()
            image.save(buf, format='PNG')
            assert image.width == 40 and image.height == 20
            
            label = str(i) 
            # save to mapping + png file
            file_name = label + '.png'
            with open(str(tmpdir/file_name), 'wb') as f:
                f.write(buf.getvalue())
            file_mapping.write('.../%s\t%s\n' % (file_name, label))

    transforms1 = [
        xforms.scale(width=40, height=20, channels=3),
        xforms.crop(crop_type='randomside', 
                    crop_size=(20, 10), side_ratio=(0.2, 0.5),
                    jitter_type='uniratio')]

    transforms2 = [
        xforms.crop(crop_type='randomside', 
                    crop_size=(20, 10), side_ratio=(0.2, 0.5),
                    jitter_type='uniratio')]

    d1 = ImageDeserializer(file_mapping_path,
        StreamDefs(
            images1=StreamDef(field='image', transforms=transforms1),
            labels1=StreamDef(field='label', shape=10)))

    d2 = ImageDeserializer(file_mapping_path,
        StreamDefs(
            images2=StreamDef(field='image', transforms=transforms2),
            labels2=StreamDef(field='label', shape=10)))

    mbs = MinibatchSource([d1, d2])
    for j in range(5):
        mb = mbs.next_minibatch(1)
        images1 = mb[mbs.streams.images1].asarray()
        images2 = mb[mbs.streams.images2].asarray()
        assert images1.shape == (1, 1, 3, 10, 20)
        assert (images1 == images2).all()
Example #29
def test_base64_is_equal_image(tmpdir):
    import io, base64
    from PIL import Image
    np.random.seed(1)

    file_mapping_path = str(tmpdir / 'file_mapping.txt')
    base64_mapping_path = str(tmpdir / 'base64_mapping.txt')

    with open(file_mapping_path, 'w') as file_mapping:
        with open(base64_mapping_path, 'w') as base64_mapping:
            for i in range(10):
                data = np.random.randint(0, 2**8, (5, 7, 3))
                image = Image.fromarray(data.astype('uint8'), "RGB")
                buf = io.BytesIO()
                image.save(buf, format='PNG')
                assert image.width == 7 and image.height == 5

                label = str(i)
                # save to base 64 mapping file
                encoded = base64.b64encode(buf.getvalue()).decode('ascii')
                base64_mapping.write('%s\t%s\n' % (label, encoded))

                # save to mapping + png file
                file_name = label + '.png'
                with open(str(tmpdir / file_name), 'wb') as f:
                    f.write(buf.getvalue())
                file_mapping.write('.../%s\t%s\n' % (file_name, label))

    transforms = [xforms.scale(width=7, height=5, channels=3)]
    b64_deserializer = Base64ImageDeserializer(
        base64_mapping_path,
        StreamDefs(images1=StreamDef(field='image', transforms=transforms),
                   labels1=StreamDef(field='label', shape=10)))

    file_image_deserializer = ImageDeserializer(
        file_mapping_path,
        StreamDefs(images2=StreamDef(field='image', transforms=transforms),
                   labels2=StreamDef(field='label', shape=10)))

    mb_source = MinibatchSource([b64_deserializer, file_image_deserializer])
    for j in range(20):
        mb = mb_source.next_minibatch(1)

        images1_stream = mb_source.streams['images1']
        images1 = mb[images1_stream].asarray()
        images2_stream = mb_source.streams['images2']
        images2 = mb[images2_stream].asarray()
        assert (images1 == images2).all()
Example #30
def test_crop_dimensionality(tmpdir):
    import io; from PIL import Image
    np.random.seed(1)

    file_mapping_path = str(tmpdir / 'file_mapping.txt')
    with open(file_mapping_path, 'w') as file_mapping:
        for i in range(5):
            data = np.random.randint(0, 2**8, (20, 40, 3))
            image = Image.fromarray(data.astype('uint8'), "RGB")
            buf = io.BytesIO()
            image.save(buf, format='PNG')
            assert image.width == 40 and image.height == 20
            
            label = str(i) 
            # save to mapping + png file
            file_name = label + '.png'
            with open(str(tmpdir/file_name), 'wb') as f:
                f.write(buf.getvalue())
            file_mapping.write('.../%s\t%s\n' % (file_name, label))

    transforms1 = [
        xforms.scale(width=40, height=20, channels=3),
        xforms.crop(crop_type='randomside', 
                    crop_size=(20, 10), side_ratio=(0.2, 0.5),
                    jitter_type='uniratio')]

    transforms2 = [
        xforms.crop(crop_type='randomside', 
                    crop_size=(20, 10), side_ratio=(0.2, 0.5),
                    jitter_type='uniratio')]

    d1 = ImageDeserializer(file_mapping_path,
        StreamDefs(
            images1=StreamDef(field='image', transforms=transforms1),
            labels1=StreamDef(field='label', shape=10)))

    d2 = ImageDeserializer(file_mapping_path,
        StreamDefs(
            images2=StreamDef(field='image', transforms=transforms2),
            labels2=StreamDef(field='label', shape=10)))

    mbs = MinibatchSource([d1, d2])
    for j in range(5):
        mb = mbs.next_minibatch(1)
        images1 = mb[mbs.streams.images1].asarray()
        images2 = mb[mbs.streams.images2].asarray()
        assert images1.shape == (1, 1, 3, 10, 20)
        assert (images1 == images2).all()
Example #31
def test_multiple_streams_in_htk():
    feature_dim = 33
    context = 2

    os.chdir(data_path)

    features_file = "glob_0000.scp"

    fd = HTKFeatureDeserializer(StreamDefs(
        amazing_features = StreamDef(shape=feature_dim, context=(context,context), scp=features_file),
        amazing_features2 = StreamDef(shape=feature_dim, context=(context,context), scp=features_file)))

    mbs = MinibatchSource([fd])
    mb = mbs.next_minibatch(1)
    assert (mb[mbs.streams.amazing_features].asarray() == mb[mbs.streams.amazing_features2].asarray()).all()
    os.chdir(abs_path)
Example #32
def test_base64_is_equal_image(tmpdir):
    import io, base64; from PIL import Image
    np.random.seed(1)

    file_mapping_path = str(tmpdir / 'file_mapping.txt')
    base64_mapping_path = str(tmpdir / 'base64_mapping.txt')

    with open(file_mapping_path, 'w') as file_mapping:
        with open(base64_mapping_path, 'w') as base64_mapping:
            for i in range(10):
                data = np.random.randint(0, 2**8, (5,7,3))
                image = Image.fromarray(data.astype('uint8'), "RGB")
                buf = io.BytesIO()
                image.save(buf, format='PNG')
                assert image.width == 7 and image.height == 5
                
                label = str(i) 
                # save to base 64 mapping file
                encoded = base64.b64encode(buf.getvalue()).decode('ascii')
                base64_mapping.write('%s\t%s\n' % (label, encoded))
         
                # save to mapping + png file
                file_name = label + '.png'
                with open(str(tmpdir/file_name), 'wb') as f:
                    f.write(buf.getvalue())
                file_mapping.write('.../%s\t%s\n' % (file_name, label))

    transforms = [xforms.scale(width=7, height=5, channels=3)]
    b64_deserializer = Base64ImageDeserializer(base64_mapping_path,
        StreamDefs(
            images1=StreamDef(field='image', transforms=transforms),
            labels1=StreamDef(field='label', shape=10)))

    file_image_deserializer = ImageDeserializer(file_mapping_path,
        StreamDefs(
            images2=StreamDef(field='image', transforms=transforms),
            labels2=StreamDef(field='label', shape=10)))

    mb_source = MinibatchSource([b64_deserializer, file_image_deserializer])
    for j in range(20):
        mb = mb_source.next_minibatch(1)

        images1_stream = mb_source.streams['images1']
        images1 = mb[images1_stream].asarray()
        images2_stream = mb_source.streams['images2']
        images2 = mb[images2_stream].asarray()
        assert (images1 == images2).all()
Example #33
def test_full_sweep_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_1)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features  = StreamDef(field='S0', shape=1),
        labels    = StreamDef(field='S1', shape=1))),
        randomization_window_in_chunks=0, max_sweeps=1)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)

    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert features.end_of_sweep
    assert len(features.as_sequences()) == 2
    expected_features = \
        [
            [[0], [1], [2], [3]],
            [[4], [5], [6]]
        ]

    for res, exp in zip(features.as_sequences(), expected_features):
        assert np.allclose(res, exp)

    assert np.allclose(features.data.mask,
            [[2, 1, 1, 1],
             [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert labels.end_of_sweep
    assert len(labels.as_sequences()) == 2
    expected_labels = \
            [
                [[0],[1],[3]],
                [[1],[2]]
            ]
    for res, exp in zip(labels.as_sequences(), expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.data.mask,
            [[2, 1, 1],
             [2, 1, 0]])
Example #34
def test_full_sweep_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_1)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features  = StreamDef(field='S0', shape=1),
        labels    = StreamDef(field='S1', shape=1))),
        randomization_window_in_chunks=0, max_sweeps=1)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)

    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert features.end_of_sweep
    assert len(features.as_sequences()) == 2
    expected_features = \
        [
            [[0], [1], [2], [3]],
            [[4], [5], [6]]
        ]

    for res, exp in zip(features.as_sequences(), expected_features):
        assert np.allclose(res, exp)

    assert np.allclose(features.data.mask,
            [[2, 1, 1, 1],
             [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert labels.end_of_sweep
    assert len(labels.as_sequences()) == 2
    expected_labels = \
            [
                [[0],[1],[3]],
                [[1],[2]]
            ]
    for res, exp in zip(labels.as_sequences(), expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.data.mask,
            [[2, 1, 1],
             [2, 1, 0]])
Example #35
def decode_model(use_gpu=True, gpu_id=0):
    # use GPU or CPU according to parameters
    try_set_default_device(gpu(gpu_id) if use_gpu else cpu())

    model_dnn = load_model("./model/speech_enhancement.model")
    features_file = "./test_normed.scp"
    feature_dim = 257
    test_reader = MinibatchSource(HTKFeatureDeserializer(StreamDefs(
            amazing_features=StreamDef(
                    shape=feature_dim, context=(3, 3),
                    scp=features_file))),
                                  randomize=False, frame_mode=False)
    eval_input_map = {input: test_reader.streams.amazing_features}

    f = open(features_file)
    line = f.readline()
    while line:
        temp_input_path = line.split(']')[0]
        mb_size = temp_input_path.split(',')[-1]
        mb_size = int(mb_size) + 1
        noisy_fea = test_reader.next_minibatch(
                mb_size, input_map=eval_input_map)
        real_noisy_fea = noisy_fea[input].data

        node_in_graph = model_dnn.find_by_name('irm')
        output_nodes = combine([node_in_graph.owner])
        out_noisy_fea = output_nodes.eval(real_noisy_fea)
        # out_noisy_fea = as_composite(model_dnn.output1[0].owner).eval(
        #         real_noisy_fea)

        out_SE_noisy_fea = np.concatenate((out_noisy_fea), axis=0)

        out_file_path = line.split('=')[0]
        out_file_name = os.path.join('./enhanced_norm_fea_mat', out_file_path)
        out_file_fullpath = os.path.split(out_file_name)[0]
        # print (out_file_fullpath)
        if not os.path.exists(out_file_fullpath):
            os.makedirs(out_file_fullpath)
        sio.savemat(out_file_name, {'SE': out_SE_noisy_fea})
        line = f.readline()

    f.close()
Example #36
def test_multiple_streams_in_htk():
    feature_dim = 33
    context = 2

    os.chdir(data_path)

    features_file = "glob_0000.scp"

    fd = HTKFeatureDeserializer(
        StreamDefs(amazing_features=StreamDef(shape=feature_dim,
                                              context=(context, context),
                                              scp=features_file),
                   amazing_features2=StreamDef(shape=feature_dim,
                                               context=(context, context),
                                               scp=features_file)))

    mbs = MinibatchSource([fd])
    mb = mbs.next_minibatch(1)
    assert (mb[mbs.streams.amazing_features].asarray() == mb[
        mbs.streams.amazing_features2].asarray()).all()
    os.chdir(abs_path)
Example #37
def test_prefetch_with_unpacking(tmpdir):
    data = r'''0  |S0 1 1 1 1   |S1 1000
1   |S0 2 2 2 2  |S1 100
2   |S0 3 3 3 3  |S1 100
3   |S0 1 1 1 1  |S1 10
4   |S0 2 2 2 2  |S1 1
5   |S0 3 3 3 3  |S1 2000
6   |S0 1 1 1 1  |S1 200
7   |S0 2 2 2 2  |S1 200
8   |S0 3 3 3 3  |S1 20
9   |S0 1 1 1 1  |S1 2
'''
    import time
    tmpfile = _write_data(tmpdir, data)

    input_dim = 4
    num_output_classes = 1

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=input_dim, is_sparse=False),
        labels=StreamDef(field='S1', shape=num_output_classes, is_sparse=False)
    )), randomize=False, max_samples=FULL_DATA_SWEEP)

    input_map = { 'S0' : mb_source.streams.features, 'S1' : mb_source.streams.labels }
    empty = False
    mb_size = 3
    # On the last minibatch a resize will be called,
    # since 10 % 3 leaves 1 sample in the minibatch
    while not empty:
        mb = mb_source.next_minibatch(mb_size, input_map=input_map)
        time.sleep(1) # make sure the prefetch kicks in
        if mb:
            # Force unpacking to check that we do 
            # not break prefetch 
            actual_size = mb['S0'].shape[0]
            assert (mb['S0'].asarray() == np.array([[[1, 1, 1, 1]],
                                                    [[2, 2, 2, 2]],
                                                    [[3, 3, 3, 3]]], dtype=np.float32)[0:actual_size]).all()
        else:
            empty = True
Example #38
def test_distributed_mb_source(tmpdir):
    input_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
3	|S0 60:1 |# <s>	|S1 3:1 |# <s>
3	|S0 61:1 |# A	|S1 32:1 |# ~AH
3	|S0 61:1 |# A	|S1 32:1 |# ~AH
3	|S0 61:1 |# A	|S1 32:1 |# ~AH
4	|S0 60:1 |# <s>	|S1 3:1 |# <s>
5	|S0 60:1 |# <s>	|S1 3:1 |# <s>
5	|S0 61:1 |# A	|S1 32:1 |# ~AH
6	|S0 60:1 |# <s>	|S1 3:1 |# <s>
6	|S0 61:1 |# A	|S1 32:1 |# ~AH
7	|S0 60:1 |# <s>	|S1 3:1 |# <s>
8	|S0 60:1 |# <s>	|S1 3:1 |# <s>
8	|S0 61:1 |# A	|S1 32:1 |# ~AH
9	|S0 60:1 |# <s>	|S1 3:1 |# <s>
9	|S0 61:1 |# A	|S1 32:1 |# ~AH
10	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, FULL_DATA_SWEEP

    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    # No randomization

    mb0 = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=input_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=input_dim,  is_sparse=True)
        )), 
        randomize=False, epoch_size=36) # A bit more than a sweep
    mb1 = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=input_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=input_dim,  is_sparse=True)
        )), 
        randomize=False, epoch_size=36) # A bit more than a sweep
    input = input_variable(shape=(input_dim,))
    label = input_variable(shape=(input_dim,))
    input_map = {
        input : mb0.streams.features,
        label : mb0.streams.labels
    }

    # Because we are emulating two workers here, the minibatch_size_in_samples will be split in 2,
    # so below we expect 5 samples per worker.
    data = mb0.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=0)
    assert(data[input].num_samples == 7) # Sequence 0

    data = mb0.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=0)
    assert(data[input].num_samples == 4) # Sequence 3

    data = mb0.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=0)
    assert(data[input].num_samples == 5) # Sequences 5, 7, 9

    data = mb0.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=0)
    assert(data[input].num_samples == 7) # Sequence 0

    data = mb0.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=0)
    assert(data[input].num_samples == 4) # Sequence 3

    data = mb0.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=0)
    assert(len(data) == 0) # No data

    data = mb1.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=1)
    assert(data[input].num_samples == 4) # Sequences 2, 4

    data = mb1.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=1)
    assert(data[input].num_samples == 5) # Sequences 6, 8, 10

    data = mb1.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=1)
    assert(data[input].num_samples == 3) # Sequences 2

    data = mb1.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=1)
    assert(len(data) == 0) # No data

    # Randomization

    mb3 = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=input_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=input_dim,  is_sparse=True)
        )), 
        randomize=True, epoch_size=FULL_DATA_SWEEP)

    mb4 = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=input_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=input_dim,  is_sparse=True)
        )), 
        randomize=True, epoch_size=FULL_DATA_SWEEP)

    data = mb3.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=0)
    assert(data[input].num_samples == 5)

    data = mb3.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=0)
    assert(data[input].num_samples == 4)

    data = mb3.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=0)
    assert(data[input].num_samples == 4)

    data = mb3.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=0)
    assert(data[input].num_samples == 5)

    data = mb3.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=0)
    assert(data[input].num_samples == 7)

    data = mb4.next_minibatch(minibatch_size_in_samples=10, input_map=input_map, num_data_partitions=2, partition_index=1)
    assert(len(data) == 0) # Due to chunking we do not expect any data for rank 1
Example #39
def test_base64_image_deserializer(tmpdir):
    import io, base64, uuid; from PIL import Image
    images, b64_images = [], []

    np.random.seed(1)
    for i in range(10):
        data = np.random.randint(0, 2**8, (5,7,3))
        image = Image.fromarray(data.astype('uint8'), "RGB")
        buf = io.BytesIO()
        image.save(buf, format='PNG')
        assert image.width == 7 and image.height == 5
        b64_images.append(base64.b64encode(buf.getvalue()))
        images.append(np.array(image))

    image_data = str(tmpdir / 'mbdata1.txt')
    seq_ids = []
    uid = uuid.uuid1().int >> 64
    with open(image_data, 'wb') as f:
        for i,data in enumerate(b64_images):
            seq_id = uid ^ i
            seq_id = str(seq_id).encode('ascii')
            seq_ids.append(seq_id)
            line = seq_id + b'\t'
            label = str(i).encode('ascii')
            line += label + b'\t' + data + b'\n'
            f.write(line)

    ctf_data = str(tmpdir / 'mbdata2.txt')
    with open(ctf_data, 'wb') as f:
        for i, sid in enumerate(seq_ids):
            line = sid + b'\t' + b'|index '+str(i).encode('ascii') + b'\n'
            f.write(line)

    transforms = [xforms.scale(width=7, height=5, channels=3)]
    b64_deserializer = Base64ImageDeserializer(image_data,
        StreamDefs(
            images=StreamDef(field='image', transforms=transforms),
            labels=StreamDef(field='label', shape=10)))

    ctf_deserializer = CTFDeserializer(ctf_data,
        StreamDefs(index=StreamDef(field='index', shape=1)))

    mb_source = MinibatchSource([ctf_deserializer, b64_deserializer])
    assert isinstance(mb_source, MinibatchSource)

    for j in range(100):
        mb = mb_source.next_minibatch(10)

        index_stream = mb_source.streams['index']
        index = mb[index_stream].asarray().flatten()
        image_stream = mb_source.streams['images']

        results = mb[image_stream].asarray()

        for i in range(10):
            # original images are RGB, OpenCV produces BGR images,
            # so reverse the last dimension of the original images
            bgrImage = images[int(index[i])][:,:,::-1]
            # transposing to get CHW representation
            bgrImage = np.transpose(bgrImage, (2, 0, 1))
            assert (bgrImage == results[i][0]).all()
Пример #40
0
def test_distributed_mb_source(tmpdir):
    input_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
3	|S0 60:1 |# <s>	|S1 3:1 |# <s>
3	|S0 61:1 |# A	|S1 32:1 |# ~AH
3	|S0 61:1 |# A	|S1 32:1 |# ~AH
3	|S0 61:1 |# A	|S1 32:1 |# ~AH
4	|S0 60:1 |# <s>	|S1 3:1 |# <s>
5	|S0 60:1 |# <s>	|S1 3:1 |# <s>
5	|S0 61:1 |# A	|S1 32:1 |# ~AH
6	|S0 60:1 |# <s>	|S1 3:1 |# <s>
6	|S0 61:1 |# A	|S1 32:1 |# ~AH
7	|S0 60:1 |# <s>	|S1 3:1 |# <s>
8	|S0 60:1 |# <s>	|S1 3:1 |# <s>
8	|S0 61:1 |# A	|S1 32:1 |# ~AH
9	|S0 60:1 |# <s>	|S1 3:1 |# <s>
9	|S0 61:1 |# A	|S1 32:1 |# ~AH
10	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, FULL_DATA_SWEEP

    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    # No randomization

    mb0 = MinibatchSource(CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=input_dim,
                                    is_sparse=True))),
                          randomize=False,
                          epoch_size=FULL_DATA_SWEEP)
    mb1 = MinibatchSource(CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=input_dim,
                                    is_sparse=True))),
                          randomize=False,
                          epoch_size=FULL_DATA_SWEEP)
    input = input_variable(shape=(input_dim, ))
    label = input_variable(shape=(input_dim, ))
    input_map = {input: mb0.streams.features, label: mb0.streams.labels}

    data = mb0.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 7)

    data = mb0.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 4)

    data = mb0.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 5)

    data = mb1.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=1)
    assert (data[input].num_samples == 3)

    data = mb1.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=1)
    assert (data[input].num_samples == 5)

    # Randomization

    mb3 = MinibatchSource(CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=input_dim,
                                    is_sparse=True))),
                          randomize=True,
                          epoch_size=FULL_DATA_SWEEP)

    mb4 = MinibatchSource(CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=input_dim,
                                    is_sparse=True))),
                          randomize=True,
                          epoch_size=FULL_DATA_SWEEP)

    data = mb3.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 5)

    data = mb3.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 4)

    data = mb4.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=1)
    assert (len(data) == 0)
Пример #41
0
def simple_mnist():
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 2
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim)
    label = C.input_variable(num_output_classes)

    # Instantiate the feedforward classification model
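    # scale raw pixel values (0-255) into [0, 1): 0.00390625 == 1/256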
    scaled_input = element_times(constant(0.00390625), feature)

    # z = Sequential([
    #     Dense(hidden_layers_dim, activation=relu),
    #     Dense(hidden_layers_dim, activation=relu),
    #     Dense(num_output_classes)])(scaled_input)

    with default_options(activation=relu, init=C.glorot_uniform()):
        z = Sequential([For(range(num_hidden_layers),
            lambda i: Dense(hidden_layers_dim)),
            Dense(num_output_classes, activation=None)])(scaled_input)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    # setup the data
    path = os.path.join(abs_path, "Train-28x28_cntk_text.txt")

    reader_train = MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='features', shape=input_dim),
        labels=StreamDef(field='labels', shape=num_output_classes))))

    input_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    progress_writers = [ProgressPrinter(
        tag='Training',
        num_epochs=num_sweeps_to_train_with)]

    # Instantiate the trainer object to drive the model training
    lr = learning_rate_schedule(1, UnitType.sample)
    trainer = Trainer(z, (ce, pe), [adadelta(z.parameters, lr)], progress_writers)

    training_session(
        trainer=trainer,
        mb_source=reader_train,
        mb_size=minibatch_size,
        model_inputs_to_streams=input_map,
        max_samples=num_samples_per_sweep * num_sweeps_to_train_with,
        progress_frequency=num_samples_per_sweep
    ).train()

    # Load test data
    path = os.path.join(abs_path, "Test-28x28_cntk_text.txt")

    reader_test = MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='features', shape=input_dim),
        labels=StreamDef(field='labels', shape=num_output_classes))))

    input_map = {
        feature: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size
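    # 10000 / 1024 ≈ 9.77; int() below truncates to 9 minibatches, so 9216 of the 10000 test samples are evaluated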
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test
Пример #42
0
def generate_visualization(use_brain_script_model, testing=False):
    num_objects_to_eval = 5

    if (use_brain_script_model):
        model_file_name = "07_Deconvolution_BS.model"
        encoder_output_file_name = "encoder_output_BS.txt"
        decoder_output_file_name = "decoder_output_BS.txt"
        enc_node_name = "z.pool1"
        input_node_name = "f2"
        output_node_name = "z"
    else:
        model_file_name = "07_Deconvolution_PY.model"
        encoder_output_file_name = "encoder_output_PY.txt"
        decoder_output_file_name = "decoder_output_PY.txt"
        enc_node_name = "pooling_node"
        input_node_name = "input_node"
        output_node_name = "output_node"

    # define location of output, model and data and check existence
    output_path = os.path.join(abs_path, "Output")
    model_file = os.path.join(model_path, model_file_name)
    data_file = os.path.join(data_path, "Test-28x28_cntk_text.txt")
    if not (os.path.exists(model_file) and os.path.exists(data_file)):
        print(
            "Cannot find required data or model. "
            "Please get the MNIST data set and run 'cntk configFile=07_Deconvolution_BS.cntk' or 'python 07_Deconvolution_PY.py' to create the model."
        )
        exit(0)

    # create minibatch source
    minibatch_source = MinibatchSource(CTFDeserializer(
        data_file,
        StreamDefs(features=StreamDef(field='features', shape=(28 * 28)),
                   labels=StreamDef(field='labels', shape=10))),
                                       randomize=False,
                                       max_sweeps=1)

    # use this to print all node names in the model
    # print_all_node_names(model_file, use_brain_script_model)

    # load model and pick desired nodes as output
    loaded_model = load_model(model_file)
    output_nodes = combine([
        loaded_model.find_by_name(input_node_name).owner,
        loaded_model.find_by_name(enc_node_name).owner,
        loaded_model.find_by_name(output_node_name).owner
    ])

    # evaluate model save output
    features_si = minibatch_source['features']
    with open(os.path.join(output_path, decoder_output_file_name),
              'wb') as decoder_text_file:
        with open(os.path.join(output_path, encoder_output_file_name),
                  'wb') as encoder_text_file:
            for i in range(0, num_objects_to_eval):
                mb = minibatch_source.next_minibatch(1)
                raw_dict = output_nodes.eval(mb[features_si])
                output_dict = {}
                for key in raw_dict.keys():
                    output_dict[key.name] = raw_dict[key]

                encoder_input = output_dict[input_node_name]
                encoder_output = output_dict[enc_node_name]
                decoder_output = output_dict[output_node_name]
                in_values = (encoder_input[0, 0].flatten())[np.newaxis]
                enc_values = (encoder_output[0, 0].flatten())[np.newaxis]
                out_values = (decoder_output[0, 0].flatten())[np.newaxis]
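                # [0, 0] selects the single sample in the minibatch; flatten + np.newaxis produce a
                # 1 x N row so that np.savetxt writes each image on a single line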

                if not testing:
                    # write results as text and png
                    np.savetxt(decoder_text_file, out_values, fmt="%.6f")
                    np.savetxt(encoder_text_file, enc_values, fmt="%.6f")
                    save_as_png(
                        in_values,
                        os.path.join(output_path,
                                     "imageAutoEncoder_%s__input.png" % i))
                    save_as_png(
                        out_values,
                        os.path.join(output_path,
                                     "imageAutoEncoder_%s_output.png" % i))

                    # visualizing the encoding is only possible and meaningful with a single conv filter
                    enc_dim = 7
                    if (enc_values.size == enc_dim * enc_dim):
                        save_as_png(
                            enc_values,
                            os.path.join(
                                output_path,
                                "imageAutoEncoder_%s_encoding.png" % i),
                            dim=enc_dim)

    print("Done. Wrote output to %s" % output_path)
Пример #43
0
def test_distributed_mb_source(tmpdir):
    input_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
3	|S0 60:1 |# <s>	|S1 3:1 |# <s>
3	|S0 61:1 |# A	|S1 32:1 |# ~AH
3	|S0 61:1 |# A	|S1 32:1 |# ~AH
3	|S0 61:1 |# A	|S1 32:1 |# ~AH
4	|S0 60:1 |# <s>	|S1 3:1 |# <s>
5	|S0 60:1 |# <s>	|S1 3:1 |# <s>
5	|S0 61:1 |# A	|S1 32:1 |# ~AH
6	|S0 60:1 |# <s>	|S1 3:1 |# <s>
6	|S0 61:1 |# A	|S1 32:1 |# ~AH
7	|S0 60:1 |# <s>	|S1 3:1 |# <s>
8	|S0 60:1 |# <s>	|S1 3:1 |# <s>
8	|S0 61:1 |# A	|S1 32:1 |# ~AH
9	|S0 60:1 |# <s>	|S1 3:1 |# <s>
9	|S0 61:1 |# A	|S1 32:1 |# ~AH
10	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs

    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    # No randomization

    mb0 = MinibatchSource(CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=input_dim,
                                    is_sparse=True))),
                          randomize=False,
                          max_samples=36)  # A bit more than a sweep
    mb1 = MinibatchSource(CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=input_dim,
                                    is_sparse=True))),
                          randomize=False,
                          max_samples=36)  # A bit more than a sweep
    input = sequence.input_variable(shape=(input_dim, ))
    label = sequence.input_variable(shape=(input_dim, ))
    input_map = {input: mb0.streams.features, label: mb0.streams.labels}

    # Because we are emulating two workers here, minibatch_size_in_samples is split in two,
    # so below we expect about 5 samples per worker per call (whole sequences are never
    # split across minibatches, so some calls return more).
    data = mb0.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 7)  # Sequence 0

    data = mb0.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 4)  # Sequence 3

    data = mb0.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 5)  # Sequences 5, 7, 9

    data = mb0.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 7)  # Sequence 0

    data = mb0.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 4)  # Sequence 3

    data = mb0.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (len(data) == 0)  # No data

    data = mb1.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=1)
    assert (data[input].num_samples == 4)  # Sequences 2, 4

    data = mb1.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=1)
    assert (data[input].num_samples == 5)  # Sequences 6, 8, 10

    data = mb1.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=1)
    assert (data[input].num_samples == 3)  # Sequence 2

    data = mb1.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=1)
    assert (len(data) == 0)  # No data

    # Randomization

    mb3 = MinibatchSource(CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=input_dim,
                                    is_sparse=True))),
                          max_sweeps=1)

    mb4 = MinibatchSource(CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=input_dim,
                                    is_sparse=True))),
                          max_sweeps=1)

    data = mb3.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 5)

    data = mb3.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 4)

    data = mb3.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 4)

    data = mb3.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 5)

    data = mb3.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=0)
    assert (data[input].num_samples == 7)

    data = mb4.next_minibatch(minibatch_size_in_samples=10,
                              input_map=input_map,
                              num_data_partitions=2,
                              partition_index=1)
    assert (len(data) == 0)  # Due to chunking we do not expect any data for rank 1
Пример #44
0
def test_sweep_based_schedule(tmpdir, device_id):
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk import cross_entropy_with_softmax, classification_error, plus, reduce_sum, sequence
    from cntk import Trainer

    input_dim = 69

    ctf_data = '''\
0   |S0 3:1   |S1 3:1 |# <s>
0   |S0 4:1 |# A    |S1 32:1 |# ~AH
0   |S0 5:1 |# B    |S1 36:1 |# ~B
0   |S0 4:1 |# A    |S1 31:1 |# ~AE
0   |S0 7:1 |# D    |S1 38:1 |# ~D
0   |S0 12:1 |# I   |S1 47:1 |# ~IY
0   |S0 1:1 |# </s> |S1 1:1 |# </s>
2   |S0 60:1 |# <s> |S1 3:1 |# <s>
2   |S0 61:1 |# A   |S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=input_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=input_dim,  is_sparse=True)
    )), randomize=False)

    in1 = sequence.input_variable(shape=(input_dim,))
    labels = sequence.input_variable(shape=(input_dim,))
    p = parameter(shape=(input_dim,), init=10)
    z = plus(in1, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    lr_per_sample = learning_rate_schedule([0.3, 0.2, 0.1, 0.0], UnitType.sample)
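    # no epoch_size is given, so the schedule advances one entry per sweep over the data (hence "sweep based")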
    learner = sgd(z.parameters, lr_per_sample)
    trainer = Trainer(z, (ce, errs), [learner])

    input_map = {
        in1       : mbs.streams.features,
        labels : mbs.streams.labels
    }

    # fetch minibatch (first sequence)
    data = mbs.next_minibatch(1, input_map=input_map) 
    trainer.train_minibatch(data)
    assert learner.learning_rate() == 0.3

    # fetch minibatch (second sequence, sweep ends at this point)
    data = mbs.next_minibatch(1, input_map=input_map)
    trainer.train_minibatch(data)
    assert learner.learning_rate() == 0.2

    # fetch minibatch (both sequences -- entire sweep in one go)
    data = mbs.next_minibatch(9, input_map=input_map)
    trainer.train_minibatch(data)
    assert learner.learning_rate() == 0.1

    # fetch minibatch (multiple sweeps)
    data = mbs.next_minibatch(30, input_map=input_map)
    trainer.train_minibatch(data, outputs=[z.output])
    assert learner.learning_rate() == 0.0
Пример #45
0
    # load model and pick desired nodes as output
    loaded_model = load_model(model_file)
    output_nodes = combine([
        loaded_model.find_by_name('f1').owner,
        loaded_model.find_by_name('z.p1').owner,
        loaded_model.find_by_name('z').owner
    ])

    # evaluate model save output
    features_si = minibatch_source['features']
    with open(os.path.join(output_path, "decoder_output_py.txt"),
              'wb') as decoder_text_file:
        with open(os.path.join(output_path, "encoder_output_py.txt"),
                  'wb') as encoder_text_file:
            for i in range(0, num_objects_to_eval):
                mb = minibatch_source.next_minibatch(1)
                raw_dict = output_nodes.eval(mb[features_si])
                output_dict = {}
                for key in raw_dict.keys():
                    output_dict[key.name] = raw_dict[key]

                encoder_input = output_dict['f1']
                encoder_output = output_dict['z.p1']
                decoder_output = output_dict['z']
                in_values = (encoder_input[0, 0].flatten())[np.newaxis]
                enc_values = (encoder_output[0, 0].flatten())[np.newaxis]
                out_values = (decoder_output[0, 0].flatten())[np.newaxis]

                # write results as text and png
                np.savetxt(decoder_text_file, out_values, fmt="%.6f")
                np.savetxt(encoder_text_file, enc_values, fmt="%.6f")
Пример #46
0
        max_samples=numberOfSamples * numberOfSweepsForTraining,
        progress_frequency=numberOfSamples
    )

trainingSession.train()

# Testing time #
testPath = "test.txt"

ctfdResultTest = CTFDeserializer(testPath, StreamDefs(
        features=StreamDef(field='features', shape=featuresShapeValue),
        labels=StreamDef(field='labels', shape=labelsShapeValue)))

readerTest = MinibatchSource(ctfdResultTest)

inputMapTest = {
    featuresShape: readerTest.streams.features,
    labelsShape: readerTest.streams.labels
}

minibatchSizeTest = 25
numberOfSamplesTest = 312
minibatchesToTest = numberOfSamplesTest / minibatchSizeTest
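# 312 / 25 = 12.48; int() below truncates to 12 minibatches, so the last 12 samples are never tested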
testResult = 0.0
for i in range(0, int(minibatchesToTest)):
    mb = readerTest.next_minibatch(minibatch_size_in_samples = minibatchSizeTest, input_map=inputMapTest)
    evalError = trainer.test_minibatch(mb)
    testResult = testResult + evalError

averageClassificationError = testResult / minibatchesToTest
print(averageClassificationError)
Пример #47
0
    def __train_cntk(self, path_to_folder: str, model_definition, epochs: int,
                     output_model_path: str, classes, minibatch_size: int):
        import cntk
        from cntk.learners import learning_parameter_schedule
        from cntk.ops import input_variable
        from cntk.io import MinibatchSource, ImageDeserializer, StreamDefs, StreamDef, MinibatchData, UserDeserializer
        import cntk.io.transforms as xforms
        from cntk.layers import default_options, Dense, Sequential, Activation, Embedding, Convolution2D, MaxPooling, Stabilizer, Convolution, Dropout, BatchNormalization
        from cntk.ops.functions import CloneMethod
        from cntk.logging import ProgressPrinter
        from cntk.losses import cross_entropy_with_softmax
        from cntk import classification_error, softmax, relu, ModelFormat, element_times, momentum_schedule, momentum_sgd
        import pandas as pd

        path_to_folder = path_to_folder.rstrip('/')

        map_file_train = path_to_folder + "/train_map.txt"
        map_file_test = path_to_folder + "/test_map.txt"
        classes_set = set()
        num_train = 0
        num_test = 0
        num_channels = 3

        class TrackDataset(UserDeserializer):
            def __init__(self, map_file, streams, chunksize=100):
                super(TrackDataset, self).__init__()
                self._batch_size = chunksize
                self.dataframes = pd.read_csv(map_file,
                                              sep='\t',
                                              dtype=str,
                                              header=None,
                                              names=["features", "labels"])
                self._streams = [
                    cntk.io.StreamInformation(s['name'], i, 'dense',
                                              np.float32, s['shape'])
                    for i, s in enumerate(streams)
                ]

                self._num_chunks = int(
                    math.ceil(len(self.dataframes) / chunksize))

            def _scale_image(self, image, width=224, height=168):
                try:
                    return image.resize((width, height), Image.BILINEAR)
                except Exception:
                    raise Exception('scale_image error')

            def stream_infos(self):
                return self._streams

            def num_chunks(self):
                return self._num_chunks

            def get_chunk(self, chunk_id):
                images = []
                labels = []
                maximum = (chunk_id + 1) * self._batch_size
                if (maximum > len(self.dataframes)):
                    maximum = len(self.dataframes)
                for i in range(chunk_id * self._batch_size, maximum):
                    img_name = self.dataframes.iloc[i, 0]
                    image = Image.open(img_name)
                    cl = self.dataframes.iloc[i, 1:].values[0]
                    image = self._scale_image(image)
                    image = np.moveaxis((np.array(image).astype('float32')),
                                        -1, 0)
                    image -= np.mean(image, keepdims=True)
                    image /= (np.std(image, keepdims=True) + 1e-6)
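                    # standardize each image to zero mean and roughly unit variance; the 1e-6 guards against division by zero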
                    images.append(image)
                    yv = np.zeros(num_classes)
                    yv[classes.index(cl)] = 1
                    labels.append(yv)

                result = {}
                features = np.array(images)
                lab = np.array(labels).astype('float32')
                result[self._streams[0].m_name] = features
                result[self._streams[1].m_name] = lab
                return result

        try:
            with open(map_file_train) as f:
                csv_reader = csv.reader(f, delimiter='\t')
                for row in csv_reader:
                    cmd = row[1]
                    classes_set.add(cmd)
                    num_train = num_train + 1
        except Exception as e:
            raise Exception(
                "No train_map.txt file found in path " + path_to_folder +
                ". Did you create a dataset using create_balanced_dataset()?")

        num_classes = len(classes)

        with open(map_file_test) as f:
            num_test = sum(1 for _ in f)

        # transforms = [
        #     xforms.scale(width=self.__image_width, height=self.__image_height, channels=num_channels, interpolations='linear'),
        #     xforms.mean(mean_file)
        # ]

        dataset_train = TrackDataset(map_file=map_file_train,
                                     streams=[
                                         dict(name='features',
                                              shape=(num_channels,
                                                     self.__image_height,
                                                     self.__image_width)),
                                         dict(name='labels',
                                              shape=(num_classes, ))
                                     ])
        reader_train = MinibatchSource([dataset_train], randomize=True)

        # a = dataset_train.num_chunks()

        dataset_test = TrackDataset(map_file=map_file_test,
                                    streams=[
                                        dict(name='features',
                                             shape=(num_channels,
                                                    self.__image_height,
                                                    self.__image_width)),
                                        dict(name='labels',
                                             shape=(num_classes, ))
                                    ])
        reader_test = MinibatchSource([dataset_test], randomize=True)

        # ImageDeserializer loads images in the BGR format, not RGB
        # reader_train = MinibatchSource(ImageDeserializer(map_file_train, StreamDefs(
        #     features = StreamDef(field='image', transforms=transforms),
        #     labels   = StreamDef(field='label', shape=num_classes)
        # )))

        # reader_test = MinibatchSource(ImageDeserializer(map_file_test, StreamDefs(
        #     features = StreamDef(field='image', transforms=transforms),
        #     labels   = StreamDef(field='label', shape=num_classes)
        # )))

        # mb = reader_train.next_minibatch(10)

        input_var = input_variable(
            (num_channels, self.__image_height, self.__image_width))
        label_var = input_variable((num_classes))

        model = model_definition(input_var)

        ce = cross_entropy_with_softmax(model, label_var)
        pe = classification_error(model, label_var)

        epoch_size = num_train

        lr_per_minibatch = learning_parameter_schedule([0.01] * 10 +
                                                       [0.003] * 10 + [0.001],
                                                       epoch_size=epoch_size)
        momentums = momentum_schedule(0.9, minibatch_size=minibatch_size)
        l2_reg_weight = 0.001

        learner = momentum_sgd(model.parameters,
                               lr=lr_per_minibatch,
                               momentum=momentums,
                               l2_regularization_weight=l2_reg_weight)
        progress_printer = ProgressPrinter(tag='Training', num_epochs=epochs)
        trainer = cntk.train.Trainer(model, (ce, pe), [learner],
                                     [progress_printer])

        input_map = {
            input_var: reader_train.streams.features,
            label_var: reader_train.streams.labels
        }

        print("Training started")
        batch_index = 0
        plot_data = {'batchindex': [], 'loss': [], 'error': []}
        for epoch in range(epochs):
            sample_count = 0
            while sample_count < epoch_size:
                data = reader_train.next_minibatch(
                    min(minibatch_size, epoch_size - sample_count),
                    input_map=input_map)

                trainer.train_minibatch(data)
                sample_count += data[label_var].num_samples

                batch_index += 1
                plot_data['batchindex'].append(batch_index)
                plot_data['loss'].append(
                    trainer.previous_minibatch_loss_average)
                plot_data['error'].append(
                    trainer.previous_minibatch_evaluation_average)

            trainer.summarize_training_progress()

        metric_numer = 0
        metric_denom = 0
        sample_count = 0
        minibatch_index = 0
        epoch_size = num_test

        while sample_count < epoch_size:
            current_minibatch = min(minibatch_size, epoch_size - sample_count)

            data = reader_test.next_minibatch(current_minibatch,
                                              input_map=input_map)

            metric_numer += trainer.test_minibatch(data) * current_minibatch
            metric_denom += current_minibatch

            sample_count += data[label_var].num_samples
            minibatch_index += 1

        print("")
        print("Final Results: Minibatch[1-{}]: errs = {:0.1f}% * {}".format(
            minibatch_index + 1, (metric_numer * 100.0) / metric_denom,
            metric_denom))
        print("")

        model.save(output_model_path, format=ModelFormat.ONNX)
Пример #48
0
def get_minibatch(bmuf, working_dir, mb_source):
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    
    if mb_source == "numpy":
        for i in range(num_batches):
            features = []
            labels = []
            for j in range(batch_size):
                seq_len_j = [seq_len, seq_len + 5, seq_len - 5][j % 3]
                x = np.random.rand( seq_len_j, feat_dim).astype(np.float32)
                y = np.random.rand( seq_len_j, label_dim).astype(np.float32)
                features.append(x)    
                labels.append(y)
            yield {bmuf.feat: features, bmuf.label: labels}
    
    if mb_source in ("ctf_utterance", "ctf_frame", "ctf_bptt"):
        if mb_source == "ctf_frame":
            # frame-mode data without sequence ids
            ctf_data = '''\
|S0 0.49  0.18  0.84  0.7   0.59 |S1 0.12  0.24  0.14
|S0 0.69  0.63  0.47  0.93  0.69 |S1 0.34  0.85  0.17
|S0 0.04  0.5   0.39  0.86  0.28 |S1 0.62  0.36  0.53
|S0 0.71  0.9   0.15  0.83  0.18 |S1 0.2   0.74  0.04
|S0 0.38  0.67  0.46  0.53  0.75 |S1 0.6   0.14  0.35
|S0 0.94  0.54  0.09  0.55  0.08 |S1 0.07  0.53  0.47
|S0 0.11  0.24  0.17  0.72  0.72 |S1 0.9   0.98  0.18
|S0 0.3   1.    0.34  0.06  0.78 |S1 0.15  0.69  0.63
|S0 0.69  0.86  0.59  0.49  0.99 |S1 0.13  0.6   0.21
'''
        # sequence-mode data with sequence ids
        else:
            ctf_data = '''\
0	|S0 0.49  0.18  0.84  0.7   0.59 |S1 0.12  0.24  0.14
0	|S0 0.69  0.63  0.47  0.93  0.69 |S1 0.34  0.85  0.17
0	|S0 0.04  0.5   0.39  0.86  0.28 |S1 0.62  0.36  0.53
0	|S0 0.71  0.9   0.15  0.83  0.18 |S1 0.2   0.74  0.04
0	|S0 0.38  0.67  0.46  0.53  0.75 |S1 0.6   0.14  0.35
0	|S0 0.94  0.54  0.09  0.55  0.08 |S1 0.07  0.53  0.47
0	|S0 0.11  0.24  0.17  0.72  0.72 |S1 0.9   0.98  0.18
2	|S0 0.3   1.    0.34  0.06  0.78 |S1 0.15  0.69  0.63
2	|S0 0.69  0.86  0.59  0.49  0.99 |S1 0.13  0.6   0.21
'''

        ctf_file = os.path.join(working_dir, '2seqtest.txt')
        with open(ctf_file, 'w') as f:
            f.write(ctf_data)
    
        # ctf_utterance mode: whole utterances, no frame mode, no truncation
        frame_mode = False
        truncation_length = 0
        
        if mb_source == "ctf_frame":
            frame_mode = True
        elif mb_source == "ctf_bptt":
            truncation_length = 2
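            # truncated BPTT: the reader delivers sequences in chunks of 2 samples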
            
        mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
            features  = StreamDef(field='S0', shape=feat_dim,  is_sparse=False),
            labels    = StreamDef(field='S1', shape=label_dim,  is_sparse=False)
        )), randomize=False, max_samples = batch_size*num_batches, 
            frame_mode=frame_mode, truncation_length=truncation_length)
        
        for i in range(num_batches):
            minibatch = mbs.next_minibatch(batch_size, {bmuf.feat: mbs.streams.features, bmuf.label: mbs.streams.labels})
            if not minibatch:
                break
            yield minibatch
Пример #49
0
    def driver(self):
        np.random.seed(0)
        # Define the data dimensions
        image_shape = (1, 28, 28)
        input_dim = int(np.prod(image_shape, dtype=int))
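        # 1 * 28 * 28 = 784 input features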
        output_dim = 10
        num_train_samples = 60000
        num_test_samples = 10000
        # The local path where the training and test data might be found or will be downloaded to.
        training_data_path = os.path.join(os.getcwd(), "MNIST_data",
                                          "Train-28x28_cntk_text.txt")
        testing_data_path = os.path.join(os.getcwd(), "MNIST_data",
                                         "Test-28x28_cntk_text.txt")
        # Download the data if they don't already exist
        url_train_image = "train-images-idx3-ubyte.gz"
        url_train_labels = "train-labels-idx1-ubyte.gz"
        if not os.path.exists(training_data_path):
            url_train_image = "train-images-idx3-ubyte.gz"
            url_train_labels = "train-labels-idx1-ubyte.gz"
        print("Loading training data")
        saved_data_dir = os.path.join(os.getcwd(), "MNIST_data")
        train = self.load_mnist_data(url_train_image,
                                     url_train_labels,
                                     num_train_samples,
                                     local_data_dir=saved_data_dir)
        print("Writing training data text file...")
        self.save_as_txt(training_data_path, train)
        print("[Done]")
        url_test_image = "t10k-images-idx3-ubyte.gz"
        url_test_labels = "t10k-labels-idx1-ubyte.gz"
        if not os.path.exists(testing_data_path):
            url_test_image = "t10k-images-idx3-ubyte.gz"
            url_test_labels = "t10k-labels-idx1-ubyte.gz"
        print("Loading testing data")
        saved_data_dir = os.path.join(os.getcwd(), "MNIST_data2")
        test = self.load_mnist_data(url_test_image, url_test_labels,
                                    num_test_samples, saved_data_dir)
        print("Writing testing data text file...")
        self.save_as_txt(testing_data_path, test)
        print("[Done]")

        feature_stream_name = 'features'
        labels_stream_name = 'labels'

        # Convert to CNTK MinibatchSource
        # original as below deprecated------------
        #train_minibatch_source = cntk.text_format_minibatch_source(training_data_path, [
        #cntk.StreamConfiguration(feature_stream_name, input_dim),
        #cntk.StreamConfiguration(labels_stream_name, output_dim)])
        #------------------------------------------------------------------
        train_minibatch_source = MinibatchSource(
            CTFDeserializer(
                training_data_path,
                StreamDefs(features=StreamDef(field='features',
                                              shape=input_dim,
                                              is_sparse=False),
                           labels=StreamDef(field='labels',
                                            shape=output_dim,
                                            is_sparse=False))))
        training_features = train_minibatch_source[feature_stream_name]
        training_labels = train_minibatch_source[labels_stream_name]
        print("Training data from file %s successfully read." %
              training_data_path)

        #test_minibatch_source = cntk.text_format_minibatch_source(testing_data_path, [
        #cntk.StreamConfiguration(feature_stream_name, input_dim),
        #cntk.StreamConfiguration(labels_stream_name, output_dim)])
        test_minibatch_source = MinibatchSource(
            CTFDeserializer(
                testing_data_path,
                StreamDefs(features=StreamDef(field='features',
                                              shape=input_dim,
                                              is_sparse=False),
                           labels=StreamDef(field='labels',
                                            shape=output_dim,
                                            is_sparse=False))))
        test_features = test_minibatch_source[feature_stream_name]
        test_labels = test_minibatch_source[labels_stream_name]
        print("Test data from file %s successfully read." % testing_data_path)

        # Define the input to the neural network
        input_vars = cntk.ops.input_variable(image_shape, np.float32)
        # Create the convolutional neural network
        output = self.create_convolutional_neural_network(input_vars,
                                                          output_dim,
                                                          dropout_prob=0.5)
        #'''
        #----------------------
        #Setting up the trainer
        #----------------------
        #'''
        # Define the label as the other input parameter of the trainer
        labels = cntk.ops.input_variable(output_dim, np.float32)
        # Initialize the parameters for the trainer
        train_minibatch_size = 50
        learning_rate = 1e-4
        momentum = 0.9
        # Define the loss function
        #loss = cntk.ops.cross_entropy_with_softmax(output, labels)
        loss = cntk.cross_entropy_with_softmax(output, labels)
        # Define the function that calculates classification error
        #label_error = cntk.ops.classification_error(output, labels)
        label_error = cntk.classification_error(output, labels)
        # Instantiate the trainer object to drive the model training
        #learner = cntk.adam_sgd(output.parameters, learning_rate, momentum)
        learner = cntk.adam(
            output.parameters,
            learning_rate_schedule(learning_rate, UnitType.sample),
            momentum_schedule(momentum))
        trainer = cntk.Trainer(output, (loss, label_error), [learner])
        #'''
        #-----------------------------------------
        #Training the Convolutional Neural Network
        #-----------------------------------------
        #'''
        num_training_epoch = 1
        training_progress_output_freq = 100

        for epoch in range(num_training_epoch):
            sample_count = 0
            num_minibatch = 0
            # loop over minibatches in the epoch
            while sample_count < num_train_samples:
                minibatch = train_minibatch_source.next_minibatch(
                    min(train_minibatch_size,
                        num_train_samples - sample_count))
                # Specify the mapping of input variables in the model to actual minibatch data to be trained with
                data = {
                    input_vars: minibatch[training_features],
                    labels: minibatch[training_labels]
                }
                trainer.train_minibatch(data)
                sample_count += data[labels].num_samples
                num_minibatch += 1
                #Print the training progress data
                if num_minibatch % training_progress_output_freq == 0:
                    #training_loss = cntk.get_train_loss(trainer)
                    training_loss = trainer.previous_minibatch_loss_average
                    #eval_error = cntk.get_train_eval_criterion(trainer)
                    eval_error = trainer.previous_minibatch_evaluation_average
                    print(
                        "Epoch %d | # of Samples: %6d | Loss: %.6f | Error: %.6f"
                        % (epoch, sample_count, training_loss, eval_error))

        print("Training Completed.", end="\n\n")

        #'''
        #-------------------
        #Classification Test
        #--------------------
        #'''

        test_minibatch_size = 1000
        sample_count = 0
        test_results = []

        while sample_count < num_test_samples:
            minibatch = test_minibatch_source.next_minibatch(
                min(test_minibatch_size, num_test_samples - sample_count))
            # Specify the mapping of input variables in the model to actual minibatch data to be tested with
            data = {
                input_vars: minibatch[test_features],
                labels: minibatch[test_labels]
            }
            eval_error = trainer.test_minibatch(data)
            test_results.append(eval_error)
            sample_count += data[labels].num_samples
        # Printing the average of evaluation errors of all test minibatches
        print("Average errors of all test minibatches: %.3f%%" %
              (float(np.mean(test_results, dtype=float)) * 100))
        a = 5
Пример #50
0
def generate_visualization(use_brain_script_model, testing=False):
    num_objects_to_eval = 5

    if (use_brain_script_model):
        model_file_name = "07_Deconvolution_BS.model"
        encoder_output_file_name = "encoder_output_BS.txt"
        decoder_output_file_name = "decoder_output_BS.txt"
        enc_node_name = "z.pool1"
        input_node_name = "f2"
        output_node_name = "z"
    else:
        model_file_name = "07_Deconvolution_PY.model"
        encoder_output_file_name = "encoder_output_PY.txt"
        decoder_output_file_name = "decoder_output_PY.txt"
        enc_node_name = "pooling_node"
        input_node_name = "input_node"
        output_node_name = "output_node"

    # define location of output, model and data and check existence
    output_path = os.path.join(abs_path, "Output")
    model_file = os.path.join(model_path, model_file_name)
    data_file = os.path.join(data_path, "Test-28x28_cntk_text.txt")
    if not (os.path.exists(model_file) and os.path.exists(data_file)):
        print("Cannot find required data or model. "
              "Please get the MNIST data set and run 'cntk configFile=07_Deconvolution_BS.cntk' or 'python 07_Deconvolution_PY.py' to create the model.")
        exit(0)

    # create minibatch source
    minibatch_source = MinibatchSource(CTFDeserializer(data_file, StreamDefs(
        features  = StreamDef(field='features', shape=(28*28)),
        labels    = StreamDef(field='labels',   shape=10)
    )), randomize=False, max_sweeps = 1)

    # use this to print all node names in the model
    # print_all_node_names(model_file, use_brain_script_model)

    # load model and pick desired nodes as output
    loaded_model = load_model(model_file)
    output_nodes = combine(
        [loaded_model.find_by_name(input_node_name).owner,
         loaded_model.find_by_name(enc_node_name).owner,
         loaded_model.find_by_name(output_node_name).owner])

    # evaluate model save output
    features_si = minibatch_source['features']
    with open(os.path.join(output_path, decoder_output_file_name), 'wb') as decoder_text_file:
        with open(os.path.join(output_path, encoder_output_file_name), 'wb') as encoder_text_file:
            for i in range(0, num_objects_to_eval):
                mb = minibatch_source.next_minibatch(1)
                raw_dict = output_nodes.eval(mb[features_si])
                output_dict = {}
                for key in raw_dict.keys(): output_dict[key.name] = raw_dict[key]

                encoder_input = output_dict[input_node_name]
                encoder_output = output_dict[enc_node_name]
                decoder_output = output_dict[output_node_name]
                in_values = (encoder_input[0,0].flatten())[np.newaxis]
                enc_values = (encoder_output[0,0].flatten())[np.newaxis]
                out_values = (decoder_output[0,0].flatten())[np.newaxis]

                if not testing:
                    # write results as text and png
                    np.savetxt(decoder_text_file, out_values, fmt="%.6f")
                    np.savetxt(encoder_text_file, enc_values, fmt="%.6f")
                    save_as_png(in_values,  os.path.join(output_path, "imageAutoEncoder_%s__input.png" % i))
                    save_as_png(out_values, os.path.join(output_path, "imageAutoEncoder_%s_output.png" % i))

                    # visualizing the encoding is only possible and meaningful with a single conv filter
                    enc_dim = 7
                    if(enc_values.size == enc_dim*enc_dim):
                        save_as_png(enc_values, os.path.join(output_path, "imageAutoEncoder_%s_encoding.png" % i), dim=enc_dim)

    print("Done. Wrote output to %s" % output_path)
Пример #51
0
def test_sweep_based_schedule(tmpdir, device_id):
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk import cross_entropy_with_softmax, classification_error, plus, reduce_sum, sequence
    from cntk import Trainer

    input_dim = 69

    ctf_data = '''\
0   |S0 3:1   |S1 3:1 |# <s>
0   |S0 4:1 |# A    |S1 32:1 |# ~AH
0   |S0 5:1 |# B    |S1 36:1 |# ~B
0   |S0 4:1 |# A    |S1 31:1 |# ~AE
0   |S0 7:1 |# D    |S1 38:1 |# ~D
0   |S0 12:1 |# I   |S1 47:1 |# ~IY
0   |S0 1:1 |# </s> |S1 1:1 |# </s>
2   |S0 60:1 |# <s> |S1 3:1 |# <s>
2   |S0 61:1 |# A   |S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=input_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=input_dim,  is_sparse=True)
    )), randomize=False)

    in1 = sequence.input_variable(shape=(input_dim,))
    labels = sequence.input_variable(shape=(input_dim,))
    p = parameter(shape=(input_dim,), init=10)
    z = plus(in1, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    lr_per_sample = learning_rate_schedule([0.3, 0.2, 0.1, 0.0], UnitType.sample)
    learner = sgd(z.parameters, lr_per_sample)
    trainer = Trainer(z, (ce, errs), [learner])

    input_map = {
        in1       : mbs.streams.features,
        labels : mbs.streams.labels
    }

    # fetch minibatch (first sequence)
    data = mbs.next_minibatch(1, input_map=input_map)
    trainer.train_minibatch(data)
    assert learner.learning_rate() == 0.3

    # fetch minibatch (second sequence, sweep ends at this point)
    data = mbs.next_minibatch(1, input_map=input_map)
    trainer.train_minibatch(data)
    assert learner.learning_rate() == 0.2

    # fetch minibatch (both sequences -- entire sweep in one go)
    data = mbs.next_minibatch(9, input_map=input_map)
    trainer.train_minibatch(data)
    assert learner.learning_rate() == 0.1

    # fetch minibatch (multiple sweeps)
    data = mbs.next_minibatch(30, input_map=input_map)
    trainer.train_minibatch(data, outputs=[z.output])
    assert learner.learning_rate() == 0.0
Пример #52
0
#############################
#        Prediction         #
#############################
sample_count 	= 0
output 			= np.zeros((number_images, numLabels, ImageH, ImageW), dtype = np.float32)

print("##################################################")
print("##############   Start Prediction   ##############")
print("##################################################\n")
print("Using model of epoch %d\n" % best_model)
print("Prediction:   0 %% (% 5.1f samples/s)"% 0, end = '', flush = True)

while sample_count < number_images:
	t_start_mb		= time.time()
	currentMBsize 	= min(minibatchSize, number_images - sample_count)
	data 			= reader.next_minibatch(currentMBsize, input_map = input_map)
	output_mb 		= model.eval(data)
	
	output[sample_count:sample_count+currentMBsize,] = np.squeeze(output_mb)
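	# np.squeeze drops the singleton axes of the evaluated minibatch so it fits the preallocated output slice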
	sample_count += currentMBsize

	sys.stdout.write('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b')
	print("% 3d %% (% 5.1f samples/s)" % (math.floor(100*sample_count/number_images), currentMBsize/(time.time()-t_start_mb)), end = '', flush = True)

sys.stdout.write('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b')
print("                  ")

print("\nSaving file...", end = '', flush = True)
sio.savemat(OutputFile_PathAbs, {'pred':np.transpose(output)})
print("Finished!\n")