def create_mb_source(image_height, image_width, num_channels, map_file):
    transforms = [ImageDeserializer.scale(width=image_width, height=image_height,
                                          channels=num_channels, interpolations='linear')]
    image_source = ImageDeserializer(map_file)
    image_source.ignore_labels()
    image_source.map_features('features', transforms)
    return MinibatchSource(image_source, randomize=False)

def create_image_mb_source(map_file, is_training, total_number_of_samples):
    if not os.path.exists(map_file):
        raise RuntimeError("File '%s' does not exist." % map_file)

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            xforms.crop(crop_type='randomside', side_ratio=(0.4375, 0.875),
                        jitter_type='uniratio')  # train uses jitter
        ]
    else:
        transforms += [
            xforms.crop(crop_type='center', side_ratio=0.5833333)  # test has no jitter
        ]

    transforms += [
        xforms.scale(width=image_width, height=image_height,
                     channels=num_channels, interpolations='linear'),
    ]

    # deserializer
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),  # first column in map file is referred to as 'image'
            labels=StreamDef(field='label', shape=num_classes))),      # and second as 'label'
        randomize=is_training,
        epoch_size=total_number_of_samples,
        multithreaded_deserializer=True)

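# Usage sketch (added illustration, not original code): wiring the reader above
# into a minibatch loop. Assumes the globals image_width, image_height,
# num_channels and num_classes referenced by create_image_mb_source are defined,
# and 'train.map' is a hypothetical CNTK image map file.
def example_read_image_minibatch():
    source = create_image_mb_source('train.map', is_training=True,
                                    total_number_of_samples=50000)
    mb = source.next_minibatch(64)           # request 64 samples
    features = mb[source.streams.features]   # keyed by the StreamDefs names above
    labels = mb[source.streams.labels]
    return features.num_samples, labels.num_samples
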
def create_mb_source(image_height, image_width, num_channels, map_file, mean_file, is_training):
    if not os.path.exists(map_file):
        raise RuntimeError("File '%s' does not exist." % (map_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            xforms.crop(crop_type='randomside', side_ratio=0.875,
                        jitter_type='uniratio')  # train uses jitter
        ]
    else:
        transforms += [
            xforms.crop(crop_type='center', side_ratio=0.875)  # test has no jitter
        ]

    transforms += [
        xforms.scale(width=image_width, height=image_height,
                     channels=num_channels, interpolations='linear'),
    ]

    if mean_file != '':
        transforms += [
            xforms.mean(mean_file),
        ]

    # deserializer
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms)  # first column in map file is referred to as 'image'
        )),
        randomize=is_training,
        multithreaded_deserializer=True,
        max_sweeps=1)

def create_reader(path, vocab_dim, entity_dim, randomize,
                  rand_size=DEFAULT_RANDOMIZATION_WINDOW, size=INFINITELY_REPEAT):
    """
    Create the data reader for the model.

    Args:
        path: The data path
        vocab_dim: The dimension of the vocabulary
        entity_dim: The dimension of entities
        randomize: Whether to shuffle the data before feeding it to the trainer
    """
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        context=StreamDef(field='C', shape=vocab_dim, is_sparse=True),
        query=StreamDef(field='Q', shape=vocab_dim, is_sparse=True),
        entities=StreamDef(field='E', shape=1, is_sparse=False),
        label=StreamDef(field='L', shape=1, is_sparse=False),
        entity_ids=StreamDef(field='EID', shape=entity_dim, is_sparse=True))),
        randomize=randomize)

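# Illustrative CTF line for the reader above (an assumed example, not taken
# from the original data): each sample carries sparse context ('C'), query
# ('Q') and entity-id ('EID') one-hot entries plus dense scalar entity ('E')
# and label ('L') values, e.g.
#   0 |C 492:1 |Q 102:1 |E 1 |L 0 |EID 3:1
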
def create_video_mb_source(map_files, num_channels, image_height, image_width, num_classes):
    transforms = [
        xforms.crop(crop_type='center', crop_size=224),
        xforms.scale(width=image_width, height=image_height,
                     channels=num_channels, interpolations='linear')
    ]

    map_files = sorted(map_files, key=lambda x: int(x.split('Map_')[1].split('.')[0]))
    print(map_files)

    # Create multiple image sources
    sources = []
    for i, map_file in enumerate(map_files):
        streams = {
            "feature" + str(i): StreamDef(field='image', transforms=transforms),
            "label" + str(i): StreamDef(field='label', shape=num_classes)
        }
        sources.append(ImageDeserializer(map_file, StreamDefs(**streams)))

    return MinibatchSource(sources, max_sweeps=1, randomize=False)

def create_mb_source(img_height, img_width, img_channels, n_classes, n_rois, data_path, data_set):
    rois_dim = 4 * n_rois
    label_dim = n_classes * n_rois

    path = os.path.normpath(os.path.join(abs_path, data_path))
    if data_set == 'test':
        map_file = os.path.join(path, test_map_filename)
    else:
        map_file = os.path.join(path, train_map_filename)
    roi_file = os.path.join(path, data_set + rois_filename_postfix)
    label_file = os.path.join(path, data_set + roilabels_filename_postfix)

    if not os.path.exists(map_file) or not os.path.exists(roi_file) or not os.path.exists(label_file):
        raise RuntimeError("File '%s', '%s' or '%s' does not exist. "
                           "Please run install_fastrcnn.py from Examples/Image/Detection/FastRCNN to fetch them" %
                           (map_file, roi_file, label_file))

    # read images
    image_source = ImageDeserializer(map_file)
    image_source.ignore_labels()
    image_source.map_features(features_stream_name,
                              [ImageDeserializer.scale(width=img_width, height=img_height,
                                                       channels=img_channels,
                                                       scale_mode="pad", pad_value=114,
                                                       interpolations='linear')])

    # read rois and labels
    roi_source = CTFDeserializer(roi_file)
    roi_source.map_input(roi_stream_name, dim=rois_dim, format="dense")
    label_source = CTFDeserializer(label_file)
    label_source.map_input(label_stream_name, dim=label_dim, format="dense")

    # define a composite reader
    return MinibatchSource([image_source, roi_source, label_source],
                           epoch_size=sys.maxsize,
                           randomize=data_set == "train")

def create_mb_source(data_set, img_height, img_width, n_classes, n_rois, data_path, randomize):
    # set paths
    map_file = join(data_path, data_set + '.txt')
    roi_file = join(data_path, data_set + '.rois.txt')
    label_file = join(data_path, data_set + '.roilabels.txt')
    if not os.path.exists(map_file) or not os.path.exists(roi_file) or not os.path.exists(label_file):
        raise RuntimeError("File '%s', '%s' or '%s' does not exist. " %
                           (map_file, roi_file, label_file))

    # read images
    nrImages = len(readTable(map_file))
    transforms = [scale(width=img_width, height=img_height, channels=3,
                        scale_mode="pad", pad_value=114, interpolations='linear')]
    image_source = ImageDeserializer(map_file, StreamDefs(
        features=StreamDef(field='image', transforms=transforms)))

    # read rois and labels
    rois_dim = 4 * n_rois
    label_dim = n_classes * n_rois
    roi_source = CTFDeserializer(roi_file, StreamDefs(
        rois=StreamDef(field='rois', shape=rois_dim, is_sparse=False)))
    label_source = CTFDeserializer(label_file, StreamDefs(
        roiLabels=StreamDef(field='roiLabels', shape=label_dim, is_sparse=False)))

    # define a composite reader
    mb = MinibatchSource([image_source, roi_source, label_source],
                         epoch_size=sys.maxsize, randomize=randomize)
    return (mb, nrImages)

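# Note on the composite reader above (an assumption about the data layout, not
# stated in the code): the image map file, the ROI file and the label file must
# describe the same samples in the same order, since the MinibatchSource joins
# the three deserializers by sequence id.
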
def create_reader(map_file, train, dimensions, classes, total_number_of_samples):
    print(f"Reading map file: {map_file} with number of samples {total_number_of_samples}")

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    # finalize_network uses data augmentation (translation only)
    if train:
        transforms += [
            xforms.crop(crop_type='randomside',
                        area_ratio=(0.08, 1.0), aspect_ratio=(0.75, 1.3333),
                        jitter_type='uniratio'),
            xforms.color(brightness_radius=0.4, contrast_radius=0.4, saturation_radius=0.4)
        ]

    transforms += [
        xforms.scale(width=dimensions['width'], height=dimensions['height'],
                     channels=dimensions['depth'], interpolations='linear')
    ]

    source = MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),
            labels=StreamDef(field='label', shape=len(classes)))),
        randomize=train,
        max_samples=total_number_of_samples,
        multithreaded_deserializer=True)
    return source

def create_reader(map_file, mean_file, train):
    if not os.path.exists(map_file) or not os.path.exists(mean_file):
        raise RuntimeError(
            "File '%s' or '%s' does not exist. Please run install_cifar10.py from DataSets/CIFAR-10 to fetch them" %
            (map_file, mean_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if train:
        transforms += [
            ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8,
                                   jitter_type='uniratio')  # train uses jitter
        ]
    transforms += [
        ImageDeserializer.scale(width=image_width, height=image_height,
                                channels=num_channels, interpolations='linear'),
        ImageDeserializer.mean(mean_file)
    ]

    # deserializer
    return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
        features=StreamDef(field='image', transforms=transforms),  # first column in map file is referred to as 'image'
        labels=StreamDef(field='label', shape=num_classes))))      # and second as 'label'

def create_reader(map_file, mean_file, train,
                  image_height=64, image_width=64, num_channels=3, num_classes=32):
    # transformation pipeline for the features has jitter/crop only when training
    # https://docs.microsoft.com/en-us/python/api/cntk.io.transforms?view=cntk-py-2.2
    trs = []
    if train:
        trs += [
            transforms.crop(crop_type='randomside', side_ratio=0,
                            jitter_type='none')  # Horizontal flip enabled
        ]
    trs += [
        transforms.scale(width=image_width, height=image_height,
                         channels=num_channels, interpolations='linear'),
        transforms.mean(mean_file)
    ]

    # deserializer
    image_source = ImageDeserializer(map_file, StreamDefs(
        features=StreamDef(field='image', transforms=trs),  # first column in map file is referred to as 'image'
        labels=StreamDef(field='label', shape=num_classes)  # and second as 'label'
    ))
    return MinibatchSource(image_source)

def test_large_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_2)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))),
        randomization_window_in_chunks=0)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)
    features = mb[features_si]
    labels = mb[labels_si]

    # Actually, the minibatch spans over multiple sweeps,
    # not sure if this is an artificial situation, but
    # maybe instead of a boolean flag we should indicate
    # the largest sweep index the data was taken from.
    assert features.end_of_sweep
    assert labels.end_of_sweep

    assert features.num_samples == 1000 - 1000 % 7
    assert labels.num_samples == 5 * (1000 // 7)

    assert mb[features_si].num_sequences == (1000 // 7)
    assert mb[labels_si].num_sequences == (1000 // 7)

def test_create_two_image_deserializers(tmpdir):
    mbdata = r'''filename	0
filename2	0
'''
    map_file = str(tmpdir / 'mbdata.txt')
    with open(map_file, 'w') as f:
        f.write(mbdata)

    image_width = 100
    image_height = 200
    num_channels = 3

    transforms = [
        xforms.crop(crop_type='randomside', side_ratio=0.5, jitter_type='uniratio'),
        xforms.scale(width=image_width, height=image_height,
                     channels=num_channels, interpolations='linear')
    ]

    image1 = ImageDeserializer(
        map_file, StreamDefs(f1=StreamDef(field='image', transforms=transforms)))
    image2 = ImageDeserializer(
        map_file, StreamDefs(f2=StreamDef(field='image', transforms=transforms)))

    mb_source = MinibatchSource([image1, image2])
    assert isinstance(mb_source, MinibatchSource)

def test_MinibatchData_and_Value_as_input(tmpdir):
    mbdata = r'''0 |S0 100'''

    tmpfile = str(tmpdir / 'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    defs = StreamDefs(f1=StreamDef(field='S0', shape=1))
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, defs), randomize=False)

    f1_si = mb_source.stream_info('f1')

    mb = mb_source.next_minibatch(1)

    f1 = input(shape=(1,), needs_gradient=True, name='f')
    res = f1 * 2

    assert res.eval({f1: mb[f1_si]}) == [[200]]
    # Test MinibatchData
    assert res.eval(mb[f1_si]) == [[200]]
    # Test Value
    assert res.eval(mb[f1_si].data) == [[200]]
    # Test NumPy (converted back from MinibatchData)
    assert res.eval(mb[f1_si].asarray()) == [[200]]
    # Test Value
    assert res.eval(mb[f1_si].data) == [[200]]

def test_multiple_mlf_files():
    os.chdir(data_path)

    feature_dim = 33
    num_classes = 132
    context = 2

    test_mlf_path = "../../../../Tests/EndToEndTests/Speech/Data/glob_00001.mlf"

    features_file = "glob_0000.scp"
    label_files = ["glob_0000.mlf", test_mlf_path]
    label_mapping_file = "state.list"

    fd = HTKFeatureDeserializer(
        StreamDefs(amazing_features=StreamDef(
            shape=feature_dim, context=(context, context), scp=features_file)))

    ld = HTKMLFDeserializer(
        label_mapping_file,
        StreamDefs(awesome_labels=StreamDef(shape=num_classes, mlf=label_files)))

    # Make sure we can read at least one minibatch.
    mbsource = MinibatchSource([fd, ld])
    mbsource.next_minibatch(1)

    os.chdir(abs_path)

def create_reader(map_file, mean_file, train, distributed_communicator=None):
    if not os.path.exists(map_file) or not os.path.exists(mean_file):
        cifar_py3 = "" if sys.version_info.major < 3 else "_py3"
        raise RuntimeError(
            "File '%s' or '%s' does not exist. Please run CifarDownload%s.py and CifarConverter%s.py from CIFAR-10 to fetch them" %
            (map_file, mean_file, cifar_py3, cifar_py3))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if train:
        transforms += [
            ImageDeserializer.crop(crop_type='Random', ratio=0.8,
                                   jitter_type='uniRatio')  # train uses jitter
        ]
    transforms += [
        ImageDeserializer.scale(width=image_width, height=image_height,
                                channels=num_channels, interpolations='linear'),
        ImageDeserializer.mean(mean_file)
    ]

    # deserializer
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),  # first column in map file is referred to as 'image'
            labels=StreamDef(field='label', shape=num_classes))),      # and second as 'label'
        distributed_communicator=distributed_communicator)

def test_base64_image_deserializer(tmpdir):
    import io, base64, uuid
    from PIL import Image

    images, b64_images = [], []

    np.random.seed(1)
    for i in range(10):
        data = np.random.randint(0, 2 ** 8, (5, 7, 3))
        image = Image.fromarray(data.astype('uint8'), "RGB")
        buf = io.BytesIO()
        image.save(buf, format='PNG')
        assert image.width == 7 and image.height == 5
        b64_images.append(base64.b64encode(buf.getvalue()))
        images.append(np.array(image))

    image_data = str(tmpdir / 'mbdata1.txt')
    seq_ids = []
    uid = uuid.uuid1().int >> 64
    with open(image_data, 'wb') as f:
        for i, data in enumerate(b64_images):
            seq_id = uid ^ i
            seq_id = str(seq_id).encode('ascii')
            seq_ids.append(seq_id)
            line = seq_id + b'\t'
            label = str(i).encode('ascii')
            line += label + b'\t' + data + b'\n'
            f.write(line)

    ctf_data = str(tmpdir / 'mbdata2.txt')
    with open(ctf_data, 'wb') as f:
        for i, sid in enumerate(seq_ids):
            line = sid + b'\t' + b'|index ' + str(i).encode('ascii') + b'\n'
            f.write(line)

    transforms = [xforms.scale(width=7, height=5, channels=3)]
    b64_deserializer = Base64ImageDeserializer(
        image_data,
        StreamDefs(images=StreamDef(field='image', transforms=transforms),
                   labels=StreamDef(field='label', shape=10)))
    ctf_deserializer = CTFDeserializer(
        ctf_data, StreamDefs(index=StreamDef(field='index', shape=1)))

    mb_source = MinibatchSource([ctf_deserializer, b64_deserializer])
    assert isinstance(mb_source, MinibatchSource)

    for j in range(100):
        mb = mb_source.next_minibatch(10)

        index_stream = mb_source.streams['index']
        index = mb[index_stream].asarray().flatten()

        image_stream = mb_source.streams['images']
        results = mb[image_stream].asarray()

        for i in range(10):
            # original images are RGB, openCV produces BGR images,
            # reverse the last dimension of the original images
            bgrImage = images[int(index[i])][:, :, ::-1]
            assert (bgrImage == results[i][0]).all()

def create_reader(map_file, mean_file, train):
    # transformation pipeline for the features has jitter/crop only when training
    trs = []
    # if train:
    #     transforms += [
    #         ImageDeserializer.crop(crop_type='Random', ratio=0.8, jitter_type='uniRatio')  # train uses jitter
    #     ]
    trs += [
        transforms.scale(width=image_width, height=image_height,
                         channels=num_channels, interpolations='linear'),
        transforms.mean(mean_file)
    ]

    # deserializer
    return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
        features=StreamDef(field='image', transforms=trs),  # first column in map file is referred to as 'image'
        labels=StreamDef(field='label', shape=num_classes)  # and second as 'label'
    )))

def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import input, times

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True))),
        randomize=False, epoch_size=2)

    raw_input = sequence.input(shape=input_vocab_dim,
                               sequence_axis=Axis('inputAxis'),
                               name='raw_input', is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
                                  input_map={raw_input: mbs.streams.features},
                                  device=cntk_device(device_id))

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [[3, 4, 5, 4, 7, 12, 1], [60, 61]]
    data = [csr(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = Value.one_hot(one_hot_data, num_classes=input_vocab_dim,
                         device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_hot)])

def create_mb_source(map_file, image_width, image_height, num_channels, num_classes, randomize=True):
    transforms = []
    transforms += [xforms.crop(crop_type='randomside', side_ratio=0.8)]
    transforms += [xforms.scale(width=image_width, height=image_height,
                                channels=num_channels, interpolations='linear')]
    return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
        features=StreamDef(field='image', transforms=transforms),
        labels=StreamDef(field='label', shape=num_classes))),
        randomize=randomize)

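# Illustrative map-file contents for the ImageDeserializer above (an assumed
# example; the standard CNTK map file is two tab-separated columns, image path
# then numeric class label):
#   images/cat_001.jpg	0
#   images/dog_042.jpg	1
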
def create_reader(path, is_training, input_dim, num_label_classes):
    """ Reads a CNTK-format text file with 'labels' and 'features' streams. """
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        labels=StreamDef(field='labels', shape=num_label_classes),
        features=StreamDef(field='features', shape=input_dim)
    )), randomize=is_training,
        max_sweeps=INFINITELY_REPEAT if is_training else 1)

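# Usage sketch (illustration only; the file name and dimensions are placeholder
# assumptions): binding the reader above to input variables via an input_map,
# the usual CNTK pattern before handing minibatches to a Trainer.
def example_bind_ctf_reader(path='Train_cntk_text.txt'):
    import cntk as C
    reader = create_reader(path, is_training=True,
                           input_dim=784, num_label_classes=10)
    features = C.input_variable(784)
    labels = C.input_variable(10)
    input_map = {features: reader.streams.features,
                 labels: reader.streams.labels}
    # pull one minibatch of 64 samples; a training loop would repeat this
    return reader.next_minibatch(64, input_map=input_map)
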
def create_reader(path, randomize, input_vocab_dim, label_vocab_dim, size=INFINITELY_REPEAT):
    if not os.path.exists(path):
        raise RuntimeError("File '%s' does not exist." % (path))

    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True)
    )), randomize=randomize, max_samples=size)

def create_reader_raw(path, is_training, input_dim, num_label_classes):
    """ Reads in the unstandardized (raw) values. """
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        labels=StreamDef(field='rawlabels', shape=num_label_classes),
        features=StreamDef(field='rawfeatures', shape=input_dim)
    )), randomize=is_training,
        max_sweeps=INFINITELY_REPEAT if is_training else 1)

def create_reader(path, is_training, input_dim, label_dim):
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='features', shape=input_dim),
        labels=StreamDef(field='labels', shape=label_dim))),
        randomize=is_training,
        epoch_size=INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)

def test_htk_deserializers():
    mbsize = 640
    epoch_size = 1000 * mbsize
    lr = [0.001]

    feature_dim = 33
    num_classes = 132
    context = 2

    os.chdir(data_path)

    features_file = "glob_0000.scp"
    labels_file = "glob_0000.mlf"
    label_mapping_file = "state.list"

    fd = HTKFeatureDeserializer(
        StreamDefs(amazing_features=StreamDef(
            shape=feature_dim, context=(context, context), scp=features_file)))

    ld = HTKMLFDeserializer(
        label_mapping_file,
        StreamDefs(awesome_labels=StreamDef(shape=num_classes, mlf=labels_file)))

    reader = MinibatchSource([fd, ld])

    features = C.input_variable(((2 * context + 1) * feature_dim))
    labels = C.input_variable((num_classes))

    model = Sequential([For(range(3), lambda: Recurrence(LSTM(256))),
                        Dense(num_classes)])
    z = model(features)
    ce = C.cross_entropy_with_softmax(z, labels)
    errs = C.classification_error(z, labels)

    learner = C.adam_sgd(z.parameters,
                         lr=C.learning_rate_schedule(lr, C.UnitType.sample, epoch_size),
                         momentum=C.momentum_as_time_constant_schedule(1000),
                         low_memory=True,
                         gradient_clipping_threshold_per_sample=15,
                         gradient_clipping_with_truncation=True)
    trainer = C.Trainer(z, (ce, errs), learner)

    input_map = {
        features: reader.streams.amazing_features,
        labels: reader.streams.awesome_labels
    }

    pp = C.ProgressPrinter(freq=0)
    # just run and verify it doesn't crash
    for i in range(3):
        mb_data = reader.next_minibatch(mbsize, input_map=input_map)
        trainer.train_minibatch(mb_data)
        pp.update_with_trainer(trainer, with_metric=True)
    assert True
    os.chdir(abs_path)

def test_index_caching(tmpdir):
    pytest.skip("test_index_caching is disabled")
    import os, time, glob, uuid
    MB = 1 << 20
    data = MBDATA_DENSE_1
    while len(data) < 64 * MB:
        data += data

    timeWithoutCache, timeWithCache = 0, 0

    cpu = C.device.cpu()
    streams = stream_defs[0]

    for _ in range(3):
        tmpfile = _write_data(tmpdir, data, str(uuid.uuid4()))

        cache_files = glob.glob(str(tmpdir + '/*.cache'))
        for cache_file in cache_files:
            os.remove(cache_file)

        config = CTFDeserializer(tmpfile, streams)
        config['cacheIndex'] = C.cntk_py.DictionaryValue(True)

        start = time.time()
        MinibatchSource(config, randomize=False).next_minibatch(1, device=cpu)
        end = time.time()

        timeWithoutCache += (end - start)

        time.sleep(5)

        cache_files = glob.glob(str(tmpdir + '/*.cache'))
        assert len(cache_files) == 1

        start = time.time()
        MinibatchSource(config, randomize=False).next_minibatch(1, device=cpu)
        end = time.time()

        os.remove(tmpfile)

        timeWithCache += (end - start)

    assert timeWithCache < timeWithoutCache

def compare_cbf_and_ctf(num_mbs, mb_size, randomize):
    ctf = MinibatchSource(CTFDeserializer(tmpfile, streams), randomize=randomize)
    cbf = MinibatchSource(CBFDeserializer(tmpfile + '.bin', streams), randomize=randomize)

    ctf_stream_names = sorted([x.m_name for x in ctf.stream_infos()])
    cbf_stream_names = sorted([x.m_name for x in cbf.stream_infos()])

    assert (ctf_stream_names == cbf_stream_names)

    for _ in range(num_mbs):
        ctf_mb = ctf.next_minibatch(mb_size, device=device)
        cbf_mb = cbf.next_minibatch(mb_size, device=device)

        for name in cbf_stream_names:
            ctf_data = ctf_mb[ctf[name]]
            cbf_data = cbf_mb[cbf[name]]

            assert ctf_data.num_samples == cbf_data.num_samples
            assert ctf_data.num_sequences == cbf_data.num_sequences
            assert ctf_data.shape == cbf_data.shape
            assert ctf_data.end_of_sweep == cbf_data.end_of_sweep
            assert ctf_data.is_sparse == cbf_data.is_sparse
            assert ctf_data.data.masked_count() == cbf_data.data.masked_count()

            # XXX:
            # assert (ctf_data.asarray() == cbf_data.asarray()).all()
            # not using asarray because for sparse values it fails with
            # some strange exception "sum of the rank of the mask and Variable
            # rank does not equal the Value's rank".
            assert C.cntk_py.are_equal(ctf_data.data.data, cbf_data.data.data)

            if ctf_data.data.masked_count() > 0:
                assert (ctf_data.data.mask == cbf_data.data.mask).all()
            # XXX: if mask_count is zero, mb_data.data.mask fails with
            # "AttributeError: 'Value' object has no attribute 'mask'"!

            # XXX: without invoking erase, next_minibatch will fail with:
            # "Resize: Cannot resize the matrix because it is a view."
            ctf_data.data.erase()
            cbf_data.data.erase()

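# Note (assumption, not stated in the code): the '.bin' companion file consumed
# by CBFDeserializer above is expected to be a CNTK binary-format (CBF)
# conversion of the same CTF data, e.g. produced by the ctf2bin.py converter
# shipped in the CNTK repository's Scripts directory.
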
def test_one_sweep(tmpdir):
    ctf = create_ctf_deserializer(tmpdir)
    sources = [MinibatchSource(ctf, max_sweeps=1),
               MinibatchSource(ctf, max_samples=FULL_DATA_SWEEP),
               MinibatchSource(ctf, max_sweeps=1, max_samples=INFINITELY_REPEAT),
               MinibatchSource(ctf, max_samples=FULL_DATA_SWEEP, max_sweeps=INFINITELY_REPEAT)]

    for source in sources:
        input_map = {'features': source['features']}

        mb = source.next_minibatch(100, input_map)
        assert 'features' in mb
        assert mb['features'].num_samples == 4
        assert mb['features'].end_of_sweep

        mb = source.next_minibatch(100, input_map)
        assert not mb

def test_minibatch(tmpdir):
    mbdata = r'''0	|S0 0	|S1 0
0	|S0 1	|S1 1
0	|S0 2
0	|S0 3	|S1 3
1	|S0 4
1	|S0 5	|S1 1
1	|S0 6	|S1 2
'''
    tmpfile = str(tmpdir / 'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    from cntk.io import CTFDeserializer, MinibatchSource, StreamDef, StreamDefs
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))))

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)

    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert len(features.value) == 2
    expected_features = \
        [
            [[0], [1], [2], [3]],
            [[4], [5], [6]]
        ]

    for res, exp in zip(features.value, expected_features):
        assert np.allclose(res, exp)

    assert np.allclose(features.mask,
                       [[2, 1, 1, 1],
                        [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert len(labels.value) == 2
    expected_labels = \
        [
            [[0], [1], [3]],
            [[1], [2]]
        ]
    for res, exp in zip(labels.value, expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.mask,
                       [[2, 1, 1],
                        [2, 1, 0]])

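# Mask semantics for the assertions above: in a packed CNTK minibatch, a mask
# value of 2 marks the beginning of a sequence, 1 a valid continuation step,
# and 0 a padding (invalid) position.
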
def create_reader(path, is_training, input_dim, output_dim):
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='attribs', shape=input_dim, is_sparse=False),
        labels=StreamDef(field='species', shape=output_dim, is_sparse=False))),
        randomize=is_training,
        max_sweeps=INFINITELY_REPEAT if is_training else 1)

def create_reader(path, is_training):
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True))),
        randomize=is_training,
        max_sweeps=INFINITELY_REPEAT if is_training else 1)