Example #1
File: fuel_utils.py Project: gunkisu/asr
def get_datastream(path,
                   which_set,
                   batch_size=1,
                   norm_path=None,
                   use_ivectors=False,
                   truncate_ivectors=False,
                   ivector_dim=100,
                   shuffled=True):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    if shuffled:
        iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                         examples=wsj_dataset.num_examples)
    else:
        iterator_scheme = SequentialScheme(batch_size=batch_size,
                                           examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)

    if norm_path:
        data_mean_std = numpy.load(norm_path)
        base_stream = Normalize(data_stream=base_stream,
                                means=data_mean_std['mean'],
                                stds=data_mean_std['std'])

    if use_ivectors:
        fs = FilterSources(data_stream=base_stream,
                           sources=['features', 'ivectors', 'targets'])
        if truncate_ivectors:
            fs = TruncateTransformer(fs, 'ivectors', ivector_dim)
        # fs = ConcatenateTransformer(fs, ['features', 'ivectors'], 'features')
    else:
        fs = FilterSources(data_stream=base_stream,
                           sources=['features', 'targets'])
    return Padding(fs)
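
The example above (and most of the ones below) follows the same pattern: build a DataStream over a dataset, keep only the sources the model needs with FilterSources, and wrap the result in Padding. The following is not one of the collected examples but a minimal, self-contained sketch of that pattern, assuming only that Fuel and NumPy are installed; the toy IndexableDataset, its shapes, and the batch size are made up for illustration and stand in for the WSJ H5PYDataset used above.

from collections import OrderedDict

import numpy
from fuel.datasets import IndexableDataset
from fuel.schemes import ShuffledScheme
from fuel.streams import DataStream
from fuel.transformers import FilterSources, Padding

# Two variable-length "utterances" with three sources; source names mirror the
# examples in this listing, the shapes are arbitrary.
toy_dataset = IndexableDataset(OrderedDict([
    ('features', [numpy.ones((5, 3)), numpy.ones((7, 3))]),
    ('ivectors', [numpy.zeros((5, 10)), numpy.zeros((7, 10))]),
    ('targets', [numpy.array([1, 2]), numpy.array([3])]),
]))
stream = DataStream(toy_dataset,
                    iteration_scheme=ShuffledScheme(toy_dataset.num_examples,
                                                    batch_size=2))
# Keep only the sources the model consumes; 'ivectors' is dropped here.
stream = FilterSources(stream, sources=('features', 'targets'))
# Padding zero-pads each batch to the longest sequence and appends '<source>_mask' sources.
stream = Padding(stream)
features, features_mask, targets, targets_mask = next(stream.get_epoch_iterator())
print(features.shape)  # (2, 7, 3): batch of 2, padded to the longest sequence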
Example #2
 def test_works_on_all_sources(self):
     transformer = FilterSources(
         self.stream, sources=("features", "targets"))
     assert_equal(transformer.sources, ('features', 'targets'))
     assert_equal(list(transformer.get_epoch_iterator()),
                  [(numpy.ones((2, 2, 2)), numpy.array([0, 1])),
                   (numpy.ones((2, 2, 2)), numpy.array([0, 1]))])
Example #3
 def test_works_on_all_sources(self):
     transformer = FilterSources(
         self.stream, sources=("features", "targets"))
     assert_equal(transformer.sources, ('features', 'targets'))
     assert_equal(list(transformer.get_epoch_iterator()),
                  [(numpy.ones((2, 2, 2)), numpy.array([0, 1])),
                   (numpy.ones((2, 2, 2)), numpy.array([0, 1]))])
Example #4
File: data.py Project: tombosc/cpae
    def get_stream(self,
                   part,
                   batch_size=None,
                   max_length=None,
                   seed=None,
                   remove_keys=False,
                   add_bos_=True,
                   remove_n_identical_keys=True):
        dataset = self.get_dataset(part, max_length)
        if self._layout == 'lambada' and part == 'train':
            stream = DataStream(dataset,
                                iteration_scheme=RandomSpanScheme(
                                    dataset.num_examples, max_length, seed))
            stream = Mapping(stream, listify)
        else:
            stream = dataset.get_example_stream()

        if add_bos_:
            stream = SourcewiseMapping(stream,
                                       functools.partial(
                                           add_bos, Vocabulary.BOS),
                                       which_sources=('words'))
        if max_length != None:
            stream = SourcewiseMapping(stream,
                                       functools.partial(
                                           cut_if_too_long, max_length),
                                       which_sources=('words'))
        stream = SourcewiseMapping(stream, vectorize, which_sources=('words'))
        stream = SourcewiseMapping(stream,
                                   word_to_singleton_list,
                                   which_sources=('keys'))
        stream = SourcewiseMapping(stream, vectorize, which_sources=('keys'))
        stream = Flatten(stream, which_sources=('keys'))

        if self._layout == 'dict':
            if remove_keys:
                stream = FilterSources(
                    stream,
                    [source for source in stream.sources if source != 'keys'])
            if remove_n_identical_keys:
                print "remove identical keys"
                stream = FilterSources(stream, [
                    source for source in stream.sources
                    if source != 'n_identical_keys'
                ])
        if not batch_size:
            return stream

        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

        stream = Padding(stream, mask_sources=('words'))
        #stream = Flatten(stream, which_sources=('n_identical_keys'))

        #if self._layout == 'dict':
        #    stream = FilterSources(stream, [source for source in stream.sources
        #                                    if source != 'keys_mask'])
        #    stream = FilterSources(stream, [source for source in stream.sources
        #                                    if source != 'n_identical_keys_mask'])
        return stream
Example #5
File: fuel_utils.py Project: gunkisu/asr
def create_ivector_test_datastream(path, which_set, batch_size=1, delay=0):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    iterator_scheme = SequentialScheme(batch_size=batch_size,
                                       examples=wsj_dataset.num_examples)

    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)

    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'ivectors', 'targets'])

    if delay:
        fs = DelayTransformer(fs, delay)

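    # Keep only the input sources; 'targets' is dropped from the final test stream.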
    fs = FilterSources(data_stream=fs, sources=['features', 'ivectors'])
    return Padding(fs)
Example #6
def combine_datastreams(ds_labeled, ds_unlabeled):
    # Rename the sources for clarity
    if ds_labeled is not None:
        names = {'features': 'features_labeled', 'targets': 'targets_labeled'}
        if 'mask' in ds_labeled.sources:
            names['mask'] = 'masks_labeled'
        ds_labeled = Rename(ds_labeled, names)

    # Rename the source for input pixels and hide its labels!
    if ds_unlabeled is not None:
        sources = list(ds_unlabeled.sources)
        # Remove the 'targets' source so the unlabeled stream does not expose labels
        del sources[sources.index('targets')]

        names = {'features': 'features_unlabeled'}
        if 'mask' in ds_unlabeled.sources:
            names['mask'] = 'masks_unlabeled'
        ds_unlabeled = Rename(FilterSources(ds_unlabeled, sources),
                              names=names)

    if ds_labeled is None:
        return ds_unlabeled

    if ds_unlabeled is None:
        return ds_labeled

    return SemiDataStream(data_stream_labeled=ds_labeled,
                          data_stream_unlabeled=ds_unlabeled)
Example #7
File: l3.py Project: donghyunlee/play
def open_stream(which_sets=('train', ), port=5557, num_examples=None):

    dataset = Blizzard(which_sets=which_sets)

    if num_examples == None:
        num_examples = dataset.num_examples

    data_stream = DataStream.default_stream(dataset,
                                            iteration_scheme=SequentialScheme(
                                                num_examples, batch_size))

    data_stream = ScaleAndShift(data_stream,
                                scale=1 / data_std,
                                shift=-data_mean / data_std)
    data_stream = Mapping(data_stream,
                          _downsample_and_upsample,
                          add_sources=('upsampled', ))
    data_stream = Mapping(data_stream, _equalize_size)
    data_stream = Mapping(data_stream,
                          _get_residual,
                          add_sources=('residual', ))
    data_stream = FilterSources(data_stream,
                                sources=(
                                    'upsampled',
                                    'residual',
                                ))
    data_stream = Mapping(data_stream, _segment_axis)
    data_stream = Mapping(data_stream, _transpose)
    data_stream = ForceFloatX(data_stream)

    start_server(data_stream, port=port)
Example #8
File: fuel_utils.py Project: gunkisu/asr
def get_spkid_stream(path, which_set, batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    iterator_scheme = SequentialScheme(examples=wsj_dataset.num_examples,
                                       batch_size=batch_size)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream, sources=['spks'])
    return fs
Example #9
def get_uttid_stream(path, which_set='test_eval92', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    print path, which_set
    iterator_scheme = SequentialScheme(examples=wsj_dataset.num_examples, batch_size=batch_size)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream, sources=['uttids'])
    return fs
Example #10
def get_datastream(path, which_set='train_si84', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    print path, which_set
    iterator_scheme = ShuffledScheme(batch_size=batch_size, examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream, sources=['features', 'targets'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream
Example #11
def _liacl_data_stream(dataset,
                       rel2index,
                       batch_size,
                       word2index,
                       target='negative_sampling',
                       name="",
                       k=3,
                       shuffle=False,
                       neg_sample_kwargs={}):
    batches_per_epoch = int(np.ceil(dataset.num_examples / float(batch_size)))
    if shuffle:
        iteration_scheme = ShuffledScheme(dataset.num_examples, batch_size)
    else:
        iteration_scheme = SequentialScheme(dataset.num_examples, batch_size)
    data_stream = DataStream(dataset, iteration_scheme=iteration_scheme)
    data_stream = NumberizeWords(data_stream,
                                 word2index,
                                 default=word2index[UNKNOWN_TOKEN],
                                 which_sources=('head', 'tail'))
    data_stream = NumberizeWords(data_stream, rel2index, which_sources=('rel'))

    if target == "score":
        data_stream = Rename(data_stream, {'score': 'target'})
    else:
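        # No per-example 'score' target here: keep only the triple sources;
        # training targets are generated below by (filtered) negative sampling.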
        data_stream = FilterSources(data_stream,
                                    sources=('head', 'tail', 'rel'))

    data_stream = Padding(data_stream,
                          mask_sources=('head, tail'),
                          mask_dtype=np.float32)

    if target == 'negative_sampling':
        logger.info('target for data stream ' + str(name) +
                    ' is negative sampling')
        data_stream = NegativeSampling(data_stream, k=k)
    elif target == 'filtered_negative_sampling':
        logger.info('target for data stream ' + str(name) +
                    ' is filtered negative sampling')
        data_stream = FilteredNegativeSampling(data_stream,
                                               k=k,
                                               **neg_sample_kwargs)
    elif target == 'score':
        logger.info('target for data stream ' + str(name) + ' is score')
    else:
        raise NotImplementedError(
            'target ', target,
            ' must be one of "score" or "negative_sampling"')

    data_stream = MergeSource(data_stream,
                              merge_sources=('head', 'tail', 'head_mask',
                                             'tail_mask', 'rel'),
                              merge_name='input')

    return data_stream, batches_per_epoch
Example #12
File: fuel_utils.py Project: gunkisu/asr
def create_ivector_test_datastream(path,
                                   which_set='test_eval92',
                                   batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    iterator_scheme = SequentialScheme(batch_size=batch_size,
                                       examples=wsj_dataset.num_examples)

    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)

    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'ivectors'])
    return Padding(fs)
Example #13
def get_stream(batch_size,
               source_window=4000,
               target_window=1000,
               num_examples=5000):
    from fuel.datasets.youtube_audio import YouTubeAudio
    data = YouTubeAudio('XqaJ2Ol5cC4')
    train_stream = data.get_example_stream()
    train_stream = ForceFloatX(train_stream)
    window_stream = Window(0,
                           source_window,
                           target_window,
                           overlapping=False,
                           data_stream=train_stream)
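    # Split the windowed stream into two single-source streams, transform each,
    # then Merge them back into one stream.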
    source_stream = FilterSources(window_stream, sources=('features', ))
    feats_stream = Mapping(source_stream, mfcc)
    targets_stream = FilterSources(window_stream, sources=('targets', ))
    targets_stream = Flatten(targets_stream)
    stream = Merge((feats_stream, targets_stream),
                   sources=('features', 'targets'))
    #Add a random Scheme?
    it_scheme = ConstantScheme(batch_size, num_examples)
    batched_stream = Batch(stream, it_scheme, strictness=1)
    return batched_stream
Example #14
    def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
                   num_examples=None, rng=None, seed=None):

        dataset = self.get_dataset(part, add_sources=add_sources)
        if num_examples is None:
            num_examples = dataset.num_examples

        if shuffle:
            iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
        else:
            iteration_scheme = SequentialExampleScheme(num_examples)

        stream = DataStream(
            dataset, iteration_scheme=iteration_scheme)

        stream = FilterSources(stream, (self.recordings_source,
                                        self.labels_source)+tuple(add_sources))
        if self.add_eos:
            stream = Mapping(stream, _AddLabel(self.eos_label))
        if self.add_bos:
            stream = Mapping(stream, _AddLabel(self.bos_label, append=False,
                                               times=self.add_bos))
        if self.preprocess_text:
            stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
        stream = Filter(stream, self.length_filter)
        if self.sort_k_batches and batches:
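            # Read sort_k_batches * batch_size examples at once, sort them by length,
            # and unpack, so the later batching groups sequences of similar length
            # and reduces padding.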
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            stream = Mapping(stream, SortMapping(_length))
            stream = Unpack(stream)

        if self.preprocess_features == 'log_spectrogram':
            stream = Mapping(
                stream, functools.partial(apply_preprocessing,
                                          log_spectrogram))
        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        if not batches:
            return stream

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                            else self.validation_batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        return stream
Example #15
def get_datastream(path, norm_path, which_set='train_si84', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    data_mean_std = numpy.load(norm_path)

    iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                     examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    base_stream = Normalize(data_stream=base_stream,
                            means=data_mean_std['mean'],
                            stds=data_mean_std['std'])
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'targets'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream
Example #16
def define_stream(which_sets=('train',),
                initial_scale=1,
                scale=0.5,
                batch_size=64,
                seq_length=64,
                frame_size=128,
                tbptt_flag = True,
                num_examples=None):

    def _segment_axis(data):
        # Defined inside so that frame_size is available
        x = tuple([numpy.array([segment_axis(x, frame_size, 0) for x in var])
                   for var in data])
        return x

    scale = float(scale)

    dataset = Blizzard(which_sets=which_sets)

    if num_examples is None:
        num_examples = batch_size*(dataset.num_examples/batch_size)

    data_stream = DataStream.default_stream(
            dataset,
            iteration_scheme=SequentialScheme(num_examples, batch_size))

    data_stream = ScaleAndShift(data_stream,
                                scale=1/data_std,
                                shift=-data_mean/float(data_std))

    # Original sampling rate
    data_stream = Resample(data_stream, scale=initial_scale)
    data_stream = Mapping(data_stream, _copy, add_sources=('upsampled',))
    data_stream = Resample(data_stream, scale=scale, which_sources=('upsampled',))
    data_stream = Resample(data_stream, scale=1/scale, which_sources=('upsampled',))

    # data_stream = Mapping(data_stream, _downsample_and_upsample,
    #                       add_sources=('upsampled',))
    data_stream = Mapping(data_stream, _equalize_size)
    data_stream = Mapping(data_stream, _get_residual,
                          add_sources=('residual',))
    data_stream = FilterSources(data_stream,
                                sources=('upsampled', 'residual',))
    data_stream = Mapping(data_stream, _segment_axis)
    data_stream = Mapping(data_stream, _transpose)
    return data_stream
Example #17
def stream_handwriting(which_sets, batch_size, seq_size, tbptt=True):
    dataset = Handwriting(which_sets)
    data_stream = DataStream.default_stream(
        dataset,
        iteration_scheme=ShuffledScheme(
            batch_size * (dataset.num_examples / batch_size), batch_size))
    data_stream = FilterSources(data_stream, sources=('features', ))
    data_stream = Padding(data_stream)
    data_stream = Mapping(data_stream, _transpose)

    if tbptt:
        data_stream = SegmentSequence(data_stream,
                                      add_flag=True,
                                      seq_size=seq_size)

    data_stream = ForceFloatX(data_stream)

    return data_stream
Example #18
def test_filter_sources():
    stream = DataStream(
        IndexableDataset(
            OrderedDict([('features', numpy.ones((4, 2, 2))),
                         ('targets', numpy.array([0, 1, 0, 1]))])),
        iteration_scheme=SequentialScheme(4, 2))

    transformer = FilterSources(stream, sources=("features",))

    assert_equal(transformer.sources, ('features',))
    assert len(next(transformer.get_epoch_iterator())) == 1

    transformer = FilterSources(stream, sources=("features", "targets"))

    assert_equal(transformer.sources, ('features', 'targets'))
    assert len(next(transformer.get_epoch_iterator())) == 2

    transformer = FilterSources(stream, sources=("targets", "features"))

    assert_equal(transformer.sources, ('features', 'targets'))
    assert len(next(transformer.get_epoch_iterator())) == 2

    assert_raises(ValueError, transformer.get_data, [0, 1])
    assert_raises(ValueError, FilterSources, stream, ['error', 'targets'])
Example #19
    def get_stream(self, part, batches=True, shuffle=True,
                   add_sources=()):
        dataset = self.get_dataset(part, add_sources=add_sources)
        stream = (DataStream(dataset,
                             iteration_scheme=ShuffledExampleScheme(dataset.num_examples))
                  if shuffle
                  else dataset.get_example_stream())

        stream = FilterSources(stream, (self.recordings_source,
                                        self.labels_source)+tuple(add_sources))
        if self.add_eos:
            if self.prepend_eos:
                stream = Mapping(stream, _AddEosLabelBeginEnd(self.eos_label))
            else:
                stream = Mapping(stream, _AddEosLabelEnd(self.eos_label))
        if self.preprocess_text:
            stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
        stream = Filter(stream, self.length_filter)
        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            stream = Mapping(stream, SortMapping(_length))
            stream = Unpack(stream)

        if self.preprocess_features == 'log_spectrogram':
            stream = Mapping(
                stream, functools.partial(apply_preprocessing,
                                          log_spectrogram))
        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        if not batches:
            return stream

        stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        return stream
Example #20
File: fuel_utils.py Project: gunkisu/asr
def create_ivector_datastream(path,
                              which_set,
                              batch_size=1,
                              delay=0,
                              min_after_cache=1024,
                              length_sort=False):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                     examples=wsj_dataset.num_examples)

    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)

    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'ivectors', 'targets'])

    if length_sort:
        fs = LengthSortTransformer(fs, batch_size, min_after_cache)

    if delay:
        fs = DelayTransformer(fs, delay)
    return Padding(fs)
Example #21
def test_filter_sources():
    stream = DataStream(
        IndexableDataset(OrderedDict([("features", numpy.ones((4, 2, 2))), ("targets", numpy.array([0, 1, 0, 1]))])),
        iteration_scheme=SequentialScheme(4, 2),
    )

    transformer = FilterSources(stream, sources=("features",))

    assert_equal(transformer.sources, ("features",))
    assert len(next(transformer.get_epoch_iterator())) == 1

    transformer = FilterSources(stream, sources=("features", "targets"))

    assert_equal(transformer.sources, ("features", "targets"))
    assert len(next(transformer.get_epoch_iterator())) == 2

    transformer = FilterSources(stream, sources=("targets", "features"))

    assert_equal(transformer.sources, ("features", "targets"))
    assert len(next(transformer.get_epoch_iterator())) == 2

    assert_raises(ValueError, transformer.get_data, [0, 1])
    assert_raises(ValueError, FilterSources, stream, ["error", "targets"])
Example #22
for batch in data_stream.get_epoch_iterator():
    for element in batch[0]:
        all_data = numpy.hstack([all_data, element])
mean_data = all_data.mean()
std_data = all_data.std()

data_stream = ScaleAndShift(data_stream,
                            scale=1 / std_data,
                            shift=-mean_data / std_data)
data_stream = Mapping(data_stream,
                      _downsample_and_upsample,
                      add_sources=('upsampled', ))
data_stream = Mapping(data_stream, _equalize_size)
data_stream = Mapping(data_stream, _get_residual, add_sources=('residual', ))
data_stream = FilterSources(data_stream, sources=(
    'upsampled',
    'residual',
))
data_stream = Mapping(data_stream, _segment_axis)
data_stream = Padding(data_stream)
data_stream = FilterSources(data_stream,
                            sources=('upsampled', 'residual', 'residual_mask'))
data_stream = Mapping(data_stream, _transpose)
data_stream = ForceFloatX(data_stream)

#################
# Model
#################

activations_x = [Rectifier()] * depth_x

dims_x = [frame_size] + [hidden_size_mlp_x]*(depth_x-1) + \
Example #23
def parrot_stream(voice,
                  use_speaker=False,
                  which_sets=('train', ),
                  batch_size=32,
                  seq_size=50,
                  num_examples=None,
                  sorting_mult=4,
                  noise_level=None,
                  labels_type='full_labels',
                  check_ratio=False,
                  raw_data=True,
                  q_type='mu-law',
                  q_level=256):

    assert labels_type in [
        'full_labels', 'phonemes', 'unconditional', 'unaligned_phonemes',
        'text'
    ]

    dataset = VoiceData(voice=voice, which_sets=which_sets)

    sorting_size = batch_size * sorting_mult

    if not num_examples:
        num_examples = dataset.num_examples

    if 'train' in which_sets:
        scheme = ShuffledExampleScheme(num_examples)
    else:
        scheme = SequentialExampleScheme(num_examples)

    data_stream = DataStream.default_stream(dataset, iteration_scheme=scheme)

    if check_ratio and labels_type in ['unaligned_phonemes', 'text']:
        idx = data_stream.sources.index(labels_type)
        min_val = 8 if labels_type == 'text' else 12.
        max_val = 16 if labels_type == 'text' else 25.
        data_stream = Filter(
            data_stream, lambda x: _check_ratio(x, 0, idx, min_val, max_val))

    segment_sources = ('features', 'features_mask')
    all_sources = segment_sources

    if raw_data:
        raw_sources = ('raw_audio', )
        all_sources += raw_sources
    else:
        raw_sources = ()

    if labels_type != 'unconditional':
        all_sources += ('labels', )
        data_stream = Rename(data_stream, {labels_type: 'labels'})

    if labels_type in ['full_labels', 'phonemes']:
        segment_sources += ('labels', )

    elif labels_type in ['unaligned_phonemes', 'text']:
        all_sources += ('labels_mask', )

    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(sorting_size))
    data_stream = Mapping(data_stream, SortMapping(_length))
    data_stream = Unpack(data_stream)
    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(batch_size))

    data_stream = Filter(data_stream,
                         lambda x: _check_batch_size(x, batch_size))

    data_stream = Padding(data_stream)

    if use_speaker:
        data_stream = FilterSources(data_stream,
                                    all_sources + ('speaker_index', ))
    else:
        data_stream = FilterSources(data_stream, all_sources)

    data_stream = SourceMapping(data_stream,
                                _transpose,
                                which_sources=segment_sources)

    # The conditional is not necessary, but I'm still adding it for clarity.
    if raw_data:
        data_stream = SourceMapping(data_stream,
                                    _chunk,
                                    which_sources=raw_sources)

        raw_transformer = get_raw_transformer(q_type, q_level)
        data_stream = SourceMapping(data_stream,
                                    raw_transformer,
                                    which_sources=raw_sources)

    data_stream = SegmentSequence(data_stream,
                                  seq_size=seq_size + 1,
                                  share_value=1,
                                  return_last=False,
                                  add_flag=True,
                                  which_sources=segment_sources + raw_sources)

    if noise_level is not None:
        data_stream = AddConstantSource(data_stream, noise_level,
                                        'feedback_noise_level')

    return data_stream
Example #24
    import ipdb
    from scribe.datasets.handwriting import Handwriting
    from fuel.transformers import Mapping, Padding, FilterSources, ForceFloatX
    from fuel.schemes import SequentialScheme
    from fuel.streams import DataStream

    def _transpose(data):
        return tuple(array.swapaxes(0,1) for array in data)

    batch_size = 10

    dataset = Handwriting(('train',))
    data_stream = DataStream.default_stream(
            dataset, iteration_scheme=SequentialScheme(
            50, batch_size))
    data_stream = FilterSources(data_stream, 
                          sources = ('features',))
    data_stream = Padding(data_stream)
    data_stream = Mapping(data_stream, _transpose)

    epoch = data_stream.get_epoch_iterator()
    for batch in epoch:
        print batch[0].shape

    print "Segmented:"
    data_stream = SegmentSequence(data_stream, add_flag = True)

    epoch = data_stream.get_epoch_iterator()
    for batch in epoch:
        print batch[0].shape, batch[2]

    #ipdb.set_trace()
Example #25
    import ipdb
    from scribe.datasets.handwriting import Handwriting
    from fuel.transformers import Mapping, Padding, FilterSources, ForceFloatX
    from fuel.schemes import SequentialScheme
    from fuel.streams import DataStream

    def _transpose(data):
        return tuple(array.swapaxes(0,1) for array in data)

    batch_size = 10

    dataset = Handwriting(('train',))
    data_stream = DataStream.default_stream(
            dataset, iteration_scheme=SequentialScheme(
            50, batch_size))
    data_stream = FilterSources(data_stream, 
                          sources = ('features',))
    data_stream = Padding(data_stream)
    data_stream = Mapping(data_stream, _transpose)

    epoch = data_stream.get_epoch_iterator()
    for batch in epoch:
        print batch[0].shape

    print "Segmented:"
    data_stream = SegmentSequence(data_stream, add_flag = True)

    epoch = data_stream.get_epoch_iterator()
    for batch in epoch:
        print batch[0].shape, batch[2]

    #ipdb.set_trace()
Example #26
    return tuple(array.swapaxes(0, 1) for array in data)


data_dir = os.environ['FUEL_DATA_PATH']
data_dir = os.path.join(data_dir, 'blizzard/', 'sp_standardize.npz')

data_stats = numpy.load(data_dir)
sp_mean = data_stats['sp_mean']
sp_std = data_stats['sp_std']

dataset = Blizzard(which_sets=('train', ), filename="sp_blizzard.hdf5")
data_stream = DataStream.default_stream(
    dataset,
    iteration_scheme=SequentialScheme(
        batch_size * (dataset.num_examples / batch_size), batch_size))
data_stream = FilterSources(data_stream, ('sp', ))
data_stream = ScaleAndShift(data_stream,
                            scale=1 / sp_std,
                            shift=-sp_mean / sp_std,
                            which_sources=('sp', ))
data_stream = Mapping(data_stream, _transpose)
data_stream = SegmentSequence(data_stream, seq_size, add_flag=True)
data_stream = ForceFloatX(data_stream)
train_stream = data_stream

num_valid_examples = 4 * 64
dataset = Blizzard(which_sets=('valid', ), filename="sp_blizzard.hdf5")
data_stream = DataStream.default_stream(dataset,
                                        iteration_scheme=SequentialScheme(
                                            num_valid_examples, batch_size))
data_stream = FilterSources(data_stream, ('sp', ))
Example #27
batch_size = 20
frame_size = 3
k = 20
target_size = frame_size * k

hidden_size_recurrent = 400
readout_size = 6 * k + 1

lr = 3e-4

dataset = Handwriting(('train', ))
data_stream = DataStream.default_stream(dataset,
                                        iteration_scheme=SequentialScheme(
                                            dataset.num_examples, batch_size))
data_stream = FilterSources(data_stream, sources=('features', ))
data_stream = Padding(data_stream)
data_stream = Mapping(data_stream, _transpose)
data_stream = ForceFloatX(data_stream)

dataset = Handwriting(('valid', ))
valid_stream = DataStream.default_stream(dataset,
                                         iteration_scheme=SequentialScheme(
                                             dataset.num_examples,
                                             10 * batch_size))
valid_stream = FilterSources(valid_stream, sources=('features', ))
valid_stream = Padding(valid_stream)
valid_stream = Mapping(valid_stream, _transpose)
valid_stream = ForceFloatX(valid_stream)

x_tr = next(data_stream.get_epoch_iterator())
Example #28
 def get_stream(self,
                part,
                batch_size=None,
                shuffle=False,
                max_length=None,
                raw_text=False,
                q_ids=False,
                seed=None,
                dataset=None):
     if not seed:
         seed = fuel.config.default_seed
     rng = numpy.random.RandomState(seed)
     if not dataset:
         dataset = self.get_dataset(part)
     if shuffle:
         stream = DataStream(dataset,
                             iteration_scheme=ShuffledExampleScheme(
                                 dataset.num_examples, rng=rng))
     else:
         stream = dataset.get_example_stream()
     if not q_ids:
         stream = FilterSources(
             stream,
             [source for source in dataset.sources if source != 'q_ids'])
     else:
         stream = SourcewiseMapping(stream,
                                    _str2vec,
                                    which_sources=('q_ids'))
     stream = PutTextTransfomer(stream, dataset, raw_text=True)
     # <eos> is added for two purposes: to serve a sentinel for coattention,
     # and also to ensure the answer span ends at a token
     eos = self.vocab.EOS
     stream = SourcewiseMapping(stream,
                                functools.partial(add_eos, eos),
                                which_sources=('contexts', 'questions'))
     stream = Mapping(stream,
                      functools.partial(select_random_answer, rng),
                      mapping_accepts=dict)
     if not batch_size:
         if self._retrieval:
             raise NotImplementedError()
         return stream
     if raw_text:
         stream = Mapping(stream,
                          keep_text,
                          mapping_accepts=dict,
                          add_sources=('contexts_text', 'questions_text'))
     stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
     if self._retrieval:
         stream = Mapping(stream,
                          functools.partial(retrieve_and_pad_squad,
                                            self._retrieval),
                          mapping_accepts=dict,
                          add_sources=('defs', 'def_mask',
                                       'contexts_def_map',
                                       'questions_def_map'))
     stream = SourcewiseMapping(stream,
                                functools.partial(digitize, self.vocab),
                                which_sources=('contexts', 'questions'))
     stream = Padding(stream,
                      mask_sources=['contexts', 'questions'] +
                      (['contexts_text'] if raw_text else []))
     return stream
Example #29
batch_size = 20
frame_size = 3
k = 20
target_size = frame_size * k

hidden_size_recurrent = 400
readout_size =6*k+1

lr = 3e-4

dataset = Handwriting(('train',))
data_stream = DataStream.default_stream(
            dataset, iteration_scheme=SequentialScheme(
            dataset.num_examples, batch_size))
data_stream = FilterSources(data_stream, 
                          sources = ('features',))
data_stream = Padding(data_stream)
data_stream = Mapping(data_stream, _transpose)
data_stream = ForceFloatX(data_stream)

dataset = Handwriting(('valid',))
valid_stream = DataStream.default_stream(
            dataset, iteration_scheme=SequentialScheme(
            dataset.num_examples, 10*batch_size))
valid_stream = FilterSources(valid_stream, 
                          sources = ('features',))
valid_stream = Padding(valid_stream)
valid_stream = Mapping(valid_stream, _transpose)
valid_stream = ForceFloatX(valid_stream)

x_tr = next(data_stream.get_epoch_iterator())
Example #30
 def test_filters_axis_labels(self):
     transformer = FilterSources(self.stream, sources=("features",))
     assert_equal(transformer.axis_labels,
                  {'features': ('batch', 'width', 'height')})
Example #31
 def test_works_on_unsorted_sources(self):
     transformer = FilterSources(
         self.stream, sources=("targets", "features"))
     assert_equal(transformer.sources, ('features', 'targets'))
Example #32
 def test_works_on_sourcessubset(self):
     transformer = FilterSources(self.stream, sources=("features",))
     assert_equal(transformer.sources, ('features',))
     assert_equal(list(transformer.get_epoch_iterator()),
                  [(numpy.ones((2, 2, 2)),), (numpy.ones((2, 2, 2)),)])
Example #33
 def test_works_on_sourcessubset(self):
     transformer = FilterSources(self.stream, sources=("features",))
     assert_equal(transformer.sources, ('features',))
     assert_equal(list(transformer.get_epoch_iterator()),
                  [(numpy.ones((2, 2, 2)),), (numpy.ones((2, 2, 2)),)])