def setup_datastream(path, batch_size, sort_batch_count, valid=False):
    # Raw input features, phoneme labels, and the sequence-to-phoneme index.
    A = numpy.load(os.path.join(
        path, 'valid_x_raw.npy' if valid else 'train_x_raw.npy'))
    B = numpy.load(os.path.join(
        path, 'valid_phn.npy' if valid else 'train_phn.npy'))
    C = numpy.load(os.path.join(
        path, 'valid_seq_to_phn.npy' if valid else 'train_seq_to_phn.npy'))
    # Phoneme sequence for each utterance.
    D = [B[x[0]:x[1], 2] for x in C]

    ds = IndexableDataset({'input': A, 'output': D})
    stream = DataStream(ds, iteration_scheme=ShuffledExampleScheme(len(A)))

    # Read sort_batch_count batches ahead, sort by length, then re-batch so
    # that each batch contains sequences of similar length.
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('input'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size,
                                                   num_examples=len(A)))
    stream = Padding(stream, mask_sources=['input', 'output'])
    return ds, stream
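# A minimal, self-contained sketch of the shuffle -> sort-ahead -> re-batch ->
# pad pattern used above, on toy data. The toy sequences, the function name and
# the inline length key are illustrative assumptions, not from the original
# project; only core Fuel classes are used.
import numpy
from fuel.datasets import IndexableDataset
from fuel.schemes import ConstantScheme, ShuffledExampleScheme
from fuel.streams import DataStream
from fuel.transformers import Batch, Mapping, Padding, SortMapping, Unpack

def demo_balanced_batches(batch_size=2, sort_batch_count=2):
    sequences = [numpy.arange(n, dtype='int32') for n in (3, 7, 2, 5, 4, 6, 1, 8)]
    ds = IndexableDataset({'input': sequences})
    stream = DataStream(ds, iteration_scheme=ShuffledExampleScheme(len(sequences)))
    # Collect several batches' worth of examples at once...
    stream = Batch(stream, iteration_scheme=ConstantScheme(
        batch_size * sort_batch_count))
    # ...sort the chunk by sequence length (each example is a tuple with one
    # entry per source, so example[0] is the 'input' sequence)...
    stream = Mapping(stream, SortMapping(lambda example: len(example[0])))
    stream = Unpack(stream)
    # ...and re-batch, so each batch holds sequences of similar length and
    # padding wastes little space.
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream, mask_sources=['input'])
    for inputs, mask in stream.get_epoch_iterator():
        print inputs.shape, mask.shape  # zero-padded batch plus its mask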
def train(self, req_vars):
    valid = TaxiDataset(self.config.valid_set, 'valid.hdf5',
                        sources=('trip_id',))
    valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0]

    stream = TaxiDataset('train')
    if hasattr(self.config, 'use_cuts_for_training') \
            and self.config.use_cuts_for_training:
        stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme())
    else:
        stream = DataStream(stream,
                            iteration_scheme=ShuffledExampleScheme(
                                stream.num_examples))

    stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)
    stream = transformers.TaxiGenerateSplits(
        stream, max_splits=self.config.max_splits)
    stream = transformers.taxi_add_datetime(stream)
    # stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
    stream = transformers.Select(stream, tuple(req_vars))
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(self.config.batch_size))
    stream = MultiProcessing(stream)
    return stream
def train(self, req_vars):
    prefix_stream = DataStream(self.train_dataset,
                               iteration_scheme=ShuffledExampleScheme(
                                   self.train_dataset.num_examples))
    if not data.tvt:
        prefix_stream = transformers.TaxiExcludeTrips(prefix_stream,
                                                      self.valid_trips_ids)
    prefix_stream = transformers.TaxiExcludeEmptyTrips(prefix_stream)
    prefix_stream = transformers.TaxiGenerateSplits(
        prefix_stream, max_splits=self.config.max_splits)
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.taxi_add_first_last_len(
        prefix_stream, self.config.n_begin_end_pts)
    prefix_stream = Batch(prefix_stream,
                          iteration_scheme=ConstantScheme(self.config.batch_size))

    candidate_stream = self.candidate_stream(self.config.train_candidate_size)

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    stream = MultiProcessing(stream)
    return stream
def candidate_stream(self, n_candidates, sortmap=True):
    candidate_stream = DataStream(self.train_dataset,
                                  iteration_scheme=ShuffledExampleScheme(
                                      self.train_dataset.num_examples))
    if not data.tvt:
        candidate_stream = transformers.TaxiExcludeTrips(
            candidate_stream, self.valid_trips_ids)
    candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
    candidate_stream = transformers.taxi_add_datetime(candidate_stream)
    if not data.tvt:
        candidate_stream = transformers.add_destination(candidate_stream)

    if sortmap:
        candidate_stream = transformers.balanced_batch(
            candidate_stream,
            key='latitude',
            batch_size=n_candidates,
            batch_sort_size=self.config.batch_sort_size)
    else:
        candidate_stream = Batch(candidate_stream,
                                 iteration_scheme=ConstantScheme(n_candidates))

    candidate_stream = Padding(candidate_stream,
                               mask_sources=['latitude', 'longitude'])
    return candidate_stream
def _construct_shuffled_stream(self, dataset, for_type='train'):
    '''Construct a shuffled stream from an IndexableDataset object.

    Subclasses should add transformations on the stream, e.g.:
    1. Sort samples by size
    2. Batch the dataset
    3. Add masks on samples

    :param dataset: fuel.IndexableDataset
        This is constructed by the self._construct_dataset method.
    :return: fuel.stream.DataStream
        A shuffled stream with basic transformations, built with
        the ShuffledExampleScheme iteration scheme.
    '''
    it = ShuffledExampleScheme(dataset.num_examples)
    stream = DataStream(dataset, iteration_scheme=it)
    # Sort samples by size and pack samples of similar size into a batch:
    # stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size * self.sort_batch_count))
    # comparison = _balanced_batch_helper(stream.sources.index(self.compare_source))
    # stream = Mapping(stream, SortMapping(comparison))
    # stream = Unpack(stream)
    # stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
    # # Add masks on inputs
    # for source in self.need_mask_sources.iteritems():
    #     stream = Padding(stream, mask_sources=[source[0]], mask_dtype=source[1])
    return stream
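# A hedged sketch of the subclass hook the docstring above describes: it simply
# enables the commented-out transformations. The base class name
# StreamBuilder is hypothetical, and the attributes (batch_size,
# sort_batch_count, compare_source, need_mask_sources) and the
# _balanced_batch_helper helper are assumed from the commented code and the
# other snippets on this page, not from a known API.
class SortedStreamBuilder(StreamBuilder):
    def _construct_shuffled_stream(self, dataset, for_type='train'):
        stream = super(SortedStreamBuilder, self)._construct_shuffled_stream(
            dataset, for_type)
        # Sort samples by size and pack samples of similar size into a batch.
        stream = Batch(stream, iteration_scheme=ConstantScheme(
            self.batch_size * self.sort_batch_count))
        comparison = _balanced_batch_helper(
            stream.sources.index(self.compare_source))
        stream = Mapping(stream, SortMapping(comparison))
        stream = Unpack(stream)
        stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
        # Add masks on the requested sources.
        for source, dtype in self.need_mask_sources.iteritems():
            stream = Padding(stream, mask_sources=[source], mask_dtype=dtype)
        return stream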
def setup_squad_ranker_datastream(path, vocab_file, config,
                                  example_count=1836975):
    ds = SQuADRankerDataset(path, vocab_file)
    it = ShuffledExampleScheme(examples=example_count)
    stream = DataStream(ds, iteration_scheme=it)

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('question'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['question', 'answer', 'better', 'worse',
                                   'b_left', 'b_right', 'w_left', 'w_right'],
                     mask_dtype='int32')
    return ds, stream
def train(self, req_vars):
    prefix_stream = DataStream(self.train_dataset,
                               iteration_scheme=ShuffledExampleScheme(
                                   self.train_dataset.num_examples))
    if not data.tvt:
        prefix_stream = transformers.TaxiExcludeTrips(prefix_stream,
                                                      self.valid_trips_ids)
    prefix_stream = transformers.TaxiExcludeEmptyTrips(prefix_stream)
    prefix_stream = transformers.TaxiGenerateSplits(
        prefix_stream, max_splits=self.config.max_splits)
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.balanced_batch(
        prefix_stream,
        key='latitude',
        batch_size=self.config.batch_size,
        batch_sort_size=self.config.batch_sort_size)
    prefix_stream = Padding(prefix_stream,
                            mask_sources=['latitude', 'longitude'])

    candidate_stream = self.candidate_stream(self.config.train_candidate_size)

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    # stream = MultiProcessing(stream)
    return stream
def get_stream(self, part, batch_size, seed=None, raw_text=False):
    d = self.get_dataset(part)
    print("Dataset with {} examples".format(d.num_examples))
    it = ShuffledExampleScheme(d.num_examples,
                               rng=numpy.random.RandomState(seed))
    stream = DataStream(d, iteration_scheme=it)
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    if self._retrieval:
        stream = FixedMapping(
            stream,
            functools.partial(retrieve_and_pad_snli, self._retrieval),
            add_sources=("defs", "def_mask", "sentence1_def_map",
                         "sentence2_def_map"))
        # FixedMapping works around a bug in Fuel: a tuple and a list
        # cannot be concatenated.

    if not raw_text:
        stream = SourcewiseMapping(stream,
                                   functools.partial(digitize, self.vocab),
                                   which_sources=('sentence1', 'sentence2'))

    # Padding doubles the number of outputs (each source gains a mask).
    stream = Padding(stream, mask_sources=('sentence1', 'sentence2'))
    return stream
def train(self, req_vars):
    valid = TaxiDataset(self.config.valid_set, 'valid.hdf5',
                        sources=('trip_id',))
    valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0]

    stream = TaxiDataset('train')
    stream = DataStream(stream,
                        iteration_scheme=ShuffledExampleScheme(
                            stream.num_examples))
    stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)
    stream = transformers.TaxiExcludeEmptyTrips(stream)
    stream = transformers.taxi_add_datetime(stream)
    stream = transformers.add_destination(stream)
    stream = transformers.Select(
        stream, tuple(v for v in req_vars if not v.endswith('_mask')))
    stream = transformers.balanced_batch(
        stream,
        key='latitude',
        batch_size=self.config.batch_size,
        batch_sort_size=self.config.batch_sort_size)
    stream = Padding(stream, mask_sources=['latitude', 'longitude'])
    stream = transformers.Select(stream, req_vars)
    return stream
def _get_shuffled_text_stream(src_data, trg_data,
                              src_vocab_size=30000, trg_vocab_size=30000,
                              src_sparse_feat_map='', trg_sparse_feat_map='',
                              **kwargs):
    """Creates a parallel data stream using ``ParallelTextFile``. This
    dataset implementation allows random access, so we return a shuffled
    data stream using the ``ShuffledExampleScheme`` iteration scheme. The
    arguments to this method are given by the configuration dict.
    """
    parallel_dataset = ParallelTextFile(src_data, trg_data,
                                        src_vocab_size, trg_vocab_size,
                                        src_sparse_feat_map=src_sparse_feat_map,
                                        trg_sparse_feat_map=trg_sparse_feat_map)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    return DataStream(parallel_dataset, iteration_scheme=iter_scheme)
def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
               num_examples=None, rng=None, seed=None):
    dataset = self.get_dataset(part, add_sources=add_sources)
    if num_examples is None:
        num_examples = dataset.num_examples

    if shuffle:
        iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
    else:
        iteration_scheme = SequentialExampleScheme(num_examples)

    stream = DataStream(dataset, iteration_scheme=iteration_scheme)

    stream = FilterSources(stream, (self.recordings_source,
                                    self.labels_source) + tuple(add_sources))
    if self.add_eos:
        stream = Mapping(stream, _AddLabel(self.eos_label))
    if self.add_bos:
        stream = Mapping(stream, _AddLabel(self.bos_label, append=False,
                                           times=self.add_bos))
    if self.preprocess_text:
        stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
    stream = Filter(stream, self.length_filter)

    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)

    if self.preprocess_features == 'log_spectrogram':
        stream = Mapping(
            stream, functools.partial(apply_preprocessing, log_spectrogram))
    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    if not batches:
        return stream

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       self.batch_size if part == 'train'
                       else self.validation_batch_size))
    stream = Padding(stream)
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    return stream
def get_sgnmt_shuffled_tr_stream(src_data, trg_data,
                                 src_vocab_size=30000, trg_vocab_size=30000,
                                 unk_id=1, seq_len=50, batch_size=80,
                                 sort_k_batches=12, **kwargs):
    """Prepares the shuffled training data stream. This is similar to
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling.
    """
    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data,
                                        src_vocab, trg_vocab, None)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out-of-vocabulary tokens with the unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                      trg_vocab_size=trg_vocab_size,
                                      unk_id=utils.UNK_ID))

    # Build a batched version of the stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with the specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
def train(self, req_vars):
    stream = TaxiDataset('train', data.traintest_ds)

    if hasattr(self.config, 'use_cuts_for_training') \
            and self.config.use_cuts_for_training:
        stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme())
    else:
        stream = DataStream(stream,
                            iteration_scheme=ShuffledExampleScheme(
                                stream.num_examples))

    if not data.tvt:
        valid = TaxiDataset(data.valid_set, data.valid_ds,
                            sources=('trip_id',))
        valid_trips_ids = valid.get_data(None,
                                         slice(0, valid.num_examples))[0]
        stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)

    if hasattr(self.config, 'max_splits'):
        stream = transformers.TaxiGenerateSplits(
            stream, max_splits=self.config.max_splits)
    elif not data.tvt:
        stream = transformers.add_destination(stream)

    if hasattr(self.config, 'train_max_len'):
        idx = stream.sources.index('latitude')

        def max_len_filter(x):
            return len(x[idx]) <= self.config.train_max_len

        stream = Filter(stream, max_len_filter)

    stream = transformers.TaxiExcludeEmptyTrips(stream)
    stream = transformers.taxi_add_datetime(stream)
    stream = transformers.Select(
        stream, tuple(v for v in req_vars if not v.endswith('_mask')))

    stream = transformers.balanced_batch(
        stream,
        key='latitude',
        batch_size=self.config.batch_size,
        batch_sort_size=self.config.batch_sort_size)
    stream = Padding(stream, mask_sources=['latitude', 'longitude'])
    stream = transformers.Select(stream, req_vars)
    stream = MultiProcessing(stream)
    return stream
def setup_sorter_datastream(path, config):
    ds = SorterDataset(path)
    it = ShuffledExampleScheme(examples=config.example_count)
    stream = DataStream(ds, iteration_scheme=it)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('unsorted'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['answer', 'unsorted'],
                     mask_dtype='int32')
    return ds, stream
def candidate_stream(self, n_candidates):
    candidate_stream = DataStream(self.train_dataset,
                                  iteration_scheme=ShuffledExampleScheme(
                                      self.train_dataset.num_examples))
    if not data.tvt:
        candidate_stream = transformers.TaxiExcludeTrips(
            candidate_stream, self.valid_trips_ids)
    candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
    candidate_stream = transformers.taxi_add_datetime(candidate_stream)
    candidate_stream = transformers.taxi_add_first_last_len(
        candidate_stream, self.config.n_begin_end_pts)
    if not data.tvt:
        candidate_stream = transformers.add_destination(candidate_stream)

    return Batch(candidate_stream,
                 iteration_scheme=ConstantScheme(n_candidates))
def stream_handwriting(which_sets, batch_size, seq_size, num_letters,
                       sorting_mult=20):
    assert sorting_mult > 0

    dataset = Handwriting(which_sets)
    sorting_size = batch_size * sorting_mult
    num_examples = sorting_size * (dataset.num_examples / sorting_size)

    if which_sets == ('train',):
        print "Random order."
        scheme = ShuffledExampleScheme(num_examples)
    else:
        print "Sequential order."
        scheme = SequentialExampleScheme(num_examples)

    data_stream = DataStream.default_stream(dataset, iteration_scheme=scheme)

    # Sort by length of the data sequence.
    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(sorting_size))
    data_stream = Mapping(data_stream, SortMapping(_length))
    data_stream = Unpack(data_stream)
    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(batch_size))
    data_stream = Padding(data_stream)
    data_stream = SourceMapping(
        data_stream, _transpose, which_sources=('features', 'features_mask'))
    data_stream = SegmentSequence(
        data_stream,
        seq_size=seq_size + 1,
        share_value=True,
        return_last=True,
        which_sources=('features', 'features_mask'),
        add_flag=True)

    return data_stream
def train(self, req_vars):
    valid = TaxiDataset(self.config.valid_set, 'valid.hdf5',
                        sources=('trip_id',))
    valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0]

    dataset = TaxiDataset('train')

    prefix_stream = DataStream(dataset,
                               iteration_scheme=TaxiTimeCutScheme(
                                   self.config.num_cuts))
    prefix_stream = transformers.TaxiExcludeTrips(prefix_stream,
                                                  valid_trips_ids)
    prefix_stream = transformers.TaxiGenerateSplits(
        prefix_stream, max_splits=self.config.max_splits)
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.taxi_add_first_last_len(
        prefix_stream, self.config.n_begin_end_pts)
    prefix_stream = Batch(prefix_stream,
                          iteration_scheme=ConstantScheme(self.config.batch_size))

    candidate_stream = DataStream(dataset,
                                  iteration_scheme=ShuffledExampleScheme(
                                      dataset.num_examples))
    candidate_stream = transformers.TaxiExcludeTrips(candidate_stream,
                                                     valid_trips_ids)
    candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
    candidate_stream = transformers.taxi_add_datetime(candidate_stream)
    candidate_stream = transformers.taxi_add_first_last_len(
        candidate_stream, self.config.n_begin_end_pts)
    candidate_stream = Batch(candidate_stream,
                             iteration_scheme=ConstantScheme(
                                 self.config.train_candidate_size))

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    stream = MultiProcessing(stream)
    return stream
def get_stream(self, part, batches=True, shuffle=True, add_sources=()):
    dataset = self.get_dataset(part, add_sources=add_sources)
    stream = (DataStream(dataset,
                         iteration_scheme=ShuffledExampleScheme(
                             dataset.num_examples))
              if shuffle
              else dataset.get_example_stream())

    stream = FilterSources(stream, (self.recordings_source,
                                    self.labels_source) + tuple(add_sources))
    if self.add_eos:
        if self.prepend_eos:
            stream = Mapping(stream, _AddEosLabelBeginEnd(self.eos_label))
        else:
            stream = Mapping(stream, _AddEosLabelEnd(self.eos_label))
    if self.preprocess_text:
        stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
    stream = Filter(stream, self.length_filter)
    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)
    if self.preprocess_features == 'log_spectrogram':
        stream = Mapping(
            stream, functools.partial(apply_preprocessing, log_spectrogram))
    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    if not batches:
        return stream

    stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
    stream = Padding(stream)
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    return stream
def valid(self, req_vars):
    valid_dataset = TaxiDataset(self.config.valid_set, 'valid.hdf5')
    train_dataset = TaxiDataset('train')
    valid_trips_ids = valid_dataset.get_data(
        None, slice(0, valid_dataset.num_examples))[
            valid_dataset.sources.index('trip_id')]

    prefix_stream = DataStream(valid_dataset,
                               iteration_scheme=SequentialExampleScheme(
                                   valid_dataset.num_examples))
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.taxi_add_first_last_len(
        prefix_stream, self.config.n_begin_end_pts)
    prefix_stream = Batch(prefix_stream,
                          iteration_scheme=ConstantScheme(self.config.batch_size))

    candidate_stream = DataStream(train_dataset,
                                  iteration_scheme=ShuffledExampleScheme(
                                      train_dataset.num_examples))
    candidate_stream = transformers.TaxiExcludeTrips(candidate_stream,
                                                     valid_trips_ids)
    candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
    candidate_stream = transformers.taxi_add_datetime(candidate_stream)
    candidate_stream = transformers.taxi_add_first_last_len(
        candidate_stream, self.config.n_begin_end_pts)
    candidate_stream = Batch(candidate_stream,
                             iteration_scheme=ConstantScheme(
                                 self.config.valid_candidate_size))

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    stream = MultiProcessing(stream)
    return stream
def test_shuffled_example_scheme_requests_examples():
    assert ShuffledExampleScheme(3).requests_examples
def test_shuffled_example_scheme_no_rng():
    scheme = ShuffledExampleScheme(7)
    assert scheme.rng is not None
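# A minimal sketch demonstrating the behaviour the two tests above rely on:
# the scheme requests individual examples, visits every index exactly once,
# and is reproducible when given a seeded RNG (the seed value 1234 is
# illustrative).
import numpy
from fuel.schemes import ShuffledExampleScheme

scheme_a = ShuffledExampleScheme(7, rng=numpy.random.RandomState(1234))
scheme_b = ShuffledExampleScheme(7, rng=numpy.random.RandomState(1234))
order_a = list(scheme_a.get_request_iterator())
order_b = list(scheme_b.get_request_iterator())
assert sorted(order_a) == range(7)  # a permutation of all indices
assert order_a == order_b           # same seed, same order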
def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
               num_examples=None, rng=None, seed=None):
    dataset = self.get_dataset(part, add_sources=add_sources)
    iteration_scheme = None
    if self.use_iteration_scheme:
        if num_examples is None:
            num_examples = dataset.num_examples
        if shuffle:
            iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
        else:
            iteration_scheme = SequentialExampleScheme(num_examples)
    stream = DataStream(dataset, iteration_scheme=iteration_scheme)

    # Transformations before rearrangement
    labels_source = self.sources_map['labels']
    if self.add_eos:
        stream = _AddLabel(stream, self.eos_label,
                           which_sources=[labels_source])
    if self.add_bos:
        if self.bos_label is None:
            raise Exception('No bos label given')
        stream = _AddLabel(stream, self.bos_label, append=False,
                           times=self.add_bos, which_sources=[labels_source])
    if self.clip_length:
        stream = _Clip(stream, self.clip_length,
                       force_eos=self.eos_label
                       if self.force_eos_when_clipping else None,
                       which_sources=[labels_source])

    # More efficient packing of examples in batches
    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        stream = Mapping(stream, SortMapping(_Length(index=0)))
        stream = Unpack(stream)

    stream = Rearrange(
        stream,
        dict_subset(self.sources_map,
                    self.default_sources + list(add_sources)))

    # Transformations after rearrangement
    if self.corrupt_sources:
        # Can only corrupt sources with the same alphabet as labels
        for source, prob in zip(self.corrupt_sources['names'],
                                self.corrupt_sources['probs']):
            stream = _Corrupt(stream, prob, self.token_map(source),
                              self.eos_label, which_sources=[source])

    if self.max_length and part == 'train':
        # Filtering by maximum length is only done for the training set.
        self.length_filter = _LengthFilter(
            indices=[i for i, source in enumerate(stream.sources)
                     if source in self.filter_by],
            max_length=self.max_length)
        stream = Filter(stream, self.length_filter)

    stream = ForceFloatX(stream)

    if not batches:
        return stream

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       self.batch_size if part == 'train'
                       else self.validation_batch_size))
    stream = Padding(stream)
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    return stream
def get_stream(self, part, batch_size=None, shuffle=False, max_length=None,
               raw_text=False, q_ids=False, seed=None, dataset=None):
    if not seed:
        seed = fuel.config.default_seed
    rng = numpy.random.RandomState(seed)

    if not dataset:
        dataset = self.get_dataset(part)
    if shuffle:
        stream = DataStream(dataset,
                            iteration_scheme=ShuffledExampleScheme(
                                dataset.num_examples, rng=rng))
    else:
        stream = dataset.get_example_stream()

    if not q_ids:
        stream = FilterSources(stream, [source for source in dataset.sources
                                        if source != 'q_ids'])
    else:
        stream = SourcewiseMapping(stream, _str2vec,
                                   which_sources=('q_ids',))

    stream = PutTextTransfomer(stream, dataset, raw_text=True)

    # <eos> is added for two purposes: to serve as a sentinel for
    # coattention, and to ensure the answer span ends at a token.
    eos = self.vocab.EOS
    stream = SourcewiseMapping(stream, functools.partial(add_eos, eos),
                               which_sources=('contexts', 'questions'))

    stream = Mapping(stream, functools.partial(select_random_answer, rng),
                     mapping_accepts=dict)

    if not batch_size:
        if self._retrieval:
            raise NotImplementedError()
        return stream

    if raw_text:
        stream = Mapping(stream, keep_text, mapping_accepts=dict,
                         add_sources=('contexts_text', 'questions_text'))

    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    if self._retrieval:
        stream = Mapping(stream,
                         functools.partial(retrieve_and_pad_squad,
                                           self._retrieval),
                         mapping_accepts=dict,
                         add_sources=('defs', 'def_mask',
                                      'contexts_def_map',
                                      'questions_def_map'))

    stream = SourcewiseMapping(stream,
                               functools.partial(digitize, self.vocab),
                               which_sources=('contexts', 'questions'))
    stream = Padding(stream,
                     mask_sources=['contexts', 'questions'] +
                                  (['contexts_text'] if raw_text else []))
    return stream
def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
               num_examples=None, rng=None, seed=None):
    dataset = self.get_dataset(part, add_sources=add_sources)
    if num_examples is None:
        num_examples = dataset.num_examples

    if shuffle:
        iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
    else:
        iteration_scheme = SequentialExampleScheme(num_examples)

    stream = DataStream(dataset, iteration_scheme=iteration_scheme)

    if self.add_eos:
        stream = Mapping(stream, _AddLabel(
            self.eos_label,
            index=stream.sources.index(self.sources_map['labels'])))
    if self.add_bos:
        if self.bos_label is None:
            raise Exception('No bos label given')
        stream = Mapping(stream, _AddLabel(
            self.bos_label, append=False, times=self.add_bos,
            index=stream.sources.index(self.sources_map['labels'])))

    if self.max_length:
        stream = Filter(stream, self.length_filter)

    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        # Hardcode 0 for the source on which to sort. This is reasonable,
        # as most source lengths are correlated and, furthermore, the
        # labels will typically be the last source; thus, in a
        # single-input case this sorts on input lengths.
        stream = Mapping(stream, SortMapping(_Length(index=0)))
        stream = Unpack(stream)

    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    stream = Rename(stream,
                    names=dict_subset({v: k for (k, v)
                                       in self.sources_map.items()},
                                      stream.sources, must_have=False))
    if not batches:
        return stream

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       self.batch_size if part == 'train'
                       else self.validation_batch_size))
    stream = Padding(stream)
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    stream._produces_examples = False
    return stream
def main(dataset_path, use_c, log_min, log_max, num_steps):
    train_set = H5PYDataset(dataset_path,
                            which_sets=('train',),
                            sources=('features', 'targets'),
                            subset=slice(0, 63257),
                            load_in_memory=True)
    train_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledExampleScheme(train_set.num_examples))

    def get_class_balanced_batch(iterator):
        # Draw shuffled examples until there are 100 for each of the 10
        # classes, i.e. a balanced batch of 1000 examples.
        train_features = [[] for _ in range(10)]
        train_targets = [[] for _ in range(10)]
        batch_size = 0
        while batch_size < 1000:
            f, t = next(iterator)
            t = t[0]
            if len(train_features[t]) < 100:
                train_features[t].append(f)
                train_targets[t].append(t)
                batch_size += 1
        train_features = numpy.vstack(sum(train_features, []))
        train_targets = numpy.vstack(sum(train_targets, []))
        return train_features, train_targets

    train_features, train_targets = get_class_balanced_batch(
        train_stream.get_epoch_iterator())

    valid_set = H5PYDataset(dataset_path,
                            which_sets=('train',),
                            sources=('features', 'targets'),
                            subset=slice(63257, 73257),
                            load_in_memory=True)
    valid_features, valid_targets = valid_set.data_sources

    test_set = H5PYDataset(dataset_path,
                           which_sets=('test',),
                           sources=('features', 'targets'),
                           load_in_memory=True)
    test_features, test_targets = test_set.data_sources

    if use_c is None:
        # Grid-search C on a log scale, keeping the value with the lowest
        # validation error rate.
        best_error_rate = 1.0
        best_C = None
        for log_C in numpy.linspace(log_min, log_max, num_steps):
            C = numpy.exp(log_C)
            svm = LinearSVC(C=C)
            svm.fit(train_features, train_targets.ravel())
            error_rate = 1 - numpy.mean(
                [svm.score(valid_features[1000 * i: 1000 * (i + 1)],
                           valid_targets[1000 * i: 1000 * (i + 1)].ravel())
                 for i in range(10)])
            if error_rate < best_error_rate:
                best_error_rate = error_rate
                best_C = C
            print('C = {}, validation error rate = {} '.format(C, error_rate) +
                  '(best is {}, {})'.format(best_C, best_error_rate))
    else:
        best_C = use_c

    error_rates = []
    for _ in range(10):
        train_features, train_targets = get_class_balanced_batch(
            train_stream.get_epoch_iterator())
        svm = LinearSVC(C=best_C)
        svm.fit(train_features, train_targets.ravel())
        error_rates.append(1 - numpy.mean(
            [svm.score(valid_features[1000 * i: 1000 * (i + 1)],
                       valid_targets[1000 * i: 1000 * (i + 1)].ravel())
             for i in range(10)]))

    print('Validation error rate = {} +- {} '.format(numpy.mean(error_rates),
                                                     numpy.std(error_rates)))

    error_rates = []
    for _ in range(100):
        train_features, train_targets = get_class_balanced_batch(
            train_stream.get_epoch_iterator())
        svm = LinearSVC(C=best_C)
        svm.fit(train_features, train_targets.ravel())
        # The test set has 26032 examples: 26 full chunks of 1000 plus a
        # final chunk of 32.
        s = 1000 * numpy.sum(
            [svm.score(test_features[1000 * i: 1000 * (i + 1)],
                       test_targets[1000 * i: 1000 * (i + 1)].ravel())
             for i in range(26)])
        s += 32 * svm.score(test_features[-32:], test_targets[-32:].ravel())
        s = s / 26032.0
        error_rates.append(1 - s)

    print('Test error rate = {} +- {} '.format(numpy.mean(error_rates),
                                               numpy.std(error_rates)))
def parrot_stream(voice, use_speaker=False, which_sets=('train',),
                  batch_size=32, seq_size=50, num_examples=None,
                  sorting_mult=4, noise_level=None, labels_type='full_labels',
                  check_ratio=False, raw_data=True, q_type='mu-law',
                  q_level=256):
    assert labels_type in ['full_labels', 'phonemes', 'unconditional',
                           'unaligned_phonemes', 'text']

    dataset = VoiceData(voice=voice, which_sets=which_sets)

    sorting_size = batch_size * sorting_mult

    if not num_examples:
        num_examples = dataset.num_examples

    if 'train' in which_sets:
        scheme = ShuffledExampleScheme(num_examples)
    else:
        scheme = SequentialExampleScheme(num_examples)

    data_stream = DataStream.default_stream(dataset, iteration_scheme=scheme)

    if check_ratio and labels_type in ['unaligned_phonemes', 'text']:
        idx = data_stream.sources.index(labels_type)
        min_val = 8 if labels_type == 'text' else 12.
        max_val = 16 if labels_type == 'text' else 25.
        data_stream = Filter(
            data_stream, lambda x: _check_ratio(x, 0, idx, min_val, max_val))

    segment_sources = ('features', 'features_mask')
    all_sources = segment_sources

    if raw_data:
        raw_sources = ('raw_audio',)
        all_sources += raw_sources
    else:
        raw_sources = ()

    if labels_type != 'unconditional':
        all_sources += ('labels',)
        data_stream = Rename(data_stream, {labels_type: 'labels'})

        if labels_type in ['full_labels', 'phonemes']:
            segment_sources += ('labels',)
        elif labels_type in ['unaligned_phonemes', 'text']:
            all_sources += ('labels_mask',)

    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(sorting_size))
    data_stream = Mapping(data_stream, SortMapping(_length))
    data_stream = Unpack(data_stream)
    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(batch_size))
    data_stream = Filter(data_stream,
                         lambda x: _check_batch_size(x, batch_size))
    data_stream = Padding(data_stream)

    if use_speaker:
        data_stream = FilterSources(data_stream,
                                    all_sources + ('speaker_index',))
    else:
        data_stream = FilterSources(data_stream, all_sources)

    data_stream = SourceMapping(data_stream, _transpose,
                                which_sources=segment_sources)

    # The conditional is not necessary, but I'm still adding it for clarity.
    if raw_data:
        data_stream = SourceMapping(data_stream, _chunk,
                                    which_sources=raw_sources)
        raw_transformer = get_raw_transformer(q_type, q_level)
        data_stream = SourceMapping(data_stream, raw_transformer,
                                    which_sources=raw_sources)

    data_stream = SegmentSequence(data_stream,
                                  seq_size=seq_size + 1,
                                  share_value=1,
                                  return_last=False,
                                  add_flag=True,
                                  which_sources=segment_sources + raw_sources)

    if noise_level is not None:
        data_stream = AddConstantSource(data_stream, noise_level,
                                        'feedback_noise_level')

    return data_stream
def get_one_stream(self, part, lang=None, batches=True, shuffle=True,
                   add_sources=(), num_examples=None, rng=None, seed=None,
                   num_result=None, soften_distributions=None,
                   only_stream=False):
    assert lang in self.langs
    dataset = self.get_dataset(part, lang, add_sources=add_sources)
    if num_examples is None:
        num_examples = dataset.num_examples

    if shuffle:
        iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
    else:
        iteration_scheme = SequentialExampleScheme(num_examples)

    if num_result is None:
        num_result = num_examples

    if lang != self.langs[0] and not only_stream:
        iteration_scheme = RandomExampleScheme(num_examples,
                                               num_result=num_result,
                                               rng=rng)

    stream = DataStream(dataset, iteration_scheme=iteration_scheme)

    if soften_distributions:
        stream = Mapping(stream, SoftenResult(self.default_sources,
                                              soften_distributions))

    for bconv in self._binary_convertable_data:
        if bconv in self.default_sources:
            stream = Mapping(stream, ConvertToMask(self.default_sources,
                                                   bconv,
                                                   self.num_features(bconv)))

    if self.add_eos:
        stream = Mapping(stream, _AddLabel(
            self.eos_label,
            index=stream.sources.index(self.sources_map['labels'])))
    if self.add_bos:
        if self.bos_label is None:
            raise Exception('No bos label given')
        stream = Mapping(stream, _AddLabel(
            self.bos_label, append=False, times=self.add_bos,
            index=stream.sources.index(self.sources_map['labels'])))

    if self.max_length:
        stream = Filter(stream, self.length_filter)

    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        # Hardcode 0 for the source on which to sort. This is reasonable,
        # as most source lengths are correlated and, furthermore, the
        # labels will typically be the last source; thus, in a
        # single-input case this sorts on input lengths.
        stream = Mapping(stream, SortMapping(_Length(index=0)))
        stream = Unpack(stream)

    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    stream = Rename(stream,
                    names=dict_subset({v: k for (k, v)
                                       in self.sources_map.items()},
                                      stream.sources, must_have=False))
    if not batches:
        return stream, num_examples

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       self.batch_size if part == 'train'
                       else self.validation_batch_size))
    stream._produces_examples = False
    return stream, num_examples
def get_stream(file, dictionary=None, add_dict=False, shuffle=False,
               batch_size=None, read_ahead=1):
    """
    Creates a stream with train/valid/test examples.

    :param file: path to a file with the dataset
    :param dictionary: string->int dict with the vocabulary from other
        datasets. If None, the dictionary is built from this dataset.
    :param add_dict: if True, new words from this dataset are added to the
        dictionary
    :param shuffle: if True, the dataset examples are randomly shuffled
    :param batch_size: number of examples per batch
    :param read_ahead: number of batches to pre-fetch and order by context
        length to speed up computation
    """
    # Dataset type (CNN/CBT/bAbI)
    data_type = self.args.dataset_type

    if not batch_size:
        batch_size = self.args.batch_size

    # Pattern for text tokenization
    pattern = re.compile(" |\t|\|")
    if data_type == 'babi':
        prepro = lambda x: nltk.word_tokenize(x)
    else:
        prepro = lambda x: pattern.split(x)

    if add_dict:
        # Add words to the dictionary
        f = codecs.open(file, 'r', encoding="utf8")
        vocabulary = get_vocabulary(f, prepro)
        code2token = map(lambda x: x[0], vocabulary.most_common())
        new_word_count = 0
        for word in code2token:
            if word not in dictionary:
                dictionary[word] = len(dictionary)
                new_word_count += 1
        print "Added {} new words from file {} to previous vocabulary.".format(
            new_word_count, file)

    if not dictionary:
        print "Computing new vocabulary for file {}.".format(file)
        # Compute the vocabulary
        f = codecs.open(file, 'r', encoding="utf8")
        vocabulary = get_vocabulary(f, prepro)
        code2token = map(lambda x: x[0], vocabulary.most_common())
        # Add special symbols (beginning/end of sentence, unknown token,
        # end of question)
        code2token.extend(['<S>', '</S>', '<UNK>', '<QUESTION_END>'])
        dictionary = compute_token2code(code2token)

    # Select the data loader appropriate for the dataset
    common_params = {
        'level': 'word',
        'bos_token': None,
        'eos_token': None,
        'append_question': self.args.query_inited_context_encoder
    }
    if data_type == 'cnn':
        dataset = CNNDataset([file], dictionary, **common_params)
    elif data_type == 'cbt':
        dataset = CBDataset([file], dictionary, **common_params)
    elif data_type == 'babi':
        dataset = bAbIDataset([file], dictionary, **common_params)

    stream = dataset.get_example_stream()

    # Load all data into memory; this way we avoid reloading the data from
    # disk in every epoch
    memory_data = [[] for _ in dataset.sources]
    for ex in stream.get_epoch_iterator():
        for source_example, data_list in zip(ex, memory_data):
            data_list.append(source_example)
    data_dict = OrderedDict(zip(dataset.sources, memory_data))
    mem_dataset = UnpickableIndexableDataset(data_dict)

    if shuffle:
        # Reshuffle the data after each epoch of training
        mem_dataset.example_iteration_scheme = ShuffledExampleScheme(
            mem_dataset.num_examples)

    stream = mem_dataset.get_example_stream()

    # Build a batched version of the stream to read read_ahead batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * read_ahead))
    if read_ahead > 1:
        # Sort all samples in the read-ahead batch
        stream = Mapping(stream, SortMapping(_length))
        # Convert it into a stream again
        stream = Unpack(stream)
        # Construct batches from the stream with the specified batch size
        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'candidates'])

    return stream, dictionary