def _get_align_stream(src_data, trg_data, src_vocab_size, trg_vocab_size,
                      seq_len, **kwargs):
    """Creates the stream which is used for the main loop.

    Args:
        src_data (string): Path to the source sentences
        trg_data (string): Path to the target sentences
        src_vocab_size (int): Size of the source vocabulary in the NMT model
        trg_vocab_size (int): Size of the target vocabulary in the NMT model
        seq_len (int): Maximum length of any source or target sentence

    Returns:
        ExplicitNext. Alignment data stream which can be iterated explicitly
    """
    # Build dummy vocabulary to make TextFile happy
    src_vocab = _add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = _add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge([src_dataset.get_example_stream(),
               trg_dataset.get_example_stream()],
              ('source', 'target'))
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))
    s = Batch(s, iteration_scheme=ConstantScheme(1))
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])
    return ExplicitNext(masked_stream)
def get_train_stream(configuration, sfiles, tfiles, svocab_dict, tvocab_dict):
    s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')
    t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')

    # Merge
    stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter -- TODO
    stream = Filter(stream,
                    predicate=_too_long(seq_len=configuration['seq_len']))

    # Map - no need

    # Batch - Sort
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       configuration['batch_size'] *
                       configuration['sort_k_batches']))
    stream = Mapping(stream, SortMapping(_length))
    stream = Unpack(stream)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       configuration['batch_size']))

    # Pad
    # Note that </s>=0. Fuel only allows padding 0 by default
    masked_stream = Padding(stream)

    return masked_stream
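# The _too_long predicate and _length sort key used above are not defined in
# this file. The following is a minimal sketch of what they conventionally
# look like in the Blocks machine_translation example (an assumption, not
# necessarily this repository's exact code): _too_long keeps only pairs whose
# sentences fit within seq_len, and _length sorts read-ahead batches by
# target-sentence length.
class _too_long(object):
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        # Keep the example only if every sequence in the tuple fits the limit
        return all([len(sentence) <= self.seq_len
                    for sentence in sentence_pair])


def _length(sentence_pair):
    # Sort key: length of the target sentence (last element of the pair)
    return len(sentence_pair[-1])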
def get_dev_stream_with_topicalq(test_set=None, src_vocab=None,
                                 src_vocab_size=30000, topical_test_set=None,
                                 topical_vocab=None, topical_vocab_size=2000,
                                 unk_id=1, **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if test_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        print test_set, type(src_vocab)
        # Note: special tokens are not ensured for the topical vocabulary
        topical_vocab = cPickle.load(open(topical_vocab, 'rb'))
        topical_dataset = TextFile([topical_test_set], topical_vocab,
                                   None, None, '10')
        dev_dataset = TextFile([test_set], src_vocab, None)
        #dev_stream = DataStream(dev_dataset)

        # Merge them to get a (source, source_topical) pair
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            topical_dataset.get_example_stream()],
                           ('source', 'source_topical'))
    return dev_stream
def get_dev_stream(val_set=None, valid_sent_dict=None, src_vocab=None,
                   trg_vocab=None, src_vocab_size=30000, trg_vocab_size=30000,
                   unk_id=1, **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set is not None and src_vocab is not None:
        # Load dictionaries and ensure special tokens exist
        src_vocab = ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_dictset = TextFile([valid_sent_dict], trg_vocab, None)
        #dev_stream = DataStream(dev_dataset)

        # Merge them to get a (source, valid_sent_trg_dict) pair
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            dev_dictset.get_example_stream()],
                           ('source', 'valid_sent_trg_dict'))
    return dev_stream
def get_devtest_stream(data_type='valid', input_file=None, **kwards):

    if data_type == 'valid':
        data_file = kwards.pop('valid_src')
        data_file_hist = kwards.pop('valid_src_hist')
    elif data_type == 'test':
        if input_file is None:
            data_file = kwards.pop('test_src')
        else:
            data_file = input_file
        # added by Longyue
        data_file_hist = kwards.pop('test_src_hist')
    else:
        logger.error('wrong data type: must be one of "valid" or "test"')

    unk_token = kwards.pop('unk_token')
    eos_token = kwards.pop('eos_token')
    vocab_src = kwards.pop('vocab_src')

    dataset = TextFile(files=[data_file],
                       dictionary=pkl.load(open(vocab_src, 'rb')),
                       level='word',
                       unk_token=unk_token,
                       bos_token=None,
                       eos_token=eos_token)
    dev_stream = DataStream(dataset)

    # added by Longyue: build one stream per history-sentence file
    hist_len = 3
    dev_stream_hist = []
    for idx in range(hist_len):
        dataset_hist = TextFile(files=[data_file_hist + str(idx)],
                                dictionary=pkl.load(open(vocab_src, 'rb')),
                                level='word',
                                unk_token=unk_token,
                                bos_token=None,
                                eos_token=eos_token)
        dev_stream_hist.append(DataStream(dataset_hist))

    # Flatten all history streams into a single list of examples ...
    dev_stream_hist_combine = []
    for d_s in dev_stream_hist:
        for item in d_s.get_epoch_iterator():
            dev_stream_hist_combine.append(item)

    # ... then regroup them so that each entry holds the hist_len history
    # sentences belonging to one source sentence
    item_len = len(dev_stream_hist_combine)
    dev_stream_hist_split = []
    for i in range(item_len / hist_len):
        tmp = []
        for j in range(hist_len):
            tmp.append(dev_stream_hist_combine[i + item_len / hist_len * j])
        dev_stream_hist_split.append(tmp)
    dev_stream_hist_split = tuple(dev_stream_hist_split)

    return dev_stream, dev_stream_hist_split
def get_tr_stream(config):
    cgs = config['cgs']
    enc_ids, dec_ids = get_enc_dec_ids(cgs)

    # Prepare source vocabs and files, make sure special tokens are there
    src_files = config['src_datas']
    src_vocabs = {k: cPickle.load(open(v))
                  for k, v in config['src_vocabs'].iteritems()}
    for k in src_vocabs.keys():
        src_vocabs[k]['<S>'] = 0
        src_vocabs[k]['</S>'] = config['src_eos_idxs'][k]
        src_vocabs[k]['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_files = config['trg_datas']
    trg_vocabs = {k: cPickle.load(open(v))
                  for k, v in config['trg_vocabs'].iteritems()}
    for k in trg_vocabs.keys():
        trg_vocabs[k]['<S>'] = 0
        trg_vocabs[k]['</S>'] = config['trg_eos_idxs'][k]
        trg_vocabs[k]['<UNK>'] = config['unk_id']

    # Create individual source streams
    src_datasets = {cg: TextFile([src_files[cg]], src_vocabs[p_(cg)[0]], None)
                    for cg in cgs}

    # Create individual target streams
    trg_datasets = {cg: TextFile([trg_files[cg]], trg_vocabs[p_(cg)[1]], None)
                    for cg in cgs}

    # Build the preprocessing pipeline for individual streams
    ind_streams = {}
    for cg in cgs:
        logger.info('Building training stream for cg:[{}]'.format(cg))
        masked_stream = get_src_trg_stream(cg, config, src_datasets,
                                           trg_datasets)
        ind_streams[cg] = masked_stream

    # Scheduler and meta-controller
    multi_enc_stream = MultiEncStream(ind_streams,
                                      schedule=config['schedule'],
                                      batch_sizes=config['batch_sizes'],
                                      transpose=True,
                                      start_after=config.get('start_after',
                                                             None))
    return multi_enc_stream
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
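# A minimal usage sketch (an assumption, not part of the original file): the
# vocabulary pickles and corpus paths below are hypothetical placeholders.
# Padding appends a mask after each source, so every batch from the masked
# stream is a (source, source_mask, target, target_mask) tuple.
def _demo_iterate_tr_stream():
    tr_stream = get_tr_stream(src_vocab='vocab.src.pkl',
                              trg_vocab='vocab.trg.pkl',
                              src_data='train.src.tok',
                              trg_data='train.trg.tok',
                              batch_size=80,
                              sort_k_batches=12)
    for source, source_mask, target, target_mask in \
            tr_stream.get_epoch_iterator():
        # source/target: (batch_size x max_len) index matrices; the masks
        # mark real tokens vs. EOS padding
        return source.shape, target.shape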
def get_sgnmt_tr_stream(src_data, trg_data, src_vocab_size=30000,
                        trg_vocab_size=30000, unk_id=1, seq_len=50,
                        batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the unshuffled training data stream. This corresponds to
    ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge([src_dataset.get_example_stream(),
               trg_dataset.get_example_stream()],
              ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                      trg_vocab_size=trg_vocab_size,
                                      unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
def get_test_stream_withContext_grdTruth(test_ctx_datas=None,
                                         test_set_source=None,
                                         test_set_target=None,
                                         src_vocab=None,
                                         src_vocab_size=30000,
                                         trg_vocab=None,
                                         trg_vocab_size=30000,
                                         batch_size=128,
                                         unk_id=1,
                                         ctx_num=3,
                                         **kwargs):
    """Setup development set stream if necessary."""
    masked_stream = None
    if test_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab, 'rb')),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
        print test_set_source, type(src_vocab)

        # Get text files for the context, source and target sentences
        ctx_datasets = []
        for i in range(ctx_num):
            ctx_datasets.append(TextFile([test_ctx_datas[i]], src_vocab, None))
        dev_dataset = TextFile([test_set_source], src_vocab, None)
        dev_target = TextFile([test_set_target], trg_vocab, None)

        # Merge them to get (context_0, ..., source, target) examples
        dev_stream = Merge(
            [i.get_example_stream() for i in ctx_datasets] +
            [dev_dataset.get_example_stream(),
             dev_target.get_example_stream()],
            tuple('context_' + str(i) for i in range(ctx_num)) +
            ('source', 'target'))

        # Replace out of vocabulary tokens with unk token
        stream = Mapping(dev_stream,
                         _oov_to_unk(ctx_num=ctx_num,
                                     src_vocab_size=src_vocab_size,
                                     trg_vocab_size=trg_vocab_size,
                                     unk_id=unk_id))

        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

        # Pad sequences that are short
        masked_stream = PaddingWithEOSContext(
            stream,
            [src_vocab_size - 1 for i in range(ctx_num + 1)] +
            [trg_vocab_size - 1])
    return masked_stream
def get_dev_stream_withContext_withPosTag(test_ctx_datas=None,
                                          test_posTag_datas=None,
                                          test_set_source=None,
                                          src_vocab=None,
                                          src_vocab_size=30000,
                                          unk_id=1,
                                          ctx_num=3,
                                          **kwargs):
    """Setup development set stream if necessary."""
    masked_stream = None
    if test_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        print test_set_source, type(src_vocab)

        # Get text files for the context sentences, their POS tags and the
        # current source sentence
        ctx_datasets = []
        posTag_datasets = []
        for i in range(ctx_num):
            ctx_datasets.append(TextFile([test_ctx_datas[i]], src_vocab, None))
            posTag_datasets.append(
                TextFile([test_posTag_datas[i]], src_vocab, None))
        posTag_datasets.append(
            TextFile([test_posTag_datas[ctx_num]], src_vocab, None))
        src_dataset = TextFile([test_set_source], src_vocab, None)

        # Merge them to get one example per source sentence
        dev_stream = Merge(
            [i.get_example_stream() for i in ctx_datasets] +
            [i.get_example_stream() for i in posTag_datasets] +
            [src_dataset.get_example_stream()],
            tuple('context_' + str(i) for i in range(ctx_num)) +
            tuple('context_posTag_' + str(i) for i in range(ctx_num)) +
            ('source_posTag', 'source'))

        # Replace out of vocabulary tokens with unk token
        stream = Mapping(dev_stream,
                         _oov_to_unk_posTag_dev(ctx_num=ctx_num,
                                                src_vocab_size=src_vocab_size,
                                                unk_id=unk_id))

        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(1))

        # Pad sequences that are short
        masked_stream = PaddingWithEOSContext(
            stream, [src_vocab_size - 1 for i in range(2 * ctx_num + 2)])

    return masked_stream
def get_dev_stream_with_grdTruth(val_set_source=None,
                                 val_set_target=None,
                                 src_vocab=None,
                                 src_vocab_size=30000,
                                 trg_vocab=None,
                                 trg_vocab_size=30000,
                                 batch_size=128,
                                 unk_id=1,
                                 seq_len=50,
                                 **kwargs):
    """Setup development set stream if necessary."""
    masked_stream = None
    if val_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab, 'rb')),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
        print val_set_source, type(src_vocab)

        dev_dataset = TextFile([val_set_source], src_vocab, None)
        trg_dataset = TextFile([val_set_target], trg_vocab, None)

        # Merge them to get a source, target pair
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            trg_dataset.get_example_stream()],
                           ('dev_source', 'dev_target'))

        # Filter sequences that are too long
        stream = Filter(dev_stream, predicate=_too_long(seq_len=seq_len))

        # Replace out of vocabulary tokens with unk token
        stream = Mapping(stream,
                         _oov_to_unk(src_vocab_size=src_vocab_size,
                                     trg_vocab_size=trg_vocab_size,
                                     unk_id=unk_id))

        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(1))

        # Pad sequences that are short
        masked_stream = PaddingWithEOS(
            stream, [src_vocab_size - 1, trg_vocab_size - 1])
    return masked_stream
def get_dev_stream_with_context_features(val_context_features=None,
                                         val_set=None, src_vocab=None,
                                         src_vocab_size=30000, unk_id=1,
                                         **kwargs):
    """Setup development set stream if necessary."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']

    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        dev_dataset = TextFile([val_set], src_vocab, None)

        # now add the source with the image features
        # create the image datastream (iterate over a file line-by-line)
        con_features = _get_np_array(val_context_features)
        con_feature_dataset = IterableDataset(con_features)
        valid_image_stream = DataStream(con_feature_dataset)

        # dev_stream = DataStream(dev_dataset)
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            valid_image_stream],
                           ('source', 'initial_context'))
        # dev_stream = dev_stream.get_example_stream()

    return dev_stream
def get_devtest_stream(data_type='valid', input_file=None, **kwards):

    if data_type == 'valid':
        data_file = kwards.pop('valid_src')
    elif data_type == 'test':
        if input_file is None:
            data_file = kwards.pop('test_src')
        else:
            data_file = input_file
    else:
        logger.error('wrong data type: must be one of "valid" or "test"')

    unk_token = kwards.pop('unk_token')
    eos_token = kwards.pop('eos_token')
    vocab_src = kwards.pop('vocab_src')

    dataset = TextFile(files=[data_file],
                       encoding='UTF-8',
                       preprocess=to_lower_case,
                       dictionary=pkl.load(open(vocab_src, 'rb')),
                       level='word',
                       unk_token=unk_token,
                       bos_token=None,
                       eos_token=eos_token)
    dev_stream = DataStream(dataset)

    return dev_stream
def get_dev_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                   unk_id=1, bos_token=None, **kwargs):
    """Setup development set stream if necessary."""
    if type(bos_token) is str:
        bos_token = bos_token.decode('utf8')
    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        dev_dataset = TextFile([val_set], src_vocab,
                               bos_token=bos_token,
                               eos_token=u'</S>',
                               unk_token=u'<UNK>',
                               encoding='utf8')
        dev_stream = DataStream(dev_dataset)
    return dev_stream
def get_log_prob_stream(cg, config):
    eid, did = p_(cg)
    dataset = config['log_prob_sets'][cg]

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocab = cPickle.load(open(config['src_vocabs'][eid]))
    src_vocab['<S>'] = 0
    src_vocab['</S>'] = config['src_eos_idxs'][eid]
    src_vocab['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocab = cPickle.load(open(config['trg_vocabs'][did]))
    trg_vocab['<S>'] = 0
    trg_vocab['</S>'] = config['trg_eos_idxs'][did]
    trg_vocab['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    logger.info('Building logprob stream for cg:[{}]'.format(cg))
    src_dataset = TextFile([dataset[0]], src_vocab, None)
    trg_dataset = TextFile([dataset[1]], trg_vocab, None)
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                                 trg_vocab_size=config['trg_vocab_sizes'][did],
                                 unk_id=config['unk_id']))

    bs = 100
    if 'log_prob_bs' in config:
        if isinstance(config['log_prob_bs'], dict):
            bs = config['log_prob_bs'][cg]
        else:
            bs = config['log_prob_bs']
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       bs, num_examples=get_num_lines(dataset[0])))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream,
        _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                       (2, 0, config['trg_eos_idxs'][did])]))
    return masked_stream
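# _remapWordIdx is not defined in this file. The following is a minimal
# sketch of what such a mapper plausibly looks like (an assumption, not this
# repository's exact code): Padding fills short sequences with 0, so the 0s
# in the padded source (batch position 0) and target (batch position 2)
# arrays are rewritten to the configured EOS indices.
class _remapWordIdx(object):
    def __init__(self, mappings):
        # mappings: list of (batch_position, old_index, new_index) triples
        self.mappings = mappings

    def __call__(self, batch):
        batch = list(batch)
        for (pos, old_idx, new_idx) in self.mappings:
            # batch[pos] is a numpy array produced by Padding
            batch[pos][batch[pos] == old_idx] = new_idx
        return tuple(batch)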
def get_dev_stream_with_prefix_file(val_set=None, val_set_grndtruth=None,
                                    val_set_prefixes=None,
                                    val_set_suffixes=None,
                                    src_vocab=None, src_vocab_size=30000,
                                    trg_vocab=None, trg_vocab_size=30000,
                                    unk_id=1, return_vocab=False, **kwargs):
    """Setup development stream with user-provided source, target,
    prefixes, and suffixes"""

    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None and \
            val_set_prefixes is not None and val_set_suffixes is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        # Note: user should have already provided the EOS token in the data
        # representation for the suffix
        # Note: The reason that we need EOS tokens in the reference file is
        # that IMT systems need to evaluate metrics which count prediction of
        # the </S> token, and evaluation scripts are called on the files
        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_prefix_dataset = TextFile([val_set_prefixes], trg_vocab,
                                      bos_token='<S>',
                                      eos_token=None,
                                      unk_token='<UNK>')
        dev_suffix_dataset = TextFile([val_set_suffixes], trg_vocab,
                                      bos_token=None,
                                      eos_token=None,
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream(),
                            dev_prefix_dataset.get_example_stream(),
                            dev_suffix_dataset.get_example_stream()],
                           ('source', 'target',
                            'target_prefix', 'target_suffix'))

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream
def get_test_stream(sfiles, svocab_dict):
    dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                       unk_token='<unk>', level='word', preprocess=None,
                       encoding='utf8')
    stream = Merge([dataset.get_example_stream()], ('source',))
    stream = Batch(stream, iteration_scheme=ConstantScheme(10))
    stream = Padding(stream)
    return stream
def get_dev_stream_with_prefixes(val_set=None, val_set_grndtruth=None,
                                 src_vocab=None, src_vocab_size=30000,
                                 trg_vocab=None, trg_vocab_size=30000,
                                 unk_id=1, return_vocab=False, **kwargs):
    """Setup development set stream if necessary."""

    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream()],
                           ('source', 'target'))

        # now add prefix and suffixes to this stream
        dev_stream = Mapping(
            dev_stream,
            PrefixSuffixStreamTransformer(
                sample_ratio=kwargs.get('dev_sample_ratio', 1.)),
            add_sources=('target_prefix', 'target_suffix'))

        dev_stream = Mapping(dev_stream,
                             CopySourceAndTargetToMatchPrefixes(dev_stream))

        # changing stream.produces_examples is a little hack which lets us
        # use Unpack to flatten
        dev_stream.produces_examples = False
        # flatten the stream back out into
        # (source, target, target_prefix, target_suffix)
        dev_stream = Unpack(dev_stream)

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream
def get_dev_tr_stream_with_topic_target(val_set_source=None,
                                        val_set_target=None,
                                        src_vocab=None, trg_vocab=None,
                                        src_vocab_size=30000,
                                        trg_vocab_size=30000,
                                        trg_topic_vocab_size=2000,
                                        source_topic_vocab_size=2000,
                                        topical_dev_set=None,
                                        topic_vocab_input=None,
                                        topic_vocab_output=None,
                                        topical_vocab_size=2000,
                                        unk_id=1, **kwargs):
    """Prepares the training data stream."""
    masked_stream = None
    if val_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab, 'rb')),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
        topic_vocab_input = cPickle.load(open(topic_vocab_input, 'rb'))
        # topic_vocab_output already has <UNK> and </S> in it
        topic_vocab_output = cPickle.load(open(topic_vocab_output, 'rb'))
        topic_binary_vocab = {}
        for k, v in topic_vocab_output.items():
            if k == '<UNK>':
                topic_binary_vocab[k] = 0
            else:
                topic_binary_vocab[k] = 1

        # Get text files from both source and target
        src_dataset = TextFile([val_set_source], src_vocab, None)
        trg_dataset = TextFile([val_set_target], trg_vocab, None)
        src_topic_input = TextFile([topical_dev_set], topic_vocab_input,
                                   None, None, 'rt')
        trg_topic_dataset = TextFile([val_set_target], topic_vocab_output,
                                     None)
        trg_topic_binary_dataset = TextFile([val_set_target],
                                            topic_binary_vocab, None)

        # Merge them to get a source, target pair
        dev_stream = Merge([src_dataset.get_example_stream(),
                            trg_dataset.get_example_stream(),
                            src_topic_input.get_example_stream(),
                            trg_topic_dataset.get_example_stream(),
                            trg_topic_binary_dataset.get_example_stream()],
                           ('source', 'target', 'source_topical',
                            'target_topic', 'target_binary_topic'))

        stream = Batch(dev_stream, iteration_scheme=ConstantScheme(1))

        # Pad sequences that are short
        masked_stream = PaddingWithEOS(
            stream, [src_vocab_size - 1, trg_vocab_size - 1,
                     source_topic_vocab_size - 1, trg_topic_vocab_size - 1,
                     trg_topic_vocab_size - 1])
    return masked_stream
def get_stream(vocab, data, vocab_size, unk_id, eos_id, bos_id, noise=0):
    vocab = get_vocab(vocab, vocab_size, unk_id, eos_id, bos_id)

    # Maps words to their index in the vocabulary. OOV words are replaced by
    # the <UNK> index. Also appends the </S> index at the end. No <S> token
    # (TODO: bos_id parameter useless).
    dataset = TextFile([data], vocab, None)
    stream = Mapping(dataset.get_example_stream(), _add_noise(noise))

    stream.dataset = dataset  # for backward-compatibility
    return stream
def get_dev_stream(sfiles, tfiles, svocab_dict, tvocab_dict):
    s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')
    t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')

    # Merge
    stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))

    # Batch - Sort
    stream = Batch(stream, iteration_scheme=ConstantScheme(1006))

    # Pad
    # Note that </s>=0. Fuel only allows padding 0 by default
    masked_stream = Padding(stream)

    return masked_stream
def get_dev_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                   unk_id=1, **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_stream = DataStream(dev_dataset)
    return dev_stream
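# Minimal usage sketch (an assumption, not part of the original file; the
# file names are hypothetical placeholders): the unbatched dev stream yields
# one (source,) tuple of word indices per validation sentence.
def _demo_iterate_dev_stream():
    dev_stream = get_dev_stream(val_set='newstest.src.tok',
                                src_vocab='vocab.src.pkl',
                                src_vocab_size=30000)
    for (source,) in dev_stream.get_epoch_iterator():
        pass  # e.g. feed `source` to the decoder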
def test_text():
    # Test word level and epochs.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        sentences1 = f.name
        f.write("This is a sentence\n")
        f.write("This another one")
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        sentences2 = f.name
        f.write("More sentences\n")
        f.write("The last one")
    dictionary = {'<UNK>': 0, '</S>': 1, 'this': 2, 'a': 3, 'one': 4}
    text_data = TextFile(files=[sentences1, sentences2],
                         dictionary=dictionary, bos_token=None,
                         preprocess=lower)
    stream = DataStream(text_data)
    epoch = stream.get_epoch_iterator()
    assert len(list(epoch)) == 4
    epoch = stream.get_epoch_iterator()
    for sentence in zip(range(3), epoch):
        pass
    f = BytesIO()
    cPickle.dump(epoch, f)
    sentence = next(epoch)
    f.seek(0)
    epoch = cPickle.load(f)
    assert next(epoch) == sentence
    assert_raises(StopIteration, next, epoch)

    # Test character level.
    dictionary = dict([(chr(ord('a') + i), i) for i in range(26)] +
                      [(' ', 26)] + [('<S>', 27)] +
                      [('</S>', 28)] + [('<UNK>', 29)])
    text_data = TextFile(files=[sentences1, sentences2],
                         dictionary=dictionary, preprocess=lower,
                         level="character")
    sentence = next(DataStream(text_data).get_epoch_iterator())[0]
    assert sentence[:3] == [27, 19, 7]
    assert sentence[-3:] == [2, 4, 28]
def get_test_stream(test_set=None, src_vocab=None, trg_vocab=None,
                    src_vocab_size=200000, trg_vocab_size=6540, unk_id=1,
                    sort_k_batches=12):
    """Prepares the testing data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([test_set], src_vocab, None)
    trg_dataset = TextFile(['./data/test.zh'], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream, _oov_to_unk())

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream, iteration_scheme=ConstantScheme(sort_k_batches))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(1))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
def get_stream(input_file, vocab_file, **kwards):

    unk_token = kwards.pop('unk_token')
    eos_token = kwards.pop('eos_token')

    dataset = TextFile(files=[input_file],
                       dictionary=pkl.load(open(vocab_file, 'rb')),
                       level='word',
                       unk_token=unk_token,
                       bos_token=None,
                       eos_token=eos_token)
    stream = DataStream(dataset)

    return stream
def _get_text_stream(src_data, trg_data, src_vocab_size=30000,
                     trg_vocab_size=30000, **kwargs):
    """Creates a parallel data stream from two text files without random
    access. This stream cannot be used with reshuffling. The arguments to
    this method are given by the configuration dict.
    """
    # Build dummy vocabulary to make TextFile happy
    src_vocab = _add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = _add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    return Merge([src_dataset.get_example_stream(),
                  trg_dataset.get_example_stream()],
                 ('source', 'target'))
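# The _add_special_ids helper used above is not defined in this file. A
# minimal sketch of what it plausibly does (an assumption, inferred from how
# the dummy vocabulary is combined with utils.EOS_ID/UNK_ID elsewhere): it
# maps the conventional special tokens onto the reserved ids so that TextFile
# accepts the purely numeric vocabulary.
def _add_special_ids(vocab):
    """Add Fuel/Blocks style special-token entries to ``vocab``."""
    vocab['<S>'] = utils.GO_ID
    vocab['</S>'] = utils.EOS_ID
    vocab['<UNK>'] = utils.UNK_ID
    return vocab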
def get_sgnmt_dev_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                         unk_id=1, **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set is not None:
        src_vocab = add_special_ids({str(i): i
                                     for i in xrange(src_vocab_size)})
        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_stream = DataStream(dev_dataset)
    return dev_stream
def _get_sgnmt_dev_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                          **kwargs):
    """Setup development set stream if necessary.

    The arguments to this method are given by the configuration dict.
    """
    dev_stream = None
    if val_set is not None:
        src_vocab = _add_special_ids({str(i): i
                                      for i in xrange(src_vocab_size)})
        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_stream = DataStream(dev_dataset)
    return dev_stream
def get_test_stream(src_vocab, trg_vocab, src_data, trg_data=None,
                    src_vocab_size=30000, unk_id=1, seq_len=50,
                    batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the test data stream (no batching or gold labels)."""
    print('streaming...')

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=2, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=2, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, preprocess=get_unicode)
    print(src_data)
    trg_dataset = TextFile([trg_data], trg_vocab, preprocess=get_unicode)

    # Merge them to get a source, target pair
    #stream = DataStream(src_dataset)
    stream = Merge([DataStream(src_dataset), DataStream(trg_dataset)],
                   ('source', 'target'))
    return stream
def get_tst_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                   unk_id=1, **kwargs):
    tst_stream = None
    if val_set is not None and src_vocab is not None:
        # Load dictionaries and ensure special tokens exist
        src_vocab = ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        tst_dataset = TextFile([val_set], src_vocab, None)
        tst_stream = DataStream(tst_dataset)
    return tst_stream