Example #1
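The example snippets below omit their imports; a minimal sketch of what they appear to rely on (an assumption inferred from the names used, not shown in the original excerpts) is:

# Assumed imports (not shown in the original excerpts): the standard Fuel
# data-pipeline classes plus cPickle via six for Python 2 compatibility.
from six.moves import cPickle
from fuel.datasets import TextFile
from fuel.schemes import ConstantScheme
from fuel.transformers import (Batch, Filter, Mapping, Merge, SortMapping,
                               Unpack)
# A module-level logger is also assumed: logger = logging.getLogger(__name__).
# The remaining names (_ensure_special_tokens, PaddingWithEOS,
# PrefixSuffixStreamTransformer, CopySourceAndTargetToMatchPrefixes,
# ShuffleBatchTransformer, _too_long, _oov_to_unk, _length, ...) look like
# project-local helpers rather than Fuel classes.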
def get_dev_stream_with_prefixes(val_set=None, val_set_grndtruth=None, src_vocab=None, src_vocab_size=30000,
                                 trg_vocab=None, trg_vocab_size=30000, unk_id=1, return_vocab=False, **kwargs):
    """Setup development set stream if necessary."""

    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict) else
            cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict) else
            cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream()],
                           ('source', 'target'))

        # now add prefix and suffixes to this stream
        dev_stream = Mapping(dev_stream, PrefixSuffixStreamTransformer(sample_ratio=kwargs.get('dev_sample_ratio', 1.)),
                             add_sources=('target_prefix', 'target_suffix'))

        dev_stream = Mapping(dev_stream, CopySourceAndTargetToMatchPrefixes(dev_stream))

        # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
        dev_stream.produces_examples = False
        # flatten the stream back out into (source, target, target_prefix, target_suffix)
        dev_stream = Unpack(dev_stream)

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream
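A hypothetical call, for illustration only; the file paths and vocabulary pickles are placeholders:

# Illustrative usage only; paths and vocab pickles are placeholders.
dev_stream, src_vocab, trg_vocab = get_dev_stream_with_prefixes(
    val_set='data/dev.src',
    val_set_grndtruth='data/dev.trg',
    src_vocab='data/vocab.src.pkl',
    trg_vocab='data/vocab.trg.pkl',
    src_vocab_size=30000,
    trg_vocab_size=30000,
    unk_id=1,
    return_vocab=True)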
Example #2
def setup_model_and_stream(exp_config, source_vocab, target_vocab):
    """Build the sampling model and the min-risk IMT training stream."""

    # TODO: this line is a mess
    (sample_model, theano_sampling_source_input, theano_sampling_context_input,
     train_encoder, train_decoder, generated) = \
        get_sampling_model_and_input(exp_config)

    trg_vocab = target_vocab
    trg_vocab_size = exp_config['trg_vocab_size']
    src_vocab = source_vocab
    src_vocab_size = exp_config['src_vocab_size']

    theano_sample_func = sample_model.get_theano_function()
    sampling_func = SampleFunc(theano_sample_func, trg_vocab)

    # TODO: move stream creation to nn_imt.stream
    # def get_textfile_stream(source_file=None, src_vocab=None, src_vocab_size=30000,
    #                         unk_id=1, bos_token=None):
    src_stream = get_textfile_stream(
        source_file=exp_config['src_data'],
        src_vocab=exp_config['src_vocab'],
        src_vocab_size=exp_config['src_vocab_size'],
        unk_id=exp_config['unk_id'],
        bos_token='<S>')

    trg_stream = get_textfile_stream(
        source_file=exp_config['trg_data'],
        src_vocab=exp_config['trg_vocab'],
        src_vocab_size=exp_config['trg_vocab_size'],
        unk_id=exp_config['unk_id'],
        bos_token='<S>')

    # text file stream
    training_stream = Merge([src_stream, trg_stream], ('source', 'target'))

    # Filter sequences that are too long (Note this may break)
    training_stream = Filter(
        training_stream, predicate=_too_long(seq_len=exp_config['seq_len']))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    training_stream = Mapping(
        training_stream,
        _oov_to_unk(src_vocab_size=exp_config['src_vocab_size'],
                    trg_vocab_size=exp_config['trg_vocab_size'],
                    unk_id=exp_config['unk_id']))

    # add in the prefix and suffix sources, using the sample ratio from the
    # experiment config
    logger.info('Sample ratio is: {}'.format(exp_config.get(
        'sample_ratio', 1.)))
    training_stream = Mapping(
        training_stream,
        PrefixSuffixStreamTransformer(
            sample_ratio=exp_config.get('sample_ratio', 1.)),
        add_sources=('target_prefix', 'target_suffix'))

    training_stream = Mapping(
        training_stream, CopySourceAndTargetToMatchPrefixes(training_stream))

    # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
    training_stream.produces_examples = False

    # flatten the stream back out into (source, target, target_prefix, target_suffix)
    training_stream = Unpack(training_stream)

    # build the inverse target vocabulary (needed when scoring with METEOR)
    trg_ivocab = {v: k for k, v in trg_vocab.items()}

    # TODO: Implement smoothed BLEU
    # TODO: Implement first-word accuracy (bilingual language model)

    min_risk_score_func = exp_config.get('min_risk_score_func', 'bleu')

    if min_risk_score_func == 'meteor':
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_meteor,
            num_samples=exp_config['n_samples'],
            trg_ivocab=trg_ivocab,
            lang=exp_config['target_lang'],
            meteor_directory=exp_config['meteor_directory'])
    elif min_risk_score_func == 'imt_f1':
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_imt_f1,
            num_samples=exp_config['n_samples'])
    # BLEU is default
    else:
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_bleu,
            num_samples=exp_config['n_samples'])

    training_stream = Mapping(training_stream,
                              sampling_transformer,
                              add_sources=('samples', 'seq_probs', 'scores'))

    # now filter out segments whose samples are too good or too bad
    training_stream = Filter(training_stream, predicate=filter_by_sample_score)

    # Now make a very big batch that we can shuffle
    shuffle_batch_size = exp_config['shuffle_batch_size']
    training_stream = Batch(
        training_stream, iteration_scheme=ConstantScheme(shuffle_batch_size))

    training_stream = ShuffleBatchTransformer(training_stream)

    # unpack it again
    training_stream = Unpack(training_stream)

    # Build a batched version of stream to read k batches ahead
    batch_size = exp_config['batch_size']
    sort_k_batches = exp_config['sort_k_batches']
    training_stream = Batch(training_stream,
                            iteration_scheme=ConstantScheme(batch_size *
                                                            sort_k_batches))

    # Sort all samples in the read-ahead batch
    training_stream = Mapping(training_stream, SortMapping(_length))

    # Convert it into a stream again
    training_stream = Unpack(training_stream)

    # Construct batches from the stream with specified batch size
    training_stream = Batch(training_stream,
                            iteration_scheme=ConstantScheme(batch_size))

    # flatten the target samples before we add the mask
    flat_sample_stream = FlattenSamples(training_stream)

    expanded_source_stream = CopySourceAndPrefixNTimes(
        flat_sample_stream, n_samples=exp_config['n_samples'])

    # Pad sequences that are short
    # TODO: is it correct to blindly pad the target_prefix and the target_suffix?
    # Note: we shouldn't need to pad the seq_probs because there is only one per sequence
    configurable_padding_args = {
        'suffix_length': exp_config.get('suffix_length', None),
        'truncate_sources': exp_config.get('truncate_sources', [])
    }
    masked_stream = PaddingWithEOS(
        expanded_source_stream,
        [src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1,
         trg_vocab_size - 1, trg_vocab_size - 1],
        mask_sources=('source', 'target', 'target_prefix',
                      'target_suffix', 'samples'),
        **configurable_padding_args)

    return train_encoder, train_decoder, theano_sampling_source_input, theano_sampling_context_input, generated, masked_stream
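A hypothetical call, for illustration only; exp_config and the vocabulary dicts are assumed to be loaded elsewhere. The returned masked_stream yields padded (source, target, target_prefix, target_suffix, samples) batches together with their masks:

# Illustrative usage only.
(train_encoder, train_decoder, theano_sampling_source_input,
 theano_sampling_context_input, generated, masked_stream) = \
    setup_model_and_stream(exp_config, source_vocab, target_vocab)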
Example #3
def get_tr_stream_with_prefixes(src_vocab, trg_vocab, src_data, trg_data, src_vocab_size=30000,
                                trg_vocab_size=30000, unk_id=1, seq_len=50,
                                batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the IMT training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # TODO: should the training stream actually have begin and end tokens?
    # Note: this depends upon how the system was pre-trained, but systems used
    # for initialization should _always_ have BOS tokens

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab,
                           bos_token='<S>',
                           eos_token='</S>',
                           unk_token='<UNK>')
    trg_dataset = TextFile([trg_data], trg_vocab,
                           bos_token='<S>',
                           eos_token='</S>',
                           unk_token='<UNK>')

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    stream = Mapping(stream, PrefixSuffixStreamTransformer(sample_ratio=kwargs.get('train_sample_ratio', 1.)),
                     add_sources=('target_prefix', 'target_suffix'))

    stream = Mapping(stream, CopySourceAndTargetToMatchPrefixes(stream))

    # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
    stream.produces_examples = False
    # flatten the stream back out into (source, target, target_prefix, target_suffix)
    stream = Unpack(stream)

    # Now make a very big batch that we can shuffle
    shuffle_batch_size = kwargs['shuffle_batch_size']
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(shuffle_batch_size)
                   )

    stream = ShuffleBatchTransformer(stream)

    # unpack it again
    stream = Unpack(stream)

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches)
                   )

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    # TODO: is it correct to blindly pad the target_prefix and the target_suffix?
    configurable_padding_args = {
        'suffix_length': kwargs.get('suffix_length', None),
        'truncate_sources': kwargs.get('truncate_sources', [])
    }
    logger.info('Training suffix length is: {}'.format(configurable_padding_args['suffix_length']))
    logger.info('I will mask the following sources after <suffix_length>: {}'.format(configurable_padding_args['truncate_sources']))
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1],
        mask_sources=('source', 'target', 'target_prefix', 'target_suffix'), **configurable_padding_args)

    return masked_stream, src_vocab, trg_vocab
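A hypothetical call, for illustration only; the paths, vocabulary pickles, and the shuffle_batch_size value are placeholders (shuffle_batch_size is required, since the function reads it from kwargs without a default):

# Illustrative usage only; paths and vocab pickles are placeholders.
masked_stream, src_vocab, trg_vocab = get_tr_stream_with_prefixes(
    src_vocab='data/vocab.src.pkl',
    trg_vocab='data/vocab.trg.pkl',
    src_data='data/train.src',
    trg_data='data/train.trg',
    src_vocab_size=30000,
    trg_vocab_size=30000,
    unk_id=1,
    seq_len=50,
    batch_size=80,
    sort_k_batches=12,
    shuffle_batch_size=1000)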