Example #1
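    # Constructor of a Blocks/Theano attention-based speech recognizer: it
    # stacks a bottom MLP over the input features, an (optionally subsampled)
    # bidirectional encoder, an optional "top" MLP, and an attention-equipped
    # SequenceGenerator that emits the phoneme labels.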
    def __init__(
        self,
        recordings_source,
        labels_source,
        eos_label,
        num_features,
        num_phonemes,
        dim_dec,
        dims_bidir,
        dims_bottom,
        enc_transition,
        dec_transition,
        use_states_for_readout,
        attention_type,
        lm=None,
        character_map=None,
        subsample=None,
        dims_top=None,
        prior=None,
        conv_n=None,
        bottom_activation=None,
        post_merge_activation=None,
        post_merge_dims=None,
        dim_matcher=None,
        embed_outputs=True,
        dec_stack=1,
        conv_num_filters=1,
        data_prepend_eos=True,
        energy_normalizer=None,  # softmax is the default set in SequenceContentAndConvAttention
        **kwargs):
        if bottom_activation is None:
            bottom_activation = Tanh()
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.recordings_source = recordings_source
        self.labels_source = labels_source
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        if dims_bottom:
            bottom = MLP([bottom_activation] * len(dims_bottom),
                         [num_features] + dims_bottom,
                         name="bottom")
        else:
            bottom = Identity(name='bottom')

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(
            self.enc_transition, dims_bidir,
            dims_bottom[-1] if len(dims_bottom) else num_features, subsample)

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                      name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(dim=dim_dec,
                                             activation=Tanh(),
                                             name="transition")
        else:
            transitions = [
                self.dec_transition(dim=dim_dec,
                                    activation=Tanh(),
                                    name="transition_{}".format(trans_level))
                for trans_level in xrange(dec_stack)
            ]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1, dim_dec)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if lm:
            # In case we use LM it is Readout that is responsible
            # for normalization.
            emitter = LMEmitter()
        else:
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter")
        readout_config = dict(readout_dim=num_phonemes,
                              source_names=(transition.apply.states if
                                            use_states_for_readout else []) +
                              [attention.take_glimpses.outputs[0]],
                              emitter=emitter,
                              feedback_brick=feedback,
                              name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [
                    Bias(post_merge_dims[0]).apply,
                    post_merge_activation.apply,
                    MLP(
                        [post_merge_activation] *
                        (len(post_merge_dims) - 1) + [Identity()],
                        # MLP was not designed to support Maxout as an
                        # activation (because Maxout, in a way, is not one).
                        # However, a single-layer Maxout network works with
                        # the trick below; for deeper Maxout networks one has
                        # to use the Sequence brick.
                        [
                            d //
                            getattr(post_merge_activation, 'num_pieces', 1)
                            for d in post_merge_dims
                        ] + [num_phonemes]).apply,
                ],
                name='post_merge')
        readout = Readout(**readout_config)

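        # Optional shallow fusion with an external language model: the
        # readout then combines acoustic and LM scores, with configurable
        # log-probability normalization for each.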
        language_model = None
        if lm:
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                logger.warn(
                    "Beam search is prone to fail with no log-prob normalization"
                )
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      language_model=language_model,
                                      name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.recordings = tensor.tensor3(self.recordings_source)
        self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
        self.labels = tensor.lmatrix(self.labels_source)
        self.labels_mask = tensor.matrix(self.labels_source + "_mask")
        self.batch_inputs = [
            self.recordings, self.recordings_mask, self.labels,
            self.labels_mask
        ]
        self.single_recording = tensor.matrix(self.recordings_source)
        self.single_transcription = tensor.lvector(self.labels_source)
Example #2
    def __init__(self,
                 input_dims,
                 input_num_chars,
                 bos_label, eos_label,
                 num_labels,
                 dim_dec, dims_bidir,
                 enc_transition, dec_transition,
                 use_states_for_readout,
                 attention_type,
                 criterion,
                 bottom,
                 lm=None, token_map=None,
                 bidir=True, window_size=None,
                 max_length=None, subsample=None,
                 dims_top=None, extra_input_dim=None,
                 prior=None, conv_n=None,
                 post_merge_activation=None,
                 post_merge_dims=None,
                 dim_matcher=None,
                 embed_outputs=True,
                 dim_output_embedding=None,
                 reuse_bottom_lookup_table=False,
                 dec_stack=1,
                 conv_num_filters=1,
                 data_prepend_eos=True,
                 # softmax is the default set in SequenceContentAndConvAttention
                 energy_normalizer=None,
                 # for speech this is the approximate phoneme duration in frames
                 max_decoded_length_scale=1,
                 # for criterions involving generation of outputs, whether
                 # or not they should be generated by the recognizer itself
                 generate_predictions=True,
                 compute_targets=True,
                 extra_generation_steps=3,
                 **kwargs):
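        # Keep a deep copy of every constructor argument; the actor-critic
        # branch below reuses this dictionary to instantiate the critic as a
        # second EncoderDecoder with a (possibly downsized) configuration.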
        all_arguments = copy.deepcopy(locals())
        all_arguments.update(copy.deepcopy(kwargs))
        del all_arguments['kwargs']
        del all_arguments['self']

        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(EncoderDecoder, self).__init__(**kwargs)
        self.bos_label = bos_label
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion
        self.generate_predictions = generate_predictions
        self.extra_generation_steps = extra_generation_steps
        self.compute_targets = compute_targets

        self.max_decoded_length_scale = max_decoded_length_scale

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(
            input_dims=input_dims, input_num_chars=input_num_chars,
            name='bottom',
            **bottom)

        # BiRNN
        if dims_bidir:
            if not subsample:
                subsample = [1] * len(dims_bidir)
            encoder = Encoder(self.enc_transition, dims_bidir,
                            bottom.get_dim(bottom.apply.outputs[0]),
                            subsample, bidir=bidir)
        elif window_size:
            encoder = ConvEncoder(
                max_length, bottom.get_dim(bottom.apply.outputs[0]), window_size)
        else:
            raise ValueError("Don't know which Encoder to use")
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [dim_encoded] + dims_top + [dim_encoded], name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            assert not extra_input_dim
            transitions = [self.dec_transition(dim=dim_dec,
                                               activation=Tanh(),
                                               name="transition_{}".format(trans_level))
                           for trans_level in xrange(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError("Unknown attention type {}"
                             .format(attention_type))
        if not embed_outputs:
            raise ValueError("embed_outputs=False is not supported any more")
        if not reuse_bottom_lookup_table:
            embedding = LookupTable(num_labels + 1,
                            dim_dec if
                            dim_output_embedding is None
                            else dim_output_embedding)
        else:
            embedding = bottom.children[0]
        feedback = Feedback(
            embedding=embedding,
            output_names=[s for s in transition.apply.sequences
                           if s != 'mask'])

        # Create a readout
        readout_config = dict(
            num_tokens=num_labels,
            input_names=(transition.apply.states if use_states_for_readout else [])
                         + [attention.take_glimpses.outputs[0]],
            name="readout")
        if post_merge_dims:
            readout_config['merge_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1) + [Identity()],
                    # MLP was not designed to support Maxout as an activation
                    # (because Maxout, in a way, is not one). However, a
                    # single-layer Maxout network works with the trick below;
                    # for deeper Maxout networks one has to use the Sequence
                    # brick.
                    [d//getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_labels]).apply,
            ], name='post_merge')
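        # Criteria other than log-likelihood need a reward signal, provided by
        # a reward brick: (delta) edit distance or (delta) BLEU against the
        # ground-truth transcription.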
        if 'reward' in criterion and criterion['name'] != 'log_likelihood':
            if criterion['reward'] == 'edit_distance':
                readout_config['reward_brick'] = EditDistanceReward(
                    self.bos_label, self.eos_label)
            elif criterion['reward'] == 'delta_edit_distance':
                readout_config['reward_brick'] = EditDistanceReward(
                    self.bos_label, self.eos_label, deltas=True)
            elif criterion['reward'] == 'bleu':
                readout_config['reward_brick'] = BleuReward(
                    self.bos_label, self.eos_label, deltas=False)
            elif criterion['reward'] == 'delta_bleu':
                readout_config['reward_brick'] = BleuReward(
                    self.bos_label, self.eos_label, deltas=True)
            else:
                raise ValueError("Unknown reward type")
        if criterion['name'] == 'log_likelihood':
            readout_class = SoftmaxReadout
        elif criterion['name'] == 'critic':
            readout_class = CriticReadout
            criterion_copy = dict(criterion)
            del criterion_copy['name']
            readout_config.update(**criterion_copy)
        elif criterion['name'] == 'reinforce':
            readout_class = ReinforceReadout
            readout_config['merge_names'] = list(readout_config['input_names'])
            readout_config['entropy'] = criterion.get('entropy')
            readout_config['input_names'] += ['attended', 'attended_mask']
        elif criterion['name'] in ['sarsa', 'actor_critic']:
            readout_class = ActorCriticReadout
            if criterion['name'] == 'actor_critic':
                critic_arguments = dict(all_arguments)
                # No worries, the critic will not compute log-likelihood
                # values.
                critic_arguments['criterion'] = {
                    'name': 'critic',
                    'value_softmax': criterion.get('value_softmax'),
                    'same_value_for_wrong': criterion.get('same_value_for_wrong'),
                    'groundtruth_word_bonus': criterion.get('groundtruth_word_bonus'),
                    'dueling_outputs':  criterion.get('dueling_outputs')}
                critic_arguments['name'] = 'critic'
                if criterion.get('critic_uses_actor_states'):
                    critic_arguments['extra_input_dim'] = dim_dec
                if (criterion.get('value_softmax')
                        or criterion.get('same_value_for_wrong')
                        or criterion.get('dueling_outputs')):
                    # Add an extra output for the critic
                    critic_arguments['num_labels'] = num_labels + 1
                if criterion.get('force_bidir'):
                    critic_arguments['dims_bidir'] = [dim_dec]
                critic_arguments['reuse_bottom_lookup_table'] = True
                critic_arguments['input_num_chars'] = {'inputs': num_labels}
                if criterion.get('downsize_critic'):
                    critic_arguments = _downsize_config(
                        critic_arguments, criterion['downsize_critic'])
                critic = EncoderDecoder(**critic_arguments)
                readout_config['critic'] = critic
            readout_config['merge_names'] = list(readout_config['input_names'])
            readout_config['freeze_actor'] = criterion.get('freeze_actor')
            readout_config['freeze_critic'] = criterion.get('freeze_critic')
            readout_config['critic_uses_actor_states'] = criterion.get('critic_uses_actor_states')
            readout_config['critic_uses_groundtruth'] = criterion.get('critic_uses_groundtruth')
            readout_config['critic_burnin_steps'] = criterion.get('critic_burnin_steps')
            readout_config['critic_loss'] = criterion.get('critic_loss')
            readout_config['discount'] = criterion.get('discount')
            readout_config['entropy_reward_coof'] = criterion.get('entropy_reward_coof')
            readout_config['cross_entropy_reward_coof'] = criterion.get('cross_entropy_reward_coof')
            readout_config['value_penalty'] = criterion.get('value_penalty')
            readout_config['value_penalty_type'] = criterion.get('value_penalty_type')
            readout_config['critic_policy_t'] = criterion.get('critic_policy_t')
            readout_config['bos_token'] = bos_label
            readout_config['accumulate_outputs'] = criterion.get('accumulate_outputs')
            readout_config['use_value_biases'] = criterion.get('use_value_biases')
            readout_config['actor_grad_estimate'] = criterion.get('actor_grad_estimate')
            readout_config['input_names'] += ['attended', 'attended_mask']
            # Note that the settings below are for the "clean" mode.
            # When get_cost_graph() is run with training=True, they
            # are temporarily overridden with the "real" settings from
            # "criterion".
            readout_config['compute_targets'] = True
            readout_config['trpo_coef'] = 0.0
            readout_config['solve_bellman'] = True
        else:
            raise ValueError("Unknown criterion {}".format(criterion['name']))
        readout = readout_class(**readout_config)

        if lm:
            raise ValueError("LM is currently not supported")

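        # Wrap the decoder transition with the attention mechanism; when an
        # extra input dimension is given (the critic conditioning on the
        # actor's states), add an adapter for that extra input.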
        recurrent = AttentionRecurrent(transition, attention)
        if extra_input_dim:
            recurrent = RecurrentWithExtraInput(
                recurrent, "extra_inputs", extra_input_dim, name="with_extra_inputs")
        generator = SequenceGenerator(
            recurrent=recurrent, readout=readout, feedback=feedback,
            name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.softmax = Softmax()
        self.children = [encoder, top, bottom, generator, self.softmax]

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.predicted_labels = tensor.lmatrix('predicted_labels')
        self.predicted_mask = tensor.matrix('predicted_mask')
        self.prefix_labels = tensor.lmatrix('prefix_labels')
        self.prefix_steps = tensor.lscalar('prefix_steps')

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.single_predicted_labels = tensor.lvector('predicted_labels')
        self.n_steps = tensor.lscalar('n_steps')

        # Configure mixed_generate
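        # mixed_generate presumably advances the actor and the critic
        # recurrences together; its interface therefore lists both sets of
        # states, outputs and contexts (critic names prefixed with
        # 'critic_'), plus the ground truth the critic needs.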
        if criterion['name'] == 'actor_critic':
            critic = self.generator.readout.critic
            self.mixed_generate.sequences = []
            self.mixed_generate.states = (
                ['step'] +
                self.generator.recurrent.apply.states +
                ['critic_' + name for name in critic.generator.recurrent.apply.states])
            self.mixed_generate.outputs = (
                ['samples', 'step'] +
                self.generator.recurrent.apply.outputs +
                ['critic_' + name for name in critic.generator.recurrent.apply.outputs])
            self.mixed_generate.contexts = (
                self.generator.recurrent.apply.contexts +
                ['critic_' + name for name in critic.generator.recurrent.apply.contexts]
                + ['groundtruth', 'groundtruth_mask'])
            self.initial_states.outputs = self.mixed_generate.states

        self.prefix_generate.sequences = []
        self.prefix_generate.states = ['step'] + self.generator.recurrent.apply.states
        self.prefix_generate.outputs = ['samples', 'step'] + self.generator.recurrent.apply.outputs
        self.prefix_generate.contexts = self.generator.recurrent.apply.contexts
Example #3
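    # A slimmer recognizer variant: the bottom network is a configurable
    # brick, the training criterion is either log-likelihood or an MSE-style
    # reward regression, and shallow LM fusion is optional.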
    def __init__(self,
                 input_dims,
                 input_num_chars,
                 eos_label,
                 num_phonemes,
                 dim_dec, dims_bidir,
                 enc_transition, dec_transition,
                 use_states_for_readout,
                 attention_type,
                 criterion,
                 bottom,
                 lm=None, character_map=None,
                 bidir=True,
                 subsample=None,
                 dims_top=None,
                 prior=None, conv_n=None,
                 post_merge_activation=None,
                 post_merge_dims=None,
                 dim_matcher=None,
                 embed_outputs=True,
                 dim_output_embedding=None,
                 dec_stack=1,
                 conv_num_filters=1,
                 data_prepend_eos=True,
                 # softmax is the default set in SequenceContentAndConvAttention
                 energy_normalizer=None,
                 # for speech this is the approximate phoneme duration in frames
                 max_decoded_length_scale=1,
                 **kwargs):

        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion

        self.max_decoded_length_scale = max_decoded_length_scale

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(
            input_dims=input_dims, input_num_chars=input_num_chars,
            name='bottom',
            **bottom)

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition, dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample, bidir=bidir)
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [dim_encoded] + dims_top + [dim_encoded], name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            transitions = [self.dec_transition(dim=dim_dec,
                                               activation=Tanh(),
                                               name="transition_{}".format(trans_level))
                           for trans_level in xrange(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError("Unknown attention type {}"
                             .format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1,
                                      dim_dec if
                                      dim_output_embedding is None
                                      else dim_output_embedding)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if criterion['name'] == 'log_likelihood':
            emitter = SoftmaxEmitter(initial_output=num_phonemes, name="emitter")
            if lm:
                # In case we use LM it is Readout that is responsible
                # for normalization.
                emitter = LMEmitter()
        elif criterion['name'].startswith('mse'):
            emitter = RewardRegressionEmitter(
                criterion['name'], eos_label, num_phonemes,
                criterion.get('min_reward', -1.0),
                name="emitter")
        else:
            raise ValueError("Unknown criterion {}".format(criterion['name']))
        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states if use_states_for_readout else [])
                         + [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1) + [Identity()],
                    # MLP was not designed to support Maxout as an activation
                    # (because Maxout, in a way, is not one). However, a
                    # single-layer Maxout network works with the trick below;
                    # for deeper Maxout networks one has to use the Sequence
                    # brick.
                    [d//getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_phonemes]).apply,
            ],
                name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm and lm.get('path'):
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                logger.warn("Beam search is prone to fail with no log-prob normalization")
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(lm_costs_name='lm_add',
                                           lm_weight=lm_weight,
                                           normalize_am_weights=normalize_am_weights,
                                           normalize_lm_weights=normalize_lm_weights,
                                           normalize_tot_weights=normalize_tot_weights,
                                           am_beta=am_beta,
                                           **readout_config)

        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            language_model=language_model,
            name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.n_steps = tensor.lscalar('n_steps')
Example #4
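    # Variant with two parallel top/attention/readout/generator stacks built
    # on a shared bottom network and encoder; a Linear brick
    # (forward_to_backward) between the decoder dimensions is registered as a
    # child and presumably applied elsewhere in the class.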
    def __init__(
            self,
            input_dims,
            input_num_chars,
            eos_label,
            num_phonemes,
            dim_dec,
            dims_bidir,
            enc_transition,
            dec_transition,
            use_states_for_readout,
            attention_type,
            criterion,
            bottom,
            lm=None,
            character_map=None,
            bidir=True,
            subsample=None,
            dims_top=None,
            prior=None,
            conv_n=None,
            post_merge_activation=None,
            post_merge_dims=None,
            dim_matcher=None,
            embed_outputs=True,
            dim_output_embedding=None,
            dec_stack=1,
            conv_num_filters=1,
            data_prepend_eos=True,
            # softmax is the default set in SequenceContentAndConvAttention
            energy_normalizer=None,
            # for speech this is the approximate phoneme duration in frames
            max_decoded_length_scale=1,
            **kwargs):

        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion

        self.max_decoded_length_scale = max_decoded_length_scale

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(input_dims=input_dims,
                              input_num_chars=input_num_chars,
                              name='bottom',
                              **bottom)

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition,
                          dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample,
                          bidir=bidir)
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

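        # Build the two generator branches; each brick name gets the branch
        # index appended so that parameter names stay unique.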
        generators = [None, None]
        for i in range(2):
            # The top part, on top of BiRNN but before the attention
            if dims_top:
                top = MLP([Tanh()], [dim_encoded] + dims_top + [dim_encoded],
                          name="top{}".format(i))
            else:
                top = Identity(name='top{}'.format(i))

            if dec_stack == 1:
                transition = self.dec_transition(dim=dim_dec,
                                                 activation=Tanh(),
                                                 name="transition{}".format(i))
            else:
                transitions = [
                    self.dec_transition(dim=dim_dec,
                                        activation=Tanh(),
                                        name="transition_{}_{}".format(
                                            i, trans_level))
                    for trans_level in xrange(dec_stack)
                ]
                transition = RecurrentStack(transitions=transitions,
                                            skip_connections=True)
            # Choose attention mechanism according to the configuration
            if attention_type == "content":
                attention = SequenceContentAttention(
                    state_names=transition.apply.states,
                    attended_dim=dim_encoded,
                    match_dim=dim_matcher,
                    name="cont_att" + i)
            elif attention_type == "content_and_conv":
                attention = SequenceContentAndConvAttention(
                    state_names=transition.apply.states,
                    conv_n=conv_n,
                    conv_num_filters=conv_num_filters,
                    attended_dim=dim_encoded,
                    match_dim=dim_matcher,
                    prior=prior,
                    energy_normalizer=energy_normalizer,
                    name="conv_att{}".format(i))
            else:
                raise ValueError(
                    "Unknown attention type {}".format(attention_type))
            if embed_outputs:
                feedback = LookupFeedback(
                    num_phonemes + 1, dim_dec
                    if dim_output_embedding is None else dim_output_embedding)
            else:
                feedback = OneOfNFeedback(num_phonemes + 1)
            if criterion['name'] == 'log_likelihood':
                emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                         name="emitter{}".format(i))
                if lm:
                    # In case we use LM it is Readout that is responsible
                    # for normalization.
                    emitter = LMEmitter()
            elif criterion['name'].startswith('mse'):
                emitter = RewardRegressionEmitter(criterion['name'],
                                                  eos_label,
                                                  num_phonemes,
                                                  criterion.get(
                                                      'min_reward', -1.0),
                                                  name="emitter")
            else:
                raise ValueError("Unknown criterion {}".format(
                    criterion['name']))
            readout_config = dict(
                readout_dim=num_phonemes,
                source_names=(transition.apply.states if use_states_for_readout
                              else []) + [attention.take_glimpses.outputs[0]],
                emitter=emitter,
                feedback_brick=feedback,
                name="readout{}".format(i))
            if post_merge_dims:
                readout_config['merged_dim'] = post_merge_dims[0]
                readout_config['post_merge'] = InitializableSequence(
                    [
                        Bias(post_merge_dims[0]).apply,
                        post_merge_activation.apply,
                        MLP(
                            [post_merge_activation] *
                            (len(post_merge_dims) - 1) + [Identity()],
                            # MLP was not designed to support Maxout as an
                            # activation (because Maxout, in a way, is not
                            # one). However, a single-layer Maxout network
                            # works with the trick below; for deeper Maxout
                            # networks one has to use the Sequence brick.
                            [
                                d //
                                getattr(post_merge_activation, 'num_pieces', 1)
                                for d in post_merge_dims
                            ] + [num_phonemes]).apply,
                    ],
                    name='post_merge{}'.format(i))
            readout = Readout(**readout_config)

            language_model = None
            if lm and lm.get('path'):
                lm_weight = lm.pop('weight', 0.0)
                normalize_am_weights = lm.pop('normalize_am_weights', True)
                normalize_lm_weights = lm.pop('normalize_lm_weights', False)
                normalize_tot_weights = lm.pop('normalize_tot_weights', False)
                am_beta = lm.pop('am_beta', 1.0)
                if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                    logger.warn(
                        "Beam search is prone to fail with no log-prob normalization"
                    )
                language_model = LanguageModel(nn_char_map=character_map, **lm)
                readout = ShallowFusionReadout(
                    lm_costs_name='lm_add',
                    lm_weight=lm_weight,
                    normalize_am_weights=normalize_am_weights,
                    normalize_lm_weights=normalize_lm_weights,
                    normalize_tot_weights=normalize_tot_weights,
                    am_beta=am_beta,
                    **readout_config)

            generators[i] = SequenceGenerator(readout=readout,
                                              transition=transition,
                                              attention=attention,
                                              language_model=language_model,
                                              name="generator{}".format(i))

        self.generator = generators[0]

        self.forward_to_backward = Linear(dim_dec, dim_dec)

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generators = generators
        self.children = [self.forward_to_backward, encoder, top, bottom
                         ] + generators

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.n_steps = tensor.lscalar('n_steps')
Example #5
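    # Dependency-parsing variant: pointer-style ParsingAttention over the
    # encoded words, optional auxiliary softmax predictors for the additional
    # sources (e.g. tags), and a configurable soft/hard/semihard treatment of
    # the recurrent dependencies.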
    def __init__(
            self,
            input_sources,
            input_sources_dims,
            eos_label,
            num_phonemes,
            dim_dec,
            dims_bidir,
            enc_transition,
            dec_transition,
            use_states_for_readout,
            attention_type,
            criterion,
            bottom,
            enc_transition_params={},
            dec_transition_params={},
            names_postfix='',
            lm=None,
            character_map=None,
            bidir=True,
            bidir_aggregation='concat',
            subsample=None,
            dims_top=None,
            prior=None,
            conv_n=None,
            post_merge_activation=None,
            post_merge_dims=None,
            dim_matcher=None,
            embed_outputs=True,
            dim_output_embedding=None,
            dec_stack=1,
            conv_num_filters=1,
            data_prepend_eos=False,
            # softmax is the default set in SequenceContentAndConvAttention
            energy_normalizer=None,
            # for speech this is the approximate phoneme duration in frames
            max_decoded_length_scale=3,
            use_dependent_words_for_labels=False,
            use_dependent_words_for_attention=False,
            reproduce_rec_weight_init_bug=True,
            pointers_weight=0.5,
            tags_weight=1.0,
            tag_layer=-1,  # -1 is last, 0 is after first bidir layer
            dependency_type='recurrent_soft',
            **kwargs):

        if post_merge_activation is None:
            post_merge_activation = Tanh()

        self.regularization_bricks = []
        possible_regularization_bricks = []

        self.names_postfix = names_postfix

        self.mask_dict = {}

        self.pointers_name = 'pointers' + names_postfix

        self.additional_sources = kwargs.pop('additional_sources')
        self.additional_sources_dims = kwargs.pop('additional_sources_dims')

        self.pointer_weight = pointers_weight
        self.soft_pointer_val = kwargs.pop('pointers_soften', 0.0)
        self.soft_pointer = self.soft_pointer_val > 0.0

        self.tags_weight = tags_weight
        self.tag_layer = tag_layer
        self.train_tags = True
        if self.tags_weight < 0 or len(self.additional_sources) <= 1:
            self.train_tags = False

        self.dependency_type = dependency_type

        super(DependencyRecognizer, self).__init__(**kwargs)

        self.reproduce_rec_weight_init_bug = reproduce_rec_weight_init_bug

        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion

        self.max_decoded_length_scale = max_decoded_length_scale

        self.post_merge_activation = post_merge_activation

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(input_sources=input_sources,
                              input_sources_dims=input_sources_dims,
                              name='bottom',
                              pointers_soften=self.soft_pointer,
                              additional_sources=self.additional_sources,
                              **bottom)

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition,
                          dims_bidir,
                          bottom.output_dim,
                          subsample,
                          bidir=bidir,
                          bidir_aggregation=bidir_aggregation,
                          enc_transition_params=enc_transition_params)
        possible_regularization_bricks += encoder.enc_transitions
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()], [dim_encoded] + dims_top + [dim_encoded],
                      name="top")
        else:
            top = Identity(name='top')

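        # One per-position softmax MLP for every auxiliary source except the
        # pointers, predicting that source directly from the encoded states.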
        self.additional_sources_mlp = {}
        ndim_softmax = NDimensionalSoftmax()
        ndim_softmax._extra_ndim = 1
        for source in self.additional_sources:
            if source != self.pointers_name:
                if len(self.names_postfix) > 0:
                    source_glob_name = source[:-len(self.names_postfix)]
                else:
                    source_glob_name = source
                self.additional_sources_mlp[source] = \
                    MLP([ndim_softmax], [dim_encoded, self.additional_sources_dims[source]],
                        name='additional_'+source_glob_name)

        if dec_stack == 1:
            transition = self.dec_transition(dim=dim_dec,
                                             activation=Tanh(),
                                             name="transition",
                                             **dec_transition_params)
            possible_regularization_bricks += [transition]
        else:
            transitions = [
                self.dec_transition(dim=dim_dec,
                                    activation=Tanh(),
                                    name="transition_{}".format(trans_level),
                                    **dec_transition_params)
                for trans_level in xrange(dec_stack)
            ]
            possible_regularization_bricks += transitions
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        attention_class = ParsingAttention
        attention_kwargs = {}
        transition_with_att_class = ParsingAttentionRecurrent

        if self.dependency_type == "recurrent_soft":
            attention_kwargs['use_pointers'] = None
        elif self.dependency_type == "recurrent_hard":
            attention_kwargs['use_pointers'] = 'hard'
        elif self.dependency_type == "recurrent_semihard":
            attention_kwargs['use_pointers'] = 'semihard'
        else:
            raise ValueError("Unknown dependency type {}".format(
                self.dependency_type))

        if attention_type == "content":
            pass
        elif attention_type == "content_hard":
            attention_kwargs['hard_attention'] = True
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))

        if use_dependent_words_for_attention:
            attention_kwargs['use_word_annotations'] = True
            attention_kwargs['word_annontation_dim'] = dim_encoded

        attention = attention_class(state_names=transition.apply.states,
                                    attended_dim=dim_encoded,
                                    match_dim=dim_matcher,
                                    name="cont_att",
                                    **attention_kwargs)

        feedback = AttendedFeedback(num_phonemes + 1, dim_encoded)
        if criterion['name'] == 'log_likelihood':
            emitter = SoftmaxMultiEmitter(initial_output=num_phonemes,
                                          name="emitter")
        else:
            raise ValueError("Unknown criterion {}".format(criterion['name']))
        readout_source_names = (transition.apply.states
                                if use_states_for_readout else
                                []) + [attention.take_glimpses.outputs[0]]

        if use_dependent_words_for_labels:
            readout_source_names.append('attended')

        readout_config = dict(readout_dim=num_phonemes,
                              source_names=readout_source_names,
                              emitter=emitter,
                              feedback_brick=feedback,
                              name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [
                    Bias(post_merge_dims[0]).apply,
                    post_merge_activation.apply,
                    MLP(
                        [post_merge_activation] *
                        (len(post_merge_dims) - 1) + [Identity()],
                        # MLP was not designed to support Maxout as an
                        # activation (because Maxout, in a way, is not one).
                        # However, a single-layer Maxout network works with
                        # the trick below; for deeper Maxout networks one has
                        # to use the Sequence brick.
                        [
                            d //
                            getattr(post_merge_activation, 'num_pieces', 1)
                            for d in post_merge_dims
                        ] + [num_phonemes]).apply,
                ],
                name='post_merge')
        readout = Readout(**readout_config)

        generator = Generator(
            readout=readout,
            transition=transition,
            attention=attention,
            dim_dec=dim_dec,
            pointer_weight=self.pointer_weight,
            transition_with_att_class=transition_with_att_class,
            name="generator")

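        # Remember which transitions expose a `regularize` method; these are
        # the bricks on which regularization can later be applied.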
        for brick in possible_regularization_bricks:
            if 'regularize' in dir(brick):
                self.regularization_bricks += [brick]

        logger.info("Regularization bricks: {}".format(
            str(self.regularization_bricks)))

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]
        self.children.extend(self.additional_sources_mlp.values())

        # Create input variables
        self.inputs = self.bottom.get_batch_inputs()
        self.inputs_mask = self.bottom.get_mask()

        self.additional_sources = self.bottom.get_batch_additional_sources()

        self.labels = tensor.lmatrix('labels' + names_postfix)
        self.labels_mask = tensor.matrix('labels' + names_postfix + '_mask')
        #self.labels_mask = tensor.matrix('labels_mask'+names_postfix)

        self.single_inputs = self.bottom.get_single_sequence_inputs()
        self.single_labels = tensor.lvector('labels' + names_postfix)
        self.single_additional_sources = \
            self.bottom.get_single_additional_sources()
        self.n_steps = tensor.lscalar('n_steps' + names_postfix)