def __init__(self, recordings_source, labels_source, eos_label,
             num_features, num_phonemes,
             dim_dec, dims_bidir, dims_bottom,
             enc_transition, dec_transition,
             use_states_for_readout,
             attention_type,
             lm=None, character_map=None,
             subsample=None,
             dims_top=None,
             prior=None, conv_n=None,
             bottom_activation=None,
             post_merge_activation=None,
             post_merge_dims=None,
             dim_matcher=None,
             embed_outputs=True,
             dec_stack=1,
             conv_num_filters=1,
             data_prepend_eos=True,
             # softmax is the default set in SequenceContentAndConvAttention
             energy_normalizer=None,
             **kwargs):
    if bottom_activation is None:
        bottom_activation = Tanh()
    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(SpeechRecognizer, self).__init__(**kwargs)
    self.recordings_source = recordings_source
    self.labels_source = labels_source
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    bottom_activation = bottom_activation
    post_merge_activation = post_merge_activation

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    if dims_bottom:
        bottom = MLP([bottom_activation] * len(dims_bottom),
                     [num_features] + dims_bottom,
                     name="bottom")
    else:
        bottom = Identity(name='bottom')

    # BiRNN
    if not subsample:
        subsample = [1] * len(dims_bidir)
    encoder = Encoder(self.enc_transition, dims_bidir,
                      dims_bottom[-1] if len(dims_bottom) else num_features,
                      subsample)

    # The top part, on top of BiRNN but before the attention
    if dims_top:
        top = MLP([Tanh()],
                  [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                  name="top")
    else:
        top = Identity(name='top')

    if dec_stack == 1:
        transition = self.dec_transition(
            dim=dim_dec, activation=Tanh(), name="transition")
    else:
        transitions = [self.dec_transition(
                           dim=dim_dec, activation=Tanh(),
                           name="transition_{}".format(trans_level))
                       for trans_level in xrange(dec_stack)]
        transition = RecurrentStack(transitions=transitions,
                                    skip_connections=True)

    # Choose attention mechanism according to the configuration
    if attention_type == "content":
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
            name="cont_att")
    elif attention_type == "content_and_conv":
        attention = SequenceContentAndConvAttention(
            state_names=transition.apply.states,
            conv_n=conv_n,
            conv_num_filters=conv_num_filters,
            attended_dim=2 * dims_bidir[-1],
            match_dim=dim_matcher,
            prior=prior,
            energy_normalizer=energy_normalizer,
            name="conv_att")
    else:
        raise ValueError(
            "Unknown attention type {}".format(attention_type))

    if embed_outputs:
        feedback = LookupFeedback(num_phonemes + 1, dim_dec)
    else:
        feedback = OneOfNFeedback(num_phonemes + 1)

    if lm:
        # In case we use an LM, it is the Readout that is responsible
        # for normalization.
        emitter = LMEmitter()
    else:
        emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                 name="emitter")

    readout_config = dict(
        readout_dim=num_phonemes,
        source_names=(transition.apply.states if use_states_for_readout
                      else []) + [attention.take_glimpses.outputs[0]],
        emitter=emitter,
        feedback_brick=feedback,
        name="readout")

    if post_merge_dims:
        readout_config['merged_dim'] = post_merge_dims[0]
        readout_config['post_merge'] = InitializableSequence([
            Bias(post_merge_dims[0]).apply,
            post_merge_activation.apply,
            MLP([post_merge_activation] * (len(post_merge_dims) - 1)
                + [Identity()],
                # MLP was designed to support Maxout as activation
                # (because Maxout in a way is not one). However,
                # a single-layer Maxout network works with the trick below.
                # For a deeper Maxout network one has to use the
                # Sequence brick.
                [d // getattr(post_merge_activation, 'num_pieces', 1)
                 for d in post_merge_dims] + [num_phonemes]).apply,
        ], name='post_merge')

    readout = Readout(**readout_config)

    language_model = None
    if lm:
        lm_weight = lm.pop('weight', 0.0)
        normalize_am_weights = lm.pop('normalize_am_weights', True)
        normalize_lm_weights = lm.pop('normalize_lm_weights', False)
        normalize_tot_weights = lm.pop('normalize_tot_weights', False)
        am_beta = lm.pop('am_beta', 1.0)
        if (normalize_am_weights + normalize_lm_weights
                + normalize_tot_weights < 1):
            logger.warn(
                "Beam search is prone to fail with no log-prob normalization")
        language_model = LanguageModel(nn_char_map=character_map, **lm)
        readout = ShallowFusionReadout(
            lm_costs_name='lm_add',
            lm_weight=lm_weight,
            normalize_am_weights=normalize_am_weights,
            normalize_lm_weights=normalize_lm_weights,
            normalize_tot_weights=normalize_tot_weights,
            am_beta=am_beta,
            **readout_config)

    generator = SequenceGenerator(
        readout=readout, transition=transition, attention=attention,
        language_model=language_model,
        name="generator")

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generator = generator
    self.children = [encoder, top, bottom, generator]

    # Create input variables
    self.recordings = tensor.tensor3(self.recordings_source)
    self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
    self.labels = tensor.lmatrix(self.labels_source)
    self.labels_mask = tensor.matrix(self.labels_source + "_mask")
    self.batch_inputs = [self.recordings, self.recordings_mask,
                         self.labels, self.labels_mask]
    self.single_recording = tensor.matrix(self.recordings_source)
    self.single_transcription = tensor.lvector(self.labels_source)
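# --- Illustrative sketch, not part of the recognizer above ---
# The post_merge MLP divides every hidden dimensionality by
# post_merge_activation.num_pieces.  The reason: a Maxout unit with k
# pieces consumes k pre-activation values and emits their maximum, so
# asking the MLP for d linear outputs yields d // k units after the max.
# The helper below is a minimal NumPy restatement of that reduction (the
# function name and shapes are mine, nothing here comes from the codebase):
import numpy


def maxout(pre_activation, num_pieces):
    """Group the last axis into chunks of `num_pieces` and take the max."""
    last_dim = pre_activation.shape[-1]
    assert last_dim % num_pieces == 0
    grouped = pre_activation.reshape(
        pre_activation.shape[:-1] + (last_dim // num_pieces, num_pieces))
    return grouped.max(axis=-1)


pre = numpy.arange(12.0)      # 12 linear outputs from the MLP layer
out = maxout(pre, 3)          # 12 // 3 = 4 units survive the max
assert out.shape == (4,)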
def __init__(self,
             input_dims,
             input_num_chars,
             bos_label, eos_label,
             num_labels,
             dim_dec, dims_bidir,
             enc_transition, dec_transition,
             use_states_for_readout,
             attention_type,
             criterion,
             bottom,
             lm=None, token_map=None,
             bidir=True, window_size=None, max_length=None,
             subsample=None,
             dims_top=None,
             extra_input_dim=None,
             prior=None, conv_n=None,
             post_merge_activation=None,
             post_merge_dims=None,
             dim_matcher=None,
             embed_outputs=True,
             dim_output_embedding=None,
             reuse_bottom_lookup_table=False,
             dec_stack=1,
             conv_num_filters=1,
             data_prepend_eos=True,
             # softmax is the default set in SequenceContentAndConvAttention
             energy_normalizer=None,
             # for speech this is the approximate phoneme duration in frames
             max_decoded_length_scale=1,
             # for criteria involving generation of outputs, whether
             # or not they should be generated by the recognizer itself
             generate_predictions=True,
             compute_targets=True,
             extra_generation_steps=3,
             **kwargs):
    all_arguments = copy.deepcopy(locals())
    all_arguments.update(copy.deepcopy(kwargs))
    del all_arguments['kwargs']
    del all_arguments['self']

    if post_merge_activation is None:
        post_merge_activation = Tanh()

    super(EncoderDecoder, self).__init__(**kwargs)

    self.bos_label = bos_label
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack
    self.criterion = criterion

    self.generate_predictions = generate_predictions
    self.extra_generation_steps = extra_generation_steps
    self.compute_targets = compute_targets

    self.max_decoded_length_scale = max_decoded_length_scale

    post_merge_activation = post_merge_activation

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    bottom_class = bottom.pop('bottom_class')
    bottom = bottom_class(
        input_dims=input_dims, input_num_chars=input_num_chars,
        name='bottom', **bottom)

    # BiRNN
    if dims_bidir:
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition, dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample, bidir=bidir)
    elif window_size:
        encoder = ConvEncoder(
            max_length, bottom.get_dim(bottom.apply.outputs[0]), window_size)
    else:
        raise ValueError("Don't know which Encoder to use")
    dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

    # The top part, on top of BiRNN but before the attention
    if dims_top:
        top = MLP([Tanh()],
                  [dim_encoded] + dims_top + [dim_encoded], name="top")
    else:
        top = Identity(name='top')

    if dec_stack == 1:
        transition = self.dec_transition(
            dim=dim_dec, activation=Tanh(), name="transition")
    else:
        assert not extra_input_dim
        transitions = [self.dec_transition(
                           dim=dim_dec, activation=Tanh(),
                           name="transition_{}".format(trans_level))
                       for trans_level in xrange(dec_stack)]
        transition = RecurrentStack(transitions=transitions,
                                    skip_connections=True)

    # Choose attention mechanism according to the configuration
    if attention_type == "content":
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=dim_encoded, match_dim=dim_matcher,
            name="cont_att")
    elif attention_type == "content_and_conv":
        attention = SequenceContentAndConvAttention(
            state_names=transition.apply.states,
            conv_n=conv_n,
            conv_num_filters=conv_num_filters,
            attended_dim=dim_encoded,
            match_dim=dim_matcher,
            prior=prior,
            energy_normalizer=energy_normalizer,
            name="conv_att")
    else:
        raise ValueError("Unknown attention type {}"
                         .format(attention_type))

    if not embed_outputs:
        raise ValueError("embed_outputs=False is not supported any more")
    if not reuse_bottom_lookup_table:
        embedding = LookupTable(num_labels + 1,
                                dim_dec
                                if dim_output_embedding is None
                                else dim_output_embedding)
    else:
        embedding = bottom.children[0]
    feedback = Feedback(
        embedding=embedding,
        output_names=[s for s in transition.apply.sequences
                      if s != 'mask'])

    # Create a readout
    readout_config = dict(
        num_tokens=num_labels,
        input_names=(transition.apply.states if use_states_for_readout
                     else []) + [attention.take_glimpses.outputs[0]],
        name="readout")
    if post_merge_dims:
        readout_config['merge_dim'] = post_merge_dims[0]
        readout_config['post_merge'] = InitializableSequence([
            Bias(post_merge_dims[0]).apply,
            post_merge_activation.apply,
            MLP([post_merge_activation] * (len(post_merge_dims) - 1)
                + [Identity()],
                # MLP was designed to support Maxout as activation
                # (because Maxout in a way is not one). However,
                # a single-layer Maxout network works with the trick below.
                # For a deeper Maxout network one has to use the
                # Sequence brick.
                [d // getattr(post_merge_activation, 'num_pieces', 1)
                 for d in post_merge_dims] + [num_labels]).apply,
        ], name='post_merge')

    if 'reward' in criterion and criterion['name'] != 'log_likelihood':
        if criterion['reward'] == 'edit_distance':
            readout_config['reward_brick'] = EditDistanceReward(
                self.bos_label, self.eos_label)
        elif criterion['reward'] == 'delta_edit_distance':
            readout_config['reward_brick'] = EditDistanceReward(
                self.bos_label, self.eos_label, deltas=True)
        elif criterion['reward'] == 'bleu':
            readout_config['reward_brick'] = BleuReward(
                self.bos_label, self.eos_label, deltas=False)
        elif criterion['reward'] == 'delta_bleu':
            readout_config['reward_brick'] = BleuReward(
                self.bos_label, self.eos_label, deltas=True)
        else:
            raise ValueError("Unknown reward type")

    if criterion['name'] == 'log_likelihood':
        readout_class = SoftmaxReadout
    elif criterion['name'] == 'critic':
        readout_class = CriticReadout
        criterion_copy = dict(criterion)
        del criterion_copy['name']
        readout_config.update(**criterion_copy)
    elif criterion['name'] == 'reinforce':
        readout_class = ReinforceReadout
        readout_config['merge_names'] = list(readout_config['input_names'])
        readout_config['entropy'] = criterion.get('entropy')
        readout_config['input_names'] += ['attended', 'attended_mask']
    elif criterion['name'] in ['sarsa', 'actor_critic']:
        readout_class = ActorCriticReadout
        if criterion['name'] == 'actor_critic':
            critic_arguments = dict(all_arguments)
            # No worries, critic will not compute log likelihood values.
            critic_arguments['criterion'] = {
                'name': 'critic',
                'value_softmax': criterion.get('value_softmax'),
                'same_value_for_wrong': criterion.get('same_value_for_wrong'),
                'groundtruth_word_bonus': criterion.get(
                    'groundtruth_word_bonus'),
                'dueling_outputs': criterion.get('dueling_outputs')}
            critic_arguments['name'] = 'critic'
            if criterion.get('critic_uses_actor_states'):
                critic_arguments['extra_input_dim'] = dim_dec
            if (criterion.get('value_softmax')
                    or criterion.get('same_value_for_wrong')
                    or criterion.get('dueling_outputs')):
                # Add an extra output for the critic
                critic_arguments['num_labels'] = num_labels + 1
            if criterion.get('force_bidir'):
                critic_arguments['dims_bidir'] = [dim_dec]
            critic_arguments['reuse_bottom_lookup_table'] = True
            critic_arguments['input_num_chars'] = {'inputs': num_labels}
            if criterion.get('downsize_critic'):
                critic_arguments = _downsize_config(
                    critic_arguments, criterion['downsize_critic'])
            critic = EncoderDecoder(**critic_arguments)
            readout_config['critic'] = critic
        readout_config['merge_names'] = list(readout_config['input_names'])
        readout_config['freeze_actor'] = criterion.get('freeze_actor')
        readout_config['freeze_critic'] = criterion.get('freeze_critic')
        readout_config['critic_uses_actor_states'] = criterion.get(
            'critic_uses_actor_states')
        readout_config['critic_uses_groundtruth'] = criterion.get(
            'critic_uses_groundtruth')
        readout_config['critic_burnin_steps'] = criterion.get(
            'critic_burnin_steps')
        readout_config['critic_loss'] = criterion.get('critic_loss')
        readout_config['discount'] = criterion.get('discount')
        readout_config['entropy_reward_coof'] = criterion.get(
            'entropy_reward_coof')
        readout_config['cross_entropy_reward_coof'] = criterion.get(
            'cross_entropy_reward_coof')
        readout_config['value_penalty'] = criterion.get('value_penalty')
        readout_config['value_penalty_type'] = criterion.get(
            'value_penalty_type')
        readout_config['critic_policy_t'] = criterion.get('critic_policy_t')
        readout_config['bos_token'] = bos_label
        readout_config['accumulate_outputs'] = criterion.get(
            'accumulate_outputs')
        readout_config['use_value_biases'] = criterion.get('use_value_biases')
        readout_config['actor_grad_estimate'] = criterion.get(
            'actor_grad_estimate')
        readout_config['input_names'] += ['attended', 'attended_mask']
        # Note that the settings below are for the "clean" mode.
        # When get_cost_graph() is run with training=True, they
        # are temporarily overridden with the "real" settings from
        # "criterion".
        readout_config['compute_targets'] = True
        readout_config['trpo_coef'] = 0.0
        readout_config['solve_bellman'] = True
    else:
        raise ValueError("Unknown criterion {}".format(criterion['name']))
    readout = readout_class(**readout_config)

    if lm:
        raise ValueError("LM is currently not supported")

    recurrent = AttentionRecurrent(transition, attention)
    if extra_input_dim:
        recurrent = RecurrentWithExtraInput(
            recurrent, "extra_inputs", extra_input_dim,
            name="with_extra_inputs")
    generator = SequenceGenerator(
        recurrent=recurrent, readout=readout, feedback=feedback,
        name="generator")

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generator = generator
    self.softmax = Softmax()
    self.children = [encoder, top, bottom, generator, self.softmax]

    # Create input variables
    self.inputs = self.bottom.batch_inputs
    self.inputs_mask = self.bottom.mask

    self.labels = tensor.lmatrix('labels')
    self.labels_mask = tensor.matrix("labels_mask")

    self.predicted_labels = tensor.lmatrix('predicted_labels')
    self.predicted_mask = tensor.matrix('predicted_mask')
    self.prefix_labels = tensor.lmatrix('prefix_labels')
    self.prefix_steps = tensor.lscalar('prefix_steps')

    self.single_inputs = self.bottom.single_inputs
    self.single_labels = tensor.lvector('labels')
    self.single_predicted_labels = tensor.lvector('predicted_labels')
    self.n_steps = tensor.lscalar('n_steps')

    # Configure mixed_generate
    if criterion['name'] == 'actor_critic':
        critic = self.generator.readout.critic
        self.mixed_generate.sequences = []
        self.mixed_generate.states = (
            ['step']
            + self.generator.recurrent.apply.states
            + ['critic_' + name
               for name in critic.generator.recurrent.apply.states])
        self.mixed_generate.outputs = (
            ['samples', 'step']
            + self.generator.recurrent.apply.outputs
            + ['critic_' + name
               for name in critic.generator.recurrent.apply.outputs])
        self.mixed_generate.contexts = (
            self.generator.recurrent.apply.contexts
            + ['critic_' + name
               for name in critic.generator.recurrent.apply.contexts]
            + ['groundtruth', 'groundtruth_mask'])
        self.initial_states.outputs = self.mixed_generate.states

    self.prefix_generate.sequences = []
    self.prefix_generate.states = (
        ['step'] + self.generator.recurrent.apply.states)
    self.prefix_generate.outputs = (
        ['samples', 'step'] + self.generator.recurrent.apply.outputs)
    self.prefix_generate.contexts = self.generator.recurrent.apply.contexts
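# --- Illustrative sketch, not part of EncoderDecoder itself ---
# The constructor above only inspects `criterion` through criterion['name'],
# criterion['reward'] and a series of criterion.get(...) calls, so any key
# that is left out silently defaults to None.  The dictionary below lists
# the keys the actor-critic branch reads; the concrete values are
# placeholders for illustration, not recommended or verified settings.
example_actor_critic_criterion = {
    'name': 'actor_critic',
    'reward': 'edit_distance',   # or 'delta_edit_distance', 'bleu', 'delta_bleu'
    # keys forwarded into the critic's own 'critic' criterion
    'value_softmax': False,
    'same_value_for_wrong': False,
    'groundtruth_word_bonus': False,
    'dueling_outputs': False,
    # keys copied directly into readout_config
    'critic_uses_actor_states': True,
    'critic_uses_groundtruth': True,
    'critic_burnin_steps': 0,
    'critic_loss': None,
    'discount': 1.0,
    'freeze_actor': False,
    'freeze_critic': False,
    'value_penalty': None,
    'actor_grad_estimate': None,
}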
def __init__(self,
             input_dims,
             input_num_chars,
             eos_label,
             num_phonemes,
             dim_dec, dims_bidir,
             enc_transition, dec_transition,
             use_states_for_readout,
             attention_type,
             criterion,
             bottom,
             lm=None, character_map=None,
             bidir=True,
             subsample=None,
             dims_top=None,
             prior=None, conv_n=None,
             post_merge_activation=None,
             post_merge_dims=None,
             dim_matcher=None,
             embed_outputs=True,
             dim_output_embedding=None,
             dec_stack=1,
             conv_num_filters=1,
             data_prepend_eos=True,
             # softmax is the default set in SequenceContentAndConvAttention
             energy_normalizer=None,
             # for speech this is the approximate phoneme duration in frames
             max_decoded_length_scale=1,
             **kwargs):
    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(SpeechRecognizer, self).__init__(**kwargs)
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    self.criterion = criterion

    self.max_decoded_length_scale = max_decoded_length_scale

    post_merge_activation = post_merge_activation

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    bottom_class = bottom.pop('bottom_class')
    bottom = bottom_class(
        input_dims=input_dims, input_num_chars=input_num_chars,
        name='bottom', **bottom)

    # BiRNN
    if not subsample:
        subsample = [1] * len(dims_bidir)
    encoder = Encoder(self.enc_transition, dims_bidir,
                      bottom.get_dim(bottom.apply.outputs[0]),
                      subsample, bidir=bidir)
    dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

    # The top part, on top of BiRNN but before the attention
    if dims_top:
        top = MLP([Tanh()],
                  [dim_encoded] + dims_top + [dim_encoded], name="top")
    else:
        top = Identity(name='top')

    if dec_stack == 1:
        transition = self.dec_transition(
            dim=dim_dec, activation=Tanh(), name="transition")
    else:
        transitions = [self.dec_transition(
                           dim=dim_dec, activation=Tanh(),
                           name="transition_{}".format(trans_level))
                       for trans_level in xrange(dec_stack)]
        transition = RecurrentStack(transitions=transitions,
                                    skip_connections=True)

    # Choose attention mechanism according to the configuration
    if attention_type == "content":
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=dim_encoded, match_dim=dim_matcher,
            name="cont_att")
    elif attention_type == "content_and_conv":
        attention = SequenceContentAndConvAttention(
            state_names=transition.apply.states,
            conv_n=conv_n,
            conv_num_filters=conv_num_filters,
            attended_dim=dim_encoded,
            match_dim=dim_matcher,
            prior=prior,
            energy_normalizer=energy_normalizer,
            name="conv_att")
    else:
        raise ValueError("Unknown attention type {}"
                         .format(attention_type))

    if embed_outputs:
        feedback = LookupFeedback(num_phonemes + 1,
                                  dim_dec if dim_output_embedding is None
                                  else dim_output_embedding)
    else:
        feedback = OneOfNFeedback(num_phonemes + 1)

    if criterion['name'] == 'log_likelihood':
        emitter = SoftmaxEmitter(initial_output=num_phonemes, name="emitter")
        if lm:
            # In case we use an LM, it is the Readout that is responsible
            # for normalization.
            emitter = LMEmitter()
    elif criterion['name'].startswith('mse'):
        emitter = RewardRegressionEmitter(
            criterion['name'], eos_label, num_phonemes,
            criterion.get('min_reward', -1.0),
            name="emitter")
    else:
        raise ValueError("Unknown criterion {}".format(criterion['name']))
    readout_config = dict(
        readout_dim=num_phonemes,
        source_names=(transition.apply.states if use_states_for_readout
                      else []) + [attention.take_glimpses.outputs[0]],
        emitter=emitter,
        feedback_brick=feedback,
        name="readout")
    if post_merge_dims:
        readout_config['merged_dim'] = post_merge_dims[0]
        readout_config['post_merge'] = InitializableSequence([
            Bias(post_merge_dims[0]).apply,
            post_merge_activation.apply,
            MLP([post_merge_activation] * (len(post_merge_dims) - 1)
                + [Identity()],
                # MLP was designed to support Maxout as activation
                # (because Maxout in a way is not one). However,
                # a single-layer Maxout network works with the trick below.
                # For a deeper Maxout network one has to use the
                # Sequence brick.
                [d // getattr(post_merge_activation, 'num_pieces', 1)
                 for d in post_merge_dims] + [num_phonemes]).apply,
        ], name='post_merge')

    readout = Readout(**readout_config)

    language_model = None
    if lm and lm.get('path'):
        lm_weight = lm.pop('weight', 0.0)
        normalize_am_weights = lm.pop('normalize_am_weights', True)
        normalize_lm_weights = lm.pop('normalize_lm_weights', False)
        normalize_tot_weights = lm.pop('normalize_tot_weights', False)
        am_beta = lm.pop('am_beta', 1.0)
        if (normalize_am_weights + normalize_lm_weights
                + normalize_tot_weights < 1):
            logger.warn(
                "Beam search is prone to fail with no log-prob normalization")
        language_model = LanguageModel(nn_char_map=character_map, **lm)
        readout = ShallowFusionReadout(
            lm_costs_name='lm_add',
            lm_weight=lm_weight,
            normalize_am_weights=normalize_am_weights,
            normalize_lm_weights=normalize_lm_weights,
            normalize_tot_weights=normalize_tot_weights,
            am_beta=am_beta,
            **readout_config)

    generator = SequenceGenerator(
        readout=readout, transition=transition, attention=attention,
        language_model=language_model,
        name="generator")

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generator = generator
    self.children = [encoder, top, bottom, generator]

    # Create input variables
    self.inputs = self.bottom.batch_inputs
    self.inputs_mask = self.bottom.mask

    self.labels = tensor.lmatrix('labels')
    self.labels_mask = tensor.matrix("labels_mask")

    self.single_inputs = self.bottom.single_inputs
    self.single_labels = tensor.lvector('labels')
    self.n_steps = tensor.lscalar('n_steps')
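# --- Illustrative sketch, not part of SpeechRecognizer itself ---
# Shallow fusion is only activated when `lm` is a dict with a 'path'
# entry.  The keys below are exactly the ones popped by the constructor
# above; whatever remains in the dict afterwards (here just 'path') is
# forwarded to LanguageModel together with `character_map`.  The values,
# including the path, are hypothetical placeholders.
example_lm_config = {
    'path': 'lms/example_trigram.fst',   # hypothetical path, not a real file
    'weight': 0.5,                        # lm_weight in the log-linear combination
    'normalize_am_weights': True,         # renormalize acoustic-model scores
    'normalize_lm_weights': False,
    'normalize_tot_weights': False,
    'am_beta': 1.0,                       # scaling of the acoustic-model scores
}
# recognizer = SpeechRecognizer(..., lm=dict(example_lm_config), ...)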
def __init__(self,
             input_dims,
             input_num_chars,
             eos_label,
             num_phonemes,
             dim_dec, dims_bidir,
             enc_transition, dec_transition,
             use_states_for_readout,
             attention_type,
             criterion,
             bottom,
             lm=None, character_map=None,
             bidir=True,
             subsample=None,
             dims_top=None,
             prior=None, conv_n=None,
             post_merge_activation=None,
             post_merge_dims=None,
             dim_matcher=None,
             embed_outputs=True,
             dim_output_embedding=None,
             dec_stack=1,
             conv_num_filters=1,
             data_prepend_eos=True,
             # softmax is the default set in SequenceContentAndConvAttention
             energy_normalizer=None,
             # for speech this is the approximate phoneme duration in frames
             max_decoded_length_scale=1,
             **kwargs):
    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(SpeechRecognizer, self).__init__(**kwargs)
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    self.criterion = criterion

    self.max_decoded_length_scale = max_decoded_length_scale

    post_merge_activation = post_merge_activation

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    bottom_class = bottom.pop('bottom_class')
    bottom = bottom_class(input_dims=input_dims,
                          input_num_chars=input_num_chars,
                          name='bottom', **bottom)

    # BiRNN
    if not subsample:
        subsample = [1] * len(dims_bidir)
    encoder = Encoder(self.enc_transition, dims_bidir,
                      bottom.get_dim(bottom.apply.outputs[0]),
                      subsample, bidir=bidir)
    dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

    generators = [None, None]
    for i in range(2):
        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [dim_encoded] + dims_top + [dim_encoded],
                      name="top{}".format(i))
        else:
            top = Identity(name='top{}'.format(i))

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(),
                name="transition{}".format(i))
        else:
            transitions = [self.dec_transition(
                               dim=dim_dec, activation=Tanh(),
                               name="transition_{}_{}".format(i, trans_level))
                           for trans_level in xrange(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)

        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                name="cont_att{}".format(i))
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded,
                match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att{}".format(i))
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))

        if embed_outputs:
            feedback = LookupFeedback(
                num_phonemes + 1,
                dim_dec if dim_output_embedding is None
                else dim_output_embedding)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)

        if criterion['name'] == 'log_likelihood':
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter{}".format(i))
            if lm:
                # In case we use an LM, it is the Readout that is
                # responsible for normalization.
                emitter = LMEmitter()
        elif criterion['name'].startswith('mse'):
            emitter = RewardRegressionEmitter(
                criterion['name'], eos_label, num_phonemes,
                criterion.get('min_reward', -1.0),
                name="emitter")
        else:
            raise ValueError("Unknown criterion {}".format(
                criterion['name']))
        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states if use_states_for_readout
                          else []) + [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout{}".format(i))
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1)
                    + [Identity()],
                    # MLP was designed to support Maxout as activation
                    # (because Maxout in a way is not one). However,
                    # a single-layer Maxout network works with the trick
                    # below. For a deeper Maxout network one has to use
                    # the Sequence brick.
                    [d // getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_phonemes]).apply,
            ], name='post_merge{}'.format(i))

        readout = Readout(**readout_config)

        language_model = None
        if lm and lm.get('path'):
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if (normalize_am_weights + normalize_lm_weights
                    + normalize_tot_weights < 1):
                logger.warn(
                    "Beam search is prone to fail with no log-prob "
                    "normalization")
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generators[i] = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            language_model=language_model,
            name="generator{}".format(i))

    self.generator = generators[0]

    self.forward_to_backward = Linear(dim_dec, dim_dec)

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generators = generators
    self.children = [self.forward_to_backward, encoder, top,
                     bottom] + generators

    # Create input variables
    self.inputs = self.bottom.batch_inputs
    self.inputs_mask = self.bottom.mask

    self.labels = tensor.lmatrix('labels')
    self.labels_mask = tensor.matrix("labels_mask")

    self.single_inputs = self.bottom.single_inputs
    self.single_labels = tensor.lvector('labels')
    self.n_steps = tensor.lscalar('n_steps')
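# --- Illustrative sketch, not part of the recognizers above ---
# In all of these constructors `bottom` arrives as a configuration
# dictionary: 'bottom_class' is popped out and treated as a brick class,
# and the remaining entries become its keyword arguments.  The stand-in
# class and config below are purely hypothetical; they only mirror the
# calling convention, not any real bottom brick from the codebase.
class _FakeBottom(object):
    def __init__(self, input_dims, input_num_chars, name, **kwargs):
        self.input_dims = input_dims
        self.name = name
        self.extra = kwargs        # any leftover config keys end up here


bottom_config = {'bottom_class': _FakeBottom, 'some_extra_option': 3}
bottom_class = bottom_config.pop('bottom_class')   # same pattern as above
bottom = bottom_class(input_dims={'recordings': 40},
                      input_num_chars={}, name='bottom', **bottom_config)
# Note: pop() mutates the dict, so reusing one config for several
# recognizers requires passing a copy, e.g. bottom=dict(bottom_config).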
def __init__(self,
             input_sources,
             input_sources_dims,
             eos_label,
             num_phonemes,
             dim_dec, dims_bidir,
             enc_transition, dec_transition,
             use_states_for_readout,
             attention_type,
             criterion,
             bottom,
             enc_transition_params={},
             dec_transition_params={},
             names_postfix='',
             lm=None, character_map=None,
             bidir=True,
             bidir_aggregation='concat',
             subsample=None,
             dims_top=None,
             prior=None, conv_n=None,
             post_merge_activation=None,
             post_merge_dims=None,
             dim_matcher=None,
             embed_outputs=True,
             dim_output_embedding=None,
             dec_stack=1,
             conv_num_filters=1,
             data_prepend_eos=False,
             # softmax is the default set in SequenceContentAndConvAttention
             energy_normalizer=None,
             # for speech this is the approximate phoneme duration in frames
             max_decoded_length_scale=3,
             use_dependent_words_for_labels=False,
             use_dependent_words_for_attention=False,
             reproduce_rec_weight_init_bug=True,
             pointers_weight=0.5,
             tags_weight=1.0,
             tag_layer=-1,  # -1 is last, 0 is after first bidir layer
             dependency_type='recurrent_soft',
             **kwargs):
    if post_merge_activation is None:
        post_merge_activation = Tanh()

    self.regularization_bricks = []
    possible_regularization_bricks = []

    self.names_postfix = names_postfix
    self.mask_dict = {}
    self.pointers_name = 'pointers' + names_postfix

    self.additional_sources = kwargs.pop('additional_sources')
    self.additional_sources_dims = kwargs.pop('additional_sources_dims')

    self.pointer_weight = pointers_weight
    self.soft_pointer_val = kwargs.pop('pointers_soften', 0.0)
    self.soft_pointer = self.soft_pointer_val > 0.0

    self.tags_weight = tags_weight
    self.tag_layer = tag_layer
    self.train_tags = True
    if self.tags_weight < 0 or len(self.additional_sources) <= 1:
        self.train_tags = False

    self.dependency_type = dependency_type

    super(DependencyRecognizer, self).__init__(**kwargs)

    self.reproduce_rec_weight_init_bug = reproduce_rec_weight_init_bug
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    self.criterion = criterion

    self.max_decoded_length_scale = max_decoded_length_scale

    self.post_merge_activation = post_merge_activation

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    bottom_class = bottom.pop('bottom_class')
    bottom = bottom_class(input_sources=input_sources,
                          input_sources_dims=input_sources_dims,
                          name='bottom',
                          pointers_soften=self.soft_pointer,
                          additional_sources=self.additional_sources,
                          **bottom)

    # BiRNN
    if not subsample:
        subsample = [1] * len(dims_bidir)
    encoder = Encoder(self.enc_transition, dims_bidir,
                      bottom.output_dim,
                      subsample, bidir=bidir,
                      bidir_aggregation=bidir_aggregation,
                      enc_transition_params=enc_transition_params)
    possible_regularization_bricks += encoder.enc_transitions
    dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

    # The top part, on top of BiRNN but before the attention
    if dims_top:
        top = MLP([Tanh()],
                  [dim_encoded] + dims_top + [dim_encoded], name="top")
    else:
        top = Identity(name='top')

    self.additional_sources_mlp = {}
    ndim_softmax = NDimensionalSoftmax()
    ndim_softmax._extra_ndim = 1
    for source in self.additional_sources:
        if source != self.pointers_name:
            if len(self.names_postfix) > 0:
                source_glob_name = source[:-len(self.names_postfix)]
            else:
                source_glob_name = source
            self.additional_sources_mlp[source] = \
                MLP([ndim_softmax],
                    [dim_encoded, self.additional_sources_dims[source]],
                    name='additional_' + source_glob_name)

    if dec_stack == 1:
        transition = self.dec_transition(dim=dim_dec,
                                         activation=Tanh(),
                                         name="transition",
                                         **dec_transition_params)
        possible_regularization_bricks += [transition]
    else:
        transitions = [self.dec_transition(
                           dim=dim_dec, activation=Tanh(),
                           name="transition_{}".format(trans_level),
                           **dec_transition_params)
                       for trans_level in xrange(dec_stack)]
        possible_regularization_bricks += transitions
        transition = RecurrentStack(transitions=transitions,
                                    skip_connections=True)

    # Choose attention mechanism according to the configuration
    attention_class = ParsingAttention
    attention_kwargs = {}
    transition_with_att_class = ParsingAttentionRecurrent

    if self.dependency_type == "recurrent_soft":
        attention_kwargs['use_pointers'] = None
    elif self.dependency_type == "recurrent_hard":
        attention_kwargs['use_pointers'] = 'hard'
    elif self.dependency_type == "recurrent_semihard":
        attention_kwargs['use_pointers'] = 'semihard'
    else:
        raise ValueError("Unknown dependency type {}".format(
            self.dependency_type))

    if attention_type == "content":
        pass
    elif attention_type == "content_hard":
        attention_kwargs['hard_attention'] = True
    else:
        raise ValueError(
            "Unknown attention type {}".format(attention_type))

    if use_dependent_words_for_attention:
        attention_kwargs['use_word_annotations'] = True
        attention_kwargs['word_annontation_dim'] = dim_encoded

    attention = attention_class(state_names=transition.apply.states,
                                attended_dim=dim_encoded,
                                match_dim=dim_matcher,
                                name="cont_att",
                                **attention_kwargs)

    feedback = AttendedFeedback(num_phonemes + 1, dim_encoded)

    if criterion['name'] == 'log_likelihood':
        emitter = SoftmaxMultiEmitter(initial_output=num_phonemes,
                                      name="emitter")
    else:
        raise ValueError("Unknown criterion {}".format(criterion['name']))

    readout_source_names = (
        (transition.apply.states if use_states_for_readout else [])
        + [attention.take_glimpses.outputs[0]])
    if use_dependent_words_for_labels:
        readout_source_names.append('attended')

    readout_config = dict(readout_dim=num_phonemes,
                          source_names=readout_source_names,
                          emitter=emitter,
                          feedback_brick=feedback,
                          name="readout")

    if post_merge_dims:
        readout_config['merged_dim'] = post_merge_dims[0]
        readout_config['post_merge'] = InitializableSequence([
            Bias(post_merge_dims[0]).apply,
            post_merge_activation.apply,
            MLP([post_merge_activation] * (len(post_merge_dims) - 1)
                + [Identity()],
                # MLP was designed to support Maxout as activation
                # (because Maxout in a way is not one). However,
                # a single-layer Maxout network works with the trick below.
                # For a deeper Maxout network one has to use the
                # Sequence brick.
                [d // getattr(post_merge_activation, 'num_pieces', 1)
                 for d in post_merge_dims] + [num_phonemes]).apply,
        ], name='post_merge')

    readout = Readout(**readout_config)

    generator = Generator(readout=readout,
                          transition=transition,
                          attention=attention,
                          dim_dec=dim_dec,
                          pointer_weight=self.pointer_weight,
                          transition_with_att_class=transition_with_att_class,
                          name="generator")

    for brick in possible_regularization_bricks:
        if 'regularize' in dir(brick):
            self.regularization_bricks += [brick]

    logger.info("Regularization bricks: {}".format(
        str(self.regularization_bricks)))

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generator = generator
    self.children = [encoder, top, bottom, generator]
    self.children.extend(self.additional_sources_mlp.values())

    # Create input variables
    self.inputs = self.bottom.get_batch_inputs()
    self.inputs_mask = self.bottom.get_mask()
    self.additional_sources = self.bottom.get_batch_additional_sources()

    self.labels = tensor.lmatrix('labels' + names_postfix)
    self.labels_mask = tensor.matrix('labels' + names_postfix + '_mask')
    # self.labels_mask = tensor.matrix('labels_mask' + names_postfix)

    self.single_inputs = self.bottom.get_single_sequence_inputs()
    self.single_labels = tensor.lvector('labels' + names_postfix)
    self.single_additional_sources = \
        self.bottom.get_single_additional_sources()
    self.n_steps = tensor.lscalar('n_steps' + names_postfix)
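# --- Illustrative sketch, not part of DependencyRecognizer itself ---
# The attention dispatch above can be summarised as a pure function from
# (dependency_type, attention_type) to the extra kwargs handed to
# ParsingAttention.  This hypothetical helper only restates that mapping;
# the recognizer does not actually define or use it.
def parsing_attention_kwargs(dependency_type, attention_type):
    kwargs = {}
    pointer_modes = {'recurrent_soft': None,
                     'recurrent_hard': 'hard',
                     'recurrent_semihard': 'semihard'}
    if dependency_type not in pointer_modes:
        raise ValueError("Unknown dependency type {}".format(dependency_type))
    kwargs['use_pointers'] = pointer_modes[dependency_type]
    if attention_type == 'content_hard':
        kwargs['hard_attention'] = True
    elif attention_type != 'content':
        raise ValueError("Unknown attention type {}".format(attention_type))
    return kwargs


assert parsing_attention_kwargs('recurrent_hard', 'content_hard') == \
    {'use_pointers': 'hard', 'hard_attention': True}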