Example no. 1
    def __init__(self,
                 name: str,
                 encoders: List[TemporalStateful],
                 vocabulary: Vocabulary,
                 data_id: str,
                 max_output_len: int = None,
                 hidden_dim: int = None,
                 activation: Callable = tf.nn.relu,
                 dropout_keep_prob: float = 1.0,
                 add_start_symbol: bool = False,
                 add_end_symbol: bool = False,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        check_argument_types()
        ModelPart.__init__(self, name, reuse, save_checkpoint, load_checkpoint,
                           initializers)

        self.encoders = encoders
        self.vocabulary = vocabulary
        self.data_id = data_id
        self.max_output_len = max_output_len
        self.hidden_dim = hidden_dim
        self.activation = activation
        self.dropout_keep_prob = dropout_keep_prob
        self.add_start_symbol = add_start_symbol
        self.add_end_symbol = add_end_symbol
Example no. 2
def beam_search_runner_range(
        output_series: str,
        decoder: BeamSearchDecoder,
        max_rank: int = None,
        postprocess: Callable[[List[str]], List[str]] = None) -> List[
            BeamSearchRunner]:
    """Return beam search runners for a range of ranks from 1 to max_rank.

    This means there is max_rank output series where the n-th series contains
    the n-th best hypothesis from the beam search.

    Args:
        output_series: Prefix of output series.
        decoder: Beam search decoder shared by all runners.
        max_rank: Maximum rank of the hypotheses.
        postprocess: Series-level postprocess applied on output.

    Returns:
        List of beam search runners getting hypotheses with rank from 1 to
        max_rank.
    """
    check_argument_types()

    if max_rank is None:
        max_rank = decoder.beam_size

    if max_rank > decoder.beam_size:
        raise ValueError(
            ("The maximum rank ({}) cannot be "
             "bigger than beam size {}.").format(
                 max_rank, decoder.beam_size))

    return [BeamSearchRunner("{}.rank{:03d}".format(output_series, r),
                             decoder, r, postprocess)
            for r in range(1, max_rank + 1)]
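
The rank-specific output series names are built from the prefix with a zero-padded rank suffix. A minimal, self-contained sketch of that naming scheme (the prefix "bs_target" is made up for illustration):

output_series = "bs_target"
names = ["{}.rank{:03d}".format(output_series, r) for r in range(1, 4)]
# names == ["bs_target.rank001", "bs_target.rank002", "bs_target.rank003"]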
Example no. 3
    def __init__(self,
                 name: str,
                 input_sequence: Attendable,
                 hidden_size: int,
                 num_heads: int,
                 output_size: int = None,
                 state_proj_size: int = None,
                 dropout_keep_prob: float = 1.0,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Initialize an instance of the encoder."""
        check_argument_types()
        ModelPart.__init__(self, name, reuse, save_checkpoint, load_checkpoint,
                           initializers)

        self.input_sequence = input_sequence
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.output_size = output_size
        self.state_proj_size = state_proj_size
        self.dropout_keep_prob = dropout_keep_prob

        if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0:
            raise ValueError("Dropout keep prob must be inside (0,1].")
Example no. 4
    def __init__(self,
                 name: str,
                 encoder: TemporalStateful,
                 vocabulary: Vocabulary,
                 data_id: str,
                 max_length: int = None,
                 merge_repeated_targets: bool = False,
                 merge_repeated_outputs: bool = True,
                 beam_width: int = 1,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        check_argument_types()
        ModelPart.__init__(self, name, reuse, save_checkpoint, load_checkpoint,
                           initializers)

        self.encoder = encoder
        self.vocabulary = vocabulary
        self.data_id = data_id
        self.max_length = max_length

        self.merge_repeated_targets = merge_repeated_targets
        self.merge_repeated_outputs = merge_repeated_outputs
        self.beam_width = beam_width
Example no. 5
    def __init__(self,
                 name: str,
                 encoders: List[TemporalStateful],
                 embedded_sequence: EmbeddedSequence,
                 data_id: str,
                 max_output_len: int = None,
                 hidden_dim: int = None,
                 activation: Callable = tf.nn.relu,
                 train_embeddings: bool = True,
                 dropout_keep_prob: float = 1.0,
                 add_start_symbol: bool = False,
                 add_end_symbol: bool = False,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:

        check_argument_types()
        SequenceLabeler.__init__(
            self, name, encoders, embedded_sequence.vocabulary, data_id,
            max_output_len, hidden_dim=hidden_dim, activation=activation,
            dropout_keep_prob=dropout_keep_prob,
            add_start_symbol=add_start_symbol, add_end_symbol=add_end_symbol,
            reuse=reuse, save_checkpoint=save_checkpoint,
            load_checkpoint=load_checkpoint, initializers=initializers)

        self.embedded_sequence = embedded_sequence
        self.train_embeddings = train_embeddings
Example no. 6
    def __init__(self,
                 output_series: str,
                 decoder: SupportedDecoder,
                 postprocess: Postprocessor = None) -> None:
        check_argument_types()
        BaseRunner[SupportedDecoder].__init__(self, output_series, decoder)
        self.postprocess = postprocess
Example no. 7
    def __init__(self,
                 decoders: List[Any],
                 decoder_weights: List[ObjectiveWeight] = None,
                 l1_weight: float = 0.,
                 l2_weight: float = 0.,
                 clip_norm: float = None,
                 optimizer: tf.train.Optimizer = None,
                 var_scopes: List[str] = None,
                 var_collection: str = None) -> None:
        check_argument_types()

        if decoder_weights is None:
            decoder_weights = [None for _ in decoders]

        if len(decoder_weights) != len(decoders):
            raise ValueError(
                "decoder_weights (length {}) do not match decoders (length {})"
                .format(len(decoder_weights), len(decoders)))

        objectives = [CostObjective(dec, w)
                      for dec, w in zip(decoders, decoder_weights)]

        GenericTrainer.__init__(
            self,
            objectives=objectives,
            l1_weight=l1_weight,
            l2_weight=l2_weight,
            clip_norm=clip_norm,
            optimizer=optimizer,
            var_scopes=var_scopes,
            var_collection=var_collection)
Example no. 8
def pooling(
        prev_layer: tf.Tensor,
        prev_mask: tf.Tensor,
        specification: MaxPoolSpec,
        layer_num: int) -> Tuple[tf.Tensor, tf.Tensor]:
    try:
        check_argument_types()
    except TypeError as err:
        raise ValueError((
            "Specification of a max-pooling layer (number {} in config) "
            'needs to have 4 members: "M", pool size, stride, padding, '
            "was {}").format(layer_num, specification)) from err
    pool_type, pool_size, stride, pad = specification

    if pool_type == "M":
        pool_fn = tf.layers.max_pooling2d
    elif pool_type == "A":
        pool_fn = tf.layers.average_pooling2d
    else:
        raise ValueError(
            ("Unsupported type of pooling: {}, use 'M' for max-pooling or "
             "'A' for average pooling.").format(pool_type))

    if pad not in ["same", "valid"]:
        raise ValueError(
            "Padding must be 'same' or 'valid', was '{}' in layer {}."
            .format(pad, layer_num + 1))

    with tf.variable_scope("layer_{}_max_pool".format(layer_num)):
        next_layer = pool_fn(prev_layer, pool_size, stride)
        next_mask = tf.layers.max_pooling2d(prev_mask, pool_size, stride)
    return next_layer, next_mask
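
For reference, a hedged sketch of the tuple layout this function unpacks: the assumed MaxPoolSpec has four members - the pool type ("M" or "A"), pool size, stride, and padding.

specification = ("M", 2, 2, "same")  # illustrative values
pool_type, pool_size, stride, pad = specification
# pool_type "M" selects max-pooling; "A" selects average pooling.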
Example no. 9
    def __init__(self,
                 name: str,
                 cnn: CNNEncoder) -> None:
        check_argument_types()
        ModelPart.__init__(
            self, name, save_checkpoint=None, load_checkpoint=None)
        self._cnn = cnn
Example no. 10
def word2vec_vocabulary(w2v: Word2Vec) -> Vocabulary:
    """Return the vocabulary from a word2vec object.

    This is a helper method used from configuration.
    """
    check_argument_types()
    return w2v.vocabulary
Example no. 11
def plain_convolution(
        prev_layer: tf.Tensor,
        prev_mask: tf.Tensor,
        specification: ConvSpec,
        batch_norm_callback: Callable[[tf.Tensor], tf.Tensor],
        layer_num: int) -> Tuple[tf.Tensor, tf.Tensor, int]:
    try:
        check_argument_types()
    except TypeError as err:
        raise ValueError((
            "Specification of a convolutional layer (number {} in config) "
            'needs to have 5 members: "C", kernel size, stride, '
            "padding, output channels, was {}").format(
                layer_num, specification)) from err
    kernel_size, stride, pad, out_channels = specification[1:]

    if pad not in ["same", "valid"]:
        raise ValueError(
            ("Padding must be 'same' or 'valid', "
             "was '{}' in layer {}.").format(pad, layer_num + 1))

    with tf.variable_scope("layer_{}_convolution".format(layer_num)):
        next_layer = tf.layers.conv2d(
            prev_layer, out_channels, kernel_size,
            activation=None, padding=pad)

        next_layer = batch_norm_callback(next_layer)
        next_layer = tf.nn.relu(next_layer)

        next_mask = tf.layers.max_pooling2d(
            prev_mask, kernel_size, stride, padding=pad)

    return next_layer, next_mask, out_channels
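
Analogously, a hedged sketch of the ConvSpec tuple assumed above: a "C" marker followed by kernel size, stride, padding, and the number of output channels.

specification = ("C", 3, 1, "same", 64)  # illustrative values
kernel_size, stride, pad, out_channels = specification[1:]
# kernel_size == 3, stride == 1, pad == "same", out_channels == 64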
Example no. 12
def mlp_output(layer_sizes: List[int],
               activation: Callable[[tf.Tensor], tf.Tensor] = tf.tanh,
               dropout_keep_prob: float = 1.0) -> Tuple[OutputProjection, int]:
    """Apply a multilayer perceptron.

    Compute RNN deep output using the multilayer perceptron
    with a specified activation function.
    (Pascanu et al., 2013 [https://arxiv.org/pdf/1312.6026v5.pdf])

    Arguments:
        layer_sizes: A list of sizes of the hidden layers of the MLP.
        dropout_keep_prob: The dropout keep probability.
        activation: The activation function to use in each layer.
    """
    check_argument_types()

    def _projection(prev_state, prev_output, ctx_tensors, train_mode):
        mlp_input = tf.concat([prev_state, prev_output] + ctx_tensors, 1)

        return multilayer_projection(mlp_input, layer_sizes,
                                     activation=activation,
                                     dropout_keep_prob=dropout_keep_prob,
                                     train_mode=train_mode,
                                     scope="deep_output_mlp")

    return _projection, layer_sizes[-1]
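
The helper returns a pair: the projection callable and the width of its output, which is just the last entry of layer_sizes. A hypothetical call (the layer sizes are made up):

layer_sizes = [600, 300]
# projection_fn, output_size = mlp_output(layer_sizes, dropout_keep_prob=0.8)
# output_size == layer_sizes[-1] == 300; the decoder later calls
# projection_fn(prev_state, prev_output, ctx_tensors, train_mode).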
Example no. 13
    def __init__(self,
                 wrapper: str,
                 name: str = "MultEval",
                 encoding: str = "utf-8",
                 metric: str = "bleu",
                 language: str = "en") -> None:
        """Initialize the wrapper.

        Arguments:
            wrapper: Path to multeval.sh script
            name: Name of the evaluator
            encoding: Encoding of input files
            language: Language of hypotheses and references
            metric: Evaluation metric "bleu", "ter", "meteor"
        """
        check_argument_types()
        super().__init__("{}_{}_{}".format(name, metric, language))

        self.wrapper = wrapper
        self.encoding = encoding
        self.language = language
        self.metric = metric

        if self.metric not in ["bleu", "ter", "meteor"]:
            warn("{} metric is not valid. Using bleu instead.".
                 format(self.metric))
            self.metric = "bleu"
Example no. 14
    def __init__(self,
                 num_sessions: int,
                 num_threads: int,
                 save_n_best: int = 1,
                 minimize_metric: bool = False,
                 gpu_allow_growth: bool = True,
                 per_process_gpu_memory_fraction: float = 1.0,
                 enable_tf_debug: bool = False) -> None:
        """Initialize a TensorflowManager.

        At this moment the graph must already exist. This method creates the
        required number of TensorFlow sessions and initializes them from the
        provided variable files, if any are given.

        Args:
            num_sessions: Number of sessions to be initialized.
            num_threads: Number of threads sessions will run in.
            save_n_best: How many best models to keep
            minimize_metric: Whether the best model is the one with the lowest
                or the highest score
            gpu_allow_growth: TF to allocate incrementally, not all at once.
            per_process_gpu_memory_fraction: Limit TF memory use.
        """
        check_argument_types()

        self.session_cfg = tf.ConfigProto()
        self.session_cfg.inter_op_parallelism_threads = num_threads
        self.session_cfg.intra_op_parallelism_threads = num_threads
        self.session_cfg.allow_soft_placement = True  # needed for more GPUs
        # pylint: disable=no-member
        self.session_cfg.gpu_options.allow_growth = gpu_allow_growth
        self.session_cfg.gpu_options.per_process_gpu_memory_fraction = \
            per_process_gpu_memory_fraction
        # pylint: enable=no-member

        if save_n_best < 1:
            raise Exception("save_n_best parameter must be greater than zero")
        self.saver_max_to_keep = save_n_best
        self.minimize_metric = minimize_metric
        self.num_sessions = num_sessions

        self.sessions = [tf.Session(config=self.session_cfg)
                         for _ in range(self.num_sessions)]

        if enable_tf_debug:
            self.sessions = [tf_debug.LocalCLIDebugWrapperSession(sess)
                             for sess in self.sessions]

        self.saver = None

        self.best_score_index = None  # type: Optional[int]
        self.best_score_epoch = 0
        self.best_score_batch = 0

        init_score = np.inf if self.minimize_metric else -np.inf
        self.saved_scores = [init_score for _ in range(self.saver_max_to_keep)]
        self.best_score = init_score

        self.variables_files = []  # type: List[str]
        self._best_vars_file = None  # type: Optional[str]
Example no. 15
def maxout_output(
        maxout_size: int,
        dropout_keep_prob: float = 1.0) -> Tuple[OutputProjection, int]:
    """Apply maxout.

    Compute the RNN output from the previous state and output and from the
    context tensors returned by the attention mechanisms.

    This function corresponds to the equations for computing t_tilde in the
    Bahdanau et al. (2015) paper, page 14: the maxout projection applied
    before the final linear projection.

    Arguments:
        maxout_size: The size of the hidden maxout layer in the deep output

    Returns:
        The maxout projection of the concatenated inputs.
    """
    check_argument_types()

    def _projection(prev_state, prev_output, ctx_tensors, train_mode):
        state_out_ctx = tf.concat([prev_state, prev_output] + ctx_tensors, 1)
        return dropout(
            maxout(state_out_ctx, maxout_size),
            dropout_keep_prob, train_mode)

    return _projection, maxout_size
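
The maxout() helper itself is not shown here; one common formulation (an assumption, not necessarily the exact implementation used above) projects the input to 2 * maxout_size units and takes an element-wise maximum over each pair. A rough numpy sketch:

import numpy as np

def maxout_sketch(x, weights, bias, maxout_size):
    # Project to twice the maxout size, then take the max over pairs of units.
    projected = x @ weights + bias                    # (batch, 2 * maxout_size)
    reshaped = projected.reshape(-1, maxout_size, 2)
    return reshaped.max(axis=2)                       # (batch, maxout_size)

x = np.random.randn(5, 16)
weights = np.random.randn(16, 2 * 8)
bias = np.zeros(2 * 8)
out = maxout_sketch(x, weights, bias, maxout_size=8)  # shape (5, 8)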
Example no. 16
def from_file_list(prefix: str,
                   shape: List[int],
                   suffix: str = "",
                   default_tensor_name: str = "arr_0") -> Callable:
    """Load a list of numpy arrays from a list of .npz numpy files.

    Args:
        prefix: A common prefix for the files in the list.
        shape: The shape of the numpy arrays stored in the referenced files.
        suffix: An optional suffix that will be appended to each path.
        default_tensor_name: Key of the tensors to load from the npz files.

    Returns:
        A generator function that yields the loaded arrays.
    """
    check_argument_types()

    def load(files: List[str]) -> Iterable[np.ndarray]:
        for list_file in files:
            with open(list_file, encoding="utf-8") as f_list:
                for line in f_list:
                    path = os.path.join(prefix, line.rstrip()) + suffix
                    with np.load(path) as npz:
                        arr = npz[default_tensor_name]
                        arr_shape = list(arr.shape)
                        if arr_shape != shape:
                            raise ValueError(
                                "Shapes do not match: expected {}, found {}"
                                .format(shape, arr_shape))
                        yield arr
    return load
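
A small, self-contained usage sketch of the expected file layout (directory and file names are made up): each line of a list file names an .npz archive under prefix, and the array is read from default_tensor_name.

import os
import tempfile
import numpy as np

tmp = tempfile.mkdtemp()
np.savez(os.path.join(tmp, "item1.npz"), arr_0=np.zeros((4, 8)))
with open(os.path.join(tmp, "files.txt"), "w", encoding="utf-8") as f_out:
    f_out.write("item1.npz\n")

load = from_file_list(prefix=tmp, shape=[4, 8])
arrays = list(load([os.path.join(tmp, "files.txt")]))
# arrays[0].shape == (4, 8)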
Example no. 17
    def __init__(self, n: int = 4,
                 deduplicate: bool = False,
                 name: str = None,
                 multiple_references_separator: str = None) -> None:
        """Instantiate BLEU evaluator.

        Args:
            n: Longest n-grams considered.
            deduplicate: Flag whether repeated tokens should be treated as one.
            name: Name displayed in the logs and TensorBoard.
            multiple_references_separator: Token that separates multiple
                reference sentences. If ``None``, it assumes the reference is
                one sentence only.
        """
        check_argument_types()

        if name is None:
            name = "BLEU-{}".format(n)
            if deduplicate:
                name += "-dedup"
        super().__init__(name)

        self.n = n
        self.deduplicate = deduplicate
        self.multiple_references_separator = multiple_references_separator
Example no. 18
    def __init__(self,
                 output_series: str,
                 encoder: GenericModelPart,
                 attribute: str = "output",
                 select_session: int = None) -> None:
        """Initialize the representation runner.

        Args:
            output_series: Name of the output series with vectors.
            encoder: The encoder to use. This can be any ``GenericModelPart``
                object.
            attribute: The name of the encoder attribute that contains the
                data.
            select_session: Index of the TensorFlow session to use in the
                case of model ensembles.
        """
        check_argument_types()

        if attribute not in dir(encoder):
            warn("The encoder '{}' seems not to have the specified "
                 "attribute '{}'".format(encoder, attribute))

        TensorRunner.__init__(
            self,
            output_series,
            modelparts=[encoder],
            tensors=[attribute],
            batch_dims=[0],
            tensors_by_name=[],
            batch_dims_by_name=[],
            select_session=select_session,
            single_tensor=True)
Example no. 19
    def __init__(self,
                 name: str,
                 dimension: int,
                 data_id: str,
                 output_shape: int = None,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Instantiate StatefulFiller.

        Arguments:
            name: Name of the model part.
            dimension: Dimensionality of the input.
            data_id: Series containing the numpy objects.
            output_shape: Dimension of optional state projection.
        """
        check_argument_types()
        ModelPart.__init__(
            self, name, reuse, save_checkpoint, load_checkpoint, initializers)

        self.data_id = data_id
        self.dimension = dimension
        self.output_shape = output_shape

        if self.dimension <= 0:
            raise ValueError("Input vector dimension must be positive.")
        if self.output_shape is not None and self.output_shape <= 0:
            raise ValueError("Output vector dimension must be positive.")
Example no. 20
    def __init__(self,
                 name: str,
                 input_shape: List[int],
                 data_id: str,
                 projection_dim: int = None,
                 ff_hidden_dim: int = None,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Instantiate SpatialFiller.

        Args:
            name: Name of the model part.
            input_shape: Dimensionality of the input.
            data_id: Name of the data series with numpy objects.
            projection_dim: Optional, dimension of the states projection.
        """
        check_argument_types()
        ModelPart.__init__(
            self, name, reuse, save_checkpoint, load_checkpoint, initializers)

        self.data_id = data_id
        self.input_shape = input_shape
        self.projection_dim = projection_dim
        self.ff_hidden_dim = ff_hidden_dim

        if self.ff_hidden_dim is not None and self.projection_dim is None:
            raise ValueError(
                "projection_dim must be provided when using ff_hidden_dim")

        if len(self.input_shape) != 3:
            raise ValueError("The input shape should have 3 dimensions.")
Example no. 21
def linear_encoder_projection(dropout_keep_prob: float) -> EncoderProjection:
    """Return a linear encoder projection.

    Return a projection function which applies dropout on concatenated
    encoder final states and returns a linear projection to a rnn_size-sized
    tensor.

    Arguments:
        dropout_keep_prob: The dropout keep probability
    """
    check_argument_types()

    def func(train_mode: tf.Tensor,
             rnn_size: int,
             encoders: List[Stateful]) -> tf.Tensor:

        if rnn_size is None:
            raise ValueError(
                "You must supply rnn_size for this type of encoder projection")

        en_concat = concat_encoder_projection(train_mode, None, encoders)

        return dropout(
            tf.layers.dense(en_concat, rnn_size, name="encoders_projection"),
            dropout_keep_prob, train_mode)

    return cast(EncoderProjection, func)
Example no. 22
def from_t2t_vocabulary(path: str,
                        encoding: str = "utf-8") -> "Vocabulary":
    """Load a vocabulary generated during tensor2tensor training.

    Arguments:
        path: The path to the vocabulary file.
        encoding: The encoding of the vocabulary file (defaults to UTF-8).

    Returns:
        The new Vocabulary instance.
    """
    check_argument_types()
    vocabulary = []  # type: List[str]

    with open(path, encoding=encoding) as wordlist:
        for line in wordlist:
            line = line.strip()

            # T2T vocab tends to wrap words in single quotes
            if ((line.startswith("'") and line.endswith("'"))
                    or (line.startswith('"') and line.endswith('"'))):
                line = line[1:-1]

            if line in ["<pad>", "<EOS>"]:
                continue

            vocabulary.append(line)

    log("Vocabulary form wordlist loaded, containing {} words"
        .format(len(vocabulary)))
    log_sample(vocabulary)

    return Vocabulary(vocabulary)
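
A minimal sketch of the quote-stripping and special-token filtering above, using a tiny in-memory word list instead of a file:

lines = ["'<pad>'", "'<EOS>'", "'hello'", '"world"']
vocab = []
for line in lines:
    line = line.strip()
    if ((line.startswith("'") and line.endswith("'"))
            or (line.startswith('"') and line.endswith('"'))):
        line = line[1:-1]
    if line in ["<pad>", "<EOS>"]:
        continue          # drop the special tokens handled elsewhere
    vocab.append(line)
# vocab == ["hello", "world"]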
Example no. 23
    def __init__(self,
                 name: str,
                 parent_decoder: AutoregressiveDecoder,
                 beam_size: int,
                 max_steps: int,
                 length_normalization: float) -> None:
        """Construct the beam search decoder graph.

        Arguments:
            name: The name for the model part.
            parent_decoder: An autoregressive decoder from which to sample.
            beam_size: The number of hypotheses in the beam.
            max_steps: The maximum number of time steps to perform.
            length_normalization: The alpha parameter from Eq. 14 of
                Wu et al. (2016).
        """
        check_argument_types()
        ModelPart.__init__(self, name)

        self.parent_decoder = parent_decoder
        self.beam_size = beam_size
        self.length_normalization = length_normalization
        self.max_steps_int = max_steps

        # Create a placeholder for maximum number of steps that is necessary
        # during ensembling, when the decoder is called repetitively with the
        # max_steps attribute set to one.
        self.max_steps = tf.placeholder_with_default(self.max_steps_int, [])

        self._initial_loop_state = None  # type: Optional[BeamSearchLoopState]
Example no. 24
    def __init__(self,
                 name: str,
                 smooth_method: str = "exp",
                 smooth_value: float = 0.0,
                 force: bool = False,
                 lowercase: bool = False,
                 tokenize: str = "none",
                 use_effective_order: bool = False) -> None:
        check_argument_types()
        super().__init__(name)

        if tokenize not in TOKENIZERS:
            raise ValueError(
                "Unknown tokenizer '{}'. You must use one of sacrebleu's "
                "tokenizers: {}".format(tokenize, str(TOKENIZERS)))

        if smooth_method not in SMOOTH_VARIANTS:
            raise ValueError(
                "Unknown smoothing '{}'. You must use one of sacrebleu's "
                "smoothing methods: {}".format(smooth_method,
                                               str(SMOOTH_VARIANTS)))

        self.smooth_method = smooth_method
        self.smooth_value = smooth_value
        self.force = force
        self.lowercase = lowercase
        self.tokenize = tokenize
        self.use_effective_order = use_effective_order
Example no. 25
    def __init__(
            self,
            name: str,
            parent: TemporalStateful,
            factor: int,
            projection_size: int = None,
            projection_activation: Activation = None) -> None:
        """Initialize SentenceSplitter.

        Args:
            parent: TemporalStateful whose states will be split.
            factor: Factor by which the states will be split - the resulting
                sequence will be longer by this factor.
            projection_size: If not None, specifies dimensionality of a
                projection before state splitting.
            projection_activation: Non-linearity function for the optional
                projection.
        """
        check_argument_types()

        ModelPart.__init__(
            self, name=name, save_checkpoint=None, load_checkpoint=None,
            initializers=None)
        self.parent = parent
        self.factor = factor
        self.projection_size = projection_size
        self.activation = projection_activation

        if projection_size is not None and projection_size % factor != 0:
            raise ValueError((
                "Dimension of projection ({}) must be "
                "dividable by the given factor ({}).").format(
                    projection_size, factor))
Example no. 26
    def __init__(self,
                 name: str,
                 n_heads: int,
                 keys_encoder: Attendable,
                 values_encoder: Attendable = None,
                 dropout_keep_prob: float = 1.0,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        check_argument_types()
        BaseAttention.__init__(self, name, reuse, save_checkpoint,
                               load_checkpoint, initializers)

        self.n_heads = n_heads
        self.dropout_keep_prob = dropout_keep_prob

        self.keys_encoder = keys_encoder

        if values_encoder is not None:
            self.values_encoder = values_encoder
        else:
            self.values_encoder = self.keys_encoder

        if self.n_heads <= 0:
            raise ValueError("Number of heads must be greater than zero.")

        if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0:
            raise ValueError("Dropout keep prob must be inside (0,1].")

        self._variable_scope.set_initializer(tf.variance_scaling_initializer(
            mode="fan_avg", distribution="uniform"))
Example no. 27
def single_tensor(files: List[str]) -> np.ndarray:
    """Load a single tensor from a numpy file."""
    check_argument_types()
    if len(files) == 1:
        return np.load(files[0])

    return np.concatenate([np.load(f) for f in files], axis=0)
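
A minimal sketch of the concatenation behaviour, using in-memory arrays instead of .npy files:

import numpy as np

part_a = np.zeros((2, 3))
part_b = np.ones((4, 3))
merged = np.concatenate([part_a, part_b], axis=0)
# merged.shape == (6, 3): parts are stacked along the first (batch) axis.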
Example no. 28
    def __init__(self,
                 output_series: str,
                 attention: BaseAttention,
                 decoder: Decoder) -> None:
        check_argument_types()
        BaseRunner[BaseAttention].__init__(self, output_series, attention)

        self._key = "{}_run".format(decoder.name)
Example no. 29
    def __init__(self, name: str = None) -> None:
        check_argument_types()
        if name is None:
            name = type(self).__name__
            if name.endswith("Evaluator"):
                name = name[:-9]

        self._name = name
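
A minimal sketch of the name-stripping logic above (the class name is made up); the slice [:-9] removes the nine characters of the "Evaluator" suffix:

name = "BLEUEvaluator"
if name.endswith("Evaluator"):
    name = name[:-len("Evaluator")]   # equivalent to name[:-9]
# name == "BLEU"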
Example no. 30
    def mapply(self, fn, args, kwargs):
        try:
            memo = typeguard._CallMemo(fn, args=args, kwargs=kwargs)
            typeguard.check_argument_types(memo)
        except TypeError as exc:
            print(exc)
            raise XMLRPCInvalidParamTypes(exc)

        return super().mapply(fn, args, kwargs)
Example no. 31
    def __call__(self,
                 speech_mix: Union[torch.Tensor, np.ndarray],
                 fs: int = 8000) -> List[torch.Tensor]:
        """Inference

        Args:
            speech_mix: Input speech data (Batch, Nsamples [, Channels])
            fs: sample rate
        Returns:
            [separated_audio1, separated_audio2, ...]

        """
        assert check_argument_types()

        # Input as audio signal
        if isinstance(speech_mix, np.ndarray):
            speech_mix = torch.as_tensor(speech_mix)

        assert speech_mix.dim() > 1, speech_mix.size()
        batch_size = speech_mix.size(0)
        speech_mix = speech_mix.to(getattr(torch, self.dtype))
        # lengths: (B,)
        lengths = speech_mix.new_full([batch_size],
                                      dtype=torch.long,
                                      fill_value=speech_mix.size(1))

        # a. To device
        speech_mix = to_device(speech_mix, device=self.device)
        lengths = to_device(lengths, device=self.device)

        if self.segmenting and lengths[0] > self.segment_size * fs:
            # Segment-wise speech enhancement/separation
            overlap_length = int(
                np.round(fs * (self.segment_size - self.hop_size)))
            num_segments = int(
                np.ceil((speech_mix.size(1) - overlap_length) /
                        (self.hop_size * fs)))
            t = T = int(self.segment_size * fs)
            pad_shape = speech_mix[:, :T].shape
            enh_waves = []
            range_ = trange if self.show_progressbar else range
            for i in range_(num_segments):
                st = int(i * self.hop_size * fs)
                en = st + T
                if en >= lengths[0]:
                    # en - st < T (last segment)
                    en = lengths[0]
                    speech_seg = speech_mix.new_zeros(pad_shape)
                    t = en - st
                    speech_seg[:, :t] = speech_mix[:, st:en]
                else:
                    t = T
                    speech_seg = speech_mix[:, st:en]  # B x T [x C]

                lengths_seg = speech_mix.new_full([batch_size],
                                                  dtype=torch.long,
                                                  fill_value=T)
                # b. Enhancement/Separation Forward
                feats, f_lens = self.enh_model.encoder(speech_seg, lengths_seg)
                feats, _, _ = self.enh_model.separator(feats, f_lens)
                processed_wav = [
                    self.enh_model.decoder(f, lengths_seg)[0] for f in feats
                ]
                if speech_seg.dim() > 2:
                    # multi-channel speech
                    speech_seg_ = speech_seg[:, self.ref_channel]
                else:
                    speech_seg_ = speech_seg

                if self.normalize_segment_scale:
                    # normalize the scale to match the input mixture scale
                    mix_energy = torch.sqrt(
                        torch.mean(speech_seg_[:, :t].pow(2),
                                   dim=1,
                                   keepdim=True))
                    enh_energy = torch.sqrt(
                        torch.mean(sum(processed_wav)[:, :t].pow(2),
                                   dim=1,
                                   keepdim=True))
                    processed_wav = [
                        w * (mix_energy / enh_energy) for w in processed_wav
                    ]
                # List[torch.Tensor(num_spk, B, T)]
                enh_waves.append(torch.stack(processed_wav, dim=0))

            # c. Stitch the enhanced segments together
            waves = enh_waves[0]
            for i in range(1, num_segments):
                # permutation between separated streams in last and current segments
                perm = self.cal_permumation(
                    waves[:, :, -overlap_length:],
                    enh_waves[i][:, :, :overlap_length],
                    criterion="si_snr",
                )
                # repermute separated streams in current segment
                for batch in range(batch_size):
                    enh_waves[i][:, batch] = enh_waves[i][perm[batch], batch]

                if i == num_segments - 1:
                    enh_waves[i][:, :, t:] = 0
                    enh_waves_res_i = enh_waves[i][:, :, overlap_length:t]
                else:
                    enh_waves_res_i = enh_waves[i][:, :, overlap_length:]

                # overlap-and-add (average over the overlapped part)
                waves[:, :, -overlap_length:] = (
                    waves[:, :, -overlap_length:] +
                    enh_waves[i][:, :, :overlap_length]) / 2
                # concatenate the residual parts of the later segment
                waves = torch.cat([waves, enh_waves_res_i], dim=2)
            # ensure the stitched length is same as input
            assert waves.size(2) == speech_mix.size(1), (waves.shape,
                                                         speech_mix.shape)
            waves = torch.unbind(waves, dim=0)
        else:
            # b. Enhancement/Separation Forward
            feats, f_lens = self.enh_model.encoder(speech_mix, lengths)
            feats, _, _ = self.enh_model.separator(feats, f_lens)
            waves = [self.enh_model.decoder(f, lengths)[0] for f in feats]

        assert len(waves) == self.num_spk, (len(waves), self.num_spk)
        assert len(waves[0]) == batch_size, (len(waves[0]), batch_size)
        if self.normalize_output_wav:
            waves = [
                (w / abs(w).max(dim=1, keepdim=True)[0] * 0.9).cpu().numpy()
                for w in waves
            ]  # list[(batch, sample)]
        else:
            waves = [w.cpu().numpy() for w in waves]

        return waves
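
A worked example of the segmentation arithmetic in the segment-wise branch above (all values are illustrative):

import numpy as np

fs, segment_size, hop_size, n_samples = 8000, 4.0, 2.0, 80000   # a 10 s mixture
overlap_length = int(np.round(fs * (segment_size - hop_size)))  # 16000 samples
num_segments = int(np.ceil((n_samples - overlap_length)
                           / (hop_size * fs)))                  # 4 segments
T = int(segment_size * fs)                                      # 32000 samples
# Segments start at 0, 16000, 32000 and 48000 samples; adjacent segments
# overlap by 16000 samples, which the overlap-and-add step averages.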
Example no. 32
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    fs: int,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    model_tag: Optional[str],
    allow_variable_data_keys: bool,
    segment_size: Optional[float],
    hop_size: Optional[float],
    normalize_segment_scale: bool,
    show_progressbar: bool,
    ref_channel: Optional[int],
    normalize_output_wav: bool,
    enh_s2t_task: bool,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build separate_speech
    separate_speech_kwargs = dict(
        train_config=train_config,
        model_file=model_file,
        segment_size=segment_size,
        hop_size=hop_size,
        normalize_segment_scale=normalize_segment_scale,
        show_progressbar=show_progressbar,
        ref_channel=ref_channel,
        normalize_output_wav=normalize_output_wav,
        device=device,
        dtype=dtype,
        enh_s2t_task=enh_s2t_task,
    )
    separate_speech = SeparateSpeech.from_pretrained(
        model_tag=model_tag,
        **separate_speech_kwargs,
    )

    # 3. Build data-iterator
    loader = EnhancementTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=EnhancementTask.build_preprocess_fn(
            separate_speech.enh_train_args, False),
        collate_fn=EnhancementTask.build_collate_fn(
            separate_speech.enh_train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    writers = []
    for i in range(separate_speech.num_spk):
        writers.append(
            SoundScpWriter(f"{output_dir}/wavs/{i + 1}",
                           f"{output_dir}/spk{i + 1}.scp"))

    for keys, batch in loader:
        assert isinstance(batch, dict), type(batch)
        assert all(isinstance(s, str) for s in keys), keys
        _bs = len(next(iter(batch.values())))
        assert len(keys) == _bs, f"{len(keys)} != {_bs}"
        batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}

        waves = separate_speech(**batch)
        for (spk, w) in enumerate(waves):
            for b in range(batch_size):
                writers[spk][keys[b]] = fs, w[b]

    for writer in writers:
        writer.close()
Example no. 33
    def run(
        cls,
        model: AbsESPnetModel,
        optimizers: Sequence[torch.optim.Optimizer],
        schedulers: Sequence[Optional[AbsScheduler]],
        train_iter_factory: AbsIterFactory,
        valid_iter_factory: AbsIterFactory,
        plot_attention_iter_factory: Optional[AbsIterFactory],
        reporter: Reporter,
        output_dir: Path,
        max_epoch: int,
        seed: int,
        patience: Optional[int],
        keep_nbest_models: int,
        early_stopping_criterion: Sequence[str],
        best_model_criterion: Sequence[Sequence[str]],
        val_scheduler_criterion: Sequence[str],
        trainer_options,
        distributed_option: DistributedOption,
    ) -> None:
        """Perform training. This method performs the main process of training."""
        assert check_argument_types()
        # NOTE(kamo): Don't check the type of trainer_options more strictly
        assert is_dataclass(trainer_options), type(trainer_options)

        # NOTE(kamo): trainer_options doesn't always have "train_dtype"
        use_apex = getattr(trainer_options, "train_dtype", "") in (
            "O0",
            "O1",
            "O2",
            "O3",
        )
        if use_apex:
            try:
                from apex import amp
            except ImportError:
                logging.error("You need to install apex. "
                              "See https://github.com/NVIDIA/apex#linux")

        start_epoch = reporter.get_epoch() + 1
        if start_epoch == max_epoch + 1:
            logging.warning(
                f"The training has already reached max_epoch: {max_epoch}"
            )

        if distributed_option.distributed:
            # Use torch DDP instead of apex DDP
            # https://github.com/NVIDIA/apex/issues/494
            dp_model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=(
                    # Perform multi-Process with multi-GPUs
                    [torch.cuda.current_device()]
                    if distributed_option.ngpu == 1
                    # Perform single-Process with multi-GPUs
                    else None),
                output_device=(torch.cuda.current_device()
                               if distributed_option.ngpu == 1 else None),
            )
        elif distributed_option.ngpu > 1:
            # apex.amp supports DataParallel now.
            dp_model = torch.nn.parallel.DataParallel(
                model,
                device_ids=list(range(distributed_option.ngpu)),
            )
        else:
            # NOTE(kamo): DataParallel also should work with ngpu=1,
            # but for debuggability it's better to keep this block.
            dp_model = model

        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            summary_writer = SummaryWriter(str(output_dir / "tensorboard"))
        else:
            summary_writer = None

        start_time = time.perf_counter()
        for iepoch in range(start_epoch, max_epoch + 1):
            if iepoch != start_epoch:
                logging.info(
                    "{}/{}epoch started. Estimated time to finish: {}".format(
                        iepoch,
                        max_epoch,
                        humanfriendly.format_timespan(
                            (time.perf_counter() - start_time) /
                            (iepoch - start_epoch) * (max_epoch - iepoch + 1)),
                    ))
            else:
                logging.info(f"{iepoch}/{max_epoch}epoch started")
            set_all_random_seed(seed + iepoch)

            reporter.set_epoch(iepoch)
            # 1. Train and validation for one-epoch
            with reporter.observe("train") as sub_reporter:
                all_steps_are_invalid = cls.train_one_epoch(
                    model=dp_model,
                    optimizers=optimizers,
                    schedulers=schedulers,
                    iterator=train_iter_factory.build_iter(iepoch),
                    reporter=sub_reporter,
                    summary_writer=summary_writer,
                    options=trainer_options,
                )

            with reporter.observe("valid") as sub_reporter:
                cls.validate_one_epoch(
                    model=dp_model,
                    iterator=valid_iter_factory.build_iter(iepoch),
                    reporter=sub_reporter,
                    options=trainer_options,
                )

            if not distributed_option.distributed or distributed_option.dist_rank == 0:
                # att_plot doesn't support distributed
                if plot_attention_iter_factory is not None:
                    with reporter.observe("att_plot") as sub_reporter:
                        cls.plot_attention(
                            model=model,
                            output_dir=output_dir / "att_ws",
                            summary_writer=summary_writer,
                            iterator=plot_attention_iter_factory.build_iter(
                                iepoch),
                            reporter=sub_reporter,
                            options=trainer_options,
                        )

            # 2. LR Scheduler step
            for scheduler in schedulers:
                if isinstance(scheduler, AbsValEpochStepScheduler):
                    scheduler.step(
                        reporter.get_value(*val_scheduler_criterion))
                elif isinstance(scheduler, AbsEpochStepScheduler):
                    scheduler.step()

            if not distributed_option.distributed or distributed_option.dist_rank == 0:
                # 3. Report the results
                logging.info(reporter.log_message())
                reporter.matplotlib_plot(output_dir / "images")
                reporter.tensorboard_add_scalar(summary_writer)

                # 4. Save/Update the checkpoint
                torch.save(
                    {
                        "model":
                        model.state_dict(),
                        "reporter":
                        reporter.state_dict(),
                        "optimizers": [o.state_dict() for o in optimizers],
                        "schedulers": [
                            s.state_dict() if s is not None else None
                            for s in schedulers
                        ],
                        "amp":
                        amp.state_dict() if use_apex else None,
                    },
                    output_dir / "checkpoint.pth",
                )

                # 5. Save the model and update the link to the best model
                torch.save(model.state_dict(),
                           output_dir / f"{iepoch}epoch.pth")

                # Creates a sym link latest.pth -> {iepoch}epoch.pth
                p = output_dir / "latest.pth"
                if p.is_symlink() or p.exists():
                    p.unlink()
                p.symlink_to(f"{iepoch}epoch.pth")

                _improved = []
                for _phase, k, _mode in best_model_criterion:
                    # e.g. _phase, k, _mode = "train", "loss", "min"
                    if reporter.has(_phase, k):
                        best_epoch = reporter.get_best_epoch(_phase, k, _mode)
                        # Creates sym links if it's the best result
                        if best_epoch == iepoch:
                            p = output_dir / f"{_phase}.{k}.best.pth"
                            if p.is_symlink() or p.exists():
                                p.unlink()
                            p.symlink_to(f"{iepoch}epoch.pth")
                            _improved.append(f"{_phase}.{k}")
                if len(_improved) == 0:
                    logging.info("There are no improvements in this epoch")
                else:
                    logging.info("The best model has been updated: " +
                                 ", ".join(_improved))

                # 6. Remove the model files excluding n-best epoch and latest epoch
                _removed = []
                # Get the union set of the n-best among multiple criterion
                nbests = set().union(*[
                    set(reporter.sort_epochs(ph, k, m)[:keep_nbest_models])
                    for ph, k, m in best_model_criterion
                    if reporter.has(ph, k)
                ])
                for e in range(1, iepoch):
                    p = output_dir / f"{e}epoch.pth"
                    if p.exists() and e not in nbests:
                        p.unlink()
                        _removed.append(str(p))
                if len(_removed) != 0:
                    logging.info("The model files were removed: " +
                                 ", ".join(_removed))

            # 7. If no update has happened in this epoch, stop the training
            if all_steps_are_invalid:
                logging.warning(
                    f"The gradients at all steps are invalid in this epoch. "
                    f"Something seems wrong. This training was stopped at {iepoch}epoch"
                )
                break

            # 8. Check early stopping
            if patience is not None:
                if reporter.check_early_stopping(patience,
                                                 *early_stopping_criterion):
                    break

        else:
            logging.info(f"The training was finished at {max_epoch} epochs ")
Example no. 34
    def train_one_epoch(
        cls,
        model: torch.nn.Module,
        iterator: Iterable[Tuple[List[str], Dict[str, torch.Tensor]]],
        optimizers: Sequence[torch.optim.Optimizer],
        schedulers: Sequence[Optional[AbsScheduler]],
        reporter: SubReporter,
        summary_writer: Optional[SummaryWriter],
        options: TrainerOptions,
    ) -> bool:
        assert check_argument_types()

        # Note(kamo): assumes one optimizer
        assert cls.num_optimizers == 1, cls.num_optimizers
        assert len(optimizers) == 1, len(optimizers)
        optimizer = optimizers[0]
        scheduler = schedulers[0]

        grad_noise = options.grad_noise
        accum_grad = options.accum_grad
        grad_clip = options.grad_clip
        log_interval = options.log_interval
        no_forward_run = options.no_forward_run
        ngpu = options.ngpu
        distributed = isinstance(model,
                                 torch.nn.parallel.DistributedDataParallel)
        use_apex = options.train_dtype in ("O0", "O1", "O2", "O3")

        if log_interval is None:
            try:
                log_interval = max(len(iterator) // 20, 10)
            except TypeError:
                log_interval = 100

        model.train()
        all_steps_are_invalid = True
        # [For distributed] Because iteration counts are not always equal
        # across processes, send a stop flag to the other processes when this
        # iterator is exhausted.
        iterator_stop = torch.tensor(0).to("cuda" if ngpu > 0 else "cpu")

        start_time = time.perf_counter()
        for iiter, (_, batch) in enumerate(
                reporter.measure_iter_time(iterator, "iter_time"), 1):
            assert isinstance(batch, dict), type(batch)

            if distributed:
                torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
                if iterator_stop > 0:
                    break

            batch = to_device(batch, "cuda" if ngpu > 0 else "cpu")
            if no_forward_run:
                all_steps_are_invalid = False
                continue

            with reporter.measure_time("forward_time"):
                loss, stats, weight = model(**batch)
            stats = {k: v for k, v in stats.items() if v is not None}
            if ngpu > 1 or distributed:
                # Apply weighted averaging for loss and stats
                loss = (loss * weight.type(loss.dtype)).sum()

                # if distributed, this method can also apply all_reduce()
                stats, weight = recursive_average(stats, weight, distributed)

                # Now weight is summation over all workers
                loss /= weight
            if distributed:
                # NOTE(kamo): Multiply world_size because DistributedDataParallel
                # automatically normalizes the gradient by world_size.
                loss *= torch.distributed.get_world_size()

            reporter.register(stats, weight)

            loss /= accum_grad
            with reporter.measure_time("backward_time"):
                if use_apex:
                    try:
                        from apex import amp
                    except ImportError:
                        logging.error(
                            "You need to install apex. "
                            "See https://github.com/NVIDIA/apex#linux")

                    with amp.scale_loss(loss, optimizers) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

            if iiter % accum_grad == 0:
                # gradient noise injection
                if grad_noise:
                    add_gradient_noise(
                        model,
                        reporter.get_total_count(),
                        duration=100,
                        eta=1.0,
                        scale_factor=0.55,
                    )

                # compute the gradient norm to check if it is normal or not
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), grad_clip)
                # PyTorch<=1.4, clip_grad_norm_ returns float value
                if not isinstance(grad_norm, torch.Tensor):
                    grad_norm = torch.tensor(grad_norm)

                if not torch.isfinite(grad_norm):
                    logging.warning(
                        f"The grad norm is {grad_norm}. Skipping updating the model."
                    )
                else:
                    all_steps_are_invalid = False
                    with reporter.measure_time("optim_step_time"):
                        optimizer.step()
                    if isinstance(scheduler, AbsBatchStepScheduler):
                        scheduler.step()
                optimizer.zero_grad()

                # Register lr and train/load time[sec/step],
                # where step refers to accum_grad * mini-batch
                reporter.register(
                    dict(
                        {
                            f"lr_{i}": pg["lr"]
                            for i, pg in enumerate(optimizer.param_groups)
                            if "lr" in pg
                        },
                        train_time=time.perf_counter() - start_time,
                    ), )
                start_time = time.perf_counter()

            # NOTE(kamo): Call log_message() after next()
            reporter.next()
            if iiter % log_interval == 0:
                logging.info(reporter.log_message(-log_interval))
                if summary_writer is not None:
                    reporter.tensorboard_add_scalar(summary_writer,
                                                    -log_interval)

        else:
            if distributed:
                iterator_stop.fill_(1)
                torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)

        return all_steps_are_invalid
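
train_one_epoch() divides the loss by accum_grad and only steps the optimizer every accum_grad iterations. A self-contained sketch of that accumulation pattern on a toy model (model and data are made up):

import torch

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accum_grad = 4

for iiter in range(1, 9):
    x, y = torch.randn(2, 8), torch.randn(2, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)
    (loss / accum_grad).backward()     # scale so accumulated grads average out
    if iiter % accum_grad == 0:        # update only every accum_grad mini-batches
        optimizer.step()
        optimizer.zero_grad()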
Example no. 35
    def plot_attention(
        cls,
        model: torch.nn.Module,
        output_dir: Optional[Path],
        summary_writer: Optional[SummaryWriter],
        iterator: Iterable[Tuple[List[str], Dict[str, torch.Tensor]]],
        reporter: SubReporter,
        options: TrainerOptions,
    ) -> None:
        assert check_argument_types()
        import matplotlib

        ngpu = options.ngpu
        no_forward_run = options.no_forward_run

        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
        from matplotlib.ticker import MaxNLocator

        model.eval()
        for ids, batch in iterator:
            assert isinstance(batch, dict), type(batch)
            assert len(next(iter(batch.values()))) == len(ids), (
                len(next(iter(batch.values()))),
                len(ids),
            )
            batch = to_device(batch, "cuda" if ngpu > 0 else "cpu")
            if no_forward_run:
                continue

            # 1. Forwarding model and gathering all attentions
            #    calculate_all_attentions() uses single gpu only.
            att_dict = calculate_all_attentions(model, batch)

            # 2. Plot attentions: This part is slow due to matplotlib
            for k, att_list in att_dict.items():
                assert len(att_list) == len(ids), (len(att_list), len(ids))
                for id_, att_w in zip(ids, att_list):

                    if isinstance(att_w, torch.Tensor):
                        att_w = att_w.detach().cpu().numpy()

                    if att_w.ndim == 2:
                        att_w = att_w[None]
                    elif att_w.ndim > 3 or att_w.ndim == 1:
                        raise RuntimeError(
                            f"Must be 2 or 3 dimension: {att_w.ndim}")

                    w, h = plt.figaspect(1.0 / len(att_w))
                    fig = plt.Figure(figsize=(w * 1.3, h * 1.3))
                    axes = fig.subplots(1, len(att_w))
                    if len(att_w) == 1:
                        axes = [axes]

                    for ax, aw in zip(axes, att_w):
                        ax.imshow(aw.astype(np.float32), aspect="auto")
                        ax.set_title(f"{k}_{id_}")
                        ax.set_xlabel("Input")
                        ax.set_ylabel("Output")
                        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
                        ax.yaxis.set_major_locator(MaxNLocator(integer=True))

                    if output_dir is not None:
                        p = output_dir / id_ / f"{k}.{reporter.get_epoch()}ep.png"
                        p.parent.mkdir(parents=True, exist_ok=True)
                        fig.savefig(p)

                    if summary_writer is not None:
                        summary_writer.add_figure(f"{k}_{id_}", fig,
                                                  reporter.get_epoch())
            reporter.next()
Exemplo n.º 36
0
    def build_options(cls, args: argparse.Namespace) -> TrainerOptions:
        """Build options consumed by train(), eval(), and plot_attention()."""
        assert check_argument_types()
        return build_dataclass(TrainerOptions, args)
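
build_dataclass() itself is not shown in these examples; a plausible hedged sketch, consistent with how it is called above, simply copies the Namespace attributes that match the dataclass fields:

import argparse
import dataclasses

def build_dataclass(klass, args: argparse.Namespace):
    """Sketch only: populate a dataclass from the matching argparse attributes."""
    kwargs = {f.name: getattr(args, f.name) for f in dataclasses.fields(klass)}
    return klass(**kwargs)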
Exemplo n.º 37
0
    def __init__(
            self,
            # TODO only stateful, attention will need temporal or spat.
            encoders: List[Union[TemporalStatefulWithOutput,
                                 SpatialStatefulWithOutput]],
            vocabulary: Vocabulary,
            data_id: str,
            name: str,
            max_output_len: int,
            dropout_keep_prob: float = 1.0,
            rnn_size: int = None,
            embedding_size: int = None,
            output_projection: OutputProjectionSpec = None,
            encoder_projection: Callable[
                [tf.Tensor, Optional[int], Optional[List[Any]]],
                tf.Tensor] = None,
            attentions: List[BaseAttention] = None,
            embeddings_source: EmbeddedSequence = None,
            attention_on_input: bool = True,
            rnn_cell: str = 'GRU',
            conditional_gru: bool = False,
            save_checkpoint: str = None,
            load_checkpoint: str = None) -> None:
        """Create a refactored version of monster decoder.

        Arguments:
            encoders: Input encoders of the decoder
            vocabulary: Target vocabulary
            data_id: Target data series
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects
            max_output_len: Maximum length of an output sequence
            dropout_keep_prob: Probability of keeping a value during dropout

        Keyword arguments:
            rnn_size: Size of the decoder hidden state, if None set
                according to encoders.
            embedding_size: Size of embedding vectors for target words
            output_projection: How to generate distribution over vocabulary
                from decoder rnn_outputs
            encoder_projection: How to construct initial state from encoders
            attentions: The attention objects to use. Optional.
            embeddings_source: Embedded sequence to take embeddings from
            rnn_cell: RNN Cell used by the decoder (GRU or LSTM)
            conditional_gru: Flag whether to use the Conditional GRU
                architecture
            attention_on_input: Flag whether attention from previous decoding
                step should be combined with the input in the next step.
        """
        ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)
        check_argument_types()

        log("Initializing decoder, name: '{}'".format(name))

        self.encoders = encoders
        self.vocabulary = vocabulary
        self.data_id = data_id
        self.max_output_len = max_output_len
        self.dropout_keep_prob = dropout_keep_prob
        self.embedding_size = embedding_size
        self.rnn_size = rnn_size
        self.output_projection_spec = output_projection
        self.encoder_projection = encoder_projection
        self.attentions = attentions
        self.embeddings_source = embeddings_source
        self._conditional_gru = conditional_gru
        self._attention_on_input = attention_on_input
        self._rnn_cell_str = rnn_cell

        if self.attentions is None:
            self.attentions = []

        if self.embedding_size is None and self.embeddings_source is None:
            raise ValueError("You must specify either embedding size or the "
                             "embedded sequence from which to reuse the "
                             "embeddings (e.g. set either 'embedding_size' or "
                             " 'embeddings_source' parameter)")

        if self.embeddings_source is not None:
            if self.embedding_size is not None:
                warn("Overriding the embedding_size parameter with the"
                     " size of the reused embeddings from the encoder.")

            self.embedding_size = (
                self.embeddings_source.embedding_matrix.get_shape()[1].value)

        if self.encoder_projection is None:
            if not self.encoders:
                log("No encoder - language model only.")
                self.encoder_projection = empty_initial_state
            elif rnn_size is None:
                log("No rnn_size or encoder_projection: Using concatenation of"
                    " encoded states")
                self.encoder_projection = concat_encoder_projection
                self.rnn_size = sum(e.output.get_shape()[1].value
                                    for e in encoders)
            else:
                log("Using linear projection of encoders as the initial state")
                self.encoder_projection = linear_encoder_projection(
                    self.dropout_keep_prob)

        assert self.rnn_size is not None

        if self._rnn_cell_str not in RNN_CELL_TYPES:
            raise ValueError("RNN cell must be a either 'GRU' or 'LSTM'")

        if self.output_projection_spec is None:
            log("No output projection specified - using tanh projection")
            self.output_projection = nonlinear_output(self.rnn_size,
                                                      tf.tanh)[0]
            self.output_projection_size = self.rnn_size
        elif isinstance(self.output_projection_spec, tuple):
            (self.output_projection,
             self.output_projection_size) = tuple(self.output_projection_spec)
        else:
            self.output_projection = self.output_projection_spec
            self.output_projection_size = self.rnn_size

        if self._attention_on_input:
            self.input_projection = self.input_plus_attention
        else:
            self.input_projection = self.embed_input_symbol

        with self.use_scope():
            with tf.variable_scope("attention_decoder") as self.step_scope:
                pass

        # TODO when it is possible, remove the printing of the cost var
        log("Decoder initalized. Cost var: {}".format(str(self.cost)))
        log("Runtime logits tensor: {}".format(str(self.runtime_logits)))
Exemplo n.º 38
0
    def build_model(cls, args: argparse.Namespace) -> ESPnetTTSModel:
        assert check_argument_types()
        if isinstance(args.token_list, str):
            with open(args.token_list, encoding="utf-8") as f:
                token_list = [line.rstrip() for line in f]

            # "args" is saved as it is in a yaml file by BaseTask.main().
            # Overwriting token_list to keep it as "portable".
            args.token_list = token_list.copy()
        elif isinstance(args.token_list, (tuple, list)):
            token_list = args.token_list.copy()
        else:
            raise RuntimeError("token_list must be str or dict")

        vocab_size = len(token_list)
        logging.info(f"Vocabulary size: {vocab_size }")

        # 1. feats_extract
        if args.odim is None:
            # Extract features in the model
            feats_extract_class = feats_extractor_choices.get_class(
                args.feats_extract)
            feats_extract = feats_extract_class(**args.feats_extract_conf)
            odim = feats_extract.output_size()
        else:
            # Give features from data-loader
            args.feats_extract = None
            args.feats_extract_conf = None
            feats_extract = None
            odim = args.odim

        # 2. Normalization layer
        if args.normalize is not None:
            normalize_class = normalize_choices.get_class(args.normalize)
            normalize = normalize_class(**args.normalize_conf)
        else:
            normalize = None

        # 3. TTS
        tts_class = tts_choices.get_class(args.tts)
        tts = tts_class(idim=vocab_size, odim=odim, **args.tts_conf)

        # 4. Extra components
        pitch_extract = None
        energy_extract = None
        pitch_normalize = None
        energy_normalize = None
        if getattr(args, "pitch_extract", None) is not None:
            pitch_extract_class = pitch_extractor_choices.get_class(
                args.pitch_extract)
            if args.pitch_extract_conf.get("reduction_factor",
                                           None) is not None:
                assert args.pitch_extract_conf.get("reduction_factor",
                                                   None) == args.tts_conf.get(
                                                       "reduction_factor", 1)
            else:
                args.pitch_extract_conf[
                    "reduction_factor"] = args.tts_conf.get(
                        "reduction_factor", 1)
            pitch_extract = pitch_extract_class(**args.pitch_extract_conf)
        if getattr(args, "energy_extract", None) is not None:
            if args.energy_extract_conf.get("reduction_factor",
                                            None) is not None:
                assert args.energy_extract_conf.get("reduction_factor",
                                                    None) == args.tts_conf.get(
                                                        "reduction_factor", 1)
            else:
                args.energy_extract_conf[
                    "reduction_factor"] = args.tts_conf.get(
                        "reduction_factor", 1)
            energy_extract_class = energy_extractor_choices.get_class(
                args.energy_extract)
            energy_extract = energy_extract_class(**args.energy_extract_conf)
        if getattr(args, "pitch_normalize", None) is not None:
            pitch_normalize_class = pitch_normalize_choices.get_class(
                args.pitch_normalize)
            pitch_normalize = pitch_normalize_class(
                **args.pitch_normalize_conf)
        if getattr(args, "energy_normalize", None) is not None:
            energy_normalize_class = energy_normalize_choices.get_class(
                args.energy_normalize)
            energy_normalize = energy_normalize_class(
                **args.energy_normalize_conf)

        # 5. Build model
        model = ESPnetTTSModel(
            feats_extract=feats_extract,
            pitch_extract=pitch_extract,
            energy_extract=energy_extract,
            normalize=normalize,
            pitch_normalize=pitch_normalize,
            energy_normalize=energy_normalize,
            tts=tts,
            **args.model_conf,
        )
        assert check_return_type(model)
        return model
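
The branch on args.odim above decides whether acoustic features are extracted inside the model or supplied directly by the data loader. The helper below is only a restatement of that decision, with the choice registry mimicked by a plain dict of extractor classes:

def resolve_odim(odim, feats_extract_name, feats_extract_conf, extractor_classes):
    """If odim is None, build the feature extractor and use its output size."""
    if odim is None:
        feats_extract = extractor_classes[feats_extract_name](**feats_extract_conf)
        return feats_extract, feats_extract.output_size()
    # Features come directly from the data loader; no extractor inside the model.
    return None, odim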
Exemplo n.º 39
0
    def add_task_arguments(cls, parser: argparse.ArgumentParser):
        # NOTE(kamo): Use '_' instead of '-' to avoid confusion
        assert check_argument_types()
        group = parser.add_argument_group(description="Task related")

        # NOTE(kamo): add_arguments(..., required=True) can't be used
        # to provide the --print_config mode. Instead, do it this way:
        required = parser.get_default("required")
        required += ["token_list"]

        group.add_argument(
            "--token_list",
            type=str_or_none,
            default=None,
            help="A text mapping int-id to token",
        )
        group.add_argument(
            "--odim",
            type=int_or_none,
            default=None,
            help="The number of dimension of output feature",
        )
        group.add_argument(
            "--model_conf",
            action=NestedDictAction,
            default=get_default_kwargs(ESPnetTTSModel),
            help="The keyword arguments for model class.",
        )

        group = parser.add_argument_group(description="Preprocess related")
        group.add_argument(
            "--use_preprocessor",
            type=str2bool,
            default=True,
            help="Apply preprocessing to data or not",
        )
        group.add_argument(
            "--token_type",
            type=str,
            default="phn",
            choices=["bpe", "char", "word", "phn"],
            help="The text will be tokenized in the specified level token",
        )
        group.add_argument(
            "--bpemodel",
            type=str_or_none,
            default=None,
            help="The model file of sentencepiece",
        )
        parser.add_argument(
            "--non_linguistic_symbols",
            type=str_or_none,
            help="non_linguistic_symbols file path",
        )
        parser.add_argument(
            "--cleaner",
            type=str_or_none,
            choices=[None, "tacotron", "jaconv", "vietnamese"],
            default=None,
            help="Apply text cleaning",
        )
        parser.add_argument(
            "--g2p",
            type=str_or_none,
            choices=[
                None,
                "g2p_en",
                "g2p_en_no_space",
                "pyopenjtalk",
                "pyopenjtalk_accent",
                "pyopenjtalk_kana",
                "pypinyin_g2p",
                "pypinyin_g2p_phone",
            ],
            default=None,
            help="Specify g2p method if --token_type=phn",
        )

        for class_choices in cls.class_choices_list:
            # Append --<name> and --<name>_conf.
            # e.g. --encoder and --encoder_conf
            class_choices.add_arguments(group)
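
The 'required' trick commented on above (so that --print_config can run before required arguments are enforced) can be reproduced standalone; the snippet below is a simplified sketch, not ESPnet code:

import argparse

parser = argparse.ArgumentParser()
parser.set_defaults(required=[])                  # names collected here, checked later
parser.add_argument("--print_config", action="store_true")
parser.add_argument("--token_list", default=None)

required = parser.get_default("required")
required += ["token_list"]                        # required, but not enforced by argparse

args = parser.parse_args(["--print_config"])
if not args.print_config:
    missing = [name for name in args.required if getattr(args, name) is None]
    if missing:
        parser.error(f"the following arguments are required: {missing}")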
Exemplo n.º 40
0
    def __init__(self,
                 name: str,
                 data_id: str,
                 input_size: int,
                 rnn_layers: List[RNNSpecTuple],
                 max_input_len: Optional[int] = None,
                 dropout_keep_prob: float = 1.0,
                 save_checkpoint: Optional[str] = None,
                 load_checkpoint: Optional[str] = None,
                 initializers: InitializerSpecs = None) -> None:
        """Create a new instance of the encoder.

        Arguments:
            data_id: Identifier of the data series fed to this encoder
            name: An unique identifier for this encoder
            rnn_layers: A list of tuples specifying the size and, optionally,
                the direction ('forward', 'backward' or 'bidirectional')
                and cell type ('GRU' or 'LSTM') of each RNN layer.

        Keyword arguments:
            dropout_keep_prob: The dropout keep probability
                (default 1.0)
        """
        check_argument_types()
        ModelPart.__init__(self, name, save_checkpoint, load_checkpoint,
                           initializers)

        self.data_id = data_id

        self._rnn_layers = [_make_rnn_spec(*r) for r in rnn_layers]
        self.max_input_len = max_input_len
        self.input_size = input_size
        self.dropout_keep_prob = dropout_keep_prob

        log("Initializing RNN encoder, name: '{}'".format(self.name))

        with self.use_scope():
            self.inputs = tf.placeholder(tf.float32,
                                         [None, None, self.input_size],
                                         "encoder_input")
            self._input_lengths = tf.placeholder(tf.int32, [None],
                                                 "encoder_padding_lengths")

            self.states_mask = tf.sequence_mask(self._input_lengths,
                                                dtype=tf.float32)

            states = self.inputs
            states_reversed = False

            def reverse_states():
                nonlocal states, states_reversed
                states = tf.reverse_sequence(states,
                                             self._input_lengths,
                                             batch_dim=0,
                                             seq_dim=1)
                states_reversed = not states_reversed

            for i, layer in enumerate(self._rnn_layers):
                with tf.variable_scope("rnn_{}_{}".format(i, layer.direction)):
                    if layer.direction == "bidirectional":
                        fw_cell = _make_rnn_cell(layer)
                        bw_cell = _make_rnn_cell(layer)
                        outputs_tup, encoded_tup = (
                            tf.nn.bidirectional_dynamic_rnn(
                                fw_cell,
                                bw_cell,
                                states,
                                self._input_lengths,
                                dtype=tf.float32))

                        if states_reversed:
                            # treat forward as backward and vice versa
                            outputs_tup = tuple(reversed(outputs_tup))
                            encoded_tup = tuple(reversed(encoded_tup))
                            states_reversed = False

                        states = tf.concat(outputs_tup, 2)
                        encoded = tf.concat(encoded_tup, 1)
                    elif layer.direction in ["forward", "backward"]:
                        should_be_reversed = (layer.direction == "backward")
                        if states_reversed != should_be_reversed:
                            reverse_states()

                        cell = _make_rnn_cell(layer)
                        states, encoded = tf.nn.dynamic_rnn(
                            cell,
                            states,
                            sequence_length=self._input_lengths,
                            dtype=tf.float32)
                    else:
                        raise ValueError("Unknown RNN direction {}".format(
                            layer.direction))

                if i < len(self._rnn_layers) - 1:
                    states = dropout(states, self.dropout_keep_prob,
                                     self.train_mode)

            if states_reversed:
                reverse_states()

            self.hidden_states = states
            self.encoded = encoded

        log("RNN encoder initialized")
Exemplo n.º 41
0
def inference(
    output_dir: str,
    maxlenratio: float,
    minlenratio: float,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    ctc_weight: float,
    lm_weight: float,
    penalty: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    asr_train_config: str,
    asr_model_file: str,
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    word_lm_train_config: Optional[str],
    word_lm_file: Optional[str],
    blank_symbol: str,
    token_type: Optional[str],
    bpemodel: Optional[str],
    allow_variable_data_keys: bool,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if word_lm_train_config is not None:
        raise NotImplementedError("Word LM is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build ASR model
    scorers = {}
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    asr_model.eval()

    decoder = asr_model.decoder
    ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
    token_list = asr_model.token_list
    scorers.update(
        decoder=decoder, ctc=ctc, length_bonus=LengthBonus(len(token_list)),
    )

    # 3. Build Language model
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        scorers["lm"] = lm.lm

    # 4. Build BeamSearch object
    weights = dict(
        decoder=1.0 - ctc_weight, ctc=ctc_weight, lm=lm_weight, length_bonus=penalty,
    )
    beam_search = BeamSearch(
        beam_size=beam_size,
        weights=weights,
        scorers=scorers,
        sos=asr_model.sos,
        eos=asr_model.eos,
        vocab_size=len(token_list),
        token_list=token_list,
    )
    beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
    for scorer in scorers.values():
        if isinstance(scorer, torch.nn.Module):
            scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
    logging.info(f"Beam_search: {beam_search}")
    logging.info(f"Decoding device={device}, dtype={dtype}")

    # 5. Build data-iterator
    loader = ASRTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTask.build_preprocess_fn(asr_train_args, False),
        collate_fn=ASRTask.build_collate_fn(asr_train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 6. [Optional] Build Text converter: e.g. bpe-sym -> Text
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    # 7. Start the for-loop
    # FIXME(kamo): The output format should be discussed further
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            with torch.no_grad():
                # a. To device
                batch = to_device(batch, device)

                # b. Forward Encoder
                enc, _ = asr_model.encode(**batch)
                assert len(enc) == batch_size, len(enc)

                # c. Pass the encoder result to the beam search
                nbest_hyps = beam_search(
                    x=enc[0], maxlenratio=maxlenratio, minlenratio=minlenratio
                )
                nbest_hyps = nbest_hyps[:nbest]

            # Only supporting batch_size==1
            key = keys[0]
            for n in range(1, nbest + 1):
                hyp = nbest_hyps[n - 1]
                assert isinstance(hyp, Hypothesis), type(hyp)

                # remove sos/eos and get results
                token_int = hyp.yseq[1:-1].tolist()

                # remove blank symbol id, which is assumed to be 0
                token_int = list(filter(lambda x: x != 0, token_int))

                # Change integer-ids to tokens
                token = converter.ids2tokens(token_int)

                # Create a directory: outdir/{n}best_recog
                ibest_writer = writer[f"{n}best_recog"]

                # Write the result to each file
                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                ibest_writer["score"][key] = str(hyp.score)

                if tokenizer is not None:
                    text = tokenizer.tokens2text(token)
                    ibest_writer["text"][key] = text
Exemplo n.º 42
0
def run_application(component: Component, *, event_loop_policy: str = None,
                    max_threads: int = None, logging: Union[Dict[str, Any], int, None] = INFO):
    """
    Configure logging and start the given root component in the default asyncio event loop.

    Assuming the root component was started successfully, the event loop will continue running
    until the process is terminated.

    Initializes the logging system first based on the value of ``logging``:
      * If the value is a dictionary, it is passed to :func:`logging.config.dictConfig` as
        argument.
      * If the value is an integer, it is passed to :func:`logging.basicConfig` as the logging
        level.
      * If the value is ``None``, logging setup is skipped entirely.

    By default, the logging system is initialized using :func:`~logging.basicConfig` using the
    ``INFO`` logging level.

    The default executor in the event loop is replaced with a new
    :class:`~concurrent.futures.ThreadPoolExecutor` where the maximum number of threads is set to
    the value of ``max_threads`` or, if omitted, the return value of :func:`os.cpu_count()`.

    :param component: the root component
    :param event_loop_policy: entry point name (from the ``asphalt.core.event_loop_policies``
        namespace) of an alternate event loop policy (or a module:varname reference to one)
    :param max_threads: the maximum number of worker threads in the default thread pool executor
        (the default value depends on the event loop implementation)
    :param logging: a logging configuration dictionary, :ref:`logging level <python:levels>` or
        ``None``

    """
    assert check_argument_types()

    # Configure the logging system
    if isinstance(logging, dict):
        dictConfig(logging)
    elif isinstance(logging, int):
        basicConfig(level=logging)

    # Switch to an alternate event loop policy if one was provided
    logger = getLogger(__name__)
    if event_loop_policy:
        create_policy = policies.resolve(event_loop_policy)
        policy = create_policy()
        asyncio.set_event_loop_policy(policy)
        logger.info('Switched event loop policy to %s', qualified_name(policy))

    # Assign a new default executor with the given max worker thread limit if one was provided
    event_loop = asyncio.get_event_loop()
    if max_threads is not None:
        event_loop.set_default_executor(ThreadPoolExecutor(max_threads))
        logger.info('Installed a new thread pool executor with max_workers=%d', max_threads)

    logger.info('Starting application')
    context = Context()
    exception = None
    try:
        try:
            event_loop.run_until_complete(component.start(context))
        except Exception as e:
            exception = e
            logger.exception('Error during application startup')
        else:
            # Enable garbage collection of the component tree
            del component

            # Finally, run the event loop until the process is terminated or Ctrl+C is pressed
            event_loop.run_forever()
    except (KeyboardInterrupt, SystemExit):
        pass
    finally:
        # Cancel all running tasks
        for task in asyncio.Task.all_tasks(event_loop):
            task.cancel()

        # Run all the finish callbacks
        future = context.finished.dispatch(exception, return_future=True)
        event_loop.run_until_complete(future)

    event_loop.close()
    logger.info('Application stopped')

    if exception is not None:
        sys.exit(1)
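
A minimal hedged usage sketch for run_application with a trivial root component; Component is assumed importable from asphalt.core alongside run_application, as in typical asphalt applications.

from asphalt.core import Component, run_application

class HelloComponent(Component):
    async def start(self, ctx):
        # Resources would be added to ctx here; this root component only logs.
        print("root component started")

if __name__ == "__main__":
    # Uses basicConfig at INFO level and the default executor, as described above.
    run_application(HelloComponent())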
Exemplo n.º 43
0
    def __init__(
        self,
        asr_train_config: Union[Path, str],
        asr_model_file: Union[Path, str] = None,
        lm_train_config: Union[Path, str] = None,
        lm_file: Union[Path, str] = None,
        token_type: str = None,
        bpemodel: str = None,
        device: str = "cpu",
        maxlenratio: float = 0.0,
        minlenratio: float = 0.0,
        batch_size: int = 1,
        dtype: str = "float32",
        beam_size: int = 20,
        ctc_weight: float = 0.5,
        lm_weight: float = 1.0,
        penalty: float = 0.0,
        nbest: int = 1,
    ):
        assert check_argument_types()

        # 1. Build ASR model
        scorers = {}
        asr_model, asr_train_args = ASRTask.build_model_from_file(
            asr_train_config, asr_model_file, device)
        asr_model.eval()

        decoder = asr_model.decoder
        ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
        token_list = asr_model.token_list
        scorers.update(
            decoder=decoder,
            ctc=ctc,
            length_bonus=LengthBonus(len(token_list)),
        )

        # 2. Build Language model
        if lm_train_config is not None:
            lm, lm_train_args = LMTask.build_model_from_file(
                lm_train_config, lm_file, device)
            scorers["lm"] = lm.lm

        # 3. Build BeamSearch object
        weights = dict(
            decoder=1.0 - ctc_weight,
            ctc=ctc_weight,
            lm=lm_weight,
            length_bonus=penalty,
        )
        beam_search = BeamSearch(
            beam_size=beam_size,
            weights=weights,
            scorers=scorers,
            sos=asr_model.sos,
            eos=asr_model.eos,
            vocab_size=len(token_list),
            token_list=token_list,
            pre_beam_score_key=None if ctc_weight == 1.0 else "full",
        )
        # TODO(karita): make all scorers batchfied
        if batch_size == 1:
            non_batch = [
                k for k, v in beam_search.full_scorers.items()
                if not isinstance(v, BatchScorerInterface)
            ]
            if len(non_batch) == 0:
                beam_search.__class__ = BatchBeamSearch
                logging.info("BatchBeamSearch implementation is selected.")
            else:
                logging.warning(f"As non-batch scorers {non_batch} are found, "
                                f"fall back to non-batch implementation.")
        beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
        for scorer in scorers.values():
            if isinstance(scorer, torch.nn.Module):
                scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
        logging.info(f"Beam_search: {beam_search}")
        logging.info(f"Decoding device={device}, dtype={dtype}")

        # 4. [Optional] Build Text converter: e.g. bpe-sym -> Text
        if token_type is None:
            token_type = asr_train_args.token_type
        if bpemodel is None:
            bpemodel = asr_train_args.bpemodel

        if token_type is None:
            tokenizer = None
        elif token_type == "bpe":
            if bpemodel is not None:
                tokenizer = build_tokenizer(token_type=token_type,
                                            bpemodel=bpemodel)
            else:
                tokenizer = None
        else:
            tokenizer = build_tokenizer(token_type=token_type)
        converter = TokenIDConverter(token_list=token_list)
        logging.info(f"Text tokenizer: {tokenizer}")

        self.asr_model = asr_model
        self.asr_train_args = asr_train_args
        self.converter = converter
        self.tokenizer = tokenizer
        self.beam_search = beam_search
        self.maxlenratio = maxlenratio
        self.minlenratio = minlenratio
        self.device = device
        self.dtype = dtype
        self.nbest = nbest
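
Usage sketch, based on how the object is called in the decoding loop of a later example (results = speech2text(**batch)); the paths are placeholders and passing the waveform as the 'speech' keyword is an assumption.

import numpy as np

speech2text = Speech2Text(
    asr_train_config="exp/asr_train/config.yaml",       # placeholder paths
    asr_model_file="exp/asr_train/valid.acc.best.pth",
    beam_size=10,
    ctc_weight=0.5,
    nbest=1,
)
speech = np.random.randn(16000).astype(np.float32)      # 1 s of dummy 16 kHz audio
results = speech2text(speech=speech)                     # N-best (text, token, token_int, hyp)
best_text = results[0][0]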
Exemplo n.º 44
0
    def __call__(
            self, uid: str,
            data: Dict[str, Union[str, np.ndarray]]) -> Dict[str, np.ndarray]:
        assert check_argument_types()

        if self.speech_name in data:
            if self.train and self.rirs is not None and self.noises is not None:
                speech = data[self.speech_name]
                nsamples = len(speech)

                # speech: (Nmic, Time)
                if speech.ndim == 1:
                    speech = speech[None, :]
                else:
                    speech = speech.T
                # Calculate power on the non-silence region
                power = (speech[detect_non_silence(speech)]**2).mean()

                # 1. Convolve RIR
                if (self.rirs is not None
                        and self.rir_apply_prob >= np.random.random()):
                    rir_path = np.random.choice(self.rirs)
                    if rir_path is not None:
                        rir, _ = soundfile.read(rir_path,
                                                dtype=np.float64,
                                                always_2d=True)

                        # rir: (Nmic, Time)
                        rir = rir.T

                        # speech: (Nmic, Time)
                        # Note that this operation doesn't change the signal length
                        speech = scipy.signal.convolve(
                            speech, rir, mode="full")[:, :speech.shape[1]]
                        # Rescale back to the original signal power
                        power2 = (speech[detect_non_silence(speech)]**2).mean()
                        speech = np.sqrt(power / max(power2, 1e-10)) * speech

                # 2. Add Noise
                if (self.noises is not None
                        and self.rir_apply_prob >= np.random.random()):
                    noise_path = np.random.choice(self.noises)
                    if noise_path is not None:
                        noise_db = np.random.uniform(self.noise_db_low,
                                                     self.noise_db_high)
                        with soundfile.SoundFile(noise_path) as f:
                            if f.frames == nsamples:
                                noise = f.read(dtype=np.float64,
                                               always_2d=True)
                            elif f.frames < nsamples:
                                offset = np.random.randint(
                                    0, nsamples - f.frames)
                                # noise: (Time, Nmic)
                                noise = f.read(dtype=np.float64,
                                               always_2d=True)
                                # Repeat noise
                                noise = np.pad(
                                    noise,
                                    [(offset, nsamples - f.frames - offset),
                                     (0, 0)],
                                    mode="wrap",
                                )
                            else:
                                offset = np.random.randint(
                                    0, f.frames - nsamples)
                                f.seek(offset)
                                # noise: (Time, Nmic)
                                noise = f.read(nsamples,
                                               dtype=np.float64,
                                               always_2d=True)
                                if len(noise) != nsamples:
                                    raise RuntimeError(
                                        f"Something wrong: {noise_path}")
                        # noise: (Nmic, Time)
                        noise = noise.T

                        noise_power = (noise**2).mean()
                        scale = (10**(-noise_db / 20) * np.sqrt(power) /
                                 np.sqrt(max(noise_power, 1e-10)))
                        speech = speech + scale * noise

                speech = speech.T
                ma = np.max(np.abs(speech))
                if ma > 1.0:
                    speech /= ma
                data[self.speech_name] = speech

            if self.speech_volume_normalize is not None:
                speech = data[self.speech_name]
                ma = np.max(np.abs(speech))
                data[self.speech_name] = (
                    speech * self.speech_volume_normalize / ma)

        if self.text_name in data and self.tokenizer is not None:
            text = data[self.text_name]
            text = self.text_cleaner(text)
            tokens = self.tokenizer.text2tokens(text)
            text_ints = self.token_id_converter.tokens2ids(tokens)
            data[self.text_name] = np.array(text_ints, dtype=np.int64)
        assert check_return_type(data)
        return data
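
The noise scaling above follows scale = 10**(-noise_db / 20) * sqrt(speech_power) / sqrt(noise_power), which places the noise roughly noise_db dB below the speech. A standalone check with synthetic arrays (detect_non_silence is omitted for simplicity):

import numpy as np

rng = np.random.default_rng(0)
speech = rng.standard_normal((1, 16000))            # (Nmic, Time), dummy signals
noise = 0.1 * rng.standard_normal((1, 16000))
noise_db = 10.0                                      # target SNR in dB

power = (speech ** 2).mean()
noise_power = (noise ** 2).mean()
scale = 10 ** (-noise_db / 20) * np.sqrt(power) / np.sqrt(max(noise_power, 1e-10))
noisy = speech + scale * noise

snr = 10 * np.log10(power / ((scale * noise) ** 2).mean())
print(round(float(snr), 2))                          # ~10.0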
Exemplo n.º 45
0
def inference(
    output_dir: str,
    maxlenratio: float,
    minlenratio: float,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    ctc_weight: float,
    lm_weight: float,
    penalty: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    asr_train_config: str,
    asr_model_file: str,
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    word_lm_train_config: Optional[str],
    word_lm_file: Optional[str],
    token_type: Optional[str],
    bpemodel: Optional[str],
    allow_variable_data_keys: bool,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if word_lm_train_config is not None:
        raise NotImplementedError("Word LM is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build speech2text
    speech2text = Speech2Text(
        asr_train_config=asr_train_config,
        asr_model_file=asr_model_file,
        lm_train_config=lm_train_config,
        lm_file=lm_file,
        token_type=token_type,
        bpemodel=bpemodel,
        device=device,
        maxlenratio=maxlenratio,
        minlenratio=minlenratio,
        dtype=dtype,
        beam_size=beam_size,
        ctc_weight=ctc_weight,
        lm_weight=lm_weight,
        penalty=penalty,
        nbest=nbest,
    )

    # 3. Build data-iterator
    loader = ASRTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args,
                                                  False),
        collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start the for-loop
    # FIXME(kamo): The output format should be discussed further
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            batch = {
                k: v[0]
                for k, v in batch.items() if not k.endswith("_lengths")
            }

            # N-best list of (text, token, token_int, hyp_object)
            results = speech2text(**batch)

            # Only supporting batch_size==1
            key = keys[0]
            for n, (text, token, token_int,
                    hyp) in zip(range(1, nbest + 1), results):
                # Create a directory: outdir/{n}best_recog
                ibest_writer = writer[f"{n}best_recog"]

                # Write the result to each file
                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                ibest_writer["score"][key] = str(hyp.score)

                if text is not None:
                    ibest_writer["text"][key] = text
Exemplo n.º 46
0
    def __init__(self,
                 num_sessions: int,
                 num_threads: int,
                 save_n_best: int = 1,
                 minimize_metric: bool = False,
                 variable_files: Optional[List[str]] = None,
                 gpu_allow_growth: bool = True,
                 per_process_gpu_memory_fraction: float = 1.0,
                 enable_tf_debug: bool = False) -> None:
        """Initialize a TensorflowManager.

        At this moment the graph must already exist. This method initializes
        required number of TensorFlow sessions and initializes them with
        provided variable files if they are provided.

        Args:
            num_sessions: Number of sessions to be initialized.
            num_threads: Number of threads sessions will run in.
            save_n_best: How many best models to keep
            minimize_metric: Whether the best model is the one with the lowest
                or the highest score
            variable_files: List of variable files.
            gpu_allow_growth: TF to allocate incrementally, not all at once.
            per_process_gpu_memory_fraction: Limit TF memory use.
        """
        check_argument_types()

        session_cfg = tf.ConfigProto()
        session_cfg.inter_op_parallelism_threads = num_threads
        session_cfg.intra_op_parallelism_threads = num_threads
        session_cfg.allow_soft_placement = True  # needed for multiple GPUs
        # pylint: disable=no-member
        session_cfg.gpu_options.allow_growth = gpu_allow_growth
        session_cfg.gpu_options.per_process_gpu_memory_fraction = \
            per_process_gpu_memory_fraction

        if save_n_best < 1:
            raise Exception("save_n_best parameter must be greater than zero")
        self.saver_max_to_keep = save_n_best
        self.minimize_metric = minimize_metric

        self.sessions = [
            tf.Session(config=session_cfg) for _ in range(num_sessions)
        ]

        if enable_tf_debug:
            self.sessions = [
                tf_debug.LocalCLIDebugWrapperSession(sess)
                for sess in self.sessions
            ]

        init_op = tf.global_variables_initializer()
        for sess in self.sessions:
            sess.run(init_op)
        self.saver = tf.train.Saver(max_to_keep=self.saver_max_to_keep,
                                    var_list=[
                                        g for g in tf.global_variables()
                                        if "reward_" not in g.name
                                    ])

        if variable_files:
            if len(variable_files) != num_sessions:
                raise Exception(
                    ("The number of provided variable files ({}) "
                     "is different than a number sessions ({})").format(
                         len(variable_files), num_sessions))
            self.restore(variable_files)

        self.best_score_index = 0
        self.best_score_epoch = 0
        self.best_score_batch = 0

        init_score = np.inf if self.minimize_metric else -np.inf
        self.saved_scores = [init_score for _ in range(self.saver_max_to_keep)]
        self.best_score = init_score

        self.variables_files = []  # type: List[str]
        self._best_vars_file = None  # type: Optional[str]
Exemplo n.º 47
0
    def __init__(self,
                 name: str,
                 input_sequence: Sequence,
                 segment_size: int,
                 highway_depth: int,
                 rnn_size: int,
                 filters: List[Tuple[int, int]],
                 dropout_keep_prob: float = 1.0,
                 use_noisy_activations: bool = False,
                 save_checkpoint: Optional[str] = None,
                 load_checkpoint: Optional[str] = None) -> None:
        """Create a new instance of the sentence encoder.

        Arguments:
            name: An unique identifier for this encoder
            segment_size: The size of the segments over which we apply
                max-pooling.
            highway_depth: Depth of the highway layer.
            rnn_size: The size of the encoder's hidden state. Note
                that the actual encoder output state size will be
                twice as long because it is the result of
                concatenation of forward and backward hidden states.
            filters: Specification of CNN filters. It is a list of tuples
                specifying the filter size and number of channels.

        Keyword arguments:
            dropout_keep_prob: The dropout keep probability
                (default 1.0)
        """
        ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)
        check_argument_types()

        self.input_sequence = input_sequence
        self.segment_size = segment_size
        self.highway_depth = highway_depth
        self.rnn_size = rnn_size
        self.filters = filters
        self.dropout_keep_prob = dropout_keep_prob
        self.use_noisy_activations = use_noisy_activations

        if dropout_keep_prob <= 0. or dropout_keep_prob > 1.:
            raise ValueError(("Dropout keep probability must be "
                              "in (0; 1], was {}").format(dropout_keep_prob))

        if rnn_size <= 0:
            raise ValueError("RNN size must be a positive integer.")

        if highway_depth <= 0:
            raise ValueError("Highway depth must be a positive integer.")

        if segment_size <= 0:
            raise ValueError("Segment size be a positive integer.")

        if not filters:
            raise ValueError("You must specify convolutional filters.")

        for filter_size, num_filters in self.filters:
            if filter_size <= 0:
                raise ValueError("Filter size must be a positive integer.")
            if num_filters <= 0:
                raise ValueError("Number of filters must be a positive int.")
Exemplo n.º 48
0
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    fs: int,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    enh_train_config: str,
    enh_model_file: str,
    allow_variable_data_keys: bool,
    normalize_output_wav: bool,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build Enh model
    enh_model, enh_train_args = EnhancementTask.build_model_from_file(
        enh_train_config, enh_model_file, device)
    enh_model.eval()

    num_spk = enh_model.num_spk

    # 3. Build data-iterator
    loader = EnhancementTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=EnhancementTask.build_preprocess_fn(
            enh_train_args, False),
        collate_fn=EnhancementTask.build_collate_fn(enh_train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    writers = []
    for i in range(num_spk):
        writers.append(
            SoundScpWriter(f"{output_dir}/wavs/{i + 1}",
                           f"{output_dir}/spk{i + 1}.scp"))

    for keys, batch in loader:
        assert isinstance(batch, dict), type(batch)
        assert all(isinstance(s, str) for s in keys), keys
        _bs = len(next(iter(batch.values())))
        assert len(keys) == _bs, f"{len(keys)} != {_bs}"

        with torch.no_grad():
            # a. To device
            batch = to_device(batch, device)
            # b. Forward Enhancement Frontend
            feats, f_lens = enh_model.encoder(batch["speech_mix"],
                                              batch["speech_mix_lengths"])
            feats, _, _ = enh_model.separator(feats, f_lens)
            waves = [
                enh_model.decoder(f, batch["speech_mix_lengths"])[0]
                for f in feats
            ]

            assert len(waves[0]) == batch_size, len(waves[0])

        # FIXME(Chenda): will be incorrect when the batch size is not 1
        #  or in the multi-channel case
        if normalize_output_wav:
            waves = [
                (w / abs(w).max(dim=1, keepdim=True)[0] * 0.9).T.cpu().numpy()
                for w in waves
            ]  # list[(sample,batch)]
        else:
            waves = [w.T.cpu().numpy() for w in waves]
        for (i, w) in enumerate(waves):
            writers[i][keys[0]] = fs, w

    for writer in writers:
        writer.close()
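
The optional output normalization above only rescales each separated waveform so that its peak magnitude is 0.9; a standalone check with a dummy tensor:

import torch

w = 3.0 * torch.randn(1, 16000)                           # (batch, sample), dummy waveform
normalized = w / abs(w).max(dim=1, keepdim=True)[0] * 0.9
print(float(normalized.abs().max()))                       # 0.9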
Exemplo n.º 49
0
    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: Optional[str] = "conv2d",
        normalize_before: bool = True,
        concat_after: bool = False,
        positionwise_layer_type: str = "linear",
        positionwise_conv_kernel_size: int = 3,
        macaron_style: bool = False,
        pos_enc_class=StreamPositionalEncoding,
        selfattention_layer_type: str = "rel_selfattn",
        activation_type: str = "swish",
        use_cnn_module: bool = True,
        cnn_module_kernel: int = 31,
        padding_idx: int = -1,
        block_size: int = 40,
        hop_size: int = 16,
        look_ahead: int = 16,
        init_average: bool = True,
        ctx_pos_enc: bool = True,
    ):
        assert check_argument_types()
        super().__init__()
        self._output_size = output_size
        self.pos_enc = pos_enc_class(output_size, positional_dropout_rate)
        activation = get_activation(activation_type)

        if input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(input_size, output_size),
                torch.nn.LayerNorm(output_size),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
            )
            self.subsample = 1
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsamplingWOPosEnc(
                input_size, output_size, dropout_rate, kernels=[3, 3], strides=[2, 2]
            )
            self.subsample = 4
        elif input_layer == "conv2d6":
            self.embed = Conv2dSubsamplingWOPosEnc(
                input_size, output_size, dropout_rate, kernels=[3, 5], strides=[2, 3]
            )
            self.subsample = 6
        elif input_layer == "conv2d8":
            self.embed = Conv2dSubsamplingWOPosEnc(
                input_size,
                output_size,
                dropout_rate,
                kernels=[3, 3, 3],
                strides=[2, 2, 2],
            )
            self.subsample = 8
        elif input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
            )
            self.subsample = 1
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(output_size, positional_dropout_rate),
            )
            self.subsample = 1
        elif input_layer is None:
            self.embed = torch.nn.Sequential(
                pos_enc_class(output_size, positional_dropout_rate)
            )
            self.subsample = 1
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        self.normalize_before = normalize_before
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (
                output_size,
                linear_units,
                dropout_rate,
            )
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (
                output_size,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (
                output_size,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        else:
            raise NotImplementedError("Support only linear or conv1d.")
        convolution_layer = ConvolutionModule
        convolution_layer_args = (output_size, cnn_module_kernel, activation)

        self.encoders = repeat(
            num_blocks,
            lambda lnum: ContextualBlockEncoderLayer(
                output_size,
                MultiHeadedAttention(
                    attention_heads, output_size, attention_dropout_rate
                ),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                num_blocks,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(output_size)

        # for block processing
        self.block_size = block_size
        self.hop_size = hop_size
        self.look_ahead = look_ahead
        self.init_average = init_average
        self.ctx_pos_enc = ctx_pos_enc
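
As a quick reference, the input_layer choice above fixes the front-end subsampling factor; the mapping below just restates the self.subsample assignments, with a rough frame-count helper (edge effects of the convolutions are ignored):

SUBSAMPLE_BY_INPUT_LAYER = {"linear": 1, "conv2d": 4, "conv2d6": 6,
                            "conv2d8": 8, "embed": 1, None: 1}

def approx_output_frames(input_frames: int, input_layer="conv2d") -> int:
    """Approximate frame count after front-end subsampling (ignoring edge effects)."""
    return input_frames // SUBSAMPLE_BY_INPUT_LAYER[input_layer]

print(approx_output_frames(1000, "conv2d"))   # ~250 frames after 4x subsampling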
Exemplo n.º 50
0
def rl_objective(decoder: Decoder,
                 reward_function: RewardFunction,
                 subtract_baseline: bool = False,
                 normalize: bool = False,
                 temperature: float = 1.,
                 ce_smoothing: float = 0.,
                 alpha: float = 1.,
                 sample_size: int = 1) -> Objective:
    """Construct RL objective for training with sentence-level feedback.

    Depending on the options the objective corresponds to:
    1) sample_size = 1, normalize = False, ce_smoothing = 0.0
     Bandit objective (Eq. 2) described in 'Bandit Structured Prediction for
     Neural Sequence-to-Sequence Learning'
     (http://www.aclweb.org/anthology/P17-1138)
     It's recommended to set subtract_baseline = True.
    2) sample_size > 1, normalize = True, ce_smoothing = 0.0
     Minimum Risk Training as described in 'Minimum Risk Training for Neural
     Machine Translation' (http://www.aclweb.org/anthology/P16-1159) (Eq. 12).
    3) sample_size > 1, normalize = False, ce_smoothing = 0.0
     The Google 'Reinforce' objective as proposed in 'Google’s NMT System:
     Bridging the Gap between Human and Machine Translation'
     (https://arxiv.org/pdf/1609.08144.pdf) (Eq. 8).
    4) sample_size > 1, normalize = False, ce_smoothing > 0.0
     Google's 'Mixed' objective in the above paper (Eq. 9),
     where ce_smoothing implements alpha.

    Note that 'alpha' controls the sharpness of the normalized distribution,
    while 'temperature' controls the sharpness during sampling.

    :param decoder: a recurrent decoder to sample from
    :param reward_function: any evaluator object
    :param subtract_baseline: avg reward is subtracted from obtained reward
    :param normalize: the probabilities of the samples are re-normalized
    :param sample_size: number of samples to obtain feedback for
    :param ce_smoothing: add cross-entropy loss with this coefficient to loss
    :param alpha: determines the shape of the normalized distribution
    :param temperature: the softmax temperature for sampling
    :return: Objective object to be used in generic trainer
    """
    check_argument_types()

    reference = decoder.train_inputs

    def _score_with_reward_function(references: np.array,
                                    hypotheses: np.array) -> np.array:
        """Score (time, batch) arrays with sentence-based reward function.

        Parts of the sentence after generated <pad> or </s> are ignored.
        BPE-postprocessing is also included.

        :param references: array of indices of references, shape (time, batch)
        :param hypotheses: array of indices of hypotheses, shape (time, batch)
        :return: an array of batch length with float rewards
        """
        rewards = []
        for refs, hyps in zip(references.transpose(), hypotheses.transpose()):
            ref_seq = []
            hyp_seq = []
            for r_token in refs:
                token = decoder.vocabulary.index_to_word[r_token]
                if token in (END_TOKEN, PAD_TOKEN):
                    break
                ref_seq.append(token)
            for h_token in hyps:
                token = decoder.vocabulary.index_to_word[h_token]
                if token in (END_TOKEN, PAD_TOKEN):
                    break
                hyp_seq.append(token)
            # join BPEs, split on " " to prepare list for evaluator
            refs_tokens = " ".join(ref_seq).replace("@@ ", "").split(" ")
            hyps_tokens = " ".join(hyp_seq).replace("@@ ", "").split(" ")
            reward = float(reward_function([hyps_tokens], [refs_tokens]))
            rewards.append(reward)
        return np.array(rewards, dtype=np.float32)

    samples_rewards = []
    samples_logprobs = []

    for _ in range(sample_size):
        # sample from logits
        # decoded, shape (time, batch)
        sample_loop_result = decoder.decoding_loop(train_mode=False,
                                                   sample=True,
                                                   temperature=temperature)
        sample_logits = sample_loop_result[0]
        sample_decoded = sample_loop_result[3]

        # rewards, shape (batch)
        # score the sampled hypotheses against the reference
        sample_reward = tf.py_func(_score_with_reward_function,
                                   [reference, sample_decoded], tf.float32)

        # pylint: disable=invalid-unary-operand-type
        word_logprobs = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sample_decoded, logits=sample_logits)

        # sum word log prob to sentence log prob
        # no masking here, since otherwise shorter sentences are preferred
        sent_logprobs = tf.reduce_sum(word_logprobs, axis=0)

        samples_rewards.append(sample_reward)  # sample_size x batch
        samples_logprobs.append(sent_logprobs)  # sample_size x batch

    # stack samples, sample_size x batch
    samples_rewards_stacked = tf.stack(samples_rewards)
    samples_logprobs_stacked = tf.stack(samples_logprobs)

    if subtract_baseline:
        # if specified, compute the average reward baseline
        reward_counter = tf.Variable(0.0,
                                     trainable=False,
                                     name="reward_counter")
        reward_sum = tf.Variable(0.0, trainable=False, name="reward_sum")
        # increment the cumulative reward
        reward_counter = tf.assign_add(
            reward_counter, tf.to_float(decoder.batch_size * sample_size))
        # sum over batch and samples
        reward_sum = tf.assign_add(reward_sum,
                                   tf.reduce_sum(samples_rewards_stacked))
        # compute baseline: avg of previous rewards
        baseline = tf.div(reward_sum, tf.maximum(reward_counter, 1.0))
        samples_rewards_stacked -= baseline

        tf.summary.scalar("train_{}/rl_reward_baseline".format(
            decoder.data_id),
                          tf.reduce_mean(baseline),
                          collections=["summary_train"])

    if normalize:
        # normalize over sample space
        samples_logprobs_stacked = tf.nn.softmax(samples_logprobs_stacked *
                                                 alpha,
                                                 dim=0)

    scored_probs = tf.stop_gradient(
        tf.negative(samples_rewards_stacked)) * samples_logprobs_stacked

    # sum over samples
    total_loss = tf.reduce_sum(scored_probs, axis=0)

    # average over batch
    batch_loss = tf.reduce_mean(total_loss)

    if ce_smoothing > 0.0:
        batch_loss += tf.multiply(ce_smoothing, decoder.cost)

    tf.summary.scalar("train_{}/self_rl_cost".format(decoder.data_id),
                      batch_loss,
                      collections=["summary_train"])

    return Objective(name="{}_rl".format(decoder.name),
                     decoder=decoder,
                     loss=batch_loss,
                     gradients=None,
                     weight=None)
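As a usage illustration, the four configurations named in the docstring above correspond roughly to the following calls. This is a hedged sketch: my_decoder and bleu_reward are placeholders for an existing decoder and reward function and are not defined here.

# Placeholder objects: my_decoder (a Decoder) and bleu_reward (a RewardFunction)
# are assumed to exist already.

# 1) Bandit objective (single sample; subtracting the baseline is recommended)
bandit = rl_objective(my_decoder, bleu_reward,
                      subtract_baseline=True, sample_size=1)

# 2) Minimum Risk Training (several samples, renormalized over the sample space)
mrt = rl_objective(my_decoder, bleu_reward,
                   sample_size=5, normalize=True, alpha=0.005)

# 3) Google-style REINFORCE (several samples, no renormalization)
reinforce = rl_objective(my_decoder, bleu_reward, sample_size=5)

# 4) Mixed objective (REINFORCE plus cross-entropy smoothing)
mixed = rl_objective(my_decoder, bleu_reward,
                     sample_size=5, ce_smoothing=0.3)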
Exemplo n.º 51
    def __init__(
        self,
        g2p_type: Union[None, str],
        non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
        space_symbol: str = "<space>",
        remove_non_linguistic_symbols: bool = False,
    ):
        assert check_argument_types()
        if g2p_type is None:
            self.g2p = split_by_space
        elif g2p_type == "g2p_en":
            self.g2p = G2p_en(no_space=False)
        elif g2p_type == "g2p_en_no_space":
            self.g2p = G2p_en(no_space=True)
        elif g2p_type == "pyopenjtalk":
            self.g2p = pyopenjtalk_g2p
        elif g2p_type == "pyopenjtalk_kana":
            self.g2p = pyopenjtalk_g2p_kana
        elif g2p_type == "pyopenjtalk_accent":
            self.g2p = pyopenjtalk_g2p_accent
        elif g2p_type == "pyopenjtalk_accent_with_pause":
            self.g2p = pyopenjtalk_g2p_accent_with_pause
        elif g2p_type == "pyopenjtalk_prosody":
            self.g2p = pyopenjtalk_g2p_prosody
        elif g2p_type == "pypinyin_g2p":
            self.g2p = pypinyin_g2p
        elif g2p_type == "pypinyin_g2p_phone":
            self.g2p = pypinyin_g2p_phone
        elif g2p_type == "espeak_ng_arabic":
            self.g2p = Phonemizer(
                language="ar",
                backend="espeak",
                with_stress=True,
                preserve_punctuation=True,
            )
        elif g2p_type == "espeak_ng_german":
            self.g2p = Phonemizer(
                language="de",
                backend="espeak",
                with_stress=True,
                preserve_punctuation=True,
            )
        elif g2p_type == "espeak_ng_french":
            self.g2p = Phonemizer(
                language="fr-fr",
                backend="espeak",
                with_stress=True,
                preserve_punctuation=True,
            )
        elif g2p_type == "espeak_ng_spanish":
            self.g2p = Phonemizer(
                language="es",
                backend="espeak",
                with_stress=True,
                preserve_punctuation=True,
            )
        elif g2p_type == "espeak_ng_russian":
            self.g2p = Phonemizer(
                language="ru",
                backend="espeak",
                with_stress=True,
                preserve_punctuation=True,
            )
        elif g2p_type == "espeak_ng_greek":
            self.g2p = Phonemizer(
                language="el",
                backend="espeak",
                with_stress=True,
                preserve_punctuation=True,
            )
        elif g2p_type == "espeak_ng_finnish":
            self.g2p = Phonemizer(
                language="fi",
                backend="espeak",
                with_stress=True,
                preserve_punctuation=True,
            )
        elif g2p_type == "espeak_ng_hungarian":
            self.g2p = Phonemizer(
                language="hu",
                backend="espeak",
                with_stress=True,
                preserve_punctuation=True,
            )
        elif g2p_type == "espeak_ng_dutch":
            self.g2p = Phonemizer(
                language="nl",
                backend="espeak",
                with_stress=True,
                preserve_punctuation=True,
            )
        elif g2p_type == "espeak_ng_hindi":
            self.g2p = Phonemizer(
                language="hi",
                backend="espeak",
                with_stress=True,
                preserve_punctuation=True,
            )
        elif g2p_type == "g2pk":
            self.g2p = G2pk(no_space=False)
        elif g2p_type == "g2pk_no_space":
            self.g2p = G2pk(no_space=True)
        elif g2p_type == "espeak_ng_english_us_vits":
            # VITS official implementation-like processing
            # Reference: https://github.com/jaywalnut310/vits
            self.g2p = Phonemizer(
                language="en-us",
                backend="espeak",
                with_stress=True,
                preserve_punctuation=True,
                strip=True,
                word_separator=" ",
                phone_separator="",
                split_by_single_token=True,
            )
        else:
            raise NotImplementedError(f"Not supported: g2p_type={g2p_type}")

        self.g2p_type = g2p_type
        self.space_symbol = space_symbol
        if non_linguistic_symbols is None:
            self.non_linguistic_symbols = set()
        elif isinstance(non_linguistic_symbols, (Path, str)):
            non_linguistic_symbols = Path(non_linguistic_symbols)
            try:
                with non_linguistic_symbols.open("r", encoding="utf-8") as f:
                    self.non_linguistic_symbols = set(line.rstrip()
                                                      for line in f)
            except FileNotFoundError:
                warnings.warn(f"{non_linguistic_symbols} doesn't exist.")
                self.non_linguistic_symbols = set()
        else:
            self.non_linguistic_symbols = set(non_linguistic_symbols)
        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols
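As a usage note, the constructor above simply maps the g2p_type string onto a callable stored in self.g2p. A hedged sketch, assuming the enclosing class is ESPnet's PhonemeTokenizer and that the g2p_en backend is installed:

# Hypothetical usage; the exact phone output depends on the installed backend.
tokenizer = PhonemeTokenizer(g2p_type="g2p_en")
print(tokenizer.g2p_type)      # "g2p_en"
print(tokenizer.g2p("hello"))  # e.g. ['HH', 'AH0', 'L', 'OW1']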
Exemplo n.º 52
def scoring(
    output_dir: str,
    dtype: str,
    log_level: Union[int, str],
    key_file: str,
    ref_scp: List[str],
    inf_scp: List[str],
    ref_channel: int,
    flexible_numspk: bool,
):
    assert check_argument_types()

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    assert len(ref_scp) == len(inf_scp), ref_scp
    num_spk = len(ref_scp)

    keys = [
        line.rstrip().split(maxsplit=1)[0] for line in open(key_file, encoding="utf-8")
    ]

    ref_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in ref_scp]
    inf_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in inf_scp]

    # get sample rate
    sample_rate, _ = ref_readers[0][keys[0]]

    # check keys
    if not flexible_numspk:
        for inf_reader, ref_reader in zip(inf_readers, ref_readers):
            assert inf_reader.keys() == ref_reader.keys()

    with DatadirWriter(output_dir) as writer:
        for key in keys:
            if not flexible_numspk:
                ref_audios = [ref_reader[key][1] for ref_reader in ref_readers]
                inf_audios = [inf_reader[key][1] for inf_reader in inf_readers]
            else:
                ref_audios = [
                    ref_reader[key][1]
                    for ref_reader in ref_readers
                    if key in ref_reader.keys()
                ]
                inf_audios = [
                    inf_reader[key][1]
                    for inf_reader in inf_readers
                    if key in inf_reader.keys()
                ]
            ref = np.array(ref_audios)
            inf = np.array(inf_audios)
            if ref.ndim > inf.ndim:
                # multi-channel reference and single-channel output
                ref = ref[..., ref_channel]
            elif ref.ndim < inf.ndim:
                # single-channel reference and multi-channel output
                inf = inf[..., ref_channel]
            elif ref.ndim == inf.ndim == 3:
                # multi-channel reference and output
                ref = ref[..., ref_channel]
                inf = inf[..., ref_channel]
            if not flexible_numspk:
                assert ref.shape == inf.shape, (ref.shape, inf.shape)
            else:
                # epsilon value to avoid divergence
                # caused by zero-value, e.g., log(0)
                eps = 0.000001
                # if num_spk of ref > num_spk of inf
                if ref.shape[0] > inf.shape[0]:
                    p = np.full((ref.shape[0] - inf.shape[0], inf.shape[1]), eps)
                    inf = np.concatenate([inf, p])
                    num_spk = ref.shape[0]
                # if num_spk of ref < num_spk of inf
                elif ref.shape[0] < inf.shape[0]:
                    p = np.full((inf.shape[0] - ref.shape[0], ref.shape[1]), eps)
                    ref = np.concatenate([ref, p])
                    num_spk = inf.shape[0]
                else:
                    num_spk = ref.shape[0]

            sdr, sir, sar, perm = bss_eval_sources(ref, inf, compute_permutation=True)

            for i in range(num_spk):
                stoi_score = stoi(ref[i], inf[int(perm[i])], fs_sig=sample_rate)
                estoi_score = stoi(
                    ref[i], inf[int(perm[i])], fs_sig=sample_rate, extended=True
                )
                si_snr_score = -float(
                    si_snr_loss(
                        torch.from_numpy(ref[i][None, ...]),
                        torch.from_numpy(inf[int(perm[i])][None, ...]),
                    )
                )
                writer[f"STOI_spk{i + 1}"][key] = str(stoi_score * 100)  # in percentage
                writer[f"ESTOI_spk{i + 1}"][key] = str(estoi_score * 100)
                writer[f"SI_SNR_spk{i + 1}"][key] = str(si_snr_score)
                writer[f"SDR_spk{i + 1}"][key] = str(sdr[i])
                writer[f"SAR_spk{i + 1}"][key] = str(sar[i])
                writer[f"SIR_spk{i + 1}"][key] = str(sir[i])
                # save permutation assigned script file
                if not flexible_numspk:
                    writer[f"wav_spk{i + 1}"][key] = inf_readers[perm[i]].data[key]
Exemplo n.º 53
    def train_one_epoch(
        cls,
        model: torch.nn.Module,
        iterator: Iterable[Tuple[List[str], Dict[str, torch.Tensor]]],
        optimizers: Sequence[torch.optim.Optimizer],
        schedulers: Sequence[Optional[AbsScheduler]],
        scaler: Optional[GradScaler],
        reporter: SubReporter,
        summary_writer,
        options: TrainerOptions,
        distributed_option: DistributedOption,
    ) -> bool:
        assert check_argument_types()

        grad_noise = options.grad_noise
        accum_grad = options.accum_grad
        grad_clip = options.grad_clip
        grad_clip_type = options.grad_clip_type
        log_interval = options.log_interval
        no_forward_run = options.no_forward_run
        ngpu = options.ngpu
        use_wandb = options.use_wandb
        distributed = distributed_option.distributed

        if log_interval is None:
            try:
                log_interval = max(len(iterator) // 20, 10)
            except TypeError:
                log_interval = 100

        model.train()
        all_steps_are_invalid = True
        # [For distributed] Because iteration counts are not always equal between
        # processes, send a stop flag to the other processes if the iterator is finished
        iterator_stop = torch.tensor(0).to("cuda" if ngpu > 0 else "cpu")

        start_time = time.perf_counter()
        for iiter, (_, batch) in enumerate(
            reporter.measure_iter_time(iterator, "iter_time"), 1
        ):
            assert isinstance(batch, dict), type(batch)

            if distributed:
                torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
                if iterator_stop > 0:
                    break

            batch = to_device(batch, "cuda" if ngpu > 0 else "cpu")
            if no_forward_run:
                all_steps_are_invalid = False
                continue

            with autocast(scaler is not None):
                with reporter.measure_time("forward_time"):
                    retval = model(**batch)

                    # Note(kamo):
                    # Supporting two patterns for the returned value from the model
                    #   a. dict type
                    if isinstance(retval, dict):
                        loss = retval["loss"]
                        stats = retval["stats"]
                        weight = retval["weight"]
                        optim_idx = retval.get("optim_idx")
                        if optim_idx is not None and not isinstance(optim_idx, int):
                            if not isinstance(optim_idx, torch.Tensor):
                                raise RuntimeError(
                                    "optim_idx must be int or 1dim torch.Tensor, "
                                    f"but got {type(optim_idx)}"
                                )
                            if optim_idx.dim() >= 2:
                                raise RuntimeError(
                                    "optim_idx must be int or 1dim torch.Tensor, "
                                    f"but got {optim_idx.dim()}dim tensor"
                                )
                            if optim_idx.dim() == 1:
                                for v in optim_idx:
                                    if v != optim_idx[0]:
                                        raise RuntimeError(
                                            "optim_idx must be 1dim tensor "
                                            "having same values for all entries"
                                        )
                                optim_idx = optim_idx[0].item()
                            else:
                                optim_idx = optim_idx.item()

                    #   b. tuple or list type
                    else:
                        loss, stats, weight = retval
                        optim_idx = None

                stats = {k: v for k, v in stats.items() if v is not None}
                if ngpu > 1 or distributed:
                    # Apply weighted averaging for loss and stats
                    loss = (loss * weight.type(loss.dtype)).sum()

                    # if distributed, this method can also apply all_reduce()
                    stats, weight = recursive_average(stats, weight, distributed)

                    # Now weight is summation over all workers
                    loss /= weight
                if distributed:
                    # NOTE(kamo): Multiply world_size because DistributedDataParallel
                    # automatically normalizes the gradient by world_size.
                    loss *= torch.distributed.get_world_size()

                loss /= accum_grad

            reporter.register(stats, weight)

            with reporter.measure_time("backward_time"):
                if scaler is not None:
                    # Scales loss.  Calls backward() on scaled loss
                    # to create scaled gradients.
                    # Backward passes under autocast are not recommended.
                    # Backward ops run in the same dtype autocast chose
                    # for corresponding forward ops.
                    scaler.scale(loss).backward()
                else:
                    loss.backward()

            if iiter % accum_grad == 0:
                if scaler is not None:
                    # Unscales the gradients of optimizer's assigned params in-place
                    for iopt, optimizer in enumerate(optimizers):
                        if optim_idx is not None and iopt != optim_idx:
                            continue
                        scaler.unscale_(optimizer)

                # gradient noise injection
                if grad_noise:
                    add_gradient_noise(
                        model,
                        reporter.get_total_count(),
                        duration=100,
                        eta=1.0,
                        scale_factor=0.55,
                    )

                # compute the gradient norm to check if it is normal or not
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(),
                    max_norm=grad_clip,
                    norm_type=grad_clip_type,
                )
                # In PyTorch<=1.4, clip_grad_norm_ returns a float value
                if not isinstance(grad_norm, torch.Tensor):
                    grad_norm = torch.tensor(grad_norm)

                if not torch.isfinite(grad_norm):
                    logging.warning(
                        f"The grad norm is {grad_norm}. Skipping updating the model."
                    )

                    # Must invoke scaler.update() if unscale_() is used in the iteration
                    # to avoid the following error:
                    #   RuntimeError: unscale_() has already been called
                    #   on this optimizer since the last update().
                    # Note that if the gradient has inf/nan values,
                    # scaler.step skips optimizer.step().
                    if scaler is not None:
                        for iopt, optimizer in enumerate(optimizers):
                            if optim_idx is not None and iopt != optim_idx:
                                continue
                            scaler.step(optimizer)
                            scaler.update()

                else:
                    all_steps_are_invalid = False
                    with reporter.measure_time("optim_step_time"):
                        for iopt, (optimizer, scheduler) in enumerate(
                            zip(optimizers, schedulers)
                        ):
                            if optim_idx is not None and iopt != optim_idx:
                                continue
                            if scaler is not None:
                                # scaler.step() first unscales the gradients of
                                # the optimizer's assigned params.
                                scaler.step(optimizer)
                                # Updates the scale for next iteration.
                                scaler.update()
                            else:
                                optimizer.step()
                            if isinstance(scheduler, AbsBatchStepScheduler):
                                scheduler.step()
                for iopt, optimizer in enumerate(optimizers):
                    if optim_idx is not None and iopt != optim_idx:
                        continue
                    optimizer.zero_grad()

                # Register lr and train/load time[sec/step],
                # where step refers to accum_grad * mini-batch
                reporter.register(
                    dict(
                        {
                            f"optim{i}_lr{j}": pg["lr"]
                            for i, optimizer in enumerate(optimizers)
                            for j, pg in enumerate(optimizer.param_groups)
                            if "lr" in pg
                        },
                        train_time=time.perf_counter() - start_time,
                    ),
                )
                start_time = time.perf_counter()

            # NOTE(kamo): Call log_message() after next()
            reporter.next()
            if iiter % log_interval == 0:
                logging.info(reporter.log_message(-log_interval))
                if summary_writer is not None:
                    reporter.tensorboard_add_scalar(summary_writer, -log_interval)
                if use_wandb:
                    reporter.wandb_log()

        else:
            if distributed:
                iterator_stop.fill_(1)
                torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
        return all_steps_are_invalid
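The interplay of accum_grad with the optional GradScaler above reduces to a small pattern: scale and accumulate the loss for accum_grad mini-batches, then unscale, clip, step, and reset. A self-contained sketch with a toy model (not ESPnet's trainer):

import torch
from torch.cuda.amp import GradScaler, autocast

model = torch.nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
use_amp = torch.cuda.is_available()
scaler = GradScaler(enabled=use_amp)
accum_grad = 4

for step in range(1, 17):
    x, y = torch.randn(8, 10), torch.randn(8, 1)
    with autocast(enabled=use_amp):
        # Divide by accum_grad so the accumulated gradient matches a full batch.
        loss = torch.nn.functional.mse_loss(model(x), y) / accum_grad
    scaler.scale(loss).backward()
    if step % accum_grad == 0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)   # skipped automatically if gradients are inf/nan
        scaler.update()
        optimizer.zero_grad()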
Exemplo n.º 54
    def __init__(
        self,
        st_train_config: Union[Path, str] = None,
        st_model_file: Union[Path, str] = None,
        lm_train_config: Union[Path, str] = None,
        lm_file: Union[Path, str] = None,
        ngram_scorer: str = "full",
        ngram_file: Union[Path, str] = None,
        token_type: str = None,
        bpemodel: str = None,
        device: str = "cpu",
        maxlenratio: float = 0.0,
        minlenratio: float = 0.0,
        batch_size: int = 1,
        dtype: str = "float32",
        beam_size: int = 20,
        lm_weight: float = 1.0,
        ngram_weight: float = 0.9,
        penalty: float = 0.0,
        nbest: int = 1,
        enh_s2t_task: bool = False,
    ):
        assert check_argument_types()

        task = STTask if not enh_s2t_task else EnhS2TTask

        # 1. Build ST model
        scorers = {}
        st_model, st_train_args = task.build_model_from_file(
            st_train_config, st_model_file, device
        )
        if enh_s2t_task:
            st_model.inherite_attributes(
                inherite_s2t_attrs=[
                    "ctc",
                    "decoder",
                    "eos",
                    "joint_network",
                    "sos",
                    "token_list",
                    "use_transducer_decoder",
                ]
            )
        st_model.to(dtype=getattr(torch, dtype)).eval()

        decoder = st_model.decoder
        token_list = st_model.token_list
        scorers.update(
            decoder=decoder,
            length_bonus=LengthBonus(len(token_list)),
        )

        # 2. Build Language model
        if lm_train_config is not None:
            lm, lm_train_args = LMTask.build_model_from_file(
                lm_train_config, lm_file, device
            )
            scorers["lm"] = lm.lm

        # 3. Build ngram model
        if ngram_file is not None:
            if ngram_scorer == "full":
                from espnet.nets.scorers.ngram import NgramFullScorer

                ngram = NgramFullScorer(ngram_file, token_list)
            else:
                from espnet.nets.scorers.ngram import NgramPartScorer

                ngram = NgramPartScorer(ngram_file, token_list)
        else:
            ngram = None
        scorers["ngram"] = ngram

        # 4. Build BeamSearch object
        weights = dict(
            decoder=1.0,
            lm=lm_weight,
            ngram=ngram_weight,
            length_bonus=penalty,
        )
        beam_search = BeamSearch(
            beam_size=beam_size,
            weights=weights,
            scorers=scorers,
            sos=st_model.sos,
            eos=st_model.eos,
            vocab_size=len(token_list),
            token_list=token_list,
            pre_beam_score_key="full",
        )
        # TODO(karita): make all scorers batchified
        if batch_size == 1:
            non_batch = [
                k
                for k, v in beam_search.full_scorers.items()
                if not isinstance(v, BatchScorerInterface)
            ]
            if len(non_batch) == 0:
                beam_search.__class__ = BatchBeamSearch
                logging.info("BatchBeamSearch implementation is selected.")
            else:
                logging.warning(
                    f"As non-batch scorers {non_batch} are found, "
                    f"fall back to non-batch implementation."
                )
        beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
        for scorer in scorers.values():
            if isinstance(scorer, torch.nn.Module):
                scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
        logging.info(f"Beam_search: {beam_search}")
        logging.info(f"Decoding device={device}, dtype={dtype}")

        # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
        if token_type is None:
            token_type = st_train_args.token_type
        if bpemodel is None:
            bpemodel = st_train_args.bpemodel

        if token_type is None:
            tokenizer = None
        elif token_type == "bpe":
            if bpemodel is not None:
                tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
            else:
                tokenizer = None
        else:
            tokenizer = build_tokenizer(token_type=token_type)
        converter = TokenIDConverter(token_list=token_list)
        logging.info(f"Text tokenizer: {tokenizer}")

        self.st_model = st_model
        self.st_train_args = st_train_args
        self.converter = converter
        self.tokenizer = tokenizer
        self.beam_search = beam_search
        self.maxlenratio = maxlenratio
        self.minlenratio = minlenratio
        self.device = device
        self.dtype = dtype
        self.nbest = nbest
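The weights dict passed to BeamSearch above determines how the individual scorers are blended: conceptually, each hypothesis is ranked by a weighted sum of the partial scores. A tiny illustration with made-up numbers (not the BeamSearch implementation itself):

# Made-up partial log scores for a single hypothesis.
partial_scores = {"decoder": -2.3, "lm": -5.1, "ngram": -4.0, "length_bonus": 7.0}
weights = {"decoder": 1.0, "lm": 1.0, "ngram": 0.9, "length_bonus": 0.0}

total = sum(weights[name] * score for name, score in partial_scores.items())
print(total)  # approximately -11.0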
Exemplo n.º 55
    def __init__(self,
                 encoders: List[Any],
                 vocabulary: Vocabulary,
                 data_id: str,
                 name: str,
                 max_output_len: int,
                 dropout_keep_prob: float=1.0,
                 rnn_size: Optional[int]=None,
                 embedding_size: Optional[int]=None,
                 output_projection: Optional[Callable[
                     [tf.Tensor, tf.Tensor, List[tf.Tensor]], tf.Tensor]]=None,
                 encoder_projection: Optional[Callable[
                     [tf.Tensor, Optional[int], Optional[List[Any]]],
                     tf.Tensor]]=None,
                 use_attention: bool=False,
                 embeddings_encoder: Optional[Any]=None,
                 attention_on_input: bool=True,
                 rnn_cell: str='GRU',
                 conditional_gru: bool=False,
                 save_checkpoint: Optional[str]=None,
                 load_checkpoint: Optional[str]=None) -> None:
        """Create a refactored version of monster decoder.

        Arguments:
            encoders: Input encoders of the decoder
            vocabulary: Target vocabulary
            data_id: Target data series
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects
            max_output_len: Maximum length of an output sequence
            dropout_keep_prob: Probability of keeping a value during dropout

        Keyword arguments:
            rnn_size: Size of the decoder hidden state, if None set
                according to encoders.
            embedding_size: Size of embedding vectors for target words
            output_projection: How to generate distribution over vocabulary
                from decoder rnn_outputs
            encoder_projection: How to construct initial state from encoders
            use_attention: Flag whether to look at attention vectors of the
                encoders
            embeddings_encoder: Encoder to take embeddings from
            rnn_cell: RNN Cell used by the decoder (GRU or LSTM)
            conditional_gru: Flag whether to use the Conditional GRU
                architecture
            attention_on_input: Flag whether attention from previous decoding
                step should be combined with the input in the next step.
        """
        ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)
        log("Initializing decoder, name: '{}'".format(name))

        assert check_argument_types()

        self.encoders = encoders
        self.vocabulary = vocabulary
        self.data_id = data_id
        self.max_output_len = max_output_len
        self.dropout_keep_prob = dropout_keep_prob
        self.embedding_size = embedding_size
        self.rnn_size = rnn_size
        self.output_projection = output_projection
        self.encoder_projection = encoder_projection
        self.use_attention = use_attention
        self.embeddings_encoder = embeddings_encoder
        self._rnn_cell = rnn_cell

        if self.embedding_size is None and self.embeddings_encoder is None:
            raise ValueError("You must specify either embedding size or the "
                             "encoder from which to reuse the embeddings ("
                             "e.g. set either 'embedding_size' or "
                             " 'embeddings_encoder' parameter)")

        if self.embeddings_encoder is not None:
            if self.embedding_size is not None:
                warn("Overriding the embedding_size parameter with the"
                     " size of the reused embeddings from the encoder.")

            self.embedding_size = (
                self.embeddings_encoder.embedding_matrix.get_shape()[1].value)

        if self.encoder_projection is None:
            if len(self.encoders) == 0:
                log("No encoder - language model only.")
                self.encoder_projection = empty_initial_state
            elif rnn_size is None:
                log("No rnn_size or encoder_projection: Using concatenation of"
                    " encoded states")
                self.encoder_projection = concat_encoder_projection
                self.rnn_size = sum(e.encoded.get_shape()[1].value
                                    for e in encoders)
            else:
                log("Using linear projection of encoders as the initial state")
                self.encoder_projection = linear_encoder_projection(
                    self.dropout_keep_prob)

        assert self.rnn_size is not None

        if self.output_projection is None:
            log("No output projection specified - using simple concatenation")
            self.output_projection = no_deep_output

        with tf.variable_scope(name):
            self._create_input_placeholders()
            self._create_training_placeholders()
            self._create_initial_state()
            self._create_embedding_matrix()

            with tf.name_scope("output_projection"):
                self.decoding_w = tf.get_variable(
                    "state_to_word_W", [self.rnn_size, len(self.vocabulary)],
                    initializer=tf.random_uniform_initializer(-0.5, 0.5))

                self.decoding_b = tf.get_variable(
                    "state_to_word_b", [len(self.vocabulary)],
                    initializer=tf.constant_initializer(
                        - math.log(len(self.vocabulary))))

            # The last train input is not used in the decoding function
            # (it is used only as a target)
            embedded_train_inputs = self._embed_and_dropout(
                self.train_inputs[:-1])

            # NOTE: no dropout is applied here
            embedded_go_symbols = tf.nn.embedding_lookup(self.embedding_matrix,
                                                         self.go_symbols)

            # fetch train attention objects
            self._train_attention_objects = {}
            # type: Dict[Attentive, tf.Tensor]
            if self.use_attention:
                with tf.name_scope("attention_object"):
                    self._train_attention_objects = {
                        e: e.create_attention_object()
                        for e in self.encoders
                        if isinstance(e, Attentive)}

            train_rnn_outputs, _ = self._attention_decoder(
                embedded_go_symbols,
                attention_on_input=attention_on_input,
                conditional_gru=conditional_gru,
                train_inputs=embedded_train_inputs,
                train_mode=True)

            assert not tf.get_variable_scope().reuse
            tf.get_variable_scope().reuse_variables()

            # fetch runtime attention objects
            self._runtime_attention_objects = {}
            # type: Dict[Attentive, tf.Tensor]
            if self.use_attention:
                self._runtime_attention_objects = {
                    e: e.create_attention_object()
                    for e in self.encoders
                    if isinstance(e, Attentive)}

            (self.runtime_rnn_outputs,
             self.runtime_rnn_states) = self._attention_decoder(
                 embedded_go_symbols,
                 attention_on_input=attention_on_input,
                 conditional_gru=conditional_gru,
                 train_mode=False)

            self.hidden_states = self.runtime_rnn_outputs

            def decode(rnn_outputs: List[tf.Tensor]) -> Tuple[
                    List[tf.Tensor], List[tf.Tensor]]:
                with tf.name_scope("output_projection"):
                    logits = []
                    decoded = []

                    for out in rnn_outputs:
                        out_activation = self._logit_function(out)
                        logits.append(out_activation)
                        decoded.append(tf.argmax(out_activation[:, 1:], 1) + 1)

                    return decoded, logits

            _, self.train_logits = decode(train_rnn_outputs)

            train_targets = tf.transpose(self.train_inputs)

            self.train_xents = tf.contrib.seq2seq.sequence_loss(
                tf.stack(self.train_logits, 1), train_targets,
                tf.transpose(self.train_padding),
                average_across_batch=False)
            self.train_loss = tf.reduce_mean(self.train_xents)
            self.cost = self.train_loss

            self.train_logprobs = [tf.nn.log_softmax(l)
                                   for l in self.train_logits]

            self.decoded, self.runtime_logits = decode(
                self.runtime_rnn_outputs)

            self.runtime_loss = tf.contrib.seq2seq.sequence_loss(
                tf.stack(self.runtime_logits, 1), train_targets,
                tf.transpose(self.train_padding))

            self.runtime_logprobs = [tf.nn.log_softmax(l)
                                     for l in self.runtime_logits]

            self._visualize_attention()

            log("Decoder initalized.")
Exemplo n.º 56
    def run(
        cls,
        model: AbsESPnetModel,
        optimizers: Sequence[torch.optim.Optimizer],
        schedulers: Sequence[Optional[AbsScheduler]],
        train_iter_factory: AbsIterFactory,
        valid_iter_factory: AbsIterFactory,
        plot_attention_iter_factory: Optional[AbsIterFactory],
        trainer_options,
        distributed_option: DistributedOption,
    ) -> None:
        """Perform training. This method performs the main process of training."""
        assert check_argument_types()
        # NOTE(kamo): Don't check the type of trainer_options more strictly here
        assert is_dataclass(trainer_options), type(trainer_options)
        assert len(optimizers) == len(schedulers), (len(optimizers), len(schedulers))

        if isinstance(trainer_options.keep_nbest_models, int):
            keep_nbest_models = [trainer_options.keep_nbest_models]
        else:
            if len(trainer_options.keep_nbest_models) == 0:
                logging.warning("No keep_nbest_models is given. Change to [1]")
                trainer_options.keep_nbest_models = [1]
            keep_nbest_models = trainer_options.keep_nbest_models

        output_dir = Path(trainer_options.output_dir)
        reporter = Reporter()
        if trainer_options.use_amp:
            if LooseVersion(torch.__version__) < LooseVersion("1.6.0"):
                raise RuntimeError(
                    "Require torch>=1.6.0 for  Automatic Mixed Precision"
                )
            if trainer_options.sharded_ddp:
                if fairscale is None:
                    raise RuntimeError(
                        "Requiring fairscale. Do 'pip install fairscale'"
                    )
                scaler = fairscale.optim.grad_scaler.ShardedGradScaler()
            else:
                scaler = GradScaler()
        else:
            scaler = None

        if trainer_options.resume and (output_dir / "checkpoint.pth").exists():
            cls.resume(
                checkpoint=output_dir / "checkpoint.pth",
                model=model,
                optimizers=optimizers,
                schedulers=schedulers,
                reporter=reporter,
                scaler=scaler,
                ngpu=trainer_options.ngpu,
            )

        start_epoch = reporter.get_epoch() + 1
        if start_epoch == trainer_options.max_epoch + 1:
            logging.warning(
                f"The training has already reached at max_epoch: {start_epoch}"
            )

        if distributed_option.distributed:
            if trainer_options.sharded_ddp:
                dp_model = fairscale.nn.data_parallel.ShardedDataParallel(
                    module=model,
                    sharded_optimizer=optimizers,
                )
            else:
                dp_model = torch.nn.parallel.DistributedDataParallel(
                    model,
                    device_ids=(
                        # Perform multi-Process with multi-GPUs
                        [torch.cuda.current_device()]
                        if distributed_option.ngpu == 1
                        # Perform single-Process with multi-GPUs
                        else None
                    ),
                    output_device=(
                        torch.cuda.current_device()
                        if distributed_option.ngpu == 1
                        else None
                    ),
                    find_unused_parameters=trainer_options.unused_parameters,
                )
        elif distributed_option.ngpu > 1:
            dp_model = torch.nn.parallel.DataParallel(
                model,
                device_ids=list(range(distributed_option.ngpu)),
            )
        else:
            # NOTE(kamo): DataParallel also should work with ngpu=1,
            # but for debuggability it's better to keep this block.
            dp_model = model

        if trainer_options.use_tensorboard and (
            not distributed_option.distributed or distributed_option.dist_rank == 0
        ):
            from torch.utils.tensorboard import SummaryWriter

            train_summary_writer = SummaryWriter(
                str(output_dir / "tensorboard" / "train")
            )
            valid_summary_writer = SummaryWriter(
                str(output_dir / "tensorboard" / "valid")
            )
        else:
            train_summary_writer = None

        start_time = time.perf_counter()
        for iepoch in range(start_epoch, trainer_options.max_epoch + 1):
            if iepoch != start_epoch:
                logging.info(
                    "{}/{}epoch started. Estimated time to finish: {}".format(
                        iepoch,
                        trainer_options.max_epoch,
                        humanfriendly.format_timespan(
                            (time.perf_counter() - start_time)
                            / (iepoch - start_epoch)
                            * (trainer_options.max_epoch - iepoch + 1)
                        ),
                    )
                )
            else:
                logging.info(f"{iepoch}/{trainer_options.max_epoch}epoch started")
            set_all_random_seed(trainer_options.seed + iepoch)

            reporter.set_epoch(iepoch)
            # 1. Train and validation for one-epoch
            with reporter.observe("train") as sub_reporter:
                all_steps_are_invalid = cls.train_one_epoch(
                    model=dp_model,
                    optimizers=optimizers,
                    schedulers=schedulers,
                    iterator=train_iter_factory.build_iter(iepoch),
                    reporter=sub_reporter,
                    scaler=scaler,
                    summary_writer=train_summary_writer,
                    options=trainer_options,
                    distributed_option=distributed_option,
                )

            with reporter.observe("valid") as sub_reporter:
                cls.validate_one_epoch(
                    model=dp_model,
                    iterator=valid_iter_factory.build_iter(iepoch),
                    reporter=sub_reporter,
                    options=trainer_options,
                    distributed_option=distributed_option,
                )
            if not distributed_option.distributed or distributed_option.dist_rank == 0:
                # att_plot doesn't support distributed
                if plot_attention_iter_factory is not None:
                    with reporter.observe("att_plot") as sub_reporter:
                        cls.plot_attention(
                            model=model,
                            output_dir=output_dir / "att_ws",
                            summary_writer=train_summary_writer,
                            iterator=plot_attention_iter_factory.build_iter(iepoch),
                            reporter=sub_reporter,
                            options=trainer_options,
                        )

            # 2. LR Scheduler step
            for scheduler in schedulers:
                if isinstance(scheduler, AbsValEpochStepScheduler):
                    scheduler.step(
                        reporter.get_value(*trainer_options.val_scheduler_criterion)
                    )
                elif isinstance(scheduler, AbsEpochStepScheduler):
                    scheduler.step()
            if trainer_options.sharded_ddp:
                for optimizer in optimizers:
                    if isinstance(optimizer, fairscale.optim.oss.OSS):
                        optimizer.consolidate_state_dict()

            if not distributed_option.distributed or distributed_option.dist_rank == 0:
                # 3. Report the results
                logging.info(reporter.log_message())
                if trainer_options.use_matplotlib:
                    reporter.matplotlib_plot(output_dir / "images")
                if train_summary_writer is not None:
                    reporter.tensorboard_add_scalar(train_summary_writer, key1="train")
                    reporter.tensorboard_add_scalar(valid_summary_writer, key1="valid")
                if trainer_options.use_wandb:
                    reporter.wandb_log()

                # 4. Save/Update the checkpoint
                torch.save(
                    {
                        "model": model.state_dict(),
                        "reporter": reporter.state_dict(),
                        "optimizers": [o.state_dict() for o in optimizers],
                        "schedulers": [
                            s.state_dict() if s is not None else None
                            for s in schedulers
                        ],
                        "scaler": scaler.state_dict() if scaler is not None else None,
                    },
                    output_dir / "checkpoint.pth",
                )

                # 5. Save and log the model and update the link to the best model
                torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth")

                # Creates a sym link latest.pth -> {iepoch}epoch.pth
                p = output_dir / "latest.pth"
                if p.is_symlink() or p.exists():
                    p.unlink()
                p.symlink_to(f"{iepoch}epoch.pth")

                _improved = []
                for _phase, k, _mode in trainer_options.best_model_criterion:
                    # e.g. _phase, k, _mode = "train", "loss", "min"
                    if reporter.has(_phase, k):
                        best_epoch = reporter.get_best_epoch(_phase, k, _mode)
                        # Creates sym links if it's the best result
                        if best_epoch == iepoch:
                            p = output_dir / f"{_phase}.{k}.best.pth"
                            if p.is_symlink() or p.exists():
                                p.unlink()
                            p.symlink_to(f"{iepoch}epoch.pth")
                            _improved.append(f"{_phase}.{k}")
                if len(_improved) == 0:
                    logging.info("There are no improvements in this epoch")
                else:
                    logging.info(
                        "The best model has been updated: " + ", ".join(_improved)
                    )

                log_model = (
                    trainer_options.wandb_model_log_interval > 0
                    and iepoch % trainer_options.wandb_model_log_interval == 0
                )
                if log_model and trainer_options.use_wandb:
                    import wandb

                    logging.info("Logging Model on this epoch :::::")
                    artifact = wandb.Artifact(
                        name=f"model_{wandb.run.id}",
                        type="model",
                        metadata={"improved": _improved},
                    )
                    artifact.add_file(str(output_dir / f"{iepoch}epoch.pth"))
                    aliases = [
                        f"epoch-{iepoch}",
                        "best" if best_epoch == iepoch else "",
                    ]
                    wandb.log_artifact(artifact, aliases=aliases)

                # 6. Remove the model files excluding n-best epoch and latest epoch
                _removed = []
                # Get the union set of the n-best among multiple criterion
                nbests = set().union(
                    *[
                        set(reporter.sort_epochs(ph, k, m)[: max(keep_nbest_models)])
                        for ph, k, m in trainer_options.best_model_criterion
                        if reporter.has(ph, k)
                    ]
                )

                # Generate the n-best averaged model
                if (
                    trainer_options.nbest_averaging_interval > 0
                    and iepoch % trainer_options.nbest_averaging_interval == 0
                ):
                    average_nbest_models(
                        reporter=reporter,
                        output_dir=output_dir,
                        best_model_criterion=trainer_options.best_model_criterion,
                        nbest=keep_nbest_models,
                    )

                for e in range(1, iepoch):
                    p = output_dir / f"{e}epoch.pth"
                    if p.exists() and e not in nbests:
                        p.unlink()
                        _removed.append(str(p))
                if len(_removed) != 0:
                    logging.info("The model files were removed: " + ", ".join(_removed))

            # 7. If no update has happened in this epoch, stop the training
            if all_steps_are_invalid:
                logging.warning(
                    f"The gradients at all steps are invalid in this epoch. "
                    f"Something seems wrong. This training was stopped at {iepoch}epoch"
                )
                break

            # 8. Check early stopping
            if trainer_options.patience is not None:
                if reporter.check_early_stopping(
                    trainer_options.patience, *trainer_options.early_stopping_criterion
                ):
                    break

        else:
            logging.info(
                f"The training was finished at {trainer_options.max_epoch} epochs "
            )

        # Generate the n-best averaged model
        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            average_nbest_models(
                reporter=reporter,
                output_dir=output_dir,
                best_model_criterion=trainer_options.best_model_criterion,
                nbest=keep_nbest_models,
            )
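The call to average_nbest_models above combines the parameters of the retained checkpoints. A hedged sketch of such state-dict averaging, with placeholder file names rather than the trainer's actual bookkeeping:

import torch

def average_state_dicts(paths):
    """Average the parameters of several saved state dicts (sketch only)."""
    avg = None
    for p in paths:
        sd = torch.load(p, map_location="cpu")
        if avg is None:
            avg = {k: v.clone().float() for k, v in sd.items()}
        else:
            for k in avg:
                avg[k] += sd[k].float()
    return {k: v / len(paths) for k, v in avg.items()}

# e.g. averaged = average_state_dicts(["3epoch.pth", "7epoch.pth", "9epoch.pth"])
#      torch.save(averaged, "averaged.pth")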
Exemplo n.º 57
    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: Optional[str] = "conv2d",
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
        positionwise_layer_type: str = "linear",
        positionwise_conv_kernel_size: int = 1,
        padding_idx: int = -1,
    ):
        assert check_argument_types()
        super().__init__()
        self._output_size = output_size

        if input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(input_size, output_size),
                torch.nn.LayerNorm(output_size),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(output_size, positional_dropout_rate),
            )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(input_size, output_size,
                                           dropout_rate)
        elif input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(input_size,
                                   output_size,
                                   padding_idx=padding_idx),
                pos_enc_class(output_size, positional_dropout_rate),
            )
        elif input_layer is None:
            self.embed = torch.nn.Sequential(
                pos_enc_class(output_size, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        self.normalize_before = normalize_before
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (
                output_size,
                linear_units,
                dropout_rate,
            )
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (
                output_size,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (
                output_size,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        else:
            raise NotImplementedError("Support only linear or conv1d.")
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                output_size,
                MultiHeadedAttention(attention_heads, output_size,
                                     attention_dropout_rate),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(output_size)
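For the "linear" branch above, the positionwise layer is the standard Transformer feed-forward block applied independently at every time step. A self-contained sketch of such a module (not ESPnet's exact class):

import torch

class SimplePositionwiseFeedForward(torch.nn.Module):
    """Two-layer feed-forward block applied at each position of a sequence."""

    def __init__(self, idim: int, hidden_units: int, dropout_rate: float):
        super().__init__()
        self.w_1 = torch.nn.Linear(idim, hidden_units)
        self.w_2 = torch.nn.Linear(hidden_units, idim)
        self.dropout = torch.nn.Dropout(dropout_rate)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, time, idim) -> (batch, time, idim)
        return self.w_2(self.dropout(torch.relu(self.w_1(x))))

ff = SimplePositionwiseFeedForward(idim=256, hidden_units=2048, dropout_rate=0.1)
print(ff(torch.randn(2, 50, 256)).shape)  # torch.Size([2, 50, 256])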
Exemplo n.º 58
    def __init__(self,
                 name: str,
                 encoders: List[Attendable],
                 vocabulary: Vocabulary,
                 data_id: str,
                 # TODO infer the default for these three from the encoder
                 ff_hidden_size: int,
                 n_heads_self: int,
                 n_heads_enc: Union[List[int], int],
                 depth: int,
                 max_output_len: int,
                 attention_combination_strategy: str = "serial",
                 n_heads_hier: int = None,
                 dropout_keep_prob: float = 1.0,
                 embedding_size: int = None,
                 embeddings_source: EmbeddedSequence = None,
                 tie_embeddings: bool = True,
                 label_smoothing: float = None,
                 self_attention_dropout_keep_prob: float = 1.0,
                 attention_dropout_keep_prob: Union[float, List[float]] = 1.0,
                 use_att_transform_bias: bool = False,
                 supress_unk: bool = False,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Create a decoder of the Transformer model.

        Described in Vaswani et al. (2017), arxiv.org/abs/1706.03762

        Arguments:
            encoders: Input encoders for the decoder.
            vocabulary: Target vocabulary.
            data_id: Target data series.
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects.
            max_output_len: Maximum length of an output sequence.
            dropout_keep_prob: Probability of keeping a value during dropout.
            embedding_size: Size of embedding vectors for target words.
            embeddings_source: Embedded sequence to take embeddings from.
            tie_embeddings: Use decoder.embedding_matrix also in place
                of the output decoding matrix.
            ff_hidden_size: Size of the feedforward sublayers.
            n_heads_self: Number of the self-attention heads.
            n_heads_enc: Number of the attention heads over each encoder.
                Either a list whose size must equal the number of ``encoders``, or a
                single integer. In the latter case, the number of heads is
                equal for all encoders.
            attention_combination_strategy: One of ``serial``, ``parallel``,
                ``flat``, ``hierarchical``. Controls the attention combination
                strategy for enc-dec attention.
            n_heads_hier: Number of the attention heads for the second
                attention in the ``hierarchical`` attention combination.
            depth: Number of sublayers.
            label_smoothing: A label smoothing parameter for cross entropy
                loss computation.
            attention_dropout_keep_prob: Probability of keeping a value
                during dropout on the attention output.
            supress_unk: If true, the decoder will not produce symbols for
                unknown tokens.
            reuse: Reuse the variables from the given model part.
        """
        check_argument_types()
        AutoregressiveDecoder.__init__(
            self,
            name=name,
            vocabulary=vocabulary,
            data_id=data_id,
            max_output_len=max_output_len,
            dropout_keep_prob=dropout_keep_prob,
            embedding_size=embedding_size,
            embeddings_source=embeddings_source,
            tie_embeddings=tie_embeddings,
            label_smoothing=label_smoothing,
            supress_unk=supress_unk,
            reuse=reuse,
            save_checkpoint=save_checkpoint,
            load_checkpoint=load_checkpoint)

        self.encoders = encoders
        self.ff_hidden_size = ff_hidden_size
        self.n_heads_self = n_heads_self

        if isinstance(n_heads_enc, int):
            if attention_combination_strategy == "flat":
                self.n_heads_enc = [n_heads_enc]
            else:
                self.n_heads_enc = [n_heads_enc for _ in self.encoders]
        else:
            self.n_heads_enc = n_heads_enc

        self.depth = depth
        if isinstance(attention_dropout_keep_prob, float):
            self.attention_dropout_keep_prob = [
                attention_dropout_keep_prob for _ in encoders]
        else:
            self.attention_dropout_keep_prob = attention_dropout_keep_prob
        self.self_att_dropout_keep_prob = self_attention_dropout_keep_prob
        self.use_att_transform_bias = use_att_transform_bias
        self.attention_combination_strategy = attention_combination_strategy
        self.n_heads_hier = n_heads_hier

        self.encoder_states = lambda: [get_attention_states(e)
                                       for e in self.encoders]
        self.encoder_masks = lambda: [get_attention_mask(e)
                                      for e in self.encoders]

        if self.attention_combination_strategy not in STRATEGIES:
            raise ValueError(
                "Unknown attention combination strategy '{}'. "
                "Allowed: {}.".format(self.attention_combination_strategy,
                                      ", ".join(STRATEGIES)))

        if (self.attention_combination_strategy == "hierarchical"
                and self.n_heads_hier is None):
            raise ValueError(
                "You must provide n_heads_hier when using the hierarchical "
                "attention combination strategy.")

        if (self.attention_combination_strategy != "hierarchical"
                and self.n_heads_hier is not None):
            warn("Ignoring n_heads_hier parameter -- use the hierarchical "
                 "attention combination strategy instead.")

        if (self.attention_combination_strategy == "flat"
                and len(self.n_heads_enc) != 1):
            raise ValueError(
                "For the flat attention combination strategy, only a single "
                "value is permitted in n_heads_enc.")

        self._variable_scope.set_initializer(tf.variance_scaling_initializer(
            mode="fan_avg", distribution="uniform"))
Exemplo n.º 59
0
    def add_resource_factory(self,
                             factory_callback: factory_callback_type,
                             types: Union[type, Sequence[Type]],
                             name: str = 'default',
                             context_attr: str = None) -> None:
        """
        Add a resource factory to this context.

        This will cause a ``resource_added`` event to be dispatched.

        A resource factory is a callable that generates a "contextual" resource when it is
        requested, either via one of the methods :meth:`get_resource`, :meth:`require_resource`
        and :meth:`request_resource`, or when its context attribute is accessed.

        When a new resource is created in this manner, it is always bound to the context through
        which it was requested, regardless of where in the chain the factory itself was added.

        :param factory_callback: a (non-coroutine) callable that takes a context instance as
            argument and returns the created resource object
        :param types: one or more types to register the generated resource as on the target context
        :param name: name of the resource that will be created in the target context
        :param context_attr: name of the context attribute the created resource will be accessible
            as
        :raises asphalt.core.context.ResourceConflict: if there is an existing resource factory for
            the given type/name combinations or the given context variable

        """
        assert check_argument_types()
        self._check_closed()
        if not resource_name_re.fullmatch(name):
            raise ValueError(
                '"name" must be a nonempty string consisting only of alphanumeric '
                'characters and underscores')
        if iscoroutinefunction(factory_callback):
            raise TypeError(
                '"factory_callback" must not be a coroutine function')
        if not types:
            raise ValueError('"types" must not be empty')

        if isinstance(types, type):
            resource_types = (types, )  # type: Tuple[type, ...]
        else:
            resource_types = tuple(types)

        # Check for a conflicting context attribute
        if context_attr in self._resource_factories_by_context_attr:
            raise ResourceConflict(
                'this context already contains a resource factory for the context attribute {!r}'
                .format(context_attr))

        # Check for conflicts with existing resource factories
        for type_ in resource_types:
            if (type_, name) in self._resource_factories:
                raise ResourceConflict(
                    'this context already contains a resource factory for the '
                    'type {}'.format(qualified_name(type_)))

        # Add the resource factory to the appropriate lookup tables
        resource = ResourceContainer(factory_callback, resource_types, name,
                                     context_attr, True)
        for type_ in resource_types:
            self._resource_factories[(type_, name)] = resource

        if context_attr:
            self._resource_factories_by_context_attr[context_attr] = resource

        # Notify listeners that a new resource has been made available
        self.resource_added.dispatch(resource_types, name, True)
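A hedged usage sketch of the factory API above, following only the documented signature; it assumes an asphalt-style Context as in this method, and ThreadPoolExecutor is just an arbitrary illustrative resource type.

from concurrent.futures import ThreadPoolExecutor

from asphalt.core.context import Context


async def setup(ctx: Context) -> None:
    # The factory callback receives the requesting context and returns the
    # resource; it runs lazily when the resource is first asked for, and the
    # created resource is bound to the context through which it was requested.
    ctx.add_resource_factory(
        lambda context: ThreadPoolExecutor(max_workers=4),
        ThreadPoolExecutor,
        name='default',
        context_attr='executor')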
Exemplo n.º 60
0
    def build_model(cls, args: argparse.Namespace) -> ESPnetASRModel:
        assert check_argument_types()
        if isinstance(args.token_list, str):
            with open(args.token_list, encoding="utf-8") as f:
                token_list = [line.rstrip() for line in f]

            # Overwrite token_list so that it stays portable (a plain list).
            args.token_list = list(token_list)
        elif isinstance(args.token_list, (tuple, list)):
            token_list = list(args.token_list)
        else:
            raise RuntimeError("token_list must be str or list")
        vocab_size = len(token_list)
        logging.info(f"Vocabulary size: {vocab_size }")

        # 1. frontend
        if args.input_size is None:
            # Extract features in the model
            frontend_class = frontend_choices.get_class(args.frontend)
            frontend = frontend_class(**args.frontend_conf)
            input_size = frontend.output_size()
        else:
            # Give features from data-loader
            args.frontend = None
            args.frontend_conf = {}
            frontend = None
            input_size = args.input_size

        # 2. Data augmentation for spectrogram
        if args.specaug is not None:
            specaug_class = specaug_choices.get_class(args.specaug)
            specaug = specaug_class(**args.specaug_conf)
        else:
            specaug = None

        # 3. Normalization layer
        if args.normalize is not None:
            normalize_class = normalize_choices.get_class(args.normalize)
            normalize = normalize_class(**args.normalize_conf)
        else:
            normalize = None

        # 4. Pre-encoder input block
        # NOTE(kan-bayashi): Use getattr to keep the compatibility
        if getattr(args, "preencoder", None) is not None:
            preencoder_class = preencoder_choices.get_class(args.preencoder)
            preencoder = preencoder_class(**args.preencoder_conf)
            input_size = preencoder.output_size()
        else:
            preencoder = None

        # 5. Encoder
        encoder_class = encoder_choices.get_class(args.encoder)
        encoder = encoder_class(input_size=input_size, **args.encoder_conf)

        # 6. Decoder
        decoder_class = decoder_choices.get_class(args.decoder)

        decoder = decoder_class(
            vocab_size=vocab_size,
            encoder_output_size=encoder.output_size(),
            **args.decoder_conf,
        )

        # 7. CTC
        ctc = CTC(
            odim=vocab_size, encoder_output_size=encoder.output_size(), **args.ctc_conf
        )

        # 8. RNN-T Decoder (Not implemented)
        rnnt_decoder = None

        # 9. Build model
        model = ESPnetASRModel(
            vocab_size=vocab_size,
            frontend=frontend,
            specaug=specaug,
            normalize=normalize,
            preencoder=preencoder,
            encoder=encoder,
            decoder=decoder,
            ctc=ctc,
            rnnt_decoder=rnnt_decoder,
            token_list=token_list,
            **args.model_conf,
        )

        # FIXME(kamo): Should be done in model?
        # 10. Initialize
        if args.init is not None:
            initialize(model, args.init)

        assert check_return_type(model)
        return model
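The build steps above repeatedly use the same dispatch idiom: look a class up by name in a *_choices registry and instantiate it with the matching *_conf dictionary. Below is a minimal, self-contained sketch of that idiom; the Choices registry and DummyFrontend are hypothetical stand-ins, not ESPnet's ClassChoices implementation.

from typing import Dict, Type


class Choices:
    """Hypothetical name-to-class registry mimicking the *_choices objects above."""

    def __init__(self, classes: Dict[str, Type]) -> None:
        self.classes = classes

    def get_class(self, name: str) -> Type:
        try:
            return self.classes[name]
        except KeyError:
            raise ValueError(f"unknown choice: {name!r}") from None


class DummyFrontend:
    """Stand-in feature extractor exposing output_size(), as build_model expects."""

    def __init__(self, n_mels: int = 80) -> None:
        self.n_mels = n_mels

    def output_size(self) -> int:
        return self.n_mels


frontend_choices = Choices({"default": DummyFrontend})
frontend_conf = {"n_mels": 80}
frontend = frontend_choices.get_class("default")(**frontend_conf)
assert frontend.output_size() == 80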