Example #1
    def __init__(self,
                 input_dim,
                 n_units,
                 n_layers,
                 bottleneck_dim,
                 dropout,
                 param_init=0.1):

        super(SequenceSummaryNetwork, self).__init__()

        self.n_layers = n_layers

        self.ssn = nn.ModuleList()
        self.ssn += [Linear(input_dim, n_units, bias=False, dropout=dropout)]
        for l in range(1, n_layers - 1, 1):
            self.ssn += [
                Linear(n_units,
                       bottleneck_dim if l == n_layers - 2 else n_units,
                       bias=False,
                       dropout=dropout)
            ]
        self.ssn += [
            Linear(bottleneck_dim, input_dim, bias=False, dropout=dropout)
        ]

        # Initialize parameters
        self.reset_parameters(param_init)
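
A minimal standalone sketch of the same bottleneck MLP, assuming plain nn.Linear layers and toy sizes (the custom Linear wrapper above also bundles dropout, and the ReLU between layers here is an assumption):

import torch
import torch.nn as nn

# Stack: input_dim -> n_units -> ... -> bottleneck_dim -> input_dim, as above
input_dim, n_units, n_layers, bottleneck_dim = 80, 256, 4, 100
ssn = nn.ModuleList()
ssn.append(nn.Linear(input_dim, n_units, bias=False))
for l in range(1, n_layers - 1):
    ssn.append(nn.Linear(n_units,
                         bottleneck_dim if l == n_layers - 2 else n_units,
                         bias=False))
ssn.append(nn.Linear(bottleneck_dim, input_dim, bias=False))

x = torch.randn(8, 200, input_dim)  # [batch, time, feature]
for layer in ssn:
    x = torch.relu(layer(x))
print(x.shape)  # torch.Size([8, 200, 80])
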
Example #2
    def __init__(self,
                 enc_dim,
                 dec_dim,
                 attn_type,
                 attn_dim,
                 init_r,
                 conv_out_channels=10,
                 conv_kernel_size=100):
        """Energy function."""
        super().__init__()

        self.attn_type = attn_type
        self.key = None
        self.mask = None

        self.w_key = Linear(enc_dim, attn_dim, bias=False)
        self.w_query = Linear(dec_dim, attn_dim, bias=False)
        if attn_type == 'location':
            self.w_conv = Linear(conv_out_channels, attn_dim, bias=False)
            self.conv = nn.Conv2d(in_channels=1,
                                  out_channels=conv_out_channels,
                                  kernel_size=(1, conv_kernel_size * 2 + 1),
                                  stride=1,
                                  padding=(0, conv_kernel_size),
                                  bias=False)
        else:
            assert attn_type == 'add'
        self.b = nn.Parameter(torch.Tensor(attn_dim).normal_())

        self.v = nn.utils.weight_norm(nn.Linear(attn_dim, 1))
        self.v.weight_g.data = torch.Tensor([1 / attn_dim]).sqrt()

        self.r = nn.Parameter(torch.Tensor([init_r]))
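
A quick shape check for the location-aware convolution above: with kernel (1, 2k + 1) and padding (0, k), the time axis of the previous attention weights is preserved (batch size and length below are made-up values):

import torch
import torch.nn as nn

conv_out_channels, conv_kernel_size = 10, 100
conv = nn.Conv2d(in_channels=1,
                 out_channels=conv_out_channels,
                 kernel_size=(1, conv_kernel_size * 2 + 1),
                 stride=1,
                 padding=(0, conv_kernel_size),
                 bias=False)

aw_prev = torch.rand(4, 1, 1, 500)  # previous attention weights [B, 1, 1, T]
feat = conv(aw_prev)                # [B, conv_out_channels, 1, T]
print(feat.shape)                   # torch.Size([4, 10, 1, 500])
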
Example #3
    def __init__(self,
                 key_dim,
                 query_dim,
                 attn_type,
                 attn_dim,
                 dropout=0,
                 n_heads=4):

        super(MultiheadAttentionMechanism, self).__init__()

        self.attn_type = attn_type
        assert attn_dim % n_heads == 0
        self.d_k = attn_dim // n_heads
        self.n_heads = n_heads
        self.key = None
        self.value = None
        self.mask = None

        # attention dropout applied AFTER the softmax layer
        self.attn_dropout = nn.Dropout(p=dropout)

        if attn_type == 'scaled_dot':
            self.w_key = Linear(key_dim, attn_dim, bias=False)
            self.w_value = Linear(key_dim, attn_dim, bias=False)
            self.w_query = Linear(query_dim, attn_dim, bias=False)
        elif attn_type == 'add':
            self.w_key = Linear(key_dim, attn_dim, bias=True)
            self.w_value = Linear(key_dim, attn_dim, bias=False)
            self.w_query = Linear(query_dim, attn_dim, bias=False)
            self.v = Linear(attn_dim, n_heads, bias=False)
        else:
            raise NotImplementedError(attn_type)

        self.w_out = Linear(attn_dim, key_dim)
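
For the 'scaled_dot' branch, the forward pass typically reduces to the following core computation once keys, values, and the query have been projected and split into heads (the shapes below are assumptions, and masking is omitted):

import math
import torch

B, T, n_heads, d_k = 4, 50, 4, 64
q = torch.randn(B, n_heads, 1, d_k)   # a single decoder step
k = torch.randn(B, n_heads, T, d_k)
v = torch.randn(B, n_heads, T, d_k)

e = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)  # [B, H, 1, T]
attn = torch.softmax(e, dim=-1)                            # attention weights
context = torch.matmul(attn, v)                            # [B, H, 1, d_k]
context = context.transpose(1, 2).reshape(B, 1, n_heads * d_k)
print(context.shape)  # torch.Size([4, 1, 256]) -> fed to w_out
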
Example #4
    def __init__(self, factor, n_units, n_dirs):
        super(ConcatSubsampler, self).__init__()

        self.factor = factor
        if factor > 1:
            self.proj = Linear(n_units * n_dirs * factor, n_units * n_dirs)
            self.batch_norm = nn.BatchNorm1d(n_units * n_dirs)
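
The idea behind concatenative subsampling is to fold `factor` neighbouring frames into the feature axis and project back down with the Linear above. A rough sketch with assumed shapes (the exact folding order in the toolkit may differ):

import torch

factor, n_units, n_dirs = 2, 320, 2
x = torch.randn(4, 100, n_units * n_dirs)   # [B, T, D]
B, T, D = x.shape
x = x[:, :T - T % factor]                   # drop trailing frames
x = x.reshape(B, T // factor, D * factor)   # concatenate neighbouring frames
print(x.shape)                              # torch.Size([4, 50, 1280])
# The Linear(D * factor, D) projection above then restores the feature size.
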
Example #5
    def __init__(self,
                 input_dim,
                 in_channel,
                 channels,
                 kernel_sizes,
                 dropout,
                 bottleneck_dim=0,
                 param_init=0.1):

        super(GatedConvEncoder, self).__init__()
        logger = logging.getLogger("training")

        self.in_channel = in_channel
        assert input_dim % in_channel == 0
        self.input_freq = input_dim // in_channel
        self.bridge = None

        assert len(channels) > 0
        assert len(channels) == len(kernel_sizes)

        layers = OrderedDict()
        for l in range(len(channels)):
            layers['conv%d' % l] = GLUBlock(kernel_sizes[l][0],
                                            input_dim,
                                            channels[l],
                                            weight_norm=True,
                                            dropout=0.2)
            input_dim = channels[l]

        # weight normalization + GLU for the last fully-connected layer
        self.fc_glu = Linear(input_dim, input_dim * 2, weight_norm=True)

        self._output_dim = int(input_dim)

        if bottleneck_dim > 0:
            self.bridge = Linear(self._output_dim, bottleneck_dim)
            self._output_dim = bottleneck_dim

        self.layers = nn.Sequential(layers)

        # Initialize parameters
        self.reset_parameters(param_init)
Example #6
    def __init__(self,
                 input_dim,
                 in_channel,
                 channels,
                 kernel_sizes,
                 strides,
                 poolings,
                 dropout,
                 batch_norm=False,
                 residual=False,
                 bottleneck_dim=0,
                 param_init=0.1):

        super(ConvEncoder, self).__init__()
        logger = logging.getLogger("training")

        self.in_channel = in_channel
        assert input_dim % in_channel == 0
        self.input_freq = input_dim // in_channel
        self.residual = residual
        self.bridge = None

        assert len(channels) > 0
        assert len(channels) == len(kernel_sizes) == len(strides) == len(
            poolings)

        self.layers = nn.ModuleList()
        in_ch = in_channel
        in_freq = self.input_freq
        for l in range(len(channels)):
            block = Conv2LBlock(input_dim=in_freq,
                                in_channel=in_ch,
                                out_channel=channels[l],
                                kernel_size=kernel_sizes[l],
                                stride=strides[l],
                                pooling=poolings[l],
                                dropout=dropout,
                                batch_norm=batch_norm,
                                residual=residual)
            self.layers += [block]
            in_freq = block.input_dim
            in_ch = channels[l]

        self._output_dim = int(in_ch * in_freq)

        if bottleneck_dim > 0:
            self.bridge = Linear(self._output_dim, bottleneck_dim)
            self._output_dim = bottleneck_dim

        # Initialize parameters
        self.reset_parameters(param_init)
Example #7
    def __init__(self,
                 eos,
                 blank,
                 enc_n_units,
                 vocab,
                 dropout=0.0,
                 lsm_prob=0.0,
                 fc_list=[],
                 param_init=0.1):

        super(CTC, self).__init__()
        logger = logging.getLogger('training')

        self.eos = eos
        self.blank = blank
        self.vocab = vocab
        self.lsm_prob = lsm_prob

        self.space = -1
        # TODO(hirofumi): fix later

        # Fully-connected layers before the softmax
        if len(fc_list) > 0:
            fc_layers = OrderedDict()
            for i in range(len(fc_list)):
                input_dim = enc_n_units if i == 0 else fc_list[i - 1]
                fc_layers['fc' + str(i)] = Linear(input_dim,
                                                  fc_list[i],
                                                  dropout=dropout)
            fc_layers['fc' + str(len(fc_list))] = Linear(fc_list[-1], vocab)
            self.output = nn.Sequential(fc_layers)
        else:
            self.output = Linear(enc_n_units, vocab)

        import warpctc_pytorch
        self.warpctc_loss = warpctc_pytorch.CTCLoss(size_average=True)
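
The snippet binds warpctc_pytorch; for reference, recent PyTorch versions ship a built-in CTC loss with a similar role. A minimal sketch with toy shapes (the blank index must match the model's `blank`):

import torch
import torch.nn as nn

ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)

B, T, vocab, U = 4, 120, 50, 20
log_probs = torch.randn(T, B, vocab).log_softmax(-1)  # [T, B, vocab]
targets = torch.randint(1, vocab, (B, U))             # label ids (no blanks)
in_lens = torch.full((B,), T, dtype=torch.long)
tgt_lens = torch.full((B,), U, dtype=torch.long)
loss = ctc_loss(log_probs, targets, in_lens, tgt_lens)
print(loss.item())
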
Example #8
    def __init__(self,
                 input_dim,
                 in_channel,
                 channels,
                 kernel_sizes,
                 dropout,
                 bottleneck_dim=0):

        super(TDSEncoder, self).__init__()
        logger = logging.getLogger("training")

        self.in_channel = in_channel
        assert input_dim % in_channel == 0
        self.input_freq = input_dim // in_channel
        self.bridge = None

        assert len(channels) > 0
        assert len(channels) == len(kernel_sizes)

        layers = OrderedDict()
        in_ch = in_channel
        in_freq = self.input_freq
        for l in range(len(channels)):
            # subsample
            if in_ch != channels[l]:
                layers['subsample%d' % l] = SubsampelBlock(in_channel=in_ch,
                                                           out_channel=channels[l],
                                                           in_freq=in_freq,
                                                           dropout=dropout)

            # Conv
            layers['tds%d_block%d' % (channels[l], l)] = TDSBlock(channel=channels[l],
                                                                  kernel_size=kernel_sizes[l][0],
                                                                  in_freq=in_freq,
                                                                  dropout=dropout)

            in_ch = channels[l]

        self._output_dim = int(in_ch * in_freq)

        if bottleneck_dim > 0:
            self.bridge = Linear(self._output_dim, bottleneck_dim)
            self._output_dim = bottleneck_dim

        self.layers = nn.Sequential(layers)

        # Initialize parameters
        self.reset_parameters()
Example #9
    def __init__(self,
                 enc_dim,
                 conv_out_channels,
                 conv_kernel_size,
                 threshold=0.9):
        super(CIF, self).__init__()

        self.threshold = threshold
        self.channel = conv_out_channels
        self.n_heads = 1

        self.conv = nn.Conv1d(in_channels=enc_dim,
                              out_channels=conv_out_channels,
                              kernel_size=conv_kernel_size * 2 + 1,
                              stride=1,
                              padding=conv_kernel_size)
        self.proj = Linear(conv_out_channels, 1)
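
Conceptually, CIF squashes the conv/proj output into a per-frame weight and fires a token boundary whenever the accumulated weight crosses the threshold. A simplified integrate-and-fire sketch (the real implementation also splits the boundary frame's weight between adjacent tokens):

import torch

alpha = torch.tensor([0.3, 0.4, 0.5, 0.2, 0.8, 0.1])  # per-frame weights in (0, 1)
threshold = 0.9
acc, fired = 0.0, []
for t, a in enumerate(alpha.tolist()):
    acc += a
    if acc >= threshold:      # fire: emit one token boundary here
        fired.append(t)
        acc -= threshold      # carry the remainder over to the next token
print(fired)  # [2, 4]
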
Example #10
    def __init__(self, key_dim, query_dim, attn_dim, window, init_r=-4):
        """Monotonic chunk-wise attention.

            "Monotonic Chunkwise Attention" (ICLR 2018)
            https://openreview.net/forum?id=Hko85plCW

            if window == 1, this is equivalent to Hard monotonic attention
                "Online and Linear-Time Attention by Enforcing Monotonic Alignment" (ICML 2017)
                 http://arxiv.org/abs/1704.00784

        Args:
            key_dim (int): dimension of the key
            query_dim (int): dimension of the query
            attn_dim (int): dimension of the attention layer
            window (int): chunk size
            init_r (int): initial value for the parameter 'r' used in monotonic/chunk attention

        """
        super(MoChA, self).__init__()

        self.window = window
        self.n_heads = 1

        # Monotonic energy
        self.w_key_mono = Linear(key_dim, attn_dim, bias=True)
        self.w_query_mono = Linear(query_dim, attn_dim, bias=False)
        self.v_mono = Linear(attn_dim, 1, bias=False, weight_norm=True)
        self.r_mono = nn.Parameter(torch.Tensor([init_r]))

        # Chunk energy
        if window > 1:
            self.w_key_chunk = Linear(key_dim, attn_dim, bias=True)
            self.w_query_chunk = Linear(query_dim, attn_dim, bias=False)
            self.v_chunk = Linear(attn_dim, 1, bias=False, weight_norm=True)
            self.r_chunk = nn.Parameter(torch.Tensor([init_r]))

        # initialization
        self.v_mono.fc.weight_g.data = torch.Tensor([1 / attn_dim]).sqrt()
        if window > 1:
            self.v_chunk.fc.weight_g.data = torch.Tensor([1 / attn_dim]).sqrt()
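
In functional form, the monotonic energy above is e_ij = v^T tanh(W_key k_j + W_query q_i) + r, and sigmoid(e_ij) gives the selection probability; the negative init_r pushes those probabilities toward 0 early in training. A standalone sketch with assumed shapes:

import torch
import torch.nn as nn

key_dim, query_dim, attn_dim, init_r = 320, 512, 512, -4
w_key = nn.Linear(key_dim, attn_dim, bias=True)
w_query = nn.Linear(query_dim, attn_dim, bias=False)
v = nn.utils.weight_norm(nn.Linear(attn_dim, 1, bias=False))
r = torch.tensor([float(init_r)])

k = torch.randn(4, 50, key_dim)    # encoder keys  [B, T, key_dim]
q = torch.randn(4, 1, query_dim)   # decoder query [B, 1, query_dim]
e = v(torch.tanh(w_key(k) + w_query(q))).squeeze(-1) + r  # [B, T]
p_choose = torch.sigmoid(e)        # monotonic selection probabilities
print(p_choose.shape)              # torch.Size([4, 50])
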
Example #11
    def __init__(self,
                 eos,
                 unk,
                 pad,
                 blank,
                 enc_n_units,
                 rnn_type,
                 n_units,
                 n_projs,
                 n_layers,
                 bottleneck_dim,
                 emb_dim,
                 vocab,
                 tie_embedding=False,
                 attn_conv_kernel_size=0,
                 dropout=0.0,
                 dropout_emb=0.0,
                 lsm_prob=0.0,
                 ctc_weight=0.0,
                 ctc_lsm_prob=0.0,
                 ctc_fc_list=[],
                 backward=False,
                 lm_fusion=None,
                 lm_fusion_type='cold',
                 discourse_aware='',
                 lm_init=None,
                 global_weight=1.0,
                 mtl_per_batch=False,
                 param_init=0.1,
                 replace_sos=False,
                 soft_label_weight=0.0):

        super(CIFRNNDecoder, self).__init__()
        logger = logging.getLogger('training')

        self.eos = eos
        self.unk = unk
        self.pad = pad
        self.blank = blank
        self.vocab = vocab
        self.rnn_type = rnn_type
        assert rnn_type in ['lstm', 'gru']
        self.enc_n_units = enc_n_units
        self.dec_n_units = n_units
        self.n_projs = n_projs
        self.n_layers = n_layers
        self.lsm_prob = lsm_prob
        self.ctc_weight = ctc_weight
        self.bwd = backward
        self.lm_fusion_type = lm_fusion_type
        self.global_weight = global_weight
        self.mtl_per_batch = mtl_per_batch
        self.replace_sos = replace_sos
        self.soft_label_weight = soft_label_weight

        self.quantity_loss_weight = 1.0

        # for contextualization
        self.discourse_aware = discourse_aware
        self.dstate_prev = None

        # for cache
        self.prev_spk = ''
        self.total_step = 0
        self.dstates_final = None
        self.lmstate_final = None

        if ctc_weight > 0:
            self.ctc = CTC(eos=eos,
                           blank=blank,
                           enc_n_units=enc_n_units,
                           vocab=vocab,
                           dropout=dropout,
                           lsm_prob=ctc_lsm_prob,
                           fc_list=ctc_fc_list,
                           param_init=param_init)

        if ctc_weight < global_weight:
            # Attention layer
            self.score = CIF(enc_dim=self.enc_n_units,
                             conv_kernel_size=attn_conv_kernel_size,
                             conv_out_channels=self.enc_n_units)

            # Decoder
            self.rnn = nn.ModuleList()
            if self.n_projs > 0:
                self.proj = nn.ModuleList(
                    [Linear(n_units, n_projs) for _ in range(n_layers)])
            self.dropout = nn.ModuleList(
                [nn.Dropout(p=dropout) for _ in range(n_layers)])
            rnn = nn.LSTM if rnn_type == 'lstm' else nn.GRU
            dec_odim = enc_n_units + emb_dim
            for l in range(n_layers):
                self.rnn += [rnn(dec_odim, n_units, 1)]
                dec_odim = n_units
                if self.n_projs > 0:
                    dec_odim = n_projs

            # LM fusion
            if lm_fusion is not None:
                self.linear_dec_feat = Linear(dec_odim + enc_n_units, n_units)
                if lm_fusion_type in ['cold', 'deep']:
                    self.linear_lm_feat = Linear(lm_fusion.n_units, n_units)
                    self.linear_lm_gate = Linear(n_units * 2, n_units)
                elif lm_fusion_type == 'cold_prob':
                    self.linear_lm_feat = Linear(lm_fusion.vocab, n_units)
                    self.linear_lm_gate = Linear(n_units * 2, n_units)
                else:
                    raise ValueError(lm_fusion_type)
                self.output_bn = Linear(n_units * 2, bottleneck_dim)

                # fix LM parameters
                for p in lm_fusion.parameters():
                    p.requires_grad = False
            elif discourse_aware == 'hierarchical':
                raise NotImplementedError
            else:
                self.output_bn = Linear(dec_odim + enc_n_units, bottleneck_dim)

            self.embed = Embedding(vocab,
                                   emb_dim,
                                   dropout=dropout_emb,
                                   ignore_index=pad)

            self.output = Linear(bottleneck_dim, vocab)
            # NOTE: include bias even when tying weights

            # Optionally tie weights as in:
            # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
            # https://arxiv.org/abs/1608.05859
            # and
            # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
            # https://arxiv.org/abs/1611.01462
            if tie_embedding:
                if emb_dim != bottleneck_dim:
                    raise ValueError(
                        'When using the tied flag, bottleneck_dim must be equal to emb_dim.'
                    )
                self.output.fc.weight = self.embed.embed.weight

        # Initialize parameters
        self.reset_parameters(param_init)

        # register the external LM
        self.lm = lm_fusion

        # decoder initialization with pre-trained LM
        if lm_init is not None:
            assert lm_init.vocab == vocab
            assert lm_init.n_units == n_units
            assert lm_init.emb_dim == emb_dim
            logger.info('===== Initialize the decoder with pre-trained RNNLM')
            assert lm_init.n_projs == 0  # TODO(hirofumi): fix later
            assert lm_init.n_units_null_context == enc_n_units

            # RNN
            for l in range(lm_init.n_layers):
                for n, p in lm_init.rnn[l].named_parameters():
                    assert getattr(self.rnn[l], n).size() == p.size()
                    getattr(self.rnn[l], n).data = p.data
                    logger.info('Overwrite %s' % n)

            # embedding
            assert self.embed.embed.weight.size(
            ) == lm_init.embed.embed.weight.size()
            self.embed.embed.weight.data = lm_init.embed.embed.weight.data
            logger.info('Overwrite %s' % 'embed.embed.weight')
Example #12
    def __init__(self, args, save_path=None):

        super(LMBase, self).__init__()
        logger = logging.getLogger('training')
        logger.info(self.__class__.__name__)

        self.save_path = save_path

        self.emb_dim = args.emb_dim
        self.rnn_type = args.lm_type
        assert args.lm_type in ['lstm', 'gru']
        self.n_units = args.n_units
        self.n_projs = args.n_projs
        self.n_layers = args.n_layers
        self.residual = args.residual
        self.use_glu = args.use_glu
        self.n_units_cv = args.n_units_null_context
        self.lsm_prob = args.lsm_prob

        self.vocab = args.vocab
        self.eos = 2
        self.pad = 3
        # NOTE: reserved in advance

        # for cache
        self.cache_theta = 0.2  # smoothing parameter
        self.cache_lambda = 0.2  # cache weight
        self.cache_ids = []
        self.cache_keys = []
        self.cache_attn = []

        self.embed = Embedding(vocab=self.vocab,
                               emb_dim=args.emb_dim,
                               dropout=args.dropout_in,
                               ignore_index=self.pad)

        rnn = nn.LSTM if args.lm_type == 'lstm' else nn.GRU
        self.rnn = nn.ModuleList()
        self.dropout = nn.ModuleList(
            [nn.Dropout(p=args.dropout_hidden) for _ in range(args.n_layers)])
        if args.n_projs > 0:
            self.proj = nn.ModuleList([
                Linear(args.n_units, args.n_projs)
                for _ in range(args.n_layers)
            ])
        rnn_idim = args.emb_dim + args.n_units_null_context
        for l in range(args.n_layers):
            self.rnn += [
                rnn(rnn_idim,
                    args.n_units,
                    1,
                    bias=True,
                    batch_first=True,
                    dropout=0,
                    bidirectional=False)
            ]
            rnn_idim = args.n_units
            if args.n_projs > 0:
                rnn_idim = args.n_projs

        if self.use_glu:
            self.fc_glu = Linear(rnn_idim,
                                 rnn_idim * 2,
                                 dropout=args.dropout_hidden)

        if args.adaptive_softmax:
            self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                rnn_idim,
                self.vocab,
                # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
                cutoffs=[self.vocab // 25, self.vocab // 5],
                div_value=4.0)
            self.output = None
        else:
            self.adaptive_softmax = None
            self.output = Linear(rnn_idim,
                                 self.vocab,
                                 dropout=args.dropout_out)
            # NOTE: include bias even when tying weights

            # Optionally tie weights as in:
            # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
            # https://arxiv.org/abs/1608.05859
            # and
            # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
            # https://arxiv.org/abs/1611.01462
            if args.tie_embedding:
                if args.n_units != args.emb_dim:
                    raise ValueError(
                        'When using the tied flag, n_units must be equal to emb_dim.'
                    )
                self.output.fc.weight = self.embed.embed.weight

        # Initialize parameters
        self.reset_parameters(args.param_init)

        # Recurrent weights are orthogonalized
        if args.rec_weight_orthogonal:
            self.reset_parameters(args.param_init,
                                  dist='orthogonal',
                                  keys=['rnn', 'weight'])
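
The weight-tying branch above amounts to the following in plain PyTorch (toy sizes; the toolkit's Embedding and Linear wrappers expose the underlying modules as `.embed` and `.fc`):

import torch
import torch.nn as nn

vocab, emb_dim = 10000, 512
embed = nn.Embedding(vocab, emb_dim)
output = nn.Linear(emb_dim, vocab)   # bias is kept even when tying
output.weight = embed.weight         # share the [vocab, emb_dim] matrix

tokens = torch.randint(0, vocab, (4, 20))
logits = output(embed(tokens))       # [4, 20, vocab]
print(logits.shape)
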
Example #13
    def __init__(self, args, save_path=None):

        super(LMBase, self).__init__()
        logger = logging.getLogger('training')
        logger.info(self.__class__.__name__)

        self.save_path = save_path

        self.emb_dim = args.emb_dim
        self.n_units = args.n_units
        self.n_layers = args.n_layers
        self.lsm_prob = args.lsm_prob

        self.vocab = args.vocab
        self.eos = 2
        self.pad = 3
        # NOTE: reserved in advance

        # for cache
        self.cache_theta = 0.2  # smoothing parameter
        self.cache_lambda = 0.2  # cache weight
        self.cache_ids = []
        self.cache_keys = []
        self.cache_attn = []

        self.embed = Embedding(vocab=self.vocab,
                               emb_dim=args.emb_dim,
                               dropout=args.dropout_in,
                               ignore_index=self.pad)

        model_size = args.lm_type.replace('gated_conv_', '')

        blocks = OrderedDict()
        if model_size == 'custom':
            blocks['conv1'] = GLUBlock(args.kernel_size,
                                       args.emb_dim,
                                       args.n_units,
                                       bottlececk_dim=args.n_projs,
                                       dropout=args.dropout_hidden)
            for l in range(args.n_layers - 1):
                blocks['conv%d' % (l + 2)] = GLUBlock(
                    args.kernel_size,
                    args.n_units,
                    args.n_units,
                    bottlececk_dim=args.n_projs,
                    dropout=args.dropout_hidden)
            last_dim = args.n_units

        elif model_size == '8':
            blocks['conv1'] = GLUBlock(4,
                                       args.emb_dim,
                                       900,
                                       dropout=args.dropout_hidden)
            for i in range(1, 8, 1):
                blocks['conv2-%d' % i] = GLUBlock(4,
                                                  900,
                                                  900,
                                                  dropout=args.dropout_hidden)
            last_dim = 900

        elif model_size == '8B':
            blocks['conv1'] = GLUBlock(1,
                                       args.emb_dim,
                                       512,
                                       dropout=args.dropout_hidden)
            for i in range(1, 4, 1):
                blocks['conv2-%d' % i] = GLUBlock(5,
                                                  512,
                                                  512,
                                                  bottlececk_dim=128,
                                                  dropout=args.dropout_hidden)
            for i in range(1, 4, 1):
                blocks['conv3-%d' % i] = GLUBlock(5,
                                                  512,
                                                  512,
                                                  bottlececk_dim=256,
                                                  dropout=args.dropout_hidden)
            blocks['conv4'] = GLUBlock(1,
                                       512,
                                       2048,
                                       bottlececk_dim=1024,
                                       dropout=args.dropout_hidden)
            last_dim = 2048

        elif model_size == '9':
            blocks['conv1'] = GLUBlock(4,
                                       args.emb_dim,
                                       807,
                                       dropout=args.dropout_hidden)
            for i in range(1, 4, 1):
                blocks['conv2-%d-1' % i] = GLUBlock(
                    4, 807, 807, dropout=args.dropout_hidden)
                blocks['conv2-%d-2' % i] = GLUBlock(
                    4, 807, 807, dropout=args.dropout_hidden)
            last_dim = 807

        elif model_size == '13':
            blocks['conv1'] = GLUBlock(4,
                                       args.emb_dim,
                                       1268,
                                       dropout=args.dropout_hidden)
            for i in range(1, 13, 1):
                blocks['conv2-%d' % i] = GLUBlock(4,
                                                  1268,
                                                  1268,
                                                  dropout=args.dropout_hidden)
            last_dim = 1268

        elif model_size == '14':
            for i in range(1, 4, 1):
                blocks['conv1-%d' % i] = GLUBlock(
                    6,
                    args.emb_dim if i == 1 else 850,
                    850,
                    dropout=args.dropout_hidden)
            blocks['conv2'] = GLUBlock(1,
                                       850,
                                       850,
                                       dropout=args.dropout_hidden)
            for i in range(1, 5, 1):
                blocks['conv3-%d' % i] = GLUBlock(5,
                                                  850,
                                                  850,
                                                  dropout=args.dropout_hidden)
            blocks['conv4'] = GLUBlock(1,
                                       850,
                                       850,
                                       dropout=args.dropout_hidden)
            for i in range(1, 4, 1):
                blocks['conv5-%d' % i] = GLUBlock(4,
                                                  850,
                                                  850,
                                                  dropout=args.dropout_hidden)
            blocks['conv6'] = GLUBlock(4,
                                       850,
                                       1024,
                                       dropout=args.dropout_hidden)
            blocks['conv7'] = GLUBlock(4,
                                       1024,
                                       2048,
                                       dropout=args.dropout_hidden)
            last_dim = 2048

        elif model_size == '14B':
            blocks['conv1'] = GLUBlock(5,
                                       args.emb_dim,
                                       512,
                                       dropout=args.dropout_hidden)
            for i in range(1, 4, 1):
                blocks['conv2-%d' % i] = GLUBlock(5,
                                                  512,
                                                  512,
                                                  bottlececk_dim=128,
                                                  dropout=args.dropout_hidden)
            for i in range(1, 4, 1):
                blocks['conv3-%d' % i] = GLUBlock(5,
                                                  512 if i == 1 else 1024,
                                                  1024,
                                                  bottlececk_dim=512,
                                                  dropout=args.dropout_hidden)
            for i in range(1, 7, 1):
                blocks['conv4-%d' % i] = GLUBlock(5,
                                                  1024 if i == 1 else 2048,
                                                  2048,
                                                  bottlececk_dim=1024,
                                                  dropout=args.dropout_hidden)
            blocks['conv5'] = GLUBlock(5,
                                       2048,
                                       4096,
                                       bottlececk_dim=1024,
                                       dropout=args.dropout_hidden)
            last_dim = 4096

        else:
            raise NotImplementedError(model_size)

        self.blocks = nn.Sequential(blocks)

        if args.adaptive_softmax:
            self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                last_dim,
                self.vocab,
                # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
                cutoffs=[self.vocab // 25, self.vocab // 5],
                div_value=4.0)
            self.output = None
        else:
            self.adaptive_softmax = None
            self.output = Linear(last_dim,
                                 self.vocab,
                                 dropout=args.dropout_out)
            # NOTE: include bias even when tying weights

            # Optionally tie weights as in:
            # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
            # https://arxiv.org/abs/1608.05859
            # and
            # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
            # https://arxiv.org/abs/1611.01462
            if args.tie_embedding:
                if args.n_units != args.emb_dim:
                    raise ValueError(
                        'When using the tied flag, n_units must be equal to emb_dim.'
                    )
                self.output.fc.weight = self.embed.embed.weight

        # Initialize parameters
        self.reset_parameters(args.param_init)
Example #14
    def __init__(self, args, save_path=None):

        super(LMBase, self).__init__()
        logger = logging.getLogger('training')
        logger.info(self.__class__.__name__)

        self.save_path = save_path

        self.d_model = args.d_model
        self.d_ff = args.d_ff
        self.pe_type = args.pe_type
        self.n_layers = args.n_layers
        self.attn_n_heads = args.attn_n_heads
        self.lsm_prob = args.lsm_prob

        self.vocab = args.vocab
        self.eos = 2
        self.pad = 3
        # NOTE: reserved in advance

        # self.lsm_prob = lsm_prob

        # for cache
        self.cache_theta = 0.2  # smoothing parameter
        self.cache_lambda = 0.2  # cache weight
        self.cache_ids = []
        self.cache_keys = []
        self.cache_attn = []

        self.embed = Embedding(
            vocab=self.vocab,
            emb_dim=self.d_model,
            dropout=0,  # NOTE: do not apply dropout here
            ignore_index=self.pad)
        self.pos_enc = PositionalEncoding(args.d_model, args.dropout_in,
                                          args.pe_type)

        self.layers = nn.ModuleList([
            TransformerDecoderBlock(args.d_model,
                                    args.d_ff,
                                    args.attn_type,
                                    args.attn_n_heads,
                                    args.dropout_hidden,
                                    args.dropout_att,
                                    args.layer_norm_eps,
                                    src_attention=False)
            for _ in range(self.n_layers)
        ])
        self.norm_out = nn.LayerNorm(args.d_model, eps=args.layer_norm_eps)

        if args.adaptive_softmax:
            self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                args.d_model,
                self.vocab,
                cutoffs=[round(self.vocab / 15), 3 * round(self.vocab / 15)],
                # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
                div_value=4.0)
            self.output = None
        else:
            self.adaptive_softmax = None
            self.output = Linear(self.d_model,
                                 self.vocab,
                                 dropout=args.dropout_out)

            # Optionally tie weights as in:
            # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
            # https://arxiv.org/abs/1608.05859
            # and
            # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
            # https://arxiv.org/abs/1611.01462
            if args.tie_embedding:
                self.output.fc.weight = self.embed.embed.weight

        # Initialize parameters
        self.reset_parameters()
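
nn.AdaptiveLogSoftmaxWithLoss consumes flattened hidden states and target ids and returns both the per-target log-probabilities and the mean negative log-likelihood. A sketch of its training-time use with made-up sizes:

import torch
import torch.nn as nn

d_model, vocab = 256, 10000
asm = nn.AdaptiveLogSoftmaxWithLoss(d_model,
                                    vocab,
                                    cutoffs=[round(vocab / 15), 3 * round(vocab / 15)],
                                    div_value=4.0)

hidden = torch.randn(4 * 20, d_model)          # flattened [B * T, d_model]
targets = torch.randint(0, vocab, (4 * 20,))   # flattened token ids
out = asm(hidden, targets)
print(out.output.shape, out.loss.item())       # log p(target) per token, mean NLL
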
Example #15
    def __init__(self,
                 eos,
                 unk,
                 pad,
                 blank,
                 enc_n_units,
                 attn_type,
                 attn_n_heads,
                 n_layers,
                 d_model,
                 d_ff,
                 vocab,
                 tie_embedding=False,
                 pe_type='add',
                 layer_norm_eps=1e-12,
                 dropout=0.0,
                 dropout_emb=0.0,
                 dropout_att=0.0,
                 lsm_prob=0.0,
                 focal_loss_weight=0.0,
                 focal_loss_gamma=2.0,
                 ctc_weight=0.0,
                 ctc_lsm_prob=0.0,
                 ctc_fc_list=[],
                 backward=False,
                 global_weight=1.0,
                 mtl_per_batch=False,
                 adaptive_softmax=False):

        super(TransformerDecoder, self).__init__()
        logger = logging.getLogger('training')

        self.eos = eos
        self.unk = unk
        self.pad = pad
        self.blank = blank
        self.enc_n_units = enc_n_units
        self.d_model = d_model
        self.n_layers = n_layers
        self.attn_n_heads = attn_n_heads
        self.pe_type = pe_type
        self.lsm_prob = lsm_prob
        self.focal_loss_weight = focal_loss_weight
        self.focal_loss_gamma = focal_loss_gamma
        self.ctc_weight = ctc_weight
        self.bwd = backward
        self.global_weight = global_weight
        self.mtl_per_batch = mtl_per_batch

        if ctc_weight > 0:
            self.ctc = CTC(eos=eos,
                           blank=blank,
                           enc_n_units=enc_n_units,
                           vocab=vocab,
                           dropout=dropout,
                           lsm_prob=ctc_lsm_prob,
                           fc_list=ctc_fc_list,
                           param_init=0.1)

        if ctc_weight < global_weight:
            self.embed = Embedding(
                vocab,
                d_model,
                dropout=0,  # NOTE: do not apply dropout here
                ignore_index=pad)
            self.pos_enc = PositionalEncoding(d_model, dropout_emb, pe_type)
            self.layers = nn.ModuleList([
                TransformerDecoderBlock(d_model, d_ff, attn_type, attn_n_heads,
                                        dropout, dropout_att, layer_norm_eps)
                for _ in range(n_layers)
            ])
            self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

            if adaptive_softmax:
                self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                    d_model,
                    vocab,
                    cutoffs=[
                        round(self.vocab / 15), 3 * round(self.vocab / 15)
                    ],
                    # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
                    div_value=4.0)
                self.output = None
            else:
                self.adaptive_softmax = None
                self.output = Linear(d_model, vocab)

                # Optionally tie weights as in:
                # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
                # https://arxiv.org/abs/1608.05859
                # and
                # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
                # https://arxiv.org/abs/1611.01462
                if tie_embedding:
                    self.output.fc.weight = self.embed.embed.weight

        # Initialize parameters
        self.reset_parameters()
Example #16
    def __init__(self,
                 eos,
                 unk,
                 pad,
                 blank,
                 enc_n_units,
                 rnn_type,
                 n_units,
                 n_projs,
                 n_layers,
                 residual,
                 bottleneck_dim,
                 emb_dim,
                 vocab,
                 tie_embedding=False,
                 dropout=0.0,
                 dropout_emb=0.0,
                 lsm_prob=0.0,
                 ctc_weight=0.0,
                 ctc_lsm_prob=0.0,
                 ctc_fc_list=[],
                 lm_init=None,
                 lmobj_weight=0.0,
                 share_lm_softmax=False,
                 global_weight=1.0,
                 mtl_per_batch=False,
                 param_init=0.1,
                 start_pointing=False,
                 end_pointing=True):

        super(RNNTransducer, self).__init__()
        logger = logging.getLogger('training')

        self.eos = eos
        self.unk = unk
        self.pad = pad
        self.blank = blank
        self.vocab = vocab
        self.rnn_type = rnn_type
        assert rnn_type in ['lstm_transducer', 'gru_transducer']
        self.enc_n_units = enc_n_units
        self.dec_n_units = n_units
        self.n_projs = n_projs
        self.n_layers = n_layers
        self.residual = residual
        self.lsm_prob = lsm_prob
        self.ctc_weight = ctc_weight
        self.lmobj_weight = lmobj_weight
        self.share_lm_softmax = share_lm_softmax
        self.global_weight = global_weight
        self.mtl_per_batch = mtl_per_batch

        # VAD
        self.start_pointing = start_pointing
        self.end_pointing = end_pointing

        # for cache
        self.prev_spk = ''
        self.lmstate_final = None
        self.state_cache = OrderedDict()

        if ctc_weight > 0:
            self.ctc = CTC(eos=eos,
                           blank=blank,
                           enc_n_units=enc_n_units,
                           vocab=vocab,
                           dropout=dropout,
                           lsm_prob=ctc_lsm_prob,
                           fc_list=ctc_fc_list,
                           param_init=param_init)

        if ctc_weight < global_weight:
            import warprnnt_pytorch
            self.warprnnt_loss = warprnnt_pytorch.RNNTLoss()

            # for MTL with LM objective
            if lmobj_weight > 0:
                if share_lm_softmax:
                    self.output_lmobj = self.output  # share parameters
                else:
                    self.output_lmobj = Linear(n_units, vocab)

            # Prediction network
            self.fast_impl = False
            rnn = nn.LSTM if rnn_type == 'lstm_transducer' else nn.GRU
            if n_projs == 0 and not residual:
                self.fast_impl = True
                self.rnn = rnn(emb_dim, n_units, n_layers,
                               bias=True,
                               batch_first=True,
                               dropout=dropout,
                               bidirectional=False)
                # NOTE: pytorch introduces a dropout layer on the outputs of each layer EXCEPT the last layer
                dec_idim = n_units
                self.dropout_top = nn.Dropout(p=dropout)
            else:
                self.rnn = nn.ModuleList()
                self.dropout = nn.ModuleList([nn.Dropout(p=dropout) for _ in range(n_layers)])
                if n_projs > 0:
                    self.proj = nn.ModuleList([Linear(n_units, n_projs) for _ in range(n_layers)])
                dec_idim = emb_dim
                for l in range(n_layers):
                    self.rnn += [rnn(dec_idim, n_units, 1,
                                     bias=True,
                                     batch_first=True,
                                     dropout=0,
                                     bidirectional=False)]
                    dec_idim = n_projs if n_projs > 0 else n_units

            self.embed = Embedding(vocab, emb_dim,
                                   dropout=dropout_emb,
                                   ignore_index=pad)

            self.w_enc = Linear(enc_n_units, bottleneck_dim, bias=True)
            self.w_dec = Linear(dec_idim, bottleneck_dim, bias=False)
            self.output = Linear(bottleneck_dim, vocab)

        # Initialize parameters
        self.reset_parameters(param_init)

        # prediction network initialization with pre-trained LM
        if lm_init is not None:
            assert lm_init.vocab == vocab
            assert lm_init.n_units == n_units
            assert lm_init.n_projs == n_projs
            assert lm_init.n_layers == n_layers
            assert lm_init.residual == residual

            param_dict = dict(lm_init.named_parameters())
            for n, p in self.named_parameters():
                if n in param_dict.keys() and p.size() == param_dict[n].size():
                    if 'output' in n:
                        continue
                    p.data = param_dict[n].data
                    logger.info('Overwrite %s' % n)
Example #17
    def __init__(self,
                 input_dim,
                 rnn_type,
                 n_units,
                 n_projs,
                 n_layers,
                 dropout_in,
                 dropout,
                 subsample,
                 subsample_type='drop',
                 n_stacks=1,
                 n_splices=1,
                 last_proj_dim=0,
                 conv_in_channel=1,
                 conv_channels=0,
                 conv_kernel_sizes=[],
                 conv_strides=[],
                 conv_poolings=[],
                 conv_batch_norm=False,
                 conv_residual=False,
                 conv_bottleneck_dim=0,
                 residual=False,
                 n_layers_sub1=0,
                 n_layers_sub2=0,
                 nin=False,
                 task_specific_layer=False,
                 param_init=0.1):

        super(RNNEncoder, self).__init__()
        logger = logging.getLogger("training")

        if len(subsample) > 0 and len(subsample) != n_layers:
            raise ValueError('subsample must be the same size as n_layers.')
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 and n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 and n_layers_sub1.')

        self.rnn_type = rnn_type
        self.bidirectional = True if rnn_type in ['blstm', 'bgru', 'conv_blstm', 'conv_bgru'] else False
        self.n_units = n_units
        self.n_dirs = 2 if self.bidirectional else 1
        self.n_layers = n_layers

        # Setting for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # Setting for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # Setting for residual connections
        self.residual = residual
        if residual:
            assert np.prod(subsample) == 1

        # Dropout for input-hidden connection
        self.dropout_in = nn.Dropout(p=dropout_in)

        # Setting for CNNs before RNNs
        if conv_channels and rnn_type not in ['blstm', 'lstm', 'bgru', 'gru']:
            channels = [int(c) for c in conv_channels.split('_')] if len(conv_channels) > 0 else []
            kernel_sizes = [[int(c.split(',')[0].replace('(', '')), int(c.split(',')[1].replace(')', ''))]
                            for c in conv_kernel_sizes.split('_')] if len(conv_kernel_sizes) > 0 else []
            if rnn_type in ['tds', 'gated_conv']:
                strides = []
                poolings = []
            else:
                strides = [[int(c.split(',')[0].replace('(', '')), int(c.split(',')[1].replace(')', ''))]
                           for c in conv_strides.split('_')] if len(conv_strides) > 0 else []
                poolings = [[int(c.split(',')[0].replace('(', '')), int(c.split(',')[1].replace(')', ''))]
                            for c in conv_poolings.split('_')] if len(conv_poolings) > 0 else []
            if 'conv_' in rnn_type:
                subsample = [1] * self.n_layers
                logger.warning('Subsampling is automatically ignored because CNN layers are used before RNN layers.')
        else:
            channels = []
            kernel_sizes = []
            strides = []
            poolings = []

        if len(channels) > 0:
            if rnn_type == 'tds':
                self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                       in_channel=conv_in_channel,
                                       channels=channels,
                                       kernel_sizes=kernel_sizes,
                                       dropout=dropout,
                                       bottleneck_dim=last_proj_dim)
            elif rnn_type == 'gated_conv':
                self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                             in_channel=conv_in_channel,
                                             channels=channels,
                                             kernel_sizes=kernel_sizes,
                                             dropout=dropout,
                                             bottleneck_dim=last_proj_dim,
                                             param_init=param_init)
            else:
                assert n_stacks == 1 and n_splices == 1
                self.conv = ConvEncoder(input_dim,
                                        in_channel=conv_in_channel,
                                        channels=channels,
                                        kernel_sizes=kernel_sizes,
                                        strides=strides,
                                        poolings=poolings,
                                        dropout=0,
                                        batch_norm=conv_batch_norm,
                                        residual=conv_residual,
                                        bottleneck_dim=conv_bottleneck_dim,
                                        param_init=param_init)
            self._output_dim = self.conv.output_dim
        else:
            self._output_dim = input_dim * n_splices * n_stacks
            self.conv = None

        self.padding = Padding()

        if rnn_type not in ['conv', 'tds', 'gated_conv']:
            # Fast implementation without processes between each layer
            self.fast_impl = False
            if np.prod(subsample) == 1 and n_projs == 0 and not residual and n_layers_sub1 == 0 and not nin:
                self.fast_impl = True
                if 'lstm' in rnn_type:
                    rnn = nn.LSTM
                elif 'gru' in rnn_type:
                    rnn = nn.GRU
                else:
                    raise ValueError('rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')

                self.rnn = rnn(self._output_dim, n_units, n_layers,
                               bias=True, batch_first=True, dropout=dropout,
                               bidirectional=self.bidirectional)
                # NOTE: pytorch introduces a dropout layer on the outputs of each layer EXCEPT the last layer
                self._output_dim = n_units * self.n_dirs
                self.dropout_top = nn.Dropout(p=dropout)
            else:
                self.rnn = nn.ModuleList()
                self.dropout = nn.ModuleList()
                self.proj = None
                if n_projs > 0:
                    self.proj = nn.ModuleList()

                # subsample
                self.subsample = None
                if subsample_type == 'max_pool' and np.prod(subsample) > 1:
                    self.subsample = nn.ModuleList([MaxpoolSubsampler(subsample[l])
                                                    for l in range(n_layers)])
                elif subsample_type == 'concat' and np.prod(subsample) > 1:
                    self.subsample = nn.ModuleList([ConcatSubsampler(subsample[l], n_units, self.n_dirs)
                                                    for l in range(n_layers)])
                elif subsample_type == 'drop' and np.prod(subsample) > 1:
                    self.subsample = nn.ModuleList([DropSubsampler(subsample[l])
                                                    for l in range(n_layers)])

                # NiN
                self.nin = None
                if nin:
                    self.nin = nn.ModuleList()

                for l in range(n_layers):
                    if 'lstm' in rnn_type:
                        rnn_i = nn.LSTM
                    elif 'gru' in rnn_type:
                        rnn_i = nn.GRU
                    else:
                        raise ValueError('rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')

                    self.rnn += [rnn_i(self._output_dim, n_units, 1,
                                       bias=True, batch_first=True, dropout=0,
                                       bidirectional=self.bidirectional)]
                    self.dropout += [nn.Dropout(p=dropout)]
                    self._output_dim = n_units * self.n_dirs

                    # Projection layer
                    if self.proj is not None:
                        if l != n_layers - 1:
                            self.proj += [Linear(n_units * self.n_dirs, n_projs)]
                            self._output_dim = n_projs

                    # Task specific layer
                    if l == n_layers_sub1 - 1 and task_specific_layer:
                        self.rnn_sub1 = rnn_i(self._output_dim, n_units, 1,
                                              bias=True, batch_first=True, dropout=0,
                                              bidirectional=self.bidirectional)
                        self.dropout_sub1 = nn.Dropout(p=dropout)
                        if last_proj_dim != self.output_dim:
                            self.bridge_sub1 = Linear(n_units, last_proj_dim)
                    if l == n_layers_sub2 - 1 and task_specific_layer:
                        self.rnn_sub2 = rnn_i(self._output_dim, n_units, 1,
                                              bias=True, batch_first=True, dropout=0,
                                              bidirectional=self.bidirectional)
                        self.dropout_sub2 = nn.Dropout(p=dropout)
                        if last_proj_dim != self.output_dim:
                            self.bridge_sub2 = Linear(n_units, last_proj_dim)

                    # Network in network
                    if self.nin is not None:
                        if l != n_layers - 1:
                            self.nin += [NiN(self._output_dim)]
                        # if n_layers_sub1 > 0 or n_layers_sub2 > 0:
                        #     assert task_specific_layer

            if last_proj_dim != self.output_dim:
                self.bridge = Linear(self._output_dim, last_proj_dim)
                self._output_dim = last_proj_dim

        # Initialize parameters
        self.reset_parameters(param_init)
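
The CNN configuration strings above are parsed into nested lists of ints; with hypothetical values the parsing behaves like this:

conv_channels = '32_32'
conv_kernel_sizes = '(3,3)_(3,3)'
channels = [int(c) for c in conv_channels.split('_')]
kernel_sizes = [[int(c.split(',')[0].replace('(', '')),
                 int(c.split(',')[1].replace(')', ''))]
                for c in conv_kernel_sizes.split('_')]
print(channels)      # [32, 32]
print(kernel_sizes)  # [[3, 3], [3, 3]]
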
Example #18
    def __init__(self,
                 input_dim,
                 attn_type,
                 attn_n_heads,
                 n_layers,
                 d_model,
                 d_ff,
                 pe_type='add',
                 layer_norm_eps=1e-12,
                 dropout_in=0,
                 dropout=0,
                 dropout_att=0,
                 last_proj_dim=0,
                 n_stacks=1,
                 n_splices=1,
                 conv_in_channel=1,
                 conv_channels=0,
                 conv_kernel_sizes=[],
                 conv_strides=[],
                 conv_poolings=[],
                 conv_batch_norm=False,
                 conv_residual=False,
                 conv_bottleneck_dim=0,
                 param_init=0.1):

        super(TransformerEncoder, self).__init__()
        logger = logging.getLogger("training")

        self.d_model = d_model
        self.n_layers = n_layers
        self.attn_n_heads = attn_n_heads
        self.pe_type = pe_type

        # Setting for CNNs before RNNs
        if conv_channels:
            channels = [int(c) for c in conv_channels.split('_')] if len(conv_channels) > 0 else []
            kernel_sizes = [[int(c.split(',')[0].replace('(', '')),
                             int(c.split(',')[1].replace(')', ''))]
                            for c in conv_kernel_sizes.split('_')] if len(conv_kernel_sizes) > 0 else []
            strides = [[int(c.split(',')[0].replace('(', '')),
                        int(c.split(',')[1].replace(')', ''))]
                       for c in conv_strides.split('_')] if len(conv_strides) > 0 else []
            poolings = [[int(c.split(',')[0].replace('(', '')),
                         int(c.split(',')[1].replace(')', ''))]
                        for c in conv_poolings.split('_')] if len(conv_poolings) > 0 else []
        else:
            channels = []
            kernel_sizes = []
            strides = []
            poolings = []
            logger.warning(
                'Subsampling is automatically ignored because CNN layers are used before RNN layers.'
            )

        if len(channels) > 0:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=channels,
                                    kernel_sizes=kernel_sizes,
                                    strides=strides,
                                    poolings=poolings,
                                    dropout=0,
                                    batch_norm=conv_batch_norm,
                                    residual=conv_residual,
                                    bottleneck_dim=d_model,
                                    param_init=param_init)
            self._output_dim = self.conv.output_dim
        else:
            self._output_dim = input_dim * n_splices * n_stacks
            self.conv = None

            self.embed = Linear(self._output_dim,
                                d_model)  # NOTE: do not apply dropout here

        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type)
        self.layers = nn.ModuleList([
            TransformerEncoderBlock(d_model, d_ff, attn_type, attn_n_heads,
                                    dropout, dropout_att, layer_norm_eps)
            for l in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        if last_proj_dim != self.output_dim:
            self.bridge = Linear(self._output_dim, last_proj_dim)
            self._output_dim = last_proj_dim
        else:
            self.bridge = None
            self._output_dim = d_model

        # Initialize parameters
        self.reset_parameters()
Example #19
    def __init__(self,
                 key_dim,
                 query_dim,
                 attn_type,
                 attn_dim,
                 sharpening_factor=1,
                 sigmoid_smoothing=False,
                 conv_out_channels=10,
                 conv_kernel_size=100,
                 dropout=0):

        super(AttentionMechanism, self).__init__()

        self.attn_type = attn_type
        self.attn_dim = attn_dim
        self.sharpening_factor = sharpening_factor
        self.sigmoid_smoothing = sigmoid_smoothing
        self.n_heads = 1
        self.key = None
        self.mask = None

        # attention dropout applied after the softmax layer
        self.attn_dropout = nn.Dropout(p=dropout)

        if attn_type == 'no':
            raise NotImplementedError
            # NOTE: sequence-to-sequence without attention (use the last state as a context vector)

        elif attn_type == 'add':
            self.w_key = Linear(key_dim, attn_dim, bias=True)
            self.w_query = Linear(query_dim, attn_dim, bias=False)
            self.v = Linear(attn_dim, 1, bias=False)

        elif attn_type == 'location':
            self.w_key = Linear(key_dim, attn_dim, bias=True)
            self.w_query = Linear(query_dim, attn_dim, bias=False)
            self.w_conv = Linear(conv_out_channels, attn_dim, bias=False)
            self.conv = nn.Conv2d(in_channels=1,
                                  out_channels=conv_out_channels,
                                  kernel_size=(1, conv_kernel_size * 2 + 1),
                                  stride=1,
                                  padding=(0, conv_kernel_size),
                                  bias=False)
            self.v = Linear(attn_dim, 1, bias=False)

        elif attn_type == 'dot':
            self.w_key = Linear(key_dim, attn_dim, bias=False)
            self.w_query = Linear(query_dim, attn_dim, bias=False)

        elif attn_type == 'luong_dot':
            pass
            # NOTE: no additional parameters

        elif attn_type == 'luong_general':
            self.w_key = Linear(key_dim, query_dim, bias=False)

        elif attn_type == 'luong_concat':
            self.w = Linear(key_dim + query_dim, attn_dim, bias=False)
            self.v = Linear(attn_dim, 1, bias=False)

        else:
            raise ValueError(attn_type)
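
For the 'add' branch, the score, softmax, and context computation that the forward pass typically performs looks like this (a standalone sketch with assumed shapes; masking, sharpening, and sigmoid smoothing are omitted):

import torch
import torch.nn as nn

key_dim, query_dim, attn_dim = 320, 512, 512
w_key = nn.Linear(key_dim, attn_dim, bias=True)
w_query = nn.Linear(query_dim, attn_dim, bias=False)
v = nn.Linear(attn_dim, 1, bias=False)

keys = torch.randn(4, 50, key_dim)     # encoder outputs [B, T, key_dim]
query = torch.randn(4, 1, query_dim)   # decoder state   [B, 1, query_dim]
e = v(torch.tanh(w_key(keys) + w_query(query))).squeeze(-1)  # [B, T]
aw = torch.softmax(e, dim=-1)                                # attention weights
context = torch.bmm(aw.unsqueeze(1), keys)                   # [B, 1, key_dim]
print(context.shape)
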
Example #20
    def __init__(self, d_model, d_ff, dropout):
        super(PositionwiseFeedForward, self).__init__()

        self.w_1 = Linear(d_model, d_ff)
        self.w_2 = Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
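
The forward pass this module usually implements is FFN(x) = W_2(dropout(ReLU(W_1 x))); a minimal sketch with assumed sizes:

import torch
import torch.nn as nn

d_model, d_ff = 256, 2048
w_1, w_2 = nn.Linear(d_model, d_ff), nn.Linear(d_ff, d_model)
drop = nn.Dropout(0.1)

x = torch.randn(4, 50, d_model)
y = w_2(drop(torch.relu(w_1(x))))
print(y.shape)  # torch.Size([4, 50, 256])
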