Example #1
    def build_modules(self):

        e_length = expected_length(self.layers, self.death_rate)

        if self.reversible:
            print(
                "* Reversible Transformer Decoder with Absolute Attention with %.2f expected layers"
                % e_length)
        else:
            print(
                "* Transformer Decoder with Absolute Attention with %.2f expected layers"
                % e_length)

        for _l in range(self.layers):
            # linearly decay the death rate
            death_r = (_l + 1.0) / self.layers * self.death_rate
            if not self.reversible:
                # block = DecoderLayer(self.n_heads, self.model_size,
                #                      self.dropout, self.inner_size, self.attn_dropout,
                #                      variational=self.variational_dropout, death_rate=death_r)
                block = DecoderLayer(self.opt, death_rate=death_r)
            else:
                block = ReversibleTransformerDecoderLayer(self.opt,
                                                          death_rate=death_r)

            self.layer_modules.append(block)
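
Every example below calls an `expected_length` helper to report the expected depth of the stochastically pruned stack. The helper itself is not shown in these snippets; the following is a minimal sketch of what it plausibly computes under the same linear decay used in the loops (layer `l` survives with probability `1 - (l + 1) / layers * death_rate`, and the expected depth is the sum of the survival probabilities):

def expected_length(layers, death_rate):
    # Expected number of surviving layers when layer l (0-indexed) is
    # dropped with probability (l + 1) / layers * death_rate.
    e_length = 0.0
    for l in range(layers):
        survival = 1.0 - (l + 1.0) / layers * death_rate
        e_length += survival
    return e_length

For instance, `expected_length(6, 0.5)` returns 4.25, i.e. roughly four of the six layers are executed on average.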
Example #2
    def build_modules(self):

        e_length = expected_length(self.layers, self.death_rate)
        self.opt.ignore_source = self.ignore_source
        if self.reversible:
            print(
                "* Transformer Reversible Decoder with Relative Attention with %.2f expected layers"
                % e_length)
        else:
            print(
                "* Transformer Decoder with Relative Attention with %.2f expected layers"
                % e_length)

        self.layer_modules = nn.ModuleList()

        for l in range(self.layers):
            # linearly decay the death rate
            death_r = (l + 1.0) / self.layers * self.death_rate

            if not self.reversible:
                block = RelativeTransformerDecoderLayer(self.opt,
                                                        death_rate=death_r)
            else:
                block = ReversibleTransformerDecoderLayer(self.opt)

            self.layer_modules.append(block)
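
The `death_rate` handed to each layer implements stochastic depth: during training the layer skips its residual branch with that probability, and the surviving branch is compensated so that expectations match at inference. The real `DecoderLayer` / `RelativeTransformerDecoderLayer` code is not part of these snippets, so the following is only a sketch of the mechanism with a hypothetical `StochasticDepthBlock`, using the standard formulation that rescales the branch by its survival probability at eval time:

import torch
import torch.nn as nn

class StochasticDepthBlock(nn.Module):
    # Hypothetical block illustrating how a per-layer death_rate is typically used.
    def __init__(self, model_size, death_rate=0.0):
        super().__init__()
        self.death_rate = death_rate
        self.ffn = nn.Sequential(
            nn.Linear(model_size, model_size),
            nn.ReLU(),
            nn.Linear(model_size, model_size),
        )

    def forward(self, x):
        if self.training:
            # Drop the whole residual branch with probability death_rate.
            if torch.rand(1).item() < self.death_rate:
                return x
            return x + self.ffn(x)
        # At inference, scale the branch by its survival probability.
        return x + (1.0 - self.death_rate) * self.ffn(x)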
Example #3
    def build_modules(self):

        e_length = expected_length(self.layers, self.death_rate)

        if self.reversible:
            print(
                "* Reversible Transformer Encoder with Absolute Attention with %.2f expected layers"
                % e_length)
        else:
            print(
                "* Transformer Encoder with Absolute Attention with %.2f expected layers"
                % e_length)

        for _l in range(self.layers):
            # linearly decay the death rate
            death_r = (_l + 1.0) / self.layers * self.death_rate

            if not self.lsh_src_attention:
                if not self.reversible:
                    block = EncoderLayer(self.opt, death_rate=death_r)
                else:
                    block = ReversibleTransformerEncoderLayer(
                        self.opt, death_rate=death_r)

            else:
                from onmt.models.reformer import ReformerEncoderLayer
                block = ReformerEncoderLayer(self.opt, death_rate=death_r)

            self.layer_modules.append(block)
Example #4
    def build_modules(self):

        e_length = expected_length(self.layers, 0.0)
        self.opt.ignore_source = self.ignore_source
        opt = self.opt
        print(
            "* Speech Transformer Decoder with Relative Attention with %.2f layers"
            % e_length)

        self.layer_modules = nn.ModuleList()

        from .relative_transformer_layers import LIDFeedForward

        for l in range(self.layers):
            # stochastic depth is disabled for this decoder
            death_r = 0.0

            lid_network = LIDFeedForward(opt.model_size,
                                         2 * opt.model_size,
                                         opt.bottleneck_size,
                                         opt.n_languages,
                                         dropout=opt.dropout)

            block = RelativeTransformerDecoderLayer(self.opt,
                                                    death_rate=death_r,
                                                    lid_net=lid_network)

            self.layer_modules.append(block)
Example #5
    def build_modules(self):

        e_length = expected_length(self.layers, self.death_rate)

        print("* Universal Transformer Decoder with Absolute Attention with %.2f expected layers" % e_length)

        self.universal_layer = UniversalDecoderLayer(self.opt, death_rate=self.death_rate)
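
Unlike the other examples, this decoder builds a single `UniversalDecoderLayer` rather than a `ModuleList`, which implies the same weights are reused at every depth step in the forward pass (Universal Transformer style weight sharing). The forward pass is not part of the snippet; a toy, self-contained sketch of that sharing pattern, with a plain linear layer standing in for the real decoder layer:

import torch
import torch.nn as nn

class SharedDepthStack(nn.Module):
    # Toy model: one shared layer applied for a fixed number of depth steps.
    def __init__(self, model_size, steps):
        super().__init__()
        self.steps = steps
        self.shared_layer = nn.Linear(model_size, model_size)  # stand-in for UniversalDecoderLayer

    def forward(self, x):
        for _ in range(self.steps):
            x = torch.relu(self.shared_layer(x))
        return x

stack = SharedDepthStack(model_size=16, steps=6)
out = stack(torch.randn(2, 5, 16))   # the same parameters are used at all 6 steps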
Example #6
    def build_modules(self):

        e_length = expected_length(self.layers, self.death_rate)
        print("* Relative Translation Encoder with %.2f expected layers" % e_length)

        self.layer_modules = nn.ModuleList()

        for _l in range(self.layers):
            # linearly decay the death rate
            death_r = (_l + 1.0) / self.layers * self.death_rate
            block = RelativeTransformerEncoderLayer(self.opt, death_rate=death_r)

            self.layer_modules.append(block)
Example #7
    def build_modules(self):

        e_length = expected_length(self.layers, self.death_rate)

        print("* Transformer Decoder with Absolute Attention with %.2f expected layers" % e_length)

        self.layer_modules = nn.ModuleList()

        for l in range(self.layers):
            # linearly decay the death rate
            death_r = (l + 1.0) / self.layers * self.death_rate

            block = DecoderLayer(self.opt, death_rate=death_r)
            self.layer_modules.append(block)
Example #8
    def build_modules(self):

        e_length = expected_length(self.layers, self.death_rate)
        print("* Conformer Encoder with %.2f expected layers" % e_length)
        if self.unidirectional:
            print("* Running a unidirectional Encoder.")

        self.layer_modules = nn.ModuleList()

        for _l in range(self.layers):
            # linearly decay the death rate
            death_r = (_l + 1.0) / self.layers * self.death_rate

            block = ConformerEncoderLayer(self.opt, death_rate=death_r)
            self.layer_modules.append(block)
Example #9
    def build_modules(self):
        e_length = expected_length(self.layers, self.death_rate)

        print("* Transformer LM Decoder with Relative Attention with %.2f expected layers" % e_length)

        self.layer_modules = nn.ModuleList()

        for l in range(self.layers):
            # linearly decay the death rate
            death_r = (l + 1.0) / self.layers * self.death_rate

            block = TransformerXLDecoderLayer(self.n_heads, self.model_size,
                                              self.dropout, self.inner_size, self.attn_dropout,
                                              ignore_source=True,
                                              variational=self.variational_dropout, death_rate=death_r)
            self.layer_modules.append(block)
Example #10
    def build_modules(self):
        assert not self.opt.src_reversible
        e_length = expected_length(self.layers, self.death_rate)
        print(
            "* Bayes-By-Backprop Relative Transformer Encoder with %.2f expected layers"
            % e_length)
        if self.unidirectional:
            print("* Running a unidirectional Encoder.")

        self.layer_modules = nn.ModuleList()

        for _l in range(self.layers):
            # linearly decay the death rate
            death_r = (_l + 1.0) / self.layers * self.death_rate
            block = TransformerEncoderLayer(self.opt, death_rate=death_r)
            self.layer_modules.append(block)
Example #11
    def build_modules(self):

        e_length = expected_length(self.layers, self.death_rate)
        self.opt.ignore_source = self.ignore_source
        print(
            "* Bayes-By-Backprop Relative Transformer Decoder with %.2f expected layers"
            % e_length)

        self.layer_modules = nn.ModuleList()

        for l in range(self.layers):
            # linearly decay the death rate
            death_r = (l + 1.0) / self.layers * self.death_rate

            block = TransformerDecoderLayer(self.opt, death_rate=death_r)

            self.layer_modules.append(block)
Example #12
    def build_modules(self):

        self.death_rate = 0.0
        e_length = expected_length(self.layers, self.death_rate)
        self.opt.ignore_source = self.ignore_source
        opt = self.opt
        print("* Speech Transformer Decoder with Relative Attention with %.2f layers" % e_length)

        self.layer_modules = nn.ModuleList()

        for _l in range(self.layers):
            # self.death_rate is forced to 0.0 above, so every layer survives
            death_r = (_l + 1.0) / self.layers * self.death_rate

            block = RelativeTransformerDecoderLayer(self.opt, death_rate=death_r)

            self.layer_modules.append(block)
Example #13
    def build_modules(self):

        e_length = expected_length(self.layers, self.death_rate)
        print(
            "* Transformer Encoder with Absolute Attention with %.2f expected layers"
            % e_length)

        for _l in range(self.layers):
            # linearly decay the death rate
            death_r = (_l + 1.0) / self.layers * self.death_rate

            block = EncoderLayer(self.n_heads,
                                 self.model_size,
                                 self.dropout,
                                 self.inner_size,
                                 self.attn_dropout,
                                 variational=self.variational_dropout,
                                 death_rate=death_r)

            self.layer_modules.append(block)
Example #14
    def build_modules(self):

        e_length = expected_length(self.layers, self.death_rate)
        if self.reversible:
            print("* Reversible Encoder with Relative Attention with %.2f expected layers" % e_length)
        else:
            print("* Transformer Encoder with Relative Attention with %.2f expected layers" % e_length)
        if self.unidirectional:
            print("* Running a unidirectional Encoder.")

        self.layer_modules = nn.ModuleList()

        for _l in range(self.layers):
            # linearly decay the death rate
            death_r = (_l + 1.0) / self.layers * self.death_rate

            if not self.reversible:
                block = RelativeTransformerEncoderLayer(self.opt, death_rate=death_r)
            else:
                block = ReversibleTransformerEncoderLayer(self.opt, death_rate=death_r)

            self.layer_modules.append(block)
Example #15
    def __init__(self, opt, dicts, positional_encoder, encoder_type='text'):
        self.death_rate = opt.death_rate
        self.double_position = opt.double_position
        self.max_pos_length = opt.max_pos_length
        self.layer_modules = list()

        # build_modules will be called from the inherited constructor
        super(RelativeTransformerEncoder,
              self).__init__(opt, dicts, positional_encoder, encoder_type)

        print("Encoder type: %s", encoder_type)
        # self.positional_encoder = SinusoidalPositionalEmbedding(opt.model_size)

        # embedding for the positions
        # 2N + 1 positions because the relative distance runs from -N -> 0 -> N
        self.positional_encoder = nn.Embedding(2 * self.max_pos_length + 1,
                                               self.model_size)
        self.d_head = self.model_size // self.n_heads

        e_length = expected_length(self.layers, self.death_rate)

        print(
            "* Transformer Encoder with Relative Attention with %.2f expected layers"
            % e_length)
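
The `nn.Embedding(2 * self.max_pos_length + 1, self.model_size)` table holds one vector for every relative distance in `[-N, N]`. The indexing code is not part of this constructor; a minimal sketch of how such a table is usually looked up, shifting each clamped relative distance by `+N` so it becomes a valid row index (the variable names here are illustrative, not from the source):

import torch
import torch.nn as nn

N = 512                                   # max_pos_length
model_size = 8
pos_emb = nn.Embedding(2 * N + 1, model_size)

seq_len = 5
positions = torch.arange(seq_len)
# relative distance i - j between every query/key pair
rel_dist = positions[:, None] - positions[None, :]
# clamp to [-N, N], then shift so -N maps to row 0 and +N maps to row 2N
rel_emb = pos_emb(rel_dist.clamp(-N, N) + N)   # shape: [seq_len, seq_len, model_size]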