Example #1
    def test_longformer(self):
        config = LongformerConfig()
        config.attention_mode = "n2"
        config.attention_window = [256] * 12
        config.attention_dilation = [1] * 12
        longformer = Longformer(config)
        encoder = TransformerEncoderBuilder.from_kwargs(
            n_layers=12,
            n_heads=12,
            query_dimensions=64,
            value_dimensions=64,
            feed_forward_dimensions=3072,
            attention_type="full",
            final_normalization=False,
            activation="gelu").get()
        longformer.eval()
        encoder.eval()

        # Before the weight copy they should be different
        x = torch.rand(3, 10, 768)
        o1 = longformer.encoder(x, head_mask=[None] * 12)[0]
        o2 = encoder(x)
        self.assertGreater(torch.abs(o1 - o2).max().item(), 1)

        # And after the copy they should be exactly the same
        encoder.load_state_dict(LongformerMapper().map(
            longformer.encoder.state_dict()))
        o1 = longformer.encoder(x, head_mask=[None] * 12)[0]
        o2 = encoder(x)
        self.assertLess(torch.abs(o1 - o2).max().item(), 1e-4)
Example #2
    def test_selfattention(self):
        np.random.seed(1)
        random.seed(1)
        torch.manual_seed(1)
        torch.cuda.manual_seed(1)
        torch.cuda.manual_seed_all(1)

        seqlen = 1024
        embed_dim = 60
        num_heads = 3
        bsz = 3
        config = LongformerConfig()
        config.num_attention_heads = num_heads
        config.hidden_size = embed_dim
        config.attention_probs_dropout_prob = 0.0
        config.attention_window = [256]
        config.attention_dilation = [1]
        config.attention_mode = 'sliding_chunks'
        config.autoregressive = False

        attn = LongformerSelfAttention(config=config, layer_id=0)
        attn = attn.cuda()

        hidden_state = torch.randn(bsz, seqlen, embed_dim)
        attention_mask = torch.zeros(
            (bsz, 1, 1, seqlen), dtype=torch.int)  # local attention everywhere

        # test None attention_mask (default which is local attention everywhere)
        output_nonemask = self._run_test(attn, hidden_state, None)
        output = self._run_test(attn, hidden_state, attention_mask)
        self.assertTrue(torch.allclose(output, output_nonemask, atol=1e-7))

        # test padding
        attention_mask[:, :, :, -10:] = -1
        self._run_test(attn, hidden_state, attention_mask)

        # test same global attention on all examples
        attention_mask[:, :, :, :10] = 1
        self._run_test(attn, hidden_state, attention_mask)

        # test same number of global attention but different locations
        attention_mask[:] = 0
        attention_mask[:, :, :, -10:] = -1
        attention_mask[0, :, :, :10] = 1
        attention_mask[1, :, :, 5:15] = 1
        attention_mask[2, :, :, 10:20] = 1
        self._run_test(attn, hidden_state, attention_mask)

        # test variable number of global attention
        attention_mask[:] = 0
        attention_mask[:, :, :, -10:] = -1
        attention_mask[0, :, :, 5:15] = 1
        attention_mask[2, :, :, 13:17] = 1
        self._run_test(attn, hidden_state, attention_mask)
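
The _run_test helper is not included in this snippet. A minimal sketch of what it plausibly does, assuming it only moves the inputs to the GPU, runs the attention layer, and returns the context output (the body below is an assumption, not the project's actual helper):

    def _run_test(self, attn, hidden_state, attention_mask):
        # Hypothetical helper: run LongformerSelfAttention on GPU inputs and
        # return the context output used in the comparisons above.
        hidden_state = hidden_state.cuda()
        if attention_mask is not None:
            attention_mask = attention_mask.cuda()
        return attn(hidden_state, attention_mask=attention_mask)[0].cpu()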
Example #3
    def __init__(self):
        super(Model, self).__init__()
        self.config = LongformerConfig.from_pretrained('./longformer_pretrain')
        self.config.attention_mode = 'sliding_chunks'
        self.longformer = Longformer.from_pretrained('./longformer_pretrain',
                                                     config=self.config)
        self.output = nn.Linear(self.config.hidden_size, 2)
Example #4
    def __init__(self, num_labels, args):
        super().__init__()

        config = LongformerConfig.from_pretrained(args.model + '-4096/')
        config.attention_mode = 'sliding_chunks'

        self.longformer = Longformer.from_pretrained(args.model + '-4096/',
                                                     config=config)
        self.dropout = nn.Dropout(self.longformer.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.longformer.config.hidden_size,
                                    num_labels)
Example #5
    def __init__(self, args):
        super(EncoderLayer, self).__init__()
        # self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)

        config = LongformerConfig()
        config.num_attention_heads = args.num_heads
        config.hidden_size = args.encoder_hidden_size
        config.attention_probs_dropout_prob = args.dropout
        config.attention_window = [args.window_size]
        config.attention_dilation = [1]  # No dilation
        config.attention_mode = 'tvm'
        config.output_attentions = True
        config.autoregressive = False

        self.self_attn = LongformerSelfAttention(config=config, layer_id=0)
        self.pos_ffn = PositionwiseFeedForward(
            args.encoder_hidden_size, args.encoder_hidden_size, dropout=args.dropout)
        
        self.layer_norm = nn.LayerNorm(args.encoder_hidden_size)
        self.dropout = nn.Dropout(args.dropout)
        
        self.fc = nn.Linear(args.encoder_hidden_size, args.encoder_hidden_size)
        nn.init.xavier_normal_(self.fc.weight)
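
The constructor above only wires up the modules. A hypothetical forward pass for this encoder layer, assuming pos_ffn maps a tensor to a tensor of the same shape and that the residual/normalization order shown is the intended one (both are assumptions, not taken from the project):

    def forward(self, enc_input, attention_mask=None):
        # Hypothetical sketch: Longformer self-attention, linear projection with
        # residual connection and layer norm, then the position-wise feed-forward.
        attn_out = self.self_attn(enc_input, attention_mask=attention_mask)[0]
        out = self.layer_norm(enc_input + self.dropout(self.fc(attn_out)))
        return self.pos_ffn(out)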
Example #6
    def _run_test(self, device, dtype, attention_mode):

        config = LongformerConfig.from_pretrained(
            '/net/s3/s2-research/beltagy/longformer/model_release/longformer-base-4096/config.json'
        )
        config.attention_mode = attention_mode
        model = Longformer.from_pretrained(
            '/net/s3/s2-research/beltagy/longformer/model_release/longformer-base-4096/pytorch_model.bin',
            config=config)
        model = model.eval()

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        tokenizer.model_max_length = 4096

        SAMPLE_TEXT = ' '.join(['Hello world! '] * 1025)  # long input document
        token_ids = tokenizer.encode(SAMPLE_TEXT)
        token_ids = token_ids[:4095] + token_ids[-1:]
        input_ids = torch.tensor(token_ids).unsqueeze(0)

        input_ids = input_ids.to(device=device)
        model = model.to(device=device, dtype=dtype)

        attention_mask = torch.ones(input_ids.shape,
                                    dtype=torch.long,
                                    device=input_ids.device)
        attention_mask[:, [1, 4, 21]] = 2

        output = model(input_ids, attention_mask=attention_mask)[0]
        output = output.float().sum()

        expected_output_sum = torch.tensor(
            76193.671875, device=device
        )  # with no padding needed, and fixed roberta-tokenizer

        print(
            f'device: {device}, dtype: {dtype}, attention_mode: {attention_mode} '
            f'Expected: {expected_output_sum}, Given: {output.sum()}')
        atol = 1e-2 if dtype == torch.half else 1e-4
        self.assertTrue(
            torch.allclose(output.sum(), expected_output_sum, atol=atol))
Example #7
    def __init__(self, init_args):
        super().__init__()
        if isinstance(init_args, dict):
            # for loading the checkpoint, pl passes a dict (hparams are saved as dict)
            init_args = Namespace(**init_args)
        config_path = init_args.config_path or init_args.model_dir
        checkpoint_path = init_args.checkpoint_path or init_args.model_dir
        logger.info(
            f'loading model from config: {config_path}, checkpoint: {checkpoint_path}'
        )
        config = LongformerConfig.from_pretrained(config_path)
        config.attention_mode = init_args.attention_mode
        logger.info(f'attention mode set to {config.attention_mode}')
        self.model_config = config
        self.model = Longformer.from_pretrained(checkpoint_path, config=config)
        self.tokenizer = BertTokenizer.from_pretrained(init_args.tokenizer)
        self.tokenizer.model_max_length = self.model.config.max_position_embeddings
        self.hparams = init_args
        self.hparams.seqlen = self.model.config.max_position_embeddings
        self.classifier = nn.Linear(config.hidden_size, init_args.num_labels)
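
Only the constructor is shown. A minimal, hypothetical forward pass for this classifier, using the hidden state of the first token (which would typically be given global attention) as the sequence representation:

    def forward(self, input_ids, attention_mask=None):
        # Hypothetical sketch: encode with Longformer, classify from the first token.
        sequence_output = self.model(input_ids, attention_mask=attention_mask)[0]
        return self.classifier(sequence_output[:, 0])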
Example #8
File: test_readme.py  Project: yf1291/nlp4
    def test_something(self):
        config = LongformerConfig.from_pretrained(self.model_dir)
        # choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
        # 'n2': for regular n2 attention
        # 'tvm': a custom CUDA kernel implementation of our sliding window attention
        # 'sliding_chunks': a PyTorch implementation of our sliding window attention
        config.attention_mode = 'sliding_chunks'

        model = Longformer.from_pretrained(self.model_dir, config=config)
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        tokenizer.model_max_length = model.config.max_position_embeddings

        SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document

        input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(
            0)  # batch of size 1

        # TVM code doesn't work on CPU. Uncomment this if `config.attention_mode = 'tvm'`
        # model = model.cuda(); input_ids = input_ids.cuda()

        # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
        attention_mask = torch.ones(
            input_ids.shape, dtype=torch.long,
            device=input_ids.device)  # initialize to local attention
        attention_mask[:, [1, 4, 21]] = 2  # Set global attention based on the task. For example,
        # classification: the <s> token
        # QA: question tokens

        # padding seqlen to the nearest multiple of 512. Needed for the 'sliding_chunks' attention
        input_ids, attention_mask = pad_to_window_size(
            input_ids, attention_mask, config.attention_window[0],
            tokenizer.pad_token_id)

        output = model(input_ids, attention_mask=attention_mask)[0]

        # could have done more here....
        self.assertIsNotNone(output)
Example #9
import torch
from longformer.longformer import Longformer, LongformerConfig
from longformer.sliding_chunks import pad_to_window_size
from transformers import RobertaTokenizer

config = LongformerConfig.from_pretrained('downloads/longformer-base-4096/') 
# choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
# 'n2': for regular n2 attention
# 'tvm': a custom CUDA kernel implementation of our sliding window attention
# 'sliding_chunks': a PyTorch implementation of our sliding window attention
#config.attention_mode = 'n2'
config.attention_mode = 'tvm'
#config.attention_mode = 'sliding_chunks'

model = Longformer.from_pretrained('downloads/longformer-base-4096/', config=config)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.model_max_length = model.config.max_position_embeddings

SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document

input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

# TVM code doesn't work on CPU. Uncomment this if `config.attention_mode = 'tvm'`
model = model.cuda(); input_ids = input_ids.cuda()

# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
attention_mask[:, [1, 4, 21]] = 2  # Set global attention based on the task. For example,
                                   # classification: the <s> token
                                   # QA: question tokens
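
Following the same pattern as Example #8, the usual next steps are to pad the sequence to a multiple of the attention window and run the forward pass. A sketch of that continuation:

# Sketch of the usual continuation (see Example #8). Padding is required for the
# 'sliding_chunks' mode; pad_to_window_size masks the padded positions.
input_ids, attention_mask = pad_to_window_size(
    input_ids, attention_mask, config.attention_window[0], tokenizer.pad_token_id)

output = model(input_ids, attention_mask=attention_mask)[0]  # last hidden states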
Example #10
# model =  ReformerModel.from_pretrained('google/reformer-crime-and-punishment')
#
# input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
# print(input_ids.shape)
# outputs = model(input_ids)
#
# pooled_output = torch.mean(outputs[0], dim=1)
#
# last_hidden_states = outputs[0]

import torch
from longformer.longformer import Longformer, LongformerConfig
from longformer.sliding_chunks import pad_to_window_size
from transformers import RobertaTokenizer

config = LongformerConfig.from_pretrained('longformer-base-4096/')
# choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
# 'n2': for regular n2 attention
# 'tvm': a custom CUDA kernel implementation of our sliding window attention
# 'sliding_chunks': a PyTorch implementation of our sliding window attention
config.attention_mode = 'sliding_chunks'

model = Longformer.from_pretrained('longformer-base-4096/', config=config)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.model_max_length = model.config.max_position_embeddings

SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
SAMPLE_TEXT = f'{tokenizer.cls_token}{SAMPLE_TEXT}{tokenizer.eos_token}'

input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(
    0)  # batch of size 1
Example #11
# -*- coding: utf-8 -*-
"""
@Time : 2020/11/12 15:59
@Auth : xiaolu
@File :test2.py
@IDE :PyCharm
@Email:[email protected]
"""
import torch
from pdb import set_trace
from transformers import BertTokenizer, AdamW
from longformer.longformer import Longformer, LongformerConfig
from longformer.sliding_chunks import pad_to_window_size

config = LongformerConfig.from_pretrained('./longformer_pretrain')
config.attention_mode = 'sliding_chunks'
model = Longformer.from_pretrained('./longformer_pretrain', config=config)

tokenizer = BertTokenizer.from_pretrained('./longformer_pretrain/vocab.txt')
tokenizer.model_max_length = model.config.max_position_embeddings

input_text = '你是我患得患失的梦' * 200
input_ids = torch.tensor(tokenizer.encode(input_text)).unsqueeze(0)
attention_mask = torch.ones(input_ids.shape,
                            dtype=torch.long,
                            device=input_ids.device)

print(input_ids.size())

input_ids, attention_mask = pad_to_window_size(input_ids, attention_mask,
                                               config.attention_window[0],
                                               tokenizer.pad_token_id)
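
A hypothetical final step, running the padded batch through the model:

# Assumed continuation: forward pass over the padded inputs.
output = model(input_ids, attention_mask=attention_mask)[0]
print(output.size())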
Example #12
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, MafiaDataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    num_labels = 2
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = LongformerConfig.from_pretrained('longformer-base-4096/')
    config.num_labels = num_labels
    config.attention_mode = 'sliding_chunks'

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    tokenizer.model_max_length = config.max_position_embeddings

    longformer = Longformer.from_pretrained('longformer-base-4096/',
                                            config=config)

    model = LongformerForSequenceClassification(config, longformer)

    # Get datasets
    train_dataset = MafiascumDataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = MafiascumDataset(
        data_args, tokenizer=tokenizer,
        mode="dev") if training_args.do_eval else None
    test_dataset = MafiascumDataset(
        data_args, tokenizer=tokenizer,
        mode="test") if training_args.do_predict else None

    def compute_metrics(p: EvalPrediction) -> Dict:
        def simple_accuracy(preds, labels):
            return (preds == labels).mean()

        def acc_and_f1(preds, labels):
            acc = simple_accuracy(preds, labels)
            f1 = f1_score(y_true=labels, y_pred=preds)
            return {
                "acc": acc,
                "f1": f1,
                "acc_and_f1": (acc + f1) / 2,
            }

        preds = np.argmax(p.predictions, axis=1)
        return acc_and_f1(preds, p.label_ids)

    # import torch_xla.core.xla_model as xm
    # device = xm.xla_device(n=1)
    device = 'cuda'

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        device=device,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_datasets = [eval_dataset]
        for eval_dataset in eval_datasets:
            eval_result = trainer.evaluate(eval_dataset=eval_dataset).metrics

            output_eval_file = os.path.join(
                training_args.output_dir,
                f"eval_results_{eval_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(
                        eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        test_datasets = [test_dataset]

        for test_dataset in test_datasets:
            predictions = trainer.predict(
                test_dataset=test_dataset).predictions
            # For classification
            predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(
                training_args.output_dir,
                f"test_results_{test_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(
                        test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        item = test_dataset.get_labels()[item]
                        writer.write("%d\t%s\n" % (index, item))
    return eval_results
Example #13
    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = "conv2d",
        normalize_before: bool = True,
        concat_after: bool = False,
        positionwise_layer_type: str = "linear",
        positionwise_conv_kernel_size: int = 3,
        macaron_style: bool = False,
        rel_pos_type: str = "legacy",
        pos_enc_layer_type: str = "abs_pos",
        selfattention_layer_type: str = "lf_selfattn",
        activation_type: str = "swish",
        use_cnn_module: bool = True,
        zero_triu: bool = False,
        cnn_module_kernel: int = 31,
        padding_idx: int = -1,
        interctc_layer_idx: List[int] = [],
        interctc_use_conditioning: bool = False,
        attention_windows: list = [100, 100, 100, 100, 100, 100],
        attention_dilation: list = [1, 1, 1, 1, 1, 1],
        attention_mode: str = "sliding_chunks",
    ):
        assert check_argument_types()
        super().__init__(input_size)
        self._output_size = output_size

        activation = get_activation(activation_type)

        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        else:
            raise ValueError("incorrect or unknown pos_enc_layer: " +
                             pos_enc_layer_type + ". Use abs_pos")

        if len(attention_dilation) != num_blocks:
            raise ValueError(
                "incorrect attention_dilation parameter of length" +
                str(len(attention_dilation)) + " does not match num_blocks" +
                str(num_blocks))

        if len(attention_windows) != num_blocks:
            raise ValueError(
                "incorrect attention_windows parameter of length" +
                str(len(attention_windows)) + " does not match num_blocks" +
                str(num_blocks))

        if attention_mode != "tvm" and max(attention_dilation) != 1:
            raise ValueError("incorrect attention mode for dilation: " +
                             attention_mode +
                             "Use attention_mode=tvm with Cuda Kernel")

        if input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(input_size, output_size),
                torch.nn.LayerNorm(output_size),
                torch.nn.Dropout(dropout_rate),
                pos_enc_class(output_size, positional_dropout_rate),
            )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(
                input_size,
                output_size,
                dropout_rate,
                pos_enc_class(output_size, positional_dropout_rate),
            )
        elif input_layer == "conv2d2":
            self.embed = Conv2dSubsampling2(
                input_size,
                output_size,
                dropout_rate,
                pos_enc_class(output_size, positional_dropout_rate),
            )
        elif input_layer == "conv2d6":
            self.embed = Conv2dSubsampling6(
                input_size,
                output_size,
                dropout_rate,
                pos_enc_class(output_size, positional_dropout_rate),
            )
        elif input_layer == "conv2d8":
            self.embed = Conv2dSubsampling8(
                input_size,
                output_size,
                dropout_rate,
                pos_enc_class(output_size, positional_dropout_rate),
            )
        elif input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(input_size,
                                   output_size,
                                   padding_idx=padding_idx),
                pos_enc_class(output_size, positional_dropout_rate),
            )
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(output_size, positional_dropout_rate),
            )
        elif input_layer is None:
            self.embed = torch.nn.Sequential(
                pos_enc_class(output_size, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + input_layer)

        self.normalize_before = normalize_before
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (
                output_size,
                linear_units,
                dropout_rate,
                activation,
            )
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (
                output_size,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (
                output_size,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        else:
            raise NotImplementedError("Support only linear or conv1d.")
        self.selfattention_layer_type = selfattention_layer_type
        if selfattention_layer_type == "lf_selfattn":
            assert pos_enc_layer_type == "abs_pos"
            from longformer.longformer import LongformerConfig

            from espnet.nets.pytorch_backend.transformer.longformer_attention import (
                LongformerAttention, )

            encoder_selfattn_layer = LongformerAttention

            config = LongformerConfig(
                attention_window=attention_windows,
                attention_dilation=attention_dilation,
                autoregressive=False,
                num_attention_heads=attention_heads,
                hidden_size=output_size,
                attention_probs_dropout_prob=dropout_rate,
                attention_mode=attention_mode,
            )
            encoder_selfattn_layer_args = (config, )
        else:
            raise ValueError("incompatible or unknown encoder_attn_layer: " +
                             selfattention_layer_type + " Use lf_selfattn")

        convolution_layer = ConvolutionModule
        convolution_layer_args = (output_size, cnn_module_kernel, activation)

        self.encoders = repeat(
            num_blocks,
            lambda layer_id: EncoderLayer(
                output_size,
                encoder_selfattn_layer(*(encoder_selfattn_layer_args +
                                         (layer_id, ))),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args)
                if macaron_style else None,
                convolution_layer(*convolution_layer_args)
                if use_cnn_module else None,
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )

        if self.normalize_before:
            self.after_norm = LayerNorm(output_size)

        self.interctc_layer_idx = interctc_layer_idx
        if len(interctc_layer_idx) > 0:
            assert 0 < min(interctc_layer_idx) and max(
                interctc_layer_idx) < num_blocks
        self.interctc_use_conditioning = interctc_use_conditioning
        self.conditioning_layer = None