Example #1
File: model.py  Project: augustdemi/mcrowd
    def __init__(self,
                 dec_inp_size,
                 dec_out_size,
                 d_latent,
                 N=6,
                 d_model=512,
                 d_ff=2048,
                 h=8,
                 dropout=0.1,
                 device='cpu',
                 d_map_latent=8):
        super(DecoderY, self).__init__()

        self.dec_out_size = dec_out_size
        self.d_model = d_model
        self.device = device

        self.trg_embed = nn.Sequential(
            LinearEmbedding(dec_inp_size, d_model - d_map_latent),
            PositionalEncoding(d_model - d_map_latent, dropout))
        self.decoder = Decoder(
            DecoderLayer(
                d_model, MultiHeadAttention(h, d_model),
                MultiHeadAttention(h, d_model),
                ConcatPointerwiseFeedforward(d_model, d_latent, d_ff, dropout),
                dropout), N)
        self.fc = nn.Linear(d_model, dec_out_size * 2)

        self.init_weights(self.decoder.parameters())
        self.init_weights(self.fc.parameters())

        self.map_encoder = load_map_encoder(device)
Example #2
    def __init__(self,
                 src_len,
                 tgt_len,
                 enc_inp_size,
                 dec_inp_size,
                 dec_out_size,
                 N=6,
                 d_model=512,
                 d_ff=2048,
                 h=8,
                 dropout=0.1,
                 device='cpu'):
        super(Generator, self).__init__()
        self.device = device
        self.src_len = src_len
        self.tgt_len = tgt_len
        self.dec_inp_size = dec_inp_size

        c = copy.deepcopy
        attn = MultiHeadAttention(h, d_model)
        ff = PointerwiseFeedforward(d_model, d_ff, dropout)
        position = PositionalEncoding(d_model, dropout)
        self.generator = EncoderDecoder(
            Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
            Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout),
                    N),
            nn.Sequential(LinearEmbedding(enc_inp_size, d_model), c(position)),
            nn.Sequential(LinearEmbedding(dec_inp_size, d_model), c(position)),
            TFHeadGenerator(d_model, dec_out_size))

        # This initialization was important in the original code:
        # initialize parameters with Glorot / fan_avg.
        for p in self.generator.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
Example #3
    def __init__(self,
                 num_layers=None,
                 d_model=None,
                 num_heads=None,
                 dff=None,
                 input_vocab_size=None,
                 target_vocab_size=None,
                 model_dir=None,
                 pe_input=None,
                 pe_target=None,
                 rate=0.1,
                 decoder=None,
                 final_layer=None,
                 args=None):
        super(TransformerBert, self).__init__()

        self.encoder = BertEncoder(model_dir=model_dir,
                                   d_model=d_model,
                                   args=args)
        if decoder:
            self.decoder = decoder
        else:
            self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                                   target_vocab_size, pe_target, rate)
        if final_layer:
            self.final_layer = final_layer
        else:
            self.final_layer = tf.keras.layers.Dense(target_vocab_size)
Example #4
    def create_model(cls, args):
        from transformer.conv_encoder import Conv2dSubsample
        from transformer.encoder import Encoder
        from transformer.attentionAssigner import Attention_Assigner
        # from transformer.attentionAssigner import Attention_Assigner_RNN as Attention_Assigner
        from transformer.decoder import Decoder_CIF as Decoder

        conv_encoder = Conv2dSubsample(d_input=args.d_input * args.LFR_m,
                                       d_model=args.d_model,
                                       n_layers=args.n_conv_layers)
        encoder = Encoder(d_input=args.d_model,
                          n_layers=args.n_layers_enc,
                          n_head=args.n_head,
                          d_model=args.d_model,
                          d_inner=args.d_inner,
                          dropout=args.dropout)
        assigner = Attention_Assigner(d_input=args.d_model,
                                      d_hidden=args.d_assigner_hidden,
                                      w_context=args.w_context,
                                      n_layers=args.n_assigner_layers)
        decoder = Decoder(sos_id=args.sos_id,
                          n_tgt_vocab=args.vocab_size,
                          n_layers=args.n_layers_dec,
                          n_head=args.n_head,
                          d_model=args.d_model,
                          d_inner=args.d_inner,
                          dropout=args.dropout)
        model = cls(conv_encoder, encoder, assigner, decoder,
                    args.spec_aug_cfg)

        return model
Example #5
    def __init__(self,
                 enc_inp_size,
                 dec_inp_size,
                 dec_out_size,
                 N=6,
                 d_model=512,
                 d_ff=2048,
                 heads=8,
                 dropout=0.1,
                 mean=[0, 0],
                 std=[0, 0]):
        super(IndividualTF, self).__init__()
        "Helper: Construct a model from hyperparameters."
        c = copy.deepcopy
        attn = MultiHeadAttention(heads, d_model)
        ff = PointerwiseFeedforward(d_model, d_ff, dropout)
        position = PositionalEncoding(d_model, dropout)
        self.mean = np.array(mean)
        self.std = np.array(std)

        self.model = EncoderDecoder(
            Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
            Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout),
                    N),
            nn.Sequential(LinearEmbedding(enc_inp_size, d_model), c(position)),
            nn.Sequential(LinearEmbedding(dec_inp_size, d_model), c(position)),
            Generator(d_model, dec_out_size))

        # This initialization was important in the original code:
        # initialize parameters with Glorot / fan_avg.
        for p in self.model.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
Example #6
    def create_model(cls, args):
        from transformer.decoder import Decoder
        from transformer.encoder import Encoder
        from transformer.conv_encoder import Conv2dSubsample

        conv_encoder = Conv2dSubsample(d_input=args.d_input * args.LFR_m,
                                       d_model=args.d_model,
                                       n_layers=args.n_conv_layers)
        encoder = Encoder(d_input=args.d_model,
                          n_layers=args.n_layers_enc,
                          n_head=args.n_head,
                          d_model=args.d_model,
                          d_inner=args.d_inner,
                          dropout=args.dropout)
        decoder = Decoder(sos_id=args.sos_id,
                          eos_id=args.eos_id,
                          n_tgt_vocab=args.vocab_size,
                          n_layers=args.n_layers_dec,
                          n_head=args.n_head,
                          d_model=args.d_model,
                          d_inner=args.d_inner,
                          dropout=args.dropout)

        model = cls(conv_encoder,
                    encoder,
                    decoder,
                    spec_aug_cfg=args.spec_aug_cfg)

        return model
Example #7
def make_model(
    src_vocab: int,
    tgt_vocab: int,
    n: int = 6,
    d_model: int = 512,
    d_ff: int = 2048,
    h: int = 8,
    dropout: float = 0.1,
    device: torch.device = torch.device("cpu"),
) -> EncoderDecoder:
    """Helper: Construct a model from hyperparameters."""
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), n),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), n),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    ).to(device)

    # This initialization was important in the original code:
    # initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
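
A quick sanity check for the helper above (not part of the original source): build a small model and count its parameters. The vocabulary sizes and reduced hyperparameters below are arbitrary.

import torch

# Hypothetical usage of make_model(); the sizes are made up for illustration.
model = make_model(src_vocab=1000, tgt_vocab=1000, n=2, d_model=128, d_ff=512, h=4)
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"trainable parameters: {n_params:,}")
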
Example #8
    def test_forward(self):
        # Parameters
        batch_size = 64
        sequence_length = 10
        d_k = d_v = d_model = input_size = 512
        d_ff = 2048
        nb_of_decoder_layers = 6

        # Initialize decoder
        decoder_layer = DecoderLayer(
            size=input_size,
            self_attn=MultiHeadAttention(n_head=8,
                                         d_model=d_model,
                                         d_k=d_k,
                                         d_v=d_v,
                                         dropout=0.1),
            memory_attn=MultiHeadAttention(n_head=8,
                                           d_model=d_model,
                                           d_k=d_k,
                                           d_v=d_v,
                                           dropout=0.1),
            feed_forward=PositionwiseFeedForward(d_model=d_model,
                                                 d_ff=d_ff,
                                                 dropout=0.1),
            dropout=0.1)

        decoder = Decoder(layer=decoder_layer, N=nb_of_decoder_layers)

        # Initialize input and memory
        x = torch.ones((batch_size, sequence_length, input_size))
        memory = torch.ones((batch_size, sequence_length, input_size))

        # Subsequent mask: prevent each position from attending to later positions
        decoder_mask = subsequent_mask(sequence_length)

        # Forward pass with dummy input and memory (identical tensors here)
        out = decoder.forward(x, memory, decoder_mask, None)

        # Unit Tests
        self.assertIsInstance(out, torch.Tensor)
        self.assertEqual(out.shape, x.shape)
        self.assertEqual(out.shape, memory.shape)
        self.assertEqual(x.shape, memory.shape)
        self.assertEqual(torch.isnan(out).sum(), 0)
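
For reference, subsequent_mask in codebases of this style is usually the standard look-ahead mask from the annotated-Transformer recipe. A minimal sketch of that conventional definition (an assumption, not necessarily the exact implementation used here):

import torch

def subsequent_mask(size):
    # Allow each position i to attend only to positions <= i.
    # Returns a (1, size, size) mask that is True where attention is permitted.
    attn_shape = (1, size, size)
    upper = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)
    return upper == 0
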
Example #9
File: model.py  Project: augustdemi/mcrowd
class DecoderY(nn.Module):
    def __init__(self,
                 dec_inp_size,
                 dec_out_size,
                 d_latent,
                 N=6,
                 d_model=512,
                 d_ff=2048,
                 h=8,
                 dropout=0.1,
                 device='cpu',
                 d_map_latent=8):
        super(DecoderY, self).__init__()

        self.dec_out_size = dec_out_size
        self.d_model = d_model
        self.device = device

        self.trg_embed = nn.Sequential(
            LinearEmbedding(dec_inp_size, d_model - d_map_latent),
            PositionalEncoding(d_model - d_map_latent, dropout))
        self.decoder = Decoder(
            DecoderLayer(
                d_model, MultiHeadAttention(h, d_model),
                MultiHeadAttention(h, d_model),
                ConcatPointerwiseFeedforward(d_model, d_latent, d_ff, dropout),
                dropout), N)
        self.fc = nn.Linear(d_model, dec_out_size * 2)

        self.init_weights(self.decoder.parameters())
        self.init_weights(self.fc.parameters())

        self.map_encoder = load_map_encoder(device)

    def init_weights(self, params):
        for p in params:
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, enc_out, latents, trg, src_mask, trg_mask, map):
        map = map.to(self.device)
        map_feat = self.map_encoder(trg[:, :, :2].reshape(-1, 2),
                                    map.reshape(-1, map.shape[2], map.shape[3],
                                                map.shape[4]),
                                    train=False)
        map_feat = map_feat.reshape((-1, trg.shape[1], map_feat.shape[-1]))

        trg_emb = torch.cat((self.trg_embed(trg), map_feat), dim=-1)
        dec_out = self.decoder(trg_emb, enc_out, latents.unsqueeze(1),
                               src_mask, trg_mask)  # bs, 12, 512
        stats = self.fc(dec_out)  # bs, 12, out*2
        mu = stats[:, :, :self.dec_out_size]
        logvar = stats[:, :, self.dec_out_size:]
        return mu, logvar
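
The decoder above returns a per-step mean and log-variance. Assuming these parameterize a diagonal Gaussian over the output (the names suggest this, but the snippet does not show the sampling step), a typical reparameterized draw would look like:

import torch

# Hypothetical sampling helper; assumes (mu, logvar) parameterize a diagonal Gaussian.
def sample_from_stats(mu, logvar):
    std = torch.exp(0.5 * logvar)   # convert log-variance to standard deviation
    eps = torch.randn_like(std)     # reparameterization trick: eps ~ N(0, I)
    return mu + eps * std           # sample with the same shape as mu
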
Example #10
    def __init__(self,
                 *args,
                 embedding_rank=None,
                 inner_rank=None,
                 ffward_rank=None,
                 **kwargs):
        # Call the constructor of NMTModel's base class, skipping NMTModel.__init__ itself
        super(NMTModel, self).__init__()
        self.vocab = pickle.load(open(paths.vocab, 'rb'))

        if embedding_rank is None:
            embedding_rank = transformer_config.embedding_rank
        if inner_rank is None:
            inner_rank = transformer_config.inner_rank
        if ffward_rank is None:
            ffward_rank = transformer_config.ffward_rank
        print(transformer_config.embedding_factorization,
              transformer_config.inner_factorization,
              transformer_config.ffward_factorization)
        print(embedding_rank, inner_rank, ffward_rank)
        self.encoder = Encoder(len(self.vocab.src), embedding_rank, inner_rank,
                               ffward_rank)
        self.decoder = Decoder(len(self.vocab.tgt), embedding_rank, inner_rank,
                               ffward_rank)

        self.gpu = False
        self.initialize()

        self.optimizer = NoamOpt(transformer_config.layer_dimension,
                                 train_config.lr,
                                 4000,
                                 Adam(
                                     self.parameters(),
                                     lr=0,
                                     betas=(0.9, 0.98),
                                     eps=1e-9,
                                 ),
                                 beginning_step=0)

        self.num_accumulations = 0
        self.accumulate = max(1, train_config.accumulate)
Example #11
    def __init__(self, vocabulary_size_in, vocabulary_size_out, constants, hyperparams):
        super(Transformer, self).__init__()
        self.constants = constants
        self.max_seq = hyperparams.MAX_SEQ
        self.EmbeddingSrc = Embedding(vocabulary_size=vocabulary_size_in, d_model=hyperparams.D_MODEL, constants=constants)
        self.EmbeddingTgt = Embedding(vocabulary_size=vocabulary_size_out, d_model=hyperparams.D_MODEL, constants=constants)
        self.Encoder = Encoder(nb_layers=hyperparams.NB_LAYERS, nb_heads=hyperparams.NB_HEADS, d_model=hyperparams.D_MODEL, nb_neurons=hyperparams.NB_NEURONS, dropout=hyperparams.DROPOUT)
        self.Decoder = Decoder(nb_layers=hyperparams.NB_LAYERS, nb_heads=hyperparams.NB_HEADS, d_model=hyperparams.D_MODEL, nb_neurons=hyperparams.NB_NEURONS, dropout=hyperparams.DROPOUT)
        self.Linear = nn.Linear(hyperparams.D_MODEL, vocabulary_size_out, bias=False)
        if hyperparams.SHARE_WEIGHTS:
            self.EmbeddingSrc.lookup_table.weight = self.Linear.weight
            self.EmbeddingTgt.lookup_table.weight = self.Linear.weight
Example #12
    def __init__(self,
                 num_layers,
                 d_model,
                 num_heads,
                 dff,
                 input_vocab_size,
                 target_vocab_size,
                 pe_input,
                 pe_target,
                 rate=0.1):
        super(Transformer, self).__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               input_vocab_size, pe_input, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               target_vocab_size, pe_target, rate)
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
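
This mirrors the Transformer from the TensorFlow tutorial; the matching call method is not part of the example. Under tutorial-style Encoder/Decoder interfaces (an assumption about this codebase), it would look roughly like the sketch below.

    # Sketch only, assuming TF-tutorial-style Encoder/Decoder call signatures.
    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)        # (batch, inp_len, d_model)
        dec_output, attn_weights = self.decoder(tar, enc_output, training,
                                                look_ahead_mask, dec_padding_mask)
        return self.final_layer(dec_output), attn_weights                 # (batch, tar_len, vocab)
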
Example #13
    def __init__(self,
                 n_src_vocab,
                 n_trg_vocab,
                 src_pad_idx,
                 trg_pad_idx,
                 d_word_vec=256,
                 d_model=256,
                 d_inner=512,
                 n_layer=3,
                 n_head=8,
                 dropout=0.1,
                 n_position=200):
        super(Transformer, self).__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

        self.encoder = Encoder(n_src_vocab=n_src_vocab,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_inner=d_inner,
                               n_layer=n_layer,
                               n_head=n_head,
                               pad_idx=src_pad_idx,
                               dropout=dropout,
                               n_position=n_position,
                               max_seq_len=32)
        self.decoder = Decoder(n_trg_vocab=n_trg_vocab,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_inner=d_inner,
                               n_layer=n_layer,
                               n_head=n_head,
                               pad_idx=trg_pad_idx,
                               n_position=n_position,
                               dropout=dropout)

        self.trg_word_prj = nn.Linear(d_model, n_trg_vocab, bias=False)

        # for name, param in self.named_parameters():
        #     if param.dim() > 1:
        #         nn.init.xavier_normal(param)

        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)
Example #14
    def create_model(cls, args):
        from transformer.decoder import Decoder
        from transformer.encoder import Encoder

        encoder = Encoder(d_input=args.d_input * args.LFR_m,
                          n_layers=args.n_layers_enc,
                          n_head=args.n_head,
                          d_model=args.d_model,
                          d_inner=args.d_inner,
                          dropout=args.dropout)
        decoder = Decoder(sos_id=args.sos_id,
                          eos_id=args.eos_id,
                          n_tgt_vocab=args.vocab_size,
                          n_layers=args.n_layers_dec,
                          n_head=args.n_head,
                          d_model=args.d_model,
                          d_inner=args.d_inner,
                          dropout=args.dropout)

        model = cls(encoder, decoder)

        return model
Example #15
    def __init__(self,
                 src_vocab_size,
                 tgt_vocab_size,
                 device,
                 d_model=512,
                 p_dropout=0.1):
        super(Transformer, self).__init__()
        self.d_model = d_model

        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.positional_encoder1 = PositionalEncoder(device,
                                                     d_model=d_model,
                                                     p_dropout=p_dropout)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoder2 = PositionalEncoder(device,
                                                     d_model=d_model,
                                                     p_dropout=p_dropout)
        self.encoder = Encoder(6, d_model)
        self.decoder = Decoder(6, d_model)
        self.linear = nn.Linear(d_model, tgt_vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

        # Share weights
        self.linear.weight = self.tgt_embedding.weight
Example #16
def train_net(args):
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_loss = float('inf')
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        # model
        encoder = Encoder(n_src_vocab,
                          args.n_layers_enc,
                          args.n_head,
                          args.d_k,
                          args.d_v,
                          args.d_model,
                          args.d_inner,
                          dropout=args.dropout,
                          pe_maxlen=args.pe_maxlen)
        decoder = Decoder(
            sos_id,
            eos_id,
            n_tgt_vocab,
            args.d_word_vec,
            args.n_layers_dec,
            args.n_head,
            args.d_k,
            args.d_v,
            args.d_model,
            args.d_inner,
            dropout=args.dropout,
            tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
            pe_maxlen=args.pe_maxlen)
        model = Transformer(encoder, decoder)
        # print(model)
        # model = nn.DataParallel(model)

        # optimizer
        optimizer = TransformerOptimizer(
            torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09))

    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    # Move to GPU, if available
    model = model.to(device)

    # Custom dataloaders
    train_dataset = AiChallenger2017Dataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               collate_fn=pad_collate,
                                               shuffle=True,
                                               num_workers=args.num_workers)
    valid_dataset = AiChallenger2017Dataset('valid')
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=args.batch_size,
                                               collate_fn=pad_collate,
                                               shuffle=False,
                                               num_workers=args.num_workers)

    # Epochs
    for epoch in range(start_epoch, args.epochs):
        # One epoch's training
        train_loss = train(train_loader=train_loader,
                           model=model,
                           optimizer=optimizer,
                           epoch=epoch,
                           logger=logger,
                           writer=writer)

        writer.add_scalar('epoch/train_loss', train_loss, epoch)
        writer.add_scalar('epoch/learning_rate', optimizer.lr, epoch)

        print('\nLearning rate: {}'.format(optimizer.lr))
        print('Step num: {}\n'.format(optimizer.step_num))

        # One epoch's validation
        valid_loss = valid(valid_loader=valid_loader,
                           model=model,
                           logger=logger)
        writer.add_scalar('epoch/valid_loss', valid_loss, epoch)

        # Check if there was an improvement
        is_best = valid_loss < best_loss
        best_loss = min(valid_loss, best_loss)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, optimizer,
                        best_loss, is_best)
Example #17
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')

    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0001,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=WORD_MAXLEN,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument(
        '--word',
        action='store_true',
        help='Train/Predict model using word based label (default: False)')
    parser.add_argument('--gen_label_index',
                        action='store_true',
                        help='Generate word label index map (default: False)')
    parser.add_argument('--iteration', type=str, help='Iteration')
    parser.add_argument('--premodel_session',
                        type=str,
                        help='Session name of premodel')

    # transformer model parameter
    parser.add_argument('--d_model',
                        type=int,
                        default=128,
                        help='transformer_d_model')
    parser.add_argument('--n_head',
                        type=int,
                        default=8,
                        help='transformer_n_head')
    parser.add_argument('--num_encoder_layers',
                        type=int,
                        default=4,
                        help='num_encoder_layers')
    parser.add_argument('--num_decoder_layers',
                        type=int,
                        default=4,
                        help='transformer_num_decoder_layers')
    parser.add_argument('--dim_feedforward',
                        type=int,
                        default=2048,
                        help='transformer_dim_feedforward')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.1,
                        help='transformer_dropout')

    # transformer warmup parameter
    parser.add_argument('--warmup_multiplier',
                        type=int,
                        default=3,
                        help='transformer_warmup_multiplier')
    parser.add_argument('--warmup_epoch',
                        type=int,
                        default=10,
                        help='transformer_warmup_epoch')

    args = parser.parse_args()
    char_loader = CharLabelLoader()
    char_loader.load_char2index('./hackathon.labels')
    label_loader = char_loader
    if args.word:
        if args.gen_label_index:
            generate_word_label_index_file(char_loader, TRAIN_LABEL_CHAR_PATH)
            from subprocess import call
            call(f'cat {TRAIN_LABEL_CHAR_PATH}', shell=True)
        # Load the word (POS) based label-to-index mapping
        word_loader = CharLabelLoader()
        word_loader.load_char2index('./hackathon.pos.labels')
        label_loader = word_loader
        if os.path.exists(TRAIN_LABEL_CHAR_PATH):
            generate_word_label_file(char_loader, word_loader,
                                     TRAIN_LABEL_POS_PATH,
                                     TRAIN_LABEL_CHAR_PATH)
    char2index = label_loader.char2index
    index2char = label_loader.index2char
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    ############ model
    print("model: transformer")
    # model = Transformer(d_model= args.d_model, n_head= args.n_head, num_encoder_layers= args.num_encoder_layers, num_decoder_layers= args.num_decoder_layers,
    #                     dim_feedforward= args.dim_feedforward, dropout= args.dropout, vocab_size= len(char2index), sound_maxlen= SOUND_MAXLEN, word_maxlen= WORD_MAXLEN)

    encoder = Encoder(d_input=128,
                      n_layers=6,
                      n_head=4,
                      d_k=128,
                      d_v=128,
                      d_model=128,
                      d_inner=2048,
                      dropout=0.1,
                      pe_maxlen=SOUND_MAXLEN)
    decoder = Decoder(sos_id=SOS_token,
                      eos_id=EOS_token,
                      n_tgt_vocab=len(char2index),
                      d_word_vec=128,
                      n_layers=6,
                      n_head=4,
                      d_k=128,
                      d_v=128,
                      d_model=128,
                      d_inner=2048,
                      dropout=0.1,
                      tgt_emb_prj_weight_sharing=True,
                      pe_maxlen=SOUND_MAXLEN)
    model = Transformer(encoder, decoder)

    optimizer = TransformerOptimizer(
        torch.optim.Adam(model.parameters(),
                         lr=0.0004,
                         betas=(0.9, 0.98),
                         eps=1e-09))

    ############/

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)
    """
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)

    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.max_epochs)
    scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=args.warmup_multiplier, total_epoch=args.warmup_epoch, after_scheduler=scheduler_cosine)
    
    
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)
    """

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"

            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    # target_path = os.path.join(DATASET_PATH, 'train_label')
    target_path = TRAIN_LABEL_CHAR_PATH
    if args.word:
        target_path = TRAIN_LABEL_POS_PATH
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    if args.iteration:
        if args.premodel_session:
            nsml.load(args.iteration, session=args.premodel_session)
            logger.info(f'Load {args.premodel_session} {args.iteration}')
        else:
            nsml.load(args.iteration)
            logger.info(f'Load {args.iteration}')
    logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        # learning rate scheduler

        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      optimizer, device, train_begin,
                                      args.workers, 10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        print("~~~~~~~~~~~~")

        if epoch == 10 or (epoch > 48 and epoch % 10 == 9):
            valid_queue = queue.Queue(args.workers * 2)
            valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                          args.batch_size, 0)
            valid_loader.start()

            eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                           device, args.max_len,
                                           args.batch_size)
            logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                        (epoch, eval_loss, eval_cer))

            valid_loader.join()

            nsml.report(False,
                        step=epoch,
                        train_epoch__loss=train_loss,
                        train_epoch__cer=train_cer,
                        eval__loss=eval_loss,
                        eval__cer=eval_cer)

            best_model = (eval_loss < best_loss)
            nsml.save(args.save_name)

            if best_model:
                nsml.save('best')
                best_loss = eval_loss
Example #18
def train_net(args):
    # Set random seeds so runs are reproducible
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint

    start_epoch = 0
    writer = SummaryWriter()

    if checkpoint is None:
        # model
        encoder = Encoder(Config.vocab_size, args.n_layers_enc, args.n_head,
                          args.d_k, args.d_v, args.d_model, args.d_inner,
                          dropout=args.dropout, pe_maxlen=args.pe_maxlen)

        decoder = Decoder(Config.sos_id, Config.eos_id, Config.vocab_size,
                          args.d_word_vec, args.n_layers_dec, args.n_head,
                          args.d_k, args.d_v, args.d_model, args.d_inner,
                          dropout=args.dropout,
                          tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
                          pe_maxlen=args.pe_maxlen)

        model = Transformer(encoder, decoder)

        # optimizer
        optimizer = TransformerOptimizer(
            torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09))

    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    # Move to GPU, if available
    model = model.to(Config.device)

    # Custom dataloaders: collate_fn is specified because the batches need padding
    train_dataset = TranslateDataset()

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, collate_fn=pad_collate,
                                               shuffle=True, num_workers=args.num_workers)

    # Epochs
    Loss_list = []
    for epoch in range(start_epoch, args.epochs):
        # One epoch's training
        train_loss = train(train_loader=train_loader,
                           model=model,
                           optimizer=optimizer,
                           epoch=epoch,
                           logger=logger,
                           writer=writer)

        l = str(train_loss)
        Loss_list.append(l)

        l_temp = l + '\n'
        with open('loss_epoch.txt', 'a+') as f:
            f.write(l_temp)

        writer.add_scalar('epoch/train_loss', train_loss, epoch)
        writer.add_scalar('epoch/learning_rate', optimizer.lr, epoch)

        print('\nLearning rate: {}'.format(optimizer.lr))
        print('Step num: {}\n'.format(optimizer.step_num))

        # Save checkpoint
        save_checkpoint(epoch, model, optimizer, train_loss)
    with open('loss.txt', 'w') as f:
        f.write('\n'.join(Loss_list))
Example #19
class TransformerModel(NMTModel):
    """
    A standard Encoder-Decoder architecture. Base for this and many
    other models.
    """
    def __init__(self,
                 *args,
                 embedding_rank=None,
                 inner_rank=None,
                 ffward_rank=None,
                 **kwargs):
        # Call the constructor of NMTModel's base class, skipping NMTModel.__init__ itself
        super(NMTModel, self).__init__()
        self.vocab = pickle.load(open(paths.vocab, 'rb'))

        if embedding_rank is None:
            embedding_rank = transformer_config.embedding_rank
        if inner_rank is None:
            inner_rank = transformer_config.inner_rank
        if ffward_rank is None:
            ffward_rank = transformer_config.ffward_rank
        print(transformer_config.embedding_factorization,
              transformer_config.inner_factorization,
              transformer_config.ffward_factorization)
        print(embedding_rank, inner_rank, ffward_rank)
        self.encoder = Encoder(len(self.vocab.src), embedding_rank, inner_rank,
                               ffward_rank)
        self.decoder = Decoder(len(self.vocab.tgt), embedding_rank, inner_rank,
                               ffward_rank)

        self.gpu = False
        self.initialize()

        self.optimizer = NoamOpt(transformer_config.layer_dimension,
                                 train_config.lr,
                                 4000,
                                 Adam(
                                     self.parameters(),
                                     lr=0,
                                     betas=(0.9, 0.98),
                                     eps=1e-9,
                                 ),
                                 beginning_step=0)

        self.num_accumulations = 0
        self.accumulate = max(1, train_config.accumulate)

    def reset_optimizer(self):
        self.optimizer = NoamOpt(
            transformer_config.layer_dimension,
            1,
            4000,
            Adam(
                self.parameters(),
                lr=0,
                betas=(0.9, 0.98),
                eps=1e-9,
            ),
        )

    def __call__(self, src, tgt, update_params=True):
        "Take in and process masked src and target sequences."
        src_encoding, src_mask = self.encode(src)
        loss, norm = self.decode(
            src_encoding,
            src_mask,
            tgt,
        )

        if update_params:
            self.step(loss)
        if self.gpu:
            loss = loss.cpu()
        return loss.detach().numpy() * norm

    def encode(self, src):
        src_encodings = self.prepare_sents(src, 'src')
        return self.encoder(src_encodings)

    def decode(self, src_encoding, src_mask, tgt):
        tgt_enc = self.prepare_sents(tgt, 'tgt')
        return self.decoder(
            src_encoding,
            src_mask,
            tgt_enc,
        )

    def initialize(self):
        # Initialize parameters with Glorot
        # TODO: Make sure this works correctly
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def update_lr(self, *args, **kwargs):
        """
        Override update_lr, which other models need, because the Transformer is very
        sensitive to hyperparameters and manages its own learning-rate decay.
        """
        pass

    @staticmethod
    def load(model_path: str):
        dict_path = model_path + ".dict.pt"
        model = TransformerModel()
        print("Loading whole model")
        load_partial_state_dict(model, torch.load(dict_path))
        return model

    def load_params(self, model_path, no_opt=False):
        dict_path = model_path + ".dict.pt"
        self.load_state_dict(torch.load(dict_path))
        if not no_opt:
            opt_path = model_path + ".opt.pt"
            self.optimizer.load_state_dict(torch.load(opt_path))

    def save(self, path: str, no_opt=False):
        dict_path = path + ".dict.pt"
        torch.save(self.state_dict(), dict_path)
        if not no_opt:
            opt_path = path + ".opt.pt"
            torch.save(self.optimizer.state_dict(), opt_path)

    def beam_search(self, src, max_step=100, replace=False, start_symbol=1):

        if decoder_config.greedy_search:

            batch_size = len(src)
            stop = 2

            inferred = [None for _ in range(batch_size)]
            memory, src_mask = self.encode(src)
            pred_sents = torch.ones(batch_size, 1,
                                    dtype=torch.long).fill_(start_symbol)
            scores = np.zeros((batch_size, ))

            if self.gpu:
                pred_sents = pred_sents.cuda()

            for i in range(1, max_step + 1):
                out = self.decoder.get_word_scores(memory, src_mask,
                                                   Variable(pred_sents))

                next_scores, next_words = torch.max(out, dim=1)
                pred_sents = torch.cat(
                    [pred_sents, next_words.unsqueeze(1)], dim=1)

                stopped_sentences = np.where(
                    next_words.detach().cpu().numpy() == stop)[0]
                ongoing_sentences = np.where(
                    next_words.detach().cpu().numpy() != stop)[0]

                place_in_inferred(inferred, pred_sents, scores, next_scores,
                                  stopped_sentences)
                pred_sents = pred_sents[ongoing_sentences]
                memory = memory[ongoing_sentences]
                src_mask = src_mask[ongoing_sentences]

                if len(ongoing_sentences) == 0:
                    break

            place_in_inferred(inferred, pred_sents, scores, next_scores,
                              np.arange(len(ongoing_sentences)))
            return [[convert_hypothesis(inferred[i], self.vocab, scores[i])]
                    for i in range(batch_size)]

        else:  # beam search

            try:
                memory, src_mask = self.encode(src)
            except IndexError:
                print(src)

            batch_size = len(src)
            beam_size = decoder_config.beam_size
            beam_batch = BeamBatch(batch_size, beam_size, memory, src_mask,
                                   self.gpu)

            for i in range(1, max_step + 1):
                sizes = beam_batch.get_sizes()
                memory, src_mask = beam_batch.expand_memory_and_mask()
                pred_sents = beam_batch.open_hyps_tensor()

                out = self.decoder.get_word_scores(memory, src_mask,
                                                   Variable(pred_sents))
                out = out.detach().cpu().numpy()

                next_words = np.argpartition(-out, beam_size - 1,
                                             axis=-1)[:, :beam_size]
                next_words = torch.LongTensor(next_words).cuda()
                next_words = next_words.view(sum(sizes) * beam_size, 1)
                pred_sents = pred_sents.repeat(beam_size, 1).reshape(
                    beam_size, sum(sizes),
                    -1).transpose(1, 0).reshape(beam_size * sum(sizes), -1)
                pred_sents = torch.cat([pred_sents, next_words], dim=1)

                next_scores = -np.partition(
                    -out, beam_size - 1)[:, :beam_size].flatten()
                old_scores = np.array(beam_batch.get_open_scores())
                old_scores = np.repeat(old_scores, beam_size)
                next_scores = score_update(old_scores, next_scores, i)
                beam_batch.update(sizes, next_scores, pred_sents)

                if beam_batch.is_closed():
                    break

            # print([2 in beam_batch.best_results()[i][0] for i in range(batch_size)])
            # print([len(beam_batch.best_results()[i][0]) for i in range(batch_size)])
            return [[
                convert_hypothesis(beam_batch.best_results()[i][0], self.vocab,
                                   beam_batch.best_results()[i][1])
            ] for i in range(batch_size)]
Example #20
    def __init__(self, params: dict):
        """
        Instantiate the ``Transformer`` class.

        :param params: Dict containing the set of parameters for the entire model\
         (e.g ``EncoderLayer``, ``DecoderLayer`` etc.) broken down in relevant sections, e.g.:

            params = {
                'd_model': 512,
                'src_vocab_size': 27000,
                'tgt_vocab_size': 27000,

                'N': 6,
                'dropout': 0.1,

                'attention': {'n_head': 8,
                              'd_k': 64,
                              'd_v': 64,
                              'dropout': 0.1},

                'feed-forward': {'d_ff': 2048,
                                 'dropout': 0.1},
            }

        """
        # call base constructor
        super(Transformer, self).__init__()

        # Save params for Checkpoint
        self._params = params

        # instantiate Encoder layer
        enc_layer = EncoderLayer(
            size=params['d_model'],
            self_attention=MultiHeadAttention(
                n_head=params['attention']['n_head'],
                d_model=params['d_model'],
                d_k=params['attention']['d_k'],
                d_v=params['attention']['d_v'],
                dropout=params['attention']['dropout']),
            feed_forward=PositionwiseFeedForward(
                d_model=params['d_model'],
                d_ff=params['feed-forward']['d_ff'],
                dropout=params['feed-forward']['dropout']),
            dropout=params['dropout'])

        # instantiate Encoder
        self.encoder = Encoder(layer=enc_layer, n_layers=params['N'])

        # instantiate Decoder layer
        decoder_layer = DecoderLayer(
            size=params['d_model'],
            self_attn=MultiHeadAttention(
                n_head=params['attention']['n_head'],
                d_model=params['d_model'],
                d_k=params['attention']['d_k'],
                d_v=params['attention']['d_v'],
                dropout=params['attention']['dropout']),
            memory_attn=MultiHeadAttention(
                n_head=params['attention']['n_head'],
                d_model=params['d_model'],
                d_k=params['attention']['d_k'],
                d_v=params['attention']['d_v'],
                dropout=params['attention']['dropout']),
            feed_forward=PositionwiseFeedForward(
                d_model=params['d_model'],
                d_ff=params['feed-forward']['d_ff'],
                dropout=params['feed-forward']['dropout']),
            dropout=params['dropout'])

        # instantiate Decoder
        self.decoder = Decoder(layer=decoder_layer, N=params['N'])

        pos_encoding = PositionalEncoding(d_model=params['d_model'],
                                          dropout=params['dropout'])

        self.src_embeddings = nn.Sequential(
            Embeddings(d_model=params['d_model'],
                       vocab_size=params['src_vocab_size']), pos_encoding)

        self.trg_embeddings = nn.Sequential(
            Embeddings(d_model=params['d_model'],
                       vocab_size=params['tgt_vocab_size']), pos_encoding)

        self.classifier = OutputClassifier(d_model=params['d_model'],
                                           vocab=params['tgt_vocab_size'])

        # Initialize parameters with Glorot / fan_avg.
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
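
Instantiation is then just a matter of passing a dictionary of this shape; the values below simply restate the docstring example above.

params = {
    'd_model': 512,
    'src_vocab_size': 27000,
    'tgt_vocab_size': 27000,
    'N': 6,
    'dropout': 0.1,
    'attention': {'n_head': 8, 'd_k': 64, 'd_v': 64, 'dropout': 0.1},
    'feed-forward': {'d_ff': 2048, 'dropout': 0.1},
}
model = Transformer(params)
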
Example #21
    encoder = Encoder(args.d_input * args.LFR_m,
                      args.n_layers_enc,
                      args.n_head,
                      args.d_k,
                      args.d_v,
                      args.d_model,
                      args.d_inner,
                      dropout=args.dropout,
                      pe_maxlen=args.pe_maxlen)
    decoder = Decoder(
        sos_id,
        eos_id,
        vocab_size,
        args.d_word_vec,
        args.n_layers_dec,
        args.n_head,
        args.d_k,
        args.d_v,
        args.d_model,
        args.d_inner,
        dropout=args.dropout,
        tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
        pe_maxlen=args.pe_maxlen)
    model = Transformer(encoder, decoder)

    optimizer = TransformerOptimizer(
        torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        args.k, args.d_model, args.warmup_steps)

    print(args.k)
    print(args.d_model)
    print(args.warmup_steps)
Example #22
    def __init__(self,
                 n_src_vocab,
                 n_tgt_vocab,
                 len_max_seq_enc,
                 len_max_seq_dec,
                 d_word_vec=512,
                 d_model=512,
                 d_inner=2048,
                 n_layers=6,
                 n_head=8,
                 d_k=64,
                 d_v=64,
                 dropout=0.1,
                 tgt_emb_prj_weight_sharing=True,
                 emb_src_tgt_weight_sharing=True,
                 pretrained_embeddings=None):

        super().__init__()

        self.encoder = Encoder(n_src_vocab=n_src_vocab,
                               len_max_seq=len_max_seq_enc,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_inner=d_inner,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_k=d_k,
                               d_v=d_v,
                               dropout=dropout,
                               pretrained_embeddings=pretrained_embeddings)

        self.decoder = Decoder(n_tgt_vocab=n_tgt_vocab,
                               len_max_seq=len_max_seq_dec,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_inner=d_inner,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_k=d_k,
                               d_v=d_v,
                               dropout=dropout,
                               pretrained_embeddings=pretrained_embeddings)

        self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
        nn.init.xavier_normal_(self.tgt_word_prj.weight)

        assert d_model == d_word_vec, \
            'To facilitate the residual connections, \
             the dimensions of all module outputs shall be the same.'

        if tgt_emb_prj_weight_sharing:
            # Share the weight matrix between target word embedding & the final logit dense layer
            self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
            self.x_logit_scale = (d_model**-0.5)
        else:
            self.x_logit_scale = 1.

        if emb_src_tgt_weight_sharing:
            # Share the weight matrix between source & target word embeddings
            assert n_src_vocab == n_tgt_vocab, \
                "To share word embedding table, the vocabulary size of src/tgt shall be the same."
            self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight
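
The x_logit_scale factor stored above is conventionally applied when the shared projection produces logits. The forward pass of this class is not part of the example, so the method below is only a sketch of that usual pattern.

    # Sketch only (not in the original example): the usual projection step in forward().
    def project(self, dec_output):
        # dec_output: (batch, seq_len, d_model) -> logits: (batch, seq_len, n_tgt_vocab)
        return self.tgt_word_prj(dec_output) * self.x_logit_scale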