    def __init__(self, vocab_size, h_sizes, params):
        super(Seq2Seq, self).__init__()

        e_size, max_len, n_layers, dropout_p, model_name, use_attention = [params[i] for i in [C.EMBEDDING_DIM, C.OUTPUT_MAX_LEN, C.H_LAYERS, C.DROPOUT, C.MODEL_NAME, C.USE_ATTENTION]]
        r_hsize, q_hsize, a_hsize = h_sizes

        self.use_attention = use_attention
        self.model_name = model_name
        self.decoder = DecoderRNN(vocab_size=vocab_size, max_len=max_len, embedding_size=e_size, hidden_size=a_hsize,
                            n_layers=n_layers, dropout_p=dropout_p,
                            sos_id=C.SOS_INDEX, eos_id=C.EOS_INDEX, model_name=model_name, use_attention=self.use_attention)

        if model_name == C.LM_ANSWERS:
            self.question_encoder = None
        else:
            self.question_encoder = EncoderRNN(vocab_size=vocab_size, max_len=max_len, embedding_size=e_size,
                        hidden_size=q_hsize, n_layers=n_layers, dropout_p=dropout_p)
            self.decoder.embedding.weight = self.question_encoder.embedding.weight

        if model_name == C.LM_QUESTION_ANSWERS_REVIEWS:
            self.reviews_encoder = EncoderRNN(vocab_size=vocab_size, max_len=max_len, embedding_size=e_size,
                        hidden_size=r_hsize, n_layers=n_layers, dropout_p=dropout_p)
            self.decoder.embedding.weight = self.reviews_encoder.embedding.weight
        else:
            self.reviews_encoder = None


        if self.model_name == C.LM_QUESTION_ANSWERS:
            assert q_hsize == a_hsize
        if self.model_name == C.LM_QUESTION_ANSWERS_REVIEWS:
            # TODO Fix this workaround
            if self.use_attention:
                assert a_hsize == q_hsize == r_hsize
            else:
                assert a_hsize == q_hsize + r_hsize
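
A plausible reading of the final assertion: without attention, the decoder is presumably initialized from the concatenation of the question- and review-encoder states, so the decoder hidden size must equal the sum of the two encoder sizes. A minimal sketch of that constraint, using hypothetical sizes rather than the values carried in h_sizes:

import torch

# Hypothetical sizes for illustration only; the real ones come from h_sizes.
batch, q_hsize, r_hsize = 4, 128, 64
a_hsize = q_hsize + r_hsize                  # required when encoder states are concatenated

q_hidden = torch.randn(1, batch, q_hsize)    # final question-encoder state
r_hidden = torch.randn(1, batch, r_hsize)    # final reviews-encoder state

# Concatenating along the feature dimension yields a state of size q_hsize + r_hsize,
# which is why a_hsize must equal that sum in the no-attention case.
decoder_init = torch.cat([q_hidden, r_hidden], dim=2)
assert decoder_init.size(2) == a_hsize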
Example no. 3
from utils.loader import MultiLoader, BaseDataLoader
from utils.trainer import train

if __name__ == '__main__':
    h_params = HyperParams()

    random.seed(h_params.seed)
    torch.manual_seed(h_params.seed)
    torch.cuda.manual_seed_all(h_params.seed)
    cuda = not h_params.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if cuda else 'cpu')

    feature_size = 40  # MFCC features are extracted with n_mfcc = 40, so the feature size is 40

    enc = EncoderRNN(feature_size, h_params.hidden_size,
                     input_dropout_p = h_params.dropout, dropout_p = h_params.dropout,
                     n_layers = h_params.encoder_layer_size,
                     bidirectional = h_params.bidirectional, rnn_cell = 'gru', variable_lengths = False)

    dec = DecoderRNN(len(char2index), h_params.max_len, h_params.hidden_size * (2 if h_params.bidirectional else 1),
                     SOS_token, EOS_token,
                     n_layers = h_params.decoder_layer_size, rnn_cell = 'gru', bidirectional = h_params.bidirectional,
                     input_dropout_p = h_params.dropout, dropout_p = h_params.dropout, use_attention = h_params.attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()
    model = nn.DataParallel(model).to(device)  # wrap the model for multi-GPU data parallelism

    # Adam optimizer
    optimizer = optim.Adam(model.module.parameters(), lr=h_params.lr)
    # Compute the loss with CrossEntropyLoss, ignoring padded positions
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)
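
The optimizer and criterion above are the usual pieces of a teacher-forced training step. A minimal, self-contained sketch of how they fit together, with a plain nn.Linear standing in for the seq2seq model and PAD_token assumed to be 0:

import torch
import torch.nn as nn
import torch.optim as optim

PAD_token = 0                     # assumed padding index
vocab_size, hidden = 100, 32

model = nn.Linear(hidden, vocab_size)                  # stand-in for the seq2seq model
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token)

states = torch.randn(8 * 20, hidden)                   # (batch * seq_len, hidden)
targets = torch.randint(0, vocab_size, (8 * 20,))      # flattened target token ids

optimizer.zero_grad()
loss = criterion(model(states), targets)               # PAD positions contribute nothing
loss.backward()
optimizer.step()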
Example no. 4
class SEGModel(nn.Module):
    """Stacked attention sequence-to-sequence model.
    """
    def __init__(self,
                 vocab_size,
                 max_len,
                 hidden_size,
                 vocab_embed_size,
                 sos_id,
                 eos_id,
                 num_layers=1,
                 rnn_cell='LSTM',
                 bidirectional=False,
                 input_dropout_p=0,
                 dropout_p=0,
                 answer_max_len=None,
                 embedding=None):
        """Constructor for VQAModel.

        Args:
            vocab_size: Number of words in the vocabulary.
            max_len: The maximum length of the answers we generate.
            hidden_size: Number of dimensions of RNN hidden cell.
            vocab_embed_size: Number of dimensions of RNN embedding.
            sos_id: Vocab id for <start>.
            eos_id: Vocab id for <end>.
            num_layers: The number of layers of the RNNs.
            rnn_cell: LSTM or RNN or GRU.
            bidirectional: Whether the RNN is bidirectional.
            input_dropout_p: Dropout applied to the input question words.
            dropout_p: Dropout applied internally between RNN steps.
            answer_max_len: Maximum length of generated answers; defaults to max_len.
            embedding: Pretrained embedding weights.
        """
        super(SEGModel, self).__init__()
        self.encoder_cnn = SpatialResnetEncoder(2)
        self.encoder_rnn = EncoderRNN(vocab_size,
                                      max_len,
                                      hidden_size,
                                      input_dropout_p=input_dropout_p,
                                      dropout_p=dropout_p,
                                      n_layers=num_layers,
                                      bidirectional=bidirectional,
                                      rnn_cell=rnn_cell,
                                      vocab_embed_size=vocab_embed_size,
                                      variable_lengths=True,
                                      embedding=embedding)
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.vocab_embed_size = vocab_embed_size
        self.bidirectional_multiplier = 2 if bidirectional else 1
        answer_max_len = answer_max_len if answer_max_len is not None else max_len

    def params_to_train(self):
        params = (list(self.encoder_rnn.parameters()) +
                  list(self.encoder_cnn.fc.parameters()))
        # Don't train the embedding weights.
        params = filter(lambda p: p.requires_grad, params)
        return params

    def flatten_parameters(self):
        self.encoder_rnn.rnn.flatten_parameters()

    def forward(self, images, answers, alengths=None, questions=None):
        """Passes the image and the question through a VQA model and generates answers.

        Args:
            images: Batch of image Variables.
            questions: Batch of question Variables.
            qlengths: List of question lengths.
            answers: Batch of answer Variables.

        Returns:
            - outputs: The output scores for all steps in the RNN.
            - hidden: The hidden states of all the RNNs.
            - ret_dict: A dictionary of attributes. See DecoderRNN.py for details.
        """

        # features is (N * 2048 * 56 * 56)

        input_spatial_dim = images.size()[2:]
        features = self.encoder_cnn.resnet(images)

        # encoder_hidden is ((BIDIRECTIONAL x NUM_LAYERS) * N * HIDDEN_SIZE).
        _, encoder_hidden_ans = self.encoder_rnn(answers, alengths, None)

        if self.encoder_rnn.rnn_cell is nn.LSTM:
            encoder_hidden_ans = encoder_hidden_ans[0]
        encoder_hidden_ans = encoder_hidden_ans.transpose(0, 1).contiguous()

        if self.bidirectional_multiplier == 2:
            encoder_hidden = torch.cat(
                (encoder_hidden_ans[:, 0], encoder_hidden_ans[:, -1]), dim=1)
        else:
            encoder_hidden = encoder_hidden_ans[:, -1]

        if questions is not None:
            qlengths = process_lengths(questions)
            # Reorder the batch by question length (longest first) for packing
            sort_index = sorted(range(len(qlengths)),
                                key=lambda x: qlengths[x].item(),
                                reverse=True)
            questions = questions[sort_index]
            qlengths = np.array(qlengths)[sort_index].tolist()
            _, encoder_hidden_qs = self.encoder_rnn(questions, qlengths, None)
            if self.encoder_rnn.rnn_cell is nn.LSTM:
                encoder_hidden_qs = encoder_hidden_qs[0]
            # Transpose regardless of cell type, mirroring the answer branch above.
            encoder_hidden_qs = encoder_hidden_qs.transpose(0, 1).contiguous()

            if self.bidirectional_multiplier == 2:
                encoder_hidden_qs = torch.cat(
                    (encoder_hidden_qs[:, 0], encoder_hidden_qs[:, -1]), dim=1)
            else:
                encoder_hidden_qs = encoder_hidden_qs[:, -1]

            # Reorder to match answer ordering
            ordering = [sort_index.index(i) for i in range(images.size(0))]
            encoder_hidden_qs = encoder_hidden_qs[ordering]
            encoder_hidden = torch.cat([encoder_hidden, encoder_hidden_qs],
                                       dim=1)

        # Pass the features through the stacked attention network.
        encoder_hidden = encoder_hidden.unsqueeze(2).unsqueeze(2).repeat(
            1, 1, features.size(2), features.size(3))
        features = self.encoder_cnn.fc(features * encoder_hidden)
        result = nn.functional.interpolate(input=features,
                                           size=input_spatial_dim,
                                           mode='bilinear',
                                           align_corners=True)

        return result
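
forward() relies on a common recipe for bidirectional encoders: transpose the hidden state from (num_layers * num_directions, N, hidden) to (N, num_layers * num_directions, hidden), then concatenate the final states of the two directions. A self-contained sketch of that recipe, with a plain nn.GRU standing in for EncoderRNN and hypothetical sizes:

import torch
import torch.nn as nn

batch, seq_len, in_size, hidden_size = 4, 10, 16, 32
rnn = nn.GRU(in_size, hidden_size, num_layers=1, bidirectional=True, batch_first=True)

x = torch.randn(batch, seq_len, in_size)
_, h = rnn(x)                        # h: (num_layers * 2, batch, hidden_size)
h = h.transpose(0, 1).contiguous()   # -> (batch, num_layers * 2, hidden_size)

# Concatenate the two directions' final states, as the model does when bidirectional.
encoder_hidden = torch.cat((h[:, 0], h[:, -1]), dim=1)   # (batch, 2 * hidden_size)
assert encoder_hidden.shape == (batch, 2 * hidden_size)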
Example no. 5
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("USAGE: python test.py <checkpoint_dir>")
        sys.exit()

    checkpoint_dir = sys.argv[1]
    weights = torch.load("data/GloVe_embeddings.pt")
    device = torch.device('cuda:0')
    bi_enable = True
    if bi_enable:
        encoder1 = EncoderRNN_bi(weights, cfg.EMBEDDING_SIZE, cfg.HIDDEN_SIZE,
                                 1).to(device)
        attn_decoder1 = AttnDecoderRNN_bi(weights, 2 * cfg.HIDDEN_SIZE, 200003,
                                          1).to(device)
    else:
        encoder1 = EncoderRNN(weights, cfg.EMBEDDING_SIZE, cfg.HIDDEN_SIZE,
                              2).to(device)
        attn_decoder1 = AttnDecoderRNN(weights, cfg.HIDDEN_SIZE, 200003,
                                       2).to(device)

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    # extractTestSum() only needs to run once, to extract reference summaries from the test set
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
        extractTestSum()
    if not os.path.exists(system_dir):
        os.makedirs(system_dir)

    r = Rouge155()
    r.system_dir = system_dir
    r.model_dir = model_dir
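
The snippet stops right after pointing Rouge155 at the system and model directories. For completeness, a hedged sketch of how a pyrouge evaluation is typically finished from here; the filename patterns below are illustrative assumptions, not values taken from this repository:

# Assumed continuation using pyrouge's Rouge155 wrapper.
r.system_filename_pattern = r'system\.(\d+)\.txt'   # placeholder pattern
r.model_filename_pattern = 'model.#ID#.txt'         # placeholder pattern

output = r.convert_and_evaluate()     # runs the underlying ROUGE-1.5.5 script
scores = r.output_to_dict(output)     # parse the report into a plain dict
print(scores['rouge_1_f_score'], scores['rouge_2_f_score'], scores['rouge_l_f_score'])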
Example no. 6
# Simplified version of the main function
if __name__ == '__main__':
    hparams = HyperParams()  # load the hyperparameters
    cuda = hparams.use_cuda and torch.cuda.is_available()  # CUDA requested and actually available
    device = torch.device('cuda' if cuda else 'cpu')  # use the GPU when available, otherwise the CPU

    # Feature size of the input data
    feat_size = 33

    # Create the encoder (EncoderRNN)
    encoder = EncoderRNN(
        feat_size=feat_size,
        hidden_size=hparams.hidden_size,
        dropout_p=hparams.dropout,
        layer_size=hparams.encoder_layer_size,
        bidirectional=hparams.use_bidirectional,
        rnn_cell='gru')  # use_pyramidal=hparams.use_pyramidal)

    # Create the decoder (DecoderRNN)
    decoder = DecoderRNN(vocab_size=len(char2index),
                         max_len=hparams.max_len,
                         hidden_size=hparams.hidden_size *
                         (2 if hparams.use_bidirectional else 1),
                         sos_id=SOS_token,
                         eos_id=EOS_token,
                         layer_size=hparams.speller_layer_size,
                         rnn_cell='gru',
                         dropout_p=hparams.dropout,
                         use_attention=hparams.use_attention)
Example no. 7
    def __init__(self, vocab_size, h_sizes, params):
        super(Seq2Seq, self).__init__()
        #TEST
        e_size, max_len, n_layers, dropout_p, model_name, use_attention, rnn_cell, bidirectional, use_glove = [
            params[i] for i in [
                C.EMBEDDING_DIM, C.OUTPUT_MAX_LEN, C.H_LAYERS, C.DROPOUT,
                C.MODEL_NAME, C.USE_ATTENTION, C.RNN_CELL, C.BIDIRECTIONAL,
                C.USE_GLOVE
            ]
        ]
        r_hsize, q_hsize, a_hsize = h_sizes

        use_attention = bool(use_attention)
        bidirectional = bool(bidirectional)
        use_glove = bool(use_glove)
        self.use_attention = use_attention
        self.model_name = model_name
        self.decoder = DecoderRNN(vocab_size=vocab_size,
                                  max_len=max_len,
                                  embedding_size=e_size,
                                  hidden_size=a_hsize,
                                  use_glove=use_glove,
                                  n_layers=n_layers,
                                  dropout_p=dropout_p,
                                  bidirectional=bidirectional,
                                  rnn_cell=rnn_cell,
                                  sos_id=C.SOS_INDEX,
                                  eos_id=C.EOS_INDEX,
                                  model_name=model_name,
                                  use_attention=self.use_attention)

        if model_name == C.LM_ANSWERS:
            self.question_encoder = None
        else:
            self.question_encoder = EncoderRNN(vocab_size=vocab_size,
                                               max_len=max_len,
                                               embedding_size=e_size,
                                               use_glove=use_glove,
                                               hidden_size=q_hsize,
                                               n_layers=n_layers,
                                               dropout_p=dropout_p,
                                               bidirectional=bidirectional,
                                               rnn_cell=rnn_cell)
            if not use_glove:
                self.decoder.embedding.weight = self.question_encoder.embedding.weight

        if model_name == C.LM_QUESTION_ANSWERS_REVIEWS:
            self.reviews_encoder = EncoderRNN(vocab_size=vocab_size,
                                              max_len=max_len,
                                              embedding_size=e_size,
                                              use_glove=use_glove,
                                              hidden_size=r_hsize,
                                              n_layers=n_layers,
                                              dropout_p=dropout_p,
                                              bidirectional=bidirectional,
                                              rnn_cell=rnn_cell)
            if not use_glove:
                self.decoder.embedding.weight = self.reviews_encoder.embedding.weight

        else:
            self.reviews_encoder = None

        if self.model_name == C.LM_QUESTION_ANSWERS:
            assert q_hsize == a_hsize
        if self.model_name == C.LM_QUESTION_ANSWERS_REVIEWS:
            # TODO Fix this workaround
            if self.use_attention:
                assert a_hsize == q_hsize == r_hsize
            else:
                assert a_hsize == q_hsize + r_hsize
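
When use_glove is False, the decoder's embedding weight is pointed at the encoder's, so both modules read and update a single shared matrix. A minimal illustration of that tying, with hypothetical sizes and plain nn.Embedding layers standing in for the model's:

import torch.nn as nn

vocab_size, e_size = 1000, 300
encoder_embedding = nn.Embedding(vocab_size, e_size)
decoder_embedding = nn.Embedding(vocab_size, e_size)

# Assigning the Parameter object (not a copy) ties the weights: gradients from
# either module update the same tensor.
decoder_embedding.weight = encoder_embedding.weight
assert decoder_embedding.weight is encoder_embedding.weight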
Example no. 8

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("USAGE: python train.py <checkpoint_dir>")
        sys.exit()

    checkpoint_dir = sys.argv[1]
    weights = torch.load("data/GloVe_embeddings.pt")
    device = torch.device('cpu')
    bi_enable = False
    if bi_enable:
        encoder1 = EncoderRNN_bi(weights, cfg.EMBEDDING_SIZE, cfg.HIDDEN_SIZE, 1, dropout_p=0.1).to(device)
        attn_decoder1 = AttnDecoderRNN_bi(weights, 2 * cfg.HIDDEN_SIZE, 200003, 1, dropout_p=0.1).to(device)
    else:
        encoder1 = EncoderRNN(weights, cfg.EMBEDDING_SIZE, cfg.HIDDEN_SIZE, 2, dropout_p=0.1).to(device)
        attn_decoder1 = AttnDecoderRNN(weights, cfg.HIDDEN_SIZE, 200003, 2, dropout_p=0.1).to(device)

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    # extractTestSum() only needs to run once, to extract reference summaries from the test set
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
        extractTestSum()
    if not os.path.exists(system_dir):
        os.makedirs(system_dir)

    r = Rouge155()
    r.system_dir = system_dir
    r.model_dir = model_dir
    r.system_filename_pattern = system_filename_pattern