Example #1
    def __init__(self, opt):
        super(ITModel, self).__init__()
        self.vocab_size = opt.vocab_size
        self.input_encoding_size = opt.input_encoding_size
        self.rnn_type = opt.rnn_type
        self.rnn_size = opt.rnn_size
        self.num_layers = opt.num_layers
        self.drop_prob_lm = opt.drop_prob_lm
        self.seq_length = opt.seq_length
        self.fc_feat_size = opt.fc_feat_size
        self.att_feat_size = opt.att_feat_size
        self.att_size = opt.att_size
        self.batch_size = opt.batch_size * opt.seq_per_img
        self.rnn_atten = opt.rnn_atten
        self.num_parallels = opt.num_parallels
        self.sample_rate = opt.sample_rate
        self.use_linear = opt.use_linear
        self.rnn_size_list = opt.rnn_size_list

        # LSTM
        self.core = rnn_utils.get_lstm(opt)

        # self.vocab_size + 1 -> self.input_encoding_size
        self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

        if self.use_linear:
            # (batch_size * fc_feat_size) -> (batch_size * rnn_size)
            self.img_embed = nn.Linear(self.fc_feat_size, self.rnn_size)
            self.att_embed = nn.Linear(self.att_feat_size, self.rnn_size)

            # self.relu = nn.RReLU(inplace=True)
            self.relu = nn.ReLU()
            self.init_weight()
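
All of these constructors pull their hyper-parameters from a single opt object (typically parsed command-line options). Below is a minimal sketch of such an object for the snippet above; only the field names are copied from the attributes read in __init__, the values are placeholders, and ITModel / rnn_utils.get_lstm are assumed to come from the surrounding repository.

from argparse import Namespace

# Placeholder values; only the field names are taken from the __init__ above.
opt = Namespace(
    vocab_size=9487, input_encoding_size=512, rnn_type='lstm', rnn_size=512,
    num_layers=1, drop_prob_lm=0.5, seq_length=16, fc_feat_size=2048,
    att_feat_size=2048, att_size=196, batch_size=16, seq_per_img=5,
    rnn_atten='NONE', num_parallels=1, sample_rate=0.0, use_linear=True,
    rnn_size_list=[512])

# model = ITModel(opt)   # requires the repository's ITModel and rnn_utils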
Example #2
    def __init__(self, opt):
        super(ShowTellModel, self).__init__()

        self.vocab_size = opt.vocab_size
        self.input_encoding_size = opt.input_encoding_size
        self.rnn_type = opt.rnn_type
        self.rnn_size = opt.rnn_size
        self.num_layers = opt.num_layers
        self.drop_prob_lm = opt.drop_prob_lm
        self.seq_length = opt.seq_length
        self.fc_feat_size = opt.fc_feat_size
        self.att_feat_size = opt.att_feat_size
        self.att_size = opt.att_size
        self.use_linear = opt.use_linear
        self.gram_num = opt.gram_num

        # LSTM
        self.core = rnn_utils.get_lstm(opt)

        # self.vocab_size + 1 -> self.input_encoding_size
        if self.gram_num > 0:
            self.embed = nn.Sequential(
                nn.Embedding(self.vocab_size + 1, self.input_encoding_size),
                Embed.WordEmbed(self.gram_num))
        else:
            self.embed = nn.Embedding(self.vocab_size + 1,
                                      self.input_encoding_size)

        if self.use_linear:
            self.img_embed = nn.Linear(self.fc_feat_size, self.rnn_size)

            # self.relu = nn.RReLU(inplace=True)
            self.relu = nn.ReLU()

            self.init_weight()
Example #3
    def __init__(self, opt):
        super(TopDownAttenModel, self).__init__()
        self.vocab_size = opt.vocab_size
        self.input_encoding_size = opt.input_encoding_size
        self.rnn_type = opt.rnn_type
        self.rnn_size = opt.rnn_size
        self.num_layers = opt.num_layers
        self.drop_prob_lm = opt.drop_prob_lm
        self.seq_length = opt.seq_length
        self.fc_feat_size = opt.fc_feat_size

        self.bu_feat_size = opt.bu_feat_size
        self.bu_size = opt.bu_size

        # LSTM
        self.core = rnn_utils.get_lstm(opt)

        # self.vocab_size + 1 -> self.input_encoding_size
        self.embed = nn.Embedding(self.vocab_size + 1,
                                  self.input_encoding_size)

        self.img_embed = nn.Linear(self.fc_feat_size, self.rnn_size)
        self.att_embed = nn.Linear(self.bu_feat_size, self.rnn_size)

        # self.relu = nn.RReLU(inplace=True)
        self.relu = nn.ReLU()
        self.init_weight()
Example #4
    def __init__(self, opt):
        super(MoreSupWeightModel, self).__init__()
        self.vocab_size = opt.vocab_size
        self.input_encoding_size = opt.input_encoding_size
        self.rnn_type = opt.rnn_type
        self.rnn_size = opt.rnn_size
        self.num_layers = opt.num_layers
        self.drop_prob_lm = opt.drop_prob_lm
        self.seq_length = opt.seq_length
        self.fc_feat_size = opt.fc_feat_size
        self.att_feat_size = opt.att_feat_size
        self.att_size = opt.att_size
        self.batch_size = opt.batch_size * opt.seq_per_img
        self.rnn_atten = opt.rnn_atten
        self.num_parallels = opt.num_parallels
        self.sample_rate = opt.sample_rate
        self.use_linear = opt.use_linear
        self.rnn_size_list = opt.rnn_size_list
        self.gram_num = opt.gram_num
        self.logprob_pool_type = opt.logprob_pool_type  # 0: mean, 1: max

        # reviewnet
        self.use_reviewnet = opt.use_reviewnet
        if self.use_reviewnet == 1:
            self.review_length = opt.review_length
            self.review_nets = nn.ModuleList()
            for _ in range(self.review_length):
                # An empty ModuleList cannot be indexed into; append each review step.
                self.review_nets.append(
                    LSTM.LSTM_SOFT_ATT_NOX(self.rnn_size, self.att_size,
                                           self.drop_prob_lm))
            opt.att_size = self.review_length

        # LSTM
        # opt.input_encoding_size = opt.input_encoding_size * 2
        self.core = rnn_utils.get_lstm(opt)

        if self.rnn_atten == "ATT_LSTM":
            self.atten = LSTM.LSTM_ATTEN_LAYER(self.rnn_size)

        # self.vocab_size + 1 -> self.input_encoding_size
        if self.gram_num > 0:
            self.embed = nn.Sequential(nn.Embedding(self.vocab_size + 1, self.input_encoding_size),
                                       Embed.WordEmbed(self.gram_num))
            # self.embed_tc = nn.Sequential(nn.Embedding(self.vocab_size + 1, self.input_encoding_size),
            #                            Embed.WordEmbed(self.gram_num))
            # self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)
            # self.embed_tc = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)
        else:
            self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

        self.embed_tc = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

        if self.use_linear:
            # (batch_size * fc_feat_size) -> (batch_size * input_encoding_size)
            self.img_embed = nn.Linear(self.fc_feat_size, self.input_encoding_size)
            self.att_embed = nn.Linear(self.att_feat_size, self.input_encoding_size)

            # self.relu = nn.RReLU(inplace=True)
            self.relu = nn.ReLU()
            self.init_weight()
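
The logprob_pool_type flag read above selects between mean and max pooling of log-probabilities (the inline comment says 0: mean, 1: max). Which dimension is pooled is repository-specific; the stand-alone sketch below only illustrates the difference between the two modes on a dummy tensor.

import torch

logprobs = torch.randn(4, 3, 100)      # dummy (batch, group, vocab) scores
logprob_pool_type = 0                  # 0: mean, 1: max, as in the snippet

if logprob_pool_type == 0:
    pooled = logprobs.mean(dim=1)      # average over the pooled dimension
else:
    pooled = logprobs.max(dim=1)[0]    # element-wise maximum instead

print(pooled.shape)                    # torch.Size([4, 100])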
Example #5
    def __init__(self, opt):
        super(DoubleAttenMModel, self).__init__()
        self.vocab_size = opt.vocab_size
        self.input_encoding_size = opt.input_encoding_size
        self.rnn_type = opt.rnn_type
        self.rnn_size = opt.rnn_size
        self.num_layers = opt.num_layers
        self.drop_prob_lm = opt.drop_prob_lm
        self.seq_length = opt.seq_length
        self.fc_feat_size = opt.fc_feat_size
        self.att_feat_size = opt.att_feat_size
        self.att_size = opt.att_size
        self.batch_size = opt.batch_size * opt.seq_per_img
        self.rnn_atten = opt.rnn_atten
        self.num_parallels = opt.num_parallels
        self.sample_rate = opt.sample_rate
        self.use_linear = opt.use_linear
        self.rnn_size_list = opt.rnn_size_list

        # reviewnet
        self.use_reviewnet = opt.use_reviewnet
        if self.use_reviewnet == 1:
            self.review_length = opt.review_length
            self.review_nets = nn.ModuleList()
            for _ in range(self.review_length):
                # An empty ModuleList cannot be indexed into; append each review step.
                self.review_nets.append(LSTM.LSTM_SOFT_ATT_NOX(
                    self.rnn_size, self.att_size, self.drop_prob_lm))
            opt.att_size = self.review_length

        # LSTM
        self.core = rnn_utils.get_lstm(opt)

        if self.rnn_atten == "ATT_LSTM":
            self.atten = LSTM.LSTM_ATTEN_LAYER(self.rnn_size)

        # self.vocab_size + 1 -> self.input_encoding_size
        self.embed = nn.Embedding(self.vocab_size + 1,
                                  self.input_encoding_size)
        # self.embed_tc = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

        if self.use_linear:
            # (batch_size * fc_feat_size) -> (batch_size * rnn_size)
            self.img_embed = nn.Linear(self.fc_feat_size, self.rnn_size)
            self.att_embed = nn.Linear(self.att_feat_size, self.rnn_size)

            # self.relu = nn.RReLU(inplace=True)
            self.relu = nn.ReLU()
            self.init_weight()
Example #6
    def __init__(self, opt):
        super(MoreAttenModel, self).__init__()
        self.vocab_size = opt.vocab_size
        self.input_encoding_size = opt.input_encoding_size
        self.num_layers = opt.num_layers
        self.seq_length = opt.seq_length
        self.rnn_size = opt.rnn_size
        self.batch_size = opt.batch_size * opt.seq_per_img
        self.sample_rate = opt.sample_rate
        self.att_size = opt.att_size
        self.att_feat_size = opt.att_feat_size

        # LSTM
        self.core = rnn_utils.get_lstm(opt)

        self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

        self.att_embed = nn.Linear(self.att_feat_size, self.input_encoding_size)

        self.relu = nn.ReLU()
Example #7
    def __init__(self, opt):
        super(ShowAttenTellPhraseBuModel, self).__init__()

        self.vocab_size = opt.vocab_size
        self.input_encoding_size = opt.input_encoding_size
        self.rnn_type = opt.rnn_type
        self.rnn_size = opt.rnn_size
        self.num_layers = opt.num_layers
        self.drop_prob_lm = opt.drop_prob_lm
        self.seq_length = opt.seq_length
        self.fc_feat_size = opt.fc_feat_size
        self.att_feat_size = opt.att_feat_size
        self.att_size = opt.att_size
        self.use_linear = opt.use_linear
        self.word_gram_num = opt.word_gram_num
        self.phrase_gram_num = opt.phrase_gram_num
        self.conv_gram_num = opt.conv_gram_num
        self.context_len = opt.context_len
        self.use_prob_weight = opt.use_prob_weight
        self.phrase_type = opt.phrase_type
        self.mil_type = opt.mil_type
        self.use_gated_layer = getattr(opt, 'use_gated_layer', 0)

        self.sample_rate = getattr(opt, 'sample_rate', 0)

        self.word_embedding_type = getattr(opt, 'word_embedding_type', 0)

        self.bu_size = getattr(opt, 'bu_size', opt.att_size)
        self.bu_feat_size = getattr(opt, 'bu_feat_size', opt.att_feat_size)

        self.use_bilinear = getattr(opt, 'use_bilinear', False)
        self.bilinear_output = getattr(opt, 'bilinear_output', 1000)

        self.relu_type = getattr(opt, 'relu_type', 0)

        # LSTM
        self.core = rnn_utils.get_lstm(opt)

        # self.vocab_size + 1 -> self.input_encoding_size
        if self.word_embedding_type == 1:
            self.embed = Embed.EmbeddingWithBias(self.vocab_size + 1,
                                                 self.input_encoding_size)
        else:
            if self.word_gram_num > 0:
                self.embed = nn.Sequential(
                    nn.Embedding(self.vocab_size + 1,
                                 self.input_encoding_size),
                    Embed.WordEmbed(self.word_gram_num))
            else:
                self.embed = nn.Embedding(self.vocab_size + 1,
                                          self.input_encoding_size)

        # phrase embed
        if self.phrase_type == 1:
            self.phraseEmbed = Embed.PhraseEmbed(self.phrase_gram_num,
                                                 self.rnn_size)
        elif self.phrase_type == 2:
            self.phraseEmbed = Embed.ConvEmbed(self.conv_gram_num)
        elif self.phrase_type == 3:
            self.phraseEmbed = Embed.PhraseEmbed(self.phrase_gram_num,
                                                 self.rnn_size)
            self.phraseEmbed1 = Embed.ConvEmbed(self.conv_gram_num)

        # word weight linear
        # input_encoding_size
        if self.use_prob_weight:
            self.prob_weight_layer = nn.Sequential(
                nn.Linear(self.fc_feat_size, self.vocab_size + 1),
                nn.Softmax(dim=-1))

        if self.use_linear:
            self.img_embed = nn.Linear(self.fc_feat_size, self.rnn_size)
            self.att_embed = nn.Linear(self.att_feat_size, self.rnn_size)
            self.bu_embed = nn.Linear(self.bu_feat_size, self.rnn_size)

            # self.relu = nn.RReLU(inplace=True)

            if self.relu_type == 0:
                if self.use_gated_layer == 1:
                    self.relu = GatedLayer.GatedTanh(self.input_encoding_size)
                else:
                    self.relu = nn.PReLU()
            elif self.relu_type == 1:
                self.img_relu = nn.PReLU()
                self.att_relu = nn.PReLU()
                self.bu_relu = nn.PReLU()

            self.init_weight()

        if self.use_bilinear:
            self.bilinear_layer = CompactBilinearPooling(
                self.rnn_size, self.rnn_size, self.bilinear_output)
            self.bilinear_layer1 = nn.Linear(self.bilinear_output,
                                             self.rnn_size)
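
This constructor (and the one that follows) reads several optional settings with getattr(opt, name, default), so older option objects that predate the newer flags still work. A small stand-alone illustration of that fallback pattern; the flag names are copied from the snippet and the values are placeholders.

from argparse import Namespace

# A hypothetical, older-style opt that predates the newer flags.
opt = Namespace(att_size=196, att_feat_size=2048)

# getattr falls back to the given default when the attribute is missing.
use_gated_layer = getattr(opt, 'use_gated_layer', 0)             # -> 0
bu_size = getattr(opt, 'bu_size', opt.att_size)                  # -> 196
bu_feat_size = getattr(opt, 'bu_feat_size', opt.att_feat_size)   # -> 2048

print(use_gated_layer, bu_size, bu_feat_size)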
Example #8
    def __init__(self, opt):
        super(ShowAttenTellPhraseModel, self).__init__()
        self.vocab_size = opt.vocab_size
        self.input_encoding_size = opt.input_encoding_size
        self.rnn_type = opt.rnn_type
        self.rnn_size = opt.rnn_size
        self.num_layers = opt.num_layers
        self.drop_prob_lm = opt.drop_prob_lm
        self.seq_length = opt.seq_length
        self.fc_feat_size = opt.fc_feat_size
        self.att_feat_size = opt.att_feat_size
        self.att_size = opt.att_size
        self.use_linear = opt.use_linear
        self.word_gram_num = opt.word_gram_num
        self.phrase_gram_num = opt.phrase_gram_num
        self.conv_gram_num = opt.conv_gram_num
        self.context_len = opt.context_len
        self.use_prob_weight = opt.use_prob_weight
        self.phrase_type = opt.phrase_type
        self.mil_type = opt.mil_type
        self.use_gated_layer = getattr(opt, 'use_gated_layer', 0)
        self.use_linear_embedding = getattr(opt, 'use_linear_embedding', 0)
        self.relu_type = getattr(opt, 'relu_type', 0)

        # LSTM
        self.core = rnn_utils.get_lstm(opt)

        # self.vocab_size + 1 -> self.input_encoding_size
        if self.use_linear_embedding == 1:
            self.embed = nn.Linear(self.vocab_size + 1,
                                   self.input_encoding_size)
            self.embed.weight.data.normal_(0, 1)
        else:
            if self.word_gram_num > 0:
                self.embed = nn.Sequential(
                    nn.Embedding(self.vocab_size + 1,
                                 self.input_encoding_size),
                    Embed.WordEmbed(self.word_gram_num))
            else:
                self.embed = nn.Embedding(self.vocab_size + 1,
                                          self.input_encoding_size)

        # phrase embed
        if self.phrase_type == 1:
            self.phraseEmbed = Embed.PhraseEmbed(self.phrase_gram_num,
                                                 self.rnn_size)
        elif self.phrase_type == 2:
            self.phraseEmbed = Embed.ConvEmbed(self.conv_gram_num)
        elif self.phrase_type == 3:
            self.phraseEmbed = Embed.PhraseEmbed(self.phrase_gram_num,
                                                 self.rnn_size)
            self.phraseEmbed1 = Embed.ConvEmbed(self.conv_gram_num)

        # word weight linear
        # input_encoding_size
        if self.use_prob_weight:
            self.prob_weight_layer = nn.Sequential(
                nn.Linear(self.fc_feat_size, self.vocab_size + 1),
                nn.Sigmoid())

        if self.use_linear:
            self.img_embed = nn.Linear(self.fc_feat_size, self.rnn_size)
            self.att_embed = nn.Linear(self.att_feat_size, self.rnn_size)

            # self.relu = nn.RReLU(inplace=True)

            if self.relu_type == 0:
                if self.use_gated_layer == 1:
                    self.relu = GatedLayer.GatedTanh(self.input_encoding_size)
                else:
                    self.relu = nn.PReLU()
            elif self.relu_type == 1:
                self.img_relu = nn.PReLU()
                self.att_relu = nn.PReLU()

            self.init_weight()
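
In the last constructor, the use_linear_embedding == 1 branch replaces nn.Embedding with an nn.Linear whose input dimension is vocab_size + 1, which only makes sense if words arrive as one-hot (or soft) vocabulary vectors rather than integer indices. Below is a small, self-contained check of that equivalence; it is not repository code, and bias is disabled here to make the lookup exact.

import torch
import torch.nn as nn
import torch.nn.functional as F

vocab_size, encoding_size = 10, 4
linear_embed = nn.Linear(vocab_size + 1, encoding_size, bias=False)

words = torch.tensor([3, 7, 0])                                # word indices
one_hot = F.one_hot(words, num_classes=vocab_size + 1).float()

# A Linear applied to one-hot rows selects columns of its weight matrix,
# i.e. the same lookup an nn.Embedding with the transposed weight would do.
assert torch.allclose(linear_embed(one_hot), linear_embed.weight.t()[words])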