def __init__(self, opt):
    """Build the ITModel decoder: copy configuration off *opt* and
    construct the word-embedding, feature-projection and LSTM modules.

    Args:
        opt: options namespace carrying vocabulary / feature / RNN sizes
             (vocab_size, rnn_size, fc_feat_size, att_feat_size, ...).
    """
    super(ITModel, self).__init__()
    self.vocab_size = opt.vocab_size
    self.input_encoding_size = opt.input_encoding_size
    self.rnn_type = opt.rnn_type
    self.rnn_size = opt.rnn_size
    # NOTE: original code assigned num_layers twice; the duplicate is removed.
    self.num_layers = opt.num_layers
    self.drop_prob_lm = opt.drop_prob_lm
    self.seq_length = opt.seq_length
    self.fc_feat_size = opt.fc_feat_size
    self.att_feat_size = opt.att_feat_size
    self.att_size = opt.att_size
    # Effective batch: every image contributes seq_per_img captions.
    self.batch_size = opt.batch_size * opt.seq_per_img
    self.rnn_atten = opt.rnn_atten
    self.num_parallels = opt.num_parallels
    self.sample_rate = opt.sample_rate
    self.use_linear = opt.use_linear
    self.rnn_size_list = opt.rnn_size_list

    # LSTM decoder core (built by the project-level factory).
    self.core = rnn_utils.get_lstm(opt)

    # Word embedding: (vocab_size + 1) rows -> input_encoding_size.
    self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

    if self.use_linear:
        # Project fc / attention features into the RNN hidden size
        # (the original comment claimed input_encoding_size — the code
        # actually targets rnn_size).
        self.img_embed = nn.Linear(self.fc_feat_size, self.rnn_size)
        self.att_embed = nn.Linear(self.att_feat_size, self.rnn_size)

    self.relu = nn.ReLU()
    self.init_weight()
def __init__(self, opt):
    """Configure the ShowTellModel from *opt* and build its sub-modules.

    Args:
        opt: options namespace with vocabulary, feature and RNN sizes.
    """
    super(ShowTellModel, self).__init__()
    # Mirror plain configuration values straight off the options object.
    for key in ('vocab_size', 'input_encoding_size', 'rnn_type', 'rnn_size',
                'num_layers', 'drop_prob_lm', 'seq_length', 'fc_feat_size',
                'att_feat_size', 'att_size', 'use_linear', 'gram_num'):
        setattr(self, key, getattr(opt, key))

    # LSTM decoder core.
    self.core = rnn_utils.get_lstm(opt)

    # Word embedding: (vocab_size + 1) -> input_encoding_size, optionally
    # followed by an n-gram word-embedding stage when gram_num > 0.
    if self.gram_num > 0:
        self.embed = nn.Sequential(
            nn.Embedding(self.vocab_size + 1, self.input_encoding_size),
            Embed.WordEmbed(self.gram_num))
    else:
        self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

    if self.use_linear:
        # Image feature -> RNN hidden size.
        self.img_embed = nn.Linear(self.fc_feat_size, self.rnn_size)

    self.relu = nn.ReLU()
    self.init_weight()
def __init__(self, opt):
    """Configure the TopDownAttenModel from *opt* and create its modules.

    Args:
        opt: options namespace with vocabulary, feature and RNN sizes.
    """
    super(TopDownAttenModel, self).__init__()
    # Copy scalar configuration directly from the options object.
    for key in ('vocab_size', 'input_encoding_size', 'rnn_type', 'rnn_size',
                'num_layers', 'drop_prob_lm', 'seq_length', 'fc_feat_size',
                'bu_feat_size', 'bu_size'):
        setattr(self, key, getattr(opt, key))

    # LSTM decoder core.
    self.core = rnn_utils.get_lstm(opt)

    # Word embedding: (vocab_size + 1) rows of size input_encoding_size.
    self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

    # Project fc / bottom-up features into the RNN hidden size.
    self.img_embed = nn.Linear(self.fc_feat_size, self.rnn_size)
    self.att_embed = nn.Linear(self.bu_feat_size, self.rnn_size)

    self.relu = nn.ReLU()
    self.init_weight()
def __init__(self, opt):
    """Build the MoreSupWeightModel: copy configuration off *opt* and
    construct the (optional) review network, LSTM core, embeddings and
    feature projections.

    Args:
        opt: options namespace; note opt.att_size is OVERWRITTEN here when
             the review network is enabled.
    """
    super(MoreSupWeightModel, self).__init__()
    self.vocab_size = opt.vocab_size
    self.input_encoding_size = opt.input_encoding_size
    self.rnn_type = opt.rnn_type
    self.rnn_size = opt.rnn_size
    self.num_layers = opt.num_layers
    self.drop_prob_lm = opt.drop_prob_lm
    self.seq_length = opt.seq_length
    self.fc_feat_size = opt.fc_feat_size
    self.att_feat_size = opt.att_feat_size
    self.att_size = opt.att_size
    # Effective batch: every image contributes seq_per_img captions.
    self.batch_size = opt.batch_size * opt.seq_per_img
    self.rnn_atten = opt.rnn_atten
    self.num_parallels = opt.num_parallels
    self.sample_rate = opt.sample_rate
    self.use_linear = opt.use_linear
    self.rnn_size_list = opt.rnn_size_list
    self.gram_num = opt.gram_num
    self.logprob_pool_type = opt.logprob_pool_type  # 0 mean, 1 max

    # Optional review network.
    self.use_reviewnet = opt.use_reviewnet
    if self.use_reviewnet == 1:
        self.review_length = opt.review_length
        self.review_nets = nn.ModuleList()
        for _ in range(self.review_length):
            # BUG FIX: the original indexed-assigned into an EMPTY
            # nn.ModuleList (self.review_nets[i] = ...), which raises
            # IndexError at runtime; append registers each step correctly.
            self.review_nets.append(
                LSTM.LSTM_SOFT_ATT_NOX(self.rnn_size, self.att_size,
                                       self.drop_prob_lm))
        # Downstream attention now attends over the review steps.
        opt.att_size = self.review_length

    # LSTM decoder core.
    self.core = rnn_utils.get_lstm(opt)

    if self.rnn_atten == "ATT_LSTM":
        self.atten = LSTM.LSTM_ATTEN_LAYER(self.rnn_size)

    # Word embedding: (vocab_size + 1) -> input_encoding_size; with an
    # n-gram stage when gram_num > 0 (the n-gram path defines no embed_tc,
    # matching the original behavior).
    if self.gram_num > 0:
        self.embed = nn.Sequential(
            nn.Embedding(self.vocab_size + 1, self.input_encoding_size),
            Embed.WordEmbed(self.gram_num))
    else:
        self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)
        self.embed_tc = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

    if self.use_linear:
        # Project fc / attention features to the word-embedding size.
        self.img_embed = nn.Linear(self.fc_feat_size, self.input_encoding_size)
        self.att_embed = nn.Linear(self.att_feat_size, self.input_encoding_size)

    self.relu = nn.ReLU()
    self.init_weight()
def __init__(self, opt):
    """Build the DoubleAttenMModel: copy configuration off *opt* and
    construct the (optional) review network, LSTM core, embedding and
    feature projections.

    Args:
        opt: options namespace; note opt.att_size is OVERWRITTEN here when
             the review network is enabled.
    """
    super(DoubleAttenMModel, self).__init__()
    self.vocab_size = opt.vocab_size
    self.input_encoding_size = opt.input_encoding_size
    self.rnn_type = opt.rnn_type
    self.rnn_size = opt.rnn_size
    # NOTE: original code assigned num_layers twice; the duplicate is removed.
    self.num_layers = opt.num_layers
    self.drop_prob_lm = opt.drop_prob_lm
    self.seq_length = opt.seq_length
    self.fc_feat_size = opt.fc_feat_size
    self.att_feat_size = opt.att_feat_size
    self.att_size = opt.att_size
    # Effective batch: every image contributes seq_per_img captions.
    self.batch_size = opt.batch_size * opt.seq_per_img
    self.rnn_atten = opt.rnn_atten
    self.num_parallels = opt.num_parallels
    self.sample_rate = opt.sample_rate
    self.use_linear = opt.use_linear
    self.rnn_size_list = opt.rnn_size_list

    # Optional review network.
    self.use_reviewnet = opt.use_reviewnet
    if self.use_reviewnet == 1:
        self.review_length = opt.review_length
        self.review_nets = nn.ModuleList()
        for _ in range(self.review_length):
            # BUG FIX: the original indexed-assigned into an EMPTY
            # nn.ModuleList (self.review_nets[i] = ...), which raises
            # IndexError at runtime; append registers each step correctly.
            self.review_nets.append(
                LSTM.LSTM_SOFT_ATT_NOX(self.rnn_size, self.att_size,
                                       self.drop_prob_lm))
        # Downstream attention now attends over the review steps.
        opt.att_size = self.review_length

    # LSTM decoder core.
    self.core = rnn_utils.get_lstm(opt)

    if self.rnn_atten == "ATT_LSTM":
        self.atten = LSTM.LSTM_ATTEN_LAYER(self.rnn_size)

    # Word embedding: (vocab_size + 1) -> input_encoding_size.
    self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

    if self.use_linear:
        # Project fc / attention features into the RNN hidden size.
        self.img_embed = nn.Linear(self.fc_feat_size, self.rnn_size)
        self.att_embed = nn.Linear(self.att_feat_size, self.rnn_size)

    self.relu = nn.ReLU()
    self.init_weight()
def __init__(self, opt):
    """Configure the MoreAttenModel from *opt* and build its sub-modules.

    Args:
        opt: options namespace with vocabulary, feature and RNN sizes.
    """
    super(MoreAttenModel, self).__init__()
    # Copy scalar configuration directly from the options object.
    for key in ('vocab_size', 'input_encoding_size', 'num_layers',
                'seq_length', 'rnn_size', 'sample_rate', 'att_size',
                'att_feat_size'):
        setattr(self, key, getattr(opt, key))
    # Effective batch: every image contributes seq_per_img captions.
    self.batch_size = opt.batch_size * opt.seq_per_img

    # LSTM decoder core.
    self.core = rnn_utils.get_lstm(opt)

    # Word embedding with vocab_size + 1 entries.
    self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)
    # Attention features -> word-embedding size.
    self.att_embed = nn.Linear(self.att_feat_size, self.input_encoding_size)
    self.relu = nn.ReLU()
def __init__(self, opt):
    """Build the ShowAttenTellPhraseBuModel: copy configuration from *opt*
    and construct the LSTM core, word/phrase embeddings, feature
    projections and (optional) bilinear pooling layers.

    Args:
        opt: options namespace; several fields are read via getattr with
             defaults so older option sets still work.
    """
    super(ShowAttenTellPhraseBuModel, self).__init__()
    self.vocab_size = opt.vocab_size
    self.input_encoding_size = opt.input_encoding_size
    self.rnn_type = opt.rnn_type
    self.rnn_size = opt.rnn_size
    self.num_layers = opt.num_layers
    self.drop_prob_lm = opt.drop_prob_lm
    self.seq_length = opt.seq_length
    self.fc_feat_size = opt.fc_feat_size
    self.att_feat_size = opt.att_feat_size
    self.att_size = opt.att_size
    self.use_linear = opt.use_linear
    self.word_gram_num = opt.word_gram_num
    self.phrase_gram_num = opt.phrase_gram_num
    self.conv_gram_num = opt.conv_gram_num
    self.context_len = opt.context_len
    self.use_prob_weight = opt.use_prob_weight
    self.phrase_type = opt.phrase_type
    self.mil_type = opt.mil_type
    # Optional settings: fall back to defaults when absent from opt.
    self.use_gated_layer = getattr(opt, 'use_gated_layer', 0)
    self.sample_rate = getattr(opt, 'sample_rate', 0)
    self.word_embedding_type = getattr(opt, 'word_embedding_type', 0)
    # Bottom-up sizes default to the plain attention sizes.
    self.bu_size = getattr(opt, 'bu_size', opt.att_size)
    self.bu_feat_size = getattr(opt, 'bu_feat_size', opt.att_feat_size)
    self.use_bilinear = getattr(opt, 'use_bilinear', False)
    self.bilinear_output = getattr(opt, 'bilinear_output', 1000)
    self.relu_type = getattr(opt, 'relu_type', 0)

    # LSTM decoder core.
    self.core = rnn_utils.get_lstm(opt)

    # Word embedding: (vocab_size + 1) -> input_encoding_size.
    if self.word_embedding_type == 1:
        self.embed = Embed.EmbeddingWithBias(self.vocab_size + 1, self.input_encoding_size)
    else:
        if self.word_gram_num > 0:
            # Embedding followed by an n-gram word-embedding stage.
            self.embed = nn.Sequential(
                nn.Embedding(self.vocab_size + 1, self.input_encoding_size),
                Embed.WordEmbed(self.word_gram_num))
        else:
            self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

    # Phrase embedding: type 1 = phrase, 2 = conv, 3 = both.
    if self.phrase_type == 1:
        self.phraseEmbed = Embed.PhraseEmbed(self.phrase_gram_num, self.rnn_size)
    elif self.phrase_type == 2:
        self.phraseEmbed = Embed.ConvEmbed(self.conv_gram_num)
    elif self.phrase_type == 3:
        self.phraseEmbed = Embed.PhraseEmbed(self.phrase_gram_num, self.rnn_size)
        self.phraseEmbed1 = Embed.ConvEmbed(self.conv_gram_num)

    # Word-weight linear layer: fc feature -> per-word weight distribution.
    if self.use_prob_weight:
        # NOTE(review): nn.Softmax() without an explicit dim relies on the
        # deprecated implicit-dim behavior — confirm the intended axis.
        self.prob_weight_layer = nn.Sequential(
            nn.Linear(self.fc_feat_size, self.vocab_size + 1),
            nn.Softmax())

    if self.use_linear:
        # Project fc / attention / bottom-up features into the RNN hidden size.
        self.img_embed = nn.Linear(self.fc_feat_size, self.rnn_size)
        self.att_embed = nn.Linear(self.att_feat_size, self.rnn_size)
        self.bu_embed = nn.Linear(self.bu_feat_size, self.rnn_size)

    # Activation selection (RReLU variant left disabled by the author):
    # self.relu = nn.RReLU(inplace=True)
    if self.relu_type == 0:
        # One shared activation.
        if self.use_gated_layer == 1:
            self.relu = GatedLayer.GatedTanh(self.input_encoding_size)
        else:
            self.relu = nn.PReLU()
    elif self.relu_type == 1:
        # Separate activation per feature stream.
        self.img_relu = nn.PReLU()
        self.att_relu = nn.PReLU()
        self.bu_relu = nn.PReLU()

    self.init_weight()

    # Optional compact bilinear pooling over RNN-sized features,
    # projected back to rnn_size.
    if self.use_bilinear:
        self.bilinear_layer = CompactBilinearPooling(
            self.rnn_size, self.rnn_size, self.bilinear_output)
        self.bilinear_layer1 = nn.Linear(self.bilinear_output, self.rnn_size)
def __init__(self, opt):
    """Build the ShowAttenTellPhraseModel: copy configuration from *opt*
    and construct the LSTM core, word/phrase embeddings and feature
    projections.

    Args:
        opt: options namespace; several fields are read via getattr with
             defaults so older option sets still work.
    """
    super(ShowAttenTellPhraseModel, self).__init__()
    self.vocab_size = opt.vocab_size
    self.input_encoding_size = opt.input_encoding_size
    self.rnn_type = opt.rnn_type
    self.rnn_size = opt.rnn_size
    self.num_layers = opt.num_layers
    self.drop_prob_lm = opt.drop_prob_lm
    self.seq_length = opt.seq_length
    self.fc_feat_size = opt.fc_feat_size
    self.att_feat_size = opt.att_feat_size
    self.att_size = opt.att_size
    self.use_linear = opt.use_linear
    self.word_gram_num = opt.word_gram_num
    self.phrase_gram_num = opt.phrase_gram_num
    self.conv_gram_num = opt.conv_gram_num
    self.context_len = opt.context_len
    self.use_prob_weight = opt.use_prob_weight
    self.phrase_type = opt.phrase_type
    self.mil_type = opt.mil_type
    # Optional settings: fall back to defaults when absent from opt.
    self.use_gated_layer = getattr(opt, 'use_gated_layer', 0)
    self.use_linear_embedding = getattr(opt, 'use_linear_embedding', 0)
    self.relu_type = getattr(opt, 'relu_type', 0)

    # LSTM decoder core.
    self.core = rnn_utils.get_lstm(opt)

    # Word embedding: (vocab_size + 1) -> input_encoding_size.
    if self.use_linear_embedding == 1:
        # NOTE(review): an nn.Linear used as the embedding — presumably it
        # is fed one-hot / distribution vectors; verify against the caller.
        self.embed = nn.Linear(self.vocab_size + 1, self.input_encoding_size)
        # In-place re-init of the embedding weights to N(0, 1).
        self.embed.weight.data.normal_(0, 1)
    else:
        if self.word_gram_num > 0:
            # Embedding followed by an n-gram word-embedding stage.
            self.embed = nn.Sequential(
                nn.Embedding(self.vocab_size + 1, self.input_encoding_size),
                Embed.WordEmbed(self.word_gram_num))
        else:
            self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

    # Phrase embedding: type 1 = phrase, 2 = conv, 3 = both.
    if self.phrase_type == 1:
        self.phraseEmbed = Embed.PhraseEmbed(self.phrase_gram_num, self.rnn_size)
    elif self.phrase_type == 2:
        self.phraseEmbed = Embed.ConvEmbed(self.conv_gram_num)
    elif self.phrase_type == 3:
        self.phraseEmbed = Embed.PhraseEmbed(self.phrase_gram_num, self.rnn_size)
        self.phraseEmbed1 = Embed.ConvEmbed(self.conv_gram_num)

    # Word-weight linear layer: fc feature -> per-word weights
    # (sigmoid here, i.e. independent per-word gates rather than a
    # distribution).
    if self.use_prob_weight:
        self.prob_weight_layer = nn.Sequential(
            nn.Linear(self.fc_feat_size, self.vocab_size + 1),
            nn.Sigmoid())

    if self.use_linear:
        # Project fc / attention features into the RNN hidden size.
        self.img_embed = nn.Linear(self.fc_feat_size, self.rnn_size)
        self.att_embed = nn.Linear(self.att_feat_size, self.rnn_size)

    # Activation selection (RReLU variant left disabled by the author):
    # self.relu = nn.RReLU(inplace=True)
    if self.relu_type == 0:
        # One shared activation.
        if self.use_gated_layer == 1:
            self.relu = GatedLayer.GatedTanh(self.input_encoding_size)
        else:
            self.relu = nn.PReLU()
    elif self.relu_type == 1:
        # Separate activation per feature stream.
        self.img_relu = nn.PReLU()
        self.att_relu = nn.PReLU()

    self.init_weight()