# These __init__ snippets assume the enclosing modules' imports:
# torch, torch.nn as nn, and the repo-local GRUEncoder, MyEmbedding,
# VisualAttention, ConcatFusion, and GRUDecoder* classes.

def __init__(self, opt, tokenizer, device):
    super(GuesserNetwork, self).__init__()
    word_num = tokenizer.no_words
    self.opt = opt
    word_embedding_dim = opt["embedding_dim"]
    category_num = opt["n_category"]
    category_dim = opt["category_embed_dim"]
    hidden_size = opt["hidden"]
    n_layers = opt["layer"]
    dropout = opt["dropout"]
    image_dim = opt["image_dim"]
    self.cell = GRUEncoder(input_size=word_embedding_dim,
                           hidden_size=hidden_size,
                           embedding=None,
                           n_layers=n_layers,
                           p=dropout,
                           bidirectional=True,
                           out_p=dropout,
                           device=device)
    self.word_embedding = nn.Embedding(word_num, word_embedding_dim)
    torch.nn.init.normal_(self.word_embedding.weight, 0.0, 0.1)
    self.category_embedding = nn.Embedding(category_num + 1, category_dim)
    torch.nn.init.normal_(self.category_embedding.weight, 0.0, 0.1)
    obj_dim = opt["MLP1_hidden"]
    self.final_dim = opt["MLP2_hidden"]
    if opt["image"]:
        self.linear = nn.Linear(opt["hidden"] + image_dim, self.final_dim)
    # object representation: category embedding + 8 spatial box features
    self.mlp = nn.Sequential(nn.Linear(category_dim + 8, obj_dim),
                             nn.ReLU(),
                             nn.Linear(obj_dim, self.final_dim),
                             nn.ReLU())
    self.device = device
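# --- Hedged sketch: the 8-dim spatial vector feeding self.mlp above.
# The guesser MLP consumes category_dim + 8 features per candidate object.
# A common GuessWhat?!-style encoding (an assumption here, not confirmed
# by this file) normalizes the bounding box into [-1, 1] image coordinates:
def spatial_features(x_min, y_min, x_max, y_max, img_w, img_h):
    """Return the 8 normalized box features: corners, center, width, height."""
    # map pixel coordinates into [-1, 1]
    nx_min, nx_max = 2.0 * x_min / img_w - 1.0, 2.0 * x_max / img_w - 1.0
    ny_min, ny_max = 2.0 * y_min / img_h - 1.0, 2.0 * y_max / img_h - 1.0
    w_box, h_box = nx_max - nx_min, ny_max - ny_min
    x_c, y_c = nx_min + w_box / 2.0, ny_min + h_box / 2.0
    return [nx_min, ny_min, nx_max, ny_max, x_c, y_c, w_box, h_box]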
def __init__(self, args, tokenizer, device): super(QGenNetwork, self).__init__() self.args = args self.word_embedding_dim = args["embedding_dim"] self.device = device # encoder decoder共用word embedding self.word_embed = MyEmbedding(tokenizer.vocab_list, args["embedding_dim"]) self.dialogue_encoder = GRUEncoder( input_size=self.word_embedding_dim, hidden_size=args["session_hidden"], embedding=self.word_embed, n_layers=args["session_layer"], p=(0 if args["session_layer"] == 1 else args["session_dropout"]), bidirectional=args["session_bi"], out_p=0, device=device) session_v_dim = args["image_dim"] + 76 if args[ "image_arch"] == "rcnn" else args["image_dim"] self.image_compress = nn.Conv1d(args["image_dim"], args["v_feature"], 1, 1, padding=0, dilation=1, groups=1, bias=True) torch.nn.init.kaiming_normal_(self.image_compress.weight, nonlinearity='relu') torch.nn.init.constant_(self.image_compress.bias, 0.0) if args["visual_att"]: self.attention = VisualAttention(session_v_dim, args["session_hidden"], args["visual_att_dim"], args["glimpse"]) self.glimpse = args["glimpse"] else: self.attention = None self.visual_dropout = nn.Dropout(args["visual_dropout"]) self.fusion = ConcatFusion(args["session_hidden"], args["v_feature"], args["decoder_hidden"]) self.decoder = GRUDecoderBase(n_vocab=tokenizer.no_words, embedding=self.word_embed, embedding_dim=self.word_embedding_dim, category_dim=None, n_layers=args["decoder_layer"], hidden_size=args["decoder_hidden"], dropout=args["decoder_dropout"], beam=args["beam_size"], device=device) self.gen_loss = nn.CrossEntropyLoss(ignore_index=PAD)
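# --- Hedged sketch of a ConcatFusion-style module. The real ConcatFusion
# is defined elsewhere in the repo; this is an assumed minimal behavior
# (concatenate text and visual features, project to the decoder size),
# not the repository's actual implementation:
import torch
import torch.nn as nn

class ConcatFusionSketch(nn.Module):
    def __init__(self, text_dim, visual_dim, out_dim):
        super().__init__()
        self.proj = nn.Linear(text_dim + visual_dim, out_dim)

    def forward(self, text_feat, visual_feat):
        # (batch, text_dim) ++ (batch, visual_dim) -> (batch, out_dim)
        fused = torch.cat([text_feat, visual_feat], dim=-1)
        return torch.tanh(self.proj(fused))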
def __init__(self, opt, tokenizer, device):
    super(OracleNetwork, self).__init__()
    self.word_num = tokenizer.no_words
    self.word_embedding_dim = opt["embedding_dim"]
    self.category_num = opt["n_category"]
    self.category_embedding_dim = opt["category_embed_dim"]
    self.hidden_size = opt["hidden"]
    self.n_layers = opt["layer"]
    self.dropout = opt["dropout"]
    self.opt = opt
    self.rnn = GRUEncoder(input_size=self.word_embedding_dim,
                          hidden_size=self.hidden_size,
                          embedding=None,
                          n_layers=self.n_layers,
                          p=self.dropout,
                          bidirectional=opt["bidirectional"],
                          out_p=self.dropout,
                          device=device)
    self.word_embedding = nn.Embedding(self.word_num, self.word_embedding_dim)
    self.category_embedding = nn.Embedding(self.category_num + 1,
                                           self.category_embedding_dim)
    torch.nn.init.normal_(self.word_embedding.weight, 0.0, 0.1)
    torch.nn.init.normal_(self.category_embedding.weight, 0.0, 0.1)
    # accumulate the classifier input width from the enabled feature flags
    fc_dim = self.hidden_size
    if opt["category"]:
        fc_dim += self.category_embedding_dim
    if opt["spatial"]:
        fc_dim += 8  # spatial position features of the target object
    if opt["image"]:
        fc_dim += opt["image_dim"]
    if opt["crop"]:
        fc_dim += opt["crop_dim"]
    self.fc1 = nn.Linear(fc_dim, opt["MLP_hidden"])
    # num_classes must be defined at module scope; for a GuessWhat?!-style
    # oracle this is presumably the 3 answers (Yes / No / N/A)
    self.fc2 = nn.Linear(opt["MLP_hidden"], num_classes)
    self.device = device
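# --- Hedged sketch of how the oracle's classifier input is assembled at
# forward time (assumed; it simply mirrors the fc_dim bookkeeping above).
# Each enabled flag contributes its feature block in the same order:
import torch

def build_oracle_features(opt, hidden, category=None, spatial=None,
                          image=None, crop=None):
    """Concatenate the question encoding with the optional object features."""
    feats = [hidden]                       # (batch, hidden_size)
    if opt["category"]:
        feats.append(category)             # (batch, category_embed_dim)
    if opt["spatial"]:
        feats.append(spatial)              # (batch, 8) box geometry
    if opt["image"]:
        feats.append(image)                # (batch, image_dim)
    if opt["crop"]:
        feats.append(crop)                 # (batch, crop_dim)
    return torch.cat(feats, dim=-1)        # width matches fc_dim by construction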
def __init__(self, args, tokenizer, device): super(QGenNetwork, self).__init__() self.args = args self.word_embedding_dim = args["embedding_dim"] self.device = device # encoder decoder共用word embedding self.word_embed = MyEmbedding(tokenizer.vocab_list, args["embedding_dim"]) self.query_encoder = GRUEncoder( input_size=self.word_embedding_dim, hidden_size=args["query_hidden"], embedding=self.word_embed, n_layers=args["query_layer"], p=(0 if args["query_layer"] == 1 else args["query_dropout"]), bidirectional=args["query_bi"], out_p=0, device=device) self.ans_embed = nn.Embedding(3, args["answer_dim"]) torch.nn.init.normal_(self.ans_embed.weight, 0.0, 0.1) session_q_dim = args["query_hidden"] + args["answer_dim"] image_dim = args["image_dim"] + 76 if args[ "image_arch"] == "rcnn" else args["image_dim"] self.image_compress = nn.Conv1d(image_dim, args["v_feature"], 1, 1, padding=0, dilation=1, groups=1, bias=True) self.visual_dropout = nn.Dropout(args["visual_dropout"]) if args["visual_att"]: self.visual_attn = VisualAttention(image_dim, args["session_hidden"], args["visual_att_dim"], args["glimpse"]) else: self.visual_attn = None self.session_encoder = GRUEncoder( input_size=session_q_dim, hidden_size=args["session_hidden"], embedding=None, n_layers=args["session_layer"], p=(0 if args["session_layer"] == 1 else args["session_dropout"]), bidirectional=False, out_p=0, device=device) self.decoder_linear = nn.Linear( args["session_hidden"] + args["v_feature"], args["decoder_hidden"]) self.decoder = GRUDecoderBase(n_vocab=tokenizer.no_words, embedding=self.word_embed, embedding_dim=self.word_embedding_dim, category_dim=None, n_layers=args["decoder_layer"], hidden_size=args["decoder_hidden"], dropout=args["decoder_dropout"], beam=args["beam_size"], device=device) self.gen_loss = nn.CrossEntropyLoss(ignore_index=constants.PAD)
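# --- Hedged sketch: why image_compress is a kernel-size-1 Conv1d. Applied
# to region features laid out as (batch, channels, n_regions), a 1x1 Conv1d
# is a per-region linear projection from image_dim down to v_feature, with
# weights shared across regions. The dimensions below are illustrative
# assumptions, not the repository's settings:
import torch
import torch.nn as nn

regions = torch.randn(2, 2048 + 76, 36)           # e.g. 36 RCNN boxes per image
compress = nn.Conv1d(2048 + 76, 512, kernel_size=1)
print(compress(regions).shape)                     # torch.Size([2, 512, 36])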
def __init__(self, args, tokenizer, device): super(QGenNetwork, self).__init__() self.args = args self.word_embedding_dim = args["embedding_dim"] self.device = device # encoder decoder共用word embedding self.word_embed = MyEmbedding(tokenizer.vocab_list, args["embedding_dim"]) self.query_encoder = GRUEncoder( input_size=self.word_embedding_dim, hidden_size=args["query_hidden"], embedding=self.word_embed, n_layers=args["query_layer"], p=(0 if args["query_layer"] == 1 else args["query_dropout"]), bidirectional=args["query_bi"], out_p=0, device=device) self.pad_token_id = tokenizer.padding_token if args["multi_cate"]: ic("mutli-category") self.category_embed = nn.Linear(c_len, args["category_dim"]) else: ic("single-category") self.category_embed = nn.Embedding(c_len, args["category_dim"]) torch.nn.init.normal_(self.category_embed.weight, 0.0, 0.1) self.ans_embed = nn.Embedding(3, args["answer_dim"]) torch.nn.init.normal_(self.ans_embed.weight, 0.0, 0.1) session_q_dim = args["query_hidden"] + args["answer_dim"] + args[ "category_dim"] image_dim = args["image_dim"] + 76 if args[ "image_arch"] == "rcnn" else args["image_dim"] self.image_compress = nn.Conv1d(image_dim, args["v_feature"], 1, 1, padding=0, dilation=1, groups=1, bias=True) self.visual_dropout = nn.Dropout(args["visual_dropout"]) if args["visual_att"]: ic("with visual attention") self.visual_attn = VisualAttention(image_dim, args["session_hidden"], args["visual_att_dim"], args["glimpse"]) else: ic("without visual attention") self.visual_attn = None self.session_encoder = GRUEncoder( input_size=session_q_dim, hidden_size=args["session_hidden"], embedding=None, n_layers=args["session_layer"], p=(0 if args["session_layer"] == 1 else args["session_dropout"]), bidirectional=False, out_p=0, device=device) self.cls = "cls" in args["task"] self.category_weight_func = self.category_punish_weight if "weight_type" in args and "weight_type" == "prior": self.category_weight_func = self.category_prior_weight if self.cls: self.fc2 = nn.Sequential( nn.Linear(args["session_hidden"] + args["v_feature"], args["session_hidden"] + args["v_feature"] // 2), nn.ReLU(), nn.Linear(args["session_hidden"] + args["v_feature"] // 2, c_len)) self.punish_cate = nn.Parameter(torch.Tensor(c_len)) torch.nn.init.normal_(self.punish_cate, 0.0, 0.1) if args["multi_cate"]: self.loss_fn = nn.BCEWithLogitsLoss() else: self.loss_fn = nn.CrossEntropyLoss() self.gen = "gen" in args["task"] if self.gen: self.decoder_linear = nn.Linear( args["session_hidden"] + args["v_feature"], args["decoder_hidden"]) if args["decoder_att"]: ic("decoder with attention") self.decoder = GRUDecoderAtt( n_vocab=tokenizer.no_words, embedding=self.word_embed, embedding_dim=self.word_embedding_dim, category_dim=args["category_dim"], session_dim=args["session_hidden"], v_feature_dim=None, n_layers=args["decoder_layer"], hidden_size=args["decoder_hidden"], attention_size=args["decoder_attention_dim"], dropout=args["decoder_dropout"], beam=args["beam_size"], device=device) elif "category_once" in args and args["category_once"] is True: ic("decoder with category once") self.decoder = GRUDecoderCategoryOnce( n_vocab=tokenizer.no_words, embedding=self.word_embed, embedding_dim=self.word_embedding_dim, category_dim=args["category_dim"], n_layers=args["decoder_layer"], hidden_size=args["decoder_hidden"], dropout=args["decoder_dropout"], beam=args["beam_size"], device=device) else: ic("decoder with category each input") self.decoder = GRUDecoderBase( n_vocab=tokenizer.no_words, embedding=self.word_embed, 
embedding_dim=self.word_embedding_dim, category_dim=args["category_dim"], n_layers=args["decoder_layer"], hidden_size=args["decoder_hidden"], dropout=args["decoder_dropout"], beam=args["beam_size"], device=device) self.gen_loss = nn.CrossEntropyLoss(ignore_index=PAD)
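# --- Hedged sketch: the configuration keys read by the constructor above,
# gathered into one example dict. The values are illustrative placeholders
# (assumptions), not the repository's actual settings:
example_args = {
    "embedding_dim": 300,
    "query_hidden": 512, "query_layer": 1, "query_dropout": 0.0,
    "query_bi": True,
    "multi_cate": False, "category_dim": 64,
    "answer_dim": 16,
    "image_dim": 2048, "image_arch": "rcnn", "v_feature": 512,
    "visual_dropout": 0.5,
    "visual_att": True, "visual_att_dim": 512, "glimpse": 2,
    "session_hidden": 512, "session_layer": 1, "session_dropout": 0.0,
    "task": "cls+gen",          # enables both the classifier and decoder heads
    "weight_type": "prior",     # optional; selects category_prior_weight
    "decoder_hidden": 512, "decoder_layer": 1, "decoder_dropout": 0.0,
    "decoder_att": False, "decoder_attention_dim": 512,
    "category_once": False,     # optional; picks GRUDecoderCategoryOnce
    "beam_size": 3,
}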