Example #1
    def __init__(self, opt, tokenizer, device):
        super(GuesserNetwork, self).__init__()
        word_num = tokenizer.no_words
        self.opt = opt
        word_embedding_dim = opt["embedding_dim"]
        category_num = opt["n_category"]
        category_dim = opt["category_embed_dim"]
        hidden_size = opt["hidden"]
        n_layers = opt["layer"]
        dropout = opt["dropout"]
        image_dim = opt["image_dim"]
        self.cell = GRUEncoder(input_size=word_embedding_dim,
                               hidden_size=hidden_size,
                               embedding=None,
                               n_layers=n_layers,
                               p=dropout,
                               bidirectional=True,
                               out_p=dropout,
                               device=device)
        self.word_embedding = nn.Embedding(word_num, word_embedding_dim)
        torch.nn.init.normal_(self.word_embedding.weight, 0.0, 0.1)
        self.category_embedding = nn.Embedding(category_num + 1, category_dim)
        torch.nn.init.normal_(self.category_embedding.weight, 0.0, 0.1)
        obj_dim = opt["MLP1_hidden"]
        self.final_dim = opt["MLP2_hidden"]

        if opt["image"]:
            self.linear = nn.Linear(opt["hidden"] + image_dim, self.final_dim)

        # object features: category embedding concatenated with 8 extra dims
        # (presumably the 8-dim spatial position features, cf. Example #3)
        self.mlp = nn.Sequential(nn.Linear(category_dim + 8, obj_dim),
                                 nn.ReLU(), nn.Linear(obj_dim, self.final_dim),
                                 nn.ReLU())
        self.device = device
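The constructor above only consumes a handful of `opt` keys plus a tokenizer that exposes `no_words`. Below is a minimal instantiation sketch, not taken from the repository: the key names mirror the constructor, the values are placeholders, and `tokenizer` is assumed to be this project's tokenizer object.

# Illustrative only: key names come from the constructor above; values are placeholders.
import torch

opt = {
    "embedding_dim": 300,
    "n_category": 90,
    "category_embed_dim": 256,
    "hidden": 512,
    "layer": 1,
    "dropout": 0.5,
    "image_dim": 2048,
    "image": False,          # skip the optional image-fusion linear layer
    "MLP1_hidden": 512,
    "MLP2_hidden": 512,
}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
guesser = GuesserNetwork(opt, tokenizer, device).to(device)  # tokenizer from this repo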
Example #2
    def __init__(self, args, tokenizer, device):
        super(QGenNetwork, self).__init__()
        self.args = args
        self.word_embedding_dim = args["embedding_dim"]
        self.device = device
        # the encoder and decoder share the word embedding
        self.word_embed = MyEmbedding(tokenizer.vocab_list,
                                      args["embedding_dim"])
        self.dialogue_encoder = GRUEncoder(
            input_size=self.word_embedding_dim,
            hidden_size=args["session_hidden"],
            embedding=self.word_embed,
            n_layers=args["session_layer"],
            p=(0 if args["session_layer"] == 1 else args["session_dropout"]),
            bidirectional=args["session_bi"],
            out_p=0,
            device=device)
        # R-CNN region features carry 76 extra dims, widening the attention input
        session_v_dim = (args["image_dim"] + 76
                         if args["image_arch"] == "rcnn" else args["image_dim"])
        # kernel-size-1 Conv1d: a per-region linear projection of the image features
        self.image_compress = nn.Conv1d(args["image_dim"],
                                        args["v_feature"],
                                        1,
                                        1,
                                        padding=0,
                                        dilation=1,
                                        groups=1,
                                        bias=True)
        torch.nn.init.kaiming_normal_(self.image_compress.weight,
                                      nonlinearity='relu')
        torch.nn.init.constant_(self.image_compress.bias, 0.0)
        if args["visual_att"]:
            self.attention = VisualAttention(session_v_dim,
                                             args["session_hidden"],
                                             args["visual_att_dim"],
                                             args["glimpse"])
            self.glimpse = args["glimpse"]
        else:
            self.attention = None
        self.visual_dropout = nn.Dropout(args["visual_dropout"])
        self.fusion = ConcatFusion(args["session_hidden"], args["v_feature"],
                                   args["decoder_hidden"])
        self.decoder = GRUDecoderBase(n_vocab=tokenizer.no_words,
                                      embedding=self.word_embed,
                                      embedding_dim=self.word_embedding_dim,
                                      category_dim=None,
                                      n_layers=args["decoder_layer"],
                                      hidden_size=args["decoder_hidden"],
                                      dropout=args["decoder_dropout"],
                                      beam=args["beam_size"],
                                      device=device)
        # PAD is the padding token id defined elsewhere in this codebase (cf. constants.PAD in Example #4)
        self.gen_loss = nn.CrossEntropyLoss(ignore_index=PAD)
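Since the snippet compresses per-region image features with a kernel-size-1 `nn.Conv1d`, a quick shape check (hypothetical sizes, not taken from the repository) shows it behaves as a linear projection applied to every region independently.

import torch
import torch.nn as nn

# Hypothetical sizes standing in for args["image_dim"] and args["v_feature"].
compress = nn.Conv1d(2048, 512, 1, 1, padding=0, dilation=1, groups=1, bias=True)
regions = torch.randn(4, 2048, 36)   # (batch, image_dim, n_regions)
print(compress(regions).shape)       # torch.Size([4, 512, 36]): each region projected to 512 dims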
Example #3
    def __init__(self, opt, tokenizer, device):
        super(OracleNetwork, self).__init__()
        self.word_num = tokenizer.no_words
        self.word_embedding_dim = opt["embedding_dim"]
        self.category_num = opt["n_category"]
        self.category_embedding_dim = opt["category_embed_dim"]
        self.hidden_size = opt["hidden"]
        self.n_layers = opt["layer"]
        self.dropout = opt["dropout"]
        self.opt = opt
        self.rnn = GRUEncoder(input_size=self.word_embedding_dim,
                              hidden_size=self.hidden_size,
                              embedding=None,
                              n_layers=self.n_layers,
                              p=self.dropout,
                              bidirectional=opt["bidirectional"],
                              out_p=self.dropout,
                              device=device)
        self.word_embedding = nn.Embedding(self.word_num,
                                           self.word_embedding_dim)
        self.category_embedding = nn.Embedding(self.category_num + 1,
                                               self.category_embedding_dim)
        torch.nn.init.normal_(self.word_embedding.weight, 0.0, 0.1)
        torch.nn.init.normal_(self.category_embedding.weight, 0.0, 0.1)
        # the classifier input grows with each optional feature that is enabled
        fc_dim = self.hidden_size
        if opt["category"]:
            fc_dim += self.category_embedding_dim
        if opt["spatial"]:
            fc_dim += 8  # 8-dim spatial position features
        if opt["image"]:
            fc_dim += opt["image_dim"]
        if opt["crop"]:
            fc_dim += opt["crop_dim"]
        self.fc1 = nn.Linear(fc_dim, opt["MLP_hidden"])
        # num_classes is expected to be defined at module level (presumably the number of answer classes)
        self.fc2 = nn.Linear(opt["MLP_hidden"], num_classes)
        self.device = device
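The classifier input width `fc_dim` is simply the sum of the enabled feature sizes. A small worked example with made-up option values (mirroring the accumulation above, not any real config):

# Hypothetical option values; mirrors the fc_dim accumulation above.
opt = {"hidden": 512, "category": True, "category_embed_dim": 256,
       "spatial": True, "image": False, "crop": False}
fc_dim = opt["hidden"]                   # 512 (RNN question encoding)
if opt["category"]:
    fc_dim += opt["category_embed_dim"]  # +256 (category embedding)
if opt["spatial"]:
    fc_dim += 8                          # +8 (spatial position features)
print(fc_dim)                            # 776 -> in_features of fc1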
Example #4
    def __init__(self, args, tokenizer, device):
        super(QGenNetwork, self).__init__()
        self.args = args
        self.word_embedding_dim = args["embedding_dim"]
        self.device = device
        # the encoder and decoder share the word embedding
        self.word_embed = MyEmbedding(tokenizer.vocab_list,
                                      args["embedding_dim"])
        self.query_encoder = GRUEncoder(
            input_size=self.word_embedding_dim,
            hidden_size=args["query_hidden"],
            embedding=self.word_embed,
            n_layers=args["query_layer"],
            p=(0 if args["query_layer"] == 1 else args["query_dropout"]),
            bidirectional=args["query_bi"],
            out_p=0,
            device=device)
        self.ans_embed = nn.Embedding(3, args["answer_dim"])
        torch.nn.init.normal_(self.ans_embed.weight, 0.0, 0.1)
        session_q_dim = args["query_hidden"] + args["answer_dim"]
        image_dim = (args["image_dim"] + 76
                     if args["image_arch"] == "rcnn" else args["image_dim"])
        self.image_compress = nn.Conv1d(image_dim,
                                        args["v_feature"],
                                        1,
                                        1,
                                        padding=0,
                                        dilation=1,
                                        groups=1,
                                        bias=True)
        self.visual_dropout = nn.Dropout(args["visual_dropout"])
        if args["visual_att"]:
            self.visual_attn = VisualAttention(image_dim,
                                               args["session_hidden"],
                                               args["visual_att_dim"],
                                               args["glimpse"])
        else:
            self.visual_attn = None

        self.session_encoder = GRUEncoder(
            input_size=session_q_dim,
            hidden_size=args["session_hidden"],
            embedding=None,
            n_layers=args["session_layer"],
            p=(0 if args["session_layer"] == 1 else args["session_dropout"]),
            bidirectional=False,
            out_p=0,
            device=device)

        self.decoder_linear = nn.Linear(
            args["session_hidden"] + args["v_feature"], args["decoder_hidden"])
        self.decoder = GRUDecoderBase(n_vocab=tokenizer.no_words,
                                      embedding=self.word_embed,
                                      embedding_dim=self.word_embedding_dim,
                                      category_dim=None,
                                      n_layers=args["decoder_layer"],
                                      hidden_size=args["decoder_hidden"],
                                      dropout=args["decoder_dropout"],
                                      beam=args["beam_size"],
                                      device=device)
        self.gen_loss = nn.CrossEntropyLoss(ignore_index=constants.PAD)
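For reference, the `args` keys read by this constructor can be gathered into one dict. A hedged sketch only: the key names are taken from the code above, while every value is a placeholder rather than a setting from the repository.

# Hypothetical configuration; only the key names are taken from the constructor above.
args = {
    "embedding_dim": 300,
    "query_hidden": 512, "query_layer": 1, "query_dropout": 0.5, "query_bi": True,
    "answer_dim": 64,
    "image_dim": 2048, "image_arch": "rcnn", "v_feature": 512,
    "visual_dropout": 0.5, "visual_att": False, "visual_att_dim": 256, "glimpse": 2,
    "session_hidden": 512, "session_layer": 1, "session_dropout": 0.5,
    "decoder_hidden": 512, "decoder_layer": 1, "decoder_dropout": 0.5,
    "beam_size": 3,
}
qgen = QGenNetwork(args, tokenizer, device)  # tokenizer and device as in the snippet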
Example #5
    def __init__(self, args, tokenizer, device):
        super(QGenNetwork, self).__init__()
        self.args = args
        self.word_embedding_dim = args["embedding_dim"]
        self.device = device
        # the encoder and decoder share the word embedding
        self.word_embed = MyEmbedding(tokenizer.vocab_list,
                                      args["embedding_dim"])
        self.query_encoder = GRUEncoder(
            input_size=self.word_embedding_dim,
            hidden_size=args["query_hidden"],
            embedding=self.word_embed,
            n_layers=args["query_layer"],
            p=(0 if args["query_layer"] == 1 else args["query_dropout"]),
            bidirectional=args["query_bi"],
            out_p=0,
            device=device)
        self.pad_token_id = tokenizer.padding_token
        if args["multi_cate"]:
            ic("mutli-category")
            self.category_embed = nn.Linear(c_len, args["category_dim"])
        else:
            ic("single-category")
            self.category_embed = nn.Embedding(c_len, args["category_dim"])
        torch.nn.init.normal_(self.category_embed.weight, 0.0, 0.1)
        self.ans_embed = nn.Embedding(3, args["answer_dim"])
        torch.nn.init.normal_(self.ans_embed.weight, 0.0, 0.1)
        session_q_dim = args["query_hidden"] + args["answer_dim"] + args[
            "category_dim"]
        image_dim = (args["image_dim"] + 76
                     if args["image_arch"] == "rcnn" else args["image_dim"])
        self.image_compress = nn.Conv1d(image_dim,
                                        args["v_feature"],
                                        1,
                                        1,
                                        padding=0,
                                        dilation=1,
                                        groups=1,
                                        bias=True)
        self.visual_dropout = nn.Dropout(args["visual_dropout"])
        if args["visual_att"]:
            ic("with visual attention")
            self.visual_attn = VisualAttention(image_dim,
                                               args["session_hidden"],
                                               args["visual_att_dim"],
                                               args["glimpse"])
        else:
            ic("without visual attention")
            self.visual_attn = None

        self.session_encoder = GRUEncoder(
            input_size=session_q_dim,
            hidden_size=args["session_hidden"],
            embedding=None,
            n_layers=args["session_layer"],
            p=(0 if args["session_layer"] == 1 else args["session_dropout"]),
            bidirectional=False,
            out_p=0,
            device=device)

        self.cls = "cls" in args["task"]
        self.category_weight_func = self.category_punish_weight
        if "weight_type" in args and "weight_type" == "prior":
            self.category_weight_func = self.category_prior_weight
        if self.cls:
            # note: // binds tighter than +, so both hidden widths below are session_hidden + (v_feature // 2)
            self.fc2 = nn.Sequential(
                nn.Linear(args["session_hidden"] + args["v_feature"],
                          args["session_hidden"] + args["v_feature"] // 2),
                nn.ReLU(),
                nn.Linear(args["session_hidden"] + args["v_feature"] // 2,
                          c_len))
            self.punish_cate = nn.Parameter(torch.Tensor(c_len))
            torch.nn.init.normal_(self.punish_cate, 0.0, 0.1)
            if args["multi_cate"]:
                self.loss_fn = nn.BCEWithLogitsLoss()
            else:
                self.loss_fn = nn.CrossEntropyLoss()
        self.gen = "gen" in args["task"]
        if self.gen:
            self.decoder_linear = nn.Linear(
                args["session_hidden"] + args["v_feature"],
                args["decoder_hidden"])
            if args["decoder_att"]:
                ic("decoder with attention")
                self.decoder = GRUDecoderAtt(
                    n_vocab=tokenizer.no_words,
                    embedding=self.word_embed,
                    embedding_dim=self.word_embedding_dim,
                    category_dim=args["category_dim"],
                    session_dim=args["session_hidden"],
                    v_feature_dim=None,
                    n_layers=args["decoder_layer"],
                    hidden_size=args["decoder_hidden"],
                    attention_size=args["decoder_attention_dim"],
                    dropout=args["decoder_dropout"],
                    beam=args["beam_size"],
                    device=device)
            elif "category_once" in args and args["category_once"] is True:
                ic("decoder with category once")
                self.decoder = GRUDecoderCategoryOnce(
                    n_vocab=tokenizer.no_words,
                    embedding=self.word_embed,
                    embedding_dim=self.word_embedding_dim,
                    category_dim=args["category_dim"],
                    n_layers=args["decoder_layer"],
                    hidden_size=args["decoder_hidden"],
                    dropout=args["decoder_dropout"],
                    beam=args["beam_size"],
                    device=device)
            else:
                ic("decoder with category each input")
                self.decoder = GRUDecoderBase(
                    n_vocab=tokenizer.no_words,
                    embedding=self.word_embed,
                    embedding_dim=self.word_embedding_dim,
                    category_dim=args["category_dim"],
                    n_layers=args["decoder_layer"],
                    hidden_size=args["decoder_hidden"],
                    dropout=args["decoder_dropout"],
                    beam=args["beam_size"],
                    device=device)
            # PAD is the padding token id defined elsewhere in this codebase (cf. constants.PAD in Example #4)
            self.gen_loss = nn.CrossEntropyLoss(ignore_index=PAD)
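The `multi_cate` switch above changes how categories are embedded: an `nn.Linear` applied to a multi-hot category vector versus an `nn.Embedding` lookup on a single category index. A minimal sketch with hypothetical sizes illustrates the difference in expected inputs:

import torch
import torch.nn as nn

c_len, category_dim = 90, 64  # hypothetical category count and embedding size

# multi_cate=True: project a multi-hot category vector with a Linear layer
multi_embed = nn.Linear(c_len, category_dim)
multi_hot = torch.zeros(1, c_len)
multi_hot[0, [3, 17]] = 1.0                  # two categories active
print(multi_embed(multi_hot).shape)          # torch.Size([1, 64])

# multi_cate=False: look up a single category index in an Embedding table
single_embed = nn.Embedding(c_len, category_dim)
print(single_embed(torch.tensor([3])).shape) # torch.Size([1, 64])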