def __init__(self, text_query, image_embed_dim, text_embed_dim, use_bert, name):
    """Build a ResNet-18 image encoder and an LSTM text encoder.

    Args:
        text_query: Texts forwarded to ``text_model.TextLSTMModel`` as
            ``texts_to_build_vocab``.
        image_embed_dim: Output size of the image projection layer that
            replaces the ResNet classifier.
        text_embed_dim: Used as both the word-embedding size and the LSTM
            hidden size of the text encoder.
        use_bert: Accepted for interface compatibility; this constructor
            does not read it.
        name: Stored unchanged as ``self.name``.
    """
    super().__init__()
    self.name = name

    class GlobalAvgPool2d(torch.nn.Module):
        """Average every feature map down to a single 1x1 value."""

        def forward(self, x):
            # (1, 1) is the requested output size: e.g. an input of shape
            # (512, 3, 64, 64) becomes (512, 3, 1, 1), i.e. the mean over
            # each 64x64 map.
            return F.adaptive_avg_pool2d(x, (1, 1))

    # img model
    # ``pretrained=True`` asks torchvision for pretrained weights.
    img_model = torchvision.models.resnet18(pretrained=True)
    img_model.avgpool = GlobalAvgPool2d()

    # The projection consumes the backbone's pooled feature, so its input
    # width must come from the backbone (the stock classifier's
    # ``in_features``), not from ``image_embed_dim``. Using
    # ``image_embed_dim`` for both sides only lines up when the two widths
    # happen to be equal.
    backbone_dim = img_model.fc.in_features
    # fc = fully connected (linear) layer; Linear(in, out) takes the input
    # and output widths.
    img_model.fc = torch.nn.Sequential(
        torch.nn.Linear(backbone_dim, image_embed_dim))
    self.img_model = img_model

    # text model
    self.text_model = text_model.TextLSTMModel(
        texts_to_build_vocab=text_query,
        word_embed_dim=text_embed_dim,
        lstm_hidden_dim=text_embed_dim)
def __init__(self, embed_dim, texts):
    """Assemble the image encoder, text encoder and composition transform.

    Args:
        embed_dim: Output width of both encoder heads; also used as the
            LSTM hidden size and passed to ``MTirgTransform``.
        texts: Forwarded to ``text_model.TextLSTMModel`` as
            ``texts_to_build_vocab``.
    """
    super(ImageTextEncodeTransformModel, self).__init__()
    nn = torch.nn
    hidden_dim = 2048

    def projection_head(in_dim, drop_prob):
        # Shared layout of both heads:
        # dropout -> linear -> batch norm -> dropout -> ReLU -> linear.
        return nn.Sequential(
            nn.Dropout(drop_prob),
            nn.Linear(in_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(drop_prob),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim),
        )

    self.snorm = torch_functions.NormalizationLayer(
        normalize_scale=4.0, learn_scale=True)

    # image: pretrained ResNet-50 whose classifier is swapped for the head
    resnet = torchvision.models.resnet50(pretrained=True)
    resnet.fc = projection_head(2048, 0.2)
    self.img_encoder = resnet

    # text: LSTM encoder whose output layer is swapped for the head
    lstm_encoder = text_model.TextLSTMModel(
        texts_to_build_vocab=texts,
        word_embed_dim=256,
        lstm_hidden_dim=embed_dim,
    )
    lstm_encoder.fc_output = projection_head(embed_dim, 0.1)
    self.text_encoder = lstm_encoder

    # transformer
    self.transformer = MTirgTransform(embed_dim)
def __init__(self, texts, opt):
    """Build the image and text encoders selected by ``opt``.

    Args:
        texts: Forwarded to the text encoder as ``texts_to_build_vocab``.
        opt: Options object; this constructor reads ``opt.img_encoder``
            ('efficientnet', 'resnet18', 'resnet50' or 'resnet101'),
            ``opt.text_encoder`` ('lstm' or 'dualenc') and
            ``opt.embed_dim``.

    Raises:
        SystemExit: via ``sys.exit()`` after printing a message when either
            encoder name is not one of the supported values.
    """
    super(ImgEncoderTextEncoderBase, self).__init__()
    img_encoder = opt.img_encoder
    text_encoder = opt.text_encoder
    embed_dim = opt.embed_dim

    # This class must be defined before the encoder selection below. It is
    # a function-local name, so instantiating it ahead of its ``class``
    # statement raises UnboundLocalError in every ResNet branch.
    class GlobalAvgPool2d(torch.nn.Module):
        """Average every feature map down to a single 1x1 value."""

        def forward(self, x):
            return F.adaptive_avg_pool2d(x, (1, 1))

    if img_encoder == 'efficientnet':
        img_model = EfficientNet.from_pretrained('efficientnet-b0')
        img_model._fc = torch.nn.Sequential(
            torch.nn.Linear(1280, embed_dim))
    elif img_encoder == 'resnet18':
        img_model = torchvision.models.resnet18(pretrained=True)
        img_model.avgpool = GlobalAvgPool2d()
        img_model.fc = torch.nn.Sequential(torch.nn.Linear(512, embed_dim))
    elif img_encoder == 'resnet50':
        img_model = torchvision.models.resnet50(pretrained=True)
        img_model.avgpool = GlobalAvgPool2d()
        img_model.fc = torch.nn.Sequential(torch.nn.Linear(2048, embed_dim))
    elif img_encoder == 'resnet101':
        img_model = torchvision.models.resnet101(pretrained=True)
        img_model.avgpool = GlobalAvgPool2d()
        img_model.fc = torch.nn.Sequential(torch.nn.Linear(2048, embed_dim))
    else:
        print('Invalid image encoder', img_encoder)
        print('available: efficientnet, resnet18, resnet50, resnet101')
        sys.exit()

    self.img_model = img_model
    self.embed_dim = embed_dim

    if text_encoder == 'lstm':
        self.text_model = text_model.TextLSTMModel(
            texts_to_build_vocab=texts,
            word_embed_dim=512,
            lstm_hidden_dim=embed_dim)
    elif text_encoder == 'dualenc':
        self.text_model = text_model.TextDualencModel(
            texts_to_build_vocab=texts,
            embed_dim=embed_dim,
            word_embed_dim=300,
            lstm_hidden_dim=512)
    else:
        print('Invalid text encoder', text_encoder)
        print('available: lstm, dualenc')
        sys.exit()
def __init__(self, vocab_size, embed_dim=512):
    """Create the ResNet-18 image encoder and the LSTM text encoder.

    Args:
        vocab_size: Passed as the first positional argument to
            ``text_model.TextLSTMModel``.
        embed_dim: Output width of the image projection; also used as the
            word-embedding size and LSTM hidden size. Defaults to 512.
    """
    super(ImgEncoderTextEncoderBase, self).__init__()

    class GlobalAvgPool2d(torch.nn.Module):
        """Average every feature map down to a single 1x1 value."""

        def forward(self, x):
            return F.adaptive_avg_pool2d(x, (1, 1))

    # img model: pretrained backbone with pooling and classifier replaced
    backbone = torchvision.models.resnet18(pretrained=True)
    backbone.avgpool = GlobalAvgPool2d()
    projection = torch.nn.Linear(512, embed_dim)
    backbone.fc = torch.nn.Sequential(projection)
    self.img_model = backbone

    # text model
    self.text_model = text_model.TextLSTMModel(
        vocab_size,
        word_embed_dim=embed_dim,
        lstm_hidden_dim=embed_dim,
    )
def __init__(self, texts, embed_dim):
    """Build a ResNet-18 image encoder from a local checkpoint plus an LSTM
    text encoder.

    Args:
        texts: Forwarded to ``text_model.TextLSTMModel`` as
            ``texts_to_build_vocab``.
        embed_dim: Output width of the image projection; also used as the
            word-embedding size and LSTM hidden size.
    """
    # Function-scope import: only needed to build the checkpoint path.
    import os

    super(ImgEncoderTextEncoderBase, self).__init__()

    # The backbone is created without downloading weights; they are read
    # from the file ``resnet18-5c106cde.pth`` inside the ``Path1`` directory.
    img_model = torchvision.models.resnet18(pretrained=False)
    # Join with the platform's separator instead of a hard-coded backslash,
    # which only forms a valid path on Windows.
    weights_path = os.path.join(Path1, 'resnet18-5c106cde.pth')
    img_model.load_state_dict(torch.load(weights_path))

    class GlobalAvgPool2d(torch.nn.Module):
        """Average every feature map down to a single 1x1 value."""

        def forward(self, x):
            return F.adaptive_avg_pool2d(x, (1, 1))

    img_model.avgpool = GlobalAvgPool2d()
    img_model.fc = torch.nn.Sequential(torch.nn.Linear(512, embed_dim))
    self.img_model = img_model

    # text model
    self.text_model = text_model.TextLSTMModel(
        texts_to_build_vocab=texts,
        word_embed_dim=embed_dim,
        lstm_hidden_dim=embed_dim)
def __init__(self, text_query, image_embed_dim, text_embed_dim, use_bert, name):
    """Build a ResNet-18 image encoder and an LSTM text encoder.

    Args:
        text_query: Texts forwarded to ``text_model.TextLSTMModel`` as
            ``texts_to_build_vocab``.
        image_embed_dim: Output size of the image projection layer that
            replaces the ResNet classifier.
        text_embed_dim: Used as both the word-embedding size and the LSTM
            hidden size of the text encoder.
        use_bert: Accepted for interface compatibility; this constructor
            does not read it.
        name: Stored unchanged as ``self.name``.
    """
    super().__init__()
    self.name = name

    class GlobalAvgPool2d(torch.nn.Module):
        """Average every feature map down to a single 1x1 value."""

        def forward(self, x):
            return F.adaptive_avg_pool2d(x, (1, 1))

    # img model
    img_model = torchvision.models.resnet18(pretrained=True)
    img_model.avgpool = GlobalAvgPool2d()

    # The projection's input is the backbone's pooled feature, so take the
    # width from the stock classifier (``fc.in_features``) rather than
    # reusing ``image_embed_dim``; the two only coincide when
    # ``image_embed_dim`` equals the backbone width.
    backbone_dim = img_model.fc.in_features
    img_model.fc = torch.nn.Sequential(
        torch.nn.Linear(backbone_dim, image_embed_dim))
    self.img_model = img_model

    # text model
    self.text_model = text_model.TextLSTMModel(
        texts_to_build_vocab=text_query,
        word_embed_dim=text_embed_dim,
        lstm_hidden_dim=text_embed_dim)