# Five model-variant constructors from the same experiment series. The imports
# and the wrapper class names (ModelVariantA..E) are assumed for readability:
# the original excerpt shows only the __init__ bodies. RNNEncoder,
# ContextMatching, Conv1d, CharMatching, Norm, UtilityLayer, and SummaryAttn
# are project-local modules.
import torch
import torch.nn as nn
from transformers import BertModel, RobertaModel, RobertaTokenizer


class ModelVariantA(nn.Module):  # hypothetical name; original class not shown
    def __init__(self, args, vocab, n_dim, image_dim, layers, dropout, num_choice=5):
        super().__init__()
        self.vocab = vocab
        V = len(vocab)
        D = n_dim
        self.hidden_dim = n_dim

        # video_encoder_layer = nn.TransformerEncoderLayer(d_model=300, nhead=6, dim_feedforward=1024, dropout=0.1, activation='gelu')
        # self.video_encoder = nn.TransformerEncoder(video_encoder_layer, num_layers=1)
        # Bidirectional GRU over per-frame features (image_dim) concatenated with
        # a 21-way speaker one-hot; 2 directions * 150 hidden = 300-dim output.
        self.video_encoder = nn.GRU(image_dim + 21, 150, bidirectional=True, batch_first=True)

        multimodal_encoder_layer = nn.TransformerEncoderLayer(
            d_model=n_dim, nhead=6, dim_feedforward=1024, dropout=0.5, activation='gelu')
        self.transformer = nn.TransformerEncoder(multimodal_encoder_layer, num_layers=2)

        self.embedding = nn.Embedding(V, D)
        n_dim = args.n_dim
        image_dim = args.image_dim

        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.language_model = RobertaModel.from_pretrained('roberta-base', return_dict=True)
        # for param in self.language_model.base_model.parameters():
        #     param.requires_grad = False

        # Update config to finetune token type embeddings
        # self.language_model.config.type_vocab_size = 3
        # Create a new embeddings layer with 3 possible segment ids instead of 1
        # self.language_model.embeddings.token_type_embeddings = nn.Embedding(3, self.language_model.config.hidden_size)
        # Initialize it
        # self.language_model.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=self.language_model.config.initializer_range)
        '''
        # Freeze the first 10 layers
        modules = [self.language_model.encoder.layer[:10]]
        for module in modules:
            for param in module.parameters():
                param.requires_grad = False
        '''

        # self.cmat = ContextMatching(n_dim * 3)
        # self.lstm_raw = RNNEncoder(300, 150, bidirectional=True, dropout_p=0, n_layers=1, rnn_type="lstm")
        # Re-created below when the script stream is enabled.
        self.lstm_script = RNNEncoder(321, 150, bidirectional=True, dropout_p=0,
                                      n_layers=1, rnn_type="lstm")

        self.script_on = "script" in args.stream_type
        self.vbb_on = "visual_bb" in args.stream_type
        self.vmeta_on = "visual_meta" in args.stream_type
        # self.conv_pool = Conv1d(n_dim * 4 + 1, n_dim * 2)

        # 21 named speakers (incl. 'None') plus one extra row.
        self.character = nn.Parameter(torch.randn(22, D, device=args.device, dtype=torch.float),
                                      requires_grad=True)
        self.norm1 = Norm(D)

        self.lang_proj = nn.Linear(768, 300)     # RoBERTa hidden size -> shared 300-dim space
        self.visual_proj = nn.Linear(2048, 300)  # visual feature size -> shared 300-dim space
        # self.mh_video = nn.MultiheadAttention(300, 6)
        # self.context_gru = nn.GRU(300, 150, bidirectional=True, batch_first=True)
        self.cross1 = UtilityLayer(300)
        self.cross2 = UtilityLayer(300)
        self.cross3 = UtilityLayer(300)
        self.context_proj = nn.Linear(5 * 300, 300)
        self.char_classifier = nn.Linear(300, 21)
        self.mask_classifier = nn.Linear(300, self.tokenizer.vocab_size)
        self.output = nn.Linear(300, 1)
        self.answer_rnn = nn.LSTM(300, 300, 1, batch_first=True, dropout=0)

        speaker_name = [
            'None',  # index 0: unknown speaker
            'Anna', 'Chairman', 'Deogi', 'Dokyung', 'Gitae',
            'Haeyoung1', 'Haeyoung2', 'Heeran', 'Hun', 'Jeongsuk',
            'Jinsang', 'Jiya', 'Kyungsu', 'Sangseok', 'Seohee',
            'Soontack', 'Sukyung', 'Sungjin', 'Taejin', 'Yijoon'
        ]
        self.speaker_to_index = {name: index for index, name in enumerate(speaker_name)}
        self.index_to_speaker = {v: k for k, v in self.speaker_to_index.items()}

        if self.script_on:
            self.lstm_script = RNNEncoder(321, 150, bidirectional=True, dropout_p=0,
                                          n_layers=1, rnn_type="lstm")
            self.classifier_script = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_script = CharMatching(4, D, D)
        if self.vmeta_on:
            self.lstm_vmeta = RNNEncoder(321, 150, bidirectional=True, dropout_p=0,
                                         n_layers=1, rnn_type="lstm")
            self.classifier_vmeta = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_vmeta = CharMatching(4, D, D)
        if self.vbb_on:
            self.lstm_vbb = RNNEncoder(image_dim + 21, 150, bidirectional=True, dropout_p=0,
                                       n_layers=1, rnn_type="lstm")
            self.vbb_fc = nn.Sequential(
                nn.Dropout(0.5),
                nn.Linear(image_dim, n_dim),
                nn.Tanh(),
            )
            self.classifier_vbb = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_vbb = CharMatching(4, D, D)
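
# Hedged sanity check (not part of the model): the bidirectional GRU video
# encoder should emit 300-dim features, matching lang_proj / visual_proj above.
# image_dim=2048 is assumed from visual_proj; batch and length are arbitrary.
def _variant_a_video_encoder_shape_check(image_dim=2048, batch=2, length=7):
    enc = nn.GRU(image_dim + 21, 150, bidirectional=True, batch_first=True)
    frames = torch.randn(batch, length, image_dim + 21)  # frame feature + 21-way speaker one-hot
    out, _ = enc(frames)
    assert out.shape == (batch, length, 300)  # 2 directions * 150 hidden = 300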
class ModelVariantB(nn.Module):  # hypothetical name; original class not shown
    def __init__(self, args, vocab, n_dim, image_dim, layers, dropout, num_choice=5):
        super().__init__()
        self.vocab = vocab
        V = len(vocab)
        D = n_dim
        self.hidden_dim = n_dim
        self.embedding = nn.Embedding(V, D)
        n_dim = args.n_dim
        image_dim = args.image_dim

        # 30525 is presumably 30522 (the bert-base-uncased vocab) plus 3 added
        # tokens; the checkpoint loaded below is bert-base-cased (vocab 28996),
        # so resize_token_embeddings simply grows its embedding matrix.
        bert_vocab_size = 30525
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.bert.resize_token_embeddings(bert_vocab_size)
        self.bert_dim = 768

        self.cmat = ContextMatching(n_dim * 3)
        self.lstm_raw = RNNEncoder(self.bert_dim, 150, bidirectional=True, dropout_p=0,
                                   n_layers=1, rnn_type="lstm")
        self.lstm_script = RNNEncoder(321, 150, bidirectional=True, dropout_p=0,
                                      n_layers=1, rnn_type="lstm")

        self.script_on = "script" in args.stream_type
        self.vbb_on = "visual_bb" in args.stream_type
        self.vmeta_on = "visual_meta" in args.stream_type

        self.conv_pool = Conv1d(n_dim * 4 + 1, n_dim * 2)
        self.util = UtilityLayer(hidden_dim=300, feedforward_dim=600, n_head=10, dropout=0.1)
        self.util2 = UtilityLayer(hidden_dim=300, feedforward_dim=600, n_head=10, dropout=0.1)
        self.summary_s = SummaryAttn(300, 8, 0.1)
        self.summary_m = SummaryAttn(300, 8, 0.1)
        self.summary_b = SummaryAttn(300, 8, 0.1)
        self.summary_q = SummaryAttn(300, 5, 0.1)

        self.character = nn.Parameter(torch.randn(22, D, device=args.device, dtype=torch.float),
                                      requires_grad=True)
        self.norm1 = Norm(D)
        self.output = nn.Sequential(nn.Linear(4 * 300, 300), nn.PReLU())
        self.linear_addit = nn.Sequential(nn.Linear(1800, 300), nn.PReLU())

        if self.script_on:
            self.lstm_script = RNNEncoder(self.bert_dim + 21, 150, bidirectional=True, dropout_p=0,
                                          n_layers=1, rnn_type="lstm")
            self.classifier_script = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_script = CharMatching(4, D, D)
        if self.vmeta_on:
            self.lstm_vmeta = RNNEncoder(self.bert_dim + 21, 150, bidirectional=True, dropout_p=0,
                                         n_layers=1, rnn_type="lstm")
            self.classifier_vmeta = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_vmeta = CharMatching(4, D, D)
        if self.vbb_on:
            self.lstm_vbb = RNNEncoder(image_dim + 21, 150, bidirectional=True, dropout_p=0,
                                       n_layers=1, rnn_type="lstm")
            self.vbb_fc = nn.Sequential(
                nn.Dropout(0.5),
                nn.Linear(image_dim, n_dim),
                nn.Tanh(),
            )
            self.classifier_vbb = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_vbb = CharMatching(4, D, D)
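
# Hedged sketch of the usual pattern behind the hard-coded bert_vocab_size
# above: add tokens to the tokenizer, then size the embeddings from it. The
# token strings are illustrative placeholders, not from the original repo.
def _resize_embeddings_from_tokenizer():
    from transformers import BertTokenizer
    tok = BertTokenizer.from_pretrained('bert-base-cased')
    tok.add_tokens(['[SPK1]', '[SPK2]', '[SPK3]'])  # hypothetical extra tokens
    model = BertModel.from_pretrained('bert-base-cased')
    model.resize_token_embeddings(len(tok))  # stays in sync with the tokenizer
    return model, tok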
class ModelVariantC(nn.Module):  # hypothetical name; original class not shown
    def __init__(self, args, vocab, n_dim, image_dim, layers, dropout, num_choice=5):
        super().__init__()
        self.vocab = vocab
        V = len(vocab)
        D = n_dim

        # Special-token ids: 101/102 are BERT's [CLS]/[SEP]. Note the encoder
        # below is roberta-base, whose <s>/</s> ids are 0 and 2 (see the check
        # after this class), so these values only match a BERT tokenizer.
        self.CLS = 101
        self.SEP = 102

        self.hidden_dim = n_dim
        self.embedding = nn.Embedding(V, D)
        n_dim = args.n_dim
        image_dim = args.image_dim

        # bert_vocab_size = 30543
        self.bert_dim = 768
        # self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert = RobertaModel.from_pretrained('roberta-base')
        # self.bert.resize_token_embeddings(bert_vocab_size)
        # The language model is kept frozen in this variant.
        for param in self.bert.parameters():
            param.requires_grad = False

        self.cmat = ContextMatching(n_dim * 3)
        self.lstm_raw = RNNEncoder(self.bert_dim, 150, bidirectional=True, dropout_p=0,
                                   n_layers=1, rnn_type="lstm")
        # self.lstm_script = RNNEncoder(321, 150, bidirectional=True, dropout_p=0, n_layers=1, rnn_type="lstm")

        self.script_on = "script" in args.stream_type
        self.vbb_on = "visual_bb" in args.stream_type
        self.vmeta_on = "visual_meta" in args.stream_type

        self.conv_pool = Conv1d(n_dim * 4 + 1, n_dim * 2)
        self.character = nn.Parameter(torch.randn(22, D, device=args.device, dtype=torch.float),
                                      requires_grad=True)
        self.norm1 = Norm(D)
        self.visual_projection = nn.Sequential(nn.Linear(512, 300), nn.ReLU())
        self.person_projection = nn.Sequential(nn.Linear(512, 300), nn.ReLU())
        self.output = nn.Sequential(nn.Linear(768, 1), nn.PReLU())

        if self.script_on:
            # self.lstm_script = RNNEncoder(321, 150, bidirectional=True, dropout_p=0, n_layers=1, rnn_type="lstm")
            self.lstm_script = RNNEncoder(self.bert_dim + 21, 150, bidirectional=True, dropout_p=0,
                                          n_layers=1, rnn_type="lstm")
            self.classifier_script = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_script = CharMatching(4, D, D)
        if self.vmeta_on:
            # self.lstm_vmeta = RNNEncoder(321, 150, bidirectional=True, dropout_p=0, n_layers=1, rnn_type="lstm")
            self.lstm_vmeta = RNNEncoder(self.bert_dim + 21, 150, bidirectional=True, dropout_p=0,
                                         n_layers=1, rnn_type="lstm")
            self.classifier_vmeta = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_vmeta = CharMatching(4, D, D)
        if self.vbb_on:
            self.lstm_vbb = RNNEncoder(image_dim + 21, 150, bidirectional=True, dropout_p=0,
                                       n_layers=1, rnn_type="lstm")
            self.vbb_fc = nn.Sequential(
                nn.Dropout(0.5),
                nn.Linear(image_dim, n_dim),
                nn.Tanh(),
            )
            self.classifier_vbb = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_vbb = CharMatching(4, D, D)
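
# Quick check of the special-token mismatch flagged in ModelVariantC:
# roberta-base uses <s>=0 and </s>=2 rather than BERT's 101/102.
def _roberta_special_token_ids():
    tok = RobertaTokenizer.from_pretrained('roberta-base')
    assert tok.cls_token_id == 0
    assert tok.sep_token_id == 2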
class ModelVariantD(nn.Module):  # hypothetical name; original class not shown
    def __init__(self, args, vocab, n_dim, image_dim, layers, dropout, num_choice=5):
        super().__init__()
        self.vocab = vocab
        V = len(vocab)
        D = n_dim
        self.hidden_dim = n_dim
        self.embedding = nn.Embedding(V, D)
        n_dim = args.n_dim
        image_dim = args.image_dim

        self.cmat = ContextMatching(n_dim * 3)
        self.lstm_raw = RNNEncoder(300, 150, bidirectional=True, dropout_p=0,
                                   n_layers=1, rnn_type="lstm")
        self.lstm_script = RNNEncoder(321, 150, bidirectional=True, dropout_p=0,
                                      n_layers=1, rnn_type="lstm")

        self.script_on = "script" in args.stream_type
        self.vbb_on = "visual_bb" in args.stream_type
        self.vmeta_on = "visual_meta" in args.stream_type

        self.conv_pool = Conv1d(n_dim * 4 + 1, n_dim * 2)
        self.util = UtilityLayer(hidden_dim=300, feedforward_dim=1024, n_head=10, dropout=0.1)
        self.util2 = UtilityLayer(hidden_dim=300, feedforward_dim=1024, n_head=10, dropout=0.1)
        self.util3 = UtilityLayer(hidden_dim=300, feedforward_dim=1024, n_head=10, dropout=0.1)
        self.summary_s = SummaryAttn(300, 3, 0.1)
        self.summary_m = SummaryAttn(300, 3, 0.1)
        self.summary_b = SummaryAttn(300, 3, 0.1)
        self.summary_f = SummaryAttn(300, 3, 0.1)
        self.summary_q = SummaryAttn(300, 3, 0.1)
        self.summary_addit = SummaryAttn(300, 3, 0.1)

        self.character = nn.Parameter(torch.randn(22, D, device=args.device, dtype=torch.float),
                                      requires_grad=True)
        self.norm1 = Norm(D)
        self.image_projection = nn.Sequential(nn.Linear(512, 300), nn.PReLU())
        self.output = nn.Sequential(nn.Linear(4 * 300, 300), nn.PReLU())
        self.linear_addit = nn.Sequential(nn.Linear(1800 + 3, 300), nn.PReLU())

        self.mh_bb = nn.MultiheadAttention(embed_dim=300, num_heads=6)
        self.mh_script = nn.MultiheadAttention(embed_dim=300, num_heads=6)
        self.mh_meta = nn.MultiheadAttention(embed_dim=300, num_heads=6)
        self.mh_answers = nn.MultiheadAttention(embed_dim=300, num_heads=6)

        if self.script_on:
            self.lstm_script = RNNEncoder(321, 150, bidirectional=True, dropout_p=0,
                                          n_layers=1, rnn_type="lstm")
            self.classifier_script = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_script = CharMatching(4, D, D)
        if self.vmeta_on:
            self.lstm_vmeta = RNNEncoder(321, 150, bidirectional=True, dropout_p=0,
                                         n_layers=1, rnn_type="lstm")
            self.classifier_vmeta = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_vmeta = CharMatching(4, D, D)
        if self.vbb_on:
            self.lstm_vbb = RNNEncoder(image_dim + 21, 150, bidirectional=True, dropout_p=0,
                                       n_layers=1, rnn_type="lstm")
            self.vbb_fc = nn.Sequential(
                nn.Dropout(0.5),
                nn.Linear(image_dim, n_dim),
                nn.Tanh(),
            )
            self.classifier_vbb = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_vbb = CharMatching(4, D, D)
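
# Hedged usage sketch for the mh_* layers in ModelVariantD: with the default
# batch_first=False, nn.MultiheadAttention expects (seq_len, batch, embed_dim)
# inputs. Sizes are arbitrary; this mirrors a self-attention call.
def _variant_d_mha_shape_check(seq=10, batch=4, dim=300):
    attn = nn.MultiheadAttention(embed_dim=dim, num_heads=6)
    x = torch.randn(seq, batch, dim)
    out, weights = attn(x, x, x)
    assert out.shape == (seq, batch, dim)
    assert weights.shape == (batch, seq, seq)  # averaged over the 6 heads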
class ModelVariantE(nn.Module):  # hypothetical name; original class not shown
    def __init__(self, args, vocab, n_dim, image_dim, layers, dropout, num_choice=5):
        super().__init__()
        self.vocab = vocab
        V = len(vocab)
        D = n_dim
        self.hidden_dim = n_dim

        # self.bert = BertModel.from_pretrained('bert-base-uncased')
        encoder_layer = nn.TransformerEncoderLayer(d_model=n_dim, nhead=6,
                                                   dim_feedforward=1024, dropout=0.5,
                                                   activation='gelu')
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)
        # self.transformer = nn.Transformer(d_model=n_dim, nhead=6)

        self.embedding = nn.Embedding(V, D)
        n_dim = args.n_dim
        image_dim = args.image_dim

        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.language_model = RobertaModel.from_pretrained('roberta-base', return_dict=True)
        for param in self.language_model.base_model.parameters():
            param.requires_grad = False

        self.cmat = ContextMatching(n_dim * 3)
        self.lstm_raw = RNNEncoder(300, 150, bidirectional=True, dropout_p=0,
                                   n_layers=1, rnn_type="lstm")
        self.lstm_script = RNNEncoder(321, 150, bidirectional=True, dropout_p=0,
                                      n_layers=1, rnn_type="lstm")

        self.script_on = "script" in args.stream_type
        self.vbb_on = "visual_bb" in args.stream_type
        self.vmeta_on = "visual_meta" in args.stream_type

        self.conv_pool = Conv1d(n_dim * 4 + 1, n_dim * 2)
        self.character = nn.Parameter(torch.randn(22, D, device=args.device, dtype=torch.float),
                                      requires_grad=True)
        self.norm1 = Norm(D)
        self.output = nn.Sequential(nn.Linear(5 * 300, 5), nn.Softmax(dim=1))

        if self.script_on:
            self.lstm_script = RNNEncoder(321, 150, bidirectional=True, dropout_p=0,
                                          n_layers=1, rnn_type="lstm")
            self.classifier_script = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_script = CharMatching(4, D, D)
        if self.vmeta_on:
            self.lstm_vmeta = RNNEncoder(321, 150, bidirectional=True, dropout_p=0,
                                         n_layers=1, rnn_type="lstm")
            self.classifier_vmeta = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_vmeta = CharMatching(4, D, D)
        if self.vbb_on:
            self.lstm_vbb = RNNEncoder(image_dim + 21, 150, bidirectional=True, dropout_p=0,
                                       n_layers=1, rnn_type="lstm")
            self.vbb_fc = nn.Sequential(
                nn.Dropout(0.5),
                nn.Linear(image_dim, n_dim),
                nn.Tanh(),
            )
            self.classifier_vbb = nn.Sequential(nn.Linear(n_dim * 2, 1), nn.Softmax(dim=1))
            self.mhattn_vbb = CharMatching(4, D, D)
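
# Hedged instantiation sketch. The args fields (n_dim, image_dim, stream_type,
# device) are the ones the constructors above actually read; the values, the
# vocab stub, and the ModelVariantE name are illustrative only.
def _build_variant_e_example():
    from types import SimpleNamespace
    vocab = {str(i): i for i in range(1000)}  # stand-in vocabulary
    args = SimpleNamespace(n_dim=300, image_dim=2048,
                           stream_type=['script', 'visual_meta', 'visual_bb'],
                           device=torch.device('cpu'))
    return ModelVariantE(args, vocab, n_dim=300, image_dim=2048,
                         layers=2, dropout=0.1)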