def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', grouping='single_scale'):
    super(MCB_LIDAR, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.attention = MCBAttention(ques_feat_size, 512, 512, 8000)
    # 8000-d MCB output + question encoding + 1024-d lidar feature
    self.classifier = AnswerModule(8000 + ques_feat_size + 1024, num_classes,
                                   (256, ), use_batchnorm=True, dropout=0.5)
    self.mcb = MCBPolling(512, 8000, n_modalities=2)
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.grouping = grouping
    if self.grouping == 'single_scale':
        self.lidar_module = LidarSsgModule(normal_channel=False)
    if self.grouping == 'multi_scale':
        self.lidar_module = LidarMsgModule(normal_channel=False)
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm'):
    super(LSTM_BASIC, self).__init__()
    self.qa = qa
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.classifier = AnswerModule(ques_feat_size, num_classes, (256, ))
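# The vocab-loading / embedding setup above is repeated verbatim in every model
# constructor in this file. A small helper could consolidate it; the sketch
# below is illustrative only (the function name and its placement are not part
# of the original code), assuming load_vocab returns a dict whose
# 'question_token_to_idx' mapping contains a '<NULL>' padding token.
def build_question_embeddings(args, embed_dim=200):
    """Load the question vocab and build a padded nn.Embedding over its tokens."""
    vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    num_tokens = len(vocab['question_token_to_idx'])
    padding = vocab['question_token_to_idx']['<NULL>']
    embeddings = nn.Embedding(num_tokens, embed_dim, padding_idx=padding)
    return vocab, embeddings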
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', method='concat'):
    super(MCB, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.attention = MCBAttention(ques_feat_size, 512, 512, 8000)
    self.classifier = AnswerModule(8000, num_classes, (256, ),
                                   use_batchnorm=True, dropout=0.5)  # 3584 if method is 'concat'
    self.mcb = MCBPolling(512, 8000, n_modalities=2)
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.method = method
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', method='concat'):
    super(MUTAN, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.opt = dict(
        dim_v=512,
        dim_q=1024,
        attention=dict(nb_glimpses=2, dim_hv=310, dim_hq=310, dim_mm=510, R=5,
                       dropout_v=0.5, dropout_q=0.5, dropout_mm=0.5,
                       activation_v="tanh", activation_q="tanh",
                       dropout_hv=0, dropout_hq=0),
        fusion=dict(dim_hv=620, dim_hq=310, dim_mm=510, R=5,
                    dropout_v=0.5, dropout_q=0.5,
                    activation_v="tanh", activation_q="tanh",
                    dropout_hv=0, dropout_hq=0))
    self.attention = MutanAtt(self.opt)
    self.classifier = AnswerModule(self.opt['fusion']['dim_mm'], num_classes, (),
                                   use_batchnorm=True, dropout=0.5)  # 3584 if method is 'concat'
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.method = method
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', grouping='single_scale'):
    super(MUTAN_LIDAR, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.opt = dict(
        dim_v=512,
        dim_q=1024,
        attention=dict(nb_glimpses=2, dim_hv=310, dim_hq=310, dim_mm=510, R=5,
                       dropout_v=0.5, dropout_q=0.5, dropout_mm=0.5,
                       activation_v="tanh", activation_q="tanh",
                       dropout_hv=0, dropout_hq=0),
        fusion=dict(dim_hv=620, dim_hq=310, dim_mm=510, R=5,
                    dropout_v=0.5, dropout_q=0.5,
                    activation_v="tanh", activation_q="tanh",
                    dropout_hv=0, dropout_hq=0))
    self.attention = MutanAtt(self.opt)
    # fused MUTAN feature + 1024-d lidar feature + question encoding
    self.classifier = AnswerModule(self.opt['fusion']['dim_mm'] + 1024 + ques_feat_size,
                                   num_classes, (), use_batchnorm=True, dropout=0.5)
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.grouping = grouping
    if self.grouping == 'single_scale':
        self.lidar_module = LidarSsgModule(normal_channel=False)
    if self.grouping == 'multi_scale':
        self.lidar_module = LidarMsgModule(normal_channel=False)
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', method='concat'):
    super(DAN, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder,
                                          bidirectional=True, give_last=False)
    self.attention = DANAttention(ques_feat_size, 512, 512, 2)
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.method = method
    self.softmax = nn.Softmax(dim=1)
    self.tanh = nn.Tanh()
    if self.method == 'concat':
        self.classifier = AnswerModule(2 * 512 * 7 + ques_feat_size, num_classes,
                                       (256, ), use_batchnorm=True, dropout=0.5)
    else:
        self.classifier = AnswerModule(2 * 512, num_classes, (256, ),
                                       use_batchnorm=True, dropout=0.5)
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', method='concat'):
    super(SAN, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.attention = StackedAttention(ques_feat_size, 512, 512, 2)
    self.method = method
    if self.method == 'concat':
        self.classifier = AnswerModule(3584, num_classes, (256, ),
                                       use_batchnorm=True, dropout=0.5)
    if self.method == 'hierarchical':
        self.classifier = AnswerModule(512, num_classes, (256, ),
                                       use_batchnorm=True, dropout=0.5)
    self.linweights = nn.Sequential(
        nn.Linear(512, 256),
        nn.ReLU(inplace=True),
        nn.Dropout(0.5),
        nn.Linear(256, 7),  # no. of glimpses
        nn.Sigmoid())
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', grouping='single_scale'):
    super(MLB_LIDAR, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.opt = dict(
        dim_v=512,
        dim_q=1024,
        attention=dict(nb_glimpses=4, dim_h=1200,
                       dropout_v=0.5, dropout_q=0.5, dropout_mm=0.5,
                       activation_v="tanh", activation_q="tanh",
                       activation_mm="tanh"),
        fusion=dict(dim_h=1200, dropout_v=0.5, dropout_q=0.5,
                    activation_v="tanh", activation_q="tanh"))
    self.attention = MLBAtt(self.opt)
    # fused MLB features (one per glimpse) + question encoding + 1024-d lidar feature
    self.classifier = AnswerModule(
        self.opt['fusion']['dim_h'] * self.opt['attention']['nb_glimpses']
        + ques_feat_size + 1024,
        num_classes, (), use_batchnorm=True, dropout=0.5)
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.grouping = grouping
    if self.grouping == 'single_scale':
        self.lidar_module = LidarSsgModule(normal_channel=False)
    if self.grouping == 'multi_scale':
        self.lidar_module = LidarMsgModule(normal_channel=False)
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', grouping='single_scale'):
    super(MFB_LIDAR, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder,
                                          give_last=False)
    self.attention = CoAtt(ques_feat_size, image_feature_size, 2)
    self.classifier = AnswerModule(500 + ques_feat_size + 1024, num_classes, (256, ),
                                   use_batchnorm=True, dropout=0.5)  # 3500 if method is 'concat', else 500
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.grouping = grouping
    if self.grouping == 'single_scale':
        self.lidar_module = LidarSsgModule(normal_channel=False)
    if self.grouping == 'multi_scale':
        self.lidar_module = LidarMsgModule(normal_channel=False)
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', method='dot'):
    super(CNN_LSTM, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.image_features_resize = nn.Linear(self.image_feat_size, ques_feat_size)
    self.method = method
    if self.method == 'dot':
        self.classifier = AnswerModule(ques_feat_size, num_classes,
                                       use_batchnorm=True, dropout=0.5)
    if self.method == 'concat':
        self.classifier = AnswerModule(ques_feat_size + image_feature_size * 7,
                                       num_classes, use_batchnorm=True, dropout=0.5)
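# A minimal sketch of the 'dot' fusion path implied by the constructor above:
# image features are projected to ques_feat_size and fused with the question
# encoding by elementwise product before classification. This is an
# illustration, not the repository's actual forward(); it assumes the question
# module returns a (B, ques_feat_size) encoding and image_feature is a single
# (B, image_feat_size) vector (the 'concat' classifier's "* 7" suggests the
# concat path instead flattens 7 per-view features).
def cnn_lstm_dot_fusion(model, question, image_feature, ques_lengths):
    q = model.question_module(question, ques_lengths)  # assumed (B, ques_feat_size)
    v = model.image_features_resize(image_feature)     # project to ques_feat_size
    return model.classifier(q * v)                     # elementwise-product fusion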
def __init__(self, **kwargs):
    if 'question_h5' not in kwargs:
        raise ValueError('Must give question_h5')
    if 'image_feature_h5' not in kwargs:
        raise ValueError('Must give image_feature_h5')
    if 'lidar_feature_h5' not in kwargs:
        raise ValueError('Must give lidar_feature_h5')
    image_feature_h5_path = kwargs.pop('image_feature_h5')
    lidar_feature_h5_path = kwargs.pop('lidar_feature_h5')
    load_lidar = kwargs.pop('load_lidar')
    vocab_path = kwargs.pop('vocab')
    vocab = load_vocab(vocab_path)
    question_h5_path = kwargs.pop('question_h5')
    print('Reading questions from ', question_h5_path)
    # The h5 file is closed when the with-block exits, but all questions have
    # already been loaded into the dataset by then.
    with h5py.File(question_h5_path, 'r') as question_h5:
        self.dataset = ArgoDataset(question_h5, image_feature_h5_path,
                                   lidar_feature_h5_path, vocab=vocab,
                                   load_lidar=load_lidar)
    kwargs['collate_fn'] = argo_collate
    super(ArgoDataLoader, self).__init__(self.dataset, **kwargs)
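# Hedged usage sketch for ArgoDataLoader: the required keyword arguments are
# exactly the ones validated and popped above; any remaining kwargs
# (batch_size, shuffle, ...) presumably fall through to the underlying
# torch DataLoader via the super().__init__ call. File paths are placeholders.
loader = ArgoDataLoader(question_h5='data/train_questions.h5',
                        image_feature_h5='data/train_image_feats.h5',
                        lidar_feature_h5='data/train_lidar_feats.h5',
                        vocab='data/vocab.json',
                        load_lidar=True,
                        batch_size=32,
                        shuffle=True)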
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', method='concat',
             grouping='single_scale'):
    super(LIDAR_MODEL, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.method = method
    self.grouping = grouping
    if self.grouping == 'single_scale':
        self.lidar_module = LidarSsgModule(normal_channel=False)
    if self.grouping == 'multi_scale':
        self.lidar_module = LidarMsgModule(normal_channel=False)
    if self.method == 'dot':
        self.classifier = AnswerModule(ques_feat_size, num_classes,
                                       use_batchnorm=True, dropout=0.5)
    if self.method == 'concat':
        self.classifier = AnswerModule(ques_feat_size + 1024, num_classes,
                                       use_batchnorm=True, dropout=0.5)
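# The grouping flag selects between PointNet++-style single-scale (SSG) and
# multi-scale (MSG) grouping for the lidar branch. A hypothetical helper
# consolidating the branch pattern used across the *_LIDAR constructors
# (the function name is illustrative, not part of the original code):
def build_lidar_module(grouping):
    if grouping == 'single_scale':
        return LidarSsgModule(normal_channel=False)
    if grouping == 'multi_scale':
        return LidarMsgModule(normal_channel=False)
    raise ValueError('Unknown grouping: %s' % grouping)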
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', grouping='single_scale'):
    super(SAN_LIDAR, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.attention = StackedAttention(ques_feat_size, 512, 512, 2)
    self.classifier = AnswerModule(512, num_classes, (256, ),
                                   use_batchnorm=True, dropout=0.5)
    self.linweights = nn.Sequential(
        nn.Linear(512, 256),
        nn.ReLU(inplace=True),
        nn.Dropout(0.5),
        nn.Linear(256, 7),  # no. of glimpses
        nn.Sigmoid())
    self.grouping = grouping
    if self.grouping == 'single_scale':
        self.lidar_module = LidarSsgModule(normal_channel=False)
    if self.grouping == 'multi_scale':
        self.lidar_module = LidarMsgModule(normal_channel=False)
def visualize_loop(args, val_loader):
    image_feature_size = 512
    lidar_feature_size = 1024
    if args.model_type == 'SAN':
        question_feat_size = 512
        model = SAN(args, question_feat_size, image_feature_size,
                    lidar_feature_size, num_classes=34, qa=None,
                    encoder=args.encoder_type, method='hierarchical')
    if args.model_type == 'MCB':
        question_feat_size = 512
        model = MCB(args, question_feat_size, image_feature_size,
                    lidar_feature_size, num_classes=34, qa=None,
                    encoder=args.encoder_type, method='hierarchical')
    if args.model_type == 'MFB':
        question_feat_size = 512
        model = MFB(args, question_feat_size, image_feature_size,
                    lidar_feature_size, num_classes=34, qa=None,
                    encoder=args.encoder_type, method='hierarchical')
    if args.model_type == 'MLB':
        question_feat_size = 1024
        image_feature_size = 512
        model = MLB(args, question_feat_size, image_feature_size,
                    lidar_feature_size, num_classes=34, qa=None,
                    encoder=args.encoder_type, method='hierarchical')
    if args.model_type == 'MUTAN':
        question_feat_size = 1024
        image_feature_size = 512
        model = MUTAN(args, question_feat_size, image_feature_size,
                      lidar_feature_size, num_classes=34, qa=None,
                      encoder=args.encoder_type, method='hierarchical')
    if args.model_type == 'DAN':
        question_feat_size = 512
        model = DAN(args, question_feat_size, image_feature_size,
                    lidar_feature_size, num_classes=34, qa=None,
                    encoder=args.encoder_type, method='hierarchical')

    data = load_weights(args, model, optimizer=None)
    if type(data) == list:
        model, optimizer, start_epoch, loss, accuracy = data
        print("Loaded weights")
        print("Epoch: %d, loss: %.3f, Accuracy: %.4f "
              % (start_epoch, loss, accuracy), flush=True)
    else:
        print("Error occurred while loading model; training freshly")
        model = data
        return

    model.to(device=args.device)
    model.eval()

    import argoverse
    from argoverse.data_loading.argoverse_tracking_loader import ArgoverseTrackingLoader
    from argoverse.utils.json_utils import read_json_file
    from argoverse.map_representation.map_api import ArgoverseMap

    vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    argoverse_loader = ArgoverseTrackingLoader('../../../Data/train/argoverse-tracking')

    k = 1
    with torch.no_grad():
        for data in tqdm(val_loader):
            question, image_feature, ques_lengths, point_set, answer, image_name = data
            question = question.to(device=args.device)
            ques_lengths = ques_lengths.to(device=args.device)
            image_feature = image_feature.to(device=args.device)
            point_set = point_set.to(device=args.device)

            pred, wgt, energies = model(question, image_feature, ques_lengths, point_set)

            question = question.cpu().data.numpy()
            answer = answer.cpu().data.numpy()
            pred = F.softmax(pred, dim=1)
            pred = torch.argmax(pred, dim=1)
            pred = np.asarray(pred.cpu().data)
            wgt = wgt.cpu().data.numpy()
            energies = energies.squeeze(1).cpu().data.numpy()
            ques_lengths = ques_lengths.cpu().data.numpy()

            pat = re.compile(r'(.*)@(.*)')
            _, keep = np.where([answer == pred])
            temp_batch_size = question.shape[0]
            for b in range(temp_batch_size):
                q = get_ques(question[b], ques_lengths[b], vocab)
                ans = get_ans(answer[b])
                pred_ans = get_ans(pred[b])
                # image_name encodes "<log_id>@<frame_idx>"
                c = list(re.findall(pat, image_name[b]))[0]
                log_id = c[0]
                idx = int(c[1])
                print(k)
                argoverse_data = argoverse_loader.get(log_id)
                if args.model_type == 'SAN':
                    plot_att(argoverse_data, idx, wgt[b, :, 1, :], energies[b],
                             q, ans, args.save_dir, k, pred_ans)
                if args.model_type == 'MCB':
                    plot_att(argoverse_data, idx, wgt[b], energies[b],
                             q, ans, args.save_dir, k, pred_ans)
                if args.model_type == 'MFB':
                    plot_att(argoverse_data, idx, wgt[b, :, :, 1], energies[b],
                             q, ans, args.save_dir, k, pred_ans)
                if args.model_type == 'MLB':
                    plot_att(argoverse_data, idx, wgt[b, :, 3, :], energies[b],
                             q, ans, args.save_dir, k, pred_ans)
                if args.model_type == 'MUTAN':  # only two glimpses
                    plot_att(argoverse_data, idx, wgt[b, :, 1, :], energies[b],
                             q, ans, args.save_dir, k, pred_ans)
                if args.model_type == 'DAN':  # only two memories
                    plot_att(argoverse_data, idx, wgt[b, :, 1, :], energies[b],
                             q, ans, args.save_dir, k, pred_ans)
                k = k + 1