# FrequencyBias.__init__: variant that builds the count matrices for a
# selectable dataset. Assumes numpy as np, torch, torch.nn as nn, get_counts,
# and the dataset classes are imported at module level.
def __init__(self, dbname='vg', eps=1e-3):
    super(FrequencyBias, self).__init__()
    if dbname == 'vg':
        db = VG
    elif dbname == 'vg200':
        db = VG200
    elif dbname == 'vg200_kr':
        db = VG200_Keyrel
    elif dbname == 'vg200_kr_cap':
        db = VG200_Keyrel_captions
    elif dbname == 'vrd':
        db = VRD
    else:
        raise ValueError('unknown dataset: %s' % dbname)
    fg_matrix, bg_matrix = get_counts(
        train_data=db(mode='train', filter_duplicate_rels=False),
        must_overlap=True)
    bg_matrix += 1
    fg_matrix[:, :, 0] = bg_matrix
    pred_dist = np.log(fg_matrix / fg_matrix.sum(2)[:, :, None] + eps)

    self.num_objs = pred_dist.shape[0]
    pred_dist = torch.FloatTensor(pred_dist).view(-1, pred_dist.shape[2])
    self.obj_baseline = nn.Embedding(pred_dist.size(0), pred_dist.size(1))
    self.obj_baseline.weight.data = pred_dist
def __init__(self, train_data, eps=1e-3):
    super(FrequencyBias, self).__init__()
    fg_matrix, bg_matrix = get_counts(train_data, must_overlap=True)
    bg_matrix += 1
    fg_matrix[:, :, 0] = bg_matrix
    pred_dist = np.log(fg_matrix / fg_matrix.sum(2)[:, :, None] + eps)

    self.num_objs = pred_dist.shape[0]
    pred_dist = torch.FloatTensor(pred_dist).view(-1, pred_dist.shape[2])
    self.obj_baseline = nn.Embedding(pred_dist.size(0), pred_dist.size(1))
    self.obj_baseline.weight.data = pred_dist
# FrequencyBias.__init__: softmax variant that also keeps a normalized copy of
# the table on the GPU. Note the background smoothing (bg_matrix += 1) is
# commented out here, and eps moves into the denominator.
def __init__(self, eps=1e-3):
    super(FrequencyBias, self).__init__()
    fg_matrix, bg_matrix = get_counts(must_overlap=True)
    # bg_matrix += 1
    fg_matrix[:, :, 0] = bg_matrix
    pred_dist = fg_matrix / (fg_matrix.sum(2)[:, :, None] + eps)

    self.num_objs = pred_dist.shape[0]
    pred_dist = torch.FloatTensor(pred_dist).view(-1, pred_dist.shape[2])
    pred_dist_log = torch.nn.functional.log_softmax(pred_dist, dim=-1)
    self.obj_baseline = nn.Embedding(pred_dist.size(0), pred_dist.size(1))
    self.obj_baseline.weight.data = pred_dist_log
    self.obj_baseline_state = torch.nn.functional.softmax(
        pred_dist.clone().cuda(), dim=-1)
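# Usage sketch for the FrequencyBias variants above (an assumption, not code
# from the source): the (subject, object) table is flattened row-major, so a
# pair (s, o) indexes row s * num_objs + o. `index_with_labels` is the method
# name used for this lookup in the neural-motifs codebase.
def index_with_labels(self, labels):
    """labels: LongTensor of shape [batch_size, 2] holding (subject, object)
    class indices. Returns the bias, shape [batch_size, num_predicates]."""
    return self.obj_baseline(labels[:, 0] * self.num_objs + labels[:, 1])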
# Training-script setup: build the VG splits and loaders, gather the count
# matrices, and construct the detector and classifier.
MUST_OVERLAP = False
train, val, test = VG.splits(num_val_im=conf.val_size,
                             filter_non_overlap=MUST_OVERLAP,
                             filter_duplicate_rels=True,
                             use_proposals=conf.use_proposals)
if conf.test:
    print("test data!")
    val = test
train_loader, val_loader = VGDataLoader.splits(train, val, mode='rel',
                                               batch_size=conf.batch_size,
                                               num_workers=conf.num_workers,
                                               num_gpus=conf.num_gpus)
fg_matrix, bg_matrix = get_counts(train_data=train, must_overlap=MUST_OVERLAP)
detector = ObjectDetector(
    classes=train.ind_to_classes,
    num_gpus=conf.num_gpus,
    mode='rpntrain' if not conf.use_proposals else 'proposals',
    use_resnet=conf.use_resnet,
    nms_filter_duplicates=True,
    thresh=0.01)
detector.eval()
detector.cuda()
classifier = ObjectDetector(classes=train.ind_to_classes,
                            num_gpus=conf.num_gpus,
                            mode='gtbox',
                            use_resnet=conf.use_resnet,
                            ...)  # remaining arguments truncated in the source
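# Illustrative sanity check, not in the source: fg_matrix[s, o, p] counts
# training triples with subject class s, object class o and predicate p, while
# bg_matrix[s, o] counts overlapping pairs with no annotated predicate. The
# class names 'man' and 'horse' are assumptions about the label set.
s = train.ind_to_classes.index('man')
o = train.ind_to_classes.index('horse')
top5 = fg_matrix[s, o].argsort()[::-1][:5]
print([train.ind_to_predicates[p] for p in top5])  # most frequent predicates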
def val_epoch(mode, sgg_model, loader, name, triplet_counts, triplet2str,
              n_batches=-1, is_test=False, save_scores=False,
              predicate_weight=0, train=None, wandb_log=None, **kwargs):
    print('\nEvaluate %s %s triplets' % (name.upper(),
                                         'test' if is_test else 'val'))
    sgg_model.eval()
    evaluator, all_pred_entries, all_metrics = {}, {}, []
    EVAL_MODES = ['sgdet'] if mode == 'sgdet' else ['predcls', 'sgcls']
    assert mode in EVAL_MODES, (mode, 'other modes not supported')

    predicate_weights = None
    if predicate_weight != 0:
        fg_matrix, bg_matrix = get_counts(train, must_overlap=True)
        fg_matrix[:, :, 0] = bg_matrix + 1
        fg_matrix = fg_matrix + 1
        predicate_weights = fg_matrix.mean(axis=(0, 1)) ** predicate_weight

    with NO_GRAD():
        for eval_m in EVAL_MODES:
            if eval_m == 'sgdet' and name.find('val_') >= 0:
                continue  # skip sgdet on validation splits; it takes a lot of time

            print('\nEvaluating %s...' % eval_m.upper())

            evaluator[eval_m] = BasicSceneGraphEvaluator(eval_m)  # graph-constrained evaluator
            evaluator[eval_m + '_nogc'] = BasicSceneGraphEvaluator(
                eval_m,
                multiple_preds=True,  # graph-unconstrained evaluator
                per_triplet=name in all_shot_splits,
                triplet_counts=triplet_counts,
                triplet2str=triplet2str)

            # Per-predicate evaluators for computing the recall of each
            # relationship class (index 0, "no relationship", is skipped).
            evaluator_list, evaluator_multiple_preds_list = [], []
            if name not in ['val_zs', 'test_zs'] and name.find('val_') < 0:
                for index, name_s in enumerate(loader.dataset.ind_to_predicates):
                    if index == 0:
                        continue
                    evaluator_list.append(
                        (index, name_s, BasicSceneGraphEvaluator.all_modes()))
                    evaluator_multiple_preds_list.append(
                        (index, name_s,
                         BasicSceneGraphEvaluator.all_modes(multiple_preds=True)))

            set_mode(sgg_model, mode=eval_m, is_train=False, verbose=True)

            # Run all val/test batches.
            all_pred_entries[eval_m] = []
            for val_b, batch in enumerate(tqdm(loader)):
                pred_entry = val_batch(sgg_model, val_b, batch, evaluator,
                                       eval_m, loader.dataset, evaluator_list,
                                       evaluator_multiple_preds_list,
                                       train=train,
                                       predicate_weights=predicate_weights,
                                       **kwargs)
                if save_scores:
                    all_pred_entries[eval_m].extend(pred_entry)
                if n_batches > -1 and val_b + 1 >= n_batches:
                    break

            evaluator[eval_m].print_stats()
            evaluator[eval_m + '_nogc'].print_stats()

            # Compute mean-recall results.
            mean_recall = mean_recall_mp = None
            if len(evaluator_list) > 0:
                mean_recall = calculate_mR_from_evaluator_list(
                    evaluator_list, eval_m, save_file=None)
                mean_recall_mp = calculate_mR_from_evaluator_list(
                    evaluator_multiple_preds_list, eval_m,
                    multiple_preds=True, save_file=None)

            if not wandb_log:
                continue

            # Log metrics with wandb.
            eval_gc = evaluator[eval_m].result_dict
            eval_no_gc = evaluator[eval_m + '_nogc'].result_dict
            results_dict = {}
            for eval_, mean_eval, sfx in zip([eval_gc, eval_no_gc],
                                             [mean_recall, mean_recall_mp],
                                             ['GC', 'NOGC']):
                for k, v in eval_[eval_m + '_recall'].items():
                    all_metrics.append(np.mean(v))
                    results_dict['%s/%s_R@%i_%s' % (eval_m, name, k, sfx)] = np.mean(v)
                if mean_eval:
                    for k, v in mean_eval.items():
                        results_dict['%s/%s_m%s_%s' % (eval_m, name, k, sfx)] = np.mean(v)

            # Per-triplet metrics.
            try:
                if name in all_shot_splits:
                    for case in ['', '_norm']:
                        for k, v in eval_no_gc[eval_m + '_recall_triplet' + case].items():
                            results_dict['%s/%s_R@%i_triplet%s' % (eval_m, name, k, case)] = v
                        for metric in ['meanrank', 'medianrank'] + (
                                ['medianrankclass'] if case == '' else []):
                            results_dict['%s/%s_%s_triplet%s' % (eval_m, name, metric, case)] = \
                                eval_no_gc[eval_m + ('_%s_triplet' % metric) + case]
            except Exception as e:
                print('error in per triplet eval', e)
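# Hypothetical sketch of how `predicate_weights` could be applied; the real
# application happens inside val_batch, which is not shown here. Dividing by
# frequency-derived weights down-weights head predicates for
# predicate_weight > 0 and up-weights them for predicate_weight < 0.
def reweight_rel_scores(rel_scores, predicate_weights):
    # rel_scores: array of shape [num_pairs, num_predicates]
    return rel_scores / predicate_weights[None, :]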
def __init__(self, vocabs, vocab_size, input_encoding_size, rnn_type='lstm',
             rnn_size=512, num_layers=1, drop_prob_lm=0.5, seq_length=16,
             seq_per_img=5, fc_feat_size=4096, att_feat_size=512,
             num_relation=20, object_classes=None, predicate_classes=None,
             triplet_embed_dim=-1, embed_triplet=True, freq_bl=False):
    super(RelCaptionModel, self).__init__()
    self.vocabs = vocabs
    self.vocabs['0'] = '__SENTSIGN__'
    # Re-key the vocab from string keys to integer indices.
    self.vocabs = {i: self.vocabs[str(i)] for i in range(len(self.vocabs))}
    vocab_list = [self.vocabs[i] for i in range(len(self.vocabs))]
    self.vocab_size = vocab_size + 1  # all words plus <UNK>; index 0 is <start>/<end>
    self.input_encoding_size = input_encoding_size
    self.rnn_type = rnn_type
    self.rnn_size = rnn_size
    self.num_layers = num_layers
    self.drop_prob_lm = drop_prob_lm
    self.seq_length = seq_length
    self.fc_feat_size = fc_feat_size
    self.ss_prob = 0.0  # scheduled-sampling probability
    self.num_relation_per_img = num_relation
    self.seq_per_img = seq_per_img
    self.embed_triplet = embed_triplet
    self.triplet_embed_dim = triplet_embed_dim
    self.freq_bl = freq_bl

    # Map image features to the RNN hidden size.
    self.linear = nn.Linear(self.fc_feat_size, self.num_layers * self.rnn_size)
    embed_vec = obj_edge_vectors(vocab_list, wv_dim=self.input_encoding_size)
    self.embed = nn.Embedding(self.vocab_size, self.input_encoding_size)
    self.embed.weight.data = embed_vec.clone()

    if self.embed_triplet:
        assert object_classes is not None and predicate_classes is not None
        object_embed_vec = obj_edge_vectors(object_classes,
                                            wv_dim=self.triplet_embed_dim)
        predicate_embed_vec = obj_edge_vectors(predicate_classes,
                                               wv_dim=self.triplet_embed_dim)
        self.object_embed = nn.Embedding(len(object_classes),
                                         self.triplet_embed_dim)
        self.object_embed.weight.data = object_embed_vec.clone()
        self.predicate_embed = nn.Embedding(len(predicate_classes),
                                            self.triplet_embed_dim)
        self.predicate_embed.weight.data = predicate_embed_vec.clone()

    self.logit = nn.Linear(self.rnn_size, self.vocab_size)
    self.dropout = nn.Dropout(self.drop_prob_lm)
    self.core = RelCaptionCore(input_encoding_size, rnn_type, rnn_size,
                               num_layers, drop_prob_lm, fc_feat_size,
                               att_feat_size, triplet_embed_dim, embed_triplet)

    if self.freq_bl:
        self.freq_matrix, _ = get_counts(
            train_data=VG200(mode='train', filter_duplicate_rels=False,
                             num_val_im=1000),
            must_overlap=True)
    else:
        self.freq_matrix = None

    self.init_weights()
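# A minimal sketch (an assumption, not the model's code) of how
# self.freq_matrix could act as a frequency baseline at decode time:
# normalize the raw counts for one (subject, object) class pair into a
# predicate distribution.
def pair_predicate_prior(freq_matrix, subj, obj, eps=1e-8):
    counts = freq_matrix[subj, obj].astype('float64')
    return counts / (counts.sum() + eps)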
def __init__(self, classes, rel_classes, inputs_dim, hidden_dim,
             recurrent_dropout_probability=0.2, use_highway=True,
             use_input_projection_bias=True):
    """Initializes the RNN.

    Args:
        classes: object class names.
        rel_classes: relationship class names.
        inputs_dim: dimension of the inputs.
        hidden_dim: hidden dim of the decoder.
        recurrent_dropout_probability: dropout on the recurrent connections.
        use_highway: whether to add highway connections.
        use_input_projection_bias: whether the input projection has a bias.
    """
    # TODO: add database bias in this module
    super(MemoryRNN, self).__init__()
    self.classes = classes
    self.rel_classes = rel_classes
    self.hidden_size = hidden_dim
    self.inputs_dim = inputs_dim
    self.nms_thresh = 0.3

    # Per-relation memory slots for the LSTM hidden and cell states.
    # (self.num_rels is assumed to be a property defined elsewhere on the class.)
    self.rel_mem_h = nn.Embedding(self.num_rels, hidden_dim)
    self.rel_mem_h.weight.data.fill_(0)
    self.rel_mem_c = nn.Embedding(self.num_rels, hidden_dim)
    self.rel_mem_c.weight.data.fill_(0)

    self.recurrent_dropout_probability = recurrent_dropout_probability
    self.use_highway = use_highway

    # We do the projections for all the gates at once, so if we are using
    # highway layers, we need some extra projections, which is why the sizes
    # of the Linear layers depend on this flag.
    if use_highway:
        self.input_linearity = torch.nn.Linear(self.inputs_dim,
                                               6 * self.hidden_size,
                                               bias=use_input_projection_bias)
        self.state_linearity = torch.nn.Linear(self.hidden_size,
                                               5 * self.hidden_size,
                                               bias=True)
    else:
        self.input_linearity = torch.nn.Linear(self.inputs_dim,
                                               4 * self.hidden_size,
                                               bias=use_input_projection_bias)
        self.state_linearity = torch.nn.Linear(self.hidden_size,
                                               4 * self.hidden_size,
                                               bias=True)
    self.out = nn.Linear(self.hidden_size, len(self.rel_classes))
    self.reset_parameters()

    # Count-based (subject, object) -> predicate distribution.
    fg_matrix, bg_matrix = get_counts()
    rel_obj_distribution = fg_matrix / (fg_matrix.sum(2)[:, :, None] + 1e-5)
    rel_obj_distribution = torch.FloatTensor(rel_obj_distribution)
    rel_obj_distribution = rel_obj_distribution.view(-1, self.num_rels)

    # Shape: (num_obj_classes * num_obj_classes, num_rel_classes).
    self.rel_obj_distribution = nn.Embedding(rel_obj_distribution.size(0),
                                             self.num_rels)
    self.rel_obj_distribution.weight.data = rel_obj_distribution
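# Lookup sketch (an assumption; the forward pass is not shown): the table was
# flattened over (subject class, object class), so a batch of pairs indexes
# it with subj * num_obj_classes + obj, mirroring FrequencyBias above.
def pair_distribution(self, subj_labels, obj_labels):
    """Both arguments: LongTensors of object-class indices, shape [batch]."""
    return self.rel_obj_distribution(
        subj_labels * len(self.classes) + obj_labels)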
def __init__(self, classes, rel_classes, embed_dim, obj_dim, inputs_dim,
             hidden_dim, pooling_dim, recurrent_dropout_probability=0.2,
             use_highway=True, use_input_projection_bias=True,
             use_vision=True, use_bias=True, use_tanh=True,
             limit_vision=True, sl_pretrain=False, num_iter=-1):
    """Initializes the RNN.

    :param classes: object class names
    :param rel_classes: relationship class names
    :param embed_dim: dimension of the object-class embeddings
    :param obj_dim: dimension of the object features
    :param inputs_dim: dimension of the inputs
    :param hidden_dim: hidden dim of the decoder
    :param pooling_dim: dimension of the pooled visual features
    """
    super(DecoderRNN, self).__init__()
    self.rel_embedding_dim = 100
    self.classes = classes
    self.rel_classes = rel_classes

    # A 'start' token is prepended, so the table has len(classes) + 1 rows;
    # the embedding is sized from the loaded vectors to match.
    embed_vecs = obj_edge_vectors(['start'] + self.classes, wv_dim=100)
    self.obj_embed = nn.Embedding(embed_vecs.size(0), embed_vecs.size(1))
    self.obj_embed.weight.data = embed_vecs

    embed_rels = obj_edge_vectors(self.rel_classes,
                                  wv_dim=self.rel_embedding_dim)
    self.rel_embed = nn.Embedding(len(self.rel_classes),
                                  self.rel_embedding_dim)
    self.rel_embed.weight.data = embed_rels

    self.embed_dim = embed_dim
    self.obj_dim = obj_dim
    self.hidden_size = hidden_dim
    self.inputs_dim = inputs_dim
    self.pooling_dim = pooling_dim
    self.nms_thresh = 0.3

    self.use_vision = use_vision
    self.use_bias = use_bias
    self.use_tanh = use_tanh
    self.limit_vision = limit_vision
    self.sl_pretrain = sl_pretrain
    self.num_iter = num_iter

    self.recurrent_dropout_probability = recurrent_dropout_probability
    self.use_highway = use_highway

    # We do the projections for all the gates at once, so if we are using
    # highway layers, we need some extra projections, which is why the sizes
    # of the Linear layers depend on this flag. (self.input_size is assumed
    # to be a property defined elsewhere on the class.)
    if use_highway:
        self.input_linearity = torch.nn.Linear(self.input_size,
                                               6 * self.hidden_size,
                                               bias=use_input_projection_bias)
        self.state_linearity = torch.nn.Linear(self.hidden_size,
                                               5 * self.hidden_size,
                                               bias=True)
    else:
        self.input_linearity = torch.nn.Linear(self.input_size,
                                               4 * self.hidden_size,
                                               bias=use_input_projection_bias)
        self.state_linearity = torch.nn.Linear(self.hidden_size,
                                               4 * self.hidden_size,
                                               bias=True)

    # self.obj_in_lin = torch.nn.Linear(self.rel_embedding_dim, self.rel_embedding_dim, bias=True)
    self.out = nn.Linear(self.hidden_size, len(self.classes))
    self.reset_parameters()

    # For relation prediction.
    embed_vecs2 = obj_edge_vectors(self.classes, wv_dim=embed_dim)
    self.obj_embed2 = nn.Embedding(self.num_classes, embed_dim)
    self.obj_embed2.weight.data = embed_vecs2.clone()

    # self.post_lstm = nn.Linear(self.hidden_dim, self.pooling_dim * 2)
    self.post_lstm = nn.Linear(self.obj_dim + 2 * self.embed_dim + 128,
                               self.pooling_dim * 2)
    # Initialize to sqrt(1/2n) so that the outputs all have mean 0 and
    # variance 1 (half the contribution comes from the LSTM, half from the
    # embedding). In practice the pre-LSTM activations tend to have stdev 0.1,
    # hence the extra factor of 10. This may need more consideration.
    self.post_lstm.weight.data.normal_(
        0, 10.0 * math.sqrt(1.0 / self.hidden_size))
    self.post_lstm.bias.data.zero_()

    self.rel_compress = nn.Linear(self.pooling_dim, self.num_rels, bias=True)
    torch.nn.init.xavier_normal_(self.rel_compress.weight, gain=1.0)

    if self.use_bias:
        self.freq_bias = FrequencyBias()

    # Simple count-based relation model.
    from dataloaders.visual_genome import VG
    from lib.get_dataset_counts import get_counts, box_filter
    fg_matrix, bg_matrix = get_counts(
        train_data=VG.splits(num_val_im=5000,
                             filter_non_overlap=True,
                             filter_duplicate_rels=True,
                             use_proposals=False)[0],
        must_overlap=True)
    prob_matrix = fg_matrix.astype(np.float32)
    prob_matrix[:, :, 0] = bg_matrix

    # TRYING SOMETHING NEW: smooth the background counts, normalize over
    # predicates, then zero out the background column.
    prob_matrix[:, :, 0] += 1
    prob_matrix /= np.sum(prob_matrix, 2)[:, :, None]
    # prob_matrix /= float(fg_matrix.max())
    prob_matrix[:, :, 0] = 0  # zero out BG
    self.prob_matrix = prob_matrix
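# Sketch of how the count-based table could score candidate pairs at test
# time (an assumption; the corresponding forward code is not shown here):
def baseline_rel_dists(self, subj_classes, obj_classes):
    """subj_classes, obj_classes: integer arrays of predicted object classes,
    shape [num_pairs]. Returns probabilities, shape [num_pairs, num_rels]."""
    return self.prob_matrix[subj_classes, obj_classes]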