def load_embeddings(self, labels):
    """Fill ``self.embed`` with one averaged word vector per label.

    Each label is split into words on spaces or, when it contains no
    space, on underscores. Every word except ``'tree'`` is looked up in
    the word-to-row index loaded from
    ``self.const.embed_word_to_idx_json`` and its row is read from the
    ``'embeddings'`` dataset of the HDF5 file at
    ``self.const.embed_h5py``. The label's vector is the mean of those
    rows. The resulting matrix is copied into
    ``self.embed.weight.data``.

    Args:
        labels: Sequence of label strings; row ``i`` of the matrix
            belongs to ``labels[i]``.

    Raises:
        KeyError: If a non-skipped word is missing from the index.
    """
    h5_file = io.load_h5py_object(self.const.embed_h5py)
    try:
        word_vecs = h5_file['embeddings']
        word_to_idx = io.load_json_object(
            self.const.embed_word_to_idx_json)
        embeddings = np.zeros([len(labels), self.const.embed_dims])
        for i, label in enumerate(labels):
            if ' ' in label:
                words = label.split(' ')
            elif '_' in label:
                words = label.split('_')
            else:
                words = [label]

            # 'tree' is deliberately left out of the average.
            kept_words = [word for word in words if word != 'tree']
            for word in kept_words:
                embeddings[i] += word_vecs[word_to_idx[word]][()]

            # Divide by the number of words actually summed. A label
            # consisting only of 'tree' keeps its all-zero row instead
            # of turning into NaNs through a division by zero.
            if kept_words:
                embeddings[i] /= len(kept_words)
    finally:
        # Every row has been copied into ``embeddings`` by now.
        h5_file.close()

    # Explicit comparison kept on purpose: only a value equal to True
    # blanks the leading ``glove_dim`` columns.
    if self.const.no_glove == True:  # noqa: E712
        embeddings[:, :self.const.glove_dim] = 0

    self.embed.weight.data.copy_(torch.from_numpy(embeddings))
def get_neg_noun_samples_feats(self, image_id, cap_id, noun_id=None):
    """Return negative-noun features for one (image, caption) pair.

    Args:
        image_id: Image identifier; converted with ``str`` for lookup.
        cap_id: Caption identifier; converted with ``str`` for lookup.
        noun_id: Noun whose negatives should be loaded. When ``None`` a
            key of the pair's ``'negs'`` mapping is chosen at random.

    Returns:
        A ``(feats, noun_id)`` tuple. If ``self.neg_noun_samples`` has no
        entry for the pair, ``feats`` is a float32 zero array of shape
        ``[1 + num_neg_nouns, neg_noun_feat_dim]`` and ``noun_id`` is
        ``-1``. Otherwise ``feats`` is the float32 dataset named
        ``'{image_id}_{cap_id}_{noun_id}'`` read from the HDF5 file at
        ``self.const.neg_noun_samples_h5py``, and ``noun_id`` is the
        requested id or the randomly drawn one converted to ``int``.
    """
    str_image_id = str(image_id)
    str_cap_id = str(cap_id)

    caption_entries = self.neg_noun_samples.get(str_image_id)
    if caption_entries is None or str_cap_id not in caption_entries:
        feats = np.zeros(
            [1 + self.const.num_neg_nouns, self.const.neg_noun_feat_dim],
            dtype=np.float32)
        return feats, -1

    negs = caption_entries[str_cap_id]['negs']
    if noun_id is None:
        str_noun_id = random.choice(list(negs.keys()))
        noun_id = int(str_noun_id)
    else:
        # A caller-supplied noun id previously left ``str_noun_id``
        # unbound, so building the dataset name raised.
        str_noun_id = str(noun_id)

    feat_name = f'{str_image_id}_{str_cap_id}_{str_noun_id}'
    neg_samples_feats = io.load_h5py_object(
        self.const.neg_noun_samples_h5py)
    try:
        feats = neg_samples_feats[feat_name][()].astype(np.float32)
    finally:
        # Close the file even when the dataset name is missing.
        neg_samples_feats.close()

    return feats, noun_id
def read_object_boxes(self, image_id):
    """Read the dataset stored under ``image_id`` in the boxes HDF5 file.

    Args:
        image_id: Key of the dataset inside the file at
            ``self.const.boxes_hdf5``.

    Returns:
        The full contents of that dataset, read into memory.
    """
    f = io.load_h5py_object(self.const.boxes_hdf5)
    try:
        return f[image_id][()]
    finally:
        # Close the file even if ``image_id`` is not present.
        f.close()
def read_object_features(self, image_id):
    """Read the dataset stored under ``image_id`` in the features HDF5 file.

    Args:
        image_id: Key of the dataset inside the file at
            ``self.const.features_hdf5``.

    Returns:
        The full contents of that dataset, read into memory.
    """
    f = io.load_h5py_object(self.const.features_hdf5)
    try:
        return f[image_id][()]
    finally:
        # Close the file even if ``image_id`` is not present.
        f.close()
def main(exp_const, data_const):
    """Score embeddings by how well decision trees recover word categories.

    Word categories are read from the ``fine_categories`` or
    ``categories`` module, chosen by ``exp_const.fine``. Each public
    attribute of that module is one category holding its words. For every
    entry of ``data_const.embed_info`` the vectors of the in-vocabulary
    category words are loaded, passed through ``get_embedding`` and
    ``get_word_feats``, and a ``DecisionTreeClassifier`` is fitted and
    evaluated on the same words for a range of maximum depths.

    Accuracy, homogeneity, completeness, V-measure and adjusted Rand index
    are plotted against depth as HTML files under
    ``<exp_const.exp_dir>/fine`` or ``.../coarse``. A LaTeX-style table of
    the depth-averaged V-measure, ARI and accuracy is printed. The
    per-category prediction entropy is computed and collected but neither
    plotted nor printed.

    Args:
        exp_const: Experiment constants; reads ``fine`` and ``exp_dir``.
        data_const: Data constants; reads ``embed_info``, a mapping from
            embedding name to an object exposing ``word_vecs_h5py``,
            ``word_to_idx_json`` and ``get_embedding``.
    """
    if exp_const.fine == True:  # noqa: E712 - original comparison kept
        from . import fine_categories as C
        vis_dir = os.path.join(exp_const.exp_dir, 'fine')
        print('*' * 80)
        print('Fine Categories')
        print('*' * 80)
    else:
        from . import categories as C
        vis_dir = os.path.join(exp_const.exp_dir, 'coarse')
        print('*' * 80)
        print('Coarse Categories')
        print('*' * 80)

    io.mkdir_if_not_exists(vis_dir, recursive=True)

    # Every non-dunder attribute of the category module (except a
    # re-exported ``C``) names a category whose value is its word list.
    categories = sorted([c for c in dir(C) if '__' not in c and c != 'C'])
    categories_to_idx = {l: i for i, l in enumerate(categories)}
    all_words = set()
    word_to_label = {}
    for category in categories:
        category_words = getattr(C, category)
        all_words.update(category_words)
        for word in category_words:
            word_to_label[word] = category

    entropy = {}
    accuracy = {}
    homogeneity = {}
    completeness = {}
    v_measure = {}
    ari = {}

    # The same depths are used for fine and coarse categories. Defined
    # before the loop so the plots below work with no embeddings too.
    depths = [
        1, 4, 8, 12, 16, 20, 24, 28, 32, 36, 42, 48, 54, 60, 66, 72, 78
    ]

    for embed_type, embed_info in data_const.embed_info.items():
        print(f'- {embed_type}', end=' ', flush=True)

        h5_file = io.load_h5py_object(embed_info.word_vecs_h5py)
        try:
            embed_ = h5_file['embeddings']
            word_to_idx = io.load_json_object(embed_info.word_to_idx_json)

            # Keep only the category words this embedding has a row for.
            words = [word for word in all_words if word in word_to_idx]
            labels = [word_to_label[word] for word in words]
            idxs = [word_to_idx[word] for word in words]
            embed = np.zeros([len(idxs), embed_.shape[1]])
            for i, j in enumerate(idxs):
                embed[i] = embed_[j]
        finally:
            # All needed rows are copied into ``embed`` at this point.
            h5_file.close()

        embed = embed_info.get_embedding(embed)
        word_feats = get_word_feats(embed, dim=2, embed_type='original')

        entropy[embed_type] = []
        accuracy[embed_type] = []
        homogeneity[embed_type] = []
        completeness[embed_type] = []
        v_measure[embed_type] = []
        ari[embed_type] = []

        for depth in depths:
            dt = DecisionTreeClassifier(
                criterion='gini',
                max_depth=depth,
                min_samples_leaf=2,
            )
            dt.fit(word_feats, labels)
            prob = dt.predict_proba(word_feats)
            pred_labels = dt.predict(word_feats)

            # ``predict_proba`` columns follow ``dt.classes_``, which holds
            # only categories that occur in ``labels``. Map them onto the
            # global category indices so a category with no in-vocabulary
            # word cannot cause a shape mismatch.
            class_cols = [categories_to_idx[c] for c in dt.classes_]
            confmat = np.zeros([len(categories), len(categories)])
            counts = np.zeros([len(categories), 1])
            for i, label in enumerate(labels):
                r = categories_to_idx[label]
                confmat[r, class_cols] += prob[i]
                counts[r, 0] += 1

            # Average per ground-truth category, skipping categories that
            # have no words (they would otherwise produce 0/0).
            seen = counts[:, 0] > 0
            confmat = confmat[seen] / counts[seen]
            ce = -np.mean(np.sum(confmat * np.log(confmat + 1e-6), 1))

            acc = sum(
                gt_l == pred_l for gt_l, pred_l in zip(labels, pred_labels))
            acc = acc / len(labels)

            homo_score, comp_score, v_measure_score = \
                skmetrics.homogeneity_completeness_v_measure(
                    labels, pred_labels)
            ari_score = skmetrics.adjusted_rand_score(labels, pred_labels)

            entropy[embed_type].append(ce)
            accuracy[embed_type].append(acc)
            homogeneity[embed_type].append(homo_score)
            completeness[embed_type].append(comp_score)
            v_measure[embed_type].append(v_measure_score)
            ari[embed_type].append(ari_score)

        print('[Done]')

    plot_metric_vs_depth('Accuracy', accuracy, depths,
                         os.path.join(vis_dir, 'accuracy.html'))
    plot_metric_vs_depth('Homogeneity', homogeneity, depths,
                         os.path.join(vis_dir, 'homogeneity.html'))
    plot_metric_vs_depth('Completeness', completeness, depths,
                         os.path.join(vis_dir, 'completeness.html'))
    plot_metric_vs_depth('V-Measure', v_measure, depths,
                         os.path.join(vis_dir, 'v_measure.html'))
    plot_metric_vs_depth('Adjusted Rand Index', ari, depths,
                         os.path.join(vis_dir, 'ari.html'))

    print('')
    print(
        'Aggregate performance across different tree depths (Copy to your latex table/spreadsheet)'
    )

    # Explicit name -> results mapping instead of a ``locals()`` lookup.
    metrics = ['v_measure', 'ari', 'accuracy']
    metric_values = {
        'v_measure': v_measure,
        'ari': ari,
        'accuracy': accuracy,
    }

    print('')
    print('-' * 40)
    print(' & '.join(['Embedding'] + metrics))
    print('-' * 40)
    for embed_type in data_const.embed_info.keys():
        cells = [embed_type]
        for metric in metrics:
            metric_value = round(
                np.mean(metric_values[metric][embed_type]), 2)
            cells.append('{:.2f}'.format(metric_value))
        print(' & '.join(cells) + ' \\\\')
    print('-' * 40)
    print('')
def _cosine_average_clustering(n_clusters):
    """Build an average-linkage, cosine-distance agglomerative clusterer."""
    try:
        # Newer scikit-learn releases name the distance argument ``metric``.
        return AgglomerativeClustering(
            n_clusters=n_clusters,
            metric='cosine',
            linkage='average')
    except TypeError:
        # Older releases only know the original ``affinity`` keyword.
        return AgglomerativeClustering(
            n_clusters=n_clusters,
            affinity='cosine',
            linkage='average')


def main(exp_const, data_const):
    """Score embeddings by how well clustering recovers word categories.

    Word categories are read from the ``fine_categories`` or
    ``categories`` module, chosen by ``exp_const.fine``. Each public
    attribute of that module is one category holding its words. For every
    entry of ``data_const.embed_info`` the vectors of the in-vocabulary
    category words are loaded, passed through ``get_embedding`` and
    ``get_word_feats``, and clustered with average-linkage agglomerative
    clustering under cosine distance for a range of cluster counts.

    Homogeneity, completeness, V-measure and adjusted Rand index are
    plotted against the number of clusters as HTML files under
    ``<exp_const.exp_dir>/fine`` or ``.../coarse``. A LaTeX-style table of
    the averaged V-measure and ARI is printed.

    Args:
        exp_const: Experiment constants; reads ``fine`` and ``exp_dir``.
        data_const: Data constants; reads ``embed_info``, a mapping from
            embedding name to an object exposing ``word_vecs_h5py``,
            ``word_to_idx_json`` and ``get_embedding``.
    """
    if exp_const.fine == True:  # noqa: E712 - original comparison kept
        from . import fine_categories as C
        vis_dir = os.path.join(exp_const.exp_dir, 'fine')
        print('*' * 80)
        print('Fine Categories')
        print('*' * 80)
    else:
        from . import categories as C
        vis_dir = os.path.join(exp_const.exp_dir, 'coarse')
        print('*' * 80)
        print('Coarse Categories')
        print('*' * 80)

    io.mkdir_if_not_exists(vis_dir, recursive=True)

    # Every non-dunder attribute of the category module (except a
    # re-exported ``C``) names a category whose value is its word list.
    categories = sorted([c for c in dir(C) if '__' not in c and c != 'C'])
    categories_to_idx = {l: i for i, l in enumerate(categories)}
    all_words = set()
    word_to_label = {}
    for category in categories:
        category_words = getattr(C, category)
        all_words.update(category_words)
        for word in category_words:
            word_to_label[word] = category

    homogeneity = {}
    completeness = {}
    v_measure = {}
    ari = {}

    # The same cluster counts are used for fine and coarse categories.
    # Defined before the loop so the plots below work with no embeddings.
    n_clusters_list = [1, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80]

    for embed_type, embed_info in data_const.embed_info.items():
        print(f'- {embed_type}', end=' ', flush=True)

        h5_file = io.load_h5py_object(embed_info.word_vecs_h5py)
        try:
            embed_ = h5_file['embeddings']
            word_to_idx = io.load_json_object(embed_info.word_to_idx_json)

            # Keep only the category words this embedding has a row for.
            words = [word for word in all_words if word in word_to_idx]
            labels = [word_to_label[word] for word in words]
            idxs = [word_to_idx[word] for word in words]
            embed = np.zeros([len(idxs), embed_.shape[1]])
            for i, j in enumerate(idxs):
                embed[i] = embed_[j]
        finally:
            # All needed rows are copied into ``embed`` at this point.
            h5_file.close()

        embed = embed_info.get_embedding(embed)
        word_feats = get_word_feats(
            embed,
            dim=2,
            embed_type='original')

        homogeneity[embed_type] = []
        completeness[embed_type] = []
        v_measure[embed_type] = []
        ari[embed_type] = []

        for n_clusters in n_clusters_list:
            clustering = _cosine_average_clustering(n_clusters)
            pred_labels = clustering.fit_predict(word_feats)

            homo_score, comp_score, v_measure_score = \
                skmetrics.homogeneity_completeness_v_measure(
                    labels, pred_labels)
            ari_score = skmetrics.adjusted_rand_score(labels, pred_labels)

            homogeneity[embed_type].append(homo_score)
            completeness[embed_type].append(comp_score)
            v_measure[embed_type].append(v_measure_score)
            ari[embed_type].append(ari_score)

        print('[Done]')

    plot_metric_vs_clusters(
        'Homogeneity',
        homogeneity,
        n_clusters_list,
        os.path.join(vis_dir, 'homogeneity.html'),
        exp_const.fine)

    plot_metric_vs_clusters(
        'Completeness',
        completeness,
        n_clusters_list,
        os.path.join(vis_dir, 'completeness.html'),
        exp_const.fine)

    plot_metric_vs_clusters(
        'V-Measure',
        v_measure,
        n_clusters_list,
        os.path.join(vis_dir, 'v_measure.html'),
        exp_const.fine)

    plot_metric_vs_clusters(
        'Adjusted Rand Index',
        ari,
        n_clusters_list,
        os.path.join(vis_dir, 'ari.html'),
        exp_const.fine)

    print('')
    print('Aggregate performance across different cluster numbers (Copy to your latex table/spreadsheet)')

    # Explicit name -> results mapping instead of a ``locals()`` lookup.
    metrics = ['v_measure', 'ari']
    metric_values = {
        'v_measure': v_measure,
        'ari': ari,
    }

    print('')
    print('-' * 40)
    print(' & '.join(['Embedding'] + metrics))
    print('-' * 40)
    for embed_type in data_const.embed_info.keys():
        cells = [embed_type]
        for metric in metrics:
            metric_value = round(
                np.mean(metric_values[metric][embed_type]), 2)
            cells.append('{:.2f}'.format(metric_value))
        print(' & '.join(cells) + ' \\\\')
    print('-' * 40)
    print('')