Example #1
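A model method that builds one embedding per label by averaging the word vectors of the label's constituent words, then copies the result into the model's `torch.nn.Embedding` weight. It relies on the project helpers `io.load_h5py_object` / `io.load_json_object` and expects `numpy` (as `np`) and `torch` to be imported.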
    def load_embeddings(self, labels):
        embed_h5py = io.load_h5py_object(self.const.embed_h5py)['embeddings']
        word_to_idx = io.load_json_object(self.const.embed_word_to_idx_json)
        embeddings = np.zeros([len(labels), self.const.embed_dims])
        word_to_label = {}
        for i, label in enumerate(labels):
            # Multi-word labels may be space- or underscore-delimited
            if ' ' in label:
                words = label.split(' ')
            elif '_' in label:
                words = label.split('_')
            else:
                words = [label]

            denom = len(words)
            for word in words:
                # The word 'tree' is excluded from the average, so the
                # denominator shrinks to match
                if word == 'tree':
                    denom -= 1
                    continue

                if word not in word_to_label:
                    word_to_label[word] = set()
                word_to_label[word].add(label)

                idx = word_to_idx[word]
                embeddings[i] += embed_h5py[idx][()]
            # Guard against a zero denominator when every word was skipped
            embeddings[i] /= max(denom, 1)

        if self.const.no_glove:
            # Zero out the leading GloVe dimensions when disabled
            embeddings[:, :self.const.glove_dim] = 0

        self.embed.weight.data.copy_(torch.from_numpy(embeddings))
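For instance, a label like `polar_bear` is split into `polar` and `bear` and its embedding is the mean of those two word vectors; for a label containing the word `tree` (e.g. `oak_tree`), `tree` is skipped and the average is taken over the remaining words only, which is why `denom` is decremented.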
Example #2
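A dataset method that fetches noun features for an image/caption pair from an HDF5 file keyed as `{image_id}_{cap_id}_{noun_id}`. When no negatives are cached for the pair, it falls back to an all-zero array of shape `[1 + num_neg_nouns, neg_noun_feat_dim]` and signals this with `noun_id = -1`.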
    def get_neg_noun_samples_feats(self, image_id, cap_id, noun_id=None):
        str_image_id = str(image_id)
        str_cap_id = str(cap_id)

        if (str_image_id in self.neg_noun_samples) and \
                (str_cap_id in self.neg_noun_samples[str_image_id]):
            negs = self.neg_noun_samples[str_image_id][str_cap_id]['negs']
        else:
            # No cached negatives for this pair: return zero features
            feats = np.zeros(
                [1 + self.const.num_neg_nouns, self.const.neg_noun_feat_dim],
                dtype=np.float32)
            noun_id = -1
            return feats, noun_id

        if noun_id is None:
            # No noun specified: sample one of the cached nouns at random
            str_noun_id = random.choice(list(negs.keys()))
            noun_id = int(str_noun_id)
        else:
            # Keep a string form of the requested noun_id for the key below
            str_noun_id = str(noun_id)

        neg_samples_feats = io.load_h5py_object(
            self.const.neg_noun_samples_h5py)
        feat_name = f'{str_image_id}_{str_cap_id}_{str_noun_id}'
        feats = neg_samples_feats[feat_name][()].astype(np.float32)
        neg_samples_feats.close()

        return feats, noun_id
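A minimal usage sketch; `dataset` and the concrete ids are assumptions here, and the row layout of `feats` (one positive row followed by the negatives, suggested by the `1 + num_neg_nouns` shape) is an inference from the zero-fill branch rather than something the snippet states:

# Hypothetical call; ids depend on the surrounding dataset.
feats, noun_id = dataset.get_neg_noun_samples_feats(image_id=1234, cap_id=0)
if noun_id == -1:
    # No cached negatives for this pair; feats is the zero placeholder.
    pass
else:
    pos_feat, neg_feats = feats[0], feats[1:]  # assumed layout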
Example #3
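A small accessor that reads the precomputed bounding boxes for one image from an HDF5 file.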
    def read_object_boxes(self, image_id):
        # Open the HDF5 file, read the boxes for this image, and close it
        f = io.load_h5py_object(self.const.boxes_hdf5)
        boxes = f[image_id][()]
        f.close()
        return boxes
Example #4
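The companion accessor for object features; it follows the same open/read/close pattern against a different HDF5 file.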
    def read_object_features(self, image_id):
        # Open the HDF5 file, read the features for this image, and close it
        f = io.load_h5py_object(self.const.features_hdf5)
        features = f[image_id][()]
        f.close()
        return features
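`io.load_h5py_object` itself is not shown in these snippets. Given how it is used (key indexing, `[()]` reads, an explicit `close()`), a minimal sketch of its assumed behavior is just a read-only `h5py.File`:

import h5py

def load_h5py_object(path):
    # Assumed behavior: return a read-only h5py.File handle, which
    # supports f[key][()] reads and f.close() as used above.
    return h5py.File(path, 'r')

Opening and closing the file on every call keeps the accessors stateless, at the cost of repeated file opens; caching the handle is a common optimization when these reads sit inside a data-loading loop.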
Example #5
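A script entry point that probes how well word embeddings encode category structure: for each embedding type it fits decision trees of increasing depth to predict a word's category from its embedding features, then records entropy, accuracy, homogeneity, completeness, V-measure, and adjusted Rand index and plots each metric against depth. It assumes `os`, `numpy` (as `np`), `sklearn.tree.DecisionTreeClassifier`, `sklearn.metrics` imported as `skmetrics`, the project `io` helpers, and the helpers `get_word_feats` and `plot_metric_vs_depth`.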
def main(exp_const, data_const):
    if exp_const.fine:
        from . import fine_categories as C
        vis_dir = os.path.join(exp_const.exp_dir, 'fine')
        print('*' * 80)
        print('Fine Categories')
        print('*' * 80)
    else:
        from . import categories as C
        vis_dir = os.path.join(exp_const.exp_dir, 'coarse')
        print('*' * 80)
        print('Coarse Categories')
        print('*' * 80)

    io.mkdir_if_not_exists(vis_dir, recursive=True)

    # Read words and categories
    categories = sorted([c for c in dir(C) if '__' not in c and c != 'C'])
    categories_to_idx = {l: i for i, l in enumerate(categories)}
    all_words = set()
    word_to_label = {}
    for category in categories:
        category_words = getattr(C, category)
        all_words.update(category_words)
        for word in category_words:
            word_to_label[word] = category

    entropy = {}
    accuracy = {}
    homogeneity = {}
    completeness = {}
    v_measure = {}
    ari = {}
    for embed_type, embed_info in data_const.embed_info.items():
        print(f'- {embed_type}', end=' ', flush=True)

        # Load embeddings
        embed_ = io.load_h5py_object(embed_info.word_vecs_h5py)['embeddings']
        word_to_idx = io.load_json_object(embed_info.word_to_idx_json)

        # Select words that have embeddings
        words = [word for word in all_words if word in word_to_idx]
        labels = [word_to_label[word] for word in words]
        idxs = [word_to_idx[word] for word in words]

        embed = np.zeros([len(idxs), embed_.shape[1]])
        for i, j in enumerate(idxs):
            embed[i] = embed_[j]
        embed = embed_info.get_embedding(embed)

        # Compute word features
        word_feats = get_word_feats(embed, dim=2, embed_type='original')

        # Learn decision trees of increasing depth
        entropy[embed_type] = []
        accuracy[embed_type] = []
        homogeneity[embed_type] = []
        completeness[embed_type] = []
        v_measure[embed_type] = []
        ari[embed_type] = []

        # The same depth schedule is currently used for fine and coarse categories
        depths = [
            1, 4, 8, 12, 16, 20, 24, 28, 32, 36, 42, 48, 54, 60, 66, 72, 78
        ]

        for depth in depths:
            dt = DecisionTreeClassifier(
                criterion='gini',
                max_depth=depth,
                min_samples_leaf=2,
            )
            dt.fit(word_feats, labels)
            prob = dt.predict_proba(word_feats)
            pred_labels = dt.predict(word_feats)

            confmat = np.zeros([len(categories), len(categories)])
            counts = np.zeros([len(categories), 1])
            for i, label in enumerate(labels):
                r = categories_to_idx[label]
                confmat[r] += prob[i]
                counts[r, 0] += 1

            confmat = confmat / counts
            # Mean entropy of each category's average predicted distribution
            ce = -np.mean(np.sum(confmat * np.log(confmat + 1e-6), 1))

            acc = 0
            for gt_l, pred_l in zip(labels, pred_labels):
                acc += gt_l == pred_l
            acc = acc / len(labels)

            homo_score, comp_score, v_measure_score = \
                skmetrics.homogeneity_completeness_v_measure(
                    labels,
                    pred_labels)
            ari_score = skmetrics.adjusted_rand_score(labels, pred_labels)

            entropy[embed_type].append(ce)
            accuracy[embed_type].append(acc)
            homogeneity[embed_type].append(homo_score)
            completeness[embed_type].append(comp_score)
            v_measure[embed_type].append(v_measure_score)
            ari[embed_type].append(ari_score)

        print('[Done]')

        plot_metric_vs_depth('Accuracy', accuracy, depths,
                             os.path.join(vis_dir, 'accuracy.html'))

        plot_metric_vs_depth('Homogeneity', homogeneity, depths,
                             os.path.join(vis_dir, 'homogeneity.html'))

        plot_metric_vs_depth('Completeness', completeness, depths,
                             os.path.join(vis_dir, 'completeness.html'))

        plot_metric_vs_depth('V-Measure', v_measure, depths,
                             os.path.join(vis_dir, 'v_measure.html'))

        plot_metric_vs_depth('Adjusted Rand Index', ari, depths,
                             os.path.join(vis_dir, 'ari.html'))

    print('')
    print(
        'Aggregate performance across different tree depths '
        '(copy to your LaTeX table/spreadsheet)'
    )
    metrics = {'v_measure': v_measure, 'ari': ari, 'accuracy': accuracy}

    print('')

    print('-' * 40)
    metric_str = 'Embedding'
    for metric in metrics:
        metric_str += ' & '
        metric_str += metric
    print(metric_str)
    print('-' * 40)

    for embed_type in data_const.embed_info.keys():
        metric_str = embed_type
        for metric in metrics:
            metric_str += ' & '
            metric_value = np.mean(metrics[metric][embed_type])
            metric_str += '{:.2f}'.format(metric_value)

        metric_str += ' \\\\'
        print(metric_str)
    print('-' * 40)

    print('')
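`plot_metric_vs_depth` is not shown either; since it writes `.html` files, one plausible backend is plotly. A sketch under that assumption:

import plotly.graph_objs as go
from plotly.offline import plot

def plot_metric_vs_depth(metric_name, metric, depths, filename):
    # One curve per embedding type: metric value vs. tree depth.
    traces = [
        go.Scatter(x=depths, y=values, mode='lines+markers', name=embed_type)
        for embed_type, values in metric.items()
    ]
    layout = go.Layout(
        xaxis=dict(title='max_depth'),
        yaxis=dict(title=metric_name))
    plot(go.Figure(data=traces, layout=layout),
         filename=filename,
         auto_open=False)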
Example #6
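A variant of Example #5 that evaluates the same embeddings without supervision: instead of fitting decision trees, it runs agglomerative clustering (cosine affinity, average linkage) over a range of cluster counts and scores the resulting clusters against the category labels with homogeneity, completeness, V-measure, and adjusted Rand index. It additionally assumes `sklearn.cluster.AgglomerativeClustering` and a `plot_metric_vs_clusters` helper.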
def main(exp_const, data_const):
    if exp_const.fine:
        from . import fine_categories as C
        vis_dir = os.path.join(exp_const.exp_dir, 'fine')
        print('*' * 80)
        print('Fine Categories')
        print('*' * 80)
    else:
        from . import categories as C
        vis_dir = os.path.join(exp_const.exp_dir, 'coarse')
        print('*' * 80)
        print('Coarse Categories')
        print('*' * 80)

    io.mkdir_if_not_exists(vis_dir, recursive=True)

    # Read words and categories
    categories = sorted([c for c in dir(C) if '__' not in c and c != 'C'])
    categories_to_idx = {l: i for i, l in enumerate(categories)}
    all_words = set()
    word_to_label = {}
    for category in categories:
        category_words = getattr(C, category)
        all_words.update(category_words)
        for word in category_words:
            word_to_label[word] = category

    homogeneity = {}
    completeness = {}
    v_measure = {}
    ari = {}
    for embed_type, embed_info in data_const.embed_info.items():
        print(f'- {embed_type}', end=' ', flush=True)

        # Load embeddings
        embed_ = io.load_h5py_object(embed_info.word_vecs_h5py)['embeddings']
        word_to_idx = io.load_json_object(embed_info.word_to_idx_json)

        # Select words that have embeddings
        words = [word for word in all_words if word in word_to_idx]
        labels = [word_to_label[word] for word in words]
        idxs = [word_to_idx[word] for word in words]

        embed = np.zeros([len(idxs), embed_.shape[1]])
        for i, j in enumerate(idxs):
            embed[i] = embed_[j]
        embed = embed_info.get_embedding(embed)

        # Compute word features
        word_feats = get_word_feats(
            embed,
            dim=2,
            embed_type='original')

        # Cluster word features over a range of cluster counts
        homogeneity[embed_type] = []
        completeness[embed_type] = []
        v_measure[embed_type] = []
        ari[embed_type] = []

        # The same cluster counts are currently used for fine and coarse categories
        n_clusters_list = [1, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80]

        for n_clusters in n_clusters_list:
            clustering = AgglomerativeClustering(
                n_clusters=n_clusters,
                affinity='cosine',  # renamed to metric= in newer scikit-learn
                linkage='average')
            pred_labels = clustering.fit_predict(word_feats)

            homo_score, comp_score, v_measure_score = \
                skmetrics.homogeneity_completeness_v_measure(
                    labels,
                    pred_labels)
            ari_score = skmetrics.adjusted_rand_score(labels, pred_labels)

            homogeneity[embed_type].append(homo_score)
            completeness[embed_type].append(comp_score)
            v_measure[embed_type].append(v_measure_score)
            ari[embed_type].append(ari_score)
        
        print('[Done]')

        plot_metric_vs_clusters(
            'Homogeneity',
            homogeneity,
            n_clusters_list,
            os.path.join(vis_dir, 'homogeneity.html'),
            exp_const.fine)

        plot_metric_vs_clusters(
            'Completeness',
            completeness,
            n_clusters_list,
            os.path.join(vis_dir, 'completeness.html'),
            exp_const.fine)

        plot_metric_vs_clusters(
            'V-Measure',
            v_measure,
            n_clusters_list,
            os.path.join(vis_dir, 'v_measure.html'),
            exp_const.fine)

        plot_metric_vs_clusters(
            'Adjusted Rand Index',
            ari,
            n_clusters_list,
            os.path.join(vis_dir, 'ari.html'),
            exp_const.fine)


    print('')
    print('Aggregate performance across different cluster numbers '
          '(copy to your LaTeX table/spreadsheet)')
    metrics = {'v_measure': v_measure, 'ari': ari}

    print('')

    print('-' * 40)
    metric_str = 'Embedding'
    for metric in metrics:
        metric_str += ' & '
        metric_str += metric
    print(metric_str)
    print('-' * 40)

    for embed_type in data_const.embed_info.keys():
        metric_str = embed_type
        for metric in metrics:
            metric_str += ' & '
            metric_value = np.mean(metrics[metric][embed_type])
            metric_str += '{:.2f}'.format(metric_value)

        metric_str += ' \\\\'
        print(metric_str)
    print('-' * 40)

    print('')
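Examples #5 and #6 are two probes of the same question. The sweep over `max_depth` and the sweep over `n_clusters` both trace out a capacity axis for the probe, so averaging each metric across the sweep, as the table printers do, gives a single capacity-agnostic score per embedding. The category-loading and table-printing scaffolding is duplicated between the two scripts and could be factored into a shared helper.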