Example #1
    def __init__(self):

        print("Loading forest model...")
        tic()
        self.forest = utilites.loadVariableFromFile(
            "static/Corel5K/forest/forest_128.pkl")
        print("Done.")
        toc()
        self.train_vectors = loadmat(
            utilites.getAbsPath('static/Corel5K/train_vectors_original.mat'))
        self.train_vectors = self.train_vectors['train_vectors']
        self.train_file_path = utilites.loadVariableFromFile(
            "static/Corel5K/train_file_path.pkl")
        # load contents of concepts
        self.concepts = utilites.loadVariableFromFile(
            "static/Corel5K/cluster_contents.pkl")
        self.tag_scores = utilites.loadVariableFromFile(
            "static/Corel5K/all_tags_scores.pkl")

        self.train_vectors_classic = loadmat(
            utilites.getAbsPath(
                'static/Corel5K/baseline_features/corel5k_train_feats_classic.mat'
            ))
        self.train_vectors_classic = self.train_vectors_classic[
            'corel5k_train_feats_classic']

        self.test_vectors_classic = loadmat(
            utilites.getAbsPath(
                'static/Corel5K/baseline_features/corel5k_test_feats_classic.mat'
            ))
        self.test_vectors_classic = self.test_vectors_classic[
            'corel5k_test_feats_classic']
        self.test_file_name = utilites.loadVariableFromFile(
            'static/Corel5K/corel5k_test_file_name.pkl')
        self.feat_dict_classic = dict(
            zip(self.test_file_name, self.test_vectors_classic))

        # start a matlab session for feature extraction
        self.matlab = matlab_wrapper.MatlabSession(
            matlab_root="/Applications/MATLAB_R2015b.app")  # start matlab
        self.matlab.eval('run MatConvNet/matlab/vl_setupnn')  # set up MatConvNet
        self.matlab.eval('run vlfeat/toolbox/vl_setup')  # set up VLFeat
        self.matlab.eval("feature('DefaultCharacterSet', 'UTF8')")
        print("Loading cnn model...")
        tic()
        self.matlab.eval(
            "net = load('/Users/TONYSUN/Desktop/SIR_Corel5K_demo/static/cnnmodel/imagenet-matconvnet-vgg-verydeep-16.mat')"
        )
        toc()
        print("Matlab session started.")
        print("Ready for work ^_^.")
Example #2
def perform_clustering(alpha=0.0, num_clusters=100):
    """
    clustering the tag/terms and return the cluster ids for each tag
    :param alpha: parameter to combine visual and textual similarity matrix
    :param num_clusters: number of clusters/concepts obtained
    :return: cluster ids for each tag
    """
    vis_sim_mat = utilites.loadVariableFromFile(
        "Corel5k/tag_affinity_matrix_scaled.pkl")
    tex_sim_mat = utilites.loadVariableFromFile(
        "Corel5k/tag_textual_similarity_matrix.pkl")

    tex_sim_mat = adjust_and_norm_affinity(tex_sim_mat)
    vis_sim_mat = expit(vis_sim_mat)

    # introduce a parameter alpha to merge the two matrices
    joint_mat = alpha * vis_sim_mat + (1 - alpha) * tex_sim_mat

    # run spectral clustering to obtain a cluster ID for each word
    # eigen_solver: None, 'arpack', 'lobpcg' or 'amg'
    cluster_ids = spectral_clustering(joint_mat,
                                      n_clusters=num_clusters,
                                      eigen_solver='arpack')
    print("Done...")
    # Create a Word / Index dictionary, mapping each vocabulary word to
    # a cluster number
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    word_centroid_map = dict(zip(words, cluster_ids))
    utilites.saveVariableToFile(cluster_ids, "Corel5k/concepts_ids.pkl")

    cluster_contents = []
    # for each cluster, collect and print its member words
    for cluster in range(num_clusters):
        print("\nCluster %d" % cluster)
        # find all of the words assigned to this cluster
        r_words = [word for word, cid in word_centroid_map.items() if cid == cluster]

        print(r_words)
        cluster_contents.append(r_words)

    utilites.saveVariableToFile(cluster_contents,
                                "Corel5k/cluster_contents.pkl")

    return cluster_ids
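# Usage sketch (not part of the original example): alpha blends the two
# affinity matrices, so alpha=0.0 uses only textual similarity and alpha=1.0
# only visual similarity; the call assumes the pickled matrices exist under
# Corel5k/.
#
# ids = perform_clustering(alpha=0.3, num_clusters=100)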
Example #3
def load_forest():
    """
    Load forest file stored on the disk
    :return: loaded forest
    """
    return utilites.loadVariableFromFile(
        "Corel5K/forest_400_trees_64_feats/forest.pkl")
Example #4
def display_clusters(num_clusters):
    cluster_ids = utilites.loadVariableFromFile("Corel5k/concepts_ids.pkl")
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    word_centroid_map = dict(zip(words, cluster_ids))
    cluster_contents = []
    # for each cluster, collect and print its member words
    for cluster in range(num_clusters):
        print("\nCluster %d" % cluster)
        # find all of the words assigned to this cluster
        r_words = [word for word, cid in word_centroid_map.items() if cid == cluster]

        print(r_words)
        cluster_contents.append(r_words)
Example #8
def get_concept_anno():
    """
    build new concept annotation matrix upon old tag based annotation
    :param num_clusters: number of clusters/concepts
    :return: new concept based annotation matrix
    """
    cluster_ids = utilites.loadVariableFromFile("Corel5k/concepts_ids.pkl")
    # all tags
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    # all tag ids, 0 to len(cluster_ids) - 1
    word_ids = range(len(cluster_ids))
    # get number of clusters by counting unique cluster ids
    num_clusters = len(set(cluster_ids))
    # map each tag id to the cluster it belongs to
    cluster_map = dict(zip(word_ids, cluster_ids))
    # load the original tag annotation matrix
    word_anno = utilites.loadVariableFromFile(
        "Corel5k/train_anno_filtered.pkl")

    # initialize a zero matrix as the concept annotation matrix
    # (np.int was removed in NumPy 1.24; the builtin int is equivalent here)
    anno = np.zeros((len(word_anno), num_clusters), dtype=int)

    # for every instance in the anno
    for i in range(len(word_anno)):
        print('This is instance ' + str(i) + '.')
        # for every tag in all tags
        for j in range(len(cluster_ids)):
            # if this tag appears in the original tag annotation matrix
            if word_anno[i][j] == 1:
                # find which concept this tag belongs to
                # and mark the occurrence of that concept as 1
                anno[i][cluster_map[j]] = 1
                print("The word is " + words[j] + ", and the concept is " +
                      str(cluster_map[j]))

    utilites.saveVariableToFile(anno, "Corel5k/train_anno_concept.pkl")

    return anno
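# Worked illustration (not part of the original example): with
# cluster_ids = [0, 1, 0], tags 0 and 2 both map to concept 0, so a tag row
# [1, 0, 1] collapses to the concept row [1, 0]; any member tag switches its
# concept on.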
def generate_forest():
    """
    Train a random forest (400 trees) on the original training vectors and
    the concept-based annotations.
    """
    train_original = loadmat(utilites.getAbsPath('Corel5K/train_vectors_original.mat'))
    train_original = train_original['train_vectors']
    train_label = utilites.loadVariableFromFile(utilites.getAbsPath("Corel5K/train_anno_concept.pkl"))

    # prepare data
    train_data = Data()
    train_data.samples = train_original
    train_data.labels = train_label
    train_data.orig_sample_indexes = np.array(range(len(train_original)))
    train_data.features = np.array(range(np.shape(train_original)[1]))

    tic()
    rand_forest = generate_random_forest(train_data, 400)
    toc()

    return rand_forest
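# Usage sketch (not part of the original example): Data and
# generate_random_forest come from the surrounding project, so only the call
# shape is shown.
#
# forest = generate_forest()   # trains 400 trees on the concept annotations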
Example #11
# Each image has 5 annotations.
# All the annotations are done manually and each one appears as a single sentence.
import gensim
import utilites
import setup
from sklearn.metrics import pairwise

# return text vectors calculated using Word2Vec by gensim
"""
open the annotation text file and read content
build word vectors using Word2Vec and then extract
the term/vector pairs into a dictionary
"""
# get all filtered term(tag) names
terms_corel5k_filtered = utilites.loadVariableFromFile(
    "Corel5k/terms_corel5k_filtered.pkl")
# get training image annotations: lists of separate terms
train_anno_filtered = utilites.loadVariableFromFile(
    "Corel5k/train_anno_filtered.pkl")

# initialize a model using parameters above
word_model = gensim.models.Word2Vec.load_word2vec_format(
    utilites.getAbsPath(setup.lmodel_file_path), binary=True)
"""
Calculate similarity matrix using given vectors
We use pairwise distances to build the matrix
"""
print("Extracting word vectors...")
vecs = []
# Index2word is a list that contains the names of the words in the model's vocabulary
for word in terms_corel5k_filtered:
    vecs.append(word_model[word])  # now we extract all word vectors from the model
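# Sketch of the next step the docstring above describes (not shown in the
# original snippet): build the tag similarity matrix from the extracted
# vectors with sklearn's pairwise metrics, e.g. cosine similarity.
import numpy as np

tag_sim_mat = pairwise.cosine_similarity(np.asarray(vecs))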
Example #13
def toc(tempBool=True):
    # marks the end of a time interval; the head of toc() was cut off in the
    # snippet, and the TicToc generator is assumed to be defined earlier
    tempTimeInterval = next(TicToc)
    if tempBool:
        print("Elapsed time: %f seconds.\n" % tempTimeInterval)

def tic():
    # Records a time in TicToc, marks the beginning of a time interval
    toc(False)
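# A minimal sketch of the usual TicToc generator (not shown in the snippet),
# plus usage:
#
# import time
#
# def TicTocGenerator():
#     tf = time.time()
#     while True:
#         ti, tf = tf, time.time()
#         yield tf - ti
#
# TicToc = TicTocGenerator()
#
# tic()   # mark the beginning of an interval
# toc()   # prints "Elapsed time: ... seconds."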

# we need to parse each test sample here
# get all terms from txt file
"""
test_file = open(utilites.getAbsPath('static/Corel5K/corel5k_test_list.txt'))
test_file_list = test_file.readlines()
test_file_list = [term.strip().decode('utf-8').replace('\n', '') + '.jpeg' for term in test_file_list]
utilites.saveVariableToFile(test_file_list, utilites.getAbsPath('static/Corel5K/corel5k_test_list.pkl'))
"""
test_file_list = utilites.loadVariableFromFile('static/Corel5K/corel5k_test_list.pkl')
train_file_path = utilites.loadVariableFromFile("static/Corel5K/train_file_path.pkl")
test_anno = utilites.loadVariableFromFile('static/Corel5K/test_anno_filtered.pkl')
train_anno = utilites.loadVariableFromFile('static/Corel5K/train_anno_filtered.pkl')
train_anno_concept = utilites.loadVariableFromFile("static/Corel5K/train_anno_concept.pkl")
test_anno_concept = utilites.loadVariableFromFile("static/Corel5K/test_anno_concept.pkl")
all_prob = utilites.loadVariableFromFile("static/Corel5K/all_probs.pkl")
concepts = utilites.loadVariableFromFile("static/Corel5K/cluster_contents.pkl")
train_vectors = loadmat(utilites.getAbsPath('static/Corel5K/train_vectors_original.mat'))
train_vectors = train_vectors['train_vectors']
test_vectors = loadmat(utilites.getAbsPath('static/Corel5K/test_vectors_original.mat'))
test_vectors = test_vectors['test_vectors']

train_vectors_classic = loadmat(utilites.getAbsPath('static/Corel5K/baseline_features/corel5k_train_feats_classic.mat'))
train_vectors_classic = train_vectors_classic['corel5k_train_feats_classic']
# close the multiprocessing pool opened earlier in the original file
pool.close()
pool.join()

# retain tags that appear at least 5 times
tag_frequency = []
for te in range(len(terms_corel5k)):
    tag_frequency.append(sum(train_anno[:, te]))
tag_frequency = np.asarray(tag_frequency)

# get the index of infrequent tags in the tag list
r_infreq_tags = np.where(tag_frequency < 5)[0]
# indexes of tags whose relevance score is at most 1 (to be filtered out)
scores_tags_th = np.where(np.asarray(scores_tags) <= 1)[0]
# merge the low-frequency and low-score indexes
filtered_index = np.union1d(scores_tags_th, r_infreq_tags)
lwords = utilites.loadVariableFromFile("Corel5k/words_of_lmodel.pkl")
# find tags which are not available in the language model
no_avail_tags = list(set(terms_corel5k) - set(terms_corel5k).intersection(set(lwords)))

# get index of not available tags
no_avail_index = []
for i in range(len(no_avail_tags)):
    if no_avail_tags[i] in terms_corel5k:
        no_avail_index.append(terms_corel5k.index(no_avail_tags[i]))

filtered_index = np.union1d(filtered_index, no_avail_index)

# get abandoned tags
filtered_tags = np.asarray(terms_corel5k)[filtered_index]
# filter tags
terms_corel5k_filtered = np.delete(terms_corel5k, filtered_index)
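# Toy illustration (not part of the original snippet) of the index filtering
# above: np.union1d merges the two index sets and np.delete drops them.
#
# >>> terms = np.array(['sky', 'sun', 'cat', 'dog'])
# >>> np.delete(terms, np.union1d([1], [3]))
# array(['sky', 'cat'], dtype='<U3')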