def __init__(self):
    """Load the forest, feature tables and tag metadata, then start MATLAB.

    All data files are read from fixed locations under ``static/Corel5K``.
    The MATLAB root and the CNN model are hard-coded absolute paths.
    NOTE(review): those two paths look machine-specific -- consider moving
    them into configuration.
    """
    unpickle = utilites.loadVariableFromFile

    def read_mat(relative_path, variable):
        # The object returned by loadmat is indexed by variable name; only
        # the requested variable is kept.
        return loadmat(utilites.getAbsPath(relative_path))[variable]

    print("Loading forest model...")
    tic()
    self.forest = unpickle("static/Corel5K/forest/forest_128.pkl")
    print("Done.")
    toc()

    self.train_vectors = read_mat(
        'static/Corel5K/train_vectors_original.mat', 'train_vectors')
    self.train_file_path = unpickle("static/Corel5K/train_file_path.pkl")

    # contents of the concepts (clusters) and the per-tag scores
    self.concepts = unpickle("static/Corel5K/cluster_contents.pkl")
    self.tag_scores = unpickle("static/Corel5K/all_tags_scores.pkl")

    # "classic" baseline features for the train and test splits
    self.train_vectors_classic = read_mat(
        'static/Corel5K/baseline_features/corel5k_train_feats_classic.mat',
        'corel5k_train_feats_classic')
    self.test_vectors_classic = read_mat(
        'static/Corel5K/baseline_features/corel5k_test_feats_classic.mat',
        'corel5k_test_feats_classic')
    self.test_file_name = unpickle(
        'static/Corel5K/corel5k_test_file_name.pkl')
    # test file name -> its classic feature vector
    self.feat_dict_classic = dict(
        zip(self.test_file_name, self.test_vectors_classic))

    # start a matlab session for feature extraction
    self.matlab = matlab_wrapper.MatlabSession(
        matlab_root="/Applications/MATLAB_R2015b.app")
    for command in ('run MatConvNet/matlab/vl_setupnn',
                    'run vlfeat/toolbox/vl_setup',
                    "feature('DefaultCharacterSet', 'UTF8')"):
        self.matlab.eval(command)

    print("Loading cnn model...")
    tic()
    self.matlab.eval(
        "net = load('/Users/TONYSUN/Desktop/SIR_Corel5K_demo/static/cnnmodel/imagenet-matconvnet-vgg-verydeep-16.mat')"
    )
    toc()
    print("Matlab session started.")
    print("Ready for work ^_^.")
def perform_clustering(alpha=0.0, num_clusters=100):
    """
    Cluster the tags/terms and return the cluster id of each tag.

    The visual affinity matrix is passed through ``expit`` and the textual
    similarity matrix through ``adjust_and_norm_affinity``; the two are then
    blended as ``alpha * visual + (1 - alpha) * textual`` and handed to
    ``spectral_clustering``.

    Side effects: prints every cluster and writes
    ``Corel5k/concepts_ids.pkl`` (the cluster ids) and
    ``Corel5k/cluster_contents.pkl`` (one list of words per cluster).

    :param alpha: weight of the visual matrix; the textual one gets 1 - alpha
    :param num_clusters: number of clusters/concepts requested
    :return: cluster ids for each tag, as returned by spectral_clustering
    """
    vis_sim_mat = utilites.loadVariableFromFile(
        "Corel5k/tag_affinity_matrix_scaled.pkl")
    tex_sim_mat = utilites.loadVariableFromFile(
        "Corel5k/tag_textual_similarity_matrix.pkl")
    tex_sim_mat = adjust_and_norm_affinity(tex_sim_mat)
    vis_sim_mat = expit(vis_sim_mat)

    # alpha merges the two matrices
    joint_mat = alpha * vis_sim_mat + (1 - alpha) * tex_sim_mat

    # eigen_solver: None, arpack, lobpcg, or amg
    cluster_ids = spectral_clustering(joint_mat,
                                      n_clusters=num_clusters,
                                      eigen_solver='arpack')
    print("Done...")

    # Word -> cluster id dictionary
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    word_centroid_map = dict(zip(words, cluster_ids))
    utilites.saveVariableToFile(cluster_ids, "Corel5k/concepts_ids.pkl")

    # Group the words by cluster in a single pass. Indexing
    # word_centroid_map.values()[i] / .keys()[i] only works on Python 2
    # (dict views are not subscriptable on Python 3) and rebuilt the lists
    # on every iteration.
    words_by_cluster = {}
    for word, cluster_id in word_centroid_map.items():
        words_by_cluster.setdefault(cluster_id, []).append(word)

    cluster_contents = []
    for cluster in range(num_clusters):
        print("\nCluster %d" % cluster)
        # a cluster that received no word is reported as an empty list
        r_words = words_by_cluster.get(cluster, [])
        print(r_words)
        cluster_contents.append(r_words)

    utilites.saveVariableToFile(cluster_contents,
                                "Corel5k/cluster_contents.pkl")
    return cluster_ids
def load_forest():
    """
    Read the forest that was previously stored on disk.

    :return: the object loaded from the forest pickle file
    """
    forest_path = "Corel5K/forest_400_trees_64_feats/forest.pkl"
    forest = utilites.loadVariableFromFile(forest_path)
    return forest
def display_clusters(num_clusters):
    """
    Print the words belonging to each of the first ``num_clusters`` clusters.

    Cluster ids are read from ``Corel5k/concepts_ids.pkl`` and the words from
    ``Corel5k/terms_corel5k_filtered.pkl``; the two are paired positionally.

    :param num_clusters: clusters 0 .. num_clusters - 1 are printed
    :return: None
    """
    cluster_ids = utilites.loadVariableFromFile("Corel5k/concepts_ids.pkl")
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    word_centroid_map = dict(zip(words, cluster_ids))

    # Group the words by cluster in one pass. Subscripting
    # word_centroid_map.values() / .keys() fails on Python 3, where they are
    # views rather than lists.
    words_by_cluster = {}
    for word, cluster_id in word_centroid_map.items():
        words_by_cluster.setdefault(cluster_id, []).append(word)

    for cluster in range(num_clusters):
        print("\nCluster %d" % cluster)
        # a cluster without any word prints as an empty list
        print(words_by_cluster.get(cluster, []))
def display_clusters(num_clusters):
    """
    Print the words belonging to each of the first ``num_clusters`` clusters.

    Cluster ids and words are loaded from disk and paired positionally.

    :param num_clusters: clusters 0 .. num_clusters - 1 are printed
    :return: None
    """
    # The path previously lacked the ".pkl" suffix, although the ids are
    # written to "Corel5k/concepts_ids.pkl".
    cluster_ids = utilites.loadVariableFromFile("Corel5k/concepts_ids.pkl")
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    word_centroid_map = dict(zip(words, cluster_ids))

    # Group the words by cluster in one pass. Subscripting
    # word_centroid_map.values() / .keys() fails on Python 3, where they are
    # views rather than lists.
    words_by_cluster = {}
    for word, cluster_id in word_centroid_map.items():
        words_by_cluster.setdefault(cluster_id, []).append(word)

    for cluster in range(num_clusters):
        print("\nCluster %d" % cluster)
        # a cluster without any word prints as an empty list
        print(words_by_cluster.get(cluster, []))
def perform_clustering(alpha=0.0, num_clusters=100):
    """
    Cluster the tags/terms and return the cluster id of each tag.

    The visual affinity matrix is passed through ``expit`` and the textual
    similarity matrix through ``adjust_and_norm_affinity``; the two are then
    blended as ``alpha * visual + (1 - alpha) * textual`` and handed to
    ``spectral_clustering``.

    Side effects: prints every cluster and writes
    ``Corel5k/concepts_ids.pkl`` (the cluster ids) and
    ``Corel5k/cluster_contents.pkl`` (one list of words per cluster).

    :param alpha: weight of the visual matrix; the textual one gets 1 - alpha
    :param num_clusters: number of clusters/concepts requested
    :return: cluster ids for each tag, as returned by spectral_clustering
    """
    vis_sim_mat = utilites.loadVariableFromFile("Corel5k/tag_affinity_matrix_scaled.pkl")
    tex_sim_mat = utilites.loadVariableFromFile("Corel5k/tag_textual_similarity_matrix.pkl")
    tex_sim_mat = adjust_and_norm_affinity(tex_sim_mat)
    vis_sim_mat = expit(vis_sim_mat)

    # alpha merges the two matrices
    joint_mat = alpha * vis_sim_mat + (1 - alpha) * tex_sim_mat

    # eigen_solver: None, arpack, lobpcg, or amg
    cluster_ids = spectral_clustering(joint_mat, n_clusters=num_clusters, eigen_solver='arpack')
    print("Done...")

    # Word -> cluster id dictionary
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    word_centroid_map = dict(zip(words, cluster_ids))
    utilites.saveVariableToFile(cluster_ids, "Corel5k/concepts_ids.pkl")

    # Single grouping pass; subscripting dict.values() / dict.keys() raises
    # TypeError on Python 3 and was quadratic on Python 2.
    members = {}
    for word, cluster_id in word_centroid_map.items():
        members.setdefault(cluster_id, []).append(word)

    cluster_contents = []
    for cluster in range(num_clusters):
        print("\nCluster %d" % cluster)
        # a cluster that received no word is reported as an empty list
        r_words = members.get(cluster, [])
        print(r_words)
        cluster_contents.append(r_words)

    utilites.saveVariableToFile(cluster_contents, "Corel5k/cluster_contents.pkl")
    return cluster_ids
def __init__(self):
    """Populate the instance with models and features, then boot MATLAB.

    Reads the forest, the training vectors, the concept contents, the tag
    scores and the "classic" baseline features from ``static/Corel5K``, then
    opens a MATLAB session and loads the CNN model inside it.
    NOTE(review): the MATLAB root and CNN model path are absolute and look
    machine-specific.
    """
    data_root = "static/Corel5K/"
    baseline_root = data_root + "baseline_features/"
    from_pickle = utilites.loadVariableFromFile

    print ("Loading forest model...")
    tic()
    self.forest = from_pickle(data_root + "forest/forest_128.pkl")
    print ("Done.")
    toc()

    train_mat = loadmat(utilites.getAbsPath(data_root + "train_vectors_original.mat"))
    self.train_vectors = train_mat["train_vectors"]
    self.train_file_path = from_pickle(data_root + "train_file_path.pkl")

    # load contents of concepts
    self.concepts = from_pickle(data_root + "cluster_contents.pkl")
    self.tag_scores = from_pickle(data_root + "all_tags_scores.pkl")

    train_classic_mat = loadmat(
        utilites.getAbsPath(baseline_root + "corel5k_train_feats_classic.mat"))
    self.train_vectors_classic = train_classic_mat["corel5k_train_feats_classic"]

    test_classic_mat = loadmat(
        utilites.getAbsPath(baseline_root + "corel5k_test_feats_classic.mat"))
    self.test_vectors_classic = test_classic_mat["corel5k_test_feats_classic"]

    self.test_file_name = from_pickle(data_root + "corel5k_test_file_name.pkl")
    # test file name -> its classic feature vector
    self.feat_dict_classic = dict(zip(self.test_file_name, self.test_vectors_classic))

    # start a matlab session for feature extraction
    session = matlab_wrapper.MatlabSession(matlab_root="/Applications/MATLAB_R2015b.app")
    self.matlab = session
    session.eval("run MatConvNet/matlab/vl_setupnn")  # basic config
    session.eval("run vlfeat/toolbox/vl_setup")  # basic config
    session.eval("feature('DefaultCharacterSet', 'UTF8')")

    print ("Loading cnn model...")
    tic()
    session.eval(
        "net = load('/Users/TONYSUN/Desktop/SIR_Corel5K_demo/static/cnnmodel/imagenet-matconvnet-vgg-verydeep-16.mat')"
    )
    toc()
    print ("Matlab session started.")
    print ("Ready for work ^_^.")
def get_concept_anno():
    """
    Build the concept annotation matrix from the tag annotation matrix.

    For every training instance, each tag marked 1 in the tag annotation
    switches on the concept (cluster) that tag belongs to. The result is
    saved to ``Corel5k/train_anno_concept.pkl`` and progress is printed.

    The number of concepts is the number of distinct cluster ids.
    NOTE(review): this assumes the ids are contiguous, starting at 0;
    otherwise the largest id would fall outside the matrix -- confirm.

    :return: new concept based annotation matrix (instances x concepts)
    """
    cluster_ids = utilites.loadVariableFromFile("Corel5k/concepts_ids.pkl")
    # all tags, positionally aligned with cluster_ids
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    # get number of clusters by counting unique cluster ids
    num_clusters = len(set(cluster_ids))
    # load the original tag annotation matrix
    word_anno = utilites.loadVariableFromFile(
        "Corel5k/train_anno_filtered.pkl")

    # np.int was only an alias of the builtin int and no longer exists in
    # NumPy >= 1.24; dtype=int yields the same array.
    anno = np.zeros((len(word_anno), num_clusters), dtype=int)

    for i, tag_row in enumerate(word_anno):
        print('This is instance ' + str(i) + '.')
        # tag j (0-based) belongs to concept cluster_ids[j]
        for j, concept in enumerate(cluster_ids):
            if tag_row[j] == 1:
                anno[i][concept] = 1
                print("The words is " + words[j] +
                      ", and the concept is " + str(concept))

    utilites.saveVariableToFile(anno, "Corel5k/train_anno_concept.pkl")
    return anno
def get_concept_anno():
    """
    Build the concept annotation matrix from the tag annotation matrix.

    For every training instance, each tag marked 1 in the tag annotation
    switches on the concept (cluster) that tag belongs to. The result is
    saved to ``Corel5k/train_anno_concept.pkl`` and progress is printed.

    The number of concepts is the number of distinct cluster ids.
    NOTE(review): this assumes the ids are contiguous, starting at 0;
    otherwise the largest id would fall outside the matrix -- confirm.

    :return: new concept based annotation matrix (instances x concepts)
    """
    cluster_ids = utilites.loadVariableFromFile("Corel5k/concepts_ids.pkl")
    # all tags, positionally aligned with cluster_ids
    words = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl")
    # get number of clusters by counting unique cluster ids
    num_clusters = len(set(cluster_ids))
    # load the original tag annotation matrix
    word_anno = utilites.loadVariableFromFile("Corel5k/train_anno_filtered.pkl")

    # np.int was only an alias of the builtin int and no longer exists in
    # NumPy >= 1.24; dtype=int yields the same array.
    anno = np.zeros((len(word_anno), num_clusters), dtype=int)

    for i, tag_row in enumerate(word_anno):
        print('This is instance ' + str(i) + '.')
        # tag j (0-based) belongs to concept cluster_ids[j]
        for j, concept in enumerate(cluster_ids):
            if tag_row[j] == 1:
                anno[i][concept] = 1
                print("The words is " + words[j] + ", and the concept is " + str(concept))

    utilites.saveVariableToFile(anno, "Corel5k/train_anno_concept.pkl")
    return anno
def generate_forest():
    """
    Build a random forest from the original training vectors.

    The samples come from ``Corel5K/train_vectors_original.mat`` and the
    labels from ``Corel5K/train_anno_concept.pkl``. Forest generation is
    timed with tic()/toc().

    :return: the value returned by generate_random_forest(train_data, 400)
    """
    samples = loadmat(
        utilites.getAbsPath('Corel5K/train_vectors_original.mat'))['train_vectors']
    labels = utilites.loadVariableFromFile(
        utilites.getAbsPath("Corel5K/train_anno_concept.pkl"))

    # prepare data
    train_data = Data()
    train_data.samples = samples
    train_data.labels = labels
    # index arrays: one entry per sample, and one per feature column
    sample_count = len(samples)
    train_data.orig_sample_indexes = np.array(range(sample_count))
    feature_count = np.shape(samples)[1]
    train_data.features = np.array(range(feature_count))

    tic()
    rand_forest = generate_random_forest(train_data, 400)
    toc()
    return rand_forest
# Each image have 5 annotations # All the annotations are manually done and each one appears as a single sentence """ import gensim import utilites import setup from sklearn.metrics import pairwise # return text vectors calculated using Word2Vec by gensim """ open the annotation text file and read content build word vectors using Word2Vec and then extract the term/vector pairs into a dictionary """ # get all filtered term(tag) names terms_corel5k_filtered = utilites.loadVariableFromFile( "Corel5k/terms_corel5k_filtered.pkl") # get training image annotations: lists of separate terms train_anno_filtered = utilites.loadVariableFromFile( "Corel5k/train_anno_filtered.pkl") # initialize a model using parameters above word_model = gensim.models.Word2Vec.load_word2vec_format(utilites.getAbsPath( setup.lmodel_file_path), binary=True) """ Calculate similarity matrix using given vectors We use pairwise distances to build the matrix """ print("Extracting word vectors...") vecs = [] # Index2word is a list that contains the names of the words in
# All the annotations are manually done and each one appears as a single sentence """ import gensim import utilites import setup from sklearn.metrics import pairwise # return text vectors calculated using Word2Vec by gensim """ open the annotation text file and read content build word vectors using Word2Vec and then extract the term/vector pairs into a dictionary """ # get all filtered term(tag) names terms_corel5k_filtered = utilites.loadVariableFromFile("Corel5k/terms_corel5k_filtered.pkl") # get training image annotations: lists of separate terms train_anno_filtered = utilites.loadVariableFromFile("Corel5k/train_anno_filtered.pkl") # initialize a model using parameters above word_model = gensim.models.Word2Vec.load_word2vec_format(utilites.getAbsPath(setup.lmodel_file_path), binary=True) """ Calculate similarity matrix using given vectors We use pairwise distances to build the matrix """ print("Extracting word vectors...") vecs = [] # Index2word is a list that contains the names of the words in for word in terms_corel5k_filtered: vecs.append(word_model[word]) # now we extract all word vectors from the model
if tempBool: print( "Elapsed time: %f seconds.\n" %tempTimeInterval ) def tic(): # Records a time in TicToc, marks the beginning of a time interval toc(False) # we need to parse each test sample here # get all terms from txt file """ test_file = open(utilites.getAbsPath('static/Corel5K/corel5k_test_list.txt')) test_file_list = test_file.readlines() test_file_list = [term.strip().decode('utf-8').replace('\n', '') + '.jpeg' for term in test_file_list] utilites.saveVariableToFile(test_file_list, utilites.getAbsPath('static/Corel5K/corel5k_test_list.pkl')) """ test_file_list = utilites.loadVariableFromFile('static/Corel5K/corel5k_test_list.pkl') train_file_path = utilites.loadVariableFromFile("static/Corel5K/train_file_path.pkl") test_anno = utilites.loadVariableFromFile('static/Corel5K/test_anno_filtered.pkl') train_anno = utilites.loadVariableFromFile('static/Corel5K/train_anno_filtered.pkl') train_anno_concept = utilites.loadVariableFromFile("static/Corel5K/train_anno_concept.pkl") test_anno_concept = utilites.loadVariableFromFile("static/Corel5K/test_anno_concept.pkl") all_prob = utilites.loadVariableFromFile("static/Corel5K/all_probs.pkl") concepts = utilites.loadVariableFromFile("static/Corel5K/cluster_contents.pkl") train_vectors = loadmat(utilites.getAbsPath('static/Corel5K/train_vectors_original.mat')) train_vectors = train_vectors['train_vectors'] test_vectors = loadmat(utilites.getAbsPath('static/Corel5K/test_vectors_original.mat')) test_vectors = test_vectors['test_vectors'] train_vectors_classic = loadmat(utilites.getAbsPath('static/Corel5K/baseline_features/corel5k_train_feats_classic.mat')) train_vectors_classic = train_vectors_classic['corel5k_train_feats_classic']
pool.close()
pool.join()

# retain tags that appear at least 5 times: count the occurrences of every
# tag over the training annotations
tag_frequency = np.asarray(
    [sum(train_anno[:, te]) for te in range(len(terms_corel5k))])
# get the index of infrequent tags (fewer than 5 occurrences) in the tag list
r_infreq_tags = np.where(tag_frequency < 5)[0]
# thresholding: tags whose score is not above 1
scores_tags_th = np.where(np.asarray(scores_tags) <= 1)[0]
# integrate low frequency and low scored
filtered_index = np.union1d(scores_tags_th, r_infreq_tags)

lwords = utilites.loadVariableFromFile("Corel5k/words_of_lmodel.pkl")
# find tags which are not available in language model
no_avail_tags = list(set(terms_corel5k) - set(lwords))
# get index (first occurrence) of not available tags with one pass over the
# tag list instead of one list.index() scan per tag
first_term_index = {}
for term_pos, term in enumerate(terms_corel5k):
    first_term_index.setdefault(term, term_pos)
no_avail_index = [first_term_index[tag] for tag in no_avail_tags]
# Force an integer dtype: an empty Python list would otherwise promote the
# union to float64, which cannot be used as an index below.
filtered_index = np.union1d(filtered_index,
                            np.asarray(no_avail_index, dtype=int))
# get abandoned tags
filtered_tags = np.asarray(terms_corel5k)[filtered_index]
# filter tags
terms_corel5k_filtered = np.delete(terms_corel5k, filtered_index)
pool.close()
pool.join()

# retain tags that appear at least 5 times: count the occurrences of every
# tag over the training annotations
tag_frequency = np.asarray(
    [sum(train_anno[:, te]) for te in range(len(terms_corel5k))])
# get the index of infrequent tags (fewer than 5 occurrences) in the tag list
r_infreq_tags = np.where(tag_frequency < 5)[0]
# thresholding: tags whose score is not above 1
scores_tags_th = np.where(np.asarray(scores_tags) <= 1)[0]
# integrate low frequency and low scored
filtered_index = np.union1d(scores_tags_th, r_infreq_tags)

lwords = utilites.loadVariableFromFile("Corel5k/words_of_lmodel.pkl")
# find tags which are not available in language model
no_avail_tags = list(set(terms_corel5k) - set(lwords))
# get index (first occurrence) of not available tags with one pass over the
# tag list instead of one list.index() scan per tag
first_term_index = {}
for term_pos, term in enumerate(terms_corel5k):
    first_term_index.setdefault(term, term_pos)
no_avail_index = [first_term_index[tag] for tag in no_avail_tags]
# Force an integer dtype: an empty Python list would otherwise promote the
# union to float64, which cannot be used as an index below.
filtered_index = np.union1d(filtered_index,
                            np.asarray(no_avail_index, dtype=int))
# get abandoned tags
filtered_tags = np.asarray(terms_corel5k)[filtered_index]
# filter tags
def load_forest():
    """
    Read the forest that was previously stored on disk.

    :return: the object loaded from the forest pickle file
    """
    stored_forest = utilites.loadVariableFromFile(
        "Corel5K/forest_400_trees_64_feats/forest.pkl")
    return stored_forest