def __init__(self, title):
    self.title = title
    self.content = ""
    self.summary = ""
    self.contentTfIdf = {}
    self.summryTfIdf = {}
    self.summaryhyperlinks = []  # (word: link)
    self.hyperlinks = []
    self.SeeAlso = []
    self.Categories = []
    self.PrunedCategories = []
    self.successful = True
    if title not in variable.allTfIdf.keys():
        print("Title: " + title)
        Content = givePrunedContent(title, "NULL")
        if Content == "NULL":
            self.successful = False
        else:
            self.contentTfIdf = tfidf.TfIdf(Content)
            summary = giveSummary(title, "NULL")
            self.summryTfIdf = tfidf.TfIdf(summary)
            self.content = Content
            self.summary = summary
            variable.allTfIdf[title] = (self.contentTfIdf, self.summryTfIdf)
            variable.Allcontent[title] = (self.content, self.summary)
    else:
        self.contentTfIdf = variable.allTfIdf[title][0]
        self.summryTfIdf = variable.allTfIdf[title][1]
        self.content = variable.Allcontent[title][0]
        self.summary = variable.Allcontent[title][1]
def tf_idf(self, document):
    """Return the tf-idf scores for the ngrams of the document.

    num_grams specifies the level of grams: words, bigrams, trigrams, etc.
    The function creates an idf corpus for the subcorpora. If the document is
    not in the subcorpora, it is also added to the idf corpus, so that each
    word appears in at least one document. Stopwords are optional; if
    stopwords is True, they are generated from the
    stopword_percentage_threshold parameter.
    """
    path = '../stripped_text/'
    idf_path = "../" + str(self.num_grams) + "grams/" + self.subcorpora + \
        "_v_" + str(self.vary_defn) + "_sw_" + str(self.stopwords) + \
        "_s_" + str(self.spread) + ".txt"
    if os.path.exists(idf_path):
        # idf corpus already exists
        print "idf corpus exists."
        # create tfidf object with existing corpus
        _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, idf_path,
                             self.idf_dict, self.stopword_file)
        # determine the document's TLG#### filename
        filedict = file_dict('../ref_file.txt')
        docFilename = filedict[document][0]
    else:
        # idf corpus not yet in existence
        print "creating idf corpus."
        docFilename = self.add_docs(document)
        _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, None,
                             self.idf_dict, self.stopword_file)
    # actually determine the tf-idf scores for the ngrams in the document
    tfidf_list = _tfidf.get_doc_keywords(path + docFilename, self.num_grams)
    # print tfidf scores to .txt and .csv
    print "print tfidf scores to .txt and .csv"
    if document == self.doc1:
        print "printing doc1 to ", self.doc1_tfidf_file
        print_to_file_by_ngram(tfidf_list, self.doc1_tfidf_file)
        print_to_csv_file(self.doc1_tfidf_file, self.num_grams)
    elif document == self.doc2:
        print "printing doc2 to ", self.doc2_tfidf_file
        print_to_file_by_ngram(tfidf_list, self.doc2_tfidf_file)
        print_to_csv_file(self.doc2_tfidf_file, self.num_grams)
    # this should only happen if it's not already saved
    # save idf corpus for later use:
    # _tfidf.save_corpus_to_file(idf_path)
    return tfidf_list
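# --- Hedged usage sketch (not from the source) --------------------------------
# The enclosing comparison class is not shown above; the class name
# `NgramComparer` and its constructor arguments below are assumptions made only
# to illustrate how tf_idf() might be called for the two documents.
# comparer = NgramComparer(doc1="Homer", doc2="Hesiod", num_grams=2,
#                          subcorpora="epic", vary_defn=False,
#                          stopwords=True, spread=1)
# doc1_scores = comparer.tf_idf(comparer.doc1)  # list of (ngram, score) pairs
# doc2_scores = comparer.tf_idf(comparer.doc2)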
def findSim(keyword, pathcorpus):
    this_path = os.path.split(__file__)[0]
    pathcorpus = os.path.join(this_path, pathcorpus)
    # read and preprocess all corpus articles at once, storing them in a dictionary
    table = tfidf.TfIdf()
    articles = {}
    for item in os.listdir(pathcorpus):
        if item.endswith(".txt"):
            with open(pathcorpus + "/" + item, 'r', encoding="utf-8") as file:
                # articles[item] = lib1.prepro_base(file.read()).split()
                table.add_document(item, lib1.prepro_base(file.read()).split())
    keys = keyword.split()
    result = table.similarities(keys)
    res = []
    # for x, title in result, articles:
    #     if x[1]:
    #         res.append([x[0], (round(x[1], 3) * 100), title.value()])
    for x in result:
        if x[1]:
            with open(pathcorpus + '/' + x[0], 'r', encoding="utf-8") as file:
                res.append([x[0], x[1], file.readline()])
    print(res)
    return res
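# Hedged usage sketch: assuming a "corpus" directory of .txt articles sits next
# to this module (the directory name and the query below are illustrative, not
# taken from the source), findSim() can be driven like this:
if __name__ == "__main__":
    for filename, score, first_line in findSim("machine learning", "corpus"):
        print(filename, round(score, 3), first_line.strip())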
def main():
    clanky = getarticles.getarticles()  # load the articles from RSS
    table = tfidf.TfIdf()  # initialise TfIdf
    for clanek in clanky:
        tokeny = tokenize.tokenize(clanek['text'])  # tokens are the base forms of the words
        clanek['tokeny'] = tokeny
        table.add_document(clanek['url'], tokeny)  # add the article to tfidf for processing
    pocet_souvislosti = 0  # total number of relations found; useful when tuning the threshold
    for clanek in clanky:
        print("------------------")
        print(clanek['nadpis'])
        print("source: " + clanek['url'])
        for podobnost in table.similarities(clanek['tokeny']):
            # podobnost[0] is the article URL, podobnost[1] the similarity score
            if podobnost[1] > THRASH_HOLD and podobnost[0] != clanek['url']:
                # the most similar article is always the article itself
                print("related: " + podobnost[0])
                pocet_souvislosti += 1
    print("------------------")
    print("relations found: ", pocet_souvislosti)
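# The function above references a module-level THRASH_HOLD constant that is not
# part of this excerpt; a minimal hedged sketch of how it might be declared and
# the entry point invoked (the 0.1 value is an assumption, not the source's):
THRASH_HOLD = 0.1

if __name__ == "__main__":
    main()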
def test_read(self):
    '''Test reading in a file as a string'''
    my_t = t.TfIdf()
    result = my_t.read_file('./text-files/a-drinking-song-yeats.txt')
    self.assertEqual(
        result,
        'Wine comes in at the mouth\nAnd love comes in at the eye;\n'
        'That\'s all we shall know for truth\nBefore we grow old and die.\n'
        'I lift the glass to my mouth,\nI look at you, and I sigh.'
    )
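# A minimal sketch of a read_file() that would satisfy the test above. It is an
# assumption, not the project's implementation; stripping the trailing newline
# is guessed from the expected string, which ends without one.
def read_file(self, path):
    """Return the contents of `path` as a single string."""
    with open(path) as handle:
        return handle.read().rstrip("\n")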
def test_similarity(self):
    table = tfidf.TfIdf()
    table.add_document("foo", ["a", "b", "c", "d", "e", "f", "g", "h"])
    table.add_document("bar", ["a", "b", "c", "i", "j", "k"])
    table.add_document("baz", ["k", "l", "m", "n"])
    self.assertEqual(table.similarities(["a", "b", "c"]),
                     [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]])
def test_string_short(self):
    '''Test breaking up a string'''
    my_t = t.TfIdf()
    test_string = '''How now brown Cow? Peter piper.'''
    result = my_t.string_to_list(test_string)
    self.assertEqual(result,
                     ['how', 'now', 'brown', 'cow', 'peter', 'piper'])
def test_line_breaks(self):
    '''Test string with line breaks'''
    my_t = t.TfIdf()
    test_string = ("Wine comes in at the mouth\nAnd love comes in at the eye;\n"
                   "That's all we shall know for truth\nBefore")
    result = my_t.string_to_list(test_string)
    self.assertEqual(result, [
        'wine', 'comes', 'in', 'at', 'the', 'mouth', 'and', 'love', 'comes',
        'in', 'at', 'the', 'eye', "that's", 'all', 'we', 'shall', 'know',
        'for', 'truth', 'before'
    ])
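# A hedged sketch of string_to_list() consistent with the two tests above (an
# assumption, not the project's implementation): lowercase, split on any
# whitespace, and strip surrounding punctuation while keeping internal
# apostrophes such as "that's".
import string

def string_to_list(self, text):
    words = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)
        if token:
            words.append(token)
    return words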
def testGetIdf(self):
    """Test querying the IDF for existent and nonexistent terms."""
    my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt",
                           DEFAULT_IDF=DEFAULT_IDF_UNITTEST)
    # Test querying for nonexistent terms.
    self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("nonexistent"))
    self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("THE"))
    self.assertTrue(my_tfidf.get_idf("a") > my_tfidf.get_idf("the"))
    self.assertAlmostEquals(my_tfidf.get_idf("girl"), my_tfidf.get_idf("moon"))
def create_and_save_idf_corpus(self):
    # should check if path exists yet
    idf_path = "../" + str(self.num_grams) + "grams/" + self.subcorpora + \
        "_v_" + str(self.vary_defn) + "_sw_" + str(self.stopwords) + \
        "_s_" + str(self.spread) + ".txt"
    # compile_idf_dict() should check that each file has been made
    num_docs = self.compile_idf_dict()
    _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, None, self.idf_dict,
                         self.stopword_file)
    _tfidf.set_num_docs(num_docs)
    _tfidf.save_corpus_to_file(idf_path)
def testKeywords(self):
    """Test retrieving keywords from a document, ordered by tf-idf."""
    my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt", DEFAULT_IDF=0.01)
    # Test retrieving keywords when there is only one keyword.
    keywords = my_tfidf.get_doc_keywords("the spoon and the fork")
    self.assertEqual("the", keywords[0][0])
    # Test retrieving multiple keywords.
    keywords = my_tfidf.get_doc_keywords("the girl said hello over the phone")
    self.assertEqual("girl", keywords[0][0])
    self.assertEqual("phone", keywords[1][0])
    self.assertEqual("said", keywords[2][0])
    self.assertEqual("the", keywords[3][0])
def testNoCorpusFiles(self):
    my_tfidf = tfidf.TfIdf(DEFAULT_IDF=DEFAULT_IDF_UNITTEST)
    self.assertEquals(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("moon"))
    self.assertEquals(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
    self.assertEquals(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("said"))
    my_tfidf.add_input_document("moon")
    my_tfidf.add_input_document("moon said hello")
    self.assertEquals(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 1),
                            my_tfidf.get_idf("said"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 2),
                            my_tfidf.get_idf("moon"))
def wiki_query(query):
    query = query.replace(' ', '+')
    query = query.replace(',', '|')
    page_info = ''
    url = ('http://en.wikipedia.org/w/api.php?format=json&action=query&titles='
           + query + '&prop=revisions&rvprop=content')
    jsonVal = json.loads(urllib2.urlopen(url).read())
    if jsonVal:
        for page in jsonVal["query"]["pages"]:
            i = jsonVal["query"]["pages"][page]
            if "revisions" in i:
                page_info = i["revisions"]
                tfobj = tf.TfIdf()
                input_doc = tfobj.add_input_document(str(page_info))
                keywords = tfobj.get_idf(str(query))
                return keywords
    return page_info
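# Hedged usage sketch; the query string is illustrative, and a real run needs
# network access to the Wikipedia API plus the json/urllib2/tf imports assumed
# by the function above.
if __name__ == "__main__":
    print(wiki_query("Albert Einstein"))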
def testStopwordFile(self):
    my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt", "tfidf_teststopwords.txt",
                           DEFAULT_IDF=DEFAULT_IDF_UNITTEST)
    self.assertEquals(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
    self.assertEquals(0, my_tfidf.get_idf("moon"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 5),
                            my_tfidf.get_idf("said"))
    my_tfidf.add_input_document("moon")
    my_tfidf.add_input_document("moon and water")
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 1),
                            my_tfidf.get_idf("water"))
    self.assertEquals(0, my_tfidf.get_idf("moon"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 5),
                            my_tfidf.get_idf("said"))
def create_stopword_file(self, subcorpora, stopword_percentage_threshold):
    """Creates a stopword file. Returns stopword filename."""
    _tfidf = tfidf.TfIdf()
    filedict = file_dict('../ref_file.txt')
    path = '../stripped_text/'
    for filename in filedict[self.subcorpora]:
        print filename
        _tfidf.add_input_document(path + filename)
    print str(stopword_percentage_threshold)
    _tfidf.save_corpus_to_file(
        "../1grams/" + self.subcorpora + ".txt",
        "../stopwords/" + self.subcorpora + "_" +
        str(stopword_percentage_threshold) + ".txt",
        stopword_percentage_threshold)
    # returns the stopword filename
    return ("../stopwords/" + self.subcorpora + "_" +
            str(stopword_percentage_threshold) + ".txt")
def testAddCorpus(self):
    """Test adding input documents to the corpus."""
    my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt",
                           DEFAULT_IDF=DEFAULT_IDF_UNITTEST)
    self.assertEquals(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 1),
                            my_tfidf.get_idf("moon"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 5),
                            my_tfidf.get_idf("said"))
    my_tfidf.add_input_document("water, moon")
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 1),
                            my_tfidf.get_idf("water"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 2),
                            my_tfidf.get_idf("moon"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 5),
                            my_tfidf.get_idf("said"))
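# The corpus tests above compare against a helper
# get_exected_idf(num_docs, num_docs_containing_term) that is not shown in this
# excerpt. One common add-one-smoothed formulation is sketched below; it is an
# assumption, and the real helper may use a different formula.
import math

def get_exected_idf(num_docs_total, num_docs_term):
    return math.log(float(1 + num_docs_total) / (1 + num_docs_term))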
def main():
    testpath = "/Accounts/groenemm/summer/repo/tfidf/cachedtexts"
    testfiles = os.listdir(testpath)
    maxrank = 10
    verbose = True
    t = tfidf.TfIdf(corpus_filename="reformattedfreqlist.txt",
                    stopword_filename="stopwords.txt")
    sc = sphinx_inter.SphinxClient("dmusican41812", rankingmode=SPH_RANK_BM25,
                                   fieldweights={"title": 4, "body": 1})
    # tests = importTestCites("citelist.txt")
    testresults = TestResults()
    for path, pageid in [(os.path.join(testpath, testfile), int(testfile))
                         for testfile in testfiles]:
        with open(path) as f:
            text = f.read()
        if text is None or text == "":
            continue
        testresults.addResult(*testCitation(pageid, text, getCalaisQuery,
                                            maxrank, sc, t, verbose))
    testresults.printSummary()
def build_idf(self, description_column_name, out_file=None, csv_location=None):
    """Build an IDF vector over the corpus descriptions.

    :param description_column_name: corpus column containing the text entries
    :param out_file: optional filename for a CSV dump of the IDF vector
    :param csv_location: directory in which to write out_file
    :return: the IDF values and their terms, as two parallel lists
    """
    idfcalc = tfidf.TfIdf()
    for entry in self.corpus.loc[:, description_column_name].values:
        idfcalc.add_input_document(entry)
    idf_list = []
    term_list = []
    for term in idfcalc.term_num_docs:
        idf = idfcalc.get_idf(term)
        idf_list.append(idf)
        term_list.append(term)
    idf_vector = pd.Series(idf_list, index=term_list)
    idf_vector = idf_vector.sort_values(ascending=False)
    if out_file:
        if csv_location:
            idf_vector.to_csv(csv_location + '/' + out_file)
        else:
            print("Error: no location specified for output csv")
    self.idf_vector_created = True
    self.idf_vector = idf_vector
    return idf_list, term_list
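# Standalone hedged sketch of the same IDF-vector pattern, driven directly with
# tfidf.TfIdf and a toy pandas DataFrame. The column name "description" mirrors
# the method above; the example rows are made up.
import pandas as pd
import tfidf

toy_corpus = pd.DataFrame({"description": ["red woollen scarf",
                                           "blue cotton scarf",
                                           "red leather wallet"]})
idfcalc = tfidf.TfIdf()
for entry in toy_corpus.loc[:, "description"].values:
    idfcalc.add_input_document(entry)
idf_vector = pd.Series({term: idfcalc.get_idf(term)
                        for term in idfcalc.term_num_docs})
print(idf_vector.sort_values(ascending=False))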
import tfidf
import numpy
import collections

fai_result = {}
data = tfidf.TfIdf()
fai = data.csv2dict("/home/wzswan/Downloads/github/DD-LSTW/fai.csv")
fai_count = collections.Counter(fai)
pe = data.csv2dict("/home/wzswan/Downloads/github/DD-LSTW/pe.csv")
stock = data.csv2dict("/home/wzswan/Downloads/github/DD-LSTW/stock.csv")
print fai
print fai_count
#print pe
#print stock

table = tfidf.TfIdf()
table.add_document("FAI", fai)
table.add_document("PE", pe)
table.add_document("STOCK_INDEX", stock)
#print table
#print "key 16 is:%s" % (table.similarities(["16"]))
fai_result = {'24': table.similarities(["24"]),
              '25': table.similarities(["25"]),
              '26': table.similarities(["26"]),
              '27': table.similarities(["27"]),
              '20': table.similarities(["20"]),
              '21': table.similarities(["21"]),
              '22': table.similarities(["22"]),
              '23': table.similarities(["23"]),
              '28': table.similarities(["28"]),
              '29': table.similarities(["29"]),
              '38': table.similarities(["38"]),
              '15': table.similarities(["15"]),
              '17': table.similarities(["17"]),
              '16': table.similarities(["16"]),
              '33': table.similarities(["33"]),
              '18': table.similarities(["18"]),
              '30': table.similarities(["30"]),
              '37': table.similarities(["37"]),
import tfidf
from stopwords import *

# test1, all parameters set to default
mytfidf = tfidf.TfIdf()
mytfidf.add_input_document("../test_files/kowari.txt")
mytfidf.add_input_document("../test_files/platypus.txt")
mytfidf.save_corpus_to_file(idf_filename="test1.txt")

# test2, turn on variant word order
mytfidf = tfidf.TfIdf(variant_word_order=True)
mytfidf.add_input_document("../test_files/kowari.txt")
mytfidf.add_input_document("../test_files/platypus.txt")
mytfidf.save_corpus_to_file(idf_filename="test2.txt")

# test3, try with trigrams
mytfidf = tfidf.TfIdf(ngram_size=3)
mytfidf.add_input_document("../test_files/kowari.txt")
mytfidf.add_input_document("../test_files/platypus.txt")
mytfidf.save_corpus_to_file(idf_filename="test3.txt")

# test4, try giving it a window of 4
mytfidf = tfidf.TfIdf(window=4)
mytfidf.add_input_document("../test_files/kowari.txt")
mytfidf.add_input_document("../test_files/platypus.txt")
mytfidf.save_corpus_to_file(idf_filename="test4.txt")
import tfidf

if __name__ == "__main__":
    table = tfidf.TfIdf()
    table.add_document("foo", ["a", "b", "c", "d", "e", "f", "g", "h"])
    table.add_document("bar", ["a", "b", "c", "i", "j", "k"])
    table.add_document("baz", ["a", "l", "m", "n"])
    table.add_document("taz", ["t"])
    print table.similarities(["a", "l", "m", "n"])
    print table.similarities(["t"])
    # [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]]
def run(pathImages, method, keypnt, numpatch, equalnum, imdes, imsample, percentage, codebook, dist, size, fselec, fselec_perc, histnorm, clust, K, pca, nclusters, rep, levels): ################################################################# # # Initializations and result file configurations # ################################################################# #warnings.simplefilter("error") if os.path.exists('save_HIST.txt') == True: os.remove('save_HIST.txt') if os.path.exists('save_dist.txt') == True: os.remove('save_dist.txt') if os.path.exists('saveClustersKmeans.txt') == True: os.remove('saveClustersKmeans.txt') im_dataset_name = pathImages.split('/')[-1] date_time = datetime.datetime.now().strftime('%b-%d-%I%M%p-%G') name_results_file = im_dataset_name + '_' + keypnt + '_' + str( numpatch ) + '_' + str(equalnum) + '_' + imdes + '_' + 'levels:' + str( levels ) + '_' + imsample + '_' + codebook + '_' + str( size ) + '_' + fselec + '_' + histnorm + '_' + clust + '_' + dist + '_' + date_time #dir_results = 'Results_' + im_dataset_name + '_SPM_' + date_time dir_results = 'Results_SPM' if not os.path.exists(dir_results): os.makedirs(dir_results) file_count = 2 file_name = os.path.join(dir_results, name_results_file) while os.path.exists(file_name + ".txt"): file_name = os.path.join(dir_results, name_results_file) + "_" + str(file_count) file_count = file_count + 1 f = open(file_name + ".txt", 'w') ################################################################# # # Get images # ################################################################# #pathImages = '/Users/Mariana/mieec/Tese/Development/ImageDatabases/Graz-01_sample' imList = get_imlist(pathImages) print 'Number of images read = ' + str(len(imList)) f.write("Number of images in dataset read: " + str(len(imList)) + "\n") ################################################################# # # Image description # ################################################################# #Number of regions n_regions = np.power(4, levels - 1) #Get detector classes det_sift = siftLib.Sift(numpatch / n_regions, equalnum) det_surf = surfLib.Surf(numpatch / n_regions, equalnum) det_fast = fastDetector.Fast(numpatch / n_regions, equalnum) det_star = starDetector.Star(numpatch / n_regions, equalnum) det_orb = orbLib.Orb(numpatch / n_regions, equalnum) det_random = randomDetector.Random(numpatch / n_regions) names_detectors = np.array( ["SIFT", "SURF", "FAST", "STAR", "ORB", "RANDOM"]) detectors = np.array( [det_sift, det_surf, det_fast, det_star, det_orb, det_random]) #Get the detector passed in the -k argument index = np.where(names_detectors == keypnt)[0] if index.size > 0: detector_to_use = detectors[index[0]] else: print 'Wrong detector name passed in the -k argument. Options: SIFT, SURF, FAST, STAR, ORB and RANDOM' sys.exit() #FOR RESULTS FILE detector_to_use.writeParametersDet(f) #Get descriptor classes des_sift = siftLib.Sift(numpatch / n_regions, equalnum) des_surf = surfLib.Surf(numpatch / n_regions, equalnum) des_orb = orbLib.Orb(numpatch / n_regions) des_brief = briefDescriptor.Brief() des_freak = freakDescriptor.Freak() names_descriptors = np.array(["SIFT", "SURF", "ORB", "BRIEF", "FREAK"]) descriptors = np.array([des_sift, des_surf, des_orb, des_brief, des_freak]) #Get the detector passed in the -d argument index = np.where(names_descriptors == imdes)[0] if index.size > 0: descriptor_to_use = descriptors[index[0]] else: print 'Wrong descriptor name passed in the -d argument. 
Options: SIFT, SURF, ORB, BRIEF and FREAK' sys.exit() #FOR RESULTS FILE descriptor_to_use.writeParametersDes(f) kp_vector = [] #vector with the keypoints object des_vector = [ ] #vector wih the descriptors (in order to obtain the codebook) number_of_kp = [] #vector with the number of keypoints per image counter = 1 #save current time start_time = time.time() labels = [] class_names = [] #Border border = 40 side = int(np.sqrt(n_regions)) des_vector_byregion = [0] * n_regions number_of_kp_region = [0] * n_regions filled = [0] * n_regions #matrixes of the indexes mat_indexes = np.array([[0, 1, 4, 5, 16, 17, 20, 21], [2, 3, 6, 7, 18, 19, 22, 23], [8, 9, 12, 13, 24, 25, 28, 29], [10, 11, 14, 15, 26, 27, 30, 31], [32, 33, 36, 37, 48, 49, 52, 53], [34, 35, 38, 39, 50, 51, 54, 55], [40, 41, 44, 45, 56, 57, 60, 61], [42, 43, 46, 47, 58, 59, 62, 63]]) #detect the keypoints and compute the sift descriptors for each image for im in imList: if 'DS_Store' not in im: print 'image: ' + str(im) + ' number: ' + str(counter) #read image img = cv2.imread(im, 0) # region for i in range(0, side): for j in range(0, side): #mask in order to avoid keypoints in border of image. size = 40 pixels height, width = img.shape h_region = (height - 2 * border) / np.sqrt(n_regions) w_region = (width - 2 * border) / np.sqrt(n_regions) mask = np.zeros(img.shape, dtype=np.uint8) mask[border + i * h_region:border + (i + 1) * h_region, border + j * w_region:border + (j + 1) * w_region] = 1 #get keypoints from detector kp = detector_to_use.detectKp(img, mask) #get features from descriptor des = descriptor_to_use.computeDes(img, kp) number_of_kp.append(len(kp)) #print i*np.sqrt(n_regions)+j #print number_of_kp_region[int(i*np.sqrt(n_regions)+j)] if filled[mat_indexes[i, j]] == 1: #descriptors of all the regions (in a list) des_vector_byregion[mat_indexes[ i, j]] = np.concatenate( (des_vector_byregion[mat_indexes[i, j]], des), axis=0) #number of descriptors in each region number_of_kp_region[mat_indexes[ i, j]] = np.concatenate( (number_of_kp_region[mat_indexes[i, j]], np.array([len(kp)])), axis=0) else: des_vector_byregion[mat_indexes[i, j]] = des number_of_kp_region[mat_indexes[i, j]] = np.array( [len(kp)]) filled[mat_indexes[i, j]] = 1 #print des_vector_byregion #print number_of_kp_region #for evaluation name1 = im.split("/")[-1] name = name1.split("_")[0] if name in class_names: index = class_names.index(name) labels.append(index) else: class_names.append(name) index = class_names.index(name) labels.append(index) counter += 1 #measure the time to compute the description of each image (divide time elapsed by # of images) elapsed_time = (time.time() - start_time) / len(imList) print 'Time to compute detector and descriptor for each image = ' + str( elapsed_time) f.write( 'Average time to compute detector and descriptor for each image = ' + str(elapsed_time) + '\n') n_images = counter - 1 average_words = sum(number_of_kp) / float(len(number_of_kp)) #all the descriptors together des_vector = np.concatenate(np.array(des_vector_byregion)) print 'Total number of features = ' + str(len(des_vector)) f.write('Total number of features obtained = ' + str(len(des_vector)) + '\n') print 'Average number of keypoints per image = ' + str(average_words) f.write('Average number of keypoints per image = ' + str(average_words) + '\n') ################################################################# # # Image and Keypoint sampling # ################################################################# rand_indexes = [] nmi_indexes = [] for iteraction 
in range(0, rep): print "\nIteraction #" + str(iteraction + 1) + '\n' f.write("\nIteraction #" + str(iteraction + 1) + '\n') print 'Sampling images and keypoints prior to codebook computation...' if imsample != "NONE": sampleKp = sampleKeypoints.SamplingImandKey( n_images, number_of_kp, average_words, percentage) sampleallKp = sampleAllKeypoints.SamplingAllKey(percentage) names_sampling = np.array(["SAMPLEI", "SAMPLEP"]) sample_method = np.array([sampleKp, sampleallKp]) #Get the detector passed in the -g argument index = np.where(names_sampling == imsample)[0] if index.size > 0: sampling_to_use = sample_method[index[0]] else: print 'Wrong sampling method passed in the -g argument. Options: NONE, SAMPLEI, SAMPLEP' sys.exit() #FOR RESULTS FILE sampling_to_use.writeFile(f) des_vector_sampled = sampling_to_use.sampleKeypoints(des_vector) print 'Total number of features after sampling = ' + str( len(des_vector_sampled)) f.write('Total number of features after sampling = ' + str(len(des_vector_sampled)) + '\n') print 'Images and keypoints sampled...' else: print 'No sampling method chosen' #FOR RESULTS FILE f.write( "No method of keypoint sampling chosen. Use all keypoints for codebook construction \n" ) des_vector_sampled = des_vector ################################################################# # # Codebook computation # ################################################################# print 'Obtaining codebook...' #save current time start_time = time.time() #Get detector classes codebook_kmeans = KMeans1.KMeans1(size) codebook_birch = Birch.Birch(size) codebook_minibatch = minibatch.MiniBatch(size) codebook_randomv = randomSamplesBook.RandomVectors(size) codebook_allrandom = allrandom.AllRandom(size) names_codebook = np.array( ["KMEANS", "BIRCH", "MINIBATCH", "RANDOMV", "RANDOM"]) codebook_algorithm = np.array([ codebook_kmeans, codebook_birch, codebook_minibatch, codebook_randomv, codebook_allrandom ]) #Get the detector passed in the -c argument index = np.where(names_codebook == codebook)[0] if index.size > 0: codebook_to_use = codebook_algorithm[index[0]] else: print 'Wrong codebook construction algorithm name passed in the -c argument. Options: KMEANS, MINIBATCH, RANDOMV and RANDOM' sys.exit() #FOR RESULTS FILE codebook_to_use.writeFileCodebook(f) #Get centers and projections using codebook algorithm centers, projections = codebook_to_use.obtainCodebook( des_vector_sampled, des_vector) #compute the number of unique descriptor vectors codebook_randomv.unique_vectors(centers) elapsed_time = (time.time() - start_time) print 'Time to compute codebook = ' + str(elapsed_time) f.write('Time to compute codebook = ' + str(elapsed_time) + '\n') ################################################################# # # Obtain Histogram # ################################################################# des_byregion = des_vector_byregion numkp_region = number_of_kp_region hist_total = [] for level in range(levels - 1, -1, -1): print 'Level = ' + str(level) n_regions = np.power(4, level) for i in range(0, n_regions): print 'Obtaining histograms...' 
#print 'projection shape = '+ str(projections.shape) #print 'size = ' + str(size) #print 'n of images = ' + str(n_images) #print 'number of kp' + str(number_of_kp) #print len(des_vector_byregion) #print len(des_vector_byregion[0]) #print len(des_vector_byregion[0][0]) result = scipy.cluster.vq.vq(np.array(des_byregion[i]), centers) projections_region = result[0] #print 'projections = ' + str(projections_region) #print n_images #print number_of_kp_region[i] #print len(number_of_kp_region) #print len(number_of_kp_region[0]) hist = histogram.computeHist(projections_region, size, n_images, numkp_region[i]) #print hist print 'Histograms obtained' #print hist ################################################################ # # Feature selection # ################################################################# print 'Number of visual words = ' + str(len(hist[0])) if fselec != "NONE": print 'Applying feature selection to descriptors...' filter_max = filterMax.WordFilterMax(fselec_perc[0]) filter_min = filterMin.WordFilterMin(fselec_perc[1]) filter_maxmin = filterMaxMin.WordFilterMaxMin( fselec_perc[0], fselec_perc[1]) names_filter = np.array(["FMAX", "FMIN", "FMAXMIN"]) filter_method = np.array( [filter_max, filter_min, filter_maxmin]) #Get the detector passed in the -f argument index = np.where(names_filter == fselec)[0] if index.size > 0: filter_to_use = filter_method[index[0]] else: print 'Wrong codebook construction algorithm name passed in the -f argument. Options: NONE, FMAX, FMIN, FMAXMIN' sys.exit() hist = filter_to_use.applyFilter(hist, size, n_images) #FOR RESULTS FILE filter_to_use.writeFile(f) new_size = hist.shape[1] print 'Visual words Filtered' print 'Number of visual words filtered = ' + str(size - new_size) f.write("Number of visual words filtered = " + str(size - new_size) + '\n') print 'Final number of visual words = ' + str(new_size) f.write('Final number of visual words = ' + str(new_size) + '\n') else: #FOR RESULTS FILE filter_min = filterMin.WordFilterMin(0) hist = filter_min.applyFilter(hist, size, n_images) new_size = hist.shape[1] print 'Number of visual words filtered = ' + str(size - new_size) f.write("No feature selection applied \n") ################################################################# # # Histogram Normalization # ################################################################# if histnorm != "NONE": #Get detector classes norm_sbin = simpleBinarization.SimpleBi() norm_tfnorm = tfnorm.Tfnorm() norm_tfidf = tfidf.TfIdf() norm_tfidf2 = tfidf2.TfIdf2() norm_tfidfnorm = tfidfnorm.TfIdfnorm() norm_okapi = okapi.Okapi(average_words) names_normalization = np.array([ "SBIN", "TFNORM", "TFIDF", "TFIDF2", "TFIDFNORM", "OKAPI" ]) normalization_method = np.array([ norm_sbin, norm_tfnorm, norm_tfidf, norm_tfidf2, norm_tfidfnorm, norm_okapi ]) #Get the detector passed in the -h argument index = np.where(names_normalization == histnorm)[0] if index.size > 0: normalization_to_use = normalization_method[index[0]] new_hist = normalization_to_use.normalizeHist( hist, new_size, n_images) else: print 'Wrong normalization name passed in the -h argument. 
Options: SBIN, TFNORM, TFIDF and TFIDF2' sys.exit() #FOR RESULTS FILE normalization_to_use.writeFile(f) else: #FOR RESULTS FILE f.write("No histogram normalization applied\n") new_hist = hist hist_total.append(np.array(new_hist)) #concatenate des_vector_byregion TODOOOOOOOOOO des_vector_aux = [] number_of_kp_aux = [] if level != 0: side = 4 ntimes = int(np.power(4, level - 1)) for h in range(0, ntimes): #print len(des_byregion) #print h*side #print (h+1)*side des_vector_aux.append( np.concatenate(des_byregion[h * side:(h + 1) * side], axis=0)) count = 0 for n in numkp_region[h * side:(h + 1) * side]: if count != 0: sum_np = [sum(x) for x in zip(sum_np, n)] else: sum_np = n count = count + 1 number_of_kp_aux.append(sum_np) des_byregion = des_vector_aux numkp_region = number_of_kp_aux #print hist_total hist_total = np.concatenate(hist_total, axis=1) print len(hist_total[0]) ################################################################# # # Clustering of the features # ################################################################# #save current time start_time = time.time() #Get detector classes clust_dbscan = Dbscan.Dbscan(dist) clust_kmeans = KMeans1.KMeans1([nclusters]) clust_birch = Birch.Birch(nclusters) clust_meanSift = meanSift.MeanSift(nclusters) clust_hierar1 = hierarchicalClustering.Hierarchical(nclusters, dist) clust_hierar2 = hierarchicalClustScipy.HierarchicalScipy(dist) clust_community = communityDetection.CommunityDetection(dist) names_clustering = np.array([ "DBSCAN", "KMEANS", "BIRCH", "MEANSIFT", "HIERAR1", "HIERAR2", "COMM" ]) clustering_algorithm = np.array([ clust_dbscan, clust_kmeans, clust_birch, clust_meanSift, clust_hierar1, clust_hierar2, clust_community ]) #Get the detector passed in the -a argument index = np.where(names_clustering == clust)[0] if index.size > 0: clustering_to_use = clustering_algorithm[index[0]] else: print 'Wrong clustering algorithm name passed in the -a argument. 
Options: DBSCAN, KMEANS, BIRCH, MEANSIFT, HIERAR1, HIERAR2, COMM' sys.exit() clusters = clustering_to_use.obtainClusters(hist_total) #FOR RESULTS FILE clustering_to_use.writeFileCluster(f) elapsed_time = (time.time() - start_time) print 'Time to run clustering algorithm = ' + str(elapsed_time) f.write('Time to run clustering algorithm = ' + str(elapsed_time) + '\n') print 'Number of clusters obtained = ' + str(max(clusters) + 1) f.write('Number of clusters obtained = ' + str(max(clusters) + 1) + '\n') print 'Clusters obtained = ' + str(np.asarray(clusters)) #date_time = datetime.datetime.now().strftime('%b-%d-%I%M%p-%G') #np.savetxt('saveClusters_'+date_time+'_.txt', clusters, '%i', ',') ##ADDED ################################################################## ## ## Create folder with central images for each cluster ## ################################################################## #dir_results = 'Results_' + im_dataset_name + '_SPM_' + date_time ##obtain representative images for each cluster #central_ims = clust_community.obtainCenteralImages(new_hist, clusters) #central_folder = os.path.join(dir_results,'CenterImages') #if not os.path.exists(central_folder): #os.makedirs(central_folder) #count=0 #for central_im in central_ims: #filename = os.path.join(central_folder,'Cluster_'+str(count)+'.jpg') #img = cv2.imread(imPaths[central_im],1) #cv2.imwrite(filename, img) #count = count + 1 ##ADDED ################################################################## ## ## Separate Clusters into folders ## ################################################################## #clusters_folder = os.path.join(dir_results,'Clusters') #if not os.path.exists(clusters_folder): #os.makedirs(clusters_folder) #clust_dir = [] #for iclust in range(0,nclusters): #direc = os.path.join(clusters_folder,'Cluster_'+str(iclust)) #if not os.path.exists(direc): #os.makedirs(direc) #clust_dir.append(direc) #for im in range(0,len(imPaths)): #im_name = imPaths[im].split('/')[-1] ##print clust_dir[int(clusters[im])] #filename = os.path.join(clust_dir[int(clusters[im])],im_name) ##print filename #img = cv2.imread(imPaths[im],1) #cv2.imwrite(filename, img) ################################################################# # # Evaluation # ################################################################# users = 0 if users == 1: rand_index = evaluationUsers.randIndex(clusters) rand_indexes.append(rand_index) print 'rand_index = ' + str(rand_index) f.write("Rand Index = " + str(rand_index) + "\n") else: if len(clusters) == len(labels): f.write("\nResults\n") f.write('Clusters Obtained = ' + str(np.asarray(clusters))) f.write('Labels = ' + str(np.asarray(labels))) rand_index = metrics.adjusted_rand_score(labels, clusters) rand_indexes.append(rand_index) print 'rand_index = ' + str(rand_index) f.write("Rand Index = " + str(rand_index) + "\n") NMI_index = metrics.normalized_mutual_info_score( labels, clusters) nmi_indexes.append(NMI_index) print 'NMI_index = ' + str(NMI_index) f.write("NMI Index = " + str(NMI_index) + "\n") if rep > 1: f.write("\nFINAL RESULTS\n") f.write("Avg Rand Index = " + str(float(sum(rand_indexes)) / rep) + "\n") f.write("Std Rand Index = " + str(statistics.stdev(rand_indexes)) + "\n") f.write("Avg NMI Index = " + str(float(sum(nmi_indexes)) / rep) + "\n") f.write("Std NMI Index = " + str(statistics.stdev(nmi_indexes)) + "\n") f.close()
                prob *= smooth_prob
            else:
                prob *= self.type_model[source][word] / self.get_data_num(source)
        return prob

    max_prob, result = 0, ''
    for i in range(len(self.sources)):
        prob = get_prob(vec, self.sources[i])
        if prob > max_prob:
            max_prob, result = prob, self.sources[i]
    return result


if __name__ == '__main__':
    tfidf = tfidf.TfIdf()
    model = Model()
    paras = os.listdir('test')
    #print(paras)
    count, right = 0, 0
    for source in model.sources:
        if source in paras:
            papers = os.listdir('test/' + source)
            for paper in papers:
                with open('/'.join(['test', source, paper])) as file:
                    vec = tfidf.paragraph2vec(file.read())
                    if len(vec) < 10:
                        continue
                    print(vec)
                    result = model.classify(vec)
                    if result == source:
                        right += 1
                    count += 1
def run(pathImages, method, numpatch, imsample, percentage, codebook, dist, size, fselec, fselec_perc, histnorm, clust, nclusters, rep): ################################################################# # # Initializations and result file configurations # ################################################################# im_dataset_name = pathImages.split('/')[-1] date_time = datetime.datetime.now().strftime('%b-%d-%I%M%p-%G') name_results_file = 'BOC_' + im_dataset_name + '_' + str( numpatch ) + '_' + imsample + '_' + codebook + '_' + str( size ) + '_' + fselec + '_' + histnorm + '_' + clust + '_' + dist + '_' + date_time #dir_results = 'Results_' + im_dataset_name + '_BOC_' + date_time dir_results = 'Results_BOC' if not os.path.exists(dir_results): os.makedirs(dir_results) file_count = 2 file_name = os.path.join(dir_results, name_results_file) while os.path.exists(file_name + ".txt"): file_name = os.path.join(dir_results, name_results_file) + "_" + str(file_count) file_count = file_count + 1 f = open(file_name + ".txt", 'w') ################################################################# # # Get images # ################################################################# #pathImages = '/Users/Mariana/mieec/Tese/Development/ImageDatabases/Graz-01_sample' imList = get_imlist(pathImages) print 'Number of images read = ' + str(len(imList)) f.write("Number of images in dataset read: " + str(len(imList)) + "\n") ################################################################# # # Image description # ################################################################# kp_vector = [] #vector with the keypoints object des_vector = [ ] #vector wih the descriptors (in order to obtain the codebook) number_of_kp = [] #vector with the number of keypoints per image counter = 1 #save current time start_time = time.time() labels = [] class_names = [] #ADDED imPaths = [] #number of divisions of the image div = int(np.sqrt(numpatch)) n_images = 0 #detect the keypoints and compute the sift descriptors for each image for im in imList: if 'DS_Store' not in im: #ADDED imPaths.append(im) print 'image: ' + str(im) + ' number: ' + str(counter) #read image img = cv2.imread(im, 1) img_gray = cv2.imread(im, 0) img_lab = cv2.cvtColor(img, cv.CV_BGR2Lab) height, width, comp = img_lab.shape h_region = height / div w_region = width / div des = [] for i in range(0, div): for j in range(0, div): #mask mask = np.zeros(img_gray.shape, dtype=np.uint8) mask[i * h_region:(i + 1) * h_region, j * w_region:(j + 1) * w_region] = 1 hist = cv2.calcHist([img_lab], [0, 1, 2], mask, [256, 256, 256], [0, 256, 0, 256, 0, 256]) max_color_l, max_color_a, max_color_b = np.where( hist == np.max(hist)) des.append( [max_color_l[0], max_color_a[0], max_color_b[0]]) number_of_kp.append(div * div) if counter == 1: des_vector = des else: des_vector = np.concatenate((des_vector, des), axis=0) counter += 1 #for evaluation name1 = im.split("/")[-1] name = name1.split("_")[0] if name in class_names: index = class_names.index(name) labels.append(index) else: class_names.append(name) index = class_names.index(name) labels.append(index) n_images = n_images + 1 #measure the time to compute the description of each image (divide time elapsed by # of images) elapsed_time = (time.time() - start_time) / len(imList) print 'Time to compute detector and descriptor for each image = ' + str( elapsed_time) f.write( 'Average time to compute detector and descriptor for each image = ' + str(elapsed_time) + '\n') average_words = sum(number_of_kp) / float(len(number_of_kp)) 
print 'Total number of features = ' + str(len(des_vector)) f.write('Total number of features obtained = ' + str(len(des_vector)) + '\n') print 'Average number of keypoints per image = ' + str(average_words) f.write('Average number of keypoints per image = ' + str(average_words) + '\n') ################################################################# # # Image and Keypoint sampling # ################################################################# rand_indexes = [] nmi_indexes = [] for iteraction in range(0, rep): print "\nIteraction #" + str(iteraction + 1) + '\n' f.write("\nIteraction #" + str(iteraction + 1) + '\n') print 'Sampling images and keypoints prior to codebook computation...' if imsample != "NONE": sampleKp = sampleKeypoints.SamplingImandKey( n_images, number_of_kp, average_words, percentage) sampleallKp = sampleAllKeypoints.SamplingAllKey(percentage) names_sampling = np.array(["SAMPLEI", "SAMPLEP"]) sample_method = np.array([sampleKp, sampleallKp]) #Get the sampling method passed in the -g argument index = np.where(names_sampling == imsample)[0] if index.size > 0: sampling_to_use = sample_method[index[0]] else: print 'Wrong sampling method passed in the -g argument. Options: NONE, SAMPLEI, SAMPLEP' sys.exit() #FOR RESULTS FILE sampling_to_use.writeFile(f) des_vector_sampled = sampling_to_use.sampleKeypoints(des_vector) print 'Total number of features after sampling = ' + str( len(des_vector_sampled)) f.write('Total number of features after sampling = ' + str(len(des_vector_sampled)) + '\n') print 'Images and keypoints sampled...' else: print 'No sampling method chosen' #FOR RESULTS FILE f.write( "No method of keypoint sampling chosen. Use all keypoints for codebook construction \n" ) des_vector_sampled = des_vector ################################################################# # # Codebook computation # ################################################################# print 'Obtaining codebook...' #save current time start_time = time.time() #Get detector classes codebook_kmeans = KMeans1.KMeans1(size) codebook_birch = Birch.Birch(size) codebook_minibatch = minibatch.MiniBatch(size) codebook_randomv = randomSamplesBook.RandomVectors(size) codebook_allrandom = allrandom.AllRandom(size) names_codebook = np.array( ["KMEANS", "BIRCH", "MINIBATCH", "RANDOMV", "RANDOM"]) codebook_algorithm = np.array([ codebook_kmeans, codebook_birch, codebook_minibatch, codebook_randomv, codebook_allrandom ]) #Get the codebook algorithm passed in the -c argument index = np.where(names_codebook == codebook)[0] if index.size > 0: codebook_to_use = codebook_algorithm[index[0]] else: print 'Wrong codebook construction algorithm name passed in the -c argument. Options: KMEANS, MINIBATCH, RANDOMV and RANDOM' sys.exit() #FOR RESULTS FILE codebook_to_use.writeFileCodebook(f) #Get centers and projections using codebook algorithm ceters, projections = codebook_to_use.obtainCodebook( des_vector_sampled, des_vector) elapsed_time = (time.time() - start_time) print 'Time to compute codebook = ' + str(elapsed_time) f.write('Time to compute codebook = ' + str(elapsed_time) + '\n') ################################################################# # # Obtain Histogram # ################################################################# print 'Obtaining histograms...' 
#print 'projection shape = '+ str(projections.shape) #print 'size = ' + str(size) #print 'n of images = ' + str(n_images) #print 'number of kp' + str(number_of_kp) hist = histogram.computeHist(projections, size, n_images, number_of_kp) print hist print 'Histograms obtained' ################################################################ # # Feature selection # ################################################################# print 'Number of visual words = ' + str(len(hist[0])) if fselec != "NONE": print 'Applying feature selection to descriptors...' filter_max = filterMax.WordFilterMax(fselec_perc[0]) filter_min = filterMin.WordFilterMin(fselec_perc[1]) filter_maxmin = filterMaxMin.WordFilterMaxMin( fselec_perc[0], fselec_perc[1]) names_filter = np.array(["FMAX", "FMIN", "FMAXMIN"]) filter_method = np.array([filter_max, filter_min, filter_maxmin]) #Get the feature selection method passed in the -f argument index = np.where(names_filter == fselec)[0] if index.size > 0: filter_to_use = filter_method[index[0]] else: print 'Wrong codebook construction algorithm name passed in the -f argument. Options: NONE, FMAX, FMIN, FMAXMIN' sys.exit() hist = filter_to_use.applyFilter(hist, size, n_images) #FOR RESULTS FILE filter_to_use.writeFile(f) new_size = hist.shape[1] print 'Visual words Filtered' print 'Number of visual words filtered = ' + str(size - new_size) f.write("Number of visual words filtered = " + str(size - new_size) + '\n') print 'Final number of visual words = ' + str(new_size) f.write('Final number of visual words = ' + str(new_size) + '\n') else: #FOR RESULTS FILE filter_min = filterMin.WordFilterMin(0) hist = filter_min.applyFilter(hist, size, n_images) new_size = hist.shape[1] print 'Number of visual words filtered = ' + str(size - new_size) f.write("No feature selection applied \n") ################################################################# # # Histogram Normalization # ################################################################# if histnorm != "NONE": #Get detector classes norm_sbin = simpleBinarization.SimpleBi() norm_tfnorm = tfnorm.Tfnorm() norm_tfidf = tfidf.TfIdf() norm_tfidf2 = tfidf2.TfIdf2() norm_tfidfnorm = tfidfnorm.TfIdfnorm() norm_okapi = okapi.Okapi(average_words) norm_power = powerNorm.PowerNorm() names_normalization = np.array([ "SBIN", "TFNORM", "TFIDF", "TFIDF2", "TFIDFNORM", "OKAPI", "POWER" ]) normalization_method = np.array([ norm_sbin, norm_tfnorm, norm_tfidf, norm_tfidf2, norm_tfidfnorm, norm_okapi, norm_power ]) #Get the detector passed in the -h argument index = np.where(names_normalization == histnorm)[0] if index.size > 0: normalization_to_use = normalization_method[index[0]] new_hist = normalization_to_use.normalizeHist( hist, new_size, n_images) else: print 'Wrong normalization name passed in the -h argument. 
Options: SBIN, TFNORM, TFIDF and TFIDF2' sys.exit() #FOR RESULTS FILE normalization_to_use.writeFile(f) else: #FOR RESULTS FILE f.write("No histogram normalization applied\n") new_hist = hist ################################################################# # # Clustering of the features # ################################################################# #save current time start_time = time.time() #Get detector classes clust_dbscan = Dbscan.Dbscan(dist) clust_kmeans = KMeans1.KMeans1([nclusters]) clust_birch = Birch.Birch(nclusters) clust_meanSift = meanSift.MeanSift(nclusters) clust_hierar1 = hierarchicalClustering.Hierarchical(nclusters, dist) clust_hierar2 = hierarchicalClustScipy.HierarchicalScipy(dist) clust_community = communityDetection.CommunityDetection(dist) names_clustering = np.array([ "DBSCAN", "KMEANS", "BIRCH", "MEANSIFT", "HIERAR1", "HIERAR2", "COMM" ]) clustering_algorithm = np.array([ clust_dbscan, clust_kmeans, clust_birch, clust_meanSift, clust_hierar1, clust_hierar2, clust_community ]) #Get the detector passed in the -a argument index = np.where(names_clustering == clust)[0] if index.size > 0: clustering_to_use = clustering_algorithm[index[0]] else: print 'Wrong clustering algorithm name passed in the -a argument. Options: DBSCAN, KMEANS, BIRCH, MEANSIFT, HIERAR1, HIERAR2, COMM' sys.exit() clusters = clustering_to_use.obtainClusters(new_hist) #FOR RESULTS FILE clustering_to_use.writeFileCluster(f) elapsed_time = (time.time() - start_time) print 'Time to run clustering algorithm = ' + str(elapsed_time) f.write('Time to run clustering algorithm = ' + str(elapsed_time) + '\n') print 'Number of clusters obtained = ' + str(max(clusters) + 1) f.write('Number of clusters obtained = ' + str(max(clusters) + 1) + '\n') nclusters = max(clusters) + 1 print 'Clusters obtained = ' + str(np.asarray(clusters)) #date_time = datetime.datetime.now().strftime('%b-%d-%I%M%p-%G') #np.savetxt('saveClusters_'+date_time+'_.txt', clusters, '%i', ',') #ADDED ################################################################# # # Create folder with central images for each cluster # ################################################################# #obtain representative images for each cluster central_ims = clust_community.obtainCenteralImages(new_hist, clusters) central_folder = os.path.join(dir_results, 'CenterImages') if not os.path.exists(central_folder): os.makedirs(central_folder) count = 0 for central_im in central_ims: filename = os.path.join(central_folder, 'Cluster_' + str(count) + '.jpg') img = cv2.imread(imPaths[central_im], 1) cv2.imwrite(filename, img) count = count + 1 #ADDED ################################################################# # # Separate Clusters into folders # ################################################################# clusters_folder = os.path.join(dir_results, 'Clusters') if not os.path.exists(clusters_folder): os.makedirs(clusters_folder) clust_dir = [] for iclust in range(0, nclusters): direc = os.path.join(clusters_folder, 'Cluster_' + str(iclust)) if not os.path.exists(direc): os.makedirs(direc) clust_dir.append(direc) for im in range(0, len(imPaths)): im_name = imPaths[im].split('/')[-1] #print clust_dir[int(clusters[im])] filename = os.path.join(clust_dir[int(clusters[im])], im_name) #print filename img = cv2.imread(imPaths[im], 1) cv2.imwrite(filename, img) ################################################################# # # Evaluation # ################################################################# users = 0 if users == 1: rand_index = 
evaluationUsers.randIndex(clusters) rand_indexes.append(rand_index) print 'rand_index = ' + str(rand_index) f.write("Rand Index = " + str(rand_index) + "\n") else: if len(clusters) == len(labels): f.write("\nResults\n") f.write('Clusters Obtained = ' + str(np.asarray(clusters))) f.write('Labels = ' + str(np.asarray(labels))) rand_index = metrics.adjusted_rand_score(labels, clusters) rand_indexes.append(rand_index) print 'rand_index = ' + str(rand_index) f.write("Rand Index = " + str(rand_index) + "\n") NMI_index = metrics.normalized_mutual_info_score( labels, clusters) nmi_indexes.append(NMI_index) print 'NMI_index = ' + str(NMI_index) f.write("NMI Index = " + str(NMI_index) + "\n") if rep > 1: f.write("\nFINAL RESULTS\n") f.write("Avg Rand Index = " + str(float(sum(rand_indexes)) / rep) + "\n") f.write("Std Rand Index = " + str(statistics.stdev(rand_indexes)) + "\n") if users != 1: f.write("Avg NMI Index = " + str(float(sum(nmi_indexes)) / rep) + "\n") f.write("Std NMI Index = " + str(statistics.stdev(nmi_indexes)) + "\n") f.close()
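# The image-clustering pipelines above select tfidf.TfIdf() as one of several
# histogram normalizations via normalizeHist(hist, size, n_images). A hedged
# sketch of standard tf-idf weighting for a bag-of-visual-words histogram
# matrix follows; it is an assumption based on the usual definition, not the
# project's actual implementation.
import numpy as np

def normalize_hist_tfidf(hist, size, n_images):
    hist = np.asarray(hist, dtype=float)                      # n_images x size
    tf = hist / np.maximum(hist.sum(axis=1, keepdims=True), 1.0)
    doc_freq = np.maximum((hist > 0).sum(axis=0), 1)          # images per word
    idf = np.log(float(n_images) / doc_freq)
    return tf * idf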
import os
import sys

import tfidf
from helper_functions import curr_test_dict

paramDict = curr_test_dict("curr_test.txt")
path = '../TLG_idf_files/' + paramDict['num_grams'] + 'grams' + '/' + \
    paramDict['spread'] + "_" + "v" + paramDict['variant'] + "_" + \
    "sw" + paramDict['stopwords'] + "/"
if not os.path.exists(path):
    os.makedirs(path)
TLG_file = os.readlink(sys.argv[1])
TLG_file = TLG_file.split('/')
TLG_file = TLG_file[-1]
if not os.path.exists(path + TLG_file):
    print TLG_file
    if paramDict['variant'] == 'True':
        variant = True
    else:
        variant = False
    _tfidf = tfidf.TfIdf(int(paramDict['spread']), variant, None, None,
                         paramDict['stopword_file'])
    _tfidf.add_input_document('../stripped_text/' + TLG_file,
                              int(paramDict['num_grams']))
    _tfidf.save_corpus_to_file(path + TLG_file)
else:
    print 'path already exists for ', TLG_file
def __init__(self, k, n_d, n_w):
    self.k = k or PseudoRelevanceFeedback._K
    self.n_d = n_d or PseudoRelevanceFeedback._N_D
    self.n_w = n_w or PseudoRelevanceFeedback._N_W
    self.tf_idf = tfidf.TfIdf(self.k)
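# The `or` fallbacks above assume class-level defaults (_K, _N_D, _N_W) that
# are not part of this excerpt; a hedged sketch of how the enclosing class
# might declare them (the numeric values are assumptions):
class PseudoRelevanceFeedback(object):
    _K = 10    # assumed default
    _N_D = 5   # assumed default
    _N_W = 20  # assumed default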
def run(pathImages, method, keypnt, numpatch, equalnum, imdes, imsample, percentage, codebook, dist, size, fselec, fselec_perc, histnorm, clust, K, pca, nclusters, rep): ################################################################# # # Initializations and result file configurations # ################################################################# im_dataset_name = pathImages.split('/')[-1] date_time = datetime.datetime.now().strftime('%b-%d-%I%M%p-%G') name_results_file = 'BOF_' + im_dataset_name + '_' + keypnt + '_' + str( numpatch ) + '_' + str( equalnum ) + '_' + imdes + '_' + imsample + '_' + codebook + '_' + str( size ) + '_' + fselec + '_' + histnorm + '_' + clust + '_' + dist + '_' + date_time #dir_results = 'Results_' + im_dataset_name + '_BOF_' + date_time dir_results = 'Results_BOF' if not os.path.exists(dir_results): os.makedirs(dir_results) file_count = 2 file_name = os.path.join(dir_results, name_results_file) while os.path.exists(file_name + ".txt"): file_name = os.path.join(dir_results, name_results_file) + "_" + str(file_count) file_count = file_count + 1 f = open(file_name + ".txt", 'w') ################################################################# # # Get images # ################################################################# #pathImages = '/Users/Mariana/mieec/Tese/Development/ImageDatabases/Graz-01_sample' imList = get_imlist(pathImages) print 'Number of images read = ' + str(len(imList)) f.write("Number of images in dataset read: " + str(len(imList)) + "\n") ################################################################# # # Image description # ################################################################# #Get detector classes det_sift = siftLib.Sift(numpatch, equalnum) det_surf = surfLib.Surf(numpatch, equalnum) det_fast = fastDetector.Fast(numpatch, equalnum) det_star = starDetector.Star(numpatch, equalnum) det_orb = orbLib.Orb(numpatch, equalnum) det_random = randomDetector.Random(numpatch) names_detectors = np.array( ["SIFT", "SURF", "FAST", "STAR", "ORB", "RANDOM"]) detectors = np.array( [det_sift, det_surf, det_fast, det_star, det_orb, det_random]) #Get the detector passed in the -k argument index = np.where(names_detectors == keypnt)[0] if index.size > 0: detector_to_use = detectors[index[0]] else: print 'Wrong detector name passed in the -k argument. Options: SIFT, SURF, FAST, STAR, ORB and RANDOM' sys.exit() #FOR RESULTS FILE detector_to_use.writeParametersDet(f) #Get descriptor classes des_sift = siftLib.Sift(numpatch, equalnum) des_surf = surfLib.Surf(numpatch, equalnum) des_orb = orbLib.Orb(numpatch) des_brief = briefDescriptor.Brief() des_freak = freakDescriptor.Freak() names_descriptors = np.array(["SIFT", "SURF", "ORB", "BRIEF", "FREAK"]) descriptors = np.array([des_sift, des_surf, des_orb, des_brief, des_freak]) #Get the detector passed in the -d argument index = np.where(names_descriptors == imdes)[0] if index.size > 0: descriptor_to_use = descriptors[index[0]] else: print 'Wrong descriptor name passed in the -d argument. 
Options: SIFT, SURF, ORB, BRIEF and FREAK' sys.exit() #FOR RESULTS FILE descriptor_to_use.writeParametersDes(f) kp_vector = [] #vector with the keypoints object des_vector = [ ] #vector wih the descriptors (in order to obtain the codebook) number_of_kp = [] #vector with the number of keypoints per image counter = 1 #save current time start_time = time.time() labels = [] class_names = [] #ADDED imPaths = [] #detect the keypoints and compute the sift descriptors for each image for im in imList: if 'DS_Store' not in im: #ADDED imPaths.append(im) print 'image: ' + str(im) + ' number: ' + str(counter) #read image img = cv2.imread(im, 0) #mask in order to avoid keypoints in border of image. size = 40 pixels border = 40 height, width = img.shape mask = np.zeros(img.shape, dtype=np.uint8) mask[border:height - border, border:width - border] = 1 #get keypoints from detector kp = detector_to_use.detectKp(img, mask) #get features from descriptor des = descriptor_to_use.computeDes(img, kp) number_of_kp.append(len(kp)) kp_vector.append(kp) if counter == 1: des_vector = des else: des_vector = np.concatenate((des_vector, des), axis=0) counter += 1 #for evaluation name1 = im.split("/")[-1] name = name1.split("_")[0] if name in class_names: index = class_names.index(name) labels.append(index) else: class_names.append(name) index = class_names.index(name) labels.append(index) #measure the time to compute the description of each image (divide time elapsed by # of images) elapsed_time = (time.time() - start_time) / len(imList) print 'Time to compute detector and descriptor for each image = ' + str( elapsed_time) f.write( 'Average time to compute detector and descriptor for each image = ' + str(elapsed_time) + '\n') n_images = len(kp_vector) average_words = sum(number_of_kp) / float(len(number_of_kp)) print 'Total number of features = ' + str(len(des_vector)) f.write('Total number of features obtained = ' + str(len(des_vector)) + '\n') print 'Average number of keypoints per image = ' + str(average_words) f.write('Average number of keypoints per image = ' + str(average_words) + '\n') ################################################################# # # Dimentionality reduction # ################################################################# if pca != None: start_time = time.time() print 'Applying PCA...' pca = PCA(n_components=pca) descriptors_reduced = pca.fit(des_vector).transform(des_vector) print 'PCA Applied.' print 'time to apply PCA = ' + str(time.time() - start_time) des_vector = descriptors_reduced ################################################################# # # Image and Keypoint sampling # ################################################################# rand_indexes = [] nmi_indexes = [] for iteraction in range(0, rep): print "\nIteraction #" + str(iteraction + 1) + '\n' f.write("\nIteraction #" + str(iteraction + 1) + '\n') print 'Sampling images and keypoints prior to codebook computation...' if imsample != "NONE": sampleKp = sampleKeypoints.SamplingImandKey( n_images, number_of_kp, average_words, percentage) sampleallKp = sampleAllKeypoints.SamplingAllKey(percentage) names_sampling = np.array(["SAMPLEI", "SAMPLEP"]) sample_method = np.array([sampleKp, sampleallKp]) #Get the detector passed in the -g argument index = np.where(names_sampling == imsample)[0] if index.size > 0: sampling_to_use = sample_method[index[0]] else: print 'Wrong sampling method passed in the -g argument. 
Options: NONE, SAMPLEI, SAMPLEP' sys.exit() #FOR RESULTS FILE sampling_to_use.writeFile(f) des_vector_sampled = sampling_to_use.sampleKeypoints(des_vector) print 'Total number of features after sampling = ' + str( len(des_vector_sampled)) f.write('Total number of features after sampling = ' + str(len(des_vector_sampled)) + '\n') print 'Images and keypoints sampled...' else: print 'No sampling method chosen' #FOR RESULTS FILE f.write( "No method of keypoint sampling chosen. Use all keypoints for codebook construction \n" ) des_vector_sampled = des_vector ################################################################# # # Codebook computation # ################################################################# print 'Obtaining codebook...' #save current time start_time = time.time() #Get detector classes codebook_kmeans = KMeans1.KMeans1(size) codebook_birch = Birch.Birch(size) codebook_minibatch = minibatch.MiniBatch(size) codebook_randomv = randomSamplesBook.RandomVectors(size) codebook_allrandom = allrandom.AllRandom(size) names_codebook = np.array( ["KMEANS", "BIRCH", "MINIBATCH", "RANDOMV", "RANDOM"]) codebook_algorithm = np.array([ codebook_kmeans, codebook_birch, codebook_minibatch, codebook_randomv, codebook_allrandom ]) #Get the detector passed in the -c argument index = np.where(names_codebook == codebook)[0] if index.size > 0: codebook_to_use = codebook_algorithm[index[0]] else: print 'Wrong codebook construction algorithm name passed in the -c argument. Options: KMEANS, MINIBATCH, RANDOMV and RANDOM' sys.exit() #FOR RESULTS FILE codebook_to_use.writeFileCodebook(f) #Get centers and projections using codebook algorithm centers, projections = codebook_to_use.obtainCodebook( des_vector_sampled, des_vector) #compute the number of unique descriptor vectors codebook_randomv.unique_vectors(centers) elapsed_time = (time.time() - start_time) print 'Time to compute codebook = ' + str(elapsed_time) f.write('Time to compute codebook = ' + str(elapsed_time) + '\n') ################################################################# # # Obtain Histogram # ################################################################# print 'Obtaining histograms...' #print 'projection shape = '+ str(projections.shape) #print 'size = ' + str(size) #print 'n of images = ' + str(n_images) #print 'number of kp' + str(number_of_kp) hist = histogram.computeHist(projections, size, n_images, number_of_kp) #print hist print 'Histograms obtained' ################################################################ # # Feature selection # ################################################################# print 'Number of visual words = ' + str(len(hist[0])) if fselec != "NONE": print 'Applying feature selection to descriptors...' filter_max = filterMax.WordFilterMax(fselec_perc[0]) filter_min = filterMin.WordFilterMin(fselec_perc[1]) filter_maxmin = filterMaxMin.WordFilterMaxMin( fselec_perc[0], fselec_perc[1]) names_filter = np.array(["FMAX", "FMIN", "FMAXMIN"]) filter_method = np.array([filter_max, filter_min, filter_maxmin]) #Get the detector passed in the -f argument index = np.where(names_filter == fselec)[0] if index.size > 0: filter_to_use = filter_method[index[0]] else: print 'Wrong codebook construction algorithm name passed in the -f argument. 
        hist = filter_to_use.applyFilter(hist, size, n_images)

        #FOR RESULTS FILE
        filter_to_use.writeFile(f)

        new_size = hist.shape[1]
        print 'Visual words filtered'
        print 'Number of visual words filtered = ' + str(size - new_size)
        f.write("Number of visual words filtered = " + str(size - new_size) + '\n')
        print 'Final number of visual words = ' + str(new_size)
        f.write('Final number of visual words = ' + str(new_size) + '\n')
    else:
        #FOR RESULTS FILE
        filter_min = filterMin.WordFilterMin(0)
        hist = filter_min.applyFilter(hist, size, n_images)
        new_size = hist.shape[1]
        print 'Number of visual words filtered = ' + str(size - new_size)
        f.write("No feature selection applied \n")

    #################################################################
    #
    # Histogram normalization
    #
    #################################################################
    if histnorm != "NONE":
        #Get normalization classes (norm_tfidf3 and norm_power are instantiated but not selectable below)
        norm_sbin = simpleBinarization.SimpleBi()
        norm_tfnorm = tfnorm.Tfnorm()
        norm_tfidf = tfidf.TfIdf()
        norm_tfidf2 = tfidf2.TfIdf2()
        norm_tfidf3 = tfidf3.Tfidf3()
        norm_power = powerNorm.PowerNorm()
        norm_tfidfnorm = tfidfnorm.TfIdfnorm()
        norm_okapi = okapi.Okapi(average_words)
        names_normalization = np.array(["SBIN", "TFNORM", "TFIDF", "TFIDF2", "TFIDFNORM", "OKAPI"])
        normalization_method = np.array([norm_sbin, norm_tfnorm, norm_tfidf, norm_tfidf2, norm_tfidfnorm, norm_okapi])

        #Get the normalization method passed in the -h argument
        index = np.where(names_normalization == histnorm)[0]
        if index.size > 0:
            normalization_to_use = normalization_method[index[0]]
            new_hist = normalization_to_use.normalizeHist(hist, new_size, n_images)
        else:
            print 'Wrong normalization name passed in the -h argument. Options: SBIN, TFNORM, TFIDF, TFIDF2, TFIDFNORM and OKAPI'
            sys.exit()

        #FOR RESULTS FILE
        normalization_to_use.writeFile(f)
    else:
        #FOR RESULTS FILE
        f.write("No histogram normalization applied\n")
        new_hist = hist

    #################################################################
    #
    # Clustering of the features
    #
    #################################################################
    #save current time
    start_time = time.time()

    #Get clustering classes
    clust_dbscan = Dbscan.Dbscan(dist)
    clust_kmeans = KMeans1.KMeans1([nclusters])
    clust_kmeans2 = kmeans2.KMeans2([nclusters])
    clust_birch = Birch.Birch(nclusters)
    clust_meanSift = meanSift.MeanSift(nclusters)
    clust_hierar1 = hierarchicalClustering.Hierarchical(nclusters, dist)
    clust_hierar2 = hierarchicalClustScipy.HierarchicalScipy(dist)
    clust_community = communityDetection.CommunityDetection(dist)
    names_clustering = np.array(["DBSCAN", "KMEANS", "BIRCH", "MEANSIFT", "HIERAR1", "HIERAR2", "COMM"])
    clustering_algorithm = np.array([clust_dbscan, clust_kmeans, clust_birch, clust_meanSift, clust_hierar1, clust_hierar2, clust_community])

    #Get the clustering algorithm passed in the -a argument
    index = np.where(names_clustering == clust)[0]
    if index.size > 0:
        clustering_to_use = clustering_algorithm[index[0]]
    else:
        print 'Wrong clustering algorithm name passed in the -a argument. Options: DBSCAN, KMEANS, BIRCH, MEANSIFT, HIERAR1, HIERAR2, COMM'
        sys.exit()
    clusters = clustering_to_use.obtainClusters(new_hist)

    #FOR RESULTS FILE
    clustering_to_use.writeFileCluster(f)

    elapsed_time = (time.time() - start_time)
    print 'Time to run clustering algorithm = ' + str(elapsed_time)
    f.write('Time to run clustering algorithm = ' + str(elapsed_time) + '\n')

    #ADDED
    nclusters = int(max(clusters) + 1)

    print 'Number of clusters obtained = ' + str(max(clusters) + 1)
    f.write('Number of clusters obtained = ' + str(max(clusters) + 1) + '\n')
    print 'Clusters obtained = ' + str(np.asarray(clusters))
    #date_time = datetime.datetime.now().strftime('%b-%d-%I%M%p-%G')
    #np.savetxt('saveClusters_' + date_time + '_.txt', clusters, '%i', ',')

    #ADDED
    #################################################################
    #
    # Create folder with central images for each cluster
    #
    #################################################################
    ##obtain representative images for each cluster
    #central_ims = clust_community.obtainCenteralImages(new_hist, clusters)
    #central_folder = os.path.join(dir_results, 'CenterImages')
    #if not os.path.exists(central_folder):
    #    os.makedirs(central_folder)
    #count = 0
    #for central_im in central_ims:
    #    filename = os.path.join(central_folder, 'Cluster_' + str(count) + '.jpg')
    #    img = cv2.imread(imPaths[central_im], 1)
    #    cv2.imwrite(filename, img)
    #    count = count + 1

    ##ADDED
    ##################################################################
    ##
    ## Separate clusters into folders
    ##
    ##################################################################
    #clusters_folder = os.path.join(dir_results, 'Clusters')
    #if not os.path.exists(clusters_folder):
    #    os.makedirs(clusters_folder)
    #clust_dir = []
    #for iclust in range(0, nclusters):
    #    direc = os.path.join(clusters_folder, 'Cluster_' + str(iclust))
    #    if not os.path.exists(direc):
    #        os.makedirs(direc)
    #    clust_dir.append(direc)
    #for im in range(0, len(imPaths)):
    #    im_name = imPaths[im].split('/')[-1]
    #    #print clust_dir[int(clusters[im])]
    #    filename = os.path.join(clust_dir[int(clusters[im])], im_name)
    #    #print filename
    #    img = cv2.imread(imPaths[im], 1)
    #    cv2.imwrite(filename, img)

    ##calculate distances between images and closest images
    #closest_im = distances.calculateClosest(new_hist, dist)
    ##print closest_im
    #if not os.path.exists('ClosestImages'):
    #    os.makedirs('ClosestImages')
    #file_name = os.path.join('ClosestImages', name_results_file)
    #f2 = open(file_name + ".txt", 'w')
    #counter = 0
    #counter2 = 1
    #for ims in closest_im:
    #    for im in ims:
    #        f2.write(str(counter2) + '-' + str(counter) + '-' + str(im) + '\n')
    #        counter2 = counter2 + 1
    #    counter = counter + 1
    #f2.close()

    #################################################################
    #
    # Evaluation
    #
    #################################################################
    users = 0
    #labels = np.load('IndividualClustersMatrix.npy')
    if users == 1:
        rand_index = evaluationUsers.randIndex(clusters)
        rand_indexes.append(rand_index)
        print 'rand_index = ' + str(rand_index)
        f.write("Rand Index = " + str(rand_index) + "\n")
    else:
        if len(clusters) == len(labels):
            f.write("\nResults\n")
            f.write('Clusters Obtained = ' + str(np.asarray(clusters)))
            f.write('Labels = ' + str(np.asarray(labels)))
            rand_index = metrics.adjusted_rand_score(labels, clusters)
            rand_indexes.append(rand_index)
            print 'rand_index = ' + str(rand_index)
            f.write("Rand Index = " + str(rand_index) + "\n")
            NMI_index = metrics.normalized_mutual_info_score(labels, clusters)
            nmi_indexes.append(NMI_index)
            print 'NMI_index = ' + str(NMI_index)
            f.write("NMI Index = " + str(NMI_index) + "\n")
if rep > 1:
    f.write("\nFINAL RESULTS\n")
    f.write("Avg Rand Index = " + str(float(sum(rand_indexes)) / rep) + "\n")
    f.write("Std Rand Index = " + str(statistics.stdev(rand_indexes)) + "\n")
    if users != 1:
        f.write("Avg NMI Index = " + str(float(sum(nmi_indexes)) / rep) + "\n")
        f.write("Std NMI Index = " + str(statistics.stdev(nmi_indexes)) + "\n")

f.close()
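#------------------------------------------------------------------
# Illustrative sketch only (not part of the pipeline above): how the
# per-image visual-word histograms and the tf-idf weighting step
# could look with plain numpy. The pipeline itself relies on the
# project modules histogram.computeHist and tfidf.TfIdf, whose
# internals are not shown here, so the exact behaviour may differ;
# compute_bovw_histograms and tfidf_weight below are hypothetical
# helper names used for the sketch.
#------------------------------------------------------------------
import numpy as np


def compute_bovw_histograms(projections, size, number_of_kp):
    """Build one visual-word histogram per image.

    projections  : 1-D int array with the codebook index of every keypoint,
                   images concatenated in order
    size         : number of visual words in the codebook
    number_of_kp : list with the number of keypoints of each image
    """
    hist = np.zeros((len(number_of_kp), size))
    start = 0
    for i, n_kp in enumerate(number_of_kp):
        words = projections[start:start + n_kp]
        hist[i] = np.bincount(words, minlength=size)  # count occurrences of each visual word
        start += n_kp
    return hist


def tfidf_weight(hist):
    """Apply a standard tf-idf weighting to a histogram matrix (images x words)."""
    n_images = hist.shape[0]
    tf = hist / np.maximum(hist.sum(axis=1, keepdims=True), 1.0)  # term frequency per image
    df = np.count_nonzero(hist, axis=0)                           # document frequency per word
    idf = np.log(float(n_images) / np.maximum(df, 1))             # inverse document frequency
    return tf * idf


# tiny usage example with made-up data: 2 images, 4 visual words
if __name__ == '__main__':
    fake_projections = np.array([0, 0, 1, 3, 3, 3, 2])
    fake_number_of_kp = [3, 4]
    h = compute_bovw_histograms(fake_projections, 4, fake_number_of_kp)
    print(tfidf_weight(h))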
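#------------------------------------------------------------------
# Illustrative sketch only: the evaluation step above compares the
# obtained clusters with the class labels derived from the file names
# using sklearn's adjusted Rand index and normalized mutual
# information. The label/cluster values below are made up just to
# show the call signatures.
#------------------------------------------------------------------
from sklearn import metrics

example_labels = [0, 0, 1, 1, 2, 2]    # ground-truth class per image (hypothetical)
example_clusters = [1, 1, 0, 0, 2, 2]  # cluster assigned to each image (hypothetical)

print(metrics.adjusted_rand_score(example_labels, example_clusters))           # 1.0: same partition up to label renaming
print(metrics.normalized_mutual_info_score(example_labels, example_clusters))  # 1.0 as well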
new_size = hist.shape[1]
print 'Number of visual words filtered = ' + str(size - new_size)
f.write("No feature selection applied \n")

#################################################################
#
# Histogram normalization
#
#################################################################
if histnorm != "NONE":
    #Get normalization classes
    norm_sbin = simpleBinarization.SimpleBi()
    norm_tfnorm = tfnorm.Tfnorm()
    norm_tfidf = tfidf.TfIdf()
    norm_tfidf2 = tfidf2.TfIdf2()
    norm_tfidfnorm = tfidfnorm.TfIdfnorm()
    norm_okapi = okapi.Okapi(average_words)
    names_normalization = np.array(["SBIN", "TFNORM", "TFIDF", "TFIDF2", "TFIDFNORM", "OKAPI"])
    normalization_method = np.array([norm_sbin, norm_tfnorm, norm_tfidf, norm_tfidf2, norm_tfidfnorm, norm_okapi])

    #Get the normalization method passed in the -h argument
    index = np.where(names_normalization == histnorm)[0]
    if index.size > 0:
        normalization_to_use = normalization_method[index[0]]