def __init__(self, title):
    self.title = title
    self.content = ""
    self.summary = ""
    self.contentTfIdf = {}
    self.summryTfIdf = {}
    self.summaryhyperlinks = []  # (word: link)
    self.hyperlinks = []
    self.SeeAlso = []
    self.Categories = []
    self.PrunedCategories = []
    self.successful = True
    if title not in variable.allTfIdf.keys():
        print("Title: " + title)
        Content = givePrunedContent(title, "NULL")
        if Content == "NULL":
            self.successful = False
        else:
            self.contentTfIdf = tfidf.TfIdf(Content)
            summary = giveSummary(title, "NULL")
            self.summryTfIdf = tfidf.TfIdf(summary)
            self.content = Content
            self.summary = summary
            variable.allTfIdf[title] = (self.contentTfIdf, self.summryTfIdf)
            variable.Allcontent[title] = (self.content, self.summary)
    else:
        self.contentTfIdf = variable.allTfIdf[title][0]
        self.summryTfIdf = variable.allTfIdf[title][1]
        self.content = variable.Allcontent[title][0]
        self.summary = variable.Allcontent[title][1]
def tf_idf(self, document):
    """Return the tf-idf scores for the ngrams of the document.

    num_grams specifies the level of grams: words, bigrams, trigrams, etc.
    The function creates an idf corpus for the subcorpora. If the document is
    not in the subcorpora, it is also added to the idf corpus, so that each
    word appears in at least one document. Stopwords are optional; if
    stopwords is True, they are generated from the
    stopword_percentage_threshold parameter.
    """
    path = '../stripped_text/'
    idf_path = "../" + str(self.num_grams) + "grams/" + self.subcorpora + \
        "_v_" + str(self.vary_defn) + "_sw_" + str(self.stopwords) + \
        "_s_" + str(self.spread) + ".txt"
    if os.path.exists(idf_path):
        # idf corpus already exists
        print "idf corpus exists."
        # create tfidf object with existing corpus
        _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, idf_path,
                             self.idf_dict, self.stopword_file)
        # determine the document's TLG#### filename
        filedict = file_dict('../ref_file.txt')
        docFilename = filedict[document][0]
    else:
        # idf corpus not yet in existence
        print "creating idf corpus."
        docFilename = self.add_docs(document)
        _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, None,
                             self.idf_dict, self.stopword_file)
    # actually determine the tf-idf scores for the ngrams in the document
    tfidf_list = _tfidf.get_doc_keywords(path + docFilename, self.num_grams)
    # print tfidf scores to .txt and .csv
    print "print tfidf scores to .txt and .csv"
    if document == self.doc1:
        print "printing doc1 to ", self.doc1_tfidf_file
        print_to_file_by_ngram(tfidf_list, self.doc1_tfidf_file)
        print_to_csv_file(self.doc1_tfidf_file, self.num_grams)
    elif document == self.doc2:
        print "printing doc2 to ", self.doc2_tfidf_file
        print_to_file_by_ngram(tfidf_list, self.doc2_tfidf_file)
        print_to_csv_file(self.doc2_tfidf_file, self.num_grams)
    # this should only happen if it's not already saved
    # save idf corpus for later use:
    # _tfidf.save_corpus_to_file(idf_path)
    return tfidf_list
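# --- Hedged usage sketch (not from the source) --------------------------------
# The enclosing comparison class is not shown above; the class name
# `NgramComparer` and its constructor arguments below are assumptions made only
# to illustrate how tf_idf() might be called for the two documents.
# comparer = NgramComparer(doc1="Homer", doc2="Hesiod", num_grams=2,
#                          subcorpora="epic", vary_defn=False,
#                          stopwords=True, spread=1)
# doc1_scores = comparer.tf_idf(comparer.doc1)  # list of (ngram, score) pairs
# doc2_scores = comparer.tf_idf(comparer.doc2)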
def findSim(keyword, pathcorpus):
    this_path = os.path.split(__file__)[0]
    pathcorpus = os.path.join(this_path, pathcorpus)
    # read and preprocess all corpus articles at once, storing them in a dictionary
    table = tfidf.TfIdf()
    articles = {}
    for item in os.listdir(pathcorpus):
        if item.endswith(".txt"):
            with open(pathcorpus + "/" + item, 'r', encoding="utf-8") as file:
                # articles[item] = lib1.prepro_base(file.read()).split()
                table.add_document(item, lib1.prepro_base(file.read()).split())
    keys = keyword.split()
    result = table.similarities(keys)
    res = []
    # for x, title in result, articles:
    #     if x[1]:
    #         res.append([x[0], (round(x[1], 3) * 100), title.value()])
    for x in result:
        if x[1]:
            with open(pathcorpus + '/' + x[0], 'r', encoding="utf-8") as file:
                res.append([x[0], x[1], file.readline()])
    print(res)
    return res
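# Hedged usage sketch: assuming a "corpus" directory of .txt articles sits next
# to this module (the directory name and the query below are illustrative, not
# taken from the source), findSim() can be driven like this:
if __name__ == "__main__":
    for filename, score, first_line in findSim("machine learning", "corpus"):
        print(filename, round(score, 3), first_line.strip())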
def main():
    clanky = getarticles.getarticles()  # load the articles from RSS
    table = tfidf.TfIdf()  # initialise TfIdf
    for clanek in clanky:
        tokeny = tokenize.tokenize(clanek['text'])  # tokens are the base forms of the words
        clanek['tokeny'] = tokeny
        table.add_document(clanek['url'], tokeny)  # add the article to tfidf for processing
    pocet_souvislosti = 0  # total number of relations found; useful when tuning the threshold
    for clanek in clanky:
        print("------------------")
        print(clanek['nadpis'])
        print("source: " + clanek['url'])
        for podobnost in table.similarities(clanek['tokeny']):
            # podobnost[0] is the article URL, podobnost[1] the similarity score
            if podobnost[1] > THRASH_HOLD and podobnost[0] != clanek['url']:
                # the most similar article is always the article itself
                print("related: " + podobnost[0])
                pocet_souvislosti += 1
    print("------------------")
    print("relations found: ", pocet_souvislosti)
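# The function above references a module-level THRASH_HOLD constant that is not
# part of this excerpt; a minimal hedged sketch of how it might be declared and
# the entry point invoked (the 0.1 value is an assumption, not the source's):
THRASH_HOLD = 0.1

if __name__ == "__main__":
    main()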
def test_read(self):
    '''Test reading in a file as a string'''
    my_t = t.TfIdf()
    result = my_t.read_file('./text-files/a-drinking-song-yeats.txt')
    self.assertEqual(
        result,
        'Wine comes in at the mouth\nAnd love comes in at the eye;\n'
        'That\'s all we shall know for truth\nBefore we grow old and die.\n'
        'I lift the glass to my mouth,\nI look at you, and I sigh.'
    )
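# A minimal sketch of a read_file() that would satisfy the test above. It is an
# assumption, not the project's implementation; stripping the trailing newline
# is guessed from the expected string, which ends without one.
def read_file(self, path):
    """Return the contents of `path` as a single string."""
    with open(path) as handle:
        return handle.read().rstrip("\n")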
def test_similarity(self):
    table = tfidf.TfIdf()
    table.add_document("foo", ["a", "b", "c", "d", "e", "f", "g", "h"])
    table.add_document("bar", ["a", "b", "c", "i", "j", "k"])
    table.add_document("baz", ["k", "l", "m", "n"])
    self.assertEqual(table.similarities(["a", "b", "c"]),
                     [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]])
def test_string_short(self):
    '''Test breaking up a string'''
    my_t = t.TfIdf()
    test_string = '''How now brown Cow? Peter piper.'''
    result = my_t.string_to_list(test_string)
    self.assertEqual(result,
                     ['how', 'now', 'brown', 'cow', 'peter', 'piper'])
def test_line_breaks(self):
    '''Test string with line breaks'''
    my_t = t.TfIdf()
    test_string = ("Wine comes in at the mouth\nAnd love comes in at the eye;\n"
                   "That's all we shall know for truth\nBefore")
    result = my_t.string_to_list(test_string)
    self.assertEqual(result, [
        'wine', 'comes', 'in', 'at', 'the', 'mouth', 'and', 'love', 'comes',
        'in', 'at', 'the', 'eye', "that's", 'all', 'we', 'shall', 'know',
        'for', 'truth', 'before'
    ])
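# A hedged sketch of string_to_list() consistent with the two tests above (an
# assumption, not the project's implementation): lowercase, split on any
# whitespace, and strip surrounding punctuation while keeping internal
# apostrophes such as "that's".
import string

def string_to_list(self, text):
    words = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)
        if token:
            words.append(token)
    return words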
def testGetIdf(self):
    """Test querying the IDF for existent and nonexistent terms."""
    my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt",
                           DEFAULT_IDF=DEFAULT_IDF_UNITTEST)
    # Test querying for nonexistent terms.
    self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("nonexistent"))
    self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("THE"))
    self.assertTrue(my_tfidf.get_idf("a") > my_tfidf.get_idf("the"))
    self.assertAlmostEquals(my_tfidf.get_idf("girl"), my_tfidf.get_idf("moon"))
def create_and_save_idf_corpus(self):
    # should check if path exists yet
    idf_path = "../" + str(self.num_grams) + "grams/" + self.subcorpora + \
        "_v_" + str(self.vary_defn) + "_sw_" + str(self.stopwords) + \
        "_s_" + str(self.spread) + ".txt"
    # compile_idf_dict() should check that each file has been made
    num_docs = self.compile_idf_dict()
    _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, None, self.idf_dict,
                         self.stopword_file)
    _tfidf.set_num_docs(num_docs)
    _tfidf.save_corpus_to_file(idf_path)
def testKeywords(self):
    """Test retrieving keywords from a document, ordered by tf-idf."""
    my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt", DEFAULT_IDF=0.01)
    # Test retrieving keywords when there is only one keyword.
    keywords = my_tfidf.get_doc_keywords("the spoon and the fork")
    self.assertEqual("the", keywords[0][0])
    # Test retrieving multiple keywords.
    keywords = my_tfidf.get_doc_keywords("the girl said hello over the phone")
    self.assertEqual("girl", keywords[0][0])
    self.assertEqual("phone", keywords[1][0])
    self.assertEqual("said", keywords[2][0])
    self.assertEqual("the", keywords[3][0])
def testNoCorpusFiles(self):
    my_tfidf = tfidf.TfIdf(DEFAULT_IDF=DEFAULT_IDF_UNITTEST)
    self.assertEquals(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("moon"))
    self.assertEquals(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
    self.assertEquals(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("said"))
    my_tfidf.add_input_document("moon")
    my_tfidf.add_input_document("moon said hello")
    self.assertEquals(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 1),
                            my_tfidf.get_idf("said"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 2),
                            my_tfidf.get_idf("moon"))
def wiki_query(query):
    query = query.replace(' ', '+')
    query = query.replace(',', '|')
    page_info = ''
    url = ('http://en.wikipedia.org/w/api.php?format=json&action=query&titles='
           + query + '&prop=revisions&rvprop=content')
    jsonVal = json.loads(urllib2.urlopen(url).read())
    if jsonVal:
        for page in jsonVal["query"]["pages"]:
            i = jsonVal["query"]["pages"][page]
            if "revisions" in i:
                page_info = i["revisions"]
                tfobj = tf.TfIdf()
                input_doc = tfobj.add_input_document(str(page_info))
                keywords = tfobj.get_idf(str(query))
                return keywords
    return page_info
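# Hedged usage sketch; the query string is illustrative, and a real run needs
# network access to the Wikipedia API plus the json/urllib2/tf imports assumed
# by the function above.
if __name__ == "__main__":
    print(wiki_query("Albert Einstein"))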
def testStopwordFile(self):
    my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt", "tfidf_teststopwords.txt",
                           DEFAULT_IDF=DEFAULT_IDF_UNITTEST)
    self.assertEquals(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
    self.assertEquals(0, my_tfidf.get_idf("moon"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 5),
                            my_tfidf.get_idf("said"))
    my_tfidf.add_input_document("moon")
    my_tfidf.add_input_document("moon and water")
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 1),
                            my_tfidf.get_idf("water"))
    self.assertEquals(0, my_tfidf.get_idf("moon"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 5),
                            my_tfidf.get_idf("said"))
def create_stopword_file(self, subcorpora, stopword_percentage_threshold):
    """Creates a stopword file. Returns stopword filename."""
    _tfidf = tfidf.TfIdf()
    filedict = file_dict('../ref_file.txt')
    path = '../stripped_text/'
    for filename in filedict[self.subcorpora]:
        print filename
        _tfidf.add_input_document(path + filename)
    print str(stopword_percentage_threshold)
    _tfidf.save_corpus_to_file(
        "../1grams/" + self.subcorpora + ".txt",
        "../stopwords/" + self.subcorpora + "_" +
        str(stopword_percentage_threshold) + ".txt",
        stopword_percentage_threshold)
    # returns the stopword filename
    return ("../stopwords/" + self.subcorpora + "_" +
            str(stopword_percentage_threshold) + ".txt")
def testAddCorpus(self):
    """Test adding input documents to the corpus."""
    my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt",
                           DEFAULT_IDF=DEFAULT_IDF_UNITTEST)
    self.assertEquals(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 1),
                            my_tfidf.get_idf("moon"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 5),
                            my_tfidf.get_idf("said"))
    my_tfidf.add_input_document("water, moon")
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 1),
                            my_tfidf.get_idf("water"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 2),
                            my_tfidf.get_idf("moon"))
    self.assertAlmostEquals(get_exected_idf(my_tfidf.get_num_docs(), 5),
                            my_tfidf.get_idf("said"))
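# The corpus tests above compare against a helper
# get_exected_idf(num_docs, num_docs_containing_term) that is not shown in this
# excerpt. One common add-one-smoothed formulation is sketched below; it is an
# assumption, and the real helper may use a different formula.
import math

def get_exected_idf(num_docs_total, num_docs_term):
    return math.log(float(1 + num_docs_total) / (1 + num_docs_term))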
def main():
    testpath = "/Accounts/groenemm/summer/repo/tfidf/cachedtexts"
    testfiles = os.listdir(testpath)
    maxrank = 10
    verbose = True
    t = tfidf.TfIdf(corpus_filename="reformattedfreqlist.txt",
                    stopword_filename="stopwords.txt")
    sc = sphinx_inter.SphinxClient("dmusican41812", rankingmode=SPH_RANK_BM25,
                                   fieldweights={"title": 4, "body": 1})
    # tests = importTestCites("citelist.txt")
    testresults = TestResults()
    for path, pageid in [(os.path.join(testpath, testfile), int(testfile))
                         for testfile in testfiles]:
        with open(path) as f:
            text = f.read()
        if text is None or text == "":
            continue
        testresults.addResult(*testCitation(pageid, text, getCalaisQuery,
                                            maxrank, sc, t, verbose))
    testresults.printSummary()
def build_idf(self, description_column_name, out_file=None, csv_location=None):
    """Build an IDF vector over the corpus descriptions.

    :param description_column_name: corpus column containing the text entries
    :param out_file: optional filename for a CSV dump of the IDF vector
    :param csv_location: directory in which to write out_file
    :return: the IDF values and their terms, as two parallel lists
    """
    idfcalc = tfidf.TfIdf()
    for entry in self.corpus.loc[:, description_column_name].values:
        idfcalc.add_input_document(entry)
    idf_list = []
    term_list = []
    for term in idfcalc.term_num_docs:
        idf = idfcalc.get_idf(term)
        idf_list.append(idf)
        term_list.append(term)
    idf_vector = pd.Series(idf_list, index=term_list)
    idf_vector = idf_vector.sort_values(ascending=False)
    if out_file:
        if csv_location:
            idf_vector.to_csv(csv_location + '/' + out_file)
        else:
            print("Error: no location specified for output csv")
    self.idf_vector_created = True
    self.idf_vector = idf_vector
    return idf_list, term_list
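# Standalone hedged sketch of the same IDF-vector pattern, driven directly with
# tfidf.TfIdf and a toy pandas DataFrame. The column name "description" mirrors
# the method above; the example rows are made up.
import pandas as pd
import tfidf

toy_corpus = pd.DataFrame({"description": ["red woollen scarf",
                                           "blue cotton scarf",
                                           "red leather wallet"]})
idfcalc = tfidf.TfIdf()
for entry in toy_corpus.loc[:, "description"].values:
    idfcalc.add_input_document(entry)
idf_vector = pd.Series({term: idfcalc.get_idf(term)
                        for term in idfcalc.term_num_docs})
print(idf_vector.sort_values(ascending=False))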
import tfidf
import numpy
import collections

fai_result = {}
data = tfidf.TfIdf()
fai = data.csv2dict("/home/wzswan/Downloads/github/DD-LSTW/fai.csv")
fai_count = collections.Counter(fai)
pe = data.csv2dict("/home/wzswan/Downloads/github/DD-LSTW/pe.csv")
stock = data.csv2dict("/home/wzswan/Downloads/github/DD-LSTW/stock.csv")
print fai
print fai_count
#print pe
#print stock

table = tfidf.TfIdf()
table.add_document("FAI", fai)
table.add_document("PE", pe)
table.add_document("STOCK_INDEX", stock)
#print table
#print "key 16 is:%s" % (table.similarities(["16"]))
fai_result = {'24': table.similarities(["24"]),
              '25': table.similarities(["25"]),
              '26': table.similarities(["26"]),
              '27': table.similarities(["27"]),
              '20': table.similarities(["20"]),
              '21': table.similarities(["21"]),
              '22': table.similarities(["22"]),
              '23': table.similarities(["23"]),
              '28': table.similarities(["28"]),
              '29': table.similarities(["29"]),
              '38': table.similarities(["38"]),
              '15': table.similarities(["15"]),
              '17': table.similarities(["17"]),
              '16': table.similarities(["16"]),
              '33': table.similarities(["33"]),
              '18': table.similarities(["18"]),
              '30': table.similarities(["30"]),
              '37': table.similarities(["37"]),
import tfidf
from stopwords import *

# test1, all parameters set to default
mytfidf = tfidf.TfIdf()
mytfidf.add_input_document("../test_files/kowari.txt")
mytfidf.add_input_document("../test_files/platypus.txt")
mytfidf.save_corpus_to_file(idf_filename="test1.txt")

# test2, turn on variant word order
mytfidf = tfidf.TfIdf(variant_word_order=True)
mytfidf.add_input_document("../test_files/kowari.txt")
mytfidf.add_input_document("../test_files/platypus.txt")
mytfidf.save_corpus_to_file(idf_filename="test2.txt")

# test3, try with trigrams
mytfidf = tfidf.TfIdf(ngram_size=3)
mytfidf.add_input_document("../test_files/kowari.txt")
mytfidf.add_input_document("../test_files/platypus.txt")
mytfidf.save_corpus_to_file(idf_filename="test3.txt")

# test4, try giving it a window of 4
mytfidf = tfidf.TfIdf(window=4)
mytfidf.add_input_document("../test_files/kowari.txt")
mytfidf.add_input_document("../test_files/platypus.txt")
mytfidf.save_corpus_to_file(idf_filename="test4.txt")
import tfidf

if __name__ == "__main__":
    table = tfidf.TfIdf()
    table.add_document("foo", ["a", "b", "c", "d", "e", "f", "g", "h"])
    table.add_document("bar", ["a", "b", "c", "i", "j", "k"])
    table.add_document("baz", ["a", "l", "m", "n"])
    table.add_document("taz", ["t"])
    print table.similarities(["a", "l", "m", "n"])
    print table.similarities(["t"])
    # [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]]
def run(pathImages, method, keypnt, numpatch, equalnum, imdes, imsample, percentage, codebook, dist, size, fselec, fselec_perc, histnorm, clust, K, pca, nclusters, rep, levels): ################################################################# # # Initializations and result file configurations # ################################################################# #warnings.simplefilter("error") if os.path.exists('save_HIST.txt') == True: os.remove('save_HIST.txt') if os.path.exists('save_dist.txt') == True: os.remove('save_dist.txt') if os.path.exists('saveClustersKmeans.txt') == True: os.remove('saveClustersKmeans.txt') im_dataset_name = pathImages.split('/')[-1] date_time = datetime.datetime.now().strftime('%b-%d-%I%M%p-%G') name_results_file = im_dataset_name + '_' + keypnt + '_' + str( numpatch ) + '_' + str(equalnum) + '_' + imdes + '_' + 'levels:' + str( levels ) + '_' + imsample + '_' + codebook + '_' + str( size ) + '_' + fselec + '_' + histnorm + '_' + clust + '_' + dist + '_' + date_time #dir_results = 'Results_' + im_dataset_name + '_SPM_' + date_time dir_results = 'Results_SPM' if not os.path.exists(dir_results): os.makedirs(dir_results) file_count = 2 file_name = os.path.join(dir_results, name_results_file) while os.path.exists(file_name + ".txt"): file_name = os.path.join(dir_results, name_results_file) + "_" + str(file_count) file_count = file_count + 1 f = open(file_name + ".txt", 'w') ################################################################# # # Get images # ################################################################# #pathImages = '/Users/Mariana/mieec/Tese/Development/ImageDatabases/Graz-01_sample' imList = get_imlist(pathImages) print 'Number of images read = ' + str(len(imList)) f.write("Number of images in dataset read: " + str(len(imList)) + "\n") ################################################################# # # Image description # ################################################################# #Number of regions n_regions = np.power(4, levels - 1) #Get detector classes det_sift = siftLib.Sift(numpatch / n_regions, equalnum) det_surf = surfLib.Surf(numpatch / n_regions, equalnum) det_fast = fastDetector.Fast(numpatch / n_regions, equalnum) det_star = starDetector.Star(numpatch / n_regions, equalnum) det_orb = orbLib.Orb(numpatch / n_regions, equalnum) det_random = randomDetector.Random(numpatch / n_regions) names_detectors = np.array( ["SIFT", "SURF", "FAST", "STAR", "ORB", "RANDOM"]) detectors = np.array( [det_sift, det_surf, det_fast, det_star, det_orb, det_random]) #Get the detector passed in the -k argument index = np.where(names_detectors == keypnt)[0] if index.size > 0: detector_to_use = detectors[index[0]] else: print 'Wrong detector name passed in the -k argument. Options: SIFT, SURF, FAST, STAR, ORB and RANDOM' sys.exit() #FOR RESULTS FILE detector_to_use.writeParametersDet(f) #Get descriptor classes des_sift = siftLib.Sift(numpatch / n_regions, equalnum) des_surf = surfLib.Surf(numpatch / n_regions, equalnum) des_orb = orbLib.Orb(numpatch / n_regions) des_brief = briefDescriptor.Brief() des_freak = freakDescriptor.Freak() names_descriptors = np.array(["SIFT", "SURF", "ORB", "BRIEF", "FREAK"]) descriptors = np.array([des_sift, des_surf, des_orb, des_brief, des_freak]) #Get the detector passed in the -d argument index = np.where(names_descriptors == imdes)[0] if index.size > 0: descriptor_to_use = descriptors[index[0]] else: print 'Wrong descriptor name passed in the -d argument. 
Options: SIFT, SURF, ORB, BRIEF and FREAK' sys.exit() #FOR RESULTS FILE descriptor_to_use.writeParametersDes(f) kp_vector = [] #vector with the keypoints object des_vector = [ ] #vector wih the descriptors (in order to obtain the codebook) number_of_kp = [] #vector with the number of keypoints per image counter = 1 #save current time start_time = time.time() labels = [] class_names = [] #Border border = 40 side = int(np.sqrt(n_regions)) des_vector_byregion = [0] * n_regions number_of_kp_region = [0] * n_regions filled = [0] * n_regions #matrixes of the indexes mat_indexes = np.array([[0, 1, 4, 5, 16, 17, 20, 21], [2, 3, 6, 7, 18, 19, 22, 23], [8, 9, 12, 13, 24, 25, 28, 29], [10, 11, 14, 15, 26, 27, 30, 31], [32, 33, 36, 37, 48, 49, 52, 53], [34, 35, 38, 39, 50, 51, 54, 55], [40, 41, 44, 45, 56, 57, 60, 61], [42, 43, 46, 47, 58, 59, 62, 63]]) #detect the keypoints and compute the sift descriptors for each image for im in imList: if 'DS_Store' not in im: print 'image: ' + str(im) + ' number: ' + str(counter) #read image img = cv2.imread(im, 0) # region for i in range(0, side): for j in range(0, side): #mask in order to avoid keypoints in border of image. size = 40 pixels height, width = img.shape h_region = (height - 2 * border) / np.sqrt(n_regions) w_region = (width - 2 * border) / np.sqrt(n_regions) mask = np.zeros(img.shape, dtype=np.uint8) mask[border + i * h_region:border + (i + 1) * h_region, border + j * w_region:border + (j + 1) * w_region] = 1 #get keypoints from detector kp = detector_to_use.detectKp(img, mask) #get features from descriptor des = descriptor_to_use.computeDes(img, kp) number_of_kp.append(len(kp)) #print i*np.sqrt(n_regions)+j #print number_of_kp_region[int(i*np.sqrt(n_regions)+j)] if filled[mat_indexes[i, j]] == 1: #descriptors of all the regions (in a list) des_vector_byregion[mat_indexes[ i, j]] = np.concatenate( (des_vector_byregion[mat_indexes[i, j]], des), axis=0) #number of descriptors in each region number_of_kp_region[mat_indexes[ i, j]] = np.concatenate( (number_of_kp_region[mat_indexes[i, j]], np.array([len(kp)])), axis=0) else: des_vector_byregion[mat_indexes[i, j]] = des number_of_kp_region[mat_indexes[i, j]] = np.array( [len(kp)]) filled[mat_indexes[i, j]] = 1 #print des_vector_byregion #print number_of_kp_region #for evaluation name1 = im.split("/")[-1] name = name1.split("_")[0] if name in class_names: index = class_names.index(name) labels.append(index) else: class_names.append(name) index = class_names.index(name) labels.append(index) counter += 1 #measure the time to compute the description of each image (divide time elapsed by # of images) elapsed_time = (time.time() - start_time) / len(imList) print 'Time to compute detector and descriptor for each image = ' + str( elapsed_time) f.write( 'Average time to compute detector and descriptor for each image = ' + str(elapsed_time) + '\n') n_images = counter - 1 average_words = sum(number_of_kp) / float(len(number_of_kp)) #all the descriptors together des_vector = np.concatenate(np.array(des_vector_byregion)) print 'Total number of features = ' + str(len(des_vector)) f.write('Total number of features obtained = ' + str(len(des_vector)) + '\n') print 'Average number of keypoints per image = ' + str(average_words) f.write('Average number of keypoints per image = ' + str(average_words) + '\n') ################################################################# # # Image and Keypoint sampling # ################################################################# rand_indexes = [] nmi_indexes = [] for iteraction 
in range(0, rep): print "\nIteraction #" + str(iteraction + 1) + '\n' f.write("\nIteraction #" + str(iteraction + 1) + '\n') print 'Sampling images and keypoints prior to codebook computation...' if imsample != "NONE": sampleKp = sampleKeypoints.SamplingImandKey( n_images, number_of_kp, average_words, percentage) sampleallKp = sampleAllKeypoints.SamplingAllKey(percentage) names_sampling = np.array(["SAMPLEI", "SAMPLEP"]) sample_method = np.array([sampleKp, sampleallKp]) #Get the detector passed in the -g argument index = np.where(names_sampling == imsample)[0] if index.size > 0: sampling_to_use = sample_method[index[0]] else: print 'Wrong sampling method passed in the -g argument. Options: NONE, SAMPLEI, SAMPLEP' sys.exit() #FOR RESULTS FILE sampling_to_use.writeFile(f) des_vector_sampled = sampling_to_use.sampleKeypoints(des_vector) print 'Total number of features after sampling = ' + str( len(des_vector_sampled)) f.write('Total number of features after sampling = ' + str(len(des_vector_sampled)) + '\n') print 'Images and keypoints sampled...' else: print 'No sampling method chosen' #FOR RESULTS FILE f.write( "No method of keypoint sampling chosen. Use all keypoints for codebook construction \n" ) des_vector_sampled = des_vector ################################################################# # # Codebook computation # ################################################################# print 'Obtaining codebook...' #save current time start_time = time.time() #Get detector classes codebook_kmeans = KMeans1.KMeans1(size) codebook_birch = Birch.Birch(size) codebook_minibatch = minibatch.MiniBatch(size) codebook_randomv = randomSamplesBook.RandomVectors(size) codebook_allrandom = allrandom.AllRandom(size) names_codebook = np.array( ["KMEANS", "BIRCH", "MINIBATCH", "RANDOMV", "RANDOM"]) codebook_algorithm = np.array([ codebook_kmeans, codebook_birch, codebook_minibatch, codebook_randomv, codebook_allrandom ]) #Get the detector passed in the -c argument index = np.where(names_codebook == codebook)[0] if index.size > 0: codebook_to_use = codebook_algorithm[index[0]] else: print 'Wrong codebook construction algorithm name passed in the -c argument. Options: KMEANS, MINIBATCH, RANDOMV and RANDOM' sys.exit() #FOR RESULTS FILE codebook_to_use.writeFileCodebook(f) #Get centers and projections using codebook algorithm centers, projections = codebook_to_use.obtainCodebook( des_vector_sampled, des_vector) #compute the number of unique descriptor vectors codebook_randomv.unique_vectors(centers) elapsed_time = (time.time() - start_time) print 'Time to compute codebook = ' + str(elapsed_time) f.write('Time to compute codebook = ' + str(elapsed_time) + '\n') ################################################################# # # Obtain Histogram # ################################################################# des_byregion = des_vector_byregion numkp_region = number_of_kp_region hist_total = [] for level in range(levels - 1, -1, -1): print 'Level = ' + str(level) n_regions = np.power(4, level) for i in range(0, n_regions): print 'Obtaining histograms...' 
#print 'projection shape = '+ str(projections.shape) #print 'size = ' + str(size) #print 'n of images = ' + str(n_images) #print 'number of kp' + str(number_of_kp) #print len(des_vector_byregion) #print len(des_vector_byregion[0]) #print len(des_vector_byregion[0][0]) result = scipy.cluster.vq.vq(np.array(des_byregion[i]), centers) projections_region = result[0] #print 'projections = ' + str(projections_region) #print n_images #print number_of_kp_region[i] #print len(number_of_kp_region) #print len(number_of_kp_region[0]) hist = histogram.computeHist(projections_region, size, n_images, numkp_region[i]) #print hist print 'Histograms obtained' #print hist ################################################################ # # Feature selection # ################################################################# print 'Number of visual words = ' + str(len(hist[0])) if fselec != "NONE": print 'Applying feature selection to descriptors...' filter_max = filterMax.WordFilterMax(fselec_perc[0]) filter_min = filterMin.WordFilterMin(fselec_perc[1]) filter_maxmin = filterMaxMin.WordFilterMaxMin( fselec_perc[0], fselec_perc[1]) names_filter = np.array(["FMAX", "FMIN", "FMAXMIN"]) filter_method = np.array( [filter_max, filter_min, filter_maxmin]) #Get the detector passed in the -f argument index = np.where(names_filter == fselec)[0] if index.size > 0: filter_to_use = filter_method[index[0]] else: print 'Wrong codebook construction algorithm name passed in the -f argument. Options: NONE, FMAX, FMIN, FMAXMIN' sys.exit() hist = filter_to_use.applyFilter(hist, size, n_images) #FOR RESULTS FILE filter_to_use.writeFile(f) new_size = hist.shape[1] print 'Visual words Filtered' print 'Number of visual words filtered = ' + str(size - new_size) f.write("Number of visual words filtered = " + str(size - new_size) + '\n') print 'Final number of visual words = ' + str(new_size) f.write('Final number of visual words = ' + str(new_size) + '\n') else: #FOR RESULTS FILE filter_min = filterMin.WordFilterMin(0) hist = filter_min.applyFilter(hist, size, n_images) new_size = hist.shape[1] print 'Number of visual words filtered = ' + str(size - new_size) f.write("No feature selection applied \n") ################################################################# # # Histogram Normalization # ################################################################# if histnorm != "NONE": #Get detector classes norm_sbin = simpleBinarization.SimpleBi() norm_tfnorm = tfnorm.Tfnorm() norm_tfidf = tfidf.TfIdf() norm_tfidf2 = tfidf2.TfIdf2() norm_tfidfnorm = tfidfnorm.TfIdfnorm() norm_okapi = okapi.Okapi(average_words) names_normalization = np.array([ "SBIN", "TFNORM", "TFIDF", "TFIDF2", "TFIDFNORM", "OKAPI" ]) normalization_method = np.array([ norm_sbin, norm_tfnorm, norm_tfidf, norm_tfidf2, norm_tfidfnorm, norm_okapi ]) #Get the detector passed in the -h argument index = np.where(names_normalization == histnorm)[0] if index.size > 0: normalization_to_use = normalization_method[index[0]] new_hist = normalization_to_use.normalizeHist( hist, new_size, n_images) else: print 'Wrong normalization name passed in the -h argument. 
Options: SBIN, TFNORM, TFIDF and TFIDF2' sys.exit() #FOR RESULTS FILE normalization_to_use.writeFile(f) else: #FOR RESULTS FILE f.write("No histogram normalization applied\n") new_hist = hist hist_total.append(np.array(new_hist)) #concatenate des_vector_byregion TODOOOOOOOOOO des_vector_aux = [] number_of_kp_aux = [] if level != 0: side = 4 ntimes = int(np.power(4, level - 1)) for h in range(0, ntimes): #print len(des_byregion) #print h*side #print (h+1)*side des_vector_aux.append( np.concatenate(des_byregion[h * side:(h + 1) * side], axis=0)) count = 0 for n in numkp_region[h * side:(h + 1) * side]: if count != 0: sum_np = [sum(x) for x in zip(sum_np, n)] else: sum_np = n count = count + 1 number_of_kp_aux.append(sum_np) des_byregion = des_vector_aux numkp_region = number_of_kp_aux #print hist_total hist_total = np.concatenate(hist_total, axis=1) print len(hist_total[0]) ################################################################# # # Clustering of the features # ################################################################# #save current time start_time = time.time() #Get detector classes clust_dbscan = Dbscan.Dbscan(dist) clust_kmeans = KMeans1.KMeans1([nclusters]) clust_birch = Birch.Birch(nclusters) clust_meanSift = meanSift.MeanSift(nclusters) clust_hierar1 = hierarchicalClustering.Hierarchical(nclusters, dist) clust_hierar2 = hierarchicalClustScipy.HierarchicalScipy(dist) clust_community = communityDetection.CommunityDetection(dist) names_clustering = np.array([ "DBSCAN", "KMEANS", "BIRCH", "MEANSIFT", "HIERAR1", "HIERAR2", "COMM" ]) clustering_algorithm = np.array([ clust_dbscan, clust_kmeans, clust_birch, clust_meanSift, clust_hierar1, clust_hierar2, clust_community ]) #Get the detector passed in the -a argument index = np.where(names_clustering == clust)[0] if index.size > 0: clustering_to_use = clustering_algorithm[index[0]] else: print 'Wrong clustering algorithm name passed in the -a argument. 
Options: DBSCAN, KMEANS, BIRCH, MEANSIFT, HIERAR1, HIERAR2, COMM' sys.exit() clusters = clustering_to_use.obtainClusters(hist_total) #FOR RESULTS FILE clustering_to_use.writeFileCluster(f) elapsed_time = (time.time() - start_time) print 'Time to run clustering algorithm = ' + str(elapsed_time) f.write('Time to run clustering algorithm = ' + str(elapsed_time) + '\n') print 'Number of clusters obtained = ' + str(max(clusters) + 1) f.write('Number of clusters obtained = ' + str(max(clusters) + 1) + '\n') print 'Clusters obtained = ' + str(np.asarray(clusters)) #date_time = datetime.datetime.now().strftime('%b-%d-%I%M%p-%G') #np.savetxt('saveClusters_'+date_time+'_.txt', clusters, '%i', ',') ##ADDED ################################################################## ## ## Create folder with central images for each cluster ## ################################################################## #dir_results = 'Results_' + im_dataset_name + '_SPM_' + date_time ##obtain representative images for each cluster #central_ims = clust_community.obtainCenteralImages(new_hist, clusters) #central_folder = os.path.join(dir_results,'CenterImages') #if not os.path.exists(central_folder): #os.makedirs(central_folder) #count=0 #for central_im in central_ims: #filename = os.path.join(central_folder,'Cluster_'+str(count)+'.jpg') #img = cv2.imread(imPaths[central_im],1) #cv2.imwrite(filename, img) #count = count + 1 ##ADDED ################################################################## ## ## Separate Clusters into folders ## ################################################################## #clusters_folder = os.path.join(dir_results,'Clusters') #if not os.path.exists(clusters_folder): #os.makedirs(clusters_folder) #clust_dir = [] #for iclust in range(0,nclusters): #direc = os.path.join(clusters_folder,'Cluster_'+str(iclust)) #if not os.path.exists(direc): #os.makedirs(direc) #clust_dir.append(direc) #for im in range(0,len(imPaths)): #im_name = imPaths[im].split('/')[-1] ##print clust_dir[int(clusters[im])] #filename = os.path.join(clust_dir[int(clusters[im])],im_name) ##print filename #img = cv2.imread(imPaths[im],1) #cv2.imwrite(filename, img) ################################################################# # # Evaluation # ################################################################# users = 0 if users == 1: rand_index = evaluationUsers.randIndex(clusters) rand_indexes.append(rand_index) print 'rand_index = ' + str(rand_index) f.write("Rand Index = " + str(rand_index) + "\n") else: if len(clusters) == len(labels): f.write("\nResults\n") f.write('Clusters Obtained = ' + str(np.asarray(clusters))) f.write('Labels = ' + str(np.asarray(labels))) rand_index = metrics.adjusted_rand_score(labels, clusters) rand_indexes.append(rand_index) print 'rand_index = ' + str(rand_index) f.write("Rand Index = " + str(rand_index) + "\n") NMI_index = metrics.normalized_mutual_info_score( labels, clusters) nmi_indexes.append(NMI_index) print 'NMI_index = ' + str(NMI_index) f.write("NMI Index = " + str(NMI_index) + "\n") if rep > 1: f.write("\nFINAL RESULTS\n") f.write("Avg Rand Index = " + str(float(sum(rand_indexes)) / rep) + "\n") f.write("Std Rand Index = " + str(statistics.stdev(rand_indexes)) + "\n") f.write("Avg NMI Index = " + str(float(sum(nmi_indexes)) / rep) + "\n") f.write("Std NMI Index = " + str(statistics.stdev(nmi_indexes)) + "\n") f.close()
                prob *= smooth_prob
            else:
                prob *= self.type_model[source][word] / self.get_data_num(source)
        return prob

    max_prob, result = 0, ''
    for i in range(len(self.sources)):
        prob = get_prob(vec, self.sources[i])
        if prob > max_prob:
            max_prob, result = prob, self.sources[i]
    return result


if __name__ == '__main__':
    tfidf = tfidf.TfIdf()
    model = Model()
    paras = os.listdir('test')
    #print(paras)
    count, right = 0, 0
    for source in model.sources:
        if source in paras:
            papers = os.listdir('test/' + source)
            for paper in papers:
                with open('/'.join(['test', source, paper])) as file:
                    vec = tfidf.paragraph2vec(file.read())
                    if len(vec) < 10:
                        continue
                    print(vec)
                    result = model.classify(vec)
                    if result == source:
                        right += 1
                    count += 1
def run(pathImages, method, numpatch, imsample, percentage, codebook, dist, size, fselec, fselec_perc, histnorm, clust, nclusters, rep): ################################################################# # # Initializations and result file configurations # ################################################################# im_dataset_name = pathImages.split('/')[-1] date_time = datetime.datetime.now().strftime('%b-%d-%I%M%p-%G') name_results_file = 'BOC_' + im_dataset_name + '_' + str( numpatch ) + '_' + imsample + '_' + codebook + '_' + str( size ) + '_' + fselec + '_' + histnorm + '_' + clust + '_' + dist + '_' + date_time #dir_results = 'Results_' + im_dataset_name + '_BOC_' + date_time dir_results = 'Results_BOC' if not os.path.exists(dir_results): os.makedirs(dir_results) file_count = 2 file_name = os.path.join(dir_results, name_results_file) while os.path.exists(file_name + ".txt"): file_name = os.path.join(dir_results, name_results_file) + "_" + str(file_count) file_count = file_count + 1 f = open(file_name + ".txt", 'w') ################################################################# # # Get images # ################################################################# #pathImages = '/Users/Mariana/mieec/Tese/Development/ImageDatabases/Graz-01_sample' imList = get_imlist(pathImages) print 'Number of images read = ' + str(len(imList)) f.write("Number of images in dataset read: " + str(len(imList)) + "\n") ################################################################# # # Image description # ################################################################# kp_vector = [] #vector with the keypoints object des_vector = [ ] #vector wih the descriptors (in order to obtain the codebook) number_of_kp = [] #vector with the number of keypoints per image counter = 1 #save current time start_time = time.time() labels = [] class_names = [] #ADDED imPaths = [] #number of divisions of the image div = int(np.sqrt(numpatch)) n_images = 0 #detect the keypoints and compute the sift descriptors for each image for im in imList: if 'DS_Store' not in im: #ADDED imPaths.append(im) print 'image: ' + str(im) + ' number: ' + str(counter) #read image img = cv2.imread(im, 1) img_gray = cv2.imread(im, 0) img_lab = cv2.cvtColor(img, cv.CV_BGR2Lab) height, width, comp = img_lab.shape h_region = height / div w_region = width / div des = [] for i in range(0, div): for j in range(0, div): #mask mask = np.zeros(img_gray.shape, dtype=np.uint8) mask[i * h_region:(i + 1) * h_region, j * w_region:(j + 1) * w_region] = 1 hist = cv2.calcHist([img_lab], [0, 1, 2], mask, [256, 256, 256], [0, 256, 0, 256, 0, 256]) max_color_l, max_color_a, max_color_b = np.where( hist == np.max(hist)) des.append( [max_color_l[0], max_color_a[0], max_color_b[0]]) number_of_kp.append(div * div) if counter == 1: des_vector = des else: des_vector = np.concatenate((des_vector, des), axis=0) counter += 1 #for evaluation name1 = im.split("/")[-1] name = name1.split("_")[0] if name in class_names: index = class_names.index(name) labels.append(index) else: class_names.append(name) index = class_names.index(name) labels.append(index) n_images = n_images + 1 #measure the time to compute the description of each image (divide time elapsed by # of images) elapsed_time = (time.time() - start_time) / len(imList) print 'Time to compute detector and descriptor for each image = ' + str( elapsed_time) f.write( 'Average time to compute detector and descriptor for each image = ' + str(elapsed_time) + '\n') average_words = sum(number_of_kp) / float(len(number_of_kp)) 
print 'Total number of features = ' + str(len(des_vector)) f.write('Total number of features obtained = ' + str(len(des_vector)) + '\n') print 'Average number of keypoints per image = ' + str(average_words) f.write('Average number of keypoints per image = ' + str(average_words) + '\n') ################################################################# # # Image and Keypoint sampling # ################################################################# rand_indexes = [] nmi_indexes = [] for iteraction in range(0, rep): print "\nIteraction #" + str(iteraction + 1) + '\n' f.write("\nIteraction #" + str(iteraction + 1) + '\n') print 'Sampling images and keypoints prior to codebook computation...' if imsample != "NONE": sampleKp = sampleKeypoints.SamplingImandKey( n_images, number_of_kp, average_words, percentage) sampleallKp = sampleAllKeypoints.SamplingAllKey(percentage) names_sampling = np.array(["SAMPLEI", "SAMPLEP"]) sample_method = np.array([sampleKp, sampleallKp]) #Get the sampling method passed in the -g argument index = np.where(names_sampling == imsample)[0] if index.size > 0: sampling_to_use = sample_method[index[0]] else: print 'Wrong sampling method passed in the -g argument. Options: NONE, SAMPLEI, SAMPLEP' sys.exit() #FOR RESULTS FILE sampling_to_use.writeFile(f) des_vector_sampled = sampling_to_use.sampleKeypoints(des_vector) print 'Total number of features after sampling = ' + str( len(des_vector_sampled)) f.write('Total number of features after sampling = ' + str(len(des_vector_sampled)) + '\n') print 'Images and keypoints sampled...' else: print 'No sampling method chosen' #FOR RESULTS FILE f.write( "No method of keypoint sampling chosen. Use all keypoints for codebook construction \n" ) des_vector_sampled = des_vector ################################################################# # # Codebook computation # ################################################################# print 'Obtaining codebook...' #save current time start_time = time.time() #Get detector classes codebook_kmeans = KMeans1.KMeans1(size) codebook_birch = Birch.Birch(size) codebook_minibatch = minibatch.MiniBatch(size) codebook_randomv = randomSamplesBook.RandomVectors(size) codebook_allrandom = allrandom.AllRandom(size) names_codebook = np.array( ["KMEANS", "BIRCH", "MINIBATCH", "RANDOMV", "RANDOM"]) codebook_algorithm = np.array([ codebook_kmeans, codebook_birch, codebook_minibatch, codebook_randomv, codebook_allrandom ]) #Get the codebook algorithm passed in the -c argument index = np.where(names_codebook == codebook)[0] if index.size > 0: codebook_to_use = codebook_algorithm[index[0]] else: print 'Wrong codebook construction algorithm name passed in the -c argument. Options: KMEANS, MINIBATCH, RANDOMV and RANDOM' sys.exit() #FOR RESULTS FILE codebook_to_use.writeFileCodebook(f) #Get centers and projections using codebook algorithm ceters, projections = codebook_to_use.obtainCodebook( des_vector_sampled, des_vector) elapsed_time = (time.time() - start_time) print 'Time to compute codebook = ' + str(elapsed_time) f.write('Time to compute codebook = ' + str(elapsed_time) + '\n') ################################################################# # # Obtain Histogram # ################################################################# print 'Obtaining histograms...' 
#print 'projection shape = '+ str(projections.shape) #print 'size = ' + str(size) #print 'n of images = ' + str(n_images) #print 'number of kp' + str(number_of_kp) hist = histogram.computeHist(projections, size, n_images, number_of_kp) print hist print 'Histograms obtained' ################################################################ # # Feature selection # ################################################################# print 'Number of visual words = ' + str(len(hist[0])) if fselec != "NONE": print 'Applying feature selection to descriptors...' filter_max = filterMax.WordFilterMax(fselec_perc[0]) filter_min = filterMin.WordFilterMin(fselec_perc[1]) filter_maxmin = filterMaxMin.WordFilterMaxMin( fselec_perc[0], fselec_perc[1]) names_filter = np.array(["FMAX", "FMIN", "FMAXMIN"]) filter_method = np.array([filter_max, filter_min, filter_maxmin]) #Get the feature selection method passed in the -f argument index = np.where(names_filter == fselec)[0] if index.size > 0: filter_to_use = filter_method[index[0]] else: print 'Wrong codebook construction algorithm name passed in the -f argument. Options: NONE, FMAX, FMIN, FMAXMIN' sys.exit() hist = filter_to_use.applyFilter(hist, size, n_images) #FOR RESULTS FILE filter_to_use.writeFile(f) new_size = hist.shape[1] print 'Visual words Filtered' print 'Number of visual words filtered = ' + str(size - new_size) f.write("Number of visual words filtered = " + str(size - new_size) + '\n') print 'Final number of visual words = ' + str(new_size) f.write('Final number of visual words = ' + str(new_size) + '\n') else: #FOR RESULTS FILE filter_min = filterMin.WordFilterMin(0) hist = filter_min.applyFilter(hist, size, n_images) new_size = hist.shape[1] print 'Number of visual words filtered = ' + str(size - new_size) f.write("No feature selection applied \n") ################################################################# # # Histogram Normalization # ################################################################# if histnorm != "NONE": #Get detector classes norm_sbin = simpleBinarization.SimpleBi() norm_tfnorm = tfnorm.Tfnorm() norm_tfidf = tfidf.TfIdf() norm_tfidf2 = tfidf2.TfIdf2() norm_tfidfnorm = tfidfnorm.TfIdfnorm() norm_okapi = okapi.Okapi(average_words) norm_power = powerNorm.PowerNorm() names_normalization = np.array([ "SBIN", "TFNORM", "TFIDF", "TFIDF2", "TFIDFNORM", "OKAPI", "POWER" ]) normalization_method = np.array([ norm_sbin, norm_tfnorm, norm_tfidf, norm_tfidf2, norm_tfidfnorm, norm_okapi, norm_power ]) #Get the detector passed in the -h argument index = np.where(names_normalization == histnorm)[0] if index.size > 0: normalization_to_use = normalization_method[index[0]] new_hist = normalization_to_use.normalizeHist( hist, new_size, n_images) else: print 'Wrong normalization name passed in the -h argument. 
Options: SBIN, TFNORM, TFIDF and TFIDF2' sys.exit() #FOR RESULTS FILE normalization_to_use.writeFile(f) else: #FOR RESULTS FILE f.write("No histogram normalization applied\n") new_hist = hist ################################################################# # # Clustering of the features # ################################################################# #save current time start_time = time.time() #Get detector classes clust_dbscan = Dbscan.Dbscan(dist) clust_kmeans = KMeans1.KMeans1([nclusters]) clust_birch = Birch.Birch(nclusters) clust_meanSift = meanSift.MeanSift(nclusters) clust_hierar1 = hierarchicalClustering.Hierarchical(nclusters, dist) clust_hierar2 = hierarchicalClustScipy.HierarchicalScipy(dist) clust_community = communityDetection.CommunityDetection(dist) names_clustering = np.array([ "DBSCAN", "KMEANS", "BIRCH", "MEANSIFT", "HIERAR1", "HIERAR2", "COMM" ]) clustering_algorithm = np.array([ clust_dbscan, clust_kmeans, clust_birch, clust_meanSift, clust_hierar1, clust_hierar2, clust_community ]) #Get the detector passed in the -a argument index = np.where(names_clustering == clust)[0] if index.size > 0: clustering_to_use = clustering_algorithm[index[0]] else: print 'Wrong clustering algorithm name passed in the -a argument. Options: DBSCAN, KMEANS, BIRCH, MEANSIFT, HIERAR1, HIERAR2, COMM' sys.exit() clusters = clustering_to_use.obtainClusters(new_hist) #FOR RESULTS FILE clustering_to_use.writeFileCluster(f) elapsed_time = (time.time() - start_time) print 'Time to run clustering algorithm = ' + str(elapsed_time) f.write('Time to run clustering algorithm = ' + str(elapsed_time) + '\n') print 'Number of clusters obtained = ' + str(max(clusters) + 1) f.write('Number of clusters obtained = ' + str(max(clusters) + 1) + '\n') nclusters = max(clusters) + 1 print 'Clusters obtained = ' + str(np.asarray(clusters)) #date_time = datetime.datetime.now().strftime('%b-%d-%I%M%p-%G') #np.savetxt('saveClusters_'+date_time+'_.txt', clusters, '%i', ',') #ADDED ################################################################# # # Create folder with central images for each cluster # ################################################################# #obtain representative images for each cluster central_ims = clust_community.obtainCenteralImages(new_hist, clusters) central_folder = os.path.join(dir_results, 'CenterImages') if not os.path.exists(central_folder): os.makedirs(central_folder) count = 0 for central_im in central_ims: filename = os.path.join(central_folder, 'Cluster_' + str(count) + '.jpg') img = cv2.imread(imPaths[central_im], 1) cv2.imwrite(filename, img) count = count + 1 #ADDED ################################################################# # # Separate Clusters into folders # ################################################################# clusters_folder = os.path.join(dir_results, 'Clusters') if not os.path.exists(clusters_folder): os.makedirs(clusters_folder) clust_dir = [] for iclust in range(0, nclusters): direc = os.path.join(clusters_folder, 'Cluster_' + str(iclust)) if not os.path.exists(direc): os.makedirs(direc) clust_dir.append(direc) for im in range(0, len(imPaths)): im_name = imPaths[im].split('/')[-1] #print clust_dir[int(clusters[im])] filename = os.path.join(clust_dir[int(clusters[im])], im_name) #print filename img = cv2.imread(imPaths[im], 1) cv2.imwrite(filename, img) ################################################################# # # Evaluation # ################################################################# users = 0 if users == 1: rand_index = 
evaluationUsers.randIndex(clusters) rand_indexes.append(rand_index) print 'rand_index = ' + str(rand_index) f.write("Rand Index = " + str(rand_index) + "\n") else: if len(clusters) == len(labels): f.write("\nResults\n") f.write('Clusters Obtained = ' + str(np.asarray(clusters))) f.write('Labels = ' + str(np.asarray(labels))) rand_index = metrics.adjusted_rand_score(labels, clusters) rand_indexes.append(rand_index) print 'rand_index = ' + str(rand_index) f.write("Rand Index = " + str(rand_index) + "\n") NMI_index = metrics.normalized_mutual_info_score( labels, clusters) nmi_indexes.append(NMI_index) print 'NMI_index = ' + str(NMI_index) f.write("NMI Index = " + str(NMI_index) + "\n") if rep > 1: f.write("\nFINAL RESULTS\n") f.write("Avg Rand Index = " + str(float(sum(rand_indexes)) / rep) + "\n") f.write("Std Rand Index = " + str(statistics.stdev(rand_indexes)) + "\n") if users != 1: f.write("Avg NMI Index = " + str(float(sum(nmi_indexes)) / rep) + "\n") f.write("Std NMI Index = " + str(statistics.stdev(nmi_indexes)) + "\n") f.close()
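# The image-clustering pipelines above select tfidf.TfIdf() as one of several
# histogram normalizations via normalizeHist(hist, size, n_images). A hedged
# sketch of standard tf-idf weighting for a bag-of-visual-words histogram
# matrix follows; it is an assumption based on the usual definition, not the
# project's actual implementation.
import numpy as np

def normalize_hist_tfidf(hist, size, n_images):
    hist = np.asarray(hist, dtype=float)                      # n_images x size
    tf = hist / np.maximum(hist.sum(axis=1, keepdims=True), 1.0)
    doc_freq = np.maximum((hist > 0).sum(axis=0), 1)          # images per word
    idf = np.log(float(n_images) / doc_freq)
    return tf * idf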
import os
import sys

import tfidf
from helper_functions import curr_test_dict

paramDict = curr_test_dict("curr_test.txt")
path = '../TLG_idf_files/' + paramDict['num_grams'] + 'grams' + '/' + \
    paramDict['spread'] + "_" + "v" + paramDict['variant'] + "_" + \
    "sw" + paramDict['stopwords'] + "/"
if not os.path.exists(path):
    os.makedirs(path)
TLG_file = os.readlink(sys.argv[1])
TLG_file = TLG_file.split('/')
TLG_file = TLG_file[-1]
if not os.path.exists(path + TLG_file):
    print TLG_file
    if paramDict['variant'] == 'True':
        variant = True
    else:
        variant = False
    _tfidf = tfidf.TfIdf(int(paramDict['spread']), variant, None, None,
                         paramDict['stopword_file'])
    _tfidf.add_input_document('../stripped_text/' + TLG_file,
                              int(paramDict['num_grams']))
    _tfidf.save_corpus_to_file(path + TLG_file)
else:
    print 'path already exists for ', TLG_file
def __init__(self, k, n_d, n_w):
    self.k = k or PseudoRelevanceFeedback._K
    self.n_d = n_d or PseudoRelevanceFeedback._N_D
    self.n_w = n_w or PseudoRelevanceFeedback._N_W
    self.tf_idf = tfidf.TfIdf(self.k)
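# The `or` fallbacks above assume class-level defaults (_K, _N_D, _N_W) that
# are not part of this excerpt; a hedged sketch of how the enclosing class
# might declare them (the numeric values are assumptions):
class PseudoRelevanceFeedback(object):
    _K = 10    # assumed default
    _N_D = 5   # assumed default
    _N_W = 20  # assumed default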
def run(pathImages, method, keypnt, numpatch, equalnum, imdes, imsample, percentage, codebook, dist, size, fselec, fselec_perc, histnorm, clust, K, pca, nclusters, rep): ################################################################# # # Initializations and result file configurations # ################################################################# im_dataset_name = pathImages.split('/')[-1] date_time = datetime.datetime.now().strftime('%b-%d-%I%M%p-%G') name_results_file = 'BOF_' + im_dataset_name + '_' + keypnt + '_' + str( numpatch ) + '_' + str( equalnum ) + '_' + imdes + '_' + imsample + '_' + codebook + '_' + str( size ) + '_' + fselec + '_' + histnorm + '_' + clust + '_' + dist + '_' + date_time #dir_results = 'Results_' + im_dataset_name + '_BOF_' + date_time dir_results = 'Results_BOF' if not os.path.exists(dir_results): os.makedirs(dir_results) file_count = 2 file_name = os.path.join(dir_results, name_results_file) while os.path.exists(file_name + ".txt"): file_name = os.path.join(dir_results, name_results_file) + "_" + str(file_count) file_count = file_count + 1 f = open(file_name + ".txt", 'w') ################################################################# # # Get images # ################################################################# #pathImages = '/Users/Mariana/mieec/Tese/Development/ImageDatabases/Graz-01_sample' imList = get_imlist(pathImages) print 'Number of images read = ' + str(len(imList)) f.write("Number of images in dataset read: " + str(len(imList)) + "\n") ################################################################# # # Image description # ################################################################# #Get detector classes det_sift = siftLib.Sift(numpatch, equalnum) det_surf = surfLib.Surf(numpatch, equalnum) det_fast = fastDetector.Fast(numpatch, equalnum) det_star = starDetector.Star(numpatch, equalnum) det_orb = orbLib.Orb(numpatch, equalnum) det_random = randomDetector.Random(numpatch) names_detectors = np.array( ["SIFT", "SURF", "FAST", "STAR", "ORB", "RANDOM"]) detectors = np.array( [det_sift, det_surf, det_fast, det_star, det_orb, det_random]) #Get the detector passed in the -k argument index = np.where(names_detectors == keypnt)[0] if index.size > 0: detector_to_use = detectors[index[0]] else: print 'Wrong detector name passed in the -k argument. Options: SIFT, SURF, FAST, STAR, ORB and RANDOM' sys.exit() #FOR RESULTS FILE detector_to_use.writeParametersDet(f) #Get descriptor classes des_sift = siftLib.Sift(numpatch, equalnum) des_surf = surfLib.Surf(numpatch, equalnum) des_orb = orbLib.Orb(numpatch) des_brief = briefDescriptor.Brief() des_freak = freakDescriptor.Freak() names_descriptors = np.array(["SIFT", "SURF", "ORB", "BRIEF", "FREAK"]) descriptors = np.array([des_sift, des_surf, des_orb, des_brief, des_freak]) #Get the detector passed in the -d argument index = np.where(names_descriptors == imdes)[0] if index.size > 0: descriptor_to_use = descriptors[index[0]] else: print 'Wrong descriptor name passed in the -d argument. 
Options: SIFT, SURF, ORB, BRIEF and FREAK' sys.exit() #FOR RESULTS FILE descriptor_to_use.writeParametersDes(f) kp_vector = [] #vector with the keypoints object des_vector = [ ] #vector wih the descriptors (in order to obtain the codebook) number_of_kp = [] #vector with the number of keypoints per image counter = 1 #save current time start_time = time.time() labels = [] class_names = [] #ADDED imPaths = [] #detect the keypoints and compute the sift descriptors for each image for im in imList: if 'DS_Store' not in im: #ADDED imPaths.append(im) print 'image: ' + str(im) + ' number: ' + str(counter) #read image img = cv2.imread(im, 0) #mask in order to avoid keypoints in border of image. size = 40 pixels border = 40 height, width = img.shape mask = np.zeros(img.shape, dtype=np.uint8) mask[border:height - border, border:width - border] = 1 #get keypoints from detector kp = detector_to_use.detectKp(img, mask) #get features from descriptor des = descriptor_to_use.computeDes(img, kp) number_of_kp.append(len(kp)) kp_vector.append(kp) if counter == 1: des_vector = des else: des_vector = np.concatenate((des_vector, des), axis=0) counter += 1 #for evaluation name1 = im.split("/")[-1] name = name1.split("_")[0] if name in class_names: index = class_names.index(name) labels.append(index) else: class_names.append(name) index = class_names.index(name) labels.append(index) #measure the time to compute the description of each image (divide time elapsed by # of images) elapsed_time = (time.time() - start_time) / len(imList) print 'Time to compute detector and descriptor for each image = ' + str( elapsed_time) f.write( 'Average time to compute detector and descriptor for each image = ' + str(elapsed_time) + '\n') n_images = len(kp_vector) average_words = sum(number_of_kp) / float(len(number_of_kp)) print 'Total number of features = ' + str(len(des_vector)) f.write('Total number of features obtained = ' + str(len(des_vector)) + '\n') print 'Average number of keypoints per image = ' + str(average_words) f.write('Average number of keypoints per image = ' + str(average_words) + '\n') ################################################################# # # Dimentionality reduction # ################################################################# if pca != None: start_time = time.time() print 'Applying PCA...' pca = PCA(n_components=pca) descriptors_reduced = pca.fit(des_vector).transform(des_vector) print 'PCA Applied.' print 'time to apply PCA = ' + str(time.time() - start_time) des_vector = descriptors_reduced ################################################################# # # Image and Keypoint sampling # ################################################################# rand_indexes = [] nmi_indexes = [] for iteraction in range(0, rep): print "\nIteraction #" + str(iteraction + 1) + '\n' f.write("\nIteraction #" + str(iteraction + 1) + '\n') print 'Sampling images and keypoints prior to codebook computation...' if imsample != "NONE": sampleKp = sampleKeypoints.SamplingImandKey( n_images, number_of_kp, average_words, percentage) sampleallKp = sampleAllKeypoints.SamplingAllKey(percentage) names_sampling = np.array(["SAMPLEI", "SAMPLEP"]) sample_method = np.array([sampleKp, sampleallKp]) #Get the detector passed in the -g argument index = np.where(names_sampling == imsample)[0] if index.size > 0: sampling_to_use = sample_method[index[0]] else: print 'Wrong sampling method passed in the -g argument. 
Options: NONE, SAMPLEI, SAMPLEP' sys.exit() #FOR RESULTS FILE sampling_to_use.writeFile(f) des_vector_sampled = sampling_to_use.sampleKeypoints(des_vector) print 'Total number of features after sampling = ' + str( len(des_vector_sampled)) f.write('Total number of features after sampling = ' + str(len(des_vector_sampled)) + '\n') print 'Images and keypoints sampled...' else: print 'No sampling method chosen' #FOR RESULTS FILE f.write( "No method of keypoint sampling chosen. Use all keypoints for codebook construction \n" ) des_vector_sampled = des_vector ################################################################# # # Codebook computation # ################################################################# print 'Obtaining codebook...' #save current time start_time = time.time() #Get detector classes codebook_kmeans = KMeans1.KMeans1(size) codebook_birch = Birch.Birch(size) codebook_minibatch = minibatch.MiniBatch(size) codebook_randomv = randomSamplesBook.RandomVectors(size) codebook_allrandom = allrandom.AllRandom(size) names_codebook = np.array( ["KMEANS", "BIRCH", "MINIBATCH", "RANDOMV", "RANDOM"]) codebook_algorithm = np.array([ codebook_kmeans, codebook_birch, codebook_minibatch, codebook_randomv, codebook_allrandom ]) #Get the detector passed in the -c argument index = np.where(names_codebook == codebook)[0] if index.size > 0: codebook_to_use = codebook_algorithm[index[0]] else: print 'Wrong codebook construction algorithm name passed in the -c argument. Options: KMEANS, MINIBATCH, RANDOMV and RANDOM' sys.exit() #FOR RESULTS FILE codebook_to_use.writeFileCodebook(f) #Get centers and projections using codebook algorithm centers, projections = codebook_to_use.obtainCodebook( des_vector_sampled, des_vector) #compute the number of unique descriptor vectors codebook_randomv.unique_vectors(centers) elapsed_time = (time.time() - start_time) print 'Time to compute codebook = ' + str(elapsed_time) f.write('Time to compute codebook = ' + str(elapsed_time) + '\n') ################################################################# # # Obtain Histogram # ################################################################# print 'Obtaining histograms...' #print 'projection shape = '+ str(projections.shape) #print 'size = ' + str(size) #print 'n of images = ' + str(n_images) #print 'number of kp' + str(number_of_kp) hist = histogram.computeHist(projections, size, n_images, number_of_kp) #print hist print 'Histograms obtained' ################################################################ # # Feature selection # ################################################################# print 'Number of visual words = ' + str(len(hist[0])) if fselec != "NONE": print 'Applying feature selection to descriptors...' filter_max = filterMax.WordFilterMax(fselec_perc[0]) filter_min = filterMin.WordFilterMin(fselec_perc[1]) filter_maxmin = filterMaxMin.WordFilterMaxMin( fselec_perc[0], fselec_perc[1]) names_filter = np.array(["FMAX", "FMIN", "FMAXMIN"]) filter_method = np.array([filter_max, filter_min, filter_maxmin]) #Get the detector passed in the -f argument index = np.where(names_filter == fselec)[0] if index.size > 0: filter_to_use = filter_method[index[0]] else: print 'Wrong codebook construction algorithm name passed in the -f argument. 
        hist = filter_to_use.applyFilter(hist, size, n_images)

        #FOR RESULTS FILE
        filter_to_use.writeFile(f)

        new_size = hist.shape[1]
        print 'Visual words filtered'
        print 'Number of visual words filtered = ' + str(size - new_size)
        f.write("Number of visual words filtered = " + str(size - new_size) + '\n')
        print 'Final number of visual words = ' + str(new_size)
        f.write('Final number of visual words = ' + str(new_size) + '\n')
    else:
        #FOR RESULTS FILE
        filter_min = filterMin.WordFilterMin(0)
        hist = filter_min.applyFilter(hist, size, n_images)
        new_size = hist.shape[1]
        print 'Number of visual words filtered = ' + str(size - new_size)
        f.write("No feature selection applied \n")

    #################################################################
    #
    # Histogram normalization
    #
    #################################################################
    if histnorm != "NONE":
        #Get normalization classes (norm_tfidf3 and norm_power are instantiated but not selectable below)
        norm_sbin = simpleBinarization.SimpleBi()
        norm_tfnorm = tfnorm.Tfnorm()
        norm_tfidf = tfidf.TfIdf()
        norm_tfidf2 = tfidf2.TfIdf2()
        norm_tfidf3 = tfidf3.Tfidf3()
        norm_power = powerNorm.PowerNorm()
        norm_tfidfnorm = tfidfnorm.TfIdfnorm()
        norm_okapi = okapi.Okapi(average_words)
        names_normalization = np.array(["SBIN", "TFNORM", "TFIDF", "TFIDF2", "TFIDFNORM", "OKAPI"])
        normalization_method = np.array([norm_sbin, norm_tfnorm, norm_tfidf, norm_tfidf2, norm_tfidfnorm, norm_okapi])

        #Get the normalization method passed in the -h argument
        index = np.where(names_normalization == histnorm)[0]
        if index.size > 0:
            normalization_to_use = normalization_method[index[0]]
            new_hist = normalization_to_use.normalizeHist(hist, new_size, n_images)
        else:
            print 'Wrong normalization name passed in the -h argument. Options: SBIN, TFNORM, TFIDF, TFIDF2, TFIDFNORM and OKAPI'
            sys.exit()

        #FOR RESULTS FILE
        normalization_to_use.writeFile(f)
    else:
        #FOR RESULTS FILE
        f.write("No histogram normalization applied\n")
        new_hist = hist

    #################################################################
    #
    # Clustering of the features
    #
    #################################################################
    #save current time
    start_time = time.time()

    #Get clustering classes
    clust_dbscan = Dbscan.Dbscan(dist)
    clust_kmeans = KMeans1.KMeans1([nclusters])
    clust_kmeans2 = kmeans2.KMeans2([nclusters])
    clust_birch = Birch.Birch(nclusters)
    clust_meanSift = meanSift.MeanSift(nclusters)
    clust_hierar1 = hierarchicalClustering.Hierarchical(nclusters, dist)
    clust_hierar2 = hierarchicalClustScipy.HierarchicalScipy(dist)
    clust_community = communityDetection.CommunityDetection(dist)
    names_clustering = np.array(["DBSCAN", "KMEANS", "BIRCH", "MEANSIFT", "HIERAR1", "HIERAR2", "COMM"])
    clustering_algorithm = np.array([clust_dbscan, clust_kmeans, clust_birch, clust_meanSift, clust_hierar1, clust_hierar2, clust_community])

    #Get the clustering algorithm passed in the -a argument
    index = np.where(names_clustering == clust)[0]
    if index.size > 0:
        clustering_to_use = clustering_algorithm[index[0]]
    else:
        print 'Wrong clustering algorithm name passed in the -a argument. Options: DBSCAN, KMEANS, BIRCH, MEANSIFT, HIERAR1, HIERAR2, COMM'
        sys.exit()
    clusters = clustering_to_use.obtainClusters(new_hist)

    #FOR RESULTS FILE
    clustering_to_use.writeFileCluster(f)

    elapsed_time = (time.time() - start_time)
    print 'Time to run clustering algorithm = ' + str(elapsed_time)
    f.write('Time to run clustering algorithm = ' + str(elapsed_time) + '\n')

    #ADDED
    nclusters = int(max(clusters) + 1)

    print 'Number of clusters obtained = ' + str(max(clusters) + 1)
    f.write('Number of clusters obtained = ' + str(max(clusters) + 1) + '\n')
    print 'Clusters obtained = ' + str(np.asarray(clusters))
    #date_time = datetime.datetime.now().strftime('%b-%d-%I%M%p-%G')
    #np.savetxt('saveClusters_' + date_time + '_.txt', clusters, '%i', ',')

    #ADDED
    #################################################################
    #
    # Create folder with central images for each cluster
    #
    #################################################################
    ##obtain representative images for each cluster
    #central_ims = clust_community.obtainCenteralImages(new_hist, clusters)
    #central_folder = os.path.join(dir_results, 'CenterImages')
    #if not os.path.exists(central_folder):
    #    os.makedirs(central_folder)
    #count = 0
    #for central_im in central_ims:
    #    filename = os.path.join(central_folder, 'Cluster_' + str(count) + '.jpg')
    #    img = cv2.imread(imPaths[central_im], 1)
    #    cv2.imwrite(filename, img)
    #    count = count + 1

    ##ADDED
    ##################################################################
    ##
    ## Separate clusters into folders
    ##
    ##################################################################
    #clusters_folder = os.path.join(dir_results, 'Clusters')
    #if not os.path.exists(clusters_folder):
    #    os.makedirs(clusters_folder)
    #clust_dir = []
    #for iclust in range(0, nclusters):
    #    direc = os.path.join(clusters_folder, 'Cluster_' + str(iclust))
    #    if not os.path.exists(direc):
    #        os.makedirs(direc)
    #    clust_dir.append(direc)
    #for im in range(0, len(imPaths)):
    #    im_name = imPaths[im].split('/')[-1]
    #    #print clust_dir[int(clusters[im])]
    #    filename = os.path.join(clust_dir[int(clusters[im])], im_name)
    #    #print filename
    #    img = cv2.imread(imPaths[im], 1)
    #    cv2.imwrite(filename, img)

    ##calculate distances between images and closest images
    #closest_im = distances.calculateClosest(new_hist, dist)
    ##print closest_im
    #if not os.path.exists('ClosestImages'):
    #    os.makedirs('ClosestImages')
    #file_name = os.path.join('ClosestImages', name_results_file)
    #f2 = open(file_name + ".txt", 'w')
    #counter = 0
    #counter2 = 1
    #for ims in closest_im:
    #    for im in ims:
    #        f2.write(str(counter2) + '-' + str(counter) + '-' + str(im) + '\n')
    #        counter2 = counter2 + 1
    #    counter = counter + 1
    #f2.close()

    #################################################################
    #
    # Evaluation
    #
    #################################################################
    users = 0
    #labels = np.load('IndividualClustersMatrix.npy')
    if users == 1:
        rand_index = evaluationUsers.randIndex(clusters)
        rand_indexes.append(rand_index)
        print 'rand_index = ' + str(rand_index)
        f.write("Rand Index = " + str(rand_index) + "\n")
    else:
        if len(clusters) == len(labels):
            f.write("\nResults\n")
            f.write('Clusters Obtained = ' + str(np.asarray(clusters)))
            f.write('Labels = ' + str(np.asarray(labels)))
            rand_index = metrics.adjusted_rand_score(labels, clusters)
            rand_indexes.append(rand_index)
            print 'rand_index = ' + str(rand_index)
            f.write("Rand Index = " + str(rand_index) + "\n")
            NMI_index = metrics.normalized_mutual_info_score(labels, clusters)
            nmi_indexes.append(NMI_index)
            print 'NMI_index = ' + str(NMI_index)
            f.write("NMI Index = " + str(NMI_index) + "\n")
if rep > 1:
    f.write("\nFINAL RESULTS\n")
    f.write("Avg Rand Index = " + str(float(sum(rand_indexes)) / rep) + "\n")
    f.write("Std Rand Index = " + str(statistics.stdev(rand_indexes)) + "\n")
    if users != 1:
        f.write("Avg NMI Index = " + str(float(sum(nmi_indexes)) / rep) + "\n")
        f.write("Std NMI Index = " + str(statistics.stdev(nmi_indexes)) + "\n")

f.close()
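#------------------------------------------------------------------
# Illustrative sketch only (not part of the pipeline above): how the
# per-image visual-word histograms and the tf-idf weighting step
# could look with plain numpy. The pipeline itself relies on the
# project modules histogram.computeHist and tfidf.TfIdf, whose
# internals are not shown here, so the exact behaviour may differ;
# compute_bovw_histograms and tfidf_weight below are hypothetical
# helper names used for the sketch.
#------------------------------------------------------------------
import numpy as np


def compute_bovw_histograms(projections, size, number_of_kp):
    """Build one visual-word histogram per image.

    projections  : 1-D int array with the codebook index of every keypoint,
                   images concatenated in order
    size         : number of visual words in the codebook
    number_of_kp : list with the number of keypoints of each image
    """
    hist = np.zeros((len(number_of_kp), size))
    start = 0
    for i, n_kp in enumerate(number_of_kp):
        words = projections[start:start + n_kp]
        hist[i] = np.bincount(words, minlength=size)  # count occurrences of each visual word
        start += n_kp
    return hist


def tfidf_weight(hist):
    """Apply a standard tf-idf weighting to a histogram matrix (images x words)."""
    n_images = hist.shape[0]
    tf = hist / np.maximum(hist.sum(axis=1, keepdims=True), 1.0)  # term frequency per image
    df = np.count_nonzero(hist, axis=0)                           # document frequency per word
    idf = np.log(float(n_images) / np.maximum(df, 1))             # inverse document frequency
    return tf * idf


# tiny usage example with made-up data: 2 images, 4 visual words
if __name__ == '__main__':
    fake_projections = np.array([0, 0, 1, 3, 3, 3, 2])
    fake_number_of_kp = [3, 4]
    h = compute_bovw_histograms(fake_projections, 4, fake_number_of_kp)
    print(tfidf_weight(h))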
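#------------------------------------------------------------------
# Illustrative sketch only: the evaluation step above compares the
# obtained clusters with the class labels derived from the file names
# using sklearn's adjusted Rand index and normalized mutual
# information. The label/cluster values below are made up just to
# show the call signatures.
#------------------------------------------------------------------
from sklearn import metrics

example_labels = [0, 0, 1, 1, 2, 2]    # ground-truth class per image (hypothetical)
example_clusters = [1, 1, 0, 0, 2, 2]  # cluster assigned to each image (hypothetical)

print(metrics.adjusted_rand_score(example_labels, example_clusters))           # 1.0: same partition up to label renaming
print(metrics.normalized_mutual_info_score(example_labels, example_clusters))  # 1.0 as well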
new_size = hist.shape[1]
print 'Number of visual words filtered = ' + str(size - new_size)
f.write("No feature selection applied \n")

#################################################################
#
# Histogram normalization
#
#################################################################
if histnorm != "NONE":
    #Get normalization classes
    norm_sbin = simpleBinarization.SimpleBi()
    norm_tfnorm = tfnorm.Tfnorm()
    norm_tfidf = tfidf.TfIdf()
    norm_tfidf2 = tfidf2.TfIdf2()
    norm_tfidfnorm = tfidfnorm.TfIdfnorm()
    norm_okapi = okapi.Okapi(average_words)
    names_normalization = np.array(["SBIN", "TFNORM", "TFIDF", "TFIDF2", "TFIDFNORM", "OKAPI"])
    normalization_method = np.array([norm_sbin, norm_tfnorm, norm_tfidf, norm_tfidf2, norm_tfidfnorm, norm_okapi])

    #Get the normalization method passed in the -h argument
    index = np.where(names_normalization == histnorm)[0]
    if index.size > 0:
        normalization_to_use = normalization_method[index[0]]