Пример #1
0
def testRun():

    lukija = WordReader(["../Material/Grimm's Fairy Tales.txt"])

    lukija.readWords()


    punamusta = RedBlack(lukija)
    trie = Trie(lukija)

    print "Adding words from selected material..."
    intti = 0; setti = 0
    for word in lukija.words:
        trie.add(word[0], word[1:])
        punamusta.add(word[0], word[1:])
        intti = intti + 1
        if intti > lukija.wordcount / 100.0:
            setti = setti + 1
            print setti, '% of words added'
            intti = 0

    print "Searching for words in Grimm's Fairy tales and The Adventures of Tom Sawyer"

    word = raw_input( "Find a word (or its beginning) in the text: " ).rstrip( '\n' )

    positions, count, linecount = trie.find(word)
    print "Found", count, "instances (", linecount, "lines) @", positions
    RBpositions, RBcount, RBlinecount = punamusta.find(word)
    print "Found", RBcount, "instances (", RBlinecount, "lines) @", RBpositions
Пример #2
0
 def setUp(self):
     """ Create the WordReader fixture over two books with explicit options """
     grimmPath = "../../Material/Grimm's Fairy Tales.txt"
     sawyerPath = "../../Material/The Adventures of Tom Sawyer by Mark Twain.txt"
     readerOptions = {
         'specialCharacters': ["-", "'"],
         'acceptNumerals': True,
         'acceptUpperCase': True,
         'acceptLowerCase': False,
     }
     self.lukija = WordReader([grimmPath], **readerOptions)
     # The second book goes in through addFileName so that method is exercised too
     self.lukija.addFileName(sawyerPath)
Пример #3
0
class PySearcherTestCases(unittest.TestCase):
    """ Unit tests for Searcher running on a Trie that is fed by a WordReader """

    def setUp(self):
        """ Create a fresh reader, trie and searcher for every test """
        reader = WordReader()
        finder = Trie(reader)
        self.reader = reader
        self.finder = finder
        self.searcher = Searcher(finder, '')

    def tearDown(self):
        """ Drop the references to the fixture objects """
        self.reader = self.finder = self.searcher = None

    def testRandomWord(self):
        """ Two random words must each be longer than one character and differ """
        first = self.searcher.randomWord()
        second = self.searcher.randomWord()
        for candidate in (first, second):
            self.assertTrue(len(candidate) > 1, 'Word length too short')
        self.assertNotEqual(first, second, 'Found the same word')

    def testRandomWords(self):
        """ Asking for five random words must give five distinct words """
        picked = self.searcher.randomWord(5)
        uniqueCount = len(set(picked))
        self.assertTrue(uniqueCount == 5, 'Did not find 5 unique words')

    def testBinaryOperationsAreWorking(self):
        """
        Loads the material file into the trie, then checks that the searches
        listed in `operations` give six distinct hit counts and that every
        search term in `binaryOperationsSearch` gives its recorded hit count.
        """
        self.reader.addFileName(MaterialFilePath, readNow=True)
        self.finder.addFromReader()

        hitCounts = [self.searcher.search(operations[name], returnCount=True)
                     for name in operations]
        # Six distinct counts, i.e. no two operations produced the same result
        self.assertTrue(len(set(hitCounts)) == 6,
                        'Searcher failed binary operation check')

        for term in binaryOperationsSearch:
            actualHits = self.searcher.search(term, returnCount=True)
            expectedHits = binaryOperationsSearch[term]
            self.assertEqual(actualHits, expectedHits,
                             'Searcher found wrong number of hits on some search')
Пример #4
0
 def testWordCountAndClear(self):
     """ Check the word count of the test file and that clear() resets the reader """
     self.lukija = WordReader(['../../Material/50words_in_UTF-8.txt'])
     reader = self.lukija
     reader.readWords()
     self.assertEqual(reader.wordcount, wordsInTestFile,
                      'Did not get the correct number of words')

     # Once cleared, the word list must be empty and every counter zero
     reader.clear()
     stateAfterClear = (reader.words, reader.wordcount,
                        reader.filecount, reader.linecount)
     self.assertEqual(stateAfterClear, ([], 0, 0, 0))
Пример #5
0
class PyWordReaderTestCases(unittest.TestCase):
    """ Unit tests for WordReader: sanitizing, character maps and counters """

    def setUp(self):
        """ Create a reader over two books with explicit character options """
        self.lukija = WordReader(["../../Material/Grimm's Fairy Tales.txt"],
                                 specialCharacters=["-", "'"],
                                 acceptNumerals=True, acceptUpperCase=True,
                                 acceptLowerCase=False)
        # The second book goes in through addFileName so that method is
        # exercised by every test as well
        self.lukija.addFileName("../../Material/The Adventures of Tom Sawyer by Mark Twain.txt")

    def tearDown(self):
        """ Clear the reader with the 'all' argument after every test """
        self.lukija.clear('all')

    def testSanitize(self):
        """ Test whether word sanitizing works """
        # A local list is enough; nothing else reads the sanitized words
        sanitized = [self.lukija.sanitize(word) for word in unsanitizedWords]
        self.assertEqual(sanitized, sanitizedWords, 'Failed to sanitize words')

    def testCreateChrMap(self):
        """ Test whether index and character maps are okay """
        chrMap, idxMap = self.lukija._createChrMap()
        self.assertEqual(chrMap, properChrMap, 'Bad character map')
        self.assertEqual(idxMap, properIdxMap, 'Bad index map')

    def testInd2char(self):
        """ Test function ind2char """
        # Only the running index is needed; the expected character is looked
        # up in properChrMap by that index
        for index, _ in enumerate(properChrMap):
            self.assertEqual(self.lukija.ind2char(index), properChrMap[index],
                             'ind2char function failed to map indices to characters')

    def testChar2ind(self):
        """ Test function char2ind """
        for char in properIdxMap:
            self.assertEqual(self.lukija.char2ind(char), properIdxMap[char],
                             'char2ind function failed to map characters to indices')

    def testGetCharMapSize(self):
        """ Test whether getCharMapSize returns the correct value """
        self.assertEqual(self.lukija.getCharMapSize(), len(properChrMap),
                         'getCharMapSize returned wrong map size')

    def testLineCount(self):
        """ Test whether WordReader counts all lines and all files it reads """
        self.lukija.readWords()
        self.assertEqual(self.lukija.linecount, linesIn2books,
                         'Did not read correct number of lines from file')
        # Distinct message so a file-count failure is not reported as a
        # line-count failure
        self.assertEqual(self.lukija.filecount, noOfFiles,
                         'Did not read correct number of files')

    def testWordCountAndClear(self):
        """ Test if the reader finds the correct number of words """
        self.lukija = WordReader(['../../Material/50words_in_UTF-8.txt'])
        self.lukija.readWords()
        self.assertEqual(self.lukija.wordcount, wordsInTestFile,
                         'Did not get the correct number of words')
        # After clearing, we should not have any words in memory
        self.lukija.clear()
        self.assertEqual((self.lukija.words, self.lukija.wordcount,
                          self.lukija.filecount, self.lukija.linecount),
                         ([], 0, 0, 0))
Пример #6
0
 def setUp(self):
     """ Build an empty reader, a trie on that reader and a searcher on the trie """
     reader = WordReader()
     finder = Trie(reader)
     self.reader = reader
     self.finder = finder
     self.searcher = Searcher(finder, '')
Пример #7
0
        return average(runtimes)
    else:
        print string + '%20.3f ms' %  (sum(runtimes) / repeats)


if __name__ == "__main__":
    # Script entry point: opens the result files, builds both data structures
    # from one book and runs add/find measurements on the trie for sizes
    # 2**2 .. 2**16.
    print "Hello World"

    # One output file per structure and measurement, all opened with mode 'w'
    # through the openFile helper.
    # NOTE(review): only trieAddFile is written to in the part of the script
    # visible here; the others are presumably used further down - confirm.
    trieAddFile = openFile('trieAddToEmpty', 'w')
    punamustaAddFile = openFile('punamustaAddToEmpty', 'w')
    trieFindLengthFile = openFile('trieFindWordLength', 'w')
    punamustaFindLengthFile = openFile('punamustaFindWordLength', 'w')
    trieFindWordCountFile = openFile('trieFindWordCount', 'w')
    # NOTE(review): 'punamustaFindWordCoun' looks like a typo for
    # 'punamustaFindWordCount' (compare the trie file name above) - confirm
    # before renaming, since other tooling may read the current name.
    punamustaFindWordCountFile = openFile('punamustaFindWordCoun', 'w')

    # Read the book once; both structures are constructed with the same reader.
    lukija = WordReader(["../Material/Grimm's Fairy Tales.txt"])
    lukija.readWords()
    punamusta = RedBlack(lukija)
    trie = Trie(lukija)

    # NOTE(review): the file object handed to pickle.load is never closed
    # explicitly, and pickle must only be fed trusted files.
    words = pickle.load( open( "randomWordList", "rb" ) ) # indexed by word length

    repeats = 100;
    runtimes = []
    # i runs from 2 to 16, so the size argument 2**i goes from 4 to 65536.
    for i in range(2,17):
        # The value returned by addWordsToEmptyList is printed as milliseconds
        # and written, tab-separated after the size, to trieAddFile.
        runtime = addWordsToEmptyList(trie, 2**i, repeats, '%25s\t%10d\t' % ('trie:add', 2**i), False)
        print '%25s\t%10d\t\t%14.3f ms' % ('trie:addToEmpty', 2**i, runtime)
        trieAddFile.write('%10d\t%8.3f\n' % (2**i, runtime))
        # words[7] is presumably the list of 7-letter words (see the
        # "indexed by word length" note above) - confirm.
        runtimes.append(findWords(trie, words[7], repeats, printout=False))
    # index + 2 recovers the exponent used in the loop above, so each find
    # result is printed next to the size it was measured at.
    for index, runtime in enumerate(runtimes):
        print '%25s\t%10d\t\t%14.3f ms' % ('trie:findWordCount', 2**(index+2), runtime)