Exemplo n.º 1
0
 def testGetWordForLanguage(self):
     """Check that getWordsForLanguage() returns the words of one language.

     'quick' is inserted for both 'en' and 'de' and is expected in both
     result lists; asking for a language that was never added is expected
     to raise LexiconError.
     """
     lexicon = Lexicon()
     for language in ('en', 'de'):
         lexicon.addLanguage(language)
     lexicon.insertWords((u'fox', u'quick'), 'en')
     lexicon.insertWords((u'fuchs', u'quick'), 'de')

     # Fetch both word lists before asserting; sorting makes the
     # comparison independent of the order the lexicon returns them in.
     english = sorted(lexicon.getWordsForLanguage('en'))
     german = sorted(lexicon.getWordsForLanguage('de'))

     self.assertEqual(english, ['fox', 'quick'])
     self.assertEqual(german, ['fuchs', 'quick'])
     self.assertRaises(
         LexiconError, lexicon.getWordsForLanguage, 'unsupported_language')
Exemplo n.º 2
0
 def testGetWordForLanguage(self):
     """Verify per-language word retrieval via getWordsForLanguage().

     Two languages are registered and each receives two words, one of
     which ('quick') is shared. Each language must report exactly its own
     words, and an unregistered language must raise LexiconError.
     """
     words_by_language = (
         ('en', (u'fox', u'quick')),
         ('de', (u'fuchs', u'quick')),
     )

     lexicon = Lexicon()
     # Register every language first, then insert, as two separate passes.
     for language, _ in words_by_language:
         lexicon.addLanguage(language)
     for language, words in words_by_language:
         lexicon.insertWords(words, language)

     # Sort the results so the check does not depend on storage order.
     en_sorted = sorted(lexicon.getWordsForLanguage('en'))
     de_sorted = sorted(lexicon.getWordsForLanguage('de'))

     self.assertEqual(en_sorted, ['fox', 'quick'])
     self.assertEqual(de_sorted, ['fuchs', 'quick'])
     self.assertRaises(
         LexiconError, lexicon.getWordsForLanguage, 'unsupported_language')
Exemplo n.º 3
0
 def setUp(self):
     """Give the test a newly constructed Lexicon as ``self._lexicon``."""
     fresh_lexicon = Lexicon()
     self._lexicon = fresh_lexicon
Exemplo n.º 4
0
class LexiconTests(unittest.TestCase):
    def setUp(self):
        self._lexicon = Lexicon()

    def _populateLexicon(self):
        self._lexicon.addLanguage('en')
        self._lexicon.addLanguage('de')
        self._lexicon.insertWords(text_en.split(' '), 'en')
        self._lexicon.insertWords(text_de.split(' '), 'de')

    def testInterface(self):
        verifyClass(ILexicon, self._lexicon.__class__)

    def testEmpty(self):
        assert len(self._lexicon) == 0

    def testNonUnicode(self):
        self._lexicon.addLanguage('en')
        self.assertRaises(LexiconError, self._lexicon.insertWord, 'foo-bar')
        self.assertRaises(LexiconError, self._lexicon.insertWord, dict())
        self.assertRaises(LexiconError, self._lexicon.insertWord, 2)
        self.assertRaises(LexiconError, self._lexicon.insertWords, [u'a', 'b'])

    def testLanguages(self):
        assert len(self._lexicon.getLanguages()) == 0
        self._lexicon.addLanguage('de')
        self._lexicon.addLanguage('en')
        assert len(self._lexicon.getLanguages()) == 2
        self._lexicon.addLanguage('en')
        assert len(self._lexicon.getLanguages()) == 2
        langs = self._lexicon.getLanguages()
        assert 'en' in langs
        assert not 'fr' in langs
        assert self._lexicon.hasLanguage('en')
        assert self._lexicon.hasLanguage('de')
        assert not self._lexicon.hasLanguage('fr')

    def testInsert(self):
        self._populateLexicon()
        wids = [self._lexicon.getWordId(word) for word in text_en.split(' ')]
        assert wids == self._lexicon.getWordIds(text_en.split(' '))
        wids = [self._lexicon.getWordId(word) for word in text_de.split(' ')]
        assert wids == self._lexicon.getWordIds(text_de.split(' '))

    def testWids(self):
        words = text_de.split(' ')
        self._lexicon.addLanguage('de')

        d = {}
        for word, wid in zip(words, self._lexicon.insertWords(words, 'de')):
            d[word] = wid

        for word, wid in d.items():
            self.assertEqual(self._lexicon.getWordAndLanguage(wid),
                             (word, 'de'))
            self.assertEqual(self._lexicon.getWord(wid), word)

    def testGetWordForLanguage(self):
        L = Lexicon()
        L.addLanguage('en')
        L.addLanguage('de')
        L.insertWords((u'fox', u'quick'), 'en')
        L.insertWords((u'fuchs', u'quick'), 'de')
        en_words = L.getWordsForLanguage('en')
        en_words.sort()
        de_words = L.getWordsForLanguage('de')
        de_words.sort()
        self.assertEqual(en_words, ['fox', 'quick'])
        self.assertEqual(de_words, ['fuchs', 'quick'])
        self.assertRaises(LexiconError, L.getWordsForLanguage,
                          'unsupported_language')

    def testLanguageDependentRetrival(self):
        self._populateLexicon()
        gw = self._lexicon.getWordId
        assert gw('the', 'en') != None
        assert gw('the', 'de') != None
        assert gw('einen', 'en') == None
        assert gw('einen', 'de') != None
        assert gw('the', 'en') != gw('the', 'de')

    def testRightTruncation(self):
        def _check(term, expected, language='en'):
            r1 = list(M(term, language))
            r1.sort()
            r2 = list(expected)
            r2.sort()
            self.assertEqual(r1, r2,
                             'got: %s, expected %s' % (repr(r1), repr(r2)))

        self._populateLexicon()
        M = self._lexicon.getWordsForRightTruncation
        _check(u't', ['the'])
        _check(u'th', ['the'])
        _check(u'the', ['the'])
        _check(u'thee', [])
        _check(u'q', ['quick'])
        _check(u'f', ['fox'])
        _check(u'fo', ['fox'])

    def testLeftTruncation(self):
        def _check(term, expected, language='en'):
            r1 = list(M(term, language))
            r1.sort()
            r2 = list(expected)
            r2.sort()
            self.assertEqual(r1, r2,
                             'got: %s, expected %s' % (repr(r1), repr(r2)))

        self._populateLexicon()
        M = self._lexicon.getWordsForLeftTruncation
        _check(u'the', ['the'])
        _check(u'he', ['the'])
        _check(u'e', ['the'])
        _check(u'thee', [])
        _check(u'ick', ['quick'])
        _check(u'x', ['fox'])
        _check(u'ox', ['fox'])

    def testPatternMatching(self):
        def _check(term, expected, language='en'):
            r1 = list(M(term, language))
            r1.sort()
            r2 = list(expected)
            r2.sort()
            self.assertEqual(r1, r2,
                             'got: %s, expected %s' % (repr(r1), repr(r2)))

        self._populateLexicon()
        M = self._lexicon.getWordsForPattern
        _check(u't*', ['the'])
        _check(u'th*', ['the'])
        _check(u'the*', ['the'])
        _check(u'thee*', [])
        _check(u'q*', ['quick'])
        _check(u'f*', ['fox'])
        _check(u'fo*', ['fox'])
        _check(u'f?x', ['fox'])
        _check(u'f*x', ['fox'])
        _check(u'f*x', [], 'de')
        _check(u'An*', ['Angriffen'], 'de')
        _check(u'An*n', ['Angriffen'], 'de')
        _check(u'Ang*n', ['Angriffen'], 'de')
        _check(u'Ang????en', ['Angriffen'], 'de')

        self.assertRaises(LexiconError, M, '*')

    def testSimilaritySearch(self):
        def _check(term, expected, language='en'):
            r1 = list(
                [term for term, ratio in M(term, 0.5, language=language)])
            r1.sort()
            r2 = list(expected)
            r2.sort()
            self.assertEqual(r1, r2,
                             'got: %s, expected %s' % (repr(r1), repr(r2)))

        self._populateLexicon()
        M = self._lexicon.getSimiliarWords
        _check(u'the', ['the'])
        _check(u'fox', ['fox'])
        _check(u'f*x', ['fox'])
        _check(u'Angriffe', ['Angriffen'], 'de')
        _check(u'Angriffen', ['Angriffen'], 'de')
        _check(u'Angrüff', ['Angriffen'], 'de')
        _check(u'Angrüven', ['Angriffen'], 'de')
        _check(u'Angrüvvven', ['Angriffen'], 'de')
        _check(u'Ankreivven', ['Angriffen', 'einen', 'seien'], 'de')
Exemplo n.º 5
0
 def setUp(self):
     """Build the Lexicon under test and store it on ``self._lexicon``."""
     lexicon_under_test = Lexicon()
     self._lexicon = lexicon_under_test
Exemplo n.º 6
0
class LexiconTests(unittest.TestCase):

    def setUp(self):
        self._lexicon = Lexicon()

    def _populateLexicon(self):
        self._lexicon.addLanguage('en')
        self._lexicon.addLanguage('de')
        self._lexicon.insertWords(text_en.split(' '), 'en')
        self._lexicon.insertWords(text_de.split(' '), 'de')

    def testInterface(self):
        verifyClass(ILexicon, self._lexicon.__class__)

    def testEmpty(self):
        assert len(self._lexicon) == 0

    def testNonUnicode(self):
        self._lexicon.addLanguage('en')
        self.assertRaises(LexiconError, self._lexicon.insertWord, 'foo-bar')
        self.assertRaises(LexiconError, self._lexicon.insertWord, dict())
        self.assertRaises(LexiconError, self._lexicon.insertWord, 2)
        self.assertRaises(LexiconError, self._lexicon.insertWords, [u'a', 'b'])

    def testLanguages(self):
        assert len(self._lexicon.getLanguages()) == 0
        self._lexicon.addLanguage('de')
        self._lexicon.addLanguage('en')
        assert len(self._lexicon.getLanguages()) == 2
        self._lexicon.addLanguage('en')
        assert len(self._lexicon.getLanguages()) == 2
        langs = self._lexicon.getLanguages()
        assert 'en' in langs
        assert not 'fr' in langs
        assert self._lexicon.hasLanguage('en')
        assert self._lexicon.hasLanguage('de')
        assert not self._lexicon.hasLanguage('fr')

    def testInsert(self):
        self._populateLexicon()
        wids = [self._lexicon.getWordId(word) for word in text_en.split(' ')]
        assert wids == self._lexicon.getWordIds(text_en.split(' '))
        wids = [self._lexicon.getWordId(word) for word in text_de.split(' ')]
        assert wids == self._lexicon.getWordIds(text_de.split(' '))

    def testWids(self):
        words = text_de.split(' ')
        self._lexicon.addLanguage('de')

        d = {}
        for word, wid in zip(words, self._lexicon.insertWords(words, 'de')):
            d[word] = wid

        for word, wid in d.items():
            self.assertEqual(self._lexicon.getWordAndLanguage(wid), (word, 'de'))
            self.assertEqual(self._lexicon.getWord(wid), word)

    def testGetWordForLanguage(self):
        L = Lexicon()
        L.addLanguage('en')
        L.addLanguage('de')
        L.insertWords((u'fox', u'quick'), 'en')
        L.insertWords((u'fuchs', u'quick'), 'de')
        en_words = L.getWordsForLanguage('en')
        en_words.sort()
        de_words = L.getWordsForLanguage('de')
        de_words.sort()
        self.assertEqual(en_words, ['fox', 'quick'])
        self.assertEqual(de_words, ['fuchs', 'quick'])
        self.assertRaises(LexiconError, L.getWordsForLanguage, 'unsupported_language')

    def testLanguageDependentRetrival(self):
        self._populateLexicon()
        gw = self._lexicon.getWordId
        assert gw('the', 'en') != None
        assert gw('the', 'de') != None
        assert gw('einen', 'en') == None
        assert gw('einen', 'de') != None
        assert gw('the', 'en') != gw('the', 'de') 

    def testRightTruncation(self):

        def _check(term, expected, language='en'):
            r1 = list(M(term, language))
            r1.sort()
            r2 = list(expected)
            r2.sort()
            self.assertEqual(r1, r2, 'got: %s, expected %s' % (repr(r1), repr(r2)))

        self._populateLexicon()
        M = self._lexicon.getWordsForRightTruncation
        _check(u't', ['the'])     
        _check(u'th', ['the'])     
        _check(u'the', ['the'])     
        _check(u'thee', [])     
        _check(u'q',  ['quick'])     
        _check(u'f', ['fox'])     
        _check(u'fo', ['fox'])     

    def testLeftTruncation(self):

        def _check(term, expected, language='en'):
            r1 = list(M(term, language))
            r1.sort()
            r2 = list(expected)
            r2.sort()
            self.assertEqual(r1, r2, 'got: %s, expected %s' % (repr(r1), repr(r2)))

        self._populateLexicon()
        M = self._lexicon.getWordsForLeftTruncation
        _check(u'the', ['the'])     
        _check(u'he', ['the'])     
        _check(u'e', ['the'])     
        _check(u'thee', [])     
        _check(u'ick',  ['quick'])     
        _check(u'x', ['fox'])     
        _check(u'ox', ['fox'])     


    def testPatternMatching(self):

        def _check(term, expected, language='en'):
            r1 = list(M(term, language))
            r1.sort()
            r2 = list(expected)
            r2.sort()
            self.assertEqual(r1, r2, 'got: %s, expected %s' % (repr(r1), repr(r2)))

        self._populateLexicon()
        M = self._lexicon.getWordsForPattern
        _check(u't*', ['the'])     
        _check(u'th*', ['the'])     
        _check(u'the*', ['the'])     
        _check(u'thee*', [])     
        _check(u'q*',  ['quick'])     
        _check(u'f*', ['fox'])     
        _check(u'fo*', ['fox'])     
        _check(u'f?x', ['fox'])     
        _check(u'f*x', ['fox'])     
        _check(u'f*x', [], 'de')     
        _check(u'An*', ['Angriffen'], 'de')     
        _check(u'An*n', ['Angriffen'], 'de')     
        _check(u'Ang*n', ['Angriffen'], 'de')     
        _check(u'Ang????en', ['Angriffen'], 'de')     

        self.assertRaises(LexiconError, M, '*')

    def testSimilaritySearch(self):

        def _check(term, expected, language='en'):
            r1 = list([term for term,ratio in M(term, 0.5, language=language)])
            r1.sort()
            r2 = list(expected)
            r2.sort()
            self.assertEqual(r1, r2, 'got: %s, expected %s' % (repr(r1), repr(r2)))

        self._populateLexicon()
        M = self._lexicon.getSimiliarWords
        _check(u'the', ['the'])     
        _check(u'fox', ['fox'])     
        _check(u'f*x', ['fox'])     
        _check(u'Angriffe', ['Angriffen'], 'de')     
        _check(u'Angriffen', ['Angriffen'], 'de')     
        _check(u'Angrüff', ['Angriffen'], 'de')     
        _check(u'Angrüven', ['Angriffen'], 'de')     
        _check(u'Angrüvvven', ['Angriffen'], 'de')     
        _check(u'Ankreivven', ['Angriffen', 'einen', 'seien'], 'de')