예제 #1
0
    def test_addDocument(self):
        """Check that ``Dictionary.addDocument`` accumulates tokens and
        special characters across successive documents.

        All comparisons are set-based, so ordering and duplicate counts are
        not verified here.
        """
        document = Document()
        document.tokens = ['add', 'words', 'to', 'dictionary']
        # 'add' is deliberately listed twice; the comparison below is against
        # a set. Each entry needs its own comma -- adjacent string literals
        # are otherwise silently concatenated into a single element.
        document.specialCharacters = ['add', 'specialChars', '?!%$', 'add']

        dictionary = Dictionary()
        dictionary.addDocument(document)

        self.assertEqual(dictionary.specialCharacters,
                         set(document.specialCharacters))
        self.assertEqual(set(dictionary.ids.values()), set(document.tokens))

        # The second document overlaps with the first ('words' / 'add') and
        # contributes one new token and one new special character.
        document2 = Document()
        document2.tokens = ['new', 'words']
        document2.specialCharacters = ['add', 'xx9']

        dictionary.addDocument(document2)

        # Reuse the first document's lists as the expected union by appending
        # only the entries that document2 newly introduced.
        document.specialCharacters.append('xx9')
        document.tokens.append('new')

        self.assertEqual(dictionary.specialCharacters,
                         set(document.specialCharacters))
        self.assertEqual(set(dictionary.ids.values()), set(document.tokens))
예제 #2
0
    def test_removeSpecialCharacters(self):
        """Verify which tokens remain after ``removeSpecialCharacters``.

        The result is compared as a set, so token order and duplicate counts
        are not checked.
        """
        doc = Document('', '')
        doc.tokens = [
            'child`s',
            '23.09.1998',
            'test entity',
            'normal',
            '$200 000',
            '809/87',
            'http://asfd.org',
            '809/87',
            'talib@n?',
            '.',
            'end of line.\n',
        ]
        doc.specialCharacters = [
            '23.09.1998',
            '$200 000',
            '809/87',
            'http://asfd.org',
            'talib@n?',
            '.',
        ]

        # '809/87' occurs twice in tokens and once in specialCharacters, yet
        # is still expected afterwards -- presumably only one occurrence is
        # removed; confirm against Document.removeSpecialCharacters.
        expectedTokens = {
            'child`s',
            '809/87',
            'test entity',
            'normal',
            'end of line.\n',
        }

        doc.removeSpecialCharacters()

        remainingTokens = set(doc.tokens)
        self.assertEqual(expectedTokens, remainingTokens)
예제 #3
0
    def test_findSpecialCharacterTokens(self):
        """Verify the tokens collected by ``findSpecialCharacterTokens``.

        The document's ``specialCharacters`` are compared with the expected
        values as sets, so order and duplicates are ignored.
        """
        doc = Document('', '')
        doc.tokens = [
            'child`s',
            '23.09.1998',
            'test entity',
            'normal',
            '$200 000',
            '809/87',
            'http://asfd.org',
            'talib@n?',
            'end of line.\n',
            '.',
        ]
        pattern = r'.*[@./,:$©].*'
        doc.findSpecialCharacterTokens(pattern)

        # The expected values are held on a second Document instance.
        expectedDoc = Document('', '')
        expectedDoc.specialCharacters = [
            '23.09.1998',
            '$200 000',
            '809/87',
            'http://asfd.org',
            'talib@n?',
            'end of line.\n',
            '.',
        ]

        expectedSpecial = set(expectedDoc.specialCharacters)
        foundSpecial = set(doc.specialCharacters)
        self.assertEqual(expectedSpecial, foundSpecial)