def test_addDocument(self):
    """Check that ``Dictionary.addDocument`` accumulates document contents.

    Two documents are added in turn. After each call the test asserts that
    ``dictionary.specialCharacters`` equals the set of all special
    characters added so far, and that the values of ``dictionary.ids``
    equal the set of all tokens added so far.
    """
    first = Document()
    first.tokens = ['add', 'words', 'to', 'dictionary']
    # 'add' is listed twice so the fixture contains a duplicate entry.
    # The elements must be comma-separated: adjacent string literals
    # ('add' 'specialChars') are concatenated by Python into one string.
    first.specialCharacters = ['add', 'specialChars', '?!%$', 'add']

    dictionary = Dictionary()
    dictionary.addDocument(first)

    self.assertEqual(dictionary.specialCharacters,
                     set(first.specialCharacters))
    self.assertEqual(set(dictionary.ids.values()), set(first.tokens))

    # The second document overlaps the first ('words', 'add') and brings
    # one new token and one new special character.
    second = Document()
    second.tokens = ['new', 'words']
    second.specialCharacters = ['add', 'xx9']
    dictionary.addDocument(second)

    # Build the expectations as unions rather than mutating ``first``
    # after it has already been handed to the dictionary.
    expected_special = (set(first.specialCharacters)
                        | set(second.specialCharacters))
    expected_tokens = set(first.tokens) | set(second.tokens)

    self.assertEqual(dictionary.specialCharacters, expected_special)
    self.assertEqual(set(dictionary.ids.values()), expected_tokens)
def test_removeSpecialCharacters(self):
    """Check the tokens left after ``Document.removeSpecialCharacters``.

    The document is given a token list and a list of special-character
    tokens; after the call, the set of remaining tokens must equal the
    expected set below.
    """
    doc = Document('', '')
    doc.tokens = [
        'child`s',
        '23.09.1998',
        'test entity',
        'normal',
        '$200 000',
        '809/87',
        'http://asfd.org',
        '809/87',
        'talib@n?',
        '.',
        'end of line.\n',
    ]
    doc.specialCharacters = [
        '23.09.1998',
        '$200 000',
        '809/87',
        'http://asfd.org',
        'talib@n?',
        '.',
    ]

    # '809/87' occurs twice in tokens but once in specialCharacters, and is
    # still expected afterwards.
    # NOTE(review): presumably only one occurrence is removed per listed
    # special character -- confirm against Document.removeSpecialCharacters.
    expected = {
        'child`s',
        '809/87',
        'test entity',
        'normal',
        'end of line.\n',
    }

    doc.removeSpecialCharacters()

    self.assertEqual(expected, set(doc.tokens))
def test_findSpecialCharacterTokens(self):
    """Check which tokens ``Document.findSpecialCharacterTokens`` collects.

    The method is called with a regular expression; afterwards the set of
    the document's ``specialCharacters`` must equal the expected set held
    by a second, reference document.
    """
    doc = Document('', '')
    doc.tokens = [
        'child`s',
        '23.09.1998',
        'test entity',
        'normal',
        '$200 000',
        '809/87',
        'http://asfd.org',
        'talib@n?',
        'end of line.\n',
        '.',
    ]

    # Pattern handed to the method under test: any text containing one of
    # the characters @ . / , : $ or the copyright sign.
    pattern = r'.*[@./,:$©].*'
    doc.findSpecialCharacterTokens(pattern)

    # Reference document that carries only the expected result.
    reference = Document('', '')
    reference.specialCharacters = [
        '23.09.1998',
        '$200 000',
        '809/87',
        'http://asfd.org',
        'talib@n?',
        'end of line.\n',
        '.',
    ]

    expected = set(reference.specialCharacters)
    found = set(doc.specialCharacters)
    self.assertEqual(expected, found)