Example #1
    def test_createEntities(self):
        testDictionary = Dictionary()
        collection = [
            Document(
                'doc1',
                'Test named entity recognition of a Collection of documents.'),
            Document(
                'doc2',
                ' African Commission is a named entity, also countries like Senegal and Lybia and names like Peter and Anna.'
            ),
            Document(
                'doc3',
                'Also organizations like the United Nations or UNICEF should be recognized.'
            )
        ]
        testEntities = Entities('')
        testEntities.addEntities(
            'ORGANIZATION',
            set([(u'african commission', 1), (u'unicef', 1),
                 (u'united nations', 1)]))
        testEntities.addEntities('PERSON', set([(u'anna', 1), (u'peter', 1)]))
        testEntities.addEntities('LOCATION',
                                 set([(u'senegal', 1), (u'lybia', 1)]))
        testDictionary.createEntities(collection)
        self.assertEqual(testEntities.__dict__,
                         testDictionary.entities.__dict__)
Example #2
    def setUp(self):
        self.targetDocument = Document('', '')
        self.testDocument = Document(
            'Test Doc',
            'Test of tokenization\n dates like 12.03.1998, 103/78 and World Health Organisation should be kept together. Words appear more more often!?'
        )
        self.stoplist = ['and', 'of']
        self.specialChars = r'.*[\.,/?!].*'
Example #3
    def test_createTokens(self):
        testDocument = Document(
            'Test Doc',
            'Test of tokenization\n dates like 12.03.1998, 103/78 and Words should be lowered and appear more more often.?'
        )
        testDocument.createTokens()
        self.targetDocument.tokens = [
            'test', 'of', 'tokenization', 'dates', 'like', '12.03.1998', ',',
            '103/78', 'and', 'words', 'should', 'be', 'lowered', 'and',
            'appear', 'more', 'more', 'often', '.', '?'
        ]
        self.assertEqual(testDocument.tokens, self.targetDocument.tokens)
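The expected tokens (lowercased words, '12.03.1998' and '103/78' kept whole, ',', '.' and '?' split off) are consistent with NLTK's default tokenizer. A minimal sketch of `createTokens` under that assumption, with `self.text` as a hypothetical attribute holding the document body:

import nltk

def createTokens(self):
    # Sketch: nltk.word_tokenize keeps dates ('12.03.1998') and
    # slash-separated numbers ('103/78') as single tokens while splitting
    # punctuation; lowercasing folds 'Words' to 'words'.
    self.tokens = [token.lower() for token in nltk.word_tokenize(self.text)]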
Example #4
    def test_lemmatizeTokens(self):
        testDocument = Document('', '')
        testDocument.tokens = set([
            'children', 'forced', 'trafficking', 'prisons', 'arrested',
            'United Nations', '12.03.1992', 'are', 'violations',
            'bags of words'
        ])
        testDocument.lemmatizeTokens()

        self.targetDocument.tokens = [
            'child', 'United Nations', '12.03.1992', 'be', 'violation',
            'force', 'arrest', 'traffic', 'prison', 'bags of words'
        ]

        self.assertEqual(set(testDocument.tokens),
                         set(self.targetDocument.tokens))
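The expected lemmas mix verb forms ('forced' → 'force', 'are' → 'be') with noun plurals ('children' → 'child'), while tokens WordNet does not know ('United Nations', '12.03.1992', 'bags of words') pass through unchanged. That is consistent with chaining NLTK's WordNet lemmatizer over both parts of speech; a sketch under that assumption:

from nltk.stem import WordNetLemmatizer

def lemmatizeTokens(self):
    lemmatizer = WordNetLemmatizer()
    # Sketch: take the verb lemma first ('arrested' -> 'arrest'), then the
    # noun lemma ('violations' -> 'violation'); tokens WordNet cannot look
    # up are returned as-is by WordNetLemmatizer.
    self.tokens = [
        lemmatizer.lemmatize(lemmatizer.lemmatize(token, 'v'), 'n')
        for token in self.tokens
    ]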
Example #5
    def test_prepareDocument(self):
        targetDocument = Document('', '')
        testDocument = Document(
            'Test Doc',
            'Test of tokenization\n remove to short words and spe?cial char/s, words not in whitelist or in stoplist'
        )
        stoplist = ['and', 'of', 'stoplist']
        specialChars = r'.*[?/].*'
        whiteList = [
            'test', 'of', 'tokenization', 'remove', 'to', 'short', 'word',
            'spec?cial', 'char/s', 'in', 'stoplist'
        ]
        testDocument.prepareDocument(lemmatize=True,
                                     includeEntities=False,
                                     stopwords=stoplist,
                                     specialChars=specialChars,
                                     removeShortTokens=True,
                                     threshold=2,
                                     whiteList=whiteList)

        targetDocument.tokens = [
            'test',
            'tokenization',
            'remove',
            'short',
            'word',
            'word',
        ]
        self.assertEqual(testDocument.tokens, targetDocument.tokens)
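Taken together, the fixtures imply a pipeline: tokenize, lemmatize, drop special-character tokens, then filter by stoplist, length threshold, and whitelist. A sketch of one plausible ordering; only the final token list is pinned down by the test, so the parameter handling here is an assumption:

def prepareDocument(self, lemmatize=True, includeEntities=False,
                    stopwords=(), specialChars=r'', removeShortTokens=True,
                    threshold=2, whiteList=()):
    # Sketch; includeEntities is ignored here because this test passes False.
    self.createTokens()
    if lemmatize:
        self.lemmatizeTokens()
    self.findSpecialCharacterTokens(specialChars)
    self.removeSpecialCharacters()
    # Keep a token only if it clears the stoplist, the length threshold,
    # and the whitelist ('to', 'in', 'or' fail the threshold; 'and', 'of',
    # 'stoplist' are stopwords; 'not' and 'whitelist' are not whitelisted).
    self.tokens = [
        token for token in self.tokens
        if token not in stopwords
        and (not removeShortTokens or len(token) > threshold)
        and token in whiteList
    ]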
Example #6
    def test_correctTokenOccurance(self):
        testDocument = Document(
            'Test Document',
            'In the world many organizations like the World Health Organization or the Union of the World exist'
        )
        testDocument.tokens = [
            'world', 'many', 'organizations', 'like', 'world', 'health',
            'organization', 'union', 'world', 'exist',
            'world health organization', 'union of the world'
        ]
        entity = ('union of the world', 1)

        targetTokens = [
            'many', 'organizations', 'like', 'world', 'health', 'organization',
            'world', 'exist', 'world health organization', 'union of the world'
        ]

        testDocument.correctTokenOccurance(entity[0])
        self.assertEqual(targetTokens, testDocument.tokens)
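The fixture pins the behavior down: one occurrence of each constituent word of the multiword entity is removed ('union' and the first 'world' disappear; 'of' and 'the' are not present as standalone tokens). A sketch that reproduces the target list exactly:

def correctTokenOccurance(self, entity):
    # Remove one occurrence of every unigram the multiword entity is
    # built from, so the entity token does not double-count its words.
    for word in entity.split():
        if word in self.tokens:
            self.tokens.remove(word)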
Example #7
    def test_findSpecialCharacterTokens(self):
        testDocument = Document('', '')
        testDocument.tokens = [
            'child`s', '23.09.1998', 'test entity', 'normal', '$200 000',
            '809/87', 'http://asfd.org', 'talib@n?', 'end of line.\n', '.'
        ]
        specialChars = r'.*[@./,:$©].*'
        testDocument.findSpecialCharacterTokens(specialChars)

        targetDocument = Document('', '')
        targetDocument.specialCharacters = [
            '23.09.1998', '$200 000', '809/87', 'http://asfd.org', 'talib@n?',
            'end of line.\n', '.'
        ]
        self.assertEqual(set(targetDocument.specialCharacters),
                         set(testDocument.specialCharacters))
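`findSpecialCharacterTokens` evidently collects every token the given regex matches into `specialCharacters`. A minimal sketch; note that even the bare '.' is expected to match `.*[@./,:$©].*`, which works because both `.*` parts may match the empty string:

import re

def findSpecialCharacterTokens(self, specialChars):
    # Sketch: keep any token the pattern matches; tokens without one of
    # the listed characters ('normal', 'test entity') fall through.
    pattern = re.compile(specialChars)
    self.specialCharacters = [
        token for token in self.tokens if pattern.match(token)
    ]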
Example #8
    def test_createEntities(self):
        self.targetDocument.createEntities()
        testDocument = Document(
            'Test Document',
            'Name entities like World Health Organization, person names like Sir James and Ms Rosa Wallis but also locations like Lebanon, United States of America, Lebanon or cities like New York have to be recognized'
        )
        testDocument.createEntities()

        self.targetDocument.entities.LOCATION = [(u'lebanon', 2),
                                                 (u'united states of america',
                                                  1), (u'new york', 1)]
        self.targetDocument.entities.PERSON = [(u'james', 1),
                                               (u'rosa wallis', 1)]
        self.targetDocument.entities.ORGANIZATION = [
            (u'world health organization', 1)
        ]

        self.assertEqual(testDocument.entities.PERSON,
                         self.targetDocument.entities.PERSON)
        self.assertEqual(testDocument.entities.LOCATION,
                         self.targetDocument.entities.LOCATION)
        self.assertEqual(testDocument.entities.ORGANIZATION,
                         self.targetDocument.entities.ORGANIZATION)
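The u'' tuples and the PERSON/LOCATION/ORGANIZATION label set suggest an NLTK `ne_chunk`-based recognizer that lowercases chunk leaves and counts repeats, folding GPE chunks (countries, cities) into LOCATION. A rough sketch under those assumptions; exact chunk boundaries (for example, whether 'Sir' is grouped with 'James') depend on the tagger and chunker models:

import nltk
from collections import Counter

def createEntities(self):
    counts = {'PERSON': Counter(), 'LOCATION': Counter(),
              'ORGANIZATION': Counter()}
    tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(self.text)))
    for subtree in tree:
        if isinstance(subtree, nltk.Tree):
            # NLTK labels countries and cities GPE; fold them into LOCATION.
            label = subtree.label()
            label = 'LOCATION' if label == 'GPE' else label
            name = ' '.join(word for word, tag in subtree.leaves()).lower()
            if label in counts:
                counts[label][name] += 1
    for label, counter in counts.items():
        setattr(self.entities, label, list(counter.items()))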
Example #9
    def test_appendEntities(self):
        testDocument = Document(
            'Test Document',
            'Name entities like World Health Organization, person names like Sir James and Ms Rosa Wallis but also world locations or states like Lebanon, United States of America, Lebanon or new cities like New York have to be recognized'
        )
        testDocument.createEntities()
        testDocument.createTokens()
        testDocument.appendEntities()

        self.targetDocument.tokens = [
            'name', 'entities', 'like', ',', 'person', 'names', 'like', 'sir',
            'and', 'ms', 'but', 'also', 'world', 'locations', 'or', 'like',
            ',', 'states', ',', 'or', 'cities', 'like', 'new', 'have', 'to',
            'be', 'recognized', 'world health organization', 'lebanon',
            'lebanon', 'united states of america', 'new york', 'james',
            'rosa wallis'
        ]
        self.assertEqual(self.targetDocument.tokens, testDocument.tokens)
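The target list shows the combined effect: each entity's constituent words are removed inline (once per occurrence) and the entity itself, possibly multiword, is appended at the end, organizations first, then locations, then persons. A sketch reusing `correctTokenOccurance`; the label order is an assumption inferred from the order of the appended tokens in the fixture:

def appendEntities(self):
    # Assumed label order, matching the tail of the target token list.
    for label in ('ORGANIZATION', 'LOCATION', 'PERSON'):
        for name, count in getattr(self.entities, label):
            for _ in range(count):
                # Drop the inline unigrams, then append the entity itself.
                self.correctTokenOccurance(name)
                self.tokens.append(name)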
Example #10
    def test_removeSpecialCharacters(self):
        testDocument = Document('', '')
        testDocument.tokens = [
            'child`s', '23.09.1998', 'test entity', 'normal', '$200 000',
            '809/87', 'http://asfd.org', '809/87', 'talib@n?', '.',
            'end of line.\n'
        ]
        testDocument.specialCharacters = [
            '23.09.1998', '$200 000', '809/87', 'http://asfd.org', 'talib@n?',
            '.'
        ]

        target = set(
            ['child`s', '809/87', 'test entity', 'normal', 'end of line.\n'])
        testDocument.removeSpecialCharacters()
        self.assertEqual(target, set(testDocument.tokens))
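Note that '809/87' appears twice among the tokens but survives once, so removal takes out a single occurrence per entry in `specialCharacters`; 'end of line.\n' stays because it is not listed here. A sketch consistent with that:

def removeSpecialCharacters(self):
    # Remove one occurrence per recorded special-character token.
    for token in self.specialCharacters:
        if token in self.tokens:
            self.tokens.remove(token)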
Example #11
    def test_addDocument(self):
        document = Document('', '')
        document.tokens = ['add', 'words', 'to', 'dictionary']
        document.specialCharacters = ['add', 'specialChars', '?!%$', 'add']

        dictionary = Dictionary()
        dictionary.addDocument(document)

        self.assertEqual(dictionary.specialCharacters,
                         set(document.specialCharacters))
        self.assertEqual(set(dictionary.ids.values()), set(document.tokens))

        document2 = Document('', '')
        document2.tokens = ['new', 'words']
        document2.specialCharacters = ['add', 'xx9']

        dictionary.addDocument(document2)

        document.specialCharacters.append('xx9')
        document.tokens.append('new')

        self.assertEqual(dictionary.specialCharacters,
                         set(document.specialCharacters))
        self.assertEqual(set(dictionary.ids.values()), set(document.tokens))
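The assertions imply that `Dictionary` keeps a growing set of special characters and an id-to-token map that deduplicates tokens across documents. A minimal sketch of `addDocument` under those assumptions; the `ids` layout (id → token) is inferred from the use of `dictionary.ids.values()`:

class Dictionary:

    def __init__(self):
        self.ids = {}                 # assumed layout: numeric id -> token
        self.specialCharacters = set()

    def addDocument(self, document):
        # Merge special characters and assign an id to each unseen token.
        self.specialCharacters.update(document.specialCharacters)
        known = set(self.ids.values())
        for token in document.tokens:
            if token not in known:
                self.ids[len(self.ids)] = token
                known.add(token)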
Example #12
    def setUp(self):
        self.doc = Document(
            'TestDoc', 'Test to see if this text is added to dictionary.words')

        self.testDictionary = Dictionary()
        self.targetDictionary = Dictionary()