Пример #1
0
    def test_concatenate(self):
        """
        Concatenating documents yields a single document that covers every input document.

        Both the dimensions and the text of the new document are checked against the inputs.
        """

        # Build one document per sentence, tokenized without stemming and using the TF scheme.
        sentences = [
            'this is not a pipe',
            'this is just a cigarette',
            'still just as deadly',
        ]
        tokenizer = Tokenizer(stem=False)
        documents = []
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            documents.append(Document(sentence, tokens, scheme=TF()))

        merged = Document.concatenate(*documents, tokenizer=tokenizer, scheme=TF())

        # 'this' and 'just' each occur in two of the sentences; the other terms occur once.
        expected_weights = {
            'this': 2,
            'just': 2,
            'pipe': 1,
            'cigarette': 1,
            'deadly': 1,
        }
        for term, weight in expected_weights.items():
            self.assertEqual(weight, merged.dimensions.get(term))

        # The text of the new document is the input sentences joined by single spaces.
        self.assertEqual(' '.join(sentences), merged.text)
Пример #2
0
    def test_concatenate_zero_documents(self):
        """
        Concatenating zero documents yields an empty document: no dimensions and empty text.
        """

        # Deliberately pass no documents at all; only the keyword arguments are supplied.
        no_documents = ()
        tokenizer = Tokenizer(stem=False)

        empty = Document.concatenate(*no_documents, tokenizer=tokenizer, scheme=TF())

        self.assertFalse(empty.dimensions)
        self.assertEqual('', empty.text)
Пример #3
0
    def test_concatenate_with_attributes(self):
        """
        Attributes passed to the concatenation are included in the new document.
        """

        # Build one document per sentence, tokenized without stemming and using the TF scheme.
        tokenizer = Tokenizer(stem=False)
        sentences = (
            'this is not a pipe',
            'this is just a cigarette',
            'still just as deadly',
        )
        documents = []
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            documents.append(Document(sentence, tokens, scheme=TF()))

        extra_attributes = {'attr': True}
        merged = Document.concatenate(
            *documents,
            tokenizer=tokenizer,
            scheme=TF(),
            attributes=extra_attributes,
        )

        # The attribute given at concatenation time must be readable from the result.
        self.assertTrue(merged.attributes['attr'])