def test_tokenize_dirty_text(self):
     """
     Text littered with stray symbols must tokenize to clean lowercase words
     """
     dirty_text = 'The first% sentence><. The sec&*ond sent@ence #.'
     clean_tokens = ['the', 'first', 'sentence', 'the', 'second', 'sentence']
     # Expected value goes first, as in the rest of the suite.
     self.assertEqual(clean_tokens, tokenize(dirty_text))
 def test_tokenize_several_sentences(self):
     """
     Two consecutive sentences must come back as one flat token list
     """
     source_text = 'The first sentence. The second sentence.'
     tokens = tokenize(source_text)
     self.assertEqual(
         ['the', 'first', 'sentence', 'the', 'second', 'sentence'], tokens)
 def test_tokenize_line_breaks(self):
     """
     HTML line-break tags between sentences must not leak into the tokens
     """
     text_with_breaks = 'The first sentence.<br /><br />The second sentence.'
     wanted = ['the', 'first', 'sentence', 'the', 'second', 'sentence']
     self.assertEqual(wanted, tokenize(text_with_breaks))
 def test_tokenize_bad_input(self):
     """
     Tokenize bad input argument scenario

     Each of the listed non-string values is expected to produce an
     empty list. Every value runs inside its own subTest so that a
     failure report names the offending input and the remaining inputs
     are still checked instead of being skipped after the first failure.
     """
     bad_inputs = [[], {}, (), None, 9, 9.34, True]
     expected = []
     for bad_input in bad_inputs:
         # subTest labels the failure with the concrete input value.
         with self.subTest(bad_input=bad_input):
             actual = tokenize(bad_input)
             self.assertEqual(expected, actual)
 def test_tokenize_ideal(self):
     """
     A plain, well-formed sentence tokenizes into its lowercase words
     """
     sentence = 'The weather is sunny, the man is happy.'
     tokens = tokenize(sentence)
     self.assertEqual(
         ['the', 'weather', 'is', 'sunny', 'the', 'man', 'is', 'happy'],
         tokens)
Пример #6
0
    def test_tokenize_big_text_length_equal(self):
        """
        The number of tokens for the big text file must match the number
        of whitespace-separated words in it
        """
        contents = read_from_file('lab_1/tokens.txt')
        # Arguments evaluate left to right: split count first, then tokenize.
        self.assertEqual(len(contents.split()), len(tokenize(contents)))
Пример #7
0
    def test_tokenize_big_text_case(self):
        """
        Tokens for the big text file must equal its whitespace split
        """
        contents = read_from_file('lab_1/tokens.txt')
        words = contents.split()
        self.assertEqual(words, tokenize(contents))
 def test_big_text_get_adjacent_words_term(self):
     """
     Adjacent words of 'tex' in the data file, requested with 4 and 31,
     must be the single pair ['although', 'products']
     """
     corpus_tokens = tokenize(read_from_file('lab_1/data.txt'))
     found = get_adjacent_words(corpus_tokens, 'tex', 4, 31)
     self.assertEqual([['although', 'products']], found)
 def test_tokenize_punctuation_marks(self):
     """
     Commas, dashes, colons, periods and exclamation marks are dropped
     """
     punctuated = 'The, first sentence - nice. The second sentence: bad!'
     wanted = ['the', 'first', 'sentence', 'nice',
               'the', 'second', 'sentence', 'bad']
     self.assertEqual(wanted, tokenize(punctuated))
    def test_get_adjacent_words_several_contexts_big_text(self):
        """
        Every occurrence of 'sodium' in the data file must yield its
        immediate left and right neighbours
        """
        corpus_tokens = tokenize(read_from_file('lab_1/data.txt'))
        neighbours = [
            ['epithelial', 'channels'],
            ['means', 'aluminate'],
            ['by', 'bicarbonate'],
            ['the', 'salt'],
        ]
        self.assertEqual(
            neighbours, get_adjacent_words(corpus_tokens, 'sodium', 1, 1))
    def test_get_concordance_several_contexts_big_text_right(self):
        """
        Sorted one-word contexts of 'sodium' in the data file
        The call passes False as the last argument; the expected contexts
        are listed alphabetically by the word that follows the term
        """
        corpus_tokens = tokenize(read_from_file('lab_1/data.txt'))
        contexts = sort_concordance(corpus_tokens, 'sodium', 1, 1, False)
        self.assertEqual(
            [
                ['means', 'sodium', 'aluminate'],
                ['by', 'sodium', 'bicarbonate'],
                ['epithelial', 'sodium', 'channels'],
                ['the', 'sodium', 'salt'],
            ],
            contexts)
    def test_big_text_get_and_sort_concordance_term(self):
        """
        Sorting the concordance of 'tex' (4 words before, 14 after) must
        return its single context from the data file
        """
        corpus_tokens = tokenize(read_from_file('lab_1/data.txt'))
        single_context = [
            'although', 'less', 'compact', 'than', 'tex', 'the', 'xml',
            'structuring', 'promises', 'to', 'make', 'it', 'widely', 'usable',
            'and', 'allows', 'for', 'instant', 'display',
        ]
        self.assertEqual(
            [single_context],
            sort_concordance(corpus_tokens, 'tex', 4, 14, True))
    def test_big_text_get_concordance_term(self):
        """
        The concordance of 'tex' (4 words before, 31 after) must return
        its single context from the data file
        """
        corpus_tokens = tokenize(read_from_file('lab_1/data.txt'))
        single_context = [
            'although', 'less', 'compact', 'than', 'tex', 'the', 'xml',
            'structuring', 'promises', 'to', 'make', 'it', 'widely', 'usable',
            'and', 'allows', 'for', 'instant', 'display', 'in', 'applications',
            'such', 'as', 'web', 'browsers', 'and', 'facilitates', 'an',
            'interpretation', 'of', 'its', 'meaning', 'in', 'mathematical',
            'software', 'products',
        ]
        self.assertEqual(
            [single_context], get_concordance(corpus_tokens, 'tex', 4, 31))