Example #1
0
 def test_punkt_pair_iter_handles_stop_iteration_exception(self):
     # test input to trigger StopIteration from next()
     it = iter([])
    # call the method under test to produce a generator
    gen = punkt._pair_iter(it)
    # consume the generator, ensuring that no error is raised
     list(gen)
Example #2
0
 def test_punkt_pair_iter_handles_stop_iteration_exception(self):
     # test input to trigger StopIteration from next()
     it = iter([])
    # call the method under test to produce a generator
    gen = punkt._pair_iter(it)
    # consume the generator, ensuring that no error is raised
     list(gen)
Example #3
0
    def test_punkt_pair_iter(self):

        test_cases = [
            ('12', [('1', '2'), ('2', None)]),
            ('123', [('1', '2'), ('2', '3'), ('3', None)]),
            ('1234', [('1', '2'), ('2', '3'), ('3', '4'), ('4', None)]),
        ]

        for (test_input, expected_output) in test_cases:
            actual_output = list(punkt._pair_iter(test_input))

            assert_equal(actual_output, expected_output)
Example #4
0
    def test_punkt_pair_iter(self):

        test_cases = [
            ("12", [("1", "2"), ("2", None)]),
            ("123", [("1", "2"), ("2", "3"), ("3", None)]),
            ("1234", [("1", "2"), ("2", "3"), ("3", "4"), ("4", None)]),
        ]

        for (test_input, expected_output) in test_cases:
            actual_output = list(punkt._pair_iter(test_input))

            assert actual_output == expected_output
Example #5
0
    def test_punkt_pair_iter(self):

        test_cases = [
            ('12', [('1', '2'), ('2', None)]),
            ('123', [('1', '2'), ('2', '3'), ('3', None)]),
            ('1234', [('1', '2'), ('2', '3'), ('3', '4'), ('4', None)]),
        ]

        for (test_input, expected_output) in test_cases:
            actual_output = list(punkt._pair_iter(test_input))

            assert_equal(actual_output, expected_output)
    def annotate_multi_punct_words(self, tokens):
        """ Detect abbreviations with multiple periods and mark them as abbreviations.
        Basically punkt is failing to count custom abbreviations, like F.B.I.,
        when it is not in the training data, even though they are relatively simple
        to tease out, especially when mixing it with ortho heuristics to detect
        the likelyhood of it being a sentence starter as well an abbreviation."""
        for aug_tok1, aug_tok2 in punkt._pair_iter(tokens):
            if self._re_abbr.search(aug_tok1.tok) is None:
                yield aug_tok1
                continue

            aug_tok1.abbr = True
            aug_tok1.sentbreak = False
            # Is it the last token? We can't do anything then.
            if not aug_tok2:
                continue

            next_typ = aug_tok2.type_no_sentperiod
            tok_is_initial = aug_tok1.is_initial
            # figure out if it's a sentence starter
            # [4.2. Token-Based Reclassification of Abbreviations] If
            # the token is an abbreviation or an ellipsis, then decide
            # whether we should *also* classify it as a sentbreak.
            if (aug_tok1.abbr or aug_tok1.ellipsis) and not tok_is_initial:
                # [4.1.1. Orthographic Heuristic] Check if there's
                # orthographic evidence about whether the next word
                # starts a sentence or not.
                is_sent_starter = self._ortho_heuristic(aug_tok2)
                # _ortho_heuristic can also return "unknown", so require an
                # explicit True before marking a sentence break.
                if is_sent_starter is True:
                    aug_tok1.sentbreak = True
                    yield aug_tok1
                    continue

            # [4.1.3. Frequent Sentence Starter Heuristic] If the
            # next word is capitalized, and is a member of the
            # frequent-sentence-starters list, then label tok as a
            # sentence break.
            if aug_tok2.first_upper and next_typ in self._params.sent_starters:
                aug_tok1.sentbreak = True

            yield aug_tok1
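Below is a minimal usage sketch of how an annotator like annotate_multi_punct_words could be wired into NLTK's PunktSentenceTokenizer. The class name, the _re_abbr pattern, and the simplified extra pass are illustrative assumptions, not the implementation above.

import re

from nltk.tokenize import punkt


class MultiPunctSentenceTokenizer(punkt.PunktSentenceTokenizer):
    # Illustrative pattern for multi-period abbreviations such as "F.B.I."
    _re_abbr = re.compile(r"^(?:[A-Za-z]\.){2,}$")

    def _annotate_multi_punct(self, tokens):
        # Simplified stand-in for annotate_multi_punct_words: mark matching
        # tokens as abbreviations so their periods do not force sentence breaks.
        for aug_tok in tokens:
            if self._re_abbr.search(aug_tok.tok):
                aug_tok.abbr = True
                aug_tok.sentbreak = False
            yield aug_tok

    def _annotate_tokens(self, tokens):
        # Run punkt's standard annotation passes, then the extra pass.
        tokens = super()._annotate_tokens(tokens)
        return self._annotate_multi_punct(tokens)


tokenizer = MultiPunctSentenceTokenizer()
# "F.B.I." should stay inside the first sentence instead of triggering a break.
print(tokenizer.tokenize("Agents of the F.B.I. met reporters. They said little."))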