Пример #1
0
class TestCerFirst(unittest.TestCase):
    """Tokenization checks for the strings "into" and "in to".

    Each case is run once with an empty ``brackets_list`` and once with
    ``["()"]``; the expected tokens are the same in both configurations.
    """

    def setUp(self) -> None:
        """Build a fresh ``WordTokenizer`` before every test."""
        self.word_tokenizer = WordTokenizer()

    def _check(self, text, brackets, expected):
        """Tokenize ``text`` with ``brackets`` and compare with ``expected``."""
        tokens = self.word_tokenizer.tokenize(
            token_string=text, brackets_list=brackets)
        self.assertEqual(tokens, expected)

    def test_into(self):
        """"into" stays a single token (no brackets configured)."""
        self._check("into", [], ["into"])

    def test_into_2(self):
        """"in to" yields two tokens (no brackets configured)."""
        self._check("in to", [], ["in", "to"])

    def test_into_3(self):
        """"into" stays whole inside a longer phrase (no brackets configured)."""
        self._check("into the monitor", [], ["into", "the", "monitor"])

    def test_into_4(self):
        """"into" stays a single token when "()" is passed as brackets."""
        self._check("into", ["()"], ["into"])

    def test_into_5(self):
        """"in to" yields two tokens when "()" is passed as brackets."""
        self._check("in to", ["()"], ["in", "to"])

    def test_into_6(self):
        """"into" stays whole inside a longer phrase with "()" as brackets."""
        self._check("into the monitor", ["()"], ["into", "the", "monitor"])
Пример #2
0
    def update_alignment_result(self, alignment_result, process_output_digit):
        """Look for a digit expansion that lowers the current edit distance.

        One side of the alignment is scanned token by token with
        ``self.our_is_digit``: the outputs when ``process_output_digit`` is
        truthy, otherwise the reference.  A token for which it returns a
        truthy value is replaced by the returned groups of alternatives,
        each alternative being tokenized (lower-cased, punctuation removed);
        every other token is kept as its own single option.  All
        combinations produced by ``SimpleReferenceCombinationGenerator`` are
        scored against the untouched side, and the first one reaching the
        lowest distance strictly below the current one is re-aligned.

        Args:
            alignment_result: Object exposing ``get_outputs()``,
                ``get_reference()`` and
                ``calculate_three_kinds_of_distance()``.
            process_output_digit: When truthy the output tokens are
                expanded; otherwise the reference tokens are.

        Returns:
            The ``alignment_result`` attribute of the re-computed result, or
            ``None`` when no scanned token was recognised by
            ``our_is_digit`` or no combination beats the current distance.
        """
        tokenizer = WordTokenizer()

        # Scoring pass only needs distances, so alignment output is disabled.
        scorer = UKKLevenshteinDistanceCalculator(
            tokenizer=None, get_alignment_result=False)

        hypothesis_tokens = alignment_result.get_outputs()
        reference_tokens = alignment_result.get_reference()

        # Element 0 of the returned distances is the baseline to beat.
        best_distance = alignment_result.calculate_three_kinds_of_distance()[0]

        combinations = SimpleReferenceCombinationGenerator()

        scanned_tokens = (
            hypothesis_tokens if process_output_digit else reference_tokens)

        found_digit = False
        for token in scanned_tokens:
            expansions = self.our_is_digit(token)
            if not expansions:
                # Not a digit: the token is its own only option.
                combinations.add_new_token_options([token])
                continue

            found_digit = True
            for alternatives in expansions:
                combinations.add_new_token_options([
                    tokenizer.tokenize(alternative,
                                       to_lower=True,
                                       remove_punctuation=True)
                    for alternative in alternatives
                ])

        if not found_digit:
            return None

        def paired_with_untouched_side(candidate):
            """Put ``candidate`` in the slot of the side that was scanned."""
            if process_output_digit:
                return reference_tokens, candidate
            return candidate, hypothesis_tokens

        best_candidate = None
        for candidate in combinations.get_all_reference():
            first, second = paired_with_untouched_side(candidate)
            candidate_distance = scorer.get_result_from_list(
                first, second).distance

            # Strict comparison: ties keep the existing alignment.
            if candidate_distance < best_distance:
                best_distance = candidate_distance
                best_candidate = candidate

        if best_candidate is None:
            return None

        # Second pass on the winner only, this time requesting the alignment.
        aligner = UKKLevenshteinDistanceCalculator(
            tokenizer=None, get_alignment_result=True)
        first, second = paired_with_untouched_side(best_candidate)
        return aligner.get_result_from_list(first, second).alignment_result