Exemplos de WordTokenizer em Python, exemplos de transcription_compare.tokenizer.WordTokenizer em Python

Exemplo n.º 1

0

Exibir arquivo

def main(reference, output, reference_file, output_file, alignment, error_type,
         output_format, digit):
    """
    Transcription compare tool provided by VoiceGain
    """
    # Inline text takes precedence; otherwise fall back to the file-like
    # object (only .read() is called on it, so it must already be open).
    if reference is None:
        if reference_file is None:
            raise ValueError(
                "One of --reference and --reference_file must be specified")
        reference = reference_file.read()

    if output is None:
        if output_file is None:
            raise ValueError("One of --output and --output_file must be specified")
        output = output_file.read()

    # "CER" compares characters; every other error_type compares words.
    tokenizer = CharacterTokenizer() if error_type == "CER" else WordTokenizer()
    calculator = UKKLevenshteinDistanceCalculator(
        tokenizer=tokenizer, get_alignment_result=alignment)

    # Only these two formats produce output; anything else is a silent no-op.
    if output_format not in ('TABLE', 'JSON'):
        return

    alignment_result = calculator.get_distance(reference,
                                               output).alignment_result

    # Give every error section a chance to be replaced by a corrected
    # alignment before the sections are merged back into the full result.
    error_sections = alignment_result.get_error_section_list()
    for section in error_sections:
        print("+++++++++++++++")
        print(section.original_alignment_result)
        section.set_correction(
            update_alignment_result(section.original_alignment_result))
    alignment_result.apply_error_section_list(error_sections)

    if output_format == 'TABLE':
        click.echo(alignment_result)
    else:
        # The distance value itself is not used; the call is retained so the
        # JSON path performs exactly the same work as before.
        _distance = alignment_result.calculate_three_kinds_of_distance()[0]
        click.echo(alignment_result.to_json())

Exemplo n.º 2

0

Exibir arquivo

Arquivo: digit_util.py Projeto: kakakuoka/transcription-compare

    def update_alignment_result_error_section(self,
                                              alignment_result_error_section):
        """Look for an alternative reference spelling that lowers the distance.

        Each aligned reference token is passed to ``self.our_is_digit``. When
        that returns ``False`` the token is kept as its only option; otherwise
        every item of the returned iterable is registered with the combination
        generator. Every reference produced by the generator is joined with
        spaces and scored against the section's output string.

        :param alignment_result_error_section: object exposing
            ``original_alignment_result``.
        :return: the alignment result of the best-scoring candidate against
            the output, or ``None`` when no candidate is strictly better than
            the section's current distance.
        """
        section_alignment = alignment_result_error_section.original_alignment_result
        tokens = section_alignment.aligned_tokens_list

        # Distance-only calculator for scoring candidates.
        scorer = UKKLevenshteinDistanceCalculator(
            tokenizer=WordTokenizer(), get_alignment_result=False)
        output_string = section_alignment.get_outputs_str()
        best_distance = section_alignment.calculate_three_kinds_of_distance()[0]
        generator = SimpleReferenceCombinationGenerator()

        for index in range(len(section_alignment)):
            reference_token = tokens[index].reference
            digit_options = self.our_is_digit(reference_token)
            if digit_options is False:
                # Not a digit token: the token itself is the only option.
                generator.add_new_token_options([reference_token])
                continue
            for option_group in digit_options:
                generator.add_new_token_options(option_group)

        best_reference = None
        for candidate_tokens in generator.get_all_reference():
            candidate = " ".join(candidate_tokens)
            distance = scorer.get_distance(candidate, output_string).distance
            # Strict comparison: ties keep the original alignment.
            if distance < best_distance:
                best_distance, best_reference = distance, candidate

        if best_reference is None:
            return None

        # Re-run the winner with alignment enabled to obtain the result object.
        aligner = UKKLevenshteinDistanceCalculator(
            tokenizer=WordTokenizer(), get_alignment_result=True)
        return aligner.get_distance(best_reference,
                                    output_string).alignment_result

Exemplo n.º 3

0

Exibir arquivo

def update_alignment_result(alignment_result):
    """Look for an alternative reference spelling that lowers the distance.

    Each aligned reference token is passed to ``our_is_digit``. When that
    returns ``False`` the token is registered as its own single option;
    otherwise every item of the returned iterable is registered with the
    combination generator. Once ALL tokens have been registered, every
    reference produced by the generator is joined with spaces and scored
    against the output string.

    :param alignment_result: alignment result exposing
        ``aligned_tokens_list``, ``get_outputs_str()``,
        ``calculate_three_kinds_of_distance()`` and ``len()``.
    :return: the alignment result of the best-scoring candidate against the
        output, or ``None`` when no candidate is strictly better than the
        current distance.
    """
    aligned_tokens_list = alignment_result.aligned_tokens_list

    # Distance-only calculator for scoring candidates.
    calculator = UKKLevenshteinDistanceCalculator(tokenizer=WordTokenizer(),
                                                  get_alignment_result=False)
    output_string = alignment_result.get_outputs_str()
    old_distance = alignment_result.calculate_three_kinds_of_distance()[0]
    generator = SimpleReferenceCombinationGenerator()
    tmp_result = None
    for index in range(len(alignment_result)):
        reference_token = aligned_tokens_list[index].reference
        result_digit = our_is_digit(reference_token)
        if result_digit is not False:
            for r in result_digit:
                generator.add_new_token_options(r)
        else:
            # Wrap the token in a list, matching how the other callers in
            # this code base register a single-option token.
            generator.add_new_token_options([reference_token])

    # Score candidates only after every token has been registered; scoring
    # inside the loop above would evaluate truncated references, which can
    # win on distance while silently dropping reference tokens.
    for x in generator.get_all_reference():
        x = " ".join(x)
        distance = calculator.get_distance(x, output_string).distance

        # Strict comparison: ties keep the original alignment.
        if distance < old_distance:
            old_distance = distance
            tmp_result = x

    if tmp_result is None:
        return None

    # Re-run the winner with alignment enabled to obtain the result object.
    calculator2 = UKKLevenshteinDistanceCalculator(tokenizer=WordTokenizer(),
                                                   get_alignment_result=True)
    update_result = calculator2.get_distance(tmp_result,
                                             output_string).alignment_result
    return update_result

Exemplo n.º 4

0

Exibir arquivo

Arquivo: test-transcribe.py Projeto: johnfelipe/platform-2

def run_transcription_compare(reference_path, output_file_list,
                              output_html_path):
    """Compare several transcription outputs with one reference and write HTML.

    :param reference_path: path of the UTF-8 reference transcript.
    :param output_file_list: iterable of paths of UTF-8 output transcripts.
        Results are keyed by file base name, so two paths sharing a base name
        overwrite each other.
    :param output_html_path: path the merged HTML report is written to
        (UTF-8, matching the encoding used for the inputs).
    """
    logging.info("Start to compare results")

    with open(reference_path, "r", encoding='utf-8') as reference_file:
        reference_text = reference_file.read()

    calculator = UKKLevenshteinDistanceCalculator(
        tokenizer=WordTokenizer(),
        get_alignment_result=True,
        local_optimizers=[
            DigitUtil(process_output_digit=True),
            LocalCerOptimizer()
        ])

    # Read every output up front so I/O errors surface before any comparison.
    output_all = dict()  # (output identifier -> output string)
    for output_path in output_file_list:
        with open(output_path, "r", encoding='utf-8') as output_file:
            output_all[os.path.basename(output_path)] = output_file.read()
    logging.info("Finish reading all results")

    output_results = dict()  # (output identifier -> comparison result)
    for key, value in output_all.items():
        logging.info("Start to process %s", key)
        output_results[key] = calculator.get_distance(
            reference_text,
            value,
            brackets_list=["[]", "()", "<>"],
            to_lower=True,
            remove_punctuation=True,
            use_alternative_spelling=True)

    logging.info("Merge all results into one HTML")
    # MultiResult is given a separate character-level, distance-only calculator.
    calculator_local = UKKLevenshteinDistanceCalculator(
        tokenizer=CharacterTokenizer(), get_alignment_result=False)

    result = MultiResult(output_results, calculator_local)
    s = result.to_html()

    # Explicit encoding: the platform default may be unable to encode
    # characters that were read from the UTF-8 inputs.
    with open(output_html_path, 'w', encoding='utf-8') as f:
        f.write(s)

Exemplo n.º 5

0

Exibir arquivo

class TestCerFirst(unittest.TestCase):
    """Expectations for ``WordTokenizer.tokenize`` on short plain-word inputs,
    with an empty bracket list and with ``["()"]``.
    """

    def setUp(self) -> None:
        """Create a fresh tokenizer for every test."""
        self.word_tokenizer = WordTokenizer()

    def _check(self, text, brackets, expected):
        """Tokenize ``text`` with ``brackets`` and compare with ``expected``."""
        tokens = self.word_tokenizer.tokenize(token_string=text,
                                              brackets_list=brackets)
        self.assertEqual(tokens, expected)

    def test_into(self):
        """A single word stays a single token (no brackets)."""
        self._check("into", [], ["into"])

    def test_into_2(self):
        """Two words separated by a space become two tokens (no brackets)."""
        self._check("in to", [], ["in", "to"])

    def test_into_3(self):
        """A three-word phrase becomes three tokens (no brackets)."""
        self._check("into the monitor", [], ["into", "the", "monitor"])

    def test_into_4(self):
        """A single word stays a single token with ``()`` brackets configured."""
        self._check("into", ["()"], ["into"])

    def test_into_5(self):
        """Two words become two tokens with ``()`` brackets configured."""
        self._check("in to", ["()"], ["in", "to"])

    def test_into_6(self):
        """A three-word phrase becomes three tokens with ``()`` brackets configured."""
        self._check("into the monitor", ["()"], ["into", "the", "monitor"])

Exemplo n.º 6

0

Exibir arquivo

    def update_alignment_result(self, alignment_result, process_output_digit):
        """Try alternative spellings of digit tokens to lower the distance.

        Tokens are taken from the output side when ``process_output_digit`` is
        truthy, otherwise from the reference side. Each token is passed to
        ``self.our_is_digit``; a truthy result is treated as an iterable of
        option groups whose members are tokenized (lower-cased, punctuation
        removed) before being registered with the combination generator. Every
        generated candidate then replaces the chosen side and is scored
        against the other, untouched side.

        :param alignment_result: alignment result exposing ``get_outputs()``,
            ``get_reference()`` and ``calculate_three_kinds_of_distance()``.
        :param process_output_digit: selects which side is rewritten.
        :return: the alignment result for the best candidate, or ``None`` when
            no token was recognised as a digit or no candidate is strictly
            better than the current distance.
        """
        word_tokenizer = WordTokenizer()

        # Works on token lists directly, so no tokenizer is configured.
        scorer = UKKLevenshteinDistanceCalculator(
            tokenizer=None, get_alignment_result=False)

        output_token_list = alignment_result.get_outputs()
        reference_token_list = alignment_result.get_reference()
        best_distance = alignment_result.calculate_three_kinds_of_distance()[0]

        generator = SimpleReferenceCombinationGenerator()

        tokens_to_scan = (output_token_list
                          if process_output_digit else reference_token_list)

        found_digit = False
        for token in tokens_to_scan:
            digit_variants = self.our_is_digit(token)
            if not digit_variants:
                # Non-digit token: the token itself is the only option.
                generator.add_new_token_options([token])
                continue
            found_digit = True
            for variant in digit_variants:
                generator.add_new_token_options([
                    word_tokenizer.tokenize(option,
                                            to_lower=True,
                                            remove_punctuation=True)
                    for option in variant
                ])

        if not found_digit:
            return None

        def compare(calc, candidate):
            # The candidate stands in for whichever side was scanned.
            if process_output_digit:
                return calc.get_result_from_list(reference_token_list,
                                                 candidate)
            return calc.get_result_from_list(candidate, output_token_list)

        best_candidate = None
        for candidate in generator.get_all_reference():
            distance = compare(scorer, candidate).distance
            # Strict comparison: ties keep the original alignment.
            if distance < best_distance:
                best_distance, best_candidate = distance, candidate

        if best_candidate is None:
            return None

        # Re-run the winner with alignment enabled to obtain the result object.
        aligner = UKKLevenshteinDistanceCalculator(
            tokenizer=None, get_alignment_result=True)
        return compare(aligner, best_candidate).alignment_result

Exemplo n.º 7

0

Exibir arquivo

 def setUp(self) -> None:
     """Store a new WordTokenizer on the test instance before each test."""
     self.word_tokenizer = WordTokenizer()

Exemplo n.º 8

0

Exibir arquivo

Arquivo: testcases.py Projeto: voicegain/transcription-compare

 def setUp(self) -> None:
     """Build the two calculators stored on the test instance.

     ``calculator`` tokenizes by character and ``wer_calculator`` by word;
     both are created with alignment results enabled.
     """
     self.calculator = UKKLevenshteinDistanceCalculator(
         tokenizer=CharacterTokenizer(), get_alignment_result=True)
     self.wer_calculator = UKKLevenshteinDistanceCalculator(
         tokenizer=WordTokenizer(), get_alignment_result=True)

Exemplo n.º 9

0

Exibir arquivo

Arquivo: for_plot.py Projeto: voicegain/transcription-compare

from transcription_compare.levenshtein_distance_calculator import UKKLevenshteinDistanceCalculator
from transcription_compare.tokenizer import WordTokenizer
import re

# Word-level calculator with alignment results enabled and no digit_util.
calculator = UKKLevenshteinDistanceCalculator(tokenizer=WordTokenizer(),
                                              get_alignment_result=True,
                                              digit_util=None)

# Sample reference/output transcript pair.
reference = 'APPLE BANANA WATER BYE OK NO PROBLEM TIME LOG SEARCH'
output = 'APPLE BANANA HELLO WATER HA NO TIME LOG YES '
length = len(reference)
# NOTE(review): the loop variable is never used in the body, so the same
# space-split word list is recomputed and printed on every iteration
# (length - 1 times). Presumably leftover experimentation; confirm intent.
for i in range(1, length):
    b = re.split(r" +", reference)
    print(b)


def cut_text(text, lenth):
    """Split ``text`` into consecutive chunks of ``lenth`` characters.

    The returned list holds every full-size chunk followed by the remainder.
    The remainder is always appended, so it is an empty string when
    ``len(text)`` is an exact multiple of ``lenth`` (this matches the previous
    behaviour for newline-free text).

    Slicing is used instead of a ``.{n}`` regex because ``.`` does not match
    newlines, which made chunks misalign for multi-line text.

    :param text: string to split.
    :param lenth: chunk size; must be a positive integer.
    :return: list of chunks, ending with the (possibly empty) remainder.
    :raises ValueError: if ``lenth`` is smaller than 1.
    """
    if lenth < 1:
        raise ValueError("lenth must be a positive integer")
    full_chunks = len(text) // lenth
    textArr = [text[i * lenth:(i + 1) * lenth] for i in range(full_chunks)]
    textArr.append(text[full_chunks * lenth:])
    return textArr


# Sample reference/output transcript pair used to locate the spaces.
reference = 'APPLE BANANA WATER BYE OK NO PROBLEM TIME LOG SEARCH'
output = 'APPLE BANANA HELLO WATER HA NO TIME LOG YES '
length = len(reference)
spaces_count = reference.count(' ')
print(length)
# print(spaces_count//2)  # decide based on the number of spaces

# Index of every space character in the reference.
spaces_spot = [position for position, char in enumerate(reference)
               if char == ' ']
print(spaces_spot)

Exemplo n.º 10

0

Exibir arquivo

Arquivo: error_display.py Projeto: voicegain/transcription-compare

                print('current_reference', all_reference[0])
                print('current_output', current_output)
                d += calculator.get_distance(all_reference[0],
                                             current_output[0]).distance
                d += calculator.get_distance(all_reference[1],
                                             current_output[1]).distance
                print('old_distance', old_distance)
                print('d', d)
                if d < old_distance:
                    old_distance = distance
                    tmp_result = current_output
                print('tmp_result', tmp_result)
            if tmp_result is None:
                pass
            calculator2 = UKKLevenshteinDistanceCalculator(
                tokenizer=WordTokenizer(), get_alignment_result=True)
            update_result = calculator2.get_distance(
                all_reference[0], " ".join(tmp_result[0])).alignment_result
            update_result += calculator2.get_distance(
                all_reference[1], " ".join(tmp_result[1])).alignment_result
            print(update_result)


def update_alignment_result_word(alignment_result):
    """Locate each row's reference inside that row's output.

    For every row, ``index`` is the position of ``row.reference`` within
    ``row.output`` when it is contained there, and 0 otherwise. The value is
    not used yet; as written here the function returns ``None``.

    :param alignment_result: iterable of rows exposing ``reference`` and
        ``output``.
    """
    # first check same character
    for row in alignment_result:
        if row.reference in row.output:
            # Look the reference up in the row's own output; calling the
            # unbound ``list.index`` with the reference raised TypeError.
            index = row.output.index(row.reference)
        else:
            index = 0
    # sort