Example #1
 def test_tokenizing_transducer(self):
     ref_word_ipa = g2p.make_g2p("mic", "mic-ipa")("sq").output_string
     transducer = g2p.make_g2p("mic", "mic-ipa", tok_lang="mic")
     word_ipa = transducer("sq").output_string
     self.assertEqual(word_ipa, ref_word_ipa)
     string_ipa = transducer(self.contextualize("sq")).output_string
     self.assertEqual(string_ipa, self.contextualize(ref_word_ipa))
Example #2
    def test_check_ipa(self):
        transducer = make_g2p("fra", "fra-ipa")
        self.assertTrue(transducer.check(transducer("ceci")))
        self.assertFalse(transducer.check(transducer("ñ")))
        self.assertFalse(
            transducer.check(transducer("ñ"), display_warnings=True))
        self.assertTrue(transducer.check(transducer("ceci est un test été à")))

        transducer = make_g2p("fra-ipa", "eng-ipa")
        self.assertFalse(transducer.check(transducer("ñ")))
Example #3
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False):
    dummy_inventory = ["ɑ", "i", "u", "t", "s", "n"]
    display_name = mapping.kwargs.get('language_name', 'No Language display name in Config')
    config = generate_config(mapping.kwargs[f'{io}_lang'], 'dummy', display_name, display_name)
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        mapping = align_inventories(mapping.inventory(io), dummy_inventory)
    else:
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower())} for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], dummy_inventory)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']
                
        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                LOGGER.warn(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char       
 
    if write_to_file:
        write_generated_mapping_to_file(config, mapping)
    return config, mapping
Example #4
 def test_tiered_composition(self):
     transducer = make_g2p("dan", "eng-arpabet")
     tg = transducer("hej")
     self.assertEqual(tg.output_string, "HH EH Y")
     self.assertEqual(
         tg.edges,
         [
             [(0, 0), (1, 1), (2, 2)],
             [(0, 0), (1, 1), (2, 2)],
             [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (2, 6)],
         ],
     )
     self.assertEqual(
         tg.pretty_edges(),
         [
             [["h", "h"], ["e", "ɛ"], ["j", "j"]],
             [["h", "h"], ["ɛ", "ɛ"], ["j", "j"]],
             [
                 ["h", "H"],
                 ["h", "H"],
                 ["h", " "],
                 ["ɛ", "E"],
                 ["ɛ", "H"],
                 ["ɛ", " "],
                 ["j", "Y"],
             ],
         ],
     )
     self.assertEqual(compose_tiers(tg.edges), [(0, 2), (1, 5), (2, 6)])
Example #5
 def get(self):
     args = self.parser.parse_args()
     in_lang = args['in-lang']
     out_lang = args['out-lang']
     text = args['text']
     index = args['index']
     debugger = args['debugger']
     try:
         transducer = make_g2p(in_lang, out_lang)
         tg = transducer(text)
         text = tg.output_string
         input_text = tg.input_string
         if debugger:
             debugger = tg.debugger
         if index:
             index = tg.edges
         return {
             'input-text': input_text,
             'output-text': text,
             'index': index,
             'debugger': debugger
         }
     except NetworkXNoPath:
         abort(400)
     except FileNotFoundError:
         abort(404)
Example #6
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False, out_dir: str = ''):
    display_name = mapping.kwargs.get('language_name', 'No Language display name in Config')
    config = {'in_lang': mapping.kwargs[f'{io}_lang'], 'out_lang': 'dummy'}
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        mapping = align_inventories(mapping.inventory(io), DUMMY_INVENTORY)
    else:
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower()).output_string} for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], DUMMY_INVENTORY)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']
                
        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                LOGGER.warn(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char       

    config['mapping'] = mapping
    mapping = Mapping(**config)
    if write_to_file:
        if out_dir:
            if os.path.isdir(out_dir):
                mapping.config_to_file(out_dir)
                mapping.mapping_to_file(out_dir)
            else:
                LOGGER.warning(f'{out_dir} is not a directory. Writing to default instead.')
        else:
            mapping.config_to_file()
            mapping.mapping_to_file()
    return mapping
Example #7
 def test_composition_with_none(self):
     transducer = make_g2p("ctp", "eng-arpabet")
     tg = transducer("Qne\u1D2C")
     self.assertEqual(tg.output_string, "HH N EY")
     self.assertEqual(
         tg.edges,
         [
             [(0, 0), (1, 1), (2, 2), (3, None)],
             [(0, 0), (1, 1), (2, 2), (2, 3)],
             [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (2, 5), (3, 6)],
         ],
     )
     self.assertEqual(
         tg.pretty_edges(),
         [
             [["q", "ʔ"], ["n", "n"], ["e", "e"], ["ᴬ", None]],
             [["ʔ", "ʔ"], ["n", "n"], ["e", "e"], ["e", "ː"]],
             [
                 ["ʔ", "H"],
                 ["ʔ", "H"],
                 ["ʔ", " "],
                 ["n", "N"],
                 ["n", " "],
                 ["e", "E"],
                 ["ː", "Y"],
             ],
         ],
     )
     self.assertEqual(compose_tiers(tg.edges), [(0, 2), (1, 4), (2, 6),
                                                (3, 6)])
Example #8
 def test_tiered_composition(self):
     """Indices mapped through a more complex, three-step composition"""
     transducer = make_g2p("dan", "eng-arpabet")
     tg = transducer("hej")
     self.assertEqual(tg.output_string, "HH EH Y ")
     self.assertEqual(
         tg.edges,
         [
             [(0, 0), (1, 1), (2, 2)],
             [(0, 0), (1, 1), (2, 2)],
             [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (2, 6),
              (2, 7)],
         ],
     )
     self.assertEqual(
         tg.pretty_edges(),
         [
             [["h", "h"], ["e", "ɛ"], ["j", "j"]],
             [["h", "h"], ["ɛ", "ɛ"], ["j", "j"]],
             [
                 ["h", "H"],
                 ["h", "H"],
                 ["h", " "],
                 ["ɛ", "E"],
                 ["ɛ", "H"],
                 ["ɛ", " "],
                 ["j", "Y"],
                 ["j", " "],
             ],
         ],
     )
Example #9
 def test_tokenizing_transducer_edges(self):
     transducer = g2p.make_g2p("fra", "fra-ipa", tok_lang="fra")
     edges = transducer("est est").edges
     # "est" maps to the single character "ɛ", so its edges are (0, 0),
     # (1, 0), (2, 0); the space maps to the space, and the second "est"
     # gets the same edges offset by its position in the string
     ref_edges = [(0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 2), (6, 2)]
     self.assertEqual(edges, ref_edges)
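The reference edges above rest on the premise stated in the comment: "est" maps to the single character "ɛ". A minimal sketch to check that premise directly (assuming the g2p package is installed; expected values are taken from the test's own comments, not independently verified):

# Sketch: convert "est" on its own and inspect the flat edge list of the
# single-step fra -> fra-ipa transducer.
import g2p

tg = g2p.make_g2p("fra", "fra-ipa")("est")
print(tg.output_string)  # expected per the comment above: "ɛ"
print(tg.edges)          # expected: [(0, 0), (1, 0), (2, 0)]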
Example #10
 def test_tok_and_map_mic(self):
     transducer = g2p.make_g2p("mic", "mic-ipa")
     tokenizer = g2p.make_tokenizer("mic")
     word_ipa = transducer("sq").output_string
     string_ipa = g2p.tokenize_and_map(
         tokenizer, transducer, self.contextualize("sq")
     )
     self.assertEqual(string_ipa, self.contextualize(word_ipa))
Example #11
def convert(in_lang, out_lang, input_text):
    ''' Convert any text
    '''
    if os.path.exists(input_text) and input_text.endswith('txt'):
        with open(input_text, encoding='utf8') as f:
            input_text = f.read()
    transducer = make_g2p(in_lang, out_lang)
    click.echo(transducer(input_text))
Example #12
 def test_fra(self):
     transducer = make_g2p("fra", "eng-arpabet")
     tg = transducer("mais")
     self.assertEqual(tg.output_string, "M EH")
     self.assertEqual(compose_tiers(increment_tiers(tg.edges)), [(1, 2),
                                                                 (2, 4),
                                                                 (3, 4),
                                                                 (4, 4)])
Example #13
 def test_check_with_equiv(self):
     transducer = make_g2p("tau", "eng-arpabet", tok_lang="tau")
     tau_ipa = make_g2p("tau", "tau-ipa", tok_lang="tau")(
         "sh'oo Jign maasee' do'eent'aa shyyyh").output_string
     self.assertTrue(utils.is_panphon(tau_ipa))
     eng_ipa = make_g2p("tau", "eng-ipa", tok_lang="tau")(
         "sh'oo Jign maasee' do'eent'aa shyyyh").output_string
     self.assertTrue(utils.is_panphon(eng_ipa))
     eng_arpabet = make_g2p("tau", "eng-arpabet", tok_lang="tau")(
         "sh'oo Jign maasee' do'eent'aa shyyyh").output_string
     self.assertTrue(utils.is_arpabet(eng_arpabet))
     LOGGER.warning(
         f"tau-ipa {tau_ipa}\neng-ipa {eng_ipa}\n eng-arpabet {eng_arpabet}"
     )
     self.assertTrue(
         transducer.check(
             transducer("sh'oo Jign maasee' do'eent'aa shyyyh")))
Example #14
 def get_transducer(input_language: str, output_language: str):
     if not input_language:
         input_language = ''
         raise exceptions.CorrespondenceMissing(input_language)
     elif not output_language:
         output_language = ''
         raise exceptions.CorrespondenceMissing(output_language)
     else:
         return make_g2p(input_language, output_language)
Example #15
 def test_tokenizing_transducer_edge_spaces(self):
     transducer = g2p.make_g2p("fra", "eng-arpabet", tok_lang="fra")
     edges = transducer("  a, ").edges
     ref_edges = [
         [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)],  # "  a, " -> "  a, "
         [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)],  # "  a, " -> "  ɑ, "
         [(0, 0), (1, 1), (2, 2), (2, 3), (2, 4), (3, 5), (4, 6)],  # "  ɑ, " -> "  AA , "
     ]
     self.assertEqual(edges, ref_edges)
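The inline comments trace how the leading spaces and the comma pass through every tier unchanged. A quick hedged check of the end-to-end string, reusing the transducer from the test (the expected value is inferred from the comment on the last tier, not verified here):

# Sketch: per the final tier's comment, "  ɑ, " becomes "  AA , ", so the
# composite output should keep the whitespace and punctuation intact.
out = transducer("  a, ").output_string
print(repr(out))  # expected: '  AA , '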
Example #16
    def test_io(self):
        # go through each language declared in the test case set up
        # Instead of asserting immediately, we go through all the cases first, so that
        # running test_langs.py prints all the errors at once, to help debug a given g2p mapping.
        # Then we call assertEqual on the first failed case, to make unittest register the failure.
        error_count = 0
        for test in self.langs_to_test:
            transducer = make_g2p(test[0], test[1])
            output_string = transducer(test[2]).output_string
            if output_string != test[3]:
                LOGGER.warning("test_langs.py: mapping error: {} from {} to {} should be {}, got {}".format(test[2], test[0], test[1], test[3], output_string))
                if error_count == 0:
                    first_failed_test = test
                error_count += 1

        if error_count > 0:
            transducer = make_g2p(first_failed_test[0], first_failed_test[1])
            self.assertEqual(transducer(first_failed_test[2]).output_string, first_failed_test[3])
Example #17
 def test_tokenizing_transducer_edge_chain(self):
     transducer = g2p.make_g2p("fra", "eng-arpabet", tok_lang="fra")
     edges = transducer("est est").edges
     ref_edges = [
         # "est est" -> "ɛ ɛ"
         [(0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 2), (6, 2)],
         [(0, 0), (1, 1), (2, 2)],  # "ɛ ɛ" -> "ɛ ɛ"
         [(0, 0), (0, 1), (0, 2), (1, 3), (2, 4), (2, 5), (2, 6)],  # "ɛ ɛ" -> "EH  EH "
     ]
     self.assertEqual(edges, ref_edges)
Example #18
File: test_langs.py Project: joanise/g2p
 def test_io(self):
     # go through each language declared in the test case set up
     for lang in self.langs_to_test:
         in_lang = lang['in_lang']
         out_lang = lang['out_lang']
         transducer = make_g2p(in_lang, out_lang)
         # go through each table in the current lang
         for sample in lang['samples']:
             # assert that transducing the first item of the tuple yields the second item
             self.assertEqual(transducer(sample[0]), sample[1])
Example #19
 def test_check_tokenizing_transducer(self):
     transducer = make_g2p("fra", "fra-ipa", tok_lang="fra")
     self.assertTrue(transducer.check(transducer("ceci est un test été à")))
     self.assertFalse(transducer.check(transducer("ñ oǹ")))
     self.assertTrue(
         transducer.check(
             transducer("ceci, cela; c'est tokenizé: alors c'est bon!")))
     self.assertFalse(
         transducer.check(
             transducer("mais... c'est ñoñ, si du texte ne passe pas!")))
Example #20
 def test_tok_and_map_fra(self):
     """ Chaining tests: tokenize and map a string """
     transducer = g2p.make_g2p("fra", "fra-ipa")
     tokenizer = g2p.make_tokenizer("fra")
     # "teste" in isolation is at string and word end and beginning
     word_ipa = transducer("teste").output_string
     # "teste" followed by space or punctuation should be mapped to the same string
     string_ipa = g2p.tokenize_and_map(
         tokenizer, transducer, self.contextualize("teste")
     )
     self.assertEqual(string_ipa, self.contextualize(word_ipa))
Example #21
    def convert_word(word: str, lang: str):
        """Convert one individual word through the specified cascade of g2p mappings.

        Args:
            word (str): input word to map through g2p
            lang (str): the language code to use to attempt the g2p mapping

        Returns:
            g2p_text (str), valid(bool):
              - g2p_text is the word mapping from lang to output_orthography
              - valid is a flag indicating whether g2p conversion yielded valid
                output, which includes making sure IPA output was valid IPA and
                ARPABET output was valid ARPABET, at all intermediate steps as
                well as in the final output.
        """

        if lang == "eng":
            # Hack to use old English LexiconG2P
            # Note: adding eng_ prefix to vars that are used in both blocks to make mypy
            # happy. Since the two sides of the if are in the same scope, it complains
            # about type checking otherwise.
            assert output_orthography == "eng-arpabet"
            eng_converter = getLexiconG2P(
                os.path.join(os.path.dirname(LEXICON_PATH), "cmu_sphinx.metadata.json")
            )
            try:
                eng_text, _ = eng_converter.convert(word)
                eng_valid = is_arpabet(eng_text)
            except KeyError as e:
                if verbose_warnings:
                    LOGGER.warning(f'Could not g2p "{word}" as English: {e.args[0]}')
                eng_text = word
                eng_valid = False
            return eng_text, eng_valid
        else:
            try:
                converter = make_g2p(lang, output_orthography)
            except InvalidLanguageCode as e:
                raise ValueError(
                    f'Could not g2p "{word}" as "{lang}": invalid language code. '
                    f"Use one of {getLangs()[0]}"
                ) from e
            except NoPath as e:
                raise ValueError(
                    f'Could not g2p "{word}" as "{lang}": no path to "{output_orthography}". '
                    f"Use one of {getLangs()[0]}"
                ) from e
            tg = converter(word)
            text = tg.output_string.strip()
            valid = converter.check(tg, shallow=True)
            if not valid and verbose_warnings:
                converter.check(tg, shallow=False, display_warnings=verbose_warnings)
            return text, valid
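Per the docstring's return contract, callers unpack a (g2p_text, valid) pair. A hypothetical call site might look like the sketch below; note that convert_word is a closure, so output_orthography and verbose_warnings are assumed to be bound in the enclosing scope:

# Hypothetical usage, assuming the enclosing scope set
# output_orthography = "eng-arpabet" and verbose_warnings = True.
g2p_text, valid = convert_word("teste", "fra")
if not valid:
    LOGGER.warning(f"g2p produced invalid output for 'teste': {g2p_text}")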
Example #22
 def test_check_tokenizing_composite_transducer(self):
     transducer = make_g2p("fra", "eng-arpabet", tok_lang="fra")
     self.assertTrue(transducer.check(transducer("ceci est un test été à")))
     self.assertFalse(transducer.check(transducer("ñ oǹ")))
     self.assertTrue(
         transducer.check(
             transducer("ceci, cela; c'est tokenizé: alors c'est bon!")))
     self.assertFalse(
         transducer.check(
             transducer("mais... c'est ñoñ, si du texte ne passe pas!")))
     self.assertFalse(
         transducer.check(
             transducer("mais... c'est ñoñ, si du texte ne passe pas!"),
             display_warnings=True,
         ))
Example #23
def change_table(message):
    """ Change the lookup table
    """
    if message['in_lang'] == 'custom' or message['out_lang'] == 'custom':
        # wrap the empty mapping in a list so the emit() comprehension
        # below can iterate over it like the other branches
        mappings = [Mapping(return_empty_mappings())]
    else:
        transducer = make_g2p(message['in_lang'], message['out_lang'])
        if isinstance(transducer, Transducer):
            mappings = [transducer.mapping]
        elif isinstance(transducer, CompositeTransducer):
            mappings = [x.mapping for x in transducer._transducers]
        else:
            mappings = []
    emit('table response', [{
        'mappings': x.plain_mapping(),
        'abbs': expand_abbreviations(x.abbreviations),
        'kwargs': x.kwargs
    } for x in mappings])
Example #24
def convert_words(xml, word_unit="w", output_orthography="eng-arpabet"):
    for word in xml.xpath(".//" + word_unit):
        # only convert text within words
        same_language_units = get_same_language_units(word)
        if not same_language_units:
            continue  # nothing convertible in this word; move on to the next
        all_text = ""
        all_indices = []
        for unit in same_language_units:
            # Hack to use old English LexiconG2P
            if unit["lang"] != "eng":
                converter = make_g2p(unit["lang"], output_orthography)
                tg = converter(unit["text"])
                text = tg.output_string
                indices = tg.edges
            else:
                tg = False
                converter = LexiconG2P(
                    os.path.join(
                        os.path.dirname(LEXICON_PATH), "cmu_sphinx.metadata.json"
                    )
                )
                text, indices = converter.convert(unit["text"])
            all_text += text
            all_indices += indices
        if tg and isinstance(tg, CompositeTransductionGraph):
            norm_form = converter._transducers[0].norm_form
            indices = increment_tiers(indices)
            all_indices = compose_tiers(indices)
        elif tg and isinstance(tg, TransductionGraph):
            norm_form = converter.norm_form
            indices = increment_indices(indices)
            all_indices = compose_indices([], indices)
        else:
            norm_form = None
            all_indices = indices
        if norm_form:
            word.text = ud.normalize(norm_form, word.text)
        replace_text_in_node(word, all_text, all_indices)
    return xml
Example #25
def convert(in_lang, out_lang, input_text, path, debugger):
    '''Convert INPUT_TEXT through g2p mapping(s) from IN_LANG to OUT_LANG.

       Visit http://g2p-studio.herokuapp.com/api/v1/langs for a list of languages.

       There must be a path from IN_LANG to OUT_LANG, possibly via some intermediates.
       For example, mapping from fra to eng-arpabet will successively apply
       fra->fra-ipa, fra-ipa->eng-ipa and eng-ipa->eng-arpabet.
    '''
    # Check valid input
    # Check input != output
    if in_lang == out_lang:
        raise click.UsageError(
            "Values must be different for 'IN_LANG' and 'OUT_LANG'")
    # Check input lang exists
    if in_lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(
            f"'{in_lang}' is not a valid value for 'IN_LANG'")
    # Check output lang exists
    if out_lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(
            f"'{out_lang}' is not a valid value for 'OUT_LANG'")
    # Check if path exists
    if not has_path(LANGS_NETWORK, in_lang, out_lang):
        raise click.UsageError(
            f"Path between '{in_lang}' and '{out_lang}' does not exist")
    if os.path.exists(input_text) and input_text.endswith('txt'):
        with open(input_text, encoding='utf8') as f:
            input_text = f.read()
    if in_lang and out_lang:
        transducer = make_g2p(in_lang, out_lang)
    elif path:
        transducer = Transducer(Mapping(path))
    tg = transducer(input_text)
    if debugger:
        output = [tg.output_string, tg.edges, tg.debugger]
        PRINTER.pprint(output)
    else:
        output = tg.output_string
        click.echo(output)
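The docstring's claim that fra to eng-arpabet successively applies fra->fra-ipa, fra-ipa->eng-ipa, and eng-ipa->eng-arpabet can be sanity-checked with a short sketch; the three edge tiers mirror the tiered compositions in Examples #4 and #8 (assuming the g2p package is installed):

# Sketch: one make_g2p call composes the whole path; for a composite
# transducer, tg.edges holds one tier per mapping in the cascade.
from g2p import make_g2p

transducer = make_g2p("fra", "eng-arpabet")
tg = transducer("ceci")
print(tg.output_string)  # the final ARPABET string
print(len(tg.edges))     # expected: 3, one tier per composed mapping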
Example #26
File: api.py Project: deltork/g2p
 def get(self):
     args = self.parser.parse_args()
     in_lang = args["in-lang"]
     out_lang = args["out-lang"]
     text = args["text"]
     index = args["index"]
     debugger = args["debugger"]
     try:
         transducer = make_g2p(in_lang, out_lang)
         tg = transducer(text)
         text = tg.output_string
         input_text = tg.input_string
         debugger = tg.debugger if debugger else debugger
         index = tg.edges if index else index
         return {
             "input-text": input_text,
             "output-text": text,
             "index": index,
             "debugger": debugger,
         }
     except NoPath:
         abort(400)
     except InvalidLanguageCode:
         abort(404)
Example #27
def align_to_dummy_fallback(mapping: Mapping,
                            io: str = 'in',
                            distance: str = "weighted_feature_edit_distance"):
    """Create a mapping from mapping's output inventory to a minimalist dummy inventory"""
    config = {'in_lang': mapping.kwargs[f'{io}_lang'], 'out_lang': 'dummy'}
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        mapping = align_inventories(mapping.inventory(io),
                                    DUMMY_INVENTORY,
                                    distance=distance)
    else:
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{
            "in": unicode_escape(x),
            "out": und_g2p(unidecode(x).lower()).output_string
        } for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping],
                                       DUMMY_INVENTORY,
                                       distance=distance)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']

        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                LOGGER.warning(
                    f"We couldn't guess at what {x['in']} means, so it's being "
                    f"replaced with '{default_char}' instead.")
                x['out'] = default_char

    config['mapping'] = mapping
    mapping = Mapping(**config)
    return mapping
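A hedged usage sketch for the function above; source_mapping is a hypothetical Mapping assumed to have been loaded elsewhere:

# Hypothetical: derive a fallback mapping from source_mapping's input
# inventory to the minimalist dummy inventory, with the default
# weighted_feature_edit_distance metric named in the signature above.
dummy_mapping = align_to_dummy_fallback(source_mapping, io='in')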
Example #28
 def test_valid_transducer(self):
     transducer = make_g2p('atj', 'atj-ipa')
     self.assertTrue(isinstance(transducer, Transducer))
     self.assertEqual('niɡiɡw', transducer('nikikw').output_string)
Example #29
 def test_no_path(self):
     with self.assertRaises(NetworkXNoPath):
         make_g2p('hei', 'git')
Example #30
 def test_not_found(self):
     with self.assertRaises(FileNotFoundError):
         make_g2p('foo', 'eng-ipa')
     with self.assertRaises(FileNotFoundError):
         make_g2p('git', 'bar')
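Taken together, Examples #26, #29 and #30 suggest the exceptions raised for a missing path or an unknown language changed across g2p versions. A defensive caller might catch both generations, as in this sketch; the import locations for NoPath and InvalidLanguageCode are assumptions based on how they appear unqualified in Examples #21 and #26:

# Sketch: guard make_g2p against both the older exception types
# (NetworkXNoPath, FileNotFoundError; Examples #29-#30) and the newer
# ones (NoPath, InvalidLanguageCode; Example #26).
from networkx.exception import NetworkXNoPath
from g2p import InvalidLanguageCode, NoPath, make_g2p

def safe_make_g2p(in_lang, out_lang):
    try:
        return make_g2p(in_lang, out_lang)
    except (FileNotFoundError, InvalidLanguageCode):
        print(f"Unknown language code: {in_lang!r} or {out_lang!r}")
    except (NetworkXNoPath, NoPath):
        print(f"No conversion path from {in_lang!r} to {out_lang!r}")
    return None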