示例#1
0
def cache_langs():
    ''' Read every language config under LANGS_DIR and cache the results.

    Each ``<code>/config.y*ml`` file directly under LANGS_DIR is parsed with
    ``yaml.safe_load``. Its mapping(s) are then loaded with
    ``load_mapping_from_path`` and stored under the name of the directory
    that holds the config file. The (in_lang, out_lang) pairs of configs
    that have a ``mappings`` key become the edges of a graph.

    Side effects:
        Writes the graph to LANGS_NWORK_PATH and pickles the dict of
        loaded configs to LANGS_PKL.

    Returns:
        dict keyed by language code holding the loaded config data.
    '''
    langs = {}
    mappings_legal_pairs = []
    # Sort by language code (the name of the directory holding the config)
    paths = sorted(Path(LANGS_DIR).glob('./*/config.y*ml'),
                   key=lambda x: x.parent.stem)
    for path in paths:
        code = path.parent.stem
        with open(path, encoding='utf8') as f:
            data = yaml.safe_load(f)
        # If there is a mappings key, there is more than one mapping
        # TODO: should put in some measure to prioritize non-generated mappings and warn when they override
        if 'mappings' in data:
            for index, mapping in enumerate(data['mappings']):
                # Record the edge from the raw config entry before that
                # entry is replaced by the fully loaded mapping.
                mappings_legal_pairs.append(
                    (mapping['in_lang'], mapping['out_lang']))
                data['mappings'][index] = load_mapping_from_path(path, index)
        else:
            # NOTE(review): a config without a 'mappings' key contributes no
            # edge to the network here — confirm that this is intended.
            data = load_mapping_from_path(path)
        # Plain item assignment: rebuilding the dict with ** unpacking on
        # every iteration copied all previous entries each time.
        langs[code] = data

    lang_network = Graph()
    lang_network.add_edges_from(mappings_legal_pairs)

    # NOTE(review): write_gpickle is presumably networkx's helper, which was
    # removed in networkx 3.0 — confirm the pinned networkx version.
    with open(LANGS_NWORK_PATH, 'wb') as f:
        write_gpickle(lang_network, f)

    with open(LANGS_PKL, 'wb') as f:
        pickle.dump(langs, f)

    return langs
示例#2
0
 def test_generated_mapping(self):
     # Build a one-rule mapping in memory, write its config to two paths
     # and its mapping data to the mappings directory, then reload both
     # configs and check that they carry the same rule and metadata.
     mappings_dir = os.path.join(PUBLIC_DIR, 'mappings')
     config_paths = [
         os.path.join(mappings_dir, 'test_config.yaml'),
         os.path.join(mappings_dir, 'generated_add.yaml'),
     ]
     mapping = Mapping(in_lang='test',
                       out_lang='test-out',
                       rule_ordering="apply-longest-first",
                       mapping=[{'in': 'a', 'out': 'b'}])
     for config_path in config_paths:
         mapping.config_to_file(config_path)
     mapping.mapping_to_file(mappings_dir)
     # Reload both configs before asserting anything, so a failed load
     # surfaces ahead of any comparison.
     reloaded = [
         utils.load_mapping_from_path(config_path)
         for config_path in config_paths
     ]
     # The reloaded rule is expected to carry empty context fields.
     expected_rules = [{
         'in': 'a',
         'out': 'b',
         'context_before': '',
         'context_after': ''
     }]
     for loaded in reloaded:
         self.assertEqual(loaded['mapping_data'], expected_rules)
         self.assertEqual(loaded['in_lang'], 'test')
         self.assertEqual(loaded['out_lang'], 'test-out')
         self.assertEqual(loaded['language_name'], 'test')
         self.assertEqual(loaded['display_name'],
                          'test custom to test-out custom')
示例#3
0
 def __init__(self,
              mapping=None,
              abbreviations: Union[str, DefaultDict[str,
                                                    List[str]]] = False,
              **kwargs):
     """Build a mapping from a rule list, a file path, or a kwargs lookup.

     Args:
         mapping: a list is validated and used directly; a string ending
             in 'yaml' or 'yml' is loaded with ``load_mapping_from_path``;
             any other string is loaded with ``load_from_file`` and
             validated; anything else triggers a lookup based on kwargs.
         abbreviations: a defaultdict or any falsy value is stored as-is;
             any other value is handed to ``load_abbreviations_from_file``.
         **kwargs: kept, in order, in ``self.kwargs``. For the lookup
             case, either both 'in_lang' and 'out_lang' or 'id' must be
             present.

     Raises:
         exceptions.MalformedLookup: if ``mapping`` is neither a list nor
             a string and kwargs holds neither lookup key combination.
     """
     # should these just be explicit instead of kwargs...
     # yes, they should
     self.allowable_kwargs = [
         'language_name', 'display_name', 'mapping', 'in_lang', 'out_lang',
         'out_delimiter', 'as_is', 'case_sensitive', 'rule_ordering',
         'escape_special', 'norm_form', 'prevent_feeding', 'reverse'
     ]
     self.kwargs = OrderedDict(kwargs)
     self.processed = False

     # Only a truthy non-defaultdict value needs loading from a file.
     if not isinstance(abbreviations, defaultdict) and abbreviations:
         self.abbreviations = load_abbreviations_from_file(abbreviations)
     else:
         self.abbreviations = abbreviations

     if isinstance(mapping, list):
         # Handle user-supplied list
         self.mapping = validate(mapping)
     elif isinstance(mapping, str):
         if mapping.endswith(('yaml', 'yml')):
             self.process_loaded_config(load_mapping_from_path(mapping))
         else:
             self.mapping = validate(load_from_file(mapping))
     elif "in_lang" in self.kwargs and "out_lang" in self.kwargs:
         self.process_loaded_config(
             find_mapping(self.kwargs['in_lang'], self.kwargs['out_lang']))
     elif 'id' in self.kwargs:
         self.process_loaded_config(
             self.find_mapping_by_id(self.kwargs['id']))
     else:
         raise exceptions.MalformedLookup()

     if self.abbreviations:
         expandable_keys = ('in', 'out', 'context_before', 'context_after')
         for abb, stands_for in self.abbreviations.items():
             abb_match = re.compile(abb)
             abb_repl = '|'.join(stands_for)
             # Skip when there are no rules, or when the first rule
             # already has a 'match_pattern' key.
             if not self.mapping or 'match_pattern' in self.mapping[0]:
                 continue
             for io in self.mapping:
                 for key in io:
                     if key in expandable_keys and abb_match.search(
                             io[key]):
                         # Replace each abbreviation occurrence with the
                         # '|'-joined alternatives it stands for.
                         io[key] = abb_match.sub(unicode_escape(abb_repl),
                                                 io[key])
     if not self.processed:
         self.mapping = self.process_kwargs(self.mapping)
示例#4
0
 def test_load_mapping(self):
     # A malformed config must raise; every entry of the multi-mapping
     # config must load to the same mapping data as the minimal config.
     mappings_dir = os.path.join(PUBLIC_DIR, 'mappings')
     with self.assertRaises(MalformedMapping):
         utils.load_mapping_from_path(
             os.path.join(mappings_dir, 'malformed_config.yaml'))
     minimal = utils.load_mapping_from_path(
         os.path.join(mappings_dir, 'minimal_config.yaml'))
     multi_config_path = os.path.join(mappings_dir, 'minimal_configs.yaml')
     # Indices 0-4 presumably correspond to the csv, tsv, psv, json and
     # xlsx variants, in that order — verify against the fixture file.
     # Everything is loaded before any comparison is made.
     variants = [
         utils.load_mapping_from_path(multi_config_path, index)
         for index in range(5)
     ]
     for variant in variants:
         self.assertEqual(minimal['mapping_data'], variant['mapping_data'])
示例#5
0
文件: cli.py 项目: deltork/g2p
def convert(
    in_lang,
    out_lang,
    input_text,
    path,
    tok,
    check,
    debugger,
    pretty_edges,
    tok_lang,
    config,
):
    """Convert INPUT_TEXT through g2p mapping(s) from IN_LANG to OUT_LANG.

       Visit http://g2p-studio.herokuapp.com/api/v1/langs for a list of languages.

       There must be a path from IN_LANG to OUT_LANG, possibly via some intermediates.
       For example, mapping from fra to eng-arpabet will successively apply
       fra->fra-ipa, fra-ipa->eng-ipa and eng-ipa->eng-arpabet.
    """
    # NOTE(review): the docstring above is presumably shown as the CLI help
    # text, so it is user-facing — confirm before rewording it.

    # Input and output language must differ.
    if in_lang == out_lang:
        raise click.UsageError(
            "Values must be different for 'IN_LANG' and 'OUT_LANG'")

    if config:
        # Register the mapping(s) of a local config file in the language
        # network and in the list of available mappings.
        # This isn't that DRY - copied from g2p/mappings/langs/__init__.py
        with open(config, encoding="utf8") as f:
            data = yaml.safe_load(f)
        if "mappings" not in data:
            # Single-mapping config: wrap it so both shapes look alike.
            mapping = load_mapping_from_path(config)
            data["mappings"] = [mapping]
            mappings_legal_pairs = [(mapping["in_lang"],
                                     mapping["out_lang"])]
        else:
            mappings_legal_pairs = []
            for index, raw_mapping in enumerate(data["mappings"]):
                mappings_legal_pairs.append(
                    (raw_mapping["in_lang"], raw_mapping["out_lang"]))
                data["mappings"][index] = load_mapping_from_path(config, index)
        # Only the input side of each pair is checked for a name clash.
        for pair in mappings_legal_pairs:
            if pair[0] in LANGS_NETWORK.nodes:
                LOGGER.warning(
                    f"A mapping with the name '{pair[0]}' is already defined in g2p. "
                    "Your local mapping with the same name might not function properly."
                )
        LANGS_NETWORK.add_edges_from(mappings_legal_pairs)
        MAPPINGS_AVAILABLE.extend(data["mappings"])

    # Both languages must be known nodes with a path between them.
    if in_lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(
            f"'{in_lang}' is not a valid value for 'IN_LANG'")
    if out_lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(
            f"'{out_lang}' is not a valid value for 'OUT_LANG'")
    if not has_path(LANGS_NETWORK, in_lang, out_lang):
        raise click.UsageError(
            f"Path between '{in_lang}' and '{out_lang}' does not exist")

    # An existing path ending in "txt" is read and its contents converted.
    if os.path.exists(input_text) and input_text.endswith("txt"):
        with open(input_text, encoding="utf8") as f:
            input_text = f.read()

    # Determine which tokenizer to use, if any
    tok_explicitly_off = tok is not None and not tok
    if tok_explicitly_off and tok_lang is not None:
        raise click.UsageError(
            "Specified conflicting --no-tok and --tok-lang options.")
    if tok and tok_lang is None:
        tok_lang = "path"

    # Transduce!!!
    # NOTE(review): if neither branch below runs, `transducer` is unbound
    # and the call that follows raises NameError.
    if in_lang and out_lang:
        transducer = make_g2p(in_lang, out_lang, tok_lang=tok_lang)
    elif path:
        transducer = Transducer(Mapping(path))
    tg = transducer(input_text)
    if check:
        transducer.check(tg, display_warnings=True)

    # Collect what to print; extra diagnostics switch to pretty-printing.
    outputs = [tg.output_string]
    if pretty_edges:
        outputs.append(tg.pretty_edges())
    if debugger:
        outputs.extend([tg.edges, tg.debugger])
    if len(outputs) == 1:
        click.echo(tg.output_string)
    else:
        click.echo(pprint.pformat(outputs, indent=4))
示例#6
0
文件: __init__.py 项目: deltork/g2p
 def __init__(
     self,
     mapping=None,
     abbreviations: Union[str, DefaultDict[str, List[str]]] = False,
     **kwargs,
 ):
     """Build a mapping from a rule list, a file path, or a kwargs lookup.

     Args:
         mapping: a list is validated and used directly; a string ending
             in "yaml" or "yml" is loaded with ``load_mapping_from_path``;
             any other string is loaded with ``load_from_file`` and
             validated; anything else triggers a lookup based on kwargs.
         abbreviations: a defaultdict or any falsy value is stored as-is;
             any other value is handed to ``load_abbreviations_from_file``.
         **kwargs: kept, in order, in ``self.kwargs``. For the lookup
             case, either both "in_lang" and "out_lang", or "id", or
             ``type="unidecode"`` must be present.

     Raises:
         exceptions.MalformedLookup: if ``mapping`` is neither a list nor
             a string and kwargs matches none of the lookup cases.
     """
     # should these just be explicit instead of kwargs...
     # yes, they should
     self.allowable_kwargs = [
         "language_name",
         "display_name",
         "mapping",
         "in_lang",
         "out_lang",
         "out_delimiter",
         "as_is",
         "case_sensitive",
         "rule_ordering",
         "escape_special",
         "norm_form",
         "prevent_feeding",
         "reverse",
         "type",
     ]
     self.kwargs = OrderedDict(kwargs)
     self.processed = False

     # Only a truthy non-defaultdict value needs loading from a file.
     if not isinstance(abbreviations, defaultdict) and abbreviations:
         self.abbreviations = load_abbreviations_from_file(abbreviations)
     else:
         self.abbreviations = abbreviations

     if isinstance(mapping, list):
         # Handle user-supplied list
         self.mapping = validate(mapping, path="user-supplied mapping")
     elif isinstance(mapping, str):
         if mapping.endswith(("yaml", "yml")):
             self.process_loaded_config(load_mapping_from_path(mapping))
         else:
             self.mapping = validate(load_from_file(mapping), path=mapping)
     elif "in_lang" in self.kwargs and "out_lang" in self.kwargs:
         self.process_loaded_config(
             find_mapping(self.kwargs["in_lang"], self.kwargs["out_lang"]))
     elif "id" in self.kwargs:
         self.process_loaded_config(
             self.find_mapping_by_id(self.kwargs["id"]))
     elif self.kwargs.get("type", "") == "unidecode":
         # This mapping type starts from an empty rule list.
         self.mapping = []
     else:
         raise exceptions.MalformedLookup()

     if self.abbreviations:
         expandable_keys = ("in", "out", "context_before", "context_after")
         # Longest abbreviation names are expanded first.
         longest_first = sorted(self.abbreviations.items(),
                                key=lambda item: len(item[0]),
                                reverse=True)
         for abb, stands_for in longest_first:
             abb_match = re.compile(abb)
             abb_repl = "|".join(stands_for)
             # Skip when there are no rules, or when the first rule
             # already has a "match_pattern" key.
             if not self.mapping or "match_pattern" in self.mapping[0]:
                 continue
             for io in self.mapping:
                 for key in io:
                     if key in expandable_keys and abb_match.search(
                             io[key]):
                         # Replace each abbreviation occurrence with the
                         # "|"-joined alternatives it stands for.
                         io[key] = abb_match.sub(unicode_escape(abb_repl),
                                                 io[key])
     if not self.processed:
         self.mapping = self.process_kwargs(self.mapping)