def cache_langs():
    ''' Read every language config file and cache the results:
        the mappings themselves as a pickle, and the in_lang/out_lang
        connectivity as a pickled Graph. Returns the langs dict. '''
    langs = {}
    mappings_legal_pairs = []
    # Sort configs by language code (the parent directory's name)
    config_paths = sorted(Path(LANGS_DIR).glob('./*/config.y*ml'),
                          key=lambda p: p.parent.stem)
    for config_path in config_paths:
        code = config_path.parent.stem
        with open(config_path, encoding='utf8') as f:
            data = yaml.safe_load(f)
        # A 'mappings' key means the config declares more than one mapping
        # TODO: should put in some measure to prioritize non-generated mappings and warn when they override
        if 'mappings' in data:
            for index, mapping in enumerate(data['mappings']):
                mappings_legal_pairs.append(
                    (mapping['in_lang'], mapping['out_lang']))
                data['mappings'][index] = load_mapping_from_path(
                    config_path, index)
        else:
            data = load_mapping_from_path(config_path)
        langs[code] = data
    # Persist the language connectivity graph
    lang_network = Graph()
    lang_network.add_edges_from(mappings_legal_pairs)
    with open(LANGS_NWORK_PATH, 'wb') as f:
        write_gpickle(lang_network, f)
    # Persist the mappings themselves
    with open(LANGS_PKL, 'wb') as f:
        pickle.dump(langs, f)
    return langs
def test_generated_mapping(self):
    # Round-trip a small config through config_to_file / mapping_to_file
    # and check both written configs load back identically.
    # config = utils.generate_config('test', 'test-out', 'Test', 'TestOut')
    config = {
        'in_lang': 'test',
        'out_lang': 'test-out',
        'rule_ordering': "apply-longest-first",
    }
    config['mapping'] = [{'in': 'a', 'out': 'b'}]
    mapping = Mapping(**config)
    mappings_dir = os.path.join(PUBLIC_DIR, 'mappings')
    mapping.config_to_file(os.path.join(mappings_dir, 'test_config.yaml'))
    mapping.config_to_file(os.path.join(mappings_dir, 'generated_add.yaml'))
    mapping.mapping_to_file(mappings_dir)
    expected_rules = [{
        'in': 'a',
        'out': 'b',
        'context_before': '',
        'context_after': ''
    }]
    # Both the fresh config and the re-generated one must agree
    for filename in ('test_config.yaml', 'generated_add.yaml'):
        loaded = utils.load_mapping_from_path(
            os.path.join(mappings_dir, filename))
        self.assertEqual(loaded['mapping_data'], expected_rules)
        self.assertEqual(loaded['in_lang'], 'test')
        self.assertEqual(loaded['out_lang'], 'test-out')
        self.assertEqual(loaded['language_name'], 'test')
        self.assertEqual(loaded['display_name'],
                         'test custom to test-out custom')
def __init__(self,
             mapping=None,
             abbreviations: Union[str, DefaultDict[str, List[str]]] = False,
             **kwargs):
    """Initialize a Mapping from one of several sources.

    Args:
        mapping: one of
            - a list of rule dicts (validated directly),
            - a path ending in yaml/yml (loaded as a config),
            - any other path string (loaded as a mapping file),
            - None: the mapping is looked up via kwargs, either an
              'in_lang'/'out_lang' pair or an 'id'.
        abbreviations: a prepared defaultdict of expansions, or a path
            to load them from; falsy disables abbreviation handling.
        **kwargs: extra configuration; only keys listed in
            self.allowable_kwargs are meaningful.

    Raises:
        exceptions.MalformedLookup: when no mapping source can be
            determined from the arguments.
    """
    # should these just be explicit instead of kwargs...
    # yes, they should
    self.allowable_kwargs = [
        'language_name', 'display_name', 'mapping', 'in_lang', 'out_lang',
        'out_delimiter', 'as_is', 'case_sensitive', 'rule_ordering',
        'escape_special', 'norm_form', 'prevent_feeding', 'reverse'
    ]
    self.kwargs = OrderedDict(kwargs)
    self.processed = False
    if isinstance(abbreviations, defaultdict) or not abbreviations:
        self.abbreviations = abbreviations
    else:
        # Fix: was `elif abbreviations:` — a dead condition, since the
        # first branch already catches every falsy value. Plain `else`
        # is equivalent and matches the sibling __init__ variant.
        # A truthy non-defaultdict here is a path string to load from.
        self.abbreviations = load_abbreviations_from_file(abbreviations)
    # Handle user-supplied list
    if isinstance(mapping, list):
        self.mapping = validate(mapping)
    elif isinstance(mapping, str) and (mapping.endswith('yaml')
                                       or mapping.endswith('yml')):
        loaded_config = load_mapping_from_path(mapping)
        self.process_loaded_config(loaded_config)
    elif isinstance(mapping, str):
        # Non-yaml path: load the mapping data file directly
        self.mapping = validate(load_from_file(mapping))
    else:
        # No mapping given: look it up from kwargs
        if "in_lang" in self.kwargs and "out_lang" in self.kwargs:
            loaded_config = find_mapping(self.kwargs['in_lang'],
                                         self.kwargs['out_lang'])
            self.process_loaded_config(loaded_config)
        elif 'id' in self.kwargs:
            loaded_config = self.find_mapping_by_id(self.kwargs['id'])
            self.process_loaded_config(loaded_config)
        else:
            raise exceptions.MalformedLookup()
    if self.abbreviations:
        # Expand abbreviations in-place in every rule field, unless the
        # rules were already compiled (first rule has a 'match_pattern').
        for abb, stands_for in self.abbreviations.items():
            abb_match = re.compile(abb)
            abb_repl = '|'.join(stands_for)
            if self.mapping and 'match_pattern' not in self.mapping[0]:
                for io in self.mapping:
                    for key in io.keys():
                        if key in [
                                'in', 'out', 'context_before',
                                'context_after'
                        ] and re.search(abb_match, io[key]):
                            io[key] = re.sub(abb_match,
                                             unicode_escape(abb_repl),
                                             io[key])
    if not self.processed:
        self.mapping = self.process_kwargs(self.mapping)
def test_load_mapping(self):
    # A malformed config must be rejected outright.
    mappings_dir = os.path.join(PUBLIC_DIR, 'mappings')
    with self.assertRaises(MalformedMapping):
        utils.load_mapping_from_path(
            os.path.join(mappings_dir, 'malformed_config.yaml'))
    minimal = utils.load_mapping_from_path(
        os.path.join(mappings_dir, 'minimal_config.yaml'))
    # minimal_configs.yaml lists the same data in five formats:
    # csv, tsv, psv, json, xlsx — each must load to identical rules.
    multi_path = os.path.join(mappings_dir, 'minimal_configs.yaml')
    for index in range(5):
        loaded = utils.load_mapping_from_path(multi_path, index)
        self.assertEqual(minimal['mapping_data'], loaded['mapping_data'])
def convert(
        in_lang,
        out_lang,
        input_text,
        path,
        tok,
        check,
        debugger,
        pretty_edges,
        tok_lang,
        config,
):
    """Convert INPUT_TEXT through g2p mapping(s) from IN_LANG to OUT_LANG.

    Visit http://g2p-studio.herokuapp.com/api/v1/langs for a list of languages.

    There must be a path from IN_LANG to OUT_LANG, possibly via some intermediates.
    For example, mapping from fra to eng-arpabet will successively apply
    fra->fra-ipa, fra-ipa->eng-ipa and eng-ipa->eng-arpabet.
    """
    # Check valid input
    # Check input != output
    if in_lang == out_lang:
        raise click.UsageError(
            "Values must be different for 'IN_LANG' and 'OUT_LANG'")
    if config:
        # A user-supplied local config: load its mapping(s) and register
        # them in the module-level network so they can be routed through.
        # This isn't that DRY - copied from g2p/mappings/langs/__init__.py
        mappings_legal_pairs = []
        with open(config, encoding="utf8") as f:
            data = yaml.safe_load(f)
        if "mappings" in data:
            # Multi-mapping config: collect each (in_lang, out_lang) edge
            # and replace the raw entry with the fully loaded mapping.
            for index, mapping in enumerate(data["mappings"]):
                mappings_legal_pairs.append((
                    data["mappings"][index]["in_lang"],
                    data["mappings"][index]["out_lang"],
                ))
                data["mappings"][index] = load_mapping_from_path(config, index)
        else:
            # Single-mapping config: normalize to a one-element list.
            mapping = load_mapping_from_path(config)
            data["mappings"] = [mapping]
            mappings_legal_pairs.append(
                (mapping["in_lang"], mapping["out_lang"]))
        for pair in mappings_legal_pairs:
            # Warn (but don't fail) when a local mapping shadows a
            # name already registered in g2p.
            if pair[0] in LANGS_NETWORK.nodes:
                LOGGER.warning(
                    f"A mapping with the name '{pair[0]}' is already defined in g2p. "
                    "Your local mapping with the same name might not function properly."
                )
        # NOTE(review): mutates module-level state; subsequent calls in the
        # same process will see these additions.
        LANGS_NETWORK.add_edges_from(mappings_legal_pairs)
        MAPPINGS_AVAILABLE.extend(data["mappings"])
    # Check input lang exists
    if in_lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(
            f"'{in_lang}' is not a valid value for 'IN_LANG'")
    # Check output lang exists
    if out_lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(
            f"'{out_lang}' is not a valid value for 'OUT_LANG'")
    # Check if path exists
    if not has_path(LANGS_NETWORK, in_lang, out_lang):
        raise click.UsageError(
            f"Path between '{in_lang}' and '{out_lang}' does not exist")
    # If INPUT_TEXT names an existing .txt file, convert its contents
    # instead of the literal argument.
    if os.path.exists(input_text) and input_text.endswith("txt"):
        with open(input_text, encoding="utf8") as f:
            input_text = f.read()
    # Determine which tokenizer to use, if any
    if tok is not None and not tok and tok_lang is not None:
        raise click.UsageError(
            "Specified conflicting --no-tok and --tok-lang options.")
    if tok and tok_lang is None:
        # "path" presumably means: tokenize per the mapping path's langs
        # — TODO confirm against make_g2p.
        tok_lang = "path"
    # Transduce!!!
    if in_lang and out_lang:
        transducer = make_g2p(in_lang, out_lang, tok_lang=tok_lang)
    elif path:
        transducer = Transducer(Mapping(path))
    # NOTE(review): if neither branch above runs, `transducer` is unbound
    # (NameError below) — presumably click's argument handling prevents this;
    # verify.
    tg = transducer(input_text)
    if check:
        transducer.check(tg, display_warnings=True)
    # Assemble requested outputs: converted text, plus optional
    # edge/debugger detail.
    outputs = [tg.output_string]
    if pretty_edges:
        outputs += [tg.pretty_edges()]
    if debugger:
        outputs += [tg.edges, tg.debugger]
    if len(outputs) > 1:
        click.echo(pprint.pformat(outputs, indent=4))
    else:
        click.echo(tg.output_string)
def __init__(
        self,
        mapping=None,
        abbreviations: Union[str, DefaultDict[str, List[str]]] = False,
        **kwargs,
):
    """Initialize a Mapping from one of several sources.

    Args:
        mapping: one of
            - a list of rule dicts (validated directly),
            - a path ending in yaml/yml (loaded as a config),
            - any other path string (loaded as a mapping file),
            - None: the mapping is looked up via kwargs — an
              'in_lang'/'out_lang' pair, an 'id', or type "unidecode"
              (which uses an empty rule list).
        abbreviations: a prepared defaultdict of expansions, or a path
            to load them from; falsy disables abbreviation handling.
        **kwargs: extra configuration; only keys listed in
            self.allowable_kwargs are meaningful.

    Raises:
        exceptions.MalformedLookup: when no mapping source can be
            determined from the arguments.
    """
    # should these just be explicit instead of kwargs...
    # yes, they should
    self.allowable_kwargs = [
        "language_name",
        "display_name",
        "mapping",
        "in_lang",
        "out_lang",
        "out_delimiter",
        "as_is",
        "case_sensitive",
        "rule_ordering",
        "escape_special",
        "norm_form",
        "prevent_feeding",
        "reverse",
        "type",
    ]
    self.kwargs = OrderedDict(kwargs)
    self.processed = False
    if isinstance(abbreviations, defaultdict) or not abbreviations:
        self.abbreviations = abbreviations
    else:
        # A truthy non-defaultdict here is a path string to load from.
        self.abbreviations = load_abbreviations_from_file(abbreviations)
    # Handle user-supplied list
    if isinstance(mapping, list):
        self.mapping = validate(mapping, path="user-supplied mapping")
    elif isinstance(mapping, str) and (mapping.endswith("yaml")
                                       or mapping.endswith("yml")):
        loaded_config = load_mapping_from_path(mapping)
        self.process_loaded_config(loaded_config)
    elif isinstance(mapping, str):
        # Non-yaml path: load the mapping data file directly
        self.mapping = validate(load_from_file(mapping), path=mapping)
    else:
        # No mapping given: look it up from kwargs
        if "in_lang" in self.kwargs and "out_lang" in self.kwargs:
            loaded_config = find_mapping(self.kwargs["in_lang"],
                                         self.kwargs["out_lang"])
            self.process_loaded_config(loaded_config)
        elif "id" in self.kwargs:
            loaded_config = self.find_mapping_by_id(self.kwargs["id"])
            self.process_loaded_config(loaded_config)
        elif self.kwargs.get("type", "") == "unidecode":
            # unidecode-type mappings carry no explicit rules
            self.mapping = []
        else:
            raise exceptions.MalformedLookup()
    if self.abbreviations:
        # Expand abbreviations in-place in every rule field, unless the
        # rules were already compiled (first rule has a 'match_pattern').
        # Longest abbreviation names are applied first — presumably so a
        # shorter name that is a prefix of a longer one cannot clobber it;
        # verify against the abbreviation file conventions.
        for abb, stands_for in sorted(self.abbreviations.items(),
                                      key=lambda x: len(x[0]),
                                      reverse=True):
            abb_match = re.compile(abb)
            abb_repl = "|".join(stands_for)
            if self.mapping and "match_pattern" not in self.mapping[0]:
                for io in self.mapping:
                    for key in io.keys():
                        if key in [
                                "in",
                                "out",
                                "context_before",
                                "context_after",
                        ] and re.search(abb_match, io[key]):
                            io[key] = re.sub(abb_match,
                                             unicode_escape(abb_repl),
                                             io[key])
    if not self.processed:
        self.mapping = self.process_kwargs(self.mapping)