def check(
    self,
    tg: TransductionGraph,
    shallow=False,
    display_warnings=False,
    original_input=None,
):
    out_lang = self.mapping.kwargs["out_lang"]
    if "eng-arpabet" in out_lang:
        if not is_arpabet(tg.output_string):
            if display_warnings:
                display_input = (
                    original_input if original_input else tg.input_string
                )
                LOGGER.warning(
                    f'Transducer output "{tg.output_string}" for input "{display_input}" is not fully valid eng-arpabet as recognized by soundswallower.'
                )
            return False
        else:
            return True
    elif is_ipa(out_lang):
        if not is_panphon(tg.output_string, display_warnings=display_warnings):
            if display_warnings:
                display_input = (
                    original_input if original_input else tg.input_string
                )
                LOGGER.warning(
                    f'Transducer output "{tg.output_string}" for input "{display_input}" is not fully valid {out_lang}.'
                )
            return False
        else:
            return True
    else:
        # No check implemented at this tier, just return True
        return True
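# --- Hedged usage sketch (not in the original source): test_check_with_equiv further
# below suggests check() is called on a Transducer built with make_g2p(); the language
# codes and input text here are illustrative only.
transducer = make_g2p("tau", "eng-arpabet", tok_lang="tau")
tg = transducer("sh'oo Jign")
if not transducer.check(tg, display_warnings=True):
    LOGGER.warning("Output failed the eng-arpabet validity check.")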
def network_to_echart(write_to_file: bool = False, layout: bool = False):
    nodes = []
    no_nodes = len(LANGS_NETWORK.nodes)
    for node in LANGS_NETWORK.nodes:
        lang_name = node.split('-')[0]
        no_ancestors = len(ancestors(LANGS_NETWORK, node))
        no_descendants = len(descendants(LANGS_NETWORK, node))
        size = min(
            20,
            max(2, ((no_ancestors / no_nodes) * 100 + (no_descendants / no_nodes) * 100)))
        node = {
            'name': node,
            'symbolSize': size,
            'id': node,
            'category': lang_name
        }
        nodes.append(node)
    nodes.sort(key=lambda x: x['name'])
    edges = []
    for edge in LANGS_NETWORK.edges:
        edges.append({'source': edge[0], 'target': edge[1]})
    if write_to_file:
        with open(
                os.path.join(os.path.dirname(static_file), 'languages-network.json'),
                'w') as f:
            f.write(json.dumps({'nodes': nodes, 'edges': edges}))
        LOGGER.info('Wrote network nodes and edges to static file.')
    return nodes, edges
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False):
    dummy_inventory = ["ɑ", "i", "u", "t", "s", "n"]
    display_name = mapping.kwargs.get('language_name', 'No Language display name in Config')
    config = generate_config(mapping.kwargs[f'{io}_lang'], 'dummy', display_name, display_name)
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        mapping = align_inventories(mapping.inventory(io), dummy_inventory)
    else:
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower())}
                   for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], dummy_inventory)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']
        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                LOGGER.warning(
                    f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char
    if write_to_file:
        write_generated_mapping_to_file(config, mapping)
    return config, mapping
def check_ipa_known_segs(mappings_to_check=False) -> bool:
    """Check the given mappings, or all IPA mappings, for invalid IPA in the "out" fields

    Returns True iff no errors were found.
    """
    if not mappings_to_check:
        mappings_to_check = [x["out_lang"] for x in MAPPINGS_AVAILABLE]
    found_error = False
    for mapping in [
        x for x in MAPPINGS_AVAILABLE if x["out_lang"] in mappings_to_check
    ]:
        if is_ipa(mapping["out_lang"]):
            reverse = mapping.get("reverse", False)
            for rule in mapping["mapping_data"]:
                output = rule["in"] if reverse else rule["out"]
                if not is_panphon(output):
                    LOGGER.warning(
                        f"Output '{rule['out']}' in rule {rule} in mapping between {mapping['in_lang']} "
                        f"and {mapping['out_lang']} is not recognized as valid IPA by panphon."
                    )
                    found_error = True
    if found_error:
        LOGGER.warning(
            "Please refer to https://github.com/dmort27/panphon for information about panphon."
        )
    return not found_error
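# --- Hedged usage sketch (not in the original source): check a single mapping by its
# out_lang code; "fra-ipa" is illustrative and assumes such a mapping is installed.
if not check_ipa_known_segs(["fra-ipa"]):
    LOGGER.warning("Some outputs of the fra-ipa mapping are not valid panphon IPA.")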
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False, out_dir: str = ''):
    display_name = mapping.kwargs.get('language_name', 'No Language display name in Config')
    config = {'in_lang': mapping.kwargs[f'{io}_lang'], 'out_lang': 'dummy'}
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        mapping = align_inventories(mapping.inventory(io), DUMMY_INVENTORY)
    else:
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower()).output_string}
                   for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], DUMMY_INVENTORY)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']
        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                LOGGER.warning(
                    f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char
    config['mapping'] = mapping
    mapping = Mapping(**config)
    if write_to_file:
        if out_dir:
            if os.path.isdir(out_dir):
                mapping.config_to_file(out_dir)
                mapping.mapping_to_file(out_dir)
            else:
                LOGGER.warning(f'{out_dir} is not a directory. Writing to default instead.')
        else:
            mapping.config_to_file()
            mapping.mapping_to_file()
    return mapping
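# --- Hedged usage sketch (not in the original source): build a fallback mapping from
# an existing mapping's output inventory to the dummy inventory. The lookup-by-name
# Mapping constructor and the "fra"/"fra-ipa" codes are assumptions for illustration.
fra_ipa = Mapping(in_lang="fra", out_lang="fra-ipa")
dummy_fallback = align_to_dummy_fallback(fra_ipa, io="out", write_to_file=False)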
def rule_to_regex(self, rule: dict) -> Pattern:
    """Turns an input string (and the context) from an input/output pair
    into a regular expression pattern"""
    if "context_before" in rule and rule['context_before']:
        before = rule["context_before"]
    else:
        before = ''
    if 'context_after' in rule and rule['context_after']:
        after = rule["context_after"]
    else:
        after = ''
    input_match = re.sub(re.compile(r'{\d+}'), "", rule['in'])
    try:
        inp = create_fixed_width_lookbehind(before) + input_match
        if after:
            inp += f"(?={after})"
        if not self.kwargs['case_sensitive']:
            rule_regex = re.compile(inp, re.I)
        else:
            rule_regex = re.compile(inp)
    except:
        LOGGER.error(
            f'Your regex in mapping between {self.kwargs["in_lang"]} and {self.kwargs["out_lang"]} is malformed. '
            f'Do you have un-escaped regex characters in your input {inp}, contexts {before}, {after}?'
        )
        raise Exception(
            f'Your regex in mapping between {self.kwargs["in_lang"]} and {self.kwargs["out_lang"]} is malformed. '
            f'Do you have un-escaped regex characters in your input {inp}, contexts {before}, {after}?'
        )
    return rule_regex
def create_mapping(mapping_1: Mapping, mapping_2: Mapping, mapping_1_io: str = 'out',
                   mapping_2_io: str = 'in', write_to_file: bool = False) -> Mapping:
    map_1_name = mapping_1.kwargs[f'{mapping_1_io}_lang']
    map_2_name = mapping_2.kwargs[f'{mapping_2_io}_lang']
    if not is_ipa(map_1_name) and not is_xsampa(map_1_name):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s"
            " (must be ipa or x-sampa)", map_1_name)
    if not is_ipa(map_2_name) and not is_xsampa(map_2_name):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s"
            " (must be ipa or x-sampa)", map_2_name)
    l1_is_xsampa, l2_is_xsampa = is_xsampa(map_1_name), is_xsampa(map_2_name)
    mapping = align_inventories(mapping_1.inventory(mapping_1_io),
                                mapping_2.inventory(mapping_2_io),
                                l1_is_xsampa, l2_is_xsampa)
    l1_display_name = mapping_1.kwargs.get(
        'language_name', 'No Language display name in Config')
    l2_display_name = mapping_2.kwargs.get(
        'language_name', 'No Language display name in Config')
    config = generate_config(map_1_name, map_2_name, l1_display_name, l2_display_name)
    if write_to_file:
        write_generated_mapping_to_file(config, mapping)
    return Mapping(mapping, **{k: v for k, v in config.items() if k != 'mapping'})
def write_generated_mapping_to_file(config: dict, mapping: List[dict]):
    # read config
    with open(GEN_CONFIG, 'r') as f:
        data = yaml.safe_load(f)
    map_output_path = os.path.join(GEN_DIR, config['mapping'])
    # write mapping
    if os.path.exists(map_output_path):
        LOGGER.info(f"Overwriting file at {map_output_path}")
    with open(map_output_path, 'w', encoding='utf8') as f:
        json.dump(mapping, f, indent=4)
    data = deepcopy(data)
    cfg_exists = bool([x for x in data['mappings']
                       if x['in_lang'] == config['in_lang']
                       and x['out_lang'] == config['out_lang']])
    # add new mapping if no mappings are generated yet
    if not data['mappings']:
        data['mappings'] = [config]
    # add new mapping if it doesn't exist yet
    elif not cfg_exists:
        data['mappings'].append(config)
        # rewrite config
        with open(GEN_CONFIG, 'w', encoding='utf8') as f:
            yaml.dump(data, f, Dumper=IndentDumper, default_flow_style=False)
    elif cfg_exists:
        for i, cfg in enumerate(data['mappings']):
            if cfg['in_lang'] == config['in_lang'] and cfg['out_lang'] == config['out_lang']:
                data['mappings'][i] = config
                # rewrite config
                with open(GEN_CONFIG, 'w', encoding='utf8') as f:
                    yaml.dump(data, f, Dumper=IndentDumper, default_flow_style=False)
                break
    else:
        LOGGER.warning(
            f"Not writing generated files because a non-generated mapping from {config['in_lang']} to {config['out_lang']} already exists.")
def test_response_code(self):
    ''' Ensure all routes return 200 '''
    for rt in self.routes_no_args:
        try:
            r = self.client().get(rt)
            self.assertEqual(r.status_code, 200)
            LOGGER.debug("Route " + rt + " returned " + str(r.status_code))
        except:
            LOGGER.error("Couldn't connect. Is flask running?")
def update_docs():
    """ Update the swagger documentation with all nodes from the network """
    swagger_path = os.path.join(os.path.dirname(static_file), "swagger.json")
    with open(swagger_path) as f:
        data = json.load(f)
    data["components"]["schemas"]["Langs"]["enum"] = sorted(LANGS_NETWORK.nodes)
    with open(swagger_path, "w") as f:
        f.write(json.dumps(data))
    LOGGER.info("Updated API documentation")
def update_docs():
    ''' Update the swagger documentation with all nodes from the network '''
    swagger_path = os.path.join(os.path.dirname(static_file), 'swagger.json')
    with open(swagger_path) as f:
        data = json.load(f)
    data['components']['schemas']['Langs']['enum'] = sorted(
        [x for x in LANGS_NETWORK.nodes])
    with open(swagger_path, 'w') as f:
        f.write(json.dumps(data))
    LOGGER.info('Updated API documentation')
def get_tokenizer(*args, **kwargs):
    """ Deprecated; use make_tokenizer() instead. """
    global _deprecated_warning_printed
    if not _deprecated_warning_printed:
        LOGGER.warning(
            "g2p.get_tokenizer() / g2p.mappings.tokenizer.get_tokenizer() is deprecated. Import and use g2p.make_tokenizer() instead."
        )
        _deprecated_warning_printed = True
    return make_tokenizer(*args, **kwargs)
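# --- Hedged usage sketch (not in the original source): the replacement API named in
# the deprecation message above. The tokenize_text() method and the "fra" language
# code are assumptions for illustration.
tokenizer = make_tokenizer("fra")
units = tokenizer.tokenize_text("ceci est un test")  # word and non-word units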
def escape_special_characters(to_escape: Dict[str, str]) -> Dict[str, str]:
    for k, v in to_escape.items():
        if isinstance(v, str):
            escaped = re.escape(v)
        else:
            escaped = v
        if escaped != v:
            LOGGER.info(
                f"Escaped special characters in '{v}' with '{escaped}'. "
                "Set 'escape_special' to False in your Mapping configuration to disable this.")
        to_escape[k] = escaped
    return to_escape
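# --- Hedged illustration (not in the original source): a rule whose input contains a
# regex metacharacter gets escaped before it is compiled into a pattern.
rule = {"in": "c+", "out": "tʃ"}
escaped_rule = escape_special_characters(rule)
# escaped_rule["in"] is now "c\\+", so the "+" is matched literally rather than as a repeat.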
def rule_to_regex(self, rule: dict) -> Pattern:
    """Turns an input string (and the context) from an input/output pair into a
    regular expression pattern.

    The 'in' key is the match.
    The 'context_after' key creates a lookahead.
    The 'context_before' key creates a lookbehind.

    Args:
        rule: A dictionary containing 'in', 'out', 'context_before', and 'context_after' keys

    Raises:
        Exception: This is raised when un-supported regex characters or symbols exist in the rule

    Returns:
        Pattern: returns a regex pattern (re.Pattern)
        bool: returns False if input is null
    """
    # Prevent null input. See, https://github.com/roedoejet/g2p/issues/24
    if not rule['in']:
        LOGGER.warning(
            f'Rule with input \'{rule["in"]}\' and output \'{rule["out"]}\' has no input. '
            'This is disallowed. Please check your mapping file for rules with null inputs.'
        )
        return False
    if "context_before" in rule and rule['context_before']:
        before = rule["context_before"]
    else:
        before = ''
    if 'context_after' in rule and rule['context_after']:
        after = rule["context_after"]
    else:
        after = ''
    input_match = re.sub(re.compile(r'{\d+}'), "", rule['in'])
    try:
        inp = create_fixed_width_lookbehind(before) + input_match
        if after:
            inp += f"(?={after})"
        if not self.kwargs['case_sensitive']:
            rule_regex = re.compile(inp, re.I)
        else:
            rule_regex = re.compile(inp)
    except:
        in_lang = self.kwargs.get('in_lang', 'und')
        out_lang = self.kwargs.get('out_lang', 'und')
        LOGGER.error(
            f'Your regex in mapping between {in_lang} and {out_lang} is malformed. '
            f'Do you have un-escaped regex characters in your input {inp}, contexts {before}, {after}?'
        )
        raise Exception(
            f'Your regex in mapping between {in_lang} and {out_lang} is malformed. '
            f'Do you have un-escaped regex characters in your input {inp}, contexts {before}, {after}?'
        )
    return rule_regex
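# --- Hedged illustration (not in the original source) of the pattern shape produced by
# rule_to_regex(); the rule itself is invented for the example.
example_rule = {"in": "k", "out": "g", "context_before": "n", "context_after": "i"}
# For such a rule the compiled pattern is roughly (?<=n)k(?=i): a fixed-width
# lookbehind from create_fixed_width_lookbehind(), the bare input match, and a
# lookahead for the following context.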
def test_convert(self):
    error_count = 0
    for test in self.langs_to_test:
        output_string = self.runner.invoke(
            convert, [test[2], test[0], test[1]]).stdout.strip()
        if output_string != test[3]:
            LOGGER.warning(
                "test_cli.py: mapping error: {} from {} to {} should be {}, got {}".format(
                    test[2], test[0], test[1], test[3], output_string))
            if error_count == 0:
                first_failed_test = test
            error_count += 1
    if error_count > 0:
        output_string = self.runner.invoke(
            convert,
            [first_failed_test[2], first_failed_test[0], first_failed_test[1]],
        ).stdout.strip()
        self.assertEqual(output_string, first_failed_test[3])
def test_response_code_with_args(self):
    ''' Ensure all args return 200 '''
    for ep in self.routes_only_args:
        for node in LANGS_NETWORK.nodes:
            rt = re.sub(self.arg_match, node, ep)
            try:
                r = self.client().get(rt)
                self.assertEqual(r.status_code, 200)
            except:
                LOGGER.error("Couldn't connect. Is flask running?")
        LOGGER.debug("Successfully tested " + str(len(LANGS_NETWORK.nodes)) +
                     " node resources at route " + ep + " .")
def normalize(inp: str, norm_form: str):
    ''' Normalize to NFC(omposed) or NFD(ecomposed).
        Also, find any Unicode Escapes & decode 'em!
    '''
    if norm_form not in ['none', 'NFC', 'NFD', 'NFKC', 'NFKD']:
        raise exceptions.InvalidNormalization(normalize)
    elif norm_form is None or norm_form == 'none':
        return inp
    else:
        normalized = ud.normalize(norm_form, unicode_escape(inp))
        if normalized != inp:
            LOGGER.info(
                'The string %s was normalized to %s using the %s standard and by decoding any Unicode escapes. Note that this is not necessarily the final stage of normalization.',
                inp, normalized, norm_form)
        return normalized
def get_distance_method(dst, distance: str):
    if distance not in DISTANCE_METRICS:
        raise ValueError(f"Distance metric {distance} not supported")
    try:
        distance_method = getattr(dst, distance)
    except AttributeError as e:
        # Older versions of panphon misspelled Dolgopolsky's name as Dogolpolsky...
        # Try again with the older name, so we stay compatible with both <=0.19
        # and >=0.19.1
        if distance == "dolgo_prime_distance":
            return getattr(dst, "dogol_prime_distance")
        LOGGER.error(f"The distance metric {distance} is not supported by PanPhon")
        raise ValueError(f"Distance metric {distance} not supported") from e
    return distance_method
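# --- Hedged usage sketch (not in the original source): panphon's Distance object
# exposes each supported metric as a method, so the lookup above can be used like this.
import panphon.distance
dst = panphon.distance.Distance()
weighted = get_distance_method(dst, "weighted_feature_edit_distance")
score = weighted("tʃ", "ʃ")  # smaller scores mean phonetically closer strings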
def find_good_match(p1, inventory_l2):
    """Find a good sequence in inventory_l2 matching p1."""
    # The proper way to do this would be with some kind of beam search
    # through a determinized/minimized FST, but in the absence of that
    # we can do a kind of heuristic greedy search. (we don't want any
    # dependencies outside of PyPI otherwise we'd just use OpenFST)
    p1_pseq = dst.fm.ipa_segs(p1)
    i = 0
    good_match = []
    while i < len(p1_pseq):
        best_input = ""
        best_output = -1
        best_score = 0xDEADBEEF
        for j, p2_pseq in enumerate(p2_pseqs):
            # FIXME: Should also consider the (weighted) possibility
            # of deleting input or inserting any segment (but that
            # can't be done with a greedy search)
            if len(p2_pseq) == 0:
                LOGGER.warning(
                    "No panphon mapping for %s - skipping", inventory_l2[j]
                )
                continue
            e = min(i + len(p2_pseq), len(p1_pseq))
            input_seg = p1_pseq[i:e]
            distance_method = get_distance_method(dst, distance)
            score = distance_method("".join(input_seg), "".join(p2_pseq))
            # Be very greedy and take the longest match
            if (
                score < best_score
                or score == best_score
                and len(input_seg) > len(best_input)
            ):
                best_input = input_seg
                best_output = j
                best_score = score
        LOGGER.debug(
            "Best match at position %d: %s => %s",
            i,
            best_input,
            inventory_l2[best_output],
        )
        good_match.append(inventory_l2[best_output])
        i += len(best_input)  # greedy!
    return "".join(good_match)
def setUp(self):
    DATA_DIR = os.path.dirname(data_dir)
    self.langs_to_test = []
    for fn in glob(f'{DATA_DIR}/*.*sv'):
        if fn.endswith('csv'):
            delimiter = ','
        elif fn.endswith('psv'):
            delimiter = '|'
        elif fn.endswith('tsv'):
            delimiter = '\t'
        with open(fn, encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter=delimiter)
            for row in reader:
                if len(row) != 4:
                    LOGGER.warning(
                        f'Row in {fn} containing values {row} does not have the right values. Please check your data.')
                else:
                    self.langs_to_test.append(row)
def create_mapping(
    mapping_1: Mapping,
    mapping_2: Mapping,
    mapping_1_io: str = "out",
    mapping_2_io: str = "in",
    distance: str = "weighted_feature_edit_distance",
) -> Mapping:
    """Create a mapping from mapping_1's output inventory to mapping_2's input inventory"""
    map_1_name = mapping_1.kwargs[f"{mapping_1_io}_lang"]
    map_2_name = mapping_2.kwargs[f"{mapping_2_io}_lang"]
    if not is_ipa(map_1_name) and not is_xsampa(map_1_name):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s (must be ipa or x-sampa)",
            map_1_name,
        )
    if not is_ipa(map_2_name) and not is_xsampa(map_2_name):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s (must be ipa or x-sampa)",
            map_2_name,
        )
    l1_is_xsampa, l2_is_xsampa = is_xsampa(map_1_name), is_xsampa(map_2_name)
    mapping = align_inventories(
        mapping_1.inventory(mapping_1_io),
        mapping_2.inventory(mapping_2_io),
        l1_is_xsampa,
        l2_is_xsampa,
        distance=distance,
    )
    # Initialize mapping with input language parameters (as_is,
    # case_sensitive, prevent_feeding, etc)
    config = mapping_1.kwargs.copy()
    # Fix up names, etc.
    if "authors" in config:
        del config["authors"]
    if "display_name" in config:
        del config["display_name"]
    if "language_name" in config:
        del config["language_name"]
    config["prevent_feeding"] = True
    config["in_lang"] = map_1_name
    config["out_lang"] = map_2_name
    config["mapping"] = mapping
    mapping = Mapping(**config)
    return mapping
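# --- Hedged usage sketch (not in the original source): chain two installed mappings
# through their IPA tiers. The lookup-by-name Mapping constructor and the language
# codes are assumptions for illustration.
fra_ipa = Mapping(in_lang="fra", out_lang="fra-ipa")
eng_ipa_arpabet = Mapping(in_lang="eng-ipa", out_lang="eng-arpabet")
fra_to_eng_ipa = create_mapping(fra_ipa, eng_ipa_arpabet)  # maps fra-ipa onto eng-ipa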
def test_check_with_equiv(self):
    transducer = make_g2p("tau", "eng-arpabet", tok_lang="tau")
    tau_ipa = make_g2p("tau", "tau-ipa", tok_lang="tau")(
        "sh'oo Jign maasee' do'eent'aa shyyyh"
    ).output_string
    self.assertTrue(utils.is_panphon(tau_ipa))
    eng_ipa = make_g2p("tau", "eng-ipa", tok_lang="tau")(
        "sh'oo Jign maasee' do'eent'aa shyyyh"
    ).output_string
    self.assertTrue(utils.is_panphon(eng_ipa))
    eng_arpabet = make_g2p("tau", "eng-arpabet", tok_lang="tau")(
        "sh'oo Jign maasee' do'eent'aa shyyyh"
    ).output_string
    self.assertTrue(utils.is_arpabet(eng_arpabet))
    LOGGER.warning(
        f"tau-ipa {tau_ipa}\neng-ipa {eng_ipa}\n eng-arpabet {eng_arpabet}"
    )
    self.assertTrue(
        transducer.check(transducer("sh'oo Jign maasee' do'eent'aa shyyyh")))
def test_io(self):
    # go through each language declared in the test case set up
    # Instead of asserting immediately, we go through all the cases first, so that
    # running test_langs.py prints all the errors at once, to help debugging a given g2p mapping.
    # Then we call assertEqual on the first failed case, to make unittest register the failure.
    error_count = 0
    for test in self.langs_to_test:
        transducer = make_g2p(test[0], test[1])
        output_string = transducer(test[2]).output_string
        if output_string != test[3]:
            LOGGER.warning(
                "test_langs.py: mapping error: {} from {} to {} should be {}, got {}".format(
                    test[2], test[0], test[1], test[3], output_string))
            if error_count == 0:
                first_failed_test = test
            error_count += 1
    if error_count > 0:
        transducer = make_g2p(first_failed_test[0], first_failed_test[1])
        self.assertEqual(
            transducer(first_failed_test[2]).output_string, first_failed_test[3])
def run_tests(suite):
    ''' Decide which Test Suite to run '''
    if suite == 'all':
        suite = LOADER.discover(os.path.dirname(__file__))
    if suite == 'trans':
        suite = TestSuite(TRANSDUCER_TESTS)
    if suite == 'langs':
        suite = TestSuite(LANGS_TESTS)
    if suite == 'mappings':
        suite = TestSuite(MAPPINGS_TESTS)
    elif suite == 'dev':
        suite = TestSuite(DEV_TESTS)
    runner = TextTestRunner(verbosity=3)
    if isinstance(suite, str):
        LOGGER.error("Please specify a test suite to run: i.e. 'dev' or 'all'")
    else:
        runner.run(suite)
def normalize(inp: str, norm_form: str):
    """
    Normalize to NFC(omposed) or NFD(ecomposed).
    Also, find any Unicode Escapes & decode 'em!
    """
    if norm_form not in ["none", "NFC", "NFD", "NFKC", "NFKD"]:
        raise exceptions.InvalidNormalization(normalize)
    elif norm_form is None or norm_form == "none":
        return unicode_escape(inp)
    else:
        normalized = ud.normalize(norm_form, unicode_escape(inp))
        if normalized != inp:
            LOGGER.debug(
                "The string %s was normalized to %s using the %s standard and by decoding any Unicode escapes. "
                "Note that this is not necessarily the final stage of normalization.",
                inp,
                normalized,
                norm_form,
            )
        return normalized
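# --- Hedged illustration (not in the original source), assuming unicode_escape()
# decodes \uXXXX sequences as its name and the docstring suggest: both calls should
# return the composed single code point "á" (U+00E1).
print(normalize("a\u0301", "NFC"))   # combining acute is composed onto the "a"
print(normalize(r"a\u0301", "NFC"))  # the escape is decoded first, then composed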
def find_good_match(p1, inventory_l2, l2_is_xsampa=False):
    """Find a good sequence in inventory_l2 matching p1."""
    dst = panphon.distance.Distance()
    # The proper way to do this would be with some kind of beam search
    # through a determinized/minimized FST, but in the absence of that
    # we can do a kind of heuristic greedy search. (we don't want any
    # dependencies outside of PyPI otherwise we'd just use OpenFST)
    p1_pseq = dst.fm.ipa_segs(p1)
    p2_pseqs = [
        dst.fm.ipa_segs(p) for p in process_characters(inventory_l2, l2_is_xsampa)
    ]
    i = 0
    good_match = []
    while i < len(p1_pseq):
        best_input = ""
        best_output = -1
        best_score = 0xdeadbeef
        for j, p2_pseq in enumerate(p2_pseqs):
            # FIXME: Should also consider the (weighted) possibility
            # of deleting input or inserting any segment (but that
            # can't be done with a greedy search)
            if len(p2_pseq) == 0:
                LOGGER.warning('No panphon mapping for %s - skipping',
                               inventory_l2[j])
                continue
            e = min(i + len(p2_pseq), len(p1_pseq))
            input_seg = p1_pseq[i:e]
            score = dst.weighted_feature_edit_distance(''.join(input_seg),
                                                       ''.join(p2_pseq))
            # Be very greedy and take the longest match
            if (score < best_score
                    or score == best_score and len(input_seg) > len(best_input)):
                best_input = input_seg
                best_output = j
                best_score = score
        LOGGER.debug('Best match at position %d: %s => %s',
                     i, best_input, inventory_l2[best_output])
        good_match.append(inventory_l2[best_output])
        i += len(best_input)  # greedy!
    return ''.join(good_match)
def check_ipa_known_segs(mappings_to_check=False):
    dst = distance.Distance()
    if not mappings_to_check:
        mappings_to_check = [x['out_lang'] for x in MAPPINGS_AVAILABLE]
    found_error = False
    for mapping in [
        x for x in MAPPINGS_AVAILABLE if x['out_lang'] in mappings_to_check
    ]:
        if mapping['out_lang'].endswith('-ipa'):
            for rule in mapping['mapping_data']:
                joined_ipa_segs = ''.join(dst.fm.ipa_segs(rule['out']))
                if not joined_ipa_segs == rule['out']:
                    LOGGER.warning(
                        f"Output '{rule['out']}' in rule {rule} in mapping between {mapping['in_lang']} and {mapping['out_lang']} is not recognized as valid IPA by panphon. You may ignore this warning if you know it gets remapped to IPA later."
                    )
                    found_error = True
    if found_error:
        LOGGER.warning(
            "Please refer to https://github.com/dmort27/panphon for information about panphon."
        )
def setUp(self):
    self.runner = APP.test_cli_runner()
    self.data_dir = os.path.dirname(data_dir)
    self.langs_to_test = []
    for fn in glob(os.path.join(self.data_dir, "*.*sv")):
        if fn.endswith("csv"):
            delimiter = ","
        elif fn.endswith("psv"):
            delimiter = "|"
        elif fn.endswith("tsv"):
            delimiter = "\t"
        with open(fn, encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter=delimiter)
            for row in reader:
                if len(row) < 4:
                    LOGGER.warning(
                        f"Row in {fn} containing values {row} does not have the right values. "
                        "Please check your data.")
                else:
                    self.langs_to_test.append(row)
def create_mapping(l1_mapping: Mapping, l2_mapping: Mapping) -> Mapping:
    ''' Create a mapping from the output of l1 and input of l2.
        Both must be either ipa or x-sampa.
    '''
    l1 = l1_mapping.kwargs['out_lang']
    l2 = l2_mapping.kwargs['in_lang']
    inv_l1 = l1_mapping.inventory("out")
    inv_l2 = l2_mapping.inventory()
    if not is_ipa(l1) and not is_xsampa(l1):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s"
            " (must be ipa or x-sampa)", l1)
    if not is_ipa(l2) and not is_xsampa(l2):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s"
            " (must be ipa or x-sampa)", l2)
    mapping = align_inventories(inv_l1["inventory"], inv_l2["inventory"],
                                is_xsampa(l1), is_xsampa(l2))
    output_mapping = Mapping(mapping, in_lang=l1, out_lang=l2)
    return output_mapping
def scan(lang, path):
    """ Returns the set of non-mapped characters in a document.
        Accounts for case sensitivity in the configuration.
    """
    # Check input lang exists
    if lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(f"'{lang}' is not a valid value for 'LANG'")
    # Retrieve the mappings for lang
    case_sensitive = True
    mappings = []
    for mapping in MAPPINGS_AVAILABLE:
        mapping_name = mapping["in_lang"]
        # Exclude mappings for converting between IPAs
        if mapping_name.startswith(lang) and "ipa" not in mapping_name:
            case_sensitive = case_sensitive and mapping.get("case_sensitive", True)
            mappings.append(mapping)
    # Get input chars in mapping
    mapped_chars = set()
    for lang_mapping in mappings:
        for x in lang_mapping["mapping_data"]:
            mapped_chars.add(normalize(x["in"], "NFD"))
    # Find unmapped chars
    filter_chars = " \n"
    mapped_string = "".join(mapped_chars)
    pattern = "[^" + mapped_string + filter_chars + ".]"
    prog = re.compile(pattern)
    with open(path, "r", encoding="utf8") as file:
        data = normalize(file.read(), "NFD")
        if not case_sensitive:
            data = data.lower()
        unmapped = set(prog.findall(data))
        if unmapped:
            LOGGER.warning("The following characters are not mapped:")
            print(unmapped)
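# --- Hedged usage sketch (not in the original source): scan() raises click.UsageError,
# so it appears to be a Click command and can be exercised with Click's test runner.
# The language code and file name are illustrative only.
from click.testing import CliRunner
result = CliRunner().invoke(scan, ["fra", "my_document.txt"])
print(result.output)  # reports any characters the fra mappings do not cover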