def test_response_code(self):
    ''' Ensure all routes return 200 '''
    for rt in self.routes_no_args:
        try:
            r = self.client().get(rt)
            self.assertEqual(r.status_code, 200)
            LOGGER.debug("Route " + rt + " returned " + str(r.status_code))
        except Exception:
            LOGGER.error("Couldn't connect. Is flask running?")
def escape_special_characters(to_escape: Dict[str, str]) -> Dict[str, str]:
    for k, v in to_escape.items():
        if isinstance(v, str):
            escaped = re.escape(v)
        else:
            escaped = v
        if escaped != v:
            LOGGER.debug(
                f"Escaped special characters in '{v}' with '{escaped}'. "
                "Set 'escape_special' to False in your Mapping configuration to disable this."
            )
        to_escape[k] = escaped
    return to_escape
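# A minimal sketch (added for illustration; not part of the original source)
# of the escaping primitive used above: re.escape() backslash-escapes regex
# metacharacters and leaves plain text alone, so a mapping value like "a.b"
# becomes "a\.b" and triggers the debug message, while "ab" passes through
# unchanged and nothing is logged.
import re

assert re.escape("a.b") == r"a\.b"  # metacharacter "." escaped
assert re.escape("ab") == "ab"      # no metacharacters, value unchanged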
def test_response_code_with_args(self):
    ''' Ensure all args return 200 '''
    for ep in self.routes_only_args:
        for node in LANGS_NETWORK.nodes:
            rt = re.sub(self.arg_match, node, ep)
            try:
                r = self.client().get(rt)
                self.assertEqual(r.status_code, 200)
            except Exception:
                LOGGER.error("Couldn't connect. Is flask running?")
        LOGGER.debug("Successfully tested " + str(len(LANGS_NETWORK.nodes)) +
                     " node resources at route " + ep + ".")
def normalize(inp: str, norm_form: str):
    ''' Normalize to NFC(omposed) or NFD(ecomposed).
        Also, find any Unicode Escapes & decode 'em!
    '''
    if norm_form is None or norm_form == 'none':
        return unicode_escape(inp)
    elif norm_form not in ['NFC', 'NFD', 'NFKC', 'NFKD']:
        raise exceptions.InvalidNormalization(norm_form)
    else:
        normalized = ud.normalize(norm_form, unicode_escape(inp))
        if normalized != inp:
            LOGGER.debug(
                'The string %s was normalized to %s using the %s standard and by decoding any Unicode escapes. '
                'Note that this is not necessarily the final stage of normalization.',
                inp, normalized, norm_form)
        return normalized
def find_good_match(p1, inventory_l2):
    """Find a good sequence in inventory_l2 matching p1."""
    # The proper way to do this would be with some kind of beam search
    # through a determinized/minimized FST, but in the absence of that
    # we can do a kind of heuristic greedy search. (we don't want any
    # dependencies outside of PyPI otherwise we'd just use OpenFST)
    # NOTE: dst (a panphon.distance.Distance), distance (the name of the
    # distance method to use) and p2_pseqs (the panphon segmentation of
    # inventory_l2) are assumed to be defined in the enclosing scope; see
    # the variant below for how they are built.
    p1_pseq = dst.fm.ipa_segs(p1)
    i = 0
    good_match = []
    while i < len(p1_pseq):
        best_input = ""
        best_output = -1
        best_score = 0xDEADBEEF  # effectively +infinity as an initial score
        for j, p2_pseq in enumerate(p2_pseqs):
            # FIXME: Should also consider the (weighted) possibility
            # of deleting input or inserting any segment (but that
            # can't be done with a greedy search)
            if len(p2_pseq) == 0:
                LOGGER.warning(
                    "No panphon mapping for %s - skipping", inventory_l2[j]
                )
                continue
            e = min(i + len(p2_pseq), len(p1_pseq))
            input_seg = p1_pseq[i:e]
            distance_method = get_distance_method(dst, distance)
            score = distance_method("".join(input_seg), "".join(p2_pseq))
            # Be very greedy and take the longest match
            if (
                score < best_score
                or score == best_score
                and len(input_seg) > len(best_input)
            ):
                best_input = input_seg
                best_output = j
                best_score = score
        LOGGER.debug(
            "Best match at position %d: %s => %s",
            i,
            best_input,
            inventory_l2[best_output],
        )
        good_match.append(inventory_l2[best_output])
        i += len(best_input)  # greedy!
    return "".join(good_match)
def normalize(inp: str, norm_form: str): """ Normalize to NFC(omposed) or NFD(ecomposed). Also, find any Unicode Escapes & decode 'em! """ if norm_form not in ["none", "NFC", "NFD", "NFKC", "NFKD"]: raise exceptions.InvalidNormalization(normalize) elif norm_form is None or norm_form == "none": return unicode_escape(inp) else: normalized = ud.normalize(norm_form, unicode_escape(inp)) if normalized != inp: LOGGER.debug( "The string %s was normalized to %s using the %s standard and by decoding any Unicode escapes. " "Note that this is not necessarily the final stage of normalization.", inp, normalized, norm_form, ) return normalized
def load_mapping_from_path(path_to_mapping_config, index=0):
    ''' Loads a mapping from a path, if there is more than one mapping,
        then it loads based on the int provided to the 'index' argument.
        Default is 0.
    '''
    path = Path(path_to_mapping_config)
    # If path leads to actual mapping config
    if path.exists() and (path.suffix.endswith('yml') or path.suffix.endswith('yaml')):
        # safe load it
        with open(path, encoding='utf8') as f:
            mapping = yaml.safe_load(f)
        # If more than one mapping in the mapping config
        if 'mappings' in mapping:
            try:
                LOGGER.debug(
                    'Loading mapping from %s between "%s" and "%s" at index %s',
                    path_to_mapping_config,
                    mapping['mappings'][index].get('in_lang', 'und'),
                    mapping['mappings'][index].get('out_lang', 'und'),
                    index)
                mapping = mapping['mappings'][index]
            except (KeyError, IndexError):
                LOGGER.warning(
                    'An index of %s was provided for the mapping %s but that index does not exist in the mapping. Please check your mapping.',
                    index, path_to_mapping_config)
        # Log a warning if an index other than 0 was provided for a mapping config with a single mapping.
        elif index != 0:
            LOGGER.warning(
                'An index of %s was provided for the mapping %s but that index does not exist in the mapping. Please check your mapping.',
                index, path_to_mapping_config)
        # try to load the data from the mapping data file
        if 'mapping' in mapping:
            mapping['mapping_data'] = load_from_file(
                os.path.join(path.parent, mapping['mapping']))
        else:
            # The "mapping" key is required
            raise exceptions.MalformedMapping
        # load any abbreviations
        if 'abbreviations' in mapping:
            mapping['abbreviations_data'] = load_abbreviations_from_file(
                os.path.join(path.parent, mapping['abbreviations']))
        return mapping
    else:
        raise FileNotFoundError
def find_good_match(p1, inventory_l2, l2_is_xsampa=False):
    """Find a good sequence in inventory_l2 matching p1."""
    dst = panphon.distance.Distance()
    # The proper way to do this would be with some kind of beam search
    # through a determinized/minimized FST, but in the absence of that
    # we can do a kind of heuristic greedy search. (we don't want any
    # dependencies outside of PyPI otherwise we'd just use OpenFST)
    p1_pseq = dst.fm.ipa_segs(p1)
    p2_pseqs = [
        dst.fm.ipa_segs(p)
        for p in process_characters(inventory_l2, l2_is_xsampa)
    ]
    i = 0
    good_match = []
    while i < len(p1_pseq):
        best_input = ''
        best_output = -1
        best_score = 0xdeadbeef  # effectively +infinity as an initial score
        for j, p2_pseq in enumerate(p2_pseqs):
            # FIXME: Should also consider the (weighted) possibility
            # of deleting input or inserting any segment (but that
            # can't be done with a greedy search)
            if len(p2_pseq) == 0:
                LOGGER.warning('No panphon mapping for %s - skipping',
                               inventory_l2[j])
                continue
            e = min(i + len(p2_pseq), len(p1_pseq))
            input_seg = p1_pseq[i:e]
            score = dst.weighted_feature_edit_distance(''.join(input_seg),
                                                       ''.join(p2_pseq))
            # Be very greedy and take the longest match
            if (score < best_score
                    or score == best_score and len(input_seg) > len(best_input)):
                best_input = input_seg
                best_output = j
                best_score = score
        LOGGER.debug('Best match at position %d: %s => %s',
                     i, best_input, inventory_l2[best_output])
        good_match.append(inventory_l2[best_output])
        i += len(best_input)  # greedy!
    return ''.join(good_match)
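# A minimal sketch (added for illustration; not part of the original source)
# of the panphon primitives the greedy search above relies on; requires the
# panphon package, and exact segmentations/scores depend on its data files.
import panphon.distance

dst = panphon.distance.Distance()
segs = dst.fm.ipa_segs("kʷa")  # split an IPA string into panphon segments
score = dst.weighted_feature_edit_distance("k", "kʷ")
print(segs, score)  # a small score marks a close phonological match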
def make_g2p(in_lang: str, out_lang: str):
    # Check in_lang is a node in network
    if in_lang not in LANGS_NETWORK.nodes:
        LOGGER.error(f"No lang called {in_lang}. Please try again.")
        raise FileNotFoundError(f"No lang called {in_lang}.")
    # Check out_lang is a node in network
    if out_lang not in LANGS_NETWORK.nodes:
        LOGGER.error(f"No lang called {out_lang}. Please try again.")
        raise FileNotFoundError(f"No lang called {out_lang}.")
    # Try to find the shortest path between the nodes
    try:
        path = shortest_path(LANGS_NETWORK, in_lang, out_lang)
    except NetworkXNoPath:
        LOGGER.error(
            f"Sorry, we couldn't find a way to convert {in_lang} to {out_lang}. Please update your langs by running `g2p update` and try again."
        )
        raise
    # Find all mappings needed
    mappings_needed = []
    for i, lang in enumerate(path):
        try:
            mapping = Mapping(in_lang=path[i], out_lang=path[i + 1])
            LOGGER.debug(
                f"Adding mapping between {path[i]} and {path[i + 1]} to composite transducer."
            )
            mappings_needed.append(mapping)
        except IndexError:
            continue
    # Either return a Transducer or a CompositeTransducer
    if len(mappings_needed) == 1:
        return Transducer(mappings_needed[0])
    else:
        return CompositeTransducer([Transducer(x) for x in mappings_needed])
def make_g2p(in_lang: str, out_lang: str, tok_lang=None):
    """Make a g2p Transducer for mapping text from in_lang to out_lang
    via the shortest path between them.

    Args:
        in_lang (str): input language code
        out_lang (str): output language code
        tok_lang (str): optional language code to tokenize with before
            transducing, or "path" to build the tokenizer from the
            conversion path itself

    Returns:
        Transducer from in_lang to out_lang

    Raises:
        InvalidLanguageCode: if in_lang or out_lang don't exist
        NoPath: if there is no path between in_lang and out_lang
    """
    if (in_lang, out_lang, tok_lang) in _g2p_cache:
        return _g2p_cache[(in_lang, out_lang, tok_lang)]

    # Check in_lang is a node in network
    if in_lang not in LANGS_NETWORK.nodes:
        LOGGER.error(f"No lang called '{in_lang}'. Please try again.")
        raise InvalidLanguageCode(in_lang)

    # Check out_lang is a node in network
    if out_lang not in LANGS_NETWORK.nodes:
        LOGGER.error(f"No lang called '{out_lang}'. Please try again.")
        raise InvalidLanguageCode(out_lang)

    if in_lang == out_lang:
        LOGGER.error(
            "Sorry, you can't transduce between the same language. Please select a different output language code."
        )
        raise NoPath(in_lang, out_lang)

    # Try to find the shortest path between the nodes
    try:
        path = shortest_path(LANGS_NETWORK, in_lang, out_lang)
    except NetworkXNoPath as e:
        LOGGER.error(
            f"Sorry, we couldn't find a way to convert {in_lang} to {out_lang}. Please update your langs by running `g2p update` and try again."
        )
        raise NoPath(in_lang, out_lang) from e

    # Find all mappings needed
    mappings_needed = []
    for lang1, lang2 in zip(path[:-1], path[1:]):
        mapping = Mapping(in_lang=lang1, out_lang=lang2)
        LOGGER.debug(
            f"Adding mapping between {lang1} and {lang2} to composite transducer."
        )
        mappings_needed.append(mapping)

    # Either construct a Transducer or a CompositeTransducer
    if len(mappings_needed) == 1:
        transducer = Transducer(mappings_needed[0])
    else:
        transducer = CompositeTransducer(
            [Transducer(x) for x in mappings_needed])

    # If tokenization was requested, return a TokenizingTransducer
    if tok_lang:
        if tok_lang == "path":
            tokenizer = make_tokenizer(in_lang=in_lang, tok_path=path)
        else:
            tokenizer = make_tokenizer(in_lang=tok_lang)
        transducer = TokenizingTransducer(transducer, tokenizer)

    _g2p_cache[(in_lang, out_lang, tok_lang)] = transducer
    return transducer
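# Usage sketch (added for illustration), following the g2p library's
# documented quick-start; the language codes below are examples and any
# pair of connected codes in LANGS_NETWORK works the same way:
from g2p import make_g2p

transducer = make_g2p("dan", "eng-arpabet")
print(transducer("hej").output_string)  # the Danish word rendered in ARPABET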
def load_mapping_from_path(path_to_mapping_config, index=0):
    """
    Loads a mapping from a path, if there is more than one mapping,
    then it loads based on the int provided to the 'index' argument.
    Default is 0.
    """
    path = Path(path_to_mapping_config)
    # If path leads to actual mapping config
    if path.exists() and (path.suffix.endswith("yml") or path.suffix.endswith("yaml")):
        # safe load it
        with open(path, encoding="utf8") as f:
            mapping = yaml.safe_load(f)
        # If more than one mapping in the mapping config
        if "mappings" in mapping:
            try:
                LOGGER.debug(
                    'Loading mapping from %s between "%s" and "%s" at index %s',
                    path_to_mapping_config,
                    mapping["mappings"][index].get("in_lang", "und"),
                    mapping["mappings"][index].get("out_lang", "und"),
                    index,
                )
                mapping = mapping["mappings"][index]
            except (KeyError, IndexError):
                LOGGER.warning(
                    "An index of %s was provided for the mapping %s but that index does not exist in the mapping. "
                    "Please check your mapping.",
                    index,
                    path_to_mapping_config,
                )
        # Log a warning if an index other than 0 was provided for a mapping config with a single mapping.
        elif index != 0:
            LOGGER.warning(
                "An index of %s was provided for the mapping %s but that index does not exist in the mapping. "
                "Please check your mapping.",
                index,
                path_to_mapping_config,
            )
        # try to load the data from the mapping data file
        if "mapping" in mapping:
            try:
                mapping["mapping_data"] = load_from_file(
                    os.path.join(path.parent, mapping["mapping"])
                )
            except (OSError, exceptions.IncorrectFileType) as e:
                raise exceptions.MalformedMapping(
                    f"Cannot load mapping data file specified in {path}: {e}"
                ) from e
        elif mapping.get("type", "") == "unidecode":
            # This mapping is not implemented as a regular mapping, but as custom software
            pass
        else:
            # The "mapping" key is required for all other mapping types
            raise exceptions.MalformedMapping(
                f'Key "mapping:" missing from a mapping in {path}.'
            )
        # load any abbreviations
        if "abbreviations" in mapping:
            try:
                mapping["abbreviations_data"] = load_abbreviations_from_file(
                    os.path.join(path.parent, mapping["abbreviations"])
                )
            except (OSError, exceptions.IncorrectFileType) as e:
                raise exceptions.MalformedMapping(
                    f"Cannot load abbreviations data file specified in {path}: {e}"
                ) from e
        return mapping
    else:
        raise FileNotFoundError
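# Hedged sketch (added for illustration) of the YAML shape the loader above
# expects; the file names and language codes here are hypothetical:
#
#   # my_mappings.yaml
#   mappings:
#     - in_lang: und
#       out_lang: und-ipa
#       mapping: my_mapping_data.csv
#       abbreviations: my_abbreviations.csv
#
#   mapping = load_mapping_from_path("my_mappings.yaml", index=0)
#   mapping["mapping_data"]        # rows parsed by load_from_file
#   mapping["abbreviations_data"]  # parsed by load_abbreviations_from_file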