Example #1
 def check(
     self,
     tg: TransductionGraph,
     shallow=False,
     display_warnings=False,
     original_input=None,
 ):
     out_lang = self.mapping.kwargs["out_lang"]
     if "eng-arpabet" in out_lang:
         if not is_arpabet(tg.output_string):
             if display_warnings:
                 display_input = (original_input
                                  if original_input else tg.input_string)
                 LOGGER.warning(
                     f'Transducer output "{tg.output_string}" for input "{display_input}" is not fully valid eng-arpabet as recognized by soundswallower.'
                 )
             return False
         else:
             return True
     elif is_ipa(out_lang):
         if not is_panphon(tg.output_string,
                           display_warnings=display_warnings):
             if display_warnings:
                 display_input = (original_input
                                  if original_input else tg.input_string)
                 LOGGER.warning(
                     f'Transducer output "{tg.output_string}" for input "{display_input}" is not fully valid {out_lang}.'
                 )
             return False
         else:
             return True
     else:
         # No check implemented at this tier, just return True
         return True
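
A minimal usage sketch of this check, mirroring Example #22 below (make_g2p and the TransductionGraph argument are the entry points shown in these examples):

from g2p import make_g2p

transducer = make_g2p("tau", "eng-arpabet", tok_lang="tau")
tg = transducer("sh'oo Jign")  # a TransductionGraph
print(transducer.check(tg, display_warnings=True))  # True iff valid eng-arpabet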
Example #2
def network_to_echart(write_to_file: bool = False, layout: bool = False):
    nodes = []
    no_nodes = len(LANGS_NETWORK.nodes)
    for node in LANGS_NETWORK.nodes:
        lang_name = node.split('-')[0]
        no_ancestors = len(ancestors(LANGS_NETWORK, node))
        no_descendants = len(descendants(LANGS_NETWORK, node))
        size = min(
            20,
            max(2, ((no_ancestors / no_nodes) * 100 +
                    (no_descendants / no_nodes) * 100)))
        node = {
            'name': node,
            'symbolSize': size,
            'id': node,
            'category': lang_name
        }
        nodes.append(node)
    nodes.sort(key=lambda x: x['name'])
    edges = []
    for edge in LANGS_NETWORK.edges:
        edges.append({'source': edge[0], 'target': edge[1]})
    if write_to_file:
        with open(
                os.path.join(os.path.dirname(static_file),
                             'languages-network.json'), 'w') as f:
            f.write(json.dumps({'nodes': nodes, 'edges': edges}))
        LOGGER.info('Wrote network nodes and edges to static file.')
    return nodes, edges
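
A toy illustration of the symbolSize computation, assuming LANGS_NETWORK is a networkx DiGraph (ancestors and descendants come from networkx):

import networkx as nx

G = nx.DiGraph([("fra", "fra-ipa"), ("fra-ipa", "eng-arpabet")])
no_nodes = len(G.nodes)
for node in G.nodes:
    size = min(20, max(2, (len(nx.ancestors(G, node)) / no_nodes) * 100
                       + (len(nx.descendants(G, node)) / no_nodes) * 100))
    print(node, size)  # well-connected nodes get larger symbols, capped at 20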
Example #3
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False):
    dummy_inventory = ["ɑ", "i", "u", "t", "s", "n"]
    display_name = mapping.kwargs.get('language_name', 'No Language display name in Config')
    config = generate_config(mapping.kwargs[f'{io}_lang'], 'dummy', display_name, display_name)
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        mapping = align_inventories(mapping.inventory(io), dummy_inventory)
    else:
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower())} for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], dummy_inventory)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']
                
        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                LOGGER.warn(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char       
 
    if write_to_file:
        write_generated_mapping_to_file(config, mapping)
    return config, mapping
Example #4
File: utils.py Project: deltork/g2p
def check_ipa_known_segs(mappings_to_check=False) -> bool:
    """Check the given mappings, or all IPA mappings, for invalid IPA in the "out" fields

    Returns True iff no errors were found.
    """
    if not mappings_to_check:
        mappings_to_check = [x["out_lang"] for x in MAPPINGS_AVAILABLE]
    found_error = False
    for mapping in [
            x for x in MAPPINGS_AVAILABLE if x["out_lang"] in mappings_to_check
    ]:
        if is_ipa(mapping["out_lang"]):
            reverse = mapping.get("reverse", False)
            for rule in mapping["mapping_data"]:
                output = rule["in"] if reverse else rule["out"]
                if not is_panphon(output):
                    LOGGER.warning(
                        f"Output '{rule['out']}' in rule {rule} in mapping between {mapping['in_lang']} "
                        f"and {mapping['out_lang']} is not recognized as valid IPA by panphon."
                    )
                    found_error = True
    if found_error:
        LOGGER.warning(
            "Please refer to https://github.com/dmort27/panphon for information about panphon."
        )
    return not found_error
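
A minimal usage sketch (the out_lang value below is hypothetical):

ok = check_ipa_known_segs(["fra-ipa"])  # validate one mapping's "out" fields
if not ok:
    print("Some rules emit segments panphon does not recognize.")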
Example #5
def align_to_dummy_fallback(mapping: Mapping, io: str = 'in', write_to_file: bool = False, out_dir: str = ''):
    display_name = mapping.kwargs.get('language_name', 'No Language display name in Config')
    config = {'in_lang': mapping.kwargs[f'{io}_lang'], 'out_lang': 'dummy'}
    default_char = 't'
    if is_ipa(mapping.kwargs[f'{io}_lang']):
        mapping = align_inventories(mapping.inventory(io), DUMMY_INVENTORY)
    else:
        und_g2p = make_g2p('und', 'und-ipa')
        mapping = [{"in": unicode_escape(x), "out": und_g2p(unidecode(x).lower()).output_string} for x in mapping.inventory(io)]
        dummy_list = align_inventories([x['out'] for x in mapping], DUMMY_INVENTORY)
        dummy_dict = {}
        for x in dummy_list:
            if x['in']:
                dummy_dict[x['in']] = x['out']
                
        for x in mapping:
            try:
                x['out'] = dummy_dict[x['out']]
            except KeyError:
                LOGGER.warn(f"We couldn't guess at what {x['in']} means, so it's being replaced with '{default_char}' instead.")
                x['out'] = default_char       

    config['mapping'] = mapping
    mapping = Mapping(**config)
    if write_to_file:
        if out_dir:
            if os.path.isdir(out_dir):
                mapping.config_to_file(out_dir)
                mapping.mapping_to_file(out_dir)
            else:
                LOGGER.warning(f'{out_dir} is not a directory. Writing to default instead.')
        else:
            mapping.config_to_file()
            mapping.mapping_to_file()
    return mapping
Example #6
 def rule_to_regex(self, rule: dict) -> Pattern:
     """Turns an input string (and the context) from an input/output pair
     into a regular expression pattern"""
     if "context_before" in rule and rule['context_before']:
         before = rule["context_before"]
     else:
         before = ''
     if 'context_after' in rule and rule['context_after']:
         after = rule["context_after"]
     else:
         after = ''
     input_match = re.sub(re.compile(r'{\d+}'), "", rule['in'])
     try:
         inp = create_fixed_width_lookbehind(before) + input_match
         if after:
             inp += f"(?={after})"
         if not self.kwargs['case_sensitive']:
             rule_regex = re.compile(inp, re.I)
         else:
             rule_regex = re.compile(inp)
     except re.error:
         LOGGER.error(
             f'Your regex in mapping between {self.kwargs["in_lang"]} and '
             f'{self.kwargs["out_lang"]} is malformed. Do you have un-escaped '
             f'regex characters in your input {inp}, contexts {before}, {after}?'
         )
         raise Exception(
             f'Your regex in mapping between {self.kwargs["in_lang"]} and '
             f'{self.kwargs["out_lang"]} is malformed. Do you have un-escaped '
             f'regex characters in your input {inp}, contexts {before}, {after}?'
         )
     return rule_regex
Example #7
def create_mapping(mapping_1: Mapping,
                   mapping_2: Mapping,
                   mapping_1_io: str = 'out',
                   mapping_2_io: str = 'in',
                   write_to_file: bool = False) -> Mapping:
    map_1_name = mapping_1.kwargs[f'{mapping_1_io}_lang']
    map_2_name = mapping_2.kwargs[f'{mapping_2_io}_lang']
    if not is_ipa(map_1_name) and not is_xsampa(map_1_name):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s"
            " (must be ipa or x-sampa)", map_1_name)
    if not is_ipa(map_2_name) and not is_xsampa(map_2_name):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s"
            " (must be ipa or x-sampa)", map_2_name)
    l1_is_xsampa, l2_is_xsampa = is_xsampa(map_1_name), is_xsampa(map_2_name)
    mapping = align_inventories(mapping_1.inventory(mapping_1_io),
                                mapping_2.inventory(mapping_2_io),
                                l1_is_xsampa, l2_is_xsampa)

    l1_display_name = mapping_1.kwargs.get(
        'language_name', 'No Language display name in Config')
    l2_display_name = mapping_2.kwargs.get(
        'language_name', 'No Language display name in Config')

    config = generate_config(map_1_name, map_2_name, l1_display_name,
                             l2_display_name)

    if write_to_file:
        write_generated_mapping_to_file(config, mapping)

    return Mapping(mapping,
                   **{k: v
                      for k, v in config.items() if k != 'mapping'})
Example #8
def write_generated_mapping_to_file(config: dict, mapping: List[dict]):
    # read config
    with open(GEN_CONFIG, 'r') as f:
        data = yaml.safe_load(f)
    map_output_path = os.path.join(GEN_DIR, config['mapping'])
    # write mapping
    if os.path.exists(map_output_path):
        LOGGER.info(f"Overwriting file at {map_output_path}")
    with open(map_output_path, 'w', encoding='utf8') as f:
        json.dump(mapping, f, indent=4)
    data = deepcopy(data)
    cfg_exists = any(x['in_lang'] == config['in_lang'] and
                     x['out_lang'] == config['out_lang']
                     for x in data['mappings'])
    # add new mapping if no mappings are generated yet
    if not data['mappings']:
        data['mappings'] = [config]
        # rewrite config
        with open(GEN_CONFIG, 'w', encoding='utf8') as f:
            yaml.dump(data, f, Dumper=IndentDumper, default_flow_style=False)
    # add new mapping if it doesn't exist yet
    elif not cfg_exists:
        data['mappings'].append(config)
        # rewrite config
        with open(GEN_CONFIG, 'w', encoding='utf8') as f:
            yaml.dump(data, f, Dumper=IndentDumper, default_flow_style=False)
    elif cfg_exists:
        for i, cfg in enumerate(data['mappings']):
            if cfg['in_lang'] == config['in_lang'] and cfg['out_lang'] == config['out_lang']:
                data['mappings'][i] = config
                # rewrite config
                with open(GEN_CONFIG, 'w', encoding='utf8') as f:
                    yaml.dump(data, f, Dumper=IndentDumper,
                              default_flow_style=False)
                break
    else:
        LOGGER.warning(
            f"Not writing generated files because a non-generated mapping from {config['in_lang']} to {config['out_lang']} already exists.")
Example #9
 def test_response_code(self):
     '''
     Ensure all routes return 200
     '''
     for rt in self.routes_no_args:
         try:
             r = self.client().get(rt)
         except Exception:
             self.fail("Couldn't connect. Is flask running?")
         self.assertEqual(r.status_code, 200)
         LOGGER.debug("Route " + rt + " returned " + str(r.status_code))
Example #10
File: api.py Project: deltork/g2p
def update_docs():
    """ Update the swagger documentation with all nodes from the network
    """
    swagger_path = os.path.join(os.path.dirname(static_file), "swagger.json")
    with open(swagger_path) as f:
        data = json.load(f)
    data["components"]["schemas"]["Langs"]["enum"] = sorted(
        LANGS_NETWORK.nodes)
    with open(swagger_path, "w") as f:
        f.write(json.dumps(data))
    LOGGER.info("Updated API documentation")
Example #11
def update_docs():
    ''' Update the swagger documentation with all nodes from the network
    '''
    swagger_path = os.path.join(os.path.dirname(static_file), 'swagger.json')
    with open(swagger_path) as f:
        data = json.load(f)
    data['components']['schemas']['Langs']['enum'] = sorted(LANGS_NETWORK.nodes)
    with open(swagger_path, 'w') as f:
        f.write(json.dumps(data))
    LOGGER.info('Updated API documentation')
Example #12
def get_tokenizer(*args, **kwargs):
    """ Deprecated; use make_tokenizer() instead. """

    global _deprecated_warning_printed
    if not _deprecated_warning_printed:
        LOGGER.warning(
            "g2p.get_tokenizer() / g2p.mappings.tokenizer.get_tokenizer() is deprecated. Import and use g2p.make_tokenizer() instead."
        )
        _deprecated_warning_printed = True

    return make_tokenizer(*args, **kwargs)
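
Because of the module-level flag, the warning fires only once per process; a quick sketch (the language codes are illustrative):

tok1 = get_tokenizer("fra")  # logs the deprecation warning
tok2 = get_tokenizer("eng")  # silent: _deprecated_warning_printed is now True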
Example #13
def escape_special_characters(to_escape: Dict[str, str]) -> Dict[str, str]:
    for k, v in to_escape.items():
        if isinstance(v, str):
            escaped = re.escape(v)
        else:
            escaped = v
        if escaped != v:
            LOGGER.info(
                f"Escaped special characters in '{v}' with '{escaped}'. Set 'escape_special' to False in your Mapping configuration to disable this.")
        to_escape[k] = escaped
    return to_escape
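
The escaping is plain re.escape, so the INFO message fires exactly when escaping changes the value:

import re

print(re.escape("a.b"))  # a\.b  -> differs from the input, message is logged
print(re.escape("abc"))  # abc   -> unchanged, nothing is logged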
Example #14
    def rule_to_regex(self, rule: dict) -> Pattern:
        """Turns an input string (and the context) from an input/output pair
        into a regular expression pattern.

        The 'in' key is the match.
        The 'context_after' key creates a lookahead.
        The 'context_before' key creates a lookbehind.

        Args:
            rule: A dictionary containing 'in', 'out', 'context_before', and 'context_after' keys

        Raises:
            Exception: This is raised when un-supported regex characters or symbols exist in the rule

        Returns:
            Pattern: returns a regex pattern (re.Pattern)
            bool: returns False if input is null
        """
        # Prevent null input. See, https://github.com/roedoejet/g2p/issues/24
        if not rule['in']:
            LOGGER.warning(
                f'Rule with input \'{rule["in"]}\' and output \'{rule["out"]}\' has no input. This is disallowed. Please check your mapping file for rules with null inputs.'
            )
            return False
        if "context_before" in rule and rule['context_before']:
            before = rule["context_before"]
        else:
            before = ''
        if 'context_after' in rule and rule['context_after']:
            after = rule["context_after"]
        else:
            after = ''
        input_match = re.sub(re.compile(r'{\d+}'), "", rule['in'])
        try:
            inp = create_fixed_width_lookbehind(before) + input_match
            if after:
                inp += f"(?={after})"
            if not self.kwargs['case_sensitive']:
                rule_regex = re.compile(inp, re.I)
            else:
                rule_regex = re.compile(inp)
        except re.error:
            in_lang = self.kwargs.get('in_lang', 'und')
            out_lang = self.kwargs.get('out_lang', 'und')
            LOGGER.error(
                f'Your regex in mapping between {in_lang} and {out_lang} is '
                f'malformed. Do you have un-escaped regex characters in your '
                f'input {inp}, contexts {before}, {after}?'
            )
            raise Exception(
                f'Your regex in mapping between {in_lang} and {out_lang} is '
                f'malformed. Do you have un-escaped regex characters in your '
                f'input {inp}, contexts {before}, {after}?'
            )
        return rule_regex
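
A hedged sketch of the kind of pattern this produces for a simple rule; the exact lookbehind syntax comes from create_fixed_width_lookbehind, so the pattern below is an approximation:

import re

rule = {"in": "a", "out": "b", "context_before": "x", "context_after": "y"}
# rule_to_regex(rule) should yield roughly:
pattern = re.compile(r"(?<=x)a(?=y)", re.I)
print(pattern.sub(rule["out"], "xay kap"))  # xby kap - only 'a' between x and y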
Example #15
    def test_convert(self):
        error_count = 0
        for test in self.langs_to_test:
            output_string = self.runner.invoke(convert, [test[2], test[0], test[1]]).stdout.strip()
            if output_string != test[3]:
                LOGGER.warning("test_cli.py: mapping error: {} from {} to {} should be {}, got {}".format(test[2], test[0], test[1], test[3], output_string))
                if error_count == 0:
                    first_failed_test = test
                error_count += 1

        if error_count > 0:
            output_string = self.runner.invoke(convert, [first_failed_test[2], first_failed_test[0], first_failed_test[1]]).stdout.strip()
            self.assertEqual(output_string, first_failed_test[3])
Example #16
 def test_response_code_with_args(self):
     '''
     Ensure all args return 200
     '''
     for ep in self.routes_only_args:
         for node in LANGS_NETWORK.nodes:
             rt = re.sub(self.arg_match, node, ep)
             try:
                 r = self.client().get(rt)
             except Exception:
                 self.fail("Couldn't connect. Is flask running?")
             self.assertEqual(r.status_code, 200)
         LOGGER.debug("Successfully tested " + str(len(LANGS_NETWORK.nodes))
                      + " node resources at route " + ep + ".")
Example #17
def normalize(inp: str, norm_form: str):
    ''' Normalize to NFC(omposed) or NFD(ecomposed).
        Also, find any Unicode Escapes & decode 'em!
    '''
    if norm_form is None or norm_form == 'none':
        return inp
    elif norm_form not in ['NFC', 'NFD', 'NFKC', 'NFKD']:
        raise exceptions.InvalidNormalization(normalize)
    else:
        normalized = ud.normalize(norm_form, unicode_escape(inp))
        if normalized != inp:
            LOGGER.info(
                'The string %s was normalized to %s using the %s standard and by decoding any Unicode escapes. Note that this is not necessarily the final stage of normalization.',
                inp, normalized, norm_form)
        return normalized
Example #18
def get_distance_method(dst, distance: str):
    if distance not in DISTANCE_METRICS:
        raise ValueError(f"Distance metric {distance} not supported")
    try:
        distance_method = getattr(dst, distance)
    except AttributeError as e:
        # Older versions of panphon misspelled Dolgopolsky's name as Dogolpolsky...
        # Try again with the older name, so we stay compatible with both <=0.19
        # and >=0.19.1
        if distance == "dolgo_prime_distance":
            return getattr(dst, "dogol_prime_distance")

        LOGGER.error(f"The distance metric {distance} is not supported by PanPhon")
        raise ValueError(f"Distance metric {distance} not supported") from e
    return distance_method
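
A usage sketch, assuming panphon's distance module as instantiated in Example #26:

import panphon.distance

dst = panphon.distance.Distance()
method = get_distance_method(dst, "weighted_feature_edit_distance")
print(method("t", "d"))  # small feature-weighted distance between close segments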
Example #19
    def find_good_match(p1, inventory_l2):
        """Find a good sequence in inventory_l2 matching p1."""

        # The proper way to do this would be with some kind of beam search
        # through a determinized/minimized FST, but in the absence of that
        # we can do a kind of heuristic greedy search.  (we don't want any
        # dependencies outside of PyPI otherwise we'd just use OpenFST)

        p1_pseq = dst.fm.ipa_segs(p1)

        i = 0
        good_match = []
        while i < len(p1_pseq):
            best_input = ""
            best_output = -1
            best_score = 0xDEADBEEF
            for j, p2_pseq in enumerate(p2_pseqs):
                # FIXME: Should also consider the (weighted) possibility
                # of deleting input or inserting any segment (but that
                # can't be done with a greedy search)
                if len(p2_pseq) == 0:
                    LOGGER.warning(
                        "No panphon mapping for %s - skipping", inventory_l2[j]
                    )
                    continue
                e = min(i + len(p2_pseq), len(p1_pseq))
                input_seg = p1_pseq[i:e]
                distance_method = get_distance_method(dst, distance)
                score = distance_method("".join(input_seg), "".join(p2_pseq))
                # Be very greedy and take the longest match
                if (
                    score < best_score
                    or score == best_score
                    and len(input_seg) > len(best_input)
                ):
                    best_input = input_seg
                    best_output = j
                    best_score = score
            LOGGER.debug(
                "Best match at position %d: %s => %s",
                i,
                best_input,
                inventory_l2[best_output],
            )
            good_match.append(inventory_l2[best_output])
            i += len(best_input)  # greedy!
        return "".join(good_match)
Example #20
 def setUp(self):
     DATA_DIR = os.path.dirname(data_dir)
     self.langs_to_test = []
     for fn in glob(f'{DATA_DIR}/*.*sv'):
         if fn.endswith('csv'):
             delimiter = ','
         elif fn.endswith('psv'):
             delimiter = '|'
         elif fn.endswith('tsv'):
             delimiter = '\t'
         else:
             continue  # skip files whose delimiter we don't know
         with open(fn, encoding="utf-8") as csvfile:
             reader = csv.reader(csvfile, delimiter=delimiter)
             for row in reader:
                 if len(row) != 4:
                     LOGGER.warning(f'Row in {fn} containing values {row} does not have the right values. Please check your data.')
                 else:
                     self.langs_to_test.append(row)
Example #21
def create_mapping(
    mapping_1: Mapping,
    mapping_2: Mapping,
    mapping_1_io: str = "out",
    mapping_2_io: str = "in",
    distance: str = "weighted_feature_edit_distance",
) -> Mapping:
    """Create a mapping from mapping_1's output inventory to mapping_2's input inventory"""

    map_1_name = mapping_1.kwargs[f"{mapping_1_io}_lang"]
    map_2_name = mapping_2.kwargs[f"{mapping_2_io}_lang"]
    if not is_ipa(map_1_name) and not is_xsampa(map_1_name):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s (must be ipa or x-sampa)",
            map_1_name,
        )
    if not is_ipa(map_2_name) and not is_xsampa(map_2_name):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s (must be ipa or x-sampa)",
            map_2_name,
        )
    l1_is_xsampa, l2_is_xsampa = is_xsampa(map_1_name), is_xsampa(map_2_name)
    mapping = align_inventories(
        mapping_1.inventory(mapping_1_io),
        mapping_2.inventory(mapping_2_io),
        l1_is_xsampa,
        l2_is_xsampa,
        distance=distance,
    )

    # Initialize mapping with input language parameters (as_is,
    # case_sensitive, prevent_feeding, etc)
    config = mapping_1.kwargs.copy()
    # Fix up names, etc.
    if "authors" in config:
        del config["authors"]
    if "display_name" in config:
        del config["display_name"]
    if "language_name" in config:
        del config["language_name"]
    config["prevent_feeding"] = True
    config["in_lang"] = map_1_name
    config["out_lang"] = map_2_name
    config["mapping"] = mapping
    mapping = Mapping(**config)
    return mapping
Example #22
 def test_check_with_equiv(self):
     transducer = make_g2p("tau", "eng-arpabet", tok_lang="tau")
     tau_ipa = make_g2p("tau", "tau-ipa", tok_lang="tau")(
         "sh'oo Jign maasee' do'eent'aa shyyyh").output_string
     self.assertTrue(utils.is_panphon(tau_ipa))
     eng_ipa = make_g2p("tau", "eng-ipa", tok_lang="tau")(
         "sh'oo Jign maasee' do'eent'aa shyyyh").output_string
     self.assertTrue(utils.is_panphon(eng_ipa))
     eng_arpabet = make_g2p("tau", "eng-arpabet", tok_lang="tau")(
         "sh'oo Jign maasee' do'eent'aa shyyyh").output_string
     self.assertTrue(utils.is_arpabet(eng_arpabet))
     LOGGER.warning(
         f"tau-ipa {tau_ipa}\neng-ipa {eng_ipa}\n eng-arpabet {eng_arpabet}"
     )
     self.assertTrue(
         transducer.check(
             transducer("sh'oo Jign maasee' do'eent'aa shyyyh")))
Example #23
    def test_io(self):
        # go through each language declared in the test case set up
        # Instead of asserting immediately, we go through all the cases first, so that
        # running test_langs.py prints all the errors at once, to help debugging a given g2p mapping.
        # Then we call assertEqual on the first failed case, to make unittest register the failure.
        error_count = 0
        for test in self.langs_to_test:
            transducer = make_g2p(test[0], test[1])
            output_string = transducer(test[2]).output_string
            if output_string != test[3]:
                LOGGER.warning("test_langs.py: mapping error: {} from {} to {} should be {}, got {}".format(test[2], test[0], test[1], test[3], output_string))
                if error_count == 0:
                    first_failed_test = test
                error_count += 1

        if error_count > 0:
            transducer = make_g2p(first_failed_test[0], first_failed_test[1])
            self.assertEqual(transducer(first_failed_test[2]).output_string, first_failed_test[3])
Example #24
def run_tests(suite):
    ''' Decide which Test Suite to run
    '''
    if suite == 'all':
        suite = LOADER.discover(os.path.dirname(__file__))
    elif suite == 'trans':
        suite = TestSuite(TRANSDUCER_TESTS)
    elif suite == 'langs':
        suite = TestSuite(LANGS_TESTS)
    elif suite == 'mappings':
        suite = TestSuite(MAPPINGS_TESTS)
    elif suite == 'dev':
        suite = TestSuite(DEV_TESTS)
    runner = TextTestRunner(verbosity=3)
    if isinstance(suite, str):
        LOGGER.error("Please specify a test suite to run: i.e. 'dev' or 'all'")
    else:
        runner.run(suite)
Example #25
File: utils.py Project: deltork/g2p
def normalize(inp: str, norm_form: str):
    """ Normalize to NFC(omposed) or NFD(ecomposed).
        Also, find any Unicode Escapes & decode 'em!
    """
    if norm_form is None or norm_form == "none":
        return unicode_escape(inp)
    elif norm_form not in ["NFC", "NFD", "NFKC", "NFKD"]:
        raise exceptions.InvalidNormalization(normalize)
    else:
        normalized = ud.normalize(norm_form, unicode_escape(inp))
        if normalized != inp:
            LOGGER.debug(
                "The string %s was normalized to %s using the %s standard and by decoding any Unicode escapes. "
                "Note that this is not necessarily the final stage of normalization.",
                inp,
                normalized,
                norm_form,
            )
        return normalized
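
For reference, a standard-library-only illustration of NFC versus NFD:

import unicodedata as ud

s = "\u00e9"                          # é as one precomposed code point
nfd = ud.normalize("NFD", s)          # 'e' + U+0301 combining acute accent
print(len(s), len(nfd))               # 1 2
print(ud.normalize("NFC", nfd) == s)  # True - recomposition round-trips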
Example #26
def find_good_match(p1, inventory_l2, l2_is_xsampa=False):
    """Find a good sequence in inventory_l2 matching p1."""

    dst = panphon.distance.Distance()
    # The proper way to do this would be with some kind of beam search
    # through a determinized/minimized FST, but in the absence of that
    # we can do a kind of heuristic greedy search.  (we don't want any
    # dependencies outside of PyPI otherwise we'd just use OpenFST)
    p1_pseq = dst.fm.ipa_segs(p1)
    p2_pseqs = [
        dst.fm.ipa_segs(p)
        for p in process_characters(inventory_l2, l2_is_xsampa)
    ]
    i = 0
    good_match = []
    while i < len(p1_pseq):
        best_input = ""
        best_output = -1
        best_score = 0xdeadbeef
        for j, p2_pseq in enumerate(p2_pseqs):
            # FIXME: Should also consider the (weighted) possibility
            # of deleting input or inserting any segment (but that
            # can't be done with a greedy search)
            if len(p2_pseq) == 0:
                LOGGER.warning('No panphon mapping for %s - skipping',
                               inventory_l2[j])
                continue
            e = min(i + len(p2_pseq), len(p1_pseq))
            input_seg = p1_pseq[i:e]
            score = dst.weighted_feature_edit_distance(''.join(input_seg),
                                                       ''.join(p2_pseq))
            # Be very greedy and take the longest match
            if (score < best_score or score == best_score
                    and len(input_seg) > len(best_input)):
                best_input = input_seg
                best_output = j
                best_score = score
        LOGGER.debug('Best match at position %d: %s => %s', i, best_input,
                     inventory_l2[best_output])
        good_match.append(inventory_l2[best_output])
        i += len(best_input)  # greedy!
    return ''.join(good_match)
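
A hedged usage sketch; the exact result depends on panphon's segmentation, but the tie-break on length should make the greedy search prefer multi-segment inventory entries:

inventory_l2 = ["t", "s", "a", "ts"]  # hypothetical inventory
print(find_good_match("tsa", inventory_l2))  # expected: "tsa" via "ts" + "a"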
Example #27
def check_ipa_known_segs(mappings_to_check=False):
    dst = distance.Distance()
    if not mappings_to_check:
        mappings_to_check = [x['out_lang'] for x in MAPPINGS_AVAILABLE]
    found_error = False
    for mapping in [
            x for x in MAPPINGS_AVAILABLE if x['out_lang'] in mappings_to_check
    ]:
        if mapping['out_lang'].endswith('-ipa'):
            for rule in mapping['mapping_data']:
                joined_ipa_segs = ''.join(dst.fm.ipa_segs(rule['out']))
                if joined_ipa_segs != rule['out']:
                    LOGGER.warning(
                        f"Output '{rule['out']}' in rule {rule} in mapping between {mapping['in_lang']} and {mapping['out_lang']} is not recognized as valid IPA by panphon. You may ignore this warning if you know it gets remapped to IPA later."
                    )
                    found_error = True
    if found_error:
        LOGGER.warning(
            "Please refer to https://github.com/dmort27/panphon for information about panphon."
        )
Example #28
 def setUp(self):
     self.runner = APP.test_cli_runner()
     self.data_dir = os.path.dirname(data_dir)
     self.langs_to_test = []
     for fn in glob(os.path.join(self.data_dir, "*.*sv")):
         if fn.endswith("csv"):
             delimiter = ","
         elif fn.endswith("psv"):
             delimiter = "|"
         elif fn.endswith("tsv"):
             delimiter = "\t"
         with open(fn, encoding="utf-8") as csvfile:
             reader = csv.reader(csvfile, delimiter=delimiter)
             for row in reader:
                 if len(row) < 4:
                     LOGGER.warning(
                         f"Row in {fn} containing values {row} does not have the right values. "
                         "Please check your data.")
                 else:
                     self.langs_to_test.append(row)
Example #29
def create_mapping(l1_mapping: Mapping, l2_mapping: Mapping) -> Mapping:
    ''' Create a mapping from the output of l1 and input of l2.
        Both must be either ipa or x-sampa.
    '''
    l1 = l1_mapping.kwargs['out_lang']
    l2 = l2_mapping.kwargs['in_lang']
    inv_l1 = l1_mapping.inventory("out")
    inv_l2 = l2_mapping.inventory()
    if not is_ipa(l1) and not is_xsampa(l1):
        LOGGER.warning(
            "Unsupported orthography of inventory 1: %s"
            " (must be ipa or x-sampa)", l1)
    if not is_ipa(l2) and not is_xsampa(l2):
        LOGGER.warning(
            "Unsupported orthography of inventory 2: %s"
            " (must be ipa or x-sampa)", l2)
    mapping = align_inventories(inv_l1["inventory"], inv_l2["inventory"],
                                is_xsampa(l1), is_xsampa(l2))

    output_mapping = Mapping(mapping, in_lang=l1, out_lang=l2)
    return output_mapping
Example #30
File: cli.py Project: deltork/g2p
def scan(lang, path):
    """ Returns the set of non-mapped characters in a document.
        Accounts for case sensitivity in the configuration.
    """
    # Check input lang exists
    if lang not in LANGS_NETWORK.nodes:
        raise click.UsageError(f"'{lang}' is not a valid value for 'LANG'")

    # Retrieve the mappings for lang
    case_sensitive = True
    mappings = []
    for mapping in MAPPINGS_AVAILABLE:
        mapping_name = mapping["in_lang"]
        # Exclude mappings for converting between IPAs
        if mapping_name.startswith(lang) and "ipa" not in mapping_name:
            case_sensitive = case_sensitive and mapping.get(
                "case_sensitive", True)
            mappings.append(mapping)

    # Get input chars in mapping
    mapped_chars = set()
    for lang_mapping in mappings:
        for x in lang_mapping["mapping_data"]:
            mapped_chars.add(normalize(x["in"], "NFD"))
    # Find unmapped chars
    filter_chars = " \n"
    mapped_string = "".join(mapped_chars)
    pattern = "[^" + mapped_string + filter_chars + ".]"
    prog = re.compile(pattern)

    with open(path, "r", encoding="utf8") as file:
        data = normalize(file.read(), "NFD")
        if not case_sensitive:
            data = data.lower()
        unmapped = set(prog.findall(data))
        if unmapped:
            LOGGER.warning("The following characters are not mapped:")
            print(unmapped)
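
The unmapped-character detection reduces to a negated character class; a standalone sketch of the same regex logic:

import re

mapped_chars = {"a", "b", "c"}
prog = re.compile("[^" + "".join(mapped_chars) + " \n" + ".]")
print(set(prog.findall("abc def.\n")))  # {'d', 'e', 'f'} - the unmapped characters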