Example #1
0
def normalise_token(token):
    """
    Return a single (arbitrarily chosen) "normal" form of the token by
    dropping tie bars (e.g. t͡ʃ → tʃ) and the diacritic marking non-syllabic
    vowels (e.g. aɪ̯ → aɪ).
    """
    non_syllabic = '◌̯'[1]
    kept = (
        char for char in token
        if not is_tie_bar(char) and char != non_syllabic)
    return ''.join(kept)
Example #2
0
def convert_ipa_token(token):
    """
    Convert an IPA token into an ASJP token or raise (Assertion|Index)Error if
    the input does not constitute a valid IPA token.

    Helper for ipa2asjp(ipa_seq).
    """
    output = []
    pending_tie_bar = False

    for char in token:
        if is_letter(char):
            if pending_tie_bar:
                # a tie bar was seen just before this letter: try to merge
                # the pair into a single affricate symbol
                joined = output[-1] + char
                if joined in chart.ipa:
                    output[-1] = chart.ipa[joined]
                pending_tie_bar = False
            else:
                output.extend(chart.ipa[char])

        elif is_tie_bar(char):
            pending_tie_bar = True

        elif char == 'n̪'[1] and output[-1] == 'n':
            # dental diacritic on a preceding n
            output[-1] = chart.ipa['n̪']

        elif char in chart.ipa:
            converted = chart.ipa[char]
            if converted in chart.asjp_diacritics:
                output[-1] += converted
            else:
                output.append(converted)

    assert 1 <= len(output) <= 3

    if len(output) != 1:
        # mark two- and three-symbol clusters per ASJP convention
        output.append('~' if len(output) == 2 else '$')

    return ''.join(output)
Example #3
0
	def sanitise_token(token, keep_digits=False):
		"""
		Sanitise a string by (1) ensuring its chars' normal form comply to the
		IPA spec; (2) replacing common substitutes with their IPA equivalents;
		(3) excluding chars that are not IPA letters, diacritics, tie bars, or
		length markers.

		If keep_digits is set to True, do not replace digits with Chao letters.

		This method leverages ipatok functions that are not in the package's
		public API.
		"""
		if not keep_digits:
			token = replace_digits_with_chao(token)

		token = replace_substitutes(normalise(token))

		def wanted(char):
			# keep only chars that belong to a recognised IPA category
			return (
				is_letter(char, strict=False)
				or is_tie_bar(char)
				or is_diacritic(char, strict=False)
				or is_length(char)
				or is_tone(char, strict=False)
				or char in '¹²³⁴⁵')

		return ''.join(filter(wanted, token))
Example #4
0
File: tokens.py  Project: pavelsof/ipatok
def tokenise_word(string,
                  strict=False, replace=False, tones=False, unknown=False):
    """
    Tokenise the string into a list of tokens or raise ValueError if it cannot
    be tokenised (relatively) unambiguously. The string should not include
    whitespace, i.e. it is assumed to be a single word.

    If strict=False, allow non-standard letters and diacritics, as well as
    initial diacritic-only tokens (e.g. pre-aspiration). If replace=True,
    replace some common non-IPA symbols with their IPA counterparts. If
    tones=False, ignore tone symbols. If unknown=False, ignore symbols that
    cannot be classified into a relevant category.

    Helper for tokenise(string, ..).
    """
    string = normalise(string)

    if replace:
        string = ipa.replace_substitutes(string)

    result = []

    for pos, symbol in enumerate(string):
        if ipa.is_letter(symbol, strict):
            # a letter right after a tie bar joins the previous token
            if result and ipa.is_tie_bar(string[pos-1]):
                result[-1] += symbol
            else:
                result.append(symbol)
            continue

        if ipa.is_tie_bar(symbol):
            if not result:
                raise ValueError(f'The string starts with a tie bar: {string}')
            result[-1] += symbol
            continue

        if ipa.is_diacritic(symbol, strict) or ipa.is_length(symbol):
            if not result:
                if strict:
                    raise ValueError(
                        f'The string starts with a diacritic: {string}'
                    )
                # non-strict mode tolerates initial diacritic-only tokens
                result.append(symbol)
            else:
                result[-1] += symbol
            continue

        if tones and ipa.is_tone(symbol, strict):
            if unicodedata.combining(symbol):
                # combining accents attach to the preceding token
                if not result:
                    raise ValueError(
                        f'The string starts with an accent mark: {string}'
                    )
                result[-1] += symbol
            elif result and ipa.is_tone(result[-1][-1], strict):
                # consecutive tone letters form a single (contour) token
                result[-1] += symbol
            else:
                result.append(symbol)
            continue

        if ipa.is_suprasegmental(symbol, strict):
            continue  # suprasegmentals other than tones are dropped

        # symbol falls in no recognised category
        if strict:
            raise ValueError(
                f'Unrecognised char: {symbol} ({ unicodedata.name(symbol)})'
            )
        elif unknown:
            result.append(symbol)

    return result
Example #5
0
    def test_is_tie_bar(self):
        """
        is_tie_bar should return True for IPA tie bars and False for other IPA
        symbols.
        """
        # the two IPA tie bars, extracted as bare combining chars
        self.assertTrue(is_tie_bar('◌͡'[1]))
        self.assertTrue(is_tie_bar('◌͜'[1]))

        # look-alikes that are not tie bars
        self.assertFalse(is_tie_bar('ʋ'))
        self.assertFalse(is_tie_bar('‿'))

        # previously these loops were written as list comprehensions used
        # purely for side effects, building throwaway lists; plain for loops
        # express the intent directly
        for char in chart.tie_bars:
            self.assertTrue(is_tie_bar(char))

        for group in (chart.consonants, chart.vowels, chart.diacritics,
                      chart.suprasegmentals, chart.lengths, chart.tones):
            for char in group:
                self.assertFalse(is_tie_bar(char))