Example No. 1
    def test_is_letter_non_ipa(self):
        """
		is_letter should return False for non-IPA letters in strict mode and
		True in non-strict mode.
		"""
        for char in ['ʣ', 'ɫ', 'g', 'Γ', 'F', 'Lj']:
            self.assertFalse(is_letter(char, strict=True))
            self.assertTrue(is_letter(char, strict=False))
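The strict/non-strict split the test exercises comes down to the fact that the IPA chart defines its own letter inventory. A quick standalone check along the same lines; the ipatok.ipa import path is internal API and is assumed here:

from ipatok.ipa import is_letter  # assumed internal module path

# ASCII 'g' (U+0067) is not the IPA letter proper; the chart uses the
# single-storey 'ɡ' (U+0261) instead.
assert is_letter('ɡ', strict=True)
assert not is_letter('g', strict=True)
assert is_letter('g', strict=False)   # tolerated when not strict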
Example No. 2
import warnings

def get_vector(token):
    """
    Return the vector representation (an entry from VECTORS) of an IPA token.
    Raise an exception if VECTORS is not yet set.
    """
    token = normalise_token(token)

    try:
        return VECTORS[token]
    except KeyError:
        pass

    letters = ''.join([char for char in token if is_letter(char, False)])

    if len(letters) > 1:
        sub_tokens = []

        for index, sub_token in enumerate(tokenise(token)):
            if sub_token in VECTORS:
                sub_tokens.append(sub_token)
            elif letters[index] in VECTORS:
                # fall back to the bare letter when the full sub-token
                # is not in VECTORS
                sub_tokens.append(letters[index])
            else:
                break
        else:  # no break
            # warnings.warn('neural-net: {} → {}'.format(
            #     token, ' '.join(sub_tokens)))
            sub_vectors = [VECTORS[sub_token] for sub_token in sub_tokens]
            return sum(sub_vectors) / len(sub_vectors)

    try:
        return VECTORS[letters]
    except KeyError:
        warnings.warn('neural-net: cannot recognise {}'.format(token))
        raise
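A minimal sketch of the averaging fallback above, assuming VECTORS maps IPA tokens to numpy arrays; the tokens and values here are hypothetical:

import numpy as np

# Hypothetical embeddings for two recognised single-letter tokens.
VECTORS = {'t': np.array([1.0, 0.0]), 'ʃ': np.array([0.0, 1.0])}

# For an unseen multi-letter token such as 't͡ʃ', get_vector would
# average the vectors of its recognised sub-tokens:
sub_vectors = [VECTORS['t'], VECTORS['ʃ']]
print(sum(sub_vectors) / len(sub_vectors))   # [0.5 0.5]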
Example No. 3
import warnings

def get_vector_key(token):
	"""
	Return the key that maps to the vector representation of a phoneme (i.e.
	an IPA token). Raise an exception if the module-level model is not set.
	"""
	token = normalise_token(token)

	if token in model.wv:
		return token

	if token == '':
		return '\0'

	alt_token = ''.join([char for char in token if is_letter(char, False)])

	if alt_token in model.wv:
		return alt_token

	warnings.warn('phon2vec: cannot recognise {}'.format(token))
	return '\0'
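For illustration, a stubbed stand-in for the gensim-style model the function expects; the real model.wv would be a gensim KeyedVectors instance, and everything here is hypothetical:

class StubKeyedVectors:
    """Stand-in for gensim's model.wv; supports membership tests only."""
    def __contains__(self, key):
        return key in {'t', 'ʃ', 'tʃ'}

class StubModel:
    wv = StubKeyedVectors()

model = StubModel()

# 't͡ʃ' itself is not a key, but stripping the tie bar (a non-letter)
# yields the alt_token 'tʃ', which is:
# get_vector_key('t͡ʃ')  ->  'tʃ'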
Example No. 4
def convert_ipa_token(token):
    """
	Convert an IPA token into an ASJP token or raise (Assertion|Index)Error if
	the input does not constitute a valid IPA token.

	Helper for ipa2asjp(ipa_seq).
	"""
    output = []
    has_tie_bar = False

    for char in token:
        if is_letter(char):
            if has_tie_bar:
                affricate = output[-1] + char
                if affricate in chart.ipa:
                    output[-1] = chart.ipa[affricate]
                has_tie_bar = False
            else:
                for asjp_char in chart.ipa[char]:
                    output.append(asjp_char)

        elif is_tie_bar(char):
            has_tie_bar = True

        elif char == 'n̪'[1] and output[-1] == 'n':
            # 'n̪'[1] is the combining dental diacritic (U+032A)
            output[-1] = chart.ipa['n̪']

        elif char in chart.ipa:
            asjp_char = chart.ipa[char]
            if asjp_char in chart.asjp_diacritics:
                output[-1] += asjp_char
            else:
                output.append(asjp_char)

    assert 1 <= len(output) <= 3

    if len(output) != 1:
        output.append('~' if len(output) == 2 else '$')

    return ''.join(output)
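The final step implements the convention that a segment written with two ASJP symbols is flagged with a trailing '~' and one written with three with '$'. A self-contained sketch of just that step; the output list is hypothetical:

output = ['t', 'S']   # hypothetical two-symbol conversion result
if len(output) != 1:
    output.append('~' if len(output) == 2 else '$')
print(''.join(output))   # tS~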
Example No. 5
	def sanitise_token(token, keep_digits=False):
		"""
		Sanitise a string by (1) ensuring its chars' normal form comply to the
		IPA spec; (2) replacing common substitutes with their IPA equivalents;
		(3) excluding chars that are not IPA letters, diacritics, tie bars, or
		length markers.

		If keep_digits is set to True, do not replace digits with Chao letters.

		This method leverages ipatok functions that are not in the package's
		public API.
		"""
		if not keep_digits:
			token = replace_digits_with_chao(token)

		token = replace_substitutes(normalise(token))

		return ''.join([
				char for char in token
				if is_letter(char, strict=False)
					or is_tie_bar(char)
					or is_diacritic(char, strict=False)
					or is_length(char)
					or is_tone(char, strict=False)
					or char in '¹²³⁴⁵'])
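Step (1) concerns Unicode normal forms; a small standard-library illustration of why it matters (ipatok's own normalise() may differ, NFD is shown here only as an approximation):

import unicodedata

# A precomposed 'ç' and its decomposed equivalent compare unequal as
# raw strings but identically once normalised:
composed = '\u00e7'       # ç as a single code point
decomposed = 'c\u0327'    # 'c' + combining cedilla
print(composed == decomposed)                                 # False
print(unicodedata.normalize('NFD', composed) == decomposed)   # True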
Example No. 6
import unicodedata

def tokenise_word(string,
                  strict=False, replace=False, tones=False, unknown=False):
    """
    Tokenise the string into a list of tokens or raise ValueError if it cannot
    be tokenised (relatively) unambiguously. The string should not include
    whitespace, i.e. it is assumed to be a single word.

    If strict=False, allow non-standard letters and diacritics, as well as
    initial diacritic-only tokens (e.g. pre-aspiration). If replace=True,
    replace some common non-IPA symbols with their IPA counterparts. If
    tones=False, ignore tone symbols. If unknown=False, ignore symbols that
    cannot be classified into a relevant category.

    Helper for tokenise(string, ..).
    """
    string = normalise(string)

    if replace:
        string = ipa.replace_substitutes(string)

    tokens = []

    for index, char in enumerate(string):
        if ipa.is_letter(char, strict):
            if tokens and ipa.is_tie_bar(string[index-1]):
                tokens[-1] += char
            else:
                tokens.append(char)

        elif ipa.is_tie_bar(char):
            if not tokens:
                raise ValueError(f'The string starts with a tie bar: {string}')
            tokens[-1] += char

        elif ipa.is_diacritic(char, strict) or ipa.is_length(char):
            if tokens:
                tokens[-1] += char
            else:
                if strict:
                    raise ValueError(
                        f'The string starts with a diacritic: {string}'
                    )
                else:
                    tokens.append(char)

        elif tones and ipa.is_tone(char, strict):
            if unicodedata.combining(char):
                if not tokens:
                    raise ValueError(
                        f'The string starts with an accent mark: {string}'
                    )
                tokens[-1] += char
            elif tokens and ipa.is_tone(tokens[-1][-1], strict):
                tokens[-1] += char
            else:
                tokens.append(char)

        elif ipa.is_suprasegmental(char, strict):
            pass

        else:
            if strict:
                raise ValueError(
                    f'Unrecognised char: {char} ({unicodedata.name(char)})'
                )
            elif unknown:
                tokens.append(char)
            else:
                pass

    return tokens
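A usage sketch, tracing the rules above by hand and assuming the module-level ipa helpers classify characters as ipatok does:

# 't' (letter) starts a token; the tie bar and the 'ʃ' that follows it
# attach to that token; 'ː' is a length marker and attaches to 'i';
# 'z' stands alone.
print(tokenise_word('t͡ʃiːz'))   # expected: ['t͡ʃ', 'iː', 'z']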