示例#1
0
def run(align_func, dataset_path, output_path):
    """
    Re-align the word pairs of a psa dataset and write the result to a file.

    The dataset at dataset_path is loaded, each word pair is converted to
    ASJP and handed to align_func, and every alignment it yields is mapped
    back onto the IPA of the two words. The output starts with the input
    header suffixed with ' (PMI alignment)'.

    If converting a word to ASJP raises ValueError, the pair is still
    written, but as an obviously wrong alignment: each row consists of the
    language, a '-' column and then the elements of the word's IPA.
    """
    def make_row(lang, cells):
        # A row is the language followed by its tab-separated cells.
        return '\t'.join([lang] + cells)

    def make_record(comment, row_a, row_b):
        # The trailing empty string yields the blank line between records.
        return [comment, row_a, row_b, '']

    dataset = AlignmentsDataset(dataset_path)
    lines = ['{} (PMI alignment)'.format(dataset.header)]

    for word_a, word_b, original_al in dataset.data:
        try:
            asjp_a = ipa2asjp(word_a.ipa)
            asjp_b = ipa2asjp(word_b.ipa)
        except ValueError:
            lines += make_record(
                original_al.comment,
                make_row(word_a.lang, ['-'] + list(word_a.ipa)),
                make_row(word_b.lang, ['-'] + list(word_b.ipa)))
            continue

        for asjp_al in align_func(asjp_a, asjp_b):
            ipa_corr = convert_alignment(word_a.ipa, word_b.ipa, asjp_al.corr)
            lines += make_record(
                original_al.comment,
                make_row(word_a.lang, [corr[0] for corr in ipa_corr]),
                make_row(word_b.lang, [corr[1] for corr in ipa_corr]))

    text = '\n'.join(lines)
    with open_for_writing(output_path) as f:
        f.write(text)
示例#2
0
    def test_tokenise_ipa(self):
        """
        Tokenising the IPA input or the ASJP output should be equivalent.

        Each sample string runs in its own subTest, so a failure names the
        offending string and the remaining samples are still checked.
        """
        samples = [
            'sɫɤnt͡sɛ', 'tʼu͡ʊlɡ', 'fɔʏ̯ɐ', 'ɖʰʊ̄̃ɽʰənaː',
            't͡sʼʷelin ttar', 'ɧɪnː',
        ]
        for string in samples:
            with self.subTest(string=string):
                tokens = ipatok.tokenise(string, replace=True)
                self.assertEqual(
                    tokenise(ipa2asjp(string)), ipa2asjp(tokens))
示例#3
0
    def test_ipa2asjp_errors(self):
        """
        Non-IPA sequences should raise ValueError and non-sequences should
        raise TypeError.

        Every input runs in its own subTest, so a missing exception is
        reported together with the item that failed to trigger it, and the
        remaining items are still checked.
        """
        cases = [
            # Non-sequences.
            (TypeError, [None, True, 42]),
            # Strings that are not IPA.
            (ValueError, ['zEmy~a', 'павел']),
            # Token lists that are not IPA.
            (ValueError, [['z', 'E', 'my~', 'a'], ['п']]),
        ]
        for expected_error, items in cases:
            for item in items:
                with self.subTest(item=item):
                    with self.assertRaises(expected_error):
                        ipa2asjp(item)
示例#4
0
    def test_ipa2asjp_lists(self):
        """
        Lists of IPA-compliant tokens should convert to the expected lists
        of ASJP tokens.
        """
        # (IPA tokens, expected ASJP tokens), checked in this order.
        cases = [
            ([], []),
            ([''], ['']),

            (['s', 'ɫ', 'ɤ', 'n', 't͡s', 'ɛ'],
             ['s', 'l', 'o', 'n', 'c', 'E']),
            (['z', 'ɛ', 'mʲ', 'a'],
             ['z', 'E', 'my~', 'a']),

            (['t͡ʃʼ', 'e', 'n', 'ɪ', 'ɡ'],
             ['C"', 'e', 'n', 'i', 'g']),
            (['b', 'u͡ʊ', 'qʼ'], ['b', 'u', 'q"']),

            (['ʃ', 't', 'a', 'ɪ̯', 'n'],
             ['S', 't', 'a', 'i', 'n']),
            (['ʃ', 't', 'a͡ɪ̯', 'n'],
             ['S', 't', 'a', 'n']),

            (['ãː', 'kʰ'], ['a*', 'kh~']),
            (['ɖʰ', 'ʊ̄̃', 'ɽʰ', 'ə', 'n', 'aː'],
             ['dh~', 'u*', 'rh~', '3', 'n', 'a']),

            (['qʼʷ', 'e', 'tʼ'], ['q"w~', 'e', 't"']),
            (['w', 'a', 't͡sʼ'], ['w', 'a', 'c"']),

            (['ɧ', 'ɪ', 'nː'], ['Sx~', 'i', 'n']),
            (['ɔ', 'tː', 'ɧ', 'ɪ', 'l', 'ɪ', 'ɡ'],
             ['o', 't', 'Sx~', 'i', 'l', 'i', 'g']),
        ]

        for ipa_tokens, asjp_tokens in cases:
            self.assertEqual(ipa2asjp(ipa_tokens), asjp_tokens)
示例#5
0
    def test_ipa2asjp_strings(self):
        """
        IPA-compliant strings should convert to the expected ASJP strings.
        """
        # (IPA string, expected ASJP string), checked in this order.
        cases = [
            ('', ''),

            ('sɫɤnt͡sɛ', 'sloncE'),
            ('zvɛzda', 'zvEzda'),
            ('zɛmʲa', 'zEmy~a'),
            ('ɔɡɤn', 'ogon'),
            ('javʲa sɛ', 'yavy~a sE'),

            ('motː', 'mot'),
            ('t͡ʃʼenɪɡ', 'C"enig'),
            ('bu͡ʊqʼ', 'buq"'),
            ('tʼu͡ʊlɡ', 't"ulg'),
            ('bu͡ʊt͡s', 'buc'),

            ('zɔnə', 'zon3'),
            ('vasɐ', 'vasa'),
            ('ʃtaɪ̯n', 'Stain'),
            ('ɛɐ̯də', 'Ead3'),
            ('fɔʏ̯ɐ', 'foia'),

            ('ãːkʰ', 'a*kh~'),
            ('ɖʰʊ̄̃ɽʰənaː', 'dh~u*rh~3na'),

            ('qʼʷetʼ', 'q"w~et"'),
            ('wat͡sʼ', 'wac"'),
            ('t͡sʼʷelin ttar', 'c"w~elin ttar'),
            ('kʼʷenkʼʷ', 'k"w~enk"w~'),

            ('ɧɪnː', 'Sx~in'),
            ('ɧæːɳa', 'Sx~Ena'),
            ('ɔtːɧɪlɪɡ', 'otSx~ilig'),
        ]

        for ipa_string, asjp_string in cases:
            self.assertEqual(ipa2asjp(ipa_string), asjp_string)
示例#6
0
    def test_asjp_ipa_asjp(self, tokens):
        """
        ASJP tokens (as generated by asjp_tokens()) should survive a round
        trip: converting them to IPA and back must return the same tokens.
        """
        as_ipa = asjp2ipa(tokens)
        round_tripped = ipa2asjp(as_ipa)
        self.assertEqual(round_tripped, tokens)