def run(align_func, dataset_path, output_path):
    """
    Re-align the word pairs of a psa dataset and write the result out as a
    new psa dataset.

    The dataset at dataset_path is loaded, each of its word pairs is
    converted from IPA to ASJP and handed to align_func, and every alignment
    that align_func yields is converted back to IPA and appended to the
    output. The output is written to output_path.

    When one of the words in a pair cannot be converted to ASJP (ipa2asjp
    raises ValueError), a deliberately wrong alignment is written for that
    pair instead: each row gets a '-' right after the language name.
    """
    dataset = AlignmentsDataset(dataset_path)
    lines = ['{} (PMI alignment)'.format(dataset.header)]

    for word_a, word_b, original_al in dataset.data:
        try:
            asjp_a = ipa2asjp(word_a.ipa)
            asjp_b = ipa2asjp(word_b.ipa)
        except ValueError:
            # No ASJP form available: emit the placeholder alignment and
            # move on to the next word pair.
            comment = original_al.comment
            row_a = [word_a.lang, '-'] + list(word_a.ipa)
            row_b = [word_b.lang, '-'] + list(word_b.ipa)
            lines += [comment, '\t'.join(row_a), '\t'.join(row_b), '']
            continue

        for asjp_al in align_func(asjp_a, asjp_b):
            # Map the ASJP-level correspondences back onto the IPA words.
            ipa_corr = convert_alignment(word_a.ipa, word_b.ipa, asjp_al.corr)
            comment = original_al.comment
            row_a = [word_a.lang] + [corr[0] for corr in ipa_corr]
            row_b = [word_b.lang] + [corr[1] for corr in ipa_corr]
            lines += [comment, '\t'.join(row_a), '\t'.join(row_b), '']

    with open_for_writing(output_path) as f:
        f.write('\n'.join(lines))
def test_tokenise_ipa(self):
    """
    Converting an IPA string to ASJP and then tokenising it should give the
    same tokens as tokenising the IPA string first and converting the
    resulting tokens to ASJP.
    """
    samples = (
        'sɫɤnt͡sɛ',
        'tʼu͡ʊlɡ',
        'fɔʏ̯ɐ',
        'ɖʰʊ̄̃ɽʰənaː',
        't͡sʼʷelin ttar',
        'ɧɪnː',
    )

    for ipa_string in samples:
        ipa_tokens = ipatok.tokenise(ipa_string, replace=True)

        # Path 1: convert the whole string, then tokenise the ASJP output.
        convert_then_tokenise = tokenise(ipa2asjp(ipa_string))
        # Path 2: convert the already tokenised IPA input.
        tokenise_then_convert = ipa2asjp(ipa_tokens)

        self.assertEqual(convert_then_tokenise, tokenise_then_convert)
def test_ipa2asjp_errors(self):
    """
    ipa2asjp should raise TypeError for input that is not a sequence and
    ValueError for sequences (strings or token lists) that are not IPA.
    """
    non_sequences = (None, True, 42)
    non_ipa_strings = ('zEmy~a', 'павел')
    non_ipa_token_lists = (['z', 'E', 'my~', 'a'], ['п'])

    for bad_input in non_sequences:
        self.assertRaises(TypeError, ipa2asjp, bad_input)

    for bad_input in non_ipa_strings:
        self.assertRaises(ValueError, ipa2asjp, bad_input)

    for bad_input in non_ipa_token_lists:
        self.assertRaises(ValueError, ipa2asjp, bad_input)
def test_ipa2asjp_lists(self):
    """
    Lists of IPA-compliant tokens should be converted to the matching lists
    of ASJP tokens.
    """
    # Each entry pairs the IPA token list with the expected ASJP token list.
    cases = [
        ([], []),
        ([''], ['']),
        (['s', 'ɫ', 'ɤ', 'n', 't͡s', 'ɛ'], ['s', 'l', 'o', 'n', 'c', 'E']),
        (['z', 'ɛ', 'mʲ', 'a'], ['z', 'E', 'my~', 'a']),
        (['t͡ʃʼ', 'e', 'n', 'ɪ', 'ɡ'], ['C"', 'e', 'n', 'i', 'g']),
        (['b', 'u͡ʊ', 'qʼ'], ['b', 'u', 'q"']),
        (['ʃ', 't', 'a', 'ɪ̯', 'n'], ['S', 't', 'a', 'i', 'n']),
        (['ʃ', 't', 'a͡ɪ̯', 'n'], ['S', 't', 'a', 'n']),
        (['ãː', 'kʰ'], ['a*', 'kh~']),
        (['ɖʰ', 'ʊ̄̃', 'ɽʰ', 'ə', 'n', 'aː'], ['dh~', 'u*', 'rh~', '3', 'n', 'a']),
        (['qʼʷ', 'e', 'tʼ'], ['q"w~', 'e', 't"']),
        (['w', 'a', 't͡sʼ'], ['w', 'a', 'c"']),
        (['ɧ', 'ɪ', 'nː'], ['Sx~', 'i', 'n']),
        (['ɔ', 'tː', 'ɧ', 'ɪ', 'l', 'ɪ', 'ɡ'], ['o', 't', 'Sx~', 'i', 'l', 'i', 'g']),
    ]

    for ipa_tokens, expected_asjp in cases:
        self.assertEqual(ipa2asjp(ipa_tokens), expected_asjp)
def test_ipa2asjp_strings(self):
    """
    IPA-compliant strings should be converted to the matching ASJP strings.
    """
    # Each entry pairs the IPA string with the expected ASJP string.
    cases = [
        ('', ''),
        ('sɫɤnt͡sɛ', 'sloncE'),
        ('zvɛzda', 'zvEzda'),
        ('zɛmʲa', 'zEmy~a'),
        ('ɔɡɤn', 'ogon'),
        ('javʲa sɛ', 'yavy~a sE'),
        ('motː', 'mot'),
        ('t͡ʃʼenɪɡ', 'C"enig'),
        ('bu͡ʊqʼ', 'buq"'),
        ('tʼu͡ʊlɡ', 't"ulg'),
        ('bu͡ʊt͡s', 'buc'),
        ('zɔnə', 'zon3'),
        ('vasɐ', 'vasa'),
        ('ʃtaɪ̯n', 'Stain'),
        ('ɛɐ̯də', 'Ead3'),
        ('fɔʏ̯ɐ', 'foia'),
        ('ãːkʰ', 'a*kh~'),
        ('ɖʰʊ̄̃ɽʰənaː', 'dh~u*rh~3na'),
        ('qʼʷetʼ', 'q"w~et"'),
        ('wat͡sʼ', 'wac"'),
        ('t͡sʼʷelin ttar', 'c"w~elin ttar'),
        ('kʼʷenkʼʷ', 'k"w~enk"w~'),
        ('ɧɪnː', 'Sx~in'),
        ('ɧæːɳa', 'Sx~Ena'),
        ('ɔtːɧɪlɪɡ', 'otSx~ilig'),
    ]

    for ipa_string, expected_asjp in cases:
        self.assertEqual(ipa2asjp(ipa_string), expected_asjp)
def test_asjp_ipa_asjp(self, tokens):
    """
    Round trip check: tokens generated by asjp_tokens() should come back
    unchanged after being converted to IPA and then to ASJP again.
    """
    as_ipa = asjp2ipa(tokens)
    back_to_asjp = ipa2asjp(as_ipa)
    self.assertEqual(back_to_asjp, tokens)