Example #1
def run(align_func, dataset_path, output_path):
    """
	Read the word pairs from a psa dataset, align them using the given func,
	and write an output psa dataset.

	If there is a word that cannot be converted to ASJP, an obviously wrong
	alignment is output.
	"""
    dataset = AlignmentsDataset(dataset_path)
    output = ['{} (PMI alignment)'.format(dataset.header)]

    for word_a, word_b, original_al in dataset.data:
        try:
            asjp_a = ipa2asjp(word_a.ipa)
            asjp_b = ipa2asjp(word_b.ipa)
        except ValueError:
            output.extend([
                original_al.comment,
                '\t'.join([word_a.lang, '-'] + list(word_a.ipa)),
                '\t'.join([word_b.lang, '-'] + list(word_b.ipa)), ''
            ])
            continue

        for asjp_al in align_func(asjp_a, asjp_b):
            ipa_corr = convert_alignment(word_a.ipa, word_b.ipa, asjp_al.corr)
            output.extend([
                original_al.comment,
                '\t'.join([word_a.lang] + [pair[0] for pair in ipa_corr]),
                '\t'.join([word_b.lang] + [pair[1] for pair in ipa_corr]), ''
            ])

    with open_for_writing(output_path) as f:
        f.write('\n'.join(output))
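A minimal usage sketch for run(). The aligner import and both file paths below are placeholders, not taken from the source; the only requirement visible in the code above is that align_func(asjp_a, asjp_b) returns an iterable of alignment tuples exposing a .corr attribute.

# hypothetical wiring; module name, function name and paths are assumptions
from pmi import align_pmi   # assumed: align_pmi(asjp_a, asjp_b) -> iterable of alignments with .corr

run(align_pmi, 'datasets/covington.psa', 'output/covington-pmi.psa')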
Example #2
def write_alignments(alignments, path=None, header='OUTPUT'):
	"""
	Write a list of (Word, Word, Alignment) tuples to a psa file. The last
	element of each tuple should be an Alignment named tuple from either this
	or the align module.

	If path is None or '-', use stdout.
	"""
	lines = [header]

	for word_a, word_b, alignment in alignments:
		field_size = str(max(len(word_a.lang), len(word_b.lang)))
		lang_a = ('{:.<'+ field_size +'}').format(word_a.lang)
		lang_b = ('{:.<'+ field_size +'}').format(word_b.lang)

		align_a = [token if token else '-' for token, _ in alignment.corr]
		align_b = [token if token else '-' for _, token in alignment.corr]

		line_a = '\t'.join([lang_a] + align_a)
		line_b = '\t'.join([lang_b] + align_b)

		if hasattr(alignment, 'comment'):
			comment = alignment.comment
		else:
			comment = str(word_a.concept)

		lines.extend([comment, line_a, line_b, ''])

	with open_for_writing(path) as f:
		f.write('\n'.join(lines))
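A sketch of calling write_alignments() with hand-built tuples. The real code expects the Word and Alignment named tuples from this or the align module; the lookalikes below are built only to make the expected shapes explicit, with the field names (lang, concept, ipa; corr, comment) inferred from the attribute accesses above.

# illustrative only: namedtuple shapes inferred from how the function reads them
from collections import namedtuple

Word = namedtuple('Word', 'lang concept ipa')
Alignment = namedtuple('Alignment', 'corr comment')

pair = (Word('deu', 'water', ('v', 'a', 's', 'ɐ')),
        Word('eng', 'water', ('w', 'ɔː', 't', 'ə')),
        Alignment(corr=[('v', 'w'), ('a', 'ɔː'), ('s', 't'), ('ɐ', 'ə')],
                  comment='water'))

write_alignments([pair], path='-')  # path None or '-' prints to stdout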
Example #3
def write_pairs(path, word_pairs):
	"""
	Write a [] of pairs of IPA sequences to a file, one tab-separated pair per
	line, with the IPA tokens separated by spaces.
	"""
	output = []

	for pair in word_pairs:
		output.append('\t'.join([' '.join(seq) for seq in pair]))

	with open_for_writing(path) as f:
		f.write('\n'.join(output))
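For illustration, a call with two tokenised IPA pairs; the file name is arbitrary and only write_pairs() plus its open_for_writing helper from the surrounding module are needed.

# each pair becomes one line: the two words tab-separated, tokens space-separated
write_pairs('pairs.tsv', [
    (['v', 'a', 's', 'ɐ'], ['w', 'ɔː', 't', 'ə']),
    (['h', 'a', 'n', 't'], ['h', 'æ', 'n', 'd']),
])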
Example #4
def write_words(words, path=None, dialect='excel-tab',
				header=WordsDataset.DEFAULT_COLUMNS, tokenised=False):
	"""
	Write the words ([] of Word tuples) to a csv file using the given dialect.
	If path is None or '-', use stdout.

	The header arg should be a list of the headings for the language, concept,
	and transcription columns, respectively.

	If tokenised is set to True, separate the IPA tokens with spaces.
	"""
	joiner = ' ' if tokenised else ''

	with open_for_writing(path, newline='') as f:
		writer = csv.writer(f, dialect=dialect)
		writer.writerow(header)

		for word in words:
			writer.writerow([word.lang, word.concept, joiner.join(word.ipa)])
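A usage sketch for write_words(); the Word namedtuple fields (lang, concept, ipa) are inferred from the writerow call, and the output file name is a placeholder.

# writes a tab-separated csv: a header row, then one row per word
from collections import namedtuple

Word = namedtuple('Word', 'lang concept ipa')

words = [Word('deu', 'water', ('v', 'a', 's', 'ɐ')),
         Word('eng', 'water', ('w', 'ɔː', 't', 'ə'))]

write_words(words, path='words.tsv', tokenised=True)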
Example #5
def run_sca(dataset_path, output_path):
    """
	Read the word pairs from a psa dataset, align them using the SCA algorithm,
	and write an output psa dataset.
	"""
    dataset = AlignmentsDataset(dataset_path, keep_digits=True)
    output = ['{} (SCA alignment)'.format(dataset.header)]

    for word_a, word_b, alignment in dataset.data:
        sca = Pairwise(word_a.ipa,
                       word_b.ipa,
                       merge_vowels=False,
                       merge_geminates=False)
        sca.align()
        align_a, align_b, _ = sca.alignments[0]

        output.extend([
            alignment.comment, '\t'.join([word_a.lang] + align_a),
            '\t'.join([word_b.lang] + align_b), ''
        ])

    with open_for_writing(output_path) as f:
        f.write('\n'.join(output))
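Calling run_sca() takes only the input and output paths; Pairwise presumably comes from lingpy, so that package would need to be installed alongside the module's own helpers. The file names below are placeholders.

# aligns each word pair with SCA and writes a new psa file
run_sca('datasets/covington.psa', 'output/covington-sca.psa')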