def from_aligned_file(syllabus_name, aligned_file, output_file,
        missing_log='missing.log'):
    """
    Copy the alignments matching a syllabus's words into an output file.

    The expected entries are the (surface, reading) pairs of every word
    yielded by align_core.iter_words(syllabus_name). Each alignment loaded
    from aligned_file whose (grapheme, phoneme) pair is expected is written
    to output_file, one line per alignment, using alignment.to_line().

    Expected pairs that never turn up among the alignments are written, in
    sorted order, to missing_log as lines of the form
    "surface reading:surface reading".

    :param syllabus_name: Syllabus identifier handed to
        align_core.iter_words().
    :param aligned_file: Source of alignments, handed to AlignedFile().
    :param output_file: Destination for the matching alignments.
    :param missing_log: Destination for expected pairs that were not found.
        It is only created when at least one pair is missing.
    """
    # NOTE(review): four steps are announced but only three are logged
    # below -- confirm against the logger's contract.
    _log.start('Extracting gp-aligned words', nSteps=4)

    _log.log('Building set of expected words')
    include_set = set(
        (w.surface, w.reading)
        for w in align_core.iter_words(syllabus_name)
    )

    _log.log('Loading alignments')
    alignments = AlignedFile(aligned_file)

    _log.log('Saving alignments')
    o_stream = sopen(output_file, 'w')
    try:
        for alignment in alignments:
            key = (alignment.grapheme, alignment.phoneme)
            if key in include_set:
                o_stream.write(alignment.to_line() + '\n')
                # Dropping the key means a later alignment with the same
                # pair is skipped, and whatever remains in the set at the
                # end is exactly the list of missing entries.
                include_set.remove(key)
    finally:
        # Release the handle even if reading or formatting fails part-way.
        o_stream.close()

    if not include_set:
        _log.finish('All entries found')
        return

    _log.finish('%d entries not found (see %s)'
            % (len(include_set), missing_log))
    o_stream = sopen(missing_log, 'w')
    try:
        for surface, reading in sorted(include_set):
            o_stream.write('%s %s:%s %s\n'
                    % (surface, reading, surface, reading))
    finally:
        o_stream.close()
def to_alignment_format(syllabus_name, output_file):
    """
    Write a syllabus's words to a file as "surface reading" lines.

    Only words yielded by align_core.iter_words(syllabus_name) that have a
    non-empty reading and for which has_kanji() is true are written; every
    other word is skipped.

    :param syllabus_name: Syllabus identifier handed to
        align_core.iter_words().
    :param output_file: Destination file, opened for writing via sopen().
    """
    o_stream = sopen(output_file, 'w')
    try:
        for word in align_core.iter_words(syllabus_name):
            if word.reading and word.has_kanji():
                o_stream.write('%s %s\n' % (word.surface, word.reading))
    finally:
        # Release the handle even if iterating the syllabus fails part-way.
        o_stream.close()