def test_canonical_forms(self):
    """
    Check canonical_forms() on a two-segment reading whose first segment
    ends in a small tsu and whose second segment starts with ぐ.

    The expected result pairs every listed ending of the first segment
    (the six restored endings plus the unchanged small tsu) with both
    the くり and ぐり forms of the second segment.
    """
    surface = (u'ゆっ', u'ぐり')

    first_options = []
    for ending in u'いちりきつくっ':
        first_options.append(u'ゆ' + ending)
    second_options = [u'くり', u'ぐり']

    # Compare as sets so the order in which forms are produced is ignored.
    expected = set(combinations(first_options, second_options))
    observed = set(alternations.canonical_forms(surface))
    self.assertEqual(observed, expected)
def canonical_forms(kana_segments):
    """
    Determine all possible canonical forms for a sequence of segments.

    The canonical form is taken to be the underlying form, before
    sequential voicing and sound euphony are applied.

    For each segment the surface form itself is always a candidate. Two
    kinds of extra candidates may be added:

      - A segment that is not the last one, is longer than one character
        and ends in a small tsu also gets one candidate per character of
        いちりきつく, substituted for that final small tsu.
      - A segment that is not the first one and whose first character the
        kana table reports as voiced also gets a copy of every candidate
        collected so far with its first character mapped through
        from_voiced.

    @param kana_segments: Reading segments in their surface form.
    @return: The result of combinations() over the per-segment candidate
        lists (presumably one candidate chosen per segment -- confirm
        against the combinations() helper).
    """
    table = kana_table.KanaTable.get_cached()
    final_index = len(kana_segments) - 1

    per_segment_options = []
    for position, surface in enumerate(kana_segments):
        options = [surface]

        if (position < final_index and len(surface) > 1
                and surface.endswith(u'っ')):
            # Can restore onbin cases: swap the trailing small tsu for
            # each candidate kana.
            stem = surface[:-1]
            for restored in u'いちりきつく':
                options.append(stem + restored)

        if position > 0 and table.is_voiced(surface[0]):
            # Can devoice. Build the devoiced copies from the candidates
            # gathered so far (including any onbin restorations) before
            # adding them to the same list.
            devoiced = [from_voiced[option[0]] + option[1:]
                        for option in options]
            options += devoiced

        per_segment_options.append(options)

    return combinations(*per_segment_options)
def surface_forms(reading_segments):
    """
    The counterpart of canonical_forms(). Takes a correct reading, and
    determines how it could be erroneously modified into various surface
    forms.

    The first segment is expanded with onbin_variants(); every later
    segment is expanded with rendaku_variants().

    @param reading_segments: The segments of the correct reading. The
        first segment is indexed unconditionally, so an empty sequence
        is not supported.
    @return: The result of combinations() over the per-segment variant
        collections.
    """
    per_segment_options = [onbin_variants(reading_segments[0])]
    for later_segment in reading_segments[1:]:
        per_segment_options.append(rendaku_variants(later_segment))
    return combinations(*per_segment_options)