def test_expand_kXHC1983(expanded_data, ucn, fieldval, expected):
    r"""Verify expansion of the ``kXHC1983`` field.

    Each pīnyīn reading is preceded by the character's location(s) in the
    dictionary, separated from the reading by ":" (colon); multiple locations
    for a given reading are separated by "," (comma); multiple
    "location: reading" values are separated by " " (space). Each location
    reference is of the form /[0-9]{4}\.[0-9]{3}\*?/ . The number preceding
    the period is the page number, zero-padded to four digits. The first two
    digits of the number following the period are the entry's position on the
    page, zero-padded. The third digit is 0 for a main entry and greater than
    0 for a parenthesized variant of the main entry. A trailing "*" (asterisk)
    on the location indicates an encoded variant substituted for an unencoded
    character (see below).

    As of the present writing (Unicode 5.1), the XHC source data contains 204
    unencoded characters (198 of which were represented by PUA or CJK
    Compatibility [or in one case, by non-CJK, see below] characters), for the
    most part simplified variants. Each of these 198 characters in the source
    is replaced by one or more encoded variants (references in all 204 cases
    are marked with a trailing "*"; see above). Many of these unencoded forms
    are already in the pipeline for future encoding, and future revisions of
    this data will eliminate trailing asterisks from mappings.
    """
    # Locate the record for this codepoint in the fixture-provided data.
    matching = next(entry for entry in expanded_data if entry['ucn'] == ucn)
    assert matching['kXHC1983'] == expected
    # The standalone field expander must agree with the full-pipeline result.
    assert expansion.expand_field('kXHC1983', fieldval) == expected
def test_expand_kXHC1983(expanded_data, ucn, fieldval, expected):
    # NOTE(review): this is a duplicate of an earlier identical test
    # definition in this file; the later definition shadows the earlier one.
    # FIX: the docstring contains regex backslashes (\. and \*); in a plain
    # string these are invalid escape sequences, which raise SyntaxWarning on
    # Python 3.12+ and will become errors — the docstring must be a raw string.
    r"""Verify expansion of the ``kXHC1983`` field.

    Each pīnyīn reading is preceded by the character's location(s) in the
    dictionary, separated from the reading by ":" (colon); multiple locations
    for a given reading are separated by "," (comma); multiple
    "location: reading" values are separated by " " (space). Each location
    reference is of the form /[0-9]{4}\.[0-9]{3}\*?/ . The number preceding
    the period is the page number, zero-padded to four digits. The first two
    digits of the number following the period are the entry's position on the
    page, zero-padded. The third digit is 0 for a main entry and greater than
    0 for a parenthesized variant of the main entry. A trailing "*" (asterisk)
    on the location indicates an encoded variant substituted for an unencoded
    character (see below).

    As of the present writing (Unicode 5.1), the XHC source data contains 204
    unencoded characters (198 of which were represented by PUA or CJK
    Compatibility [or in one case, by non-CJK, see below] characters), for the
    most part simplified variants. Each of these 198 characters in the source
    is replaced by one or more encoded variants (references in all 204 cases
    are marked with a trailing "*"; see above). Many of these unencoded forms
    are already in the pipeline for future encoding, and future revisions of
    this data will eliminate trailing asterisks from mappings.
    """
    # Locate the record for this codepoint in the fixture-provided data.
    item = [i for i in expanded_data if i['ucn'] == ucn][0]
    assert item['kXHC1983'] == expected
    # The standalone field expander must agree with the full-pipeline result.
    assert expansion.expand_field('kXHC1983', fieldval) == expected
def expand_delimiters(normalized_data):
    """Return expanded multi-value fields in UNIHAN.

    :param normalized_data: Expects data in list of hashes, per
        :meth:`process.normalize`
    :type normalized_data: list of dict
    :returns: Items which have fields with delimiters and custom separation
        rules, will be expanded. Including multi-value fields not using both
        fields (so all fields stay consistent).
    :rtype: list of dict
    """
    # FIX: the docstring previously documented ``:param normalized:`` /
    # ``:type normalized:`` while the actual parameter is ``normalized_data``.
    for char in normalized_data:
        # Iterating the dict directly is equivalent to .keys(); values are
        # reassigned in place, so the key set never changes mid-iteration.
        for field in char:
            if not char[field]:
                # Nothing to expand for empty/None values.
                continue
            char[field] = expansion.expand_field(field, char[field])
    return normalized_data
def expand_delimiters(normalized_data):
    """
    Return expanded multi-value fields in UNIHAN.

    Parameters
    ----------
    normalized_data : list of dict
        Expects data in list of hashes, per :meth:`process.normalize`

    Returns
    -------
    list of dict :
        Items which have fields with delimiters and custom separation rules,
        will be expanded. Including multi-value fields not using both fields
        (so all fields stay consistent).
    """
    for record in normalized_data:
        # Values are replaced under existing keys, so mutating while
        # iterating items() is safe (the key set never changes).
        for field, value in record.items():
            if value:
                record[field] = expansion.expand_field(field, value)
    return normalized_data
def test_expand_kCCCII(expanded_data, ucn, fieldval, expected):
    """Verify expansion of the ``kCCCII`` field for a given codepoint."""
    # Locate the record for this codepoint in the fixture-provided data.
    matching = next(entry for entry in expanded_data if entry['ucn'] == ucn)
    assert matching['kCCCII'] == expected
    # The standalone field expander must agree with the full-pipeline result.
    assert expansion.expand_field('kCCCII', fieldval) == expected
def test_expand_kIRG_KPSource(expanded_data, field, ucn, fieldval, expected):
    """Verify expansion of a parametrized IRG KP-source field."""
    # Locate the record for this codepoint in the fixture-provided data.
    matching = next(entry for entry in expanded_data if entry['ucn'] == ucn)
    assert matching[field] == expected
    # The standalone field expander must agree with the full-pipeline result.
    assert expansion.expand_field(field, fieldval) == expected