def testAllCarrierSymbols(self):
    """Check emoji4unicode.xml against each carrier's full symbol set.

    Every carrier symbol must have exactly one round-trip mapping in
    emoji4unicode.xml. The test walks all symbols, records each carrier code
    point that is mapped round-trip (failing on a repeat), and finally
    requires the recorded set to equal the carrier data's all_uni set.
    """
    # One set of round-trip carrier code points per carrier.
    round_trip_by_carrier = {"docomo": set(), "kddi": set(), "softbank": set()}
    for symbol in emoji4unicode.GetSymbols():
        for carrier in round_trip_by_carrier:
            code_point = symbol.GetCarrierUnicode(carrier)
            # No mapping at all, or a ">"-prefixed mapping, is not round-trip.
            if not code_point or code_point.startswith(">"):
                continue
            seen = round_trip_by_carrier[carrier]
            self.failIf(
                code_point in seen,
                "emoji4unicode.xml has two round-trip mappings with "
                "%s %s" % (carrier, code_point))
            seen.add(code_point)
    for carrier in round_trip_by_carrier:
        e4u_set = round_trip_by_carrier[carrier]
        cd_set = emoji4unicode.all_carrier_data[carrier].all_uni
        only_in_carrier_data = cd_set - e4u_set
        only_in_e4u = e4u_set - cd_set
        message = ("Mismatched all_uni sets for %s:\n"
                   "Missing from emoji4unicode.xml: %s\n"
                   "Missing from CarrierData: %s" %
                   (carrier, only_in_carrier_data, only_in_e4u))
        self.assertEqual(e4u_set, cd_set, message)
def main():
    """Print UnicodeSet patterns for code points with emoji variation sequences.

    First prints, in C++ and Java escaping, the set of Unicode Standard code
    points that have emoji-style Variation Selector sequences. Then prints the
    same two forms for the Google Private Use Area code points of the
    emoji4unicode symbols whose Unicode has a variation sequence.

    Raises:
      ValueError: if the two sets do not have the same number of code points.
    """
    standardized_variants.Load()
    emoji_vs_code_points = standardized_variants.GetSetOfUnicodeWithEmojiVS()
    print("")
    print("Unicode Standard code points with emoji-style " +
          "Variation Selector sequences:")
    print("")
    print("C++: " + SetToUnicodeSetPattern(emoji_vs_code_points, _EscapeForCpp))
    print("")
    print("Java: " + SetToUnicodeSetPattern(emoji_vs_code_points,
                                            _EscapeForJava))

    emoji4unicode.Load()
    pua_vs_code_points = set()
    for symbol in emoji4unicode.GetSymbols():
        if symbol.UnicodeHasVariationSequence():
            # Get the Google Private Use Area code point.
            pua = symbol.GetCarrierUnicode("google")
            # NOTE(review): other carrier checks in this code base skip
            # mappings prefixed with ">"; confirm that "<" is the prefix
            # intended here.
            if not pua.startswith("<"):
                # Round-trip, must be a single code point.
                pua_vs_code_points.add(int(pua, 16))
    print("")
    print("Google PUA code points corresponding to Unicode Standard " +
          "code points with emoji-style Variation Selector sequences:")
    print("")
    print("C++: " + SetToUnicodeSetPattern(pua_vs_code_points, _EscapeForCpp))
    print("")
    print("Java: " + SetToUnicodeSetPattern(pua_vs_code_points, _EscapeForJava))

    if len(emoji_vs_code_points) != len(pua_vs_code_points):
        # The two literals are joined by implicit concatenation so that "%"
        # formats the whole message. With "+" the operator precedence applied
        # "%" to the second literal only, which raised TypeError.
        raise ValueError(
            "Mismatch: %d standard code points with VS16 but "
            "%d corresponding Google PUA code points" %
            (len(emoji_vs_code_points), len(pua_vs_code_points)))
def testSymbolNames(self):
    """Compare names of proposal symbols with the Unicode character names.

    A symbol with a Unicode code point must carry the same name as that code
    point (when the code point has a name in the names table). A symbol
    without one must not start a word with a digit and must not reuse the
    name of an existing character. All problems are printed and collected;
    the test fails at the end if any were found.
    """
    names_by_code_point = unicode_names.code_points_to_names
    code_points_by_name = unicode_names.names_to_code_points
    differences = []
    collisions = []
    for symbol in emoji4unicode.GetSymbols():
        if not symbol.in_proposal:
            continue
        name = symbol.GetName()
        uni = symbol.GetUnicode()
        if not uni:
            # The proposal was accepted, the Emoji symbols were added to
            # Unicode 6.0. Treat a proposed code point that is now encoded
            # like a regular one, so the .xml data file needs no change.
            # NOTE(review): the age is compared as a string, so an age such
            # as "10.0" would sort below "6.0" -- confirm GetAge()'s format.
            proposed_uni = symbol.GetProposedUnicode()
            if proposed_uni and unicode_age.GetAge(proposed_uni) >= "6.0":
                uni = proposed_uni
        if not uni:
            # Still unencoded: the name must be well-formed and unused.
            self.failIf(_INITIAL_DIGIT_RE.search(name),
                        "name %s of e-%s contains a word-initial digit" %
                        (name, symbol.id))
            existing = code_points_by_name.get(name)
            if existing:
                msg = "name of e-%s %s collides with U+%s" % (
                    symbol.id, name, existing)
                print(msg)
                collisions.append(msg)
            continue
        unicode_name = names_by_code_point.get(uni)
        if unicode_name and (name != unicode_name):
            msg = "name of e-%s %s differs from U+%s %s" % (
                symbol.id, name, uni, unicode_name)
            print(msg)
            differences.append(msg)
    self.failIf(differences, differences)
    self.failIf(collisions, collisions)
def testUniqueNames(self):
    """Verify that no two new symbols share a name.

    Only symbols that are in the proposal and have no Unicode code point
    are considered.
    """
    seen_names = set()
    for symbol in emoji4unicode.GetSymbols():
        if symbol.in_proposal and not symbol.GetUnicode():
            name = symbol.GetName()
            self.failIf(name in seen_names, "duplicate name: %s" % name)
            seen_names.add(name)
def testSymbolIDs(self):
    """Verify that every symbol ID is well-formed and occurs only once.

    A well-formed ID matches the pattern of three characters drawn from
    0-9 and A-F.
    """
    is_well_formed = re.compile(r"^[0-9A-F]{3,3}$").match
    seen_ids = set()
    for symbol in emoji4unicode.GetSymbols():
        self.assert_(is_well_formed(symbol.id),
                     "Bad symbol ID %s" % symbol.id)
        already_seen = symbol.id in seen_ids
        self.failIf(already_seen, "Duplicate symbol ID %s" % symbol.id)
        seen_ids.add(symbol.id)
def _WriteCmapXML(writer):
    """Write the character map XML for the new symbols, then close the writer.

    Collects a (GetFontUnicode() value, glyph ID) pair for every symbol that
    has a glyph ID, is in the proposal and has no Unicode code point. The
    pairs are sorted and written as one <map> element each, between
    CMAP_HEADER and CMAP_FOOTER.

    Args:
      writer: Object with write() and close() methods. It is closed before
        this function returns, also when collecting or writing fails.
    """
    try:
        char_maps = []
        for symbol in emoji4unicode.GetSymbols():
            glyph_id = symbol.GetGlyphRefID()
            if glyph_id and symbol.in_proposal and not symbol.GetUnicode():
                char_maps.append((symbol.GetFontUnicode(), glyph_id))
        char_maps.sort()
        writer.write(CMAP_HEADER)
        for font_unicode, glyph_id in char_maps:
            writer.write(' <map charValue="0x%s" glyphRefID="%d"/>\n' %
                         (font_unicode, glyph_id))
        writer.write(CMAP_FOOTER)
    finally:
        # Release the writer even if an exception interrupts the output.
        writer.close()
def testProposedCodePoints(self):
    """Verify that proposed code points are unused and not proposed twice.

    Symbols without a proposed code point are skipped, as are proposed code
    points whose Unicode age is reported as "6.0" or later.
    """
    named_code_points = unicode_names.code_points_to_names
    seen_proposed = set()
    for symbol in emoji4unicode.GetSymbols():
        candidate = symbol.GetProposedUnicode()
        if not candidate:
            continue
        # NOTE(review): the age is compared as a string, so an age such as
        # "10.0" would sort below "6.0" -- confirm GetAge()'s format.
        if unicode_age.GetAge(candidate) >= "6.0":
            continue
        self.failIf(candidate in named_code_points,
                    "e-%s proposed U+%s already taken" %
                    (symbol.id, candidate))
        self.failIf(candidate in seen_proposed,
                    "e-%s proposed U+%s duplicate" % (symbol.id, candidate))
        seen_proposed.add(candidate)
def _WritePostXML(writer):
    """Write the PostScript name XML for all glyphs, then close the writer.

    Every symbol with a glyph ID yields one (glyph ID, name) pair. For a
    symbol that is in the proposal and has no Unicode code point, the name is
    "uni" + GetFontUnicode() + "." + the symbol name, lowercased with spaces
    and hyphens turned into underscores. All other glyphs are named
    ".notdef". The pairs are sorted and written as <PostScriptName> elements
    between POST_HEADER and POST_FOOTER.

    Args:
      writer: Object with write() and close() methods. It is closed before
        this function returns, also when collecting or writing fails.
    """
    try:
        postscriptnames = []
        for symbol in emoji4unicode.GetSymbols():
            glyph_id = symbol.GetGlyphRefID()
            if not glyph_id:
                continue
            if symbol.in_proposal and not symbol.GetUnicode():
                name = symbol.GetName().replace(" ", "_").replace("-", "_")
                name = "uni" + symbol.GetFontUnicode() + "." + name.lower()
            else:
                name = ".notdef"
            postscriptnames.append((glyph_id, name))
        postscriptnames.sort()
        writer.write(POST_HEADER)
        for glyph_id, name in postscriptnames:
            writer.write(
                ' <PostScriptName glyphRefID="%d" NameString="%s" />\n' %
                (glyph_id, name))
        writer.write(POST_FOOTER)
    finally:
        # Release the writer even if an exception interrupts the output.
        writer.close()
def testGlyphIDs(self):
    """Verify that glyph IDs are unique, sufficient and contiguous.

    Each glyph ID must be at least 4 and must not repeat. A symbol may lack
    a glyph ID only if it is outside the proposal or already has a Unicode
    code point. Together the IDs must fill the whole range from the smallest
    to the largest one.
    """
    seen_ids = set()
    for symbol in emoji4unicode.GetSymbols():
        glyph_id = symbol.GetGlyphRefID()
        if glyph_id:
            self.assert_(glyph_id >= 4, "Glyph ID %d less than 4" % glyph_id)
            self.failIf(glyph_id in seen_ids,
                        "Duplicate glyph ID %d" % glyph_id)
            seen_ids.add(glyph_id)
        else:
            # Not every symbol has a glyph ID, but new symbols need one.
            self.assert_(
                not symbol.in_proposal or symbol.GetUnicode(),
                "Missing glyph ID for symbol e-%s "
                "proposed for new encoding" % symbol.id)
    lowest = min(seen_ids)
    highest = max(seen_ids)
    expected_ids = set(range(lowest, highest + 1))
    self.assert_(seen_ids == expected_ids,
                 "Missing glyph IDs: %s" % (expected_ids - seen_ids))
def testShiftJis(self):
    """Check for source separation with standard Shift-JIS.

    A symbol must not be unified with a character from the JIS X 0208 part
    of Shift-JIS, which is the part with lead bytes below 0xF0.

    Only round-trip mappings are considered because only those map the same
    characters between Unicode and Shift-JIS. (Fallbacks go to best-fit
    *similar* characters.)

    Japanese cell phone carriers encode Emoji symbols with Shift-JIS VDC
    codes.
    """
    errors = []
    for symbol in emoji4unicode.GetSymbols():
        uni = symbol.GetUnicode()
        if not uni:
            continue
        if uni not in self.__shift_jis_ucm.round_trip_code_points:
            continue
        shift_jis = self.__shift_jis_ucm.from_unicode.get(uni)
        # A code that sorts below "F" is reported as a conflict.
        if shift_jis and shift_jis < "F":
            msg = ("source separation error: e-%s = U+%s = Shift-JIS-%s" %
                   (symbol.id, uni, shift_jis))
            print(msg)
            errors.append(msg)
    self.failIf(errors, errors)
def main():
    """Print statistics on which carrier supplies each new symbol's image.

    Counts, per carrier, the images of symbols that are in the proposal and
    have no Unicode code point. For images taken from DoCoMo symbols with a
    number of 300 or more (Expansion Pictograms), prints one line per symbol
    saying whether KDDI or SoftBank round-trip mappings are available, and
    counts those that have neither. The totals are printed at the end.
    """
    emoji4unicode.Load()
    docomo_data = emoji4unicode.all_carrier_data["docomo"]
    img_from_counts = {"docomo": 0, "kddi": 0, "softbank": 0, "google": 0}
    docomo_exp = 0
    only_docomo_exp = 0
    for symbol in emoji4unicode.GetSymbols():
        if not symbol.in_proposal or symbol.GetUnicode():
            continue
        img_from = symbol.ImageFromWhichCarrier()
        img_from_counts[img_from] += 1
        if img_from != "docomo":
            continue
        docomo_symbol = docomo_data.SymbolFromUnicode(
            symbol.GetCarrierUnicode("docomo"))
        if docomo_symbol.number < 300:
            continue
        # Expansion Pictogram
        docomo_exp += 1
        # A mapping counts only if it exists and is not ">"-prefixed.
        kddi_uni = symbol.GetCarrierUnicode("kddi")
        has_kddi = bool(kddi_uni) and not kddi_uni.startswith(">")
        softbank_uni = symbol.GetCarrierUnicode("softbank")
        has_softbank = bool(softbank_uni) and not softbank_uni.startswith(">")
        msg = "e-%s img_from=docomo" % symbol.id
        if has_kddi or has_softbank:
            if has_kddi:
                msg += ", kddi available"
            if has_softbank:
                msg += ", softbank available"
        else:
            msg += " Expansion Pictogram only"
            only_docomo_exp += 1
        print(msg)
    print("Number of symbol images from which carrier:")
    print(img_from_counts)
    print("Number of symbol images from DoCoMo Expansion Pictograms: %d" %
          docomo_exp)
    print("Number of these symbol images where there are no KDDI or SoftBank "
          "round-trip mappings: %d" % only_docomo_exp)
def testSymbolNames(self):
    """Compare symbol names with the Unicode character names.

    A symbol with a Unicode code point must carry the same name as that code
    point (when the code point has a name in the names table). A symbol
    without one must not reuse the name of an existing character. All
    problems are printed and collected; the test fails at the end if any
    were found.
    """
    names_by_code_point = unicode_names.code_points_to_names
    code_points_by_name = unicode_names.names_to_code_points
    differences = []
    collisions = []
    for symbol in emoji4unicode.GetSymbols():
        name = symbol.GetName()
        uni = symbol.GetUnicode()
        if not uni:
            # No code point yet: the name must not already be in use.
            existing = code_points_by_name.get(name)
            if existing:
                msg = "name of e-%s %s collides with U+%s" % (
                    symbol.id, name, existing)
                print(msg)
                collisions.append(msg)
            continue
        unicode_name = names_by_code_point.get(uni)
        if unicode_name and (name != unicode_name):
            msg = "name of e-%s %s differs from U+%s %s" % (
                symbol.id, name, uni, unicode_name)
            print(msg)
            differences.append(msg)
    self.failIf(differences, differences)
    self.failIf(collisions, collisions)