def ReadSymbolTsv(stream):
    """Builds zero query data for emoji-like symbols found in a symbol TSV.

    A row is used only when its symbol (column 1) decodes to exactly one code
    point inside U+2600..U+2767.  Each non-empty reading in column 2, plus the
    optional description (column 3) and additional description (column 4)
    when present and non-empty, becomes a key that maps to an entry for the
    symbol.  Rows with fewer than three columns are logged as a warning and
    skipped.

    Args:
      stream: Line stream of the symbol dictionary.  It is passed through
        code_generator_util.SkipLineComment and parsed as tab-separated
        columns.  The symbol column must support ``.decode('utf-8')``.

    Returns:
      A defaultdict(list) mapping each key to a list of util.ZeroQueryEntry
      objects (type ZERO_QUERY_TYPE_NONE, emoji type EMOJI_TYPE_NONE).
    """
    result = defaultdict(list)
    rows = code_generator_util.ParseColumnStream(
        code_generator_util.SkipLineComment(stream), delimiter='\t')
    for columns in rows:
        column_count = len(columns)
        if column_count < 3:
            logging.warning('format error: %s', '\t'.join(columns))
            continue

        symbol = columns[1]
        decoded_symbol = symbol.decode('utf-8')
        if len(decoded_symbol) != 1:
            # Multi-code-point (or empty) symbols are never candidates.
            continue

        # Pick the emoji-like symbols out of the symbol dictionary: from
        # U+2600 (black sun with rays) to U+2767 (rotated floral heart).
        # TODO(toshiyuki): Update the range if we need.
        code_point = ord(decoded_symbol)
        if code_point < 0x2600 or code_point > 0x2767:
            continue

        # Readings are separated by runs of ASCII spaces and/or full-width
        # spaces; \xe3\x80\x80 is the UTF-8 byte sequence of a full-width
        # space (U+3000).
        keys = [
            reading
            for reading in re.split(r'(?: |\xe3\x80\x80)+', columns[2].strip())
            if reading
        ]
        # Column 3 is the description (a category word) and column 4 is the
        # additional description; both are optional and may be empty.
        for optional_index in (3, 4):
            if column_count > optional_index and columns[optional_index]:
                keys.append(columns[optional_index])

        # A fresh entry is created for every key, in the same order as the
        # keys were collected (readings first, then the descriptions).
        for key in keys:
            result[key].append(
                util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_NONE, symbol,
                                    util.EMOJI_TYPE_NONE, 0))
    return result
def ReadZeroQueryRuleData(input_stream):
    """Reads zero query rule data from stream and returns zero query data.

    Every data line has the form ``key<TAB>value1,value2,...``.  Lines that
    start with '#' and lines that are empty once the trailing CR/LF
    characters are removed are skipped.  Fields after the second one are
    ignored.

    Args:
      input_stream: Iterable of str lines.

    Returns:
      A collections.defaultdict(list) mapping each key to a list of
      util.ZeroQueryEntry objects of type util.ZERO_QUERY_TYPE_NONE, one per
      comma-separated value, in input order.

    Raises:
      SystemExit: With exit code 1, after logging a critical 'format error'
        message, when a data line has no tab-separated value field.
    """
    zero_query_dict = collections.defaultdict(list)

    for line in input_stream:
        if line.startswith('#'):
            continue
        line = line.rstrip('\r\n')
        if not line:
            continue

        tokens = line.split('\t')
        if len(tokens) < 2:
            # Without this guard tokens[1] raises a bare IndexError that does
            # not say which line is broken.  Report it the same way the TSV
            # readers in this file report malformed rows.
            logging.critical('format error: %s', line)
            sys.exit(1)

        key = tokens[0]
        entries = zero_query_dict[key]
        for value in tokens[1].split(','):
            entries.append(
                util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_NONE, value,
                                    util.EMOJI_TYPE_NONE, 0))
    return zero_query_dict
def ReadZeroQueryNumberData(input_stream):
    """Reads zero query number data from stream and returns zero query data.

    Every data line has the form ``key<TAB>value1,value2,...`` as bytes.
    Lines that start with b'#' and lines that are empty once the trailing
    CR/LF bytes are removed are skipped.  Fields after the second one are
    ignored.

    Args:
      input_stream: Iterable of bytes lines.

    Returns:
      A defaultdict(list) mapping each key (bytes) to a list of
      util.ZeroQueryEntry objects of type
      util.ZERO_QUERY_TYPE_NUMBER_SUFFIX, one per comma-separated value, in
      input order.

    Raises:
      SystemExit: With exit code 1, after logging a critical 'format error'
        message, when a data line has no tab-separated value field.
    """
    zero_query_dict = defaultdict(list)

    for line in input_stream:
        if line.startswith(b'#'):
            continue
        line = line.rstrip(b'\r\n')
        if not line:
            continue

        tokens = line.split(b'\t')
        if len(tokens) < 2:
            # Without this guard tokens[1] raises a bare IndexError that does
            # not say which line is broken.  Report it the same way the TSV
            # readers in this file report malformed rows.
            logging.critical('format error: %s', line)
            sys.exit(1)

        key = tokens[0]
        entries = zero_query_dict[key]
        for value in tokens[1].split(b','):
            entries.append(
                util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_NUMBER_SUFFIX, value,
                                    util.EMOJI_TYPE_NONE, 0))
    return zero_query_dict
def ReadEmoticonTsv(stream):
    """Builds zero query data from an emoticon TSV stream.

    Each row must have exactly three tab-separated columns.  Column 0 is the
    emoticon and column 2 holds its readings, which are split with RE_SPLIT;
    column 1 is not used here.  Every non-empty reading becomes a key that
    maps to an entry for the emoticon.

    Args:
      stream: Line stream of the emoticon dictionary.  It is passed through
        code_generator_util.SkipLineComment before column parsing.

    Returns:
      A collections.defaultdict(list) mapping each reading to a list of
      util.ZeroQueryEntry objects of type util.ZERO_QUERY_TYPE_EMOTICON.

    Raises:
      SystemExit: With exit code 1, after logging a critical 'format error'
        message, when a row does not have exactly three columns.
    """
    result = collections.defaultdict(list)
    rows = code_generator_util.ParseColumnStream(
        code_generator_util.SkipLineComment(stream), delimiter='\t')
    for columns in rows:
        if len(columns) != 3:
            logging.critical('format error: %s', '\t'.join(columns))
            sys.exit(1)

        emoticon, _, readings = columns
        # filter(None, ...) drops the empty pieces re.split can produce.
        for reading in filter(None, re.split(RE_SPLIT, readings.strip())):
            result[reading].append(
                util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_EMOTICON, emoticon,
                                    util.EMOJI_TYPE_NONE, 0))
    return result
def ReadEmojiTsv(stream):
    """Builds zero query data from an emoji TSV stream.

    Each row must have exactly 13 tab-separated columns.  The columns read
    here are: 0 (space-separated code points), 1 (the emoji), 2 (Android PUA
    code point), 6 (readings), 8 (Japanese name) and 9-11 (docomo, SoftBank
    and KDDI descriptions).  Every distinct non-empty reading, taken from the
    readings column and from GetReadingsFromDescription() on the name and
    description columns, becomes a key that maps to an entry for the emoji.

    Args:
      stream: Line stream of the emoji dictionary.  It is passed through
        code_generator_util.SkipLineComment before column parsing.

    Returns:
      A collections.defaultdict(list) mapping each reading to a list of
      util.ZeroQueryEntry objects of type util.ZERO_QUERY_TYPE_EMOJI, each
      list sorted by (value, emoji_android_pua).

    Raises:
      SystemExit: With exit code 1, after logging a critical 'format error'
        message, when a row does not have exactly 13 columns.
    """
    result = collections.defaultdict(list)
    rows = code_generator_util.ParseColumnStream(
        code_generator_util.SkipLineComment(stream), delimiter='\t')
    for columns in rows:
        if len(columns) != 13:
            logging.critical('format error: %s', '\t'.join(columns))
            sys.exit(1)

        code_points = columns[0].split(' ')
        emoji = columns[1]  # Emoji code point.
        android_pua = ParseCodePoint(columns[2])
        japanese_name = columns[8]
        docomo_description = columns[9]
        softbank_description = columns[10]
        kddi_description = columns[11]

        if not android_pua or len(code_points) > 1:
            # Skip emoji that are not supported on old devices:
            # - Unicode 6.1 or later emoji, which have no PUA code point.
            # - Composite emoji, which consist of multiple code points.
            # NOTE: Some Unicode 6.0 emoji have no PUA either, so they are
            # omitted as well.
            # TODO(hsumita): Check the availability of such emoji and enable
            # it.
            logging.info('Skip %s', ' '.join(code_points))
            continue

        # Explicit readings first, then the readings derived from the name
        # and from each carrier description, in that order.
        keys = [
            reading
            for reading in re.split(RE_SPLIT, NormalizeString(columns[6]))
            if reading
        ]
        for text in (japanese_name, docomo_description,
                     softbank_description, kddi_description):
            keys.extend(GetReadingsFromDescription(text))

        # Bit flags recording where this emoji is available.
        emoji_type = util.EMOJI_TYPE_NONE
        if emoji:
            emoji_type |= util.EMOJI_TYPE_UNICODE
        if docomo_description:
            emoji_type |= util.EMOJI_TYPE_DOCOMO
        if softbank_description:
            emoji_type |= util.EMOJI_TYPE_SOFTBANK
        if kddi_description:
            emoji_type |= util.EMOJI_TYPE_KDDI

        # set() makes sure each reading gets at most one entry per row.
        for key in set(keys):
            if key:
                result[key].append(
                    util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_EMOJI, emoji,
                                        emoji_type, android_pua))

    # Sort emoji for each reading.
    for entries in result.values():
        entries.sort(key=lambda entry: (entry.value, entry.emoji_android_pua))
    return result