Exemplo n.º 1
0
def ReadSymbolTsv(stream):
    """Builds zero query data for emoji-like symbols found in a symbol TSV.

    Comment lines are dropped and every remaining row is split on tabs.  Rows
    with fewer than three columns are reported with a warning and ignored.
    A row is used only when its symbol (column 1) decodes from UTF-8 to a
    single character whose code point lies in 0x2600..0x2767.

    Args:
        stream: line stream of the symbol TSV data.

    Returns:
        A defaultdict(list).  Keys are the readings from column 2 (split on
        half-width and full-width spaces) plus the non-empty contents of
        columns 3 and 4 when those columns exist; each value is a list of
        util.ZeroQueryEntry objects carrying the symbol.
    """
    result = defaultdict(list)
    rows = code_generator_util.ParseColumnStream(
        code_generator_util.SkipLineComment(stream), delimiter='\t')
    for row in rows:
        if len(row) < 3:
            logging.warning('format error: %s', '\t'.join(row))
            continue

        symbol = row[1]

        # Only single-character symbols are candidates.
        decoded = symbol.decode('utf-8')
        if len(decoded) != 1:
            continue

        # Selects emoji symbols from symbol dictionary.
        # TODO(toshiyuki): Update the range if we need.
        # from "☀"(black sun with rays) to "❧"(rotated floral heart).
        code_point = ord(decoded)
        if code_point < 0x2600 or code_point > 0x2767:
            continue

        # Collect every key this symbol is registered under, readings first.
        # \xe3\x80\x80 is a full-width space
        keys = [reading
                for reading in re.split(r'(?: |\xe3\x80\x80)+',
                                        row[2].strip())
                if reading]
        # Optional columns: description ("天気", etc.) at index 3 and
        # additional_description ("傘", etc.) at index 4.
        for index in (3, 4):
            if len(row) > index and row[index]:
                keys.append(row[index])

        # A separate entry object is created for each key.
        for key in keys:
            result[key].append(
                util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_NONE, symbol,
                                    util.EMOJI_TYPE_NONE, 0))

    return result
Exemplo n.º 2
0
def ReadEmojiTsv(stream):
    """Builds zero query data for emoji from an emoji TSV stream.

    Comment lines are dropped and every remaining row is split on tabs.  Each
    row must have exactly 13 columns; otherwise the row is logged as a format
    error and the process exits with status 1.

    Args:
        stream: line stream of the emoji TSV data.

    Returns:
        A defaultdict(list) mapping each distinct, non-empty reading of an
        emoji to a list of util.ZeroQueryEntry objects.  Every list is sorted
        by (value, emoji_android_pua).
    """
    result = defaultdict(list)
    rows = code_generator_util.ParseColumnStream(
        code_generator_util.SkipLineComment(stream), delimiter='\t')
    for row in rows:
        if len(row) != 13:
            logging.critical('format error: %s', '\t'.join(row))
            sys.exit(1)

        # Emoji code point.
        emoji = row[1]
        android_pua = ParseCodePoint(row[2])
        # Columns 8..11: Japanese name and the three carrier descriptions.
        japanese_name, docomo, softbank, kddi = row[8:12]

        # Readings come from column 6 first, then from each description.
        # \xe3\x80\x80 is a full-width space
        keys = [reading
                for reading in re.split(r'(?: |\xe3\x80\x80)+',
                                        NormalizeString(row[6]))
                if reading]
        for text in (japanese_name, docomo, softbank, kddi):
            keys.extend(GetReadingsFromDescription(text))

        # OR together one type flag for every non-empty source column.
        emoji_type = util.EMOJI_TYPE_NONE
        flag_table = (
            (emoji, util.EMOJI_TYPE_UNICODE),
            (docomo, util.EMOJI_TYPE_DOCOMO),
            (softbank, util.EMOJI_TYPE_SOFTBANK),
            (kddi, util.EMOJI_TYPE_KDDI),
        )
        for source, flag in flag_table:
            if source:
                emoji_type |= flag

        # set() removes duplicates so each key gets one entry per row.
        for key in set(keys):
            if key:
                result[key].append(
                    util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_EMOJI, emoji,
                                        emoji_type, android_pua))

    # Sort emoji for each reading.
    for entries in result.values():
        entries.sort(key=lambda entry: (entry.value, entry.emoji_android_pua))

    return result
Exemplo n.º 3
0
def ReadZeroQueryNumberData(input_stream):
    """Reads zero query number data from stream and returns zero query data.

    Each data line has the form "<key><TAB><value>[,<value>...]".  Lines that
    start with '#' and lines that are empty after stripping the line ending
    are skipped.  Columns after the second one are ignored.

    A data line without a value column is a format error: the line is logged
    and the process exits with status 1, the same way the emoji and emoticon
    readers treat malformed rows.

    Args:
        input_stream: iterable of text lines.

    Returns:
        A defaultdict(list) mapping each key to a list of util.ZeroQueryEntry
        objects of type util.ZERO_QUERY_TYPE_NUMBER_SUFFIX, one per
        comma-separated value, in input order.
    """
    zero_query_dict = defaultdict(list)

    for line in input_stream:
        if line.startswith('#'):
            continue
        line = line.rstrip('\r\n')
        if not line:
            continue

        tokens = line.split('\t')
        if len(tokens) < 2:
            # Without this check tokens[1] raises a bare IndexError that does
            # not say which input line is malformed.
            logging.critical('format error: %s', line)
            sys.exit(1)

        key = tokens[0]
        for value in tokens[1].split(','):
            zero_query_dict[key].append(
                util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_NUMBER_SUFFIX, value,
                                    util.EMOJI_TYPE_NONE, 0))
    return zero_query_dict
Exemplo n.º 4
0
def ReadEmoticonTsv(stream):
    """Builds zero query data for emoticons from an emoticon TSV stream.

    Comment lines are dropped and every remaining row is split on tabs.  Each
    row must have exactly three columns; otherwise the row is logged as a
    format error and the process exits with status 1.

    Args:
        stream: line stream of the emoticon TSV data.

    Returns:
        A defaultdict(list) mapping each reading from column 2 (split on
        half-width and full-width spaces) to a list of util.ZeroQueryEntry
        objects carrying the emoticon from column 0.
    """
    result = defaultdict(list)
    rows = code_generator_util.ParseColumnStream(
        code_generator_util.SkipLineComment(stream), delimiter='\t')
    for row in rows:
        if len(row) != 3:
            logging.critical('format error: %s', '\t'.join(row))
            sys.exit(1)

        emoticon = row[0]

        # \xe3\x80\x80 is a full-width space
        pieces = re.split(r'(?: |\xe3\x80\x80)+', row[2].strip())
        # filter(None, ...) drops the empty strings the split can produce.
        for reading in filter(None, pieces):
            result[reading].append(
                util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_EMOTICON, emoticon,
                                    util.EMOJI_TYPE_NONE, 0))

    return result