def Parse(self, id_file, special_pos_file):
  id_list = []
  with open(id_file, 'r') as stream:
    stream = code_generator_util.SkipLineComment(stream)
    stream = code_generator_util.ParseColumnStream(stream, num_column=2)
    for pos_id, feature in stream:
      id_list.append((feature, int(pos_id)))
  max_id = max(pos_id for _, pos_id in id_list)
  with open(special_pos_file, 'r') as stream:
    stream = code_generator_util.SkipLineComment(stream)
    for pos_id, line in enumerate(stream, start=max_id + 1):
      id_list.append((line, pos_id))
  self.id_list = id_list
def WriteData(input_path, output_path):
  outputs = []
  with open(input_path) as input_stream:
    input_stream = code_generator_util.SkipLineComment(input_stream)
    input_stream = code_generator_util.ParseColumnStream(input_stream,
                                                         num_column=3)
    # ex. (value, error, correction) = ("雰囲気", "ふいんき", "ふんいき")
    for value, error, correction in input_stream:
      outputs.append([value, error, correction])

  # In order to look up the entries via |error| with binary search,
  # sort outputs here.
  outputs.sort(key=lambda x: (x[1], x[0]))

  with open(output_path, 'w') as output_stream:
    output_stream.write('static const ReadingCorrectionItem '
                        'kReadingCorrections[] = {\n')
    for (value, error, correction) in outputs:
      output_stream.write('  // %s, %s, %s\n' % (value, error, correction))
      output_stream.write(
          code_generator_util.FormatWithCppEscape(
              '  { %s, %s, %s },\n', value, error, correction))
    output_stream.write('};\n')
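# A minimal driver sketch for WriteData above, hedged: the file names are
# hypothetical; the real generator receives its input/output paths from the
# command line.
if __name__ == '__main__':
  WriteData('reading_correction.tsv', 'reading_correction_data.h')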
def ReadData(stream):
  category_map = defaultdict(list)
  stream = code_generator_util.SkipLineComment(stream)
  stream = code_generator_util.ParseColumnStream(stream, delimiter=b'\t')
  stream = code_generator_util.SelectColumn(stream, [0, 2, 8, 9, 10, 11, 12])
  for (code, pua_code, japanese_name, docomo_name, softbank_name, kddi_name,
       category_index) in stream:
    if bool(code) != bool(japanese_name):
      if code:
        logging.fatal('No Japanese name for %s found.' % code)
      else:
        logging.fatal('No Unicode code point for %s found.' % japanese_name)
      sys.exit(-1)
    if not code:
      # Use dummy code point.
      code = b'0'
    if not pua_code:
      # Use dummy code point.
      pua_code = b'0'
    if pua_code[0:1] == b'>':
      # Don't skip entries that have a non-primary PUA code point, since they
      # also have a unique Unicode code point.
      # e.g. "BLACK SQUARE BUTTON" and "LARGE BLUE CIRCLE"
      pua_code = pua_code[1:]
    code_values = [int(c, 16) for c in re.split(br' +', code.strip())]
    pua_code_value = int(pua_code, 16)
    (category, index) = category_index.split(b'-')
    index = int(index) + _CATEGORY_MAP[category]['offset']
    category = _CATEGORY_MAP[category]['category']
    category_map[category].append(
        (index, code_values, pua_code_value, japanese_name, docomo_name,
         softbank_name, kddi_name))
  return category_map
def GeneratePosMap(third_party_pos_map_file, user_pos_file):
  user_pos_map = ParseUserPos(user_pos_file)

  result = {}
  with open(third_party_pos_map_file, 'r') as stream:
    stream = code_generator_util.SkipLineComment(stream)
    for columns in code_generator_util.ParseColumnStream(stream, num_column=2):
      third_party_pos_name, mozc_pos = (columns + [None])[:2]
      if mozc_pos is not None:
        mozc_pos = user_pos_map[mozc_pos]
      if third_party_pos_name in result:
        assert (result[third_party_pos_name] == mozc_pos)
        continue
      result[third_party_pos_name] = mozc_pos

  # Create mozc_pos to mozc_pos map.
  for key, value in user_pos_map.iteritems():
    if key in result:
      assert (result[key] == value)
      continue
    result[key] = value

  return result
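# Hedged illustration of the optional-column idiom in GeneratePosMap above:
# padding the parsed row with [None] lets a one-column line (a third-party
# POS name with no Mozc mapping) unpack cleanly. The rows are hypothetical.
for columns in [['noun', 'Noun'], ['filler']]:
  third_party_pos_name, mozc_pos = (columns + [None])[:2]
  print third_party_pos_name, mozc_pos  # 'noun Noun', then 'filler None'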
def ParseConnectionFile(text_connection_file, pos_size, special_pos_size):
  # The result is a square matrix.
  mat_size = pos_size + special_pos_size

  matrix = [[0] * mat_size for _ in xrange(mat_size)]
  with open(text_connection_file) as stream:
    stream = code_generator_util.SkipLineComment(stream)
    # The first line contains the matrix column/row size.
    size = stream.next().rstrip()
    assert (int(size) == pos_size), '%s != %d' % (size, pos_size)

    for array_index, cost in enumerate(stream):
      cost = int(cost.rstrip())
      rid = array_index / pos_size
      lid = array_index % pos_size
      if rid == 0 and lid == 0:
        cost = 0
      matrix[rid][lid] = cost

  # Fill INVALID_COST in matrix elements for special POS.
  for rid in xrange(pos_size, mat_size):
    for lid in xrange(1, mat_size):  # Skip EOS.
      matrix[rid][lid] = INVALID_COST

  for lid in xrange(pos_size, mat_size):
    for rid in xrange(1, mat_size):  # Skip BOS.
      matrix[rid][lid] = INVALID_COST

  return matrix
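# Sketch of the row-major index decomposition that ParseConnectionFile relies
# on: the text file stores the pos_size x pos_size cost matrix one cost per
# line, so line k holds the cost for rid = k / pos_size, lid = k % pos_size.
# The costs below are hypothetical, not real connection data.
pos_size = 3
flat_costs = [0, 10, 20, 30, 40, 50, 60, 70, 80]
matrix = [[flat_costs[rid * pos_size + lid] for lid in xrange(pos_size)]
          for rid in xrange(pos_size)]
assert matrix[2][1] == flat_costs[7]  # array_index 7 -> rid 2, lid 1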
def Parse(self, pos_matcher_rule_file):
  with open(pos_matcher_rule_file, 'r') as stream:
    stream = code_generator_util.SkipLineComment(stream)
    stream = code_generator_util.ParseColumnStream(stream, num_column=2)
    self._match_rule_map = dict(
        (name, (pattern, re.compile(pattern.replace('*', '[^,]+')), sortkey))
        for sortkey, (name, pattern) in enumerate(stream))
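# Hedged illustration of the wildcard-to-regex rewrite in Parse above: each
# '*' in a matcher pattern becomes '[^,]+', i.e. exactly one non-empty
# comma-separated field. The pattern and feature are hypothetical examples.
import re

pattern = '動詞,自立,*,*,五段動詞,*'
matcher = re.compile(pattern.replace('*', '[^,]+'))
assert matcher.match('動詞,自立,*,*,五段動詞,基本形')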
def Parse(self, filepath):
  result = []
  with open(filepath, 'r') as stream:
    stream = code_generator_util.SkipLineComment(stream)
    stream = code_generator_util.ParseColumnStream(stream, num_column=4)
    for user_pos, _, ctype, feature in stream:
      conjugation_list = []
      if ctype == '*':
        conjugation_list.append(
            (None, None, self._pos_database.GetPosId(feature)))
      else:
        for form, value_suffix, key_suffix in self._inflection_map.Get(ctype):
          # Replace <cform> with the actual cform.
          pos_id = self._pos_database.GetPosId(
              feature.replace('<cform>', form))
          # Known error items:
          # 動詞,自立,*,*,五段動詞,体言接続特殊2,*
          # 形容詞,自立,*,*,形容詞・アウオ段,文語基本形,*
          if pos_id is not None:
            conjugation_list.append((value_suffix, key_suffix, pos_id))
      result.append((user_pos, conjugation_list))
  self.data = result
def ParseUserPos(path, pos_database, inflection_map):
  """Parses user POS data using the given pos_database and inflection_map.

  Returns:
    Parsed user POS data: a list of (user_pos, conjugation_list) pairs, where
    conjugation_list is a list of (value_suffix, key_suffix, pos_id) tuples.
  """
  result = []
  with open(path, 'r') as stream:
    stream = code_generator_util.SkipLineComment(stream)
    stream = code_generator_util.ParseColumnStream(stream, num_column=3)
    for user_pos, ctype, feature in stream:
      conjugation_list = []
      if ctype == '*':
        conjugation_list.append((None, None, pos_database.GetPosId(feature)))
      else:
        for form, value_suffix, key_suffix in inflection_map[ctype]:
          # Replace <cform> with the actual cform.
          pos_id = pos_database.GetPosId(feature.replace('<cform>', form))
          # Known error items:
          # 動詞,自立,*,*,五段動詞,体言接続特殊2,*
          # 形容詞,自立,*,*,形容詞・アウオ段,文語基本形,*
          # TODO(hidehiko): Check whether this is expected.
          if pos_id is not None:
            conjugation_list.append((value_suffix, key_suffix, pos_id))
      result.append((user_pos, conjugation_list))
  return result
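# Hedged sketch of the <cform> expansion in ParseUserPos above: the feature
# template from the user POS file is instantiated once per inflection form
# before the POS id lookup. The feature and forms are hypothetical.
feature = '動詞,自立,*,*,五段動詞,<cform>,*'
for form in ['基本形', '未然形', '連用形']:
  print feature.replace('<cform>', form)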
def GetPosSize(filepath):
  # The pos-size should be equal to the number of lines.
  # TODO(hidehiko): Merge this method with pos_util in dictionary.
  with open(filepath, 'r') as stream:
    stream = code_generator_util.SkipLineComment(stream)
    # Count the number of lines.
    return sum(1 for _ in stream)
def main():
  options = _CreateOptionParser().parse_args()[0]
  stream = code_generator_util.SkipLineComment(sys.stdin)
  stream = code_generator_util.ParseColumnStream(stream, delimiter='\t')
  for entry in stream:
    assert len(entry) == len(_FIELDS), 'Invalid TSV row.'
    if _ShouldPrint(entry, options):
      _Print(entry, options, sys.stdout)
def ReadSingleKanji(stream):
  """Parses single kanji dictionary data from stream."""
  stream = code_generator_util.SkipLineComment(stream)
  stream = code_generator_util.ParseColumnStream(stream, num_column=2)
  outputs = list(stream)
  # For binary search by |key|, sort outputs here.
  outputs.sort(key=lambda x: x[0])
  return outputs
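# Why ReadSingleKanji sorts by the first column: the generated array is
# searched by |key| with a bisect-style lookup at runtime. A hedged Python
# analogue with hypothetical entries:
import bisect

outputs = sorted([(u'かわ', u'川'), (u'やま', u'山'), (u'かわ', u'河')])
keys = [key for (key, _) in outputs]
left = bisect.bisect_left(keys, u'かわ')
right = bisect.bisect_right(keys, u'かわ')
print [value for (_, value) in outputs[left:right]]  # [u'川', u'河']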
def ReadEmojiTsv(stream):
  """Parses emoji_data.tsv file and builds the emoji_data_list and reading map.
  """
  emoji_data_list = []
  token_dict = collections.defaultdict(list)

  stream = code_generator_util.SkipLineComment(stream)
  for columns in code_generator_util.ParseColumnStream(stream, delimiter='\t'):
    if len(columns) != 13:
      logging.critical('format error: %s', '\t'.join(columns))
      sys.exit(1)

    # Emoji code point.
    emoji = columns[1] if columns[1] else ''
    android_pua = ParseCodePoint(columns[2])
    docomo_pua = ParseCodePoint(columns[3])
    softbank_pua = ParseCodePoint(columns[4])
    kddi_pua = ParseCodePoint(columns[5])

    readings = columns[6]
    # [7]: Name defined in Unicode. It is ignored in the current
    # implementation.
    utf8_description = columns[8] if columns[8] else ''
    docomo_description = columns[9] if columns[9] else ''
    softbank_description = columns[10] if columns[10] else ''
    kddi_description = columns[11] if columns[11] else ''

    # Check consistency between carrier PUA codes and descriptions for
    # Android, just in case.
    if ((bool(docomo_pua) != bool(docomo_description)) or
        (bool(softbank_pua) != bool(softbank_description)) or
        (bool(kddi_pua) != bool(kddi_description))):
      logging.warning('carrier PUA and description conflict: %s',
                      '\t'.join(columns))
      continue

    # Check if the character is usable on Android.
    if not android_pua or not (docomo_pua or softbank_pua or kddi_pua):
      android_pua = 0  # Replace None with 0.

    if not emoji and not android_pua:
      logging.info('Skip: %s', '\t'.join(columns))
      continue

    index = len(emoji_data_list)
    emoji_data_list.append(
        (emoji, android_pua, utf8_description, docomo_description,
         softbank_description, kddi_description))

    # \xe3\x80\x80 is a full-width space.
    for reading in re.split(r'(?: |\xe3\x80\x80)+', readings.strip()):
      if reading:
        token_dict[NormalizeString(reading)].append(index)

  return (emoji_data_list, token_dict)
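# Hedged sketch of the reading tokenization above: readings are split on
# ASCII spaces or full-width spaces (U+3000, UTF-8 \xe3\x80\x80 -- this
# Python 2 script handles UTF-8 byte strings, hence the byte-level pattern).
# The readings are hypothetical.
import re

readings = 'はれ\xe3\x80\x80てんき sun'
print re.split(r'(?: |\xe3\x80\x80)+', readings.strip())
# -> ['はれ', 'てんき', 'sun']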
def ReadEmojiTsv(stream):
  """Reads emoji data from stream and returns zero query data."""
  zero_query_dict = defaultdict(list)

  stream = code_generator_util.SkipLineComment(stream)
  for columns in code_generator_util.ParseColumnStream(stream, delimiter='\t'):
    if len(columns) != 13:
      logging.critical('format error: %s', '\t'.join(columns))
      sys.exit(1)

    # Emoji code point.
    emoji = columns[1]
    android_pua = ParseCodePoint(columns[2])
    readings = columns[6]
    japanese_name = columns[8]
    docomo_description = columns[9]
    softbank_description = columns[10]
    kddi_description = columns[11]

    reading_list = []
    # \xe3\x80\x80 is a full-width space.
    for reading in re.split(r'(?: |\xe3\x80\x80)+', NormalizeString(readings)):
      if not reading:
        continue
      reading_list.append(reading)

    reading_list.extend(GetReadingsFromDescription(japanese_name))
    reading_list.extend(GetReadingsFromDescription(docomo_description))
    reading_list.extend(GetReadingsFromDescription(softbank_description))
    reading_list.extend(GetReadingsFromDescription(kddi_description))

    emoji_type = util.EMOJI_TYPE_NONE
    if emoji:
      emoji_type |= util.EMOJI_TYPE_UNICODE
    if docomo_description:
      emoji_type |= util.EMOJI_TYPE_DOCOMO
    if softbank_description:
      emoji_type |= util.EMOJI_TYPE_SOFTBANK
    if kddi_description:
      emoji_type |= util.EMOJI_TYPE_KDDI

    for description in set(reading_list):
      if not description:
        continue
      zero_query_dict[description].append(
          util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_EMOJI, emoji, emoji_type,
                              android_pua))

  # Sort emoji for each reading.
  for key in zero_query_dict.keys():
    zero_query_dict[key].sort(key=lambda e: (e.value, e.emoji_android_pua))

  return zero_query_dict
def Parse(self, filepath):
  result = defaultdict(list)
  with open(filepath, 'r') as stream:
    stream = code_generator_util.SkipLineComment(stream)
    stream = code_generator_util.ParseColumnStream(stream, num_column=4)
    for key, form, value_suffix, key_suffix in stream:
      result[key].append(
          (form,
           value_suffix if value_suffix != '*' else '',
           key_suffix if key_suffix != '*' else ''))
  self._map = result
def ReadData(stream):
  category_map = defaultdict(list)
  stream = code_generator_util.SkipLineComment(stream)
  stream = code_generator_util.ParseColumnStream(stream, delimiter='\t')
  stream = code_generator_util.SelectColumn(stream, [2, 9, 10, 11, 12])
  for (code, docomo_name, softbank_name, kddi_name, category_index) in stream:
    if not code or code[0] == '>':
      continue
    (category, index) = category_index.split('-')
    category_map[category].append(
        (index, int(code, 16), docomo_name, softbank_name, kddi_name))
  return category_map
def ReadSymbolTsv(stream):
  """Reads symbol data from stream and returns zero query data."""
  zero_query_dict = defaultdict(list)

  stream = code_generator_util.SkipLineComment(stream)
  for columns in code_generator_util.ParseColumnStream(stream, delimiter='\t'):
    if len(columns) < 3:
      logging.warning('format error: %s', '\t'.join(columns))
      continue

    symbol = columns[1]
    readings = columns[2]

    symbol_unicode = symbol.decode('utf-8')
    if len(symbol_unicode) != 1:
      continue
    symbol_code_point = ord(symbol_unicode)
    # Select emoji symbols from the symbol dictionary,
    # from "☀" (black sun with rays) to "❧" (rotated floral heart).
    # TODO(toshiyuki): Update the range if needed.
    if not 0x2600 <= symbol_code_point <= 0x2767:
      continue

    # \xe3\x80\x80 is a full-width space.
    for reading in re.split(r'(?: |\xe3\x80\x80)+', readings.strip()):
      if not reading:
        continue
      zero_query_dict[reading].append(
          util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_NONE, symbol,
                              util.EMOJI_TYPE_NONE, 0))

    if len(columns) >= 4 and columns[3]:
      # description: "天気", etc.
      description = columns[3]
      zero_query_dict[description].append(
          util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_NONE, symbol,
                              util.EMOJI_TYPE_NONE, 0))

    if len(columns) >= 5 and columns[4]:
      # additional_description: "傘", etc.
      additional_description = columns[4]
      zero_query_dict[additional_description].append(
          util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_NONE, symbol,
                              util.EMOJI_TYPE_NONE, 0))

  return zero_query_dict
def ParseInflectionMap(path):
  """Reads and parses inflection data.

  Returns:
    The parsed inflection map: a map from key to a list of
    (form, value_suffix, key_suffix) tuples.
  """
  result = defaultdict(list)
  with open(path, 'r') as stream:
    stream = code_generator_util.SkipLineComment(stream)
    stream = code_generator_util.ParseColumnStream(stream, num_column=4)
    for key, form, value_suffix, key_suffix in stream:
      result[key].append(
          (form,
           value_suffix if value_suffix != '*' else '',
           key_suffix if key_suffix != '*' else ''))
  return result
def ReadVariant(stream):
  """Parses variant data from stream."""
  variant_types = []
  variant_items = []

  stream = code_generator_util.SkipLineComment(stream)
  stream = code_generator_util.ParseColumnStream(stream)
  for tokens in stream:
    if len(tokens) == 1:
      variant_types.append(tokens[0])
    elif len(tokens) == 2 and variant_types:
      (target, original) = tokens
      variant_items.append([target, original, len(variant_types) - 1])

  # For binary search by |target|, sort variant items here.
  variant_items.sort(key=lambda x: x[0])

  return (variant_types, variant_items)
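# Hedged illustration of ReadVariant's grouping contract: a one-token line
# starts a new variant type, and each subsequent (target, original) pair is
# tagged with the index of the most recent type. The tokens are hypothetical.
rows = [['KATAKANA'], ['ヴ', 'ブ'], ['KANJI'], ['嶋', '島']]
types, items = [], []
for tokens in rows:
  if len(tokens) == 1:
    types.append(tokens[0])
  elif len(tokens) == 2 and types:
    items.append([tokens[0], tokens[1], len(types) - 1])
print types  # ['KATAKANA', 'KANJI']
print items  # [['ヴ', 'ブ', 0], ['嶋', '島', 1]]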
def ReadEmoticonTsv(stream):
  """Reads emoticon data from stream and returns zero query data."""
  zero_query_dict = collections.defaultdict(list)

  stream = code_generator_util.SkipLineComment(stream)
  for columns in code_generator_util.ParseColumnStream(stream, delimiter='\t'):
    if len(columns) != 3:
      logging.critical('format error: %s', '\t'.join(columns))
      sys.exit(1)

    emoticon = columns[0]
    readings = columns[2]

    for reading in re.split(RE_SPLIT, readings.strip()):
      if not reading:
        continue
      zero_query_dict[reading].append(
          util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_EMOTICON, emoticon,
                              util.EMOJI_TYPE_NONE, 0))

  return zero_query_dict
def ReadData(stream):
  category_map = defaultdict(list)
  stream = code_generator_util.SkipLineComment(stream)
  stream = code_generator_util.ParseColumnStream(stream, delimiter='\t')
  stream = code_generator_util.SelectColumn(stream, [0, 2, 8, 9, 10, 11, 12])
  for (code, pua_code, japanese_name, docomo_name, softbank_name, kddi_name,
       category_index) in stream:
    if not pua_code or pua_code[0] == '>':
      continue
    if not code:
      if japanese_name:
        logging.fatal('No Unicode emoji code point found.')
        sys.exit(-1)
      # Use dummy code point.
      code = '0'
    (category, index) = category_index.split('-')
    category_map[category].append(
        (index, int(code, 16), int(pua_code, 16),
         japanese_name, docomo_name, softbank_name, kddi_name))
  return category_map
def WriteData(input_path, output_value_array_path, output_error_array_path,
              output_correction_array_path):
  outputs = []
  with codecs.open(input_path, 'r', encoding='utf-8') as input_stream:
    input_stream = code_generator_util.SkipLineComment(input_stream)
    input_stream = code_generator_util.ParseColumnStream(input_stream,
                                                         num_column=3)
    # ex. (value, error, correction) = ("雰囲気", "ふいんき", "ふんいき")
    for value, error, correction in input_stream:
      outputs.append([value, error, correction])

  # In order to look up the entries via |error| with binary search,
  # sort outputs here.
  outputs.sort(key=lambda x: (x[1], x[0]))

  serialized_string_array_builder.SerializeToFile(
      [value for (value, _, _) in outputs], output_value_array_path)
  serialized_string_array_builder.SerializeToFile(
      [error for (_, error, _) in outputs], output_error_array_path)
  serialized_string_array_builder.SerializeToFile(
      [correction for (_, _, correction) in outputs],
      output_correction_array_path)
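# A minimal driver sketch for the serialized-array WriteData above, hedged:
# the four paths are hypothetical and normally come from command-line flags.
# The three emitted arrays stay index-aligned and sorted by |error|, so the
# runtime can binary-search the error array and read the others by index.
if __name__ == '__main__':
  WriteData('reading_correction.tsv',
            'reading_correction_value.data',
            'reading_correction_error.data',
            'reading_correction_correction.data')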
def ReadEmojiTsv(stream):
  """Parses emoji_data.tsv file and builds the emoji_data_list and reading map.
  """
  emoji_data_list = []
  token_dict = defaultdict(list)

  stream = code_generator_util.SkipLineComment(stream)
  for columns in code_generator_util.ParseColumnStream(stream, delimiter='\t'):
    if len(columns) != 13:
      logging.critical('format error: %s', '\t'.join(columns))
      sys.exit(1)

    code_points = columns[0].split(' ')
    # Emoji code point.
    emoji = columns[1] if columns[1] else ''
    android_pua = ParseCodePoint(columns[2])
    docomo_pua = ParseCodePoint(columns[3])
    softbank_pua = ParseCodePoint(columns[4])
    kddi_pua = ParseCodePoint(columns[5])

    readings = columns[6]
    # [7]: Name defined in Unicode. It is ignored in the current
    # implementation.
    utf8_description = columns[8] if columns[8] else ''
    docomo_description = columns[9] if columns[9] else ''
    softbank_description = columns[10] if columns[10] else ''
    kddi_description = columns[11] if columns[11] else ''

    if not android_pua or len(code_points) > 1:
      # Skip emoji that are not supported on old devices:
      # - Unicode 6.1 or later emoji, which don't have a PUA code point.
      # - Composite emoji, which have multiple code points.
      # NOTE: Some Unicode 6.0 emoji don't have a PUA code point and are also
      # omitted.
      # TODO(hsumita): Check the availability of such emoji and enable them.
      logging.info('Skip %s', ' '.join(code_points))
      continue

    # Check consistency between carrier PUA codes and descriptions for
    # Android, just in case.
    if ((bool(docomo_pua) != bool(docomo_description)) or
        (bool(softbank_pua) != bool(softbank_description)) or
        (bool(kddi_pua) != bool(kddi_description))):
      logging.warning('carrier PUA and description conflict: %s',
                      '\t'.join(columns))
      continue

    # Check if the character is usable on Android.
    if not android_pua or not (docomo_pua or softbank_pua or kddi_pua):
      android_pua = 0  # Replace None with 0.

    if not emoji and not android_pua:
      logging.info('Skip: %s', '\t'.join(columns))
      continue

    index = len(emoji_data_list)
    emoji_data_list.append((emoji, android_pua, utf8_description,
                            docomo_description, softbank_description,
                            kddi_description))

    # \xe3\x80\x80 is a full-width space.
    for reading in re.split(r'(?: |\xe3\x80\x80)+', readings.strip()):
      if reading:
        token_dict[NormalizeString(reading)].append(index)

  return (emoji_data_list, token_dict)
def ReadEmojiTsv(stream):
  """Reads emoji data from stream and returns zero query data."""
  zero_query_dict = collections.defaultdict(list)

  stream = code_generator_util.SkipLineComment(stream)
  for columns in code_generator_util.ParseColumnStream(stream, delimiter='\t'):
    if len(columns) != 13:
      logging.critical('format error: %s', '\t'.join(columns))
      sys.exit(1)

    code_points = columns[0].split(' ')
    # Emoji code point.
    emoji = columns[1]
    android_pua = ParseCodePoint(columns[2])
    readings = columns[6]
    japanese_name = columns[8]
    docomo_description = columns[9]
    softbank_description = columns[10]
    kddi_description = columns[11]

    if not android_pua or len(code_points) > 1:
      # Skip emoji that are not supported on old devices:
      # - Unicode 6.1 or later emoji, which don't have a PUA code point.
      # - Composite emoji, which have multiple code points.
      # NOTE: Some Unicode 6.0 emoji don't have a PUA code point and are also
      # omitted.
      # TODO(hsumita): Check the availability of such emoji and enable them.
      logging.info('Skip %s', ' '.join(code_points))
      continue

    reading_list = []
    for reading in re.split(RE_SPLIT, NormalizeString(readings)):
      if not reading:
        continue
      reading_list.append(reading)

    reading_list.extend(GetReadingsFromDescription(japanese_name))
    reading_list.extend(GetReadingsFromDescription(docomo_description))
    reading_list.extend(GetReadingsFromDescription(softbank_description))
    reading_list.extend(GetReadingsFromDescription(kddi_description))

    emoji_type = util.EMOJI_TYPE_NONE
    if emoji:
      emoji_type |= util.EMOJI_TYPE_UNICODE
    if docomo_description:
      emoji_type |= util.EMOJI_TYPE_DOCOMO
    if softbank_description:
      emoji_type |= util.EMOJI_TYPE_SOFTBANK
    if kddi_description:
      emoji_type |= util.EMOJI_TYPE_KDDI

    for description in set(reading_list):
      if not description:
        continue
      zero_query_dict[description].append(
          util.ZeroQueryEntry(util.ZERO_QUERY_TYPE_EMOJI, emoji, emoji_type,
                              android_pua))

  # Sort emoji for each reading.
  for key in zero_query_dict.keys():
    zero_query_dict[key].sort(key=lambda e: (e.value, e.emoji_android_pua))

  return zero_query_dict
def ParseUserPos(user_pos_file):
  with open(user_pos_file, 'r') as stream:
    stream = code_generator_util.SkipLineComment(stream)
    stream = code_generator_util.ParseColumnStream(stream, num_column=2)
    return dict((key, enum_value) for key, enum_value in stream)