def main():
    """Command-line entry point for converting Chinese text.

    Exactly one conversion is performed, chosen in this order of precedence:
    --tosimplified, --totraditional, --topinyin. The result is printed to
    standard output; nothing is printed when no option is supplied.

    The dictionary file name handed to cndict.open_dictionary is
    $CNREADER_HOME/data/words.txt when the CNREADER_HOME environment
    variable is set, otherwise the words.txt location in the
    chinesenotes.com GitHub repository.
    """
    # Parse the arguments first so that --help and malformed command lines
    # exit before the dictionary is loaded.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--tosimplified',
        dest='tosimplified',
        help='Convert the given traditional text to simplified')
    parser.add_argument(
        '--totraditional',
        dest='totraditional',
        help='Convert the given simplified text to traditional')
    parser.add_argument(
        '--topinyin',
        dest='topinyin',
        help='Convert the given text to topinyin')
    args = parser.parse_args()

    cn_home = 'https://github.com/alexamies/chinesenotes.com'
    # This has to be an f-string: a plain literal would pass the text
    # "{cn_home}/blob/..." through without substituting the repository URL.
    fname = f'{cn_home}/blob/master/data/words.txt?raw=true'
    if 'CNREADER_HOME' in os.environ:
        cn_home = os.environ['CNREADER_HOME']
        fname = f'{cn_home}/data/words.txt'
    wdict = cndict.open_dictionary(fname)

    if args.tosimplified:
        # to_simplified returns a 3-tuple; the first element is the
        # simplified text.
        simplified, _, _ = charutil.to_simplified(wdict, args.tosimplified)
        print(f'Simplified: {simplified}')
    elif args.totraditional:
        trad = charutil.to_traditional(wdict, args.totraditional)
        print(f'Traditional: {trad}')
    elif args.topinyin:
        # The pinyin is the third element of the to_simplified result.
        _, _, pinyin = charutil.to_simplified(wdict, args.topinyin)
        print(f'Pinyin: {pinyin}')
def to_cc_cedict(infile: str, outfile: str):
    """Converts the Chinese Notes dictionary from native format to CC-CEDICT

    Since there are utilities available that use the CC-CEDICT format, it can
    be useful to have the dictionary in that format.

    One line is written per entry, in the form
    "traditional simplified [pinyin] /english/". An entry is skipped when its
    simplified or traditional form has already been written, or when it has
    no English equivalent. When an entry has no separate traditional form the
    simplified form is used for both columns.

    Params:
      infile: Name of the Chinese Notes dictionary file, passed to
              cndict.open_dictionary
      outfile: Name of the CC-CEDICT file to write, encoded as UTF-8
    """
    wdict = cndict.open_dictionary(infile)
    # The output is Chinese text, so do not rely on the platform's default
    # encoding (which is not UTF-8 everywhere).
    with open(outfile, 'w', encoding='utf-8') as outf:
        done = set()  # Simplified and traditional forms already written
        for key in wdict:
            entry = wdict[key]
            simplified = entry.simplified
            traditional = entry.traditional
            if traditional == '\\N':  # No separate traditional form
                traditional = simplified
            if simplified in done or traditional in done:
                continue
            english = entry.english
            if english == '\\N':  # Nothing to put in the definition field
                continue
            # The pinyin written out is the numeric form produced by the
            # helper rather than the pinyin stored on the entry.
            pinyin = _convert_pinyin_numeric(simplified, wdict)
            outf.write(f'{traditional} {simplified} [{pinyin}] /{english}/\n')
            done.add(simplified)
            done.add(traditional)
def compare_cc_cedict_cnotes(in_fname: str, out_fname: str, *,
                             max_samples: int = 10,
                             start_luid: int = 6005842):
    """Compares the cc_cedict and chinesenotes, reporting the differences

    Every CC-CEDICT entry whose traditional form is not a key of the Chinese
    Notes dictionary is recorded in the summary. A limited sample of those
    entries is also written to out_fname as tab separated rows, and the
    summary is printed to the console at the end.

    An entry is only sampled when its traditional form is longer than one
    character and its analysis reports neither alphanumeric content nor a
    reference to a variant.

    Params:
      in_fname: Full path name of the cc-cedict file
      out_fname: Full path name of an output file (written as UTF-8)
      max_samples: Maximum number of sample rows to write
      start_luid: Identifier given to the first sample row; each further row
                  gets the next integer
    """
    summary = ComparisonSummary('CC-CEDICT', 'Chinese Notes')
    # NOTE(review): the analyzer instance is not used below; the call is kept
    # only in case construction has side effects - confirm and remove.
    EntryAnalyzer()
    cedict = open_cc_cedict(in_fname)
    cnotes_dict = cndict.open_dictionary()
    sample = 0
    luid = start_luid
    # The rows contain Chinese text, so do not rely on the platform's default
    # encoding.
    with open(out_fname, 'w', encoding='utf-8') as out_file:
        for trad, entry in cedict.items():
            if trad in cnotes_dict:
                continue
            entry_analysis = summary.increment_absent_dict2(entry)
            if (sample < max_samples
                    and not entry_analysis.contains_alphanum
                    and len(trad) > 1
                    #and not entry_analysis.contains_notes
                    #and not len(entry.senses) > 1
                    and not entry_analysis.refers_to_variant
                    #and not entry_analysis.is_modern_named_entity
                    #and not entry_analysis.ignore
                    #and not entry_analysis.contains_punctuation
                    ):
                grammar = entry_analysis.grammar
                # The traditional column holds the null marker when it would
                # only repeat the simplified form.
                traditional = trad
                if entry.simplified == trad:
                    traditional = '\\N'
                empty = '\\N\t\\N'  # Two columns left unset
                entity_kind = entry_analysis.entity_kind
                domain = entry_analysis.domain
                subdomain = entry_analysis.subdomain
                formatter = EntryFormatter(cnotes_dict, entry)
                english = formatter.reformat_english()
                pinyin = formatter.reformat_pinyin()
                notes = formatter.format_notes(trad)
                # The luid is written as both the first and the last column.
                out_file.write(
                    f'{luid}\t{entry.simplified}\t{traditional}\t'
                    f'{pinyin}\t{english}\t{grammar}\t{entity_kind}\t{domain}\t'
                    f'{subdomain}\t{empty}\t{notes}\t{luid}\n')
                sample += 1
                luid += 1
    summary.print_summary()
def main():
    """Command line entry point

    Looks up the terms most similar to the word given with --word and logs
    them at INFO level. When --word is missing or empty a hint is printed
    and the function returns without loading the dictionary.
    """
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('--word',
                        dest='word',
                        help='Target to search for similar terms')
    args = parser.parse_args()
    if not args.word:
        print('Please supply target word with --word')
        return
    # NOTE(review): open_dictionary() is called with its defaults, so this
    # function does not choose the dictionary location itself. Pass a file
    # name here if a specific copy of words.txt should be used.
    cnotes_dict = cndict.open_dictionary()
    most_similar = find_similar(args.word, cnotes_dict)
    logging.info('Words most similar to %s: %s', args.word, most_similar)
def __init__(self):
    """Initializes the DoFn base, the dictionary and the term counter

    The dictionary is opened from the words.txt file in the chinesenotes.com
    GitHub repository and kept on self.wdict. A Metrics counter named 'terms'
    is kept on self.term_counter.
    """
    beam.DoFn.__init__(self)
    repo_url = 'https://github.com/alexamies/chinesenotes.com'
    words_url = f'{repo_url}/blob/master/data/words.txt?raw=true'
    # NOTE(review): the meaning of the second positional argument (True) is
    # defined by cndict.open_dictionary - confirm against its signature.
    self.wdict = cndict.open_dictionary(words_url, True)
    self.term_counter = Metrics.counter(self.__class__, 'terms')