Пример #1
0
def main():
    """Command-line entry point for script conversion and pinyin lookup.

    Accepts one of --tosimplified, --totraditional or --topinyin (checked in
    that order; only the first one supplied is handled) and prints the
    converted text.

    The dictionary is read from ``$CNREADER_HOME/data/words.txt`` when the
    CNREADER_HOME environment variable is set, otherwise from the
    chinesenotes.com repository on GitHub.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--tosimplified',
        dest='tosimplified',
        help='Convert the given traditional text to simplified')
    parser.add_argument(
        '--totraditional',
        dest='totraditional',
        help='Convert the given simplified text to traditional')
    parser.add_argument('--topinyin',
                        dest='topinyin',
                        help='Convert the given text to pinyin')
    # Parse before loading the dictionary so that --help and usage errors
    # return immediately instead of waiting for the dictionary to load.
    args = parser.parse_args()

    if "CNREADER_HOME" in os.environ:
        cn_home = os.environ['CNREADER_HOME']
        fname = f'{cn_home}/data/words.txt'
    else:
        cn_home = 'https://github.com/alexamies/chinesenotes.com'
        # This has to be an f-string; a plain literal would pass the text
        # "{cn_home}/..." to open_dictionary verbatim.
        fname = f'{cn_home}/blob/master/data/words.txt?raw=true'
    wdict = cndict.open_dictionary(fname)

    if args.tosimplified:
        # to_simplified returns a 3-tuple; the first item is the simplified
        # text and the third is the pinyin.
        simplified, _, _ = charutil.to_simplified(wdict, args.tosimplified)
        print(f'Simplified: {simplified}')
    elif args.totraditional:
        trad = charutil.to_traditional(wdict, args.totraditional)
        print(f'Traditional: {trad}')
    elif args.topinyin:
        _, _, pinyin = charutil.to_simplified(wdict, args.topinyin)
        print(f'Pinyin: {pinyin}')
Пример #2
0
def to_cc_cedict(infile: str, outfile: str) -> None:
    """Converts the Chinese Notes dictionary from native format to CC-CEDICT

    Since there are utilities available that use the CC-CEDICT format, it can
    be useful to have the dictionary in that format.

    Each line written has the form
    ``traditional simplified [pinyin] /english/``. Entries without an English
    equivalent are skipped, and each simplified or traditional form is
    written at most once.

    Params:
      infile: Name of the Chinese Notes dictionary file to read
      outfile: Name of the CC-CEDICT style file to write (overwritten)
    """
    wdict = cndict.open_dictionary(infile)
    # The output is Chinese text, so fix the encoding rather than relying on
    # the platform default, which is not UTF-8 everywhere.
    with open(outfile, 'w', encoding='utf-8') as outf:
        done = set()
        for key in wdict:
            entry = wdict[key]
            simplified = entry.simplified
            traditional = entry.traditional
            # '\N' marks a missing value: the traditional form is then the
            # same as the simplified one.
            if traditional == '\\N':
                traditional = simplified
            if simplified in done or traditional in done:
                continue
            english = entry.english
            # Check before converting the pinyin so no work is spent on
            # entries that will not be written.
            if english == '\\N':
                continue
            pinyin = _convert_pinyin_numeric(simplified, wdict)
            outf.write(f'{traditional} {simplified} [{pinyin}] /{english}/\n')
            done.add(simplified)
            done.add(traditional)
Пример #3
0
def compare_cc_cedict_cnotes(in_fname: str, out_fname: str):
    """Compares the cc_cedict and chinesenotes, reporting the differences

    Every CC-CEDICT entry whose traditional form is missing from the Chinese
    Notes dictionary is counted in the summary. A small sample of those
    entries is also written to out_fname as tab separated rows, and the
    summary is printed to the console.

    Params:
      in_fname: Full path name of the cc-cedict file
      out_fname: Full path name of an output file
    """
    summary = ComparisonSummary('CC-CEDICT', 'Chinese Notes')
    cedict = open_cc_cedict(in_fname)
    cnotes_dict = cndict.open_dictionary()
    max_samples = 10  # upper bound on the number of rows written
    empty = '\\N\t\\N'  # two columns that are always written as missing
    sample = 0
    # First id given to a written row, incremented per row.
    # NOTE(review): presumably the next free id in words.txt -- confirm.
    luid = 6005842
    # The rows contain Chinese text, so do not rely on the platform default
    # encoding.
    with open(out_fname, 'w', encoding='utf-8') as out_file:
        for trad, entry in cedict.items():
            if trad in cnotes_dict:
                continue
            # Counted for every absent entry, including those not sampled.
            entry_analysis = summary.increment_absent_dict2(entry)
            # Filters that are currently switched off:
            #   entry_analysis.contains_notes
            #   len(entry.senses) > 1
            #   entry_analysis.is_modern_named_entity
            #   entry_analysis.ignore
            #   entry_analysis.contains_punctuation
            if (sample >= max_samples
                    or entry_analysis.contains_alphanum
                    or len(trad) <= 1
                    or entry_analysis.refers_to_variant):
                continue
            # Traditional is written as missing when it is identical to the
            # simplified form.
            traditional = '\\N' if entry.simplified == trad else trad
            grammar = entry_analysis.grammar
            entity_kind = entry_analysis.entity_kind
            domain = entry_analysis.domain
            subdomain = entry_analysis.subdomain
            formatter = EntryFormatter(cnotes_dict, entry)
            english = formatter.reformat_english()
            pinyin = formatter.reformat_pinyin()
            notes = formatter.format_notes(trad)
            out_file.write(
                f'{luid}\t{entry.simplified}\t{traditional}\t'
                f'{pinyin}\t{english}\t{grammar}\t{entity_kind}\t{domain}\t'
                f'{subdomain}\t{empty}\t{notes}\t{luid}\n')
            sample += 1
            luid += 1
    summary.print_summary()
Пример #4
0
def main():
    """Command line entry point

    Looks up the terms most similar to the word given with --word and logs
    them at INFO level. Without --word a hint is printed and nothing else is
    done.

    The dictionary is read from ``$CNREADER_HOME/data/words.txt`` when the
    CNREADER_HOME environment variable is set, otherwise from the
    chinesenotes.com repository on GitHub.
    """
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('--word',
                        dest='word',
                        help='Target to search for similar terms')
    args = parser.parse_args()
    if not args.word:
        print('Please supply target word with --word')
        return
    if 'CNREADER_HOME' in os.environ:
        cn_home = os.environ['CNREADER_HOME']
        fname = f'{cn_home}/data/words.txt'
    else:
        cn_home = 'https://github.com/alexamies/chinesenotes.com'
        fname = f'{cn_home}/blob/master/data/words.txt?raw=true'
    # Pass the resolved location on; otherwise CNREADER_HOME has no effect.
    cnotes_dict = cndict.open_dictionary(fname)
    most_similar = find_similar(args.word, cnotes_dict)
    logging.info('Words most similar to %s: %s', args.word, most_similar)
Пример #5
0
 def __init__(self):
     """Set up the DoFn, load the dictionary and create the term counter.

     The dictionary is always read from the chinesenotes.com repository on
     GitHub and kept in self.wdict. The counter named 'terms' is kept in
     self.term_counter.
     """
     beam.DoFn.__init__(self)
     repo_url = "https://github.com/alexamies/chinesenotes.com"
     words_url = f"{repo_url}/blob/master/data/words.txt?raw=true"
     # NOTE(review): the meaning of the second positional argument (True) is
     # not visible here -- check cndict.open_dictionary.
     self.wdict = cndict.open_dictionary(words_url, True)
     self.term_counter = Metrics.counter(self.__class__, 'terms')