from wordfreq import zipf_frequency

from database import database, objects

import logging
import sqlite3
import sys

# Metadata records for the two dictionary sources, one SourceTuple each.
# NOTE(review): the positional fields are presumably name, short name,
# version, description, legal notice, link, update URL and "other" --
# confirm against the definition of objects.SourceTuple.
sources = [
    # CC-CEDICT
    objects.SourceTuple(
        "CC-CEDICT",
        "CC",
        "2022-02-07",
        (
            "CC-CEDICT is a continuation of the CEDICT project started by "
            "Paul Denisowski in 1997 with the aim to provide a complete "
            "downloadable Chinese to English dictionary with pronunciation "
            "in pinyin for the Chinese characters."
        ),
        (
            "This work is licensed under a Creative Commons "
            "Attribution-ShareAlike 4.0 International License."
        ),
        "http://www.mdbg.net/chindict/chindict.php?page=cc-cedict",
        "",
        "",
    ),
    # CC-Canto
    objects.SourceTuple(
        "CC-CANTO",
        "CCY",
        "2017-02-02",
        (
            "CC-Canto is an open-source Cantonese-to-English dictionary "
            "with about 22,000 entries, designed to be used alongside "
            "CC-CEDICT."
        ),
        (
            "This work is licensed under a Creative Commons "
            "Attribution-ShareAlike 3.0 License."
        ),
        "http://cantonese.org/download.html",
        "",
        "",
    ),
]
"<source version> <source description> <source legal> " "<source link> <source update url> <source other>")) print(( "e.g. python3 script.py moedict.db ./dict-revised.json " '"Ministry of Education Dictionary (MoEDict)" MOE 2021-08-06 ' '"本典為一部歷史語言辭典,記錄中古至現代各類詞語,並大量引用古典文獻書證,字 音部分則兼收現代及傳統音讀。" ' '"中華民國教育部《重編國語辭典修訂本》資料採「創用CC-姓名標示- 禁止改作 3.0 臺灣授權條款」釋出' '本授權條款允許使用者重製、散布、傳輸著作(包括商業性利用),但不得修改該著作,使用時必須遵照「使用說明」之內容要求。" ' '"https://language.moe.gov.tw/001/Upload/Files/site_content/M0001/respub/dict_reviseddict_download.html" "" "words,sentences"' )) sys.exit(1) cc_cedict.load() source = objects.SourceTuple( sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6], sys.argv[7], sys.argv[8], sys.argv[9], sys.argv[10], ) logging.getLogger().setLevel(logging.INFO) words = [] parse_file(sys.argv[2], words) write(sys.argv[1], source, words)
chinese_sentences = {} # Use this to store all the source sentences nonchinese_sentences = {} # Use this to store all the target sentences intermediate_ids = set() # Use this to store ids of sentences between source/target chinese_sentences_filtered = ( {} ) # Store only source sentences that match a target sentence nonchinese_sentences_filtered = ( {} ) # Store only target sentences that match a source sentence links = {} # Use this to store all the links between sentences source = objects.SourceTuple( sys.argv[6], sys.argv[7], sys.argv[8], sys.argv[9], sys.argv[10], sys.argv[11], sys.argv[12], sys.argv[13], ) parse_sentence_file( sys.argv[2], sys.argv[4], sys.argv[5], chinese_sentences, nonchinese_sentences, intermediate_ids, enable_jyutping, enable_pinyin, ) parse_links_file(