示例#1
0
 def test_jamdict_xml(self):
     """Import the mini JMDict/Kanjidic2/JMnedict XML fixtures into an
     in-memory database and verify a basic kana lookup."""
     print("Test Jamdict search in XML files")
     jam = Jamdict(
         ":memory:", jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2,
         jmnedict_xml_file=MINI_JMNE, auto_config=True)
     jam.import_data()
     result = jam.lookup('おみやげ')
     # one entry, and exactly the two kanji that spell the word
     found_literals = {c.literal for c in result.chars}
     self.assertEqual(1, len(result.entries))
     self.assertEqual(2, len(result.chars))
     self.assertEqual(found_literals, {'土', '産'})
示例#2
0
 def test_lookup_result(self):
     """Verify lookup both by kana text and by entry ID (``id#<idseq>``)."""
     jam = Jamdict(jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2,
                   auto_config=False, auto_expand=False)
     # lookup by surface kana text
     by_text = jam.lookup('おみやげ')
     self.assertTrue(by_text.entries)
     self.assertEqual(by_text.entries[0].kana_forms[0].text, 'おみやげ')
     # test lookup by ID
     by_id = jam.lookup('id#{}'.format(1002490))
     self.assertTrue(by_id.entries)
     self.assertEqual(by_id.entries[0].kana_forms[0].text, 'おとそ')
示例#3
0
class PrepJam(Processor):
    """Processor that annotates analysed sentences with Jamdict lookups.

    Each token's lemma is looked up in the dictionary; for every hit a new
    concept tagged with the entry idseq(s) is attached to the sentence.
    """

    def __init__(self, info, name="jam"):
        super().__init__(info, name)
        self.parser = JapaneseAnalyser()  # Japanese tokeniser/analyser
        self.jam = Jamdict()              # dictionary lookup backend

    def process(self, sent):
        """Analyse *sent* and attach dictionary concepts to its tokens.

        Accepts a ``Sentence``; any other input is wrapped in a ``Sentence``
        and re-processed. Returns the (mutated) sentence: ``sent.shallow``
        holds the token-level analysis and ``sent.text`` is rebuilt from the
        analysed tokens.
        """
        if isinstance(sent, Sentence):
            ttl_sent = self.parser.analyse(sent.text)
            # lookup each token in the dictionary
            for idx, token in enumerate(ttl_sent):
                if not token.lemma:
                    continue
                result = self.jam.lookup(token.lemma, strict_lookup=True)
                if result.entries or result.chars:
                    # tag holds all matching entry ids, ';'-separated
                    ids = []
                    for e in result.entries:
                        ids.append('jam::{}'.format(e.idseq))
                    # for c in result.chars:
                    #     ids.append('jam:char:{}'.format(c.literal))
                    nc = ttl_sent.new_concept(tag=';'.join(ids), clemma=token.text, tokens=[token])
                    # comment = TextReport.string()
                    # dump_result(result, report=comment)
                    # nc.comment = comment.content()
                    nc.comment = result.text(compact=False, no_id=True, with_chars=False)
            sent.shallow = ttl_sent
            # rebuild the surface text from the analysed token stream
            sent.text = ' '.join(t.text for t in sent.shallow.tokens)
            return sent
        else:
            return self.process(Sentence(sent))
示例#4
0
 def test_search_by_ne_type(self):
     """Verify named-entity type listing and lookup by NE type name."""
     print("Test Jamdict search in XML files")
     jam = Jamdict(
         ":memory:", jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2,
         jmnedict_xml_file=MINI_JMNE, auto_config=True)
     jam.import_data()
     netypes = jam.all_ne_type()
     self.assertEqual(
         ['company', 'fem', 'given', 'organization', 'person', 'place',
          'surname', 'unclass'],
         netypes)

     # collect every kanji writing among the names returned for a NE type
     def _kanji_for(ne_type):
         collected = set()
         for name in jam.lookup(ne_type).names:
             collected.update(k.text for k in name.kanji_forms)
         return collected

     self.assertIn("厦門", _kanji_for("place"))
     self.assertTrue({'埼銀', 'IKEA'}.issubset(_kanji_for("company")))
示例#5
0
 def test_jamdict_sqlite_all(self):
     """Build a SQLite DB from the mini XML files, then look up through
     both the XML backend and the imported SQLite backend."""
     # start from a clean database file
     if os.path.isfile(TEST_DB):
         os.unlink(TEST_DB)
     jam = Jamdict(db_file=TEST_DB, kd2_file=TEST_DB,
                   jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2)
     # Lookup using XML
     getLogger().debug("Results: {}".format(jam.jmdict_xml.lookup('おみやげ')))
     # Lookup using SQLite
     jam.import_data()
     # test lookup
     result = jam.lookup('おみやげ')
     print(result.entries)
     self.assertEqual(len(result.entries), 1)
     self.assertEqual(len(result.chars), 2)
     self.assertEqual({c.literal for c in result.chars}, {'土', '産'})
示例#6
0
 def set_english(self):
     """Build ``self.english`` from dictionary senses whose kana form equals
     ``self.reading``, and set ``self.isVerb`` when a Godan/Ichidan verb
     sense is seen.

     Side effects: resets ``self.english`` and ``self.isVerb``; also leaves
     the last inspected kana form bound to ``self.kana`` (the loop variable
     deliberately targets an instance attribute).
     """
     jmd = Jamdict()
     self.english = ''
     self.isVerb = 0
     results = jmd.lookup(self.japanese)
     for entry in results.entries:
         # print(entry)
         # print()
         # NOTE(review): the loop target is self.kana (instance attribute),
         # not a local — presumably other code reads self.kana; confirm.
         for self.kana in entry.kana_forms:
             if self.reading == str(self.kana):
                 if self.english:
                     # separate glosses from previously matched entries
                     self.english += '; '
                 for idx, s in enumerate(entry.senses):
                     if idx > 0:
                         self.english += '; '
                     # verb detection by scanning the sense text
                     if (str(s).find('Godan verb') !=
                             -1) or (str(s).find('Ichidan verb') != -1):
                         self.isVerb = 1
                     self.english += self.remGrammar(str(s))
示例#7
0
 def set_english_from_kana(self):
     """Build ``self.english`` from dictionary senses whose kana form equals
     ``self.japanese`` exactly, and set ``self.isVerb`` for verb senses
     (excluding suru-nouns and adverbs).

     Side effects: resets ``self.english`` and ``self.isVerb``; leaves the
     last inspected kana form bound to ``self.kana`` (kept for parity with
     ``set_english``).
     """
     jmd = Jamdict()
     self.english = ''
     self.isVerb = 0
     results = jmd.lookup(self.japanese)
     for entry in results.entries:
         # Fixed: the previous enumerate() counter ("noEntries") was never
         # used, so the plain iterable is looped directly. The loop target
         # remains self.kana on purpose (mirrors set_english).
         for self.kana in entry.kana_forms:
             if self.japanese == str(self.kana):
                 if self.english:
                     self.english += '; '
                 for idx, s in enumerate(entry.senses):
                     if idx > 0 and self.english:
                         self.english += '; '
                     # heuristic: any "verb" sense counts, except
                     # "verb suru" and "adverb"
                     if str(s).find('verb') != -1 and str(s).find(
                             'verb suru') == -1 and str(s).find(
                                 'adverb') == -1:
                         self.isVerb = 1
                     self.english += self.remGrammar(str(s))
示例#8
0
def get_jam(cli, args):
    """Construct a Jamdict instance from parsed CLI arguments.

    When ``args.kd2`` is given it is used as a separate Kanjidic2 database;
    otherwise the JMDict database file doubles as the Kanjidic2 source.
    Logs a warning when Kanjidic2 ends up unavailable.
    """
    if not args.jdb:
        # normalise falsy values ('' etc.) to None for Jamdict
        args.jdb = None
    if args.kd2:
        cli.logger.warning("Jamdict database location: {}".format(args.jdb))
        cli.logger.warning("Kanjidic2 database location: {}".format(args.kd2))
        kd2_file = args.kd2
    else:
        cli.logger.debug(
            "Using the same database for both JMDict and Kanjidic2")
        kd2_file = args.jdb
    # single construction point (was duplicated in both branches)
    jmd = Jamdict(db_file=args.jdb,
                  kd2_file=kd2_file,
                  jmd_xml_file=args.jmdxml,
                  kd2_xml_file=args.kd2xml)
    if jmd.kd2 is None:
        cli.logger.warning("Kanjidic2 database could not be found")
    return jmd
    def __init__(self, user_dict="", user_dict_en=""):
        """Set up Jamdict, a Janome tokenizer and an optional user gloss table.

        :param user_dict: path to a Janome "simpledic" user dictionary; empty
                          string means use the default tokenizer.
        :param user_dict_en: path to a CSV of custom English glosses
                             (columns: word, reading, meaning); empty string
                             disables the custom table.
        """
        self.dict_en = {}
        # prefer the DB bundled next to this module; fall back to the
        # directory of the executed script
        dbfile = os.path.dirname(__file__) + "/res/jamdict.db"
        if not os.path.isfile(dbfile):
            dbfile = os.path.dirname(sys.argv[0]) + "/res/jamdict.db"

        self.jmd = Jamdict(db_file=dbfile, kd2_file=dbfile)
        if user_dict != "":
            self.tokenizer = Tokenizer(user_dict, udic_type="simpledic", udic_enc="utf8")
        else:
            self.tokenizer = Tokenizer()
        # drop symbols/particles; count tokens by base form
        self.token_filters = [POSStopFilter(['記号', '助詞']), TokenCountFilter(att='base_form')]
        if user_dict_en != "":
            with open(user_dict_en, newline='', encoding="utf-8") as csvfile:
                dic_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
                for row in dic_reader:
                    if len(row) >= 3:
                        self.dict_en[row[0]] = {
                            'reading': row[1],
                            'meaning': row[2]
                        }
示例#10
0
async def word_translate(request):
    """Look up the requested word in Jamdict and return its first entry
    as JSON (``{"en_word": <entry text>}``)."""
    payload = await request.json()

    # fake_data = {
    #     "word": "自然",
    # }
    result = Jamdict().lookup(payload["word"])

    # print all word entries
    #for entry in result.entries:
    #    print(entry)

    # tokenizer = WordTokenizer('Sentencepiece', model_path=MECAB_PATH)
    # print(tokenizer.tokenize(sentence))
    # # => [▁, 自然, 言語, 処理, を, 勉強, し, ています]
    #return web.json_response({"en_word": result.entries[0]})
    first_entry = result.entries[0]
    return web.json_response(
        data={"en_word": first_entry.text()},
        # TODO: later to setup -> result.entries[0].to_json()
        headers=HEADERS,
    )
示例#11
0
 def test_jamdict_sqlite_all(self):
     """Import the mini XML data into a SQLite DB file, then verify that
     repeated lookups stay stable, including in memory mode."""
     if os.path.isfile(TEST_DB):
         os.unlink(TEST_DB)
     TEST_DB.touch()
     jam = Jamdict(db_file=TEST_DB, jmd_xml_file=MINI_JMD,
                   kd2_xml_file=MINI_KD2, jmnedict_xml_file=MINI_JMNE)
     # Lookup using XML
     getLogger().debug("Results: {}".format(jam.jmdict_xml.lookup('おみやげ')))
     # Lookup using SQLite
     jam.import_data()

     # shared verification: one entry and the two expected kanji
     def _verify(found):
         self.assertIsNotNone(found.entries)
         self.assertEqual(len(found.entries), 1)
         self.assertEqual(len(found.chars), 2)
         self.assertEqual({c.literal for c in found.chars}, {'土', '産'})

     # test lookup
     _verify(jam.lookup('おみやげ'))
     print("Test reading DB into RAM")
     ram_jam = Jamdict(TEST_DB, memory_mode=True)
     # repeated lookups against the memory-mode instance must all agree
     for ordinal in ("1st", "2nd", "3rd"):
         print("{} lookup".format(ordinal))
         _verify(ram_jam.lookup('おみやげ'))
示例#12
0
 def test_jamdict_data(self):
     """Search かえる filtered by POS: verb only, noun only, then both."""
     jam = Jamdict()
     verb_kanji = {'変える', '代える', '換える', '替える'}
     noun_kanji = {'蛙', '蛤', '蝦'}
     # search verb kaeru
     res = jam.lookup("かえる", pos="transitive verb")
     ids = [e.idseq for e in res.entries]
     self.assertIn(1510650, ids)
     self.assertIn(1589780, ids)
     self.assertTrue(verb_kanji.issubset(all_kanji(res)))
     # search by noun kaeru
     res2 = jam.lookup("かえる", pos='noun (common) (futsuumeishi)')
     self.assertIn(1577460, [e.idseq for e in res2.entries])
     self.assertTrue(noun_kanji.issubset(all_kanji(res2)))
     # search both noun and verb
     res3 = jam.lookup(
         "かえる", pos=['noun (common) (futsuumeishi)', "transitive verb"])
     forms3 = all_kanji(res3)
     self.assertTrue(verb_kanji.issubset(forms3))
     self.assertTrue(noun_kanji.issubset(forms3))
示例#13
0
    def translationButtonClicked(self, text):
        """Convert the word in the insertion box to romaji/katakana/hiragana/
        furigana, look it up in Jamdict, and fill the corresponding UI boxes.

        Shows an error dialog and does nothing when the insertion box is empty.
        Returns the (romaji, katakana, hiragana, furigana) tuple.
        """
        if self.insertionBox.text() == "":
            msg = QMessageBox()
            msg.setWindowTitle("Error")
            msg.setText("There is no word to translate!")
            msg.exec_()
        else:
            jmd = Jamdict()
            text = self.insertionBox.text()

            Window.Rtext = toRomaji(text)
            Window.Ktext = toKatakana(text)
            Window.Htext = toHiragana(text)
            result = jmd.lookup(text)

            text = toTokensDictionary(text)

            separater = ""
            Window.Ftext = toFurigana(text)
            Window.Ftext = separater.join(Window.Ftext)

            # Fallback to katakana/hiragana spellings when nothing is found.
            # Fixed: use identity comparison ("is None") instead of "== None".
            # NOTE(review): jmd.lookup appears to always return a result
            # object rather than None — confirm these fallbacks can trigger.
            if result is None:
                result = jmd.lookup(Window.Ktext)
            if result is None:
                result = jmd.lookup(Window.Htext)

            Window.Etext = repr(result.entries).strip("[]")
            Window.Ctext = repr(result.chars).strip("[]")

            self.romajiBox.setText(Window.Rtext)
            self.katakanaBox.setText(Window.Ktext)
            self.hiraganaBox.setText(Window.Htext)
            self.furiganaBox.setText(Window.Ftext)
            self.entriesBox.setText(Window.Etext)
            self.charsBox.setText(Window.Ctext)

            return Window.Rtext, Window.Ktext, Window.Htext, Window.Ftext
示例#14
0
 def test_search_by_pos(self):
     """Verify part-of-speech listing and POS-filtered lookups, including
     the warning emitted when the POS filter is a bare string."""
     print("Test Jamdict search in XML files")
     jam = Jamdict(":memory:",
                   jmd_xml_file=MINI_JMD,
                   kd2_xml_file=MINI_KD2,
                   jmnedict_xml_file=MINI_JMNE,
                   auto_config=True)
     jam.import_data()
     # test get all pos
     poses = jam.all_pos()
     expected = {
         'Godan verb - -aru special class', "Godan verb with `ku' ending",
         "Godan verb with `ru' ending", "Godan verb with `su' ending",
         "Godan verb with `u' ending", 'Ichidan verb',
         'adjectival nouns or quasi-adjectives (keiyodoshi)',
         'adjective (keiyoushi)', 'adverb (fukushi)',
         "adverb taking the `to' particle", 'auxiliary verb', 'conjunction',
         'expressions (phrases, clauses, etc.)', 'interjection (kandoushi)',
         'intransitive verb', 'noun (common) (futsuumeishi)',
         'noun or participle which takes the aux. verb suru',
         'noun or verb acting prenominally',
         "nouns which may take the genitive case particle `no'",
         'pre-noun adjectival (rentaishi)', 'pronoun', 'transitive verb'
     }
     self.assertEqual(expected, set(poses))
     # POS filter passed as a list: the supported form
     result = jam.lookup('おみやげ', pos=['noun (common) (futsuumeishi)'])
     self.assertEqual(1, len(result.entries))
     # POS filter passed as a bare string still works but must log a warning
     with self.assertLogs('jamdict.jmdict_sqlite', level="WARNING") as cm:
         result = jam.lookup('おみやげ', pos='noun (common) (futsuumeishi)')
         self.assertEqual(1, len(result.entries))
         warned_pos_as_str = False
         for line in cm.output:
             if "POS filter should be a collection, not a string" in line:
                 warned_pos_as_str = True
                 break
         self.assertTrue(warned_pos_as_str)
     # non-matching POS filters out the entry; a matching POS in the list
     # brings it back
     result = jam.lookup('おみやげ', pos=['intransitive verb'])
     self.assertFalse(result.entries)
     result = jam.lookup(
         'おみやげ', pos=['intransitive verb', 'noun (common) (futsuumeishi)'])
     self.assertTrue(result.entries)
class LearningMaterialGetter:
    """Tokenise Japanese text with Janome and attach Jamdict dictionary
    information, optionally augmented by a user-supplied English gloss table.
    """

    def __init__(self, user_dict="", user_dict_en=""):
        """Set up Jamdict, the tokenizer and the optional custom gloss table.

        :param user_dict: path to a Janome "simpledic" user dictionary; empty
                          string means use the default tokenizer.
        :param user_dict_en: path to a CSV of custom English glosses
                             (columns: word, reading, meaning).
        """
        self.dict_en = {}
        # prefer the DB bundled next to this module; fall back to the
        # directory of the executed script
        dbfile = os.path.dirname(__file__) + "/res/jamdict.db"
        if not os.path.isfile(dbfile):
            dbfile = os.path.dirname(sys.argv[0]) + "/res/jamdict.db"

        self.jmd = Jamdict(db_file=dbfile, kd2_file=dbfile)
        if user_dict != "":
            self.tokenizer = Tokenizer(user_dict, udic_type="simpledic", udic_enc="utf8")
        else:
            self.tokenizer = Tokenizer()
        # drop symbols/particles; count tokens by base form
        self.token_filters = [POSStopFilter(['記号', '助詞']), TokenCountFilter(att='base_form')]
        if user_dict_en != "":
            with open(user_dict_en, newline='', encoding="utf-8") as csvfile:
                dic_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
                for row in dic_reader:
                    if len(row) >= 3:
                        self.dict_en[row[0]] = {
                            'reading': row[1],
                            'meaning': row[2]
                        }

    def tokenize(self, text):
        """Run the Janome analyzer over *text* and return its token stream."""
        a = Analyzer(tokenizer=self.tokenizer, token_filters=self.token_filters)
        return a.analyze(text)

    def getDictionaryInfos(self, pairs):
        """Return ``(token, lookup_result)`` pairs for tokens that contain
        katakana/kanji characters and have a dictionary hit.

        *pairs* is an iterable of ``(token, value)`` tuples (the value part
        is ignored here).
        """
        infos = []
        for token, v in pairs:
            # only bother looking up tokens containing katakana or kanji
            match = re.match("[\u30A1-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff66-\uff9f]", token)
            if match:
                dic_info = self.jmd.lookup(token)
                if len(dic_info.entries) > 0 or len(dic_info.chars) > 0:
                    # Inject custom dictionary meaning
                    # NOTE(review): this inner branch only fires when there
                    # are chars but no entries (outer condition requires at
                    # least one of the two) — confirm that is the intent.
                    if len(dic_info.entries) == 0 and token in self.dict_en:
                        meaning = self.dict_en[token]['meaning']
                        reading = self.dict_en[token]['reading']
                        d = JMDEntry()
                        d.senses = [Sense()]
                        d.kana_forms = [KanaForm()]
                        d.kana_forms[0].text = reading
                        d.senses[0].gloss.append(SenseGloss("", "", meaning))
                        dic_info.entries.append(d)
                    infos.append((token, dic_info))

        return infos
示例#16
0
 def test_warn_to_json_deprecated(self):
     """Calling ``to_json()`` on a lookup result must raise DeprecationWarning."""
     print("Test Jamdict search in XML files")
     jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD,
                   kd2_xml_file=MINI_KD2, jmnedict_xml_file=MINI_JMNE)
     jam.import_data()
     # both a JMDict entry and a JMnedict name go through the deprecated API
     for query in ("おみやげ", "シェンロン"):
         with self.assertWarns(DeprecationWarning):
             found = jam.lookup(query)
             self.assertTrue(found.to_json())
示例#17
0
 def test_lookup_iter(self):
     """Verify the iterator-based lookup API over entries, characters and
     named entities."""
     jam = Jamdict(":memory:",
                   jmd_xml_file=MINI_JMD,
                   kd2_xml_file=MINI_KD2,
                   jmnedict_xml_file=MINI_JMNE,
                   auto_config=True)
     jam.import_data()
     # verify entries
     # '%' acts as a wildcard, with an optional POS filter
     res = jam.lookup_iter("おこ%", pos="noun (common) (futsuumeishi)")
     entries = [e.text() for e in res.entries]
     expected = [
         'おこのみやき (お好み焼き) : okonomiyaki/savoury pancake containing meat or seafood and '
         'vegetables', 'おこさん (お子さん) : child',
         "おこさま (お子様) : child (someone else's)"
     ]
     self.assertEqual(expected, entries)
     # verify characters
     res = jam.lookup_iter("お土産")
     self.assertIsNotNone(res.entries)
     self.assertIsNotNone(res.chars)
     self.assertIsNotNone(res.names)
     # verify characters
     chars = [repr(c) for c in res.chars]
     expected = [
         '土:3:soil,earth,ground,Turkey',
         '産:11:products,bear,give birth,yield,childbirth,native,property'
     ]
     self.assertEqual(expected, chars)
     # verify names
     # looking up by NE type ("surname") yields matching name entities
     res = jam.lookup_iter("surname")
     names = [n.text() for n in res.names]
     expected = [
         'しめたに (〆谷) : Shimetani (surname)', 'しめき (〆木) : Shimeki (surname)',
         'しめの (〆野) : Shimeno (surname)'
     ]
     self.assertEqual(expected, names)
示例#18
0
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

########################################################################

import json
from jamdict import Jamdict

########################################################################

# Create an instance of Jamdict
jam = Jamdict()  # uses the default/bundled database location
print("Jamdict DB file: {}".format(jam.db_file))

# Lookup by kana
result = jam.lookup('おかえし')
for entry in result.entries:
    print(entry)

# Lookup by kanji
print("-----------------")
result = jam.lookup('御土産')
for entry in result.entries:
    print(entry)

# Lookup a name
# a name entity is also a jamdict.jmdict.JMDEntry object
示例#19
0
def mainloop(file, savedump, database, cfgfile, records, orderby, compact, known, verbose, nosense, translate, destlang):
    """Interactive main loop: read Japanese text (from an editor prompt or a
    file), optionally machine-translate it via the Baidu API, parse it with
    KNP, look each morpheme up in Jamdict, dump the results to a markdown
    file and record new words in a SQLite database.

    :param file: input file path; "" means prompt the user via an editor
    :param savedump: path of the markdown dump file to write
    :param database: SQLite database path for the word table
    :param cfgfile: config file holding Baidu translation API credentials
    :param records: how many previously saved words to display per round
    :param orderby: column used to order the displayed words (e.g. id, count)
    :param compact: "true" to print entries in Jamdict's compact text form
    :param known: path of a CSV listing already-known words (word,POS|POS...)
    :param verbose: "none" skips known words entirely; "half" skips senses
    :param nosense: "true" to skip dictionary sense output altogether
    :param translate: "true" to call the Baidu translation API
    :param destlang: target language code for the translation
    """
    jmd = Jamdict()
    knp = KNP()

    # load the known-word list; '#'-prefixed lines are comments
    knownlist = {}
    with open(known, 'r') as reader:
        lines = reader.readlines()
        for line in lines:
            if re.match("^#", line):
                continue
            entry = line.split(",")
            if len(entry) == 2:
                knownlist[entry[0].strip()] = entry[1].strip()

    appid = ""
    appkey = ""
    if translate == "true":
        # See https://fanyi-api.baidu.com/
        # See https://fanyi-api.baidu.com/api/trans/product/desktop?req=developer
        # See https://docs.python.org/3/library/configparser.html
        config = configparser.ConfigParser()
        config.read(cfgfile)
        # Set your own appid/appkey.
        appid = config['api.fanyi.baidu.com']['appid']
        appkey = config['api.fanyi.baidu.com']['appkey']
        #print("appid=" + appid)
        #print("appkey=" + appkey)

    jumandict = sqlite3.connect(database)
    dictcursor = jumandict.cursor()
    dictcursor.execute("CREATE TABLE IF NOT EXISTS words (id INTEGER PRIMARY KEY, name TEXT UNIQUE, desc TEXT, count INTEGER)")
    dumper = open(savedump, 'w')
    dumper.write("# 日语学习记录\n\n")

    while True:
        userinputs = ""
        if file == "":
            # interactive mode: confirm, then open an editor for input
            try:
                if not click.confirm('想要进入编辑器输入日文句子或段落进行分析吗?'):
                    continue
            except EOFError:
                print("\n你选择退出了哦!")
                break
            except click.Abort:
                print("\n你选择退出了哦!")
                break

            if records > 0:
                # show recently saved / most frequent words before editing.
                # NOTE(review): orderby/records are interpolated into the SQL
                # via str.format — acceptable for a local CLI, but keep them
                # away from untrusted input.
                rows = dictcursor.execute("SELECT id, name, desc, count FROM words ORDER BY {} DESC LIMIT {}".format(orderby, records)).fetchall()
                words = len(rows)
                if words > 0:
                    if orderby == "id":
                        print("最近保存过的{}个单词(最近优先排序):".format(words))
                    else:
                        print("出现频率最高的{}个单词(高频优先排序):".format(words))
                count = 0
                for row in rows:
                    print('{} [{} ({}次)]:\n'.format(row[0], row[1], row[3]))
                    print(row[2])

            userinputs = click.edit()
            if userinputs is None:
                print("你啥也没输入啊!")
                continue
        else:
            with open(file, 'r') as reader:
                lines = reader.readlines()
                userinputs = "".join(lines)

        if translate == "true":
            # For list of language codes, please refer to `https://api.fanyi.baidu.com/doc/21`
            from_lang = 'jp'
            to_lang = destlang

            endpoint = 'http://api.fanyi.baidu.com'
            path = '/api/trans/vip/translate'
            url = endpoint + path

            salt = random.randint(32768, 65536)
            sign = make_md5(appid + userinputs + str(salt) + appkey)

            # Build request
            headers = {'Content-Type': 'application/x-www-form-urlencoded'}
            payload = {'appid': appid, 'q': userinputs, 'from': from_lang, 'to': to_lang, 'salt': salt, 'sign': sign}

            # Send request
            r = requests.post(url, params=payload, headers=headers)
            result = r.json()

            # Show response
            print("=================================")
            print(userinputs)
            dumper.write("```\n")
            dumper.write(userinputs)
            print("=================================")
            dumper.write("=================================\n")
            trans_result = result["trans_result"]
            for i in range(len(trans_result)):
                dst = trans_result[i]["dst"]
                print(dst)
                dumper.write(dst + "\n")
            dumper.write("```\n")

        # split the input into sentences on '。' and re-append the separator
        inputsentences = [x+"。" for x in userinputs.split("。") if x.strip() != ""]
        for userinput in inputsentences:
            userinput = userinput.strip()
            userinput = userinput.encode('utf-8','surrogatepass').decode('utf-8')

            print("=================================")
            print(userinput)
            dumper.write("## "+ userinput + "\n\n")

            result = knp.parse(userinput.replace("\n", ""))
            dumper.write("```\n")
            dumper.write(userinput + "\n")
            # print phrases in a staircase layout for readability
            length = 0
            for bnst in result.bnst_list(): # visit each phrase (bunsetsu)
                phrase = "".join(mrph.midasi for mrph in bnst.mrph_list())
                phrase = phrase.replace("\␣", " ")
                print("  " * length + phrase)
                dumper.write("  " * length + phrase + "\n")
                length = length + len(phrase)
                if length > 80:
                    length = 0

            dumper.write("```\n")
            print("=================================")
            for mrph in result.mrph_list(): # visit each morpheme
                # check whether the morpheme's base form + POS are "known"
                found = False
                for known in knownlist.keys():
                    if mrph.genkei == known:
                        types = knownlist[known].split("|")
                        for type in types:
                            if mrph.hinsi == type:
                                found = True
                                break

                # skip known words (in "none" mode) and special symbols
                if ((found == True) and (verbose == "none")) or (mrph.hinsi == "特殊"):
                    continue

                # assemble a one-line description of the morpheme
                message = "ID:{}".format(mrph.mrph_id)
                if mrph.midasi:
                    message += ", 词汇:{}".format(mrph.midasi)
                if mrph.yomi:
                    message += ", 读法:{}".format(mrph.yomi)
                if mrph.genkei:
                    message += ", 原形:{}".format(mrph.genkei)
                if mrph.hinsi and mrph.hinsi != "*":
                    message += ", 词性:{}".format(mrph.hinsi)
                if mrph.bunrui and mrph.bunrui != "*":
                    message += ", 词性细分:{}".format(mrph.bunrui)
                if mrph.katuyou1 and mrph.katuyou1 != "*":
                    message += ", 活用型:{}".format(mrph.katuyou1)
                if mrph.katuyou2 and mrph.katuyou2 != "*":
                    message += ", 活用形:{}".format(mrph.katuyou2)
                if mrph.imis and mrph.imis != "NIL":
                    message += ", {}".format(mrph.imis) # semantic information
                elif mrph.repname:
                    message += ", 代表符号:{}".format(mrph.repname)
                print("\t" + message)
                dumper.write("### " + message + "\n")

                if nosense == "true" or (found == True and verbose == "half"):
                    continue

                # use exact matching to find exact meaning
                # fall back from base form to surface form to reading
                dictcheck = jmd.lookup(mrph.genkei)
                if len(dictcheck.entries) == 0:
                    dictcheck = jmd.lookup(mrph.midasi)
                    if len(dictcheck.entries) == 0:
                        dictcheck = jmd.lookup(mrph.yomi)
                if len(dictcheck.entries) > 0:
                    desc = ""
                    print("\n")
                    dumper.write("\n")
                    for entry in dictcheck.entries:
                        text = ""
                        if compact == "true":
                            text = entry.text(compact=False, no_id=True)
                            text = re.sub('[`\']', '"', text)
                            print(text)
                        else:
                            # build a "kana (kanji)" header plus numbered senses
                            tmp = []
                            if entry.kana_forms:
                                tmp.append(entry.kana_forms[0].text)
                            if entry.kanji_forms:
                                tmp.append("({})".format(entry.kanji_forms[0].text))
                            header = " ".join(tmp)
                            tmp = []
                            if entry.senses:
                                for sense, idx in zip(entry.senses, range(len(entry.senses))):
                                    tmps = [str(x) for x in sense.gloss]
                                    if sense.pos:
                                        s = '{gloss} ({pos})'.format(gloss='/'.join(tmps), pos=('(%s)' % '|'.join(sense.pos)))
                                    else:
                                        s = '/'.join(tmps)
                                    s = re.sub('[`\']', '"', s)
                                    tmp.append('    {i}. {s}\n'.format(i=idx + 1, s=s))
                            senses = "".join(tmp)
                            print(header)
                            print(senses)
                            text = "**" + header + "**\n" + senses
                        desc = desc + text + "\n"
                        text = re.sub('[|]', '\|', text)
                        dumper.write("- " + text + "\n")
                    # NOTE(review): values are escaped by doubling quotes, but
                    # a parameterised query (?, ?, ?) would be safer here.
                    dictcursor.execute('INSERT INTO words (name, desc, count) VALUES ("{}", "{}", "{}") ON CONFLICT (name) DO UPDATE SET count = count + 1'
                                        .format(mrph.genkei.replace('"', '""'), desc.replace('"', '""'), 1))
            jumandict.commit()

        dumper.flush()

        # single-shot when reading from a file
        if file != "":
            break

    jumandict.close()
    dumper.close()
示例#20
0
import operator
import os
import pickle
import random
import re
import sys
import time
from collections import OrderedDict, defaultdict
from itertools import chain
from typing import List, Optional

import pygame
import romkan
from jamdict import Jamdict

JMD = Jamdict()  # module-level shared dictionary instance

# make sure the directory for generated data files exists
if not os.path.exists('data'):
    os.mkdir('data')

# output path for the word-frequency dump (JMDict nf_xx priority ranks)
WORDS_FREQ_FILEPATH = "data/nf_words_freq"


def generate_word_frequency_file(filepath):
    nf_to_kanjis = defaultdict(set)
    for entry in JMD.jmdict_xml.entries:
        for word in chain(entry.kanji_forms, entry.kana_forms):
            for pri in word.pri:
                if pri.startswith('nf'):
                    nf_x = int(pri[-2:])
                    nf_to_kanjis[nf_x].add(word.text)
示例#21
0
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

########################################################################

import os
from jamdict import Jamdict

########################################################################

# Create an instance of Jamdict
jam = Jamdict()  # uses the default/bundled database location
print("Jamdict DB file: {}".format(jam.db_file))

# Lookup by kana
result = jam.lookup('おかえし')
for entry in result.entries:
    print(entry)
print("-----------------")

# Lookup by kanji
result = jam.lookup('御土産')
for entry in result.entries:
    print(entry)
print("-----------------")

示例#22
0
File: kotoba.py  Project: cliffpham/kioku
class Kotoba():
    """Word game: the player forms Japanese words out of a random mora set,
    validated against the Jamdict dictionary."""

    def __init__(self):
        self.jmd = Jamdict()
        # base hiragana moras that can appear on a game board
        self.moras = [
            'あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す',
            'せ', 'そ', 'た', 'ち', 'つ', 'て', 'と', 'な', 'に', 'ぬ', 'ね', 'の', 'は',
            'ひ', 'ふ', 'へ', 'ほ', 'ま', 'み', 'む', 'め', 'も', 'ら', 'り', 'る', 'れ',
            'ろ', 'や', 'ゆ', 'よ', 'わ', 'ん'
        ]

        # voiced/semi-voiced/small variants that come "free" with a base mora
        self.special_cases = {
            'か': ['が'],
            'き': ['ぎ'],
            'け': ['げ'],
            'こ': ['ご'],
            'さ': ['ざ'],
            'し': ['じ'],
            'す': ['ず'],
            'せ': ['ぜ'],
            'そ': ['ぞ'],
            'た': ['だ'],
            'ち': ['ぢ'],
            'つ': ['づ', 'っ'],
            'て': ['で'],
            'と': ['ど'],
            'は': ['ば', 'ぱ'],
            'ひ': ['び', 'ぴ'],
            'ふ': ['ぶ', 'ぷ'],
            'へ': ['べ', 'ぺ'],
            'ほ': ['ぼ', 'ぽ'],
            'や': ['ゃ'],
            'ゆ': ['ゅ'],
            'よ': ['ょ'],
        }

    def find_kotoba(self, word, hidden):
        """Return the dictionary entries for *word* if every one of its moras
        is available in *hidden*; otherwise return None."""
        temp = set()
        hidden_set = set()
        for m in word:
            temp.add(m)
        for m in hidden:
            hidden_set.add(m)
        # NOTE(review): the subset test uses the raw `hidden` iterable, so
        # `hidden_set` is built but never used — equivalent result, but
        # confirm which was intended.
        if not temp.issubset(hidden):
            return None
        test = self.jmd.lookup(word)
        return test.entries

    def generate_moras(self):
        """Build a game board of moras.

        Returns a tuple ``(str_display, current_set, all_set, hidden)`` where
        ``str_display`` is the printable board, ``current_set`` the chosen
        base moras, ``all_set`` groups each base mora with its variants, and
        ``hidden`` lists the randomly added moras plus their variants.
        """
        current_set = []
        all_set = []
        hidden = []

        # always seed the board with 'う' and 'つ' (plus つ's variants)
        all_set.append([self.moras[2]])
        current_set.append(self.moras[2])
        current_set.append(self.moras[17])
        tsu = [self.moras[17]]
        for mora in self.special_cases[self.moras[17]]:
            tsu.append(mora)
        all_set.append(tsu)

        # draw random distinct moras until the board holds nine
        while len(current_set) < 9:
            cur = self.moras[random.randint(0, len(self.moras) - 1)]
            if cur not in current_set:
                current_set.append(cur)
                hidden.append(cur)
                if cur in self.special_cases:
                    temp = [cur]
                    for char in self.special_cases[cur]:
                        temp.append(char)
                        hidden.append(char)
                    all_set.append(temp)
                else:
                    all_set.append([cur])
        # NOTE(review): relies on module-level copy/flatten/shuffle imports
        # that are not visible in this snippet — confirm they exist.
        dup = copy.deepcopy(all_set)
        str_display = ''.join(flatten(dup))
        shuffle(all_set)

        return (str_display, current_set, all_set, hidden)

    def display_result(self, word, entry, max_score):
        """Print a success message, the running score and the entries found."""
        print("Correct!")
        print("Current Score: " + str(max_score))
        print(entry.entries)

    def start_session(self):
        """Run the interactive guessing loop until the player quits."""
        session_started = True
        mora_list = self.generate_moras()
        guesses = set()
        lives_left = 3
        cur_score = 0
        max_score = 0

        while session_started:
            print('The letters are: ' + mora_list[0])
            cmd = input('Create a word: ')
            if cmd not in guesses:
                check = self.jmd.lookup(cmd)
                # NOTE(review): lookup returns a result object, so this
                # comparison against "Found nothing" is presumably always
                # True — confirm the intended failure check.
                if check != "Found nothing":
                    cur_score += 1
                    max_score += 1
                    self.display_result(cmd, check, max_score)
                else:
                    lives_left -= 1
            else:
                print("You have already used " + cmd)
            #reset for a new mora list
            if cur_score >= 5:
                cmd = input('Generate a new list?')
                if cmd == 'y':
                    cur_score = 0
                    mora_list = self.generate_moras()

    def start_game(self):
        """Entry point: greet the player and start a session."""
        print('Hello from start')
        self.start_session()
示例#23
0
from flask import Flask, Response
from functools import wraps
from flask import request

from chirptext.cli import setup_logging

from jamdict import Jamdict
from jamdict import __version__

# ---------------------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------------------

# Configure logging from logging.json, writing log files under ./logs.
setup_logging('logging.json', 'logs')
# Empty static_url_path serves static files from the application root.
app = Flask(__name__, static_url_path="")
# Shared dictionary instance, reused across all requests.
jmd = Jamdict()


def getLogger():
    """Return the logger for this module.

    ``logging`` is imported locally because this module never imports it
    at the top level, so the previous body raised ``NameError`` on every
    call.
    """
    import logging
    return logging.getLogger(__name__)


# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------


def jsonp(func):
    @wraps(func)
    def decorated_function(*args, **kwargs):
        data = func(*args, **kwargs)
示例#24
0
class OfflineTranslator(Translator):
    """
    Offline 'translator' based on stanza, jamdict (dictionary) and jaconv (kana romanization)

    Results are pretty bad, but you don't need cloud API to use it.

    To use this class, make sure to:
      - pip install stanza jamdict jaconv
      - run `stanza.download('ja')` in Python console once, to download the resources for Stanza

    """
    CACHE_NAME = "pyTranslateSwf-cache-OfflineTranslator.json"
    BATCH_SIZE = 100
    # Strips trailing comma-separated alternatives, the "to " infinitive
    # marker and parenthesised qualifiers from a raw dictionary gloss.
    # (Previously this pattern was duplicated inline in two places.)
    _MEANING_NOISE = re.compile(r",.*|to |\(.+\)")

    def __init__(self):
        super().__init__()

        # stanza.download('ja')
        self.nlp = stanza.Pipeline('ja')  # Japanese tokenizer / POS tagger
        self.jmd = Jamdict()              # JMdict / KanjiDic2 dictionary

        # token -> translated string; dictionary lookups are slow, memoize.
        self._translate_jmd_cache = {}

    def _translate_all(self, input_strings: List[str]) -> List[str]:
        """Translate every string in *input_strings*, one at a time."""
        return [self._translate(s) for s in input_strings]

    def _translate(self, string: str) -> str:
        """The actual translation logic: POS-tag the sentence, translate
        nouns/verbs via the dictionary, romanize everything else."""
        if not string or string.isspace():
            return ""

        doc = self.nlp(string)  # run annotation over a sentence
        input_tokens = []
        output_tokens = []

        for sentence in doc.to_dict():
            for d in sentence:
                token = d["text"]
                input_tokens.append(token)
                if d["upos"] in ("NOUN", "VERB"):
                    x = self._translate_jmd(token)
                else:
                    x = self._transliterate(token).upper()
                output_tokens.append(x)

        # Re-join and tidy spacing around parentheses and full stops.
        return " ".join(output_tokens).replace("( ",
                                               "(").replace(" )", ")").replace(
                                                   " .", ".")

    @classmethod
    def _clean_meaning(cls, meaning: str) -> str:
        """Reduce a raw gloss to its first, noise-free alternative."""
        meaning = meaning.split("/")[0]
        return cls._MEANING_NOISE.sub("", meaning).strip()

    def _translate_jmd(self, token: str) -> str:
        """Translate *token* via JMdict, falling back to the first kanji
        meaning, and finally to an upper-cased romanization."""
        if token in self._translate_jmd_cache:
            return self._translate_jmd_cache[token]

        result = self.jmd.lookup(token)

        # get first dictionary meaning
        try:
            output = self._clean_meaning(result.entries[0].senses[0].text())
            self._translate_jmd_cache[token] = output
            return output
        except Exception:
            # Deliberate best-effort: no entry (or unexpected result
            # shape) -> try the kanji character data next.
            pass

        # get first radical meaning
        try:
            output = self._clean_meaning(result.chars[0].meanings()[0])
            self._translate_jmd_cache[token] = output
            return output
        except Exception:
            # Still nothing usable -> fall through to romanization.
            pass

        output = self._translate_jmd_cache[token] = self._transliterate(
            token).upper()
        return output

    @staticmethod
    def _transliterate(token: str) -> str:
        """Romanize *token*: katakana -> hiragana -> latin alphabet."""
        s = jaconv.kata2hira(token)
        s = jaconv.kana2alphabet(s)
        return s
示例#25
0
 def __init__(self, info, name="jam"):
     """Initialise the processor with a Japanese analyser and a dictionary.

     :param info: configuration passed through to the base processor
     :param name: processor name registered with the pipeline
     """
     super().__init__(info, name)
     self.parser = JapaneseAnalyser()  # morphological analyser for Japanese text
     self.jam = Jamdict()  # JMdict/KanjiDic2 lookup handle
示例#26
0
def _mrph_message(mrph):
    """Format one KNP morpheme as the tab-indented line shown in the UI/log."""
    return "\tID:{}, 词汇:{}, 读法:{}, 原形:{}, 词性:{}, 词性细分:{}, 活用型:{}, 活用形:{}, 语义信息:{}, 代表符号:{}".format(
        mrph.mrph_id, mrph.midasi, mrph.yomi, mrph.genkei,
        mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2,
        mrph.imis, mrph.repname)


def _lookup_exact(jmd, mrph):
    """Exact dictionary lookup for a morpheme.

    Tries the base form first, then the surface form, then the kana
    reading, returning the first lookup that produced entries (or the
    last, empty, result).
    """
    dictcheck = jmd.lookup(mrph.genkei)
    if len(dictcheck.entries) == 0:
        dictcheck = jmd.lookup(mrph.midasi)
        if len(dictcheck.entries) == 0:
            dictcheck = jmd.lookup(mrph.yomi)
    return dictcheck


def mainloop(file, database, savedump, records, orderby, guimode):
    """Get user Japanese input then parse it and record new words into database.

    :param file: unused; kept so the caller-facing signature is unchanged
    :param database: path of the sqlite3 database holding the ``words`` table
    :param savedump: path of a plain-text log of every analysed sentence
    :param records: unused; kept so the caller-facing signature is unchanged
    :param orderby: unused; kept so the caller-facing signature is unchanged
    :param guimode: "web", "tk" or "qt" select the matching PySimpleGUI
        backend; any other value falls back to the Wx port
    """
    jmd = Jamdict()
    knp = KNP()

    jumandict = sqlite3.connect(database)
    dictcursor = jumandict.cursor()
    dictcursor.execute(
        "CREATE TABLE IF NOT EXISTS words (id INTEGER PRIMARY KEY, name TEXT UNIQUE, desc TEXT, count INTEGER)"
    )
    # Explicit UTF-8: the dump receives Japanese text, and the platform
    # default encoding (e.g. cp1252 on Windows) would raise on write.
    dumper = open(savedump, 'w', encoding='utf-8')

    # Pass any command line argument for Web use
    if guimode == "web":  # if there is use the Web Interface
        import PySimpleGUIWeb as sg
        import socket
    elif guimode == "tk":  # default uses the tkinter GUI
        import PySimpleGUI as sg
    elif guimode == "qt":
        import PySimpleGUIQt as sg
    else:
        import PySimpleGUIWx as sg

    # All the stuff inside your window.
    # Column legend for the parsed-morpheme messages (currently unused).
    header_list = [
        "ID", "词汇", "读法", "原形", "词性", "词性细分", "活用型", "活用形", "语义信息", "代表符号"
    ]
    uifont = "Ariel 32"
    left_column_layout = [
        [
            sg.T("输入日语"),
            sg.FolderBrowse(),
        ],
        [
            sg.Multiline("", size=(75, 10), key="nihongo"),
        ],
        [
            sg.Button("分析",
                      size=(30, 3),
                      font=uifont,
                      button_color=('white', 'green'),
                      key="submit"),
            sg.Button("退出",
                      size=(30, 3),
                      font=uifont,
                      button_color=('white', 'red'),
                      key="exit")
        ],
        [
            sg.Listbox(values=[],
                       enable_events=True,
                       size=(75, 20),
                       key="parsedwords")
        ],
    ]
    right_column_layout = [
        [sg.T("词汇意义")],
        [
            sg.Listbox(values=[],
                       enable_events=True,
                       size=(75, 33),
                       key="foundentries")
        ],
    ]
    layout = [[
        sg.VSeperator(),
        sg.Column(left_column_layout),
        sg.VSeperator(),
        sg.Column(right_column_layout),
    ]]

    resultlist = []
    window = None
    try:
        # Create the Window
        if guimode == "web":
            hostname = socket.gethostname()
            local_ip = socket.gethostbyname(hostname)
            print("local_ip is " + local_ip)
            window = sg.Window('日语学习',
                               layout,
                               web_ip=local_ip,
                               web_port=8888,
                               web_start_browser=False)
        else:
            window = sg.Window('日语学习', layout)

        # Run the Event Loop
        while True:
            event, values = window.read()
            if event == "exit" or event == sg.WIN_CLOSED:
                break
            if event == "submit":
                userinput = values["nihongo"]
                print("=================================")
                print(userinput)
                userinput = userinput.strip()
                # Round-trip through UTF-8 to drop lone surrogates that
                # some GUI backends feed through.
                userinput = userinput.encode('utf-8',
                                             'surrogatepass').decode('utf-8')

                dumper.write(userinput + "\n\n")

                result = knp.parse(userinput.replace("\n", ""))

                print("=================================")
                print("词素")
                resultlist = result.mrph_list()
                parsedwords = []
                for mrph in resultlist:  # visit each morpheme
                    # Skip punctuation-only morphemes.
                    if mrph.midasi in {"、", "。", "「", "」", "\␣"}:
                        continue
                    message = _mrph_message(mrph)
                    print(message)
                    dumper.write(message + "\n")
                    parsedwords += [message]

                    # use exact matching to find exact meaning
                    dictcheck = _lookup_exact(jmd, mrph)
                    if len(dictcheck.entries) > 0:
                        desc = ""
                        for entry in dictcheck.entries:
                            desc = desc + entry.text(compact=False,
                                                     no_id=True) + "\n"
                        print("\n" + desc)
                        dumper.write("\n" + desc + "\n")
                        # Parameterized query: the previous string-formatted
                        # SQL was injectable and relied on fragile manual
                        # quote doubling.
                        dictcursor.execute(
                            'INSERT INTO words (name, desc, count) VALUES (?, ?, ?) '
                            'ON CONFLICT (name) DO UPDATE SET count = count + 1',
                            (mrph.genkei, desc, 1))

                jumandict.commit()
                window["parsedwords"].update(parsedwords)

            elif event == "parsedwords":  # a morpheme line was chosen from the listbox
                selectedword = values["parsedwords"][0]
                print(selectedword)
                # The leading "ID:<n>" field identifies the morpheme.
                selectedid = int(selectedword.split(',')[0].split(':')[1].strip())
                print("selectedid=" + str(selectedid) + " among " +
                      str(len(resultlist)) + " entries")
                foundentries = []
                for mrph in resultlist:  # visit each morpheme
                    if selectedid != mrph.mrph_id:
                        continue
                    message = _mrph_message(mrph)
                    print(message)
                    # use exact matching to find exact meaning
                    dictcheck = _lookup_exact(jmd, mrph)
                    foundentries += [message]
                    foundentries += ["==================================="]
                    if len(dictcheck.entries) > 0:
                        for entry in dictcheck.entries:
                            desc = entry.text(compact=False, no_id=True)
                            print("\n" + desc)
                            foundentries += [desc]

                window["foundentries"].update(foundentries)
    finally:
        # Release the GUI, database and dump file even if the loop raised.
        if window is not None:
            window.close()
        jumandict.close()
        dumper.close()
import json
import os
import uuid as uid
from functools import reduce

from jamdict import Jamdict

from fastapi import FastAPI, Form, HTTPException
# from starlette.responses import FileResponse

# --------------
# Game Machanics
# --------------
from de.mindscan.orangemoon.httpserver.game_directory import GameDirectory
from de.mindscan.orangemoon.httpserver.game_room import GameRoom
from de.mindscan.orangemoon.httpserver.game_player import GamePlayer

# Shared dictionary instance, reused across all request handlers.
myJamDict = Jamdict()

# File names of the pre-computed kanji stroke datasets.
RADICAL_STROKE_DATA = 'kanjiRadicalStrokeData.json'
KANJI_STROKE_DATA = 'kanjiStrokeData.json'

# Load both stroke datasets once at import time.
# NOTE(review): DATA_BASE_DIR is not defined in this snippet — presumably a
# module-level constant defined elsewhere in the project; confirm before use.
with open(os.path.join(DATA_BASE_DIR, RADICAL_STROKE_DATA), 'r') as jsonFile:
    global_radicalDict = json.load(jsonFile)

with open(os.path.join(DATA_BASE_DIR, KANJI_STROKE_DATA), 'r') as jsonFile:
    global_kanjiDict = json.load(jsonFile)

app = FastAPI()


@app.get("/")
def read_root():
示例#28
0
from jamdict import Jamdict, config
jmd = Jamdict(db_file=config.get_file('JAMDICT_DB'))


def lookup_dic(word, igonre=True):
    word_dic = dict()
    result = jmd.lookup(word, strict_lookup=True, lookup_chars=False)
    result = result.to_json()['entries']
    for entry in result:
        if entry['kanji'][0]['text'] == word:
            word_dic['word'] = entry['kanji'][0]['text']
            word_dic['pronunciation'] = ""
            for k, p in enumerate(entry['kana'], start=1):
                if len(entry['kana']) == 1:
                    word_dic['pronunciation'] += p['text']
                else:
                    word_dic['pronunciation'] += str(k) + ". " + p['text'] + "<br>"
            word_dic['meaning'] = ""
            for j, meaning in enumerate(entry['senses'], start=1):
                if j >= 3:
                    break
                if len(entry['senses']) > 1:
                    word_dic['meaning'] += str(j) + ". "
                for i, one_sense in enumerate(meaning['SenseGloss'], start=1):
                    if i >= 5:
                        break
                    word_dic['meaning'] += one_sense['text']
                    if i != len(meaning['SenseGloss']) and i != 4:
                        word_dic['meaning'] += "; "
                if j != len(entry['senses']) and j != 2:
                    word_dic['meaning'] += "<br>"