Example #1
def test_accent(text, accent):
    # This checks for correct handling of feature fields containing commas as reported in #13
    tagger = Tagger()
    tokens = tagger.parseToNodeList(text)
    # Skip if UnidicFeatures17 is used because it doesn't have 'atype' attribute
    if tokens and isinstance(tokens[0].feature, UnidicFeatures17):
        pytest.skip()
    accent_ = [tok.feature.aType for tok in tokens]
    assert accent_ == accent
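The comment above refers to aType (accent type) values, which can themselves contain commas. A minimal sketch for inspecting that field with a default Tagger (the sample sentence is a placeholder, not taken from the test data):

from fugashi import Tagger

tagger = Tagger()
for tok in tagger.parseToNodeList("外国人参政権"):
    # aType may hold comma-separated accent candidates, the case the test guards against
    print(tok.surface, tok.feature.aType)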
Example #2
class Tokenizer():
    def __init__(self):
        self.tagger = Tagger("-Owakati")

    def tokenize(self, text):
        tokens = self.tagger.parse(text).split(" ")
        return tokens
Example #3
def load_fugashi(write_cfg=False):
    try:
        # help python find libmecab.dll, adjust this to fit your env if necessary
        dll_path = None
        for base in sys.path:
            x = os.path.join(base, "fugashi")
            if os.path.exists(os.path.join(x, "cli.py")) and not dll_path:
                dll_path = x
            x2 = os.path.join(x, "../../../lib/site-packages/fugashi")
            if os.path.exists(x2):
                dll_path = x2
                break

        if not dll_path:
            raise Exception("could not find fugashi installation path")

        if WINDOWS:
            os.add_dll_directory(dll_path)

        from fugashi import Tagger

        dicrc = os.path.join(dll_path, "dicrc")
        if write_cfg:
            with open(dicrc, "wb") as f:
                f.write("\n".join([
                    r"node-format-yomi = %f[9] ",
                    r"unk-format-yomi = %m",
                    r"eos-format-yomi  = \n",
                    "",
                ]).encode("utf-8"))

        wakati = Tagger("-Owakati")
        yomi = Tagger("-Oyomi -r " + dicrc.replace("\\", "\\\\"))

        # import MeCab
        # wakati = MeCab.Tagger('-Owakati')
        info("found fugashi")
        return wakati, yomi
    except Exception:
        import traceback

        warn("could not load fugashi:\n" + traceback.format_exc() + "-" * 72 +
             "\n")
Example #4
def main():
    text = 'softbank'
    tagger = Tagger()
    gtagger = GenericTagger()

    print('Tagger:')
    print(tagger.parse(text))
    for word in tagger(text):
        print(word.surface)
        print(word.feature)
    print()

    print('GenericTagger:')
    print(gtagger.parse(text))
    for word in gtagger(text):
        print(word.surface)
        print(word.feature)
    print()
    print('DONE')
Example #5
def main():
    tagger = Tagger()
    wakati_tagger = Tagger('-Owakati')
    text = '私はご飯を食べます。'
    
    result = wakati_tagger.parse(text)
    print('result1(parse + wakati):')
    print(result)
    print(type(result))
    print()

    result = tagger.parse(text)
    print('result2(parse):')
    print(result)
    print(type(result))
    print()

    result = wakati_tagger(text)
    print('result3(__call__ + wakati):')
    print(result)
    print(type(result))
    print(inspect.getmembers(result[0]))
    print(type(result[0]))
    print()

    result = tagger(text)
    print('result4(__call__):')
    print(result)
    print(type(result))
    print(inspect.getmembers(result[0]))
    print(type(result[0]))
    print()
    print('DONE')
Example #6
File: cli.py Project: polm/fugashi
def info():
    """Print configuration info."""
    args = ' '.join(sys.argv[1:])
    try:
        tagger = GenericTagger(args, quiet=True)
    except RuntimeError:
        tagger = Tagger(args)
    #TODO get the fugashi version here too
    print("Fugashi dictionary info:")
    print("-----")
    for di in tagger.dictionary_info:
        for field in 'version size charset filename'.split():
            print( (field + ':').ljust(10), di[field])
        print('-----')
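The loop above documents the fields exposed by dictionary_info. A small standalone sketch reading the same fields directly:

from fugashi import Tagger

tagger = Tagger()
for di in tagger.dictionary_info:
    print(di["filename"], di["charset"], di["version"], di["size"])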
Example #7
def main():
    tagger = Tagger()
    neologd_tagger = Tagger('-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-unidic-neologd')

    text = '私は、渋谷ストリームでランチを食べる。'
    print('unidic:')
    print(tagger.parse(text))
    print()

    print('unidic-neologd:')
    print(neologd_tagger.parse(text))
    print('DONE')
Example #8
File: cli.py Project: polm/fugashi
def main():
    """
    This is a simple wrapper for fugashi so you can test it from the command line.
    Like the mecab binary, it treats each line of stdin as one sentence. You can
    pass tagger arguments here too.
    """
    args = ' '.join(sys.argv[1:])

    # This should work if you specify a different dictionary,
    # but it should also work with the pip unidic.
    # Try the GenericTagger and then try the Unidic tagger.
    try:
        tagger = GenericTagger(args, quiet=True)
    except RuntimeError:
        tagger = Tagger(args)

    for line in fileinput.input([]):
        print(tagger.parse(line.strip()))
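The docstring above describes the line-per-sentence behavior. A minimal sketch of the same loop over an in-memory list instead of stdin (the sentences are placeholders):

from fugashi import Tagger

tagger = Tagger()
for line in ["今日はいい天気です。", "明日は雨かもしれない。"]:
    print(tagger.parse(line.strip()))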
Example #9
def test_pos(text, tags):
    # There should be a pos property when using the default tagger
    tagger = Tagger()
    tags_ = [tok.pos for tok in tagger(text)]
    assert tags == tags_
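A small sketch of the pos property the comment refers to, printing it next to each surface (the sentence is a placeholder, not a test fixture):

from fugashi import Tagger

tagger = Tagger()
for tok in tagger("麩菓子は、麩を主材料とした菓子。"):
    print(tok.surface, tok.pos)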
Example #10
def test_invalid_args():
    # Invalid args will give a NULL pointer for the Tagger object
    # don't try to use the null object!
    with pytest.raises(RuntimeError):
        tagger = Tagger('-fail')
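A hedged sketch of guarding against that RuntimeError in application code, falling back to a default Tagger (the same fallback pattern the cli.py examples use):

from fugashi import Tagger

try:
    tagger = Tagger('-fail')
except RuntimeError:
    # invalid arguments raise instead of returning a usable tagger
    tagger = Tagger()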
Example #11
def test_nbest(text, saved):
    tagger = Tagger('-Owakati')
    assert tagger.nbest(text, 2) == saved
Example #12
def test_tokens(text, saved):
    # testing the token objects is tricky, so instead just check surfaces
    #TODO: maybe save serialized nodes to compare?
    tagger = Tagger()
    tokens = [str(tok) for tok in tagger(text)]
    assert tokens == saved
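One possible way to serialize nodes for comparison, as the TODO above suggests; the chosen fields (surface, pos, lemma) are an assumption, not what the test suite actually does:

from fugashi import Tagger

tagger = Tagger()
serialized = [
    {"surface": tok.surface, "pos": tok.pos, "lemma": tok.feature.lemma}
    for tok in tagger("これはテストです")
]
print(serialized)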
Example #13
def test_wakati(text, wakati):
    tagger = Tagger('-Owakati')
    assert tagger.parse(text) == wakati
from typing import Tuple, TypeVar

from fugashi import Tagger
from dataclasses import dataclass
from jamdict import Jamdict, jmdict
from japaneseverbconjugator.src.constants.EnumeratedTypes import VerbClass
from sudachipy import dictionary  # assumed source of dictionary.Dictionary() below
import googletrans
import jconj.conj as jconj

SudachiPos = Tuple[str, str, str, str, str, str]
K = TypeVar("K")
V = TypeVar("V")

CT = jconj.read_conj_tables("./jconj/data")
JMDICT_ABBREV_MAP = {v: k for k, vs in CT["kwpos"].items() for v in vs}
JMDICT_ABBREV_MAP["expressions (phrases, clauses, etc.)"] = "exp"

tokenizer_obj = dictionary.Dictionary().create()
tagger = Tagger("-Owakati")
jmd = Jamdict()
google_translate = googletrans.Translator()

SUDACHI_POS_MAP = {
    "感動詞": "interjection",
    "記号": "symbol",
    "補助記号": "supplementary symbol",
    "名詞": "noun",
    "接尾辞": "suffix",
    "助詞": "particle",
    "形容詞": "adjective",  # "i-adjective",
    "助動詞": "auxiliary verb",
    "代名詞": "pronoun",
    "空白": "blank space",
    "動詞": "verb",
#!/usr/bin/env python
from fugashi import Tagger

tt = Tagger()
from collections import Counter

wc = Counter()

for line in open('wagahai.txt'):
    for word in tt.parseToNodeList(line.strip()):
        wc[word.surface] += 1
class Tokenizer:
    def __init__(self):
        self.tagger = Tagger("-Owakati")
Example #17
def test_accent(text, accent):
    # This checks for correct handling of feature fields containing commas as reported in #13
    tagger = Tagger()
    accent_ = [tok.feature.aType for tok in tagger.parseToNodeList(text)]
    assert accent_ == accent
Example #18
"""
vocabulaire = []

phrase = "12345"

liste_mots = []
for i in range(len(phrase)+1):
    for j in range(i,len(phrase)+1):
        mot = phrase[i:j]
        if mot in vocabulaire: 
            liste_mots.append(mot)
"""
import numpy as np          
from fugashi import Tagger

tagger = Tagger('-Owakati')




text = "今日はパリから東京まで散歩するつもりだ"

text = "この暑い焼き鳥お食べ次第すぐにビールお飲みます"

text = '僕は自分中心'

text = "この暑い焼き鳥お食べ次第すぐにビールお飲みます"

text = 'でないと'
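The commented-out block at the top of this example enumerates every substring of phrase and keeps the ones found in vocabulaire. A runnable sketch of the same idea (the sample values are placeholders):

def match_substrings(phrase, vocabulary):
    found = []
    for i in range(len(phrase) + 1):
        for j in range(i, len(phrase) + 1):
            substring = phrase[i:j]
            if substring in vocabulary:
                found.append(substring)
    return found

print(match_substrings("12345", {"12", "345"}))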

Example #19
def recognize_image(image_file, clipboard_buffer):
    """Returns document bounds given an image."""
    client = vision.ImageAnnotatorClient()

    with io.open(image_file, "rb") as image_file:
        content = image_file.read()

    image = types.Image(content=content)

    response = client.document_text_detection(image=image)
    document = response.full_text_annotation
    texts = response.text_annotations

    s = wx.ScreenDC()
    ss_x1 = c1x + c1x_delta
    ss_x2 = c2x + c2x_delta
    ss_y1 = c1y + c1y_delta
    ss_y2 = c2y + c2y_delta
    global mode

    console_output = ""
    table = Table(show_header=True,
                  header_style="bold magenta",
                  box=box.MINIMAL_DOUBLE_HEAD)
    table.add_column("日本語", style="dim")
    table.add_column(mode)
    if mode == "Vocab":
        table.add_column("読み方")
        table.add_column("意味")
    for page in document.pages:
        for block in track(page.blocks):
            results = []
            results.append([])
            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    for symbol in word.symbols:
                        results[-1].append(symbol.text)

            bound = block.bounding_box
            start_x = bound.vertices[0].x
            start_y = bound.vertices[0].y

            width = bound.vertices[2].x - bound.vertices[0].x
            height = bound.vertices[2].y - bound.vertices[0].y

            s.Pen = wx.Pen("#FF0000")
            s.SetTextForeground((255, 0, 0))
            s.SetTextBackground((0, 0, 0))
            s.Brush = wx.Brush(wx.Colour(255, 255, 255))

            s.SetFont(
                wx.Font(
                    12,
                    wx.FONTFAMILY_DECORATIVE,
                    wx.FONTSTYLE_NORMAL,
                    wx.FONTWEIGHT_BOLD,
                ))

            ocr_results = "".join(results[-1])

            clipboard_buffer = clipboard_buffer + ocr_results
            clipboard_buffer = clipboard_buffer + "\n"
            if mode == "Romaji":
                katsu = cutlet.Cutlet()
                hepburn_block = katsu.romaji(ocr_results)
                table.add_row(ocr_results, hepburn_block)
                hepburn_block = "\n".join(textwrap.wrap(hepburn_block, 25))

            if mode == "Vocab":
                tagger = Tagger("-Owakati")

                nl_separated_block = []
                for word in tagger(ocr_results):
                    if word.char_type == 2:
                        results = jmd.lookup(str(word.feature.lemma))
                        meaning = " "
                        for k in range(len(results.entries)):
                            result = results.entries[k]
                            if k > 0:
                                meaning = meaning + "\n "
                            meaning = (meaning +
                                       f"[bold red]{str(k + 1)}. [/bold red]" +
                                       " \\ ".join([
                                           str(sense.gloss[0])
                                           for sense in result.senses
                                       ]))
                        console_output = console_output + "\t".join([
                            str(word),
                            "『" + str(word.feature.kana) + "』",
                            str(meaning),
                            "\n",
                        ])
                        nl_separated_block.append("\t".join([
                            str(word),
                            "『" + str(word.feature.kana) + "』",
                            str(meaning),
                        ]))
                        table.add_row(
                            str(word),
                            str(word.feature.lemma),
                            "『" + str(word.feature.kana) + "』",
                            str(meaning),
                        )
                hepburn_block = "\n".join(nl_separated_block)
                # table.add_row(ocr_results, hepburn_block)

            if mode == "Google":
                translator = Translator()
                translated = translator.translate(ocr_results).text
                table.add_row("\n".join(textwrap.wrap(ocr_results, 25)),
                              translated)
                hepburn_block = "\n".join(textwrap.wrap(translated, 25))

            if mode == "DeepL":
                url = "https://api.deepl.com/v2/translate"
                response = requests.get(
                    url,
                    params={
                        "auth_key": deepL_auth,
                        "text": ocr_results,
                        "target_lang": "EN",
                    },
                )
                result = response.json()
                translated = result["translations"][0]["text"]
                table.add_row("\n".join(textwrap.wrap(ocr_results, 25)) + "\n",
                              translated)
                hepburn_block = "\n".join(textwrap.wrap(translated, 40))

            nl_separated_block = hepburn_block.split("\n")
            max_x_bound = (
                max([s.GetTextExtent(line)[0]
                     for line in nl_separated_block]) + 3)
            max_y_bound = (
                s.GetTextExtent(hepburn_block)[1] * len(nl_separated_block) +
                3)
            w, h, = s.GetTextExtent(hepburn_block)

            # modify this with dpi scale when screen device context is fixed
            s.DrawRectangle(ss_x1 + start_x - 3, ss_y1 + start_y - 3,
                            max_x_bound, max_y_bound)
            s.DrawText(hepburn_block, ss_x1 + start_x, ss_y1 + start_y)
    console.print(table)
    return clipboard_buffer
Example #20
import pandas, regex, functools
from fugashi import Tagger
from pykakasi import kakasi
from collections import OrderedDict
import re


from ja_sentence_segmenter.common.pipeline import make_pipeline
from ja_sentence_segmenter.concatenate.simple_concatenator import concatenate_matching
from ja_sentence_segmenter.normalize.neologd_normalizer import normalize
from ja_sentence_segmenter.split.simple_splitter import split_newline, split_punctuation

df_word = pandas.read_csv("../dict_data/ja/jlpt/JLPT.csv")

fugger = Tagger()

pos_list = []
kana_list = []

df_word = df_word.fillna("NONE")

for index, row in df_word.iterrows():
    index += 1
    if not row['kanji'] == "NONE":
        kanji = row['kanji']
        wf = fugger(kanji)
    else:
        furigana = row['furigana']
        wf = fugger(furigana)
    if len(wf) > 1:
Example #21
def JlptLevel(text):
    # Data import
    df_word = pandas.read_csv("dict_data/ja/jlpt/JLPT.csv")
    df_kanji = pandas.read_csv("dict_data/ja/jlpt/JLPT_Kanji.csv")
    kks = kakasi()
    kks.setMode("J", "H")
    conv = kks.getConverter()

    # Text to words
    sentence_list = segmenter(text)
    fugger = Tagger()
    text_level = 5
    sentence_dicts = []  # per-sentence results; avoids shadowing the built-in dict
    if sentence_list:
        for sentence_num, sentence in enumerate(sentence_list):
            sentence_word_level_count_dict = {5: 0, 4: 0, 3: 0, 2: 0, 1: 0}
            kanji_level_count_dict = OrderedDict({5: 0, 4: 0, 3: 0, 2: 0, 1: 0})
            sentence_num += 1
            word_list = []
            for w in fugger(sentence):
                word = w
                if word:
                    word_original = str(w)
                    word_dict_form = word.feature.lemma
                    word_kanji_hiragana = conv.do(word_original)
                    level = None
                    word_dict_form_hiragana = None

                    if word.feature.pos1 in ["動詞", "助動詞", "形容詞", "形状詞"]:
                        word_dict_form_hiragana = conv.do(word_dict_form)

                    p = regex.compile(r'\p{Script=Han}+') # Kanji unicode coverage
                    if p.findall(word_original): # The original word is written in Kanji
                        # word_dict[word_original]["letter_type"] = "kanji" <- delete later
                        kanji_dict = OrderedDict()
                        for kanji in p.findall(word_original):
                            kanji_single_list = []
                            for kanji_single in kanji:
                                kanji_dict[kanji_single] = {}
                                index_list = df_kanji[df_kanji.kanji == kanji_single].index
                                level_list = []
                                for index in index_list:
                                    level_list.append(df_kanji.loc[index, "jlpt"])
                                if level_list:
                                    level = max(level_list)
                                kanji_dict = OrderedDict({
                                    "kanji_single": kanji_single,
                                    "kanji_level": level,
                                })
                                kanji_single_list.append(kanji_dict)

                        # Check the Kanji word level
                        if word_dict_form_hiragana: # if word can conjugate
                            index_list = df_word[df_word.furigana == word_dict_form_hiragana].index
                        else:
                            index_list = df_word[df_word.kanji == word_original].index
                        level_list = []
                        for index in index_list:
                            if df_word.loc[index].any():
                                level_list.append(df_word.loc[index, "jlpt"])
                        if level_list:
                            level = max(level_list)
                            sentence_word_level_count_dict[level] += 1
                        pos = word.feature.pos1

                        word_dict = {
                            "kanji_elements": kanji_single_list,
                            "word_level": level,
                            "word_pos": pos,
                            "word": word,
                            "word_dict_form": word_dict_form,
                            "word_index": index_list,
                        }
                    else: # The original word is not written in Kanji
                        if word_dict_form_hiragana: # if word can conjugate
                            index_list = df_word[df_word.furigana == word_dict_form_hiragana].index
                        else:
                            index_list = df_word[df_word.furigana == word].index
                        index_list = df_word[df_word.furigana == word_original].index
                        level_list = []
                        for index in index_list:
                            level_list.append(df_word.loc[index, "jlpt"])
                        if level_list:
                            level = max(level_list)
                            sentence_word_level_count_dict[level] += 1
                        pos = word.feature.pos1

                        word_dict = {
                            "word_level": level,
                            "word_pos": pos,
                            "word": word,
                            "word_dict_form": word_dict_form,
                            "word_index": index_list,
                        }
                    word_list.append(word_dict)

            # Define the level of each sentence
            #print(sentence_word_level_count_dict)
            freq_level = max(sentence_word_level_count_dict, key=sentence_word_level_count_dict.get)
            rare_level = min(sentence_word_level_count_dict, key=sentence_word_level_count_dict.get)
            highest_level = 5
            for level in sentence_word_level_count_dict:
                if sentence_word_level_count_dict[level] > 0:
                    highest_level = level

            sentence_level_dict = {
                "max_level": freq_level,
                "min_level": rare_level,
                "highest_level": highest_level,
            }

            sentence_dict = {
                "sentence": sentence,
                "sentence_num": sentence_num,
                "sentence_word_level_count_dict": sentence_word_level_count_dict,
                "sentence_word_level_dict": sentence_level_dict,
                #"sentence_kanji_level": sentence_kanji_level,
                "word_dict": word_list,
            }
            sentence_dicts.append(sentence_dict)

    pprint.pprint(sentence_dicts)
    return sentence_dicts
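JlptLevel leans on two building blocks besides fugashi: the \p{Script=Han} pattern from the regex module to detect kanji, and pykakasi's legacy setMode/getConverter API for hiragana readings. A minimal sketch of just those two pieces (the sample word is a placeholder):

import regex
from pykakasi import kakasi

p = regex.compile(r"\p{Script=Han}+")  # runs of kanji characters
kks = kakasi()
kks.setMode("J", "H")  # kanji to hiragana (legacy pykakasi API)
conv = kks.getConverter()

word = "食べる"
print(p.findall(word))  # kanji runs in the word
print(conv.do(word))    # hiragana reading of the whole word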
#!/usr/bin/env python
from fugashi import Tagger
tt = Tagger('-Owakati')
from collections import Counter

wc = Counter()

for line in open('wagahai.txt'):
    for word in tt.parse(line.strip()).split(' '):
        wc[word] += 1