def test_create_hash_func(self):
    rma = RakutenMA()
    hash_func = rma.create_hash_func(4)
    assert_equals(hash_func("feat1", "foo"), ["5"])
    assert_equals(hash_func("feat1", "bar"), ["2"])
    assert_equals(hash_func("feat1", "baz"), ["10"])
    assert_equals(hash_func("feat1", "qux"), ["3"])
def tagWordsInSentences(self, studying, entry):
    '''Tags the part of speech for each word.'''
    jar_path = 'stanford-postagger-full/stanford-postagger.jar'
    if studying in self.english:
        words = parseWordsFromEntry(entry)
        tagged_words = tagWords(words)
        return tagged_words
    elif studying in self.japanese or studying in self.korean or studying in self.mandarin:
        #segmenter = TinySegmenter()
        #words = segmenter.tokenize(entry)
        rm = RakutenMA()
        tagged_words = rm.tokenize(entry)
        #mecab = Mecab()
        #tagged_words = mecab.pos(entry)
        return tagged_words
    else:
        if studying in self.spanish:
            model_path = 'stanford-postagger-full/models/spanish.tagger'
            words = parseWordsFromEntry(entry)
        elif studying in self.french:
            model_path = 'stanford-postagger-full/models/french.tagger'
            words = parseWordsFromEntry(entry)
        postagger = StanfordPOSTagger(model_path, jar_path, encoding='utf8')
        tagged_words = postagger.tag(words)
        return tagged_words
class ExplainJapaneseSentences(BaseFilter):
    def __init__(self):
        super().__init__()
        # Initialize a RakutenMA instance with an empty model
        # the default ja feature set is set already
        self.rma = RakutenMA()
        # Initialize a RakutenMA instance with a pre-trained model
        self.rma = RakutenMA(phi=1024, c=0.007812)  # Specify hyperparameter for SCW (for demonstration purpose)
        # https://github.com/ikegami-yukino/rakutenma-python/tree/master/rakutenma/model
        self.rma.load(abspath(r'..\resource\model_ja.min.json'))

    def __call__(self, chunk):
        chunk = self._duplicate_chunk(chunk)
        chunk.final = True
        result = [chunk]
        text = self.tokenize(chunk.text)
        result.append(
            TextChunk(text=text,
                      language='japanese',
                      audible=False,
                      printable=True,
                      final=True))
        return result

    def tokenize(self, text):
        tokens = self.rma.tokenize(text)
        return ' '.join(map(lambda pair: f'{pair[0]} ({pair[1]})', tokens))
def test_train_one(self):
    rma = RakutenMA()
    rma.featset = ["w0"]
    res = rma.train_one([["foo", "N-nc"], ["bar", "N-nc"]])
    assert_true(res["updated"])
    assert_true(Trie.find(rma.model["mu"], ["w0", "f", "B-N"]) > 0)
    assert_true(Trie.find(rma.model["mu"], ["w0", "o", "I-N"]) > 0)
    assert_true(Trie.find(rma.model["mu"], ["w0", "o", "E-N"]) > 0)
    assert_equals(rma.tokenize("foobar"), [["foo", "N-nc"], ["bar", "N-nc"]])
def test_count_tps(self):
    # the last "a" doesn't match because of the offset introduced by "d+"
    sent1 = ["a", "b", "c", "d", "a"]
    sent2 = ["a", "b", "c", "d+", "a"]
    assert_equals(RakutenMA.count_tps(sent1, sent2), 3)

    # ignores pos tags for comparison
    sent1 = [["x", "pos1"], ["y", "pos2"], ["z", "pos3"]]
    sent2 = [["x", "pos0"], ["u", "pos2"], ["v", "pos3"], ["x", "pos1"]]
    assert_equals(RakutenMA.count_tps(sent1, sent2), 1)
def test_ctype_ja_default_func(self):
    rma = RakutenMA()
    assert_equals(rma.ctype_ja_default_func("あ"), "H")
    assert_equals(rma.ctype_ja_default_func("ア"), "K")
    assert_equals(rma.ctype_ja_default_func("A"), "A")
    assert_equals(rma.ctype_ja_default_func("a"), "a")
    assert_equals(rma.ctype_ja_default_func("漢"), "C")
    assert_equals(rma.ctype_ja_default_func("百"), "S")
    assert_equals(rma.ctype_ja_default_func("0"), "N")
    assert_equals(rma.ctype_ja_default_func("・"), "n")
def test_decode(self):
    rma = RakutenMA()
    rma.hash_func = None
    csent = rma.tokens2csent([["foo", "N"], ["bar", "N"]], "SBIEO")
    csent = rma.add_efeats(csent)
    for i in range(len(csent)):
        csent[i].l = ""
    rma.model["mu"] = WEIGHTS
    csent = rma.decode(csent)
    assert_equals(csent[0].l, "_")
    assert_equals(csent[1].l, "B-N")
    assert_equals(csent[2].l, "I-N")
    assert_equals(csent[3].l, "E-N")
    assert_equals(csent[4].l, "B-N")
    assert_equals(csent[5].l, "I-N")
    assert_equals(csent[6].l, "E-N")
    assert_equals(csent[7].l, "_")

    csent = rma.tokens2csent([["foX", "N"], ["bar", "N"]], "SBIEO")
    csent = rma.add_efeats(csent)
    csent = rma.decode(csent)
    assert_equals(csent[0].l, "_")
    assert_equals(csent[1].l, "B-N")
    assert_equals(csent[2].l, "I-N")
    assert_equals(csent[3].l, "O")
    assert_equals(csent[4].l, "B-N")
    assert_equals(csent[5].l, "I-N")
    assert_equals(csent[6].l, "E-N")
    assert_equals(csent[7].l, "_")
def test_csent2tokens(self):
    sent = [["hoge", "X"], ["fuga", "Y"], ["p", "Z"]]
    rma = RakutenMA()
    csent = rma.tokens2csent(sent, "SBIEO")
    sent = RakutenMA.csent2tokens(csent, "SBIEO")
    assert_equals(sent[0][0], "hoge")
    assert_equals(sent[0][1], "X")
    assert_equals(sent[1][0], "fuga")
    assert_equals(sent[1][1], "Y")
    assert_equals(sent[2][0], "p")
    assert_equals(sent[2][1], "Z")
    assert_raises(Exception, RakutenMA.csent2tokens, (csent, "UNKNOWN_SCHEME"))
def __init__(self):
    super().__init__()
    # Initialize a RakutenMA instance with an empty model
    # the default ja feature set is set already
    self.rma = RakutenMA()
    # Initialize a RakutenMA instance with a pre-trained model
    self.rma = RakutenMA(phi=1024, c=0.007812)  # Specify hyperparameter for SCW (for demonstration purpose)
    # https://github.com/ikegami-yukino/rakutenma-python/tree/master/rakutenma/model
    self.rma.load(abspath(r'..\resource\model_ja.min.json'))
def __init__(self, kv_filepath, model):
    self.rma = RakutenMA(json.loads(open(model).read()))
    self.rma.hash_func = RakutenMA.create_hash_func(self.rma, 15)
    self.ja_to_en = defaultdict(list)
    self.en_to_ja = defaultdict(list)
    for l in open(kv_filepath):
        [k, v] = l.strip().split(',')[:2]
        raw = unicode(k, 'utf-8')
        # lemma = self.rma.tokenize(raw)[0][0]
        self.ja_to_en[raw].append(v)
        self.en_to_ja[v].append(raw)
def test_tokens2csent(self):
    sent = [["hoge", "X"], ["fuga", "Y"], ["p", "Z"]]
    rma = RakutenMA()
    assert_raises(Exception, rma.tokens2csent, (sent, "UNKNOWN_SCHEME"))
    csent = rma.tokens2csent(sent, "SBIEO")
    assert_equals(csent[1].c, "h")
    assert_equals(csent[1].l, "B-X")
    assert_equals(csent[2].c, "o")
    assert_equals(csent[2].l, "I-X")
    assert_equals(csent[4].c, "e")
    assert_equals(csent[4].l, "E-X")
    assert_equals(csent[9].c, "p")
    assert_equals(csent[9].l, "S-Z")
def test_str2csent(self):
    rma = RakutenMA()
    actual = rma.str2csent("hoge")
    desired = [
        Token(l=_BEOS_LABEL),
        Token(c="h", t=rma.ctype_ja_default_func("h")),
        Token(c="o", t=rma.ctype_ja_default_func("o")),
        Token(c="g", t=rma.ctype_ja_default_func("g")),
        Token(c="e", t=rma.ctype_ja_default_func("e")),
        Token(l=_BEOS_LABEL)]
    assert_equals(len(actual), len(desired))
    for i in range(len(actual)):
        assert_equals(actual[i].c, desired[i].c)
        assert_equals(actual[i].t, desired[i].t)
        assert_equals(actual[i].f, desired[i].f)
        assert_equals(actual[i].l, desired[i].l)
def tokenize(x, t):
    if t in TOKC:
        from jieba import posseg
        toks = posseg.cut(x)
        if t == POSC:
            return u'\u3000'.join([('%s [%s]' % (f.word, f.flag)) for f in toks])
        elif t == SPACEC:
            return u'\u3000'.join([('%s' % (f.word)) for f in toks])
        else:
            return lexDens(toks, t)
    elif t in TOKJ:
        from rakutenma import RakutenMA
        rma = RakutenMA()
        rma = RakutenMA(phi=1024, c=0.007812)
        tD, tF = os.path.split(__file__)
        jSon = os.path.join(tD, 'model_ja.min.json')
        rma.load(jSon)
        toks = rma.tokenize(x)
        if t == SPACEJ:
            return u'\u3000'.join([i[0] for i in toks])
        elif t == POSJ:
            return u'\u3000'.join([('%s [%s]' % (i[0], i[1])) for i in toks])
        else:
            return lexDens(toks, t)
def test_eval_corpus(self):
    sent1 = ["a", "b", "c", "d", "a"]
    sent2 = ["a", "b", "c", "d+", "a", "b", "c", "d", "e", "f"]
    res = RakutenMA.eval_corpus([sent1], [sent2])
    assert_equals(res[0], 0.3)
    assert_equals(res[1], 0.6)
    assert_equals(res[2], 0.4)
    assert_raises(Exception, RakutenMA.eval_corpus, (["a"], []))
def splitShuffle(expr, t):
    expr = stripHTML(expr).strip()
    if t == SGJ:
        from rakutenma import RakutenMA
        rma = RakutenMA(phi=1024, c=0.007812)
        tD, tF = os.path.split(__file__)
        jSon = os.path.join(tD, 'model_ja.min.json')
        rma.load(jSon)
        resultl = rma.tokenize(expr)
        result = [r for r, s in resultl]
    elif t in SGC:
        import jieba
        result = jieba.cut(expr, cut_all=False)
    elif t == JSS:
        result = expr.split(' ')
    elif t in WRAPL:
        result = list(expr)
    newResult, glosses = getResult(result, t)
    jn = u''
    full = jn.join(newResult)
    random.shuffle(newResult)
    strResult = u''.join(newResult)
    return strResult, full, glosses
def _get_tokenizer(lang):
    rma = None
    if lang == 'ja':
        rma = RakutenMA()
        rma.load('model_ja.json')
        rma.hash_func = rma.create_hash_func(15)
        tokenizer = _jap_tokenizer
        # tokenizer = _jap_character_tokenizer
    else:
        tokenizer = _eng_tokenizer
    return tokenizer, rma
class PairScorer():
    def __init__(self, model):
        print(model)
        self.rma = RakutenMA(json.loads(open(model).read()))
        self.rma.hash_func = RakutenMA.create_hash_func(self.rma, 15)
        return

    def extract_ja_content_lemmas(self, s):
        """
        extracts content words from a japanese sentence
        (nouns, verb roots, adjectives, no okurigana)
        """
        s = unicode(s, 'utf-8')
        out = []
        for [x, y] in self.rma.tokenize(s):
            if y in RAKUTEN_POS_TAGS:
                if y.startswith('V'):
                    out += [(guess, y) for guess in guess_stem(x)]
                else:
                    out.append((x, y))
        return out

    def extract_en_content_lemmas(self, s):
        def penn_to_wordnet(pos):
            p = pos[0].lower()
            if p == 'j':
                return 'a'
            elif p == 'r':
                return 'r'
            elif p == 'v':
                return 'v'
            else:
                return 'n'

        lemmatizer = nltk.stem.WordNetLemmatizer()
        s = unicode(s, 'utf-8')
        out = []
        for w, pos in nltk.pos_tag(nltk.word_tokenize(s)):
            if pos in PENN_POS_TAGS:
                out.append((lemmatizer.lemmatize(w, pos=penn_to_wordnet(pos)), pos))
        return out
def test_csent2feats(self):
    rma = RakutenMA()
    rma.hash_func = None
    rma.featset = ["w0"]
    csent = rma.tokens2csent([["foo", "N"], ["bar", "N"]], "SBIEO")
    csent = rma.add_efeats(csent)
    feats = rma.csent2feats(csent)
    desired = (
        ["w0", "", "_"],
        ["w0", "f", "B-N"],
        ["w0", "o", "I-N"],
        ["w0", "o", "E-N"],
        ["w0", "b", "B-N"],
        ["w0", "a", "I-N"],
        ["w0", "r", "E-N"],
        ["t", "B-N", "_"],
        ["t", "I-N", "B-N"],
        ["t", "E-N", "I-N"],
        ["t", "B-N", "E-N"],
        ["t", "_", "E-N"])
    for d in desired:
        assert_true(d in feats)
    assert_true(["t", "E-N", "B-N"] not in feats)
    assert_true(["t", "B-N", "I-N"] not in feats)
# coding=utf8
from rakutenma import RakutenMA
import tinysegmenter
from nltk import *
import nltk
import re

#segmenter = tinysegmenter.TinySegmenter()
result = tinysegmenter.tokenize(
    "米中間選挙は6日に午後6時(日本時間7日午前8時)に一部の投票所が締め切られ、開票が始まった。米連邦議会の多数党がどちらになるかによって、ドナルド・トランプ米大統領の政策の行方が決まる。特に下院でどれだけ、民主党が共和党現職の議席を奪うかが注目されている。")
print('Segmenter: ')
print(result)

# Initialize a RakutenMA instance with an empty model
# the default ja feature set is set already
rma = RakutenMA()

# Let's analyze a sample sentence (from http://tatoeba.org/jpn/sentences/show/103809)
# With a disastrous result, since the model is empty!
print('Result')
print(rma.tokenize(result))
print('Original')
print(rma.tokenize("米中間選挙は6日に午後6時(日本時間7日午前8時)に一部の投票所が締め切られ、開票が始まった。"))
print('------------------')
# print(rma.tokenize("子どものみなさん、ゆるしてください。ぼくはこの本をひとりのおとなのひとにささげます。でもちゃんとしたわけがあるのです。"))
# print(rma.tokenizetwo("彼は新しい仕事できっと成功するだろう。"))
# print(rma.tokenize("彼は新しい仕事できっと成功するだろう。"))

# Feed the model with ten sample sentences from tatoeba.com
# "tatoeba.json" is available at https://github.com/rakuten-nlp/rakutenma
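# A minimal sketch of the training step hinted at in the comments above (an illustration,
# not part of the original script). It assumes "tatoeba.json" has been downloaded next to
# this file and holds a list of pre-tokenized sentences, each a list of [surface, pos-tag]
# pairs, which is the same format train_one() accepts (see test_train_one in this collection).
import json

with open("tatoeba.json", encoding="utf-8") as f:
    training_sentences = json.load(f)

for sentence in training_sentences:
    # online update of the model with SCW; returns a dict including an "updated" flag
    rma.train_one(sentence)

# After even a few training sentences, the same input should segment noticeably better.
print(rma.tokenize("彼は新しい仕事できっと成功するだろう。"))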
def __init__(self, model):
    print(model)
    self.rma = RakutenMA(json.loads(open(model).read()))
    self.rma.hash_func = RakutenMA.create_hash_func(self.rma, 15)
    return
def __init__(self):
    rma = RakutenMA()
    rma.load("model_ja.json")
    rma.hash_func = rma.create_hash_func(15)
    self.rma = rma
    del counter['DET']
    del counter['DETWH']
    counter['ET'] += 0
    counter['I'] += 0
    counter['NC'] += counter['N'] + counter['VINF']; del counter['N']; del counter['VINF']
    counter['NP'] += counter['NPP']; del counter['NPP']
    # The Japanese tag set doesn't account for prepositions. We could manually look for them
    # using a table like this: http://mylanguages.org/japanese_prepositions.php
    del counter['P']
    counter['PREF'] += 0
    counter['PRO'] += counter['PROREL'] + counter['PROWH']; del counter['PROREL']; del counter['PROWH']
    counter['V'] += counter['VIMP'] + counter['VPR'] + counter['VS']; del counter['VIMP']; del counter['VPR']; del counter['VS']
    counter['PUNC'] += counter['.$$.']; del counter['.$$.']
    return dict(counter)


rma = RakutenMA()
rma.load("model_ja.json")


def _analyze_ja(text):
    tags = rma.tokenize(text)
    counter = collections.Counter([x[1] for x in tags])
    # see the same premise above in the French section
    subordinating_conjunctions = list(filter(lambda tup: tup[1] == 'C' and tup[0] in jsc, tags))
    return {
        # we need to map the Japanese tagset to a subset of the French tagset,
        # so that we can compare the two
        'ADJ': counter['A-c'] + counter['A-dp'] + counter['J-c'] + counter['J-tari'] + counter['J-xs'] + counter['R'],
        'ADV': counter['F'],
        'CC': counter['C'] - len(subordinating_conjunctions),
        'CS': len(subordinating_conjunctions),
        'ET': counter['E'],
        'I': counter['I-c'],
        'NC': counter['N-n'] + counter['N-nc'],
        'NP': counter['N-pn'],
def test_tokens2string(self):
    sent = [["hoge", "X"], ["fuga", "Y"], ["p", "Z"]]
    assert_equals(RakutenMA.tokens2string(sent), "hoge [X] | fuga [Y] | p [Z]")
def test_tokens_identical(self):
    assert_false(RakutenMA.tokens_identical([["a"]], [[]]))
    assert_false(RakutenMA.tokens_identical([["a"]], [["b"]]))
    assert_false(RakutenMA.tokens_identical([["a", "pos1"]], [["a", "pos2"]]))
    assert_true(RakutenMA.tokens_identical([["a", "pos1"]], [["a", "pos1"]]))
def test_create_ctype_chardic_func(self):
    rma = RakutenMA()
    cfunc = rma.create_ctype_chardic_func({"a": ["type1"], "b": ["type2"]})
    assert_equals(cfunc("a"), ["type1"])
    assert_equals(cfunc("b"), ["type2"])
    assert_equals(cfunc("c"), [])
def test_tokenize_corpus(self):
    test_corpus = [[["abra", "pos1"], ["cadabra", "pos2"]]]
    tokenize_func = lambda s: list(s)
    desired = [["a", "b", "r", "a", "c", "a", "d", "a", "b", "r", "a"]]
    assert_equals(RakutenMA.tokenize_corpus(tokenize_func, test_corpus), desired)
def test_add_efeats(self):
    # feature functions test
    rma = RakutenMA()
    rma.hash_func = None
    rma.featset = ["w0"]
    csent = rma.str2csent("A1-b")
    csent = rma.add_efeats(csent)
    assert_equals(csent[0].f, [["w0", ""]])
    assert_equals(csent[1].f, [["w0", "A"]])
    assert_equals(csent[2].f, [["w0", "1"]])
    assert_equals(csent[3].f, [["w0", "-"]])
    assert_equals(csent[4].f, [["w0", "b"]])
    assert_equals(csent[5].f, [["w0", ""]])

    rma.featset = ["b1"]
    csent = rma.add_efeats(csent)
    assert_equals(csent[0].f, [["b1", "", "A"]])
    assert_equals(csent[1].f, [["b1", "A", "1"]])
    assert_equals(csent[2].f, [["b1", "1", "-"]])
    assert_equals(csent[3].f, [["b1", "-", "b"]])
    assert_equals(csent[4].f, [["b1", "b", ""]])
    assert_equals(csent[5].f, [["b1", "", ""]])

    rma.featset = ["c0"]
    csent = rma.add_efeats(csent)
    assert_equals(csent[0].f, [["c0", ""]])
    assert_equals(csent[1].f, [["c0", "A"]])
    assert_equals(csent[2].f, [["c0", "N"]])
    assert_equals(csent[3].f, [["c0", "O"]])
    assert_equals(csent[4].f, [["c0", "a"]])
    assert_equals(csent[5].f, [["c0", ""]])

    rma.featset = ["d9"]
    csent = rma.add_efeats(csent)
    assert_equals(csent[0].f, [["d9", "", ""]])
    assert_equals(csent[1].f, [["d9", "", "A"]])
    assert_equals(csent[2].f, [["d9", "A", "N"]])
    assert_equals(csent[3].f, [["d9", "N", "O"]])
    assert_equals(csent[4].f, [["d9", "O", "a"]])
    assert_equals(csent[5].f, [["d9", "a", ""]])

    rma.featset = ["t0"]
    csent = rma.add_efeats(csent)
    assert_equals(csent[0].f, [["t0", "", "", "A"]])
    assert_equals(csent[1].f, [["t0", "", "A", "1"]])
    assert_equals(csent[2].f, [["t0", "A", "1", "-"]])
    assert_equals(csent[3].f, [["t0", "1", "-", "b"]])
    assert_equals(csent[4].f, [["t0", "-", "b", ""]])
    assert_equals(csent[5].f, [["t0", "b", "", ""]])

    # test a custom function for feature
    # args _t: a function which receives position i and returns the token,
    #          taking care of boundary cases
    #      i:  current position
    # sample function -> returns if the character is a capitalized letter
    rma.featset = [lambda _t, i: ["CAP", "T" if _t(i).t == "A" else "F"]]
    csent = rma.add_efeats(csent)
    assert_equals(csent[0].f, [["CAP", "F"]])
    assert_equals(csent[1].f, [["CAP", "T"]])
    assert_equals(csent[2].f, [["CAP", "F"]])
    assert_equals(csent[3].f, [["CAP", "F"]])
    assert_equals(csent[4].f, [["CAP", "F"]])
    assert_equals(csent[5].f, [["CAP", "F"]])

    rma.featset = ["NONEXISTENT_FEATURE"]
    assert_raises(Exception, rma.add_efeats, csent)
def test_set_model(self):
    rma = RakutenMA()
    rma.set_model({"mu": {"feat1": 0.3}, "sigma": {"feat1": 0.4}})
    assert_equals(rma.scw.mu, {"feat1": 0.3})
    assert_equals(rma.scw.sigma, {"feat1": 0.4})
def test_set_tag_scheme(self):
    rma = RakutenMA()
    rma.set_tag_scheme("IOB2")
    assert_equals(rma.tag_scheme, "IOB2")
import os
import json
import pickle
from pathlib import Path
from transformers import BertTokenizer
from BertDataset import BertDataset
from tqdm import tqdm
import pandas as pd
import numpy as np
import unicodedata
import re
from rakutenma import RakutenMA
import sys

rma = RakutenMA()  # (default: phi = 2048, c = 0.003906)
tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese', do_lower_case=False)


def main(args):
    with open(args.config_path / 'config.json') as f:
        config = json.load(f)
    rma.load(args.config_path / "model_ja.min.json")
    rma.hash_func = rma.create_hash_func(15)
    print(f"config:{config}")

    # loading datasets from excel files
def test_string2hash(self):
    rma = RakutenMA()
    assert_equals(rma.string2hash("hoge"), 3208229)
    assert_equals(rma.string2hash("piyopiyo"), -105052642)
    assert_equals(rma.string2hash(""), 0)
def create_tokenizer():
    rma = RakutenMA()
    rma.load('model_ja.json')
    rma.hash_func = rma.create_hash_func(15)
    return rma.tokenize
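# Hypothetical usage of the factory above (not in the original source): the returned
# callable is RakutenMA's bound tokenize method, which takes a raw Japanese string and
# returns a list of [surface, pos-tag] pairs. Assumes 'model_ja.json' is present in the
# working directory; the exact tags depend on the loaded model.
tokenize = create_tokenizer()
for surface, pos in tokenize("彼は新しい仕事できっと成功するだろう。"):
    print(surface, pos)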
def test_calc_states0(self):
    rma = RakutenMA()
    rma.hash_func = None
    rma.featset = ["c0", "w0"]
    csent = rma.tokens2csent([["foo", "N"], ["bar", "N"]], "SBIEO")
    csent = rma.add_efeats(csent)
    assert_equals(rma.calc_states0(csent[1].f, WEIGHTS), {"B-N": 2, "I-N": 1, "E-N": 1})
    assert_equals(rma.calc_states0(csent[2].f, WEIGHTS), {"B-N": 1, "I-N": 2, "E-N": 2})
    assert_equals(rma.calc_states0(csent[3].f, WEIGHTS), {"B-N": 1, "I-N": 2, "E-N": 2})
    assert_equals(rma.calc_states0(csent[4].f, WEIGHTS), {"B-N": 2, "I-N": 1, "E-N": 1})
    assert_equals(rma.calc_states0(csent[5].f, WEIGHTS), {"B-N": 1, "I-N": 2, "E-N": 1})
    assert_equals(rma.calc_states0(csent[6].f, WEIGHTS), {"B-N": 1, "I-N": 1, "E-N": 2})