def test_decode(self):
    """Viterbi decoding recovers BIEO labels; an unseen char decodes as O."""
    rma = RakutenMA()
    rma.hash_func = None
    csent = rma.tokens2csent([["foo", "N"], ["bar", "N"]], "SBIEO")
    csent = rma.add_efeats(csent)
    # blank out the gold labels so decode() has to reconstruct them
    for node in csent:
        node.l = ""
    rma.model["mu"] = WEIGHTS
    csent = rma.decode(csent)
    expected = ["_", "B-N", "I-N", "E-N", "B-N", "I-N", "E-N", "_"]
    for pos, label in enumerate(expected):
        assert_equals(csent[pos].l, label)
    # "X" has no weight support, so the decoder should tag it O (outside)
    csent = rma.tokens2csent([["foX", "N"], ["bar", "N"]], "SBIEO")
    csent = rma.add_efeats(csent)
    csent = rma.decode(csent)
    expected = ["_", "B-N", "I-N", "O", "B-N", "I-N", "E-N", "_"]
    for pos, label in enumerate(expected):
        assert_equals(csent[pos].l, label)
def _get_tokenizer(lang):
    """Return a ``(tokenizer, rma)`` pair for *lang*.

    For ``'ja'`` a RakutenMA analyzer is loaded and returned alongside the
    Japanese tokenizer; otherwise the English tokenizer is returned with
    ``rma`` set to None.
    """
    if lang != 'ja':
        return _eng_tokenizer, None
    rma = RakutenMA()
    rma.load('model_ja.json')
    rma.hash_func = rma.create_hash_func(15)
    # tokenizer = _jap_character_tokenizer
    return _jap_tokenizer, rma
def test_csent2feats(self):
    """csent2feats yields emission feats plus only the observed transitions."""
    rma = RakutenMA()
    rma.hash_func = None
    rma.featset = ["w0"]
    csent = rma.tokens2csent([["foo", "N"], ["bar", "N"]], "SBIEO")
    csent = rma.add_efeats(csent)
    feats = rma.csent2feats(csent)
    expected_present = (
        ["w0", "", "_"], ["w0", "f", "B-N"], ["w0", "o", "I-N"],
        ["w0", "o", "E-N"], ["w0", "b", "B-N"], ["w0", "a", "I-N"],
        ["w0", "r", "E-N"],
        ["t", "B-N", "_"], ["t", "I-N", "B-N"], ["t", "E-N", "I-N"],
        ["t", "B-N", "E-N"], ["t", "_", "E-N"])
    for feat in expected_present:
        assert_true(feat in feats)
    # transitions that never occur in this sentence must not be emitted
    assert_true(["t", "E-N", "B-N"] not in feats)
    assert_true(["t", "B-N", "I-N"] not in feats)
def test_calc_states0(self):
    """Per-position label scores from calc_states0 match the fixed WEIGHTS."""
    rma = RakutenMA()
    rma.hash_func = None
    rma.featset = ["c0", "w0"]
    csent = rma.tokens2csent([["foo", "N"], ["bar", "N"]], "SBIEO")
    csent = rma.add_efeats(csent)
    # expected label->score maps for positions 1..6 (sentence-internal nodes)
    expected_scores = [
        {"B-N": 2, "I-N": 1, "E-N": 1},
        {"B-N": 1, "I-N": 2, "E-N": 2},
        {"B-N": 1, "I-N": 2, "E-N": 2},
        {"B-N": 2, "I-N": 1, "E-N": 1},
        {"B-N": 1, "I-N": 2, "E-N": 1},
        {"B-N": 1, "I-N": 1, "E-N": 2},
    ]
    for pos, scores in enumerate(expected_scores, start=1):
        assert_equals(rma.calc_states0(csent[pos].f, WEIGHTS), scores)
def test_add_efeats(self):
    """Built-in feature templates and custom feature callables on "A1-b"."""
    rma = RakutenMA()
    rma.hash_func = None
    csent = rma.str2csent("A1-b")

    # (featset name, expected single feature per position 0..5) pairs;
    # positions 0 and 5 are the sentence-boundary nodes
    template_cases = [
        ("w0", [["w0", ""], ["w0", "A"], ["w0", "1"],
                ["w0", "-"], ["w0", "b"], ["w0", ""]]),
        ("b1", [["b1", "", "A"], ["b1", "A", "1"], ["b1", "1", "-"],
                ["b1", "-", "b"], ["b1", "b", ""], ["b1", "", ""]]),
        ("c0", [["c0", ""], ["c0", "A"], ["c0", "N"],
                ["c0", "O"], ["c0", "a"], ["c0", ""]]),
        ("d9", [["d9", "", ""], ["d9", "", "A"], ["d9", "A", "N"],
                ["d9", "N", "O"], ["d9", "O", "a"], ["d9", "a", ""]]),
        ("t0", [["t0", "", "", "A"], ["t0", "", "A", "1"], ["t0", "A", "1", "-"],
                ["t0", "1", "-", "b"], ["t0", "-", "b", ""], ["t0", "b", "", ""]]),
    ]
    for name, per_position in template_cases:
        rma.featset = [name]
        csent = rma.add_efeats(csent)
        for pos, feat in enumerate(per_position):
            assert_equals(csent[pos].f, [feat])

    # test a custom function for feature
    # args _t: a function which receives position i and returns the token,
    # taking care of boundary cases
    # i: current position
    # sample function -> returns if the character is a capitalized letter
    rma.featset = [lambda _t, i: ["CAP", "T" if _t(i).t == "A" else "F"]]
    csent = rma.add_efeats(csent)
    for pos, flag in enumerate(["F", "T", "F", "F", "F", "F"]):
        assert_equals(csent[pos].f, [["CAP", flag]])

    # unknown template names must raise
    rma.featset = ["NONEXISTENT_FEATURE"]
    assert_raises(Exception, rma.add_efeats, csent)
def __init__(self):
    """Load the pre-trained Japanese model with a 15-bit feature hash."""
    analyzer = RakutenMA()
    analyzer.load("model_ja.json")
    analyzer.hash_func = analyzer.create_hash_func(15)
    self.rma = analyzer
# Now what does the result look like? # First trained, maybe? print('After first trained') print('segment') print(rma.tokenize(result)) print('rma') print(rma.tokenize( "米中間選挙は6日に午後6時(日本時間7日午前8時)に一部の投票所が締め切られ、開票が始まった。米連邦議会の多数党がどちらになるかによって、ドナルド・トランプ米大統領の政策の行方が決まる。特に下院でどれだけ、民主党が共和党現職の議席を奪うかが注目されている。")) # Initialize a RakutenMA instance with a pre-trained model rma = RakutenMA(phi=1024, c=0.007812) # Specify hyperparameter for SCW (for demonstration purpose) rma.load("model_ja.json") # Set the feature hash function (15bit) rma.hash_func = rma.create_hash_func(15) # Tokenize one sample sentence print('Tokenize simple sentence') print(rma.tokenize("うらにわにはにわにわとりがいる")); # Re-train the model feeding the right answer (pairs of [token, PoS tag]) res = rma.train_one( [["うらにわ", "N-nc"], ["に", "P-k"], ["は", "P-rj"], ["にわ", "N-n"], ["にわとり", "N-nc"], ["が", "P-k"], ["いる", "V-c"]]) # The result of train_one contains:
def create_tokenizer():
    """Build a Japanese tokenizer and return its bound ``tokenize`` callable."""
    analyzer = RakutenMA()
    analyzer.load('model_ja.json')
    analyzer.hash_func = analyzer.create_hash_func(15)
    return analyzer.tokenize