def test_decode(self):
        """Check the labels ``decode`` assigns to two character sentences.

        The first sentence ("foo" + "bar") has the ``l`` attribute of every
        position cleared before decoding; the second ("foX" + "bar") is
        decoded as built. In both cases the decoded labels are compared
        position by position against the expected sequence.
        """
        rma = RakutenMA()
        rma.hash_func = None
        csent = rma.add_efeats(
            rma.tokens2csent([["foo", "N"], ["bar", "N"]], "SBIEO"))
        # Clear the label of every position so decode has to fill them in.
        for token in csent:
            token.l = ""

        rma.model["mu"] = WEIGHTS
        decoded = rma.decode(csent)
        expected = ["_", "B-N", "I-N", "E-N", "B-N", "I-N", "E-N", "_"]
        for position, label in enumerate(expected):
            assert_equals(decoded[position].l, label)

        # Same model, different first token: position 3 is expected to be "O".
        csent = rma.add_efeats(
            rma.tokens2csent([["foX", "N"], ["bar", "N"]], "SBIEO"))
        decoded = rma.decode(csent)
        expected = ["_", "B-N", "I-N", "O", "B-N", "I-N", "E-N", "_"]
        for position, label in enumerate(expected):
            assert_equals(decoded[position].l, label)
示例#2
0
def _get_tokenizer(lang):
    """Select the tokenizer function for *lang*.

    Returns a ``(tokenizer, rma)`` pair. For ``'ja'`` the pair is
    ``_jap_tokenizer`` plus a RakutenMA instance that has loaded
    ``model_ja.json`` and uses a 15-bit feature hash function; for any
    other language it is ``_eng_tokenizer`` and ``None``.
    """
    if lang != 'ja':
        return _eng_tokenizer, None

    analyzer = RakutenMA()
    analyzer.load('model_ja.json')
    analyzer.hash_func = analyzer.create_hash_func(15)
    # NOTE: the original code mentioned _jap_character_tokenizer as a
    # possible alternative to _jap_tokenizer here.
    return _jap_tokenizer, analyzer
 def test_csent2feats(self):
     """Check which features ``csent2feats`` produces for "foo" + "bar".

     With only the "w0" template active, every listed "w0" and "t"
     feature must be present in the result, and two specific "t"
     features must be absent.
     """
     rma = RakutenMA()
     rma.hash_func = None
     rma.featset = ["w0"]
     csent = rma.add_efeats(
         rma.tokens2csent([["foo", "N"], ["bar", "N"]], "SBIEO"))
     feats = rma.csent2feats(csent)

     present = [
         ["w0", "", "_"],
         ["w0", "f", "B-N"],
         ["w0", "o", "I-N"],
         ["w0", "o", "E-N"],
         ["w0", "b", "B-N"],
         ["w0", "a", "I-N"],
         ["w0", "r", "E-N"],
         ["t", "B-N", "_"],
         ["t", "I-N", "B-N"],
         ["t", "E-N", "I-N"],
         ["t", "B-N", "E-N"],
         ["t", "_", "E-N"],
     ]
     absent = [
         ["t", "E-N", "B-N"],
         ["t", "B-N", "I-N"],
     ]

     for feat in present:
         assert_true(feat in feats)
     for feat in absent:
         assert_true(feat not in feats)
    def test_calc_states0(self):
        """Check the per-label scores ``calc_states0`` returns.

        The features of positions 1 through 6 of the "foo" + "bar"
        character sentence (templates "c0" and "w0") are scored against
        WEIGHTS and compared with the expected label-to-score mapping.
        """
        rma = RakutenMA()
        rma.hash_func = None
        rma.featset = ["c0", "w0"]
        csent = rma.add_efeats(
            rma.tokens2csent([["foo", "N"], ["bar", "N"]], "SBIEO"))

        # (position in csent, expected scores for that position)
        expected_by_position = [
            (1, {"B-N": 2, "I-N": 1, "E-N": 1}),
            (2, {"B-N": 1, "I-N": 2, "E-N": 2}),
            (3, {"B-N": 1, "I-N": 2, "E-N": 2}),
            (4, {"B-N": 2, "I-N": 1, "E-N": 1}),
            (5, {"B-N": 1, "I-N": 2, "E-N": 1}),
            (6, {"B-N": 1, "I-N": 1, "E-N": 2}),
        ]
        for position, scores in expected_by_position:
            assert_equals(rma.calc_states0(csent[position].f, WEIGHTS), scores)
    def test_add_efeats(self):
        """Exercise ``add_efeats`` with each kind of feature template.

        The templates "w0", "b1", "c0", "d9" and "t0", then a callable
        template, are applied in turn to the character sentence built from
        "A1-b"; the ``f`` attribute of every position is compared with the
        expected single-feature list. An unknown template name must make
        ``add_efeats`` raise.
        """
        rma = RakutenMA()
        rma.hash_func = None

        def apply_and_check(sent, expected):
            # Recompute features using the current featset, verify the
            # feature list at each position, and hand back the updated
            # sentence so the next template is applied to it.
            sent = rma.add_efeats(sent)
            for position, feat in enumerate(expected):
                assert_equals(sent[position].f, [feat])
            return sent

        rma.featset = ["w0"]
        csent = apply_and_check(rma.str2csent("A1-b"), [
            ["w0", ""],
            ["w0", "A"],
            ["w0", "1"],
            ["w0", "-"],
            ["w0", "b"],
            ["w0", ""],
        ])

        rma.featset = ["b1"]
        csent = apply_and_check(csent, [
            ["b1", "", "A"],
            ["b1", "A", "1"],
            ["b1", "1", "-"],
            ["b1", "-", "b"],
            ["b1", "b", ""],
            ["b1", "", ""],
        ])

        rma.featset = ["c0"]
        csent = apply_and_check(csent, [
            ["c0", ""],
            ["c0", "A"],
            ["c0", "N"],
            ["c0", "O"],
            ["c0", "a"],
            ["c0", ""],
        ])

        rma.featset = ["d9"]
        csent = apply_and_check(csent, [
            ["d9", "", ""],
            ["d9", "", "A"],
            ["d9", "A", "N"],
            ["d9", "N", "O"],
            ["d9", "O", "a"],
            ["d9", "a", ""],
        ])

        rma.featset = ["t0"]
        csent = apply_and_check(csent, [
            ["t0", "", "", "A"],
            ["t0", "", "A", "1"],
            ["t0", "A", "1", "-"],
            ["t0", "1", "-", "b"],
            ["t0", "-", "b", ""],
            ["t0", "b", "", ""],
        ])

        # A feature template may also be a callable taking:
        #   _t: a function that maps a position to its token and takes
        #       care of boundary cases
        #   i:  the current position
        # This sample emits "T" when the token's ``t`` attribute is "A"
        # and "F" otherwise.
        rma.featset = [lambda _t, i: ["CAP", "T" if _t(i).t == "A" else "F"]]
        csent = apply_and_check(csent, [
            ["CAP", "F"],
            ["CAP", "T"],
            ["CAP", "F"],
            ["CAP", "F"],
            ["CAP", "F"],
            ["CAP", "F"],
        ])

        # An unknown template name has to be rejected.
        rma.featset = ["NONEXISTENT_FEATURE"]
        assert_raises(Exception, rma.add_efeats, csent)
示例#6
0
 def __init__(self):
     """Create a RakutenMA analyzer and keep it on ``self.rma``.

     The analyzer loads ``model_ja.json`` and is given a 15-bit feature
     hash function before being stored.
     """
     analyzer = RakutenMA()
     analyzer.load("model_ja.json")
     analyzer.hash_func = analyzer.create_hash_func(15)
     self.rma = analyzer
        # Now what does the result look like?
        # First trained, maybe?
        print('After first trained')
        print('segment')
        print(rma.tokenize(result))
        print('rma')
        print(rma.tokenize(
            "米中間選挙は6日に午後6時(日本時間7日午前8時)に一部の投票所が締め切られ、開票が始まった。米連邦議会の多数党がどちらになるかによって、ドナルド・トランプ米大統領の政策の行方が決まる。特に下院でどれだけ、民主党が共和党現職の議席を奪うかが注目されている。"))

# Initialize a RakutenMA instance with a pre-trained model.
# The SCW hyperparameters (phi, c) are passed explicitly for demonstration
# purposes.
rma = RakutenMA(phi=1024, c=0.007812)
rma.load("model_ja.json")

# Set the feature hash function (15bit)
rma.hash_func = rma.create_hash_func(15)

# Tokenize one sample sentence
print('Tokenize simple sentence')
print(rma.tokenize("うらにわにはにわにわとりがいる"))

# Re-train the model feeding the right answer (pairs of [token, PoS tag])
res = rma.train_one(
    [["うらにわ", "N-nc"],
     ["に", "P-k"],
     ["は", "P-rj"],
     ["にわ", "N-n"],
     ["にわとり", "N-nc"],
     ["が", "P-k"],
     ["いる", "V-c"]])
# The result of train_one contains:
示例#8
0
def create_tokenizer():
    """Return the ``tokenize`` method of a freshly configured RakutenMA.

    The instance loads ``model_ja.json`` and uses a 15-bit feature hash
    function.
    """
    analyzer = RakutenMA()
    analyzer.load('model_ja.json')
    analyzer.hash_func = analyzer.create_hash_func(15)
    return analyzer.tokenize