def test_create_hash_func(self):
    rma = RakutenMA()
    hash_func = rma.create_hash_func(4)
    assert_equals(hash_func("feat1", "foo"), ["5"])
    assert_equals(hash_func("feat1", "bar"), ["2"])
    assert_equals(hash_func("feat1", "baz"), ["10"])
    assert_equals(hash_func("feat1", "qux"), ["3"])
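# The test above pins down the contract of create_hash_func: it returns a
# closure that maps a (feature, value) pair into one of 2**bits buckets and
# yields the bucket index as a single-element list of strings. Below is a
# minimal sketch of a function with that shape, assuming a simple polynomial
# string hash (illustrative only; RakutenMA's own hash produces different
# bucket indices than this sketch):
def sketch_hash_func(bits):
    mask = (1 << bits) - 1  # e.g. bits=4 keeps indices in 0..15, as in the test
    def hash_func(feat, value):
        h = 0
        for ch in feat + "\x00" + value:  # "\x00" separates the two fields
            h = (h * 31 + ord(ch)) & 0xFFFFFFFF
        return [str(h & mask)]
    return hash_func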
def _get_tokenizer(lang):
    # Japanese needs a loaded RakutenMA model; English uses a plain tokenizer.
    rma = None
    if lang == 'ja':
        rma = RakutenMA()
        rma.load('model_ja.json')
        rma.hash_func = rma.create_hash_func(15)
        tokenizer = _jap_tokenizer
        # tokenizer = _jap_character_tokenizer
    else:
        tokenizer = _eng_tokenizer
    return tokenizer, rma
def __init__(self, kv_filepath, model):
    self.rma = RakutenMA(json.loads(open(model).read()))
    self.rma.hash_func = self.rma.create_hash_func(15)
    self.ja_to_en = defaultdict(list)
    self.en_to_ja = defaultdict(list)
    # Build a bidirectional lookup from a CSV of "japanese,english" pairs.
    for l in open(kv_filepath):
        [k, v] = l.strip().split(',')[:2]
        raw = unicode(k, 'utf-8')  # Python 2: decode the UTF-8 key
        # lemma = self.rma.tokenize(raw)[0][0]
        self.ja_to_en[raw].append(v)
        self.en_to_ja[v].append(raw)
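# A sketch of the expected kv_filepath format, inferred from the comma split
# above (the entries and the class name Glossary are illustrative, not from
# the original source):
#
#   猫,cat
#   犬,dog
#
# g = Glossary('kv.csv', 'model_ja.json')
# g.ja_to_en[u'猫']  # -> ['cat']
# g.en_to_ja['cat']  # -> [u'猫']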
def __init__(self, model):
    print(model)
    self.rma = RakutenMA(json.loads(open(model).read()))
    self.rma.hash_func = self.rma.create_hash_func(15)
def __init__(self):
    rma = RakutenMA()
    rma.load("model_ja.json")
    rma.hash_func = rma.create_hash_func(15)
    self.rma = rma
# Now what does the result of the first training pass look like?
print('After first trained')
print('segment')
print(rma.tokenize(result))
print('rma')
print(rma.tokenize("米中間選挙は6日に午後6時(日本時間7日午前8時)に一部の投票所が締め切られ、開票が始まった。米連邦議会の多数党がどちらになるかによって、ドナルド・トランプ米大統領の政策の行方が決まる。特に下院でどれだけ、民主党が共和党現職の議席を奪うかが注目されている。"))

# Initialize a RakutenMA instance with a pre-trained model
rma = RakutenMA(phi=1024, c=0.007812)  # specify hyperparameters for SCW (for demonstration purposes)
rma.load("model_ja.json")

# Set the feature hash function (15-bit)
rma.hash_func = rma.create_hash_func(15)

# Tokenize one sample sentence
print('Tokenize simple sentence')
print(rma.tokenize("うらにわにはにわにわとりがいる"))

# Re-train the model by feeding the right answer (pairs of [token, PoS tag])
res = rma.train_one(
    [["うらにわ", "N-nc"], ["に", "P-k"], ["は", "P-rj"],
     ["にわ", "N-n"], ["にわとり", "N-nc"],
     ["が", "P-k"], ["いる", "V-c"]])
# The result of train_one contains the system output, the given answer,
# and a flag indicating whether the model was updated
def create_tokenizer():
    rma = RakutenMA()
    rma.load('model_ja.json')
    rma.hash_func = rma.create_hash_func(15)
    return rma.tokenize
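# Since rma.tokenize returns a list of [token, PoS-tag] pairs (see the
# train_one demo above), the factory's return value can be called directly;
# a short usage sketch:
tokenize = create_tokenizer()
for token, pos in tokenize("うらにわにはにわにわとりがいる"):
    print(token, pos)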