Example #1
class ExplainJapaneseSentences(BaseFilter):
    def __init__(self):
        super().__init__()

        # Initialize a RakutenMA instance with the SCW hyperparameters (phi, c),
        # then load a pre-trained model below
        self.rma = RakutenMA(phi=1024, c=0.007812)

        # https://github.com/ikegami-yukino/rakutenma-python/tree/master/rakutenma/model
        self.rma.load(abspath(r'..\resource\model_ja.min.json'))

    def __call__(self, chunk):
        chunk = self._duplicate_chunk(chunk)
        chunk.final = True

        result = [chunk]
        text = self.tokenize(chunk.text)
        result.append(
            TextChunk(text=text,
                      language='japanese',
                      audible=False,
                      printable=True,
                      final=True))
        return result

    def tokenize(self, text):
        tokens = self.rma.tokenize(text)
        return ' '.join(map(lambda pair: f'{pair[0]} ({pair[1]})', tokens))
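For context, RakutenMA.tokenize returns a list of [surface, PoS-tag] pairs, which the tokenize method above joins into "surface (tag)" strings. A minimal standalone sketch, assuming the pre-trained model_ja.min.json sits in the working directory (the sample sentence is the one used in the RakutenMA README):

from rakutenma import RakutenMA

rma = RakutenMA(phi=1024, c=0.007812)
rma.load('model_ja.min.json')

tokens = rma.tokenize('彼は新しい仕事できっと成功するだろう。')
# tokens is a list of [surface, PoS-tag] pairs
print(' '.join(f'{surface} ({tag})' for surface, tag in tokens))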
Example #2
def tokenize(x, t):
    if t in TOKC:
        from jieba import posseg
        toks = posseg.cut(x)
        if t == POSC:
            return u'\u3000'.join([('%s [%s]' % (f.word, f.flag))
                                   for f in toks])
        elif t == SPACEC:
            return u'\u3000'.join([('%s' % (f.word)) for f in toks])
        else:
            return lexDens(toks, t)
    elif t in TOKJ:
        from rakutenma import RakutenMA
        rma = RakutenMA(phi=1024, c=0.007812)  # specify SCW hyperparameters (phi, c)
        tD, tF = os.path.split(__file__)
        jSon = os.path.join(tD, 'model_ja.min.json')
        rma.load(jSon)
        toks = rma.tokenize(x)
        if t == SPACEJ:
            return u'\u3000'.join([i[0] for i in toks])
        elif t == POSJ:
            return u'\u3000'.join([('%s [%s]' % (i[0], i[1])) for i in toks])
        else:
            return lexDens(toks, t)
Example #3
def _get_tokenizer(lang):
    rma = None
    if lang == 'ja':
        rma = RakutenMA()
        rma.load('model_ja.json')
        # set the 15-bit feature hashing function used when the bundled model was trained
        rma.hash_func = rma.create_hash_func(15)
        tokenizer = _jap_tokenizer
        # tokenizer = _jap_character_tokenizer
    else:
        tokenizer = _eng_tokenizer
    return tokenizer, rma
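_jap_tokenizer and _eng_tokenizer are defined elsewhere in the original module; a purely hypothetical sketch of a Japanese tokenizer that fits the (tokenizer, rma) pair returned above, assuming the caller passes the RakutenMA instance along with the text:

def _jap_tokenizer(rma, text):
    # hypothetical helper: keep only the surface forms from RakutenMA's [surface, tag] pairs
    return [surface for surface, _tag in rma.tokenize(text)]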
Example #4
def splitShuffle(expr, t):
    expr = stripHTML(expr).strip()
    if t == SGJ:
        from rakutenma import RakutenMA
        rma = RakutenMA(phi=1024, c=0.007812)
        tD, tF = os.path.split(__file__)
        jSon = os.path.join(tD, 'model_ja.min.json')
        rma.load(jSon)
        resultl = rma.tokenize(expr)
        result = [r for r, s in resultl]
    elif t in SGC:
        import jieba
        result = jieba.cut(expr, cut_all=False)
    elif t == JSS:
        result = expr.split(' ')
    elif t in WRAPL:
        result = list(expr)
    newResult, glosses = getResult(result, t)
    jn = u''
    full = jn.join(newResult)
    random.shuffle(newResult)
    strResult = u''.join(newResult)
    return strResult, full, glosses
Example #5
    del counter['DETWH']
    counter['ET'] += 0
    counter['I'] += 0
    counter['NC'] += counter['N'] + counter['VINF']; del counter['N']; del counter['VINF']
    counter['NP'] += counter['NPP']; del counter['NPP']
    del counter['P']   # The Japanese tag set doesn't account for prepositions. We could manually look for them using a table like this: http://mylanguages.org/japanese_prepositions.php
    counter['PREF'] += 0
    counter['PRO'] += counter['PROREL'] + counter['PROWH']; del counter['PROREL']; del counter['PROWH']
    counter['V'] += counter['VIMP'] + counter['VPR'] + counter['VS']; del counter['VIMP']; del counter['VPR']; del counter['VS']
    counter['PUNC'] += counter['.$$.']; del counter['.$$.']

    return dict(counter)


rma = RakutenMA()
rma.load("model_ja.json")
def _analyze_ja(text):
    tags = rma.tokenize(text)
    counter = collections.Counter([x[1] for x in tags])  # see the same premise above in the French section
    subordinating_conjunctions = list(filter(lambda tup: tup[1] == 'C' and tup[0] in jsc, tags))

    return {  # we need to map the Japanese tagset to a subset of the French tagset, so that we can compare the two
        'ADJ':    counter['A-c'] + counter['A-dp'] + counter['J-c'] + counter['J-tari'] + counter['J-xs'] + counter['R'],
        'ADV':  counter['F'],
        'CC':   counter['C'] - len(subordinating_conjunctions),
        'CS':   len(subordinating_conjunctions),
        'ET':   counter['E'],
        'I':    counter['I-c'],
        'NC':   counter['N-n'] + counter['N-nc'],
        'NP':   counter['N-pn'],
        'PREF': counter['P'],
Example #6
    def __init__(self):
        rma = RakutenMA()
        rma.load("model_ja.json")
        rma.hash_func = rma.create_hash_func(15)
        self.rma = rma
Example #7
def create_tokenizer():
    rma = RakutenMA()
    rma.load('model_ja.json')
    rma.hash_func = rma.create_hash_func(15)
    return rma.tokenize
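A brief usage sketch of the factory above, assuming model_ja.json is available in the working directory and using an illustrative sentence:

tokenize = create_tokenizer()
for surface, tag in tokenize('うらにわにはにわにわとりがいる'):
    print(surface, tag)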