Пример #1
0
class Parser(object):
    """Tokenizer that normalizes text and splits it with a MeCab tagger.

    Input is run through ``TextNormalizer`` and handed to MeCab as a byte
    string encoded with ``encoding``. The code relies on ``str.decode`` and
    therefore targets Python 2, where ``str`` is a byte string.
    """

    def __init__(self, encoding="utf8", mecab_option=default_option):
        """Build the MeCab tagger and the text normalizer.

        :param encoding: codec used to decode byte-string input, to encode
            the text passed to MeCab, and to decode surfaces in ``parse``.
        :param mecab_option: option string passed as-is to ``MeCab.Tagger``.
        """
        self.encoding = encoding
        self.mecab_option = mecab_option
        self.tagger = MeCab.Tagger(self.mecab_option)
        self.normalizer = TextNormalizer()
        # Most recent encoded sentence handed to MeCab; node() explains why
        # a reference to it is retained.
        self._sentence = ""

    def node(self, s):
        """Normalize ``s`` and return the first MeCab node of its parse.

        :param s: text to parse; a byte string (``str``) is decoded with
            ``self.encoding`` first, anything else is passed to the
            normalizer unchanged.
        :returns: whatever ``self.tagger.parseToNode`` returns for the
            normalized, encoded text. If decoding, normalizing or encoding
            raises, the empty string is parsed instead (best effort).
        """
        try:
            # isinstance (not type() ==) so subclasses of str are decoded too.
            if isinstance(s, str):
                s = s.decode(self.encoding)
            s = self.normalizer.normalize(s).encode(self.encoding)
        except Exception:
            # Deliberate best effort: unusable input is parsed as "".
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            s = ""
        # NOTE(review): per the MeCab documentation the input sentence is not
        # copied unless --allocate-sentence is given (whether default_option
        # sets it is not visible here), so nodes may point into this string's
        # buffer. Keep it alive on the instance so node.surface stays valid
        # after this method returns.
        self._sentence = s
        return self.tagger.parseToNode(s)

    def parse(self, s, to_unicode=False):
        """Return the non-empty token surfaces of ``s`` as a list.

        :param s: text to tokenize (see ``node`` for accepted input).
        :param to_unicode: when true, each surface is decoded with
            ``self.encoding`` before being returned.
        :returns: list of surfaces in node order; nodes whose surface is the
            empty string are skipped.
        """
        tokens = []
        node = self.node(s)
        while node:
            surface = node.surface
            if surface:
                if to_unicode:
                    surface = surface.decode(self.encoding)
                tokens.append(surface)
            node = node.next
        return tokens
Пример #2
0
class Parser(object):
    """Wrap a MeCab tagger so that text is normalized before it is parsed.

    Text goes through ``TextNormalizer`` and is then given to MeCab as a byte
    string in ``encoding``. ``str.decode`` is used, so this is Python 2 code
    (``str`` is the byte-string type there).
    """

    def __init__(self, encoding="utf8", mecab_option=default_option):
        """Create the tagger and normalizer used by this parser.

        :param encoding: codec for decoding byte-string input, encoding the
            text sent to MeCab, and decoding surfaces in ``parse``.
        :param mecab_option: option string forwarded unchanged to
            ``MeCab.Tagger``.
        """
        self.encoding = encoding
        self.mecab_option = mecab_option
        self.tagger = MeCab.Tagger(self.mecab_option)
        self.normalizer = TextNormalizer()
        # Holds the last encoded sentence passed to MeCab (reason in node()).
        self._sentence = ""

    def node(self, s):
        """Return the first MeCab node for the normalized form of ``s``.

        :param s: input text; byte strings (``str``) are decoded with
            ``self.encoding``, other objects go to the normalizer as they are.
        :returns: the result of ``self.tagger.parseToNode`` on the normalized
            text encoded with ``self.encoding``. When decoding, normalizing
            or encoding fails, the empty string is parsed as a fallback.
        """
        try:
            # Use isinstance so that str subclasses are decoded as well;
            # an exact type() comparison would skip them.
            if isinstance(s, str):
                s = s.decode(self.encoding)
            s = self.normalizer.normalize(s).encode(self.encoding)
        except Exception:
            # Intentional best effort: fall back to parsing "". Exception
            # (instead of a bare except) no longer swallows
            # KeyboardInterrupt / SystemExit.
            s = ""
        # NOTE(review): MeCab documents that it only copies the input when
        # --allocate-sentence is set (not known for default_option from this
        # view), so returned nodes may reference this string's memory.
        # Retaining it on the instance keeps node.surface readable once this
        # method has returned.
        self._sentence = s
        return self.tagger.parseToNode(s)

    def parse(self, s, to_unicode=False):
        """Split ``s`` into a list of token surfaces.

        :param s: text to tokenize; handled as described in ``node``.
        :param to_unicode: if true, decode every surface with
            ``self.encoding`` before adding it to the result.
        :returns: surfaces in node order, omitting nodes with an empty
            surface.
        """
        tokens = []
        node = self.node(s)
        while node:
            surface = node.surface
            if surface:
                if to_unicode:
                    surface = surface.decode(self.encoding)
                tokens.append(surface)
            node = node.next
        return tokens
Пример #3
0
#!/usr/bin/python
#encoding: utf8

import sys
from NormalizeText import TextNormalizer

if __name__ == '__main__':
    #print sys.argv
    if len(sys.argv) < 2:
        normalizer = TextNormalizer()
    else:
        normalizer = TextNormalizer(*sys.argv[1:])
    #print normalizer.normalize_methods
    while 1:
        s = raw_input()
        print normalizer.normalize(s.decode('utf8'))