Пример #1
0
class Parser(object):
    """Wrapper around a MeCab tagger that normalizes text before parsing.

    The str/unicode handling follows Python 2 semantics: ``str`` input is
    treated as encoded bytes and decoded with ``encoding`` first.
    """

    def __init__(self, encoding="utf8", mecab_option=default_option):
        """Create the MeCab tagger and the text normalizer.

        Args:
            encoding: Codec used to decode byte-string input and to encode
                the normalized text handed to the tagger.
            mecab_option: Option value passed to ``MeCab.Tagger``.
        """
        self.encoding = encoding
        self.mecab_option = mecab_option
        self.tagger = MeCab.Tagger(self.mecab_option)
        self.normalizer = TextNormalizer()

    def node(self, s):
        """Normalize *s* and return ``tagger.parseToNode`` for the result.

        Args:
            s: Text to analyse; a ``str`` is decoded with ``self.encoding``
                before normalization.

        Returns:
            Whatever ``self.tagger.parseToNode`` returns for the normalized,
            re-encoded text. If decoding, normalization or encoding fails,
            the empty string is parsed instead.
        """
        try:
            if isinstance(s, str):
                s = s.decode(self.encoding)
            s = self.normalizer.normalize(s)
            s = s.encode(self.encoding)
        except Exception:
            # Deliberate best effort: unusable input is parsed as empty text.
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            s = ""
        # NOTE(review): some MeCab bindings need the input string to stay
        # alive while the nodes are traversed; `s` is only a local here --
        # confirm this is safe with the binding in use.
        return self.tagger.parseToNode(s)

    def parse(self, s, to_unicode=False):
        """Return the list of non-empty node surfaces for *s*.

        Args:
            s: Text to analyse (see ``node``).
            to_unicode: When true, each surface is decoded with
                ``self.encoding`` before being collected.

        Returns:
            A list of surfaces in node order. Nodes with an empty surface
            are skipped (presumably MeCab's begin/end sentinel nodes).
        """
        node = self.node(s)
        ret = []
        while node:
            surface = node.surface
            if surface != "":
                if to_unicode:
                    surface = surface.decode(self.encoding)
                ret.append(surface)
            node = node.next
        return ret
Пример #2
0
class Parser(object):
    """Wrapper around a MeCab tagger that normalizes text before parsing.

    The str/unicode handling follows Python 2 semantics: ``str`` input is
    treated as encoded bytes and decoded with ``encoding`` first.
    """

    def __init__(self, encoding="utf8", mecab_option=default_option):
        """Create the MeCab tagger and the text normalizer.

        Args:
            encoding: Codec used to decode byte-string input and to encode
                the normalized text handed to the tagger.
            mecab_option: Option value passed to ``MeCab.Tagger``.
        """
        self.encoding = encoding
        self.mecab_option = mecab_option
        self.tagger = MeCab.Tagger(self.mecab_option)
        self.normalizer = TextNormalizer()

    def node(self, s):
        """Normalize *s* and return ``tagger.parseToNode`` for the result.

        Args:
            s: Text to analyse; a ``str`` is decoded with ``self.encoding``
                before normalization.

        Returns:
            Whatever ``self.tagger.parseToNode`` returns for the normalized,
            re-encoded text. If decoding, normalization or encoding fails,
            the empty string is parsed instead.
        """
        try:
            if isinstance(s, str):
                s = s.decode(self.encoding)
            s = self.normalizer.normalize(s)
            s = s.encode(self.encoding)
        except Exception:
            # Deliberate best effort: unusable input is parsed as empty text.
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            s = ""
        # NOTE(review): some MeCab bindings need the input string to stay
        # alive while the nodes are traversed; `s` is only a local here --
        # confirm this is safe with the binding in use.
        return self.tagger.parseToNode(s)

    def parse(self, s, to_unicode=False):
        """Return the list of non-empty node surfaces for *s*.

        Args:
            s: Text to analyse (see ``node``).
            to_unicode: When true, each surface is decoded with
                ``self.encoding`` before being collected.

        Returns:
            A list of surfaces in node order. Nodes with an empty surface
            are skipped (presumably MeCab's begin/end sentinel nodes).
        """
        node = self.node(s)
        ret = []
        while node:
            surface = node.surface
            if surface != "":
                if to_unicode:
                    surface = surface.decode(self.encoding)
                ret.append(surface)
            node = node.next
        return ret
Пример #3
0
#!/usr/bin/python
#encoding: utf8

import sys
from NormalizeText import TextNormalizer

if __name__ == '__main__':
    # Command-line arguments (if any) are forwarded to TextNormalizer as
    # positional arguments; with none given this is plain TextNormalizer().
    normalizer = TextNormalizer(*sys.argv[1:])
    # Filter loop: read one UTF-8 line from stdin, print its normalized form.
    while True:
        try:
            s = raw_input()
        except EOFError:
            # End of input: stop cleanly instead of dying with a traceback.
            break
        print(normalizer.normalize(s.decode('utf8')))
Пример #4
0
 def __init__(self, encoding="utf8", mecab_option=default_option):
     """Store the settings, then build the MeCab tagger and the normalizer.

     encoding is kept on the instance as given; mecab_option is kept on the
     instance and also handed to MeCab.Tagger.
     """
     self.encoding, self.mecab_option = encoding, mecab_option
     # The tagger is created before the normalizer, as in the original order.
     self.tagger = MeCab.Tagger(mecab_option)
     self.normalizer = TextNormalizer()
Пример #5
0
 def __init__(self, encoding="utf8", mecab_option=default_option):
     """Store the settings, then build the MeCab tagger and the normalizer.

     encoding is kept on the instance as given; mecab_option is kept on the
     instance and also handed to MeCab.Tagger.
     """
     self.encoding, self.mecab_option = encoding, mecab_option
     # The tagger is created before the normalizer, as in the original order.
     self.tagger = MeCab.Tagger(mecab_option)
     self.normalizer = TextNormalizer()
Пример #6
0
from NormalizeText import TextNormalizer
import datetime
from BeautifulSoup import BeautifulSoup as BS
from config import *
import urllib
import urllib2
import MeCabParser
import pldautils

# Year of four or more digits (no leading zero) followed by "/" or the year
# kanji and then a digit; group 1 captures the year.
pattern_year = re.compile(u"([1-9][0-9]{3,})[\/年][0-9]")
# Month and day, each one or two digits without a leading zero, each followed
# by "/" or its kanji marker. The digit classes are [1-9][0-9]? rather than
# [1-9]{1,2} so that values containing a zero (10, 20, 30) also match.
p_date = re.compile(u"[1-9][0-9]?[\/月][1-9][0-9]?[\/日]")
# Splits a string on "/" and on the month and day kanji.
date_split = re.compile(u"[\/月日]").split
# Matches a whole line that contains at least one of the listed keywords.
# NOTE(review): this pattern is a byte string while the other patterns are
# unicode -- confirm the text it is matched against is encoded bytes.
p_event = re.compile(
    "^.*(本日|今日|明日|明後日|募集|予約|歓迎|発売|発表|開催|実施|参加|受付|会場|開場|申し込み|展示|開演|主催).*$")
# A single day-of-week kanji wrapped in ASCII parentheses.
p_youbi = re.compile(u"\([月火水木金土日]\)")
# Shared module-level normalizer instance.
normalizer = TextNormalizer()
# Lower-case TV broadcaster names and abbreviations.
tv_list = [
    u"nhk", u"テレ東", u"テレビ東京", u"日テレ", u"日本テレビ", u"テレビ朝日", u"テレ朝", u"tbs",
    u"フジテレビ"
]
# Geocoder endpoint that urlopen() sends its query to.
yurl = "http://geo.search.olp.yahooapis.jp/OpenLocalPlatform/V1/geoCoder"


def urlopen(q):
    """Send *q* to the geocoder endpoint ``yurl`` and return the response.

    Args:
        q: Search text. A ``unicode`` value is encoded to UTF-8; a byte
            ``str`` is used as given. It must not be percent-encoded
            already, because encoding is done here.

    Returns:
        The object returned by ``urllib2.urlopen`` for the request URL.
    """
    if isinstance(q, unicode):
        q = q.encode("utf8")
    # Percent-encode both parameters so that spaces, "&", "#" or non-ASCII
    # bytes in the query cannot corrupt the URL or inject extra parameters.
    # A list of pairs keeps the parameter order fixed.
    params = urllib.urlencode([("appid", yid), ("query", q)])
    return urllib2.urlopen(yurl + "?" + params)


class EventTweetTokenizer(MeCabParser.Parser):
    """tweetから余計なものを削除してトークナイズ"""