Example #1
def lineproc(id, text):
    # Process one weibo line: strip weibo noise, segment it into words,
    # tag the words, and feed them into the shared group counter.
    global word_dict_root
    global grouptree

    grouptree.StartCountGroup()
    line = weibo_bot.RemoveWeiboRubbish(text)
    if len(line) == 0:
        return None
    spliter = decoder.LineSpliter(word_dict_root)
    spliter.SplitLine(line)
    spliter.AfterProcess()
    words = spliter.found_word
    signwordpos.ProcessSentence(words)
    grouptree.ProcessOneLine(words)
    return grouptree.group_count
Example #2
    def RequestWork(self, params, body):
        # Decompress and decode the request body if the caller asked for it.
        if params.get('zip'):
            body = gzip.GzipFile(fileobj=StringIO(body), mode='r').read()
        if not isinstance(body, unicode) and 'encode' in params:
            body = body.decode(params['encode'])

        text_pice = re.split(u"[\s!?,。;,:“ ”( )、?《》·]+", body)
        text_list = []
        for tp in text_pice:
            tp = tp.strip()
            if len(tp) > 0:
                text_list.append(tp)

        # Segment each piece and collect the tagged words.
        result_text_list = []
        for tp in text_list:
            spliter = decoder.LineSpliter(self.word_dict_root)
            spliter.SplitLine(tp)
            spliter.AfterProcess()
            words = spliter.found_word
            self.signwordpos.ProcessSentence(words)
            #self.grouptree.ProcessOneLine(words)
            #for word in words:
            #    groupstr = None
            #    if word.info:
            #        groups = word.info.get('group')
            #        if groups:
            #            groupstr = ','.join(groups)
            word_list = []
            for word in words:
                word_list.append({
                    'pos': word.pos,
                    'txt': word.word,
                    'type': word.word_type_list,
                    'nocn': word.is_no_cn
                })
            result_text_list.append({'pice': tp, 'words': word_list})

        # Close the GzipFile before reading the buffer, otherwise the gzip
        # trailer is never flushed and the payload is truncated.
        outbuf = StringIO()
        zf = gzip.GzipFile(fileobj=outbuf, mode='w')
        json.dump(result_text_list, zf)
        zf.close()
        return {'zip': True}, outbuf.getvalue()
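
A short sketch of how a caller might unpack what RequestWork returns, following the zip/JSON convention used above. The helper name unpack_result and the commented usage lines are illustrative assumptions, not part of the original code.

from StringIO import StringIO
import gzip
import json

def unpack_result(params, body):
    # Undo the gzip wrapping applied by RequestWork (when params['zip'] is set)
    # and parse the JSON payload back into the result_text_list structure.
    if params.get('zip'):
        body = gzip.GzipFile(fileobj=StringIO(body), mode='r').read()
    return json.loads(body)

# Illustrative usage, assuming `worker` is an instance of the class defining RequestWork:
# out_params, out_body = worker.RequestWork({'encode': 'utf-8'}, text)
# for pice in unpack_result(out_params, out_body):
#     print pice['pice'], [w['txt'] for w in pice['words']]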
Example #3
# -*- coding: utf-8 -*-
import sqlite3
import codecs
import json
import decoder
import gzip

if __name__ == '__main__':
    """
    使用新浪新闻来测试词频 上一步是 fetch_hudongbaike/fetch_sina_news.py
    """
    dbtext = sqlite3.connect("../fetch_hudongbaike/data/sina_news.db")

    dc = dbtext.cursor()
    dc.execute('select content from sina_news where content is not null')

    word_dic = {}
    word_dict_root = decoder.LoadDefaultWordDic()
    # Segment every article and count how often each Chinese word occurs.
    for content, in dc:
        spliter = decoder.LineSpliter(word_dict_root)
        spliter.SplitLine(content)
        spliter.CheckCantantPre()
        spliter.CheckTail()
        for word in spliter.found_word:
            if word.is_no_cn:
                continue
            word_dic[word.word] = word_dic.get(word.word, 0) + 1

    # Write the frequency table as gzipped JSON for later pipeline steps.
    fp = gzip.open('data/dictbase/word_freq.txt.gz', 'w')
    json.dump(word_dic, fp)
    fp.close()
    dbtext.close()
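
A minimal sketch of reading the frequency dump back in a later step, assuming the same path used above; the top-20 printout is illustrative only.

import gzip
import json

# Load the gzipped JSON frequency table written by the script above.
fp = gzip.open('data/dictbase/word_freq.txt.gz', 'r')
word_dic = json.load(fp)
fp.close()

# Print the 20 most frequent words (illustrative, not part of the pipeline).
for word, freq in sorted(word_dic.items(), key=lambda kv: kv[1], reverse=True)[:20]:
    print word.encode('utf-8'), freq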