import decoder
import signwordpos
import weibo_bot


def lineproc(id, text):
    """Segment one Weibo post and accumulate word-group counts."""
    global word_dict_root
    global grouptree
    grouptree.StartCountGroup()
    # Strip Weibo boilerplate from the raw text before segmenting.
    line = weibo_bot.RemoveWeiboRubbish(text)
    if len(line) == 0:
        return None
    spliter = decoder.LineSpliter(word_dict_root)
    spliter.SplitLine(line)
    spliter.AfterProcess()
    words = spliter.found_word
    # Tag word positions, then feed the sentence into the group tree
    # so it can update its per-group counters.
    signwordpos.ProcessSentence(words)
    grouptree.ProcessOneLine(words)
    return grouptree.group_count
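# A minimal driver sketch for lineproc, assuming word_dict_root is
# loaded with decoder.LoadDefaultWordDic() as done elsewhere in this
# repo; how grouptree is constructed is not shown in this file, so
# that setup is left as a placeholder.
#
#   word_dict_root = decoder.LoadDefaultWordDic()
#   grouptree = ...  # project-specific group-counting object
#   counts = lineproc(1, u"转发微博 //@friend: 今天天气不错")
#   if counts is not None:
#       print counts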
# Module-level imports this method relies on: gzip, json, re,
# StringIO.StringIO, and the project's decoder module.
def RequestWork(self, params, body):
    # Decompress the request body if the caller marked it as gzipped.
    if params.get('zip'):
        body = gzip.GzipFile(fileobj=StringIO(body), mode='r').read()
    if not isinstance(body, unicode) and 'encode' in params:
        body = body.decode(params['encode'])
    # Cut the text into pieces on whitespace plus ASCII and full-width
    # punctuation, dropping empty fragments.
    text_pice = re.split(u"[\s!?,。;,:“ ”( )、?《》·]+", body)
    text_list = []
    for tp in text_pice:
        tp = tp.strip()
        if len(tp) > 0:
            text_list.append(tp)
    result_text_list = []
    for tp in text_list:
        # Segment each piece with the shared word dictionary.
        spliter = decoder.LineSpliter(self.word_dict_root)
        spliter.SplitLine(tp)
        spliter.AfterProcess()
        words = spliter.found_word
        self.signwordpos.ProcessSentence(words)
        #self.grouptree.ProcessOneLine(words)
        word_list = []
        for word in words:
            word_list.append({
                'pos': word.pos,
                'txt': word.word,
                'type': word.word_type_list,
                'nocn': word.is_no_cn
            })
        result_text_list.append({'pice': tp, 'words': word_list})
    # Gzip the JSON reply; close the GzipFile explicitly so the gzip
    # trailer is flushed into the buffer before it is read back.
    outbuf = StringIO()
    gzfp = gzip.GzipFile(fileobj=outbuf, mode='w')
    json.dump(result_text_list, gzfp)
    gzfp.close()
    return {'zip': True}, outbuf.getvalue()
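# A minimal client-side sketch of calling RequestWork and unpacking
# its reply; `handler` and `raw_body` are illustrative names, assuming
# an instance of the enclosing class with word_dict_root and
# signwordpos already attached.
import gzip
import json
from StringIO import StringIO

params, payload = handler.RequestWork({'encode': 'utf-8'}, raw_body)
if params.get('zip'):
    # The reply is gzipped JSON, mirroring the optional request zip.
    payload = gzip.GzipFile(fileobj=StringIO(payload), mode='r').read()
for pice in json.loads(payload):
    # Each entry carries the original text piece and its segmented words.
    print pice['pice'], [w['txt'] for w in pice['words']]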
#-*-coding:utf-8-*-
import sqlite3
import codecs
import json
import decoder
import gzip

if __name__ == '__main__':
    """
    Use Sina news articles to test word frequencies.
    The previous step is fetch_hudongbaike/fetch_sina_news.py.
    """
    dbtext = sqlite3.connect("../fetch_hudongbaike/data/sina_news.db")
    dc = dbtext.cursor()
    dc.execute('select content from sina_news where content is not null')
    word_dic = {}
    word_dict_root = decoder.LoadDefaultWordDic()
    for content, in dc:
        # Segment each article and count every Chinese word found.
        spliter = decoder.LineSpliter(word_dict_root)
        spliter.SplitLine(content)
        spliter.CheckCantantPre()
        spliter.CheckTail()
        for word in spliter.found_word:
            if word.is_no_cn:
                continue
            word_dic[word.word] = word_dic.get(word.word, 0) + 1
    # Dump the frequency table as gzipped JSON.
    fp = gzip.open('data/dictbase/word_freq.txt.gz', 'w')
    json.dump(word_dic, fp)
    fp.close()
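# A minimal sketch of reading the dump back, assuming the gzip + JSON
# layout written above; prints the 20 most frequent words.
import gzip
import json

fp = gzip.open('data/dictbase/word_freq.txt.gz', 'r')
word_dic = json.load(fp)
fp.close()
for word, freq in sorted(word_dic.items(), key=lambda x: -x[1])[:20]:
    print word.encode('utf-8'), freq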