def load_scws():
    """Create, configure and return an SCWS segmenter instance.

    The segmenter is set to the SCWS_ENCODING charset, loads the base
    dictionary at CHS_DICT_PATH, then layers the additional dictionaries
    on top of it in a fixed order, and finally applies the rule file
    SCWS_RULES and the IGNORE_PUNCTUATION setting.

    Returns:
        The configured ``scws.Scws`` object.
    """
    segmenter = scws.Scws()
    segmenter.set_charset(SCWS_ENCODING)

    # The base dictionary is installed with set_dict; every other
    # dictionary is appended with add_dict, in the order listed below.
    segmenter.set_dict(CHS_DICT_PATH, scws.XDICT_MEM)
    additional_dicts = (
        (CHT_DICT_PATH, scws.XDICT_MEM),
        (CUSTOM_DICT_PATH, scws.XDICT_TXT),
        # Stop words are all split into single characters; single
        # characters are filtered out afterwards, which is how stop
        # words end up being removed.
        (EXTRA_STOPWORD_PATH, scws.XDICT_TXT),
        # Emoticons are segmented based on the emoticon table; when
        # necessary they are stripped where the result is returned or
        # later.
        (EXTRA_EMOTIONWORD_PATH, scws.XDICT_TXT),
    )
    for dict_path, load_mode in additional_dicts:
        segmenter.add_dict(dict_path, load_mode)

    segmenter.set_rules(SCWS_RULES)
    segmenter.set_ignore(IGNORE_PUNCTUATION)
    return segmenter
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# NOTE: this file is no longer used; it is being refactored.

import re
import string
import sys
import time

import pymongo
import scws
import simplejson as json
import xapian

# Module-level SCWS segmenter: UTF-8 charset, the two system .xdb
# dictionaries loaded with XDICT_MEM, a plain-text user dictionary
# (path relative to the working directory), the system rule file, and
# set_ignore(1).
s = scws.Scws()
s.set_charset('utf-8')
s.set_dict('/usr/local/scws/etc/dict.utf8.xdb', scws.XDICT_MEM)
s.add_dict('/usr/local/scws/etc/dict_cht.utf8.xdb', scws.XDICT_MEM)
s.add_dict('userdic.txt', scws.XDICT_TXT)
s.set_rules('/usr/local/scws/etc/rules.utf8.ini')
s.set_ignore(1)

# MongoDB connection: authenticate against the "admin" database, then
# rebind `db` to the "weibo" database.
# Calling pymongo.Connection() with no arguments is the alternative that
# targets the driver's default host instead of the fixed address below.
# NOTE(review): pymongo.Connection was removed in PyMongo 3.0 in favour of
# MongoClient; left unchanged because the installed driver version is not
# visible from this file.
# NOTE(review): host and root credentials are hard-coded; move them to
# configuration outside the source tree.
connection = pymongo.Connection('219.224.135.60', 27017)
db = connection.admin
db.authenticate('root', 'root')
db = connection.weibo

# Parenthesised single-argument form prints the same text on Python 2 and
# is valid syntax on Python 3 (the bare print statement is not).
print('pymongo success')