# -- Indexer setup (fragment): the ZMQ sockets `receiver` and `controller`, and the
# Xapian-related names (XAPIAN_DB_PATH, SCHEMA_VERSION, XapianIndex, xapian_index_forever)
# are defined or imported earlier in the full script.
poller = zmq.Poller()
poller.register(receiver, zmq.POLLIN)
poller.register(controller, zmq.POLLIN)

parser = ArgumentParser()
parser.add_argument('-r', '--remote_stub', action='store_true', help='remote stub')
args = parser.parse_args(sys.argv[1:])
remote_stub = args.remote_stub

dbpath = XAPIAN_DB_PATH
xapian_indexer = XapianIndex(dbpath, SCHEMA_VERSION, remote_stub)

# Hooks applied to every weibo item before it is written to the Xapian index.
fill_field_funcs = []

from consts import XAPIAN_EXTRA_FIELD
from triple_sentiment_classifier import triple_classifier


def fill_sentiment(item):
    # Classify the weibo's sentiment and store it in the extra Xapian field.
    sentiment = triple_classifier(item)
    item[XAPIAN_EXTRA_FIELD] = sentiment
    return item

fill_field_funcs.append(fill_sentiment)

s = load_scws()


def cut_text(item):
    # Segment the weibo text with SCWS and store the resulting terms for indexing.
    text = item['text'].encode('utf-8')
    item['terms'] = cut(s, text, cx=False)
    return item

fill_field_funcs.append(cut_text)

xapian_index_forever(xapian_indexer, receiver, controller, poller, fill_field_funcs=fill_field_funcs)
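# Illustrative sketch (not part of the original script): each fill_field_func takes a
# weibo dict and returns it with extra fields filled in; the sample text below is
# hypothetical, and xapian_index_forever presumably applies these hooks to every
# received item before indexing it:
#
#   item = {'text': u'今天天气不错'}
#   item = fill_sentiment(item)   # adds item[XAPIAN_EXTRA_FIELD], the sentiment label
#   item = cut_text(item)         # adds item['terms'], the SCWS-segmented terms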
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import eventlet
from eventlet import wsgi
from xapian_weibo.utils import load_scws
import json
import urllib

JSON_HEADER = [('Content-Type', 'application/json;charset=UTF-8'),
               ('Access-Control-Allow-Origin', '*'),
               ('Server', 'WDC-eventlet')]

s = load_scws()


def cut(text, f=None):
    # Segment `text` with SCWS; if `f` is given, keep only tokens whose part-of-speech
    # tag is in `f`. A token is kept when it is plain alphanumeric or longer than
    # 3 bytes (i.e. at least two CJK characters), which drops punctuation and
    # single-character tokens.
    global s
    if f:
        return [token[0].decode('utf-8') for token in s.participle(text)
                if token[1] in f and (token[0].isalnum() or len(token[0]) > 3)]
    else:
        return [token[0].decode('utf-8') for token in s.participle(text)
                if token[0].isalnum() or len(token[0]) > 3]


def word_seg(env, start_response):
import nltk
import re
from gensim import corpora, models, similarities
import math
import string
from nltk import probability
from nltk.probability import FreqDist
import cPickle as pickle
import leveldb
from xapian_weibo.xapian_backend import XapianSearch
from xapian_weibo.xapian_backend_extra import _load_weibos_from_xapian
from xapian_weibo.utils import load_scws
from xapian_weibo.utils import cut

cut_str = load_scws()

# Sentiment category labels
HAPPY = 1
ANGRY = 2
SAD = 3


def emoticon(zan_set, angry_set, sad_set, text):
    """`text` is the weibo text, not a keyword."""
    emotion_pattern = r'\[(\S+?)\]'
    remotions = re.findall(emotion_pattern, text)
    zan = 0
    angry = 0
    sad = 0
# -*- coding: utf-8 -*-
# Build the emoticon vocabulary (simplified + traditional Chinese) and check weibos against it.
from __future__ import division
import re
import opencc
import os
from gensim import corpora
import cPickle as pickle
from xapian_weibo.utils import load_scws, cut, load_emotion_words

AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
cut_str = load_scws()
cc = opencc.OpenCC('s2t', opencc_path='/usr/bin/opencc')

# Load the emotion word list, add the traditional-Chinese variants, and build a lookup set.
emotions_words = load_emotion_words()
emotions_words = [unicode(e, 'utf-8') for e in emotions_words]
t_emotions_words = [cc.convert(e) for e in emotions_words]
emotions_words.extend(t_emotions_words)
emotions_words = [w.encode('utf-8') for w in emotions_words]
emotions_words_set = set(emotions_words)
emotion_pattern = re.compile(r'\[(\S+?)\]')


def if_emoticoned_weibo(r):
    # Does the weibo contain any emoticon from the specified set?
    emotions = re.findall(emotion_pattern, r['text'])
    is_emoticoned = 1 if set(emotions) & emotions_words_set else 0
    return is_emoticoned
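# Illustrative usage (not from the original file): the emoticon name below is
# hypothetical and only matches if load_emotion_words() actually contains it.
#
#   if_emoticoned_weibo({'text': '今天很开心[哈哈]'})   # -> 1 if '哈哈' is an emotion word
#   if_emoticoned_weibo({'text': '平淡的一天'})          # -> 0, no [..] emoticon present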