def __init__(self, dbpath, schema_version, pid):
    """Set up a writable, per-process Xapian index for one schema version.

    dbpath         -- base name for the index folder
    schema_version -- integer selecting Schema.v<N>
    pid            -- worker process id, kept in the folder name so each
                      process writes to its own database
    """
    # Resolve the schema definition, e.g. schema_version 2 -> Schema.v2.
    schema = getattr(Schema, 'v%s' % schema_version)
    self.schema = schema
    self.db_folder = '_%s_%s' % (dbpath, pid)
    self.s = load_scws()
    self.db = _database(self.db_folder, writable=True)
    self.termgen = xapian.TermGenerator()
    self.iter_keys = schema['origin_data_iter_keys']
    self.pre_func = schema.get('pre_func', {})
# -*- coding: utf-8 -*- import re import opencc import os import time import csv from gensim import corpora from utils import load_scws, cut, load_emotion_words from flow_psy import flow_psychology_classfiy # from test_data import input_data2 #测试输入 AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") cut_str = load_scws() cc = opencc.OpenCC("s2t", opencc_path="/usr/bin/opencc") emotions_words = load_emotion_words() emotions_words = [unicode(e, "utf-8") for e in emotions_words] t_emotions_words = [cc.convert(e) for e in emotions_words] emotions_words.extend(t_emotions_words) emotions_words = [w.encode("utf-8") for w in emotions_words] emotions_words_set = set(emotions_words) emotion_pattern = re.compile(r"\[(\S+?)\]") def if_emoticoned_weibo(r): # 微博是否包含指定的表情符号集 emotions = re.findall(emotion_pattern, r["text"]) is_emoticoned = 1 if set(emotions) & emotions_words_set else 0
import xapian
import simplejson as json
import msgpack
import datetime
import calendar
import time

# Number of documents indexed per process batch.
PROCESS_IDX_SIZE = 20000
SCHEMA_VERSION = 2
# Term prefixes used when building Xapian terms.
DOCUMENT_ID_TERM_PREFIX = 'M'
DOCUMENT_CUSTOM_TERM_PREFIX = 'X'
MONGOD_HOST = 'localhost'
MONGOD_PORT = 27017

# Shared scws segmenter instance; load_scws is imported elsewhere in
# this file — NOTE(review): not visible in this chunk, confirm.
s = load_scws()


def timeit(method):
    """Decorator: print how long `method` took each time it is called."""
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        print '%r %2.2f sec' % (method.__name__, te - ts)
        return result
    return timed


class Schema:
    # Versioned index schema definitions; v2 continues below this chunk.
    v2 = {
        'db': 'master_timeline',
# -*- coding: utf-8 -*-
import re
import opencc
import os
import time
import csv
from gensim import corpora
from utils import load_scws, cut, load_emotion_words
from flow_psy import flow_psychology_classfiy
# from test_data import input_data  # test input

# Directory with the auxiliary data files bundled next to this module.
AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

cut_str = load_scws()
# OpenCC converter, simplified -> traditional Chinese.
cc = opencc.OpenCC('s2t', opencc_path='/usr/bin/opencc')

# Build the emoticon vocabulary: take the simplified-Chinese words,
# add their traditional-Chinese equivalents, store all as UTF-8.
emotions_words = load_emotion_words()
emotions_words = [unicode(e, 'utf-8') for e in emotions_words]
t_emotions_words = [cc.convert(e) for e in emotions_words]
emotions_words.extend(t_emotions_words)
emotions_words = [w.encode('utf-8') for w in emotions_words]
emotions_words_set = set(emotions_words)

# Bracketed emoticon markup: shortest non-whitespace run inside "[...]".
emotion_pattern = re.compile(r'\[(\S+?)\]')


def if_emoticoned_weibo(r):
    """Return 1 if the weibo text contains a known emoticon, else 0."""
    found = re.findall(emotion_pattern, r['text'])
    return 1 if emotions_words_set & set(found) else 0
def test_scws(self):
    """The scws segmenter should tokenize a simple Chinese sentence."""
    segmenter = load_scws()
    text = u'中国好声音'
    tokens = cut(segmenter, text.encode('utf-8'))
    self.assertNotEqual(tokens, None, 'scws failed')
from sklearn import cross_validation
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from imblearn.over_sampling import SMOTE
import numpy as np
from gensim import corpora, models, similarities
import math
import string
from utils import single_word_whitelist, black_word, load_scws, cx_dict, LABEL_DICT

# Directory holding the per-category classification dictionaries.
AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'classify_dict')

sw = load_scws()


def read_csv(path):
    """Load every CSV file under `path` into a dict.

    Keys are the file names without the '.csv' extension; values are the
    lists of first-column cells from each row.

    NOTE: `path` is expected to end with a path separator — file names
    are appended directly (kept for backward compatibility with callers).
    """
    word_dict = dict()
    for filename in os.listdir(path):
        title = filename.replace('.csv', '')
        # `with open(...)` replaces the old `file(...)` call, which never
        # closed the handle (leak) and is gone in Python 3.
        with open(path + filename, 'rb') as fp:
            word_dict[title] = [row[0] for row in csv.reader(fp)]
    return word_dict