# Example #1 (0)
    def __init__(self, dbpath, schema_version, pid):
        """Open a writable, per-process xapian index for the given schema version."""
        # Resolve the versioned schema definition (e.g. Schema.v2).
        version_attr = 'v%s' % schema_version
        self.schema = getattr(Schema, version_attr)
        # One folder per (dbpath, pid) pair keeps concurrent writers isolated.
        folder = '_%s_%s' % (dbpath, pid)
        self.db_folder = folder
        self.s = load_scws()
        self.db = _database(folder, writable=True)

        self.termgen = xapian.TermGenerator()
        self.iter_keys = self.schema['origin_data_iter_keys']
        self.pre_func = self.schema.get('pre_func', {})
    def __init__(self, dbpath, schema_version, pid):
        """Open a writable, per-process xapian index for the given schema version."""
        # Schema.v<N> holds the field/index definitions for this version.
        self.schema = getattr(Schema, 'v%s' % schema_version)
        # Per-process folder name '_<dbpath>_<pid>' keeps writers isolated.
        self.db_folder = '_%s_%s' % (dbpath, pid)
        # scws tokenizer handle (project helper).
        self.s = load_scws()
        self.db = _database(self.db_folder, writable=True)

        self.termgen = xapian.TermGenerator()
        # Keys to iterate over in the origin data, as declared by the schema.
        self.iter_keys = self.schema['origin_data_iter_keys']
        # Optional per-field preprocessing functions; defaults to none.
        self.pre_func = self.schema.get('pre_func', {})
# -*- coding: utf-8 -*-

import re
import opencc
import os
import time
import csv
from gensim import corpora
from utils import load_scws, cut, load_emotion_words
from flow_psy import flow_psychology_classfiy

# from test_data import input_data2  # test input

# Data directory located next to this module.
AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")

# Shared scws tokenizer handle (project helper), loaded once at import time.
cut_str = load_scws()

# Simplified -> Traditional Chinese converter, used below to cover both scripts.
cc = opencc.OpenCC("s2t", opencc_path="/usr/bin/opencc")
emotions_words = load_emotion_words()
# Python 2 `unicode`: decode raw byte strings so opencc can convert them.
emotions_words = [unicode(e, "utf-8") for e in emotions_words]
t_emotions_words = [cc.convert(e) for e in emotions_words]
# Keep both the simplified and traditional spelling of every emoticon name.
emotions_words.extend(t_emotions_words)
emotions_words = [w.encode("utf-8") for w in emotions_words]
emotions_words_set = set(emotions_words)
# Matches bracketed emoticon tokens (non-greedy, no whitespace inside).
emotion_pattern = re.compile(r"\[(\S+?)\]")


def if_emoticoned_weibo(r):
    """Return 1 if the weibo text contains any known emoticon token, else 0."""
    # Extract all bracketed emoticon tokens from the post text.
    emotions = re.findall(emotion_pattern, r["text"])
    is_emoticoned = 1 if set(emotions) & emotions_words_set else 0
    # BUG FIX: the computed flag was never returned (function returned None).
    return is_emoticoned
# Example #4 (0)
import xapian
import simplejson as json
import msgpack
import datetime
import calendar
import time


# Number of documents handled per indexing process/chunk (presumably — confirm
# against the code that consumes it).
PROCESS_IDX_SIZE = 20000
SCHEMA_VERSION = 2
# Xapian term prefixes: 'M' for the unique document-id term, 'X' for
# custom (application-defined) terms.
DOCUMENT_ID_TERM_PREFIX = 'M'
DOCUMENT_CUSTOM_TERM_PREFIX = 'X'
MONGOD_HOST = 'localhost'
MONGOD_PORT = 27017

# Module-level scws tokenizer handle (project helper), shared by this module.
s = load_scws()


def timeit(method):
    """Decorator: print the wall-clock duration of every call to *method*.

    Returns the wrapped callable; the wrapped call's return value is
    passed through unchanged.
    """
    # Local import keeps this block self-contained within the file.
    from functools import wraps

    @wraps(method)  # preserve __name__/__doc__ of the decorated function
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        # Parenthesized single-argument print behaves identically on
        # Python 2 and Python 3.
        print('%r %2.2f sec' % (method.__name__, te - ts))
        return result
    return timed


class Schema:
    # Index schema definitions, keyed by version; looked up elsewhere via
    # getattr(Schema, 'v%s' % schema_version).
    v2 = {
        'db': 'master_timeline',
        # NOTE(review): the rest of the v2 mapping is truncated in this
        # chunk and not visible here.
# -*- coding: utf-8 -*-

import re
import opencc
import os
import time
import csv
from gensim import corpora
from utils import load_scws, cut, load_emotion_words
from flow_psy import flow_psychology_classfiy
# from test_data import input_data  # test input

# Data directory located next to this module.
AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

# Shared scws tokenizer handle (project helper), loaded once at import time.
cut_str = load_scws()

# Simplified -> Traditional Chinese converter, used below to cover both scripts.
cc = opencc.OpenCC('s2t', opencc_path='/usr/bin/opencc')
emotions_words = load_emotion_words()
# Python 2 `unicode`: decode raw byte strings so opencc can convert them.
emotions_words = [unicode(e, 'utf-8') for e in emotions_words]
t_emotions_words = [cc.convert(e) for e in emotions_words]
# Keep both the simplified and traditional spelling of every emoticon name.
emotions_words.extend(t_emotions_words)
emotions_words = [w.encode('utf-8') for w in emotions_words]
emotions_words_set = set(emotions_words)
# Matches bracketed emoticon tokens (non-greedy, no whitespace inside).
emotion_pattern = re.compile(r'\[(\S+?)\]')


def if_emoticoned_weibo(r):
    """Return 1 if the weibo text contains any known emoticon token, else 0."""
    found = set(re.findall(emotion_pattern, r['text']))
    if found & emotions_words_set:
        return 1
    return 0
# Example #6 (0)
 def test_scws(self):
     # The scws tokenizer should return a non-None result for Chinese text.
     text = u'中国好声音'
     tokenizer = load_scws()
     result = cut(tokenizer, text.encode('utf-8'))
     self.assertNotEqual(result, None, 'scws failed')
# Example #7 (0)
from sklearn import cross_validation
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from imblearn.over_sampling import SMOTE
import numpy as np
from gensim import corpora, models, similarities
import math
import string
from utils import single_word_whitelist, black_word, load_scws, cx_dict, LABEL_DICT

# Directory holding the per-category dictionaries, next to this module.
AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                       'classify_dict')
# Shared scws tokenizer handle (project helper), loaded once at import time.
sw = load_scws()


def read_csv(path):
    """Read every file under *path* into a dict of first-column value lists.

    path -- directory path; must end with a path separator, because each
            entry is opened as ``path + filename`` (kept as-is to preserve
            the original call contract).

    Returns {filename without '.csv': [first field of each row]}.
    """
    word_dict = {}
    for filename in os.listdir(path):
        title = filename.replace('.csv', '')
        # The Py2-only `file()` builtin is replaced by open(), and `with`
        # guarantees the handle is closed -- the original leaked one open
        # file per CSV. Text mode 'r' is what csv expects on Python 3; on
        # POSIX Python 2 it reads identically to the original 'rb'.
        with open(path + filename, 'r') as f:
            word_dict[title] = [row[0] for row in csv.reader(f)]
    return word_dict