Example #1
def separater(user_weibos):
    """Count word frequencies over a list of weibo hits.

    Each item is expected to carry its text under ['_source']['text'],
    as in an Elasticsearch search result.
    """
    s = load_scws()
    words_dict = {}
    for user_weibo in user_weibos:
        content = user_weibo['_source']['text']
        # Strip unwanted fragments, then segment the cleaned text.
        content = cut_filter(content)
        content = re_cut(content)
        separated_words = cut(s, content)
        # Accumulate word frequencies across all weibos.
        for word in separated_words:
            words_dict[word] = words_dict.get(word, 0) + 1

    return words_dict
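A minimal usage sketch (assuming load_scws, cut, cut_filter and re_cut are importable from the surrounding module, and that each item follows the Elasticsearch-hit layout read above; the sample texts are made up):

sample_hits = [
    {'_source': {'text': '今天天气不错'}},    # hypothetical hit
    {'_source': {'text': '今天心情也不错'}},  # hypothetical hit
]
counts = separater(sample_hits)
for word, freq in sorted(counts.items(), key=lambda x: x[1], reverse=True):
    print('%s\t%d' % (word, freq))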
Example #2
def input_data():  # build test input: per-user word frequencies from a CSV of weibo texts
    sw = load_scws()
    uid_weibo = dict()
    uid_list = []
    reader = csv.reader(open('./weibo_data/uid_text_0728.csv', 'rb'))
    for mid, w_text in reader:
        # Concatenate all weibo texts that belong to the same user id.
        if str(mid) in uid_weibo:
            uid_weibo[str(mid)] = uid_weibo[str(mid)] + '-' + w_text
        else:
            uid_weibo[str(mid)] = w_text
        if mid not in uid_list:
            uid_list.append(mid)

    uid_word = dict()
    for k, v in uid_weibo.items():
        words = sw.participle(v)
        word_list = dict()
        for word in words:
            # Keep nouns, verbs and adjectives from the segmentation result and drop single-character words.
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 and (word[0] not in black_word) and (word[0] not in single_word_whitelist):
                if word[0] in word_list:
                    word_list[word[0]] = word_list[word[0]] + 1
                else:
                    word_list[word[0]] = 1
        uid_word[k] = word_list

    return uid_list, uid_word
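A short usage sketch under the same assumptions (the CSV at ./weibo_data/uid_text_0728.csv exists and cx_dict, black_word and single_word_whitelist are defined in the enclosing module):

uid_list, uid_word = input_data()
for uid in uid_list:
    counts = uid_word.get(str(uid), {})
    top10 = sorted(counts.items(), key=lambda x: x[1], reverse=True)[:10]
    print('%s %s' % (uid, top10))  # ten most frequent terms for each user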
Example #3
File: user_domain.py  Project: SwoJa/ruman
#-*-coding=utf-8-*-

import os
import sys
import json
from global_utils_do import STATUS_THRE,FOLLOWER_THRE,labels,outlist,lawyerw,cut,load_scws,adminw,mediaw,businessw

s = load_scws()

def user_domain_classifier_v2(user):
    r = user
    label = labels[11]

    verified_type = r['verified_type']
    location = r['user_location']
    province = location.split(' ')[0]

    followers_count = r['fansnum']
    statuses_count = r['statusnum']

    name = r['nick_name']
    description = r['description']

    if verified_type == 4:
        label = labels[0]  # university / college weibo account

    elif verified_type == 1:
        label = labels[7]  # government agencies and officials
        
    elif verified_type == 8 or verified_type == 7 or verified_type == 2:
        if province not in outlist:
Example #4
# -*- coding: utf-8 -*-

# Build the emoticon word set (simplified and traditional Chinese) used to flag weibos.
from __future__ import division
import re
import opencc
import os
from gensim import corpora
import cPickle as pickle
#from xapian_case.utils import load_scws, cut, load_emotion_words
from global_utils_do import load_scws, cut, load_emotion_words

AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

cut_str = load_scws()

cc = opencc.OpenCC('s2t', opencc_path='/usr/bin/opencc')
# Load the emotion (emoticon) words and extend the list with the traditional-Chinese
# form of every word, so that both variants are matched.
emotions_words = load_emotion_words()
emotions_words = [unicode(e, 'utf-8') for e in emotions_words]
t_emotions_words = [cc.convert(e) for e in emotions_words]
emotions_words.extend(t_emotions_words)
emotions_words = [w.encode('utf-8') for w in emotions_words]
emotions_words_set = set(emotions_words)
emotion_pattern = re.compile(r'\[(\S+?)\]')  # bracketed emoticon tokens such as [微笑]


def if_emoticoned_weibo(r):
    # Does the weibo contain any of the specified emoticons?
    emotions = re.findall(emotion_pattern, r['text'])
    is_emoticoned = 1 if set(emotions) & emotions_words_set else 0
    return is_emoticoned
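A quick check of if_emoticoned_weibo (the record below is hypothetical; the result is 1 only if the bracketed token is in the loaded emotion word set):

sample = {'text': '今天心情不错[哈哈]'}  # hypothetical weibo record
print(if_emoticoned_weibo(sample))       # 1 if '哈哈' is an emotion word, else 0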
Example #5
# -*- coding: utf-8 -*-

import os
import scws
import time
import csv
import re
from gensim import corpora
#from xapian_case.utils import load_scws, cut, cut_filter
from global_utils_do import load_scws, cut, cut_filter
from liblinearutil import svm_read_problem, load_model, predict, save_model, train

sw = load_scws()

AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), './')
FEATURE_WORD_PATH = os.path.join(AB_PATH, './svm/dictionary_20150124.txt')
SVM_MODEL_FILE = os.path.join(AB_PATH, './svm/train.model')
TRAIN_DATA_FILE = os.path.join(AB_PATH, './train20150124.csv')
TRAIN_INPUT_FILE = os.path.join(AB_PATH, './svm/train20150124.txt')

dictionary = corpora.Dictionary.load_from_text(FEATURE_WORD_PATH)


def prepare_svm_input_file(texts, dictionary=dictionary):
    """将svm输入处理成文件
    """
    pid = os.getpid()
    svm_input_path = os.path.join(AB_PATH, './svm_test/%s.txt' % pid)

    fw = open(svm_input_path, 'w')
    for text in texts:
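        # Hypothetical continuation -- the original snippet is truncated here.
        # One plausible sketch (not the project's actual code): turn each text into
        # a libsvm-format feature line via the gensim dictionary loaded above.
        words = cut(sw, text)                      # segment with the scws handle
        bow = dictionary.doc2bow(words)            # [(feature_id, count), ...]
        line = ' '.join('%d:%d' % (fid + 1, cnt) for fid, cnt in sorted(bow))
        fw.write('0 %s\n' % line)                  # '0' is a placeholder label
    fw.close()
    return svm_input_path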
Example #6
    return connection


def ts2datetime(timestamp):
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp))


def ts2date(timestamp):
    return time.strftime('%Y%m%d', time.localtime(timestamp))


def datetime2ts(date):
    return int(time.mktime(time.strptime(date, '%Y-%m-%d %H:%M:%S')))
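A round-trip check of the timestamp helpers (they use local time, so the formatted strings depend on the machine's timezone; the value below is arbitrary):

ts = 1438056000
print(ts2datetime(ts))                     # e.g. '2015-07-28 12:00:00' in UTC+8
print(ts2date(ts))                         # e.g. '20150728'
print(datetime2ts(ts2datetime(ts)) == ts)  # True: round-trips back to the original timestamp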


s = load_scws()
cx_dict = set([
    'Ag', 'a', 'an', 'Ng', 'n', 'nr', 'ns', 'nt', 'nz', 'Vg', 'v', 'vd', 'vn',
    '@', 'j'
])  # POS tags accepted when extracting keywords

EXTRA_BLACK_LIST_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'black.txt')


def load_black_words():
    """Load the extra keyword blacklist, one word per line."""
    one_words = set(
        [line.strip('\r\n') for line in open(EXTRA_BLACK_LIST_PATH)])
    return one_words
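A sketch of how these pieces combine for keyword extraction, mirroring the filter used in Example #2 (assumes the scws handle exposes participle() returning (word, POS, ...) tuples and that black.txt exists next to this module):

black_words = load_black_words()
text = '这是一段用于测试的微博文本'  # hypothetical input text
keywords = {}
for item in s.participle(text):
    word, flag = item[0], item[1]
    if flag in cx_dict and word not in black_words:
        keywords[word] = keywords.get(word, 0) + 1
print(keywords)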