Exemplo n.º 1
0
def load_weibo(uid_weibo):
    """Score every user's weibo text against each training domain.

    The training data is loaded with ``load_train()``; each user's text is
    segmented with the scws tokenizer, filtered, and passed to ``com_p`` once
    per domain.

    Args:
        uid_weibo: mapping of user id -> text handed to ``sw.participle``.

    Returns:
        dict mapping each user id to the per-domain result dict (created by
        ``start_p(name_list)`` and filled with the values ``com_p`` returns).

    Side effects:
        Prints timing information to stdout (training load time, every
        ``com_p`` call, and the total time per user).
    """
    ts = time.time()
    domain_dict, domain_count = load_train()
    end = time.time()

    print('%s' % (end - ts))

    # Vocabulary size per domain, and the sum over all domains.
    len_dict = dict()
    total = 0
    for k, v in domain_dict.items():
        len_dict[k] = len(v)
        total = total + len(v)

    sw = load_scws()
    black = load_black_words()
    result_data = dict()
    ts = time.time()
    for k, v in uid_weibo.items():
        words = sw.participle(v)
        domain_p = start_p(name_list)
        word_list = dict()
        for word in words:
            term = word[0]
            # Keep words whose tag (word[1]) is in cx_dict -- nouns, verbs and
            # adjectives according to the original comment -- and drop
            # single-character words, blacklisted words and whitelisted
            # single words.
            # The last clause admits each distinct word only once, so every
            # kept word is recorded with a count of 1.  (The original code had
            # an unreachable "increment" branch behind this same condition.)
            if (word[1] in cx_dict and 3 < len(term) < 30
                    and term not in black
                    and term not in single_word_whitelist
                    and term not in word_list):
                word_list[term] = 1
        for d_k in domain_p.keys():
            start = time.time()
            # Probability that the document belongs to this class.
            domain_p[d_k] = com_p(word_list, domain_dict[d_k],
                                  domain_count[d_k], len_dict[d_k],
                                  total)
            end_time = time.time()
            print('%s' % (end_time - start))
        result_data[k] = domain_p
        end = time.time()
        print('%s takes %s...' % (k, end - ts))
        ts = end

    return result_data
Exemplo n.º 2
0
def load_weibo(uid_weibo):
    """Compute, for each user, a per-domain score of their weibo text.

    Training data comes from ``load_train()``.  Each user's text is segmented
    by the scws tokenizer, reduced to a set of filtered words, and scored
    against every domain with ``com_p``.

    Args:
        uid_weibo: mapping of user id -> text handed to ``sw.participle``.

    Returns:
        dict mapping each user id to the dict returned by
        ``start_p(name_list)`` after its values were replaced by the
        ``com_p`` results.

    Side effects:
        Prints timing information to stdout (training load time, each
        ``com_p`` call, and the total time per user).
    """
    ts = time.time()
    domain_dict, domain_count = load_train()
    end = time.time()

    print('%s' % (end - ts))

    # Number of entries per domain and the grand total across domains.
    len_dict = dict()
    total = 0
    for k, v in domain_dict.items():
        len_dict[k] = len(v)
        total = total + len(v)

    sw = load_scws()
    black = load_black_words()
    result_data = dict()
    ts = time.time()
    for k, v in uid_weibo.items():
        words = sw.participle(v)
        domain_p = start_p(name_list)
        word_list = dict()
        for word in words:
            term = word[0]
            # Select the nouns, verbs and adjectives of the segmentation
            # result (tag in cx_dict) and drop single-character words,
            # blacklisted words and whitelisted single words.
            # Because a word is only accepted while it is not yet in
            # word_list, each distinct word ends up with a count of exactly 1;
            # the original has_key/increment branch could never execute.
            if (word[1] in cx_dict and 3 < len(term) < 30
                    and term not in black
                    and term not in single_word_whitelist
                    and term not in word_list):
                word_list[term] = 1
        for d_k in domain_p.keys():
            start = time.time()
            # Probability that the document belongs to this class.
            domain_p[d_k] = com_p(word_list, domain_dict[d_k], domain_count[d_k], len_dict[d_k], total)
            end_time = time.time()
            print('%s' % (end_time - start))
        result_data[k] = domain_p
        end = time.time()
        print('%s takes %s...' % (k, end - ts))
        ts = end

    return result_data
Exemplo n.º 3
0
def read_csv(domain_dict,domain_count,d_time):
    """Add the words found in ``./add_dict/<d_time>_new.csv`` to a domain.

    The CSV content is flattened into one comma-separated string, segmented
    with the scws tokenizer, filtered, and counted into ``domain_dict``.

    Args:
        domain_dict: word -> count mapping; updated in place.
        domain_count: running total of counted words.
        d_time: value substituted into the CSV file name.

    Returns:
        Tuple ``(domain_dict, domain_count)`` with the updated mapping and
        the new total.
    """
    sw = load_scws()
    black = load_black_words()

    # csv.reader yields every row as a list of cells, so the cells have to be
    # joined before they can become part of the text (concatenating the list
    # to a string raised TypeError).  The with-block closes the file handle.
    with open('./add_dict/%s_new.csv'% d_time, 'rb') as csv_file:
        rows = [','.join(row) for row in csv.reader(csv_file)]
    text = ','.join(rows)

    words = sw.participle(text)
    for word in words:
        term = word[0]
        # Select the nouns, verbs and adjectives of the segmentation result
        # (tag in cx_dict); drop single-character words unless whitelisted,
        # and drop blacklisted words.
        if (word[1] in cx_dict) and (3 < len(term) < 30 or term in single_word_whitelist) and (term not in black):
            key = str(term)
            domain_dict[key] = domain_dict.get(key, 0) + 1
            domain_count = domain_count + 1

    return domain_dict,domain_count
Exemplo n.º 4
0
def read_csv(domain_dict, domain_count, d_time):
    """Merge the words of ``./add_dict/<d_time>_new.csv`` into a domain.

    All rows of the CSV file are flattened into a single comma-separated
    string, segmented with the scws tokenizer, filtered, and tallied into
    ``domain_dict``.

    Args:
        domain_dict: word -> count mapping; updated in place.
        domain_count: running total of counted words.
        d_time: value substituted into the CSV file name.

    Returns:
        Tuple ``(domain_dict, domain_count)`` with the updated mapping and
        the new total.
    """
    sw = load_scws()
    black = load_black_words()

    # Each row produced by csv.reader is a list of cells; join them first,
    # since adding a list to a string raises TypeError.  The with-block also
    # guarantees the file handle is closed.
    with open('./add_dict/%s_new.csv' % d_time, 'rb') as csv_file:
        rows = [','.join(row) for row in csv.reader(csv_file)]
    text = ','.join(rows)

    words = sw.participle(text)
    for word in words:
        term = word[0]
        # Select the nouns, verbs and adjectives of the segmentation result
        # (tag in cx_dict); drop single-character words unless whitelisted,
        # and drop blacklisted words.
        if (word[1] in cx_dict
                and (3 < len(term) < 30 or term in single_word_whitelist)
                and term not in black):
            key = str(term)
            domain_dict[key] = domain_dict.get(key, 0) + 1
            domain_count = domain_count + 1

    return domain_dict, domain_count
Exemplo n.º 5
0
# -*- coding: UTF-8 -*-

import os
import time
import re
import scws
import csv
import sys
import json
from textrank4zh import TextRank4Keyword, TextRank4Sentence
from config import load_scws,load_black_words,re_cut

# Words returned by load_black_words(), loaded once at import time;
# get_keyword() skips any keyword that appears in this collection.
black = load_black_words()
# Single TextRank4Keyword instance reused by every get_keyword() call.
tr4w = TextRank4Keyword()

def get_keyword(w_text, n_gram, n_count):
    """Extract weighted keywords from ``w_text`` with the shared analyzer.

    Runs the module-level ``tr4w`` analyzer on the lower-cased text using
    ``n_gram`` as the window, requests ``n_count`` keywords of at least two
    characters, and discards keywords that are purely numeric or listed in
    the module-level ``black`` collection.

    Returns:
        dict mapping each remaining keyword (UTF-8 encoded) to its weight.
    """
    tr4w.analyze(text=w_text, lower=True, window=n_gram)

    keywords = {}
    for entry in tr4w.get_keywords(n_count, word_min_len=2):
        token = entry.word.encode('utf-8')
        if not (token.isdigit() or token in black):
            keywords[token] = entry.weight

    return keywords

def get_weibo_single(text,n_gram=2,n_count=3):
    '''
        针对单条微博提取关键词,但是效率比较低
        输入数据: