Example #1
def save_ts(topic, excel_name):  # the '0' has no special meaning
    start_ts = [1396918800, 1396918800, 1396920000, 1396922400, 1396929600, 1396928400,\
                1397032200, 1397045700, 1397096100, 1397089200, 1397138400]
    end_ts = [1396918900,1396920300, 1396927000, 1396923400, 1396931000, 1396930000,\
              1397033200, 1397130000, 1397098000, 1397089900, 1397140000]
    s = load_scws()
    for i in range(11):
        #item = OpinionTestTime(topic, str(i), start_ts[i], end_ts[i])
        # fetch the highest-weighted weibo for this child_topic and segment it
        data = xlrd.open_workbook(excel_name)
        table_weibo = data.sheet_by_name(str(i))
        line = table_weibo.row_values(0) # read the first row of the sheet, i.e. the highest-weighted weibo text
        weibo = line[1] # get the weibo text
        term_list = cut(s, weibo.encode('utf8'))
        #print 'term_list:', term_list
        child_topic = json.dumps({str(i): term_list})
        item = OpinionTestTime(topic, child_topic, start_ts[i], end_ts[i])
        # replace any existing record with the same topic, child_topic and time range
        item_exist = db.session.query(OpinionTestTime).filter(OpinionTestTime.topic==topic, \
                                                              OpinionTestTime.child_topic==child_topic, \
                                                              OpinionTestTime.start_ts==start_ts[i], \
                                                              OpinionTestTime.end_ts==end_ts[i]).first()
        if item_exist:
            db.session.delete(item_exist)
        db.session.add(item)
    db.session.commit()
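A minimal usage sketch for save_ts: the function expects an Excel workbook whose sheets are named '0' through '10', with each sheet's first row holding that child topic's highest-weighted weibo (text in the second column). The topic name and file name below are placeholders, not taken from the original code.

save_ts(u'test_topic', 'child_topics.xls')  # hypothetical arguments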
Example #3
import datetime
from case.model import *
from case.extensions import db
from case.moodlens import pie as pieModule
from case.identify import utils as identifyModule
import search as searchModule
from case.time_utils import ts2datetime, ts2date
from xapian_case.xapian_backend import XapianSearch
from xapian_case.utils import cut, load_scws
from case.dynamic_xapian_weibo import getXapianWeiboByTopic
from case.global_config import XAPIAN_USER_DATA_PATH
from case.Database import Event, EventManager
from case.topic_manage import topics_name_start_end
from flask import Blueprint, url_for, render_template, request, abort, flash, session, redirect, make_response

scws = load_scws()

mod = Blueprint('case', __name__, url_prefix='/index')

xapian_search_weibo = getXapianWeiboByTopic()

em = EventManager()


def acquire_user_by_id(uid):
    user_search = XapianSearch(path=XAPIAN_USER_DATA_PATH,
                               name='master_timeline_user',
                               schema_version=1)
    result = user_search.search_by_id(int(uid),
                                      fields=[
                                          'name', 'location',
Example #4
    # db.authenticate('root', 'root')
    db = getattr(connection, usedb)
    return db

def ts2date(timestamp):
    return time.strftime('%Y-%m-%d', time.localtime(timestamp))


def ts2datetime(timestamp):
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp))


def datetime2ts(date):
    return int(time.mktime(time.strptime(date, '%Y-%m-%d %H:%M:%S')))
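# Round-trip sketch (hypothetical timestamp): ts2datetime and datetime2ts are inverses
# as long as both run in the same local timezone, e.g.
# datetime2ts(ts2datetime(1396918800)) == 1396918800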

s = load_scws()

cx_dict = set(['Ag','a','an','Ng','n','nr','ns','nt','nz','Vg','v','vd','vn','@','j']) # POS whitelist for keywords: keep nouns, verbs and adjectives
cx_dict_noun = set(['Ng','n','nr','ns','nt','nz']) # POS whitelist for keywords: keep nouns only


def load_black_words():
    one_words = set([line.strip('\r\n') for line in file(EXTRA_BLACK_LIST_PATH)])
    return one_words

black_words = load_black_words()

def cut_words(text):
    '''Segmentation: apply the blacklist, filter out single-character words, and keep only nouns, verbs and adjectives
       input
           texts: a list of utf-8 texts
Example #5
# -*- coding: utf-8 -*-

import collections
from xapian_case.utils import load_scws, cut

sw = load_scws()
total_keywords_list = []

f = open('../source/domain_training_text.txt')

for line in f:
    text = line.strip()
    terms = cut(sw, text)
    total_keywords_list.extend(terms)

f.close()

ct = collections.Counter(total_keywords_list)
keywords_results = ct.most_common(100)
fw = open('../source/domain_keywords_20150618.txt', 'w')
for keyword, count in keywords_results:
    fw.write("%s\n" % keyword)
fw.close()
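For reference, a tiny self-contained sketch of the counting step above; the toy tokens are made up, while in the real script they come from cut(sw, text):

import collections

toy_terms = ['market', 'market', 'reform', 'market']  # hypothetical tokens
ct = collections.Counter(toy_terms)
print ct.most_common(1)  # [('market', 3)]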
Example #6
# -*- coding: utf-8 -*-

#  emoticon detection for weibo text
from __future__ import division
import re
import opencc
import os
from gensim import corpora
import cPickle as pickle
from xapian_case.utils import load_scws, cut, load_emotion_words

AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

cut_str = load_scws()

cc = opencc.OpenCC('s2t', opencc_path='/usr/bin/opencc')
emotions_words = load_emotion_words()
emotions_words = [unicode(e, 'utf-8') for e in emotions_words]
t_emotions_words = [cc.convert(e) for e in emotions_words]
emotions_words.extend(t_emotions_words)
emotions_words = [w.encode('utf-8') for w in emotions_words]
emotions_words_set = set(emotions_words)
emotion_pattern = re.compile(r'\[(\S+?)\]')


def if_emoticoned_weibo(r):
    # does this weibo contain any emoticon from the specified set
    emotions = re.findall(emotion_pattern, r['text'])
    is_emoticoned = 1 if set(emotions) & emotions_words_set else 0
    return is_emoticoned
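A hedged usage sketch for if_emoticoned_weibo: it expects a dict with a 'text' field and returns 1 only when a bracketed emoticon from emotions_words_set appears in it; whether the sample emoticon below is actually in the set returned by load_emotion_words is an assumption.

weibo = {'text': '今天心情不错[哈哈]'}  # hypothetical weibo record
print if_emoticoned_weibo(weibo)  # 1 if '哈哈' is in the loaded emotion word set, else 0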
Example #7
# -*- coding: utf-8 -*-

import os
import scws
import time
import csv
import re
from gensim import corpora
from xapian_case.utils import load_scws, cut, cut_filter
from liblinearutil import svm_read_problem, load_model, predict, save_model, train

sw = load_scws()

AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), './')
FEATURE_WORD_PATH = os.path.join(AB_PATH, './svm/dictionary_20150124.txt')
SVM_MODEL_FILE = os.path.join(AB_PATH, './svm/train.model')
TRAIN_DATA_FILE = os.path.join(AB_PATH, './train20150124.csv')
TRAIN_INPUT_FILE = os.path.join(AB_PATH, './svm/train20150124.txt')

dictionary = corpora.Dictionary.load_from_text(FEATURE_WORD_PATH)


def prepare_svm_input_file(texts, dictionary=dictionary):
    """将svm输入处理成文件
    """
    pid = os.getpid()
    svm_input_path = os.path.join(AB_PATH, './svm_test/%s.txt' % pid)

    fw = open(svm_input_path, 'w')
    for text in texts:
        words = cut(sw, text)