Example #1
"""
This script: for the specified group, iterate over its TopicInfo and CommentInfo.
Note: Do one thing, do it well.
"""
from datetime import datetime
import logging
import os

from logconfig import congifLogger
from utils import is_between
from prepare import TRAIN_START_DATE, TRAIN_END_DATE
from prepare import TEST_START_DATE, TEST_END_DATE
from prepare import load_user_list

# config logging
log = logging.getLogger('Main.behavior')
congifLogger("behavior.log", 5)


def load_topic():
    """ 读入所有的topic信息
    Note:这里按照有title信息的格式来
    """
    log.info('Loading topic info...')
    # map topic_id --> dict()
    topic_dict = dict()
    f = open(TOPIC_ALL_FILE_PATH, 'r')
    ft = open(TOPIC_FILE_PATH, 'w')  # stores this group's topics
    row = ''
    for line in f:
        line = line.strip()
        if line != '[*ROWEND*]':
            # accumulate this record's lines until the row-end marker
            row += line
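
# A minimal sketch (not from the original script; the field names are
# illustrative assumptions) of how one accumulated record could be parsed
# once the '[*ROWEND*]' marker is reached: fields are joined with the
# '[=]' delimiter used throughout these examples.
def parse_topic_row(row):
    # split the serialized record back into its fields
    fields = row.split('[=]')
    topic = dict()
    topic['topic_id'] = fields[0]  # assumed field order
    topic['title'] = fields[-1]    # assumed field order
    return topic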

Example #2
    def _isHttpOrHttpsProtocol(self, href):
        # (assumed enclosing method for this fragment, which began mid-method)
        protocol = urlparse(href).scheme
        if protocol == 'http' or protocol == 'https':
            return True
        return False
        
    def _getAlreadyVisitedNum(self):
        # visited_href holds links already handed to the task queue, some of
        # which may still be in flight; the true number of visited links is
        # therefore len(visited_href) minus the number of tasks still pending.
        if len(self.visited_href) == 0:
            return 0
        else:
            return len(self.visited_href) - self.thread_pool.getTaskLeft()
        
if __name__ == "__main__":
    stacktracer.trace_start("trace.html",interval=5,auto=True) # Set auto flag to always update file!
    congifLogger("log/thread-id-crawler.log", 5)
    
    # Start crawling from this URL; each page yields the next page's address, so only a single process can be used.
    section_id = 'free'
    start_url = 'http://bbs.tianya.cn/list-%s-1.shtml' % (section_id)
    print "Start URL:", start_url
    
    base_path = '/home/kqc/dataset/tianya-forum/'
    time_now = datetime.now()    
    post_id_list_path = 'post-id-list.txt'
    
    tcrawler = PostIDCrawler(start_url, 1, post_id_list_path, max_post_num = 10000)
    tcrawler.start()
    
    stacktracer.trace_stop()

Example #3
    '''
        for topic_id in self.topic_dict:
            topic = self.topic_dict[topic_id]
            s = topic.getSimpleString(delimiter = '[=]')
            ftopic.write(s + '\n[*ROWEND*]\n')
            for comment in topic.comment_list:
                cs = comment.getSimpleString(delimiter = '[=]')
                fcomment.write(cs + '\n[*ROWEND*]\n')
                
        ftopic.close()
        fcomment.close()
     '''
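
# A minimal sketch (an illustrative assumption, not the original class code)
# of the serialization format used in the block above: getSimpleString joins
# one record's fields with the '[=]' delimiter, and each record in the file
# is terminated by a '[*ROWEND*]' line.
def write_record(fout, fields, delimiter='[=]'):
    # fields: the string field values of one topic or comment
    fout.write(delimiter.join(fields) + '\n[*ROWEND*]\n')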
        
if __name__ == "__main__":
    LINE_FEED = "\n"  # newline used when writing output records
    stacktracer.trace_start("trace.html",interval=5,auto=True) # Set auto flag to always update file!
    congifLogger("log/comment_crawler.log", 5)
    #group_id_list = ['FLL', '294806', 'MML']
    #group_id_list = ['test']
    #group_id_list = ['70612', 'FLL']
    group_id_list = []
    if len(sys.argv) <= 1:
        print "Group IDs were not provided."
        sys.exit()
    # add group ids
    for i in range(1, len(sys.argv)):
        group_id_list.append(sys.argv[i])
        
    print "Crawling comments for groups: ", group_id_list
    
    MAX_TOPIC_NUM = float('inf')  # maximum number of topics to process per group
    group_id = group_id_list[0]
Example #4
"""
Prepare all of the training and test sets: append to each topic all of the users who commented on it.
"""
import sys
import logging
from datetime import datetime
import codecs

from logconfig import congifLogger
from utils import is_between
from prepare import load_topic_user, load_comment_user
from prepare import TRAIN_START_DATE, TRAIN_END_DATE
from prepare import TEST_START_DATE, TEST_END_DATE

# config logging
log = logging.getLogger('Main.prepare_train_test')
congifLogger("prepare_train_test.log", 5)

# Extract comments only up to this cutoff date
COMMENT_END_DATE = datetime(2013, 3, 1)

# Note: all of the topic and comment info must be loaded into memory here.
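
# A minimal sketch (an illustrative assumption, not the original
# implementation) of the join described in the docstring above: append to
# each topic row the users who commented on that topic. The mappings and
# field layout below are hypothetical.
def append_comment_users(topic_users, comment_users):
    # topic_users: topic_id -> uid of the topic creator
    # comment_users: topic_id -> list of uids who commented on it
    rows = []
    for topic_id in topic_users:
        uid_list = comment_users.get(topic_id, [])
        # one output row per topic: topic fields first, then its commenters
        rows.append(topic_id + '[=]' + topic_users[topic_id] + '[=]' + ','.join(uid_list))
    return rows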

def main(argv):
    if len(argv) < 2:
        print 'Group ID not provided.'
        sys.exit(1)
        
    group_id = argv[1]
    log.info('Prepare training set and test set for group: %s' % group_id)
    
    path = 'tables/' + group_id + '/TopicInfo-raw-all-' + group_id

Example #5
        f = open(topic_list_path, 'w')
        for tid in this_group.stick_topic_list:
            f.write(tid + "\n")
            
        f.write("\n")
        for tid in self.topic_list:
            f.write(tid + "\n")
            
        f.close()
        
        self.topic_list = list()
        self.failed_href = set()
        
if __name__ == "__main__":
    stacktracer.trace_start("trace.html",interval=5,auto=True) # Set auto flag to always update file!
    congifLogger("log/topicCrawler.log", 5)
    
    group_id_list = []
    if len(sys.argv) <= 1:
        print "Group IDs were not provided."
        sys.exit()
    # add group ids
    for i in range(1, len(sys.argv)):
        group_id_list.append(sys.argv[i])
        
    print "Crawling topic list for groups: ", group_id_list
    #tcrawler = TopicCrawler(['FLL', '294806', 'MML'], 5)
    #tcrawler = TopicCrawler(['70612'], 5) # the "We are all academic girls" group
    #tcrawler = TopicCrawler(['ustv'], 5) # the "US TV series fans" group
    for group_id in group_id_list:
        base_path = '/home/kqc/dataset/douban-group/'
Example #6
import logging
from lxml import etree  # use XPath from lxml
import codecs
import pdb
import time

from bs4 import BeautifulSoup 

from threadPool import ThreadPool
from logconfig import congifLogger
from webPage import WebPage

import stacktracer

# config logging
log = logging.getLogger('Main.crawl_title')
congifLogger("crawl_title.log", 5)

failed_set = set()

def task_handler(topic_id, seg_list):
    f = codecs.open('tables/TopicInfo-title.txt', 'a','utf-8')
    url = 'http://www.douban.com/group/topic/' + topic_id + '/'
    print 'Visiting: ', url
    webPage = WebPage(url)
    flag = webPage.fetch()
    
    if flag:
        url, pageSource = webPage.getDatas()  # pageSource is already unicode
        page = etree.HTML(pageSource)
        content = page.xpath(u"/html/body/div[@id='wrapper']/div[@id='content']")[0]
        tmp = page.xpath(u"//table[@class='infobox']//td[@class='tablecc']")
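        # A hedged sketch (an assumption, not the original code) of a natural
        # next step: on douban topic pages the td[@class='tablecc'] cell, when
        # present, holds the full untruncated title, so prefer it over the
        # default page title.
        if len(tmp) > 0:
            # take the text content of the tablecc cell as the full title
            title = tmp[0].xpath('string(.)').strip()
            print 'Full title:', title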
Example #7
import sys
import logging
import codecs
from datetime import datetime

reload(sys)
sys.setdefaultencoding('utf-8')

from gensim import corpora, models, similarities
from gensim.models.ldamodel import LdaModel

from logconfig import congifLogger

STOP_WORDS_PATH = 'dataset/chinese-english-stopwords.txt'

# config logging
log = logging.getLogger('Main.train_lda')
congifLogger("train_lda.log", 5)

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, filename = "train_lda.log")

def load_stop_words():
    """导入所有的中文停用词
    """
    stoplist = set()
    f = codecs.open(STOP_WORDS_PATH, 'r', 'utf-8')
    for line in f:
        line = line.strip()
        stoplist.add(line)
        
    return stoplist
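
# A minimal sketch (parameter values and names are illustrative assumptions,
# not from the original script) of how the stop words and documents could
# feed gensim's LDA training.
def train_lda(documents, num_topics=50):
    # documents: a list of token lists, already segmented and stop-filtered
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    return lda, dictionary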
    
def load_documents(file_path):
Example #8
# coding=utf-8

"""
This script examines the probability output of Logistic Regression (via LIBLINEAR) and computes precision@K.
"""
import logging
import sys
import random

from operator import itemgetter
from logconfig import congifLogger

# config logging
log = logging.getLogger('Main.statics')
congifLogger("prediction_statics.log", 5)

def load_user(filepath):
    """ 按照文件中user id的顺序读入,返回list
    """
    uid_list = []
    f = open(filepath)
    for line in f:
        line = line.strip()
        uid_list.append(line)
        
    return uid_list
    
def load_test_topic(filepath):
    """ 导入测试topic的index以及对应的真正的参与人数
    """
    topic_list = []
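
# A minimal sketch of precision@K itself (the names below are illustrative
# assumptions, not the original script's): rank users by predicted
# probability and measure what fraction of the top K actually participated.
def precision_at_k(prob_list, true_uid_set, k):
    # prob_list: (uid, probability) pairs from the LR model's output
    ranked = sorted(prob_list, key=itemgetter(1), reverse=True)
    top_k = [uid for uid, prob in ranked[:k]]
    hits = len([uid for uid in top_k if uid in true_uid_set])
    return float(hits) / k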
Example #9
import os
import codecs
import logging

from gensim import models, corpora

from logconfig import congifLogger
from prepare import load_user_list
from utils import is_between
from prepare_corpus import seg_chinese, remove_url

# Everything except the training-set data can be used as content for user interests
from prepare import TRAIN_START_DATE, TRAIN_END_DATE
from prepare import TEST_START_DATE, TEST_END_DATE

# config logging
log = logging.getLogger('Main.interest')
congifLogger("gen_user_interest.log", 5)

def save_interest_info(interest_path, interest_info, model, dictionary):
    """ 将用户的兴趣向量和行为信息写入到文件
    """
    # 写入感兴趣的文本
    f = codecs.open(interest_path, 'a', 'utf-8')
    for uid in interest_info:
        text = interest_info[uid][1]
        # If the user has no interest text at all, they have neither posted a topic nor commented.
        text = remove_url(text)
        text = seg_chinese(text)
        document = text.split(' ')
        doc_bow = dictionary.doc2bow(document)
        doc_lda = model.__getitem__(doc_bow, eps = 0)
        #print doc_lda
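        # A hedged sketch (an assumption, not the original code) of a natural
        # next step: expand the sparse (topic_id, prob) pairs in doc_lda into
        # a dense vector of length model.num_topics before writing it out.
        dense = [0.0] * model.num_topics
        for topic_id, prob in doc_lda:
            dense[topic_id] = prob
        f.write(uid + '[=]' + ' '.join(str(p) for p in dense) + '\n')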
Example #10
#encoding=utf-8

"""
This module contains functions and variables used to prepare the data sets.
"""

from datetime import datetime
import codecs
import logging

from utils import is_between
from logconfig import congifLogger

# config logging
log = logging.getLogger('Main.prepare')
congifLogger("prepare.log", 5)

# Start and end dates of the training set
TRAIN_START_DATE = datetime(2012, 8, 1)
TRAIN_END_DATE = datetime(2012, 11, 1)

TEST_START_DATE = datetime(2012, 11, 1)
TEST_END_DATE = datetime(2013, 1, 1)

# Bounds on the earliest and latest possible times
VERY_EARLY_TIME = datetime(1900, 1, 1)
VERY_LATE_TIME = datetime(2050, 1, 1)
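
# For illustration: the assumed semantics of utils.is_between, which these
# examples use to bucket records into the train/test windows (a hypothetical
# local sketch; the real helper lives in utils.py).
def _is_between_sketch(date, start_date, end_date):
    # a half-open interval means TRAIN_END_DATE == TEST_START_DATE does not
    # double-count records that fall exactly on the boundary
    return start_date <= date < end_date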

def load_topic_user(filepath, start_date = VERY_EARLY_TIME, end_date = VERY_LATE_TIME):
    """ 根据时间范围,导入所有的topic以及参与的user id
    注意:topic可能有commentlist或者没有
Example #11
import codecs
import logging
from datetime import datetime
import os

from logconfig import congifLogger
from prepare import load_user_list
from utils import is_between

# Everything except the training-set data can be used as content for user interests
from prepare import TRAIN_START_DATE, TRAIN_END_DATE
from prepare import TEST_START_DATE, TEST_END_DATE

# config logging
log = logging.getLogger('Main.behavior')
congifLogger("gen_user_behavior.log", 5)

    
def save_behavior(behavior_path, behavior):
    # write out the behavior info
    f = codecs.open(behavior_path, 'a', 'utf-8')
    for uid in behavior:
        f.write(uid + '[=]' + str(behavior[uid][0]) + '[=]' + str(behavior[uid][1]) + '\n')
    f.close()
    
def get_interested_topic(uid_list, comment_path):
    """ 从comment info中获取用户感兴趣的topic list列表(即评论的某个topic的id列表 ),
    并统计用户的评论次数
    """
    user_set = set(uid_list)
    # user id ==> (num_comments, num_topics)
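    # A hedged sketch (the comment-file field layout is an assumption) of how
    # this mapping could be built: scan the comment file, count comments per
    # user, and collect the distinct topics each user commented on.
    behavior = dict()
    f = codecs.open(comment_path, 'r', 'utf-8')
    for line in f:
        fields = line.strip().split('[=]')
        uid, topic_id = fields[0], fields[1]  # assumed field positions
        if uid not in user_set:
            continue
        num_comments, topic_set = behavior.get(uid, (0, set()))
        topic_set.add(topic_id)
        behavior[uid] = (num_comments + 1, topic_set)
    f.close()
    return behavior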
Example #12
            quote_comment = self.find_previous_comment(i, uname, date)
            if quote_comment is None:
                log.error('Quote comment not found for comment: %s in post: %s, in group: %s' % (comment.cid, self.post_id, self.section_id))
                log.error('Current comment content: %s\n\n' % comment.content)
            else:
                # link the quoted comment that was found
                comment.quote = quote_comment
                #print comment.get_simple_string("[=]")
                log.info(u'Comment %s by %s quotes comment %s by %s' % (comment.cid, comment.user_name, comment.quote.cid, comment.quote.user_name))
        
if __name__ == "__main__":
    import sys
    import codecs
    sys.stdout = (codecs.getwriter('utf8'))(sys.stdout)
    
    congifLogger("log/models.log", 5)
    
    post = Post('4318716', u'free')
    #f = codecs.open(u"./testpage/舌尖上的厨娘 (配图,配过程),挑战你的味蕾_天涯杂谈_天涯论坛.html", "r", 'utf8') # first page
    f = codecs.open(u"./testpage/温故512:汶川地震的坍塌及重建_天涯杂谈_天涯论坛.html", "r", 'utf8') # first page    
    strfile_page1 = f.read()
    f.close()
    """
    f = codecs.open(u"./testpage/舌尖上的厨娘 (配图,配过程),挑战你的味蕾(第2页)_天涯杂谈_天涯论坛.html", "r", 'utf8') # first page    
    strfile_page2 = f.read()
    f.close()
    f = codecs.open(u"./testpage/舌尖上的厨娘 (配图,配过程),挑战你的味蕾(第3页)_天涯杂谈_天涯论坛.html", "r", 'utf8') # first page    
    strfile_page3 = f.read()
    f.close()
    """
    # crawl the comments
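
# A minimal sketch (an assumption, not the original implementation) of what a
# find_previous_comment-style lookup could do: walk backwards from the quoting
# comment and return the most recent earlier comment whose author and date
# match the quoted header. The .date attribute is hypothetical.
def find_previous_comment_sketch(comment_list, index, uname, date):
    for comment in reversed(comment_list[:index]):
        if comment.user_name == uname and comment.date == date:
            return comment
    return None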