"""
This script is responsible for: given a specified group, walking through
its TopicInfo and CommentInfo records.
Note: Do one thing, do it well.
"""
from datetime import datetime
import logging
import os

from logconfig import congifLogger
from utils import is_between
from prepare import TRAIN_START_DATE, TRAIN_END_DATE
from prepare import TEST_START_DATE, TEST_END_DATE
from prepare import load_user_list

# config logging
log = logging.getLogger('Main.behavior')
congifLogger("behavior.log", 5)

def load_topic():
    """ Load all of the topic info.
    Note: this follows the record format that includes the title field.
    """
    log.info('Loading topic info...')
    # map topic_id --> dict()
    topic_dict = dict()
    # TOPIC_ALL_FILE_PATH / TOPIC_FILE_PATH are assumed to be defined
    # elsewhere in this script as module-level constants.
    f = open(TOPIC_ALL_FILE_PATH, 'r')
    ft = open(TOPIC_FILE_PATH, 'w') # stores the topics of this group
    row = ''
    for line in f:
        line = line.strip()
        if line != '[*ROWEND*]':
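# --- Hedged sketch (not part of the original script) ---
# The truncated loop above accumulates physical lines until the '[*ROWEND*]'
# marker, which terminates one logical record whose fields are joined by
# '[=]' (the writer code elsewhere in this repo produces exactly that
# layout). A minimal stand-alone reader for the format might look like this;
# iter_rows is a hypothetical helper, not a function from the repo.
def iter_rows(path):
    """ Yield one list of fields per '[*ROWEND*]'-terminated record. """
    row_lines = []
    f = open(path, 'r')
    for line in f:
        line = line.strip()
        if line == '[*ROWEND*]':
            if row_lines:
                yield '\n'.join(row_lines).split('[=]')
            row_lines = []
        else:
            row_lines.append(line)
    f.close()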
        protocol = urlparse(href).scheme
        if protocol == 'http' or protocol == 'https':
            return True
        return False

    def _getAlreadyVisitedNum(self):
        # visited_href holds the links already handed to the taskQueue, some
        # of which may still be in flight. The true number of visited links
        # is therefore len(visited_href) minus the number of links still
        # waiting to be processed.
        if len(self.visited_href) == 0:
            return 0
        else:
            return len(self.visited_href) - self.thread_pool.getTaskLeft()

if __name__ == "__main__":
    stacktracer.trace_start("trace.html", interval=5, auto=True) # Set auto flag to always update file!
    congifLogger("log/thread-id-crawler.log", 5)
    # Crawling starts from this URL and follows the next-page links found on
    # each page, so it can only run as a single process.
    section_id = 'free'
    start_url = 'http://bbs.tianya.cn/list-%s-1.shtml' % (section_id)
    print "Start URL:", start_url

    base_path = '/home/kqc/dataset/tianya-forum/'
    time_now = datetime.now()
    post_id_list_path = 'post-id-list.txt'
    tcrawler = PostIDCrawler(start_url, 1, post_id_list_path, max_post_num = 10000)
    tcrawler.start()
    stacktracer.trace_stop()
        for topic_id in self.topic_dict:
            topic = self.topic_dict[topic_id]
            s = topic.getSimpleString(delimiter = '[=]')
            ftopic.write(s + '\n[*ROWEND*]\n')
            for comment in topic.comment_list:
                cs = comment.getSimpleString(delimiter = '[=]')
                fcomment.write(cs + '\n[*ROWEND*]\n')
        ftopic.close()
        fcomment.close()
    '''

if __name__ == "__main__":
    LINE_FEED = "\n" # line separator used when writing output files
    stacktracer.trace_start("trace.html", interval=5, auto=True) # Set auto flag to always update file!
    congifLogger("log/comment_crawler.log", 5)

    #group_id_list = ['FLL', '294806', 'MML']
    #group_id_list = ['test']
    #group_id_list = ['70612', 'FLL']
    group_id_list = []
    if len(sys.argv) <= 1:
        print "Group IDs were not provided."
        sys.exit()
    # add group ids
    for i in range(1, len(sys.argv)):
        group_id_list.append(sys.argv[i])
    print "Crawling comments for groups: ", group_id_list

    MAX_TOPIC_NUM = float('inf') # maximum number of topics to process per group
    group_id = group_id_list[0]
"""
Prepare the full training and test sets: append to each topic all of the
users who commented on that topic.
"""
import sys
import logging
from datetime import datetime
import codecs

from logconfig import congifLogger
from utils import is_between
from prepare import load_topic_user, load_comment_user
from prepare import TRAIN_START_DATE, TRAIN_END_DATE
from prepare import TEST_START_DATE, TEST_END_DATE

# config logging
log = logging.getLogger('Main.prepare_train_test')
congifLogger("prepare_train_test.log", 5)

# cut-off date for extracting comments
COMMENT_END_DATE = datetime(2013, 3, 1)

# Note: all of the topic and comment info has to be loaded into memory here
def main(argv):
    if len(argv) < 2:
        print 'Group ID not provided.'
        sys.exit(1)

    group_id = argv[1]
    log.info('Prepare training set and test set for group: %s' % group_id)
    path = 'tables/' + group_id + '/TopicInfo-raw-all-' + group_id
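# --- Hedged sketch (assumption, not code from this file) ---
# The docstring above says every topic record is extended with the users who
# commented on it. Assuming the topic line is the raw '[=]'-delimited record
# and the commenter ids are available as an iterable, the appending step
# could be as simple as this hypothetical helper:
def append_commenters(topic_line, commenter_uids, delimiter = '[=]'):
    # commenter_uids: ids of the users who commented on this topic
    return topic_line + delimiter + ','.join(commenter_uids)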
        f = open(topic_list_path, 'w')
        for tid in this_group.stick_topic_list:
            f.write(tid + "\n")
        f.write("\n")
        for tid in self.topic_list:
            f.write(tid + "\n")
        f.close()

        # clear the saved state
        self.topic_list = list()
        self.failed_href = set()

if __name__ == "__main__":
    stacktracer.trace_start("trace.html", interval=5, auto=True) # Set auto flag to always update file!
    congifLogger("log/topicCrawler.log", 5)

    group_id_list = []
    if len(sys.argv) <= 1:
        print "Group IDs were not provided."
        sys.exit()
    # add group ids
    for i in range(1, len(sys.argv)):
        group_id_list.append(sys.argv[i])
    print "Crawling topic list for groups: ", group_id_list

    #tcrawler = TopicCrawler(['FLL', '294806', 'MML'], 5)
    #tcrawler = TopicCrawler(['70612'], 5) # the "We are all academic girls" group
    #tcrawler = TopicCrawler(['ustv'], 5)  # the "US TV fans" group
    for group_id in group_id_list:
        base_path = '/home/kqc/dataset/douban-group/'
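# --- Hedged sketch (reader counterpart, not part of the crawler) ---
# The writer above stores the sticky topic ids first, then a blank separator
# line, then the regular topic ids. A matching reader would be:
def load_topic_list(topic_list_path):
    stick_topic_list, topic_list = [], []
    current = stick_topic_list
    f = open(topic_list_path, 'r')
    for line in f:
        line = line.strip()
        if line == '':
            current = topic_list # the blank line separates the two sections
            continue
        current.append(line)
    f.close()
    return stick_topic_list, topic_list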
import logging
import codecs
import pdb
import time

from lxml import etree # use XPath from lxml
from bs4 import BeautifulSoup

from threadPool import ThreadPool
from logconfig import congifLogger
from webPage import WebPage
import stacktracer

# config logging
log = logging.getLogger('Main.crawl_title')
congifLogger("crawl_title.log", 5)

failed_set = set()

def task_handler(topic_id, seg_list):
    f = codecs.open('tables/TopicInfo-title.txt', 'a', 'utf-8')
    url = 'http://www.douban.com/group/topic/' + topic_id + '/'
    print 'Visiting: ', url
    webPage = WebPage(url)
    flag = webPage.fetch()
    if flag:
        url, pageSource = webPage.getDatas() # pageSource is already unicode
        page = etree.HTML(pageSource)
        content = page.xpath(u"/html/body/div[@id='wrapper']/div[@id='content']")[0]
        tmp = page.xpath(u"//table[@class='infobox']//td[@class='tablecc']")
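# --- Hedged sketch of how the extraction might continue (assumption: the
# 'tablecc' cell carries the full topic title when the displayed title is
# truncated; extract_full_title is a hypothetical helper) ---
def extract_full_title(page):
    # page: the lxml HTML element built from the topic page source
    cells = page.xpath(u"//table[@class='infobox']//td[@class='tablecc']")
    if len(cells) > 0:
        return cells[0].xpath('string(.)').strip()
    # fall back to the <title> element when there is no infobox cell
    titles = page.xpath(u'//title')
    return titles[0].text.strip() if titles and titles[0].text else u''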
import sys
import logging
import codecs
from datetime import datetime

reload(sys)
sys.setdefaultencoding('utf-8')

from gensim import corpora, models, similarities
from gensim.models.ldamodel import LdaModel

from logconfig import congifLogger

STOP_WORDS_PATH = 'dataset/chinese-english-stopwords.txt'

# config logging
log = logging.getLogger('Main.train_lda')
congifLogger("train_lda.log", 5)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, filename = "train_lda.log")

def load_stop_words():
    """ Load all of the Chinese (and English) stop words.
    """
    stoplist = set()
    f = codecs.open(STOP_WORDS_PATH, 'r', 'utf-8')
    for line in f:
        line = line.strip()
        stoplist.add(line)
    return stoplist

def load_documents(file_path):
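# --- Hedged sketch (the body of load_documents and the actual training call
# are not shown above; this is the usual gensim pipeline, and
# train_lda_model / num_topics are assumptions, not names from this repo) ---
def train_lda_model(documents, stoplist, num_topics = 50):
    # documents: a list of token lists, e.g. the output of a Chinese segmenter
    texts = [[w for w in doc if w not in stoplist] for doc in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    model = LdaModel(corpus, id2word = dictionary, num_topics = num_topics)
    return model, dictionary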
# coding=utf-8
"""
This script inspects the probability output of Logistic Regression (via
LIBLINEAR) and computes precision@K.
"""
import logging
import sys
import random
from operator import itemgetter

from logconfig import congifLogger

# config logging
log = logging.getLogger('Main.statics')
congifLogger("prediction_statics.log", 5)

def load_user(filepath):
    """ Read the users in the order the user ids appear in the file; return a list.
    """
    uid_list = []
    f = open(filepath)
    for line in f:
        line = line.strip()
        uid_list.append(line)
    return uid_list

def load_test_topic(filepath):
    """ Load the index of each test topic together with its true number of participants.
    """
    topic_list = []
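# --- Hedged sketch (the script's own precision@K computation is further
# down and not shown here; precision_at_k is a hypothetical helper) ---
# precision@K: among the K users ranked highest by predicted probability,
# the fraction who actually participated.
def precision_at_k(scored_uids, true_uids, k):
    # scored_uids: (uid, probability) pairs from LIBLINEAR's probability output
    # true_uids: set of users who really participated in the topic
    ranked = sorted(scored_uids, key = itemgetter(1), reverse = True)
    hits = sum(1 for uid, prob in ranked[:k] if uid in true_uids)
    return float(hits) / k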
import os
import codecs
import logging

from gensim import models, corpora

from logconfig import congifLogger
from prepare import load_user_list
from utils import is_between
from prepare_corpus import seg_chinese, remove_url
# everything except the data in the training set may be used as
# user-interest content
from prepare import TRAIN_START_DATE, TRAIN_END_DATE
from prepare import TEST_START_DATE, TEST_END_DATE

# config logging
log = logging.getLogger('Main.interest')
congifLogger("gen_user_interest.log", 5)

def save_interest_info(interest_path, interest_info, model, dictionary):
    """ Write each user's interest vector and behavior info to a file.
    """
    # write the text the user is interested in
    f = codecs.open(interest_path, 'a', 'utf-8')
    for uid in interest_info:
        text = interest_info[uid][1]
        # if the user has no interest text at all, they have neither posted
        # a topic nor taken part in any comments
        text = remove_url(text)
        text = seg_chinese(text)
        document = text.split(' ')
        doc_bow = dictionary.doc2bow(document)
        doc_lda = model.__getitem__(doc_bow, eps = 0)
        #print doc_lda
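# --- Hedged sketch (continuation assumption) ---
# model.__getitem__(doc_bow, eps = 0) returns (topic_id, probability) pairs
# for essentially every topic, since eps is the minimum-probability filter.
# A dense, writable form of the interest vector could be built like this;
# to_dense_vector is a hypothetical helper:
def to_dense_vector(doc_lda, num_topics):
    vec = [0.0] * num_topics
    for topic_id, prob in doc_lda:
        vec[topic_id] = prob
    return vec
# e.g. ','.join(str(p) for p in to_dense_vector(doc_lda, model.num_topics))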
#encoding=utf-8
"""
This module contains functions and variables needed for preparing the datasets.
"""
from datetime import datetime
import codecs
import logging

from utils import is_between
from logconfig import congifLogger

# config logging
log = logging.getLogger('Main.prepare')
congifLogger("prepare.log", 5)

# start and end dates of the training set
TRAIN_START_DATE = datetime(2012, 8, 1)
TRAIN_END_DATE = datetime(2012, 11, 1)
# start and end dates of the test set
TEST_START_DATE = datetime(2012, 11, 1)
TEST_END_DATE = datetime(2013, 1, 1)

# sentinel values for the earliest and latest possible dates
VERY_EARLY_TIME = datetime(1900, 1, 1)
VERY_LATE_TIME = datetime(2050, 1, 1)

def load_topic_user(filepath, start_date = VERY_EARLY_TIME, end_date = VERY_LATE_TIME):
    """ Load all topics within the given date range, together with the
    participating user ids.
    Note: a topic may or may not have a comment list.
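# --- Hedged sketch (assumed semantics of utils.is_between: True when
# start_date <= date < end_date; dataset_of is a hypothetical helper) ---
# The constants above partition records into the training and test sets:
def dataset_of(pub_date):
    if is_between(pub_date, TRAIN_START_DATE, TRAIN_END_DATE):
        return 'train'
    if is_between(pub_date, TEST_START_DATE, TEST_END_DATE):
        return 'test'
    return None # outside both windows
# e.g. dataset_of(datetime(2012, 9, 15)) --> 'train'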
import codecs
import logging
from datetime import datetime
import os

from logconfig import congifLogger
from prepare import load_user_list
from utils import is_between
# everything except the data in the training set may be used as
# user-interest content
from prepare import TRAIN_START_DATE, TRAIN_END_DATE
from prepare import TEST_START_DATE, TEST_END_DATE

# config logging
log = logging.getLogger('Main.behavior')
congifLogger("gen_user_behavior.log", 5)

def save_behavior(behavior_path, behavior):
    # write the behavior info
    f = codecs.open(behavior_path, 'a', 'utf-8')
    for uid in behavior:
        f.write(uid + '[=]' + str(behavior[uid][0]) + '[=]' + str(behavior[uid][1]) + '\n')
    f.close()

def get_interested_topic(uid_list, comment_path):
    """ From the comment info, collect the list of topics each user is
    interested in (i.e. the ids of the topics they commented on), and count
    each user's number of comments.
    """
    user_set = set(uid_list)
    # user id ==> (num_comments, num_topics)
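# --- Hedged sketch of the tallying step (how the comment file is parsed is
# not shown above; (uid, topic_id) pairs are assumed as input, and
# count_behavior is a hypothetical helper) ---
def count_behavior(comment_pairs, user_set):
    interested = dict()    # uid --> set of topic ids the user commented on
    num_comments = dict()  # uid --> total number of comments
    for uid, topic_id in comment_pairs:
        if uid not in user_set:
            continue
        interested.setdefault(uid, set()).add(topic_id)
        num_comments[uid] = num_comments.get(uid, 0) + 1
    # matches the "user id ==> (num_comments, num_topics)" mapping above
    return dict((uid, (num_comments[uid], len(interested[uid])))
                for uid in interested)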
        quote_comment = self.find_previous_comment(i, uname, date)
        if quote_comment is None:
            log.error('Quote comment not found for comment: %s in post: %s, in group: %s' % (comment.cid, self.post_id, self.section_id))
            log.error('Current comment content: %s\n\n' % comment.content)
        else:
            # link the comment that was found
            comment.quote = quote_comment
            #print comment.get_simple_string("[=]")
            log.info(u'Comment %s by %s quotes comment %s by %s' % (comment.cid, comment.user_name, comment.quote.cid, comment.quote.user_name))

if __name__ == "__main__":
    import sys
    import codecs
    sys.stdout = (codecs.getwriter('utf8'))(sys.stdout)
    congifLogger("log/models.log", 5)

    post = Post('4318716', u'free')
    #f = codecs.open(u"./testpage/舌尖上的厨娘 (配图,配过程),挑战你的味蕾_天涯杂谈_天涯论坛.html", "r", 'utf8') # first page
    f = codecs.open(u"./testpage/温故512:汶川地震的坍塌及重建_天涯杂谈_天涯论坛.html", "r", 'utf8') # first page
    strfile_page1 = f.read()
    f.close()
    """
    f = codecs.open(u"./testpage/舌尖上的厨娘 (配图,配过程),挑战你的味蕾(第2页)_天涯杂谈_天涯论坛.html", "r", 'utf8') # second page
    strfile_page2 = f.read()
    f.close()
    f = codecs.open(u"./testpage/舌尖上的厨娘 (配图,配过程),挑战你的味蕾(第3页)_天涯杂谈_天涯论坛.html", "r", 'utf8') # third page
    strfile_page3 = f.read()
    f.close()
    """
    # crawl the comments
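# --- Hedged sketch (the real find_previous_comment is defined elsewhere in
# the Post class; matching the quoted header by user name and date, and the
# comment_list attribute, are assumptions) ---
def find_previous_comment(self, index, uname, date):
    # Scan backwards from the quoting comment for the most recent earlier
    # comment whose author and timestamp match the quoted header.
    for comment in reversed(self.comment_list[:index]):
        if comment.user_name == uname and comment.date == date:
            return comment
    return None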