def close(self):
    """Finish the current output page and close its file.

    When ``SEND_LAST_PAGE`` is set, an ``"o_<page>"`` notification is sent
    for the current page.  The current page number and word count are then
    stored through ``db.update_page_info``.

    The file is closed even if the notification or the database update
    raises, so a failure there no longer leaks the open file handle; the
    exception itself still propagates to the caller.
    """
    # NOTE(review): the original indentation was lost; only the notification
    # is assumed to be guarded by SEND_LAST_PAGE -- confirm.
    try:
        if self.SEND_LAST_PAGE:
            notify_page(self.id, "o_%s" % self.page)
        self.db.update_page_info(self.id, self.page, self.word_count)
    finally:
        self.file.close()
def write_content(self, page):
    """Write the posts of one source page into the paginated output.

    ``page`` is read as a mapping: ``page["content"]`` is iterated for the
    posts, and the page header is formatted from ``page["page"]`` and
    ``page["url"]``.  Every post is itself a mapping that is formatted
    through its ``"floor"`` and ``"content"`` keys.

    Side effects: sets ``SEND_LAST_PAGE``, may clear ``SUMMARY_FLAG``,
    advances ``page`` / ``word_count``, and writes to and flushes
    ``self.file``.
    """
    first_flag = True
    self.SEND_LAST_PAGE = True
    for each in page["content"]:
        # Length of the raw (still tagged) content; used for page sizing.
        _len = len(each["content"])
        # Decide whether this post should be stored as the summary.
        try:
            real_content = unicode(each["content"], 'utf-8', 'ignore')
        except (TypeError, UnicodeError):
            # unicode() rejects values that are already unicode; fall back to
            # the raw value.  Narrowed from a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            real_content = each["content"]
        real_content = strip_tag(real_content)
        if self.SUMMARY_FLAG and len(real_content) > 33:
            self.db.update_summary(self.id, real_content[:100])
            self.SUMMARY_FLAG = False
        # Turn the page when this post would exceed WORD_PERPAGE and the
        # current page is already more than 75% full.
        if self.word_count + _len > self.WORD_PERPAGE and self.word_count > self.WORD_PERPAGE * 0.75:
            # Record the finished page before moving on.
            self.db.update_page_info(self.id, self.page, self.word_count)
            notify_page(self.id, "o_%s" % self.page)
            self.page += 1
            self.word_count = 0
            self.make_file()
        if first_flag:
            # Header linking back to the source page, written once per call.
            self.file.write(''' <div class="content_top"> 原帖第 %(page)s 页:<a href="%(url)s" target="_blank">查看该页内容</a> </div> ''' % page)
            first_flag = False
        self.file.write(''' <div class="content_main"> <a name="%(floor)s"></a> %(content)s <span class="floor"><a href="#%(floor)s" onclick="alert('设置完成,请添加收藏夹吧!')">[设为书签]</a></span> </div> ''' % each)
        self.word_count += _len
    # On the first write, if we are still on page 1, force a page turn.
    # NOTE(review): the original indentation was lost; this block is assumed
    # to run once after the loop, with the notification inside the ``if`` --
    # confirm against the original layout.
    if self.page == 1:
        self.page += 1
        self.word_count = 0
        self.make_file()
        notify_page(self.id, "o_%s" % self.page)
    # Flush so written content is not held in memory.
    self.file.flush()
def worker():
    """Process novels taken from the queue ``q`` until it reports empty.

    For each item, ``ready_work`` decides whether the novel is processed;
    if so a ``"start"`` notification is sent and ``do_work`` runs.  Any
    ``Exception`` is logged with its traceback and the novel is marked with
    status 4 (presumably the error state -- confirm).  The database is
    committed after every item.

    ``q.task_done()`` is called in a ``finally`` block so that the item is
    always acknowledged, even if the error handler or the commit raises;
    otherwise a ``q.join()`` elsewhere could wait forever.
    """
    db = BaseDB()
    # NOTE(review): qsize() followed by a blocking get() can block if another
    # consumer takes the last item in between -- confirm whether several
    # workers share ``q``.
    while q.qsize():
        novel = q.get()
        try:
            try:
                if ready_work(novel, db):
                    notify_page(novel[0], "start")
                    do_work(novel, db)
            except Exception:
                log.error("%s: %s" % (novel[0], traceback.format_exc()))
                db.modify_status(novel[0], 4)
            db.commit()
        finally:
            q.task_done()
def write_content(self, page):
    """Write the posts of one source page into the paginated output.

    ``page`` is read as a mapping: ``page["content"]`` is iterated for the
    posts, and the page header is formatted from ``page["page"]`` and
    ``page["url"]``.  Every post is itself a mapping that is formatted
    through its ``"floor"`` and ``"content"`` keys.

    Side effects: sets ``SEND_LAST_PAGE``, may clear ``SUMMARY_FLAG``,
    advances ``page`` / ``word_count``, and writes to and flushes
    ``self.file``.
    """
    first_flag = True
    self.SEND_LAST_PAGE = True
    for each in page["content"]:
        # Length of the raw (still tagged) content; used for page sizing.
        _len = len(each["content"])
        # Decide whether this post should be stored as the summary.
        try:
            real_content = unicode(each["content"], 'utf-8', 'ignore')
        except (TypeError, UnicodeError):
            # unicode() rejects values that are already unicode; fall back to
            # the raw value.  Narrowed from a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            real_content = each["content"]
        real_content = strip_tag(real_content)
        if self.SUMMARY_FLAG and len(real_content) > 33:
            self.db.update_summary(self.id, real_content[:100])
            self.SUMMARY_FLAG = False
        # Turn the page when this post would exceed WORD_PERPAGE and the
        # current page is already more than 75% full.
        if self.word_count + _len > self.WORD_PERPAGE and self.word_count > self.WORD_PERPAGE * 0.75:
            # Record the finished page before moving on.
            self.db.update_page_info(self.id, self.page, self.word_count)
            notify_page(self.id, "o_%s" % self.page)
            self.page += 1
            self.word_count = 0
            self.make_file()
        if first_flag:
            # Header linking back to the source page, written once per call.
            self.file.write(''' <div class="content_top"> 原帖第 %(page)s 页:<a href="%(url)s" target="_blank">查看该页内容</a> </div> ''' % page)
            first_flag = False
        self.file.write(''' <div class="content_main"> <a name="%(floor)s"></a> %(content)s <span class="floor"><a href="#%(floor)s" onclick="alert('设置完成,请添加收藏夹吧!')">[设为书签]</a></span> </div> ''' % each)
        self.word_count += _len
    # On the first write, if we are still on page 1, force a page turn.
    # NOTE(review): the original indentation was lost; this block is assumed
    # to run once after the loop, with the notification inside the ``if`` --
    # confirm against the original layout.
    if self.page == 1:
        self.page += 1
        self.word_count = 0
        self.make_file()
        notify_page(self.id, "o_%s" % self.page)
    # Flush so written content is not held in memory.
    self.file.flush()
def do_work(novel, db):
    """Crawl one novel from its last known position and write new content.

    ``novel`` is unpacked as ``(id, type, main_url, url, last_floor)``.
    The crawler registered for ``type`` in ``TYPE_TO_CRAWLER`` is iterated
    page by page; for each page a ``"r_<page>"`` notification is sent, the
    last crawled URL is stored, and -- when the page has content -- it is
    written through ``BaseWriter`` and the highest floor is stored.

    Afterwards the polling interval is adjusted, the writer is closed, the
    novel gets status 2 (presumably "done" -- confirm) and an ``"end"``
    notification is sent.

    The per-novel lock in ``G_LOCK`` is released in a ``finally`` block so
    that it is also released when the crawl fails; the exception still
    propagates to the caller.
    """
    # Local names avoid shadowing the builtins ``id`` and ``type``.
    novel_id, novel_type, main_url, url, last_floor = novel
    try:
        crawler = TYPE_TO_CRAWLER[novel_type]
        source = crawler(main_url, url=url, html=True, floor=last_floor)
        writer = BaseWriter(novel_id, db)
        eachpage = None
        for eachpage in source:
            notify_page(novel_id, "r_%s" % eachpage["page"])
            # Remember the last crawled URL.
            db.update_url(novel_id, eachpage["url"])
            if not eachpage["content"]:
                continue
            floor = max(x["floor"] for x in eachpage["content"])
            writer.write_content(eachpage)
            # Remember the last stored floor.
            db.update_floor(novel_id, floor)
        # Check whether this run only covered a single page and adjust the
        # polling interval accordingly.
        # NOTE(review): if ``source`` yields nothing, ``eachpage`` is still
        # None and this subscript raises TypeError -- confirm whether that
        # is the intended failure mode.
        if eachpage["url"] == url:
            db.incr_interval(novel_id)
        else:
            db.decr_interval(novel_id)
        # Done.
        writer.close()
        db.modify_status(novel_id, 2)
        notify_page(novel_id, "end")
        log.info("novel %s ended." % novel_id)
    finally:
        # Best effort: the lock may be missing or not held.  ``Exception``
        # keeps that tolerance without swallowing KeyboardInterrupt.
        try:
            G_LOCK[novel_id].release()
        except Exception:
            pass
#coding: utf-8
# Manual test script: repeatedly sends "r_<i>" and random "o_<n>" page
# notifications for novel id 1, pausing two seconds between rounds.
from common import notify_page
import urllib
import time
import random

for i in xrange(1000000):
    try:
        notify_page(1, "r_%s" % i)
    except Exception:
        # Best effort: a failed notification must not stop the loop.
        # ``Exception`` (instead of a bare except) keeps Ctrl-C working.
        pass
    # random.random() is always below 1, so this branch currently runs on
    # every iteration; lower the threshold to send "o_" events less often.
    if random.random() < 1:
        notify_page(1, "o_%s" % random.randint(1, 100))
    time.sleep(2)