def run(self):
    """Store the question at ``self.url`` and all of its answers.

    While holding ``self.threadingSum``, open a ``DbHandler`` and, if the
    question URL is not stored yet, insert the question row, then one row
    per answer link, the image URLs of each answer, and finally hand the
    answer content to ``self.storeTheAnswer``. The handler is closed even
    when a step raises; the exception itself is not swallowed.
    """
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source. The answer loop is kept inside the
    # "question not stored yet" branch because it depends on ``question``,
    # which is only bound there -- confirm against the original layout.
    with self.threadingSum:
        logging.debug("%s start", self.url)
        dbHandler = DbHandler()
        try:
            if not dbHandler.hasQuestion(self.url):
                # Insert the new question.
                question = Question(self.url)
                title = question.get_title()
                detail = question.get_detail()
                answerNum = question.get_answer_num()
                followersNum = question.get_followers_num()
                # Tags are stored as one ';'-separated string.
                tags = ";".join(question.get_tags())
                questionDict = {
                    "url": self.url,
                    "title": title,
                    "detail": detail,
                    "followers": followersNum,
                    "answerNum": answerNum,
                    "tags": tags,
                }
                dbHandler.insertNewQuestion(questionDict)
                zh_qid = dbHandler.getQueIdByUrl(self.url)

                # Insert the new answers.
                for answer_link in question.get_all_answer_link():
                    answer = Answer(answer_link)
                    author = answer.get_author()
                    votes = answer.get_votes()
                    answerDict = {
                        "url": answer_link,
                        "author": author,
                        "zh_qid": zh_qid,
                        "votes": votes,
                    }
                    dbHandler.insertNewAnswer(answerDict)

                    # Insert the image URLs, keyed by the answer's row id.
                    zh_aid = dbHandler.getAnsIdByUrl(answer_link)
                    for imgUrl in answer.get_all_pics():
                        dbHandler.insertNewImgUrl(zh_aid, imgUrl)

                    contents = answer.get_answer_content()
                    self.storeTheAnswer(zh_aid, contents)
        finally:
            # Release the handler on failure as well as on success.
            dbHandler.close()
        logging.debug("%s done", self.url)
# NOTE(review): the ``def``/class header that owns the following ``with``
# body is not visible in this part of the file; the body is reproduced at the
# column it was found at and must be re-nested under its method.
with self.threadingSum:
    logging.debug("%s start!", self.url)
    # The file name is the last path segment of the image URL.
    pic_name = self.url.split("/")[-1]
    target = os.path.join(os.getcwd(), "tmp", "html", "images", pic_name)
    request = requests.get(self.url, stream=True, timeout=10)
    try:
        with open(target, 'wb') as fd:
            # iter_content() defaults to 1-byte chunks; ask for larger
            # chunks so the download is not written one byte at a time.
            for chunk in request.iter_content(chunk_size=8192):
                fd.write(chunk)
    finally:
        # A streamed response keeps its connection until it is fully
        # consumed or closed, so close it even if the write failed.
        request.close()
    logging.debug("%s done!", self.url)


if __name__ == '__main__':
    # Set the thread count: the semaphore handed to every worker has
    # 20 permits.
    threadingSum = threading.Semaphore(20)
    dbHandler = DbHandler()
    try:
        # NOTE(review): ``question`` and ``dbHandler`` are not used below;
        # both constructions are kept in case they have side effects.
        question = Question("http://www.zhihu.com/question/26702926")
        answer = Answer(
            "http://www.zhihu.com/question/26702926/answer/33843851")
        img_urls = answer.get_all_pics()

        # Remember the threads started here and wait for exactly those,
        # instead of joining every thread reported by threading.enumerate().
        workers = []
        for url in img_urls:
            t = DownloadImg(threadingSum, url)
            t.start()
            workers.append(t)
        for t in workers:
            t.join()
    finally:
        dbHandler.close()