def __init__(self, task_list):
    """Initialise the crawl bookkeeping for a batch of tasks.

    Args:
        task_list: iterable of hashable task entries (presumably target
            URLs -- confirm against callers). Duplicates collapse because
            the entries are stored in a set.
    """
    # Task bookkeeping: the original tasks plus a separate copy that
    # tracks which tasks still need their extra info fetched.
    self.task_set = set(task_list)
    self.info_url_set = set(self.task_set)
    self.work_set = set()  # pool of URLs waiting to be fetched

    # Result accumulators.
    self.answer_list, self.question_list = [], []
    self.info_list, self.extra_index_list = [], []

    self.thread_pool = ThreadPool(SettingClass.MAXTHREAD)

    self.add_property()  # add extension attributes
    HttpBaseClass.set_cookie()
def worker(self, target_url):
    """Fetch one page and pass its content to ``self.parse_content``.

    When the fetch returns a falsy result the method returns immediately
    and ``target_url`` is left in ``self.work_set``. Otherwise the URL is
    discarded from ``self.work_set`` and the content is then parsed.

    Args:
        target_url: URL to fetch.

    Returns:
        None.
    """
    page = HttpBaseClass.get_http_content(
        target_url, timeout=SettingClass.WAITFOR_HTML)
    if page:
        # Only a non-empty response clears the URL from the pending set.
        self.work_set.discard(target_url)
        self.parse_content(page)
    return
def worker(self, target_url):
    """Download ``target_url`` and parse whatever came back.

    A falsy download result ends the call without side effects, so the
    URL remains in ``self.work_set``. A non-empty result removes the URL
    from ``self.work_set`` before ``self.parse_content`` is invoked.

    Args:
        target_url: URL to download.

    Returns:
        None.
    """
    html = HttpBaseClass.get_http_content(
        target_url,
        timeout=SettingClass.WAITFOR_HTML,
    )
    got_content = bool(html)
    if got_content:
        self.work_set.discard(target_url)
        self.parse_content(html)
    return
def catch_info(self, target_url):
    """Fetch the ``/top-answers`` page of a target and record its extra info.

    On a falsy fetch result nothing is changed. Otherwise ``target_url``
    is discarded from ``self.info_url_set`` and the value returned by
    ``TopicParser(...).get_extra_info()`` is appended to ``self.info_list``.

    Args:
        target_url: base URL; ``'/top-answers'`` is appended before fetching.

    Returns:
        None.
    """
    info_url = target_url + '/top-answers'
    page = HttpBaseClass.get_http_content(
        info_url, timeout=SettingClass.WAITFOR_HTML)
    if page:
        self.info_url_set.discard(target_url)
        extra_info = TopicParser(page).get_extra_info()
        self.info_list.append(extra_info)
    return
def catch_info(self, target_url):
    """Collect extra info for ``target_url`` from its top-answers page.

    The page at ``target_url + '/top-answers'`` is downloaded. If the
    download result is falsy the call is a no-op; otherwise the URL is
    dropped from ``self.info_url_set`` and the parsed extra info is
    appended to ``self.info_list``.

    Args:
        target_url: base URL of the item whose info is wanted.

    Returns:
        None.
    """
    html = HttpBaseClass.get_http_content(
        target_url + '/top-answers',
        timeout=SettingClass.WAITFOR_HTML,
    )
    if not html:
        # Leave the URL in info_url_set; nothing was fetched.
        return

    self.info_url_set.discard(target_url)
    topic_parser = TopicParser(html)
    self.info_list.append(topic_parser.get_extra_info())
    return
def download(self, image):
    """Download one image and write it under ``self.save_path``.

    Args:
        image: mapping with at least the keys ``'filename'`` (name of the
            file to create) and ``'href'`` (URL to download).

    Returns:
        None. When the download result is falsy nothing is written and
        ``self.delete`` is not called.
    """
    filename = image['filename']
    href = image['href']

    payload = HttpBaseClass.get_http_content(
        url=href, timeout=SettingClass.WAITFOR_PIC)
    if payload:
        target_path = self.save_path + '/' + filename
        # Use a distinct name for the file handle so the ``image``
        # argument is not rebound.
        with open(target_path, 'wb') as out_file:
            out_file.write(payload)
        # Called only after the bytes were written; the effect of
        # ``self.delete`` is defined outside this block.
        self.delete(href)
    return
def create_work_set(self, target_url):
    """Expand a task URL into one work URL per result page.

    The task page is fetched; on a falsy result the task is left in
    ``self.task_set`` and nothing else happens. Otherwise the task is
    discarded from ``self.task_set`` and, for every page index below the
    value returned by ``self.parse_max_page``, a URL of the form
    ``<target_url>?nr=1&sort=created&page=<index>`` is added to
    ``self.work_set``.

    Args:
        target_url: task URL to expand.

    Returns:
        None.
    """
    first_page = HttpBaseClass.get_http_content(
        target_url, timeout=SettingClass.WAITFOR_HTML)
    if not first_page:
        return

    self.task_set.discard(target_url)
    page_count = self.parse_max_page(first_page)
    # NOTE(review): page indices run 0..page_count-1 -- confirm the site's
    # ``page`` parameter is zero-based.
    self.work_set.update(
        '{}?nr=1&sort=created&page={}'.format(target_url, index)
        for index in range(page_count)
    )
    return
def create_work_set(self, target_url):
    """Expand a task URL into per-page answer URLs ordered by vote count.

    The page ``target_url + '/answers?order_by=vote_num'`` is fetched. A
    falsy result leaves all state untouched. Otherwise the task is
    discarded from ``self.task_set`` and one URL per page index below
    ``self.parse_max_page(content)`` is added to ``self.work_set``.

    Args:
        target_url: task URL to expand.

    Returns:
        None.
    """
    listing_url = target_url + '/answers?order_by=vote_num'
    listing = HttpBaseClass.get_http_content(
        listing_url, timeout=SettingClass.WAITFOR_HTML)
    if listing:
        self.task_set.discard(target_url)
        build_url = '{}/answers?order_by=vote_num&page={}'.format
        # NOTE(review): page indices start at 0 here -- confirm the site's
        # ``page`` parameter is zero-based.
        for index in range(self.parse_max_page(listing)):
            self.work_set.add(build_url(target_url, index))
    return
def create_work_set(self, target_url):
    """Queue every result page of ``target_url`` for fetching.

    Fetches ``target_url`` itself. If the result is falsy the method
    returns with ``self.task_set`` and ``self.work_set`` unchanged.
    Otherwise the task is removed from ``self.task_set`` and URLs built
    from ``'{}?nr=1&sort=created&page={}'`` are added to ``self.work_set``
    for each page index in ``range(self.parse_max_page(content))``.

    Args:
        target_url: task URL to expand.

    Returns:
        None.
    """
    html = HttpBaseClass.get_http_content(
        target_url,
        timeout=SettingClass.WAITFOR_HTML,
    )
    if html:
        self.task_set.discard(target_url)
        total_pages = self.parse_max_page(html)
        url_template = '{}?nr=1&sort=created&page={}'
        # NOTE(review): the first generated URL uses page=0 -- confirm the
        # site's paging is zero-based.
        for page_no in range(total_pages):
            page_url = url_template.format(target_url, page_no)
            self.work_set.add(page_url)
    return
def create_work_set(self, target_url):
    """Queue the vote-ordered answer pages of ``target_url``.

    Downloads ``target_url + '/answers?order_by=vote_num'``. A falsy
    result is a no-op. Otherwise the task is discarded from
    ``self.task_set`` and ``self.work_set`` receives one
    ``<target_url>/answers?order_by=vote_num&page=<index>`` URL for every
    index below the value returned by ``self.parse_max_page``.

    Args:
        target_url: task URL to expand.

    Returns:
        None.
    """
    html = HttpBaseClass.get_http_content(
        target_url + '/answers?order_by=vote_num',
        timeout=SettingClass.WAITFOR_HTML,
    )
    if not html:
        return

    self.task_set.discard(target_url)
    total_pages = self.parse_max_page(html)
    # NOTE(review): indices run 0..total_pages-1 -- confirm the site's
    # ``page`` parameter is zero-based.
    self.work_set.update(
        '{}/answers?order_by=vote_num&page={}'.format(target_url, page_no)
        for page_no in range(total_pages)
    )
    return