def get_page_base(url, headers=None):
    """Download *url* and return the requests.Response.

    Args:
        url: Target URL.
        headers: Optional HTTP header mapping. Defaults to a fresh set of
            login headers built per call.

    Returns:
        The requests.Response regardless of status code (callers inspect
        it themselves); success or failure is logged either way.
    """
    # Build the default inside the body: a call placed in the `def` line
    # would be evaluated only once, at import time, and then shared by
    # every subsequent call.
    if headers is None:
        headers = HttpHeader.LoginHeader()
    resp = requests.get(url=url, headers=headers)
    # Integer range test instead of inspecting the first character of the
    # stringified status code — same 2xx semantics, no string round-trip.
    if 200 <= resp.status_code < 300:
        log.info('下载成功,状态为%s,url:%s:。' % (resp.status_code, url))
    else:
        log.warning('下载不成功,状态为%s,url:%s:。' % (resp.status_code, url))
    return resp
def run_down(self, resp):
    """Collect the per-page URLs from *resp*, then download each page,
    sleeping between requests to throttle the crawl."""
    self.get_page_url(resp)
    for page_num, page_link in self.page_url.items():
        log.info('正在下载文档(%s)第%s页,url:%s' % (self.document.id, page_num, page_link))
        self.down_page(page_link)
        log.info('文档(%s)第%s页下载完成,url:%s' % (self.document.id, page_num, page_link))
        time.sleep(TIME_WITE)
def run(self):
    """Resolve the document's type and dispatch to the matching downloader."""
    self.get_doc_info()
    log.info(
        '你下载的文档ID为:%s,文档标题为:%s,文档类型为:%s' % (self.document.id, self.document.title, self.document.docType))
    # Look up the downloader class registered for this document type;
    # a falsy entry means the type is unsupported.
    spider_cls = self.WkInfo_docType[self.document.docTypeNum]
    if not spider_cls:
        log.info('你的文档类型不支持下载,文档类型为:%s' % self.document.docType)
        return
    spider_cls(self.document).run_down(self.resp)
def get_page_url(self, resp):
    """Extract per-page JSON content links from the document view page.

    Decodes the gbk-encoded HTML in *resp*, scrapes page-number/URL pairs
    with a regex, normalizes each URL (drops escaping backslashes, fixes
    the doubled ``//wk/`` path), stores the mapping in ``self.page_url``,
    and logs how many links were found.
    """
    page = resp.content.decode('gbk')
    self.page_url = {
        num: link.replace('\\', '').replace('//wk/', '/wk/')
        for num, link in re.findall(
            r'[,\[]{.{10,30}:(\d{1,3}).{10,30}(https:.*?\.json\?.*?token.*?)\\x22}',
            page)
    }
    # len() is the idiomatic spelling of the original __len__() call.
    log.info('文档ID:%s,完成下载网址获取,共%s个链接。' % (self.document.id, len(self.page_url)))
def down_page(self, url):
    """Download one page of document text and persist each content chunk.

    Fetches *url* with text headers, unescapes the payload, pulls out
    every ``"c"`` content chunk with its page index, and hands each chunk
    to the filesave helper.
    """
    resp = self.get_page_base(url, HttpHeader.TextHeader())
    # Strip CRLFs so the chunk regex can match across former line breaks.
    cont = resp.content.decode('unicode_escape', 'ignore').replace('\r\n', '')
    # Raw string: '\d' inside a plain literal is an invalid escape
    # sequence (SyntaxWarning on modern Python); the pattern is unchanged.
    datas = re.findall(r'{"c":"(.*?)".*?"}.*?(\d+).*?\d+}', cont)
    for data, index in datas:
        # Persist the extracted chunk via the filesave helper.
        filesave.Save(data, self.document)
        log.info('文档(%s)正在写入第%s页' % (self.docId, index))
def get_page_url(self, page):
    """Fetch document metadata and build the single text-content URL,
    storing it in ``self.page_url`` under key '1'."""
    getdocinfo_url = 'https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=%s' % self.docId
    resp = self.get_page_base(getdocinfo_url, HttpHeader.LoginHeader())
    content = resp.content.decode('utf-8')
    # Extract the URL parameters for the document content.
    # NOTE(review): findall(...)[0] raises IndexError if any field is
    # absent from the response — confirm that is the intended failure mode.
    md5 = re.findall('"md5sum":"(.*?)"', content)[0]
    pn = re.findall('"totalPageNum":"(.*?)"', content)[0]
    rsign = re.findall('"rsign":"(.*?)"', content)[0]
    # Compose the content URL.
    # NOTE(review): md5 is concatenated straight after '&type=txt' with no
    # parameter name (no '&md5sum=') — looks like a dropped key; verify
    # against the wkretype API before relying on this URL.
    content_url = 'https://wkretype.bdimg.com/retype/text/' + self.docId + '?rn=' + pn + '&type=txt' + md5 + '&rsign=' + rsign
    self.page_url = {'1': content_url}
    log.info('文档ID(%s)下载网址获取完成,url:%s。' % (self.docId, content_url))
def make_path(self):
    """Ensure the download directory ``self.fp`` exists, creating it
    (including parents) and logging when it had to be created."""
    if not os.path.isdir(self.fp):
        # exist_ok guards against the race where another process creates
        # the directory between the isdir() check and makedirs().
        os.makedirs(self.fp, exist_ok=True)
        log.info('创建目录%s' % self.fp)
def run_down(self):
    """Persist the document's metadata, then download its single text URL."""
    meta = json.dumps(self.document.__dict__, ensure_ascii=False)
    filesave.Save(meta, self.document)
    target = self.page_url['1']
    log.info('文档(%s)正在下载,url:%s' % (self.docId, target))
    self.down_page(target)
    log.info('文档(%s)下载完成,url:%s' % (self.docId, target))
# NOTE(review): this span arrived as a collapsed paste. It begins with an
# orphaned statement fragment identical to the tail of down_page — the
# `def down_page(self, url):` header and its first two statements were
# evidently lost. Reconstructed below from the intact copy defined earlier
# in this file; confirm against the original source before merging.
def down_page(self, url):
    """Download one page of document text and persist each content chunk."""
    resp = self.get_page_base(url, HttpHeader.TextHeader())
    cont = resp.content.decode('unicode_escape', 'ignore').replace('\r\n', '')
    datas = re.findall(r'{"c":"(.*?)".*?"}.*?(\d+).*?\d+}', cont)
    for data, index in datas:
        # Persist the extracted chunk via the filesave helper.
        filesave.Save(data, self.document)
        log.info('文档(%s)正在写入第%s页' % (self.docId, index))

def run_down(self):
    """Persist the document's metadata, then download its single text URL."""
    filesave.Save(json.dumps(self.document.__dict__, ensure_ascii=False), self.document)
    log.info('文档(%s)正在下载,url:%s' % (self.docId, self.page_url['1']))
    self.down_page(self.page_url['1'])
    log.info('文档(%s)下载完成,url:%s' % (self.docId, self.page_url['1']))

if __name__ == '__main__':
    # url = 'https://wenku.baidu.com/view/c55db74626d3240c844769eae009581b6bd9bd1d.html'
    # url = 'https://wenku.baidu.com/view/8c8d30a9bb0d4a7302768e9951e79b8968026891.html?from=search'
    # TXT sample document
    url = 'https://wenku.baidu.com/view/f22deb61d1f34693dbef3e8b.html?from=search'
    docm = txtSpider(url)
    docm.get_doc_info()
    docm.run_down()
    # Dump the scraped metadata for inspection.
    for key, val in docm.document.__dict__.items():
        print(key, ':', val)
    log.info(url + ' 下载完成')
def run_down(self):
    """Download every collected page URL in order, throttling between pages."""
    for page_num, page_link in self.page_url.items():
        log.info('正在下载文档(%s)第%s页,url:%s' % (self.docId, page_num, page_link))
        self.down_page(page_link)
        log.info('文档(%s)第%s页下载完成,url:%s' % (self.docId, page_num, page_link))
        time.sleep(TIME_WITE)