def get_page_base(url, headers=HttpHeader.LoginHeader()):
    resp = requests.get(url=url, headers=headers)
    if 200 <= resp.status_code < 300:
        log.info('Download succeeded, status %s, url: %s.' % (resp.status_code, url))
    else:
        log.warning('Download failed, status %s, url: %s.' % (resp.status_code, url))
    return resp
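# A minimal retry sketch layered over get_page_base, assuming transient
# non-2xx responses are worth a couple more attempts. `max_tries` and
# `delay` are illustrative names, not part of the original interface.
def get_page_with_retry(url, headers=None, max_tries=3, delay=1.0):
    import time
    resp = None
    for attempt in range(1, max_tries + 1):
        resp = get_page_base(url, headers or HttpHeader.LoginHeader())
        if 200 <= resp.status_code < 300:
            return resp
        time.sleep(delay * attempt)  # simple linear backoff between attempts
    return resp  # last (failed) response, mirroring get_page_base's contract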
def down_page(self, url):
    resp = self.get_page_base(url, HttpHeader.TextHeader())
    cont = resp.content.decode('unicode_escape', 'ignore').replace('\r\n', '')
    # Each match pairs a text fragment (the "c" field) with its page index
    datas = re.findall(r'{"c":"(.*?)".*?"}.*?(\d+).*?\d+}', cont)
    for data, index in datas:
        # Hand the fragment to the filesave instance for persistence
        filesave.Save(data, self.document)
        log.info('Document (%s): writing page %s' % (self.docId, index))
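# Sketch of the payload shape the regex above assumes: a stream of JSON-ish
# fragments where a "c" text field is eventually followed by a page index.
# The sample string is synthetic, for illustration only.
def _demo_page_regex():
    sample = '{"c":"hello","y":"20"}],"page":{"pageIndex":1,"pageLoadTime":3}'
    matches = re.findall(r'{"c":"(.*?)".*?"}.*?(\d+).*?\d+}', sample)
    return matches  # -> [('hello', '1')]: text fragment and its page index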
def get_doc_info(self):
    # Fetch the HTML for the document URL
    self.resp = get_page_base(self.url, headers=HttpHeader.LoginHeader())
    # Decode the HTML (the page is GBK-encoded)
    cont = self.resp.content.decode('gbk')
    # Parse out the document title/type/page count, etc.
    soup = BeautifulSoup(cont, 'lxml')
    # title: document title
    self.document.title = re.search(r'\'title\'.*?\'(.*?)\'', cont).group(1)
    # creater: document uploader
    self.document.creater = parse.unquote(
        re.search(r'\'creater\'.*?\'(.*?)\'', cont).group(1), 'gbk')
    # docType: document format (type)
    self.document.docType = re.search(r'\'docType\'.*?\'(.*?)\'', cont).group(1)
    # docTypeNum: document format number (type)
    self.document.docTypeNum = re.search(r'\'docTypeNum\'.*?\'(.*?)\'', cont).group(1)
    # totalPageNum: total number of pages
    self.document.totalPageNum = re.search(r'\'totalPageNum\'.*?\'(.*?)\'', cont).group(1)
    # freepagenum: number of pages readable for free
    self.document.freepagenum = re.search(r'\'freepagenum\'.*?\'(.*?)\'', cont).group(1)
    # put_date: upload date, at '//*[@id="doc-header-test"]/div/span/em'
    self.document.put_date = soup.find(
        id="doc-header-test").find('em').get_text()
    # payPrice: document price
    self.document.payPrice = re.search(r'\'payPrice\'.*?\'(.*?)\'', cont).group(1)
    # downloadCoupons: download coupons
    self.document.downloadCoupons = ''
    # isPaymentDoc: whether this is a paid document
    self.document.isPaymentDoc = re.search(r'\'isPaymentDoc\'.*?\'(.*?)\'', cont).group(1)
    # content: document body (filled in later)
    self.document.content = ''
    # Fetch the document rating, then persist the metadata
    self._get_score()
    self._documentSave()
    return self.document
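# Sketch of the inline page data those re.search calls assume: the view page
# embeds a JS-style object with single-quoted keys and values. The fragment
# below is synthetic, for illustration only.
def _demo_doc_info_regex():
    fragment = "'title': 'Example Doc', 'docType': 'txt', 'totalPageNum': '12'"
    title = re.search(r'\'title\'.*?\'(.*?)\'', fragment).group(1)
    pages = re.search(r'\'totalPageNum\'.*?\'(.*?)\'', fragment).group(1)
    return title, pages  # -> ('Example Doc', '12')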
def get_page_url(self, page):
    getdocinfo_url = 'https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=%s' % self.docId
    resp = self.get_page_base(getdocinfo_url, HttpHeader.LoginHeader())
    content = resp.content.decode('utf-8')
    # Pull the parameters needed to build the content URL
    md5 = re.findall(r'"md5sum":"(.*?)"', content)[0]
    pn = re.findall(r'"totalPageNum":"(.*?)"', content)[0]
    rsign = re.findall(r'"rsign":"(.*?)"', content)[0]
    # Assemble the content URL (md5 is appended verbatim, as returned)
    content_url = ('https://wkretype.bdimg.com/retype/text/' + self.docId
                   + '?rn=' + pn + '&type=txt' + md5 + '&rsign=' + rsign)
    self.page_url = {'1': content_url}
    log.info('Document ID (%s): download URL ready, url: %s.' % (self.docId, content_url))
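# Sketch of the assembled content URL with fake placeholder values. The
# md5sum field is concatenated verbatim, so any query parameters embedded
# in it ride along; its exact shape here is an assumption, not a captured
# response.
def _demo_content_url():
    doc_id = '1234567890abcdef'          # fake document id
    pn = '12'                            # total page count from getdocinfo
    md5 = '&md5sum=abc123&sign=def456'   # assumed shape of the md5sum field
    rsign = 'rsign_value'                # fake rsign token
    return ('https://wkretype.bdimg.com/retype/text/' + doc_id
            + '?rn=' + pn + '&type=txt' + md5 + '&rsign=' + rsign)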
def down_page(self, url):
    resp = get_page_base(url, HttpHeader.PageHeader())
    json_data = re.search(r'{.*}', resp.content.decode('unicode_escape', 'ignore')).group(0)
    da = json.loads(json_data)['body']
    da_j = ''
    for i in da:
        # Only string "c" fields carry text; skip non-string layout values
        if isinstance(i['c'], str):
            da_j += i['c'].strip()
    # Hand the assembled text to the filesave instance for persistence
    filesave.Save(da_j, self.document)
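# Sketch of the body structure the loop above assumes: a list of items whose
# "c" field holds text when it is a string; non-string "c" values (e.g.
# numeric layout data) are skipped. The payload is synthetic.
def _demo_body_join():
    payload = '{"body": [{"c": "Hello ", "t": "word"}, {"c": 8.3}, {"c": "world"}]}'
    # -> 'Helloworld' (.strip() drops whitespace inside each fragment)
    return ''.join(i['c'].strip()
                   for i in json.loads(payload)['body']
                   if isinstance(i['c'], str))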