Example #1
import requests


def get_page_base(url, headers=HttpHeader.LoginHeader()):
    resp = requests.get(url=url, headers=headers)

    # Treat any 2xx status code as a successful download
    if str(resp.status_code)[0] == '2':
        log.info('Download succeeded, status %s, url: %s.' % (resp.status_code, url))
    else:
        log.warning('Download failed, status %s, url: %s.' % (resp.status_code, url))
    return resp
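These snippets rely on a project-level HttpHeader module and a log object that are not shown. A minimal sketch of what they might look like (the header values below are placeholders, not the project's real ones):

import logging

# Hypothetical stand-ins for the project's HttpHeader module and log object
log = logging.getLogger('wenku')

class HttpHeader:
    """Header presets for the different request types used in the examples."""

    @staticmethod
    def LoginHeader():
        return {'User-Agent': 'Mozilla/5.0'}

    @staticmethod
    def TextHeader():
        return {'User-Agent': 'Mozilla/5.0',
                'Referer': 'https://wenku.baidu.com'}

    @staticmethod
    def PageHeader():
        return {'User-Agent': 'Mozilla/5.0',
                'Referer': 'https://wenku.baidu.com'}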
Example #2
    def down_page(self, url):

        resp = self.get_page_base(url, HttpHeader.TextHeader())

        # Undo the \uXXXX escaping in the body and strip line breaks
        cont = resp.content.decode('unicode_escape', 'ignore').replace('\r\n', '')

        # Capture each text fragment ("c" field) together with its page index
        datas = re.findall(r'{"c":"(.*?)".*?"}.*?(\d+).*?\d+}', cont)

        for data, index in datas:
            # Hand the fragment to the filesave instance to persist it
            filesave.Save(data, self.document)
            log.info('Document (%s): writing page %s' % (self.docId, index))
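The unicode_escape decode here is what turns the \uXXXX sequences in the raw body into readable characters. A quick standalone illustration with a synthetic byte string:

raw = b'{"c":"\\u4f60\\u597d"}'  # synthetic bytes containing \u escapes
print(raw.decode('unicode_escape', 'ignore'))  # -> {"c":"你好"}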
Example #3
    def get_doc_info(self):

        # Fetch the page HTML for the document URL
        self.resp = get_page_base(self.url, headers=HttpHeader.LoginHeader())

        # The page is GBK-encoded; decode it before parsing
        cont = self.resp.content.decode('gbk')

        # Parse the HTML to pull out title/type/page-count metadata
        soup = BeautifulSoup(cont, 'lxml')

        # title: document title
        self.document.title = re.search(r"'title'.*?'(.*?)'", cont).group(1)
        # creater: document uploader (percent-encoded in GBK)
        self.document.creater = parse.unquote(
            re.search(r"'creater'.*?'(.*?)'", cont).group(1), 'gbk')
        # docType: document format (type)
        self.document.docType = re.search(r"'docType'.*?'(.*?)'",
                                          cont).group(1)
        # docTypeNum: document format number (type)
        self.document.docTypeNum = re.search(r"'docTypeNum'.*?'(.*?)'",
                                             cont).group(1)
        # totalPageNum: total number of pages
        self.document.totalPageNum = re.search(r"'totalPageNum'.*?'(.*?)'",
                                               cont).group(1)
        # freepagenum: number of pages readable for free
        self.document.freepagenum = re.search(r"'freepagenum'.*?'(.*?)'",
                                              cont).group(1)
        # put_date: upload date, at '//*[@id="doc-header-test"]/div/span/em'
        self.document.put_date = soup.find(
            id="doc-header-test").find('em').get_text()

        # payPrice: document price
        self.document.payPrice = re.search(r"'payPrice'.*?'(.*?)'",
                                           cont).group(1)
        # downloadCoupons: download coupons
        self.document.downloadCoupons = ''
        # isPaymentDoc: whether this is a paid document
        self.document.isPaymentDoc = re.search(r"'isPaymentDoc'.*?'(.*?)'",
                                               cont).group(1)

        # content: document body, filled in later by the page downloader
        self.document.content = ''
        # Fetch the document rating, then persist the metadata
        self._get_score()
        self._documentSave()

        return self.document
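The uploader name arrives percent-encoded in GBK rather than UTF-8, which is why parse.unquote is given an explicit encoding. A standalone illustration (the encoded value is synthetic):

from urllib import parse

# '%D6%D0%CE%C4' is '中文' percent-encoded in GBK (synthetic example)
print(parse.unquote('%D6%D0%CE%C4', 'gbk'))  # -> 中文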
Example #4
    def get_page_url(self, page):
        getdocinfo_url = 'https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=%s' % self.docId

        resp = self.get_page_base(getdocinfo_url, HttpHeader.LoginHeader())

        content = resp.content.decode('utf-8')

        # Extract the URL parameters for the document body
        md5 = re.findall('"md5sum":"(.*?)"', content)[0]
        pn = re.findall('"totalPageNum":"(.*?)"', content)[0]
        rsign = re.findall('"rsign":"(.*?)"', content)[0]
        # Assemble the content URL from the extracted pieces
        content_url = 'https://wkretype.bdimg.com/retype/text/' + self.docId + '?rn=' + pn + '&type=txt' + md5 + '&rsign=' + rsign
        self.page_url = {'1': content_url}
        log.info('Document ID (%s): content URL resolved: %s.' % (self.docId, content_url))
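The callback=cb query parameter indicates the endpoint replies with a JSONP wrapper, cb({...}), which is why the snippet regexes fields straight out of the text; note also that md5 is appended with no '&' separator, which suggests the extracted value carries its own query prefix. Under that JSONP assumption, an alternative is to strip the wrapper and parse the payload properly (the response string below is synthetic):

import json
import re

# Synthetic response shaped like cb({...}) from the getdocinfo endpoint
resp_text = 'cb({"md5sum": "&md5sum=abc123", "rsign": "r123", "totalPageNum": "12"})'

# Strip the JSONP wrapper, then read fields from the parsed object
payload = json.loads(re.search(r'cb\((.*)\)', resp_text).group(1))
print(payload['totalPageNum'], payload['md5sum'], payload['rsign'])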
Example #5
    def down_page(self, url):

        resp = get_page_base(url, HttpHeader.PageHeader())

        # Undo the \uXXXX escaping and cut the JSON object out of the body
        json_data = re.search('{.*}',
                              resp.content.decode('unicode_escape',
                                                  'ignore')).group(0)

        da = json.loads(json_data)['body']
        da_j = ''
        for i in da:
            # Keep only fragments whose 'c' field is plain text
            if isinstance(i['c'], str):
                da_j += i['c'].strip()
        # Hand the assembled text to the filesave instance to persist it
        filesave.Save(da_j, self.document)
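The 'body' array evidently mixes plain-text fragments with entries whose 'c' field is a nested object, which the isinstance check filters out. A self-contained run against a synthetic payload shaped the same way:

import json

# Synthetic payload shaped like the decoded page response
json_data = '{"body": [{"c": "Hello "}, {"c": {"ix": 0}}, {"c": "world"}]}'

# Strip each text fragment and concatenate, skipping non-string 'c' values
text = ''.join(i['c'].strip() for i in json.loads(json_data)['body']
               if isinstance(i['c'], str))
print(text)  # -> Helloworld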