Example #1
import collector.htmls as htmls

# Assumed output path for running this snippet standalone; test_file is
# normally defined elsewhere in the test module.
test_file = r"C:\temp\other\test.pdf"


def test_download():
    url_ = "http://dx.doi.org/10.1016/j.athoracsur.2019.05.024"

    section = "Elsevier_0003-4975"
    cp = htmls.config_parser()
    print(cp.get_section(section))
    d_url = htmls.HTML(None, None, None,
                       "test").do_run(cp.get_section(section), url_)
    # d_url="https://onlinelibrary.wiley.com/doi/epdf/10.1016/S1607-551X%2814%2900235-6"
    # d_url="https://www.microbiologyresearch.org/deliver/fulltext/jgv/99/9/1187_vir001128.pdf?itemId=%2Fcontent%2Fjournal%2Fjgv%2F10.1099%2Fjgv.0.001128&mimeType=pdf&containerItemId=content/journal/jgv"
    print(d_url)
    htmls.download(d_url.strip(), test_file)
    print(htmls.checkpdf(test_file))
Example #2
import collector.collect as collect
import collector.htmls as htmls

# redis_ = redis.Redis(host="10.3.1.99", port=6379, db=1,decode_responses=True)
# print(redis_.keys("*"))

if __name__ == '__main__':
    # name = "osti_1"
    name = "osti_aps"
    # name = "hg0903"

    file_path = r"C:\temp\osti\r1119\web_xls\aps.xls"
    # file_path = r"C:\public\目次采全文\0903\化工所待补全文清单_20190903..xls"

    cp = htmls.config_parser()
    cp.paser()
    collect.run_thread(name, file_path)
    cp.backup()

    # collect.test_download()
Example #3
    def run(self):
        logger.info(self.sourcename + " download_url start...")
        while True:
            string = self.um.get_eb(self.url_set_name)
            if string is None:
                break
            eb = nm.execl_bean()
            eb.paser(string)
            url = eb.pinjie

            jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
            file_path = self.creat_filename()
            try:
                # time.sleep(random.random() * 3 + 1)
                logger.info(self.sourcename + " start downloading: " + url)

                r = requests.get(url)
                c1 = c2 = c3 = ""  # defaults, in case the OSTI session cookies are missing
                try:
                    c1 = r.cookies['BIGipServerlbapp_tc3']
                    c2 = r.cookies['BIGipServerwww.osti.gov_pool']
                    c3 = r.cookies['JSESSIONID']
                except KeyError:
                    pass
                soup = BeautifulSoup(r.text, "html.parser")

                mate = soup.find("meta", {"name": "citation_pdf_url"})
                if mate is None:
                    start_break = False
                    for div1 in soup.find_all("div",
                                              class_="biblio-secondary-group"):
                        for div2 in div1.find_all(
                                "div", class_="biblio-secondary-item small"):
                            for a in div2.find_all("a"):
                                if "href" in a.attrs.keys():
                                    if "https://doi.org" in a["href"]:
                                        pdf_url = a["href"]
                                        cp = htmls.config_parser()
                                        ht = htmls.HTML(None, None, None, None)
                                        for conf in cp.get_all_conf():
                                            print(conf)
                                            if ht.test(conf, pdf_url):
                                                result = ht.do_run(
                                                    conf, pdf_url)
                                                r2 = requests.get(result)
                                                r2.encoding = 'utf-8'
                                                # print(r2.text)
                                                with open(file_path, "wb") as f:
                                                    f.write(r2.content)
                                                break

                                        start_break = True
                                        break
                            if start_break:
                                break
                        if start_break:
                            break

                else:
                    pdf_url = mate["content"]
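                    # Reuse the BIG-IP/JSESSIONID cookies captured above, plus
                    # Google Analytics values hard-coded from a recorded session.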
                    cookies = {
                        'BIGipServerlbapp_tc3': c1,
                        'BIGipServerwww.osti.gov_pool': c2,
                        'JSESSIONID': c3,
                        '__utma':
                        '249692800.1749221367.1564467097.1564467097.1564467097.1',
                        '__utmc': '249692800',
                        '__utmz':
                        '249692800.1564467097.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
                        '_ga': 'GA1.2.1749221367.1564467097',
                        '_gid': 'GA1.2.298248318.1564467099',
                        '__utmt': '1',
                        '__utmb': '249692800.63.10.1564467097'
                    }

                    r2 = requests.get(pdf_url, cookies=cookies)
                    r2.encoding = 'utf-8'
                    # print(r2.text)
                    with open(file_path, "wb") as f:
                        f.write(r2.content)
                eb.page = htmls.checkpdf(file_path)
                full_url = pdf_url

            except NoConfError:
                logger.info(eb.eissn + " no usable conf.")
                eb.err_and_step = str(self.url_step) + ":  no usable conf"
                self.um.save(eb, self.err_step)
            except Exception as e:
                logger.error(self.sourcename + " download url " + url +
                             " has err",
                             exc_info=True)
                if eb.retry < collect.DOWNLOAD_URL_RETRY:
                    logger.info("retry time:" + str(eb.retry))
                    eb.retry += 1
                    self.um.save(eb, self.url_step - 1)
                else:
                    logger.info("retry:" + str(eb.retry) +
                                ". retry次数超过5次,不再重试。")
                    self.um.save(eb, self.err_step)
                continue

            eb.full_url = full_url
            eb.abs_url = url
            dirs = file_path.split("/")
            eb.full_path = dirs[-2] + "/" + dirs[-1]
            self.um.save(eb, self.finsh_step)
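
The fallback branch above only needs the first https://doi.org link inside the OSTI biblio markup, which is why it carries the start_break bookkeeping across three loops. A minimal helper sketch that returns that link directly (illustrative only, not part of the original module; it assumes the same biblio-secondary-group markup):

def find_doi_link(soup):
    """Return the first https://doi.org href in the biblio markup, or None."""
    for group in soup.find_all("div", class_="biblio-secondary-group"):
        for item in group.find_all("div", class_="biblio-secondary-item small"):
            for a in item.find_all("a", href=True):
                if "https://doi.org" in a["href"]:
                    return a["href"]
    return None

With such a helper, the nested breaks collapse to a single pdf_url = find_doi_link(soup) call followed by a None check.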
Example #4
import requests
from bs4 import BeautifulSoup

import collector.htmls as htmls
from collector.htmls import checkpdf  # assumed location, matching Example #1


def test(url, file_path=r"C:\temp\other\test.pdf"):
    r = requests.get(url)
    c1 = c2 = c3 = ""  # defaults, in case the OSTI session cookies are missing
    try:
        c1 = r.cookies['BIGipServerlbapp_tc3']
        c2 = r.cookies['BIGipServerwww.osti.gov_pool']
        c3 = r.cookies['JSESSIONID']
    except KeyError:
        pass
    soup = BeautifulSoup(r.text, "html.parser")

    mate = soup.find("meta", {"name": "citation_pdf_url"})
    if mate is None:
        start_break = False
        for div1 in soup.find_all("div", class_="biblio-secondary-group"):
            for div2 in div1.find_all("div",
                                      class_="biblio-secondary-item small"):
                for a in div2.find_all("a"):
                    if "href" in a.attrs.keys():
                        if "https://doi.org" in a["href"]:
                            turl = a["href"]
                            cp = htmls.config_parser()
                            ht = htmls.HTML(None, None, None, None)
                            for conf in cp.get_all_conf():
                                print(conf)
                                if ht.test(conf, turl):
                                    result = ht.do_run(conf, turl)
                                    r2 = requests.get(result)
                                    r2.encoding = 'utf-8'
                                    # print(r2.text)
                                    with open(file_path, "wb") as f:
                                        f.write(r2.content)
                                    break

                            start_break = True
                            break
                if start_break:
                    break
            if start_break:
                break

    else:
        pdf_url = mate["content"]
        cookies = {
            'BIGipServerlbapp_tc3': c1,
            'BIGipServerwww.osti.gov_pool': c2,
            'JSESSIONID': c3,
            '__utma':
            '249692800.1749221367.1564467097.1564467097.1564467097.1',
            '__utmc': '249692800',
            '__utmz':
            '249692800.1564467097.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
            '_ga': 'GA1.2.1749221367.1564467097',
            '_gid': 'GA1.2.298248318.1564467099',
            '__utmt': '1',
            '__utmb': '249692800.63.10.1564467097'
        }

        r2 = requests.get(pdf_url, cookies=cookies)
        r2.encoding = 'utf-8'
        # print(r2.text)
        with open(file_path, "wb") as f:
            f.write(r2.content)
    page = checkpdf(file_path)
    print(page)
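
A minimal usage sketch for the helper above; the OSTI record URL is a hypothetical placeholder, and file_path falls back to the default from the signature:

if __name__ == "__main__":
    # Hypothetical OSTI landing page; substitute a real /biblio/<id> URL.
    test("https://www.osti.gov/biblio/1234567")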