# Exemplo n.º 1 (example 1, 0 votes — scraped-source marker)
def update_excel(excel_path=None):
    """Fill in missing author/affiliation cells in the workbook at *excel_path*.

    For each data row whose publisher is "The Korean Society of
    Gastroenterology" and whose affiliation cell is empty, fetch the
    article's abstract page (ABS_URL column), scrape the author line and
    affiliation list, and write the parsed values back into the
    AUTHOR_NAME and AFFILIATION columns. The workbook is saved in place.

    Args:
        excel_path: path to the .xlsx workbook to update (read and re-saved).
    """
    com = common_article(None)
    execl = openpyxl.load_workbook(excel_path)
    # NOTE(review): get_sheet_by_name() is deprecated (removed in openpyxl 3.x);
    # kept as-is on the assumption the pinned openpyxl version still has it.
    sheet = execl.get_sheet_by_name("sheet1")

    for index, row in enumerate(sheet.rows):
        print(index)

        excel_author = row[Row_Name.COLUME_NUM[Row_Name.AUTHOR_NAME]].value
        url = row[Row_Name.COLUME_NUM[Row_Name.ABS_URL]].value
        aff = row[Row_Name.COLUME_NUM[Row_Name.AFFILIATION]].value
        publisher = row[Row_Name.COLUME_NUM[Row_Name.PUBLISHER]].value

        # Skip the header row: its author cell holds the column title itself.
        if excel_author == Row_Name.AUTHOR_NAME:
            continue
        try:
            if (publisher == "The Korean Society of Gastroenterology"
                    and aff is None):
                data = requests.get(url)
                soup = BeautifulSoup(data.text, "html.parser")

                # Map superscript markers (e.g. "1", "2") -> affiliation text;
                # a <p> without a <sup> marker goes under the "0" key.
                aff_dict = {}
                div = soup.find("div", {"id": "conBox"})
                for p in div.find_all("p"):
                    sup = p.find("sup")
                    if sup is not None:
                        key = sup.get_text().strip()
                        sup.extract()
                        aff_dict[key] = p.get_text().strip()
                    else:
                        aff_dict["0"] = p.get_text().strip()
                    # Remove the <p> so the div's remaining text is the author line.
                    p.extract()
                print(aff_dict)
                line = div.get_text()
                # Everything before "Correspondence" is the author list.
                line = line[:line.find("Correspondence")].strip()
                au_dict = com.clear_authors(line, aff_dict.keys())
                au, em, af = com.get_author_email_aff(au_dict, {}, aff_dict)

                row[Row_Name.COLUME_NUM[Row_Name.AUTHOR_NAME]].value = au
                row[Row_Name.COLUME_NUM[Row_Name.AFFILIATION]].value = af
        except Exception:
            # Best-effort scrape: a malformed page or network error must not
            # abort the whole workbook update. Narrowed from a bare `except:`
            # (which also swallowed KeyboardInterrupt/SystemExit) and the
            # failure is now logged instead of silently discarded.
            logger.error("update_excel: failed on row %s (%s)", index, url,
                         exc_info=True)

    execl.save(excel_path)
# Exemplo n.º 2 (example 2, 0 votes — scraped-source marker)
def write_page_total(excel_path=None):
    """Populate the PAGE_TOTAL column of the workbook at *excel_path*.

    For every row whose PAGE_TOTAL cell is empty and whose local PDF file
    (stored one column after BIO) exists on disk, count the PDF's pages via
    ``common_article.checkpdf`` and write the count back. The workbook is
    saved in place.

    Args:
        excel_path: path to the .xlsx workbook to update (read and re-saved).
    """
    com = common_article(None)
    execl = openpyxl.load_workbook(excel_path)
    # NOTE(review): get_sheet_by_name() is deprecated (removed in openpyxl 3.x);
    # kept as-is on the assumption the pinned openpyxl version still has it.
    sheet = execl.get_sheet_by_name("sheet1")

    for index, row in enumerate(sheet.rows):
        # The local PDF path lives one column to the right of BIO.
        path = row[Row_Name.COLUME_NUM[Row_Name.BIO] + 1].value
        page_total = row[Row_Name.COLUME_NUM[Row_Name.PAGE_TOTAL]].value
        # Bug fix: the original called os.path.exists(path) before checking
        # path for None — os.path.exists(None) raises TypeError and aborted
        # the whole run on any row with an empty path cell.
        has_pdf = path is not None and os.path.exists(path)
        print(index, page_total, path, has_pdf)
        if page_total is None and has_pdf:
            try:
                pages = com.checkpdf(path)
                row[Row_Name.COLUME_NUM[Row_Name.PAGE_TOTAL]].value = pages
            except Exception:
                # Narrowed from bare `except:`; a corrupt PDF should not stop
                # the remaining rows from being processed.
                logger.error("执行出错!", exc_info=True)
    execl.save(excel_path)
# Exemplo n.º 3 (example 3, 0 votes — scraped-source marker; the snippet
# below starts mid-method: its enclosing `def` is not included)
        if tag_a != None:
            pdf_url = "https://journals.sagepub.com" + tag_a["href"]
            pdf_path = self.download_pdf(pdf_url, "sagepub")
            if pdf_path != None:
                info[Row_Name.FULLTEXT_URL] = pdf_url
                info[Row_Name.FULLTEXT_PDF] = pdf_path

        return info


if __name__ == '__main__':
    # Ad-hoc driver: scrape author/affiliation data from one SagePub article
    # page. NOTE(review): `urls`, `url_set` and `info` are initialised but
    # never used in the visible portion of this script — presumably used
    # further down; confirm before removing.
    urls = []
    url_set = set()
    info = {}

    com = common_article(None)
    # url = "https://journals.sagepub.com/doi/abs/10.1177/147323001204000139"
    url = "https://journals.sagepub.com/doi/abs/10.1177/147323001204000110"
    # url = "https://journals.sagepub.com/doi/full/10.1177/0300060514566649"
    data = requests.get(url)
    soup = BeautifulSoup(data.text, "html.parser")

    # Map superscript markers (e.g. "1", "2") to affiliation text; an
    # affiliation block without a <sup> marker is stored under key "0".
    # The class name "artice-info-affiliation" looks like a typo but matches
    # the site's actual markup — do not "correct" it.
    aff_dict = {}
    for sage_aff in soup.find_all("div", class_="artice-info-affiliation"):
        sage_sup = sage_aff.find("sup")
        if sage_sup != None:
            key = sage_sup.get_text().strip()
            sage_sup.extract()
            aff_dict[key] = sage_aff.get_text().strip()
        else:
            aff_dict["0"] = sage_aff.get_text().strip()