def update_excel(excel_path=None):
    """Backfill missing author/affiliation cells in an Excel workbook.

    For each row of sheet "sheet1" whose publisher is
    "The Korean Society of Gastroenterology" and whose affiliation cell is
    empty, fetch the article abstract page, scrape the author line and the
    affiliation paragraphs out of the "conBox" div, and write the parsed
    authors/affiliations back into the row.  The workbook is saved in place.

    :param excel_path: path of the workbook to read and overwrite.
    """
    com = common_article(None)
    workbook = openpyxl.load_workbook(excel_path)
    sheet = workbook.get_sheet_by_name("sheet1")
    for index, row in enumerate(sheet.rows):
        print(index)
        excel_author = row[Row_Name.COLUME_NUM[Row_Name.AUTHOR_NAME]].value
        url = row[Row_Name.COLUME_NUM[Row_Name.ABS_URL]].value
        aff = row[Row_Name.COLUME_NUM[Row_Name.AFFILIATION]].value
        publisher = row[Row_Name.COLUME_NUM[Row_Name.PUBLISHER]].value
        # Skip the header row: its author cell holds the column title itself.
        if excel_author == Row_Name.AUTHOR_NAME:
            continue
        try:
            if publisher == "The Korean Society of Gastroenterology" and aff is None:
                data = requests.get(url)
                soup = BeautifulSoup(data.text, "html.parser")
                aff_dict = {}
                div = soup.find("div", {"id": "conBox"})
                for p in div.find_all("p"):
                    sup = p.find("sup")
                    if sup is not None:
                        # Affiliation paragraph keyed by its superscript
                        # marker; remove the marker from the text itself.
                        key = sup.get_text().strip()
                        sup.extract()
                        aff_dict[key] = p.get_text().strip()
                    else:
                        # Unmarked paragraph: store under sentinel key "0".
                        aff_dict["0"] = p.get_text().strip()
                    # Remove the paragraph so the remaining div text is just
                    # the author line (read below via div.get_text()).
                    p.extract()
                print(aff_dict)
                # Whatever is left of the div is the author line; cut off the
                # trailing "Correspondence" section.
                line = div.get_text()
                line = line[:line.find("Correspondence")].strip()
                au_dict = com.clear_authors(line, aff_dict.keys())
                au, em, af = com.get_author_email_aff(au_dict, {}, aff_dict)
                row[Row_Name.COLUME_NUM[Row_Name.AUTHOR_NAME]].value = au
                row[Row_Name.COLUME_NUM[Row_Name.AFFILIATION]].value = af
        except Exception:
            # Best-effort per row: log and continue with the next row instead
            # of silently swallowing the error (was a bare `except: pass`),
            # consistent with write_page_total's error handling.
            logger.error("update_excel failed on row %s", index, exc_info=True)
    workbook.save(excel_path)
def write_page_total(excel_path=None):
    """Backfill the PAGE_TOTAL column by counting pages of local PDFs.

    For every row of sheet "sheet1" whose PAGE_TOTAL cell is empty and whose
    PDF path (the column immediately after BIO) points to an existing file,
    count the PDF's pages with common_article.checkpdf and write the count
    back.  The workbook is saved in place.

    :param excel_path: path of the workbook to read and overwrite.
    """
    com = common_article(None)
    workbook = openpyxl.load_workbook(excel_path)
    sheet = workbook.get_sheet_by_name("sheet1")
    for index, row in enumerate(sheet.rows):
        # The local PDF path lives in the column right after BIO.
        path = row[Row_Name.COLUME_NUM[Row_Name.BIO] + 1].value
        page_total = row[Row_Name.COLUME_NUM[Row_Name.PAGE_TOTAL]].value
        # Guard the existence check: the original called os.path.exists(path)
        # unconditionally inside print(), which raises TypeError and aborts
        # the whole run whenever the path cell is empty (None).
        path_exists = path is not None and os.path.exists(path)
        print(index, page_total, path, path_exists)
        if page_total is None and path_exists:
            try:
                pages = com.checkpdf(path)
                row[Row_Name.COLUME_NUM[Row_Name.PAGE_TOTAL]].value = pages
            except Exception:
                # Log and keep going; one unreadable PDF must not stop the run.
                logger.error("执行出错!", exc_info=True)
    workbook.save(excel_path)
    # NOTE(review): this chunk begins mid-method — the enclosing `def` is not
    # visible here.  `tag_a`, `info`, and `self` are defined in the missing
    # part; presumably this is the tail of a sagepub article-info scraper.
    if tag_a != None:
        # Build the absolute PDF url from the anchor's relative href, then
        # download it; record both url and local path only if the download
        # produced a file.
        pdf_url = "https://journals.sagepub.com" + tag_a["href"]
        pdf_path = self.download_pdf(pdf_url, "sagepub")
        if pdf_path != None:
            info[Row_Name.FULLTEXT_URL] = pdf_url
            info[Row_Name.FULLTEXT_PDF] = pdf_path
    return info


if __name__ == '__main__':
    # Ad-hoc manual test: scrape one sagepub abstract page and collect its
    # affiliation blocks into aff_dict.
    urls = []
    url_set = set()
    info = {}
    com = common_article(None)
    # url = "https://journals.sagepub.com/doi/abs/10.1177/147323001204000139"
    url = "https://journals.sagepub.com/doi/abs/10.1177/147323001204000110"
    # url = "https://journals.sagepub.com/doi/full/10.1177/0300060514566649"
    data = requests.get(url)
    soup = BeautifulSoup(data.text, "html.parser")
    aff_dict = {}
    # Each affiliation div may carry a superscript marker linking it to an
    # author; keyed by that marker, or by sentinel "0" when unmarked.
    for sage_aff in soup.find_all("div", class_="artice-info-affiliation"):
        sage_sup = sage_aff.find("sup")
        if sage_sup != None:
            key = sage_sup.get_text().strip()
            sage_sup.extract()
            aff_dict[key] = sage_aff.get_text().strip()
        else:
            aff_dict["0"] = sage_aff.get_text().strip()