import base64

# read_index/save_index (crawl-progress persistence), get_html (HTTP fetch)
# and link_mysql_write are project helpers; their import paths are not shown here.


def save_html(urls):
    # Resume from the last saved position so an interrupted crawl can continue.
    index_begin = read_index()
    url_list = urls[index_begin:]
    for index1, url_temp in enumerate(url_list):
        print(url_temp)
        response = get_html(url_temp)
        sql = """update criterion2018 set ThirdReport_Raw = "{0}" where ThirdReport_Url="{1}";"""
        # Base64-encode the raw HTML so quotes and newlines cannot break the SQL literal.
        ch_base64 = base64.b64encode(response[0].encode('utf-8')).decode('utf-8')
        ch_base64_sql = sql.format(ch_base64, url_temp)
        link_mysql_write(ch_base64_sql)
        # (An earlier version stored cleaned HTML via clear_html() and
        # pymysql.escape_string(), retrying with gbk encoding on failure.)
        save_index(index_begin + index1 + 1)  # persist progress after every URL
        print(index_begin + index1 + 1)
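# String-formatting values into SQL only works above because base64 output
# contains no quotes; for anything else it is fragile and injectable. A
# minimal parameterized sketch with pymysql (the connection values are
# placeholders, not the project's real settings):
import pymysql


def link_mysql_write_safe(sql, params):
    conn = pymysql.connect(host="localhost", user="root", password="***",
                           database="test", charset="utf8mb4")
    try:
        with conn.cursor() as cursor:
            # %s placeholders let the driver escape the values itself.
            cursor.execute(sql, params)
        conn.commit()
    finally:
        conn.close()

# Example:
# link_mysql_write_safe(
#     "update criterion2018 set ThirdReport_Raw=%s where ThirdReport_Url=%s",
#     (ch_base64, url_temp))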
def template(table, re_temp, website):
    """Pull raw HTML for one site out of `table`, extract the article body
    with the site-specific regex `re_temp`, and write it back."""
    sql_read_from_html = """
        select ThirdReport_Raw, ThirdReport_Url from {0}
        where ThirdReport_SiteName = "{1}";
    """.format(table, website)
    change_sql = """
        update """ + table + """ set ThirdReport_Content="{0}"
        where ThirdReport_Url="{1}";
    """
    list_content = link_mysql_read(sql_read_from_html)
    for dic in list_content:
        html_test = base64_decode(dic['ThirdReport_Raw'])
        url = dic["ThirdReport_Url"]
        try:
            article = clear_atr(re.findall(re_temp, html_test, re.S)[0])
            print(article)
            sql_c = change_sql.format(article, url)
            link_mysql_write(sql=sql_c)
        except IndexError:
            print("regex did not match")  # 正则出现问题
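# Example call (the regex is illustrative, not a real page pattern):
# template(
#     table="recall2018",
#     re_temp='<div class="article-content">(.*?)</div>',
#     website="美国食品药品管理局网站",
# )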
import re

from linkmysql import link_mysql_read, link_mysql_write
from save_html_to_mysql import base64_decode
from filiter_infomation.clearAttr import clear_atr

# FDA (美国食品药品管理局网站) recall pages: decode the stored raw HTML and
# cut the article body out with a layout-specific regex.
sql_read_from_html = """
    select ThirdReport_Raw, ThirdReport_Url from recall2018
    where ThirdReport_SiteName = "美国食品药品管理局网站";
"""
change_sql = """
    update recall2018 set ThirdReport_Content="{0}"
    where ThirdReport_Url="{1}";
"""
list_content = link_mysql_read(sql_read_from_html)
for dic in list_content:
    html_test = base64_decode(dic['ThirdReport_Raw'])
    url = dic["ThirdReport_Url"]
    try:
        article = clear_atr(re.findall(
            '<div class="col-md-9">(.*)'
            '<p style="margin-bottom:0; letter-spacing: .125em; text-align: center;">',
            html_test, re.S)[0])
        sql_c = change_sql.format(article, url)
        link_mysql_write(sql=sql_c)
    except IndexError:
        print(url)  # log URLs whose page layout did not match
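# For reference, the helpers imported from save_html_to_mysql presumably
# mirror the inline encoding in save_html() above; a minimal sketch,
# assuming UTF-8 throughout (kept commented out so the real imports are
# not shadowed):
#
# import base64
#
# def base64_encode(text):
#     return base64.b64encode(text.encode('utf-8')).decode('utf-8')
#
# def base64_decode(text):
#     return base64.b64decode(text.encode('utf-8')).decode('utf-8')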
import datetime
import re

import requests
from bs4 import BeautifulSoup

from linkmysql import link_mysql_write
# base64_encode and ChangeTime are project helpers; their modules are not shown here.

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
website_name = "疾病控制保护中心"  # U.S. Centers for Disease Control and Prevention
for i in range(4, len(urls)):  # start from index 4
    response = requests.get(url=urls[i], headers=header)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "lxml")
    create_time = re.findall('<span itemprop="dateModified">(.*?)</span>', response.text)
    print(create_time)
    ct = ChangeTime(create_time[0], "%B %d, %Y").mysql_time(3)
    print(ct)
    href = urls[i]
    title = soup.title.text
    print(title)
    datetime_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(i)
    sql_raw = """update epidemic2018 set ThirdReport_Title="{0}", ThirdReport_Url="{1}",
        ThirdReport_CreateTime="{2}", ThirdReport_GetherTime="{3}", ThirdReport_SiteName="{4}",
        ThirdReport_InfoType="{5}", ThirdReport_Raw="{6}" where ThirdReport_Url="{7}";"""
    if "2018" in ct:  # only keep 2018 records
        a1 = sql_raw.format(title[:200], href, ct, datetime_now, website_name,
                            "标准变更", base64_encode(response.text), urls[i])
        print(a1)
        link_mysql_write(a1)
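# ChangeTime is a project helper; a hypothetical re-implementation of the
# conversion used above ("December 21, 2018" -> "2018-12-21 00:00:00"),
# assuming mysql_time(3) means "render for a MySQL DATETIME column"
# (kept commented out so the real helper is not shadowed):
#
# import datetime
#
# class ChangeTime:
#     def __init__(self, raw, fmt):
#         self.dt = datetime.datetime.strptime(raw, fmt)
#
#     def mysql_time(self, _precision):
#         return self.dt.strftime('%Y-%m-%d %H:%M:%S')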
# Assumes linkmysql also exposes the shared connection object:
from linkmysql import link_mysql_read, link_mysql_write, connection

# Statements that add columns to an existing table.
# (TEXT/BLOB columns cannot take a DEFAULT in MySQL, so none is given there.)
sql = """
alter table {} add (
    `title` varchar(1200) DEFAULT NULL,
    `product` varchar(300) DEFAULT NULL,
    `brands` varchar(300) DEFAULT NULL,
    `Manufacturer` varchar(300) DEFAULT NULL,
    `Packageranddistributor` varchar(300) DEFAULT NULL,
    `Numbers` int(15) DEFAULT NULL,
    `html` MEDIUMTEXT
);
"""
sql_html_content = """
alter table {} add (
    `htmlContent` MEDIUMBLOB
);
"""
sql_show_all = "show tables;"  # list every table in the current database

# The "Tables_in_test" key means the connection points at a database named `test`.
table_names = [name["Tables_in_test"] for name in link_mysql_read(sql_show_all)]

if __name__ == '__main__':
    for table_name in table_names:
        link_mysql_write(sql_html_content.format(table_name))
    connection.close()
    print("all tables were processed")
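# link_mysql_read / link_mysql_write live in the project's linkmysql module.
# A minimal sketch consistent with how they are used in these scripts
# (reads return a list of dicts, hence the DictCursor; credentials are
# placeholders), kept commented out as reference:
#
# import pymysql
#
# connection = pymysql.connect(host="localhost", user="root", password="***",
#                              database="test", charset="utf8mb4",
#                              cursorclass=pymysql.cursors.DictCursor)
#
# def link_mysql_read(sql):
#     with connection.cursor() as cursor:
#         cursor.execute(sql)
#         return cursor.fetchall()
#
# def link_mysql_write(sql):
#     with connection.cursor() as cursor:
#         cursor.execute(sql)
#     connection.commit()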