def save_html(urls):
    """Download each URL and store its base64-encoded body in criterion2018.

    Resumes from the index persisted by a previous run (read_index) and
    checkpoints progress after every page (save_index), so an interrupted
    run continues where it stopped instead of re-downloading.

    Args:
        urls: sequence of URL strings; entries before the saved index are
            skipped.
    """
    index_begin = read_index()  # resume point persisted by a previous run
    for offset, url_temp in enumerate(urls[index_begin:]):
        print(url_temp)
        response = get_html(url_temp)

        # NOTE(review): SQL built with str.format. The payload is base64
        # (quote-free), but the URL is interpolated raw — prefer
        # parameterized queries if link_mysql_write supports them.
        sql = """update criterion2018 set ThirdReport_Raw = "{0}" where ThirdReport_Url="{1}";"""

        # get_html presumably returns (body, ...) — body is element 0.
        ch_base64 = base64.b64encode(response[0].encode('utf-8')).decode('utf-8')
        link_mysql_write(sql.format(ch_base64, url_temp))

        # Checkpoint immediately so a crash does not repeat earlier pages.
        save_index(index_begin + offset + 1)
        print(index_begin + offset + 1)
def template(table, re_temp, website):
    """Extract article bodies from stored raw HTML and write them back.

    For every (ThirdReport_Raw, ThirdReport_Url) row of ``table`` whose
    ThirdReport_SiteName equals ``website``, decode the base64 HTML, take
    the first match of ``re_temp`` (DOTALL), clean it with clear_atr and
    store it as ThirdReport_Content.

    Args:
        table: table name, interpolated into the SQL — trusted input only.
        re_temp: regex whose first match is the article body.
        website: value matched against ThirdReport_SiteName.
    """
    # NOTE(review): table/website go into the SQL via str.format —
    # parameterized queries would be safer for untrusted input.
    sql_read_from_html = """
    select ThirdReport_Raw,ThirdReport_Url from {0} where ThirdReport_SiteName = "{1}"; 
    """.format(table, website)

    change_sql = """
    update """ + table + """ set ThirdReport_Content="{0}" where ThirdReport_Url="{1}";
    """

    for row in link_mysql_read(sql_read_from_html):
        html_text = base64_decode(row['ThirdReport_Raw'])
        url = row["ThirdReport_Url"]
        try:
            article = clear_atr(re.findall(re_temp, html_text, re.S)[0])
            print(article)
            link_mysql_write(sql=change_sql.format(article, url))
        except Exception:
            # No regex match (IndexError) or cleanup/write failure.
            # Was a bare except; narrowed so Ctrl-C still interrupts.
            print("正则出现问题")
from linkmysql import link_mysql_read, link_mysql_write
from save_html_to_mysql import base64_decode
from bs4 import BeautifulSoup
import re
import requests
from filiter_infomation.clearAttr import clear_atr

# Script: pull stored FDA ("美国食品药品管理局网站") pages from recall2018,
# extract the article body and write it back as ThirdReport_Content.
sql_read_from_html = """
select ThirdReport_Raw,ThirdReport_Url from recall2018 where ThirdReport_SiteName = "美国食品药品管理局网站"; 
"""

change_sql = """
update recall2018 set ThirdReport_Content="{0}" where ThirdReport_Url="{1}";
"""

list_content = link_mysql_read(sql_read_from_html)

# Compile once (loop-invariant): body sits between the col-md-9 div and
# the centered footer paragraph.  Pattern bytes identical to the original
# backslash-continued literal.
article_re = re.compile(
    '<div class="col-md-9">(.*)'
    '<p style="margin-bottom:0; letter-spacing: .125em; text-align: center;">',
    re.S)

for row in list_content:
    html_test = base64_decode(row['ThirdReport_Raw'])
    url = row["ThirdReport_Url"]
    try:
        article = clear_atr(article_re.findall(html_test)[0])
        link_mysql_write(sql=change_sql.format(article, url))
    except Exception:
        # No regex match or write failure; log the URL for manual review.
        # Was a bare except; narrowed so Ctrl-C still interrupts.
        print(url)
# --- Пример #4 (Example #4) — separator left over from the code-sharing site scrape ---
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

website_name = "疾病控制保护中心"  # site label: CDC ("Centers for Disease Control")

# Hoisted out of the loop (loop-invariant UPDATE template).
# NOTE(review): values are interpolated with str.format — the raw blob is
# base64 (quote-free) but title/url are raw; parameterized SQL preferred.
sql_raw = """update epidemic2018 set ThirdReport_Title="{0}", ThirdReport_Url="{1}", ThirdReport_CreateTime="{2}",\
    ThirdReport_GetherTime="{3}", ThirdReport_SiteName="{4}", ThirdReport_InfoType="{5}", ThirdReport_Raw="{6}" \
    where ThirdReport_Url="{7}";
    """

# Starts at index 4 — presumably the first four URLs were already done;
# TODO confirm against how `urls` is built.
for i in range(4, len(urls)):
    href = urls[i]
    response = requests.get(url=href, headers=header)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "lxml")
    create_time = re.findall('<span itemprop="dateModified">(.*?)</span>',
                             response.text)
    print(create_time)
    # Raises IndexError if the page has no dateModified span (unhandled,
    # as in the original).
    ct = ChangeTime(create_time[0], "%B %d, %Y").mysql_time(3)
    print(ct)

    title = soup.title.text
    print(title)
    datetime_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(i)
    # Only persist 2018 articles.
    if "2018" in ct:
        a1 = sql_raw.format(title[:200], href, ct, datetime_now, website_name,
                            "标准变更", base64_encode(response.text), href)
        print(a1)
        link_mysql_write(a1)
# --- Пример #5 (Example #5) — separator left over from the code-sharing site scrape ---
# Statement adding the report metadata columns (增加字段语句)
# ALTER TABLE template: adds the scraped-report metadata columns.
sql = """
alter table {} add (
    `title` varchar(1200) DEFAULT NULL,
    `product` varchar(300) DEFAULT NULL,
    `brands` varchar(300) DEFAULT NULL,
    `Manufacturer` varchar(300) DEFAULT NULL,
    `Packageranddistributor` varchar(300) DEFAULT NULL,
    `Numbers` int(15) DEFAULT NULL,
    `html` MediumText DEFAULT NULL
);
"""

# ALTER TABLE template: adds a single blob column for the raw page HTML.
sql_html_content = """
alter table {} add (
    `htmlContent` MediumBlob DEFAULT NULL
);
"""

sql_show_all = "show tables;"
# Names of every table in the current database ("test").
table_names = [row["Tables_in_test"] for row in link_mysql_read(sql_show_all)]


if __name__ == '__main__':
    # Add the htmlContent column to every table, then close the connection.
    for tbl in table_names:
        link_mysql_write(sql_html_content.format(tbl))

    connection.close()
    print("all tables was processed")