Example #1
import content_per_article
import time
from database import Mysql
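# Top-level scraper: walks pages 0..8 of the article list and inserts every
# article not yet stored into the `xungen` MySQL table.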

connDB1 = Mysql.connDB()  # (connection, cursor) pair
Mysql.exeSearch(connDB1[1])
ids = connDB1[1].fetchall()  # ids of the articles already stored
p = 0  # current page index
n = 0  # count of newly inserted articles
while p <= 8:  # loop over every page; the real page number is p + 1
    s = 0
    a = content_per_article.get_url_title(p)
    if a[0] != []:  # skip this page if every article on it is already stored
        while s < content_per_article.x:  # loop over each article on the page
            # skip articles that already exist in the database and articles
            # that came back with no content
            if (a[0][s], ) in ids or a[7][s] == '':
                s += 1
                continue
            else:
                sql = "INSERT INTO xungen(id,title,subtitle,summary,content,picurl,rid,author,create_time,public_time,update_time,isanonymous,content_imgs,city)VALUES"
                sql1 = sql + '("' + str(a[0][s]) + '","' + str(
                    a[1][s]
                ) + '","' + str(a[2][s]) + '","' + str(a[3][s]) + '","' + str(
                    a[4][s]) + '","' + str(a[5][s]) + '","' + str(
                        a[6]) + '","' + str(a[7][s]) + '","' + str(
                            a[8][s]) + '","' + str(a[9][s]) + '","' + str(
                                a[10][s]) + '","' + str(a[11]) + '","' + str(
                                    a[12][s]) + '","' + str(a[13]) + '")'
                Mysql.exeUpdate(connDB1[0], connDB1[1], sql1)
                s += 1
                n += 1
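
# A sketch (an assumption, not the author's code): the string concatenation
# above breaks whenever a value contains a straight double quote, which is
# why the scraper rewrites '"' as '”' and calls pymysql.escape_string().
# Letting the driver fill %s placeholders avoids both workarounds. Assumes a
# plain pymysql (connection, cursor) pair such as the one Mysql.connDB()
# appears to return; `row` is the same 14-value tuple built above.
def insert_article_row(conn, cursor, row):
    sql = ("INSERT INTO xungen(id,title,subtitle,summary,content,picurl,rid,"
           "author,create_time,public_time,update_time,isanonymous,"
           "content_imgs,city) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,"
           "%s,%s)")
    cursor.execute(sql, row)  # the driver quotes and escapes every value
    conn.commit()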

# content_per_article.py: the module imported by the scraper above. It needs
# its own imports, which the original listing omitted.
import requests
import time
import pymysql
from bs4 import BeautifulSoup

import content_per_page
from database import Mysql


def get_url_title(p):
    global x  # x counts how many articles were scraped on this page
    x = 0
    id = []
    title = []
    subtitle = []
    summary = []
    content = []
    picurl = []
    rid = ''
    author = []
    create_time = []
    public_time = []
    update_time = []
    isanonymous = ''
    content_imgs = []
    city = ''
    Page_con = content_per_page.get_page(p)
    connDB1 = Mysql.connDB()
    Mysql.exeSearch(connDB1[1])
    aids = connDB1[1].fetchall()
    for item in Page_con:
        res = requests.get(item['link'])
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'lxml')

        # skip if the article is already in the database or its page no
        # longer exists (there is no #post-user element to scrape); padding
        # `author` here would knock it out of step with the other lists
        if (item['aid'], ) in aids or soup.select('#post-user') == []:
            time.sleep(1)
            continue
        else:
            id.append(item['aid'])

            # swap straight double quotes for full-width ones so the values
            # cannot break the hand-built INSERT string
            title.append(item['title'].replace('"', '”'))

            subtitle = title  # subtitle mirrors title (same list object)

            summary.append(item['digest'].replace('"', '”'))

            # keep the article body (the <p> tags minus page boilerplate at
            # the start and end) and escape it for the hand-built SQL
            content.append(
                pymysql.escape_string('\n'.join(
                    str(tag) for tag in soup.select('p')[2:-6])))

            picurl.append(item['cover'])

            rid = '0'

            author.append(soup.select('#post-user')[0].text)

            # the publish date lives in #post-date on some pages and in
            # #publish_time on others
            if soup.select('#post-date') == []:
                create_time.append(soup.select('#publish_time')[0].text)
            else:
                create_time.append(soup.select('#post-date')[0].text)

            public_time = create_time  # public_time mirrors create_time

            update_time.append(item['update_time'])

            isanonymous = 'No'

            # collect the lazy-loaded image URLs held in data-src attributes
            content_img = []
            for img in soup.select('img'):
                if img.has_attr("data-src"):
                    content_img.append(img['data-src'])
            content_imgs.append(';'.join(content_img))

            city = 'Macheng'
            x += 1
            time.sleep(5)  # throttle: pause five seconds between articles
    return (id, title, subtitle, summary, content, picurl, rid, author,
            create_time, public_time, update_time, isanonymous,
            content_imgs, city)
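
# A sketch of the unshown database.Mysql helper (database.py), inferred only
# from its call sites above. The connection parameters and the exact SELECT
# statement are assumptions.
import pymysql


class Mysql:
    @staticmethod
    def connDB():
        # used above as a (connection, cursor) pair:
        # connDB1[0] is the connection, connDB1[1] the cursor
        conn = pymysql.connect(host='localhost', user='root',
                               password='secret', db='xungen_db',
                               charset='utf8mb4')
        return conn, conn.cursor()

    @staticmethod
    def exeSearch(cursor):
        # after this, fetchall() yields one-column tuples such as ('aid',),
        # matching the `(item['aid'], ) in aids` membership tests above
        cursor.execute("SELECT id FROM xungen")

    @staticmethod
    def exeUpdate(conn, cursor, sql):
        cursor.execute(sql)
        conn.commit()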