Example #1
def queryList(self):
    # Load every article row and map each result tuple onto an Article object.
    sql = u'select id, title, content, contentHead from web_information'
    self.cur.execute(sql)
    res = self.cur.fetchall()
    articles = []
    for item in res:
        article = Article()
        article.id = item[0]
        article.title = item[1]
        article.content = item[2]
        article.contentHead = item[3]
        articles.append(article)
    return articles
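
queryList assumes an enclosing DAO-style class that owns a live database cursor in self.cur; that class is not part of the example. A minimal sketch of the assumed context, using pymysql with hypothetical connection parameters:

import pymysql

class ArticleDao:
    # Hypothetical wrapper; only self.cur matters here. queryList from
    # Example #1 would be defined on this class exactly as shown above.
    def __init__(self):
        conn = pymysql.connect(host='localhost', user='root',
                               password='secret', database='web',
                               charset='utf8mb4')
        self.cur = conn.cursor()

# Usage: articles = ArticleDao().queryList()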
Example #2
import random
import time
import traceback

import requests
from bs4 import BeautifulSoup

# url, titles, types, headers, Article, Mysql, common, parseContentHead and
# parseArticle are module-level names defined elsewhere in this project.


def parseListHtml(page, titleindex):
    # Fetch one listing page, persist its articles, then recurse to the next
    # page (or to the first page of the next title once a page is missing).
    next_page = {'page': page, 'title': titleindex}
    common.save_now_page(next_page)
    mysql = Mysql()
    s = ''
    if page > 1:
        s = '_' + repr(page)
    print(url.format(titles[titleindex], s))
    try:
        response = requests.get(url.format(titles[titleindex], s),
                                headers=headers,
                                timeout=10)
        response.encoding = 'gb2312'
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            re_coms = soup.find_all('ul', attrs={'class': 'recom_list'})
            articles = []
            for re_com in re_coms:
                article = Article(re_com.a.string,
                                  re_com.find('span', attrs={'class': 'gd1'}).a.attrs['href'])
                article.author = 'OK学车'
                article.contentHead = parseContentHead(
                    re_com.find('li', attrs={'class': 'recom_nr'}).text)
                article.type = types[titles[titleindex]]
                articles.append(article)
            parseArticle(articles)
            # Save the parsed articles to the database
            mysql.insert_array(articles)
            mysql.close()
            # common.save_file(titles[titleindex], 'page {0}'.format(page), repr(common.convert_to_dicts(articles)))
            sleep_time = random.randint(5, 10)
            print('Sleeping', sleep_time, 's before fetching the next page')
            time.sleep(sleep_time)
            parseListHtml(page + 1, titleindex)
        else:
            mysql.close()
            if titleindex + 1 < len(titles):
                parseListHtml(1, titleindex + 1)
    except Exception as e:
        print(traceback.format_exc())
        print('Page fetch failed:', e)
        mysql.close()
        sleep_time = random.randint(1, 5)
        print(repr(sleep_time), 's until retry')
        time.sleep(sleep_time)
        parseListHtml(page, titleindex)  # retry the same page after the transient error
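
A design caveat worth noting: parseListHtml recurses once per page fetched, so a long crawl can exhaust Python's default recursion limit of roughly 1000 frames. A loop-based sketch of the same control flow, assuming the same module-level names (url, titles, headers, requests, time, random); the function name parseListLoop and the elided parse/save step are placeholders:

def parseListLoop(titleindex=0):
    # Iterative variant of parseListHtml: one loop over titles, one over
    # pages, so the call stack stays flat however many pages are crawled.
    while titleindex < len(titles):
        page = 1
        while True:
            s = '' if page == 1 else '_' + repr(page)
            try:
                response = requests.get(url.format(titles[titleindex], s),
                                        headers=headers, timeout=10)
            except Exception:
                time.sleep(random.randint(1, 5))
                continue  # transient network error: retry the same page
            if response.status_code != 200:
                break  # no more pages for this title
            # ... parse, save and sleep exactly as in parseListHtml ...
            page += 1
        titleindex += 1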