def queryList(self):
    """Load every record of web_information as a list of Article objects.

    Executes a SELECT over the open cursor (`self.cur`) and maps each row
    to an Article, populating id, title, content and contentHead from the
    query columns in that order.

    Returns:
        list: one Article per fetched row.
    """
    sql = u'select id,title,content,contentHead from web_information'
    self.cur.execute(sql)
    result = []
    # Unpack each row positionally; column order matches the SELECT list.
    for row_id, row_title, row_content, row_head in self.cur.fetchall():
        entry = Article()
        entry.id = row_id
        entry.title = row_title
        entry.content = row_content
        entry.contentHead = row_head
        result.append(entry)
    return result
def parseListHtml(page, titleindex, retries=3):
    """Crawl one listing page of a title category, then recurse onward.

    Fetches page `page` of the category `titles[titleindex]`, parses every
    recommended entry into an Article (filling bodies via parseArticle),
    persists the batch through Mysql, sleeps a random interval, and recurses
    to the next page.  A non-200 status is treated as "category exhausted"
    and the crawl moves to the first page of the next category.

    Args:
        page: 1-based page number within the current category.
        titleindex: index into the module-level `titles` list.
        retries: remaining retry attempts for *this* page after a network
            error.  New parameter with a default value, so existing callers
            are unaffected.
    """
    # Persist the crawl position so an interrupted run can resume here.
    common.save_now_page({'page': page, 'title': titleindex})
    mysql = Mysql()
    # Page 1 has no suffix in the URL; later pages use "_<n>".
    suffix = '' if page <= 1 else '_' + repr(page)
    print(url.format(titles[titleindex], suffix))
    try:
        response = requests.get(url.format(titles[titleindex], suffix),
                                headers=headers, timeout=10)
        # Site serves GB2312-encoded pages; decode explicitly.
        response.encoding = 'gb2312'
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            re_coms = soup.find_all('ul', attrs={'class': 'recom_list'})
            articles = []
            for re_com in re_coms:
                article = Article(
                    re_com.a.string,
                    re_com.find('span', attrs={'class': 'gd1'}).a.attrs['href'])
                article.author = 'OK学车'
                article.contentHead = parseContentHead(
                    re_com.find('li', attrs={'class': 'recom_nr'}).text)
                article.type = types[titles[titleindex]]
                articles.append(article)
            parseArticle(articles)
            # Save the batch to the database.
            mysql.insert_array(articles)
            mysql.close()
            # Random pause between pages to avoid hammering the server.
            sleep_time = random.randint(5, 10)
            print('休息', sleep_time, 's后再获取')
            time.sleep(sleep_time)
            parseListHtml(page + 1, titleindex)
        else:
            # Non-200: assume the category has no more pages; move to the next.
            mysql.close()
            if titleindex + 1 < len(titles):
                parseListHtml(1, titleindex + 1)
    except Exception as e:
        print(traceback.format_exc())
        print('网页获取失败:', e)
        # NOTE(review): if the failure happened after mysql.close() above,
        # this is a second close — presumably Mysql.close tolerates that;
        # preserved from the original behavior.
        mysql.close()
        sleep_time = random.randint(1, 5)
        print(repr(sleep_time), 's后重新获取')
        time.sleep(sleep_time)
        # BUG FIX: the original advanced to page + 1 here, silently skipping
        # the failed page even though the log promises a re-fetch.  Retry the
        # same page while attempts remain, then skip forward so a permanently
        # broken page cannot cause an infinite retry loop.
        if retries > 0:
            parseListHtml(page, titleindex, retries - 1)
        else:
            parseListHtml(page + 1, titleindex)