def get_book_html(self, book_abs):
    """Look up the stored page HTML for one book.

    Runs a query against the ``book_abs`` table for the row whose
    ``book_id`` equals ``book_abs["book_id"]`` and returns the ``html``
    column of the first row after passing it through ``filter_r_and_n``
    (defined elsewhere; presumably strips CR/LF characters -- confirm).

    Returns ``False`` when the query yields no rows.
    """
    # NOTE(review): the id is interpolated straight into the SQL text. If
    # book_id can ever come from outside the database, switch to whatever
    # parameter binding self.mysql.query supports.
    statement = "select html from book_abs where book_id='%s';" % (book_abs["book_id"])
    records = self.mysql.query(statement, ["html"])

    # Only the first record matters; a private sentinel distinguishes
    # "no rows" from any value a row could legitimately be.
    nothing = object()
    first = next(iter(records), nothing)
    if first is nothing:
        return False
    return filter_r_and_n(first["html"])
def prase_book(self, book, html):
    """Fill ``book`` with fields scraped from a product page's HTML.

    Each field is located with a regular expression; a field whose pattern
    does not match is simply left untouched in ``book``. Extracted text is
    cleaned with ``filter_tags`` and ``filter_r_and_n`` (both defined
    elsewhere; presumably they strip HTML tags and CR/LF -- confirm).

    Keys that may be set:
        name, author, press, publictime, price, sell_price, print,
        language, isbn, pagecnt, menu, authordesc, desc, meidum, award.

    Args:
        book: dict that is updated in place.
        html: page markup to search. The patterns use ``.`` without
            DOTALL, so a section only matches when it sits on one line.

    Returns:
        The same ``book`` dict that was passed in.
    """

    def _first_group(pattern, text):
        """Return group 1 of the first match of ``pattern`` in ``text``, else None."""
        found = re.search(pattern, text)
        return found.group(1) if found else None

    def _clean_item(fragment):
        """Strip markup and line breaks, then drop the ' / ' label separator."""
        return filter_r_and_n(filter_tags(fragment)).replace(" / ", "")

    # Title.
    title = _first_group(u'<h1>(.*?)</h1>', html)
    if title is not None:
        book["name"] = filter_r_and_n(filter_tags(title))

    # Labelled items inside the product-info block. They are only looked up
    # when the enclosing block itself was found.
    info_items = (
        ("author", u'<h3 class="PI_item">作者(.*?)</h3>'),          # author
        ("press", u'<h3 class="PI_item">出版社(.*?)</h3>'),         # publisher
        ("publictime", u'<h3 class="PI_item">出版日期(.*?)</h3>'),  # publication date
        ("price", u'<h3 class="PI_item">定價(.*?)</h3>'),           # list price
        ("sell_price", u'<h3 class="PI_item">售價(.*?)</h3>'),      # sale price
        ("print", u'class="PI_item">裝訂(.*?)<'),                   # binding
        ("language", u'class="PI_item">商品語言(.*?)<'),            # product language
    )
    info_html = _first_group(u'<div class="PI_info">(.*?)</div>', html)
    if info_html is not None:
        for key, pattern in info_items:
            raw = _first_group(pattern, info_html)
            if raw is not None:
                book[key] = _clean_item(raw)

    # "Details" box: ISBN-13 and page count are read from its plain text.
    details = _first_group(u'<div class="C_box"><h2>詳細資料</h2>(.*?)</div>', html)
    if details is not None:
        details = filter_tags(details).replace("\t", "")
        detail_items = (
            ("isbn", u'ISBN 13 /(\\d+)'),
            ("pagecnt", u'頁數/(\\d+)'),
        )
        for key, pattern in detail_items:
            digits = _first_group(pattern, details)
            if digits is not None:
                book[key] = digits

    # Long-form sections. Each lives in its own C_box div whose id suffix and
    # inline display style are fixed; the section heading text is removed
    # from the extracted body.
    section_pattern = (
        u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_%s"'
        u' class="C_box" style="display:%s;">(.*?)</div>'
    )
    sections = (
        # (book key, div id suffix, display style, heading to remove)
        ("menu", "catelog", "none", "本書目錄"),            # table of contents
        ("authordesc", "all_character", "none", "作者介紹"),  # about the author
        ("desc", "introduction", "block", "內容簡介"),       # description
        # NOTE: "meidum" is a misspelling of "medium"; kept because callers
        # may already read this key.
        ("meidum", "medium", "none", "媒體推薦"),            # media recommendations
        ("award", "award", "none", "得獎紀錄"),              # awards
    )
    for key, div_id, display, heading in sections:
        body = _first_group(section_pattern % (div_id, display), html)
        if body is not None:
            book[key] = filter_tags(body).replace(heading, "")

    return book