def generate_tags(request):
    """Return tags generated from the posted content, wrapped as JSONP.

    Reads two POST fields: ``contents`` (the text handed to the configured
    tag generator) and ``callback`` (the JavaScript function name the JSON
    payload is wrapped in).

    Returns:
        An ``HttpResponse`` with
        * status 501 when either POST field is missing or
          ``TAGGIT_TAG_GENERATE_FUNC`` is ``None``;
        * status 400 when ``callback`` is not a plain (optionally dotted)
          JavaScript identifier;
        * otherwise the body ``<callback>(<json tags>)``.
    """
    import re

    if 'contents' not in request.POST or 'callback' not in request.POST:
        return HttpResponse(status=501)

    from taggit.settings import TAGGIT_TAG_GENERATE_FUNC
    if TAGGIT_TAG_GENERATE_FUNC is None:
        return HttpResponse(status=501)

    # The callback is echoed into an executable response, so accept only
    # identifier-like names (e.g. ``cb`` or ``jQuery123.cb``).  ``\Z`` is
    # used instead of ``$`` so a trailing newline is rejected as well.
    callback = request.POST['callback']
    identifier = r'[A-Za-z_$][0-9A-Za-z_$]*'
    if not re.match(r'%s(?:\.%s)*\Z' % (identifier, identifier), callback):
        return HttpResponse(status=400)

    # Generate tags from the content
    content = request.POST['contents']
    tags = TAGGIT_TAG_GENERATE_FUNC(content)

    # Filter out tags that are in our 'bad' list
    # stub
    tags = utils.filter_tags(tags)

    data = simplejson.dumps(tags)
    # NOTE(review): JSONP is conventionally served as application/javascript;
    # the content type is left unchanged here for existing clients.
    return HttpResponse(u'%s(%s)' % (callback, data),
                        content_type='application/json')
def parse_article(self):
    """Extract title, author, time and body from ``self.tree``.

    Returns:
        dict with the keys ``article_title``, ``author_name``,
        ``article_time`` and ``article_content``.  The first three are the
        first matching text node encoded as ISO-8859-1; the title falls
        back to the literal ``'no_title'`` when no title node exists.
        ``article_content`` is the ``article-info`` div serialised as HTML
        and passed through ``filter_tags``.

    Raises:
        IndexError: when the author, time or content XPath matches nothing.
    """
    codec = 'ISO-8859-1'

    # Title is the only field with a fallback value.
    title_nodes = self.tree.xpath('//p[@class="article-title"]/text()')
    if title_nodes:
        article_title = title_nodes[0].encode(codec)
    else:
        article_title = 'no_title'

    # Author name: either of the two anchor layouts, first hit wins.
    author_nodes = self.tree.xpath(
        '//a[contains(@class, "author-name author-info__detail")]/text()'
        ' | //a[@class="author-info__detail"]//text()'
    )
    author_name = author_nodes[0].encode(codec)

    # Article time.
    time_nodes = self.tree.xpath('//p[@class="author-info__detail"]/text()')
    article_time = time_nodes[0].encode(codec)

    # Article content: serialise the container element, then hand the
    # markup to filter_tags.
    container = self.tree.xpath('//div[@class="article-info"]')[0]
    markup = etree.tostring(container, encoding=codec, method="html")
    article_content = filter_tags(markup)

    return {
        'article_title': article_title,
        'author_name': author_name,
        'article_time': article_time,
        'article_content': article_content,
    }
def prase_book(self, book, html):
    """Fill ``book`` with the fields scraped from a product page.

    Every field is optional: a key is written only when its pattern
    matches, otherwise ``book`` is left untouched for that key.  The
    patterns use ``.`` without ``re.DOTALL``, so a block is only found
    when it does not span a newline in ``html``.

    Args:
        book: Mutable mapping updated in place.  Keys that may be set:
            ``name``, ``author``, ``press``, ``publictime``, ``price``,
            ``sell_price``, ``print``, ``language``, ``isbn``,
            ``pagecnt``, ``menu``, ``authordesc``, ``desc``,
            ``meidum`` (sic) and ``award``.
        html: Page markup to search.

    Returns:
        The same ``book`` object that was passed in.
    """
    def first_group(pattern, text):
        # Capture group 1 of the first match, or None when nothing matches.
        found = re.search(pattern, text)
        return found.group(1) if found else None

    def clean(fragment):
        # Cleanup chain shared by the title and the info-box fields.
        return filter_r_and_n(filter_tags(fragment))

    # Book title.
    title = first_group(u'<h1>(.*?)</h1>', html)
    if title is not None:
        book["name"] = clean(title)

    # Info box: (book key, pattern searched inside the box).
    info_fields = (
        ("author", u'<h3 class="PI_item">作者(.*?)</h3>'),          # author
        ("press", u'<h3 class="PI_item">出版社(.*?)</h3>'),         # publisher
        ("publictime", u'<h3 class="PI_item">出版日期(.*?)</h3>'),  # publication date
        ("price", u'<h3 class="PI_item">定價(.*?)</h3>'),           # list price
        ("sell_price", u'<h3 class="PI_item">售價(.*?)</h3>'),      # selling price
        ("print", u'class="PI_item">裝訂(.*?)<'),                   # binding
        ("language", u'class="PI_item">商品語言(.*?)<'),            # language
    )
    info_box = first_group(u'<div class="PI_info">(.*?)</div>', html)
    if info_box is not None:
        # Only look for the sub-fields when the box itself was found, so
        # they are never searched in missing or stale text.
        for key, pattern in info_fields:
            raw = first_group(pattern, info_box)
            if raw is not None:
                book[key] = clean(raw).replace(" / ", "")

    # "Details" box: ISBN and page count.
    details = first_group(
        u'<div class="C_box"><h2>詳細資料</h2>(.*?)</div>', html)
    if details is not None:
        details = filter_tags(details).replace("\t", "")
        detail_fields = (
            ("isbn", u'ISBN 13 /(\\d+)'),
            ("pagecnt", u'頁數/(\\d+)'),
        )
        for key, pattern in detail_fields:
            value = first_group(pattern, details)
            if value is not None:
                book[key] = value

    # Long-text sections: (book key, pattern, heading text removed from
    # the cleaned value).  All of them are parsed; the function returns
    # only after the last one.
    sections = (
        # table of contents
        ("menu",
         u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_catelog" class="C_box" style="display:none;">(.*?)</div>',
         "本書目錄"),
        # about the author
        ("authordesc",
         u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_all_character" class="C_box" style="display:none;">(.*?)</div>',
         "作者介紹"),
        # content introduction
        ("desc",
         u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_introduction" class="C_box" style="display:block;">(.*?)</div>',
         "內容簡介"),
        # media recommendations
        ("meidum",
         u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_medium" class="C_box" style="display:none;">(.*?)</div>',
         "媒體推薦"),
        # awards
        ("award",
         u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_award" class="C_box" style="display:none;">(.*?)</div>',
         "得獎紀錄"),
    )
    for key, pattern, heading in sections:
        raw = first_group(pattern, html)
        if raw is not None:
            book[key] = filter_tags(raw).replace(heading, "")

    return book
# -*- coding: utf-8 -*-
# Scratch script: shows a plain regex substitution of </p> next to the
# output of utils.filter_tags on the same sample markup.
import re

from utils import filter_tags, clean_html, replace_charentity

html = """ aaa<p>11111111111</p>sss """

if __name__ == '__main__':
    # Replace every closing paragraph tag (any letter case) with a newline.
    re_pp = re.compile('</p>', re.I)
    s = re_pp.sub('\n', html)
    # Single-argument print(...) prints the same text on Python 2 and is
    # valid syntax on Python 3, unlike the bare print statement.
    print(s)
    print(filter_tags(html))