def _fetch_soup(url):
    """Download *url* and return the page parsed with ``html.parser``."""
    # Function-local import so this block does not depend on the file's
    # top-level import list.
    from contextlib import closing

    # urllib2 responses are not context managers on Python 2; closing()
    # guarantees the socket is released even if parsing raises.
    with closing(urllib2.urlopen(urllib2.Request(url))) as response:
        return BeautifulSoup(response, "html.parser")


def sale_list_find():
    """Synchronise stored promotion book lists with the latest promotions.

    Compares the module-level ``sale_list_link_old`` and
    ``sale_list_link_new`` collections (keyed by promotion ID):

    * promotions present only in the old collection have their
      ``Promotion``, ``PromotionBookList`` and ``BookPriceList`` rows deleted;
    * promotions present only in the new collection have their list page
      scraped; every book found yields one ``PromotionBookList`` row plus one
      ``BookPriceList`` row per usable seller price on the book detail page.

    On success ``sale_list_link_old`` takes the value of
    ``sale_list_link_new`` and ``sale_list_link_new`` is reset to ``{}``.
    If a ``urllib2.URLError`` is raised, its reason is printed and the two
    globals are left unchanged; rows saved before the failure stay saved.
    """
    global sale_list_link_old
    global sale_list_link_new
    home = "http://www.queshu.com"
    # bs4 text nodes subclass the text type (unicode on Python 2), while
    # tags do not.
    text_type = type(u"")
    try:
        old_ids = set(sale_list_link_old)
        new_ids = set(sale_list_link_new)

        # Delete stale records from the promotion list and promotion books.
        for item in old_ids - new_ids:
            # FIXME: switch to a foreign-key constraint.
            Promotion.objects.filter(promotionID=item).delete()
            PromotionBookList.objects.filter(promotionID=item).delete()
            # NOTE(review): BookPriceList rows are created below without a
            # promotionID value -- confirm the model has that field.
            BookPriceList.objects.filter(promotionID=item).delete()

        # Add the books of newly appeared promotions.
        for item in new_ids - old_ids:
            promotionID = item
            sale_list_link = sale_list_link_new[item]  # promotion book list page
            list_soup = _fetch_soup(sale_list_link)
            for jianlou_book in list_soup.find_all(id="jianlou_book"):
                img120 = jianlou_book.find(class_="img120")
                bookImageLink = img120["src"]  # book image link
                bookName = img120["alt"]  # book name
                book_right_line = jianlou_book.find_all(class_="book_right_line")
                # book detail page link
                bookDetailLink = home + book_right_line[0].contents[0]["href"]
                # book price
                bookPrice = book_right_line[3].find(class_="xianjia").string

                detail_soup = _fetch_soup(bookDetailLink)
                book_info = detail_soup.find(class_="book_info")
                bookISBN = book_info.find(text=re.compile(r"\d{13}"))  # book ISBN

                # Per-seller price list for this book.
                for price_item in detail_soup.find_all(class_="price_item"):
                    book_price = price_item.find(class_="book_price_price")
                    if not book_price:  # may be empty
                        continue
                    book_site = price_item.find(class_="book_site")
                    bookLink = home + book_site.contents[0]["href"]
                    bookSaler = home + book_site.contents[0].contents[0]["src"]
                    bookCurrentPrice = book_price.contents[0]
                    # Irregular data: the first child must be a text node,
                    # not a tag, for the price to be usable.
                    if not isinstance(bookCurrentPrice, text_type):
                        continue
                    bookCurrentPrice = bookCurrentPrice + book_price.contents[1].string
                    BookPriceList(
                        bookISBN=bookISBN,
                        bookSaler=bookSaler,
                        bookCurrentPrice=bookCurrentPrice,
                        bookLink=bookLink,
                    ).save()

                PromotionBookList(
                    promotionID=promotionID,
                    promotionBookISBN=bookISBN,
                    promotionBookName=bookName,
                    promotionBookImageLink=bookImageLink,
                    promotionBookPrice=bookPrice,
                ).save()

        # Only reached when every page was fetched without a URLError.
        sale_list_link_old = sale_list_link_new
        sale_list_link_new = {}
    except urllib2.URLError as e:
        print(e.reason)