def parseCategory(self):
    """Scrape the dealmoon front page menu and store new categories in MongoDB.

    Categories are numbered sequentially from 1 in page order: first the
    visible ``more_arrow`` links, then the entries of the drop-down menu.
    Each one is stored as ``{"item_id", "item_name", "href"}`` in the
    ``dealmoon_category`` collection.  Scraping stops as soon as an id that
    is already stored is reached, or when the drop-down would yield an id
    above 17.

    The page is fetched exactly once.  The previous ``while 1`` loop
    re-downloaded the page only to run into the "already stored" branch,
    and never terminated when the menu contained no links at all.

    Returns:
        None.  Results are written to MongoDB and progress is printed.
    """
    print("parseCategory")
    item_collection_name = "dealmoon_category"
    category_collection = first_mongodb[self.database][item_collection_name]
    url = "http://cn.dealmoon.com/"
    # Drop-down entries that would receive an id above this are not wanted.
    max_category_id = 17

    selector = loadHtmlSelector(url, headers=None)
    if selector is None:
        return

    top_list = selector.find("ul", {"class": "top_list"})
    if top_list is None:
        # Menu markup is missing; nothing can be scraped from this page.
        print("exit")
        return
    visible_links = top_list.findAll("a", {"class": "more_arrow"})
    drop_list = top_list.find("div", {"class": "droplist wid_menu"})
    if drop_list is None:
        hidden_entries = []
    else:
        hidden_entries = drop_list.findAll("span", {"class": None})

    category_id = 1

    # Visible top-level categories.
    for link in visible_links:
        if category_collection.find({"item_id": category_id}).count() != 0:
            print("exit")
            return
        name = link.find("s").get_text().strip()
        href = link.attrs['href']
        print(category_id)
        print(name)
        print(href)
        category_collection.insert(
            {"item_id": category_id, "item_name": name, "href": href})
        category_id += 1

    # Categories hidden in the drop-down menu.
    for entry in hidden_entries:
        if category_collection.find({"item_id": category_id}).count() != 0:
            print("exit")
            return
        if category_id > max_category_id:
            return  # filter out the categories that are not needed
        anchor = entry.find("a", {"class": "wid105"})
        name = anchor.get_text().strip()
        href = anchor.attrs['href']
        print(category_id)
        print(name)
        print(href)
        category_collection.insert(
            {"item_id": category_id, "item_name": name, "href": href})
        category_id += 1

    print("exit")
def parseCategory(self):
    """Scrape the faxian.smzdm.com category links and store new ones in MongoDB.

    Only the second ``div.tab_info`` block of the page is used; every link
    in it becomes one category document ``{"item_id", "item_name", "href"}``
    in the ``smzdm_fx_category`` collection, numbered sequentially from 1.
    The position of every other ``tab_info`` block is printed and the block
    is skipped.  Scraping stops at the first id that is already stored.

    The page is fetched exactly once.  The previous ``while 1`` loop
    re-downloaded the page only to run into the "already stored" branch,
    and never terminated when the page had fewer than two ``tab_info``
    blocks or the second one contained no links.

    Returns:
        None.  Results are written to MongoDB and progress is printed.
    """
    print("parseCategory")
    item_collection_name = "smzdm_fx_category"
    category_collection = first_mongodb[self.database][item_collection_name]
    url = "http://faxian.smzdm.com/"

    selector = loadHtmlSelector(url, headers=None)
    if selector is None:
        return

    tab_blocks = selector.findAll("div", {"class": "tab_info"})
    category_id = 1
    for position, block in enumerate(tab_blocks, 1):
        if position != 2:
            print(position)
            continue
        for category in block.findAll("a"):
            if category_collection.find({"item_id": category_id}).count() != 0:
                print("exit")
                return
            name = category.get_text().strip()
            href = category.attrs['href']
            print(category_id)
            print(name)
            print(href)
            category_collection.insert(
                {"item_id": category_id, "item_name": name, "href": href})
            category_id += 1

    print("exit")
def parseCategory(self):
    """Scrape the mgpyh.com left-hand menu and store new categories in MongoDB.

    Every ``<li data-id=...>`` of the ``pull-left menu-left`` list becomes
    one document ``{"item_id", "item_name", "href"}`` in the
    ``mgpyh_category`` collection, numbered sequentially from 1.  The
    relative link is prefixed with the site root.  Scraping stops at the
    first id that is already stored.

    Compared with the previous version, a failed download
    (``loadHtmlSelector`` returning ``None``) or a page without the menu now
    ends the method quietly instead of raising ``AttributeError``, matching
    the other category scrapers.  The page is also fetched only once; the
    old ``while 1`` loop spun forever when the menu had no entries.

    Returns:
        None.  Results are written to MongoDB and progress is printed.
    """
    print("parseCategory")
    item_collection_name = "mgpyh_category"
    category_collection = first_mongodb[self.database][item_collection_name]
    site_root = "http://www.mgpyh.com"
    url = "http://www.mgpyh.com/"

    print(url)
    selector = loadHtmlSelector(url, headers=None)
    if selector is None:
        return

    menu = selector.find("ul", {"class": "pull-left menu-left"})
    if menu is None:
        print("exit")
        return

    entries = menu.findAll("li", {"data-id": True})
    for category_id, entry in enumerate(entries, 1):
        if category_collection.find({"item_id": category_id}).count() != 0:
            print("exit")
            return
        anchor = entry.find("a")
        name = anchor.get_text().strip()
        href = site_root + str(anchor.attrs['href'])
        print(category_id)
        print(name)
        print(href)
        category_collection.insert(
            {"item_id": category_id, "item_name": name, "href": href})

    print("exit")
def smzdm_yh(self, source):
    """Crawl one smzdm.com "youhui" category and store new deals in MongoDB.

    Listing pages are walked from ``source['href']`` via the "pagedown"
    link.  Every ``<div articleid=...>`` that passes the filters is stored
    in the ``smzdm_yh_item`` collection.  The crawl of this category ends
    when an article id that is already stored is met (the listing is
    assumed to be newest-first -- TODO confirm), when a page cannot be
    loaded, or when there is no next page.

    Skipped entries: expired deals (an ``icon`` span in the title), entries
    without an image or without a buy link, and buy links containing
    ``baoxian`` or ``baidu``.

    Args:
        source: category record providing ``'href'``, ``'item_name'`` and
            ``'item_id'``.

    Returns:
        None.  Results are written to MongoDB and progress is printed.

    Note:
        Apart from the pagination block, missing markup inside an entry
        still raises ``AttributeError`` as before.
    """
    print("smzdm_yh")
    item_collection_name = "smzdm_yh_item"
    item_collection = first_mongodb[self.database][item_collection_name]
    clr = Color()  # coloured output on the Windows console

    source_url = source['href']
    source_name = source['item_name']
    clr.print_red_text(source_name)
    clr.print_red_text(source_url)

    while True:
        print(source_url)
        selector = loadHtmlSelector(source_url, headers=None)
        if selector is None:
            return

        for div in selector.findAll("div", articleid=True):
            item = Item()
            item.categoryid = source['item_id']

            # Skip expired entries.
            if div.find("div", {"class": "listTitle"}).find("span", {"class": "icon"}):
                continue

            item.itemid = int(div.attrs['articleid'].split("_")[-1])
            print(item.itemid)

            # Already stored: this category is up to date, move on to the next.
            if item_collection.find({"itemid": item.itemid}).count() != 0:
                clr.print_red_text("%s update over " % source_name)
                return

            item.updatetime = int(div.attrs['timesort'])
            # Format the local time directly; the old asctime()/strptime()
            # round trip produced the same value but depended on the locale's
            # day and month names.
            article_time = datetime.datetime.fromtimestamp(
                item.updatetime).strftime("%Y-%m-%d %H:%M:%S %A")
            print(item.updatetime)
            print(article_time)

            title = div.find("h2", {"class": "itemName"})
            item.name = title.find("a").get_text().strip()
            # NOTE: a keyword blacklist on the item name used to be applied
            # here; it is currently disabled.

            # Product image (entries without one are skipped).
            image_tag = div.find("img", alt=True)
            if not image_tag:
                continue
            item.image = image_tag.attrs['src']
            print(item.image)

            # Price text.
            item.price = title.find("span", {"class": "red"}).get_text()

            # Buy link (entries without one are skipped).
            buy_box = div.find("div", {"class": "buy"})
            if not buy_box:
                continue
            item.href = buy_box.find("a", {"target": "_blank"}).attrs['href']
            if "baoxian" in item.href:
                continue  # skip insurance offers
            if "baidu" in item.href:
                continue
            print(item.href)

            # Shop the deal comes from.
            mall_link = div.find("div", {"class": "botPart"}).find("a", {"class": "mall"})
            if mall_link:
                originmall = mall_link.get_text()
                originmallurl = mall_link.attrs['href']
                print(originmall)
            else:
                originmall = ""
                originmallurl = ""

            counters = div.find("div", {"class": "lrBot"})

            # "Worth it" votes.
            goodcountnum = int(
                counters.find("a", {"class": "good"})
                .find("span", {"class": "scoreTotal"}).attrs['value'])
            print("goodcountnum is %d" % goodcountnum)

            # "Not worth it" votes.
            badcountnum = int(
                counters.find("a", {"class": "bad"})
                .find("span", {"class": "scoreTotal"}).attrs['value'])
            print("badcountnum is %d" % badcountnum)

            # Favourites.
            favcountnum = int(
                counters.find("a", {"title": "收藏"}).find("em").get_text())
            print("favcountnum is %d" % favcountnum)

            # Comments.
            commentcountnum = int(
                counters.find("a", {"class": "comment"}).get_text())
            print("commentcountnum is %d" % commentcountnum)

            # Article link.
            article_url = title.find("a").attrs['href']
            print(article_url)

            item_dict = item.createItemdic({
                "article_url": article_url,
                "article_time": article_time,
                "good_count": goodcountnum,
                "bad_count": badcountnum,
                "fav_count": favcountnum,
                "comment_count": commentcountnum,
                "originmall": originmall,
                "originmallurl": originmallurl,
            })
            print(item_dict)

            # Re-check right before inserting.
            item_num = item_collection.find({"itemid": item.itemid}).count()
            if item_num == 0:
                item_collection.insert(item_dict)
                print("insert successfully")
            else:
                print("item exits, num is %s" % item_num)

        # Next page.  The pagination list is absent on single-page listings,
        # which used to raise AttributeError here.
        pagination = selector.find("ul", {"class": "pagination"})
        next_page = None
        if pagination is not None:
            next_page = pagination.find("li", {"class": "pagedown"})
        if not next_page:
            print("exit")
            break
        source_url = next_page.find("a").attrs['href']
def dealmoon(self, source):
    """Crawl one cn.dealmoon.com category and store new deals in MongoDB.

    Listing pages are walked from ``source['href']`` via the "next_link"
    anchor.  For every ``div.mlist`` entry the detail page is downloaded as
    well, to read the shop name and the buy link.  Accepted entries are
    stored in the ``dealmoon_item`` collection.  The crawl of this category
    ends when an item id that is already stored is met, when a listing page
    cannot be loaded, or when there is no next page.

    Skipped entries: no date element, detail page answering 404, detail
    page that cannot be loaded, or no buy link on the detail page.

    Args:
        source: category record providing ``'href'``, ``'item_name'`` and
            ``'item_id'``.

    Returns:
        None.  Results are written to MongoDB and progress is printed.

    Note:
        Apart from the guards described above, missing markup inside an
        entry still raises ``AttributeError``, and a date text without
        digits still raises ``ValueError``, as before.
    """
    print("dealmoon")
    item_collection_name = "dealmoon_item"
    item_collection = first_mongodb[self.database][item_collection_name]
    clr = Color()  # coloured output on the Windows console

    site_root = "http://cn.dealmoon.com"
    source_url = source['href']
    source_name = source['item_name']
    clr.print_red_text(source_url)
    clr.print_red_text(source_name)

    # Markers of the relative time shown on the listing ("N minutes/hours ago").
    minutes_marker = "分钟".decode('utf-8')
    hours_marker = "小时".decode('utf-8')

    while True:
        clr.print_red_text(source_url)
        selector = loadHtmlSelector(source_url, headers=None)
        if selector is None:
            return

        for entry in selector.findAll("div", {"class": "mlist"}):
            item = Item()
            item.categoryid = source['item_id']

            # Item id.
            item.itemid = int(entry.attrs['data-id'])
            print(item.itemid)

            # Already stored: this category is up to date, move on to the next.
            if item_collection.find({"itemid": item.itemid}).count() != 0:
                return

            # The title lives in an <h2>, or in an <h1> when there is no <h2>.
            heading = entry.find("h2") or entry.find("h1")

            # Title.
            item.name = heading.find("span", {"class": None}).get_text().strip()

            # Product image.
            item.image = entry.find("div", {"class": "mpic"}).find(
                "img", {"alt": True}).attrs['src']
            print(item.image)

            # Publication time, given as "N minutes / hours / days ago".
            date_tag = entry.find("div", {"class": "date"})
            if not date_tag:
                continue
            age_text = date_tag.get_text().strip()
            # NOTE(review): a text without any digit makes int('') raise
            # ValueError here -- confirm whether such entries can occur.
            amount = int("".join(ch for ch in age_text if ch.isdigit()))
            if minutes_marker in age_text:
                age = datetime.timedelta(minutes=amount)
            elif hours_marker in age_text:
                age = datetime.timedelta(hours=amount)
            else:
                age = datetime.timedelta(days=amount)
            updatetime = datetime.datetime.now() - age
            item.updatetime = time.mktime(updatetime.timetuple())
            print(item.updatetime)

            # Price text.
            item.price = heading.find("a").find(
                "span", {"class": "notice_item"}).get_text().strip()

            # Detail page.
            articleurl = heading.find("a").attrs['href']
            # Probe the status first; some links are dead.  The response is
            # closed explicitly so the connection is not leaked.
            response = urllib.urlopen(articleurl)
            try:
                status = response.code
            finally:
                response.close()
            if status == 404:
                clr.print_red_text("return 404 error")
                continue
            articleselector = loadHtmlSelector(articleurl, headers=None)
            if articleselector is None:
                # Download failed; skip this entry instead of crashing.
                continue

            # Shop name and link.
            originmall = ''
            originmallurl = ''
            mall_line = articleselector.find("div", {"class": "gn_line"})
            if mall_line:
                mall_link = mall_line.find(
                    "a", {"style": "color:#003399;", "trk": None})
                if mall_link:
                    # Strip the fixed wording around the shop name.
                    originmall = (mall_link.get_text().strip()
                                  .replace("来自".decode('utf-8'), "")
                                  .replace("的折扣".decode('utf-8'), ""))
                    originmallurl = mall_link.attrs['href']
            print(originmall)

            # Buy link (entries without one are skipped).
            picture_box = articleselector.find("div", {"class": "mpic"})
            if not picture_box:
                continue
            item.href = site_root + str(
                picture_box.find("div", {"class": "buy"})
                .find("a", {"trk": True}).attrs['href'])
            print(item.href)

            info = entry.find("div", {"class": "minfo"})

            # Likes.
            like_button = info.find("span", {"class": "like_btn"})
            if like_button:
                like_text = like_button.find("em").get_text()
                goodcountnum = int(
                    "".join(ch for ch in like_text if ch.isdigit()))
            else:
                goodcountnum = 0
            print("goodcountnum is %d" % goodcountnum)

            # Comments.
            commentcountnum = int(entry.attrs['cmtcn'])
            print("commentcountnum is %d" % commentcountnum)

            # Favourites.
            favcountnum = int(
                info.find("span", {"class": "fav_btn"}).find("em").attrs['num'])
            print("favcountnum is %d" % favcountnum)

            # NOTE(review): "fav_conut" looks like a typo for "fav_count"
            # (the key the other scrapers use); it is kept unchanged because
            # renaming it would change the stored document schema.
            item_dict = item.createItemdic({
                "articleurl": articleurl,
                "good_count": goodcountnum,
                "comment_count": commentcountnum,
                "fav_conut": favcountnum,
                "originmall": originmall,
                "originmallurl": originmallurl,
            })
            print(item_dict)

            # Re-check right before inserting.
            num = item_collection.find({"itemid": item.itemid}).count()
            if num == 0:
                item_collection.insert(item_dict)
                clr.print_red_text("insert sucessfully")
            else:
                clr.print_red_text("item exits, num is %s" % num)

        # Next page.  The pager is absent on single-page listings, which used
        # to raise AttributeError here.
        pager = selector.find("div", {"class": "pages"})
        next_page = None
        if pager is not None:
            next_page = pager.find("a", {"class": "next_link"})
        if not next_page:
            clr.print_red_text("exit")
            break
        source_url = next_page.attrs['href']
def mgpyh(self, source):
    """Crawl one mgpyh.com category and store new deals in MongoDB.

    Listing pages are requested as ``source['href']``, then
    ``source['href'] + "?page=2"``, ``"?page=3"`` and so on.  Every
    ``div.content-item.clearfix`` entry is stored in the ``mgpyh_item``
    collection.  The crawl of this category ends when an item id that is
    already stored is met, when a page cannot be loaded, or when a page
    contains no entries at all.

    The empty-page stop is new: previously nothing ended the loop on a
    fresh database unless ``loadHtmlSelector`` returned ``None``, so page
    numbers kept being requested indefinitely.

    Args:
        source: category record providing ``'href'``, ``'item_name'`` and
            ``'item_id'``.

    Returns:
        None.  Results are written to MongoDB and progress is printed.

    Note:
        Missing markup inside an entry still raises ``AttributeError`` as
        before.
    """
    print("mgpyh")
    item_collection_name = "mgpyh_item"
    item_collection = first_mongodb[self.database][item_collection_name]

    site_root = "http://www.mgpyh.com"
    source_url = source['href']
    source_name = source['item_name']
    print(source_name)
    print(source_url)

    pagenum = 1
    while True:
        selector = loadHtmlSelector(source_url, headers=None)
        if selector is None:
            return

        entries = selector.findAll("div", {"class": "content-item clearfix"})
        if not entries:
            # Past the last page: stop instead of paging forever.
            print("exit")
            return

        for entry in entries:
            item = Item()
            item.categoryid = source['item_id']

            # Item id.
            item.itemid = int(
                entry.find("a", {"class": "favorite"}).attrs['data-id'])
            print(item.itemid)

            # Already stored: this category is up to date, move on to the next.
            if item_collection.find({"itemid": item.itemid}).count() != 0:
                return

            # Publication time.
            item.updatetime = int(entry.attrs['data-timestamp'])
            print(item.updatetime)

            # Title.
            title_link = entry.find("h3").find("a")
            item.name = title_link.get_text().strip()

            # Product image (kept as an empty string when missing).
            image_tag = entry.find("img", {"alt": True})
            if image_tag:
                item.image = image_tag.attrs['src']
                print(item.image)
            else:
                item.image = ""

            # Price: all number fragments joined, or '' when there are none.
            price_parts = entry.find("h3").findAll("em", {"class": "number"})
            item.price = "".join(part.get_text() for part in price_parts)

            # Buy link.
            item.href = site_root + str(
                entry.find("div", {"class": "item-right"})
                .find("a", {"class": "mp-btn-red"}).attrs['href'])
            print(item.href)

            icons = entry.find("ol", {"class": "item-icon"})

            # Count shown on the first link carrying a title attribute.
            firecountnum = int(
                icons.find("a", {"title": True})
                .find("span", {"class": "count"}).get_text())
            print("firecountnum is %d" % firecountnum)

            # Article link.
            article_href = site_root + str(title_link.attrs['href'])
            print(article_href)

            # Favourites.
            favcountnum = int(
                icons.find("a", {"class": "favorite"})
                .find("span", {"class": "count"}).get_text())
            print("favcountnum is %d" % favcountnum)

            # Comments.
            commentcountnum = int(
                icons.find("li").find("a", {"class": None, "data-id": None})
                .find("span", {"class": "count"}).get_text())
            print("commentcountnum is %d" % commentcountnum)

            item_dict = item.createItemdic({
                "article_href": article_href,
                "fire_count": firecountnum,
                "fav_count": favcountnum,
                "comment_count": commentcountnum,
            })
            print(item_dict)

            # Re-check right before inserting.
            num = item_collection.find({"itemid": item.itemid}).count()
            if num == 0:
                item_collection.insert(item_dict)
                print("insert sucessfully")
            else:
                print("item exits, num is %s" % num)
                return  # category is up to date, move on to the next

        pagenum += 1
        source_url = source['href'] + "?page=" + str(pagenum)
def smzdm_fx(self, source):
    """Crawl one faxian.smzdm.com category and store new deals in MongoDB.

    Listing pages are walked from ``source['href']`` via the "pagedown"
    link.  For every ``li.list`` entry that passes the filters the article
    page is downloaded as well, to read the shop, the deal description
    texts and the vote / favourite counters.  Accepted entries are stored
    in the ``smzdm_fx_item`` collection.  The crawl of this category ends
    when an article id that is already stored is met, when a listing page
    cannot be loaded, or when there is no next page (only then is
    "all done" printed).

    Skipped entries: names containing one of the blacklisted keywords,
    entries without an image, prices containing a blacklisted keyword,
    non-empty prices without any digit, and articles whose page cannot be
    loaded.

    Args:
        source: category record providing ``'href'``, ``'item_name'`` and
            ``'item_id'``.

    Returns:
        None.  Results are written to MongoDB and progress is printed.

    Note:
        Apart from the guards described above, missing markup still raises
        ``AttributeError`` as before.
    """
    print("smzdm_fx")
    item_collection_name = "smzdm_fx_item"
    item_collection = first_mongodb[self.database][item_collection_name]
    clr = Color()  # coloured output on the Windows console

    source_url = source['href']
    source_name = source['item_name']
    clr.print_red_text(source_url)
    clr.print_red_text(source_name)

    # Keywords marking entries that are not real products.
    name_blacklist = [word.decode('utf-8') for word in ("优惠券", "活动", "专享")]
    price_blacklist = [word.decode('utf-8') for word in ("促销", "红包")]

    def clean_text(node):
        # Text of *node* without tabs and line breaks, stripped.
        return (node.get_text().replace("\t", "").replace("\n", "")
                .replace("\r", "").strip())

    while True:
        print(source_url)
        selector = loadHtmlSelector(source_url, headers=None)
        if selector is None:
            return

        for entry in selector.findAll("li", {"class": "list"}):
            item = Item()
            item.categoryid = source['item_id']  # category id

            # Item id.
            item.itemid = int(entry.attrs['articleid'].split("_")[-1])
            print(item.itemid)

            # Already stored: this category is up to date, move on to the next.
            if item_collection.find({"itemid": item.itemid}).count() != 0:
                return

            # Publication time.  Formatting the local time directly gives the
            # same value as the old asctime()/strptime() round trip without
            # depending on the locale's day and month names.
            item.updatetime = int(entry.attrs['timesort'])
            article_time = datetime.datetime.fromtimestamp(
                item.updatetime).strftime("%Y-%m-%d %H:%M:%S %A")
            print(item.updatetime)
            print(article_time)

            title = entry.find("h2", {"class": "itemName"})

            # Item name; skip entries that are not products.
            item.name = title.find("span", {"class": "black"}).get_text().strip()
            if any(word in item.name for word in name_blacklist):
                continue

            # Product image (entries without one are skipped).
            image_tag = entry.find("img", alt=True)
            if not image_tag:
                continue
            item.image = image_tag.attrs['src']
            print(item.image)

            # Price text; skip promotions and prices without any digit.
            item.price = title.find("span", {"class": "red"}).get_text()
            if any(word in item.price for word in price_blacklist):
                continue
            if item.price != '' and not re.search(r'\d', item.price):
                continue

            # Buy link.
            item.href = entry.find("div", {"class": "item_buy_mall"}).find(
                "a", {"class": "directLink"}).attrs['href']
            clr.print_blue_text(item.href)

            counters = entry.find("div", {"class": "zan_fav_com"})

            # Likes.
            goodcountnum = int(
                counters.find("a", {"class": "zan"}).find("em").get_text())
            print("goodcountnum is %d" % goodcountnum)

            # Comments.
            commentcountnum = int(
                counters.find("a", {"class": "comment"}).get_text())
            print("commentcountnum is %d" % commentcountnum)

            # Article page.
            article_url = title.find("a").attrs['href']
            clr.print_blue_text(article_url)
            article_selector = loadHtmlSelector(article_url, headers=None)
            if article_selector is None:
                # Download failed; skip this entry instead of crashing.
                continue

            # Shop the deal comes from.
            mall_link = article_selector.find(
                "div", {"class": "article-meta-box"}).find("a", {"onclick": None})
            originmall = mall_link.get_text() if mall_link else ""

            # Discount summary and original tip text.
            preferential = article_selector.find(
                "div", {"class": "item-box item-preferential"})
            if preferential:
                inner = preferential.find("div", {"class": "inner-block"})
                youhui_content = clean_text(inner.find("p")) if inner else ""
                tip_block = preferential.find("div", {"class": "baoliao-block"})
                baoliao_content = clean_text(tip_block.find("p")) if tip_block else ""
            else:
                # Layout without the dedicated box: take the whole inner block.
                youhui_content = clean_text(
                    article_selector.find("article", {"class": "article-details"})
                    .find("div", {"class": "inner-block"}))
                baoliao_content = ""

            # Product introduction: first paragraph of the second item-box.
            item_description = ""
            item_boxes = article_selector.findAll("div", {"class": "item-box"})
            if len(item_boxes) >= 2:
                inner = item_boxes[1].find("div", {"class": "inner-block"})
                paragraph = inner.find("p") if inner else None
                if paragraph:
                    item_description = clean_text(paragraph)

            # "Not worth it" votes.
            badcountnum = int(
                article_selector.find("div", {"class": "score_rate"})
                .find("span", {"id": "rating_unworthy_num"}).get_text().strip())
            print("badcountnum is %d" % badcountnum)

            # Favourites.
            favcountnum = int(
                article_selector.find("div", {"class": "operate_box"})
                .find("div", {"class": "operate_icon"})
                .find("a", {"class": "fav"}).find("em").get_text())
            print(favcountnum)

            item_dict = item.createItemdic({
                "originmall": originmall,
                "baoliao_content": baoliao_content,
                "youhui_content": youhui_content,
                "item_description": item_description,
                "bad_count": badcountnum,
                "fav_count": favcountnum,
                "article_url": article_url,
                "article_time": article_time,
                "good_count": goodcountnum,
                "comment_count": commentcountnum,
            })
            print(item_dict)

            # Re-check right before inserting.
            num = item_collection.find({"itemid": item.itemid}).count()
            if num == 0:
                item_collection.insert(item_dict)
                clr.print_red_text("insert successfully")
            else:
                clr.print_yellow_text(("item exits, num is %s" % num))

        # Next page.  The pagination list is absent on single-page listings,
        # which used to raise AttributeError here.
        pagination = selector.find("ul", {"class": "pagination"})
        next_page = None
        if pagination is not None:
            next_page = pagination.find("li", {"class": "pagedown"})
        if not next_page:
            print("exit")
            break
        source_url = next_page.find("a").attrs['href']

    clr.print_red_text("all done")