def handle(self, datas):
    """Extract one comment record from a single review <div class='item'> node."""
    data = datas.xpath("div[@class='item']")
    address = self.mackining('address', data)
    name = self.mackining('name', data)
    url = self.mackining('url', data)
    score = self.mackining('score', data)
    # The raw score text embeds a single 0-5 digit; fall back to '' if absent.
    SCORES = re.search(r'\s*([0-5])\s*', score)
    score = int(SCORES.group(1)) if SCORES else ''
    title = self.mackining('title', data)
    comment = self.mackining('comment', data)
    commentid = self.mackining('commentid', data)
    buytime = self.mackining('buytime', data)
    useful = int(self.mackining('useful', data))
    reply = int(self.mackining('reply', data))
    buytime = ProcessData.str_datetime(buytime)
    commenttime = self.mackining('commenttime', data)
    commenttime = ProcessData.str_datetime(commenttime)
    return {
        'address': address,
        'name': name,
        'url': url,
        'score': score,
        'title': title,
        'comment': comment,
        'commentid': commentid,
        'buytime': buytime,
        'commenttime': commenttime,
        'province': address,
        'city': '',
        'useful': useful,
        'reply': reply
    }
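# A minimal standalone sketch of the score extraction used in handle() above.
# It assumes the 'score' field returned by mackining() is free text containing
# a single 0-5 digit (the exact text format, and mackining() itself, are
# assumptions, not confirmed here).
import re

def parse_score(text):
    # Return the 0-5 rating embedded in `text`, or '' when no digit is found,
    # mirroring the fallback behaviour of handle().
    match = re.search(r'\s*([0-5])\s*', text)
    return int(match.group(1)) if match else ''

# parse_score(u' 4 ')  -> 4
# parse_score(u'')     -> ''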
def crawl(self):
    fid = self.key
    category_data = extract_category(self)
    count = 3    # provisional page count; recomputed from wareCount on the first page
    pages = 1    # start from page 1
    while pages <= count:
        url = self.get_url(fid, pages)
        try:
            jsons = ProcessData.get_json_data(url)
            if pages == 1:
                # Each list page holds at most 100 items; use float division so
                # math.ceil rounds up correctly under Python 2.
                count = math.ceil(int(jsons['wareCount']) / 100.0)
            lists = jsons['wareInfo']
        except Exception as e:
            self.logger.error(url)
            self.logger.error(e)
            print 'error ', url
            return
        if lists == []:
            return {}
        for i in range(len(lists)):
            wareId = lists[i]['wareId']
            try:
                # 'good' looks like '95%'; convert it to a 0-1 overall product score.
                f = lambda x: int(x[:-1]) / 100.00
                ecsumscores = float(f(lists[i]['good']))
            except:
                ecsumscores = 0
            crawl_data = {
                'source_id': wareId,
                'source': self.data.get('source'),
                'summary': {},
                'title': lists[i]['wname'],
                'adword': lists[i]['adword'],
                'price': float(lists[i]['jdPrice']),
                'original_price': float(lists[i]['martPrice']),
                'score': ecsumscores
            }
            crawl_data.update(category_data)
            data = {
                'priorcategory': self.data['priorcategory'],
                'presentcategory': self.data['priorcategory']
            }
            model = EcBasicModel(crawl_data)
            export(model)
            data["uuid"] = model["id"]
            Scheduler.schedule(DetailCrawler.type, key=wareId, data=data)
            Scheduler.schedule(CommentCrawler.type, key=wareId, data=data)
        pages += 1
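# A minimal sketch of the page-count arithmetic used in crawl() above: the list
# API reports the total item count in 'wareCount' and the code assumes at most
# 100 items per page.
import math

def page_count(ware_count, page_size=100):
    # Round up so a partially filled last page is still crawled.
    return int(math.ceil(float(ware_count) / page_size))

# page_count(230) -> 3
# page_count(100) -> 1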
def crawl(self):
    wareId = str(self.key)
    url = "http://item.yhd.com/item/%s" % wareId
    html_stream = ProcessData.get_web_data(url)
    tree = etree.HTML(html_stream.text)
    self.crawler_data(tree)
def get_data(self, CatID, pages):
    url = 'http://www.ows.newegg.com.cn/cat/%s' % (str(CatID))
    list_urls = {
        'page': str(pages),
        'pagesize': 20,
        'sort': 10
    }
    return ProcessData.get_json_data(url, parameter=list_urls)
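# A rough sketch of the request get_data() above delegates to ProcessData.
# It assumes ProcessData.get_json_data(url, parameter=...) issues a plain HTTP
# GET with `parameter` as the query string and decodes the JSON body; that
# behaviour, and the sample CatID below, are assumptions for illustration only.
import requests

def fetch_newegg_page(cat_id, page, page_size=20, sort=10):
    url = 'http://www.ows.newegg.com.cn/cat/%s' % cat_id
    params = {'page': str(page), 'pagesize': page_size, 'sort': sort}
    resp = requests.get(url, params=params, timeout=10)
    return resp.json()

# fetch_newegg_page('735', 1)  # hypothetical category id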
def crawl(self):
    key = str(self.key)
    count = 2    # number of list pages to request
    for pages in xrange(1, count):    # crawl pages 1 .. count-1
        url = self.get_url(key, pages)
        html_stream = ProcessData.get_web_data(url)
        tree = etree.HTML(html_stream.text)
        self.crawler_data(tree)
def crawl(self):
    wareId = self.key
    ecid = self.data['uuid']
    category_data = extract_category(self)
    pages = 1
    count = True
    while count:
        number = 0    # counts comments on this page that were already saved (dedup)
        url = self.get_url(wareId, pages)
        html_stream = ProcessData.get_web_data(url)
        try:
            tree = etree.HTML(html_stream.text)
        except:
            print 'error: ', url
            break
        xpath = "//div[@id='comments-list']/div[@class='mc']"
        dom = tree.xpath(xpath)
        if dom == []:
            count = False
            continue
        for item in dom:
            datas = self.handle(item)
            comment_data = {
                'ecid': ecid,                        # commodity table foreign key
                'source_id': wareId,
                'source': self.data.get('source'),
                'comment_id': datas['commentid'],    # review id
                'score': datas['score'],             # commodity score
                'pubtime': datas['commenttime'],
                'buytime': datas['buytime'],
                'user_id': datas['url'],
                'useful': datas['useful'],
                'reply': datas['reply'],
                'content': datas['comment'],
                'province': datas['province']
            }
            comment_data.update(category_data)
            model = EcCommentModel(comment_data)
            is_saved = export(model)
            if not is_saved:
                number += 1
            if number > 10:
                # Too many already-stored comments on this page: assume the
                # remaining pages were crawled before and stop paging.
                count = False
                break
        pages += 1
def crawl(self): url = "http://interface.m.yhd.com/ \ mcategory/servlet/CentralMobileFacadeJsonServlet/ \ getNavCategoryWithKeywordByRootCategoryId? \ rootCategoryId=0&categoryNavId=0&provinceId=1" try: jsons = ProcessData.get_json_data(url.replace(' ','')) data = jsons['data'] except Exception,e: self.logger.error(url) self.logger.error(e) print 'error ',url
def crawl(self):
    wareId = self.key
    ids = self.data.get('uuid')
    category_data = extract_category(self)
    url = 'http://m.360buy.com/product/guige/%s.html' % (str(wareId))
    html_stream = ProcessData.get_web_data(url)
    tree = etree.HTML(html_stream.text)
    xpath = "//table[@class='Ptable']/tr/td/text()"
    dom = tree.xpath(xpath)
    # The spec table alternates label cells and value cells; pair them up.
    specifications = {}
    temporary = ''
    i = 0
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        if i % 2 == 0:
            temporary = extract_title(item)
        else:
            specifications[temporary] = extract_text(item)
        i += 1
    data = {
        'ecnorms': specifications
    }
    introduce = IntroduceCrawler.crawl(wareId, ids)
    ecbrands = introduce[u'品牌'] if introduce.get(u'品牌') else ''
    ecnames = introduce[u'商品名称'] if introduce.get(u'商品名称') else ''
    crawl_data = {
        'id': ids,
        'source': self.data.get('source'),
        'source_id': wareId,
        'summary': specifications,
        'introduce': introduce,
        'name': ecnames,
        'brand': ecbrands
    }
    crawl_data.update(category_data)
    model = EcDetailModel(crawl_data)
    export(model)
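# A minimal standalone sketch of the label/value pairing done in crawl() above:
# the Ptable cells arrive as a flat list of text nodes in which labels and
# values alternate. The sample cell texts below are made up for illustration.
def pair_spec_cells(cells):
    # Drop empty/whitespace-only nodes, then zip even-indexed labels with
    # odd-indexed values.
    cleaned = [c.strip() for c in cells if c.strip()]
    return dict(zip(cleaned[0::2], cleaned[1::2]))

# pair_spec_cells([u'品牌', u'XX', u'净含量', u'500g'])
#   -> {u'品牌': u'XX', u'净含量': u'500g'}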
def crawl(self):
    url = 'http://www.ows.newegg.com.cn/category.egg'
    try:
        jsons = ProcessData.get_json_data(url)
    except:
        print 'error ', url
        return
    # Walk the three-level category tree and hand each leaf category to handle().
    for item1 in jsons:
        CatName1 = item1['CatName']
        for item2 in item1['SubCategories']:
            CatName2 = item2['CatName']
            for item3 in item2['SubCategories']:
                priorcategory = [CatName1, CatName2, item3['CatName']]
                self.handle(item3['CatID'], priorcategory)
def crawl(self): start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body=" sencond_urls = { 'catelogyId': '0', 'isDescription': 'true', 'isIcon': 'true', 'level': '0' } url = start_urls + quote(str(sencond_urls)) try: jsons = ProcessData.get_json_data(url) lists = jsons['catelogyList'] except Exception,e: self.logger.error(url) self.logger.error(e) print 'error ',url return
def crawl(self):
    fid = self.key
    categorys = self.data['priorcategory']
    start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
    third_urls = {
        'catelogyId': str(fid),
        'isDescription': 'false',
        'isIcon': 'false',
        'level': '2'
    }
    url = start_urls + quote(str(third_urls))
    try:
        jsons = ProcessData.get_json_data(url)
        lists = jsons['catelogyList']
    except Exception as e:
        self.logger.error(url)
        self.logger.error(e)
        print 'error ', url
        return
def crawl(self):
    fid = self.key
    categorys = self.data['priorcategory']
    start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
    second_urls = {
        'catelogyId': str(fid),
        'isDescription': 'true',
        'isIcon': 'true',
        'level': '1'
    }
    url = start_urls + quote(str(second_urls))
    try:
        jsons = ProcessData.get_json_data(url)
        lists = jsons['catelogyList']
    except:
        print 'error ', url
        return
    if lists == []:
        return {}
    for i in range(len(lists)):
        cid = lists[i]['cid']
        priorcategory = []
        priorcategory.extend(categorys)
        priorcategory.append(extract_title(lists[i]['name']))
        data = {
            'priorcategory': priorcategory
        }
        Scheduler.schedule(ThirdCrawler.type, key=cid, data=data)
def crawl(wareId, ids):
    import sys
    reload(sys)
    sys.setdefaultencoding("utf-8")
    url = 'http://item.jd.com/%s.html' % (str(wareId))
    html_stream = ProcessData.get_web_data(url)
    if html_stream == {}:
        return {}
    html_stream.encoding = 'gb2312'
    tree = etree.HTML(html_stream.text)
    xpath = "//div[@id='product-detail-1']/ul[@class='detail-list']/li//text()"
    dom = tree.xpath(xpath)
    # The detail list yields text nodes shaped like 'label:value'; sometimes the
    # value arrives as a separate node right after a bare 'label:' node.
    introduce = {}
    temporary = ''
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        elif item.find(':') > 0:
            item = item.split(':', 1)
            if item[1] == '':
                # Bare label: remember it and wait for the value node.
                temporary = extract_title(item[0])
            else:
                introduce[extract_title(item[0])] = extract_text(item[1])
        else:
            if temporary != '':
                introduce[temporary] = extract_text(item)
                temporary = ''
            else:
                continue
    return introduce
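# A minimal standalone sketch of the 'label:value' parsing in crawl() above,
# using plain strings instead of lxml text nodes. The sample items are made up,
# and extract_title / extract_text are replaced by simple strips because their
# real behaviour is an assumption here.
def parse_detail_items(items):
    parsed = {}
    pending_label = ''
    for text in items:
        text = text.strip()
        if not text:
            continue
        if text.find(':') > 0:
            label, _, value = text.partition(':')
            if value:
                parsed[label.strip()] = value.strip()
            else:
                pending_label = label.strip()   # value should follow in the next item
        elif pending_label:
            parsed[pending_label] = text
            pending_label = ''
    return parsed

# parse_detail_items([u'品牌:XX', u'商品名称:', u'XX洗衣液'])
#   -> {u'品牌': u'XX', u'商品名称': u'XX洗衣液'}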
def get_response(self, key):
    url = self.get_url(key)
    response = ProcessData.get_web_data(url)
    return response
def get_data(self, CatID, pages):
    url = 'http://www.ows.newegg.com.cn/cat/%s' % (str(CatID))
    list_urls = {'page': str(pages), 'pagesize': 20, 'sort': 10}
    return ProcessData.get_json_data(url, parameter=list_urls)