def parse_intr(self, tree, xpath):
    # Parse the product introduce list: "title:value" items become key/value
    # pairs; a bare "title:" is remembered and paired with the next text node.
    dom = tree.xpath(xpath)
    introduce = {}
    temporary = ''
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        elif item.find(':') > 0:
            item = item.split(':', 1)
            if item[1] == '':
                temporary = extract_title(item[0])
            else:
                introduce[extract_title(item[0])] = extract_text(item[1])
        else:
            if temporary != '':
                introduce[temporary] = extract_text(item)
                temporary = ''
            else:
                continue
    return introduce
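# extract_title and extract_text are helpers defined elsewhere in this repo; the
# parsers above only assume they normalize a label and a value string. A minimal
# sketch of that assumed contract (hypothetical stand-ins, not the real helpers):
def _extract_title_sketch(raw):
    # assumption: trim whitespace and any trailing ASCII or full-width colon
    return raw.strip().rstrip(u':').rstrip(u'\uff1a')

def _extract_text_sketch(raw):
    # assumption: return the value with surrounding whitespace removed
    return raw.strip()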
def crawl(self):
    # wareId = '1229271'
    # wareId = '1391817787'
    # priorcategory = ["家居家装","清洁用品","衣物清洁"]
    # presentcategory = ['1','2','3']
    # ids = uuid.uuid1()
    wareId = self.key
    ids = self.data.get('uuid')
    category_data = extract_category(self)

    # Fetch the specifications (guige) page and parse the Ptable, whose text
    # cells alternate between a title and its value.
    url = 'http://m.360buy.com/product/guige/%s.html' % str(wareId)
    html_stream = ProcessData.get_web_data(url)
    if html_stream == {}:
        return
    tree = etree.HTML(html_stream.text)
    xpath = "//table[@class='Ptable']/tr/td/text()"
    dom = tree.xpath(xpath)
    specifications = {}
    temporary = ''
    i = 0
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        if i % 2 == 0:
            temporary = extract_title(item)
        else:
            specifications[temporary] = extract_text(item)
        i += 1
    # data = {'ecnorms': specifications}
    # specifications = json.dumps(specifications, ensure_ascii=False)

    introduce = IntroduceCrawler.crawl(wareId, ids)
    ecbrands = introduce[u'品牌'] if introduce.get(u'品牌') else ''
    # ecnames = introduce[u'商品名称'].replace('\'',' ') if introduce.get(u'商品名称') else ''
    ecnames = introduce[u'商品名称'] if introduce.get(u'商品名称') else ''

    crawl_data = {
        'id': ids,
        'source': self.data.get('source'),
        'source_id': wareId,
        'summary': specifications,
        'introduce': introduce,
        'name': ecnames,
        'brand': ecbrands
    }
    crawl_data.update(category_data)
    model = EcDetailModel(crawl_data)
    export(model)
def crawl(wareId, ids):
    import sys
    reload(sys)
    sys.setdefaultencoding("utf-8")

    # Fetch the item page and parse the detail list into a dict of
    # title -> text, pairing bare "title:" entries with the following node.
    url = 'http://item.jd.com/%s.html' % str(wareId)
    html_stream = ProcessData.get_web_data(url)
    if html_stream == {}:
        return {}
    html_stream.encoding = 'gb2312'
    tree = etree.HTML(html_stream.text)
    xpath = "//div[@id='product-detail-1']/ul[@class='detail-list']/li//text()"
    dom = tree.xpath(xpath)
    introduce = {}
    temporary = ''
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        elif item.find(':') > 0:
            item = item.split(':', 1)
            if item[1] == '':
                temporary = extract_title(item[0])
            else:
                introduce[extract_title(item[0])] = extract_text(item[1])
        else:
            if temporary != '':
                introduce[temporary] = extract_text(item)
                temporary = ''
            else:
                continue
    return introduce
def parse_summary(self, tree, xpath):
    # Parse alternating title/value text nodes into a specifications dict.
    dom = tree.xpath(xpath)
    specifications = {}
    temporary = ''
    i = 0
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        if i % 2 == 0:
            temporary = extract_title(item)
        else:
            specifications[temporary] = extract_text(item)
        i += 1
    return specifications
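# A minimal, self-contained illustration of the alternating title/value parsing
# used above, assuming an lxml tree built from a JD-style Ptable fragment. The
# HTML below is made up for the example; the real pages are fetched in crawl().
if __name__ == '__main__':
    from lxml import etree
    _html = ("<table class='Ptable'><tr>"
             "<td>Brand</td><td>Acme</td>"
             "<td>Model</td><td>X-100</td>"
             "</tr></table>")
    _tree = etree.HTML(_html)
    _cells = _tree.xpath("//table[@class='Ptable']/tr/td/text()")
    _specs = {}
    _title = ''
    for _n, _cell in enumerate(c.strip() for c in _cells if c.strip()):
        if _n % 2 == 0:
            _title = _cell
        else:
            _specs[_title] = _cell
    print _specs  # each title cell is paired with the following value cell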
class ThirdCrawler(Crawler):

    type = "ecommerce.jd.thirdlvl"

    def crawl(self):
        fid = self.key
        categorys = self.data['priorcategory']
        # fid = '1625'
        # categorys = ["家居家装","清洁用品"]
        start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
        third_urls = {
            'catelogyId': str(fid),
            'isDescription': 'false',
            'isIcon': 'false',
            'level': '2'
        }
        url = start_urls + quote(str(third_urls))
        try:
            jsons = ProcessData.get_json_data(url)
            lists = jsons['catelogyList']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            return
        if lists == []:
            return {}
        # Schedule a list crawl for each third-level category.
        for i in range(len(lists)):
            cid = lists[i]['cid']
            # presentcategory = []
            priorcategory = []
            priorcategory.extend(categorys)
            priorcategory.append(extract_title(lists[i]['name']))
            data = {
                'priorcategory': priorcategory,
                # 'presentcategory': presentcategory
            }
            Scheduler.schedule(ListCrawler.type, key=cid, data=data)
class FirstCrawler(Crawler):

    type = "ecommerce.jd.firstlvl"

    @staticmethod
    def init(conf=None):
        pass
        # Scheduler.schedule(FirstCrawler.type, interval=86400)

    def crawl(self):
        # Fetch the top-level category list and schedule a second-level
        # crawl for each entry.
        start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
        second_urls = {
            'catelogyId': '0',
            'isDescription': 'true',
            'isIcon': 'true',
            'level': '0'
        }
        url = start_urls + quote(str(second_urls))
        try:
            jsons = ProcessData.get_json_data(url)
            lists = jsons['catelogyList']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            return
        for i in range(len(lists)):
            cid = lists[i]['cid']
            priorcategory = []
            # presentcategory = []
            priorcategory.append(extract_title(lists[i]['name']))
            data = {
                'priorcategory': priorcategory,
                # 'presentcategory': presentcategory
            }
            Scheduler.schedule(SecondCrawler.type, key=cid, data=data)
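# The gateway URL above is assembled by percent-encoding the repr of a Python dict
# into the body query parameter. A minimal sketch of that assembly, assuming
# urllib.quote (Python 2) as used by the crawlers in this module:
if __name__ == '__main__':
    from urllib import quote
    body = {
        'catelogyId': '0',
        'isDescription': 'true',
        'isIcon': 'true',
        'level': '0'
    }
    url = ("http://gw.m.360buy.com/client.action?functionId=catelogy&body="
           + quote(str(body)))
    print url  # the body value is the percent-encoded dict repr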
def crawl(self):
    # fid = '1620'
    # categorys = ["家居家装"]
    fid = self.key
    categorys = self.data['priorcategory']
    start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
    second_urls = {
        'catelogyId': str(fid),
        'isDescription': 'true',
        'isIcon': 'true',
        'level': '1'
    }
    url = start_urls + quote(str(second_urls))
    # print 'url ', url
    try:
        jsons = ProcessData.get_json_data(url)
        lists = jsons['catelogyList']
    except Exception, e:
        self.logger.error(url)
        self.logger.error(e)
        return
    if lists == []:
        return {}
    # Schedule a third-level crawl for each second-level category.
    for i in range(len(lists)):
        cid = lists[i]['cid']
        # presentcategory = []
        priorcategory = []
        priorcategory.extend(categorys)
        priorcategory.append(extract_title(lists[i]['name']))
        data = {
            'priorcategory': priorcategory,
            # 'presentcategory': presentcategory
        }
        Scheduler.schedule(ThirdCrawler.type, key=cid, data=data)