def parse_product(self, response):
    """Populate a 'product' item from a product page response.

    When parsing a product, do not add the main image again to the
    multi-image field.

    The visible analysis section stores the page URL under ``purl`` and the
    same hard-coded placeholder image URL under ``photo_src`` and
    ``image_src``. Progress banners are printed only when the module-level
    ``conf_show_messages`` flag is truthy.

    NOTE(review): nothing is returned in the code visible here, and only
    ``EOFError`` is handled; confirm both are intended.
    """
    jeen = Jeen()
    if conf_show_messages:
        print('\r\n\t======== Page Crawl Start - Product -----------')
    # Selector over the page; not used by the visible analysis code below.
    hxs = HtmlXPathSelector(response)
    item = jeen.init_item('product')  # initialise the item
    try:
        if conf_show_messages:
            print('----Product Fetch Start----')
        # -- analysis code begins ------------------------------------------
        placeholder_image = 'http://img1.toocle.com/bin/img/?x=160&y=160&t=product_cn&m=1&s=/2012/11/11/08/26527608_1.jpg'
        item['purl'] = response.url
        item['photo_src'] = placeholder_image
        item['image_src'] = placeholder_image
        # -- analysis code ends --------------------------------------------
        if conf_show_messages:
            print('---- Fetch Success ----')
    except EOFError as err:
        if conf_show_messages:
            print('----Product Fetch Error Start----')
        # The error itself is printed regardless of the verbosity flag.
        print(err)
        if conf_show_messages:
            print('----Product Fetch Error End----')
def parse_product(self, response):
    """Populate a 'product' item from a product page response.

    When parsing a product, do not add the main image again to the
    multi-image field.

    The visible analysis section stores the page URL under ``purl`` and the
    same hard-coded placeholder image URL under ``photo_src`` and
    ``image_src``. Progress banners are printed only when the module-level
    ``conf_show_messages`` flag is truthy.

    NOTE(review): nothing is returned in the code visible here, and only
    ``EOFError`` is handled; confirm both are intended.
    """
    jeen = Jeen()
    if conf_show_messages:
        print('\r\n\t======== Page Crawl Start - Product -----------')
    # Selector over the page; not used by the visible analysis code below.
    hxs = HtmlXPathSelector(response)
    item = jeen.init_item('product')  # initialise the item
    try:
        if conf_show_messages:
            print('----Product Fetch Start----')
        # -- analysis code begins ------------------------------------------
        placeholder_image = 'http://img1.toocle.com/bin/img/?x=160&y=160&t=product_cn&m=1&s=/2012/11/11/08/26527608_1.jpg'
        item['purl'] = response.url
        item['photo_src'] = placeholder_image
        item['image_src'] = placeholder_image
        # -- analysis code ends --------------------------------------------
        if conf_show_messages:
            print('---- Fetch Success ----')
    except EOFError as err:
        if conf_show_messages:
            print('----Product Fetch Error Start----')
        # The error itself is printed regardless of the verbosity flag.
        print(err)
        if conf_show_messages:
            print('----Product Fetch Error End----')
def parse_shop(self, response):
    """Populate a 'shop' item from a company page response.

    The visible analysis section stores the page URL under ``url`` and
    hard-coded placeholder URLs under ``logo_src`` and ``photo_src``. It then
    downloads the company introduction page (``newurl``) and rebinds ``hxs``
    to a selector over that page. The download goes through a proxy obtained
    from ``jeen.get_proxy()`` when the module-level ``conf_use_proxy`` flag
    is truthy, and directly otherwise. It is attempted twice: a failure of
    the first attempt is swallowed, a failure of the second propagates.

    Progress banners are printed only when ``conf_show_messages`` is truthy.

    NOTE(review): nothing is returned in the code visible here, and only
    ``EOFError`` is handled by the outer ``try``, so network errors from the
    second attempt reach the caller; confirm both are intended.
    """
    jeen = Jeen()
    if conf_show_messages:
        print('\r\n\t======== Page Crawl Start - Company -----------')
    hxs = HtmlXPathSelector(response)
    item = jeen.init_item('shop')  # initialise the shop item
    try:
        if conf_show_messages:
            print('----Company Fetch Start----')
        # -- analysis code begins ------------------------------------------
        item['url'] = response.url
        item['logo_src'] = 'http://china.toocle.com/images/comp/11/s19.gif'
        item['photo_src'] = 'http://img1.toocle.com/bin/img/?x=217&y=156&t=company_cn&m=1&s=/2013/09/05/05/2519405_1.jpg'
        newurl = 'http://cn.china.cn'  # URL of the company introduction page

        def open_page():
            # One download attempt; a proxy is requested anew on every call.
            if conf_use_proxy:
                proxy_handle = urllib2.ProxyHandler(
                    {'http': jeen.get_proxy()})
                opener = urllib2.build_opener(proxy_handle)
                return opener.open(newurl, timeout=30)
            return urllib2.urlopen(newurl, timeout=30)

        try:
            page = open_page()
        except Exception:
            # Retry exactly once; a second failure is left to propagate.
            # `Exception` (not a bare except) so Ctrl-C / SystemExit are
            # not mistaken for a failed download.
            page = open_page()
        try:
            body = page.read()  # read the payload
        finally:
            page.close()  # release the connection even if read() fails

        # Build a new XPath selector over the downloaded page.
        newresponse = HtmlResponse(newurl, body=body)
        hxs = HtmlXPathSelector(newresponse)
        # -- analysis code ends --------------------------------------------
        if conf_show_messages:
            print('---- Fetch Success ----')
    except EOFError as err:
        if conf_show_messages:
            print('----Company Fetch Error Start----')
        # The error itself is printed regardless of the verbosity flag.
        print(err)
        if conf_show_messages:
            print('----Company Fetch Error End----')
def parse_shop(self, response):
    """Populate a 'shop' item from a company page response.

    The visible analysis section stores the page URL under ``url`` and
    hard-coded placeholder URLs under ``logo_src`` and ``photo_src``. It then
    downloads the company introduction page (``newurl``) and rebinds ``hxs``
    to a selector over that page. The download goes through a proxy obtained
    from ``jeen.get_proxy()`` when the module-level ``conf_use_proxy`` flag
    is truthy, and directly otherwise. It is attempted twice: a failure of
    the first attempt is swallowed, a failure of the second propagates.

    Progress banners are printed only when ``conf_show_messages`` is truthy.

    NOTE(review): nothing is returned in the code visible here, and only
    ``EOFError`` is handled by the outer ``try``, so network errors from the
    second attempt reach the caller; confirm both are intended.
    """
    jeen = Jeen()
    if conf_show_messages:
        print('\r\n\t======== Page Crawl Start - Company -----------')
    hxs = HtmlXPathSelector(response)
    item = jeen.init_item('shop')  # initialise the shop item
    try:
        if conf_show_messages:
            print('----Company Fetch Start----')
        # -- analysis code begins ------------------------------------------
        item['url'] = response.url
        item['logo_src'] = 'http://china.toocle.com/images/comp/11/s19.gif'
        item['photo_src'] = 'http://img1.toocle.com/bin/img/?x=217&y=156&t=company_cn&m=1&s=/2013/09/05/05/2519405_1.jpg'
        newurl = 'http://cn.china.cn'  # URL of the company introduction page

        def open_page():
            # One download attempt; a proxy is requested anew on every call.
            if conf_use_proxy:
                proxy_handle = urllib2.ProxyHandler(
                    {'http': jeen.get_proxy()})
                opener = urllib2.build_opener(proxy_handle)
                return opener.open(newurl, timeout=30)
            return urllib2.urlopen(newurl, timeout=30)

        try:
            page = open_page()
        except Exception:
            # Retry exactly once; a second failure is left to propagate.
            # `Exception` (not a bare except) so Ctrl-C / SystemExit are
            # not mistaken for a failed download.
            page = open_page()
        try:
            body = page.read()  # read the payload
        finally:
            page.close()  # release the connection even if read() fails

        # Build a new XPath selector over the downloaded page.
        newresponse = HtmlResponse(newurl, body=body)
        hxs = HtmlXPathSelector(newresponse)
        # -- analysis code ends --------------------------------------------
        if conf_show_messages:
            print('---- Fetch Success ----')
    except EOFError as err:
        if conf_show_messages:
            print('----Company Fetch Error Start----')
        # The error itself is printed regardless of the verbosity flag.
        print(err)
        if conf_show_messages:
            print('----Company Fetch Error End----')