예제 #1
0
 def parse_shop(self, response):
     """Process a company ("shop") page and load its introduction page via a proxy.

     Creates a 'shop' item with ``init_item``, stores the response URL and
     two hard-coded sample image paths in it, then downloads
     ``http://cn.china.cn`` through an HTTP proxy obtained from
     ``get_proxy()`` and builds a new ``HtmlXPathSelector`` over the
     downloaded body.

     The download is attempted at most twice, each time with a fresh
     ``get_proxy()`` call.  An error raised by the second attempt propagates
     to the caller; only ``EOFError`` is handled (printed) here.

     :param response: response for the company page; it is passed to
         ``HtmlXPathSelector`` and its ``url`` is stored in the item.
     :returns: None -- the item is filled in but not returned by this code.
     """
     def open_via_proxy(url):
         # Build a one-off opener routed through a freshly chosen proxy.
         # NOTE(review): presumably get_proxy() returns a "host:port" string
         # -- confirm against its definition.
         proxy_handle = urllib2.ProxyHandler({'http': get_proxy()})
         return urllib2.build_opener(proxy_handle).open(url, timeout=30)

     print('\r\n\t======== Page Crawl Start - Company -----------')
     hxs = HtmlXPathSelector(response)
     item = init_item('shop')  # initialise the shop item
     try:
         if conf['show_messages']:
             print('----Company Fetch Start----')
         # -- analysis code begins ------------------------------------------
         item['url'] = response.url
         item['logo_src'] = 'http://baidu.com/abc/ddd.jpg'
         item['photo_src'] = '/image/abcd.jpg'
         newurl = 'http://cn.china.cn'  # URL of the company introduction page
         try:
             page = open_via_proxy(newurl)
         except Exception:
             # Retry exactly once with another proxy.  ``Exception`` (rather
             # than a bare ``except``) lets KeyboardInterrupt/SystemExit
             # through instead of triggering a second 30-second attempt.
             # If this attempt fails too, the error propagates.
             page = open_via_proxy(newurl)
         try:
             temp = page.read()  # raw body of the introduction page
         finally:
             page.close()  # release the connection even if read() fails
         # Pass the body through the constructor instead of the private
         # ``_set_body`` hook.
         newresponse = HtmlResponse(newurl, body=temp)
         hxs = HtmlXPathSelector(newresponse)  # selector over the fetched page
         # -- analysis code ends --------------------------------------------
         if conf['show_messages']:
             print('---- Fetch Success ----')
     except EOFError as e:
         if conf['show_messages']:
             print('----Company Fetch Error Start----')
         print(e)
         if conf['show_messages']:
             print('----Company Fetch Error End----')
예제 #2
0
 def parse_shop(self, response):
     """Process a company ("shop") page and load its introduction page via a proxy.

     Creates a 'shop' item with ``init_item``, stores the response URL and
     two hard-coded sample image paths in it, then downloads
     ``http://cn.china.cn`` through an HTTP proxy obtained from
     ``get_proxy()`` and builds a new ``HtmlXPathSelector`` over the
     downloaded body.

     The download is attempted at most twice, each time with a fresh
     ``get_proxy()`` call.  An error raised by the second attempt propagates
     to the caller; only ``EOFError`` is handled (printed) here.

     :param response: response for the company page; it is passed to
         ``HtmlXPathSelector`` and its ``url`` is stored in the item.
     :returns: None -- the item is filled in but not returned by this code.
     """
     def open_via_proxy(url):
         # Build a one-off opener routed through a freshly chosen proxy.
         # NOTE(review): presumably get_proxy() returns a "host:port" string
         # -- confirm against its definition.
         proxy_handle = urllib2.ProxyHandler({'http': get_proxy()})
         return urllib2.build_opener(proxy_handle).open(url, timeout=30)

     print('\r\n\t======== Page Crawl Start - Company -----------')
     hxs = HtmlXPathSelector(response)
     item = init_item('shop')  # initialise the shop item
     try:
         if conf['show_messages']:
             print('----Company Fetch Start----')
         # -- analysis code begins ------------------------------------------
         item['url'] = response.url
         item['logo_src'] = 'http://baidu.com/abc/ddd.jpg'
         item['photo_src'] = '/image/abcd.jpg'
         newurl = 'http://cn.china.cn'  # URL of the company introduction page
         try:
             page = open_via_proxy(newurl)
         except Exception:
             # Retry exactly once with another proxy.  ``Exception`` (rather
             # than a bare ``except``) lets KeyboardInterrupt/SystemExit
             # through instead of triggering a second 30-second attempt.
             # If this attempt fails too, the error propagates.
             page = open_via_proxy(newurl)
         try:
             temp = page.read()  # raw body of the introduction page
         finally:
             page.close()  # release the connection even if read() fails
         # Pass the body through the constructor instead of the private
         # ``_set_body`` hook.
         newresponse = HtmlResponse(newurl, body=temp)
         hxs = HtmlXPathSelector(newresponse)  # selector over the fetched page
         # -- analysis code ends --------------------------------------------
         if conf['show_messages']:
             print('---- Fetch Success ----')
     except EOFError as e:
         if conf['show_messages']:
             print('----Company Fetch Error Start----')
         print(e)
         if conf['show_messages']:
             print('----Company Fetch Error End----')
예제 #3
0
 def parse_product(self, response):
     """Process a product page: create a 'product' item and record its URL.

     A banner line is always printed; the progress messages are printed
     only when ``conf['show_messages']`` is truthy.  Only ``EOFError`` is
     handled (its value is printed); any other exception propagates.

     :param response: response for the product page; it is passed to
         ``HtmlXPathSelector`` and its ``url`` is stored in ``item['purl']``.
     :returns: None -- the item is filled in but not returned by this code.
     """
     def progress(message):
         # Optional progress output, switched by conf['show_messages'].
         if conf['show_messages']:
             print(message)

     print('\r\n\t======== Page Crawl Start - Product -----------')
     hxs = HtmlXPathSelector(response)  # built here; not used by the code below
     item = init_item('product')  # initialise the product item
     try:
         progress('----Product Fetch Start----')
         # -- analysis code begins ------------------------------------------
         item['purl'] = response.url
         # -- analysis code ends --------------------------------------------
         progress('---- Fetch Success ----')
     except EOFError as err:
         progress('----Product Fetch Error Start----')
         print(err)
         progress('----Product Fetch Error End----')
예제 #4
0
 def parse_shop(self, response):
     """Process a company ("shop") page: create a 'shop' item and record its URL.

     A banner line is always printed; the progress messages are printed
     only when ``conf['show_messages']`` is truthy.  Only ``EOFError`` is
     handled (its value is printed); any other exception propagates.

     :param response: response for the company page; it is passed to
         ``HtmlXPathSelector`` and its ``url`` is stored in ``item['url']``.
     :returns: None -- the item is filled in but not returned by this code.
     """
     print('\r\n\t======== Page Crawl Start - Company -----------')
     hxs = HtmlXPathSelector(response)  # built here; not used by the code below
     item = init_item('shop')  # initialise the shop item
     try:
         if conf['show_messages']:
             print('----Company Fetch Start----')
         # -- analysis code begins ------------------------------------------
         item['url'] = response.url
         # -- analysis code ends --------------------------------------------
         if conf['show_messages']:
             print('---- Fetch Success ----')
     except EOFError as err:
         if conf['show_messages']:
             print('----Company Fetch Error Start----')
         print(err)
         if conf['show_messages']:
             print('----Company Fetch Error End----')
예제 #5
0
 def parse_product(self, response):
     """Process a product page: create a 'product' item and fill sample fields.

     Stores the response URL in ``item['purl']`` and two hard-coded sample
     values in ``item['photo_src']`` and ``item['image_src']`` (the latter
     is a '|'-separated string of image paths).

     A banner line is always printed; the progress messages are printed
     only when ``conf['show_messages']`` is truthy.  Only ``EOFError`` is
     handled (its value is printed); any other exception propagates.

     :param response: response for the product page; it is passed to
         ``HtmlXPathSelector`` and its ``url`` is stored in the item.
     :returns: None -- the item is filled in but not returned by this code.
     """
     def progress(message):
         # Optional progress output, switched by conf['show_messages'].
         if conf['show_messages']:
             print(message)

     print('\r\n\t======== Page Crawl Start - Product -----------')
     hxs = HtmlXPathSelector(response)  # built here; not used by the code below
     item = init_item('product')  # initialise the product item
     try:
         progress('----Product Fetch Start----')
         # -- analysis code begins ------------------------------------------
         item['purl'] = response.url
         item['photo_src'] = '/abcd.jpg'
         item['image_src'] = (
             'http://123.com/abcd.jpg|/abc/ddd.jpg|./dew.jpg|.././ccc.jpg')
         # -- analysis code ends --------------------------------------------
         progress('---- Fetch Success ----')
     except EOFError as err:
         progress('----Product Fetch Error Start----')
         print(err)
         progress('----Product Fetch Error End----')
예제 #6
0
 def parse_product(self, response):
     """Process a product page: create a 'product' item and fill sample fields.

     Stores the response URL in ``item['purl']`` and two hard-coded sample
     values in ``item['photo_src']`` and ``item['image_src']`` (the latter
     is a '|'-separated string of image paths).

     A banner line is always printed; the progress messages are printed
     only when ``conf['show_messages']`` is truthy.  Only ``EOFError`` is
     handled (its value is printed); any other exception propagates.

     :param response: response for the product page; it is passed to
         ``HtmlXPathSelector`` and its ``url`` is stored in the item.
     :returns: None -- the item is filled in but not returned by this code.
     """
     print('\r\n\t======== Page Crawl Start - Product -----------')
     hxs = HtmlXPathSelector(response)  # built here; not used by the code below
     item = init_item('product')  # initialise the product item
     try:
         if conf['show_messages']:
             print('----Product Fetch Start----')
         # -- analysis code begins ------------------------------------------
         item['purl'] = response.url
         item['photo_src'] = '/abcd.jpg'
         item['image_src'] = (
             'http://123.com/abcd.jpg|/abc/ddd.jpg|./dew.jpg|.././ccc.jpg')
         # -- analysis code ends --------------------------------------------
         if conf['show_messages']:
             print('---- Fetch Success ----')
     except EOFError as err:
         if conf['show_messages']:
             print('----Product Fetch Error Start----')
         print(err)
         if conf['show_messages']:
             print('----Product Fetch Error End----')