Example #1
 def parse(self,url,path):
     f=open(path)
     data=f.read()
     f.close()
     ext=funcs.get_url_ext(url)
     if ext in ('php','html','htm','asp','aspx','jsp'):
         data,coding=funcs.decode_data(data)
         soup=BeautifulSoup(str(data),'html5lib',from_encoding='utf-8')
         urls,css_urls,js_urls,img_urls=self.get_link(soup)
         all_hrefs=css_urls+js_urls+urls+img_urls
             
         self.item.url=url
         self.item.content=str(soup)                      # use the modified markup
         self.item.coding=coding                          # content encoding
         self.item.all_hrefs=all_hrefs
 
         self.item=self.update_css_js(self.item)
         content=str(self.item.content).encode(self.item.coding,'ignore')
     else:
         content=data
     patt=r'[\s\S]+?(/media/html/[\s\S]+)'       # raw string keeps the regex escapes intact
     m=re.search(patt,path)
     if m:
         save_path=S_save_root_dir+m.group(1)
     else:
         print path,'is not a valid save path'
         save_path=''
     if save_path:
         print save_path
         if self.check_dir_path(os.path.dirname(save_path)):
             f=open(save_path,'wb')              # binary mode: content holds encoded or raw bytes
             f.write(content)
             f.close()
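
The parse method above relies on two helpers from the project's funcs module that these examples never show. Here is a minimal sketch of what they plausibly do, inferred only from the call sites (get_url_ext returns an extension string, decode_data returns a (text, coding) pair); treat it as an assumption, not the project's actual code:

import urlparse

def get_url_ext(url):
    # assumed behavior: extension of the URL path, without the dot
    path=urlparse.urlparse(url)[2]
    if '.' in path:
        return path.rsplit('.',1)[1]
    return ''

def decode_data(data):
    # assumed behavior: try common encodings, return (unicode_text, coding)
    for coding in ('utf-8','gb18030','big5'):
        try:
            return data.decode(coding),coding
        except UnicodeError:
            continue
    return data.decode('utf-8','ignore'),'utf-8'
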
Example #2
 def save_data(self,path,item):
     '''Save the page to disk'''
     try:
         ext=funcs.get_url_ext(item.url)
         if ext in settings.S_img_ext:
             w_type='wb'
             data=item.content
         elif ext in ('css','js') or item.url.startswith('http://bits.wikimedia.org'):
             w_type='w'
             data=item.content
         else:
             w_type='w'
             if item.coding:
                 #data=str(item.soup).decode('utf-8','ignore').encode(item.coding,'ignore')
                 #data=item.soup.prettify(item.coding)
                 data=item.soup.prettify()           # everything is converted to utf-8
             else:
                 #data=str(item.soup)
                 data=item.soup.prettify()
         if len(data.strip())>0:
             f=open(path,w_type)
             f.write(data)
             f.close()
             return True
         else:
             raise Exception(item.url+' content is empty')
     except Exception as e:
         msg='pipeline: '+item.url+' save fail :'+str(e)
         print msg
         save_fail_log=os.path.abspath('.').replace('\\','/')+'/save_fail.txt'
         f=open(save_fail_log,'a')
         f.write(msg+'\n')
         f.close()
         return False
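
Every snippet passes around an item object whose fields are assigned ad hoc. A hypothetical reconstruction of that container, listing only the attributes these examples actually read and write (the project's real Item class may differ):

class Item(object):
    '''Crawled-page container (hypothetical reconstruction).'''
    def __init__(self):
        self.url=''            # page URL
        self.soup=None         # BeautifulSoup tree, None for binary content
        self.content=''        # page body, possibly with rewritten links
        self.coding=''         # detected content encoding
        self.all_hrefs=[]      # links extracted from the page
        self.file_length=0     # original response size in bytes
        self.idx=0             # internal record id
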
Example #3
 def update_css_js(self):
     # rewrite css/js references and some of the page links
     for href in self.item.all_hrefs:
         turl=href.split('#')[0].strip()         # strip the fragment anchor
         if turl=='' or turl.startswith('javascript') or turl.startswith('#'):
             continue
         
         ext=funcs.get_url_ext(href).lower()
         if ext in ('css','js'):
             c_href=urlparse.urljoin(self.item.url,href)
             sub_url=urlparse.urlparse(c_href)[2]       # turn http://www.csdn.net/../sdf into http://www.csdn.net/sdf
             if sub_url.startswith('/../'):
                 sub_url=sub_url[3:]
             c_href=self.url_prefix_main+'/css_js'+sub_url
             patt_prefix='href'
         elif ext in ('jpg','png','gif','jpeg'):
             c_href=urlparse.urljoin(self.item.url,href)
             sub_url=urlparse.urlparse(c_href)[2]       # turn http://www.csdn.net/../sdf into http://www.csdn.net/sdf
             if sub_url.startswith('/../'):
                 sub_url=sub_url[3:]
             domain_name=funcs.url2domain(c_href)
             c_href=self.url_prefix_main+'/'+domain_name+sub_url
             patt_prefix='src'
         else:
             c_href=urlparse.urljoin(self.item.url,href)
             if len(re.findall('/',c_href))==2:         # turn http://www.csdn.net into http://www.csdn.net/
                 c_href=c_href+'/'
                 
             domain_name=funcs.url2domain(c_href)
             c_idx=funcs.get_md52int(c_href)
             c_href=self.url_prefix_main+'/'+domain_name+urlparse.urlparse(c_href)[2]
             dir_path=os.path.dirname(c_href)
             c_href=dir_path+'/'+str(c_idx)+'.html'
             patt_prefix='href'
         try:
             p_href=funcs.get_re_patt(href)
             patt=patt_prefix+'=[\'"]'+p_href+'[\'"]'
             c_href=patt_prefix+'="'+c_href+'"'
             self.item.content=re.sub(patt,c_href,self.item.content)                
         except Exception as e:
             print 'update_css_js:'+str(e)
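
The closing re.sub only works if funcs.get_re_patt escapes the original href so that characters like ? and + are matched literally. Sketches of it and of the two path helpers used above, again inferred from usage rather than copied from the project:

import re,hashlib,urlparse

def get_re_patt(href):
    # assumed behavior: escape regex metacharacters so the href matches literally
    return re.escape(href)

def url2domain(url):
    # assumed behavior: network-location part of the URL
    return urlparse.urlparse(url)[1]

def get_md52int(url):
    # assumed behavior: stable integer id derived from the URL's md5 digest
    return int(hashlib.md5(url).hexdigest()[:8],16)
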
Example #4
 def pipeline(self,item):
     '''Rewrite the page's links, then save the page'''
     if item.url=='':
         return
     root_dir=settings.S_root_dir
     sub_dir=urlparse.urlparse(item.url)[2]
     ext=funcs.get_url_ext(item.url)                     # get the file extension
     
     item,file_path=self.update_file_path(item,root_dir,sub_dir)
     #wiki css pages are php URLs starting with http://bits.wikimedia.org; do not analyse them
     #if (ext not in settings.S_img_ext) and (ext not in ('css','js')) and not item.url.startswith('http://bits.wikimedia.org'):
     if funcs.is_need_modify(item.url):
         #item=self.update_css_js(item)
         #print item.url
         item=self.modify_tree(item)
     if self.record(item):
         if self.check_dir_path(os.path.dirname(file_path)):
             if self.save_data(file_path,item):
                 print 'pipeline: '+str(self.thread_id)+' : '+str(item.idx)+' : '+item.url
     else:
         print 'pipeline: '+str(self.thread_id)+' : no need to update '+str(item.idx)+' : '+item.url
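
The commented-out condition spells out what funcs.is_need_modify presumably encapsulates: images, css/js files and the bits.wikimedia.org host are saved verbatim, everything else gets its links rewritten. A sketch under exactly that assumption (S_img_ext stands in for the value from settings):

S_img_ext=('jpg','jpeg','png','gif')        # assumed value of settings.S_img_ext

def is_need_modify(url):
    # assumed behavior: mirrors the commented-out condition in pipeline() above
    ext=get_url_ext(url)                    # helper sketched under Example #1
    return (ext not in S_img_ext
            and ext not in ('css','js')
            and not url.startswith('http://bits.wikimedia.org'))
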
Example #5
 def parse(self,response):
     '''Parse the page content'''
     response.url = response.url.strip()
     ext=funcs.get_url_ext(response.url)                     # get the file extension
     #wiki css pages are php URLs starting with http://bits.wikimedia.org; do not analyse them
     #if (ext not in settings.S_img_ext) and (ext not in ('css','js')) and not response.url.startswith('http://bits.wikimedia.org'):
     if funcs.is_need_modify(response.url):
         data,coding=funcs.decode_data(response.body)
         soup=BeautifulSoup(str(data),'lxml',from_encoding='utf-8')
         soup,urls,css_urls,js_urls,img_urls=self.get_link(soup)
         all_urls=css_urls+js_urls+urls+img_urls
         
         for url in all_urls:
             vurl=funcs.valid_url(response.url,url).strip()      # check that the link is valid
             if vurl != '':
                 # fetch the Simplified Chinese version of the page
                 vurl = funcs.translate_simplify( vurl )
                 _url=funcs.decode_data(vurl)[0].encode('utf-8')
                 print _url
                 if _url:
                     vurl=_url
                 yield Request(vurl)
                 
         item=Item()
         item.url=response.url
         item.soup=soup
         item.content=str(soup)                      # use the modified markup
         item.coding=coding                          # content encoding
         item.file_length=int(len(response.body))    # original file size in bytes
         yield item
     else:
         item=Item()
         item.url=response.url
         item.soup=None
         item.content=response.body                  # raw body, stored unmodified
         item.coding=''                              # content encoding unknown
         item.file_length=int(len(response.body))    # original file size in bytes
         yield item
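
funcs.valid_url decides which extracted links become new Requests; the only contract visible here is that it returns an absolute URL, or an empty string for links that should be skipped. A plausible sketch under that assumption:

import urlparse

def valid_url(base_url,href):
    # assumed behavior: resolve href against the current page and drop
    # anchors, javascript: pseudo-links and non-http schemes; '' means skip
    href=href.split('#')[0].strip()
    if href=='' or href.startswith('javascript'):
        return ''
    absolute=urlparse.urljoin(base_url,href)
    if not absolute.startswith('http'):
        return ''
    return absolute
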
Example #6
        
        
if __name__=='__main__':
    ext=funcs.get_url_ext('http://asdf.com/sdf.css')
    
    if ext in ('css','js'):
        print ext