# Module-level imports this method relies on (funcs is the project's helper
# module; S_save_root_dir is assumed to come from the project's settings):
import os
import re

from bs4 import BeautifulSoup

import funcs
from settings import S_save_root_dir  # assumption: defined in settings

def parse(self, url, path):
    """Rewrite the links of a saved page and write it under the save root."""
    f = open(path)
    data = f.read()
    f.close()
    ext = funcs.get_url_ext(url)
    if ext in ('php', 'html', 'htm', 'asp', 'aspx', 'jsp'):
        data, coding = funcs.decode_data(data)
        soup = BeautifulSoup(str(data), 'html5lib', from_encoding='utf-8')
        urls, css_urls, js_urls, img_urls = self.get_link(soup)
        all_hrefs = css_urls + js_urls + urls + img_urls
        self.item.url = url
        self.item.content = str(soup)   # use the modified markup
        self.item.coding = coding       # content encoding
        self.item.all_hrefs = all_hrefs
        self.item = self.update_css_js(self.item)
        content = str(self.item.content).encode(self.item.coding, 'ignore')
    else:
        content = data
    # Map the local file path to a path under the save root.
    patt = r'[\s\S]+?(/media/html/[\s\S]+)'
    m = re.search(patt, path)
    if m:
        save_path = S_save_root_dir + m.group(1)
    else:
        print path, 'is a wrong path'
        save_path = ''
    if save_path:
        print save_path
        if self.check_dir_path(os.path.dirname(save_path)):
            f = open(save_path, 'w')
            f.write(content)
            f.close()
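# The helper funcs.decode_data is not shown in this section. A minimal sketch
# of what its call sites above imply (hypothetical implementation, not the
# project's actual code): guess the byte string's encoding and return the
# pair (decoded_text, encoding_name). chardet is an assumed dependency.
import chardet

def decode_data(data):
    """Guess the encoding with chardet and decode; fall back to utf-8."""
    guess = chardet.detect(data)
    coding = guess.get('encoding') or 'utf-8'
    try:
        return data.decode(coding, 'ignore'), coding
    except LookupError:  # chardet returned a codec name Python doesn't know
        return data.decode('utf-8', 'ignore'), 'utf-8'

# Usage matching the call above: data, coding = decode_data(raw_bytes)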
# Module-level imports assumed for this spider (Item is the project's own
# item container, not scrapy.Item, since attributes are assigned directly):
from bs4 import BeautifulSoup
from scrapy.http import Request

import funcs
from items import Item  # assumption: project-defined item class

def parse(self, response):
    '''Analyse the page content.'''
    response.url = response.url.strip()
    ext = funcs.get_url_ext(response.url)  # file extension; only used by the commented-out check below
    # wiki's CSS pages are php URLs under http://bits.wikimedia.org -- skip analysing them
    # if (ext not in settings.S_img_ext) and (ext not in ('css','js')) and not response.url.startswith('http://bits.wikimedia.org'):
    if funcs.is_need_modify(response.url):
        data, coding = funcs.decode_data(response.body)
        soup = BeautifulSoup(str(data), 'lxml', from_encoding='utf-8')
        soup, urls, css_urls, js_urls, img_urls = self.get_link(soup)
        all_urls = css_urls + js_urls + urls + img_urls
        for url in all_urls:
            vurl = funcs.valid_url(response.url, url).strip()  # empty string means the link is invalid
            if vurl != '':
                # crawl the Simplified Chinese version of the page
                vurl = funcs.translate_simplify(vurl)
                _url = funcs.decode_data(vurl)[0].encode('utf-8')
                print _url
                if _url:
                    vurl = _url
                yield Request(vurl)
        item = Item()
        item.url = response.url
        item.soup = soup
        item.content = str(soup)                    # the modified markup
        item.coding = coding                        # content encoding
        item.file_length = int(len(response.body))  # original file size
        yield item
    else:
        item = Item()
        item.url = response.url
        item.soup = None
        item.content = response.body                # raw body, left unmodified
        item.coding = ''                            # encoding not determined
        item.file_length = int(len(response.body))  # original file size
        yield item
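# funcs.valid_url is likewise not shown here. A minimal sketch of what its
# call site suggests (hypothetical, with assumed filtering rules): resolve
# the href against the page URL and return an absolute http(s) URL, or ''
# for anchors and non-fetchable schemes the crawler should skip.
import urlparse

def valid_url(base_url, href):
    """Return an absolute crawlable URL, or '' if the link should be skipped."""
    href = href.strip()
    if not href or href.startswith(('#', 'javascript:', 'mailto:')):
        return ''
    absolute = urlparse.urljoin(base_url, href)
    if urlparse.urlparse(absolute).scheme in ('http', 'https'):
        return absolute
    return ''

# Usage matching the loop above: vurl = valid_url(response.url, url).strip()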