Example #1
0
	def fetch_categorys(self):
		"""Fetch the three top-level Eslite category pages and collect
		their sub-category links.

		Returns a list of category dicts (seeded by
		``self.init_category_format()``) with ``cate``, ``sub``, ``list``
		and ``text`` filled in.  Links missing the expected query
		parameters are skipped.
		"""
		category_urls = [
			"http://www.eslite.com/category.aspx?cate=80",	# Chinese books
			"http://www.eslite.com/category.aspx?cate=156",	# foreign-language books
			"http://www.eslite.com/category.aspx?cate=44"	# children's books
		]

		categorys = []

		for category_url in category_urls:
			self.snoopy.fetch(category_url)
			html = self.snoopy.results
			# Collapse the page onto one line so the link regex below can
			# match across what were originally line breaks.
			reg_pattern = re.compile("\r")
			html = str_repalce(html, reg_pattern, "")
			reg_pattern = re.compile("\n")
			html = str_repalce(html, reg_pattern, "")
			reg_pattern = re.compile(r'<a href="(newbook_list.aspx?.*?)">(.*?)</a>')
			category_strs = reg_pattern.findall(html)

			for category_str in category_strs:
				try:
					category = self.init_category_format()
					params = url_decode(category_str[0])
					category["cate"] = params["cate"]
					category["sub"] = params["sub"]
					category["list"] = params["list"]
					category["text"] = category_str[1].strip().decode("utf8")
					categorys.append(category)
				except Exception:
					# Best-effort scrape: skip links whose query string lacks
					# the expected keys or whose text is not valid UTF-8.
					# (Was a bare ``except:`` which also swallowed SystemExit
					# and KeyboardInterrupt.)
					pass
		return categorys
 def parseMiscInfo(el):
     """Extract misc info from a weibo entry element.

     Pulls the comment count, repost count, post timestamp, source and
     author uid out of the entry's info bar, and derives the status mid
     from the permalink href.

     Returns a ``(info_dict, mid)`` tuple.
     """
     info = el.xpath("./*[@class='info W_linkb W_textb']")[0]
     misc = {
         'ccount': 0,
         'rcount': 0,
         'source': '',
     }
     # Repost/comment counters live in <span><a action-type=...> nodes.
     for anchor in info.xpath("./span/a"):
         action = anchor.get('action-type', '')
         if action == 'feed_list_forward':
             misc['rcount'] = NodeService.getCount(anchor.text)
         if action == 'feed_list_comment':
             misc['ccount'] = NodeService.getCount(anchor.text)
     misc['cdate'] = int(
         info.xpath("./a[@node-type='feed_list_item_date']/@date")[0])
     # Permalink path looks like .../<uid>/<encoded mid>.
     href_parts = unicode(
         info.xpath("./a[@node-type='feed_list_item_date']/@href")[0]
     ).split('/')
     mid = url_decode(href_parts[-1])
     misc['uid'] = int(href_parts[-2])
     misc['source'] = unicode(info.xpath("./a[last()]/text()")[0])
     return misc, mid
Example #3
0
 def _get_filename_by_url(self, url):
     try:
         import re
         result = re.match(r"[^:]+://[^/]+/?([^?#]*)",url).groups()[0]
         result = result.split('/')[-1]
         if result:
             return url_decode(result)
         else:
             return "download"
     except Exception:
         return "download"
Example #4
0
 def _get_filename_by_url(self, url):
     try:
         import re
         result = re.match(r"[^:]+://[^/]+/?([^?#]*)", url).groups()[0]
         result = result.split('/')[-1]
         if result:
             return url_decode(result)
         else:
             return "download"
     except Exception:
         return "download"
Example #5
0
 def _filename_from_content_disposition(self, content_disposition):
     # rfc2183
     disposition = content_disposition.split(';')
     for i in xrange(1, len(disposition)):
         disposition_parm = disposition[i].split('=')
         if len(disposition_parm) > 1 and disposition_parm[0].strip() == 'filename':
             filename = url_decode(disposition_parm[1].strip('"'))
             if len(filename) > 0:
                 return filename
             else:
                 break
     return None
Example #6
0
 def _filename_from_content_disposition(self, content_disposition):
     """Return the url-decoded ``filename`` parameter of a
     Content-Disposition header value (rfc2183), or None when the
     header has no non-empty filename parameter.
     """
     params = content_disposition.split(';')
     # Skip element 0 (the disposition type itself, e.g. "attachment").
     for idx in xrange(1, len(params)):
         pieces = params[idx].split('=')
         if len(pieces) <= 1:
             continue
         if pieces[0].strip() != 'filename':
             continue
         candidate = url_decode(pieces[1].strip('"'))
         if len(candidate) > 0:
             return candidate
         break
     return None