Example #1
 def CatchArticles(self):
     recAbstract = re.compile(self.reAbstract, re.DOTALL)
     recArticle = re.compile(self.reArticle, re.DOTALL)
     recImage = re.compile(self.reImage, re.DOTALL)
     html = self.DownLoadHtml(self.url, '文章列表页{0}访问失败,异常信息为:{1}')
     if html is None:
         return self.articles
     for x in recAbstract.findall(html):
         article = dict(url=Spider.ComposeUrl(self.url, x[0]),
                        title=x[1],
                        time=datetime.datetime.strptime(x[2], '%Y-%m-%d'))
         html = self.DownLoadHtml(article['url'], '文章页{0}访问失败,异常信息为:{1}')
         if html is None:
             continue
         content = None
         images = []
         imageCount = 0
         for y in recArticle.findall(html):
             content = y
             for z in recImage.findall(content):
                 imageCount += 1
                 imageUrl = Spider.ComposeUrl(
                     article['url'],
                     urllib.parse.quote(z) if z[0] in ['/', '.'] else z)
                 image = self.DownLoadImage(imageUrl, '图片{0}提取失败,异常信息为:{1}')
                 if image is None:
                     continue
                 image['imageUrl'] = Spider.ComposeUrl(article['url'], z)
                 images.append(image)
         if not content \
         or imageCount != len(images):
             continue
         self.CacheArticle(article, content, images, '成功自{0}提取文章')
     return self.articles
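Every example in this series resolves relative links through Spider.ComposeUrl, whose implementation is not part of these listings. A minimal sketch, assuming it is just a thin wrapper over urllib.parse.urljoin (note example #1 additionally percent-quotes relative image paths via urllib.parse.quote before composing):

import urllib.parse

class Spider:
    @staticmethod
    def ComposeUrl(baseUrl, relativeUrl):
        # Resolve a possibly relative link against the page it was found on.
        # urljoin handles absolute URLs, '/'-rooted paths and '../' segments.
        return urllib.parse.urljoin(baseUrl, relativeUrl)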
Example #2
 def CatchArticles(self):
     recAbstract = re.compile(self.reAbstract, re.DOTALL)
     recArticle = re.compile(self.reArticle, re.DOTALL)
     recImage = re.compile(self.reImage, re.DOTALL)
     html = self.DownLoadHtml(self.url, '文章列表页{0}访问失败,异常信息为:{1}')
     if html is None:
         return self.articles
     for x in recAbstract.findall(html):
         article = dict(url=Spider.ComposeUrl(self.url, x[0]), title=x[1])
         html = self.DownLoadHtml(article['url'], '文章页{0}访问失败,异常信息为:{1}')
         if html is None:
             continue
         content = None
         images = []
         imageCount = 0
         for y in recArticle.findall(html):
             article['time'] = datetime.datetime.strptime(y[0], '%Y-%m-%d')
             if not self.CheckNewArticle(article):
                 logging.debug('文章源{0}并非新文章。'.format(article['url']))
                 continue
             content = y[1]
             for z in recImage.findall(content):
                 imageCount += 1
                 imageUrl = Spider.ComposeUrl(article['url'], z)
                 image = self.DownLoadImage(imageUrl, '图片{0}提取失败,异常信息为:{1}')
                 if image is None:
                     continue
                 images.append(image)
         if not content \
         or imageCount != len(images):
             continue
         self.CacheArticle(article, content, images, '成功自{0}提取文章')
     return self.articles
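All of these variants share the same DownLoadHtml contract: return the decoded page on success, log the supplied message template and return None on failure, so callers can simply skip the item. A minimal sketch under that assumption (the optional encoding parameter matches its use in examples #9 through #11):

import logging
import urllib.request

def DownLoadHtml(self, url, errorTemplate, encoding='utf-8'):
    # Fetch a page; on any failure, log the template and return None.
    try:
        with urllib.request.urlopen(url, timeout=30) as response:
            return response.read().decode(encoding, errors='replace')
    except Exception as e:
        # The template carries two placeholders: the URL and the exception text.
        logging.warning(errorTemplate.format(url, str(e)))
        return None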
Example #3
 def CatchArticles(self):
     recAbstract = re.compile(self.reAbstract, re.DOTALL)
     recArticle = re.compile(self.reArticle, re.DOTALL)
     recImage = re.compile(self.reImage, re.DOTALL)
     html = self.DownLoadHtml(self.url, '文章列表页{0}访问失败,异常信息为:{1}')
     if html is None:
         return self.articles
     for x in recAbstract.findall(html):
         article = dict(
             time=datetime.datetime.strptime(x[0], '%Y-%m-%d'),
             # url = self.url[0:self.url.rfind('/')] + x[1][1:],
             url=Spider.ComposeUrl(self.url, x[1]),
             title=x[2])
         html = self.DownLoadHtml(article['url'], '文章页{0}访问失败,异常信息为:{1}')
         if html is None:
             continue
         content = None
         images = []
         imageCount = 0
         for y in recArticle.findall(html):
             content = y
             for z in recImage.findall(content):
                 imageCount += 1
                 # imageUrl = article['url'][0:article['url'].rfind('/')] + z[1:]
                 imageUrl = Spider.ComposeUrl(article['url'], z)
                 image = self.DownLoadImage(imageUrl, '图片{0}提取失败,异常信息为:{1}')
                 if image is None:
                     continue
                 images.append(image)
         if not content \
         or imageCount != len(images):
             continue
         self.CacheArticle(article, content, images, '成功自{0}提取文章')
     return self.articles
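The commented-out lines above preserve the manual string splicing that ComposeUrl replaced. urljoin gets right the cases that rfind('/') splicing does not; a quick comparison with an illustrative URL:

import urllib.parse

base = 'http://example.com/news/list.html'  # hypothetical page URL
print(urllib.parse.urljoin(base, 'detail.html'))    # http://example.com/news/detail.html
print(urllib.parse.urljoin(base, '/img/a.png'))     # http://example.com/img/a.png
print(urllib.parse.urljoin(base, '../about.html'))  # http://example.com/about.html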
Example #4
 def CatchArticles(self):
     recAbstract = re.compile(self.reAbstract, re.DOTALL)
     recArticle = re.compile(self.reArticle, re.DOTALL)
     recImage = re.compile(self.reImage, re.DOTALL)
     recPage = re.compile(
         r'<OPTION value=([^>\s]+?)(?:\s[^>]*?)*?>[^<]*?</OPTION>',
         re.DOTALL)
     html = self.DownLoadHtml(self.url, '文章列表页{0}访问失败,异常信息为:{1}')
     if html is None:
         return self.articles
     for x in recAbstract.findall(html):
         article = dict(url=Spider.ComposeUrl(self.url, x[0]),
                        title=x[1],
                        time=datetime.datetime.strptime(x[2], '%Y-%m-%d'))
         if not self.CheckNewArticle(article):
             logging.debug('文章源{0}并非新文章。'.format(article['url']))
             continue
         html = self.DownLoadHtml(article['url'], '文章页{0}访问失败,异常信息为:{1}')
         if html is None:
             continue
         totalContent = ''
         images = []
         imageCount = 0
         pageUrls = recPage.findall(html)
         if not pageUrls:
             pageUrls.append(article['url'])
         for p in pageUrls:
             pageUrl = Spider.ComposeUrl(article['url'], p)
             if pageUrl != article['url']:
                 html = self.DownLoadHtml(pageUrl, '文章页{0}访问失败,异常信息为:{1}')
                 if html is None:
                     continue
             content = None
             for y in recArticle.findall(html):
                 content = y
                 for z in recImage.findall(content):
                     imageCount += 1
                     imageUrl = Spider.ComposeUrl(article['url'], z)
                     image = self.DownLoadImage(imageUrl,
                                                '图片{0}提取失败,异常信息为:{1}')
                     if image is None:
                         continue
                     images.append(image)
                 if content is not None:
                     totalContent += content
         if totalContent == '' \
         or imageCount != len(images):
             continue
         self.CacheArticle(article, totalContent, images, '成功自{0}提取文章')
     return self.articles
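This is the first multi-page variant: recPage harvests the per-page links from an <OPTION> drop-down, falls back to the article URL itself when no drop-down is present, and concatenates the page bodies into totalContent. A small self-test of the pattern on hypothetical markup:

import re

recPage = re.compile(r'<OPTION value=([^>\s]+?)(?:\s[^>]*?)*?>[^<]*?</OPTION>',
                     re.DOTALL)
html = ('<OPTION value=page1.html selected>1</OPTION>'
        '<OPTION value=page2.html>2</OPTION>')
print(recPage.findall(html))  # ['page1.html', 'page2.html']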
Example #5
 def CatchArticles(self):
     abstracts = None
     recArticle = re.compile(self.reArticle, re.DOTALL)
     recImage = re.compile(self.reImage, re.DOTALL)
     html = self.DownLoadHtml(self.url, '文章摘要接口{0}访问失败,异常信息为:{1}')
     if html is None:
         return self.articles
     try:
         # Parse the payload with json.loads instead of eval'ing untrusted
         # input; assumes the interface returns standard JSON (json maps null
         # to None natively) and 'import json' among the module imports.
         abstracts = json.loads(html)
     except Exception as e:
         logging.warning('文章摘要信息{0}格式异常,异常信息为:{1}'.format(html, str(e)))
         return self.articles
     for x in abstracts['contents']:
         try:
             article = dict(url=Spider.ComposeUrl(
                 self.url, '/{0}/{1}.jhtml'.format(x['channel_path'],
                                                   x['contentId'])),
                            title=x['title'])
             html = super().DownLoadHtml(article['url'],
                                         '文章页{0}访问失败,异常信息为:{1}')
             if html is None:
                 continue
             content = None
             images = []
             imageCount = 0
             for y in recArticle.findall(html):
                 article['time'] = datetime.datetime.strptime(
                     y[0], '%Y-%m-%d %H:%M:%S')
                 if not self.CheckNewArticle(article):
                     logging.debug('文章源{0}并非新文章。'.format(article['url']))
                     continue
                 content = y[1]
                 for z in recImage.findall(content):
                     imageCount += 1
                     imageUrl = Spider.ComposeUrl(article['url'], z)
                     image = self.DownLoadImage(imageUrl,
                                                '图片{0}提取失败,异常信息为:{1}')
                     if image is None:
                         continue
                     images.append(image)
             if not content \
             or imageCount != len(images):
                 continue
             self.CacheArticle(article, content, images, '成功自{0}提取文章')
         except Exception as e:
             logging.warning('文章明细信息{0}格式异常,异常信息为:{1}'.format(str(x), str(e)))
             continue
     return self.articles
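DownLoadImage mirrors the DownLoadHtml contract and evidently returns a dict, since example #1 assigns image['imageUrl'] on the result. A minimal sketch under those assumptions:

import logging
import urllib.request

def DownLoadImage(self, url, errorTemplate):
    # Fetch an image; return a dict describing it, or None on failure.
    try:
        with urllib.request.urlopen(url, timeout=30) as response:
            return {'imageUrl': url, 'data': response.read()}
    except Exception as e:
        logging.warning(errorTemplate.format(url, str(e)))
        return None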
Example #6
	def CatchArticles(self):
		recAbstract = re.compile(self.reAbstract, re.DOTALL)
		recArticle = re.compile(self.reArticle, re.DOTALL)
		recImage = re.compile(self.reImage, re.DOTALL)
		recDate = re.compile(r'Details([\d-]+?)\.html', re.DOTALL)
		html = self.DownLoadHtml(self.url, '文章列表页{0}访问失败,异常信息为:{1}')
		if html is None:
			return self.articles
		for x in recAbstract.findall(html):
			article = dict(
				url = Spider.ComposeUrl(self.url, x[0]),
				title = x[1].strip()
			)
			for w in recDate.findall(article['url']):
				try:
					article['time'] = datetime.datetime.strptime('20{0}'.format(w[0:8]),
															'%Y-%m-%d')
				except Exception as e:
					logging.warning('文章源{0}无法识别发布日期,异常为:{1}'.format(
						article['url'], str(e)))
					continue		
			# logging.debug(str(article))
			if 'time' not in article:
				# Ignore external links that don't match the expected format.
				continue
			if not self.CheckNewArticle(article):
				logging.debug('文章源{0}并非新文章。'.format(article['url']))
				continue
			html = self.DownLoadHtml(article['url'], '文章页{0}访问失败,异常信息为:{1}')
			if html is None:
				continue
			content = None
			images = []
			imageCount = 0
			for y in recArticle.findall(html):
				content = y
				for z in recImage.findall(content):
					imageCount += 1
					imageUrl = Spider.ComposeUrl(article['url'], z)
					image = self.DownLoadImage(imageUrl, '图片{0}提取失败,异常信息为:{1}')
					if image is None:
						continue
					images.append(image)
			if not content \
			or imageCount != len(images):
				continue
			self.CacheArticle(article, content, images, '成功自{0}提取文章')
		return self.articles
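Here the publication date is not on the page at all; it is recovered from the article URL itself via recDate, prefixing '20' to rebuild a four-digit year. A short check with a hypothetical link:

import datetime
import re

recDate = re.compile(r'Details([\d-]+?)\.html', re.DOTALL)
url = 'http://example.com/Details15-04-01-001.html'  # hypothetical URL shape
for w in recDate.findall(url):
    print(datetime.datetime.strptime('20{0}'.format(w[0:8]), '%Y-%m-%d'))
    # 2015-04-01 00:00:00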
Example #7
	def CatchArticles(self):
		recAbstract = re.compile(self.reAbstract, re.DOTALL)
		recArticle = re.compile(self.reArticle, re.DOTALL)
		recImage = re.compile(self.reImage, re.DOTALL)
		recDate = re.compile(r'http://www\.gotohz\.com/\w+?/\w+?/\d+?/t(\d+?)_\d+?\.shtml', re.DOTALL)
		html = self.DownLoadHtml(self.url, '文章列表页{0}访问失败,异常信息为:{1}')
		if html is None:
			return self.articles
		for x in recAbstract.findall(html):
			article = dict(
				title = x[0],
				url = Spider.ComposeUrl(self.url, x[1])
			)
			for w in recDate.findall(article['url']):
				article['time'] = datetime.datetime.strptime(w,'%Y%m%d')
			# logging.debug(str(article))
			if 'time' not in article:
				# Ignore external links that don't match the expected format.
				continue
			if not self.CheckNewArticle(article):
				logging.debug('文章源{0}并非新文章。'.format(article['url']))
				continue
			html = self.DownLoadHtml(article['url'], '文章页{0}访问失败,异常信息为:{1}')
			if html is None:
				continue
			content = None
			images = []
			imageCount = 0
			for y in recArticle.findall(html):
				content = y
				for z in recImage.findall(content):
					imageCount += 1
					imageUrl = Spider.ComposeUrl(article['url'], z)
					image = self.DownLoadImage(imageUrl, '图片{0}提取失败,异常信息为:{1}')
					if image is None:
						continue
					images.append(image)
			if not content \
			or imageCount != len(images):
				continue
			self.CacheArticle(article, content, images, '成功自{0}提取文章')
		return self.articles
Example #8
 def CatchArticles(self):
     recAbstract = re.compile(self.reAbstract, re.DOTALL)
     recArticle = re.compile(self.reArticle, re.DOTALL)
     recImage = re.compile(self.reImage, re.DOTALL)
     recKeyword = re.compile(self.reKeyword, re.DOTALL)
     html = self.DownLoadHtml(self.url, '文章列表页{0}访问失败,异常信息为:{1}')
     if html is None:
         return self.articles
     for x in recAbstract.findall(html):
         article = dict(url=xml.sax.saxutils.unescape(
             Spider.ComposeUrl(self.url, x[0])),
                        title=x[1].strip(),
                        time=datetime.datetime.strptime(x[2], '%Y-%m-%d'))
         # Keyword check: skip articles whose title doesn't match.
         if recKeyword.match(x[1]) is None:
             continue
         logging.debug('文章Url为{0}, 标题为{1}。'.format(
             article['url'], article['title']))
         if not self.CheckNewArticle(article):
             logging.debug('文章源{0}并非新文章。'.format(article['url']))
             continue
         html = self.DownLoadHtml(article['url'], '文章页{0}访问失败,异常信息为:{1}')
         if html is None:
             continue
         content = None
         images = []
         imageCount = 0
         for y in recArticle.findall(html):
             content = y
             for z in recImage.findall(content):
                 imageCount += 1
                 imageUrl = Spider.ComposeUrl(article['url'], z)
                 image = self.DownLoadImage(imageUrl, '图片{0}提取失败,异常信息为:{1}')
                 if image is None:
                     continue
                 images.append(image)
         if not content \
         or imageCount != len(images):
             continue
         self.CacheArticle(article, content, images, '成功自{0}提取文章')
     return self.articles
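New in this example: list-page hrefs may contain HTML entities, so the composed URL is passed through xml.sax.saxutils.unescape before use, and a keyword filter drops off-topic titles early. The unescape step in isolation, on a hypothetical entity-escaped link:

import xml.sax.saxutils

href = 'news.html?id=1&amp;page=2'
print(xml.sax.saxutils.unescape(href))  # news.html?id=1&page=2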
Example #9
 def CatchArticles(self):
     recAbstract = re.compile(self.reAbstract, re.DOTALL)
     recArticle = re.compile(self.reArticle, re.DOTALL)
     recImage = re.compile(self.reImage, re.DOTALL)
     validUrl = 'http://news.cncn.com/'
     html = self.DownLoadHtml(self.url, '文章列表页{0}访问失败,异常信息为:{1}', 'gbk')
     if html is None:
         return self.articles
     for x in recAbstract.findall(html):
         article = dict(time=datetime.datetime.strptime(
             '{0}-01-01'.format(datetime.datetime.today().year),
             '%Y-%m-%d'),
                        url=Spider.ComposeUrl(self.url, x[0]),
                        title=x[1])
         if validUrl not in article['url']:
             # Invalid URL: ignore links outside news.cncn.com.
             continue
         if not self.CheckNewArticle(article):
             logging.debug('文章源{0}并非新文章。'.format(article['url']))
             continue
         html = self.DownLoadHtml(article['url'], '文章页{0}访问失败,异常信息为:{1}',
                                  'gbk')
         if html is None:
             continue
         content = None
         images = []
         imageCount = 0
         for y in recArticle.findall(html):
             content = y
             for z in recImage.findall(content):
                 imageCount += 1
                 imageUrl = Spider.ComposeUrl(article['url'], z)
                 image = self.DownLoadImage(imageUrl, '图片{0}提取失败,异常信息为:{1}')
                 if image is None:
                     continue
                 images.append(image)
         if not content \
         or imageCount != len(images):
             continue
         self.CacheArticle(article, content, images, '成功自{0}提取文章')
     return self.articles
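Examples #9 through #11 pass an explicit 'gbk' to DownLoadHtml: many older Chinese sites are not UTF-8, and decoding with the wrong codec silently yields mojibake rather than an exception. Assuming the encoding parameter sketched earlier:

raw = '杭州'.encode('gbk')                      # what a gbk-encoded site sends
print(raw.decode('gbk'))                        # 杭州
print(raw.decode('utf-8', errors='replace'))    # garbage replacement characters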
Example #10
 def CatchArticles(self):
     recAbstract = re.compile(self.reAbstract, re.DOTALL)
     recArticle = re.compile(self.reArticle, re.DOTALL)
     recImage = re.compile(self.reImage, re.DOTALL)
     recDate = re.compile(self.reDate, re.DOTALL)
     html = self.DownLoadHtml(self.url, '文章列表页{0}访问失败,异常信息为:{1}', 'gbk')
     if html is None:
         return self.articles
     for x in recAbstract.findall(html):
         article = dict(url=Spider.ComposeUrl(self.url, x[0]),
                        title=x[1].strip())
         for w in recDate.findall(article['url']):
             try:
                 article['time'] = datetime.datetime.strptime(
                     w, '%Y%m%d%H%M%S')
             except Exception as e:
                 logging.warning('文章源{0}无法识别发布日期,异常为:{1}'.format(
                     article['url'], str(e)))
                 continue
         # logging.debug(str(article))
         if 'time' not in article:
             # Ignore external links that don't match the expected URL format.
             continue
         if not self.CheckNewArticle(article):
             logging.debug('文章源{0}并非新文章。'.format(article['url']))
             continue
         html = self.DownLoadHtml(article['url'], '文章页{0}访问失败,异常信息为:{1}',
                                  'gbk')
         if html is None:
             continue
         content = None
         images = []
         imageCount = 0
         for y in recArticle.findall(html):
             # Image filtering: strip every image tag from the body.
             content = recImage.sub('', y)
         if not content \
         or imageCount != len(images):
             continue
         self.CacheArticle(article, content, images, '成功自{0}提取文章')
     return self.articles
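This variant keeps the text but deliberately drops the pictures: recImage.sub deletes every image match from the body, images stays empty, and the imageCount check passes trivially. The same filtering step with a hypothetical pattern:

import re

recImage = re.compile(r'<img[^>]*?>', re.DOTALL)  # hypothetical image pattern
body = 'before<img src="a.png">after'
print(recImage.sub('', body))  # beforeafter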
Example #11
 def CatchArticles(self):
     recAbstract = re.compile(self.reAbstract, re.DOTALL)
     recArticle = re.compile(self.reArticle, re.DOTALL)
     recImage = re.compile(self.reImage, re.DOTALL)
     recKeyword = re.compile(self.reKeyword, re.DOTALL)
     recDate = re.compile(self.reDate, re.DOTALL)
     recPage = re.compile(self.rePage, re.DOTALL)
     html = self.DownLoadHtml(self.url, '文章列表页{0}访问失败,异常信息为:{1}', 'gbk')
     if html is None:
         return self.articles
     for x in recAbstract.findall(html):
         article = dict(url=Spider.ComposeUrl(self.url, x[0]), title=x[1])
         # Keyword check: skip articles whose title doesn't match.
         if recKeyword.match(x[1]) is None:
             continue
         for w in recDate.findall(article['url']):
             try:
                 article['time'] = datetime.datetime.strptime(w, '%Y/%m/%d')
             except Exception as e:
                 logging.warning('文章源{0}无法识别发布日期,异常为:{1}'.format(
                     article['url'], str(e)))
                 continue
         if 'time' not in article:
             # Date-in-link check: ignore links without a recognizable date.
             continue
         if not self.CheckNewArticle(article):
             # New-article check: skip articles that are not new.
             logging.debug('文章源{0}并非新文章。'.format(article['url']))
             continue
         html = self.DownLoadHtml(article['url'], '文章页{0}访问失败,异常信息为:{1}',
                                  'gbk')
         if html is None:
             continue
         totalContent = ''
         images = []
         imageCount = 0
         pageUrls = [article['url']] + recPage.findall(html)
         for p in pageUrls:
             pageUrl = Spider.ComposeUrl(article['url'], p)
             if pageUrl != article['url']:
                 html = self.DownLoadHtml(pageUrl, '文章页{0}访问失败,异常信息为:{1}',
                                          'gbk')
                 if html is None:
                     continue
             content = None
             for y in recArticle.findall(html):
                 content = y
                 for z in recImage.findall(content):
                     imageCount += 1
                     imageUrl = Spider.ComposeUrl(article['url'], z)
                     image = self.DownLoadImage(imageUrl,
                                                '图片{0}提取失败,异常信息为:{1}')
                     if image is None:
                         continue
                     images.append(image)
                 if content is not None:
                     totalContent += content
         if totalContent == '' \
         or imageCount != len(images):
             continue
         self.CacheArticle(article, totalContent, images, '成功自{0}提取文章')
     return self.articles
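Two helpers remain unshown across all eleven examples: CheckNewArticle, which decides whether a candidate is newer than what has already been collected, and CacheArticle, which stores the result on self.articles and logs the success template. A minimal sketch, purely illustrative of the contract the examples rely on (the real versions presumably consult persisted state):

import logging

def CheckNewArticle(self, article):
    # Hypothetical freshness test: newer than anything cached so far.
    return all(article['time'] > a['time'] for a in self.articles)

def CacheArticle(self, article, content, images, successTemplate):
    # Attach the extracted body and images, then record the article.
    article['content'] = content
    article['images'] = images
    self.articles.append(article)
    logging.info(successTemplate.format(article['url']))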