# Assumes module-level imports: re, datetime, logging, urllib.parse,
# plus json and xml.sax.saxutils for the later variants.

# Variant 1: the list page supplies URL, title and date; relative image
# paths are percent-encoded before being resolved.
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    html = self.DownLoadHtml(self.url,
                             'Failed to fetch article list page {0}; exception: {1}')
    if html is None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=Spider.ComposeUrl(self.url, x[0]),
                       title=x[1],
                       time=datetime.datetime.strptime(x[2], '%Y-%m-%d'))
        html = self.DownLoadHtml(article['url'],
                                 'Failed to fetch article page {0}; exception: {1}')
        if html is None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                # Percent-encode image paths that start with '/' or '.'.
                imageUrl = Spider.ComposeUrl(
                    article['url'],
                    urllib.parse.quote(z) if z[0] in ['/', '.'] else z)
                image = self.DownLoadImage(imageUrl,
                                           'Failed to extract image {0}; exception: {1}')
                if image is None:
                    continue
                # Keep the unencoded URL for later substitution in the content.
                image['imageUrl'] = Spider.ComposeUrl(article['url'], z)
                images.append(image)
        # Drop the article if no content matched or any image download failed.
        if not content or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images,
                          'Successfully extracted article from {0}')
    return self.articles
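Every variant resolves relative links through Spider.ComposeUrl, which is defined elsewhere in the base class and not shown in this section. A minimal sketch of what it likely does, assuming it is simply a urljoin wrapper (the name ComposeUrl and the Spider class come from the code above; the body is an assumption):

import urllib.parse

class Spider:
    @staticmethod
    def ComposeUrl(baseUrl, relativeUrl):
        # Assumed behavior: resolve a relative link against the page it
        # appeared on, leaving absolute URLs untouched.
        return urllib.parse.urljoin(baseUrl, relativeUrl)

# e.g. Spider.ComposeUrl('http://example.com/news/list.html', '../img/a.jpg')
# -> 'http://example.com/img/a.jpg'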
# Variant 2: the publication date is only available on the article page,
# so the new-article check happens after that page has been downloaded.
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    html = self.DownLoadHtml(self.url,
                             'Failed to fetch article list page {0}; exception: {1}')
    if html is None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=Spider.ComposeUrl(self.url, x[0]), title=x[1])
        html = self.DownLoadHtml(article['url'],
                                 'Failed to fetch article page {0}; exception: {1}')
        if html is None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            article['time'] = datetime.datetime.strptime(y[0], '%Y-%m-%d')
            if not self.CheckNewArticle(article):
                logging.debug('Article source {0} is not a new article.'.format(article['url']))
                continue
            content = y[1]
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl,
                                           'Failed to extract image {0}; exception: {1}')
                if image is None:
                    continue
                images.append(image)
        if not content or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images,
                          'Successfully extracted article from {0}')
    return self.articles
# Variant 3: the date comes first in the list-page match.
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    html = self.DownLoadHtml(self.url,
                             'Failed to fetch article list page {0}; exception: {1}')
    if html is None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(time=datetime.datetime.strptime(x[0], '%Y-%m-%d'),
                       url=Spider.ComposeUrl(self.url, x[1]),
                       title=x[2])
        html = self.DownLoadHtml(article['url'],
                                 'Failed to fetch article page {0}; exception: {1}')
        if html is None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl,
                                           'Failed to extract image {0}; exception: {1}')
                if image is None:
                    continue
                images.append(image)
        if not content or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images,
                          'Successfully extracted article from {0}')
    return self.articles
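DownLoadImage is likewise inherited from the base class. From its call sites one can infer that it takes a URL plus a log-format string, returns None on failure, and yields a dict (variant 1 assigns image['imageUrl'] on the result). A hedged sketch under those assumptions; the 'data' key is hypothetical:

import logging
import urllib.request

def DownLoadImage(self, imageUrl, logFormat):
    # Sketch only: fetch the image bytes and package them as a dict,
    # returning None on failure as the callers expect.
    try:
        with urllib.request.urlopen(imageUrl) as response:
            return dict(imageUrl=imageUrl, data=response.read())
    except Exception as e:
        logging.warning(logFormat.format(imageUrl, str(e)))
        return None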
# Variant 4: articles may span multiple pages, listed in an <OPTION>
# dropdown; the content of every page is concatenated.
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recPage = re.compile(
        r'<OPTION value=([^>\s]+?)(?:\s[^>]*?)*?>[^<]*?</OPTION>', re.DOTALL)
    html = self.DownLoadHtml(self.url,
                             'Failed to fetch article list page {0}; exception: {1}')
    if html is None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=Spider.ComposeUrl(self.url, x[0]),
                       title=x[1],
                       time=datetime.datetime.strptime(x[2], '%Y-%m-%d'))
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'],
                                 'Failed to fetch article page {0}; exception: {1}')
        if html is None:
            continue
        totalContent = ''
        images = []
        imageCount = 0
        pageUrls = recPage.findall(html)
        if len(pageUrls) == 0:
            # Single-page article: fall back to the article URL itself.
            pageUrls += [article['url']]
        for p in pageUrls:
            pageUrl = Spider.ComposeUrl(article['url'], p)
            if pageUrl != article['url']:
                html = self.DownLoadHtml(pageUrl,
                                         'Failed to fetch article page {0}; exception: {1}')
                if html is None:
                    continue
            content = None
            for y in recArticle.findall(html):
                content = y
                for z in recImage.findall(content):
                    imageCount += 1
                    imageUrl = Spider.ComposeUrl(article['url'], z)
                    image = self.DownLoadImage(imageUrl,
                                               'Failed to extract image {0}; exception: {1}')
                    if image is None:
                        continue
                    images.append(image)
            if content is not None:
                totalContent += content
        if totalContent == '' or imageCount != len(images):
            continue
        self.CacheArticle(article, totalContent, images,
                          'Successfully extracted article from {0}')
    return self.articles
# Variant 5: abstracts come from a JSON API rather than an HTML list page.
def CatchArticles(self):
    abstracts = None
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    html = self.DownLoadHtml(self.url,
                             'Failed to fetch abstract API {0}; exception: {1}')
    if html is None:
        return self.articles
    try:
        # Parse the payload as JSON instead of eval()-ing untrusted text;
        # json.loads maps JSON null to None natively.
        abstracts = json.loads(html)
    except Exception as e:
        logging.warning('Malformed abstract data {0}; exception: {1}'.format(html, str(e)))
        return self.articles
    for x in abstracts['contents']:
        try:
            article = dict(url=Spider.ComposeUrl(
                               self.url,
                               '/{0}/{1}.jhtml'.format(x['channel_path'], x['contentId'])),
                           title=x['title'])
            html = super().DownLoadHtml(article['url'],
                                        'Failed to fetch article page {0}; exception: {1}')
            if html is None:
                continue
            content = None
            images = []
            imageCount = 0
            for y in recArticle.findall(html):
                article['time'] = datetime.datetime.strptime(y[0], '%Y-%m-%d %H:%M:%S')
                if not self.CheckNewArticle(article):
                    logging.debug('Article source {0} is not a new article.'.format(article['url']))
                    continue
                content = y[1]
                for z in recImage.findall(content):
                    imageCount += 1
                    imageUrl = Spider.ComposeUrl(article['url'], z)
                    image = self.DownLoadImage(imageUrl,
                                               'Failed to extract image {0}; exception: {1}')
                    if image is None:
                        continue
                    images.append(image)
            if not content or imageCount != len(images):
                continue
            self.CacheArticle(article, content, images,
                              'Successfully extracted article from {0}')
        except Exception as e:
            logging.warning('Malformed article detail {0}; exception: {1}'.format(str(x), str(e)))
            continue
    return self.articles
# Variant 6: the publication date (yy-mm-dd, two-digit year) is embedded
# in the article URL.
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recDate = re.compile(r'Details([\d-]+?)\.html', re.DOTALL)
    html = self.DownLoadHtml(self.url,
                             'Failed to fetch article list page {0}; exception: {1}')
    if html is None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=Spider.ComposeUrl(self.url, x[0]),
                       title=x[1].strip())
        for w in recDate.findall(article['url']):
            try:
                # Prepend the century to the two-digit year in the URL.
                article['time'] = datetime.datetime.strptime(
                    '20{0}'.format(w[0:8]), '%Y-%m-%d')
            except Exception as e:
                logging.warning('Cannot parse publication date from article source {0}; exception: {1}'.format(article['url'], str(e)))
                continue
        if 'time' not in article:
            # Ignore external links that do not match the date pattern.
            continue
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'],
                                 'Failed to fetch article page {0}; exception: {1}')
        if html is None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl,
                                           'Failed to extract image {0}; exception: {1}')
                if image is None:
                    continue
                images.append(image)
        if not content or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images,
                          'Successfully extracted article from {0}')
    return self.articles
# Variant 7: gotohz.com; the date (yyyymmdd) is embedded in the article URL.
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recDate = re.compile(
        r'http://www\.gotohz\.com/\w+?/\w+?/\d+?/t(\d+?)_\d+?\.shtml', re.DOTALL)
    html = self.DownLoadHtml(self.url,
                             'Failed to fetch article list page {0}; exception: {1}')
    if html is None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(title=x[0],
                       url=Spider.ComposeUrl(self.url, x[1]))
        for w in recDate.findall(article['url']):
            article['time'] = datetime.datetime.strptime(w, '%Y%m%d')
        if 'time' not in article:
            # Ignore external links that do not match the date pattern.
            continue
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'],
                                 'Failed to fetch article page {0}; exception: {1}')
        if html is None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl,
                                           'Failed to extract image {0}; exception: {1}')
                if image is None:
                    continue
                images.append(image)
        if not content or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images,
                          'Successfully extracted article from {0}')
    return self.articles
# Variant 8: list-page URLs are HTML-escaped and titles are filtered
# by a keyword pattern.
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recKeyword = re.compile(self.reKeyword, re.DOTALL)
    html = self.DownLoadHtml(self.url,
                             'Failed to fetch article list page {0}; exception: {1}')
    if html is None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=xml.sax.saxutils.unescape(
                           Spider.ComposeUrl(self.url, x[0])),
                       title=x[1].strip(),
                       time=datetime.datetime.strptime(x[2], '%Y-%m-%d'))
        # Keyword check: skip articles whose title does not match.
        if recKeyword.match(x[1]) is None:
            continue
        logging.debug('Article URL: {0}, title: {1}.'.format(
            article['url'], article['title']))
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'],
                                 'Failed to fetch article page {0}; exception: {1}')
        if html is None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl,
                                           'Failed to extract image {0}; exception: {1}')
                if image is None:
                    continue
                images.append(image)
        if not content or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images,
                          'Successfully extracted article from {0}')
    return self.articles
# Variant 9: news.cncn.com; pages are GBK-encoded and the list page
# carries no date, so articles default to January 1 of the current year.
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    validUrl = 'http://news.cncn.com/'
    html = self.DownLoadHtml(self.url,
                             'Failed to fetch article list page {0}; exception: {1}', 'gbk')
    if html is None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(time=datetime.datetime.strptime(
                           '{0}-01-01'.format(datetime.datetime.today().year),
                           '%Y-%m-%d'),
                       url=Spider.ComposeUrl(self.url, x[0]),
                       title=x[1])
        if validUrl not in article['url']:
            # Ignore off-site links.
            continue
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'],
                                 'Failed to fetch article page {0}; exception: {1}', 'gbk')
        if html is None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            content = y
            for z in recImage.findall(content):
                imageCount += 1
                imageUrl = Spider.ComposeUrl(article['url'], z)
                image = self.DownLoadImage(imageUrl,
                                           'Failed to extract image {0}; exception: {1}')
                if image is None:
                    continue
                images.append(image)
        if not content or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images,
                          'Successfully extracted article from {0}')
    return self.articles
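Several variants pass 'gbk' as a third argument to DownLoadHtml, so the helper evidently accepts an optional page encoding. A minimal sketch consistent with the call sites; the signature is inferred and the body is an assumption:

import logging
import urllib.request

def DownLoadHtml(self, url, logFormat, encoding='utf-8'):
    # Sketch only: fetch a page, decode it with the given charset, and
    # return None on any failure, matching the callers' 'is None' checks.
    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode(encoding, errors='replace')
    except Exception as e:
        logging.warning(logFormat.format(url, str(e)))
        return None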
# Variant 10: GBK pages; the date (yyyymmddHHMMSS) is embedded in the
# article URL and all images are stripped from the content.
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recDate = re.compile(self.reDate, re.DOTALL)
    html = self.DownLoadHtml(self.url,
                             'Failed to fetch article list page {0}; exception: {1}', 'gbk')
    if html is None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=Spider.ComposeUrl(self.url, x[0]),
                       title=x[1].strip())
        for w in recDate.findall(article['url']):
            try:
                article['time'] = datetime.datetime.strptime(w, '%Y%m%d%H%M%S')
            except Exception as e:
                logging.warning('Cannot parse publication date from article source {0}; exception: {1}'.format(article['url'], str(e)))
                continue
        if 'time' not in article:
            # Ignore external links that do not match the date pattern.
            continue
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'],
                                 'Failed to fetch article page {0}; exception: {1}', 'gbk')
        if html is None:
            continue
        content = None
        images = []
        imageCount = 0
        for y in recArticle.findall(html):
            # Image filtering: this source keeps text only, so strip all
            # image markup; images stays empty and the count check still holds.
            content = recImage.sub('', y)
        if not content or imageCount != len(images):
            continue
        self.CacheArticle(article, content, images,
                          'Successfully extracted article from {0}')
    return self.articles
# Variant 11: GBK pages with keyword filtering, a URL-embedded date
# (yyyy/mm/dd) and multi-page articles.
def CatchArticles(self):
    recAbstract = re.compile(self.reAbstract, re.DOTALL)
    recArticle = re.compile(self.reArticle, re.DOTALL)
    recImage = re.compile(self.reImage, re.DOTALL)
    recKeyword = re.compile(self.reKeyword, re.DOTALL)
    recDate = re.compile(self.reDate, re.DOTALL)
    recPage = re.compile(self.rePage, re.DOTALL)
    html = self.DownLoadHtml(self.url,
                             'Failed to fetch article list page {0}; exception: {1}', 'gbk')
    if html is None:
        return self.articles
    for x in recAbstract.findall(html):
        article = dict(url=Spider.ComposeUrl(self.url, x[0]), title=x[1])
        # Keyword check.
        if recKeyword.match(x[1]) is None:
            continue
        for w in recDate.findall(article['url']):
            try:
                article['time'] = datetime.datetime.strptime(w, '%Y/%m/%d')
            except Exception as e:
                logging.warning('Cannot parse publication date from article source {0}; exception: {1}'.format(article['url'], str(e)))
                continue
        # URL date check.
        if 'time' not in article:
            continue
        # New-article check.
        if not self.CheckNewArticle(article):
            logging.debug('Article source {0} is not a new article.'.format(article['url']))
            continue
        html = self.DownLoadHtml(article['url'],
                                 'Failed to fetch article page {0}; exception: {1}', 'gbk')
        if html is None:
            continue
        totalContent = ''
        images = []
        imageCount = 0
        pageUrls = [article['url']] + recPage.findall(html)
        for p in pageUrls:
            pageUrl = Spider.ComposeUrl(article['url'], p)
            if pageUrl != article['url']:
                html = self.DownLoadHtml(pageUrl,
                                         'Failed to fetch article page {0}; exception: {1}', 'gbk')
                if html is None:
                    continue
            content = None
            for y in recArticle.findall(html):
                content = y
                for z in recImage.findall(content):
                    imageCount += 1
                    imageUrl = Spider.ComposeUrl(article['url'], z)
                    image = self.DownLoadImage(imageUrl,
                                               'Failed to extract image {0}; exception: {1}')
                    if image is None:
                        continue
                    images.append(image)
            if content is not None:
                totalContent += content
        if totalContent == '' or imageCount != len(images):
            continue
        self.CacheArticle(article, totalContent, images,
                          'Successfully extracted article from {0}')
    return self.articles
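Finally, CheckNewArticle gates every variant on the article's 'time' field. Its exact logic is not shown in this section; a plausible sketch is a simple timestamp comparison against the newest article already cached (self.lastUpdateTime is a hypothetical attribute):

def CheckNewArticle(self, article):
    # Sketch only: treat an article as new if it is strictly more recent
    # than the last one this spider cached.
    return self.lastUpdateTime is None or article['time'] > self.lastUpdateTime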