Example #1
 def parse(self, response):
     try:
         article_list = response.xpath('//ul[@class="article-list"]/li/a').extract()
         #self.logger.debug(article_list)
         for i in range(0, len(article_list)):
             item = NoticespiderItem()
             url = re.findall(r'<a href="(.*?)" class="', article_list[i], re.S)[0]
             item['url'] = self.host + url
             self.logger.info(item['url'])
             item['urlmd5'] = md5(item['url']).hexdigest()
             item['title'] = re.findall(r'class="article-list-link">(.*?)</a>', article_list[i], re.S)[0]
             self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',item['url'], item['urlmd5'], item['title'])
             yield item
     except Exception as e:
         self.logger.critical(e)
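All of these parse methods fill in the same three fields, so the item class and the module-level imports they rely on look roughly like the sketch below. The field names come straight from the examples; the class body and the surrounding comments are assumptions for illustration, since the project's items.py is not shown.

# Module-level imports every parse method above relies on (re.findall and
# md5 are called directly in each loop):
import re
from hashlib import md5

# Minimal sketch of the item the spiders populate; the field list is taken
# from the examples, the rest is assumed.
import scrapy

class NoticespiderItem(scrapy.Item):
    url = scrapy.Field()     # absolute article URL (self.host + relative href)
    urlmd5 = scrapy.Field()  # MD5 hex digest of the URL string
    title = scrapy.Field()   # article title pulled out of the link markup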
Example #2
 def parse_item(self, response):
     try:
         self.logger.debug(response.headers)
         article_list = response.xpath('//ul[@id="list"]/li/a').extract()
         self.logger.debug(article_list)
         for i in range(0, len(article_list)):
             item = NoticespiderItem()
             url = re.findall(r'<a href="(.*?)"', article_list[i], re.S)[0]
             item['url'] = self.host + url
             self.logger.info(item['url'])
             item['urlmd5'] = md5(item['url']).hexdigest()
             # Do not remove any whitespace from this regex; the literal indentation must match the page markup exactly
             item['title'] = re.findall(r'</span>\n                        (.*?)                    </a>', article_list[i], re.S)[0]
             self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',item['url'], item['urlmd5'], item['title'])
             yield item
     except Exception as e:
         self.logger.critical(e)
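The title regex in this example only works because it reproduces the page's indentation character for character, which is why the comment warns against touching the whitespace. A possible alternative, assuming Scrapy 1.0+ for extract_first(), is to keep the link selectors and read the href attribute and text nodes with XPath instead of running regexes over the serialized HTML. This is a sketch only; the exact text-node expression may need adjusting to this page's markup (note the <span> inside each link).

# Whitespace-independent variant of the loop body in Example #2 (sketch).
for a in response.xpath('//ul[@id="list"]/li/a'):
    item = NoticespiderItem()
    item['url'] = self.host + a.xpath('@href').extract_first('')
    item['urlmd5'] = md5(item['url']).hexdigest()
    # normalize-space() strips the indentation the regex above has to match
    # literally; text()[last()] picks the text node after the closing </span>
    item['title'] = a.xpath('normalize-space(text()[last()])').extract_first('')
    yield item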
Example #3
 def parse_item(self, response):
     try:
         self.logger.debug(response.headers)
         article_list = response.xpath(
             '//div[@id="articles-list"]/article/div/a').extract()
         self.logger.debug(article_list)
         for i in range(0, len(article_list)):
             item = NoticespiderItem()
             url = re.findall(r'<a href="(.*?)"', article_list[i], re.S)[0]
             item['url'] = self.host + url
             self.logger.info(item['url'])
             item['urlmd5'] = md5(item['url']).hexdigest()
             item['title'] = re.findall(r'<h2>(.*?)</h2>', article_list[i],
                                        re.S)[0]
             self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                              item['url'], item['urlmd5'], item['title'])
             yield item
     except Exception as e:
         self.logger.critical(e)
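Note that md5(item['url']) only works while the URL is a byte string, as it is under Python 2; on Python 3 hashlib raises a TypeError for str input. If these spiders ever move to Python 3, a small helper along the following lines (the name url_md5 is hypothetical) keeps the call sites unchanged:

from hashlib import md5

def url_md5(url):
    # hashlib only accepts bytes on Python 3, so encode str input first;
    # byte strings (Python 2 str) pass through untouched
    if not isinstance(url, bytes):
        url = url.encode('utf-8')
    return md5(url).hexdigest()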
Example #4
 def parse(self, response):
     try:
         article_list = response.xpath(
             '//div[@class="leftlatnews" and @id="lcontentnews"]/div[@class="latnewslist"]/div/a'
         ).extract()
         self.logger.debug(article_list)
         for i in range(0, len(article_list)):
             item = NoticespiderItem()
             url = re.findall(r'<a href="(.*?)" title="', article_list[i],
                              re.S)[0]
             item['url'] = self.host + url
             self.logger.info(item['url'])
             item['urlmd5'] = md5(item['url']).hexdigest()
             item['title'] = re.findall(r'<h3>(.*?)</h3>\r\n',
                                        article_list[i], re.S)[0]
             self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                              item['url'], item['urlmd5'], item['title'])
             yield item
     except Exception as e:
         self.logger.critical(e)
Example #5
 def parse(self, response):
     try:
         article_list = response.xpath(
             '//div[@class="news-list bg-color-white mb20"]/div/div/div/div/div[2]/div[1]/a'
         ).extract()
         self.logger.debug(article_list)
         for i in range(0, len(article_list)):
             item = NoticespiderItem()
             url = re.findall(r'<a href="(.*?)" class="', article_list[i],
                              re.S)[0]
             item['url'] = self.host + url
             self.logger.info(item['url'])
             item['urlmd5'] = md5(item['url']).hexdigest()
             item['title'] = re.findall(r'class="link-1">(.*?)</a>',
                                        article_list[i], re.S)[0]
             self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                              item['url'], item['urlmd5'], item['title'])
             yield item
     except Exception as e:
         self.logger.critical(e)
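Every example builds the absolute URL as self.host + url, which assumes the extracted hrefs are always relative to the same site root. If the target Scrapy version is 1.0 or newer, response.urljoin() is a possible alternative that resolves the relative href against the page's own URL instead; purely illustrative, the concatenation above also works as long as self.host is correct.

# Hedged alternative to the self.host + url concatenation used above.
item['url'] = response.urljoin(url)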
Example #6
 def parse(self, response):
     try:
         article_list = response.xpath(
             '//table[@class="table table-hover table-striped"]/tbody/tr/td/a'
         ).extract()
         self.logger.debug(article_list)
         for i in range(0, len(article_list)):
             item = NoticespiderItem()
             url = re.findall(r'<a class="pull-left" href="(.*?)"',
                              article_list[i], re.S)[0]
             item['url'] = self.host + url
             self.logger.info(item['url'])
             item['urlmd5'] = md5(item['url']).hexdigest()
             item['title'] = re.findall(
                 r'">\r\n\t\t\t\t\t\t\t\t(.*?)\t\t\t\t\t\t\t\t</a>',
                 article_list[i], re.S)[0]
             self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                              item['url'], item['urlmd5'], item['title'])
             yield item
     except Exception as e:
         self.logger.critical(e)
Example #7
 def parse(self, response):
     try:
         article_list = response.xpath(
             '//div[@class="container"]/div[@class="news-preview-wrap col-sm-6 col-md-4"]/a'
         ).extract()
         self.logger.debug(article_list)
         for i in range(0, len(article_list)):
             item = NoticespiderItem()
             url = re.findall(r'<a class="news-preview-link" href="(.*?)"',
                              article_list[i], re.S)[0]
             item['url'] = self.host + url
             self.logger.info(item['url'])
             item['urlmd5'] = md5(item['url']).hexdigest()
             item['title'] = re.findall(
                 r'<h2 class="post-title news-preview-content-title">(.*?)</h2>',
                 article_list[i], re.S)[0]
             self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                              item['url'], item['urlmd5'], item['title'])
             yield item
     except Exception as e:
         self.logger.critical(e)
Example #8
 def parse(self, response):
     try:
         # Debug aid: dump the decoded page body while the XPath is tuned
         body = response.body.decode(response.encoding)
         self.logger.info(body)
         article_list = response.xpath('//ul[@class="page_notice_list_content"]/li/a').extract()
         #article_list = response.xpath('//*[@id="notice"]/ul')
         self.logger.debug(article_list)
         for i in range(0, len(article_list)):
             item = NoticespiderItem()
             url = re.findall(r'<a href="(.*?)">', article_list[i], re.S)[0]
             item['url'] = self.host + url
             item['urlmd5'] = md5(item['url']).hexdigest()
             item['title'] = re.findall(
                 r'<h2 class="page_notice_title">(.*?)</h2>',
                 article_list[i], re.S)[0]
             self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                              item['url'], item['urlmd5'], item['title'])
             yield item
     except Exception as e:
         self.logger.critical(e)
Example #9
 def parse(self, response):
     try:
         article_list = response.xpath(
             '//ul[@class="cbp_tmtimeline"]/li/div[@class="cbp_tmlabel"]/article/header/h3/a'
         ).extract()
         self.logger.debug(article_list)
         for i in range(0, len(article_list)):
             item = NoticespiderItem()
             url = re.findall(r'<a href="(.*?)" target=', article_list[i],
                              re.S)[0]
             # strip the literal 'amp;' so that '&amp;' in the href becomes '&'
             rep_url = url.replace('amp;', '')
             item['url'] = self.host + rep_url
             self.logger.info(item['url'])
             item['urlmd5'] = md5(item['url']).hexdigest()
             item['title'] = re.findall(
                 r'_blank">(.*?)\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t</a>',
                 article_list[i], re.S)[0]
             self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                              item['url'], item['urlmd5'], item['title'])
             yield item
     except Exception as e:
         self.logger.critical(e)
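The replace('amp;', '') call undoes the HTML escaping of '&' in the href, but only for that one entity and only by deleting the literal substring 'amp;'. A more general, standard-library option that works on both Python 2 and 3 is xml.sax.saxutils.unescape, sketched below; the helper name clean_href is made up for illustration.

from xml.sax.saxutils import unescape

def clean_href(href):
    # unescape() turns '&amp;' back into '&' (and '&lt;'/'&gt;' into '<'/'>')
    # without touching any other part of the URL
    return unescape(href)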