Example #1
    def parse_item(self, response):
        item = NewsArticleItem()
        title = response.xpath('//title/text()').extract_first()
        article = (response.xpath('//p/text()').extract() +
                   response.xpath('//br/text()').extract() +
                   response.xpath('//div[@class="article"]/text()').extract() +
                   # only divs whose inline style is exactly this font declaration
                   response.xpath(
                       '//div[@style="font-family: arial; font-size: 13px"]/text()'
                   ).extract())

        self.logger.info('Scraping Title: ' + title)
        item['title'] = title
        item['article'] = article
        item['link'] = response.url.replace('http://',
                                            '').replace('https://', '')

        raw_date = (
            response.xpath(
                '//span[@class="news_article_date"]/text()').extract_first()
            or response.xpath('//td[@class="pubName"]/text()').extract_first())
        clean_date = self._parse_wg_date(self._clean_date(raw_date))
        if clean_date.year < 1900:
            raise DropItem('Incorrect Format for Date in %s' % item)
        else:
            item['date'] = str(clean_date)
        return item
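
Example #1 assumes a NewsArticleItem with title, article, link, and date fields, plus two helpers, _clean_date and _parse_wg_date, defined elsewhere on the spider. A minimal sketch of what those pieces might look like (the field names are inferred from the assignments above; the helper behaviour is an assumption, not the source's implementation):

    import re
    from datetime import datetime

    import scrapy


    class NewsArticleItem(scrapy.Item):
        # fields inferred from the assignments in parse_item above
        title = scrapy.Field()
        article = scrapy.Field()
        link = scrapy.Field()
        date = scrapy.Field()


    class DateHelpersMixin(object):
        # hypothetical versions of the helpers called in Example #1
        def _clean_date(self, raw_date):
            # collapse whitespace and guard against a missing date string
            return re.sub(r'\s+', ' ', raw_date or '').strip()

        def _parse_wg_date(self, cleaned):
            # try a few plausible formats; datetime.min (year 1) signals failure,
            # which trips the `clean_date.year < 1900` check in parse_item
            for fmt in ('%d %b %Y', '%d/%m/%Y', '%Y-%m-%d'):
                try:
                    return datetime.strptime(cleaned, fmt)
                except ValueError:
                    continue
            return datetime.min

Since parse_item raises DropItem on a bad date, the spider would also need `from scrapy.exceptions import DropItem` in scope.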
Example #2
    def parse_item(self, response):
        item = NewsArticleItem()
        title = response.xpath('//a/text()')[8].extract()
        article = response.xpath('//div/text()').extract()
        self.logger.info('Scraping Title: ' + title)
        item['title'] = title
        item['article'] = article
        item['link'] = (response.url.replace('http://', '').replace(
            'https://', '').replace("blogspot.co.id", "blogspot.com"))

        try:
            # use a list so the result is subscriptable on both Python 2 and 3
            token = [line for line in article if '--' in line]
            try:
                raw_date = token[0].split(' -- ')[0].replace('\n', '')
                date = datetime.strptime(raw_date, '%d/%m/%y')
            except (ValueError, IndexError):
                try:
                    raw_date = title.split("Nogger's Blog: ")[1]
                    date = datetime.strptime(raw_date, '%d-%b-%Y')
                except (ValueError, IndexError):
                    date = ''
            item['date'] = str(date)
            return item
        except Exception:
            # any other failure: return nothing, so the page is skipped
            pass
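
The date handling in Example #2 relies on two layouts: article lines shaped like 'DD/MM/YY -- headline' and titles shaped like "Nogger's Blog: DD-Mon-YYYY". A standalone sketch of the same fallback chain, using hypothetical sample strings:

    from datetime import datetime

    # hypothetical inputs mimicking the two layouts handled above
    article = ['\n12/03/18 -- market comment', 'an unrelated paragraph']
    title = "Nogger's Blog: 12-Mar-2018"

    token = [line for line in article if '--' in line]
    try:
        raw_date = token[0].split(' -- ')[0].replace('\n', '')
        date = datetime.strptime(raw_date, '%d/%m/%y')   # body line wins if present
    except (ValueError, IndexError):
        raw_date = title.split("Nogger's Blog: ")[1]
        date = datetime.strptime(raw_date, '%d-%b-%Y')   # otherwise fall back to the title
    print(date)  # 2018-03-12 00:00:00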
Example #3
 def parse_item(self, response):
     item = NewsArticleItem()
     title = response.xpath('//title/text()')[0].extract()
     # the date is spread across two <font> nodes; stitch the relevant
     # comma-separated pieces together before parsing
     raw_date = response.xpath('//font/text()')[2].extract().split(',')[1] + \
                response.xpath('//font/text()')[3].extract().split(',')[0]
     article = response.xpath('//body//text()').extract()
     self.logger.info("Scraping Title: " + title)
     item['title'] = title.replace('Agrimoney.com | ', '')
     item['article'] = article
     item['link'] = response.url
     try:
         date = datetime.strptime(
             raw_date.split(',')[1].replace('Sept', 'Sep'), ' %d %b %Y')
         item['date'] = str(date)
         return item
     except (ValueError, IndexError):
         # fall back to the full text of both <font> nodes
         raw_date = response.xpath('//font/text()')[2].extract() + \
             response.xpath('//font/text()')[3].extract()
         try:
             date = datetime.strptime(raw_date.split(',')[2], ' %d %b %Y')
             item['date'] = str(date)
             return item
         except (ValueError, IndexError):
             try:
                 date = datetime.strptime(
                     raw_date.split(',')[1], ' %d %b %Y')
                 item['date'] = str(date)
                 return item
             except Exception as e:
                 self.logf.write("Failed to scrape {0}: {1}\n".format(
                     str(response.url), str(e)))
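
This example (and Examples #4, #5, and #9 below) writes failures to self.logf, which is never opened in these snippets. One plausible setup (an assumption; the file name is illustrative) is a plain file handle created when the spider starts and closed when it finishes:

    import scrapy


    class NewsSpider(scrapy.Spider):
        # hypothetical skeleton; only the self.logf handling is the point here
        name = 'news'

        def __init__(self, *args, **kwargs):
            super(NewsSpider, self).__init__(*args, **kwargs)
            # parse_item appends failure lines here via self.logf.write(...)
            self.logf = open('scrape_failures.log', 'a')

        def closed(self, reason):
            # Scrapy calls closed() when the spider finishes
            self.logf.close()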
Example #4
 def parse_item(self, response):
     item = NewsArticleItem()
     title = response.xpath('//title/text()')[0].extract()
     article = response.xpath('//p/text()').extract()
     self.logger.info("Scraping Title: " + title)
     try:
         item['title'] = title
         item['article'] = article
         item['link'] = response.url
         raw_date = response.url.split("/")[-2]
         date = datetime.strptime(raw_date, '%Y-%m-%d')
         item['date'] = str(date)
         return item
     except Exception as e:
         self.logf.write("Failed to scrape {0}: {1}\n".format(
             str(response.url), str(e)))
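
Example #4 (and Example #6 below) recovers the publication date from the URL path instead of the page body: the second-to-last path segment is expected to be an ISO-style date. A quick standalone check with a hypothetical URL:

    from datetime import datetime

    # hypothetical URL following the layout these spiders assume
    url = 'https://example.com/news/2018-05-04/some-headline'
    raw_date = url.split('/')[-2]                    # '2018-05-04'
    date = datetime.strptime(raw_date, '%Y-%m-%d')
    print(date)                                      # 2018-05-04 00:00:00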
Example #5
 def parse_item(self, response):
     item = NewsArticleItem()
     title = response.xpath('//title/text()')[0].extract()
     article = response.xpath('//p/text()').extract()
     self.logger.info("Scraping Title: " + title)
     item['title'] = title
     item['article'] = article
     item['link'] = response.url
     try:
         raw_date = response.xpath(
             '//*[contains(@class,"ea-dateformat")]/text()'
         ).extract()[1].strip()
         date = datetime.strptime(raw_date, '%d-%m-%Y')
         item['date'] = str(date)
         return item
     except Exception as e:
         self.logf.write("Failed to scrape {0}: {1}\n".format(
             str(response.url), str(e)))
Example #6
    def parse_item(self, response):
        item = NewsArticleItem()
        title = response.xpath('//title/text()').extract_first()
        if title == 'Bloomberg':
            # <meta> elements have no text node; the title lives in @content
            title = response.xpath(
                '//meta[@property="og:title"]/@content').extract_first()

        article = response.xpath('//p/text()').extract()
        self.logger.info('Scraping Title: ' + title)
        try:
            item['title'] = title
            item['article'] = article
            item['link'] = response.url.replace('http://',
                                                '').replace('https://', '')
            raw_date = response.url.split('/')[-2]
            date = datetime.strptime(raw_date, '%Y-%m-%d')
            item['date'] = str(date)
            return item
        except Exception:
            pass
Example #7
 def parse_item(self, response):
     # build a UTF-8 re-encoded copy of the response; note that the XPath
     # queries below still run against the original response object
     cleaned_response = response.replace(
         body=response.body_as_unicode().encode('utf-8', 'ignore'),
         encoding='utf-8')
     item = NewsArticleItem()
     try:
         title = response.xpath('//title/text()')[0].extract().encode(
             'utf-8', 'ignore')
         raw_date = response.xpath('//font/text()')[2].extract().split(',')[1] + \
             response.xpath('//font/text()')[3].extract().split(',')[0]
         self.logger.info('Scraping Title: ' + title)
         item['title'] = title.replace('Agrimoney.com | ',
                                       '').encode('utf-8', 'ignore')
         item['article'] = [
             art.encode('utf-8')
             for art in response.xpath('//body//text()').extract()
         ]
         item['link'] = response.url.replace('http://', '').replace(
             'https://', '').encode('utf-8', 'ignore')
     except UnicodeDecodeError:
         pass
     try:
         date = datetime.strptime(
             raw_date.split(',')[1].replace('Sept', 'Sep'), ' %d %b %Y')
         item['date'] = str(date)
         return item
     except (ValueError, IndexError):
         raw_date = response.xpath('//font/text()')[2].extract() + \
             response.xpath('//font/text()')[3].extract()
         try:
             date = datetime.strptime(raw_date.split(',')[2], ' %d %b %Y')
             item['date'] = str(date)
             return item
         except (ValueError, IndexError):
             try:
                 date = datetime.strptime(
                     raw_date.split(',')[1], ' %d %b %Y')
                 item['date'] = str(date)
                 return item
             except Exception:
                 pass
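
Example #7 re-encodes the response body before parsing (though, as noted in the comment above, the queries still run against the original response). body_as_unicode() is the older Scrapy spelling of response.text. A minimal self-contained sketch of the technique, using a hypothetical latin-1 page:

    from scrapy.http import TextResponse

    # hypothetical response whose body is not UTF-8
    response = TextResponse(
        url='http://example.com/article',
        body='<html><title>Caf\xe9 prices rise</title></html>'.encode('latin-1'),
        encoding='latin-1')

    # re-encode the body as UTF-8, dropping anything that will not encode cleanly
    cleaned_response = response.replace(
        body=response.text.encode('utf-8', 'ignore'),
        encoding='utf-8')

    print(cleaned_response.xpath('//title/text()').extract_first())  # Café prices rise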
Example #8
    def parse_item(self, response):
        item = NewsArticleItem()
        title = ''  # avoid a NameError in the log line below if extraction fails
        try:
            title = response.xpath('//title/text()')[0].extract()
            article = response.xpath('//p/text()').extract()
            item['title'] = title
            item['article'] = article
            item['link'] = response.url.replace('http://',
                                                '').replace('https://', '')
        except UnicodeDecodeError:
            pass

        self.logger.info('Scraping Title: ' + title)

        try:
            raw_date = response.xpath(
                '//*[contains(@class,"ea-dateformat")]/text()'
            ).extract()[1].strip()
            date = datetime.strptime(raw_date, '%d-%m-%Y')
            item['date'] = str(date)
            return item
        except Exception:
            pass
Example #9
 def parse_item(self, response):
     item = NewsArticleItem()
     title = response.xpath('//a/text()')[8].extract()
     article = response.xpath('//div/text()').extract()
     self.logger.info("Scraping Title: " + title)
     item['title'] = title
     item['article'] = article
     item['link'] = response.url
     try:
         # use a list so the result is subscriptable on both Python 2 and 3
         token = [line for line in article if '--' in line]
         try:
             raw_date = token[0].split(' -- ')[0].replace('\n', '')
             date = datetime.strptime(raw_date, '%d/%m/%y')  # %m (month), not %M (minutes)
         except (ValueError, IndexError):
             try:
                 raw_date = title.split("Nogger's Blog: ")[1]
                 date = datetime.strptime(raw_date, '%d-%b-%Y')
             except (ValueError, IndexError):
                 date = ""
         item['date'] = str(date)
         return item
     except Exception as e:
         self.logf.write("Failed to scrape {0}: {1}\n".format(
             str(response.url), str(e)))
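
Example #9 originally parsed the body date with '%d/%M/%y' (corrected above): %M means minutes, so a string like '12/03/18' is silently read as January 12th at minute 3 rather than March 12th. A quick illustration with a hypothetical token:

    from datetime import datetime

    raw_date = '12/03/18'  # hypothetical day/month/year token

    print(datetime.strptime(raw_date, '%d/%m/%y'))  # 2018-03-12 00:00:00
    print(datetime.strptime(raw_date, '%d/%M/%y'))  # 2018-01-12 00:03:00 (month defaults to 1)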