Примеры использования HtmlXPathSelector в Python

Язык программирования: Python

Пространство имен/Пакет: scrapy.selector.libxml2sel

Класс/Тип: HtmlXPathSelector

Примеров на hotexamples.com: 8

Python HtmlXPathSelector — найдено 8 примеров. Это лучшие примеры кода на Python для scrapy.selector.libxml2sel.HtmlXPathSelector, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

HtmlXPathSelector(4)

select(3)

extract(1)

Пример #1

Показать файл

    def extract_links(self, response):
        """Extract links from *response* at every entry of ``self.locations``.

        Each location may be an XPath string (evaluated against the page), a
        single ``HtmlXPathSelector`` or an ``XPathSelectorList``; any other
        type is skipped.

        Every extracted URL is resolved against the page's ``<base href>``
        (or ``response.url`` when there is none), canonicalized when
        ``self.canonicalize`` is set, and then filtered for duplicates when
        ``self.unique`` is set. Duplicate filtering is applied to the final
        URL, i.e. after canonicalization.

        Returns the list of objects produced by
        ``self.extract_from_selector``, in extraction order.
        """
        xs = HtmlXPathSelector(response)
        base_href = xs.select('//base/@href').extract()
        if base_href:
            base_url = urljoin_rfc(response.url, base_href[0])
        else:
            base_url = response.url

        links = []
        for location in self.locations:
            if isinstance(location, basestring):
                # XPath expression: evaluate it against the whole page.
                selectors = xs.select(location)
            elif isinstance(location, HtmlXPathSelector):
                selectors = [location]
            elif isinstance(location, XPathSelectorList):
                selectors = location
            else:
                # Unsupported location type: ignore it.
                continue

            for selector in selectors:
                links.extend(
                    self.extract_from_selector(selector, response.encoding))

        seen, ret = set(), []
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response.encoding)
            # Canonicalize BEFORE the uniqueness check; otherwise two URLs
            # that only differ in their non-canonical form are both kept.
            if self.canonicalize:
                link.url = canonicalize_url(link.url)
            if self.unique:
                if link.url in seen:
                    continue
                seen.add(link.url)
            ret.append(link)

        return ret

Пример #2

Показать файл

Файл: test_selector_libxml2.py Проект: richard-ma/CodeReading

    def test_null_bytes(self):
        """Check that a null byte in the input text is absent from extract()."""
        markup = '<root>la\x00la</root>'

        # The HTML selector output is wrapped in <html><body>.
        html_selector = HtmlXPathSelector(text=markup)
        expected_html = u'<html><body><root>lala</root></body></html>'
        self.assertEqual(html_selector.extract(), expected_html)

        # The XML selector returns the root element as-is.
        xml_selector = XmlXPathSelector(text=markup)
        expected_xml = u'<root>lala</root>'
        self.assertEqual(xml_selector.extract(), expected_xml)

Пример #3

Показать файл

Файл: image.py Проект: bihicheng/scrapy

    def extract_links(self, response):
        """Extract links from *response* at every entry of ``self.locations``.

        A location is either an XPath string (evaluated against the page), an
        ``HtmlXPathSelector`` or an ``XPathSelectorList``; entries of any
        other type are skipped.

        URLs are made absolute using the page's ``<base href>`` when present
        (``response.url`` otherwise). When ``self.canonicalize`` is set they
        are canonicalized, and when ``self.unique`` is set duplicates of the
        resulting URL are dropped (first occurrence wins).

        Returns the list of objects produced by
        ``self.extract_from_selector``, in extraction order.
        """
        xs = HtmlXPathSelector(response)
        base_href = xs.select('//base/@href').extract()
        base_url = urljoin_rfc(response.url, base_href[0]) if base_href else response.url

        links = []
        for location in self.locations:
            if isinstance(location, basestring):
                selectors = xs.select(location)
            elif isinstance(location, HtmlXPathSelector):
                # A single selector is wrapped so it can be iterated below.
                selectors = [location]
            elif isinstance(location, XPathSelectorList):
                selectors = location
            else:
                continue

            for selector in selectors:
                links.extend(self.extract_from_selector(selector, response.encoding))

        seen, ret = set(), []
        for link in links:
            url = urljoin_rfc(base_url, link.url, response.encoding)
            if self.canonicalize:
                url = canonicalize_url(url)
            link.url = url
            # De-duplicate on the final URL so that links which become equal
            # only after canonicalization are not returned twice.
            if self.unique:
                if url in seen:
                    continue
                seen.add(url)
            ret.append(link)

        return ret

Пример #4

Показать файл

Файл: test_selector_libxml2.py Проект: 00gpowe/scrapy

    def test_null_bytes(self):
        """Both selector classes must extract the text without the null byte."""
        source = '<root>la\x00la</root>'
        # (selector class, expected extract() result), checked in this order.
        cases = (
            (HtmlXPathSelector, u'<html><body><root>lala</root></body></html>'),
            (XmlXPathSelector, u'<root>lala</root>'),
        )
        for selector_cls, expected in cases:
            selector = selector_cls(text=source)
            self.assertEqual(selector.extract(), expected)

Пример #5

Показать файл

    def parse_item(self, response):
        """Build an ``ArticleItem`` from the article page in *response*.

        ``source`` and ``url`` are copied from the spider and the response.
        Every other field except ``content`` is extracted on a best-effort
        basis: if its extraction raises, the field is set to ``''`` instead
        of aborting the whole item. ``content`` is the newline-joined list of
        non-empty sanitized paragraphs.

        Returns the populated item, or ``None`` when ``self.debug == True``
        (in that case the item is printed instead of returned).
        """
        hxs = HtmlXPathSelector(response)

        # Assign given elements
        article = ArticleItem()
        article['source'] = self.source
        article['url'] = response.url

        # NOTE: the handlers below catch ``Exception`` rather than using a
        # bare ``except:`` so that KeyboardInterrupt / SystemExit still
        # propagate while ordinary extraction failures stay best-effort.

        # Parse Category
        try:
            category_str = sanitize(self.get_category(response))
            article['category'] = capitalizeFirstCharInWord(
                self.normalize_category(self.parse_category(category_str)))
        except Exception:
            article['category'] = ''

        # Parse Title
        try:
            article['title'] = sanitize(
                hxs.select(self.xpath_title).extract()[0])
        except Exception:
            article['title'] = ''

        # Parse Content
        paragraphs = hxs.select(self.xpath_content).extract()
        lines = []
        for paragraph in paragraphs:
            line = sanitize(paragraph)
            if len(line) > 0:
                lines.append(line)
        article['content'] = '\n'.join(lines)

        # Parse Subtitle
        try:
            article['subtitle'] = capitalizeFirstCharInWord(
                sanitize(hxs.select(self.xpath_subtitle).extract()[0]))
        except Exception:
            article['subtitle'] = ''

        # Parse Published_at
        try:
            date_str = sanitize(
                hxs.select(self.xpath_published_at).extract()[0])
            article['published_at'] = self.parse_date(date_str)
        except Exception:
            article['published_at'] = ''

        # Parse Place
        try:
            place_str = sanitize(hxs.select(self.xpath_place).extract()[0])
            article['place'] = capitalizeFirstCharInWord(
                self.parse_place(place_str))
        except Exception:
            article['place'] = ''

        # Parse Author
        try:
            author_str = sanitize(hxs.select(self.xpath_author).extract()[0])
            article['author'] = capitalizeFirstCharInWord(
                self.parse_author(author_str))
        except Exception:
            article['author'] = ''

        # Debug
        if self.debug == True:
            # Single-argument parenthesized print produces the same output
            # as the print statement on Python 2 and is valid on Python 3.
            print(article)
            print('')
        else:
            return article

Пример #6

Показать файл

 def get_category(self, response):
     """Return the first result of ``self.xpath_category`` on *response*.

     The ``[0]`` lookup fails if the extraction yields nothing.
     """
     selector = HtmlXPathSelector(response)
     matches = selector.select(self.xpath_category).extract()
     return matches[0]

Пример #7

Показать файл

Файл: base_crawler.py Проект: wiwiek-ci/ina-news-crawler

    def parse_item(self, response):
        """Build an ``ArticleItem`` from the article page in *response*.

        ``source`` and ``url`` come from the spider and the response. All
        other fields except ``content`` are best-effort: when extracting one
        raises, that field is set to ``''`` and parsing continues.
        ``content`` is the non-empty sanitized paragraphs joined by newlines.

        Returns the populated item, or ``None`` when ``self.debug == True``
        (the item is printed instead of returned).
        """
        hxs = HtmlXPathSelector(response)

        # Assign given elements
        article = ArticleItem()
        article['source'] = self.source
        article['url'] = response.url

        # NOTE: ``except Exception`` (not a bare ``except:``) is used below so
        # KeyboardInterrupt / SystemExit are not silently swallowed.

        # Parse Category
        try:
            category_str = sanitize(self.get_category(response))
            article['category'] = capitalizeFirstCharInWord(self.normalize_category(self.parse_category(category_str)))
        except Exception:
            article['category'] = ''

        # Parse Title
        try:
            article['title'] = sanitize(hxs.select(self.xpath_title).extract()[0])
        except Exception:
            article['title'] = ''

        # Parse Content
        paragraphs = hxs.select(self.xpath_content).extract()
        lines = []
        for paragraph in paragraphs:
            line = sanitize(paragraph)
            if len(line) > 0:
                lines.append(line)
        article['content'] = '\n'.join(lines)

        # Parse Subtitle
        try:
            article['subtitle'] = capitalizeFirstCharInWord(sanitize(hxs.select(self.xpath_subtitle).extract()[0]))
        except Exception:
            article['subtitle'] = ''

        # Parse Published_at
        try:
            date_str = sanitize(hxs.select(self.xpath_published_at).extract()[0])
            article['published_at'] = self.parse_date(date_str)
        except Exception:
            article['published_at'] = ''

        # Parse Place
        try:
            place_str = sanitize(hxs.select(self.xpath_place).extract()[0])
            article['place'] = capitalizeFirstCharInWord(self.parse_place(place_str))
        except Exception:
            article['place'] = ''

        # Parse Author
        try:
            author_str = sanitize(hxs.select(self.xpath_author).extract()[0])
            article['author'] = capitalizeFirstCharInWord(self.parse_author(author_str))
        except Exception:
            article['author'] = ''

        # Debug
        if self.debug == True:
            # Parenthesized single-argument print: same output as the print
            # statement on Python 2, and valid syntax on Python 3.
            print(article)
            print('')
        else:
            return article

Пример #8

Показать файл

Файл: base_crawler.py Проект: wiwiek-ci/ina-news-crawler

 def get_category(self, response):
     """Select ``self.xpath_category`` in *response* and return its first extracted value.

     Indexing with ``[0]`` fails when nothing was extracted.
     """
     extracted = HtmlXPathSelector(response).select(self.xpath_category).extract()
     first = extracted[0]
     return first