def extract_links(self, response):
    """Collect links from the configured locations, resolved to absolute URLs.

    Honours a ``<base href>`` tag when present, optionally de-duplicates
    (``self.unique``) and canonicalizes (``self.canonicalize``) the URLs.
    """
    selector = HtmlXPathSelector(response)
    base_candidates = selector.select('//base/@href').extract()
    if base_candidates:
        base_url = urljoin_rfc(response.url, base_candidates[0])
    else:
        base_url = response.url

    raw_links = []
    for location in self.locations:
        if isinstance(location, basestring):
            node_list = selector.select(location)
        elif isinstance(location, HtmlXPathSelector):
            node_list = [location]
        elif isinstance(location, XPathSelectorList):
            node_list = location
        else:
            # Unsupported location type: skip it silently.
            continue
        for node in node_list:
            raw_links.extend(self.extract_from_selector(node, response.encoding))

    seen_urls = set()
    result = []
    for link in raw_links:
        link.url = urljoin_rfc(base_url, link.url, response.encoding)
        if self.unique:
            if link.url in seen_urls:
                continue
            seen_urls.add(link.url)
        if self.canonicalize:
            link.url = canonicalize_url(link.url)
        result.append(link)
    return result
def extract_links(self, response):
    """Extract links from ``self.locations``, returning them with absolute URLs."""
    xs = HtmlXPathSelector(response)
    # Prefer an explicit <base href="..."> as the join base, else the response URL.
    base_url = xs.select('//base/@href').extract()
    base_url = urljoin_rfc(response.url, base_url[0]) if base_url else response.url
    links = []
    for location in self.locations:
        # A location may be an XPath string, a single selector, or a selector list.
        if isinstance(location, basestring):
            selectors = xs.select(location)
        elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
            selectors = [location] if isinstance(location, HtmlXPathSelector) else location
        else:
            # Anything else is ignored.
            continue
        for selector in selectors:
            links.extend(self.extract_from_selector(selector, response.encoding))
    seen, ret = set(), []
    for link in links:
        # Make the URL absolute before de-duplication / canonicalization.
        link.url = urljoin_rfc(base_url, link.url, response.encoding)
        if self.unique:
            # Dedup happens on the absolute (pre-canonicalized) URL.
            if link.url in seen:
                continue
            else:
                seen.add(link.url)
        if self.canonicalize:
            link.url = canonicalize_url(link.url)
        ret.append(link)
    return ret
def parse_item(self, response): hxs = HtmlXPathSelector(response) # Assign given elements article = ArticleItem() article['source'] = self.source article['url'] = response.url # Parse Category try: category_str = sanitize(self.get_category(response)) article['category'] = capitalizeFirstCharInWord( self.normalize_category(self.parse_category(category_str))) except: article['category'] = '' # Parse Title try: article['title'] = sanitize( hxs.select(self.xpath_title).extract()[0]) except: article['title'] = '' # Parse Content paragraphs = hxs.select(self.xpath_content).extract() lines = [] for paragraph in paragraphs: line = sanitize(paragraph) if len(line) > 0: lines.append(line) article['content'] = '\n'.join(lines) # Parse Subtitle try: article['subtitle'] = capitalizeFirstCharInWord( sanitize(hxs.select(self.xpath_subtitle).extract()[0])) except: article['subtitle'] = '' # Parse Published_at try: date_str = sanitize( hxs.select(self.xpath_published_at).extract()[0]) article['published_at'] = self.parse_date(date_str) except: article['published_at'] = '' # Parse Place try: place_str = sanitize(hxs.select(self.xpath_place).extract()[0]) article['place'] = capitalizeFirstCharInWord( self.parse_place(place_str)) except: article['place'] = '' # Parse Author try: author_str = sanitize(hxs.select(self.xpath_author).extract()[0]) article['author'] = capitalizeFirstCharInWord( self.parse_author(author_str)) except: article['author'] = '' # Debug if self.debug == True: print article print '' else: return article
def get_category(self, response):
    """Return the first raw string matched by ``self.xpath_category``."""
    selector = HtmlXPathSelector(response)
    matches = selector.select(self.xpath_category).extract()
    # An IndexError on an empty match list is intentional: the caller
    # (parse_item) wraps this call in try/except and falls back to ''.
    return matches[0]
def parse_item(self, response): hxs = HtmlXPathSelector(response) # Assign given elements article = ArticleItem() article['source'] = self.source article['url'] = response.url # Parse Category try: category_str = sanitize(self.get_category(response)) article['category'] = capitalizeFirstCharInWord(self.normalize_category(self.parse_category(category_str))) except: article['category'] = '' # Parse Title try: article['title'] = sanitize(hxs.select(self.xpath_title).extract()[0]) except: article['title'] = '' # Parse Content paragraphs = hxs.select(self.xpath_content).extract() lines = [] for paragraph in paragraphs: line = sanitize(paragraph) if len(line) > 0: lines.append(line) article['content'] = '\n'.join(lines) # Parse Subtitle try: article['subtitle'] = capitalizeFirstCharInWord(sanitize(hxs.select(self.xpath_subtitle).extract()[0])) except: article['subtitle'] = '' # Parse Published_at try: date_str = sanitize(hxs.select(self.xpath_published_at).extract()[0]) article['published_at'] = self.parse_date(date_str) except: article['published_at'] = '' # Parse Place try: place_str = sanitize(hxs.select(self.xpath_place).extract()[0]) article['place'] = capitalizeFirstCharInWord(self.parse_place(place_str)) except: article['place'] = '' # Parse Author try: author_str = sanitize(hxs.select(self.xpath_author).extract()[0]) article['author'] = capitalizeFirstCharInWord(self.parse_author(author_str)) except: article['author'] = '' # Debug if self.debug == True: print article print '' else: return article