Python FilteringLinkExtractor примеры использования

Язык программирования: Python

Пространство имен/Пакет: scrapy.linkextractors.lxmlhtml

Примеров на hotexamples.com: 4

Найдено 4 примера использования Python FilteringLinkExtractor. Это лучшие примеры кода на Python для scrapy.linkextractors.lxmlhtml.FilteringLinkExtractor, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать / Скрыть

FilteringLinkExtractor(1)

_extract_links(1)

_process_links(1)

Пример #1

Показать файл

Файл: content_processor.py Проект: nctl144/general-spider

class ContentProcessor(object):
    """Convert a response into a ``ParsedContent`` with text fields and links."""

    def __init__(self):
        """Create the link extractor with every filter tuple left empty."""
        parser = LxmlParserLinkExtractor()
        # Empty allow/deny tuples are passed through as-is; canonicalization
        # is switched on.
        # NOTE(review): ``None`` for deny_extensions / restrict_css presumably
        # selects Scrapy's defaults -- confirm against the installed version.
        filter_options = {
            'allow': (),
            'deny': (),
            'allow_domains': (),
            'deny_domains': (),
            'restrict_xpaths': (),
            'canonicalize': True,
            'deny_extensions': None,
            'restrict_css': None,
        }
        self.linkextractor = FilteringLinkExtractor(parser, **filter_options)

    def process_response(self, response):
        """Return the parsed content of *response*, including its links.

        The text fields come from ``_extract_text``; the links are extracted
        by ``self.linkextractor`` relative to the response's base URL and
        then passed through the extractor's ``_process_links``.
        """
        document = Selector(response)
        parsed = self._extract_text(document)
        parsed.base_url = get_base_url(response)
        raw_links = self.linkextractor._extract_links(
            document, response.url, response.encoding, parsed.base_url)
        parsed.links = self.linkextractor._process_links(raw_links)
        return parsed

    def _extract_text(self, selector):
        """Collect meta tags, title, headers and paragraphs from *selector*.

        Iterates every element under ``selector._root`` and fills a fresh
        ``ParsedContent``. Only an element's own ``.text`` is read; ``.tail``
        is never consulted.
        """
        def _is_named_meta(node, names):
            # True only for <meta> elements carrying both ``name`` and
            # ``content`` whose lowercased name is one of *names*.
            if node.tag != 'meta':
                return False
            attrs = node.attrib
            if 'name' not in attrs or 'content' not in attrs:
                return False
            return attrs['name'].lower() in names

        def _has_text(node):
            # Non-empty text once surrounding whitespace is stripped.
            return bool(node.text and node.text.strip())

        parsed = ParsedContent()
        for node in selector._root.iter(etree.Element):
            tag = node.tag
            if _is_named_meta(node, ('description', 'og:description')):
                parsed.meta_description = node.attrib['content']
            elif _is_named_meta(node, ('keywords',)):
                parsed.meta_keywords = node.attrib['content']
            elif tag == 'title':
                parsed.title = node.text
            elif tag.startswith('h') and len(tag) == 2 and _has_text(node):
                # Any two-character tag beginning with 'h' counts as a header.
                parsed.headers.append(node.text)
            elif tag not in ('script', 'style') and _has_text(node):
                # Everything else with text is a paragraph, except
                # script/style bodies, which are skipped.
                parsed.paragraphs.append(node.text)
        return parsed

Пример #2

Показать файл

Файл: content_processor.py Проект: nctl144/general-spider

 def __init__(self):
     """Create the link extractor with every filter tuple left empty."""
     parser = LxmlParserLinkExtractor()
     # NOTE(review): ``None`` for deny_extensions / restrict_css presumably
     # selects Scrapy's defaults -- confirm against the installed version.
     filter_options = dict(
         allow=(),
         deny=(),
         allow_domains=(),
         deny_domains=(),
         restrict_xpaths=(),
         canonicalize=True,
         deny_extensions=None,
         restrict_css=None,
     )
     self.linkextractor = FilteringLinkExtractor(parser, **filter_options)

Пример #3

Показать файл

Файл: content_processor.py Проект: sibiryakov/general-spider

class ContentProcessor(object):
    """Build ``ParsedContent`` objects (text fields plus links) from responses."""

    def __init__(self):
        """Set up ``self.linkextractor`` with empty filters and canonicalization."""
        html_parser = LxmlParserLinkExtractor()
        # NOTE(review): ``None`` for deny_extensions / restrict_css presumably
        # selects Scrapy's defaults -- confirm against the installed version.
        self.linkextractor = FilteringLinkExtractor(
            html_parser,
            allow=(),
            deny=(),
            allow_domains=(),
            deny_domains=(),
            restrict_xpaths=(),
            canonicalize=True,
            deny_extensions=None,
            restrict_css=None,
        )

    def process_response(self, response):
        """Parse *response* and return its ``ParsedContent`` with links attached.

        Text fields are produced by ``_extract_text``. Links are extracted by
        ``self.linkextractor`` using the response URL, encoding and base URL,
        then run through the extractor's ``_process_links``.
        """
        page = Selector(response)
        content = self._extract_text(page)
        content.base_url = get_base_url(response)
        extracted = self.linkextractor._extract_links(
            page,
            response.url,
            response.encoding,
            content.base_url,
        )
        content.links = self.linkextractor._process_links(extracted)
        return content

    def _extract_text(self, selector):
        """Fill a new ``ParsedContent`` from the elements under ``selector._root``.

        Populates ``meta_description``, ``meta_keywords``, ``title``,
        ``headers`` and ``paragraphs``. Only each element's ``.text`` is read.
        """
        content = ParsedContent()
        for element in selector._root.iter(etree.Element):
            tag = element.tag

            # Lowercased ``name`` of a <meta> that has both ``name`` and
            # ``content`` attributes; None for every other element.
            meta_key = None
            if tag == 'meta':
                attributes = element.attrib
                if 'name' in attributes and 'content' in attributes:
                    meta_key = attributes['name'].lower()

            if meta_key in ('description', 'og:description'):
                content.meta_description = element.attrib['content']
            elif meta_key == 'keywords':
                content.meta_keywords = element.attrib['content']
            elif tag == 'title':
                content.title = element.text
            else:
                text = element.text
                if not (text and text.strip()):
                    # Nothing but whitespace (or no text at all): ignore.
                    continue
                if tag.startswith('h') and len(tag) == 2:
                    # Any two-character tag beginning with 'h' is a header.
                    content.headers.append(text)
                elif tag not in ('script', 'style'):
                    # script/style bodies are not treated as page text.
                    content.paragraphs.append(text)
        return content

Пример #4

Показать файл

Файл: content_processor.py Проект: sibiryakov/general-spider

 def __init__(self):
     """Set up ``self.linkextractor`` with empty filters and canonicalization."""
     html_parser = LxmlParserLinkExtractor()
     # NOTE(review): ``None`` for deny_extensions / restrict_css presumably
     # selects Scrapy's defaults -- confirm against the installed version.
     self.linkextractor = FilteringLinkExtractor(
         html_parser,
         allow=(),
         deny=(),
         allow_domains=(),
         deny_domains=(),
         restrict_xpaths=(),
         canonicalize=True,
         deny_extensions=None,
         restrict_css=None,
     )