import contextlib
from dataclasses import asdict

from pymongo.errors import DuplicateKeyError
from scrapy.http import TextResponse


def parse(self, response: TextResponse, **kwargs):
    # Stop paginating once the configured page budget is exhausted.
    if self.n_pages == self.max_pages:
        return
    self.n_pages += 1

    links = response.css("#recent p.title > a::attr(href)").extract()
    if links:
        # Listing page: follow only ads we have not stored yet
        # (the ad URL doubles as the MongoDB _id).
        for link in links:
            if self.db.rentals.count_documents({"_id": link}, limit=1) == 0:
                yield response.follow(link, callback=self.parse)
    else:
        # Detail page: parse the ad and persist it, ignoring duplicates.
        # parse_ad_page_html is a project helper defined elsewhere.
        ad = parse_ad_page_html(response.body)
        with contextlib.suppress(DuplicateKeyError):
            self.db.rentals.insert_one(asdict(ad))
        yield ad

    next_page = response.css(".next a")
    if next_page:
        yield response.follow(next_page[0], callback=self.parse)
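
# A minimal sketch of the dedup behaviour the spider above relies on: MongoDB
# enforces uniqueness on _id, so inserting the same ad URL twice raises
# DuplicateKeyError, which contextlib.suppress swallows. The database name and
# sample document here are illustrative assumptions, not part of the project.
import contextlib

from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

db = MongoClient().test
db.rentals.insert_one({"_id": "https://example.com/ad/1", "price": 1200})
with contextlib.suppress(DuplicateKeyError):
    # The second insert with the same _id is silently ignored, as in parse().
    db.rentals.insert_one({"_id": "https://example.com/ad/1", "price": 1200})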

from urllib.parse import urlsplit

from scrapy.http import TextResponse
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor


def parse(self, response: TextResponse):
    # article = newspaper.Article(response.url)
    # article.set_html(response.text)
    # article.parse()
    if response.status != 200:
        return
    yield {
        # 'title': article.title,
        # 'content': article.text,
        # 'published_at': article.publish_date,
        # 'authors': article.authors,
        'url': response.url,
        'html': response.text,
    }
    for link in LxmlLinkExtractor(allow_domains=self.domains).extract_links(response):
        # Rebuild the link without its query string and fragment, keeping
        # scheme-relative ("//host/path") URLs intact.
        split = urlsplit(link.url)
        scheme = (split.scheme + '://') if len(split.scheme) > 1 else ''
        if len(scheme) < 1 and len(split.netloc) > 1 and split.netloc[0] != '/':
            scheme = '//'
        url = scheme + split.netloc + split.path
        yield response.follow(url, callback=self.parse)
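
# A quick check of the URL normalisation above on a made-up link: the query
# string and fragment are dropped, while the scheme, host, and path are kept.
# The sample URL is illustrative only.
from urllib.parse import urlsplit

split = urlsplit("https://example.com/news/story?utm_source=x#top")
scheme = (split.scheme + '://') if len(split.scheme) > 1 else ''
print(scheme + split.netloc + split.path)  # https://example.com/news/story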

import requests
from scrapy.http import TextResponse


def fetch(url, meta=None, *args, **kwargs):
    """Fetch *url* with requests and wrap the result in a Scrapy TextResponse."""
    resp = requests.get(url, *args, **kwargs, timeout=30)
    resp.encoding = 'UTF-8'
    rv = TextResponse(resp.url,
                      status=resp.status_code,
                      body=resp.text,
                      encoding='UTF-8')
    # Attach a Request so response.meta and response.follow() behave as they
    # would inside a running spider.
    rv.request = rv.follow(url, meta=meta)
    _set_response(rv)  # _set_response is a module-level helper defined elsewhere
    return rv
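
# A hedged usage sketch for fetch(): because it returns a Scrapy TextResponse,
# the usual selector API works outside a running spider (handy in a REPL or in
# tests). The URL and meta key below are examples, not values from the project.
response = fetch("https://example.com", meta={"depth": 0})
print(response.status)                     # HTTP status from requests
print(response.css("title::text").get())  # Scrapy selectors work on the wrapper
print(response.meta["depth"])              # meta carried via the attached Request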

from urllib.parse import urlparse

import newspaper
from scrapy.http import TextResponse
from scrapy.linkextractors import LinkExtractor


def parse(self, response: TextResponse):
    article = newspaper.Article(response.url)
    article.set_html(response.text)
    article.parse()
    yield {
        'title': article.title,
        'content': article.text,
        'published_at': article.publish_date,
        'authors': article.authors,
    }
    # allow_domains expects bare domain names, not full URLs.
    domain = urlparse(response.url).netloc
    for link in LinkExtractor(allow_domains=domain).extract_links(response):
        yield response.follow(link, callback=self.parse)

from datetime import datetime

from scrapy.http import TextResponse


def parse(self, response: TextResponse):
    # Collect the data required for a row in the pages table.
    r_url = response.url
    r_page = response.text
    r_time = datetime.now()
    print(__file__, "CrawlCategory.parse()",
          "scraping for pages: {}".format(r_url))

    # Create the SQLAlchemy Page object and stage it on the session.
    pge = Page(url=r_url,
               html=r_page,
               date=r_time,
               category=CrawlCategory.catObject)
    CrawlCategory.dbSession.add(pge)

    # Follow the next page, if the pagination link exists.
    next_page = response.css("li.next a::attr(href)").get()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)

from typing import Dict, Generator

from scrapy.http import TextResponse


def parse(self, response: TextResponse) -> Generator[Dict[str, str], None, None]:
    # Guard against a missing link: response.follow() raises on None.
    chart_link = response.xpath(self.chart_link_xpath).get()
    if chart_link is not None:
        yield response.follow(chart_link, callback=self._parse_playlist)

import json

import jq

from scrapy.http import TextResponse


def parse_teams(self, response: TextResponse):
    # jq needs the parsed JSON document, not the raw response text:
    # team_links = jq.compile(".teams[].link").input(response.text).all()
    json_response = json.loads(response.body)
    team_links = jq.compile(".teams[].link").input(json_response).all()
    for link in team_links:
        yield response.follow(link, callback=self.parse_team)
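
# A small, self-contained illustration of the jq call used above, assuming the
# `jq` PyPI bindings: .input() takes the parsed JSON value and .all() collects
# every match produced by the programme. The sample document is made up.
import jq

doc = {"teams": [{"link": "/api/v1/teams/1"}, {"link": "/api/v1/teams/2"}]}
print(jq.compile(".teams[].link").input(doc).all())
# ['/api/v1/teams/1', '/api/v1/teams/2']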