Example No. 1
    def parse(self, response: TextResponse, **kwargs):
        if self.n_pages == self.max_pages:
            return

        self.n_pages += 1

        links = response.css("#recent p.title > a::attr(href)").extract()

        if links:
            # Listing page: follow only ads that are not stored in MongoDB yet.
            for link in links:
                # count_documents replaces the Cursor.count() removed in PyMongo 4.
                if self.db.rentals.count_documents({"_id": link}, limit=1) == 0:
                    yield response.follow(link, callback=self.parse)
        else:
            # Ad detail page: parse it and store it, ignoring duplicates.
            ad = parse_ad_page_html(response.body)
            with contextlib.suppress(DuplicateKeyError):
                self.db.rentals.insert_one(asdict(ad))
                yield ad

        next_page = response.css(".next a")

        if next_page:
            yield response.follow(next_page[0], callback=self.parse)
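The method above assumes the spider already carries a Mongo handle (self.db), a page counter (self.n_pages) and a limit (self.max_pages). A minimal hypothetical setup consistent with those attributes, assuming PyMongo and a local MongoDB instance (the class name, start URL and connection string are placeholders, not part of the original example):

import scrapy
from pymongo import MongoClient

class RentalsSpider(scrapy.Spider):  # hypothetical spider name
    name = "rentals"
    start_urls = ["https://example.com/recent"]  # placeholder listing URL

    def __init__(self, max_pages: int = 10, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Assumed connection details; the original example does not show them.
        self.db = MongoClient("mongodb://localhost:27017")["rentals"]
        self.max_pages = int(max_pages)
        self.n_pages = 0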
Example No. 2
    def parse(self, response: TextResponse):
        # article = newspaper.Article(response.url)
        # article.set_html(response.text)
        # article.parse()

        if response.status != 200:
            return

        yield {
            # 'title': article.title,
            # 'content': article.text,
            # 'published_at': article.publish_date,
            # 'authors': article.authors,
            'url': response.url,
            'html': response.text
        }

        for link in LxmlLinkExtractor(
                allow_domains=self.domains).extract_links(response):
            split = urlsplit(link.url)
            # Keep scheme, host and path; query strings and fragments are dropped.
            scheme = (split.scheme + '://') if len(split.scheme) > 1 else ''
            if len(scheme) < 1 and len(
                    split.netloc) > 1 and split.netloc[0] != '/':
                # Protocol-relative link such as //example.com/page.
                scheme = '//'

            url = scheme + split.netloc + split.path

            yield response.follow(url, callback=self.parse)
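A standalone illustration of that normalisation, using only the standard library (the URLs are made-up samples):

from urllib.parse import urlsplit

for raw in ("https://example.com/news/item?id=1#top", "//example.com/news/item"):
    split = urlsplit(raw)
    scheme = (split.scheme + '://') if len(split.scheme) > 1 else ''
    if len(scheme) < 1 and len(split.netloc) > 1 and split.netloc[0] != '/':
        scheme = '//'
    print(scheme + split.netloc + split.path)
# https://example.com/news/item
# //example.com/news/item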
Example No. 3
def fetch(url, meta=None, *args, **kwargs):
    """fetch url. """
    resp = requests.get(url, *args, **kwargs, timeout=30)
    resp.encoding = 'UTF-8'
    rv = TextResponse(resp.url,
                      status=resp.status_code,
                      body=resp.text,
                      encoding='UTF-8')
    # Attach a Request carrying *meta* so the object behaves like a response Scrapy produced.
    rv.request = rv.follow(url, meta=meta)
    _set_response(rv)
    return rv
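A possible way to call the helper, assuming requests and Scrapy are installed (_set_response is whatever module-level hook the surrounding code defines; the URL is a placeholder):

resp = fetch("https://example.com/", meta={"depth": 0})
print(resp.status)                    # HTTP status code from requests
print(resp.css("title::text").get())  # TextResponse exposes Scrapy selectors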
Example No. 4
    def parse(self, response: TextResponse):
        article = newspaper.Article(response.url)
        article.set_html(response.text)
        article.parse()

        yield {
            'title': article.title,
            'content': article.text,
            'published_at': article.publish_date,
            'authors': article.authors
        }

        # allow_domains expects a bare domain (netloc), not a full URL with scheme.
        domain = urlparse(response.url).netloc

        for url in LinkExtractor(allow_domains=domain).extract_links(response):
            yield response.follow(url, callback=self.parse)
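For reference, the same newspaper3k calls can be exercised outside a spider; a minimal sketch, assuming newspaper3k is installed and using a placeholder URL:

import newspaper

article = newspaper.Article("https://example.com/some-article")  # placeholder URL
article.download()  # outside Scrapy we fetch the HTML ourselves instead of set_html()
article.parse()
print(article.title, article.publish_date, article.authors)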
Example No. 5
    def parse(self, response: TextResponse):
        # getting the data required to store in the pages table
        r_url = response.url
        r_page = response.text
        r_time = datetime.now()
        print(__file__, "CrawCategory.parse()",
              "scraping for pages: {}".format(r_url))
        # create SQLAlchemy page object
        pge = Page(url=r_url,
                   html=r_page,
                   date=r_time,
                   category=CrawlCategory.catObject)

        # add page object
        CrawlCategory.dbSession.add(pge)

        # calculating the url for the next page
        next_page = response.css("li.next a").attrib["href"]
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
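The Page model and CrawlCategory.dbSession are defined elsewhere; a hypothetical declarative model matching the fields used above (table and column names are assumptions, not part of the original example):

from sqlalchemy import Column, DateTime, ForeignKey, Integer, String, Text
from sqlalchemy.orm import declarative_base, relationship

Base = declarative_base()

class Category(Base):
    __tablename__ = "categories"
    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False)

class Page(Base):
    __tablename__ = "pages"
    id = Column(Integer, primary_key=True)
    url = Column(String, nullable=False)
    html = Column(Text)
    date = Column(DateTime)
    category_id = Column(Integer, ForeignKey("categories.id"))
    category = relationship("Category")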
Example No. 6
    def parse(self,
              response: TextResponse) -> Generator[Dict[str, str], None, None]:
        chart_link = response.xpath(self.chart_link_xpath).extract_first()
        yield response.follow(chart_link, callback=self._parse_playlist)
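The _parse_playlist callback is not included in the example; a hypothetical sketch consistent with the annotated return type (the selectors are assumptions about the chart markup, not the spider's real logic):

    def _parse_playlist(
            self, response: TextResponse
    ) -> Generator[Dict[str, str], None, None]:
        # Hypothetical markup: one table row per chart entry.
        for row in response.css("table.chart tbody tr"):
            yield {
                "title": row.css("td.title::text").get(default="").strip(),
                "artist": row.css("td.artist::text").get(default="").strip(),
            }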
Example No. 7
    def parse_teams(self, response: TextResponse):
        #  team_links = jq.compile(".teams[].link").input(response.text).all()
        json_response = json.loads(response.body)
        team_links = jq.compile(".teams[].link").input(json_response).all()
        for link in team_links:
            yield response.follow(link, callback=self.parse_team)
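The jq expression can be checked in isolation; a standalone sketch, assuming the jq Python bindings are installed (the sample payload only mimics the shape the spider expects, it is not real API data):

import jq

sample = {"teams": [{"link": "/api/v1/teams/1"}, {"link": "/api/v1/teams/2"}]}
print(jq.compile(".teams[].link").input(sample).all())
# ['/api/v1/teams/1', '/api/v1/teams/2']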