Example #1
    def parse_urls(self, html):
        """
        Produces a list of URLs present in the given html.

        :type html: str
        :rtype:     list
        """

        soup = BeautifulSoup(html, "html.parser")
        urls = []

        # section headings, which (presumably) appear only on the main page
        for element in soup.find_all("h2", {"class": "section-heading"}):
            if element.a:
                url = element.a.get("href")
                if url not in self.visited_urls:
                    urls.append(Utility.clean_url(url))

        # story links, in the main page and also appearing as relevant articles
        for element in soup.find_all("a", {"class": "story-link"}):
            url = element.get("href")
            if url not in self.visited_urls:
                urls.append(Utility.clean_url(url))

        return urls
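
For context, this variant returns the new URLs so the caller owns the queue. Below is a minimal, hypothetical driver sketch (the loop, the de-duplication against `visited_urls`, and the `requests`-based fetch are assumptions added for illustration, not part of the original code):

    def crawl(self):
        # Hypothetical driver for the list-returning variant; assumes
        # `import requests` at module level and that __init__ sets up
        # self.url_queue (list) and self.visited_urls (set).
        while self.url_queue:
            url = self.url_queue.pop(0)
            if url in self.visited_urls:
                continue
            self.visited_urls.add(url)
            html = requests.get(url).text
            # extend the queue with whatever parse_urls() found
            self.url_queue.extend(self.parse_urls(html))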
Example #2
    def parse_urls(self, html):
        """
        Appends new URLs present in the given html to the URL queue.

        :type html: str
        """

        soup = BeautifulSoup(html, "html.parser")

        # section headings, which (presumably) appear only on the main page
        for element in soup.find_all("h2", {"class": "section-heading"}):
            if element.a:
                url = element.a.get("href")
                if url not in self.visited_urls:
                    self.url_queue.append(Utility.clean_url(url))

        # story links, in the main page and also appearing as relevant articles
        for element in soup.find_all("a", {"class": "story-link"}):
            url = element.get("href")
            if url not in self.visited_urls:
                self.url_queue.append(Utility.clean_url(url))
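
This variant appends the cleaned URLs straight onto `self.url_queue` as a side effect, so the corresponding driver no longer needs to collect a return value. Again a hypothetical sketch, with the same assumed `requests` fetch and `__init__` attributes as above:

    def crawl(self):
        # Hypothetical driver for the queue-mutating variant; parse_urls()
        # grows self.url_queue in place, so there is nothing to collect here.
        while self.url_queue:
            url = self.url_queue.pop(0)
            if url in self.visited_urls:
                continue
            self.visited_urls.add(url)
            html = requests.get(url).text
            self.parse_urls(html)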