Example #1
from requests_html import HTMLSession


# Assumes module-level shared state defined elsewhere: LINKS_QUEUE (a queue of
# URLs to fetch), SCANNED_LINKS (a set of visited URLs), BAD_PARTS (URL
# substrings to exclude), locker (a threading.Lock) and a Page ORM model.
def worker(domain):
    while True:

        # Optional exit condition (left disabled): stop the worker once the
        # queue has stayed empty for ten seconds.
        # if LINKS_QUEUE.qsize() == 0:
        #     sleep(10)
        #     if LINKS_QUEUE.qsize() == 0:
        #         break
        #     continue

        # Take the next URL and mark it as visited before fetching,
        # so other workers do not process it again.
        url = LINKS_QUEUE.get()
        SCANNED_LINKS.add(url)

        # Fetch the page; network errors and non-200 responses skip this URL.
        try:
            with HTMLSession() as session:
                resp = session.get(url)

            assert resp.status_code == 200

        except Exception as e:
            print(e, type(e))
            continue

        # Extract the <title> and first <h1>, falling back to a placeholder.
        try:
            page_title = resp.html.xpath('//title')[0].text
        except IndexError:
            page_title = 'Not Found'

        try:
            page_h1 = resp.html.xpath('//h1')[0].text
        except IndexError:
            page_h1 = 'Not Found'

        Page.create(url=url, title=page_title, h1=page_h1)
        print('[OK]', url)

        # Append the result under the shared lock so concurrent workers
        # do not interleave lines in the output file.
        with locker:
            with open('results.csv', 'a') as f:
                f.write(f'{url}\t{page_title}\t{page_h1}\n')

        # Queue same-domain links that have not been scanned yet and do not
        # contain any excluded substring.
        for link in resp.html.absolute_links:
            link = link.split('#')[0]
            if domain not in link:
                continue
            if link in SCANNED_LINKS:
                continue
            if any(part in link for part in BAD_PARTS):
                continue

            LINKS_QUEUE.put(link)
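For context, here is a minimal sketch of the shared state this worker relies on, assuming a thread-per-worker setup. Only the names LINKS_QUEUE, SCANNED_LINKS, BAD_PARTS, locker and Page come from the snippet above; the peewee-backed Page model, the BAD_PARTS values and the thread start-up are assumptions.

import threading
from queue import Queue

from peewee import CharField, Model, SqliteDatabase

db = SqliteDatabase('crawler.db')          # assumed storage backend


class Page(Model):                         # assumed shape of the Page model
    url = CharField()
    title = CharField()
    h1 = CharField()

    class Meta:
        database = db


LINKS_QUEUE = Queue()                      # URLs waiting to be fetched
SCANNED_LINKS = set()                      # URLs a worker has already taken
BAD_PARTS = ('mailto:', '.jpg', '.pdf')    # assumed exclusion substrings
locker = threading.Lock()                  # guards results.csv


if __name__ == '__main__':
    db.create_tables([Page])
    LINKS_QUEUE.put('https://example.com/')
    threads = [threading.Thread(target=worker, args=('example.com',))
               for _ in range(4)]
    for t in threads:
        t.start()
    # Workers loop forever unless the commented-out idle check in worker()
    # is enabled, so join() only returns once that exit path is restored.
    for t in threads:
        t.join()

A Queue plus a shared set and a Lock keeps the workers coordinated without per-URL locking; the only critical section is the CSV append.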
Example #2
    def get_page(self, url):
        """Fetch one URL, store it as a Page row and queue newly seen links.

        Assumes module-level imports of requests, json and logging, a Page ORM
        model, and a Selenium helper get_page_source() defined elsewhere.
        """
        normalized_url = self.normalize(url)
        if normalized_url not in self.pages:
            # A HEAD request first, so non-HTML resources can be recorded
            # without downloading their body.
            headers = requests.head(url)
            content_type = headers.headers.get('content-type', '')

            if "text/html" in content_type:
                # If the GET itself fails, record an empty-link Page row
                # and skip further processing of this URL.
                try:
                    page = requests.get(url)
                except Exception as e:
                    logging.error(f"Requests get exception: {e}")
                    Page.create(id=self.id,
                                url=normalized_url,
                                status=headers.status_code,
                                content_type=content_type,
                                links=json.dumps([]))
                    self.id += 1
                    return

                logging.debug(f"Got {url} [{page.status_code}]")

                # Prefer the JavaScript-rendered source from Selenium and
                # fall back to the plain requests body if the driver fails.
                try:
                    page_content = get_page_source(url)
                except Exception as e:
                    logging.error(f"Got selenium error: [{e}]")
                    page_content = page.content

                links = [
                    self.normalize(link)
                    for link in self.parse_page(page_content)
                ]
                Page.create(id=self.id,
                            url=normalized_url,
                            status=page.status_code,
                            content_type=content_type,
                            links=json.dumps(links))
                self.pages[normalized_url] = None
                # Enqueue only links that have not already been stored.
                for link in links:
                    if link not in self.pages:
                        self.queue.add(link)
            else:
                # Non-HTML resources: record the status and content type only.
                logging.debug(f"Add {url} with content_type: {content_type}")
                Page.create(id=self.id,
                            url=normalized_url,
                            status=headers.status_code,
                            content_type=content_type,
                            links=json.dumps([]))
            self.id += 1
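The method above leans on several attributes and helpers of the surrounding crawler that the snippet does not show. Below is a rough skeleton of what they might look like, purely as an assumption: the names pages, queue, id, normalize, parse_page and get_page_source are taken from the snippet, while the implementations are guesses.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urldefrag, urljoin


def get_page_source(url):
    # Stand-in for the Selenium helper used above; a real version would
    # drive a headless browser and return the rendered HTML.
    return requests.get(url, timeout=10).text


class Crawler:
    def __init__(self, start_url):
        self.start_url = start_url
        self.pages = {}      # normalized URL -> placeholder once stored
        self.queue = set()   # normalized URLs still to visit
        self.id = 0          # primary key handed to Page.create()

    def normalize(self, url):
        # Resolve relative links against the start URL and drop fragments.
        return urldefrag(urljoin(self.start_url, url))[0]

    def parse_page(self, content):
        # Collect every href on the page; normalization happens in get_page().
        soup = BeautifulSoup(content, 'html.parser')
        return [a['href'] for a in soup.find_all('a', href=True)]

Note that queue is a set rather than a FIFO, which matches the self.queue.add(link) call in the method above.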