# Exemplo n.º 1 (score: 0)
class CrawlerDB(Thread):
    """Multi-threaded crawler that persists its state through a delegate.

    The crawler thread spawns ``no_workers`` worker threads.  Each worker
    repeatedly pulls the next unvisited URL from the database, fetches it,
    stores the discovered links/resources back through the delegate and
    notifies registered listeners about progress.  All delegate (DB) access
    is serialized through ``self.condition`` (an RLock).
    """

    def __init__(self,
                 delegate,
                 initialLink=None,
                 max_links=0,
                 no_workers=10,
                 id=None):
        """
        :param delegate: persistence delegate used for all DB operations;
            when ``None`` a fresh ``Delegate()`` is created.
        :param initialLink: optional seed URL, stored immediately when given.
        :param max_links: stop after this many visited resources (0 = no limit).
        :param no_workers: number of worker threads to spawn.
        :param id: crawl identifier; a fresh UUID is generated when omitted.
        """
        Thread.__init__(self)
        self.noOfWorkers = no_workers
        self.workers = []
        self.running = True
        self.paused = False
        self.condition = RLock()  # guards every delegate/DB access
        # BUG FIX: the original ignored the required ``delegate`` argument and
        # always created a new Delegate().  Honor the argument, keeping the
        # old behaviour as a fallback when None is passed.
        self.delegate = delegate if delegate is not None else Delegate()
        # Listeners that want to receive messages (ex: progress) from Crawler.
        self.listeners = []
        # BUG FIX: the old default ``id=str(uuid.uuid4())`` was evaluated only
        # once, at function-definition time, so every crawler created without
        # an explicit id shared the exact same identifier.
        self.id = id if id is not None else str(uuid.uuid4())
        self.initialLink = initialLink
        if initialLink is not None:
            self.add_initial_url(initialLink)
        self.max_links = max_links
        try:
            # NOTE(review): fails (and is merely logged) when initialLink is
            # None — presumably domain_regex is optional downstream; confirm.
            self.domain_regex = re.compile(get_domain(initialLink))
        except Exception as ex:
            # Consistency fix: use the module logger like the rest of the class.
            logger.error("Exception {}".format(ex))

    def add_initial_url(self, address):
        """Store the seed URL as an unvisited internal URL for this crawl."""
        logger.info("Add initial URL")
        with self.condition:
            url = Url(url=address,
                      absolute_url=address,
                      type=Url.TYPE_INTERNAL,
                      crawl_id=self.id,
                      job_status=Url.JOB_STATUS_NOT_VISITED)
            self.delegate.url_create(url)

    def no_unvisited_urls(self):
        """Return the number of not-yet-visited URLs for this crawl."""
        with self.condition:
            return self.delegate.url_count_unvisited(self.id)

    def no_pending_urls(self):
        """Return the number of pending (unvisited or in-progress) URLs."""
        with self.condition:
            return self.delegate.url_count_pending(self.id)

    def all_unvisited_urls(self):
        """Return every unvisited URL row for this crawl."""
        with self.condition:
            return self.delegate.url_get_all_unvisited(self.id)

    def no_visited_urls(self):
        """Return the number of visited URLs for this crawl."""
        with self.condition:
            return self.delegate.url_count_visited(self.id)

    def no_visited_resources(self):
        """Return the number of visited resources (pages) for this crawl."""
        with self.condition:
            return self.delegate.resource_count_visited(self.id)

    def no_external_urls(self):
        """Return the number of external URLs for this crawl."""
        with self.condition:
            return self.delegate.url_count_external(self.id)

    def next_unvisited_link_id(self):
        """Atomically claim the next unvisited URL.

        Marks the URL as in-progress so no other worker grabs it.
        Returns the claimed URL id, or -1 when no unvisited URL exists.
        """
        link_id = -1
        with self.condition:
            url = self.delegate.url_get_first_unvisited(self.id)
            if url is not None:
                url.job_status = Url.JOB_STATUS_IN_PROGRESS  # claim it
                self.delegate.url_update(url)
                link_id = url.id
        return link_id

    def mark_url_as_visited(self, url_id):
        """Flag the URL identified by *url_id* as visited."""
        with self.condition:
            url = self.delegate.url_get_by_id(url_id)
            url.job_status = Url.JOB_STATUS_VISITED
            self.delegate.url_update(url)

    def _type_links(self, links):
        """Tag each link dict as 'internal' or 'external' relative to the seed domain."""
        for link in links:
            if is_internal(get_domain(self.initialLink),
                           link['absolute']):  # same domain as the seed URL
                link['type'] = 'internal'
            else:
                link['type'] = 'external'

    def _get_links(self, link_id):
        """Fetch the page behind *link_id* and return ``(page, typed_links)``."""
        with self.condition:
            link = self.delegate.url_get_by_id(link_id)
            (page, links) = get_links(link.absolute_url)
            self._type_links(links)
            return page, links

    def link2url(self, link):
        """Convert a scraped link dict into a Url row bound to this crawl."""
        url = Url(crawl_id=self.id)
        if 'href' in link:
            url.url = link['href']
        if 'absolute' in link:
            url.absolute_url = link['absolute']
        if 'type' in link:
            url.type = link['type']
        if 'content' in link:
            url.raw_content = str(link['content'])
            # TODO: parse raw_content and keep only the text, without HTML tags
            url.text = str(link['content'])
        return url

    def page2resource(self, page):
        """Convert a fetched page dict into a Resource row bound to this crawl."""
        resource = Resource(crawl_id=self.id)
        if 'url' in page:
            resource.absolute_url = page['url']
        if 'content' in page:
            resource.content = page['content']
        if 'elapsed' in page:
            resource.elapsed = page['elapsed']
        return resource

    def add_links(self, links, src_resource_id=None, status_code=200):
        """Add a bunch of URLs using the resource id as source (page where found it)"""
        with self.condition:
            for link in links:
                url = self.link2url(link)
                if src_resource_id is not None:
                    url.src_resource_id = src_resource_id

                    # If the destination resource was already crawled, mark
                    # this URL as visited and wire it to that resource.
                    try:
                        src_resource = self.delegate.resource_get_by_id(
                            src_resource_id)
                        dest_resource = self.delegate.resource_get_by_absolute_url_and_crawl_id(
                            url.absolute_url, src_resource.crawl_id)
                        if dest_resource is not None:
                            url.job_status = Url.JOB_STATUS_VISITED
                            url.dst_resource_id = dest_resource.id
                            url.status_code = status_code
                    except Exception as e:
                        logger.warning("Exception {}".format(e))

                self.delegate.url_create(url)

    def add_resource(self, page):
        """Store *page* as a Resource unless one is already present.

        NOTE(review): resource_is_present() is called with only the crawl id,
        not the page URL — looks like it can skip distinct pages of the same
        crawl; confirm the delegate's semantics before relying on this.
        """
        with self.condition:
            if not self.delegate.resource_is_present(crawlId=self.id):
                resource = self.page2resource(page)
                self.delegate.resource_create(resource)

    def connect_url_to_destination(self, url_id, resource_id):
        """Point the URL *url_id* at the resource it resolved to."""
        with self.condition:
            url = self.delegate.url_get_by_id(url_id)
            url.dst_resource_id = resource_id
            self.delegate.url_update(url)

    def resource_get_by_absolute_url_and_crawl_id(self, address, crawler_id):
        """Return the Resource for (absolute url, crawl id), or None."""
        with self.condition:
            resource = self.delegate.resource_get_by_absolute_url_and_crawl_id(
                address, crawler_id)
            return resource

    def resource_create(self, page):
        """Create and persist a Resource from *page*.

        Returns the created Resource, or None when creation failed.
        """
        with self.condition:
            # BUG FIX: initialize so a failure inside the try block no longer
            # raises UnboundLocalError at the return statement.
            resource = None
            try:
                resource = self.page2resource(page)
                self.delegate.resource_create(resource)
            except Exception as e:
                # BUG FIX: logger.warn() is deprecated, and the original
                # format string had a stray '}}'.
                logger.warning("{} Exception {}.".format(
                    currentThread().getName(), e))
            return resource

    def run(self):
        """Main crawler loop: start workers, poll for completion, notify listeners."""
        # Initialize workers
        for i in range(self.noOfWorkers):
            self.workers.append(
                Thread(target=self.workerJob,
                       kwargs={"crawlId": self.id},
                       name="Thread-{}".format(i)))

        # Start workers
        self._start_all_workers()

        while self.running:
            logger.debug("[%s] Crawler thread cycle started." %
                         (currentThread().getName()))
            if self.paused:
                logger.debug("[%s] Crawler paused." %
                             (currentThread().getName()))
                # BUG FIX: sleep while paused instead of busy-spinning a CPU core.
                time.sleep(1)
                continue

            logger.debug("[%s] Crawler check if jobs are done." %
                         (currentThread().getName()))
            if self._are_jobs_done():
                logger.debug("Crawler is shutting down")
                self.setRunning(False)
                break
            else:
                logger.debug("[%s] Crawler's jobs are NOT done." %
                             (currentThread().getName()))

            logger.debug("[%s] Crawler sleep." % (currentThread().getName()))
            time.sleep(1)

        # Wait for all workers to finish
        self._join_all_workers()

        msg = {
            "status": "done",
            "visited": self.no_visited_urls(),
            "to_visit": self.no_unvisited_urls(),
            "max_links": self.max_links,
            "crawlId": self.id
        }

        self.notify(msg)

    def workerJob(self, crawlId):
        """Worker loop: claim a URL, fetch it, persist results, repeat until stopped."""
        while self.running:
            logger.debug("[%s] Worker thread cycle started." %
                         (currentThread().getName()))

            if self.paused:
                # BUG FIX: sleep while paused instead of busy-spinning.
                time.sleep(1)
                continue

            # If max pages specified see if we already reached it
            if self.max_links > 0:
                no_pages_visited = self.no_visited_resources()
                if no_pages_visited >= self.max_links:
                    # BUG FIX: back off instead of busy-spinning until shutdown.
                    time.sleep(1)
                    continue

            # Grab next job
            link_id = self.next_unvisited_link_id()
            logger.debug("[%s] Next link [%d]." %
                         (currentThread().getName(), link_id))

            # BUG FIX: dropped the always-true "'link_id' in locals()" check.
            if link_id != -1:
                logger.debug("[%s] Current link : %d" %
                             (currentThread().getName(), link_id))
                page, links = self._get_links(link_id)
                logger.debug("[%s] Discovered [%d] links." %
                             (currentThread().getName(), len(links)))

                try:
                    with self.condition:
                        # Update the claimed URL with the fetch status code
                        url = self.delegate.url_get_by_id(link_id)
                        url.status_code = page['status-code']
                        self.delegate.url_update(url)

                        if page['status-code'] == 200:
                            # 1.Add Resource 2.Link URLs to (new | existing) Resources
                            resource = self.resource_get_by_absolute_url_and_crawl_id(
                                page['url'], self.id)
                            if resource is None:
                                # Add it only if max links not reached
                                maximum_reached = False
                                if self.max_links > 0:  # We have a max_link specified
                                    no_pages_visited = self.no_visited_resources()
                                    if no_pages_visited >= self.max_links:
                                        maximum_reached = True

                                if not maximum_reached:
                                    resource = self.resource_create(page)
                                    self.connect_url_to_destination(
                                        link_id, resource.id)
                                    logger.debug(
                                        "[%s] Adding links to DB linked to resource [%d]"
                                        % (currentThread().getName(),
                                           resource.id))
                                    self.add_links(links, resource.id,
                                                   page['status-code'])
                            else:
                                # Resource already added only make the end connection
                                self.connect_url_to_destination(
                                    link_id, resource.id)
                        else:
                            pass  # non-200 pages are recorded but not stored as resources

                        self.mark_url_as_visited(link_id)

                        msg = {
                            "status": "in_progress",
                            "visited": self.no_visited_urls(),
                            "to_visit": self.no_unvisited_urls(),
                            "max_links": self.max_links,
                            "crawlId": crawlId,
                            "currentWorker": currentThread().getName()
                        }

                        self.notify(msg)
                except Exception as e:
                    # Consistency fix: log errors instead of printing to stdout.
                    logger.error("Error {}".format(e))
            else:
                # BUG FIX: no work available — back off instead of busy-spinning.
                time.sleep(0.2)

            logger.debug("[%s] cycle ended." % (currentThread().getName()))
        else:
            # while/else: runs when the loop exits because running went False.
            logger.debug("[%s] is shutting down." %
                         (currentThread().getName()))

    def stop(self):
        """Request the crawler (and its workers) to shut down."""
        self.setRunning(False)

    def pause(self):
        """Pause the crawler; workers idle until resume() is called."""
        self.paused = True

    def resume(self):
        """Resume a paused crawler."""
        if self.paused:
            self.paused = False

    def _start_all_workers(self):
        """Start every worker thread created in run()."""
        for w in self.workers:
            w.start()

    def _are_jobs_done(self):
        """Return True when the crawl should stop (no pending URLs or max reached).

        FIXME: If a thread grabs the initial link while this runs,
        no_pending_urls() can return zero and the crawler will initiate
        shutdown prematurely.
        """
        no_pending_urls = self.no_pending_urls()
        logger.debug("Crawler: _are_jobs_done(...) : no_pending_urls = %d " %
                     (no_pending_urls, ))

        if no_pending_urls == 0:
            return True

        # Test if we have reached the max no of pages
        if self.max_links > 0:
            no_pages_visited = self.no_visited_resources()
            if no_pages_visited >= self.max_links:
                return True

        return False

    def _join_all_workers(self):
        """Block until every worker thread has finished."""
        for w in self.workers:
            w.join()

    def setRunning(self, status):
        """Set the running flag polled by the main and worker loops."""
        self.running = status

    def addListener(self, callback):
        """Register *callback* to receive progress message dicts."""
        self.listeners.append(callback)

    def removeListener(self, callback):
        """Unregister a previously added listener callback."""
        self.listeners.remove(callback)

    def notify(self, msg):
        """Send *msg* to every registered listener."""
        for callback in self.listeners:
            callback(msg)
# Exemplo n.º 2 (score: 0)
    def test_link(self):
        """End-to-end test of the delegate's Url CRUD/query API.

        Builds a Site -> Crawl -> Resource chain, then exercises the url_*
        delegate methods (create, counts, first-unvisited, delete, cascade
        delete from Resource) against a live database.
        """
        delegate = XDelegate()

        # BUG FIX: message said "test_page" although this is test_link.
        print("test_link started")
        # Site 1
        site1 = Site()
        site1.name = "Site1"
        site1.url = 'http://foo.com'
        delegate.site_create(site1)

        # Crawl
        crawl = Crawl(site_id=site1.id)
        delegate.crawl_create(crawl)
        assert crawl.id > 0

        # Page
        page = Resource()
        page.crawl_id = crawl.id
        page.content = "Ala bala portocala"
        page.absolute_url = "https://scriptoid.com/index.php"
        delegate.resource_create(page)

        # Link

        # Test url_is_present() — nothing created yet
        p1 = delegate.url_is_present('https://scriptoid.com/index.php',
                                     crawl.id)
        assert not p1

        # Test url_count_unvisited()
        n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
        assert n1 == 0, 'n1 is {}'.format(n1)

        # Test url_get_all_by_crawl_id()
        crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
        assert len(crawl_urls) == 0

        # Test url_count_incoming_for_resource()
        uc1 = delegate.url_count_incoming_for_resource(page.id)
        assert uc1 == 0

        # Test url_count_internal_full()
        cif = delegate.url_count_internal_full(crawl.id)
        assert cif == 0

        # url1: internal, already claimed (IN_PROGRESS)
        url1 = Url()
        url1.src_resource_id = page.id
        url1.url = '/team'
        url1.absolute_url = 'https://scriptoid.com/team'
        url1.type = Url.TYPE_INTERNAL
        url1.crawl_id = crawl.id
        url1.job_status = Url.JOB_STATUS_IN_PROGRESS
        lid1 = delegate.url_create(url1)
        assert url1.id > 0
        assert lid1 == url1.id

        # url2: internal, links back to the same page (src and dst set)
        url2 = Url()
        url2.src_resource_id = page.id
        url2.dst_resource_id = page.id
        url2.url = '/contact'
        url2.absolute_url = 'https://scriptoid.com/index.php'
        url2.type = Url.TYPE_INTERNAL
        url2.crawl_id = crawl.id
        delegate.url_create(url2)
        assert url2.id > 0

        # url3: internal, incoming-only (dst set, no src)
        url3 = Url()
        url3.dst_resource_id = page.id
        url3.url = '/jobs'
        url3.absolute_url = 'https://scriptoid.com/jobs.php'
        url3.type = Url.TYPE_INTERNAL
        url3.crawl_id = crawl.id
        delegate.url_create(url3)
        assert url3.id > 0

        # Test url_count_incoming_for_resource()
        uc1 = delegate.url_count_incoming_for_resource(page.id)
        assert uc1 == 1

        # Test url_get_by_id()
        u1 = delegate.url_get_by_id(url1.id)
        assert u1.id == url1.id

        # Test url_is_present() — url2's absolute_url now exists
        p1 = delegate.url_is_present('https://scriptoid.com/index.php',
                                     crawl.id)
        assert p1

        # Test url_get_all_by_crawl_id()
        crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
        assert len(crawl_urls) == 3

        # Test first unvisited link (url1 is IN_PROGRESS, so url2 is first)
        l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
        assert l1.id == url2.id, 'l1.id = {} and url.id = {}'.format(
            l1.id, url2.id)

        # Test url_get_all_unvisited()
        unvisited1 = delegate.url_get_all_unvisited(crawl.id)
        assert len(unvisited1) == 2

        # Test url_count_unvisited()
        n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
        assert n1 == 2, 'n1 is {}'.format(n1)

        n2 = delegate.url_count_visited(crawl_id=crawl.id)
        assert n2 == 0, 'Actually n2 is {}'.format(n2)

        # Marking url1 visited must not affect the unvisited set (it was IN_PROGRESS)
        url1.job_status = Url.JOB_STATUS_VISITED
        delegate.url_update(url1)
        l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
        assert l1.id == url2.id

        n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
        assert n1 == 2, 'n1 is {}'.format(n1)

        n2 = delegate.url_count_visited(crawl_id=crawl.id)
        assert n2 == 1, 'n2 is {}'.format(n2)

        # Test url_count_internal_full()
        cif = delegate.url_count_internal_full(crawl.id)
        assert cif == 1

        # Test url_count_pending()
        ucp = delegate.url_count_pending(crawl.id)
        assert ucp == 2

        # Test url_delete_all()
        delegate.url_delete_all()
        links = delegate.url_get_all()
        assert len(links) == 0, "When actually there are {}".format(len(links))

        # Test url_count_external()
        uce = delegate.url_count_external(crawl.id)
        assert uce == 0

        url4 = Url()
        url4.dst_resource_id = page.id
        url4.url = '/jobs'
        url4.absolute_url = 'https://scriptoid.com/jobs.php'
        url4.type = Url.TYPE_EXTERNAL
        url4.crawl_id = crawl.id
        delegate.url_create(url4)
        assert url4.id > 0

        uce = delegate.url_count_external(crawl.id)
        assert uce == 1

        assert delegate.url_delete_by_id(url4.id)

        # Test a cascade delete from parent Page resource_delete_all() to Link
        url = Url()
        url.src_resource_id = page.id
        url.url = '/contact'
        url.absolute_url = 'https://scriptoid.com/index.php'
        url.type = Url.TYPE_INTERNAL
        url.crawl_id = crawl.id
        delegate.url_create(url)
        assert url.id > 0

        delegate.resource_delete_all()
        links = delegate.url_get_all()
        assert len(links) == 0, "When actually there are {}".format(len(links))

        # Clean up
        delegate.resource_delete_all()
        delegate.crawl_delete_all()
        delegate.site_delete_all()

        # BUG FIX: message said "test_page" although this is test_link.
        print("test_link done")