Example #1
    def __init__(self, settings=None, gui_mode=False, lock=None, stats=True):
        self.url_queue = queue.Queue()
        self.data_queue = queue.Queue(maxsize=25)
        self.gui_url_queue = []
        self.gui_mode = gui_mode
        self.lock = lock
        self.stats = stats

        self.list_mode_urls = None
        self.url_attempts = {}
        self.retries = 5

        self.settings = settings
        self.gf = gf(self.settings, columns=None)
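        # Event flags used to signal the crawl's state across threads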
        self.crawl_running = Event()
        self.crawl_completed = Event()
        self.crawl_timed_out = Event()
        self.worker_status = []
        self.db_file = None

        self.rate_limit_delay = 0
        self.current_urls_per_second = 0
        self.urls_crawled = 0
        self.urls_total = 0
        self.HEADERS = ""
        self.robots_txt = ""
        self.columns = None

        self.consumer_thread = None

        self.session = None
        self.header_only = False
Example #2
    def resume_crawl(self):
        print("Resuming crawl ...")
        self.init_crawl_headers()
        # Reinit session
        self.init_session()

        self.reset_crawl()

        db = self._connect_to_db()

        self.urls_crawled = db.get_urls_crawled()
        self.urls_total = db.get_total_urls()

        # Reset response object
        self.gf = gf(self.settings, columns=db.get_columns())

        if self.settings['MODE'] != 'List':
            response = self.request_robots_txt(
                self.settings.get('STARTING_URL'))
            if response == 'SKIP_ME':
                self.crawl_timed_out.set()
                self.crawl_running.set()
                db.close()
                return

        # Reinit URL queue
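        # count=False: these URLs are presumably already reflected in the
        # urls_total figure loaded from the database above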
        self.add_to_url_queue(db.get_url_queue(), count=False)

        db.commit()
        db.close()

        self.start_consumer()
        Thread(target=self.spawn_threads).start()
Example #3
    def resume_crawl(self) -> None:
        """Resumes a crawl using the settings from the connected database."""
        print('Resuming crawl ...')
        self.reset_crawl()
        db = self._connect_to_db()
        self.urls_crawled = db.get_urls_crawled()
        self.urls_total = db.get_total_urls()

        # Create a new response object with the columns from the loaded database
        self.gf = gf(self.settings, columns=db.get_columns())

        if self.settings['MODE'] != 'List':
            response = self.request_robots_txt(
                self.settings.get('STARTING_URL'))
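            # request_robots_txt() appears to return a string sentinel
            # (e.g. 'SKIP_ME') when the host is unreachable; abort in that case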
            if isinstance(response, str):
                self.crawl_timed_out.set()
                self.crawl_running.set()
                db.close()
                return

        # Reinit URL queue
        self.add_to_url_queue(db.get_url_queue(), count=False)

        db.close()

        self.start_consumer()
        Thread(target=self.spawn_threads).start()
Example #4
    def start_crawl(self):
        print("Crawl started")
        self.init_crawl_headers()
        self.init_session()

        # Set speed limit
        if int(self.settings.get("URLS_PER_SECOND", 0)) > 0:
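            # Presumably used as a per-request delay: with THREADS workers each
            # pausing this many seconds per request, overall throughput stays at
            # roughly URLS_PER_SECOND requests per second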
            self.parallel_requests_limit = (
                1 / int(self.settings["URLS_PER_SECOND"])) * int(
                    self.settings["THREADS"])

        db = self._connect_to_db()
        db.create()

        # Reset response object
        self.gf = gf(self.settings, columns=None)

        # Keep the crawler's column list and the parser's item list in sync
        # with the columns stored in the crawl database
        self.columns = self.gf.all_items = db.get_columns()

        if self.settings["MODE"] == "Spider":
            self.settings['STARTING_URL'] = self.gf.url_components_to_str(
                self.gf.parse_url(self.settings['STARTING_URL']))
            self.settings["ROOT_DOMAIN"] = self.gf.get_domain(
                self.settings['STARTING_URL'])
            response = self.crawl_url(self.settings['STARTING_URL'])

            # Check if we are dealing with a reachable host
            if response == 'SKIP_ME':
                self.crawl_timed_out.set()
                self.crawl_running.set()
                db.close()
                return

            data = self.response_to_data(response)

            self.add_to_data_queue(data)
            self.request_robots_txt(data['url'])

        elif self.settings["MODE"] == "List":
            if len(self.list_mode_urls) > 0:
                self.add_to_url_queue(self.list_mode_urls)
                db.insert_new_urls(self.list_mode_urls)
            else:
                print("ERROR: No urls to list crawl found!")

        db.commit()
        db.close()

        self.start_consumer()
        Thread(target=self.spawn_threads).start()
Example #5
    def start_crawl(self) -> None:
        """Starts a new crawl using the config from self.settings"""
        print('Crawl started')
        self.init_crawl_headers()
        self.init_session()

        # Set speed limit
        if int(self.settings.get('URLS_PER_SECOND', 0)) > 0:
            self.parallel_requests_limit = (
                1 / int(self.settings['URLS_PER_SECOND'])) * int(
                    self.settings['THREADS'])

        db = self._connect_to_db()
        db.create()
        db.insert_config(self.settings)

        # Reset response object
        self.gf = gf(self.settings, columns=None)

        self.columns = self.gf.all_items = db.get_columns()

        if self.settings['MODE'] == 'Spider':
            self.settings['ROOT_DOMAIN'] = self.gf.get_domain(
                self.settings['STARTING_URL'])
            response = self.crawl_url(self.settings['STARTING_URL'])

            # Check if we are dealing with a reachable host
            if isinstance(response, str):
                self.crawl_timed_out.set()
                self.crawl_running.set()
                db.close()
                return

            self.request_robots_txt(response.url)
            data = self.response_to_data(response)

            self.add_to_data_queue(data)

        elif self.settings['MODE'] == 'List':
            if len(self.list_mode_urls) > 0:
                self.add_to_url_queue(self.list_mode_urls)
                db.insert_new_urls(self.list_mode_urls)
            else:
                print('ERROR: No urls to list crawl found!')

        db.close()

        self.start_consumer()
        Thread(target=self.spawn_threads).start()
Example #6
    def reset_crawl(self):
        # Reset queue
        if self.settings['MODE'] != 'List':
            self.data_queue = queue.Queue(maxsize=25)
            self.url_queue = queue.Queue()
            self.gui_url_queue = []
            self.url_attempts = {}

        self.gf = gf(self.settings, columns=None)

        self.crawl_running.clear()
        self.crawl_completed.clear()
        self.crawl_timed_out.clear()

        self.urls_crawled = 0
        self.urls_total = 0
Example #7
    def reset_crawl(self) -> None:
        """Reset crawl to default state in preparation for a new crawl. """
        # Reset queue
        if self.settings['MODE'] != 'List':
            self.data_queue = queue.Queue(maxsize=25)
            self.url_queue = queue.Queue()
            self.gui_url_queue = []
            self.url_attempts = {}

        self.init_crawl_headers()
        self.init_session()

        self.active_workers = 0

        self.gf = gf(self.settings, columns=None)

        self.crawl_running.clear()
        self.crawl_completed.clear()
        self.crawl_timed_out.clear()

        self.urls_crawled = 0
        self.urls_total = 0
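
Taken together, the examples outline the crawler's lifecycle: construct the object with a settings dict, call start_crawl() for a fresh crawl, and resume_crawl() to pick up an interrupted one from its database. A minimal usage sketch, assuming a hypothetical class name GFlareCrawler and only the settings keys the examples actually read (MODE, STARTING_URL, THREADS, URLS_PER_SECOND):

settings = {
    'MODE': 'Spider',
    'STARTING_URL': 'https://example.com',
    'THREADS': 5,
    'URLS_PER_SECOND': 0,  # 0 leaves the speed limit disabled
}

crawler = GFlareCrawler(settings=settings)  # hypothetical class name
crawler.start_crawl()  # creates the crawl database and seeds the URL queue

# In a later session, the same settings could resume the interrupted crawl:
# crawler.resume_crawl()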