Example #1
    def _request_file(url, base_url):

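        # Request the URL with pycurl and capture its response headers,
        # retrying up to MAX_RETRIES times on curl errors.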
        retries = HttpDirectory.MAX_RETRIES
        while retries > 0:
            try:
                curl = HttpDirectory._curl_handle()
                raw_headers = BytesIO()
                curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore"))
                curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write)
                curl.perform()

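                # Strip the base URL prefix so only the server-relative path remains.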
                stripped_url = url[len(base_url) - 1:]
                headers = HttpDirectory._parse_dict_header(
                    raw_headers.getvalue().decode("utf-8", errors="ignore"))
                raw_headers.close()

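                # Build the File entry from the Last-Modified and Content-Length headers.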
                path, name = os.path.split(stripped_url)
                date = headers.get("Last-Modified", "1970-01-01")
                curl.close()
                return File(path=unquote(path).strip("/"),
                            name=unquote(name),
                            size=int(headers.get("Content-Length", -1)),
                            mtime=int(parse_date(date).timestamp()),
                            is_dir=False)
            except pycurl.error:
                retries -= 1

        logger.debug("TimeoutError - _request_file")
        raise TimeoutError
Example #2
    def _process_listings(self, url: str, in_q: Queue, files_q: Queue):

        directory = RemoteDirectoryFactory.get_directory(url)
        timeout_retries = 20  # If any worker thread reaches 20 retries, the whole queue is emptied

        while directory:
            try:
                path = in_q.get(timeout=2000)
            except Empty:
                logger.debug("in_q is Empty")
                directory.close()
                break

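            # A None path is the sentinel that tells this worker to stop.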
            if path is None:
                break

            try:
                path_id, listing = directory.list_dir(path)
                if len(listing) > 0 and path_id not in self.crawled_paths:
                    self.crawled_paths.add(path_id)

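                    # Queue subdirectories for further crawling and push files to the output queue.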
                    for f in listing:
                        if f.is_dir:
                            in_q.put(urljoin(f.path, f.name))
                        else:
                            files_q.put(f)
                    logger.debug("LISTED " + urljoin(self.url, path))
            except TooManyConnectionsError:
                logger.debug(
                    "Too many connections, this thread will be killed and path resubmitted"
                )
                # Kill worker and resubmit listing task
                directory.close()
                in_q.put(path)
                # TODO: If all workers are killed the queue will never get processed and
                # TODO: the crawler will be stuck forever
                break
            except TimeoutError:
                logger.error("Directory listing timed out, " +
                             str(timeout_retries) + " retries left")
                if timeout_retries > 0:
                    timeout_retries -= 1
                    in_q.put(path)
                else:
                    logger.error("Dropping website " + url)
                    self.status_code = "Timeout during website listing"
                    directory.close()

                    logger.debug("Emptying queue")
                    while True:
                        try:
                            in_q.get_nowait()
                            in_q.task_done()
                        except Empty:
                            break
                    logger.debug("Emptied queue")
                    break
            finally:
                in_q.task_done()
Example #3
    def _fetch_body(self, url: str):
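        # Download the page body with pycurl, retrying up to MAX_RETRIES times on curl errors.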
        retries = HttpDirectory.MAX_RETRIES
        while retries > 0:
            try:
                content = BytesIO()
                self.curl.setopt(pycurl.URL,
                                 url.encode("utf-8", errors="ignore"))
                self.curl.setopt(pycurl.WRITEDATA, content)
                self.curl.perform()

                return content.getvalue().decode("utf-8", errors="ignore")
            except pycurl.error:
                self.close()
                retries -= 1

        logger.debug("TimeoutError - _fetch_body")
        raise TimeoutError
Example #4
    def stop_when_connected(self):
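        # Keep trying to connect until a connection is established, a fatal FTP error
        # occurs, or max_attempts is reached.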
        failed_attempts = 0
        while failed_attempts < self.max_attempts:
            try:
                self._connect()
                logger.debug("New FTP connection @ " + self.base_url)
                return True
            except ftputil.error.FTPError as e:

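                # 530 (not logged in) and 421 (service not available) will not be
                # fixed by retrying, so give up immediately.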
                if e.errno == 530 or e.errno == 421:
                    break

                failed_attempts += 1
                print("Connection error; reconnecting..." + e.strerror + " " +
                      str(e.errno))
                time.sleep(2)
        return False
Example #5
    def error(self, message):
        logger.debug("HTML Parser error: " + message)
Example #6
    def close(self):
        if self.ftp:
            self.ftp.close()
            self.ftp = None
        logger.debug("Closing FtpRemoteDirectory for " + self.base_url)
Example #7
    def reconnect(self):
        if self.ftp:
            self.ftp.close()
            success = self.stop_when_connected()
            logger.debug("Reconnecting to FTP server " + self.base_url +
                         (" (OK)" if success else " (ERR)"))