def _request_file(url, base_url):
    retries = HttpDirectory.MAX_RETRIES
    while retries > 0:
        try:
            curl = HttpDirectory._curl_handle()
            raw_headers = BytesIO()
            curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore"))
            # Collect the raw response headers; the body is not needed here
            curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write)
            curl.perform()

            stripped_url = url[len(base_url) - 1:]
            headers = HttpDirectory._parse_dict_header(raw_headers.getvalue().decode("utf-8", errors="ignore"))
            raw_headers.close()

            path, name = os.path.split(stripped_url)
            date = headers.get("Last-Modified", "1970-01-01")
            curl.close()
            return File(
                path=unquote(path).strip("/"),
                name=unquote(name),
                size=int(headers.get("Content-Length", -1)),
                mtime=int(parse_date(date).timestamp()),
                is_dir=False
            )
        except pycurl.error:
            retries -= 1

    # Every attempt failed
    logger.debug("TimeoutError - _request_file")
    raise TimeoutError
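# Hedged sketch, not part of the original listing: minimal stand-ins for the two
# HttpDirectory helpers that _request_file above relies on. The curl options and
# the header-parsing behaviour are assumptions, not the project's actual code.
import pycurl

def _curl_handle(timeout=30):
    # A handle that only retrieves response headers (NOBODY makes a HEAD-like request).
    curl = pycurl.Curl()
    curl.setopt(pycurl.NOBODY, 1)
    curl.setopt(pycurl.TIMEOUT, timeout)
    curl.setopt(pycurl.FOLLOWLOCATION, 0)
    return curl

def _parse_dict_header(raw_headers):
    # Turn "Key: value" lines from the raw header block into a plain dict.
    headers = {}
    for line in raw_headers.splitlines():
        if ":" in line:
            key, value = line.split(":", 1)
            headers[key.strip()] = value.strip()
    return headers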
def list_dir(self, path):
    current_dir_name = path[path.rstrip("/").rfind("/") + 1:-1]
    path_identifier = hashlib.md5(current_dir_name.encode())
    path_url = urljoin(self.base_url, path, "")
    body = self._fetch_body(path_url)
    anchors = self._parse_links(body)

    urls_to_request = []
    files = []

    for anchor in anchors:
        if self._should_ignore(self.base_url, path, anchor):
            continue

        if self._isdir(anchor):
            directory = File(
                name=anchor.href,  # todo handle external links here
                mtime=0,
                size=0,
                path=path,
                is_dir=True
            )
            path_identifier.update(bytes(directory))
            files.append(directory)
        else:
            urls_to_request.append(urljoin(path_url, anchor.href))

    for file in self.request_files(urls_to_request):
        path_identifier.update(bytes(file))
        files.append(file)

    return path_identifier.hexdigest(), files
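# Hedged sketch (assumption): a File record that supports bytes(), which is what
# lets list_dir above fold every entry into the md5 path_identifier so each
# directory gets a stable fingerprint that changes whenever its contents change.
from collections import namedtuple

class File(namedtuple("File", ["name", "size", "mtime", "path", "is_dir"])):

    def __bytes__(self):
        # Serialise only the fields that matter for change detection.
        return "|".join((self.name, str(self.size), str(self.mtime), "D" if self.is_dir else "F")).encode()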
def list_dir(self, path):
    path_url = self.base_url + path.strip("/") + "/"
    body = self._stream_body(path_url)
    if not body:
        return None
    anchors = self._parse_links(body)

    urls_to_request = []

    for anchor in anchors:
        if self._should_ignore(self.base_url, anchor):
            continue

        if self._isdir(anchor):
            yield File(
                name=anchor.href,
                mtime=None,
                size=None,
                path=path,
                is_dir=True
            )
        else:
            urls_to_request.append(path_url + anchor.href)

    for file in self.request_files(urls_to_request):
        yield file
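# Usage sketch (hypothetical names): the generator variant above lets the caller
# consume directory entries as soon as they are known, instead of waiting for all
# of the per-file HEAD requests to finish before anything is returned.
def crawl_directory(http_dir, path):
    # http_dir is assumed to be an HttpDirectory instance, path a server-relative path
    for file in http_dir.list_dir(path):
        print(file.path, file.name, file.size, file.is_dir)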
def _request_file(self, url):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        retries = HttpDirectory.MAX_RETRIES
        while retries > 0:
            try:
                r = self.session.head(url, allow_redirects=False, timeout=40)

                stripped_url = url[len(self.base_url) - 1:]

                path, name = os.path.split(stripped_url)
                date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
                return File(
                    path=unquote(path).strip("/"),
                    name=unquote(name),
                    size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
                    mtime=int(parse_date(date).timestamp()),
                    is_dir=False
                )
            except RequestException:
                self.session.close()
                retries -= 1

        return None
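# Hedged sketch (assumption): how request_files, called from the list_dir variants
# above, could fan the per-file lookups out over a small thread pool and silently
# drop the URLs for which _request_file gave up and returned None. Method-style
# sketch; pool size and error handling are guesses, not the project's actual code.
from multiprocessing.pool import ThreadPool

def request_files(self, urls_to_request):
    pool = ThreadPool(processes=10)
    try:
        files = pool.map(self._request_file, urls_to_request)
    finally:
        pool.close()
    return [file for file in files if file is not None]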
def list_dir(self, path):
    if not self.ftp:
        # No connection - assuming that connection was dropped because too many
        raise TooManyConnectionsError()

    results = []
    failed_attempts = 0
    while failed_attempts < self.max_attempts:
        try:
            file_names = self.ftp.listdir(path)

            for file_name in file_names:
                file_path = os.path.join(path, file_name)
                stat = self.try_stat(file_path)
                is_dir = self.ftp.path.isdir(file_path)

                results.append(File(
                    name=os.path.join(file_name, "") if is_dir else file_name,
                    mtime=stat.st_mtime,
                    size=-1 if is_dir else stat.st_size,
                    is_dir=is_dir,
                    path=path.strip("/") if not is_dir else path))
            return path, results
        except ftputil.error.ParserError as e:
            logger.error("TODO: fix parsing error: " + e.strerror + " @ " + str(e.file_name))
            break
        except ftputil.error.PermanentError as e:
            # PermanentError is a subclass of FTPError, so it must be handled
            # first or this branch would never be reached.
            if e.errno == 530:
                raise TooManyConnectionsError()
            if e.errno is None:
                failed_attempts += 1
                self.reconnect()
            else:
                print(str(e.strerror) + " errno:" + str(e.errno))
                break
        except ftputil.error.FTPError as e:
            if e.errno in FtpDirectory.CANCEL_LISTING_CODE:
                break
            failed_attempts += 1
            self.reconnect()
        except Exception as e:
            failed_attempts += 1
            self.reconnect()
            logger.error("Exception while processing FTP listing for " + self.base_url + ": " + str(e))

    return path, []
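# Hedged sketch (assumption): the connection plumbing that list_dir above expects.
# self.ftp is taken to be an ftputil.FTPHost and self.host a hostname attribute;
# reconnect() drops the old handle and retries an anonymous login, backing off
# briefly so a busy server gets a chance to free up a slot. Credentials and the
# session options are placeholders, not the project's actual values.
import time
import ftputil
import ftputil.session

def reconnect(self):
    if self.ftp:
        self.ftp.close()
        self.ftp = None
    try:
        session_factory = ftputil.session.session_factory(use_passive_mode=True)
        self.ftp = ftputil.FTPHost(self.host, "anonymous", "anonymous@example.com",
                                   session_factory=session_factory)
    except ftputil.error.FTPError:
        time.sleep(2)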
def list_dir(self, path) -> list:
    if not self.ftp:
        # No connection - assuming that connection was dropped because too many
        raise TooManyConnectionsError()

    results = []
    failed_attempts = 0
    while failed_attempts < self.max_attempts:
        try:
            file_names = self.ftp.listdir(path)
            for file_name in file_names:
                stat = self.try_stat(os.path.join(path, file_name))
                is_dir = self.ftp.path.isdir(os.path.join(path, file_name))

                results.append(File(
                    name=file_name,
                    mtime=stat.st_mtime,
                    size=-1 if is_dir else stat.st_size,
                    is_dir=is_dir,
                    path=path))
            return results
        except ftputil.error.ParserError as e:
            print("TODO: fix parsing error: " + e.strerror + " @ " + str(e.file_name))
            break
        except ftputil.error.FTPOSError as e:
            # Count every OS-level failure, not just timeouts, so the loop cannot
            # spin forever when the same error keeps recurring.
            failed_attempts += 1
            if e.strerror == "timed out":
                continue
        except ftputil.error.FTPError as e:
            if e.errno == 530:
                raise TooManyConnectionsError()
        except Exception as e:
            # TODO remove that debug info
            print("ERROR:" + str(e))
            print(type(e))
            raise e

    return []
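# Hedged sketch (assumption): try_stat as both FTP list_dir variants use it - a
# stat wrapper that survives entries the server lists but whose LIST line cannot
# be parsed, so one bad entry does not abort the whole directory. The fallback
# value is a guess, not the project's actual behaviour.
import os
import ftputil.error

def try_stat(self, path):
    try:
        return self.ftp.stat(path)
    except ftputil.error.ParserError as e:
        logger.error("Error while stat-ing " + path + ": " + str(e))
        # Zeroed stat_result so callers can still read st_mtime and st_size.
        return os.stat_result((0,) * 10)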