Example #1
    def _request_file(url, base_url):

        retries = HttpDirectory.MAX_RETRIES
        while retries > 0:
            curl = HttpDirectory._curl_handle()
            raw_headers = BytesIO()
            try:
                # Fetch the response headers only; the body is never read
                curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore"))
                curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write)
                curl.perform()

                # File path relative to the base URL
                stripped_url = url[len(base_url) - 1:]
                headers = HttpDirectory._parse_dict_header(
                    raw_headers.getvalue().decode("utf-8", errors="ignore"))

                path, name = os.path.split(stripped_url)
                date = headers.get("Last-Modified", "1970-01-01")
                return File(path=unquote(path).strip("/"),
                            name=unquote(name),
                            size=int(headers.get("Content-Length", -1)),
                            mtime=int(parse_date(date).timestamp()),
                            is_dir=False)
            except pycurl.error:
                retries -= 1
            finally:
                # Release the handle and buffer even when the request fails
                curl.close()
                raw_headers.close()

        logger.debug("TimeoutError - _request_file")
        raise TimeoutError
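
The snippet leans on two helpers that are not shown, `HttpDirectory._curl_handle()` and `HttpDirectory._parse_dict_header()`. A minimal sketch of what they might look like, assuming a header-only (NOBODY) transfer; the timeout and retry values are illustrative:

    import pycurl

    class HttpDirectory:
        MAX_RETRIES = 3   # value assumed for illustration
        TIMEOUT = 25      # hypothetical per-request timeout, in seconds

        @staticmethod
        def _curl_handle():
            # Header-only handle: NOBODY skips the response body entirely
            curl = pycurl.Curl()
            curl.setopt(pycurl.NOBODY, 1)
            curl.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT)
            return curl

        @staticmethod
        def _parse_dict_header(raw):
            # "Name: value" lines -> dict; the status line has no ": "
            # separator and is skipped
            return dict(line.split(": ", maxsplit=1)
                        for line in raw.splitlines() if ": " in line)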
Example #2
    def list_dir(self, path):

        # Name of the current directory (path is expected to end with "/")
        current_dir_name = path[path.rstrip("/").rfind("/") + 1:-1]
        path_identifier = hashlib.md5(current_dir_name.encode())
        path_url = urljoin(self.base_url, path)
        body = self._fetch_body(path_url)
        anchors = self._parse_links(body)

        urls_to_request = []
        files = []

        for anchor in anchors:
            if self._should_ignore(self.base_url, path, anchor):
                continue

            if self._isdir(anchor):
                # Directories need no extra request; record them directly
                directory = File(
                    name=anchor.href,  # todo handle external links here
                    mtime=0,
                    size=0,
                    path=path,
                    is_dir=True)
                path_identifier.update(bytes(directory))
                files.append(directory)
            else:
                urls_to_request.append(urljoin(path_url, anchor.href))

        # Fetch file metadata and fold every entry into the listing hash
        for file in self.request_files(urls_to_request):
            path_identifier.update(bytes(file))
            files.append(file)

        return path_identifier.hexdigest(), files
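
`path_identifier.update(bytes(directory))` only works if the File model defines `__bytes__`. The real class is not shown; here is a minimal sketch under that assumption (the field choice and separator are guesses):

    from dataclasses import dataclass

    @dataclass
    class File:
        name: str
        mtime: int
        size: int
        path: str
        is_dir: bool

        def __bytes__(self):
            # Stable serialization of the fields that should affect the
            # directory hash, so identical listings hash identically
            return "|".join((self.name, str(self.size),
                             str(self.mtime))).encode() + b"\n"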
Example #3
    def list_dir(self, path):

        path_url = self.base_url + path.strip("/") + "/"
        body = self._stream_body(path_url)
        if not body:
            # Nothing could be fetched; end the generator without results
            return
        anchors = self._parse_links(body)

        urls_to_request = []

        for anchor in anchors:
            if self._should_ignore(self.base_url, anchor):
                continue

            if self._isdir(anchor):
                # Directories are yielded immediately; no extra request needed
                yield File(name=anchor.href,
                           mtime=None,
                           size=None,
                           path=path,
                           is_dir=True)
            else:
                urls_to_request.append(path_url + anchor.href)

        # Fetch file metadata in bulk and stream the results to the caller
        for file in self.request_files(urls_to_request):
            yield file
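
Unlike Example #2, this variant is a generator: directories are yielded as soon as their anchors are parsed, and files as `request_files` produces their metadata, so callers can stream very large listings. A hypothetical call site (the constructor signature is assumed):

    # Hypothetical wiring; the real HttpDirectory constructor is not shown
    directory = HttpDirectory("http://example.com/")
    for f in directory.list_dir("pub/linux/"):
        print(("d " if f.is_dir else "f ") + f.name)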
Example #4
    def _request_file(self, url):

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            retries = HttpDirectory.MAX_RETRIES
            while retries > 0:
                try:
                    # HEAD request: only the headers are needed, never the body
                    r = self.session.head(url,
                                          allow_redirects=False,
                                          timeout=40)

                    # File path relative to the base URL
                    stripped_url = url[len(self.base_url) - 1:]

                    path, name = os.path.split(stripped_url)
                    date = r.headers.get("Last-Modified", "1970-01-01")
                    return File(path=unquote(path).strip("/"),
                                name=unquote(name),
                                size=int(r.headers.get("Content-Length", -1)),
                                mtime=int(parse_date(date).timestamp()),
                                is_dir=False)
                except RequestException:
                    # Drop pooled connections before the next attempt
                    self.session.close()
                    retries -= 1

            return None
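
This requests-based variant assumes a persistent `requests.Session` on `self`, and, unlike the pycurl version in Example #1, it returns `None` after exhausting its retries instead of raising `TimeoutError`, so callers must filter out failed entries. A minimal sketch of the session setup it presumes (the header value is illustrative):

    import requests

    class HttpDirectory:
        MAX_RETRIES = 3  # value assumed for illustration

        def __init__(self, base_url):
            self.base_url = base_url
            # One pooled session, reused across all HEAD requests
            self.session = requests.Session()
            self.session.headers.update({"User-Agent": "od-crawler"})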
Example #5
    def list_dir(self, path):
        if not self.ftp:
            # No connection - assume it was dropped because there were too many
            raise TooManyConnectionsError()
        results = []
        failed_attempts = 0
        while failed_attempts < self.max_attempts:
            try:
                file_names = self.ftp.listdir(path)

                for file_name in file_names:
                    file_path = os.path.join(path, file_name)
                    stat = self.try_stat(file_path)
                    is_dir = self.ftp.path.isdir(file_path)

                    results.append(
                        File(name=os.path.join(file_name, "")
                             if is_dir else file_name,
                             mtime=stat.st_mtime,
                             size=-1 if is_dir else stat.st_size,
                             is_dir=is_dir,
                             path=path.strip("/") if not is_dir else path))
                return path, results
            except ftputil.error.ParserError as e:
                logger.error("TODO: fix parsing error: " + e.strerror + " @ " +
                             str(e.file_name))
                break
            # PermanentError is a subclass of FTPError, so it must be caught
            # first or this branch would be unreachable
            except ftputil.error.PermanentError as e:
                if e.errno == 530:
                    # Treated as "too many simultaneous connections"
                    raise TooManyConnectionsError()
                if e.errno is None:
                    failed_attempts += 1
                    self.reconnect()
                else:
                    logger.error(str(e.strerror) + " errno:" + str(e.errno))
                    break
            except ftputil.error.FTPError as e:
                if e.errno in FtpDirectory.CANCEL_LISTING_CODE:
                    break
                failed_attempts += 1
                self.reconnect()
            except Exception as e:
                failed_attempts += 1
                self.reconnect()
                logger.error("Exception while processing FTP listing for " +
                             self.base_url + ": " + str(e))

        return path, []
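
`try_stat` and `reconnect` are assumed helpers of the same class and are not shown. Sketches under reasonable assumptions: `try_stat` as a thin wrapper around ftputil's `stat`, and `reconnect` re-opening the session (the `self.host` attribute and the anonymous credentials here are illustrative):

    def try_stat(self, path):
        # Assumed thin wrapper: ftputil errors propagate up to the
        # handlers already present in list_dir
        return self.ftp.stat(path)

    def reconnect(self):
        # Assumed helper: drop the stale session and open a fresh one
        if self.ftp:
            self.ftp.close()
        self.ftp = ftputil.FTPHost(self.host, "anonymous",
                                   "anonymous@example.com")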
Example #6
    def list_dir(self, path) -> list:
        if not self.ftp:
            # No connection - assume it was dropped because there were too many
            raise TooManyConnectionsError()
        results = []
        failed_attempts = 0
        while failed_attempts < self.max_attempts:
            try:
                file_names = self.ftp.listdir(path)

                for file_name in file_names:
                    file_path = os.path.join(path, file_name)
                    stat = self.try_stat(file_path)
                    is_dir = self.ftp.path.isdir(file_path)

                    results.append(
                        File(name=file_name,
                             mtime=stat.st_mtime,
                             size=-1 if is_dir else stat.st_size,
                             is_dir=is_dir,
                             path=path))
                return results
            except ftputil.error.ParserError as e:
                print("TODO: fix parsing error: " + e.strerror + " @ " +
                      str(e.file_name))
                break
            except ftputil.error.FTPError as e:
                # PermanentError and FTPOSError are both subclasses of
                # FTPError, so one handler covers every ftputil failure
                if e.errno == 530:
                    # Treated as "too many simultaneous connections"
                    raise TooManyConnectionsError()
                if isinstance(e, ftputil.error.FTPOSError) \
                        and e.strerror == "timed out":
                    failed_attempts += 1
                    continue
                # Count unknown errors too, so the retry loop terminates
                failed_attempts += 1
            except Exception as e:
                # TODO remove that debug info
                print("ERROR:" + str(e))
                print(type(e))
                raise e

        return []
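
For reference, a hypothetical call site; the `FtpDirectory` constructor is not shown, so its signature here is assumed. Note that this variant returns a plain list, whereas Example #5 returns a `(path, results)` tuple:

    # Hypothetical wiring; the real constructor is not shown
    ftp_dir = FtpDirectory("ftp.example.com")
    for f in ftp_dir.list_dir("/pub/"):
        print(("d " if f.is_dir else "f ") + f.name)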