def ParseReleaseFilesFromLocalMirror(self) -> list:
    """Get a list of all Index files from the Release file using the files that exist in the /mirror directory."""
    return self._ParseReleaseFiles(Settings.MirrorPath())
def _NeedUpdate(self, path: str, size: int) -> bool:
    """Determine whether a file needs updating.

    If the file exists on disk, its size is compared to that listed in the
    Package. The result of the comparison determines whether the file should
    be downloaded. If the file does not exist, it must be downloaded.

    The function can be forced to always return True via the corresponding
    setting in the Configuration file.
    """
    # Realistically, this is a weak check: the size could remain the same
    # while the contents have changed. Allow the user to force an update via
    # Settings. Ideally, the checksum listed in the Package would be compared
    # against the actual file, but that is potentially slow.
    if Settings.ForceUpdate():
        return True

    if os.path.isfile(path):
        return os.path.getsize(path) != size

    return True
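# Worked example of the check above (hedged; the path is a placeholder):
# given a Package entry listing "Size: 1024",
#
#   self._NeedUpdate("mirror/pool/main/f/foo/foo_1.0_amd64.deb", 1024)
#
# returns False only when the file exists and is exactly 1024 bytes; a
# missing file, a differing size, or an enabled ForceUpdate setting all
# yield True and trigger a download.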
def ParseIndexFilesFromLocalMirror(self) -> list[Package]:
    """Get all items listed in the Index files that exist within the /mirror directory."""
    # The Force setting needs to be enabled so that a Repository will return
    # all Index files, and not just modified ones. The dependency isn't
    # great, but this feature is an add-on and not part of the initial design.
    Settings.SetForce()

    indices = self._GetIndexFiles(True)  # All files due to Force being enabled

    fileList = []  # type: list[Package]
    for file in tqdm.tqdm(indices, position=1, unit=" index", desc="Indices ", leave=False):
        fileList += self._ProcessIndex(Settings.MirrorPath(), file, True)

    return fileList
def ParseReleaseFilesFromRemote(self) -> list:
    """Get a list of all Index files from the Release file using the files that exist in the /skel directory."""
    return self._ParseReleaseFiles(Settings.SkelPath())
def Clean(repositories: list, requiredFiles: list):
    """Remove files on disk that are not listed in any Index file."""
    # 5. Determine which files are in the mirror, but not listed in the Index files
    items = []  # type: list[str]
    logger.info("\tCompiling list of files to clean...")

    uris = {repository.Uri for repository in repositories}
    for uri in tqdm.tqdm(uris, position=0, unit=" repo", desc="Repositories ", leave=False):
        walked = []  # type: list[str]
        for root, _, files in tqdm.tqdm(os.walk(SanitiseUri(uri)), position=1, unit=" fso", desc="FSO ", leave=False, delay=0.5):
            for file in tqdm.tqdm(files, position=2, unit=" file", desc="Files ", leave=False, delay=0.5):
                walked.append(os.path.join(root, file))

        logger.debug(f"{SanitiseUri(uri)}: Walked {len(walked)} items")
        items += [x for x in walked if os.path.normpath(x) not in requiredFiles and not os.path.islink(x)]

    logger.debug(f"Found {len(items)} files which can be freed")
    for item in items:
        logger.debug(item)

    # 6. Calculate the size of the items to clean
    if items:
        logger.info("\tCalculating space savings...")
        clearSize = 0
        for file in tqdm.tqdm(items, unit=" files", leave=False):
            clearSize += os.path.getsize(file)
    else:
        logger.info("\tNo files eligible to clean")
        return

    if Settings.Test():
        logger.info(f"\tFound {ConvertSize(clearSize)} in {len(items)} files and directories that could be freed.")
        return

    logger.info(f"\t{ConvertSize(clearSize)} in {len(items)} files and directories will be freed...")

    # 7. Clean the files
    for item in items:
        os.remove(item)
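# A note on the membership test above (a sketch, not a change to behaviour):
# requiredFiles is a plain list, so "not in requiredFiles" is a linear scan
# for every file walked. Building a set first keeps identical semantics with
# constant-time lookups, which matters on large mirrors:
#
#   required = set(requiredFiles)
#   items += [x for x in walked if os.path.normpath(x) not in required and not os.path.islink(x)]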
def _ProcessIndex(self, indexRoot: str, index: str, skipUpdateCheck: bool) -> list[Package]:
    """Process each package listed in the Index file.

    For each Package found in the Index file, check whether the file exists
    in the local mirror, and if not, add it to the collection for download.
    If the file does exist, compare the file size to determine whether the
    file has been updated.
    """
    packageList = []  # type: list[Package]

    path = SanitiseUri(self.Uri)

    indexFile = Index(f"{indexRoot}/{index}")
    indexFile.Read()

    logger.debug(f"Processing Index file: {indexRoot}/{index}")

    packages = indexFile.GetPackages()  # type: list[dict[str, str]]

    mirror = Settings.MirrorPath() + "/" + path

    for package in tqdm.tqdm(packages, position=2, unit=" pkgs", desc="Packages ", leave=False, delay=0.5):
        if "Filename" in package:
            # Packages Index
            filename = package["Filename"]
            if filename.startswith("./"):
                filename = filename[2:]
            packageList.append(Package(os.path.normpath(f"{path}/{filename}"),
                                       int(package["Size"]),
                                       skipUpdateCheck or not self._NeedUpdate(os.path.normpath(f"{mirror}/{filename}"), int(package["Size"]))))
        else:
            # Sources Index
            for key, value in package.items():
                if "Files" in key:
                    files = list(filter(None, value.splitlines()))  # type: list[str]
                    for file in files:
                        directory = package["Directory"]
                        sourceFile = file.split(" ")
                        size = int(sourceFile[1])
                        filename = sourceFile[2]
                        if filename.startswith("./"):
                            filename = filename[2:]
                        # Use the per-file size from the Files field; Sources
                        # entries do not carry a top-level Size field.
                        packageList.append(Package(os.path.normpath(f"{path}/{directory}/{filename}"),
                                                   size,
                                                   skipUpdateCheck or not self._NeedUpdate(os.path.normpath(f"{mirror}/{directory}/{filename}"), size)))

    if [x for x in packageList if not x.Latest]:
        logger.debug(f"Packages to update ({len([x for x in packageList if not x.Latest])}):")
        for pkg in [x.Filename for x in packageList if not x.Latest]:
            logger.debug(f"\t{pkg}")

    return packageList
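# Illustrative excerpts of the two Index shapes handled above (field values
# are made up). A Packages entry supplies Filename and Size directly:
#
#   Filename: pool/main/f/foo/foo_1.0_amd64.deb
#   Size: 1024
#
# A Sources entry instead lists its files in a multi-line "Files" field of
# "<md5> <size> <name>" rows, relative to "Directory":
#
#   Directory: pool/main/f/foo
#   Files:
#    d41d8cd98f00b204e9800998ecf8427e 2048 foo_1.0.dsc
#    0cc175b9c0f1b6a831c399e269772661 40960 foo_1.0.tar.xz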
def PerformClean():
    """Perform the cleaning of files on the local repository."""
    global repositories
    global filesToKeep

    logger.info("## Clean Mode ##")
    print()

    cleanRepositories = []

    # 1. Ensure that the Repositories are actually on disk
    for repository in repositories:
        if os.path.isdir(f"{Settings.MirrorPath()}/{SanitiseUri(repository.Uri)}/dists/{repository.Distribution}"):
            cleanRepositories.append(repository)
        else:
            logger.debug(f"Repository not found on disk: {SanitiseUri(repository.Uri)} {repository.Distribution}")

    # 2. Get the Release files for each of the Repositories
    releaseFiles = []
    for repository in cleanRepositories:
        releaseFiles += repository.GetReleaseFiles()

    for releaseFile in releaseFiles:
        filesToKeep.append(os.path.normpath(SanitiseUri(releaseFile)))

    # 3. Parse the Release files for the list of Index files that are on disk
    indexFiles = []
    for repository in cleanRepositories:
        indexFiles += repository.ParseReleaseFilesFromLocalMirror()

    for indexFile in indexFiles:
        filesToKeep.append(os.path.normpath(SanitiseUri(indexFile)))

    # 4. Generate a list of all files on disk according to the Index files
    logger.info("Reading all Packages...")
    fileList = []
    for repository in tqdm.tqdm(cleanRepositories, position=0, unit=" repo", desc="Repositories ", leave=False):
        fileList += repository.ParseIndexFilesFromLocalMirror()

    # Packages potentially add duplicates - remove duplicates now
    requiredFiles = list(set(filesToKeep)) + [x.Filename for x in fileList]  # type: list[str]

    os.chdir(Settings.MirrorPath())

    Clean(cleanRepositories, requiredFiles)
def ConfigureLogger():
    """Configure the logger for the Application."""
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")

    # Console minimum level is INFO regardless of settings, to
    # prevent overloading the screen
    consoleHandler = logging.StreamHandler(sys.stdout)
    consoleHandler.setFormatter(formatter)
    consoleHandler.addFilter(LogFilter(logging.INFO))

    path = Path(Settings.GetRootPath())
    os.makedirs(path, exist_ok=True)

    fileHandler = RotatingFileHandler(f"{Settings.GetRootPath()}/refrapt.log", maxBytes=524288000, backupCount=3)
    fileHandler.setFormatter(formatter)

    root = logging.getLogger()
    root.setLevel(Settings.LogLevel())
    root.addHandler(consoleHandler)
    root.addHandler(fileHandler)
def GetRepositories(configData: list) -> list:
    """Determine the Repositories listed in the Configuration file."""
    for line in [x for x in configData if x.startswith("deb")]:
        repositories.append(Repository(line, Settings.Architecture()))

    for line in [x for x in configData if x.startswith("clean")]:
        if "False" in line:
            uri = line.split(" ")[1]
            repository = [x for x in repositories if x.Uri == uri]
            repository[0].Clean = False
            logger.debug(f"Not cleaning {uri}")

    return repositories
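# Illustrative configuration lines matched above (hedged; the URI and suite
# are placeholders). "deb" lines become Repository instances, and a
# "clean ... False" line disables cleaning for the Repository whose URI is
# the second whitespace-separated token:
#
#   deb http://deb.debian.org/debian bullseye main contrib
#   clean http://deb.debian.org/debian False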
def Download(urls: list, kind: UrlType):
    """Download a list of files of a specific type."""
    if not urls:
        logger.info("No files to download")
        return

    arguments = Downloader.CustomArguments()

    logger.info(f"Downloading {len(urls)} {kind.name} files...")

    with multiprocessing.Pool(Settings.Threads()) as pool:
        downloadFunc = partial(Downloader.DownloadUrlsProcess, kind=kind.name, args=arguments, logPath=Settings.VarPath(), rateLimit=Settings.LimitRate())
        for _ in tqdm.tqdm(pool.imap_unordered(downloadFunc, urls), total=len(urls), unit=" file"):
            pass
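# Sketch of the fan-out pattern used above (standalone, illustrative names):
# functools.partial pre-binds the keyword arguments, so the pool only needs
# to feed each worker the one varying positional argument (the URL):
#
#   from functools import partial
#   from multiprocessing import Pool
#
#   def fetch(url, kind):
#       return f"{kind}: {url}"
#
#   if __name__ == "__main__":  # required on platforms that spawn workers
#       bound = partial(fetch, kind="Index")
#       with Pool(2) as pool:
#           print(list(pool.imap_unordered(bound, ["http://a", "http://b"])))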
def Init():
    """Set up filelock for quieter logging and handle leftover lock files (Unix)."""
    # Quieten filelock's logger
    filelock.logger().setLevel(logging.CRITICAL)

    # filelock does not delete released lock files on Unix due
    # to potential race conditions in the event of multiple
    # programs trying to lock the file.
    # Refrapt only uses them to track whether a file was fully
    # downloaded or not in the event of interruption, so we
    # can clean up the files now.
    for file in os.listdir(Settings.VarPath()):
        if ".lock" in file:
            os.remove(f"{Settings.VarPath()}/{file}")
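# Sketch of where these stale ".lock" files come from (illustrative; the
# real download path lives in Downloader). filelock releases the lock but
# leaves the file behind on Unix, which is what the loop above sweeps up:
#
#   from filelock import FileLock
#   with FileLock("example.lock"):
#       pass  # protected work happens here
#   # On Unix, "example.lock" still exists on disk after release.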
def DecompressIndexFiles(self):
    """Decompress the Binary Package Indices (Binary Repository) or Source Indices (Source Repository)."""
    indexFiles = self._GetIndexFiles(True)  # Modified files only

    if not indexFiles:
        return

    indexType = None
    if self._repositoryType == RepositoryType.Bin:
        indexType = "Packages "
    elif self._repositoryType == RepositoryType.Src:
        indexType = "Sources "

    with multiprocessing.Pool(Settings.Threads()) as pool:
        for _ in tqdm.tqdm(pool.imap_unordered(UnzipFile, indexFiles), position=1, total=len(indexFiles), unit=" index", desc=indexType, leave=False):
            pass
def _GetFiles(self, modified: bool) -> list:
    """Get a list of all files based on whether they have been modified or not."""
    files = []  # type: list[str]

    for component in self._sourceCollection:
        for file in self._sourceCollection[component]:
            if modified:
                addFile = self._sourceCollection[component][file].Modified or Settings.Force()
            else:
                addFile = not self._sourceCollection[component][file].Modified or Settings.Force()

            if addFile:
                filename, _ = os.path.splitext(file)
                files.append(filename)

    return list(set(files))  # Ensure uniqueness due to stripped extension
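# Worked example of the extension stripping above (illustrative paths):
# "main/source/Sources.gz" and "main/source/Sources.xz" both reduce to
# "main/source/Sources", which is why the result is passed through set():
#
#   >>> os.path.splitext("main/source/Sources.gz")
#   ('main/source/Sources', '.gz')
#   >>> os.path.splitext("main/source/Sources.xz")
#   ('main/source/Sources', '.xz')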
def ParseUnmodifiedIndexFiles(self) -> list[Package]:
    """Read the Binary Package Indices (Binary Repository) or Source Indices (Source Repository) for all Filenames.

    Section 1.4 of the DebianRepository Format document states:
        "[The files] consist of multiple paragraphs ... and the additional
        fields defined in this section, precisely:
        - Filename (mandatory)"
        - https://wiki.debian.org/DebianRepository/Format#A.22Packages.22_Indices

    Only the filename is of interest in order to download it.
    """
    indices = self._GetIndexFiles(False)  # Unmodified files only

    fileList = []  # type: list[Package]
    for file in tqdm.tqdm(indices, position=1, unit=" index", desc="Indices ", leave=False):
        fileList += self._ProcessIndex(Settings.SkelPath(), file, True)

    return [x for x in fileList if x.Latest]
def _ParseReleaseFiles(self, rootPath: str) -> list:
    """Get a list of all Index files from the Release file.

    Section 1.2 of the DebianRepository Format document states:
        "Servers shall provide the InRelease file, and might provide a
        Release file and its signed counterparts"
        - https://wiki.debian.org/DebianRepository/Format#A.22Release.22_files

    Therefore default to parsing the InRelease file.

    For the purposes of identifying which package indexes are required for
    download, the MD5Sum, SHA1 and SHA256 fields are parsed. Section 1.2.10
    states:
        "Those fields shall be multi-line fields containing multiple lines
        of whitespace separated data. Each line shall contain:
        - The checksum of the file in the format corresponding to the field
        - The size of the file (integer >= 0)
        - The filename relative to the directory of the Release file
        Each datum must be separated by one or more whitespace characters."
        - https://wiki.debian.org/DebianRepository/Format#MD5Sum.2C_SHA1.2C_SHA256
    """
    baseUrl = self._uri + "/"
    if self._components:
        baseUrl += "dists/" + self._distribution + "/"

    inReleaseFilePath = rootPath + "/" + SanitiseUri(baseUrl) + "/InRelease"
    releaseFilePath = rootPath + "/" + SanitiseUri(baseUrl) + "/Release"

    # Default to InRelease
    releaseFileToRead = inReleaseFilePath

    if not os.path.isfile(inReleaseFilePath):
        # Fall back to Release
        releaseFileToRead = releaseFilePath

    checksums = False
    indexFiles = []

    with open(releaseFileToRead) as f:
        for line in f:
            if ("SHA256:" in line or "SHA1:" in line or "MD5Sum:" in line) and "Hash:" not in line:
                checksumType = line.replace(":", "").strip()
                checksums = False

            if checksums:
                if re.search("^ +(.*)$", line):
                    parts = list(filter(None, line.split(" ")))

                    # parts[0] = checksum
                    # parts[1] = size
                    # parts[2] = filename
                    if not len(parts) == 3:
                        logger.warning(f"Malformed checksum line '{line}' in {releaseFileToRead}")
                        continue

                    checksum = parts[0].strip()
                    filename = parts[2].rstrip()

                    if self._repositoryType == RepositoryType.Bin:
                        for architecture in self._architectures:
                            if Settings.Contents():
                                if re.match(rf"Contents-{architecture}", filename):
                                    indexFiles.append(f"{baseUrl}{filename}")

                            if self._components:
                                for component in self._components:
                                    if Settings.Contents():
                                        if re.search(rf"{component}/Contents-{architecture}", filename):
                                            indexFiles.append(f"{baseUrl}{filename}")

                                    binaryByHash = rf"{baseUrl}{component}/binary-{architecture}/by-hash/{checksumType}/{checksum}"

                                    if re.match(rf"{component}/binary-{architecture}/Release", filename):
                                        indexFiles.append(f"{baseUrl}{filename}")
                                        if Settings.ByHash():
                                            indexFiles.append(binaryByHash)

                                    if re.match(rf"{component}/binary-{architecture}/Packages", filename):
                                        indexFiles.append(f"{baseUrl}{filename}")
                                        if re.match(rf"{component}/binary-{architecture}/Packages[^./]*(\.gz|\.bz2|\.xz|$)$", filename):
                                            self._packageCollection.Add(component, architecture, f"{baseUrl}{filename}")
                                        if Settings.ByHash():
                                            indexFiles.append(binaryByHash)

                                    if re.match(rf"{component}/cnf/Commands-{architecture}", filename):
                                        indexFiles.append(f"{baseUrl}{filename}")
                                        if Settings.ByHash():
                                            indexFiles.append(rf"{baseUrl}{component}/cnf/by-hash/{checksumType}/{checksum}")

                                    i18nByHash = rf"{baseUrl}{component}/i18n/by-hash/{checksumType}/{checksum}"

                                    if re.match(rf"{component}/i18n/cnf/Commands-{architecture}", filename):
                                        indexFiles.append(f"{baseUrl}{filename}")
                                        if Settings.ByHash():
                                            indexFiles.append(i18nByHash)

                                    if re.match(rf"{component}/i18n/Index", filename):
                                        indexFiles.append(f"{baseUrl}{filename}")
                                        if Settings.ByHash():
                                            indexFiles.append(i18nByHash)

                                    for language in Settings.Language():
                                        if re.match(rf"{component}/i18n/Translation-{language}", filename):
                                            indexFiles.append(f"{baseUrl}{filename}")
                                            if Settings.ByHash():
                                                indexFiles.append(i18nByHash)

                                    if re.match(rf"{component}/dep11/(Components-{architecture}\.yml|icons-[^./]+\.tar)", filename):
                                        indexFiles.append(f"{baseUrl}{filename}")
                                        if Settings.ByHash():
                                            indexFiles.append(f"{baseUrl}{component}/dep11/by-hash/{checksumType}/{checksum}")
                            else:
                                indexFiles.append(f"{baseUrl}{filename}")
                                self._packageCollection.Add("Flat", architecture, f"{baseUrl}{filename}")
                    elif self._repositoryType == RepositoryType.Src:
                        for component in self._components:
                            if re.match(rf"{component}/source/Release", filename):
                                indexFiles.append(f"{baseUrl}{filename}")

                            if re.match(rf"{component}/source/Sources[^./]*(\.gz|\.bz2|\.xz|$)$", filename):
                                indexFiles.append(f"{baseUrl}{filename}")
                                self._sourceCollection.Add(component, f"{baseUrl}{filename}")
                else:
                    checksums = False
            else:
                checksums = "SHA256:" in line or "SHA1:" in line or "MD5Sum:" in line

    if self._repositoryType == RepositoryType.Bin:
        self._packageCollection.DetermineCurrentTimestamps()
    elif self._repositoryType == RepositoryType.Src:
        self._sourceCollection.DetermineCurrentTimestamps()

    return list(set(indexFiles))  # Remove duplicates caused by reading multiple listings for each checksum type
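# Illustrative excerpt of the checksum block parsed above (values are made
# up; a real Release file repeats each index under MD5Sum, SHA1 and SHA256,
# which is why duplicates are stripped on return):
#
#   SHA256:
#    9f4b2277a1ab3... 1489523 main/binary-amd64/Packages
#    3e2bf9a17c225...  138212 main/binary-amd64/Packages.gz
#    7a1d0f9c33e81...     110 main/binary-amd64/Release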
def main(conf: str, test: bool, clean: bool):
    """A tool to mirror Debian Repositories for use as a local mirror."""
    global repositories
    global filesToKeep

    startTime = time.perf_counter()

    ConfigureLogger()

    logger.info("Starting Refrapt process")

    configData = GetConfig(conf)

    # Parse the configuration file
    Settings.Parse(configData)
    logging.getLogger().setLevel(Settings.LogLevel())

    # Ensure that the command line argument for Test overrides the value set
    # in the configuration file
    if test:
        Settings.EnableTest()

    if Settings.Test():
        logger.info("## Running in Test mode ##\n")

    repositories = GetRepositories(configData)

    if not repositories:
        logger.info("No Repositories found in configuration file. Application exiting.")
        sys.exit()

    # Create working directories
    Path(Settings.MirrorPath()).mkdir(parents=True, exist_ok=True)
    Path(Settings.SkelPath()).mkdir(parents=True, exist_ok=True)
    Path(Settings.VarPath()).mkdir(parents=True, exist_ok=True)

    Downloader.Init()

    # Change to the Skel directory for working repository structure
    os.chdir(Settings.SkelPath())

    # Check for any "-lock" files.
    for file in os.listdir(Settings.VarPath()):
        if "Download-lock" in file:
            # A download was in progress and interrupted. This means a
            # partial download will be sitting on the drive. Remove
            # it to guarantee that it will be fully downloaded.
            uri = None

            with open(f"{Settings.VarPath()}/{file}") as f:
                uri = f.readline()

            uri = SanitiseUri(uri)

            if os.path.isfile(f"{Settings.MirrorPath()}/{uri}"):
                os.remove(f"{Settings.MirrorPath()}/{uri}")
            elif os.path.isfile(f"{Settings.VarPath()}/{uri}"):
                os.remove(f"{Settings.VarPath()}/{uri}")

            logger.info(f"Removed incomplete download {uri}")

        if appLockFile in file:
            # Refrapt was interrupted during processing. Files that were
            # recently downloaded may no longer register as Modified, so
            # force processing of all files to ensure completeness.
            logger.info("The previous Refrapt run was interrupted. Full processing will be performed to ensure completeness")
            Settings.SetForce()

    # Delete existing /var files
    logger.info("Removing previous /var files...")
    for item in os.listdir(Settings.VarPath()):
        os.remove(f"{Settings.VarPath()}/{item}")

    # Create a lock file for the Application
    with FileLock(f"{Settings.VarPath()}/{appLockFile}.lock"):
        with open(f"{Settings.VarPath()}/{appLockFile}", "w+"):
            pass

        print()
        if clean:
            PerformClean()
        else:
            PerformMirroring()

    # Lock file no longer required
    os.remove(f"{Settings.VarPath()}/{appLockFile}")
    if os.path.isfile(f"{Settings.VarPath()}/{appLockFile}.lock"):
        # Requires manual deletion on Unix
        os.remove(f"{Settings.VarPath()}/{appLockFile}.lock")

    print()
    logger.info(f"Refrapt completed in {datetime.timedelta(seconds=round(time.perf_counter() - startTime))}")
def PerformMirroring():
    """Perform the main mirroring function of this application."""
    global repositories
    global filesToKeep

    filesToDownload = []  # type: list[Package]

    logger.info(f"Processing {len(repositories)} Repositories...")

    # 1. Get the Release files for each of the Repositories
    releaseFiles = []
    for repository in repositories:
        releaseFiles += repository.GetReleaseFiles()

    logger.debug("Adding Release Files to filesToKeep:")
    for releaseFile in releaseFiles:
        logger.debug(f"\t{SanitiseUri(releaseFile)}")
        filesToKeep.append(os.path.normpath(SanitiseUri(releaseFile)))

    logger.info(f"Compiled a list of {len(releaseFiles)} Release files for download")
    Downloader.Download(releaseFiles, UrlType.Release)

    # 2. Parse the Release files for the list of Index files to download
    indexFiles = []
    for repository in repositories:
        indexFiles += repository.ParseReleaseFilesFromRemote()

    logger.debug("Adding Index Files to filesToKeep:")
    for indexFile in indexFiles:
        logger.debug(f"\t{SanitiseUri(indexFile)}")
        filesToKeep.append(os.path.normpath(SanitiseUri(indexFile)))

    print()
    logger.info(f"Compiled a list of {len(indexFiles)} Index files for download")
    Downloader.Download(indexFiles, UrlType.Index)

    # Record timestamps of downloaded files to later determine which files
    # have changed, and therefore need to be processed
    for repository in repositories:
        repository.Timestamp()

    # 3. Unzip each of the Packages / Sources indices and obtain a list of all files to download
    print()
    logger.info("Decompressing Packages / Sources Indices...")
    for repository in tqdm.tqdm(repositories, position=0, unit=" repo", desc="Repositories "):
        repository.DecompressIndexFiles()

    # 4. Parse all Index files (Package or Source) to collate all files that need to be downloaded
    print()
    logger.info("Building file list...")
    for repository in tqdm.tqdm([x for x in repositories if x.Modified], position=0, unit=" repo", desc="Repositories ", leave=False):
        filesToDownload += repository.ParseIndexFiles()

    # Packages potentially add duplicate downloads, slowing down the rest
    # of the process. To counteract this, remove duplicates now
    filesToKeep = list(set(filesToKeep)) + [x.Filename for x in filesToDownload]

    logger.debug(f"Files to keep: {len(filesToKeep)}")
    for file in filesToKeep:
        logger.debug(f"\t{file}")

    # 5. Perform the main download of Binary and Source files
    downloadSize = ConvertSize(sum([x.Size for x in filesToDownload if not x.Latest]))
    logger.info(f"Compiled a list of {len([x for x in filesToDownload if not x.Latest])} Binary and Source files of size {downloadSize} for download")

    os.chdir(Settings.MirrorPath())

    if not Settings.Test():
        Downloader.Download([x.Filename for x in filesToDownload], UrlType.Archive)

    # 6. Copy Skel to Main Archive
    if not Settings.Test():
        print()
        logger.info("Copying Skel to Mirror")
        for indexUrl in tqdm.tqdm(filesToKeep, unit=" files"):
            skelFile = f"{Settings.SkelPath()}/{SanitiseUri(indexUrl)}"
            if os.path.isfile(skelFile):
                mirrorFile = f"{Settings.MirrorPath()}/{SanitiseUri(indexUrl)}"
                copy = True

                if os.path.isfile(mirrorFile):
                    # Compare timestamps to avoid copying files that don't need to be
                    skelTimestamp = os.path.getmtime(Path(skelFile))
                    mirrorTimestamp = os.path.getmtime(Path(mirrorFile))
                    copy = skelTimestamp > mirrorTimestamp

                if copy:
                    os.makedirs(Path(mirrorFile).parent.absolute(), exist_ok=True)
                    shutil.copyfile(skelFile, mirrorFile)

    # 7. Remove any unused files
    print()
    if Settings.CleanEnabled():
        PostMirrorClean()
    else:
        logger.info("Skipping Clean")

    if Settings.Test():
        # Remove Release files and Index files added to /skel to ensure normal
        # processing the next time the application is run; otherwise the app
        # will think it has all the latest files downloaded, when actually it
        # only has the latest /skel Index files
        print()
        os.chdir(Settings.SkelPath())
        logger.info("Test mode - Removing Release and Index files from /skel")
        for skelFile in releaseFiles + indexFiles:
            file = os.path.normpath(f"{Settings.SkelPath()}/{SanitiseUri(skelFile)}")
            if os.path.isfile(file):
                os.remove(file)
def CustomArguments() -> list:
    """Create custom Wget arguments based on the Settings provided."""
    arguments = []

    if Settings.AuthNoChallege():
        arguments.append("--auth-no-challenge")
    if Settings.NoCheckCertificate():
        arguments.append("--no-check-certificate")
    if Settings.Unlink():
        arguments.append("--unlink")

    if Settings.Certificate():
        arguments.append(f"--certificate={Settings.Certificate()}")
    if Settings.CaCertificate():
        arguments.append(f"--ca-certificate={Settings.CaCertificate()}")
    if Settings.PrivateKey():
        arguments.append(f"--private-key={Settings.PrivateKey()}")

    if Settings.UseProxy():
        arguments.append("-e use_proxy=yes")

        if Settings.HttpProxy():
            arguments.append("-e http_proxy=" + Settings.HttpProxy())
        if Settings.HttpsProxy():
            arguments.append("-e https_proxy=" + Settings.HttpsProxy())
        if Settings.ProxyUser():
            arguments.append("-e proxy_user=" + Settings.ProxyUser())
        if Settings.ProxyPassword():
            arguments.append("-e proxy_password=" + Settings.ProxyPassword())

    return arguments
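# Illustrative result (hedged; assumes a configuration that enables a plain
# HTTP proxy and nothing else). These strings are passed to Wget as extra
# command line arguments alongside each download:
#
#   >>> Downloader.CustomArguments()
#   ['-e use_proxy=yes', '-e http_proxy=http://proxy.example:3128']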