def test_file_time_setters(archfmt, timefmt, tmpdir):
    has_birthtime = archfmt != 'zip'

    # Create an archive of our libarchive/ directory
    archive_path = tmpdir.join('/test.{0}'.format(archfmt)).strpath
    archive2_path = tmpdir.join('/test2.{0}'.format(archfmt)).strpath
    with file_writer(archive_path, archfmt) as archive1:
        archive1.add_files('libarchive/')

    atimestamp = (1482144741, 495628118)
    mtimestamp = (1482155417, 659017086)
    ctimestamp = (1482145211, 536858081)
    btimestamp = (1482144740, 495628118)
    with file_reader(archive_path) as archive1:
        with file_writer(archive2_path, archfmt) as archive2:
            for entry in archive1:
                entry.set_atime(*atimestamp)
                entry.set_mtime(*mtimestamp)
                entry.set_ctime(*ctimestamp)
                if has_birthtime:
                    entry.set_birthtime(*btimestamp)
                archive2.add_entries([entry])

    with file_reader(archive2_path) as archive2:
        for entry in archive2:
            assert entry.atime == time_check(atimestamp, timefmt)
            assert entry.mtime == time_check(mtimestamp, timefmt)
            assert entry.ctime == time_check(ctimestamp, timefmt)
            if has_birthtime:
                assert entry.birthtime == time_check(btimestamp, timefmt)
def read_file(full, filename):
    res = Result(filename)
    with file_reader(filename) as pkg:
        for entry in pkg:
            # break if any of the files are not secure, speeding up scanning
            if not full and res.not_secure:
                break
            if not entry.isfile:
                continue
            if not any(entry.name.startswith(d) for d in VALID_DIRS):
                continue
            fp = BytesIO(b''.join(entry.get_blocks()))
            elf = Elf(fp)
            if not elf.is_elf():
                continue
            if not elf.pie():
                res.nopie.append(entry.name)
            if not elf.is_relro():
                res.norelro.append(entry.name)
            if not elf.canary():
                res.nocanary.append(entry.name)
    return res
def extract_hashing_feature_from_compressed_data(input_file, num_entries, output_path):
    hv = HashingVectorizer(encoding='ISO-8859-2')
    count_total = num_entries

    # transform to hashing vector
    print 'Start transform data to hashing vector'
    t0 = time.time()
    count = 0
    with libarchive.file_reader(input_file) as archive:
        for entry in archive:
            count += 1
            if entry.pathname.find('.bytes') != -1:
                text = [
                    " ".join([
                        block.replace('\r\n', ' ')
                        for block in entry.get_blocks()
                    ])
                ]
                util.save_data_to_npz(output_path + entry.pathname, hv.transform(text))
                t = time.time() - t0
                print 'Transform:\tfiles: ' + str(count_total - count) + '/' + str(count_total) \
                    + '\tElapsed time: ' + str(t) + '(s)' \
                    + '\tTime left: ' + str((t / count) * (count_total - count)) + '(s)'
def provision(self, dst, clean_target=True, keep_rpm=False):
    rpm_file = os.path.join(self.source_dir, os.path.basename(self.source))

    if clean_target:
        tmp_rpm = tempfile.NamedTemporaryFile().name
        shutil.move(rpm_file, tmp_rpm)
        shutil.rmtree(dst)
        os.makedirs(dst)
        shutil.move(tmp_rpm, rpm_file)

    # Ensure dst does not have trailing slash
    dst = dst.rstrip('/')

    # Open the RPM file and extract it to destination
    with libarchive.file_reader(rpm_file) as rpm:
        for rpm_file_entry in rpm:
            # Binary RPM archive data has paths starting with ./ to support
            # relocation if enabled in the building of RPMs
            rpm_file_entrypath = rpm_file_entry.pathname.lstrip('./')
            rpm_file_entrypath = rpm_file_entrypath.lstrip('/')
            rpm_file_entry.pathname = os.path.join(dst, rpm_file_entrypath)
            # XXX: libarchive frees the entry at the end of loop iterations
            # See https://github.com/Changaco/python-libarchive-c/issues/43
            libarchive.extract.extract_entries([rpm_file_entry])

    if not keep_rpm:
        os.remove(rpm_file)
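The same pattern works for any format libarchive can read: rewrite `entry.pathname` and call `extract_entries()` for each entry while still inside the loop, because libarchive-c recycles the entry object between iterations (see the issue linked above). A minimal standalone sketch under that assumption; the helper name is illustrative, not part of the original code:

import os
import libarchive
import libarchive.extract


def extract_under_prefix(archive_path, prefix):
    """Extract every entry of archive_path below prefix (illustrative helper)."""
    os.makedirs(prefix, exist_ok=True)
    with libarchive.file_reader(archive_path) as archive:
        for entry in archive:
            # Strip leading './' and '/' so the entry cannot escape the prefix,
            # then re-anchor it under the destination directory.
            relative = entry.pathname.lstrip('./').lstrip('/')
            entry.pathname = os.path.join(prefix, relative)
            # Must be called inside the loop: the entry object is reused by
            # libarchive-c, so collecting entries and extracting later fails.
            libarchive.extract.extract_entries([entry])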
def get_entries(location):
    """
    Using the archive file at `location`, return an iterable of name->value
    mappings for each libarchive.ArchiveEntry object's essential attributes.
    Paths are base64-encoded because JSON is UTF-8 and cannot handle arbitrary
    binary pathdata.
    """
    with file_reader(location) as arch:
        for entry in arch:
            # libarchive introduces prefixes such as h prefix for
            # hardlinks: tarfile does not, so we ignore the first char
            mode = entry.strmode[1:].decode('ascii')
            yield {
                'path': surrogate_decode(entry.pathname),
                'mtime': entry.mtime,
                'size': entry.size,
                'mode': mode,
                'isreg': entry.isreg,
                'isdir': entry.isdir,
                'islnk': entry.islnk,
                'issym': entry.issym,
                'linkpath': surrogate_decode(entry.linkpath),
                'isblk': entry.isblk,
                'ischr': entry.ischr,
                'isfifo': entry.isfifo,
                'isdev': entry.isdev,
            }
def extract_resource(source, destiny_path=None):
    root_dir = destiny_path or '/'.join(source.split('/')[:-1])

    def full_path(local_path):
        return '/'.join((root_dir, str(local_path)))

    def deep_mkdir(dir_path):
        steps = dir_path.split('/')
        for i in range(len(steps)):
            step = '/'.join((root_dir, *(s for s in steps[:i + 1] if s)))
            if not os.path.isdir(step):
                os.mkdir(step)

    files = []
    with libarchive.file_reader(source) as arch:
        for entry in arch:
            if entry.isdir:
                deep_mkdir(str(entry))
            else:
                path = str(entry).rsplit('/', 1)
                if len(path) > 1 and not os.path.isdir(full_path(path[0])):
                    deep_mkdir(path[0])
                resource_path = '/'.join((root_dir, *path))
                with open(resource_path, 'wb') as f:
                    for block in entry.get_blocks():
                        f.write(block)
                files.append(resource_path)
    return tuple(files)
def get_entries(location):
    """
    Using the archive file at `location`, return an iterable of name->value
    mappings for each libarchive.ArchiveEntry object's essential attributes.
    Paths are base64-encoded because JSON is UTF-8 and cannot handle arbitrary
    binary pathdata.
    """
    with file_reader(location) as arch:
        for entry in arch:
            # libarchive introduces prefixes such as h prefix for
            # hardlinks: tarfile does not, so we ignore the first char
            mode = entry.strmode[1:].decode('ascii')
            yield {
                'path': surrogate_decode(entry.pathname),
                'mtime': entry.mtime,
                'size': entry.size,
                'mode': mode,
                'isreg': entry.isreg,
                'isdir': entry.isdir,
                'islnk': entry.islnk,
                'issym': entry.issym,
                'linkpath': surrogate_decode(entry.linkpath),
                'isblk': entry.isblk,
                'ischr': entry.ischr,
                'isfifo': entry.isfifo,
                'isdev': entry.isdev,
                'uid': entry.uid,
                'gid': entry.gid
            }
def get_filtered_members(self):
    try:
        with libarchive.file_reader(self.source.path) as archive:
            for entry in archive:
                if any_excluded(entry.pathname):
                    continue
                yield entry.pathname, self.get_subclass(entry)
    except libarchive.exception.ArchiveError:
        pass
def extract_file(filepath, flags=0, callback=lambda x: None, imginfo=(), extractlist=None):
    """Extracts an archive from a file into the current directory."""
    totalsize = 0
    for img in imginfo:
        if not imginfo[img]:
            continue
        totalsize += imginfo[img]
    with libarchive.file_reader(filepath) as archive:
        extract_entries(archive, flags, callback, totalsize, extractlist)
def test_entry_name_decoding(tar_file):
    """ Test that the entry names are decoded to utf8 correctly """
    path = join(data_dir, tar_file)
    with file_reader(path) as arch:
        for entry in arch:
            # Use str find method to test it was converted from bytes to a
            # str/unicode
            entry.name.find('not there')
def main(argv):
    args = parse_args(argv)

    for filename in args.FILE:
        basedir = os.path.dirname(filename)
        basename = os.path.basename(filename)
        original_filename, ext = os.path.splitext(basename)

        lst = []
        try:
            with libarchive.file_reader(filename) as archive:
                for entry in archive:
                    if entry.isfile:
                        lst.append(str(entry))
        except libarchive.exception.ArchiveError as err:
            print("{}: error: couldn't process archive".format(filename))
            continue

        if args.basename:
            lst = [os.path.basename(p) for p in lst]

        common = common_prefix(lst, args.threshold)
        if len(common) < args.min_length:
            print("{}: error: could not find common prefix: '{}'".format(filename, common))
            continue

        if args.strip:
            common = strip_unimportant(common)

        if args.preserve:
            common = original_filename + "_" + common

        if not args.ignore_ext:
            common += ext

        if args.move:
            outfilename = os.path.join(basedir, common)
            if args.interactive:
                print("rename", filename, "->", common, "(y/n)?")
                answer = None
                while answer not in ["y", "n"]:
                    answer = input()
                if answer == "y":
                    print(filename, "->", common)
                    rename_safe(filename, outfilename)
                else:
                    print("ignoring", filename)
            else:
                print(filename, "->", common)
                rename_safe(filename, outfilename)
        elif args.verbose or len(args.FILE) > 1:
            print(filename, "->", common)
        else:
            print(common)
def list_libarchive(path, ignore_errors=False):
    try:
        with libarchive.file_reader(path) as archive:
            for entry in archive:
                name_and_link = entry.name
                if entry.issym:
                    name_and_link = '{entry.name} -> {entry.linkname}'.format(
                        entry=entry
                    )

                if Config().exclude_directory_metadata == 'recursive':
                    yield '{name_and_link}\n'.format(
                        name_and_link=name_and_link
                    )
                    continue

                if entry.isblk or entry.ischr:
                    size_or_dev = '{major:>3},{minor:>3}'.format(
                        major=entry.rdevmajor, minor=entry.rdevminor
                    )
                else:
                    size_or_dev = entry.size

                mtime = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.gmtime(entry.mtime)
                ) + '.{:06d}'.format(entry.mtime_nsec // 1000)

                if entry.uname:
                    user = '{user:<8} {uid:>7}'.format(
                        user=entry.uname.decode(
                            'utf-8', errors='surrogateescape'
                        ),
                        uid='({})'.format(entry.uid),
                    )
                else:
                    user = entry.uid

                if entry.gname:
                    group = '{group:<8} {gid:>7}'.format(
                        group=entry.gname.decode(
                            'utf-8', errors='surrogateescape'
                        ),
                        gid='({})'.format(entry.gid),
                    )
                else:
                    group = entry.gid

                yield '{strmode} {entry.nlink:>3} {user:>8} {group:>8} {size_or_dev:>8} {mtime:>8} {name_and_link}\n'.format(
                    strmode=entry.strmode.decode('us-ascii'),
                    entry=entry,
                    user=user,
                    group=group,
                    size_or_dev=size_or_dev,
                    mtime=mtime,
                    name_and_link=name_and_link,
                )
    except libarchive.exception.ArchiveError:
        if not ignore_errors:
            raise
def extract(self, member_name, dest_dir):
    dest_path = os.path.join(dest_dir, os.path.basename(member_name))
    logger.debug('libarchive extracting %s to %s', member_name, dest_path)
    with libarchive.file_reader(self.source.path) as archive:
        for entry in archive:
            if entry.pathname == member_name:
                logger.debug('entry found, writing %s', dest_path)
                with open(dest_path, 'wb') as f:
                    for buf in entry.get_blocks():
                        f.write(buf)
                return dest_path
    raise KeyError('%s not found in archive', member_name)
def iterate_archive_chunk_entries(self, chunk_start_idx, chunk_end_idx):
    with libarchive.file_reader(self.archive_path) as archive:
        consume(archive, chunk_start_idx)  # skips first n
        progressbar_iterator = progressbar(archive, total=chunk_end_idx - chunk_start_idx)
        idx = chunk_start_idx
        for entry in progressbar_iterator:
            if idx > chunk_end_idx:
                progressbar_iterator.refresh()
                break
            idx += 1
            yield entry
def test_file_atime_ctime(archfmt, timefmt, tmpdir):
    archive_path = "{0}/test.{1}".format(tmpdir.strpath, archfmt)

    # Collect information on what should be in the archive
    tree = treestat('libarchive', stat_dict)

    # Create an archive of our libarchive/ directory
    with file_writer(archive_path, archfmt) as archive:
        archive.add_files('libarchive/')

    # Read the archive and check that the data is correct
    with file_reader(archive_path) as archive:
        check_atime_ctime(archive, tree, timefmt=timefmt)
def get_member(self, member_name):
    with libarchive.file_reader(self.source.path) as archive:
        for entry in archive:
            if entry.pathname == member_name:
                if entry.isdir:
                    return LibarchiveDirectory(self, entry)
                elif entry.issym:
                    return LibarchiveSymlink(self, entry)
                elif entry.isblk or entry.ischr:
                    return LibarchiveDevice(self, entry)
                else:
                    return LibarchiveMember(self, entry)
    raise KeyError('%s not found in archive', member_name)
def iter_contents(self):
    """ Generator for listing the archive contents """
    with libarchive.file_reader(str(self.archive)) as archive:
        for filename in archive:
            if not filename.isfile:
                continue
            filepath = Path(filename.pathname)
            if not ImageSupported.is_image_file(filepath):
                continue
            yield filepath
def test_file_atime_ctime(tmpdir):
    archive_path = tmpdir.strpath + '/test.zip'

    # Collect information on what should be in the archive
    tree = treestat('libarchive', stat_dict)

    # Create an archive of our libarchive/ directory
    with file_writer(archive_path, 'zip') as archive:
        archive.add_files('libarchive/')

    # Read the archive and check that the data is correct
    with file_reader(archive_path) as archive:
        check_atime_ctime(archive, tree)
def extractFiles(self):
    # TODO: globbing or regex on self.paths?
    # If we have yum, we can, TECHNICALLY, do this with:
    # http://yum.baseurl.org/api/yum/rpmUtils/miscutils.html#rpmUtils.miscutils.rpm2cpio
    # But nope. We can't selectively decompress members based on path with rpm2cpio-like funcs.
    # We keep getting extraction artefacts, at least with legacy libarchive_c, so we use a hammer.
    _curdir = os.getcwd()
    _tempdir = tempfile.mkdtemp()
    os.chdir(_tempdir)
    for rpm_file in self.rpms:
        rf = self.rpms[rpm_file]
        if is_ctype:
            if not is_legacy:
                # ctype - extracts to pwd
                with libarchive.file_reader(rf) as reader:
                    for entry in reader:
                        if self.paths and entry.path not in self.paths:
                            continue
                        if entry.isdir():
                            continue
                        fpath = os.path.join(self.dest_dir, rpm_file, entry.path)
                        if not os.path.isdir(os.path.dirname(fpath)):
                            os.makedirs(os.path.dirname(fpath))
                        with open(fpath, 'wb') as f:
                            for b in entry.get_blocks():
                                f.write(b)
            else:
                with libarchive.Archive(rf) as reader:
                    for entry in reader:
                        if (self.paths and entry.pathname not in self.paths) or (entry.isdir()):
                            continue
                        fpath = os.path.join(self.dest_dir, rpm_file, entry.pathname)
                        if not os.path.isdir(os.path.dirname(fpath)):
                            os.makedirs(os.path.dirname(fpath))
                        reader.readpath(fpath)
        else:
            # pyEasyArchive/"pypi/libarchive"
            with lap.file_reader(rf) as reader:
                for entry in reader:
                    if (self.paths and entry.pathname not in self.paths) or (entry.filetype.IFDIR):
                        continue
                    fpath = os.path.join(self.dest_dir, rpm_file, entry.pathname)
                    if not os.path.isdir(os.path.dirname(fpath)):
                        os.makedirs(os.path.dirname(fpath))
                    with open(fpath, 'wb') as f:
                        for b in entry.get_blocks():
                            f.write(b)
    os.chdir(_curdir)
    shutil.rmtree(_tempdir)
    return()
def scan_iso(filename):
    filesizes = {}
    filecontents = {}
    with libarchive.file_reader(filename) as reader:
        for ent in reader:
            if str(ent).endswith('TRANS.TBL'):
                continue
            eventlet.sleep(0)
            filesizes[str(ent)] = ent.size
            if str(ent) in READFILES:
                filecontents[str(ent)] = b''
                for block in ent.get_blocks():
                    filecontents[str(ent)] += bytes(block)
    return filesizes, filecontents
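Accumulating the contents with `+=` copies the growing bytes object on every block; joining the streamed blocks once, as several of the other examples here do, is usually cheaper for large members. A minimal sketch under that assumption; the helper name and parameters are illustrative:

import libarchive


def read_members(archive_path, wanted_names):
    """Return {member name: bytes} for the wanted members (illustrative helper)."""
    contents = {}
    with libarchive.file_reader(archive_path) as reader:
        for entry in reader:
            if str(entry) in wanted_names:
                # Join the streamed blocks once instead of growing a bytes
                # object block by block.
                contents[str(entry)] = b''.join(entry.get_blocks())
    return contents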
def test_file_time_setters(tmpdir):
    # Create an archive of our libarchive/ directory
    archive_path = tmpdir.join('/test.zip').strpath
    archive2_path = tmpdir.join('/test2.zip').strpath

    atimestamp = (1482144741, 495628118)
    mtimestamp = (1482155417, 659017086)
    ctimestamp = (1482145211, 536858081)
    with file_writer(archive_path, "zip") as archive1:
        archive1.add_files('libarchive/')

    with file_reader(archive_path) as archive1:
        with file_writer(archive2_path, "zip") as archive2:
            for entry in archive1:
                entry.set_atime(*atimestamp)
                entry.set_mtime(*mtimestamp)
                entry.set_ctime(*ctimestamp)
                archive2.add_entries([entry])

    with file_reader(archive2_path) as archive2:
        for entry in archive2:
            assert entry.atime == atimestamp[0]
            assert entry.mtime == mtimestamp[0]
            assert entry.ctime == ctimestamp[0]
def extract(self, member_name, dest_dir):
    dest_name = os.path.basename(member_name)
    if not dest_name:
        raise ValueError('member_name should not be a directory')

    dest_path = os.path.join(dest_dir, dest_name)
    logger.debug('libarchive extracting %s to %s', member_name, dest_path)
    with libarchive.file_reader(self.source.path) as archive:
        for entry in archive:
            if entry.pathname == member_name:
                logger.debug('entry found, writing %s', dest_path)
                with open(dest_path, 'wb') as f:
                    for buf in entry.get_blocks():
                        f.write(buf)
                return dest_path
    raise KeyError('%s not found in archive', member_name)
def test_sparse_formats(name):
    """ test for a good sparse map from all of the various sparse formats """
    path = join(data_dir, name)
    expected_map = [
        (4096, 4096), (12288, 4096), (20480, 4096), (28672, 4096),
        (36864, 4096), (45056, 4096), (53248, 4096), (61440, 4096),
        (69632, 4096), (77824, 4096), (86016, 0)
    ]
    with file_reader(path) as arch:
        for entry in arch:
            try:
                if entry.name.startswith('gnu/sparse'):
                    assert entry.size == 86016
                    assert entry.sparse_map == expected_map
            except UnicodeDecodeError:
                # py27 fails on some unicode
                pass
def get_content(self) -> Generator[str, None, None]:
    """Yields processed pieces of content"""
    # https://github.com/Changaco/python-libarchive-c#usage
    try:
        # pylint:disable=import-outside-toplevel
        import libarchive
    except AttributeError as ex:
        # AttributeError: undefined symbol: archive_errno
        raise DumpError(
            "Failed to import libarchive with 7zip support") from ex

    with self.fetch() as handler:
        with libarchive.file_reader(handler.name) as archive:
            for entry in archive:
                for block in entry.get_blocks():
                    yield block
def syn2map(self, filename):
    files = os.listdir(self.startDir + '/engine/maps')
    for file in files:
        if fnmatch.fnmatch(file, filename):
            print("Actual Mapname=" + file)
            with libarchive.file_reader(self.startDir + '/engine/maps/' + file) as reader:
                for e in reader:
                    # (The entry evaluates to a filename.)
                    print(e)
                    if e.name[-3:] == 'smf':
                        print("real map name: " + e.name)
                        filename = e.name
                        break
            break
    print("returning name" + filename[5:-4])
    return filename[5:-4]
def get_problematic_files(archive, query):
    """Search for files inside the archive whose first line matches the given query.

    Some of the files may contain data that is not plain text. Bytes are read
    from the file, so the shebang query has to be of the same type.
    """
    problematic = set()

    with libarchive.file_reader(archive) as a:
        for entry in a:
            try:
                first_line = next(entry.get_blocks(), '').splitlines()[0]
            except IndexError:
                continue  # file is empty
            if matches(first_line, query.encode()):
                problematic.add(entry.pathname.lstrip('.'))

    return problematic
def get_problematic_files(archive, query):
    """Search for files inside the archive whose first line matches the given query.

    Some of the files may contain data that is not plain text. Bytes are read
    from the file, so the shebang query is encoded as well. We only test for
    ASCII shebangs.
    """
    problematic = set()

    with libarchive.file_reader(archive) as a:
        for entry in a:
            try:
                first_line = next(entry.get_blocks(), '').splitlines()[0]
            except IndexError:
                continue  # file is empty
            if matches(first_line, query.encode('ascii')):
                problematic.add(entry.pathname.lstrip('.'))

    return problematic
def ensure_unpacked(self):
    if hasattr(self, '_members'):
        return

    tmpdir = get_temporary_directory().name
    self._members = collections.OrderedDict()

    logger.debug("Extracting %s to %s", self.source.path, tmpdir)

    with libarchive.file_reader(self.source.path) as archive:
        for idx, entry in enumerate(archive):
            # Always skip directories
            if entry.isdir:
                continue

            # Save extracting excluded files
            if any_excluded(entry.pathname):
                continue

            # Keep directory sizes small. could be improved but should be
            # good enough for "ordinary" large archives.
            dst = os.path.join(tmpdir, str(idx // 4096), str(idx % 4096))
            root, ext = os.path.splitext(entry.pathname)
            dst += ext

            # Maintain a mapping of archive path to the extracted path,
            # avoiding the need to sanitise filenames.
            self._members[entry.pathname] = dst

            logger.debug("Extracting %s to %s", entry.pathname, dst)

            os.makedirs(os.path.dirname(dst), exist_ok=True)
            try:
                with open(dst, 'wb') as f:
                    for block in entry.get_blocks():
                        f.write(block)
            except Exception as exc:
                raise ContainerExtractionError(entry.pathname, exc)

    logger.debug(
        "Extracted %d entries from %s to %s",
        len(self._members), self.source.path, tmpdir,
    )
def test_files(tmpdir):
    archive_path = tmpdir.strpath + '/test.tar.gz'

    # Collect information on what should be in the archive
    tree = treestat('libarchive')

    # Create an archive of our libarchive/ directory
    with libarchive.file_writer(archive_path, 'ustar', 'gzip') as archive:
        archive.add_files('libarchive/')

    # Read the archive and check that the data is correct
    with libarchive.file_reader(archive_path) as archive:
        check_archive(archive, tree)

    # Extract the archive in tmpdir and check that the data is intact
    with in_dir(tmpdir.strpath):
        flags = EXTRACT_OWNER | EXTRACT_PERM | EXTRACT_TIME
        libarchive.extract_file(archive_path, flags)
        tree2 = treestat('libarchive')
        assert tree2 == tree
def list_libarchive(path):
    with libarchive.file_reader(path) as archive:
        for entry in archive:
            if entry.isblk or entry.ischr:
                size_or_dev = '{major:>3},{minor:>3}'.format(major=entry.rdevmajor, minor=entry.rdevminor)
            else:
                size_or_dev = entry.size
            mtime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(entry.mtime)) + '.{:06d}'.format(entry.mtime_nsec // 1000)
            if entry.issym:
                name_and_link = '{entry.name} -> {entry.linkname}'.format(entry=entry)
            else:
                name_and_link = entry.name
            if entry.uname:
                user = '{user:<8} {uid:>7}'.format(user=entry.uname.decode('utf-8', errors='surrogateescape'), uid='({})'.format(entry.uid))
            else:
                user = entry.uid
            if entry.gname:
                group = '{group:<8} {gid:>7}'.format(group=entry.gname.decode('utf-8', errors='surrogateescape'), gid='({})'.format(entry.gid))
            else:
                group = entry.gid
            yield '{strmode} {entry.nlink:>3} {user:>8} {group:>8} {size_or_dev:>8} {mtime:>8} {name_and_link}\n'.format(strmode=entry.strmode.decode('us-ascii'), entry=entry, user=user, group=group, size_or_dev=size_or_dev, mtime=mtime, name_and_link=name_and_link)
def process_archive(parent, relatives, cache_path):
    filename = os.path.join(cache_path, parent.filename)
    results = list()
    try:
        with libarchive.file_reader(filename) as archive:
            for entry in archive:
                if entry.isfile:
                    switcher = {
                        'application/x-3ds-archive': process_cia,
                        'application/x-3ds-homebrew': process_tdsx,
                        'application/x-3ds-iconfile': process_smdh,
                        'application/x-3ds-arm9bin': process_arm9,
                        'application/x-3ds-xml': process_xml
                    }
                    action = switcher.get(determine_mimetype(entry.pathname), None)
                    if action:
                        working_file = os.path.join(cache_path, 'archive_root', entry.pathname)
                        working_path = '/'.join(working_file.split('/')[:-1])
                        if not os.path.isdir(working_path):
                            os.makedirs(working_path)
                        with open(working_file, 'wb') as f:
                            for block in entry.get_blocks():
                                f.write(block)
                        os.utime(working_file, (entry.mtime, entry.mtime))
                        results.append(action(parent, relatives, cache_path, entry.pathname))
    except libarchive.exception.ArchiveError as e:
        log.debug("Archive error: %s", e)

    if results:
        for result_item in results:
            # Match up any xml or smdh files in the same folder as our 3dsx.
            if result_item.__class__ in (XML, SMDH):
                matched = False
                for check_item in results:
                    if not matched:
                        matched = check_siblings(check_item, result_item)
                result_item.active = matched

    return(results)
def provision(self, dst, clean_target=True, keep_7z=False):
    seven_zip_file = os.path.join(self.source_dir, os.path.basename(self.source))

    if self.source_checksum:
        sources.verify_checksum(self.source_checksum, seven_zip_file)

    if clean_target:
        tmp_7z = tempfile.NamedTemporaryFile().name
        shutil.move(seven_zip_file, tmp_7z)
        shutil.rmtree(dst)
        os.makedirs(dst)
        shutil.move(tmp_7z, seven_zip_file)

    # Open the 7z file and extract it to destination
    with libarchive.file_reader(seven_zip_file) as archive:
        for file_entry in archive:
            file_entry.pathname = os.path.join(dst, file_entry.pathname)
            libarchive.extract.extract_entries([file_entry])

    if not keep_7z:
        os.remove(seven_zip_file)
def __init__(self, source, destiny_path=None):
    root_dir = os.path.realpath(destiny_path or self.tmp_dir)
    files = []
    with libarchive.file_reader(source) as arch:
        for entry in arch:
            entry_path = self._get_entry_path(entry)
            if entry.isdir:
                os.makedirs(os.path.join(root_dir, entry_path), exist_ok=True)
            else:
                path, extr_file = os.path.split(entry_path)
                if path and not os.path.isdir(os.path.join(root_dir, path)):
                    os.makedirs(os.path.join(root_dir, path), exist_ok=True)
                resource_path = os.path.join(root_dir, path, extr_file)
                with open(resource_path, 'wb') as f:
                    for block in entry.get_blocks():
                        f.write(block)
                files.append(resource_path)
    self.files = tuple(files)
def _extract(self, outdir: str) -> None:
    with libarchive.file_reader(self.filename) as entries:
        self.interruption_point()

        if not os.path.isdir(outdir):
            os.makedirs(outdir)

        for entry in entries:
            pathname = sanitize(entry.pathname)
            if pathname is None:
                logger.error("skipping unsanitary entry: %s", entry.pathname)
                continue

            # FIXME: entry.isdir doesn't look reliable, needs more testing
            if entry.isdir or entry.pathname[-1] == '/':
                dirname = os.path.join(outdir, pathname)
                if not os.path.isdir(dirname):
                    os.makedirs(dirname)
                self.sig_entry_extracted.emit(entry.pathname, dirname)
            elif entry.isreg:
                dirname = os.path.dirname(pathname)

                if dirname:
                    dirname = os.path.join(outdir, dirname)
                    if not os.path.isdir(dirname):
                        os.makedirs(dirname)

                while True:
                    outfile = os.path.join(outdir, pathname)
                    with open(outfile, "wb") as out:
                        blocks = entry.get_blocks()
                        for block in blocks:
                            self.interruption_point()
                            out.write(block)
                    self.sig_entry_extracted.emit(entry.pathname, outfile)
                    break
            else:
                logger.warning("skipping non regular entry: %s", entry.pathname)
def get_member_names(self):
    with libarchive.file_reader(self.source.path) as archive:
        member_names = [entry.pathname for entry in archive]
    return member_names
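Note that the comprehension runs while the reader is still open: `ArchiveEntry` objects are only valid inside the `file_reader()` context, so anything needed after the `with` block has to be copied out first. A minimal sketch illustrating this, with an assumed helper name:

import libarchive


def list_members(archive_path):
    """Return (pathname, size) tuples for every entry (illustrative helper)."""
    with libarchive.file_reader(archive_path) as archive:
        # Copy the values out of each entry while the reader is open; the
        # entry objects themselves must not be used after the with-block.
        return [(entry.pathname, entry.size) for entry in archive]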
def main(args):
    avg_file_size = 0
    number_of_files = 0
    max_size = 0
    biggest_file = None
    min_size = float('inf')
    smallest_file = None
    roots = {}
    values_count = {
        'META/META_SPEC/META_ARTICLE/ETAT': {},
        'META/META_SPEC/META_ARTICLE/TYPE': {},
        'META/META_COMMUN/ORIGINE': {},
        'META/META_COMMUN/NATURE': {},
    }
    etats_par_dossier = {
        'code_en_vigueur': {},
        'code_non_vigueur': {},
        'TNC_en_vigueur': {},
        'TNC_non_vigueur': {},
    }
    parser = etree.XMLParser()
    with libarchive.file_reader(args.archive) as archive:
        for entry in archive:
            path = entry.pathname
            if path[-1] == '/':
                continue
            number_of_files += 1
            size = entry.size
            avg_file_size += size
            if size > max_size:
                biggest_file = path
                max_size = size
            if size < min_size:
                smallest_file = path
                min_size = size
            for block in entry.get_blocks():
                parser.feed(block)
            xml = parser.close()
            tag = xml.tag
            roots[tag] = roots.get(tag, 0) + 1
            for xpath, values_dict in values_count.iteritems():
                e = xml.find(xpath)
                if e is not None:
                    v = e.text
                    values_dict[v] = values_dict.get(v, 0) + 1
            if tag != 'ARTICLE':
                continue
            d = etats_par_dossier[path.split('/')[3]]
            etat = xml.find('META/META_SPEC/META_ARTICLE/ETAT')
            etat = None if etat is None else etat.text
            d[etat] = d.get(etat, 0) + 1
    avg_file_size /= number_of_files
    biggest_file = {'path': biggest_file, 'size': max_size}
    smallest_file = {'path': smallest_file, 'size': min_size}
    stats = {
        'avg_file_size', 'number_of_files', 'biggest_file', 'smallest_file',
        'roots', 'etats_par_dossier', 'values_count'
    }
    r = {k: v for k, v in locals().items() if k in stats}
    print(json.dumps(r, indent=4, sort_keys=True))
def from_file(filename: str) -> 'ArchiveInfo':
    with libarchive.file_reader(filename) as entries:
        archiveinfo = ArchiveInfo(entries)
    return archiveinfo
def process_archive(db, archive_path, process_links=True):

    # Define some constants
    ARTICLE_TAGS = set('NOTA BLOC_TEXTUEL'.split())
    SECTION_TA_TAGS = set('TITRE_TA COMMENTAIRE'.split())
    TEXTELR_TAGS = set('VERSIONS'.split())
    TEXTE_VERSION_TAGS = set('VISAS SIGNATAIRES TP NOTA ABRO RECT'.split())
    META_ARTICLE_TAGS = set('NUM ETAT DATE_DEBUT DATE_FIN TYPE'.split())
    META_CHRONICLE_TAGS = set("""
        NUM NUM_SEQUENCE NOR
        DATE_PUBLI DATE_TEXTE DERNIERE_MODIFICATION
        ORIGINE_PUBLI PAGE_DEB_PUBLI PAGE_FIN_PUBLI
    """.split())
    META_VERSION_TAGS = set(
        'TITRE TITREFULL ETAT DATE_DEBUT DATE_FIN AUTORITE MINISTERE'.split()
    )
    SOUS_DOSSIER_MAP = {
        'articles': 'article',
        'sections': 'section_ta',
        'textes_structs': 'texte/struct',
        'textes_versions': 'texte/version',
    }
    TABLES_MAP = {'ARTI': 'articles', 'SCTA': 'sections', 'TEXT': 'textes_'}
    TYPELIEN_MAP = {
        "ABROGATION": "ABROGE",
        "ANNULATION": "ANNULE",
        "CODIFICATION": "CODIFIE",
        "CONCORDANCE": "CONCORDE",
        "CREATION": "CREE",
        "DEPLACE": "DEPLACEMENT",
        "DISJOINT": "DISJONCTION",
        "MODIFICATION": "MODIFIE",
        "PEREMPTION": "PERIME",
        "RATIFICATION": "RATIFIE",
        "TRANSFERE": "TRANSFERT",
    }
    TYPELIEN_MAP.update([(v, k) for k, v in TYPELIEN_MAP.items()])

    # Define some shortcuts
    attr = etree._Element.get
    insert = db.insert
    update = db.update

    def get_table(parts):
        table = TABLES_MAP[parts[-1][4:8]]
        if table == 'textes_':
            table += parts[13] + 's'
        return table

    counts = {}

    def count_one(k):
        try:
            counts[k] += 1
        except KeyError:
            counts[k] = 1

    skipped = 0
    unknown_folders = {}
    liste_suppression = []
    xml = etree.XMLParser(remove_blank_text=True)
    with libarchive.file_reader(archive_path) as archive:
        for entry in tqdm(archive):
            path = entry.pathname
            if path[-1] == '/':
                continue
            parts = path.split('/')
            if parts[-1] == 'liste_suppression_legi.dat':
                liste_suppression += b''.join(entry.get_blocks()).decode('ascii').split()
                continue
            if parts[1] == 'legi':
                path = path[len(parts[0])+1:]
                parts = parts[1:]
            if not parts[2].startswith('code_et_TNC_'):
                # https://github.com/Legilibre/legi.py/issues/23
                try:
                    unknown_folders[parts[2]] += 1
                except KeyError:
                    unknown_folders[parts[2]] = 1
                continue
            dossier = parts[3]
            text_cid = parts[11]
            text_id = parts[-1][:-4]
            mtime = entry.mtime

            # Skip the file if it hasn't changed, store it if it's a duplicate
            duplicate = False
            table = get_table(parts)
            prev_row = db.one("""
                SELECT mtime, dossier, cid
                  FROM {0}
                 WHERE id = ?
            """.format(table), (text_id,))
            if prev_row:
                prev_mtime, prev_dossier, prev_cid = prev_row
                if prev_dossier != dossier or prev_cid != text_cid:
                    if prev_mtime >= mtime:
                        duplicate = True
                    else:
                        prev_row_dict = db.one("""
                            SELECT *
                              FROM {0}
                             WHERE id = ?
                        """.format(table), (text_id,), to_dict=True)
                        data = {table: prev_row_dict}
                        data['liens'] = list(db.all("""
                            SELECT *
                              FROM liens
                             WHERE src_id = ? AND NOT _reversed
                                OR dst_id = ? AND _reversed
                        """, (text_id, text_id), to_dict=True))
                        if table == 'sections':
                            data['sommaires'] = list(db.all("""
                                SELECT *
                                  FROM sommaires
                                 WHERE cid = ?
                                   AND parent = ?
                                   AND _source = 'section_ta_liens'
                            """, (text_id, text_id), to_dict=True))
                        elif table == 'textes_structs':
                            source = 'struct/' + text_id
                            data['sommaires'] = list(db.all("""
                                SELECT *
                                  FROM sommaires
                                 WHERE cid = ?
                                   AND _source = ?
                            """, (text_cid, source), to_dict=True))
                        data = {k: v for k, v in data.items() if v}
                        insert('duplicate_files', {
                            'id': text_id,
                            'sous_dossier': SOUS_DOSSIER_MAP[table],
                            'cid': prev_cid,
                            'dossier': prev_dossier,
                            'mtime': prev_mtime,
                            'data': json.dumps(data),
                            'other_cid': text_cid,
                            'other_dossier': dossier,
                            'other_mtime': mtime,
                        }, replace=True)
                        count_one('upsert into duplicate_files')
                elif prev_mtime == mtime:
                    skipped += 1
                    continue

            xml.feed(b''.join(entry.get_blocks()))
            root = xml.close()
            tag = root.tag
            meta = root.find('META')

            # Check the ID
            if tag == 'SECTION_TA':
                assert root.find('ID').text == text_id
            else:
                meta_commun = meta.find('META_COMMUN')
                assert meta_commun.find('ID').text == text_id
                nature = meta_commun.find('NATURE').text

            # Extract the data we want
            attrs = {}
            liens = ()
            sommaires = ()
            if tag == 'ARTICLE':
                assert nature == 'Article'
                assert table == 'articles'
                contexte = root.find('CONTEXTE/TEXTE')
                assert attr(contexte, 'cid') == text_cid
                sections = contexte.findall('.//TITRE_TM')
                if sections:
                    attrs['section'] = attr(sections[-1], 'id')
                meta_article = meta.find('META_SPEC/META_ARTICLE')
                scrape_tags(attrs, meta_article, META_ARTICLE_TAGS)
                scrape_tags(attrs, root, ARTICLE_TAGS, unwrap=True)
            elif tag == 'SECTION_TA':
                assert table == 'sections'
                scrape_tags(attrs, root, SECTION_TA_TAGS)
                section_id = text_id
                contexte = root.find('CONTEXTE/TEXTE')
                assert attr(contexte, 'cid') == text_cid
                parents = contexte.findall('.//TITRE_TM')
                if parents:
                    attrs['parent'] = attr(parents[-1], 'id')
                sommaires = [
                    {
                        'cid': text_cid,
                        'parent': section_id,
                        'element': attr(lien, 'id'),
                        'debut': attr(lien, 'debut'),
                        'fin': attr(lien, 'fin'),
                        'etat': attr(lien, 'etat'),
                        'num': attr(lien, 'num'),
                        'position': i,
                        '_source': 'section_ta_liens',
                    }
                    for i, lien in enumerate(root.find('STRUCTURE_TA'))
                ]
            elif tag == 'TEXTELR':
                assert table == 'textes_structs'
                scrape_tags(attrs, root, TEXTELR_TAGS)
                sommaires = [
                    {
                        'cid': text_cid,
                        'element': attr(lien, 'id'),
                        'debut': attr(lien, 'debut'),
                        'fin': attr(lien, 'fin'),
                        'etat': attr(lien, 'etat'),
                        'position': i,
                        '_source': 'struct/' + text_id,
                    }
                    for i, lien in enumerate(root.find('STRUCT'))
                ]
            elif tag == 'TEXTE_VERSION':
                assert table == 'textes_versions'
                attrs['nature'] = nature
                meta_spec = meta.find('META_SPEC')
                meta_chronicle = meta_spec.find('META_TEXTE_CHRONICLE')
                assert meta_chronicle.find('CID').text == text_cid
                scrape_tags(attrs, meta_chronicle, META_CHRONICLE_TAGS)
                meta_version = meta_spec.find('META_TEXTE_VERSION')
                scrape_tags(attrs, meta_version, META_VERSION_TAGS)
                scrape_tags(attrs, root, TEXTE_VERSION_TAGS, unwrap=True)
            else:
                raise Exception('unexpected tag: '+tag)

            if process_links and tag in ('ARTICLE', 'TEXTE_VERSION'):
                e = root if tag == 'ARTICLE' else meta_version
                liens_tags = e.find('LIENS')
                if liens_tags is not None:
                    liens = []
                    for lien in liens_tags:
                        typelien, sens = attr(lien, 'typelien'), attr(lien, 'sens')
                        src_id, dst_id = text_id, attr(lien, 'id')
                        if sens == 'cible':
                            assert dst_id
                            src_id, dst_id = dst_id, src_id
                            dst_cid = dst_titre = ''
                            typelien = TYPELIEN_MAP.get(typelien, typelien+'_R')
                            _reversed = True
                        else:
                            dst_cid = attr(lien, 'cidtexte')
                            dst_titre = lien.text
                            _reversed = False
                        liens.append({
                            'src_id': src_id,
                            'dst_cid': dst_cid,
                            'dst_id': dst_id,
                            'dst_titre': dst_titre,
                            'typelien': typelien,
                            '_reversed': _reversed,
                        })

            if duplicate:
                data = {table: attrs}
                if liens:
                    data['liens'] = liens
                if sommaires:
                    data['sommaires'] = sommaires
                insert('duplicate_files', {
                    'id': text_id,
                    'sous_dossier': SOUS_DOSSIER_MAP[table],
                    'cid': text_cid,
                    'dossier': dossier,
                    'mtime': mtime,
                    'data': json.dumps(data),
                    'other_cid': prev_cid,
                    'other_dossier': prev_dossier,
                    'other_mtime': prev_mtime,
                }, replace=True)
                count_one('upsert into duplicate_files')
                continue

            attrs['dossier'] = dossier
            attrs['cid'] = text_cid
            attrs['mtime'] = mtime

            if prev_row:
                # Delete the associated rows
                if tag == 'SECTION_TA':
                    db.run("""
                        DELETE FROM sommaires
                         WHERE cid = ?
                           AND parent = ?
                           AND _source = 'section_ta_liens'
                    """, (text_cid, section_id))
                    count(counts, 'delete from sommaires', db.changes())
                elif tag == 'TEXTELR':
                    db.run("""
                        DELETE FROM sommaires
                         WHERE cid = ?
                           AND _source = ?
                    """, (text_cid, 'struct/' + text_id))
                    count(counts, 'delete from sommaires', db.changes())
                if tag in ('ARTICLE', 'TEXTE_VERSION'):
                    db.run("""
                        DELETE FROM liens
                         WHERE src_id = ? AND NOT _reversed
                            OR dst_id = ? AND _reversed
                    """, (text_id, text_id))
                    count(counts, 'delete from liens', db.changes())
                if table == 'textes_versions':
                    db.run("DELETE FROM textes_versions_brutes WHERE id = ?", (text_id,))
                    count(counts, 'delete from textes_versions_brutes', db.changes())
                # Update the row
                count_one('update in '+table)
                update(table, dict(id=text_id), attrs)
            else:
                count_one('insert into '+table)
                attrs['id'] = text_id
                insert(table, attrs)

            # Insert the associated rows
            for lien in liens:
                db.insert('liens', lien)
            count(counts, 'insert into liens', len(liens))
            for sommaire in sommaires:
                db.insert('sommaires', sommaire)
            count(counts, 'insert into sommaires', len(sommaires))

    print("made", sum(counts.values()), "changes in the database:",
          json.dumps(counts, indent=4, sort_keys=True))

    if skipped:
        print("skipped", skipped, "files that haven't changed")

    if unknown_folders:
        for d, x in unknown_folders.items():
            print("skipped", x, "files in unknown folder `%s`" % d)

    if liste_suppression:
        suppress(get_table, db, liste_suppression)