def test_file_time_setters(archfmt, timefmt, tmpdir):
    has_birthtime = archfmt != 'zip'

    # Create an archive of our libarchive/ directory
    archive_path = tmpdir.join('/test.{0}'.format(archfmt)).strpath
    archive2_path = tmpdir.join('/test2.{0}'.format(archfmt)).strpath
    with file_writer(archive_path, archfmt) as archive1:
        archive1.add_files('libarchive/')

    atimestamp = (1482144741, 495628118)
    mtimestamp = (1482155417, 659017086)
    ctimestamp = (1482145211, 536858081)
    btimestamp = (1482144740, 495628118)
    with file_reader(archive_path) as archive1:
        with file_writer(archive2_path, archfmt) as archive2:
            for entry in archive1:
                entry.set_atime(*atimestamp)
                entry.set_mtime(*mtimestamp)
                entry.set_ctime(*ctimestamp)
                if has_birthtime:
                    entry.set_birthtime(*btimestamp)
                archive2.add_entries([entry])

    with file_reader(archive2_path) as archive2:
        for entry in archive2:
            assert entry.atime == time_check(atimestamp, timefmt)
            assert entry.mtime == time_check(mtimestamp, timefmt)
            assert entry.ctime == time_check(ctimestamp, timefmt)
            if has_birthtime:
                assert entry.birthtime == time_check(btimestamp, timefmt)
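Note: these pytest examples rely on python-libarchive-c's top-level helpers; a minimal sketch of the imports they appear to assume is shown below (time_check, treestat, stat_dict and check_atime_ctime are assumed to be local test-suite helpers, not part of the libarchive package API).

# Minimal import sketch for the test examples above and below (assumption:
# the helper names come from the tests' own utility module, not libarchive).
from libarchive import file_reader, file_writer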
Example #2
def test_file_time_setters(archfmt, timefmt, tmpdir):
    has_birthtime = archfmt != 'zip'

    # Create an archive of our libarchive/ directory
    archive_path = tmpdir.join('/test.{0}'.format(archfmt)).strpath
    archive2_path = tmpdir.join('/test2.{0}'.format(archfmt)).strpath
    with file_writer(archive_path, archfmt) as archive1:
        archive1.add_files('libarchive/')

    atimestamp = (1482144741, 495628118)
    mtimestamp = (1482155417, 659017086)
    ctimestamp = (1482145211, 536858081)
    btimestamp = (1482144740, 495628118)
    with file_reader(archive_path) as archive1:
        with file_writer(archive2_path, archfmt) as archive2:
            for entry in archive1:
                entry.set_atime(*atimestamp)
                entry.set_mtime(*mtimestamp)
                entry.set_ctime(*ctimestamp)
                if has_birthtime:
                    entry.set_birthtime(*btimestamp)
                archive2.add_entries([entry])

    with file_reader(archive2_path) as archive2:
        for entry in archive2:
            assert entry.atime == time_check(atimestamp, timefmt)
            assert entry.mtime == time_check(mtimestamp, timefmt)
            assert entry.ctime == time_check(ctimestamp, timefmt)
            if has_birthtime:
                assert entry.birthtime == time_check(btimestamp, timefmt)
Example #3
def read_file(full, filename):
    res = Result(filename)

    with file_reader(filename) as pkg:
        for entry in pkg:
            # break if any of the files are not secure, speeding up scanning
            if not full and res.not_secure:
                break

            if not entry.isfile:
                continue
            if not any(entry.name.startswith(d) for d in VALID_DIRS):
                continue

            fp = BytesIO(b''.join(entry.get_blocks()))
            elf = Elf(fp)

            if not elf.is_elf():
                continue

            if not elf.pie():
                res.nopie.append(entry.name)
            if not elf.is_relro():
                res.norelro.append(entry.name)
            if not elf.canary():
                res.nocanary.append(entry.name)

    return res
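Result, Elf and VALID_DIRS are not defined in this excerpt; the following is a purely hypothetical sketch of the Result container, reconstructed from how it is used above.

class Result:
    # Hypothetical reconstruction: collects the names of ELF files that fail
    # each hardening check (PIE, RELRO, stack canary).
    def __init__(self, filename):
        self.filename = filename
        self.nopie = []
        self.norelro = []
        self.nocanary = []

    @property
    def not_secure(self):
        # True as soon as any file has failed at least one check.
        return bool(self.nopie or self.norelro or self.nocanary)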
Example #4
def extract_hashing_feature_from_compressed_data(input_file, num_entries,
                                                 output_path):
    hv = HashingVectorizer(encoding='ISO-8859-2')

    count_total = num_entries
    # transform to hashing vector
    print('Start transform data to hashing vector')
    t0 = time.time()
    count = 0
    with libarchive.file_reader(input_file) as archive:
        for entry in archive:
            count += 1
            if entry.pathname.find('.bytes') != -1:
                text = [
                    " ".join([
                        block.replace('\r\n', ' ')
                        for block in entry.get_blocks()
                    ])
                ]
                util.save_data_to_npz(output_path + entry.pathname,
                                      hv.transform(text))
            t = time.time() - t0
            print('Transform:\tfiles: ' + str(count_total - count) + '/' + str(count_total)
                  + '\tElapsed time: ' + str(t) + '(s)'
                  + '\tTime left: ' + str((t / count) * (count_total - count)) + '(s)')
Example #5
    def provision(self, dst, clean_target=True, keep_rpm=False):
        rpm_file = os.path.join(self.source_dir, os.path.basename(self.source))

        if clean_target:
            tmp_rpm = tempfile.NamedTemporaryFile().name
            shutil.move(rpm_file, tmp_rpm)
            shutil.rmtree(dst)
            os.makedirs(dst)
            shutil.move(tmp_rpm, rpm_file)

        # Ensure dst does not have trailing slash
        dst = dst.rstrip('/')
        # Open the RPM file and extract it to destination
        with libarchive.file_reader(rpm_file) as rpm:
            for rpm_file_entry in rpm:
                # Binary RPM archive data has paths starting with ./ to support
                # relocation if enabled in the building of RPMs
                rpm_file_entrypath = rpm_file_entry.pathname.lstrip('./')
                rpm_file_entrypath = rpm_file_entrypath.lstrip('/')
                rpm_file_entry.pathname = os.path.join(dst, rpm_file_entrypath)
                # XXX: libarchive frees the entry at the end of loop iterations
                # See https://github.com/Changaco/python-libarchive-c/issues/43
                libarchive.extract.extract_entries([rpm_file_entry])

        if not keep_rpm:
            os.remove(rpm_file)
Example #6
def get_entries(location):
    """
    Using the archive file at `location`, return an iterable of name->value
    mappings of each libarchive.ArchiveEntry object's essential attributes.
    Paths are decoded with surrogate escapes because JSON is UTF-8 and
    cannot handle arbitrary binary path data.
    """
    with file_reader(location) as arch:
        for entry in arch:
            # libarchive introduces prefixes such as h prefix for
            # hardlinks: tarfile does not, so we ignore the first char
            mode = entry.strmode[1:].decode('ascii')
            yield {
                'path': surrogate_decode(entry.pathname),
                'mtime': entry.mtime,
                'size': entry.size,
                'mode': mode,
                'isreg': entry.isreg,
                'isdir': entry.isdir,
                'islnk': entry.islnk,
                'issym': entry.issym,
                'linkpath': surrogate_decode(entry.linkpath),
                'isblk': entry.isblk,
                'ischr': entry.ischr,
                'isfifo': entry.isfifo,
                'isdev': entry.isdev,
            }
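surrogate_decode is not shown above; a plausible minimal version, assuming it only makes byte paths JSON-safe with surrogate escapes, could be:

def surrogate_decode(value):
    # Assumed helper: decode bytes with surrogate escapes so arbitrary
    # binary path data survives a round trip through UTF-8/JSON.
    if isinstance(value, bytes):
        return value.decode('utf-8', errors='surrogateescape')
    return value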
Example #7
def extract_resource(source, destiny_path=None):
    root_dir = destiny_path or '/'.join(source.split('/')[:-1])

    def full_path(local_path):
        return '/'.join((root_dir, str(local_path)))

    def deep_mkdir(dir_path):
        steps = dir_path.split('/')
        for i in range(len(steps)):
            step = '/'.join((root_dir, *(s for s in steps[:i + 1] if s)))
            if not os.path.isdir(step):
                os.mkdir(step)

    files = []

    with libarchive.file_reader(source) as arch:
        for entry in arch:
            if entry.isdir:
                deep_mkdir(str(entry))
            else:
                path = str(entry).rsplit('/', 1)
                if len(path) > 1 and not os.path.isdir(full_path(path[0])):
                    deep_mkdir(path[0])
                resource_path = '/'.join((root_dir, *path))

                with open(resource_path, 'wb') as f:
                    for block in entry.get_blocks():
                        f.write(block)
                files.append(resource_path)

    return tuple(files)
Example #8
File: _rpm.py  Project: 3v1n0/snapcraft
    def provision(self, dst, clean_target=True, keep_rpm=False):
        rpm_file = os.path.join(self.source_dir, os.path.basename(self.source))

        if clean_target:
            tmp_rpm = tempfile.NamedTemporaryFile().name
            shutil.move(rpm_file, tmp_rpm)
            shutil.rmtree(dst)
            os.makedirs(dst)
            shutil.move(tmp_rpm, rpm_file)

        # Ensure dst does not have trailing slash
        dst = dst.rstrip('/')
        # Open the RPM file and extract it to destination
        with libarchive.file_reader(rpm_file) as rpm:
            for rpm_file_entry in rpm:
                # Binary RPM archive data has paths starting with ./ to support
                # relocation if enabled in the building of RPMs
                rpm_file_entrypath = rpm_file_entry.pathname.lstrip('./')
                rpm_file_entrypath = rpm_file_entrypath.lstrip('/')
                rpm_file_entry.pathname = os.path.join(dst, rpm_file_entrypath)
                # XXX: libarchive frees the entry at the end of loop iterations
                # See https://github.com/Changaco/python-libarchive-c/issues/43
                libarchive.extract.extract_entries([rpm_file_entry])

        if not keep_rpm:
            os.remove(rpm_file)
Example #9
def get_entries(location):
    """
    Using the archive file at `location`, return an iterable of name->value
    mappings of each libarchive.ArchiveEntry object's essential attributes.
    Paths are decoded with surrogate escapes because JSON is UTF-8 and
    cannot handle arbitrary binary path data.
    """
    with file_reader(location) as arch:
        for entry in arch:
            # libarchive introduces prefixes such as h prefix for
            # hardlinks: tarfile does not, so we ignore the first char
            mode = entry.strmode[1:].decode('ascii')
            yield {
                'path': surrogate_decode(entry.pathname),
                'mtime': entry.mtime,
                'size': entry.size,
                'mode': mode,
                'isreg': entry.isreg,
                'isdir': entry.isdir,
                'islnk': entry.islnk,
                'issym': entry.issym,
                'linkpath': surrogate_decode(entry.linkpath),
                'isblk': entry.isblk,
                'ischr': entry.ischr,
                'isfifo': entry.isfifo,
                'isdev': entry.isdev,
                'uid': entry.uid,
                'gid': entry.gid
            }
Example #10
 def get_filtered_members(self):
     try:
         with libarchive.file_reader(self.source.path) as archive:
             for entry in archive:
                 if any_excluded(entry.pathname):
                     continue
                 yield entry.pathname, self.get_subclass(entry)
     except libarchive.exception.ArchiveError:
         pass
Example #11
def extract_file(filepath, flags=0, callback=lambda x: None, imginfo=(), extractlist=None):
    """Extracts an archive from a file into the current directory."""
    totalsize = 0
    for img in imginfo:
        if not imginfo[img]:
            continue
        totalsize += imginfo[img]
    with libarchive.file_reader(filepath) as archive:
        extract_entries(archive, flags, callback, totalsize, extractlist)
Example #12
def test_entry_name_decoding(tar_file):
    """ Test that the entry names are decoded to utf8 correctly """
    path = join(data_dir, tar_file)

    with file_reader(path) as arch:
        for entry in arch:
            # Use str find method to test it was converted from bytes to a
            # str/unicode
            entry.name.find('not there')
Example #13
def main(argv):
    args = parse_args(argv)

    for filename in args.FILE:
        basedir = os.path.dirname(filename)
        basename = os.path.basename(filename)
        original_filename, ext = os.path.splitext(basename)

        lst = []
        try:
            with libarchive.file_reader(filename) as archive:
                for entry in archive:
                    if entry.isfile:
                        lst.append(str(entry))
        except libarchive.exception.ArchiveError as err:
            print("{}: error: couldn't process archive".format(filename))
            continue

        if args.basename:
            lst = [os.path.basename(p) for p in lst]

        common = common_prefix(lst, args.threshold)

        if len(common) < args.min_length:
            print("{}: error: could not find common prefix: '{}'".format(filename, common))
            continue

        if args.strip:
            common = strip_unimportant(common)

        if args.preserve:
            common = original_filename + "_" + common

        if not args.ignore_ext:
            common += ext

        if args.move:
            outfilename = os.path.join(basedir, common)
            if args.interactive:
                print("rename", filename, "->", common, "(y/n)?")
                answer = None
                while answer not in ["y", "n"]:
                    answer = input()
                if answer == "y":
                    print(filename, "->", common)
                    rename_safe(filename, outfilename)
                else:
                    print("ignoring", filename)
            else:
                print(filename, "->", common)
                rename_safe(filename, outfilename)
        elif args.verbose or len(args.FILE) > 1:
            print(filename, "->", common)
        else:
            print(common)
Example #14
def list_libarchive(path, ignore_errors=False):
    try:
        with libarchive.file_reader(path) as archive:
            for entry in archive:
                name_and_link = entry.name
                if entry.issym:
                    name_and_link = '{entry.name} -> {entry.linkname}'.format(
                        entry=entry
                    )
                if Config().exclude_directory_metadata == 'recursive':
                    yield '{name_and_link}\n'.format(
                        name_and_link=name_and_link
                    )
                    continue
                if entry.isblk or entry.ischr:
                    size_or_dev = '{major:>3},{minor:>3}'.format(
                        major=entry.rdevmajor, minor=entry.rdevminor
                    )
                else:
                    size_or_dev = entry.size
                mtime = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.gmtime(entry.mtime)
                ) + '.{:06d}'.format(entry.mtime_nsec // 1000)
                if entry.uname:
                    user = '{user:<8} {uid:>7}'.format(
                        user=entry.uname.decode(
                            'utf-8', errors='surrogateescape'
                        ),
                        uid='({})'.format(entry.uid),
                    )
                else:
                    user = entry.uid
                if entry.gname:
                    group = '{group:<8} {gid:>7}'.format(
                        group=entry.gname.decode(
                            'utf-8', errors='surrogateescape'
                        ),
                        gid='({})'.format(entry.gid),
                    )
                else:
                    group = entry.gid
                yield '{strmode} {entry.nlink:>3} {user:>8} {group:>8} {size_or_dev:>8} {mtime:>8} {name_and_link}\n'.format(
                    strmode=entry.strmode.decode('us-ascii'),
                    entry=entry,
                    user=user,
                    group=group,
                    size_or_dev=size_or_dev,
                    mtime=mtime,
                    name_and_link=name_and_link,
                )
    except libarchive.exception.ArchiveError:
        if not ignore_errors:
            raise
Example #15
 def extract(self, member_name, dest_dir):
     dest_path = os.path.join(dest_dir, os.path.basename(member_name))
     logger.debug('libarchive extracting %s to %s', member_name, dest_path)
     with libarchive.file_reader(self.source.path) as archive:
         for entry in archive:
             if entry.pathname == member_name:
                 logger.debug('entry found, writing %s', dest_path)
                 with open(dest_path, 'wb') as f:
                     for buf in entry.get_blocks():
                         f.write(buf)
                 return dest_path
     raise KeyError('%s not found in archive' % member_name)
Example #16
 def iterate_archive_chunk_entries(self, chunk_start_idx, chunk_end_idx):
     with libarchive.file_reader(self.archive_path) as archive:
         consume(archive, chunk_start_idx)  # skips first n
         progressbar_iterator = progressbar(archive,
                                            total=chunk_end_idx -
                                            chunk_start_idx)
         idx = chunk_start_idx
         for entry in progressbar_iterator:
             if idx > chunk_end_idx:
                 progressbar_iterator.refresh()
                 break
             idx += 1
             yield entry
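consume is not defined in this excerpt; it presumably behaves like the standard itertools "consume" recipe, advancing an iterator n steps so the first entries of the chunk are skipped. A minimal sketch under that assumption:

from itertools import islice

def consume(iterator, n):
    # Advance the iterator n steps ahead (classic itertools recipe);
    # here it skips the first n archive entries of the chunk.
    next(islice(iterator, n, n), None)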
Example #17
def test_file_atime_ctime(archfmt, timefmt, tmpdir):
    archive_path = "{0}/test.{1}".format(tmpdir.strpath, archfmt)

    # Collect information on what should be in the archive
    tree = treestat('libarchive', stat_dict)

    # Create an archive of our libarchive/ directory
    with file_writer(archive_path, archfmt) as archive:
        archive.add_files('libarchive/')

    # Read the archive and check that the data is correct
    with file_reader(archive_path) as archive:
        check_atime_ctime(archive, tree, timefmt=timefmt)
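treestat, stat_dict and check_atime_ctime are test-suite helpers that are not shown here; as an illustration only, a treestat-like helper could look roughly like this (not the library's actual implementation):

import os

def treestat(directory, stat_func=os.stat):
    # Walk the tree and record a stat-derived value per path, so the
    # archive's contents can later be compared against the filesystem.
    tree = {}
    for root, dirs, files in os.walk(directory):
        for name in dirs + files:
            path = os.path.join(root, name)
            tree[path] = stat_func(path)
    return tree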
Example #18
 def get_member(self, member_name):
     with libarchive.file_reader(self.source.path) as archive:
         for entry in archive:
             if entry.pathname == member_name:
                 if entry.isdir:
                     return LibarchiveDirectory(self, entry)
                 elif entry.issym:
                     return LibarchiveSymlink(self, entry)
                 elif entry.isblk or entry.ischr:
                     return LibarchiveDevice(self, entry)
                 else:
                     return LibarchiveMember(self, entry)
     raise KeyError('%s not found in archive' % member_name)
Example #19
    def iter_contents(self):
        """
        Generator for listing the archive contents
        """

        with libarchive.file_reader(str(self.archive)) as archive:
            for filename in archive:
                if not filename.isfile:
                    continue
                filepath = Path(filename.pathname)
                if not ImageSupported.is_image_file(filepath):
                    continue
                yield filepath
Example #20
 def get_member(self, member_name):
     with libarchive.file_reader(self.source.path) as archive:
         for entry in archive:
             if entry.pathname == member_name:
                 if entry.isdir:
                     return LibarchiveDirectory(self, entry)
                 elif entry.issym:
                     return LibarchiveSymlink(self, entry)
                 elif entry.isblk or entry.ischr:
                     return LibarchiveDevice(self, entry)
                 else:
                     return LibarchiveMember(self, entry)
     raise KeyError('%s not found in archive' % member_name)
Example #21
def test_file_atime_ctime(tmpdir):
    archive_path = tmpdir.strpath + '/test.zip'

    # Collect information on what should be in the archive
    tree = treestat('libarchive', stat_dict)

    # Create an archive of our libarchive/ directory
    with file_writer(archive_path, 'zip') as archive:
        archive.add_files('libarchive/')

    # Read the archive and check that the data is correct
    with file_reader(archive_path) as archive:
        check_atime_ctime(archive, tree)
Example #22
 def extractFiles(self):
     # TODO: globbing or regex on self.paths?
     # If we have yum, we can, TECHNICALLY, do this with:
     # http://yum.baseurl.org/api/yum/rpmUtils/miscutils.html#rpmUtils.miscutils.rpm2cpio
     # But nope. We can't selectively decompress members based on path with rpm2cpio-like funcs.
     # We keep getting extraction artefacts, at least with legacy libarchive_c, so we use a hammer.
     _curdir = os.getcwd()
     _tempdir = tempfile.mkdtemp()
     os.chdir(_tempdir)
     for rpm_file in self.rpms:
         rf = self.rpms[rpm_file]
         if is_ctype:
             if not is_legacy:
                 # ctype - extracts to pwd
                 with libarchive.file_reader(rf) as reader:
                     for entry in reader:
                         if self.paths and entry.path not in self.paths:
                             continue
                         if entry.isdir:
                             continue
                         fpath = os.path.join(self.dest_dir, rpm_file, entry.path)
                         if not os.path.isdir(os.path.dirname(fpath)):
                             os.makedirs(os.path.dirname(fpath))
                         with open(fpath, 'wb') as f:
                             for b in entry.get_blocks():
                                 f.write(b)
             else:
                 with libarchive.Archive(rf) as reader:
                     for entry in reader:
                         if (self.paths and entry.pathname not in self.paths) or (entry.isdir()):
                             continue
                         fpath = os.path.join(self.dest_dir, rpm_file, entry.pathname)
                         if not os.path.isdir(os.path.dirname(fpath)):
                             os.makedirs(os.path.dirname(fpath))
                         reader.readpath(fpath)
         else:
             # pyEasyArchive/"pypi/libarchive"
             with lap.file_reader(rf) as reader:
                 for entry in reader:
                     if (self.paths and entry.pathname not in self.paths) or (entry.filetype.IFDIR):
                         continue
                     fpath = os.path.join(self.dest_dir, rpm_file, entry.pathname)
                     if not os.path.isdir(os.path.dirname(fpath)):
                         os.makedirs(os.path.dirname(fpath))
                     with open(fpath, 'wb') as f:
                         for b in entry.get_blocks():
                             f.write(b)
     os.chdir(_curdir)
     shutil.rmtree(_tempdir)
     return()
Example #23
def scan_iso(filename):
    filesizes = {}
    filecontents = {}
    with libarchive.file_reader(filename) as reader:
        for ent in reader:
            if str(ent).endswith('TRANS.TBL'):
                continue
            eventlet.sleep(0)
            filesizes[str(ent)] = ent.size
            if str(ent) in READFILES:
                filecontents[str(ent)] = b''
                for block in ent.get_blocks():
                    filecontents[str(ent)] += bytes(block)
    return filesizes, filecontents
Example #24
def test_file_time_setters(tmpdir):
    # Create an archive of our libarchive/ directory
    archive_path = tmpdir.join('/test.zip').strpath
    archive2_path = tmpdir.join('/test2.zip').strpath

    atimestamp = (1482144741, 495628118)
    mtimestamp = (1482155417, 659017086)
    ctimestamp = (1482145211, 536858081)
    with file_writer(archive_path, "zip") as archive1:
        archive1.add_files('libarchive/')

    with file_reader(archive_path) as archive1:
        with file_writer(archive2_path, "zip") as archive2:
            for entry in archive1:
                entry.set_atime(*atimestamp)
                entry.set_mtime(*mtimestamp)
                entry.set_ctime(*ctimestamp)
                archive2.add_entries([entry])

    with file_reader(archive2_path) as archive2:
        for entry in archive2:
            assert entry.atime == atimestamp[0]
            assert entry.mtime == mtimestamp[0]
            assert entry.ctime == ctimestamp[0]
Example #25
 def extract(self, member_name, dest_dir):
     dest_name = os.path.basename(member_name)
     if not dest_name:
         raise ValueError('member_name should not be a directory')
     dest_path = os.path.join(dest_dir, dest_name)
     logger.debug('libarchive extracting %s to %s', member_name, dest_path)
     with libarchive.file_reader(self.source.path) as archive:
         for entry in archive:
             if entry.pathname == member_name:
                 logger.debug('entry found, writing %s', dest_path)
                 with open(dest_path, 'wb') as f:
                     for buf in entry.get_blocks():
                         f.write(buf)
                 return dest_path
     raise KeyError('%s not found in archive' % member_name)
Example #26
def test_sparse_formats(name):
    """ test for a good sparse map from all of the various sparse formats """
    path = join(data_dir, name)
    expected_map = [(4096, 4096), (12288, 4096), (20480, 4096), (28672, 4096),
                    (36864, 4096), (45056, 4096), (53248, 4096),
                    (61440, 4096), (69632, 4096), (77824, 4096), (86016, 0)]

    with file_reader(path) as arch:
        for entry in arch:
            try:
                if entry.name.startswith('gnu/sparse'):
                    assert entry.size == 86016
                    assert entry.sparse_map == expected_map
            except UnicodeDecodeError:
                # py27 fails on some unicode
                pass
Example #27
    def get_content(self) -> Generator[str, None, None]:
        """Yields processed pieces of content"""
        # https://github.com/Changaco/python-libarchive-c#usage
        try:
            # pylint:disable=import-outside-toplevel
            import libarchive
        except AttributeError as ex:
            # AttributeError: undefined symbol: archive_errno
            raise DumpError(
                "Failed to import libarchive with 7zip support") from ex

        with self.fetch() as handler:
            with libarchive.file_reader(handler.name) as archive:
                for entry in archive:
                    for block in entry.get_blocks():
                        yield block
Example #28
 def syn2map(self, filename):
     files = os.listdir(self.startDir + '/engine/maps')
     for file in files:
         if fnmatch.fnmatch(file, filename):
             print("Actual Mapname=" + file)
             with libarchive.file_reader(self.startDir + '/engine/maps/' +
                                         file) as reader:
                 for e in reader:
                     # (The entry evaluates to a filename.)
                     print(e)
                     if e.name[-3:] == 'smf':
                         print("real map name: " + e.name)
                         filename = e.name
                         break
             break
     print("returning name" + filename[5:-4])
     return filename[5:-4]
Example #29
def get_problematic_files(archive, query):
    """Search for the files inside archive with the first line
    matching given query. Some of the files can contain data, which
    are not in the plain text format. Bytes are read from the file and
    the shebang query has to be of the same type.
    """
    problematic = set()
    with libarchive.file_reader(archive) as a:
        for entry in a:
            try:
                first_line = next(entry.get_blocks(), '').splitlines()[0]
            except IndexError:
                continue  # file is empty
            if matches(first_line, query.encode()):
                problematic.add(entry.pathname.lstrip('.'))

    return problematic
Example #30
def get_problematic_files(archive, query):
    """Search for the files inside archive with the first line
    matching given query. Some of the files can contain data, which
    are not in the plain text format. Bytes are read from the file and
    the shebang query is encoded as well. We only test for ASCII shebangs.
    """
    problematic = set()
    with libarchive.file_reader(archive) as a:
        for entry in a:
            try:
                first_line = next(entry.get_blocks(), '').splitlines()[0]
            except IndexError:
                continue  # file is empty
            if matches(first_line, query.encode('ascii')):
                problematic.add(entry.pathname.lstrip('.'))

    return problematic
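matches() is not included in either variant above; a plausible sketch, assuming it simply checks whether the (bytes) first line is a shebang mentioning the encoded query:

def matches(first_line, query):
    # Assumed helper: both arguments are bytes; report whether the first
    # line is a shebang that refers to the queried interpreter.
    return first_line.startswith(b'#!') and query in first_line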
Example #31
    def ensure_unpacked(self):
        if hasattr(self, '_members'):
            return

        tmpdir = get_temporary_directory().name
        self._members = collections.OrderedDict()

        logger.debug("Extracting %s to %s", self.source.path, tmpdir)

        with libarchive.file_reader(self.source.path) as archive:
            for idx, entry in enumerate(archive):
                # Always skip directories
                if entry.isdir:
                    continue

                # Save extracting excluded files
                if any_excluded(entry.pathname):
                    continue

                # Keep directory sizes small. could be improved but should be
                # good enough for "ordinary" large archives.
                dst = os.path.join(tmpdir, str(idx // 4096), str(idx % 4096))
                root, ext = os.path.splitext(entry.pathname)
                dst += ext
                # Maintain a mapping of archive path to the extracted path,
                # avoiding the need to sanitise filenames.
                self._members[entry.pathname] = dst

                logger.debug("Extracting %s to %s", entry.pathname, dst)

                os.makedirs(os.path.dirname(dst), exist_ok=True)
                try:
                    with open(dst, 'wb') as f:
                        for block in entry.get_blocks():
                            f.write(block)
                except Exception as exc:
                    raise ContainerExtractionError(entry.pathname, exc)

        logger.debug(
            "Extracted %d entries from %s to %s",
            len(self._members),
            self.source.path,
            tmpdir,
        )
Example #32
def test_files(tmpdir):
    archive_path = tmpdir.strpath+'/test.tar.gz'

    # Collect information on what should be in the archive
    tree = treestat('libarchive')

    # Create an archive of our libarchive/ directory
    with libarchive.file_writer(archive_path, 'ustar', 'gzip') as archive:
        archive.add_files('libarchive/')

    # Read the archive and check that the data is correct
    with libarchive.file_reader(archive_path) as archive:
        check_archive(archive, tree)

    # Extract the archive in tmpdir and check that the data is intact
    with in_dir(tmpdir.strpath):
        flags = EXTRACT_OWNER | EXTRACT_PERM | EXTRACT_TIME
        libarchive.extract_file(archive_path, flags)
        tree2 = treestat('libarchive')
        assert tree2 == tree
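The EXTRACT_* flags used above are defined in python-libarchive-c's extract module; the imports this test presumably relies on are roughly:

import libarchive
from libarchive.extract import EXTRACT_OWNER, EXTRACT_PERM, EXTRACT_TIME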
Example #33
def test_files(tmpdir):
    archive_path = tmpdir.strpath+'/test.tar.gz'

    # Collect information on what should be in the archive
    tree = treestat('libarchive')

    # Create an archive of our libarchive/ directory
    with libarchive.file_writer(archive_path, 'ustar', 'gzip') as archive:
        archive.add_files('libarchive/')

    # Read the archive and check that the data is correct
    with libarchive.file_reader(archive_path) as archive:
        check_archive(archive, tree)

    # Extract the archive in tmpdir and check that the data is intact
    with in_dir(tmpdir.strpath):
        flags = EXTRACT_OWNER | EXTRACT_PERM | EXTRACT_TIME
        libarchive.extract_file(archive_path, flags)
        tree2 = treestat('libarchive')
        assert tree2 == tree
Example #34
def list_libarchive(path):
    with libarchive.file_reader(path) as archive:
        for entry in archive:
            if entry.isblk or entry.ischr:
                size_or_dev = '{major:>3},{minor:>3}'.format(major=entry.rdevmajor, minor=entry.rdevminor)
            else:
                size_or_dev = entry.size
            mtime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(entry.mtime)) + '.{:06d}'.format(entry.mtime_nsec // 1000)
            if entry.issym:
                name_and_link = '{entry.name} -> {entry.linkname}'.format(entry=entry)
            else:
                name_and_link = entry.name
            if entry.uname:
                user = '{user:<8} {uid:>7}'.format(user=entry.uname.decode('utf-8', errors='surrogateescape'), uid='({})'.format(entry.uid))
            else:
                user = entry.uid
            if entry.gname:
                group = '{group:<8} {gid:>7}'.format(group=entry.gname.decode('utf-8', errors='surrogateescape'), gid='({})'.format(entry.gid))
            else:
                group = entry.gid
            yield '{strmode} {entry.nlink:>3} {user:>8} {group:>8} {size_or_dev:>8} {mtime:>8} {name_and_link}\n'.format(strmode=entry.strmode.decode('us-ascii'), entry=entry, user=user, group=group, size_or_dev=size_or_dev, mtime=mtime, name_and_link=name_and_link)
Example #35
def list_libarchive(path):
    with libarchive.file_reader(path) as archive:
        for entry in archive:
            if entry.isblk or entry.ischr:
                size_or_dev = '{major:>3},{minor:>3}'.format(major=entry.rdevmajor, minor=entry.rdevminor)
            else:
                size_or_dev = entry.size
            mtime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(entry.mtime)) + '.{:06d}'.format(entry.mtime_nsec // 1000)
            if entry.issym:
                name_and_link = '{entry.name} -> {entry.linkname}'.format(entry=entry)
            else:
                name_and_link = entry.name
            if entry.uname:
                user = '{user:<8} {uid:>7}'.format(user=entry.uname.decode('utf-8', errors='surrogateescape'), uid='({})'.format(entry.uid))
            else:
                user = entry.uid
            if entry.gname:
                group = '{group:<8} {gid:>7}'.format(group=entry.gname.decode('utf-8', errors='surrogateescape'), gid='({})'.format(entry.gid))
            else:
                group = entry.gid
            yield '{strmode} {entry.nlink:>3} {user:>8} {group:>8} {size_or_dev:>8} {mtime:>8} {name_and_link}\n'.format(strmode=entry.strmode.decode('us-ascii'), entry=entry, user=user, group=group, size_or_dev=size_or_dev, mtime=mtime, name_and_link=name_and_link)
Example #36
def process_archive(parent, relatives, cache_path):
    filename = os.path.join(cache_path, parent.filename)

    results = list()
    try:
        with libarchive.file_reader(filename) as archive:
            for entry in archive:
                if entry.isfile:
                    switcher = {
                        'application/x-3ds-archive': process_cia,
                        'application/x-3ds-homebrew': process_tdsx,
                        'application/x-3ds-iconfile': process_smdh,
                        'application/x-3ds-arm9bin': process_arm9,
                        'application/x-3ds-xml': process_xml
                    }
                    action = switcher.get(determine_mimetype(entry.pathname), None)
                    if action:
                        working_file = os.path.join(cache_path, 'archive_root', entry.pathname)
                        working_path = '/'.join(working_file.split('/')[:-1])
                        if not os.path.isdir(working_path):
                            os.makedirs(working_path)
                        with open(working_file, 'wb') as f:
                            for block in entry.get_blocks():
                                f.write(block)
                        os.utime(working_file, (entry.mtime,entry.mtime))
                        results.append(action(parent, relatives, cache_path, entry.pathname))
    except libarchive.exception.ArchiveError as e:
        log.debug("Archive error: %s", e)

    if results:
        for result_item in results:
            # Match up any xml or smdh files in the same folder as our 3dsx.
            if result_item.__class__ in (XML, SMDH):
                matched = False
                for check_item in results:
                    if not matched:
                        matched = check_siblings(check_item, result_item)
                result_item.active = matched

    return(results)
Example #37
    def provision(self, dst, clean_target=True, keep_7z=False):
        seven_zip_file = os.path.join(self.source_dir,
                                      os.path.basename(self.source))

        if self.source_checksum:
            sources.verify_checksum(self.source_checksum, seven_zip_file)

        if clean_target:
            tmp_7z = tempfile.NamedTemporaryFile().name
            shutil.move(seven_zip_file, tmp_7z)
            shutil.rmtree(dst)
            os.makedirs(dst)
            shutil.move(tmp_7z, seven_zip_file)

        # Open the 7z file and extract it to destination
        with libarchive.file_reader(seven_zip_file) as archive:
            for file_entry in archive:
                file_entry.pathname = os.path.join(dst, file_entry.pathname)
                libarchive.extract.extract_entries([file_entry])

        if not keep_7z:
            os.remove(seven_zip_file)
Example #38
    def __init__(self, source, destiny_path=None):
        root_dir = os.path.realpath(destiny_path or self.tmp_dir)

        files = []

        with libarchive.file_reader(source) as arch:
            for entry in arch:
                entry_path = self._get_entry_path(entry)
                if entry.isdir:
                    os.makedirs(os.path.join(root_dir, entry_path), exist_ok=True)
                else:
                    path, extr_file = os.path.split(entry_path)
                    if path and not os.path.isdir(os.path.join(root_dir, path)):
                        os.makedirs(os.path.join(root_dir, path), exist_ok=True)
                    resource_path = os.path.join(root_dir, path, extr_file)

                    with open(resource_path, 'wb') as f:
                        for block in entry.get_blocks():
                            f.write(block)
                    files.append(resource_path)

        self.files = tuple(files)
Example #39
    def _extract(self, outdir: str) -> None:
        with libarchive.file_reader(self.filename) as entries:
            self.interruption_point()

            if not os.path.isdir(outdir):
                os.makedirs(outdir)

            for entry in entries:
                pathname = sanitize(entry.pathname)
                if pathname is None:
                    logger.error("skipping unsanitary entry: %s", entry.pathname)
                    continue

                # FIXME: entry.isdir doesn't look reliable, needs more testing
                if entry.isdir or entry.pathname[-1] == '/':
                    dirname = os.path.join(outdir, pathname)
                    if not os.path.isdir(dirname):
                        os.makedirs(dirname)
                    self.sig_entry_extracted.emit(entry.pathname, dirname)
                elif entry.isreg:
                    dirname = os.path.dirname(pathname)

                    if dirname:
                        dirname = os.path.join(outdir, dirname)
                        if not os.path.isdir(dirname):
                            os.makedirs(dirname)

                    while True:
                        outfile = os.path.join(outdir, pathname)
                        with open(outfile, "wb") as out:
                            blocks = entry.get_blocks()
                            for block in blocks:
                                self.interruption_point()
                                out.write(block)
                        self.sig_entry_extracted.emit(entry.pathname, outfile)
                        break
                else:
                    logger.warning("skipping non regular entry: %s", entry.pathname)
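sanitize() is not part of this excerpt; as a hedged sketch, a sanitizer for this purpose usually normalises the entry path and rejects absolute paths or '..' components so an entry cannot escape outdir (the real implementation may differ):

import os

def sanitize(pathname):
    # Hypothetical sketch: return a safe relative path, or None to signal
    # that the entry should be skipped.
    normalized = os.path.normpath(pathname)
    if os.path.isabs(normalized) or normalized.split(os.sep)[0] == '..':
        return None
    return normalized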
Example #40
 def get_member_names(self):
     with libarchive.file_reader(self.source.path) as archive:
         member_names = [entry.pathname for entry in archive]
     return member_names
Example #41
File: stats.py  Project: Changaco/legi.py
def main(args):

    avg_file_size = 0
    number_of_files = 0
    max_size = 0
    biggest_file = None
    min_size = float('inf')
    smallest_file = None
    roots = {}
    values_count = {
        'META/META_SPEC/META_ARTICLE/ETAT': {},
        'META/META_SPEC/META_ARTICLE/TYPE': {},
        'META/META_COMMUN/ORIGINE': {},
        'META/META_COMMUN/NATURE': {},
    }
    etats_par_dossier = {
        'code_en_vigueur': {},
        'code_non_vigueur': {},
        'TNC_en_vigueur': {},
        'TNC_non_vigueur': {},
    }

    parser = etree.XMLParser()
    with libarchive.file_reader(args.archive) as archive:
        for entry in archive:
            path = entry.pathname
            if path[-1] == '/':
                continue
            number_of_files += 1
            size = entry.size
            avg_file_size += size
            if size > max_size:
                biggest_file = path
                max_size = size
            if size < min_size:
                smallest_file = path
                min_size = size
            for block in entry.get_blocks():
                parser.feed(block)
            xml = parser.close()
            tag = xml.tag
            roots[tag] = roots.get(tag, 0) + 1
            for xpath, values_dict in values_count.items():
                e = xml.find(xpath)
                if e is not None:
                    v = e.text
                    values_dict[v] = values_dict.get(v, 0) + 1
            if tag != 'ARTICLE':
                continue
            d = etats_par_dossier[path.split('/')[3]]
            etat = xml.find('META/META_SPEC/META_ARTICLE/ETAT')
            etat = None if etat is None else etat.text
            d[etat] = d.get(etat, 0) + 1

    avg_file_size /= number_of_files
    biggest_file = {'path': biggest_file, 'size': max_size}
    smallest_file = {'path': smallest_file, 'size': min_size}

    stats = {
        'avg_file_size', 'number_of_files', 'biggest_file', 'smallest_file',
        'roots', 'etats_par_dossier', 'values_count'
    }
    r = {k: v for k, v in locals().items() if k in stats}
    print(json.dumps(r, indent=4, sort_keys=True))
Example #42
 def from_file(filename: str) -> 'ArchiveInfo':
     with libarchive.file_reader(filename) as entries:
         archiveinfo = ArchiveInfo(entries)
         return archiveinfo
Example #43
def process_archive(db, archive_path, process_links=True):

    # Define some constants
    ARTICLE_TAGS = set('NOTA BLOC_TEXTUEL'.split())
    SECTION_TA_TAGS = set('TITRE_TA COMMENTAIRE'.split())
    TEXTELR_TAGS = set('VERSIONS'.split())
    TEXTE_VERSION_TAGS = set('VISAS SIGNATAIRES TP NOTA ABRO RECT'.split())
    META_ARTICLE_TAGS = set('NUM ETAT DATE_DEBUT DATE_FIN TYPE'.split())
    META_CHRONICLE_TAGS = set("""
        NUM NUM_SEQUENCE NOR DATE_PUBLI DATE_TEXTE DERNIERE_MODIFICATION
        ORIGINE_PUBLI PAGE_DEB_PUBLI PAGE_FIN_PUBLI
    """.split())
    META_VERSION_TAGS = set(
        'TITRE TITREFULL ETAT DATE_DEBUT DATE_FIN AUTORITE MINISTERE'.split()
    )
    SOUS_DOSSIER_MAP = {
        'articles': 'article',
        'sections': 'section_ta',
        'textes_structs': 'texte/struct',
        'textes_versions': 'texte/version',
    }
    TABLES_MAP = {'ARTI': 'articles', 'SCTA': 'sections', 'TEXT': 'textes_'}
    TYPELIEN_MAP = {
        "ABROGATION": "ABROGE",
        "ANNULATION": "ANNULE",
        "CODIFICATION": "CODIFIE",
        "CONCORDANCE": "CONCORDE",
        "CREATION": "CREE",
        "DEPLACE": "DEPLACEMENT",
        "DISJOINT": "DISJONCTION",
        "MODIFICATION": "MODIFIE",
        "PEREMPTION": "PERIME",
        "RATIFICATION": "RATIFIE",
        "TRANSFERE": "TRANSFERT",
    }
    TYPELIEN_MAP.update([(v, k) for k, v in TYPELIEN_MAP.items()])

    # Define some shortcuts
    attr = etree._Element.get
    insert = db.insert
    update = db.update

    def get_table(parts):
        table = TABLES_MAP[parts[-1][4:8]]
        if table == 'textes_':
            table += parts[13] + 's'
        return table

    counts = {}
    def count_one(k):
        try:
            counts[k] += 1
        except KeyError:
            counts[k] = 1

    skipped = 0
    unknown_folders = {}
    liste_suppression = []
    xml = etree.XMLParser(remove_blank_text=True)
    with libarchive.file_reader(archive_path) as archive:
        for entry in tqdm(archive):
            path = entry.pathname
            if path[-1] == '/':
                continue
            parts = path.split('/')
            if parts[-1] == 'liste_suppression_legi.dat':
                liste_suppression += b''.join(entry.get_blocks()).decode('ascii').split()
                continue
            if parts[1] == 'legi':
                path = path[len(parts[0])+1:]
                parts = parts[1:]
            if not parts[2].startswith('code_et_TNC_'):
                # https://github.com/Legilibre/legi.py/issues/23
                try:
                    unknown_folders[parts[2]] += 1
                except KeyError:
                    unknown_folders[parts[2]] = 1
                continue
            dossier = parts[3]
            text_cid = parts[11]
            text_id = parts[-1][:-4]
            mtime = entry.mtime

            # Skip the file if it hasn't changed, store it if it's a duplicate
            duplicate = False
            table = get_table(parts)
            prev_row = db.one("""
                SELECT mtime, dossier, cid
                  FROM {0}
                 WHERE id = ?
            """.format(table), (text_id,))
            if prev_row:
                prev_mtime, prev_dossier, prev_cid = prev_row
                if prev_dossier != dossier or prev_cid != text_cid:
                    if prev_mtime >= mtime:
                        duplicate = True
                    else:
                        prev_row_dict = db.one("""
                            SELECT *
                              FROM {0}
                             WHERE id = ?
                        """.format(table), (text_id,), to_dict=True)
                        data = {table: prev_row_dict}
                        data['liens'] = list(db.all("""
                            SELECT *
                              FROM liens
                             WHERE src_id = ? AND NOT _reversed
                                OR dst_id = ? AND _reversed
                        """, (text_id, text_id), to_dict=True))
                        if table == 'sections':
                            data['sommaires'] = list(db.all("""
                                SELECT *
                                  FROM sommaires
                                 WHERE cid = ?
                                   AND parent = ?
                                   AND _source = 'section_ta_liens'
                            """, (text_id, text_id), to_dict=True))
                        elif table == 'textes_structs':
                            source = 'struct/' + text_id
                            data['sommaires'] = list(db.all("""
                                SELECT *
                                  FROM sommaires
                                 WHERE cid = ?
                                   AND _source = ?
                            """, (text_cid, source), to_dict=True))
                        data = {k: v for k, v in data.items() if v}
                        insert('duplicate_files', {
                            'id': text_id,
                            'sous_dossier': SOUS_DOSSIER_MAP[table],
                            'cid': prev_cid,
                            'dossier': prev_dossier,
                            'mtime': prev_mtime,
                            'data': json.dumps(data),
                            'other_cid': text_cid,
                            'other_dossier': dossier,
                            'other_mtime': mtime,
                        }, replace=True)
                        count_one('upsert into duplicate_files')
                elif prev_mtime == mtime:
                    skipped += 1
                    continue

            xml.feed(b''.join(entry.get_blocks()))
            root = xml.close()
            tag = root.tag
            meta = root.find('META')

            # Check the ID
            if tag == 'SECTION_TA':
                assert root.find('ID').text == text_id
            else:
                meta_commun = meta.find('META_COMMUN')
                assert meta_commun.find('ID').text == text_id
                nature = meta_commun.find('NATURE').text

            # Extract the data we want
            attrs = {}
            liens = ()
            sommaires = ()
            if tag == 'ARTICLE':
                assert nature == 'Article'
                assert table == 'articles'
                contexte = root.find('CONTEXTE/TEXTE')
                assert attr(contexte, 'cid') == text_cid
                sections = contexte.findall('.//TITRE_TM')
                if sections:
                    attrs['section'] = attr(sections[-1], 'id')
                meta_article = meta.find('META_SPEC/META_ARTICLE')
                scrape_tags(attrs, meta_article, META_ARTICLE_TAGS)
                scrape_tags(attrs, root, ARTICLE_TAGS, unwrap=True)
            elif tag == 'SECTION_TA':
                assert table == 'sections'
                scrape_tags(attrs, root, SECTION_TA_TAGS)
                section_id = text_id
                contexte = root.find('CONTEXTE/TEXTE')
                assert attr(contexte, 'cid') == text_cid
                parents = contexte.findall('.//TITRE_TM')
                if parents:
                    attrs['parent'] = attr(parents[-1], 'id')
                sommaires = [
                    {
                        'cid': text_cid,
                        'parent': section_id,
                        'element': attr(lien, 'id'),
                        'debut': attr(lien, 'debut'),
                        'fin': attr(lien, 'fin'),
                        'etat': attr(lien, 'etat'),
                        'num': attr(lien, 'num'),
                        'position': i,
                        '_source': 'section_ta_liens',
                    }
                    for i, lien in enumerate(root.find('STRUCTURE_TA'))
                ]
            elif tag == 'TEXTELR':
                assert table == 'textes_structs'
                scrape_tags(attrs, root, TEXTELR_TAGS)
                sommaires = [
                    {
                        'cid': text_cid,
                        'element': attr(lien, 'id'),
                        'debut': attr(lien, 'debut'),
                        'fin': attr(lien, 'fin'),
                        'etat': attr(lien, 'etat'),
                        'position': i,
                        '_source': 'struct/' + text_id,
                    }
                    for i, lien in enumerate(root.find('STRUCT'))
                ]
            elif tag == 'TEXTE_VERSION':
                assert table == 'textes_versions'
                attrs['nature'] = nature
                meta_spec = meta.find('META_SPEC')
                meta_chronicle = meta_spec.find('META_TEXTE_CHRONICLE')
                assert meta_chronicle.find('CID').text == text_cid
                scrape_tags(attrs, meta_chronicle, META_CHRONICLE_TAGS)
                meta_version = meta_spec.find('META_TEXTE_VERSION')
                scrape_tags(attrs, meta_version, META_VERSION_TAGS)
                scrape_tags(attrs, root, TEXTE_VERSION_TAGS, unwrap=True)
            else:
                raise Exception('unexpected tag: '+tag)

            if process_links and tag in ('ARTICLE', 'TEXTE_VERSION'):
                e = root if tag == 'ARTICLE' else meta_version
                liens_tags = e.find('LIENS')
                if liens_tags is not None:
                    liens = []
                    for lien in liens_tags:
                        typelien, sens = attr(lien, 'typelien'), attr(lien, 'sens')
                        src_id, dst_id = text_id, attr(lien, 'id')
                        if sens == 'cible':
                            assert dst_id
                            src_id, dst_id = dst_id, src_id
                            dst_cid = dst_titre = ''
                            typelien = TYPELIEN_MAP.get(typelien, typelien+'_R')
                            _reversed = True
                        else:
                            dst_cid = attr(lien, 'cidtexte')
                            dst_titre = lien.text
                            _reversed = False
                        liens.append({
                            'src_id': src_id,
                            'dst_cid': dst_cid,
                            'dst_id': dst_id,
                            'dst_titre': dst_titre,
                            'typelien': typelien,
                            '_reversed': _reversed,
                        })

            if duplicate:
                data = {table: attrs}
                if liens:
                    data['liens'] = liens
                if sommaires:
                    data['sommaires'] = sommaires
                insert('duplicate_files', {
                    'id': text_id,
                    'sous_dossier': SOUS_DOSSIER_MAP[table],
                    'cid': text_cid,
                    'dossier': dossier,
                    'mtime': mtime,
                    'data': json.dumps(data),
                    'other_cid': prev_cid,
                    'other_dossier': prev_dossier,
                    'other_mtime': prev_mtime,
                }, replace=True)
                count_one('upsert into duplicate_files')
                continue

            attrs['dossier'] = dossier
            attrs['cid'] = text_cid
            attrs['mtime'] = mtime

            if prev_row:
                # Delete the associated rows
                if tag == 'SECTION_TA':
                    db.run("""
                        DELETE FROM sommaires
                         WHERE cid = ?
                           AND parent = ?
                           AND _source = 'section_ta_liens'
                    """, (text_cid, section_id))
                    count(counts, 'delete from sommaires', db.changes())
                elif tag == 'TEXTELR':
                    db.run("""
                        DELETE FROM sommaires
                         WHERE cid = ?
                           AND _source = ?
                    """, (text_cid, 'struct/' + text_id))
                    count(counts, 'delete from sommaires', db.changes())
                if tag in ('ARTICLE', 'TEXTE_VERSION'):
                    db.run("""
                        DELETE FROM liens
                         WHERE src_id = ? AND NOT _reversed
                            OR dst_id = ? AND _reversed
                    """, (text_id, text_id))
                    count(counts, 'delete from liens', db.changes())
                if table == 'textes_versions':
                    db.run("DELETE FROM textes_versions_brutes WHERE id = ?", (text_id,))
                    count(counts, 'delete from textes_versions_brutes', db.changes())
                # Update the row
                count_one('update in '+table)
                update(table, dict(id=text_id), attrs)
            else:
                count_one('insert into '+table)
                attrs['id'] = text_id
                insert(table, attrs)

            # Insert the associated rows
            for lien in liens:
                db.insert('liens', lien)
            count(counts, 'insert into liens', len(liens))
            for sommaire in sommaires:
                db.insert('sommaires', sommaire)
            count(counts, 'insert into sommaires', len(sommaires))

    print("made", sum(counts.values()), "changes in the database:",
          json.dumps(counts, indent=4, sort_keys=True))

    if skipped:
        print("skipped", skipped, "files that haven't changed")

    if unknown_folders:
        for d, x in unknown_folders.items():
            print("skipped", x, "files in unknown folder `%s`" % d)

    if liste_suppression:
        suppress(get_table, db, liste_suppression)