Example #1
File: filesystem.py Project: cben/bedup
 def best_desc(self, root_id):
     if root_id not in self._best_desc:
         intpath = fsdecode(self.root_info[root_id].path)
         candidate_mis = [
             mi for mi in self.minfos
             if not mi.private and path_isprefix(mi.internal_path, intpath)]
         if candidate_mis:
             mi = max(
                 candidate_mis, key=lambda mi: len(mi.internal_path))
             base = mi.mpoint
             intbase = mi.internal_path
             is_fs_path = True
         else:
             base = self.desc
             intbase = '/'
             is_fs_path = False
         self._best_desc[root_id] = VolDesc(
             os.path.normpath(
                 os.path.join(base, os.path.relpath(intpath, intbase))),
             is_fs_path)
     return self._best_desc[root_id]
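
best_desc picks the mount point whose internal path is the longest prefix of the subvolume's internal path, then rewrites the subvolume path relative to that mount point (falling back to the filesystem description when no non-private mount matches). The path_isprefix helper is defined elsewhere in bedup; a minimal sketch of what it needs to do (component-wise prefix matching, so that /mnt/a does not count as a prefix of /mnt/ab) might look like this:

import os

def path_isprefix(prefix, path):
    # Hypothetical stand-in for bedup's path_isprefix helper: True when
    # `prefix` is `path` itself or one of its ancestor directories.
    # Whole components are compared, so '/mnt/a' is not treated as a
    # prefix of '/mnt/ab'.
    prefix = prefix.rstrip(os.sep) or os.sep
    if prefix == os.sep:
        return path.startswith(os.sep)
    return path == prefix or path.startswith(prefix + os.sep)

With something like that in place, candidate_mis keeps only the mounts that actually contain the subvolume, and max(..., key=len) selects the most specific one.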
Example #2
File: filesystem.py Project: cben/bedup
 def device_info(self):
     di = {}
     lbls = Counter()
     for line in subprocess.check_output(
         'blkid -s LABEL -s UUID -t TYPE=btrfs'.split()
     ).splitlines():
         dev, label, uuid = BLKID_RE.match(line).groups()
         uuid = UUID(hex=uuid.decode('ascii'))
         dev = fsdecode(dev)
         if label is not None:
             try:
                 label = label.decode('ascii')
             except UnicodeDecodeError:
                 # Don't try to guess.
                 pass
         if uuid in di:
             # btrfs raid
             assert di[uuid].label == label
             di[uuid].devices.append(dev)
         else:
             lbls[label] += 1
             di[uuid] = DeviceInfo(label, [dev])
     self._label_occurs = dict(lbls)
     return di
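
device_info shells out to blkid and parses lines of the form /dev/sdb1: LABEL="data" UUID="...", grouping multi-device (btrfs raid) filesystems under a single UUID. The BLKID_RE pattern is defined elsewhere in the module; a rough, hypothetical equivalent would be a bytes regex with an optional LABEL group, since check_output returns bytes and unlabelled filesystems print only a UUID:

import re

# Hypothetical approximation of the BLKID_RE used above; the real pattern
# is defined elsewhere in bedup.  The groups come out in the order the
# caller unpacks them: device, label (None when absent), uuid.
BLKID_RE = re.compile(
    br'^(?P<dev>/dev/\S+):\s+'
    br'(?:LABEL="(?P<label>[^"]*)"\s+)?'
    br'UUID="(?P<uuid>[0-9a-fA-F-]+)"')

# Example:
# BLKID_RE.match(
#     b'/dev/sdb1: LABEL="data" UUID="2d4e9a1c-3f60-4b7a-9c8e-0123456789ab"'
# ).groups()
# -> (b'/dev/sdb1', b'data', b'2d4e9a1c-3f60-4b7a-9c8e-0123456789ab')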
Example #3
File: tracking.py Project: cben/bedup
def dedup_tracked1(sess, tt, ofile_reserved, query, fs):
    space_gain = 0
    ofile_soft, ofile_hard = resource.getrlimit(resource.RLIMIT_OFILE)

    # Hopefully close any files we left around
    gc.collect()

    for comm1 in query:
        size = comm1.size
        tt.update(comm1=comm1)
        by_mh = defaultdict(list)
        for inode in comm1.inodes:
            # XXX Need to cope with deleted inodes.
            # We can't find them in the search-new pass without tracking
            # directory modifications, which would let us revisit updated
            # directories and notice their removed entries.

            # Rehash every time for now.
            # I don't know enough about how inode transaction numbers are
            # updated (as opposed to extent updates) to be able to actually
            # cache the result.
            try:
                pathb = inode.vol.live.lookup_one_path(inode)
            except IOError as e:
                if e.errno != errno.ENOENT:
                    raise
                # We have a stale record for a removed inode
                # XXX If an inode number is reused and the second instance
                # is below the size cutoff, we won't update the .size
                # attribute and we won't get an IOError to notify us
                # either.  Inode reuse does happen (with and without
                # inode_cache), so this branch isn't enough to rid us of
                # all stale entries.  We can also get into trouble with
                # regular file inodes being replaced by some other kind of
                # inode.
                sess.delete(inode)
                continue
            with closing(fopenat(inode.vol.live.fd, pathb)) as rfile:
                by_mh[mini_hash_from_file(inode, rfile)].append(inode)
                tt.update(mhash=None)

        for inodes in by_mh.itervalues():
            inode_count = len(inodes)
            if inode_count < 2:
                continue
            fies = set()
            for inode in inodes:
                try:
                    pathb = inode.vol.live.lookup_one_path(inode)
                except IOError as e:
                    if e.errno != errno.ENOENT:
                        raise
                    sess.delete(inode)
                    continue
                with closing(fopenat(inode.vol.live.fd, pathb)) as rfile:
                    fies.add(fiemap_hash_from_file(rfile))

            if len(fies) < 2:
                continue

            files = []
            fds = []
            # For description only
            fd_names = {}
            fd_inodes = {}
            by_hash = defaultdict(list)

            # XXX I have no justification for doubling inode_count
            ofile_req = 2 * inode_count + ofile_reserved
            if ofile_req > ofile_soft:
                if ofile_req <= ofile_hard:
                    resource.setrlimit(resource.RLIMIT_OFILE, (ofile_req, ofile_hard))
                    ofile_soft = ofile_req
                else:
                    tt.notify(
                        "Too many duplicates (%d at size %d), "
                        "would bring us over the open files limit (%d, %d)."
                        % (inode_count, size, ofile_soft, ofile_hard)
                    )
                    for inode in inodes:
                        if inode.has_updates:
                            query.skipped.append(inode)
                    continue

            for inode in inodes:
                # Open everything rw, we can't pick one for the source side
                # yet because the crypto hash might eliminate it.
                # We may also want to defragment the source.
                try:
                    pathb = inode.vol.live.lookup_one_path(inode)
                    path = fsdecode(pathb)
                except IOError as e:
                    if e.errno == errno.ENOENT:
                        sess.delete(inode)
                        continue
                    raise
                try:
                    afile = fopenat_rw(inode.vol.live.fd, pathb)
                except IOError as e:
                    if e.errno == errno.ETXTBSY:
                        # The file contains the image of a running process,
                        # we can't open it in write mode.
                        tt.notify("File %r is busy, skipping" % path)
                    elif e.errno == errno.EACCES:
                        # Could be SELinux or immutability
                        tt.notify("Access denied on %r, skipping" % path)
                    elif e.errno == errno.ENOENT:
                        # The file was moved or unlinked by a racing process
                        tt.notify("File %r may have moved, skipping" % path)
                    else:
                        raise
                    query.skipped.append(inode)
                    continue

                # It's not completely guaranteed we have the right inode;
                # there may still be race conditions at this point.
                # This gets re-checked below (tell and fstat).
                fd = afile.fileno()
                fd_inodes[fd] = inode
                fd_names[fd] = path
                files.append(afile)
                fds.append(fd)

            with ExitStack() as stack:
                for afile in files:
                    stack.enter_context(closing(afile))
                # Enter this context last
                immutability = stack.enter_context(ImmutableFDs(fds))

                # With a false positive, some kind of cmp pass that compares
                # all files at once might be more efficient than hashing.
                for afile in files:
                    fd = afile.fileno()
                    inode = fd_inodes[fd]
                    if fd in immutability.fds_in_write_use:
                        tt.notify("File %r is in use, skipping" % fd_names[fd])
                        query.skipped.append(inode)
                        continue
                    hasher = hashlib.sha1()
                    for buf in iter(lambda: afile.read(BUFSIZE), b""):
                        hasher.update(buf)

                    # Gets rid of a race condition
                    st = os.fstat(fd)
                    if st.st_ino != inode.ino:
                        query.skipped.append(inode)
                        continue
                    if st.st_dev != inode.vol.live.st_dev:
                        query.skipped.append(inode)
                        continue

                    size1 = afile.tell()
                    if size1 != size:
                        if size1 < inode.vol.size_cutoff:
                            # if we didn't delete this inode, it would cause
                            # spurious comm groups in all future invocations.
                            sess.delete(inode)
                        else:
                            query.skipped.append(inode)
                        continue

                    by_hash[hasher.digest()].append(afile)
                    tt.update(fhash=None)

                for fileset in by_hash.itervalues():
                    if len(fileset) < 2:
                        continue
                    sfile = fileset[0]
                    sfd = sfile.fileno()
                    sdesc = fd_inodes[sfd].vol.live.describe_path(fd_names[sfd])
                    # Disabled: defragmentation can unshare extents.
                    # It can also disable compression as a side-effect.
                    if False:
                        defragment(sfd)
                    dfiles = fileset[1:]
                    dfiles_successful = []
                    for dfile in dfiles:
                        dfd = dfile.fileno()
                        ddesc = fd_inodes[dfd].vol.live.describe_path(fd_names[dfd])
                        if not cmp_files(sfile, dfile):
                            # Probably a bug since we just used a crypto hash
                            tt.notify("Files differ: %r %r" % (sdesc, ddesc))
                            assert False, (sdesc, ddesc)
                            continue
                        if clone_data(dest=dfd, src=sfd, check_first=True):
                            tt.notify("Deduplicated:\n- %r\n- %r" % (sdesc, ddesc))
                            dfiles_successful.append(dfile)
                            space_gain += size
                            tt.update(space_gain=space_gain)
                        elif False:
                            # Often happens when there are multiple files with
                            # the same extents, plus one with the same size and
                            # mini-hash but a difference elsewhere.
                            # We hash the same extents multiple times, but
                            # I assume the data is shared in the vfs cache.
                            tt.notify("Did not deduplicate (same extents): %r %r" % (sdesc, ddesc))
                    if dfiles_successful:
                        evt = DedupEvent(fs=fs.impl, item_size=size, created=system_now())
                        sess.add(evt)
                        for afile in [sfile] + dfiles_successful:
                            inode = fd_inodes[afile.fileno()]
                            evti = DedupEventInode(event=evt, ino=inode.ino, vol=inode.vol)
                            sess.add(evti)
                        sess.commit()
    tt.format(None)
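
Even after two files share a SHA-1 digest, the loop above calls cmp_files before cloning, so a hash collision cannot silently corrupt data. cmp_files (like fopenat, clone_data and the hashing helpers) is imported from elsewhere in bedup; a minimal sketch of the byte-for-byte comparison it has to perform, with BUFSIZE assumed here, could be:

BUFSIZE = 8192  # assumed; the real constant is defined elsewhere in bedup

def cmp_files(afile, bfile):
    # Hypothetical stand-in for bedup's cmp_files: rewind both files and
    # compare them chunk by chunk, returning True only on a full match.
    afile.seek(0)
    bfile.seek(0)
    while True:
        abuf = afile.read(BUFSIZE)
        bbuf = bfile.read(BUFSIZE)
        if abuf != bbuf:
            return False
        if not abuf:
            return True

Only pairs that survive this comparison reach clone_data, which performs the actual extent sharing.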