Example #1
    def _append_path_record(self, records, path, streamer, log, ex_stat=None, digest_ratio=None):
        """Append meta-data about the given path to the given list of records
        @param stat if you have received the stat already, we will not get it again
        @param digest_ratio if not None, we will use the given digest and ration  instead of creating our own
        @return stat structure of the path, or None if the path could not be read"""
        # minimize file access
        try:
            ascii_path = to_ascii(path)
            stat = ex_stat or lstat(ascii_path)

            if digest_ratio:
                digest, ratio = digest_ratio
            else:
                digest, ratio = None, None
            # end handle digest_ratio

            ldest = None
            fd = None

            if islink(stat.st_mode):
                # Don't follow symlinks as this tricks us into thinking we have duplicates.
                # However, we would also have to check for hardlinks, but tracking those
                # can easily cost too much memory. Hardlinks are rare anyway, so it's okay.
                ldest = unicode(readlink(ascii_path))
            elif isreg(stat.st_mode) and not digest:
                fd = os.open(ascii_path, os.O_RDONLY)
            # end open file
        except OSError:
            log.error("Could not stat or open '%s' - skipping", ascii_path, exc_info=False)
            return None
        # end skip failing file

        if fd is not None:
            try:
                extra_progress = stat.st_size >= self.big_file
                if extra_progress:
                    log.info("Streaming %s file at '%s'", int_to_size_string(stat.st_size), ascii_path)
                # end extra logging

                try:
                    digest = (
                        streamer.set_stream(lambda size: os.read(fd, size))
                        .set_log(extra_progress and log or None)
                        .stream()
                        .digest()
                    )
                    ratio = streamer.ratio
                except IOError:
                    log.error("Failed to stream file '%s' - skipping", ascii_path, exc_info=True)
                    return None
                # end handle io errors gracefully
            finally:
                os.close(fd)
            # end assure we close the file
        # end handle symlink

        try:
            path = unicode(path)
        except Exception:
            log.error("Failed to handle encoding of path '%s' - skipping", ascii_path, exc_info=True)
            return None
        # end ignore unicode conversion errors

        # Symlinks have a null-digest, which is what marks them as symlinks.
        # NOTE: We don't care about their contents - it's just a filename and
        # we don't hash it, as we are not interested in its contents
        records.append(
            {
                "path": path,
                "size": stat.st_size,
                "atime": seconds_to_datetime(stat.st_atime),
                "ctime": seconds_to_datetime(stat.st_ctime),
                "mtime": seconds_to_datetime(stat.st_mtime),
                "uid": stat.st_uid,
                "gid": stat.st_gid,
                "nblocks": stat.st_blocks,
                "nlink": stat.st_nlink,
                "mode": stat.st_mode,
                "ldest": ldest,
                "sha1": digest,
                "ratio": ratio,
            }
        )

        return stat
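
# The islink()/isreg() helpers used above (and isdir() in the next example) are not shown here.
# Since they are called with an st_mode value rather than a path, they are presumably thin
# wrappers around the standard stat module - a minimal sketch under that assumption:
from stat import S_ISDIR, S_ISLNK, S_ISREG


def islink(mode):
    """@return True if the given st_mode describes a symbolic link"""
    return S_ISLNK(mode)


def isreg(mode):
    """@return True if the given st_mode describes a regular file"""
    return S_ISREG(mode)


def isdir(mode):
    """@return True if the given st_mode describes a directory"""
    return S_ISDIR(mode)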
Example #2
    def _fast_update_database(self, engine, args):
        """Update all data contained in the given engine quickly, see --fast
        @return number of processed records"""
        nr = 0
        st = time()
        log = self.log()
        progress_every = 5000
        stats_info_every = 500
        commit_every_seconds = 30
        commit_every_records = 15000
        time_of_last_commit = time()
        connection = engine.connect()
        meta = MetaData(engine, reflect=True)
        fsitem = meta.tables[args.table_name]
        insert = fsitem.insert()
        update = (
            fsitem.update()
            .where(fsitem.c.id == bindparam("rid"))
            .values(
                path=bindparam("path"),
                size=bindparam("size"),
                atime=bindparam("atime"),
                ctime=bindparam("ctime"),
                mtime=bindparam("mtime"),
                uid=bindparam("uid"),
                gid=bindparam("gid"),
                nblocks=bindparam("nblocks"),
                nlink=bindparam("nlink"),
                mode=bindparam("mode"),
                ldest=bindparam("ldest"),
                sha1=bindparam("sha1"),
                ratio=bindparam("ratio"),
            )
        )

        # NOTE: this selector, ordered by path and descending id, assures we only process the latest version of each file !
        selector = select(
            [
                fsitem.c.id,
                fsitem.c.path,
                fsitem.c.size,
                fsitem.c.atime,
                fsitem.c.ctime,  # marker to see if something is deleted
                fsitem.c.mtime,
                fsitem.c.uid,
                fsitem.c.gid,
                fsitem.c.nblocks,
                fsitem.c.nlink,
                fsitem.c.mode,
                fsitem.c.ldest,
                fsitem.c.sha1,
                fsitem.c.ratio,
            ],
            order_by=[fsitem.c.path, fsitem.c.id.desc()],
        )

        if args.where_like:
            selector = selector.where(fsitem.c.path.like(args.where_like + "%"))
        # end append where clause

        def progress():
            elapsed = time() - st
            log.info("Checked %i files in %.2fs (%.2f files/s)", nr, elapsed, nr / elapsed)

        # end

        join = os.path.join
        isabs = os.path.isabs
        dirname = os.path.dirname
        basename = os.path.basename
        streamer = HashStreamer(hashlib.sha1, lz4dumps)
        ## A mapping from directory names to all of their entries (as names)
        dir_entries = dict()

        # A list of bind-parameter dicts for records that will be updated. They are executed all at once
        # and must include the record id as 'rid'
        updates = list()
        total_num_updates = 0
        modified_count = 0
        added_count = 0
        deleted_count = 0
        last_path = None
        # The window size is critical - small windows are slow for the server, as each windowed query is
        # effectively a new complete query of which only a subset is sent (due to the ordering).
        # Additionally, if there are many changes, we will modify the database during iteration, which will
        # basically give us part of the same files (if not the same files) back on the next query, which
        # makes us even more inefficient. Therefore we use memory to our advantage, and fetch one million
        # entries by default. This needs about 1GB of memory, but reduces the number of queries considerably,
        # especially on large databases
        window = 1000 * 1000
        cur_window = 0
        shortest_path = None
        len_shortest_path = 100000000

        for cursor in self._fetch_record_iterator(connection, selector, window):

            nri = 0  # num rows in iteration
            for row in cursor:
                # NOTE: We are getting multiple entries for the same path, sorted so the latest one comes first.
                # We skip all paths we have already seen so far.
                # Entries can be files or directories
                nri += 1
                nr += 1
                rid, path, size, atime, ctime, mtime, uid, gid, nblocks, nlink, mode, ldest, sha1, ratio = row
                if not isabs(path) or path == last_path:
                    continue
                # end skip relative and duplicate paths !

                last_path = path
                ascii_path = to_ascii(path)

                # NOTE: I know this is killing us, as we will grow rather large by keeping all that data.
                # But I know no other way, short of processing directories as we go - and since files and
                # directories are mixed in the result, that is not easy to do.
                # For now, we just go for it and let CPU/memory burn
                directory = dirname(path)
                if directory not in dir_entries:
                    dir_entries[directory] = set()
                # end count dirs
                dir_entries[directory].add(basename(path))

                # Make sure the directory itself gets an entry as well - otherwise it would never be
                # checked for added entries later
                if isdir(mode):
                    dir_entries.setdefault(path, set())
                # end add each directory that is a directory

                # Find the root path, which should be the origin of it all, and ignore it when
                # finding added items. It's definitely the shortest one
                if len(directory) < len_shortest_path:
                    shortest_path = directory
                    len_shortest_path = len(directory)
                # end keep shortest path

                try:
                    # lstat() can't handle our unicode path directly, as it tries to encode it as ascii,
                    # which is why we pass the pre-converted ascii_path
                    # NOTE: We could know the file was deleted by checking fsitem.c.ctime is None, but
                    # we check anyway because it could be re-created.
                    stat = lstat(ascii_path)
                except OSError:
                    # DELETION
                    ##########
                    # This marks a deletion - we just keep the time of deletion, which is the time when we
                    # noticed it ! Not the actual one.
                    # The path no longer exists, but we only append this info if we didn't know about it before
                    if ctime is not None:
                        # have to write an entire record, otherwise changes and deletions go out of sync
                        updates.append(
                            {
                                "rid": rid,
                                "path": path,
                                "size": 0,
                                "atime": atime,
                                "ctime": None,
                                "mtime": seconds_to_datetime(time()),
                                "uid": uid,
                                "gid": gid,
                                "nblocks": nblocks,
                                "nlink": nlink,
                                "mode": mode,
                                "ldest": ldest,
                                # Keep the sha as the last known contents ! This allows tracking the file
                                # even across renames and deletions
                                "sha1": sha1,
                                "ratio": ratio,
                            }
                        )
                        deleted_count += 1
                        if deleted_count % stats_info_every == 0:
                            log.info("Found %i DELETED paths", deleted_count)
                        # end handle deleted
                    # end handle deletions
                else:
                    # MODIFICATION
                    ###############
                    # The file could have been deleted and re-created.
                    # We can't tell it was an addition (after a previous deletion), but the dataset is the same,
                    # so people can figure it out later.
                    # The checks below are ordered by likeliness of change
                    if (
                        seconds_to_datetime(stat.st_mtime) != mtime
                        or size != stat.st_size
                        or uid != stat.st_uid
                        or gid != stat.st_gid
                        or mode != stat.st_mode
                        or nlink != stat.st_nlink
                        or (islink(stat.st_mode) and readlink(ascii_path) != ldest)
                    ):

                        # NOTE: we are lazy here and say, for now, that the size must change to justify
                        # taking another sha. Otherwise we assume that it's just any other change, which we will
                        # put into the database in the form of a new commit, of course.
                        if self._append_path_record(
                            updates, path, streamer, log, stat, size == stat.st_size and (sha1, ratio) or None
                        ):
                            # add the rid to have everything we need for the update
                            updates[-1]["rid"] = rid
                            modified_count += 1
                            if modified_count % stats_info_every == 0:
                                log.info("Found %i MODIFIED paths", modified_count)
                            # end show information
                        # end handle modification
                    # end handle modification
                # end handle deleted file

                if nr % progress_every == 0:
                    progress()
                # end handle progress

                if len(updates) >= commit_every_records or time() - time_of_last_commit >= commit_every_seconds:
                    total_num_updates += len(updates)
                    self.do_execute_records(connection, update, updates, log, st, total_num_updates)
                    time_of_last_commit = time()
                # end handle executions
            # end for each file in database windows
            cursor.close()

            # Is the database depleted ?
            if nri < window:
                break
            # end handle window
        # end for each cursor

        progress()
        total_num_updates += len(updates)
        self.do_execute_records(connection, update, updates, log, st, total_num_updates)

        ######################
        # HANDLE ADDITIONS ###
        ######################
        # We iterate all actual directories and their entries as known to the database
        # Now we just have to compare and only check for additions
        new_records = list()

        def list_dir_safely(dir_ascii):
            """@return entries of an empty tuple() if the listing failed"""
            try:
                return os.listdir(dir_ascii)
            except OSError:
                # ignore added dirs which might already be gone
                log.warn("Couldn't access '%s' when trying to add it", dir_ascii)
                return tuple()
            # end handle exception

        # We can't rebind a variable in an enclosing scope (no 'nonlocal' in python 2), so we wrap it in a list
        last_commit_time = [time()]

        def append_records_recursive(path, added_count):
            """Find all entries recursively in path and append them
            @param path directory or path
            @return amount of added items"""
            # no matter what, add the entry
            if self._append_path_record(new_records, path, streamer, log):
                added_count += 1
                if added_count % stats_info_every == 0:
                    log.info("Found %i ADDED paths", added_count)
                # end info printing
                if len(new_records) >= commit_every_records or time() - last_commit_time[0] >= commit_every_seconds:
                    self.do_execute_records(connection, insert, new_records, log, st, added_count)
                    last_commit_time[0] = time()
            # end handle path

            path_ascii = to_ascii(path)
            if os.path.isdir(path_ascii):
                entries = list_dir_safely(path_ascii)
                for entry in entries:
                    added_count = append_records_recursive(join(path, entry), added_count)
                # end for each entry to check
            # end entries
            return added_count

        # end recursion helper

        # Remove the shortest directory, which was generated from the directory of our root !
        # NOTE: if there was no root, this is a false alarm
        try:
            del dir_entries[shortest_path]
        except KeyError:
            pass
        # end ignore root not in dirlist

        log.info("About to check %i directories for added entries ...", len(dir_entries))
        for dir, entries in dir_entries.iteritems():
            added = set(list_dir_safely(to_ascii(dir))) - entries
            for added_entry in added:
                added_count = append_records_recursive(join(dir, added_entry), added_count)
        # end for each directory to check

        if new_records:
            log.info("Committing remaining %i new records", len(new_records))
            self.do_execute_records(connection, insert, new_records, log, st, added_count)
        # end commit new records
        connection.close()

        elapsed = time() - st
        log.info("== Statistics ==")
        log.info("%5i ADDED", added_count)
        log.info("%5i MODIFIED", modified_count)
        log.info("%5i DELETED", deleted_count)
        log.info("================")
        log.info(
            "Updated %i entries in %.2fs (%.2f entries/s)", total_num_updates, elapsed, total_num_updates / elapsed
        )

        return nr
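
    # Neither _fetch_record_iterator() nor do_execute_records() is shown above. Below is a minimal
    # sketch of how they could look, assuming plain limit/offset pagination and executemany-style
    # execution with SQLAlchemy - the real helpers may well be implemented differently.
    def _fetch_record_iterator(self, connection, selector, window):
        """@return iterator yielding one result cursor per window of rows.
        The caller closes each cursor and stops once a window yields fewer than `window` rows"""
        offset = 0
        while True:
            yield connection.execute(selector.limit(window).offset(offset))
            offset += window
        # end endless loop, terminated by the caller

    def do_execute_records(self, connection, statement, records, log, starttime, record_count):
        """Execute the given insert/update statement for all accumulated records and clear the list
        in place (inferred from the callers above, which never empty it themselves)"""
        if not records:
            return
        # executemany-style execution: one statement, a list of bind-parameter dictionaries
        connection.execute(statement, records)
        del records[:]
        elapsed = time() - starttime
        log.info("Wrote %i records in %.2fs (%.2f records/s)", record_count, elapsed, record_count / elapsed)
    # end sketched helpers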