Example #1
    def _handle_possibly_changed_package(self, lhs_package, rhs_package, modified):
        """Updates an SQL database with changes and schedules new operations to handle those packages"""
        session = self.thread_local.session

        super(SQLPackageDifferMixin, self)._handle_possibly_changed_package(lhs_package, rhs_package, modified)
        if modified:
            log.info("%s changed", rhs_package)
        else:
            log.debug("%s unchanged, stable since %i", rhs_package, rhs_package.stable_since())
        # end handle unchanged

        sql_package = None
        if modified:
            # Update the database with the stable time of the rhs package; it has changed and we must mark it
            sql_package = session.to_sql_package(rhs_package, rhs_package.stable_since())
            sql_package.stable_since = seconds_to_datetime(rhs_package.stable_since())

            for trans in self._unfinished_transactions_for(session, sql_package.id):
                trans.cancel()
                trans.comment = "canceled as input package was changed before transaction was queued"
            # end for each transaction to cancel

            session.commit()
        # end pass off the package handling as it could be stable

        # We have to re-check every package to check if we can schedule a job on it
        self._handle_possibly_stable_package(rhs_package, session, sql_package)
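
The helper _unfinished_transactions_for is not shown in this example. Purely as an illustration, a hypothetical implementation might look like the sketch below, assuming a SQLTransaction ORM model with an in_package_id foreign key and a finished_at column (all of these names are assumptions, not part of the original code):

    def _unfinished_transactions_for(self, session, sql_package_id):
        """@return query yielding all transactions whose input package is the given one and which
        have not finished yet (hypothetical SQLTransaction model, for illustration only)"""
        return session.query(SQLTransaction).\
                       filter(SQLTransaction.in_package_id == sql_package_id).\
                       filter(SQLTransaction.finished_at == None)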
Example #2
    def _dict_to_bundle_list(self, prefix, bundle_dict):
        """Assure we apply retention per-version-bundle list"""
        if not self.config.retention_policy and self.config.keep_latest_version_count < 0:
            bundle_list = super(FilteringVersionBundler, self)._dict_to_bundle_list(prefix, bundle_dict)
        else:
            # MARK BUNDLES FOR DELETION
            ###########################
            # NOTE: When using the policy, it is very important that newer versions are also newer regarding the date.
            # This is why we resort to the min_created attribute, the youngest item counts (just in case people overwrite versions)
            bundle_list = self.BundleListType()
            if self.config.retention_policy:
                samples, removed_samples = self.config.retention_policy.filter(
                    time(),
                    ((seconds_to_datetime(b.min_created), b) for b in self._iter_bundles_in_dict(bundle_dict)),
                    ordered=False,
                )
                for t, b in removed_samples:
                    b.removed = True
                # end for each sample
                bundle_list.extend(sorted((s[1] for s in chain(samples, removed_samples)), key=lambda b: b.version))
            else:
                bundle_list.extend(self._iter_bundles_in_dict(bundle_dict))
                bundle_list.sort(key=lambda b: b.version)

                # can be negative, yielding nothing to iterate on
                for vid in xrange(len(bundle_list) - self.config.keep_latest_version_count):
                    bundle_list[vid].removed = True
                # end for each version to remove
            # end handle policy or stupid keep count
        # end handle bundle list conversion

        bundle_list.prefix = prefix
        return bundle_list
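
The else-branch above marks all but the newest keep_latest_version_count bundles for removal. A small standalone sketch of the same indexing logic on plain stand-in objects (illustration only, nothing here is part of the original code):

class _Bundle(object):
    """Stand-in for a version bundle, for illustration only"""
    def __init__(self, version):
        self.version = version
        self.removed = False

bundles = sorted([_Bundle(v) for v in (3, 1, 2, 5, 4)], key=lambda b: b.version)
keep_latest_version_count = 2
# can be negative or zero, yielding nothing to iterate on
for vid in range(len(bundles) - keep_latest_version_count):
    bundles[vid].removed = True
# versions 1, 2 and 3 are now marked as removed, 4 and 5 are kept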
Example #3
    def _handle_added_package(self, rhs_package):
        """Called to handle if package was added, compared to the last incarnation if it's parent tree"""
        session = self.thread_local.session
        log.info("%s managed", rhs_package)
        # Will possibly create a new instance ... 
        sql_package = session.to_sql_package(rhs_package, rhs_package.stable_since())

        # NOTE: Could be a package which is currently being moved ... which just means it's not stable and will
        # soon be put under 'new' management.
        # It can also be that the daemon just restarted, but the package already existed with a more useful
        # stable_since date. We will use the one from the database in that case.
        if sql_package.stable_since < seconds_to_datetime(rhs_package.stable_since()):
            rhs_package.set_stable_since(datetime_to_seconds(sql_package.stable_since))
        # end handle time conversion

        # in any case, commit the changes right now (possible addition, changes)
        session.commit()


        # See if we can handle the package already
        self._handle_possibly_stable_package(rhs_package, session, sql_package)
Example #4
 def sample(age):
     return seconds_to_datetime(now - age), None
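
This helper builds (datetime, payload) samples relative to a module-level now timestamp, matching the sample shape the retention policies in the other examples filter on. A hedged usage sketch, assuming seconds_to_datetime is importable as in the other examples and a previously constructed policy object with the filter(now, samples) API seen in Examples #2 and #7 (the intervals and the policy variable are assumptions):

now = time()

def sample(age):
    return seconds_to_datetime(now - age), None

hour, day = 60 * 60, 24 * 60 * 60
samples = [sample(age) for age in (0, hour, 6 * hour, day, 3 * day, 10 * day)]
# policy is assumed to be a retention policy instance as used in the other examples
kept, removed = policy.filter(now, samples)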
Example #5
    def _append_path_record(self, records, path, streamer, log, ex_stat=None, digest_ratio=None):
        """Append meta-data about the given path to the given list of records
        @param ex_stat if you have obtained the stat already, we will not get it again
        @param digest_ratio if not None, we will use the given digest and ratio instead of creating our own
        @return stat structure of the path, or None if the path could not be read"""
        # minimize file access
        try:
            ascii_path = to_ascii(path)
            stat = ex_stat or lstat(ascii_path)

            if digest_ratio:
                digest, ratio = digest_ratio
            else:
                digest, ratio = None, None
            # end handle digest_ratio

            ldest = None
            fd = None

            if islink(stat.st_mode):
                # Don't follow symlinks as this tricks us into thinking we have duplicates.
                # However, we would also have to check for hardlinks, but tracking those
                # can easily cost too much memory. Hardlinks are rare anyway, so it's okay.
                ldest = unicode(readlink(ascii_path))
            elif isreg(stat.st_mode) and not digest:
                fd = os.open(ascii_path, os.O_RDONLY)
            # end open file
        except OSError:
            log.error("Could not stat or open '%s' - skipping", ascii_path, exc_info=False)
            return None
        # end skip failing file

        if fd is not None:
            try:
                extra_progress = stat.st_size >= self.big_file
                if extra_progress:
                    log.info("Streaming %s file at '%s'", int_to_size_string(stat.st_size), ascii_path)
                # end extra logging

                try:
                    digest = (
                        streamer.set_stream(lambda size: os.read(fd, size))
                        .set_log(extra_progress and log or None)
                        .stream()
                        .digest()
                    )
                    ratio = streamer.ratio
                except IOError:
                    log.error("Failed to stream file '%s' - skipping", ascii_path, exc_info=True)
                    return None
                # end handle io errors gracefully
            finally:
                os.close(fd)
            # end assure we close the file
        # end handle symlink

        try:
            path = unicode(path)
        except Exception:
            log.error("Failed to handle encoding of path '%s' - skipping", ascii_path, exc_info=True)
            return None
        # end ignore unicode conversion errors

        # Symlinks have a null digest; that is how they are marked as symlinks.
        # NOTE: We don't care about their contents, it's just a filename and
        # we don't hash it, as we are not interested in its contents.
        records.append(
            {
                "path": path,
                "size": stat.st_size,
                "atime": seconds_to_datetime(stat.st_atime),
                "ctime": seconds_to_datetime(stat.st_ctime),
                "mtime": seconds_to_datetime(stat.st_mtime),
                "uid": stat.st_uid,
                "gid": stat.st_gid,
                "nblocks": stat.st_blocks,
                "nlink": stat.st_nlink,
                "mode": stat.st_mode,
                "ldest": ldest,
                "sha1": digest,
                "ratio": ratio,
            }
        )

        return stat
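
The mode predicates islink and isreg used above (and isdir in the next example) are not shown. A minimal sketch of what they might be, assuming they are thin wrappers around the standard stat module's S_IS* checks (the implementation is an assumption):

import stat as stat_module

def islink(mode):
    """@return True if the given st_mode denotes a symbolic link"""
    return stat_module.S_ISLNK(mode)

def isreg(mode):
    """@return True if the given st_mode denotes a regular file"""
    return stat_module.S_ISREG(mode)

def isdir(mode):
    """@return True if the given st_mode denotes a directory"""
    return stat_module.S_ISDIR(mode)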
Example #6
    def _fast_update_database(self, engine, args):
        """Update all data contained in the given engine quickly, see --fast
        @return number of processed records"""
        nr = 0
        st = time()
        log = self.log()
        progress_every = 5000
        stats_info_every = 500
        commit_every_seconds = 30
        commit_every_records = 15000
        time_of_last_commit = time()
        connection = engine.connect()
        meta = MetaData(engine, reflect=True)
        fsitem = meta.tables[args.table_name]
        insert = fsitem.insert()
        update = (
            fsitem.update()
            .where(fsitem.c.id == bindparam("rid"))
            .values(
                path=bindparam("path"),
                size=bindparam("size"),
                atime=bindparam("atime"),
                ctime=bindparam("ctime"),
                mtime=bindparam("mtime"),
                uid=bindparam("uid"),
                gid=bindparam("gid"),
                nblocks=bindparam("nblocks"),
                nlink=bindparam("nlink"),
                mode=bindparam("mode"),
                ldest=bindparam("ldest"),
                sha1=bindparam("sha1"),
                ratio=bindparam("ratio"),
            )
        )

        # NOTE: this selector assures we only get the latest version of a file, based on the modification time !
        selector = select(
            [
                fsitem.c.id,
                fsitem.c.path,
                fsitem.c.size,
                fsitem.c.atime,
                fsitem.c.ctime,  # marker to see if something is deleted
                fsitem.c.mtime,
                fsitem.c.uid,
                fsitem.c.gid,
                fsitem.c.nblocks,
                fsitem.c.nlink,
                fsitem.c.mode,
                fsitem.c.ldest,
                fsitem.c.sha1,
                fsitem.c.ratio,
            ],
            order_by=[fsitem.c.path, fsitem.c.id.desc()],
        )

        if args.where_like:
            selector = selector.where(fsitem.c.path.like(args.where_like + "%"))
        # end append where clause

        def progress():
            elapsed = time() - st
            log.info("Checked %i files in %.2fs (%.2f files/s)", nr, elapsed, nr / elapsed)

        # end

        join = os.path.join
        isabs = os.path.isabs
        dirname = os.path.dirname
        basename = os.path.basename
        streamer = HashStreamer(hashlib.sha1, lz4dumps)
        ## A mapping from directory names to all of their files (as names)
        dir_entries = dict()

        # A list of SQL update parameter sets, one per entry to update. They are executed all at once
        # and must include the ID
        updates = list()
        total_num_updates = 0
        modified_count = 0
        added_count = 0
        deleted_count = 0
        last_path = None
        # The window is critical - windowed queries are slow for the server, as each one is like a new complete query
        # where only a subset is sent (due to the ordering).
        # Additionally, if there are many changes, we will change the database during iteration, which will
        # basically give us part of the same files (if not the same files) back on the next query, which
        # makes us even more inefficient. Therefore we use memory to our advantage, and use 1 million entries
        # by default. This needs about 1GB of memory, but reduces the amount of queries considerably,
        # especially on large databases.
        window = 1000 * 1000
        cur_window = 0
        shortest_path = None
        len_shortest_path = 100000000

        for cursor in self._fetch_record_iterator(connection, selector, window):

            nri = 0  # num rows in iteration
            for row in cursor:
                # NOTE: We are getting multiple entries, sorted by the latest one, for the same path
                # We prune all paths of a kind we have seen so far
                # Can be files or directories
                nri += 1
                nr += 1
                rid, path, size, atime, ctime, mtime, uid, gid, nblocks, nlink, mode, ldest, sha1, ratio = row
                if not isabs(path) or path == last_path:
                    continue
                # end skip relative paths !

                last_path = path
                ascii_path = to_ascii(path)

                # NOTE: I know, this is killing us, as we will grow rather large by keeping all that data
                # But I know no other way except for processing directories while we are going.
                # As files and directories will be mixed, it is not too easy though to figure this out.
                # For now, we just go for it and let the CPU/Memory burn
                directory = dirname(path)
                if directory not in dir_entries:
                    dir_entries[directory] = set()
                # end count dirs
                dir_entries[directory].add(basename(path))

                # Make sure we don't forget to record the directory itself - otherwise directories
                # without known file entries would never be checked for additions
                if isdir(mode):
                    dir_entries.setdefault(path, set())
                # end add each directory that is a directory

                # Find the root path, which should be the origin of it all, and ignore it when
                # finding added items. It's definitely the shortest one
                if len(directory) < len_shortest_path:
                    shortest_path = directory
                    len_shortest_path = len(directory)
                # end keep shortest path

                try:
                    # For some reason, this doesn't get our unicode as it tries to use ascii to deal with it
                    # NOTE: We could know the file was deleted by checking fsitem.c.ctime is None, but
                    # we check anyway because it could be re-created.
                    stat = lstat(ascii_path)
                except OSError:
                    # DELETION
                    ##########
                    # This marks a deletion - we just keep the time of deletion, which is the time when we
                    # noticed it, not the actual one.
                    # The path no longer exists, but we only append this info if we didn't already know about it.
                    if ctime is not None:
                        # have to write an entire record, otherwise changes and deletions go out of sync
                        updates.append(
                            {
                                "rid": rid,
                                "path": path,
                                "size": 0,
                                "atime": atime,
                                "ctime": None,
                                "mtime": seconds_to_datetime(time()),
                                "uid": uid,
                                "gid": gid,
                                "nblocks": nblocks,
                                "nlink": nlink,
                                "mode": mode,
                                "ldest": ldest,
                                # Keep sha as last known contents! This allows tracking contents even
                                # across renames and deletions
                                "sha1": sha1,
                                "ratio": ratio,
                            }
                        )
                        deleted_count += 1
                        if deleted_count % stats_info_every == 0:
                            log.info("Found %i DELETED paths", deleted_count)
                        # end handle deleted
                    # end handle deletions
                else:
                    # MODIFICATION
                    ###############
                    # File could have been deleted and re-created
                    # We don't know it was an addition (due to previous deletion), but the dataset is the same
                    # so people can figure it out later
                    # ordered by likeliness
                    if (
                        seconds_to_datetime(stat.st_mtime) != mtime
                        or size != stat.st_size
                        or uid != stat.st_uid
                        or gid != stat.st_gid
                        or mode != stat.st_mode
                        or nlink != stat.st_nlink
                        or (islink(stat.st_mode) and readlink(ascii_path) != ldest)
                    ):

                        # NOTE: we are lazy here and say, for now, that the size must change to justify
                        # taking another sha. Otherwise we assume that it's just any other change, which we will
                        # put into the database in the form of a new commit, of course.
                        if self._append_path_record(
                            updates, path, streamer, log, stat, size == stat.st_size and (sha1, ratio) or None
                        ):
                            # add the rid to have everything we need for the update
                            updates[-1]["rid"] = rid
                            modified_count += 1
                            if modified_count % stats_info_every == 0:
                                log.info("Found %i MODIFIED paths", modified_count)
                            # end show information
                        # end handle modification
                    # end handle modification
                # end handle deleted file

                if nr % progress_every == 0:
                    progress()
                # end handle progress

                if len(updates) >= commit_every_records or time() - time_of_last_commit >= commit_every_seconds:
                    total_num_updates += len(updates)
                    self.do_execute_records(connection, update, updates, log, st, total_num_updates)
                    time_of_last_commit = time()
                # end handle executions
            # end for each file in database windows
            cursor.close()

            # Is the database depleted ?
            if nri < window:
                break
            # end handle window
        # end for each cursor

        progress()
        total_num_updates += len(updates)
        self.do_execute_records(connection, update, updates, log, st, total_num_updates)

        #####################
        # HANDLE ADDITIONS ##
        #####################
        # We iterate all actual directories and their entries as known to the database
        # Now we just have to compare and only check for additions
        new_records = list()

        def list_dir_safely(dir_ascii):
            """@return entries of an empty tuple() if the listing failed"""
            try:
                return os.listdir(dir_ascii)
            except OSError:
                # ignore added dirs which might already be gone
                log.warn("Couldn't access '%s' when trying to add it", dir_ascii)
                return tuple()
            # end handle exception

        # We can't rebind a variable in an outer scope (no nonlocal in Python 2), so we have to make it an array
        last_commit_time = [time()]

        def append_records_recursive(path, added_count):
            """Find all entries recursively in path and append them
            @param path directory or file path
            @return amount of added items"""
            # no matter what, add the entry
            if self._append_path_record(new_records, path, streamer, log):
                added_count += 1
                if added_count % stats_info_every == 0:
                    log.info("Found %i ADDED paths", added_count)
                # end info printing
                if len(new_records) >= commit_every_records or time() - last_commit_time[0] >= commit_every_seconds:
                    self.do_execute_records(connection, insert, new_records, log, st, added_count)
                    last_commit_time[0] = time()
            # end handle path

            path_ascii = to_ascii(path)
            if os.path.isdir(path_ascii):
                entries = list_dir_safely(path_ascii)
                for entry in entries:
                    added_count = append_records_recursive(join(path, entry), added_count)
                # end for each entry to check
            # end entries
            return added_count

        # end recursion helper

        # Remove the shortest directory, which was generated from the directory of our root!
        # NOTE: if there was no root, this is a false alarm
        try:
            del dir_entries[shortest_path]
        except KeyError:
            pass
        # end ignore root not in dirlist

        log.info("About to check %i directories for added entries ...", len(dir_entries))
        for dir, entries in dir_entries.iteritems():
            added = set(list_dir_safely(to_ascii(dir))) - entries
            for added_entry in added:
                added_count = append_records_recursive(join(dir, added_entry), added_count)
        # end for each directory to check

        if new_records:
            log.info("Committing remaining %i new records", len(new_records))
            self.do_execute_records(connection, insert, new_records, log, st, added_count)
        # end commit new records
        connection.close()

        elapsed = time() - st
        log.info("== Statistics ==")
        log.info("%5i ADDED", added_count)
        log.info("%5i MODIFIED", modified_count)
        log.info("%5i DELETED", deleted_count)
        log.info("================")
        log.info(
            "Updated %i entries in %.2fs (%.2f entries/s)", total_num_updates, elapsed, total_num_updates / elapsed
        )

        return nr
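
do_execute_records is not shown in this example. A minimal sketch of what such a batched-execute helper might do, assuming SQLAlchemy's executemany behaviour when a list of parameter dicts is passed to Connection.execute, and assuming the helper empties the list in-place so callers can keep appending to it (both are assumptions):

    def do_execute_records(self, connection, statement, records, log, starttime, total_num_records):
        """Execute the given insert/update statement once per parameter dict in records and clear the list"""
        if not records:
            return
        # a list of parameter dicts makes SQLAlchemy use executemany() under the hood
        connection.execute(statement, records)
        elapsed = time() - starttime
        log.info("Wrote %i records in %.2fs (%.2f records/s)", total_num_records, elapsed, total_num_records / elapsed)
        # the caller reuses the same list, so empty it in-place
        del records[:]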
Example #7
    def generate(self):
        now = datetime.now()
        now_time = time()
        rep = self.ReportType()
        rep.columns.extend(self.report_schema)

        policy_string = self._config.policy
        name_like = self._config.name_like
        applied_every_string = self._config.applied_every
        debug = self._config.debug


        if not policy_string:
            # todo find it from filesystem property
            log.error('Retention policy is not configured')
            return rep
        # end ignore empty retention

        if not name_like:
            log.error("Please specify the name_like to be the name of the file system, like '%foo%'")
            return rep
        # end handle name filter not set


        policy = self.PolicyType(policy_string)
        applied_every_string = applied_every_string or None

        # Find all snapshots, ordered by host and ascending creation date
        query = self._session.query(ZDataset).\
                                filter(ZDataset.avail == None).\
                                filter(ZDataset.name.like(self._config.name_like)).\
                                order_by(ZDataset.host, ZDataset.creation)
        # sort all results by filesystem
        by_fs_map = dict()
        for ss in host_filter(self._config.hosts, ZDataset.host, query):
            by_fs_map.setdefault((ss.host, ss.filesystem_name()), list()).append((ss.creation, ss))
        # end for each dataset


        def count_samples_in_range(samples, from_date, to_date):
            count = 0
            for ctime, _ in samples:
                if from_date < ctime < to_date:
                    count += 1
                elif count:
                    break
                # end handle early bailout
            # end for each sample
            return count
        # end brute force count samples utility, doesn't make assumptions about order


        kept_comment = 'kept by policy'
        removed_comment = 'removed by policy'
        summaries = list()              # summary-records
        for (fs_host, fs_name), samples in by_fs_map.iteritems():
            # Apply policy and prepare actual report
            remaining, deleted = policy.filter(now_time, samples)

            # In debug mode, we want to see it even if there are no deletions.
            # Otherwise this is just a shortcut.
            if not debug and not deleted:
                continue
            # end handle empty list

            if debug:
                merged_records = list()
                dset = set(deleted)
                for sample in samples:
                    is_deleted = sample in dset
                    ctime, ss = sample
                    rep.records.append([    now - ss.updated_at,
                                            fs_host,
                                            ss.name,
                                            is_deleted and self.TYPE_SNAPSHOT or 'debug',
                                            ss.creation,
                                            now - ss.creation,
                                            ss.used,
                                            0,
                                            100.0,
                                            is_deleted and removed_comment or kept_comment])
                # end for each sample

                # Convert rules into format that is more easily understood: num-samples:date-ago
                rule_tokens = list()
                total_duration = 0
                to_date = now
                for keep, freq, duration in policy._rules:
                    total_duration += duration
                    from_date = seconds_to_datetime(now_time - total_duration)
                    remaining_count = count_samples_in_range(remaining, from_date, to_date)
                    del_count = count_samples_in_range(deleted, from_date, to_date)
                    rule = '(%i-%i=%i)/%i:%s' % (remaining_count + del_count, del_count, remaining_count,
                                                duration / freq, 
                                                delta_to_tty_string(now - from_date))
                    rule_tokens.append(rule) 
                    to_date = from_date
                # end for each rule

                summaries.append([now-now,
                                  fs_host,
                                  fs_name,
                                  'debug-' + self.TYPE_SUMMARY,
                                  now,
                                  now - now,
                                  0,
                                  0,
                                  0,
                                  ','.join(rule_tokens)
                            ])

            # end adjust record source for debugging
            else:
                for creation_time, ss in deleted:
                    rep.records.append([    now - ss.updated_at,
                                            fs_host,
                                            ss.name,
                                            self.TYPE_SNAPSHOT,
                                            ss.creation,
                                            now - ss.creation,
                                            ss.used,
                                            0,
                                            100.0,
                                            removed_comment])
            # end handle debug

            summary = "%s - Removing %i of %i snapshots; %i remain, policy-max = %i (+%i)" % ((policy_string, len(deleted), 
                                                                                            len(samples), len(remaining))
                                                                                            + policy.num_rule_samples(applied_every_string))
                                                                                        
            summaries.append([now-now,
                              fs_host,
                              fs_name,
                              self.TYPE_SUMMARY,
                              now,
                              now - now,
                              0,
                              0,
                              0,
                              summary
                            ])

        # end for each filesystem

        # AGGREGATE
        ###########
        self._aggregate_records(rep.records, now)
        rep.records.extend(summaries)


        return rep
Example #8
def seconds_to_delta_string(date_seconds):
    """@return a string representing the given time (in past) in seconds in time relative to current time"""
    return delta_to_tty_string(now - seconds_to_datetime(date_seconds))
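
seconds_to_datetime and its inverse datetime_to_seconds appear throughout these examples but are never shown, and this last helper additionally relies on a module-level now and on delta_to_tty_string. A minimal sketch of the two conversion helpers, assuming they simply translate between POSIX timestamps and naive UTC datetime objects (whether the real implementation uses UTC or local time is unknown):

import calendar
from datetime import datetime

def seconds_to_datetime(seconds):
    """@return a naive datetime for the given POSIX timestamp (assumed behaviour)"""
    return datetime.utcfromtimestamp(seconds)

def datetime_to_seconds(dt):
    """@return the POSIX timestamp for the given naive UTC datetime (assumed behaviour)"""
    return calendar.timegm(dt.utctimetuple())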