def scan(self, ctx, prev_num):
    self.compute_stats()
    #
    # Check if we have encountered this file during this scan already
    #
    ctx.num_visited_files_reporter.increment(1)
    ctx.current_scanned_file_reporter.set(self.path())
    if self.scan_hlink(ctx):
        logging.info("File %s: HLINK" % self.path())
        return
    #
    # Check if the file is the same as in one of the upper levels
    #
    if self.scan_prev(ctx, prev_num):
        logging.debug("File %s: PREV" % self.path())
        ctx.num_prev_files_reporter.increment(1)
        return

    # --- File not yet in database, process it
    file_size = 0
    packer = PackerStream.PackerOStream(self.backup, Container.CODE_DATA)
    handle = open(self.path(), "rb")
    for data in FileIO.read_blocks(handle, self.backup.get_block_size()):
        packer.write(data)
        file_size += len(data)
        ctx.num_total_blocks_reporter.increment(1)
        ctx.size_total_blocks_reporter.increment(len(data))
        ctx.update_scan_status()
    handle.close()

    self.digest = packer.get_digest()
    self.level = packer.get_level()
    self.update_hlink(ctx)

    logging.info("Scanned file %s size:%d new_blocks:%d new_blocks_size:%d" %
        (self.path(), file_size, packer.get_num_new_blocks(),
         packer.get_size_new_blocks()))
    ctx.num_scanned_files_reporter.increment(1)
    if packer.get_num_new_blocks() != 0:
        ctx.num_new_blocks_reporter.increment(packer.get_num_new_blocks())
        ctx.size_new_blocks_reporter.increment(packer.get_size_new_blocks())
        ctx.num_changed_files_reporter.increment(1)
        ctx.changed_files_reporter.append(self.path())

    if file_size > 256 * 1024:
        logging.debug("File %s is big enough to register in cndb" % self.path())
        cndb = self.backup.get_completed_nodes_db()
        assert self.stats is not None
        path_digest = Digest.dataDigest(self.path().encode('utf8'))
        encoded = (self.digest +
            IntegerEncodings.binary_encode_int_varlen(self.level) +
            IntegerEncodings.binary_encode_int_varlen(self.get_type()) +
            serialize_stats(self.get_stats()))
        if not cndb.has_key(path_digest) or cndb[path_digest] != encoded:
            cndb[path_digest] = encoded
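# A hedged sketch (assumed helper, not from this source) of what
# FileIO.read_blocks likely does, given how the scan loop above consumes it:
# yield fixed-size chunks from an open file handle until EOF.
def read_blocks_sketch(handle, block_size):
    while True:
        data = handle.read(block_size)
        if not data:
            break
        yield data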
def save_epoch_data(self):
    # So far, the cache is too resource-intensive.
    # Avoid keeping it persistently until it's better optimized.
    return
    longevity_os = StringIO.StringIO()
    for digest, longevity in self.block_longevity.iteritems():
        longevity_os.write(digest)
        longevity_os.write(IE.binary_encode_int_varlen(longevity))
        epoch = self.block_epoch[digest]
        longevity_os.write(IE.binary_encode_int_varlen(epoch))
    self.block_longevity_data["data"] = longevity_os.getvalue()
    self.block_longevity_data["epoch"] = str(self.epoch)
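# A hedged sketch (not from this source) of the matching loader. It assumes
# the record layout written above -- a fixed-size digest of
# Digest.dataDigestSize() bytes followed by two varlen integers -- and
# assumes IE.binary_read_int_varlen is the decoding counterpart of
# IE.binary_encode_int_varlen.
def load_epoch_data_sketch(self):
    self.block_longevity = {}
    self.block_epoch = {}
    io = StringIO.StringIO(self.block_longevity_data["data"])
    digest_size = Digest.dataDigestSize()  # assumed fixed digest length
    while True:
        digest = io.read(digest_size)
        if len(digest) < digest_size:
            break  # end of stream
        self.block_longevity[digest] = IE.binary_read_int_varlen(io)
        self.block_epoch[digest] = IE.binary_read_int_varlen(io)
    self.epoch = int(self.block_longevity_data["epoch"])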
def update_hlink(self, ctx):
    # Windows has no inode-based hard links, so there is nothing to track.
    if os.name == 'nt':
        return
    # A link count of 1 means the file has no other hard links.
    if self.stats[stat.ST_NLINK] == 1:
        return
    inode_num = self.stats[stat.ST_INO]
    if ctx.inodes_db.has_key(inode_num):
        return
    # Remember this inode's digest and level so later links to the same
    # inode can reuse them instead of rescanning the data.
    ctx.inodes_db[inode_num] = self.digest + \
        IntegerEncodings.binary_encode_int_varlen(self.level)
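# A hedged sketch (not the original scan_hlink) of the lookup side of the
# inode map written above: if this inode was already scanned, reuse its
# stored digest and level instead of re-reading the file. It assumes the
# digest occupies Digest.dataDigestSize() bytes, as encoded in update_hlink,
# and that IntegerEncodings.binary_read_int_varlen reads one varlen integer
# from a file-like object.
def scan_hlink_sketch(self, ctx):
    if os.name == 'nt' or self.stats[stat.ST_NLINK] == 1:
        return False
    inode_num = self.stats[stat.ST_INO]
    if not ctx.inodes_db.has_key(inode_num):
        return False
    encoded = ctx.inodes_db[inode_num]
    digest_size = Digest.dataDigestSize()  # assumed fixed digest length
    self.digest = encoded[:digest_size]
    self.level = IntegerEncodings.binary_read_int_varlen(
        StringIO.StringIO(encoded[digest_size:]))
    return True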
def write(self, ctx):
    """Write the info of the current dir to database."""
    packer = PackerStream.PackerOStream(self.backup, Container.CODE_DIR)
    # Sorting is an optimization to make everybody access files in the same
    # order.
    # TODO: measure if this really makes things faster
    # (probably will with a btree db)
    for child in self.children:
        Format.write_int(packer, child.get_type())
        Format.write_string(packer, child.get_name().encode('utf8'))
        packer.write(child.get_digest())
        packer.write(IntegerEncodings.binary_encode_int_varlen(child.get_level()))
        stats_str = serialize_stats(child.get_stats())
        packer.write(stats_str)
    self.digest = packer.get_digest()
    self.level = packer.get_level()
    return (packer.get_num_new_blocks(), packer.get_size_new_blocks())
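# A hedged sketch (assumed, not from this source) of how a reader would parse
# one child record in the layout written above: an int type, a UTF-8 name,
# a fixed-size digest, a varlen level, then the serialized stats.
# Format.read_int, Format.read_string, unserialize_stats, and
# IntegerEncodings.binary_read_int_varlen are assumed counterparts of the
# writers used in write(); all of them read from the same unpacker stream.
def read_child_sketch(unpacker):
    node_type = Format.read_int(unpacker)
    name = Format.read_string(unpacker).decode('utf8')
    digest = unpacker.read(Digest.dataDigestSize())  # assumed digest length
    level = IntegerEncodings.binary_read_int_varlen(unpacker)
    stats = unserialize_stats(unpacker)
    return (node_type, name, digest, level, stats)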
    #
    # Update the current dir in completed_nodes_db
    #
    cndb = self.backup.get_completed_nodes_db()
    for subdir in subdirs:
        subdir_path = os.path.join(self.path(), subdir)
        subdir_path_digest = Digest.dataDigest(subdir_path.encode('utf8'))
        if cndb.has_key(subdir_path_digest):
            del cndb[subdir_path_digest]

    if self.stats is not None:
        # Stats are empty for the root node, but we don't want to store
        # it in the cndb, because at this point we're already done with the
        # increment anyway.
        digest = Digest.dataDigest(self.path().encode('utf8'))
        encoded = (self.digest +
            IntegerEncodings.binary_encode_int_varlen(self.level) +
            IntegerEncodings.binary_encode_int_varlen(self.get_type()) +
            serialize_stats(self.get_stats()))
        if not cndb.has_key(digest) or cndb[digest] != encoded:
            cndb[digest] = encoded

    if self.digest != prev_digest:
        #print "changed node", self.path()
        ctx.changed_nodes += 1

def get_percent_done(self):
    if self.cur_scanned_child is None:
        return self.weight * self.processed_percent
    else:
        # The original line was truncated here; the continuation below is an
        # assumed reconstruction that adds the in-progress child's share.
        return (self.weight * self.processed_percent +
                self.cur_scanned_child.get_percent_done())
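# A hedged sketch (assumed, not from this source) of decoding one cndb
# record in the layout stored above: a fixed-size digest, a varlen level,
# a varlen node type, then the serialized stats as the remainder. Assumes
# Digest.dataDigestSize() and IntegerEncodings.binary_read_int_varlen exist
# as counterparts of the encoders used here.
def decode_cndb_entry_sketch(encoded):
    io = StringIO.StringIO(encoded)
    digest = io.read(Digest.dataDigestSize())  # assumed digest length
    level = IntegerEncodings.binary_read_int_varlen(io)
    node_type = IntegerEncodings.binary_read_int_varlen(io)
    stats_str = io.read()  # remainder is the serialized stats
    return (digest, level, node_type, stats_str)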
def _encode_block_info(seq_idx, container_idx):
    # Pack the sequence and container indices as two variable-length integers.
    io = StringIO.StringIO()
    io.write(IE.binary_encode_int_varlen(seq_idx))
    io.write(IE.binary_encode_int_varlen(container_idx))
    return io.getvalue()
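# A hedged sketch of the inverse operation (assumed, not from this source):
# unpack the two varlen integers written by _encode_block_info, assuming
# IE.binary_read_int_varlen reads one varlen integer from a file-like object.
def _decode_block_info_sketch(encoded):
    io = StringIO.StringIO(encoded)
    seq_idx = IE.binary_read_int_varlen(io)
    container_idx = IE.binary_read_int_varlen(io)
    return (seq_idx, container_idx)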