def debug_redis(*args):
    """ Run an arbitrary Redis command and log the reply on stderr. """
    result = redis.execute_command(*args)
    if isinstance(result, list):
        for member in result:
            log.err(member)
    else:
        log.err(result)

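# For reference, debug_redis() passes any raw command through execute_command(),
# so it can be driven much like redis-cli. A hypothetical call (the key is
# illustrative; real keys are 'hash:' plus an MD5 hex digest):
#
#     debug_redis('LRANGE', 'hash:d41d8cd98f00b204e9800998ecf8427e', 0, -1)
#     debug_redis('DBSIZE')
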
def dupes():
    """ Byte-compare files which share an MD5 hash and print confirmed duplicates. """
    for key in sorted(redis.keys('hash:*')):
        key = key.decode()
        collision_paths = redis.lrange(key, 0, -1)
        for collision_pair in combinations(collision_paths, 2):
            log.err('Checking', key, *collision_pair)
            file_name_0 = collision_pair[0].decode()
            file_name_1 = collision_pair[1].decode()
            # A shared hash is only a candidate; shallow=False forces a full
            # content comparison before reporting a duplicate.
            if filecmp.cmp(file_name_0, file_name_1, shallow=False):
                print(file_name_0, '\t', file_name_1)

def scan(dir_, min_size):
    """ Hash every file under dir_ of at least min_size bytes and record the results in Redis. """
    for abspath in files.scan_files(dir_, progress_interval_sec=PROGRESS_INTERVAL_SEC):
        if os.stat(abspath).st_size < min_size:
            # Too small
            continue
        elif redis.get('path:' + abspath):
            # Already hashed
            continue
        log.err('Hashing', abspath)
        with open(abspath, 'rb') as file:
            md5er = hashlib.md5()
            try:
                # Read in buffered chunks: a single read() of a very large file can
                # crash Python (observed on macOS, perhaps only there). Intended
                # behavior is undocumented, so chunking is the safe approach.
                for buf in iter(partial(file.read, BUFFER_THRESH), b''):
                    md5er.update(buf)
                md5 = md5er.hexdigest()
            except OSError as e:
                log.err(' ', abspath, e)
                continue
        log.err(' ', os.path.relpath(abspath, start=os.getcwd()), md5)
        redis.set('path:' + abspath, md5)
        redis.rpush('hash:' + md5, abspath)
        print(abspath)

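# The functions above rely on module-level setup outside this excerpt. A minimal
# sketch of what they appear to assume; the client construction and the constant
# values are guesses, not the module's actual contents:
#
#     import filecmp
#     import hashlib
#     import os
#     from functools import partial
#     from itertools import combinations
#
#     import redis as redis_lib  # assumes the redis-py package
#
#     import files  # local module providing scan_files()
#     import log    # local helper whose err() writes to stderr
#
#     redis = redis_lib.Redis()         # connection details are assumptions
#     PROGRESS_INTERVAL_SEC = 5         # assumed value
#     BUFFER_THRESH = 64 * 1024 * 1024  # assumed chunk size for hashing
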
def scan_files(dir_, progress_interval_sec=None, excludes=None):
    """
    Scan a directory recursively for real files.

    :param dir_: path to scan
    :param progress_interval_sec: how frequently to output scan progress messages on stderr, if at all
    :param excludes: optional fnmatch patterns; matching paths are skipped
    :return: a generator which yields absolute paths to files
    """
    global last_progress
    filenames = os.listdir(dir_)
    for filename in filenames:
        abspath = os.path.join(dir_, filename)

        # Output a note on progress?
        now_time = time.perf_counter()
        if progress_interval_sec and now_time > last_progress + progress_interval_sec:
            log.err('Scan progress', abspath)
            last_progress = now_time

        if os.path.islink(abspath):
            # Ignore links
            continue
        elif os.path.isdir(abspath):
            # Recurse into directories, propagating the progress and exclude
            # settings so they apply below the top level too
            yield from scan_files(abspath, progress_interval_sec=progress_interval_sec,
                                  excludes=excludes)
            continue
        elif not os.path.isfile(abspath):
            # Not a file (what is it?)
            continue
        elif excludes and any(fnmatch.fnmatch(abspath, exclude) for exclude in excludes):
            # Matched an exclude pattern
            continue
        yield abspath

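# scan_files() expects module-level state and imports along these lines;
# the initial value and the call below are illustrative assumptions:
#
#     import fnmatch
#     import os
#     import time
#
#     import log  # local helper, as above
#
#     last_progress = 0.0  # throttles progress messages across calls
#
#     # Hypothetical usage: emit a progress note at most every 5 seconds,
#     # skipping anything under a .git directory.
#     for path in scan_files('/data/photos', progress_interval_sec=5,
#                            excludes=['*/.git/*']):
#         print(path)
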
def sync(preview, single_select):
    """ Back up configured paths to destination. """
    log.err('')
    log.err(datetime.now().isoformat(), 'Starting sync')
    for (path_spec, base_excludes, local_path, dest) in iter_sync_dirs(single_select):
        cmd = ['aws', 's3', 'sync']
        cmd.extend(base_excludes)
        if 'cli_args' in path_spec:
            cmd.extend(path_spec['cli_args'])
        cmd.append(local_path)
        cmd.append(dest)
        if preview:
            print(' '.join(cmd))
        else:
            print(local_path, ' > ', dest)
            subprocess.call(cmd)
    log.err(datetime.now().isoformat(), 'Finished sync')

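# iter_sync_dirs() is not shown in this excerpt. Judging from the tuple it
# yields, each entry plausibly comes from a config block like the following;
# every field name here is an assumption, not confirmed by this code:
#
#     path_spec = {
#         'path': '~/Documents',
#         'dest': 's3://my-backup-bucket/Documents',
#         'cli_args': ['--exclude', '*.tmp', '--storage-class', 'STANDARD_IA'],
#     }
#
# In preview mode the assembled command is printed instead of run, e.g.:
#
#     aws s3 sync --exclude .DS_Store ~/Documents s3://my-backup-bucket/Documents
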
def validate_config():
    """ Validate and print configuration. """
    log.err('Config at', cfg.CONFIG_PATH)
    print(json.dumps(cfg.load(), indent=2))

def set_config_path(path):
    """ Set location of configuration. """
    cfg.set_config_path(path)
    log.err('Config path set to', path)