def test_remote_fsck(self):
    """Remote fsck should restore a blob that was deleted from the bucket."""
    bucket = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    objects_path = os.path.join(self.tmp_dir, 'objectsfs')
    object_fs = MultihashFS(objects_path)
    object_fs.put(HDATA_IMG_1)
    s3 = boto3.resource(
        's3',
        region_name='us-east-1',
        aws_access_key_id='fake_access_key',
        aws_secret_access_key='fake_secret_key',
    )
    s3.Object(bucket, 'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A').delete()
    self.assertRaises(botocore.exceptions.ClientError,
                      lambda: self.check_delete(s3, bucket))
    metadata_path = os.path.join(self.tmp_dir, 'metadata-test')
    dataset_spec = get_sample_spec(bucket)
    spec_dir = os.path.join(metadata_path, 'vision-computing', 'images', 'dataset-ex')
    ensure_path_exists(spec_dir)
    yaml_save(dataset_spec, os.path.join(spec_dir, 'dataset-ex.spec'))
    manifest_file = os.path.join(spec_dir, 'MANIFEST.yaml')
    yaml_save({'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh': {'imghires.jpg'}},
              manifest_file)
    full_spec_path = os.path.join(spec_dir, os.path.join(spec_dir, 'dataset-ex.spec'))
    spec = 'vision-computing__images__dataset-ex__5'
    config = yaml_load('hdata/config.yaml')
    repo = LocalRepository(config, objects_path)
    self.assertTrue(repo.remote_fsck(metadata_path, spec, full_spec_path, 2, True, True))
    # load() returns None once the object exists again in the bucket.
    self.assertEqual(None, s3.Object(bucket, 'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A').load())
def test_put1024K_pathexistence_level3(self):
    """A 3-level hashfs stores each chunk under three 2-char directories."""
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024, levels=3)
    hfs.put('data/think-hires.jpg')
    expected_chunk = os.path.join(
        self.tmp_dir, 'hashfs', 'aU', 'No', 'RA',
        'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    self.assertTrue(os.path.exists(expected_chunk))
def test_fsck(self):
    """fsck flags chunks stored under the wrong directory and corrupted blobs."""
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    hfs.put('data/think-hires.jpg')
    chunk_hash = 'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv'
    chunk = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'No', chunk_hash)
    self.assertTrue(len(hfs.fsck()) == 0)
    # Create a hard link placing the file on a wrong directory
    wrong_dir = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB')
    os.makedirs(wrong_dir)
    os.link(chunk, os.path.join(wrong_dir, chunk_hash))
    corrupted = hfs.fsck()
    self.assertTrue(len(corrupted) == 1)
    self.assertTrue(chunk_hash in corrupted)
    # Now corrupt the original chunk's content as well.
    with open(chunk, 'wb') as f:
        f.write(b'blabla')
    corrupted = hfs.fsck()
    self.assertTrue(len(corrupted) == 2)
    self.assertTrue(chunk_hash in corrupted)
def test_put1024K_toomany_levels(self):
    """Requesting more levels than the digest allows still yields a valid path."""
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024, levels=23)
    hfs.put('data/think-hires.jpg')
    subdirs = ['aU', 'No', 'RA', 'zc', 'iw', '2J', 'Ji', '69', 's2', 'Hj',
               'fC', 'yz', 'Wt', '39', 'BH', 'Cu', 'cC', 'V2', 'Cs', 'AX',
               '6v', 'Sv']
    expected = os.path.join(self.tmp_dir, 'hashfs', *subdirs,
                            'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    self.assertTrue(os.path.exists(expected))
def test_get_update_links_wspace_with_duplicates(self):
    """Duplicate content is hard-linked into the workspace; stale links get pruned."""
    content_hash = 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'
    wspath = os.path.join(self.tmp_dir, 'wspace')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    key = MultihashFS(hfspath).put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cache = Cache(os.path.join(self.tmp_dir, 'cachefs'), '', '')
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    repo = LocalRepository(get_sample_config_spec(testbucketname, testprofile, testregion), hfspath)
    repo._update_cache(cache, key)
    mfiles = {}
    repo._update_links_wspace(cache, fidx, {DATA_IMG_1, DATA_IMG_2}, key, wspath,
                              mfiles, Status.u.name, 'strict')
    for img in (DATA_IMG_1, DATA_IMG_2):
        wspace_file = os.path.join(wspath, img)
        self.assertTrue(os.path.exists(wspace_file))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
    # cache entry + two workspace links -> 3 hard links on the last file checked.
    st = os.stat(wspace_file)
    self.assertTrue(st.st_nlink == 3)
    self.assertEqual(mfiles, {DATA_IMG_1: content_hash, DATA_IMG_2: content_hash})
    # Second pass: a stray file left in the workspace must be removed.
    wspath = os.path.join(self.tmp_dir, 'wspace')
    ensure_path_exists(wspath)
    to_be_removed = os.path.join(wspath, 'to_be_removed')
    with open(to_be_removed, 'w') as f:
        f.write('DEAD\n')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    key = MultihashFS(hfspath).put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cache = Cache(os.path.join(self.tmp_dir, 'cachefs'), '', '')
    repo = LocalRepository(yaml_load('hdata/config.yaml'), hfspath)
    repo._update_cache(cache, key)
    mfiles = {}
    repo._update_links_wspace(cache, fidx, {DATA_IMG_1, DATA_IMG_2}, key, wspath,
                              mfiles, Status.u.name, 'strict')
    repo._remove_unused_links_wspace(wspath, mfiles)
    self.assertFalse(os.path.exists(to_be_removed))
def test_get_simple(self):
    """A file round-trips through put()/get() unchanged."""
    src = 'data/think-hires.jpg'
    dst = os.path.join(self.tmp_dir, 'think-hires.jpg')
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    hfs.get(hfs.put(src), dst)
    self.assertEqual(self.md5sum(src), self.md5sum(dst))
def test_get_update_links_wspace(self):
    """_update_links_wspace hard-links cached blobs into the workspace and records them."""
    content_hash = 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'
    wspath = os.path.join(self.tmp_dir, 'wspace')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    key = MultihashFS(hfspath).put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cache = Cache(os.path.join(self.tmp_dir, 'cachefs'), '', '')
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    repo = LocalRepository(get_sample_config_spec(testbucketname, testprofile, testregion), hfspath)
    repo._update_cache(cache, key)
    mfiles = {}
    repo._update_links_wspace(cache, fidx, {DATA_IMG_1}, key, wspath,
                              mfiles, Status.u.name, 'strict')
    wspace_file = os.path.join(wspath, DATA_IMG_1)
    set_write_read(wspace_file)
    self.assertTrue(os.path.exists(wspace_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
    st = os.stat(wspace_file)
    # Every full-index entry should describe exactly this file.
    for entry_path, entry in fidx.get_index().items():
        self.assertEqual(entry_path, os.path.join('data', 'imghires.jpg'))
        self.assertEqual(entry['hash'], content_hash)
        self.assertEqual(entry['status'], 'u')
        self.assertEqual(entry['ctime'], st.st_ctime)
        self.assertEqual(entry['mtime'], st.st_mtime)
    # cache entry + one workspace link -> 2 hard links.
    self.assertTrue(st.st_nlink == 2)
    self.assertEqual(mfiles, {DATA_IMG_1: content_hash})
def test_corruption(self):
    """get() must fail and leave no output file when a chunk is corrupted."""
    src = 'data/think-hires.jpg'
    dst = os.path.join(self.tmp_dir, 'think-hires.jpg')
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    objkey = hfs.put(src)
    chunk = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'No',
                         'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    with open(chunk, 'wb') as f:
        f.write(b'blabla')
    self.assertFalse(hfs.get(objkey, dst))
    self.assertFalse(os.path.exists(dst))
def test_fsck_with_remove_corrupted(self):
    """fsck(remove_corrupted=True) deletes the misplaced chunk it reports."""
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    hfs.put('data/think-hires.jpg')
    chunk_hash = 'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv'
    chunk = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'No', chunk_hash)
    self.assertTrue(len(hfs.fsck()) == 0)
    wrong_dir = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB')
    misplaced = os.path.join(wrong_dir, chunk_hash)
    os.makedirs(wrong_dir)
    os.link(chunk, misplaced)
    corrupted = hfs.fsck(remove_corrupted=True)
    self.assertTrue(len(corrupted) == 1)
    self.assertTrue(chunk_hash in corrupted)
    self.assertFalse(os.path.exists(misplaced))
def test_get_update_cache(self):
    """_update_cache stores the blob under its key in the cache filesystem."""
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    key = MultihashFS(hfspath).put(HDATA_IMG_1)
    cache = Cache(os.path.join(self.tmp_dir, 'cachefs'), '', '')
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    repo = LocalRepository(get_sample_config_spec(testbucketname, testprofile, testregion), hfspath)
    repo._update_cache(cache, key)
    keypath = cache.get_keypath(key)
    self.assertTrue(os.path.exists(keypath))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(keypath))
class MultihashIndex(object):
    """Staging index for one spec: a MANIFEST plus a full index, backed by a
    content-addressed MultihashFS object store."""

    def __init__(self, spec, index_path, object_path, mutability=MutabilityType.STRICT.value, cache_path=None):
        self._spec = spec
        self._path = index_path
        self._hfs = MultihashFS(object_path)
        self._mf = self._get_index(index_path)
        self._full_idx = FullIndex(spec, index_path, mutability)
        self._cache = cache_path

    def _get_index(self, idxpath):
        # Open (creating directories as needed) <index>/metadata/<spec>/MANIFEST.yaml.
        metadatapath = os.path.join(idxpath, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        mfpath = os.path.join(metadatapath, 'MANIFEST.yaml')
        return Manifest(mfpath)

    def _add_dir(self, dir_path, manifest_path, file_path='', ignore_rules=None):
        """Walk dir_path/file_path and index every non-ignored file in groups.

        Returns False when a worker group failed; otherwise saves the full
        index and the manifest.
        """
        self.manifestfiles = yaml_load(manifest_path)
        f_index_file = self._full_idx.get_index()
        all_files = []
        # BUGFIX: base_path was referenced after the loop and raised NameError
        # when os.walk yielded nothing (e.g. a nonexistent directory).
        base_path = dir_path
        for root, dirs, files in os.walk(os.path.join(dir_path, file_path)):
            base_path = root[:len(dir_path) + 1]
            relative_path = root[len(dir_path) + 1:]
            if '.' == root[0] or should_ignore_file(ignore_rules, '{}/'.format(relative_path)):
                continue
            for file in files:
                rel_file = os.path.join(relative_path, file)
                if ignore_rules is None or not should_ignore_file(ignore_rules, rel_file):
                    all_files.append(rel_file)
        self.wp.progress_bar_total_inc(len(all_files))
        args = {
            'wp': self.wp,
            'base_path': base_path,
            'f_index_file': f_index_file,
            'all_files': all_files,
            'dir_path': dir_path,
        }
        result = run_function_per_group(range(len(all_files)), 10000,
                                        function=self._adding_dir_work, arguments=args)
        if not result:
            return False
        self._full_idx.save_manifest_index()
        self._mf.save()

    def add(self, path, manifestpath, files=None):
        """Add ``files`` under ``path`` (or the whole directory when empty).

        BUGFIX: ``files`` previously used a shared mutable default ``[]``;
        ``None`` now stands in for "no explicit file list" with identical
        behavior for all existing callers.
        """
        if files is None:
            files = []
        self.wp = pool_factory(pb_elts=0, pb_desc='files')
        ignore_rules = get_ignore_rules(path)
        if len(files) > 0:
            single_files = filter(lambda x: os.path.isfile(os.path.join(path, x)), files)
            self.wp.progress_bar_total_inc(len(list(single_files)))
            for f in files:
                fullpath = os.path.join(path, f)
                if os.path.isdir(fullpath):
                    self._add_dir(path, manifestpath, f, ignore_rules=ignore_rules)
                elif os.path.isfile(fullpath):
                    # NOTE(review): the ignore check is applied to ``path`` (the
                    # base directory), not to the file itself — confirm intended.
                    if not should_ignore_file(ignore_rules, path):
                        self._add_single_file(path, manifestpath, f)
                else:
                    log.warn(output_messages['WARN_NOT_FOUND'] % fullpath, class_name=MULTI_HASH_CLASS_NAME)
        else:
            if os.path.isdir(path):
                self._add_dir(path, manifestpath, ignore_rules=ignore_rules)
        self.wp.progress_bar_close()

    def _adding_dir_work_future_process(self, futures, wp):
        # Fold finished futures into the manifest; files whose scid is None
        # (unchanged content) are skipped.
        for future in futures:
            scid, filepath, previous_hash = future.result()
            if scid is not None:
                self.update_index(scid, filepath, previous_hash)
        wp.reset_futures()

    def _adding_dir_work(self, files, args):
        """Submit one group of file indices to the pool; False on failure."""
        for k in files:
            file_path = args['all_files'][k]
            if (SPEC_EXTENSION in file_path) or (file_path == 'README.md') or (file_path == MLGIT_IGNORE_FILE_NAME):
                args['wp'].progress_bar_total_inc(-1)
                self.add_metadata(args['base_path'], file_path)
            else:
                args['wp'].submit(self._add_file, args['base_path'], file_path, args['f_index_file'])
        futures = self.wp.wait()
        try:
            self._adding_dir_work_future_process(futures, self.wp)
        except Exception as e:
            # Persist whatever was indexed before the failure.
            self._full_idx.save_manifest_index()
            self._mf.save()
            log.error(output_messages['ERROR_ADDING_DIR'] % (args['dir_path'], e), class_name=MULTI_HASH_CLASS_NAME)
            return False
        return True

    def _add_single_file(self, base_path, manifestpath, file_path):
        """Index a single file; spec/README/ignore files go to metadata instead."""
        self.manifestfiles = yaml_load(manifestpath)
        f_index_file = self._full_idx.get_index()
        if (SPEC_EXTENSION in file_path) or ('README' in file_path) or (MLGIT_IGNORE_FILE_NAME in file_path):
            self.wp.progress_bar_total_inc(-1)
            self.add_metadata(base_path, file_path)
        else:
            self.wp.submit(self._add_file, base_path, file_path, f_index_file)
            futures = self.wp.wait()
            for future in futures:
                try:
                    scid, filepath, previous_hash = future.result()
                    if scid is not None:
                        self.update_index(scid, filepath, previous_hash)
                except Exception as e:
                    # save the manifest of files added to index so far
                    self._full_idx.save_manifest_index()
                    self._mf.save()
                    log.error(output_messages['ERROR_ADDING_DIR'] % (base_path, e), class_name=MULTI_HASH_CLASS_NAME)
                    return
            self.wp.reset_futures()
        self._full_idx.save_manifest_index()
        self._mf.save()

    def add_metadata(self, basepath, filepath, automatically_added=False):
        """Copy a metadata file into the index metadata area, replacing any old copy."""
        log.debug(output_messages['DEBUG_ADD_FILE'] % filepath, class_name=MULTI_HASH_CLASS_NAME)
        fullpath = os.path.join(basepath, filepath)
        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        dstpath = os.path.join(metadatapath, filepath)
        if not os.path.exists(dstpath):
            shutil.copy2(fullpath, dstpath)
        else:
            os.unlink(dstpath)
            shutil.copy2(fullpath, dstpath)
        if automatically_added:
            log.info(output_messages['INFO_FILE_AUTOMATICALLY_ADDED'].format(filepath), class_name=MULTI_HASH_CLASS_NAME)

    # TODO add : stat to MANIFEST from original file ...
    def update_index(self, objectkey, filename, previous_hash=None):
        self._mf.add(objectkey, posix_path(filename), previous_hash)

    def remove_manifest(self):
        """Delete this spec's MANIFEST.yaml from the index; absence is not an error."""
        index_metadata_path = os.path.join(self._path, 'metadata', self._spec)
        try:
            os.unlink(os.path.join(index_metadata_path, 'MANIFEST.yaml'))
        except FileNotFoundError:
            pass

    def _save_index(self):
        self._mf.save()

    def get_index(self):
        return self._mf

    def _add_file(self, basepath, filepath, f_index_file):
        """Store one file's chunks; returns (scid, filepath, previous_hash).

        scid is None when the file is already indexed and unchanged.
        """
        fullpath = os.path.join(basepath, filepath)
        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        scid = None
        previous_hash = None
        check_file = f_index_file.get(posix_path(filepath))
        if check_file is not None:
            if self._full_idx.check_and_update(filepath, check_file, self._hfs, posix_path(filepath), fullpath, self._cache):
                scid = self._hfs.put(fullpath)
                updated_check = f_index_file.get(posix_path(filepath))
                if 'previous_hash' in updated_check:
                    previous_hash = updated_check['previous_hash']
        else:
            scid = self._hfs.put(fullpath)
            self._full_idx.update_full_index(posix_path(filepath), fullpath, Status.a.name, scid)
        return scid, filepath, previous_hash

    def get(self, objectkey, path, file):
        """Restore one object into path/file, creating intermediate directories."""
        log.info(output_messages['INFO_GETTING_FILE'] % file, class_name=MULTI_HASH_CLASS_NAME)
        dirs = os.path.dirname(file)
        fulldir = os.path.join(path, dirs)
        ensure_path_exists(fulldir)
        dstfile = os.path.join(path, file)
        return self._hfs.get(objectkey, dstfile)

    def reset(self):
        """Drop and recreate the whole index directory."""
        shutil.rmtree(self._path)
        os.mkdir(self._path)

    def fsck(self, entity_path):
        return self._full_idx.fsck(entity_path, self._hfs, self._cache)

    def update_index_manifest(self, hash_files):
        """Merge a {key: iterable-of-paths} mapping into the manifest and save."""
        for key in hash_files:
            for e in list(hash_files[key]):
                self._mf.add(key, e)
        self._save_index()

    def get_index_yaml(self):
        return self._full_idx

    def remove_deleted_files_index_manifest(self, deleted_files):
        """Drop deleted files from the manifest and persist it."""
        manifest = self.get_index()
        for file in deleted_files:
            manifest.rm_file(file)
        manifest.save()

    def get_hashes_list(self):
        """Return the content hash of every entry in the full index."""
        idx_yaml = self._full_idx.get_index()
        return [idx_yaml[value]['hash'] for value in idx_yaml]
class MultihashIndex(object):
    # Staging index for one spec backed by a MultihashFS object store.
    # NOTE(review): this appears to be an older duplicate of MultihashIndex
    # (no ignore-rules support, 'get_index_yalm' typo) — confirm which
    # definition the module actually uses.

    def __init__(self, spec, index_path, object_path, mutability=Mutability.STRICT.value, cache_path=None):
        self._spec = spec
        self._path = index_path
        self._hfs = MultihashFS(object_path)
        self._mf = self._get_index(index_path)
        self._full_idx = FullIndex(spec, index_path, mutability)
        self._cache = cache_path

    def _get_index(self, idxpath):
        # Open (creating directories as needed) <index>/metadata/<spec>/MANIFEST.yaml.
        metadatapath = os.path.join(idxpath, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        mfpath = os.path.join(metadatapath, 'MANIFEST.yaml')
        return Manifest(mfpath)

    def add(self, path, manifestpath, files=[]):
        # NOTE(review): mutable default argument; 'files' is only read here,
        # but callers should not rely on the shared default list.
        self.wp = pool_factory(pb_elts=0, pb_desc='files')
        if len(files) > 0:
            single_files = filter(lambda x: os.path.isfile(os.path.join(path, x)), files)
            self.wp.progress_bar_total_inc(len(list(single_files)))
            for f in files:
                fullpath = os.path.join(path, f)
                if os.path.isdir(fullpath):
                    self._add_dir(path, manifestpath, f)
                elif os.path.isfile(fullpath):
                    self._add_single_file(path, manifestpath, f)
                else:
                    log.warn('[%s] Not found!' % fullpath, class_name=MULTI_HASH_CLASS_NAME)
        else:
            if os.path.isdir(path):
                self._add_dir(path, manifestpath)
        self.wp.progress_bar_close()

    def _adding_dir_work_future_process(self, futures, wp):
        # Fold finished futures into the manifest; entries with scid None are skipped.
        for future in futures:
            scid, filepath, previous_hash = future.result()
            self.update_index(scid, filepath, previous_hash) if scid is not None else None
        wp.reset_futures()

    def _adding_dir_work(self, files, args):
        # Process one group of file indices; returns False if collecting results failed.
        for k in files:
            filepath = args['all_files'][k]
            if ('.spec' in filepath) or ('README' in filepath):
                args['wp'].progress_bar_total_inc(-1)
                self.add_metadata(args['basepath'], filepath)
            else:
                args['wp'].submit(self._add_file, args['basepath'], filepath, args['f_index_file'])
        futures = self.wp.wait()
        try:
            self._adding_dir_work_future_process(futures, self.wp)
        except Exception as e:
            # Persist whatever was indexed before the failure.
            self._full_idx.save_manifest_index()
            self._mf.save()
            log.error('Error adding dir [%s] -- [%s]' % (args['dirpath'], e), class_name=MULTI_HASH_CLASS_NAME)
            return False
        return True

    def _add_dir(self, dirpath, manifestpath, file_path='', trust_links=True):
        # Walk dirpath/file_path and submit every file in groups of 10000.
        # NOTE(review): 'trust_links' is unused in this body — confirm it can be dropped.
        self.manifestfiles = yaml_load(manifestpath)
        f_index_file = self._full_idx.get_index()
        all_files = []
        for root, dirs, files in os.walk(os.path.join(dirpath, file_path)):
            if '.' == root[0]:
                continue
            # basepath keeps dirpath plus its trailing separator.
            basepath = root[:len(dirpath)+1:]
            relativepath = root[len(dirpath)+1:]
            for file in files:
                all_files.append(os.path.join(relativepath, file))
        self.wp.progress_bar_total_inc(len(all_files))
        args = {'wp': self.wp, 'basepath': basepath, 'f_index_file': f_index_file, 'all_files': all_files, 'dirpath': dirpath}
        result = run_function_per_group(range(len(all_files)), 10000, function=self._adding_dir_work, arguments=args)
        if not result:
            return False
        self._full_idx.save_manifest_index()
        self._mf.save()

    def _add_single_file(self, base_path, manifestpath, file_path):
        # Index one file; spec/README files are copied to metadata instead of hashed.
        self.manifestfiles = yaml_load(manifestpath)
        f_index_file = self._full_idx.get_index()
        if ('.spec' in file_path) or ('README' in file_path):
            self.wp.progress_bar_total_inc(-1)
            self.add_metadata(base_path, file_path)
        else:
            self.wp.submit(self._add_file, base_path, file_path, f_index_file)
            futures = self.wp.wait()
            for future in futures:
                try:
                    scid, filepath, previous_hash = future.result()
                    self.update_index(scid, filepath, previous_hash) if scid is not None else None
                except Exception as e:
                    # save the manifest of files added to index so far
                    self._full_idx.save_manifest_index()
                    self._mf.save()
                    log.error('Error adding dir [%s] -- [%s]' % (base_path, e), class_name=MULTI_HASH_CLASS_NAME)
                    return
            self.wp.reset_futures()
        self._full_idx.save_manifest_index()
        self._mf.save()

    def add_metadata(self, basepath, filepath):
        # Copy a metadata file into the index metadata area, replacing any old copy.
        log.debug('Add file [%s] to ml-git index' % filepath, class_name=MULTI_HASH_CLASS_NAME)
        fullpath = os.path.join(basepath, filepath)
        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        dstpath = os.path.join(metadatapath, filepath)
        if not os.path.exists(dstpath):
            shutil.copy2(fullpath, dstpath)
        else:
            os.unlink(dstpath)
            shutil.copy2(fullpath, dstpath)

    # TODO add : stat to MANIFEST from original file ...
    def update_index(self, objectkey, filename, previous_hash=None):
        self._mf.add(objectkey, posix_path(filename), previous_hash)

    def remove_manifest(self):
        # Delete the spec's MANIFEST.yaml; absence is not an error.
        index_metadata_path = os.path.join(self._path, 'metadata', self._spec)
        try:
            os.unlink(os.path.join(index_metadata_path, 'MANIFEST.yaml'))
        except FileNotFoundError:
            pass

    def _save_index(self):
        self._mf.save()

    def get_index(self):
        return self._mf

    def _add_file(self, basepath, filepath, f_index_file):
        # Store the file's chunks and report (scid, filepath, previous_hash);
        # scid stays None when the indexed file is unchanged.
        fullpath = os.path.join(basepath, filepath)
        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        scid = None
        check_file = f_index_file.get(posix_path(filepath))
        previous_hash = None
        if check_file is not None:
            if self._full_idx.check_and_update(filepath, check_file, self._hfs, posix_path(filepath), fullpath, self._cache):
                scid = self._hfs.put(fullpath)
                updated_check = f_index_file.get(posix_path(filepath))
                if 'previous_hash' in updated_check:
                    previous_hash = updated_check['previous_hash']
        else:
            scid = self._hfs.put(fullpath)
            self._full_idx.update_full_index(posix_path(filepath), fullpath, Status.a.name, scid)
        return scid, filepath, previous_hash

    def get(self, objectkey, path, file):
        # Restore one object into path/file, creating intermediate directories.
        log.info('Getting file [%s] from local index' % file, class_name=MULTI_HASH_CLASS_NAME)
        dirs = os.path.dirname(file)
        fulldir = os.path.join(path, dirs)
        ensure_path_exists(fulldir)
        dstfile = os.path.join(path, file)
        return self._hfs.get(objectkey, dstfile)

    def reset(self):
        # Drop and recreate the whole index directory.
        shutil.rmtree(self._path)
        os.mkdir(self._path)

    def fsck(self):
        # Delegate integrity checking to the underlying hash filesystem.
        return self._hfs.fsck()

    def update_index_manifest(self, hash_files):
        # Merge a {key: iterable-of-paths} mapping into the manifest and save.
        for key in hash_files:
            values = list(hash_files[key])
            for e in values:
                self._mf.add(key, e)
        self._save_index()

    def get_index_yalm(self):
        # NOTE(review): 'yalm' looks like a typo for 'yaml'; name kept for callers.
        return self._full_idx

    def remove_deleted_files_index_manifest(self, deleted_files):
        # Drop deleted files from the manifest and persist it.
        manifest = self.get_index()
        for file in deleted_files:
            manifest.rm_file(file)
        manifest.save()
def test_put1024K(self):
    """Every chunk produced with a 1 MiB block size is a known 1024K chunk."""
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    hfs.put('data/think-hires.jpg')
    for chunk_group in hfs.walk():
        for chunk in chunk_group:
            self.assertTrue(chunk in chunks1024)