Exemplo n.º 1
0
    def test_fsck(self):
        """Check that ``fsck`` reports a misplaced chunk and a chunk with altered content."""
        hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)

        original_file = 'data/think-hires.jpg'
        hfs.put(original_file)

        # Name of the chunk this test inspects; it is reused for the
        # misplaced hard link created below.
        chunk_name = 'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv'
        chunk = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'No', chunk_name)

        # Right after the put, nothing is expected to be reported.
        corrupted_files = hfs.fsck()
        self.assertEqual(len(corrupted_files), 0)

        # Create a hard link placing the file on a wrong directory
        wrong_dir = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB')
        chunk_in_wrong_dir = os.path.join(wrong_dir, chunk_name)
        os.makedirs(wrong_dir)
        os.link(chunk, chunk_in_wrong_dir)

        corrupted_files = hfs.fsck()
        self.assertEqual(len(corrupted_files), 1)
        self.assertIn(chunk_name, corrupted_files)

        # Overwrite the chunk's content; the hard link created above points
        # at the same data, so two entries are expected afterwards.
        with open(chunk, 'wb') as f:
            f.write(b'blabla')

        corrupted_files = hfs.fsck()
        self.assertEqual(len(corrupted_files), 2)
        self.assertIn(chunk_name, corrupted_files)
Exemplo n.º 2
0
    def test_fsck_with_remove_corrupted(self):
        """Check that ``fsck(remove_corrupted=True)`` reports and deletes a misplaced chunk."""
        hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)

        original_file = 'data/think-hires.jpg'
        hfs.put(original_file)

        # Name of the chunk this test inspects; it is reused for the
        # misplaced hard link created below.
        chunk_name = 'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv'
        chunk = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'No', chunk_name)

        # Right after the put, nothing is expected to be reported.
        corrupted_files = hfs.fsck()
        self.assertEqual(len(corrupted_files), 0)

        # Hard-link the chunk into a directory it does not belong to.
        wrong_dir = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB')
        chunk_in_wrong_dir = os.path.join(wrong_dir, chunk_name)
        os.makedirs(wrong_dir)
        os.link(chunk, chunk_in_wrong_dir)

        corrupted_files = hfs.fsck(remove_corrupted=True)
        self.assertEqual(len(corrupted_files), 1)
        self.assertIn(chunk_name, corrupted_files)
        # With remove_corrupted=True the misplaced link must be gone.
        self.assertFalse(os.path.exists(chunk_in_wrong_dir))
Exemplo n.º 3
0
class MultihashIndex(object):
    """Index of files added for one spec.

    An instance ties together three collaborators created in ``__init__``:
    a ``MultihashFS`` object store (``self._hfs``), a ``Manifest`` stored at
    ``<index_path>/metadata/<spec>/MANIFEST.yaml`` (``self._mf``) and a
    ``FullIndex`` (``self._full_idx``).
    """

    def __init__(self, spec, index_path, object_path, mutability=Mutability.STRICT.value, cache_path=None):
        """Create the index for ``spec``.

        :param spec: name of the spec; used as the metadata sub-directory name.
        :param index_path: root directory of the index.
        :param object_path: directory handed to ``MultihashFS``.
        :param mutability: mutability value forwarded to ``FullIndex``.
        :param cache_path: cache location forwarded to
            ``FullIndex.check_and_update`` when files are re-added.
        """
        self._spec = spec
        self._path = index_path
        self._hfs = MultihashFS(object_path)
        self._mf = self._get_index(index_path)
        self._full_idx = FullIndex(spec, index_path, mutability)
        self._cache = cache_path

    def _get_index(self, idxpath):
        """Return the ``Manifest`` for this spec, creating its directory if needed."""
        metadatapath = os.path.join(idxpath, 'metadata', self._spec)
        ensure_path_exists(metadatapath)

        mfpath = os.path.join(metadatapath, 'MANIFEST.yaml')
        return Manifest(mfpath)

    def add(self, path, manifestpath, files=None):
        """Add files found under ``path`` to the index.

        :param path: base directory of the files.
        :param manifestpath: path of the manifest YAML loaded while adding.
        :param files: optional iterable of paths relative to ``path``; each
            entry may be a file or a directory. When empty or omitted, the
            whole ``path`` directory is added (if it is a directory).
        """
        # A ``None`` sentinel replaces the former mutable ``[]`` default.
        files = [] if files is None else files
        self.wp = pool_factory(pb_elts=0, pb_desc='files')
        if len(files) > 0:
            # Only plain files are counted up front; directories add their
            # own totals while they are walked.
            single_files = [f for f in files if os.path.isfile(os.path.join(path, f))]
            self.wp.progress_bar_total_inc(len(single_files))
            for f in files:
                fullpath = os.path.join(path, f)
                if os.path.isdir(fullpath):
                    self._add_dir(path, manifestpath, f)
                elif os.path.isfile(fullpath):
                    self._add_single_file(path, manifestpath, f)
                else:
                    log.warn('[%s] Not found!' % fullpath, class_name=MULTI_HASH_CLASS_NAME)
        elif os.path.isdir(path):
            self._add_dir(path, manifestpath)
        self.wp.progress_bar_close()

    def _adding_dir_work_future_process(self, futures, wp):
        """Record every finished future in the manifest, then reset ``wp``'s futures.

        Exceptions raised by ``future.result()`` propagate to the caller.
        """
        for future in futures:
            scid, filepath, previous_hash = future.result()
            # A ``None`` scid means ``_add_file`` stored nothing for this file.
            if scid is not None:
                self.update_index(scid, filepath, previous_hash)
        wp.reset_futures()

    def _adding_dir_work(self, files, args):
        """Process one group of files for ``_add_dir``.

        :param files: indexes into ``args['all_files']``.
        :param args: dict with the keys ``wp``, ``basepath``,
            ``f_index_file``, ``all_files`` and ``dirpath``.
        :return: ``True`` on success; ``False`` after saving partial state
            when processing the results raised.
        """
        # Use one pool object for both submitting and waiting.
        wp = args['wp']
        for k in files:
            filepath = args['all_files'][k]
            if ('.spec' in filepath) or ('README' in filepath):
                # Metadata files are copied directly and are not part of
                # the progress total.
                wp.progress_bar_total_inc(-1)
                self.add_metadata(args['basepath'], filepath)
            else:
                wp.submit(self._add_file, args['basepath'], filepath, args['f_index_file'])
        futures = wp.wait()
        try:
            self._adding_dir_work_future_process(futures, wp)
        except Exception as e:
            # Save the manifest of files added to the index so far.
            self._full_idx.save_manifest_index()
            self._mf.save()
            log.error('Error adding dir [%s] -- [%s]' % (args['dirpath'], e), class_name=MULTI_HASH_CLASS_NAME)
            return False
        return True

    def _add_dir(self, dirpath, manifestpath, file_path='', trust_links=True):
        """Walk ``dirpath/file_path`` and add every file found.

        :param dirpath: base directory; stored paths are relative to it.
        :param manifestpath: manifest YAML loaded into ``self.manifestfiles``.
        :param file_path: optional sub-directory of ``dirpath`` to walk.
        :param trust_links: not used by this method.
        :return: ``False`` if a group failed; ``None`` otherwise.
        """
        self.manifestfiles = yaml_load(manifestpath)
        f_index_file = self._full_idx.get_index()
        for root, dirs, files in os.walk(os.path.join(dirpath, file_path)):
            # NOTE(review): this skips any root whose path starts with '.',
            # which also matches relative roots such as './data' - confirm intent.
            if root.startswith('.'):
                continue
            basepath = root[:len(dirpath) + 1]
            relativepath = root[len(dirpath) + 1:]
            # Build the list per directory: accumulating it across the walk
            # re-submitted files of earlier directories on every iteration
            # and inflated the progress-bar total.
            dir_files = [os.path.join(relativepath, file) for file in files]
            if not dir_files:
                continue
            self.wp.progress_bar_total_inc(len(dir_files))
            args = {'wp': self.wp, 'basepath': basepath, 'f_index_file': f_index_file,
                    'all_files': dir_files, 'dirpath': dirpath}
            result = run_function_per_group(range(len(dir_files)), 10000,
                                            function=self._adding_dir_work, arguments=args)
            if not result:
                return False
        self._full_idx.save_manifest_index()
        self._mf.save()

    def _add_single_file(self, base_path, manifestpath, file_path):
        """Add the single file ``base_path/file_path`` and save the index.

        Files whose path contains ``.spec`` or ``README`` are copied as
        metadata; every other file is submitted to the worker pool.
        """
        self.manifestfiles = yaml_load(manifestpath)

        f_index_file = self._full_idx.get_index()
        if ('.spec' in file_path) or ('README' in file_path):
            self.wp.progress_bar_total_inc(-1)
            self.add_metadata(base_path, file_path)
        else:
            self.wp.submit(self._add_file, base_path, file_path, f_index_file)
            futures = self.wp.wait()
            for future in futures:
                try:
                    scid, filepath, previous_hash = future.result()
                    if scid is not None:
                        self.update_index(scid, filepath, previous_hash)
                except Exception as e:
                    # save the manifest of files added to index so far
                    self._full_idx.save_manifest_index()
                    self._mf.save()
                    log.error('Error adding dir [%s] -- [%s]' % (base_path, e), class_name=MULTI_HASH_CLASS_NAME)
                    return
            self.wp.reset_futures()
        self._full_idx.save_manifest_index()
        self._mf.save()

    def add_metadata(self, basepath, filepath):
        """Copy ``basepath/filepath`` into this spec's metadata directory."""
        log.debug('Add file [%s] to ml-git index' % filepath, class_name=MULTI_HASH_CLASS_NAME)
        fullpath = os.path.join(basepath, filepath)

        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)

        dstpath = os.path.join(metadatapath, filepath)
        # Remove an existing destination first so the copy creates a new
        # file instead of writing into the existing one.
        if os.path.exists(dstpath):
            os.unlink(dstpath)
        shutil.copy2(fullpath, dstpath)

    # TODO add : stat to MANIFEST from original file ...
    def update_index(self, objectkey, filename, previous_hash=None):
        """Register ``filename`` (converted with ``posix_path``) under ``objectkey`` in the manifest."""
        self._mf.add(objectkey, posix_path(filename), previous_hash)

    def remove_manifest(self):
        """Delete this spec's ``MANIFEST.yaml``; a missing file is not an error."""
        index_metadata_path = os.path.join(self._path, 'metadata', self._spec)
        try:
            os.unlink(os.path.join(index_metadata_path, 'MANIFEST.yaml'))
        except FileNotFoundError:
            pass

    def _save_index(self):
        """Save the manifest."""
        self._mf.save()

    def get_index(self):
        """Return the ``Manifest`` object of this index."""
        return self._mf

    def _add_file(self, basepath, filepath, f_index_file):
        """Store one file in the object store and update the full index.

        :param basepath: base directory of the file.
        :param filepath: path of the file relative to ``basepath``.
        :param f_index_file: mapping returned by ``FullIndex.get_index()``,
            keyed by POSIX-style relative path.
        :return: tuple ``(scid, filepath, previous_hash)``; ``scid`` is
            ``None`` when nothing was stored.
        """
        fullpath = os.path.join(basepath, filepath)
        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)

        index_key = posix_path(filepath)
        scid = None
        previous_hash = None
        check_file = f_index_file.get(index_key)
        if check_file is None:
            # Not in the full index yet: store it and record it as added.
            scid = self._hfs.put(fullpath)
            self._full_idx.update_full_index(index_key, fullpath, Status.a.name, scid)
        else:
            if self._full_idx.check_and_update(filepath, check_file, self._hfs, index_key, fullpath, self._cache):
                scid = self._hfs.put(fullpath)

            # Read the entry again after check_and_update has run.
            updated_check = f_index_file.get(index_key)
            if 'previous_hash' in updated_check:
                previous_hash = updated_check['previous_hash']

        return scid, filepath, previous_hash

    def get(self, objectkey, path, file):
        """Fetch ``objectkey`` from the object store into ``path/file``.

        The destination directory is created when missing. Returns whatever
        ``MultihashFS.get`` returns.
        """
        log.info('Getting file [%s] from local index' % file, class_name=MULTI_HASH_CLASS_NAME)
        dirs = os.path.dirname(file)
        fulldir = os.path.join(path, dirs)
        ensure_path_exists(fulldir)

        dstfile = os.path.join(path, file)
        return self._hfs.get(objectkey, dstfile)

    def reset(self):
        """Delete the whole index directory and recreate it empty."""
        shutil.rmtree(self._path)
        os.mkdir(self._path)

    def fsck(self):
        """Run ``fsck`` on the underlying object store and return its result."""
        return self._hfs.fsck()

    def update_index_manifest(self, hash_files):
        """Add every ``key -> files`` association of ``hash_files`` to the manifest and save it."""
        for key in hash_files:
            values = list(hash_files[key])
            for e in values:
                self._mf.add(key, e)
        self._save_index()

    def get_index_yalm(self):
        """Return the ``FullIndex`` object of this index."""
        return self._full_idx

    def remove_deleted_files_index_manifest(self, deleted_files):
        """Remove each entry of ``deleted_files`` from the manifest and save it."""
        manifest = self.get_index()
        for file in deleted_files:
            manifest.rm_file(file)
        manifest.save()