示例#1
0
 def ls_partitions(self):
     """Print the number of partitions, then each partition's name, mountpoint and bundle count."""
     names, _ = path_util.ls(self.partitions)
     total = len(names)
     print('%d %s' % (total, 'partition' if total == 1 else 'partitions'))
     for name in names:
         link_path = os.path.join(self.partitions, name)
         # Each partition entry is read as a symlink; its target is reported as the mountpoint.
         mountpoint = os.readlink(link_path)
         data_dir = os.path.join(link_path, MultiDiskBundleStore.DATA_SUBDIRECTORY)
         # path_util.ls() yields a (directories, files) pair; every entry of either kind counts as a bundle.
         subdirs, files = path_util.ls(data_dir)
         count = len(subdirs + files)
         noun = 'bundle' if count == 1 else 'bundles'
         print('- %-016s\n\tmountpoint: %s\n\t%d %s' % (name, mountpoint, count, noun))
示例#2
0
    def add_partition(self, target, new_partition_name):
        """
        MultiDiskBundleStore specific method. Register the directory |target| as a new partition named
        |new_partition_name|.

        The work happens in this order:
          1. The temp subdirectory is created directly on |target|; the process exits with status 1 if that fails.
          2. The new name is added to the hash ring, and every bundle whose ring node differs from the partition
             it currently lives on is copied into that temp subdirectory.
          3. |target| is symlinked into the partitions directory, the temp subdirectory is renamed to become the
             data subdirectory, and a fresh temp subdirectory is created.
          4. The original copies of the relocated bundles are removed.
        """
        target = os.path.abspath(target)
        link_path = os.path.join(self.partitions, new_partition_name)
        staging_dir = os.path.join(target, MultiDiskBundleStore.TEMP_SUBDIRECTORY)

        try:
            path_util.make_directory(staging_dir)
        except:
            print >> sys.stderr, "Could not make directory %s on partition %s, aborting" % (staging_dir, target)
            sys.exit(1)

        # From this point on the ring may map bundles to the new partition.
        self.ring.add_node(new_partition_name)
        # Source paths of copied bundles; they are only removed once everything else has succeeded.
        stale_copies = []

        print >> sys.stderr, "Marking bundles for placement on new partition %s (might take a while)" % new_partition_name
        # Walk every existing partition and stage the bundles that no longer hash to it.
        existing, _ = path_util.ls(self.partitions)
        for current in existing:
            data_dir = os.path.join(self.partitions, current, MultiDiskBundleStore.DATA_SUBDIRECTORY)
            subdirs, files = path_util.ls(data_dir)
            for bundle in subdirs + files:
                if self.ring.get_node(bundle) == current:
                    # Already on the partition the ring assigns it to.
                    continue
                source = os.path.join(data_dir, bundle)
                staged = os.path.join(staging_dir, bundle)
                print >> sys.stderr, "copying %s to %s" % (source, staged)
                path_util.copy(source, staged)
                stale_copies.append(source)

        print >> sys.stderr, "Adding new partition as %s..." % link_path
        path_util.soft_link(target, link_path)

        # Promote the staged bundles: the temp subdirectory (reached through the new symlink) is renamed to
        # become the data subdirectory, then an empty temp subdirectory is recreated.
        # NOTE(review): the original comment calls this move atomic -- confirm against path_util.rename.
        linked_data = os.path.join(link_path, MultiDiskBundleStore.DATA_SUBDIRECTORY)
        linked_temp = os.path.join(link_path, MultiDiskBundleStore.TEMP_SUBDIRECTORY)
        path_util.rename(linked_temp, linked_data)
        path_util.make_directory(linked_temp)

        # Only now purge the originals.
        print >> sys.stderr, "Cleaning up drives..."
        for stale in stale_copies:
            path_util.remove(stale)

        print >> sys.stderr, "Successfully added partition '%s' to the pool." % new_partition_name
示例#3
0
 def ls_partitions(self):
     """
     Report on the partitions backing this bundle store.

     Prints a header line with the partition count, followed by one entry per partition showing its name,
     the target of its symlink (the mountpoint) and the number of bundles in its data subdirectory.
     """
     partition_names, _ = path_util.ls(self.partitions)
     header_noun = 'partition' if len(partition_names) == 1 else 'partitions'
     print('%d %s' % (len(partition_names), header_noun))
     for partition_name in partition_names:
         symlink = os.path.join(self.partitions, partition_name)
         mountpoint = os.readlink(symlink)
         # Directories and files inside the data subdirectory are both counted as bundles.
         bundle_dirs, bundle_files = path_util.ls(
             os.path.join(symlink, MultiDiskBundleStore.DATA_SUBDIRECTORY))
         num_bundles = len(bundle_dirs + bundle_files)
         entry_noun = 'bundle' if num_bundles == 1 else 'bundles'
         print('- %-016s\n\tmountpoint: %s\n\t%d %s' % (
             partition_name, mountpoint, num_bundles, entry_noun))
示例#4
0
    def rm_partition(self, partition):
        """
        Unlink the partition named |partition| from the bundle store and reset the LRU cache.

        Bundles on the partition are not moved. Removing the only remaining partition is refused with an error
        message. The process exits with status 1 when |partition| fails validation.
        """
        # Prevent foot-shooting: never drop the last partition.
        if self.__get_num_partitions() == 1:
            print >>sys.stderr, "Error, cannot remove last partition. If you really wish to delete CodaLab, please run the following command:"
            print >>sys.stderr, "      rm -rf %s" % self.codalab_home
            return

        link_path = os.path.join(self.partitions, partition)

        try:
            print(link_path)
            path_util.check_isvalid(link_path, 'rm-partition')
        except:
            print >>sys.stderr, "Partition with name '%s' does not exist. Run `cl ls-partitions` to see a list of mounted partitions." % partition
            sys.exit(1)

        print >>sys.stderr, "Unlinking partition %s from CodaLab deployment..." % partition
        path_util.remove(link_path)
        # Refresh the node list from what is left in the partitions directory.
        self.nodes, _ = path_util.ls(self.partitions)
        print >>sys.stderr, "Partition removed successfully from bundle store pool"
        print >>sys.stdout, "Warning: this does not affect the bundles in the removed partition or any entries in the bundle database"
        # Start over with an empty cache.
        self.lru_cache = OrderedDict()
示例#5
0
    def rm_partition(self, partition):
        """
        Drop the entry for |partition| from the bundle store and purge the LRU cache.

        No bundles are moved. If only one partition is left, an error is printed and nothing is removed.
        If |partition| does not pass validation, an error is printed and the process exits with status 1.
        """
        only_one_left = self.__get_num_partitions() == 1
        if only_one_left:
            # Prevent foot-shooting.
            print >> sys.stderr, "Error, cannot remove last partition. If you really wish to delete CodaLab, please run the following command:"
            print >> sys.stderr, "      rm -rf %s" % self.codalab_home
            return

        entry_path = os.path.join(self.partitions, partition)

        try:
            print(entry_path)
            path_util.check_isvalid(entry_path, 'rm-partition')
        except:
            print >> sys.stderr, "Partition with name '%s' does not exist. Run `cl ls-partitions` to see a list of mounted partitions." % partition
            sys.exit(1)

        print >> sys.stderr, "Unlinking partition %s from CodaLab deployment..." % partition
        path_util.remove(entry_path)
        # Re-read the remaining partitions so that self.nodes matches what is on disk.
        remaining, _ = path_util.ls(self.partitions)
        self.nodes = remaining
        print >> sys.stderr, "Partition removed successfully from bundle store pool"
        print >> sys.stdout, "Warning: this does not affect the bundles in the removed partition or any entries in the bundle database"
        self.lru_cache = OrderedDict()
示例#6
0
    def rm_partition(self, partition):
        """
        Remove |partition| from the bundle store and redistribute its bundles across the remaining partitions.

        Removing the only remaining partition is refused with an error message. If |partition| fails validation,
        an error is printed and the process exits with status 1.

        Otherwise the partition is taken out of the hash ring, every bundle in its data subdirectory is copied to
        the temp subdirectory of the partition the ring now assigns it to, each staged copy is renamed into that
        partition's data subdirectory, and finally the old data/temp subdirectories and the partition entry itself
        are removed.
        """
        if self.__get_num_partitions() == 1:
            # Prevent foot-shooting: never drop the last partition.
            print >> sys.stderr, "Error, cannot remove last partition. If you really wish to delete CodaLab, please run the following command:"
            print >> sys.stderr, "      rm -rf %s" % self.codalab_home
            return

        partition_abs_path = os.path.join(self.partitions, partition)
        old_mdata = os.path.join(partition_abs_path, MultiDiskBundleStore.DATA_SUBDIRECTORY)
        old_mtemp = os.path.join(partition_abs_path, MultiDiskBundleStore.TEMP_SUBDIRECTORY)

        try:
            print(partition_abs_path)
            path_util.check_isvalid(partition_abs_path, 'rm-partition')
        except:
            print >> sys.stderr, "Partition with name '%s' does not exist. Run `cl ls-partitions` to see a list of mounted partitions." % partition
            sys.exit(1)

        # Reset the ring to distribute across remaining partitions
        self.ring.remove_node(partition)
        # path_util.ls() yields a (directories, files) pair; both kinds of entry are bundles to move.
        dirs, files = path_util.ls(old_mdata)

        # Map each bundle name to the absolute path of the partition that should now hold it.
        relocations = dict()
        for bundle in dirs + files:
            relocations[bundle] = os.path.join(self.partitions, self.ring.get_node(bundle))

        # Copy all bundles off of the old partition to temp directories on the new partition.
        # The loop variable is deliberately NOT called `partition`: reusing that name would overwrite the
        # argument and make the final "Unlinking partition" message report a destination path instead.
        for bundle, destination in relocations.items():
            temp_dir = os.path.join(destination, MultiDiskBundleStore.TEMP_SUBDIRECTORY)
            from_path = os.path.join(old_mdata, bundle)
            to_path = os.path.join(temp_dir, 'stage-%s' % bundle)
            path_util.copy(from_path, to_path)

        # Now that each bundle is on the proper partition, move each from the staging area to the
        # production data subdirectory on its partition.
        for bundle, destination in relocations.items():
            temp_dir = os.path.join(destination, MultiDiskBundleStore.TEMP_SUBDIRECTORY)
            from_path = os.path.join(temp_dir, 'stage-%s' % bundle)
            to_path = os.path.join(destination, MultiDiskBundleStore.DATA_SUBDIRECTORY, bundle)
            path_util.rename(from_path, to_path)

        # Remove data from partition and unlink from CodaLab
        print >> sys.stderr, "Cleaning bundles off of partition..."
        path_util.remove(old_mdata)
        path_util.remove(old_mtemp)
        print >> sys.stderr, "Unlinking partition %s from CodaLab deployment..." % partition
        path_util.remove(partition_abs_path)
        print >> sys.stderr, "Partition removed successfully from bundle store pool"
示例#7
0
    def __init__(self, codalab_home):
        """
        Record the store's directory layout under |codalab_home|, list the partition nodes and start with an
        empty LRU cache.
        """
        home = path_util.normalize(codalab_home)
        self.codalab_home = home
        self.partitions = os.path.join(home, 'partitions')
        self.mtemp = os.path.join(home, MultiDiskBundleStore.MISC_TEMP_SUBDIRECTORY)

        # The base initializer runs before the partitions are listed so that the directories are populated.
        super(MultiDiskBundleStore, self).__init__()
        self.nodes, _ = path_util.ls(self.partitions)
        self.lru_cache = OrderedDict()
        # NOTE(review): the base initializer is invoked a second time here -- confirm whether this is needed.
        super(MultiDiskBundleStore, self).__init__()
示例#8
0
    def __init__(self, codalab_home):
        """
        Record the store's directory layout under |codalab_home| and build a hash ring over the partitions
        found in the partitions directory.
        """
        home = path_util.normalize(codalab_home)
        self.codalab_home = home
        self.partitions = os.path.join(home, 'partitions')
        self.mtemp = os.path.join(home, MultiDiskBundleStore.MISC_TEMP_SUBDIRECTORY)

        # The base initializer runs before the partitions are listed so that the directories are populated.
        super(MultiDiskBundleStore, self).__init__()
        partition_names, _ = path_util.ls(self.partitions)

        self.ring = HashRing(partition_names)
        # NOTE(review): the base initializer is invoked a second time here -- confirm whether this is needed.
        super(MultiDiskBundleStore, self).__init__()
示例#9
0
    def health_check(self,
                     model,
                     force=False,
                     compute_data_hash=False,
                     repair_hashes=False):
        """
        MultiDiskBundleStore.health_check(): In the MultiDiskBundleStore, bundle contents are stored on disk, and
        occasionally the disk gets out of sync with the database, in which case we make repairs in the following ways:

            1. Deletes bundles with corresponding UUID not in the database.
            2. Deletes any files not beginning with UUID string.
            3. For each bundle marked READY or FAILED, ensure that its dependencies are not located in the bundle
               directory. If they are then delete the dependencies.
            4. For bundle <UUID> marked READY or FAILED, <UUID>.cid or <UUID>.status, or the <UUID>(-internal).sh files
               should not exist.

        Every partition is scanned; the statistics printed to stderr at the end are totals over all partitions.

        |model|: Used to look up bundles by UUID (batch_get_bundles) and to store data hashes (update_bundle).
        |force|: Perform any destructive operations on the bundle store the health check determines are necessary. False by default
        |compute_data_hash|: If True, compute the data_hash for every single bundle ourselves and see if it's consistent with what's in
                             the database. False by default.
        |repair_hashes|: If True, and |force| and |compute_data_hash| are also set, overwrite a stored data_hash that
                         disagrees with the freshly computed digest. False by default.
        """
        UUID_REGEX = re.compile(r'^(%s)' % spec_util.UUID_STR)
        FINISHED_STATES = (State.READY, State.FAILED)
        STALE_EXTENSIONS = ('.cid', '.status', '.sh')

        def _delete_path(loc):
            """Prints the equivalent shell command for |loc| and, only when |force| is set, removes it."""
            cmd = 'rm -r \'%s\'' % loc
            print(cmd)
            if force:
                path_util.remove(loc)

        def _get_uuid(path):
            """Returns the UUID at the start of the path's basename, or None if there is none."""
            match = UUID_REGEX.match(os.path.basename(path))
            return match.group(1) if match else None

        def _is_bundle(path):
            """Returns whether the given path is a bundle directory/file"""
            return _get_uuid(path) == os.path.basename(path)

        def _check_bundle_paths(bundle_paths, db_bundle_by_uuid):
            """
            Takes in a list of bundle paths and a mapping of UUID to BundleModel, and returns a list of paths and
            subpaths that need to be removed.
            """
            to_delete = []
            for bundle_path in bundle_paths:
                # Screen for bundles stored on disk that are no longer in the database
                bundle = db_bundle_by_uuid.get(_get_uuid(bundle_path))
                if bundle is None:
                    to_delete.append(bundle_path)
                    continue
                # Delete dependencies stored inside of READY or FAILED bundles
                if bundle.state in FINISHED_STATES:
                    dep_paths = [
                        os.path.join(bundle_path, dep.child_path)
                        for dep in bundle.dependencies
                    ]
                    to_delete.extend(p for p in dep_paths if os.path.exists(p))
            return to_delete

        def _check_other_paths(other_paths, db_bundle_by_uuid):
            """
            Given a list of non-bundle paths, and a mapping of UUID to BundleModel, returns a list of paths to delete.
            """
            to_delete = []
            for path in other_paths:
                bundle = db_bundle_by_uuid.get(_get_uuid(path))
                if bundle is None:
                    to_delete.append(path)
                    continue
                if bundle.state not in FINISHED_STATES:
                    continue
                if path.endswith(STALE_EXTENSIONS):
                    to_delete.append(path)
                elif '.' in path:
                    # NOTE(review): this tests the whole path, not just the file name -- confirm that is intended.
                    print('WARNING: File %s is likely junk.' % path,
                          file=sys.stderr)
            return to_delete

        partitions, _ = path_util.ls(self.partitions)
        trash_count = 0
        # Initialised once, before the loop, so that the total covers every partition (not just the last one
        # scanned) and the name exists for the summary even when there are no partitions at all.
        data_hash_recomputed = 0

        for partition in partitions:
            print('Looking for trash in partition %s...' % partition,
                  file=sys.stderr)
            partition_path = os.path.join(
                self.partitions, partition,
                MultiDiskBundleStore.DATA_SUBDIRECTORY)
            # path_util.ls() yields a (directories, files) pair; both kinds of entry are inspected.
            dirs, files = path_util.ls(partition_path)
            entries = [os.path.join(partition_path, name) for name in dirs + files]
            bundle_paths = [entry for entry in entries if _is_bundle(entry)]
            other_paths = set(entries) - set(bundle_paths)

            # Batch get information for all bundles stored on-disk
            uuids = [_get_uuid(bundle_path) for bundle_path in bundle_paths]
            db_bundles = model.batch_get_bundles(uuid=uuids)
            db_bundle_by_uuid = {bundle.uuid: bundle for bundle in db_bundles}

            # Check both bundles and non-bundles and remove each
            for to_delete in _check_bundle_paths(bundle_paths,
                                                 db_bundle_by_uuid):
                trash_count += 1
                _delete_path(to_delete)
            for to_delete in _check_other_paths(other_paths,
                                                db_bundle_by_uuid):
                trash_count += 1
                _delete_path(to_delete)

            # Check for each bundle if we need to compute its data_hash
            print('Checking data_hash of bundles in partition %s...' %
                  partition,
                  file=sys.stderr)
            for bundle_path in bundle_paths:
                bundle = db_bundle_by_uuid.get(_get_uuid(bundle_path))
                if bundle is None:
                    continue
                if not (compute_data_hash or bundle.data_hash is None):
                    continue
                # A bundle that is a single file is hashed as a directory listing containing only itself.
                dirs_and_files = (path_util.recursive_ls(bundle_path)
                                  if os.path.isdir(bundle_path) else
                                  ([], [bundle_path]))
                data_hash = '0x%s' % path_util.hash_directory(
                    bundle_path, dirs_and_files)
                if bundle.data_hash is None:
                    data_hash_recomputed += 1
                    print(
                        'Giving bundle %s data_hash %s' %
                        (bundle_path, data_hash),
                        file=sys.stderr,
                    )
                    if force:
                        model.update_bundle(bundle, dict(data_hash=data_hash))
                elif compute_data_hash and data_hash != bundle.data_hash:
                    data_hash_recomputed += 1
                    print(
                        'Bundle %s should have data_hash %s, actual digest is %s'
                        % (bundle_path, bundle.data_hash, data_hash),
                        file=sys.stderr,
                    )
                    # A stored hash is only overwritten when explicitly requested.
                    if repair_hashes and force:
                        model.update_bundle(bundle, dict(data_hash=data_hash))

        if force:
            print('\tDeleted %d objects from the bundle store' % trash_count,
                  file=sys.stderr)
            print('\tRecomputed data_hash for %d bundles' %
                  data_hash_recomputed,
                  file=sys.stderr)
        else:
            print(
                'Dry-Run Statistics, re-run with --force to perform updates:',
                file=sys.stderr)
            print('\tObjects marked for deletion: %d' % trash_count,
                  file=sys.stderr)
            print(
                '\tBundles that need data_hash recompute: %d' %
                data_hash_recomputed,
                file=sys.stderr,
            )
示例#10
0
 def __get_num_partitions(self):
     """
     Returns the current number of disks being used by this MultiDiskBundleStore,
     i.e. the number of directories that path_util.ls() reports for self.partitions.
     """
     # path_util.ls() yields a (directories, files) pair; only the directories are counted.
     directories, _ = path_util.ls(self.partitions)
     return len(directories)
示例#11
0
    def rm_partition(self, partition):
        """
        Deletes the given disk from the bundle store, and if it is not the last partition, it redistributes the bundles
        from that partition across the remaining partitions.

        If |partition| is the only partition, an error is printed and nothing is changed. If |partition| fails
        validation, an error is printed and the process exits with status 1.

        Bundles are first copied into the temp subdirectory of their new partition (under a 'stage-' prefix) and
        only then renamed into that partition's data subdirectory; afterwards the old data/temp subdirectories
        and the partition entry are removed.
        """
        if self.__get_num_partitions() == 1:
            # Prevent foot-shooting: never drop the last partition.
            print >> sys.stderr, "Error, cannot remove last partition. If you really wish to delete CodaLab, please run the following command:"
            print >> sys.stderr, "      rm -rf %s" % self.codalab_home
            return

        partition_abs_path = os.path.join(self.partitions, partition)
        old_mdata = os.path.join(partition_abs_path,
                                 MultiDiskBundleStore.DATA_SUBDIRECTORY)
        old_mtemp = os.path.join(partition_abs_path,
                                 MultiDiskBundleStore.TEMP_SUBDIRECTORY)

        try:
            print(partition_abs_path)
            path_util.check_isvalid(partition_abs_path, 'rm-partition')
        except:
            print >> sys.stderr, "Partition with name '%s' does not exist. Run `cl ls-partitions` to see a list of mounted partitions." % partition
            sys.exit(1)

        # Reset the ring to distribute across remaining partitions
        self.ring.remove_node(partition)
        # path_util.ls() yields a (directories, files) pair; both kinds of entry are bundles to move.
        bundle_dirs, bundle_files = path_util.ls(old_mdata)
        bundles_to_move = bundle_dirs + bundle_files

        # Bundle name -> absolute path of the partition the ring now assigns it to.
        relocations = dict()
        for bundle in bundles_to_move:
            new_partition = self.ring.get_node(bundle)
            relocations[bundle] = os.path.join(self.partitions, new_partition)

        # Copy all bundles off of the old partition to temp directories on the new partition.
        # The loop target must not be named `partition`: that would overwrite the argument, and the
        # "Unlinking partition" message below would then show a destination path instead of its name.
        for bundle, new_partition_path in relocations.items():
            temp_dir = os.path.join(new_partition_path,
                                    MultiDiskBundleStore.TEMP_SUBDIRECTORY)
            from_path = os.path.join(old_mdata, bundle)
            to_path = os.path.join(temp_dir, 'stage-%s' % bundle)
            path_util.copy(from_path, to_path)

        # Now that each bundle is on the proper partition, move each from the staging area to the
        # production data subdirectory on its partition.
        for bundle, new_partition_path in relocations.items():
            temp_dir = os.path.join(new_partition_path,
                                    MultiDiskBundleStore.TEMP_SUBDIRECTORY)
            from_path = os.path.join(temp_dir, 'stage-%s' % bundle)
            to_path = os.path.join(new_partition_path,
                                   MultiDiskBundleStore.DATA_SUBDIRECTORY,
                                   bundle)
            path_util.rename(from_path, to_path)

        # Remove data from partition and unlink from CodaLab
        print >> sys.stderr, "Cleaning bundles off of partition..."
        path_util.remove(old_mdata)
        path_util.remove(old_mtemp)
        print >> sys.stderr, "Unlinking partition %s from CodaLab deployment..." % partition
        path_util.remove(partition_abs_path)
        print >> sys.stderr, "Partition removed successfully from bundle store pool"
示例#12
0
    def add_partition(self, target, new_partition_name):
        """
        MultiDiskBundleStore specific method. Add the directory |target| to the bundle store as a partition
        named |new_partition_name|.

        Bundles that the hash ring reassigns once the new node is added are first copied into the temp
        subdirectory on |target|. After |target| has been symlinked into the partitions directory, that temp
        subdirectory is renamed to become the partition's data subdirectory and a new, empty temp subdirectory is
        created. Only then are the original copies of the relocated bundles deleted.

        Exits with status 1 if the temp subdirectory cannot be created on |target|.
        """
        target = os.path.abspath(target)
        partition_link = os.path.join(self.partitions, new_partition_name)
        temp_on_target = os.path.join(target,
                                      MultiDiskBundleStore.TEMP_SUBDIRECTORY)

        try:
            path_util.make_directory(temp_on_target)
        except:
            print >> sys.stderr, "Could not make directory %s on partition %s, aborting" % (
                temp_on_target, target)
            sys.exit(1)

        # Add the node to the partition locations
        self.ring.add_node(new_partition_name)
        # Paths to bundles that will be deleted after the copy finishes successfully
        originals = []

        print >> sys.stderr, "Marking bundles for placement on new partition %s (might take a while)" % new_partition_name
        # Look at every bundle on every existing partition and stage those the ring now places elsewhere.
        partition_names, _ = path_util.ls(self.partitions)
        for partition_name in partition_names:
            partition_data = os.path.join(
                self.partitions, partition_name,
                MultiDiskBundleStore.DATA_SUBDIRECTORY)
            bundle_dirs, bundle_files = path_util.ls(partition_data)
            for bundle in bundle_dirs + bundle_files:
                if self.ring.get_node(bundle) != partition_name:
                    original = os.path.join(partition_data, bundle)
                    staged_copy = os.path.join(temp_on_target, bundle)
                    print >> sys.stderr, "copying %s to %s" % (original,
                                                               staged_copy)
                    path_util.copy(original, staged_copy)
                    originals.append(original)

        print >> sys.stderr, "Adding new partition as %s..." % partition_link
        path_util.soft_link(target, partition_link)

        # Turn the staging area into the new partition's data subdirectory, then recreate the temp subdirectory.
        # NOTE(review): the original comment calls this move atomic -- confirm against path_util.rename.
        data_via_link = os.path.join(partition_link,
                                     MultiDiskBundleStore.DATA_SUBDIRECTORY)
        temp_via_link = os.path.join(partition_link,
                                     MultiDiskBundleStore.TEMP_SUBDIRECTORY)
        path_util.rename(temp_via_link, data_via_link)
        path_util.make_directory(temp_via_link)

        # Go through and purge all of the originals at this time
        print >> sys.stderr, "Cleaning up drives..."
        for original in originals:
            path_util.remove(original)

        print >> sys.stderr, "Successfully added partition '%s' to the pool." % new_partition_name
# Only when the first command-line argument is '-f' are changes actually made; otherwise this is a dry run.
dry_run = not (len(sys.argv) > 1 and sys.argv[1] == '-f')

manager = CodaLabManager()
model = manager.model()

CODALAB_HOME = manager.codalab_home
# Move data/ directory over to a temp area, and create a staging tree for uuid-based storage
DATA_DIR = os.path.join(CODALAB_HOME, 'data')
FINAL_LOCATION = os.path.join(CODALAB_HOME, 'bundles')

if not dry_run:
    path_util.make_directory(FINAL_LOCATION)

# For each data hash, get a list of all bundles that have that hash, and make a copy of the bundle in the staging
# area under the UUID for the bundle.
# path_util.ls() yields a (directories, files) pair; the two lists are concatenated into one list of entries.
data_hashes = reduce(lambda dirs, files: dirs + files, path_util.ls(DATA_DIR))
for data_hash in data_hashes:
    orig_location = os.path.join(DATA_DIR, data_hash)

    bundles_with_hash = model.batch_get_bundles(data_hash=data_hash)
    # We'd prefer renaming bundles to making copies, but because we are converting from deduplicated storage
    # we need to make sure that we only perform renames if we map 1:1 UUID->Hash.
    rename_allowed = len(bundles_with_hash) <= 1
    for bundle in bundles_with_hash:
        # Build the command to be executed in a subshell
        uuid = bundle.uuid
        copy_location = os.path.join(FINAL_LOCATION, uuid)
        command = '%s %s %s' % ('mv' if rename_allowed else 'cp -a',
                                orig_location, copy_location)
        print(command)
        if not dry_run:
示例#14
0
    def health_check(self, model, force=False, compute_data_hash=False, repair_hashes=False):
        """
        MultiDiskBundleStore.health_check(): In the MultiDiskBundleStore, bundle contents are stored on disk, and
        occasionally the disk gets out of sync with the database, in which case we make repairs in the following ways:

            1. Deletes bundles with corresponding UUID not in the database.
            3. Deletes any files not beginning with UUID string.
            4. For each bundle marked READY or FAILED, ensure that its dependencies are not located in the bundle
               directory. If they are then delete the dependencies.
            5. For bundle <UUID> marked READY or FAILED, <UUID>.cid or <UUID>.status, or the <UUID>(-internal).sh files
               should not exist.
        |force|: Perform any destructive operations on the bundle store the health check determines are necessary. False by default
        |compute_data_hash|: If True, compute the data_hash for every single bundle ourselves and see if it's consistent with what's in
                             the database. False by default.
        """
        UUID_REGEX = re.compile(r'^(%s)' % spec_util.UUID_STR)

        def _delete_path(loc):
            cmd = 'rm -r \'%s\'' % loc
            print(cmd)
            if force:
                path_util.remove(loc)

        def _get_uuid(path):
            fname = os.path.basename(path)
            try:
                return UUID_REGEX.match(fname).groups()[0]
            except:
                return None

        def _is_bundle(path):
            """Returns whether the given path is a bundle directory/file"""
            return _get_uuid(path) == os.path.basename(path)

        def _check_bundle_paths(bundle_paths, db_bundle_by_uuid):
            """
            Takes in a list of bundle paths and a mapping of UUID to BundleModel, and returns a list of paths and
            subpaths that need to be removed.
            """
            to_delete = []
            # Batch get information for all bundles stored on-disk

            for bundle_path in bundle_paths:
                uuid = _get_uuid(bundle_path)
                # Screen for bundles stored on disk that are no longer in the database
                bundle = db_bundle_by_uuid.get(uuid, None)
                if bundle == None:
                    to_delete += [bundle_path]
                    continue
                # Delete dependencies stored inside of READY or FAILED bundles
                if bundle.state in [State.READY, State.FAILED]:
                    dep_paths = [
                        os.path.join(bundle_path, dep.child_path) for dep in bundle.dependencies
                    ]
                    to_delete += filter(os.path.exists, dep_paths)
            return to_delete

        def _check_other_paths(other_paths, db_bundle_by_uuid):
            """
            Given a list of non-bundle paths, and a mapping of UUID to BundleModel, returns a list of paths to delete.
            """
            to_delete = []
            for path in other_paths:
                uuid = _get_uuid(path)
                bundle = db_bundle_by_uuid.get(uuid, None)
                if bundle == None:
                    to_delete += [path]
                    continue
                ends_with_ext = (
                    path.endswith('.cid') or path.endswith('.status') or path.endswith('.sh')
                )
                if bundle.state in [State.READY, State.FAILED]:
                    if ends_with_ext:
                        to_delete += [path]
                        continue
                    elif '.' in path:
                        print >>sys.stderr, 'WARNING: File %s is likely junk.' % path
            return to_delete

        partitions, _ = path_util.ls(self.partitions)
        trash_count = 0

        for partition in partitions:
            print >>sys.stderr, 'Looking for trash in partition %s...' % partition
            partition_path = os.path.join(
                self.partitions, partition, MultiDiskBundleStore.DATA_SUBDIRECTORY
            )
            entries = map(
                lambda f: os.path.join(partition_path, f),
                reduce(lambda d, f: d + f, path_util.ls(partition_path)),
            )
            bundle_paths = filter(_is_bundle, entries)
            other_paths = set(entries) - set(bundle_paths)

            uuids = map(_get_uuid, bundle_paths)
            db_bundles = model.batch_get_bundles(uuid=uuids)
            db_bundle_by_uuid = dict()
            for bundle in db_bundles:
                db_bundle_by_uuid[bundle.uuid] = bundle

            # Check both bundles and non-bundles and remove each
            for to_delete in _check_bundle_paths(bundle_paths, db_bundle_by_uuid):
                trash_count += 1
                _delete_path(to_delete)
            for to_delete in _check_other_paths(other_paths, db_bundle_by_uuid):
                trash_count += 1
                _delete_path(to_delete)

            # Check for each bundle if we need to compute its data_hash
            data_hash_recomputed = 0

            print >>sys.stderr, 'Checking data_hash of bundles in partition %s...' % partition
            for bundle_path in bundle_paths:
                uuid = _get_uuid(bundle_path)
                bundle = db_bundle_by_uuid.get(uuid, None)
                if bundle == None:
                    continue
                if compute_data_hash or bundle.data_hash == None:
                    dirs_and_files = (
                        path_util.recursive_ls(bundle_path)
                        if os.path.isdir(bundle_path)
                        else ([], [bundle_path])
                    )
                    data_hash = '0x%s' % path_util.hash_directory(bundle_path, dirs_and_files)
                    if bundle.data_hash == None:
                        data_hash_recomputed += 1
                        print >>sys.stderr, 'Giving bundle %s data_hash %s' % (
                            bundle_path,
                            data_hash,
                        )
                        if force:
                            db_update = dict(data_hash=data_hash)
                            model.update_bundle(bundle, db_update)
                    elif compute_data_hash and data_hash != bundle.data_hash:
                        data_hash_recomputed += 1
                        print >>sys.stderr, 'Bundle %s should have data_hash %s, actual digest is %s' % (
                            bundle_path,
                            bundle.data_hash,
                            data_hash,
                        )
                        if repair_hashes and force:
                            db_update = dict(data_hash=data_hash)
                            model.update_bundle(bundle, db_update)

        if force:
            print >>sys.stderr, '\tDeleted %d objects from the bundle store' % trash_count
            print >>sys.stderr, '\tRecomputed data_hash for %d bundles' % data_hash_recomputed
        else:
            print >>sys.stderr, 'Dry-Run Statistics, re-run with --force to perform updates:'
            print >>sys.stderr, '\tObjects marked for deletion: %d' % trash_count
            print >>sys.stderr, '\tBundles that need data_hash recompute: %d' % data_hash_recomputed
示例#15
0
 def __get_num_partitions(self):
     """
     Returns the current number of disks being used by this MultiDiskBundleStore.
     This is calculated as the number of directories in self.partitions
     """
     # path_util.ls yields a pair; its first element is the directory listing and the
     # second element is not needed for the count.
     directories, _ = path_util.ls(self.partitions)
     return len(directories)
示例#16
0
 def refresh_partitions(self):
     """
     Re-list self.partitions and store the result on self.nodes.

     Only the first element of the pair returned by path_util.ls is kept
     (presumably the directory names -- confirm against path_util.ls).
     """
     listing = path_util.ls(self.partitions)
     directories, _ = listing
     self.nodes = directories
manager = CodaLabManager()
model = manager.model()

CODALAB_HOME = manager.codalab_home

# Source of the migration (data/, whose entries are named by data hash) and its destination
# (bundles/, whose entries are named by bundle UUID).
DATA_DIR = os.path.join(CODALAB_HOME, 'data')
FINAL_LOCATION = os.path.join(CODALAB_HOME, 'bundles')

if not dry_run:
    path_util.make_directory(FINAL_LOCATION)

# For each data hash, get a list of all bundles that have that hash, and place the data under
# FINAL_LOCATION using the UUID of each bundle as the entry name.
listed_dirs, listed_files = path_util.ls(DATA_DIR)
data_hashes = listed_dirs + listed_files
for data_hash in data_hashes:
    orig_location = os.path.join(DATA_DIR, data_hash)

    bundles_with_hash = model.batch_get_bundles(data_hash=data_hash)
    # We'd prefer renaming bundles to making copies, but because we are converting from deduplicated storage
    # we need to make sure that we only perform renames if we map 1:1 UUID->Hash.
    rename_allowed = len(bundles_with_hash) <= 1
    for bundle in bundles_with_hash:
        uuid = bundle.uuid
        copy_location = os.path.join(FINAL_LOCATION, uuid)
        # Build the argument vector directly instead of formatting a shell string and re-splitting it,
        # so that a path containing whitespace or quotes stays a single argument.
        argv = (['mv'] if rename_allowed else ['cp', '-a']) + [orig_location, copy_location]
        command = ' '.join(argv)
        # The parenthesised form prints the same text under Python 2 and is also valid Python 3.
        print(command)
        if not dry_run:
            exec_str = argv
示例#18
0
 def ls(self, target):
   """Return path_util.ls() of the path that self.get_target_path() resolves for target."""
   return path_util.ls(self.get_target_path(target))