import ctypes
import hashlib
import os
import pickle
import shutil

# Project-local helpers used below: path/listing utilities, the snapshot
# database wrapper, and binary delta support. The exact module layout is
# an assumption; adjust the import paths to match the actual project.
import bindifflib
import ssdb
import utils
from exceptions import (CleanDirectoryException, DetachedHeadException,
                        DirtyDirectoryException, InvalidHashException)

class Repository:

    """Manages a repository instance

    Directory structure:
    +--.pbb/
    |  +--objects/
    |     +--<first 2 hash chars>/
    |        <remaining 38 hash chars>
    |  +--refs/
    |     +--heads/
    |        master
    |  objhashcache
    |  HEAD
    |  snapshots
    """

    DEFAULT_BRANCH = 'master'
    REPO_DIR = '.pbb'
    DIRS = {
        'top': REPO_DIR,
        'objects': utils.posixjoin(REPO_DIR, 'objects'),
        'refs': utils.posixjoin(REPO_DIR, 'refs'),
        'heads': utils.posixjoin(REPO_DIR, 'refs', 'heads'),
    }
    FILES = {
        'objhashcache': utils.posixjoin(REPO_DIR, 'objhashcache'),
        'head': utils.posixjoin(REPO_DIR, 'HEAD'),
        'snapshots': utils.posixjoin(REPO_DIR, 'snapshots'),
    }

    def __init__(self, root_dir, create=False):
        """Initialize instance variables"""
        self.root_dir = root_dir
        self.create = create

        # Instance variables
        self.objhashcache = {}

        # Validate that a repository exists at the given location
        if not self.validate_repo():
            if self.create:
                self.create_repo()
            else:
                raise ValueError('Repository does not exist or is invalid')

        # Load any existing attributes
        self._load_hashmap()

    def validate_repo(self):
        """Check that the repository structure exists and is valid"""
        def is_dir(rel_path):
            return os.path.isdir(self._join_root(rel_path))

        def is_file(rel_path):
            return os.path.isfile(self._join_root(rel_path))

        return (
            all(map(is_dir, self.DIRS.values())) and
            all(map(is_file, self.FILES.values()))
        )

    def create_repo(self):
        """Create a new repository directory in the root location"""
        # Create all directories
        for rel_dir in self.DIRS.values():
            os.makedirs(self._join_root(rel_dir), exist_ok=True)

        # Make the new version control folder hidden (Windows-specific;
        # skipped on other platforms)
        if os.name == 'nt':
            ctypes.windll.kernel32.SetFileAttributesW(
                self._join_root(self.REPO_DIR), 0x02
            )

        # Create HEAD file and set branch to the default name
        self._set_branch(self.DEFAULT_BRANCH)

        # Create objhashcache file and set as empty
        self._save_objhashcache()

        # Create the snapshots database
        ssdb.execute(
            self._join_root(self.FILES['snapshots']), ssdb.CREATE
        )

    def current_branch(self):
        """Returns the name of the current branch"""
        with open(self._join_root(self.FILES['head']), 'r') as head_file:
            return head_file.read().strip()

    def snapshot(self, message='', user=''):
        """Takes a snapshot of the the current status of the directory"""
        # Recursively build tree structure
        with utils.temp_wd(self.root_dir):
            top_hash = self._create_tree_node('.')

        # Get hash of the current snapshot and if it is detached
        old_hash, detached = self._current_snapshot_hash()

        # Check if any changes were made and if the snapshot should be saved
        if old_hash == top_hash:
            raise CleanDirectoryException('No changes to repository')

        if detached:
            raise DetachedHeadException(
                'Detached HEAD. Snapshot was not saved. '
                'Save as branch to make changes.'
            )

        # Save updated hashcache
        self._save_objhashcache()

        # Update current branch head with new snapshot hash
        self._update_branch_head(top_hash)

        # Insert snapshot data into the snapshot database
        self._insert_snapshot(top_hash, message, user)
        return top_hash

    def create_branch(self, name, snapshot=None, message='', user=''):
        """Creates a new branch with the given name at the given snapshot

        Raises:
          InvalidHashException: If not a single unique hash is found
        """
        if snapshot is None:
            full_hash, _ = self._current_snapshot_hash()
        else:
            full_hash = self._full_hash(snapshot)

        self._update_branch_head(full_hash, name)

        # Insert the branch reference into the database
        self._insert_snapshot(full_hash, message, user, branch=name)

        if snapshot is None:
            # If we are branching from our current directory, automatically
            # check out the branch after creation
            self._set_branch(name)

    def list_snapshots(self):
        """Return a list of sqlite.Row objects for each snapshot"""
        return ssdb.execute(
            self._join_root(self.FILES['snapshots']), ssdb.SELECT,
            row_factory=ssdb.Row, cursor='fetchall'
        )

    def checkout(self, checkout, create=None, force=False, branch=False):
        """Checks out a different snapshot in the repository

        If a string is given for branch parameter, a new branch at the
        checkout location will be created

        Raises:
          InvalidHashException: If not a single unique hash is found
          DirtyDirectoryException: If changes made since last save
        """
        # If branch option was given, attempt to switch to an existing branch
        if branch:
            # Just switch branch instead
            self.switch_branch(checkout, force)
            return

        # Get the full hash to be checked out
        full_hash = self._full_hash(checkout)

        # Raise exception on a dirty directory if no force option
        if not force:
            self._check_dirty()

        # Check if create option was given
        if create is not None:
            # Create a new branch at the checkout location; the
            # following code will then simply check out that branch
            self.create_branch(create, full_hash)
            self.switch_branch(create, force)
        else:
            # Check if the hash matches any current branch
            branch = self._match_branch(full_hash)
            if branch is not None:
                # Just switch branch instead of checking out a detached HEAD
                self.switch_branch(branch, force)
                return
            # If the hash doesn't match a branch, we need to detach the HEAD
            self._set_branch(full_hash)

        # TODO: Switch all files in the directory
        self._update_files()

    def switch_branch(self, name, force=False):
        """Sets the branch to the given name then updates all files

        Raises:
          ValueError: No branch found with given name
          DirtyDirectoryException: If changes made since last save
        """
        # Validate given branch name
        if name not in self.list_branches():
            raise ValueError('No branch found: {}'.format(name))

        # Raise exception on a dirty directory if no force option
        if not force:
            self._check_dirty()

        # Switch to the existing branch
        self._set_branch(name)
        self._update_files()

    def list_branches(self):
        """Returns a list of all existing branch names"""
        return utils.list_files(self._join_root(self.DIRS['heads']))

    def _check_dirty(self):
        """Raises exception if the directory has changes since last save

        Raises:
          DirtyDirectoryException: If changes made since last save
        """
        # Get hash of the directory in its current form
        with utils.temp_wd(self.root_dir):
            dir_hash = self._get_tree_hash('.')

        # Check if any outstanding changes are in the directory
        cur_hash, _ = self._current_snapshot_hash()
        if cur_hash != dir_hash:
            # Changes have been made and we want to warn the user
            raise DirtyDirectoryException(
                'Changes have been made to the directory. '
                'Use force option to overwrite.'
            )

    def _full_hash(self, partial):
        """Returns a unique full snapshot hash from from a partial hash

        Raises:
          InvalidHashException: If not a single unique hash is found
        """
        snapshots = self.list_snapshots()
        matches = utils.get_matches(
            partial, [row['hash'] for row in snapshots],
        )

        if len(matches) < 1:
            raise InvalidHashException(
                'No snapshots found for: {}'.format(partial), matches
            )

        if len(matches) > 1:
            # If multiple matches are all the same, pick the first
            # (All should be the same, just different branch names, which
            #  we don't care about at the moment as we're not checking out)
            if len(set(matches)) == 1:
                # Matches have the same hash
                return matches[0]

            print([(row['hash'], row['branch']) for row in snapshots])
            raise InvalidHashException(
                'No unique match for: {}'.format(partial), matches
            )

        # Get the full matched hash
        return matches[0]

    def _update_files(self):
        """Updates the directory with the files for the current snapshot

        Clears out the entire directory, then rebuilds it from the
        repository. It would be better to only overwrite files that need
        updates and remove files that should no longer be there, but this
        is simple and can be optimized later if it needs better
        performance.
        """
        # Get all files and directories for this level (excluding repo)
        directories = utils.list_directories(self.root_dir, [self.REPO_DIR])
        files = utils.list_files(self.root_dir)

        # Remove all of these files and recursively remove directories
        # Remove files
        for file in files:
            os.remove(self._join_root(file))
        # Remove directories
        for directory in directories:
            shutil.rmtree(self._join_root(directory))

        # Clear the current hashcache; it must be rebuilt along with the files
        self.objhashcache = {}

        # Get hash of the current snapshot
        top_hash, _ = self._current_snapshot_hash()

        self._build_tree(top_hash, self.root_dir)

        # Save the rebuilt hashcache
        self._save_objhashcache()

    def _build_tree(self, node_hash, current_path):
        """Recursive function to rebuild file structure for objects"""
        content = self._read_object(node_hash).decode().rstrip()

        for line in content.split('\n'):
            obj_type, obj_hash, obj_name = self._parse_tree_line(line)
            new_path = os.path.join(current_path, obj_name)

            # Add the new file or directory to the objhashcache
            self.objhashcache[new_path] = obj_hash

            # Process each type of object
            if obj_type == 'tree':
                # Make the directory
                os.makedirs(new_path)
                # Recursively rebuild the subdirectory contents
                self._build_tree(obj_hash, new_path)
            elif obj_type == 'blob':
                # Rebuild the file
                with open(new_path, 'wb') as obj_file:
                    obj_file.write(self._read_object(obj_hash))

    def _parse_tree_line(self, line):
        """Parses each line in a tree object"""
        clean = line.rstrip()
        return clean[:5].rstrip(), clean[5:46].rstrip(), clean[46:].rstrip()

    def _join_root(self, rel_path):
        """Return a joined relative path with the instance root directory"""
        return os.path.join(self.root_dir, rel_path)

    def _set_branch(self, branch_name):
        """Sets the current branch to the given name"""
        with open(self._join_root(self.FILES['head']), 'w') as head_file:
            head_file.write(branch_name)

    def _save_objhashcache(self):
        """Saves the current state of the hashmap"""
        path = self._join_root(self.FILES['objhashcache'])
        with open(path, 'wb') as hash_file:
            pickle.dump(self.objhashcache, hash_file)

    def _load_hashmap(self):
        """Loads a saved hashmap from a file"""
        path = self._join_root(self.FILES['objhashcache'])
        with open(path, 'rb') as hash_file:
            self.objhashcache = pickle.load(hash_file)

    def _current_snapshot_hash(self):
        """Returns the hash of the current snapshot and if it is detached"""
        # Check HEAD for branch name of snapshot address
        branch = self.current_branch()

        # The HEAD value might not be a branch name; it could be a
        # detached snapshot address
        if branch not in self.list_branches():
            # On initialization, a default branch is created with no actual
            # snapshots, meaning we should return filler data
            if branch == self.DEFAULT_BRANCH:
                snapshot_hash = None
                detached = False

            # Branch name is actually detached address
            else:
                snapshot_hash = branch
                detached = True
        else:
            snapshot_hash = self._get_branch_head()
            detached = False

        return (snapshot_hash, detached)

    def _get_branch_head(self, branch=None):
        """Returns the hash for the given branch"""
        if branch is None:
            branch = self.current_branch()

        # Construct path to the reference file
        ref_path = self._join_root(os.path.join(self.DIRS['heads'], branch))
        try:
            with open(ref_path, 'r') as branch_file:
                return branch_file.read().strip()
        except FileNotFoundError:
            # The branch file does not exist yet (new repo)
            return None

    def _update_branch_head(self, new_hash, branch=None):
        """Updates a branch with a new hash address to a head snapshot"""
        if branch is None:
            branch = self.current_branch()

        # Construct path to the reference file
        ref_path = self._join_root(os.path.join(self.DIRS['heads'], branch))

        # Overwrite the reference file with the new hash
        with open(ref_path, 'w') as ref_file:
            ref_file.write(new_hash)

    def _insert_snapshot(self, obj_hash, message='', user='', branch=None):
        """Updates snapshots database with snapshot data"""
        if branch is None:
            branch = self.current_branch()
        data = {
            'hash': obj_hash,
            'branch': branch,
            'message': message,
            'user': user,
        }

        ssdb.execute(
            self._join_root(self.FILES['snapshots']),
            ssdb.INSERT, data, commit=True
        )

    def _create_tree_node(self, directory):
        """Recursive function creates tree nodes for current snapshot"""
        # Validate the given root directory
        if not os.path.isdir(directory):
            raise ValueError('Not a directory: {}'.format(directory))

        # Get all files & directories for this level (excluding our pbb dir)
        directories = utils.list_directories(directory, [self.REPO_DIR])
        files = utils.list_files(directory)

        node_entries = []

        # Recursively create nodes for subdirectories
        for subdir in directories:
            node_hash = self._create_tree_node(
                utils.posixjoin(directory, subdir)
            )
            node_entries.append('tree {} {}'.format(node_hash, subdir))

        for file in files:
            node_hash = self._create_blob_node(
                utils.posixjoin(directory, file)
            )
            node_entries.append('blob {} {}'.format(node_hash, file))

        # Join node entries into the node content
        node_content = '\n'.join(node_entries) + '\n'

        # Save the node contents to a vc object
        return self._save_node(directory, node_content)

    def _create_blob_node(self, path):
        """Creates nodes for files in the current snapshot"""
        with open(path, 'rb') as input_file:
            node_content = input_file.read()

        # Save the node contents to a vc object
        return self._save_node(path, node_content)

    def _get_tree_hash(self, directory):
        """Recursively generate hashes of nodes for current directory"""
        # Validate the given root directory
        if not os.path.isdir(directory):
            raise ValueError('Not a directory: {}'.format(directory))

        # Get all files & directories for this level (excluding our pbb dir)
        directories = utils.list_directories(directory, [self.REPO_DIR])
        files = utils.list_files(directory)

        node_entries = []

        # Recursively hash subdirectories
        for subdir in directories:
            node_hash = self._get_tree_hash(
                utils.posixjoin(directory, subdir)
            )
            node_entries.append('tree {} {}'.format(node_hash, subdir))

        for file in files:
            node_hash = self._get_blob_hash(
                utils.posixjoin(directory, file)
            )
            node_entries.append('blob {} {}'.format(node_hash, file))

        # Join node entries into the node content
        node_content = '\n'.join(node_entries) + '\n'

        # Get node content hash
        return self._hash_digest(self._byte_convert(node_content))

    def _get_blob_hash(self, path):
        """Get the hash for a given blob file at the path"""
        with open(path, 'rb') as input_file:
            node_content = input_file.read()

        # Convert to bytes if necessary
        bytes_content = self._byte_convert(node_content)

        # Get node content hash
        return self._hash_digest(bytes_content)

    def _save_node(self, path, node_content):
        """Calculates a content hash and saves the content to a file"""
        # Convert to bytes if necessary
        bytes_content = self._byte_convert(node_content)
        # Get node content hash
        digest = self._hash_digest(bytes_content)

        # Parse object directory and filename
        obj_dir = self._join_root(
            os.path.join(self.DIRS['objects'], digest[:2])
        )
        obj_path = os.path.join(obj_dir, digest[2:])

        # Make the directory if it does not exist
        os.makedirs(obj_dir, exist_ok=True)

        # Delta-compress changed files, or keep the original content if
        # there is no prior reference
        final_content = self._delta_compress(path, digest, bytes_content)

        # Return the hash with no further processing if no changes
        if final_content is None:
            return digest

        # Write the final content to the final object file
        with open(obj_path, 'wb') as obj_file:
            obj_file.write(final_content)

        # Update hashmap
        self.objhashcache[path] = digest

        return digest

    def _byte_convert(self, payload):
        """Check that an object is bytes, otherwise attempt to encode"""
        if isinstance(payload, bytes):
            return payload

        # Try to encode if not bytes already
        return payload.encode()

    def _hash_digest(self, payload):
        """Returns a hex digest for the hash of the given payload"""
        hasher = hashlib.sha1()
        hasher.update(payload)
        return hasher.hexdigest()

    def _delta_compress(self, obj_path, obj_hash, obj_content):
        """Compresses a new object file by replacing with a delta

        Returns either a patch to a previous version of this file or returns
        the original content to be written as a new reference.

        NOTE:
          The file name hash will no longer reflect the stored file
          content, but rather the content that the delta reconstructs.
        """

        # Check if the path is in the objhashcache
        if obj_path not in self.objhashcache:
            # Return the uncompressed content
            return obj_content

        # Check if changes were made to the object file
        if self.objhashcache[obj_path] == obj_hash:
            return None

        # Get the hash of the reference version of this file
        ref_hash = self.objhashcache[obj_path]

        # Calculate delta from the reference version to the new version
        patch = bindifflib.diff(
            self._byte_convert(obj_content),
            self._read_object(ref_hash),
        )

        # Format delta contents
        patch_tuple = (ref_hash, patch)
        return pickle.dumps(patch_tuple)

    def _read_object(self, obj_hash):
        """Reads and returns the contents of an object file with given hash

        Recursively rebuilds any necessary files from their deltas
        """
        obj_dir = self._join_root(
            os.path.join(self.DIRS['objects'], obj_hash[:2])
        )
        obj_path = os.path.join(obj_dir, obj_hash[2:])

        # Read the object file content
        with open(obj_path, 'rb') as obj_file:
            content = obj_file.read()

        # Check whether this is a delta by comparing the content hash to
        # the file-name hash
        if obj_hash != self._hash_digest(content):
            # Delta object must be rebuilt
            patch_tuple = pickle.loads(content)
            ref_content = self._read_object(patch_tuple[0])
            return bindifflib.patch(patch_tuple[1], ref_content)

        # If the object is not a delta, simply return its content
        return content

    def _match_branch(self, snapshot_hash):
        """Checks if any current branch matches the given hash"""
        head_dir = self._join_root(self.DIRS['heads'])

        branches = self.list_branches()

        # Read each branch reference looking for a matching hash
        for branch in branches:
            with open(os.path.join(head_dir, branch), 'r') as branch_file:
                branch_hash = branch_file.read().strip()
            if branch_hash == snapshot_hash:
                return branch

        # If no matching hashes were found, return None
        return None
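

# A minimal usage sketch (hypothetical directory and names; assumes the
# project-local modules imported above are available and that 'work_dir'
# is an existing, writable directory):
if __name__ == '__main__':
    # Create a fresh repository and take an initial snapshot
    repo = Repository('work_dir', create=True)
    first = repo.snapshot(message='Initial snapshot', user='alice')

    # Branch from the current snapshot; with no explicit snapshot hash
    # the new branch is checked out automatically
    repo.create_branch('feature', message='Start feature work', user='alice')

    # Later, return to the first snapshot by (partial) hash
    repo.checkout(first[:8])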