def run(self,
        repo_uris: Dict[str, str] = None,
        retain: int = None,
        **kwargs: Any) -> Union[Version, None]:
    """
    Prune old tagged images from each repository, keeping only the
    `retain` newest semver-tagged versions.

    Args:
        - repo_uris (Dict[str, str]): mapping of name -> repository URI
        - retain (int): number of newest versions to keep per repository
        - **kwargs (Any): unused

    Returns:
        - None
    """
    repo_infos = {name: parse_repo(uri) for name, uri in repo_uris.items()}
    repos = {
        name: Repository(namespace=repo_info.namespace,
                         repository=repo_info.repository)
        for name, repo_info in repo_infos.items()
    }
    # Prune on the remote engine when a remote_name is given, else locally.
    repos_to_prune = {
        name: (repos[name] if not repo_info.remote_name else
               Repository.from_template(
                   repos[name], engine=get_engine(repo_info.remote_name)))
        for name, repo_info in repo_infos.items()
    }

    for name, repo_info in repo_infos.items():
        repo = repos_to_prune[name]
        prerelease = repo_info.prerelease
        image_tags = repo.get_all_hashes_tags()
        # Reverse the (hash, tag) pairs into tag -> hash.
        tag_dict = {
            tag: image_hash
            for image_hash, tag in image_tags if image_hash
        }
        # Longest tags first so the most specific tags are parsed first.
        version_list = [
            parse_tag(tag)
            for tag in sorted(tag_dict.keys(), key=len, reverse=True)
        ]
        valid_versions = [version for version in version_list if version]
        non_prerelease_versions = [
            version for version in valid_versions
            if len(version.prerelease) == 0
        ]
        prerelease_versions = [
            version for version in valid_versions
            if prerelease and len(version.prerelease) > 0
            and version.prerelease[0] == prerelease
        ]
        prune_candidates = (prerelease_versions
                            if prerelease else non_prerelease_versions)
        # BUG FIX: clamp at zero. Previously, when `retain` exceeded the
        # number of candidates, `prune_count` went negative and the slice
        # `[:prune_count]` pruned versions that should have been retained.
        prune_count = max(0, len(prune_candidates) - retain)
        # Oldest versions first; prune from the front.
        prune_list = sorted(prune_candidates)[:prune_count]
        for version in prune_list:
            tag = str(version)
            image_hash = tag_dict[tag]
            image = repo.images[image_hash]
            image.delete_tag(tag)
def exists(self, location: str, **kwargs: Any) -> bool:
    """
    Checks whether the target result exists in the file system.

    Does not validate whether the result is `valid`, only that it is present.

    Args:
        - location (str): Location of the result in the specific result target.
            Will check whether the provided location exists
        - **kwargs (Any): string format arguments for `location`

    Returns:
        - bool: whether or not the target result exists
    """
    try:
        # FIX: apply the documented `**kwargs` formatting to `location`
        # before parsing it (the original only formatted it for a stray
        # S3 call and parsed the raw template).
        repo_info = parse_repo(location.format(**kwargs))
        repo = Repository(namespace=repo_info.namespace,
                          repository=repo_info.repository)
        remote = Repository.from_template(
            repo,
            engine=get_engine(repo_info.remote_name, autocommit=True))
        # BUG FIX: return the existence check. The original discarded this
        # result and returned `self.client.get_object(...)` — a copy-paste
        # leftover from an S3 result handler — contradicting the documented
        # bool contract.
        return table_exists_at(remote, repo_info.table)
    except Exception as exc:
        self.logger.exception(
            "Unexpected error while reading from Splitgraph: {}".format(
                repr(exc)))
        raise
def run(self,
        workspaces: Dict[str, Workspace] = None,
        sgr_tags: Dict[str, List[str]] = None,
        **kwargs: Any):
    """
    Tag repositories whose HEAD moved past the workspace snapshot, then
    push every repository with a configured remote.

    Args:
        - workspaces (Dict[str, Workspace]): name -> workspace dict holding
            'repo_uri' and the snapshot 'image_hash'
        - sgr_tags (Dict[str, List[str]]): optional name -> tags to apply to
            repos that produced a new image
        - **kwargs (Any): unused

    Returns:
        - Dict[str, str]: name -> repo_uri for repos that had new images
    """
    repo_infos = {
        name: parse_repo(workspace['repo_uri'])
        for name, workspace in workspaces.items()
    }
    repos = {
        name: Repository(namespace=repo_info.namespace,
                         repository=repo_info.repository)
        for name, repo_info in repo_infos.items()
    }
    # A repo has a "new image" when its HEAD no longer matches the hash
    # recorded in the workspace snapshot.
    repos_with_new_images = {
        name: repo
        for name, repo in repos.items()
        if repo.head and repo.head.image_hash != workspaces[name]['image_hash']
    }
    for name, repo in repos_with_new_images.items():
        repo_tags = sgr_tags[name] if sgr_tags and name in sgr_tags else []
        for tag in repo_tags:
            repo.head.tag(tag)
    # Push all repos. We don't know for sure that it shouldn't be pushed
    for name, repo in repos.items():
        remote_name = repo_infos[name].remote_name
        if not remote_name:
            # FIX: Logger.warn is deprecated; use Logger.warning.
            self.logger.warning(
                f'No remote_name specified. Not pushing {name}.')
            continue
        remote = Repository.from_template(repo,
                                          engine=get_engine(remote_name))
        repo.push(
            remote,
            handler="S3",
            handler_options={"threads": 8},
            overwrite_objects=True,
            overwrite_tags=True,
        )
        self.logger.info(f'Pushed {name} to {remote_name}')
    # Only the repos that actually produced new images are reported back.
    tagged_repo_uris = {
        name: workspaces[name]['repo_uri']
        for name in repos_with_new_images
    }
    return tagged_repo_uris
def run(self,
        workspaces: Dict[str, Workspace] = None,
        comment: str = None,
        **kwargs: Any):
    """
    Commit pending changes in every workspace repository, discarding any
    commit whose contents are identical to the snapshot image.

    Args:
        - workspaces (Dict[str, Workspace]): name -> workspace dict holding
            'repo_uri' and the snapshot 'image_hash'
        - comment (str): commit message applied to each commit
        - **kwargs (Any): unused

    Returns:
        - Dict[str, str]: name -> repo_uri for repos with real changes
    """
    self.logger.info(f'Commit will eval: {workspaces}')
    engine = get_engine()
    repo_infos = {}
    for name, workspace in workspaces.items():
        repo_infos[name] = parse_repo(workspace['repo_uri'])
    repos = {}
    for name, info in repo_infos.items():
        repos[name] = Repository(namespace=info.namespace,
                                 repository=info.repository)
    repos_with_changes = dict()
    for name, repo in repos.items():
        snapshot_hash = workspaces[name]['image_hash']
        fresh_image = repo.commit(comment=comment,
                                  chunk_size=self.chunk_size)
        # An empty commit is deleted again; only real changes survive.
        if self.image_contents_equal(repo.images[snapshot_hash],
                                     fresh_image):
            repo.images.delete([fresh_image.image_hash])
        else:
            repos_with_changes[name] = repo
        self.logger.info(f'Commit complete: {name}')
    self.logger.info(f'Commit now done')
    committed_repo_uris = {}
    for name in repos_with_changes:
        committed_repo_uris[name] = workspaces[name]['repo_uri']
    return committed_repo_uris
def init_repo(self, repo_info: RepoInfo) -> Repository:
    """
    Ensure a local repository exists for `repo_info`, syncing from the
    remote when one is configured.

    Args:
        - repo_info (RepoInfo): parsed repository URI information

    Returns:
        - Repository: the (possibly freshly initialized) local repository
    """
    repo = Repository(namespace=repo_info.namespace,
                      repository=repo_info.repository)
    if not repository_exists(repo):
        self.logger.info("Creating repo {}/{}...".format(
            repo.namespace, repo.repository))
        repo.init()
    if repo_info.remote_name:
        remote = Repository.from_template(
            repo, engine=get_engine(repo_info.remote_name))
        # FIX: dropped the unused `cloned_repo` binding — clone() is called
        # purely for its side effect of syncing the local repository.
        clone(
            remote,
            local_repository=repo,
            download_all=False,
            overwrite_objects=True,
            overwrite_tags=True,
        )
    return repo
def read(self, location: str) -> Result:
    """
    Read a result from a Splitgraph repository into a new Result.

    Args:
        - location (str): repository URI of the result to read

    Returns:
        - Result: a copy of this handler with `value` set to the loaded data

    Raises:
        - SchemaValidationError: if a schema is configured and validation fails
    """
    new = self.copy()
    new.location = location
    try:
        repo = Repository(namespace=new.repo_info.namespace,
                          repository=new.repo_info.repository)
        remote = Repository.from_template(
            repo,
            engine=get_engine(new.repo_info.remote_name, autocommit=True))
        cloned_repo = clone(
            remote,
            local_repository=repo,
            download_all=True,
            overwrite_objects=True,
            overwrite_tags=True,
            single_image=new.repo_info.tag,
        )
        data = sql_to_df(f"SELECT * FROM {new.repo_info.table}",
                         repository=cloned_repo,
                         use_lq=self.layer_query)
        if self.schema is not None:
            errors = self.schema.validate(data)
            if errors:
                raise SchemaValidationError(errors)
        new.value = data
    except Exception as exc:
        self.logger.exception(
            "Unexpected error while reading from result handler: {}".
            format(repr(exc)))
        # FIX: bare `raise` re-raises the active exception with its
        # original traceback intact (instead of `raise exc`).
        raise
    return new
def write(self, value_: Any, **kwargs: Any) -> Result:
    """
    Writes the result to a repository on Splitgraph

    Args:
        - value_ (Any): the value to write; will then be stored as the `value` attribute
            of the returned `Result` instance
        - **kwargs (optional): if provided, will be used to format the `table`, `comment`, and `tag`

    Returns:
        - Result: returns a new `Result` with both `value`, `comment`, `table`, and `tag` attributes

    Raises:
        - SchemaValidationError: if a schema is configured and validation fails
        - TypeError: if `value_` is not a pandas DataFrame
    """
    if self.schema is not None:
        errors = self.schema.validate(value_)
        if errors:
            raise SchemaValidationError(errors)
    new = self.format(**kwargs)
    new.value = value_
    repo_info = parse_repo(new.location)
    repo = Repository(namespace=repo_info.namespace,
                      repository=repo_info.repository)
    remote = Repository.from_template(
        repo, engine=get_engine(repo_info.remote_name, autocommit=True))
    # FIX: input validation via `assert` is stripped under `python -O`;
    # raise an explicit TypeError instead.
    if not isinstance(value_, pd.DataFrame):
        raise TypeError(
            "value_ must be a pandas DataFrame, got {}".format(
                type(value_).__name__))
    if not repository_exists(repo) and self.auto_init_repo:
        self.logger.info("Creating repo {}/{}...".format(
            repo.namespace, repo.repository))
        repo.init()
    # TODO: Retrieve the repo from bedrock first
    self.logger.info("Starting to upload result to {}...".format(
        new.location))
    with self.atomic(repo.engine):
        self.logger.info("checkout")
        img = repo.head
        img.checkout(force=True)
        self.logger.info("df to table")
        df_to_table(new.value,
                    repository=repo,
                    table=repo_info.table,
                    if_exists='replace')
        self.logger.info("commit")
        new_img = repo.commit(comment=new.comment, chunk_size=10000)
        new_img.tag(repo_info.tag)
    if self.auto_push:
        self.logger.info("push")
        repo.push(
            remote,
            handler="S3",
            overwrite_objects=True,
            overwrite_tags=True,
            reupload_objects=True,
        )
    self.logger.info("Finished uploading result to {}...".format(
        new.location))
    return new
def get_upstream(self, repository: Repository):
    """Return the counterpart of `repository` on the 'bedrock' engine."""
    bedrock_engine = get_engine('bedrock', autocommit=True)
    return Repository.from_template(repository, engine=bedrock_engine)