def _load_gigantum_data(self) -> None:
    """Method to load the labbook YAML file to a dictionary

    Returns:
        None
    """
    if not self.root_dir:
        raise GigantumException("No root directory assigned to lab book. "
                                "Failed to get root directory.")

    schema_path = os.path.join(self.root_dir, '.gigantum', 'project.yaml')
    old_schema_path = os.path.join(self.root_dir, ".gigantum", "labbook.yaml")

    if os.path.exists(schema_path):
        with open(schema_path, 'rt') as lbfile:
            d = yaml.safe_load(lbfile)
        self._data = d
    elif os.path.exists(old_schema_path):
        # For backward compatibility
        with open(old_schema_path, 'rt') as lbfile:
            d = yaml.safe_load(lbfile)
        # "Virtualize" old schemas into new schemas to support backward compatibility
        self._data = translate_schema(d, self.root_dir)
    else:
        if 'gm.workspace' in self.get_branches()['local']:
            logger.warning("Master branch empty, attempting to load gm.workspace")
            self.checkout_branch('gm.workspace')
            self._load_gigantum_data()
        else:
            raise GigantumException('Cannot find configuration yaml file')

    if self.schema == 2:
        # Make sure the untracked directory exists (it and its contents are ignored)
        os.makedirs(os.path.join(self.root_dir, 'output', 'untracked'), exist_ok=True)
def checkout_branch(self, branch_name: str, new: bool = False) -> None:
    """Checkout a Git branch, optionally creating a new branch locally.

    Args:
        branch_name(str): Name of branch to checkout or create
        new(bool): Indicates this branch should be created.

    Returns:
        None
    """
    if not self.is_repo_clean:
        raise GigantumException(f"Cannot checkout {branch_name}: Untracked and/or uncommitted changes")

    try:
        if new:
            logger.info(f"Creating a new branch {branch_name}...")
            self.git.create_branch(branch_name)
        logger.info(f"Checking out branch {branch_name}...")
        self.git.checkout(branch_name=branch_name)

        # Clear out checkout context
        if self._root_dir and os.path.exists(os.path.join(self._root_dir, ".gigantum", ".checkout")):
            os.remove(os.path.join(self._root_dir, ".gigantum", ".checkout"))
        self._checkout_id = None
    except ValueError as e:
        logger.error(f"Cannot checkout branch {branch_name}: {e}")
        raise GigantumException(e)
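# Illustrative sketch (assumption, not from the source): checkout_branch() refuses to run on a
# dirty working tree (is_repo_clean must be True), so callers commit or discard changes first.
# `repo` stands in for an instance of the class defining checkout_branch(); the branch name is
# a placeholder.
def _example_switch_branch(repo):
    repo.checkout_branch("feature/plots", new=True)   # create the branch locally, then check it out
    repo.checkout_branch("master")                    # plain checkout of an existing branch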
def _clone(remote_url: str, working_dir: str) -> str:

    clone_tokens = f"git clone {remote_url}".split()
    call_subprocess(clone_tokens, cwd=working_dir)

    # Affirm there is only one directory created
    dirs = os.listdir(working_dir)
    if len(dirs) != 1:
        raise GigantumException('Git clone produced extra directories')

    p = os.path.join(working_dir, dirs[0])
    if not os.path.exists(p):
        raise GigantumException('Could not find expected path of repo after clone')

    try:
        # This is for backward compatibility -- old projects will clone to
        # branch "gm.workspace" by default -- even if it has already been migrated.
        # This will therefore set the user to the proper branch if the project has been
        # migrated, and will have no effect if it hasn't.
        r = call_subprocess("git checkout master".split(), cwd=p)
    except Exception as e:
        logger.error(e)

    return p
def _start_dev_tool(cls, labbook: LabBook, username: str, dev_tool: str,
                    container_override_id: str = None):
    router = ProxyRouter.get_proxy(labbook.client_config.config['proxy'])
    bam = BundledAppManager(labbook)
    bundled_apps = bam.get_bundled_apps()
    bundled_app_names = [x for x in bundled_apps]

    if dev_tool == "rstudio":
        suffix = cls._start_rstudio(labbook, router, username)
    elif dev_tool in ["jupyterlab", "notebook"]:
        # Note that starting the dev tool is identical whether we're targeting jupyterlab or notebook
        suffix = cls._start_jupyter_tool(labbook, router, username, container_override_id)
    elif dev_tool in bundled_app_names:
        app_data = bundled_apps[dev_tool]
        app_data['name'] = dev_tool
        suffix = cls._start_bundled_app(labbook, router, username, app_data, container_override_id)
    else:
        raise GigantumException(f"'{dev_tool}' not currently supported as a Dev Tool")

    # Don't include the port in the path if running on 80
    apparent_proxy_port = labbook.client_config.config['proxy']["apparent_proxy_port"]
    if apparent_proxy_port == 80:
        path = suffix
    else:
        path = f':{apparent_proxy_port}{suffix}'

    return path
def start_rserver(labbook: LabBook, username: str, tag: Optional[str] = None,
                  check_reachable: bool = True) -> None:
    """Main entrypoint to launch rstudio-server. Note, the caller must
    determine for themselves the host and port.

    Raises an exception if there's a problem.

    Returns:
        None
    """
    owner = InventoryManager().query_owner(labbook)
    lb_key = tag or infer_docker_image_name(labbook_name=labbook.name,
                                            owner=owner,
                                            username=username)
    docker_client = get_docker_client()
    lb_container = docker_client.containers.get(lb_key)
    if lb_container.status != 'running':
        raise GigantumException(f"{str(labbook)} container is not running")

    rserver_ps = ps_search(lb_container, 'rserver')

    if len(rserver_ps) == 1:
        # we have an existing rstudio-server instance
        return
    elif len(rserver_ps) == 0:
        _start_rserver_process(lb_container)
    else:
        # If "ps aux" for rserver returns multiple hits - this should never happen.
        for n, l in enumerate(rserver_ps):
            logger.error(f'Multiple RStudio-Server instances - ({n+1} of {len(rserver_ps)}) - {l}')
        raise ValueError(f'Multiple ({len(rserver_ps)}) RStudio Server instances detected')
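# Illustrative sketch (assumption, not from the source): start_rserver() is effectively
# idempotent -- it returns silently if an rserver process already runs in the Project container,
# starts one if none is found, and raises only when the container is down or in a conflicting
# state. The username value is a placeholder.
def _example_start_rstudio_server(labbook: LabBook) -> None:
    start_rserver(labbook, username="alice")   # no return value; raises on failure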
def _start_dev_tool(cls, lb: LabBook, username: str, dev_tool: str,
                    container_override_id: str = None):
    pr = ProxyRouter.get_proxy(lb.client_config.config['proxy'])

    if dev_tool == "rstudio":
        suffix = cls._start_rstudio(lb, pr, username)
    elif dev_tool in ["jupyterlab", "notebook"]:
        # Note that starting the dev tool is identical whether we're targeting jupyterlab or notebook
        suffix = cls._start_jupyter_tool(lb, pr, username, container_override_id)
    else:
        raise GigantumException(f"'{dev_tool}' not currently supported as a Dev Tool")

    # Don't include the port in the path if running on 80
    apparent_proxy_port = lb.client_config.config['proxy']["apparent_proxy_port"]
    if apparent_proxy_port == 80:
        path = suffix
    else:
        path = f':{apparent_proxy_port}{suffix}'

    return path
def start_bundled_app(labbook: LabBook, username: str, command: str, tag: Optional[str] = None) -> None:
    """Method to start a bundled app by running the user specified command inside the running
    Project container

    Args:
        labbook: labbook instance
        username: current logged in user
        command: user specified command to run
        tag: optional tag for the container override id

    Returns:
        None
    """
    if len(command) == 0:
        return

    owner = InventoryManager().query_owner(labbook)
    lb_key = tag or infer_docker_image_name(labbook_name=labbook.name,
                                            owner=owner,
                                            username=username)
    docker_client = get_docker_client()
    lb_container = docker_client.containers.get(lb_key)
    if lb_container.status != 'running':
        raise GigantumException(f"{str(labbook)} container is not running. Start it before starting a bundled app.")

    lb_container.exec_run(f'sh -c "{command}"', detach=True, user='******')
def start_mitm_proxy(cls, lb_endpoint: str, key: str) -> str:
    """Launch a proxy container between client and labbook.

    Args:
        lb_endpoint: the specific target running a dev tool
        key: a unique key for this instance (related to the monitored Project container - e.g., RStudio)

    Returns:
        str that contains the proxy endpoint as http://{ip}:{port}
    """
    # setup the environment - note that UID is obtained inside the container based on labmanager_share_vol
    # (mounted at /mnt/share)
    env_var = [f"LBENDPOINT={lb_endpoint}", f"PROXYID={key}"]
    nametag = f"gmitmproxy.{key}"
    volumes_dict = {
        'labmanager_share_vol': {'bind': '/mnt/share', 'mode': 'rw'}
    }

    docker_client = get_docker_client()

    container = docker_client.containers.run("gigantum/mitmproxy_proxy:" + CURRENT_MITMPROXY_TAG,
                                             detach=True,
                                             init=True,
                                             name=nametag,
                                             volumes=volumes_dict,
                                             environment=env_var)

    # For now, we hammer repeatedly for 5 seconds
    # Plan for a better solution is mentioned in #434
    for _ in range(50):
        time.sleep(.1)
        # Hope that our container is actually up and reload
        container.reload()
        container_ip = container.attrs['NetworkSettings']['Networks']['bridge']['IPAddress']
        if container_ip:
            break

    if not container_ip:
        raise GigantumException("Unable to get mitmproxy_proxy IP address.")

    mitm_endpoint = f'http://{container_ip}:8079'

    # register the proxy in KV store
    redis_conn = redis.Redis(db=1)
    redis_conn.set(f"{lb_endpoint}-mitm-endpoint", mitm_endpoint)
    redis_conn.set(f"{lb_endpoint}-mitm-container_id", container.id)
    redis_conn.set(f"{lb_endpoint}-mitm-key", key)

    # make sure proxy is up.
    for timeout in range(10):
        time.sleep(1)
        ec, new_ps_list = container.exec_run(
            f'sh -c "ps aux | grep nginx | grep -v \' grep \'"')
        new_ps_list = new_ps_list.decode().split('\n')
        if any('nginx' in l for l in new_ps_list):
            logger.info(f"Proxy to rserver started within {timeout + 1} seconds")
            break
    else:
        raise ValueError('mitmproxy failed to start after 10 seconds')

    return mitm_endpoint
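# Illustrative sketch (assumption, not from the source): reading back the proxy registration that
# the start_mitm_proxy() variant above writes to Redis db=1 as plain string keys. Only the key
# names mirror the code above; the lookup helper itself is hypothetical.
def _example_lookup_mitm_endpoint(lb_endpoint: str) -> str:
    redis_conn = redis.Redis(db=1)
    endpoint = redis_conn.get(f"{lb_endpoint}-mitm-endpoint")
    return endpoint.decode() if endpoint else ""   # e.g. "http://172.17.0.5:8079"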
def modify_tag_visibility(self, tag: str, show: str):
    """Modify all detail objects with matching tag to have visibility specified in show"""
    if not self._in_modify:
        raise GigantumException("Attempt to use ActivityRecord.modify_tag_visibility() outside of "
                                "ActivityRecord.inspect_detail_objects()")

    # We'll actually do the modifications in one pass when we exit the with-context
    self._tags_to_update[tag] = show
def remote(self) -> Optional[str]:
    """Return the URL of the first configured remote, or None if no remote is set."""
    try:
        r = self.git.list_remotes()
        if r:
            return r[0]['url']
        else:
            return None
    except Exception as e:
        logger.exception(e)
        raise GigantumException(e)
def remove_remote(self, remote_name: Optional[str] = "origin") -> None:
    """Remove a remote from the git config

    Args:
        remote_name: Optional name of remote (default "origin")
    """
    try:
        logger.info(f"Removing remote {remote_name} from {str(self)}")
        self.git.remove_remote(remote_name)
    except Exception as e:
        raise GigantumException(e)
def has_remote(self):
    """Return True if the Repository has a remote that it can push/pull to/from

    Returns:
        bool indicating whether a remote is set.
    """
    try:
        return len(self.git.list_remotes()) > 0
    except Exception as e:
        logger.exception(e)
        raise GigantumException(e)
def add_remote(self, remote_name: str, url: str) -> None:
    """Add a new git remote

    Args:
        remote_name: Name of remote, e.g., "origin"
        url: Path to remote Git repository.
    """
    try:
        logger.info(f"Adding new remote {remote_name} at {url}")
        self.git.add_remote(remote_name, url)
        self.git.fetch(remote=remote_name)
    except Exception as e:
        raise GigantumException(e)
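# Illustrative sketch (assumption, not from the source): composing the remote-management helpers
# defined above. `repo` stands in for an instance of the repository class; the URL is a
# placeholder, and `remote` is assumed to be exposed as a property (its decorator is not shown
# in this listing).
def _example_remote_roundtrip(repo) -> None:
    repo.add_remote("origin", "https://example.com/namespace/my-project.git")
    if repo.has_remote():
        logger.info(f"Remote URL: {repo.remote}")
    repo.remove_remote("origin")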
def get_storage_backend(storage_type: str) -> Union[ManagedStorageBackend, UnmanagedStorageBackend]:
    """Load and instantiate the storage backend class registered for the given storage type.

    Args:
        storage_type(str): Identifier to load class

    Returns:
        gtmcore.dataset.storage.backend.StorageBackend
    """
    if storage_type in SUPPORTED_STORAGE_BACKENDS.keys():
        module, package = SUPPORTED_STORAGE_BACKENDS.get(storage_type)  # type: ignore
        imported = importlib.import_module(module, package)
        class_instance = getattr(imported, package)
        return class_instance()
    else:
        raise GigantumException(f"Unsupported Dataset Storage Type: {storage_type}")
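# Illustrative sketch (assumption, not from the source): get_storage_backend() expects
# SUPPORTED_STORAGE_BACKENDS to map an identifier to a (module, class_name) tuple, where the
# class name is also the attribute fetched from the imported module, and it returns an
# *instance* of that class. The identifier below is hypothetical.
def _example_load_storage_backend():
    backend = get_storage_backend("local_filesystem")   # hypothetical identifier
    return backend                                      # instance of a StorageBackend subclass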
def get_branches(self) -> Dict[str, List[str]]:
    """Return all branches as a Dict of Lists. Dict contains two keys "local" and "remote".

    Args:
        None

    Returns:
        Dictionary of lists for "remote" and "local" branches.
    """
    try:
        # Note - do NOT fetch here - fetch should be done before this is called.
        return self.git.list_branches()
    except Exception as e:
        # Unsure what specific exception list_branches creates, so make a catchall.
        logger.exception(e)
        raise GigantumException(e)
def update_detail_object(self, obj: ActivityDetailRecord, index: int) -> None:
    """Method to update a detail object in place

    Can only be used while in the context of self.inspect_detail_objects

    Args:
        obj: detail record to store at the given index
        index: index to update
    """
    if not self._in_modify:
        raise GigantumException("Attempt to use ActivityRecord.update_detail_object() outside of "
                                "ActivityRecord.inspect_detail_objects()")
    if index < 0 or index >= len(self._detail_objects):
        raise ValueError("Index out of range when updating detail object")

    self._detail_objects[index] = (obj.show, obj.type.value, obj.importance, obj)
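# Illustrative sketch (assumption, not from the source): both modify_tag_visibility() and
# update_detail_object() guard on self._in_modify, so they are meant to be called from inside
# ActivityRecord.inspect_detail_objects(). The exact shape of what that context yields, and the
# mutability of the detail record's `show` attribute, are assumptions here.
def _example_rewrite_details(record):
    with record.inspect_detail_objects() as details:      # assumed usable as a with-context
        for i, detail in enumerate(details):
            detail.show = True                             # assumed mutable attribute
            record.update_detail_object(detail, i)
        record.modify_tag_visibility("hidden", "show")     # applied in one pass on context exit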
def check_jupyter_reachable(ip_address: str, port: int, prefix: str):
    for n in range(20):
        test_url = f'http://{ip_address}:{port}{prefix}/api'
        logger.debug(f"Attempt {n + 1}: Testing if JupyterLab is up at {test_url}...")
        try:
            r = requests.get(test_url, timeout=0.5)
            if r.status_code != 200:
                time.sleep(0.5)
            else:
                if "version" in r.json():
                    logger.info(f'Found JupyterLab up at {test_url} after {n/2.0} seconds')
                    break
                else:
                    time.sleep(0.5)
        except requests.exceptions.ConnectionError:
            # Assume API isn't up at all yet, so no connection can be made
            time.sleep(0.5)
    else:
        raise GigantumException(f'Could not reach JupyterLab at {test_url} after timeout')
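# Illustrative sketch (assumption, not from the source): check_jupyter_reachable() polls the
# Jupyter REST API at {prefix}/api for roughly 10 seconds, returning on success and raising
# GigantumException on timeout. The IP, port, and prefix values below are placeholders.
def _example_wait_for_jupyter() -> None:
    check_jupyter_reachable("172.17.0.2", 8888, "/rt/abc123")   # hypothetical values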
def get_cache_manager_class(config: Configuration) -> Callable:
    """Resolve the dataset file cache manager class named in the client configuration.

    Args:
        config(Configuration): Configuration for the client

    Returns:
        gtmcore.dataset.cache.CacheManager
    """
    dataset_config = config.config.get('datasets')
    if not dataset_config:
        # Fallback to default host manager
        manager_str = 'host'
    else:
        manager_str = dataset_config.get('cache_manager')

    if manager_str in SUPPORTED_CACHE_MANAGERS.keys():
        module, package = SUPPORTED_CACHE_MANAGERS.get(manager_str)  # type: ignore
        imported = importlib.import_module(module, package)
        class_instance = getattr(imported, package)
        return class_instance
    else:
        raise GigantumException(f"Unsupported Dataset File Cache Manager: {manager_str}")
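# Illustrative sketch (assumption, not from the source): unlike get_storage_backend(), this
# factory returns the class itself rather than an instance, so the caller instantiates it.
# Default construction of Configuration() is an assumption here.
def _example_resolve_cache_manager():
    config = Configuration()                        # assumed default construction
    manager_class = get_cache_manager_class(config)
    return manager_class                            # instantiate per that manager's constructor (not shown above)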
def start_mitm_proxy(cls, devtool_endpoint: str, target_key: str) -> str:
    """Launch a proxy container between client and labbook.

    Args:
        devtool_endpoint: the specific target running a dev tool
        target_key: a unique key for this instance (related to the monitored Project container - e.g., RStudio)

    Returns:
        str that contains the proxy endpoint as http://{ip}:{port}
    """
    hkey = cls.get_mitm_redis_key(target_key)

    # setup the environment - note that UID is obtained inside the container based on labmanager_share_vol
    # (mounted at /mnt/share)
    logfile_path = f'/mnt/share/{cls.logfile_dir}/{target_key}.rserver.dump'
    env_var = [f"LBENDPOINT={devtool_endpoint}", f"LOGFILE_NAME={logfile_path}"]
    nametag = f"gmitmproxy.{target_key}"
    volumes_dict = {
        'labmanager_share_vol': {'bind': '/mnt/share', 'mode': 'rw'}
    }

    docker_client = get_docker_client()

    container = docker_client.containers.run("gigantum/mitmproxy_proxy:" + CURRENT_MITMPROXY_TAG,
                                             detach=True,
                                             init=True,
                                             name=nametag,
                                             volumes=volumes_dict,
                                             environment=env_var)

    # We hammer repeatedly for 5 seconds (this should be very fast since it's a small, simple container)
    for _ in range(10):
        time.sleep(.5)
        # Hope that our container is actually up and reload
        container.reload()
        mitm_ip = container.attrs['NetworkSettings']['Networks']['bridge']['IPAddress']
        if mitm_ip:
            break

    if not mitm_ip:
        raise GigantumException("Unable to get mitmproxy_proxy IP address.")

    # This is the port for NGINX
    mitm_endpoint = f'http://{mitm_ip}:8079'

    # register the proxy in KV store
    redis_conn = redis.Redis(db=1)
    redis_conn.hset(hkey, "endpoint", mitm_endpoint)
    redis_conn.hset(hkey, "container_id", container.id)
    redis_conn.hset(hkey, "logfile_path", logfile_path)
    redis_conn.hset(hkey, "devtool_endpoint", devtool_endpoint)

    # make sure proxy is up.
    for timeout in range(10):
        time.sleep(1)
        if ps_search(container, 'nginx'):
            logger.info(f"Proxy to rserver started within {timeout + 1} seconds")
            break
    else:
        raise ValueError('mitmproxy failed to start after 10 seconds')

    return mitm_endpoint
def start_jupyter(labbook: LabBook, username: str, tag: Optional[str] = None,
                  check_reachable: bool = True,
                  proxy_prefix: Optional[str] = None) -> str:
    """Main entrypoint to launching Jupyter. Note, the caller must
    determine for themselves the host and port.

    Returns:
        Path to jupyter (e.g., "/lab?token=xyz")
    """
    owner = InventoryManager().query_owner(labbook)
    lb_key = tag or infer_docker_image_name(labbook_name=labbook.name,
                                            owner=owner,
                                            username=username)
    docker_client = get_docker_client()
    lb_container = docker_client.containers.get(lb_key)
    if lb_container.status != 'running':
        raise GigantumException(f"{str(labbook)} container is not running. "
                                f"Start it before launching a dev tool.")

    jupyter_ps = ps_search(lb_container, 'jupyter lab')

    # Get IP of container on Docker Bridge Network
    lb_ip_addr = get_container_ip(lb_key)

    if len(jupyter_ps) == 1:
        logger.info(f'Found existing Jupyter instance for {str(labbook)}.')

        # Get token from PS in container
        t = re.search(r"token='?([a-zA-Z\d-]+)'?", jupyter_ps[0])
        if not t:
            raise GigantumException('Cannot detect Jupyter Lab token')
        token = t.groups()[0]
        suffix = f'{proxy_prefix or ""}/lab/tree/code?token={token}'

        if check_reachable:
            check_jupyter_reachable(lb_ip_addr, DEFAULT_JUPYTER_PORT, f'{proxy_prefix or ""}')

        return suffix
    elif len(jupyter_ps) == 0:
        token = str(uuid.uuid4()).replace('-', '')
        if proxy_prefix and proxy_prefix[0] != '/':
            proxy_prefix = f'/{proxy_prefix}'
        _start_jupyter_process(labbook, lb_container, username, lb_key, token, proxy_prefix)
        suffix = f'{proxy_prefix or ""}/lab/tree/code?token={token}'

        if check_reachable:
            check_jupyter_reachable(lb_ip_addr, DEFAULT_JUPYTER_PORT, f'{proxy_prefix or ""}')

        return suffix
    else:
        # If "ps aux" for jupyterlab returns multiple hits - this should never happen.
        for n, l in enumerate(jupyter_ps):
            logger.error(f'Multiple JupyterLab instances - ({n+1} of {len(jupyter_ps)}) - {l}')
        raise ValueError('Multiple Jupyter Lab instances detected in project env. You should restart the container.')
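# Illustrative sketch (assumption, not from the source): tying start_jupyter() to the proxy path
# composition shown in _start_dev_tool() earlier in this listing. The username and proxy prefix
# are placeholders; only the suffix/port handling mirrors code that actually appears above.
def _example_launch_jupyter(labbook: LabBook) -> str:
    suffix = start_jupyter(labbook, username="alice", proxy_prefix="/rt/some-prefix")   # hypothetical values
    apparent_proxy_port = labbook.client_config.config['proxy']["apparent_proxy_port"]
    path = suffix if apparent_proxy_port == 80 else f':{apparent_proxy_port}{suffix}'
    return path   # e.g. ":10000/rt/some-prefix/lab/tree/code?token=..."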