def clean_source(cfg: Dict[str, str]) -> NoReturn:
    """
    The guts of the "clean-source" command, this function deletes the source
    notebooks for the current course from the remote Databricks instance.

    :param cfg: The config. COURSE_NAME, COURSE_REMOTE_SOURCE, and DB_PROFILE
                are assumed to be set.

    :return: Nothing
    """
    check_config(cfg)
    db_profile = cfg['DB_PROFILE']
    remote_source = cfg['COURSE_REMOTE_SOURCE']

    # Ensure the remote directory exists before removing it; "rm" fails on a
    # nonexistent remote path.
    w = databricks.Workspace(profile=db_profile)
    w.mkdirs(remote_source)
    w.rm(remote_source, recursive=True)
def update_config(cfg: Dict[str, str]) -> Dict[str, str]:
    """
    Update the configuration, setting values that depend on the course name,
    which is assumed to be set in the configuration.

    :param cfg: the current configuration

    :return: a possibly adjusted configuration

    :raises CourseError: Configuration error.
    """
    course = cfg.get('COURSE_NAME')
    if not course:
        return cfg

    from os.path import join, normpath

    adj = cfg.copy()
    repo = adj['COURSE_REPO']

    self_paced = list(get_self_paced_courses(cfg))
    prefix = 'Self-Paced' if course in self_paced else ''

    adj['PREFIX'] = prefix
    adj['COURSE_HOME'] = normpath(join(repo, 'courses', prefix, course))
    if not adj.get('COURSE_YAML'):
        adj['COURSE_YAML'] = join(adj['COURSE_HOME'], 'build.yaml')
    adj['COURSE_MODULES'] = join(repo, 'modules', prefix, course)

    db_shard_home = adj.get('DB_SHARD_HOME')
    if not db_shard_home:
        # Let the databricks Workspace layer figure out the appropriate value
        # for home.
        try:
            w = databricks.Workspace(adj['DB_PROFILE'])
            db_shard_home = w.home
        except databricks.DatabricksError as e:
            # Ignore config errors. ~/.databrickscfg might not be there.
            if e.code != databricks.StatusCode.CONFIG_ERROR:
                raise

    if db_shard_home:
        adj['COURSE_REMOTE_SOURCE'] = f'{db_shard_home}/{adj["SOURCE"]}/{course}'
        adj['COURSE_REMOTE_TARGET'] = f'{db_shard_home}/{adj["TARGET"]}/{course}'

    return adj
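# Illustration only, with hypothetical values (not part of any real config):
# given COURSE_NAME="Delta-Lake", COURSE_REPO="/repo", SOURCE="_Source",
# TARGET="_Build", a self-paced course list containing "Delta-Lake", and a
# Databricks home of "/Users/someone@example.com", update_config() would
# derive roughly:
#
#   PREFIX               -> "Self-Paced"
#   COURSE_HOME          -> "/repo/courses/Self-Paced/Delta-Lake"
#   COURSE_YAML          -> "/repo/courses/Self-Paced/Delta-Lake/build.yaml"
#   COURSE_MODULES       -> "/repo/modules/Self-Paced/Delta-Lake"
#   COURSE_REMOTE_SOURCE -> "/Users/someone@example.com/_Source/Delta-Lake"
#   COURSE_REMOTE_TARGET -> "/Users/someone@example.com/_Build/Delta-Lake"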
def clean(cfg: Dict[str, str]) -> NoReturn:
    """
    The guts of the "clean" command, this function deletes the built (target)
    notebooks for the current course from the remote Databricks instance.

    :param cfg: The config. COURSE_NAME, COURSE_REMOTE_TARGET, and DB_PROFILE
                are assumed to be set.

    :return: Nothing
    """
    check_config(cfg)
    db_profile = cfg['DB_PROFILE']
    remote_target = cfg['COURSE_REMOTE_TARGET']

    # It's odd to ensure that the directory exists before removing it, but
    # it's easier (and costs no more time, really) than issuing a REST call
    # to check whether it exists in the first place. And "rm" will die if
    # called on a nonexistent remote path.
    w = databricks.Workspace(profile=db_profile)
    w.mkdirs(remote_target)
    w.rm(remote_target, recursive=True)
def import_dbcs(cfg: Dict[str, str],
                build_dir: str,
                build_file: str) -> NoReturn:
    """
    Find all DBC files under the build output directory for the current
    course, and upload them (import them) into the Databricks instance.

    :param cfg:        The config. COURSE_NAME, COURSE_REMOTE_TARGET, and
                       DB_PROFILE are assumed to be set.
    :param build_dir:  The path to the build directory.
    :param build_file: The path to the course's build file.

    :return: Nothing
    """
    check_config(cfg)
    remote_target = cfg['COURSE_REMOTE_TARGET']
    db_profile = cfg['DB_PROFILE']

    def import_dbc(dbc: str, build: bdc.BuildData) -> NoReturn:
        '''
        Import a single DBC. Assumes (a) the working directory is the build
        directory, and (b) that the remote target path has already been
        created.
        '''
        w = databricks.Workspace(profile=db_profile)
        if build.has_profiles:
            parent_subpath = os.path.dirname(dbc)
            dir_to_make = f'{remote_target}/{os.path.dirname(parent_subpath)}'
            w.mkdirs(dir_to_make)
            remote_path = f'{remote_target}/{parent_subpath}'
        else:
            remote_path = remote_target

        info(f'Importing "{dbc}" to "{remote_path}"...')
        w.import_dbc(dbc, remote_path)

    # Get the build information. We'll need it later.
    build = bdc.bdc_load_build(build_file)

    print(
        f'Importing all DBCs under "{build_dir}" to remote "{remote_target}"')

    dbcs = []
    with working_directory(build_dir) as pwd:
        for dirpath, _, filenames in os.walk('.'):
            for filename in filenames:
                _, ext = os.path.splitext(filename)
                if ext != '.dbc':
                    continue
                dbcs.append(os.path.normpath(os.path.join(dirpath, filename)))

        if not dbcs:
            warn('No DBCs found.')
        else:
            clean(cfg)
            w = databricks.Workspace(profile=db_profile)

            # If we're doing a profile-based build, create the remote target.
            # The import operations will implicitly create the remote
            # subfolders. However, if we're not doing a profile-based build,
            # then creating the remote target ahead of time will cause the
            # import to fail, so don't do that.
            if build.has_profiles:
                w.mkdirs(remote_target)

            for dbc in dbcs:
                info(f'\nIn "{pwd}":')
                import_dbc(dbc, build)
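# Sketch of the profile-based path logic in import_dbc(), using hypothetical
# paths: with COURSE_REMOTE_TARGET="/Users/someone@example.com/_Build/Delta-Lake"
# and a DBC found at the relative path "azure/Labs.dbc" under the build
# directory, parent_subpath is "azure", so the DBC is imported into
# ".../_Build/Delta-Lake/azure". For a non-profile build, every DBC is imported
# directly into COURSE_REMOTE_TARGET.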