def _download_workspace(self, resource_properties, overwrite):
    """
    Download workspace asset.

    :param resource_properties: dict of properties for the workspace asset. Must contain the
    'source_path', 'path' and 'object_type' fields.
    :param overwrite: Whether or not to overwrite the contents of workspace notebooks.
    """
    local_path = resource_properties.get(WORKSPACE_RESOURCE_SOURCE_PATH)
    workspace_path = resource_properties.get(WORKSPACE_RESOURCE_PATH)
    object_type = resource_properties.get(WORKSPACE_RESOURCE_OBJECT_TYPE)
    click.echo('Downloading {} from Databricks path {} to {}'.format(object_type,
                                                                     workspace_path, local_path))
    if object_type == NOTEBOOK:
        # Inference of notebook language and format. A tuple of (language, fmt) or None.
        language_fmt = WorkspaceLanguage.to_language_and_format(local_path)
        if language_fmt is None:
            raise StackError("Workspace notebook language and format cannot be inferred. "
                             "Please check the file extension of the notebook 'source_path'.")
        (_, fmt) = language_fmt
        local_dir = os.path.dirname(os.path.abspath(local_path))
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)
        self.workspace_client.export_workspace(workspace_path, local_path, fmt, overwrite)
    elif object_type == DIRECTORY:
        self.workspace_client.export_workspace_dir(workspace_path, local_path, overwrite)
    else:
        raise StackError("Invalid value for '{}' field: {}"
                         .format(WORKSPACE_RESOURCE_OBJECT_TYPE, object_type))
def _put_job(self, job_settings):
    """
    Given the settings of a job in job_settings, create a new job. For purposes of idempotency
    and to reduce leaked resources in alpha versions of stack deployment, if a job exists with
    the same name, that job will be updated. If multiple jobs are found with the same name, the
    deployment will abort.

    :param job_settings: dict of job settings as defined by the Jobs REST API. Must contain
    the 'name' field.
    :return: job_id, Physical ID of job on Databricks server.
    """
    if 'name' not in job_settings:
        raise StackError("Please supply 'name' in job resource 'properties'")
    job_name = job_settings.get('name')
    jobs_same_name = self.jobs_client._list_jobs_by_name(job_name)
    if len(jobs_same_name) > 1:
        raise StackError("Multiple jobs with the same name '{}' already exist, aborting"
                         " stack deployment".format(job_name))
    elif len(jobs_same_name) == 1:
        existing_job = jobs_same_name[0]
        creator_name = existing_job.get('creator_user_name')
        timestamp = existing_job.get('created_time') / MS_SEC  # Convert to readable date.
        date_created = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
        click.echo("Warning: Job exists with same name '{}' created by {} on {}. Job will "
                   "be overwritten".format(job_name, creator_name, date_created))
        # Calling jobs_client.reset_job directly so as to not call a same-level function.
        self.jobs_client.reset_job({'job_id': existing_job.get('job_id'),
                                    'new_settings': job_settings})
        return existing_job.get('job_id')
    else:
        job_id = self.jobs_client.create_job(job_settings).get('job_id')
        return job_id
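# Illustrative usage sketch (not part of the original source): a minimal job_settings dict in
# the shape accepted by the Jobs REST API. The cluster and notebook values are hypothetical.
#
#   example_job_settings = {
#       'name': 'example-job',
#       'new_cluster': {'spark_version': '5.3.x-scala2.11',
#                       'node_type_id': 'i3.xlarge',
#                       'num_workers': 2},
#       'notebook_task': {'notebook_path': '/Users/someone@example.com/example-notebook'},
#   }
#   job_id = self._put_job(example_job_settings)
#   # Creates the job, or resets the existing job of the same name and returns its job_id.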
def _deploy_workspace(self, resource_properties, databricks_id, overwrite):
    """
    Deploy workspace asset.

    :param resource_properties: dict of properties for the workspace asset. Must contain the
    'source_path', 'path' and 'object_type' fields.
    :param databricks_id: dict containing the physical identifier of the workspace asset on
    Databricks. Should contain the field 'path'.
    :param overwrite: Whether or not to overwrite the contents of workspace notebooks.
    :return: (dict, dict) of (databricks_id, deploy_output). databricks_id is the physical ID
    for the stack status that contains the workspace path of the notebook or directory on
    Databricks. deploy_output is the initial information about the asset on Databricks at
    deploy time returned by the REST API.
    """
    local_path = resource_properties.get(WORKSPACE_RESOURCE_SOURCE_PATH)
    workspace_path = resource_properties.get(WORKSPACE_RESOURCE_PATH)
    object_type = resource_properties.get(WORKSPACE_RESOURCE_OBJECT_TYPE)
    actual_object_type = DIRECTORY if os.path.isdir(local_path) else NOTEBOOK

    if object_type != actual_object_type:
        raise StackError('Field "{}" ({}) not consistent '
                         'with actual object type ({})'.format(WORKSPACE_RESOURCE_OBJECT_TYPE,
                                                               object_type, actual_object_type))

    click.echo('Uploading {} from {} to Databricks workspace at {}'.format(object_type,
                                                                           local_path,
                                                                           workspace_path))
    if object_type == NOTEBOOK:
        # Inference of notebook language and format.
        language_fmt = WorkspaceLanguage.to_language_and_format(local_path)
        if language_fmt is None:
            raise StackError("Workspace notebook language and format cannot be inferred. "
                             "Please check the file extension of the notebook file.")
        language, fmt = language_fmt
        # Create needed directories in the workspace.
        self.workspace_client.mkdirs(os.path.dirname(workspace_path))
        self.workspace_client.import_workspace(local_path, workspace_path, language, fmt,
                                               overwrite)
    elif object_type == DIRECTORY:
        self.workspace_client.import_workspace_dir(local_path, workspace_path, overwrite,
                                                   exclude_hidden_files=True)
    else:
        # Shouldn't reach here because of the verification of object_type above.
        assert False

    if databricks_id and databricks_id[WORKSPACE_RESOURCE_PATH] != workspace_path:
        # databricks_id['path'] is the workspace path from the last deployment. Alert when
        # changed.
        click.echo("Workspace asset had path changed from {} to {}".format(
            databricks_id[WORKSPACE_RESOURCE_PATH], workspace_path))

    new_databricks_id = {WORKSPACE_RESOURCE_PATH: workspace_path}
    deploy_output = self.workspace_client.client.get_status(workspace_path)
    return new_databricks_id, deploy_output
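# Illustrative usage sketch (not part of the original source): example 'workspace' resource
# properties for a notebook and the databricks_id a first deployment would produce. The local
# and workspace paths are hypothetical.
#
#   notebook_properties = {
#       'source_path': 'notebooks/etl.py',          # local file; language/format inferred
#       'path': '/Users/someone@example.com/etl',   # target path in the workspace
#       'object_type': 'NOTEBOOK',
#   }
#   databricks_id, deploy_output = self._deploy_workspace(notebook_properties, None,
#                                                         overwrite=True)
#   # databricks_id == {'path': '/Users/someone@example.com/etl'}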
def _deploy_workspace(self, resource_properties, physical_id, overwrite):
    """
    Deploy workspace asset.

    :param resource_properties: dict of properties for the workspace asset. Must contain the
    'source_path' and 'path' fields. The other fields will be inferred if not provided.
    :param physical_id: dict containing the physical identifier of the workspace asset on
    Databricks. Should contain the field 'path'.
    :param overwrite: Whether or not to overwrite the contents of workspace notebooks.
    :return: (dict, dict) of (physical_id, deploy_output). physical_id is the physical ID for
    the stack status that contains the workspace path of the notebook or directory on
    Databricks. deploy_output is the initial information about the asset on Databricks at
    deploy time returned by the REST API.
    """
    # Required fields. TODO(alinxie) put in _validate_config
    local_path = resource_properties.get('source_path')
    workspace_path = resource_properties.get('path')
    object_type = resource_properties.get('object_type')
    actual_object_type = 'DIRECTORY' if os.path.isdir(local_path) else 'NOTEBOOK'

    if object_type != actual_object_type:
        raise StackError("Field 'object_type' ({}) not consistent "
                         "with actual object type ({})".format(object_type,
                                                               actual_object_type))

    click.echo('Uploading {} from {} to Databricks workspace at {}'.format(object_type,
                                                                           local_path,
                                                                           workspace_path))
    if object_type == 'NOTEBOOK':
        # Inference of notebook language and format.
        language_fmt = WorkspaceLanguage.to_language_and_format(local_path)
        if language_fmt is None:
            raise StackError("Workspace notebook language and format cannot be inferred. "
                             "Please check the file extension of the notebook file.")
        language, fmt = language_fmt
        # Create needed directories in the workspace.
        self.workspace_client.mkdirs(os.path.dirname(workspace_path))
        self.workspace_client.import_workspace(local_path, workspace_path, language, fmt,
                                               overwrite)
    elif object_type == 'DIRECTORY':
        self.workspace_client.import_workspace_dir(local_path, workspace_path, overwrite,
                                                   exclude_hidden_files=True)
    else:
        # Shouldn't reach here because of the verification of object_type above.
        assert False

    if physical_id and physical_id['path'] != workspace_path:
        # physical_id['path'] is the workspace path from the last deployment. Alert when changed.
        click.echo("Workspace asset had path changed from {} to {}".format(
            physical_id['path'], workspace_path))

    new_physical_id = {'path': workspace_path}
    deploy_output = self.workspace_client.client.get_status(workspace_path)
    return new_physical_id, deploy_output
def _validate_config(self, stack_config):
    """
    Validate fields within a stack configuration. This ensures that an inputted configuration
    has the necessary fields for stack deployment to function well.

    :param stack_config: dict - stack config that is inputted by the user.
    :return: None. Raises errors to stop deployment if there is a problem.
    """
    click.echo('Validating fields in stack configuration...')
    self._assert_fields_in_dict([STACK_NAME, STACK_RESOURCES], stack_config)

    seen_resource_ids = set()  # Store seen resource IDs to restrict duplicates.
    for resource in stack_config.get(STACK_RESOURCES):
        # Validate that the resource ID exists, then get it.
        self._assert_fields_in_dict([RESOURCE_ID], resource)
        resource_id = resource.get(RESOURCE_ID)

        click.echo('Validating fields in resource with ID "{}"'.format(resource_id))
        self._assert_fields_in_dict([RESOURCE_SERVICE, RESOURCE_PROPERTIES], resource)
        resource_service = resource.get(RESOURCE_SERVICE)
        resource_properties = resource.get(RESOURCE_PROPERTIES)

        # Error on duplicate resource IDs.
        if resource_id in seen_resource_ids:
            raise StackError('Duplicate resource ID "{}" found, please resolve.'.format(
                resource_id))
        seen_resource_ids.add(resource_id)

        # Resource service-specific validations.
        click.echo('Validating fields in "{}" of {} resource.'.format(RESOURCE_PROPERTIES,
                                                                      resource_service))
        if resource_service == JOBS_SERVICE:
            self._assert_fields_in_dict([JOBS_RESOURCE_NAME], resource_properties)
        elif resource_service == WORKSPACE_SERVICE:
            self._assert_fields_in_dict([WORKSPACE_RESOURCE_PATH,
                                         WORKSPACE_RESOURCE_SOURCE_PATH,
                                         WORKSPACE_RESOURCE_OBJECT_TYPE], resource_properties)
        elif resource_service == DBFS_SERVICE:
            self._assert_fields_in_dict([DBFS_RESOURCE_PATH, DBFS_RESOURCE_SOURCE_PATH,
                                         DBFS_RESOURCE_IS_DIR], resource_properties)
        else:
            raise StackError('Resource service "{}" not supported'.format(resource_service))
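# Illustrative sketch (not part of the original source): a minimal stack configuration that
# passes _validate_config, written with the same constants the validation uses. Resource IDs,
# names and paths are hypothetical.
#
#   example_stack_config = {
#       STACK_NAME: 'example-stack',
#       STACK_RESOURCES: [
#           {RESOURCE_ID: 'example-notebook',
#            RESOURCE_SERVICE: WORKSPACE_SERVICE,
#            RESOURCE_PROPERTIES: {WORKSPACE_RESOURCE_SOURCE_PATH: 'notebooks/etl.py',
#                                  WORKSPACE_RESOURCE_PATH: '/Users/someone@example.com/etl',
#                                  WORKSPACE_RESOURCE_OBJECT_TYPE: 'NOTEBOOK'}},
#           {RESOURCE_ID: 'example-job',
#            RESOURCE_SERVICE: JOBS_SERVICE,
#            RESOURCE_PROPERTIES: {JOBS_RESOURCE_NAME: 'example-job'}},
#       ],
#   }
#   self._validate_config(example_stack_config)  # Raises StackError on missing/duplicate fields.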
def _validate_status(self, stack_status):
    """
    Validate fields within a stack status. This ensures that a stack status has the necessary
    fields for stack deployment to function well.

    If there is an error here, then it is either an implementation error that must be fixed by
    a developer or the user edited the stack status file created by the program.

    :param stack_status: dict - stack status that is created by the program.
    :return: None. Raises errors to stop deployment if there is a problem.
    """
    click.echo('Validating fields in stack status...')
    self._assert_fields_in_dict([STACK_NAME, STACK_RESOURCES, STACK_DEPLOYED], stack_status)

    for resource_status in stack_status.get(STACK_DEPLOYED):
        self._assert_fields_in_dict([RESOURCE_ID], resource_status)
        resource_id = resource_status.get(RESOURCE_ID)
        click.echo('Validating fields in resource status of resource with ID "{}"'
                   .format(resource_id))
        self._assert_fields_in_dict([RESOURCE_SERVICE, RESOURCE_PHYSICAL_ID,
                                     RESOURCE_DEPLOY_OUTPUT], resource_status)
        resource_service = resource_status.get(RESOURCE_SERVICE)
        resource_physical_id = resource_status.get(RESOURCE_PHYSICAL_ID)

        click.echo('Validating fields in "{}" of {} resource status'
                   .format(RESOURCE_PHYSICAL_ID, resource_service))
        if resource_service == JOBS_SERVICE:
            self._assert_fields_in_dict([JOBS_RESOURCE_JOB_ID], resource_physical_id)
        elif resource_service == WORKSPACE_SERVICE:
            self._assert_fields_in_dict([WORKSPACE_RESOURCE_PATH], resource_physical_id)
        elif resource_service == DBFS_SERVICE:
            self._assert_fields_in_dict([DBFS_RESOURCE_PATH], resource_physical_id)
        else:
            raise StackError("{} not a valid resource status service".format(resource_service))
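# Illustrative sketch (not part of the original source): the shape of a stack status dict that
# passes _validate_status, written with the same constants the validation uses. Values are
# hypothetical.
#
#   example_stack_status = {
#       STACK_NAME: 'example-stack',
#       STACK_RESOURCES: [...],   # the resource configs the stack was deployed from
#       STACK_DEPLOYED: [
#           {RESOURCE_ID: 'example-job',
#            RESOURCE_SERVICE: JOBS_SERVICE,
#            RESOURCE_PHYSICAL_ID: {JOBS_RESOURCE_JOB_ID: 123},
#            RESOURCE_DEPLOY_OUTPUT: {...}},
#       ],
#   }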
def _deploy_resource(self, resource_config, resource_status=None, **kwargs):
    """
    Deploys a resource given resource information extracted from the stack JSON configuration
    template.

    :param resource_config: A dict of the resource with fields of 'id', 'service' and
    'properties'.
    ex. {'id': 'example-resource', 'service': 'jobs', 'properties': {...}}
    :param resource_status: A dict of the resource's deployment info from the last deployment.
    Will be None if this is the first deployment.
    ex. {'id': 'example-resource', 'service': 'jobs', 'physical_id': {...}}
    :return: dict resource_status - A dictionary of deployment information of the resource to
    be stored at deploy time. It includes the resource ID of the resource along with the
    physical ID and deploy output of the resource.
    ex. {'id': 'example-resource', 'service': 'jobs', 'physical_id': {'job_id': 123},
    'timestamp': 123456789, 'deploy_output': {..}}
    """
    resource_id = resource_config.get(RESOURCE_ID)
    resource_service = resource_config.get(RESOURCE_SERVICE)
    resource_properties = resource_config.get(RESOURCE_PROPERTIES)
    physical_id = resource_status.get(RESOURCE_PHYSICAL_ID) if resource_status else None

    if resource_service == JOBS_SERVICE:
        click.echo("Deploying job '{}' with properties: \n{}".format(
            resource_id, json.dumps(resource_properties, indent=2, separators=(',', ': '))))
        new_physical_id, deploy_output = self._deploy_job(resource_properties, physical_id)
    elif resource_service == WORKSPACE_SERVICE:
        click.echo("Deploying workspace asset '{}' with properties \n{}".format(
            resource_id, json.dumps(resource_properties, indent=2, separators=(',', ': '))))
        overwrite = kwargs.get('overwrite', False)
        new_physical_id, deploy_output = self._deploy_workspace(resource_properties,
                                                                physical_id, overwrite)
    elif resource_service == DBFS_SERVICE:
        click.echo("Deploying DBFS asset '{}' with properties \n{}".format(
            resource_id, json.dumps(resource_properties, indent=2, separators=(',', ': '))))
        overwrite = kwargs.get('overwrite', False)
        new_physical_id, deploy_output = self._deploy_dbfs(resource_properties, physical_id,
                                                           overwrite)
    else:
        raise StackError("Resource service '{}' not supported".format(resource_service))

    new_resource_status = {RESOURCE_ID: resource_id,
                           RESOURCE_SERVICE: resource_service,
                           # Milliseconds since epoch.
                           RESOURCE_DEPLOY_TIMESTAMP:
                               int(time.mktime(datetime.now().timetuple()) * MS_SEC),
                           RESOURCE_PHYSICAL_ID: new_physical_id,
                           RESOURCE_DEPLOY_OUTPUT: deploy_output}
    return new_resource_status
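# Illustrative usage sketch (not part of the original source): deploying a single jobs resource
# and the shape of the returned status entry. The resource ID and job name are hypothetical.
#
#   resource_config = {'id': 'example-job',
#                      'service': 'jobs',
#                      'properties': {'name': 'example-job'}}
#   status = self._deploy_resource(resource_config, resource_status=None, overwrite=True)
#   # status resembles {'id': 'example-job', 'service': 'jobs',
#   #                   'physical_id': {'job_id': 123}, 'timestamp': 1234567890000,
#   #                   'deploy_output': {...}}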
def _validate_status(self, stack_status):
    """
    Validate fields within a stack status. This ensures that a stack status has the necessary
    fields for stack deployment to function well.

    If there is an error here, then it is either an implementation error that must be fixed by
    a developer or the user edited the stack status file created by the program.

    TODO(alinxie): Add validation for separate resource services and their physical IDs.

    :param stack_status: dict - stack status that is created by the program.
    :return: None. Raises errors to stop deployment if there is a problem.
    """
    if STACK_NAME not in stack_status:
        raise StackError("'{}' not in status".format(STACK_NAME))
    if STACK_RESOURCES not in stack_status:
        raise StackError("'{}' not in status".format(STACK_RESOURCES))
    if STACK_DEPLOYED not in stack_status:
        raise StackError("'{}' not in status".format(STACK_DEPLOYED))

    for deployed_resource in stack_status.get(STACK_DEPLOYED):
        if RESOURCE_ID not in deployed_resource:
            raise StackError("{} doesn't exist in deployed resource status".format(RESOURCE_ID))
        if RESOURCE_SERVICE not in deployed_resource:
            raise StackError("{} doesn't exist in deployed resource status".format(
                RESOURCE_SERVICE))
        if RESOURCE_PHYSICAL_ID not in deployed_resource:
            raise StackError("{} doesn't exist in deployed resource status".format(
                RESOURCE_PHYSICAL_ID))
def _validate_config(self, stack_config):
    """
    Validate fields within a stack configuration. This ensures that an inputted configuration
    has the necessary fields for stack deployment to function well.

    TODO(alinxie): Add validation for separate resource services and their properties.

    :param stack_config: dict - stack config that is inputted by the user.
    :return: None. Raises errors to stop deployment if there is a problem.
    """
    if STACK_NAME not in stack_config:
        raise StackError("'{}' not in configuration".format(STACK_NAME))
    if STACK_RESOURCES not in stack_config:
        raise StackError("'{}' not in configuration".format(STACK_RESOURCES))

    seen_resource_ids = set()  # Store seen resource IDs to restrict duplicates.
    for resource in stack_config.get(STACK_RESOURCES):
        if RESOURCE_ID not in resource:
            raise StackError("{} doesn't exist in resource config".format(RESOURCE_ID))
        if RESOURCE_SERVICE not in resource:
            raise StackError("{} doesn't exist in resource config".format(RESOURCE_SERVICE))
        if RESOURCE_PROPERTIES not in resource:
            raise StackError("{} doesn't exist in resource config".format(RESOURCE_PROPERTIES))
        # Error on duplicate resource IDs.
        resource_id = resource.get(RESOURCE_ID)
        if resource_id in seen_resource_ids:
            raise StackError("Duplicate resource ID '{}' found, please resolve.".format(
                resource_id))
        seen_resource_ids.add(resource_id)
def _deploy_dbfs(self, resource_properties, databricks_id, overwrite, headers=None):
    """
    Deploy dbfs asset.

    :param resource_properties: dict of properties for the dbfs asset. Must contain the
    'source_path', 'path' and 'is_dir' fields.
    :param databricks_id: dict containing physical identifier of dbfs asset on Databricks.
    Should contain the field 'path'.
    :param overwrite: Whether or not to overwrite the contents of dbfs files.

    :return: databricks_id: a dict that contains the dbfs path of the file on Databricks.
    ex. {"path": "dbfs:/path/in/dbfs"}
    """
    local_path = resource_properties.get(DBFS_RESOURCE_SOURCE_PATH)
    dbfs_path = resource_properties.get(DBFS_RESOURCE_PATH)
    is_dir = resource_properties.get(DBFS_RESOURCE_IS_DIR)

    if is_dir != os.path.isdir(local_path):
        dir_or_file = 'directory' if os.path.isdir(local_path) else 'file'
        raise StackError('local source_path "{}" is found to be a {}, but is not specified'
                         ' as one with is_dir: {}.'.format(local_path, dir_or_file,
                                                           str(is_dir).lower()))
    if is_dir:
        click.echo('Uploading directory from {} to DBFS at {}'.format(local_path, dbfs_path))
        self.dbfs_client.cp(recursive=True, overwrite=overwrite, src=local_path, dst=dbfs_path,
                            headers=headers)
    else:
        click.echo('Uploading file from {} to DBFS at {}'.format(local_path, dbfs_path))
        self.dbfs_client.cp(recursive=False, overwrite=overwrite, src=local_path, dst=dbfs_path,
                            headers=headers)

    if databricks_id and databricks_id[DBFS_RESOURCE_PATH] != dbfs_path:
        # databricks_id['path'] is the dbfs path from the last deployment. Alert when changed.
        click.echo("Dbfs asset had path changed from {} to {}".format(
            databricks_id[DBFS_RESOURCE_PATH], dbfs_path))

    new_databricks_id = {DBFS_RESOURCE_PATH: dbfs_path}
    return new_databricks_id
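# Illustrative usage sketch (not part of the original source): example 'dbfs' resource
# properties for uploading a local directory, and the databricks_id this would produce. Paths
# are hypothetical.
#
#   dbfs_properties = {'source_path': 'lib/jars',              # local directory
#                      'path': 'dbfs:/stacks/example/jars',    # target path on DBFS
#                      'is_dir': True}
#   databricks_id = self._deploy_dbfs(dbfs_properties, None, overwrite=True)
#   # databricks_id == {'path': 'dbfs:/stacks/example/jars'}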
def _deploy_dbfs(self, resource_properties, physical_id, overwrite):
    """
    Deploy dbfs asset.

    :param resource_properties: dict of properties for the dbfs asset. Must contain the
    'source_path', 'path' and 'is_dir' fields.
    :param physical_id: dict containing physical identifier of dbfs asset on Databricks.
    Should contain the field 'path'.
    :param overwrite: Whether or not to overwrite the contents of dbfs files.

    :return: (dict, dict) of (physical_id, deploy_output). physical_id is a dict that contains
    the dbfs path of the file on Databricks. ex. {"path": "dbfs:/path/in/dbfs"}
    deploy_output is the initial information about the dbfs asset at deploy time returned by
    the REST API.
    """
    # Required fields. TODO(alinxie) validate fields in _validate_config
    local_path = resource_properties.get('source_path')
    dbfs_path = resource_properties.get('path')
    is_dir = resource_properties.get('is_dir')

    if is_dir != os.path.isdir(local_path):
        dir_or_file = 'directory' if os.path.isdir(local_path) else 'file'
        raise StackError("local source_path '{}' is found to be a {}, but is not specified"
                         " as one with is_dir: {}.".format(local_path, dir_or_file,
                                                           str(is_dir).lower()))
    if is_dir:
        click.echo('Uploading directory from {} to DBFS at {}'.format(local_path, dbfs_path))
        self.dbfs_client.cp(recursive=True, overwrite=overwrite, src=local_path, dst=dbfs_path)
    else:
        click.echo('Uploading file from {} to DBFS at {}'.format(local_path, dbfs_path))
        self.dbfs_client.cp(recursive=False, overwrite=overwrite, src=local_path, dst=dbfs_path)

    if physical_id and physical_id['path'] != dbfs_path:
        # physical_id['path'] is the dbfs path from the last deployment. Alert when changed.
        click.echo("Dbfs asset had path changed from {} to {}".format(physical_id['path'],
                                                                      dbfs_path))

    new_physical_id = {'path': dbfs_path}
    deploy_output = self.dbfs_client.client.get_status(dbfs_path)
    return new_physical_id, deploy_output
def _update_job(self, job_settings, job_id, headers=None):
    """
    Given job settings and an existing job_id of a job, update the job settings on Databricks.

    :param job_settings: job settings to update the job with.
    :param job_id: physical job_id of the job on the Databricks server.
    """
    try:
        self.jobs_client.reset_job({'job_id': job_id, 'new_settings': job_settings},
                                   headers=headers)
    except HTTPError:
        raise StackError('Job ID {} in stack status could not be found in the workspace. '
                         'Please remove or make necessary changes to the current stack status '
                         'to resolve this inconsistency before proceeding. Aborting '
                         'stack deployment ...'.format(job_id))
def _assert_fields_in_dict(self, fields, dictionary):
    for field in fields:
        if field not in dictionary:
            raise StackError('Required field "{}" not found'.format(field))
def _deploy_resource(self, resource_config, resource_status=None, headers=None, **kwargs):
    """
    Deploys a resource given resource information extracted from the stack JSON configuration
    template.

    :param resource_config: A dict of the resource with fields of 'id', 'service' and
    'properties'.
    ex. {'id': 'example-resource', 'service': 'jobs', 'properties': {...}}
    :param resource_status: A dict of the resource's deployment info from the last deployment.
    Will be None if this is the first deployment.
    ex. {'id': 'example-resource', 'service': 'jobs', 'databricks_id': {...}}
    :return: dict resource_status - A dictionary of deployment information of the resource to
    be stored at deploy time. It includes the resource ID of the resource along with the
    databricks ID and deploy output of the resource.
    ex. {'id': 'example-resource', 'service': 'jobs', 'databricks_id': {'job_id': 123}}
    """
    resource_id = resource_config.get(RESOURCE_ID)
    resource_service = resource_config.get(RESOURCE_SERVICE)
    resource_properties = resource_config.get(RESOURCE_PROPERTIES)
    databricks_id = resource_status.get(RESOURCE_DATABRICKS_ID) if resource_status else None

    if resource_service == JOBS_SERVICE:
        click.echo('Deploying job "{}" with properties: \n{}'.format(
            resource_id, json.dumps(resource_properties, indent=2, separators=(',', ': '))))
        new_databricks_id = self._deploy_job(resource_properties, databricks_id,
                                             headers=headers)
    elif resource_service == WORKSPACE_SERVICE:
        click.echo('Deploying workspace asset "{}" with properties \n{}'.format(
            resource_id, json.dumps(resource_properties, indent=2, separators=(',', ': '))))
        overwrite = kwargs.get('overwrite', False)
        new_databricks_id = self._deploy_workspace(resource_properties, databricks_id,
                                                   overwrite, headers=headers)
    elif resource_service == DBFS_SERVICE:
        click.echo('Deploying DBFS asset "{}" with properties \n{}'.format(
            resource_id, json.dumps(resource_properties, indent=2, separators=(',', ': '))))
        overwrite = kwargs.get('overwrite', False)
        new_databricks_id = self._deploy_dbfs(resource_properties, databricks_id, overwrite,
                                              headers=headers)
    else:
        raise StackError('Resource service "{}" not supported'.format(resource_service))

    new_resource_status = {RESOURCE_ID: resource_id,
                           RESOURCE_SERVICE: resource_service,
                           RESOURCE_DATABRICKS_ID: new_databricks_id}
    return new_resource_status