def rename_star(self, old_dir, new_dir):
    """Move every entry of `old_dir` into `new_dir` (like `mv old_dir/* new_dir`)."""
    if not self.isdir(old_dir):
        raise IOError(errno.ENOTDIR, _("'%s' is not a directory") % old_dir)

    # Create the destination on demand, but refuse to clobber a non-directory.
    if not self.exists(new_dir):
        self.mkdir(new_dir)
    elif not self.isdir(new_dir):
        raise IOError(errno.ENOTDIR, _("'%s' is not a directory") % new_dir)

    for entry in self.listdir(old_dir):
        self.rename(Hdfs.join(old_dir, entry), Hdfs.join(new_dir, entry))
def _create_deployment_dir(self):
    """
    Return the job deployment directory in HDFS, creating it if necessary.
    The actual deployment dir should be 0711 owned by the user
    """
    # Timestamped, per-user directory name keeps concurrent submissions apart.
    dir_name = '_%s_-oozie-%s-%s' % (self.user.username, self.job.id, time.time())
    path = Hdfs.join(REMOTE_DEPLOYMENT_DIR.get(), dir_name)
    self._create_dir(path)
    return path
def rename(self, old, new):
    """rename(old, new)"""
    old = Hdfs.normpath(old)
    if not new.startswith('/'):
        # A relative destination is resolved against the source's directory.
        new = Hdfs.join(Hdfs.dirname(old), new)
    new = Hdfs.normpath(new)

    params = self._getparams()
    params['op'] = 'RENAME'
    # Encode `new' because it's in the params
    params['destination'] = smart_str(new)

    result = self._root.put(old, params)
    if result['boolean']:
        return

    error_message = "Rename failed:"
    if self.isdir(new) and not self.isdir(old):
        # Destination exists and is a directory: report it as a failed move
        # into that directory instead of a plain rename.
        error_message = "Move failed:"
        new = Hdfs.join(new, self.basename(old))
    raise IOError(_(error_message + " %s -> %s") % (str(smart_str(old)), str(smart_str(new))))
def _create_deployment_dir(self): """ Return the job deployment directory in HDFS, creating it if necessary. The actual deployment dir should be 0711 owned by the user """ if self.user != self.job.owner: path = Hdfs.join(REMOTE_DEPLOYMENT_DIR.get(), '_%s_-oozie-%s-%s' % (self.user.username, self.job.id, time.time())) self.fs.copy_remote_dir(self.job.deployment_dir, path, owner=self.user, dir_mode=0711) else: path = self.job.deployment_dir self._create_dir(path) return path
def rename(self, old, new):
    """rename(old, new)"""
    old = Hdfs.normpath(old)
    if not new.startswith("/"):
        # Relative destination: resolve against the source's directory.
        new = Hdfs.join(Hdfs.dirname(old), new)
    new = Hdfs.normpath(new)

    params = self._getparams()
    params["op"] = "RENAME"
    # Encode `new' because it's in the params
    params["destination"] = smart_str(new)

    if not self._root.put(old, params)["boolean"]:
        raise IOError("Rename failed: %s -> %s" % (smart_str(old), smart_str(new)))
def create_directories(fs, directory_list=[]): # If needed, create the remote home, deployment and data directories directories = [REMOTE_DEPLOYMENT_DIR.get()] + directory_list for directory in directories: if not fs.do_as_user(fs.DEFAULT_USER, fs.exists, directory): remote_home_dir = Hdfs.join('/user', fs.DEFAULT_USER) if directory.startswith(remote_home_dir): # Home is 755 fs.do_as_user(fs.DEFAULT_USER, fs.create_home_dir, remote_home_dir) # Shared by all the users fs.do_as_user(fs.DEFAULT_USER, fs.mkdir, directory, 01777) fs.do_as_user(fs.DEFAULT_USER, fs.chmod, directory, 01777) # To remove after https://issues.apache.org/jira/browse/HDFS-3491
def create_directories(fs):
    """Create the remote home, deployment and sample directories if needed,
    acting as the 'hdfs' superuser. Returns the sample directory path."""
    for directory in (REMOTE_DEPLOYMENT_DIR.get(), REMOTE_SAMPLE_DIR.get()):
        if fs.do_as_user("hdfs", fs.exists, directory):
            continue
        remote_home_dir = Hdfs.join('/user', "hdfs")
        if directory.startswith(remote_home_dir):
            # Home is 755
            fs.do_as_user("hdfs", fs.create_home_dir, remote_home_dir)
        # Shared by all the users (511 decimal == 0o777)
        fs.do_as_user("hdfs", fs.mkdir, directory, 511)
        fs.do_as_user("hdfs", fs.chmod, directory, 511)  # To remove after https://issues.apache.org/jira/browse/HDFS-3491
    return REMOTE_SAMPLE_DIR.get()
def create_directories(fs): # If needed, create the remote home, deployment and data directories directories = (REMOTE_DEPLOYMENT_DIR.get(), REMOTE_SAMPLE_DIR.get()) for directory in directories: if not fs.do_as_user(fs.DEFAULT_USER, fs.exists, directory): remote_home_dir = Hdfs.join('/user', fs.DEFAULT_USER) if directory.startswith(remote_home_dir): # Home is 755 fs.do_as_user(fs.DEFAULT_USER, fs.create_home_dir, remote_home_dir) # Shared by all the users fs.do_as_user(fs.DEFAULT_USER, fs.mkdir, directory, 01777) fs.do_as_user(fs.DEFAULT_USER, fs.chmod, directory, 01777) # To remove after https://issues.apache.org/jira/browse/HDFS-3491 return REMOTE_SAMPLE_DIR.get()
def rename(self, old, new):
    """rename(old, new)"""
    source = Hdfs.normpath(old)
    if new.startswith('/'):
        destination = new
    else:
        # Relative destination: resolve against the source's directory.
        destination = Hdfs.join(Hdfs.dirname(source), new)
    destination = Hdfs.normpath(destination)

    params = self._getparams()
    params['op'] = 'RENAME'
    # Encode `new' because it's in the params
    params['destination'] = smart_str(destination)

    result = self._root.put(source, params)
    if not result['boolean']:
        raise IOError(
            _("Rename failed: %s -> %s") % (str(smart_str(source)), str(smart_str(destination))))
def __init__(self, file_status, parent_path):
    """Build a stat-like entry from a WebHDFS FileStatus dict."""
    self.path = Hdfs.join(parent_path, decode_fs_path(file_status['pathSuffix']))
    self.isDir = file_status['type'] == 'DIRECTORY'
    # Times arrive in milliseconds; store seconds.
    self.atime = file_status['accessTime'] / 1000
    self.mtime = file_status['modificationTime'] / 1000
    self.user = file_status['owner']
    self.group = file_status['group']
    self.size = file_status['length']
    self.blockSize = file_status['blockSize']
    self.replication = file_status['replication']
    # Permission is an octal string; OR in the file-type bits.
    self.mode = int(file_status['permission'], 8)
    self.mode |= stat.S_IFDIR if self.isDir else stat.S_IFREG
def rename(self, old, new):
    """rename(old, new)"""
    src = self.strip_normpath(old)
    if self.is_absolute(new):
        dst = new
    else:
        # Relative destination: resolve against the source's directory.
        dst = Hdfs.join(Hdfs.dirname(src), new)
    dst = self.strip_normpath(dst)

    params = self._getparams()
    params['op'] = 'RENAME'
    # Encode `new' because it's in the params
    params['destination'] = smart_str(dst)
    headers = self._getheaders()

    if not self._root.put(src, params, headers=headers)['boolean']:
        raise IOError(
            _("Rename failed: %s -> %s") %
            (smart_str(src, errors='replace'), smart_str(dst, errors='replace')))
def _create_deployment_dir(self): """ Return the job deployment directory in HDFS, creating it if necessary. The actual deployment dir should be 0711 owned by the user """ if self.user != self.job.owner: path = Hdfs.join( REMOTE_DEPLOYMENT_DIR.get(), '_%s_-oozie-%s-%s' % (self.user.username, self.job.id, time.time())) self.fs.copy_remote_dir(self.job.deployment_dir, path, owner=self.user, dir_mode=0711) else: path = self.job.deployment_dir self._create_dir(path) return path
def create_data_dir(cls, fs): # If needed, create the remote home and data directories remote_data_dir = conf.REMOTE_DATA_DIR.get() user = fs.user try: fs.setuser(fs.DEFAULT_USER) if not fs.exists(remote_data_dir): remote_home_dir = Hdfs.join('/user', fs.user) if remote_data_dir.startswith(remote_home_dir): # Home is 755 fs.create_home_dir(remote_home_dir) # Shared by all the users fs.mkdir(remote_data_dir, 01777) finally: fs.setuser(user) return remote_data_dir
def __init__(self, file_status, parent_path):
    """Build a named, stat-like entry from a WebHDFS FileStatus dict."""
    self.name = decode_fs_path(file_status['pathSuffix'])
    self.path = Hdfs.join(parent_path, self.name)
    self.type = file_status['type']
    self.isDir = self.type == 'DIRECTORY'
    # Times arrive in milliseconds; store seconds.
    self.atime = file_status['accessTime'] / 1000
    self.mtime = file_status['modificationTime'] / 1000
    self.user = file_status['owner']
    self.group = file_status['group']
    self.size = file_status['length']
    self.blockSize = file_status['blockSize']
    self.replication = file_status['replication']
    # Permission is an octal string; OR in the file-type bits.
    self.mode = int(file_status['permission'], 8)
    self.mode |= stat.S_IFDIR if self.isDir else stat.S_IFREG
def _create_deployment_dir(self): """ Return the job deployment directory in HDFS, creating it if necessary. The actual deployment dir should be 0711 owned by the user """ # Automatic setup of the required directories if needed create_directories(self.fs) # Case of a shared job if self.user != self.job.owner: path = Hdfs.join(REMOTE_DEPLOYMENT_DIR.get(), '_%s_-oozie-%s-%s' % (self.user.username, self.job.id, time.time())) # Shared coords or bundles might not have any existing workspaces if self.fs.exists(self.job.deployment_dir): self.fs.copy_remote_dir(self.job.deployment_dir, path, owner=self.user, dir_mode=0711) else: self._create_dir(path) else: path = self.job.deployment_dir self._create_dir(path) return path
def create_data_dir(fs): # If needed, create the remote home, deployment and data directories directories = (REMOTE_DEPLOYMENT_DIR.get(), REMOTE_SAMPLE_DIR.get()) user = fs.user try: fs.setuser(fs.DEFAULT_USER) for directory in directories: if not fs.exists(directory): remote_home_dir = Hdfs.join('/user', fs.user) if directory.startswith(remote_home_dir): # Home is 755 fs.create_home_dir(remote_home_dir) # Shared by all the users fs.mkdir(directory, 01777) fs.chmod(directory, 01777) # To remove after https://issues.apache.org/jira/browse/HDFS-3491 finally: fs.setuser(user) return REMOTE_SAMPLE_DIR.get()
class Submission(object):
    """
    Represents one unique Oozie submission.

    Actions are:
    - submit
    - rerun
    """
    def __init__(self, user, job=None, fs=None, jt=None, properties=None, oozie_id=None, local_tz=None):
        # `job` is the Hue job object (Workflow/Coordinator/Bundle); `fs` the
        # HDFS client; `oozie_id` an already-submitted job's id, if any.
        self.job = job
        self.user = user
        self.fs = fs
        self.jt = jt  # Deprecated with YARN, we now use logical names only for RM
        self.oozie_id = oozie_id
        self.api = get_oozie(self.user)

        if properties is not None:
            self.properties = properties
        else:
            self.properties = {}

        # NOTE(review): when job.data is a dict, the caller-supplied local_tz
        # is overridden by the job's own 'timezone' property.
        if local_tz and isinstance(self.job.data, dict):
            local_tz = self.job.data.get('properties')['timezone']

        # Modify start_date & end_date only when it's a coordinator
        from oozie.models2 import Coordinator
        if type(self.job) is Coordinator:
            if 'start_date' in self.properties:
                properties['start_date'] = convert_to_server_timezone(self.properties['start_date'], local_tz)
            if 'end_date' in self.properties:
                properties['end_date'] = convert_to_server_timezone(self.properties['end_date'], local_tz)

        if 'nominal_time' in self.properties:
            properties['nominal_time'] = convert_to_server_timezone(self.properties['nominal_time'], local_tz)

        self.properties['security_enabled'] = self.api.security_enabled

    def __str__(self):
        # Human-readable description used in log messages below.
        if self.oozie_id:
            res = "Submission for job '%s'." % (self.oozie_id,)
        else:
            res = "Submission for job '%s' (id %s, owner %s)." % (self.job.name, self.job.id, self.user)
        if self.oozie_id:
            res += " -- " + self.oozie_id
        return res

    @submit_dryrun
    def run(self, deployment_dir=None):
        """
        Take care of all the actions of submitting a Oozie workflow.
        Returns the oozie job id if all goes well.
        """
        if self.properties and 'oozie.use.system.libpath' not in self.properties:
            self.properties['oozie.use.system.libpath'] = 'true'

        self.oozie_id = self.api.submit_job(self.properties)
        LOG.info("Submitted: %s" % (self,))

        # Coordinators/bundles are only submitted; workflows are also started.
        if self._is_workflow():
            self.api.job_control(self.oozie_id, 'start')
            LOG.info("Started: %s" % (self,))

        return self.oozie_id

    def rerun(self, deployment_dir, fail_nodes=None, skip_nodes=None):
        # Rerun a workflow: only the workflow application path may remain in
        # the properties, so strip any coordinator/bundle paths.
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties.update({'oozie.wf.application.path': deployment_dir})

        if 'oozie.coord.application.path' in self.properties:
            self.properties.pop('oozie.coord.application.path')

        if 'oozie.bundle.application.path' in self.properties:
            self.properties.pop('oozie.bundle.application.path')

        if fail_nodes:
            self.properties.update({'oozie.wf.rerun.failnodes': fail_nodes})
        elif not skip_nodes:
            self.properties.update({'oozie.wf.rerun.failnodes': 'false'})  # Case empty 'skip_nodes' list
        else:
            self.properties.update({'oozie.wf.rerun.skip.nodes': skip_nodes})

        self.api.rerun(self.oozie_id, properties=self.properties)

        LOG.info("Rerun: %s" % (self,))

        return self.oozie_id

    def rerun_coord(self, deployment_dir, params):
        # Rerun selected coordinator actions ('coord-rerun' control command).
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties.update({'oozie.coord.application.path': deployment_dir})

        self.api.job_control(self.oozie_id, action='coord-rerun', properties=self.properties, parameters=params)
        LOG.info("Rerun: %s" % (self,))

        return self.oozie_id

    def update_coord(self):
        # Coordinator definition update requires the v2 Oozie API.
        self.api = get_oozie(self.user, api_version="v2")
        self.api.job_control(self.oozie_id, action='update', properties=self.properties, parameters=None)
        LOG.info("Update: %s" % (self,))

        return self.oozie_id

    def rerun_bundle(self, deployment_dir, params):
        # Rerun selected bundle actions ('bundle-rerun' control command).
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties.update({'oozie.bundle.application.path': deployment_dir})
        self.api.job_control(self.oozie_id, action='bundle-rerun', properties=self.properties, parameters=params)
        LOG.info("Rerun: %s" % (self,))

        return self.oozie_id

    def deploy(self, deployment_dir=None):
        """
        Upload the job definition and any per-action generated files
        (scripts, statements, hive-site.xml, ...) to the deployment
        directory in HDFS. Returns the deployment directory path.
        """
        try:
            if not deployment_dir:
                deployment_dir = self._create_deployment_dir()
        except Exception, ex:
            msg = _("Failed to create deployment directory: %s" % ex)
            LOG.exception(msg)
            raise PopupException(message=msg, detail=str(ex))

        if self.api.security_enabled:
            jt_address = cluster.get_cluster_addr_for_job_submission()
            self._update_properties(jt_address)  # Needed for coordinator deploying workflows with credentials

        if hasattr(self.job, 'nodes'):
            for action in self.job.nodes:
                # Make sure XML is there
                # Don't support more than one level sub-workflow
                if action.data['type'] == 'subworkflow':
                    from oozie.models2 import Workflow
                    workflow = Workflow(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['workflow']))
                    sub_deploy = Submission(self.user, workflow, self.fs, self.jt, self.properties)
                    workspace = sub_deploy.deploy()

                    self.job.override_subworkflow_id(action, workflow.id)  # For displaying the correct graph
                    self.properties['workspace_%s' % workflow.uuid] = workspace  # For pointing to the correct workspace

                elif action.data['type'] == 'altus':
                    service = 'dataeng'  # action.data['properties'].get('script_path')
                    auth_key_id = ALTUS.AUTH_KEY_ID.get()
                    auth_key_secret = ALTUS.AUTH_KEY_SECRET.get().replace('\\n', '\n')
                    shell_script = self._generate_altus_action_script(service=service, auth_key_id=auth_key_id, auth_key_secret=auth_key_secret)
                    self._create_file(deployment_dir, action.data['name'] + '.py', shell_script)
                    # Ship the Altus client library next to the job.
                    self.fs.do_as_user(self.user, self.fs.copyFromLocal, os.path.join(get_desktop_root(), 'core', 'ext-py', 'navoptapi-0.1.0'), self.job.deployment_dir)

                elif action.data['type'] == 'impala' or action.data['type'] == 'impala-document':
                    from oozie.models2 import _get_impala_url
                    from impala.impala_flags import get_ssl_server_certificate

                    if action.data['type'] == 'impala-document':
                        from notebook.models import Notebook
                        if action.data['properties'].get('uuid'):
                            notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
                            statements = notebook.get_str()
                            statements = Template(statements).safe_substitute(**self.properties)
                            script_name = action.data['name'] + '.sql'
                            self._create_file(deployment_dir, script_name, statements)
                    else:
                        script_name = os.path.basename(action.data['properties'].get('script_path'))

                    if self.api.security_enabled:
                        kinit = 'kinit -k -t *.keytab %(user_principal)s' % {
                            'user_principal': self.properties.get('user_principal', action.data['properties'].get('user_principal'))
                        }
                    else:
                        kinit = ''

                    # Wrapper shell script that launches impala-shell on the
                    # uploaded query file, with Kerberos/SSL flags as needed.
                    shell_script = """#!/bin/bash
# Needed to launch impala shell in oozie
export PYTHON_EGG_CACHE=./myeggs
%(kinit)s
impala-shell %(kerberos_option)s %(ssl_option)s -i %(impalad_host)s -f %(query_file)s""" % {
                        'impalad_host': action.data['properties'].get('impalad_host') or _get_impala_url(),
                        'kerberos_option': '-k' if self.api.security_enabled else '',
                        'ssl_option': '--ssl' if get_ssl_server_certificate() else '',
                        'query_file': script_name,
                        'kinit': kinit
                    }

                    self._create_file(deployment_dir, action.data['name'] + '.sh', shell_script)

                elif action.data['type'] == 'hive-document':
                    from notebook.models import Notebook
                    if action.data['properties'].get('uuid'):
                        notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
                        statements = notebook.get_str()
                    else:
                        statements = action.data['properties'].get('statements')

                    if self.properties.get('send_result_path'):
                        # Wrap the statements so results land in the requested
                        # HDFS path as CSV.
                        statements = """
INSERT OVERWRITE DIRECTORY '%s'
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
   "separatorChar" = "\t",
   "quoteChar" = "'",
   "escapeChar" = "\\"
)
STORED AS TEXTFILE
%s""" % (self.properties.get('send_result_path'), '\n\n\n'.join([snippet['statement_raw'] for snippet in notebook.get_data()['snippets']]))

                    if statements is not None:
                        self._create_file(deployment_dir, action.data['name'] + '.sql', statements)

                elif action.data['type'] in ('java-document', 'java', 'mapreduce-document'):
                    if action.data['type'] == 'java-document' or action.data['type'] == 'mapreduce-document':
                        from notebook.models import Notebook
                        notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
                        properties = notebook.get_data()['snippets'][0]['properties']
                    else:
                        properties = action.data['properties']

                    if properties.get('app_jar'):
                        LOG.debug("Adding to oozie.libpath %s" % properties['app_jar'])
                        paths = [properties['app_jar']]
                        if self.properties.get('oozie.libpath'):
                            paths.append(self.properties['oozie.libpath'])
                        self.properties['oozie.libpath'] = ','.join(paths)

                elif action.data['type'] == 'pig-document':
                    from notebook.models import Notebook
                    notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
                    statements = notebook.get_data()['snippets'][0]['statement_raw']

                    self._create_file(deployment_dir, action.data['name'] + '.pig', statements)

                elif action.data['type'] == 'spark' or action.data['type'] == 'spark-document':
                    # Upload hive-site.xml into lib/ unless the action already
                    # ships one in its 'files'.
                    if not [f for f in action.data.get('properties').get('files', []) if f.get('value').endswith('hive-site.xml')]:
                        hive_site_lib = Hdfs.join(deployment_dir + '/lib/', 'hive-site.xml')
                        hive_site_content = get_hive_site_content()
                        if not self.fs.do_as_user(self.user, self.fs.exists, hive_site_lib) and hive_site_content:
                            self.fs.do_as_user(self.user, self.fs.create, hive_site_lib, overwrite=True, permission=0700, data=smart_str(hive_site_content))

        oozie_xml = self.job.to_xml(self.properties)
        self._do_as(self.user.username, self._copy_files, deployment_dir, oozie_xml, self.properties)

        return deployment_dir