def sync_coord_workflow(request, job_id):
  ParametersFormSet = formset_factory(ParameterForm, extra=0)
  job = check_job_access_permission(request, job_id)
  check_job_edition_permission(job, request.user)

  hue_coord = get_history().get_coordinator_from_config(job.conf_dict)
  hue_wf = (hue_coord and hue_coord.workflow) or get_history().get_workflow_from_config(job.conf_dict)
  wf_application_path = job.conf_dict.get('wf_application_path') and Hdfs.urlsplit(job.conf_dict['wf_application_path'])[2] or ''
  coord_application_path = job.conf_dict.get('oozie.coord.application.path') and Hdfs.urlsplit(job.conf_dict['oozie.coord.application.path'])[2] or ''
  properties = hue_coord and hue_coord.properties and dict([(param['name'], param['value']) for param in hue_coord.properties]) or None

  if request.method == 'POST':
    params_form = ParametersFormSet(request.POST)

    if params_form.is_valid():
      mapping = dict([(param['name'], param['value']) for param in params_form.cleaned_data])

      # Update workflow params in coordinator
      hue_coord.clear_workflow_params()
      properties = dict([(param['name'], param['value']) for param in hue_coord.properties])

      # Deploy WF XML
      submission = Submission(user=request.user, job=hue_wf, fs=request.fs, jt=request.jt, properties=properties)
      submission.deploy(deployment_dir=wf_application_path)
      submission._create_file(wf_application_path, hue_wf.XML_FILE_NAME, hue_wf.to_xml(mapping=properties), do_as=True)

      # Deploy Coordinator XML
      job.conf_dict.update(mapping)
      submission = Submission(user=request.user, job=hue_coord, fs=request.fs, jt=request.jt, properties=job.conf_dict, oozie_id=job.id)
      submission._create_file(coord_application_path, hue_coord.XML_FILE_NAME, hue_coord.to_xml(mapping=job.conf_dict), do_as=True)
      # Server picks up deployed Coordinator XML changes after running 'update' action
      submission.update_coord()

      request.info(_('Successfully updated Workflow definition'))
      return redirect(reverse('oozie:list_oozie_coordinator', kwargs={'job_id': job_id}))
    else:
      request.error(_('Invalid submission form: %s' % params_form.errors))
  else:
    new_params = hue_wf and hue_wf.find_all_parameters() or []
    new_params = dict([(param['name'], param['value']) for param in new_params])

    # Set previous values
    if properties:
      new_params = dict([(key, properties[key]) if key in properties.keys() else (key, new_params[key]) for key, value in new_params.iteritems()])

    initial_params = ParameterForm.get_initial_params(new_params)
    params_form = ParametersFormSet(initial=initial_params)

  popup = render('editor2/submit_job_popup.mako', request, {
      'params_form': params_form,
      'name': _('Job'),
      'header': _('Sync Workflow definition?'),
      'action': reverse('oozie:sync_coord_workflow', kwargs={'job_id': job_id})
  }, force_template=True).content
  return JsonResponse(popup, safe=False)
def rename_star(self, old_dir, new_dir):
  """Equivalent to `mv old_dir/* new_dir`"""
  if not self.isdir(old_dir):
    raise IOError(errno.ENOTDIR, _("'%s' is not a directory") % old_dir)
  if not self.exists(new_dir):
    self.mkdir(new_dir)
  elif not self.isdir(new_dir):
    raise IOError(errno.ENOTDIR, _("'%s' is not a directory") % new_dir)
  ls = self.listdir(old_dir)
  for dirent in ls:
    self.rename(Hdfs.join(old_dir, dirent), Hdfs.join(new_dir, dirent))
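# Illustrative usage sketch (assumption: `fs` is a filesystem object exposing the methods in
# these snippets; the paths are hypothetical). rename_star moves every entry of old_dir into
# new_dir, creating new_dir first if it does not exist:
#
#   fs.rename_star('/user/demo/staging', '/user/demo/archive')
#   # each '/user/demo/staging/<entry>' is renamed to '/user/demo/archive/<entry>'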
def rename(self, old, new):
  """rename(old, new)"""
  old = Hdfs.normpath(old)
  if not new.startswith("/"):
    new = Hdfs.join(Hdfs.dirname(old), new)
  new = Hdfs.normpath(new)
  params = self._getparams()
  params["op"] = "RENAME"
  # Encode `new' because it's in the params
  params["destination"] = smart_str(new)
  result = self._root.put(old, params)
  if not result["boolean"]:
    raise IOError("Rename failed: %s -> %s" % (smart_str(old), smart_str(new)))
def rename(self, old, new):
  """rename(old, new)"""
  old = Hdfs.normpath(old)
  if not new.startswith('/'):
    new = Hdfs.join(Hdfs.dirname(old), new)
  new = Hdfs.normpath(new)
  params = self._getparams()
  params['op'] = 'RENAME'
  # Encode `new' because it's in the params
  params['destination'] = smart_str(new)
  result = self._root.put(old, params)
  if not result['boolean']:
    raise IOError(_("Rename failed: %s -> %s") % (str(smart_str(old)), str(smart_str(new))))
def listdir(self, path, glob=None):
  """
  listdir(path, glob=None) -> [ entry names ]

  Get directory entry names without stats.
  """
  dirents = self.listdir_stats(path, glob)
  return [Hdfs.basename(x.path) for x in dirents]
def _create_deployment_dir(self):
  """
  Return the job deployment directory in HDFS, creating it if necessary.
  The actual deployment dir should be 0711 owned by the user
  """
  path = Hdfs.join(REMOTE_DEPLOYMENT_DIR.get(), '_%s_-oozie-%s-%s' % (self.user.username, self.job.id, time.time()))
  self._create_dir(path)
  return path
def get_content_summary(self, path):
  """
  get_content_summary(path) -> WebHdfsContentSummary
  """
  path = Hdfs.normpath(path)
  params = self._getparams()
  params["op"] = "GETCONTENTSUMMARY"
  json = self._root.get(path, params)
  return WebHdfsContentSummary(json["ContentSummary"])
def hdfs_link_js(url):
  link = 'javascript:void(0)'
  if url:
    path = Hdfs.urlsplit(url)[2]
    if path:
      link = ('/filebrowser/view=%s' if path.startswith(posixpath.sep) else '/filebrowser/home_relative_view=/%s') % path
  return link
def get_content_summary(self, path):
  """
  get_content_summary(path) -> WebHdfsContentSummary
  """
  path = Hdfs.normpath(path)
  params = self._getparams()
  params['op'] = 'GETCONTENTSUMMARY'
  json = self._root.get(path, params)
  return WebHdfsContentSummary(json['ContentSummary'])
def parse_breadcrumbs(path):
  breadcrumbs_parts = Hdfs.normpath(path).split('/')
  i = 1
  breadcrumbs = [{'url': '', 'label': '/'}]
  while (i < len(breadcrumbs_parts)):
    breadcrumb_url = breadcrumbs[i - 1]['url'] + '/' + breadcrumbs_parts[i]
    if breadcrumb_url != '/':
      breadcrumbs.append({'url': breadcrumb_url, 'label': breadcrumbs_parts[i]})
    i = i + 1
  return breadcrumbs
def parse_breadcrumbs(path): breadcrumbs_parts = Hdfs.normpath(path).split("/") i = 1 breadcrumbs = [{"url": "", "label": "/"}] while i < len(breadcrumbs_parts): breadcrumb_url = breadcrumbs[i - 1]["url"] + "/" + breadcrumbs_parts[i] if breadcrumb_url != "/": breadcrumbs.append({"url": breadcrumb_url, "label": breadcrumbs_parts[i]}) i = i + 1 return breadcrumbs
def _get_service_url(hdfs_config):
  override = hdfs_config.WEBHDFS_URL.get()
  if override:
    return override

  fs_defaultfs = hdfs_config.FS_DEFAULTFS.get()
  netloc = Hdfs.urlsplit(fs_defaultfs)[1]
  host = netloc.split(':')[0]
  port = hadoop.conf.DEFAULT_NN_HTTP_PORT
  return "http://%s:%s/webhdfs/v1" % (host, port)
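# Illustrative sketch (assumption: DEFAULT_NN_HTTP_PORT is the NameNode HTTP port, e.g. 50070
# on older Hadoop releases; the hostname is hypothetical):
#
#   webhdfs_url override set                          -> returned verbatim
#   fs_defaultfs = 'hdfs://nn-host.example.com:8020'  -> 'http://nn-host.example.com:50070/webhdfs/v1'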
def append(self, path, data):
  """
  append(path, data)

  Append data to a given file.
  """
  path = Hdfs.normpath(path)
  params = self._getparams()
  params['op'] = 'APPEND'
  self._invoke_with_redirect('POST', path, params, data)
def chown(self, path, user=None, group=None):
  """chown(path, user=None, group=None)"""
  path = Hdfs.normpath(path)
  params = self._getparams()
  params['op'] = 'SETOWNER'
  if user is not None:
    params['owner'] = user
  if group is not None:
    params['group'] = group
  self._root.put(path, params)
def append(self, path, data):
  """
  append(path, data)

  Append data to a given file.
  """
  path = Hdfs.normpath(path)
  params = self._getparams()
  params["op"] = "APPEND"
  self._invoke_with_redirect("POST", path, params, data)
def chmod(self, path, mode):
  """
  chmod(path, mode)

  `mode' should be an octal integer or string.
  """
  path = Hdfs.normpath(path)
  params = self._getparams()
  params['op'] = 'SETPERMISSION'
  params['permission'] = safe_octal(mode)
  self._root.put(path, params)
def create_directory(request): parent_path = json.loads(request.POST.get("parent_path")) name = json.loads(request.POST.get("name")) parent_dir = Directory.objects.get(owner=request.user, name=parent_path) path = Hdfs.normpath(parent_path + "/" + name) file_doc = Directory.objects.create(name=path, type="directory", owner=request.user) parent_dir.dependencies.add(file_doc) return JsonResponse({"status": 0, "file": file_doc.to_dict()})
def _create_deployment_dir(self):
  """
  Return the job deployment directory in HDFS, creating it if necessary.
  The actual deployment dir should be 0711 owned by the user
  """
  if self.user != self.job.owner:
    path = Hdfs.join(REMOTE_DEPLOYMENT_DIR.get(), '_%s_-oozie-%s-%s' % (self.user.username, self.job.id, time.time()))
    self.fs.copy_remote_dir(self.job.deployment_dir, path, owner=self.user, dir_mode=0711)
  else:
    path = self.job.deployment_dir
    self._create_dir(path)
  return path
def listdir(request, path, chooser):
  """
  Implements directory listing (or index).
  Intended to be called via view().
  TODO: Remove?
  """
  if not request.fs.isdir(path):
    raise PopupException(_("Not a directory: %(path)s") % {'path': path})

  file_filter = request.REQUEST.get('file_filter', 'any')

  assert file_filter in ['any', 'file', 'dir']

  home_dir_path = request.user.get_home_directory()
  breadcrumbs = parse_breadcrumbs(path)

  data = {
    'path': path,
    'file_filter': file_filter,
    'breadcrumbs': breadcrumbs,
    'current_dir_path': path,
    # These could also be put in automatically via
    # http://docs.djangoproject.com/en/dev/ref/templates/api/#django-core-context-processors-request,
    # but manually seems cleaner, since we only need it here.
    'current_request_path': request.path,
    'home_directory': request.fs.isdir(home_dir_path) and home_dir_path or None,
    'cwd_set': True,
    'is_superuser': request.user.username == request.fs.superuser,
    'groups': request.user.username == request.fs.superuser and [str(x) for x in Group.objects.values_list('name', flat=True)] or [],
    'users': request.user.username == request.fs.superuser and [str(x) for x in User.objects.values_list('username', flat=True)] or [],
    'superuser': request.fs.superuser,
    'show_upload': (request.REQUEST.get('show_upload') == 'false' and (False,) or (True,))[0]
  }

  stats = request.fs.listdir_stats(path)

  # Include parent dir, unless at filesystem root.
  if Hdfs.normpath(path) != posixpath.sep:
    parent_path = request.fs.join(path, "..")
    parent_stat = request.fs.stats(parent_path)
    # The 'path' field would be absolute, but we want its basename to be
    # actually '..' for display purposes. Encode it since _massage_stats expects byte strings.
    parent_stat['path'] = parent_path
    stats.insert(0, parent_stat)

  data['files'] = [_massage_stats(request, stat) for stat in stats]
  if chooser:
    return render('chooser.mako', request, data)
  else:
    return render('listdir.mako', request, data)
def hdfs_link(url):
  if url:
    path = Hdfs.urlsplit(url)[2]
    if path:
      if path.startswith(posixpath.sep):
        return "/filebrowser/view=" + path
      else:
        return "/filebrowser/home_relative_view=/" + path
    else:
      return url
  else:
    return url
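# Illustrative sketch (assumption: Hdfs.urlsplit returns a urlsplit-style 5-tuple whose third
# element is the path; the URLs are hypothetical):
#
#   hdfs_link('hdfs://namenode:8020/user/demo/logs')  # -> '/filebrowser/view=/user/demo/logs'
#   hdfs_link('demo/logs')                            # -> '/filebrowser/home_relative_view=/demo/logs'
#   hdfs_link('')                                     # -> '' (falsy URLs are returned unchanged)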
def _stats(self, path):
  """This version of stats returns None if the entry is not found"""
  path = Hdfs.normpath(path)
  params = self._getparams()
  params['op'] = 'GETFILESTATUS'
  try:
    json = self._root.get(path, params)
    return WebHdfsStat(json['FileStatus'], path)
  except WebHdfsException, ex:
    if ex.server_exc == 'FileNotFoundException' or ex.code == 404:
      return None
    raise ex
def create_directories(fs, directory_list=[]):
  # If needed, create the remote home, deployment and data directories
  directories = [REMOTE_DEPLOYMENT_DIR.get()] + directory_list

  for directory in directories:
    if not fs.do_as_user(fs.DEFAULT_USER, fs.exists, directory):
      remote_home_dir = Hdfs.join('/user', fs.DEFAULT_USER)
      if directory.startswith(remote_home_dir):
        # Home is 755
        fs.do_as_user(fs.DEFAULT_USER, fs.create_home_dir, remote_home_dir)
      # Shared by all the users
      fs.do_as_user(fs.DEFAULT_USER, fs.mkdir, directory, 01777)
      fs.do_as_user(fs.DEFAULT_USER, fs.chmod, directory, 01777)  # To remove after https://issues.apache.org/jira/browse/HDFS-3491
def check_access(self, path, aclspec='rw-'):
  path = Hdfs.normpath(path)
  params = self._getparams()
  params['op'] = 'CHECKACCESS'
  params['fsaction'] = aclspec
  try:
    return self._root.get(path, params)
  except WebHdfsException, ex:
    if ex.code == 500 or ex.code == 400:
      LOG.warn('Failed to check access to path %s, CHECKACCESS operation may not be supported.' % path)
      return None
    else:
      raise ex
def mkdir(self, path, mode=None):
  """
  mkdir(path, mode=None)

  Creates a directory and any parent directory if necessary.
  """
  path = Hdfs.normpath(path)
  params = self._getparams()
  params['op'] = 'MKDIRS'
  if mode is not None:
    params['permission'] = safe_octal(mode)
  success = self._root.put(path, params)
  if not success:
    raise IOError(_("Mkdir failed: %s") % path)
def listdir_stats(self, path, glob=None):
  """
  listdir_stats(path, glob=None) -> [ WebHdfsStat ]

  Get directory listing with stats.
  """
  path = Hdfs.normpath(path)
  params = self._getparams()
  if glob is not None:
    params['filter'] = glob
  params['op'] = 'LISTSTATUS'
  json = self._root.get(path, params)
  filestatus_list = json['FileStatuses']['FileStatus']
  return [ WebHdfsStat(st, path) for st in filestatus_list ]
def mkdir(self, path, mode=None):
  """
  mkdir(path, mode=None)

  Creates a directory and any parent directory if necessary.
  """
  path = Hdfs.normpath(path)
  params = self._getparams()
  params["op"] = "MKDIRS"
  if mode is not None:
    params["permission"] = safe_octal(mode)
  success = self._root.put(path, params)
  if not success:
    raise IOError("Mkdir failed: %s" % (smart_str(path),))
def chown(self, path, user=None, group=None, recursive=False):
  """chown(path, user=None, group=None, recursive=False)"""
  path = Hdfs.normpath(path)
  params = self._getparams()
  params['op'] = 'SETOWNER'
  if user is not None:
    params['owner'] = user
  if group is not None:
    params['group'] = group
  if recursive:
    for xpath in self.listdir_recursive(path):
      self._root.put(xpath, params)
  else:
    self._root.put(path, params)
def listdir_stats(self, path, glob=None):
  """
  listdir_stats(path, glob=None) -> [ WebHdfsStat ]

  Get directory listing with stats.
  """
  path = Hdfs.normpath(path)
  params = self._getparams()
  if glob is not None:
    params["filter"] = glob
  params["op"] = "LISTSTATUS"
  json = self._root.get(path, params)
  filestatus_list = json["FileStatuses"]["FileStatus"]
  return [WebHdfsStat(st, path) for st in filestatus_list]
def chown(self, path, user=None, group=None, recursive=False):
  """chown(path, user=None, group=None, recursive=False)"""
  path = Hdfs.normpath(path)
  params = self._getparams()
  params["op"] = "SETOWNER"
  if user is not None:
    params["owner"] = user
  if group is not None:
    params["group"] = group
  if recursive:
    for xpath in self._listdir_r(path):
      self._root.put(xpath, params)
  else:
    self._root.put(path, params)
def create_directory(request):
  parent_path = json.loads(request.POST.get('parent_path'))
  name = json.loads(request.POST.get('name'))

  parent_dir = Directory.objects.get(owner=request.user, name=parent_path)

  path = Hdfs.normpath(parent_path + '/' + name)
  file_doc = Directory.objects.create(name=path, owner=request.user)
  parent_dir.dependencies.add(file_doc)

  return JsonResponse({'status': 0, 'file': file_doc.to_dict()})
def strip_normpath(self, path):
  split = urlparse(path)
  path = split._replace(scheme="", netloc="").geturl()
  return Hdfs.normpath(path)
class Submission(object):
  """
  Represents one unique Oozie submission.

  Actions are:
  - submit
  - rerun
  """
  def __init__(self, user, job=None, fs=None, jt=None, properties=None, oozie_id=None, local_tz=None):
    self.job = job
    self.user = user
    self.fs = fs
    self.jt = jt  # Deprecated with YARN, we now use logical names only for RM
    self.oozie_id = oozie_id
    self.api = get_oozie(self.user)

    if properties is not None:
      self.properties = properties
    else:
      self.properties = {}

    if local_tz and isinstance(self.job.data, dict):
      local_tz = self.job.data.get('properties')['timezone']

    # Modify start_date & end_date only when it's a coordinator
    from oozie.models2 import Coordinator
    if type(self.job) is Coordinator:
      if 'start_date' in self.properties:
        properties['start_date'] = convert_to_server_timezone(self.properties['start_date'], local_tz)
      if 'end_date' in self.properties:
        properties['end_date'] = convert_to_server_timezone(self.properties['end_date'], local_tz)

    if 'nominal_time' in self.properties:
      properties['nominal_time'] = convert_to_server_timezone(self.properties['nominal_time'], local_tz)

    self.properties['security_enabled'] = self.api.security_enabled

  def __str__(self):
    if self.oozie_id:
      res = "Submission for job '%s'." % (self.oozie_id,)
    else:
      res = "Submission for job '%s' (id %s, owner %s)." % (self.job.name, self.job.id, self.user)
    if self.oozie_id:
      res += " -- " + self.oozie_id
    return res

  @submit_dryrun
  def run(self, deployment_dir=None):
    """
    Take care of all the actions of submitting a Oozie workflow.
    Returns the oozie job id if all goes well.
    """
    if self.properties and 'oozie.use.system.libpath' not in self.properties:
      self.properties['oozie.use.system.libpath'] = 'true'

    self.oozie_id = self.api.submit_job(self.properties)
    LOG.info("Submitted: %s" % (self,))

    if self._is_workflow():
      self.api.job_control(self.oozie_id, 'start')
      LOG.info("Started: %s" % (self,))

    return self.oozie_id

  def rerun(self, deployment_dir, fail_nodes=None, skip_nodes=None):
    jt_address = cluster.get_cluster_addr_for_job_submission()

    self._update_properties(jt_address, deployment_dir)
    self.properties.update({'oozie.wf.application.path': deployment_dir})

    if 'oozie.coord.application.path' in self.properties:
      self.properties.pop('oozie.coord.application.path')

    if 'oozie.bundle.application.path' in self.properties:
      self.properties.pop('oozie.bundle.application.path')

    if fail_nodes:
      self.properties.update({'oozie.wf.rerun.failnodes': fail_nodes})
    elif not skip_nodes:
      self.properties.update({'oozie.wf.rerun.failnodes': 'false'})  # Case empty 'skip_nodes' list
    else:
      self.properties.update({'oozie.wf.rerun.skip.nodes': skip_nodes})

    self.api.rerun(self.oozie_id, properties=self.properties)
    LOG.info("Rerun: %s" % (self,))

    return self.oozie_id

  def rerun_coord(self, deployment_dir, params):
    jt_address = cluster.get_cluster_addr_for_job_submission()

    self._update_properties(jt_address, deployment_dir)
    self.properties.update({'oozie.coord.application.path': deployment_dir})

    self.api.job_control(self.oozie_id, action='coord-rerun', properties=self.properties, parameters=params)
    LOG.info("Rerun: %s" % (self,))

    return self.oozie_id

  def update_coord(self):
    self.api = get_oozie(self.user, api_version="v2")
    self.api.job_control(self.oozie_id, action='update', properties=self.properties, parameters=None)
    LOG.info("Update: %s" % (self,))

    return self.oozie_id

  def rerun_bundle(self, deployment_dir, params):
    jt_address = cluster.get_cluster_addr_for_job_submission()

    self._update_properties(jt_address, deployment_dir)
    self.properties.update({'oozie.bundle.application.path': deployment_dir})
    self.api.job_control(self.oozie_id, action='bundle-rerun', properties=self.properties, parameters=params)
    LOG.info("Rerun: %s" % (self,))

    return self.oozie_id

  def deploy(self, deployment_dir=None):
    try:
      if not deployment_dir:
        deployment_dir = self._create_deployment_dir()
    except Exception, ex:
      msg = _("Failed to create deployment directory: %s" % ex)
      LOG.exception(msg)
      raise PopupException(message=msg, detail=str(ex))

    if self.api.security_enabled:
      jt_address = cluster.get_cluster_addr_for_job_submission()
      self._update_properties(jt_address)  # Needed for coordinator deploying workflows with credentials

    if hasattr(self.job, 'nodes'):
      for action in self.job.nodes:
        # Make sure XML is there
        # Don't support more than one level sub-workflow
        if action.data['type'] == 'subworkflow':
          from oozie.models2 import Workflow
          workflow = Workflow(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['workflow']))
          sub_deploy = Submission(self.user, workflow, self.fs, self.jt, self.properties)
          workspace = sub_deploy.deploy()

          self.job.override_subworkflow_id(action, workflow.id)  # For displaying the correct graph
          self.properties['workspace_%s' % workflow.uuid] = workspace  # For pointing to the correct workspace

        elif action.data['type'] == 'altus':
          service = 'dataeng'  # action.data['properties'].get('script_path')
          auth_key_id = ALTUS.AUTH_KEY_ID.get()
          auth_key_secret = ALTUS.AUTH_KEY_SECRET.get().replace('\\n', '\n')

          shell_script = self._generate_altus_action_script(
              service=service,
              auth_key_id=auth_key_id,
              auth_key_secret=auth_key_secret
          )
          self._create_file(deployment_dir, action.data['name'] + '.py', shell_script)

          self.fs.do_as_user(
              self.user,
              self.fs.copyFromLocal,
              os.path.join(get_desktop_root(), 'core', 'ext-py', 'navoptapi-0.1.0'),
              self.job.deployment_dir
          )

        elif action.data['type'] == 'impala' or action.data['type'] == 'impala-document':
          from oozie.models2 import _get_impala_url
          from impala.impala_flags import get_ssl_server_certificate

          if action.data['type'] == 'impala-document':
            from notebook.models import Notebook
            if action.data['properties'].get('uuid'):
              notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
              statements = notebook.get_str()
              statements = Template(statements).safe_substitute(**self.properties)
              script_name = action.data['name'] + '.sql'
              self._create_file(deployment_dir, script_name, statements)
          else:
            script_name = os.path.basename(action.data['properties'].get('script_path'))

          if self.api.security_enabled:
            kinit = 'kinit -k -t *.keytab %(user_principal)s' % {
              'user_principal': self.properties.get('user_principal', action.data['properties'].get('user_principal'))
            }
          else:
            kinit = ''

          shell_script = """#!/bin/bash

# Needed to launch impala shell in oozie
export PYTHON_EGG_CACHE=./myeggs

%(kinit)s

impala-shell %(kerberos_option)s %(ssl_option)s -i %(impalad_host)s -f %(query_file)s""" % {
            'impalad_host': action.data['properties'].get('impalad_host') or _get_impala_url(),
            'kerberos_option': '-k' if self.api.security_enabled else '',
            'ssl_option': '--ssl' if get_ssl_server_certificate() else '',
            'query_file': script_name,
            'kinit': kinit
          }

          self._create_file(deployment_dir, action.data['name'] + '.sh', shell_script)

        elif action.data['type'] == 'hive-document':
          from notebook.models import Notebook
          if action.data['properties'].get('uuid'):
            notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
            statements = notebook.get_str()
          else:
            statements = action.data['properties'].get('statements')

          if self.properties.get('send_result_path'):
            statements = """
INSERT OVERWRITE DIRECTORY '%s'
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
   "separatorChar" = "\t",
   "quoteChar" = "'",
   "escapeChar" = "\\"
)
STORED AS TEXTFILE %s""" % (self.properties.get('send_result_path'), '\n\n\n'.join([snippet['statement_raw'] for snippet in notebook.get_data()['snippets']]))

          if statements is not None:
            self._create_file(deployment_dir, action.data['name'] + '.sql', statements)

        elif action.data['type'] in ('java-document', 'java', 'mapreduce-document'):
          if action.data['type'] == 'java-document' or action.data['type'] == 'mapreduce-document':
            from notebook.models import Notebook
            notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
            properties = notebook.get_data()['snippets'][0]['properties']
          else:
            properties = action.data['properties']

          if properties.get('app_jar'):
            LOG.debug("Adding to oozie.libpath %s" % properties['app_jar'])
            paths = [properties['app_jar']]
            if self.properties.get('oozie.libpath'):
              paths.append(self.properties['oozie.libpath'])
            self.properties['oozie.libpath'] = ','.join(paths)

        elif action.data['type'] == 'pig-document':
          from notebook.models import Notebook
          notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
          statements = notebook.get_data()['snippets'][0]['statement_raw']

          self._create_file(deployment_dir, action.data['name'] + '.pig', statements)

        elif action.data['type'] == 'spark' or action.data['type'] == 'spark-document':
          if not [f for f in action.data.get('properties').get('files', []) if f.get('value').endswith('hive-site.xml')]:
            hive_site_lib = Hdfs.join(deployment_dir + '/lib/', 'hive-site.xml')
            hive_site_content = get_hive_site_content()
            if not self.fs.do_as_user(self.user, self.fs.exists, hive_site_lib) and hive_site_content:
              self.fs.do_as_user(self.user, self.fs.create, hive_site_lib, overwrite=True, permission=0700, data=smart_str(hive_site_content))

    oozie_xml = self.job.to_xml(self.properties)
    self._do_as(self.user.username, self._copy_files, deployment_dir, oozie_xml, self.properties)

    return deployment_dir
def get_acl_status(self, path):
  path = Hdfs.normpath(path)
  params = self._getparams()
  params['op'] = 'GETACLSTATUS'
  return self._root.get(path, params)
def listdir_paged(request, path):
  """
  A paginated version of listdir.

  Query parameters:
    pagenum           - The page number to show. Defaults to 1.
    pagesize          - How many to show on a page. Defaults to 30.
    sortby=?          - Specify attribute to sort by. Accepts:
                          (type, name, atime, mtime, size, user, group)
                        Defaults to name.
    descending        - Specify a descending sort order. Default to false.
    filter=?          - Specify a substring filter to search for in the filename field.
  """
  if not request.fs.isdir(path):
    raise PopupException("Not a directory: %s" % (path,))

  pagenum = int(request.GET.get('pagenum', 1))
  pagesize = int(request.GET.get('pagesize', 30))

  home_dir_path = request.user.get_home_directory()
  breadcrumbs = parse_breadcrumbs(path)

  all_stats = request.fs.listdir_stats(path)

  # Filter first
  filter_str = request.GET.get('filter', None)
  if filter_str:
    filtered_stats = filter(lambda sb: filter_str in sb['name'], all_stats)
    all_stats = filtered_stats

  # Sort next
  sortby = request.GET.get('sortby', None)
  descending_param = request.GET.get('descending', None)
  if sortby is not None:
    if sortby not in ('type', 'name', 'atime', 'mtime', 'user', 'group', 'size'):
      logger.info("Invalid sort attribute '%s' for listdir." % (sortby,))
    else:
      all_stats = sorted(all_stats, key=operator.attrgetter(sortby), reverse=coerce_bool(descending_param))

  # Do pagination
  page = paginator.Paginator(all_stats, pagesize).page(pagenum)
  shown_stats = page.object_list

  # Include parent dir always as first option, unless at filesystem root.
  if Hdfs.normpath(path) != posixpath.sep:
    parent_path = request.fs.join(path, "..")
    parent_stat = request.fs.stats(parent_path)
    # The 'path' field would be absolute, but we want its basename to be
    # actually '..' for display purposes. Encode it since _massage_stats expects byte strings.
    parent_stat['path'] = parent_path
    parent_stat['name'] = ".."
    shown_stats.insert(0, parent_stat)

  page.object_list = [_massage_stats(request, s) for s in shown_stats]

  data = {
    'path': path,
    'breadcrumbs': breadcrumbs,
    'current_request_path': request.path,
    'files': page.object_list,
    'page': _massage_page(page),
    'pagesize': pagesize,
    'home_directory': request.fs.isdir(home_dir_path) and home_dir_path or None,
    'filter_str': filter_str,
    'sortby': sortby,
    'descending': descending_param,
    # The following should probably be deprecated
    'cwd_set': True,
    'file_filter': 'any',
    'current_dir_path': path,
    'is_fs_superuser': request.user.username == request.fs.superuser,
    'is_superuser': request.user.username == request.fs.superuser,
    'groups': request.user.username == request.fs.superuser and [str(x) for x in Group.objects.values_list('name', flat=True)] or [],
    'users': request.user.username == request.fs.superuser and [str(x) for x in User.objects.values_list('username', flat=True)] or [],
    'superuser': request.fs.superuser
  }
  return render('listdir.mako', request, data)
def config_validator(user):
  """
  config_validator() -> [ (config_variable, error_message) ]

  Called by core check_config() view.
  """
  from hadoop.cluster import get_all_hdfs
  from hadoop.fs.hadoopfs import Hdfs
  from liboozie.oozie_api import get_oozie

  res = []

  if OOZIE_URL.get():
    status = get_oozie_status(user)
    if 'NORMAL' not in status:
      res.append((status, _('The Oozie server is not available')))

    api = get_oozie(user, api_version="v2")
    configuration = api.get_configuration()
    if 'org.apache.oozie.service.MetricsInstrumentationService' in [c.strip() for c in configuration.get('oozie.services.ext', '').split(',')]:
      metrics = api.get_metrics()
      sharelib_url = 'gauges' in metrics and 'libs.sharelib.system.libpath' in metrics['gauges'] and [metrics['gauges']['libs.sharelib.system.libpath']['value']] or []
    else:
      instrumentation = api.get_instrumentation()
      sharelib_url = [param['value'] for group in instrumentation['variables'] for param in group['data'] if param['name'] == 'sharelib.system.libpath']

    if sharelib_url:
      sharelib_url = Hdfs.urlsplit(sharelib_url[0])[2]

    if not sharelib_url:
      res.append((status, _('Oozie Share Lib path is not available')))

    class ConfigMock:
      def __init__(self, value):
        self.value = value

      def get(self):
        return self.value

      def get_fully_qualifying_key(self):
        return self.value

    for cluster in get_all_hdfs().values():
      res.extend(validate_path(ConfigMock(sharelib_url), is_dir=True, fs=cluster, message=_('Oozie Share Lib not installed in default location.')))

  return res
def remove_acl_entries(self, path, aclspec):
  path = Hdfs.normpath(path)
  params = self._getparams()
  params['op'] = 'REMOVEACLENTRIES'
  params['aclspec'] = aclspec
  return self._root.put(path, params)
def modify_acl_entries(self, path, aclspec):
  path = Hdfs.normpath(path)
  params = self._getparams()
  params['op'] = 'MODIFYACLENTRIES'
  params['aclspec'] = aclspec
  return self._root.put(path, params)
def config_validator(user):
  """
  config_validator() -> [ (config_variable, error_message) ]

  Called by core check_config() view.
  """
  from desktop.lib.fsmanager import get_filesystem
  from hadoop.cluster import get_all_hdfs
  from hadoop.fs.hadoopfs import Hdfs
  from liboozie.oozie_api import get_oozie

  res = []

  try:
    from oozie.conf import REMOTE_SAMPLE_DIR
  except Exception as e:
    LOG.warn('Config check failed because Oozie app not installed: %s' % e)
    return res

  if OOZIE_URL.get():
    status = get_oozie_status(user)
    if 'NORMAL' not in status:
      res.append((status, _('The Oozie server is not available')))

    fs = get_filesystem()
    NICE_NAME = 'Oozie'
    if fs.do_as_superuser(fs.exists, REMOTE_SAMPLE_DIR.get()):
      stats = fs.do_as_superuser(fs.stats, REMOTE_SAMPLE_DIR.get())
      mode = oct(stats.mode)
      # if neither group nor others have write permission
      group_has_write = int(mode[-2]) & 2
      others_has_write = int(mode[-1]) & 2

      if not group_has_write and not others_has_write:
        res.append((NICE_NAME, "The permissions of workspace '%s' are too restrictive" % REMOTE_SAMPLE_DIR.get()))

    api = get_oozie(user, api_version="v2")

    configuration = api.get_configuration()
    if 'org.apache.oozie.service.MetricsInstrumentationService' in [c.strip() for c in configuration.get('oozie.services.ext', '').split(',')]:
      metrics = api.get_metrics()
      sharelib_url = 'gauges' in metrics and 'libs.sharelib.system.libpath' in metrics['gauges'] and [metrics['gauges']['libs.sharelib.system.libpath']['value']] or []
    else:
      instrumentation = api.get_instrumentation()
      sharelib_url = [param['value'] for group in instrumentation['variables'] for param in group['data'] if param['name'] == 'sharelib.system.libpath']

    if sharelib_url:
      sharelib_url = Hdfs.urlsplit(sharelib_url[0])[2]

    if not sharelib_url:
      res.append((status, _('Oozie Share Lib path is not available')))

    class ConfigMock(object):
      def __init__(self, value):
        self.value = value

      def get(self):
        return self.value

      def get_fully_qualifying_key(self):
        return self.value

    for cluster in list(get_all_hdfs().values()):
      res.extend(validate_path(ConfigMock(sharelib_url), is_dir=True, fs=cluster, message=_('Oozie Share Lib not installed in default location.')))

  return res
def listdir(request, path, chooser):
  """
  Implements directory listing (or index).
  Intended to be called via view().
  """
  if not request.fs.isdir(path):
    raise PopupException(_("Not a directory: %(path)s") % {'path': path})

  file_filter = request.REQUEST.get('file_filter', 'any')

  assert file_filter in ['any', 'file', 'dir']

  home_dir_path = request.user.get_home_directory()
  breadcrumbs = parse_breadcrumbs(path)

  data = {
    'path': path,
    'file_filter': file_filter,
    'breadcrumbs': breadcrumbs,
    'current_dir_path': path,
    # These could also be put in automatically via
    # http://docs.djangoproject.com/en/dev/ref/templates/api/#django-core-context-processors-request,
    # but manually seems cleaner, since we only need it here.
    'current_request_path': request.path,
    'home_directory': request.fs.isdir(home_dir_path) and home_dir_path or None,
    'cwd_set': True,
    'is_superuser': request.user.username == request.fs.superuser,
    'groups': request.user.username == request.fs.superuser and [str(x) for x in Group.objects.values_list('name', flat=True)] or [],
    'users': request.user.username == request.fs.superuser and [str(x) for x in User.objects.values_list('username', flat=True)] or [],
    'superuser': request.fs.superuser,
    'show_upload': (request.REQUEST.get('show_upload') == 'false' and (False,) or (True,))[0]
  }

  stats = request.fs.listdir_stats(path)

  # Include parent dir, unless at filesystem root.
  if Hdfs.normpath(path) != posixpath.sep:
    parent_path = request.fs.join(path, "..")
    parent_stat = request.fs.stats(parent_path)
    # The 'path' field would be absolute, but we want its basename to be
    # actually '..' for display purposes. Encode it since _massage_stats expects byte strings.
    parent_stat['path'] = parent_path
    stats.insert(0, parent_stat)

  data['files'] = [_massage_stats(request, stat) for stat in stats]
  if chooser:
    return render('chooser.mako', request, data)
  else:
    return render('listdir.mako', request, data)
def sync_coord_workflow(request, job_id):
  ParametersFormSet = formset_factory(ParameterForm, extra=0)
  job = check_job_access_permission(request, job_id)
  check_job_edition_permission(job, request.user)

  hue_coord = get_history().get_coordinator_from_config(job.conf_dict)
  hue_wf = (hue_coord and hue_coord.workflow) or get_history().get_workflow_from_config(job.conf_dict)
  wf_application_path = job.conf_dict.get('wf_application_path') and Hdfs.urlsplit(job.conf_dict['wf_application_path'])[2] or ''
  coord_application_path = job.conf_dict.get('oozie.coord.application.path') and Hdfs.urlsplit(job.conf_dict['oozie.coord.application.path'])[2] or ''
  properties = hue_coord and hue_coord.properties and dict([(param['name'], param['value']) for param in hue_coord.properties]) or None

  if request.method == 'POST':
    response = {'status': -1, 'message': ''}
    params_form = ParametersFormSet(request.POST)

    if params_form.is_valid():
      try:
        mapping = dict([(param['name'], param['value']) for param in params_form.cleaned_data])

        # Update workflow params in coordinator
        hue_coord.clear_workflow_params()
        properties = dict([(param['name'], param['value']) for param in hue_coord.properties])

        # Deploy WF XML
        submission = Submission(user=request.user, job=hue_wf, fs=request.fs, jt=request.jt, properties=properties)
        submission.deploy(deployment_dir=wf_application_path)
        submission._create_file(wf_application_path, hue_wf.XML_FILE_NAME, hue_wf.to_xml(mapping=properties), do_as=True)

        # Deploy Coordinator XML
        job.conf_dict.update(mapping)
        submission = Submission(user=request.user, job=hue_coord, fs=request.fs, jt=request.jt, properties=job.conf_dict, oozie_id=job.id)
        submission._create_file(coord_application_path, hue_coord.XML_FILE_NAME, hue_coord.to_xml(mapping=job.conf_dict), do_as=True)
        # Server picks up deployed Coordinator XML changes after running 'update' action
        submission.update_coord()

        response['status'] = 0
        response['message'] = _('Successfully updated Workflow definition')
      except Exception as e:
        response['message'] = str(e)
    else:
      response['message'] = _('Invalid submission form: %s' % params_form.errors)

    return JsonResponse(response)
  else:
    new_params = hue_wf and hue_wf.find_all_parameters() or []
    new_params = dict([(param['name'], param['value']) for param in new_params])

    # Set previous values
    if properties:
      new_params = dict([(key, properties[key]) if key in list(properties.keys()) else (key, new_params[key]) for key, value in new_params.items()])

    initial_params = ParameterForm.get_initial_params(new_params)
    params_form = ParametersFormSet(initial=initial_params)

    popup = render('/scheduler/submit_job_popup.mako', request, {
        'params_form': params_form,
        'name': _('Job'),
        'header': _('Sync Workflow definition?'),
        'action': reverse('oozie:sync_coord_workflow', kwargs={'job_id': job_id})
    }, force_template=True).content
    if not isinstance(popup, str):
      popup = popup.decode('utf-8')
    return JsonResponse(popup, safe=False)
def urlsplit(url):
  return Hdfs.urlsplit(url)
def create_table_from_a_file(self, source, destination, start_time=-1, file_encoding=None):
  if '.' in destination['name']:
    database, table_name = destination['name'].split('.', 1)
  else:
    database = 'default'
    table_name = destination['name']
  final_table_name = table_name

  table_format = destination['tableFormat']
  source_type = source['sourceType']

  columns = destination['columns']
  partition_columns = destination['partitionColumns']
  kudu_partition_columns = destination['kuduPartitionColumns']
  comment = destination['description']

  source_path = urllib_unquote(source['path'])
  load_data = destination['importData']
  external = not destination['useDefaultLocation']
  external_path = urllib_unquote(destination['nonDefaultLocation'])

  editor_type = destination['sourceType']
  is_transactional = destination['isTransactional']
  default_transactional_type = 'insert_only' if destination['isInsertOnly'] else 'default'

  skip_header = destination['hasHeader']

  primary_keys = destination['primaryKeys']

  if destination['useCustomDelimiters']:
    field_delimiter = destination['customFieldDelimiter']
    collection_delimiter = destination['customCollectionDelimiter'] or None
    map_delimiter = destination['customMapDelimiter'] or None
  else:
    field_delimiter = ','
    collection_delimiter = r'\002'
    map_delimiter = r'\003'

  regexp_delimiter = destination['customRegexp']

  file_format = 'TextFile'
  row_format = 'Delimited'
  serde_name = ''
  serde_properties = ''
  extra_create_properties = ''
  sql = ''

  if source['inputFormat'] == 'manual':
    load_data = False
    source['format'] = {'quoteChar': '"', 'fieldSeparator': ','}

  if table_format == 'json':
    row_format = 'serde'
    serde_name = 'org.apache.hive.hcatalog.data.JsonSerDe'
  elif table_format == 'regexp':
    row_format = 'serde'
    serde_name = 'org.apache.hadoop.hive.serde2.RegexSerDe'
    serde_properties = '"input.regex" = "%s"' % regexp_delimiter
  elif table_format == 'csv':
    if source['format']['quoteChar'] == '"':
      source['format']['quoteChar'] = '\\"'
    row_format = 'serde'
    serde_name = 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
    serde_properties = '''"separatorChar" = "%(fieldSeparator)s",
    "quoteChar" = "%(quoteChar)s",
    "escapeChar" = "\\\\"
    ''' % source['format']

  use_temp_table = table_format in ('parquet', 'orc', 'kudu') or is_transactional
  if use_temp_table:  # We'll be using a temp table to load data
    if load_data:
      table_name, final_table_name = 'hue__tmp_%s' % table_name, table_name

      sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % {
        'database': database,
        'table_name': table_name
      }
    else:  # Manual
      row_format = ''
      file_format = table_format
      skip_header = False
      if table_format == 'kudu':
        columns = [col for col in columns if col['name'] in primary_keys] + [col for col in columns if col['name'] not in primary_keys]

  if table_format == 'kudu':
    collection_delimiter = None
    map_delimiter = None

  if external or (load_data and table_format in ('parquet', 'orc', 'kudu')):  # We'll use location to load data
    if not self.fs.isdir(external_path):  # File selected
      external_path, external_file_name = Hdfs.split(external_path)

      if len(self.fs.listdir(external_path)) > 1:
        # If dir not just the file, create data dir and move file there. Make sure it's unique.
        external_path = external_path + '/%s%s_table' % (external_file_name, str(uuid.uuid4()))
        self.fs.mkdir(external_path)
        self.fs.rename(source_path, external_path)
  elif load_data:  # We'll use load data command
    parent_path = self.fs.parent_path(source_path)
    stats = self.fs.stats(parent_path)
    split = urlparse(source_path)
    # Only for HDFS, import data and non-external table
    if split.scheme in ('', 'hdfs') and oct(stats["mode"])[-1] != '7':
      user_scratch_dir = self.fs.get_home_dir() + '/.scratchdir/%s' % str(uuid.uuid4())  # Make sure it's unique.
      self.fs.do_as_user(self.user, self.fs.mkdir, user_scratch_dir, 0o0777)
      self.fs.do_as_user(self.user, self.fs.rename, source['path'], user_scratch_dir)
      source_path = user_scratch_dir + '/' + source['path'].split('/')[-1]

  if external_path.lower().startswith("abfs"):  # This is to check if its using an ABFS path
    external_path = abfspath(external_path)

  tbl_properties = OrderedDict()
  if skip_header:
    tbl_properties['skip.header.line.count'] = '1'
  # The temp table is not transactional, but final table can be if is_transactional.
  # tbl_properties that don't exist in previous versions can safely be added without error.
  tbl_properties['transactional'] = 'false'

  sql += django_mako.render_to_string("gen/create_table_statement.mako", {
    'table': {
      'name': table_name,
      'comment': comment,
      'row_format': row_format,
      'field_terminator': field_delimiter,
      'collection_terminator': collection_delimiter if source_type == 'hive' else None,
      'map_key_terminator': map_delimiter if source_type == 'hive' else None,
      'serde_name': serde_name,
      'serde_properties': serde_properties,
      'file_format': file_format,
      'external': external or load_data and table_format in ('parquet', 'orc', 'kudu'),
      'path': external_path,
      'primary_keys': primary_keys if table_format == 'kudu' and not load_data else [],
      'tbl_properties': tbl_properties
    },
    'columns': columns,
    'partition_columns': partition_columns,
    'kudu_partition_columns': kudu_partition_columns,
    'database': database
  })

  if file_encoding and file_encoding != 'ASCII' and file_encoding != 'utf-8' and not use_temp_table:
    sql += '\n\nALTER TABLE `%(database)s`.`%(final_table_name)s` ' \
           'SET serdeproperties ("serialization.encoding"="%(file_encoding)s");' % {
             'database': database,
             'final_table_name': final_table_name,
             'file_encoding': file_encoding
           }

  if table_format in ('text', 'json', 'csv', 'regexp') and not external and load_data:
    form_data = {
      'path': source_path,
      'overwrite': False,
      'partition_columns': [(partition['name'], partition['partitionValue']) for partition in partition_columns],
    }
    query_server_config = dbms.get_query_server_config(name=source_type)
    db = dbms.get(self.user, query_server=query_server_config)
    sql += "\n\n%s;" % db.load_data(database, table_name, form_data, None, generate_ddl_only=True)

  if load_data and use_temp_table:
    file_format = 'TextFile' if table_format == 'text' else table_format
    if table_format == 'kudu':
      columns_list = ['`%s`' % col for col in primary_keys + [col['name'] for col in destination['columns'] if col['name'] not in primary_keys and col['keep']]]
      extra_create_properties = """PRIMARY KEY (%(primary_keys)s)
      PARTITION BY HASH PARTITIONS 16
      STORED AS %(file_format)s
      TBLPROPERTIES(
      'kudu.num_tablet_replicas' = '1'
      )""" % {
        'file_format': file_format,
        'primary_keys': ', '.join(primary_keys)
      }
    else:
      columns_list = ['*']
      extra_create_properties = 'STORED AS %(file_format)s' % {'file_format': file_format}
      if is_transactional:
        extra_create_properties += '\nTBLPROPERTIES("transactional"="true", "transactional_properties"="%s")' % \
            default_transactional_type

    sql += '''\n\nCREATE TABLE `%(database)s`.`%(final_table_name)s`%(comment)s
      %(extra_create_properties)s
      AS SELECT %(columns_list)s
      FROM `%(database)s`.`%(table_name)s`;''' % {
        'database': database,
        'final_table_name': final_table_name,
        'table_name': table_name,
        'extra_create_properties': extra_create_properties,
        'columns_list': ', '.join(columns_list),
        'comment': ' COMMENT "%s"' % comment if comment else ''
      }
    sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % {
      'database': database,
      'table_name': table_name
    }

    if file_encoding and file_encoding != 'ASCII' and file_encoding != 'utf-8':
      sql += '\n\nALTER TABLE `%(database)s`.`%(final_table_name)s` ' \
             'SET serdeproperties ("serialization.encoding"="%(file_encoding)s");' % {
               'database': database,
               'final_table_name': final_table_name,
               'file_encoding': file_encoding
             }

  on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': final_table_name}) + '?source_type=' + source_type

  return make_notebook(
      name=_('Creating table %(database)s.%(table)s') % {'database': database, 'table': final_table_name},
      editor_type=editor_type,
      statement=sql.strip(),
      status='ready',
      database=database,
      on_success_url=on_success_url,
      last_executed=start_time,
      is_task=True
  )
def remove_acl(self, path):
  path = Hdfs.normpath(path)
  params = self._getparams()
  params['op'] = 'REMOVEACL'
  return self._root.put(path, params)
def set_acl(self, path, aclspec):
  path = Hdfs.normpath(path)
  params = self._getparams()
  params['op'] = 'SETACL'
  params['aclspec'] = aclspec
  return self._root.put(path, params)
def run(self, request, collection_name, envelope, input_path, start_time=None, lib_path=None):
  workspace_path = self._upload_workspace(envelope)

  if lib_path is None:
    lib_path = CONFIG_JARS_LIBS_PATH.get()

  task = make_notebook(
      name=_('Indexing into %s') % collection_name,
      editor_type='notebook',
      #on_success_url=reverse('search:browse', kwargs={'name': collection_name}),
      #pub_sub_url='assist.collections.refresh',
      is_task=True,
      is_notebook=True,
      last_executed=start_time
  )

  if not DISABLE_HUE_3.config.default_value or True:  # CDH5
    shell_command_name = "pipeline.sh"
    shell_command = """#!/bin/bash

export SPARK_DIST_CLASSPATH=`hadoop classpath`
export SPARK_DIST_CLASSPATH=/etc/hive/conf:`hadoop classpath`
export JAVA_HOME=/usr/java/jdk1.8.0_162

SPARK_KAFKA_VERSION=0.10 spark2-submit envelope.jar envelope.conf"""
    hdfs_shell_cmd_path = os.path.join(workspace_path, shell_command_name)
    self.fs.do_as_user(self.username, self.fs.create, hdfs_shell_cmd_path, data=shell_command)
    task.add_shell_snippet(
        shell_command=shell_command_name,
        files=[
          {u'value': u'%s/envelope.conf' % workspace_path},
          {u'value': hdfs_shell_cmd_path},
          {u'value': lib_path}
        ]
    )
  else:
    task.add_spark_snippet(
        clazz='com.cloudera.labs.envelope.EnvelopeMain',
        jars=Hdfs.basename(lib_path),
        arguments=[u'envelope.conf'],
        files=[
          {u'path': u'%s/envelope.conf' % workspace_path, u'type': u'file'},
          {u'path': lib_path, u'type': u'file'},
        ]
    )

  return task.execute(request, batch=True)