class ManagerApi(object):
  """
  https://cloudera.github.io/cm_api/

  Thin client for the Cloudera Manager REST API. Uses Kerberos when
  ``security_enabled`` is set, otherwise basic auth with the Navigator
  credentials from the Hue configuration.
  """

  def __init__(self, user=None, security_enabled=False, ssl_cert_ca_verify=False):
    # Endpoint is <MANAGER.API_URL>/<VERSION>; trailing slash stripped to
    # avoid a double '//' when joining.
    self._api_url = '%s/%s' % (MANAGER.API_URL.get().strip('/'), VERSION)
    self._username = get_navigator_auth_username()
    self._password = get_navigator_auth_password()
    self.user = user  # kept as metadata only; auth uses the Navigator credentials

    self._client = HttpClient(self._api_url, logger=LOG)

    if security_enabled:
      self._client.set_kerberos_auth()
    else:
      self._client.set_basic_auth(self._username, self._password)
    self._client.set_verify(ssl_cert_ca_verify)

    self._root = Resource(self._client)

  def tools_echo(self):
    """Ping Cloudera Manager (GET tools/echo).

    Returns the echo response; raises ManagerApiException on REST errors.
    """
    try:
      params = (('message', 'hello'),)

      LOG.info(params)
      return self._root.get('tools/echo', params=params)
    except RestException as e:  # `except X, e` is Py2-only; `as` works on 2.6+ and 3.x
      raise ManagerApiException(e)
class NavigatorApi(object):
  """
  http://cloudera.github.io/navigator/apidocs/v3/index.html

  Basic-auth client for the Cloudera Navigator v3 metadata API.
  """

  def __init__(self, api_url=None, user=None, password=None):
    self._api_url = '%s/%s' % ((api_url or NAVIGATOR.API_URL.get()).strip('/'), VERSION)
    self._username = user or NAVIGATOR.AUTH_USERNAME.get()
    self._password = password or NAVIGATOR.AUTH_PASSWORD.get()

    self._client = HttpClient(self._api_url, logger=LOG)
    self._client.set_basic_auth(self._username, self._password)
    self._root = resource.Resource(self._client)
    self.__headers = {}
    self.__params = ()

  def search_entities(self, query_s, limit=100, offset=0, **filters):
    """
    GET /api/v3/entities?query=()
    http://cloudera.github.io/navigator/apidocs/v3/path__v3_entities.html

    :param query_s: a query string of search terms (e.g. - sales quarterly);
      Currently the search will perform an OR boolean search for all terms
      (split on whitespace), against a whitelist of search_fields.
      TODO: support smarter boolean searching with arbitrary ordering and
      precedence of conditionals
    :param filters: TODO: IMPLEMENT ME, required to support property search

    Raises NavigatorApiException on REST errors.
    """
    search_fields = ('originalName', 'originalDescription', 'name', 'description', 'tags')
    entity_types = ('DATABASE', 'TABLE', 'PARTITION', 'FIELD', 'FILE', 'OPERATION')

    try:
      params = self.__params

      # Terms are lower-cased and each is OR'd across all whitelisted fields.
      search_terms = [term.lower() for term in query_s.strip().split()]

      query_clauses = []
      for term in search_terms:
        query_clauses.append('OR'.join(['(%s:*%s*)' % (field, term) for field in search_fields]))

      # With no terms, fall back to matching any originally-named entity.
      filter_query = '(originalName:*.*)'
      if search_terms:
        filter_query = 'OR'.join(['(%s)' % clause for clause in query_clauses])

      # Restrict results to the supported entity types.
      type_filter_clause = 'OR'.join(['(%s:%s)' % ('type', entity_type) for entity_type in entity_types])
      filter_query = '%sAND(%s)' % (filter_query, type_filter_clause)

      params += (
        ('query', filter_query),
        ('offset', offset),
        ('limit', limit),
      )

      response = self._root.get('entities', headers=self.__headers, params=params)

      return response
    except RestException as e:  # `except X, e` is Py2-only; `as` works on 2.6+ and 3.x
      msg = 'Failed to search for entities with search query: %s' % query_s
      LOG.exception(msg)
      raise NavigatorApiException(msg)
class THttpClient(TTransportBase):
  """
  Thrift transport that tunnels the protocol over HTTP(S) through Request,
  with Kerberos, basic and bearer authentication support.

  e.g.
  mode = THttpClient('http://hbase-thrift-v1.com:9090')
  mode = THttpClient('http://hive-localhost:10001/cliservice')
  """

  def __init__(self, base_url):
    self._base_url = base_url
    self._client = HttpClient(self._base_url, logger=LOG)
    self._headers = None
    self._data = None
    self._wbuf = buffer_writer()

  def open(self):
    # Connection management is delegated to HttpClient; nothing to open here.
    pass

  def set_kerberos_auth(self, service="HTTP"):
    self._client.set_kerberos_auth(service=service)

  def set_basic_auth(self, username, password):
    self._client.set_basic_auth(username, password)

  def set_bearer_auth(self, token):
    self._client.set_bearer_auth(token)

  def set_verify(self, verify=True):
    self._client.set_verify(verify)

  def close(self):
    self._headers = None # Close session too?

  def isOpen(self):
    return self._client is not None

  def setTimeout(self, ms):
    # Timeout rides along as a header, expressed in whole seconds.
    if not self._headers:
      self._headers = {}
    self._headers['timeout'] = str(int(ms / 1000))

  def setCustomHeaders(self, headers):
    self._headers = headers

  def read(self, sz):
    # Hand back the body of the last POST regardless of `sz`.
    return self._data

  def write(self, buf):
    self._wbuf.write(buf)

  def flush(self):
    # Swap in a fresh buffer, then POST the accumulated Thrift message.
    payload = self._wbuf.getvalue()
    self._wbuf = buffer_writer()

    # POST
    self._root = Resource(self._client)
    self._data = self._root.post('', data=payload, headers=self._headers)
class ManagerApi(object):
  """
  https://cloudera.github.io/cm_api/

  Thin client for the Cloudera Manager REST API. Uses Kerberos when
  ``security_enabled`` is set, otherwise basic auth with the Navigator
  credentials from the Hue configuration.
  """

  def __init__(self, user=None, security_enabled=False, ssl_cert_ca_verify=False):
    # Endpoint is <MANAGER.API_URL>/<VERSION>.
    self._api_url = '%s/%s' % (MANAGER.API_URL.get().strip('/'), VERSION)
    self._username = get_navigator_auth_username()
    self._password = get_navigator_auth_password()
    self.user = user  # kept as metadata only; auth uses the Navigator credentials

    self._client = HttpClient(self._api_url, logger=LOG)

    if security_enabled:
      self._client.set_kerberos_auth()
    else:
      self._client.set_basic_auth(self._username, self._password)
    self._client.set_verify(ssl_cert_ca_verify)

    self._root = Resource(self._client)

  def has_service(self, service_name, cluster_name=None):
    """Return True if `service_name` is among the cluster's service types.

    Uses the first cluster when `cluster_name` is None. Raises
    ManagerApiException on REST errors.
    """
    cluster = self._get_cluster(cluster_name)
    try:
      # Only cluster_name is interpolated; the service name is checked client-side.
      services = self._root.get('clusters/%(cluster_name)s/serviceTypes' % {
        'cluster_name': cluster['name'],
      })['items']

      return service_name in services
    except RestException as e:  # `except X, e` is Py2-only; `as` works on 2.6+ and 3.x
      raise ManagerApiException(e)
class THttpClient(TTransportBase):
  """
  Thrift transport running over HTTP(S) through Request, with optional
  Kerberos or basic authentication.

  e.g.
  mode = THttpClient('http://hbase-thrift-v1.com:9090')
  mode = THttpClient('http://hive-localhost:10001/cliservice')
  """

  def __init__(self, base_url, cert_validate=True):
    self._base_url = base_url
    self._client = HttpClient(self._base_url, logger=LOG, cert_validate=cert_validate)
    self._headers = None
    self._data = None
    self._wbuf = StringIO()

  def open(self):
    # HttpClient owns the connection lifecycle; nothing to do.
    pass

  def set_basic_auth(self, username, password):
    self._client.set_basic_auth(username, password)

  def set_kerberos_auth(self):
    self._client.set_kerberos_auth()

  def close(self):
    self._headers = None # Close session too?

  def isOpen(self):
    return self._client is not None

  def setTimeout(self, ms):
    # Timeouts are not supported by this transport.
    pass

  def setCustomHeaders(self, headers):
    self._headers = headers

  def read(self, sz):
    # `sz` is ignored: the whole last response body is returned.
    return self._data

  def write(self, buf):
    self._wbuf.write(buf)

  def flush(self):
    # Cycle the transport state as the upstream Thrift client does.
    if self.isOpen():
      self.close()
      self.open()

    payload = self._wbuf.getvalue()
    self._wbuf = StringIO()

    # POST the buffered Thrift message in a single request.
    self._root = Resource(self._client)
    self._data = self._root.post('', data=payload)
class THttpClient(TTransportBase):
  """
  HTTP(S) transport for Thrift built on Request; supports Kerberos and
  basic authentication.

  e.g.
  mode = THttpClient('http://hbase-thrift-v1.com:9090')
  mode = THttpClient('http://hive-localhost:10001/cliservice')
  """

  def __init__(self, base_url, cert_validate=True):
    self._base_url = base_url
    self._client = HttpClient(self._base_url, logger=LOG, cert_validate=cert_validate)
    self._data = None
    self._headers = None
    self._wbuf = StringIO()

  def open(self):
    # No explicit connection to establish; HttpClient handles it lazily.
    pass

  def set_basic_auth(self, username, password):
    self._client.set_basic_auth(username, password)

  def set_kerberos_auth(self):
    self._client.set_kerberos_auth()

  def close(self):
    self._headers = None # Close session too?

  def isOpen(self):
    return self._client is not None

  def setTimeout(self, ms):
    # Not supported; kept for Thrift transport interface compatibility.
    pass

  def setCustomHeaders(self, headers):
    self._headers = headers

  def read(self, sz):
    # Returns the previous POST's body; `sz` is not honoured.
    return self._data

  def write(self, buf):
    self._wbuf.write(buf)

  def flush(self):
    if self.isOpen():
      self.close()
      self.open()

    message = self._wbuf.getvalue()
    self._wbuf = StringIO()

    # POST everything buffered since the last flush.
    self._root = Resource(self._client)
    self._data = self._root.post('', data=message)
class NavigatorApi(object):
  """
  http://cloudera.github.io/navigator/apidocs/v2/index.html

  Basic-auth client for the Cloudera Navigator v2 metadata API.
  """

  def __init__(self, api_url=None, user=None, password=None):
    self._api_url = (api_url or NAVIGATOR.API_URL.get()).strip('/')
    self._username = user or NAVIGATOR.AUTH_USERNAME.get()
    self._password = password or NAVIGATOR.AUTH_PASSWORD.get()

    self._client = HttpClient(self._api_url, logger=LOG)
    self._client.set_basic_auth(self._username, self._password)
    self._root = resource.Resource(self._client)
    self.__headers = {}
    self.__params = ()

  def find_entity(self, source_type, type, name, **filters):
    """
    GET /api/v2/entities?query=((sourceType:<source_type>)AND(type:<type>)AND(originalName:<name>))
    http://cloudera.github.io/navigator/apidocs/v2/path__v2_entities.html

    Returns exactly one non-deleted entity matching the filters; raises
    NavigatorApiException when zero or more than one match, or on REST
    errors.
    """
    try:
      params = self.__params

      query_filters = {
        'sourceType': source_type,
        'type': type,
        'originalName': name,
        'deleted': 'false'
      }
      for key, value in filters.items():
        query_filters[key] = value

      filter_query = 'AND'.join('(%s:%s)' % (key, value) for key, value in query_filters.items())

      params += (
        ('query', filter_query),
        ('offset', 0),
        ('limit', 2),  # We are looking for single entity, so limit to 2 to check for multiple results
      )

      response = self._root.get('entities', headers=self.__headers, params=params)

      if not response:
        raise NavigatorApiException('Could not find entity with query filters: %s' % str(query_filters))
      elif len(response) > 1:
        raise NavigatorApiException('Found more than 1 entity with query filters: %s' % str(query_filters))

      return response[0]
    except RestException as e:  # `except X, e` is Py2-only; `as` works on 2.6+ and 3.x
      msg = 'Failed to find entity: %s' % str(e)
      LOG.exception(msg)
      raise NavigatorApiException(msg)
class NavigatorApi(object):
  """
  http://cloudera.github.io/navigator/apidocs/v2/index.html

  Client for the Cloudera Navigator v2 metadata API, authenticated with
  basic auth.
  """

  def __init__(self, api_url=None, user=None, password=None):
    self._api_url = (api_url or NAVIGATOR.API_URL.get()).strip('/')
    self._username = user or NAVIGATOR.AUTH_USERNAME.get()
    self._password = password or NAVIGATOR.AUTH_PASSWORD.get()

    self._client = HttpClient(self._api_url, logger=LOG)
    self._client.set_basic_auth(self._username, self._password)
    self._root = resource.Resource(self._client)
    self.__headers = {}
    self.__params = ()

  def find_entity(self, source_type, type, name, **filters):
    """
    GET /api/v2/entities?query=((sourceType:<source_type>)AND(type:<type>)AND(originalName:<name>))
    http://cloudera.github.io/navigator/apidocs/v2/path__v2_entities.html

    Returns the single non-deleted matching entity; raises
    NavigatorApiException if none or several are found, or on REST errors.
    """
    try:
      params = self.__params

      query_filters = {
        'sourceType': source_type,
        'type': type,
        'originalName': name,
        'deleted': 'false'
      }
      for key, value in filters.items():
        query_filters[key] = value

      filter_query = 'AND'.join('(%s:%s)' % (key, value) for key, value in query_filters.items())

      params += (
        ('query', filter_query),
        ('offset', 0),
        ('limit', 2),  # We are looking for single entity, so limit to 2 to check for multiple results
      )

      response = self._root.get('entities', headers=self.__headers, params=params)

      if not response:
        raise NavigatorApiException('Could not find entity with query filters: %s' % str(query_filters))
      elif len(response) > 1:
        raise NavigatorApiException('Found more than 1 entity with query filters: %s' % str(query_filters))

      return response[0]
    except RestException as e:  # `except X, e` is Py2-only; `as` works on 2.6+ and 3.x
      msg = 'Failed to find entity: %s' % str(e)
      LOG.exception(msg)
      raise NavigatorApiException(msg)
class ManagerApi(object):
  """
  https://cloudera.github.io/cm_api/

  Client for the Cloudera Manager REST API: service discovery (Spark
  History Server, Kafka, Kudu, Impala, Flume), config reads/updates and
  refresh/restart commands.
  """

  def __init__(self, user=None, security_enabled=False, ssl_cert_ca_verify=False):
    # Endpoint is <MANAGER.API_URL>/<VERSION>; credentials come from the
    # Navigator auth settings, `user` is kept as metadata only.
    self._api_url = '%s/%s' % (MANAGER.API_URL.get().strip('/'), VERSION)
    self._username = get_navigator_auth_username()
    self._password = get_navigator_auth_password()
    self.user = user

    self._client = HttpClient(self._api_url, logger=LOG)

    if security_enabled:
      self._client.set_kerberos_auth()
    else:
      self._client.set_basic_auth(self._username, self._password)
    self._client.set_verify(ssl_cert_ca_verify)

    self._root = Resource(self._client)

  def has_service(self, service_name, cluster_name=None):
    # True when `service_name` is one of the cluster's service types.
    cluster = self._get_cluster(cluster_name)
    try:
      services = self._root.get('clusters/%(cluster_name)s/serviceTypes' % {
        'cluster_name': cluster['name'],
        'service_name': service_name  # not referenced by the format string
      })['items']

      return service_name in services
    except RestException as e:
      raise ManagerApiException(e)

  def get_spark_history_server_configs(self, cluster_name=None):
    """Locate the Spark History Server role and fetch its full config.

    Returns (hostId, config items) or (None, None) when not found or on
    any error (best-effort: errors are logged, not raised).
    """
    service_name = "SPARK_ON_YARN"
    shs_role_type = "SPARK_YARN_HISTORY_SERVER"

    try:
      cluster = self._get_cluster(cluster_name)
      services = self._root.get('clusters/%(cluster_name)s/services' % {
        'cluster_name': cluster['name'],
        'service_name': service_name  # not referenced by the format string
      })['items']

      service_display_names = [service['displayName'] for service in services if service['type'] == service_name]

      if service_display_names:
        spark_service_display_name = service_display_names[0]

        servers = self._root.get('clusters/%(cluster_name)s/services/%(spark_service_display_name)s/roles' % {
          'cluster_name': cluster['name'],
          'spark_service_display_name': spark_service_display_name
        })['items']

        # First role of the history-server type wins (normally there is one).
        shs_server_names = [server['name'] for server in servers if server['type'] == shs_role_type]
        shs_server_name = shs_server_names[0] if shs_server_names else None
        shs_server_hostRef = [server['hostRef'] for server in servers if server['type'] == shs_role_type]
        shs_server_hostId = shs_server_hostRef[0]['hostId'] if shs_server_hostRef else None

        if shs_server_name and shs_server_hostId:
          shs_server_configs = self._root.get('clusters/%(cluster_name)s/services/%(spark_service_display_name)s/roles/%(shs_server_name)s/config' % {
            'cluster_name': cluster['name'],
            'spark_service_display_name': spark_service_display_name,
            'shs_server_name': shs_server_name
          }, params={'view': 'full'})['items']
          return shs_server_hostId, shs_server_configs
    except Exception as e:
      LOG.warning("Check Spark History Server via ManagerApi: %s" % e)

    return None, None

  def get_spark_history_server_url(self, cluster_name=None):
    """Build the History Server UI URL from its CM config, or None."""
    shs_server_hostId, shs_server_configs = self.get_spark_history_server_configs(cluster_name=cluster_name)

    if shs_server_hostId and shs_server_configs:
      shs_ui_port = None
      shs_ssl_port = None
      shs_ssl_enabled = None
      for config in shs_server_configs:
        if 'relatedName' in config and 'default' in config:
          # Only the CM-side defaults are consulted here, not overrides.
          if config['relatedName'] == 'spark.history.ui.port':
            shs_ui_port = config['default']
          if config['relatedName'] == 'spark.ssl.historyServer.port':
            shs_ssl_port = config['default']
          if config['relatedName'] == 'spark.ssl.historyServer.enabled':
            shs_ssl_enabled = config['default']
      shs_ui_host = self._root.get('hosts/%(hostId)s' % {'hostId': shs_server_hostId})
      shs_ui_hostname = shs_ui_host['hostname'] if shs_ui_host else None

      return self.assemble_shs_url(shs_ui_hostname, shs_ui_port, shs_ssl_port, shs_ssl_enabled)
    return None

  def get_spark_history_server_security_enabled(self, cluster_name=None):
    """True when the History Server has SPNEGO enabled ('true' as string)."""
    shs_server_hostId, shs_server_configs = self.get_spark_history_server_configs(cluster_name=cluster_name)

    if shs_server_configs:
      for config in shs_server_configs:
        if 'relatedName' in config and 'default' in config and config['relatedName'] == 'history_server_spnego_enabled':
          shs_security_enabled = config['default']
          return shs_security_enabled and shs_security_enabled == 'true'
    return False

  def assemble_shs_url(self, shs_ui_hostname, shs_ui_port=None, shs_ssl_port=None, shs_ssl_enabled=None):
    """Compose the History Server URL; all four inputs must be truthy.

    NOTE(review): a cluster with SSL disabled may legitimately lack an SSL
    port, yet this returns None in that case — confirm intended.
    """
    if not shs_ui_hostname or not shs_ui_port or not shs_ssl_port or not shs_ssl_enabled:
      LOG.warning("Spark conf not found!")
      return None

    protocol = 'https' if shs_ssl_enabled.lower() == 'true' else 'http'
    shs_url = '%(protocol)s://%(hostname)s:%(port)s' % {
      'protocol': protocol,
      'hostname': shs_ui_hostname,
      'port': shs_ssl_port if shs_ssl_enabled.lower() == 'true' else shs_ui_port,
    }

    return shs_url

  def tools_echo(self):
    # Ping CM; raises ManagerApiException on REST errors.
    try:
      params = (('message', 'hello'), )

      LOG.info(params)
      return self._root.get('tools/echo', params=params)
    except RestException as e:
      raise ManagerApiException(e)

  def get_kafka_brokers(self, cluster_name=None):
    # Comma-separated 'host:9092' list of Kafka broker endpoints.
    try:
      hosts = self._get_hosts('KAFKA', 'KAFKA_BROKER', cluster_name=cluster_name)

      brokers_hosts = [host['hostname'] + ':9092' for host in hosts]

      return ','.join(brokers_hosts)
    except RestException as e:
      raise ManagerApiException(e)

  def get_kudu_master(self, cluster_name=None):
    # Hostname of the first KUDU_MASTER role of the first KUDU service.
    try:
      cluster = self._get_cluster(cluster_name)
      services = self._root.get('clusters/%(name)s/services' % cluster)['items']

      service = [service for service in services if service['type'] == 'KUDU'][0]
      master = self._get_roles(cluster['name'], service['name'], 'KUDU_MASTER')[0]

      master_host = self._root.get('hosts/%(hostId)s' % master['hostRef'])

      return master_host['hostname']
    except RestException as e:
      raise ManagerApiException(e)

  def get_kafka_topics(self, broker_host):
    # Queries the broker's monitoring endpoint directly (port 24042).
    try:
      client = HttpClient('http://%s:24042' % broker_host, logger=LOG)
      root = Resource(client)

      return root.get('/api/topics')
    except RestException as e:
      raise ManagerApiException(e)

  def update_flume_config(self, cluster_name, config_name, config_value):
    """Set one config on the Flume AGENT role config group via a batch PUT."""
    service = 'FLUME-1'
    cluster = self._get_cluster(cluster_name)
    roleConfigGroup = [role['roleConfigGroupRef']['roleConfigGroupName'] for role in self._get_roles(cluster['name'], service, 'AGENT')]
    data = {
      u'items': [{
        # str.replace is used instead of %-formatting because the URL
        # contains literal '%20' escapes that would break the formatter.
        u'url': u'/api/v8/clusters/%(cluster_name)s/services/%(service)s/roleConfigGroups/%(roleConfigGroups)s/config?message=Updated%20service%20and%20role%20type%20configurations.'.replace(
          '%(cluster_name)s', urllib_quote(cluster['name'])).replace(
          '%(service)s', service).replace(
          '%(roleConfigGroups)s', roleConfigGroup[0]),
        u'body': {
          u'items': [{
            u'name': config_name,
            u'value': config_value
          }]
        },
        u'contentType': u'application/json',
        u'method': u'PUT'
      }]
    }

    return self.batch(items=data)

  def get_flume_agents(self, cluster_name=None):
    # Hostnames of all Flume AGENT roles in the cluster.
    return [host['hostname'] for host in self._get_hosts('FLUME', 'AGENT', cluster_name=cluster_name)]

  def _get_hosts(self, service_name, role_name, cluster_name=None):
    # Host records for all roles of `role_name` in the first matching service.
    try:
      cluster = self._get_cluster(cluster_name)
      services = self._root.get('clusters/%(name)s/services' % cluster)['items']

      service = [service for service in services if service['type'] == service_name][0]
      hosts = self._get_roles(cluster['name'], service['name'], role_name)
      hosts_ids = [host['hostRef']['hostId'] for host in hosts]

      hosts = self._root.get('hosts')['items']
      return [host for host in hosts if host['hostId'] in hosts_ids]
    except RestException as e:
      raise ManagerApiException(e)

  def refresh_flume(self, cluster_name, restart=False):
    # Refresh (or restart, when `restart`) all Flume AGENT roles.
    service = 'FLUME-1'
    cluster = self._get_cluster(cluster_name)
    roles = [role['name'] for role in self._get_roles(cluster['name'], service, 'AGENT')]

    if restart:
      return self.restart_services(cluster['name'], service, roles)
    else:
      return self.refresh_configs(cluster['name'], service, roles)

  def refresh_configs(self, cluster_name, service=None, roles=None):
    """Issue a refresh command: whole cluster, one service, or given roles."""
    try:
      if service is None:
        return self._root.post('clusters/%(cluster_name)s/commands/refresh' % {'cluster_name': cluster_name}, contenttype="application/json")
      elif roles is None:
        return self._root.post('clusters/%(cluster_name)s/services/%(service)s/roleCommands/refresh' % {'cluster_name': cluster_name, 'service': service}, contenttype="application/json")
      else:
        return self._root.post(
            'clusters/%(cluster_name)s/services/%(service)s/roleCommands/refresh' % {'cluster_name': cluster_name, 'service': service},
            data=json.dumps({"items": roles}),
            contenttype="application/json"
        )
    except RestException as e:
      raise ManagerApiException(e)

  def restart_services(self, cluster_name, service=None, roles=None):
    """Issue a restart command: whole cluster, one service, or given roles."""
    try:
      if service is None:
        return self._root.post('clusters/%(cluster_name)s/commands/restart' % {'cluster_name': cluster_name}, contenttype="application/json")
      elif roles is None:
        return self._root.post('clusters/%(cluster_name)s/services/%(service)s/roleCommands/restart' % {'cluster_name': cluster_name, 'service': service}, contenttype="application/json")
      else:
        return self._root.post(
            'clusters/%(cluster_name)s/services/%(service)s/roleCommands/restart' % {'cluster_name': cluster_name, 'service': service},
            data=json.dumps({"items": roles}),
            contenttype="application/json"
        )
    except RestException as e:
      raise ManagerApiException(e)

  def batch(self, items):
    # POST a CM batch request (list of url/method/body items).
    try:
      return self._root.post('batch', data=json.dumps(items), contenttype='application/json')
    except RestException as e:
      raise ManagerApiException(e)

  def _get_cluster(self, cluster_name=None):
    # The named cluster, or the first cluster when no name is given.
    clusters = self._root.get('clusters/')['items']

    if cluster_name is not None:
      cluster = [cluster for cluster in clusters if cluster['name'] == cluster_name][0]
    else:
      cluster = clusters[0]

    return cluster

  def _get_roles(self, cluster_name, service_name, role_type):
    # All roles of `role_type` for the given service.
    roles = self._root.get('clusters/%(cluster_name)s/services/%(service_name)s/roles' % {'cluster_name': cluster_name, 'service_name': service_name})['items']
    return [role for role in roles if role['type'] == role_type]

  def get_impalad_config(self, key=None, impalad_host=None, cluster_name=None):
    """Read one full-view config value from the IMPALAD role on a host.

    Returns the value for `key` or None (best-effort: errors are logged).
    """
    if not key or not impalad_host:
      return None

    service_name = "IMPALA"
    role_type = 'IMPALAD'

    try:
      cluster = self._get_cluster(cluster_name)
      services = self._root.get('clusters/%(cluster_name)s/services' % {
        'cluster_name': cluster['name'],
        'service_name': service_name  # not referenced by the format string
      })['items']

      service_display_names = [service['displayName'] for service in services if service['type'] == service_name]

      hosts = self._root.get('hosts')['items']
      impalad_hostIds = [host['hostId'] for host in hosts if host['hostname'] == impalad_host]

      if impalad_hostIds and service_display_names:
        impalad_hostId = impalad_hostIds[0]
        impala_service_display_name = service_display_names[0]

        # NOTE: the format-key names say 'spark' but carry Impala values —
        # the URL template was reused from the Spark lookup above.
        servers = self._root.get('clusters/%(cluster_name)s/services/%(spark_service_display_name)s/roles' % {
          'cluster_name': cluster['name'],
          'spark_service_display_name': impala_service_display_name
        })['items']

        impalad_server_names = [server['name'] for server in servers if server['type'] == role_type and server['hostRef']['hostId'] == impalad_hostId]
        impalad_server_name = impalad_server_names[0] if impalad_server_names else None

        if impalad_server_name:
          server_configs = self._root.get('clusters/%(cluster_name)s/services/%(spark_service_display_name)s/roles/%(shs_server_name)s/config' % {
            'cluster_name': cluster['name'],
            'spark_service_display_name': impala_service_display_name,
            'shs_server_name': impalad_server_name
          }, params={'view': 'full'})['items']

          for config in server_configs:
            if 'relatedName' in config and 'value' in config:
              if config['relatedName'] == key:
                return config['value']
    except Exception as e:
      LOG.warning("Get Impala Daemon API configurations via ManangerAPI: %s" % e)

    return None
class ImpalaDaemonApi(object):
  """Client for the Impala Daemon (impalad) debug web server endpoints.

  All query endpoints are requested with json=true and the responses are
  decoded through a single helper, `_get_json`.
  """

  def __init__(self, server_url):
    self._url = server_url
    self._client = HttpClient(self._url, logger=LOG)
    self._root = Resource(self._client)
    self._security_enabled = is_kerberos_enabled()
    self._webserver_spnego_enabled = is_webserver_spnego_enabled()
    self._thread_local = threading.local()

    # You can set username/password for Impala Web UI which overrides kerberos
    if DAEMON_API_USERNAME.get() is not None and DAEMON_API_PASSWORD.get() is not None:
      if DAEMON_API_AUTH_SCHEME.get().lower() == 'basic':
        self._client.set_basic_auth(DAEMON_API_USERNAME.get(), DAEMON_API_PASSWORD.get())
        LOG.info("Using username and password for basic authentication")
      else:
        self._client.set_digest_auth(DAEMON_API_USERNAME.get(), DAEMON_API_PASSWORD.get())
        LOG.info('Using username and password for digest authentication')
    elif self._webserver_spnego_enabled or self._security_enabled:
      self._client.set_kerberos_auth()
      LOG.info('Using kerberos principal for authentication')

  def __str__(self):
    return "ImpalaDaemonApi at %s" % self._url

  @property
  def url(self):
    return self._url

  @property
  def security_enabled(self):
    return self._security_enabled

  @property
  def user(self):
    # Per-thread user, set via set_user().
    return self._thread_local.user

  def set_user(self, user):
    # Accepts either a user object (with .username) or a plain username.
    if hasattr(user, 'username'):
      self._thread_local.user = user.username
    else:
      self._thread_local.user = user

  def _get_json(self, path, params, error_msg_format):
    """GET `path` and return the response as parsed JSON.

    The debug web server may return either a JSON string or an already
    parsed object; both are handled. Raises ImpalaDaemonApiException
    (message built from `error_msg_format`) when the body is not valid
    JSON.
    """
    resp = self._root.get(path, params=params)
    try:
      if isinstance(resp, basestring):
        return json.loads(resp)
      else:
        return resp
    except ValueError as e:
      raise ImpalaDaemonApiException(error_msg_format % e)

  def get_queries(self):
    return self._get_json('queries', {'json': 'true'}, 'ImpalaDaemonApi did not return valid JSON: %s')

  def get_query(self, query_id):
    return self._get_json('query_plan', {'query_id': query_id, 'json': 'true'}, 'ImpalaDaemonApi did not return valid JSON: %s')

  def get_query_profile(self, query_id):
    return self._get_json('query_profile', {'query_id': query_id, 'json': 'true'}, 'ImpalaDaemonApi query_profile did not return valid JSON: %s')

  def get_query_memory(self, query_id):
    return self._get_json('query_memory', {'query_id': query_id, 'json': 'true'}, 'ImpalaDaemonApi query_memory did not return valid JSON: %s')

  def kill(self, query_id):
    return self._get_json('cancel_query', {'query_id': query_id, 'json': 'true'}, 'ImpalaDaemonApi kill did not return valid JSON: %s')

  def get_query_backends(self, query_id):
    return self._get_json('query_backends', {'query_id': query_id, 'json': 'true'}, 'ImpalaDaemonApi query_backends did not return valid JSON: %s')

  def get_query_finstances(self, query_id):
    return self._get_json('query_finstances', {'query_id': query_id, 'json': 'true'}, 'ImpalaDaemonApi query_finstances did not return valid JSON: %s')

  def get_query_summary(self, query_id):
    return self._get_json('query_summary', {'query_id': query_id, 'json': 'true'}, 'ImpalaDaemonApi query_summary did not return valid JSON: %s')

  def get_query_profile_encoded(self, query_id):
    # Raw (thrift-encoded) profile; returned as-is, no JSON decoding.
    params = {'query_id': query_id}
    return self._root.get('query_profile_encoded', params=params)
class NavigatorApi(object):
  """
  http://cloudera.github.io/navigator/apidocs/v3/index.html

  Navigator v3 client used for interactive (Solr edismax) entity search.
  """

  def __init__(self, api_url=None, user=None, password=None):
    self._api_url = '%s/%s' % ((api_url or NAVIGATOR.API_URL.get()).strip('/'), VERSION)
    self._username = user or NAVIGATOR.AUTH_USERNAME.get()
    self._password = password or NAVIGATOR.AUTH_PASSWORD.get()

    self._client = HttpClient(self._api_url, logger=LOG)
    self._client.set_basic_auth(self._username, self._password)
    self._root = resource.Resource(self._client, urlencode=False) # For search_entities_interactive
    self.__headers = {}
    self.__params = ()

  def _get_types_from_sources(self, sources):
    """Map requested sources (sql/hive/impala/hdfs) to entity type tuples.

    Returns (default_entity_types, entity_types): the defaults restrict
    results, the full list is what a manual `type:` filter may name.
    """
    default_entity_types = entity_types = ('DATABASE', 'TABLE', 'PARTITION', 'FIELD', 'FILE', 'VIEW', 'OPERATION', 'DIRECTORY')

    if 'sql' in sources or 'hive' in sources or 'impala' in sources:
      default_entity_types = ('TABLE', 'VIEW')
      entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')
    elif 'hdfs' in sources:
      entity_types = ('FILE', 'DIRECTORY')
      default_entity_types = ('FILE', 'DIRECTORY')

    return default_entity_types, entity_types

  def search_entities(self, query_s, limit=100, offset=0, **filters):
    """
    Solr edismax query parser syntax.

    :param query_s: a query string of search terms (e.g. - sales quarterly);
      Currently the search will perform an OR boolean search for all terms
      (split on whitespace), against a whitelist of search_fields.
      TODO: support smarter boolean searching with arbitrary ordering and
      precedence of conditionals

    Raises NavigatorApiException on REST errors.
    """
    search_fields = ('originalName', 'originalDescription', 'name', 'description', 'tags')

    sources = filters.get('sources', [])
    default_entity_types, entity_types = self._get_types_from_sources(sources)

    try:
      params = self.__params

      search_terms = [term for term in query_s.strip().split()]

      query_clauses = []
      user_filters = []
      for term in search_terms:
        if ':' not in term:
          # Free-text term: OR it across every whitelisted field.
          query_clauses.append('OR'.join(['(%s:*%s*)' % (field, term) for field in search_fields]))
        else:
          # 'field:value' term: treated as a manual filter. split(':', 1)
          # so values containing ':' do not raise ValueError.
          name, val = term.split(':', 1)
          if val and (name != 'type' or val in entity_types):  # Manual filter allowed, e.g. type:VIEW
            user_filters.append(term + '*')

      filter_query = '*'
      if query_clauses:
        filter_query = 'OR'.join(['(%s)' % clause for clause in query_clauses])

      user_filter_clause = 'OR '.join(['(%s)' % f for f in user_filters]) or '*'
      source_filter_clause = 'OR'.join(['(%s:%s)' % ('type', entity_type) for entity_type in default_entity_types])

      filter_query = '%s AND (%s) AND (%s)' % (filter_query, user_filter_clause, source_filter_clause)

      params += (
        ('query', filter_query),
        ('offset', offset),
        ('limit', limit),
      )

      LOG.info(params)
      response = self._root.get('entities', headers=self.__headers, params=params)

      return response
    except RestException as e:  # `except X, e` is Py2-only; `as` works on 2.6+ and 3.x
      msg = 'Failed to search for entities with search query: %s' % query_s
      LOG.exception(msg)
      raise NavigatorApiException(msg)
class AtlasApi(Api):
    """
    Catalog API implementation backed by Apache Atlas.

    https://atlas.apache.org

    Adapts Atlas v2 REST responses to the Navigator-style entity structure the
    rest of the application expects.
    """
    DEFAULT_SEARCH_FIELDS = (('originalName', 3), ('originalDescription', 1), ('name', 10), ('description', 3), ('tags', 5))
    CATALOG_NAMESPACE = '__cloudera_internal_catalog_hue'

    # Mapping between Navigator-style type names and Atlas type names, both ways.
    NAV_TO_ATLAS_TYPE = {
        'table': 'hive_table',
        'database': 'hive_db',
        'field': 'hive_column'
    }

    ATLAS_TO_NAV_TYPE = {
        'hive_table': 'TABLE',
        'hive_db': 'DATABASE',
        'hive_column': 'FIELD'
    }

    # Facet patterns recognized in free-text queries, e.g. tag:sensitive, type:table, owner:joe.
    # Fix: raw strings — the original non-raw '\s'/'\:' are invalid escapes (SyntaxWarning on recent Pythons).
    CLASSIFICATION_RE = re.compile(r'(?:tag|tags|classification)\s*\:\s*(?:(?:\"([^"]+)\")|([^ ]+))\s*', re.IGNORECASE)
    TYPE_RE = re.compile(r'type\s*\:\s*([^ ]+)\s*', re.IGNORECASE)
    OWNER_RE = re.compile(r'owner\s*\:\s*([^ ]+)\s*', re.IGNORECASE)

    def __init__(self, user=None):
        super(AtlasApi, self).__init__(user)

        self._api_url = CATALOG.API_URL.get().strip('/') + "/api/atlas"
        self._username = CATALOG.SERVER_USER.get()
        self._password = CATALOG.SERVER_PASSWORD.get()

        self._client = HttpClient(self._api_url, logger=LOG)

        # Prefer Kerberos when enabled; only fall back to basic auth when a password is configured.
        if CATALOG.KERBEROS_ENABLED.get():
            self._client.set_kerberos_auth()
        elif self._password:
            self._client.set_basic_auth(self._username, self._password)

        self._root = resource.Resource(self._client, urlencode=False)  # For search_entities_interactive

        self.__headers = {}
        self.__params = ()

        #self._fillup_properties() # Disabled currently

    def _get_types_from_sources(self, sources):
        """Return (default_entity_types, entity_types) for the requested sources (sql/hive/impala, hdfs or s3)."""
        default_entity_types = entity_types = ('DATABASE', 'TABLE', 'PARTITION', 'FIELD', 'FILE', 'VIEW', 'S3BUCKET', 'OPERATION', 'DIRECTORY')

        if 'sql' in sources or 'hive' in sources or 'impala' in sources:
            entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')
            default_entity_types = ('TABLE', 'VIEW')
        elif 'hdfs' in sources:
            entity_types = ('FILE', 'DIRECTORY')
            default_entity_types = ('FILE', 'DIRECTORY')
        elif 's3' in sources:
            entity_types = ('FILE', 'DIRECTORY', 'S3BUCKET')
            default_entity_types = ('DIRECTORY', 'S3BUCKET')

        return default_entity_types, entity_types

    def adapt_atlas_entity_to_navigator(self, atlas_entity):
        """Convert one Atlas entity dict into the Navigator-shaped dict used by callers."""
        nav_entity = {
            "created": 'createTime' in atlas_entity['attributes'] and atlas_entity['attributes']['createTime'],
            "customProperties": None,
            "description": atlas_entity['attributes'].get('description'),
            "identity": atlas_entity['guid'],
            "internalType": atlas_entity['typeName'],
            "meaningNames": atlas_entity['meaningNames'],  # Atlas specific
            "meanings": atlas_entity['meanings'],  # Atlas specific
            "name": atlas_entity['attributes'].get('name'),
            "original_name": atlas_entity['attributes'].get('name'),
            "originalDescription": None,
            "originalName": atlas_entity['attributes'].get('name'),
            "owner": atlas_entity['attributes'].get('owner'),
            "parentPath": '',  # Set below
            "properties": {},  # Set below
            "sourceType": '',  # Set below
            "classifications": [],
            "tags": atlas_entity['classificationNames'],
            "type": self.ATLAS_TO_NAV_TYPE.get(atlas_entity['typeName'].lower()) or atlas_entity['typeName']
        }

        # Convert Atlas qualified name of form db.tbl.col@cluster to parentPath of form /db/tbl
        if atlas_entity['typeName'].lower().startswith('hive_'):
            nav_entity['sourceType'] = 'HIVE'
            qualified_path_parts = re.sub(r'@.*$', '', atlas_entity['attributes'].get('qualifiedName')).split('.')
            qualified_path_parts.pop()  # it's just the parent path we want so remove the entity name
            nav_entity['parentPath'] = '/' + '/'.join(qualified_path_parts)

        if 'classifications' in atlas_entity:
            nav_entity['classifications'] = atlas_entity['classifications']
            for atlas_classification in atlas_entity['classifications']:
                if 'attributes' in atlas_classification:
                    # Py3 fix: iteritems() no longer exists.
                    for key, value in atlas_classification['attributes'].items():
                        nav_entity['properties'][key] = value

        return nav_entity

    def fetch_single_entity(self, dsl_query):
        """
        Run an Atlas DSL query and return the first matching entity, adapted to
        the Navigator structure.

        :param dsl_query: Atlas DSL, e.g. "hive_db where qualifiedName='default@cluster'"
        :raises CatalogEntityDoesNotExistException: when no entity matches.
        """
        response = {"status": 0, "entity": []}

        try:
            atlas_response = self._root.get('/v2/search/dsl?query=%s' % dsl_query, headers=self.__headers, params=self.__params)
            if 'entities' not in atlas_response or len(atlas_response['entities']) < 1:
                raise CatalogEntityDoesNotExistException('Could not find entity with query: %s' % dsl_query)

            for atlas_entity in atlas_response['entities']:
                response['entity'].append(self.adapt_atlas_entity_to_navigator(atlas_entity))

            return response['entity'][0]
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception('Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Hue could not query Atlas', detail=e)

    def get_database(self, name):
        # Search with Atlas API for hive database with specific name
        if get_catalog_search_cluster():
            qualifiedNameCriteria = 'qualifiedName=\'%s@%s\'' % (name, get_catalog_search_cluster())
        else:
            qualifiedNameCriteria = 'qualifiedName like \'%s@*\'' % name

        return self.fetch_single_entity('hive_db where %s' % qualifiedNameCriteria)

    def get_table(self, database_name, table_name, is_view=False):
        # Search with Atlas API for hive tables with specific name
        if get_catalog_search_cluster():
            qualifiedNameCriteria = 'qualifiedName=\'%s.%s@%s\'' % (database_name, table_name, get_catalog_search_cluster())
        else:
            qualifiedNameCriteria = 'qualifiedName like \'%s.%s@*\'' % (database_name, table_name)

        return self.fetch_single_entity('hive_table where %s' % qualifiedNameCriteria)

    def get_field(self, database_name, table_name, field_name):
        # Search with Atlas API for hive tables with specific qualified name
        if get_catalog_search_cluster():
            qualifiedNameCriteria = 'qualifiedName=\'%s.%s.%s@%s\'' % (database_name, table_name, field_name, get_catalog_search_cluster())
        else:
            qualifiedNameCriteria = 'qualifiedName like \'%s.%s.%s@*\'' % (database_name, table_name, field_name)

        return self.fetch_single_entity('hive_column where %s' % qualifiedNameCriteria)

    def search_entities_interactive(self, query_s=None, limit=100, offset=0, facetFields=None, facetPrefix=None,
                                    facetRanges=None, filterQueries=None, firstClassEntitiesOnly=None, sources=None):
        """
        Free-text search against Atlas basic search, recognizing tag/type/owner
        facets embedded in the query string. Returns a Navigator-shaped response.
        """
        response = {"status": 0, "results": [], "facets": {"tags": {}}}

        # This takes care of the list_tags endpoint
        if not query_s and facetFields and 'tags' in facetFields:
            classification_response = self._root.get('/v2/types/typedefs?type=classification')
            for classification_def in classification_response['classificationDefs']:
                # Tags with spaces must be double-quoted to be usable in a search query.
                if ' ' in classification_def['name']:
                    response['facets']['tags']['"' + classification_def['name'] + '"'] = -1
                else:
                    response['facets']['tags'][classification_def['name']] = -1
            return response

        query_s = (query_s.strip() if query_s else '').replace('*', '')

        atlas_type = None
        classification = None
        owner = None

        # Take the first classification and type facets and ignore other as we can't search multiple in Atlas.
        classification_facets = self.CLASSIFICATION_RE.findall(query_s)
        if classification_facets:
            classification = classification_facets[0][0] or classification_facets[0][1]
            query_s = self.CLASSIFICATION_RE.sub('', query_s).strip()
            atlas_type = 'Asset'  # Filtered below to just contain hive_db, hive_table or hive_column

        owner_facets = self.OWNER_RE.findall(query_s)
        if owner_facets:
            owner = owner_facets[0]
            query_s = self.OWNER_RE.sub('', query_s).strip()

        type_facets = self.TYPE_RE.findall(query_s)
        if type_facets:
            # Bug fix: use .get() so an unknown type falls back to the raw value
            # instead of raising KeyError (the original `[...] or fallback` could never reach the fallback).
            atlas_type = self.NAV_TO_ATLAS_TYPE.get(type_facets[0].lower()) or type_facets[0]
            query_s = self.TYPE_RE.sub('', query_s).strip()

        data = {
            'attributes': None,
            'classification': classification,
            'entityFilters': {
                'condition': 'AND',
                'criterion': [{
                    'condition': 'OR',
                    'criterion': [{
                        'attributeName': 'name',
                        'attributeValue': query_s,
                        'operator': 'contains'
                    }, {
                        'attributeName': 'description',
                        'attributeValue': query_s,
                        'operator': 'contains'
                    }]
                }]
            },
            'excludeDeletedEntities': True,
            'includeClassificationAttributes': True,
            'includeSubClassifications': True,
            'includeSubTypes': True,
            'limit': limit,
            'offset': 0,  # NOTE(review): the `offset` parameter is accepted but not forwarded — confirm whether paging is intentionally disabled
            'tagFilters': None,
            'termName': None,
            'typeName': atlas_type or 'hive_table'
        }

        if get_catalog_search_cluster():
            data['entityFilters']['criterion'].append({
                'attributeName': 'qualifiedName',
                'operator': 'contains',
                'attributeValue': '@' + get_catalog_search_cluster()
            })

        if owner:
            data['entityFilters']['criterion'].append({
                'attributeName': 'owner',
                'operator': 'startsWith',
                'attributeValue': owner
            })

        try:
            atlas_response = self._root.post('/v2/search/basic', data=json.dumps(data), contenttype=_JSON_CONTENT_TYPE)

            # Adapt Atlas entities to Navigator structure in the results
            if 'entities' in atlas_response:
                for atlas_entity in atlas_response['entities']:
                    if atlas_type != 'Asset' or atlas_entity['typeName'].lower() in ['hive_db', 'hive_table', 'hive_column']:
                        response['results'].append(self.adapt_atlas_entity_to_navigator(atlas_entity))

            return response
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception('Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Hue could not query Atlas', detail=e)

    # search_enties is only used by the table browser to fetch child entities of a given table or database.
    def search_entities(self, query_s, limit=100, offset=0, raw_query=False, **filters):
        try:
            found_entities = []

            search_terms = [term for term in query_s.strip().split()] if query_s else []
            parentPath = None
            for term in search_terms:
                if 'parentPath:' in term:
                    # Bug fix: split on the first ':' only so path values containing ':' don't raise ValueError.
                    name, val = term.split(':', 1)
                    parentPath = val.strip('"').lstrip('/').replace('/', '.')

            if query_s == 'type:database':
                if get_catalog_search_cluster():
                    atlas_dsl_query = 'from hive_db where qualifiedName like \'*@%s\' limit %s' % (get_catalog_search_cluster(), limit)
                else:
                    atlas_dsl_query = 'from hive_db limit %s' % limit
            elif not parentPath:
                return found_entities
            else:
                # One dot in parentPath means db.table, i.e. we are listing columns.
                atlas_type = 'hive_table' if parentPath.count('.') == 0 else 'hive_column'
                if get_catalog_search_cluster():
                    atlas_dsl_query = 'from %s where qualifiedName like \'%s*@%s\' limit %s' % (
                        atlas_type, parentPath, get_catalog_search_cluster(), limit)
                else:
                    atlas_dsl_query = 'from %s where qualifiedName like \'%s*\' limit %s' % (atlas_type, parentPath, limit)

            atlas_response = self._root.get('/v2/search/dsl?query=%s' % atlas_dsl_query)

            # Adapt Atlas entities to Navigator structure in the results
            if 'entities' in atlas_response:
                for atlas_entity in atlas_response['entities']:
                    found_entities.append(self.adapt_atlas_entity_to_navigator(atlas_entity))

            return found_entities
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception('Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Hue could not query Atlas', detail=e)

    def suggest(self, prefix=None):
        try:
            return self._root.get('interactive/suggestions?query=%s' % (prefix or '*'))
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception('Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to search for entities', detail=e)

    def get_entity(self, entity_id):
        """
        # TODO: get entity by Atlas __guid or qualifiedName
        GET /v2/search/dsl?query=?
        """
        try:
            return self._root.get('entities/%s' % entity_id, headers=self.__headers, params=self.__params)
        except RestException as e:
            msg = 'Failed to get entity %s: %s' % (entity_id, str(e))
            LOG.error(msg)
            # Py3 fix: BaseException.message was removed; use str(e).
            raise CatalogApiException(str(e))

    def update_entity(self, entity, **metadata):
        """
        PUT /api/v3/entities/:id
        http://cloudera.github.io/navigator/apidocs/v3/path__v3_entities_-id-.html
        """
        try:
            # Workarounds NAV-6187: if we don't re-send those, they would get erased.
            properties = {
                'name': entity['name'],
                'description': entity['description'],
                'properties': entity['properties'] or {},
                'customProperties': entity['customProperties'] or {}
            }
            properties.update(metadata)
            data = json.dumps(properties)

            return self._root.put('entities/%(identity)s' % entity, params=self.__params, data=data,
                                  contenttype=_JSON_CONTENT_TYPE, allow_redirects=True, clear_cookies=True)
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception('Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to update entity', detail=e)

    def get_cluster_source_ids(self):
        # Cluster scoping is not implemented for Atlas; kept for interface parity.
        return []
        # params = (
        #   ('query', 'clusterName:"%s"' % get_navigator_hue_server_name()),
        #   ('limit', 200),
        # )
        # LOG.info(params)
        # return self._root.get('entities', headers=self.__headers, params=params)

    def add_tags(self, entity_id, tags):
        entity = self.get_entity(entity_id)
        new_tags = entity['tags'] or []
        new_tags.extend(tags)
        return self.update_entity(entity, tags=new_tags)

    def delete_tags(self, entity_id, tags):
        entity = self.get_entity(entity_id)
        new_tags = entity['tags'] or []
        for tag in tags:
            if tag in new_tags:
                new_tags.remove(tag)
        return self.update_entity(entity, tags=new_tags)

    def update_properties(self, entity_id, properties, modified_custom_metadata=None, deleted_custom_metadata_keys=None):
        entity = self.get_entity(entity_id)

        if modified_custom_metadata:
            properties['properties'] = entity['properties'] or {}
            properties['properties'].update(modified_custom_metadata)
        if deleted_custom_metadata_keys:
            properties['properties'] = entity['properties'] or {}
            for key in deleted_custom_metadata_keys:
                if key in properties['properties']:
                    del properties['properties'][key]

        return self.update_entity(entity, **properties)

    def delete_metadata_properties(self, entity_id, property_keys):
        entity = self.get_entity(entity_id)
        new_props = entity['properties'] or {}
        for key in property_keys:
            if key in new_props:
                del new_props[key]
        return self.update_entity(entity, properties=new_props)

    def get_lineage(self, entity_id):
        """
        GET /api/v3/lineage/entityIds=:id
        http://cloudera.github.io/navigator/apidocs/v3/path__v3_lineage.html
        """
        try:
            params = self.__params
            params += (('entityIds', entity_id), )

            return self._root.get('lineage', headers=self.__headers, params=params)
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception('Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to get lineage', detail=e)

    def create_namespace(self, namespace, description=None):
        try:
            data = json.dumps({'name': namespace, 'description': description})
            return self._root.post('models/namespaces/', data=data, contenttype=_JSON_CONTENT_TYPE, clear_cookies=True)
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception('Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to create namespace', detail=e)

    def get_namespace(self, namespace):
        try:
            return self._root.get('models/namespaces/%(namespace)s' % {'namespace': namespace})
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception('Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to get namespace', detail=e)

    def create_namespace_property(self, namespace, properties):
        try:
            data = json.dumps(properties)
            return self._root.post('models/namespaces/%(namespace)s/properties' % {'namespace': namespace},
                                   data=data, contenttype=_JSON_CONTENT_TYPE, clear_cookies=True)
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception('Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to create namespace', detail=e)

    def get_namespace_properties(self, namespace):
        try:
            return self._root.get('models/namespaces/%(namespace)s/properties' % {'namespace': namespace})
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception('Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to create namespace', detail=e)

    def map_namespace_property(self, clazz, properties):
        try:
            data = json.dumps(properties)
            return self._root.post('models/packages/nav/classes/%(class)s/properties' % {'class': clazz},
                                   data=data, contenttype=_JSON_CONTENT_TYPE, clear_cookies=True)
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception('Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to map class', detail=e)

    def get_model_properties_mapping(self):
        try:
            return self._root.get('models/properties/mappings')
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception('Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to get models properties mappings', detail=e)

    def _fillup_properties(self):
        """Create the Hue catalog namespace and its properties once per process (guarded by a module-level flag)."""
        global _HAS_CATALOG_NAMESPACE

        if _HAS_CATALOG_NAMESPACE is None:
            response = self.get_namespace(namespace=AtlasApi.CATALOG_NAMESPACE)
            if not response:
                self.create_namespace(namespace=AtlasApi.CATALOG_NAMESPACE,
                                      description="Set of fields to augment the data catalog")

            properties = self.get_namespace_properties(namespace=AtlasApi.CATALOG_NAMESPACE)

            if not [_property for _property in properties if _property['name'] == 'relatedDocuments']:
                self.create_namespace_property(namespace=AtlasApi.CATALOG_NAMESPACE, properties={
                    "name": "relatedDocuments",
                    "displayName": "Related documents",
                    "description": "List of Hue document UUIDs related to this entity",
                    "multiValued": True,
                    "maxLength": 36,
                    "pattern": ".*",  # UUID
                    "enumValues": None,
                    "type": "TEXT"
                })

                # Might want to check if the mapping is already done
                for clazz in ('hv_table', 'hv_view'):
                    self.map_namespace_property(clazz, properties=[{
                        "namespace": AtlasApi.CATALOG_NAMESPACE,
                        "name": "relatedDocuments"
                    }])

            _HAS_CATALOG_NAMESPACE = True

    def _get_boosted_term(self, term):
        return 'AND'.join([
            # Matching fields
            '(%s)' % 'OR'.join(['(%s:%s*^%s)' % (field, term, weight)
                                for (field, weight) in AtlasApi.DEFAULT_SEARCH_FIELDS]),
            # Boost entities with enriched fields
            '(%s)' % 'OR'.join(['(%s:[* TO *])' % field
                                for (field, weight) in AtlasApi.DEFAULT_SEARCH_FIELDS])
            # Could add certain customProperties and properties
        ])

    def _clean_path(self, path):
        return path.rstrip('/').split('/')[-1], self._escape_slashes(path.rstrip('/'))

    def _escape_slashes(self, s):
        # Fix: explicit backslash — the original '\/' relied on Python keeping
        # an invalid escape sequence verbatim (SyntaxWarning on recent Pythons).
        return s.replace('/', '\\/')
class NavigatorApi(object):
    """
    Client for the Cloudera Navigator metadata REST API (v3 entity search).

    http://cloudera.github.io/navigator/apidocs/v3/index.html
    """

    def __init__(self, api_url=None, user=None, password=None):
        self._api_url = '%s/%s' % ((api_url or NAVIGATOR.API_URL.get()).strip('/'), VERSION)
        self._username = user or NAVIGATOR.AUTH_USERNAME.get()
        self._password = password or NAVIGATOR.AUTH_PASSWORD.get()

        self._client = HttpClient(self._api_url, logger=LOG)
        self._client.set_basic_auth(self._username, self._password)
        self._root = resource.Resource(self._client)

        self.__headers = {}
        self.__params = ()

    def search_entities(self, query_s, limit=100, offset=0, **filters):
        """
        GET /api/v3/entities?query=()
        http://cloudera.github.io/navigator/apidocs/v3/path__v3_entities.html

        :param query_s: a query string of search terms (e.g. - sales quarterly);
          Currently the search will perform an OR boolean search for all terms
          (split on whitespace), against a whitelist of search_fields.
        TODO: support smarter boolean searching with arbitrary ordering and
        precedence of conditionals
        :param filters: TODO: IMPLEMENT ME, required to support property search
        """
        search_fields = ('originalName', 'originalDescription', 'name', 'description', 'tags')
        entity_types = ('DATABASE', 'TABLE', 'PARTITION', 'FIELD', 'FILE', 'OPERATION')

        try:
            params = self.__params
            search_terms = [term.lower() for term in query_s.strip().split()]

            query_clauses = []
            for term in search_terms:
                query_clauses.append('OR'.join(['(%s:*%s*)' % (field, term) for field in search_fields]))

            filter_query = '(originalName:*.*)'
            if search_terms:
                filter_query = 'OR'.join(['(%s)' % clause for clause in query_clauses])

            type_filter_clause = 'OR'.join(['(%s:%s)' % ('type', entity_type) for entity_type in entity_types])
            filter_query = '%sAND(%s)' % (filter_query, type_filter_clause)

            params += (
                ('query', filter_query),
                ('offset', offset),
                ('limit', limit),
            )
            response = self._root.get('entities', headers=self.__headers, params=params)

            return response
        except RestException as e:  # Py3 fix: was `except RestException, e`
            msg = 'Failed to search for entities with search query: %s' % query_s
            LOG.exception(msg)
            raise NavigatorApiException(msg)
class NavigatorApi(object):
    """
    Client for the Cloudera Navigator metadata REST API, with S3 support and
    Sentry-permission filtering of search results.

    http://cloudera.github.io/navigator/apidocs/v3/index.html
    """

    def __init__(self, user=None):
        self._api_url = '%s/%s' % (NAVIGATOR.API_URL.get().strip('/'), VERSION)
        self._username = NAVIGATOR.AUTH_USERNAME.get()
        self._password = NAVIGATOR.AUTH_PASSWORD.get()
        self.user = user

        self._client = HttpClient(self._api_url, logger=LOG)
        self._client.set_basic_auth(self._username, self._password)
        self._root = resource.Resource(self._client, urlencode=False)  # For search_entities_interactive

        self.__headers = {}
        self.__params = ()

    def _get_types_from_sources(self, sources):
        """
        Return (default_entity_types, entity_types) appropriate for the
        requested sources (sql/hive/impala, hdfs or s3).
        """
        default_entity_types = entity_types = ('DATABASE', 'TABLE', 'PARTITION', 'FIELD', 'FILE', 'VIEW', 'S3BUCKET', 'OPERATION', 'DIRECTORY')

        if 'sql' in sources or 'hive' in sources or 'impala' in sources:
            entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')
            default_entity_types = ('TABLE', 'VIEW')
        elif 'hdfs' in sources:
            entity_types = ('FILE', 'DIRECTORY')
            default_entity_types = ('FILE', 'DIRECTORY')
        elif 's3' in sources:
            entity_types = ('FILE', 'DIRECTORY', 'S3BUCKET')
            default_entity_types = ('DIRECTORY', 'S3BUCKET')

        return default_entity_types, entity_types

    def search_entities(self, query_s, limit=100, offset=0, **filters):
        """
        Solr edismax query parser syntax.

        :param query_s: a query string of search terms (e.g. - sales quarterly);
          Currently the search will perform an OR boolean search for all terms
          (split on whitespace), against a whitelist of search_fields.
        """
        search_fields = ('originalName', 'originalDescription', 'name', 'description', 'tags')

        sources = filters.get('sources', [])
        default_entity_types, entity_types = self._get_types_from_sources(sources)

        try:
            params = self.__params
            search_terms = [term for term in query_s.strip().split()]

            query_clauses = []
            user_filters = []
            source_type_filter = []

            for term in search_terms:
                if ':' not in term:
                    query_clauses.append('OR'.join(['(%s:*%s*)' % (field, term) for field in search_fields]))
                else:
                    # Bug fix: split on the first ':' only so values containing a
                    # colon no longer raise ValueError.
                    name, val = term.split(':', 1)
                    if val:
                        if name == 'type':
                            term = '%s:%s' % (name, val.upper().strip('*'))
                            default_entity_types = entity_types  # Make sure type value still makes sense for the source
                        user_filters.append(term + '*')  # Manual filter allowed e.g. type:VIE* ca

            filter_query = '*'
            if query_clauses:
                filter_query = 'OR'.join(['(%s)' % clause for clause in query_clauses])

            user_filter_clause = 'OR '.join(['(%s)' % f for f in user_filters]) or '*'
            source_filter_clause = 'OR'.join(['(%s:%s)' % ('type', entity_type) for entity_type in default_entity_types])
            if 's3' in sources:
                source_type_filter.append('sourceType:s3')

            filter_query = '%s AND (%s) AND (%s)' % (filter_query, user_filter_clause, source_filter_clause)
            if source_type_filter:
                filter_query += ' AND (%s)' % 'OR '.join(source_type_filter)
            if get_navigator_hue_server_name():
                # NOTE(review): no space before 'AND' — appears to rely on Solr tolerating ')AND'; confirm intended.
                filter_query += 'AND clusterName:%s' % get_navigator_hue_server_name()

            params += (
                ('query', filter_query),
                ('offset', offset),
                ('limit', NAVIGATOR.FETCH_SIZE_SEARCH.get()),  # fetch a server-side page, then trim to `limit` below
            )
            LOG.info(params)
            response = self._root.get('entities', headers=self.__headers, params=params)
            response = list(islice(self._secure_results(response), limit))  # Apply Sentry perms

            return response
        except RestException as e:  # Py3 fix: was `except RestException, e`
            msg = 'Failed to search for entities with search query: %s' % query_s
            LOG.exception(msg)
            raise NavigatorApiException(msg)
class NavigatorApi(object):
    """
    Client for the Cloudera Navigator metadata REST API, applying Sentry
    permissions to search results.

    http://cloudera.github.io/navigator/apidocs/v3/index.html
    """

    def __init__(self, user=None):
        self._api_url = '%s/%s' % (NAVIGATOR.API_URL.get().strip('/'), VERSION)
        self._username = NAVIGATOR.AUTH_USERNAME.get()
        self._password = NAVIGATOR.AUTH_PASSWORD.get()
        self.user = user

        self._client = HttpClient(self._api_url, logger=LOG)
        self._client.set_basic_auth(self._username, self._password)
        self._root = resource.Resource(self._client, urlencode=False)  # For search_entities_interactive

        self.__headers = {}
        self.__params = ()

    def _get_types_from_sources(self, sources):
        """
        Return (default_entity_types, entity_types) appropriate for the
        requested sources (sql/hive/impala vs hdfs).
        """
        default_entity_types = entity_types = ('DATABASE', 'TABLE', 'PARTITION', 'FIELD', 'FILE', 'VIEW', 'OPERATION', 'DIRECTORY')

        if 'sql' in sources or 'hive' in sources or 'impala' in sources:
            default_entity_types = ('TABLE', 'VIEW')
            entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')
        elif 'hdfs' in sources:
            entity_types = ('FILE', 'DIRECTORY')
            default_entity_types = ('FILE', 'DIRECTORY')

        return default_entity_types, entity_types

    def search_entities(self, query_s, limit=100, offset=0, **filters):
        """
        Solr edismax query parser syntax.

        :param query_s: a query string of search terms (e.g. - sales quarterly);
          Currently the search will perform an OR boolean search for all terms
          (split on whitespace), against a whitelist of search_fields.
        """
        search_fields = ('originalName', 'originalDescription', 'name', 'description', 'tags')

        sources = filters.get('sources', [])
        default_entity_types, entity_types = self._get_types_from_sources(sources)

        try:
            params = self.__params
            search_terms = [term for term in query_s.strip().split()]

            query_clauses = []
            user_filters = []
            for term in search_terms:
                if ':' not in term:
                    query_clauses.append('OR'.join(['(%s:*%s*)' % (field, term) for field in search_fields]))
                else:
                    # Bug fix: split on the first ':' only so values containing a
                    # colon no longer raise ValueError.
                    name, val = term.split(':', 1)
                    if val:
                        if name == 'type':
                            term = '%s:%s' % (name, val.upper().strip('*'))
                            default_entity_types = entity_types  # Make sure type value still makes sense for the source
                        user_filters.append(term + '*')  # Manual filter allowed e.g. type:VIE* ca

            filter_query = '*'
            if query_clauses:
                filter_query = 'OR'.join(['(%s)' % clause for clause in query_clauses])

            user_filter_clause = 'OR '.join(['(%s)' % f for f in user_filters]) or '*'
            source_filter_clause = 'OR'.join(['(%s:%s)' % ('type', entity_type) for entity_type in default_entity_types])

            filter_query = '%s AND (%s) AND (%s)' % (filter_query, user_filter_clause, source_filter_clause)

            params += (
                ('query', filter_query),
                ('offset', offset),
                ('limit', limit),
            )
            LOG.info(params)
            response = self._root.get('entities', headers=self.__headers, params=params)
            self._secure_results(response)  # Apply Sentry perms

            return response
        except RestException as e:  # Py3 fix: was `except RestException, e`
            msg = 'Failed to search for entities with search query: %s' % query_s
            LOG.exception(msg)
            raise NavigatorApiException(msg)