def get_topic(name):
  if has_kafka_api():
    pass  # TODO: not implemented yet for the native Kafka API; currently returns None
  else:
    manager = ManagerApi()
    broker_host = manager.get_kafka_brokers().split(',')[0].split(':')[0]
    return manager.get_kafka_topics(broker_host)[name]
def get_hosts(request):
  response = {'status': 0}

  api = ManagerApi(request.user)

  if request.POST.get('service', '').lower() == 'flume':
    response['hosts'] = api.get_flume_agents()

  return JsonResponse(response)
def start(self, destination_name, file_format, destination):
  responses = {'status': 0}

  api = ManagerApi(self.user)

  for config_name, config_value in self.generate_config(file_format, destination):
    responses[config_name] = api.update_flume_config(cluster_name=None, config_name=config_name, config_value=config_value)

  responses['refresh_flume'] = api.refresh_flume(cluster_name=None, restart=True)

  if destination['ouputFormat'] == 'index':  # NB: 'ouputFormat' (sic) is the key spelling used throughout this code
    responses['pubSubUrl'] = 'assist.collections.refresh'
    responses['on_success_url'] = reverse('search:browse', kwargs={'name': destination_name})

  return responses
def get_topics():
  if has_kafka_api():
    return KafkaApi().topics()
  else:
    try:
      manager = ManagerApi()
      broker_host = manager.get_kafka_brokers().split(',')[0].split(':')[0]
      return [name for name in manager.get_kafka_topics(broker_host) if not name.startswith('__')]
    except Exception:
      return ['user_behavior']  # Fallback demo topic when the brokers cannot be reached
def get_topics():
  if has_kafka_api():
    return KafkaApi().topics()
  else:
    try:
      manager = ManagerApi()
      broker_host = manager.get_kafka_brokers().split(',')[0].split(':')[0]
      return [name for name in manager.get_kafka_topics(broker_host) if not name.startswith('__')]
    except Exception as e:  # The original used Python 2 'except Exception, e' syntax
      print(e)
      return ['traffic', 'hueAccessLogs']  # Fallback demo topics
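# The first-broker parsing above recurs in several of these snippets. A minimal
# sketch of the same logic as a standalone helper (the helper name is
# hypothetical, not part of the original code), assuming get_kafka_brokers()
# returns a comma-separated 'host:port' list such as 'broker-1:9092,broker-2:9092':
def _first_broker_host(manager):
  brokers = manager.get_kafka_brokers()  # e.g. 'broker-1:9092,broker-2:9092'
  return brokers.split(',')[0].split(':')[0]  # -> 'broker-1'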
def get_spark_history_server_from_cm():
  from metadata.conf import MANAGER
  from metadata.manager_client import ManagerApi

  if MANAGER.API_URL.get():
    return ManagerApi().get_spark_history_server_url()
  return None
def get_daemon_config(key):
  from metadata.conf import MANAGER
  from metadata.manager_client import ManagerApi

  if MANAGER.API_URL.get():
    return ManagerApi().get_impalad_config(key=key, impalad_host=SERVER_HOST.get())
  return None
def get_spark_history_server_security_enabled():
  """
  Try to get the Spark history server security status from the Cloudera Manager API, otherwise default to False.
  """
  from metadata.conf import MANAGER
  from metadata.manager_client import ManagerApi

  if MANAGER.API_URL.get():
    return ManagerApi().get_spark_history_server_security_enabled()
  return False
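# A minimal usage sketch for the Cloudera Manager getters above. The caller and
# its default URL are hypothetical, not part of the original code; 18088 is the
# standard Spark history server port.
def resolve_spark_history_server_url(default_url='http://localhost:18088'):
  url = get_spark_history_server_from_cm()
  return url if url else default_url  # Fall back when MANAGER.API_URL is unset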
def update_flume_config(request):
  api = ManagerApi(request.user)

  flume_agent_config = '''tier1.sources = source1
tier1.channels = channel1
tier1.sinks = sink1

tier1.sources.source1.type = exec
tier1.sources.source1.command = tail -F /var/log/hue-httpd/access_log
tier1.sources.source1.channels = channel1

tier1.channels.channel1.type = memory
tier1.channels.channel1.capacity = 10000
tier1.channels.channel1.transactionCapacity = 1000

# Solr Sink configuration
tier1.sinks.sink1.type = org.apache.flume.sink.solr.morphline.MorphlineSolrSink
tier1.sinks.sink1.morphlineFile = morphlines.conf
tier1.sinks.sink1.morphlineId = hue_accesslogs_no_geo
tier1.sinks.sink1.channel = channel1'''

  morphline_config = open(os.path.join(config_morphline_path(), 'hue_accesslogs_no_geo.morphline.conf')).read()
  morphline_config = morphline_config.replace('${SOLR_COLLECTION}', 'log_analytics_demo').replace('${ZOOKEEPER_ENSEMBLE}', '%s/solr' % zkensemble())

  responses = {}
  responses['agent_config_file'] = api.update_flume_config(cluster_name=None, config_name='agent_config_file', config_value=flume_agent_config)
  responses['agent_morphlines_conf_file'] = api.update_flume_config(cluster_name=None, config_name='agent_morphlines_conf_file', config_value=morphline_config)
  responses['refresh_flume'] = api.refresh_flume(cluster_name=None, restart=True)

  return JsonResponse(responses)
def generate_config(self, source, destination):
  configs = []

  if source['channelSourceType'] == 'directory':
    agent_source = '''
tier1.sources.source1.type = exec
tier1.sources.source1.command = tail -F %(directory)s
tier1.sources.source1.channels = channel1
''' % {
      'directory': source['channelSourcePath']
    }
  elif source['channelSourceType'] == 'kafka':
    # TODO: brokers, topics and consumer group are hardcoded placeholders; they
    # should come from ManagerApi().get_kafka_brokers() and the source spec.
    # (The original applied a '%' substitution here with no placeholders in the
    # string; it was a no-op and has been dropped.)
    agent_source = '''
tier1.sources.source1.type = org.apache.flume.source.kafka.KafkaSource
tier1.sources.source1.channels = channel1
tier1.sources.source1.batchSize = 5000
tier1.sources.source1.batchDurationMillis = 2000
tier1.sources.source1.kafka.bootstrap.servers = localhost:9092
tier1.sources.source1.kafka.topics = test1, test2
tier1.sources.source1.kafka.consumer.group.id = custom.g.id
'''
  else:
    raise PopupException(_('Input format not recognized: %(channelSourceType)s') % source)

  if destination['ouputFormat'] == 'file':
    agent_sink = '''
a1.channels = c1
a1.sinks = k1
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = /flume/events/%y-%m-%d/%H%M/%S
a1.sinks.k1.hdfs.filePrefix = events-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute'''
  elif destination['ouputFormat'] == 'table':
    # Raw string so the Flume config receives the literal "\t" delimiters
    # (a plain string would embed actual tab characters instead).
    agent_sink = r'''
a1.channels = c1
a1.channels.c1.type = memory
a1.sinks = k1
a1.sinks.k1.type = hive
a1.sinks.k1.channel = c1
a1.sinks.k1.hive.metastore = thrift://127.0.0.1:9083
a1.sinks.k1.hive.database = logsdb
a1.sinks.k1.hive.table = weblogs
a1.sinks.k1.hive.partition = asia,%{country},%y-%m-%d-%H-%M
a1.sinks.k1.useLocalTimeStamp = false
a1.sinks.k1.round = true
a1.sinks.k1.roundValue = 10
a1.sinks.k1.roundUnit = minute
a1.sinks.k1.serializer = DELIMITED
a1.sinks.k1.serializer.delimiter = "\t"
a1.sinks.k1.serializer.serdeSeparator = '\t'
a1.sinks.k1.serializer.fieldnames = id,,msg'''
  elif destination['ouputFormat'] == 'kafka':
    manager = ManagerApi()
    agent_sink = '''
tier1.sinks.sink1.type = org.apache.flume.sink.kafka.KafkaSink
tier1.sinks.sink1.topic = hueAccessLogs
tier1.sinks.sink1.brokerList = %(brokers)s
tier1.sinks.sink1.channel = channel1
tier1.sinks.sink1.batchSize = 20''' % {
      'brokers': manager.get_kafka_brokers()
    }
  elif destination['ouputFormat'] == 'index':
    # Morphline file
    configs.append(self.generate_morphline_config(destination))
    # Flume config
    agent_sink = '''
tier1.sinks.sink1.type = org.apache.flume.sink.solr.morphline.MorphlineSolrSink
tier1.sinks.sink1.morphlineFile = morphlines.conf
tier1.sinks.sink1.morphlineId = hue_accesslogs_no_geo
tier1.sinks.sink1.channel = channel1'''
  else:
    raise PopupException(_('Output format not recognized: %(ouputFormat)s') % destination)

  # TODO: use agent id: input + output and do not override all the configs
  # TODO: use Kafka channel if possible
  flume_config = '''tier1.sources = source1
tier1.channels = channel1
tier1.sinks = sink1

%(sources)s

tier1.channels.channel1.type = memory
tier1.channels.channel1.capacity = 10000
tier1.channels.channel1.transactionCapacity = 1000

%(sinks)s''' % {
    'sources': agent_source,
    'sinks': agent_sink,
  }
  configs.append(('agent_config_file', flume_config))

  return configs
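# For reference, a hypothetical invocation of generate_config() above; the
# input values are illustrative only. It returns (config_name, config_value)
# tuples, which is exactly what start() earlier pushes through
# ManagerApi.update_flume_config():
source = {'channelSourceType': 'directory', 'channelSourcePath': '/var/log/hue-httpd/access_log'}
destination = {'ouputFormat': 'kafka'}
# self.generate_config(source, destination)
# -> [('agent_config_file', 'tier1.sources = source1\n...')]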
def hello(request):
  api = ManagerApi(request.user)

  response = api.tools_echo()

  return JsonResponse(response)
def _envelope_job(request, file_format, destination, start_time=None, lib_path=None):
  collection_name = destination['name']
  indexer = EnvelopeIndexer(request.user, request.fs)

  lib_path = '/tmp/envelope-0.5.0.jar'
  input_path = None
  properties = {}  # Guard against NameError when no branch below sets it (e.g. 'table' input)

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % file_format['path']
    properties = {'format': 'json'}
  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'sfdc':
      properties = {
        'streamSelection': file_format['streamSelection'],
        'streamUsername': file_format['streamUsername'],
        'streamPassword': file_format['streamPassword'],
        'streamToken': file_format['streamToken'],
        'streamEndpointUrl': file_format['streamEndpointUrl'],
        'streamObject': file_format['streamObject'],
      }
    elif file_format['streamSelection'] == 'kafka':
      manager = ManagerApi()
      properties = {
        'brokers': manager.get_kafka_brokers(),
        'output_table': 'impala::%s' % collection_name,
        'topics': file_format['kafkaSelectedTopics'],
        'kafkaFieldType': file_format['kafkaFieldType'],
        'kafkaFieldDelimiter': file_format['kafkaFieldDelimiter'],
        'kafkaFieldNames': file_format['kafkaFieldNames'],
        'kafkaFieldTypes': file_format['kafkaFieldTypes']
      }

  if destination['outputFormat'] == 'table':
    if destination['isTargetExisting']:
      pass  # TODO: check if the format matches
    else:
      sql = SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(file_format, destination).get_str()
      print(sql)  # The original used the Python 2 print statement
    if destination['tableFormat'] == 'kudu':
      manager = ManagerApi()
      properties['output_table'] = 'impala::%s' % collection_name
      properties['kudu_master'] = manager.get_kudu_master()
    else:
      properties['output_table'] = collection_name
  elif destination['outputFormat'] == 'file':
    properties['path'] = file_format['path']
    properties['format'] = file_format['tableFormat']  # or csv
  elif destination['outputFormat'] == 'index':
    properties['collectionName'] = collection_name
    properties['connection'] = SOLR_URL.get()
    if destination['isTargetExisting']:
      pass  # TODO: check if the format matches
    else:
      client = SolrClient(request.user)
      kwargs = {}
      _create_solr_collection(request.user, request.fs, client, destination, collection_name, kwargs)

  properties['app_name'] = 'Data Ingest'
  properties['inputFormat'] = file_format['inputFormat']
  properties['ouputFormat'] = destination['ouputFormat']
  properties['streamSelection'] = file_format['streamSelection']

  envelope = indexer.generate_config(properties)

  return indexer.run(request, collection_name, envelope, input_path, start_time=start_time, lib_path=lib_path)
def _envelope_job(request, file_format, destination, start_time=None, lib_path=None):
  collection_name = destination['name']
  indexer = EnvelopeIndexer(request.user, request.fs)

  lib_path = None  # TODO: optional input field
  input_path = None
  properties = {}  # Guard against NameError when no branch below sets it

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = file_format['path']
    properties = {'input_path': input_path, 'format': 'csv'}
  elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
    pass
  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'kafka':
      manager = ManagerApi()
      properties = {
        'brokers': manager.get_kafka_brokers(),
        'topics': file_format['kafkaSelectedTopics'],
        'kafkaFieldType': file_format['kafkaFieldType'],
        'kafkaFieldDelimiter': file_format['kafkaFieldDelimiter'],
      }

      if file_format.get('kafkaSelectedTopics') == 'NavigatorAuditEvents':
        schema_fields = MorphlineIndexer.get_kept_field_list(file_format['sampleCols'])
        properties.update({
          'kafkaFieldNames': ', '.join([_field['name'] for _field in schema_fields]),
          'kafkaFieldTypes': ', '.join([_field['type'] for _field in schema_fields])
        })
      else:
        properties.update({
          'kafkaFieldNames': file_format['kafkaFieldNames'],
          'kafkaFieldTypes': file_format['kafkaFieldTypes']
        })

      if True:  # TODO: dead branch, windowing is currently always disabled
        properties['window'] = ''
      else:  # For "KafkaSQL"
        properties['window'] = '''
    window {
        enabled = true
        milliseconds = 60000
    }'''
  elif file_format['inputFormat'] == 'connector':
    if file_format['streamSelection'] == 'flume':
      properties = {
        'streamSelection': file_format['streamSelection'],
        'channelSourceHosts': file_format['channelSourceHosts'],
        'channelSourceSelectedHosts': file_format['channelSourceSelectedHosts'],
        'channelSourcePath': file_format['channelSourcePath'],
      }
    else:  # sfdc
      properties = {
        'streamSelection': file_format['streamSelection'],
        'streamUsername': file_format['streamUsername'],
        'streamPassword': file_format['streamPassword'],
        'streamToken': file_format['streamToken'],
        'streamEndpointUrl': file_format['streamEndpointUrl'],
        'streamObject': file_format['streamObject'],
      }

  if destination['outputFormat'] == 'table':
    if destination['isTargetExisting']:
      pass  # TODO: check if the format matches
    else:
      destination['importData'] = False  # Avoid LOAD DATA
      if destination['tableFormat'] == 'kudu':
        properties['kafkaFieldNames'] = properties['kafkaFieldNames'].lower()  # Kudu names should be all lowercase
      # Create table
      if not request.POST.get('show_command'):
        SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(file_format, destination).execute(request)

    if destination['tableFormat'] == 'kudu':
      manager = ManagerApi()
      properties['output_table'] = 'impala::%s' % collection_name
      properties['kudu_master'] = manager.get_kudu_master()
    else:
      properties['output_table'] = collection_name
  elif destination['outputFormat'] == 'stream':
    manager = ManagerApi()
    properties['brokers'] = manager.get_kafka_brokers()
    properties['topics'] = file_format['kafkaSelectedTopics']
    properties['kafkaFieldDelimiter'] = file_format['kafkaFieldDelimiter']
  elif destination['outputFormat'] == 'file':
    properties['path'] = file_format['path']
    if file_format['inputFormat'] == 'stream':
      properties['format'] = 'csv'
    else:
      properties['format'] = file_format['tableFormat']  # or csv
  elif destination['outputFormat'] == 'index':
    properties['collectionName'] = collection_name
    properties['connection'] = SOLR_URL.get()

  properties['app_name'] = 'Data Ingest'
  properties['inputFormat'] = file_format['inputFormat']
  properties['ouputFormat'] = destination['ouputFormat']
  properties['streamSelection'] = file_format['streamSelection']

  configs = indexer.generate_config(properties)

  if request.POST.get('show_command'):
    return {'status': 0, 'commands': configs['envelope.conf']}
  else:
    return indexer.run(request, collection_name, configs, input_path, start_time=start_time, lib_path=lib_path)
def _envelope_job(request, file_format, destination, start_time=None, lib_path=None):
  collection_name = destination['name']
  indexer = EnvelopeIndexer(request.user, request.fs)

  lib_path = None  # TODO: optional input field
  input_path = None
  properties = {}  # Guard against NameError when no branch below sets it

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = file_format['path']
    properties = {'input_path': input_path, 'format': 'csv'}
  elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
    pass
  elif file_format['inputFormat'] in ('stream', 'sfdc'):
    if file_format['inputFormat'] == 'sfdc':
      properties = {
        'streamSelection': file_format['streamSelection'],
        'streamUsername': file_format['streamUsername'],
        'streamPassword': file_format['streamPassword'],
        'streamToken': file_format['streamToken'],
        'streamEndpointUrl': file_format['streamEndpointUrl'],
        'streamObject': file_format['streamObject'],
      }
    elif file_format['streamSelection'] == 'kafka':
      manager = ManagerApi()
      properties = {
        'brokers': manager.get_kafka_brokers(),
        'topics': file_format['kafkaSelectedTopics'],
        'kafkaFieldType': file_format['kafkaFieldType'],
        'kafkaFieldDelimiter': file_format['kafkaFieldDelimiter'],
        'kafkaFieldNames': file_format['kafkaFieldNames'],
        'kafkaFieldTypes': file_format['kafkaFieldTypes']
      }

      if True:  # TODO: dead branch, windowing is currently always disabled
        properties['window'] = ''
      else:  # For "KafkaSQL"
        properties['window'] = '''
    window {
        enabled = true
        milliseconds = 60000
    }'''

  if destination['outputFormat'] == 'table':
    if destination['isTargetExisting']:
      pass  # TODO: check if the format matches
    else:
      sql = SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(file_format, destination).get_str()
      print(sql)  # The original used the Python 2 print statement
    if destination['tableFormat'] == 'kudu':
      manager = ManagerApi()
      properties['output_table'] = 'impala::%s' % collection_name
      properties['kudu_master'] = manager.get_kudu_master()
    else:
      properties['output_table'] = collection_name
  elif destination['outputFormat'] == 'file':
    properties['path'] = file_format['path']
    if file_format['inputFormat'] == 'stream':
      properties['format'] = 'csv'
    else:
      properties['format'] = file_format['tableFormat']  # or csv
  elif destination['outputFormat'] == 'index':
    properties['collectionName'] = collection_name
    properties['connection'] = SOLR_URL.get()
    # Not needed anymore
    # if destination['isTargetExisting']:
    #   pass  # TODO: check if the format matches
    # else:
    #   client = SolrClient(request.user)
    #   kwargs = {}
    #   _create_solr_collection(request.user, request.fs, client, destination, collection_name, kwargs)
  elif destination['outputFormat'] == 'stream':
    manager = ManagerApi()
    properties['brokers'] = manager.get_kafka_brokers()
    properties['topics'] = file_format['kafkaSelectedTopics']
    properties['kafkaFieldDelimiter'] = file_format['kafkaFieldDelimiter']

  properties['app_name'] = 'Data Ingest'
  properties['inputFormat'] = file_format['inputFormat']
  properties['ouputFormat'] = destination['ouputFormat']
  properties['streamSelection'] = file_format['streamSelection']

  envelope = indexer.generate_config(properties)

  return indexer.run(request, collection_name, envelope, input_path, start_time=start_time, lib_path=lib_path)
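# For reference, a hypothetical 'properties' dict for a Kafka -> Kudu ingest as
# assembled by the _envelope_job variants above. Every key appears in the code;
# the values are illustrative only, not taken from a real cluster:
properties = {
  'app_name': 'Data Ingest',
  'inputFormat': 'stream',
  'streamSelection': 'kafka',
  'brokers': 'broker-1:9092,broker-2:9092',   # from ManagerApi().get_kafka_brokers()
  'topics': 'user_behavior',
  'kafkaFieldType': 'delimited',
  'kafkaFieldDelimiter': ',',
  'kafkaFieldNames': 'id,msg',                # lowercased for Kudu
  'kafkaFieldTypes': 'int,string',
  'window': '',
  'ouputFormat': 'table',                     # (sic) key spelling preserved from the source
  'output_table': 'impala::user_behavior',
  'kudu_master': 'manager-host:7051',         # from ManagerApi().get_kudu_master()
}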