def create_demo_kafka_topic():
    '''Creates a Kafka topic for the demo if it doesn't already exist.

    The caveat in using this is that Kafka must be installed on the same machine
    as the demo, and thus the same machine as Ambari as well. The function will
    try to start the Kafka service through Ambari and, once the service is
    started, it will use the Kafka topics script to create the topic. The name
    for the topic is specified in ``global.conf``.

    Args:
        N/A

    Returns:
        bool: True if the creation is successful. False otherwise.
    '''
    conf = config.read_config('global.conf')
    am_conf = conf['AMBARI']
    amc = Ambari(am_conf['username'], am_conf['password'],
                 am_conf['proto'], am_conf['server'], am_conf['port'])

    logger.info('Starting Kafka Broker')
    if amc.service_action('Sandbox', 'KAFKA', 'START'):
        sh = Shell()
        topics_script = conf['DEMO']['kafka_topics_script']
        zk = conf['DEMO']['zk_connection']
        topic_name = conf['DEMO']['kafka_topic_name']
        logger.info('Attempting to create new Kafka Topic')
        out = sh.run(topics_script + ' --create --zookeeper ' + zk +
                     ' --replication-factor 1 --partitions 1 --topic ' + topic_name)
        logger.debug(str(out))
        # The second element of the shell output holds stderr; empty means success.
        return len(out[1]) == 0
    return False
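# Illustrative sketch (not part of the demo code): the global.conf entries that
# create_demo_kafka_topic() reads. Section and key names come from the function
# above; the values shown are assumptions for a typical sandbox install.
#
# [AMBARI]
# username = admin
# password = admin
# proto = http
# server = 127.0.0.1
# port = 8080
#
# [DEMO]
# kafka_topics_script = /usr/hdp/current/kafka-broker/bin/kafka-topics.sh
# zk_connection = localhost:2181
# kafka_topic_name = demo_topic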
def get_kafka_topics():
    '''List the Kafka topics on the current installation.

    Requires that Kafka is installed on the same machine and Ambari is up and
    running. Will start the service and use the Kafka scripts to list out all
    of the topics.

    Args:
        N/A

    Returns:
        list: On success, a list of topic names. If the topics could not be
        retrieved, a two-element list where [0] is an empty string and [1] is
        an error message.
    '''
    conf = config.read_config('global.conf')
    am_conf = conf['AMBARI']
    amc = Ambari(am_conf['username'], am_conf['password'],
                 am_conf['proto'], am_conf['server'], am_conf['port'])

    logger.info('Starting Kafka Broker')
    if amc.service_action('Sandbox', 'KAFKA', 'START'):
        sh = Shell()
        topics_script = conf['DEMO']['kafka_topics_script']
        zk = conf['DEMO']['zk_connection']
        logger.info('Attempting to list Kafka Topics')
        out = sh.run(topics_script + ' --list --zookeeper ' + zk)
        if len(out[1]) == 0:
            topics = out[0].strip().split('\n')
            logger.info('Kafka topics output: ' + str(topics))
            return topics
    return ['', 'Unable to get topics. Could not start Kafka Broker']
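# Illustrative usage sketch, not part of the demo code: create the demo topic
# only when it is missing, mirroring how ThreadedGenerator.__init__ uses these
# helpers below. The helper name and 'demo_topic' are placeholders;
# 'demo_topic' stands in for conf['DEMO']['kafka_topic_name'].
def _example_ensure_demo_topic():
    topics = get_kafka_topics()
    if 'demo_topic' not in topics:
        if not create_demo_kafka_topic():
            raise EnvironmentError('Could not create Kafka Topic')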
def on_service_start():
    '''This method will run every time the service starts.

    Fill in this method with any necessary commands to set up and start other
    services for the demo.

    Note that this method will always be the very last thing to be executed
    upon starting the demo service.
    '''
    print('Running on_service_start')
    cfg = config.read_config('global.conf')

    # Ambari client
    amc = Ambari(config=cfg['AMBARI'])

    # Queue service start requests
    amc.service_action('Sandbox', 'KAFKA', 'START', queue=True)
    amc.service_action('Sandbox', 'ZEPPELIN', 'START', queue=True)

    try:
        # Not guaranteed to be installed
        amc.service_action('Sandbox', 'NIFI', 'START', queue=True)
    except Exception:
        log.warn('Failed to start NiFi')

    service_installer.add_zeppelin_notebooks()

    # Add anything else below that might be necessary for when the demo starts
    pass
def __init__(self, schema, bps, outputs, data_pool_size=100):
    threading.Thread.__init__(self)
    self.outputs = outputs
    self.daemon = True
    self.flag = True
    self.data_pool_size = data_pool_size
    self.http_data_pool = []
    self.hdfs_data_pool = []

    if bps > 0:
        self.bps = bps
    else:
        self.bps = 50000  # 50 KB default

    self.gen = generator.DataGenerator(schema)
    conf = config.read_config('global.conf')['DEMO']
    self.exports = {}

    if 'KAFKA' in outputs:
        self.kafka_topic = conf['kafka_topic_name']
        self.kafka_listener = conf['data_kafka_listener']
        has_topic = False
        # Create the demo topic if it isn't already present
        if self.kafka_topic not in get_kafka_topics():
            topic_created = create_demo_kafka_topic()
            if topic_created:
                has_topic = True
            else:
                raise EnvironmentError('Could not create Kafka Topic')
        else:
            has_topic = True

        if has_topic:
            self.exports['KAFKA'] = True
            self.kafka_producer = KafkaProducer(bootstrap_servers=self.kafka_listener)
        else:
            self.exports['KAFKA'] = False
            msg = 'Could not create Kafka Topic. Please create manually'
            logger.warn(msg)
            raise EnvironmentError(msg)
    else:
        self.exports['KAFKA'] = False

    if 'FILE' in outputs:
        self.export_filename = conf['data_write_file_location']
        self.exports['FILE'] = True
        # Truncate (or create) the export file before writing
        with open(self.export_filename, 'w') as ex_data:
            pass
    else:
        self.exports['FILE'] = False

    if 'HDFS' in outputs:
        self.export_hdfs_file = conf['data_write_hdfs_file_location']
        self.exports['HDFS'] = True
    else:
        self.exports['HDFS'] = False

    if 'HTTP' in outputs:
        self.export_http_url = conf['data_http_endpoint']
        self.exports['HTTP'] = True
    else:
        self.exports['HTTP'] = False
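# Illustrative sketch, not part of the demo code: constructing and starting the
# generator thread from the values the demo server reads out of global.conf.
# The helper name and the choice of outputs are assumptions; any subset of
# OUTPUTS should work.
def _example_start_generator(schema, throughput):
    thread = ThreadedGenerator(schema, int(throughput), ['KAFKA', 'FILE'])
    thread.start()
    return thread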
def test_good_file(self, mock1):
    params = config.read_config('res/good-test.properties')
    assert params['SECTION1']['key1'] == 'val1'
    assert params['SECTION2']['key2'] == 'val2'
    assert params['SECTION3']['key3'] == 'val3'
    assert params['SECTION3']['key4'] == 'val4'
    assert len(params['SECTION3']) > 1
    assert len(params['SECTION2']) > 0
    assert len(params['SECTION1']) > 0
    assert len(params) == 3
def test_missing_file(self, mock1):
    try:
        params = config.read_config('nofile')
        params = config.read_xml_config('nofile')
        self.fail('Should have thrown IOError')
    except IOError as e:
        if 'Could not find file' not in str(e):
            assert 0

    try:
        params = config.read_xml_config('nofile')
        self.fail('Should have thrown IOError')
    except IOError as e:
        if 'Could not find file' not in str(e):
            assert 0
def generate_queries(schema, table_name='demo_table'):
    '''Generate test queries based on a configuration for the data generator.

    Currently supported components:
        - Spark
        - Hive

    Args:
        schema (str): The schema for the generator as a JSON string.
        table_name (str, optional): Name of the table referenced in the
            generated queries. Defaults to ``demo_table``.

    Returns:
        dict: Keyed by component name (``HIVE``, ``SPARK``); each value is a
        dict that maps a query description to a query string.
    '''
    logger.info('Building queries')
    fields = json.loads(schema)
    conf = config.read_config('global.conf')['DEMO']
    hdfs_file_path = conf['data_write_hdfs_file_location']
    hdfs_data_dir = os.path.dirname(hdfs_file_path)
    queries = {}

    # Build the Hive DDL statements
    hive_queries = {
        'Basic Table': 'CREATE TABLE IF NOT EXISTS ' + table_name,
        'External Table': 'CREATE EXTERNAL TABLE IF NOT EXISTS ' + table_name,
        'Drop Table': 'DROP TABLE ' + table_name,
        'HDFS CSV': 'CREATE EXTERNAL TABLE IF NOT EXISTS ' + table_name,
    }

    cols = [[str(d['fieldName']), str(HIVE_TYPE_MAP[d['type']])] for d in fields]
    ftypes = sorted(' '.join(c) for c in cols)
    field_set = ' (' + ', '.join(ftypes) + ')'
    hive_queries['Basic Table'] += field_set
    hive_queries['External Table'] += field_set + ' LOCATION \'' + hdfs_data_dir + '\''
    hive_queries['HDFS CSV'] += (field_set + '\nROW FORMAT\nDELIMITED FIELDS TERMINATED BY \', \''
                                 '\nSTORED AS TEXTFILE\nLOCATION \'' + hdfs_data_dir + '\'')
    queries['HIVE'] = hive_queries

    spark_queries = {
        'RDD and Temporary DataFrame': '',
        'SparkSQL - Select all': ''
    }

    # Build a case class, load the CSV from HDFS, and register a temp table
    rdd_temp = ''
    class_name = 'Data'
    cols = [[str(d['fieldName']), str(SPARK_TYPE_MAP[d['type']])] for d in fields]
    ftypes = sorted(': '.join(c) for c in cols)
    field_set = '(' + ', '.join(ftypes) + ');'
    rdd_temp += 'case class ' + class_name + field_set + '\n'
    rdd_temp += 'val csv = sc.textFile("hdfs:' + hdfs_file_path + '");\n'
    rdd_temp += 'val data = csv.map(line => line.split(",").map(e => e.trim));\n'

    lambda_arg = 'a'
    class_args = []
    sort_cols = sorted(cols)
    for i in range(len(cols)):
        # Build the class constructor for the map function
        s = lambda_arg + '(' + str(i) + ').to' + str(sort_cols[i][1])
        class_args.append(s)

    lambda_func = lambda_arg + ' => ' + class_name + '(' + ', '.join(class_args) + ')'
    rdd_temp += 'val df = data.map(' + lambda_func + ').toDF();\n'
    rdd_temp += 'df.registerTempTable("' + table_name + '");'

    spark_queries['RDD and Temporary DataFrame'] = rdd_temp
    spark_queries['SparkSQL - Select all'] = 'SELECT * FROM ' + table_name
    queries['SPARK'] = spark_queries
    return queries
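# Illustrative sketch, not part of the demo code: a two-field schema passed to
# generate_queries(). The 'fieldName'/'type' keys match what the function reads;
# the type names 'string' and 'int' are assumed to be keys of HIVE_TYPE_MAP and
# SPARK_TYPE_MAP, and the helper name is a placeholder.
def _example_generate_queries():
    sample_schema = json.dumps([
        {'fieldName': 'name', 'type': 'string'},
        {'fieldName': 'age', 'type': 'int'},
    ])
    queries = generate_queries(sample_schema, table_name='demo_table')
    print(queries['HIVE']['Basic Table'])
    print(queries['SPARK']['SparkSQL - Select all'])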
def test_missing_header(self, mock1):
    try:
        params = config.read_config('res/bad-test.properties')
        assert 0
    except MissingSectionHeaderError as err:
        assert 1
import os
import sys

filedir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(filedir + '/../demo_utils')

import threading, time, json, flask, cluster, logging
from flask import Flask, request
from demo_utils import config, generator
from cluster import ThreadedGenerator
from demo_utils import logs
from ws4py import configure_logger
from flask_cors import CORS

log = logs.Logger('DEMO_SERVER.py').getLogger()

OUTPUTS = ['FILE', 'KAFKA', 'HTTP', 'HDFS']
'''The four different types of outputs from the generator'''

conf = config.read_config('global.conf')

app = Flask(__name__, static_url_path='')
CORS(app)

app_port = int(conf['DEMO']['server_port'])
schema = conf['DEMO']['data_schema']
throughput = conf['DEMO']['bytes_per_second']
log_level = conf['LOGGING']['log-level']

# The websocket port will always be the demo_server port + 1
ws_port = app_port + 1
'''The port for the websocket server'''

ws_app = cluster.WSDemoServer('0.0.0.0', ws_port)
'''The websocket server object. Used to broadcast messages'''
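# Illustrative sketch, not part of the demo code: the global.conf entries this
# server module reads at import time. Key names come from the code above; the
# values are placeholders.
#
# [DEMO]
# server_port = 8081
# data_schema = [{"fieldName": "name", "type": "string"}]
# bytes_per_second = 50000
#
# [LOGGING]
# log-level = INFO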
def test_ambari_check_many_attempts(self, mock, mock2):
    conf = config.read_config('global.conf')['AMBARI']
    assert service_installer.check_ambari_service_installed('ZEPPELIN', conf) == False
def test_ambari_check_good(self, mock, mock2):
    conf = config.read_config('global.conf')['AMBARI']
    assert service_installer.check_ambari_service_installed('ZEPPELIN', conf) == True