Example No. 1
def create_demo_kafka_topic():
  '''Creates a Kafka topic for the demo if it doesn't already exist.

  The caveat in using this is that Kafka must be installed on the same machine as the demo, and therefore on the same machine as Ambari as well. The function will try to start the Kafka service through Ambari, and once the service is started it will use the Kafka topics script to create the topic.

  The name for the topic is specified in ``global.conf``.

  Args:
    N/A

  Returns:
    bool: True if the creation is successful. False otherwise.
  '''
  conf = config.read_config('global.conf')
  am_conf = conf['AMBARI']
  amc = Ambari(am_conf['username'], am_conf['password'], am_conf['proto'], am_conf['server'], am_conf['port'])
  
  logger.info('Starting Kafka Broker')
  
  if amc.service_action('Sandbox', 'KAFKA', 'START'):
    sh = Shell()
    topics_script = conf['DEMO']['kafka_topics_script']
    zk = conf['DEMO']['zk_connection']
    topic_name = conf['DEMO']['kafka_topic_name']
    logger.info('Attempting to create new Kafka Topic')
    out = sh.run(topics_script + ' --create --zookeeper ' + zk + ' --replication-factor 1 --partitions 1 --topic ' + topic_name)
    logger.debug(str(out))
    if len(out[1]) == 0:
      return True
    else:
      return False

  # The Kafka service could not be started through Ambari
  return False
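A minimal usage sketch (an illustration, not part of the original module; it assumes this function sits next to the config/Ambari helpers used above and that global.conf is fully populated):

if create_demo_kafka_topic():
  logger.info('Demo Kafka topic is ready')
else:
  logger.error('Topic creation failed; check kafka_topics_script and zk_connection in global.conf')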
Example No. 2
def get_kafka_topics():
    '''List the Kafka topics on the current installation.

    Requires that Kafka is installed on the same machine and that Ambari is up and running. Will start the Kafka service and use the Kafka scripts to list all of the topics.

    Args:
      N/A

    Returns:
      list: On success, a list of topic names (one per entry). On failure, a two-element list where [0] is an empty string and [1] contains an error message.
    '''
    conf = config.read_config('global.conf')
    am_conf = conf['AMBARI']
    amc = Ambari(am_conf['username'], am_conf['password'], am_conf['proto'],
                 am_conf['server'], am_conf['port'])

    logger.info('Starting Kafka Broker')

    if amc.service_action('Sandbox', 'KAFKA', 'START'):
        sh = Shell()
        topics_script = conf['DEMO']['kafka_topics_script']
        zk = conf['DEMO']['zk_connection']
        logger.info('Attempting to list Kafka topics')
        out = sh.run(topics_script + ' --list --zookeeper ' + zk)

        if len(out[1]) == 0:
            topics = out[0]
            topics = topics.strip().split('\n')
            logger.info('Kafka topics output: ' + str(topics))
            return topics

    return ['', 'Unable to get topics. Could not start Kafka Broker']
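A brief usage sketch (illustrative only; it relies on the failure convention returned on the last line above):

topics = get_kafka_topics()
if topics and topics[0] == '':
    # Failure shape: ['', <error message>]
    logger.error('Listing failed: ' + topics[1])
else:
    for name in topics:
        logger.info('Found Kafka topic: ' + name)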
Example No. 3
def create_demo_kafka_topic():
    '''Creates a Kafka topic for the demo if it doesn't already exist.

    The caveat in using this is that Kafka must be installed on the same machine as the demo, and therefore on the same machine as Ambari as well. The function will try to start the Kafka service through Ambari, and once the service is started it will use the Kafka topics script to create the topic.

    The name for the topic is specified in ``global.conf``.

    Args:
      N/A

    Returns:
      bool: True if the creation is successful. False otherwise.
    '''
    conf = config.read_config('global.conf')
    am_conf = conf['AMBARI']
    amc = Ambari(am_conf['username'], am_conf['password'], am_conf['proto'],
                 am_conf['server'], am_conf['port'])

    logger.info('Starting Kafka Broker')

    if amc.service_action('Sandbox', 'KAFKA', 'START'):
        sh = Shell()
        topics_script = conf['DEMO']['kafka_topics_script']
        zk = conf['DEMO']['zk_connection']
        topic_name = conf['DEMO']['kafka_topic_name']
        logger.info('Attempting to create new Kafka Topic')
        out = sh.run(topics_script + ' --create --zookeeper ' + zk +
                     ' --replication-factor 1 --partitions 1 --topic ' +
                     topic_name)
        logger.debug(str(out))
        if len(out[1]) == 0:
            return True
        else:
            return False

    # The Kafka service could not be started through Ambari
    return False
def on_service_start():
    '''This method will run every time the service starts.

    Fill in this method with any necessary commands to set up and start other services for the demo.

    Note that this method will always be the very last thing to be executed upon starting the demo service.
    '''
    print 'Running on_service_start'
    cfg = config.read_config('global.conf')

    # Ambari Client
    amc = Ambari(config=cfg['AMBARI'])

    # Queue services
    amc.service_action('Sandbox', 'KAFKA', 'START', queue=True)
    amc.service_action('Sandbox', 'ZEPPELIN', 'START', queue=True)
    try:
        # Not guaranteed to be installed
        amc.service_action('Sandbox', 'NIFI', 'START', queue=True)
    except Exception:
        log.warn('Failed to start NiFi')

    service_installer.add_zeppelin_notebooks()
    # Add anything else below that might be necessary for when the demo starts

    pass
Example No. 5
def get_kafka_topics():
  '''List the Kafka topics on the current installation.

  Requires that Kafka is installed on the same machine and that Ambari is up and running. Will start the Kafka service and use the Kafka scripts to list all of the topics.

  Args:
    N/A

  Returns:
    list: On success, a list of topic names (one per entry). On failure, a two-element list where [0] is an empty string and [1] contains an error message.
  '''
  conf = config.read_config('global.conf')
  am_conf = conf['AMBARI']
  amc = Ambari(am_conf['username'], am_conf['password'], am_conf['proto'], am_conf['server'], am_conf['port'])
  
  logger.info('Starting Kafka Broker')
  
  if amc.service_action('Sandbox', 'KAFKA', 'START'):
    sh = Shell()
    topics_script = conf['DEMO']['kafka_topics_script']
    zk = conf['DEMO']['zk_connection']
    logger.info('Attempting to list Kafka topics')
    out = sh.run(topics_script + ' --list --zookeeper ' + zk)
    
    if len(out[1]) == 0:
      topics = out[0]
      topics = topics.strip().split('\n')
      logger.info('Kafka topics output: ' + str(topics))
      return topics
    
  return ['', 'Unable to get topics. Could not start Kafka Broker']
Example No. 6
def on_service_start():
  '''This method will run every time the service starts.

  Fill in this method with any necessary commands to set up and start other services for the demo.

  Note that this method will always be the very last thing to be executed upon starting the demo service.
  '''
  print 'Running on_service_start'
  cfg = config.read_config('global.conf')
  
  # Ambari Client
  amc = Ambari(config=cfg['AMBARI'])
  
  # Queue services
  amc.service_action('Sandbox', 'KAFKA', 'START', queue=True)
  amc.service_action('Sandbox', 'ZEPPELIN', 'START', queue=True)
  try:
    # Not guaranteed to be installed
    amc.service_action('Sandbox', 'NIFI', 'START', queue=True)
  except Exception:
    log.warn('Failed to start NiFi')
  
  service_installer.add_zeppelin_notebooks()
  # Add anything else below that might be necessary for when the demo starts

  pass
Example No. 7
 def __init__(self, schema, bps, outputs, data_pool_size=100):
   threading.Thread.__init__(self)
   self.outputs = outputs
   self.daemon = True
   self.flag = True
   self.data_pool_size = data_pool_size
   self.http_data_pool = []
   self.hdfs_data_pool = []
   if bps > 0:
     self.bps = bps
   else:
     self.bps = 50000 #50kb
   self.gen = generator.DataGenerator(schema)
   
   conf = config.read_config('global.conf')['DEMO']
   self.exports = {}
   if 'KAFKA' in outputs:
     self.kafka_topic = conf['kafka_topic_name']
     self.kafka_listener = conf['data_kafka_listener']
     has_topic = False
      if self.kafka_topic not in get_kafka_topics()[0]:
       topic_created = create_demo_kafka_topic()
       if topic_created:
         has_topic = True
       else:
         raise EnvironmentError('Could not create Kafka Topic')
         
     else:
       has_topic = True
     
     if has_topic:
       self.exports['KAFKA'] = True
       self.kafka_producer = KafkaProducer(bootstrap_servers=self.kafka_listener)
     else:
       self.exports['KAFKA'] = False
        msg = 'Could not create Kafka Topic. Please create manually'
        logger.warn(msg)
        raise EnvironmentError(msg)
   else:
     self.exports['KAFKA'] = False
   
   if 'FILE' in outputs:
     self.export_filename = conf['data_write_file_location']
     self.exports['FILE'] = True
     with open(self.export_filename, 'w') as ex_data:
       pass
   else:
     self.exports['FILE'] = False
     
   if 'HDFS' in outputs:
     self.export_hdfs_file = conf['data_write_hdfs_file_location']
     self.exports['HDFS'] = True
   else:
     self.exports['HDFS'] = False
   
   if 'HTTP' in outputs:
     self.export_http_url = conf['data_http_endpoint']
     self.exports['HTTP'] = True
   else:
     self.exports['HTTP'] = False
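A construction sketch for this thread class. The class name ThreadedGenerator comes from the import in the server example further down; the schema string and the FILE-only output below are illustrative assumptions (a FILE run avoids needing a reachable Kafka broker, but assumes global.conf defines data_write_file_location):

schema = '[{"fieldName": "id", "type": "int"}]'  # hypothetical field; real type names must exist in the generator's type maps
gen = ThreadedGenerator(schema, bps=50000, outputs=['FILE'])
gen.start()       # standard threading.Thread API; the run() loop is defined elsewhere in the class
# ... later ...
gen.flag = False  # the flag attribute set in __init__ presumably signals the run loop to stop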
Example No. 8
 def test_good_file(self, mock1):
     params = config.read_config("res/good-test.properties")
     assert params["SECTION1"]["key1"] == "val1"
     assert params["SECTION2"]["key2"] == "val2"
     assert params["SECTION3"]["key3"] == "val3"
     assert params["SECTION3"]["key4"] == "val4"
     assert len(params["SECTION3"]) > 1
     assert len(params["SECTION2"]) > 0
     assert len(params["SECTION1"]) > 0
     assert len(params) == 3
	def test_good_file(self, mock1):
		params = config.read_config('res/good-test.properties')
		assert params['SECTION1']['key1'] == 'val1'
		assert params['SECTION2']['key2'] == 'val2'
		assert params['SECTION3']['key3'] == 'val3'
		assert params['SECTION3']['key4'] == 'val4'
		assert len(params['SECTION3']) > 1
		assert len(params['SECTION2']) > 0
		assert len(params['SECTION1']) > 0
		assert len(params) == 3
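For reference, a plausible reconstruction of res/good-test.properties based purely on the assertions above (the real fixture may contain more keys); a small sketch that regenerates it and reads it back, assuming the res/ directory exists:

fixture = """[SECTION1]
key1 = val1

[SECTION2]
key2 = val2

[SECTION3]
key3 = val3
key4 = val4
"""
with open('res/good-test.properties', 'w') as f:
    f.write(fixture)

params = config.read_config('res/good-test.properties')
assert params['SECTION1']['key1'] == 'val1'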
Example No. 10
    def test_missing_file(self, mock1):
        try:
            params = config.read_config("nofile")
            self.fail("Should have thrown IOError")
        except IOError as e:
            if "could not find file" not in str(e):
                assert 0

        try:
            params = config.read_xml_config("nofile")
            self.fail("Should have thrown IOError")
        except IOError as e:
            if "could not find file" not in e.message:
                assert 0
	def test_missing_file(self, mock1):
		try:
			params = config.read_config('nofile')
			self.fail('Should have thrown IOError')
		except IOError as e:
			if 'Could not find file' not in str(e):
				assert 0
				
		try:
			params = config.read_xml_config('nofile')
			self.fail('Should have thrown IOError')
		except IOError as e:
			if 'Could not find file' not in str(e):
				assert 0
Example No. 12
def generate_queries(schema, table_name='demo_table'):
  '''Generate test queries based on a configuration for the data generator.

  Currently supported components:

  - Spark
  - Hive

  Args:
    schema (str): The schema for the generator as a JSON string.
    table_name (str): Name of the table used in the generated queries. Defaults to ``'demo_table'``.

  Returns:
    dict: A dictionary keyed by component name ('HIVE', 'SPARK'), where each value is a dictionary mapping a query description to the corresponding query string.
  '''
  logger.info('Building queries')
  fields = json.loads(schema)
  conf = config.read_config('global.conf')['DEMO']
  hdfs_file_path = conf['data_write_hdfs_file_location']
  
  hdfs_data_dir = os.path.dirname(hdfs_file_path)
  
  queries = {}
  # Build a hive query to insert into a table
  hive_queries = {
    'Basic Table': 'CREATE TABLE IF NOT EXISTS ' + table_name,
    'External Table': 'CREATE EXTERNAL TABLE IF NOT EXISTS ' + table_name,
    'Drop Table': 'DROP TABLE ' + table_name,
    'HDFS CSV': 'CREATE EXTERNAL TABLE IF NOT EXISTS ' + table_name,
  }
  basic_create = 'CREATE TABLE IF NOT EXISTS ' + table_name
  external_create = 'CREATE EXTERNAL TABLE IF NOT EXISTS ' + table_name
  drop_table = 'DROP TABLE ' + table_name
  
  cols = map(lambda d: [str(d['fieldName']), str(HIVE_TYPE_MAP[d['type']])], fields)
  ftypes = sorted(map(lambda c: ' '.join(c), cols))
  field_set = ' (' + ', '.join(ftypes) + ')'
  
  hive_queries['Basic Table'] += field_set
  hive_queries['External Table'] += field_set + ' LOCATION \'' + hdfs_data_dir + '\''
  hive_queries['HDFS CSV'] += field_set + '\nROW FORMAT\nDELIMITED FIELDS TERMINATED BY \', \'\nSTORED AS TEXTFILE\nLOCATION \'' + hdfs_data_dir + '\''
  queries['HIVE'] = hive_queries
  
  spark_queries = {
    'RDD and Temporary DataFrame': "",
    'SparkSQL - Select all': ''
  }
  # Build class
  rdd_temp = ''
  class_name = "Data"
  cols = map(lambda d: [str(d['fieldName']), str(SPARK_TYPE_MAP[d['type']])], fields)
  ftypes = sorted(map(lambda c: ': '.join(c), cols))
  field_set = '(' + ', '.join(ftypes) + ');'
  rdd_temp += "case class " + class_name + field_set + '\n'
  rdd_temp += 'val csv = sc.textFile("hdfs:' + hdfs_file_path + '");\n'
  rdd_temp += 'val data = csv.map(line => line.split(",").map(e => e.trim));\n'
  lambda_arg = 'a'
  class_args = []
  sort_cols = sorted(cols)
  for i in range(len(cols)):
    # Build the class constructor for the map function
    s = lambda_arg + '(' + str(i) + ').to' + str(sort_cols[i][1])
    class_args.append(s)
  
  lambda_func = lambda_arg + ' => ' + class_name + '(' + ', '.join(class_args) + ')'
  rdd_temp += 'val df = data.map(' + lambda_func + ').toDF();\n'
  rdd_temp += 'df.registerTempTable("' + table_name + '");'
  
  spark_queries['RDD and Temporary DataFrame'] = rdd_temp
  spark_queries['SparkSQL - Select all'] = 'SELECT * FROM ' + table_name
  
  queries['SPARK'] = spark_queries
  
  
  return queries
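A short usage sketch for the function above, assuming json is imported as in the function body. The two-field schema is illustrative, and the 'string'/'int' type names are assumptions since HIVE_TYPE_MAP and SPARK_TYPE_MAP are not shown here:

schema = json.dumps([
  {'fieldName': 'name', 'type': 'string'},  # assumed type key
  {'fieldName': 'age', 'type': 'int'}       # assumed type key
])
queries = generate_queries(schema, table_name='demo_table')
print(queries['HIVE']['Basic Table'])             # CREATE TABLE IF NOT EXISTS demo_table (...)
print(queries['SPARK']['SparkSQL - Select all'])  # SELECT * FROM demo_table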
Example No. 13
 def test_missing_header(self, mock1):
     try:
         params = config.read_config("res/bad-test.properties")
         assert 0
     except MissingSectionHeaderError as err:
         assert 1
Example No. 14
import os, sys

filedir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(filedir + '/../demo_utils')

import threading, time, json, flask, cluster, logging
from flask import Flask, request
from demo_utils import config, generator
from cluster import ThreadedGenerator
from demo_utils import logs
from ws4py import configure_logger
from flask_cors import CORS
log = logs.Logger('DEMO_SERVER.py').getLogger()

OUTPUTS = ['FILE', 'KAFKA', 'HTTP', 'HDFS']
'''The four different types of outputs from the generator'''

conf = config.read_config('global.conf')

app = Flask(__name__, static_url_path='')
CORS(app)
app_port = int(conf['DEMO']['server_port'])
schema = conf['DEMO']['data_schema']
throughput = conf['DEMO']['bytes_per_second']
log_level = conf['LOGGING']['log-level']

# The Websockets will always be the demo_server port + 1
ws_port = app_port + 1
'''The port for the websocket server'''

ws_app = cluster.WSDemoServer('0.0.0.0', ws_port)
'''The websocket server object. Used to broadcast messages'''
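A hedged sketch of how this server module would typically be launched once the Flask routes are registered (the __main__ guard and host choice are assumptions, not taken from the original script):

if __name__ == '__main__':
    # Flask development server; app_port comes from global.conf above.
    app.run(host='0.0.0.0', port=app_port)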
 def test_ambari_check_many_attempts(self, mock, mock2):
   conf = config.read_config('global.conf')['AMBARI']
   assert service_installer.check_ambari_service_installed('ZEPPELIN', conf) == False
	def test_missing_header(self, mock1):
		try:
			params = config.read_config('res/bad-test.properties')
			assert 0
		except MissingSectionHeaderError as err:
			assert 1
 def test_ambari_check_many_attempts(self, mock, mock2):
     conf = config.read_config('global.conf')['AMBARI']
     assert service_installer.check_ambari_service_installed(
         'ZEPPELIN', conf) == False
 def test_ambari_check_good(self, mock, mock2):
     conf = config.read_config('global.conf')['AMBARI']
     assert service_installer.check_ambari_service_installed(
         'ZEPPELIN', conf) == True
Example No. 19
def generate_queries(schema, table_name='demo_table'):
    '''Generate test queries based on a configuration for the data generator.

    Currently supported components:

    - Spark
    - Hive

    Args:
      schema (str): The schema for the generator as a JSON string.
      table_name (str): Name of the table used in the generated queries. Defaults to ``'demo_table'``.

    Returns:
      dict: A dictionary keyed by component name ('HIVE', 'SPARK'), where each value is a dictionary mapping a query description to the corresponding query string.
    '''
    logger.info('Building queries')
    fields = json.loads(schema)
    conf = config.read_config('global.conf')['DEMO']
    hdfs_file_path = conf['data_write_hdfs_file_location']

    hdfs_data_dir = os.path.dirname(hdfs_file_path)

    queries = {}
    # Build a hive query to insert into a table
    hive_queries = {
        'Basic Table': 'CREATE TABLE IF NOT EXISTS ' + table_name,
        'External Table': 'CREATE EXTERNAL TABLE IF NOT EXISTS ' + table_name,
        'Drop Table': 'DROP TABLE ' + table_name,
        'HDFS CSV': 'CREATE EXTERNAL TABLE IF NOT EXISTS ' + table_name,
    }
    basic_create = 'CREATE TABLE IF NOT EXISTS ' + table_name
    external_create = 'CREATE EXTERNAL TABLE IF NOT EXISTS ' + table_name
    drop_table = 'DROP TABLE ' + table_name

    cols = map(lambda d: [str(d['fieldName']),
                          str(HIVE_TYPE_MAP[d['type']])], fields)
    ftypes = sorted(map(lambda c: ' '.join(c), cols))
    field_set = ' (' + ', '.join(ftypes) + ')'

    hive_queries['Basic Table'] += field_set
    hive_queries[
        'External Table'] += field_set + ' LOCATION \'' + hdfs_data_dir + '\''
    hive_queries[
        'HDFS CSV'] += field_set + '\nROW FORMAT\nDELIMITED FIELDS TERMINATED BY \', \'\nSTORED AS TEXTFILE\nLOCATION \'' + hdfs_data_dir + '\''
    queries['HIVE'] = hive_queries

    spark_queries = {
        'RDD and Temporary DataFrame': "",
        'SparkSQL - Select all': ''
    }
    # Build class
    rdd_temp = ''
    class_name = "Data"
    cols = map(lambda d: [str(d['fieldName']),
                          str(SPARK_TYPE_MAP[d['type']])], fields)
    ftypes = sorted(map(lambda c: ': '.join(c), cols))
    field_set = '(' + ', '.join(ftypes) + ');'
    rdd_temp += "case class " + class_name + field_set + '\n'
    rdd_temp += 'val csv = sc.textFile("hdfs:' + hdfs_file_path + '");\n'
    rdd_temp += 'val data = csv.map(line => line.split(",").map(e => e.trim));\n'
    lambda_arg = 'a'
    class_args = []
    sort_cols = sorted(cols)
    for i in range(len(cols)):
        # Build the class constructor for the map function
        s = lambda_arg + '(' + str(i) + ').to' + str(sort_cols[i][1])
        class_args.append(s)

    lambda_func = lambda_arg + ' => ' + class_name + '(' + ', '.join(
        class_args) + ')'
    rdd_temp += 'val df = data.map(' + lambda_func + ').toDF();\n'
    rdd_temp += 'df.registerTempTable("' + table_name + '");'

    spark_queries['RDD and Temporary DataFrame'] = rdd_temp
    spark_queries['SparkSQL - Select all'] = 'SELECT * FROM ' + table_name

    queries['SPARK'] = spark_queries

    return queries
Example No. 20
    def __init__(self, schema, bps, outputs, data_pool_size=100):
        threading.Thread.__init__(self)
        self.outputs = outputs
        self.daemon = True
        self.flag = True
        self.data_pool_size = data_pool_size
        self.http_data_pool = []
        self.hdfs_data_pool = []
        if bps > 0:
            self.bps = bps
        else:
            self.bps = 50000  #50kb
        self.gen = generator.DataGenerator(schema)

        conf = config.read_config('global.conf')['DEMO']
        self.exports = {}
        if 'KAFKA' in outputs:
            self.kafka_topic = conf['kafka_topic_name']
            self.kafka_listener = conf['data_kafka_listener']
            has_topic = False
            if self.kafka_topic not in get_kafka_topics()[0]:
                topic_created = create_demo_kafka_topic()
                if topic_created:
                    has_topic = True
                else:
                    raise EnvironmentError('Could not create Kafka Topic')

            else:
                has_topic = True

            if has_topic:
                self.exports['KAFKA'] = True
                self.kafka_producer = KafkaProducer(
                    bootstrap_servers=self.kafka_listener)
            else:
                self.exports['KAFKA'] = False
                msg = 'Could not create Kafka Topic. Please create manually'
                logger.warn(msg)
                raise EnvironmentError(msg)
        else:
            self.exports['KAFKA'] = False

        if 'FILE' in outputs:
            self.export_filename = conf['data_write_file_location']
            self.exports['FILE'] = True
            with open(self.export_filename, 'w') as ex_data:
                pass
        else:
            self.exports['FILE'] = False

        if 'HDFS' in outputs:
            self.export_hdfs_file = conf['data_write_hdfs_file_location']
            self.exports['HDFS'] = True
        else:
            self.exports['HDFS'] = False

        if 'HTTP' in outputs:
            self.export_http_url = conf['data_http_endpoint']
            self.exports['HTTP'] = True
        else:
            self.exports['HTTP'] = False
Example No. 21
import os, sys

filedir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(filedir + '/../demo_utils')

import threading, time, json, flask, cluster, logging
from flask import Flask, request
from demo_utils import config, generator
from cluster import ThreadedGenerator
from demo_utils import logs
from ws4py import configure_logger
from flask_cors import CORS
log = logs.Logger('DEMO_SERVER.py').getLogger()

OUTPUTS = ['FILE', 'KAFKA', 'HTTP', 'HDFS']
'''The four different types of outputs from the generator'''

conf = config.read_config('global.conf')

app = Flask(__name__, static_url_path='')
CORS(app)
app_port = int(conf['DEMO']['server_port'])
schema = conf['DEMO']['data_schema']
throughput = conf['DEMO']['bytes_per_second']
log_level = conf['LOGGING']['log-level']

# The Websockets will always be the demo_server port + 1
ws_port = app_port + 1
'''The port for the websocket server'''

ws_app = cluster.WSDemoServer('0.0.0.0', ws_port)
'''The websocket server object. Used to broadcast messages'''
 def test_ambari_check_good(self, mock, mock2):
   conf = config.read_config('global.conf')['AMBARI']
   assert service_installer.check_ambari_service_installed('ZEPPELIN', conf) == True