Example No. 1
def ingest_file(file,partition,hdfs_root_path,topic,kafka_servers):

        logger = logging.getLogger('SPOT.INGEST.FLOW.{0}'.format(os.getpid()))

        try:

            # get file name and date.
            file_name_parts = file.split('/')
            file_name = file_name_parts[len(file_name_parts)-1]
            file_date = file_name.split('.')[1]

            file_date_path = file_date[0:8]
            file_date_hour = file_date[8:10]

            # hdfs path with timestamp.
            hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path,file_date_path,file_date_hour)
            Util.creat_hdfs_folder(hdfs_path,logger)

            # load to hdfs.
            hdfs_file = "{0}/{1}".format(hdfs_path,file_name)
            Util.load_to_hdfs(file,hdfs_file,logger)

            # create event for workers to process the file.
            logger.info("Sending file to worker number: {0}".format(partition))
            KafkaTopic.SendMessage(hdfs_file,kafka_servers,topic,partition)    
            logger.info("File {0} has been successfully sent to Kafka Topic to: {1}".format(file,topic))

        except Exception as err:
            logger.error("There was a problem, please check the following error message:{0}".format(err.message))
            logger.error("Exception: {0}".format(err))
Example No. 2
def ingest_file(file, partition, hdfs_root_path, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.FLOW.{0}'.format(os.getpid()))

    try:

        # get file name and date.
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]
        file_date = file_name.split('.')[1]

        file_date_path = file_date[0:8]
        file_date_hour = file_date[8:10]

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, file_date_path,
                                                file_date_hour)
        Util.creat_hdfs_folder(hdfs_path, logger)

        # load to hdfs.
        hdfs_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.load_to_hdfs(file, hdfs_file, logger)

        # create event for workers to process the file.
        logger.info("Sending file to worker number: {0}".format(partition))
        KafkaTopic.SendMessage(hdfs_file, kafka_servers, topic, partition)
        logger.info(
            "File {0} has been successfully sent to Kafka Topic to: {1}".
            format(file, topic))

    except Exception as err:
        logger.error(
            "There was a problem, please check the following error message:{0}"
            .format(err.message))
        logger.error("Exception: {0}".format(err))
Example No. 3
    def start(self):

        self._logger.info("Creating Spark Job for topic: {0}".format(self._kafka_consumer.Topic))                

        # parser
        parser = self._conf["parser"]

        #spark conf
        driver_memory = self._spark_conf["driver_memory"]
        num_exec = self._spark_conf["spark_exec"]
        exec_memory = self._spark_conf["spark_executor_memory"]
        exec_cores = self._spark_conf["spark_executor_cores"]
        batch_size = self._spark_conf["spark_batch_size"]
        
        jar_path = os.path.dirname(os.path.dirname(self._script_path))
        # spark job command.          
        spark_job_cmd = ("spark-submit --master yarn "
                        "--driver-memory {0} "
                        "--num-executors {1} "
                        "--conf spark.executor.memory={2} "
                        "--conf spark.executor.cores={3} "
                        "--jars {4}/common/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar "
                        "{5}/{6} "
                        "-zk {7} "
                        "-t {8} "
                        "-db {9} "
                        "-dt {10} "
                        "-w {11} "
                        "-bs {12}".format(diver_memory,num_exec,exec_memory,exec_cores,jar_path,self._script_path,parser,self._kafka_consumer.ZookeperServer,self._kafka_consumer.Topic,self._db_name,"proxy",self._processes,batch_size))
        
        # start spark job.
        Util.execute_cmd(spark_job_cmd,self._logger)
Example No. 4
def ingest_file(file, message_size, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.PROXY.{0}'.format(os.getpid()))
    try:
        message = ""
        logger.info("Ingesting file: {0} process:{1}".format(
            file, os.getpid()))
        with open(file, "rb") as f:
            for line in f:
                message += line
                if len(message) > message_size:
                    KafkaProducer.SendMessage(message, kafka_servers, topic, 0)
                    message = ""
            #send the last package.
            KafkaProducer.SendMessage(message, kafka_servers, topic, 0)
        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file, logger)
        logger.info(
            "File {0} has been successfully sent to Kafka Topic: {1}".format(
                file, topic))

    except Exception as err:
        logger.error(
            "There was a problem, please check the following error message:{0}"
            .format(err.message))
        logger.error("Exception: {0}".format(err))
Example No. 5
    def run(cls):
        '''
            Main command-line entry point.

        :param cls: The class as implicit first argument.
        '''
        try:
            args = _parse_args()
            conf = json.loads(args.config_file.read())

            # .........................set up logger
            Util.get_logger('SPOT', args.log_level)

            # .........................check kerberos authentication
            if os.getenv('KRB_AUTH'):
                kb = Kerberos()
                kb.authenticate()

            conf['producer'] = {
                'bootstrap_servers': [
                    '{0}:{1}'.format(conf['kafka']['kafka_server'],
                                     conf['kafka']['kafka_port'])
                ]
            }

            conf['file_watcher'] = {
                'path': conf['pipelines'][args.type]['collector_path'],
                'supported_files':
                conf['pipelines'][args.type]['supported_files'],
                'recursive': True
            }

            # .........................migrate configs
            if not 'local_staging' in conf['pipelines'][args.type].keys():
                conf['pipelines'][args.type]['local_staging'] = '/tmp'

            if 'max_request_size' in conf['kafka'].keys():
                conf['producer']['max_request_size'] = conf['kafka'][
                    'max_request_size']

            if not 'process_opt' in conf['pipelines'][args.type].keys():
                conf['pipelines'][args.type]['process_opt'] = ''

            if 'recursive' in conf['pipelines'][args.type].keys():
                conf['file_watcher']['recursive'] = conf['pipelines'][
                    args.type]['recursive']

            collector = cls(args.type, args.topic, args.skip_conversion,
                            **conf)
            collector.start()

        except SystemExit:
            raise
        except:
            sys.excepthook(*sys.exc_info())
            sys.exit(1)
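The configuration keys read above (kafka_server, kafka_port, collector_path, supported_files, plus the optional local_staging, max_request_size, process_opt and recursive) imply a layout roughly like the following. This is only an inferred shape with placeholder values, not a configuration file taken from the project:

# Roughly what json.loads(args.config_file.read()) is expected to return;
# keys inferred from the lookups above, values are placeholders.
conf = {
    "kafka": {
        "kafka_server": "localhost",
        "kafka_port": 9092,
        "max_request_size": 4194304        # optional
    },
    "pipelines": {
        "flow": {
            "collector_path": "/collector/flow",
            "supported_files": ["nfcapd.*"],
            "local_staging": "/tmp",        # optional, defaults to /tmp
            "process_opt": "",              # optional
            "recursive": True               # optional
        }
    }
}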
Example No. 6
 def test_add_fail(self):
     res = Util.safe_call(self.dbm.add_driver, 1, {})
     self.assertTrue(
         isinstance(res, DbException),
         "Expected exception when adding invalid driver, got %s" % res)
     # check that we can't insert driver with existing id
     res = Util.safe_call(self.dbm.add_driver, 0,
                          DbManagerTestCase.TestDriver(0))
     self.assertTrue(isinstance(res, DbException),
                     "Expected exception, but got %s" % res)
     self.assertEqual(len(self.dbm), 1, "DB manager size changed")
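Util.safe_call is not shown in these examples; judging by the assertions, it runs a callable and returns the raised exception instead of letting it propagate. A minimal sketch of a helper with that behaviour, offered as an assumption rather than the project's actual implementation:

def safe_call(func, *args, **kwargs):
    # Run func and return whatever it raises, so tests can assert on the
    # exception object instead of wrapping every call in try/except.
    try:
        return func(*args, **kwargs)
    except Exception as exc:
        return exc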
Example No. 7
 def test_cmd_all(self):
     res = Util.safe_call(self.dbm.add_driver, 1,
                          DbManagerTestCase.TestDriver(1))
     self.assertTrue(not isinstance(res, Exception),
                     "Exception adding new driver: %s" % res)
     arg = 1
     exp_vals = range(arg, len(self.dbm) + arg)
     res = Util.safe_call(self.dbm.cmd_all, lambda db: db.echo(arg))
     self.assertTrue(not isinstance(res, Exception), "Exception: %s" % res)
     self.assertTrue(not Util.list_diff(res, exp_vals),
                     "Got %s, expected %s" % (res, exp_vals))
Example No. 8
def spark_job(script_file, **kwargs):
    '''
        Run the given script file by submitting it as a Spark Job.
    '''
    spark_job = 'spark2-submit --master {0}'.format(kwargs.pop('master'))
    spark_job += ' --deploy-mode {0}'.format(kwargs.pop('deploy_mode'))
    spark_job += ' --py-files {0}'.format(kwargs.pop('py_files'))

    if 'driver_memory' in kwargs.keys():
        spark_job += ' --driver-memory {0}'.format(kwargs.pop('driver_memory'))

    if 'spark_exec' in kwargs.keys():
        spark_job += ' --num-executors {0}'.format(kwargs.pop('spark_exec'))

    if 'spark_executor_memory' in kwargs.keys():
        spark_job += ' --conf spark.executor.memory={0}'.format(
            kwargs.pop('spark_executor_memory'))

    if 'spark_executor_cores' in kwargs.keys():
        spark_job += ' --conf spark.executor.cores={0}'.format(
            kwargs.pop('spark_executor_cores'))

    spark_job += ' {0}'.format(os.path.abspath(script_file))

    if 'spark_batch_size' in kwargs.keys():
        spark_job += ' -b {0}'.format(kwargs.pop('spark_batch_size'))

    spark_job += ' -d {0}'.format(kwargs.pop('database'))

    if kwargs['group_id'] is not None:
        spark_job += ' -g {0}'.format(kwargs.pop('group_id'))

    spark_job += ' -l {0}'.format(kwargs.pop('log_level'))

    if kwargs['app_name'] is not None:
        spark_job += ' -n {0}'.format(kwargs.pop('app_name'))

    spark_job += ' -p {0}'.format(kwargs.pop('partitions'))
    spark_job += ' -t {0}'.format(kwargs.pop('type'))
    spark_job += ' --topic {0}'.format(kwargs.pop('topic'))
    spark_job += ' --zkquorum {0}'.format(kwargs.pop('zkquorum'))

    if kwargs['redirect_spark_logs'] is not None:
        spark_job += ' 2>{0}'.format(kwargs.pop('redirect_spark_logs'))

    try:
        Util.call(spark_job, True)
    except Exception as exc:
        sys.stderr.write('Failed to submit Spark Job!\n')
        sys.stderr.write('[{0}] {1}\n\n'.format(exc.__class__.__name__,
                                                exc.message))
        sys.exit(2)
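A hedged example of driving this builder. The keyword names follow the kwargs.pop calls above; note that group_id, app_name and redirect_spark_logs must be present as keys (they are read with kwargs['...'] before being popped) even when they are None. All values are placeholders, and the call would actually attempt to submit a job, so treat it as illustrative only.

spark_job(
    'worker.py',
    master='yarn',
    deploy_mode='cluster',
    py_files='dist/spot_ingest.zip',
    driver_memory='2g',                 # optional
    spark_exec='4',                     # optional
    database='spotdb',
    group_id=None,                      # key must exist even if unused
    app_name=None,                      # key must exist even if unused
    redirect_spark_logs=None,           # key must exist even if unused
    log_level='INFO',
    partitions='4',
    type='flow',
    topic='SPOT-INGEST-flow',
    zkquorum='localhost:2181')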
Example No. 9
    def _create_topic(self):

        self._logger.info("Creating topic: {0} with {1} parititions".format(
            self._topic, self._num_of_partitions))

        # get script path
        zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
        create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
            os.path.dirname(os.path.abspath(__file__)), self._topic, zk_conf,
            self._num_of_partitions)

        # execute create topic cmd
        Util.execute_cmd(create_topic_cmd, self._logger)
Example No. 10
def _ingest_file(new_file, hdfs_root_path, producer, topic):

    logger = logging.getLogger('SPOT.INGEST.FLOW.{0}'.format(os.getpid()))

    try:

        # get file name and date.
        file_name_parts = new_file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]
        file_date = file_name.split('.')[1]
        file_date_path = file_date[0:8]
        file_date_hour = file_date[8:10]

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, file_date_path,
                                                file_date_hour)
        hdfs_file = "{0}/{1}".format(hdfs_path, file_name)

        try:
            if len(hdfs.list_dir(hdfs_path)) == 0:
                logger.info('creating directory: ' + hdfs_path)
                hdfs.mkdir(hdfs_path)
            logger.info('uploading file to hdfs: ' + hdfs_file)
            result = hdfs.upload_file(hdfs_path, new_file)
            if not result:
                logger.error('File failed to upload: ' + hdfs_file)
                raise HdfsException
            else:
                rm_file = "rm {0}".format(new_file)
                logger.info(
                    "Removing files from local staging: {0}".format(rm_file))
                Util.execute_cmd(rm_file, logger)

        except HdfsException as err:
            logger.error('Exception: ' + err.exception)
            logger.info('Check Hdfs Connection settings and server health')

    except Exception as err:
        logger.error("There was a problem, Exception: {0}".format(err))

        # create event for workers to process the file.
        # logger.info("Sending file to worker number: {0}".format(partition))
    try:
        producer.SendMessage(hdfs_file, topic)
        logger.info(
            "File {0} has been successfully sent to Kafka Topic to: {1}".
            format(hdfs_file, topic))
    except Exception as err:
        logger.info("File {0} failed to be sent to Kafka Topic to: {1}".format(
            hdfs_file, topic))
        logger.error("Error: {0}".format(err))
Example No. 11
def start_collector(type, workers_num, id=None):

    # generate ingest id
    ingest_id = str(datetime.datetime.time(datetime.datetime.now())).replace(
        ":", "_").replace(".", "_")

    # create logger.
    logger = Util.get_logger("SPOT.INGEST")

    # validate the given configuration exists in ingest_conf.json.
    if not type in master_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(master_conf["pipelines"][type]["type"]):
        logger.error(
            "'{0}' type is not configured. Please check you ingest conf file".
            format(master_conf["pipelines"][type]["type"]))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = master_conf["kafka"]['kafka_server']
    k_port = master_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = master_conf["kafka"]['zookeper_server']
    zk_port = master_conf["kafka"]['zookeper_port']

    topic = "SPOT-INGEST-{0}_{1}".format(type, ingest_id) if not id else id
    kafka = KafkaTopic(topic, k_server, k_port, zk_server, zk_port,
                       workers_num)

    # create a collector instance based on data source type.
    logger.info("Starting {0} ingest instance".format(topic))
    module = __import__("pipelines.{0}.collector".format(
        master_conf["pipelines"][type]["type"]),
                        fromlist=['Collector'])

    # start collector.
    ingest_collector = module.Collector(master_conf['hdfs_app_path'], kafka,
                                        type)
    ingest_collector.start()
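The dynamic import above uses __import__ with fromlist to get at the collector submodule. A functionally equivalent sketch using importlib, shown only as an alternative and not as what the project uses:

import importlib

# Same module path expression as above; import_module returns the submodule.
module = importlib.import_module(
    "pipelines.{0}.collector".format(master_conf["pipelines"][type]["type"]))
Collector = module.Collector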
Example No. 12
def ingest_file(file, pkt_num, pcap_split_staging, partition, hdfs_root_path,
                topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))

    try:
        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(
            pkt_num, file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)

        for currdir, subdir, files in os.walk(pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_spot".format(name) in file:

                    # get timestamp from the file name to build hdfs path.
                    file_date = file.split('.')[0]
                    pcap_hour = file_date[-6:-4]
                    pcap_date_path = file_date[-14:-6]

                    # hdfs path with timestamp.
                    hdfs_path = "{0}/binary/{1}/{2}".format(
                        hdfs_root_path, pcap_date_path, pcap_hour)

                    # create hdfs path.
                    Util.creat_hdfs_folder(hdfs_path, logger)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.load_to_hdfs(os.path.join(currdir, file),
                                      hadoop_pcap_file, logger)

                    # create event for workers to process the file.
                    logger.info(
                        "Sending split file to worker number: {0}".format(
                            partition))
                    KafkaTopic.SendMessage(hadoop_pcap_file, kafka_servers,
                                           topic, partition)
                    logger.info(
                        "File {0} has been successfully sent to Kafka Topic to: {1}"
                        .format(file, topic))

    except Exception as err:

        logger.error(
            "There was a problem, please check the following error message:{0}"
            .format(err.message))
        logger.error("Exception: {0}".format(err))
Example No. 13
 def test_add(self):
     self.assertEqual(len(self.dbm), 0, "DB Manager not empty")
     res = Util.safe_call(self.dbm.add_driver, 0,
                          DbManagerTestCase.TestDriver(0))
     self.assertTrue(not isinstance(res, Exception),
                     "Exception adding driver: %s" % res)
     self.assertEqual(len(self.dbm), 1, "DB Manager is still empty")
Example No. 14
 def test_cmd_one(self):
     test_val = 5
     res = Util.safe_call(self.dbm.cmd_one, 0, lambda db: db.echo(test_val))
     self.assertTrue(not isinstance(res, Exception),
                     "Exception calling method: %s" % res)
     self.assertEqual(res, test_val,
                      "Got %s, expected %d" % (res, test_val))
Example No. 15
    def __init__(self):

        self._logger = Util.get_logger('SPOT.COMMON.KERBEROS')
        principal, keytab, sasl_mech, security_proto = config.kerberos()

        if os.getenv('KINITPATH'):
            self._kinit = os.getenv('KINITPATH')
        else:
            self._kinit = "kinit"

        self._kinitopts = os.getenv('KINITOPTS')
        self._keytab = "-kt {0}".format(keytab)
        self._krb_user = principal

        if self._kinit == None or self._keytab == None or self._krb_user == None:
            self._logger.error(
                "Please verify kerberos configuration, some environment variables are missing."
            )
            sys.exit(1)

        if self._kinitopts is None:
            self._kinit_cmd = "{0} {1} {2}".format(self._kinit, self._keytab,
                                                   self._krb_user)
        else:
            self._kinit_cmd = "{0} {1} {2} {3}".format(self._kinit,
                                                       self._kinitopts,
                                                       self._keytab,
                                                       self._krb_user)
Example No. 16
    def _create_topic(self):

        self._logger.info("Creating topic: {0} with {1} parititions".format(self._topic,self._num_of_partitions))     

        # Create partitions for the workers.
        self._partitions = [ TopicPartition(self._topic,p) for p in range(int(self._num_of_partitions))]        

        # create partitioner
        self._partitioner = RoundRobinPartitioner(self._partitions)
        
        # get script path 
        zk_conf = "{0}:{1}".format(self._zk_server,self._zk_port)
        create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(os.path.dirname(os.path.abspath(__file__)),self._topic,zk_conf,self._num_of_partitions)

        # execute create topic cmd
        Util.execute_cmd(create_topic_cmd,self._logger)
Example No. 17
    def start(self):

        self._logger.info("Starting PROXY collector")
        self._watcher.start()   
    
        try:
            while True:
                #self._ingest_files()
                self._ingest_files_pool()              
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping Proxy collector...")  
            Util.remove_kafka_topic(self._kafka_topic.Zookeeper,self._kafka_topic.Topic,self._logger)          
            self._watcher.stop()
            self._pool.terminate()
            self._pool.close()            
            self._pool.join()
Example No. 18
    def start(self):

        self._logger.info("Starting FLOW ingest") 
        self._watcher.start()
            
        try:
            while True:                
                self._ingest_files_pool()              
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping FLOW collector...")  
            Util.remove_kafka_topic(self._kafka_topic.Zookeeper,self._kafka_topic.Topic,self._logger)          
            self._watcher.stop()
            self._pool.terminate()
            self._pool.close()            
            self._pool.join()
            SystemExit("Ingest finished...")
Example No. 19
    def start(self):

        self._logger.info("Starting DNS ingest")
        self._watcher.start()

        try:
            while True:
                self._ingest_files_pool()
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping DNS collector...")
            Util.remove_kafka_topic(self._producer.Zookeeper,
                                    self._producer.Topic, self._logger)
            self._watcher.stop()
            self._pool.terminate()
            self._pool.close()
            self._pool.join()
            SystemExit("Ingest finished...")
Example No. 20
def start_worker(type, topic, id, processes=None):

    logger = Util.get_logger("SPOT.INGEST.WORKER")

    # validate the given configuration exists in ingest_conf.json.
    if not type in WORKER_CONF["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(WORKER_CONF["pipelines"][type]["type"]):
        logger.error("The provided data source {0} is not valid".format(type))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # create a worker instance based on the data source type.
    module = __import__("pipelines.{0}.worker".format(
        WORKER_CONF["pipelines"][type]["type"]),
                        fromlist=['Worker'])

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = WORKER_CONF["kafka"]['kafka_server']
    k_port = WORKER_CONF["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = WORKER_CONF["kafka"]['zookeper_server']
    zk_port = WORKER_CONF["kafka"]['zookeper_port']
    topic = topic

    # create kafka consumer.
    kafka_consumer = KafkaConsumer(topic, k_server, k_port, zk_server, zk_port,
                                   id)

    # start worker.
    db_name = WORKER_CONF['dbname']
    app_path = WORKER_CONF['hdfs_app_path']
    ingest_worker = module.Worker(db_name, app_path, kafka_consumer, type,
                                  processes)
    ingest_worker.start()
Example No. 21
def index():
    data = apl.process_todays_claims()
    if not data.empty:
        columns = list(data.columns)
        columns.remove('_id')
        return render_template("claims.html",
                               date=Util.datetime(),
                               cols=columns,
                               rows=[r for _, r in data.iterrows()])
    else:
        return "No new claims found"
Example No. 22
    def test_fill_random(self):
        self._init_db('test')

        exp_cnt = 5
        res = Util.safe_call(self.modb.fill_random, 'check', self.test_temp,
                             exp_cnt)
        self.assertTrue(not isinstance(res, DbException),
                        "Exception filling DB: %s" % res)

        cnt = self.modb.active_db['check'].count()
        self.assertEqual(cnt, exp_cnt + 1,
                         "Expected %d entries, got %d" % (exp_cnt + 1, cnt))
Example No. 23
    def test_add(self):
        db_name = 'test'
        res = Util.safe_call(self.modb.add_db, db_name)
        self.assertTrue(not isinstance(res, Exception),
                        "Exception adding database: %s" % res)
        # try adding DB with same name again
        res = Util.safe_call(self.modb.add_db, db_name)
        self.assertTrue(isinstance(res, DbException),
                        "Didn't get exception adding same DB name twice")
        # try accessing database that wasn't added
        try:
            self.modb.active_db = 'test1'
        except Exception as e:
            self.assertTrue(isinstance(e, DbException),
                            "Expected DB error, but got %s" % e)
        else:
            self.assertTrue(False)

        self.modb.active_db = db_name
        cnt = self.modb.active_db.count()
        self.assertEqual(cnt, 1, "Expected 1 table, got %d" % cnt)
Example No. 24
def ingest_file(file,message_size,topic,kafka_servers):
    
    logger = logging.getLogger('SPOT.INGEST.PROXY.{0}'.format(os.getpid()))
    try:        
        message = ""
        logger.info("Ingesting file: {0} process:{1}".format(file,os.getpid())) 
        with open(file,"rb") as f:
            for line in f:
                message += line
                if len(message) > message_size:
                    KafkaTopic.SendMessage(message,kafka_servers,topic,0)
                    message = ""
            #send the last package.        
            KafkaTopic.SendMessage(message,kafka_servers,topic,0)            
        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file,logger)
        logger.info("File {0} has been successfully sent to Kafka Topic: {1}".format(file,topic))

    except Exception as err:        
        logger.error("There was a problem, please check the following error message:{0}".format(err.message))
        logger.error("Exception: {0}".format(err))
Example No. 25
def ingest_file(file,pkt_num,pcap_split_staging, partition,hdfs_root_path,topic,kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))
    
    try:
        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts)-1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(pkt_num,file,pcap_split_staging,name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd,logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file,logger)    

        for currdir,subdir,files in os.walk(pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_spot".format(name) in file:

                        # get timestamp from the file name to build hdfs path.
                        file_date = file.split('.')[0]
                        pcap_hour = file_date[-6:-4]
                        pcap_date_path = file_date[-14:-6]

                        # hdfs path with timestamp.
                        hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path,pcap_date_path,pcap_hour)

                        # create hdfs path.
                        Util.creat_hdfs_folder(hdfs_path,logger)

                        # load file to hdfs.
                        hadoop_pcap_file = "{0}/{1}".format(hdfs_path,file)
                        Util.load_to_hdfs(os.path.join(currdir,file),hadoop_pcap_file,logger)

                        # create event for workers to process the file.
                        logger.info( "Sending split file to worker number: {0}".format(partition))
                        KafkaTopic.SendMessage(hadoop_pcap_file,kafka_servers,topic,partition)
                        logger.info("File {0} has been successfully sent to Kafka Topic to: {1}".format(file,topic))


  
    except Exception as err:
        
        logger.error("There was a problem, please check the following error message:{0}".format(err.message))
        logger.error("Exception: {0}".format(err))
Example No. 26
def start_worker(type,topic,id,processes=None):

    logger = Util.get_logger("SPOT.INGEST.WORKER")

    # validate the given configuration exists in ingest_conf.json.
    if not type in worker_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type));
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(worker_conf["pipelines"][type]["type"]):
        logger.error("The provided data source {0} is not valid".format(type));sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # create a worker instance based on the data source type.
    module = __import__("pipelines.{0}.worker".format(worker_conf["pipelines"][type]["type"]),fromlist=['Worker'])

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = worker_conf["kafka"]['kafka_server']
    k_port = worker_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = worker_conf["kafka"]['zookeper_server']
    zk_port = worker_conf["kafka"]['zookeper_port']
    topic = topic

    # create kafka consumer.
    kafka_consumer = KafkaConsumer(topic,k_server,k_port,zk_server,zk_port,id)

    # start worker.
    db_name = worker_conf['dbname']
    app_path = worker_conf['hdfs_app_path']
    ingest_worker = module.Worker(db_name,app_path,kafka_consumer,type,processes)
    ingest_worker.start()
Example No. 27
def convert(logfile, tmpdir, opts='', prefix=None):
    '''
        Copy log file to the local staging area.

    :param logfile: Path of log file.
    :param tmpdir : Path of local staging area.
    :param opts   : A set of options for the `cp` command.
    :param prefix : If `prefix` is specified, the file name will begin with that;
                     otherwise, a default `prefix` is used.
    :returns      : Path of log file in local staging area.
    :rtype        : ``str``
    '''
    logger = logging.getLogger('SPOT.INGEST.PROXY.PROCESS')

    with tempfile.NamedTemporaryFile(prefix=prefix, dir=tmpdir,
                                     delete=False) as fp:
        command = COMMAND.format(opts, logfile, fp.name)

        logger.debug('Execute command: {0}'.format(command))
        Util.popen(command, raises=True)

        return fp.name
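COMMAND is a module-level constant that is not part of this snippet. Given the docstring (copy the log file to the local staging area) and the three format arguments (opts, source, destination), a plausible placeholder and an example call are shown below; both are assumptions, not the project's actual definition.

# Assumed module-level constant; the real module may define it differently.
COMMAND = 'cp {0} {1} {2}'

# Example use: stage a proxy log into /tmp with a recognisable prefix.
# staged_path = convert('/var/log/proxy/access.log', '/tmp', prefix='proxy_')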
Example No. 28
    def _create_topic(self):

        self._logger.info("Creating topic: {0} with {1} parititions".format(
            self._topic, self._num_of_partitions))

        # Create partitions for the workers.
        self._partitions = [
            TopicPartition(self._topic, p)
            for p in range(int(self._num_of_partitions))
        ]

        # create partitioner
        self._partitioner = RoundRobinPartitioner(self._partitions)

        # get script path
        zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
        create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
            os.path.dirname(os.path.abspath(__file__)), self._topic, zk_conf,
            self._num_of_partitions)

        # execute create topic cmd
        Util.execute_cmd(create_topic_cmd, self._logger)
Example No. 29
    def start(self):

        self._logger.info("Creating Spark Job for topic: {0}".format(
            self._kafka_consumer.Topic))

        # parser
        parser = self._conf["parser"]

        #spark conf
        driver_memory = self._spark_conf["driver_memory"]
        num_exec = self._spark_conf["spark_exec"]
        exec_memory = self._spark_conf["spark_executor_memory"]
        exec_cores = self._spark_conf["spark_executor_cores"]
        batch_size = self._spark_conf["spark_batch_size"]

        jar_path = os.path.dirname(os.path.dirname(self._script_path))
        # spark job command.
        spark_job_cmd = (
            "spark-submit --master yarn "
            "--driver-memory {0} "
            "--num-executors {1} "
            "--conf spark.executor.memory={2} "
            "--conf spark.executor.cores={3} "
            "--jars {4}/common/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar "
            "{5}/{6} "
            "-zk {7} "
            "-t {8} "
            "-db {9} "
            "-dt {10} "
            "-w {11} "
            "-bs {12}".format(diver_memory, num_exec, exec_memory, exec_cores,
                              jar_path, self._script_path, parser,
                              self._kafka_consumer.ZookeperServer,
                              self._kafka_consumer.Topic, self._db_name,
                              "proxy", self._processes, batch_size))

        # start spark job.
        Util.execute_cmd(spark_job_cmd, self._logger)
Example No. 30
def convert(netflow, tmpdir, opts='', prefix=None):
    '''
        Convert `nfcapd` file to a comma-separated output format.

    :param netflow : Path of binary file.
    :param tmpdir  : Path of local staging area.
    :param opts    : A set of options for `nfdump` command.
    :param prefix  : If `prefix` is specified, the file name will begin with that;
                     otherwise, a default `prefix` is used.
    :returns       : Path of CSV-converted file.
    :rtype         : ``str``
    :raises OSError: If an error occurs while executing the `nfdump` command.
    '''
    logger = logging.getLogger('SPOT.INGEST.FLOW.PROCESS')

    with tempfile.NamedTemporaryFile(prefix=prefix, dir=tmpdir,
                                     delete=False) as fp:
        command = COMMAND.format(netflow, opts, fp.name)

        logger.debug('Execute command: {0}'.format(command))
        Util.popen(command, raises=True)

        return fp.name
Example No. 31
    def _initialize_members(self,db_name,hdfs_app_path,kafka_consumer,conf_type,processes):
        
        # get logger instance.
        self._logger = Util.get_logger('SPOT.INGEST.WRK.PROXY')

        self._db_name = db_name
        self._hdfs_app_path = hdfs_app_path
        self._kafka_consumer = kafka_consumer

        # read proxy configuration.
        self._script_path = os.path.dirname(os.path.abspath(__file__))
        conf_file = "{0}/ingest_conf.json".format(os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._spark_conf  = conf["spark-streaming"]
        self._conf = conf["pipelines"][conf_type]
        self._processes = processes
Example No. 32
    def _initialize_members(self,db_name,hdfs_app_path,kafka_consumer,conf_type):

        # get logger instance.
        self._logger = Util.get_logger('SPOT.INGEST.WRK.FLOW')

        self._db_name = db_name
        self._hdfs_app_path = hdfs_app_path

        # read proxy configuration.
        self._script_path = os.path.dirname(os.path.abspath(__file__))
        conf_file = "{0}/ingest_conf.json".format(os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._conf = conf["pipelines"][conf_type]

        self._process_opt = self._conf['process_opt']
        self._local_staging = self._conf['local_staging']
        self.kafka_consumer = kafka_consumer
Example No. 33
    def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer,
                            conf_type, processes):

        # get logger instance.
        self._logger = Util.get_logger('SPOT.INGEST.WRK.PROXY')

        self._db_name = db_name
        self._hdfs_app_path = hdfs_app_path
        self._kafka_consumer = kafka_consumer

        # read proxy configuration.
        self._script_path = os.path.dirname(os.path.abspath(__file__))
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._spark_conf = conf["spark-streaming"]
        self._conf = conf["pipelines"][conf_type]
        self._processes = processes
Example No. 34
    def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer,
                            conf_type):

        # get logger instance.
        self._logger = Util.get_logger('SPOT.INGEST.WRK.DNS')

        self._db_name = db_name
        self._hdfs_app_path = hdfs_app_path

        # read proxy configuration.
        self._script_path = os.path.dirname(os.path.abspath(__file__))
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._conf = conf["pipelines"][conf_type]

        self._process_opt = self._conf['process_opt']
        self._local_staging = self._conf['local_staging']
        self.kafka_consumer = kafka_consumer
Example No. 35
def _ingest_file(hdfs_client, new_file, pkt_num, pcap_split_staging,
                 hdfs_root_path, producer, topic):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))

    try:
        # get file name and date.
        org_file = new_file
        file_name_parts = new_file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(
            pkt_num, new_file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)

    except Exception as err:
        logger.error("There was a problem splitting the file: {0}".format(
            err.message))
        logger.error("Exception: {0}".format(err))

    for currdir, subdir, files in os.walk(pcap_split_staging):
        for file in files:
            if file.endswith(".pcap") and "{0}_spot".format(name) in file:
                # get timestamp from the file name to build hdfs path.
                file_date = file.split('.')[0]
                pcap_hour = file_date[-6:-4]
                pcap_date_path = file_date[-14:-6]

                # hdfs path with timestamp.
                hdfs_path = "{0}/binary/{1}/{2}".format(
                    hdfs_root_path, pcap_date_path, pcap_hour)

                # create hdfs path.
                try:
                    if len(hdfs.list_dir(hdfs_path, hdfs_client)) == 0:
                        logger.info('creating directory: ' + hdfs_path)
                        hdfs_client.mkdir(hdfs_path, hdfs_client)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    result = hdfs_client.upload_file(
                        hadoop_pcap_file, os.path.join(currdir, file))
                    if not result:
                        logger.error('File failed to upload: ' +
                                     hadoop_pcap_file)
                        raise HdfsException

                    # create event for workers to process the file.
                    logger.info(
                        "Sending split file to Topic: {0}".format(topic))
                    producer.SendMessage(hadoop_pcap_file, topic)
                    logger.info(
                        "File {0} has been successfully sent to Kafka Topic to: {1}"
                        .format(file, topic))

                except HdfsException as err:
                    logger.error('Exception: ' + err.exception)
                    logger.info(
                        'Check Hdfs Connection settings and server health')

                except Exception as err:
                    logger.info(
                        "File {0} failed to be sent to Kafka Topic to: {1}".
                        format(new_file, topic))
                    logger.error("Error: {0}".format(err))
Example No. 36
    def authenticate(self):

        Util.execute_cmd(self._kinit_cmd, self._logger)
        self._logger.info("Kerberos ticket obtained")
Example No. 37
    def _process_new_file(self, file):

        # get file from hdfs
        get_file_cmd = "hadoop fs -get {0} {1}.".format(
            file, self._local_staging)
        self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
        Util.execute_cmd(get_file_cmd, self._logger)

        # get file name and date
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        binary_hour = file_name_parts[len(file_name_parts) - 2]
        binary_date_path = file_name_parts[len(file_name_parts) - 3]
        binary_year = binary_date_path[0:4]
        binary_month = binary_date_path[4:6]
        binary_day = binary_date_path[6:8]

        # build process cmd.
        process_cmd = "tshark -r {0}{1} {2} > {0}{1}.csv".format(
            self._local_staging, file_name, self._process_opt)
        self._logger.info("Processing file: {0}".format(process_cmd))
        Util.execute_cmd(process_cmd, self._logger)

        # create hdfs staging.
        hdfs_path = "{0}/dns".format(self._hdfs_app_path)
        staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
        hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path,
                                                   staging_timestamp)
        create_staging_cmd = "hadoop fs -mkdir -p {0}".format(
            hdfs_staging_path)
        self._logger.info("Creating staging: {0}".format(create_staging_cmd))
        Util.execute_cmd(create_staging_cmd, self._logger)

        # move to stage.
        mv_to_staging = "hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(
            self._local_staging, file_name, hdfs_staging_path)
        self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
        Util.execute_cmd(mv_to_staging, self._logger)

        #load to avro
        load_to_avro_cmd = "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} -hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' -f pipelines/dns/load_dns_avro_parquet.hql".format(
            self._db_name, binary_year, binary_month, binary_day, binary_hour,
            hdfs_staging_path)

        self._logger.info("Loading data to hive: {0}".format(load_to_avro_cmd))
        Util.execute_cmd(load_to_avro_cmd, self._logger)

        # remove from hdfs staging
        rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(
            hdfs_staging_path)
        self._logger.info(
            "Removing staging path: {0}".format(rm_hdfs_staging_cmd))
        Util.execute_cmd(rm_hdfs_staging_cmd, self._logger)

        # remove from local staging.
        rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
        self._logger.info(
            "Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging, self._logger)

        self._logger.info(
            "File {0} was successfully processed.".format(file_name))
Example No. 38
    def _process_new_file(self,file):

        # get file from hdfs
        get_file_cmd = "hadoop fs -get {0} {1}.".format(file,self._local_staging)
        self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
        Util.execute_cmd(get_file_cmd,self._logger)

        # get file name and date
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts)-1]

        flow_date = file_name.split('.')[1]
        flow_year = flow_date[0:4]
        flow_month = flow_date[4:6]
        flow_day = flow_date[6:8]
        flow_hour = flow_date[8:10]

        # build process cmd.
        process_cmd = "nfdump -o csv -r {0}{1} {2} > {0}{1}.csv".format(self._local_staging,file_name,self._process_opt)
        self._logger.info("Processing file: {0}".format(process_cmd))
        Util.execute_cmd(process_cmd,self._logger)        

        # create hdfs staging.
        hdfs_path = "{0}/flow".format(self._hdfs_app_path)
        staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
        hdfs_staging_path =  "{0}/stage/{1}".format(hdfs_path,staging_timestamp)
        create_staging_cmd = "hadoop fs -mkdir -p {0}".format(hdfs_staging_path)
        self._logger.info("Creating staging: {0}".format(create_staging_cmd))
        Util.execute_cmd(create_staging_cmd,self._logger)

        # move to stage.
        mv_to_staging ="hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(self._local_staging,file_name,hdfs_staging_path)
        self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
        subprocess.call(mv_to_staging,shell=True)

        #load to avro
        load_to_avro_cmd = "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} -hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' -f pipelines/flow/load_flow_avro_parquet.hql".format(self._db_name,flow_year,flow_month,flow_day,flow_hour,hdfs_staging_path)

        self._logger.info( "Loading data to hive: {0}".format(load_to_avro_cmd))
        Util.execute_cmd(load_to_avro_cmd,self._logger)

        # remove from hdfs staging
        rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(hdfs_staging_path)
        self._logger.info("Removing staging path: {0}".format(rm_hdfs_staging_cmd))
        Util.execute_cmd(rm_hdfs_staging_cmd,self._logger)

        # remove from local staging.
        rm_local_staging = "rm {0}{1}".format(self._local_staging,file_name)
        self._logger.info("Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging,self._logger)

        self._logger.info("File {0} was successfully processed.".format(file_name))
Example No. 39
    def _process_new_file(self, nf):

        # get file from hdfs
        self._logger.info("Getting file from hdfs: {0}".format(nf))
        if hdfs.file_exists(nf):
            hdfs.download_file(nf, self._local_staging)
        else:
            self._logger.info("file: {0} not found".format(nf))
            # TODO: error handling

        # get file name and date
        file_name_parts = nf.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        binary_hour = file_name_parts[len(file_name_parts) - 2]
        binary_date_path = file_name_parts[len(file_name_parts) - 3]
        binary_year = binary_date_path[0:4]
        binary_month = binary_date_path[4:6]
        binary_day = binary_date_path[6:8]

        # build process cmd.
        process_cmd = "tshark -r {0}{1} {2} > {0}{1}.csv".format(
            self._local_staging, file_name, self._process_opt)
        self._logger.info("Processing file: {0}".format(process_cmd))
        Util.execute_cmd(process_cmd, self._logger)

        # create hdfs staging.
        hdfs_path = "{0}/dns".format(self._hdfs_app_path)
        staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
        hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path,
                                                   staging_timestamp)
        self._logger.info("Creating staging: {0}".format(hdfs_staging_path))
        hdfs.mkdir(hdfs_staging_path)

        # move to stage.
        local_file = "{0}{1}.csv".format(self._local_staging, file_name)
        self._logger.info(
            "Moving data to staging: {0}".format(hdfs_staging_path))
        hdfs.upload_file(hdfs_staging_path, local_file)

        #load to avro
        drop_table = 'DROP TABLE IF EXISTS {0}.dns_tmp'.format(self._db_name)
        self._cursor.execute(drop_table)

        # Create external table
        create_external = (
            "\n"
            "CREATE EXTERNAL TABLE {0}.dns_tmp (\n"
            "  frame_day STRING,\n"
            "  frame_time STRING,\n"
            "  unix_tstamp BIGINT,\n"
            "  frame_len INT,\n"
            "  ip_src STRING,\n"
            "  ip_dst STRING,\n"
            "  dns_qry_name STRING,\n"
            "  dns_qry_type INT,\n"
            "  dns_qry_class STRING,\n"
            "  dns_qry_rcode INT,\n"
            "  dns_a STRING  \n"
            "  )\n"
            "  ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n"
            "  STORED AS TEXTFILE\n"
            "  LOCATION '{1}'\n"
            "  TBLPROPERTIES ('avro.schema.literal'='{{\n"
            "  \"type\":   \"record\"\n"
            "  , \"name\":   \"RawDnsRecord\"\n"
            "  , \"namespace\" : \"com.cloudera.accelerators.dns.avro\"\n"
            "  , \"fields\": [\n"
            "      {{\"name\": \"frame_day\",        \"type\":[\"string\", \"null\"]}\n"
            "      , {{\"name\": \"frame_time\",     \"type\":[\"string\", \"null\"]}\n"
            "      , {{\"name\": \"unix_tstamp\",    \"type\":[\"bigint\", \"null\"]}\n"
            "      , {{\"name\": \"frame_len\",      \"type\":[\"int\",    \"null\"]}\n"
            "      , {{\"name\": \"ip_src\",         \"type\":[\"string\", \"null\"]}\n"
            "      , {{\"name\": \"ip_dst\",         \"type\":[\"string\", \"null\"]}\n"
            "      , {{\"name\": \"dns_qry_name\",   \"type\":[\"string\", \"null\"]}\n"
            "      , {{\"name\": \"dns_qry_type\",   \"type\":[\"int\",    \"null\"]}\n"
            "      , {{\"name\": \"dns_qry_class\",  \"type\":[\"string\", \"null\"]}\n"
            "      , {{\"name\": \"dns_qry_rcode\",  \"type\":[\"int\",    \"null\"]}\n"
            "      , {{\"name\": \"dns_a\",          \"type\":[\"string\", \"null\"]}\n"
            "      ]\n"
            "}')\n").format(self._db_name, hdfs_staging_path)
        self._logger.info(
            "Creating external table: {0}".format(create_external))
        self._cursor.execute(create_external)

        # Insert data
        insert_into_table = """
            INSERT INTO TABLE {0}.dns
            PARTITION (y={1}, m={2}, d={3}, h={4})
            SELECT   CONCAT(frame_day , frame_time) as treceived, unix_tstamp, frame_len, ip_dst, ip_src, dns_qry_name,
            dns_qry_class,dns_qry_type, dns_qry_rcode, dns_a 
            FROM {0}.dns_tmp
        """.format(self._db_name, binary_year, binary_month, binary_day,
                   binary_hour)
        self._logger.info("Loading data to {0}: {1}".format(
            self._db_name, insert_into_table))
        self._cursor.execute(insert_into_table)

        # remove from hdfs staging
        self._logger.info(
            "Removing staging path: {0}".format(hdfs_staging_path))
        hdfs.delete_folder(hdfs_staging_path)

        # remove from local staging.
        rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
        self._logger.info(
            "Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging, self._logger)

        self._logger.info(
            "File {0} was successfully processed.".format(file_name))
Example No. 40
    def _process_new_file(self, nf):

        # get file name and date
        file_name_parts = nf.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]
        nf_path = nf.rstrip(file_name)
        flow_date = file_name.split('.')[1]
        flow_year = flow_date[0:4]
        flow_month = flow_date[4:6]
        flow_day = flow_date[6:8]
        flow_hour = flow_date[8:10]

        # get file from hdfs
        if hdfs.file_exists(nf_path, file_name):
            self._logger.info("Getting file from hdfs: {0}".format(nf))
            hdfs.download_file(nf, self._local_staging)
        else:
            self._logger.info("file: {0} not found".format(nf))
            # TODO: error handling

        # build process cmd.
        sf = "{0}{1}.csv".format(self._local_staging, file_name)
        process_cmd = "nfdump -o csv -r {0}{1} {2} > {3}".format(
            self._local_staging, file_name, self._process_opt, sf)
        self._logger.info("Processing file: {0}".format(process_cmd))
        Util.execute_cmd(process_cmd, self._logger)

        # create hdfs staging.
        hdfs_path = "{0}/flow".format(self._hdfs_app_path)
        staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
        hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path,
                                                   staging_timestamp)
        self._logger.info("Creating staging: {0}".format(hdfs_staging_path))
        hdfs.mkdir(hdfs_staging_path)

        # move to stage.
        local_file = "{0}{1}.csv".format(self._local_staging, file_name)
        self._logger.info(
            "Moving data to staging: {0}".format(hdfs_staging_path))
        hdfs.upload_file(hdfs_staging_path, local_file)

        # load with impyla
        drop_table = "DROP TABLE IF EXISTS {0}.flow_tmp".format(self._db_name)
        self._logger.info("Dropping temp table: {0}".format(drop_table))
        self._cursor.execute_query(drop_table)

        create_external = (
            "\n"
            "CREATE EXTERNAL TABLE {0}.flow_tmp (\n"
            "  treceived STRING,\n"
            "  tryear INT,\n"
            "  trmonth INT,\n"
            "  trday INT,\n"
            "  trhour INT,\n"
            "  trminute INT,\n"
            "  trsec INT,\n"
            "  tdur FLOAT,\n"
            "  sip  STRING,\n"
            "  dip STRING,\n"
            "  sport INT,\n"
            "  dport INT,\n"
            "  proto STRING,\n"
            "  flag STRING,\n"
            "  fwd INT,\n"
            "  stos INT,\n"
            "  ipkt BIGINT,\n"
            "  ibyt BIGINT,\n"
            "  opkt BIGINT,\n"
            "  obyt BIGINT,\n"
            "  input INT,\n"
            "  output INT,\n"
            "  sas INT,\n"
            "  das INT,\n"
            "  dtos INT,\n"
            "  dir INT,\n"
            "  rip STRING\n"
            "  )\n"
            "  ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n"
            "  STORED AS TEXTFILE\n"
            "  LOCATION '{1}'\n"
            "  TBLPROPERTIES ('avro.schema.literal'='{{\n"
            "  \"type\":   \"record\"\n"
            "  , \"name\":   \"RawFlowRecord\"\n"
            "  , \"namespace\" : \"com.cloudera.accelerators.flows.avro\"\n"
            "  , \"fields\": [\n"
            "      {{\"name\": \"treceived\",             \"type\":[\"string\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"tryear\",              \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"trmonth\",             \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"trday\",               \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"trhour\",              \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"trminute\",            \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"trsec\",               \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"tdur\",                \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"sip\",                \"type\":[\"string\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"sport\",                 \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"dip\",                \"type\":[\"string\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"dport\",                 \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"proto\",              \"type\":[\"string\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"flag\",               \"type\":[\"string\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"fwd\",                   \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"stos\",                  \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"ipkt\",               \"type\":[\"bigint\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"ibyt\",              \"type\":[\"bigint\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"opkt\",               \"type\":[\"bigint\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"obyt\",               \"type\":[\"bigint\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"input\",                 \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"output\",                \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"sas\",                   \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"das\",                   \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"dtos\",                  \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"dir\",                   \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"rip\",                \"type\":[\"string\",   \"null\"]}}\n"
            "      ]\n"
            "}}')\n").format(self._db_name, hdfs_staging_path)
        self._logger.info(
            "Creating external table: {0}".format(create_external))
        self._cursor.execute_query(create_external)

        insert_into_table = """
        INSERT INTO TABLE {0}.flow
        PARTITION (y={1}, m={2}, d={3}, h={4})
        SELECT   treceived,  unix_timestamp(treceived) AS unix_tstamp, tryear,  trmonth, trday,  trhour,  trminute,  trsec,
          tdur,  sip, dip, sport, dport,  proto,  flag,  fwd,  stos,  ipkt,  ibyt,  opkt,  obyt,  input,  output,
          sas,  das,  dtos,  dir,  rip
        FROM {0}.flow_tmp
        """.format(self._db_name, flow_year, flow_month, flow_day, flow_hour)
        self._logger.info("Loading data to {0}: {1}".format(
            self._db_name, insert_into_table))
        self._cursor.execute_query(insert_into_table)

        # remove from hdfs staging
        self._logger.info(
            "Removing staging path: {0}".format(hdfs_staging_path))
        hdfs.delete_folder(hdfs_staging_path)

        # remove from local staging.
        rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
        self._logger.info(
            "Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging, self._logger)

        rm_local_staging = "rm {0}".format(sf)
        self._logger.info(
            "Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging, self._logger)

        self._logger.info(
            "File {0} was successfully processed.".format(file_name))