def ingest_file(file, partition, hdfs_root_path, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.FLOW.{0}'.format(os.getpid()))

    try:
        # get file name and date.
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]
        file_date = file_name.split('.')[1]
        file_date_path = file_date[0:8]
        file_date_hour = file_date[8:10]

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, file_date_path, file_date_hour)
        Util.creat_hdfs_folder(hdfs_path, logger)

        # load to hdfs.
        hdfs_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.load_to_hdfs(file, hdfs_file, logger)

        # create event for workers to process the file.
        logger.info("Sending file to worker number: {0}".format(partition))
        KafkaTopic.SendMessage(hdfs_file, kafka_servers, topic, partition)
        logger.info("File {0} has been successfully sent to Kafka Topic: {1}".format(file, topic))

    except Exception as err:
        logger.error("There was a problem, please check the following error message: {0}".format(err.message))
        logger.error("Exception: {0}".format(err))
def ingest_file(file, message_size, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.PROXY.{0}'.format(os.getpid()))

    try:
        message = ""
        logger.info("Ingesting file: {0} process:{1}".format(file, os.getpid()))

        with open(file, "rb") as f:
            for line in f:
                message += line
                if len(message) > message_size:
                    KafkaProducer.SendMessage(message, kafka_servers, topic, 0)
                    message = ""

            # send the last package.
            KafkaProducer.SendMessage(message, kafka_servers, topic, 0)

        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file, logger)
        logger.info("File {0} has been successfully sent to Kafka Topic: {1}".format(file, topic))

    except Exception as err:
        logger.error("There was a problem, please check the following error message:{0}".format(err.message))
        logger.error("Exception: {0}".format(err))
def run(cls):
    '''
        Main command-line entry point.

    :param cls: The class as implicit first argument.
    '''
    try:
        args = _parse_args()
        conf = json.loads(args.config_file.read())

        # .........................set up logger
        Util.get_logger('SPOT', args.log_level)

        # .........................check kerberos authentication
        if os.getenv('KRB_AUTH'):
            kb = Kerberos()
            kb.authenticate()

        conf['producer'] = {
            'bootstrap_servers': ['{0}:{1}'.format(
                conf['kafka']['kafka_server'], conf['kafka']['kafka_port'])]
        }

        conf['file_watcher'] = {
            'path': conf['pipelines'][args.type]['collector_path'],
            'supported_files': conf['pipelines'][args.type]['supported_files'],
            'recursive': True
        }

        # .........................migrate configs
        if 'local_staging' not in conf['pipelines'][args.type].keys():
            conf['pipelines'][args.type]['local_staging'] = '/tmp'

        if 'max_request_size' in conf['kafka'].keys():
            conf['producer']['max_request_size'] = conf['kafka']['max_request_size']

        if 'process_opt' not in conf['pipelines'][args.type].keys():
            conf['pipelines'][args.type]['process_opt'] = ''

        if 'recursive' in conf['pipelines'][args.type].keys():
            conf['file_watcher']['recursive'] = conf['pipelines'][args.type]['recursive']

        collector = cls(args.type, args.topic, args.skip_conversion, **conf)
        collector.start()

    except SystemExit:
        raise
    except:
        sys.excepthook(*sys.exc_info())
        sys.exit(1)
def test_add_fail(self):
    res = Util.safe_call(self.dbm.add_driver, 1, {})
    self.assertTrue(isinstance(res, DbException),
                    "Expected exception when adding invalid driver, got %s" % res)

    # check that we can't insert driver with existing id
    res = Util.safe_call(self.dbm.add_driver, 0, DbManagerTestCase.TestDriver(0))
    self.assertTrue(isinstance(res, DbException),
                    "Expected exception, but got %s" % res)
    self.assertEqual(len(self.dbm), 1, "DB manager size changed")
def test_cmd_all(self):
    res = Util.safe_call(self.dbm.add_driver, 1, DbManagerTestCase.TestDriver(1))
    self.assertTrue(not isinstance(res, Exception),
                    "Exception adding new driver: %s" % res)

    arg = 1
    exp_vals = range(arg, len(self.dbm) + arg)
    res = Util.safe_call(self.dbm.cmd_all, lambda db: db.echo(arg))
    self.assertTrue(not isinstance(res, Exception), "Exception: %s" % res)
    self.assertTrue(not Util.list_diff(res, exp_vals),
                    "Got %s, expected %s" % (res, exp_vals))
def spark_job(script_file, **kwargs):
    '''
        Run given script file by applying it as a Spark Job.
    '''
    spark_job = 'spark2-submit --master {0}'.format(kwargs.pop('master'))
    spark_job += ' --deploy-mode {0}'.format(kwargs.pop('deploy_mode'))
    spark_job += ' --py-files {0}'.format(kwargs.pop('py_files'))

    if 'driver_memory' in kwargs.keys():
        spark_job += ' --driver-memory {0}'.format(kwargs.pop('driver_memory'))

    if 'spark_exec' in kwargs.keys():
        spark_job += ' --num-executors {0}'.format(kwargs.pop('spark_exec'))

    if 'spark_executor_memory' in kwargs.keys():
        spark_job += ' --conf spark.executor.memory={0}'.format(kwargs.pop('spark_executor_memory'))

    if 'spark_executor_cores' in kwargs.keys():
        spark_job += ' --conf spark.executor.cores={0}'.format(kwargs.pop('spark_executor_cores'))

    spark_job += ' {0}'.format(os.path.abspath(script_file))

    if 'spark_batch_size' in kwargs.keys():
        spark_job += ' -b {0}'.format(kwargs.pop('spark_batch_size'))

    spark_job += ' -d {0}'.format(kwargs.pop('database'))

    if kwargs['group_id'] is not None:
        spark_job += ' -g {0}'.format(kwargs.pop('group_id'))

    spark_job += ' -l {0}'.format(kwargs.pop('log_level'))

    if kwargs['app_name'] is not None:
        spark_job += ' -n {0}'.format(kwargs.pop('app_name'))

    spark_job += ' -p {0}'.format(kwargs.pop('partitions'))
    spark_job += ' -t {0}'.format(kwargs.pop('type'))
    spark_job += ' --topic {0}'.format(kwargs.pop('topic'))
    spark_job += ' --zkquorum {0}'.format(kwargs.pop('zkquorum'))

    if kwargs['redirect_spark_logs'] is not None:
        spark_job += ' 2>{0}'.format(kwargs.pop('redirect_spark_logs'))

    try:
        Util.call(spark_job, True)
    except Exception as exc:
        sys.stderr.write('Failed to submit Spark Job!\n')
        sys.stderr.write('[{0}] {1}\n\n'.format(exc.__class__.__name__, exc.message))
        sys.exit(2)
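A minimal sketch of how this builder might be invoked. The keyword names mirror the kwargs consumed above, but every value shown here is a placeholder assumption, not a default taken from the real ingest configuration.

# Hypothetical invocation of spark_job(); values are placeholders only.
# Optional tuning keys (driver_memory, spark_exec, ...) are omitted, while
# group_id, app_name and redirect_spark_logs are passed explicitly (even as
# None) because the builder indexes them before popping.
spark_job(
    'worker.py',
    master='yarn',
    deploy_mode='client',
    py_files='common.zip',
    database='spotdb',
    group_id=None,
    log_level='INFO',
    app_name=None,
    partitions=1,
    type='flow',
    topic='SPOT-INGEST-FLOW-TEST',
    zkquorum='localhost:2181',
    redirect_spark_logs=None)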
def _create_topic(self):

    self._logger.info("Creating topic: {0} with {1} partitions".format(
        self._topic, self._num_of_partitions))

    # get script path
    zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
    create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
        os.path.dirname(os.path.abspath(__file__)), self._topic, zk_conf,
        self._num_of_partitions)

    # execute create topic cmd
    Util.execute_cmd(create_topic_cmd, self._logger)
def _ingest_file(new_file, hdfs_root_path, producer, topic):

    logger = logging.getLogger('SPOT.INGEST.FLOW.{0}'.format(os.getpid()))

    try:
        # get file name and date.
        file_name_parts = new_file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        file_date = file_name.split('.')[1]
        file_date_path = file_date[0:8]
        file_date_hour = file_date[8:10]

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, file_date_path, file_date_hour)
        hdfs_file = "{0}/{1}".format(hdfs_path, file_name)

        try:
            if len(hdfs.list_dir(hdfs_path)) == 0:
                logger.info('creating directory: ' + hdfs_path)
                hdfs.mkdir(hdfs_path)

            logger.info('uploading file to hdfs: ' + hdfs_file)
            result = hdfs.upload_file(hdfs_path, new_file)
            if not result:
                logger.error('File failed to upload: ' + hdfs_file)
                raise HdfsException
            else:
                rm_file = "rm {0}".format(new_file)
                logger.info("Removing files from local staging: {0}".format(rm_file))
                Util.execute_cmd(rm_file, logger)

        except HdfsException as err:
            logger.error('Exception: ' + err.exception)
            logger.info('Check Hdfs Connection settings and server health')

    except Exception as err:
        logger.error("There was a problem, Exception: {0}".format(err))

    # create event for workers to process the file.
    # logger.info("Sending file to worker number: {0}".format(partition))
    try:
        producer.SendMessage(hdfs_file, topic)
        logger.info("File {0} has been successfully sent to Kafka Topic: {1}".format(hdfs_file, topic))
    except Exception as err:
        logger.info("File {0} failed to be sent to Kafka Topic: {1}".format(hdfs_file, topic))
        logger.error("Error: {0}".format(err))
def start_collector(type, workers_num, id=None):

    # generate ingest id
    ingest_id = str(datetime.datetime.time(datetime.datetime.now())).replace(
        ":", "_").replace(".", "_")

    # create logger.
    logger = Util.get_logger("SPOT.INGEST")

    # validate the given configuration exists in ingest_conf.json.
    if not type in master_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(master_conf["pipelines"][type]["type"]):
        logger.error("'{0}' type is not configured. Please check your ingest conf file.".format(
            master_conf["pipelines"][type]["type"]))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = master_conf["kafka"]['kafka_server']
    k_port = master_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = master_conf["kafka"]['zookeper_server']
    zk_port = master_conf["kafka"]['zookeper_port']

    topic = "SPOT-INGEST-{0}_{1}".format(type, ingest_id) if not id else id
    kafka = KafkaTopic(topic, k_server, k_port, zk_server, zk_port, workers_num)

    # create a collector instance based on data source type.
    logger.info("Starting {0} ingest instance".format(topic))
    module = __import__("pipelines.{0}.collector".format(
        master_conf["pipelines"][type]["type"]), fromlist=['Collector'])

    # start collector.
    ingest_collector = module.Collector(master_conf['hdfs_app_path'], kafka, type)
    ingest_collector.start()
def ingest_file(file, pkt_num, pcap_split_staging, partition, hdfs_root_path, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))

    try:
        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(
            pkt_num, file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)

        for currdir, subdir, files in os.walk(pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_spot".format(name) in file:

                    # get timestamp from the file name to build hdfs path.
                    file_date = file.split('.')[0]
                    pcap_hour = file_date[-6:-4]
                    pcap_date_path = file_date[-14:-6]

                    # hdfs path with timestamp.
                    hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, pcap_date_path, pcap_hour)

                    # create hdfs path.
                    Util.creat_hdfs_folder(hdfs_path, logger)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.load_to_hdfs(os.path.join(currdir, file), hadoop_pcap_file, logger)

                    # create event for workers to process the file.
                    logger.info("Sending split file to worker number: {0}".format(partition))
                    KafkaTopic.SendMessage(hadoop_pcap_file, kafka_servers, topic, partition)
                    logger.info("File {0} has been successfully sent to Kafka Topic: {1}".format(file, topic))

    except Exception as err:
        logger.error("There was a problem, please check the following error message: {0}".format(err.message))
        logger.error("Exception: {0}".format(err))
def test_add(self):
    self.assertEqual(len(self.dbm), 0, "DB Manager not empty")
    res = Util.safe_call(self.dbm.add_driver, 0, DbManagerTestCase.TestDriver(0))
    self.assertTrue(not isinstance(res, Exception),
                    "Exception adding driver: %s" % res)
    self.assertEqual(len(self.dbm), 1, "DB Manager is still empty")
def test_cmd_one(self):
    test_val = 5
    res = Util.safe_call(self.dbm.cmd_one, 0, lambda db: db.echo(test_val))
    self.assertTrue(not isinstance(res, Exception),
                    "Exception calling method: %s" % res)
    self.assertEqual(res, test_val, "Got %s, expected %d" % (res, test_val))
def __init__(self):

    self._logger = Util.get_logger('SPOT.COMMON.KERBEROS')
    principal, keytab, sasl_mech, security_proto = config.kerberos()

    if os.getenv('KINITPATH'):
        self._kinit = os.getenv('KINITPATH')
    else:
        self._kinit = "kinit"

    self._kinitopts = os.getenv('KINITOPTS')
    self._keytab = "-kt {0}".format(keytab)
    self._krb_user = principal

    if self._kinit is None or self._keytab is None or self._krb_user is None:
        self._logger.error("Please verify kerberos configuration, some environment variables are missing.")
        sys.exit(1)

    if self._kinitopts is None:
        self._kinit_cmd = "{0} {1} {2}".format(self._kinit, self._keytab, self._krb_user)
    else:
        self._kinit_cmd = "{0} {1} {2} {3}".format(self._kinit, self._kinitopts, self._keytab, self._krb_user)
def start(self):

    self._logger.info("Starting PROXY collector")
    self._watcher.start()

    try:
        while True:
            # self._ingest_files()
            self._ingest_files_pool()
            time.sleep(self._ingestion_interval)
    except KeyboardInterrupt:
        self._logger.info("Stopping Proxy collector...")
        Util.remove_kafka_topic(self._kafka_topic.Zookeeper, self._kafka_topic.Topic, self._logger)
        self._watcher.stop()
        self._pool.terminate()
        self._pool.close()
        self._pool.join()
def start(self):

    self._logger.info("Starting FLOW ingest")
    self._watcher.start()

    try:
        while True:
            self._ingest_files_pool()
            time.sleep(self._ingestion_interval)
    except KeyboardInterrupt:
        self._logger.info("Stopping FLOW collector...")
        Util.remove_kafka_topic(self._kafka_topic.Zookeeper, self._kafka_topic.Topic, self._logger)
        self._watcher.stop()
        self._pool.terminate()
        self._pool.close()
        self._pool.join()
        raise SystemExit("Ingest finished...")
def start(self):

    self._logger.info("Starting DNS ingest")
    self._watcher.start()

    try:
        while True:
            self._ingest_files_pool()
            time.sleep(self._ingestion_interval)
    except KeyboardInterrupt:
        self._logger.info("Stopping DNS collector...")
        Util.remove_kafka_topic(self._producer.Zookeeper, self._producer.Topic, self._logger)
        self._watcher.stop()
        self._pool.terminate()
        self._pool.close()
        self._pool.join()
        raise SystemExit("Ingest finished...")
def start_worker(type, topic, id, processes=None):

    logger = Util.get_logger("SPOT.INGEST.WORKER")

    # validate the given configuration exists in ingest_conf.json.
    if not type in WORKER_CONF["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(WORKER_CONF["pipelines"][type]["type"]):
        logger.error("The provided data source {0} is not valid".format(type))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # create a worker instance based on the data source type.
    module = __import__("pipelines.{0}.worker".format(
        WORKER_CONF["pipelines"][type]["type"]), fromlist=['Worker'])

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = WORKER_CONF["kafka"]['kafka_server']
    k_port = WORKER_CONF["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = WORKER_CONF["kafka"]['zookeper_server']
    zk_port = WORKER_CONF["kafka"]['zookeper_port']

    # create kafka consumer.
    kafka_consumer = KafkaConsumer(topic, k_server, k_port, zk_server, zk_port, id)

    # start worker.
    db_name = WORKER_CONF['dbname']
    app_path = WORKER_CONF['hdfs_app_path']
    ingest_worker = module.Worker(db_name, app_path, kafka_consumer, type, processes)
    ingest_worker.start()
def index():
    data = apl.process_todays_claims()
    if not data.empty:
        columns = list(data.columns)
        columns.remove('_id')
        return render_template("claims.html",
                               date=Util.datetime(),
                               cols=columns,
                               rows=[r for _, r in data.iterrows()])
    else:
        return "No new claims found"
def test_fill_random(self):
    self._init_db('test')
    exp_cnt = 5
    res = Util.safe_call(self.modb.fill_random, 'check', self.test_temp, exp_cnt)
    self.assertTrue(not isinstance(res, DbException),
                    "Exception filling DB: %s" % res)
    cnt = self.modb.active_db['check'].count()
    self.assertEqual(cnt, exp_cnt + 1,
                     "Expected %d entries, got %d" % (exp_cnt + 1, cnt))
def test_add(self):
    db_name = 'test'
    res = Util.safe_call(self.modb.add_db, db_name)
    self.assertTrue(not isinstance(res, Exception),
                    "Exception adding database: %s" % res)

    # try adding DB with same name again
    res = Util.safe_call(self.modb.add_db, db_name)
    self.assertTrue(isinstance(res, DbException),
                    "Didn't get exception adding same DB name twice")

    # try accessing database that wasn't added
    try:
        self.modb.active_db = 'test1'
    except Exception as e:
        self.assertTrue(isinstance(e, DbException),
                        "Expected DB error, but got %s" % e)
    else:
        self.assertTrue(False)

    self.modb.active_db = db_name
    cnt = self.modb.active_db.count()
    self.assertEqual(cnt, 1, "Expected 1 table, got %d" % cnt)
def ingest_file(file, message_size, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.PROXY.{0}'.format(os.getpid()))

    try:
        message = ""
        logger.info("Ingesting file: {0} process:{1}".format(file, os.getpid()))

        with open(file, "rb") as f:
            for line in f:
                message += line
                if len(message) > message_size:
                    KafkaTopic.SendMessage(message, kafka_servers, topic, 0)
                    message = ""

            # send the last package.
            KafkaTopic.SendMessage(message, kafka_servers, topic, 0)

        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file, logger)
        logger.info("File {0} has been successfully sent to Kafka Topic: {1}".format(file, topic))

    except Exception as err:
        logger.error("There was a problem, please check the following error message:{0}".format(err.message))
        logger.error("Exception: {0}".format(err))
def start_worker(type, topic, id, processes=None):

    logger = Util.get_logger("SPOT.INGEST.WORKER")

    # validate the given configuration exists in ingest_conf.json.
    if not type in worker_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(worker_conf["pipelines"][type]["type"]):
        logger.error("The provided data source {0} is not valid".format(type))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # create a worker instance based on the data source type.
    module = __import__("pipelines.{0}.worker".format(
        worker_conf["pipelines"][type]["type"]), fromlist=['Worker'])

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = worker_conf["kafka"]['kafka_server']
    k_port = worker_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = worker_conf["kafka"]['zookeper_server']
    zk_port = worker_conf["kafka"]['zookeper_port']

    # create kafka consumer.
    kafka_consumer = KafkaConsumer(topic, k_server, k_port, zk_server, zk_port, id)

    # start worker.
    db_name = worker_conf['dbname']
    app_path = worker_conf['hdfs_app_path']
    ingest_worker = module.Worker(db_name, app_path, kafka_consumer, type, processes)
    ingest_worker.start()
def convert(logfile, tmpdir, opts='', prefix=None):
    '''
        Copy log file to the local staging area.

    :param logfile: Path of log file.
    :param tmpdir : Path of local staging area.
    :param opts   : A set of options for the `cp` command.
    :param prefix : If `prefix` is specified, the file name will begin with that;
                    otherwise, a default `prefix` is used.
    :returns      : Path of log file in local staging area.
    :rtype        : ``str``
    '''
    logger = logging.getLogger('SPOT.INGEST.PROXY.PROCESS')

    with tempfile.NamedTemporaryFile(prefix=prefix, dir=tmpdir, delete=False) as fp:
        command = COMMAND.format(opts, logfile, fp.name)

        logger.debug('Execute command: {0}'.format(command))
        Util.popen(command, raises=True)

        return fp.name
def _create_topic(self):

    self._logger.info("Creating topic: {0} with {1} partitions".format(
        self._topic, self._num_of_partitions))

    # Create partitions for the workers.
    self._partitions = [TopicPartition(self._topic, p)
                        for p in range(int(self._num_of_partitions))]

    # create partitioner
    self._partitioner = RoundRobinPartitioner(self._partitions)

    # get script path
    zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
    create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
        os.path.dirname(os.path.abspath(__file__)), self._topic, zk_conf,
        self._num_of_partitions)

    # execute create topic cmd
    Util.execute_cmd(create_topic_cmd, self._logger)
def start(self):

    self._logger.info("Creating Spark Job for topic: {0}".format(self._kafka_consumer.Topic))

    # parser
    parser = self._conf["parser"]

    # spark conf
    driver_memory = self._spark_conf["driver_memory"]
    num_exec = self._spark_conf["spark_exec"]
    exec_memory = self._spark_conf["spark_executor_memory"]
    exec_cores = self._spark_conf["spark_executor_cores"]
    batch_size = self._spark_conf["spark_batch_size"]

    jar_path = os.path.dirname(os.path.dirname(self._script_path))

    # spark job command.
    spark_job_cmd = ("spark-submit --master yarn "
                     "--driver-memory {0} "
                     "--num-executors {1} "
                     "--conf spark.executor.memory={2} "
                     "--conf spark.executor.cores={3} "
                     "--jars {4}/common/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar "
                     "{5}/{6} "
                     "-zk {7} "
                     "-t {8} "
                     "-db {9} "
                     "-dt {10} "
                     "-w {11} "
                     "-bs {12}".format(driver_memory, num_exec, exec_memory, exec_cores,
                                       jar_path, self._script_path, parser,
                                       self._kafka_consumer.ZookeperServer,
                                       self._kafka_consumer.Topic, self._db_name,
                                       "proxy", self._processes, batch_size))

    # start spark job.
    Util.execute_cmd(spark_job_cmd, self._logger)
def convert(netflow, tmpdir, opts='', prefix=None):
    '''
        Convert `nfcapd` file to a comma-separated output format.

    :param netflow : Path of binary file.
    :param tmpdir  : Path of local staging area.
    :param opts    : A set of options for `nfdump` command.
    :param prefix  : If `prefix` is specified, the file name will begin with that;
                     otherwise, a default `prefix` is used.
    :returns       : Path of CSV-converted file.
    :rtype         : ``str``
    :raises OSError: If an error occurs while executing the `nfdump` command.
    '''
    logger = logging.getLogger('SPOT.INGEST.FLOW.PROCESS')

    with tempfile.NamedTemporaryFile(prefix=prefix, dir=tmpdir, delete=False) as fp:
        command = COMMAND.format(netflow, opts, fp.name)

        logger.debug('Execute command: {0}'.format(command))
        Util.popen(command, raises=True)

        return fp.name
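A minimal usage sketch for the two `convert` helpers above. The module-level COMMAND template is defined elsewhere in each pipeline, so the nfcapd path, staging directory, and prefix shown here are illustrative assumptions only.

# Hypothetical call; paths are placeholders and the staging directory must
# already exist. convert() returns the name of a NamedTemporaryFile kept on
# disk (delete=False), which the caller is expected to clean up later.
csv_file = convert('/collector/flow/nfcapd.201801011200', '/tmp/spot',
                   opts='', prefix='nfcapd_csv_')
print(csv_file)   # e.g. /tmp/spot/nfcapd_csv_XXXXXX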
def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer, conf_type):

    # get logger instance.
    self._logger = Util.get_logger('SPOT.INGEST.WRK.FLOW')

    self._db_name = db_name
    self._hdfs_app_path = hdfs_app_path

    # read flow configuration.
    self._script_path = os.path.dirname(os.path.abspath(__file__))
    conf_file = "{0}/ingest_conf.json".format(
        os.path.dirname(os.path.dirname(self._script_path)))
    conf = json.loads(open(conf_file).read())
    self._conf = conf["pipelines"][conf_type]

    self._process_opt = self._conf['process_opt']
    self._local_staging = self._conf['local_staging']
    self.kafka_consumer = kafka_consumer
def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer, conf_type, processes):

    # get logger instance.
    self._logger = Util.get_logger('SPOT.INGEST.WRK.PROXY')

    self._db_name = db_name
    self._hdfs_app_path = hdfs_app_path
    self._kafka_consumer = kafka_consumer

    # read proxy configuration.
    self._script_path = os.path.dirname(os.path.abspath(__file__))
    conf_file = "{0}/ingest_conf.json".format(
        os.path.dirname(os.path.dirname(self._script_path)))
    conf = json.loads(open(conf_file).read())
    self._spark_conf = conf["spark-streaming"]
    self._conf = conf["pipelines"][conf_type]

    self._processes = processes
def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer, conf_type):

    # get logger instance.
    self._logger = Util.get_logger('SPOT.INGEST.WRK.DNS')

    self._db_name = db_name
    self._hdfs_app_path = hdfs_app_path

    # read dns configuration.
    self._script_path = os.path.dirname(os.path.abspath(__file__))
    conf_file = "{0}/ingest_conf.json".format(
        os.path.dirname(os.path.dirname(self._script_path)))
    conf = json.loads(open(conf_file).read())
    self._conf = conf["pipelines"][conf_type]

    self._process_opt = self._conf['process_opt']
    self._local_staging = self._conf['local_staging']
    self.kafka_consumer = kafka_consumer
def _ingest_file(hdfs_client, new_file, pkt_num, pcap_split_staging, hdfs_root_path, producer, topic):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))

    try:
        # get file name and date.
        org_file = new_file
        file_name_parts = new_file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(
            pkt_num, new_file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)

    except Exception as err:
        logger.error("There was a problem splitting the file: {0}".format(err.message))
        logger.error("Exception: {0}".format(err))

    for currdir, subdir, files in os.walk(pcap_split_staging):
        for file in files:
            if file.endswith(".pcap") and "{0}_spot".format(name) in file:

                # get timestamp from the file name to build hdfs path.
                file_date = file.split('.')[0]
                pcap_hour = file_date[-6:-4]
                pcap_date_path = file_date[-14:-6]

                # hdfs path with timestamp.
                hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, pcap_date_path, pcap_hour)

                # create hdfs path.
                try:
                    if len(hdfs.list_dir(hdfs_path, hdfs_client)) == 0:
                        logger.info('creating directory: ' + hdfs_path)
                        hdfs_client.mkdir(hdfs_path, hdfs_client)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    result = hdfs_client.upload_file(hadoop_pcap_file, os.path.join(currdir, file))
                    if not result:
                        logger.error('File failed to upload: ' + hadoop_pcap_file)
                        raise HdfsException

                    # create event for workers to process the file.
                    logger.info("Sending split file to Topic: {0}".format(topic))
                    producer.SendMessage(hadoop_pcap_file, topic)
                    logger.info("File {0} has been successfully sent to Kafka Topic: {1}".format(file, topic))

                except HdfsException as err:
                    logger.error('Exception: ' + err.exception)
                    logger.info('Check Hdfs Connection settings and server health')

                except Exception as err:
                    logger.info("File {0} failed to be sent to Kafka Topic: {1}".format(new_file, topic))
                    logger.error("Error: {0}".format(err))
def authenticate(self):

    Util.execute_cmd(self._kinit_cmd, self._logger)
    self._logger.info("Kerberos ticket obtained")
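Usage mirrors the collectors and workers above: instantiation reads the kerberos settings and authenticate() runs the assembled kinit command. A minimal sketch, assuming the same KRB_AUTH guard used elsewhere in this codebase:

# Same pattern as start_collector()/start_worker(): only authenticate when
# the deployment sets KRB_AUTH.
if os.getenv('KRB_AUTH'):
    kb = Kerberos()
    kb.authenticate()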
def _process_new_file(self, file):

    # get file from hdfs
    get_file_cmd = "hadoop fs -get {0} {1}.".format(file, self._local_staging)
    self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
    Util.execute_cmd(get_file_cmd, self._logger)

    # get file name and date
    file_name_parts = file.split('/')
    file_name = file_name_parts[len(file_name_parts) - 1]

    binary_hour = file_name_parts[len(file_name_parts) - 2]
    binary_date_path = file_name_parts[len(file_name_parts) - 3]
    binary_year = binary_date_path[0:4]
    binary_month = binary_date_path[4:6]
    binary_day = binary_date_path[6:8]

    # build process cmd.
    process_cmd = "tshark -r {0}{1} {2} > {0}{1}.csv".format(
        self._local_staging, file_name, self._process_opt)
    self._logger.info("Processing file: {0}".format(process_cmd))
    Util.execute_cmd(process_cmd, self._logger)

    # create hdfs staging.
    hdfs_path = "{0}/dns".format(self._hdfs_app_path)
    staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path, staging_timestamp)
    create_staging_cmd = "hadoop fs -mkdir -p {0}".format(hdfs_staging_path)
    self._logger.info("Creating staging: {0}".format(create_staging_cmd))
    Util.execute_cmd(create_staging_cmd, self._logger)

    # move to stage.
    mv_to_staging = "hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(
        self._local_staging, file_name, hdfs_staging_path)
    self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
    Util.execute_cmd(mv_to_staging, self._logger)

    # load to avro
    load_to_avro_cmd = (
        "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} "
        "-hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' "
        "-f pipelines/dns/load_dns_avro_parquet.hql").format(
            self._db_name, binary_year, binary_month, binary_day, binary_hour,
            hdfs_staging_path)
    self._logger.info("Loading data to hive: {0}".format(load_to_avro_cmd))
    Util.execute_cmd(load_to_avro_cmd, self._logger)

    # remove from hdfs staging
    rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(hdfs_staging_path)
    self._logger.info("Removing staging path: {0}".format(rm_hdfs_staging_cmd))
    Util.execute_cmd(rm_hdfs_staging_cmd, self._logger)

    # remove from local staging.
    rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
    self._logger.info("Removing files from local staging: {0}".format(rm_local_staging))
    Util.execute_cmd(rm_local_staging, self._logger)

    self._logger.info("File {0} was successfully processed.".format(file_name))
def _process_new_file(self, file):

    # get file from hdfs
    get_file_cmd = "hadoop fs -get {0} {1}.".format(file, self._local_staging)
    self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
    Util.execute_cmd(get_file_cmd, self._logger)

    # get file name and date
    file_name_parts = file.split('/')
    file_name = file_name_parts[len(file_name_parts) - 1]

    flow_date = file_name.split('.')[1]
    flow_year = flow_date[0:4]
    flow_month = flow_date[4:6]
    flow_day = flow_date[6:8]
    flow_hour = flow_date[8:10]

    # build process cmd.
    process_cmd = "nfdump -o csv -r {0}{1} {2} > {0}{1}.csv".format(
        self._local_staging, file_name, self._process_opt)
    self._logger.info("Processing file: {0}".format(process_cmd))
    Util.execute_cmd(process_cmd, self._logger)

    # create hdfs staging.
    hdfs_path = "{0}/flow".format(self._hdfs_app_path)
    staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path, staging_timestamp)
    create_staging_cmd = "hadoop fs -mkdir -p {0}".format(hdfs_staging_path)
    self._logger.info("Creating staging: {0}".format(create_staging_cmd))
    Util.execute_cmd(create_staging_cmd, self._logger)

    # move to stage.
    mv_to_staging = "hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(
        self._local_staging, file_name, hdfs_staging_path)
    self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
    subprocess.call(mv_to_staging, shell=True)

    # load to avro
    load_to_avro_cmd = (
        "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} "
        "-hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' "
        "-f pipelines/flow/load_flow_avro_parquet.hql").format(
            self._db_name, flow_year, flow_month, flow_day, flow_hour,
            hdfs_staging_path)
    self._logger.info("Loading data to hive: {0}".format(load_to_avro_cmd))
    Util.execute_cmd(load_to_avro_cmd, self._logger)

    # remove from hdfs staging
    rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(hdfs_staging_path)
    self._logger.info("Removing staging path: {0}".format(rm_hdfs_staging_cmd))
    Util.execute_cmd(rm_hdfs_staging_cmd, self._logger)

    # remove from local staging.
    rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
    self._logger.info("Removing files from local staging: {0}".format(rm_local_staging))
    Util.execute_cmd(rm_local_staging, self._logger)

    self._logger.info("File {0} was successfully processed.".format(file_name))
def _process_new_file(self, nf):

    # get file from hdfs
    self._logger.info("Getting file from hdfs: {0}".format(nf))
    if hdfs.file_exists(nf):
        hdfs.download_file(nf, self._local_staging)
    else:
        self._logger.info("file: {0} not found".format(nf))
        # TODO: error handling

    # get file name and date
    file_name_parts = nf.split('/')
    file_name = file_name_parts[len(file_name_parts) - 1]

    binary_hour = file_name_parts[len(file_name_parts) - 2]
    binary_date_path = file_name_parts[len(file_name_parts) - 3]
    binary_year = binary_date_path[0:4]
    binary_month = binary_date_path[4:6]
    binary_day = binary_date_path[6:8]

    # build process cmd.
    process_cmd = "tshark -r {0}{1} {2} > {0}{1}.csv".format(
        self._local_staging, file_name, self._process_opt)
    self._logger.info("Processing file: {0}".format(process_cmd))
    Util.execute_cmd(process_cmd, self._logger)

    # create hdfs staging.
    hdfs_path = "{0}/dns".format(self._hdfs_app_path)
    staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path, staging_timestamp)
    self._logger.info("Creating staging: {0}".format(hdfs_staging_path))
    hdfs.mkdir(hdfs_staging_path)

    # move to stage.
    local_file = "{0}{1}.csv".format(self._local_staging, file_name)
    self._logger.info("Moving data to staging: {0}".format(hdfs_staging_path))
    hdfs.upload_file(hdfs_staging_path, local_file)

    # load to avro
    drop_table = 'DROP TABLE IF EXISTS {0}.dns_tmp'.format(self._db_name)
    self._cursor.execute(drop_table)

    # Create external table
    create_external = (
        "\n"
        "CREATE EXTERNAL TABLE {0}.dns_tmp (\n"
        "  frame_day STRING,\n"
        "  frame_time STRING,\n"
        "  unix_tstamp BIGINT,\n"
        "  frame_len INT,\n"
        "  ip_src STRING,\n"
        "  ip_dst STRING,\n"
        "  dns_qry_name STRING,\n"
        "  dns_qry_type INT,\n"
        "  dns_qry_class STRING,\n"
        "  dns_qry_rcode INT,\n"
        "  dns_a STRING\n"
        "  )\n"
        "  ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n"
        "  STORED AS TEXTFILE\n"
        "  LOCATION '{1}'\n"
        "  TBLPROPERTIES ('avro.schema.literal'='{{\n"
        "  \"type\": \"record\"\n"
        "  , \"name\": \"RawDnsRecord\"\n"
        "  , \"namespace\" : \"com.cloudera.accelerators.dns.avro\"\n"
        "  , \"fields\": [\n"
        "      {{\"name\": \"frame_day\", \"type\":[\"string\", \"null\"]}}\n"
        "      , {{\"name\": \"frame_time\", \"type\":[\"string\", \"null\"]}}\n"
        "      , {{\"name\": \"unix_tstamp\", \"type\":[\"bigint\", \"null\"]}}\n"
        "      , {{\"name\": \"frame_len\", \"type\":[\"int\", \"null\"]}}\n"
        "      , {{\"name\": \"ip_src\", \"type\":[\"string\", \"null\"]}}\n"
        "      , {{\"name\": \"ip_dst\", \"type\":[\"string\", \"null\"]}}\n"
        "      , {{\"name\": \"dns_qry_name\", \"type\":[\"string\", \"null\"]}}\n"
        "      , {{\"name\": \"dns_qry_type\", \"type\":[\"int\", \"null\"]}}\n"
        "      , {{\"name\": \"dns_qry_class\", \"type\":[\"string\", \"null\"]}}\n"
        "      , {{\"name\": \"dns_qry_rcode\", \"type\":[\"int\", \"null\"]}}\n"
        "      , {{\"name\": \"dns_a\", \"type\":[\"string\", \"null\"]}}\n"
        "      ]\n"
        "}}')\n").format(self._db_name, hdfs_staging_path)
    self._logger.info("Creating external table: {0}".format(create_external))
    self._cursor.execute(create_external)

    # Insert data
    insert_into_table = """
        INSERT INTO TABLE {0}.dns
        PARTITION (y={1}, m={2}, d={3}, h={4})
        SELECT CONCAT(frame_day, frame_time) as treceived, unix_tstamp, frame_len,
               ip_dst, ip_src, dns_qry_name, dns_qry_class, dns_qry_type,
               dns_qry_rcode, dns_a
        FROM {0}.dns_tmp
    """.format(self._db_name, binary_year, binary_month, binary_day, binary_hour)
    self._logger.info("Loading data to {0}: {1}".format(self._db_name, insert_into_table))
    self._cursor.execute(insert_into_table)

    # remove from hdfs staging
    self._logger.info("Removing staging path: {0}".format(hdfs_staging_path))
    hdfs.delete_folder(hdfs_staging_path)

    # remove from local staging.
    rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
    self._logger.info("Removing files from local staging: {0}".format(rm_local_staging))
    Util.execute_cmd(rm_local_staging, self._logger)

    self._logger.info("File {0} was successfully processed.".format(file_name))
def _process_new_file(self, nf):

    # get file name and date
    file_name_parts = nf.split('/')
    file_name = file_name_parts[len(file_name_parts) - 1]
    nf_path = nf.rstrip(file_name)

    flow_date = file_name.split('.')[1]
    flow_year = flow_date[0:4]
    flow_month = flow_date[4:6]
    flow_day = flow_date[6:8]
    flow_hour = flow_date[8:10]

    # get file from hdfs
    if hdfs.file_exists(nf_path, file_name):
        self._logger.info("Getting file from hdfs: {0}".format(nf))
        hdfs.download_file(nf, self._local_staging)
    else:
        self._logger.info("file: {0} not found".format(nf))
        # TODO: error handling

    # build process cmd.
    sf = "{0}{1}.csv".format(self._local_staging, file_name)
    process_cmd = "nfdump -o csv -r {0}{1} {2} > {3}".format(
        self._local_staging, file_name, self._process_opt, sf)
    self._logger.info("Processing file: {0}".format(process_cmd))
    Util.execute_cmd(process_cmd, self._logger)

    # create hdfs staging.
    hdfs_path = "{0}/flow".format(self._hdfs_app_path)
    staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path, staging_timestamp)
    self._logger.info("Creating staging: {0}".format(hdfs_staging_path))
    hdfs.mkdir(hdfs_staging_path)

    # move to stage.
    local_file = "{0}{1}.csv".format(self._local_staging, file_name)
    self._logger.info("Moving data to staging: {0}".format(hdfs_staging_path))
    hdfs.upload_file(hdfs_staging_path, local_file)

    # load with impyla
    drop_table = "DROP TABLE IF EXISTS {0}.flow_tmp".format(self._db_name)
    self._logger.info("Dropping temp table: {0}".format(drop_table))
    self._cursor.execute_query(drop_table)

    create_external = (
        "\n"
        "CREATE EXTERNAL TABLE {0}.flow_tmp (\n"
        "  treceived STRING,\n"
        "  tryear INT,\n"
        "  trmonth INT,\n"
        "  trday INT,\n"
        "  trhour INT,\n"
        "  trminute INT,\n"
        "  trsec INT,\n"
        "  tdur FLOAT,\n"
        "  sip STRING,\n"
        "  dip STRING,\n"
        "  sport INT,\n"
        "  dport INT,\n"
        "  proto STRING,\n"
        "  flag STRING,\n"
        "  fwd INT,\n"
        "  stos INT,\n"
        "  ipkt BIGINT,\n"
        "  ibyt BIGINT,\n"
        "  opkt BIGINT,\n"
        "  obyt BIGINT,\n"
        "  input INT,\n"
        "  output INT,\n"
        "  sas INT,\n"
        "  das INT,\n"
        "  dtos INT,\n"
        "  dir INT,\n"
        "  rip STRING\n"
        "  )\n"
        "  ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n"
        "  STORED AS TEXTFILE\n"
        "  LOCATION '{1}'\n"
        "  TBLPROPERTIES ('avro.schema.literal'='{{\n"
        "  \"type\": \"record\"\n"
        "  , \"name\": \"RawFlowRecord\"\n"
        "  , \"namespace\" : \"com.cloudera.accelerators.flows.avro\"\n"
        "  , \"fields\": [\n"
        "      {{\"name\": \"treceived\", \"type\":[\"string\", \"null\"]}}\n"
        "      , {{\"name\": \"tryear\", \"type\":[\"float\", \"null\"]}}\n"
        "      , {{\"name\": \"trmonth\", \"type\":[\"float\", \"null\"]}}\n"
        "      , {{\"name\": \"trday\", \"type\":[\"float\", \"null\"]}}\n"
        "      , {{\"name\": \"trhour\", \"type\":[\"float\", \"null\"]}}\n"
        "      , {{\"name\": \"trminute\", \"type\":[\"float\", \"null\"]}}\n"
        "      , {{\"name\": \"trsec\", \"type\":[\"float\", \"null\"]}}\n"
        "      , {{\"name\": \"tdur\", \"type\":[\"float\", \"null\"]}}\n"
        "      , {{\"name\": \"sip\", \"type\":[\"string\", \"null\"]}}\n"
        "      , {{\"name\": \"sport\", \"type\":[\"int\", \"null\"]}}\n"
        "      , {{\"name\": \"dip\", \"type\":[\"string\", \"null\"]}}\n"
        "      , {{\"name\": \"dport\", \"type\":[\"int\", \"null\"]}}\n"
        "      , {{\"name\": \"proto\", \"type\":[\"string\", \"null\"]}}\n"
        "      , {{\"name\": \"flag\", \"type\":[\"string\", \"null\"]}}\n"
        "      , {{\"name\": \"fwd\", \"type\":[\"int\", \"null\"]}}\n"
        "      , {{\"name\": \"stos\", \"type\":[\"int\", \"null\"]}}\n"
        "      , {{\"name\": \"ipkt\", \"type\":[\"bigint\", \"null\"]}}\n"
        "      , {{\"name\": \"ibyt\", \"type\":[\"bigint\", \"null\"]}}\n"
        "      , {{\"name\": \"opkt\", \"type\":[\"bigint\", \"null\"]}}\n"
        "      , {{\"name\": \"obyt\", \"type\":[\"bigint\", \"null\"]}}\n"
        "      , {{\"name\": \"input\", \"type\":[\"int\", \"null\"]}}\n"
        "      , {{\"name\": \"output\", \"type\":[\"int\", \"null\"]}}\n"
        "      , {{\"name\": \"sas\", \"type\":[\"int\", \"null\"]}}\n"
        "      , {{\"name\": \"das\", \"type\":[\"int\", \"null\"]}}\n"
        "      , {{\"name\": \"dtos\", \"type\":[\"int\", \"null\"]}}\n"
        "      , {{\"name\": \"dir\", \"type\":[\"int\", \"null\"]}}\n"
        "      , {{\"name\": \"rip\", \"type\":[\"string\", \"null\"]}}\n"
        "      ]\n"
        "}}')\n").format(self._db_name, hdfs_staging_path)
    self._logger.info("Creating external table: {0}".format(create_external))
    self._cursor.execute_query(create_external)

    insert_into_table = """
        INSERT INTO TABLE {0}.flow
        PARTITION (y={1}, m={2}, d={3}, h={4})
        SELECT treceived, unix_timestamp(treceived) AS unix_tstamp, tryear,
               trmonth, trday, trhour, trminute, trsec, tdur, sip, dip, sport,
               dport, proto, flag, fwd, stos, ipkt, ibyt, opkt, obyt, input,
               output, sas, das, dtos, dir, rip
        FROM {0}.flow_tmp
    """.format(self._db_name, flow_year, flow_month, flow_day, flow_hour)
    self._logger.info("Loading data to {0}: {1}".format(self._db_name, insert_into_table))
    self._cursor.execute_query(insert_into_table)

    # remove from hdfs staging
    self._logger.info("Removing staging path: {0}".format(hdfs_staging_path))
    hdfs.delete_folder(hdfs_staging_path)

    # remove from local staging.
    rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
    self._logger.info("Removing files from local staging: {0}".format(rm_local_staging))
    Util.execute_cmd(rm_local_staging, self._logger)

    rm_local_staging = "rm {0}".format(sf)
    self._logger.info("Removing files from local staging: {0}".format(rm_local_staging))
    Util.execute_cmd(rm_local_staging, self._logger)

    self._logger.info("File {0} was successfully processed.".format(file_name))