class Collector(object):
    """Proxy collector that feeds files from a FileWatcher to a worker pool.

    On construction it reads ``ingest_conf.json``, builds a ``FileWatcher``
    from the pipeline's ``collector_path`` and ``supported_files`` settings,
    and creates a ``Pool`` sized by ``collector_processes``. ``start()`` then
    polls the watcher every ``ingestion_interval`` seconds until interrupted.
    """

    def __init__(self, hdfs_app_path, kafka_topic, conf_type):
        """Create the collector.

        :param hdfs_app_path: stored on the instance as ``_hdfs_app_path``.
        :param kafka_topic: object whose ``Topic``, ``BootstrapServers`` and
            ``Zookeeper`` attributes are read by this class.
        :param conf_type: key selecting an entry under ``"pipelines"`` in
            ``ingest_conf.json``.
        """
        self._initialize_members(hdfs_app_path, kafka_topic, conf_type)

    def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):
        """Load the configuration and build the watcher and the worker pool."""
        # getting parameters.
        self._logger = logging.getLogger('SPOT.INGEST.PROXY')
        self._hdfs_app_path = hdfs_app_path
        self._kafka_topic = kafka_topic

        # get script path
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read proxy configuration: the file lives two directory levels
        # above the directory containing this script.
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(conf_file) as conf_handle:
            conf = json.load(conf_handle)

        self._message_size = conf["kafka"]["message_size"]
        self._conf = conf["pipelines"][conf_type]

        # get collector path.
        self._collector_path = self._conf['collector_path']

        # get supported files
        self._supported_files = self._conf['supported_files']

        # create collector watcher
        self._watcher = FileWatcher(self._collector_path, self._supported_files)

        # Multiprocessing.
        self._processes = conf["collector_processes"]
        self._ingestion_interval = conf["ingestion_interval"]
        self._pool = Pool(processes=self._processes)

    def start(self):
        """Run the ingest loop until a KeyboardInterrupt is received.

        On interrupt the Kafka topic is removed through
        ``Util.remove_kafka_topic``, the watcher is stopped and the pool is
        terminated and joined. Any other exception propagates to the caller
        without running that cleanup.
        """
        self._logger.info("Starting PROXY collector")
        self._watcher.start()

        try:
            while True:
                self._ingest_files_pool()
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping Proxy collector...")
            Util.remove_kafka_topic(
                self._kafka_topic.Zookeeper,
                self._kafka_topic.Topic,
                self._logger)
            self._watcher.stop()
            self._pool.terminate()
            self._pool.close()
            self._pool.join()

    def _ingest_files_pool(self):
        """Submit at most ``self._processes`` pending files to the pool.

        Does nothing when the watcher reports no files. Submission stops
        early as soon as the watcher runs out of files.

        :returns: always ``True``.
        """
        if self._watcher.HasFiles:
            for _ in range(self._processes):
                new_file = self._watcher.GetNextFile()
                # The AsyncResult is intentionally discarded (fire and
                # forget). To debug a failing worker, keep the result and
                # call .get() on it inside a try/except.
                self._pool.apply_async(
                    ingest_file,
                    args=(new_file,
                          self._message_size,
                          self._kafka_topic.Topic,
                          self._kafka_topic.BootstrapServers))
                if not self._watcher.HasFiles:
                    break
        return True
class Collector(object):
    """DNS collector that feeds files from a FileWatcher to a worker pool.

    On construction it reads ``ingest_conf.json``, builds a ``FileWatcher``
    from the pipeline's ``collector_path`` and ``supported_files`` settings,
    and creates a ``Pool`` sized by ``collector_processes``. ``start()`` then
    polls the watcher every ``ingestion_interval`` seconds until interrupted.
    """

    def __init__(self, hdfs_app_path, kafka_topic, conf_type):
        """Create the collector.

        :param hdfs_app_path: base path; ``"<hdfs_app_path>/dns"`` becomes
            ``_hdfs_root_path``.
        :param kafka_topic: object whose ``Topic``, ``Partition``,
            ``BootstrapServers`` and ``Zookeeper`` attributes are read by
            this class.
        :param conf_type: key selecting an entry under ``"pipelines"`` in
            ``ingest_conf.json``.
        """
        self._initialize_members(hdfs_app_path, kafka_topic, conf_type)

    def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):
        """Load the configuration and build the watcher and the worker pool."""
        # getting parameters.
        self._logger = logging.getLogger('SPOT.INGEST.DNS')
        self._hdfs_app_path = hdfs_app_path
        self._kafka_topic = kafka_topic

        # get script path
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read dns configuration: the file lives two directory levels above
        # the directory containing this script.
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(conf_file) as conf_handle:
            conf = json.load(conf_handle)
        self._conf = conf["pipelines"][conf_type]

        # set configuration.
        self._collector_path = self._conf['collector_path']
        self._dsource = 'dns'
        self._hdfs_root_path = "{0}/{1}".format(hdfs_app_path, self._dsource)

        # set configuration.
        self._pkt_num = self._conf['pkt_num']
        self._pcap_split_staging = self._conf['pcap_split_staging']
        self._supported_files = self._conf['supported_files']

        # create collector watcher
        self._watcher = FileWatcher(self._collector_path, self._supported_files)

        # Multiprocessing.
        self._processes = conf["collector_processes"]
        self._ingestion_interval = conf["ingestion_interval"]
        self._pool = Pool(processes=self._processes)

    def start(self):
        """Run the ingest loop until a KeyboardInterrupt is received.

        On interrupt the Kafka topic is removed through
        ``Util.remove_kafka_topic``, the watcher is stopped and the pool is
        terminated and joined. Any other exception propagates to the caller
        without running that cleanup.
        """
        self._logger.info("Starting DNS ingest")
        self._watcher.start()

        try:
            while True:
                self._ingest_files_pool()
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping DNS collector...")
            Util.remove_kafka_topic(
                self._kafka_topic.Zookeeper,
                self._kafka_topic.Topic,
                self._logger)
            self._watcher.stop()
            self._pool.terminate()
            self._pool.close()
            self._pool.join()
            # NOTE(review): this builds a SystemExit instance but never
            # raises it, so it has no effect. Left untouched because adding
            # `raise` would change the process exit status -- confirm intent.
            SystemExit("Ingest finished...")

    def _ingest_files_pool(self):
        """Submit at most ``self._processes`` pending files to the pool.

        Does nothing when the watcher reports no files. Submission stops
        early as soon as the watcher runs out of files.

        :returns: always ``True``.
        """
        if self._watcher.HasFiles:
            for _ in range(self._processes):
                new_file = self._watcher.GetNextFile()
                # The AsyncResult is intentionally discarded (fire and
                # forget). To debug a failing worker, keep the result and
                # call .get() on it inside a try/except.
                self._pool.apply_async(
                    ingest_file,
                    args=(new_file,
                          self._pkt_num,
                          self._pcap_split_staging,
                          self._kafka_topic.Partition,
                          self._hdfs_root_path,
                          self._kafka_topic.Topic,
                          self._kafka_topic.BootstrapServers,))
                if not self._watcher.HasFiles:
                    break
        return True
class Collector(object):
    """DNS collector that ingests watched files inline or via a worker pool.

    On construction it reads ``ingest_conf.json``, builds a ``FileWatcher``
    from the pipeline's ``collector_path`` and ``supported_files`` settings,
    creates a ``Pool`` sized by ``collector_processes`` and obtains an HDFS
    client through ``hdfs.get_client()``. ``start()`` then polls the watcher
    every ``ingestion_interval`` seconds until interrupted.
    """

    def __init__(self, hdfs_app_path, kafkaproducer, conf_type):
        """Create the collector.

        :param hdfs_app_path: base path; ``"<hdfs_app_path>/dns"`` becomes
            ``_hdfs_root_path``.
        :param kafkaproducer: object handed to ``_ingest_file``; its
            ``Topic`` and ``Zookeeper`` attributes are read by this class.
        :param conf_type: key selecting an entry under ``"pipelines"`` in
            ``ingest_conf.json``.
        """
        self._initialize_members(hdfs_app_path, kafkaproducer, conf_type)

    def _initialize_members(self, hdfs_app_path, kafkaproducer, conf_type):
        """Load the configuration and build watcher, pool and HDFS client."""
        # getting parameters.
        self._logger = logging.getLogger('SPOT.INGEST.DNS')
        self._hdfs_app_path = hdfs_app_path
        self._producer = kafkaproducer

        # get script path
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read dns configuration: the file lives two directory levels above
        # the directory containing this script.
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(conf_file) as conf_handle:
            conf = json.load(conf_handle)
        self._conf = conf["pipelines"][conf_type]

        # set configuration.
        self._collector_path = self._conf['collector_path']
        self._dsource = 'dns'
        self._hdfs_root_path = "{0}/{1}".format(hdfs_app_path, self._dsource)

        # set configuration.
        self._pkt_num = self._conf['pkt_num']
        self._pcap_split_staging = self._conf['pcap_split_staging']
        self._supported_files = self._conf['supported_files']

        # create collector watcher
        self._watcher = FileWatcher(self._collector_path, self._supported_files)

        # Multiprocessing.
        self._processes = conf["collector_processes"]
        self._ingestion_interval = conf["ingestion_interval"]
        self._pool = Pool(processes=self._processes)
        # TODO: review re-use of hdfs.client
        self._hdfs_client = hdfs.get_client()

    def start(self):
        """Run the ingest loop until a KeyboardInterrupt is received.

        On interrupt the Kafka topic is removed through
        ``Util.remove_kafka_topic``, the watcher is stopped and the pool is
        terminated and joined. Any other exception propagates to the caller
        without running that cleanup.
        """
        self._logger.info("Starting DNS ingest")
        self._watcher.start()

        try:
            while True:
                self._ingest_files_pool()
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping DNS collector...")
            Util.remove_kafka_topic(
                self._producer.Zookeeper,
                self._producer.Topic,
                self._logger)
            self._watcher.stop()
            self._pool.terminate()
            self._pool.close()
            self._pool.join()
            # NOTE(review): this builds a SystemExit instance but never
            # raises it, so it has no effect. Left untouched because adding
            # `raise` would change the process exit status -- confirm intent.
            SystemExit("Ingest finished...")

    def _ingest_files_pool(self):
        """Ingest at most ``self._processes`` pending files.

        With ``self._processes <= 1`` each file is ingested synchronously in
        this process; otherwise it is submitted to the pool. Does nothing
        when the watcher reports no files, and stops early as soon as the
        watcher runs out of files.

        :returns: always ``True``.
        """
        if self._watcher.HasFiles:
            for _ in range(self._processes):
                self._logger.info('processes: {0}'.format(self._processes))
                new_file = self._watcher.GetNextFile()
                ingest_args = (self._hdfs_client,
                               new_file,
                               self._pkt_num,
                               self._pcap_split_staging,
                               self._hdfs_root_path,
                               self._producer,
                               self._producer.Topic)
                if self._processes <= 1:
                    _ingest_file(*ingest_args)
                else:
                    # NOTE(review): apply_async pickles its arguments to send
                    # them to a worker; confirm the HDFS client and producer
                    # objects are picklable.
                    # The AsyncResult is intentionally discarded. To debug a
                    # failing worker, keep it and call .get() in a try/except.
                    self._pool.apply_async(_ingest_file, args=ingest_args)
                if not self._watcher.HasFiles:
                    break
        return True
class Collector(object):
    """Proxy collector that feeds files from a FileWatcher to a worker pool.

    Construction reads ``ingest_conf.json``, creates a ``FileWatcher`` from
    the selected pipeline's ``collector_path`` and ``supported_files``, and
    creates a ``Pool`` with ``collector_processes`` workers. ``start()``
    polls the watcher every ``ingestion_interval`` seconds until interrupted.
    """

    def __init__(self, hdfs_app_path, kafka_topic, conf_type):
        """Create the collector.

        :param hdfs_app_path: stored on the instance as ``_hdfs_app_path``.
        :param kafka_topic: object whose ``Topic``, ``BootstrapServers`` and
            ``Zookeeper`` attributes are read by this class.
        :param conf_type: key selecting an entry under ``"pipelines"`` in
            ``ingest_conf.json``.
        """
        self._initialize_members(hdfs_app_path, kafka_topic, conf_type)

    def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):
        """Load the configuration and build the watcher and the worker pool."""
        # getting parameters.
        self._logger = logging.getLogger('SPOT.INGEST.PROXY')
        self._hdfs_app_path = hdfs_app_path
        self._kafka_topic = kafka_topic

        # get script path
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read proxy configuration: the file lives two directory levels
        # above the directory containing this script.
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        # A context manager closes the file handle deterministically; the
        # previous bare open().read() left that to the garbage collector.
        with open(conf_file) as conf_handle:
            conf = json.load(conf_handle)

        self._message_size = conf["kafka"]["message_size"]
        self._conf = conf["pipelines"][conf_type]

        # get collector path.
        self._collector_path = self._conf['collector_path']

        # get supported files
        self._supported_files = self._conf['supported_files']

        # create collector watcher
        self._watcher = FileWatcher(self._collector_path, self._supported_files)

        # Multiprocessing.
        self._processes = conf["collector_processes"]
        self._ingestion_interval = conf["ingestion_interval"]
        self._pool = Pool(processes=self._processes)

    def start(self):
        """Run the ingest loop until a KeyboardInterrupt is received.

        On interrupt the Kafka topic is removed through
        ``Util.remove_kafka_topic``, the watcher is stopped and the pool is
        terminated and joined. Any other exception propagates to the caller
        without running that cleanup.
        """
        self._logger.info("Starting PROXY collector")
        self._watcher.start()

        try:
            while True:
                self._ingest_files_pool()
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping Proxy collector...")
            Util.remove_kafka_topic(
                self._kafka_topic.Zookeeper,
                self._kafka_topic.Topic,
                self._logger)
            self._watcher.stop()
            self._pool.terminate()
            self._pool.close()
            self._pool.join()

    def _ingest_files_pool(self):
        """Submit at most ``self._processes`` pending files to the pool.

        Does nothing when the watcher reports no files. Submission stops
        early as soon as the watcher runs out of files.

        :returns: always ``True``.
        """
        if self._watcher.HasFiles:
            for _ in range(self._processes):
                new_file = self._watcher.GetNextFile()
                # The AsyncResult is intentionally discarded (fire and
                # forget). To debug a failing worker, keep the result and
                # call .get() on it inside a try/except.
                self._pool.apply_async(
                    ingest_file,
                    args=(new_file,
                          self._message_size,
                          self._kafka_topic.Topic,
                          self._kafka_topic.BootstrapServers))
                if not self._watcher.HasFiles:
                    break
        return True