Example #1
# NOTE: these examples appear to be excerpts from Apache Spot's spot-ingest
# collectors; FileWatcher, Util, and the module-level ingest_file worker are
# defined in the surrounding package and are assumed to be importable here.
import json
import logging
import os
import time

from multiprocessing import Pool


class Collector(object):
    def __init__(self, hdfs_app_path, kafka_topic, conf_type):

        self._initialize_members(hdfs_app_path, kafka_topic, conf_type)

    def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):

        # getting parameters.
        self._logger = logging.getLogger('SPOT.INGEST.PROXY')
        self._hdfs_app_path = hdfs_app_path
        self._kafka_topic = kafka_topic

        # get script path
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read proxy configuration.
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        with open(conf_file) as conf_fp:
            conf = json.load(conf_fp)
        self._message_size = conf["kafka"]["message_size"]
        self._conf = conf["pipelines"][conf_type]

        # get collector path.
        self._collector_path = self._conf['collector_path']

        # get supported files
        self._supported_files = self._conf['supported_files']

        # create collector watcher
        self._watcher = FileWatcher(self._collector_path,
                                    self._supported_files)

        # Multiprocessing.
        self._processes = conf["collector_processes"]
        self._ingestion_interval = conf["ingestion_interval"]
        self._pool = Pool(processes=self._processes)

    def start(self):

        self._logger.info("Starting PROXY collector")
        self._watcher.start()

        try:
            while True:
                self._ingest_files_pool()
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping Proxy collector...")
            Util.remove_kafka_topic(self._kafka_topic.Zookeeper,
                                    self._kafka_topic.Topic, self._logger)
            self._watcher.stop()
            self._pool.terminate()
            self._pool.close()
            self._pool.join()

    def _ingest_files_pool(self):

        if self._watcher.HasFiles:

            for _ in range(self._processes):
                new_file = self._watcher.GetNextFile()
                result = self._pool.apply_async(
                    ingest_file,
                    args=(new_file, self._message_size, self._kafka_topic.Topic,
                          self._kafka_topic.BootstrapServers))
                # result.get()  # for debugging, wrap in a try/except to surface worker errors.
                if not self._watcher.HasFiles:
                    break
        return True
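
A minimal usage sketch for the proxy collector above. The kafka_topic argument must expose Topic, BootstrapServers, and Zookeeper attributes (inferred from the attribute accesses in the code); KafkaTopic and the values below are hypothetical placeholders, not part of the original module.

# hypothetical driver code
topic = KafkaTopic('spot-proxy', 'kafka:9092', 'zk:2181')  # assumed helper
collector = Collector('/user/spot', topic, 'proxy')
collector.start()  # loops until Ctrl-C raises KeyboardInterrupt
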
Example #2
# imports as in Example #1 (json, logging, os, time, multiprocessing.Pool)
class Collector(object):

    def __init__(self, hdfs_app_path, kafka_topic, conf_type):
        self._initialize_members(hdfs_app_path, kafka_topic, conf_type)

    def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):
        # getting parameters.
        self._logger = logging.getLogger('SPOT.INGEST.DNS')
        self._hdfs_app_path = hdfs_app_path
        self._kafka_topic = kafka_topic

        # get script path
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read dns configuration.
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        with open(conf_file) as conf_fp:
            conf = json.load(conf_fp)
        self._conf = conf["pipelines"][conf_type]

        # set configuration.
        self._collector_path = self._conf['collector_path']
        self._dsource = 'dns'
        self._hdfs_root_path = "{0}/{1}".format(hdfs_app_path, self._dsource)

        # set configuration.
        self._pkt_num = self._conf['pkt_num']
        self._pcap_split_staging = self._conf['pcap_split_staging']
        self._supported_files = self._conf['supported_files']

        # create collector watcher
        self._watcher = FileWatcher(self._collector_path, self._supported_files)

        # Multiprocessing.
        self._processes = conf["collector_processes"]
        self._ingestion_interval = conf["ingestion_interval"]
        self._pool = Pool(processes=self._processes)

    def start(self):

        self._logger.info("Starting DNS ingest")
        self._watcher.start()

        try:
            while True:
                self._ingest_files_pool()
                time.sleep(self._ingestion_interval)

        except KeyboardInterrupt:
            self._logger.info("Stopping DNS collector...")
            Util.remove_kafka_topic(self._kafka_topic.Zookeeper,
                                    self._kafka_topic.Topic, self._logger)
            self._watcher.stop()
            self._pool.terminate()
            self._pool.close()
            self._pool.join()
            raise SystemExit("Ingest finished...")

    def _ingest_files_pool(self):
        if self._watcher.HasFiles:
            for _ in range(self._processes):
                new_file = self._watcher.GetNextFile()
                result = self._pool.apply_async(
                    ingest_file,
                    args=(new_file, self._pkt_num, self._pcap_split_staging,
                          self._kafka_topic.Partition, self._hdfs_root_path,
                          self._kafka_topic.Topic,
                          self._kafka_topic.BootstrapServers))
                # result.get()  # for debugging, wrap in a try/except to surface worker errors.
                if not self._watcher.HasFiles:
                    break
        return True
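
The ingest_file worker handed to apply_async is not shown on this page. From the call site above, the DNS variant must accept the file path, packet count, pcap split staging directory, Kafka partition, HDFS root path, topic name, and bootstrap servers, in that order. A hypothetical stub matching that call (the body is a placeholder, not Spot's actual implementation):

def ingest_file(new_file, pkt_num, pcap_split_staging, partition,
                hdfs_root_path, topic, bootstrap_servers):
    # placeholder body; the real worker presumably splits the pcap,
    # stages it in HDFS, and publishes its location to the Kafka topic.
    pass
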
Example #3
# imports as in Example #1, plus this package's hdfs helper for get_client()
class Collector(object):
    def __init__(self, hdfs_app_path, kafkaproducer, conf_type):

        self._initialize_members(hdfs_app_path, kafkaproducer, conf_type)

    def _initialize_members(self, hdfs_app_path, kafkaproducer, conf_type):

        # getting parameters.
        self._logger = logging.getLogger('SPOT.INGEST.DNS')
        self._hdfs_app_path = hdfs_app_path
        self._producer = kafkaproducer

        # get script path
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read dns configuration.
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        with open(conf_file) as conf_fp:
            conf = json.load(conf_fp)
        self._conf = conf["pipelines"][conf_type]

        # set configuration.
        self._collector_path = self._conf['collector_path']
        self._dsource = 'dns'
        self._hdfs_root_path = "{0}/{1}".format(hdfs_app_path, self._dsource)

        # set configuration.
        self._pkt_num = self._conf['pkt_num']
        self._pcap_split_staging = self._conf['pcap_split_staging']
        self._supported_files = self._conf['supported_files']

        # create collector watcher
        self._watcher = FileWatcher(self._collector_path,
                                    self._supported_files)

        # Multiprocessing.
        self._processes = conf["collector_processes"]
        self._ingestion_interval = conf["ingestion_interval"]
        self._pool = Pool(processes=self._processes)
        # TODO: review re-use of hdfs.client
        self._hdfs_client = hdfs.get_client()

    def start(self):

        self._logger.info("Starting DNS ingest")
        self._watcher.start()

        try:
            while True:
                self._ingest_files_pool()
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping DNS collector...")
            Util.remove_kafka_topic(self._producer.Zookeeper,
                                    self._producer.Topic, self._logger)
            self._watcher.stop()
            self._pool.terminate()
            self._pool.close()
            self._pool.join()
            raise SystemExit("Ingest finished...")

    def _ingest_files_pool(self):

        if self._watcher.HasFiles:

            for _ in range(self._processes):
                self._logger.info('processes: {0}'.format(self._processes))
                new_file = self._watcher.GetNextFile()
                if self._processes <= 1:
                    # single-process mode: ingest synchronously in this process.
                    _ingest_file(self._hdfs_client, new_file, self._pkt_num,
                                 self._pcap_split_staging,
                                 self._hdfs_root_path, self._producer,
                                 self._producer.Topic)
                else:
                    result = self._pool.apply_async(
                        _ingest_file,
                        args=(self._hdfs_client, new_file, self._pkt_num,
                              self._pcap_split_staging, self._hdfs_root_path,
                              self._producer, self._producer.Topic))
                    # result.get()  # for debugging, wrap in a try/except.
                if not self._watcher.HasFiles:
                    break
        return True
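
All four examples drive the same FileWatcher helper, whose definition is also not shown. The call sites imply a start()/stop() lifecycle, a HasFiles flag, and a GetNextFile() accessor. A hypothetical interface stub inferred from that usage (not Spot's actual implementation):

class FileWatcher(object):
    def __init__(self, collector_path, supported_files): ...
    def start(self): ...          # begin watching collector_path for new files
    def stop(self): ...           # stop watching
    @property
    def HasFiles(self): ...       # True while queued files remain
    def GetNextFile(self): ...    # pop the next queued file path
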
Example #4
# imports as in Example #1 (json, logging, os, time, multiprocessing.Pool)
class Collector(object):

    def __init__(self, hdfs_app_path, kafka_topic, conf_type):

        self._initialize_members(hdfs_app_path, kafka_topic, conf_type)

    def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):

        # getting parameters.
        self._logger = logging.getLogger('SPOT.INGEST.PROXY')
        self._hdfs_app_path = hdfs_app_path
        self._kafka_topic = kafka_topic

        # get script path
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read proxy configuration.
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        with open(conf_file) as conf_fp:
            conf = json.load(conf_fp)
        self._message_size = conf["kafka"]["message_size"]
        self._conf = conf["pipelines"][conf_type]

        # get collector path.
        self._collector_path = self._conf['collector_path']

        # get supported files
        self._supported_files = self._conf['supported_files']

        # create collector watcher
        self._watcher = FileWatcher(self._collector_path,
                                    self._supported_files)

        # Multiprocessing. 
        self._processes = conf["collector_processes"]
        self._ingestion_interval = conf["ingestion_interval"]
        self._pool = Pool(processes=self._processes)

    def start(self):

        self._logger.info("Starting PROXY collector")
        self._watcher.start()

        try:
            while True:
                self._ingest_files_pool()
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping Proxy collector...")
            Util.remove_kafka_topic(self._kafka_topic.Zookeeper,
                                    self._kafka_topic.Topic, self._logger)
            self._watcher.stop()
            self._pool.terminate()
            self._pool.close()
            self._pool.join()

    def _ingest_files_pool(self):

        if self._watcher.HasFiles:

            for _ in range(self._processes):
                new_file = self._watcher.GetNextFile()
                result = self._pool.apply_async(
                    ingest_file,
                    args=(new_file, self._message_size, self._kafka_topic.Topic,
                          self._kafka_topic.BootstrapServers))
                # result.get()  # for debugging, wrap in a try/except to surface worker errors.
                if not self._watcher.HasFiles:
                    break
        return True
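
On shutdown, all four collectors call terminate() before close() and join(). With multiprocessing.Pool, terminate() stops the workers immediately and join() reaps them, so the close() in between is effectively a no-op. A gentler alternative that lets in-flight ingest_file tasks finish (a sketch, not what the original code does):

# graceful-shutdown variant for the except KeyboardInterrupt block
pool.close()   # stop accepting new tasks
pool.join()    # block until already-submitted ingest_file calls finish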