Example #1
    def _ingest_file(self, file, partition):

        # get file name and date.
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]
        file_date = file_name.split('.')[1]

        file_date_path = file_date[0:8]
        file_date_hour = file_date[8:10]

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(self._hdfs_root_path,
                                                file_date_path, file_date_hour)
        Util.creat_hdfs_folder(hdfs_path, self._logger)

        # load to hdfs.
        hdfs_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.load_to_hdfs(file, hdfs_file, self._logger)

        # create event for workers to process the file.
        self._logger.info(
            "Sending file to worker number: {0}".format(partition))
        self.kafka_topic.send_message(hdfs_file, partition)

        self._logger.info(
            "File {0} has been successfully sent to Kafka Topic to: {1}".
            format(file, self._kafka_topic.Topic))
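
A minimal standalone sketch of the path logic above, assuming a collector drop named like "nfcapd.201612011430" (the sample file name and HDFS root are hypothetical):

    # illustrate how the collector derives the HDFS layout from the file name
    file = "/collector/incoming/nfcapd.201612011430"        # hypothetical input
    file_name = file.split('/')[-1]                          # "nfcapd.201612011430"
    file_date = file_name.split('.')[1]                      # "201612011430"
    file_date_path = file_date[0:8]                          # "20161201"
    file_date_hour = file_date[8:10]                         # "14"
    hdfs_path = "{0}/binary/{1}/{2}".format("/user/oni/flow", file_date_path, file_date_hour)
    print hdfs_path                                          # /user/oni/flow/binary/20161201/14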
Example #2
    def _initialize_members(self, conf):

        self._collector_path = None
        self._hdfs_root_path = None
        self._queue_name = None
        self._pkt_num = None
        self._pcap_split_staging = None
        self._time_to_wait = None
        self._dsource = 'dns'

        # validate configuration info.
        conf_err_msg = "Please provide a valid '{0}' in the configuration file"
        Util.validate_parameter(conf['collector_path'], conf_err_msg.format("collector_path"))
        Util.validate_parameter(conf['queue_name'], conf_err_msg.format("queue_name"))
        Util.validate_parameter(conf['pkt_num'], conf_err_msg.format("pkt_num"))
        Util.validate_parameter(conf['pcap_split_staging'], conf_err_msg.format("pcap_split_staging"))
        Util.validate_parameter(conf['time_to_wait'], conf_err_msg.format("time_to_wait"))

        # set configuration.
        self._collector_path = conf['collector_path']
        self._hdfs_root_path = "{0}/{1}".format(conf['huser'], self._dsource)
        self._time_to_wait = conf['time_to_wait']
        self._pkt_num = conf['pkt_num']
        self._pcap_split_staging = conf['pcap_split_staging']
        self._queue_name = conf['queue_name']
Example #3
    def start(self):

        self._logger.info("Creating Spark Job for topic: {0}".format(
            self._kafka_consumer.Topic))

        # parser
        parser = self._conf["parser"]

        # spark job command.
        spark_job_cmd = (
            "spark-submit --master yarn "
            "--jars {0}/oni/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar "
            "{1}/{2} "
            "-zk {3} "
            "-t {4} "
            "-db {5} "
            "-dt {6} "
            "-w {7}".format(
                os.path.dirname(os.path.dirname(self._script_path)),
                self._script_path, parser, self._kafka_consumer.ZookeperServer,
                self._kafka_consumer.Topic, self._db_name, "proxy",
                self._processes))

        # start spark job.
        Util.execute_cmd(spark_job_cmd, self._logger)
Example #4
    def _initialize_members(self, conf):

        self._collector_path = None
        self._hdfs_root_path = None
        self._queue_name = None
        self._pkt_num = None
        self._pcap_split_staging = None
        self._time_to_wait = None
        self._dsource = 'dns'

        # validate configuration info.
        conf_err_msg = "Please provide a valid '{0}' in the configuration file"
        Util.validate_parameter(conf['collector_path'],
                                conf_err_msg.format("collector_path"))
        Util.validate_parameter(conf['queue_name'],
                                conf_err_msg.format("queue_name"))
        Util.validate_parameter(conf['pkt_num'],
                                conf_err_msg.format("pkt_num"))
        Util.validate_parameter(conf['pcap_split_staging'],
                                conf_err_msg.format("pcap_split_staging"))
        Util.validate_parameter(conf['time_to_wait'],
                                conf_err_msg.format("time_to_wait"))

        # set configuration.
        self._collector_path = conf['collector_path']
        self._hdfs_root_path = "{0}/{1}".format(conf['huser'], self._dsource)
        self._time_to_wait = conf['time_to_wait']
        self._pkt_num = conf['pkt_num']
        self._pcap_split_staging = conf['pcap_split_staging']
        self._queue_name = conf['queue_name']
Example #5
def start_collector(type, workers_num, id=None):

    # generate ingest id
    ingest_id = str(datetime.datetime.time(datetime.datetime.now())).replace(
        ":", "_").replace(".", "_")

    # create logger.
    logger = Util.get_logger("ONI.INGEST")

    # validate the given configuration exists in ingest_conf.json.
    if not type in master_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(master_conf["pipelines"][type]["type"]):
        logger.error(
            "'{0}' type is not configured. Please check you ingest conf file".
            format(master_conf["pipelines"][type]["type"]))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = master_conf["kafka"]['kafka_server']
    k_port = master_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = master_conf["kafka"]['zookeper_server']
    zk_port = master_conf["kafka"]['zookeper_port']

    topic = "ONI-INGEST-{0}_{1}".format(type, ingest_id) if not id else id
    kafka = KafkaTopic(topic, k_server, k_port, zk_server, zk_port,
                       workers_num)

    # create a collector instance based on data source type.
    logger.info("Starting {0} ingest instance".format(topic))
    module = __import__("pipelines.{0}.collector".format(
        master_conf["pipelines"][type]["type"]),
                        fromlist=['Collector'])

    # start collector.
    ingest_collector = module.Collector(master_conf['hdfs_app_path'], kafka,
                                        type)
    ingest_collector.start()
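
A hedged side note on the dynamic import above: importlib.import_module is an equivalent, slightly more explicit way to resolve the per-pipeline Collector class (sketch only; it assumes the same pipelines.<type>.collector package layout used here):

    import importlib

    def load_collector(pipeline_type):
        # e.g. "flow" -> pipelines.flow.collector.Collector
        module = importlib.import_module("pipelines.{0}.collector".format(pipeline_type))
        return module.Collector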
Example #6
    def _ingest_file(self,file):

        message = ""
        with open(file,"rb") as f:

            for line in f:
                message += line
                if len(message) > self._message_size:
                    self._kafka_topic.send_message(message,self._kafka_topic.Partition)
                    message = ""
            # send the last package.
            self._kafka_topic.send_message(message,self._kafka_topic.Partition)
        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file,self._logger)
        self._logger.info("File {0} has been successfully sent to Kafka Topic:{1}".format(file,self._kafka_topic.Topic))
Example #7
    def _initialize_members(self,hdfs_app_path,kafka_topic,conf_type):
    
        # getting parameters.
        self._logger = logging.getLogger('ONI.INGEST.DNS')
        self._hdfs_app_path = hdfs_app_path
        self._kafka_topic = kafka_topic

        # get script path
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read dns configuration.
        conf_file = "{0}/ingest_conf.json".format(os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._conf = conf["pipelines"][conf_type]

        # set configuration.
        self._collector_path = self._conf['collector_path']        
        self._dsource = 'dns'
        self._hdfs_root_path = "{0}/{1}".format(hdfs_app_path, self._dsource)

        # set configuration.
        self._pkt_num = self._conf['pkt_num']
        self._pcap_split_staging = self._conf['pcap_split_staging']

        # initialize message broker client.
        self.kafka_topic = kafka_topic

        # create collector watcher
        self._watcher =  Util.create_watcher(self._collector_path,NewFileEvent(self),self._logger)
Example #8
    def _create_topic(self):

        self._logger.info("Creating topic: {0} with {1} parititions".format(self._topic,self._num_of_partitions))     

        # Create partitions for the workers.
        self._partitions = [ TopicPartition(self._topic,p) for p in range(int(self._num_of_partitions))]        

        # create partitioner
        self._partitioner = RoundRobinPartitioner(self._partitions)
        
        # get script path 
        zk_conf = "{0}:{1}".format(self._zk_server,self._zk_port)
        create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(os.path.dirname(os.path.abspath(__file__)),self._topic,zk_conf,self._num_of_partitions)

        # execute create topic cmd
        Util.execute_cmd(create_topic_cmd,self._logger)
Example #9
    def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):

        # getting parameters.
        self._logger = logging.getLogger('ONI.INGEST.FLOW')
        self._hdfs_app_path = hdfs_app_path
        self._kafka_topic = kafka_topic

        # get script path
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read flow configuration.
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._conf = conf["pipelines"][conf_type]

        # set configuration.
        self._collector_path = self._conf['collector_path']
        self._dsource = 'flow'
        self._hdfs_root_path = "{0}/{1}".format(hdfs_app_path, self._dsource)

        # initialize message broker client.
        self.kafka_topic = kafka_topic

        # create collector watcher
        self._watcher = Util.create_watcher(self._collector_path,
                                            NewFileEvent(self), self._logger)
Example #10
    def _initialize_members(self, conf):

        self._collector_path = None
        self._hdfs_root_path = None
        self._queue_name = None
        self._dsource = 'flow'

        # validate configuration info.
        conf_err_msg = "Please provide a valid '{0}' in the configuration file"
        Util.validate_parameter(conf['collector_path'], conf_err_msg.format("collector_path"))
        Util.validate_parameter(conf['queue_name'], conf_err_msg.format("queue_name"))

        # set configuration.
        self._collector_path = conf['collector_path']
        self._hdfs_root_path = "{0}/{1}".format(os.getenv('HUSER', '/user/duxbury'), self._dsource)
        self._queue_name = conf['queue_name']
Example #11
    def start(self):

        self._logger.info("Starting PROXY ingest")
        self._logger.info("Watching: {0}".format(self._collector_path))
        self._watcher.start()

        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            self._logger.info("Stopping PROXY collector...")
            self._watcher.stop()
            self._watcher.join()

            # remove kafka topic
            Util.remove_kafka_topic(self._kafka_topic.Zookeeper,self._kafka_topic.Topic,self._logger)
Example #12
def process_new_binary_file(new_file):

    # get file from hdfs
    get_file_cmd = "hadoop fs -get {0} ../stage/.".format(new_file)
    print get_file_cmd
    subprocess.call(get_file_cmd, shell=True)

    # get file name and date
    binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(new_file, ingest_type)

    # build process cmd.
    post_process_cmd = None
    process_opt = worker_conf[ingest_type]['process_opt']
    if ingest_type == 'dns':
        post_process_cmd = "tshark -r ../stage/{0} {1} >> ../stage/{0}.csv".format(file_name, process_opt)
    elif ingest_type == 'flow':
        post_process_cmd = "nfdump -o csv -r ../stage/{0} {1} > ../stage/{0}.csv".format(file_name, process_opt)
    else:
        print "Unsupported ingest type"
        sys.exit(1)

    print post_process_cmd
    subprocess.call(post_process_cmd, shell=True)

    # create folder if it does not exist
    h_base_path = "{0}/{1}".format(os.getenv('HUSER', '/user/oni'), ingest_type)
    h_csv_path = "{0}/csv".format(h_base_path)
    create_folder_cmd = "hadoop fs -mkdir -p {0}/y={1}/m={2}/d={3}/h={4}".format(h_csv_path, binary_year, binary_month, binary_day, binary_hour)
    print create_folder_cmd
    subprocess.call(create_folder_cmd, shell=True)

    # move to hdfs.
    upld_cmd = "hadoop fs -moveFromLocal ../stage/{0}.csv {1}/y={2}/m={3}/d={4}/h={5}/.".format(file_name, h_csv_path, binary_year, binary_month, binary_day, binary_hour)
    print upld_cmd
    subprocess.call(upld_cmd, shell=True)

    # make tmp folder in stage
    h_stage_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    h_stage_path = "{0}/stage/{1}".format(h_base_path, h_stage_timestamp)
    create_tmp_cmd = "hadoop fs -mkdir -p {0}".format(h_stage_path)
    print create_tmp_cmd
    subprocess.call(create_tmp_cmd, shell=True)

    # load to avro
    load_avro_cmd = "hive -hiveconf dbname={6} -hiveconf y={0} -hiveconf m={1} -hiveconf d={2} -hiveconf h={3} -hiveconf data_location='{4}' -f oni/load_{5}_avro_parquet.hql".format(binary_year, binary_month, binary_day, binary_hour, h_stage_path, ingest_type, os.getenv('DBNAME', 'default'))

    print load_avro_cmd
    subprocess.call(load_avro_cmd, shell=True)

    # remove from stage
    rm_tmp_cmd = "hadoop fs -rm -R -skipTrash {0}".format(h_stage_path)
    print rm_tmp_cmd
    subprocess.call(rm_tmp_cmd, shell=True)

    # can this delete other files when all is running on the same edge server?
    rm_tmp = "rm ../stage/{0}*".format(file_name)
    subprocess.call(rm_tmp, shell=True)

    print datetime.datetime.now()
Example #13
    def start(self):

        self._logger.info("Starting PROXY ingest")
        self._logger.info("Watching: {0}".format(self._collector_path))
        self._watcher.start()

        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            self._logger.info("Stopping PROXY collector...")
            self._watcher.stop()
            self._watcher.join()

            # remove kafka topic
            Util.remove_kafka_topic(self._kafka_topic.Zookeeper,
                                    self._kafka_topic.Topic, self._logger)
Example #14
    def _initialize_members(self, conf):

        self._collector_path = None
        self._hdfs_root_path = None
        self._queue_name = None
        self._dsource = 'flow'

        # validate configuration info.
        conf_err_msg = "Please provide a valid '{0}' in the configuration file"
        Util.validate_parameter(conf['collector_path'],
                                conf_err_msg.format("collector_path"))
        Util.validate_parameter(conf['queue_name'],
                                conf_err_msg.format("queue_name"))

        # set configuration.
        self._collector_path = conf['collector_path']
        self._hdfs_root_path = "{0}/{1}".format(conf['huser'], self._dsource)
        self._queue_name = conf['queue_name']
Example #15
def start_worker(type, topic, id, processes=None):

    logger = Util.get_logger("ONI.INGEST.WORKER")

    # validate the given configuration exists in ingest_conf.json.
    if not type in worker_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(worker_conf["pipelines"][type]["type"]):
        logger.error("The provided data source {0} is not valid".format(type))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # create a worker instance based on the data source type.
    module = __import__("pipelines.{0}.worker".format(
        worker_conf["pipelines"][type]["type"]),
                        fromlist=['Worker'])

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = worker_conf["kafka"]['kafka_server']
    k_port = worker_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = worker_conf["kafka"]['zookeper_server']
    zk_port = worker_conf["kafka"]['zookeper_port']
    topic = topic

    # create kafka consumer.
    kafka_consumer = KafkaConsumer(topic, k_server, k_port, zk_server, zk_port,
                                   id)

    # start worker.
    db_name = worker_conf['dbname']
    app_path = worker_conf['hdfs_app_path']
    ingest_worker = module.Worker(db_name, app_path, kafka_consumer, type,
                                  processes)
    ingest_worker.start()
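
A hypothetical invocation of the worker entry point above; the topic name and worker id are made-up illustration values that follow the "ONI-INGEST-{type}_{ingest_id}" pattern used by the collector:

    if __name__ == '__main__':
        # one flow worker bound to partition/worker id 0, with two parser processes
        start_worker("flow", "ONI-INGEST-flow_10_22_33_000000", 0, processes=2)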
Example #16
    def _ingest_file(self, file):

        message = ""
        with open(file, "rb") as f:

            for line in f:
                message += line
                if len(message) > self._message_size:
                    self._kafka_topic.send_message(message,
                                                   self._kafka_topic.Partition)
                    message = ""
            # send the last package.
            self._kafka_topic.send_message(message,
                                           self._kafka_topic.Partition)
        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file, self._logger)
        self._logger.info(
            "File {0} has been successfully sent to Kafka Topic:{1}".format(
                file, self._kafka_topic.Topic))
Example #17
    def start(self):

        self._logger.info("Creating Spark Job for topic: {0}".format(self._kafka_consumer.Topic))                

        # parser
        parser = self._conf["parser"]
        

        # spark job command.
        spark_job_cmd = ("spark-submit --master yarn "
                        "--jars {0}/oni/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar "
                        "{1}/{2} " 
                        "-zk {3} " 
                        "-t {4} "
                        "-db {5} "
                        "-dt {6} " 
                        "-w {7}".format(os.path.dirname(os.path.dirname(self._script_path)),self._script_path,parser,self._kafka_consumer.ZookeperServer,self._kafka_consumer.Topic,self._db_name,"proxy",self._processes))

        
        # start spark job.
        Util.execute_cmd(spark_job_cmd,self._logger)
Example #18
    def _split_pcap_file(self, file_name, file_local_path, hdfs_path):

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_split.pcap".format(self._pkt_num, file_local_path, self._pcap_split_staging, name)
        print split_cmd
        subprocess.call(split_cmd, shell=True)

        for currdir, subdir, files in os.walk(self._pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_split".format(name) in file:
                    # load file to hdfs.
                    Util.load_to_hdfs(file, os.path.join(currdir, file), hdfs_path)

                    # send rabbitmq notification.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)

        rm_big_file = "rm {0}".format(file_local_path)
        print rm_big_file
        subprocess.call(rm_big_file, shell=True)
Example #19
    def _process_pcap_file(self, file_name, file_local_path, hdfs_root_path):

        # get timestamp from the file name.
        file_date = file_name.split('.')[0]
        pcap_hour = file_date[-4:-2]
        pcap_date_path = file_date[-12:-4]

        # hdfs path with timestamp.
        hdfs_path = "{0}/{1}/{2}".format(hdfs_root_path, pcap_date_path,
                                         pcap_hour)
        Util.creat_hdfs_folder(hdfs_path)

        # get file size.
        file_size = os.stat(file_local_path)
        if file_size.st_size > 1145498644:

            # split file.
            self._split_pcap_file(file_name, file_local_path, hdfs_path)
        else:
            # load file to hdfs
            Util.load_to_hdfs(file_name, file_local_path, hdfs_path)

            # send rabbitmq notification.
            hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
            Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)
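
A small sketch of the timestamp slicing used above, assuming capture files whose base name ends in YYYYMMDDhhmm (the sample name is hypothetical):

    file_name = "capture_201612011430.pcap"   # hypothetical input
    file_date = file_name.split('.')[0]       # "capture_201612011430"
    pcap_hour = file_date[-4:-2]              # "14"
    pcap_date_path = file_date[-12:-4]        # "20161201"
    print "{0}/{1}".format(pcap_date_path, pcap_hour)   # 20161201/14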
Example #20
def start_worker(type,topic,id,processes=None):

    logger = Util.get_logger("ONI.INGEST.WORKER")

    # validate the given configuration exists in ingest_conf.json.
    if not type in worker_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(worker_conf["pipelines"][type]["type"]):
        logger.error("The provided data source {0} is not valid".format(type))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # create a worker instance based on the data source type.
    module = __import__("pipelines.{0}.worker".format(worker_conf["pipelines"][type]["type"]),fromlist=['Worker'])

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = worker_conf["kafka"]['kafka_server']
    k_port = worker_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = worker_conf["kafka"]['zookeper_server']
    zk_port = worker_conf["kafka"]['zookeper_port']
    topic = topic

    # create kafka consumer.
    kafka_consumer = KafkaConsumer(topic,k_server,k_port,zk_server,zk_port,id)

    # start worker.
    db_name = worker_conf['dbname']
    app_path = worker_conf['hdfs_app_path']
    ingest_worker = module.Worker(db_name,app_path,kafka_consumer,type,processes)
    ingest_worker.start()
Example #21
    def _create_topic(self):

        self._logger.info("Creating topic: {0} with {1} parititions".format(
            self._topic, self._num_of_partitions))

        # Create partitions for the workers.
        self._partitions = [
            TopicPartition(self._topic, p)
            for p in range(int(self._num_of_partitions))
        ]

        # create partitioner
        self._partitioner = RoundRobinPartitioner(self._partitions)

        # get script path
        zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
        create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
            os.path.dirname(os.path.abspath(__file__)), self._topic, zk_conf,
            self._num_of_partitions)

        # execute create topic cmd
        Util.execute_cmd(create_topic_cmd, self._logger)
Example #22
    def _ingest_file(self, file, partition):

        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_oni.pcap".format(
            self._pkt_num, file, self._pcap_split_staging, name)
        self._logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, self._logger)

        for currdir, subdir, files in os.walk(self._pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_oni".format(name) in file:

                    # get timestamp from the file name to build hdfs path.
                    file_date = file.split('.')[0]
                    pcap_hour = file_date[-6:-4]
                    pcap_date_path = file_date[-14:-6]

                    # hdfs path with timestamp.
                    hdfs_path = "{0}/binary/{1}/{2}".format(
                        self._hdfs_root_path, pcap_date_path, pcap_hour)

                    # create hdfs path.
                    Util.creat_hdfs_folder(hdfs_path, self._logger)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.load_to_hdfs(os.path.join(currdir, file),
                                      hadoop_pcap_file, self._logger)

                    # create event for workers to process the file.
                    self._logger.info(
                        "Sending split file to worker number: {0}".format(
                            partition))
                    self._kafka_topic.send_message(hadoop_pcap_file, partition)
                    self._logger.info(
                        "File {0} has been successfully sent to Kafka Topic to: {1}"
                        .format(file, self._kafka_topic.Topic))

        self._logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, self._logger)
Example #23
    def _initialize_members(self,db_name,hdfs_app_path,kafka_consumer,conf_type,processes):
        
        # get logger instance.
        self._logger = Util.get_logger('ONI.INGEST.WRK.PROXY')

        self._db_name = db_name
        self._hdfs_app_path = hdfs_app_path
        self._kafka_consumer = kafka_consumer

        # read proxy configuration.
        self._script_path = os.path.dirname(os.path.abspath(__file__))
        conf_file = "{0}/ingest_conf.json".format(os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._conf = conf["pipelines"][conf_type]
        self._processes = processes
Example #24
    def _split_pcap_file(self, file_name, file_local_path, hdfs_path):

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_split.pcap".format(
            self._pkt_num, file_local_path, self._pcap_split_staging, name)
        print split_cmd
        subprocess.call(split_cmd, shell=True)

        for currdir, subdir, files in os.walk(self._pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_split".format(name) in file:
                    # load file to hdfs.
                    Util.load_to_hdfs(file, os.path.join(currdir, file),
                                      hdfs_path)

                    # send rabbitmq notification.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.send_new_file_notification(hadoop_pcap_file,
                                                    self._queue_name)

        rm_big_file = "rm {0}".format(file_local_path)
        print rm_big_file
        subprocess.call(rm_big_file, shell=True)
Example #25
    def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer,
                            conf_type, processes):

        # get logger instance.
        self._logger = Util.get_logger('ONI.INGEST.WRK.PROXY')

        self._db_name = db_name
        self._hdfs_app_path = hdfs_app_path
        self._kafka_consumer = kafka_consumer

        # read proxy configuration.
        self._script_path = os.path.dirname(os.path.abspath(__file__))
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._conf = conf["pipelines"][conf_type]
        self._processes = processes
Example #26
    def _ingest_file(self, file, partition):

        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_oni.pcap".format(self._pkt_num, file, self._pcap_split_staging, name)
        self._logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, self._logger)

        for currdir, subdir, files in os.walk(self._pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_oni".format(name) in file:

                    # get timestamp from the file name to build hdfs path.
                    file_date = file.split('.')[0]
                    pcap_hour = file_date[-6:-4]
                    pcap_date_path = file_date[-14:-6]

                    # hdfs path with timestamp.
                    hdfs_path = "{0}/binary/{1}/{2}".format(self._hdfs_root_path, pcap_date_path, pcap_hour)

                    # create hdfs path.
                    Util.creat_hdfs_folder(hdfs_path, self._logger)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.load_to_hdfs(os.path.join(currdir, file), hadoop_pcap_file, self._logger)

                    # create event for workers to process the file.
                    self._logger.info("Sending split file to worker number: {0}".format(partition))
                    self._kafka_topic.send_message(hadoop_pcap_file, partition)
                    self._logger.info("File {0} has been successfully sent to Kafka Topic to: {1}".format(file, self._kafka_topic.Topic))

        self._logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, self._logger)
Example #27
    def _load_to_hdfs(self, file):

        # get file name and date
        binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(file, 'flow')

        # hdfs path with timestamp.
        hdfs_path = "{0}/{1}/{2}".format(self._hdfs_root_path, binary_date_path, binary_hour)
        Util.creat_hdfs_folder(hdfs_path)

        # load to hdfs.
        Util.load_to_hdfs(file_name, file, hdfs_path)

        # send the notification to rabbitmq server.
        hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)

        print "Done !!!!!"
Example #28
    def _initialize_members(self,hdfs_app_path,kafka_topic,conf_type):
        
        # getting parameters.
        self._logger = logging.getLogger('ONI.INGEST.PROXY')
        self._hdfs_app_path = hdfs_app_path
        self._kafka_topic = kafka_topic

        # get script path
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read proxy configuration.
        conf_file = "{0}/ingest_conf.json".format(os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._message_size = conf["kafka"]["message_size"]
        self._conf = conf["pipelines"][conf_type]

        # get collector path.
        self._collector_path = self._conf['collector_path']

        # create collector watcher
        self._watcher =  Util.create_watcher(self._collector_path,NewFileEvent(self),self._logger)
Example #29
    def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):

        # getting parameters.
        self._logger = logging.getLogger('ONI.INGEST.PROXY')
        self._hdfs_app_path = hdfs_app_path
        self._kafka_topic = kafka_topic

        # get script path
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read proxy configuration.
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._message_size = conf["kafka"]["message_size"]
        self._conf = conf["pipelines"][conf_type]

        # get collector path.
        self._collector_path = self._conf['collector_path']

        # create collector watcher
        self._watcher = Util.create_watcher(self._collector_path,
                                            NewFileEvent(self), self._logger)
Example #30
    def _load_to_hdfs(self, file):

        # get file name and date
        binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(
            file, 'flow')

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(self._hdfs_root_path,
                                                binary_date_path, binary_hour)
        Util.creat_hdfs_folder(hdfs_path)

        # load to hdfs.
        Util.load_to_hdfs(file_name, file, hdfs_path)

        # send the notification to rabbitmq server.
        hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)

        print "Done !!!!!"
Example #31
    def _process_pcap_file(self, file_name, file_local_path, hdfs_root_path):

        # get timestamp from the file name.
        file_date = file_name.split('.')[0]
        pcap_hour = file_date[-4:-2]
        pcap_date_path = file_date[-12:-4]

        # hdfs path with timestamp.
        hdfs_path = "{0}/{1}/{2}".format(hdfs_root_path, pcap_date_path, pcap_hour)
        Util.creat_hdfs_folder(hdfs_path)

        # get file size.
        file_size = os.stat(file_local_path)
        if file_size.st_size > 1145498644:

            # split file.
            self._split_pcap_file(file_name, file_local_path, hdfs_path)
        else:
            # load file to hdfs
            Util.load_to_hdfs(file_name, file_local_path, hdfs_path)

            # send rabbitmq notification.
            hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
            Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)
Example #32
def process_new_binary_file(new_file):

    # get file from hdfs
    get_file_cmd = "hadoop fs -get {0} ../stage/.".format(new_file)
    print get_file_cmd
    subprocess.call(get_file_cmd, shell=True)

    # get file name and date
    binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(new_file, ingest_type)

    # build process cmd.
    post_process_cmd = None
    process_opt = worker_conf[ingest_type]['process_opt']

    if ingest_type == 'dns':
        post_process_cmd = "tshark -r ../stage/{0} {1} >> ../stage/{0}.csv".format(file_name, process_opt)
    elif ingest_type == 'flow':
        post_process_cmd = "nfdump -o csv -r ../stage/{0} {1} > ../stage/{0}.csv".format(file_name, process_opt)
    else:
        print "Unsupported ingest type"
        sys.exit(1)

    print post_process_cmd
    subprocess.call(post_process_cmd, shell=True)

    # create folder if it does not exist
    h_base_path = "{0}/{1}".format(worker_conf['huser'], ingest_type)
    h_csv_path = "{0}/csv".format(h_base_path)
    create_folder_cmd = "hadoop fs -mkdir -p {0}/y={1}/m={2}/d={3}/h={4}".format(h_csv_path, binary_year, binary_month, binary_day, binary_hour)
    print create_folder_cmd
    subprocess.call(create_folder_cmd, shell=True)

    # move to hdfs.
    upld_cmd = "hadoop fs -moveFromLocal ../stage/{0}.csv {1}/y={2}/m={3}/d={4}/h={5}/.".format(file_name, h_csv_path, binary_year, binary_month, binary_day, binary_hour)
    print upld_cmd
    subprocess.call(upld_cmd, shell=True)

    # make tmp folder in stage
    h_stage_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    h_stage_path = "{0}/stage/{1}".format(h_base_path, h_stage_timestamp)
    create_tmp_cmd = "hadoop fs -mkdir -p {0}".format(h_stage_path)
    print create_tmp_cmd
    subprocess.call(create_tmp_cmd, shell=True)

    # move to stage.
    mv_to_stage = "hadoop fs -cp {0}/y={1}/m={2}/d={3}/h={4}/{5}.csv {6}/.".format(h_csv_path, binary_year, binary_month, binary_day, binary_hour, file_name, h_stage_path)
    print mv_to_stage
    subprocess.call(mv_to_stage, shell=True)

    # load to avro
    load_avro_cmd = "hive -hiveconf dbname={6} -hiveconf y={0} -hiveconf m={1} -hiveconf d={2} -hiveconf h={3} -hiveconf data_location='{4}' -f oni/load_{5}_avro_parquet.hql".format(binary_year, binary_month, binary_day, binary_hour, h_stage_path, ingest_type, worker_conf['dbname'])

    print load_avro_cmd
    subprocess.call(load_avro_cmd, shell=True)

    # remove from stage
    rm_tmp_cmd = "hadoop fs -rm -R -skipTrash {0}".format(h_stage_path)
    print rm_tmp_cmd
    subprocess.call(rm_tmp_cmd, shell=True)

    # can this delete other files when all is running on the same edge server?
    rm_tmp = "rm ../stage/{0}*".format(file_name)
    subprocess.call(rm_tmp, shell=True)

    print datetime.datetime.now()
Example #33
    def _process_new_file(self, file):

        # get file from hdfs
        get_file_cmd = "hadoop fs -get {0} {1}.".format(
            file, self._local_staging)
        self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
        Util.execute_cmd(get_file_cmd, self._logger)

        # get file name and date
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        flow_date = file_name.split('.')[1]
        flow_year = flow_date[0:4]
        flow_month = flow_date[4:6]
        flow_day = flow_date[6:8]
        flow_hour = flow_date[8:10]

        # build process cmd.
        process_cmd = "nfdump -o csv -r {0}{1} {2} > {0}{1}.csv".format(
            self._local_staging, file_name, self._process_opt)
        self._logger.info("Processing file: {0}".format(process_cmd))
        Util.execute_cmd(process_cmd, self._logger)

        # create hdfs staging.
        hdfs_path = "{0}/flow".format(self._hdfs_app_path)
        staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
        hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path,
                                                   staging_timestamp)
        create_staging_cmd = "hadoop fs -mkdir -p {0}".format(
            hdfs_staging_path)
        self._logger.info("Creating staging: {0}".format(create_staging_cmd))
        Util.execute_cmd(create_staging_cmd, self._logger)

        # move to stage.
        mv_to_staging = "hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(
            self._local_staging, file_name, hdfs_staging_path)
        self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
        subprocess.call(mv_to_staging, shell=True)

        #load to avro
        load_to_avro_cmd = "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} -hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' -f pipelines/flow/load_flow_avro_parquet.hql".format(
            self._db_name, flow_year, flow_month, flow_day, flow_hour,
            hdfs_staging_path)

        self._logger.info("Loading data to hive: {0}".format(load_to_avro_cmd))
        Util.execute_cmd(load_to_avro_cmd, self._logger)

        # remove from hdfs staging
        rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(
            hdfs_staging_path)
        self._logger.info(
            "Removing staging path: {0}".format(rm_hdfs_staging_cmd))
        Util.execute_cmd(rm_hdfs_staging_cmd, self._logger)

        # remove from local staging.
        rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
        self._logger.info(
            "Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging, self._logger)

        self._logger.info(
            "File {0} was successfully processed.".format(file_name))
Example #34
    def _process_new_file(self,file):

        # get file from hdfs
        get_file_cmd = "hadoop fs -get {0} {1}.".format(file,self._local_staging)
        self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
        Util.execute_cmd(get_file_cmd,self._logger)

        # get file name and date
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts)-1]

        binary_hour = file_name_parts[len(file_name_parts)-2]
        binary_date_path = file_name_parts[len(file_name_parts)-3]
        binary_year = binary_date_path[0:4]
        binary_month = binary_date_path[4:6]
        binary_day = binary_date_path[6:8]

        # build process cmd.
        process_cmd = "tshark -r {0}{1} {2} > {0}{1}.csv".format(self._local_staging,file_name,self._process_opt)
        self._logger.info("Processing file: {0}".format(process_cmd))
        Util.execute_cmd(process_cmd,self._logger)

        # create hdfs staging.
        hdfs_path = "{0}/dns".format(self._hdfs_app_path)
        staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
        hdfs_staging_path =  "{0}/stage/{1}".format(hdfs_path,staging_timestamp)
        create_staging_cmd = "hadoop fs -mkdir -p {0}".format(hdfs_staging_path)
        self._logger.info("Creating staging: {0}".format(create_staging_cmd))
        Util.execute_cmd(create_staging_cmd,self._logger)

        # move to stage.
        mv_to_staging ="hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(self._local_staging,file_name,hdfs_staging_path)
        self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
        Util.execute_cmd(mv_to_staging,self._logger)

        #load to avro
        load_to_avro_cmd = "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} -hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' -f pipelines/dns/load_dns_avro_parquet.hql".format(self._db_name,binary_year,binary_month,binary_day,binary_hour,hdfs_staging_path)

        self._logger.info("Loading data to hive: {0}".format(load_to_avro_cmd))
        Util.execute_cmd(load_to_avro_cmd,self._logger)

        # remove from hdfs staging
        rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(hdfs_staging_path)
        self._logger.info("Removing staging path: {0}".format(rm_hdfs_staging_cmd))
        Util.execute_cmd(rm_hdfs_staging_cmd,self._logger)

        # remove from local staging.
        rm_local_staging = "rm {0}{1}".format(self._local_staging,file_name)
        self._logger.info("Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging,self._logger)

        self._logger.info("File {0} was successfully processed.".format(file_name))