Example #1
import datetime
import os
import subprocess
import sys

# Util, worker_conf and ingest_type are module-level names in the original worker script.
def process_new_binary_file(new_file):

	# get file from hdfs
	get_file_cmd = "hadoop fs -get {0} ../stage/.".format(new_file)
	print get_file_cmd
	subprocess.call(get_file_cmd,shell=True)

	# get file name and date
	binary_year,binary_month,binary_day,binary_hour,binary_date_path,file_name =  Util.build_hdfs_path(new_file,ingest_type)

	# build process cmd.
	post_process_cmd = None
	process_opt = worker_conf[ingest_type]['process_opt']
	if ingest_type == 'dns':
		post_process_cmd = "tshark -r ../stage/{0} {1} >> ../stage/{0}.csv".format(file_name,process_opt)
	elif ingest_type == 'flow':
		post_process_cmd = "nfdump -o csv -r ../stage/{0} {1} > ../stage/{0}.csv".format(file_name,process_opt)
	else:
		print "Unsupported ingest type"
		sys.exit(1)

	print post_process_cmd
	subprocess.call(post_process_cmd,shell=True)

	# create folder if it does not exist
	h_base_path = "{0}/{1}".format(os.getenv('HUSER','/user/oni'), ingest_type)
	h_csv_path = "{0}/csv".format(h_base_path)
	create_folder_cmd = "hadoop fs -mkdir -p {0}/y={1}/m={2}/d={3}/h={4}".format(h_csv_path,binary_year,binary_month,binary_day,binary_hour)
	print create_folder_cmd
	subprocess.call(create_folder_cmd,shell=True)

	#move to hdfs.
	upld_cmd = "hadoop fs -moveFromLocal ../stage/{0}.csv {1}/y={2}/m={3}/d={4}/h={5}/.".format(file_name,h_csv_path,binary_year,binary_month,binary_day,binary_hour)
	print upld_cmd
	subprocess.call(upld_cmd,shell=True)

	#make tmp folder in stage
	h_stage_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]  # MMSS plus the first two microsecond digits
	h_stage_path =  "{0}/stage/{1}".format(h_base_path,h_stage_timestamp)
	create_tmp_cmd = "hadoop fs -mkdir -p {0}".format(h_stage_path)
	print create_tmp_cmd
	subprocess.call(create_tmp_cmd,shell=True)

	#load to avro
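	# the .hql script picks these up as ${hiveconf:dbname}, ${hiveconf:y}, and so on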
	load_avro_cmd = "hive -hiveconf dbname={6} -hiveconf y={0} -hiveconf m={1} -hiveconf d={2} -hiveconf h={3} -hiveconf data_location='{4}' -f oni/load_{5}_avro_parquet.hql".format(binary_year,binary_month,binary_day,binary_hour,h_stage_path,ingest_type,os.getenv('DBNAME','default') )

	print load_avro_cmd
	subprocess.call(load_avro_cmd,shell=True)

	#remove from stage
	rm_tmp_cmd = "hadoop fs -rm -R -skipTrash {0}".format(h_stage_path)
	print rm_tmp_cmd
	subprocess.call(rm_tmp_cmd,shell=True)

	#can this delete other files when all is running on the same edge server?
	rm_tmp = "rm ../stage/{0}*".format(file_name)
	subprocess.call(rm_tmp,shell=True)

	print datetime.datetime.now()
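
For context, the example leans on `Util.build_hdfs_path` returning a six-tuple of date parts plus the bare file name. The helper below is a minimal sketch of that contract, assuming capture files carry an embedded `YYYYMMDDHH` timestamp (e.g. `nfcapd.2016052815`); the project's real parser may differ.

import os
import re

def build_hdfs_path_sketch(hdfs_file, ingest_type):
    # Hypothetical stand-in for Util.build_hdfs_path: pull the YYYYMMDDHH
    # timestamp out of the file name and return the pieces the worker uses.
    file_name = os.path.basename(hdfs_file)
    match = re.search(r'(\d{4})(\d{2})(\d{2})(\d{2})', file_name)
    if match is None:
        raise ValueError("no YYYYMMDDHH timestamp in {0}".format(file_name))
    year, month, day, hour = match.groups()
    date_path = "{0}/{1}{2}{3}".format(ingest_type, year, month, day)
    return year, month, day, hour, date_path, file_name

For example, `build_hdfs_path_sketch('nfcapd.2016052815', 'flow')` would return `('2016', '05', '28', '15', 'flow/20160528', 'nfcapd.2016052815')`.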
Example #2
	def _load_to_hdfs(self,file):

		# get file name and date
		binary_year,binary_month,binary_day,binary_hour,binary_date_path,file_name =  Util.build_hdfs_path(file,'flow')

		# hdfs path with timestamp.
		hdfs_path = "{0}/{1}/{2}".format(self._hdfs_root_path,binary_date_path,binary_hour)
		Util.creat_hdfs_folder(hdfs_path)

		# load to hdfs.
		Util.load_to_hdfs(file_name,file,hdfs_path)

		# send the notification to rabbitmq server.
		hadoop_pcap_file = "{0}/{1}".format(hdfs_path,file_name)
		Util.send_new_file_notification(hadoop_pcap_file,self._queue_name)

		print "Done !!!!!"
Example #3
    def _load_to_hdfs(self, file):

        # get file name and date
        binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(
            file, 'flow')

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(self._hdfs_root_path,
                                                binary_date_path, binary_hour)
        Util.creat_hdfs_folder(hdfs_path)

        # load to hdfs.
        Util.load_to_hdfs(file_name, file, hdfs_path)

        # send the notification to rabbitmq server.
        hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)

        print "Done !!!!!"
Example #4
import datetime
import subprocess
import sys

# Util, worker_conf and ingest_type come from module scope in the original worker script.
def process_new_binary_file(new_file):

	# get file from hdfs
	get_file_cmd = "hadoop fs -get {0} ../stage/.".format(new_file)
	print get_file_cmd
	subprocess.call(get_file_cmd,shell=True)

	# get file name and date
	binary_year,binary_month,binary_day,binary_hour,binary_date_path,file_name =  Util.build_hdfs_path(new_file,ingest_type)

	# build process cmd.
	post_process_cmd = None
	process_opt = worker_conf[ingest_type]['process_opt']
	
	if ingest_type == 'dns':
		post_process_cmd = "tshark -r ../stage/{0} {1} >> ../stage/{0}.csv".format(file_name,process_opt)
	elif ingest_type == 'flow':
		post_process_cmd = "nfdump -o csv -r ../stage/{0} {1} > ../stage/{0}.csv".format(file_name,process_opt)
	else:
		print "Unsupported ingest type"
		sys.exit(1)

	print post_process_cmd
	subprocess.call(post_process_cmd,shell=True)

	# create folder if it does not exist
	h_base_path = "{0}/{1}".format(worker_conf['huser'], ingest_type)
	h_csv_path = "{0}/csv".format(h_base_path)
	create_folder_cmd = "hadoop fs -mkdir -p {0}/y={1}/m={2}/d={3}/h={4}".format(h_csv_path,binary_year,binary_month,binary_day,binary_hour)
	print create_folder_cmd
	subprocess.call(create_folder_cmd,shell=True)

	#move to hdfs.
	upld_cmd = "hadoop fs -moveFromLocal ../stage/{0}.csv {1}/y={2}/m={3}/d={4}/h={5}/.".format(file_name,h_csv_path,binary_year,binary_month,binary_day,binary_hour)
	print upld_cmd
	subprocess.call(upld_cmd,shell=True)

	#make tmp folder in stage
	h_stage_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
	h_stage_path =  "{0}/stage/{1}".format(h_base_path,h_stage_timestamp)
	create_tmp_cmd = "hadoop fs -mkdir -p {0}".format(h_stage_path)
	print create_tmp_cmd
	subprocess.call(create_tmp_cmd,shell=True)

	# move to stage.
	mv_to_stage = "hadoop fs -cp {0}/y={1}/m={2}/d={3}/h={4}/{5}.csv {6}/.".format(h_csv_path,binary_year,binary_month,binary_day,binary_hour,file_name,h_stage_path)
	print mv_to_stage
	subprocess.call(mv_to_stage,shell=True)

	#load to avro
	load_avro_cmd = "hive -hiveconf dbname={6} -hiveconf y={0} -hiveconf m={1} -hiveconf d={2} -hiveconf h={3} -hiveconf data_location='{4}' -f oni/load_{5}_avro_parquet.hql".format(binary_year,binary_month,binary_day,binary_hour,h_stage_path,ingest_type,worker_conf['dbname'] )

	print load_avro_cmd
	subprocess.call(load_avro_cmd,shell=True)

	#remove from stage
	rm_tmp_cmd = "hadoop fs -rm -R -skipTrash {0}".format(h_stage_path)
	print rm_tmp_cmd
	subprocess.call(rm_tmp_cmd,shell=True)

	#can this delete other files when all is running on the same edge server?
	rm_tmp = "rm ../stage/{0}*".format(file_name)
	subprocess.call(rm_tmp,shell=True)

	print datetime.datetime.now()
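
The question in the `rm ../stage/{0}*` comment is well founded: with several workers sharing one edge server, a trailing glob can match files another worker is still processing, and every `subprocess.call(..., shell=True)` here interpolates file names into a shell string. Below is a hedged sketch of a tighter cleanup and an argument-list call, assuming the `../stage` layout used above; function names are hypothetical.

import os
import subprocess

def cleanup_stage_sketch(file_name, stage_dir="../stage"):
    # Delete exactly the binary and its CSV, not whatever the glob matches.
    for candidate in (file_name, file_name + ".csv"):
        path = os.path.join(stage_dir, candidate)
        if os.path.isfile(path):
            os.remove(path)

def hdfs_get_sketch(hdfs_file, stage_dir="../stage"):
    # Argument-list form keeps the HDFS path out of shell parsing.
    subprocess.call(["hadoop", "fs", "-get", hdfs_file, stage_dir + "/."])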