Example #1
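This worker routine pulls a newly landed binary capture out of HDFS, converts it to CSV (tshark for DNS pcaps, nfdump for flow captures), pushes the CSV back into a Hive-style partition tree, and then runs a Hive script to load it into Avro/Parquet-backed tables. The imports and module-level names the excerpt relies on are noted at the top of the snippet.
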
import datetime
import os
import subprocess

import util  # project-local helpers: build_hdfs_path(), HDFS wrappers, etc.

# ingest_type ('dns' or 'flows') and worker_conf (the parsed worker
# configuration) are module-level globals defined elsewhere in the worker.

def process_new_binary_file(new_file):

	# get file from hdfs
	get_file_cmd = "hadoop fs -get {0} ../stage/.".format(new_file)
	print get_file_cmd
	subprocess.call(get_file_cmd,shell=True)

	# get file name and date
	binary_year,binary_month,binary_day,binary_hour,binary_date_path,file_name =  util.build_hdfs_path(new_file,ingest_type)

	# build process cmd.	
	post_process_cmd = None
	process_opt = worker_conf[ingest_type]['process_opt']
	if ingest_type == 'dns':		
		post_process_cmd = "tshark -r tmp/{0} {1} >> tmp/{0}.csv".format(file_name,process_opt)
	elif ingest_type == 'flows':
		post_process_cmd = "nfdump -o csv -r tmp/{0} {1} > tmp/{0}.csv".format(file_name,process_opt)

	print post_process_cmd
	subprocess.call(post_process_cmd,shell=True)

	# create folder if it does not exist
	h_base_path = os.environ['HUSER']+'/'+ingest_type
	h_csv_path = "{0}/csv".format(h_base_path)
	create_folder_cmd = "hadoop fs -mkdir -p {0}/y={1}/m={2}/d={3}/h={4}".format(h_csv_path,binary_year,binary_month,binary_day,binary_hour)
	print create_folder_cmd
	subprocess.call(create_folder_cmd,shell=True)	
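	# e.g. a capture stamped 2016-01-01 12:00 lands under
	# $HUSER/flows/csv/y=2016/m=01/d=01/h=12 -- a Hive-style partition layout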

	#move to hdfs.
	upld_cmd = "hadoop fs -moveFromLocal tmp/{0}.csv {1}/y={2}/m={3}/d={4}/h={5}/.".format(file_name,h_csv_path,binary_year,binary_month,binary_day,binary_hour)
	print upld_cmd
	subprocess.call(upld_cmd,shell=True)

	# make tmp folder in stage. NOTE: file_date is not defined in this
	# excerpt; it is assumed here to be the capture's date stamp, e.g.:
	file_date = "{0}{1}{2}".format(binary_year, binary_month, binary_day)
	h_stage_path = "{0}/stage/{1}".format(h_base_path, file_date)
	create_tmp_cmd = "hadoop fs -mkdir -p {0}".format(h_stage_path)
	print create_tmp_cmd
	subprocess.call(create_tmp_cmd,shell=True)
	
	#load to avro	
	load_avro_cmd = "hive -hiveconf dbname={6} -hiveconf y={0} -hiveconf m={1} -hiveconf d={2} -hiveconf h={3} -hiveconf data_location='{4}' -f load_{5}_avro_parquet.hql".format(binary_year,binary_month,binary_day,binary_hour,h_stage_path,ingest_type,os.environ['DBNAME'])
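	# the -hiveconf variables above are consumed by load_<type>_avro_parquet.hql,
	# which presumably uses them to pick the target database and the y/m/d/h
	# partition when loading the staged CSV into the Avro/Parquet tables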

	print load_avro_cmd
	subprocess.call(load_avro_cmd,shell=True)

	#remove from stage
	rm_tmp_cmd = "hadoop fs -rm -R -skipTrash {0}".format(h_stage_path)
	print rm_tmp_cmd
	subprocess.call(rm_tmp_cmd,shell=True)

	# TODO: can this delete other files when everything runs on the same edge server?
	rm_tmp = "rm ../stage/{0}".format(new_file)
	subprocess.call(rm_tmp,shell=True)

	print datetime.datetime.now()
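Every step above shells out with subprocess.call(..., shell=True) on an interpolated string, which is fragile if a path ever contains spaces or shell metacharacters. Below is a minimal sketch (not part of the project) of the same hadoop fs -get step using an argument list, which avoids the shell entirely; only the tshark/nfdump steps, which rely on shell redirection, genuinely need a shell or an explicit stdout file.

import subprocess

def hdfs_get(src_path, dest_dir="../stage"):
	# argument-list form: no shell is spawned, so no quoting pitfalls
	return subprocess.call(["hadoop", "fs", "-get", src_path, dest_dir + "/."])
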
Example #2

A collector-side method, apparently from the same ingest codebase (it uses the same util.build_hdfs_path helper): it stages a new flow capture into a timestamped HDFS folder and then notifies a RabbitMQ queue about the new file. self._hdfs_root_path and self._queue_name are instance attributes set elsewhere in the class.

	def _load_to_hdfs(self, file):

		# get file name and date
		binary_year,binary_month,binary_day,binary_hour,binary_date_path,file_name = util.build_hdfs_path(file,'flow')

		# hdfs path with timestamp.
		hdfs_path = "{0}/{1}/{2}".format(self._hdfs_root_path,binary_date_path,binary_hour)
		util.creat_hdfs_folder(hdfs_path)

		# load to hdfs.
		util.load_to_hdfs(file_name,file,hdfs_path)

		# send the notification to rabbitmq server.
		hadoop_pcap_file = "{0}/{1}".format(hdfs_path,file_name)
		util.send_new_file_notification(hadoop_pcap_file,self._queue_name)

		print "Done !!!!!"