def ingest_file(file, message_size, topic, kafka_servers):
    """Send the contents of ``file`` to Kafka in batches, then remove it.

    Lines are accumulated in a buffer.  Each time the buffer length exceeds
    ``message_size`` it is handed to ``KafkaProducer.SendMessage`` (with 0 as
    the last argument) and reset.  The remaining buffer is always sent once
    more after the last line, even when it is empty.  Afterwards an ``rm``
    command for the file is passed to ``Util.execute_cmd``.

    Any exception is logged and swallowed; nothing is returned.
    """
    logger = logging.getLogger('SPOT.INGEST.PROXY.{0}'.format(os.getpid()))

    def _publish(payload):
        # Single place that knows how a batch is shipped.
        KafkaProducer.SendMessage(payload, kafka_servers, topic, 0)

    try:
        pending = ""
        logger.info("Ingesting file: {0} process:{1}".format(
            file, os.getpid()))

        with open(file, "rb") as source:
            for line in source:
                pending += line
                if len(pending) > message_size:
                    _publish(pending)
                    pending = ""
            # Flush whatever is left after the final line.
            _publish(pending)

        Util.execute_cmd("rm {0}".format(file), logger)
        logger.info(
            "File {0} has been successfully sent to Kafka Topic: {1}".format(
                file, topic))
    except Exception as err:
        logger.error(
            "There was a problem, please check the following error message:{0}"
            .format(err.message))
        logger.error("Exception: {0}".format(err))
def start(self):
    """Assemble the ``spark-submit`` command line and pass it to ``Util.execute_cmd``.

    The parser script name comes from ``self._conf["parser"]`` and the Spark
    sizing options from ``self._spark_conf``.  The jar directory is taken two
    levels above ``self._script_path``.  The data type argument (``-dt``) is
    the literal ``"proxy"``.
    """
    self._logger.info("Creating Spark Job for topic: {0}".format(
        self._kafka_consumer.Topic))

    # Parser script to submit.
    parser = self._conf["parser"]

    # Spark sizing options.
    spark = self._spark_conf
    driver_memory = spark["driver_memory"]
    num_exec = spark["spark_exec"]
    exec_memory = spark["spark_executor_memory"]
    exec_cores = spark["spark_executor_cores"]
    batch_size = spark["spark_batch_size"]

    jar_path = os.path.dirname(os.path.dirname(self._script_path))

    # Command template; placeholders are filled positionally below.
    template = (
        "spark-submit --master yarn "
        "--driver-memory {0} "
        "--num-executors {1} "
        "--conf spark.executor.memory={2} "
        "--conf spark.executor.cores={3} "
        "--jars {4}/common/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar "
        "{5}/{6} "
        "-zk {7} "
        "-t {8} "
        "-db {9} "
        "-dt {10} "
        "-w {11} "
        "-bs {12}")
    values = (
        driver_memory,
        num_exec,
        exec_memory,
        exec_cores,
        jar_path,
        self._script_path,
        parser,
        self._kafka_consumer.ZookeperServer,
        self._kafka_consumer.Topic,
        self._db_name,
        "proxy",
        self._processes,
        batch_size,
    )
    spark_job_cmd = template.format(*values)

    # Start the spark job.
    Util.execute_cmd(spark_job_cmd, self._logger)
def ingest_file(file, pkt_num, pcap_split_staging, partition, hdfs_root_path,
                topic, kafka_servers):
    """Split a pcap with ``editcap``, load the pieces to HDFS and notify Kafka.

    The input is split into ``<pcap_split_staging>/<name>_spot.pcap`` pieces
    of ``pkt_num`` packets, and an ``rm`` command for the original file is
    then issued.  Every ``.pcap`` file under the staging directory whose name
    contains ``<name>_spot`` is loaded to
    ``<hdfs_root_path>/binary/<date>/<hour>`` and its HDFS path is sent to
    ``topic`` on ``partition``.  Date and hour are sliced from the end of the
    split file's base name.

    Any exception is logged and swallowed; nothing is returned.
    """
    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))
    try:
        # Remember the input path and derive the base name without extension.
        original_path = file
        base_name = file.split('/')[-1]
        name = base_name.split('.')[0]

        # Split the capture.
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(
            pkt_num, file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(original_path))
        Util.execute_cmd("rm {0}".format(original_path), logger)

        marker = "{0}_spot".format(name)
        for currdir, _subdirs, entries in os.walk(pcap_split_staging):
            for entry in entries:
                if not (entry.endswith(".pcap") and marker in entry):
                    continue

                # The tail of the base name is used as a timestamp:
                # 8 characters of date, 2 of hour, then 4 ignored.
                stamp = entry.split('.')[0]
                pcap_hour = stamp[-6:-4]
                pcap_date_path = stamp[-14:-6]

                # HDFS destination derived from that timestamp.
                hdfs_path = "{0}/binary/{1}/{2}".format(
                    hdfs_root_path, pcap_date_path, pcap_hour)
                Util.creat_hdfs_folder(hdfs_path, logger)

                # Load the split file to HDFS.
                hadoop_pcap_file = "{0}/{1}".format(hdfs_path, entry)
                Util.load_to_hdfs(
                    os.path.join(currdir, entry), hadoop_pcap_file, logger)

                # Publish an event so a worker picks the file up.
                logger.info(
                    "Sending split file to worker number: {0}".format(
                        partition))
                KafkaTopic.SendMessage(hadoop_pcap_file, kafka_servers, topic,
                                       partition)
                logger.info(
                    "File {0} has been successfully sent to Kafka Topic to: {1}"
                    .format(entry, topic))
    except Exception as err:
        logger.error(
            "There was a problem, please check the following error message:{0}"
            .format(err.message))
        logger.error("Exception: {0}".format(err))
def _create_topic(self):
    """Build the ``kafka_topic.sh create`` command for this topic and run it.

    The script is looked up in the directory containing this module.  It is
    given the topic name, ``<zk_server>:<zk_port>`` and the partition count.
    """
    topic = self._topic
    partition_count = self._num_of_partitions
    self._logger.info(
        "Creating topic: {0} with {1} parititions".format(
            topic, partition_count))

    # Zookeeper endpoint and location of the helper script.
    zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
    script_dir = os.path.dirname(os.path.abspath(__file__))

    create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
        script_dir, topic, zk_conf, partition_count)

    # Execute the create-topic command.
    Util.execute_cmd(create_topic_cmd, self._logger)
def _ingest_file(new_file, hdfs_root_path, producer, topic): logger = logging.getLogger('SPOT.INGEST.FLOW.{0}'.format(os.getpid())) try: # get file name and date. file_name_parts = new_file.split('/') file_name = file_name_parts[len(file_name_parts) - 1] file_date = file_name.split('.')[1] file_date_path = file_date[0:8] file_date_hour = file_date[8:10] # hdfs path with timestamp. hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, file_date_path, file_date_hour) hdfs_file = "{0}/{1}".format(hdfs_path, file_name) try: if len(hdfs.list_dir(hdfs_path)) == 0: logger.info('creating directory: ' + hdfs_path) hdfs.mkdir(hdfs_path) logger.info('uploading file to hdfs: ' + hdfs_file) result = hdfs.upload_file(hdfs_path, new_file) if not result: logger.error('File failed to upload: ' + hdfs_file) raise HdfsException else: rm_file = "rm {0}".format(new_file) logger.info( "Removing files from local staging: {0}".format(rm_file)) Util.execute_cmd(rm_file, logger) except HdfsException as err: logger.error('Exception: ' + err.exception) logger.info('Check Hdfs Connection settings and server health') except Exception as err: logger.error("There was a problem, Exception: {0}".format(err)) # create event for workers to process the file. # logger.info("Sending file to worker number: {0}".format(partition)) try: producer.SendMessage(hdfs_file, topic) logger.info( "File {0} has been successfully sent to Kafka Topic to: {1}". format(hdfs_file, topic)) except Exception as err: logger.info("File {0} failed to be sent to Kafka Topic to: {1}".format( hdfs_file, topic)) logger.error("Error: {0}".format(err))
def ingest_file(file, pkt_num, pcap_split_staging, partition, hdfs_root_path,
                topic, kafka_servers):
    """Split a pcap file, push the pieces to HDFS and publish them to Kafka.

    Steps, in order:
      1. ``editcap -c <pkt_num>`` splits ``file`` into
         ``<pcap_split_staging>/<name>_spot.pcap`` pieces.
      2. An ``rm`` command for the original file is issued.
      3. Each ``.pcap`` under the staging directory whose name contains
         ``<name>_spot`` is loaded to
         ``<hdfs_root_path>/binary/<date>/<hour>`` and its HDFS path is sent
         to ``topic`` on ``partition``.

    Exceptions are logged and swallowed; nothing is returned.
    """
    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))
    try:
        # File name and its extension-less stem.
        org_file = file
        path_parts = file.split('/')
        stem = path_parts[-1].split('.')[0]

        # Split the capture.
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(
            pkt_num, file, pcap_split_staging, stem)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)

        tag = "{0}_spot".format(stem)
        for currdir, subdir, files in os.walk(pcap_split_staging):
            split_files = [
                candidate for candidate in files
                if candidate.endswith(".pcap") and tag in candidate
            ]
            for split_file in split_files:
                # Timestamp pieces are sliced from the end of the base name.
                file_date = split_file.split('.')[0]
                pcap_date_path = file_date[-14:-6]
                pcap_hour = file_date[-6:-4]

                # Create the timestamped HDFS folder.
                hdfs_path = "{0}/binary/{1}/{2}".format(
                    hdfs_root_path, pcap_date_path, pcap_hour)
                Util.creat_hdfs_folder(hdfs_path, logger)

                # Load the split file to HDFS.
                hadoop_pcap_file = "{0}/{1}".format(hdfs_path, split_file)
                local_path = os.path.join(currdir, split_file)
                Util.load_to_hdfs(local_path, hadoop_pcap_file, logger)

                # Create the event for workers to process the file.
                logger.info(
                    "Sending split file to worker number: {0}".format(
                        partition))
                KafkaTopic.SendMessage(
                    hadoop_pcap_file, kafka_servers, topic, partition)
                logger.info(
                    "File {0} has been successfully sent to Kafka Topic to: {1}"
                    .format(split_file, topic))
    except Exception as err:
        logger.error(
            "There was a problem, please check the following error message:{0}"
            .format(err.message))
        logger.error("Exception: {0}".format(err))
def _create_topic(self):
    """Prepare partition bookkeeping and create the Kafka topic.

    Stores one ``TopicPartition`` per partition index in
    ``self._partitions`` and a ``RoundRobinPartitioner`` over them in
    ``self._partitioner``.  It then runs ``kafka_topic.sh create`` from this
    module's directory with the topic name, ``<zk_server>:<zk_port>`` and
    the partition count.
    """
    self._logger.info(
        "Creating topic: {0} with {1} parititions".format(
            self._topic, self._num_of_partitions))

    # One partition object per worker partition.
    partitions = []
    for index in range(int(self._num_of_partitions)):
        partitions.append(TopicPartition(self._topic, index))
    self._partitions = partitions

    # Partitioner over those partitions.
    self._partitioner = RoundRobinPartitioner(self._partitions)

    # Zookeeper endpoint and the helper script's directory.
    zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
    script_dir = os.path.dirname(os.path.abspath(__file__))
    create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
        script_dir, self._topic, zk_conf, self._num_of_partitions)

    # Execute the create-topic command.
    Util.execute_cmd(create_topic_cmd, self._logger)
def ingest_file(file, message_size, topic, kafka_servers):
    """Publish a file to Kafka in size-bounded messages and delete it.

    The file is read line by line into a buffer.  Once the buffer is longer
    than ``message_size`` it is sent through ``KafkaTopic.SendMessage`` (last
    argument 0) and emptied.  The leftover buffer is sent unconditionally
    after the last line.  An ``rm`` command for the file is then passed to
    ``Util.execute_cmd``.

    Exceptions are logged and swallowed; nothing is returned.
    """
    logger = logging.getLogger('SPOT.INGEST.PROXY.{0}'.format(os.getpid()))
    try:
        batch = ""
        logger.info(
            "Ingesting file: {0} process:{1}".format(file, os.getpid()))

        with open(file, "rb") as stream:
            for line in stream:
                batch = batch + line
                if len(batch) <= message_size:
                    continue
                KafkaTopic.SendMessage(batch, kafka_servers, topic, 0)
                batch = ""
            # Send the last package.
            KafkaTopic.SendMessage(batch, kafka_servers, topic, 0)

        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file, logger)
        logger.info(
            "File {0} has been successfully sent to Kafka Topic: {1}".format(
                file, topic))
    except Exception as err:
        logger.error(
            "There was a problem, please check the following error message:{0}"
            .format(err.message))
        logger.error("Exception: {0}".format(err))
def _create_topic(self):
    """Set up partitions and the partitioner, then create the Kafka topic.

    ``self._partitions`` receives a ``TopicPartition`` for every index below
    ``int(self._num_of_partitions)``, and ``self._partitioner`` a
    ``RoundRobinPartitioner`` built from that list.  The topic itself is
    created by passing a ``kafka_topic.sh create`` command (script located
    next to this module) to ``Util.execute_cmd``.
    """
    topic_name = self._topic
    self._logger.info("Creating topic: {0} with {1} parititions".format(
        topic_name, self._num_of_partitions))

    # Create partitions for the workers.
    partition_total = int(self._num_of_partitions)
    self._partitions = [
        TopicPartition(self._topic, number)
        for number in range(partition_total)
    ]

    # Create the partitioner.
    self._partitioner = RoundRobinPartitioner(self._partitions)

    # Assemble the command from the script path and its arguments.
    zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
    module_dir = os.path.dirname(os.path.abspath(__file__))
    create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
        module_dir, self._topic, zk_conf, self._num_of_partitions)

    # Execute the create-topic command.
    Util.execute_cmd(create_topic_cmd, self._logger)
def start(self):
    """Launch the Spark streaming job for this worker's Kafka topic.

    The ``spark-submit`` command is built from ``self._spark_conf`` (driver
    and executor sizing, batch size), the parser script named in
    ``self._conf["parser"]``, the consumer's Zookeeper server and topic, the
    database name and the process count.  It is then passed to
    ``Util.execute_cmd``.
    """
    self._logger.info("Creating Spark Job for topic: {0}".format(
        self._kafka_consumer.Topic))

    # Parser script.
    parser = self._conf["parser"]

    # Spark configuration values.
    driver_memory = self._spark_conf["driver_memory"]
    num_exec = self._spark_conf["spark_exec"]
    exec_memory = self._spark_conf["spark_executor_memory"]
    exec_cores = self._spark_conf["spark_executor_cores"]
    batch_size = self._spark_conf["spark_batch_size"]

    # The jars live two directory levels above the script path.
    jar_path = os.path.dirname(os.path.dirname(self._script_path))

    # Command fragments; joined with single spaces into the format template.
    fragments = [
        "spark-submit --master yarn",
        "--driver-memory {0}",
        "--num-executors {1}",
        "--conf spark.executor.memory={2}",
        "--conf spark.executor.cores={3}",
        "--jars {4}/common/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar",
        "{5}/{6}",
        "-zk {7}",
        "-t {8}",
        "-db {9}",
        "-dt {10}",
        "-w {11}",
        "-bs {12}",
    ]
    spark_job_cmd = " ".join(fragments).format(
        driver_memory, num_exec, exec_memory, exec_cores, jar_path,
        self._script_path, parser, self._kafka_consumer.ZookeperServer,
        self._kafka_consumer.Topic, self._db_name, "proxy", self._processes,
        batch_size)

    # Start the spark job.
    Util.execute_cmd(spark_job_cmd, self._logger)
def _ingest_file(hdfs_client, new_file, pkt_num, pcap_split_staging,
                 hdfs_root_path, producer, topic):
    """Split a pcap, upload each piece to HDFS and publish its path to Kafka.

    Phase 1 (errors logged, then execution continues): ``editcap`` splits
    ``new_file`` into ``<pcap_split_staging>/<name>_spot.pcap`` pieces and an
    ``rm`` command for the original is issued.

    Phase 2: every ``.pcap`` under the staging directory whose name contains
    ``<name>_spot`` is uploaded to ``<hdfs_root_path>/binary/<date>/<hour>``
    (the directory is created when ``hdfs.list_dir`` reports it empty) and
    its HDFS path is sent with ``producer.SendMessage``.  Failures for one
    file are logged and the loop moves on to the next.
    """
    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))

    try:
        # Base name of the capture, without extension.
        org_file = new_file
        file_name = new_file.rsplit('/', 1)[-1]
        name = file_name.split('.')[0]

        # Split the capture.
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(
            pkt_num, new_file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(org_file))
        Util.execute_cmd("rm {0}".format(org_file), logger)
    except Exception as err:
        logger.error("There was a problem splitting the file: {0}".format(
            err.message))
        logger.error("Exception: {0}".format(err))

    for currdir, _subdirs, entries in os.walk(pcap_split_staging):
        for entry in entries:
            # ``name`` is looked up only for .pcap entries (short-circuit).
            if not (entry.endswith(".pcap")
                    and "{0}_spot".format(name) in entry):
                continue

            # Timestamp pieces are sliced from the end of the base name.
            stamp = entry.split('.')[0]
            pcap_hour = stamp[-6:-4]
            pcap_date_path = stamp[-14:-6]

            # HDFS path with timestamp.
            hdfs_path = "{0}/binary/{1}/{2}".format(
                hdfs_root_path, pcap_date_path, pcap_hour)

            try:
                # Create the HDFS path when the listing comes back empty.
                if len(hdfs.list_dir(hdfs_path, hdfs_client)) == 0:
                    logger.info('creating directory: ' + hdfs_path)
                    hdfs_client.mkdir(hdfs_path, hdfs_client)

                # Load the file to HDFS.
                hadoop_pcap_file = "{0}/{1}".format(hdfs_path, entry)
                local_path = os.path.join(currdir, entry)
                result = hdfs_client.upload_file(hadoop_pcap_file, local_path)
                if not result:
                    logger.error(
                        'File failed to upload: ' + hadoop_pcap_file)
                    raise HdfsException

                # Create the event for workers to process the file.
                logger.info(
                    "Sending split file to Topic: {0}".format(topic))
                producer.SendMessage(hadoop_pcap_file, topic)
                logger.info(
                    "File {0} has been successfully sent to Kafka Topic to: {1}"
                    .format(entry, topic))
            except HdfsException as err:
                logger.error('Exception: ' + err.exception)
                logger.info(
                    'Check Hdfs Connection settings and server health')
            except Exception as err:
                logger.info(
                    "File {0} failed to be sent to Kafka Topic to: {1}".
                    format(new_file, topic))
                logger.error("Error: {0}".format(err))
def _process_new_file(self, nf):
    """Download a flow capture from HDFS, convert it to CSV and load it.

    ``nf`` is the HDFS path of the capture.  The second dot-separated field
    of its file name supplies the year, month, day and hour used as the
    partition of ``<db>.flow``.

    Steps: download the file to local staging, run ``nfdump -o csv`` on it,
    upload the CSV to a fresh HDFS staging folder, rebuild the external
    ``<db>.flow_tmp`` table over that folder, ``INSERT`` from it into
    ``<db>.flow``, then remove the HDFS staging folder and the local files.

    If the file does not exist in HDFS the method logs that and returns
    without running any of the processing steps.
    """
    # Get file name and date.
    file_name = nf.split('/')[-1]
    # Directory part of the path, trailing separator included.  This is an
    # explicit suffix cut; ``rstrip`` would treat the name as a character set.
    nf_path = nf[:len(nf) - len(file_name)]
    flow_date = file_name.split('.')[1]
    flow_year = flow_date[0:4]
    flow_month = flow_date[4:6]
    flow_day = flow_date[6:8]
    flow_hour = flow_date[8:10]

    # Get the file from HDFS; nothing can be processed without it.
    if not hdfs.file_exists(nf_path, file_name):
        self._logger.info("file: {0} not found".format(nf))
        return
    self._logger.info("Getting file from hdfs: {0}".format(nf))
    hdfs.download_file(nf, self._local_staging)

    # Build and run the process command.
    csv_path = "{0}{1}.csv".format(self._local_staging, file_name)
    process_cmd = "nfdump -o csv -r {0}{1} {2} > {3}".format(
        self._local_staging, file_name, self._process_opt, csv_path)
    self._logger.info("Processing file: {0}".format(process_cmd))
    Util.execute_cmd(process_cmd, self._logger)

    # Create the HDFS staging folder, named from minute, second and the
    # first two microsecond digits.
    hdfs_path = "{0}/flow".format(self._hdfs_app_path)
    staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path, staging_timestamp)
    self._logger.info("Creating staging: {0}".format(hdfs_staging_path))
    hdfs.mkdir(hdfs_staging_path)

    # Move the CSV to staging.
    self._logger.info(
        "Moving data to staging: {0}".format(hdfs_staging_path))
    hdfs.upload_file(hdfs_staging_path, csv_path)

    # Load with impyla: rebuild the temporary external table.
    drop_table = "DROP TABLE IF EXISTS {0}.flow_tmp".format(self._db_name)
    self._logger.info("Dropping temp table: {0}".format(drop_table))
    self._cursor.execute_query(drop_table)

    # Literal braces of the embedded schema are doubled for str.format.
    create_external = (
        "\n"
        "CREATE EXTERNAL TABLE {0}.flow_tmp (\n"
        " treceived STRING,\n"
        " tryear INT,\n"
        " trmonth INT,\n"
        " trday INT,\n"
        " trhour INT,\n"
        " trminute INT,\n"
        " trsec INT,\n"
        " tdur FLOAT,\n"
        " sip STRING,\n"
        " dip STRING,\n"
        " sport INT,\n"
        " dport INT,\n"
        " proto STRING,\n"
        " flag STRING,\n"
        " fwd INT,\n"
        " stos INT,\n"
        " ipkt BIGINT,\n"
        " ibyt BIGINT,\n"
        " opkt BIGINT,\n"
        " obyt BIGINT,\n"
        " input INT,\n"
        " output INT,\n"
        " sas INT,\n"
        " das INT,\n"
        " dtos INT,\n"
        " dir INT,\n"
        " rip STRING\n"
        " )\n"
        " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n"
        " STORED AS TEXTFILE\n"
        " LOCATION '{1}'\n"
        " TBLPROPERTIES ('avro.schema.literal'='{{\n"
        " \"type\": \"record\"\n"
        " , \"name\": \"RawFlowRecord\"\n"
        " , \"namespace\" : \"com.cloudera.accelerators.flows.avro\"\n"
        " , \"fields\": [\n"
        " {{\"name\": \"treceived\", \"type\":[\"string\", \"null\"]}}\n"
        " , {{\"name\": \"tryear\", \"type\":[\"float\", \"null\"]}}\n"
        " , {{\"name\": \"trmonth\", \"type\":[\"float\", \"null\"]}}\n"
        " , {{\"name\": \"trday\", \"type\":[\"float\", \"null\"]}}\n"
        " , {{\"name\": \"trhour\", \"type\":[\"float\", \"null\"]}}\n"
        " , {{\"name\": \"trminute\", \"type\":[\"float\", \"null\"]}}\n"
        " , {{\"name\": \"trsec\", \"type\":[\"float\", \"null\"]}}\n"
        " , {{\"name\": \"tdur\", \"type\":[\"float\", \"null\"]}}\n"
        " , {{\"name\": \"sip\", \"type\":[\"string\", \"null\"]}}\n"
        " , {{\"name\": \"sport\", \"type\":[\"int\", \"null\"]}}\n"
        " , {{\"name\": \"dip\", \"type\":[\"string\", \"null\"]}}\n"
        " , {{\"name\": \"dport\", \"type\":[\"int\", \"null\"]}}\n"
        " , {{\"name\": \"proto\", \"type\":[\"string\", \"null\"]}}\n"
        " , {{\"name\": \"flag\", \"type\":[\"string\", \"null\"]}}\n"
        " , {{\"name\": \"fwd\", \"type\":[\"int\", \"null\"]}}\n"
        " , {{\"name\": \"stos\", \"type\":[\"int\", \"null\"]}}\n"
        " , {{\"name\": \"ipkt\", \"type\":[\"bigint\", \"null\"]}}\n"
        " , {{\"name\": \"ibyt\", \"type\":[\"bigint\", \"null\"]}}\n"
        " , {{\"name\": \"opkt\", \"type\":[\"bigint\", \"null\"]}}\n"
        " , {{\"name\": \"obyt\", \"type\":[\"bigint\", \"null\"]}}\n"
        " , {{\"name\": \"input\", \"type\":[\"int\", \"null\"]}}\n"
        " , {{\"name\": \"output\", \"type\":[\"int\", \"null\"]}}\n"
        " , {{\"name\": \"sas\", \"type\":[\"int\", \"null\"]}}\n"
        " , {{\"name\": \"das\", \"type\":[\"int\", \"null\"]}}\n"
        " , {{\"name\": \"dtos\", \"type\":[\"int\", \"null\"]}}\n"
        " , {{\"name\": \"dir\", \"type\":[\"int\", \"null\"]}}\n"
        " , {{\"name\": \"rip\", \"type\":[\"string\", \"null\"]}}\n"
        " ]\n"
        "}}')\n").format(self._db_name, hdfs_staging_path)
    self._logger.info(
        "Creating external table: {0}".format(create_external))
    self._cursor.execute_query(create_external)

    # Copy the staged rows into the partitioned flow table.
    insert_into_table = (
        " INSERT INTO TABLE {0}.flow PARTITION (y={1}, m={2}, d={3}, h={4}) "
        "SELECT treceived, unix_timestamp(treceived) AS unix_tstamp, tryear, "
        "trmonth, trday, trhour, trminute, trsec, tdur, sip, dip, sport, "
        "dport, proto, flag, fwd, stos, ipkt, ibyt, opkt, obyt, input, "
        "output, sas, das, dtos, dir, rip FROM {0}.flow_tmp "
    ).format(self._db_name, flow_year, flow_month, flow_day, flow_hour)
    self._logger.info("Loading data to {0}: {1}".format(
        self._db_name, insert_into_table))
    self._cursor.execute_query(insert_into_table)

    # Remove the HDFS staging folder.
    self._logger.info(
        "Removing staging path: {0}".format(hdfs_staging_path))
    hdfs.delete_folder(hdfs_staging_path)

    # Remove the downloaded capture and the generated CSV from local staging.
    rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
    self._logger.info(
        "Removing files from local staging: {0}".format(rm_local_staging))
    Util.execute_cmd(rm_local_staging, self._logger)

    rm_local_staging = "rm {0}".format(csv_path)
    self._logger.info(
        "Removing files from local staging: {0}".format(rm_local_staging))
    Util.execute_cmd(rm_local_staging, self._logger)

    self._logger.info(
        "File {0} was successfully processed.".format(file_name))
def _process_new_file(self, nf):
    """Download a DNS pcap from HDFS, convert it with tshark and load it.

    ``nf`` is the HDFS path of the capture.  Its last three path components
    are read as ``<YYYYMMDD>/<HH>/<file name>`` and supply the partition of
    ``<db>.dns``.

    Steps: download the file to local staging, run ``tshark`` on it writing
    a CSV next to it, upload the CSV to a fresh HDFS staging folder, rebuild
    the external ``<db>.dns_tmp`` table over that folder, ``INSERT`` from it
    into ``<db>.dns``, then remove the HDFS staging folder and the local
    capture.

    If the file does not exist in HDFS the method logs that and returns
    without running any of the processing steps.
    """
    # Get the file from HDFS; nothing can be processed without it.
    # NOTE(review): confirm the expected signature of hdfs.file_exists.
    self._logger.info("Getting file from hdfs: {0}".format(nf))
    if not hdfs.file_exists(nf):
        self._logger.info("file: {0} not found".format(nf))
        return
    hdfs.download_file(nf, self._local_staging)

    # Get file name and date from the path.  The parameter is ``nf``; the
    # previous code split the unrelated name ``file``.
    file_name_parts = nf.split('/')
    part_count = len(file_name_parts)
    file_name = file_name_parts[part_count - 1]
    binary_hour = file_name_parts[part_count - 2]
    binary_date_path = file_name_parts[part_count - 3]
    binary_year = binary_date_path[0:4]
    binary_month = binary_date_path[4:6]
    binary_day = binary_date_path[6:8]

    # Build and run the process command.
    process_cmd = "tshark -r {0}{1} {2} > {0}{1}.csv".format(
        self._local_staging, file_name, self._process_opt)
    self._logger.info("Processing file: {0}".format(process_cmd))
    Util.execute_cmd(process_cmd, self._logger)

    # Create the HDFS staging folder, named from minute, second and the
    # first two microsecond digits.
    hdfs_path = "{0}/dns".format(self._hdfs_app_path)
    staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path, staging_timestamp)
    self._logger.info("Creating staging: {0}".format(hdfs_staging_path))
    hdfs.mkdir(hdfs_staging_path)

    # Move the CSV to staging.
    local_file = "{0}{1}.csv".format(self._local_staging, file_name)
    self._logger.info(
        "Moving data to staging: {0}".format(hdfs_staging_path))
    hdfs.upload_file(hdfs_staging_path, local_file)

    # Rebuild the temporary external table.
    drop_table = 'DROP TABLE IF EXISTS {0}.dns_tmp'.format(self._db_name)
    self._cursor.execute(drop_table)

    # Every literal brace of the embedded schema must be doubled, otherwise
    # str.format raises ValueError on the single '}'.
    create_external = (
        "\n"
        "CREATE EXTERNAL TABLE {0}.dns_tmp (\n"
        " frame_day STRING,\n"
        " frame_time STRING,\n"
        " unix_tstamp BIGINT,\n"
        " frame_len INT,\n"
        " ip_src STRING,\n"
        " ip_dst STRING,\n"
        " dns_qry_name STRING,\n"
        " dns_qry_type INT,\n"
        " dns_qry_class STRING,\n"
        " dns_qry_rcode INT,\n"
        " dns_a STRING \n"
        " )\n"
        " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n"
        " STORED AS TEXTFILE\n"
        " LOCATION '{1}'\n"
        " TBLPROPERTIES ('avro.schema.literal'='{{\n"
        " \"type\": \"record\"\n"
        " , \"name\": \"RawDnsRecord\"\n"
        " , \"namespace\" : \"com.cloudera.accelerators.dns.avro\"\n"
        " , \"fields\": [\n"
        " {{\"name\": \"frame_day\", \"type\":[\"string\", \"null\"]}}\n"
        " , {{\"name\": \"frame_time\", \"type\":[\"string\", \"null\"]}}\n"
        " , {{\"name\": \"unix_tstamp\", \"type\":[\"bigint\", \"null\"]}}\n"
        " , {{\"name\": \"frame_len\", \"type\":[\"int\", \"null\"]}}\n"
        " , {{\"name\": \"ip_src\", \"type\":[\"string\", \"null\"]}}\n"
        " , {{\"name\": \"ip_dst\", \"type\":[\"string\", \"null\"]}}\n"
        " , {{\"name\": \"dns_qry_name\", \"type\":[\"string\", \"null\"]}}\n"
        " , {{\"name\": \"dns_qry_type\", \"type\":[\"int\", \"null\"]}}\n"
        " , {{\"name\": \"dns_qry_class\", \"type\":[\"string\", \"null\"]}}\n"
        " , {{\"name\": \"dns_qry_rcode\", \"type\":[\"int\", \"null\"]}}\n"
        " , {{\"name\": \"dns_a\", \"type\":[\"string\", \"null\"]}}\n"
        " ]\n"
        "}}')\n").format(self._db_name, hdfs_staging_path)
    self._logger.info(
        "Creating external table: {0}".format(create_external))
    self._cursor.execute(create_external)

    # Insert the staged rows into the partitioned dns table.  The hour
    # placeholder was previously written as "{4)", which str.format rejects.
    insert_into_table = (
        " INSERT INTO TABLE {0}.dns PARTITION (y={1}, m={2}, d={3}, h={4}) "
        "SELECT CONCAT(frame_day , frame_time) as treceived, unix_tstamp, "
        "frame_len, ip_dst, ip_src, dns_qry_name, dns_qry_class,"
        "dns_qry_type, dns_qry_rcode, dns_a FROM {0}.dns_tmp "
    ).format(self._db_name, binary_year, binary_month, binary_day,
             binary_hour)
    self._logger.info("Loading data to {0}: {1}".format(
        self._db_name, insert_into_table))
    self._cursor.execute(insert_into_table)

    # Remove the HDFS staging folder.
    self._logger.info(
        "Removing staging path: {0}".format(hdfs_staging_path))
    hdfs.delete_folder(hdfs_staging_path)

    # Remove the downloaded capture from local staging.
    rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
    self._logger.info(
        "Removing files from local staging: {0}".format(rm_local_staging))
    Util.execute_cmd(rm_local_staging, self._logger)

    self._logger.info(
        "File {0} was successfully processed.".format(file_name))
def _process_new_file(self, file):
    """Fetch a flow capture from HDFS, convert it to CSV and load it via hive.

    ``file`` is the HDFS path.  The second dot-separated field of its file
    name supplies year, month, day and hour for the hive load.

    Steps: ``hadoop fs -get`` into local staging, ``nfdump -o csv`` into a
    ``.csv`` beside it, create an HDFS staging folder, move the CSV there,
    run ``pipelines/flow/load_flow_avro_parquet.hql`` through ``hive``, then
    remove the HDFS staging folder and the local capture.
    """
    log = self._logger

    # Get the file from HDFS.
    get_file_cmd = "hadoop fs -get {0} {1}.".format(
        file, self._local_staging)
    log.info("Getting file from hdfs: {0}".format(get_file_cmd))
    Util.execute_cmd(get_file_cmd, log)

    # File name and the date pieces encoded in it.
    file_name = file.split('/')[-1]
    flow_date = file_name.split('.')[1]
    flow_year, flow_month = flow_date[0:4], flow_date[4:6]
    flow_day, flow_hour = flow_date[6:8], flow_date[8:10]

    # Convert the capture to CSV.
    process_cmd = "nfdump -o csv -r {0}{1} {2} > {0}{1}.csv".format(
        self._local_staging, file_name, self._process_opt)
    log.info("Processing file: {0}".format(process_cmd))
    Util.execute_cmd(process_cmd, log)

    # Create the HDFS staging folder, named from minute, second and the
    # first two microsecond digits.
    staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    hdfs_staging_path = "{0}/stage/{1}".format(
        "{0}/flow".format(self._hdfs_app_path), staging_timestamp)
    create_staging_cmd = "hadoop fs -mkdir -p {0}".format(hdfs_staging_path)
    log.info("Creating staging: {0}".format(create_staging_cmd))
    Util.execute_cmd(create_staging_cmd, log)

    # Move the CSV to staging.  This step calls subprocess directly instead
    # of going through Util.execute_cmd.
    mv_to_staging = "hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(
        self._local_staging, file_name, hdfs_staging_path)
    log.info("Moving data to staging: {0}".format(mv_to_staging))
    subprocess.call(mv_to_staging, shell=True)

    # Load to avro.
    load_to_avro_cmd = (
        "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} "
        "-hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' "
        "-f pipelines/flow/load_flow_avro_parquet.hql"
    ).format(self._db_name, flow_year, flow_month, flow_day, flow_hour,
             hdfs_staging_path)
    log.info("Loading data to hive: {0}".format(load_to_avro_cmd))
    Util.execute_cmd(load_to_avro_cmd, log)

    # Remove the HDFS staging folder.
    rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(
        hdfs_staging_path)
    log.info("Removing staging path: {0}".format(rm_hdfs_staging_cmd))
    Util.execute_cmd(rm_hdfs_staging_cmd, log)

    # Remove the capture from local staging.
    rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
    log.info(
        "Removing files from local staging: {0}".format(rm_local_staging))
    Util.execute_cmd(rm_local_staging, log)

    log.info("File {0} was successfully processed.".format(file_name))
def _process_new_file(self, file):
    """Fetch a DNS pcap from HDFS, convert it with tshark and load it via hive.

    ``file`` is the HDFS path.  Its last three components are read as
    ``<YYYYMMDD>/<HH>/<file name>`` and supply year, month, day and hour for
    the hive load.

    Steps: ``hadoop fs -get`` into local staging, ``tshark`` into a ``.csv``
    beside it, create an HDFS staging folder, move the CSV there, run
    ``pipelines/dns/load_dns_avro_parquet.hql`` through ``hive``, then remove
    the HDFS staging folder and the local capture.
    """

    def run(log_template, command):
        # Log the command under the step's message, then execute it.
        self._logger.info(log_template.format(command))
        Util.execute_cmd(command, self._logger)

    # Get the file from HDFS.
    run("Getting file from hdfs: {0}",
        "hadoop fs -get {0} {1}.".format(file, self._local_staging))

    # File name, hour and date come from the last three path components.
    parts = file.split('/')
    count = len(parts)
    file_name = parts[count - 1]
    binary_hour = parts[count - 2]
    binary_date_path = parts[count - 3]
    binary_year = binary_date_path[0:4]
    binary_month = binary_date_path[4:6]
    binary_day = binary_date_path[6:8]

    # Convert the capture to CSV.
    run("Processing file: {0}",
        "tshark -r {0}{1} {2} > {0}{1}.csv".format(
            self._local_staging, file_name, self._process_opt))

    # Create the HDFS staging folder, named from minute, second and the
    # first two microsecond digits.
    hdfs_path = "{0}/dns".format(self._hdfs_app_path)
    staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path, staging_timestamp)
    run("Creating staging: {0}",
        "hadoop fs -mkdir -p {0}".format(hdfs_staging_path))

    # Move the CSV to staging.
    run("Moving data to staging: {0}",
        "hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(
            self._local_staging, file_name, hdfs_staging_path))

    # Load to avro.
    hive_template = (
        "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} "
        "-hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' "
        "-f pipelines/dns/load_dns_avro_parquet.hql")
    run("Loading data to hive: {0}",
        hive_template.format(self._db_name, binary_year, binary_month,
                             binary_day, binary_hour, hdfs_staging_path))

    # Remove the HDFS staging folder.
    run("Removing staging path: {0}",
        "hadoop fs -rm -R -skipTrash {0}".format(hdfs_staging_path))

    # Remove the capture from local staging.
    run("Removing files from local staging: {0}",
        "rm {0}{1}".format(self._local_staging, file_name))

    self._logger.info(
        "File {0} was successfully processed.".format(file_name))
def authenticate(self):
    """Pass ``self._kinit_cmd`` to ``Util.execute_cmd`` and log completion.

    The "ticket obtained" message is logged whenever ``execute_cmd``
    returns; this method does not itself inspect the command's outcome.
    """
    kinit_cmd = self._kinit_cmd
    logger = self._logger
    Util.execute_cmd(kinit_cmd, logger)
    logger.info("Kerberos ticket obtained")