def write_to_file(self, data_dict, output_file_path):
    """write_to_file

    :param data_dict: dictionary to serialize as pretty-printed json
    :param output_file_path: path to the output file
    """
    log.info("saving={}".format(output_file_path))
    with open(output_file_path, "w") as output_file:
        output_file.write(str(ppj(data_dict)))
def build_csv(
        pipeline_files=[],
        fulldata_file=None,
        clean_file=None,
        post_proc_rules=None,
        label_rules=None,
        metadata_filename="metadata.json"):
    """build_csv

    :param pipeline_files: files to process
    :param fulldata_file: output all columns to this csv file
    :param clean_file: output all numeric-ready columns to this csv file
    :param post_proc_rules: rules after building the DataFrame
    :param label_rules: labeling rules
    :param metadata_filename: filename to use for the output metadata json
    """

    save_node = {
        "status": INVALID,
        "pipeline_files": pipeline_files,
        "post_proc_rules": post_proc_rules,
        "label_rules": label_rules,
        "fulldata_file": fulldata_file,
        "fulldata_metadata_file": None,
        "clean_file": clean_file,
        "clean_metadata_file": None,
        "features_to_process": [],
        "feature_to_predict": None,
        "ignore_features": [],
        "df_json": {}
    }

    if not fulldata_file:
        log.error("missing fulldata_file - stopping")
        save_node["status"] = INVALID
        return save_node
    if not clean_file:
        log.error("missing clean_file - stopping")
        save_node["status"] = INVALID
        return save_node

    log.info("build_csv - START")

    common_headers, \
        headers_dict = find_all_headers(
            pipeline_files=pipeline_files)

    log.info(("num common_headers={} headers={}")
             .format(len(common_headers),
                     common_headers))

    # since the headers can be different we rebuild a new one:
    hdrs = {}
    for h in common_headers:
        hdrs[h] = None

    features_to_process = []
    feature_to_predict = None
    ignore_features = []

    set_if_above = None
    labels = []
    label_values = []
    if label_rules:
        set_if_above = label_rules["set_if_above"]
        labels = label_rules["labels"]
        label_values = label_rules["label_values"]

    all_rows = []
    num_done = 0
    total_files = len(pipeline_files)
    for c in pipeline_files:
        log.info(("merging={}/{} csv={}")
                 .format(num_done,
                         total_files,
                         c))
        cf = pd.read_csv(c)
        log.info((" processing rows={}")
                 .format(len(cf.index)))
        for index, row in cf.iterrows():
            valid_row = True
            new_row = copy.deepcopy(hdrs)
            new_row["src_file"] = c
            for k in hdrs:
                if k in row:
                    new_row[k] = row[k]
            # end of for all headers to copy in

            if label_rules:
                test_rand = random.randint(0, 100)
                if test_rand > set_if_above:
                    new_row["label_value"] = label_values[1]
                    new_row["label_name"] = labels[1]
                else:
                    new_row["label_value"] = label_values[0]
                    new_row["label_name"] = labels[0]
            # end of applying label rules

            if valid_row:
                all_rows.append(new_row)
        # end of for all rows in this file
        num_done += 1
    # end of building all files into one list

    log.info(("fulldata rows={} generating df")
             .format(len(all_rows)))

    df = pd.DataFrame(all_rows)
    log.info(("df rows={} headers={}")
             .format(len(df.index),
                     df.columns.values))

    if ev("CONVERT_DF", "0") == "1":
        log.info("converting df to json")
        save_node["df_json"] = df.to_json()

    if clean_file:
        log.info(("writing fulldata_file={}")
                 .format(fulldata_file))
        df.to_csv(fulldata_file,
                  sep=',',
                  encoding='utf-8',
                  index=False)
        log.info(("done writing fulldata_file={}")
                 .format(fulldata_file))

        if post_proc_rules:

            clean_metadata_file = ""

            feature_to_predict = "label_name"
            features_to_process = []
            ignore_features = []
            if label_rules:
                ignore_features = [feature_to_predict]

            if "drop_columns" in post_proc_rules:
                for p in post_proc_rules["drop_columns"]:
                    if p in headers_dict:
                        ignore_features.append(p)

                # post proc - filter more features out
                # for non-int/float types
                for d in df.columns.values:
                    add_this_one = True
                    for i in ignore_features:
                        if d == i:
                            add_this_one = False
                            break
                    if add_this_one:
                        features_to_process.append(d)
                # for all df columns we're not ignoring...
                # add them as features to process

                fulldata_metadata_file = "{}/fulldata_{}".format(
                    "/".join(fulldata_file.split("/")[:-1]),
                    metadata_filename)
                log.info(("writing fulldata metadata file={}")
                         .format(fulldata_metadata_file))
                header_data = {"headers": list(df.columns.values),
                               "output_type": "fulldata",
                               "pipeline_files": pipeline_files,
                               "post_proc_rules": post_proc_rules,
                               "label_rules": label_rules,
                               "features_to_process": features_to_process,
                               "feature_to_predict": feature_to_predict,
                               "ignore_features": ignore_features,
                               "created": rnow()}
                with open(fulldata_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))

                keep_these = features_to_process
                keep_these.append(feature_to_predict)

                log.info(("creating new clean_file={} "
                          "keep_these={} "
                          "predict={}")
                         .format(clean_file,
                                 keep_these,
                                 feature_to_predict))

                # need to remove all columns that are all nan
                clean_df = df[keep_these].dropna(
                    axis=1, how='all').dropna()

                cleaned_features = clean_df.columns.values
                cleaned_to_process = []
                cleaned_ignore_features = []
                for c in cleaned_features:
                    if c == feature_to_predict:
                        cleaned_ignore_features.append(c)
                    else:
                        keep_it = True
                        for ign in ignore_features:
                            if c == ign:
                                cleaned_ignore_features.append(c)
                                keep_it = False
                                break
                        # end of for all features to remove
                        if keep_it:
                            cleaned_to_process.append(c)
                # end of new feature columns

                log.info(("writing DROPPED clean_file={} "
                          "features_to_process={} "
                          "ignore_features={} "
                          "predict={}")
                         .format(clean_file,
                                 cleaned_to_process,
                                 cleaned_ignore_features,
                                 feature_to_predict))

                write_clean_df = clean_df.drop(
                    columns=cleaned_ignore_features)
                log.info(("cleaned_df rows={}")
                         .format(len(write_clean_df.index)))
                write_clean_df.to_csv(clean_file,
                                      sep=',',
                                      encoding='utf-8',
                                      index=False)

                clean_metadata_file = "{}/cleaned_{}".format(
                    "/".join(clean_file.split("/")[:-1]),
                    metadata_filename)
                log.info(("writing clean metadata file={}")
                         .format(clean_metadata_file))
                header_data = {"headers": list(write_clean_df.columns.values),
                               "output_type": "clean",
                               "pipeline_files": pipeline_files,
                               "post_proc_rules": post_proc_rules,
                               "label_rules": label_rules,
                               "features_to_process": cleaned_to_process,
                               "feature_to_predict": feature_to_predict,
                               "ignore_features": cleaned_ignore_features,
                               "created": rnow()}
                with open(clean_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))

            else:
                for d in df.columns.values:
                    add_this_one = True
                    for i in ignore_features:
                        if d == i:
                            add_this_one = False
                            break
                    if add_this_one:
                        features_to_process.append(d)
                # for all df columns we're not ignoring...
                # add them as features to process

                fulldata_metadata_file = "{}/fulldata_{}".format(
                    "/".join(fulldata_file.split("/")[:-1]),
                    metadata_filename)
                log.info(("writing fulldata metadata file={}")
                         .format(fulldata_metadata_file))
                header_data = {"headers": list(df.columns.values),
                               "output_type": "fulldata",
                               "pipeline_files": pipeline_files,
                               "post_proc_rules": post_proc_rules,
                               "label_rules": label_rules,
                               "features_to_process": features_to_process,
                               "feature_to_predict": feature_to_predict,
                               "ignore_features": ignore_features,
                               "created": rnow()}
                with open(fulldata_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))

                keep_these = features_to_process
                keep_these.append(feature_to_predict)

                log.info(("creating new clean_file={} "
                          "keep_these={} "
                          "predict={}")
                         .format(clean_file,
                                 keep_these,
                                 feature_to_predict))

                # need to remove all columns that are all nan
                clean_df = df[keep_these].dropna(
                    axis=1, how='all').dropna()

                cleaned_features = clean_df.columns.values
                cleaned_to_process = []
                cleaned_ignore_features = []
                for c in cleaned_features:
                    if c == feature_to_predict:
                        cleaned_ignore_features.append(c)
                    else:
                        keep_it = True
                        for ign in ignore_features:
                            if c == ign:
                                cleaned_ignore_features.append(c)
                                keep_it = False
                                break
                        # end of for all features to remove
                        if keep_it:
                            cleaned_to_process.append(c)
                # end of new feature columns

                log.info(("writing DROPPED clean_file={} "
                          "features_to_process={} "
                          "ignore_features={} "
                          "predict={}")
                         .format(clean_file,
                                 cleaned_to_process,
                                 cleaned_ignore_features,
                                 feature_to_predict))

                write_clean_df = clean_df.drop(
                    columns=cleaned_ignore_features)
                log.info(("cleaned_df rows={}")
                         .format(len(write_clean_df.index)))
                write_clean_df.to_csv(clean_file,
                                      sep=',',
                                      encoding='utf-8',
                                      index=False)

                clean_metadata_file = "{}/cleaned_{}".format(
                    "/".join(clean_file.split("/")[:-1]),
                    metadata_filename)
                log.info(("writing clean metadata file={}")
                         .format(clean_metadata_file))
                header_data = {"headers": list(write_clean_df.columns.values),
                               "output_type": "clean",
                               "pipeline_files": pipeline_files,
                               "post_proc_rules": post_proc_rules,
                               "label_rules": label_rules,
                               "features_to_process": cleaned_to_process,
                               "feature_to_predict": feature_to_predict,
                               "ignore_features": cleaned_ignore_features,
                               "created": rnow()}
                with open(clean_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))
            # end of if/else

            save_node["clean_file"] = clean_file
            save_node["clean_metadata_file"] = clean_metadata_file
            log.info(("done writing clean_file={}")
                     .format(clean_file))
        # end of post_proc_rules

        save_node["fulldata_file"] = fulldata_file
        save_node["fulldata_metadata_file"] = fulldata_metadata_file
        save_node["status"] = VALID
    # end of writing the file

    save_node["features_to_process"] = features_to_process
    save_node["feature_to_predict"] = feature_to_predict
    save_node["ignore_features"] = ignore_features

    log.info("build_csv - END")
    return save_node
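# A minimal usage sketch (not called by the pipeline): build_csv invoked
# directly against two tiny csv files written to a temporary directory.
# The column names, rules and output paths below are placeholder values,
# and it assumes find_all_headers can read headers from these file paths;
# prepare_new_dataset below shows the environment-driven version the
# project actually uses.
def _sketch_build_csv_usage():
    """illustrative sketch only - not part of the dataset pipeline"""
    import tempfile
    out_dir = tempfile.mkdtemp()
    first_csv = "{}/first.csv".format(out_dir)
    second_csv = "{}/second.csv".format(out_dir)
    # two toy capture csvs sharing the same headers
    pd.DataFrame([{"tcp_sport": 52000, "tcp_dport": 80}]).to_csv(
        first_csv, index=False)
    pd.DataFrame([{"tcp_sport": 53000, "tcp_dport": 443}]).to_csv(
        second_csv, index=False)
    save_node = build_csv(
        pipeline_files=[first_csv, second_csv],
        fulldata_file="{}/fulldata.csv".format(out_dir),
        clean_file="{}/cleaned.csv".format(out_dir),
        post_proc_rules={"drop_columns": ["src_file"]},
        label_rules={
            "set_if_above": 85,
            "labels": ["not_attack", "attack"],
            "label_values": [0, 1]})
    return save_node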
def prepare_new_dataset():
    """prepare_new_dataset"""

    clean_dir = ev(
        "OUTPUT_DIR",
        "/tmp")
    clean_file = ev(
        "CLEANED_FILE",
        "{}/cleaned_attack_scans.csv".format(
            clean_dir))
    fulldata_file = ev(
        "FULLDATA_FILE",
        "{}/fulldata_attack_scans.csv".format(
            clean_dir))
    dataset_dir = ev(
        "DS_DIR",
        "/opt/antinex/datasets")
    csv_glob_path = ev(
        "DS_GLOB_PATH",
        "{}/*/*.csv".format(
            dataset_dir))

    pipeline_files = find_all_pipeline_csvs(
        csv_glob_path=csv_glob_path)

    post_proc_rules = {
        "drop_columns": [
            "src_file",
            "raw_id",
            "raw_load",
            "raw_hex_load",
            "raw_hex_field_load",
            "pad_load",
            "eth_dst",  # need to make this an int
            "eth_src",  # need to make this an int
            "ip_dst",   # need to make this an int
            "ip_src"    # need to make this an int
        ],
        "predict_feature": "label_name"
    }

    label_rules = {
        "set_if_above": 85,
        "labels": ["not_attack", "attack"],
        "label_values": [0, 1]
    }

    log.info("building csv")

    save_node = build_csv(
        pipeline_files=pipeline_files,
        fulldata_file=fulldata_file,
        clean_file=clean_file,
        post_proc_rules=post_proc_rules,
        label_rules=label_rules)

    if save_node["status"] == VALID:
        log.info("Successfully processed datasets:")

        if ev("SHOW_SUMMARY", "1") == "1":
            log.info(("Full csv: {}")
                     .format(save_node["fulldata_file"]))
            log.info(("Full meta: {}")
                     .format(save_node["fulldata_metadata_file"]))
            log.info(("Clean csv: {}")
                     .format(save_node["clean_file"]))
            log.info(("Clean meta: {}")
                     .format(save_node["clean_metadata_file"]))
            log.info("------------------------------------------")
            log.info(("Predicting Feature: {}")
                     .format(save_node["feature_to_predict"]))
            log.info(("Features to Process: {}")
                     .format(ppj(save_node["features_to_process"])))
            log.info(("Ignored Features: {}")
                     .format(ppj(save_node["ignore_features"])))
            log.info("------------------------------------------")
        # end of show summary

        log.info("")
        log.info("done saving csv:")
        log.info("Full: {}".format(
            save_node["fulldata_file"]))
        log.info("Cleaned (no-NaNs in columns): {}".format(
            save_node["clean_file"]))
        log.info("")
    else:
        log.info("Failed to process datasets")
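# A minimal sketch (not called by the pipeline) of driving
# prepare_new_dataset through its environment variables. The directory,
# glob and summary values below are placeholders - point them at wherever
# the pipeline csvs actually live before running it.
def _sketch_prepare_new_dataset_env():
    """illustrative sketch only - not part of the dataset pipeline"""
    import os
    os.environ["OUTPUT_DIR"] = "/tmp"
    os.environ["DS_DIR"] = "/opt/antinex/datasets"
    os.environ["DS_GLOB_PATH"] = "/opt/antinex/datasets/*/*.csv"
    os.environ["SHOW_SUMMARY"] = "1"
    prepare_new_dataset()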
def handle_msg(self, body, org_message):
    """handle_msg

    :param body: dictionary contents from the message body
    :param org_message: message object that can ack, requeue or reject
    """

    if os.path.exists(self.stop_for_file):
        log.info(("Detected stop_file={} "
                  "shutting down")
                 .format(self.stop_for_file))
        # drop the message back in the queue
        # for next time
        org_message.requeue()
        sys.exit(1)
    # end of stop file detection

    try:
        log.debug(("handle body={}")
                  .format(ppj(body)))

        msg = body
        id = build_packet_key()
        recv_time = rnow()

        # this could be made into celery tasks...

        flat_msg = self.build_flat_msg(
            id=id,
            msg=msg)

        if not flat_msg:
            log.error(("Failed to build a flat message "
                       "for message={}")
                      .format(msg))
            return

        msg["id"] = id
        msg["received"] = recv_time

        if len(flat_msg) > 0:
            if self.debug:
                log.info(ppj(flat_msg))
            flat_msg["id"] = id
            flat_msg["received"] = recv_time
            self.all_flat.append(flat_msg)
            self.recv_msgs.append(msg)
        # end of adding all flat messages

        already_saved = False
        num_recv = len(self.recv_msgs)
        if (num_recv % self.save_after_num) == 0:
            # flag the snapshot so it is not saved twice below
            already_saved = True
            self.save_data()
        # end of saving a snapshot

        if self.stop_after_num:
            if num_recv >= self.stop_after_num:
                if not already_saved:
                    self.save_data()
                # avoid waiting on the save again
                log.info("archive successful - purging buffer")
                sys.exit(2)
        # shutdown - good for testing
        # if stop_after_num is not set, this consumes forever

    except Exception as e:
        log.error(("Failed processing msg={} "
                   "ex={}")
                  .format(body,
                          e))
    # end of processing message

    try:
        org_message.ack()
    except Exception as e:
        log.error(("Failed ack-ing msg={} "
                   "ex={}")
                  .format(body,
                          e))
    # end of acknowledging message was processed

    log.info("done handle")
def build_flat_msg(self, id=None, msg=None):
    """build_flat_msg

    :param id: unique id for this message
    :param msg: message dictionary to flatten
    """

    flat_msg = {}

    if not id:
        log.error("Please pass in an id")
        return None
    if not msg:
        log.error("Please pass in a msg")
        return None

    for k in msg["data"]:
        if k == "ether":
            flat_msg.update(self.process_ether_frame(
                id=id,
                msg=msg["data"][k]))
        # end of ether
        elif k == "ip":
            flat_msg.update(self.process_ip_frame(
                id=id,
                msg=msg["data"][k]))
        # end of ip
        elif k == "ipv6":
            flat_msg.update(self.process_ipvsix_frame(
                id=id,
                msg=msg["data"][k]))
        # end of ipv6
        elif k == "tcp":
            flat_msg.update(self.process_tcp_frame(
                id=id,
                msg=msg["data"][k]))
        # end of tcp
        elif k == "udp":
            flat_msg.update(self.process_udp_frame(
                id=id,
                msg=msg["data"][k]))
        # end of udp
        elif k == "dns":
            flat_msg.update(self.process_dns_frame(
                id=id,
                msg=msg["data"][k]))
        # end of dns
        elif k == "icmp":
            flat_msg.update(self.process_icmp_frame(
                id=id,
                msg=msg["data"][k]))
        # end of icmp
        elif k == "arp":
            flat_msg.update(self.process_arp_frame(
                id=id,
                msg=msg["data"][k]))
        # end of arp
        elif k == "raw":
            flat_msg.update(self.process_raw_frame(
                id=id,
                msg=msg["data"][k]))
        # end of raw
        elif k == "padding":
            flat_msg.update(self.process_pad_frame(
                id=id,
                msg=msg["data"][k]))
        # end of padding
        else:
            log.error(("Unsupported frame type={} "
                       "please file an issue to track this "
                       "with data={} msg={}")
                      .format(k,
                              ppj(msg["data"][k]),
                              msg["data"]))
    # end of processing new message

    return flat_msg
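# A minimal sketch (not used by the consumer) of the message shape
# build_flat_msg expects: each frame key under msg["data"] is routed to the
# matching process_<frame>_frame helper and the returned dicts are merged
# into one flat row with dict.update. The consumer argument and the field
# values below are placeholders.
def _sketch_build_flat_msg_usage(consumer):
    """illustrative sketch only - not part of the consumer"""
    example_msg = {
        "data": {
            "ether": {"src": "00:00:00:00:00:01",
                      "dst": "ff:ff:ff:ff:ff:ff"},
            "ip": {"src": "127.0.0.1", "dst": "127.0.0.1"},
            "tcp": {"sport": 52000, "dport": 80}
        }
    }
    return consumer.build_flat_msg(
        id=build_packet_key(),
        msg=example_msg)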
def convert_pkt_to_json(pkg):
    """convert_pkt_to_json

    Inspired by:
    https://gist.githubusercontent.com/cr0hn/1b0c2e672cd0721d3a07/raw/9144676ceb12dbd545e6dce366822bbedde8de2c/pkg_to_json.py

    This function converts a kamene packet into a JSON-serializable dict.

    :param pkg: a kamene packet
    :type pkg: object
    :return: JSON-ready data
    :rtype: dict
    """
    results = defaultdict(dict)

    try:
        for index in range(0, len(pkg)):
            layer = pkg[index]

            # Get layer name
            layer_tmp_name = str(layer.__dict__["aliastypes"][0])
            layer_start_pos = layer_tmp_name.rfind(".") + 1
            layer_name = layer_tmp_name[layer_start_pos:-2].lower()

            # Get the layer info
            tmp_t = {}
            for default_x, y in layer.__dict__["default_fields"].items():
                x = "default_{}".format(default_x)

                if DEBUG_PACKETS:
                    log.info("default: key={} val={}".format(x, y))

                try:
                    tmp_t["hex_default_{}".format(default_x)] = y.hex()
                except Exception:
                    # http://python3porting.com/differences.html#long
                    if y and not isinstance(
                            y,
                            (str, int, float, list, dict)):
                        if x in tmp_t:
                            tmp_t[x].update(convert_pkt_to_json(y))
                        else:
                            tmp_t[x] = y
                    else:
                        tmp_t[x] = y
            # end of default fields

            results[layer_name] = tmp_t

            try:
                tmp_t = {}
                for fields_x, y in layer.__dict__["fields"].items():

                    if DEBUG_PACKETS:
                        log.info("fields: key={} val={}".format(fields_x, y))

                    if fields_x == "qd":
                        if y:
                            tmp_t["fields_qd"] = json.loads(
                                convert_pkt_to_json(y))
                    elif fields_x == "ar":
                        if y:
                            tmp_t["fields_ar"] = json.loads(
                                convert_pkt_to_json(y))
                    elif fields_x == "an":
                        if y:
                            tmp_t["fields_an"] = json.loads(
                                convert_pkt_to_json(y))
                    elif fields_x == "arcount":
                        if y:
                            tmp_t["fields_arcount"] = json.loads(
                                convert_pkt_to_json(y))
                    elif fields_x == "ns":
                        if y:
                            # example value:
                            # 'ns': <DNSRR rrname='ubuntu.com.'
                            # type=SOA rclass=IN ttl=1345
                            # rdata=b'\x03ns1\tcanonical
                            # \xc0\x19\nhostmaster\xc02xHl\x8e
                            # \x00\x00*0\x00\x00\x0e\x10\x00
                            # \t:\x80\x00\x00\x0e\x10' |>,
                            tmp_t["fields_ns"] = str(y)
                    # note: the branches below reuse the most recent
                    # default_<field> key (x) from the loop above
                    elif fields_x == "proto":
                        if y:
                            tmp_t[x] = y
                    elif fields_x == "flags":
                        if y:
                            tmp_t[x] = y
                    elif fields_x == "ack":
                        if y:
                            tmp_t[x] = y
                    elif fields_x == "id":
                        if y:
                            tmp_t[x] = y
                    elif fields_x == "window":
                        if y:
                            tmp_t[x] = y
                    elif fields_x == "dataofs":
                        if y:
                            tmp_t[x] = y
                    elif fields_x == "frag":
                        if y:
                            tmp_t[x] = y
                    elif fields_x == "reserved":
                        if y:
                            tmp_t[x] = y
                    elif fields_x == "ttl":
                        if y:
                            tmp_t[x] = y
                    elif fields_x == "chksum":
                        if y:
                            tmp_t[x] = y
                    elif fields_x == "options":
                        if y:
                            cur_d = {}
                            try:
                                test = dict(y)
                                if "EOL" in test:
                                    cur_d["EOL"] = test["EOL"]
                                if "NOP" in test:
                                    cur_d["NOP"] = test["NOP"]
                                if "MSS" in test:
                                    cur_d["MSS"] = test["MSS"]
                                if "WScale" in test:
                                    cur_d["WScale"] = test["WScale"]
                                if "SAckOK" in test:
                                    cur_d["SAckOK"] = \
                                        test["SAckOK"].decode("utf-8")
                                if "SAck" in test:
                                    cur_d["SAck"] = test["SAck"]
                                if "Timestamp" in test:
                                    if test["Timestamp"]:
                                        cur_d["Timestamp"] = \
                                            test["Timestamp"][0]
                                if "AltChkSum" in test:
                                    cur_d["AltChkSum"] = test["AltChkSum"]
                                if "AltChkSumOpt" in test:
                                    cur_d["AltChkSumOpt"] = \
                                        test["AltChkSumOpt"]
                                if "Mood" in test:
                                    cur_d["Mood"] = test["Mood"]
                                if "Experiment" in test:
                                    cur_d["Experiment"] = test["Experiment"]
                            except Exception as exct:
                                log.error(("1 Failed parsing "
                                           "{}={} ex={}").format(
                                               x,
                                               y,
                                               exct))
                                cur_d = str(y)
                            # end of parsing cur_d
                            tmp_t["fields_{}".format(fields_x)] = cur_d
                    elif fields_x == "urgptr":
                        if y:
                            cur_d = {}
                            try:
                                for f in y:
                                    cur_f = "{}_{}".format(fields_x, f)
                                    try:
                                        cur_d[cur_f] = y.decode("utf-8")
                                    except Exception:
                                        cur_d["hex_" + cur_f] = y[f].hex()
                            except Exception as exct:
                                log.error(("2 Failed parsing "
                                           "{}={} ex={}").format(
                                               x,
                                               y,
                                               exct))
                                cur_d = y
                            # end of parsing cur_d
                            tmp_t["fields_{}".format(fields_x)] = cur_d
                    else:
                        x = "{}".format(fields_x)
                        try:
                            hex_key = "hex_field_{}".format(fields_x)
                            if fields_x == "load":
                                try:
                                    tmp_t["load"] = y.decode("utf-8")
                                except Exception:
                                    tmp_t[hex_key] = y.hex()
                            else:
                                tmp_t[hex_key] = y.hex()
                        except Exception:
                            # http://python3porting.com/differences.html#long
                            if y and not isinstance(
                                    y,
                                    (str, int, float, list, dict)):
                                if x in tmp_t:
                                    tmp_t[x].update(convert_pkt_to_json(y))
                                else:
                                    tmp_t[x] = y
                            else:
                                tmp_t[x] = y
                # end of special handling:
                # qd

                results[layer_name] = tmp_t
            except KeyError:
                # No custom fields
                pass
    except Exception:
        # Package finish -> do nothing
        pass

    if "padding" in results:
        try:
            if "load" in results["padding"]:
                results["padding"]["load"] = \
                    results["padding"]["load"].encode("utf-8").hex()
        except Exception:
            log.error(("failed parsing padding={}")
                      .format(results["padding"]))
    # end of fixing padding

    if "raw" in results:
        try:
            if "load" in results["raw"]:
                results["raw"]["load"] = \
                    results["raw"]["load"].encode("utf-8").hex()
        except Exception:
            log.error(("failed parsing raw={}")
                      .format(results["raw"]))
    # end of fixing raw

    if DEBUG_PACKETS:
        log.debug("")
        log.debug("pre json serialization:")
        log.debug(results)
        log.debug("post json.dumps:")
        log.debug(ppj(results))
        log.debug("")
    else:
        log.info(ppj(results))

    return results
def handle_processing_packets():
    """handle_processing_packets

    Replacement packet processing engine. This is not done.
    """

    host = os.getenv(
        "LISTEN_ON_HOST",
        "127.0.0.1").strip().lstrip()
    port = int(os.getenv(
        "LISTEN_ON_PORT",
        "80").strip().lstrip())
    backlog = int(os.getenv(
        "LISTEN_BACKLOG",
        "5").strip().lstrip())
    size = int(os.getenv(
        "LISTEN_SIZE",
        "102400").strip().lstrip())
    sleep_in_seconds = float(os.getenv(
        "LISTEN_SLEEP",
        "0.5").strip().lstrip())
    needs_response = bool(os.getenv(
        "LISTEN_SEND_RESPONSE",
        "0").strip().lstrip() == "1")
    shutdown_hook = os.getenv(
        "LISTEN_SHUTDOWN_HOOK",
        "/tmp/shutdown-listen-server-{}-{}".format(
            host,
            port)).strip().lstrip()
    filter_key = os.getenv(
        "IGNORE_KEY",
        INCLUDED_IGNORE_KEY).strip().lstrip()

    if os.path.exists(shutdown_hook):
        log.info(("Please remove the shutdown hook file: "
                  "\nrm -f {}")
                 .format(shutdown_hook))
        sys.exit(1)

    default_filter_key = filter_key
    bytes_for_filter_key = len(default_filter_key)
    offset_to_filter_key = (-1 * bytes_for_filter_key)
    offset_to_msg = offset_to_filter_key - 1

    now = datetime.datetime.now().isoformat()
    log.info(("{} - Starting Server address={}:{} "
              "backlog={} size={} sleep={} shutdown={} "
              "filter_key={}")
             .format(now,
                     host,
                     port,
                     backlog,
                     size,
                     sleep_in_seconds,
                     shutdown_hook,
                     default_filter_key))

    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind((host, port))
    s.listen(backlog)

    client, address = s.accept()

    midx = 0

    while 1:
        data = None
        address = None
        ignore_key = None

        try:
            if not client:
                client, address = s.accept()
        except Exception as e:
            log.error(("socket accept with ex={}")
                      .format(e))

        try:
            if client:
                data = client.recv(size)
        except Exception as e:
            log.error(("recv - disconnected with ex={}")
                      .format(e))

        if data:
            now = datetime.datetime.now().isoformat()
            packet_to_process = data[0:offset_to_msg]
            ignore_key = data[offset_to_filter_key:]
            log.info(("decoding data={} key={}")
                     .format(packet_to_process,
                             ignore_key))
            msg = None
            try:
                msg = json.loads(packet_to_process.decode("utf-8"))
            except Exception as e:
                msg = None
                log.error(("Invalid data={} with ex={}")
                          .format(packet_to_process,
                                  e))

            if msg:
                log.info(("received msg={} "
                          "data={} replying - ignore='{}'")
                         .format(ppj(msg),
                                 packet_to_process,
                                 ignore_key))

                if msg["status"] == VALID:
                    if msg["data_type"] == TCP:
                        log.info("TCP")
                    elif msg["data_type"] == UDP:
                        log.info("UDP")
                    elif msg["data_type"] == ARP:
                        log.info("ARP")
                    elif msg["data_type"] == ICMP:
                        log.info("ICMP")
                    else:
                        log.error(("unsupported type={}")
                                  .format(msg["data_type"]))
                    # end of supported eth protocol message types
                else:
                    log.error(("unsupported msg status={}")
                              .format(msg["status"]))
                # end of if msg was VALID
            # end of if found msg

            midx += 1
            if midx > 1000000:
                midx = 0
        else:
            log.debug("ignoring invalid data")
        # end of if valid msg or not

        if needs_response:
            client.send(ignore_key)
        else:
            log.info("no response")

        time.sleep(sleep_in_seconds)

        if os.path.exists(shutdown_hook):
            now = datetime.datetime.now().isoformat()
            log.info(("{} detected shutdown "
                      "file={}")
                     .format(now,
                             shutdown_hook))
            # stop consuming and fall through to the cleanup below
            break
    # end of loop

    log.info("shutting down")
    client.close()
    log.info("done")
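# A minimal client sketch (not called by the server): frame one JSON message
# the way handle_processing_packets slices it apart - the JSON payload, a
# single spacer byte, then the filter key, so the server can drop the last
# len(filter_key) + 1 bytes to recover the JSON. The host, port and message
# fields below are placeholders that mirror the server defaults above.
def _sketch_send_packet_to_listener():
    """illustrative sketch only - not part of the listener"""
    host = os.getenv("LISTEN_ON_HOST", "127.0.0.1")
    port = int(os.getenv("LISTEN_ON_PORT", "80"))
    filter_key = os.getenv(
        "IGNORE_KEY",
        INCLUDED_IGNORE_KEY).encode("utf-8")
    msg = {
        "status": VALID,
        "data_type": TCP,
        "data": {}
    }
    payload = json.dumps(msg).encode("utf-8") + b" " + filter_key
    conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    conn.connect((host, port))
    conn.send(payload)
    conn.close()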