def graph_dns(g, df_dns):
    """Graph the contents of a Bro/Zeek DNS log.

    For every row in *df_dns* this creates a dnsTransaction node and
    connects it to:
      - the FQDN that was queried            (lookedUp edge)
      - each answer (IP host or CNAME fqdn)  (answer / resolvedTo edges)
      - the client host                      (queried edge)
      - the DNS server host                  (queriedServer edge)
      - the flow identified by "uid"         (contains edge)
    It also maintains a weighted "resolved" edge between the client and
    the queried FQDN, counting how many times that resolution was seen.

    Args:
        g:      bulbs-style graph object exposing node/edge proxies
                (dnsTransaction, fqdn, host, flow, lookedUp, ...).
        df_dns: pandas DataFrame holding the parsed Bro dns.log.
    """
    # Iterate through all the DNS log entries
    for i in df_dns.index:
        # Hoist the repeated .loc[] lookup for this row
        row = df_dns.loc[i]

        # Human-readable node name, e.g. "1234 - A - example.com"
        name = "%d - %s - %s" % (row["trans_id"], row["qtype_name"],
                                 row["query"])
        timestamp = row["ts"]
        flowname = row["uid"]

        # Reset per-row state so a row without a query or orig_h can
        # never silently reuse a stale node from a previous iteration.
        fqdn = None
        src = None

        # Pick out the properties that belong on the transaction
        transaction = g.dnsTransaction.create(
            name=name,
            ts=row["ts"],
            proto=row["proto"],
            orig_p=row["id.orig_p"],
            resp_p=row["id.resp_p"],
            qclass=row["qclass"],
            qclass_name=row["qclass_name"],
            qtype=row["qtype"],
            qtype_name=row["qtype_name"],
            rcode=row["rcode"],
            rcode_name=row["rcode_name"],
            AA=row["AA"],
            TC=row["TC"],
            RD=row["RD"],
            RA=row["RA"],
            Z=row["Z"],
            rejected=row["rejected"])

        # Create a node + edge for the query, if there is one in the log
        if row["query"]:
            fqdn = g.fqdn.get_or_create(
                "name", row["query"],
                {"name": row["query"], "domain": row["query"]})
            g.lookedUp.create(transaction, fqdn)

        # Now create the nodes and edges for the domains or addresses in
        # the answer (if there is an answer).  There can be multiple
        # answers, so split into a list and create one node + edge each.
        #
        # There should be one TTL per answer; pair them by position, and
        # pad the TTL list with its last value if the lists differ.
        if row["answers"]:
            addrs = row["answers"].split(",")
            ttls = row["TTLs"].split(",")
            ttls = extend_list(ttls, ttls[-1], len(addrs))
            # BUGFIX: the original used "for i in range(len(addrs))",
            # clobbering the outer DataFrame index "i" and corrupting
            # every df_dns.loc[i] lookup after this block.
            for ans, ttl_str in zip(addrs, ttls):
                ttl = float(ttl_str)
                # DNS answers can be either IPs or other names.  Figure
                # out which type of node to create for each answer.
                if is_IP(ans):
                    node = g.host.get_or_create(
                        "name", ans, {"name": ans, "address": ans})
                else:
                    node = g.fqdn.get_or_create(
                        "name", ans, {"name": ans, "address": ans})
                # Only tie the answer back to a query we actually saw;
                # previously this could reference an undefined/stale fqdn.
                if fqdn is not None:
                    g.resolvedTo.create(fqdn, node, {"ts": timestamp})
                g.answer.create(transaction, node, {"TTL": ttl})

        # Create a node + edge for the source of the DNS transaction
        # (the client host)
        if row["id.orig_h"]:
            src = g.host.get_or_create(
                "name", row["id.orig_h"],
                {"name": row["id.orig_h"], "address": row["id.orig_h"]})
            g.queried.create(src, transaction)

        # Create a node + edge for the destination of the DNS transaction
        # (the DNS server)
        if row["id.resp_h"]:
            dst = g.host.get_or_create(
                "name", row["id.resp_h"],
                {"name": row["id.resp_h"], "address": row["id.resp_h"]})
            g.queriedServer.create(transaction, dst)

        # Now connect this transaction to the correct flow
        flows = g.flow.index.lookup(name=flowname)
        if flows is None:
            # The conn log may not have been graphed yet; skip quietly.
            pass
        else:
            # lookup returns a generator, but there should only be one
            # flow with this name, so just take the first one.
            # next() works on both Python 2 and 3, unlike .next().
            flow = next(flows)
            nodes = flow.outV("contains")
            if nodes is None or transaction not in nodes:
                g.contains.create(flow, transaction)

        # Associate the src host with the FQDN it resolved.  Since a host
        # can resolve a domain multiple times, keep a "weight" feature to
        # count how many times this happened.
        if fqdn is not None and src is not None:
            neighbors = src.outV("resolved")
            if neighbors is None or fqdn not in neighbors:
                e = g.resolved.create(src, fqdn)
                e.weight = 1
                e.save()
            else:
                edges = edge_list(g, src._id, fqdn._id, "resolved")
                # There should only be one of these edges, and we already
                # know it exists, so it's safe to take the first one.
                edge = next(edges)
                g.resolved.update(edge._id, weight=(edge.weight + 1))
def graph_dns(g, df_dns):
    """Graph the contents of a Bro/Zeek DNS log.

    For every row in *df_dns* this creates a dnsTransaction node and
    connects it to:
      - the FQDN that was queried            (lookedUp edge)
      - each answer (IP host or CNAME fqdn)  (answer / resolvedTo edges)
      - the client host                      (queried edge)
      - the DNS server host                  (queriedServer edge)
      - the flow identified by "uid"         (contains edge)
    It also maintains a weighted "resolved" edge between the client and
    the queried FQDN, counting how many times that resolution was seen.

    Args:
        g:      bulbs-style graph object exposing node/edge proxies
                (dnsTransaction, fqdn, host, flow, lookedUp, ...).
        df_dns: pandas DataFrame holding the parsed Bro dns.log.
    """
    # Iterate through all the DNS log entries
    for i in df_dns.index:
        # Hoist the repeated .loc[] lookup for this row
        row = df_dns.loc[i]

        # Human-readable node name, e.g. "1234 - A - example.com"
        name = "%d - %s - %s" % (row["trans_id"], row["qtype_name"],
                                 row["query"])
        timestamp = row["ts"]
        flowname = row["uid"]

        # Reset per-row state so a row without a query or orig_h can
        # never silently reuse a stale node from a previous iteration.
        fqdn = None
        src = None

        # Pick out the properties that belong on the transaction
        transaction = g.dnsTransaction.create(
            name=name,
            ts=row["ts"],
            proto=row["proto"],
            orig_p=row["id.orig_p"],
            resp_p=row["id.resp_p"],
            qclass=row["qclass"],
            qclass_name=row["qclass_name"],
            qtype=row["qtype"],
            qtype_name=row["qtype_name"],
            rcode=row["rcode"],
            rcode_name=row["rcode_name"],
            AA=row["AA"],
            TC=row["TC"],
            RD=row["RD"],
            RA=row["RA"],
            Z=row["Z"],
            rejected=row["rejected"])

        # Create a node + edge for the query, if there is one in the log
        if row["query"]:
            fqdn = g.fqdn.get_or_create(
                "name", row["query"],
                {"name": row["query"], "domain": row["query"]})
            g.lookedUp.create(transaction, fqdn)

        # Now create the nodes and edges for the domains or addresses in
        # the answer (if there is an answer).  There can be multiple
        # answers, so split into a list and create one node + edge each.
        #
        # There should be one TTL per answer; pair them by position, and
        # pad the TTL list with its last value if the lists differ.
        if row["answers"]:
            addrs = row["answers"].split(",")
            ttls = row["TTLs"].split(",")
            ttls = extend_list(ttls, ttls[-1], len(addrs))
            # BUGFIX: the original used "for i in range(len(addrs))",
            # clobbering the outer DataFrame index "i" and corrupting
            # every df_dns.loc[i] lookup after this block.
            for ans, ttl_str in zip(addrs, ttls):
                ttl = float(ttl_str)
                # DNS answers can be either IPs or other names.  Figure
                # out which type of node to create for each answer.
                if is_IP(ans):
                    node = g.host.get_or_create(
                        "name", ans, {"name": ans, "address": ans})
                else:
                    node = g.fqdn.get_or_create(
                        "name", ans, {"name": ans, "address": ans})
                # Only tie the answer back to a query we actually saw;
                # previously this could reference an undefined/stale fqdn.
                if fqdn is not None:
                    g.resolvedTo.create(fqdn, node, {"ts": timestamp})
                g.answer.create(transaction, node, {"TTL": ttl})

        # Create a node + edge for the source of the DNS transaction
        # (the client host)
        if row["id.orig_h"]:
            src = g.host.get_or_create(
                "name", row["id.orig_h"],
                {"name": row["id.orig_h"], "address": row["id.orig_h"]})
            g.queried.create(src, transaction)

        # Create a node + edge for the destination of the DNS transaction
        # (the DNS server)
        if row["id.resp_h"]:
            dst = g.host.get_or_create(
                "name", row["id.resp_h"],
                {"name": row["id.resp_h"], "address": row["id.resp_h"]})
            g.queriedServer.create(transaction, dst)

        # Now connect this transaction to the correct flow
        flows = g.flow.index.lookup(name=flowname)
        if flows is None:
            # The conn log may not have been graphed yet; skip quietly.
            pass
        else:
            # lookup returns a generator, but there should only be one
            # flow with this name, so just take the first one.
            # next() works on both Python 2 and 3, unlike .next().
            flow = next(flows)
            nodes = flow.outV("contains")
            if nodes is None or transaction not in nodes:
                g.contains.create(flow, transaction)

        # Associate the src host with the FQDN it resolved.  Since a host
        # can resolve a domain multiple times, keep a "weight" feature to
        # count how many times this happened.
        if fqdn is not None and src is not None:
            neighbors = src.outV("resolved")
            if neighbors is None or fqdn not in neighbors:
                e = g.resolved.create(src, fqdn)
                e.weight = 1
                e.save()
            else:
                edges = edge_list(g, src._id, fqdn._id, "resolved")
                # There should only be one of these edges, and we already
                # know it exists, so it's safe to take the first one.
                edge = next(edges)
                g.resolved.update(edge._id, weight=(edge.weight + 1))
def graph_flows(g, df_conn):
    """Graph the contents of a Bro/Zeek conn log.

    For every row in *df_conn* this creates (or reuses) Host nodes for
    the two endpoints, a Flow node named by the Bro "uid", and the
    source/dest edges that connect them.  It also maintains a weighted
    "connectedTo" edge directly between the two hosts, counting how many
    flows were seen between them.

    Args:
        g:       bulbs-style graph object exposing node/edge proxies
                 (host, flow, source, dest, connectedTo).
        df_conn: pandas DataFrame holding the parsed Bro conn.log.
    """
    # Iterate through all the flows
    for con in df_conn.index:
        # Hoist the repeated .loc[] lookup for this row
        row = df_conn.loc[con]

        # For each flow, create new Host objects if necessary, then a
        # Flow, and the relationships between the Hosts and the Flow.

        # Create the source & dest nodes (reusing existing ones by name)
        src_host = g.host.get_or_create(
            "name", row["id.orig_h"],
            {"name": row["id.orig_h"], "address": row["id.orig_h"]})
        dst_host = g.host.get_or_create(
            "name", row["id.resp_h"],
            {"name": row["id.resp_h"], "address": row["id.resp_h"]})

        # If the flow is marked "local_orig", update this feature on the
        # source host.  We can't do this at creation time because we may
        # have seen this host before in another context and created its
        # node without knowing it was a local host.
        if row["local_orig"] == "T":
            src_host.local = "T"
            src_host.save()

        # Since we can run the same log file through multiple times, or
        # observe the same flow in different log files, flows with the
        # same uid are assumed to be the same flow.
        flowname = row["uid"]

        # Create the flow node, with all the rich data
        properties = dict(row)
        # Manually assign the "name" property
        properties["name"] = flowname
        # Drop the endpoint IPs; they come from the connected host nodes
        del properties["id.orig_h"]
        del properties["id.resp_h"]
        flow = g.flow.get_or_create("name", flowname, properties)

        # Create the edges for this flow, if they don't already exist
        nodes = flow.inV("source")
        if nodes is None or src_host not in nodes:
            g.source.create(src_host, flow)
        nodes = flow.outV("dest")
        if nodes is None or dst_host not in nodes:
            g.dest.create(flow, dst_host)

        # Make a direct link between the src and dest hosts, as this is a
        # common analysis task.  It doesn't *always* make sense to go
        # through the flows.
        neighbors = src_host.outV("connectedTo")
        if neighbors is None or dst_host not in neighbors:
            e = g.connectedTo.create(src_host, dst_host)
            e.weight = 1
            e.save()
        else:
            edges = edge_list(g, src_host._id, dst_host._id, "connectedTo")
            # There should only be one of these edges, and we already know
            # it exists, so it's safe to take the first one.
            # next() works on both Python 2 and 3, unlike .next().
            edge = next(edges)
            g.connectedTo.update(edge._id, weight=(edge.weight + 1))
def graph_flows(g, df_conn):
    """Graph the contents of a Bro/Zeek conn log.

    For every row in *df_conn* this creates (or reuses) Host nodes for
    the two endpoints, a Flow node named by the Bro "uid", and the
    source/dest edges that connect them.  It also maintains a weighted
    "connectedTo" edge directly between the two hosts, counting how many
    flows were seen between them.

    Args:
        g:       bulbs-style graph object exposing node/edge proxies
                 (host, flow, source, dest, connectedTo).
        df_conn: pandas DataFrame holding the parsed Bro conn.log.
    """
    # Iterate through all the flows
    for con in df_conn.index:
        # Hoist the repeated .loc[] lookup for this row
        row = df_conn.loc[con]

        # For each flow, create new Host objects if necessary, then a
        # Flow, and the relationships between the Hosts and the Flow.

        # Create the source & dest nodes (reusing existing ones by name)
        src_host = g.host.get_or_create(
            "name", row["id.orig_h"],
            {"name": row["id.orig_h"], "address": row["id.orig_h"]})
        dst_host = g.host.get_or_create(
            "name", row["id.resp_h"],
            {"name": row["id.resp_h"], "address": row["id.resp_h"]})

        # If the flow is marked "local_orig", update this feature on the
        # source host.  We can't do this at creation time because we may
        # have seen this host before in another context and created its
        # node without knowing it was a local host.
        if row["local_orig"] == "T":
            src_host.local = "T"
            src_host.save()

        # Since we can run the same log file through multiple times, or
        # observe the same flow in different log files, flows with the
        # same uid are assumed to be the same flow.
        flowname = row["uid"]

        # Create the flow node, with all the rich data
        properties = dict(row)
        # Manually assign the "name" property
        properties["name"] = flowname
        # Drop the endpoint IPs; they come from the connected host nodes
        del properties["id.orig_h"]
        del properties["id.resp_h"]
        flow = g.flow.get_or_create("name", flowname, properties)

        # Create the edges for this flow, if they don't already exist
        nodes = flow.inV("source")
        if nodes is None or src_host not in nodes:
            g.source.create(src_host, flow)
        nodes = flow.outV("dest")
        if nodes is None or dst_host not in nodes:
            g.dest.create(flow, dst_host)

        # Make a direct link between the src and dest hosts, as this is a
        # common analysis task.  It doesn't *always* make sense to go
        # through the flows.
        neighbors = src_host.outV("connectedTo")
        if neighbors is None or dst_host not in neighbors:
            e = g.connectedTo.create(src_host, dst_host)
            e.weight = 1
            e.save()
        else:
            edges = edge_list(g, src_host._id, dst_host._id, "connectedTo")
            # There should only be one of these edges, and we already know
            # it exists, so it's safe to take the first one.
            # next() works on both Python 2 and 3, unlike .next().
            edge = next(edges)
            g.connectedTo.update(edge._id, weight=(edge.weight + 1))