def valgendp(cdmrectype, cdmrecval):
    """Reduce a DARPA CDM/Cadets E2 record to one value through ``hashgen``.

    Only type information contributes to the value. Socket, pipe and memory
    records use a fixed label, host records use their ``hostType`` field and
    every remaining record uses its ``type`` field.

    Arguments:
    cdmrectype - CDM record type
    cdmrecval - CDM record

    Return:
    the result of ``hashgen`` applied to a one-element list with that label
    """
    if cdmrectype in (CDM_TYPE_SOCK, CD2_TYPE_SOCK):
        return hashgen(['NET_FLOW_OBJECT'])
    if cdmrectype in (CDM_TYPE_PIPE, CD2_TYPE_PIPE):
        return hashgen(['UNNAMED_PIPE_OBJECT'])
    if cdmrectype in (CDM_TYPE_MEMORY, CD2_TYPE_MEMORY):
        return hashgen(['MEMORY_OBJECT'])
    if cdmrectype in (CDM_TYPE_HOST, CD2_TYPE_HOST):
        # Hosts carry their kind under a dedicated key.
        return hashgen([cdmrecval['hostType']])
    # Everything else is described by its generic 'type' field.
    return hashgen([cdmrecval['type']])
def valgencf(cfrecval):
    """Reduce a CamFlow record to one value through ``hashgen``.

    Only the record's ``prov:type`` entry contributes to the value.

    Arguments:
    cfrecval - CamFlow record (must contain the key ``prov:type``)

    Return:
    the result of ``hashgen`` applied to a one-element list with that type
    """
    return hashgen([cfrecval["prov:type"]])
def gencd(parser, i, dbs, out):
    """Generate CADETS2/FiveDirections outputs using a list of databases.

    Each record yielded by `parser` is expected to hold a 'datum' mapping
    whose first key is the record type. Only CD2_TYPE_EVENT records produce
    output. An event is skipped (and logged to 'error.log') when it lacks a
    'type' or 'timestampNanos' field or when either endpoint UUID cannot be
    resolved through `getfromdb`. It is skipped silently when `processevent`
    returns None for either endpoint.

    Every emitted line has the form
    ``<hash(src)>\\t<hash(dst)>\\t<srcVal>:<dstVal>:<edgetype>:<timestamp>\\t\\n``.
    When `processevent` flags the event as bidirectional, a second line with
    source and destination swapped is written as well.

    Arguments:
    parser - ijson parser that feeds JSON objects
    i - the start index of the database list (also used as the progress
        bar position and shown in its description)
    dbs - a list of database
    out - output file object
    """
    logging.basicConfig(filename='error.log', level=logging.DEBUG)
    description = ('\x1b[6;30;43m[i]\x1b[0m Progress of Generating Output '
                   'from File \x1b[6;30;42m{}\x1b[0m').format(i)
    pb = tqdm.tqdm(desc=description, mininterval=5.0, unit="recs", position=i)

    def write_edge(from_uuid, to_uuid, from_val, to_val, edgetype, timestamp):
        # One output line: hashed endpoints, then the colon-joined attributes.
        out.write(str(hashgen([from_uuid.encode('utf-8')])) + '\t'
                  + str(hashgen([to_uuid.encode('utf-8')])) + '\t'
                  + str(from_val) + ':' + str(to_val)
                  + ':' + str(edgetype)
                  + ':' + str(timestamp) + '\t' + '\n')

    try:
        for cdmrec in parser:
            pb.update()
            datum = cdmrec['datum']
            # list(...)[0] instead of .keys()[0]: dict views cannot be
            # indexed on Python 3, while this form works on both 2 and 3.
            cdmrectype = list(datum)[0]
            if cdmrectype != CD2_TYPE_EVENT:
                continue
            cdmrecval = datum[cdmrectype]

            if 'type' not in cdmrecval:
                logging.debug(
                    'CD2_TYPE_EVENT: type is missing. Event UUID: %r',
                    cdmrecval['uuid'])
                continue
            edgetype = valgendp(cdmrectype, cdmrecval)

            if 'timestampNanos' not in cdmrecval:
                logging.debug(
                    'CD2_TYPE_EVENT: timestamp is missing. Event UUID: %r',
                    cdmrecval['uuid'])
                continue
            timestamp = cdmrecval['timestampNanos']

            srcUUID, dstUUID, bidirection = processevent(cdmrecval, 'cadets2')
            if srcUUID is None or dstUUID is None:
                continue

            srcVal = getfromdb(dbs, i, srcUUID)
            if srcVal is None:
                logging.error(
                    'An unmatched srcUUID from edge (%r) of type: %s',
                    cdmrecval['uuid'], cdmrecval['type'])
                continue

            dstVal = getfromdb(dbs, i, dstUUID)
            if dstVal is None:
                logging.error(
                    'An unmatched dstUUID from edge (%r) of type: %s',
                    cdmrecval['uuid'], cdmrecval['type'])
                continue

            write_edge(srcUUID, dstUUID, srcVal, dstVal, edgetype, timestamp)
            if bidirection:
                # Mirror the edge so both directions appear in the output.
                write_edge(dstUUID, srcUUID, dstVal, srcVal, edgetype,
                           timestamp)
    finally:
        # Release the progress bar even if a record raises.
        pb.close()
def gencf(parser, i, dbs, out):
    """Generate CamFlow outputs using a list of databases.

    For every record yielded by `parser`, the relations 'used',
    'wasGeneratedBy', 'wasInformedBy' and 'wasDerivedFrom' are processed in
    that order. Each relation maps an edge id to an edge record. An edge is
    skipped (with a debug entry in 'error.log') when it lacks 'prov:type',
    'cf:id', either endpoint key, or when an endpoint UUID has no entry in
    the database `dbs[i]`.

    Every emitted line has the form
    ``<hash(src)>\\t<hash(dst)>\\t<srcVal>:<dstVal>:<edgetype>:<timestamp>\\t\\n``
    where the timestamp is the edge's 'cf:id' value.

    Arguments:
    parser - ijson parser that feeds JSON objects
    i - the start index of the database list (the database used is
        dbs[i]; also the progress bar position)
    dbs - a list of database
    out - output file object
    """
    logging.basicConfig(filename='error.log', level=logging.DEBUG)
    description = ('\x1b[6;30;43m[i]\x1b[0m Progress of Generating Output '
                   'from File \x1b[6;30;42m{}\x1b[0m').format(i)
    pb = tqdm.tqdm(desc=description, mininterval=5.0, unit="recs", position=i)

    # (relation name, key holding the source UUID, key holding the
    # destination UUID). Missing-key log messages are labelled by the role
    # the key actually plays in the emitted edge.
    relations = (
        ("used", "prov:entity", "prov:activity"),
        ("wasGeneratedBy", "prov:activity", "prov:entity"),
        ("wasInformedBy", "prov:informant", "prov:informed"),
        ("wasDerivedFrom", "prov:usedEntity", "prov:generatedEntity"),
    )

    def emit(db, relation, edges, src_key, dst_key):
        # Write one output line per well-formed, resolvable edge.
        for uid in edges:
            edge = edges[uid]
            if "prov:type" not in edge:
                logging.debug("Edge (%s) record without type. UUID: %s",
                              relation, uid)
                continue
            edgetype = valgencf(edge)

            if "cf:id" not in edge:  # Can be used as timestamp
                logging.debug("Edge (%s) record without timestamp. UUID: %s",
                              relation, uid)
                continue
            timestamp = edge["cf:id"]

            if src_key not in edge:
                logging.debug(
                    "Edge (%s/%s) record without srcUUID. UUID: %s",
                    relation, edge["prov:type"], uid)
                continue
            if dst_key not in edge:
                logging.debug(
                    "Edge (%s/%s) record without dstUUID. UUID: %s",
                    relation, edge["prov:type"], uid)
                continue
            srcUUID = edge[src_key]
            dstUUID = edge[dst_key]

            srcVal = db.get(srcUUID)
            if srcVal is None:
                logging.debug(
                    "Edge (%s/%s) record with an unmatched srcUUID. UUID: %s",
                    relation, edge["prov:type"], uid)
                continue
            dstVal = db.get(dstUUID)
            if dstVal is None:
                logging.debug(
                    "Edge (%s/%s) record with an unmatched dstUUID. UUID: %s",
                    relation, edge["prov:type"], uid)
                continue

            out.write(str(hashgen([srcUUID])) + '\t'
                      + str(hashgen([dstUUID])) + '\t'
                      + str(srcVal) + ':' + str(dstVal)
                      + ':' + str(edgetype)
                      + ':' + str(timestamp) + '\t' + '\n')

    try:
        # for camflow, each file is independent
        db = dbs[i]
        for cfrec in parser:
            pb.update()
            for relation, src_key, dst_key in relations:
                if relation in cfrec:
                    emit(db, relation, cfrec[relation], src_key, dst_key)
    finally:
        # Release the progress bar even if a record raises.
        pb.close()
def cgencd(parser, db, out):
    """Generate Cadets E2/FiveDirections outputs from compressed/single file.

    Note: Only one subset of FiveDirections dataset uses this function.
    Other subsets use regular DARPA functions.

    Each record yielded by `parser` is expected to hold a 'datum' mapping
    whose first key is the record type. Only CD2_TYPE_EVENT records produce
    output. An event is skipped (and logged to 'error.log') when it lacks a
    'type' or 'timestampNanos' field or when `db.get` returns None for
    either UTF-8 encoded endpoint UUID. It is skipped silently when
    `processevent` returns None for either endpoint.

    Every emitted line has the form
    ``<hash(src)>\\t<hash(dst)>\\t<srcVal>:<dstVal>:<edgetype>:<timestamp>\\t\\n``.
    When `processevent` flags the event as bidirectional, a second line with
    source and destination swapped is written as well.

    Arguments:
    parser - ijson parser that feeds JSON objects
    db - database
    out - output file object
    """
    logging.basicConfig(filename='error.log', level=logging.DEBUG)
    description = '\x1b[6;30;43m[i]\x1b[0m Progress of Generating Output'
    pb = tqdm.tqdm(desc=description, mininterval=5.0, unit="recs")

    def write_edge(from_uuid, to_uuid, from_val, to_val, edgetype, timestamp):
        # One output line: hashed endpoints, then the colon-joined attributes.
        out.write(str(hashgen([from_uuid.encode('utf-8')])) + '\t'
                  + str(hashgen([to_uuid.encode('utf-8')])) + '\t'
                  + str(from_val) + ':' + str(to_val)
                  + ':' + str(edgetype)
                  + ':' + str(timestamp) + '\t' + '\n')

    try:
        for cdmrec in parser:
            pb.update()
            datum = cdmrec['datum']
            # list(...)[0] instead of .keys()[0]: dict views cannot be
            # indexed on Python 3, while this form works on both 2 and 3.
            cdmrectype = list(datum)[0]
            if cdmrectype != CD2_TYPE_EVENT:
                continue
            cdmrecval = datum[cdmrectype]

            if 'type' not in cdmrecval:
                logging.debug(
                    'CD2_TYPE_EVENT: type is missing. Event UUID: %r',
                    cdmrecval['uuid'])
                continue
            edgetype = valgendp(cdmrectype, cdmrecval)

            if 'timestampNanos' not in cdmrecval:
                logging.debug(
                    'CD2_TYPE_EVENT: timestamp is missing. Event UUID: %r',
                    cdmrecval['uuid'])
                continue
            timestamp = cdmrecval['timestampNanos']

            srcUUID, dstUUID, bidirection = processevent(cdmrecval, 'cadets2')
            if srcUUID is None or dstUUID is None:
                continue

            srcVal = db.get(srcUUID.encode('utf-8'))
            if srcVal is None:
                logging.error(
                    'An unmatched srcUUID from edge (%r) of type: %s',
                    cdmrecval['uuid'], cdmrecval['type'])
                continue

            dstVal = db.get(dstUUID.encode('utf-8'))
            if dstVal is None:
                logging.error(
                    'An unmatched dstUUID from edge (%r) of type: %s',
                    cdmrecval['uuid'], cdmrecval['type'])
                continue

            write_edge(srcUUID, dstUUID, srcVal, dstVal, edgetype, timestamp)
            if bidirection:
                # Mirror the edge so both directions appear in the output.
                write_edge(dstUUID, srcUUID, dstVal, srcVal, edgetype,
                           timestamp)
    finally:
        # The progress bar was previously never closed in this function.
        pb.close()
def cgencf(parser, db, out):
    """Generate CamFlow outputs from compressed/single file.

    For every record yielded by `parser`, the relations 'used',
    'wasGeneratedBy', 'wasInformedBy' and 'wasDerivedFrom' are processed in
    that order. Each relation maps an edge id to an edge record. An edge is
    skipped (with a debug entry in 'error.log') when it lacks 'prov:type',
    'cf:id', either endpoint key, or when `db.get` returns None for an
    endpoint UUID.

    Every emitted line has the form
    ``<hash(src)>\\t<hash(dst)>\\t<srcVal>:<dstVal>:<edgetype>:<timestamp>\\t\\n``
    where the timestamp is the edge's 'cf:id' value.

    Arguments:
    parser - parser that feeds CamFlow JSON objects
    db - database queried with ``db.get(uuid)``
    out - output file object
    """
    logging.basicConfig(filename='error.log', level=logging.DEBUG)
    description = '\x1b[6;30;43m[i]\x1b[0m Progress of Generating Output'
    pb = tqdm.tqdm(desc=description, mininterval=5.0, unit="recs")

    # (relation name, key holding the source UUID, key holding the
    # destination UUID). Missing-key log messages are labelled by the role
    # the key actually plays in the emitted edge.
    relations = (
        ("used", "prov:entity", "prov:activity"),
        ("wasGeneratedBy", "prov:activity", "prov:entity"),
        ("wasInformedBy", "prov:informant", "prov:informed"),
        ("wasDerivedFrom", "prov:usedEntity", "prov:generatedEntity"),
    )

    def emit(relation, edges, src_key, dst_key):
        # Write one output line per well-formed, resolvable edge.
        for uid in edges:
            edge = edges[uid]
            if "prov:type" not in edge:
                logging.debug("Edge (%s) record without type. UUID: %s",
                              relation, uid)
                continue
            edgetype = valgencf(edge)

            if "cf:id" not in edge:  # Can be used as timestamp
                logging.debug("Edge (%s) record without timestamp. UUID: %s",
                              relation, uid)
                continue
            timestamp = edge["cf:id"]

            if src_key not in edge:
                logging.debug(
                    "Edge (%s/%s) record without srcUUID. UUID: %s",
                    relation, edge["prov:type"], uid)
                continue
            if dst_key not in edge:
                logging.debug(
                    "Edge (%s/%s) record without dstUUID. UUID: %s",
                    relation, edge["prov:type"], uid)
                continue
            srcUUID = edge[src_key]
            dstUUID = edge[dst_key]

            srcVal = db.get(srcUUID)
            if srcVal is None:
                logging.debug(
                    "Edge (%s/%s) record with an unmatched srcUUID. UUID: %s",
                    relation, edge["prov:type"], uid)
                continue
            dstVal = db.get(dstUUID)
            if dstVal is None:
                logging.debug(
                    "Edge (%s/%s) record with an unmatched dstUUID. UUID: %s",
                    relation, edge["prov:type"], uid)
                continue

            out.write(str(hashgen([srcUUID])) + '\t'
                      + str(hashgen([dstUUID])) + '\t'
                      + str(srcVal) + ':' + str(dstVal)
                      + ':' + str(edgetype)
                      + ':' + str(timestamp) + '\t' + '\n')

    try:
        for cfrec in parser:
            pb.update()
            for relation, src_key, dst_key in relations:
                if relation in cfrec:
                    emit(relation, cfrec[relation], src_key, dst_key)
    finally:
        # Release the progress bar even if a record raises.
        pb.close()