import json

from mitmproxy import flow  # on legacy 0.x releases: from libmproxy import flow


def har_write(input_filename, output_filename):
    # Dump files are binary tnetstring streams, so read in "rb" mode.
    with open(input_filename, "rb") as input_file, \
            open(output_filename, 'w') as output_file:
        flow_reader = flow.FlowReader(input_file)
        flows = [fl for fl in flow_reader.stream() if fl.response]
        har = create_har(flows)
        output_file.write(json.dumps(har))
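# A minimal sketch of the create_har helper referenced above. This is
# hypothetical, not the original implementation; a complete version would
# also fill in timings, headers, and body entries per the HAR 1.2 spec.
def create_har(flows):
    entries = []
    for f in flows:
        entries.append({
            "request": {"method": f.request.method, "url": f.request.url},
            "response": {"status": f.response.status_code},
        })
    return {
        "log": {
            "version": "1.2",
            "creator": {"name": "har_write", "version": "0.1"},
            "entries": entries,
        }
    }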
def test_write(self):
    with tutils.tmpdir() as d:
        p = os.path.join(d, "a")
        self.dummy_cycle(
            self.mkmaster(None, outfile=(p, "wb"), verbosity=0),
            1, b""
        )
        assert len(list(flow.FlowReader(open(p, "rb")).stream())) == 1
def test_error(self):
    sio = StringIO()
    sio.write("bogus")
    sio.seek(0)
    r = flow.FlowReader(sio)
    tutils.raises(flow.FlowReadError, list, r.stream())

    f = flow.FlowReadError("foo")
    assert f.strerror == "foo"
def test_error(self):
    sio = StringIO()
    sio.write("bogus")
    sio.seek(0)
    r = flow.FlowReader(sio)
    tutils.raises(FlowReadException, list, r.stream())

    f = FlowReadException("foo")
    assert str(f) == "foo"
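# The two test_error variants above track an API rename: older releases raise
# flow.FlowReadError (message in .strerror), newer ones FlowReadException
# (message via str()). A hedged compatibility import for scripts that must
# support both; the fallback module path assumes the legacy libmproxy package:
try:
    from mitmproxy.exceptions import FlowReadException
except ImportError:  # fall back to the legacy name
    from libmproxy.flow import FlowReadError as FlowReadException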
def test_versioncheck(self):
    f = tutils.tflow()
    d = f.get_state()
    d["version"] = (0, 0)
    sio = StringIO()
    tnetstring.dump(d, sio)
    sio.seek(0)
    r = flow.FlowReader(sio)
    tutils.raises("version", list, r.stream())
def _treader(self):
    sio = StringIO()
    w = flow.FlowWriter(sio)
    for i in range(3):
        f = tutils.tflow(resp=True)
        w.add(f)
    for i in range(3):
        f = tutils.tflow(err=True)
        w.add(f)
    sio.seek(0)
    return flow.FlowReader(sio)
def test_roundtrip(self):
    sio = StringIO()
    f = tutils.tflow()
    f.request.content = "".join(chr(i) for i in range(255))
    w = flow.FlowWriter(sio)
    w.add(f)

    sio.seek(0)
    r = flow.FlowReader(sio)
    l = list(r.stream())
    assert len(l) == 1

    f2 = l[0]
    assert f2.get_state() == f.get_state()
    assert f2.request == f.request
def test_filter(self):
    sio = StringIO()
    fl = filt.parse("~c 200")
    w = flow.FilteredFlowWriter(sio, fl)

    f = tutils.tflow(resp=True)
    f.response.status_code = 200
    w.add(f)

    f = tutils.tflow(resp=True)
    f.response.status_code = 201
    w.add(f)

    sio.seek(0)
    r = flow.FlowReader(sio)
    assert len(list(r.stream()))
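# Usage sketch for FilteredFlowWriter outside the test harness: persist only
# flows matching a filter expression ("~c 200" matches responses with HTTP
# status 200). write_matching and its parameters are illustrative names, not
# part of mitmproxy.
def write_matching(flows, path, expr="~c 200"):
    fl = filt.parse(expr)
    with open(path, "wb") as fp:
        w = flow.FilteredFlowWriter(fp, fl)
        for f in flows:
            w.add(f)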
def test_roundtrip(self):
    sio = io.BytesIO()
    f = tutils.tflow()
    f.marked = True
    f.request.content = bytes(bytearray(range(256)))
    w = flow.FlowWriter(sio)
    w.add(f)

    sio.seek(0)
    r = flow.FlowReader(sio)
    l = list(r.stream())
    assert len(l) == 1

    f2 = l[0]
    assert f2.get_state() == f.get_state()
    assert f2.request == f.request
    assert f2.marked
def r():
    r = flow.FlowReader(open(p, "rb"))
    return list(r.stream())
import json
import random
import string
import sys
import urllib

from libmproxy import flow  # legacy package name; newer releases use mitmproxy

# table_header (the Bro-style column header line) and get_bytes() are defined
# elsewhere in this module.


def run(dump_files):
    num_http = 0
    num_https = 0
    err_cnt = 0
    contacted_domains = dict()

    for dump_file in dump_files:
        print dump_file
        json_file_prefix = dump_file[:-5]
        flow_json_http = []
        flow_json_https = []
        http_json_file_name = '%s_http.log' % (json_file_prefix)
        https_json_file_name = '%s_https.log' % (json_file_prefix)

        with open(dump_file, "rb") as logfile:
            f_reader = flow.FlowReader(logfile)
            line_http = 0
            line_https = 0
            for f in f_reader.stream():
                try:
                    ts = f.request.timestamp_start
                    # Fake a Bro-style connection UID.
                    bro_uid = 'S{0}'.format(''.join(
                        random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits)
                        for _ in range(17)))
                    id_orig_h = f.client_conn.address.address[0]
                    id_orig_p = f.client_conn.address.address[1]
                    id_resp_h = f.request.host
                    id_resp_p = f.request.port
                    method = f.request.method

                    host = f.request.host
                    if 'host' in f.request.headers:
                        host = f.request.headers['host']
                    if host not in contacted_domains:
                        contacted_domains[host] = 0
                    contacted_domains[host] += 1

                    uri = f.request.path
                    referrer = ''
                    # The header name is (mis)spelled "referer" on the wire.
                    if 'referer' in f.request.headers:
                        referrer = f.request.headers['referer']
                    user_agent = '-'
                    if 'User-Agent' in f.request.headers:
                        user_agent = f.request.headers['User-Agent']
                    status_code = f.response.status_code
                    status_msg = f.response.reason
                    request_body_len, response_body_len = get_bytes(f)

                    # Fields we don't extract are filled with Bro-style
                    # placeholders.
                    trans_depth = '-'
                    info_code = '-'
                    info_msg = '-'
                    filename = '-'
                    tags = '(empty)'
                    username = '******'
                    password = '******'
                    proxied = '-'
                    orig_fuids = '-'
                    orig_mime_types = '-'
                    resp_fuids = '-'
                    resp_mime_types = '-'

                    content_length = 0
                    content_encoding = '-'
                    content_type = '-'
                    if 'content-type' in f.request.headers:
                        content_type = f.request.headers['content-type']
                    if 'content-length' in f.request.headers:
                        content_length = f.request.headers['content-length']
                    transfer_encoding = '-'

                    post_body = '-'
                    if f.request.content is not None:
                        # URL-quote so the body stays on a single line of the
                        # log; undo the quoting when decoding.
                        post_body = urllib.quote(str(f.request.content))

                    # headers
                    client_header_names = ''
                    client_header_values = ''
                    for hk in f.request.headers:
                        hv = f.request.headers[hk]
                        client_header_names += '%s,' % hk
                        try:
                            hv = urllib.quote(hv)
                        finally:
                            client_header_values += '%s,' % hv

                    server_header_names = ''
                    server_header_values = ''
                    for hk in f.response.headers:
                        hv = f.response.headers[hk]
                        server_header_names += '%s,' % hk
                        try:
                            hv = urllib.quote(hv)
                        finally:
                            server_header_values += '%s,' % hv

                    http_entry = ''
                    http_entry += '%.6f\t%s\t%s\t%s\t%s\t%s\t' % (ts, bro_uid, id_orig_h, id_orig_p, id_resp_h, id_resp_p)
                    http_entry += '%s\t%s\t%s\t%s\t%s\t%s\t' % (trans_depth, method, host, uri, referrer, user_agent)
                    http_entry += '%s\t%s\t%s\t%s\t%s\t%s\t' % (request_body_len, response_body_len, status_code, status_msg, info_code, info_msg)
                    http_entry += '%s\t%s\t%s\t%s\t%s\t%s\t' % (filename, tags, username, password, proxied, orig_fuids)
                    http_entry += '%s\t%s\t%s\t%s\t%s\t%s\t' % (orig_mime_types, resp_fuids, resp_mime_types, content_length, content_encoding, content_type)
                    http_entry += '%s\t%s\t%s\t%s\t%s\t%s' % (transfer_encoding, post_body, client_header_names, client_header_values, server_header_names, server_header_values)

                    if f.client_conn.ssl_established:
                        line_https += 1
                        flow_json_https.append(http_entry)
                    else:
                        line_http += 1
                        flow_json_http.append(http_entry)
                except flow.FlowReadError as v:
                    print "Flow file corrupted. Stopped loading."
                    print v.message
                    err_cnt += 1
                    json.dump(f.get_state(), sys.stdout, indent=4)

        num_http += len(flow_json_http)
        num_https += len(flow_json_https)

        if num_http > 0:
            with open(http_json_file_name, 'w') as hf:
                hf.write(table_header + '\n')
                for he in flow_json_http:
                    hf.write('%s\n' % he)
        if num_https > 0:
            with open(https_json_file_name, 'w') as hf:
                hf.write(table_header + '\n')
                for he in flow_json_https:
                    hf.write('%s\n' % he)

    result = dict()
    result['num_http'] = num_http
    result['num_https'] = num_https
    result['num_errors'] = err_cnt
    result['contacted'] = contacted_domains
    result['num_domains'] = len(contacted_domains)
    return result
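# Illustrative entry point for run() above (not in the original): pass dump
# file paths on the command line and print the aggregate summary.
if __name__ == '__main__':
    summary = run(sys.argv[1:])
    print json.dumps(summary, indent=4)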
import base64
import os
import shutil
import tempfile
from hashlib import md5, sha1

import patoolib
import ssdeep

# Assumed module-level setup, inferred from usage (hedged): Flow aliases
# mitmproxy's flow module so the loop variable `flow` can shadow it safely,
# and mime is a python-magic instance, e.g.:
#   from mitmproxy import flow as Flow
#   import magic; mime = magic.Magic(mime=True)


def analyse_https(data, https_file):
    # NOTE: data here is not aggregated at the domain layer. We just copy all
    # the requests into the dictionary.
    if not isinstance(data, dict):
        raise Exception("This method requires a dictionary as data")

    # Check if the capture file exists
    if not os.path.exists(https_file):
        raise Exception("PCAP File does not exist.")
    else:
        https_file = os.path.abspath(https_file)

    # Extract https_requests
    requests = []
    downloads = []
    with open(https_file, "rb") as logfile:
        freader = Flow.FlowReader(logfile)

        # For each flow, get the relative request
        for flow in freader.stream():
            r = dict()
            r['first_line_format'] = str(flow.request.first_line_format)
            r['method'] = flow.request.method
            r['scheme'] = flow.request.scheme
            r['host'] = flow.request.host
            r['hostname'] = flow.request.pretty_host
            r['port'] = flow.request.port
            r['path'] = flow.request.path
            r['http_version'] = flow.request.http_version
            # r['headers'] = flow.request.data.headers
            # We also log the contents of the request. This might cause the
            # log file to grow...
            r['content'] = base64.encodestring(flow.request.content)
            r['timestamp_start'] = flow.request.timestamp_start
            r['timestamp_end'] = flow.request.timestamp_end
            r['fullpath'] = flow.request.url
            requests.append(r)

            if (flow.response is not None) and (flow.response.content is not None) \
                    and len(flow.response.content) > 0:
                # In order to analyze the response content, we need to store
                # it on disk
                with tempfile.NamedTemporaryFile(delete=True) as fp:
                    fp.write(flow.response.content)
                    fp.flush()

                    # Define a recursive function used whenever the file is an
                    # archive, so we can deep-inspect all the contained files
                    def recursive_analysis(flow, fname, parent_archive_sha1, nest_level, downloads):
                        m_type = mime.from_file(fname)
                        size = os.path.getsize(fname)
                        with open(fname, 'rb') as f:  # binary mode for hashing
                            # Retrieve basic info about this file, such as
                            # MD5, SHA1, etc.
                            m = md5()
                            s = sha1()
                            f.seek(0, 0)
                            for chunk in iter(lambda: f.read(4096), b""):
                                m.update(chunk)
                                s.update(chunk)
                            str_md5 = m.hexdigest()
                            str_sha1 = s.hexdigest()
                            str_fuzzy = ssdeep.hash_from_file(f.name)

                            # Store info on the referenced dict
                            d = dict()
                            d['host'] = flow.request.host
                            d['hostname'] = flow.request.pretty_host
                            d['port'] = flow.request.port
                            d['path'] = flow.request.path
                            d['scheme'] = flow.request.scheme
                            d['method'] = flow.request.method
                            # d['request_headers'] = flow.request.headers
                            d['status_code'] = flow.response.status_code
                            d['fullpath'] = r['fullpath']
                            # d['response_headers'] = flow.response.status_code
                            d['sha1'] = str_sha1.lower()
                            d['md5'] = str_md5.lower()
                            d['fuzzy'] = str_fuzzy.lower()
                            d['mime'] = m_type
                            d['size'] = size
                            d['parent_archive'] = parent_archive_sha1
                            d['nest_level'] = nest_level
                            downloads.append(d)

                            # Try to extract the file if it is an archive
                            tmpdir = tempfile.mkdtemp()
                            try:
                                # Brute-force approach: we don't even check the
                                # mime type; we try to unpack every archive.
                                patoolib.extract_archive(f.name, outdir=tmpdir)
                                # Analyze each extracted file
                                files = [
                                    os.path.join(tmpdir, ff)
                                    for ff in os.listdir(tmpdir)
                                    if os.path.isfile(os.path.join(tmpdir, ff))
                                ]
                                for ff in files:
                                    recursive_analysis(flow, ff, str_sha1, nest_level + 1, downloads)
                            except:
                                pass
                            finally:
                                # Remove the temporary extraction directory
                                shutil.rmtree(tmpdir)

                    recursive_analysis(flow, fp.name, None, 0, downloads)

    # Assign data to the dictionary
    data['https_requests'] = requests
    data['https_downloads'] = downloads
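# Hedged usage sketch for analyse_https; the capture file name is a
# placeholder, and the function fills the supplied dict in place.
data = {}
analyse_https(data, "capture.mitm")
print '%d requests, %d downloads' % (len(data['https_requests']), len(data['https_downloads']))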
def test_write_append(self):
    with tutils.tmpdir() as d:
        p = os.path.join(d, "a.append")
        self._dummy_cycle(1, None, "", outfile=(p, "wb"), verbosity=0)
        self._dummy_cycle(1, None, "", outfile=(p, "ab"), verbosity=0)
        assert len(list(flow.FlowReader(open(p, "rb")).stream())) == 2