def test():
    """Test for methods in this file"""
    import os
    import tempfile
    import pandas as pd
    pd.set_option('display.width', 1000)
    from bat.dataframe_to_parquet import parquet_to_df
    from bat.log_to_dataframe import LogToDataFrame
    from bat.utils import file_utils

    # Grab a test file
    data_path = file_utils.relative_dir(__file__, '../data')
    log_path = os.path.join(data_path, 'dns.log')

    # Convert the log to a Pandas DataFrame
    log_to_df = LogToDataFrame()
    dns_df = log_to_df.create_dataframe(log_path)

    # Print out the head
    print(dns_df.head())

    # Create a temporary file
    filename = tempfile.NamedTemporaryFile(delete=False).name

    # Write to a parquet file
    log_to_parquet(log_path, filename)

    # Read from the parquet file
    new_dns_df = parquet_to_df(filename)

    # Remove temp file
    os.remove(filename)

    # Print out the head
    print(new_dns_df.head())

    # Make sure our conversions didn't lose type info
    # Note: This check no longer works
    # See: https://issues.apache.org/jira/browse/ARROW-5379
    # assert(dns_df.dtypes.values.tolist() == new_dns_df.dtypes.values.tolist())

    # Test an empty log (a log with header/close but no data rows)
    test_path = os.path.join(data_path, 'http_empty.log')
    filename = tempfile.NamedTemporaryFile(delete=False).name
    log_to_parquet(test_path, filename)
    parquet_to_df(filename)
    os.remove(filename)

    print('DataFrame to Parquet Tests successful!')
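# The test above exercises log_to_parquet() and parquet_to_df() from the same
# module. A minimal sketch of what those two helpers might look like, assuming
# pyarrow is installed; this is an illustration, not bat's actual implementation.
import pyarrow as pa
import pyarrow.parquet as pq
from bat.log_to_dataframe import LogToDataFrame


def log_to_parquet(log_path, parquet_path):
    """Convert a Bro log to a Parquet file (sketch)."""
    df = LogToDataFrame().create_dataframe(log_path)
    pq.write_table(pa.Table.from_pandas(df), parquet_path)


def parquet_to_df(parquet_path):
    """Read a Parquet file back into a pandas DataFrame (sketch)."""
    return pq.read_table(parquet_path).to_pandas()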
def bro_log_to_df(file_path):
    """Load a Bro log into a pandas DataFrame object.

       :param file_path: Log file
       :return: pandas DataFrame object
    """
    return LogToDataFrame(file_path)
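# A minimal usage sketch for bro_log_to_df(), assuming the older bat API in
# which LogToDataFrame(path) itself returns a DataFrame; 'dns.log' is a
# placeholder path for illustration.
dns_df = bro_log_to_df('dns.log')
print(dns_df.head())
print(dns_df.dtypes)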
def test():
    """Test for methods in this file"""
    import os
    import tempfile
    import pandas as pd
    pd.set_option('display.width', 1000)
    from bat.dataframe_to_parquet import parquet_to_df
    from bat.log_to_dataframe import LogToDataFrame
    from bat.utils import file_utils

    # Grab a test file
    data_path = file_utils.relative_dir(__file__, '../data')
    test_path = os.path.join(data_path, 'dns.log')

    # Convert the log to a Pandas DataFrame
    dns_df = LogToDataFrame(test_path)

    # Print out the head
    print(dns_df.head())

    # Create a temporary file
    filename = tempfile.NamedTemporaryFile(delete=False).name

    # Write to a parquet file
    log_to_parquet(test_path, filename)

    # Read from the parquet file
    new_dns_df = parquet_to_df(filename)

    # Remove temp file
    os.remove(filename)

    # Print out the head
    print(new_dns_df.head())

    # Make sure our conversions didn't lose type info
    assert dns_df.dtypes.values.tolist() == new_dns_df.dtypes.values.tolist()

    # Test an empty log (a log with header/close but no data rows)
    test_path = os.path.join(data_path, 'http_empty.log')
    filename = tempfile.NamedTemporaryFile(delete=False).name
    log_to_parquet(test_path, filename)
    parquet_to_df(filename)
    os.remove(filename)

    print('DataFrame to Parquet Tests successful!')
def test():
    """Test for methods in this file"""
    import os
    import tempfile
    import pandas as pd
    pd.set_option('display.width', 1000)
    from bat.dataframe_to_parquet import parquet_to_df
    from bat.log_to_dataframe import LogToDataFrame
    from bat.utils import file_utils

    # Grab a test file
    data_path = file_utils.relative_dir(__file__, '../data')
    test_path = os.path.join(data_path, 'dns.log')

    # Convert the log to a Pandas DataFrame
    dns_df = LogToDataFrame(test_path)
    # dns_df.reset_index(inplace=True)

    # Print out the head
    print(dns_df.head())

    # Create a temporary file
    filename = tempfile.NamedTemporaryFile(delete=False).name

    # Write to a parquet file
    log_to_parquet(test_path, filename)

    # Read from the parquet file
    new_dns_df = parquet_to_df(filename)

    # Remove temp file
    os.remove(filename)

    # Print out the head
    print(new_dns_df.head())

    # Make sure our conversions didn't lose type info
    # TODO: Uncomment this test when the following issue is fixed
    # - TimeDelta support: https://issues.apache.org/jira/browse/ARROW-835
    # assert(dns_df.dtypes.values.tolist() == new_dns_df.dtypes.values.tolist())

    print('DataFrame to Parquet Tests successful!')
import pandas as pd
from bat.log_to_dataframe import LogToDataFrame
import plotly.offline as py
import plotly.graph_objs as go

web = LogToDataFrame('http.log')
print(web.host.unique())
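# The plotly imports above are unused in this fragment; a minimal sketch of
# how they might be put to work, charting the most frequent hosts in the
# http.log DataFrame built above (the 'host' column comes from the Bro
# http.log schema; the output filename is a placeholder).
counts = web.host.value_counts().head(10)
fig = go.Figure(data=[go.Bar(x=counts.index, y=counts.values)])
py.plot(fig, filename='top_hosts.html')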
# Collect args from the command line
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--bro-log', type=str, help='Specify a bro log to run BroLogReader test on')
args, commands = parser.parse_known_args()

# Check for unknown args
if commands:
    print('Unrecognized args: %s' % commands)
    sys.exit(1)

# If no args just call help
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)

# File may have a tilde in it
if args.bro_log:
    args.bro_log = os.path.expanduser(args.bro_log)

    # Create a Pandas dataframe from a Bro log
    bro_df = LogToDataFrame(args.bro_log)

    # Print out the head of the dataframe
    print(bro_df.head())

    # Print out the types of the columns
    print(bro_df.dtypes)
parser.add_argument('bro_log', type=str, help='Specify a bro log to run BroLogReader test on')
args, commands = parser.parse_known_args()

# Check for unknown args
if commands:
    print('Unrecognized args: %s' % commands)
    sys.exit(1)

# File may have a tilde in it
if args.bro_log:
    args.bro_log = os.path.expanduser(args.bro_log)

    # Create a Pandas dataframe from a Bro log
    log_to_df = LogToDataFrame()
    bro_df = log_to_df.create_dataframe(args.bro_log)

    # Print out the head of the dataframe
    print(bro_df.head())

    # Print out the types of the columns
    print(bro_df.dtypes)

    # Print out size and memory usage
    print('DF Shape: {:s}'.format(str(bro_df.shape)))
    print('DF Memory:')
    memory_usage = bro_df.memory_usage(deep=True)
    total = memory_usage.sum()
    for item in memory_usage.items():
        print('\t {:s}: \t{:.2f} MB'.format(item[0], item[1] / 1e6))
    print('\t Total: \t{:.2f} MB'.format(total / 1e6))
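# Bro logs are often dominated by repetitive string columns. A minimal sketch
# of trimming the memory usage reported above by converting low-cardinality
# object columns to the pandas 'category' dtype; 'proto' and 'service' are
# assumed conn.log column names, used here for illustration.
for col in ['proto', 'service']:
    if col in bro_df.columns:
        bro_df[col] = bro_df[col].astype('category')
print('Total after conversion: {:.2f} MB'.format(bro_df.memory_usage(deep=True).sum() / 1e6))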
x = datetime.now()
print("Starting script at {}".format(x))
configfile_ok = 1
pp = pprint.PrettyPrinter(indent=4)
new_list = []

brologfile = sys.argv[1]
print('Analyzing Bro File: ', str(brologfile))
# Note: pwd, logfile and the csvfile suffix are assumed to be defined earlier in the script
csvfile = str(pwd) + "/" + str(brologfile).split("/")[0] + "/" + str(brologfile).split("/")[-1] + "_" + str(csvfile) + ".csv"
print("Results at: " + str(csvfile))
print("Logs at: " + str(logfile))

# Create a Pandas dataframe from a Bro log
log_to_df = LogToDataFrame()
bro_df = log_to_df.create_dataframe(brologfile)

# Insert new label columns at the end
columns = len(bro_df.columns)
bro_df.insert(int(columns), "MultiLabel", "", True)
bro_df.insert(int(columns) + 1, "UniLabel", "", True)
bro_df.insert(int(columns) + 2, "Label", "", True)

# Loop over the generated dataframe rows to search for key analysis patterns (such as known malicious ports)
iterrows = bro_df.iterrows()
information = 0
leniterrows = os.popen('wc -l ' + str(brologfile) + ' | cut -d " " -f1 ').read().strip()
print("Analyzing a total of {} lines, this may take several minutes or hours . . .".format(leniterrows))
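# A minimal sketch of the kind of row-labeling loop the comment above
# describes; MALICIOUS_PORTS and the 'id.resp_p' column name are assumptions
# for illustration, not the original script's detection logic.
MALICIOUS_PORTS = {4444, 6667, 31337}
for index, row in iterrows:
    if row.get('id.resp_p') in MALICIOUS_PORTS:
        bro_df.at[index, 'Label'] = 'Malicious'
    else:
        bro_df.at[index, 'Label'] = 'Benign'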
def pull_data(uname, passwd, local_dir="./", device="em2", date=None, export=False):
    server = config['server']
    user = uname
    password = passwd
    bro_dir = '/mnt/localraid/bro/logs'

    # Get date and time. If none provided, defaults to current date
    if not date:
        cdt = dt.datetime.fromtimestamp(time.time())
    else:
        if ':' not in date:
            date += " 12:00:00"
        cdt = parse(date)
    # Use a timedelta for "yesterday" so month/year boundaries are handled correctly
    ydt = cdt - dt.timedelta(days=1)
    datestr = '-'.join([str(ydt.year), str(ydt.month), str(ydt.day)])
    cur_date = '-'.join([str(cdt.year), str(cdt.month), str(cdt.day)])

    sh = ShellHandler(server, user, password)

    # Remove local and remote tmp folder, if it exists
    tmp_folder = "./tmp_{}".format(cur_date)
    sh.execute('cd {}'.format(bro_dir))
    sh.execute('rm -rf {}'.format(tmp_folder))
    os.system('rm -rf {}'.format(tmp_folder))

    # Create new empty tmp folder
    sh.execute('mkdir {}'.format(tmp_folder))

    # Find stats and capture_loss files
    sh.execute('touch --date "{}" /tmp/start'.format(datestr))
    stats_cmd = 'find -type f -newer /tmp/start -name "*stats*"'
    _, stat_files, _ = sh.execute(stats_cmd)
    cap_loss_cmd = 'find -type f -newer /tmp/start -name "*capture_loss*"'
    _, cl_files, _ = sh.execute(cap_loss_cmd)
    all_files = cl_files + stat_files
    all_files = [bro_dir + f[1:].strip() for f in all_files if '.log' in f]
    # Escape colons in filenames for the shell commands below
    all_files = [f.replace(':', r'\:') for f in all_files]

    # Add dates to filenames if necessary to uniquely identify hours from different days
    dates = []
    for l in all_files:
        dates += re.findall(r'([0-9]{4}-[0-9]{2}-[0-9]{2})/', l)
    new_files = []
    for date, f in zip(dates, all_files):
        f_tokens = f.split('/')
        new_filename = tmp_folder + '/' + date + '-' + f_tokens[-1]
        new_files.append(new_filename)
    cp_cmd = ';'.join([' cp {} {}'.format(old, new) for old, new in zip(all_files, new_files)])
    if len(cp_cmd) > 0:
        sh.execute(cp_cmd)

    current_files = ['/mnt/localraid/bro/logs/current/capture_loss.log',
                     '/mnt/localraid/bro/logs/current/stats.log']
    for file in current_files:
        print('cp {} {}'.format(file, tmp_folder))
        sh.execute('cp {} {}'.format(file, tmp_folder))
    traffic_stats_filename = '/trafficStats_v{}_{}.txt'.format(version_number, device)
    traffic_stats_path = '/home/bea3ch/shared/trafficAnalysis' + traffic_stats_filename
    sh.execute('cp {} {}'.format(traffic_stats_path, tmp_folder))

    # Compress tmp folder into a tarball and copy to local
    sh.execute('tar -cvf tarball.tar {}'.format(tmp_folder))
    sh.scp.get(r'{}/tarball.tar'.format(bro_dir), r'./')

    # Remove tarball on server
    sh.execute('rm -rf {} tarball.tar'.format(tmp_folder))

    # Unzip local tarball to get tmp folder
    os.system('tar -xvf ./tarball.tar')
    local_files = os.listdir('{}'.format(tmp_folder))

    # Remove local tarball
    os.system('rm ./tarball.tar')

    # Unzip any remaining gz files in tmp folder
    [os.system('gunzip {}'.format('/'.join([tmp_folder, f]))) for f in local_files if '.gz' in f]

    # Remove any remaining .gz files
    os.system('rm {}/*.gz'.format(tmp_folder))

    # Read in capture loss files
    capture_loss_files = glob.glob('{}/*capture_loss*log'.format(tmp_folder))
    capture_loss_files.sort()
    capture_loss_df = LogToDataFrame(capture_loss_files.pop())
    for file in capture_loss_files:
        try:
            # merge() returns a new DataFrame, so assign the result back
            capture_loss_df = capture_loss_df.merge(LogToDataFrame(file))
        except Exception as e:
            print('Error loading', file + ':', e)

    # Reset index and convert datetimes to unix epochs
    capture_loss_df.reset_index(level=0, inplace=True)
    capture_loss_df.ts = capture_loss_df.ts.map(lambda x: (x - datetime.datetime(1970, 1, 1)).total_seconds())
    capture_loss_df.drop('ts_delta', axis=1, inplace=True)

    # Read in bro stats files
    stats_files = glob.glob('{}/*stats*log'.format(tmp_folder))
    stats_files.sort()
    stats_df = LogToDataFrame(stats_files.pop())
    for file in stats_files:
        try:
            # merge() returns a new DataFrame, so assign the result back
            stats_df = stats_df.merge(LogToDataFrame(file))
        except Exception as e:
            print('Error loading', file + ':', e)

    # Reset index and convert datetimes to unix epochs
    stats_df.reset_index(level=0, inplace=True)
    stats_df.ts = stats_df.ts.map(lambda x: (x - datetime.datetime(1970, 1, 1)).total_seconds())
    # Convert pkt_lag values to strings row-wise
    stats_df.pkt_lag = stats_df.pkt_lag.astype(str)

    # Read in trafficStats csv
    traffic_stats_df = pd.read_csv(tmp_folder + traffic_stats_filename, index_col=False)
    # unique_traffic_stats_timestamps is assumed to be a set defined at module level
    unique_traffic_stats_timestamps.update(traffic_stats_df.ts.unique())

    # Rename [cpu0 -> cpu00], [cpu1 -> cpu01], ..., [cpu9 -> cpu09]
    rename_keys = {}
    for i in range(10):
        rename_keys['cpu' + str(i)] = 'cpu0' + str(i)
    traffic_stats_df = traffic_stats_df.rename(columns=rename_keys)

    if export:
        capture_loss_df.to_csv('capture_loss_comb_{}.csv'.format(datestr))
        stats_df.to_csv('stats_comb_{}.csv'.format(datestr))
        traffic_stats_df.to_csv('trafficStats_comb_{}.csv'.format(datestr))

    return sh, traffic_stats_df, capture_loss_df, stats_df
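# A minimal usage sketch for pull_data(); the credentials and date are
# placeholders, and config/version_number are assumed to be defined at module
# level as the function above expects.
sh, traffic_stats_df, capture_loss_df, stats_df = pull_data(
    'myuser', 'mypassword', date='2019-05-01', export=True)
print(stats_df.head())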