def test():
    """Test for DataFrame Stats module"""
    import os
    from bat.utils import file_utils

    # Locate the test dataset relative to this file
    test_data_dir = file_utils.relative_dir(__file__, 'test_data')
    csv_path = os.path.join(test_data_dir, 'g_test_data.csv')

    # Load the CSV into a DataFrame and show a preview
    df = pd.read_csv(csv_path)
    print(df.head())

    # Exercise each statistics helper on the (name, status) columns
    print('\nContingency Table')
    print(contingency_table(df, 'name', 'status'))

    print('\nJoint Distribution Table')
    print(joint_distribution(df, 'name', 'status'))

    print('\nExpected Counts Table')
    print(expected_counts(df, 'name', 'status'))

    print('\nG-Test Scores')
    print(g_test_scores(df, 'name', 'status'))
def test():
    """Test for FileTailer Python Class"""

    # Grab a test file
    data_path = file_utils.relative_dir(__file__, '../../data')
    test_path = os.path.join(data_path, 'http.log')
    print('Opening Data File: {:s}'.format(test_path))

    # Create the Class, first with no tailing: read the whole file and stop
    tailer = FileTailer(test_path, tail=False)
    for line in tailer.readlines():
        print(line)
    print('Read with NoTail Test successful!')

    # Now include tailing (note: as an automated test this needs to timeout quickly)
    try:
        from interruptingcow import timeout

        # Spin up the class with tailing enabled (tail defaults to True)
        tailer = FileTailer(test_path)

        # Tail the file for 2 seconds and then quit
        try:
            with timeout(2, exception=RuntimeError):
                for line in tailer.readlines():
                    print(line)
        except RuntimeError:  # InterruptingCow raises a RuntimeError on timeout
            print('Tailing Test successful!')
    except ImportError:
        # Fix: the optional module is named 'interruptingcow', not 'interruptcow'
        print('Tailing Test not run, need interruptingcow module...')
def test():
    """Test for LogToDataFrame Class"""
    import os
    pd.set_option('display.width', 1000)
    from bat.utils import file_utils

    # Locate the bundled test data directory
    data_dir = file_utils.relative_dir(__file__, '../data')

    # Convert a populated log and inspect head + dtypes
    log_file = os.path.join(data_dir, 'http.log')
    frame = LogToDataFrame(log_file)
    print(frame.head())
    print(frame.dtypes)

    # A log with header/close but no data rows must also convert cleanly
    empty_log = os.path.join(data_dir, 'http_empty.log')
    frame = LogToDataFrame(empty_log)
    print(frame.head())
    print(frame.dtypes)

    print('LogToDataFrame Test successful!')
def test():
    """Test for BroLogReader Python Class"""
    import pytest

    # Grab the test data directory
    data_path = file_utils.relative_dir(__file__, '../data')

    # For each file, create the Class and test the reader (no tailing)
    files = ['app_stats.log', 'conn.log', 'dhcp.log', 'dns.log', 'files.log', 'ftp.log',
             'http.log', 'notice.log', 'smtp.log', 'ssl.log', 'weird.log', 'x509.log']
    for bro_log in files:
        test_path = os.path.join(data_path, bro_log)
        print('Opening Data File: {:s}'.format(test_path))
        reader = BroLogReader(test_path, tail=False)
        for line in reader.readrows():
            print(line)
    print('Read with NoTail Test successful!')

    # Test an empty log (a log with header/close but no data rows)
    test_path = os.path.join(data_path, 'http_empty.log')
    reader = BroLogReader(test_path)
    for line in reader.readrows():
        print(line)

    # Test some of the error conditions in make_dict's type conversion
    reader.field_names = ['good', 'error']
    reader.type_converters = [int, lambda x: datetime.datetime.fromtimestamp(float(x))]
    reader.make_dict([5, '0, .5, .5'])

    # An invalid file path must raise IOError
    with pytest.raises(IOError):
        BroLogReader('nowhere.log')

    # Now include tailing (note: as an automated test this needs to timeout quickly)
    try:
        from interruptingcow import timeout

        # Spin up the class with tailing enabled
        tailer = BroLogReader(test_path, tail=True)

        # Tail the file for 2 seconds and then quit
        try:
            with timeout(2, exception=RuntimeError):
                for line in tailer.readrows():
                    print(line)
        except RuntimeError:  # InterruptingCow raises a RuntimeError on timeout
            print('Tailing Test successful!')
    except ImportError:
        # Fix: the optional module is named 'interruptingcow', not 'interruptcow'
        print('Tailing Test not run, need interruptingcow module...')
def test():
    """Test the DirWatcher Class"""
    watch_path = file_utils.relative_dir(__file__, '../../data')
    print('Watching Directory: %s' % watch_path)
    DirWatcher(watch_path, my_callback)

    # Create a file and then delete it, so the watcher has events to report
    temp_file = os.path.join(watch_path, 'test.tmp')
    # Fix: use a context manager instead of open(...).close() so the handle is
    # released even if the write-open itself partially fails
    with open(temp_file, 'w'):
        pass
    time.sleep(1)
    os.remove(temp_file)
def test():
    """Test for LiveSimulator Python Class"""

    # Locate the bundled conn.log test file
    data_dir = file_utils.relative_dir(__file__, '../data')
    conn_log = os.path.join(data_dir, 'conn.log')
    print('Opening Data File: {:s}'.format(conn_log))

    # Replay at most 10 rows through the simulator and print each one
    sim = LiveSimulator(conn_log, max_rows=10)
    for row in sim.readrows():
        print(row)
    print('Read with max_rows Test successful!')
def test():
    """Test for methods in this file"""
    import os
    pd.set_option('display.width', 1000)
    from bat.dataframe_to_parquet import parquet_to_df
    from bat.log_to_dataframe import LogToDataFrame
    from bat.utils import file_utils
    import tempfile

    # Locate the dns.log test file
    data_dir = file_utils.relative_dir(__file__, '../data')
    dns_log = os.path.join(data_dir, 'dns.log')

    # Convert the log to a Pandas DataFrame and show a preview
    converter = LogToDataFrame()
    dns_df = converter.create_dataframe(dns_log)
    print(dns_df.head())

    # Round-trip the log through a temporary parquet file
    parquet_file = tempfile.NamedTemporaryFile(delete=False).name
    log_to_parquet(dns_log, parquet_file)
    round_trip_df = parquet_to_df(parquet_file)
    os.remove(parquet_file)
    print(round_trip_df.head())

    # Make sure our conversions didn't lose type info
    # Note: This is no longer going to work
    # See: https://issues.apache.org/jira/browse/ARROW-5379
    # assert(dns_df.dtypes.values.tolist() == round_trip_df.dtypes.values.tolist())

    # An empty log (header/close but no data rows) must also round-trip
    empty_log = os.path.join(data_dir, 'http_empty.log')
    parquet_file = tempfile.NamedTemporaryFile(delete=False).name
    log_to_parquet(empty_log, parquet_file)
    parquet_to_df(parquet_file)
    os.remove(parquet_file)

    print('DataFrame to Parquet Tests successful!')
def test():
    """Test for BroMultiLogReader Python Class"""
    from bat.utils import file_utils

    # Locate the bundled test data directory
    data_dir = file_utils.relative_dir(__file__, '../data')

    # Exercise the reader on a gzipped log and on glob patterns
    for log_name in ('http.log.gz', 'dhcp*.log', 'dhcp*.log.gz'):
        log_path = os.path.join(data_dir, log_name)
        print('Opening Data File: {:s}'.format(log_path))
        multi_reader = BroMultiLogReader(log_path)
        for row in multi_reader.readrows():
            print(row)

    print('Tests successful!')
def test():
    """Test for methods in this file"""
    import os
    pd.set_option('display.width', 1000)
    from bat.dataframe_to_parquet import parquet_to_df
    from bat.log_to_dataframe import LogToDataFrame
    from bat.utils import file_utils
    import tempfile

    # Locate the dns.log test file
    data_dir = file_utils.relative_dir(__file__, '../data')
    dns_log = os.path.join(data_dir, 'dns.log')

    # Convert the log to a Pandas DataFrame and show a preview
    dns_df = LogToDataFrame(dns_log)
    print(dns_df.head())

    # Round-trip the log through a temporary parquet file
    parquet_file = tempfile.NamedTemporaryFile(delete=False).name
    log_to_parquet(dns_log, parquet_file)
    round_trip_df = parquet_to_df(parquet_file)
    os.remove(parquet_file)
    print(round_trip_df.head())

    # The round trip must not lose any dtype information
    assert (dns_df.dtypes.values.tolist() == round_trip_df.dtypes.values.tolist())

    # An empty log (header/close but no data rows) must also round-trip
    empty_log = os.path.join(data_dir, 'http_empty.log')
    parquet_file = tempfile.NamedTemporaryFile(delete=False).name
    log_to_parquet(empty_log, parquet_file)
    parquet_to_df(parquet_file)
    os.remove(parquet_file)

    print('DataFrame to Parquet Tests successful!')
def test():
    """Test for LogToDataFrame Class"""
    import os
    pd.set_option('display.width', 1000)
    from bat.utils import file_utils

    # Locate the bundled test data directory
    data_dir = file_utils.relative_dir(__file__, '../data')

    # Convert conn.log and inspect head + dtypes
    conn_log = os.path.join(data_dir, 'conn.log')
    converter = LogToDataFrame()
    frame = converter.create_dataframe(conn_log)
    print(frame.head())
    print(frame.dtypes)

    # Run the converter over the full suite of sample logs
    sample_logs = ['app_stats.log', 'dns.log', 'http.log', 'notice.log', 'tor_ssl.log',
                   'conn.log', 'dhcp_002.log', 'files.log', 'smtp.log', 'weird.log',
                   'ftp.log', 'ssl.log', 'x509.log']
    for log_path in (os.path.join(data_dir, name) for name in sample_logs):
        print('Testing: {:s}...'.format(log_path))
        frame = converter.create_dataframe(log_path)
        print(frame.head())
        print(frame.dtypes)

    # A log with header/close but no data rows must also convert cleanly
    empty_log = os.path.join(data_dir, 'http_empty.log')
    frame = converter.create_dataframe(empty_log)
    print(frame.head())
    print(frame.dtypes)

    print('LogToDataFrame Test successful!')
def test():
    """Test for methods in this file"""
    import os
    pd.set_option('display.width', 1000)
    from bat.dataframe_to_parquet import parquet_to_df
    from bat.log_to_dataframe import LogToDataFrame
    from bat.utils import file_utils
    import tempfile

    # Locate the dns.log test file
    data_dir = file_utils.relative_dir(__file__, '../data')
    dns_log = os.path.join(data_dir, 'dns.log')

    # Convert the log to a Pandas DataFrame and show a preview
    dns_df = LogToDataFrame(dns_log)
    # dns_df.reset_index(inplace=True)
    print(dns_df.head())

    # Round-trip the log through a temporary parquet file
    parquet_file = tempfile.NamedTemporaryFile(delete=False).name
    log_to_parquet(dns_log, parquet_file)
    round_trip_df = parquet_to_df(parquet_file)
    os.remove(parquet_file)
    print(round_trip_df.head())

    # Make sure our conversions didn't lose type info
    # TODO: Uncomment this test when the following PR is fixed
    # - TimeDelta Support: https://issues.apache.org/jira/browse/ARROW-835
    # assert(dns_df.dtypes.values.tolist() == round_trip_df.dtypes.values.tolist())

    print('DataFrame to Parquet Tests successful!')