def test():
    """Test for methods in this file"""
    import os
    import tempfile

    from bat.dataframe_to_parquet import parquet_to_df
    from bat.log_to_dataframe import LogToDataFrame
    from bat.utils import file_utils

    pd.set_option('display.width', 1000)

    def _round_trip(source_log):
        """Write source_log to a temporary parquet file and read it back."""
        # Only the path is needed, so close the handle immediately; whether a
        # still-open named temp file can be reopened by name varies by platform.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            filename = tmp.name
        try:
            log_to_parquet(source_log, filename)
            return parquet_to_df(filename)
        finally:
            # Always clean up, even when the conversion or the read fails
            os.remove(filename)

    # Grab a test file
    data_path = file_utils.relative_dir(__file__, '../data')
    log_path = os.path.join(data_path, 'dns.log')

    # Convert the log to a Pandas DataFrame
    log_to_df = LogToDataFrame()
    dns_df = log_to_df.create_dataframe(log_path)

    # Print out the head
    print(dns_df.head())

    # Write to a parquet file and read it back
    new_dns_df = _round_trip(log_path)

    # Print out the head
    print(new_dns_df.head())

    # Make sure our conversions didn't lose type info
    # Note: This is no longer going to work
    # See: https://issues.apache.org/jira/browse/ARROW-5379
    # assert(dns_df.dtypes.values.tolist() == new_dns_df.dtypes.values.tolist())

    # Test an empty log (a log with header/close but no data rows)
    test_path = os.path.join(data_path, 'http_empty.log')
    _round_trip(test_path)

    print('DataFrame to Parquet Tests successful!')
def test():
    """Test for methods in this file"""
    import os
    import tempfile

    from bat.dataframe_to_parquet import parquet_to_df
    from bat.log_to_dataframe import LogToDataFrame
    from bat.utils import file_utils

    pd.set_option('display.width', 1000)

    def _round_trip(source_log):
        """Write source_log to a temporary parquet file and read it back."""
        # Only the path is needed, so close the handle immediately; whether a
        # still-open named temp file can be reopened by name varies by platform.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            filename = tmp.name
        try:
            log_to_parquet(source_log, filename)
            return parquet_to_df(filename)
        finally:
            # Always clean up, even when the conversion or the read fails
            os.remove(filename)

    # Grab a test file
    data_path = file_utils.relative_dir(__file__, '../data')
    test_path = os.path.join(data_path, 'dns.log')

    # Convert the log to a Pandas DataFrame
    dns_df = LogToDataFrame(test_path)

    # Print out the head
    print(dns_df.head())

    # Write to a parquet file and read it back
    new_dns_df = _round_trip(test_path)

    # Print out the head
    print(new_dns_df.head())

    # Make sure our conversions didn't lose type info
    assert (dns_df.dtypes.values.tolist() == new_dns_df.dtypes.values.tolist())

    # Test an empty log (a log with header/close but no data rows)
    test_path = os.path.join(data_path, 'http_empty.log')
    _round_trip(test_path)

    print('DataFrame to Parquet Tests successful!')
def test():
    """Test for methods in this file"""
    import os
    import tempfile

    from bat.dataframe_to_parquet import parquet_to_df
    from bat.log_to_dataframe import LogToDataFrame
    from bat.utils import file_utils

    pd.set_option('display.width', 1000)

    # Grab a test file
    data_path = file_utils.relative_dir(__file__, '../data')
    test_path = os.path.join(data_path, 'dns.log')

    # Convert the log to a Pandas DataFrame
    dns_df = LogToDataFrame(test_path)
    # dns_df.reset_index(inplace=True)

    # Print out the head
    print(dns_df.head())

    # Create a temporary file. Only the path is needed, so close the handle
    # immediately; whether a still-open named temp file can be reopened by
    # name varies by platform.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        filename = tmp.name

    try:
        # Write to a parquet file
        log_to_parquet(test_path, filename)

        # Read from the parquet file
        new_dns_df = parquet_to_df(filename)
    finally:
        # Remove temp file, even when the conversion or the read fails
        os.remove(filename)

    # Print out the head
    print(new_dns_df.head())

    # Make sure our conversions didn't lose type info
    # TODO: Uncomment this test when the following PR is fixed
    #       - TimeDelta Support: https://issues.apache.org/jira/browse/ARROW-835
    # assert(dns_df.dtypes.values.tolist() == new_dns_df.dtypes.values.tolist())

    print('DataFrame to Parquet Tests successful!')