def test_read_csv__from_stream():
    """Verify that read_csv reads from an open text stream and raises on a binary one."""
    bundle_path = Path(__file__).parent / "input" / "bundle.csv"

    with open(bundle_path, "r") as fh:
        blocks = list(read_csv(fh))
    tables = [block for block_type, block in blocks if block_type == BlockType.TABLE]
    assert tables[1].name == "spelling_numbers"

    # Raises exception on common error: stream is not a text stream.
    # The file is opened outside the `raises` block so that a failure to open
    # it cannot be mistaken for the expected reader error.
    with open(bundle_path, "rb") as fh:  # binary stream!
        with raises(Exception):
            list(read_csv(fh))
def test_read_csv__reads_transposed_tables_with_arbitrary_trailing_csv_delimiters():
    """Verify that trailing delimiters do not change how transposed tables are read.

    The same transposed table is written several times, each with a different
    amount of trailing ';' delimiters. Every table read must equal the first.
    """
    transposed_csv = dedent(
        """\
        **transposed*
        all
        diameter; cm; 1.23
        melting_point; K; 273

        **transposed*;
        all;
        diameter; cm; 1.23;;;;;;;
        melting_point; K; 273;

        **transposed*;
        all;
        diameter; cm; 1.23
        melting_point; K; 273;;;;;;;;;;;;;;;;;;;

        **transposed*;;;;;;
        all;;;;;;
        diameter; cm; 1.23;;;;
        melting_point; K; 273;;;;
        """
    )
    blocks = list(read_csv(io.StringIO(transposed_csv)))
    parsed_tables: List[Table] = [
        block for block_type, block in blocks if block_type == BlockType.TABLE
    ]

    reference: Table = parsed_tables[0]
    assert reference.column_names == ["diameter", "melting_point"]
    assert len(reference.df) == 1

    # Each variant (including the reference itself) must compare equal.
    for candidate in parsed_tables:
        assert candidate.equals(reference)
def test_displays_all_error_messages():
    """By default, ParseFixer stops on errors and outputs a message listing all
    encountered errors."""
    import re

    expected_error_msg = dedent(
        """\
        Stopped parsing after 2 errors in table 'farm_cols1' with messages:
        Duplicate column 'flt' at position 4 in table 'farm_cols1'.
        Duplicate column 'flt' at position 5 in table 'farm_cols1'."""
    )

    # `match` is applied as a regular-expression search. Escape the expected
    # text so that its full stops are compared literally rather than acting as
    # "any character" wildcards.
    with raises(ValueError, match=re.escape(expected_error_msg)):
        list(read_csv(input_dir() / "cols1.csv"))
def test_read_csv__sep_is_comma(csv_data):
    """Verify that read_csv reads comma-separated input when given sep=","."""
    comma_separated = csv_data.replace(";", ",")
    blocks = list(read_csv(io.StringIO(comma_separated), sep=","))

    def blocks_of(wanted_type):
        # Collect the payloads of all blocks of one type, in reading order.
        return [block for block_type, block in blocks if block_type == wanted_type]

    tables: List[Table] = blocks_of(BlockType.TABLE)
    template_rows = blocks_of(BlockType.TEMPLATE_ROW)
    metadata_blocks = blocks_of(BlockType.METADATA)

    assert len(metadata_blocks) == 1
    assert len(tables) == 3
    assert tables[0].df["place"][1] == "work"

    third_table: Table = tables[2]
    assert third_table.column_names == ["diameter", "melting_point"]
    assert third_table.df["melting_point"][0] == 273

    assert len(template_rows) == 1
def test_read_csv__successfully_ignores_comments_on_column_name_row():
    """Verify that text after the column names, on the same row, is not read as a column."""
    csv_with_comment = dedent(
        """\
        **places;
        all
        place;distance;ETA;is_hot;;;; --> this is a perfectly legal comment <-- ;
        text;km;datetime;onoff
        home;0.0;2020-08-04 08:00:00;1
        work;1.0;2020-08-04 09:00:00;0
        beach;2.0;2020-08-04 17:00:00;1
        """
    )
    blocks = list(read_csv(io.StringIO(csv_with_comment)))
    parsed_tables: List[Table] = [
        block for block_type, block in blocks if block_type == BlockType.TABLE
    ]

    first_table: Table = parsed_tables[0]
    assert first_table.column_names == ["place", "distance", "ETA", "is_hot"]
def test_custom_fixer():
    """
    Test custom ParseFixer
    Verify that read_csv uses custom ParseFixer
    """

    class PiFixer(ParseFixer):
        """ParseFixer that replaces every illegal float cell with 22/7."""

        # Augment existing method: simple fix for floats, defer to the base
        # class for every other value type.
        def fix_illegal_cell_value(self, vtype, value):
            if vtype == "float":
                return 22.0 / 7.0
            return super().fix_illegal_cell_value(vtype, value)

    approx_pi = 22.0 / 7.0

    fix = PiFixer()
    fix.stop_on_errors = False
    fix._called_from_test = True

    with open(input_dir() / "types3.csv", "r") as fh:
        for block_type, block in read_csv(fh, to="jsondata", fixer=fix):
            if block_type != BlockType.TABLE:
                continue
            columns = block["columns"]
            assert columns["num"]["values"][2] == approx_pi
            assert columns["flt"]["values"][0] == approx_pi
            assert columns["flt2"]["values"][2] == approx_pi

    # Test read_csv with the class (not an instance) of the fixer.
    # The class has default stop_on_errors = True.
    # The file is opened outside `pytest.raises` so that only the reader can
    # satisfy the expected ValueError.
    with open(input_dir() / "types3.csv", "r") as fh:
        with pytest.raises(ValueError):
            for _ in read_csv(fh, to="jsondata", fixer=PiFixer):
                pass
def test_TableBundle_from_file():
    """
    Verify that TableBundle can be generated from top level API methods:
    read_csv, read_excel
    """

    def check_dataframe_bundle(bundle):
        # Expectations for a bundle built with as_dataframe=True.
        assert bundle is not None
        assert len(bundle) == 3
        assert isinstance(bundle[0], TableDataFrame)
        assert bundle.unique("spelling_numbers").spelling[1] == "six"
        assert bundle[1].spelling[0] == "one"
        assert len(bundle.all("places_to_go")) == 2

    def check_table_bundle(bundle):
        # Expectations for a bundle built with as_dataframe=False.
        assert bundle is not None
        assert len(bundle) == 3
        assert isinstance(bundle[1], Table)
        assert bundle.spelling_numbers["spelling"].values[0] == "one"
        assert len(bundle.all("places_to_go")) == 2

    csv_file = input_dir() / "bundle.csv"
    check_dataframe_bundle(TableBundle(read_csv(csv_file), as_dataframe=True))
    check_table_bundle(TableBundle(read_csv(csv_file), as_dataframe=False))

    excel_file = input_dir() / "bundle.xlsx"
    check_table_bundle(TableBundle(read_excel(excel_file), as_dataframe=False))
    check_dataframe_bundle(TableBundle(read_excel(excel_file), as_dataframe=True))
def handle_includes(bg: BlockIterator, input_dir, recursive: bool = False) -> BlockIterator:
    """Replace 'include' directive blocks with the blocks of the files they list.

    Every block from 'bg' is passed through unchanged, except directive blocks
    named "include". Such a block is not emitted; instead each of its lines is
    taken as a file name relative to 'input_dir', the file is read with
    read_csv, and the resulting blocks are yielded in its place.

    No check is done for circular references. For example, if file1.csv
    includes file2.csv, and file2.csv includes file1.csv, then infinite
    recursion ensues upon reading either file with 'recursive' set to True.

    Args:
        bg:
            Iterable of (block type, block) tuples, e.g. as returned by read_csv.
        input_dir:
            Path of directory in which include files are located.
        recursive:
            If True, 'include' directives found in included files are handled
            too, with the same 'input_dir'. If False, blocks of included files
            are yielded as read, including any 'include' directives in them.
            Default is False.

    Yields:
        (block type, block) tuples from...
        * if recursive, the entire tree of files in 'include' directives.
        * if not recursive, the top-level file and those files listed in its
          'include' directives (if any).
    """
    for block_type, block in bg:
        is_include = block_type == BlockType.DIRECTIVE and block.name == "include"
        if not is_include:
            yield block_type, block
            continue

        # Don't emit the include directive itself; emit what it points to.
        for filename in block.lines:
            included_blocks = read_csv(Path(input_dir) / filename)
            if recursive:
                included_blocks = handle_includes(
                    included_blocks, input_dir=input_dir, recursive=recursive
                )
            yield from included_blocks
def test_columns_duplicate():
    """
    Verify that duplicate column names in cols1.csv are corrected when reading
    with the fixer given by custom_test_fixer
    """
    with open(input_dir() / "cols1.csv", "r") as fh:
        blocks = read_csv(fh, fixer=custom_test_fixer)
        # Only the first table is of interest; stop reading once it is found.
        tab = next(
            (block for block_type, block in blocks if block_type == BlockType.TABLE),
            None,
        )

    assert tab is not None
    assert tab.df["flt_fixed_001"] is not None
    assert tab.df["flt_fixed_001"][6] == 7.6
    assert tab.df["flt"][0] == 3.0
def test_read_csv(csv_data):
    """Verify that read_csv reads both non-transposed and transposed tables from a stream."""
    blocks = list(read_csv(io.StringIO(csv_data)))

    def blocks_of(wanted_type):
        # Collect the payloads of all blocks of one type, in reading order.
        return [block for block_type, block in blocks if block_type == wanted_type]

    tables: List[Table] = blocks_of(BlockType.TABLE)
    template_rows = blocks_of(BlockType.TEMPLATE_ROW)
    metadata_blocks = blocks_of(BlockType.METADATA)

    assert len(metadata_blocks) == 1
    assert len(tables) == 3

    # Correctly reads non-transposed table
    first_table = tables[0]
    assert first_table.df["place"][1] == "work"
    assert not first_table.metadata.transposed

    # Correctly reads transposed table
    transposed_table: Table = tables[2]
    assert transposed_table.column_names == ["diameter", "melting_point"]
    assert len(transposed_table.df) == 1
    assert transposed_table.df["melting_point"][0] == 273
    assert len(template_rows) == 1
    assert transposed_table.metadata.transposed
def test_FAT():
    """
    Factory Acceptance Test

    Verify that we are able to read all files in ./input
    Using the fixer given by custom_test_fixer

    Each file must yield exactly one table, equal to the entry stored under the
    file's name in all.json. The exception is all.csv, which must yield one
    table fewer than the number of input files; its tables are not compared.
    """
    ignore_files = {"auto_fixed.py", "__init__.py", "all.json"}

    # Scan the input directory once; the list serves both the file count and
    # the read loop below.
    input_files = [
        fn
        for fn in os.listdir(input_dir())
        if os.path.isfile(input_dir() / fn) and fn not in ignore_files
    ]
    all_files = len(input_files)

    # load targets
    with open(input_dir() / "all.json") as f:
        all_json = json.load(f)

    for fn in input_files:
        count = 0
        with open(input_dir() / fn, "r") as fh:
            blocks = read_csv(fh, origin=f'"{fn}"', to="jsondata", fixer=custom_test_fixer)
            for block_type, block in blocks:
                if block_type != BlockType.TABLE:
                    continue
                count += 1
                if fn != "all.csv":
                    assert block == all_json[fn]

        if fn == "all.csv":
            assert count == all_files - 1
        else:
            assert count == 1
def read_bundle_from_csv(
    input_path: Union[str, PathLike, TextIO],
    sep: Optional[str] = ";",
    convert_units_to: Optional[TableUnitDispatcher] = None,
    unit_converter: Optional[UnitConverter] = None,
) -> TableBundle:
    """Read single csv-file to TableBundle

    With a demo of bulk unit conversion of all tables at read time.

    Args:
        input_path:
            Source to read; handed to read_csv as is.
        sep:
            Cell delimiter; handed to read_csv as is. Default is ';'.
        convert_units_to:
            If given, the blocks read are passed through
            normalized_table_generator together with 'unit_converter' before
            the bundle is built. If None (default), no conversion is done.
        unit_converter:
            Required whenever 'convert_units_to' is given. Must accept units of
            type returned by the TableUnitDispatcher.

    Returns:
        A TableBundle built from the blocks read.

    Raises:
        ValueError: If 'convert_units_to' is given without a 'unit_converter'.
    """
    wants_conversion = convert_units_to is not None

    # Same `is not None` test for the guard and for the conversion branch, so
    # that conversion can never be attempted without a converter.
    if wants_conversion and unit_converter is None:
        raise ValueError("No unit converter supplied.")

    inputs = read_csv(input_path, sep)
    if wants_conversion:
        inputs = normalized_table_generator(inputs, convert_units_to, unit_converter)

    return TableBundle(inputs)
work;1.0;2020-08-04 09:00:00;0 beach;2.0;2020-08-04 17:00:00;1 **farm_animals;;; your_farm my_farm other_farm;;; species;n_legs;avg_weight; text;-;kg; chicken;2;2; pig;4;89; cow;4;200; unicorn;4;NaN; """)) # Read the stream. Syntax is the same if reading CSV file. # Reader function returns a generator; unroll it in a list for convenience. block_list = list(read_csv(csv_data)) assert len(block_list) == 4 # %% [markdown] # The reader generates tuples of `(BlockType, block)`. Note that blank/comment lines are read but not parsed. # %% for bt, b in block_list: print(bt, type(b)) # %% [markdown] # Here's one of the tables. # %% t = block_list[2][1] assert t.name == "places"