Example #1
def test_read_csv__from_stream():
    with open(Path(__file__).parent / "input" / "bundle.csv", "r") as fh:
        bls = list(read_csv(fh))
        tables = [bl for ty, bl in bls if ty == BlockType.TABLE]
        assert tables[1].name == "spelling_numbers"

    # Raises an exception (a common error) when given a binary rather than a text stream
    with raises(Exception):
        with open(Path(__file__).parent / "input" / "bundle.csv", "rb") as fh:  # binary stream!
            bls = list(read_csv(fh))
            tables = [bl for ty, bl in bls if ty == BlockType.TABLE]
Example #2
def test_read_csv__reads_transposed_tables_with_arbitrary_trailing_csv_delimiters():
    csv_data_transposed_tables = dedent("""\
        **transposed*
        all
        diameter; cm; 1.23
        melting_point; K; 273

        **transposed*;
        all;
        diameter; cm; 1.23;;;;;;;
        melting_point; K; 273;

        **transposed*;
        all;
        diameter; cm; 1.23
        melting_point; K; 273;;;;;;;;;;;;;;;;;;;

        **transposed*;;;;;;
        all;;;;;;
        diameter; cm; 1.23;;;;
        melting_point; K; 273;;;;
        """)
    bl = list(read_csv(io.StringIO(csv_data_transposed_tables)))
    tables: List[Table] = [b for t, b in bl if t == BlockType.TABLE]
    t0: Table = tables[0]
    assert t0.column_names == ["diameter", "melting_point"]
    assert len(t0.df) == 1
    for t in tables:
        assert t.equals(t0)
Example #3
def test_displays_all_error_messages():
    """By default, ParseFixer stops on errors and outputs a message
    listing all encountered errors."""
    expected_error_msg = dedent(
        """\
        Stopped parsing after 2 errors in table 'farm_cols1' with messages:
        Duplicate column 'flt' at position 4 in table 'farm_cols1'.
        Duplicate column 'flt' at position 5 in table 'farm_cols1'."""
    )
    with raises(ValueError, match=expected_error_msg):
        blocks = list(read_csv(input_dir() / "cols1.csv"))
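
# Hedged companion sketch (not part of the original test): pass a fixer with
# stop_on_errors disabled to keep parsing past errors instead of raising;
# the fixer then substitutes fixed values (see the custom-fixer example below).
fixer = ParseFixer()
fixer.stop_on_errors = False
blocks = list(read_csv(input_dir() / "cols1.csv", fixer=fixer))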
Example #4
def test_read_csv__sep_is_comma(csv_data):
    bl = list(read_csv(io.StringIO(csv_data.replace(";", ",")), sep=","))
    tables: List[Table] = [b for t, b in bl if t == BlockType.TABLE]
    template_rows = [b for t, b in bl if t == BlockType.TEMPLATE_ROW]
    met = [b for t, b in bl if t == BlockType.METADATA]

    assert len(met) == 1
    assert len(tables) == 3
    assert tables[0].df["place"][1] == "work"
    t2: Table = tables[2]
    assert t2.column_names == ["diameter", "melting_point"]
    assert t2.df["melting_point"][0] == 273
    assert len(template_rows) == 1
Example #5
def test_read_csv__successfully_ignores_comments_on_column_name_row():
    csv_data = dedent("""\
        **places;
        all
        place;distance;ETA;is_hot;;;; --> this is a perfectly legal comment <-- ;
        text;km;datetime;onoff
        home;0.0;2020-08-04 08:00:00;1
        work;1.0;2020-08-04 09:00:00;0
        beach;2.0;2020-08-04 17:00:00;1
        """)
    bl = list(read_csv(io.StringIO(csv_data)))
    tables: List[Table] = [b for t, b in bl if t == BlockType.TABLE]
    t0: Table = tables[0]
    assert t0.column_names == ["place", "distance", "ETA", "is_hot"]
Example #6
def test_custom_fixer():
    """ Test custom ParseFixer
        Verify that read_csv uses custom ParseFixer
    """

    class fix_pi(ParseFixer):
        def __init__(self):
            super().__init__()

        # Override the base method: replace illegal float values with 22/7;
        # defer all other value types to the base implementation.
        def fix_illegal_cell_value(self, vtype, value):
            if vtype == "float":
                return 22.0 / 7.0
            return super().fix_illegal_cell_value(vtype, value)

    fix = fix_pi()
    fix.stop_on_errors = False
    fix._called_from_test = True

    with open(input_dir() / "types3.csv", "r") as fh:
        g = read_csv(fh, to="jsondata", fixer=fix)
        for tp, tt in g:
            if tp == BlockType.TABLE:
                assert tt["columns"]["num"]["values"][2] == 22.0 / 7.0
                assert tt["columns"]["flt"]["values"][0] == 22.0 / 7.0
                assert tt["columns"]["flt"]["values"][0] == 22.0 / 7.0
                assert tt["columns"]["flt2"]["values"][2] == 22.0 / 7.0

    with pytest.raises(ValueError):
        # test read_csv w. class (not instance) of fixer
        # class has default stop_on_errors = True
        with open(input_dir() / "types3.csv", "r") as fh:
            g = read_csv(fh, to="jsondata", fixer=fix_pi)
            for tp, tt in g:
                pass
Example #7
def test_TableBundle_from_file():
    """Verify that a TableBundle can be generated from the top-level API
    methods read_csv and read_excel.
    """
    input_file = input_dir() / "bundle.csv"
    bundle = TableBundle(read_csv(input_file), as_dataframe=True)
    assert bundle is not None
    assert len(bundle) == 3
    assert isinstance(bundle[0], TableDataFrame)

    assert bundle.unique("spelling_numbers").spelling[1] == "six"
    assert bundle[1].spelling[0] == "one"
    assert len(bundle.all("places_to_go")) == 2

    bundle = TableBundle(read_csv(input_file), as_dataframe=False)
    assert bundle is not None
    assert len(bundle) == 3
    assert isinstance(bundle[1], Table)
    assert bundle.spelling_numbers["spelling"].values[0] == "one"
    assert len(bundle.all("places_to_go")) == 2

    input_file = input_dir() / "bundle.xlsx"
    bundle = TableBundle(read_excel(input_file), as_dataframe=False)
    assert bundle is not None
    assert len(bundle) == 3
    assert isinstance(bundle[1], Table)
    assert bundle.spelling_numbers["spelling"].values[0] == "one"
    assert len(bundle.all("places_to_go")) == 2

    bundle = TableBundle(read_excel(input_file), as_dataframe=True)
    assert bundle is not None
    assert len(bundle) == 3
    assert isinstance(bundle[0], TableDataFrame)

    assert bundle.unique("spelling_numbers").spelling[1] == "six"
    assert bundle[1].spelling[0] == "one"
    assert len(bundle.all("places_to_go")) == 2
Example #8
def handle_includes(bg: BlockIterator,
                    input_dir,
                    recursive: bool = False) -> BlockIterator:
    """Handles 'include' directives, optionally recursively.

    Handles 'include' directives.
    'include' directives must contain a list of files located in directory 'input_dir'.

    Optionally handles 'include' directives recursively. No check is done for circular references.
    For example, if file1.csv includes file2.csv, and file2.csv includes file1.csv, then infinite
    recursion ensues upon reading either file1.csv or file2.csv with 'recursive' set to True.

    Args:
        bg:
            A block generator returned by read_csv

        input_dir:
            Path of directory in which include files are located.

        recursive:
            Handle 'include' directives recursively, i.e. 'include' directives in files themselves
            read as a consequence of an 'include' directive, will be handled. Default is False.

    Yields:
        Blocks from...
        * if recursive, the entire tree of files in 'include' directives.
        * if not recursive, the top-level file and those files listed in its 'include' directive (if
          any).

    """

    deep_handler = (
        functools.partial(handle_includes, input_dir=input_dir, recursive=recursive)
        if recursive
        else lambda x: x
    )

    for block_type, block in bg:
        if block_type == BlockType.DIRECTIVE:
            directive: Directive = block
            if directive.name == "include":
                # Don't emit this directive block; handle it.
                for filename in directive.lines:
                    yield from deep_handler(
                        read_csv(Path(input_dir) / filename))
            else:
                yield block_type, block
        else:
            yield block_type, block
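
# Hypothetical usage sketch (file names are illustrative, not from the source):
# expand 'include' directives found in a top-level file, then collect every
# resulting block into a TableBundle.
blocks = handle_includes(read_csv(Path("input") / "index.csv"), input_dir="input")
bundle = TableBundle(blocks)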
Example #9
def test_columns_duplicate():
    """
       Verify that default ParseFixer corrects duplicate column names

    """
    tab = None
    with open(input_dir() / "cols1.csv", "r") as fh:
        g = read_csv(fh, fixer=custom_test_fixer)
        for tp, tt in g:
            if tp == BlockType.TABLE:
                tab = tt
                break
    assert tab is not None
    assert tab.df["flt_fixed_001"] is not None
    assert tab.df["flt_fixed_001"][6] == 7.6
    assert tab.df["flt"][0] == 3.0
Example #10
def test_read_csv(csv_data):
    bl = list(read_csv(io.StringIO(csv_data)))
    tables: List[Table] = [b for t, b in bl if t == BlockType.TABLE]
    template_rows = [b for t, b in bl if t == BlockType.TEMPLATE_ROW]
    met = [b for t, b in bl if t == BlockType.METADATA]

    assert len(met) == 1
    assert len(tables) == 3

    # Correctly reads non-transposed table
    assert tables[0].df["place"][1] == "work"
    assert not tables[0].metadata.transposed

    # Correctly reads transposed table
    t2: Table = tables[2]
    assert t2.column_names == ["diameter", "melting_point"]
    assert len(t2.df) == 1
    assert t2.df["melting_point"][0] == 273
    assert len(template_rows) == 1
    assert t2.metadata.transposed
Example #11
def test_FAT():
    """ Factory Acceptance Test

        Verify that we are able to read all files in ./input
        Using default ParseFixer
    """
    all_files = 0
    ignore_files = ["auto_fixed.py", "__init__.py", "all.json"]
    for fn in os.listdir(input_dir()):
        path = input_dir() / fn
        if not os.path.isfile(path):
            continue
        if fn in ignore_files:
            continue
        all_files += 1

    # load targets
    with open(input_dir() / "all.json") as f:
        all_json = json.load(f)

    for fn in os.listdir(input_dir()):
        path = input_dir() / fn
        if not os.path.isfile(path):
            continue
        if fn in ignore_files:
            continue

        with open(input_dir() / fn, "r") as fh:
            g = read_csv(fh, origin=f'"{fn}"', to="jsondata", fixer=custom_test_fixer)
            count = 0
            for tp, tt in g:
                if tp == BlockType.TABLE:
                    count += 1
                    if fn != "all.csv":
                        assert tt == all_json[fn]

            if fn == "all.csv":
                assert count == all_files - 1
            else:
                assert count == 1
Example #12
def read_bundle_from_csv(
    input_path: Union[str, PathLike, TextIO],
    sep: Optional[str] = ";",
    convert_units_to: Optional[TableUnitDispatcher] = None,
    unit_converter: Optional[UnitConverter] = None,
) -> TableBundle:
    """Read single csv-file to TableBundle

    With a demo of bulk unit conversion of all tables at read time.

    unit_converter must accept units of type returned by the TableUnitDispatcher.
    """
    if convert_units_to and not unit_converter:
        raise ValueError("No unit converter supplied.")

    inputs = read_csv(input_path, sep)

    if convert_units_to is not None:
        inputs = normalized_table_generator(inputs, convert_units_to,
                                            unit_converter)

    return TableBundle(inputs)
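
# Hypothetical usage sketch: the file name, dispatcher, and converter are
# illustrative stand-ins, not from the source. The dispatcher chooses target
# units per table; the converter must accept the units the dispatcher returns.
bundle = read_bundle_from_csv(
    "measurements.csv",
    convert_units_to=my_unit_dispatcher,  # hypothetical TableUnitDispatcher
    unit_converter=my_unit_converter,     # hypothetical UnitConverter
)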
Example #13
# NOTE: the opening of this example was truncated in the source; the lines down
# to "work;..." are a plausible reconstruction, consistent with the assertions
# below (4 blocks, of which block 2 is the "places" table). The metadata and
# directive content are illustrative.
csv_data = io.StringIO(dedent("""\
    author:;XYODA;
    purpose:;Save the galaxy

    ***gunk
    grok
    jiggyjag

    **places;
    all
    place;distance;ETA;is_hot
    text;km;datetime;onoff
    home;0.0;2020-08-04 08:00:00;1
    work;1.0;2020-08-04 09:00:00;0
    beach;2.0;2020-08-04 17:00:00;1

    **farm_animals;;;
    your_farm my_farm other_farm;;;
    species;n_legs;avg_weight;
    text;-;kg;
    chicken;2;2;
    pig;4;89;
    cow;4;200;
    unicorn;4;NaN;
    """))

# Read the stream. The syntax is the same when reading from a CSV file.
# The reader function returns a generator; unroll it into a list for convenience.
block_list = list(read_csv(csv_data))
assert len(block_list) == 4

# %% [markdown]
# The reader generates tuples of `(BlockType, block)`. Note that blank/comment lines are read but not parsed.

# %%
for bt, b in block_list:
    print(bt, type(b))

# %% [markdown]
# Here's one of the tables.

# %%
t = block_list[2][1]
assert t.name == "places"