Пример #1
0
def _xsv_parse_success_with_numeric_headers(temp_dir: Path, sep: str,
                                            parser: Callable[[Path],
                                                             ParseResults]):
    """
    Check that a delimited file whose parameter ID row looks numeric parses successfully.

    A four line file (data type header, parameter IDs, display names, one data row) is
    written to a uniquely named file in `temp_dir` using `sep` as the delimiter and is then
    handed to `parser`. The expected result keys the single data row by the parameter IDs
    as strings ("1", "2.0", ...), with surrounding whitespace removed.
    """
    target = temp_dir / str(uuid.uuid4())
    lines = [
        "Data type: some_type; Columns: 4; Version: 1\n",
        f"1{sep} 2.0{sep} 3{sep} 4.1\n",  # test trimming
        f"Spec 1{sep} Spec 2{sep} Spec 3{sep} Spec 4\n",
        f"val3 {sep} val4{sep} 1{sep} 8.9\n",
    ]
    with open(target, "w") as handle:
        handle.writelines(lines)

    actual = parser(target)

    # the numeric looking headers are expected back as string keys
    expected_row = frozendict({"1": "val3", "2.0": "val4", "3": 1, "4.1": 8.9})
    expected = ParseResults(
        frozendict({
            "some_type": ParseResult(SpecificationSource(target), (expected_row, )),
        }))
    assert actual == expected
Пример #2
0
def _xsv_parse_success(temp_dir: Path, sep: str,
                       parser: Callable[[Path], ParseResults]):
    """
    Check the happy path for parsing a delimited import specification file.

    The file is written to a uniquely named path in `temp_dir` with `sep` as the delimiter
    and contains extra delimiters after the data type header, padded header and data cells,
    and cells that are missing entirely or contain only whitespace. The expected result has
    padded cells trimmed, numeric cells converted to numbers, and missing cells as None.
    """
    target = temp_dir / str(uuid.uuid4())
    lines = [
        f"Data type: some_type; Columns: 4; Version: 1{sep}{sep}{sep}\n",
        f"spec1{sep} spec2{sep}   spec3   {sep} spec4\n",  # test trimming
        f"Spec 1{sep} Spec 2{sep} Spec 3{sep} Spec 4\n",
        f"val1 {sep}   val2   {sep}    7     {sep} 3.2\n",  # test trimming
        f"val3 {sep} val4{sep} 1{sep} 8.9\n",
        f"val5 {sep}{sep}{sep} 42.42\n",  # test missing values w/o whitespace
        f"val6 {sep}      {sep}      {sep} 3.14\n"  # test missing values w/ whitespace
    ]
    with open(target, "w") as handle:
        handle.writelines(lines)

    actual = parser(target)

    # one expected mapping per data row, keyed by the parameter IDs in file order
    param_ids = ("spec1", "spec2", "spec3", "spec4")
    expected_rows = tuple(
        frozendict(dict(zip(param_ids, values)))
        for values in (
            ("val1", "val2", 7, 3.2),
            ("val3", "val4", 1, 8.9),
            ("val5", None, None, 42.42),
            ("val6", None, None, 3.14),
        ))
    expected = ParseResults(
        frozendict({
            "some_type": ParseResult(SpecificationSource(target), expected_rows),
        }))
    assert actual == expected
Пример #3
0
def _process_excel_tab(
        excel: pandas.ExcelFile,
        spcsrc: SpecificationSource) -> tuple[O[str], O[ParseResult]]:
    """
    Parse a single tab of an Excel file.

    :param excel: the open Excel file.
    :param spcsrc: the specification source; its `tab` attribute selects the sheet to parse.
    :returns: a 2-tuple of the data type taken from the tab's header and the ParseResult
        holding the tab's non-empty data rows, or (None, None) if the tab is too short to
        contain any data rows.

    NOTE(review): the header / row helpers called here presumably raise _ParseException on
    malformed input - confirm against their definitions.
    """
    df = excel.parse(sheet_name=spcsrc.tab)
    if df.shape[0] < 3:
        # might as well not error check headers in sheets with no data
        return (None, None)
    # at this point we know that at least 4 lines are present - expecting the data type header,
    # parameter ID header, display name header, and at least one data row
    header = df.columns.get_level_values(0)[0]
    datatype, columns = _parse_header(header, spcsrc, _VERSION)
    rows = df.itertuples(index=False, name=None)
    param_ids = _normalize_headers(
        _process_excel_row(next(rows), 2, columns, spcsrc), 2, spcsrc)
    # the display name row is run through the row processing, but its contents are unused
    _process_excel_row(next(rows), 3, columns, spcsrc)
    results = []
    for line_number, raw_row in enumerate(rows, start=4):
        row = _process_excel_row(raw_row, line_number, columns, spcsrc)
        if all(pandas.isna(cell) for cell in row):
            continue  # skip empty rows
        results.append(
            frozendict({
                param_ids[j]: _normalize_pandas(cell)
                for j, cell in enumerate(row)
            }))
    return datatype, ParseResult(spcsrc, tuple(results))
Пример #4
0
def _parse(
    paths: tuple[Path, ...],
    file_type_resolver: Callable[[Path], FileTypeResolution],
) -> ParseResults:
    """
    Parse a set of import specification files and merge their results.

    :param paths: the files to parse.
    :param file_type_resolver: a callable that, given a path, returns a resolution carrying
        either the parser to use for the file or the name of the unsupported file type.
    :returns: the merged results keyed by data type if every file parsed cleanly and no data
        type occurred in more than one source; otherwise a ParseResults containing only the
        accumulated errors.
    """
    merged = {}
    problems = []
    for path in paths:
        resolution = file_type_resolver(path)
        if resolution.unsupported_type:
            problems.append(Error(
                ErrorType.PARSE_FAIL,
                f"{resolution.unsupported_type} is not a supported file type "
                    + "for import specifications",
                SpecificationSource(path)
            ))
            continue
        outcome = resolution.parser(path)
        if outcome.errors:
            problems.extend(outcome.errors)
            continue
        for data_type in outcome.results:
            if data_type not in merged:
                merged[data_type] = outcome.results[data_type]
                continue
            # the same data type was already provided by an earlier source
            problems.append(Error(
                ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE,
                f"Data type {data_type} appears in two importer specification sources",
                merged[data_type].source,
                outcome.results[data_type].source
            ))
    if problems:
        return ParseResults(errors=tuple(problems))
    return ParseResults(frozendict(merged))
Пример #5
0
def _parse_xsv(path: Path, sep: str) -> ParseResults:
    """
    Parse a delimiter separated text file containing an import specification.

    The file is expected to hold a data type / version header line, a parameter ID line, a
    third header line whose contents are not used, and then data rows. Empty rows are
    skipped.

    :param path: the file to parse.
    :param sep: the delimiter separating the items in each line.
    :returns: the parsed rows keyed by the data type from the header, or a ParseResults
        carrying an error if the file is missing, is a directory, is not detected as a text
        file, has a row with the wrong number of items, or has no data rows.
    """
    spcsrc = SpecificationSource(path)
    try:
        mime_type = magic.from_file(str(path), mime=True)
        if mime_type not in _MAGIC_TEXT_FILES:
            not_text = Error(ErrorType.PARSE_FAIL, "Not a text file: " + mime_type, spcsrc)
            return _error(not_text)
        results = []
        with open(path, newline='') as handle:
            reader = csv.reader(handle, delimiter=sep)  # let parser handle quoting
            first_line = _csv_next(reader, 1, None, spcsrc,
                                   "Missing data type / version header")
            datatype, columns = _parse_header(first_line[0], spcsrc, _VERSION)
            id_line = _csv_next(reader, 2, columns, spcsrc, "Missing 2nd header line")
            param_ids = _normalize_headers(id_line, 2, spcsrc)
            # the third header line is consumed but its contents are not used
            _csv_next(reader, 3, columns, spcsrc, "Missing 3rd header line")
            for line_number, row in enumerate(reader, start=4):
                if not row:
                    continue  # skip empty rows
                found = len(row)
                if found != columns:
                    # could collect errors (first 10?) and throw an exception with a list
                    # lets wait and see if that's really needed
                    raise _ParseException(
                        Error(
                            ErrorType.INCORRECT_COLUMN_COUNT,
                            f"Incorrect number of items in line {line_number}, "
                            f"expected {columns}, got {found}", spcsrc))
                results.append(
                    frozendict({
                        param_ids[j]: _normalize_xsv(cell)
                        for j, cell in enumerate(row)
                    }))
        if results:
            return ParseResults(
                frozendict({datatype: ParseResult(spcsrc, tuple(results))}))
        raise _ParseException(
            Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc))
    except FileNotFoundError:
        return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc))
    except IsADirectoryError:
        return _error(
            Error(ErrorType.PARSE_FAIL, "The given path is a directory",
                  spcsrc))
    except _ParseException as e:
        # the exception carries the Error describing the problem as its first argument
        return _error(e.args[0])
Пример #6
0
def test_excel_parse_success():
    """
    Parse the xls and xlsx variants of a test workbook and check the combined result.

    The workbook contains
    * 3 different tabs with data, including
        * numeric headers
        * empty cells
        * empty rows
        * whitespace only cells
    * 2 tabs with no data
    * 1 tab with a single row, which should be ignored
    * 1 tab with two rows, which should be ignored
    * one completely empty tab
    """

    for extension in ("xls", "xlsx"):
        workbook = _get_test_file(f"testtabs3full2nodata1empty.{extension}")

        actual = parse_excel(workbook)

        tab1_rows = tuple(
            frozendict({"header1": first, "header2": second, "header3": third})
            for first, second, third in (
                ("foo", 1, 6.7),
                ("bar", 2, 8.9),
                ("baz", None, 3.4),
                ("bat", 4, None),
            ))
        tab2_rows = (frozendict({"h1": "golly gee", "2": 42, "h3": "super"}), )
        tab3_rows = (frozendict({"head1": "some data", "head2": 1}), )
        expected = ParseResults(
            frozendict({
                "type1": ParseResult(SpecificationSource(workbook, "tab1"), tab1_rows),
                "type2": ParseResult(SpecificationSource(workbook, "tab2"), tab2_rows),
                "type3": ParseResult(SpecificationSource(workbook, "tab3"), tab3_rows),
            }))
        assert actual == expected
Пример #7
0
def parse_excel(path: Path) -> ParseResults:
    """
    Parse the provided Excel file.
    xls and xlsx files are supported.

    Every tab in the file is processed. Tabs that yield no data type are skipped, and a data
    type that turns up in more than one tab is recorded as an error against both tabs.

    :param path: the Excel file to parse.
    :returns: the parse results keyed by data type, or a ParseResults carrying errors if any
        tab failed to parse, a data type was duplicated, the file is missing, is a
        directory, is not a recognized Excel format, or contains no data at all.
    :raises ValueError: if opening or reading the file fails with a ValueError other than
        the "format cannot be determined" case.
    """
    spcsrc = SpecificationSource(path)
    errors = []
    results = {}
    tab_for_datatype = {}
    try:
        with pandas.ExcelFile(path) as workbook:
            for tab in workbook.sheet_names:
                tab_source = SpecificationSource(path, tab)
                try:
                    datatype, result = _process_excel_tab(workbook, tab_source)
                    if not datatype:
                        continue
                    if datatype not in results:
                        tab_for_datatype[datatype] = tab
                        results[datatype] = result
                        continue
                    # report the tab that first supplied the data type and the current tab
                    first_source = SpecificationSource(path, tab_for_datatype[datatype])
                    errors.append(
                        Error(
                            ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE,
                            f"Found datatype {datatype} in multiple tabs",
                            first_source,
                            tab_source,
                        ))
                except _ParseException as e:
                    errors.append(e.args[0])
    except FileNotFoundError:
        return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc))
    except IsADirectoryError:
        return _error(
            Error(ErrorType.PARSE_FAIL, "The given path is a directory",
                  spcsrc))
    except ValueError as e:
        if "Excel file format cannot be determined" not in str(e):
            raise e  # bail out, not sure what's wrong, not sure how to test either
        return _error(
            Error(ErrorType.PARSE_FAIL,
                  "Not a supported Excel file type",
                  source_1=spcsrc))
    if errors:
        return ParseResults(errors=tuple(errors))
    if not results:
        return _error(
            Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc))
    return ParseResults(frozendict(results))
Пример #8
0
def _xsv_parse_success_with_internal_and_trailing_empty_lines(
        temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]):
    """
    Check that empty lines between and after the data rows do not affect parsing.

    The file is written to a uniquely named path in `temp_dir` with `sep` as the delimiter;
    it has one empty line between its two data rows and three empty lines at the end. The
    expected result contains exactly the two data rows, in file order.
    """
    target = temp_dir / str(uuid.uuid4())
    lines = [
        "Data type: other_type; Columns: 4; Version: 1\n",
        f"spec1{sep} spec2{sep} spec3{sep} spec4\n",
        f"Spec 1{sep} Spec 2{sep} Spec 3{sep} Spec 4\n",
        f"val3 {sep} val4{sep} 1{sep} 8.9\n",
        "\n",
        f"val1 {sep} val2{sep}    7     {sep} 3.2\n",
        "\n",
        "\n",
        "\n",
    ]
    with open(target, "w") as handle:
        handle.writelines(lines)

    actual = parser(target)

    # one expected mapping per data row, keyed by the parameter IDs in file order
    param_ids = ("spec1", "spec2", "spec3", "spec4")
    expected_rows = tuple(
        frozendict(dict(zip(param_ids, values)))
        for values in (
            ("val3", "val4", 1, 8.9),
            ("val1", "val2", 7, 3.2),
        ))
    expected = ParseResults(
        frozendict({
            "other_type": ParseResult(SpecificationSource(target), expected_rows),
        }))
    assert actual == expected