def merge_colspan_headers_in_place(table) -> None:
    """
    Replace `table.columns` with unique, human-readable string names.

    Pandas `read_html()` returns tuples for column names when scraping tables
    with colspan. Each tuple is flattened: every run of equal adjacent values
    collapses to a single value, and what remains is joined with " - ".

    E.g. ('year', 'year') -> 'year' and ('year', 'month') -> 'year - month'

    Integer column names become "Column N" (1-based); any other name is
    passed along unchanged. The combined list then goes through
    `gen_unique_clean_colnames()`.

    The table is altered in place; there is no return value.
    """
    flattened = []
    for colname in table.columns:
        if isinstance(colname, tuple):
            # Drop a value whenever it equals its successor, so only one
            # value per run survives:
            # 'a','a','b','c','c','c' -> 'a','b','c'
            values = list(colname)
            last = len(values) - 1
            kept = [
                value
                for pos, value in enumerate(values)
                if pos == last or not (value == values[pos + 1])
            ]
            # put dashes between all remaining header values
            flattened.append(" - ".join(kept))
        elif isinstance(colname, int):
            # If first row isn't header and there's no <thead>, table.columns
            # will be an integer index.
            flattened.append("Column %d" % (colname + 1))
        else:
            flattened.append(colname)

    # `flattened` can contain duplicates. Rename them.
    unique_names = gen_unique_clean_colnames(flattened)
    table.columns = [unique.name for unique in unique_names]
def test_gen_truncate_during_conflict_consider_unicode():
    """Ten copies of "aéé" under MockSettings(5): duplicates are truncated and numbered."""
    result = gen_unique_clean_colnames(["aéé"] * 10, settings=MockSettings(5))

    # Names expected for the nine duplicates, in output order.
    renamed = (
        "aé 2",
        "aé 3",
        "aé 4",
        "aé 5",
        "aé 6",
        "aé 7",
        "aé 8",
        "aé 9",
        "a 10",
    )
    expected = [UniqueCleanColname("aéé")]
    expected.extend(
        UniqueCleanColname(name, is_numbered=True, is_truncated=True)
        for name in renamed
    )
    assert result == expected
def test_gen_truncate_during_conflict():
    """Ten copies of "abcd" plus "a 100" under MockSettings(4)."""
    result = gen_unique_clean_colnames(
        ["abcd"] * 10 + ["a 100"],
        settings=MockSettings(4),
    )

    # Names expected for the nine duplicate "abcd" inputs, in output order.
    renamed = (
        "ab 2",
        "ab 3",
        "ab 4",
        "ab 5",
        "ab 6",
        "ab 7",
        "ab 8",
        "ab 9",
        "a 11",
    )
    expected = [UniqueCleanColname("abcd")]
    expected.extend(
        UniqueCleanColname(name, is_numbered=True, is_truncated=True)
        for name in renamed
    )
    expected.append(UniqueCleanColname("a 10", is_truncated=True))  # was "a 100"
    assert result == expected
def test_gen_name_default_columns_without_conflict():
    """Blank names get defaults that do not clash with the explicit "Column 2"."""
    result = gen_unique_clean_colnames(["Column 2", "", ""])

    explicit = UniqueCleanColname("Column 2")
    bumped_default = UniqueCleanColname("Column 4", is_default=True, is_numbered=True)
    plain_default = UniqueCleanColname("Column 3", is_default=True)  # this 3 is "reserved"

    assert result == [explicit, bumped_default, plain_default]
def test_gen_calls_clean():
    """A newline, a lone surrogate and a too-long name are all handled in one call."""
    result = gen_unique_clean_colnames(["ab\n\ud800cd"], settings=MockSettings(6))

    cleaned = UniqueCleanColname(
        "ab�c",
        is_ascii_cleaned=True,
        is_unicode_fixed=True,
        is_truncated=True,
    )
    assert result == [cleaned]
def test_gen_avoid_existing_names():
    """Generated names must not reuse anything listed in `existing_names`."""
    result = gen_unique_clean_colnames(
        ["", "foo"],
        existing_names=["Column 3", "foo"],
    )

    default_name = UniqueCleanColname("Column 4", is_default=True, is_numbered=True)
    renumbered_foo = UniqueCleanColname("foo 2", is_numbered=True)

    assert result == [default_name, renumbered_foo]
def parse_xls_file(
    path: Path, *, output_path: Path, has_header: bool, autoconvert_types: bool
) -> RenderResult:
    """
    Parse the xls file at `path` and convert the table to Arrow at `output_path`.

    Parameters:

    * `path`: the Excel file to read; it is opened with xlrd.
    * `output_path`: passed to `ProcessResult(...).to_arrow()`.
    * `has_header`: if True, the first row supplies column names (cleaned
      and uniquified); otherwise columns are named "Column 1", "Column 2", ...
    * `autoconvert_types`: see the NOTE(review) near the end of the body.

    Peculiarities:

    * An `xlrd.XLRDError` is not raised to the caller: it is returned as a
      `RenderResult` holding a single error message.
    * Any other exception (e.g. one raised by pandas) propagates.
    """
    # Open the workbook with xlrd.open_workbook(): if we call
    # pandas.read_excel(bytesio) it will read the entire file into RAM.
    #
    # dtype='category' crashes as of 2018-09-11
    try:
        workbook = xlrd.open_workbook(path.as_posix())
        table = pd.read_excel(
            workbook,
            engine="xlrd",
            dtype=object,
            header=(0 if has_header else None),
        )
    except xlrd.XLRDError as err:
        return RenderResult(
            errors=[
                RenderError(
                    I18nMessage.TODO_i18n(f"Error reading Excel file: {err}")
                )
            ]
        )

    if has_header:
        # pd.read_excel() _badly_ uniquifies column names: it adds ".1", ".2",
        # etc. This is hard to fix. We'd need to stop using pd.read_excel().
        # [2019-12-09, adamhooper] Not today.
        #
        # In the meantime, ensure valid colnames so at least the user sees
        # _something_. Ignore all warnings.
        #
        # NOTE(review): `settings` is not a parameter of this function;
        # presumably it is a module-level name -- confirm.
        table.columns = [
            cn.name
            for cn in gen_unique_clean_colnames(
                [str(c) for c in table.columns], settings=settings
            )
        ]
    else:
        table.columns = [f"Column {i + 1}" for i in range(len(table.columns))]

    # NOTE(review): `autoconvert_types` is accepted but never read; dtypes are
    # always autocast. Confirm that this is intended before gating the call.
    autocast_dtypes_in_place(table)
    return ProcessResult(table).to_arrow(output_path)
def render(table, params, *, fetch_result):
    """
    Turn `fetch_result` into this step's output table.

    * A falsy `fetch_result` returns the incoming `table` untouched.
    * A `fetch_result` whose status is "error" is returned as-is.
    * Otherwise the fetched dataframe is used. When
      `params["first_row_is_header"]` is set and the dataframe has at least
      one row, row 0 becomes the (cleaned, unique) column names and is
      removed. Dtypes are then autocast in place.

    Returns the table alone, or a `(table, errors)` tuple when
    `fetch_result.errors` is truthy.
    """
    if not fetch_result:
        return table
    if fetch_result.status == "error":
        return fetch_result

    table = fetch_result.dataframe
    first_row_is_header: bool = params["first_row_is_header"]

    # With zero rows there is no header row to promote: no-op.
    if first_row_is_header and len(table) >= 1:
        # TODO inform user of column-rename warnings
        header_cells = [str(cell) for cell in table.iloc[0, :]]
        unique_names = gen_unique_clean_colnames(header_cells)
        table.columns = [unique.name for unique in unique_names]
        table.drop(index=0, inplace=True)
        table.reset_index(drop=True, inplace=True)

    autocast_dtypes_in_place(table)

    if not fetch_result.errors:
        return table
    return (table, fetch_result.errors)
def test_gen_name_default_columns():
    """Empty names are replaced with default "Column N" names."""
    result = gen_unique_clean_colnames(["", ""])

    expected = [
        UniqueCleanColname(name, is_default=True)
        for name in ("Column 1", "Column 2")
    ]
    assert result == expected
def test_gen_add_number_that_does_not_overwrite_existing_number():
    """The duplicate "A" is numbered "A 3" because "A 2" is already an input name."""
    result = gen_unique_clean_colnames(["A", "A", "A 2"])

    original = UniqueCleanColname("A")
    renumbered = UniqueCleanColname("A 3", is_numbered=True)
    preexisting = UniqueCleanColname("A 2")

    assert result == [original, renumbered, preexisting]
def test_gen_add_number():
    """Repeated names get " 2", " 3", ... appended; the first one is kept."""
    result = gen_unique_clean_colnames(["A", "A", "A"])

    expected = [UniqueCleanColname("A")]
    expected.extend(
        UniqueCleanColname(name, is_numbered=True) for name in ("A 2", "A 3")
    )
    assert result == expected
def test_gen_number_1_is_unique():
    """"A", "A 1" and "A 2" are already distinct, so none of them is renamed."""
    names = ["A", "A 1", "A 2"]
    result = gen_unique_clean_colnames(names)

    expected = [
        UniqueCleanColname("A"),
        UniqueCleanColname("A 1"),
        UniqueCleanColname("A 2"),
    ]
    assert result == expected
def test_gen_whitespace_only_key():
    """Duplicate " 1" names: the second is expected to come back as " 2"."""
    # issue #174927345: handle empty-string key
    result = gen_unique_clean_colnames([" 1", " 1"])

    kept = UniqueCleanColname(" 1")
    renumbered = UniqueCleanColname(" 2", is_numbered=True)

    assert result == [kept, renumbered]
def test_gen_do_not_number_name_without_key():
    """Duplicate " 1" names: the second is expected to come back as " 1 2"."""
    # issue #174927345: handle empty-string key
    #
    # NOTE(review): as written, the input here is identical to the one in
    # test_gen_whitespace_only_key but the expected names differ -- verify the
    # exact whitespace in these string literals.
    result = gen_unique_clean_colnames([" 1", " 1"])

    kept = UniqueCleanColname(" 1")
    suffixed = UniqueCleanColname(" 1 2", is_numbered=True)

    assert result == [kept, suffixed]
def _postprocess_name_columns(
    table: pyarrow.Table, has_header: bool
) -> Tuple[pyarrow.Table, List[ParseCsvWarning]]:
    """
    Return `table`, with final column names but still String values.

    With `has_header` set and at least one row present, row 0 supplies the
    column names (null cells count as ""), which are cleaned and made unique;
    that row is then sliced off. Otherwise columns are named "Column 1",
    "Column 2", ...

    The second element of the returned tuple lists warnings about names that
    were ASCII-cleaned, truncated or numbered -- at most one warning of each
    kind, carrying the count and the first affected name.
    """
    warnings = []

    if has_header and table.num_rows > 0:
        # Read the header row: one cell per column.
        header_cells = []
        for column in table.columns:
            cell = column[0]
            header_cells.append("" if cell is pyarrow.NULL else cell.as_py())

        names = []
        ascii_cleaned = []  # final names that were ASCII-cleaned
        truncated = []  # final names that were truncated
        numbered = []  # final names that had a number added
        for colname in gen_unique_clean_colnames(header_cells, settings=settings):
            name = colname.name
            names.append(name)
            if colname.is_ascii_cleaned:
                ascii_cleaned.append(name)
            if colname.is_truncated:
                truncated.append(name)
            if colname.is_numbered:
                numbered.append(name)
            # Unicode can't be fixed, because we assume valid UTF-8 input
            assert not colname.is_unicode_fixed
            # Stay silent if colname.is_default. Users expect us to
            # auto-generate default column names.

        if ascii_cleaned:
            warnings.append(
                ParseCsvWarning.CleanedAsciiColumnNames(
                    len(ascii_cleaned), ascii_cleaned[0]
                )
            )
        if truncated:
            warnings.append(
                ParseCsvWarning.TruncatedColumnNames(len(truncated), truncated[0])
            )
        if numbered:
            warnings.append(
                ParseCsvWarning.NumberedColumnNames(len(numbered), numbered[0])
            )

        # Remove header (zero-copy: builds new pa.Table with same backing data)
        table = table.slice(1)
    else:
        names = [f"Column {i + 1}" for i, _ in enumerate(table.columns)]

    columns_by_name = {}
    for i, name in enumerate(names):
        columns_by_name[name] = table.column(i)
    return (pyarrow.table(columns_by_name), warnings)