def test_parse_autocast_numbers(self):
    # Numeric-looking cells are expected back as int/float columns.
    path = MockPath(['x.csv'], b'A,B\n1,2.0\n3,4.1')
    result = upload.parse_file(path, True)
    expected = pd.DataFrame({
        'A': [1, 3],
        'B': [2.0, 4.1],
    })
    assert_frame_equal(result, expected)
def test_parse_csv_allow_empty_str(self):
    # Empty cells in a full-width row are expected as "" (not NaN).
    path = MockPath(["x.csv"], b"A,B\na,\n,b")
    result = parse_file(path, True)
    expected = pd.DataFrame({
        "A": ["a", ""],
        "B": ["", "b"],
    })
    assert_frame_equal(result, expected)
def test_parse_fill_gaps_at_start_with_na(self):
    # The first data row is short; its missing cell is expected as NaN.
    path = MockPath(['x.csv'], b'A,B\na\nb,c')
    result = upload.parse_file(path, True)
    expected = pd.DataFrame({
        'A': ['a', 'b'],
        'B': [np.nan, 'c'],
    })
    assert_frame_equal(result, expected)
def test_parse_invalid_xlsx(self):
    # A non-Excel payload with an .xlsx name is expected to yield an error
    # string rather than a DataFrame.
    path = MockPath(["x.xlsx"], b"not an xlsx")
    result = parse_file(path, True)
    expected_message = (
        "Error reading Excel file: Unsupported format, "
        "or corrupt file: Expected BOF record; found b'not an x'"
    )
    self.assertEqual(result, expected_message)
def test_parse_csv_allow_empty_str(self):
    # Empty cells in a full-width row are expected as '' (not NaN).
    path = MockPath(['x.csv'], b'A,B\na,\n,b')
    result = upload.parse_file(path, True)
    expected = pd.DataFrame({
        'A': ['a', ''],
        'B': ['', 'b'],
    })
    assert_frame_equal(result, expected)
def test_parse_fill_gaps_at_start_with_na(self):
    # The first data row is short; its missing cell is expected as NaN.
    path = MockPath(["x.csv"], b"A,B\na\nb,c")
    result = parse_file(path, True)
    expected = pd.DataFrame({
        "A": ["a", "b"],
        "B": [np.nan, "c"],
    })
    assert_frame_equal(result, expected)
def test_parse_has_header_false(self):
    # With has_header=False the first row is expected to stay as data and
    # the columns to be named 'Column N'.
    path = MockPath(['x.csv'], b'A,B\n1,2')
    result = upload.parse_file(path, False)
    expected = pd.DataFrame({
        'Column 1': ['A', '1'],
        'Column 2': ['B', '2'],
    })
    assert_frame_equal(result, expected)
def test_parse_fill_gaps_at_end_with_na(self):
    # The last data row is short; its missing cell is expected as NaN.
    path = MockPath(["x.csv"], b"A,B\na,b\nc,d\ne")
    result = parse_file(path, True)
    expected = pd.DataFrame({
        "A": ["a", "c", "e"],
        "B": ["b", "d", np.nan],
    })
    assert_frame_equal(result, expected)
def test_parse_has_header_false(self):
    # With has_header=False the first row is expected to stay as data and
    # the columns to be named "Column N".
    path = MockPath(["x.csv"], b"A,B\n1,2")
    result = parse_file(path, False)
    expected = pd.DataFrame({
        "Column 1": ["A", "1"],
        "Column 2": ["B", "2"],
    })
    assert_frame_equal(result, expected)
def test_parse_fill_gaps_at_end_with_na(self):
    # The last data row is short; its missing cell is expected as NaN.
    path = MockPath(['x.csv'], b'A,B\na,b\nc,d\ne')
    result = upload.parse_file(path, True)
    expected = pd.DataFrame({
        'A': ['a', 'c', 'e'],
        'B': ['b', 'd', np.nan],
    })
    assert_frame_equal(result, expected)
def test_parse_too_many_bytes(self):
    # Expects a dict result carrying a truncation warning plus the kept rows.
    # NOTE(review): the size limit is presumably configured outside this
    # method (not visible here) -- confirm.
    path = MockPath(["x.csv"], b"A,B\na,b\nc,d\ne,f")
    result = parse_file(path, True)
    self.assertEqual(
        result["error"],
        "The input was too large, so we removed 2 rows",
    )
    kept = pd.DataFrame({"A": ["a"], "B": ["b"]})
    assert_frame_equal(result["dataframe"], kept)
def test_filename_in_traceback(self):
    # The module is loaded from "badname.py" under the name "goodname"; the
    # traceback is expected to report it as "<Module goodname>".
    path = MockPath(["root", "badname.py"], b"def intify(x):\n return int(x)")
    module = load_python_module("goodname", path)
    try:
        module.intify("not-a-number")
    except ValueError:
        # format_exc() only sees the exception while it is being handled.
        s = traceback.format_exc()
    else:
        # Without this branch a missing exception would go unreported (or
        # surface as an unrelated unbound-variable error).
        self.fail("intify('not-a-number') did not raise ValueError")
    self.assertRegex(
        s, 'File "<Module goodname>", line 2, in intify\nValueError'
    )
def test_parse_auto_categorize(self):
    # Column A repeats a value and is expected as 'category'; column B has
    # only distinct values and is expected to stay str.
    path = MockPath(['x.csv'], b'A,B\na,a\na,b\nb,c')
    result = upload.parse_file(path, True)
    # 'a', 'a', 'b' has repeated strings, so we categorize
    column_a = pd.Series(['a', 'a', 'b'], dtype='category')
    column_b = pd.Series(['a', 'b', 'c'], dtype=str)
    expected = pd.DataFrame({'A': column_a, 'B': column_b})
    assert_frame_equal(result, expected)
def test_parse_too_many_columns(self):
    # Expects a dict result carrying a truncation warning plus the kept
    # columns.
    # NOTE(review): the column limit is presumably configured outside this
    # method (not visible here) -- confirm.
    path = MockPath(["x.csv"], b"A,B,C,D\na,b,c,d")
    result = parse_file(path, True)
    self.assertEqual(
        result["error"],
        "The input had too many columns, so we removed 2 columns",
    )
    kept = pd.DataFrame({"A": ["a"], "B": ["b"]})
    assert_frame_equal(result["dataframe"], kept)
def test_filename_in_traceback(self):
    # The module is loaded from 'badname.py' under the name 'goodname'; the
    # traceback is expected to report it as "<Module goodname>".
    path = MockPath(['root', 'badname.py'], b'def intify(x):\n return int(x)')
    module = load_python_module('goodname', path)
    try:
        module.intify('not-a-number')
    except ValueError:
        # format_exc() only sees the exception while it is being handled.
        s = traceback.format_exc()
    else:
        # Without this branch a missing exception would go unreported (or
        # surface as an unrelated unbound-variable error).
        self.fail("intify('not-a-number') did not raise ValueError")
    self.assertRegex(
        s, 'File "<Module goodname>", line 2, in intify\nValueError')
def test_parse_auto_categorize(self):
    # Column A repeats a value and is expected as "category"; column B has
    # only distinct values and is expected to stay str.
    path = MockPath(["x.csv"], b"A,B\na,a\na,b\nb,c")
    result = parse_file(path, True)
    # 'a', 'a', 'b' has repeated strings, so we categorize
    column_a = pd.Series(["a", "a", "b"], dtype="category")
    column_b = pd.Series(["a", "b", "c"], dtype=str)
    expected = pd.DataFrame({"A": column_a, "B": column_b})
    assert_frame_equal(result, expected)
def test_parse_default_column_headers(self):
    # Blank or missing header cells are expected to get 'Column N' names.
    # First row is ['A', '', None]
    path = MockPath(['x.csv'], b'A,""\na,b,c')
    result = upload.parse_file(path, True)
    expected = pd.DataFrame({
        'A': ['a'],
        'Column 2': ['b'],  # "" => default, 'Column 2'
        'Column 3': ['c'],  # None => default, 'Column 3'
    })
    assert_frame_equal(result, expected)
def test_parse_too_many_bytes(self):
    # Expects a dict result carrying a truncation warning plus the kept rows.
    # NOTE(review): the size limit is presumably configured outside this
    # method (not visible here) -- confirm.
    path = MockPath(['x.csv'], b'A,B\na,b\nc,d\ne,f')
    result = upload.parse_file(path, True)
    self.assertEqual(
        result['error'],
        'The input was too large, so we removed 2 rows',
    )
    kept = pd.DataFrame({'A': ['a'], 'B': ['b']})
    assert_frame_equal(result['dataframe'], kept)
def test_parse_default_column_headers(self):
    # Blank or missing header cells are expected to get "Column N" names.
    # First row is ['A', '', None]
    path = MockPath(["x.csv"], b'A,""\na,b,c')
    result = parse_file(path, True)
    expected = pd.DataFrame({
        "A": ["a"],
        "Column 2": ["b"],  # "" => default, 'Column 2'
        "Column 3": ["c"],  # None => default, 'Column 3'
    })
    assert_frame_equal(result, expected)
def test_parse_too_many_columns(self):
    # Expects a dict result carrying a truncation warning plus the kept
    # columns.
    # NOTE(review): the column limit is presumably configured outside this
    # method (not visible here) -- confirm.
    path = MockPath(['x.csv'], b'A,B,C,D\na,b,c,d')
    result = upload.parse_file(path, True)
    self.assertEqual(
        result['error'],
        'The input had too many columns, so we removed 2 columns',
    )
    kept = pd.DataFrame({'A': ['a'], 'B': ['b']})
    assert_frame_equal(result['dataframe'], kept)
def test_rewrite_conflicting_column_headers(self):
    # Duplicate header names are expected to be rewritten to unique ones.
    # Columns 1 and 2 both have name, 'A'
    # Columns 3 and 4 (defaulted) both have name, 'Column 4'
    path = MockPath(['x.csv'], b'A,A,Column 4,\na,b,c,d')
    result = upload.parse_file(path, True)
    expected = pd.DataFrame({
        'A': ['a'],
        'A 2': ['b'],  # rewritten
        'Column 4': ['c'],
        'Column 5': ['d'],  # rewritten
    })
    assert_frame_equal(result, expected)
def test_parse_csv_repair_errors(self):
    # It would be great to report "warnings" on invalid input. But Python's
    # `csv` module won't do that: it forces us to choose between mangling
    # input and raising an exception. Both are awful; mangling input is
    # slightly preferable, so that's what we do.
    #
    # CSV errors:
    #
    # * Data after close-quote: mangle by appending
    # * Unclosed quote: mangle by auto-closing
    path = MockPath(["x.csv"], b'A,B\n"x" y,"foo\nB')
    result = parse_file(path, True)
    expected = pd.DataFrame({
        "A": ["x y"],
        "B": ["foo\nB"],
    })
    assert_frame_equal(result, expected)
def test_rewrite_conflicting_column_headers(self):
    # Duplicate header names are expected to be rewritten to unique ones.
    # Columns 1 and 2 both have name, 'A'
    # Columns 3 and 4 (defaulted) both have name, 'Column 4'
    path = MockPath(["x.csv"], b"A,A,Column 4,\na,b,c,d")
    result = parse_file(path, True)
    expected = pd.DataFrame({
        "A": ["a"],
        "A 2": ["b"],  # rewritten
        "Column 4": ["c"],
        "Column 5": ["d"],  # rewritten
    })
    assert_frame_equal(result, expected)
def test_parse_invalid_xlsx(self):
    # A non-Excel payload with an .xlsx name is expected to yield an error
    # string rather than a DataFrame.
    path = MockPath(['x.xlsx'], b'not an xlsx')
    result = upload.parse_file(path, True)
    expected_message = (
        'Error reading Excel file: Unsupported format, '
        "or corrupt file: Expected BOF record; found b'not an x'"
    )
    self.assertEqual(result, expected_message)
def test_parse_txt_sniff_delimiter(self):
    # For this .txt input ';' is expected to be used as the delimiter, so
    # the comma stays inside the cell value 'a,b'.
    path = MockPath(['x.txt'], b'A;B\na,b;c')
    result = upload.parse_file(path, True)
    expected = pd.DataFrame({
        'A': ['a,b'],
        'B': ['c'],
    })
    assert_frame_equal(result, expected)
def test_parse_csv_detect_character_set(self):
    # tests that `chardet` is invoked
    # The payload is windows-1252 encoded; the non-ASCII character is
    # expected to be decoded correctly.
    encoded = 'A\nfôo\nbar'.encode('windows-1252')
    path = MockPath(['x.csv'], encoded)
    result = upload.parse_file(path, True)
    expected = pd.DataFrame({'A': ['fôo', 'bar']})
    assert_frame_equal(result, expected)
def test_parse_skip_empty_row(self):
    # The blank line between header and data is expected to be dropped.
    path = MockPath(['x.csv'], b'A\n\na')
    result = upload.parse_file(path, True)
    expected = pd.DataFrame({'A': ['a']})
    assert_frame_equal(result, expected)
def test_parse_invalid_mime_type(self):
    # An unrecognised extension is expected to yield an error string.
    path = MockPath(['x.bin'], b'A')
    result = upload.parse_file(path, True)
    expected_message = (
        "Unknown file extension '.bin'. Please upload a different file."
    )
    self.assertEqual(result, expected_message)
def _load(self, filename, data):
    # Helper: wrap `data` in a MockPath at root/<filename> and load it via
    # ModuleSpec.load_from_path, returning whatever that call returns.
    return ModuleSpec.load_from_path(MockPath(["root", filename], data))
def test_parse_too_many_rows(self):
    # Parsed with has_header=False, so the 'A' line counts as a data row.
    # Expects a dict result carrying a truncation warning plus the kept rows.
    # NOTE(review): the row limit is presumably configured outside this
    # method (not visible here) -- confirm.
    path = MockPath(['x.csv'], b'A\na\nb\nc')
    result = upload.parse_file(path, False)
    self.assertEqual(
        result['error'],
        'The input was too large, so we removed 2 rows',
    )
    kept = pd.DataFrame({'Column 1': ['A', 'a']})
    assert_frame_equal(result['dataframe'], kept)