def _parse_tsv(bytesio: io.BytesIO, text_encoding: _TextEncoding) -> DataFrame:
    """Build a DataFrame from TSV bytes or raise parse error.

    Peculiarities:

    * The file encoding defaults to UTF-8.
    * Data types. This is a CSV, so every value is a string ... _but_ we
      do the pandas default auto-detection.
    """
    # Inspect raw bytes first to pick per-column dtypes for the parse.
    dtype = _determine_dtype(bytesio)
    with _wrap_text(bytesio, text_encoding) as textio:
        table = pandas.read_table(textio, dtype=dtype)
        # Narrow string columns to numeric/etc. types where possible.
        autocast_dtypes_in_place(table)
        return table
def _parse_txt(bytesio: io.BytesIO, text_encoding: _TextEncoding) -> DataFrame:
    """Build a DataFrame from txt bytes or raise parse error.

    Peculiarities:

    * The file encoding defaults to UTF-8.
    * Data types. Call _detect_separator to determine separator.
    """
    dtype = _determine_dtype(bytesio)
    with _wrap_text(bytesio, text_encoding) as textio:
        # NOTE(review): _detect_separator reads from textio before
        # read_table does — presumably it rewinds the stream; confirm.
        separator = _detect_separator(textio)
        table = pandas.read_table(textio, dtype=dtype, sep=separator)
        autocast_dtypes_in_place(table)
        return table
def _parse_table(bytesio: io.BytesIO, sep: Optional[str],
                 text_encoding: _TextEncoding) -> DataFrame:
    """Build a DataFrame from delimited text, sniffing the separator if needed."""
    with _wrap_text(bytesio, text_encoding) as textio:
        if not sep:
            sep = _detect_separator(textio)

        # Pandas CSV parser looks like this:
        #
        # 1. "Tokenize" the input stream: copy all its _data_ bytes into
        #    memory, and maintain arrays of "words" and "lines" pointers
        #    into that array.
        # 2. Determine list of columns
        # 3. Per-column, convert dtypes to array
        # 4. Smoosh arrays together into a pd.DataFrame.
        #
        # When `low_memory=True`, all this happens in a bigger loop, so
        # that the tokenized data structure is smaller.
        #
        # `low_memory=True` forces re-coding categories. That's `O(Ch * N
        # * Ca lg Ca)`, where Ch is number of column-chunks (9,000 * 60
        # in this case), N is number of records, Ch is number of chunks
        # (8, in this case), and Ca is the number of categories.
        #
        # This `rc11.txt` file has enormous `Ch`: 9,000 * 60 = 540,000.
        # `general.csv` (our 1.2GB file) is much smaller, at ~4,000, even
        # though it has 250x more rows. Pandas doesn't let us adjust
        # chunk size, and its heuristic is terrible for `rc11.txt`.
        #
        # Let's try `low_memory=False`. That makes the CPU cost
        # `O(N * Co * Ca lg Ca)`, where Co is the number of columns. Memory
        # usage grows by the number of cells. In the case of `general.csv`,
        # the cost is an extra 1GB.
        table = pandas.read_csv(
            textio,
            dtype='category',
            sep=sep,
            na_filter=False,
            low_memory=False,
        )
        autocast_dtypes_in_place(table)
        return table
def _parse_xlsx(bytesio: io.BytesIO, _unused: _TextEncoding) -> DataFrame:
    """Build a DataFrame from xlsx bytes or raise parse error.

    Peculiarities:

    * Error can be xlrd.XLRDError or pandas error.
    * We read the entire file contents into memory before parsing.
    * NOTE(review): on xlrd.XLRDError this returns a ProcessResult, not a
      DataFrame, despite the annotation — callers appear to rely on that;
      confirm before changing.
    """
    # dtype='category' crashes as of 2018-09-11
    try:
        # Use xlrd.open_workbook(): if we call pandas.read_excel(bytesio) it
        # will read the entire file into RAM. Spool to a named temp file so
        # xlrd can open it by path.
        with tempfile.NamedTemporaryFile() as temp:
            shutil.copyfileobj(bytesio, temp)
            temp.flush()
            # (Removed a dead `temp.seek(0)`: xlrd opens its own handle via
            # temp.name, so our handle's position is irrelevant.)
            workbook = xlrd.open_workbook(temp.name)
            data = pandas.read_excel(workbook, engine='xlrd', dtype=object)
    except xlrd.XLRDError as err:
        return ProcessResult(error=f'Error reading Excel file: {str(err)}')

    autocast_dtypes_in_place(data)
    return data
def test_autocast_str_categories_from_str_categories(self):
    """A non-numeric category column must pass through unchanged."""
    values = ['1', '2.1', 'Yay']
    frame = pd.DataFrame({'A': values}, dtype='category')
    autocast_dtypes_in_place(frame)  # 'Yay' blocks any numeric cast: no-op
    assert_frame_equal(frame, pd.DataFrame({'A': values}, dtype='category'))
def test_autocast_int_from_str_categories_with_empty_str(self):
    """Integer-looking categories with '' entries cast to float, not int."""
    frame = pd.DataFrame({'A': ['', '', '1']}, dtype='category')
    autocast_dtypes_in_place(frame)
    # '' maps to NaN, and NaN cannot live in an int column — so float64.
    want = pd.DataFrame({'A': [np.nan, np.nan, 1.0]}, dtype=np.float64)
    assert_frame_equal(frame, want)
def test_autocast_float_from_str_categories_with_dup_floats(self):
    """'1' and '1.0' are distinct categories but the same float value."""
    frame = pd.DataFrame({'A': ['1', '1.0']}, dtype='category')
    autocast_dtypes_in_place(frame)
    want = pd.DataFrame({'A': [1.0, 1.0]}, dtype=np.float64)
    assert_frame_equal(frame, want)
def test_autocast_float_from_str_categories_with_empty_str(self):
    """Mixed int/float strings plus '' cast to float64 with NaN for ''."""
    # example: used read_csv(dtype='category'), now want floats
    frame = pd.DataFrame({'A': ['1', '2.1', '']}, dtype='category')
    autocast_dtypes_in_place(frame)
    want = pd.DataFrame({'A': [1.0, 2.1, np.nan]}, dtype=np.float64)
    assert_frame_equal(frame, want)
def test_autocast_int_from_str_categories(self):
    """All-integer string categories cast down to an int column."""
    # example: used read_csv(dtype='category'), now want ints
    frame = pd.DataFrame({'A': ['1', '2']}, dtype='category')
    autocast_dtypes_in_place(frame)
    assert_frame_equal(frame, pd.DataFrame({'A': [1, 2]}))
def test_autocast_int_from_str(self):
    """Plain (non-category) integer strings cast to an int column."""
    frame = pd.DataFrame({'A': ['1', '2']})
    autocast_dtypes_in_place(frame)
    assert_frame_equal(frame, pd.DataFrame({'A': [1, 2]}))