def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, names=None, index_col=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, convert_float=True, has_index_names=None, true_values=None, false_values=None, squeeze=False, **kwds): data = self.__get_sheet(sheetname) parser = TextParser(data, header=header, index_col=index_col, has_index_names=has_index_names, na_values=na_values, thousands=thousands, parse_dates=parse_dates, date_parser=date_parser, true_values=true_values, false_values=false_values, skiprows=skiprows, skipfooter=skip_footer, squeeze=squeeze, **kwds) return parser.read()
def _data_to_frame(data, header, index_col, skiprows, infer_types, parse_dates, tupleize_cols, thousands): head, body, foot = data if head: body = [head] + body if header is None: # special case when a table has <th> elements header = 0 if foot: body += [foot] # fill out elements of body that are "ragged" _expand_elements(body) tp = TextParser( body, header=header, index_col=index_col, skiprows=_get_skiprows(skiprows), parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands, ) df = tp.read() return df
def _data_to_frame(**kwargs): head, body, foot = kwargs.pop("data") header = kwargs.pop("header") kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"]) if head: body = head + body # Infer header when there is a <thead> or top <th>-only rows if header is None: if len(head) == 1: header = 0 else: # ignore all-empty-text rows header = [ i for i, row in enumerate(head) if any(text for text in row) ] if foot: body += foot # fill out elements of body that are "ragged" _expand_elements(body) tp = TextParser(body, header=header, **kwargs) df = tp.read() return df
def _data_to_frame(data, header, index_col, skiprows, infer_types, parse_dates, tupleize_cols, thousands): head, body, _ = data # _ is footer which is rarely used: ignore for now if head: body = [head] + body if header is None: # special case when a table has <th> elements header = 0 # fill out elements of body that are "ragged" _expand_elements(body) tp = TextParser(body, header=header, index_col=index_col, skiprows=_get_skiprows(skiprows), parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands) df = tp.read() if infer_types: # TODO: rm this code so infer_types has no effect in 0.14 df = df.convert_objects(convert_dates='coerce') else: df = df.applymap(text_type) return df
def _data_to_frame(data, header, index_col, skiprows, parse_dates, tupleize_cols, thousands): head, body, foot = data if head: body = [head] + body if header is None: # special case when a table has <th> elements header = 0 if foot: body += [foot] # fill out elements of body that are "ragged" _expand_elements(body) tp = TextParser(body, header=header, index_col=index_col, skiprows=_get_skiprows(skiprows), parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands) df = tp.read() return df
def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None): from xlrd import (xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN) datemode = self.book.datemode sheet = self.book.sheet_by_name(sheetname) data = [] should_parse = {} for i in range(sheet.nrows): row = [] for j, (value, typ) in enumerate( izip(sheet.row_values(i), sheet.row_types(i))): if parse_cols is not None and j not in should_parse: should_parse[j] = self._should_parse(j, parse_cols) if parse_cols is None or should_parse[j]: if typ == XL_CELL_DATE: dt = xldate_as_tuple(value, datemode) # how to produce this first case? if dt[0] < datetime.MINYEAR: # pragma: no cover value = datetime.time(*dt[3:]) else: value = datetime.datetime(*dt) elif typ == XL_CELL_ERROR: value = np.nan elif typ == XL_CELL_BOOLEAN: value = bool(value) row.append(value) data.append(row) if header is not None: data[header] = _trim_excel_header(data[header]) parser = TextParser(data, header=header, index_col=index_col, has_index_names=has_index_names, na_values=na_values, thousands=thousands, parse_dates=parse_dates, date_parser=date_parser, skiprows=skiprows, skip_footer=skip_footer, chunksize=chunksize) return parser.read()
def test_read_text_list(self): data = """A,B,C\nfoo,1,2,3\nbar,4,5,6""" as_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]] df = self.read_csv(StringIO(data), index_col=0) parser = TextParser(as_list, index_col=0, chunksize=2) chunk = parser.read(None) tm.assert_frame_equal(chunk, df)
def test_read_text_list(self): data = """A,B,C\nfoo,1,2,3\nbar,4,5,6""" as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar', '4', '5', '6']] df = self.read_csv(StringIO(data), index_col=0) parser = TextParser(as_list, index_col=0, chunksize=2) chunk = parser.read(None) tm.assert_frame_equal(chunk, df)
def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, **kwds): from xlrd import (xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN) datemode = self.book.datemode if isinstance(sheetname, compat.string_types): sheet = self.book.sheet_by_name(sheetname) else: # assume an integer if not a string sheet = self.book.sheet_by_index(sheetname) data = [] should_parse = {} for i in range(sheet.nrows): row = [] for j, (value, typ) in enumerate(zip(sheet.row_values(i), sheet.row_types(i))): if parse_cols is not None and j not in should_parse: should_parse[j] = self._should_parse(j, parse_cols) if parse_cols is None or should_parse[j]: if typ == XL_CELL_DATE: dt = xldate_as_tuple(value, datemode) # how to produce this first case? if dt[0] < datetime.MINYEAR: # pragma: no cover value = datetime.time(*dt[3:]) else: value = datetime.datetime(*dt) elif typ == XL_CELL_ERROR: value = np.nan elif typ == XL_CELL_BOOLEAN: value = bool(value) row.append(value) data.append(row) if header is not None: data[header] = _trim_excel_header(data[header]) parser = TextParser(data, header=header, index_col=index_col, has_index_names=has_index_names, na_values=na_values, thousands=thousands, parse_dates=parse_dates, date_parser=date_parser, skiprows=skiprows, skip_footer=skip_footer, chunksize=chunksize, **kwds) return parser.read()
def _data_to_frame(**kwargs): head, body, foot = kwargs.pop('data') header = kwargs.pop('header') kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) if head: rows = lrange(len(head)) body = head + body if header is None: # special case when a table has <th> elements header = 0 if rows == [0] else rows if foot: body += [foot] # fill out elements of body that are "ragged" _expand_elements(body) tp = TextParser(body, header=header, **kwargs) df = tp.read() return df
def _data_to_frame(data, header, index_col, skiprows, infer_types, parse_dates, tupleize_cols, thousands): head, body, _ = data # _ is footer which is rarely used: ignore for now if head: body = [head] + body if header is None: # special case when a table has <th> elements header = 0 # fill out elements of body that are "ragged" _expand_elements(body) tp = TextParser(body, header=header, index_col=index_col, skiprows=_get_skiprows(skiprows), parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands) df = tp.read() return df
def _data_to_frame(**kwargs): head, body, foot = kwargs.pop('data') header = kwargs.pop('header') kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) if head: body = head + body # Infer header when there is a <thead> or top <th>-only rows if header is None: if len(head) == 1: header = 0 else: # ignore all-empty-text rows header = [i for i, row in enumerate(head) if any(text for text in row)] if foot: body += foot # fill out elements of body that are "ragged" _expand_elements(body) tp = TextParser(body, header=header, **kwargs) df = tp.read() return df
def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, convert_float=True, true_values=None, false_values=None, verbose=False, dtype=None, squeeze=False, **kwds): skipfooter = kwds.pop('skipfooter', None) if skipfooter is not None: skip_footer = skipfooter _validate_header_arg(header) if has_index_names is not None: warn( "\nThe has_index_names argument is deprecated; index names " "will be automatically inferred based on index_col.\n" "This argmument is still necessary if reading Excel output " "from 0.16.2 or prior with index names.", FutureWarning, stacklevel=3) if 'chunksize' in kwds: raise NotImplementedError("chunksize keyword of read_excel " "is not implemented") if parse_dates is True and index_col is None: warn("The 'parse_dates=True' keyword of read_excel was provided" " without an 'index_col' keyword value.") def _parse_cell(cell_contents, cell_typ): """converts the contents of the cell into a pandas appropriate object""" if cell_typ == XL_CELL_DATE: if xlrd_0_9_3: # Use the newer xlrd datetime handling. try: cell_contents = \ xldate.xldate_as_datetime(cell_contents, epoch1904) except OverflowError: return cell_contents # Excel doesn't distinguish between dates and time, # so we treat dates on the epoch as times only. # Also, Excel supports 1900 and 1904 epochs. year = (cell_contents.timetuple())[0:3] if ((not epoch1904 and year == (1899, 12, 31)) or (epoch1904 and year == (1904, 1, 1))): cell_contents = time(cell_contents.hour, cell_contents.minute, cell_contents.second, cell_contents.microsecond) else: # Use the xlrd <= 0.9.2 date handling. try: dt = xldate.xldate_as_tuple(cell_contents, epoch1904) except xldate.XLDateTooLarge: return cell_contents if dt[0] < MINYEAR: cell_contents = time(*dt[3:]) else: cell_contents = datetime(*dt) elif cell_typ == XL_CELL_ERROR: cell_contents = np.nan elif cell_typ == XL_CELL_BOOLEAN: cell_contents = bool(cell_contents) elif convert_float and cell_typ == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less suprising val = int(cell_contents) if val == cell_contents: cell_contents = val return cell_contents ret_dict = False if isinstance(sheetname, list): sheets = sheetname ret_dict = True elif sheetname is None: sheets = self.sheet_names ret_dict = True else: sheets = [sheetname] # handle same-type duplicates. sheets = list(OrderedDict.fromkeys(sheets).keys()) output = OrderedDict() import xlrd from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, XL_CELL_NUMBER) epoch1904 = self.book.datemode # xlrd >= 0.9.3 can return datetime objects directly. if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): xlrd_0_9_3 = True else: xlrd_0_9_3 = False # Keep sheetname to maintain backwards compatibility. for asheetname in sheets: if verbose: print("Reading sheet %s" % asheetname) if isinstance(asheetname, compat.string_types): sheet = self.book.sheet_by_name(asheetname) else: # assume an integer if not a string sheet = self.book.sheet_by_index(asheetname) data = [] should_parse = {} if sheet.nrows > 5000: raise Exception( "The raw file contains more than 5000 rows. Please check if it is correct or split the files (max: 5000 rows) for upload" ) elif kwds.get('MaxTest'): continue for i in range(sheet.nrows): row = [] for j, (value, typ) in enumerate( zip(sheet.row_values(i), sheet.row_types(i))): if parse_cols is not None and j not in should_parse: should_parse[j] = self._should_parse(j, parse_cols) if parse_cols is None or should_parse[j]: row.append(_parse_cell(value, typ)) data.append(row) # output[asheetname] = data if sheet.nrows == 0: output[asheetname] = DataFrame() continue if is_list_like(header) and len(header) == 1: header = header[0] # forward fill and pull out names for MultiIndex column header_names = None if header is not None: if is_list_like(header): header_names = [] control_row = [True for x in data[0]] for row in header: if is_integer(skiprows): row += skiprows data[row], control_row = _fill_mi_header( data[row], control_row) header_name, data[row] = _pop_header_name( data[row], index_col) header_names.append(header_name) if is_list_like(index_col): # forward fill values for MultiIndex index if not is_list_like(header): offset = 1 + header else: offset = 1 + max(header) for col in index_col: last = data[offset][col] for row in range(offset + 1, len(data)): if data[row][col] == '' or data[row][col] is None: data[row][col] = last else: last = data[row][col] if is_list_like(header) and len(header) > 1: has_index_names = True if kwds.get('parsed'): try: parser = TextParser(data, header=header, index_col=index_col, has_index_names=has_index_names, na_values=na_values, thousands=thousands, parse_dates=parse_dates, date_parser=date_parser, true_values=true_values, false_values=false_values, skiprows=skiprows, skipfooter=skip_footer, squeeze=squeeze, dtype=dtype, **kwds) output[asheetname] = parser.read() if names is not None: output[asheetname].columns = names if not squeeze or isinstance(output[asheetname], DataFrame): output[asheetname].columns = output[ asheetname].columns.set_names(header_names) except EmptyDataError: # No Data, return an empty DataFrame output[asheetname] = DataFrame() else: output[asheetname] = data if ret_dict or kwds.get('MaxTest'): return output else: return output[asheetname]
def parse( self, sheet_name=0, header=0, names=None, index_col=None, usecols=None, squeeze=False, dtype=None, true_values=None, false_values=None, skiprows=None, nrows=None, na_values=None, verbose=False, parse_dates=False, date_parser=None, thousands=None, comment=None, skipfooter=0, convert_float=True, mangle_dupe_cols=True, **kwds, ): validate_header_arg(header) ret_dict = False # Keep sheetname to maintain backwards compatibility. if isinstance(sheet_name, list): sheets = sheet_name ret_dict = True elif sheet_name is None: sheets = self.sheet_names ret_dict = True else: sheets = [sheet_name] # handle same-type duplicates. sheets = list(dict.fromkeys(sheets).keys()) output = {} for asheetname in sheets: if verbose: print(f"Reading sheet {asheetname}") if isinstance(asheetname, str): sheet = self.get_sheet_by_name(asheetname) else: # assume an integer if not a string sheet = self.get_sheet_by_index(asheetname) data = self.get_sheet_data(sheet, convert_float) usecols = maybe_convert_usecols(usecols) if not data: output[asheetname] = DataFrame() continue if is_list_like(header) and len(header) == 1: header = header[0] # forward fill and pull out names for MultiIndex column header_names = None if header is not None and is_list_like(header): header_names = [] control_row = [True] * len(data[0]) for row in header: if is_integer(skiprows): row += skiprows data[row], control_row = fill_mi_header( data[row], control_row) if index_col is not None: header_name, _ = pop_header_name(data[row], index_col) header_names.append(header_name) if is_list_like(index_col): # Forward fill values for MultiIndex index. if header is None: offset = 0 elif not is_list_like(header): offset = 1 + header else: offset = 1 + max(header) # Check if we have an empty dataset # before trying to collect data. if offset < len(data): for col in index_col: last = data[offset][col] for row in range(offset + 1, len(data)): if data[row][col] == "" or data[row][col] is None: data[row][col] = last else: last = data[row][col] has_index_names = is_list_like(header) and len(header) > 1 # GH 12292 : error when read one empty column from excel file try: parser = TextParser( data, names=names, header=header, index_col=index_col, has_index_names=has_index_names, squeeze=squeeze, dtype=dtype, true_values=true_values, false_values=false_values, skiprows=skiprows, nrows=nrows, na_values=na_values, parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, comment=comment, skipfooter=skipfooter, usecols=usecols, mangle_dupe_cols=mangle_dupe_cols, **kwds, ) output[asheetname] = parser.read(nrows=nrows) if not squeeze or isinstance(output[asheetname], DataFrame): if header_names: output[asheetname].columns = output[ asheetname].columns.set_names(header_names) except EmptyDataError: # No Data, return an empty DataFrame output[asheetname] = DataFrame() if ret_dict: return output else: return output[asheetname]
def _data_to_frame(data): _expand_elements(data) tp = TextParser(data) df = tp.read() return df
def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, convert_float=True, verbose=False, **kwds): import xlrd from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, XL_CELL_NUMBER) epoch1904 = self.book.datemode def _parse_cell(cell_contents, cell_typ): """converts the contents of the cell into a pandas appropriate object""" if cell_typ == XL_CELL_DATE: if xlrd_0_9_3: # Use the newer xlrd datetime handling. cell_contents = xldate.xldate_as_datetime( cell_contents, epoch1904) # Excel doesn't distinguish between dates and time, # so we treat dates on the epoch as times only. # Also, Excel supports 1900 and 1904 epochs. year = (cell_contents.timetuple())[0:3] if ((not epoch1904 and year == (1899, 12, 31)) or (epoch1904 and year == (1904, 1, 1))): cell_contents = datetime.time( cell_contents.hour, cell_contents.minute, cell_contents.second, cell_contents.microsecond) else: # Use the xlrd <= 0.9.2 date handling. dt = xldate.xldate_as_tuple(cell_contents, epoch1904) if dt[0] < datetime.MINYEAR: cell_contents = datetime.time(*dt[3:]) else: cell_contents = datetime.datetime(*dt) elif cell_typ == XL_CELL_ERROR: cell_contents = np.nan elif cell_typ == XL_CELL_BOOLEAN: cell_contents = bool(cell_contents) elif convert_float and cell_typ == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less suprising val = int(cell_contents) if val == cell_contents: cell_contents = val return cell_contents # xlrd >= 0.9.3 can return datetime objects directly. if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): xlrd_0_9_3 = True else: xlrd_0_9_3 = False ret_dict = False #Keep sheetname to maintain backwards compatibility. if isinstance(sheetname, list): sheets = sheetname ret_dict = True elif sheetname is None: sheets = self.sheet_names ret_dict = True else: sheets = [sheetname] #handle same-type duplicates. sheets = list(set(sheets)) output = {} for asheetname in sheets: if verbose: print("Reading sheet %s" % asheetname) if isinstance(asheetname, compat.string_types): sheet = self.book.sheet_by_name(asheetname) else: # assume an integer if not a string sheet = self.book.sheet_by_index(asheetname) data = [] should_parse = {} for i in range(sheet.nrows): row = [] for j, (value, typ) in enumerate( zip(sheet.row_values(i), sheet.row_types(i))): if parse_cols is not None and j not in should_parse: should_parse[j] = self._should_parse(j, parse_cols) if parse_cols is None or should_parse[j]: row.append(_parse_cell(value, typ)) data.append(row) if sheet.nrows == 0: return DataFrame() if header is not None: data[header] = _trim_excel_header(data[header]) parser = TextParser(data, header=header, index_col=index_col, has_index_names=has_index_names, na_values=na_values, thousands=thousands, parse_dates=parse_dates, date_parser=date_parser, skiprows=skiprows, skip_footer=skip_footer, chunksize=chunksize, **kwds) output[asheetname] = parser.read() if ret_dict: return output else: return output[asheetname]
def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, convert_float=True, **kwds): import xlrd from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, XL_CELL_NUMBER) epoch1904 = self.book.datemode # xlrd >= 0.9.3 can return datetime objects directly. if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): xlrd_0_9_3 = True else: xlrd_0_9_3 = False if isinstance(sheetname, compat.string_types): sheet = self.book.sheet_by_name(sheetname) else: # assume an integer if not a string sheet = self.book.sheet_by_index(sheetname) data = [] should_parse = {} for i in range(sheet.nrows): row = [] for j, (value, typ) in enumerate(zip(sheet.row_values(i), sheet.row_types(i))): if parse_cols is not None and j not in should_parse: should_parse[j] = self._should_parse(j, parse_cols) if parse_cols is None or should_parse[j]: if typ == XL_CELL_DATE: if xlrd_0_9_3: # Use the newer xlrd datetime handling. value = xldate.xldate_as_datetime(value, epoch1904) # Excel doesn't distinguish between dates and time, # so we treat dates on the epoch as times only. # Also, Excel supports 1900 and 1904 epochs. year = (value.timetuple())[0:3] if ((not epoch1904 and year == (1899, 12, 31)) or (epoch1904 and year == (1904, 1, 1))): value = datetime.time(value.hour, value.minute, value.second, value.microsecond) else: # Use the xlrd <= 0.9.2 date handling. dt = xldate.xldate_as_tuple(value, epoch1904) if dt[0] < datetime.MINYEAR: value = datetime.time(*dt[3:]) else: value = datetime.datetime(*dt) elif typ == XL_CELL_ERROR: value = np.nan elif typ == XL_CELL_BOOLEAN: value = bool(value) elif convert_float and typ == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less suprising val = int(value) if val == value: value = val row.append(value) data.append(row) if header is not None: data[header] = _trim_excel_header(data[header]) parser = TextParser(data, header=header, index_col=index_col, has_index_names=has_index_names, na_values=na_values, thousands=thousands, parse_dates=parse_dates, date_parser=date_parser, skiprows=skiprows, skip_footer=skip_footer, chunksize=chunksize, **kwds) return parser.read()
def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, convert_float=True, verbose=False, **kwds): import xlrd from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, XL_CELL_NUMBER) epoch1904 = self.book.datemode def _parse_cell(cell_contents,cell_typ): """converts the contents of the cell into a pandas appropriate object""" if cell_typ == XL_CELL_DATE: if xlrd_0_9_3: # Use the newer xlrd datetime handling. cell_contents = xldate.xldate_as_datetime(cell_contents, epoch1904) # Excel doesn't distinguish between dates and time, # so we treat dates on the epoch as times only. # Also, Excel supports 1900 and 1904 epochs. year = (cell_contents.timetuple())[0:3] if ((not epoch1904 and year == (1899, 12, 31)) or (epoch1904 and year == (1904, 1, 1))): cell_contents = datetime.time(cell_contents.hour, cell_contents.minute, cell_contents.second, cell_contents.microsecond) else: # Use the xlrd <= 0.9.2 date handling. dt = xldate.xldate_as_tuple(cell_contents, epoch1904) if dt[0] < datetime.MINYEAR: cell_contents = datetime.time(*dt[3:]) else: cell_contents = datetime.datetime(*dt) elif cell_typ == XL_CELL_ERROR: cell_contents = np.nan elif cell_typ == XL_CELL_BOOLEAN: cell_contents = bool(cell_contents) elif convert_float and cell_typ == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less suprising val = int(cell_contents) if val == cell_contents: cell_contents = val return cell_contents # xlrd >= 0.9.3 can return datetime objects directly. if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): xlrd_0_9_3 = True else: xlrd_0_9_3 = False ret_dict = False #Keep sheetname to maintain backwards compatibility. if isinstance(sheetname, list): sheets = sheetname ret_dict = True elif sheetname is None: sheets = self.sheet_names ret_dict = True else: sheets = [sheetname] #handle same-type duplicates. sheets = list(set(sheets)) output = {} for asheetname in sheets: if verbose: print("Reading sheet %s" % asheetname) if isinstance(asheetname, compat.string_types): sheet = self.book.sheet_by_name(asheetname) else: # assume an integer if not a string sheet = self.book.sheet_by_index(asheetname) data = [] should_parse = {} for i in range(sheet.nrows): row = [] for j, (value, typ) in enumerate(zip(sheet.row_values(i), sheet.row_types(i))): if parse_cols is not None and j not in should_parse: should_parse[j] = self._should_parse(j, parse_cols) if parse_cols is None or should_parse[j]: row.append(_parse_cell(value,typ)) data.append(row) if header is not None: data[header] = _trim_excel_header(data[header]) parser = TextParser(data, header=header, index_col=index_col, has_index_names=has_index_names, na_values=na_values, thousands=thousands, parse_dates=parse_dates, date_parser=date_parser, skiprows=skiprows, skip_footer=skip_footer, chunksize=chunksize, **kwds) output[asheetname] = parser.read() if ret_dict: return output else: return output[asheetname]
def parse( self, sheet_name=0, header=0, names=None, index_col=None, usecols=None, squeeze=False, dtype: DtypeArg | None = None, true_values=None, false_values=None, skiprows=None, nrows=None, na_values=None, verbose=False, parse_dates=False, date_parser=None, thousands=None, comment=None, skipfooter=0, convert_float=None, mangle_dupe_cols=True, **kwds, ): if convert_float is None: convert_float = True else: stacklevel = find_stack_level() warnings.warn( "convert_float is deprecated and will be removed in a future version.", FutureWarning, stacklevel=stacklevel, ) validate_header_arg(header) ret_dict = False # Keep sheetname to maintain backwards compatibility. if isinstance(sheet_name, list): sheets = sheet_name ret_dict = True elif sheet_name is None: sheets = self.sheet_names ret_dict = True else: sheets = [sheet_name] # handle same-type duplicates. sheets = list(dict.fromkeys(sheets).keys()) output = {} for asheetname in sheets: if verbose: print(f"Reading sheet {asheetname}") if isinstance(asheetname, str): sheet = self.get_sheet_by_name(asheetname) else: # assume an integer if not a string sheet = self.get_sheet_by_index(asheetname) data = self.get_sheet_data(sheet, convert_float) if hasattr(sheet, "close"): # pyxlsb opens two TemporaryFiles sheet.close() usecols = maybe_convert_usecols(usecols) if not data: output[asheetname] = DataFrame() continue if is_list_like(header) and len(header) == 1: header = header[0] # forward fill and pull out names for MultiIndex column header_names = None if header is not None and is_list_like(header): header_names = [] control_row = [True] * len(data[0]) for row in header: if is_integer(skiprows): row += skiprows data[row], control_row = fill_mi_header( data[row], control_row) if index_col is not None: header_name, _ = pop_header_name(data[row], index_col) header_names.append(header_name) # If there is a MultiIndex header and an index then there is also # a row containing just the index name(s) has_index_names = (is_list_like(header) and len(header) > 1 and index_col is not None) if is_list_like(index_col): # Forward fill values for MultiIndex index. if header is None: offset = 0 elif not is_list_like(header): offset = 1 + header else: offset = 1 + max(header) # GH34673: if MultiIndex names present and not defined in the header, # offset needs to be incremented so that forward filling starts # from the first MI value instead of the name if has_index_names: offset += 1 # Check if we have an empty dataset # before trying to collect data. if offset < len(data): for col in index_col: last = data[offset][col] for row in range(offset + 1, len(data)): if data[row][col] == "" or data[row][col] is None: data[row][col] = last else: last = data[row][col] # GH 12292 : error when read one empty column from excel file try: parser = TextParser( data, names=names, header=header, index_col=index_col, has_index_names=has_index_names, squeeze=squeeze, dtype=dtype, true_values=true_values, false_values=false_values, skiprows=skiprows, nrows=nrows, na_values=na_values, skip_blank_lines=False, # GH 39808 parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, comment=comment, skipfooter=skipfooter, usecols=usecols, mangle_dupe_cols=mangle_dupe_cols, **kwds, ) output[asheetname] = parser.read(nrows=nrows) if not squeeze or isinstance(output[asheetname], DataFrame): if header_names: output[asheetname].columns = output[ asheetname].columns.set_names(header_names) except EmptyDataError: # No Data, return an empty DataFrame output[asheetname] = DataFrame() if ret_dict: return output else: return output[asheetname]
def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, convert_float=True, **kwds): import xlrd from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, XL_CELL_NUMBER) epoch1904 = self.book.datemode # xlrd >= 0.9.3 can return datetime objects directly. if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): xlrd_0_9_3 = True else: xlrd_0_9_3 = False if isinstance(sheetname, compat.string_types): sheet = self.book.sheet_by_name(sheetname) else: # assume an integer if not a string sheet = self.book.sheet_by_index(sheetname) data = [] should_parse = {} for i in range(sheet.nrows): row = [] for j, (value, typ) in enumerate( zip(sheet.row_values(i), sheet.row_types(i))): if parse_cols is not None and j not in should_parse: should_parse[j] = self._should_parse(j, parse_cols) if parse_cols is None or should_parse[j]: if typ == XL_CELL_DATE: if xlrd_0_9_3: # Use the newer xlrd datetime handling. value = xldate.xldate_as_datetime(value, epoch1904) # Excel doesn't distinguish between dates and time, # so we treat dates on the epoch as times only. # Also, Excel supports 1900 and 1904 epochs. year = (value.timetuple())[0:3] if ((not epoch1904 and year == (1899, 12, 31)) or (epoch1904 and year == (1904, 1, 1))): value = datetime.time(value.hour, value.minute, value.second, value.microsecond) else: # Use the xlrd <= 0.9.2 date handling. dt = xldate.xldate_as_tuple(value, epoch1904) if dt[0] < datetime.MINYEAR: value = datetime.time(*dt[3:]) else: value = datetime.datetime(*dt) elif typ == XL_CELL_ERROR: value = np.nan elif typ == XL_CELL_BOOLEAN: value = bool(value) elif convert_float and typ == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less suprising val = int(value) if val == value: value = val row.append(value) data.append(row) if header is not None: data[header] = _trim_excel_header(data[header]) parser = TextParser(data, header=header, index_col=index_col, has_index_names=has_index_names, na_values=na_values, thousands=thousands, parse_dates=parse_dates, date_parser=date_parser, skiprows=skiprows, skip_footer=skip_footer, chunksize=chunksize, **kwds) return parser.read()
def parse(fname, **kwargs): num_splits = kwargs.pop("num_splits", None) start = kwargs.pop("start", None) end = kwargs.pop("end", None) _skiprows = kwargs.pop("skiprows") excel_header = kwargs.get("_header") sheet_name = kwargs.get("sheet_name", 0) footer = b"</sheetData></worksheet>" # Default to pandas case, where we are not splitting or partitioning if start is None or end is None: return pandas.read_excel(fname, **kwargs) from zipfile import ZipFile from openpyxl import load_workbook from openpyxl.worksheet._reader import WorksheetReader from openpyxl.reader.excel import ExcelReader from openpyxl.worksheet.worksheet import Worksheet from pandas.core.dtypes.common import is_list_like from pandas.io.excel._util import ( _fill_mi_header, _maybe_convert_usecols, ) from pandas.io.parsers import TextParser import re wb = load_workbook(filename=fname, read_only=True) # Get shared strings ex = ExcelReader(fname, read_only=True) ex.read_manifest() ex.read_strings() # Convert string name 0 to string if sheet_name == 0: sheet_name = wb.sheetnames[sheet_name] # get the worksheet to use with the worksheet reader ws = Worksheet(wb) # Read the raw data with ZipFile(fname) as z: with z.open("xl/worksheets/{}.xml".format( sheet_name.lower())) as file: file.seek(start) bytes_data = file.read(end - start) def update_row_nums(match): """Update the row numbers to start at 1. Note: This is needed because the parser we are using does not scale well if the row numbers remain because empty rows are inserted for all "missing" rows. Parameters ---------- match The match from the origin `re.sub` looking for row number tags. Returns ------- string The updated string with new row numbers. """ b = match.group(0) return re.sub( b"\d+", # noqa: W605 lambda c: str(int(c.group(0).decode("utf-8")) - _skiprows). encode("utf-8"), b, ) bytes_data = re.sub(b'r="[A-Z]*\d+"', update_row_nums, bytes_data) # noqa: W605 bytesio = BytesIO(excel_header + bytes_data + footer) # Use openpyxl to read/parse sheet data reader = WorksheetReader(ws, bytesio, ex.shared_strings, False) # Attach cells to worksheet object reader.bind_cells() data = PandasExcelParser.get_sheet_data( ws, kwargs.pop("convert_float", True)) usecols = _maybe_convert_usecols(kwargs.pop("usecols", None)) header = kwargs.pop("header", 0) index_col = kwargs.pop("index_col", None) # skiprows is handled externally skiprows = None # Handle header and create MultiIndex for columns if necessary if is_list_like(header) and len(header) == 1: header = header[0] if header is not None and is_list_like(header): control_row = [True] * len(data[0]) for row in header: data[row], control_row = _fill_mi_header( data[row], control_row) # Handle MultiIndex for row Index if necessary if is_list_like(index_col): # Forward fill values for MultiIndex index. if not is_list_like(header): offset = 1 + header else: offset = 1 + max(header) # Check if dataset is empty if offset < len(data): for col in index_col: last = data[offset][col] for row in range(offset + 1, len(data)): if data[row][col] == "" or data[row][col] is None: data[row][col] = last else: last = data[row][col] parser = TextParser(data, header=header, index_col=index_col, has_index_names=is_list_like(header) and len(header) > 1, skiprows=skiprows, usecols=usecols, **kwargs) # In excel if you create a row with only a border (no values), this parser will # interpret that as a row of NaN values. Pandas discards these values, so we # also must discard these values. pandas_df = parser.read().dropna(how="all") # Since we know the number of rows that occur before this partition, we can # correctly assign the index in cases of RangeIndex. If it is not a RangeIndex, # the index is already correct because it came from the data. if isinstance(pandas_df.index, pandas.RangeIndex): pandas_df.index = pandas.RangeIndex(start=_skiprows, stop=len(pandas_df.index) + _skiprows) # We return the length if it is a RangeIndex (common case) to reduce # serialization cost. if index_col is not None: index = pandas_df.index else: # The lengths will become the RangeIndex index = len(pandas_df) return _split_result_for_readers(1, num_splits, pandas_df) + [ index, pandas_df.dtypes, ]
def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, convert_float=True, **kwds): from xlrd import (xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, XL_CELL_NUMBER) datemode = self.book.datemode if isinstance(sheetname, compat.string_types): sheet = self.book.sheet_by_name(sheetname) else: # assume an integer if not a string sheet = self.book.sheet_by_index(sheetname) data = [] should_parse = {} for i in range(sheet.nrows): row = [] for j, (value, typ) in enumerate(zip(sheet.row_values(i), sheet.row_types(i))): if parse_cols is not None and j not in should_parse: should_parse[j] = self._should_parse(j, parse_cols) if parse_cols is None or should_parse[j]: if typ == XL_CELL_DATE: dt = xldate_as_tuple(value, datemode) # how to produce this first case? if dt[0] < datetime.MINYEAR: # pragma: no cover value = datetime.time(*dt[3:]) else: value = datetime.datetime(*dt) elif typ == XL_CELL_ERROR: value = np.nan elif typ == XL_CELL_BOOLEAN: value = bool(value) elif convert_float and typ == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less suprising val = int(value) if val == value: value = val row.append(value) data.append(row) if header is not None: data[header] = _trim_excel_header(data[header]) parser = TextParser(data, header=header, index_col=index_col, has_index_names=has_index_names, na_values=na_values, thousands=thousands, parse_dates=parse_dates, date_parser=date_parser, skiprows=skiprows, skip_footer=skip_footer, chunksize=chunksize, **kwds) return parser.read()
def parse(self, sheet_name=0, header=0, names=None, index_col=None, usecols=None, squeeze=False, dtype=None, true_values=None, false_values=None, skiprows=None, nrows=None, na_values=None, verbose=False, parse_dates=False, date_parser=None, thousands=None, comment=None, skipfooter=0, convert_float=True, mangle_dupe_cols=True, **kwds): _validate_header_arg(header) ret_dict = False # Keep sheetname to maintain backwards compatibility. if isinstance(sheet_name, list): sheets = sheet_name ret_dict = True elif sheet_name is None: sheets = self.sheet_names ret_dict = True else: sheets = [sheet_name] # handle same-type duplicates. sheets = list(OrderedDict.fromkeys(sheets).keys()) output = OrderedDict() for asheetname in sheets: if verbose: print("Reading sheet {sheet}".format(sheet=asheetname)) if isinstance(asheetname, compat.string_types): sheet = self.get_sheet_by_name(asheetname) else: # assume an integer if not a string sheet = self.get_sheet_by_index(asheetname) data = self.get_sheet_data(sheet, convert_float) usecols = _maybe_convert_usecols(usecols) if sheet.nrows == 0: output[asheetname] = DataFrame() continue if is_list_like(header) and len(header) == 1: header = header[0] # forward fill and pull out names for MultiIndex column header_names = None if header is not None and is_list_like(header): header_names = [] control_row = [True] * len(data[0]) for row in header: if is_integer(skiprows): row += skiprows data[row], control_row = _fill_mi_header(data[row], control_row) if index_col is not None: header_name, _ = _pop_header_name(data[row], index_col) header_names.append(header_name) if is_list_like(index_col): # Forward fill values for MultiIndex index. if not is_list_like(header): offset = 1 + header else: offset = 1 + max(header) # Check if we have an empty dataset # before trying to collect data. if offset < len(data): for col in index_col: last = data[offset][col] for row in range(offset + 1, len(data)): if data[row][col] == '' or data[row][col] is None: data[row][col] = last else: last = data[row][col] has_index_names = is_list_like(header) and len(header) > 1 # GH 12292 : error when read one empty column from excel file try: parser = TextParser(data, names=names, header=header, index_col=index_col, has_index_names=has_index_names, squeeze=squeeze, dtype=dtype, true_values=true_values, false_values=false_values, skiprows=skiprows, nrows=nrows, na_values=na_values, parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, comment=comment, skipfooter=skipfooter, usecols=usecols, mangle_dupe_cols=mangle_dupe_cols, **kwds) output[asheetname] = parser.read(nrows=nrows) if not squeeze or isinstance(output[asheetname], DataFrame): if header_names: output[asheetname].columns = output[ asheetname].columns.set_names(header_names) elif compat.PY2: output[asheetname].columns = _maybe_convert_to_string( output[asheetname].columns) except EmptyDataError: # No Data, return an empty DataFrame output[asheetname] = DataFrame() if ret_dict: return output else: return output[asheetname]