def parse(self,
          sheet_name=0,
          header=0,
          names=None,
          index_col=None,
          usecols=None,
          squeeze=False,
          dtype=None,
          true_values=None,
          false_values=None,
          skiprows=None,
          nrows=None,
          na_values=None,
          verbose=False,
          parse_dates=False,
          date_parser=None,
          thousands=None,
          comment=None,
          skipfooter=0,
          convert_float=True,
          mangle_dupe_cols=True,
          **kwds):
    """
    Parse one or more sheets and return the parsed result(s).

    Parameters
    ----------
    sheet_name : str, int, list or None, default 0
        A string is looked up with ``get_sheet_by_name``; any other scalar
        is passed to ``get_sheet_by_index``. A list selects several sheets
        (duplicates are dropped, first occurrence wins); ``None`` selects
        every entry of ``self.sheet_names``.
    header : int, list of int or None, default 0
        Row(s) holding the column labels. A one-element list is unwrapped
        to its single value. A longer list triggers forward filling of the
        header rows before they are handed to ``TextParser``.
    index_col : int, list of int or None
        When list-like, empty cells (``''`` or ``None``) in those columns
        are forward filled below the header rows.
    verbose : bool, default False
        Print the key of each sheet before it is read.
    convert_float : bool, default True
        Passed to ``get_sheet_data``.
    usecols
        Normalised with ``_maybe_convert_usecols`` and passed on.
    names, squeeze, dtype, true_values, false_values, skiprows, nrows,
    na_values, parse_dates, date_parser, thousands, comment, skipfooter,
    mangle_dupe_cols, **kwds
        Forwarded unchanged to ``TextParser``.

    Returns
    -------
    OrderedDict or parsed object
        An ``OrderedDict`` keyed by the requested sheet keys when
        ``sheet_name`` is a list or ``None``; otherwise the result for the
        single requested sheet. Sheets with no rows, and sheets for which
        the parser raises ``EmptyDataError``, yield an empty ``DataFrame``.
    """
    _validate_header_arg(header)

    ret_dict = False

    # Keep sheetname to maintain backwards compatibility.
    if isinstance(sheet_name, list):
        sheets = sheet_name
        ret_dict = True
    elif sheet_name is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheet_name]

    # handle same-type duplicates.
    sheets = list(OrderedDict.fromkeys(sheets).keys())

    output = OrderedDict()

    for asheetname in sheets:
        if verbose:
            print("Reading sheet {sheet}".format(sheet=asheetname))

        if isinstance(asheetname, compat.string_types):
            sheet = self.get_sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.get_sheet_by_index(asheetname)

        data = self.get_sheet_data(sheet, convert_float)
        usecols = _maybe_convert_usecols(usecols)

        if sheet.nrows == 0:
            output[asheetname] = DataFrame()
            continue

        if is_list_like(header) and len(header) == 1:
            header = header[0]

        # forward fill and pull out names for MultiIndex column
        header_names = None
        if header is not None and is_list_like(header):
            header_names = []
            control_row = [True] * len(data[0])

            for row in header:
                if is_integer(skiprows):
                    row += skiprows

                data[row], control_row = _fill_mi_header(data[row],
                                                         control_row)

                if index_col is not None:
                    header_name, _ = _pop_header_name(data[row], index_col)
                    header_names.append(header_name)

        if is_list_like(index_col):
            # Forward fill values for MultiIndex index.
            if header is None:
                # No header row at all: the data starts at the first row.
                # (Previously ``1 + None`` raised TypeError here.)
                offset = 0
            elif not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            # Check if we have an empty dataset
            # before trying to collect data.
            if offset < len(data):
                for col in index_col:
                    last = data[offset][col]

                    for row in range(offset + 1, len(data)):
                        if data[row][col] == '' or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

        has_index_names = is_list_like(header) and len(header) > 1

        # GH 12292 : error when read one empty column from excel file
        try:
            parser = TextParser(data,
                                names=names,
                                header=header,
                                index_col=index_col,
                                has_index_names=has_index_names,
                                squeeze=squeeze,
                                dtype=dtype,
                                true_values=true_values,
                                false_values=false_values,
                                skiprows=skiprows,
                                nrows=nrows,
                                na_values=na_values,
                                parse_dates=parse_dates,
                                date_parser=date_parser,
                                thousands=thousands,
                                comment=comment,
                                skipfooter=skipfooter,
                                usecols=usecols,
                                mangle_dupe_cols=mangle_dupe_cols,
                                **kwds)

            output[asheetname] = parser.read(nrows=nrows)

            # Only touch ``.columns`` when the result is (or must be) a
            # DataFrame; with ``squeeze`` the parser may return something else.
            if not squeeze or isinstance(output[asheetname], DataFrame):
                if header_names:
                    output[asheetname].columns = output[
                        asheetname].columns.set_names(header_names)
                elif compat.PY2:
                    output[asheetname].columns = _maybe_convert_to_string(
                        output[asheetname].columns)

        except EmptyDataError:
            # No Data, return an empty DataFrame
            output[asheetname] = DataFrame()

    if ret_dict:
        return output
    else:
        return output[asheetname]
def parse(
    self,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=True,
    mangle_dupe_cols=True,
    **kwds,
):
    """
    Parse one or more sheets and return the parsed result(s).

    Parameters
    ----------
    sheet_name : str, int, list or None, default 0
        A string is looked up with ``get_sheet_by_name``; any other scalar
        is passed to ``get_sheet_by_index``. A list selects several sheets
        (duplicates are dropped, first occurrence wins); ``None`` selects
        every entry of ``self.sheet_names``.
    header : int, list of int or None, default 0
        Row(s) holding the column labels. A one-element list is unwrapped
        to its single value. A longer list triggers forward filling of the
        header rows before they are handed to ``TextParser``.
    index_col : int, list of int or None
        When list-like, empty cells (``""`` or ``None``) in those columns
        are forward filled below the header rows.
    verbose : bool, default False
        Print the key of each sheet before it is read.
    convert_float : bool, default True
        Passed to ``get_sheet_data``.
    usecols
        Normalised with ``_maybe_convert_usecols`` and passed on.
    names, squeeze, dtype, true_values, false_values, skiprows, nrows,
    na_values, parse_dates, date_parser, thousands, comment, skipfooter,
    mangle_dupe_cols, **kwds
        Forwarded unchanged to ``TextParser``.

    Returns
    -------
    dict or parsed object
        A dict keyed by the requested sheet keys when ``sheet_name`` is a
        list or ``None``; otherwise the result for the single requested
        sheet. Sheets with no data, and sheets for which the parser raises
        ``EmptyDataError``, yield an empty ``DataFrame``.
    """
    validate_header_arg(header)

    ret_dict = False

    # Keep sheetname to maintain backwards compatibility.
    if isinstance(sheet_name, list):
        sheets = sheet_name
        ret_dict = True
    elif sheet_name is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheet_name]

    # handle same-type duplicates.
    sheets = list(dict.fromkeys(sheets).keys())

    output = {}

    for asheetname in sheets:
        if verbose:
            print(f"Reading sheet {asheetname}")

        if isinstance(asheetname, str):
            sheet = self.get_sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.get_sheet_by_index(asheetname)

        data = self.get_sheet_data(sheet, convert_float)
        usecols = _maybe_convert_usecols(usecols)

        if not data:
            output[asheetname] = DataFrame()
            continue

        if is_list_like(header) and len(header) == 1:
            header = header[0]

        # forward fill and pull out names for MultiIndex column
        header_names = None
        if header is not None and is_list_like(header):
            header_names = []
            control_row = [True] * len(data[0])

            for row in header:
                if is_integer(skiprows):
                    row += skiprows

                data[row], control_row = _fill_mi_header(
                    data[row], control_row)

                if index_col is not None:
                    header_name, _ = _pop_header_name(data[row], index_col)
                    header_names.append(header_name)

        if is_list_like(index_col):
            # Forward fill values for MultiIndex index.
            if header is None:
                # No header row at all: the data starts at the first row.
                # (Previously ``1 + None`` raised TypeError here.)
                offset = 0
            elif not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            # Check if we have an empty dataset
            # before trying to collect data.
            if offset < len(data):
                for col in index_col:
                    last = data[offset][col]

                    for row in range(offset + 1, len(data)):
                        if data[row][col] == "" or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

        has_index_names = is_list_like(header) and len(header) > 1

        # GH 12292 : error when read one empty column from excel file
        try:
            parser = TextParser(
                data,
                names=names,
                header=header,
                index_col=index_col,
                has_index_names=has_index_names,
                squeeze=squeeze,
                dtype=dtype,
                true_values=true_values,
                false_values=false_values,
                skiprows=skiprows,
                nrows=nrows,
                na_values=na_values,
                parse_dates=parse_dates,
                date_parser=date_parser,
                thousands=thousands,
                comment=comment,
                skipfooter=skipfooter,
                usecols=usecols,
                mangle_dupe_cols=mangle_dupe_cols,
                **kwds,
            )

            output[asheetname] = parser.read(nrows=nrows)

            # Only touch ``.columns`` when the result is (or must be) a
            # DataFrame; with ``squeeze`` the parser may return something else.
            if not squeeze or isinstance(output[asheetname], DataFrame):
                if header_names:
                    output[asheetname].columns = output[
                        asheetname].columns.set_names(header_names)

        except EmptyDataError:
            # No Data, return an empty DataFrame
            output[asheetname] = DataFrame()

    if ret_dict:
        return output
    else:
        return output[asheetname]
def _df_filter(ranger, lasso,
               header=0,
               names=None,
               index_col=None,
               parse_cols=None,
               usecols=None,
               squeeze=False,
               dtype=None,
               engine=None,
               true_values=None,
               false_values=None,
               skiprows=None,
               nrows=None,
               na_values=None,
               keep_default_na=True,
               verbose=False,
               parse_dates=False,
               thousands=None,
               comment=None,
               skipfooter=0,
               convert_float=True,
               mangle_dupe_cols=True,
               **kwds):
    """
    Converts captured values table as pandas DataFrame

    Most args copied from :func:`pandas.io.read_excel()` except:

        sheet_name, skip_footer, converters, date_parser

    Note that ``skip_footer`` has been deprecated by ``skipfooter``.

    :param ranger:
        not used in this function's body
    :param lasso:
        object whose ``values`` attribute holds the captured table (rows of
        cells) and which supports ``_replace(values=...)``
    :param header:
        row(s) holding the column labels; a one-element list is unwrapped,
        a longer list triggers forward filling of the header rows
    :param index_col:
        when list-like, empty cells (``""`` or ``None``) in those columns
        are forward filled below the header rows
    :param kwds:
        forwarded to ``TextParser``; the keys ``skip_footer``,
        ``chunksize``, ``date_parser`` and ``converted`` are rejected
    :return:
        `lasso` with its ``values`` replaced by the parsed result, or an
        empty :class:`pandas.DataFrame` when the captured table is empty
    :raises NotImplementedError:
        if any of the rejected keys is present in `kwds`

    ``parse_cols``, ``engine``, ``keep_default_na``, ``verbose`` and
    ``convert_float`` are accepted but not used in this function's body.
    """
    data = lasso.values

    # Copied & adapted from `pandas.io.excel.py` v0.24.2+ (Jun 2019)
    # https://github.com/pandas-dev/pandas/blob/d47fc0c/pandas/io/excel/_base.py#L368

    _validate_header_arg(header)

    # NOTE(review): the docstring lists ``converters`` as unsupported, but
    # the key rejected here is ``converted`` -- possibly a typo; confirm
    # before changing, since callers may currently pass ``converters``.
    invalid_args = (set("skip_footer chunksize date_parser converted".split())
                    & kwds.keys())
    if invalid_args:
        raise NotImplementedError("Cannot implement args: %s" % invalid_args)

    if not data:
        # NOTE(review): this path returns a bare DataFrame while every other
        # path returns a lasso -- confirm callers expect that.
        return pd.DataFrame()

    usecols = _maybe_convert_usecols(usecols)

    if is_list_like(header) and len(header) == 1:
        header = header[0]

    # forward fill and pull out names for MultiIndex column
    header_names = None
    if header is not None and is_list_like(header):
        header_names = []
        control_row = [True for _ in data[0]]

        for row in header:
            if is_integer(skiprows):
                row += skiprows

            try:
                data[row], control_row = _fill_mi_header(
                    data[row], control_row)
            except TypeError:
                ## Arg `control_row` introduced in pandas-v0.19.0 to fix
                #  https://github.com/pandas-dev/pandas/issues/12453
                #  https://github.com/pandas-dev/pandas/commit/67b72e3cbbaeb89a5b9c780b2fe1c8d5eaa9c505
                data[row] = _fill_mi_header(data[row])

            if index_col is not None:
                header_name, data[row] = _pop_header_name(data[row],
                                                          index_col)
                header_names.append(header_name)

    if is_list_like(index_col):
        # forward fill values for MultiIndex index
        if header is None:
            # No header row at all: the data starts at the first row.
            # (Previously ``1 + None`` raised TypeError here.)
            offset = 0
        elif not is_list_like(header):
            offset = 1 + header
        else:
            offset = 1 + max(header)

        # Check if we have an empty dataset
        # before trying to collect data.
        if offset < len(data):
            for col in index_col:
                last = data[offset][col]

                for row in range(offset + 1, len(data)):
                    if data[row][col] == "" or data[row][col] is None:
                        data[row][col] = last
                    else:
                        last = data[row][col]

    has_index_names = is_list_like(header) and len(header) > 1

    # Pandas expects '' instead of `None`!
    data = [["" if c is None else c for c in r] for r in data]

    # GH 12292 : error when read one empty column from excel file
    try:
        parser = pdparsers.TextParser(data,
                                      names=names,
                                      header=header,
                                      index_col=index_col,
                                      has_index_names=has_index_names,
                                      squeeze=squeeze,
                                      dtype=dtype,
                                      true_values=true_values,
                                      false_values=false_values,
                                      skiprows=skiprows,
                                      nrows=nrows,
                                      na_values=na_values,
                                      parse_dates=parse_dates,
                                      thousands=thousands,
                                      comment=comment,
                                      skipfooter=skipfooter,
                                      usecols=usecols,
                                      mangle_dupe_cols=mangle_dupe_cols,
                                      **kwds)

        output = parser.read()

        # Only touch ``.columns`` when the result is (or must be) a
        # DataFrame; with ``squeeze`` the parser may return something else.
        if not squeeze or isinstance(output, pd.DataFrame):
            if header_names:
                output.columns = output.columns.set_names(header_names)

    except EmptyDataError:
        # No Data, return an empty DataFrame
        output = pd.DataFrame()

    lasso = lasso._replace(values=output)

    return lasso