def __init__(self, src: FilePath | ReadBuffer[bytes], **kwds):
    """
    Store the raw source and keyword options, then run the shared
    ``ParserBase`` setup followed by this parser's own option parsing.

    Parameters
    ----------
    src : FilePath | ReadBuffer[bytes]
        Path or readable binary buffer to parse.
    **kwds
        Parser options; kept verbatim on ``self.kwds`` and also handed
        to ``ParserBase.__init__``.
    """
    # Keep both the source and the untouched options around for later use.
    self.src = src
    self.kwds = kwds

    # Base-class setup must run before option post-processing.
    ParserBase.__init__(self, kwds)
    self._parse_kwds()
def __init__(self, src: FilePathOrBuffer, **kwds):
    """
    Remember the source and keyword options, run the common
    ``ParserBase`` initialization, then post-process the options.

    Parameters
    ----------
    src : FilePathOrBuffer
        Path or buffer to parse.
    **kwds
        Parser options; stored verbatim on ``self.kwds`` and passed
        through to ``ParserBase.__init__``.
    """
    # Stash inputs first; both attributes hold the caller's objects as-is.
    self.src = src
    self.kwds = kwds

    # Shared base setup, then this parser's own keyword handling.
    ParserBase.__init__(self, kwds)
    self._parse_kwds()
def __init__(self, src: FilePathOrBuffer, **kwds):
    """
    Build the C-engine reader and reconcile column metadata.

    Opens I/O handles for *src*, constructs ``parsers.TextReader`` from a
    copy of the keyword options, then derives ``self.names``,
    ``self.orig_names``, ``self.index_names`` and ``self.index_col`` from
    the reader's header, the ``usecols`` selection and the index options.
    The original keyword dict is kept untouched on ``self.kwds``; only the
    copy is mutated before being handed to the reader.
    """
    self.kwds = kwds
    # Work on a copy so the caller's dict (kept on self.kwds) is not mutated.
    kwds = kwds.copy()

    ParserBase.__init__(self, kwds)

    # #2442
    # error: Cannot determine type of 'index_col'
    kwds["allow_leading_cols"] = (
        self.index_col is not False  # type: ignore[has-type]
    )

    # GH20529, validate usecol arg before TextReader
    kwds["usecols"] = self.usecols

    # open handles
    self._open_handles(src, kwds)
    assert self.handles is not None

    # These options are consumed by the handle-opening step above and must
    # not reach TextReader.
    for key in ("storage_options", "encoding", "memory_map", "compression"):
        kwds.pop(key, None)

    try:
        self._reader = parsers.TextReader(self.handles.handle, **kwds)
    except Exception:
        # Don't leak open handles when the reader cannot be constructed.
        self.handles.close()
        raise

    self.unnamed_cols = self._reader.unnamed_cols

    # error: Cannot determine type of 'names'
    passed_names = self.names is None  # type: ignore[has-type]

    if self._reader.header is None:
        self.names = None
    else:
        if len(self._reader.header) > 1:
            # we have a multi index in the columns
            # error: Cannot determine type of 'names'
            # error: Cannot determine type of 'index_names'
            # error: Cannot determine type of 'col_names'
            (
                self.names,  # type: ignore[has-type]
                self.index_names,
                self.col_names,
                passed_names,
            ) = self._extract_multi_indexer_columns(
                self._reader.header,
                self.index_names,  # type: ignore[has-type]
                self.col_names,  # type: ignore[has-type]
                passed_names,
            )
        else:
            # error: Cannot determine type of 'names'
            self.names = list(self._reader.header[0])  # type: ignore[has-type]

    # No header and no user-supplied names: synthesize column names, either
    # prefixed (e.g. "X0", "X1", ...) or plain integer positions.
    # error: Cannot determine type of 'names'
    if self.names is None:  # type: ignore[has-type]
        if self.prefix:
            # error: Cannot determine type of 'names'
            self.names = [  # type: ignore[has-type]
                f"{self.prefix}{i}" for i in range(self._reader.table_width)
            ]
        else:
            # error: Cannot determine type of 'names'
            self.names = list(  # type: ignore[has-type]
                range(self._reader.table_width)
            )

    # gh-9755
    #
    # need to set orig_names here first
    # so that proper indexing can be done
    # with _set_noconvert_columns
    #
    # once names has been filtered, we will
    # then set orig_names again to names
    # error: Cannot determine type of 'names'
    self.orig_names = self.names[:]  # type: ignore[has-type]

    if self.usecols:
        usecols = self._evaluate_usecols(self.usecols, self.orig_names)

        # GH 14671
        # assert for mypy, orig_names is List or None, None would error in issubset
        assert self.orig_names is not None
        if self.usecols_dtype == "string" and not set(usecols).issubset(
            self.orig_names
        ):
            self._validate_usecols_names(usecols, self.orig_names)

        # Filter names down to the selected columns (by position or label).
        # error: Cannot determine type of 'names'
        if len(self.names) > len(usecols):  # type: ignore[has-type]
            # error: Cannot determine type of 'names'
            self.names = [  # type: ignore[has-type]
                n
                # error: Cannot determine type of 'names'
                for i, n in enumerate(self.names)  # type: ignore[has-type]
                if (i in usecols or n in usecols)
            ]

        # error: Cannot determine type of 'names'
        if len(self.names) < len(usecols):  # type: ignore[has-type]
            # error: Cannot determine type of 'names'
            self._validate_usecols_names(
                usecols,
                self.names,  # type: ignore[has-type]
            )

    # error: Cannot determine type of 'names'
    self._validate_parse_dates_presence(self.names)  # type: ignore[has-type]
    self._set_noconvert_columns()

    # error: Cannot determine type of 'names'
    self.orig_names = self.names  # type: ignore[has-type]

    if not self._has_complex_date_col:
        # error: Cannot determine type of 'index_col'
        if self._reader.leading_cols == 0 and is_index_col(
            self.index_col  # type: ignore[has-type]
        ):
            self._name_processed = True
            (
                index_names,
                # error: Cannot determine type of 'names'
                self.names,  # type: ignore[has-type]
                self.index_col,
            ) = self._clean_index_names(
                # error: Cannot determine type of 'names'
                self.names,  # type: ignore[has-type]
                # error: Cannot determine type of 'index_col'
                self.index_col,  # type: ignore[has-type]
                self.unnamed_cols,
            )

            if self.index_names is None:
                self.index_names = index_names

        if self._reader.header is None and not passed_names:
            # Without a header and without explicit names there is nothing
            # meaningful to call the index levels.
            assert self.index_names is not None
            self.index_names = [None] * len(self.index_names)

    # NOTE(review): leading_cols > 0 appears to mean the reader detected
    # implicit index columns — confirm against TextReader's contract.
    self._implicit_index = self._reader.leading_cols > 0
def __init__(self, f: FilePathOrBuffer | list, **kwds):
    """
    Workhorse function for processing nested list into DataFrame.

    Unpacks the parsing options from *kwds* onto ``self``, wires up a
    line-producing reader for *f* (a path/buffer, or an already-materialized
    list of rows as passed by ``read_excel``), infers the columns, and
    precompiles the numeric-detection regex used during conversion.
    """
    ParserBase.__init__(self, kwds)

    self.data: Iterator[str] | None = None
    self.buf: list = []
    self.pos = 0
    self.line_pos = 0

    self.skiprows = kwds["skiprows"]

    # Normalize skiprows into a predicate over row numbers: either the
    # user-supplied callable, or membership in the given collection.
    if callable(self.skiprows):
        self.skipfunc = self.skiprows
    else:
        self.skipfunc = lambda x: x in self.skiprows

    self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"])
    self.delimiter = kwds["delimiter"]

    self.quotechar = kwds["quotechar"]
    if isinstance(self.quotechar, str):
        self.quotechar = str(self.quotechar)

    self.escapechar = kwds["escapechar"]
    self.doublequote = kwds["doublequote"]
    self.skipinitialspace = kwds["skipinitialspace"]
    self.lineterminator = kwds["lineterminator"]
    self.quoting = kwds["quoting"]
    self.skip_blank_lines = kwds["skip_blank_lines"]

    self.names_passed = kwds["names"] or None

    self.has_index_names = False
    if "has_index_names" in kwds:
        self.has_index_names = kwds["has_index_names"]

    self.verbose = kwds["verbose"]
    self.converters = kwds["converters"]

    # Copy so later dtype mutation does not leak back into the caller's kwds.
    self.dtype = copy(kwds["dtype"])
    self.thousands = kwds["thousands"]
    self.decimal = kwds["decimal"]

    self.comment = kwds["comment"]

    # Set self.data to something that can read lines.
    if isinstance(f, list):
        # read_excel: f is a list
        self.data = cast(Iterator[str], f)
    else:
        self._open_handles(f, kwds)
        assert self.handles is not None
        assert hasattr(self.handles.handle, "readline")
        try:
            self._make_reader(self.handles.handle)
        except (csv.Error, UnicodeDecodeError):
            # Close the handles we just opened before propagating.
            self.close()
            raise

    # Get columns in two steps: infer from data, then
    # infer column indices from self.usecols if it is specified.
    self._col_indices: list[int] | None = None
    try:
        (
            self.columns,
            self.num_original_columns,
            self.unnamed_cols,
        ) = self._infer_columns()
    except (TypeError, ValueError):
        self.close()
        raise

    # Now self.columns has the set of columns that we will process.
    # The original set is stored in self.original_columns.
    if len(self.columns) > 1:
        # we are processing a multi index column
        # error: Cannot determine type of 'index_names'
        # error: Cannot determine type of 'col_names'
        (
            self.columns,
            self.index_names,
            self.col_names,
            _,
        ) = self._extract_multi_indexer_columns(
            self.columns,
            self.index_names,  # type: ignore[has-type]
            self.col_names,  # type: ignore[has-type]
        )
        # Update list of original names to include all indices.
        self.num_original_columns = len(self.columns)
    else:
        self.columns = self.columns[0]

    # get popped off for index
    self.orig_names: list[int | str | tuple] = list(self.columns)

    # needs to be cleaned/refactored
    # multiple date column thing turning into a real spaghetti factory
    if not self._has_complex_date_col:
        (index_names, self.orig_names, self.columns) = self._get_index_name(
            self.columns
        )
        self._name_processed = True
        if self.index_names is None:
            self.index_names = index_names

    if self._col_indices is None:
        self._col_indices = list(range(len(self.columns)))

    self._validate_parse_dates_presence(self.columns)
    no_thousands_columns: set[int] | None = None
    if self.parse_dates:
        # Columns involved in date parsing must not get thousands-separator
        # stripping applied to them.
        no_thousands_columns = self._set_noconvert_dtype_columns(
            self._col_indices, self.columns
        )
    self._no_thousands_columns = no_thousands_columns

    if len(self.decimal) != 1:
        raise ValueError("Only length-1 decimal markers supported")

    # Precompile the pattern that recognizes numeric tokens (optional sign,
    # optional thousands grouping, decimal part, scientific exponent).
    decimal = re.escape(self.decimal)
    if self.thousands is None:
        regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
    else:
        thousands = re.escape(self.thousands)
        regex = (
            fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
            fr"([0-9]?(E|e)\-?[0-9]+)?$"
        )
    self.num = re.compile(regex)
def __init__(self, src: FilePathOrBuffer, **kwds):
    """
    Build the C-engine reader and reconcile column metadata.

    Opens I/O handles for *src* (unwrapping to the raw ``mmap`` object when
    memory-mapping is in effect), constructs ``parsers.TextReader`` from a
    copy of the keyword options, then derives ``self.names``,
    ``self.orig_names``, ``self.index_names`` and ``self.index_col`` from
    the reader's header, the ``usecols`` selection and the index options.
    The caller's dict is kept untouched on ``self.kwds``.
    """
    self.kwds = kwds
    # Work on a copy so the caller's dict (kept on self.kwds) is not mutated.
    kwds = kwds.copy()

    ParserBase.__init__(self, kwds)

    # #2442
    kwds["allow_leading_cols"] = self.index_col is not False

    # GH20529, validate usecol arg before TextReader
    kwds["usecols"] = self.usecols

    # open handles
    self._open_handles(src, kwds)
    assert self.handles is not None

    # These options are consumed by the handle-opening step above and must
    # not reach TextReader.
    for key in ("storage_options", "encoding", "memory_map", "compression"):
        kwds.pop(key, None)

    # Hand TextReader the raw mmap object rather than the wrapper around it.
    if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"):
        # error: Item "IO[Any]" of "Union[IO[Any], RawIOBase, BufferedIOBase,
        # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
        # error: Item "RawIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase,
        # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
        # error: Item "BufferedIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase,
        # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
        # error: Item "TextIOBase" of "Union[IO[Any], RawIOBase, BufferedIOBase,
        # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
        # error: Item "TextIOWrapper" of "Union[IO[Any], RawIOBase, BufferedIOBase,
        # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
        # error: Item "mmap" of "Union[IO[Any], RawIOBase, BufferedIOBase,
        # TextIOBase, TextIOWrapper, mmap]" has no attribute "mmap"
        self.handles.handle = self.handles.handle.mmap  # type: ignore[union-attr]

    try:
        self._reader = parsers.TextReader(self.handles.handle, **kwds)
    except Exception:
        # Don't leak open handles when the reader cannot be constructed.
        self.handles.close()
        raise

    self.unnamed_cols = self._reader.unnamed_cols

    passed_names = self.names is None

    if self._reader.header is None:
        self.names = None
    else:
        if len(self._reader.header) > 1:
            # we have a multi index in the columns
            (
                self.names,
                self.index_names,
                self.col_names,
                passed_names,
            ) = self._extract_multi_indexer_columns(
                self._reader.header, self.index_names, self.col_names, passed_names
            )
        else:
            self.names = list(self._reader.header[0])

    # No header and no user-supplied names: synthesize column names, either
    # prefixed (e.g. "X0", "X1", ...) or plain integer positions.
    if self.names is None:
        if self.prefix:
            self.names = [
                f"{self.prefix}{i}" for i in range(self._reader.table_width)
            ]
        else:
            self.names = list(range(self._reader.table_width))

    # gh-9755
    #
    # need to set orig_names here first
    # so that proper indexing can be done
    # with _set_noconvert_columns
    #
    # once names has been filtered, we will
    # then set orig_names again to names
    self.orig_names = self.names[:]

    if self.usecols:
        usecols = self._evaluate_usecols(self.usecols, self.orig_names)

        # GH 14671
        # assert for mypy, orig_names is List or None, None would error in issubset
        assert self.orig_names is not None
        if self.usecols_dtype == "string" and not set(usecols).issubset(
            self.orig_names
        ):
            self._validate_usecols_names(usecols, self.orig_names)

        # Filter names down to the selected columns (by position or label).
        if len(self.names) > len(usecols):
            self.names = [
                n
                for i, n in enumerate(self.names)
                if (i in usecols or n in usecols)
            ]

        if len(self.names) < len(usecols):
            self._validate_usecols_names(usecols, self.names)

    self._validate_parse_dates_presence(self.names)
    self._set_noconvert_columns()

    self.orig_names = self.names

    if not self._has_complex_date_col:
        if self._reader.leading_cols == 0 and is_index_col(self.index_col):
            self._name_processed = True
            (index_names, self.names, self.index_col) = self._clean_index_names(
                self.names, self.index_col, self.unnamed_cols
            )

            if self.index_names is None:
                self.index_names = index_names

        if self._reader.header is None and not passed_names:
            # Without a header and without explicit names there is nothing
            # meaningful to call the index levels.
            assert self.index_names is not None
            self.index_names = [None] * len(self.index_names)

    # NOTE(review): leading_cols > 0 appears to mean the reader detected
    # implicit index columns — confirm against TextReader's contract.
    self._implicit_index = self._reader.leading_cols > 0