def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.etree import XMLSyntaxError
    from lxml.html import (
        HTMLParser,
        fromstring,
        parse,
    )

    parser = HTMLParser(recover=True, encoding=self.encoding)

    try:
        if is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, OSError) as e:
        # if the input is a blob of html goop
        if not is_url(self.io):
            r = fromstring(self.io, parser=parser)

            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            raise e
    else:
        if not hasattr(r, "text_content"):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

    for br in r.xpath("*//br"):
        br.tail = "\n" + (br.tail or "")

    return r
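# Hedged usage sketch (added for illustration): _build_doc backs pd.read_html when
# flavor="lxml"; read_html accepts URLs, local paths, and file-like objects such as
# StringIO.
from io import StringIO

import pandas as pd

html = "<table><tr><th>a</th></tr><tr><td>1</td></tr></table>"
tables = pd.read_html(StringIO(html), flavor="lxml")  # returns a list of DataFrames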
def _get_data_from_filepath(self, filepath_or_buffer):
    """
    The function read_json accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. JSON string

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    # if it is a string but the file does not exist, it might be a JSON string
    filepath_or_buffer = stringify_path(filepath_or_buffer)
    if (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        self.handles = get_handle(
            filepath_or_buffer,
            "r",
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
            errors=self.encoding_errors,
        )
        filepath_or_buffer = self.handles.handle

    return filepath_or_buffer
def _get_path_or_handle(
    path: FilePathOrBuffer,
    fs: Any,
    storage_options: StorageOptions = None,
    mode: str = "rb",
    is_dir: bool = False,
) -> tuple[FilePathOrBuffer, IOHandles | None, Any]:
    """File handling for PyArrow."""
    path_or_handle = stringify_path(path)
    if is_fsspec_url(path_or_handle) and fs is None:
        fsspec = import_optional_dependency("fsspec")

        fs, path_or_handle = fsspec.core.url_to_fs(
            path_or_handle, **(storage_options or {})
        )
    elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
        # can't write to a remote url
        # without making use of fsspec at the moment
        raise ValueError("storage_options passed with buffer, or non-supported URL")

    handles = None
    if (
        not fs
        and not is_dir
        and isinstance(path_or_handle, str)
        and not os.path.isdir(path_or_handle)
    ):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(
            path_or_handle, mode, is_text=False, storage_options=storage_options
        )
        fs = None
        path_or_handle = handles.handle
    return path_or_handle, handles, fs
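# Hedged usage sketch (added for illustration): _get_path_or_handle is the kind of
# helper that lets pd.read_parquet resolve fsspec URLs; the bucket and path below
# are hypothetical, and storage_options are forwarded to the fsspec filesystem.
import pandas as pd

df = pd.read_parquet(
    "s3://my-bucket/data.parquet",       # hypothetical location
    storage_options={"anon": True},      # forwarded to fsspec
)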
def _read(obj):
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, or file-like

    Returns
    -------
    raw_text : str
    """
    if is_url(obj):
        with urlopen(obj) as url:
            text = url.read()
    elif hasattr(obj, "read"):
        text = obj.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
        try:
            if os.path.isfile(text):
                with open(text, "rb") as f:
                    return f.read()
        except (TypeError, ValueError):
            pass
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text
def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None):
    # If filepath_or_buffer is a url, load the data into a BytesIO
    if is_url(filepath_or_buffer):
        filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
    elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
        filepath_or_buffer = get_filepath_or_buffer(
            filepath_or_buffer, storage_options=storage_options
        ).filepath_or_buffer

    if isinstance(filepath_or_buffer, self._workbook_class):
        self.book = filepath_or_buffer
    elif hasattr(filepath_or_buffer, "read"):
        # N.B. xlrd.Book has a read attribute too
        filepath_or_buffer.seek(0)
        self.book = self.load_workbook(filepath_or_buffer)
    elif isinstance(filepath_or_buffer, str):
        self.book = self.load_workbook(filepath_or_buffer)
    elif isinstance(filepath_or_buffer, bytes):
        self.book = self.load_workbook(BytesIO(filepath_or_buffer))
    else:
        raise ValueError(
            "Must explicitly set engine if not passing in buffer or path for io."
        )
def _write_cell(
    self, s: Any, kind: str = "td", indent: int = 0, tags: Optional[str] = None
) -> None:
    if tags is not None:
        start_tag = "<{kind} {tags}>".format(kind=kind, tags=tags)
    else:
        start_tag = "<{kind}>".format(kind=kind)

    if self.escape:
        # escape & first to prevent double escaping of &
        esc = {"&": r"&amp;", "<": r"&lt;", ">": r"&gt;"}
    else:
        esc = {}

    rs = pprint_thing(s, escape_chars=esc).strip()

    if self.render_links and is_url(rs):
        rs_unescaped = pprint_thing(s, escape_chars={}).strip()
        start_tag += '<a href="{url}" target="_blank">'.format(url=rs_unescaped)
        end_a = "</a>"
    else:
        end_a = ""

    self.write(
        "{start}{rs}{end_a}</{kind}>".format(
            start=start_tag, rs=rs, end_a=end_a, kind=kind
        ),
        indent,
    )
def _write_cell(
    self, s: Any, kind: str = "td", indent: int = 0, tags: str | None = None
) -> None:
    if tags is not None:
        start_tag = f"<{kind} {tags}>"
    else:
        start_tag = f"<{kind}>"

    if self.escape:
        # escape & first to prevent double escaping of &
        esc = {"&": r"&amp;", "<": r"&lt;", ">": r"&gt;"}
    else:
        esc = {}

    rs = pprint_thing(s, escape_chars=esc).strip()

    if self.render_links and is_url(rs):
        rs_unescaped = pprint_thing(s, escape_chars={}).strip()
        start_tag += f'<a href="{rs_unescaped}" target="_blank">'
        end_a = "</a>"
    else:
        end_a = ""

    self.write(f"{start_tag}{rs}{end_a}</{kind}>", indent)
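# Hedged usage sketch (added for illustration): _write_cell is what turns URL-valued
# cells into anchors when DataFrame.to_html is called with render_links=True.
import pandas as pd

df = pd.DataFrame({"site": ["https://pandas.pydata.org"]})
html = df.to_html(render_links=True, escape=True)
# the cell is rendered as <a href="https://pandas.pydata.org" target="_blank">...</a>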
def _read(
    obj: bytes | FilePath | ReadBuffer[str] | ReadBuffer[bytes], encoding: str | None
) -> str | bytes:
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, path object, or file-like object

    Returns
    -------
    raw_text : str
    """
    text: str | bytes
    if (
        is_url(obj)
        or hasattr(obj, "read")
        or (isinstance(obj, str) and file_exists(obj))
    ):
        # error: Argument 1 to "get_handle" has incompatible type "Union[str, bytes,
        # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]";
        # expected "Union[PathLike[str], Union[str, Union[IO[Any], RawIOBase,
        # BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]]"
        with get_handle(
            obj, "r", encoding=encoding  # type: ignore[arg-type]
        ) as handles:
            text = handles.handle.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text
def _check_source_format(self, src):
    # Classify the reader's configured source as a URL, an open file-like
    # object, an fsspec/S3 path, or an unsupported input.
    src = self.source
    if is_url(src):
        fmt = 'url'
    elif is_file_like(src):
        fmt = 'filelike'
    elif is_fsspec_url(src):
        fmt = 's3'
    else:
        fmt = 'invalid'
    return fmt
def get_sbml_model(
        filepath_or_buffer
) -> Tuple[libsbml.SBMLReader, libsbml.SBMLDocument, libsbml.Model]:
    """Get an SBML model from file or URL or file handle

    :param filepath_or_buffer:
        File or URL or file handle to read the model from

    :return:
        The SBML document, model and reader
    """
    from pandas.io.common import get_filepath_or_buffer, is_url, is_file_like

    if is_file_like(filepath_or_buffer) or is_url(filepath_or_buffer):
        buffer = get_filepath_or_buffer(filepath_or_buffer, mode='r')[0]
        if is_url(filepath_or_buffer):
            buffer = ''.join(line.decode('utf-8') for line in buffer)
        else:
            buffer = ''.join(line for line in buffer)

        # URL or already opened file, we will load the model from a string
        return load_sbml_from_string(buffer)

    return load_sbml_from_file(filepath_or_buffer)
def get_sbml_model(
        filepath_or_buffer
) -> Tuple[libsbml.SBMLReader, libsbml.SBMLDocument, libsbml.Model]:
    """Get an SBML model from file or URL or file handle

    :param filepath_or_buffer:
        File or URL or file handle to read the model from

    :return:
        The SBML document, model and reader
    """
    if is_file_like(filepath_or_buffer) or is_url(filepath_or_buffer):
        with get_handle(filepath_or_buffer, mode='r') as io_handle:
            data = load_sbml_from_string(''.join(io_handle.handle))
        # URL or already opened file, we will load the model from a string
        return data

    return load_sbml_from_file(filepath_or_buffer)
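# Hedged usage sketch (added for illustration; assumes the helper above is importable
# from the surrounding module): the same call accepts a local path, a URL, or an
# already opened file handle, and returns the reader, document and model.
sbml_reader, sbml_document, sbml_model = get_sbml_model("model.xml")  # hypothetical file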
def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    if (
        isinstance(filepath_or_buffer, str)
        and not filepath_or_buffer.startswith(("<?xml", "<"))
    ) and (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            filepath_or_buffer = (
                # error: Incompatible types in assignment (expression has type
                # "Union[str, IO[str]]", variable has type "Union[Union[str,
                # PathLike[str]], bytes, ReadBuffer[bytes], ReadBuffer[str]]")
                handle_obj.handle.read()  # type: ignore[assignment]
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer
def _get_data_from_filepath(self, filepath_or_buffer):
    """
    The function read_json accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. JSON string

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.

    It raises FileNotFoundError if the input is a string ending in
    one of .json, .json.gz, .json.bz2, etc. but no such file exists.
    """
    # if it is a string but the file does not exist, it might be a JSON string
    filepath_or_buffer = stringify_path(filepath_or_buffer)
    if (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        self.handles = get_handle(
            filepath_or_buffer,
            "r",
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
            errors=self.encoding_errors,
        )
        filepath_or_buffer = self.handles.handle
    elif (
        isinstance(filepath_or_buffer, str)
        and filepath_or_buffer.lower().endswith(
            (".json",) + tuple(f".json{c}" for c in _extension_to_compression)
        )
        and not file_exists(filepath_or_buffer)
    ):
        raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")

    return filepath_or_buffer
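# Hedged usage sketch (added for illustration): with the check above, a string that
# looks like a JSON file path but does not exist raises FileNotFoundError instead of
# being silently parsed as a literal JSON document.
import pandas as pd

try:
    pd.read_json("missing_table.json")  # hypothetical, nonexistent path
except FileNotFoundError as exc:
    print(exc)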
def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None):
    self.ioargs = IOArgs(
        filepath_or_buffer=filepath_or_buffer,
        encoding=None,
        mode=None,
        compression={"method": None},
    )
    # If filepath_or_buffer is a url, load the data into a BytesIO
    if is_url(filepath_or_buffer):
        self.ioargs = IOArgs(
            filepath_or_buffer=BytesIO(urlopen(filepath_or_buffer).read()),
            should_close=True,
            encoding=None,
            mode=None,
            compression={"method": None},
        )
    elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
        self.ioargs = get_filepath_or_buffer(
            filepath_or_buffer, storage_options=storage_options
        )

    if isinstance(self.ioargs.filepath_or_buffer, self._workbook_class):
        self.book = self.ioargs.filepath_or_buffer
    elif hasattr(self.ioargs.filepath_or_buffer, "read"):
        # N.B. xlrd.Book has a read attribute too
        assert not isinstance(self.ioargs.filepath_or_buffer, str)
        self.ioargs.filepath_or_buffer.seek(0)
        self.book = self.load_workbook(self.ioargs.filepath_or_buffer)
    elif isinstance(self.ioargs.filepath_or_buffer, str):
        self.book = self.load_workbook(self.ioargs.filepath_or_buffer)
    elif isinstance(self.ioargs.filepath_or_buffer, bytes):
        self.book = self.load_workbook(BytesIO(self.ioargs.filepath_or_buffer))
    else:
        raise ValueError(
            "Must explicitly set engine if not passing in buffer or path for io."
        )
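# Hedged usage sketch (added for illustration): the is_url branch above is what lets
# pd.read_excel and ExcelFile accept a remote workbook; the address is hypothetical.
import pandas as pd

df = pd.read_excel("https://example.com/reports/latest.xlsx")  # hypothetical URL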
def filepath_to_buffer(
    filepath: Any,
    encoding: Optional[str] = None,
    compression: Optional[str] = None,
    timeout: Optional[float] = None,
    start_byte: int = 0,
) -> Tuple[io.IOBase, Optional[str], Optional[str], int]:
    if not is_str(filepath):
        # if start_byte:
        #     filepath.seek(start_byte)
        return cast(io.IOBase, filepath), encoding, compression, filepath.size()
    if is_url(filepath):
        headers = None
        if start_byte:
            headers = {"Range": "bytes={}-".format(start_byte)}
        req = requests.get(filepath, stream=True, headers=headers, timeout=timeout)
        content_encoding = req.headers.get("Content-Encoding", None)
        if content_encoding == "gzip":
            compression = "gzip"
        size = req.headers.get("Content-Length", 0)
        # return HttpDesc(req.raw, filepath), encoding, compression, int(size)
        return cast(io.IOBase, req.raw), encoding, compression, int(size)
    if is_s3_url(filepath):
        reader, encoding, compression = s3_get_filepath_or_buffer(
            filepath, encoding=encoding, compression=compression
        )
        return cast(io.IOBase, reader), encoding, compression, reader.size
    if _is_buffer_url(filepath):
        buffer = _url_to_buffer(filepath)
        return cast(io.IOBase, buffer), encoding, compression, buffer.size()
    filepath = os.path.expanduser(filepath)
    if not os.path.exists(filepath):
        raise ValueError("wrong filepath: {}".format(filepath))
    size = os.stat(filepath).st_size
    stream = io.FileIO(filepath)
    if start_byte:
        stream.seek(start_byte)
    return stream, encoding, compression, size
def get_data_from_filepath(
    filepath_or_buffer,
    encoding,
    compression,
    storage_options,
) -> Union[str, bytes, Buffer]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    filepath_or_buffer = stringify_path(filepath_or_buffer)

    if (
        isinstance(filepath_or_buffer, str)
        and not filepath_or_buffer.startswith(("<?xml", "<"))
    ) and (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            filepath_or_buffer = (
                handle_obj.handle.read()
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer
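# Hedged usage sketch (added for illustration): strings that start with "<?xml" or
# "<" skip the file-handling branch above and are treated as literal XML; newer
# pandas versions prefer the document wrapped in StringIO, as done here.
from io import StringIO

import pandas as pd

xml = """<?xml version="1.0"?>
<data>
  <row><a>1</a><b>x</b></row>
  <row><a>2</a><b>y</b></row>
</data>"""
df = pd.read_xml(StringIO(xml))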
def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
    """
    Iterparse xml nodes.

    This method will read in local disk, decompressed XML files for elements
    and underlying descendants using iterparse, a method to iterate through
    an XML tree without holding entire XML tree in memory.

    Raises
    ------
    TypeError
        * If `iterparse` is not a dict or its dict value is not list-like.

    ParserError
        * If `path_or_buffer` is not a physical, decompressed file on disk.
        * If no data is returned from selected items in `iterparse`.

    Notes
    -----
    Namespace URIs will be removed from return node values. Also,
    elements with missing children or attributes in submitted list
    will have optional keys filled with None values.
    """
    dicts: list[dict[str, str | None]] = []
    row: dict[str, str | None] | None = None

    if not isinstance(self.iterparse, dict):
        raise TypeError(
            f"{type(self.iterparse).__name__} is not a valid type for iterparse"
        )

    row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
    if not is_list_like(self.iterparse[row_node]):
        raise TypeError(
            f"{type(self.iterparse[row_node])} is not a valid type "
            "for value in iterparse"
        )

    if (
        not isinstance(self.path_or_buffer, str)
        or is_url(self.path_or_buffer)
        or is_fsspec_url(self.path_or_buffer)
        or self.path_or_buffer.startswith(("<?xml", "<"))
        or infer_compression(self.path_or_buffer, "infer") is not None
    ):
        raise ParserError(
            "iterparse is designed for large XML files that are fully extracted on "
            "local disk and not as compressed files or online sources."
        )

    for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
        curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

        if event == "start":
            if curr_elem == row_node:
                row = {}

        if row is not None:
            if self.names:
                for col, nm in zip(self.iterparse[row_node], self.names):
                    if curr_elem == col:
                        elem_val = elem.text.strip() if elem.text else None
                        if row.get(nm) != elem_val and nm not in row:
                            row[nm] = elem_val
                    if col in elem.attrib:
                        if elem.attrib[col] not in row.values() and nm not in row:
                            row[nm] = elem.attrib[col]
            else:
                for col in self.iterparse[row_node]:
                    if curr_elem == col:
                        row[col] = elem.text.strip() if elem.text else None
                    if col in elem.attrib:
                        row[col] = elem.attrib[col]

        if event == "end":
            if curr_elem == row_node and row is not None:
                dicts.append(row)
                row = None

            elem.clear()
            if hasattr(elem, "getprevious"):
                while (
                    elem.getprevious() is not None and elem.getparent() is not None
                ):
                    del elem.getparent()[0]

    if dicts == []:
        raise ParserError("No result from selected items in iterparse.")

    keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
    dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

    if self.names:
        dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

    return dicts
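# Hedged usage sketch (added for illustration): iterparse takes a dict mapping the
# repeating row element to the child elements/attributes to collect, so large
# documents can be read without holding the whole tree in memory; the file name is
# hypothetical and must point to a local, uncompressed XML file.
import pandas as pd

df = pd.read_xml(
    "very_large_file.xml",
    iterparse={"row": ["shape", "degrees", "sides"]},
)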
def _read_csv_check_support(
    cls,
    read_csv_kwargs: ReadCsvKwargsType,
) -> Tuple[bool, str]:
    """
    Check if passed parameters are supported by current ``modin.pandas.read_csv`` implementation.

    Parameters
    ----------
    read_csv_kwargs : dict
        Parameters of read_csv function.

    Returns
    -------
    bool
        Whether passed parameters are supported or not.
    str
        Error message that should be raised if user explicitly set `engine="arrow"`.
    """
    filepath_or_buffer = read_csv_kwargs.get("filepath_or_buffer", None)
    header = read_csv_kwargs.get("header", "infer")
    names = read_csv_kwargs.get("names", None)
    engine = read_csv_kwargs.get("engine", None)
    skiprows = read_csv_kwargs.get("skiprows", None)
    delimiter = read_csv_kwargs.get("delimiter", None)
    parse_dates = read_csv_kwargs.get("parse_dates", False)

    if read_csv_kwargs.get("compression", "infer") != "infer":
        return (
            False,
            "read_csv with 'arrow' engine doesn't support explicit compression parameter, compression"
            " must be inferred automatically (supported compression types are gzip and bz2)",
        )

    if isinstance(filepath_or_buffer, str):
        if not os.path.exists(filepath_or_buffer):
            if cls.file_exists(filepath_or_buffer) or is_url(filepath_or_buffer):
                return (
                    False,
                    "read_csv with 'arrow' engine supports only local files",
                )
            else:
                raise FileNotFoundError("No such file or directory")
    elif not cls.pathlib_or_pypath(filepath_or_buffer):
        if hasattr(filepath_or_buffer, "read"):
            return (
                False,
                "read_csv with 'arrow' engine doesn't support file-like objects",
            )
        else:
            raise ValueError(
                f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
            )

    for arg, def_value in cls.read_csv_unsup_defaults.items():
        if read_csv_kwargs[arg] != def_value:
            return (
                False,
                f"read_csv with 'arrow' engine doesn't support {arg} parameter",
            )
    if delimiter is not None and read_csv_kwargs.get("delim_whitespace", False):
        raise ValueError(
            "Specified a delimiter with both sep and delim_whitespace=True; you can only specify one."
        )

    parse_dates_unsupported = isinstance(parse_dates, dict) or (
        isinstance(parse_dates, list) and isinstance(parse_dates[0], list)
    )
    if parse_dates_unsupported:
        return (
            False,
            "read_csv with 'arrow' engine supports only bool and "
            "flattened lists 'parse_dates' parameter",
        )
    if names and names != lib.no_default:
        if header not in [None, 0, "infer"]:
            return (
                False,
                "read_csv with 'arrow' engine and provided 'names' parameter supports only 0, None and "
                "'infer' header values",
            )
        if isinstance(parse_dates, list) and not set(parse_dates).issubset(names):
            raise ValueError("Missing column provided to 'parse_dates'")

        empty_pandas_df = pandas.read_csv(
            **dict(
                read_csv_kwargs,
                nrows=0,
                skiprows=None,
                skipfooter=0,
                usecols=None,
                index_col=None,
                names=None,
                parse_dates=None,
                engine=None if engine == "arrow" else engine,
            ),
        )
        columns_number = len(empty_pandas_df.columns)
        if columns_number != len(names):
            return (
                False,
                "read_csv with 'arrow' engine doesn't support names parameter, which length doesn't match "
                "with actual number of columns",
            )
    else:
        if header not in [0, "infer"]:
            return (
                False,
                "read_csv with 'arrow' engine without 'names' parameter provided supports only 0 and 'infer' "
                "header values",
            )
        if isinstance(parse_dates, list):
            empty_pandas_df = pandas.read_csv(
                **dict(
                    read_csv_kwargs,
                    nrows=0,
                    skiprows=None,
                    skipfooter=0,
                    usecols=None,
                    index_col=None,
                    engine=None if engine == "arrow" else engine,
                ),
            )
            if not set(parse_dates).issubset(empty_pandas_df.columns):
                raise ValueError("Missing column provided to 'parse_dates'")

    if not read_csv_kwargs.get("skip_blank_lines", True):
        # in some corner cases empty lines are handled as '',
        # while pandas handles it as NaNs - issue #3084
        return (
            False,
            "read_csv with 'arrow' engine doesn't support skip_blank_lines = False parameter",
        )

    if skiprows is not None and not isinstance(skiprows, int):
        return (
            False,
            "read_csv with 'arrow' engine doesn't support non-integer skiprows parameter",
        )

    return True, None
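# Hedged usage sketch (added for illustration; the dispatcher class name is
# hypothetical): the (supported, message) pair returned above is meant to drive an
# engine fallback decision, roughly like this.
read_csv_kwargs = {"filepath_or_buffer": "data.csv", "engine": "arrow"}  # hypothetical
supported, message = ArrowCSVDispatcher._read_csv_check_support(read_csv_kwargs)
if not supported:
    raise ValueError(message)  # only if the user explicitly asked for engine="arrow"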