def write(self, df, path, compression='snappy', index=None,
          partition_cols=None, **kwargs):
    self.validate_dataframe(df)
    # thriftpy/protocol/compact.py:339:
    # DeprecationWarning: tostring() is deprecated.
    # Use tobytes() instead.
    if 'partition_on' in kwargs and partition_cols is not None:
        raise ValueError("Cannot use both partition_on and "
                         "partition_cols. Use partition_cols for "
                         "partitioning data")
    elif 'partition_on' in kwargs:
        partition_cols = kwargs.pop('partition_on')

    if partition_cols is not None:
        kwargs['file_scheme'] = 'hive'

    if is_s3_url(path):
        # path is s3:// so we need to open the s3file in 'wb' mode.
        # TODO: Support 'ab'
        path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
        # And pass the opened s3file to the fastparquet internal impl.
        kwargs['open_with'] = lambda path, _: path
    else:
        path, _, _, _ = get_filepath_or_buffer(path)

    with catch_warnings(record=True):
        self.api.write(path, df, compression=compression,
                       write_index=index, partition_on=partition_cols,
                       **kwargs)
def read(self, path, columns=None, **kwargs):
    if is_s3_url(path):
        # When path is s3:// an S3File is returned.
        # We need to retain the original path (str) while also
        # passing the S3File().open function to the fastparquet impl.
        s3, _, _ = get_filepath_or_buffer(path)
        parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open)
    else:
        path, _, _ = get_filepath_or_buffer(path)
        parquet_file = self.api.ParquetFile(path)

    return parquet_file.to_pandas(columns=columns, **kwargs)
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', index=None, **kwargs):
    self.validate_dataframe(df)

    # Only validate the index if we're writing it.
    if self._pyarrow_lt_070 and index is not False:
        self._validate_write_lt_070(df)
    path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

    if index is None:
        from_pandas_kwargs = {}
    else:
        from_pandas_kwargs = {'preserve_index': index}

    if self._pyarrow_lt_060:
        table = self.api.Table.from_pandas(df, timestamps_to_ms=True,
                                           **from_pandas_kwargs)
        self.api.parquet.write_table(
            table, path, compression=compression, **kwargs)
    else:
        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
def __init__(self, path_or_buf, index=None, convert_dates=True,
             blank_missing=True, chunksize=None, encoding=None,
             convert_text=True, convert_header_text=True):

    self.index = index
    self.convert_dates = convert_dates
    self.blank_missing = blank_missing
    self.chunksize = chunksize
    self.encoding = encoding
    self.convert_text = convert_text
    self.convert_header_text = convert_header_text

    self.default_encoding = "latin-1"
    self.compression = ""
    self.column_names_strings = []
    self.column_names = []
    self.column_types = []
    self.column_formats = []
    self.columns = []

    self._current_page_data_subheader_pointers = []
    self._cached_page = None
    self._column_data_lengths = []
    self._column_data_offsets = []
    self._current_row_in_file_index = 0
    self._current_row_on_page_index = 0
    self._current_row_in_file_index = 0

    self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
    if isinstance(self._path_or_buf, compat.string_types):
        self._path_or_buf = open(self._path_or_buf, 'rb')
        self.handle = self._path_or_buf

    self._get_properties()
    self._parse_metadata()
def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
    """
    Load msgpack pandas object from the specified file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string
        File path, BytesIO like or string
    encoding : Encoding for decoding msgpack str type
    iterator : boolean, if True, return an iterator to the unpacker
        (default is False)

    Returns
    -------
    obj : same type as object stored in file
    """
    path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)
    if iterator:
        return Iterator(path_or_buf)

    def read(fh):
        unpacked_obj = list(unpack(fh, encoding=encoding, **kwargs))
        if len(unpacked_obj) == 1:
            return unpacked_obj[0]

        if should_close:
            try:
                path_or_buf.close()
            except IOError:
                pass
        return unpacked_obj

    # see if we have an actual file
    if isinstance(path_or_buf, str):
        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(path_or_buf, 'rb') as fh:
                return read(fh)

    if isinstance(path_or_buf, bytes):
        # treat as a binary-like
        fh = None
        try:
            fh = BytesIO(path_or_buf)
            return read(fh)
        finally:
            if fh is not None:
                fh.close()
    elif hasattr(path_or_buf, 'read') and callable(path_or_buf.read):
        # treat as a buffer like
        return read(path_or_buf)

    raise ValueError('path_or_buf needs to be a string file path or file-like')
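# --- Hedged usage sketch (not part of the original sources): round-tripping a
# frame through read_msgpack as defined above. This assumes a pandas version
# older than 1.0, where to_msgpack/read_msgpack still exist; "example.msg" is
# a hypothetical local file created only for the round trip.
import pandas as pd

frame = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
frame.to_msgpack("example.msg")            # serialize with the legacy writer
restored = pd.read_msgpack("example.msg")  # load it back through the reader above
assert restored.equals(frame)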
def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
             chunksize=None):

    self._encoding = encoding
    self._lines_read = 0
    self._index = index
    self._chunksize = chunksize

    if isinstance(filepath_or_buffer, str):
        (filepath_or_buffer, encoding,
         compression, should_close) = get_filepath_or_buffer(
            filepath_or_buffer, encoding=encoding)

    if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)):
        self.filepath_or_buffer = open(filepath_or_buffer, 'rb')
    else:
        # Copy to BytesIO, and ensure no encoding
        contents = filepath_or_buffer.read()
        try:
            contents = contents.encode(self._encoding)
        except UnicodeEncodeError:
            pass
        self.filepath_or_buffer = compat.BytesIO(contents)

    self._read_header()
def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
    """
    Load msgpack pandas object from the specified file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string
        File path, BytesIO like or string
    encoding : Encoding for decoding msgpack str type
    iterator : boolean, if True, return an iterator to the unpacker
        (default is False)

    Returns
    -------
    obj : type of object stored in file
    """
    path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
    if iterator:
        return Iterator(path_or_buf)

    def read(fh):
        unpacked = list(unpack(fh, encoding=encoding, **kwargs))
        if len(unpacked) == 1:
            return unpacked[0]
        return unpacked

    # see if we have an actual file
    if isinstance(path_or_buf, compat.string_types):
        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(path_or_buf, 'rb') as fh:
                return read(fh)

    if isinstance(path_or_buf, compat.binary_type):
        # treat as a binary-like
        fh = None
        try:
            # We can't distinguish between a path and a buffer of bytes in
            # Python 2 so instead assume the first byte of a valid path is
            # less than 0x80.
            if compat.PY3 or ord(path_or_buf[0]) >= 0x80:
                fh = compat.BytesIO(path_or_buf)
                return read(fh)
        finally:
            if fh is not None:
                fh.close()
    elif hasattr(path_or_buf, 'read') and compat.callable(path_or_buf.read):
        # treat as a buffer like
        return read(path_or_buf)

    raise ValueError('path_or_buf needs to be a string file path or file-like')
def test_get_filepath_or_buffer_with_path(self):
    filename = '~/sometest'
    filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
        filename)
    assert filepath_or_buffer != filename
    assert os.path.isabs(filepath_or_buffer)
    assert os.path.expanduser(filename) == filepath_or_buffer
    assert not should_close
def read(self, path, columns=None, **kwargs):
    path, _, _ = get_filepath_or_buffer(path)
    if self._pyarrow_lt_070:
        return self.api.parquet.read_pandas(path, columns=columns,
                                            **kwargs).to_pandas()
    kwargs['use_pandas_metadata'] = True
    return self.api.parquet.read_table(path, columns=columns,
                                       **kwargs).to_pandas()
def write(self, df, path, compression='snappy', **kwargs):
    # thriftpy/protocol/compact.py:339:
    # DeprecationWarning: tostring() is deprecated.
    # Use tobytes() instead.
    path, _, _ = get_filepath_or_buffer(path)
    with catch_warnings(record=True):
        self.api.write(path, df, compression=compression, **kwargs)
def read_msgpack(path_or_buf, encoding="utf-8", iterator=False, **kwargs):
    """
    Load msgpack pandas object from the specified file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string
        File path, BytesIO like or string
    encoding : Encoding for decoding msgpack str type
    iterator : boolean, if True, return an iterator to the unpacker
        (default is False)

    Returns
    -------
    obj : type of object stored in file
    """
    path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
    if iterator:
        return Iterator(path_or_buf)

    def read(fh):
        unpacked = list(unpack(fh, encoding=encoding, **kwargs))
        if len(unpacked) == 1:
            return unpacked[0]
        return unpacked

    # see if we have an actual file
    if isinstance(path_or_buf, compat.string_types):
        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(path_or_buf, "rb") as fh:
                return read(fh)

    # treat as a binary-like
    if isinstance(path_or_buf, compat.binary_type):
        fh = None
        try:
            fh = compat.BytesIO(path_or_buf)
            return read(fh)
        finally:
            if fh is not None:
                fh.close()

    # a buffer like
    if hasattr(path_or_buf, "read") and compat.callable(path_or_buf.read):
        return read(path_or_buf)

    raise ValueError("path_or_buf needs to be a string file path or file-like")
def write(
    self,
    df: DataFrame,
    path,
    compression="snappy",
    index=None,
    partition_cols=None,
    **kwargs,
):
    self.validate_dataframe(df)
    # thriftpy/protocol/compact.py:339:
    # DeprecationWarning: tostring() is deprecated.
    # Use tobytes() instead.
    if "partition_on" in kwargs and partition_cols is not None:
        raise ValueError("Cannot use both partition_on and "
                         "partition_cols. Use partition_cols for "
                         "partitioning data")
    elif "partition_on" in kwargs:
        partition_cols = kwargs.pop("partition_on")

    if partition_cols is not None:
        kwargs["file_scheme"] = "hive"

    if is_s3_url(path) or is_gcs_url(path):
        # if path is s3:// or gs:// we need to open the file in 'wb' mode.
        # TODO: Support 'ab'
        path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
        # And pass the opened file to the fastparquet internal impl.
        kwargs["open_with"] = lambda path, _: path
    else:
        path, _, _, _ = get_filepath_or_buffer(path)

    with catch_warnings(record=True):
        self.api.write(
            path,
            df,
            compression=compression,
            write_index=index,
            partition_on=partition_cols,
            **kwargs,
        )
def read(self, path, columns=None, **kwargs):
    path, _, _, should_close = get_filepath_or_buffer(path)
    kwargs["use_pandas_metadata"] = True
    result = self.api.parquet.read_table(
        path, columns=columns, **kwargs
    ).to_pandas()
    if should_close:
        path.close()

    return result
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    # Dictionaries are no longer considered valid inputs
    # for "get_filepath_or_buffer" starting in pandas >= 0.20.0
    if isinstance(filepath_or_buffer, dict):
        return filepath_or_buffer, encoding, compression

    return com.get_filepath_or_buffer(filepath_or_buffer,
                                      encoding=encoding,
                                      compression=None)
def write(self, df, path, compression='snappy', **kwargs):
    self.validate_dataframe(df)
    # thriftpy/protocol/compact.py:339:
    # DeprecationWarning: tostring() is deprecated.
    # Use tobytes() instead.
    if is_s3_url(path):
        # path is s3:// so we need to open the s3file in 'wb' mode.
        # TODO: Support 'ab'
        path, _, _ = get_filepath_or_buffer(path, mode='wb')
        # And pass the opened s3file to the fastparquet internal impl.
        kwargs['open_with'] = lambda path, _: path
    else:
        path, _, _ = get_filepath_or_buffer(path)

    with catch_warnings(record=True):
        self.api.write(path, df, compression=compression, **kwargs)
def read_feather(
    path,
    columns=None,
    use_threads: bool = True,
    storage_options: StorageOptions = None,
):
    """
    Load a feather-format object from the file path.

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a
        host is expected. A local file could be:
        ``file://localhost/path/to/table.feather``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    columns : sequence, default None
        If not provided, all columns are read.

        .. versionadded:: 0.24.0
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads.

        .. versionadded:: 0.24.0
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection,
        e.g. host, port, username, password, etc., if using a URL that will
        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
        will be raised if providing this argument with a local path or
        a file-like buffer. See the fsspec and backend storage implementation
        docs for the set of allowed keys and values.

        .. versionadded:: 1.2.0

    Returns
    -------
    type of object stored in file
    """
    import_optional_dependency("pyarrow")
    from pyarrow import feather

    ioargs = get_filepath_or_buffer(path, storage_options=storage_options)

    df = feather.read_feather(
        ioargs.filepath_or_buffer, columns=columns, use_threads=bool(use_threads)
    )

    ioargs.close()

    return df
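# --- Hedged usage sketch (not part of the original sources): exercising
# read_feather as defined above, assuming a pandas >= 1.2 installation with
# pyarrow available. "example.feather" is a hypothetical local file written
# here only so the read call has something to load.
import pandas as pd

out = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
out.to_feather("example.feather")  # requires pyarrow
df = pd.read_feather("example.feather", columns=["a"], use_threads=True)
print(df)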
def read_msgpack(path_or_buf, encoding="utf-8", iterator=False, **kwargs):
    """
    Load msgpack pandas object from the specified file path

    Parameters
    ----------
    path_or_buf : string
        File path, BytesIO like or string
    encoding : Encoding for decoding msgpack str type
    iterator : boolean, if True, return an iterator to the unpacker
        (default is False)

    Returns
    -------
    obj : type of object stored in file
    """
    path_or_buf, *_ = get_filepath_or_buffer(path_or_buf)
    if iterator:
        return Iterator(path_or_buf)

    def read(fh):
        unpacked = list(unpack(fh, encoding=encoding, **kwargs))
        if len(unpacked) == 1:
            if isinstance(unpacked[0], np.ndarray):
                return unpacked[0].copy()
            return unpacked[0]
        return unpacked

    # see if we have an actual file
    if isinstance(path_or_buf, path_types):
        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(path_or_buf, "rb") as fh:
                return read(fh)

    # treat as a binary-like
    if isinstance(path_or_buf, binary_type):
        fh = None
        try:
            fh = BytesIO(path_or_buf)
            return read(fh)
        finally:
            if fh is not None:
                fh.close()

    # a buffer like
    if hasattr(path_or_buf, "read") and callable(path_or_buf.read):
        return read(path_or_buf)

    raise ValueError("path_or_buf needs to be a string file path or file-like")
def read(self, path, columns=None, **kwargs):
    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")

        open_with = lambda path, _: fsspec.open(path, "rb").open()
        parquet_file = self.api.ParquetFile(path, open_with=open_with)
    else:
        path, _, _, _ = get_filepath_or_buffer(path)
        parquet_file = self.api.ParquetFile(path)

    return parquet_file.to_pandas(columns=columns, **kwargs)
def read_msgpack(path_or_buf, iterator=False, **kwargs):
    """
    Load msgpack pandas object from the specified file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string
        File path, BytesIO like or string
    iterator : boolean, if True, return an iterator to the unpacker
        (default is False)

    Returns
    -------
    obj : type of object stored in file
    """
    _importers()
    path_or_buf, _ = get_filepath_or_buffer(path_or_buf)
    if iterator:
        return Iterator(path_or_buf)

    def read(fh):
        unpacked = list(unpack(fh))
        if len(unpacked) == 1:
            return unpacked[0]
        return unpacked

    # see if we have an actual file
    if isinstance(path_or_buf, compat.string_types):
        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(path_or_buf, 'rb') as fh:
                return read(fh)

    # treat as a string-like
    if not hasattr(path_or_buf, 'read'):
        try:
            fh = compat.BytesIO(path_or_buf)
            return read(fh)
        finally:
            fh.close()

    # a buffer like
    return read(path_or_buf)
def __init__(
    self,
    path_or_buf,
    index=None,
    convert_dates=True,
    blank_missing=True,
    chunksize=None,
    encoding=None,
    convert_text=True,
    convert_header_text=True,
):

    self.index = index
    self.convert_dates = convert_dates
    self.blank_missing = blank_missing
    self.chunksize = chunksize
    self.encoding = encoding
    self.convert_text = convert_text
    self.convert_header_text = convert_header_text

    self.default_encoding = "latin-1"
    self.compression = b""
    self.column_names_strings = []
    self.column_names = []
    self.column_formats = []
    self.columns = []

    self._current_page_data_subheader_pointers = []
    self._cached_page = None
    self._column_data_lengths = []
    self._column_data_offsets = []
    self._column_types = []

    self._current_row_in_file_index = 0
    self._current_row_on_page_index = 0
    self._current_row_in_file_index = 0

    path_or_buf = get_filepath_or_buffer(path_or_buf).filepath_or_buffer
    if isinstance(path_or_buf, str):
        buf = open(path_or_buf, "rb")
        self.handle = buf
    else:
        buf = path_or_buf

    self._path_or_buf: IO[Any] = buf

    try:
        self._get_properties()
        self._parse_metadata()
    except Exception:
        self.close()
        raise
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', **kwargs):
    path, _, _ = get_filepath_or_buffer(path)
    if self._pyarrow_lt_060:
        table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
        self.api.parquet.write_table(
            table, path, compression=compression, **kwargs)
    else:
        table = self.api.Table.from_pandas(df)
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
def read(self, path, columns=None, **kwargs):
    path, _, _, should_close = get_filepath_or_buffer(path)

    kwargs['use_pandas_metadata'] = True
    result = self.api.parquet.read_table(path, columns=columns,
                                         **kwargs).to_pandas()
    if should_close:
        try:
            path.close()
        except:  # noqa: flake8
            pass

    return result
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None,
              numpy=True, parse_dates=False, keep_default_dates=True):
    """
    Convert JSON string to pandas object

    Parameters
    ----------
    filepath_or_buffer : a VALID JSON string or file handle / StringIO.
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        and file. For file URLs, a host is expected. For instance, a local
        file could be file://localhost/path/to/table.json
    orient : {'split', 'records', 'index'}, default 'index'
        The format of the JSON string
        split : dict like {index -> [index], name -> name, data -> [values]}
        records : list like [value, ... , value]
        index : dict like {index -> value}
    typ : type of object to recover (series or frame), default 'frame'
    dtype : dtype of the resulting object
    numpy : direct decoding to numpy arrays. default True but falls back
        to standard decoding if a problem occurs.
    parse_dates : a list of columns to parse for dates; If True, then try
        to parse datelike columns default is False
    keep_default_dates : boolean, default True.
        If parsing dates, then parse the default datelike columns

    Returns
    -------
    result : Series or DataFrame
    """
    filepath_or_buffer, _ = get_filepath_or_buffer(path_or_buf)
    if isinstance(filepath_or_buffer, basestring):
        if os.path.exists(filepath_or_buffer):
            with open(filepath_or_buffer, 'r') as fh:
                json = fh.read()
        else:
            json = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        json = filepath_or_buffer.read()
    else:
        json = filepath_or_buffer

    obj = None
    if typ == 'frame':
        obj = FrameParser(json, orient, dtype, numpy, parse_dates,
                          keep_default_dates).parse()

    if typ == 'series' or obj is None:
        obj = SeriesParser(json, orient, dtype, numpy, parse_dates,
                           keep_default_dates).parse()

    return obj
def from_json(cls, path_or_buf=None):
    """
    Returns an ACN-Sim object loaded from in_registry.

    Note URLs have not been tested as path_or_buf input.

    Args:
        path_or_buf (Union[str, FilePathOrBuffer]): a valid JSON str,
            path object or file-like object. Any valid string path is
            acceptable.
    """
    # The code here is from pandas 1.0.1, io.json.from_json(), with
    # modifications.
    filepath_or_buffer, _, _, should_close = get_filepath_or_buffer(
        path_or_buf)

    exists = False
    if isinstance(filepath_or_buffer, str):
        try:
            exists = os.path.exists(filepath_or_buffer)
        except (TypeError, ValueError):
            pass

    if exists:
        filepath_or_buffer, _ = get_handle(filepath_or_buffer, "r")
        should_close = True

    if isinstance(filepath_or_buffer, str):
        should_close = False
        out_registry = json.loads(filepath_or_buffer)
    else:
        out_registry = json.load(filepath_or_buffer)

    if should_close:
        filepath_or_buffer.close()

    if out_registry["version"] is None:
        warnings.warn(
            "Missing a recorded version of acnportal in the "
            "loaded registry. Object may have been dumped with a "
            "different version of acnportal.",
            UserWarning,
        )

    if out_registry["dependency_versions"] is None:
        warnings.warn(
            "Missing recorded dependency versions of acnportal in "
            "the loaded registry. Object may have been dumped "
            "with different dependency versions of acnportal.",
            UserWarning,
        )

    out_obj = cls._from_registry(out_registry)[0]
    return out_obj
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', index=None, **kwargs):
    self.validate_dataframe(df)
    path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

    if index is None:
        from_pandas_kwargs = {}
    else:
        from_pandas_kwargs = {'preserve_index': index}

    table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
    self.api.parquet.write_table(
        table, path, compression=compression,
        coerce_timestamps=coerce_timestamps, **kwargs)
def write(
    self,
    df: DataFrame,
    path,
    compression="snappy",
    index=None,
    partition_cols=None,
    storage_options: StorageOptions = None,
    **kwargs,
):
    self.validate_dataframe(df)
    # thriftpy/protocol/compact.py:339:
    # DeprecationWarning: tostring() is deprecated.
    # Use tobytes() instead.
    if "partition_on" in kwargs and partition_cols is not None:
        raise ValueError(
            "Cannot use both partition_on and "
            "partition_cols. Use partition_cols for partitioning data"
        )
    elif "partition_on" in kwargs:
        partition_cols = kwargs.pop("partition_on")

    if partition_cols is not None:
        kwargs["file_scheme"] = "hive"

    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")

        # if filesystem is provided by fsspec, file must be opened in 'wb' mode.
        kwargs["open_with"] = lambda path, _: fsspec.open(
            path, "wb", **(storage_options or {})
        ).open()
    else:
        if storage_options:
            raise ValueError(
                "storage_options passed with file object or non-fsspec file path"
            )
        path, _, _, _ = get_filepath_or_buffer(path)

    with catch_warnings(record=True):
        self.api.write(
            path,
            df,
            compression=compression,
            write_index=index,
            partition_on=partition_cols,
            **kwargs,
        )
def par2txt(path, sep, header, index):
    """
    Convert Parquet to CSV text.
    """
    path, _, _, should_close = get_filepath_or_buffer(path)
    f = IterableParquetFile(path)
    n = 0
    for chunk in f:
        print(chunk.to_csv(sep=sep, index=index, header=header))
        n += len(chunk)
    if should_close:
        path.close()
def info(path, schema):
    """
    Print Parquet file metadata.
    """
    path, _, _, should_close = get_filepath_or_buffer(path)
    f = IterableParquetFile(path)
    if schema:
        print(format_schema(f.schema))
    else:
        print(format_metadata(f.metadata))
    if should_close:
        path.close()
def __init__(self, filepath_or_buffer):
    """Reader using xlrd engine.

    Parameters
    ----------
    filepath_or_buffer : string, path object or Workbook
        Object to be parsed.
    """
    err_msg = "Install xlrd >= 1.0.0 for Excel support"

    try:
        import xlrd
    except ImportError:
        raise ImportError(err_msg)
    else:
        if xlrd.__VERSION__ < LooseVersion("1.0.0"):
            raise ImportError(err_msg +
                              ". Current version " + xlrd.__VERSION__)

    from pandas.io.excel._base import ExcelFile
    # If filepath_or_buffer is a url, want to keep the data as bytes so
    # can't pass to get_filepath_or_buffer()
    if _is_url(filepath_or_buffer):
        filepath_or_buffer = _urlopen(filepath_or_buffer)
    elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
        filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
            filepath_or_buffer)

    if isinstance(filepath_or_buffer, xlrd.Book):
        self.book = filepath_or_buffer
    elif hasattr(filepath_or_buffer, "read"):
        # N.B. xlrd.Book has a read attribute too
        if hasattr(filepath_or_buffer, 'seek'):
            try:
                # GH 19779
                filepath_or_buffer.seek(0)
            except UnsupportedOperation:
                # HTTPResponse does not support seek()
                # GH 20434
                pass

        data = filepath_or_buffer.read()
        self.book = xlrd.open_workbook(file_contents=data)
    elif isinstance(filepath_or_buffer, compat.string_types):
        self.book = xlrd.open_workbook(filepath_or_buffer)
    else:
        raise ValueError('Must explicitly set engine if not passing in'
                         ' buffer or path for io.')
def __init__(self, filepath_or_buffer):
    """Reader using xlrd engine.

    Parameters
    ----------
    filepath_or_buffer : string, path object or Workbook
        Object to be parsed.
    """
    err_msg = "Install xlrd >= 1.0.0 for Excel support"

    try:
        import xlrd
    except ImportError:
        raise ImportError(err_msg)
    else:
        if xlrd.__VERSION__ < LooseVersion("1.0.0"):
            raise ImportError(err_msg +
                              ". Current version " + xlrd.__VERSION__)

    from pandas.io.excel._base import ExcelFile
    # If filepath_or_buffer is a url, want to keep the data as bytes so
    # can't pass to get_filepath_or_buffer()
    if _is_url(filepath_or_buffer):
        filepath_or_buffer = urlopen(filepath_or_buffer)
    elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
        filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
            filepath_or_buffer)

    if isinstance(filepath_or_buffer, xlrd.Book):
        self.book = filepath_or_buffer
    elif hasattr(filepath_or_buffer, "read"):
        # N.B. xlrd.Book has a read attribute too
        if hasattr(filepath_or_buffer, 'seek'):
            try:
                # GH 19779
                filepath_or_buffer.seek(0)
            except UnsupportedOperation:
                # HTTPResponse does not support seek()
                # GH 20434
                pass

        data = filepath_or_buffer.read()
        self.book = xlrd.open_workbook(file_contents=data)
    elif isinstance(filepath_or_buffer, compat.string_types):
        self.book = xlrd.open_workbook(filepath_or_buffer)
    else:
        raise ValueError('Must explicitly set engine if not passing in'
                         ' buffer or path for io.')
def open_filepath_or_buffer(f, open_flags="r", compression=None):
    """Use pandas IO functions to return a handle from a filepath
    or buffer.

    Parameters
    ----------
    f : str or buffer
        filepath or buffer to open
    open_flags : str, optional
        mode to open file
    compression : str, optional
        compression arg passed to pandas functions

    Returns
    -------
    f : file-like
        A file-like object
    handles : list of file-like
        A list of file-like objects opened. Seems mostly relevant for
        zipped archives.
    close : bool
        A flag indicating whether the caller should close the file object
        when done
    """
    if not pandas:
        raise Exception("Please install pandas to use this function")

    res = get_filepath_or_buffer(f, compression=compression)
    # HACK: handle multiple pandas versions
    try:
        f, _, compression, should_close = res
    except TypeError:
        f = res.filepath_or_buffer
        compression = res.compression
        should_close = res.should_close

    close = False or should_close
    if isinstance(f, str):
        close = True

    res = get_handle(f, open_flags, compression=compression)
    # HACK: handle multiple pandas versions
    try:
        f, handles = res
    except TypeError:
        f = res.handle
        handles = res.created_handles

    return f, handles, close
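# --- Hedged usage sketch (not part of the original sources): one way the
# version-tolerant helper above could be called. "table.csv.gz" is a
# hypothetical gzip-compressed local file.
fh, handles, close = open_filepath_or_buffer("table.csv.gz",
                                             open_flags="r",
                                             compression="gzip")
try:
    print(fh.readline())
finally:
    if close:
        fh.close()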
def from_url(cls, url):
    """
    Alternate constructor to create a GeoDataFrame from a GeoJSON file
    online.

    Example:
        df = geopandas.GeoDataFrame.from_url('https://raw.githubusercontent.com/geopandas/geopandas/master/examples/null_geom.geojson')

    Inspired by pandas.read_json().
    """
    raw = get_filepath_or_buffer(url)[0]
    data = raw.read()
    if isinstance(data, bytes):
        data = data.decode('utf-8')
    geojson = json.loads(data)
    return GeoDataFrame.from_features(geojson['features'])
def __init__(self, path_or_buf, encoding="cp1252"):
    super(StataReader, self).__init__(encoding)
    self.col_sizes = ()
    self._has_string_data = False
    self._missing_values = False
    self._data_read = False
    self._value_labels_read = False
    if isinstance(path_or_buf, str):
        path_or_buf, encoding = get_filepath_or_buffer(
            path_or_buf, encoding=self._default_encoding)

    if isinstance(path_or_buf, (str, compat.text_type, bytes)):
        self.path_or_buf = open(path_or_buf, "rb")
    else:
        self.path_or_buf = path_or_buf

    self._read_header()
def read(self, path, columns=None, **kwargs):
    fs = get_fs_for_path(path)
    should_close = None
    # Avoid calling get_filepath_or_buffer for s3/gcs URLs since it
    # returns an S3File which doesn't support dir reads in arrow
    if not fs:
        path, _, _, should_close = get_filepath_or_buffer(path)

    kwargs["use_pandas_metadata"] = True
    result = self.api.parquet.read_table(
        path, columns=columns, filesystem=fs, **kwargs
    ).to_pandas()

    if should_close:
        path.close()

    return result
def gsea_gmt_parser(gmt, min_size=3, max_size=5000, gene_list=None):
    """Parse a gene_sets.gmt (gene set database) file.

    :param gmt: the gene_sets.gmt file located inside the edb folder.
    :param min_size: Minimum allowed number of genes from the gene set that
                     are also in the data set. Default: 3.
    :param max_size: Maximum allowed number of genes from the gene set that
                     are also in the data set. Default: 5000.
    :param gene_list: Used for filtering gene sets. Only use this argument
                      with the :func:`run` method.
    :return: Return a new filtered gene set database dictionary.

    **DO NOT** filter gene sets when using :func:`replot`, because
    ``GSEA`` Desktop has already done this for you.
    """
    file_or_buffer, encode, compression = get_filepath_or_buffer(gmt)
    genesets_dict = {line.rstrip("\n").split("\t")[0]:
                     line.rstrip("\n").split("\t")[2:]
                     for line in file_or_buffer.readlines()}

    # filtering dict
    if sys.version_info[0] == 3:
        genesets_filter = {k: v for k, v in genesets_dict.items()
                           if len(v) >= min_size and len(v) <= max_size}
    elif sys.version_info[0] == 2:
        genesets_filter = {k: v for k, v in genesets_dict.iteritems()
                           if len(v) >= min_size and len(v) <= max_size}
    else:
        print("System failure. Please provide correct input files")
        sys.exit(1)

    if gene_list is not None:
        subsets = sorted(genesets_filter.keys())
        for subset in subsets:
            tag_indicator = in1d(unique(gene_list),
                                 genesets_filter.get(subset),
                                 assume_unique=True)
            tag_len = sum(tag_indicator)
            if tag_len <= min_size or tag_len >= max_size:
                del genesets_filter[subset]
            else:
                continue
    # some_dict = {key: value for key, value in some_dict.items()
    #              if value != value_to_remove}
    # use np.intersect1d() may be faster???

    filsets_num = len(genesets_dict) - len(genesets_filter)
    print("{a} gene_sets have been filtered out when max_size={b} and "
          "min_size={c}".format(a=filsets_num, b=max_size, c=min_size))
    print("{} gene_sets used for further calculation".format(
        len(genesets_filter)))

    if filsets_num == len(genesets_dict):
        print("No gene sets passed through the filtering condition! "
              "Try new parameters again.\n"
              "Note: Gene names for gseapy are case sensitive.")
        sys.exit(1)
    else:
        return genesets_filter
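# --- Hedged usage sketch (not part of the original sources): parsing a
# hypothetical "genesets.gmt" file with the parser above and filtering
# against a small gene list.
genes = ["TP53", "EGFR", "MYC", "BRCA1"]
gene_sets = gsea_gmt_parser("genesets.gmt", min_size=3, max_size=500,
                            gene_list=genes)
print(len(gene_sets), "gene sets kept after filtering")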
def read(self, path, columns=None, **kwargs):
    if is_s3_url(path):
        from pandas.io.s3 import get_file_and_filesystem

        # When path is s3:// an S3File is returned.
        # We need to retain the original path (str) while also
        # passing the S3File().open function to the fastparquet impl.
        s3, filesystem = get_file_and_filesystem(path)
        try:
            parquet_file = self.api.ParquetFile(
                path, open_with=filesystem.open)
        finally:
            s3.close()
    else:
        path, _, _, _ = get_filepath_or_buffer(path)
        parquet_file = self.api.ParquetFile(path)

    return parquet_file.to_pandas(columns=columns, **kwargs)
def read_orc(
    path: FilePathOrBuffer,
    columns: Optional[List[str]] = None,
    **kwargs,
) -> "DataFrame":
    """
    Load an ORC object from the file path, returning a DataFrame.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a
        host is expected. A local file could be:
        ``file://localhost/path/to/table.orc``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    columns : list, default None
        If not None, only these columns will be read from the file.
    **kwargs
        Any additional kwargs are passed to pyarrow.

    Returns
    -------
    DataFrame
    """
    # we require a newer version of pyarrow than we support for parquet
    import pyarrow

    if distutils.version.LooseVersion(pyarrow.__version__) < "0.13.0":
        raise ImportError("pyarrow must be >= 0.13.0 for read_orc")

    import pyarrow.orc

    path, _, _, _ = get_filepath_or_buffer(path)
    orc_file = pyarrow.orc.ORCFile(path)
    result = orc_file.read(columns=columns, **kwargs).to_pandas()
    return result
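# --- Hedged usage sketch (not part of the original sources): reading a
# hypothetical ORC file through the read_orc wrapper above (pandas >= 1.0
# and pyarrow >= 0.13.0 assumed).
import pandas as pd

df = pd.read_orc("example.orc", columns=["col_a", "col_b"])
print(df.dtypes)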
def __init__(self, path_or_buf, encoding='cp1252'):
    super(StataReader, self).__init__(encoding)
    self.col_sizes = ()
    self._has_string_data = False
    self._missing_values = False
    self._data_read = False
    self._value_labels_read = False
    if isinstance(path_or_buf, str):
        path_or_buf, encoding = get_filepath_or_buffer(
            path_or_buf, encoding='cp1252')

    if isinstance(path_or_buf, (str, compat.text_type, bytes)):
        self.path_or_buf = open(path_or_buf, 'rb')
    else:
        self.path_or_buf = path_or_buf

    self._read_header()
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    # Dictionaries are no longer considered valid inputs
    # for "get_filepath_or_buffer" starting in pandas >= 0.20.0
    if isinstance(filepath_or_buffer, dict):
        return filepath_or_buffer, encoding, compression

    try:
        tmp = com._get_filepath_or_buffer(filepath_or_buffer,
                                          encoding=encoding,
                                          compression=None)
        return tmp.filepath_or_buffer, tmp.encoding, tmp.compression
    except AttributeError:
        tmp = com.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=None)
        return tmp
def __init__(self, path_or_buf, encoding=None):
    super(StataReader, self).__init__(encoding)
    self.col_sizes = ()
    self._has_string_data = False
    self._missing_values = False
    self._data_read = False
    self._value_labels_read = False
    if isinstance(path_or_buf, str):
        path_or_buf, encoding = get_filepath_or_buffer(
            path_or_buf, encoding='cp1252')
    if encoding is not None:
        self._encoding = encoding

    if type(path_or_buf) is str:
        self.path_or_buf = open(path_or_buf, 'rb')
    else:
        self.path_or_buf = path_or_buf

    self._read_header()
def open_file(filepath_or_buffer, mode="r", encoding=None, compression="infer"):
    if encoding is not None:
        encoding = re.sub("_", "-", encoding).lower()

    compression = _infer_compression(filepath_or_buffer, compression)
    filepath_or_buffer, _, compression = get_filepath_or_buffer(
        filepath_or_buffer, encoding, compression)

    is_path = isinstance(filepath_or_buffer, str)

    if compression:
        # GZ Compression
        if compression == "gzip":
            if is_path:
                return gzip.open(filepath_or_buffer, mode)
            return gzip.GzipFile(fileobj=filepath_or_buffer)

        # BZ Compression
        elif compression == "bz2":
            if is_path:
                return bz2.BZ2File(filepath_or_buffer, mode)
            return bz2.BZ2File(filepath_or_buffer)

        # ZIP Compression
        elif compression == "zip":
            zip_file = zipfile.ZipFile(filepath_or_buffer)
            zip_names = zip_file.namelist()
            if len(zip_names) == 1:
                return zip_file.open(zip_names.pop())
            if len(zip_names) == 0:
                raise ValueError(
                    f"Zero files found in ZIP file {filepath_or_buffer}")
            else:
                raise ValueError(
                    "Multiple files found in ZIP file."
                    f" Only one file per ZIP: {filepath_or_buffer}")

        # XZ Compression
        elif compression == "xz":
            return lzma.LZMAFile(filepath_or_buffer, mode)

        # Unrecognized Compression
        raise ValueError(f"Unrecognized compression type: {compression}")
    elif is_path:
        return open(filepath_or_buffer, mode, encoding=encoding)
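# --- Hedged usage sketch (not part of the original sources): opening a
# hypothetical gzip-compressed file through open_file above; the returned
# handle is a raw gzip stream, so it is read in binary mode here.
fh = open_file("records.json.gz", mode="rb")
try:
    print(fh.readline())
finally:
    fh.close()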
def __init__(self, filepath_or_buffer):
    # If filepath_or_buffer is a url, load the data into a BytesIO
    if _is_url(filepath_or_buffer):
        filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
    elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
        filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)

    if isinstance(filepath_or_buffer, self._workbook_class):
        self.book = filepath_or_buffer
    elif hasattr(filepath_or_buffer, "read"):
        # N.B. xlrd.Book has a read attribute too
        filepath_or_buffer.seek(0)
        self.book = self.load_workbook(filepath_or_buffer)
    elif isinstance(filepath_or_buffer, str):
        self.book = self.load_workbook(filepath_or_buffer)
    else:
        raise ValueError(
            "Must explicitly set engine if not passing in buffer or path for io."
        )
def _read_content(path_or_buf):
    filepath_or_buffer, _ = get_filepath_or_buffer(path_or_buf)
    if isinstance(filepath_or_buffer, compat.string_types):
        try:
            exists = os.path.exists(filepath_or_buffer)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(filepath_or_buffer, 'r') as fh:
                data = fh.read()
        else:
            data = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        data = filepath_or_buffer.read()
    else:
        data = filepath_or_buffer

    return data
def read_feather(path, columns=None, use_threads: bool = True):
    """
    Load a feather-format object from the file path.

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a
        host is expected. A local file could be:
        ``file://localhost/path/to/table.feather``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    columns : sequence, default None
        If not provided, all columns are read.

        .. versionadded:: 0.24.0
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads.

        .. versionadded:: 0.24.0

    Returns
    -------
    type of object stored in file
    """
    import_optional_dependency("pyarrow")
    from pyarrow import feather

    path, _, _, should_close = get_filepath_or_buffer(path)

    df = feather.read_feather(path, columns=columns,
                              use_threads=bool(use_threads))

    # s3fs only validates the credentials when the file is closed.
    if should_close:
        path.close()

    return df
def __init__(
    self,
    formatter: "DataFrameFormatter",
    path_or_buf: FilePathOrBuffer[str] = "",
    sep: str = ",",
    cols: Optional[Sequence[Label]] = None,
    index_label: Optional[IndexLabel] = None,
    mode: str = "w",
    encoding: Optional[str] = None,
    errors: str = "strict",
    compression: CompressionOptions = "infer",
    quoting: Optional[int] = None,
    line_terminator="\n",
    chunksize: Optional[int] = None,
    quotechar: Optional[str] = '"',
    date_format: Optional[str] = None,
    doublequote: bool = True,
    escapechar: Optional[str] = None,
    storage_options: StorageOptions = None,
):
    self.fmt = formatter

    self.obj = self.fmt.frame

    self.ioargs = get_filepath_or_buffer(
        path_or_buf,
        encoding=encoding,
        compression=compression,
        mode=mode,
        storage_options=storage_options,
    )

    self.sep = sep
    self.index_label = self._initialize_index_label(index_label)
    self.errors = errors
    self.quoting = quoting or csvlib.QUOTE_MINIMAL
    self.quotechar = self._initialize_quotechar(quotechar)
    self.doublequote = doublequote
    self.escapechar = escapechar
    self.line_terminator = line_terminator or os.linesep
    self.date_format = date_format
    self.cols = self._initialize_columns(cols)
    self.chunksize = self._initialize_chunksize(chunksize)
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', index=None, partition_cols=None,
          **kwargs):
    self.validate_dataframe(df)
    path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

    if index is None:
        from_pandas_kwargs = {}
    else:
        from_pandas_kwargs = {'preserve_index': index}

    table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
    if partition_cols is not None:
        self.api.parquet.write_to_dataset(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps,
            partition_cols=partition_cols, **kwargs)
    else:
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
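# --- Hedged usage sketch (not part of the original sources): the public
# DataFrame.to_parquet call that a method like the one above ultimately
# serves, writing a hive-partitioned dataset with the pyarrow engine.
# "dataset_dir" is a hypothetical output directory.
import pandas as pd

df = pd.DataFrame({"year": [2019, 2019, 2020], "value": [1, 2, 3]})
df.to_parquet("dataset_dir", engine="pyarrow", partition_cols=["year"])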
def __init__(self, filepath_or_buffer):
    # If filepath_or_buffer is a url, load the data into a BytesIO
    if _is_url(filepath_or_buffer):
        filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
    elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
        filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
            filepath_or_buffer)

    if isinstance(filepath_or_buffer, self._workbook_class):
        self.book = filepath_or_buffer
    elif hasattr(filepath_or_buffer, "read"):
        # N.B. xlrd.Book has a read attribute too
        filepath_or_buffer.seek(0)
        self.book = self.load_workbook(filepath_or_buffer)
    elif isinstance(filepath_or_buffer, str):
        self.book = self.load_workbook(filepath_or_buffer)
    else:
        raise ValueError('Must explicitly set engine if not passing in'
                         ' buffer or path for io.')
def _read_content(path_or_buf):
    """ copied part of internal logic from pandas.io.read_json """
    results = get_filepath_or_buffer(path_or_buf)
    # results length is 3 in pandas 0.17 or later, 2 in 0.16.2 or prior
    filepath_or_buffer = results[0]

    if isinstance(filepath_or_buffer, compat.string_types):
        try:
            exists = os.path.exists(filepath_or_buffer)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(filepath_or_buffer, 'r') as fh:
                data = fh.read()
        else:
            data = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        data = filepath_or_buffer.read()
    else:
        data = filepath_or_buffer

    return data
def test_get_filepath_or_buffer_with_buffer(self):
    input_buffer = StringIO()
    filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
        input_buffer)
    assert filepath_or_buffer == input_buffer
    assert not should_close
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
              convert_axes=True, convert_dates=True, keep_default_dates=True,
              numpy=False, precise_float=False, date_unit=None):
    """
    Convert a JSON string to pandas object

    Parameters
    ----------
    path_or_buf : a valid JSON string or file-like, default: None
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        and file. For file URLs, a host is expected. For instance, a local
        file could be ``file://localhost/path/to/table.json``
    orient
        * `Series`
          - default is ``'index'``
          - allowed values are: ``{'split','records','index'}``
          - The Series index must be unique for orient ``'index'``.
        * `DataFrame`
          - default is ``'columns'``
          - allowed values are: {'split','records','index','columns','values'}
          - The DataFrame index must be unique for orients 'index' and
            'columns'.
          - The DataFrame columns must be unique for orients 'index',
            'columns', and 'records'.
        * The format of the JSON string
          - split : dict like
            ``{index -> [index], columns -> [columns], data -> [values]}``
          - records : list like
            ``[{column -> value}, ... , {column -> value}]``
          - index : dict like ``{index -> {column -> value}}``
          - columns : dict like ``{column -> {index -> value}}``
          - values : just the values array
    typ : type of object to recover (series or frame), default 'frame'
    dtype : boolean or dict, default True
        If True, infer dtypes, if a dict of column to dtype, then use those,
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : boolean, default True
        Try to convert the axes to the proper dtypes.
    convert_dates : boolean, default True
        List of columns to parse for dates; If True, then try to parse
        datelike columns default is True; a column label is datelike if

        * it ends with ``'_at'``,
        * it ends with ``'_time'``,
        * it begins with ``'timestamp'``,
        * it is ``'modified'``, or
        * it is ``'date'``
    keep_default_dates : boolean, default True
        If parsing dates, then parse the default datelike columns
    numpy : boolean, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.
    precise_float : boolean, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality
    date_unit : string, default None
        The timestamp unit to detect if converting dates. The default
        behaviour is to try and detect the correct precision, but if this is
        not desired then pass one of 's', 'ms', 'us' or 'ns' to force parsing
        only seconds, milliseconds, microseconds or nanoseconds respectively.

    Returns
    -------
    result : Series or DataFrame
    """
    filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf)
    if isinstance(filepath_or_buffer, compat.string_types):
        try:
            exists = os.path.exists(filepath_or_buffer)

        # if the filepath is too long will raise here
        # 5874
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(filepath_or_buffer, 'r') as fh:
                json = fh.read()
        else:
            json = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        json = filepath_or_buffer.read()
    else:
        json = filepath_or_buffer

    obj = None
    if typ == 'frame':
        obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
                          keep_default_dates, numpy, precise_float,
                          date_unit).parse()

    if typ == 'series' or obj is None:
        if not isinstance(dtype, bool):
            dtype = dict(data=dtype)
        obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates,
                           keep_default_dates, numpy, precise_float,
                           date_unit).parse()

    return obj
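# --- Hedged usage sketch (not part of the original sources): decoding an
# inline JSON string with read_json as defined above, using the 'split'
# orient.
import pandas as pd

json_str = '{"columns":["a","b"],"index":[0,1],"data":[[1,2],[3,4]]}'
df = pd.read_json(json_str, orient='split')
print(df)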
def read(self, path, columns=None, **kwargs):
    path, _, _ = get_filepath_or_buffer(path)
    return self.api.parquet.read_table(path, columns=columns,
                                       **kwargs).to_pandas()
def test_get_filepath_or_buffer_with_buffer(self):
    input_buffer = StringIO()
    filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer)
    self.assertEqual(filepath_or_buffer, input_buffer)
def test_get_filepath_or_buffer_with_path(self):
    filename = '~/sometest'
    filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename)
    self.assertNotEqual(filepath_or_buffer, filename)
    self.assertTrue(isabs(filepath_or_buffer))
    self.assertEqual(os.path.expanduser(filename), filepath_or_buffer)
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
              convert_axes=True, convert_dates=True, keep_default_dates=True,
              numpy=False, precise_float=False, date_unit=None, encoding=None,
              lines=False):
    """
    Convert a JSON string to pandas object

    Parameters
    ----------
    path_or_buf : a valid JSON string or file-like, default: None
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        and file. For file URLs, a host is expected. For instance, a local
        file could be ``file://localhost/path/to/table.json``
    orient : string,
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{index -> [index], columns -> [columns], data -> [values]}``
        - ``'records'`` : list like
          ``[{column -> value}, ... , {column -> value}]``
        - ``'index'`` : dict like ``{index -> {column -> value}}``
        - ``'columns'`` : dict like ``{column -> {index -> value}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{'split','records','index'}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{'split','records','index',
            'columns','values'}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

    typ : type of object to recover (series or frame), default 'frame'
    dtype : boolean or dict, default True
        If True, infer dtypes, if a dict of column to dtype, then use those,
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : boolean, default True
        Try to convert the axes to the proper dtypes.
    convert_dates : boolean, default True
        List of columns to parse for dates; If True, then try to parse
        datelike columns default is True; a column label is datelike if

        * it ends with ``'_at'``,
        * it ends with ``'_time'``,
        * it begins with ``'timestamp'``,
        * it is ``'modified'``, or
        * it is ``'date'``

    keep_default_dates : boolean, default True
        If parsing dates, then parse the default datelike columns
    numpy : boolean, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.
    precise_float : boolean, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality
    date_unit : string, default None
        The timestamp unit to detect if converting dates. The default
        behaviour is to try and detect the correct precision, but if this is
        not desired then pass one of 's', 'ms', 'us' or 'ns' to force parsing
        only seconds, milliseconds, microseconds or nanoseconds respectively.
    lines : boolean, default False
        Read the file as a json object per line.

        .. versionadded:: 0.19.0

    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.

        .. versionadded:: 0.19.0

    Returns
    -------
    result : Series or DataFrame, depending on the value of `typ`.

    See Also
    --------
    DataFrame.to_json

    Examples
    --------

    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a Dataframe using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],
      "index":["row 1","row 2"],
      "data":[["a","b"],["c","d"]]}'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d
    """
    filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
                                                      encoding=encoding)
    if isinstance(filepath_or_buffer, compat.string_types):
        try:
            exists = os.path.exists(filepath_or_buffer)

        # if the filepath is too long will raise here
        # 5874
        except (TypeError, ValueError):
            exists = False

        if exists:
            fh, handles = _get_handle(filepath_or_buffer, 'r',
                                      encoding=encoding)
            json = fh.read()
            fh.close()
        else:
            json = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        json = filepath_or_buffer.read()
    else:
        json = filepath_or_buffer

    if lines:
        # If given a json lines file, we break the string into lines, add
        # commas and put it in a json list to make a valid json object.
        lines = list(StringIO(json.strip()))
        json = u'[' + u','.join(lines) + u']'

    obj = None
    if typ == 'frame':
        obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
                          keep_default_dates, numpy, precise_float,
                          date_unit).parse()

    if typ == 'series' or obj is None:
        if not isinstance(dtype, bool):
            dtype = dict(data=dtype)
        obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates,
                           keep_default_dates, numpy, precise_float,
                           date_unit).parse()

    return obj