def maybe_read_encoded_stream(reader, encoding=None):
    """
    Read ``reader`` and, when decoding is required, wrap the decoded text
    in a fresh ``StringIO``.

    Parameters
    ----------
    reader : a streamable file-like object
    encoding : optional, the encoding to attempt to read

    Returns
    -------
    a tuple of (a stream of decoded bytes, the encoding which was used)
    """
    if compat.PY3 or encoding is not None:  # pragma: no cover
        # An explicit encoding is decoded strictly; otherwise fall back to
        # utf-8 and replace undecodable bytes.
        errors = 'strict' if encoding else 'replace'
        encoding = encoding or 'utf-8'
        raw = reader.read()
        return StringIO(raw.decode(encoding, errors)), encoding

    # No decoding needed: hand the reader back untouched.
    return reader, None
def test_to_html(self):
    """Smoke-test ``to_html`` on a large mixed frame and on an empty one."""
    # big mixed
    biggie = DataFrame({'A': np.random.randn(200),
                        'B': tm.makeStringIndex(200)},
                       index=lrange(200))
    for column in ('A', 'B'):
        biggie.loc[:20, column] = np.nan

    rendered = biggie.to_html()

    # Writing into a buffer returns None and yields the same markup.
    sink = StringIO()
    assert biggie.to_html(buf=sink) is None
    assert sink.getvalue() == rendered
    assert isinstance(rendered, compat.string_types)

    # The remaining calls only need to run without raising.
    option_sets = [
        dict(col_space=17),
        dict(formatters={'A': lambda x: '{x:.1f}'.format(x=x)}),
        dict(float_format=str),
        dict(col_space=12, float_format=str),
    ]
    for options in option_sets:
        biggie.to_html(columns=['B', 'A'], **options)

    DataFrame(index=np.arange(200)).to_html()
def test_to_csv_quoting(self):
    """Check that ``to_csv`` honours the ``quoting`` argument."""
    frame = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

    out = StringIO()
    frame.to_csv(out, index=False, quoting=csv.QUOTE_NONNUMERIC)
    expected = ('"A","B"\n'
                '1,"foo"\n'
                '2,"bar"\n'
                '3,"baz"\n')
    self.assertEqual(out.getvalue(), expected)

    # quoting windows line terminators, presents with encoding?
    # #3503
    text = 'a,b,c\n1,"test \r\n",3\n'
    round_tripped = StringIO()
    pd.read_csv(StringIO(text)).to_csv(round_tripped, encoding='utf-8',
                                       index=False)
    self.assertEqual(round_tripped.getvalue(), text)

    # testing if quoting parameter is passed through with multi-indexes
    # related to issue #7791
    indexed = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
    indexed = indexed.set_index(['a', 'b'])
    expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
    self.assertEqual(indexed.to_csv(quoting=csv.QUOTE_ALL), expected)
def test_to_csv_stringio(self):
    """Round-trip ``self.frame`` through an in-memory CSV buffer."""
    stream = StringIO()
    self.frame.to_csv(stream)
    stream.seek(0)

    reconstructed = read_csv(stream, index_col=0)

    # TODO to_csv drops column name
    assert_frame_equal(reconstructed, self.frame, check_names=False)
def test_to_html(self):
    """Smoke-test ``to_html`` on a large mixed frame and on an empty one."""
    # big mixed
    biggie = DataFrame({'A': np.random.randn(200),
                        'B': tm.makeStringIndex(200)},
                       index=lrange(200))
    for column in ('A', 'B'):
        biggie.loc[:20, column] = np.nan

    rendered = biggie.to_html()

    # Writing into a buffer returns None and yields the same markup.
    sink = StringIO()
    self.assertIsNone(biggie.to_html(buf=sink))
    self.assertEqual(sink.getvalue(), rendered)
    tm.assertIsInstance(rendered, compat.string_types)

    # The remaining calls only need to run without raising.
    option_sets = [
        dict(col_space=17),
        dict(formatters={'A': lambda x: '%.1f' % x}),
        dict(float_format=str),
        dict(col_space=12, float_format=str),
    ]
    for options in option_sets:
        biggie.to_html(columns=['B', 'A'], **options)

    DataFrame(index=np.arange(200)).to_html()
def _read_one_data(self, ftppath, params):
    """
    Fetch the index file at ``ftppath`` and parse it into a DataFrame.

    Zip and gzip paths (matched by ``_ZIP_RE`` / ``_GZ_RE``) are delegated
    to the dedicated readers; anything else is retrieved line by line over
    the FTP session. ``params`` is not used here.

    Raises
    ------
    RemoteDataError
        If the FTP server closes the connection during retrieval.
    """
    if re.search(_ZIP_RE, ftppath) is not None:
        raw_index = self._read_zipfile(ftppath)
    elif re.search(_GZ_RE, ftppath) is not None:
        raw_index = self._read_gzfile(ftppath)
    else:
        raw_index = StringIO()
        received = []
        try:
            self._sec_ftp_session.retrlines('RETR ' + ftppath,
                                            received.append)
        except EOFError:
            raise RemoteDataError('FTP server has closed the connection.')

        # retrlines strips line endings, so put them back.
        raw_index.writelines(row + '\n' for row in received)
        raw_index.seek(0)

    body = self._remove_header(raw_index)
    index = read_csv(body, delimiter='|', header=None, index_col=False,
                     names=_COLUMNS, low_memory=False, dtype=_COLUMN_TYPES)

    fixed_paths = index['filename'].map(self._fix_old_file_paths)
    index['filename'] = fixed_paths
    return index
def test_to_csv_numpy_16_bug(self):
    """Dates written by ``to_csv`` must appear in ISO form in the output."""
    dates = DataFrame({'a': date_range('1/1/2000', periods=10)})

    sink = StringIO()
    dates.to_csv(sink)

    self.assertIn('2000-01-01', sink.getvalue())
def to_clipboard(obj, excel=True, sep=None, **kwargs):  # pragma: no cover
    """
    Attempt to write text representation of object to the system clipboard
    The clipboard can be then pasted into Excel for example.

    Parameters
    ----------
    obj : the object to write to the clipboard
    excel : boolean, defaults to True
            if True, use the provided separator, writing in a csv
            format for allowing easy pasting into excel.
            if False, write a string representation of the object
            to the clipboard
    sep : optional, defaults to tab
    other keywords are passed to to_csv

    Notes
    -----
    Requirements for your platform
      - Linux: xclip, or xsel (with gtk or PyQt4 modules)
      - Windows:
      - OS X:
    """
    encoding = kwargs.pop('encoding', 'utf-8')

    # testing if an invalid encoding is passed to clipboard
    if encoding is not None:
        normalized = encoding.lower().replace('-', '')
        if normalized != 'utf8':
            raise ValueError('clipboard only supports utf-8 encoding')

    from pandas.io.clipboard import clipboard_set

    use_excel = True if excel is None else excel

    if use_excel:
        try:
            if sep is None:
                sep = '\t'
            csv_buffer = StringIO()
            # clipboard_set (pyperclip) expects unicode
            obj.to_csv(csv_buffer, sep=sep, encoding='utf-8', **kwargs)
            payload = csv_buffer.getvalue()
            if PY2:
                payload = payload.decode('utf-8')
            clipboard_set(payload)
            return
        except TypeError:
            # Falls through to the plain-text path below.
            warnings.warn('to_clipboard in excel mode requires a single '
                          'character separator.')
    elif sep is not None:
        warnings.warn('to_clipboard with excel=False ignores the sep argument')

    if isinstance(obj, ABCDataFrame):
        # str(df) has various unhelpful defaults, like truncation
        with option_context('display.max_colwidth', 999999):
            rendered = obj.to_string(**kwargs)
    else:
        rendered = str(obj)
    clipboard_set(rendered)
def test_to_csv_unicode_index(self):
    """A Series with non-ASCII index labels survives a UTF-8 round trip."""
    original = Series([u("\u05d0"), "d2"],
                      index=[u("\u05d0"), u("\u05d1")])

    stream = StringIO()
    original.to_csv(stream, encoding="UTF-8")
    stream.seek(0)

    restored = self.read_csv(stream, index_col=0, encoding="UTF-8")
    assert_series_equal(original, restored)
def test_info_duplicate_columns_shows_correct_dtypes(self):
    """``info`` reports each duplicate-named column with its own dtype."""
    # GH11761
    frame = DataFrame([[1, 2.0]], columns=["a", "a"])

    report = StringIO()
    frame.info(buf=report)
    report.seek(0)
    report_lines = report.readlines()

    int_line, float_line = report_lines[3], report_lines[4]
    self.assertEqual("a 1 non-null int64\n", int_line)
    self.assertEqual("a 1 non-null float64\n", float_line)
def test_to_csv_quote_none(self):
    """``QUOTE_NONE`` leaves embedded quotes untouched, with or without encoding."""
    # GH4328
    frame = DataFrame({'A': ['hello', '{"hello"}']})
    expected = 'A\nhello\n{"hello"}\n'

    for encoding in (None, 'utf-8'):
        sink = StringIO()
        frame.to_csv(sink, quoting=csv.QUOTE_NONE, encoding=encoding,
                     index=False)
        self.assertEqual(sink.getvalue(), expected)
def test_info_duplicate_columns_shows_correct_dtypes(self):
    """``info`` reports each duplicate-named column with its own dtype."""
    # GH11761
    frame = DataFrame([[1, 2.0]], columns=['a', 'a'])

    report = StringIO()
    frame.info(buf=report)
    report.seek(0)
    report_lines = report.readlines()

    int_line, float_line = report_lines[3], report_lines[4]
    assert 'a 1 non-null int64\n' == int_line
    assert 'a 1 non-null float64\n' == float_line
def test_to_csv_index_no_leading_comma(self):
    """``index_label=False`` omits the leading comma from the header row."""
    frame = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                      index=['one', 'two', 'three'])

    sink = StringIO()
    frame.to_csv(sink, index_label=False)

    expected = ('A,B\n'
                'one,1,4\n'
                'two,2,5\n'
                'three,3,6\n')
    self.assertEqual(sink.getvalue(), expected)
def _remove_header(self, data):
    """
    Copy ``data`` into a new ``StringIO``, dropping every line up to and
    including the first one that matches ``_DIVIDER``.

    Each kept line is written with an extra trailing newline. The returned
    stream is rewound to the start.
    """
    body = StringIO()
    past_header = False
    for row in data:
        if past_header:
            body.write(row + '\n')
        elif re.search(_DIVIDER, row) is not None:
            # The divider itself is not copied.
            past_header = True
    body.seek(0)
    return body
def test_to_csv_gcs(mock):
    """Writing to a ``gs://`` path goes through the (mocked) GCS filesystem."""
    original = DataFrame({'int': [1, 3], 'float': [2.0, np.nan],
                          'str': ['t', 's'],
                          'dt': date_range('2018-06-18', periods=2)})

    with mock.patch('gcsfs.GCSFileSystem') as MockFileSystem:
        captured = StringIO()
        # Every open() on the mocked filesystem returns our buffer.
        MockFileSystem.return_value.open.return_value = captured

        original.to_csv('gs://test/test.csv', index=True)
        restored = read_csv(StringIO(captured.getvalue()),
                            parse_dates=['dt'], index_col=0)

    assert_frame_equal(original, restored)
def test_constructor_bad_file(self):
    """``MMapWrapper`` rejects objects without a real descriptor and closed files."""
    # An in-memory stream pretending to have file descriptor -1.
    fake_file = StringIO('I am not a file')
    fake_file.fileno = lambda: -1
    tm.assertRaisesRegexp(mmap.error, "Invalid argument",
                          common.MMapWrapper, fake_file)

    # A real file that has already been closed.
    closed_file = open(self.mmap_file, 'r')
    closed_file.close()
    tm.assertRaisesRegexp(ValueError, "I/O operation on closed file",
                          common.MMapWrapper, closed_file)
def test_info_shows_column_dtypes(self):
    """Every column's dtype shows up in the ``info`` report."""
    dtypes = ["int64", "float64", "datetime64[ns]", "timedelta64[ns]",
              "complex128", "object", "bool"]
    n = 10
    columns = {pos: np.random.randint(2, size=n).astype(dtype)
               for pos, dtype in enumerate(dtypes)}

    report = StringIO()
    DataFrame(columns).info(buf=report)
    text = report.getvalue()

    for pos, dtype in enumerate(dtypes):
        expected_fragment = "%d %d non-null %s" % (pos, n, dtype)
        assert expected_fragment in text
def test_to_csv_index_no_leading_comma(self):
    """``index_label=False`` omits the leading comma from the header row."""
    frame = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                      index=['one', 'two', 'three'])

    sink = StringIO()
    frame.to_csv(sink, index_label=False)

    expected = tm.convert_rows_list_to_csv_str(
        ['A,B', 'one,1,4', 'two,2,5', 'three,3,6'])
    assert sink.getvalue() == expected
def test_to_csv_unicode_index_col(self):
    """A frame with non-ASCII labels survives a UTF-8 CSV round trip."""
    rows = [[u("\u05d0"), "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]]
    original = DataFrame(
        rows,
        columns=[u("\u05d0"), u("\u05d1"), u("\u05d2"), u("\u05d3")],
        index=[u("\u05d0"), u("\u05d1")])

    stream = StringIO('')
    original.to_csv(stream, encoding='UTF-8')
    stream.seek(0)

    restored = read_csv(stream, index_col=0, encoding='UTF-8')
    assert_frame_equal(original, restored)
def test_repr_bool_fails(self):
    """``repr`` of a Series of DataFrames must not write anything to stderr."""
    import sys

    series = Series([DataFrame(np.random.randn(2, 2)) for _ in range(5)])

    captured = StringIO()
    saved_stderr = sys.stderr
    sys.stderr = captured
    try:
        # it works (with no Cython exception barf)!
        repr(series)
    finally:
        sys.stderr = saved_stderr

    self.assertEqual(captured.getvalue(), '')
def test_info_memory_usage_qualified(self):
    """``info`` adds a '+' qualifier only for the string-labelled indexes below."""
    # (index, whether a '+' is expected in the report)
    cases = [
        ([1, 2, 3], False),
        (list('ABC'), True),
        (pd.MultiIndex.from_product([range(3), range(3)]), False),
        (pd.MultiIndex.from_product([range(3), ['foo', 'bar']]), True),
    ]

    for index, expect_plus in cases:
        report = StringIO()
        frame = DataFrame(1, columns=list('ab'), index=index)
        frame.info(buf=report)
        if expect_plus:
            assert '+' in report.getvalue()
        else:
            assert '+' not in report.getvalue()
def test_to_csv_quote_none(self):
    """``QUOTE_NONE`` leaves embedded quotes untouched, with or without encoding."""
    # GH4328
    frame = DataFrame({'A': ['hello', '{"hello"}']})

    for encoding in (None, 'utf-8'):
        sink = StringIO()
        frame.to_csv(sink, quoting=csv.QUOTE_NONE, encoding=encoding,
                     index=False)

        expected = tm.convert_rows_list_to_csv_str(
            ['A', 'hello', '{"hello"}'])
        assert sink.getvalue() == expected
def test_info_max_cols(self):
    """Line count of ``info`` output under ``max_info_columns``, ``verbose`` and ``max_cols``."""
    frame = DataFrame(np.random.randn(10, 5))

    def count_info_lines(option_limit, **info_kwargs):
        # Number of lines info() prints with the option set to option_limit.
        with option_context('max_info_columns', option_limit):
            sink = StringIO()
            frame.info(buf=sink, **info_kwargs)
            return len(sink.getvalue().strip().split('\n'))

    verbose_cases = [
        # For verbose always ^ setting ^ summarize ^ full output
        (4, [(5, None), (5, False), (10, True)]),
        # max_cols no exceeded
        (5, [(10, None), (5, False), (10, True)]),
    ]
    for option_limit, expectations in verbose_cases:
        for expected_len, verbose in expectations:
            self.assertEqual(
                count_info_lines(option_limit, verbose=verbose),
                expected_len)

    for expected_len, max_cols in [(10, 5), (5, 4)]:
        # first the setting that truncates (4), then the one that
        # wouldn't truncate (5)
        for option_limit in (4, 5):
            self.assertEqual(
                count_info_lines(option_limit, max_cols=max_cols),
                expected_len)
def test_info_shows_column_dtypes(self):
    """Every column's dtype shows up in the ``info`` report."""
    dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
              'complex128', 'object', 'bool']
    n = 10
    columns = {pos: np.random.randint(2, size=n).astype(dtype)
               for pos, dtype in enumerate(dtypes)}

    report = StringIO()
    DataFrame(columns).info(buf=report)
    text = report.getvalue()

    for pos, dtype in enumerate(dtypes):
        expected_fragment = '%d %d non-null %s' % (pos, n, dtype)
        assert expected_fragment in text
def test_to_csv_gcs(monkeypatch):
    """Writing to a ``gs://`` path goes through the (patched) GCS filesystem."""
    original = DataFrame({'int': [1, 3], 'float': [2.0, np.nan],
                          'str': ['t', 's'],
                          'dt': date_range('2018-06-18', periods=2)})
    captured = StringIO()

    class MockGCSFileSystem():
        # Stand-in filesystem: every open() hands back the shared buffer.
        def open(*args):
            return captured

    monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem)
    original.to_csv('gs://test/test.csv', index=True)

    restored = read_csv(StringIO(captured.getvalue()), parse_dates=['dt'],
                        index_col=0)
    assert_frame_equal(original, restored)
def test_to_csv_unicodewriter_quoting(self):
    """``QUOTE_NONNUMERIC`` is respected when an encoding is supplied."""
    frame = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

    sink = StringIO()
    frame.to_csv(sink, index=False, quoting=csv.QUOTE_NONNUMERIC,
                 encoding='utf-8')

    expected = ('"A","B"\n'
                '1,"foo"\n'
                '2,"bar"\n'
                '3,"baz"\n')
    self.assertEqual(sink.getvalue(), expected)
def test_to_csv_unicodewriter_quoting(self):
    """``QUOTE_NONNUMERIC`` is respected when an encoding is supplied."""
    frame = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

    sink = StringIO()
    frame.to_csv(sink, index=False, quoting=csv.QUOTE_NONNUMERIC,
                 encoding='utf-8')

    expected = tm.convert_rows_list_to_csv_str(
        ['"A","B"', '1,"foo"', '2,"bar"', '3,"baz"'])
    assert sink.getvalue() == expected
def test_constructor_bad_file(self):
    """``MMapWrapper`` rejects objects without a real descriptor and closed files."""
    if is_platform_windows():
        raise nose.SkipTest("skipping construction error messages "
                            "tests on windows")

    # An in-memory stream pretending to have file descriptor -1.
    fake_file = StringIO('I am not a file')
    fake_file.fileno = lambda: -1
    tm.assertRaisesRegexp(mmap.error, "Invalid argument",
                          common.MMapWrapper, fake_file)

    # A real file that has already been closed.
    closed_file = open(self.mmap_file, 'r')
    closed_file.close()
    tm.assertRaisesRegexp(ValueError, "I/O operation on closed file",
                          common.MMapWrapper, closed_file)
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
    """
    Set up a csv writer that buffers rows in memory before they are
    encoded for the target stream.

    Parameters
    ----------
    f : the stream that output is ultimately destined for
    dialect : csv dialect handed to ``csv.writer``
    encoding : name of the codec used to build the incremental encoder
    **kwds : extra keyword arguments forwarded to ``csv.writer``
    """
    self.stream = f
    self.quoting = kwds.get("quoting", None)

    # Redirect output to a queue
    row_buffer = StringIO()
    self.queue = row_buffer
    self.writer = csv.writer(row_buffer, dialect=dialect, **kwds)

    encoder_factory = codecs.getincrementalencoder(encoding)
    self.encoder = encoder_factory()
def _get_pretty_string(obj):
    """Pretty-print ``obj`` into a string.

    Parameters
    ----------
    obj : object
        Object to pretty print

    Returns
    -------
    s : str
        The text ``pprint.pprint`` writes for ``obj``
    """
    captured = StringIO()
    pprint.pprint(obj, stream=captured)
    pretty = captured.getvalue()
    return pretty
def test_squeeze_no_view(self):
    """A squeezed single-column read must not be flagged as a view."""
    # see gh-8217
    # Series should not be a view
    csv_text = """time,data\n0,10\n1,11\n2,12\n4,14\n5,15\n3,13"""
    squeezed = self.read_csv(StringIO(csv_text), index_col='time',
                             squeeze=True)
    self.assertFalse(squeezed._is_view)
def bdi(itype='D', retry_count=3, pause=0.001):
    """
    Download the page at ``ct.BDI_URL`` and parse it into a DataFrame.

    NOTE(review): "BDI" presumably stands for the Baltic Dry Index - confirm
    against the URL constant.

    Parameters
    ----------
    itype : str, default 'D'
        'D' parses the embedded chart data (columns ``date``, ``index``,
        ``change``); any other value parses the HTML table instead
        (columns ``month``, ``index``, ``change``).
    retry_count : int, default 3
        Maximum number of download attempts.
    pause : float, default 0.001
        Seconds to sleep before each attempt.

    Returns
    -------
    DataFrame or None
        None when the response is shorter than 100 bytes, or when every
        attempt raised (the exception is printed and the loop falls through).
    """
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(ct.BDI_URL % (ct.P_TYPE['http'],
                                            ct.DOMAINS['v500']))
            lines = urlopen(request, timeout=10).read()
            if len(lines) < 100:  # no data
                return None
        except Exception as e:
            # Best effort: report the failure and try again.
            print(e)
        else:
            linestr = lines.decode('utf-8') if ct.PY3 else lines
            if itype == 'D':  # Daily
                # Pull the chart payload out of the page and rewrite its
                # tags into an HTML table that read_html can parse.
                reg = re.compile(r'\"chart_data\",\"(.*?)\"\);')
                lines = reg.findall(linestr)
                lines = lines[0]
                # NOTE(review): the final replace('graphs', 'td') can never
                # match because replace('graph', 'tr') has already rewritten
                # every 'graph' substring.
                lines = lines.replace('chart', 'table').\
                    replace('</series><graphs>', '').\
                    replace('</graphs>', '').\
                    replace('series', 'tr').\
                    replace('value', 'td').\
                    replace('graph', 'tr').\
                    replace('graphs', 'td')
                df = pd.read_html(lines, encoding='utf8')[0]
                df = df.T
                df.columns = ['date', 'index']
                # Turn the Chinese year/month/day markers into a
                # dash-separated date string.
                df['date'] = df['date'].map(lambda x: x.replace(u'年', '-')).\
                    map(lambda x: x.replace(u'月', '-')).\
                    map(lambda x: x.replace(u'日', ''))
                df['date'] = pd.to_datetime(df['date'])
                df['index'] = df['index'].astype(float)
                df = df.sort_values('date',
                                    ascending=False).reset_index(drop=True)
                # Change versus the following row, as a percentage rounded
                # to two decimals.
                df['change'] = df['index'].pct_change(-1)
                df['change'] = df['change'] * 100
                df['change'] = df['change'].map(lambda x: '%.2f' % x)
                df['change'] = df['change'].astype(float)
                return df
            else:  # Weekly
                html = lxml.html.parse(StringIO(linestr))
                res = html.xpath(
                    "//table[@class=\"style33\"]/tr/td/table[last()]")
                if ct.PY3:
                    sarr = [
                        etree.tostring(node).decode('utf-8') for node in res
                    ]
                else:
                    sarr = [etree.tostring(node) for node in res]
                sarr = ''.join(sarr)
                sarr = '<table>%s</table>' % sarr
                # Row 0 is dropped.
                df = pd.read_html(sarr)[0][1:]
                df.columns = ['month', 'index']
                df['month'] = df['month'].map(lambda x: x.replace(u'年', '-')).\
                    map(lambda x: x.replace(u'月', ''))
                df['month'] = pd.to_datetime(df['month'])
                # Keep only the first six characters once dashes are removed.
                df['month'] = df['month'].map(lambda x: str(x).replace('-', '')).\
                    map(lambda x: x[:6])
                df['index'] = df['index'].astype(float)
                # Unlike the daily branch, this change is not multiplied
                # by 100.
                df['change'] = df['index'].pct_change(-1)
                df['change'] = df['change'].map(lambda x: '%.2f' % x)
                df['change'] = df['change'].astype(float)
                return df
def test_verbose_import(self):
    """
    ``verbose=True`` makes the parser print diagnostics to stdout.

    The C engine reports timings; the Python engine reports how many NA
    values were filled in the first column.
    """
    text = """a,b,c,d
one,1,2,3
one,1,2,3
,1,2,3
one,1,2,3
,1,2,3
,1,2,3
one,1,2,3
two,1,2,3"""

    # Remember whatever stream is currently installed (a test runner may
    # have replaced sys.stdout with its own capture) and put that back,
    # rather than resetting to the interpreter's original sys.__stdout__.
    saved_stdout = sys.stdout

    buf = StringIO()
    sys.stdout = buf
    try:
        # engines are verbose in different ways
        self.read_csv(StringIO(text), verbose=True)
        if self.engine == 'c':
            self.assertIn('Tokenization took:', buf.getvalue())
            self.assertIn('Parser memory cleanup took:', buf.getvalue())
        else:  # Python engine
            self.assertEqual(buf.getvalue(),
                             'Filled 3 NA values in column a\n')
    finally:
        sys.stdout = saved_stdout

    buf = StringIO()
    sys.stdout = buf

    text = """a,b,c,d
one,1,2,3
two,1,2,3
three,1,2,3
four,1,2,3
five,1,2,3
,1,2,3
seven,1,2,3
eight,1,2,3"""

    try:
        # engines are verbose in different ways
        self.read_csv(StringIO(text), verbose=True, index_col=0)
        if self.engine == 'c':
            self.assertIn('Tokenization took:', buf.getvalue())
            self.assertIn('Parser memory cleanup took:', buf.getvalue())
        else:  # Python engine
            self.assertEqual(buf.getvalue(),
                             'Filled 1 NA values in column a\n')
    finally:
        sys.stdout = saved_stdout
def test_malformed(self):
    """
    A row with too many fields must raise, naming the offending line.

    The same malformed row is placed so that it is hit by a plain read, by
    chunked reads of different sizes, and (Python engine only) with
    ``skip_footer``.
    """
    # see gh-6607

    # all
    data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
"""
    msg = 'Expected 3 fields in line 4, saw 5'
    with tm.assertRaisesRegexp(Exception, msg):
        self.read_table(StringIO(data), sep=',',
                        header=1, comment='#')

    # first chunk
    data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
    msg = 'Expected 3 fields in line 6, saw 5'
    with tm.assertRaisesRegexp(Exception, msg):
        it = self.read_table(StringIO(data), sep=',',
                             header=1, comment='#',
                             iterator=True, chunksize=1,
                             skiprows=[2])
        it.read(5)

    # middle chunk
    data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
    msg = 'Expected 3 fields in line 6, saw 5'
    with tm.assertRaisesRegexp(Exception, msg):
        it = self.read_table(StringIO(data), sep=',', header=1,
                             comment='#', iterator=True, chunksize=1,
                             skiprows=[2])
        it.read(3)

    # last chunk
    data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
    msg = 'Expected 3 fields in line 6, saw 5'
    with tm.assertRaisesRegexp(Exception, msg):
        it = self.read_table(StringIO(data), sep=',', header=1,
                             comment='#', iterator=True, chunksize=1,
                             skiprows=[2])
        it.read()

    # skip_footer is not supported with the C parser yet
    if self.engine == 'python':
        # skip_footer
        data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""
        msg = 'Expected 3 fields in line 4, saw 5'
        with tm.assertRaisesRegexp(Exception, msg):
            self.read_table(StringIO(data), sep=',',
                            header=1, comment='#',
                            skip_footer=1)
def test_float_parser(self):
    """Each float spelling below parses to the value ``float()`` gives."""
    # see gh-9565
    data = '45e-1,4.5,45.,inf,-inf'

    parsed = self.read_csv(StringIO(data), header=None)

    expected_row = list(map(float, data.split(',')))
    tm.assert_frame_equal(parsed, DataFrame([expected_row]))
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
              convert_axes=True, convert_dates=True, keep_default_dates=True,
              numpy=False, precise_float=False, date_unit=None, encoding=None,
              lines=False):
    """
    Convert a JSON string to pandas object

    Parameters
    ----------
    path_or_buf : a valid JSON string or file-like, default: None
        The string could be a URL. Valid URL schemes include http, ftp, s3, and
        file. For file URLs, a host is expected. For instance, a local file
        could be ``file://localhost/path/to/table.json``

    orient : string,
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{index -> [index], columns -> [columns], data -> [values]}``
        - ``'records'`` : list like
          ``[{column -> value}, ... , {column -> value}]``
        - ``'index'`` : dict like ``{index -> {column -> value}}``
        - ``'columns'`` : dict like ``{column -> {index -> value}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{'split','records','index'}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{'split','records','index',
            'columns','values'}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

    typ : type of object to recover (series or frame), default 'frame'
    dtype : boolean or dict, default True
        If True, infer dtypes, if a dict of column to dtype, then use those,
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : boolean, default True
        Try to convert the axes to the proper dtypes.
    convert_dates : boolean, default True
        List of columns to parse for dates; If True, then try to parse
        datelike columns default is True; a column label is datelike if

        * it ends with ``'_at'``,

        * it ends with ``'_time'``,

        * it begins with ``'timestamp'``,

        * it is ``'modified'``, or

        * it is ``'date'``

    keep_default_dates : boolean, default True
        If parsing dates, then parse the default datelike columns
    numpy : boolean, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.
    precise_float : boolean, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality
    date_unit : string, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.
    lines : boolean, default False
        Read the file as a json object per line.

        .. versionadded:: 0.19.0

    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.

        .. versionadded:: 0.19.0

    Returns
    -------
    result : Series or DataFrame, depending on the value of `typ`.

    See Also
    --------
    DataFrame.to_json

    Examples
    --------

    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a Dataframe using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],
      "index":["row 1","row 2"],
      "data":[["a","b"],["c","d"]]}'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d
    """

    filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
                                                      encoding=encoding)
    if isinstance(filepath_or_buffer, compat.string_types):
        # A string is either a path to an existing file or the JSON
        # document itself.
        try:
            exists = os.path.exists(filepath_or_buffer)

        # if the filepath is too long will raise here
        # 5874
        except (TypeError, ValueError):
            exists = False

        if exists:
            with _get_handle(filepath_or_buffer, 'r',
                             encoding=encoding) as fh:
                json = fh.read()
        else:
            json = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        # File-like object: read its whole content.
        json = filepath_or_buffer.read()
    else:
        json = filepath_or_buffer

    if lines:
        # If given a json lines file, we break the string into lines, add
        # commas and put it in a json list to make a valid json object.
        # Note: this rebinds the ``lines`` argument to the list of lines.
        lines = list(StringIO(json.strip()))
        json = u'[' + u','.join(lines) + u']'

    obj = None
    if typ == 'frame':
        obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
                          keep_default_dates, numpy, precise_float,
                          date_unit).parse()

    # Also taken when the frame parse above returned None.
    if typ == 'series' or obj is None:
        if not isinstance(dtype, bool):
            # A non-bool dtype is wrapped under the 'data' key before being
            # passed to the series parser.
            dtype = dict(data=dtype)
        obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates,
                           keep_default_dates, numpy, precise_float,
                           date_unit).parse()

    return obj
def _test(text, **kwargs):
    """Assert ``text`` parses the same as its copy with '\\r' replaced by '\\r\\n'."""

    def parse(source):
        # Run one full read through TextReader with the shared options.
        return TextReader(StringIO(source), **kwargs).read()

    normalized = text.replace('\r', '\r\n')
    actual = parse(text)
    reference = parse(normalized)
    assert_array_dicts_equal(actual, reference)
def test_ignore_leading_whitespace(self):
    """Leading whitespace on each row is ignored with a whitespace regex separator."""
    # see gh-3374, gh-6607
    data = ' a b c\n 1 2 3\n 4 5 6\n 7 8 9'

    # Raw string: '\s' in an ordinary literal is an invalid escape sequence
    # (deprecated, and warned about by newer interpreters). The value
    # passed to the parser is unchanged.
    result = self.read_table(StringIO(data), sep=r'\s+')

    expected = DataFrame({'a': [1, 4, 7], 'b': [2, 5, 8], 'c': [3, 6, 9]})
    tm.assert_frame_equal(result, expected)
def test_empty_with_index(self):
    """A header-only file with ``index_col`` yields an empty frame with a named index."""
    # see gh-10184
    header_only = 'x,y'

    parsed = self.read_csv(StringIO(header_only), index_col=0)

    empty_index = Index([], name='x')
    tm.assert_frame_equal(parsed,
                          DataFrame([], columns=['y'], index=empty_index))
def top_list(date=None, retry_count=3, pause=0.001):
    """
    Fetch the daily "Dragon and Tiger" top list.

    Parameters
    --------
    date : string
        Date of the detail data, format YYYY-MM-DD. If None, the most
        recent trading day is used (the previous trading day before 18:00,
        today otherwise).
    retry_count : int, default 3
        Number of attempts when network or similar problems occur.
    pause : float, default 0.001
        Seconds to sleep before each request, to avoid problems caused by
        requests being sent too close together.

    Return
    ------
    DataFrame, or None if ``date`` is a holiday
        code : stock code
        name : stock name
        pchange : price change ratio
        amount : top-list turnover (in units of 10,000)
        buy : buy amount (in units of 10,000)
        bratio : buy amount as a share of total turnover
        sell : sell amount (in units of 10,000)
        sratio : sell amount as a share of total turnover
        reason : reason for appearing on the list
        date : date

    Raises
    ------
    IOError
        If every attempt fails.
    """
    if date is None:
        if du.get_hour() < 18:
            date = du.last_tddate()
        else:
            date = du.today()
    else:
        if du.is_holiday(date):
            return None
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_URL % (ct.P_TYPE['http'],
                                            ct.DOMAINS['em'], date))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dt_1\"]")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr)[0]
            df.columns = [i for i in range(1, 12)]
            df = df.apply(_f_rows, axis=1)
            df = df.fillna(method='ffill')
            df = df.drop([1, 4], axis=1)
            df.columns = rv.LHB_COLS
            df = df.drop_duplicates()
            # Restore the leading zeros of the six-digit code.
            df['code'] = df['code'].astype(int)
            df['code'] = df['code'].map(lambda x: str(x).zfill(6))
            df['date'] = date
        except Exception:
            # Best-effort retry on any ordinary failure. Catching Exception
            # rather than using a bare ``except`` lets KeyboardInterrupt and
            # SystemExit propagate instead of being swallowed.
            pass
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
def test_iterator(self):
    """
    Exercise iterator/chunked reading through ``read_csv``, ``read_table``
    and ``TextParser``, comparing each chunk with slices of a full read.
    """
    # See gh-6607
    reader = self.read_csv(StringIO(self.data1), index_col=0,
                           iterator=True)
    df = self.read_csv(StringIO(self.data1), index_col=0)

    chunk = reader.read(3)
    tm.assert_frame_equal(chunk, df[:3])

    # Asking for more rows than remain returns just the remainder.
    last_chunk = reader.read(5)
    tm.assert_frame_equal(last_chunk, df[3:])

    # pass list
    lines = list(csv.reader(StringIO(self.data1)))
    parser = TextParser(lines, index_col=0, chunksize=2)

    df = self.read_csv(StringIO(self.data1), index_col=0)

    chunks = list(parser)
    tm.assert_frame_equal(chunks[0], df[:2])
    tm.assert_frame_equal(chunks[1], df[2:4])
    tm.assert_frame_equal(chunks[2], df[4:])

    # pass skiprows
    parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
    chunks = list(parser)
    tm.assert_frame_equal(chunks[0], df[1:3])

    treader = self.read_table(StringIO(self.data1), sep=',', index_col=0,
                              iterator=True)
    tm.assertIsInstance(treader, TextFileReader)

    # gh-3967: stopping iteration when chunksize is specified
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    reader = self.read_csv(StringIO(data), iterator=True)
    result = list(reader)
    expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[
        3, 6, 9]), index=['foo', 'bar', 'baz'])
    tm.assert_frame_equal(result[0], expected)

    # chunksize = 1
    reader = self.read_csv(StringIO(data), chunksize=1)
    result = list(reader)
    expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[
        3, 6, 9]), index=['foo', 'bar', 'baz'])
    self.assertEqual(len(result), 3)
    tm.assert_frame_equal(pd.concat(result), expected)

    # skip_footer is not supported with the C parser yet
    if self.engine == 'python':
        # test bad parameter (skip_footer)
        reader = self.read_csv(StringIO(self.data1), index_col=0,
                               iterator=True, skip_footer=True)
        self.assertRaises(ValueError, reader.read, 3)
def test_value_counts_datetime64(self, klass):
    """
    ``value_counts``, ``unique`` and ``nunique`` on datetime64 and
    timedelta64 data wrapped in ``klass``.

    ``klass`` is the container under test; the ``isinstance(..., Index)``
    branches below pick the comparison helper that fits it.
    """
    # GH 3002, datetime64[ns]
    # don't test names though
    txt = "\n".join([
        'xxyyzz20100101PIE', 'xxyyzz20100101GUM',
        'xxyyzz20100101EGG', 'xxyyww20090101EGG',
        'foofoo20080909PIE', 'foofoo20080909GUM'
    ])
    f = StringIO(txt)
    df = pd.read_fwf(f, widths=[6, 8, 3],
                     names=["person_id", "dt", "food"],
                     parse_dates=["dt"])

    s = klass(df['dt'].copy())
    s.name = None

    idx = pd.to_datetime([
        '2010-01-01 00:00:00', '2008-09-09 00:00:00',
        '2009-01-01 00:00:00'
    ])
    expected_s = Series([3, 2, 1], index=idx)
    tm.assert_series_equal(s.value_counts(), expected_s)

    expected = np_array_datetime64_compat([
        '2010-01-01 00:00:00', '2009-01-01 00:00:00',
        '2008-09-09 00:00:00'
    ], dtype='datetime64[ns]')
    if isinstance(s, Index):
        tm.assert_index_equal(s.unique(), DatetimeIndex(expected))
    else:
        tm.assert_numpy_array_equal(s.unique(), expected)

    assert s.nunique() == 3

    # with NaT
    s = df['dt'].copy()
    s = klass([v for v in s.values] + [pd.NaT])

    # NaT is dropped by default, so the counts match the earlier ones.
    result = s.value_counts()
    assert result.index.dtype == 'datetime64[ns]'
    tm.assert_series_equal(result, expected_s)

    # With dropna=False the NaT entry is counted; expected_s is extended
    # in place for this comparison.
    result = s.value_counts(dropna=False)
    expected_s[pd.NaT] = 1
    tm.assert_series_equal(result, expected_s)

    unique = s.unique()
    assert unique.dtype == 'datetime64[ns]'

    # numpy_array_equal cannot compare pd.NaT
    if isinstance(s, Index):
        exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT])
        tm.assert_index_equal(unique, exp_idx)
    else:
        tm.assert_numpy_array_equal(unique[:3], expected)
        assert pd.isna(unique[3])

    assert s.nunique() == 3
    assert s.nunique(dropna=False) == 4

    # timedelta64[ns]
    # Subtracting the column from itself and adding timedelta(1) gives the
    # same timedelta in every row.
    td = df.dt - df.dt + timedelta(1)
    td = klass(td, name='dt')

    result = td.value_counts()
    expected_s = Series([6], index=[Timedelta('1day')], name='dt')
    tm.assert_series_equal(result, expected_s)

    expected = TimedeltaIndex(['1 days'], name='dt')
    if isinstance(td, Index):
        tm.assert_index_equal(td.unique(), expected)
    else:
        tm.assert_numpy_array_equal(td.unique(), expected.values)

    # Same values built with the operands in the other order.
    td2 = timedelta(1) + (df.dt - df.dt)
    td2 = klass(td2, name='dt')
    result2 = td2.value_counts()
    tm.assert_series_equal(result2, expected_s)
def test_dtype_name_in_info(self, data):
    """The dtype name of ``data`` must appear in the frame's ``info`` report."""
    frame = pd.DataFrame({"A": data})

    report = StringIO()
    frame.info(buf=report)

    assert data.dtype.name in report.getvalue()
def csv_to_df(text):
    """
    Parse CSV ``text`` into a UTC-localized DataFrame in reversed row order.

    The first column becomes a parsed-date index, '-' is read as NA, a
    duplicated final row is dropped, the index name is reduced to ASCII and
    the 'Adj. *' columns are renamed to 'Adj *'.
    """
    frame = pd.read_csv(StringIO(bytes_to_str(text)), index_col=0,
                        parse_dates=True, infer_datetime_format=True,
                        na_values='-')
    frame = frame[::-1]

    # Yahoo! Finance sometimes does this awesome thing where they
    # return 2 rows for the most recent business day
    if len(frame) > 2 and frame.index[-1] == frame.index[-2]:  # pragma: no cover
        frame = frame[:-1]

    # Get rid of unicode characters in index name.
    raw_name = frame.index.name
    try:
        frame.index.name = raw_name.decode(
            'unicode_escape').encode('ascii', 'ignore')
    except AttributeError:
        # Python 3 string has no decode method.
        frame.index.name = raw_name.encode('ascii', 'ignore').decode()

    adjusted_fields = ('Open', 'High', 'Low', 'Close', 'Volume')
    column_renames = {'Adj. ' + field: 'Adj ' + field
                      for field in adjusted_fields}
    frame.rename(columns=column_renames, inplace=True)
    return frame.tz_localize(pytz.UTC)
def test_raise_on_sep_with_delim_whitespace(self):
    """Passing both ``sep`` and ``delim_whitespace=True`` must raise ValueError."""
    # see gh-6607
    data = 'a b c\n1 2 3'
    with tm.assertRaisesRegexp(ValueError, 'you can only specify one'):
        # Raw string: '\s' in an ordinary literal is an invalid escape
        # sequence (deprecated, and warned about by newer interpreters).
        # The value passed to the parser is unchanged.
        self.read_table(StringIO(data), sep=r'\s', delim_whitespace=True)
def test_string_factorize(self):
    """Repeated strings in a column share objects: only two distinct ids."""
    # should this be optional?
    data = 'a\nb\na\nb\na'
    column = TextReader(StringIO(data), header=None).read()[0]

    distinct_objects = {id(value) for value in column}
    assert len(distinct_objects) == 2
def test_eof_states(self):
    """
    End-of-input handling for each tokenizer state.

    Every fixture ends in the middle of the state named in the comment
    above it; the parser must either return the expected frame or raise.
    """
    # see gh-10728, gh-10548

    # With skip_blank_lines = True
    expected = DataFrame([[4, 5, 6]], columns=['a', 'b', 'c'])

    # gh-10728: WHITESPACE_LINE
    data = 'a,b,c\n4,5,6\n '
    result = self.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)

    # gh-10548: EAT_LINE_COMMENT
    data = 'a,b,c\n4,5,6\n#comment'
    result = self.read_csv(StringIO(data), comment='#')
    tm.assert_frame_equal(result, expected)

    # EAT_CRNL_NOP
    data = 'a,b,c\n4,5,6\n\r'
    result = self.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)

    # EAT_COMMENT
    data = 'a,b,c\n4,5,6#comment'
    result = self.read_csv(StringIO(data), comment='#')
    tm.assert_frame_equal(result, expected)

    # SKIP_LINE
    data = 'a,b,c\n4,5,6\nskipme'
    result = self.read_csv(StringIO(data), skiprows=[2])
    tm.assert_frame_equal(result, expected)

    # With skip_blank_lines = False

    # EAT_LINE_COMMENT
    data = 'a,b,c\n4,5,6\n#comment'
    result = self.read_csv(
        StringIO(data), comment='#', skip_blank_lines=False)
    expected = DataFrame([[4, 5, 6]], columns=['a', 'b', 'c'])
    tm.assert_frame_equal(result, expected)

    # IN_FIELD
    # The trailing whitespace-only line is kept as a row, which is why
    # column 'a' holds strings here.
    data = 'a,b,c\n4,5,6\n '
    result = self.read_csv(StringIO(data), skip_blank_lines=False)
    expected = DataFrame(
        [['4', 5, 6], [' ', None, None]], columns=['a', 'b', 'c'])
    tm.assert_frame_equal(result, expected)

    # EAT_CRNL
    data = 'a,b,c\n4,5,6\n\r'
    result = self.read_csv(StringIO(data), skip_blank_lines=False)
    expected = DataFrame(
        [[4, 5, 6], [None, None, None]], columns=['a', 'b', 'c'])
    tm.assert_frame_equal(result, expected)

    # Should produce exceptions

    # ESCAPED_CHAR
    data = "a,b,c\n4,5,6\n\\"
    self.assertRaises(Exception, self.read_csv,
                      StringIO(data), escapechar='\\')

    # ESCAPE_IN_QUOTED_FIELD
    data = 'a,b,c\n4,5,6\n"\\'
    self.assertRaises(Exception, self.read_csv,
                      StringIO(data), escapechar='\\')

    # IN_QUOTED_FIELD
    data = 'a,b,c\n4,5,6\n"'
    self.assertRaises(Exception, self.read_csv,
                      StringIO(data), escapechar='\\')
def test_single_line(self):
    """Sniff the separator from an input with only one line (gh-6607)."""
    expected = DataFrame({'a': [1], 'b': [2]})

    # sep=None asks the reader to detect the delimiter itself.
    parsed = self.read_csv(StringIO('1,2'), names=['a', 'b'],
                           header=None, sep=None)

    tm.assert_frame_equal(expected, parsed)
def test_value_counts_inferred(self):
    """Check value_counts / unique / nunique for Index and Series inputs.

    For each container class the test covers, in order: plain strings
    (default, unsorted, ascending and normalized counts), binned counts on
    a numeric Series, NaN handling, empty input, datetime64[ns] values
    (with and without NaT) and timedelta64[ns] values.
    """
    klasses = [Index, Series]
    for klass in klasses:
        s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
        s = klass(s_values)
        expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
        tm.assert_series_equal(s.value_counts(), expected)

        self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
        self.assertEqual(s.nunique(), 4)

        # don't sort, have to sort after the fact as not sorting is
        # platform-dep
        hist = s.value_counts(sort=False)
        hist.sort()
        expected = Series([3, 1, 4, 2], index=list('acbd'))
        expected.sort()
        tm.assert_series_equal(hist, expected)

        # sort ascending
        hist = s.value_counts(ascending=True)
        expected = Series([1, 2, 3, 4], index=list('cdab'))
        tm.assert_series_equal(hist, expected)

        # relative histogram.
        hist = s.value_counts(normalize=True)
        expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
        tm.assert_series_equal(hist, expected)

        # bins: rejected with TypeError for the string-valued container ...
        self.assertRaises(TypeError,
                          lambda bins: s.value_counts(bins=bins), 1)

        # ... and checked against explicit bin edges for a numeric Series.
        s1 = Series([1, 1, 2, 3])
        res1 = s1.value_counts(bins=1)
        exp1 = Series({0.998: 4})
        tm.assert_series_equal(res1, exp1)
        res1n = s1.value_counts(bins=1, normalize=True)
        exp1n = Series({0.998: 1.0})
        tm.assert_series_equal(res1n, exp1n)

        self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
        self.assertEqual(s1.nunique(), 3)

        res4 = s1.value_counts(bins=4)
        exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1},
                      index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4, exp4)
        res4n = s1.value_counts(bins=4, normalize=True)
        exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25},
                       index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4n, exp4n)

        # handle NA's properly
        s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                    'd', 'd', 'a', 'a', 'b']
        s = klass(s_values)
        expected = Series([4, 3, 2], index=['b', 'a', 'd'])
        tm.assert_series_equal(s.value_counts(), expected)

        self.assert_numpy_array_equal(
            s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
        self.assertEqual(s.nunique(), 3)

        # empty input
        s = klass({})
        expected = Series([], dtype=np.int64)
        tm.assert_series_equal(s.value_counts(), expected)
        self.assert_numpy_array_equal(s.unique(), np.array([]))
        self.assertEqual(s.nunique(), 0)

        # GH 3002, datetime64[ns]
        txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM',
                         'xxyyzz20100101EGG', 'xxyyww20090101EGG',
                         'foofoo20080909PIE', 'foofoo20080909GUM'])
        f = StringIO(txt)
        df = pd.read_fwf(f, widths=[6, 8, 3],
                         names=["person_id", "dt", "food"],
                         parse_dates=["dt"])

        s = klass(df['dt'].copy(), name='dt')

        # Every timestamp carries the same 'Z' suffix so all three are
        # parsed consistently and match the strings used for ``expected``
        # below; a suffix that is not a recognised zone designator would
        # rely on lenient handling of unknown timezone names.
        idx = pd.to_datetime(['2010-01-01 00:00:00Z',
                              '2008-09-09 00:00:00Z',
                              '2009-01-01 00:00:00Z'])
        expected_s = Series([3, 2, 1], index=idx, name='dt')
        tm.assert_series_equal(s.value_counts(), expected_s)

        expected = np.array(['2010-01-01 00:00:00Z',
                             '2009-01-01 00:00:00Z',
                             '2008-09-09 00:00:00Z'],
                            dtype='datetime64[ns]')
        if isinstance(s, DatetimeIndex):
            expected = DatetimeIndex(expected)
            self.assertTrue(s.unique().equals(expected))
        else:
            self.assert_numpy_array_equal(s.unique(), expected)

        self.assertEqual(s.nunique(), 3)

        # with NaT
        s = df['dt'].copy()
        s = klass([v for v in s.values] + [pd.NaT], name='dt')

        result = s.value_counts()
        self.assertEqual(result.index.dtype, 'datetime64[ns]')
        tm.assert_series_equal(result, expected_s)

        # With dropna=False the NaT entry is counted as well, so the
        # expected counts are extended in place.
        result = s.value_counts(dropna=False)
        expected_s[pd.NaT] = 1
        tm.assert_series_equal(result, expected_s)

        unique = s.unique()
        self.assertEqual(unique.dtype, 'datetime64[ns]')

        # numpy_array_equal cannot compare pd.NaT
        self.assert_numpy_array_equal(unique[:3], expected)
        self.assertTrue(unique[3] is pd.NaT or
                        unique[3].astype('int64') == pd.tslib.iNaT)

        self.assertEqual(s.nunique(), 3)
        self.assertEqual(s.nunique(dropna=False), 4)

        # timedelta64[ns]
        td = df.dt - df.dt + timedelta(1)
        td = klass(td, name='dt')

        result = td.value_counts()
        expected_s = Series([6], index=[Timedelta('1day')], name='dt')
        tm.assert_series_equal(result, expected_s)

        expected = TimedeltaIndex(['1 days'])
        if isinstance(td, TimedeltaIndex):
            self.assertTrue(td.unique().equals(expected))
        else:
            self.assert_numpy_array_equal(td.unique(), expected.values)

        # Same values built with the operands in the other order.
        td2 = timedelta(1) + (df.dt - df.dt)
        td2 = klass(td2, name='dt')
        result2 = td2.value_counts()
        tm.assert_series_equal(result2, expected_s)
def test_multiple_date_col(self):
    """Combine several raw columns into parsed date columns.

    Can use multiple date parsers: a custom ``date_parser`` with named
    targets, the default parser with list-of-lists targets, and a single
    pre-joined date column used as the index.
    """
    # Date in column 1, two separate time-of-day columns in 2 and 3.
    split_data = (
        'KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n'
        'KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n'
        'KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n'
        'KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n'
        'KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n'
        'KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000\n'
    )
    first_stamp = datetime(1999, 1, 27, 19, 0)

    def join_and_parse(*date_cols):
        return parsing.try_parse_dates(parsers._concat_date_cols(date_cols))

    # Named targets + custom parser: source columns are dropped.
    frame = self.read_csv(StringIO(split_data), header=None,
                          date_parser=join_and_parse, prefix='X',
                          parse_dates={'nominal': [1, 2],
                                       'actual': [1, 3]})
    for present in ('nominal', 'actual'):
        assert present in frame
    for absent in ('X1', 'X2', 'X3'):
        assert absent not in frame
    assert frame.loc[0, 'nominal'] == first_stamp

    # Same, but keep_date_col=True retains the source columns.
    frame = self.read_csv(StringIO(split_data), header=None,
                          date_parser=join_and_parse,
                          parse_dates={'nominal': [1, 2],
                                       'actual': [1, 3]},
                          keep_date_col=True)
    for present in ('nominal', 'actual', 1, 2, 3):
        assert present in frame

    # List-of-lists targets on the same text: the combined columns are
    # named after the joined source column names.
    frame = self.read_csv(StringIO(split_data), header=None, prefix='X',
                          parse_dates=[[1, 2], [1, 3]])
    for present in ('X1_X2', 'X1_X3'):
        assert present in frame
    for absent in ('X1', 'X2', 'X3'):
        assert absent not in frame
    assert frame.loc[0, 'X1_X2'] == first_stamp

    frame = self.read_csv(StringIO(split_data), header=None,
                          parse_dates=[[1, 2], [1, 3]],
                          keep_date_col=True)
    for present in ('1_2', '1_3', 1, 2, 3):
        assert present in frame

    # Date and time already share column 1; parse it and use it as index.
    joined_data = (
        'KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n'
        'KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n'
        'KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n'
        'KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n'
        'KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n'
    )
    frame = self.read_csv(StringIO(joined_data), sep=',', header=None,
                          parse_dates=[1], index_col=1)
    assert frame.index[0] == first_stamp