def _read_one_data(self, ftppath, params):
    """Fetch one EDGAR index file and parse it into a DataFrame.

    Compressed paths (``.zip`` / ``.gz``) are routed to the dedicated
    readers; anything else is streamed line by line over the FTP session.
    The header block is stripped before handing the text to ``read_csv``,
    and legacy file paths are normalized in the ``filename`` column.
    """
    if re.search(_ZIP_RE, ftppath) is not None:
        index_file = self._read_zipfile(ftppath)
    elif re.search(_GZ_RE, ftppath) is not None:
        index_file = self._read_gzfile(ftppath)
    else:
        index_file = StringIO()
        fetched = []
        try:
            self._sec_ftp_session.retrlines('RETR ' + ftppath,
                                            fetched.append)
        except EOFError:
            raise RemoteDataError('FTP server has closed the connection.')
        # Re-add the newlines that retrlines strips.
        index_file.writelines(row + '\n' for row in fetched)
        index_file.seek(0)

    index_file = self._remove_header(index_file)
    index = read_csv(index_file, delimiter='|', header=None,
                     index_col=False, names=_COLUMNS,
                     low_memory=False, dtype=_COLUMN_TYPES)
    index['filename'] = index['filename'].map(self._fix_old_file_paths)
    return index
def test_to_csv_stringio(self): buf = StringIO() self.frame.to_csv(buf) buf.seek(0) recons = read_csv(buf, index_col=0) # TODO to_csv drops column name assert_frame_equal(recons, self.frame, check_names=False)
def test_to_csv_unicode_index(self):
    """Round-trip a Series with unicode (Hebrew) index labels through CSV."""
    buf = StringIO()
    series = Series([u("\u05d0"), "d2"], index=[u("\u05d0"), u("\u05d1")])
    series.to_csv(buf, encoding="UTF-8")
    buf.seek(0)
    reread = self.read_csv(buf, index_col=0, encoding="UTF-8")
    assert_series_equal(series, reread)
def test_info_duplicate_columns_shows_correct_dtypes(self):
    # GH11761: duplicate column names must each report their own dtype
    buf = StringIO()
    df = DataFrame([[1, 2.0]], columns=["a", "a"])
    df.info(buf=buf)
    buf.seek(0)
    output = buf.readlines()
    self.assertEqual("a 1 non-null int64\n", output[3])
    self.assertEqual("a 1 non-null float64\n", output[4])
def test_info_duplicate_columns_shows_correct_dtypes(self):
    # GH11761: each duplicate column keeps its own dtype in info() output
    out = StringIO()
    dupes = DataFrame([[1, 2.0]], columns=['a', 'a'])
    dupes.info(buf=out)
    out.seek(0)
    rendered = out.readlines()
    self.assertEqual('a 1 non-null int64\n', rendered[3])
    self.assertEqual('a 1 non-null float64\n', rendered[4])
def _remove_header(self, data):
    """Strip everything up to and including the divider line.

    Returns a rewound StringIO containing only the lines that follow
    the first line matching ``_DIVIDER``.
    """
    in_header = True
    body = StringIO()
    for row in data:
        if not in_header:
            body.write(row + '\n')
        elif re.search(_DIVIDER, row) is not None:
            # Divider found: subsequent lines are data, the divider
            # itself is discarded.
            in_header = False
    body.seek(0)
    return body
def test_info_duplicate_columns_shows_correct_dtypes(self):
    # GH11761: info() must not collapse dtypes of same-named columns
    buf = StringIO()
    frame = DataFrame([[1, 2.0]], columns=['a', 'a'])
    frame.info(buf=buf)
    buf.seek(0)
    contents = buf.readlines()
    assert contents[3] == 'a 1 non-null int64\n'
    assert contents[4] == 'a 1 non-null float64\n'
def _read_url_as_StringIO(self, url, params=None):
    """
    Open url (and retry)
    """
    response = self._get_response(url, params=params)
    buffer = StringIO()
    content = response.content
    # Decode byte payloads; text payloads are written through as-is.
    if isinstance(content, compat.binary_type):
        buffer.write(bytes_to_str(content))
    else:
        buffer.write(content)
    buffer.seek(0)
    return buffer
def test_to_csv_unicode_index_col(self): buf = StringIO('') df = DataFrame( [["\u05d0", "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]], columns=["\u05d0", "\u05d1", "\u05d2", "\u05d3"], index=["\u05d0", "\u05d1"]) df.to_csv(buf, encoding='UTF-8') buf.seek(0) df2 = read_csv(buf, index_col=0, encoding='UTF-8') assert_frame_equal(df, df2)
def test_to_csv_unicode_index_col(self): buf = StringIO('') df = DataFrame( [[u("\u05d0"), "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]], columns=[u("\u05d0"), u("\u05d1"), u("\u05d2"), u("\u05d3")], index=[u("\u05d0"), u("\u05d1")]) df.to_csv(buf, encoding='UTF-8') buf.seek(0) df2 = read_csv(buf, index_col=0, encoding='UTF-8') assert_frame_equal(df, df2)
def _read_url_as_StringIO(self, url, params=None):
    """
    Open url (and retry)
    """
    response = self._get_response(url, params=params)
    text = self._sanitize_response(response)
    if not text:
        # An empty body almost always means malformed query parameters.
        service = self.__class__.__name__
        raise IOError("{} request returned no data; check URL for invalid "
                      "inputs: {}".format(service, self.url))
    out = StringIO()
    if isinstance(text, compat.binary_type):
        out.write(bytes_to_str(text))
    else:
        out.write(text)
    out.seek(0)
    return out
def _read_url_as_StringIO(self, url, params=None, min=0, errors='ignore'):
    """Override of the base-class method of the same name.

    Decodes the response text using the encoding supplied by the
    derived class (``self._encoding``).

    NOTE(review): the ``min`` and ``errors`` parameters are never read in
    this body — the thresholds come from the instance attributes
    ``_read_url_as_StringIO_min_len`` / ``_read_url_as_StringIO_less_min_len``
    instead.  They are kept only for call-site compatibility (``min`` also
    shadows the builtin of the same name) — confirm before removing.
    """
    response = self._get_response(url, params=params)
    text = self._sanitize_response(response)
    if len(text) <= self._read_url_as_StringIO_min_len:
        if self._read_url_as_StringIO_less_min_len:
            # Too-short payload is treated as an error ...
            service = self.__class__.__name__
            raise IOError("{} request returned no data; check URL for "
                          "invalid inputs: {}".format(service, self.url))
        # ... or silently ignored, depending on instance configuration.
        return None
    out = StringIO()
    if isinstance(text, compat.binary_type):
        out.write(bytes_to_str(text, encoding=self._encoding))
    else:
        out.write(text)
    out.seek(0)
    return out
def read_iso_ts(indat, dense=True, parse_dates=True, extended_columns=False,
                force_freq=None):
    '''
    Reads the format printed by 'print_iso' and maybe other formats.

    :param indat: a pandas Series/DataFrame, '-' for stdin, a string
        containing the data itself (detected by embedded newlines), or a
        path to a pickled or CSV file.
    :param dense: if True, coerce the result to its best regular frequency
        via ``asbestfreq``.
    :param parse_dates: if False, do not use the first column as a
        datetime index.
    :param extended_columns: if True, prefix column names with the source
        file name.
    :param force_freq: frequency to force on the index; implies ``dense``.
    '''
    import csv

    from pandas.compat import StringIO

    if force_freq is not None:
        # force_freq implies a dense series
        dense = True

    index_col = 0
    if parse_dates is False:
        index_col = False

    # Would want this to be more generic...
    # Runs of 1-19 spaces (optionally followed by 'nan') count as missing.
    # NOTE(review): na_values is built but never passed to read_table below
    # -- preserved as-is; confirm whether it should be wired in.
    na_values = []
    for spc in range(1, 20):  # was range(20)[1:] -- same values, idiomatic
        spcs = ' ' * spc
        na_values.append(spcs)
        na_values.append(spcs + 'nan')

    fpi = None

    # Handle Series by converting to DataFrame
    if isinstance(indat, pd.Series):
        indat = pd.DataFrame(indat)

    if isinstance(indat, pd.DataFrame):
        # Already parsed: just normalize the index name and return.
        if indat.index.is_all_dates:
            indat.index.name = 'Datetime'
            if dense:
                return asbestfreq(indat, force_freq=force_freq)
            return indat
        indat.index.name = 'UniqueID'
        return indat

    has_header = False
    dialect = csv.excel

    if isinstance(indat, (str, bytes)):
        if isinstance(indat, bytes):
            # Decode explicitly instead of the old str(indat, 'utf-8')
            # inside a bare except (which also masked TypeError for str
            # input).  Undecodable bytes are left alone, matching the old
            # fall-through behaviour.
            try:
                indat = indat.decode('utf-8')
            except UnicodeDecodeError:
                pass
        if indat == '-':
            # if from stdin, format must be the tstoolbox standard
            has_header = True
            fpi = openinput(indat)
        elif '\n' in indat or '\r' in indat:
            # a string containing the data itself
            fpi = StringIO(indat)
        elif os.path.exists(indat):
            # Is it a pickled file?
            try:
                result = pd.io.pickle.read_pickle(indat)
                fpi = False
            except Exception:
                # Maybe a CSV file?  (deliberate best-effort fallback)
                fpi = openinput(indat)
        else:
            raise ValueError('''
*
* File {0} doesn't exist.
*
'''.format(indat))
    else:
        raise ValueError('''
*
* Can't figure out what was passed to read_iso_ts.
*
''')

    if fpi:
        try:
            fpi.seek(0)
            readsome = fpi.read(2048)
            fpi.seek(0)
            dialect = csv.Sniffer().sniff(readsome, delimiters=', \t:|')
            has_header = csv.Sniffer().has_header(readsome)
        except Exception:
            # This is an assumption: if sniffing fails, assume a header.
            has_header = True

        if extended_columns is True:
            fname = os.path.splitext(os.path.basename(fpi.name))[0]
            fstr = '{0}.{1}'
        else:
            fname = ''
            fstr = '{1}'
        if fname == '<stdin>':
            fname = '_'

        if has_header:
            result = pd.io.parsers.read_table(fpi, header=0, dialect=dialect,
                                              index_col=index_col,
                                              parse_dates=True,
                                              skipinitialspace=True)
            result.columns = [fstr.format(fname, i.strip())
                              for i in result.columns]
        else:
            result = pd.io.parsers.read_table(fpi, header=None,
                                              dialect=dialect,
                                              index_col=0,
                                              parse_dates=True,
                                              skipinitialspace=True)
            if len(result.columns) == 1:
                result.columns = [fname]
            else:
                result.columns = [fstr.format(fname, i.strip())
                                  for i in result.columns]

    if result.index.is_all_dates is True:
        result.index.name = 'Datetime'
        if dense:
            try:
                return asbestfreq(result, force_freq=force_freq)
            except ValueError:
                return result
    else:
        if result.index.name != 'UniqueID':
            result.reset_index(level=0, inplace=True)
        result.index.name = 'UniqueID'
    return result