def test_url(self):
    """A table fetched over HTTPS must equal the local reference copy."""
    # HTTP(S)
    url = ('https://raw.github.com/pydata/pandas/master/'
           'pandas/io/tests/salary.table')
    url_table = read_table(url)
    base_dir = curpath()
    local_path = os.path.join(base_dir, 'salary.table')
    local_table = read_table(local_path)
    assert_frame_equal(url_table, local_table)
def test_file(self):
    """A file:// URL must parse identically to the plain local path."""
    # FILE
    if sys.version_info[:2] < (2, 6):
        raise nose.SkipTest("file:// not supported with Python < 2.6")
    base_dir = curpath()
    local_path = os.path.join(base_dir, 'salary.table')
    local_table = read_table(local_path)
    url_table = read_table('file://localhost/' + local_path)
    assert_frame_equal(url_table, local_table)
def test_file(self):
    """Reading through a file:// URL gives the same frame as the path."""
    # FILE
    if sys.version_info[:2] < (2, 6):
        raise nose.SkipTest("file:// not supported with Python < 2.6")
    table_path = os.path.join(curpath(), 'salary.table')
    from_path = read_table(table_path)
    from_url = read_table('file://localhost/' + table_path)
    assert_frame_equal(from_url, from_path)
def test_url(self):
    """HTTPS fetch and file:// fetch both match the local table."""
    # HTTP(S)
    url = "https://raw.github.com/pydata/pandas/master/pandas/io/tests/salary.table"
    url_table = read_table(url)
    local_path = os.path.join(curpath(), "salary.table")
    local_table = read_table(local_path)
    assert_frame_equal(url_table, local_table)

    # FILE
    url_table = read_table("file://localhost/" + local_path)
    assert_frame_equal(url_table, local_table)
def test_no_header(self):
    """header=None yields X.n labels; names= supplies custom labels."""
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    frame = read_table(StringIO(data), sep=",", header=None)
    names = ["foo", "bar", "baz", "quux", "panda"]
    named = read_table(StringIO(data), sep=",", header=None, names=names)
    expected = [[1, 2, 3, 4, 5.0],
                [6, 7, 8, 9, 10],
                [11, 12, 13, 14, 15]]
    assert_almost_equal(frame.values, expected)
    assert_almost_equal(frame.values, named.values)
    # default labels for headerless input are X.1 .. X.5
    self.assert_(np.array_equal(frame.columns,
                                ["X.1", "X.2", "X.3", "X.4", "X.5"]))
    self.assert_(np.array_equal(named.columns, names))
def test_no_header(self):
    """Compare default and user-supplied column names when header=None."""
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    plain = read_table(StringIO(data), sep=',', header=None)
    names = ['foo', 'bar', 'baz', 'quux', 'panda']
    renamed = read_table(StringIO(data), sep=',', header=None, names=names)
    expected = [[1, 2, 3, 4, 5.],
                [6, 7, 8, 9, 10],
                [11, 12, 13, 14, 15]]
    assert_almost_equal(plain.values, expected)
    assert_almost_equal(plain.values, renamed.values)
    # auto-generated labels follow the X.<n> convention
    self.assert_(np.array_equal(plain.columns,
                                ['X.1', 'X.2', 'X.3', 'X.4', 'X.5']))
    self.assert_(np.array_equal(renamed.columns, names))
def test_no_header(self):
    """header=None: values parse and column labels are assigned."""
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    df = read_table(StringIO(data), sep=',', header=None)
    names = ['foo', 'bar', 'baz', 'quux', 'panda']
    df2 = read_table(StringIO(data), sep=',', header=None, names=names)
    expected = [[1, 2, 3, 4, 5.],
                [6, 7, 8, 9, 10],
                [11, 12, 13, 14, 15]]
    assert_almost_equal(df.values, expected)
    self.assert_(np.array_equal(df.columns,
                                ['X.1', 'X.2', 'X.3', 'X.4', 'X.5']))
    self.assert_(np.array_equal(df2.columns, names))
def test_read_csv_no_index_name(self):
    """read_csv and read_table(sep=',') agree; dates and floats parse."""
    df = read_csv(self.csv2, index_col=0, parse_dates=True)
    df2 = read_table(self.csv2, sep=",", index_col=0, parse_dates=True)

    self.assert_(np.array_equal(df.columns, ["A", "B", "C", "D", "E"]))
    # parsed index entries are datetimes
    self.assert_(isinstance(df.index[0], datetime))
    self.assert_(df.ix[:, ["A", "B", "C", "D"]].values.dtype == np.float64)
    assert_frame_equal(df, df2)
def read_clipboard(**kwargs):  # pragma: no cover
    """
    Read text from clipboard and pass to read_table. See read_table for the
    full argument list

    Returns
    -------
    parsed : DataFrame
    """
    # fall back to whitespace splitting when no delimiter was requested
    if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
        kwargs['sep'] = '\s+'
    from pandas.util.clipboard import clipboard_get
    from pandas.io.parsers import read_table
    text = clipboard_get()

    # try to decode (if needed on PY3)
    if compat.PY3:
        try:
            text = compat.bytes_to_str(
                text,
                encoding=(kwargs.get('encoding') or
                          get_option('display.encoding')))
        except:
            pass
    return read_table(StringIO(text), **kwargs)
def test_iterator(self):
    """Chunked reading via iterator=True and via a TextParser over rows."""
    reader = read_csv(StringIO(self.data1), index_col=0, iterator=True)
    df = read_csv(StringIO(self.data1), index_col=0)

    chunk = reader.get_chunk(3)
    assert_frame_equal(chunk, df[:3])
    # request past the end: returns whatever remains
    last_chunk = reader.get_chunk(5)
    assert_frame_equal(last_chunk, df[3:])

    # pass list
    lines = list(csv.reader(StringIO(self.data1)))
    parser = TextParser(lines, index_col=0, chunksize=2)
    df = read_csv(StringIO(self.data1), index_col=0)
    chunks = list(parser)
    assert_frame_equal(chunks[0], df[:2])
    assert_frame_equal(chunks[1], df[2:4])
    assert_frame_equal(chunks[2], df[4:])

    treader = read_table(StringIO(self.data1), sep=',', index_col=0,
                         iterator=True)
    self.assert_(isinstance(treader, TextParser))
def test_iterator(self):
    """Chunked reading, skiprows support, and skip_footer rejection."""
    reader = read_csv(StringIO(self.data1), index_col=0, iterator=True)
    df = read_csv(StringIO(self.data1), index_col=0)

    chunk = reader.get_chunk(3)
    assert_frame_equal(chunk, df[:3])
    # over-asking just drains the remainder
    last_chunk = reader.get_chunk(5)
    assert_frame_equal(last_chunk, df[3:])

    # pass list
    lines = list(csv.reader(StringIO(self.data1)))
    parser = TextParser(lines, index_col=0, chunksize=2)
    df = read_csv(StringIO(self.data1), index_col=0)
    chunks = list(parser)
    assert_frame_equal(chunks[0], df[:2])
    assert_frame_equal(chunks[1], df[2:4])
    assert_frame_equal(chunks[2], df[4:])

    # pass skiprows
    parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
    chunks = list(parser)
    assert_frame_equal(chunks[0], df[1:3])

    # test bad parameter (skip_footer)
    reader = read_csv(StringIO(self.data1), index_col=0, iterator=True,
                      skip_footer=True)
    self.assertRaises(ValueError, reader.get_chunk, 3)

    treader = read_table(StringIO(self.data1), sep=',', index_col=0,
                         iterator=True)
    self.assert_(isinstance(treader, TextParser))
def test_read_csv_no_index_name(self):
    """csv2 parses the same via read_csv and read_table; index is datetime-like."""
    df = read_csv(self.csv2, index_col=0, parse_dates=True)
    df2 = read_table(self.csv2, sep=',', index_col=0, parse_dates=True)

    self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D', 'E']))
    # index entries may surface as datetime, np.datetime64 or Timestamp
    self.assert_(isinstance(df.index[0],
                            (datetime, np.datetime64, Timestamp)))
    self.assert_(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype == np.float64)
    assert_frame_equal(df, df2)
def read_clipboard(**kwargs):  # pragma: no cover
    """
    Read text from clipboard and pass to read_table. See read_table for the
    full argument list

    If unspecified, `sep` defaults to '\s+'

    Returns
    -------
    parsed : DataFrame
    """
    # whitespace splitting is the default when no delimiter is given
    if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
        kwargs['sep'] = '\s+'
    from pandas.util.clipboard import clipboard_get
    from pandas.io.parsers import read_table
    text = clipboard_get()

    # try to decode (if needed on PY3)
    if compat.PY3:
        try:
            text = compat.bytes_to_str(
                text,
                encoding=(kwargs.get('encoding') or
                          get_option('display.encoding')))
        except:
            pass
    return read_table(StringIO(text), **kwargs)
def test_parse_date_time(self):
    """parse_date_time combines date/time columns, directly and via parsers."""
    result = conv.parse_date_time(self.dates, self.times)
    self.assert_((result == self.expected).all())

    data = """\
date, time, a, b
2001-01-05, 10:00:00, 0.0, 10.
2001-01-05, 00:00:00, 1., 11.
"""
    datecols = {'date_time': [0, 1]}
    df = read_table(StringIO(data), sep=',', header=0,
                    parse_dates=datecols, date_parser=conv.parse_date_time)
    self.assert_('date_time' in df)
    self.assertEqual(df.date_time.ix[0], datetime(2001, 1, 5, 10, 0, 0))

    data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
            "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
            "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
            "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
            "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
            "KORD,19990127, 23:00:00, 22:56:00, -0.5900")
    # overlapping source columns feeding two combined date columns
    date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
    df = read_csv(StringIO(data), header=None, parse_dates=date_spec,
                  date_parser=conv.parse_date_time)
def test_parse_date_time(self):
    """parse_date_time merges separate date and time columns into one."""
    result = conv.parse_date_time(self.dates, self.times)
    self.assert_((result == self.expected).all())

    data = """\
date, time, a, b
2001-01-05, 10:00:00, 0.0, 10.
2001-01-05, 00:00:00, 1., 11.
"""
    datecols = {'date_time': [0, 1]}
    df = read_table(StringIO(data), sep=',', header=0,
                    parse_dates=datecols, date_parser=conv.parse_date_time)
    self.assert_('date_time' in df)
    self.assert_(df.date_time.ix[0] == datetime(2001, 1, 5, 10, 0, 0))

    data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
            "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
            "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
            "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
            "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
            "KORD,19990127, 23:00:00, 22:56:00, -0.5900")
    # column 1 is shared between the two combined date columns
    date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
    df = read_csv(StringIO(data), header=None, parse_dates=date_spec,
                  date_parser=conv.parse_date_time)
def test_read_csv_no_index_name(self):
    """Default (no index_col) read of csv2 matches read_table(sep=',')."""
    df = read_csv(self.csv2)
    df2 = read_table(self.csv2, sep=',')
    self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D', 'E']))
    self.assert_(isinstance(df.index[0], datetime))
    self.assert_(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype == np.float64)
    assert_frame_equal(df, df2)
def test_generic(self):
    """A user-supplied date_parser can combine arbitrary columns."""
    data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11."
    datecols = {"ym": [0, 1]}
    # collapse year+month to the first of the month
    dateconverter = lambda y, m: date(year=int(y), month=int(m), day=1)
    df = read_table(StringIO(data), sep=",", header=0,
                    parse_dates=datecols, date_parser=dateconverter)
    self.assertIn("ym", df)
    self.assertEqual(df.ym.ix[0], date(2001, 1, 1))
def test_read_csv_no_index_name(self):
    """csv2 with a parsed date index: read_csv equals read_table."""
    df = read_csv(self.csv2, index_col=0, parse_dates=True)
    df2 = read_table(self.csv2, sep=',', index_col=0, parse_dates=True)
    self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D', 'E']))
    self.assert_(isinstance(df.index[0], datetime))
    self.assert_(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype == np.float64)
    assert_frame_equal(df, df2)
def test_no_unnamed_index(self):
    """A leading blank header cell must not produce a named index."""
    data = """ id c0 c1 c2
0 1 0 a b
1 2 0 c d
2 2 2 e f
"""
    df = read_table(StringIO(data), sep=' ')
    self.assert_(df.index.name is None)
def test_read_csv_dataframe(self):
    """csv1 parses with a named datetime index and float values."""
    df = read_csv(self.csv1, index_col=0, parse_dates=True)
    df2 = read_table(self.csv1, sep=',', index_col=0, parse_dates=True)
    self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D']))
    self.assert_(df.index.name == 'index')
    self.assert_(isinstance(df.index[0], datetime))
    self.assert_(df.values.dtype == np.float64)
    assert_frame_equal(df, df2)
def test_duplicate_columns(self):
    """Duplicate headers get .1/.2 suffixes for uniqueness."""
    data = """A,A,B,B,B
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    df = read_table(StringIO(data), sep=",")
    self.assert_(np.array_equal(df.columns,
                                ["A", "A.1", "B", "B.1", "B.2"]))
def read_clipboard(sep=r'\s+', **kwargs):  # pragma: no cover
    r"""
    Read text from clipboard and pass to read_table. See read_table for the
    full argument list

    Parameters
    ----------
    sep : str, default '\s+'.
        A string or regex delimiter. The default of '\s+' denotes
        one or more whitespace characters.

    Returns
    -------
    parsed : DataFrame
    """
    encoding = kwargs.pop('encoding', 'utf-8')

    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
        raise NotImplementedError(
            'reading from clipboard only supports utf-8 encoding')

    from pandas.io.clipboard import clipboard_get
    from pandas.io.parsers import read_table
    text = clipboard_get()

    # try to decode (if needed on PY3)
    # Strange. linux py33 doesn't complain, win py33 does
    if compat.PY3:
        try:
            text = compat.bytes_to_str(
                text,
                encoding=(kwargs.get('encoding') or
                          get_option('display.encoding')))
        except:
            pass

    # Excel copies into clipboard with \t separation
    # inspect no more then the 10 first lines, if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
    lines = text[:10000].split('\n')[:-1][:10]

    # Need to remove leading white space, since read_table
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4
    counts = {x.lstrip().count('\t') for x in lines}
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        sep = r'\t'

    if sep is None and kwargs.get('delim_whitespace') is None:
        sep = r'\s+'

    return read_table(StringIO(text), sep=sep, **kwargs)
def read_clipboard(sep='\s+', **kwargs):  # pragma: no cover
    r"""
    Read text from clipboard and pass to read_table. See read_table for the
    full argument list

    Parameters
    ----------
    sep : str, default '\s+'.
        A string or regex delimiter. The default of '\s+' denotes
        one or more whitespace characters.

    Returns
    -------
    parsed : DataFrame
    """
    encoding = kwargs.pop('encoding', 'utf-8')

    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
        raise NotImplementedError(
            'reading from clipboard only supports utf-8 encoding')

    from pandas.io.clipboard import clipboard_get
    from pandas.io.parsers import read_table
    text = clipboard_get()

    # try to decode (if needed on PY3)
    # Strange. linux py33 doesn't complain, win py33 does
    if compat.PY3:
        try:
            text = compat.bytes_to_str(
                text,
                encoding=(kwargs.get('encoding') or
                          get_option('display.encoding')))
        except:
            pass

    # Excel copies into clipboard with \t separation
    # inspect no more then the 10 first lines, if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
    lines = text[:10000].split('\n')[:-1][:10]

    # Need to remove leading white space, since read_table
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4
    counts = set([x.lstrip().count('\t') for x in lines])
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        sep = '\t'

    if sep is None and kwargs.get('delim_whitespace') is None:
        sep = '\s+'

    return read_table(StringIO(text), sep=sep, **kwargs)
def test_read_csv_dataframe(self):
    """csv1: named index, datetime-like entries, float dtype throughout."""
    df = read_csv(self.csv1, index_col=0, parse_dates=True)
    df2 = read_table(self.csv1, sep=',', index_col=0, parse_dates=True)
    self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D']))
    self.assert_(df.index.name == 'index')
    # index entries may be datetime, np.datetime64 or Timestamp
    self.assert_(isinstance(df.index[0],
                            (datetime, np.datetime64, Timestamp)))
    self.assert_(df.values.dtype == np.float64)
    assert_frame_equal(df, df2)
def test_duplicate_columns(self):
    """Repeated column names are de-duplicated with numeric suffixes."""
    data = """A,A,B,B,B
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    df = read_table(StringIO(data), sep=',')
    self.assert_(np.array_equal(df.columns,
                                ['A', 'A.1', 'B', 'B.1', 'B.2']))
def test_generic(self):
    """Arbitrary date_parser callables work with column groups."""
    data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11."
    datecols = {'ym': [0, 1]}
    # year+month mapped to the first day of that month
    dateconverter = lambda y, m: date(year=int(y), month=int(m), day=1)
    df = read_table(StringIO(data), sep=',', header=0,
                    parse_dates=datecols, date_parser=dateconverter)
    self.assert_('ym' in df)
    self.assert_(df.ym.ix[0] == date(2001, 1, 1))
def test_regex_separator(self):
    """A regex separator parses like the comma-substituted equivalent."""
    data = """ A B C D
a 1 2 3 4
b 1 2 3 4
c 1 2 3 4
"""
    df = read_table(StringIO(data), sep='\s+')
    # rebuild as a plain csv and compare
    expected = read_csv(StringIO(re.sub('[ ]+', ',', data)), index_col=0)
    self.assert_(expected.index.name is None)
    assert_frame_equal(df, expected)
def test_parse_date_fields(self):
    """parse_date_fields builds datetimes from y/m/d columns."""
    result = conv.parse_date_fields(self.years, self.months, self.days)
    expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)])
    self.assertTrue((result == expected).all())

    data = "year, month, day, a\n 2001 , 01 , 10 , 10.\n 2001 , 02 , 1 , 11."
    datecols = {"ymd": [0, 1, 2]}
    df = read_table(StringIO(data), sep=",", header=0,
                    parse_dates=datecols,
                    date_parser=conv.parse_date_fields)
    self.assertIn("ymd", df)
    self.assertEqual(df.ymd.ix[0], datetime(2001, 1, 10))
def test_read_table_buglet_4x_multiindex(self):
    """Regression: 4-level row multiindex with whitespace separator."""
    # NOTE(review): original column alignment inside this literal is not
    # recoverable; sep='\s+' makes single spaces parse-equivalent.
    text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""

    # it works!
    df = read_table(StringIO(text), sep='\s+')
    self.assertEquals(df.index.names, ['one', 'two', 'three', 'four'])
def _get_probe_mapping(self, agilent_file):
    """
    Given an agilent file that maps probe ids to gene symbols
    return dataframe with ProbeID and GeneSymbol columns

    The returned frame is indexed by 'ProbeID' and contains only rows
    with a non-null 'GeneSymbol'.
    """
    agl = parsers.read_table(join(self.working_dir, agilent_file))
    # The original code first called agl.set_index('ProbeID') and
    # discarded the result -- set_index is not in-place by default, so
    # that call was a dead no-op and has been removed.
    mapped = agl[agl['GeneSymbol'].notnull()]
    return mapped.set_index('ProbeID')
def _get_data(self, data_file, annotations_file):
    """
    Given data file and annotations, make dataframe indexed by ProbeName
    with control probes dropped
    """
    frame = parsers.read_table(join(self.working_dir, data_file))
    annot = self._get_annotations(annotations_file)
    # set data index
    frame.index = annot['ProbeName']
    return self._drop_controls(frame, annotations_file)
def test_unnamed_columns(self):
    """Blank header cells become Unnamed: <n> labels."""
    data = """A,B,C,,
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    expected = [[1, 2, 3, 4, 5.0],
                [6, 7, 8, 9, 10],
                [11, 12, 13, 14, 15]]
    df = read_table(StringIO(data), sep=",")
    assert_almost_equal(df.values, expected)
    self.assert_(np.array_equal(df.columns,
                                ["A", "B", "C", "Unnamed: 3",
                                 "Unnamed: 4"]))
def test_regex_separator(self):
    """Whitespace-regex sep matches the comma-substituted parse."""
    data = """ A B C D
a 1 2 3 4
b 1 2 3 4
c 1 2 3 4
"""
    df = read_table(StringIO(data), sep="\s+")
    expected = read_csv(StringIO(re.sub("[ ]+", ",", data)), index_col=0)
    self.assert_(expected.index.name is None)
    assert_frame_equal(df, expected)
def test_squeeze(self):
    """squeeze=True collapses a single-column frame to a Series."""
    data = """\
a,1
b,2
c,3
"""
    expected = Series([1, 2, 3], ['a', 'b', 'c'])
    result = read_table(StringIO(data), sep=',', index_col=0,
                        header=None, squeeze=True)
    self.assert_(isinstance(result, Series))
    assert_series_equal(result, expected)
def readTable(fileName):
    """Load a scraped-articles table, normalising NaN text and dates."""
    # does not seem to have effect
    columnTypes = {'id': int, 'date_published': datetime,
                   'date_saved': datetime, 'url': str, 'author': str,
                   'title_scraped': str, 'title_feed': str,
                   'description': str, 'text': str}

    table = parsers.read_table(fileName, quotechar='"', parse_dates=[2])
    # empty string are read in as NaN, replace
    table.replace(to_replace={'text': {NaN: ''}}, inplace=True)

    # date_published containes "null" values, so read_table wont parse the
    # dates; code below seems to throw exceptions, see the docs
    newDatePub = table['date_published'].apply(
        lambda a: pandas.tslib.Timestamp(a) if a != 'null' else None)
    table['date_published'] = newDatePub
    return table
def test_url(self):
    """HTTPS fetch equals local copy; skip gracefully when offline."""
    import urllib2
    try:
        # HTTP(S)
        url = ('https://raw.github.com/pydata/pandas/master/'
               'pandas/io/tests/salary.table')
        url_table = read_table(url)
        local_path = os.path.join(curpath(), 'salary.table')
        local_table = read_table(local_path)
        assert_frame_equal(url_table, local_table)
        # TODO: ftp testing
    except urllib2.URLError:
        # distinguish "no network" (skip) from a genuine failure
        try:
            urllib2.urlopen('http://www.google.com')
        except urllib2.URLError:
            raise nose.SkipTest
        else:
            raise
def test_datetime_fractional_seconds(self):
    """Fractional seconds survive parse_all_fields into microseconds."""
    data = """\
year, month, day, hour, minute, second, a, b
2001, 01, 05, 10, 00, 0.123456, 0.0, 10.
2001, 01, 5, 10, 0, 0.500000, 1., 11.
"""
    datecols = {"ymdHMS": [0, 1, 2, 3, 4, 5]}
    df = read_table(StringIO(data), sep=",", header=0,
                    parse_dates=datecols,
                    date_parser=conv.parse_all_fields)
    self.assertIn("ymdHMS", df)
    self.assertEqual(df.ymdHMS.ix[0],
                     datetime(2001, 1, 5, 10, 0, 0, microsecond=123456))
    self.assertEqual(df.ymdHMS.ix[1],
                     datetime(2001, 1, 5, 10, 0, 0, microsecond=500000))
def test_1000_sep(self):
    """thousands=',' strips grouping commas during numeric parse."""
    data = """A|B|C
1|2,334.0|5
10|13|10.
"""
    expected = [[1, 2334., 5],
                [10, 13, 10]]

    df = read_csv(StringIO(data), sep='|', thousands=',')
    assert_almost_equal(df.values, expected)

    df = read_table(StringIO(data), sep='|', thousands=',')
    assert_almost_equal(df.values, expected)
def test_parse_date_fields(self):
    """parse_date_fields combines y/m/d both directly and via read_table."""
    result = conv.parse_date_fields(self.years, self.months, self.days)
    expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)])
    self.assert_((result == expected).all())

    data = "year, month, day, a\n 2001 , 01 , 10 , 10.\n 2001 , 02 , 1 , 11."
    datecols = {'ymd': [0, 1, 2]}
    df = read_table(StringIO(data), sep=',', header=0,
                    parse_dates=datecols,
                    date_parser=conv.parse_date_fields)
    self.assert_('ymd' in df)
    self.assert_(df.ymd.ix[0] == datetime(2001, 1, 10))
def test_comment(self):
    """Text after the comment char is dropped from data rows."""
    data = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
"""
    expected = [[1., 2., 4.],
                [5., np.nan, 10.]]

    df = read_csv(StringIO(data), comment='#')
    assert_almost_equal(df.values, expected)

    df = read_table(StringIO(data), sep=',', comment='#',
                    na_values=['NaN'])
    assert_almost_equal(df.values, expected)
def test_unnamed_columns(self):
    """Trailing blank header cells produce Unnamed: <n> columns."""
    data = """A,B,C,,
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    expected = [[1, 2, 3, 4, 5.],
                [6, 7, 8, 9, 10],
                [11, 12, 13, 14, 15]]
    df = read_table(StringIO(data), sep=',')
    assert_almost_equal(df.values, expected)
    self.assert_(np.array_equal(df.columns,
                                ['A', 'B', 'C', 'Unnamed: 3',
                                 'Unnamed: 4']))
def check_compressed_urls(salaries_table, compression, extension, mode,
                          engine):
    # test reading compressed urls with various engines and
    # extension inference
    base_url = ('https://github.com/pandas-dev/pandas/raw/master/'
                'pandas/tests/io/parser/data/salaries.csv')
    url = base_url + extension

    # 'explicit' keeps the caller-supplied compression; any other mode
    # overrides it with the mode name itself
    if mode != 'explicit':
        compression = mode

    url_table = read_table(url, compression=compression, engine=engine)
    tm.assert_frame_equal(url_table, salaries_table)
def test_malformed(self):
    """A row with too many fields raises ValueError with a clear message.

    Uses ``except ... as`` (valid from Python 2.6, required by Python 3)
    instead of the Python-2-only ``except ValueError, inst`` form the
    original used.
    """
    # all
    data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
"""
    try:
        df = read_table(StringIO(data), sep=',', header=1, comment='#')
        self.assert_(False)
    except ValueError as inst:
        self.assert_('Expecting 3 columns, got 5 in row 3' in str(inst))
def test_squeeze(self):
    """A one-column, indexed read with squeeze=True yields a Series."""
    data = """\
a,1
b,2
c,3
"""
    expected = Series([1, 2, 3], ['a', 'b', 'c'])
    result = read_table(StringIO(data), sep=',', index_col=0,
                        header=None, squeeze=True)
    self.assert_(isinstance(result, Series))
    assert_series_equal(result, expected)
def read_clipboard(**kwargs):  # pragma: no cover
    """
    Read text from clipboard and pass to read_table. See read_table for the
    full argument list

    Returns
    -------
    parsed : DataFrame
    """
    # default to whitespace splitting unless a delimiter was requested
    if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
        kwargs['sep'] = '\s+'
    from pandas.util.clipboard import clipboard_get
    from pandas.io.parsers import read_table
    text = clipboard_get()
    return read_table(StringIO(text), **kwargs)
def test_datetime_fractional_seconds(self):
    """Sub-second precision is preserved by parse_all_fields."""
    data = """\
year, month, day, hour, minute, second, a, b
2001, 01, 05, 10, 00, 0.123456, 0.0, 10.
2001, 01, 5, 10, 0, 0.500000, 1., 11.
"""
    datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]}
    df = read_table(StringIO(data), sep=',', header=0,
                    parse_dates=datecols,
                    date_parser=conv.parse_all_fields)
    self.assert_('ymdHMS' in df)
    self.assert_(df.ymdHMS.ix[0] == datetime(2001, 1, 5, 10, 0, 0,
                                             microsecond=123456))
    self.assert_(df.ymdHMS.ix[1] == datetime(2001, 1, 5, 10, 0, 0,
                                             microsecond=500000))
def test_quoting(self):
    """An unbalanced quote raises; closing it parses to 3 rows."""
    bad_line_small = """printer\tresult\tvariant_name
Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jacob
Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jakob
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\t"Furststiftische Hofdruckerei, <Kempten""
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tGaller, Alois
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tHochfurstliche Buchhandlung <Kempten>"""
    self.assertRaises(Exception, read_table, StringIO(bad_line_small),
                      sep='\t')

    # appending the missing closing quote makes the input well-formed
    good_line_small = bad_line_small + '"'
    df = read_table(StringIO(good_line_small), sep='\t')
    self.assert_(len(df) == 3)
def test_datetime_six_col(self):
    """Six date/time columns fold into a single datetime column."""
    result = conv.parse_all_fields(self.years, self.months, self.days,
                                   self.hours, self.minutes, self.seconds)
    self.assert_((result == self.expected).all())

    data = """\
year, month, day, hour, minute, second, a, b
2001, 01, 05, 10, 00, 0, 0.0, 10.
2001, 01, 5, 10, 0, 00, 1., 11.
"""
    datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]}
    df = read_table(StringIO(data), sep=',', header=0,
                    parse_dates=datecols,
                    date_parser=conv.parse_all_fields)
    self.assert_('ymdHMS' in df)
    self.assert_(df.ymdHMS.ix[0] == datetime(2001, 1, 5, 10, 0, 0))
def read_clipboard(**kwargs):  # pragma: no cover
    """
    Read text from clipboard and pass to read_table. See read_table for the
    full argument list

    If unspecified, `sep` defaults to '\s+'

    Returns
    -------
    parsed : DataFrame
    """
    from pandas.util.clipboard import clipboard_get
    from pandas.io.parsers import read_table
    text = clipboard_get()

    # try to decode (if needed on PY3)
    # Strange. linux py33 doesn't complain, win py33 does
    if compat.PY3:
        try:
            text = compat.bytes_to_str(
                text,
                encoding=(kwargs.get('encoding') or
                          get_option('display.encoding')))
        except:
            pass

    # Excel copies into clipboard with \t separation
    # inspect no more then the 10 first lines, if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
    lines = text[:10000].split('\n')[:-1][:10]

    # Need to remove leading white space, since read_table
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4
    counts = set([x.lstrip().count('\t') for x in lines])
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        kwargs['sep'] = '\t'

    if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
        kwargs['sep'] = '\s+'
    return read_table(StringIO(text), **kwargs)
def test_custom_na_values(self):
    """na_values adds user sentinels on top of the defaults."""
    data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
    expected = [[1., nan, 3],
                [nan, 5, nan],
                [7, 8, nan]]

    df = read_csv(StringIO(data), na_values=['baz'], skiprows=[1])
    assert_almost_equal(df.values, expected)

    df2 = read_table(StringIO(data), sep=',', na_values=['baz'],
                     skiprows=[1])
    assert_almost_equal(df2.values, expected)
def readTable(fileName):
    """Read the scraped-articles table, cleaning text NaNs and dates."""
    # does not seem to have effect
    columnTypes = {
        'id': int,
        'date_published': datetime,
        'date_saved': datetime,
        'url': str,
        'author': str,
        'title_scraped': str,
        'title_feed': str,
        'description': str,
        'text': str
    }

    table = parsers.read_table(fileName, quotechar='"', parse_dates=[2])
    # empty string are read in as NaN, replace
    table.replace(to_replace={'text': {NaN: ''}}, inplace=True)

    # date_published containes "null" values, so read_table wont parse the dates
    # code below seems to throw exceptions, see the docs
    fixed_dates = table['date_published'].apply(
        lambda a: pandas.tslib.Timestamp(a) if a != 'null' else None)
    table['date_published'] = fixed_dates
    return table
def salaries_table(parser_data):
    """DataFrame with the salaries dataset"""
    return read_table(os.path.join(parser_data, 'salaries.csv'))