def test_write_infer(self, ext, get_random_path):
    """Round-trip a pickled frame whose compression is inferred from *ext*."""
    base = get_random_path
    path1 = base + ext
    path2 = base + ".raw"
    # Reverse-map the extension back to its compression name (None if no match).
    compression = None
    for c in self._compression_to_extension:
        if self._compression_to_extension[c] == ext:
            compression = c
            break
    with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
        df = tm.makeDataFrame()

        # write to compressed file by inferred compression method
        df.to_pickle(p1)

        # decompress
        with tm.decompress_file(p1, compression=compression) as f:
            with open(p2, "wb") as fh:
                fh.write(f.read())

        # read decompressed file
        df2 = pd.read_pickle(p2, compression=None)

        tm.assert_frame_equal(df, df2)
def test_excel_cell_error_na(self):
    """A cell-level error value in the sheet parses as NaN."""
    _skip_if_no_xlrd()
    path = os.path.join(self.dirpath, 'test3.xls')
    expected = DataFrame([[np.nan]], columns=['Test'])
    parsed = ExcelFile(path).parse('Sheet1')
    tm.assert_frame_equal(parsed, expected)
def test_query_single_element_booleans(self, parser, engine): columns = 'bid', 'bidsize', 'ask', 'asksize' data = np.random.randint(2, size=(1, len(columns))).astype(bool) df = DataFrame(data, columns=columns) res = df.query('bid & ask', engine=engine, parser=parser) expected = df[df.bid & df.ask] assert_frame_equal(res, expected)
def test_groupby_to_series_to_frame_2(self):
    """groupby_to_series_to_frame(use_apply=False) matches plain groupby-apply."""
    df = pd.DataFrame({'a': [6, 2, 2], 'b': [4, 5, 6]})
    labels = ['g1', 'g1', 'g2']
    # Reference result from pandas' own groupby/apply machinery.
    benchmark = df.groupby(labels).apply(frame_to_series)
    result = pandas_easy.groupby_to_series_to_frame(
        df, frame_to_series, 1, use_apply=False, by=labels)
    assert_frame_equal(result, benchmark)
def test_excel_stop_iterator(self):
    """Parsing test2.xls yields the expected single-row frame."""
    _skip_if_no_xlrd()
    path = os.path.join(self.dirpath, 'test2.xls')
    expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1'])
    parsed = ExcelFile(path).parse('Sheet1')
    tm.assert_frame_equal(parsed, expected)
def test_nested_scope(self):
    """Local frames are visible to eval/query; '@' names need the pandas parser."""
    from pandas.core.computation.ops import UndefinedVariableError
    engine = self.engine
    parser = self.parser
    # smoke test
    x = 1  # noqa
    result = pd.eval('x + 1', engine=engine, parser=parser)
    assert result == 2

    df = DataFrame(np.random.randn(5, 3))
    df2 = DataFrame(np.random.randn(5, 3))

    # don't have the pandas parser
    with pytest.raises(SyntaxError):
        df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)

    with pytest.raises(UndefinedVariableError):
        df.query('(df>0) & (df2>0)', engine=engine, parser=parser)

    expected = df[(df > 0) & (df2 > 0)]
    result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine,
                     parser=parser)
    assert_frame_equal(expected, result)

    # Nested indexing expression inside the eval string.
    expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
    result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]',
                     engine=engine, parser=parser)
    assert_frame_equal(expected, result)
def test_query_with_nested_special_character(self, parser, engine):
    """A quoted string containing '&' survives query parsing intact."""
    skip_if_no_pandas_parser(parser)
    frame = DataFrame({'a': ['a', 'b', 'test & test'],
                       'b': [1, 2, 3]})
    expected = frame[frame.a == 'test & test']
    result = frame.query('a == "test & test"', parser=parser, engine=engine)
    assert_frame_equal(result, expected)
def test_sort_index_different_sortorder(self):
    """Per-column ascending flags work for sort_values and MultiIndex sort_index."""
    A = np.arange(20).repeat(5)
    B = np.tile(np.arange(5), 20)

    indexer = np.random.permutation(100)
    A = A.take(indexer)
    B = B.take(indexer)

    df = DataFrame({'A': A, 'B': B,
                    'C': np.random.randn(100)})

    # use .sort_values #9816
    with tm.assert_produces_warning(FutureWarning):
        df.sort_index(by=['A', 'B'], ascending=[1, 0])
    result = df.sort_values(by=['A', 'B'], ascending=[1, 0])

    # Negating B converts the descending key into an ascending one for lexsort.
    ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
    expected = df.take(ex_indexer)
    assert_frame_equal(result, expected)

    # test with multiindex, too
    idf = df.set_index(['A', 'B'])

    result = idf.sort_index(ascending=[1, 0])
    expected = idf.take(ex_indexer)
    assert_frame_equal(result, expected)

    # also, Series!
    result = idf['C'].sort_index(ascending=[1, 0])
    assert_series_equal(result, expected['C'])
def test_query_python(self):
    """query/eval with engine='python' match the precomputed expectations."""
    df = self.df
    result = df.query('A>0', engine='python')
    assert_frame_equal(result, self.expected1)
    result = df.eval('A+1', engine='python')
    assert_series_equal(result, self.expected2, check_names=False)
def test_groupby_groups_datetimeindex(self):
    """Grouping over a DatetimeIndex yields datetime keys and per-day groups."""
    # GH#1430
    periods = 1000
    ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods)
    df = DataFrame({'high': np.arange(periods),
                    'low': np.arange(periods)}, index=ind)
    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

    # it works!
    groups = grouped.groups
    assert isinstance(list(groups.keys())[0], datetime)

    # GH#11442
    index = pd.date_range('2015/01/01', periods=5, name='date')
    df = pd.DataFrame({'A': [5, 6, 7, 8, 9],
                       'B': [1, 2, 3, 4, 5]}, index=index)
    result = df.groupby(level='date').groups
    dates = ['2015-01-05', '2015-01-04', '2015-01-03',
             '2015-01-02', '2015-01-01']
    expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date')
                for date in dates}
    tm.assert_dict_equal(result, expected)

    # Each date string should also retrieve a one-row group via get_group.
    grouped = df.groupby(level='date')
    for date in dates:
        result = grouped.get_group(date)
        data = [[df.loc[date, 'A'], df.loc[date, 'B']]]
        expected_index = pd.DatetimeIndex([date], name='date')
        expected = pd.DataFrame(data,
                                columns=list('AB'),
                                index=expected_index)
        tm.assert_frame_equal(result, expected)
def test_stable_descending_sort(self): # GH #6399 df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']], columns=['sort_col', 'order']) sorted_df = df.sort_values(by='sort_col', kind='mergesort', ascending=False) assert_frame_equal(df, sorted_df)
def run_tests(df, rhs, right):
    """Assign *rhs* into the same region via loc/iloc/ix and expect *right*."""
    # label, index, slice
    r, i, s = list('bcd'), [1, 2, 3], slice(1, 4)
    c, j, l = ['joe', 'jolie'], [1, 2], slice(1, 3)

    left = df.copy()
    left.loc[r, c] = rhs
    tm.assert_frame_equal(left, right)

    left = df.copy()
    left.iloc[i, j] = rhs
    tm.assert_frame_equal(left, right)

    # .ix is deprecated; suppress its warnings while exercising it.
    left = df.copy()
    with catch_warnings(record=True):
        left.ix[s, l] = rhs
    tm.assert_frame_equal(left, right)

    left = df.copy()
    with catch_warnings(record=True):
        left.ix[i, j] = rhs
    tm.assert_frame_equal(left, right)

    left = df.copy()
    with catch_warnings(record=True):
        left.ix[r, c] = rhs
    tm.assert_frame_equal(left, right)
def test_coercion_with_loc(self):
    """Setting None via .loc coerces dtypes per EXPECTED_SINGLE_ROW_RESULTS."""
    for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS:
        start_dataframe = DataFrame({'foo': start_data})
        # Assigning None should trigger the dtype coercion under test.
        start_dataframe.loc[0, ['foo']] = None

        expected_dataframe = DataFrame({'foo': expected_result})
        tm.assert_frame_equal(start_dataframe, expected_dataframe)
def test_pivot_datetime_tz(self):
    """pivot_table keeps tz-aware datetimes in both index and column levels."""
    dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
              '2011-07-19 09:00:00', '2011-07-19 07:00:00',
              '2011-07-19 08:00:00', '2011-07-19 09:00:00']
    dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00',
              '2013-01-01 15:00:00', '2013-02-01 15:00:00',
              '2013-02-01 15:00:00', '2013-02-01 15:00:00']
    df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                    'dt1': dates1, 'dt2': dates2,
                    'value1': range(6), 'value2': [1, 2] * 3})
    # Localize the two datetime columns to different time zones.
    df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
    df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))

    exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00',
                                '2011-07-19 08:00:00',
                                '2011-07-19 09:00:00'],
                               tz='US/Pacific', name='dt1')
    exp_col1 = Index(['value1', 'value1'])
    exp_col2 = Index(['a', 'b'], name='label')
    exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
    expected = DataFrame([[0, 3], [1, 4], [2, 5]],
                         index=exp_idx, columns=exp_col)
    result = pivot_table(df, index=['dt1'], columns=['label'],
                         values=['value1'])
    tm.assert_frame_equal(result, expected)

    # Aggregating with [sum, mean] over a tz-aware column level.
    exp_col1 = Index(['sum', 'sum', 'sum', 'sum',
                      'mean', 'mean', 'mean', 'mean'])
    exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2)
    exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00',
                                 '2013-02-01 15:00:00'] * 4,
                                tz='Asia/Tokyo', name='dt2')
    exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
    expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2],
                                   [1, 4, 2, 1, 1, 4, 2, 1],
                                   [2, 5, 1, 2, 2, 5, 1, 2]]),
                         index=exp_idx, columns=exp_col)
    result = pivot_table(df, index=['dt1'], columns=['dt2'],
                         values=['value1', 'value2'],
                         aggfunc=[np.sum, np.mean])
    tm.assert_frame_equal(result, expected)
def test_float_index_to_mixed(self):
    """Adding a string-labeled column to float-labeled columns keeps all three."""
    df = DataFrame({0.0: np.random.rand(10), 1.0: np.random.rand(10)})
    df['a'] = 10
    tm.assert_frame_equal(DataFrame({0.0: df[0.0], 1.0: df[1.0],
                                     'a': [10] * 10}), df)
def test_parse_date_float(self, data, expected, parse_dates):
    """Float columns named in parse_dates stay floats when parsing fails."""
    # see gh-2697
    #
    # Date parsing should fail, so we leave the data untouched
    # (i.e. float precision should remain unchanged).
    result = self.read_csv(StringIO(data), parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)
def test_ancom_3class_anova(self):
    """ANCOM with a three-level grouping flags the two differential features."""
    t = pd.DataFrame([[9, 9, 19, 19, 29, 29],
                      [10, 11, 20, 20, 29, 28],
                      [9, 10, 9, 9, 10, 9],
                      [9, 10, 9, 9, 9, 8],
                      [9, 10, 9, 9, 9, 9],
                      [9, 10, 9, 9, 9, 10],
                      [9, 12, 9, 9, 9, 11]],
                     index=['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7'],
                     columns=['S1', 'S2', 'S3', 'S4', 'S5', 'S6']).T
    c = qiime2.CategoricalMetadataColumn(
        pd.Series(['0', '0', '1', '1', '2', '2'], name='n',
                  index=pd.Index(['S1', 'S2', 'S3', 'S4', 'S5', 'S6'],
                                 name='id'))
    )

    # t+1 pseudocount — presumably ancom requires strictly positive counts.
    ancom(output_dir=self.temp_dir.name, table=t+1, metadata=c)

    res = pd.read_csv(os.path.join(self.temp_dir.name, 'ancom.tsv'),
                      index_col=0, sep='\t')
    exp = pd.DataFrame(
        {'W': np.array([5, 5, 3, 3, 2, 2, 2]),
         'Reject null hypothesis': np.array([True, True, False, False,
                                             False, False, False],
                                            dtype=bool)},
        index=['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7'],)
    pdt.assert_frame_equal(res, exp)
def test_yy_format_with_yearfirst(self): data = """date,time,B,C 090131,0010,1,2 090228,1020,3,4 090331,0830,5,6 """ # See gh-217 import dateutil if dateutil.__version__ >= LooseVersion('2.5.0'): pytest.skip("testing yearfirst=True not-support" "on datetutil < 2.5.0 this works but" "is wrong") rs = self.read_csv(StringIO(data), index_col=0, parse_dates=[['date', 'time']]) idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), datetime(2009, 2, 28, 10, 20, 0), datetime(2009, 3, 31, 8, 30, 0)], dtype=object, name='date_time') xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx) tm.assert_frame_equal(rs, xp) rs = self.read_csv(StringIO(data), index_col=0, parse_dates=[[0, 1]]) idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), datetime(2009, 2, 28, 10, 20, 0), datetime(2009, 3, 31, 8, 30, 0)], dtype=object, name='date_time') xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx) tm.assert_frame_equal(rs, xp)
def test_dateparser_resolution_if_not_ns(self):
    """Second-resolution datetime64 values from date_parser are preserved."""
    # GH 10245
    data = """\
date,time,prn,rxstatus
2013-11-03,19:00:00,126,00E80000
2013-11-03,19:00:00,23,00E80000
2013-11-03,19:00:00,13,00E80000
"""

    def date_parser(date, time):
        # Combine the two text columns into second-resolution datetime64.
        datetime = np_array_datetime64_compat(
            date + 'T' + time + 'Z', dtype='datetime64[s]')
        return datetime

    df = self.read_csv(StringIO(data), date_parser=date_parser,
                       parse_dates={'datetime': ['date', 'time']},
                       index_col=['datetime', 'prn'])

    datetimes = np_array_datetime64_compat(['2013-11-03T19:00:00Z'] * 3,
                                           dtype='datetime64[s]')
    df_correct = DataFrame(data={'rxstatus': ['00E80000'] * 3},
                           index=MultiIndex.from_tuples(
                               [(datetimes[0], 126),
                                (datetimes[1], 23),
                                (datetimes[2], 13)],
                               names=['datetime', 'prn']))
    tm.assert_frame_equal(df, df_correct)
def test_reindex(self):
    """reindex preserves values on common labels and fills NaN elsewhere."""
    newFrame = self.frame.reindex(self.ts1.index)

    # Values on shared labels must match; new labels must be NaN.
    for col in newFrame.columns:
        for idx, val in compat.iteritems(newFrame[col]):
            if idx in self.frame.index:
                if np.isnan(val):
                    self.assertTrue(np.isnan(self.frame[col][idx]))
                else:
                    self.assertEqual(val, self.frame[col][idx])
            else:
                self.assertTrue(np.isnan(val))

    for col, series in compat.iteritems(newFrame):
        self.assertTrue(tm.equalContents(series.index, newFrame.index))
    emptyFrame = self.frame.reindex(Index([]))
    self.assertEqual(len(emptyFrame.index), 0)

    # Cython code should be unit-tested directly
    nonContigFrame = self.frame.reindex(self.ts1.index[::2])

    for col in nonContigFrame.columns:
        for idx, val in compat.iteritems(nonContigFrame[col]):
            if idx in self.frame.index:
                if np.isnan(val):
                    self.assertTrue(np.isnan(self.frame[col][idx]))
                else:
                    self.assertEqual(val, self.frame[col][idx])
            else:
                self.assertTrue(np.isnan(val))

    for col, series in compat.iteritems(nonContigFrame):
        self.assertTrue(tm.equalContents(series.index,
                                         nonContigFrame.index))

    # corner cases

    # Same index, copies values but not index if copy=False
    newFrame = self.frame.reindex(self.frame.index, copy=False)
    self.assertIs(newFrame.index, self.frame.index)

    # length zero
    newFrame = self.frame.reindex([])
    self.assertTrue(newFrame.empty)
    self.assertEqual(len(newFrame.columns), len(self.frame.columns))

    # length zero with columns reindexed with non-empty index
    newFrame = self.frame.reindex([])
    newFrame = newFrame.reindex(self.frame.index)
    self.assertEqual(len(newFrame.index), len(self.frame.index))
    self.assertEqual(len(newFrame.columns), len(self.frame.columns))

    # pass non-Index
    newFrame = self.frame.reindex(list(self.ts1.index))
    self.assert_index_equal(newFrame.index, self.ts1.index)

    # copy with no axes
    result = self.frame.reindex()
    assert_frame_equal(result, self.frame)
    self.assertFalse(result is self.frame)
def test_isnull():
    """isnull handles scalars, Series, frames, panels and 4-D panels."""
    assert not isnull(1.)
    assert isnull(None)
    assert isnull(np.NaN)
    # Infinities are not considered missing.
    assert not isnull(np.inf)
    assert not isnull(-np.inf)

    # series
    for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
              tm.makeObjectSeries(), tm.makeTimeSeries(),
              tm.makePeriodSeries()]:
        assert(isinstance(isnull(s), Series))

    # frame
    for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
               tm.makeMixedDataFrame()]:
        result = isnull(df)
        expected = df.apply(isnull)
        tm.assert_frame_equal(result, expected)

    # panel
    for p in [tm.makePanel(), tm.makePeriodPanel(),
              tm.add_nans(tm.makePanel())]:
        result = isnull(p)
        expected = p.apply(isnull)
        tm.assert_panel_equal(result, expected)

    # panel 4d
    for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]:
        result = isnull(p)
        expected = p.apply(isnull)
        tm.assert_panel4d_equal(result, expected)
def test_loc_setitem_frame_multiples(self):
    """.loc slice assignment from a frame aligns rows by the rhs index labels."""
    # multiple setting
    df = DataFrame({'A': ['foo', 'bar', 'baz'],
                    'B': Series(range(3), dtype=np.int64)})
    rhs = df.loc[1:2]
    # Relabel the rhs so it aligns with the target slice.
    rhs.index = df.index[0:2]
    df.loc[0:1] = rhs
    expected = DataFrame({'A': ['bar', 'baz', 'baz'],
                          'B': Series([1, 2, 2], dtype=np.int64)})
    tm.assert_frame_equal(df, expected)

    # multiple setting with frame on rhs (with M8)
    df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'),
                    'val': Series(range(5), dtype=np.int64)})
    expected = DataFrame({'date': [Timestamp('20000101'),
                                   Timestamp('20000102'),
                                   Timestamp('20000101'),
                                   Timestamp('20000102'),
                                   Timestamp('20000103')],
                          'val': Series([0, 1, 0, 1, 2],
                                        dtype=np.int64)})
    rhs = df.loc[0:2]
    rhs.index = df.index[2:5]
    df.loc[2:4] = rhs
    tm.assert_frame_equal(df, expected)
def test_grouper_multilevel_freq(self):
    """pd.Grouper accepts level (name or int) plus freq on a MultiIndex."""
    # GH 7885
    # with level and freq specified in a pd.Grouper
    from datetime import date, timedelta
    d0 = date.today() - timedelta(days=14)
    dates = date_range(d0, date.today())
    date_index = pd.MultiIndex.from_product(
        [dates, dates], names=['foo', 'bar'])
    df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)

    # Check string level
    expected = df.reset_index().groupby([pd.Grouper(
        key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum()
    # reset index changes columns dtype to object
    expected.columns = pd.Index([0], dtype='int64')

    result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper(
        level='bar', freq='W')]).sum()
    assert_frame_equal(result, expected)

    # Check integer level
    result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper(
        level=1, freq='W')]).sum()
    assert_frame_equal(result, expected)
def test_parse_ragged_csv(self):
    """Explicit names pad short rows with NaN; extra columns must not crash."""
    data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""

    # The ragged file should parse the same as this padded equivalent.
    nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
    result = self.read_csv(StringIO(data), header=None,
                           names=['a', 'b', 'c', 'd', 'e'])

    expected = self.read_csv(StringIO(nice_data), header=None,
                             names=['a', 'b', 'c', 'd', 'e'])

    tm.assert_frame_equal(result, expected)

    # too many columns, cause segfault if not careful
    data = "1,2\n3,4,5"

    result = self.read_csv(StringIO(data), header=None, names=lrange(50))
    expected = self.read_csv(StringIO(data), header=None,
                             names=lrange(3)).reindex(columns=lrange(50))

    tm.assert_frame_equal(result, expected)
def test_empty_header_read(count):
    """*count* commas with no data rows yield count+1 unnamed empty columns."""
    # NOTE(review): relies on `self` from an enclosing scope — presumably a
    # closure inside a parser test method; confirm at the call site.
    s = StringIO(',' * count)
    expected = DataFrame(columns=[
        'Unnamed: {i}'.format(i=i) for i in range(count + 1)])
    df = self.read_csv(s)
    tm.assert_frame_equal(df, expected)
def test_empty_with_mangled_column_pass_dtype_by_indexes(self):
    """Positional dtypes apply to mangled duplicate column names ('one.1')."""
    data = 'one,one'
    result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})

    expected = DataFrame(
        {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')})
    tm.assert_frame_equal(result, expected, check_index_type=False)
def test_custom_lineterminator(self):
    """Parsing with lineterminator='~' matches the newline-delimited parse."""
    raw = 'a,b,c~1,2,3~4,5,6'
    expected = self.read_csv(StringIO(raw.replace('~', '\n')))
    result = self.read_csv(StringIO(raw), lineterminator='~')
    tm.assert_frame_equal(result, expected)
def test_groupby_categorical_index_and_columns(self, observed):
    """Grouping a CategoricalIndex by level sums duplicate labels on both axes."""
    # GH18432
    columns = ['A', 'B', 'A', 'B']
    categories = ['B', 'A']
    data = np.ones((5, 4), int)
    cat_columns = CategoricalIndex(columns,
                                   categories=categories,
                                   ordered=True)
    df = DataFrame(data=data, columns=cat_columns)
    result = df.groupby(axis=1, level=0, observed=observed).sum()
    # Each label appears twice, so summing duplicates doubles the ones.
    expected_data = 2 * np.ones((5, 2), int)

    if observed:
        # if we are not-observed we undergo a reindex
        # so need to adjust the output as our expected sets us up
        # to be non-observed
        expected_columns = CategoricalIndex(['A', 'B'],
                                            categories=categories,
                                            ordered=True)
    else:
        expected_columns = CategoricalIndex(categories,
                                            categories=categories,
                                            ordered=True)
    expected = DataFrame(data=expected_data, columns=expected_columns)
    assert_frame_equal(result, expected)

    # test transposed version
    df = DataFrame(data.T, index=cat_columns)
    result = df.groupby(axis=0, level=0, observed=observed).sum()
    expected = DataFrame(data=expected_data.T, index=expected_columns)
    assert_frame_equal(result, expected)
def test_empty_pass_dtype(self):
    """The dtype mapping applies even when the CSV has headers but no rows."""
    data = 'one,two'
    result = self.read_csv(StringIO(data), dtype={'one': 'u1'})

    expected = DataFrame({'one': np.empty(0, dtype='u1'),
                          'two': np.empty(0, dtype=np.object)})
    tm.assert_frame_equal(result, expected, check_index_type=False)
def test_drop_multiindex_not_lexsorted(self):
    """drop() gives the same result on lexsorted and non-lexsorted columns."""
    # GH 11640

    # define the lexsorted version
    lexsorted_mi = MultiIndex.from_tuples(
        [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
    lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
    self.assertTrue(lexsorted_df.columns.is_lexsorted())

    # define the non-lexsorted version
    not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                                 data=[[1, 'b1', 'c1', 3],
                                       [1, 'b2', 'c2', 4]])
    not_lexsorted_df = not_lexsorted_df.pivot_table(
        index='a', columns=['b', 'c'], values='d')
    not_lexsorted_df = not_lexsorted_df.reset_index()
    self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.drop('a', axis=1)
    # Dropping from a non-lexsorted MultiIndex is slower and warns.
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.drop('a', axis=1)
    tm.assert_frame_equal(result, expected)
def test_apply_filter_query(test_df):
    """Multiple filter expressions are ANDed together."""
    result = util.apply_filter_query(test_df, ['col1 < 3', 'col2 > 6'])
    expected = pd.DataFrame({'col1': [2], 'col2': [7]}, index=['c'])
    pdt.assert_frame_equal(result, expected)
# Repro script: _get_numeric_data() should match select_dtypes([np.number]).
# NOTE(review): pd.compat.StringIO and pd.datetime were removed in pandas 1.x —
# this presumably targets an older pandas; confirm the intended version.
data = r"""date,numeric_col_1,non-numeric_col,numeric_col_2
2018-06-10,0,NaN,NaN
2018-06-11,1,NaN,1
2018-06-12,NaN,a,2
2018-06-13,3,b,NaN
2018-06-14,NaN,x,3
2018-06-15,NaN,c,5
2018-06-16,6,d,7
"""

df = pd.read_csv(
    pd.compat.StringIO(data),
    index_col=['date'],
    parse_dates=True,
    date_parser=lambda x: pd.datetime.strptime(x, '%Y-%m-%d')
)
assert isinstance(df.index, pd.core.indexes.datetimes.DatetimeIndex)
print(df)

df_copy = df.copy()

# use private method
df = df._get_numeric_data()
print(df)

# use select_dtypes
df_copy = df_copy.select_dtypes([np.number])
print(df_copy)

# Both routes must agree.
assert_frame_equal(df, df_copy)
def test_slicing_datetimes():
    """Datetime-bounded and partial-string .loc slicing, with and without dups."""
    # GH 7523

    # unique
    df = DataFrame(np.arange(4., dtype='float64'),
                   index=[datetime(2001, 1, i, 10, 00)
                          for i in [1, 2, 3, 4]])
    result = df.loc[datetime(2001, 1, 1, 10):]
    assert_frame_equal(result, df)
    result = df.loc[:datetime(2001, 1, 4, 10)]
    assert_frame_equal(result, df)
    result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)]
    assert_frame_equal(result, df)

    result = df.loc[datetime(2001, 1, 1, 11):]
    expected = df.iloc[1:]
    assert_frame_equal(result, expected)
    # Partial string slicing must agree with the datetime bound above.
    result = df.loc['20010101 11':]
    assert_frame_equal(result, expected)

    # duplicates
    df = pd.DataFrame(np.arange(5., dtype='float64'),
                      index=[datetime(2001, 1, i, 10, 00)
                             for i in [1, 2, 2, 3, 4]])
    result = df.loc[datetime(2001, 1, 1, 10):]
    assert_frame_equal(result, df)
    result = df.loc[:datetime(2001, 1, 4, 10)]
    assert_frame_equal(result, df)
    result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)]
    assert_frame_equal(result, df)

    result = df.loc[datetime(2001, 1, 1, 11):]
    expected = df.iloc[1:]
    assert_frame_equal(result, expected)
    result = df.loc['20010101 11':]
    assert_frame_equal(result, expected)
def test_filter_table(choosers, rates):
    """filter_table honors the ignore set while applying a rate's filters."""
    result = util.filter_table(choosers, rates.iloc[1],
                               ignore={'probability_of_relocating'})
    pdt.assert_frame_equal(result, choosers.iloc[[2]])
def test_apply_filter_query_no_filter(test_df):
    """An empty filter list returns the frame unchanged."""
    result = util.apply_filter_query(test_df, [])
    pdt.assert_frame_equal(result, test_df)
def test_apply_filter_query_or(test_df):
    """'or' expressions within a single filter string are supported."""
    result = util.apply_filter_query(test_df, ['col1 < 1 or col2 > 8'])
    expected = pd.DataFrame({'col1': [0, 4], 'col2': [5, 9]},
                            index=['a', 'e'])
    pdt.assert_frame_equal(result, expected)
def test_apply_filter_query_empty(test_df):
    """Filters that match nothing yield an empty frame."""
    result = util.apply_filter_query(test_df, ['col1 < 1', 'col2 > 8'])
    expected = pd.DataFrame({'col1': [], 'col2': []}, index=[])
    pdt.assert_frame_equal(result, expected)
def test_value_array_record_prefix(self): # GH 21536 result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.') expected = DataFrame([[1], [2]], columns=['Prefix.0']) tm.assert_frame_equal(result, expected)
def test_catalog(self, dummy_context, dummy_dataframe):
    """The catalog exposes layers and round-trips a saved dataset."""
    catalog = dummy_context.catalog
    assert catalog.layers == {"raw": {"boats"}}
    catalog.save("cars", dummy_dataframe)
    assert_frame_equal(catalog.load("cars"), dummy_dataframe)
def test_empty_array(self): result = json_normalize([]) expected = DataFrame() tm.assert_frame_equal(result, expected)
def test_int64_overflow_issues(self):
    """Merges stay correct when key-space cardinality could overflow int64.

    Covers #2690 (combinatorial explosion during factorization): one-to-one
    matches, sort order for every `how`, and a manually computed outer merge
    with duplicate and partially overlapping keys.
    """
    # #2690, combinatorial explosion
    df1 = DataFrame(np.random.randn(1000, 7),
                    columns=list("ABCDEF") + ["G1"])
    df2 = DataFrame(np.random.randn(1000, 7),
                    columns=list("ABCDEF") + ["G2"])

    # it works!
    result = merge(df1, df2, how="outer")
    assert len(result) == 2000

    low, high, n = -1 << 10, 1 << 10, 1 << 20
    left = DataFrame(np.random.randint(low, high, (n, 7)),
                     columns=list("ABCDEFG"))
    # "left" is the row-sum of the key columns, so it identifies its row.
    left["left"] = left.sum(axis=1)

    # one-2-one match
    i = np.random.permutation(len(left))
    right = left.iloc[i].copy()
    right.columns = right.columns[:-1].tolist() + ["right"]
    right.index = np.arange(len(right))
    right["right"] *= -1

    out = merge(left, right, how="outer")
    assert len(out) == len(left)
    assert_series_equal(out["left"], -out["right"], check_names=False)
    result = out.iloc[:, :-2].sum(axis=1)
    assert_series_equal(out["left"], result, check_names=False)
    assert result.name is None

    out.sort_values(out.columns.tolist(), inplace=True)
    out.index = np.arange(len(out))
    # One-to-one merge: every `how` with sort=True gives the same frame.
    for how in ["left", "right", "outer", "inner"]:
        assert_frame_equal(out, merge(left, right, how=how, sort=True))

    # check that left merge w/ sort=False maintains left frame order
    out = merge(left, right, how="left", sort=False)
    assert_frame_equal(left, out[left.columns.tolist()])
    out = merge(right, left, how="left", sort=False)
    assert_frame_equal(right, out[right.columns.tolist()])

    # one-2-many/none match
    n = 1 << 11
    left = DataFrame(
        np.random.randint(low, high, (n, 7)).astype("int64"),
        columns=list("ABCDEFG"),
    )

    # confirm that this is checking what it is supposed to check
    shape = left.apply(Series.nunique).values
    assert is_int64_overflow_possible(shape)

    # add duplicates to left frame
    left = concat([left, left], ignore_index=True)

    right = DataFrame(
        np.random.randint(low, high, (n // 2, 7)).astype("int64"),
        columns=list("ABCDEFG"),
    )

    # add duplicates & overlap with left to the right frame
    i = np.random.choice(len(left), n)
    right = concat([right, right, left.iloc[i]], ignore_index=True)

    left["left"] = np.random.randn(len(left))
    right["right"] = np.random.randn(len(right))

    # shuffle left & right frames
    i = np.random.permutation(len(left))
    left = left.iloc[i].copy()
    left.index = np.arange(len(left))

    i = np.random.permutation(len(right))
    right = right.iloc[i].copy()
    right.index = np.arange(len(right))

    # manually compute outer merge
    ldict, rdict = defaultdict(list), defaultdict(list)

    for idx, row in left.set_index(list("ABCDEFG")).iterrows():
        ldict[idx].append(row["left"])

    for idx, row in right.set_index(list("ABCDEFG")).iterrows():
        rdict[idx].append(row["right"])

    vals = []
    for k, lval in ldict.items():
        rval = rdict.get(k, [np.nan])
        for lv, rv in product(lval, rval):
            vals.append(k + tuple([lv, rv]))

    for k, rval in rdict.items():
        if k not in ldict:
            for rv in rval:
                vals.append(k + tuple([np.nan, rv]))

    def align(df):
        # Canonical ordering so frames can be compared row-for-row.
        df = df.sort_values(df.columns.tolist())
        df.index = np.arange(len(df))
        return df

    def verify_order(df):
        # Sorted output must already be in stable mergesort order.
        kcols = list("ABCDEFG")
        assert_frame_equal(
            df[kcols].copy(), df[kcols].sort_values(kcols, kind="mergesort")
        )

    out = DataFrame(vals, columns=list("ABCDEFG") + ["left", "right"])
    out = align(out)

    # Row masks selecting the rows each join type should produce.
    jmask = {
        "left": out["left"].notna(),
        "right": out["right"].notna(),
        "inner": out["left"].notna() & out["right"].notna(),
        "outer": np.ones(len(out), dtype="bool"),
    }

    for how in "left", "right", "outer", "inner":
        mask = jmask[how]
        frame = align(out[mask].copy())
        assert mask.all() ^ mask.any() or how == "outer"

        for sort in [False, True]:
            res = merge(left, right, how=how, sort=sort)
            if sort:
                verify_order(res)

            # as in GH9092 dtypes break with outer/right join
            assert_frame_equal(
                frame, align(res), check_dtype=how not in ("right", "outer")
            )
def test_io(self, dummy_context, dummy_dataframe):
    """The io attribute round-trips a saved dataset."""
    dummy_context.io.save("cars", dummy_dataframe)
    loaded = dummy_context.io.load("cars")
    assert_frame_equal(loaded, dummy_dataframe)
def test_frame_from_json_precise_float(self): df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]]) result = read_json(df.to_json(), precise_float=True) assert_frame_equal(result, df)
def verify_order(df):
    """Assert that df's key columns are already in stable-sorted order."""
    kcols = list("ABCDEFG")
    resorted = df[kcols].sort_values(kcols, kind="mergesort")
    assert_frame_equal(df[kcols].copy(), resorted)
def test_frame_empty(self):
    """An empty non-mixed frame round-trips through JSON with dtypes pinned."""
    df = DataFrame(columns=['jim', 'joe'])
    self.assertFalse(df._is_mixed_type)
    assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)),
                       df)
def test_default_handler(self): value = object() frame = DataFrame({'a': ['a', value]}) expected = frame.applymap(str) result = pd.read_json(frame.to_json(default_handler=str)) assert_frame_equal(expected, result)
def check(result, expected):
    """Exercise repr and dtype access, then assert frame equality."""
    _ = str(result)       # smoke: repr must not raise
    _ = result.dtypes     # smoke: dtype access must not raise
    tm.assert_frame_equal(result, expected)
def test_frame_empty_mixedtype(self):
    """An empty mixed-dtype frame round-trips through JSON with dtypes pinned."""
    # mixed type
    df = DataFrame(columns=['jim', 'joe'])
    df['joe'] = df['joe'].astype('i8')
    self.assertTrue(df._is_mixed_type)
    assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)),
                       df)
def test_iloc_getitem_frame(self):
    """.iloc positional results match the equivalent .ix label lookups."""
    # Index labels 0,2,4,... so position k corresponds to label 2k.
    df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2),
                   columns=lrange(0, 8, 2))

    result = df.iloc[2]
    with catch_warnings(record=True):
        filterwarnings("ignore", "\\n.ix", DeprecationWarning)
        exp = df.ix[4]
    tm.assert_series_equal(result, exp)

    result = df.iloc[2, 2]
    with catch_warnings(record=True):
        filterwarnings("ignore", "\\n.ix", DeprecationWarning)
        exp = df.ix[4, 4]
    assert result == exp

    # slice
    result = df.iloc[4:8]
    with catch_warnings(record=True):
        filterwarnings("ignore", "\\n.ix", DeprecationWarning)
        expected = df.ix[8:14]
    tm.assert_frame_equal(result, expected)

    result = df.iloc[:, 2:3]
    with catch_warnings(record=True):
        filterwarnings("ignore", "\\n.ix", DeprecationWarning)
        expected = df.ix[:, 4:5]
    tm.assert_frame_equal(result, expected)

    # list of integers
    result = df.iloc[[0, 1, 3]]
    with catch_warnings(record=True):
        filterwarnings("ignore", "\\n.ix", DeprecationWarning)
        expected = df.ix[[0, 2, 6]]
    tm.assert_frame_equal(result, expected)

    result = df.iloc[[0, 1, 3], [0, 1]]
    with catch_warnings(record=True):
        filterwarnings("ignore", "\\n.ix", DeprecationWarning)
        expected = df.ix[[0, 2, 6], [0, 2]]
    tm.assert_frame_equal(result, expected)

    # neg indices
    result = df.iloc[[-1, 1, 3], [-1, 1]]
    with catch_warnings(record=True):
        filterwarnings("ignore", "\\n.ix", DeprecationWarning)
        expected = df.ix[[18, 2, 6], [6, 2]]
    tm.assert_frame_equal(result, expected)

    # dups indices
    result = df.iloc[[-1, -1, 1, 3], [-1, 1]]
    with catch_warnings(record=True):
        filterwarnings("ignore", "\\n.ix", DeprecationWarning)
        expected = df.ix[[18, 18, 2, 6], [6, 2]]
    tm.assert_frame_equal(result, expected)

    # with index-like
    s = Series(index=lrange(1, 5))
    result = df.iloc[s.index]
    with catch_warnings(record=True):
        filterwarnings("ignore", "\\n.ix", DeprecationWarning)
        expected = df.ix[[2, 4, 6, 8]]
    tm.assert_frame_equal(result, expected)
def _check(df):
    """Round-trip *df* through split-oriented JSON (converting 'x' as dates)."""
    result = read_json(df.to_json(orient='split'), orient='split',
                       convert_dates=['x'])
    assert_frame_equal(result, df)
def test_iloc_exceeds_bounds(self):
    """Out-of-bounds list/scalar iloc indexers raise; slices silently clip."""
    # GH6296
    # iloc should allow indexers that exceed the bounds
    df = DataFrame(np.random.random_sample((20, 5)), columns=list('ABCDE'))

    # lists of positions should raise IndexErrror!
    msg = 'positional indexers are out-of-bounds'
    with pytest.raises(IndexError, match=msg):
        df.iloc[:, [0, 1, 2, 3, 4, 5]]
    pytest.raises(IndexError, lambda: df.iloc[[1, 30]])
    pytest.raises(IndexError, lambda: df.iloc[[1, -30]])
    pytest.raises(IndexError, lambda: df.iloc[[100]])

    s = df['A']
    pytest.raises(IndexError, lambda: s.iloc[[100]])
    pytest.raises(IndexError, lambda: s.iloc[[-100]])

    # still raise on a single indexer
    msg = 'single positional indexer is out-of-bounds'
    with pytest.raises(IndexError, match=msg):
        df.iloc[30]
    pytest.raises(IndexError, lambda: df.iloc[-30])

    # GH10779
    # single positive/negative indexer exceeding Series bounds should raise
    # an IndexError
    with pytest.raises(IndexError, match=msg):
        s.iloc[30]
    pytest.raises(IndexError, lambda: s.iloc[-30])

    # slices are ok
    result = df.iloc[:, 4:10]  # 0 < start < len < stop
    expected = df.iloc[:, 4:]
    tm.assert_frame_equal(result, expected)

    result = df.iloc[:, -4:-10]  # stop < 0 < start < len
    expected = df.iloc[:, :0]
    tm.assert_frame_equal(result, expected)

    result = df.iloc[:, 10:4:-1]  # 0 < stop < len < start (down)
    expected = df.iloc[:, :4:-1]
    tm.assert_frame_equal(result, expected)

    result = df.iloc[:, 4:-10:-1]  # stop < 0 < start < len (down)
    expected = df.iloc[:, 4::-1]
    tm.assert_frame_equal(result, expected)

    result = df.iloc[:, -10:4]  # start < 0 < stop < len
    expected = df.iloc[:, :4]
    tm.assert_frame_equal(result, expected)

    result = df.iloc[:, 10:4]  # 0 < stop < len < start
    expected = df.iloc[:, :0]
    tm.assert_frame_equal(result, expected)

    result = df.iloc[:, -10:-11:-1]  # stop < start < 0 < len (down)
    expected = df.iloc[:, :0]
    tm.assert_frame_equal(result, expected)

    result = df.iloc[:, 10:11]  # 0 < len < start < stop
    expected = df.iloc[:, :0]
    tm.assert_frame_equal(result, expected)

    # slice bounds exceeding is ok
    result = s.iloc[18:30]
    expected = s.iloc[18:]
    tm.assert_series_equal(result, expected)

    result = s.iloc[30:]
    expected = s.iloc[:0]
    tm.assert_series_equal(result, expected)

    result = s.iloc[30::-1]
    expected = s.iloc[::-1]
    tm.assert_series_equal(result, expected)

    # doc example
    def check(result, expected):
        # Smoke repr/dtypes access, then compare.
        str(result)
        result.dtypes
        tm.assert_frame_equal(result, expected)

    dfl = DataFrame(np.random.randn(5, 2), columns=list('AB'))
    check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index))
    check(dfl.iloc[:, 1:3], dfl.iloc[:, [1]])
    check(dfl.iloc[4:6], dfl.iloc[[4]])

    pytest.raises(IndexError, lambda: dfl.iloc[[4, 5, 6]])
    pytest.raises(IndexError, lambda: dfl.iloc[:, 4])
def test_iloc_mask(self):
    """Boolean Series masks are rejected by .iloc; ndarray masks are accepted."""
    # GH 3631, iloc with a mask (of a series) should raise
    df = DataFrame(lrange(5), list('ABCDE'), columns=['a'])
    mask = (df.a % 2 == 0)
    pytest.raises(ValueError, df.iloc.__getitem__, tuple([mask]))
    mask.index = lrange(len(mask))
    pytest.raises(NotImplementedError, df.iloc.__getitem__,
                  tuple([mask]))

    # ndarray ok
    result = df.iloc[np.array([True] * len(mask), dtype=bool)]
    tm.assert_frame_equal(result, df)

    # the possibilities
    locs = np.arange(4)
    nums = 2 ** locs
    reps = lmap(bin, nums)
    df = DataFrame({'locs': locs, 'nums': nums}, reps)
    # Expected outcome per (mask-index source, accessor): either the binary
    # string of the selected sum, or the error message the lookup raises.
    expected = {
        (None, ''): '0b1100',
        (None, '.loc'): '0b1100',
        (None, '.iloc'): '0b1100',
        ('index', ''): '0b11',
        ('index', '.loc'): '0b11',
        ('index', '.iloc'): ('iLocation based boolean indexing '
                             'cannot use an indexable as a mask'),
        ('locs', ''): 'Unalignable boolean Series provided as indexer '
                      '(index of the boolean Series and of the indexed '
                      'object do not match',
        ('locs', '.loc'): 'Unalignable boolean Series provided as indexer '
                          '(index of the boolean Series and of the '
                          'indexed object do not match',
        ('locs', '.iloc'): ('iLocation based boolean indexing on an '
                            'integer type is not available'),
    }

    # UserWarnings from reindex of a boolean mask
    with catch_warnings(record=True):
        simplefilter("ignore", UserWarning)
        result = dict()
        for idx in [None, 'index', 'locs']:
            mask = (df.nums > 2).values
            if idx:
                mask = Series(mask, list(reversed(getattr(df, idx))))
            for method in ['', '.loc', '.iloc']:
                try:
                    if method:
                        accessor = getattr(df, method[1:])
                    else:
                        accessor = df
                    ans = str(bin(accessor[mask]['nums'].sum()))
                except Exception as e:
                    # Capture the error text for comparison instead.
                    ans = str(e)

                key = tuple([idx, method])
                r = expected.get(key)
                if r != ans:
                    raise AssertionError(
                        "[%s] does not match [%s], received [%s]"
                        % (key, ans, r))
def test_float_precision_round_trip_with_text(c_parser_only):
    """round_trip float precision must not choke on purely textual data.

    see gh-15140
    """
    csv_parser = c_parser_only
    result = csv_parser.read_csv(StringIO("a"),
                                 header=None,
                                 float_precision="round_trip")
    expected = DataFrame({0: ["a"]})
    tm.assert_frame_equal(result, expected)
def test_iloc_getitem_slice_dups(self):
    """Positional slicing must round-trip on frames with duplicate column labels."""
    dup_cols = DataFrame(np.random.randn(10, 4),
                         columns=['A', 'A', 'B', 'B'])
    uniq_cols = DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                          columns=['A', 'C'])

    # axis=1: column slices must recover each original piece
    combined = concat([dup_cols, uniq_cols], axis=1)
    tm.assert_frame_equal(combined.iloc[:, :4], dup_cols)
    tm.assert_frame_equal(combined.iloc[:, 4:], uniq_cols)

    combined = concat([uniq_cols, dup_cols], axis=1)
    tm.assert_frame_equal(combined.iloc[:, :2], uniq_cols)
    tm.assert_frame_equal(combined.iloc[:, 2:], dup_cols)

    # a slice that straddles the seam picks up one duplicated column
    expected = concat([uniq_cols, dup_cols.iloc[:, [0]]], axis=1)
    tm.assert_frame_equal(combined.iloc[:, 0:3], expected)

    # axis=0: each stacked half slices back to the originals
    combined = concat([combined, combined], axis=0)
    tm.assert_frame_equal(combined.iloc[0:10, :2], uniq_cols)
    tm.assert_frame_equal(combined.iloc[0:10, 2:], dup_cols)
    tm.assert_frame_equal(combined.iloc[10:, :2], uniq_cols)
    tm.assert_frame_equal(combined.iloc[10:, 2:], dup_cols)
def test_sanity_data_frame(self, mock_post):
    """Tests KustoResponse to pandas.DataFrame."""
    # NOTE(review): mock_post is injected by a patch decorator on the
    # class/method (not visible here) — presumably it stubs the HTTP
    # POST so execute_query returns canned data; confirm against the
    # test class definition.
    from pandas import DataFrame, Series
    from pandas.util.testing import assert_frame_equal

    client = KustoClient("https://somecluster.kusto.windows.net")
    # Convert the first primary result table of the query into a frame.
    data_frame = dataframe_from_result_table(
        client.execute_query("PythonTest", "Deft").primary_results[0])
    self.assertEqual(len(data_frame.columns), 19)

    # Expected column-by-column contents. Integer columns are built as
    # floats because the first row is None (NaN forces a float dtype).
    expected_dict = {
        "rownumber": Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0,
                             7.0, 8.0, 9.0]),
        "rowguid": Series(
            [
                "",
                "00000000-0000-0000-0001-020304050607",
                "00000001-0000-0000-0001-020304050607",
                "00000002-0000-0000-0001-020304050607",
                "00000003-0000-0000-0001-020304050607",
                "00000004-0000-0000-0001-020304050607",
                "00000005-0000-0000-0001-020304050607",
                "00000006-0000-0000-0001-020304050607",
                "00000007-0000-0000-0001-020304050607",
                "00000008-0000-0000-0001-020304050607",
                "00000009-0000-0000-0001-020304050607",
            ],
            dtype=object,
        ),
        "xdouble": Series([
            None, 0.0, 1.0001, 2.0002, 3.0003, 4.0004, 5.0005, 6.0006,
            7.0007, 8.0008, 9.0009
        ]),
        "xfloat": Series([
            None, 0.0, 1.01, 2.02, 3.03, 4.04, 5.05, 6.06, 7.07, 8.08,
            9.09
        ]),
        # Note: dtype=bool coerces the leading None to False.
        "xbool": Series([
            None, False, True, False, True, False, True, False, True,
            False, True
        ], dtype=bool),
        "xint16": Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
                          8.0, 9.0]),
        "xint32": Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
                          8.0, 9.0]),
        "xint64": Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
                          8.0, 9.0]),
        "xuint8": Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
                          8.0, 9.0]),
        "xuint16": Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
                           8.0, 9.0]),
        "xuint32": Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
                           8.0, 9.0]),
        "xuint64": Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
                           8.0, 9.0]),
        # pandas.to_datetime(None) yields NaT for the null first row.
        "xdate": Series([
            pandas.to_datetime(None),
            pandas.to_datetime("2014-01-01T01:01:01.0000000Z"),
            pandas.to_datetime("2015-01-01T01:01:01.0000001Z"),
            pandas.to_datetime("2016-01-01T01:01:01.0000002Z"),
            pandas.to_datetime("2017-01-01T01:01:01.0000003Z"),
            pandas.to_datetime("2018-01-01T01:01:01.0000004Z"),
            pandas.to_datetime("2019-01-01T01:01:01.0000005Z"),
            pandas.to_datetime("2020-01-01T01:01:01.0000006Z"),
            pandas.to_datetime("2021-01-01T01:01:01.0000007Z"),
            pandas.to_datetime("2022-01-01T01:01:01.0000008Z"),
            pandas.to_datetime("2023-01-01T01:01:01.0000009Z"),
        ]),
        "xsmalltext": Series([
            "", "Zero", "One", "Two", "Three", "Four", "Five", "Six",
            "Seven", "Eight", "Nine"
        ], dtype=object),
        "xtext": Series([
            "", "Zero", "One", "Two", "Three", "Four", "Five", "Six",
            "Seven", "Eight", "Nine"
        ], dtype=object),
        "xnumberAsText": Series(["", "0", "1", "2", "3", "4", "5", "6",
                                 "7", "8", "9"], dtype=object),
        # timedelta64[ns] parses both string offsets and the bare 0.
        "xtime": Series(
            [
                "NaT",
                0,
                "1 days 00:00:01.0010001",
                "-3 days 23:59:57.9979998",
                "3 days 00:00:03.0030003",
                "-5 days 23:59:55.9959996",
                "5 days 00:00:05.0050005",
                "-7 days 23:59:53.9939994",
                "7 days 00:00:07.0070007",
                "-9 days 23:59:51.9919992",
                "9 days 00:00:09.0090009",
            ],
            dtype="timedelta64[ns]",
        ),
        "xtextWithNulls": Series(["", "", "", "", "", "", "", "", "", "",
                                  ""], dtype=object),
        # dynamic (JSON) column: empty strings for nulls, dicts otherwise.
        "xdynamicWithNulls": Series(
            [
                text_type(""),
                text_type(""),
                {
                    "rowId": 1,
                    "arr": [0, 1]
                },
                {
                    "rowId": 2,
                    "arr": [0, 2]
                },
                {
                    "rowId": 3,
                    "arr": [0, 3]
                },
                {
                    "rowId": 4,
                    "arr": [0, 4]
                },
                {
                    "rowId": 5,
                    "arr": [0, 5]
                },
                {
                    "rowId": 6,
                    "arr": [0, 6]
                },
                {
                    "rowId": 7,
                    "arr": [0, 7]
                },
                {
                    "rowId": 8,
                    "arr": [0, 8]
                },
                {
                    "rowId": 9,
                    "arr": [0, 9]
                },
            ],
            dtype=object,
        ),
    }
    # Column order must match the service's result-table order exactly.
    columns = [
        "rownumber",
        "rowguid",
        "xdouble",
        "xfloat",
        "xbool",
        "xint16",
        "xint32",
        "xint64",
        "xuint8",
        "xuint16",
        "xuint32",
        "xuint64",
        "xdate",
        "xsmalltext",
        "xtext",
        "xnumberAsText",
        "xtime",
        "xtextWithNulls",
        "xdynamicWithNulls",
    ]
    expected_data_frame = DataFrame(expected_dict, columns=columns,
                                    copy=True)
    assert_frame_equal(data_frame, expected_data_frame)
def test_weightscomp_pyparser_prepare_atomic(atomic, expected):
    """Prepared atomic-weights frame must equal the reference fixture.

    Index/column names are ignored (check_names=False).
    """
    result, reference = atomic, expected
    assert_frame_equal(result, reference, check_names=False)
def test_to_frame(self):
    """to_frame on a subclassed Series must produce the subclassed DataFrame."""
    ser = tm.SubclassedSeries([1, 2, 3, 4], index=list('abcd'),
                              name='xxx')
    result = ser.to_frame()
    expected = tm.SubclassedDataFrame({'xxx': [1, 2, 3, 4]},
                                      index=list('abcd'))
    tm.assert_frame_equal(result, expected)
    # the subclass must be preserved, not degraded to plain DataFrame
    assert isinstance(result, tm.SubclassedDataFrame)
def test_parse_trim_buffers(c_parser_only):
    """Stress the C parser's buffer trimming to catch segfault/corruption.

    Regression test for gh-13703 (and gh-5291 for the utf-8 branch):
    reading a large mixed-type CSV in chunks, ending with a small
    residual chunk, must not crash or corrupt the parsed data.
    """
    # This test is part of a bugfix for gh-13703. It attempts to
    # to stress the system memory allocator, to cause it to move the
    # stream buffer and either let the OS reclaim the region, or let
    # other memory requests of parser otherwise modify the contents
    # of memory space, where it was formally located.
    # This test is designed to cause a `segfault` with unpatched
    # `tokenizer.c`. Sometimes the test fails on `segfault`, other
    # times it fails due to memory corruption, which causes the
    # loaded DataFrame to differ from the expected one.
    parser = c_parser_only

    # Generate a large mixed-type CSV file on-the-fly (one record is
    # approx 1.5KiB).
    record_ = \
        """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
        """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
        """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
        """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
        """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
        """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
        """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
        """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
        """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
        """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
        """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
        """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
        """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
        """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
        """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
        """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
        """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
        """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
        """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
        """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
        """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
        """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
        """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
        """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
        """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
        """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
        """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
        """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
        """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""

    # Set the number of lines so that a call to `parser_trim_buffers`
    # is triggered: after a couple of full chunks are consumed a
    # relatively small 'residual' chunk would cause reallocation
    # within the parser.
    chunksize, n_lines = 128, 2 * 128 + 15
    csv_data = "\n".join([record_] * n_lines) + "\n"

    # We will use StringIO to load the CSV from this text buffer.
    # pd.read_csv() will iterate over the file in chunks and will
    # finally read a residual chunk of really small size.

    # Generate the expected output: manually create the dataframe
    # by splitting by comma and repeating the `n_lines` times.
    row = tuple(val_ if val_ else np.nan
                for val_ in record_.split(","))
    expected = DataFrame([row for _ in range(n_lines)],
                         dtype=object, columns=None, index=None)

    # Iterate over the CSV file in chunks of `chunksize` lines
    chunks_ = parser.read_csv(StringIO(csv_data), header=None,
                              dtype=object, chunksize=chunksize)
    result = concat(chunks_, axis=0, ignore_index=True)

    # Check for data corruption if there was no segfault
    tm.assert_frame_equal(result, expected)

    # This extra test was added to replicate the fault in gh-5291.
    # Force 'utf-8' encoding, so that `_string_convert` would take
    # a different execution branch.
    chunks_ = parser.read_csv(StringIO(csv_data), header=None,
                              dtype=object, chunksize=chunksize,
                              encoding="utf_8")
    result = concat(chunks_, axis=0, ignore_index=True)
    tm.assert_frame_equal(result, expected)
def test_createDataFrame_with_schema(self):
    """Round-trip a pandas frame through Spark with an explicit schema.

    Builds a Spark DataFrame from pandas data using ``self.schema``,
    then converts back via Arrow and checks both schema and contents
    survive unchanged.
    """
    pdf = self.create_pandas_data_frame()
    df = self.spark.createDataFrame(pdf, schema=self.schema)
    # assertEquals is a deprecated unittest alias (removed in
    # Python 3.12); use the canonical assertEqual instead.
    self.assertEqual(self.schema, df.schema)
    pdf_arrow = df.toPandas()
    assert_frame_equal(pdf_arrow, pdf)
def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
    """get_dummies must not mangle unicode characters in column names.

    GH22084: pd.get_dummies incorrectly encoded unicode characters
    in dataframe column names.
    """
    dummies = get_dummies(**get_dummies_kwargs)
    assert_frame_equal(dummies, expected)